diff --git "a/sft/Full_remoe/checkpoint-16632/trainer_state.json" "b/sft/Full_remoe/checkpoint-16632/trainer_state.json" new file mode 100644--- /dev/null +++ "b/sft/Full_remoe/checkpoint-16632/trainer_state.json" @@ -0,0 +1,282777 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.999969938373666, + "eval_steps": 500, + "global_step": 16632, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.04940867, + "auxiliary_loss_mlp": 0.02022554, + "balance_loss_clip": 2.42054653, + "balance_loss_mlp": 1.60770607, + "epoch": 6.012325266796934e-05, + "flos": 24456507091200.0, + "grad_norm": 431.42742291568476, + "language_loss": 2.93780541, + "learning_rate": 0.0, + "loss": 1.9937849, + "num_input_tokens_seen": 19155, + "router_z_loss_clip": 25.140625, + "router_z_loss_mlp": 4.15039062, + "step": 1, + "time_per_iteration": 18.08106780052185 + }, + { + "auxiliary_loss_clip": 0.03306889, + "auxiliary_loss_mlp": 0.01334092, + "balance_loss_clip": 1.61681187, + "balance_loss_mlp": 1.08403873, + "epoch": 0.00012024650533593868, + "flos": 20225931246720.0, + "grad_norm": 56.60211952861628, + "language_loss": 1.85136652, + "learning_rate": 4.4628432569317594e-07, + "loss": 1.89777637, + "num_input_tokens_seen": 36175, + "router_z_loss_clip": 16.875, + "router_z_loss_mlp": 2.49804688, + "step": 2, + "time_per_iteration": 2.448704242706299 + }, + { + "auxiliary_loss_clip": 0.03262712, + "auxiliary_loss_mlp": 0.01333688, + "balance_loss_clip": 1.61833572, + "balance_loss_mlp": 1.08916557, + "epoch": 0.000180369758003908, + "flos": 22309935454080.0, + "grad_norm": 231.8354364196141, + "language_loss": 1.62016344, + "learning_rate": 7.073439208833112e-07, + "loss": 1.66612744, + "num_input_tokens_seen": 54870, + "router_z_loss_clip": 16.453125, + "router_z_loss_mlp": 2.44335938, + "step": 3, + "time_per_iteration": 2.451188325881958 + }, + { + "auxiliary_loss_clip": 0.03285281, + "auxiliary_loss_mlp": 0.01353865, + "balance_loss_clip": 1.61045289, + "balance_loss_mlp": 1.07462955, + "epoch": 0.00024049301067187735, + "flos": 22414650577920.0, + "grad_norm": 75.43225617412423, + "language_loss": 1.71785331, + "learning_rate": 8.925686513863519e-07, + "loss": 1.76424468, + "num_input_tokens_seen": 74575, + "router_z_loss_clip": 16.75, + "router_z_loss_mlp": 2.79296875, + "step": 4, + "time_per_iteration": 2.549875259399414 + }, + { + "auxiliary_loss_clip": 0.03328519, + "auxiliary_loss_mlp": 0.01362364, + "balance_loss_clip": 1.61330354, + "balance_loss_mlp": 1.09514475, + "epoch": 0.0003006162633398467, + "flos": 21396978449280.0, + "grad_norm": 63.29544907072378, + "language_loss": 1.95051217, + "learning_rate": 1.0362401141348472e-06, + "loss": 1.99742103, + "num_input_tokens_seen": 92580, + "router_z_loss_clip": 17.15625, + "router_z_loss_mlp": 2.66992188, + "step": 5, + "time_per_iteration": 2.734335422515869 + }, + { + "auxiliary_loss_clip": 0.03290469, + "auxiliary_loss_mlp": 0.01366025, + "balance_loss_clip": 1.60763097, + "balance_loss_mlp": 1.09689856, + "epoch": 0.000360739516007816, + "flos": 21652375127040.0, + "grad_norm": 123.76547633955164, + "language_loss": 1.64787006, + "learning_rate": 1.153628246576487e-06, + "loss": 1.694435, + "num_input_tokens_seen": 109705, + "router_z_loss_clip": 16.84375, + "router_z_loss_mlp": 2.69335938, + "step": 6, + "time_per_iteration": 2.7639269828796387 + }, + { + "auxiliary_loss_clip": 0.0325183, + "auxiliary_loss_mlp": 0.0132199, + "balance_loss_clip": 1.60599482, + "balance_loss_mlp": 1.06659627, + "epoch": 0.0004208627686757854, + "flos": 27159742897920.0, + "grad_norm": 263.0274420605537, + "language_loss": 1.55358648, + "learning_rate": 1.2528784983718962e-06, + "loss": 1.5993247, + "num_input_tokens_seen": 129425, + "router_z_loss_clip": 16.46875, + "router_z_loss_mlp": 2.55273438, + "step": 7, + "time_per_iteration": 2.7299726009368896 + }, + { + "auxiliary_loss_clip": 0.03272678, + "auxiliary_loss_mlp": 0.01337296, + "balance_loss_clip": 1.60518384, + "balance_loss_mlp": 1.07923198, + "epoch": 0.0004809860213437547, + "flos": 31319096135040.0, + "grad_norm": 151.48753424455907, + "language_loss": 1.77746153, + "learning_rate": 1.338852977079528e-06, + "loss": 1.82356131, + "num_input_tokens_seen": 149210, + "router_z_loss_clip": 16.6875, + "router_z_loss_mlp": 2.58398438, + "step": 8, + "time_per_iteration": 2.8052265644073486 + }, + { + "auxiliary_loss_clip": 0.03283289, + "auxiliary_loss_mlp": 0.01347131, + "balance_loss_clip": 1.6042273, + "balance_loss_mlp": 1.08372593, + "epoch": 0.000541109274011724, + "flos": 32160411463680.0, + "grad_norm": 548.4434083033489, + "language_loss": 1.51649261, + "learning_rate": 1.4146878417666224e-06, + "loss": 1.56279671, + "num_input_tokens_seen": 169055, + "router_z_loss_clip": 16.796875, + "router_z_loss_mlp": 2.63476562, + "step": 9, + "time_per_iteration": 2.7609801292419434 + }, + { + "auxiliary_loss_clip": 0.03260777, + "auxiliary_loss_mlp": 0.01341184, + "balance_loss_clip": 1.60495639, + "balance_loss_mlp": 1.08578992, + "epoch": 0.0006012325266796934, + "flos": 18916808163840.0, + "grad_norm": 54.602955404329194, + "language_loss": 1.49180126, + "learning_rate": 1.4825244398280232e-06, + "loss": 1.53782082, + "num_input_tokens_seen": 188045, + "router_z_loss_clip": 16.546875, + "router_z_loss_mlp": 2.5546875, + "step": 10, + "time_per_iteration": 2.715550422668457 + }, + { + "auxiliary_loss_clip": 0.03292792, + "auxiliary_loss_mlp": 0.01373374, + "balance_loss_clip": 1.61044312, + "balance_loss_mlp": 1.10768056, + "epoch": 0.0006613557793476627, + "flos": 20774861867520.0, + "grad_norm": 97.40042078981905, + "language_loss": 1.4816246, + "learning_rate": 1.5438901072051983e-06, + "loss": 1.52828634, + "num_input_tokens_seen": 207035, + "router_z_loss_clip": 16.828125, + "router_z_loss_mlp": 2.65820312, + "step": 11, + "time_per_iteration": 2.695802927017212 + }, + { + "auxiliary_loss_clip": 0.03240874, + "auxiliary_loss_mlp": 0.01329048, + "balance_loss_clip": 1.60257936, + "balance_loss_mlp": 1.07441664, + "epoch": 0.000721479032015632, + "flos": 16581680997120.0, + "grad_norm": 33.74826511093444, + "language_loss": 1.46082878, + "learning_rate": 1.5999125722696629e-06, + "loss": 1.50652802, + "num_input_tokens_seen": 223225, + "router_z_loss_clip": 16.390625, + "router_z_loss_mlp": 2.54492188, + "step": 12, + "time_per_iteration": 2.7831013202667236 + }, + { + "auxiliary_loss_clip": 0.03259551, + "auxiliary_loss_mlp": 0.01320259, + "balance_loss_clip": 1.60987043, + "balance_loss_mlp": 1.07802606, + "epoch": 0.0007816022846836014, + "flos": 23805471144960.0, + "grad_norm": 52.88327014025935, + "language_loss": 1.32444811, + "learning_rate": 1.6514482443788434e-06, + "loss": 1.37024617, + "num_input_tokens_seen": 242570, + "router_z_loss_clip": 16.484375, + "router_z_loss_mlp": 2.421875, + "step": 13, + "time_per_iteration": 2.719764471054077 + }, + { + "auxiliary_loss_clip": 0.03254492, + "auxiliary_loss_mlp": 0.01332719, + "balance_loss_clip": 1.60936332, + "balance_loss_mlp": 1.07217491, + "epoch": 0.0008417255373515708, + "flos": 19172204841600.0, + "grad_norm": 1457.4443877763251, + "language_loss": 1.37907791, + "learning_rate": 1.6991628240650723e-06, + "loss": 1.42495012, + "num_input_tokens_seen": 261215, + "router_z_loss_clip": 16.4765625, + "router_z_loss_mlp": 2.60546875, + "step": 14, + "time_per_iteration": 2.833561420440674 + }, + { + "auxiliary_loss_clip": 0.03223786, + "auxiliary_loss_mlp": 0.0133393, + "balance_loss_clip": 1.60960174, + "balance_loss_mlp": 1.07872701, + "epoch": 0.00090184879001954, + "flos": 26395564026240.0, + "grad_norm": 1348.3446942659677, + "language_loss": 1.16258526, + "learning_rate": 1.7435840350181584e-06, + "loss": 1.20816243, + "num_input_tokens_seen": 280035, + "router_z_loss_clip": 16.1484375, + "router_z_loss_mlp": 2.55078125, + "step": 15, + "time_per_iteration": 2.720165729522705 + }, + { + "auxiliary_loss_clip": 0.03210462, + "auxiliary_loss_mlp": 0.01316266, + "balance_loss_clip": 1.59970784, + "balance_loss_mlp": 1.07193434, + "epoch": 0.0009619720426875094, + "flos": 24679500785280.0, + "grad_norm": 22.782794260985284, + "language_loss": 1.13624287, + "learning_rate": 1.7851373027727038e-06, + "loss": 1.18151021, + "num_input_tokens_seen": 300265, + "router_z_loss_clip": 16.109375, + "router_z_loss_mlp": 2.44140625, + "step": 16, + "time_per_iteration": 2.729309558868408 + }, + { + "auxiliary_loss_clip": 0.03191205, + "auxiliary_loss_mlp": 0.01315719, + "balance_loss_clip": 1.60227561, + "balance_loss_mlp": 1.06967103, + "epoch": 0.0010220952953554788, + "flos": 18624531196800.0, + "grad_norm": 920.2190802851648, + "language_loss": 1.14784718, + "learning_rate": 1.8241705979033208e-06, + "loss": 1.19291639, + "num_input_tokens_seen": 317375, + "router_z_loss_clip": 15.8828125, + "router_z_loss_mlp": 2.45898438, + "step": 17, + "time_per_iteration": 2.7290642261505127 + }, + { + "auxiliary_loss_clip": 0.03138035, + "auxiliary_loss_mlp": 0.01302877, + "balance_loss_clip": 1.59850764, + "balance_loss_mlp": 1.06731999, + "epoch": 0.001082218548023448, + "flos": 26142537646080.0, + "grad_norm": 20.61944900501725, + "language_loss": 1.11251807, + "learning_rate": 1.860972167459798e-06, + "loss": 1.15692711, + "num_input_tokens_seen": 337975, + "router_z_loss_clip": 15.3828125, + "router_z_loss_mlp": 2.35546875, + "step": 18, + "time_per_iteration": 2.727212905883789 + }, + { + "auxiliary_loss_clip": 0.03155722, + "auxiliary_loss_mlp": 0.01321716, + "balance_loss_clip": 1.60159111, + "balance_loss_mlp": 1.06193519, + "epoch": 0.0011423418006914173, + "flos": 19609776322560.0, + "grad_norm": 110.53388480760488, + "language_loss": 1.05804229, + "learning_rate": 1.89578346593066e-06, + "loss": 1.10281658, + "num_input_tokens_seen": 356635, + "router_z_loss_clip": 15.5390625, + "router_z_loss_mlp": 2.59765625, + "step": 19, + "time_per_iteration": 4.157699108123779 + }, + { + "auxiliary_loss_clip": 0.03110102, + "auxiliary_loss_mlp": 0.01308076, + "balance_loss_clip": 1.59889913, + "balance_loss_mlp": 1.07595205, + "epoch": 0.0012024650533593868, + "flos": 17895365107200.0, + "grad_norm": 265.8383866319236, + "language_loss": 1.21631849, + "learning_rate": 1.928808765521199e-06, + "loss": 1.26050031, + "num_input_tokens_seen": 375625, + "router_z_loss_clip": 15.109375, + "router_z_loss_mlp": 2.32226562, + "step": 20, + "time_per_iteration": 4.193562030792236 + }, + { + "auxiliary_loss_clip": 0.03097398, + "auxiliary_loss_mlp": 0.01302734, + "balance_loss_clip": 1.5930295, + "balance_loss_mlp": 1.05668616, + "epoch": 0.001262588306027356, + "flos": 21252043071360.0, + "grad_norm": 1041.224967491002, + "language_loss": 1.11708641, + "learning_rate": 1.9602224192552076e-06, + "loss": 1.16108775, + "num_input_tokens_seen": 394350, + "router_z_loss_clip": 15.046875, + "router_z_loss_mlp": 2.4609375, + "step": 21, + "time_per_iteration": 2.689336061477661 + }, + { + "auxiliary_loss_clip": 0.03089181, + "auxiliary_loss_mlp": 0.01326468, + "balance_loss_clip": 1.58431077, + "balance_loss_mlp": 1.07984781, + "epoch": 0.0013227115586953253, + "flos": 26104077158400.0, + "grad_norm": 62.80105985588221, + "language_loss": 1.1550374, + "learning_rate": 1.9901744328983746e-06, + "loss": 1.19919395, + "num_input_tokens_seen": 413255, + "router_z_loss_clip": 15.046875, + "router_z_loss_mlp": 2.46289062, + "step": 22, + "time_per_iteration": 2.823751926422119 + }, + { + "auxiliary_loss_clip": 0.03085493, + "auxiliary_loss_mlp": 0.01307047, + "balance_loss_clip": 1.58930326, + "balance_loss_mlp": 1.07168055, + "epoch": 0.0013828348113632948, + "flos": 23951376190080.0, + "grad_norm": 34.61602067786599, + "language_loss": 0.9921748, + "learning_rate": 2.018794797290208e-06, + "loss": 1.03610015, + "num_input_tokens_seen": 433065, + "router_z_loss_clip": 14.9765625, + "router_z_loss_mlp": 2.3515625, + "step": 23, + "time_per_iteration": 2.759662628173828 + }, + { + "auxiliary_loss_clip": 0.03078863, + "auxiliary_loss_mlp": 0.0128745, + "balance_loss_clip": 1.586483, + "balance_loss_mlp": 1.04807782, + "epoch": 0.001442958064031264, + "flos": 15959851724160.0, + "grad_norm": 28.24948465353262, + "language_loss": 1.16537142, + "learning_rate": 2.046196897962839e-06, + "loss": 1.20903444, + "num_input_tokens_seen": 451175, + "router_z_loss_clip": 14.921875, + "router_z_loss_mlp": 2.39257812, + "step": 24, + "time_per_iteration": 2.8151495456695557 + }, + { + "auxiliary_loss_clip": 0.02970239, + "auxiliary_loss_mlp": 0.01298334, + "balance_loss_clip": 1.57416224, + "balance_loss_mlp": 1.06144118, + "epoch": 0.0015030813166992333, + "flos": 18108350801280.0, + "grad_norm": 2130.6784355704317, + "language_loss": 1.13982749, + "learning_rate": 2.0724802282696944e-06, + "loss": 1.18251324, + "num_input_tokens_seen": 468775, + "router_z_loss_clip": 13.96875, + "router_z_loss_mlp": 2.36914062, + "step": 25, + "time_per_iteration": 2.794750213623047 + }, + { + "auxiliary_loss_clip": 0.0297638, + "auxiliary_loss_mlp": 0.0129326, + "balance_loss_clip": 1.57925808, + "balance_loss_mlp": 1.05064487, + "epoch": 0.0015632045693672028, + "flos": 22234558763520.0, + "grad_norm": 90.52587352155847, + "language_loss": 1.13118839, + "learning_rate": 2.0977325700720194e-06, + "loss": 1.17388487, + "num_input_tokens_seen": 488530, + "router_z_loss_clip": 13.984375, + "router_z_loss_mlp": 2.42773438, + "step": 26, + "time_per_iteration": 2.72493314743042 + }, + { + "auxiliary_loss_clip": 0.0295081, + "auxiliary_loss_mlp": 0.01281096, + "balance_loss_clip": 1.57585168, + "balance_loss_mlp": 1.04305911, + "epoch": 0.001623327822035172, + "flos": 23991955580160.0, + "grad_norm": 23.692710902198804, + "language_loss": 0.99895966, + "learning_rate": 2.122031762649933e-06, + "loss": 1.0412786, + "num_input_tokens_seen": 510495, + "router_z_loss_clip": 13.7421875, + "router_z_loss_mlp": 2.37695312, + "step": 27, + "time_per_iteration": 2.828873634338379 + }, + { + "auxiliary_loss_clip": 0.02924742, + "auxiliary_loss_mlp": 0.01286247, + "balance_loss_clip": 1.56620121, + "balance_loss_mlp": 1.06175172, + "epoch": 0.0016834510747031415, + "flos": 19677647070720.0, + "grad_norm": 31.75135772225323, + "language_loss": 1.11496043, + "learning_rate": 2.1454471497582483e-06, + "loss": 1.1570704, + "num_input_tokens_seen": 528605, + "router_z_loss_clip": 13.59375, + "router_z_loss_mlp": 2.25, + "step": 28, + "time_per_iteration": 2.720031976699829 + }, + { + "auxiliary_loss_clip": 0.0287684, + "auxiliary_loss_mlp": 0.01300301, + "balance_loss_clip": 1.55965185, + "balance_loss_mlp": 1.0716095, + "epoch": 0.0017435743273711108, + "flos": 20923819568640.0, + "grad_norm": 66.86658138440043, + "language_loss": 1.08543491, + "learning_rate": 2.1680407726407727e-06, + "loss": 1.12720633, + "num_input_tokens_seen": 548515, + "router_z_loss_clip": 13.1484375, + "router_z_loss_mlp": 2.28710938, + "step": 29, + "time_per_iteration": 2.7586584091186523 + }, + { + "auxiliary_loss_clip": 0.02855522, + "auxiliary_loss_mlp": 0.01310865, + "balance_loss_clip": 1.55363822, + "balance_loss_mlp": 1.07368672, + "epoch": 0.00180369758003908, + "flos": 19528976678400.0, + "grad_norm": 62.02926246580285, + "language_loss": 1.23949707, + "learning_rate": 2.189868360711334e-06, + "loss": 1.28116107, + "num_input_tokens_seen": 564025, + "router_z_loss_clip": 13.0078125, + "router_z_loss_mlp": 2.37011719, + "step": 30, + "time_per_iteration": 2.653543710708618 + }, + { + "auxiliary_loss_clip": 0.02808271, + "auxiliary_loss_mlp": 0.01299935, + "balance_loss_clip": 1.5520829, + "balance_loss_mlp": 1.07725203, + "epoch": 0.0018638208327070496, + "flos": 27453169100160.0, + "grad_norm": 42.371809069278704, + "language_loss": 1.09587204, + "learning_rate": 2.2109801597326265e-06, + "loss": 1.13695407, + "num_input_tokens_seen": 583345, + "router_z_loss_clip": 12.5625, + "router_z_loss_mlp": 2.22558594, + "step": 31, + "time_per_iteration": 2.7936320304870605 + }, + { + "auxiliary_loss_clip": 0.02761796, + "auxiliary_loss_mlp": 0.01328903, + "balance_loss_clip": 1.55384374, + "balance_loss_mlp": 1.10364532, + "epoch": 0.0019239440853750188, + "flos": 13589460380160.0, + "grad_norm": 21.60605904386998, + "language_loss": 1.01039648, + "learning_rate": 2.2314216284658796e-06, + "loss": 1.05130339, + "num_input_tokens_seen": 600010, + "router_z_loss_clip": 12.078125, + "router_z_loss_mlp": 2.25390625, + "step": 32, + "time_per_iteration": 2.6848011016845703 + }, + { + "auxiliary_loss_clip": 0.02753326, + "auxiliary_loss_mlp": 0.01331473, + "balance_loss_clip": 1.54402947, + "balance_loss_mlp": 1.10354495, + "epoch": 0.001984067338042988, + "flos": 11253866336640.0, + "grad_norm": 49.09215801277915, + "language_loss": 1.02446342, + "learning_rate": 2.2512340280885094e-06, + "loss": 1.06531143, + "num_input_tokens_seen": 616295, + "router_z_loss_clip": 12.0859375, + "router_z_loss_mlp": 2.28320312, + "step": 33, + "time_per_iteration": 2.701322317123413 + }, + { + "auxiliary_loss_clip": 0.02702875, + "auxiliary_loss_mlp": 0.01312201, + "balance_loss_clip": 1.54197383, + "balance_loss_mlp": 1.09905469, + "epoch": 0.0020441905907109576, + "flos": 22386245898240.0, + "grad_norm": 65.90723670698493, + "language_loss": 0.94838488, + "learning_rate": 2.270454923596497e-06, + "loss": 0.98853564, + "num_input_tokens_seen": 637640, + "router_z_loss_clip": 11.6015625, + "router_z_loss_mlp": 2.12988281, + "step": 34, + "time_per_iteration": 2.764930009841919 + }, + { + "auxiliary_loss_clip": 0.02684131, + "auxiliary_loss_mlp": 0.01303448, + "balance_loss_clip": 1.52230144, + "balance_loss_mlp": 1.08610559, + "epoch": 0.0021043138433789266, + "flos": 49778580337920.0, + "grad_norm": 32.86618588314514, + "language_loss": 0.82620692, + "learning_rate": 2.2891186125067434e-06, + "loss": 0.86608279, + "num_input_tokens_seen": 659710, + "router_z_loss_clip": 11.625, + "router_z_loss_mlp": 2.17480469, + "step": 35, + "time_per_iteration": 2.9288556575775146 + }, + { + "auxiliary_loss_clip": 0.02669394, + "auxiliary_loss_mlp": 0.01346577, + "balance_loss_clip": 1.53418922, + "balance_loss_mlp": 1.12990212, + "epoch": 0.002164437096046896, + "flos": 20557961591040.0, + "grad_norm": 444.95011720707805, + "language_loss": 0.93787003, + "learning_rate": 2.307256493152974e-06, + "loss": 0.97802973, + "num_input_tokens_seen": 679670, + "router_z_loss_clip": 11.3515625, + "router_z_loss_mlp": 2.16796875, + "step": 36, + "time_per_iteration": 2.775209903717041 + }, + { + "auxiliary_loss_clip": 0.02642179, + "auxiliary_loss_mlp": 0.01322472, + "balance_loss_clip": 1.53311217, + "balance_loss_mlp": 1.10903931, + "epoch": 0.0022245603487148656, + "flos": 26542295084160.0, + "grad_norm": 35.91763941301846, + "language_loss": 0.98899949, + "learning_rate": 2.3248973825097614e-06, + "loss": 1.02864599, + "num_input_tokens_seen": 700170, + "router_z_loss_clip": 11.0859375, + "router_z_loss_mlp": 2.13476562, + "step": 37, + "time_per_iteration": 2.7308075428009033 + }, + { + "auxiliary_loss_clip": 0.02620757, + "auxiliary_loss_mlp": 0.01316861, + "balance_loss_clip": 1.53019023, + "balance_loss_mlp": 1.12459993, + "epoch": 0.0022846836013828346, + "flos": 20338188226560.0, + "grad_norm": 22.968780722988555, + "language_loss": 1.07722783, + "learning_rate": 2.3420677916238357e-06, + "loss": 1.11660409, + "num_input_tokens_seen": 718545, + "router_z_loss_clip": 10.8984375, + "router_z_loss_mlp": 1.92285156, + "step": 38, + "time_per_iteration": 2.69674015045166 + }, + { + "auxiliary_loss_clip": 0.02601211, + "auxiliary_loss_mlp": 0.01287829, + "balance_loss_clip": 1.52835381, + "balance_loss_mlp": 1.09060931, + "epoch": 0.002344806854050804, + "flos": 26247575992320.0, + "grad_norm": 99.21884267032314, + "language_loss": 0.89673287, + "learning_rate": 2.358792165262154e-06, + "loss": 0.93562323, + "num_input_tokens_seen": 739865, + "router_z_loss_clip": 10.75, + "router_z_loss_mlp": 1.97363281, + "step": 39, + "time_per_iteration": 2.737496852874756 + }, + { + "auxiliary_loss_clip": 0.02588993, + "auxiliary_loss_mlp": 0.01328981, + "balance_loss_clip": 1.52042925, + "balance_loss_mlp": 1.11154342, + "epoch": 0.0024049301067187736, + "flos": 11801539981440.0, + "grad_norm": 123.46383223716182, + "language_loss": 0.96389353, + "learning_rate": 2.3750930912143747e-06, + "loss": 1.00307322, + "num_input_tokens_seen": 755770, + "router_z_loss_clip": 10.6953125, + "router_z_loss_mlp": 2.17480469, + "step": 40, + "time_per_iteration": 2.6452231407165527 + }, + { + "auxiliary_loss_clip": 0.02517774, + "auxiliary_loss_mlp": 0.013075, + "balance_loss_clip": 1.50534654, + "balance_loss_mlp": 1.10551167, + "epoch": 0.0024650533593867426, + "flos": 20631506688000.0, + "grad_norm": 62.80579277283349, + "language_loss": 0.99583501, + "learning_rate": 2.3909914837471044e-06, + "loss": 1.03408778, + "num_input_tokens_seen": 773440, + "router_z_loss_clip": 10.1328125, + "router_z_loss_mlp": 2.02050781, + "step": 41, + "time_per_iteration": 2.6673805713653564 + }, + { + "auxiliary_loss_clip": 0.02477733, + "auxiliary_loss_mlp": 0.01301428, + "balance_loss_clip": 1.4929018, + "balance_loss_mlp": 1.1058296, + "epoch": 0.002525176612054712, + "flos": 18406122549120.0, + "grad_norm": 90.31424620439675, + "language_loss": 1.01721597, + "learning_rate": 2.4065067449483835e-06, + "loss": 1.05500758, + "num_input_tokens_seen": 790455, + "router_z_loss_clip": 9.859375, + "router_z_loss_mlp": 1.95703125, + "step": 42, + "time_per_iteration": 2.6653618812561035 + }, + { + "auxiliary_loss_clip": 0.02434214, + "auxiliary_loss_mlp": 0.01327007, + "balance_loss_clip": 1.48420453, + "balance_loss_mlp": 1.12749863, + "epoch": 0.0025852998647226816, + "flos": 28184023128960.0, + "grad_norm": 65.64319758603754, + "language_loss": 1.0512743, + "learning_rate": 2.4216569070848724e-06, + "loss": 1.0888865, + "num_input_tokens_seen": 810645, + "router_z_loss_clip": 9.5, + "router_z_loss_mlp": 1.99414062, + "step": 43, + "time_per_iteration": 2.7091448307037354 + }, + { + "auxiliary_loss_clip": 0.02462031, + "auxiliary_loss_mlp": 0.01331977, + "balance_loss_clip": 1.48602533, + "balance_loss_mlp": 1.11434817, + "epoch": 0.0026454231173906506, + "flos": 14283110897280.0, + "grad_norm": 47.931902994763774, + "language_loss": 1.0010066, + "learning_rate": 2.4364587585915504e-06, + "loss": 1.03894663, + "num_input_tokens_seen": 827470, + "router_z_loss_clip": 9.7421875, + "router_z_loss_mlp": 2.17675781, + "step": 44, + "time_per_iteration": 2.6538941860198975 + }, + { + "auxiliary_loss_clip": 0.0243126, + "auxiliary_loss_mlp": 0.01321845, + "balance_loss_clip": 1.48442781, + "balance_loss_mlp": 1.12376654, + "epoch": 0.00270554637005862, + "flos": 22419211605120.0, + "grad_norm": 140.88107798702873, + "language_loss": 1.05196345, + "learning_rate": 2.450927955901469e-06, + "loss": 1.08949459, + "num_input_tokens_seen": 847285, + "router_z_loss_clip": 9.46875, + "router_z_loss_mlp": 1.98046875, + "step": 45, + "time_per_iteration": 2.715353012084961 + }, + { + "auxiliary_loss_clip": 0.0240242, + "auxiliary_loss_mlp": 0.01341412, + "balance_loss_clip": 1.48302877, + "balance_loss_mlp": 1.15086818, + "epoch": 0.0027656696227265896, + "flos": 23985778440960.0, + "grad_norm": 14.239642536771683, + "language_loss": 1.06964588, + "learning_rate": 2.465079122983384e-06, + "loss": 1.10708416, + "num_input_tokens_seen": 867545, + "router_z_loss_clip": 9.1875, + "router_z_loss_mlp": 1.90722656, + "step": 46, + "time_per_iteration": 2.7373414039611816 + }, + { + "auxiliary_loss_clip": 0.02392466, + "auxiliary_loss_mlp": 0.01309438, + "balance_loss_clip": 1.47576904, + "balance_loss_mlp": 1.12461567, + "epoch": 0.0028257928753945586, + "flos": 37669503087360.0, + "grad_norm": 485.89104997546167, + "language_loss": 0.94949025, + "learning_rate": 2.4789259401737868e-06, + "loss": 0.98650926, + "num_input_tokens_seen": 889915, + "router_z_loss_clip": 9.171875, + "router_z_loss_mlp": 1.84765625, + "step": 47, + "time_per_iteration": 2.8968899250030518 + }, + { + "auxiliary_loss_clip": 0.02338982, + "auxiliary_loss_mlp": 0.01305856, + "balance_loss_clip": 1.46775496, + "balance_loss_mlp": 1.13085663, + "epoch": 0.002885916128062528, + "flos": 22454547609600.0, + "grad_norm": 37.85568929195237, + "language_loss": 0.92363203, + "learning_rate": 2.492481223656015e-06, + "loss": 0.96008039, + "num_input_tokens_seen": 908975, + "router_z_loss_clip": 8.7109375, + "router_z_loss_mlp": 1.75195312, + "step": 48, + "time_per_iteration": 2.716738700866699 + }, + { + "auxiliary_loss_clip": 0.02336327, + "auxiliary_loss_mlp": 0.01283174, + "balance_loss_clip": 1.4516778, + "balance_loss_mlp": 1.09577692, + "epoch": 0.0029460393807304976, + "flos": 27012796358400.0, + "grad_norm": 19.500970315534268, + "language_loss": 0.95528328, + "learning_rate": 2.5057569967437924e-06, + "loss": 0.99147832, + "num_input_tokens_seen": 929810, + "router_z_loss_clip": 8.84375, + "router_z_loss_mlp": 1.87402344, + "step": 49, + "time_per_iteration": 2.847170829772949 + }, + { + "auxiliary_loss_clip": 0.02329412, + "auxiliary_loss_mlp": 0.01258851, + "balance_loss_clip": 1.45790052, + "balance_loss_mlp": 1.0739336, + "epoch": 0.0030061626333984666, + "flos": 15851832549120.0, + "grad_norm": 5.603780464897849, + "language_loss": 0.96002185, + "learning_rate": 2.51876455396287e-06, + "loss": 0.99590451, + "num_input_tokens_seen": 948650, + "router_z_loss_clip": 8.734375, + "router_z_loss_mlp": 1.84960938, + "step": 50, + "time_per_iteration": 2.8095386028289795 + }, + { + "auxiliary_loss_clip": 0.02309276, + "auxiliary_loss_mlp": 0.01242595, + "balance_loss_clip": 1.45265245, + "balance_loss_mlp": 1.07083797, + "epoch": 0.003066285886066436, + "flos": 31827052316160.0, + "grad_norm": 28.45143537386336, + "language_loss": 0.95147896, + "learning_rate": 2.5315145187866316e-06, + "loss": 0.98699766, + "num_input_tokens_seen": 966455, + "router_z_loss_clip": 8.5546875, + "router_z_loss_mlp": 1.71679688, + "step": 51, + "time_per_iteration": 2.8305585384368896 + }, + { + "auxiliary_loss_clip": 0.02256136, + "auxiliary_loss_mlp": 0.01225732, + "balance_loss_clip": 1.43516695, + "balance_loss_mlp": 1.06351173, + "epoch": 0.0031264091387344056, + "flos": 41427482774400.0, + "grad_norm": 94.42222103638578, + "language_loss": 1.00930095, + "learning_rate": 2.5440168957651953e-06, + "loss": 1.04411972, + "num_input_tokens_seen": 988110, + "router_z_loss_clip": 8.21484375, + "router_z_loss_mlp": 1.62207031, + "step": 52, + "time_per_iteration": 2.8783364295959473 + }, + { + "auxiliary_loss_clip": 0.02247874, + "auxiliary_loss_mlp": 0.01242103, + "balance_loss_clip": 1.43771672, + "balance_loss_mlp": 1.06796217, + "epoch": 0.0031865323914023747, + "flos": 23440941970560.0, + "grad_norm": 63312.633389651266, + "language_loss": 0.97376406, + "learning_rate": 2.5562811176888872e-06, + "loss": 1.00866389, + "num_input_tokens_seen": 1008550, + "router_z_loss_clip": 8.1015625, + "router_z_loss_mlp": 1.74023438, + "step": 53, + "time_per_iteration": 2.6935160160064697 + }, + { + "auxiliary_loss_clip": 0.02221918, + "auxiliary_loss_mlp": 0.01199666, + "balance_loss_clip": 1.43675661, + "balance_loss_mlp": 1.03992581, + "epoch": 0.003246655644070344, + "flos": 14429195510400.0, + "grad_norm": 247.4179747170599, + "language_loss": 0.89060074, + "learning_rate": 2.5683160883431093e-06, + "loss": 0.92481649, + "num_input_tokens_seen": 1026840, + "router_z_loss_clip": 7.8671875, + "router_z_loss_mlp": 1.59765625, + "step": 54, + "time_per_iteration": 2.752204418182373 + }, + { + "auxiliary_loss_clip": 0.02200308, + "auxiliary_loss_mlp": 0.01194186, + "balance_loss_clip": 1.42074502, + "balance_loss_mlp": 1.03787887, + "epoch": 0.0033067788967383136, + "flos": 35918247496320.0, + "grad_norm": 101.76188740785923, + "language_loss": 0.89212346, + "learning_rate": 2.580130221340046e-06, + "loss": 0.92606837, + "num_input_tokens_seen": 1048875, + "router_z_loss_clip": 7.79296875, + "router_z_loss_mlp": 1.56445312, + "step": 55, + "time_per_iteration": 2.822024345397949 + }, + { + "auxiliary_loss_clip": 0.02183447, + "auxiliary_loss_mlp": 0.01195923, + "balance_loss_clip": 1.40635633, + "balance_loss_mlp": 1.02960277, + "epoch": 0.003366902149406283, + "flos": 22958732862720.0, + "grad_norm": 172.27893092354108, + "language_loss": 0.9559992, + "learning_rate": 2.5917314754514246e-06, + "loss": 0.98979294, + "num_input_tokens_seen": 1066435, + "router_z_loss_clip": 7.7734375, + "router_z_loss_mlp": 1.66503906, + "step": 56, + "time_per_iteration": 2.7651193141937256 + }, + { + "auxiliary_loss_clip": 0.0220836, + "auxiliary_loss_mlp": 0.01165465, + "balance_loss_clip": 1.41535497, + "balance_loss_mlp": 1.01163781, + "epoch": 0.003427025402074252, + "flos": 26582838560640.0, + "grad_norm": 23.064460238577755, + "language_loss": 0.98894596, + "learning_rate": 2.6031273868139713e-06, + "loss": 1.02268422, + "num_input_tokens_seen": 1090330, + "router_z_loss_clip": 7.92578125, + "router_z_loss_mlp": 1.53808594, + "step": 57, + "time_per_iteration": 2.7371528148651123 + }, + { + "auxiliary_loss_clip": 0.02138088, + "auxiliary_loss_mlp": 0.01206659, + "balance_loss_clip": 1.40319359, + "balance_loss_mlp": 1.05254483, + "epoch": 0.0034871486547422216, + "flos": 23951196622080.0, + "grad_norm": 103.12747954339874, + "language_loss": 1.04919195, + "learning_rate": 2.614325098333948e-06, + "loss": 1.08263934, + "num_input_tokens_seen": 1109840, + "router_z_loss_clip": 7.34765625, + "router_z_loss_mlp": 1.54101562, + "step": 58, + "time_per_iteration": 2.6921706199645996 + }, + { + "auxiliary_loss_clip": 0.02156405, + "auxiliary_loss_mlp": 0.01169474, + "balance_loss_clip": 1.41648829, + "balance_loss_mlp": 1.01764917, + "epoch": 0.003547271907410191, + "flos": 21214983214080.0, + "grad_norm": 145.14848650609827, + "language_loss": 0.94359553, + "learning_rate": 2.625331386578098e-06, + "loss": 0.97685438, + "num_input_tokens_seen": 1128415, + "router_z_loss_clip": 7.3984375, + "router_z_loss_mlp": 1.51660156, + "step": 59, + "time_per_iteration": 2.658005952835083 + }, + { + "auxiliary_loss_clip": 0.02177339, + "auxiliary_loss_mlp": 0.01133527, + "balance_loss_clip": 1.42449474, + "balance_loss_mlp": 0.98580259, + "epoch": 0.00360739516007816, + "flos": 16504903676160.0, + "grad_norm": 142.4494646255864, + "language_loss": 1.00314593, + "learning_rate": 2.63615268640451e-06, + "loss": 1.03625464, + "num_input_tokens_seen": 1146515, + "router_z_loss_clip": 7.5390625, + "router_z_loss_mlp": 1.47558594, + "step": 60, + "time_per_iteration": 2.7726120948791504 + }, + { + "auxiliary_loss_clip": 0.02182476, + "auxiliary_loss_mlp": 0.01171676, + "balance_loss_clip": 1.4197166, + "balance_loss_mlp": 1.02185404, + "epoch": 0.0036675184127461296, + "flos": 19464805031040.0, + "grad_norm": 77.88206981789193, + "language_loss": 0.96767461, + "learning_rate": 2.6467951135575943e-06, + "loss": 1.00121617, + "num_input_tokens_seen": 1166330, + "router_z_loss_clip": 7.6328125, + "router_z_loss_mlp": 1.5, + "step": 61, + "time_per_iteration": 4.129663944244385 + }, + { + "auxiliary_loss_clip": 0.02184578, + "auxiliary_loss_mlp": 0.01115908, + "balance_loss_clip": 1.42362571, + "balance_loss_mlp": 0.97962797, + "epoch": 0.003727641665414099, + "flos": 20957323979520.0, + "grad_norm": 20.725746417473825, + "language_loss": 0.93849456, + "learning_rate": 2.657264485425803e-06, + "loss": 0.97149944, + "num_input_tokens_seen": 1186010, + "router_z_loss_clip": 7.6015625, + "router_z_loss_mlp": 1.36425781, + "step": 62, + "time_per_iteration": 4.0925445556640625 + }, + { + "auxiliary_loss_clip": 0.02194393, + "auxiliary_loss_mlp": 0.01165293, + "balance_loss_clip": 1.42796373, + "balance_loss_mlp": 1.00993955, + "epoch": 0.003787764918082068, + "flos": 18406050721920.0, + "grad_norm": 27.073009705191886, + "language_loss": 1.01741993, + "learning_rate": 2.6675663401385186e-06, + "loss": 1.05101681, + "num_input_tokens_seen": 1204985, + "router_z_loss_clip": 7.6640625, + "router_z_loss_mlp": 1.55566406, + "step": 63, + "time_per_iteration": 2.694244861602783 + }, + { + "auxiliary_loss_clip": 0.02193907, + "auxiliary_loss_mlp": 0.01098423, + "balance_loss_clip": 1.42856252, + "balance_loss_mlp": 0.96071202, + "epoch": 0.0038478881707500376, + "flos": 12459243962880.0, + "grad_norm": 1335.0696375835914, + "language_loss": 1.06253374, + "learning_rate": 2.677705954159056e-06, + "loss": 1.09545708, + "num_input_tokens_seen": 1223545, + "router_z_loss_clip": 7.65234375, + "router_z_loss_mlp": 1.37695312, + "step": 64, + "time_per_iteration": 2.651221990585327 + }, + { + "auxiliary_loss_clip": 0.02214258, + "auxiliary_loss_mlp": 0.01120772, + "balance_loss_clip": 1.43472815, + "balance_loss_mlp": 0.9811545, + "epoch": 0.003908011423418007, + "flos": 13553334276480.0, + "grad_norm": 173.61352144807333, + "language_loss": 0.92714763, + "learning_rate": 2.6876883585136904e-06, + "loss": 0.96049798, + "num_input_tokens_seen": 1241175, + "router_z_loss_clip": 7.80078125, + "router_z_loss_mlp": 1.39648438, + "step": 65, + "time_per_iteration": 2.6508736610412598 + }, + { + "auxiliary_loss_clip": 0.02212337, + "auxiliary_loss_mlp": 0.01121401, + "balance_loss_clip": 1.42848206, + "balance_loss_mlp": 0.97749192, + "epoch": 0.003968134676085976, + "flos": 18333475292160.0, + "grad_norm": 37.07558923721963, + "language_loss": 0.92252994, + "learning_rate": 2.697518353781685e-06, + "loss": 0.95586729, + "num_input_tokens_seen": 1259315, + "router_z_loss_clip": 7.83203125, + "router_z_loss_mlp": 1.43945312, + "step": 66, + "time_per_iteration": 2.714247941970825 + }, + { + "auxiliary_loss_clip": 0.02225378, + "auxiliary_loss_mlp": 0.01124971, + "balance_loss_clip": 1.43057001, + "balance_loss_mlp": 0.97419536, + "epoch": 0.004028257928753946, + "flos": 20485242506880.0, + "grad_norm": 140.09729530731164, + "language_loss": 1.02202392, + "learning_rate": 2.7072005239581103e-06, + "loss": 1.05552721, + "num_input_tokens_seen": 1277055, + "router_z_loss_clip": 7.953125, + "router_z_loss_mlp": 1.5078125, + "step": 67, + "time_per_iteration": 2.6621944904327393 + }, + { + "auxiliary_loss_clip": 0.02190293, + "auxiliary_loss_mlp": 0.0113862, + "balance_loss_clip": 1.42736208, + "balance_loss_mlp": 1.00233984, + "epoch": 0.004088381181421915, + "flos": 18843837684480.0, + "grad_norm": 22.033752812797886, + "language_loss": 1.00362921, + "learning_rate": 2.7167392492896727e-06, + "loss": 1.0369184, + "num_input_tokens_seen": 1294355, + "router_z_loss_clip": 7.62890625, + "router_z_loss_mlp": 1.36328125, + "step": 68, + "time_per_iteration": 2.664377450942993 + }, + { + "auxiliary_loss_clip": 0.02195274, + "auxiliary_loss_mlp": 0.01110621, + "balance_loss_clip": 1.42951298, + "balance_loss_mlp": 0.97691542, + "epoch": 0.004148504434089885, + "flos": 19427817000960.0, + "grad_norm": 19.55597657055439, + "language_loss": 1.01218116, + "learning_rate": 2.7261387181735195e-06, + "loss": 1.04524004, + "num_input_tokens_seen": 1313525, + "router_z_loss_clip": 7.6640625, + "router_z_loss_mlp": 1.33789062, + "step": 69, + "time_per_iteration": 2.678802490234375 + }, + { + "auxiliary_loss_clip": 0.02196015, + "auxiliary_loss_mlp": 0.01142339, + "balance_loss_clip": 1.42513716, + "balance_loss_mlp": 1.00348401, + "epoch": 0.004208627686757853, + "flos": 20811023884800.0, + "grad_norm": 125.86886574365333, + "language_loss": 1.05500841, + "learning_rate": 2.7354029381999196e-06, + "loss": 1.0883919, + "num_input_tokens_seen": 1330505, + "router_z_loss_clip": 7.70703125, + "router_z_loss_mlp": 1.38769531, + "step": 70, + "time_per_iteration": 2.725579023361206 + }, + { + "auxiliary_loss_clip": 0.02225095, + "auxiliary_loss_mlp": 0.01086273, + "balance_loss_clip": 1.43085718, + "balance_loss_mlp": 0.95094681, + "epoch": 0.004268750939425823, + "flos": 19098623831040.0, + "grad_norm": 50.70332776348157, + "language_loss": 1.06066394, + "learning_rate": 2.7445357464116983e-06, + "loss": 1.09377766, + "num_input_tokens_seen": 1349615, + "router_z_loss_clip": 7.94140625, + "router_z_loss_mlp": 1.35449219, + "step": 71, + "time_per_iteration": 2.7464962005615234 + }, + { + "auxiliary_loss_clip": 0.02049182, + "auxiliary_loss_mlp": 0.00866868, + "balance_loss_clip": 1.43269873, + "balance_loss_mlp": 0.78904808, + "epoch": 0.004328874192093792, + "flos": 52439635514880.0, + "grad_norm": 2.4793740484044093, + "language_loss": 0.65862, + "learning_rate": 2.75354081884615e-06, + "loss": 0.6877805, + "num_input_tokens_seen": 1410275, + "router_z_loss_clip": 6.1875, + "router_z_loss_mlp": 0.77734375, + "step": 72, + "time_per_iteration": 3.345369815826416 + }, + { + "auxiliary_loss_clip": 0.02003517, + "auxiliary_loss_mlp": 0.00987034, + "balance_loss_clip": 1.41772044, + "balance_loss_mlp": 0.90921462, + "epoch": 0.004388997444761762, + "flos": 66473239564800.0, + "grad_norm": 95.07444356128, + "language_loss": 0.63359886, + "learning_rate": 2.7624216794188286e-06, + "loss": 0.66350436, + "num_input_tokens_seen": 1473020, + "router_z_loss_clip": 5.875, + "router_z_loss_mlp": 0.77734375, + "step": 73, + "time_per_iteration": 3.6851634979248047 + }, + { + "auxiliary_loss_clip": 0.02176877, + "auxiliary_loss_mlp": 0.01064429, + "balance_loss_clip": 1.41189611, + "balance_loss_mlp": 0.93730426, + "epoch": 0.004449120697429731, + "flos": 18952970181120.0, + "grad_norm": 44.927423806678945, + "language_loss": 0.93187869, + "learning_rate": 2.771181708202938e-06, + "loss": 0.96429169, + "num_input_tokens_seen": 1490385, + "router_z_loss_clip": 7.65234375, + "router_z_loss_mlp": 1.27148438, + "step": 74, + "time_per_iteration": 2.992326259613037 + }, + { + "auxiliary_loss_clip": 0.02179771, + "auxiliary_loss_mlp": 0.01088197, + "balance_loss_clip": 1.41291952, + "balance_loss_mlp": 0.95744801, + "epoch": 0.004509243950097701, + "flos": 21105491581440.0, + "grad_norm": 151.1805947336832, + "language_loss": 1.03513861, + "learning_rate": 2.779824149153005e-06, + "loss": 1.06781816, + "num_input_tokens_seen": 1509725, + "router_z_loss_clip": 7.66015625, + "router_z_loss_mlp": 1.30761719, + "step": 75, + "time_per_iteration": 2.9348511695861816 + }, + { + "auxiliary_loss_clip": 0.02158967, + "auxiliary_loss_mlp": 0.01081527, + "balance_loss_clip": 1.40469193, + "balance_loss_mlp": 0.95502257, + "epoch": 0.004569367202765669, + "flos": 20698730991360.0, + "grad_norm": 25.40560884135099, + "language_loss": 0.93477261, + "learning_rate": 2.788352117317012e-06, + "loss": 0.96717757, + "num_input_tokens_seen": 1527245, + "router_z_loss_clip": 7.54296875, + "router_z_loss_mlp": 1.26513672, + "step": 76, + "time_per_iteration": 2.6567721366882324 + }, + { + "auxiliary_loss_clip": 0.02172508, + "auxiliary_loss_mlp": 0.01069911, + "balance_loss_clip": 1.40563083, + "balance_loss_mlp": 0.94402647, + "epoch": 0.004629490455433639, + "flos": 28658474899200.0, + "grad_norm": 19.35003661391888, + "language_loss": 0.96598023, + "learning_rate": 2.796768605577095e-06, + "loss": 0.9984045, + "num_input_tokens_seen": 1548930, + "router_z_loss_clip": 7.671875, + "router_z_loss_mlp": 1.25878906, + "step": 77, + "time_per_iteration": 2.735846996307373 + }, + { + "auxiliary_loss_clip": 0.02160073, + "auxiliary_loss_mlp": 0.0107145, + "balance_loss_clip": 1.40726519, + "balance_loss_mlp": 0.95066726, + "epoch": 0.004689613708101608, + "flos": 11072409805440.0, + "grad_norm": 25.48323626119104, + "language_loss": 0.99792445, + "learning_rate": 2.80507649095533e-06, + "loss": 1.03023958, + "num_input_tokens_seen": 1565695, + "router_z_loss_clip": 7.53515625, + "router_z_loss_mlp": 1.20751953, + "step": 78, + "time_per_iteration": 2.7138967514038086 + }, + { + "auxiliary_loss_clip": 0.02170112, + "auxiliary_loss_mlp": 0.01056534, + "balance_loss_clip": 1.40753651, + "balance_loss_mlp": 0.93827856, + "epoch": 0.004749736960769578, + "flos": 21799106184960.0, + "grad_norm": 241.3534720633038, + "language_loss": 0.89993793, + "learning_rate": 2.813278540517843e-06, + "loss": 0.93220437, + "num_input_tokens_seen": 1582625, + "router_z_loss_clip": 7.62109375, + "router_z_loss_mlp": 1.18066406, + "step": 79, + "time_per_iteration": 2.6628527641296387 + }, + { + "auxiliary_loss_clip": 0.0219081, + "auxiliary_loss_mlp": 0.01064344, + "balance_loss_clip": 1.40085912, + "balance_loss_mlp": 0.93617076, + "epoch": 0.004809860213437547, + "flos": 19792597570560.0, + "grad_norm": 91.02662319578228, + "language_loss": 0.95361269, + "learning_rate": 2.8213774169075505e-06, + "loss": 0.98616421, + "num_input_tokens_seen": 1601725, + "router_z_loss_clip": 7.90234375, + "router_z_loss_mlp": 1.28027344, + "step": 80, + "time_per_iteration": 2.656294822692871 + }, + { + "auxiliary_loss_clip": 0.0217854, + "auxiliary_loss_mlp": 0.01046879, + "balance_loss_clip": 1.40287113, + "balance_loss_mlp": 0.92290121, + "epoch": 0.004869983466105517, + "flos": 26574327037440.0, + "grad_norm": 12.237590315667472, + "language_loss": 1.00894713, + "learning_rate": 2.829375683533245e-06, + "loss": 1.04120135, + "num_input_tokens_seen": 1622420, + "router_z_loss_clip": 7.75, + "router_z_loss_mlp": 1.24023438, + "step": 81, + "time_per_iteration": 2.7014718055725098 + }, + { + "auxiliary_loss_clip": 0.02199176, + "auxiliary_loss_mlp": 0.01068031, + "balance_loss_clip": 1.40907454, + "balance_loss_mlp": 0.93957126, + "epoch": 0.004930106718773485, + "flos": 12823378087680.0, + "grad_norm": 36.006855457394046, + "language_loss": 1.03224838, + "learning_rate": 2.8372758094402803e-06, + "loss": 1.06492043, + "num_input_tokens_seen": 1640715, + "router_z_loss_clip": 7.90234375, + "router_z_loss_mlp": 1.28417969, + "step": 82, + "time_per_iteration": 2.7453949451446533 + }, + { + "auxiliary_loss_clip": 0.02184234, + "auxiliary_loss_mlp": 0.01075117, + "balance_loss_clip": 1.39907908, + "balance_loss_mlp": 0.9440825, + "epoch": 0.004990229971441455, + "flos": 25774919902080.0, + "grad_norm": 464.8680436658363, + "language_loss": 0.92199713, + "learning_rate": 2.84508017388607e-06, + "loss": 0.95459062, + "num_input_tokens_seen": 1662210, + "router_z_loss_clip": 7.8515625, + "router_z_loss_mlp": 1.31152344, + "step": 83, + "time_per_iteration": 2.7285687923431396 + }, + { + "auxiliary_loss_clip": 0.0217107, + "auxiliary_loss_mlp": 0.01081099, + "balance_loss_clip": 1.40076947, + "balance_loss_mlp": 0.95521432, + "epoch": 0.005050353224109424, + "flos": 17457254922240.0, + "grad_norm": 139.8480965057028, + "language_loss": 0.97223222, + "learning_rate": 2.852791070641559e-06, + "loss": 1.00475383, + "num_input_tokens_seen": 1681070, + "router_z_loss_clip": 7.7109375, + "router_z_loss_mlp": 1.25976562, + "step": 84, + "time_per_iteration": 2.6686079502105713 + }, + { + "auxiliary_loss_clip": 0.02067746, + "auxiliary_loss_mlp": 0.01042959, + "balance_loss_clip": 1.42078733, + "balance_loss_mlp": 0.9632315, + "epoch": 0.005110476476777394, + "flos": 69805460367360.0, + "grad_norm": 1.182275793685502, + "language_loss": 0.61750686, + "learning_rate": 2.8604107120381682e-06, + "loss": 0.64861381, + "num_input_tokens_seen": 1747140, + "router_z_loss_clip": 6.4375, + "router_z_loss_mlp": 0.796875, + "step": 85, + "time_per_iteration": 3.2537620067596436 + }, + { + "auxiliary_loss_clip": 0.02156857, + "auxiliary_loss_mlp": 0.01063804, + "balance_loss_clip": 1.39681888, + "balance_loss_mlp": 0.94168591, + "epoch": 0.005170599729445363, + "flos": 24790105739520.0, + "grad_norm": 40.025983457987955, + "language_loss": 0.96041584, + "learning_rate": 2.8679412327780482e-06, + "loss": 0.99262238, + "num_input_tokens_seen": 1767475, + "router_z_loss_clip": 7.59765625, + "router_z_loss_mlp": 1.22119141, + "step": 86, + "time_per_iteration": 2.7150163650512695 + }, + { + "auxiliary_loss_clip": 0.02142156, + "auxiliary_loss_mlp": 0.01070653, + "balance_loss_clip": 1.38717043, + "balance_loss_mlp": 0.94629389, + "epoch": 0.005230722982113333, + "flos": 23258048895360.0, + "grad_norm": 51.90947836306321, + "language_loss": 0.8888706, + "learning_rate": 2.8753846935240833e-06, + "loss": 0.92099869, + "num_input_tokens_seen": 1784980, + "router_z_loss_clip": 7.55078125, + "router_z_loss_mlp": 1.24414062, + "step": 87, + "time_per_iteration": 2.7187232971191406 + }, + { + "auxiliary_loss_clip": 0.02119578, + "auxiliary_loss_mlp": 0.01057995, + "balance_loss_clip": 1.37867022, + "balance_loss_mlp": 0.94050193, + "epoch": 0.005290846234781301, + "flos": 16727909264640.0, + "grad_norm": 67.43317709442191, + "language_loss": 1.00044632, + "learning_rate": 2.8827430842847267e-06, + "loss": 1.03222203, + "num_input_tokens_seen": 1803030, + "router_z_loss_clip": 7.41015625, + "router_z_loss_mlp": 1.17480469, + "step": 88, + "time_per_iteration": 2.732900381088257 + }, + { + "auxiliary_loss_clip": 0.02129642, + "auxiliary_loss_mlp": 0.01021508, + "balance_loss_clip": 1.37711203, + "balance_loss_mlp": 0.90749621, + "epoch": 0.005350969487449271, + "flos": 20886077352960.0, + "grad_norm": 1253.0786690730397, + "language_loss": 0.91518873, + "learning_rate": 2.8900183276075957e-06, + "loss": 0.94670022, + "num_input_tokens_seen": 1822865, + "router_z_loss_clip": 7.52734375, + "router_z_loss_mlp": 1.13916016, + "step": 89, + "time_per_iteration": 2.736537456512451 + }, + { + "auxiliary_loss_clip": 0.02130883, + "auxiliary_loss_mlp": 0.01017509, + "balance_loss_clip": 1.38667631, + "balance_loss_mlp": 0.90893358, + "epoch": 0.00541109274011724, + "flos": 26209977431040.0, + "grad_norm": 20.547820733114815, + "language_loss": 0.98645926, + "learning_rate": 2.8972122815946455e-06, + "loss": 1.01794314, + "num_input_tokens_seen": 1842435, + "router_z_loss_clip": 7.44140625, + "router_z_loss_mlp": 1.08642578, + "step": 90, + "time_per_iteration": 2.72572922706604 + }, + { + "auxiliary_loss_clip": 0.02070509, + "auxiliary_loss_mlp": 0.01016198, + "balance_loss_clip": 1.35954988, + "balance_loss_mlp": 0.90719283, + "epoch": 0.00547121599278521, + "flos": 21178569801600.0, + "grad_norm": 230.31490285568063, + "language_loss": 0.92482209, + "learning_rate": 2.90432674275074e-06, + "loss": 0.95568907, + "num_input_tokens_seen": 1860065, + "router_z_loss_clip": 7.109375, + "router_z_loss_mlp": 1.09033203, + "step": 91, + "time_per_iteration": 2.7694270610809326 + }, + { + "auxiliary_loss_clip": 0.02080731, + "auxiliary_loss_mlp": 0.01033199, + "balance_loss_clip": 1.36302543, + "balance_loss_mlp": 0.92138112, + "epoch": 0.005531339245453179, + "flos": 19718801078400.0, + "grad_norm": 6.924376700314359, + "language_loss": 0.94420964, + "learning_rate": 2.91136344867656e-06, + "loss": 0.97534895, + "num_input_tokens_seen": 1878135, + "router_z_loss_clip": 7.17578125, + "router_z_loss_mlp": 1.1171875, + "step": 92, + "time_per_iteration": 2.7769083976745605 + }, + { + "auxiliary_loss_clip": 0.02088772, + "auxiliary_loss_mlp": 0.01048953, + "balance_loss_clip": 1.36210716, + "balance_loss_mlp": 0.93150806, + "epoch": 0.005591462498121149, + "flos": 17636089760640.0, + "grad_norm": 964.0360410097962, + "language_loss": 1.02574992, + "learning_rate": 2.918324080615938e-06, + "loss": 1.05712712, + "num_input_tokens_seen": 1894895, + "router_z_loss_clip": 7.26171875, + "router_z_loss_mlp": 1.17578125, + "step": 93, + "time_per_iteration": 2.6762332916259766 + }, + { + "auxiliary_loss_clip": 0.02099035, + "auxiliary_loss_mlp": 0.01025731, + "balance_loss_clip": 1.36745358, + "balance_loss_mlp": 0.9121002, + "epoch": 0.005651585750789117, + "flos": 20011221699840.0, + "grad_norm": 23.0494808323307, + "language_loss": 0.94119775, + "learning_rate": 2.925210265866963e-06, + "loss": 0.97244549, + "num_input_tokens_seen": 1913220, + "router_z_loss_clip": 7.3203125, + "router_z_loss_mlp": 1.13671875, + "step": 94, + "time_per_iteration": 2.671448230743408 + }, + { + "auxiliary_loss_clip": 0.0182264, + "auxiliary_loss_mlp": 0.00722016, + "balance_loss_clip": 1.31438065, + "balance_loss_mlp": 0.67433184, + "epoch": 0.005711709003457087, + "flos": 59812957981440.0, + "grad_norm": 1.1700782818630062, + "language_loss": 0.67012691, + "learning_rate": 2.932023580065507e-06, + "loss": 0.69557345, + "num_input_tokens_seen": 1970970, + "router_z_loss_clip": 5.0625, + "router_z_loss_mlp": 0.4765625, + "step": 95, + "time_per_iteration": 3.119431257247925 + }, + { + "auxiliary_loss_clip": 0.02031375, + "auxiliary_loss_mlp": 0.00947526, + "balance_loss_clip": 1.34114122, + "balance_loss_mlp": 0.8508234, + "epoch": 0.005771832256125056, + "flos": 15559591495680.0, + "grad_norm": 77.34193106643143, + "language_loss": 0.97181404, + "learning_rate": 2.9387655493491906e-06, + "loss": 1.00160301, + "num_input_tokens_seen": 1988930, + "router_z_loss_clip": 6.8984375, + "router_z_loss_mlp": 0.96826172, + "step": 96, + "time_per_iteration": 2.642866611480713 + }, + { + "auxiliary_loss_clip": 0.02009174, + "auxiliary_loss_mlp": 0.00924142, + "balance_loss_clip": 1.33040726, + "balance_loss_mlp": 0.82615167, + "epoch": 0.005831955508793026, + "flos": 22528380015360.0, + "grad_norm": 48.050049411535326, + "language_loss": 0.97001088, + "learning_rate": 2.9454376524092147e-06, + "loss": 0.99934405, + "num_input_tokens_seen": 2006285, + "router_z_loss_clip": 6.78515625, + "router_z_loss_mlp": 0.97998047, + "step": 97, + "time_per_iteration": 2.7164926528930664 + }, + { + "auxiliary_loss_clip": 0.01991181, + "auxiliary_loss_mlp": 0.00910185, + "balance_loss_clip": 1.32509971, + "balance_loss_mlp": 0.80623412, + "epoch": 0.005892078761460995, + "flos": 22049834094720.0, + "grad_norm": 32.220105580473806, + "language_loss": 0.81935883, + "learning_rate": 2.952041322436969e-06, + "loss": 0.84837246, + "num_input_tokens_seen": 2024905, + "router_z_loss_clip": 6.6640625, + "router_z_loss_mlp": 1.03857422, + "step": 98, + "time_per_iteration": 2.789259433746338 + }, + { + "auxiliary_loss_clip": 0.01684029, + "auxiliary_loss_mlp": 0.0069636, + "balance_loss_clip": 1.25979209, + "balance_loss_mlp": 0.65306324, + "epoch": 0.005952202014128965, + "flos": 68539143317760.0, + "grad_norm": 1.0758018993447673, + "language_loss": 0.64717102, + "learning_rate": 2.9585779489718204e-06, + "loss": 0.67097485, + "num_input_tokens_seen": 2086220, + "router_z_loss_clip": 4.25, + "router_z_loss_mlp": 0.43359375, + "step": 99, + "time_per_iteration": 3.288888692855835 + }, + { + "auxiliary_loss_clip": 0.01948164, + "auxiliary_loss_mlp": 0.00894639, + "balance_loss_clip": 1.30847979, + "balance_loss_mlp": 0.79788864, + "epoch": 0.006012325266796933, + "flos": 22960887678720.0, + "grad_norm": 169.1745340797044, + "language_loss": 0.98576927, + "learning_rate": 2.9650488796560464e-06, + "loss": 1.01419723, + "num_input_tokens_seen": 2103365, + "router_z_loss_clip": 6.39453125, + "router_z_loss_mlp": 0.96728516, + "step": 100, + "time_per_iteration": 2.6935875415802 + }, + { + "auxiliary_loss_clip": 0.01960751, + "auxiliary_loss_mlp": 0.00878936, + "balance_loss_clip": 1.3112216, + "balance_loss_mlp": 0.7778942, + "epoch": 0.006072448519464903, + "flos": 17347942857600.0, + "grad_norm": 47.07254933490482, + "language_loss": 0.97072709, + "learning_rate": 2.971455421902446e-06, + "loss": 0.99912393, + "num_input_tokens_seen": 2121995, + "router_z_loss_clip": 6.5, + "router_z_loss_mlp": 1.00927734, + "step": 101, + "time_per_iteration": 2.725368022918701 + }, + { + "auxiliary_loss_clip": 0.01931863, + "auxiliary_loss_mlp": 0.00826707, + "balance_loss_clip": 1.30742383, + "balance_loss_mlp": 0.73486876, + "epoch": 0.006132571772132872, + "flos": 24681116897280.0, + "grad_norm": 48.141580874145475, + "language_loss": 0.97692084, + "learning_rate": 2.9777988444798075e-06, + "loss": 1.00450659, + "num_input_tokens_seen": 2141815, + "router_z_loss_clip": 6.24609375, + "router_z_loss_mlp": 0.91796875, + "step": 102, + "time_per_iteration": 2.732429265975952 + }, + { + "auxiliary_loss_clip": 0.01920405, + "auxiliary_loss_mlp": 0.00842023, + "balance_loss_clip": 1.30536985, + "balance_loss_mlp": 0.74698955, + "epoch": 0.006192695024800842, + "flos": 21465675210240.0, + "grad_norm": 10.538882813098272, + "language_loss": 0.9406389, + "learning_rate": 2.9840803790210285e-06, + "loss": 0.96826321, + "num_input_tokens_seen": 2161125, + "router_z_loss_clip": 6.16015625, + "router_z_loss_mlp": 0.95117188, + "step": 103, + "time_per_iteration": 4.10395884513855 + }, + { + "auxiliary_loss_clip": 0.01901529, + "auxiliary_loss_mlp": 0.00808728, + "balance_loss_clip": 1.29738867, + "balance_loss_mlp": 0.71998864, + "epoch": 0.006252818277468811, + "flos": 17420410546560.0, + "grad_norm": 97.33011414098947, + "language_loss": 0.98254943, + "learning_rate": 2.990301221458371e-06, + "loss": 1.00965202, + "num_input_tokens_seen": 2179510, + "router_z_loss_clip": 6.0390625, + "router_z_loss_mlp": 0.88671875, + "step": 104, + "time_per_iteration": 3.9994308948516846 + }, + { + "auxiliary_loss_clip": 0.01880987, + "auxiliary_loss_mlp": 0.00803401, + "balance_loss_clip": 1.29134119, + "balance_loss_mlp": 0.71485209, + "epoch": 0.006312941530136781, + "flos": 19099557584640.0, + "grad_norm": 288.15173888344356, + "language_loss": 1.03769183, + "learning_rate": 2.9964625333900544e-06, + "loss": 1.06453586, + "num_input_tokens_seen": 2197870, + "router_z_loss_clip": 5.89453125, + "router_z_loss_mlp": 0.88525391, + "step": 105, + "time_per_iteration": 2.80711030960083 + }, + { + "auxiliary_loss_clip": 0.01874455, + "auxiliary_loss_mlp": 0.00826528, + "balance_loss_clip": 1.28512263, + "balance_loss_mlp": 0.72634411, + "epoch": 0.006373064782804749, + "flos": 24060831909120.0, + "grad_norm": 386.94822596291954, + "language_loss": 0.97950977, + "learning_rate": 3.002565443382063e-06, + "loss": 1.00651956, + "num_input_tokens_seen": 2217495, + "router_z_loss_clip": 5.89453125, + "router_z_loss_mlp": 1.00244141, + "step": 106, + "time_per_iteration": 2.70206880569458 + }, + { + "auxiliary_loss_clip": 0.01857568, + "auxiliary_loss_mlp": 0.00825123, + "balance_loss_clip": 1.27497029, + "balance_loss_mlp": 0.727705, + "epoch": 0.006433188035472719, + "flos": 18332433797760.0, + "grad_norm": 81.25182199587908, + "language_loss": 0.92604768, + "learning_rate": 3.008611048208843e-06, + "loss": 0.95287454, + "num_input_tokens_seen": 2236520, + "router_z_loss_clip": 5.82421875, + "router_z_loss_mlp": 0.97412109, + "step": 107, + "time_per_iteration": 2.6596474647521973 + }, + { + "auxiliary_loss_clip": 0.01538748, + "auxiliary_loss_mlp": 0.00482339, + "balance_loss_clip": 1.18675041, + "balance_loss_mlp": 0.4346548, + "epoch": 0.006493311288140688, + "flos": 62562387594240.0, + "grad_norm": 0.9782370975553005, + "language_loss": 0.64388406, + "learning_rate": 3.014600414036285e-06, + "loss": 0.66409492, + "num_input_tokens_seen": 2300140, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 0.4765625, + "step": 108, + "time_per_iteration": 3.2340567111968994 + }, + { + "auxiliary_loss_clip": 0.0181261, + "auxiliary_loss_mlp": 0.00778817, + "balance_loss_clip": 1.26208007, + "balance_loss_mlp": 0.68945777, + "epoch": 0.006553434540808658, + "flos": 19500141035520.0, + "grad_norm": 1347.3978630558404, + "language_loss": 1.05323362, + "learning_rate": 3.0205345775501937e-06, + "loss": 1.07914793, + "num_input_tokens_seen": 2317320, + "router_z_loss_clip": 5.5078125, + "router_z_loss_mlp": 0.89306641, + "step": 109, + "time_per_iteration": 2.7123146057128906 + }, + { + "auxiliary_loss_clip": 0.01789383, + "auxiliary_loss_mlp": 0.00806952, + "balance_loss_clip": 1.25275242, + "balance_loss_mlp": 0.71292007, + "epoch": 0.006613557793476627, + "flos": 21105132445440.0, + "grad_norm": 28.25374315634908, + "language_loss": 0.91203845, + "learning_rate": 3.0264145470332218e-06, + "loss": 0.93800175, + "num_input_tokens_seen": 2337820, + "router_z_loss_clip": 5.36328125, + "router_z_loss_mlp": 0.93945312, + "step": 110, + "time_per_iteration": 2.713517189025879 + }, + { + "auxiliary_loss_clip": 0.01780471, + "auxiliary_loss_mlp": 0.00778436, + "balance_loss_clip": 1.24193454, + "balance_loss_mlp": 0.68774199, + "epoch": 0.006673681046144597, + "flos": 26030747543040.0, + "grad_norm": 438.167999056945, + "language_loss": 0.8765887, + "learning_rate": 3.032241303393073e-06, + "loss": 0.90217769, + "num_input_tokens_seen": 2358560, + "router_z_loss_clip": 5.38671875, + "router_z_loss_mlp": 0.90673828, + "step": 111, + "time_per_iteration": 2.7387893199920654 + }, + { + "auxiliary_loss_clip": 0.01762534, + "auxiliary_loss_mlp": 0.00778245, + "balance_loss_clip": 1.23906755, + "balance_loss_mlp": 0.69146079, + "epoch": 0.006733804298812566, + "flos": 23147767163520.0, + "grad_norm": 193.05633382336595, + "language_loss": 1.0101614, + "learning_rate": 3.0380158011446e-06, + "loss": 1.03556919, + "num_input_tokens_seen": 2379005, + "router_z_loss_clip": 5.234375, + "router_z_loss_mlp": 0.86767578, + "step": 112, + "time_per_iteration": 2.7015302181243896 + }, + { + "auxiliary_loss_clip": 0.0175362, + "auxiliary_loss_mlp": 0.00759923, + "balance_loss_clip": 1.23133671, + "balance_loss_mlp": 0.67528403, + "epoch": 0.006793927551480535, + "flos": 11764444210560.0, + "grad_norm": 94.36321292513273, + "language_loss": 0.86510372, + "learning_rate": 3.0437389693482466e-06, + "loss": 0.89023912, + "num_input_tokens_seen": 2395610, + "router_z_loss_clip": 5.22265625, + "router_z_loss_mlp": 0.84667969, + "step": 113, + "time_per_iteration": 2.7100846767425537 + }, + { + "auxiliary_loss_clip": 0.01737425, + "auxiliary_loss_mlp": 0.00760596, + "balance_loss_clip": 1.2305243, + "balance_loss_mlp": 0.67481261, + "epoch": 0.006854050804148504, + "flos": 19171953446400.0, + "grad_norm": 141.05572596946854, + "language_loss": 1.01577854, + "learning_rate": 3.0494117125071475e-06, + "loss": 1.04075873, + "num_input_tokens_seen": 2415005, + "router_z_loss_clip": 5.07421875, + "router_z_loss_mlp": 0.85742188, + "step": 114, + "time_per_iteration": 2.641103982925415 + }, + { + "auxiliary_loss_clip": 0.01757001, + "auxiliary_loss_mlp": 0.00748572, + "balance_loss_clip": 1.23863637, + "balance_loss_mlp": 0.66398066, + "epoch": 0.006914174056816474, + "flos": 21981891519360.0, + "grad_norm": 102.45540254769315, + "language_loss": 1.01735425, + "learning_rate": 3.055034911425055e-06, + "loss": 1.04241002, + "num_input_tokens_seen": 2433965, + "router_z_loss_clip": 5.1875, + "router_z_loss_mlp": 0.84570312, + "step": 115, + "time_per_iteration": 2.7194206714630127 + }, + { + "auxiliary_loss_clip": 0.01753099, + "auxiliary_loss_mlp": 0.00773998, + "balance_loss_clip": 1.23089552, + "balance_loss_mlp": 0.68082356, + "epoch": 0.006974297309484443, + "flos": 16289152634880.0, + "grad_norm": 570.2237930641412, + "language_loss": 0.90330911, + "learning_rate": 3.0606094240271244e-06, + "loss": 0.92858005, + "num_input_tokens_seen": 2451605, + "router_z_loss_clip": 5.2265625, + "router_z_loss_mlp": 0.93212891, + "step": 116, + "time_per_iteration": 2.6253557205200195 + }, + { + "auxiliary_loss_clip": 0.01743625, + "auxiliary_loss_mlp": 0.00744717, + "balance_loss_clip": 1.23006773, + "balance_loss_mlp": 0.65712225, + "epoch": 0.007034420562152413, + "flos": 26104005331200.0, + "grad_norm": 52.22874466357756, + "language_loss": 0.96376693, + "learning_rate": 3.0661360861454656e-06, + "loss": 0.98865038, + "num_input_tokens_seen": 2472035, + "router_z_loss_clip": 5.1328125, + "router_z_loss_mlp": 0.87597656, + "step": 117, + "time_per_iteration": 2.8993194103240967 + }, + { + "auxiliary_loss_clip": 0.01747987, + "auxiliary_loss_mlp": 0.00784745, + "balance_loss_clip": 1.23875809, + "balance_loss_mlp": 0.69343102, + "epoch": 0.007094543814820382, + "flos": 14204609723520.0, + "grad_norm": 52.47309873150167, + "language_loss": 0.93600887, + "learning_rate": 3.071615712271274e-06, + "loss": 0.96133614, + "num_input_tokens_seen": 2489285, + "router_z_loss_clip": 5.08984375, + "router_z_loss_mlp": 0.9140625, + "step": 118, + "time_per_iteration": 2.6580498218536377 + }, + { + "auxiliary_loss_clip": 0.01769533, + "auxiliary_loss_mlp": 0.00760741, + "balance_loss_clip": 1.23823047, + "balance_loss_mlp": 0.67033303, + "epoch": 0.007154667067488351, + "flos": 14976007228800.0, + "grad_norm": 69.29585503079377, + "language_loss": 1.05577409, + "learning_rate": 3.0770490962752172e-06, + "loss": 1.08107674, + "num_input_tokens_seen": 2506460, + "router_z_loss_clip": 5.3125, + "router_z_loss_mlp": 0.90429688, + "step": 119, + "time_per_iteration": 2.6431193351745605 + }, + { + "auxiliary_loss_clip": 0.01785732, + "auxiliary_loss_mlp": 0.00832628, + "balance_loss_clip": 1.24209511, + "balance_loss_mlp": 0.73006034, + "epoch": 0.00721479032015632, + "flos": 20193288762240.0, + "grad_norm": 141.55905065710016, + "language_loss": 0.99686277, + "learning_rate": 3.082437012097686e-06, + "loss": 1.02304626, + "num_input_tokens_seen": 2525565, + "router_z_loss_clip": 5.43359375, + "router_z_loss_mlp": 1.02587891, + "step": 120, + "time_per_iteration": 2.673266887664795 + }, + { + "auxiliary_loss_clip": 0.01751374, + "auxiliary_loss_mlp": 0.0078848, + "balance_loss_clip": 1.23402703, + "balance_loss_mlp": 0.69654536, + "epoch": 0.00727491357282429, + "flos": 23147228459520.0, + "grad_norm": 77.25770422616483, + "language_loss": 0.98269451, + "learning_rate": 3.0877802144103967e-06, + "loss": 1.00809312, + "num_input_tokens_seen": 2546605, + "router_z_loss_clip": 5.17578125, + "router_z_loss_mlp": 0.91943359, + "step": 121, + "time_per_iteration": 2.6723721027374268 + }, + { + "auxiliary_loss_clip": 0.01766994, + "auxiliary_loss_mlp": 0.00786209, + "balance_loss_clip": 1.24295712, + "balance_loss_mlp": 0.6934641, + "epoch": 0.007335036825492259, + "flos": 15521669712000.0, + "grad_norm": 36.64401628206922, + "language_loss": 0.9861933, + "learning_rate": 3.09307943925077e-06, + "loss": 1.01172543, + "num_input_tokens_seen": 2560730, + "router_z_loss_clip": 5.234375, + "router_z_loss_mlp": 0.92724609, + "step": 122, + "time_per_iteration": 2.5889391899108887 + }, + { + "auxiliary_loss_clip": 0.01771978, + "auxiliary_loss_mlp": 0.00776311, + "balance_loss_clip": 1.24376607, + "balance_loss_mlp": 0.68194455, + "epoch": 0.007395160078160229, + "flos": 24243365848320.0, + "grad_norm": 60.670169491101674, + "language_loss": 1.00440967, + "learning_rate": 3.0983354046304154e-06, + "loss": 1.02989244, + "num_input_tokens_seen": 2579550, + "router_z_loss_clip": 5.28125, + "router_z_loss_mlp": 0.94384766, + "step": 123, + "time_per_iteration": 2.6796882152557373 + }, + { + "auxiliary_loss_clip": 0.01775125, + "auxiliary_loss_mlp": 0.00781009, + "balance_loss_clip": 1.24194121, + "balance_loss_mlp": 0.69408208, + "epoch": 0.007455283330828198, + "flos": 31759792099200.0, + "grad_norm": 35.394390487063916, + "language_loss": 0.77560151, + "learning_rate": 3.103548811118979e-06, + "loss": 0.80116284, + "num_input_tokens_seen": 2600390, + "router_z_loss_clip": 5.3359375, + "router_z_loss_mlp": 0.86962891, + "step": 124, + "time_per_iteration": 2.713456153869629 + }, + { + "auxiliary_loss_clip": 0.01752454, + "auxiliary_loss_mlp": 0.00767223, + "balance_loss_clip": 1.24400115, + "balance_loss_mlp": 0.67729127, + "epoch": 0.007515406583496167, + "flos": 26615157822720.0, + "grad_norm": 428.15630342270236, + "language_loss": 0.96539855, + "learning_rate": 3.108720342404542e-06, + "loss": 0.99059528, + "num_input_tokens_seen": 2620770, + "router_z_loss_clip": 5.08984375, + "router_z_loss_mlp": 0.89990234, + "step": 125, + "time_per_iteration": 2.6760177612304688 + }, + { + "auxiliary_loss_clip": 0.017887, + "auxiliary_loss_mlp": 0.00797788, + "balance_loss_clip": 1.25548112, + "balance_loss_mlp": 0.70299309, + "epoch": 0.007575529836164136, + "flos": 18223696350720.0, + "grad_norm": 22.260902466392288, + "language_loss": 0.90518093, + "learning_rate": 3.1138506658316945e-06, + "loss": 0.93104583, + "num_input_tokens_seen": 2639900, + "router_z_loss_clip": 5.328125, + "router_z_loss_mlp": 0.94824219, + "step": 126, + "time_per_iteration": 2.651451587677002 + }, + { + "auxiliary_loss_clip": 0.01788478, + "auxiliary_loss_mlp": 0.00772681, + "balance_loss_clip": 1.25122523, + "balance_loss_mlp": 0.68513334, + "epoch": 0.007635653088832106, + "flos": 21580410228480.0, + "grad_norm": 167.07985154842163, + "language_loss": 0.77010608, + "learning_rate": 3.1189404329183404e-06, + "loss": 0.79571766, + "num_input_tokens_seen": 2657450, + "router_z_loss_clip": 5.375, + "router_z_loss_mlp": 0.87646484, + "step": 127, + "time_per_iteration": 2.643202543258667 + }, + { + "auxiliary_loss_clip": 0.01773221, + "auxiliary_loss_mlp": 0.00768555, + "balance_loss_clip": 1.25460029, + "balance_loss_mlp": 0.68124622, + "epoch": 0.007695776341500075, + "flos": 25375054723200.0, + "grad_norm": 53.59254096758381, + "language_loss": 0.93846619, + "learning_rate": 3.1239902798522317e-06, + "loss": 0.96388394, + "num_input_tokens_seen": 2678150, + "router_z_loss_clip": 5.1875, + "router_z_loss_mlp": 0.87353516, + "step": 128, + "time_per_iteration": 2.684917688369751 + }, + { + "auxiliary_loss_clip": 0.01775106, + "auxiliary_loss_mlp": 0.00763329, + "balance_loss_clip": 1.24901199, + "balance_loss_mlp": 0.67187178, + "epoch": 0.007755899594168045, + "flos": 22343906741760.0, + "grad_norm": 39.02329277609331, + "language_loss": 0.90290201, + "learning_rate": 3.129000827968184e-06, + "loss": 0.92828631, + "num_input_tokens_seen": 2698290, + "router_z_loss_clip": 5.26171875, + "router_z_loss_mlp": 0.91552734, + "step": 129, + "time_per_iteration": 2.6390843391418457 + }, + { + "auxiliary_loss_clip": 0.01756609, + "auxiliary_loss_mlp": 0.00773448, + "balance_loss_clip": 1.2491858, + "balance_loss_mlp": 0.68041694, + "epoch": 0.007816022846836013, + "flos": 22638230784000.0, + "grad_norm": 12.976926796179098, + "language_loss": 1.03236341, + "learning_rate": 3.133972684206866e-06, + "loss": 1.05766404, + "num_input_tokens_seen": 2717630, + "router_z_loss_clip": 5.07421875, + "router_z_loss_mlp": 0.92919922, + "step": 130, + "time_per_iteration": 2.7191576957702637 + }, + { + "auxiliary_loss_clip": 0.01750998, + "auxiliary_loss_mlp": 0.00757384, + "balance_loss_clip": 1.24581182, + "balance_loss_mlp": 0.67098141, + "epoch": 0.007876146099503984, + "flos": 18182901479040.0, + "grad_norm": 57.47025478369975, + "language_loss": 0.88960469, + "learning_rate": 3.138906441556014e-06, + "loss": 0.91468859, + "num_input_tokens_seen": 2735835, + "router_z_loss_clip": 5.0546875, + "router_z_loss_mlp": 0.86376953, + "step": 131, + "time_per_iteration": 2.644083261489868 + }, + { + "auxiliary_loss_clip": 0.01776655, + "auxiliary_loss_mlp": 0.00786279, + "balance_loss_clip": 1.25448513, + "balance_loss_mlp": 0.6949169, + "epoch": 0.007936269352171952, + "flos": 27119486730240.0, + "grad_norm": 154.18035094910812, + "language_loss": 0.87372893, + "learning_rate": 3.143802679474861e-06, + "loss": 0.89935827, + "num_input_tokens_seen": 2756335, + "router_z_loss_clip": 5.22265625, + "router_z_loss_mlp": 0.91357422, + "step": 132, + "time_per_iteration": 2.750429153442383 + }, + { + "auxiliary_loss_clip": 0.01762673, + "auxiliary_loss_mlp": 0.00787466, + "balance_loss_clip": 1.24702501, + "balance_loss_mlp": 0.69743967, + "epoch": 0.007996392604839923, + "flos": 19026335710080.0, + "grad_norm": 88.40975584435802, + "language_loss": 1.02845812, + "learning_rate": 3.1486619643025565e-06, + "loss": 1.05395961, + "num_input_tokens_seen": 2775090, + "router_z_loss_clip": 5.15625, + "router_z_loss_mlp": 0.90087891, + "step": 133, + "time_per_iteration": 2.6239280700683594 + }, + { + "auxiliary_loss_clip": 0.01744155, + "auxiliary_loss_mlp": 0.00788276, + "balance_loss_clip": 1.2458055, + "balance_loss_mlp": 0.69848716, + "epoch": 0.008056515857507891, + "flos": 25484151306240.0, + "grad_norm": 95.20562204907687, + "language_loss": 0.78226084, + "learning_rate": 3.153484849651286e-06, + "loss": 0.80758518, + "num_input_tokens_seen": 2795320, + "router_z_loss_clip": 4.984375, + "router_z_loss_mlp": 0.89746094, + "step": 134, + "time_per_iteration": 2.7143054008483887 + }, + { + "auxiliary_loss_clip": 0.0174332, + "auxiliary_loss_mlp": 0.00777515, + "balance_loss_clip": 1.24046564, + "balance_loss_mlp": 0.68596184, + "epoch": 0.00811663911017586, + "flos": 20557566541440.0, + "grad_norm": 436.21684023090097, + "language_loss": 0.97615826, + "learning_rate": 3.1582718767847806e-06, + "loss": 1.00136662, + "num_input_tokens_seen": 2812815, + "router_z_loss_clip": 5.02734375, + "router_z_loss_mlp": 0.91650391, + "step": 135, + "time_per_iteration": 2.610452651977539 + }, + { + "auxiliary_loss_clip": 0.01740659, + "auxiliary_loss_mlp": 0.00724727, + "balance_loss_clip": 1.24308956, + "balance_loss_mlp": 0.64170939, + "epoch": 0.00817676236284383, + "flos": 18799738761600.0, + "grad_norm": 1203.2494406193493, + "language_loss": 0.95668411, + "learning_rate": 3.1630235749828485e-06, + "loss": 0.9813379, + "num_input_tokens_seen": 2830445, + "router_z_loss_clip": 4.98046875, + "router_z_loss_mlp": 0.82958984, + "step": 136, + "time_per_iteration": 2.6823713779449463 + }, + { + "auxiliary_loss_clip": 0.01759191, + "auxiliary_loss_mlp": 0.00758185, + "balance_loss_clip": 1.24827206, + "balance_loss_mlp": 0.67178237, + "epoch": 0.008236885615511799, + "flos": 23873593288320.0, + "grad_norm": 780.2701440131653, + "language_loss": 0.92436039, + "learning_rate": 3.1677404618925676e-06, + "loss": 0.94953412, + "num_input_tokens_seen": 2846965, + "router_z_loss_clip": 5.1171875, + "router_z_loss_mlp": 0.86376953, + "step": 137, + "time_per_iteration": 2.6557068824768066 + }, + { + "auxiliary_loss_clip": 0.01746173, + "auxiliary_loss_mlp": 0.00751694, + "balance_loss_clip": 1.24646688, + "balance_loss_mlp": 0.66729343, + "epoch": 0.00829700886817977, + "flos": 24643626076800.0, + "grad_norm": 134.19126751062498, + "language_loss": 0.95734376, + "learning_rate": 3.1724230438666953e-06, + "loss": 0.98232239, + "num_input_tokens_seen": 2867520, + "router_z_loss_clip": 5.0, + "router_z_loss_mlp": 0.84375, + "step": 138, + "time_per_iteration": 2.713535785675049 + }, + { + "auxiliary_loss_clip": 0.01732029, + "auxiliary_loss_mlp": 0.00795707, + "balance_loss_clip": 1.24104059, + "balance_loss_mlp": 0.7008642, + "epoch": 0.008357132120847738, + "flos": 25262007644160.0, + "grad_norm": 598.7182429439498, + "language_loss": 0.96529615, + "learning_rate": 3.177071816289865e-06, + "loss": 0.99057353, + "num_input_tokens_seen": 2885675, + "router_z_loss_clip": 4.9140625, + "router_z_loss_mlp": 0.94824219, + "step": 139, + "time_per_iteration": 2.7126951217651367 + }, + { + "auxiliary_loss_clip": 0.01754028, + "auxiliary_loss_mlp": 0.00786769, + "balance_loss_clip": 1.24849916, + "balance_loss_mlp": 0.69273639, + "epoch": 0.008417255373515706, + "flos": 27344898529920.0, + "grad_norm": 886.678802051824, + "language_loss": 0.95855319, + "learning_rate": 3.181687263893095e-06, + "loss": 0.98396116, + "num_input_tokens_seen": 2905960, + "router_z_loss_clip": 5.0625, + "router_z_loss_mlp": 0.93994141, + "step": 140, + "time_per_iteration": 2.7129945755004883 + }, + { + "auxiliary_loss_clip": 0.01739299, + "auxiliary_loss_mlp": 0.00790665, + "balance_loss_clip": 1.24510455, + "balance_loss_mlp": 0.69415271, + "epoch": 0.008477378626183677, + "flos": 17639070589440.0, + "grad_norm": 38137.036250938516, + "language_loss": 0.92888939, + "learning_rate": 3.186269861057098e-06, + "loss": 0.95418906, + "num_input_tokens_seen": 2922780, + "router_z_loss_clip": 4.9453125, + "router_z_loss_mlp": 0.96533203, + "step": 141, + "time_per_iteration": 2.64109206199646 + }, + { + "auxiliary_loss_clip": 0.01743335, + "auxiliary_loss_mlp": 0.00795547, + "balance_loss_clip": 1.2420994, + "balance_loss_mlp": 0.69979835, + "epoch": 0.008537501878851645, + "flos": 13881342297600.0, + "grad_norm": 164.6775671611334, + "language_loss": 0.88635027, + "learning_rate": 3.1908200721048745e-06, + "loss": 0.91173911, + "num_input_tokens_seen": 2938765, + "router_z_loss_clip": 5.02734375, + "router_z_loss_mlp": 0.95849609, + "step": 142, + "time_per_iteration": 2.650099039077759 + }, + { + "auxiliary_loss_clip": 0.01565533, + "auxiliary_loss_mlp": 0.00581501, + "balance_loss_clip": 1.21773124, + "balance_loss_mlp": 0.52504373, + "epoch": 0.008597625131519616, + "flos": 71248101281280.0, + "grad_norm": 5.389600543782937, + "language_loss": 0.66106755, + "learning_rate": 3.195338351584042e-06, + "loss": 0.68253791, + "num_input_tokens_seen": 3006665, + "router_z_loss_clip": 3.46875, + "router_z_loss_mlp": 0.56640625, + "step": 143, + "time_per_iteration": 3.493806838989258 + }, + { + "auxiliary_loss_clip": 0.01715963, + "auxiliary_loss_mlp": 0.00759861, + "balance_loss_clip": 1.24156356, + "balance_loss_mlp": 0.67283833, + "epoch": 0.008657748384187584, + "flos": 17602836744960.0, + "grad_norm": 68.87181438319264, + "language_loss": 0.92641509, + "learning_rate": 3.1998251445393258e-06, + "loss": 0.95117337, + "num_input_tokens_seen": 3024335, + "router_z_loss_clip": 4.73828125, + "router_z_loss_mlp": 0.87011719, + "step": 144, + "time_per_iteration": 2.8749797344207764 + }, + { + "auxiliary_loss_clip": 0.01706701, + "auxiliary_loss_mlp": 0.00772767, + "balance_loss_clip": 1.24137139, + "balance_loss_mlp": 0.68245435, + "epoch": 0.008717871636855555, + "flos": 19715317459200.0, + "grad_norm": 17.116239848777173, + "language_loss": 0.95647281, + "learning_rate": 3.204280886775619e-06, + "loss": 0.98126757, + "num_input_tokens_seen": 3043300, + "router_z_loss_clip": 4.64453125, + "router_z_loss_mlp": 0.90429688, + "step": 145, + "time_per_iteration": 4.194197177886963 + }, + { + "auxiliary_loss_clip": 0.01720869, + "auxiliary_loss_mlp": 0.00800799, + "balance_loss_clip": 1.24171853, + "balance_loss_mlp": 0.70237935, + "epoch": 0.008777994889523523, + "flos": 24717422568960.0, + "grad_norm": 98.68573447680528, + "language_loss": 0.9223997, + "learning_rate": 3.208706005112005e-06, + "loss": 0.9476164, + "num_input_tokens_seen": 3064610, + "router_z_loss_clip": 4.796875, + "router_z_loss_mlp": 0.984375, + "step": 146, + "time_per_iteration": 4.1476662158966064 + }, + { + "auxiliary_loss_clip": 0.01533834, + "auxiliary_loss_mlp": 0.00530087, + "balance_loss_clip": 1.21718585, + "balance_loss_mlp": 0.48507333, + "epoch": 0.008838118142191492, + "flos": 70132067758080.0, + "grad_norm": 0.9153580819035948, + "language_loss": 0.59278697, + "learning_rate": 3.213100917627104e-06, + "loss": 0.61342621, + "num_input_tokens_seen": 3130385, + "router_z_loss_clip": 3.15625, + "router_z_loss_mlp": 0.44921875, + "step": 147, + "time_per_iteration": 3.3022539615631104 + }, + { + "auxiliary_loss_clip": 0.01673189, + "auxiliary_loss_mlp": 0.00775806, + "balance_loss_clip": 1.22842979, + "balance_loss_mlp": 0.68668514, + "epoch": 0.008898241394859462, + "flos": 20044797937920.0, + "grad_norm": 48.01713386653333, + "language_loss": 0.9028042, + "learning_rate": 3.2174660338961135e-06, + "loss": 0.92729408, + "num_input_tokens_seen": 3149760, + "router_z_loss_clip": 4.44921875, + "router_z_loss_mlp": 0.89111328, + "step": 148, + "time_per_iteration": 2.662189483642578 + }, + { + "auxiliary_loss_clip": 0.01653112, + "auxiliary_loss_mlp": 0.00740834, + "balance_loss_clip": 1.21770346, + "balance_loss_mlp": 0.65886551, + "epoch": 0.008958364647527431, + "flos": 10743611685120.0, + "grad_norm": 91.81741903217146, + "language_loss": 0.96288693, + "learning_rate": 3.2218017552198588e-06, + "loss": 0.98682642, + "num_input_tokens_seen": 3164500, + "router_z_loss_clip": 4.35546875, + "router_z_loss_mlp": 0.81933594, + "step": 149, + "time_per_iteration": 2.6241729259490967 + }, + { + "auxiliary_loss_clip": 0.01670923, + "auxiliary_loss_mlp": 0.00782092, + "balance_loss_clip": 1.22522414, + "balance_loss_mlp": 0.69402009, + "epoch": 0.009018487900195401, + "flos": 29127467802240.0, + "grad_norm": 676.3265451256625, + "language_loss": 0.99732471, + "learning_rate": 3.226108474846181e-06, + "loss": 1.02185488, + "num_input_tokens_seen": 3182455, + "router_z_loss_clip": 4.46875, + "router_z_loss_mlp": 0.88037109, + "step": 150, + "time_per_iteration": 2.6795654296875 + }, + { + "auxiliary_loss_clip": 0.01649499, + "auxiliary_loss_mlp": 0.00763406, + "balance_loss_clip": 1.21304679, + "balance_loss_mlp": 0.68344015, + "epoch": 0.00907861115286337, + "flos": 32963661354240.0, + "grad_norm": 1049.5309711403313, + "language_loss": 0.8071866, + "learning_rate": 3.2303865781839817e-06, + "loss": 0.83131564, + "num_input_tokens_seen": 3203995, + "router_z_loss_clip": 4.3671875, + "router_z_loss_mlp": 0.79980469, + "step": 151, + "time_per_iteration": 2.789259195327759 + }, + { + "auxiliary_loss_clip": 0.01661436, + "auxiliary_loss_mlp": 0.00767244, + "balance_loss_clip": 1.21939147, + "balance_loss_mlp": 0.68546689, + "epoch": 0.009138734405531338, + "flos": 21762441377280.0, + "grad_norm": 245.5696027946517, + "language_loss": 0.97121191, + "learning_rate": 3.234636443010188e-06, + "loss": 0.99549878, + "num_input_tokens_seen": 3222575, + "router_z_loss_clip": 4.421875, + "router_z_loss_mlp": 0.81835938, + "step": 152, + "time_per_iteration": 2.8077077865600586 + }, + { + "auxiliary_loss_clip": 0.01641907, + "auxiliary_loss_mlp": 0.0075845, + "balance_loss_clip": 1.21144855, + "balance_loss_mlp": 0.67652923, + "epoch": 0.009198857658199309, + "flos": 20842517134080.0, + "grad_norm": 21.13924437047166, + "language_loss": 0.95071375, + "learning_rate": 3.238858439669943e-06, + "loss": 0.97471732, + "num_input_tokens_seen": 3240180, + "router_z_loss_clip": 4.3046875, + "router_z_loss_mlp": 0.81982422, + "step": 153, + "time_per_iteration": 2.779595136642456 + }, + { + "auxiliary_loss_clip": 0.01638204, + "auxiliary_loss_mlp": 0.00748329, + "balance_loss_clip": 1.20797575, + "balance_loss_mlp": 0.66631269, + "epoch": 0.009258980910867277, + "flos": 24827381078400.0, + "grad_norm": 12.885582570076016, + "language_loss": 0.95361781, + "learning_rate": 3.2430529312702712e-06, + "loss": 0.97748315, + "num_input_tokens_seen": 3259800, + "router_z_loss_clip": 4.30273438, + "router_z_loss_mlp": 0.8203125, + "step": 154, + "time_per_iteration": 2.7090158462524414 + }, + { + "auxiliary_loss_clip": 0.0162463, + "auxiliary_loss_mlp": 0.00739291, + "balance_loss_clip": 1.20600653, + "balance_loss_mlp": 0.6628536, + "epoch": 0.009319104163535248, + "flos": 28767786963840.0, + "grad_norm": 80.2292186789532, + "language_loss": 0.96439672, + "learning_rate": 3.2472202738674737e-06, + "loss": 0.98803592, + "num_input_tokens_seen": 3280400, + "router_z_loss_clip": 4.1875, + "router_z_loss_mlp": 0.76416016, + "step": 155, + "time_per_iteration": 2.713118076324463 + }, + { + "auxiliary_loss_clip": 0.01637668, + "auxiliary_loss_mlp": 0.00736488, + "balance_loss_clip": 1.20523357, + "balance_loss_mlp": 0.65842986, + "epoch": 0.009379227416203216, + "flos": 16582004219520.0, + "grad_norm": 46.789258511663554, + "language_loss": 0.96475333, + "learning_rate": 3.2513608166485063e-06, + "loss": 0.98849487, + "num_input_tokens_seen": 3297600, + "router_z_loss_clip": 4.3203125, + "router_z_loss_mlp": 0.78027344, + "step": 156, + "time_per_iteration": 2.712151288986206 + }, + { + "auxiliary_loss_clip": 0.01629038, + "auxiliary_loss_mlp": 0.00696354, + "balance_loss_clip": 1.21185493, + "balance_loss_mlp": 0.62482822, + "epoch": 0.009439350668871187, + "flos": 18329919845760.0, + "grad_norm": 66.26567585416505, + "language_loss": 1.07964456, + "learning_rate": 3.2554749021065498e-06, + "loss": 1.10289848, + "num_input_tokens_seen": 3313635, + "router_z_loss_clip": 4.171875, + "router_z_loss_mlp": 0.71484375, + "step": 157, + "time_per_iteration": 2.6144847869873047 + }, + { + "auxiliary_loss_clip": 0.01609093, + "auxiliary_loss_mlp": 0.00750568, + "balance_loss_clip": 1.20453691, + "balance_loss_mlp": 0.67498958, + "epoch": 0.009499473921539155, + "flos": 24349912565760.0, + "grad_norm": 117.80569634588933, + "language_loss": 0.96375167, + "learning_rate": 3.2595628662110186e-06, + "loss": 0.98734832, + "num_input_tokens_seen": 3333735, + "router_z_loss_clip": 4.05078125, + "router_z_loss_mlp": 0.75634766, + "step": 158, + "time_per_iteration": 2.7037343978881836 + }, + { + "auxiliary_loss_clip": 0.01609644, + "auxiliary_loss_mlp": 0.00731084, + "balance_loss_clip": 1.19778621, + "balance_loss_mlp": 0.65569592, + "epoch": 0.009559597174207124, + "flos": 16399326625920.0, + "grad_norm": 149.38512302956997, + "language_loss": 0.95625806, + "learning_rate": 3.2636250385721982e-06, + "loss": 0.97966534, + "num_input_tokens_seen": 3348800, + "router_z_loss_clip": 4.1171875, + "router_z_loss_mlp": 0.75390625, + "step": 159, + "time_per_iteration": 2.5914180278778076 + }, + { + "auxiliary_loss_clip": 0.0159316, + "auxiliary_loss_mlp": 0.00751342, + "balance_loss_clip": 1.19011819, + "balance_loss_mlp": 0.67261571, + "epoch": 0.009619720426875094, + "flos": 22856890826880.0, + "grad_norm": 30.788354582864105, + "language_loss": 0.93491191, + "learning_rate": 3.2676617426007263e-06, + "loss": 0.95835692, + "num_input_tokens_seen": 3368595, + "router_z_loss_clip": 4.03710938, + "router_z_loss_mlp": 0.78710938, + "step": 160, + "time_per_iteration": 2.71919846534729 + }, + { + "auxiliary_loss_clip": 0.0160286, + "auxiliary_loss_mlp": 0.0072706, + "balance_loss_clip": 1.19454634, + "balance_loss_mlp": 0.6561541, + "epoch": 0.009679843679543063, + "flos": 19135001329920.0, + "grad_norm": 105.50496687828468, + "language_loss": 0.99424708, + "learning_rate": 3.2716732956621042e-06, + "loss": 1.01754618, + "num_input_tokens_seen": 3384975, + "router_z_loss_clip": 4.08007812, + "router_z_loss_mlp": 0.70849609, + "step": 161, + "time_per_iteration": 2.6637136936187744 + }, + { + "auxiliary_loss_clip": 0.01608946, + "auxiliary_loss_mlp": 0.007541, + "balance_loss_clip": 1.19606805, + "balance_loss_mlp": 0.6782825, + "epoch": 0.009739966932211033, + "flos": 20302995876480.0, + "grad_norm": 104.5269765446991, + "language_loss": 1.00560844, + "learning_rate": 3.2756600092264203e-06, + "loss": 1.02923894, + "num_input_tokens_seen": 3404755, + "router_z_loss_clip": 4.12890625, + "router_z_loss_mlp": 0.7578125, + "step": 162, + "time_per_iteration": 2.685962438583374 + }, + { + "auxiliary_loss_clip": 0.01442512, + "auxiliary_loss_mlp": 0.00651847, + "balance_loss_clip": 1.15969419, + "balance_loss_mlp": 0.61007589, + "epoch": 0.009800090184879002, + "flos": 67034234177280.0, + "grad_norm": 1.1970724915005706, + "language_loss": 0.71661985, + "learning_rate": 3.279622189013474e-06, + "loss": 0.73756337, + "num_input_tokens_seen": 3467210, + "router_z_loss_clip": 2.828125, + "router_z_loss_mlp": 0.41796875, + "step": 163, + "time_per_iteration": 3.178384304046631 + }, + { + "auxiliary_loss_clip": 0.01580206, + "auxiliary_loss_mlp": 0.00733139, + "balance_loss_clip": 1.18762159, + "balance_loss_mlp": 0.66385496, + "epoch": 0.00986021343754697, + "flos": 17164690646400.0, + "grad_norm": 647.0366806010128, + "language_loss": 0.95580047, + "learning_rate": 3.283560135133457e-06, + "loss": 0.97893393, + "num_input_tokens_seen": 3483220, + "router_z_loss_clip": 3.93164062, + "router_z_loss_mlp": 0.69238281, + "step": 164, + "time_per_iteration": 2.750044822692871 + }, + { + "auxiliary_loss_clip": 0.01591246, + "auxiliary_loss_mlp": 0.00705772, + "balance_loss_clip": 1.1910032, + "balance_loss_mlp": 0.63653523, + "epoch": 0.00992033669021494, + "flos": 17749424148480.0, + "grad_norm": 181.3048081787111, + "language_loss": 0.97304893, + "learning_rate": 3.2874741422233565e-06, + "loss": 0.99601912, + "num_input_tokens_seen": 3501465, + "router_z_loss_clip": 4.00390625, + "router_z_loss_mlp": 0.69189453, + "step": 165, + "time_per_iteration": 2.6562674045562744 + }, + { + "auxiliary_loss_clip": 0.01591912, + "auxiliary_loss_mlp": 0.00690902, + "balance_loss_clip": 1.19175887, + "balance_loss_mlp": 0.6221897, + "epoch": 0.00998045994288291, + "flos": 25297164080640.0, + "grad_norm": 26.034375133662355, + "language_loss": 0.8747412, + "learning_rate": 3.2913644995792465e-06, + "loss": 0.8975693, + "num_input_tokens_seen": 3520480, + "router_z_loss_clip": 4.00390625, + "router_z_loss_mlp": 0.68701172, + "step": 166, + "time_per_iteration": 2.6884992122650146 + }, + { + "auxiliary_loss_clip": 0.01583873, + "auxiliary_loss_mlp": 0.00675401, + "balance_loss_clip": 1.18845034, + "balance_loss_mlp": 0.60530549, + "epoch": 0.01004058319555088, + "flos": 32298954220800.0, + "grad_norm": 3743.241036658859, + "language_loss": 0.99003947, + "learning_rate": 3.2952314912845914e-06, + "loss": 1.01263213, + "num_input_tokens_seen": 3539570, + "router_z_loss_clip": 3.95703125, + "router_z_loss_mlp": 0.70117188, + "step": 167, + "time_per_iteration": 2.7227530479431152 + }, + { + "auxiliary_loss_clip": 0.01573326, + "auxiliary_loss_mlp": 0.00656226, + "balance_loss_clip": 1.18747842, + "balance_loss_mlp": 0.59461874, + "epoch": 0.010100706448218848, + "flos": 11319941404800.0, + "grad_norm": 45.28732651477756, + "language_loss": 1.00718939, + "learning_rate": 3.299075396334735e-06, + "loss": 1.02948487, + "num_input_tokens_seen": 3555465, + "router_z_loss_clip": 3.86328125, + "router_z_loss_mlp": 0.61572266, + "step": 168, + "time_per_iteration": 2.5909481048583984 + }, + { + "auxiliary_loss_clip": 0.01566265, + "auxiliary_loss_mlp": 0.00623007, + "balance_loss_clip": 1.18383694, + "balance_loss_mlp": 0.56445163, + "epoch": 0.010160829700886819, + "flos": 29719491765120.0, + "grad_norm": 16.852707871288015, + "language_loss": 0.93939906, + "learning_rate": 3.3028964887576868e-06, + "loss": 0.96129179, + "num_input_tokens_seen": 3578970, + "router_z_loss_clip": 3.82617188, + "router_z_loss_mlp": 0.5859375, + "step": 169, + "time_per_iteration": 2.7504444122314453 + }, + { + "auxiliary_loss_clip": 0.01566277, + "auxiliary_loss_mlp": 0.006676, + "balance_loss_clip": 1.18478215, + "balance_loss_mlp": 0.6027028, + "epoch": 0.010220952953554787, + "flos": 20412343854720.0, + "grad_norm": 96.56175336320626, + "language_loss": 0.91784328, + "learning_rate": 3.306695037731344e-06, + "loss": 0.94018209, + "num_input_tokens_seen": 3597275, + "router_z_loss_clip": 3.8125, + "router_z_loss_mlp": 0.64892578, + "step": 170, + "time_per_iteration": 2.618197202682495 + }, + { + "auxiliary_loss_clip": 0.01591934, + "auxiliary_loss_mlp": 0.00661083, + "balance_loss_clip": 1.19585764, + "balance_loss_mlp": 0.59423083, + "epoch": 0.010281076206222756, + "flos": 31285124847360.0, + "grad_norm": 12.454374482471781, + "language_loss": 0.95813894, + "learning_rate": 3.3104713076972827e-06, + "loss": 0.98066902, + "num_input_tokens_seen": 3618905, + "router_z_loss_clip": 3.95898438, + "router_z_loss_mlp": 0.66894531, + "step": 171, + "time_per_iteration": 2.744884490966797 + }, + { + "auxiliary_loss_clip": 0.01571606, + "auxiliary_loss_mlp": 0.00647152, + "balance_loss_clip": 1.19140601, + "balance_loss_mlp": 0.58172965, + "epoch": 0.010341199458890726, + "flos": 21982286568960.0, + "grad_norm": 1489.8922747559545, + "language_loss": 0.96083951, + "learning_rate": 3.314225558471224e-06, + "loss": 0.9830271, + "num_input_tokens_seen": 3639610, + "router_z_loss_clip": 3.80273438, + "router_z_loss_mlp": 0.65429688, + "step": 172, + "time_per_iteration": 2.63980770111084 + }, + { + "auxiliary_loss_clip": 0.01543207, + "auxiliary_loss_mlp": 0.00631501, + "balance_loss_clip": 1.17982912, + "balance_loss_mlp": 0.57056147, + "epoch": 0.010401322711558695, + "flos": 30810529422720.0, + "grad_norm": 39.35570339201625, + "language_loss": 0.88731027, + "learning_rate": 3.317958045350308e-06, + "loss": 0.90905738, + "num_input_tokens_seen": 3664030, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 0.609375, + "step": 173, + "time_per_iteration": 2.7362773418426514 + }, + { + "auxiliary_loss_clip": 0.01579008, + "auxiliary_loss_mlp": 0.00634998, + "balance_loss_clip": 1.19688904, + "balance_loss_mlp": 0.57257998, + "epoch": 0.010461445964226665, + "flos": 24715124098560.0, + "grad_norm": 34.04474970822509, + "language_loss": 0.90769964, + "learning_rate": 3.3216690192172596e-06, + "loss": 0.92983967, + "num_input_tokens_seen": 3683615, + "router_z_loss_clip": 3.82421875, + "router_z_loss_mlp": 0.62402344, + "step": 174, + "time_per_iteration": 2.6625475883483887 + }, + { + "auxiliary_loss_clip": 0.01559468, + "auxiliary_loss_mlp": 0.00627325, + "balance_loss_clip": 1.18548846, + "balance_loss_mlp": 0.5672431, + "epoch": 0.010521569216894634, + "flos": 27710361457920.0, + "grad_norm": 25.326286542282165, + "language_loss": 0.79154444, + "learning_rate": 3.325358726641591e-06, + "loss": 0.81341231, + "num_input_tokens_seen": 3704540, + "router_z_loss_clip": 3.7421875, + "router_z_loss_mlp": 0.60009766, + "step": 175, + "time_per_iteration": 2.7443044185638428 + }, + { + "auxiliary_loss_clip": 0.01545853, + "auxiliary_loss_mlp": 0.00624451, + "balance_loss_clip": 1.17718959, + "balance_loss_mlp": 0.55869508, + "epoch": 0.010581692469562603, + "flos": 12458346122880.0, + "grad_norm": 58.38761549940081, + "language_loss": 1.06906199, + "learning_rate": 3.329027409977902e-06, + "loss": 1.09076512, + "num_input_tokens_seen": 3721320, + "router_z_loss_clip": 3.68554688, + "router_z_loss_mlp": 0.65722656, + "step": 176, + "time_per_iteration": 2.642580270767212 + }, + { + "auxiliary_loss_clip": 0.01553325, + "auxiliary_loss_mlp": 0.00616911, + "balance_loss_clip": 1.18271804, + "balance_loss_mlp": 0.55549425, + "epoch": 0.010641815722230573, + "flos": 19427601519360.0, + "grad_norm": 86.77421767160196, + "language_loss": 0.85222119, + "learning_rate": 3.3326753074614087e-06, + "loss": 0.87392354, + "num_input_tokens_seen": 3739385, + "router_z_loss_clip": 3.70703125, + "router_z_loss_mlp": 0.61474609, + "step": 177, + "time_per_iteration": 2.6845474243164062 + }, + { + "auxiliary_loss_clip": 0.0157588, + "auxiliary_loss_mlp": 0.00596528, + "balance_loss_clip": 1.18894255, + "balance_loss_mlp": 0.53415787, + "epoch": 0.010701938974898541, + "flos": 18332577452160.0, + "grad_norm": 38.932481738464716, + "language_loss": 0.89594585, + "learning_rate": 3.3363026533007716e-06, + "loss": 0.91767001, + "num_input_tokens_seen": 3756360, + "router_z_loss_clip": 3.87304688, + "router_z_loss_mlp": 0.62353516, + "step": 178, + "time_per_iteration": 2.6692745685577393 + }, + { + "auxiliary_loss_clip": 0.01564698, + "auxiliary_loss_mlp": 0.00607416, + "balance_loss_clip": 1.18618131, + "balance_loss_mlp": 0.54557061, + "epoch": 0.010762062227566512, + "flos": 19203985399680.0, + "grad_norm": 13.056752990531393, + "language_loss": 0.92749262, + "learning_rate": 3.3399096777683303e-06, + "loss": 0.9492138, + "num_input_tokens_seen": 3773930, + "router_z_loss_clip": 3.7890625, + "router_z_loss_mlp": 0.61914062, + "step": 179, + "time_per_iteration": 2.634744882583618 + }, + { + "auxiliary_loss_clip": 0.01580928, + "auxiliary_loss_mlp": 0.00584861, + "balance_loss_clip": 1.19354916, + "balance_loss_mlp": 0.52168041, + "epoch": 0.01082218548023448, + "flos": 31425427370880.0, + "grad_norm": 268.3947636101802, + "language_loss": 0.92995822, + "learning_rate": 3.3434966072878213e-06, + "loss": 0.95161617, + "num_input_tokens_seen": 3793630, + "router_z_loss_clip": 3.875, + "router_z_loss_mlp": 0.63110352, + "step": 180, + "time_per_iteration": 2.727526903152466 + }, + { + "auxiliary_loss_clip": 0.01589344, + "auxiliary_loss_mlp": 0.00616983, + "balance_loss_clip": 1.20378017, + "balance_loss_mlp": 0.55179977, + "epoch": 0.01088230873290245, + "flos": 25046436170880.0, + "grad_norm": 12.207737165871551, + "language_loss": 0.8537823, + "learning_rate": 3.3470636645196674e-06, + "loss": 0.87584555, + "num_input_tokens_seen": 3813610, + "router_z_loss_clip": 3.85742188, + "router_z_loss_mlp": 0.65136719, + "step": 181, + "time_per_iteration": 2.725041151046753 + }, + { + "auxiliary_loss_clip": 0.01594552, + "auxiliary_loss_mlp": 0.00587309, + "balance_loss_clip": 1.20277941, + "balance_loss_mlp": 0.52422357, + "epoch": 0.01094243198557042, + "flos": 22893411980160.0, + "grad_norm": 28.91984465366057, + "language_loss": 0.88539469, + "learning_rate": 3.3506110684439156e-06, + "loss": 0.90721333, + "num_input_tokens_seen": 3831390, + "router_z_loss_clip": 3.91601562, + "router_z_loss_mlp": 0.63085938, + "step": 182, + "time_per_iteration": 2.7122724056243896 + }, + { + "auxiliary_loss_clip": 0.01611154, + "auxiliary_loss_mlp": 0.00574381, + "balance_loss_clip": 1.21825707, + "balance_loss_mlp": 0.51196289, + "epoch": 0.011002555238238388, + "flos": 17165049782400.0, + "grad_norm": 207.6845951392771, + "language_loss": 0.96730793, + "learning_rate": 3.3541390344409054e-06, + "loss": 0.98916328, + "num_input_tokens_seen": 3849705, + "router_z_loss_clip": 3.92773438, + "router_z_loss_mlp": 0.62451172, + "step": 183, + "time_per_iteration": 2.6229159832000732 + }, + { + "auxiliary_loss_clip": 0.01602611, + "auxiliary_loss_mlp": 0.00606016, + "balance_loss_clip": 1.21063662, + "balance_loss_mlp": 0.54040277, + "epoch": 0.011062678490906358, + "flos": 22310150935680.0, + "grad_norm": 131.47900804308713, + "language_loss": 0.94533432, + "learning_rate": 3.357647774369736e-06, + "loss": 0.96742058, + "num_input_tokens_seen": 3869230, + "router_z_loss_clip": 3.921875, + "router_z_loss_mlp": 0.65625, + "step": 184, + "time_per_iteration": 2.7038180828094482 + }, + { + "auxiliary_loss_clip": 0.01613853, + "auxiliary_loss_mlp": 0.00538514, + "balance_loss_clip": 1.22603488, + "balance_loss_mlp": 0.47926664, + "epoch": 0.011122801743574327, + "flos": 24388373053440.0, + "grad_norm": 27.018539465882636, + "language_loss": 0.91888285, + "learning_rate": 3.3611374966446085e-06, + "loss": 0.94040644, + "num_input_tokens_seen": 3889735, + "router_z_loss_clip": 3.87890625, + "router_z_loss_mlp": 0.59301758, + "step": 185, + "time_per_iteration": 2.6613054275512695 + }, + { + "auxiliary_loss_clip": 0.01622394, + "auxiliary_loss_mlp": 0.00608331, + "balance_loss_clip": 1.22446799, + "balance_loss_mlp": 0.54028678, + "epoch": 0.011182924996242297, + "flos": 18150258994560.0, + "grad_norm": 68.79651855810427, + "language_loss": 0.80130154, + "learning_rate": 3.3646084063091142e-06, + "loss": 0.82360876, + "num_input_tokens_seen": 3908855, + "router_z_loss_clip": 3.98046875, + "router_z_loss_mlp": 0.68066406, + "step": 186, + "time_per_iteration": 2.6364312171936035 + }, + { + "auxiliary_loss_clip": 0.01612141, + "auxiliary_loss_mlp": 0.0058272, + "balance_loss_clip": 1.22270823, + "balance_loss_mlp": 0.52201867, + "epoch": 0.011243048248910266, + "flos": 15486800584320.0, + "grad_norm": 75.16434513413567, + "language_loss": 1.10130453, + "learning_rate": 3.3680607051085194e-06, + "loss": 1.12325311, + "num_input_tokens_seen": 3923865, + "router_z_loss_clip": 3.89453125, + "router_z_loss_mlp": 0.60644531, + "step": 187, + "time_per_iteration": 4.127197027206421 + }, + { + "auxiliary_loss_clip": 0.01619047, + "auxiliary_loss_mlp": 0.0056733, + "balance_loss_clip": 1.23301578, + "balance_loss_mlp": 0.50720108, + "epoch": 0.011303171501578235, + "flos": 40916868986880.0, + "grad_norm": 13.68927363712344, + "language_loss": 0.81994116, + "learning_rate": 3.371494591560139e-06, + "loss": 0.84180486, + "num_input_tokens_seen": 3946870, + "router_z_loss_clip": 3.86328125, + "router_z_loss_mlp": 0.60083008, + "step": 188, + "time_per_iteration": 4.269714832305908 + }, + { + "auxiliary_loss_clip": 0.01461046, + "auxiliary_loss_mlp": 0.00506188, + "balance_loss_clip": 1.17419195, + "balance_loss_mlp": 0.47700557, + "epoch": 0.011363294754246205, + "flos": 66302697790080.0, + "grad_norm": 0.7738953343683723, + "language_loss": 0.55504274, + "learning_rate": 3.3749102610218297e-06, + "loss": 0.57471502, + "num_input_tokens_seen": 4010005, + "router_z_loss_clip": 2.875, + "router_z_loss_mlp": 0.29101562, + "step": 189, + "time_per_iteration": 3.2164976596832275 + }, + { + "auxiliary_loss_clip": 0.01620531, + "auxiliary_loss_mlp": 0.00570427, + "balance_loss_clip": 1.23081672, + "balance_loss_mlp": 0.50815183, + "epoch": 0.011423418006914174, + "flos": 24900279730560.0, + "grad_norm": 21.9776498791501, + "language_loss": 1.04523301, + "learning_rate": 3.3783079057586833e-06, + "loss": 1.06714249, + "num_input_tokens_seen": 4029035, + "router_z_loss_clip": 3.89648438, + "router_z_loss_mlp": 0.62353516, + "step": 190, + "time_per_iteration": 2.6951098442077637 + }, + { + "auxiliary_loss_clip": 0.01595422, + "auxiliary_loss_mlp": 0.00588818, + "balance_loss_clip": 1.21567011, + "balance_loss_mlp": 0.53209782, + "epoch": 0.011483541259582144, + "flos": 19791879298560.0, + "grad_norm": 257.96569814789456, + "language_loss": 0.91753006, + "learning_rate": 3.3816877150079665e-06, + "loss": 0.93937242, + "num_input_tokens_seen": 4046995, + "router_z_loss_clip": 3.79492188, + "router_z_loss_mlp": 0.56738281, + "step": 191, + "time_per_iteration": 2.6398918628692627 + }, + { + "auxiliary_loss_clip": 0.0159607, + "auxiliary_loss_mlp": 0.00563988, + "balance_loss_clip": 1.21540308, + "balance_loss_mlp": 0.50831723, + "epoch": 0.011543664512250112, + "flos": 26176939896960.0, + "grad_norm": 4.655397784307342, + "language_loss": 0.98557943, + "learning_rate": 3.385049875042367e-06, + "loss": 1.00717998, + "num_input_tokens_seen": 4065865, + "router_z_loss_clip": 3.80273438, + "router_z_loss_mlp": 0.55761719, + "step": 192, + "time_per_iteration": 2.7107768058776855 + }, + { + "auxiliary_loss_clip": 0.01583962, + "auxiliary_loss_mlp": 0.00638442, + "balance_loss_clip": 1.20853102, + "balance_loss_mlp": 0.57492709, + "epoch": 0.011603787764918083, + "flos": 23768985905280.0, + "grad_norm": 154.5776993070653, + "language_loss": 0.950854, + "learning_rate": 3.3883945692315938e-06, + "loss": 0.97307807, + "num_input_tokens_seen": 4085305, + "router_z_loss_clip": 3.76171875, + "router_z_loss_mlp": 0.63525391, + "step": 193, + "time_per_iteration": 2.6503746509552 + }, + { + "auxiliary_loss_clip": 0.01576143, + "auxiliary_loss_mlp": 0.00598983, + "balance_loss_clip": 1.20520329, + "balance_loss_mlp": 0.54624504, + "epoch": 0.011663911017586051, + "flos": 25954688494080.0, + "grad_norm": 13.18657964905435, + "language_loss": 1.00363481, + "learning_rate": 3.3917219781023906e-06, + "loss": 1.0253861, + "num_input_tokens_seen": 4105185, + "router_z_loss_clip": 3.7109375, + "router_z_loss_mlp": 0.52758789, + "step": 194, + "time_per_iteration": 2.678149700164795 + }, + { + "auxiliary_loss_clip": 0.01569893, + "auxiliary_loss_mlp": 0.00636595, + "balance_loss_clip": 1.19969916, + "balance_loss_mlp": 0.58163995, + "epoch": 0.01172403427025402, + "flos": 17895149625600.0, + "grad_norm": 286.0909421657423, + "language_loss": 1.01452959, + "learning_rate": 3.3950322793970014e-06, + "loss": 1.03659451, + "num_input_tokens_seen": 4123160, + "router_z_loss_clip": 3.69921875, + "router_z_loss_mlp": 0.55029297, + "step": 195, + "time_per_iteration": 2.6224329471588135 + }, + { + "auxiliary_loss_clip": 0.01565051, + "auxiliary_loss_mlp": 0.00619708, + "balance_loss_clip": 1.20195889, + "balance_loss_mlp": 0.56611192, + "epoch": 0.01178415752292199, + "flos": 17894539094400.0, + "grad_norm": 71.11732831761992, + "language_loss": 0.955984, + "learning_rate": 3.3983256481301445e-06, + "loss": 0.97783154, + "num_input_tokens_seen": 4140425, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 0.53564453, + "step": 196, + "time_per_iteration": 2.6526803970336914 + }, + { + "auxiliary_loss_clip": 0.01556823, + "auxiliary_loss_mlp": 0.0066075, + "balance_loss_clip": 1.19599283, + "balance_loss_mlp": 0.60758305, + "epoch": 0.011844280775589959, + "flos": 22893555634560.0, + "grad_norm": 64.45095057168676, + "language_loss": 1.01164472, + "learning_rate": 3.4016022566445335e-06, + "loss": 1.03382051, + "num_input_tokens_seen": 4159555, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 0.53125, + "step": 197, + "time_per_iteration": 2.6944479942321777 + }, + { + "auxiliary_loss_clip": 0.01547856, + "auxiliary_loss_mlp": 0.00655646, + "balance_loss_clip": 1.18900299, + "balance_loss_mlp": 0.60021365, + "epoch": 0.01190440402825793, + "flos": 26980333441920.0, + "grad_norm": 56.588184045404624, + "language_loss": 0.87437308, + "learning_rate": 3.4048622746649966e-06, + "loss": 0.89640808, + "num_input_tokens_seen": 4180480, + "router_z_loss_clip": 3.58789062, + "router_z_loss_mlp": 0.55517578, + "step": 198, + "time_per_iteration": 2.770284414291382 + }, + { + "auxiliary_loss_clip": 0.01544437, + "auxiliary_loss_mlp": 0.00664084, + "balance_loss_clip": 1.19115996, + "balance_loss_mlp": 0.61556578, + "epoch": 0.011964527280925898, + "flos": 20521584092160.0, + "grad_norm": 65.43650243893312, + "language_loss": 0.94706368, + "learning_rate": 3.4081058693512278e-06, + "loss": 0.96914881, + "num_input_tokens_seen": 4198835, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 0.48510742, + "step": 199, + "time_per_iteration": 2.783076763153076 + }, + { + "auxiliary_loss_clip": 0.0155405, + "auxiliary_loss_mlp": 0.00742345, + "balance_loss_clip": 1.19207299, + "balance_loss_mlp": 0.68479085, + "epoch": 0.012024650533593867, + "flos": 27745984771200.0, + "grad_norm": 18.741226555700443, + "language_loss": 0.88611114, + "learning_rate": 3.411333205349222e-06, + "loss": 0.90907514, + "num_input_tokens_seen": 4219335, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 0.57617188, + "step": 200, + "time_per_iteration": 2.698286294937134 + }, + { + "auxiliary_loss_clip": 0.01561633, + "auxiliary_loss_mlp": 0.00734543, + "balance_loss_clip": 1.1927526, + "balance_loss_mlp": 0.67560554, + "epoch": 0.012084773786261837, + "flos": 10452017076480.0, + "grad_norm": 10.506897469222446, + "language_loss": 0.98028105, + "learning_rate": 3.4145444448414217e-06, + "loss": 1.00324273, + "num_input_tokens_seen": 4236940, + "router_z_loss_clip": 3.68945312, + "router_z_loss_mlp": 0.58935547, + "step": 201, + "time_per_iteration": 2.617231845855713 + }, + { + "auxiliary_loss_clip": 0.01568704, + "auxiliary_loss_mlp": 0.00709125, + "balance_loss_clip": 1.20021772, + "balance_loss_mlp": 0.65486121, + "epoch": 0.012144897038929806, + "flos": 23105751229440.0, + "grad_norm": 58.30983622113242, + "language_loss": 0.91112447, + "learning_rate": 3.4177397475956223e-06, + "loss": 0.9339028, + "num_input_tokens_seen": 4256755, + "router_z_loss_clip": 3.68554688, + "router_z_loss_mlp": 0.54272461, + "step": 202, + "time_per_iteration": 2.654141426086426 + }, + { + "auxiliary_loss_clip": 0.01563421, + "auxiliary_loss_mlp": 0.00745806, + "balance_loss_clip": 1.19565558, + "balance_loss_mlp": 0.68818057, + "epoch": 0.012205020291597776, + "flos": 21033203460480.0, + "grad_norm": 58.105315983245575, + "language_loss": 0.97845, + "learning_rate": 3.4209192710126685e-06, + "loss": 1.00154233, + "num_input_tokens_seen": 4276505, + "router_z_loss_clip": 3.67773438, + "router_z_loss_mlp": 0.57617188, + "step": 203, + "time_per_iteration": 2.724932909011841 + }, + { + "auxiliary_loss_clip": 0.01400112, + "auxiliary_loss_mlp": 0.00602754, + "balance_loss_clip": 1.12785184, + "balance_loss_mlp": 0.572999, + "epoch": 0.012265143544265745, + "flos": 68447785075200.0, + "grad_norm": 1.004336381848441, + "language_loss": 0.60621649, + "learning_rate": 3.4240831701729837e-06, + "loss": 0.62624514, + "num_input_tokens_seen": 4330965, + "router_z_loss_clip": 2.71875, + "router_z_loss_mlp": 0.296875, + "step": 204, + "time_per_iteration": 3.1386635303497314 + }, + { + "auxiliary_loss_clip": 0.0156447, + "auxiliary_loss_mlp": 0.00714389, + "balance_loss_clip": 1.19143367, + "balance_loss_mlp": 0.65609545, + "epoch": 0.012325266796933715, + "flos": 17019252478080.0, + "grad_norm": 1007.4251654515991, + "language_loss": 0.99754077, + "learning_rate": 3.4272315978819516e-06, + "loss": 1.02032936, + "num_input_tokens_seen": 4348200, + "router_z_loss_clip": 3.72460938, + "router_z_loss_mlp": 0.58276367, + "step": 205, + "time_per_iteration": 2.656306266784668 + }, + { + "auxiliary_loss_clip": 0.01572238, + "auxiliary_loss_mlp": 0.00617556, + "balance_loss_clip": 1.19592643, + "balance_loss_mlp": 0.56238568, + "epoch": 0.012385390049601683, + "flos": 20190056538240.0, + "grad_norm": 7.591315498388621, + "language_loss": 0.97027284, + "learning_rate": 3.4303647047142043e-06, + "loss": 0.99217075, + "num_input_tokens_seen": 4365460, + "router_z_loss_clip": 3.76757812, + "router_z_loss_mlp": 0.55224609, + "step": 206, + "time_per_iteration": 2.612391233444214 + }, + { + "auxiliary_loss_clip": 0.01566899, + "auxiliary_loss_mlp": 0.00642157, + "balance_loss_clip": 1.19064474, + "balance_loss_mlp": 0.58510369, + "epoch": 0.012445513302269652, + "flos": 16253134272000.0, + "grad_norm": 19.111539624077594, + "language_loss": 1.05933738, + "learning_rate": 3.43348263905683e-06, + "loss": 1.08142793, + "num_input_tokens_seen": 4383650, + "router_z_loss_clip": 3.76171875, + "router_z_loss_mlp": 0.57055664, + "step": 207, + "time_per_iteration": 2.631415367126465 + }, + { + "auxiliary_loss_clip": 0.0155764, + "auxiliary_loss_mlp": 0.00654626, + "balance_loss_clip": 1.1859194, + "balance_loss_mlp": 0.59206522, + "epoch": 0.012505636554937622, + "flos": 23769380954880.0, + "grad_norm": 39.73522262294019, + "language_loss": 0.82715881, + "learning_rate": 3.436585547151547e-06, + "loss": 0.84928143, + "num_input_tokens_seen": 4403765, + "router_z_loss_clip": 3.71679688, + "router_z_loss_mlp": 0.62548828, + "step": 208, + "time_per_iteration": 2.6905648708343506 + }, + { + "auxiliary_loss_clip": 0.0155132, + "auxiliary_loss_mlp": 0.00652378, + "balance_loss_clip": 1.18596292, + "balance_loss_mlp": 0.59167635, + "epoch": 0.012565759807605591, + "flos": 30591546157440.0, + "grad_norm": 10.55452011055248, + "language_loss": 1.06887817, + "learning_rate": 3.4396735731358586e-06, + "loss": 1.09091508, + "num_input_tokens_seen": 4421935, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 0.60742188, + "step": 209, + "time_per_iteration": 2.761862277984619 + }, + { + "auxiliary_loss_clip": 0.01557561, + "auxiliary_loss_mlp": 0.0056308, + "balance_loss_clip": 1.18741155, + "balance_loss_mlp": 0.50745672, + "epoch": 0.012625883060273561, + "flos": 40113511355520.0, + "grad_norm": 33.648708585269524, + "language_loss": 0.94585407, + "learning_rate": 3.4427468590832302e-06, + "loss": 0.96706045, + "num_input_tokens_seen": 4441470, + "router_z_loss_clip": 3.70898438, + "router_z_loss_mlp": 0.5559082, + "step": 210, + "time_per_iteration": 2.7711572647094727 + }, + { + "auxiliary_loss_clip": 0.01551024, + "auxiliary_loss_mlp": 0.00572182, + "balance_loss_clip": 1.18215847, + "balance_loss_mlp": 0.51457977, + "epoch": 0.01268600631294153, + "flos": 27089178629760.0, + "grad_norm": 38.57271449529399, + "language_loss": 1.0240258, + "learning_rate": 3.445805545042314e-06, + "loss": 1.04525781, + "num_input_tokens_seen": 4459950, + "router_z_loss_clip": 3.68945312, + "router_z_loss_mlp": 0.57617188, + "step": 211, + "time_per_iteration": 2.779919385910034 + }, + { + "auxiliary_loss_clip": 0.01563064, + "auxiliary_loss_mlp": 0.00592989, + "balance_loss_clip": 1.19039619, + "balance_loss_mlp": 0.53071451, + "epoch": 0.012746129565609499, + "flos": 16982767238400.0, + "grad_norm": 68.35424893992301, + "language_loss": 1.03842223, + "learning_rate": 3.448849769075239e-06, + "loss": 1.05998278, + "num_input_tokens_seen": 4478390, + "router_z_loss_clip": 3.73046875, + "router_z_loss_mlp": 0.62255859, + "step": 212, + "time_per_iteration": 2.607290267944336 + }, + { + "auxiliary_loss_clip": 0.01558939, + "auxiliary_loss_mlp": 0.0054631, + "balance_loss_clip": 1.19377351, + "balance_loss_mlp": 0.49190316, + "epoch": 0.012806252818277469, + "flos": 46533476995200.0, + "grad_norm": 9.16904552373241, + "language_loss": 0.84267753, + "learning_rate": 3.4518796672950093e-06, + "loss": 0.86373001, + "num_input_tokens_seen": 4501665, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 0.54443359, + "step": 213, + "time_per_iteration": 2.8693034648895264 + }, + { + "auxiliary_loss_clip": 0.01556984, + "auxiliary_loss_mlp": 0.0055737, + "balance_loss_clip": 1.18931401, + "balance_loss_mlp": 0.50422692, + "epoch": 0.012866376070945438, + "flos": 14388616120320.0, + "grad_norm": 234.90236881503495, + "language_loss": 0.95823032, + "learning_rate": 3.4548953739020187e-06, + "loss": 0.97937381, + "num_input_tokens_seen": 4519055, + "router_z_loss_clip": 3.6796875, + "router_z_loss_mlp": 0.53125, + "step": 214, + "time_per_iteration": 2.6336557865142822 + }, + { + "auxiliary_loss_clip": 0.01559645, + "auxiliary_loss_mlp": 0.00549369, + "balance_loss_clip": 1.19710743, + "balance_loss_mlp": 0.49698812, + "epoch": 0.012926499323613408, + "flos": 26140813793280.0, + "grad_norm": 354.88863794634847, + "language_loss": 0.86068559, + "learning_rate": 3.4578970212197196e-06, + "loss": 0.8817758, + "num_input_tokens_seen": 4540870, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 0.52416992, + "step": 215, + "time_per_iteration": 2.6882364749908447 + }, + { + "auxiliary_loss_clip": 0.01588028, + "auxiliary_loss_mlp": 0.00634422, + "balance_loss_clip": 1.21162534, + "balance_loss_mlp": 0.5725044, + "epoch": 0.012986622576281377, + "flos": 30117202128000.0, + "grad_norm": 172.79752638185997, + "language_loss": 1.00035024, + "learning_rate": 3.460884739729461e-06, + "loss": 1.02257466, + "num_input_tokens_seen": 4560395, + "router_z_loss_clip": 3.76171875, + "router_z_loss_mlp": 0.61889648, + "step": 216, + "time_per_iteration": 2.752753257751465 + }, + { + "auxiliary_loss_clip": 0.01594513, + "auxiliary_loss_mlp": 0.00586919, + "balance_loss_clip": 1.216097, + "balance_loss_mlp": 0.52864909, + "epoch": 0.013046745828949347, + "flos": 13954025468160.0, + "grad_norm": 51.54354903624533, + "language_loss": 1.06081915, + "learning_rate": 3.463858658104523e-06, + "loss": 1.0826335, + "num_input_tokens_seen": 4575785, + "router_z_loss_clip": 3.78320312, + "router_z_loss_mlp": 0.58300781, + "step": 217, + "time_per_iteration": 2.6360628604888916 + }, + { + "auxiliary_loss_clip": 0.0159155, + "auxiliary_loss_mlp": 0.00590419, + "balance_loss_clip": 1.21482003, + "balance_loss_mlp": 0.53079009, + "epoch": 0.013106869081617315, + "flos": 17347835116800.0, + "grad_norm": 77.50927215667582, + "language_loss": 1.0089674, + "learning_rate": 3.4668189032433696e-06, + "loss": 1.03078699, + "num_input_tokens_seen": 4594985, + "router_z_loss_clip": 3.765625, + "router_z_loss_mlp": 0.59594727, + "step": 218, + "time_per_iteration": 2.6822335720062256 + }, + { + "auxiliary_loss_clip": 0.01596823, + "auxiliary_loss_mlp": 0.00571851, + "balance_loss_clip": 1.22095132, + "balance_loss_mlp": 0.51525068, + "epoch": 0.013166992334285284, + "flos": 25884914325120.0, + "grad_norm": 237.59621494351723, + "language_loss": 0.93932462, + "learning_rate": 3.46976560030214e-06, + "loss": 0.96101141, + "num_input_tokens_seen": 4616125, + "router_z_loss_clip": 3.75976562, + "router_z_loss_mlp": 0.56591797, + "step": 219, + "time_per_iteration": 2.6951491832733154 + }, + { + "auxiliary_loss_clip": 0.01627395, + "auxiliary_loss_mlp": 0.005767, + "balance_loss_clip": 1.24566078, + "balance_loss_mlp": 0.52119625, + "epoch": 0.013227115586953254, + "flos": 31175956437120.0, + "grad_norm": 12.711095472783795, + "language_loss": 0.94856256, + "learning_rate": 3.4726988727263976e-06, + "loss": 0.97060347, + "num_input_tokens_seen": 4637795, + "router_z_loss_clip": 3.81640625, + "router_z_loss_mlp": 0.55419922, + "step": 220, + "time_per_iteration": 2.7985880374908447 + }, + { + "auxiliary_loss_clip": 0.01636169, + "auxiliary_loss_mlp": 0.00548354, + "balance_loss_clip": 1.25030375, + "balance_loss_mlp": 0.4983815, + "epoch": 0.013287238839621223, + "flos": 20409470766720.0, + "grad_norm": 221.0083291963642, + "language_loss": 0.93181264, + "learning_rate": 3.475618842282164e-06, + "loss": 0.95365787, + "num_input_tokens_seen": 4656835, + "router_z_loss_clip": 3.86132812, + "router_z_loss_mlp": 0.5, + "step": 221, + "time_per_iteration": 2.64241886138916 + }, + { + "auxiliary_loss_clip": 0.01685497, + "auxiliary_loss_mlp": 0.00559162, + "balance_loss_clip": 1.28239822, + "balance_loss_mlp": 0.50141662, + "epoch": 0.013347362092289193, + "flos": 14137134024960.0, + "grad_norm": 23.650828221004197, + "language_loss": 1.00632095, + "learning_rate": 3.4785256290862486e-06, + "loss": 1.02876759, + "num_input_tokens_seen": 4673015, + "router_z_loss_clip": 4.02929688, + "router_z_loss_mlp": 0.57714844, + "step": 222, + "time_per_iteration": 2.6198606491088867 + }, + { + "auxiliary_loss_clip": 0.01698042, + "auxiliary_loss_mlp": 0.00598077, + "balance_loss_clip": 1.29351223, + "balance_loss_mlp": 0.53737533, + "epoch": 0.013407485344957162, + "flos": 21797705554560.0, + "grad_norm": 58.051248063839516, + "language_loss": 1.02626121, + "learning_rate": 3.481419351635897e-06, + "loss": 1.04922235, + "num_input_tokens_seen": 4692355, + "router_z_loss_clip": 4.04492188, + "router_z_loss_mlp": 0.60693359, + "step": 223, + "time_per_iteration": 2.6706135272979736 + }, + { + "auxiliary_loss_clip": 0.01721336, + "auxiliary_loss_mlp": 0.005916, + "balance_loss_clip": 1.30073416, + "balance_loss_mlp": 0.53404534, + "epoch": 0.013467608597625132, + "flos": 18621622195200.0, + "grad_norm": 6535.04896122925, + "language_loss": 0.98268652, + "learning_rate": 3.484300126837776e-06, + "loss": 1.00581586, + "num_input_tokens_seen": 4710080, + "router_z_loss_clip": 4.20898438, + "router_z_loss_mlp": 0.57543945, + "step": 224, + "time_per_iteration": 2.636991262435913 + }, + { + "auxiliary_loss_clip": 0.01760255, + "auxiliary_loss_mlp": 0.00595519, + "balance_loss_clip": 1.32432592, + "balance_loss_mlp": 0.53705883, + "epoch": 0.013527731850293101, + "flos": 18552314903040.0, + "grad_norm": 54.000050666786414, + "language_loss": 0.9857409, + "learning_rate": 3.487168070036317e-06, + "loss": 1.00929856, + "num_input_tokens_seen": 4728980, + "router_z_loss_clip": 4.359375, + "router_z_loss_mlp": 0.58447266, + "step": 225, + "time_per_iteration": 2.746201515197754 + }, + { + "auxiliary_loss_clip": 0.01767756, + "auxiliary_loss_mlp": 0.00577927, + "balance_loss_clip": 1.33693051, + "balance_loss_mlp": 0.52197039, + "epoch": 0.01358785510296107, + "flos": 19165381257600.0, + "grad_norm": 3167.3129793268704, + "language_loss": 1.0526216, + "learning_rate": 3.4900232950414224e-06, + "loss": 1.07607841, + "num_input_tokens_seen": 4747020, + "router_z_loss_clip": 4.3046875, + "router_z_loss_mlp": 0.55883789, + "step": 226, + "time_per_iteration": 2.657371759414673 + }, + { + "auxiliary_loss_clip": 0.01810365, + "auxiliary_loss_mlp": 0.00585321, + "balance_loss_clip": 1.35201335, + "balance_loss_mlp": 0.52504897, + "epoch": 0.01364797835562904, + "flos": 23329941966720.0, + "grad_norm": 56.58997924325406, + "language_loss": 0.99236029, + "learning_rate": 3.4928659141555727e-06, + "loss": 1.01631713, + "num_input_tokens_seen": 4765000, + "router_z_loss_clip": 4.58203125, + "router_z_loss_mlp": 0.60302734, + "step": 227, + "time_per_iteration": 2.727694272994995 + }, + { + "auxiliary_loss_clip": 0.01819124, + "auxiliary_loss_mlp": 0.00458225, + "balance_loss_clip": 1.41110277, + "balance_loss_mlp": 0.42770791, + "epoch": 0.013708101608297009, + "flos": 70993746097920.0, + "grad_norm": 0.9981676246161005, + "language_loss": 0.57178611, + "learning_rate": 3.4956960382003234e-06, + "loss": 0.59455955, + "num_input_tokens_seen": 4833210, + "router_z_loss_clip": 4.0625, + "router_z_loss_mlp": 0.3046875, + "step": 228, + "time_per_iteration": 3.2424325942993164 + }, + { + "auxiliary_loss_clip": 0.01779778, + "auxiliary_loss_mlp": 0.00539834, + "balance_loss_clip": 1.34304762, + "balance_loss_mlp": 0.48747757, + "epoch": 0.013768224860964979, + "flos": 16325170997760.0, + "grad_norm": 167.36505794293024, + "language_loss": 0.96456593, + "learning_rate": 3.4985137765422354e-06, + "loss": 0.98776203, + "num_input_tokens_seen": 4850120, + "router_z_loss_clip": 4.37109375, + "router_z_loss_mlp": 0.5234375, + "step": 229, + "time_per_iteration": 5.511472702026367 + }, + { + "auxiliary_loss_clip": 0.01819645, + "auxiliary_loss_mlp": 0.00590883, + "balance_loss_clip": 1.35915709, + "balance_loss_mlp": 0.53187484, + "epoch": 0.013828348113632948, + "flos": 20193037367040.0, + "grad_norm": 222.86560992138374, + "language_loss": 0.91976517, + "learning_rate": 3.501319237118231e-06, + "loss": 0.94387048, + "num_input_tokens_seen": 4866215, + "router_z_loss_clip": 4.59765625, + "router_z_loss_mlp": 0.58959961, + "step": 230, + "time_per_iteration": 4.105376958847046 + }, + { + "auxiliary_loss_clip": 0.01787017, + "auxiliary_loss_mlp": 0.00552044, + "balance_loss_clip": 1.34928548, + "balance_loss_mlp": 0.49890053, + "epoch": 0.013888471366300916, + "flos": 20741070147840.0, + "grad_norm": 9.947321187751191, + "language_loss": 0.96040511, + "learning_rate": 3.5041125264604056e-06, + "loss": 0.98379576, + "num_input_tokens_seen": 4885630, + "router_z_loss_clip": 4.37304688, + "router_z_loss_mlp": 0.53051758, + "step": 231, + "time_per_iteration": 2.660130739212036 + }, + { + "auxiliary_loss_clip": 0.01787912, + "auxiliary_loss_mlp": 0.00562979, + "balance_loss_clip": 1.34769464, + "balance_loss_mlp": 0.51026428, + "epoch": 0.013948594618968886, + "flos": 22090628966400.0, + "grad_norm": 24.128470373150417, + "language_loss": 0.93142104, + "learning_rate": 3.5068937497203002e-06, + "loss": 0.95492995, + "num_input_tokens_seen": 4905570, + "router_z_loss_clip": 4.40234375, + "router_z_loss_mlp": 0.52661133, + "step": 232, + "time_per_iteration": 2.609828233718872 + }, + { + "auxiliary_loss_clip": 0.0180468, + "auxiliary_loss_mlp": 0.00570568, + "balance_loss_clip": 1.34959757, + "balance_loss_mlp": 0.51566029, + "epoch": 0.014008717871636855, + "flos": 19063108258560.0, + "grad_norm": 762.0860104031582, + "language_loss": 0.83401281, + "learning_rate": 3.509663010692652e-06, + "loss": 0.85776532, + "num_input_tokens_seen": 4923535, + "router_z_loss_clip": 4.55078125, + "router_z_loss_mlp": 0.54907227, + "step": 233, + "time_per_iteration": 2.627774715423584 + }, + { + "auxiliary_loss_clip": 0.01778407, + "auxiliary_loss_mlp": 0.00584006, + "balance_loss_clip": 1.34249258, + "balance_loss_mlp": 0.5257839, + "epoch": 0.014068841124304825, + "flos": 14530822064640.0, + "grad_norm": 67.4954048858811, + "language_loss": 0.94033861, + "learning_rate": 3.512420411838642e-06, + "loss": 0.96396279, + "num_input_tokens_seen": 4939200, + "router_z_loss_clip": 4.35546875, + "router_z_loss_mlp": 0.58251953, + "step": 234, + "time_per_iteration": 2.5852134227752686 + }, + { + "auxiliary_loss_clip": 0.01774071, + "auxiliary_loss_mlp": 0.00575399, + "balance_loss_clip": 1.34351325, + "balance_loss_mlp": 0.51841748, + "epoch": 0.014128964376972794, + "flos": 18077396256000.0, + "grad_norm": 87.2419566677951, + "language_loss": 0.9858191, + "learning_rate": 3.515166054308634e-06, + "loss": 1.00931382, + "num_input_tokens_seen": 4956620, + "router_z_loss_clip": 4.30078125, + "router_z_loss_mlp": 0.56982422, + "step": 235, + "time_per_iteration": 2.6216847896575928 + }, + { + "auxiliary_loss_clip": 0.01761737, + "auxiliary_loss_mlp": 0.0058501, + "balance_loss_clip": 1.33448374, + "balance_loss_mlp": 0.52988744, + "epoch": 0.014189087629640764, + "flos": 25334331678720.0, + "grad_norm": 54.99610967094566, + "language_loss": 0.93348372, + "learning_rate": 3.5179000379644498e-06, + "loss": 0.95695126, + "num_input_tokens_seen": 4975650, + "router_z_loss_clip": 4.2734375, + "router_z_loss_mlp": 0.55102539, + "step": 236, + "time_per_iteration": 2.6516685485839844 + }, + { + "auxiliary_loss_clip": 0.01785741, + "auxiliary_loss_mlp": 0.00570218, + "balance_loss_clip": 1.34407997, + "balance_loss_mlp": 0.51530993, + "epoch": 0.014249210882308733, + "flos": 36139744713600.0, + "grad_norm": 373.833610998033, + "language_loss": 0.90477121, + "learning_rate": 3.520622461401154e-06, + "loss": 0.92833072, + "num_input_tokens_seen": 4997415, + "router_z_loss_clip": 4.421875, + "router_z_loss_mlp": 0.54882812, + "step": 237, + "time_per_iteration": 2.8423547744750977 + }, + { + "auxiliary_loss_clip": 0.01821179, + "auxiliary_loss_mlp": 0.0062562, + "balance_loss_clip": 1.358881, + "balance_loss_mlp": 0.561867, + "epoch": 0.014309334134976702, + "flos": 12932977461120.0, + "grad_norm": 850.7657494068205, + "language_loss": 0.83836877, + "learning_rate": 3.5233334219683935e-06, + "loss": 0.86283678, + "num_input_tokens_seen": 5013905, + "router_z_loss_clip": 4.6171875, + "router_z_loss_mlp": 0.63769531, + "step": 238, + "time_per_iteration": 2.6175143718719482 + }, + { + "auxiliary_loss_clip": 0.01860131, + "auxiliary_loss_mlp": 0.00578941, + "balance_loss_clip": 1.39353371, + "balance_loss_mlp": 0.52365196, + "epoch": 0.014369457387644672, + "flos": 20777519473920.0, + "grad_norm": 162.63140767673028, + "language_loss": 0.93906862, + "learning_rate": 3.526033015791284e-06, + "loss": 0.96345931, + "num_input_tokens_seen": 5033645, + "router_z_loss_clip": 4.66015625, + "router_z_loss_mlp": 0.55297852, + "step": 239, + "time_per_iteration": 2.6669375896453857 + }, + { + "auxiliary_loss_clip": 0.01874818, + "auxiliary_loss_mlp": 0.00527436, + "balance_loss_clip": 1.40589428, + "balance_loss_mlp": 0.47488856, + "epoch": 0.01442958064031264, + "flos": 25848536826240.0, + "grad_norm": 120.14794067570288, + "language_loss": 1.00595963, + "learning_rate": 3.528721337790862e-06, + "loss": 1.02998209, + "num_input_tokens_seen": 5052875, + "router_z_loss_clip": 4.68359375, + "router_z_loss_mlp": 0.52587891, + "step": 240, + "time_per_iteration": 2.7326700687408447 + }, + { + "auxiliary_loss_clip": 0.01920968, + "auxiliary_loss_mlp": 0.00618437, + "balance_loss_clip": 1.41369092, + "balance_loss_mlp": 0.56016791, + "epoch": 0.014489703892980611, + "flos": 28219718269440.0, + "grad_norm": 15.653623606455913, + "language_loss": 0.91664922, + "learning_rate": 3.531398481704111e-06, + "loss": 0.9420433, + "num_input_tokens_seen": 5075005, + "router_z_loss_clip": 5.0703125, + "router_z_loss_mlp": 0.58276367, + "step": 241, + "time_per_iteration": 2.798095464706421 + }, + { + "auxiliary_loss_clip": 0.01905582, + "auxiliary_loss_mlp": 0.00572836, + "balance_loss_clip": 1.41286266, + "balance_loss_mlp": 0.51611674, + "epoch": 0.01454982714564858, + "flos": 22490925108480.0, + "grad_norm": 22.887515893572623, + "language_loss": 0.95485985, + "learning_rate": 3.534064540103573e-06, + "loss": 0.979644, + "num_input_tokens_seen": 5091875, + "router_z_loss_clip": 4.92578125, + "router_z_loss_mlp": 0.56713867, + "step": 242, + "time_per_iteration": 2.724915027618408 + }, + { + "auxiliary_loss_clip": 0.01929853, + "auxiliary_loss_mlp": 0.00611535, + "balance_loss_clip": 1.41785908, + "balance_loss_mlp": 0.54782945, + "epoch": 0.014609950398316548, + "flos": 21653201139840.0, + "grad_norm": 47.95976028230824, + "language_loss": 0.94221067, + "learning_rate": 3.536719604416555e-06, + "loss": 0.96762455, + "num_input_tokens_seen": 5111290, + "router_z_loss_clip": 5.12109375, + "router_z_loss_mlp": 0.63720703, + "step": 243, + "time_per_iteration": 2.686410665512085 + }, + { + "auxiliary_loss_clip": 0.01948453, + "auxiliary_loss_mlp": 0.00593192, + "balance_loss_clip": 1.41908062, + "balance_loss_mlp": 0.53177518, + "epoch": 0.014670073650984519, + "flos": 21869993675520.0, + "grad_norm": 28.447695152734994, + "language_loss": 0.89414114, + "learning_rate": 3.5393637649439464e-06, + "loss": 0.91955757, + "num_input_tokens_seen": 5132265, + "router_z_loss_clip": 5.296875, + "router_z_loss_mlp": 0.61474609, + "step": 244, + "time_per_iteration": 2.7025434970855713 + }, + { + "auxiliary_loss_clip": 0.01958994, + "auxiliary_loss_mlp": 0.00653424, + "balance_loss_clip": 1.4127785, + "balance_loss_mlp": 0.58113539, + "epoch": 0.014730196903652487, + "flos": 23183713699200.0, + "grad_norm": 28.765803168613186, + "language_loss": 0.87027651, + "learning_rate": 3.54199711087864e-06, + "loss": 0.89640069, + "num_input_tokens_seen": 5148575, + "router_z_loss_clip": 5.46484375, + "router_z_loss_mlp": 0.72265625, + "step": 245, + "time_per_iteration": 2.640968084335327 + }, + { + "auxiliary_loss_clip": 0.01960035, + "auxiliary_loss_mlp": 0.0060384, + "balance_loss_clip": 1.41385865, + "balance_loss_mlp": 0.53779823, + "epoch": 0.014790320156320457, + "flos": 23222605150080.0, + "grad_norm": 40.73299354052646, + "language_loss": 0.89651424, + "learning_rate": 3.5446197303235913e-06, + "loss": 0.92215294, + "num_input_tokens_seen": 5170415, + "router_z_loss_clip": 5.46484375, + "router_z_loss_mlp": 0.66064453, + "step": 246, + "time_per_iteration": 2.9166998863220215 + }, + { + "auxiliary_loss_clip": 0.01948442, + "auxiliary_loss_mlp": 0.00586716, + "balance_loss_clip": 1.40752637, + "balance_loss_mlp": 0.52682549, + "epoch": 0.014850443408988426, + "flos": 15815490963840.0, + "grad_norm": 16.194323766638902, + "language_loss": 0.97404552, + "learning_rate": 3.5472317103095034e-06, + "loss": 0.9993971, + "num_input_tokens_seen": 5188565, + "router_z_loss_clip": 5.40234375, + "router_z_loss_mlp": 0.59912109, + "step": 247, + "time_per_iteration": 2.6574418544769287 + }, + { + "auxiliary_loss_clip": 0.01997519, + "auxiliary_loss_mlp": 0.00591367, + "balance_loss_clip": 1.41272092, + "balance_loss_mlp": 0.53080893, + "epoch": 0.014910566661656396, + "flos": 22781657790720.0, + "grad_norm": 316.46072931021536, + "language_loss": 0.84265119, + "learning_rate": 3.549833136812155e-06, + "loss": 0.86854005, + "num_input_tokens_seen": 5207810, + "router_z_loss_clip": 5.84375, + "router_z_loss_mlp": 0.60498047, + "step": 248, + "time_per_iteration": 2.6765387058258057 + }, + { + "auxiliary_loss_clip": 0.01961814, + "auxiliary_loss_mlp": 0.00586562, + "balance_loss_clip": 1.40723526, + "balance_loss_mlp": 0.52123535, + "epoch": 0.014970689914324365, + "flos": 26865023806080.0, + "grad_norm": 16.30719544402728, + "language_loss": 0.89288592, + "learning_rate": 3.552424094769381e-06, + "loss": 0.91836965, + "num_input_tokens_seen": 5226210, + "router_z_loss_clip": 5.55078125, + "router_z_loss_mlp": 0.65283203, + "step": 249, + "time_per_iteration": 2.6739673614501953 + }, + { + "auxiliary_loss_clip": 0.02000153, + "auxiliary_loss_mlp": 0.00554754, + "balance_loss_clip": 1.43031454, + "balance_loss_mlp": 0.49414763, + "epoch": 0.015030813166992334, + "flos": 13985662371840.0, + "grad_norm": 5.908390935835151, + "language_loss": 1.01359701, + "learning_rate": 3.5550046680977174e-06, + "loss": 1.03914607, + "num_input_tokens_seen": 5241660, + "router_z_loss_clip": 5.6953125, + "router_z_loss_mlp": 0.60693359, + "step": 250, + "time_per_iteration": 2.5972321033477783 + }, + { + "auxiliary_loss_clip": 0.02018793, + "auxiliary_loss_mlp": 0.00592404, + "balance_loss_clip": 1.43168259, + "balance_loss_mlp": 0.52645773, + "epoch": 0.015090936419660304, + "flos": 24717817618560.0, + "grad_norm": 123.37580980487063, + "language_loss": 1.0430038, + "learning_rate": 3.5575749397087034e-06, + "loss": 1.06911576, + "num_input_tokens_seen": 5261090, + "router_z_loss_clip": 5.87109375, + "router_z_loss_mlp": 0.65966797, + "step": 251, + "time_per_iteration": 2.7801220417022705 + }, + { + "auxiliary_loss_clip": 0.02040659, + "auxiliary_loss_mlp": 0.005622, + "balance_loss_clip": 1.44415021, + "balance_loss_mlp": 0.50354862, + "epoch": 0.015151059672328273, + "flos": 25738793798400.0, + "grad_norm": 99.75393012587074, + "language_loss": 0.91932654, + "learning_rate": 3.5601349915248707e-06, + "loss": 0.94535518, + "num_input_tokens_seen": 5279175, + "router_z_loss_clip": 5.95703125, + "router_z_loss_mlp": 0.58642578, + "step": 252, + "time_per_iteration": 2.699920415878296 + }, + { + "auxiliary_loss_clip": 0.02076542, + "auxiliary_loss_mlp": 0.00589782, + "balance_loss_clip": 1.46503699, + "balance_loss_mlp": 0.5281744, + "epoch": 0.015211182924996243, + "flos": 21871214737920.0, + "grad_norm": 96.61481723701019, + "language_loss": 1.07489049, + "learning_rate": 3.5626849044954064e-06, + "loss": 1.10155368, + "num_input_tokens_seen": 5296975, + "router_z_loss_clip": 6.1171875, + "router_z_loss_mlp": 0.61572266, + "step": 253, + "time_per_iteration": 2.674912929534912 + }, + { + "auxiliary_loss_clip": 0.02030678, + "auxiliary_loss_mlp": 0.0035376, + "balance_loss_clip": 1.60078955, + "balance_loss_mlp": 0.32591292, + "epoch": 0.015271306177664212, + "flos": 66895080888960.0, + "grad_norm": 0.8480610043062556, + "language_loss": 0.55287009, + "learning_rate": 3.5652247586115167e-06, + "loss": 0.57671446, + "num_input_tokens_seen": 5358375, + "router_z_loss_clip": 4.3125, + "router_z_loss_mlp": 0.27929688, + "step": 254, + "time_per_iteration": 3.1742122173309326 + }, + { + "auxiliary_loss_clip": 0.02122121, + "auxiliary_loss_mlp": 0.00585752, + "balance_loss_clip": 1.47170329, + "balance_loss_mlp": 0.52340591, + "epoch": 0.01533142943033218, + "flos": 26834069260800.0, + "grad_norm": 36.01824518401435, + "language_loss": 0.98110729, + "learning_rate": 3.567754632921479e-06, + "loss": 1.0081861, + "num_input_tokens_seen": 5377255, + "router_z_loss_clip": 6.49609375, + "router_z_loss_mlp": 0.62329102, + "step": 255, + "time_per_iteration": 2.7435643672943115 + }, + { + "auxiliary_loss_clip": 0.02129079, + "auxiliary_loss_mlp": 0.00573858, + "balance_loss_clip": 1.47937739, + "balance_loss_mlp": 0.50757784, + "epoch": 0.01539155268300015, + "flos": 20813753318400.0, + "grad_norm": 17.751031441267152, + "language_loss": 0.92396587, + "learning_rate": 3.5702746055454075e-06, + "loss": 0.95099521, + "num_input_tokens_seen": 5395320, + "router_z_loss_clip": 6.5, + "router_z_loss_mlp": 0.66357422, + "step": 256, + "time_per_iteration": 2.686060667037964 + }, + { + "auxiliary_loss_clip": 0.02100683, + "auxiliary_loss_mlp": 0.00586784, + "balance_loss_clip": 1.46657598, + "balance_loss_mlp": 0.51964569, + "epoch": 0.01545167593566812, + "flos": 15961862885760.0, + "grad_norm": 62.954281188432596, + "language_loss": 0.81132758, + "learning_rate": 3.5727847536897254e-06, + "loss": 0.83820218, + "num_input_tokens_seen": 5411970, + "router_z_loss_clip": 6.34765625, + "router_z_loss_mlp": 0.67089844, + "step": 257, + "time_per_iteration": 2.7117342948913574 + }, + { + "auxiliary_loss_clip": 0.02122183, + "auxiliary_loss_mlp": 0.00554315, + "balance_loss_clip": 1.48824203, + "balance_loss_mlp": 0.49435231, + "epoch": 0.01551179918833609, + "flos": 22601745544320.0, + "grad_norm": 16.64617576633985, + "language_loss": 1.01906121, + "learning_rate": 3.5752851536613596e-06, + "loss": 1.0458262, + "num_input_tokens_seen": 5430245, + "router_z_loss_clip": 6.34375, + "router_z_loss_mlp": 0.59936523, + "step": 258, + "time_per_iteration": 2.739394426345825 + }, + { + "auxiliary_loss_clip": 0.02117414, + "auxiliary_loss_mlp": 0.00598855, + "balance_loss_clip": 1.48084521, + "balance_loss_mlp": 0.53810585, + "epoch": 0.015571922441004058, + "flos": 22816706486400.0, + "grad_norm": 50.297320485888726, + "language_loss": 0.98817581, + "learning_rate": 3.577775880881658e-06, + "loss": 1.01533842, + "num_input_tokens_seen": 5448905, + "router_z_loss_clip": 6.36328125, + "router_z_loss_mlp": 0.60693359, + "step": 259, + "time_per_iteration": 2.6861932277679443 + }, + { + "auxiliary_loss_clip": 0.02116685, + "auxiliary_loss_mlp": 0.005311, + "balance_loss_clip": 1.50014329, + "balance_loss_mlp": 0.47435668, + "epoch": 0.015632045693672027, + "flos": 18947439486720.0, + "grad_norm": 22.85837295159286, + "language_loss": 1.02622962, + "learning_rate": 3.5802570099000424e-06, + "loss": 1.05270743, + "num_input_tokens_seen": 5466405, + "router_z_loss_clip": 6.16015625, + "router_z_loss_mlp": 0.56689453, + "step": 260, + "time_per_iteration": 2.66146183013916 + }, + { + "auxiliary_loss_clip": 0.02144236, + "auxiliary_loss_mlp": 0.00540054, + "balance_loss_clip": 1.4931252, + "balance_loss_mlp": 0.48025876, + "epoch": 0.015692168946339995, + "flos": 29971728046080.0, + "grad_norm": 21.61432442038631, + "language_loss": 0.96622723, + "learning_rate": 3.5827286144073947e-06, + "loss": 0.99307007, + "num_input_tokens_seen": 5487055, + "router_z_loss_clip": 6.515625, + "router_z_loss_mlp": 0.59814453, + "step": 261, + "time_per_iteration": 2.7040562629699707 + }, + { + "auxiliary_loss_clip": 0.02172117, + "auxiliary_loss_mlp": 0.00569892, + "balance_loss_clip": 1.50275397, + "balance_loss_mlp": 0.50566173, + "epoch": 0.015752292199007967, + "flos": 19392085946880.0, + "grad_norm": 113.0554563532551, + "language_loss": 0.72484016, + "learning_rate": 3.5851907672491904e-06, + "loss": 0.75226027, + "num_input_tokens_seen": 5506600, + "router_z_loss_clip": 6.6953125, + "router_z_loss_mlp": 0.64208984, + "step": 262, + "time_per_iteration": 2.6863486766815186 + }, + { + "auxiliary_loss_clip": 0.0218678, + "auxiliary_loss_mlp": 0.00555706, + "balance_loss_clip": 1.5097369, + "balance_loss_mlp": 0.49047509, + "epoch": 0.015812415451675936, + "flos": 20339804338560.0, + "grad_norm": 72.52714285890461, + "language_loss": 0.77276534, + "learning_rate": 3.587643540438383e-06, + "loss": 0.80019021, + "num_input_tokens_seen": 5524350, + "router_z_loss_clip": 6.77734375, + "router_z_loss_mlp": 0.65283203, + "step": 263, + "time_per_iteration": 2.6862237453460693 + }, + { + "auxiliary_loss_clip": 0.02244302, + "auxiliary_loss_mlp": 0.00581325, + "balance_loss_clip": 1.53684866, + "balance_loss_mlp": 0.51881182, + "epoch": 0.015872538704343905, + "flos": 17525412979200.0, + "grad_norm": 155.59638997344967, + "language_loss": 0.93721497, + "learning_rate": 3.590087005168037e-06, + "loss": 0.96547121, + "num_input_tokens_seen": 5542145, + "router_z_loss_clip": 7.078125, + "router_z_loss_mlp": 0.62451172, + "step": 264, + "time_per_iteration": 2.692227363586426 + }, + { + "auxiliary_loss_clip": 0.02229739, + "auxiliary_loss_mlp": 0.00553546, + "balance_loss_clip": 1.53102446, + "balance_loss_mlp": 0.49174747, + "epoch": 0.015932661957011873, + "flos": 15260490944640.0, + "grad_norm": 89.71012884301727, + "language_loss": 1.09384394, + "learning_rate": 3.5925212318237344e-06, + "loss": 1.12167668, + "num_input_tokens_seen": 5557920, + "router_z_loss_clip": 6.98828125, + "router_z_loss_mlp": 0.61816406, + "step": 265, + "time_per_iteration": 2.6428778171539307 + }, + { + "auxiliary_loss_clip": 0.02256601, + "auxiliary_loss_mlp": 0.00606123, + "balance_loss_clip": 1.54505742, + "balance_loss_mlp": 0.53884155, + "epoch": 0.015992785209679845, + "flos": 20302528999680.0, + "grad_norm": 202.7442210169537, + "language_loss": 0.82801276, + "learning_rate": 3.5949462899957323e-06, + "loss": 0.85663998, + "num_input_tokens_seen": 5576290, + "router_z_loss_clip": 7.1171875, + "router_z_loss_mlp": 0.67333984, + "step": 266, + "time_per_iteration": 2.7283992767333984 + }, + { + "auxiliary_loss_clip": 0.02246724, + "auxiliary_loss_mlp": 0.00576986, + "balance_loss_clip": 1.54379869, + "balance_loss_mlp": 0.51060987, + "epoch": 0.016052908462347814, + "flos": 23362368969600.0, + "grad_norm": 219.70753829087977, + "language_loss": 0.95850682, + "learning_rate": 3.5973622484909068e-06, + "loss": 0.98674393, + "num_input_tokens_seen": 5595205, + "router_z_loss_clip": 7.02734375, + "router_z_loss_mlp": 0.66357422, + "step": 267, + "time_per_iteration": 2.6664535999298096 + }, + { + "auxiliary_loss_clip": 0.0225273, + "auxiliary_loss_mlp": 0.00598825, + "balance_loss_clip": 1.53619194, + "balance_loss_mlp": 0.53340256, + "epoch": 0.016113031715015783, + "flos": 21286588976640.0, + "grad_norm": 49.50453103677667, + "language_loss": 0.93735957, + "learning_rate": 3.599769175344462e-06, + "loss": 0.96587509, + "num_input_tokens_seen": 5612645, + "router_z_loss_clip": 7.1640625, + "router_z_loss_mlp": 0.65332031, + "step": 268, + "time_per_iteration": 2.726308584213257 + }, + { + "auxiliary_loss_clip": 0.0226132, + "auxiliary_loss_mlp": 0.00590741, + "balance_loss_clip": 1.55828714, + "balance_loss_mlp": 0.52460372, + "epoch": 0.01617315496768375, + "flos": 18914689261440.0, + "grad_norm": 54.60269103378473, + "language_loss": 0.94503599, + "learning_rate": 3.602167137831432e-06, + "loss": 0.97355664, + "num_input_tokens_seen": 5628345, + "router_z_loss_clip": 7.03515625, + "router_z_loss_mlp": 0.66162109, + "step": 269, + "time_per_iteration": 2.632283926010132 + }, + { + "auxiliary_loss_clip": 0.02273287, + "auxiliary_loss_mlp": 0.00629812, + "balance_loss_clip": 1.54582047, + "balance_loss_mlp": 0.55494857, + "epoch": 0.01623327822035172, + "flos": 16546488647040.0, + "grad_norm": 14.245814619000443, + "language_loss": 1.02824736, + "learning_rate": 3.6045562024779565e-06, + "loss": 1.05727839, + "num_input_tokens_seen": 5645940, + "router_z_loss_clip": 7.26953125, + "router_z_loss_mlp": 0.74804688, + "step": 270, + "time_per_iteration": 2.8481240272521973 + }, + { + "auxiliary_loss_clip": 0.02260972, + "auxiliary_loss_mlp": 0.0056089, + "balance_loss_clip": 1.5500288, + "balance_loss_mlp": 0.5012852, + "epoch": 0.016293401473019692, + "flos": 23513481486720.0, + "grad_norm": 13.429992469819332, + "language_loss": 0.94599068, + "learning_rate": 3.606936435072361e-06, + "loss": 0.97420919, + "num_input_tokens_seen": 5665690, + "router_z_loss_clip": 7.109375, + "router_z_loss_mlp": 0.59619141, + "step": 271, + "time_per_iteration": 2.796449661254883 + }, + { + "auxiliary_loss_clip": 0.02281675, + "auxiliary_loss_mlp": 0.00547963, + "balance_loss_clip": 1.55234146, + "balance_loss_mlp": 0.48440036, + "epoch": 0.01635352472568766, + "flos": 29016072748800.0, + "grad_norm": 56.236567732993066, + "language_loss": 0.88716447, + "learning_rate": 3.609307900676025e-06, + "loss": 0.91546088, + "num_input_tokens_seen": 5683190, + "router_z_loss_clip": 7.296875, + "router_z_loss_mlp": 0.63500977, + "step": 272, + "time_per_iteration": 5.5638954639434814 + }, + { + "auxiliary_loss_clip": 0.02276875, + "auxiliary_loss_mlp": 0.00587464, + "balance_loss_clip": 1.55196035, + "balance_loss_mlp": 0.52542782, + "epoch": 0.01641364797835563, + "flos": 13370513028480.0, + "grad_norm": 340.5576221885258, + "language_loss": 0.89525419, + "learning_rate": 3.611670663634051e-06, + "loss": 0.92389762, + "num_input_tokens_seen": 5699780, + "router_z_loss_clip": 7.25, + "router_z_loss_mlp": 0.62060547, + "step": 273, + "time_per_iteration": 2.626847505569458 + }, + { + "auxiliary_loss_clip": 0.02281522, + "auxiliary_loss_mlp": 0.00637369, + "balance_loss_clip": 1.54046559, + "balance_loss_mlp": 0.56565297, + "epoch": 0.016473771231023598, + "flos": 18878239935360.0, + "grad_norm": 81.95737835237048, + "language_loss": 0.99150485, + "learning_rate": 3.614024787585744e-06, + "loss": 1.02069378, + "num_input_tokens_seen": 5716980, + "router_z_loss_clip": 7.41015625, + "router_z_loss_mlp": 0.71777344, + "step": 274, + "time_per_iteration": 2.6829895973205566 + }, + { + "auxiliary_loss_clip": 0.02232075, + "auxiliary_loss_mlp": 0.00553647, + "balance_loss_clip": 1.53371572, + "balance_loss_mlp": 0.49170592, + "epoch": 0.016533894483691566, + "flos": 22601637803520.0, + "grad_norm": 57.6556460487762, + "language_loss": 0.95856875, + "learning_rate": 3.6163703354748927e-06, + "loss": 0.986426, + "num_input_tokens_seen": 5737780, + "router_z_loss_clip": 6.984375, + "router_z_loss_mlp": 0.62011719, + "step": 275, + "time_per_iteration": 2.649658441543579 + }, + { + "auxiliary_loss_clip": 0.02256956, + "auxiliary_loss_mlp": 0.00544893, + "balance_loss_clip": 1.54077637, + "balance_loss_mlp": 0.48285621, + "epoch": 0.01659401773635954, + "flos": 21507188353920.0, + "grad_norm": 74.78714364879785, + "language_loss": 0.86479264, + "learning_rate": 3.6187073695598707e-06, + "loss": 0.89281106, + "num_input_tokens_seen": 5758330, + "router_z_loss_clip": 7.1640625, + "router_z_loss_mlp": 0.62060547, + "step": 276, + "time_per_iteration": 2.7019007205963135 + }, + { + "auxiliary_loss_clip": 0.02277047, + "auxiliary_loss_mlp": 0.00576845, + "balance_loss_clip": 1.55702615, + "balance_loss_mlp": 0.51528513, + "epoch": 0.016654140989027507, + "flos": 32850973411200.0, + "grad_norm": 28.492665225728416, + "language_loss": 0.86281979, + "learning_rate": 3.621035951423551e-06, + "loss": 0.89135867, + "num_input_tokens_seen": 5778340, + "router_z_loss_clip": 7.20703125, + "router_z_loss_mlp": 0.61572266, + "step": 277, + "time_per_iteration": 2.753699779510498 + }, + { + "auxiliary_loss_clip": 0.02231439, + "auxiliary_loss_mlp": 0.00571895, + "balance_loss_clip": 1.53533983, + "balance_loss_mlp": 0.50814199, + "epoch": 0.016714264241695476, + "flos": 12306228024960.0, + "grad_norm": 65.16904526154283, + "language_loss": 0.86543435, + "learning_rate": 3.623356141983041e-06, + "loss": 0.89346766, + "num_input_tokens_seen": 5794295, + "router_z_loss_clip": 6.95703125, + "router_z_loss_mlp": 0.63769531, + "step": 278, + "time_per_iteration": 2.6593353748321533 + }, + { + "auxiliary_loss_clip": 0.02257508, + "auxiliary_loss_mlp": 0.00587678, + "balance_loss_clip": 1.54377651, + "balance_loss_mlp": 0.52521193, + "epoch": 0.016774387494363444, + "flos": 27123796362240.0, + "grad_norm": 2554.167843947644, + "language_loss": 0.97359329, + "learning_rate": 3.6256680014992486e-06, + "loss": 1.00204515, + "num_input_tokens_seen": 5814405, + "router_z_loss_clip": 7.13671875, + "router_z_loss_mlp": 0.625, + "step": 279, + "time_per_iteration": 2.6829869747161865 + }, + { + "auxiliary_loss_clip": 0.02251308, + "auxiliary_loss_mlp": 0.00569188, + "balance_loss_clip": 1.53438139, + "balance_loss_mlp": 0.50624561, + "epoch": 0.016834510747031413, + "flos": 20191493082240.0, + "grad_norm": 79.23945148130363, + "language_loss": 1.01273894, + "learning_rate": 3.6279715895862713e-06, + "loss": 1.04094398, + "num_input_tokens_seen": 5832795, + "router_z_loss_clip": 7.16015625, + "router_z_loss_mlp": 0.62939453, + "step": 280, + "time_per_iteration": 2.686588764190674 + }, + { + "auxiliary_loss_clip": 0.0221417, + "auxiliary_loss_mlp": 0.00587778, + "balance_loss_clip": 1.51844347, + "balance_loss_mlp": 0.52464527, + "epoch": 0.016894633999699385, + "flos": 27274262434560.0, + "grad_norm": 18.152716349919157, + "language_loss": 0.80668306, + "learning_rate": 3.6302669652206183e-06, + "loss": 0.83470255, + "num_input_tokens_seen": 5855750, + "router_z_loss_clip": 6.953125, + "router_z_loss_mlp": 0.63085938, + "step": 281, + "time_per_iteration": 2.7612662315368652 + }, + { + "auxiliary_loss_clip": 0.02253296, + "auxiliary_loss_mlp": 0.00595143, + "balance_loss_clip": 1.53594601, + "balance_loss_mlp": 0.53091317, + "epoch": 0.016954757252367354, + "flos": 14902964922240.0, + "grad_norm": 369.06384780400793, + "language_loss": 0.89860511, + "learning_rate": 3.632554186750274e-06, + "loss": 0.92708945, + "num_input_tokens_seen": 5872610, + "router_z_loss_clip": 7.17578125, + "router_z_loss_mlp": 0.64257812, + "step": 282, + "time_per_iteration": 2.6820385456085205 + }, + { + "auxiliary_loss_clip": 0.02247081, + "auxiliary_loss_mlp": 0.00600348, + "balance_loss_clip": 1.5313133, + "balance_loss_mlp": 0.53568876, + "epoch": 0.017014880505035322, + "flos": 21358805270400.0, + "grad_norm": 30.047066493164632, + "language_loss": 0.84262067, + "learning_rate": 3.6348333119035937e-06, + "loss": 0.87109494, + "num_input_tokens_seen": 5892985, + "router_z_loss_clip": 7.1484375, + "router_z_loss_mlp": 0.64648438, + "step": 283, + "time_per_iteration": 2.6617631912231445 + }, + { + "auxiliary_loss_clip": 0.02220001, + "auxiliary_loss_mlp": 0.00559302, + "balance_loss_clip": 1.53008354, + "balance_loss_mlp": 0.50050843, + "epoch": 0.01707500375770329, + "flos": 35333154858240.0, + "grad_norm": 75.78034843239011, + "language_loss": 0.9115634, + "learning_rate": 3.6371043977980503e-06, + "loss": 0.93935645, + "num_input_tokens_seen": 5914060, + "router_z_loss_clip": 6.90234375, + "router_z_loss_mlp": 0.58862305, + "step": 284, + "time_per_iteration": 2.793515920639038 + }, + { + "auxiliary_loss_clip": 0.02179936, + "auxiliary_loss_mlp": 0.0055666, + "balance_loss_clip": 1.51460993, + "balance_loss_mlp": 0.49562502, + "epoch": 0.01713512701037126, + "flos": 23582070506880.0, + "grad_norm": 187.72072771350332, + "language_loss": 1.0542345, + "learning_rate": 3.639367500948819e-06, + "loss": 1.08160043, + "num_input_tokens_seen": 5932860, + "router_z_loss_clip": 6.65625, + "router_z_loss_mlp": 0.60986328, + "step": 285, + "time_per_iteration": 2.8574185371398926 + }, + { + "auxiliary_loss_clip": 0.02241883, + "auxiliary_loss_mlp": 0.00620696, + "balance_loss_clip": 1.53609371, + "balance_loss_mlp": 0.55331892, + "epoch": 0.01719525026303923, + "flos": 27634661544960.0, + "grad_norm": 102.56481080203969, + "language_loss": 0.99409777, + "learning_rate": 3.6416226772772178e-06, + "loss": 1.02272356, + "num_input_tokens_seen": 5952725, + "router_z_loss_clip": 7.0625, + "router_z_loss_mlp": 0.67431641, + "step": 286, + "time_per_iteration": 2.743102788925171 + }, + { + "auxiliary_loss_clip": 0.02215812, + "auxiliary_loss_mlp": 0.00563183, + "balance_loss_clip": 1.52384841, + "balance_loss_mlp": 0.49954891, + "epoch": 0.0172553735157072, + "flos": 26979722910720.0, + "grad_norm": 112.71859709995559, + "language_loss": 0.9883315, + "learning_rate": 3.643869982119001e-06, + "loss": 1.01612139, + "num_input_tokens_seen": 5970560, + "router_z_loss_clip": 6.91796875, + "router_z_loss_mlp": 0.63598633, + "step": 287, + "time_per_iteration": 2.734180450439453 + }, + { + "auxiliary_loss_clip": 0.02237627, + "auxiliary_loss_mlp": 0.00587276, + "balance_loss_clip": 1.52910841, + "balance_loss_mlp": 0.52161545, + "epoch": 0.01731549676837517, + "flos": 14056621689600.0, + "grad_norm": 499.55486428713584, + "language_loss": 1.09135747, + "learning_rate": 3.646109470232502e-06, + "loss": 1.11960649, + "num_input_tokens_seen": 5982980, + "router_z_loss_clip": 7.0859375, + "router_z_loss_mlp": 0.65649414, + "step": 288, + "time_per_iteration": 2.6088435649871826 + }, + { + "auxiliary_loss_clip": 0.01981643, + "auxiliary_loss_mlp": 0.00282682, + "balance_loss_clip": 1.57284904, + "balance_loss_mlp": 0.26074716, + "epoch": 0.017375620021043137, + "flos": 66510694471680.0, + "grad_norm": 1.3655305970700842, + "language_loss": 0.63465434, + "learning_rate": 3.6483411958066417e-06, + "loss": 0.65729761, + "num_input_tokens_seen": 6049445, + "router_z_loss_clip": 4.0625, + "router_z_loss_mlp": 0.21972656, + "step": 289, + "time_per_iteration": 3.3067214488983154 + }, + { + "auxiliary_loss_clip": 0.02184603, + "auxiliary_loss_mlp": 0.00590008, + "balance_loss_clip": 1.51327443, + "balance_loss_mlp": 0.52623117, + "epoch": 0.01743574327371111, + "flos": 15225154940160.0, + "grad_norm": 119.13430869550395, + "language_loss": 0.95603848, + "learning_rate": 3.6505652124687957e-06, + "loss": 0.98378462, + "num_input_tokens_seen": 6064150, + "router_z_loss_clip": 6.7109375, + "router_z_loss_mlp": 0.63745117, + "step": 290, + "time_per_iteration": 2.6351704597473145 + }, + { + "auxiliary_loss_clip": 0.02219156, + "auxiliary_loss_mlp": 0.00565994, + "balance_loss_clip": 1.53261089, + "balance_loss_mlp": 0.50245506, + "epoch": 0.017495866526379078, + "flos": 25373869574400.0, + "grad_norm": 252.48313055341464, + "language_loss": 0.92350888, + "learning_rate": 3.6527815732925258e-06, + "loss": 0.9513604, + "num_input_tokens_seen": 6083920, + "router_z_loss_clip": 6.859375, + "router_z_loss_mlp": 0.63574219, + "step": 291, + "time_per_iteration": 2.7097580432891846 + }, + { + "auxiliary_loss_clip": 0.02211272, + "auxiliary_loss_mlp": 0.00578071, + "balance_loss_clip": 1.53518248, + "balance_loss_mlp": 0.51331609, + "epoch": 0.017555989779047047, + "flos": 26359473836160.0, + "grad_norm": 3.557829154290891, + "language_loss": 0.77330887, + "learning_rate": 3.6549903308051806e-06, + "loss": 0.80120236, + "num_input_tokens_seen": 6105460, + "router_z_loss_clip": 6.7578125, + "router_z_loss_mlp": 0.64746094, + "step": 292, + "time_per_iteration": 2.695697784423828 + }, + { + "auxiliary_loss_clip": 0.02187446, + "auxiliary_loss_mlp": 0.00577156, + "balance_loss_clip": 1.52475905, + "balance_loss_mlp": 0.51268774, + "epoch": 0.017616113031715015, + "flos": 22338807010560.0, + "grad_norm": 297.1329070283204, + "language_loss": 0.94681865, + "learning_rate": 3.6571915369953646e-06, + "loss": 0.97446471, + "num_input_tokens_seen": 6122890, + "router_z_loss_clip": 6.62109375, + "router_z_loss_mlp": 0.64428711, + "step": 293, + "time_per_iteration": 2.8104803562164307 + }, + { + "auxiliary_loss_clip": 0.0217286, + "auxiliary_loss_mlp": 0.00584504, + "balance_loss_clip": 1.51650751, + "balance_loss_mlp": 0.52401686, + "epoch": 0.017676236284382984, + "flos": 20156911263360.0, + "grad_norm": 66.88611021201943, + "language_loss": 0.87539965, + "learning_rate": 3.6593852433202797e-06, + "loss": 0.90297329, + "num_input_tokens_seen": 6142890, + "router_z_loss_clip": 6.5625, + "router_z_loss_mlp": 0.60473633, + "step": 294, + "time_per_iteration": 2.6958553791046143 + }, + { + "auxiliary_loss_clip": 0.0214242, + "auxiliary_loss_mlp": 0.00605323, + "balance_loss_clip": 1.49480438, + "balance_loss_mlp": 0.54152197, + "epoch": 0.017736359537050956, + "flos": 25223331674880.0, + "grad_norm": 7.040968336726166, + "language_loss": 0.90252101, + "learning_rate": 3.6615715007129453e-06, + "loss": 0.9299984, + "num_input_tokens_seen": 6162030, + "router_z_loss_clip": 6.47265625, + "router_z_loss_mlp": 0.63818359, + "step": 295, + "time_per_iteration": 2.6923816204071045 + }, + { + "auxiliary_loss_clip": 0.02124755, + "auxiliary_loss_mlp": 0.00598808, + "balance_loss_clip": 1.50211668, + "balance_loss_mlp": 0.53600812, + "epoch": 0.017796482789718925, + "flos": 20338798757760.0, + "grad_norm": 18.68642373603933, + "language_loss": 0.91229928, + "learning_rate": 3.6637503595892897e-06, + "loss": 0.9395349, + "num_input_tokens_seen": 6180540, + "router_z_loss_clip": 6.2265625, + "router_z_loss_mlp": 0.62841797, + "step": 296, + "time_per_iteration": 2.672752618789673 + }, + { + "auxiliary_loss_clip": 0.02119006, + "auxiliary_loss_mlp": 0.00562705, + "balance_loss_clip": 1.49071825, + "balance_loss_mlp": 0.5029096, + "epoch": 0.017856606042386893, + "flos": 22379206832640.0, + "grad_norm": 14.923547052602913, + "language_loss": 0.93438566, + "learning_rate": 3.665921869855132e-06, + "loss": 0.9612028, + "num_input_tokens_seen": 6199425, + "router_z_loss_clip": 6.28125, + "router_z_loss_mlp": 0.59863281, + "step": 297, + "time_per_iteration": 2.656914710998535 + }, + { + "auxiliary_loss_clip": 0.02108881, + "auxiliary_loss_mlp": 0.00570442, + "balance_loss_clip": 1.49264336, + "balance_loss_mlp": 0.50797617, + "epoch": 0.017916729295054862, + "flos": 20230061310720.0, + "grad_norm": 20.269381645749043, + "language_loss": 0.95909017, + "learning_rate": 3.6680860809130346e-06, + "loss": 0.98588336, + "num_input_tokens_seen": 6219170, + "router_z_loss_clip": 6.16015625, + "router_z_loss_mlp": 0.62426758, + "step": 298, + "time_per_iteration": 2.6483123302459717 + }, + { + "auxiliary_loss_clip": 0.02037936, + "auxiliary_loss_mlp": 0.00597463, + "balance_loss_clip": 1.46666598, + "balance_loss_mlp": 0.53523582, + "epoch": 0.01797685254772283, + "flos": 19390972625280.0, + "grad_norm": 792.2793042870924, + "language_loss": 0.94363475, + "learning_rate": 3.6702430416690516e-06, + "loss": 0.9699887, + "num_input_tokens_seen": 6237930, + "router_z_loss_clip": 5.71875, + "router_z_loss_mlp": 0.62255859, + "step": 299, + "time_per_iteration": 2.6219515800476074 + }, + { + "auxiliary_loss_clip": 0.02056771, + "auxiliary_loss_mlp": 0.00589462, + "balance_loss_clip": 1.4606595, + "balance_loss_mlp": 0.5253278, + "epoch": 0.018036975800390802, + "flos": 24426007528320.0, + "grad_norm": 13.351104301649913, + "language_loss": 0.73285842, + "learning_rate": 3.672392800539357e-06, + "loss": 0.7593208, + "num_input_tokens_seen": 6257170, + "router_z_loss_clip": 5.9609375, + "router_z_loss_mlp": 0.64111328, + "step": 300, + "time_per_iteration": 2.7350857257843018 + }, + { + "auxiliary_loss_clip": 0.02068434, + "auxiliary_loss_mlp": 0.0058627, + "balance_loss_clip": 1.48357356, + "balance_loss_mlp": 0.52459145, + "epoch": 0.01809709905305877, + "flos": 15778933896960.0, + "grad_norm": 17.40931694446122, + "language_loss": 0.96841168, + "learning_rate": 3.6745354054567686e-06, + "loss": 0.99495864, + "num_input_tokens_seen": 6274780, + "router_z_loss_clip": 5.85546875, + "router_z_loss_mlp": 0.61645508, + "step": 301, + "time_per_iteration": 2.616377830505371 + }, + { + "auxiliary_loss_clip": 0.01875618, + "auxiliary_loss_mlp": 0.00367908, + "balance_loss_clip": 1.50937164, + "balance_loss_mlp": 0.34683144, + "epoch": 0.01815722230572674, + "flos": 67348382526720.0, + "grad_norm": 2.52405345772591, + "language_loss": 0.61594856, + "learning_rate": 3.676670903877158e-06, + "loss": 0.63838387, + "num_input_tokens_seen": 6340435, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 0.2109375, + "step": 302, + "time_per_iteration": 3.3501651287078857 + }, + { + "auxiliary_loss_clip": 0.01991982, + "auxiliary_loss_mlp": 0.00620386, + "balance_loss_clip": 1.44394743, + "balance_loss_mlp": 0.55482101, + "epoch": 0.01821734555839471, + "flos": 15485615435520.0, + "grad_norm": 67.5894417984717, + "language_loss": 0.9751035, + "learning_rate": 3.6787993427857567e-06, + "loss": 1.00122726, + "num_input_tokens_seen": 6358160, + "router_z_loss_clip": 5.484375, + "router_z_loss_mlp": 0.65478516, + "step": 303, + "time_per_iteration": 2.6427557468414307 + }, + { + "auxiliary_loss_clip": 0.02027375, + "auxiliary_loss_mlp": 0.00618955, + "balance_loss_clip": 1.46076298, + "balance_loss_mlp": 0.55162561, + "epoch": 0.018277468811062677, + "flos": 24097424889600.0, + "grad_norm": 8.740594504733139, + "language_loss": 0.87040031, + "learning_rate": 3.680920768703364e-06, + "loss": 0.89686358, + "num_input_tokens_seen": 6378485, + "router_z_loss_clip": 5.66796875, + "router_z_loss_mlp": 0.67285156, + "step": 304, + "time_per_iteration": 2.670032024383545 + }, + { + "auxiliary_loss_clip": 0.01957326, + "auxiliary_loss_mlp": 0.00578348, + "balance_loss_clip": 1.44150114, + "balance_loss_mlp": 0.51678872, + "epoch": 0.01833759206373065, + "flos": 20959335141120.0, + "grad_norm": 186.29371339630384, + "language_loss": 0.87482071, + "learning_rate": 3.6830352276924415e-06, + "loss": 0.90017748, + "num_input_tokens_seen": 6397845, + "router_z_loss_clip": 5.16015625, + "router_z_loss_mlp": 0.61523438, + "step": 305, + "time_per_iteration": 2.8225855827331543 + }, + { + "auxiliary_loss_clip": 0.01950558, + "auxiliary_loss_mlp": 0.00587778, + "balance_loss_clip": 1.41957462, + "balance_loss_mlp": 0.52521718, + "epoch": 0.018397715316398618, + "flos": 19390757143680.0, + "grad_norm": 64.70021577125354, + "language_loss": 0.96552229, + "learning_rate": 3.685142765363119e-06, + "loss": 0.9909057, + "num_input_tokens_seen": 6416475, + "router_z_loss_clip": 5.31640625, + "router_z_loss_mlp": 0.62548828, + "step": 306, + "time_per_iteration": 2.643157482147217 + }, + { + "auxiliary_loss_clip": 0.01952758, + "auxiliary_loss_mlp": 0.00626587, + "balance_loss_clip": 1.41488981, + "balance_loss_mlp": 0.56183267, + "epoch": 0.018457838569066586, + "flos": 29132531619840.0, + "grad_norm": 532.5524800914161, + "language_loss": 0.93367171, + "learning_rate": 3.687243426879095e-06, + "loss": 0.95946515, + "num_input_tokens_seen": 6437520, + "router_z_loss_clip": 5.3828125, + "router_z_loss_mlp": 0.64746094, + "step": 307, + "time_per_iteration": 2.7934508323669434 + }, + { + "auxiliary_loss_clip": 0.01943029, + "auxiliary_loss_mlp": 0.00627135, + "balance_loss_clip": 1.42122304, + "balance_loss_mlp": 0.56137931, + "epoch": 0.018517961821734555, + "flos": 19208654167680.0, + "grad_norm": 137.1687420238854, + "language_loss": 0.77668875, + "learning_rate": 3.6893372569634466e-06, + "loss": 0.8023904, + "num_input_tokens_seen": 6455680, + "router_z_loss_clip": 5.21484375, + "router_z_loss_mlp": 0.65771484, + "step": 308, + "time_per_iteration": 2.6190052032470703 + }, + { + "auxiliary_loss_clip": 0.0192778, + "auxiliary_loss_mlp": 0.00633262, + "balance_loss_clip": 1.39624834, + "balance_loss_mlp": 0.56450224, + "epoch": 0.018578085074402523, + "flos": 19863018184320.0, + "grad_norm": 87.61787371984016, + "language_loss": 0.95723361, + "learning_rate": 3.6914242999043395e-06, + "loss": 0.982844, + "num_input_tokens_seen": 6474880, + "router_z_loss_clip": 5.31640625, + "router_z_loss_mlp": 0.6875, + "step": 309, + "time_per_iteration": 2.7256393432617188 + }, + { + "auxiliary_loss_clip": 0.01940024, + "auxiliary_loss_mlp": 0.00604975, + "balance_loss_clip": 1.40905523, + "balance_loss_mlp": 0.53588146, + "epoch": 0.018638208327070496, + "flos": 29606947476480.0, + "grad_norm": 141.67762967809463, + "language_loss": 0.80049896, + "learning_rate": 3.69350459956065e-06, + "loss": 0.82594895, + "num_input_tokens_seen": 6495945, + "router_z_loss_clip": 5.30078125, + "router_z_loss_mlp": 0.69091797, + "step": 310, + "time_per_iteration": 2.706019163131714 + }, + { + "auxiliary_loss_clip": 0.01911132, + "auxiliary_loss_mlp": 0.00564405, + "balance_loss_clip": 1.40414667, + "balance_loss_mlp": 0.50367987, + "epoch": 0.018698331579738464, + "flos": 45731555907840.0, + "grad_norm": 27.169703525625188, + "language_loss": 0.80480087, + "learning_rate": 3.695578199367497e-06, + "loss": 0.82955623, + "num_input_tokens_seen": 6519930, + "router_z_loss_clip": 5.0703125, + "router_z_loss_mlp": 0.60791016, + "step": 311, + "time_per_iteration": 2.9174864292144775 + }, + { + "auxiliary_loss_clip": 0.01918838, + "auxiliary_loss_mlp": 0.00583053, + "balance_loss_clip": 1.40000701, + "balance_loss_mlp": 0.52180374, + "epoch": 0.018758454832406433, + "flos": 20483662308480.0, + "grad_norm": 67.65161688327657, + "language_loss": 0.98480678, + "learning_rate": 3.6976451423416825e-06, + "loss": 1.00982571, + "num_input_tokens_seen": 6535070, + "router_z_loss_clip": 5.19140625, + "router_z_loss_mlp": 0.61254883, + "step": 312, + "time_per_iteration": 2.6283090114593506 + }, + { + "auxiliary_loss_clip": 0.01896005, + "auxiliary_loss_mlp": 0.00558838, + "balance_loss_clip": 1.38950479, + "balance_loss_mlp": 0.50161779, + "epoch": 0.0188185780850744, + "flos": 15777784661760.0, + "grad_norm": 127.75495787855547, + "language_loss": 0.98651153, + "learning_rate": 3.699705471087043e-06, + "loss": 1.01105988, + "num_input_tokens_seen": 6554135, + "router_z_loss_clip": 5.06640625, + "router_z_loss_mlp": 0.57226562, + "step": 313, + "time_per_iteration": 2.671684741973877 + }, + { + "auxiliary_loss_clip": 0.01927028, + "auxiliary_loss_mlp": 0.00652108, + "balance_loss_clip": 1.39609647, + "balance_loss_mlp": 0.579247, + "epoch": 0.018878701337742373, + "flos": 22455732758400.0, + "grad_norm": 79.78003727905573, + "language_loss": 0.84097493, + "learning_rate": 3.7017592277997256e-06, + "loss": 0.86676621, + "num_input_tokens_seen": 6572275, + "router_z_loss_clip": 5.3125, + "router_z_loss_mlp": 0.72851562, + "step": 314, + "time_per_iteration": 4.141700506210327 + }, + { + "auxiliary_loss_clip": 0.01880948, + "auxiliary_loss_mlp": 0.00649402, + "balance_loss_clip": 1.37381172, + "balance_loss_mlp": 0.58026052, + "epoch": 0.018938824590410342, + "flos": 30993530238720.0, + "grad_norm": 26.186092506709006, + "language_loss": 0.98196876, + "learning_rate": 3.7038064542733654e-06, + "loss": 1.00727224, + "num_input_tokens_seen": 6594520, + "router_z_loss_clip": 5.06640625, + "router_z_loss_mlp": 0.69042969, + "step": 315, + "time_per_iteration": 4.144528865814209 + }, + { + "auxiliary_loss_clip": 0.01894662, + "auxiliary_loss_mlp": 0.00625582, + "balance_loss_clip": 1.386235, + "balance_loss_mlp": 0.55901575, + "epoch": 0.01899894784307831, + "flos": 23258910821760.0, + "grad_norm": 16.899518832477554, + "language_loss": 0.86744428, + "learning_rate": 3.7058471919041945e-06, + "loss": 0.89264667, + "num_input_tokens_seen": 6614245, + "router_z_loss_clip": 5.08203125, + "router_z_loss_mlp": 0.66601562, + "step": 316, + "time_per_iteration": 2.637929677963257 + }, + { + "auxiliary_loss_clip": 0.01853121, + "auxiliary_loss_mlp": 0.00571831, + "balance_loss_clip": 1.37452281, + "balance_loss_mlp": 0.51213133, + "epoch": 0.01905907109574628, + "flos": 17457901367040.0, + "grad_norm": 32.999764804712555, + "language_loss": 0.94601923, + "learning_rate": 3.7078814816960605e-06, + "loss": 0.97026873, + "num_input_tokens_seen": 6632015, + "router_z_loss_clip": 4.7890625, + "router_z_loss_mlp": 0.59765625, + "step": 317, + "time_per_iteration": 2.6296045780181885 + }, + { + "auxiliary_loss_clip": 0.0184487, + "auxiliary_loss_mlp": 0.00557567, + "balance_loss_clip": 1.36584783, + "balance_loss_mlp": 0.49805823, + "epoch": 0.019119194348414248, + "flos": 14970225139200.0, + "grad_norm": 364.02355462429205, + "language_loss": 1.00598466, + "learning_rate": 3.709909364265374e-06, + "loss": 1.03000903, + "num_input_tokens_seen": 6649015, + "router_z_loss_clip": 4.796875, + "router_z_loss_mlp": 0.59521484, + "step": 318, + "time_per_iteration": 2.598317861557007 + }, + { + "auxiliary_loss_clip": 0.0183217, + "auxiliary_loss_mlp": 0.00571069, + "balance_loss_clip": 1.35594416, + "balance_loss_mlp": 0.51294261, + "epoch": 0.01917931760108222, + "flos": 25482822503040.0, + "grad_norm": 67.82298671830179, + "language_loss": 1.01693892, + "learning_rate": 3.7119308798459706e-06, + "loss": 1.0409714, + "num_input_tokens_seen": 6669225, + "router_z_loss_clip": 4.765625, + "router_z_loss_mlp": 0.58154297, + "step": 319, + "time_per_iteration": 2.7049930095672607 + }, + { + "auxiliary_loss_clip": 0.01575436, + "auxiliary_loss_mlp": 0.00491821, + "balance_loss_clip": 1.27333307, + "balance_loss_mlp": 0.46454597, + "epoch": 0.01923944085375019, + "flos": 71556967353600.0, + "grad_norm": 0.9469011974605895, + "language_loss": 0.59268308, + "learning_rate": 3.7139460682939026e-06, + "loss": 0.61335564, + "num_input_tokens_seen": 6725775, + "router_z_loss_clip": 3.015625, + "router_z_loss_mlp": 0.2734375, + "step": 320, + "time_per_iteration": 3.0543596744537354 + }, + { + "auxiliary_loss_clip": 0.01788905, + "auxiliary_loss_mlp": 0.00573487, + "balance_loss_clip": 1.32543993, + "balance_loss_mlp": 0.50692064, + "epoch": 0.019299564106418157, + "flos": 19682495406720.0, + "grad_norm": 170.3814386167926, + "language_loss": 0.99715489, + "learning_rate": 3.715954969092154e-06, + "loss": 1.02077889, + "num_input_tokens_seen": 6744170, + "router_z_loss_clip": 4.6328125, + "router_z_loss_mlp": 0.66503906, + "step": 321, + "time_per_iteration": 2.679600715637207 + }, + { + "auxiliary_loss_clip": 0.01773872, + "auxiliary_loss_mlp": 0.00563566, + "balance_loss_clip": 1.31902266, + "balance_loss_mlp": 0.50138676, + "epoch": 0.019359687359086126, + "flos": 24387151991040.0, + "grad_norm": 67.16829898159335, + "language_loss": 0.90888399, + "learning_rate": 3.7179576213552805e-06, + "loss": 0.93225837, + "num_input_tokens_seen": 6764565, + "router_z_loss_clip": 4.546875, + "router_z_loss_mlp": 0.62158203, + "step": 322, + "time_per_iteration": 2.670535087585449 + }, + { + "auxiliary_loss_clip": 0.01764579, + "auxiliary_loss_mlp": 0.00511119, + "balance_loss_clip": 1.31445599, + "balance_loss_mlp": 0.45041743, + "epoch": 0.019419810611754094, + "flos": 23951376190080.0, + "grad_norm": 25.036396611729852, + "language_loss": 0.81708264, + "learning_rate": 3.719954063833981e-06, + "loss": 0.83983958, + "num_input_tokens_seen": 6785310, + "router_z_loss_clip": 4.5078125, + "router_z_loss_mlp": 0.60668945, + "step": 323, + "time_per_iteration": 2.671778917312622 + }, + { + "auxiliary_loss_clip": 0.01737965, + "auxiliary_loss_mlp": 0.00504715, + "balance_loss_clip": 1.29733872, + "balance_loss_mlp": 0.44277409, + "epoch": 0.019479933864422067, + "flos": 22160223567360.0, + "grad_norm": 11.807415918632262, + "language_loss": 0.9827792, + "learning_rate": 3.721944334919596e-06, + "loss": 1.00520599, + "num_input_tokens_seen": 6803290, + "router_z_loss_clip": 4.40820312, + "router_z_loss_mlp": 0.61914062, + "step": 324, + "time_per_iteration": 2.702497959136963 + }, + { + "auxiliary_loss_clip": 0.01731413, + "auxiliary_loss_mlp": 0.00508918, + "balance_loss_clip": 1.29354072, + "balance_loss_mlp": 0.44790635, + "epoch": 0.019540057117090035, + "flos": 22236821320320.0, + "grad_norm": 96.16596751443934, + "language_loss": 0.74214327, + "learning_rate": 3.7239284726485375e-06, + "loss": 0.76454657, + "num_input_tokens_seen": 6822570, + "router_z_loss_clip": 4.37695312, + "router_z_loss_mlp": 0.60986328, + "step": 325, + "time_per_iteration": 2.651331901550293 + }, + { + "auxiliary_loss_clip": 0.01681344, + "auxiliary_loss_mlp": 0.00460343, + "balance_loss_clip": 1.27323341, + "balance_loss_mlp": 0.40333682, + "epoch": 0.019600180369758004, + "flos": 23076771932160.0, + "grad_norm": 26.16831652607451, + "language_loss": 0.82556951, + "learning_rate": 3.72590651470665e-06, + "loss": 0.84698641, + "num_input_tokens_seen": 6841910, + "router_z_loss_clip": 4.0859375, + "router_z_loss_mlp": 0.56958008, + "step": 326, + "time_per_iteration": 2.687748908996582 + }, + { + "auxiliary_loss_clip": 0.01674731, + "auxiliary_loss_mlp": 0.00442401, + "balance_loss_clip": 1.26869357, + "balance_loss_mlp": 0.38801777, + "epoch": 0.019660303622425972, + "flos": 25410857604480.0, + "grad_norm": 20.897883022641754, + "language_loss": 0.86422735, + "learning_rate": 3.727878498433505e-06, + "loss": 0.88539869, + "num_input_tokens_seen": 6862480, + "router_z_loss_clip": 4.0625, + "router_z_loss_mlp": 0.54394531, + "step": 327, + "time_per_iteration": 2.688828468322754 + }, + { + "auxiliary_loss_clip": 0.01680565, + "auxiliary_loss_mlp": 0.00484639, + "balance_loss_clip": 1.2719723, + "balance_loss_mlp": 0.4284679, + "epoch": 0.01972042687509394, + "flos": 23657519024640.0, + "grad_norm": 12.996775435534014, + "language_loss": 0.87821972, + "learning_rate": 3.7298444608266328e-06, + "loss": 0.89987171, + "num_input_tokens_seen": 6882015, + "router_z_loss_clip": 4.08203125, + "router_z_loss_mlp": 0.56201172, + "step": 328, + "time_per_iteration": 2.696183681488037 + }, + { + "auxiliary_loss_clip": 0.0166814, + "auxiliary_loss_mlp": 0.00520772, + "balance_loss_clip": 1.26261473, + "balance_loss_mlp": 0.45897388, + "epoch": 0.019780550127761913, + "flos": 18223480869120.0, + "grad_norm": 514.2828161640098, + "language_loss": 1.05165446, + "learning_rate": 3.731804438545683e-06, + "loss": 1.07354355, + "num_input_tokens_seen": 6899785, + "router_z_loss_clip": 4.04882812, + "router_z_loss_mlp": 0.61767578, + "step": 329, + "time_per_iteration": 2.700099468231201 + }, + { + "auxiliary_loss_clip": 0.01646876, + "auxiliary_loss_mlp": 0.0050209, + "balance_loss_clip": 1.24925864, + "balance_loss_mlp": 0.44401163, + "epoch": 0.01984067338042988, + "flos": 22418780641920.0, + "grad_norm": 65.24107477841241, + "language_loss": 0.84408891, + "learning_rate": 3.7337584679165324e-06, + "loss": 0.86557865, + "num_input_tokens_seen": 6918575, + "router_z_loss_clip": 3.984375, + "router_z_loss_mlp": 0.58007812, + "step": 330, + "time_per_iteration": 2.716856002807617 + }, + { + "auxiliary_loss_clip": 0.01649218, + "auxiliary_loss_mlp": 0.00550871, + "balance_loss_clip": 1.2538352, + "balance_loss_mlp": 0.49150488, + "epoch": 0.01990079663309785, + "flos": 17055199013760.0, + "grad_norm": 12.52852625095614, + "language_loss": 1.03892481, + "learning_rate": 3.7357065849353186e-06, + "loss": 1.0609256, + "num_input_tokens_seen": 6936965, + "router_z_loss_clip": 3.95703125, + "router_z_loss_mlp": 0.59375, + "step": 331, + "time_per_iteration": 2.6292076110839844 + }, + { + "auxiliary_loss_clip": 0.01608305, + "auxiliary_loss_mlp": 0.00480475, + "balance_loss_clip": 1.23974895, + "balance_loss_mlp": 0.42866617, + "epoch": 0.01996091988576582, + "flos": 15961791058560.0, + "grad_norm": 19.01419541058959, + "language_loss": 1.01349318, + "learning_rate": 3.737648825272422e-06, + "loss": 1.03438091, + "num_input_tokens_seen": 6953475, + "router_z_loss_clip": 3.68359375, + "router_z_loss_mlp": 0.51879883, + "step": 332, + "time_per_iteration": 2.7464144229888916 + }, + { + "auxiliary_loss_clip": 0.01618529, + "auxiliary_loss_mlp": 0.00512484, + "balance_loss_clip": 1.23843884, + "balance_loss_mlp": 0.45268869, + "epoch": 0.02002104313843379, + "flos": 23586451966080.0, + "grad_norm": 85.18298770950838, + "language_loss": 0.87455326, + "learning_rate": 3.739585224276384e-06, + "loss": 0.89586341, + "num_input_tokens_seen": 6971630, + "router_z_loss_clip": 3.8046875, + "router_z_loss_mlp": 0.59765625, + "step": 333, + "time_per_iteration": 2.6837961673736572 + }, + { + "auxiliary_loss_clip": 0.01615747, + "auxiliary_loss_mlp": 0.00549613, + "balance_loss_clip": 1.23368192, + "balance_loss_mlp": 0.49069938, + "epoch": 0.02008116639110176, + "flos": 34094883352320.0, + "grad_norm": 139.65863162406126, + "language_loss": 0.89745402, + "learning_rate": 3.7415158169777673e-06, + "loss": 0.91910768, + "num_input_tokens_seen": 6992775, + "router_z_loss_clip": 3.81640625, + "router_z_loss_mlp": 0.58959961, + "step": 334, + "time_per_iteration": 2.7973129749298096 + }, + { + "auxiliary_loss_clip": 0.01605423, + "auxiliary_loss_mlp": 0.00495666, + "balance_loss_clip": 1.23113739, + "balance_loss_mlp": 0.43846929, + "epoch": 0.020141289643769728, + "flos": 19683716469120.0, + "grad_norm": 420.27676017331135, + "language_loss": 0.89613837, + "learning_rate": 3.7434406380929575e-06, + "loss": 0.91714931, + "num_input_tokens_seen": 7011425, + "router_z_loss_clip": 3.74609375, + "router_z_loss_mlp": 0.57250977, + "step": 335, + "time_per_iteration": 2.6402931213378906 + }, + { + "auxiliary_loss_clip": 0.01581489, + "auxiliary_loss_mlp": 0.00513036, + "balance_loss_clip": 1.21737957, + "balance_loss_mlp": 0.45927253, + "epoch": 0.020201412896437697, + "flos": 20740567357440.0, + "grad_norm": 43.20996183303322, + "language_loss": 0.98710144, + "learning_rate": 3.745359722027911e-06, + "loss": 1.00804663, + "num_input_tokens_seen": 7029450, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 0.53833008, + "step": 336, + "time_per_iteration": 2.637401819229126 + }, + { + "auxiliary_loss_clip": 0.01600632, + "auxiliary_loss_mlp": 0.00493784, + "balance_loss_clip": 1.22920382, + "balance_loss_mlp": 0.43770826, + "epoch": 0.020261536149105665, + "flos": 20266510636800.0, + "grad_norm": 186.48634814785888, + "language_loss": 0.94911402, + "learning_rate": 3.7472731028818428e-06, + "loss": 0.9700582, + "num_input_tokens_seen": 7047555, + "router_z_loss_clip": 3.71679688, + "router_z_loss_mlp": 0.56054688, + "step": 337, + "time_per_iteration": 2.6537132263183594 + }, + { + "auxiliary_loss_clip": 0.01566339, + "auxiliary_loss_mlp": 0.0049742, + "balance_loss_clip": 1.21586204, + "balance_loss_mlp": 0.44339469, + "epoch": 0.020321659401773638, + "flos": 25848752307840.0, + "grad_norm": 269.21351390933336, + "language_loss": 0.95424926, + "learning_rate": 3.7491808144508626e-06, + "loss": 0.97488683, + "num_input_tokens_seen": 7068185, + "router_z_loss_clip": 3.50195312, + "router_z_loss_mlp": 0.54003906, + "step": 338, + "time_per_iteration": 2.6590757369995117 + }, + { + "auxiliary_loss_clip": 0.01584851, + "auxiliary_loss_mlp": 0.00495898, + "balance_loss_clip": 1.22041345, + "balance_loss_mlp": 0.44230118, + "epoch": 0.020381782654441606, + "flos": 17495033051520.0, + "grad_norm": 166.23521792484578, + "language_loss": 0.92984003, + "learning_rate": 3.7510828902315576e-06, + "loss": 0.95064759, + "num_input_tokens_seen": 7085955, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 0.53564453, + "step": 339, + "time_per_iteration": 2.6070399284362793 + }, + { + "auxiliary_loss_clip": 0.01589192, + "auxiliary_loss_mlp": 0.00559817, + "balance_loss_clip": 1.21699238, + "balance_loss_mlp": 0.50190467, + "epoch": 0.020441905907109575, + "flos": 24243940465920.0, + "grad_norm": 538.9376044915294, + "language_loss": 0.94868124, + "learning_rate": 3.75297936342452e-06, + "loss": 0.97017133, + "num_input_tokens_seen": 7106345, + "router_z_loss_clip": 3.71875, + "router_z_loss_mlp": 0.57885742, + "step": 340, + "time_per_iteration": 2.8064749240875244 + }, + { + "auxiliary_loss_clip": 0.01577495, + "auxiliary_loss_mlp": 0.00540694, + "balance_loss_clip": 1.21853173, + "balance_loss_mlp": 0.48526165, + "epoch": 0.020502029159777543, + "flos": 22233301787520.0, + "grad_norm": 56.72102413440594, + "language_loss": 0.94670618, + "learning_rate": 3.7548702669378253e-06, + "loss": 0.96788812, + "num_input_tokens_seen": 7125070, + "router_z_loss_clip": 3.58984375, + "router_z_loss_mlp": 0.55395508, + "step": 341, + "time_per_iteration": 2.7288832664489746 + }, + { + "auxiliary_loss_clip": 0.01574016, + "auxiliary_loss_mlp": 0.00459179, + "balance_loss_clip": 1.21408892, + "balance_loss_mlp": 0.4109472, + "epoch": 0.020562152412445512, + "flos": 23987861429760.0, + "grad_norm": 21.715956417489668, + "language_loss": 0.92123783, + "learning_rate": 3.756755633390458e-06, + "loss": 0.94156981, + "num_input_tokens_seen": 7144675, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 0.48242188, + "step": 342, + "time_per_iteration": 2.658841848373413 + }, + { + "auxiliary_loss_clip": 0.01555781, + "auxiliary_loss_mlp": 0.00505225, + "balance_loss_clip": 1.20409107, + "balance_loss_mlp": 0.44764715, + "epoch": 0.020622275665113484, + "flos": 26975305537920.0, + "grad_norm": 355.7760497545173, + "language_loss": 0.96989119, + "learning_rate": 3.7586354951156886e-06, + "loss": 0.99050122, + "num_input_tokens_seen": 7165505, + "router_z_loss_clip": 3.51757812, + "router_z_loss_mlp": 0.57519531, + "step": 343, + "time_per_iteration": 2.7005555629730225 + }, + { + "auxiliary_loss_clip": 0.01548089, + "auxiliary_loss_mlp": 0.00499678, + "balance_loss_clip": 1.20106864, + "balance_loss_mlp": 0.44972908, + "epoch": 0.020682398917781453, + "flos": 22600704049920.0, + "grad_norm": 49.813031470563764, + "language_loss": 0.84791827, + "learning_rate": 3.7605098841644e-06, + "loss": 0.86839592, + "num_input_tokens_seen": 7184605, + "router_z_loss_clip": 3.47070312, + "router_z_loss_mlp": 0.49951172, + "step": 344, + "time_per_iteration": 2.688504934310913 + }, + { + "auxiliary_loss_clip": 0.01555407, + "auxiliary_loss_mlp": 0.00529, + "balance_loss_clip": 1.20427155, + "balance_loss_mlp": 0.47323391, + "epoch": 0.02074252217044942, + "flos": 15013605790080.0, + "grad_norm": 41.64105873649356, + "language_loss": 0.85191011, + "learning_rate": 3.7623788323083666e-06, + "loss": 0.87275422, + "num_input_tokens_seen": 7203065, + "router_z_loss_clip": 3.50976562, + "router_z_loss_mlp": 0.55786133, + "step": 345, + "time_per_iteration": 2.8025686740875244 + }, + { + "auxiliary_loss_clip": 0.01531499, + "auxiliary_loss_mlp": 0.00493695, + "balance_loss_clip": 1.19356287, + "balance_loss_mlp": 0.44427055, + "epoch": 0.02080264542311739, + "flos": 25337958952320.0, + "grad_norm": 9.121240579578473, + "language_loss": 0.96176076, + "learning_rate": 3.7642423710434837e-06, + "loss": 0.98201269, + "num_input_tokens_seen": 7222995, + "router_z_loss_clip": 3.38085938, + "router_z_loss_mlp": 0.49438477, + "step": 346, + "time_per_iteration": 2.728078842163086 + }, + { + "auxiliary_loss_clip": 0.0154885, + "auxiliary_loss_mlp": 0.00492269, + "balance_loss_clip": 1.20300698, + "balance_loss_mlp": 0.43848136, + "epoch": 0.02086276867578536, + "flos": 24388804016640.0, + "grad_norm": 100.64245956440361, + "language_loss": 0.89130104, + "learning_rate": 3.7661005315929563e-06, + "loss": 0.91171217, + "num_input_tokens_seen": 7244625, + "router_z_loss_clip": 3.45898438, + "router_z_loss_mlp": 0.53808594, + "step": 347, + "time_per_iteration": 2.750784397125244 + }, + { + "auxiliary_loss_clip": 0.01523387, + "auxiliary_loss_mlp": 0.00485177, + "balance_loss_clip": 1.18697691, + "balance_loss_mlp": 0.43458438, + "epoch": 0.02092289192845333, + "flos": 24462205459200.0, + "grad_norm": 31.142157314495158, + "language_loss": 0.81494319, + "learning_rate": 3.7679533449104354e-06, + "loss": 0.83502883, + "num_input_tokens_seen": 7263255, + "router_z_loss_clip": 3.36523438, + "router_z_loss_mlp": 0.50561523, + "step": 348, + "time_per_iteration": 2.719184637069702 + }, + { + "auxiliary_loss_clip": 0.01535608, + "auxiliary_loss_mlp": 0.00460714, + "balance_loss_clip": 1.18873239, + "balance_loss_mlp": 0.40954989, + "epoch": 0.0209830151811213, + "flos": 17451185523840.0, + "grad_norm": 26.211156367775544, + "language_loss": 0.87328959, + "learning_rate": 3.7698008416831116e-06, + "loss": 0.89325279, + "num_input_tokens_seen": 7279275, + "router_z_loss_clip": 3.46875, + "router_z_loss_mlp": 0.51147461, + "step": 349, + "time_per_iteration": 2.6466236114501953 + }, + { + "auxiliary_loss_clip": 0.01517702, + "auxiliary_loss_mlp": 0.00462923, + "balance_loss_clip": 1.18648422, + "balance_loss_mlp": 0.41664594, + "epoch": 0.021043138433789268, + "flos": 24573995562240.0, + "grad_norm": 11.984612101666762, + "language_loss": 0.91260052, + "learning_rate": 3.7716430523347664e-06, + "loss": 0.93240678, + "num_input_tokens_seen": 7300180, + "router_z_loss_clip": 3.30859375, + "router_z_loss_mlp": 0.46240234, + "step": 350, + "time_per_iteration": 2.699597120285034 + }, + { + "auxiliary_loss_clip": 0.01523505, + "auxiliary_loss_mlp": 0.00463708, + "balance_loss_clip": 1.18674469, + "balance_loss_mlp": 0.41483241, + "epoch": 0.021103261686457236, + "flos": 24454053072000.0, + "grad_norm": 8.956580187560727, + "language_loss": 0.86202234, + "learning_rate": 3.773480007028776e-06, + "loss": 0.88189447, + "num_input_tokens_seen": 7317430, + "router_z_loss_clip": 3.37304688, + "router_z_loss_mlp": 0.48828125, + "step": 351, + "time_per_iteration": 2.692387819290161 + }, + { + "auxiliary_loss_clip": 0.01513763, + "auxiliary_loss_mlp": 0.00426332, + "balance_loss_clip": 1.17884672, + "balance_loss_mlp": 0.37843406, + "epoch": 0.021163384939125205, + "flos": 14683083816960.0, + "grad_norm": 35.0760203666088, + "language_loss": 0.9445309, + "learning_rate": 3.775311735671078e-06, + "loss": 0.96393192, + "num_input_tokens_seen": 7334875, + "router_z_loss_clip": 3.34960938, + "router_z_loss_mlp": 0.47900391, + "step": 352, + "time_per_iteration": 2.7070248126983643 + }, + { + "auxiliary_loss_clip": 0.01523317, + "auxiliary_loss_mlp": 0.0041904, + "balance_loss_clip": 1.18905723, + "balance_loss_mlp": 0.37056965, + "epoch": 0.021223508191793177, + "flos": 24493195918080.0, + "grad_norm": 31.76226518990824, + "language_loss": 0.89309633, + "learning_rate": 3.7771382679130878e-06, + "loss": 0.91251987, + "num_input_tokens_seen": 7355185, + "router_z_loss_clip": 3.34375, + "router_z_loss_mlp": 0.48461914, + "step": 353, + "time_per_iteration": 2.72784686088562 + }, + { + "auxiliary_loss_clip": 0.01521784, + "auxiliary_loss_mlp": 0.00380775, + "balance_loss_clip": 1.18873739, + "balance_loss_mlp": 0.3322331, + "epoch": 0.021283631444461146, + "flos": 24126978804480.0, + "grad_norm": 9.925819161780565, + "language_loss": 0.88594818, + "learning_rate": 3.7789596331545845e-06, + "loss": 0.9049738, + "num_input_tokens_seen": 7374425, + "router_z_loss_clip": 3.33007812, + "router_z_loss_mlp": 0.48510742, + "step": 354, + "time_per_iteration": 2.7640459537506104 + }, + { + "auxiliary_loss_clip": 0.01513916, + "auxiliary_loss_mlp": 0.00371801, + "balance_loss_clip": 1.17788553, + "balance_loss_mlp": 0.32271087, + "epoch": 0.021343754697129114, + "flos": 25192233475200.0, + "grad_norm": 29.738870651972434, + "language_loss": 0.89208066, + "learning_rate": 3.780775860546545e-06, + "loss": 0.91093791, + "num_input_tokens_seen": 7394175, + "router_z_loss_clip": 3.35742188, + "router_z_loss_mlp": 0.49047852, + "step": 355, + "time_per_iteration": 2.677489757537842 + }, + { + "auxiliary_loss_clip": 0.01529443, + "auxiliary_loss_mlp": 0.00328227, + "balance_loss_clip": 1.19072616, + "balance_loss_mlp": 0.2762281, + "epoch": 0.021403877949797083, + "flos": 17274182279040.0, + "grad_norm": 98.02544639954667, + "language_loss": 0.97622085, + "learning_rate": 3.7825869789939474e-06, + "loss": 0.99479759, + "num_input_tokens_seen": 7412645, + "router_z_loss_clip": 3.38867188, + "router_z_loss_mlp": 0.52001953, + "step": 356, + "time_per_iteration": 4.057261943817139 + }, + { + "auxiliary_loss_clip": 0.01509145, + "auxiliary_loss_mlp": 0.00339883, + "balance_loss_clip": 1.17671204, + "balance_loss_mlp": 0.2927236, + "epoch": 0.021464001202465055, + "flos": 30917435276160.0, + "grad_norm": 7.057764249280363, + "language_loss": 0.88654959, + "learning_rate": 3.784393017158528e-06, + "loss": 0.90503991, + "num_input_tokens_seen": 7432275, + "router_z_loss_clip": 3.32421875, + "router_z_loss_mlp": 0.47192383, + "step": 357, + "time_per_iteration": 4.111879348754883 + }, + { + "auxiliary_loss_clip": 0.0151194, + "auxiliary_loss_mlp": 0.00317522, + "balance_loss_clip": 1.18188429, + "balance_loss_mlp": 0.27246064, + "epoch": 0.021524124455133024, + "flos": 18186385098240.0, + "grad_norm": 60.287967632089305, + "language_loss": 0.86501372, + "learning_rate": 3.786194003461506e-06, + "loss": 0.88330829, + "num_input_tokens_seen": 7450245, + "router_z_loss_clip": 3.29882812, + "router_z_loss_mlp": 0.45092773, + "step": 358, + "time_per_iteration": 2.629321575164795 + }, + { + "auxiliary_loss_clip": 0.01509169, + "auxiliary_loss_mlp": 0.00334755, + "balance_loss_clip": 1.17580867, + "balance_loss_mlp": 0.28547344, + "epoch": 0.021584247707800992, + "flos": 13805786039040.0, + "grad_norm": 14.208377939748068, + "language_loss": 0.98343897, + "learning_rate": 3.787989966086264e-06, + "loss": 1.00187826, + "num_input_tokens_seen": 7466845, + "router_z_loss_clip": 3.33789062, + "router_z_loss_mlp": 0.4921875, + "step": 359, + "time_per_iteration": 2.6666781902313232 + }, + { + "auxiliary_loss_clip": 0.01527921, + "auxiliary_loss_mlp": 0.00339357, + "balance_loss_clip": 1.18625307, + "balance_loss_mlp": 0.29238832, + "epoch": 0.02164437096046896, + "flos": 23294713703040.0, + "grad_norm": 45.84299647618601, + "language_loss": 0.87531233, + "learning_rate": 3.789780932980997e-06, + "loss": 0.89398509, + "num_input_tokens_seen": 7485450, + "router_z_loss_clip": 3.41796875, + "router_z_loss_mlp": 0.4699707, + "step": 360, + "time_per_iteration": 2.643038749694824 + }, + { + "auxiliary_loss_clip": 0.01429652, + "auxiliary_loss_mlp": 0.00214293, + "balance_loss_clip": 1.1649189, + "balance_loss_mlp": 0.20017846, + "epoch": 0.02170449421313693, + "flos": 68899578341760.0, + "grad_norm": 0.9529653064056531, + "language_loss": 0.64688778, + "learning_rate": 3.79156693186132e-06, + "loss": 0.66332722, + "num_input_tokens_seen": 7553780, + "router_z_loss_clip": 2.65625, + "router_z_loss_mlp": 0.14160156, + "step": 361, + "time_per_iteration": 3.26251482963562 + }, + { + "auxiliary_loss_clip": 0.01509172, + "auxiliary_loss_mlp": 0.00316629, + "balance_loss_clip": 1.17540598, + "balance_loss_mlp": 0.26830164, + "epoch": 0.0217646174658049, + "flos": 25228539146880.0, + "grad_norm": 613.6648658632408, + "language_loss": 0.9185428, + "learning_rate": 3.7933479902128433e-06, + "loss": 0.93680078, + "num_input_tokens_seen": 7574155, + "router_z_loss_clip": 3.33789062, + "router_z_loss_mlp": 0.48291016, + "step": 362, + "time_per_iteration": 2.7215042114257812 + }, + { + "auxiliary_loss_clip": 0.01511226, + "auxiliary_loss_mlp": 0.00334452, + "balance_loss_clip": 1.17727757, + "balance_loss_mlp": 0.28893808, + "epoch": 0.02182474071847287, + "flos": 22893124671360.0, + "grad_norm": 24.13255144442568, + "language_loss": 1.0062834, + "learning_rate": 3.7951241352937077e-06, + "loss": 1.0247401, + "num_input_tokens_seen": 7592320, + "router_z_loss_clip": 3.33984375, + "router_z_loss_mlp": 0.45507812, + "step": 363, + "time_per_iteration": 2.707303285598755 + }, + { + "auxiliary_loss_clip": 0.01520518, + "auxiliary_loss_mlp": 0.00311269, + "balance_loss_clip": 1.19119895, + "balance_loss_mlp": 0.26470545, + "epoch": 0.02188486397114084, + "flos": 23658991482240.0, + "grad_norm": 88.32388624868497, + "language_loss": 0.98755634, + "learning_rate": 3.7968953941370915e-06, + "loss": 1.00587428, + "num_input_tokens_seen": 7611185, + "router_z_loss_clip": 3.29492188, + "router_z_loss_mlp": 0.46557617, + "step": 364, + "time_per_iteration": 2.67305064201355 + }, + { + "auxiliary_loss_clip": 0.01515798, + "auxiliary_loss_mlp": 0.00333592, + "balance_loss_clip": 1.18681383, + "balance_loss_mlp": 0.28447765, + "epoch": 0.021944987223808807, + "flos": 21543637680000.0, + "grad_norm": 21.01746795026877, + "language_loss": 0.8771807, + "learning_rate": 3.798661793553676e-06, + "loss": 0.89567459, + "num_input_tokens_seen": 7631970, + "router_z_loss_clip": 3.29492188, + "router_z_loss_mlp": 0.49145508, + "step": 365, + "time_per_iteration": 2.6399991512298584 + }, + { + "auxiliary_loss_clip": 0.01502205, + "auxiliary_loss_mlp": 0.00322314, + "balance_loss_clip": 1.18171906, + "balance_loss_mlp": 0.27765781, + "epoch": 0.022005110476476776, + "flos": 16070887641600.0, + "grad_norm": 38.780389958421054, + "language_loss": 0.90803719, + "learning_rate": 3.8004233601340808e-06, + "loss": 0.92628235, + "num_input_tokens_seen": 7649745, + "router_z_loss_clip": 3.20507812, + "router_z_loss_mlp": 0.4465332, + "step": 366, + "time_per_iteration": 2.6642067432403564 + }, + { + "auxiliary_loss_clip": 0.01535178, + "auxiliary_loss_mlp": 0.00349082, + "balance_loss_clip": 1.19863892, + "balance_loss_mlp": 0.30285305, + "epoch": 0.022065233729144748, + "flos": 21433715084160.0, + "grad_norm": 38.01086816308675, + "language_loss": 0.96822864, + "learning_rate": 3.8021801202512694e-06, + "loss": 0.98707128, + "num_input_tokens_seen": 7668830, + "router_z_loss_clip": 3.3671875, + "router_z_loss_mlp": 0.4621582, + "step": 367, + "time_per_iteration": 2.6914007663726807 + }, + { + "auxiliary_loss_clip": 0.01533183, + "auxiliary_loss_mlp": 0.00362311, + "balance_loss_clip": 1.19617915, + "balance_loss_mlp": 0.31343508, + "epoch": 0.022125356981812717, + "flos": 21543709507200.0, + "grad_norm": 43.225859883623606, + "language_loss": 0.9389894, + "learning_rate": 3.803932100062912e-06, + "loss": 0.95794439, + "num_input_tokens_seen": 7687240, + "router_z_loss_clip": 3.36914062, + "router_z_loss_mlp": 0.48852539, + "step": 368, + "time_per_iteration": 2.7055225372314453 + }, + { + "auxiliary_loss_clip": 0.01544261, + "auxiliary_loss_mlp": 0.00406056, + "balance_loss_clip": 1.19792843, + "balance_loss_mlp": 0.35386607, + "epoch": 0.022185480234480685, + "flos": 20704153944960.0, + "grad_norm": 75.62107275637335, + "language_loss": 0.89077866, + "learning_rate": 3.8056793255137264e-06, + "loss": 0.91028184, + "num_input_tokens_seen": 7704440, + "router_z_loss_clip": 3.46289062, + "router_z_loss_mlp": 0.52172852, + "step": 369, + "time_per_iteration": 2.6808202266693115 + }, + { + "auxiliary_loss_clip": 0.0151256, + "auxiliary_loss_mlp": 0.00333646, + "balance_loss_clip": 1.18819463, + "balance_loss_mlp": 0.28672522, + "epoch": 0.022245603487148654, + "flos": 25193203142400.0, + "grad_norm": 386.12195641403605, + "language_loss": 0.92814636, + "learning_rate": 3.8074218223377844e-06, + "loss": 0.94660842, + "num_input_tokens_seen": 7727160, + "router_z_loss_clip": 3.25, + "router_z_loss_mlp": 0.46948242, + "step": 370, + "time_per_iteration": 2.7185535430908203 + }, + { + "auxiliary_loss_clip": 0.01517399, + "auxiliary_loss_mlp": 0.00363723, + "balance_loss_clip": 1.18639851, + "balance_loss_mlp": 0.3199493, + "epoch": 0.022305726739816623, + "flos": 21395936954880.0, + "grad_norm": 18.86696379815136, + "language_loss": 0.89356893, + "learning_rate": 3.8091596160607834e-06, + "loss": 0.91238016, + "num_input_tokens_seen": 7747730, + "router_z_loss_clip": 3.31054688, + "router_z_loss_mlp": 0.43774414, + "step": 371, + "time_per_iteration": 2.6834590435028076 + }, + { + "auxiliary_loss_clip": 0.01527466, + "auxiliary_loss_mlp": 0.00353966, + "balance_loss_clip": 1.19729149, + "balance_loss_mlp": 0.30706865, + "epoch": 0.022365849992484595, + "flos": 22492146170880.0, + "grad_norm": 7334.19949890276, + "language_loss": 0.92881465, + "learning_rate": 3.8108927320022896e-06, + "loss": 0.94762897, + "num_input_tokens_seen": 7766765, + "router_z_loss_clip": 3.3046875, + "router_z_loss_mlp": 0.46923828, + "step": 372, + "time_per_iteration": 2.6618363857269287 + }, + { + "auxiliary_loss_clip": 0.01514259, + "auxiliary_loss_mlp": 0.00317182, + "balance_loss_clip": 1.19431376, + "balance_loss_mlp": 0.27305084, + "epoch": 0.022425973245152563, + "flos": 17856581397120.0, + "grad_norm": 42.379007433544956, + "language_loss": 0.90764785, + "learning_rate": 3.8126211952779548e-06, + "loss": 0.92596233, + "num_input_tokens_seen": 7784010, + "router_z_loss_clip": 3.20117188, + "router_z_loss_mlp": 0.44116211, + "step": 373, + "time_per_iteration": 3.0775949954986572 + }, + { + "auxiliary_loss_clip": 0.01521169, + "auxiliary_loss_mlp": 0.00319969, + "balance_loss_clip": 1.19598556, + "balance_loss_mlp": 0.27476451, + "epoch": 0.022486096497820532, + "flos": 15483029656320.0, + "grad_norm": 12.264853926991645, + "language_loss": 0.90925711, + "learning_rate": 3.8143450308016952e-06, + "loss": 0.92766851, + "num_input_tokens_seen": 7801305, + "router_z_loss_clip": 3.25, + "router_z_loss_mlp": 0.45214844, + "step": 374, + "time_per_iteration": 2.8823177814483643 + }, + { + "auxiliary_loss_clip": 0.0151253, + "auxiliary_loss_mlp": 0.00329569, + "balance_loss_clip": 1.19073188, + "balance_loss_mlp": 0.28319687, + "epoch": 0.0225462197504885, + "flos": 27784157950080.0, + "grad_norm": 25.789442353711863, + "language_loss": 0.9232924, + "learning_rate": 3.8160642632878525e-06, + "loss": 0.94171345, + "num_input_tokens_seen": 7823965, + "router_z_loss_clip": 3.21484375, + "router_z_loss_mlp": 0.46362305, + "step": 375, + "time_per_iteration": 2.708775281906128 + }, + { + "auxiliary_loss_clip": 0.01538233, + "auxiliary_loss_mlp": 0.00341524, + "balance_loss_clip": 1.21063495, + "balance_loss_mlp": 0.2903598, + "epoch": 0.02260634300315647, + "flos": 19975490645760.0, + "grad_norm": 24.57225782089914, + "language_loss": 0.96388578, + "learning_rate": 3.817778917253314e-06, + "loss": 0.98268342, + "num_input_tokens_seen": 7842115, + "router_z_loss_clip": 3.27734375, + "router_z_loss_mlp": 0.51196289, + "step": 376, + "time_per_iteration": 2.6269731521606445 + }, + { + "auxiliary_loss_clip": 0.01549991, + "auxiliary_loss_mlp": 0.00361838, + "balance_loss_clip": 1.21508563, + "balance_loss_mlp": 0.31758755, + "epoch": 0.02266646625582444, + "flos": 16028189349120.0, + "grad_norm": 38.46674291964797, + "language_loss": 0.85004634, + "learning_rate": 3.8194890170196155e-06, + "loss": 0.86916459, + "num_input_tokens_seen": 7857830, + "router_z_loss_clip": 3.34960938, + "router_z_loss_mlp": 0.44238281, + "step": 377, + "time_per_iteration": 2.6011924743652344 + }, + { + "auxiliary_loss_clip": 0.01543211, + "auxiliary_loss_mlp": 0.00326384, + "balance_loss_clip": 1.21542931, + "balance_loss_mlp": 0.28323036, + "epoch": 0.02272658950849241, + "flos": 20404622430720.0, + "grad_norm": 39.127833122263, + "language_loss": 1.07027102, + "learning_rate": 3.8211945867150055e-06, + "loss": 1.08896685, + "num_input_tokens_seen": 7875840, + "router_z_loss_clip": 3.27734375, + "router_z_loss_mlp": 0.43164062, + "step": 378, + "time_per_iteration": 2.6778993606567383 + }, + { + "auxiliary_loss_clip": 0.01360279, + "auxiliary_loss_mlp": 0.00309604, + "balance_loss_clip": 1.1094451, + "balance_loss_mlp": 0.29453617, + "epoch": 0.02278671276116038, + "flos": 69847332647040.0, + "grad_norm": 1.1115497447768135, + "language_loss": 0.75309598, + "learning_rate": 3.822895650276492e-06, + "loss": 0.76979476, + "num_input_tokens_seen": 7940190, + "router_z_loss_clip": 2.5, + "router_z_loss_mlp": 0.15039062, + "step": 379, + "time_per_iteration": 3.212510585784912 + }, + { + "auxiliary_loss_clip": 0.01575326, + "auxiliary_loss_mlp": 0.00360175, + "balance_loss_clip": 1.22719979, + "balance_loss_mlp": 0.3130154, + "epoch": 0.022846836013828347, + "flos": 38508771340800.0, + "grad_norm": 5886.55427360622, + "language_loss": 0.86944425, + "learning_rate": 3.824592231451859e-06, + "loss": 0.88879931, + "num_input_tokens_seen": 7960840, + "router_z_loss_clip": 3.484375, + "router_z_loss_mlp": 0.47192383, + "step": 380, + "time_per_iteration": 2.7725703716278076 + }, + { + "auxiliary_loss_clip": 0.01570244, + "auxiliary_loss_mlp": 0.0033501, + "balance_loss_clip": 1.23151445, + "balance_loss_mlp": 0.28985393, + "epoch": 0.02290695926649632, + "flos": 20959478795520.0, + "grad_norm": 517.2507037819446, + "language_loss": 1.06674385, + "learning_rate": 3.826284353801652e-06, + "loss": 1.08579659, + "num_input_tokens_seen": 7975500, + "router_z_loss_clip": 3.38867188, + "router_z_loss_mlp": 0.4519043, + "step": 381, + "time_per_iteration": 2.6238043308258057 + }, + { + "auxiliary_loss_clip": 0.01562872, + "auxiliary_loss_mlp": 0.0034918, + "balance_loss_clip": 1.22451746, + "balance_loss_mlp": 0.30323696, + "epoch": 0.022967082519164288, + "flos": 24022407335040.0, + "grad_norm": 19.10157888637605, + "language_loss": 0.95642018, + "learning_rate": 3.827972040701142e-06, + "loss": 0.9755407, + "num_input_tokens_seen": 7993880, + "router_z_loss_clip": 3.3828125, + "router_z_loss_mlp": 0.4597168, + "step": 382, + "time_per_iteration": 2.6926281452178955 + }, + { + "auxiliary_loss_clip": 0.01541195, + "auxiliary_loss_mlp": 0.00345423, + "balance_loss_clip": 1.21146035, + "balance_loss_mlp": 0.30169746, + "epoch": 0.023027205771832256, + "flos": 20997149184000.0, + "grad_norm": 15.047680114401052, + "language_loss": 0.94004655, + "learning_rate": 3.829655315342268e-06, + "loss": 0.95891273, + "num_input_tokens_seen": 8012730, + "router_z_loss_clip": 3.296875, + "router_z_loss_mlp": 0.4375, + "step": 383, + "time_per_iteration": 2.66292405128479 + }, + { + "auxiliary_loss_clip": 0.01509651, + "auxiliary_loss_mlp": 0.00341772, + "balance_loss_clip": 1.18782377, + "balance_loss_mlp": 0.29971534, + "epoch": 0.023087329024500225, + "flos": 21360816432000.0, + "grad_norm": 41.04131544865794, + "language_loss": 0.93791032, + "learning_rate": 3.831334200735543e-06, + "loss": 0.95642447, + "num_input_tokens_seen": 8031275, + "router_z_loss_clip": 3.21484375, + "router_z_loss_mlp": 0.42041016, + "step": 384, + "time_per_iteration": 2.729787588119507 + }, + { + "auxiliary_loss_clip": 0.0150381, + "auxiliary_loss_mlp": 0.00321713, + "balance_loss_clip": 1.1899091, + "balance_loss_mlp": 0.28089529, + "epoch": 0.023147452277168194, + "flos": 21872435800320.0, + "grad_norm": 10.47499622636741, + "language_loss": 0.96072507, + "learning_rate": 3.8330087197119426e-06, + "loss": 0.9789803, + "num_input_tokens_seen": 8051600, + "router_z_loss_clip": 3.13867188, + "router_z_loss_mlp": 0.40844727, + "step": 385, + "time_per_iteration": 2.6384949684143066 + }, + { + "auxiliary_loss_clip": 0.01498257, + "auxiliary_loss_mlp": 0.00337882, + "balance_loss_clip": 1.1809392, + "balance_loss_mlp": 0.29773244, + "epoch": 0.023207575529836166, + "flos": 18916700423040.0, + "grad_norm": 59.67904665395745, + "language_loss": 0.76697737, + "learning_rate": 3.83467889492477e-06, + "loss": 0.78533876, + "num_input_tokens_seen": 8070600, + "router_z_loss_clip": 3.171875, + "router_z_loss_mlp": 0.40136719, + "step": 386, + "time_per_iteration": 2.6650912761688232 + }, + { + "auxiliary_loss_clip": 0.01492622, + "auxiliary_loss_mlp": 0.0036505, + "balance_loss_clip": 1.16879439, + "balance_loss_mlp": 0.32406634, + "epoch": 0.023267698782504134, + "flos": 25046005207680.0, + "grad_norm": 15.321456637892089, + "language_loss": 0.94538677, + "learning_rate": 3.836344748851495e-06, + "loss": 0.96396351, + "num_input_tokens_seen": 8090680, + "router_z_loss_clip": 3.24023438, + "router_z_loss_mlp": 0.40991211, + "step": 387, + "time_per_iteration": 2.7699034214019775 + }, + { + "auxiliary_loss_clip": 0.01488843, + "auxiliary_loss_mlp": 0.00353733, + "balance_loss_clip": 1.16618443, + "balance_loss_mlp": 0.31093735, + "epoch": 0.023327822035172103, + "flos": 28879217930880.0, + "grad_norm": 167.72664415642237, + "language_loss": 0.90978992, + "learning_rate": 3.838006303795566e-06, + "loss": 0.92821574, + "num_input_tokens_seen": 8114610, + "router_z_loss_clip": 3.2265625, + "router_z_loss_mlp": 0.42773438, + "step": 388, + "time_per_iteration": 2.7591171264648438 + }, + { + "auxiliary_loss_clip": 0.01488672, + "auxiliary_loss_mlp": 0.00386302, + "balance_loss_clip": 1.1630466, + "balance_loss_mlp": 0.34457856, + "epoch": 0.02338794528784007, + "flos": 27121533805440.0, + "grad_norm": 122.31283401229604, + "language_loss": 1.03253531, + "learning_rate": 3.839663581888206e-06, + "loss": 1.05128515, + "num_input_tokens_seen": 8133975, + "router_z_loss_clip": 3.2578125, + "router_z_loss_mlp": 0.41772461, + "step": 389, + "time_per_iteration": 2.708311080932617 + }, + { + "auxiliary_loss_clip": 0.0146032, + "auxiliary_loss_mlp": 0.00322361, + "balance_loss_clip": 1.15428615, + "balance_loss_mlp": 0.28290242, + "epoch": 0.02344806854050804, + "flos": 21322355944320.0, + "grad_norm": 77.0883118132372, + "language_loss": 0.96493512, + "learning_rate": 3.841316605090178e-06, + "loss": 0.98276198, + "num_input_tokens_seen": 8153570, + "router_z_loss_clip": 3.0625, + "router_z_loss_mlp": 0.39453125, + "step": 390, + "time_per_iteration": 2.702151298522949 + }, + { + "auxiliary_loss_clip": 0.01462422, + "auxiliary_loss_mlp": 0.00365464, + "balance_loss_clip": 1.15242827, + "balance_loss_mlp": 0.32564798, + "epoch": 0.023508191793176012, + "flos": 24789997998720.0, + "grad_norm": 10.148867398654644, + "language_loss": 1.01097083, + "learning_rate": 3.842965395193529e-06, + "loss": 1.02924967, + "num_input_tokens_seen": 8170075, + "router_z_loss_clip": 3.1015625, + "router_z_loss_mlp": 0.39868164, + "step": 391, + "time_per_iteration": 2.683629035949707 + }, + { + "auxiliary_loss_clip": 0.01465617, + "auxiliary_loss_mlp": 0.00357577, + "balance_loss_clip": 1.15055084, + "balance_loss_mlp": 0.31821409, + "epoch": 0.02356831504584398, + "flos": 25995375624960.0, + "grad_norm": 62.51213740886254, + "language_loss": 0.94825232, + "learning_rate": 3.84460997382332e-06, + "loss": 0.96648425, + "num_input_tokens_seen": 8190420, + "router_z_loss_clip": 3.1484375, + "router_z_loss_mlp": 0.39379883, + "step": 392, + "time_per_iteration": 2.6983795166015625 + }, + { + "auxiliary_loss_clip": 0.01436542, + "auxiliary_loss_mlp": 0.00344158, + "balance_loss_clip": 1.14328551, + "balance_loss_mlp": 0.30617833, + "epoch": 0.02362843829851195, + "flos": 19062461813760.0, + "grad_norm": 247.14878208640872, + "language_loss": 0.98110199, + "learning_rate": 3.8462503624393256e-06, + "loss": 0.99890906, + "num_input_tokens_seen": 8208790, + "router_z_loss_clip": 2.93164062, + "router_z_loss_mlp": 0.37988281, + "step": 393, + "time_per_iteration": 2.6159582138061523 + }, + { + "auxiliary_loss_clip": 0.01453291, + "auxiliary_loss_mlp": 0.00359151, + "balance_loss_clip": 1.15109539, + "balance_loss_mlp": 0.32143313, + "epoch": 0.023688561551179918, + "flos": 16071031296000.0, + "grad_norm": 43.41067005792844, + "language_loss": 0.89193499, + "learning_rate": 3.84788658233771e-06, + "loss": 0.91005933, + "num_input_tokens_seen": 8226885, + "router_z_loss_clip": 3.0234375, + "router_z_loss_mlp": 0.37719727, + "step": 394, + "time_per_iteration": 2.6035096645355225 + }, + { + "auxiliary_loss_clip": 0.01440423, + "auxiliary_loss_mlp": 0.00344552, + "balance_loss_clip": 1.13887262, + "balance_loss_mlp": 0.30490261, + "epoch": 0.023748684803847887, + "flos": 21724375939200.0, + "grad_norm": 192.5227778711521, + "language_loss": 0.93150342, + "learning_rate": 3.84951865465269e-06, + "loss": 0.94935322, + "num_input_tokens_seen": 8246825, + "router_z_loss_clip": 3.015625, + "router_z_loss_mlp": 0.39672852, + "step": 395, + "time_per_iteration": 2.679187774658203 + }, + { + "auxiliary_loss_clip": 0.01305619, + "auxiliary_loss_mlp": 0.00313634, + "balance_loss_clip": 1.06917858, + "balance_loss_mlp": 0.29856643, + "epoch": 0.02380880805651586, + "flos": 61926192881280.0, + "grad_norm": 1.4795646620529492, + "language_loss": 0.63490987, + "learning_rate": 3.851146600358172e-06, + "loss": 0.65110242, + "num_input_tokens_seen": 8302835, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.15039062, + "step": 396, + "time_per_iteration": 3.0325188636779785 + }, + { + "auxiliary_loss_clip": 0.01441148, + "auxiliary_loss_mlp": 0.00343902, + "balance_loss_clip": 1.13871348, + "balance_loss_mlp": 0.30630368, + "epoch": 0.023868931309183827, + "flos": 20266331068800.0, + "grad_norm": 23.64073367094155, + "language_loss": 0.95396185, + "learning_rate": 3.852770440269372e-06, + "loss": 0.97181237, + "num_input_tokens_seen": 8320745, + "router_z_loss_clip": 3.0234375, + "router_z_loss_mlp": 0.37597656, + "step": 397, + "time_per_iteration": 2.6354172229766846 + }, + { + "auxiliary_loss_clip": 0.01454733, + "auxiliary_loss_mlp": 0.00381303, + "balance_loss_clip": 1.14698672, + "balance_loss_mlp": 0.34174931, + "epoch": 0.023929054561851796, + "flos": 21139103733120.0, + "grad_norm": 54.2604897699147, + "language_loss": 0.95084643, + "learning_rate": 3.854390195044404e-06, + "loss": 0.96920675, + "num_input_tokens_seen": 8339540, + "router_z_loss_clip": 3.078125, + "router_z_loss_mlp": 0.39575195, + "step": 398, + "time_per_iteration": 4.0485146045684814 + }, + { + "auxiliary_loss_clip": 0.01452601, + "auxiliary_loss_mlp": 0.00386179, + "balance_loss_clip": 1.1419481, + "balance_loss_mlp": 0.34400269, + "epoch": 0.023989177814519765, + "flos": 13698521049600.0, + "grad_norm": 16.771937191662516, + "language_loss": 0.98184246, + "learning_rate": 3.856005885185868e-06, + "loss": 1.00023031, + "num_input_tokens_seen": 8354890, + "router_z_loss_clip": 3.10546875, + "router_z_loss_mlp": 0.42163086, + "step": 399, + "time_per_iteration": 5.570476055145264 + }, + { + "auxiliary_loss_clip": 0.01426035, + "auxiliary_loss_mlp": 0.00319738, + "balance_loss_clip": 1.13257658, + "balance_loss_mlp": 0.28142446, + "epoch": 0.024049301067187733, + "flos": 26322018929280.0, + "grad_norm": 10.568282220440201, + "language_loss": 0.93230581, + "learning_rate": 3.857617531042398e-06, + "loss": 0.94976354, + "num_input_tokens_seen": 8375845, + "router_z_loss_clip": 2.93359375, + "router_z_loss_mlp": 0.38330078, + "step": 400, + "time_per_iteration": 2.676722764968872 + }, + { + "auxiliary_loss_clip": 0.01431544, + "auxiliary_loss_mlp": 0.00318261, + "balance_loss_clip": 1.13156176, + "balance_loss_mlp": 0.28078112, + "epoch": 0.024109424319855705, + "flos": 24425432910720.0, + "grad_norm": 24.00082735376984, + "language_loss": 0.86950696, + "learning_rate": 3.8592251528102065e-06, + "loss": 0.88700497, + "num_input_tokens_seen": 8395240, + "router_z_loss_clip": 3.00195312, + "router_z_loss_mlp": 0.375, + "step": 401, + "time_per_iteration": 2.6696531772613525 + }, + { + "auxiliary_loss_clip": 0.01441826, + "auxiliary_loss_mlp": 0.00322226, + "balance_loss_clip": 1.13702631, + "balance_loss_mlp": 0.28198087, + "epoch": 0.024169547572523674, + "flos": 29604397610880.0, + "grad_norm": 3.5523075894526444, + "language_loss": 0.9022274, + "learning_rate": 3.8608287705345976e-06, + "loss": 0.91986793, + "num_input_tokens_seen": 8416950, + "router_z_loss_clip": 3.04492188, + "router_z_loss_mlp": 0.40258789, + "step": 402, + "time_per_iteration": 2.6798934936523438 + }, + { + "auxiliary_loss_clip": 0.01476831, + "auxiliary_loss_mlp": 0.0039759, + "balance_loss_clip": 1.14419782, + "balance_loss_mlp": 0.35171807, + "epoch": 0.024229670825191642, + "flos": 22601458235520.0, + "grad_norm": 20.24102196538997, + "language_loss": 1.04564536, + "learning_rate": 3.86242840411147e-06, + "loss": 1.06438947, + "num_input_tokens_seen": 8433660, + "router_z_loss_clip": 3.32617188, + "router_z_loss_mlp": 0.45849609, + "step": 403, + "time_per_iteration": 2.6480603218078613 + }, + { + "auxiliary_loss_clip": 0.01468691, + "auxiliary_loss_mlp": 0.00362143, + "balance_loss_clip": 1.14455867, + "balance_loss_mlp": 0.31829736, + "epoch": 0.02428979407785961, + "flos": 18150258994560.0, + "grad_norm": 124.81872321059176, + "language_loss": 1.07451332, + "learning_rate": 3.864024073288798e-06, + "loss": 1.0928216, + "num_input_tokens_seen": 8450180, + "router_z_loss_clip": 3.24023438, + "router_z_loss_mlp": 0.43823242, + "step": 404, + "time_per_iteration": 2.6177539825439453 + }, + { + "auxiliary_loss_clip": 0.01497079, + "auxiliary_loss_mlp": 0.00383814, + "balance_loss_clip": 1.16568434, + "balance_loss_mlp": 0.33882481, + "epoch": 0.024349917330527583, + "flos": 15304984917120.0, + "grad_norm": 11.914263017202467, + "language_loss": 0.98148525, + "learning_rate": 3.865615797668091e-06, + "loss": 1.00029421, + "num_input_tokens_seen": 8467775, + "router_z_loss_clip": 3.31640625, + "router_z_loss_mlp": 0.44995117, + "step": 405, + "time_per_iteration": 2.6843278408050537 + }, + { + "auxiliary_loss_clip": 0.01567696, + "auxiliary_loss_mlp": 0.00429368, + "balance_loss_clip": 1.20863152, + "balance_loss_mlp": 0.38144609, + "epoch": 0.024410040583195552, + "flos": 20773892200320.0, + "grad_norm": 24.611002263020453, + "language_loss": 1.01854444, + "learning_rate": 3.867203596705844e-06, + "loss": 1.03851509, + "num_input_tokens_seen": 8486765, + "router_z_loss_clip": 3.59179688, + "router_z_loss_mlp": 0.47900391, + "step": 406, + "time_per_iteration": 2.648547410964966 + }, + { + "auxiliary_loss_clip": 0.01576812, + "auxiliary_loss_mlp": 0.00427859, + "balance_loss_clip": 1.21369028, + "balance_loss_mlp": 0.37822077, + "epoch": 0.02447016383586352, + "flos": 21798854789760.0, + "grad_norm": 53.63628982535537, + "language_loss": 0.95902008, + "learning_rate": 3.86878748971496e-06, + "loss": 0.97906685, + "num_input_tokens_seen": 8506515, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 0.49658203, + "step": 407, + "time_per_iteration": 2.650451183319092 + }, + { + "auxiliary_loss_clip": 0.01601904, + "auxiliary_loss_mlp": 0.00436098, + "balance_loss_clip": 1.22742486, + "balance_loss_mlp": 0.39106107, + "epoch": 0.02453028708853149, + "flos": 33948116380800.0, + "grad_norm": 13.620053661620686, + "language_loss": 0.80098253, + "learning_rate": 3.8703674958661596e-06, + "loss": 0.82136256, + "num_input_tokens_seen": 8528035, + "router_z_loss_clip": 3.74609375, + "router_z_loss_mlp": 0.45019531, + "step": 408, + "time_per_iteration": 2.7871968746185303 + }, + { + "auxiliary_loss_clip": 0.01658732, + "auxiliary_loss_mlp": 0.00453093, + "balance_loss_clip": 1.25224447, + "balance_loss_mlp": 0.40285772, + "epoch": 0.024590410341199458, + "flos": 21793000872960.0, + "grad_norm": 8.531554986303604, + "language_loss": 1.01906967, + "learning_rate": 3.871943634189376e-06, + "loss": 1.04018784, + "num_input_tokens_seen": 8546455, + "router_z_loss_clip": 4.06640625, + "router_z_loss_mlp": 0.50268555, + "step": 409, + "time_per_iteration": 2.6576178073883057 + }, + { + "auxiliary_loss_clip": 0.01692906, + "auxiliary_loss_mlp": 0.00460758, + "balance_loss_clip": 1.26603675, + "balance_loss_mlp": 0.41214493, + "epoch": 0.02465053359386743, + "flos": 35114782124160.0, + "grad_norm": 393.06278710061054, + "language_loss": 0.89926088, + "learning_rate": 3.873515923575128e-06, + "loss": 0.92079759, + "num_input_tokens_seen": 8568450, + "router_z_loss_clip": 4.27734375, + "router_z_loss_mlp": 0.48608398, + "step": 410, + "time_per_iteration": 2.7898566722869873 + }, + { + "auxiliary_loss_clip": 0.01751991, + "auxiliary_loss_mlp": 0.00478314, + "balance_loss_clip": 1.29831076, + "balance_loss_mlp": 0.42681584, + "epoch": 0.0247106568465354, + "flos": 27451409333760.0, + "grad_norm": 48.13319885688254, + "language_loss": 0.85088003, + "learning_rate": 3.875084382775879e-06, + "loss": 0.87318313, + "num_input_tokens_seen": 8589340, + "router_z_loss_clip": 4.54296875, + "router_z_loss_mlp": 0.51586914, + "step": 411, + "time_per_iteration": 2.679041862487793 + }, + { + "auxiliary_loss_clip": 0.0175643, + "auxiliary_loss_mlp": 0.00468497, + "balance_loss_clip": 1.29576516, + "balance_loss_mlp": 0.41485265, + "epoch": 0.024770780099203367, + "flos": 20703794808960.0, + "grad_norm": 169.38057031166306, + "language_loss": 0.96551132, + "learning_rate": 3.87664903040738e-06, + "loss": 0.9877606, + "num_input_tokens_seen": 8607150, + "router_z_loss_clip": 4.609375, + "router_z_loss_mlp": 0.53637695, + "step": 412, + "time_per_iteration": 2.7272403240203857 + }, + { + "auxiliary_loss_clip": 0.01536049, + "auxiliary_loss_mlp": 0.00260981, + "balance_loss_clip": 1.20265138, + "balance_loss_mlp": 0.24658008, + "epoch": 0.024830903351871336, + "flos": 69551859369600.0, + "grad_norm": 0.8399841194984358, + "language_loss": 0.58072162, + "learning_rate": 3.878209884949994e-06, + "loss": 0.59869188, + "num_input_tokens_seen": 8669865, + "router_z_loss_clip": 3.328125, + "router_z_loss_mlp": 0.14355469, + "step": 413, + "time_per_iteration": 3.256260633468628 + }, + { + "auxiliary_loss_clip": 0.01801652, + "auxiliary_loss_mlp": 0.00468195, + "balance_loss_clip": 1.30706239, + "balance_loss_mlp": 0.41047412, + "epoch": 0.024891026604539304, + "flos": 32270477713920.0, + "grad_norm": 15.593118246125881, + "language_loss": 0.87347746, + "learning_rate": 3.879766964750006e-06, + "loss": 0.89617592, + "num_input_tokens_seen": 8690235, + "router_z_loss_clip": 4.94921875, + "router_z_loss_mlp": 0.5769043, + "step": 414, + "time_per_iteration": 2.769810676574707 + }, + { + "auxiliary_loss_clip": 0.01826005, + "auxiliary_loss_mlp": 0.00453289, + "balance_loss_clip": 1.32519507, + "balance_loss_mlp": 0.40188599, + "epoch": 0.024951149857207276, + "flos": 18840282238080.0, + "grad_norm": 31.937992567950268, + "language_loss": 0.88616091, + "learning_rate": 3.881320288020917e-06, + "loss": 0.90895385, + "num_input_tokens_seen": 8706295, + "router_z_loss_clip": 5.01171875, + "router_z_loss_mlp": 0.51391602, + "step": 415, + "time_per_iteration": 2.6401093006134033 + }, + { + "auxiliary_loss_clip": 0.0187456, + "auxiliary_loss_mlp": 0.00468046, + "balance_loss_clip": 1.33668232, + "balance_loss_mlp": 0.4147594, + "epoch": 0.025011273109875245, + "flos": 15377201210880.0, + "grad_norm": 44.13028506879389, + "language_loss": 1.08054113, + "learning_rate": 3.882869872844723e-06, + "loss": 1.10396719, + "num_input_tokens_seen": 8724200, + "router_z_loss_clip": 5.375, + "router_z_loss_mlp": 0.53320312, + "step": 416, + "time_per_iteration": 2.7485015392303467 + }, + { + "auxiliary_loss_clip": 0.01886013, + "auxiliary_loss_mlp": 0.00512743, + "balance_loss_clip": 1.33771777, + "balance_loss_mlp": 0.45371011, + "epoch": 0.025071396362543213, + "flos": 18915515274240.0, + "grad_norm": 24.57796699676221, + "language_loss": 0.8260203, + "learning_rate": 3.884415737173176e-06, + "loss": 0.85000789, + "num_input_tokens_seen": 8744170, + "router_z_loss_clip": 5.48046875, + "router_z_loss_mlp": 0.58959961, + "step": 417, + "time_per_iteration": 2.7337136268615723 + }, + { + "auxiliary_loss_clip": 0.01886415, + "auxiliary_loss_mlp": 0.00462626, + "balance_loss_clip": 1.35136271, + "balance_loss_mlp": 0.40812334, + "epoch": 0.025131519615211182, + "flos": 25337958952320.0, + "grad_norm": 56.44568362393487, + "language_loss": 0.84310174, + "learning_rate": 3.8859578988290344e-06, + "loss": 0.86659217, + "num_input_tokens_seen": 8765120, + "router_z_loss_clip": 5.3515625, + "router_z_loss_mlp": 0.54516602, + "step": 418, + "time_per_iteration": 2.6680052280426025 + }, + { + "auxiliary_loss_clip": 0.01905592, + "auxiliary_loss_mlp": 0.00448095, + "balance_loss_clip": 1.35417175, + "balance_loss_mlp": 0.39621529, + "epoch": 0.02519164286787915, + "flos": 18953149749120.0, + "grad_norm": 287.0761373730547, + "language_loss": 0.93945718, + "learning_rate": 3.887496375507294e-06, + "loss": 0.96299404, + "num_input_tokens_seen": 8783500, + "router_z_loss_clip": 5.51171875, + "router_z_loss_mlp": 0.51855469, + "step": 419, + "time_per_iteration": 2.6204447746276855 + }, + { + "auxiliary_loss_clip": 0.01892131, + "auxiliary_loss_mlp": 0.00465204, + "balance_loss_clip": 1.34680414, + "balance_loss_mlp": 0.41201332, + "epoch": 0.025251766120547123, + "flos": 17421092904960.0, + "grad_norm": 9.219796122688148, + "language_loss": 0.81337059, + "learning_rate": 3.8890311847764065e-06, + "loss": 0.83694398, + "num_input_tokens_seen": 8801175, + "router_z_loss_clip": 5.453125, + "router_z_loss_mlp": 0.53271484, + "step": 420, + "time_per_iteration": 2.6334424018859863 + }, + { + "auxiliary_loss_clip": 0.01915238, + "auxiliary_loss_mlp": 0.0046521, + "balance_loss_clip": 1.34768367, + "balance_loss_mlp": 0.41476095, + "epoch": 0.02531188937321509, + "flos": 25045430590080.0, + "grad_norm": 15.69342414969331, + "language_loss": 0.86411136, + "learning_rate": 3.890562344079484e-06, + "loss": 0.88791585, + "num_input_tokens_seen": 8820215, + "router_z_loss_clip": 5.66796875, + "router_z_loss_mlp": 0.50439453, + "step": 421, + "time_per_iteration": 2.757634162902832 + }, + { + "auxiliary_loss_clip": 0.01919809, + "auxiliary_loss_mlp": 0.0047349, + "balance_loss_clip": 1.35392189, + "balance_loss_mlp": 0.42161012, + "epoch": 0.02537201262588306, + "flos": 30592228515840.0, + "grad_norm": 79.02758993556142, + "language_loss": 0.90216053, + "learning_rate": 3.89208987073549e-06, + "loss": 0.92609352, + "num_input_tokens_seen": 8839660, + "router_z_loss_clip": 5.6640625, + "router_z_loss_mlp": 0.51806641, + "step": 422, + "time_per_iteration": 2.725576877593994 + }, + { + "auxiliary_loss_clip": 0.01934553, + "auxiliary_loss_mlp": 0.00454766, + "balance_loss_clip": 1.35145044, + "balance_loss_mlp": 0.4050315, + "epoch": 0.02543213587855103, + "flos": 26065365275520.0, + "grad_norm": 87.63871128429972, + "language_loss": 0.89437306, + "learning_rate": 3.893613781940409e-06, + "loss": 0.91826624, + "num_input_tokens_seen": 8859280, + "router_z_loss_clip": 5.83984375, + "router_z_loss_mlp": 0.49755859, + "step": 423, + "time_per_iteration": 2.715895414352417 + }, + { + "auxiliary_loss_clip": 0.01953064, + "auxiliary_loss_mlp": 0.00486886, + "balance_loss_clip": 1.36693192, + "balance_loss_mlp": 0.4337188, + "epoch": 0.025492259131218997, + "flos": 36022818965760.0, + "grad_norm": 2921.2046451277474, + "language_loss": 0.79795778, + "learning_rate": 3.895134094768415e-06, + "loss": 0.8223573, + "num_input_tokens_seen": 8880560, + "router_z_loss_clip": 5.86328125, + "router_z_loss_mlp": 0.53173828, + "step": 424, + "time_per_iteration": 2.753077268600464 + }, + { + "auxiliary_loss_clip": 0.01960533, + "auxiliary_loss_mlp": 0.00488344, + "balance_loss_clip": 1.36884391, + "balance_loss_mlp": 0.43829975, + "epoch": 0.02555238238388697, + "flos": 18588045957120.0, + "grad_norm": 9.224667483097814, + "language_loss": 0.91462213, + "learning_rate": 3.896650826173015e-06, + "loss": 0.93911093, + "num_input_tokens_seen": 8899155, + "router_z_loss_clip": 5.9140625, + "router_z_loss_mlp": 0.50073242, + "step": 425, + "time_per_iteration": 2.645948648452759 + }, + { + "auxiliary_loss_clip": 0.01958062, + "auxiliary_loss_mlp": 0.00490973, + "balance_loss_clip": 1.35766709, + "balance_loss_mlp": 0.43673301, + "epoch": 0.025612505636554938, + "flos": 24243186280320.0, + "grad_norm": 63.03664372839104, + "language_loss": 0.91528869, + "learning_rate": 3.898163992988186e-06, + "loss": 0.93977904, + "num_input_tokens_seen": 8917890, + "router_z_loss_clip": 6.00390625, + "router_z_loss_mlp": 0.54272461, + "step": 426, + "time_per_iteration": 2.6336944103240967 + }, + { + "auxiliary_loss_clip": 0.01670869, + "auxiliary_loss_mlp": 0.00431966, + "balance_loss_clip": 1.29185009, + "balance_loss_mlp": 0.41546726, + "epoch": 0.025672628889222907, + "flos": 60586941265920.0, + "grad_norm": 0.90655101929836, + "language_loss": 0.56794798, + "learning_rate": 3.899673611929491e-06, + "loss": 0.58897638, + "num_input_tokens_seen": 8978260, + "router_z_loss_clip": 3.78125, + "router_z_loss_mlp": 0.16503906, + "step": 427, + "time_per_iteration": 3.297841787338257 + }, + { + "auxiliary_loss_clip": 0.01969685, + "auxiliary_loss_mlp": 0.00493751, + "balance_loss_clip": 1.3699851, + "balance_loss_mlp": 0.44315836, + "epoch": 0.025732752141890875, + "flos": 19573255169280.0, + "grad_norm": 10.304390367867011, + "language_loss": 0.95777792, + "learning_rate": 3.901179699595194e-06, + "loss": 0.98241228, + "num_input_tokens_seen": 8994460, + "router_z_loss_clip": 6.0, + "router_z_loss_mlp": 0.5065918, + "step": 428, + "time_per_iteration": 2.6635453701019287 + }, + { + "auxiliary_loss_clip": 0.01925572, + "auxiliary_loss_mlp": 0.0048904, + "balance_loss_clip": 1.35072351, + "balance_loss_mlp": 0.43642142, + "epoch": 0.025792875394558847, + "flos": 31284262920960.0, + "grad_norm": 47.04621176664694, + "language_loss": 0.91731411, + "learning_rate": 3.902682272467353e-06, + "loss": 0.94146025, + "num_input_tokens_seen": 9016670, + "router_z_loss_clip": 5.75, + "router_z_loss_mlp": 0.52661133, + "step": 429, + "time_per_iteration": 2.734067678451538 + }, + { + "auxiliary_loss_clip": 0.01946465, + "auxiliary_loss_mlp": 0.00482067, + "balance_loss_clip": 1.35330558, + "balance_loss_mlp": 0.42868459, + "epoch": 0.025852998647226816, + "flos": 32379610210560.0, + "grad_norm": 36.55793556515902, + "language_loss": 0.93248236, + "learning_rate": 3.904181346912895e-06, + "loss": 0.95676768, + "num_input_tokens_seen": 9039720, + "router_z_loss_clip": 5.92578125, + "router_z_loss_mlp": 0.53442383, + "step": 430, + "time_per_iteration": 2.752880573272705 + }, + { + "auxiliary_loss_clip": 0.01915667, + "auxiliary_loss_mlp": 0.00437971, + "balance_loss_clip": 1.35594988, + "balance_loss_mlp": 0.38909552, + "epoch": 0.025913121899894784, + "flos": 20193288762240.0, + "grad_norm": 27.348754325705023, + "language_loss": 0.89985824, + "learning_rate": 3.905676939184698e-06, + "loss": 0.92339456, + "num_input_tokens_seen": 9059850, + "router_z_loss_clip": 5.59375, + "router_z_loss_mlp": 0.48852539, + "step": 431, + "time_per_iteration": 2.6637299060821533 + }, + { + "auxiliary_loss_clip": 0.01938013, + "auxiliary_loss_mlp": 0.00469542, + "balance_loss_clip": 1.36681557, + "balance_loss_mlp": 0.41837755, + "epoch": 0.025973245152562753, + "flos": 14720430983040.0, + "grad_norm": 25.941064600871343, + "language_loss": 0.96615708, + "learning_rate": 3.907169065422638e-06, + "loss": 0.99023259, + "num_input_tokens_seen": 9077590, + "router_z_loss_clip": 5.71484375, + "router_z_loss_mlp": 0.51220703, + "step": 432, + "time_per_iteration": 2.6887614727020264 + }, + { + "auxiliary_loss_clip": 0.01942939, + "auxiliary_loss_mlp": 0.00422268, + "balance_loss_clip": 1.37782359, + "balance_loss_mlp": 0.37777928, + "epoch": 0.02603336840523072, + "flos": 30992991534720.0, + "grad_norm": 66.27044034611329, + "language_loss": 0.83125114, + "learning_rate": 3.908657741654636e-06, + "loss": 0.85490316, + "num_input_tokens_seen": 9099880, + "router_z_loss_clip": 5.65234375, + "router_z_loss_mlp": 0.4453125, + "step": 433, + "time_per_iteration": 2.750136137008667 + }, + { + "auxiliary_loss_clip": 0.0193993, + "auxiliary_loss_mlp": 0.00451461, + "balance_loss_clip": 1.37188828, + "balance_loss_mlp": 0.40315735, + "epoch": 0.026093491657898694, + "flos": 17674262939520.0, + "grad_norm": 99.74953088672527, + "language_loss": 0.97187757, + "learning_rate": 3.910142983797699e-06, + "loss": 0.99579149, + "num_input_tokens_seen": 9118620, + "router_z_loss_clip": 5.6796875, + "router_z_loss_mlp": 0.48291016, + "step": 434, + "time_per_iteration": 2.850921869277954 + }, + { + "auxiliary_loss_clip": 0.01939192, + "auxiliary_loss_mlp": 0.00461281, + "balance_loss_clip": 1.37358069, + "balance_loss_mlp": 0.41307271, + "epoch": 0.026153614910566662, + "flos": 17857874286720.0, + "grad_norm": 731.863262841361, + "language_loss": 0.86682081, + "learning_rate": 3.9116248076589305e-06, + "loss": 0.89082551, + "num_input_tokens_seen": 9135655, + "router_z_loss_clip": 5.66015625, + "router_z_loss_mlp": 0.48266602, + "step": 435, + "time_per_iteration": 2.7048747539520264 + }, + { + "auxiliary_loss_clip": 0.01951588, + "auxiliary_loss_mlp": 0.00486545, + "balance_loss_clip": 1.37643385, + "balance_loss_mlp": 0.43275762, + "epoch": 0.02621373816323463, + "flos": 20011113959040.0, + "grad_norm": 55.174905445967404, + "language_loss": 0.93517882, + "learning_rate": 3.913103228936546e-06, + "loss": 0.95956016, + "num_input_tokens_seen": 9153520, + "router_z_loss_clip": 5.74609375, + "router_z_loss_mlp": 0.5378418, + "step": 436, + "time_per_iteration": 2.700579881668091 + }, + { + "auxiliary_loss_clip": 0.01946465, + "auxiliary_loss_mlp": 0.00456365, + "balance_loss_clip": 1.38551855, + "balance_loss_mlp": 0.40691659, + "epoch": 0.0262738614159026, + "flos": 19281193683840.0, + "grad_norm": 19.81804861277507, + "language_loss": 0.82856333, + "learning_rate": 3.914578263220868e-06, + "loss": 0.85259157, + "num_input_tokens_seen": 9170750, + "router_z_loss_clip": 5.609375, + "router_z_loss_mlp": 0.49462891, + "step": 437, + "time_per_iteration": 2.6619081497192383 + }, + { + "auxiliary_loss_clip": 0.01938884, + "auxiliary_loss_mlp": 0.00480396, + "balance_loss_clip": 1.37848711, + "balance_loss_mlp": 0.43099555, + "epoch": 0.026333984668570568, + "flos": 18807208790400.0, + "grad_norm": 15.902963620949732, + "language_loss": 0.99318653, + "learning_rate": 3.916049925995316e-06, + "loss": 1.01737928, + "num_input_tokens_seen": 9188430, + "router_z_loss_clip": 5.6015625, + "router_z_loss_mlp": 0.49365234, + "step": 438, + "time_per_iteration": 2.644674301147461 + }, + { + "auxiliary_loss_clip": 0.01750927, + "auxiliary_loss_mlp": 0.00454274, + "balance_loss_clip": 1.38240564, + "balance_loss_mlp": 0.43701228, + "epoch": 0.02639410792123854, + "flos": 64572020691840.0, + "grad_norm": 1.4720357441998169, + "language_loss": 0.62343764, + "learning_rate": 3.917518232637377e-06, + "loss": 0.64548969, + "num_input_tokens_seen": 9255835, + "router_z_loss_clip": 3.6875, + "router_z_loss_mlp": 0.17285156, + "step": 439, + "time_per_iteration": 3.3001835346221924 + }, + { + "auxiliary_loss_clip": 0.0194043, + "auxiliary_loss_mlp": 0.00489717, + "balance_loss_clip": 1.38442683, + "balance_loss_mlp": 0.43786147, + "epoch": 0.02645423117390651, + "flos": 28473462921600.0, + "grad_norm": 20.78358588700867, + "language_loss": 0.83345759, + "learning_rate": 3.918983198419573e-06, + "loss": 0.85775912, + "num_input_tokens_seen": 9276835, + "router_z_loss_clip": 5.55859375, + "router_z_loss_mlp": 0.51831055, + "step": 440, + "time_per_iteration": 4.1120805740356445 + }, + { + "auxiliary_loss_clip": 0.01941359, + "auxiliary_loss_mlp": 0.00438517, + "balance_loss_clip": 1.39317942, + "balance_loss_mlp": 0.3917152, + "epoch": 0.026514354426574478, + "flos": 18551237495040.0, + "grad_norm": 2247.6513005122347, + "language_loss": 0.90906537, + "learning_rate": 3.920444838510415e-06, + "loss": 0.93286413, + "num_input_tokens_seen": 9295075, + "router_z_loss_clip": 5.484375, + "router_z_loss_mlp": 0.46826172, + "step": 441, + "time_per_iteration": 5.567206382751465 + }, + { + "auxiliary_loss_clip": 0.01954785, + "auxiliary_loss_mlp": 0.00451507, + "balance_loss_clip": 1.40021086, + "balance_loss_mlp": 0.40260708, + "epoch": 0.026574477679242446, + "flos": 20667812359680.0, + "grad_norm": 506.99370759632956, + "language_loss": 0.849334, + "learning_rate": 3.92190316797534e-06, + "loss": 0.87339687, + "num_input_tokens_seen": 9314205, + "router_z_loss_clip": 5.546875, + "router_z_loss_mlp": 0.48925781, + "step": 442, + "time_per_iteration": 2.626519203186035 + }, + { + "auxiliary_loss_clip": 0.01658803, + "auxiliary_loss_mlp": 0.00217584, + "balance_loss_clip": 1.33796549, + "balance_loss_mlp": 0.2046144, + "epoch": 0.026634600931910415, + "flos": 57956125340160.0, + "grad_norm": 0.9679991114437049, + "language_loss": 0.64158452, + "learning_rate": 3.92335820177765e-06, + "loss": 0.66034842, + "num_input_tokens_seen": 9367395, + "router_z_loss_clip": 3.203125, + "router_z_loss_mlp": 0.12988281, + "step": 443, + "time_per_iteration": 3.059054374694824 + }, + { + "auxiliary_loss_clip": 0.01909781, + "auxiliary_loss_mlp": 0.0044616, + "balance_loss_clip": 1.39098167, + "balance_loss_mlp": 0.39571083, + "epoch": 0.026694724184578387, + "flos": 15815131827840.0, + "grad_norm": 13.290039460576928, + "language_loss": 0.90822178, + "learning_rate": 3.924809954779425e-06, + "loss": 0.93178117, + "num_input_tokens_seen": 9385185, + "router_z_loss_clip": 5.18359375, + "router_z_loss_mlp": 0.50463867, + "step": 444, + "time_per_iteration": 2.6563777923583984 + }, + { + "auxiliary_loss_clip": 0.01898828, + "auxiliary_loss_mlp": 0.00458561, + "balance_loss_clip": 1.38008046, + "balance_loss_mlp": 0.40637124, + "epoch": 0.026754847437246355, + "flos": 23440259612160.0, + "grad_norm": 57.76755443032764, + "language_loss": 1.01194012, + "learning_rate": 3.9262584417424425e-06, + "loss": 1.035514, + "num_input_tokens_seen": 9403225, + "router_z_loss_clip": 5.1875, + "router_z_loss_mlp": 0.52270508, + "step": 445, + "time_per_iteration": 2.692028284072876 + }, + { + "auxiliary_loss_clip": 0.01880676, + "auxiliary_loss_mlp": 0.00410256, + "balance_loss_clip": 1.38432562, + "balance_loss_mlp": 0.36621976, + "epoch": 0.026814970689914324, + "flos": 17341801632000.0, + "grad_norm": 36.98819617157792, + "language_loss": 1.01574707, + "learning_rate": 3.9277036773290725e-06, + "loss": 1.03865635, + "num_input_tokens_seen": 9420540, + "router_z_loss_clip": 4.96875, + "router_z_loss_mlp": 0.44042969, + "step": 446, + "time_per_iteration": 2.5932154655456543 + }, + { + "auxiliary_loss_clip": 0.01862618, + "auxiliary_loss_mlp": 0.00411635, + "balance_loss_clip": 1.38006401, + "balance_loss_mlp": 0.36590642, + "epoch": 0.026875093942582293, + "flos": 17894718662400.0, + "grad_norm": 23.815238136463286, + "language_loss": 0.86685622, + "learning_rate": 3.92914567610317e-06, + "loss": 0.88959873, + "num_input_tokens_seen": 9438840, + "router_z_loss_clip": 4.83203125, + "router_z_loss_mlp": 0.45727539, + "step": 447, + "time_per_iteration": 2.7016611099243164 + }, + { + "auxiliary_loss_clip": 0.01872129, + "auxiliary_loss_mlp": 0.00452209, + "balance_loss_clip": 1.38635755, + "balance_loss_mlp": 0.40378606, + "epoch": 0.026935217195250265, + "flos": 21723980889600.0, + "grad_norm": 94.45400929868842, + "language_loss": 0.94554722, + "learning_rate": 3.930584452530952e-06, + "loss": 0.96879065, + "num_input_tokens_seen": 9457215, + "router_z_loss_clip": 4.85546875, + "router_z_loss_mlp": 0.48388672, + "step": 448, + "time_per_iteration": 2.676861524581909 + }, + { + "auxiliary_loss_clip": 0.01835772, + "auxiliary_loss_mlp": 0.00441551, + "balance_loss_clip": 1.36642444, + "balance_loss_mlp": 0.39689541, + "epoch": 0.026995340447918233, + "flos": 23622685810560.0, + "grad_norm": 7.402532700974451, + "language_loss": 0.94567442, + "learning_rate": 3.9320200209818755e-06, + "loss": 0.96844769, + "num_input_tokens_seen": 9475615, + "router_z_loss_clip": 4.69140625, + "router_z_loss_mlp": 0.44677734, + "step": 449, + "time_per_iteration": 2.75723934173584 + }, + { + "auxiliary_loss_clip": 0.0182571, + "auxiliary_loss_mlp": 0.00420423, + "balance_loss_clip": 1.35864973, + "balance_loss_mlp": 0.37273955, + "epoch": 0.027055463700586202, + "flos": 17931275729280.0, + "grad_norm": 81.60930322476118, + "language_loss": 0.8950932, + "learning_rate": 3.933452395729493e-06, + "loss": 0.9175545, + "num_input_tokens_seen": 9493975, + "router_z_loss_clip": 4.67578125, + "router_z_loss_mlp": 0.47680664, + "step": 450, + "time_per_iteration": 2.7558090686798096 + }, + { + "auxiliary_loss_clip": 0.01811312, + "auxiliary_loss_mlp": 0.00416045, + "balance_loss_clip": 1.360623, + "balance_loss_mlp": 0.37315381, + "epoch": 0.02711558695325417, + "flos": 25118903859840.0, + "grad_norm": 42.85678937337225, + "language_loss": 0.85984957, + "learning_rate": 3.934881590952304e-06, + "loss": 0.88212311, + "num_input_tokens_seen": 9514810, + "router_z_loss_clip": 4.50390625, + "router_z_loss_mlp": 0.42895508, + "step": 451, + "time_per_iteration": 2.6904289722442627 + }, + { + "auxiliary_loss_clip": 0.01789084, + "auxiliary_loss_mlp": 0.00435235, + "balance_loss_clip": 1.3475492, + "balance_loss_mlp": 0.3929871, + "epoch": 0.02717571020592214, + "flos": 24239559006720.0, + "grad_norm": 13.679177810339048, + "language_loss": 0.82072639, + "learning_rate": 3.936307620734599e-06, + "loss": 0.84296966, + "num_input_tokens_seen": 9533635, + "router_z_loss_clip": 4.41015625, + "router_z_loss_mlp": 0.42236328, + "step": 452, + "time_per_iteration": 2.6572868824005127 + }, + { + "auxiliary_loss_clip": 0.01771109, + "auxiliary_loss_mlp": 0.00416559, + "balance_loss_clip": 1.33754241, + "balance_loss_mlp": 0.37452608, + "epoch": 0.02723583345859011, + "flos": 25118939773440.0, + "grad_norm": 1109.0197702294663, + "language_loss": 0.78876501, + "learning_rate": 3.937730499067294e-06, + "loss": 0.81064165, + "num_input_tokens_seen": 9555420, + "router_z_loss_clip": 4.34375, + "router_z_loss_mlp": 0.42016602, + "step": 453, + "time_per_iteration": 2.698025703430176 + }, + { + "auxiliary_loss_clip": 0.01780293, + "auxiliary_loss_mlp": 0.00448948, + "balance_loss_clip": 1.3382268, + "balance_loss_mlp": 0.40479296, + "epoch": 0.02729595671125808, + "flos": 42741597847680.0, + "grad_norm": 232.20089385861013, + "language_loss": 0.89494789, + "learning_rate": 3.939150239848748e-06, + "loss": 0.91724026, + "num_input_tokens_seen": 9578950, + "router_z_loss_clip": 4.41015625, + "router_z_loss_mlp": 0.44116211, + "step": 454, + "time_per_iteration": 2.900243043899536 + }, + { + "auxiliary_loss_clip": 0.0174922, + "auxiliary_loss_mlp": 0.00411056, + "balance_loss_clip": 1.32154298, + "balance_loss_mlp": 0.37045363, + "epoch": 0.02735607996392605, + "flos": 21430985650560.0, + "grad_norm": 23.775145100919225, + "language_loss": 0.81177461, + "learning_rate": 3.9405668568855866e-06, + "loss": 0.8333773, + "num_input_tokens_seen": 9598160, + "router_z_loss_clip": 4.265625, + "router_z_loss_mlp": 0.40600586, + "step": 455, + "time_per_iteration": 2.6646695137023926 + }, + { + "auxiliary_loss_clip": 0.01747247, + "auxiliary_loss_mlp": 0.00433986, + "balance_loss_clip": 1.31197917, + "balance_loss_mlp": 0.39216718, + "epoch": 0.027416203216594017, + "flos": 20851280052480.0, + "grad_norm": 17.454564835644767, + "language_loss": 0.87627399, + "learning_rate": 3.941980363893499e-06, + "loss": 0.89808631, + "num_input_tokens_seen": 9616010, + "router_z_loss_clip": 4.35546875, + "router_z_loss_mlp": 0.41845703, + "step": 456, + "time_per_iteration": 2.7151741981506348 + }, + { + "auxiliary_loss_clip": 0.01733935, + "auxiliary_loss_mlp": 0.00402339, + "balance_loss_clip": 1.31142581, + "balance_loss_mlp": 0.36066338, + "epoch": 0.027476326469261986, + "flos": 13224500242560.0, + "grad_norm": 16.493579754209, + "language_loss": 0.88958716, + "learning_rate": 3.9433907744980384e-06, + "loss": 0.91094989, + "num_input_tokens_seen": 9634000, + "router_z_loss_clip": 4.22265625, + "router_z_loss_mlp": 0.41674805, + "step": 457, + "time_per_iteration": 2.6008777618408203 + }, + { + "auxiliary_loss_clip": 0.01716322, + "auxiliary_loss_mlp": 0.00415937, + "balance_loss_clip": 1.29540324, + "balance_loss_mlp": 0.37080467, + "epoch": 0.027536449721929958, + "flos": 24024526237440.0, + "grad_norm": 40.7736951299237, + "language_loss": 1.00295758, + "learning_rate": 3.944798102235412e-06, + "loss": 1.02428019, + "num_input_tokens_seen": 9653455, + "router_z_loss_clip": 4.2109375, + "router_z_loss_mlp": 0.45141602, + "step": 458, + "time_per_iteration": 2.703333616256714 + }, + { + "auxiliary_loss_clip": 0.01699359, + "auxiliary_loss_mlp": 0.00457321, + "balance_loss_clip": 1.276389, + "balance_loss_mlp": 0.41233116, + "epoch": 0.027596572974597926, + "flos": 13006055681280.0, + "grad_norm": 92.30125692059008, + "language_loss": 0.87570715, + "learning_rate": 3.9462023605532545e-06, + "loss": 0.89727396, + "num_input_tokens_seen": 9669650, + "router_z_loss_clip": 4.23046875, + "router_z_loss_mlp": 0.44995117, + "step": 459, + "time_per_iteration": 2.628692150115967 + }, + { + "auxiliary_loss_clip": 0.0169973, + "auxiliary_loss_mlp": 0.00442701, + "balance_loss_clip": 1.28536344, + "balance_loss_mlp": 0.39749649, + "epoch": 0.027656696227265895, + "flos": 26143076350080.0, + "grad_norm": 39.57679013343044, + "language_loss": 0.88234496, + "learning_rate": 3.947603562811407e-06, + "loss": 0.90376925, + "num_input_tokens_seen": 9691415, + "router_z_loss_clip": 4.15039062, + "router_z_loss_mlp": 0.45214844, + "step": 460, + "time_per_iteration": 2.8593802452087402 + }, + { + "auxiliary_loss_clip": 0.01505137, + "auxiliary_loss_mlp": 0.00693502, + "balance_loss_clip": 1.21468282, + "balance_loss_mlp": 0.66279376, + "epoch": 0.027716819479933864, + "flos": 60697222997760.0, + "grad_norm": 1.554594634949751, + "language_loss": 0.73609626, + "learning_rate": 3.949001722282675e-06, + "loss": 0.75808263, + "num_input_tokens_seen": 9755605, + "router_z_loss_clip": 2.90625, + "router_z_loss_mlp": 0.30664062, + "step": 461, + "time_per_iteration": 3.1300318241119385 + }, + { + "auxiliary_loss_clip": 0.01675542, + "auxiliary_loss_mlp": 0.00465729, + "balance_loss_clip": 1.27117693, + "balance_loss_mlp": 0.42185968, + "epoch": 0.027776942732601832, + "flos": 31211938886400.0, + "grad_norm": 57.232253487117234, + "language_loss": 0.9064694, + "learning_rate": 3.950396852153582e-06, + "loss": 0.92788208, + "num_input_tokens_seen": 9776270, + "router_z_loss_clip": 4.0390625, + "router_z_loss_mlp": 0.4387207, + "step": 462, + "time_per_iteration": 2.7181713581085205 + }, + { + "auxiliary_loss_clip": 0.01676767, + "auxiliary_loss_mlp": 0.00416642, + "balance_loss_clip": 1.263762, + "balance_loss_mlp": 0.37191498, + "epoch": 0.027837065985269804, + "flos": 22674644196480.0, + "grad_norm": 24.095409608614425, + "language_loss": 0.98577988, + "learning_rate": 3.951788965525118e-06, + "loss": 1.00671399, + "num_input_tokens_seen": 9794465, + "router_z_loss_clip": 4.12695312, + "router_z_loss_mlp": 0.44702148, + "step": 463, + "time_per_iteration": 2.648257255554199 + }, + { + "auxiliary_loss_clip": 0.01530792, + "auxiliary_loss_mlp": 0.00279071, + "balance_loss_clip": 1.24043727, + "balance_loss_mlp": 0.25408459, + "epoch": 0.027897189237937773, + "flos": 62182487399040.0, + "grad_norm": 0.8842510867762596, + "language_loss": 0.58598363, + "learning_rate": 3.953178075413476e-06, + "loss": 0.60408223, + "num_input_tokens_seen": 9849685, + "router_z_loss_clip": 2.90625, + "router_z_loss_mlp": 0.24902344, + "step": 464, + "time_per_iteration": 3.1209614276885986 + }, + { + "auxiliary_loss_clip": 0.01707636, + "auxiliary_loss_mlp": 0.00493769, + "balance_loss_clip": 1.26912069, + "balance_loss_mlp": 0.43936178, + "epoch": 0.02795731249060574, + "flos": 24493160004480.0, + "grad_norm": 137.6766620633682, + "language_loss": 0.90081096, + "learning_rate": 3.954564194750784e-06, + "loss": 0.92282498, + "num_input_tokens_seen": 9869505, + "router_z_loss_clip": 4.3828125, + "router_z_loss_mlp": 0.54321289, + "step": 465, + "time_per_iteration": 2.689724922180176 + }, + { + "auxiliary_loss_clip": 0.0168916, + "auxiliary_loss_mlp": 0.00424534, + "balance_loss_clip": 1.26498342, + "balance_loss_mlp": 0.37868589, + "epoch": 0.02801743574327371, + "flos": 23733003456000.0, + "grad_norm": 14.990376893959041, + "language_loss": 0.84852701, + "learning_rate": 3.955947336385828e-06, + "loss": 0.86966395, + "num_input_tokens_seen": 9890950, + "router_z_loss_clip": 4.23828125, + "router_z_loss_mlp": 0.45874023, + "step": 466, + "time_per_iteration": 2.6538822650909424 + }, + { + "auxiliary_loss_clip": 0.01664629, + "auxiliary_loss_mlp": 0.00386995, + "balance_loss_clip": 1.25750756, + "balance_loss_mlp": 0.34038395, + "epoch": 0.02807755899594168, + "flos": 20629100476800.0, + "grad_norm": 482.0439172078701, + "language_loss": 0.93490696, + "learning_rate": 3.957327513084761e-06, + "loss": 0.95542324, + "num_input_tokens_seen": 9911265, + "router_z_loss_clip": 4.07421875, + "router_z_loss_mlp": 0.46582031, + "step": 467, + "time_per_iteration": 2.6369411945343018 + }, + { + "auxiliary_loss_clip": 0.01674334, + "auxiliary_loss_mlp": 0.00395732, + "balance_loss_clip": 1.26272964, + "balance_loss_mlp": 0.34623629, + "epoch": 0.02813768224860965, + "flos": 19244564789760.0, + "grad_norm": 13.730101772483815, + "language_loss": 0.94508934, + "learning_rate": 3.958704737531818e-06, + "loss": 0.96579003, + "num_input_tokens_seen": 9929025, + "router_z_loss_clip": 4.11132812, + "router_z_loss_mlp": 0.49536133, + "step": 468, + "time_per_iteration": 2.5876619815826416 + }, + { + "auxiliary_loss_clip": 0.01685533, + "auxiliary_loss_mlp": 0.00405385, + "balance_loss_clip": 1.26634765, + "balance_loss_mlp": 0.35522196, + "epoch": 0.02819780550127762, + "flos": 20813968800000.0, + "grad_norm": 39.239317277857765, + "language_loss": 0.99810183, + "learning_rate": 3.9600790223300065e-06, + "loss": 1.01901102, + "num_input_tokens_seen": 9945190, + "router_z_loss_clip": 4.19335938, + "router_z_loss_mlp": 0.50170898, + "step": 469, + "time_per_iteration": 2.655141592025757 + }, + { + "auxiliary_loss_clip": 0.01656482, + "auxiliary_loss_mlp": 0.0040573, + "balance_loss_clip": 1.25377572, + "balance_loss_mlp": 0.35587716, + "epoch": 0.028257928753945588, + "flos": 19974125928960.0, + "grad_norm": 18.627156912056076, + "language_loss": 0.93777621, + "learning_rate": 3.96145038000181e-06, + "loss": 0.95839834, + "num_input_tokens_seen": 9962820, + "router_z_loss_clip": 4.02734375, + "router_z_loss_mlp": 0.4987793, + "step": 470, + "time_per_iteration": 2.61094331741333 + }, + { + "auxiliary_loss_clip": 0.01671436, + "auxiliary_loss_mlp": 0.00446841, + "balance_loss_clip": 1.26193404, + "balance_loss_mlp": 0.39498454, + "epoch": 0.028318052006613557, + "flos": 20484488321280.0, + "grad_norm": 6.028146617309995, + "language_loss": 0.98823726, + "learning_rate": 3.962818822989861e-06, + "loss": 1.00942004, + "num_input_tokens_seen": 9982595, + "router_z_loss_clip": 4.0859375, + "router_z_loss_mlp": 0.51904297, + "step": 471, + "time_per_iteration": 2.67940616607666 + }, + { + "auxiliary_loss_clip": 0.01677671, + "auxiliary_loss_mlp": 0.00471743, + "balance_loss_clip": 1.26170433, + "balance_loss_mlp": 0.41719291, + "epoch": 0.02837817525928153, + "flos": 28514832410880.0, + "grad_norm": 64.5131300716062, + "language_loss": 0.82872355, + "learning_rate": 3.964184363657625e-06, + "loss": 0.8502177, + "num_input_tokens_seen": 10004645, + "router_z_loss_clip": 4.15429688, + "router_z_loss_mlp": 0.54541016, + "step": 472, + "time_per_iteration": 2.709202289581299 + }, + { + "auxiliary_loss_clip": 0.01684532, + "auxiliary_loss_mlp": 0.00417002, + "balance_loss_clip": 1.27155757, + "balance_loss_mlp": 0.36679086, + "epoch": 0.028438298511949497, + "flos": 18551668458240.0, + "grad_norm": 8.601339366916669, + "language_loss": 0.99594665, + "learning_rate": 3.965547014290071e-06, + "loss": 1.01696181, + "num_input_tokens_seen": 10022555, + "router_z_loss_clip": 4.1328125, + "router_z_loss_mlp": 0.50219727, + "step": 473, + "time_per_iteration": 2.66874098777771 + }, + { + "auxiliary_loss_clip": 0.01701227, + "auxiliary_loss_mlp": 0.00431521, + "balance_loss_clip": 1.27728355, + "balance_loss_mlp": 0.37782955, + "epoch": 0.028498421764617466, + "flos": 16910227722240.0, + "grad_norm": 4.791088798316729, + "language_loss": 0.96900588, + "learning_rate": 3.96690678709433e-06, + "loss": 0.99033332, + "num_input_tokens_seen": 10041025, + "router_z_loss_clip": 4.23242188, + "router_z_loss_mlp": 0.53662109, + "step": 474, + "time_per_iteration": 2.6374316215515137 + }, + { + "auxiliary_loss_clip": 0.01666358, + "auxiliary_loss_mlp": 0.00406079, + "balance_loss_clip": 1.26403487, + "balance_loss_mlp": 0.35505795, + "epoch": 0.028558545017285435, + "flos": 27778699082880.0, + "grad_norm": 35.43302404030814, + "language_loss": 0.87442017, + "learning_rate": 3.968263694200355e-06, + "loss": 0.89514458, + "num_input_tokens_seen": 10060775, + "router_z_loss_clip": 4.02929688, + "router_z_loss_mlp": 0.51025391, + "step": 475, + "time_per_iteration": 2.6786632537841797 + }, + { + "auxiliary_loss_clip": 0.01384902, + "auxiliary_loss_mlp": 0.00193587, + "balance_loss_clip": 1.14020324, + "balance_loss_mlp": 0.16898173, + "epoch": 0.028618668269953403, + "flos": 65654367258240.0, + "grad_norm": 0.9291700734381153, + "language_loss": 0.66371679, + "learning_rate": 3.969617747661569e-06, + "loss": 0.67950165, + "num_input_tokens_seen": 10120225, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.24511719, + "step": 476, + "time_per_iteration": 3.121316909790039 + }, + { + "auxiliary_loss_clip": 0.01666, + "auxiliary_loss_mlp": 0.00431453, + "balance_loss_clip": 1.25922418, + "balance_loss_mlp": 0.3784287, + "epoch": 0.028678791522621375, + "flos": 21937074324480.0, + "grad_norm": 24.93841940312841, + "language_loss": 0.9307071, + "learning_rate": 3.970968959455509e-06, + "loss": 0.95168161, + "num_input_tokens_seen": 10137880, + "router_z_loss_clip": 4.06445312, + "router_z_loss_mlp": 0.53051758, + "step": 477, + "time_per_iteration": 2.6416988372802734 + }, + { + "auxiliary_loss_clip": 0.01654292, + "auxiliary_loss_mlp": 0.00400377, + "balance_loss_clip": 1.25869203, + "balance_loss_mlp": 0.34811527, + "epoch": 0.028738914775289344, + "flos": 24572128055040.0, + "grad_norm": 42.15343484209316, + "language_loss": 0.91207069, + "learning_rate": 3.97231734148446e-06, + "loss": 0.93261743, + "num_input_tokens_seen": 10156930, + "router_z_loss_clip": 3.94921875, + "router_z_loss_mlp": 0.52270508, + "step": 478, + "time_per_iteration": 2.7045342922210693 + }, + { + "auxiliary_loss_clip": 0.01636847, + "auxiliary_loss_mlp": 0.0041247, + "balance_loss_clip": 1.24128032, + "balance_loss_mlp": 0.36056578, + "epoch": 0.028799038027957313, + "flos": 23257977068160.0, + "grad_norm": 10.06608199179809, + "language_loss": 0.89379597, + "learning_rate": 3.973662905576082e-06, + "loss": 0.91428912, + "num_input_tokens_seen": 10176295, + "router_z_loss_clip": 3.95703125, + "router_z_loss_mlp": 0.51904297, + "step": 479, + "time_per_iteration": 2.6477785110473633 + }, + { + "auxiliary_loss_clip": 0.01622189, + "auxiliary_loss_mlp": 0.00375676, + "balance_loss_clip": 1.23930502, + "balance_loss_mlp": 0.32606089, + "epoch": 0.02885916128062528, + "flos": 22164102236160.0, + "grad_norm": 344.7146441499143, + "language_loss": 0.79431993, + "learning_rate": 3.975005663484038e-06, + "loss": 0.81429863, + "num_input_tokens_seen": 10195790, + "router_z_loss_clip": 3.83203125, + "router_z_loss_mlp": 0.49633789, + "step": 480, + "time_per_iteration": 2.8323681354522705 + }, + { + "auxiliary_loss_clip": 0.01630188, + "auxiliary_loss_mlp": 0.00399362, + "balance_loss_clip": 1.24044001, + "balance_loss_mlp": 0.34972373, + "epoch": 0.02891928453329325, + "flos": 22932842135040.0, + "grad_norm": 31.78291565060535, + "language_loss": 0.93088746, + "learning_rate": 3.976345626888605e-06, + "loss": 0.95118284, + "num_input_tokens_seen": 10218405, + "router_z_loss_clip": 3.8984375, + "router_z_loss_mlp": 0.49584961, + "step": 481, + "time_per_iteration": 2.7295796871185303 + }, + { + "auxiliary_loss_clip": 0.01332034, + "auxiliary_loss_mlp": 0.00204299, + "balance_loss_clip": 1.10088432, + "balance_loss_mlp": 0.1837955, + "epoch": 0.028979407785961222, + "flos": 57432941792640.0, + "grad_norm": 0.846702217204551, + "language_loss": 0.65514052, + "learning_rate": 3.9776828073972864e-06, + "loss": 0.67050385, + "num_input_tokens_seen": 10271005, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.20507812, + "step": 482, + "time_per_iteration": 4.35185432434082 + }, + { + "auxiliary_loss_clip": 0.01641228, + "auxiliary_loss_mlp": 0.00371041, + "balance_loss_clip": 1.24434757, + "balance_loss_mlp": 0.32009065, + "epoch": 0.02903953103862919, + "flos": 16722737706240.0, + "grad_norm": 17.037588262614094, + "language_loss": 0.90730822, + "learning_rate": 3.979017216545415e-06, + "loss": 0.92743099, + "num_input_tokens_seen": 10288405, + "router_z_loss_clip": 3.97070312, + "router_z_loss_mlp": 0.50952148, + "step": 483, + "time_per_iteration": 5.494209051132202 + }, + { + "auxiliary_loss_clip": 0.01629055, + "auxiliary_loss_mlp": 0.00407596, + "balance_loss_clip": 1.23883271, + "balance_loss_mlp": 0.35926831, + "epoch": 0.02909965429129716, + "flos": 16763640318720.0, + "grad_norm": 579.9113753000111, + "language_loss": 0.83976912, + "learning_rate": 3.980348865796749e-06, + "loss": 0.86013561, + "num_input_tokens_seen": 10306875, + "router_z_loss_clip": 3.90429688, + "router_z_loss_mlp": 0.48266602, + "step": 484, + "time_per_iteration": 4.004236936569214 + }, + { + "auxiliary_loss_clip": 0.01644707, + "auxiliary_loss_mlp": 0.00465921, + "balance_loss_clip": 1.2430445, + "balance_loss_mlp": 0.41413647, + "epoch": 0.029159777543965128, + "flos": 19785343023360.0, + "grad_norm": 8.056651768195435, + "language_loss": 0.89565438, + "learning_rate": 3.9816777665440615e-06, + "loss": 0.91676068, + "num_input_tokens_seen": 10323965, + "router_z_loss_clip": 4.015625, + "router_z_loss_mlp": 0.51782227, + "step": 485, + "time_per_iteration": 2.711810827255249 + }, + { + "auxiliary_loss_clip": 0.01605974, + "auxiliary_loss_mlp": 0.00432714, + "balance_loss_clip": 1.2275424, + "balance_loss_mlp": 0.38913119, + "epoch": 0.029219900796633096, + "flos": 19642670202240.0, + "grad_norm": 667.9359666331274, + "language_loss": 0.94309163, + "learning_rate": 3.983003930109732e-06, + "loss": 0.96347851, + "num_input_tokens_seen": 10342620, + "router_z_loss_clip": 3.78515625, + "router_z_loss_mlp": 0.43579102, + "step": 486, + "time_per_iteration": 2.6045475006103516 + }, + { + "auxiliary_loss_clip": 0.01631774, + "auxiliary_loss_mlp": 0.00498792, + "balance_loss_clip": 1.23758793, + "balance_loss_mlp": 0.45029742, + "epoch": 0.02928002404930107, + "flos": 25885704424320.0, + "grad_norm": 30.677630014429024, + "language_loss": 0.95339292, + "learning_rate": 3.984327367746315e-06, + "loss": 0.9746986, + "num_input_tokens_seen": 10364610, + "router_z_loss_clip": 3.94335938, + "router_z_loss_mlp": 0.48486328, + "step": 487, + "time_per_iteration": 2.7017838954925537 + }, + { + "auxiliary_loss_clip": 0.01623267, + "auxiliary_loss_mlp": 0.00505647, + "balance_loss_clip": 1.23782516, + "balance_loss_mlp": 0.45793906, + "epoch": 0.029340147301969037, + "flos": 20660234590080.0, + "grad_norm": 24.55853370115544, + "language_loss": 0.99562252, + "learning_rate": 3.985648090637122e-06, + "loss": 1.01691175, + "num_input_tokens_seen": 10380910, + "router_z_loss_clip": 3.84960938, + "router_z_loss_mlp": 0.47705078, + "step": 488, + "time_per_iteration": 2.6169066429138184 + }, + { + "auxiliary_loss_clip": 0.01627008, + "auxiliary_loss_mlp": 0.00525776, + "balance_loss_clip": 1.24182856, + "balance_loss_mlp": 0.47718614, + "epoch": 0.029400270554637006, + "flos": 24428018689920.0, + "grad_norm": 49.252005004550455, + "language_loss": 0.94390881, + "learning_rate": 3.986966109896785e-06, + "loss": 0.9654367, + "num_input_tokens_seen": 10400665, + "router_z_loss_clip": 3.8515625, + "router_z_loss_mlp": 0.4855957, + "step": 489, + "time_per_iteration": 2.713409423828125 + }, + { + "auxiliary_loss_clip": 0.01613674, + "auxiliary_loss_mlp": 0.00566241, + "balance_loss_clip": 1.23198676, + "balance_loss_mlp": 0.5197736, + "epoch": 0.029460393807304974, + "flos": 20120892900480.0, + "grad_norm": 129.26095784973577, + "language_loss": 0.93325692, + "learning_rate": 3.988281436571815e-06, + "loss": 0.95505607, + "num_input_tokens_seen": 10420150, + "router_z_loss_clip": 3.81640625, + "router_z_loss_mlp": 0.46484375, + "step": 490, + "time_per_iteration": 2.748716354370117 + }, + { + "auxiliary_loss_clip": 0.01636393, + "auxiliary_loss_mlp": 0.00630094, + "balance_loss_clip": 1.23939967, + "balance_loss_mlp": 0.57943046, + "epoch": 0.029520517059972943, + "flos": 17675914965120.0, + "grad_norm": 8.470335412807401, + "language_loss": 0.98680246, + "learning_rate": 3.989594081641164e-06, + "loss": 1.00946736, + "num_input_tokens_seen": 10438210, + "router_z_loss_clip": 3.9765625, + "router_z_loss_mlp": 0.50708008, + "step": 491, + "time_per_iteration": 2.6535732746124268 + }, + { + "auxiliary_loss_clip": 0.01616857, + "auxiliary_loss_mlp": 0.00611852, + "balance_loss_clip": 1.24136138, + "balance_loss_mlp": 0.56660008, + "epoch": 0.029580640312640915, + "flos": 18953185662720.0, + "grad_norm": 66.00535074055533, + "language_loss": 0.90708905, + "learning_rate": 3.9909040560167675e-06, + "loss": 0.92937613, + "num_input_tokens_seen": 10455125, + "router_z_loss_clip": 3.7578125, + "router_z_loss_mlp": 0.45239258, + "step": 492, + "time_per_iteration": 2.7165191173553467 + }, + { + "auxiliary_loss_clip": 0.01632885, + "auxiliary_loss_mlp": 0.00582145, + "balance_loss_clip": 1.25056148, + "balance_loss_mlp": 0.53481865, + "epoch": 0.029640763565308884, + "flos": 18726121837440.0, + "grad_norm": 615.7254573905122, + "language_loss": 0.92586887, + "learning_rate": 3.992211370544093e-06, + "loss": 0.94801921, + "num_input_tokens_seen": 10470990, + "router_z_loss_clip": 3.8203125, + "router_z_loss_mlp": 0.47338867, + "step": 493, + "time_per_iteration": 2.641636371612549 + }, + { + "auxiliary_loss_clip": 0.01631096, + "auxiliary_loss_mlp": 0.00634959, + "balance_loss_clip": 1.24258494, + "balance_loss_mlp": 0.58586878, + "epoch": 0.029700886817976852, + "flos": 20595308757120.0, + "grad_norm": 22.316228197752555, + "language_loss": 0.94347453, + "learning_rate": 3.99351603600268e-06, + "loss": 0.96613508, + "num_input_tokens_seen": 10490685, + "router_z_loss_clip": 3.88476562, + "router_z_loss_mlp": 0.4909668, + "step": 494, + "time_per_iteration": 2.7028331756591797 + }, + { + "auxiliary_loss_clip": 0.0164094, + "auxiliary_loss_mlp": 0.0064609, + "balance_loss_clip": 1.25168967, + "balance_loss_mlp": 0.59730959, + "epoch": 0.02976101007064482, + "flos": 22236857233920.0, + "grad_norm": 32.501966079315324, + "language_loss": 0.93185163, + "learning_rate": 3.994818063106668e-06, + "loss": 0.95472193, + "num_input_tokens_seen": 10509435, + "router_z_loss_clip": 3.89257812, + "router_z_loss_mlp": 0.48779297, + "step": 495, + "time_per_iteration": 2.653486967086792 + }, + { + "auxiliary_loss_clip": 0.01600817, + "auxiliary_loss_mlp": 0.00588808, + "balance_loss_clip": 1.2354176, + "balance_loss_mlp": 0.54472458, + "epoch": 0.029821133323312793, + "flos": 23732644320000.0, + "grad_norm": 166.1558215812027, + "language_loss": 0.68286473, + "learning_rate": 3.99611746250533e-06, + "loss": 0.70476091, + "num_input_tokens_seen": 10530050, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 0.44067383, + "step": 496, + "time_per_iteration": 2.7420854568481445 + }, + { + "auxiliary_loss_clip": 0.01630275, + "auxiliary_loss_mlp": 0.0067904, + "balance_loss_clip": 1.25510406, + "balance_loss_mlp": 0.62451369, + "epoch": 0.02988125657598076, + "flos": 22419498913920.0, + "grad_norm": 31.80347437307548, + "language_loss": 0.94580102, + "learning_rate": 3.997414244783595e-06, + "loss": 0.96889412, + "num_input_tokens_seen": 10551370, + "router_z_loss_clip": 3.75390625, + "router_z_loss_mlp": 0.54492188, + "step": 497, + "time_per_iteration": 2.6838860511779785 + }, + { + "auxiliary_loss_clip": 0.01639771, + "auxiliary_loss_mlp": 0.00706722, + "balance_loss_clip": 1.25539446, + "balance_loss_mlp": 0.65140939, + "epoch": 0.02994137982864873, + "flos": 13845108453120.0, + "grad_norm": 31.58930142222188, + "language_loss": 0.94131684, + "learning_rate": 3.998708420462557e-06, + "loss": 0.96478176, + "num_input_tokens_seen": 10569225, + "router_z_loss_clip": 3.84375, + "router_z_loss_mlp": 0.55297852, + "step": 498, + "time_per_iteration": 2.6679627895355225 + }, + { + "auxiliary_loss_clip": 0.01616164, + "auxiliary_loss_mlp": 0.00677247, + "balance_loss_clip": 1.24491, + "balance_loss_mlp": 0.62612963, + "epoch": 0.0300015030813167, + "flos": 23908354675200.0, + "grad_norm": 686.1086590570216, + "language_loss": 0.87458235, + "learning_rate": 4e-06, + "loss": 0.89751637, + "num_input_tokens_seen": 10586170, + "router_z_loss_clip": 3.71484375, + "router_z_loss_mlp": 0.51123047, + "step": 499, + "time_per_iteration": 2.624300241470337 + }, + { + "auxiliary_loss_clip": 0.01631564, + "auxiliary_loss_mlp": 0.0075037, + "balance_loss_clip": 1.25528026, + "balance_loss_mlp": 0.69765562, + "epoch": 0.030061626333984667, + "flos": 22016796560640.0, + "grad_norm": 23.320261392628876, + "language_loss": 0.87525988, + "learning_rate": 3.9999999620799e-06, + "loss": 0.8990792, + "num_input_tokens_seen": 10606205, + "router_z_loss_clip": 3.765625, + "router_z_loss_mlp": 0.52709961, + "step": 500, + "time_per_iteration": 2.7644782066345215 + }, + { + "auxiliary_loss_clip": 0.01624909, + "auxiliary_loss_mlp": 0.00675242, + "balance_loss_clip": 1.25540876, + "balance_loss_mlp": 0.62233651, + "epoch": 0.03012174958665264, + "flos": 23039747988480.0, + "grad_norm": 239.55311891655197, + "language_loss": 0.95346224, + "learning_rate": 3.9999998483196e-06, + "loss": 0.97646379, + "num_input_tokens_seen": 10625995, + "router_z_loss_clip": 3.69140625, + "router_z_loss_mlp": 0.52905273, + "step": 501, + "time_per_iteration": 2.617553472518921 + }, + { + "auxiliary_loss_clip": 0.01643131, + "auxiliary_loss_mlp": 0.00787933, + "balance_loss_clip": 1.25768328, + "balance_loss_mlp": 0.73114157, + "epoch": 0.030181872839320608, + "flos": 18953257489920.0, + "grad_norm": 19.153410822087142, + "language_loss": 0.93180466, + "learning_rate": 3.9999996587191065e-06, + "loss": 0.95611525, + "num_input_tokens_seen": 10644105, + "router_z_loss_clip": 3.85742188, + "router_z_loss_mlp": 0.56762695, + "step": 502, + "time_per_iteration": 2.6452219486236572 + }, + { + "auxiliary_loss_clip": 0.0162632, + "auxiliary_loss_mlp": 0.00664238, + "balance_loss_clip": 1.25957489, + "balance_loss_mlp": 0.61350268, + "epoch": 0.030241996091988577, + "flos": 16728017005440.0, + "grad_norm": 19.54399061538533, + "language_loss": 0.89776182, + "learning_rate": 3.999999393278425e-06, + "loss": 0.92066741, + "num_input_tokens_seen": 10661090, + "router_z_loss_clip": 3.66796875, + "router_z_loss_mlp": 0.50708008, + "step": 503, + "time_per_iteration": 2.6938576698303223 + }, + { + "auxiliary_loss_clip": 0.01591859, + "auxiliary_loss_mlp": 0.00494039, + "balance_loss_clip": 1.24951673, + "balance_loss_mlp": 0.45779979, + "epoch": 0.030302119344656545, + "flos": 28621271387520.0, + "grad_norm": 141.58734540020782, + "language_loss": 0.94647115, + "learning_rate": 3.999999051997567e-06, + "loss": 0.9673301, + "num_input_tokens_seen": 10682380, + "router_z_loss_clip": 3.41992188, + "router_z_loss_mlp": 0.36254883, + "step": 504, + "time_per_iteration": 2.7574057579040527 + }, + { + "auxiliary_loss_clip": 0.01565787, + "auxiliary_loss_mlp": 0.00438381, + "balance_loss_clip": 1.22881031, + "balance_loss_mlp": 0.40006739, + "epoch": 0.030362242597324514, + "flos": 15669334523520.0, + "grad_norm": 31.005432475865618, + "language_loss": 0.8530699, + "learning_rate": 3.9999986348765425e-06, + "loss": 0.87311155, + "num_input_tokens_seen": 10699925, + "router_z_loss_clip": 3.36914062, + "router_z_loss_mlp": 0.38305664, + "step": 505, + "time_per_iteration": 2.617814302444458 + }, + { + "auxiliary_loss_clip": 0.01267881, + "auxiliary_loss_mlp": 0.00126276, + "balance_loss_clip": 1.04718673, + "balance_loss_mlp": 0.11592839, + "epoch": 0.030422365849992486, + "flos": 72125973676800.0, + "grad_norm": 0.9391482765599067, + "language_loss": 0.55090737, + "learning_rate": 3.999998141915371e-06, + "loss": 0.56484896, + "num_input_tokens_seen": 10766525, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.10351562, + "step": 506, + "time_per_iteration": 3.300869941711426 + }, + { + "auxiliary_loss_clip": 0.01514615, + "auxiliary_loss_mlp": 0.0030222, + "balance_loss_clip": 1.19311118, + "balance_loss_mlp": 0.26383421, + "epoch": 0.030482489102660455, + "flos": 19427817000960.0, + "grad_norm": 3.09395363291162, + "language_loss": 0.89906734, + "learning_rate": 3.999997573114069e-06, + "loss": 0.91723567, + "num_input_tokens_seen": 10786725, + "router_z_loss_clip": 3.2109375, + "router_z_loss_mlp": 0.3840332, + "step": 507, + "time_per_iteration": 2.661376714706421 + }, + { + "auxiliary_loss_clip": 0.01530554, + "auxiliary_loss_mlp": 0.00305617, + "balance_loss_clip": 1.20238423, + "balance_loss_mlp": 0.2684471, + "epoch": 0.030542612355328423, + "flos": 20375822701440.0, + "grad_norm": 24.66052323830122, + "language_loss": 0.96589637, + "learning_rate": 3.999996928472659e-06, + "loss": 0.984258, + "num_input_tokens_seen": 10805390, + "router_z_loss_clip": 3.28320312, + "router_z_loss_mlp": 0.37182617, + "step": 508, + "time_per_iteration": 2.6320927143096924 + }, + { + "auxiliary_loss_clip": 0.01529286, + "auxiliary_loss_mlp": 0.00319818, + "balance_loss_clip": 1.20247126, + "balance_loss_mlp": 0.27952483, + "epoch": 0.030602735607996392, + "flos": 34677354297600.0, + "grad_norm": 53.060682858793804, + "language_loss": 0.77238065, + "learning_rate": 3.999996207991165e-06, + "loss": 0.79087174, + "num_input_tokens_seen": 10828030, + "router_z_loss_clip": 3.26953125, + "router_z_loss_mlp": 0.40283203, + "step": 509, + "time_per_iteration": 2.7597920894622803 + }, + { + "auxiliary_loss_clip": 0.01498749, + "auxiliary_loss_mlp": 0.0025273, + "balance_loss_clip": 1.19064879, + "balance_loss_mlp": 0.2159659, + "epoch": 0.03066285886066436, + "flos": 23658668259840.0, + "grad_norm": 14.855713631811353, + "language_loss": 0.89747268, + "learning_rate": 3.999995411669614e-06, + "loss": 0.91498744, + "num_input_tokens_seen": 10845240, + "router_z_loss_clip": 3.08203125, + "router_z_loss_mlp": 0.36767578, + "step": 510, + "time_per_iteration": 2.6731817722320557 + }, + { + "auxiliary_loss_clip": 0.01486287, + "auxiliary_loss_mlp": 0.0029509, + "balance_loss_clip": 1.1828804, + "balance_loss_mlp": 0.26082876, + "epoch": 0.030722982113332332, + "flos": 23002975440000.0, + "grad_norm": 16.999197705817675, + "language_loss": 0.94074953, + "learning_rate": 3.999994539508036e-06, + "loss": 0.95856327, + "num_input_tokens_seen": 10864325, + "router_z_loss_clip": 3.03125, + "router_z_loss_mlp": 0.34301758, + "step": 511, + "time_per_iteration": 2.6485049724578857 + }, + { + "auxiliary_loss_clip": 0.01489985, + "auxiliary_loss_mlp": 0.00290181, + "balance_loss_clip": 1.1758498, + "balance_loss_mlp": 0.25303537, + "epoch": 0.0307831053660003, + "flos": 24750855152640.0, + "grad_norm": 63.87178091044093, + "language_loss": 0.91400093, + "learning_rate": 3.9999935915064655e-06, + "loss": 0.93180263, + "num_input_tokens_seen": 10883860, + "router_z_loss_clip": 3.14453125, + "router_z_loss_mlp": 0.37182617, + "step": 512, + "time_per_iteration": 2.652329444885254 + }, + { + "auxiliary_loss_clip": 0.01469858, + "auxiliary_loss_mlp": 0.00305389, + "balance_loss_clip": 1.16574049, + "balance_loss_mlp": 0.26991197, + "epoch": 0.03084322861866827, + "flos": 26140885620480.0, + "grad_norm": 5.467410380538235, + "language_loss": 0.95335698, + "learning_rate": 3.9999925676649374e-06, + "loss": 0.97110951, + "num_input_tokens_seen": 10904555, + "router_z_loss_clip": 3.04296875, + "router_z_loss_mlp": 0.35473633, + "step": 513, + "time_per_iteration": 2.724252223968506 + }, + { + "auxiliary_loss_clip": 0.01482473, + "auxiliary_loss_mlp": 0.00341052, + "balance_loss_clip": 1.17050958, + "balance_loss_mlp": 0.30261886, + "epoch": 0.03090335187133624, + "flos": 18771298168320.0, + "grad_norm": 9.422891357298525, + "language_loss": 0.86297917, + "learning_rate": 3.999991467983491e-06, + "loss": 0.8812145, + "num_input_tokens_seen": 10923700, + "router_z_loss_clip": 3.12109375, + "router_z_loss_mlp": 0.38378906, + "step": 514, + "time_per_iteration": 2.605374336242676 + }, + { + "auxiliary_loss_clip": 0.01463462, + "auxiliary_loss_mlp": 0.00361657, + "balance_loss_clip": 1.15642476, + "balance_loss_mlp": 0.32413, + "epoch": 0.030963475124004207, + "flos": 23221886878080.0, + "grad_norm": 553.2286196816439, + "language_loss": 0.8670736, + "learning_rate": 3.999990292462167e-06, + "loss": 0.88532478, + "num_input_tokens_seen": 10942730, + "router_z_loss_clip": 3.0703125, + "router_z_loss_mlp": 0.37524414, + "step": 515, + "time_per_iteration": 2.7141072750091553 + }, + { + "auxiliary_loss_clip": 0.01470196, + "auxiliary_loss_mlp": 0.00373336, + "balance_loss_clip": 1.15618575, + "balance_loss_mlp": 0.33430719, + "epoch": 0.03102359837667218, + "flos": 42525595411200.0, + "grad_norm": 91.74291686572451, + "language_loss": 0.91210866, + "learning_rate": 3.999989041101011e-06, + "loss": 0.9305439, + "num_input_tokens_seen": 10967120, + "router_z_loss_clip": 3.140625, + "router_z_loss_mlp": 0.39013672, + "step": 516, + "time_per_iteration": 2.854295492172241 + }, + { + "auxiliary_loss_clip": 0.01444216, + "auxiliary_loss_mlp": 0.00365185, + "balance_loss_clip": 1.14123392, + "balance_loss_mlp": 0.32923099, + "epoch": 0.031083721629340148, + "flos": 21176953689600.0, + "grad_norm": 12.637876151477146, + "language_loss": 0.86290765, + "learning_rate": 3.999987713900071e-06, + "loss": 0.88100165, + "num_input_tokens_seen": 10986775, + "router_z_loss_clip": 3.02929688, + "router_z_loss_mlp": 0.35961914, + "step": 517, + "time_per_iteration": 2.6639604568481445 + }, + { + "auxiliary_loss_clip": 0.01458448, + "auxiliary_loss_mlp": 0.00349409, + "balance_loss_clip": 1.15857005, + "balance_loss_mlp": 0.31464791, + "epoch": 0.031143844882008116, + "flos": 29716187713920.0, + "grad_norm": 6.845053219721069, + "language_loss": 0.96038306, + "learning_rate": 3.999986310859396e-06, + "loss": 0.97846168, + "num_input_tokens_seen": 11011360, + "router_z_loss_clip": 2.99804688, + "router_z_loss_mlp": 0.34765625, + "step": 518, + "time_per_iteration": 2.727872610092163 + }, + { + "auxiliary_loss_clip": 0.0145879, + "auxiliary_loss_mlp": 0.0035611, + "balance_loss_clip": 1.15677774, + "balance_loss_mlp": 0.31965533, + "epoch": 0.031203968134676085, + "flos": 23112467072640.0, + "grad_norm": 9.234371928440192, + "language_loss": 0.95085871, + "learning_rate": 3.999984831979039e-06, + "loss": 0.96900773, + "num_input_tokens_seen": 11030150, + "router_z_loss_clip": 3.02148438, + "router_z_loss_mlp": 0.36425781, + "step": 519, + "time_per_iteration": 2.6519672870635986 + }, + { + "auxiliary_loss_clip": 0.0145317, + "auxiliary_loss_mlp": 0.00346983, + "balance_loss_clip": 1.14113975, + "balance_loss_mlp": 0.31029049, + "epoch": 0.03126409138734405, + "flos": 20954379064320.0, + "grad_norm": 37.613327483904854, + "language_loss": 0.9437331, + "learning_rate": 3.999983277259057e-06, + "loss": 0.96173459, + "num_input_tokens_seen": 11049145, + "router_z_loss_clip": 3.12109375, + "router_z_loss_mlp": 0.36694336, + "step": 520, + "time_per_iteration": 2.6994664669036865 + }, + { + "auxiliary_loss_clip": 0.01464707, + "auxiliary_loss_mlp": 0.00381959, + "balance_loss_clip": 1.15429139, + "balance_loss_mlp": 0.34400246, + "epoch": 0.031324214640012026, + "flos": 21650112570240.0, + "grad_norm": 109.41931350161167, + "language_loss": 0.94820178, + "learning_rate": 3.999981646699509e-06, + "loss": 0.96666837, + "num_input_tokens_seen": 11068835, + "router_z_loss_clip": 3.10546875, + "router_z_loss_mlp": 0.37963867, + "step": 521, + "time_per_iteration": 2.6520917415618896 + }, + { + "auxiliary_loss_clip": 0.0144627, + "auxiliary_loss_mlp": 0.00387856, + "balance_loss_clip": 1.14122367, + "balance_loss_mlp": 0.35183135, + "epoch": 0.03138433789267999, + "flos": 23441337020160.0, + "grad_norm": 13.76961157632272, + "language_loss": 0.77541494, + "learning_rate": 3.999979940300456e-06, + "loss": 0.79375619, + "num_input_tokens_seen": 11088980, + "router_z_loss_clip": 3.05078125, + "router_z_loss_mlp": 0.3605957, + "step": 522, + "time_per_iteration": 2.7099862098693848 + }, + { + "auxiliary_loss_clip": 0.01447079, + "auxiliary_loss_mlp": 0.00376848, + "balance_loss_clip": 1.13733935, + "balance_loss_mlp": 0.34015507, + "epoch": 0.03144446114534796, + "flos": 18982164960000.0, + "grad_norm": 83.07709269267495, + "language_loss": 0.95348203, + "learning_rate": 3.999978158061963e-06, + "loss": 0.97172129, + "num_input_tokens_seen": 11104300, + "router_z_loss_clip": 3.09765625, + "router_z_loss_mlp": 0.36669922, + "step": 523, + "time_per_iteration": 2.6191391944885254 + }, + { + "auxiliary_loss_clip": 0.01429476, + "auxiliary_loss_mlp": 0.00394735, + "balance_loss_clip": 1.12157094, + "balance_loss_mlp": 0.35763752, + "epoch": 0.031504584398015935, + "flos": 22637692080000.0, + "grad_norm": 19.151827090153482, + "language_loss": 1.00478852, + "learning_rate": 3.999976299984099e-06, + "loss": 1.02303064, + "num_input_tokens_seen": 11123335, + "router_z_loss_clip": 3.07617188, + "router_z_loss_mlp": 0.37109375, + "step": 524, + "time_per_iteration": 2.753680467605591 + }, + { + "auxiliary_loss_clip": 0.01431105, + "auxiliary_loss_mlp": 0.00415275, + "balance_loss_clip": 1.12703514, + "balance_loss_mlp": 0.37801021, + "epoch": 0.0315647076506839, + "flos": 25297056339840.0, + "grad_norm": 17.7046588126031, + "language_loss": 0.90463626, + "learning_rate": 3.999974366066933e-06, + "loss": 0.92310005, + "num_input_tokens_seen": 11140880, + "router_z_loss_clip": 3.04296875, + "router_z_loss_mlp": 0.37255859, + "step": 525, + "time_per_iteration": 7.104484558105469 + }, + { + "auxiliary_loss_clip": 0.01411041, + "auxiliary_loss_mlp": 0.00396481, + "balance_loss_clip": 1.11544585, + "balance_loss_mlp": 0.36021784, + "epoch": 0.03162483090335187, + "flos": 16982839065600.0, + "grad_norm": 6.892137231508144, + "language_loss": 0.88602221, + "learning_rate": 3.999972356310538e-06, + "loss": 0.9040975, + "num_input_tokens_seen": 11158710, + "router_z_loss_clip": 2.953125, + "router_z_loss_mlp": 0.36279297, + "step": 526, + "time_per_iteration": 3.9900999069213867 + }, + { + "auxiliary_loss_clip": 0.01417671, + "auxiliary_loss_mlp": 0.00407412, + "balance_loss_clip": 1.11478317, + "balance_loss_mlp": 0.36986065, + "epoch": 0.03168495415601984, + "flos": 18734489706240.0, + "grad_norm": 32.290830555084426, + "language_loss": 0.902354, + "learning_rate": 3.999970270714991e-06, + "loss": 0.92060483, + "num_input_tokens_seen": 11177550, + "router_z_loss_clip": 3.02929688, + "router_z_loss_mlp": 0.37573242, + "step": 527, + "time_per_iteration": 2.5914225578308105 + }, + { + "auxiliary_loss_clip": 0.01405367, + "auxiliary_loss_mlp": 0.00396599, + "balance_loss_clip": 1.10656619, + "balance_loss_mlp": 0.35926259, + "epoch": 0.03174507740868781, + "flos": 21214875473280.0, + "grad_norm": 23.238630028526792, + "language_loss": 1.04777873, + "learning_rate": 3.999968109280371e-06, + "loss": 1.06579828, + "num_input_tokens_seen": 11196230, + "router_z_loss_clip": 2.98632812, + "router_z_loss_mlp": 0.37329102, + "step": 528, + "time_per_iteration": 2.650545597076416 + }, + { + "auxiliary_loss_clip": 0.01406028, + "auxiliary_loss_mlp": 0.0042047, + "balance_loss_clip": 1.10637617, + "balance_loss_mlp": 0.38482636, + "epoch": 0.03180520066135578, + "flos": 24787663614720.0, + "grad_norm": 10.859910882662396, + "language_loss": 0.9004252, + "learning_rate": 3.99996587200676e-06, + "loss": 0.91869015, + "num_input_tokens_seen": 11214935, + "router_z_loss_clip": 2.99414062, + "router_z_loss_mlp": 0.35644531, + "step": 529, + "time_per_iteration": 2.6545662879943848 + }, + { + "auxiliary_loss_clip": 0.01402594, + "auxiliary_loss_mlp": 0.00398653, + "balance_loss_clip": 1.11123836, + "balance_loss_mlp": 0.36277124, + "epoch": 0.03186532391402375, + "flos": 24864261367680.0, + "grad_norm": 68.68164399603273, + "language_loss": 0.98089647, + "learning_rate": 3.999963558894243e-06, + "loss": 0.998909, + "num_input_tokens_seen": 11235310, + "router_z_loss_clip": 2.90820312, + "router_z_loss_mlp": 0.35864258, + "step": 530, + "time_per_iteration": 2.6588621139526367 + }, + { + "auxiliary_loss_clip": 0.0140758, + "auxiliary_loss_mlp": 0.00376178, + "balance_loss_clip": 1.10522103, + "balance_loss_mlp": 0.33841231, + "epoch": 0.03192544716669172, + "flos": 21215055041280.0, + "grad_norm": 419.9234834610135, + "language_loss": 0.83871984, + "learning_rate": 3.999961169942907e-06, + "loss": 0.85655743, + "num_input_tokens_seen": 11254425, + "router_z_loss_clip": 3.02539062, + "router_z_loss_mlp": 0.37744141, + "step": 531, + "time_per_iteration": 2.664142370223999 + }, + { + "auxiliary_loss_clip": 0.0140912, + "auxiliary_loss_mlp": 0.00381424, + "balance_loss_clip": 1.11147809, + "balance_loss_mlp": 0.3443259, + "epoch": 0.03198557041935969, + "flos": 24353216616960.0, + "grad_norm": 23.678258102373285, + "language_loss": 1.0056839, + "learning_rate": 3.999958705152843e-06, + "loss": 1.02358937, + "num_input_tokens_seen": 11274595, + "router_z_loss_clip": 2.97851562, + "router_z_loss_mlp": 0.37109375, + "step": 532, + "time_per_iteration": 2.653696298599243 + }, + { + "auxiliary_loss_clip": 0.01244597, + "auxiliary_loss_mlp": 0.00331241, + "balance_loss_clip": 1.0033561, + "balance_loss_mlp": 0.31302628, + "epoch": 0.032045693672027656, + "flos": 61827367587840.0, + "grad_norm": 0.7693832825670364, + "language_loss": 0.5764575, + "learning_rate": 3.9999561645241445e-06, + "loss": 0.5922159, + "num_input_tokens_seen": 11336705, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.18261719, + "step": 533, + "time_per_iteration": 3.1847927570343018 + }, + { + "auxiliary_loss_clip": 0.01417275, + "auxiliary_loss_mlp": 0.00389014, + "balance_loss_clip": 1.11583483, + "balance_loss_mlp": 0.35163003, + "epoch": 0.03210581692469563, + "flos": 28401174800640.0, + "grad_norm": 14.938099841623377, + "language_loss": 0.95006204, + "learning_rate": 3.999953548056907e-06, + "loss": 0.96812493, + "num_input_tokens_seen": 11356820, + "router_z_loss_clip": 3.01757812, + "router_z_loss_mlp": 0.37426758, + "step": 534, + "time_per_iteration": 2.6831164360046387 + }, + { + "auxiliary_loss_clip": 0.01431815, + "auxiliary_loss_mlp": 0.00385965, + "balance_loss_clip": 1.12664342, + "balance_loss_mlp": 0.34669715, + "epoch": 0.03216594017736359, + "flos": 24717709877760.0, + "grad_norm": 56.31789877808908, + "language_loss": 0.86914629, + "learning_rate": 3.999950855751232e-06, + "loss": 0.88732409, + "num_input_tokens_seen": 11376645, + "router_z_loss_clip": 3.05273438, + "router_z_loss_mlp": 0.39257812, + "step": 535, + "time_per_iteration": 2.790473699569702 + }, + { + "auxiliary_loss_clip": 0.01428891, + "auxiliary_loss_mlp": 0.00374157, + "balance_loss_clip": 1.13094544, + "balance_loss_mlp": 0.33586666, + "epoch": 0.032226063430031565, + "flos": 31175453646720.0, + "grad_norm": 526.7941884697213, + "language_loss": 0.8899107, + "learning_rate": 3.999948087607219e-06, + "loss": 0.90794122, + "num_input_tokens_seen": 11397310, + "router_z_loss_clip": 2.98046875, + "router_z_loss_mlp": 0.3828125, + "step": 536, + "time_per_iteration": 2.8727195262908936 + }, + { + "auxiliary_loss_clip": 0.01435018, + "auxiliary_loss_mlp": 0.00407134, + "balance_loss_clip": 1.13346696, + "balance_loss_mlp": 0.36536288, + "epoch": 0.03228618668269954, + "flos": 32198225506560.0, + "grad_norm": 5.63621813557084, + "language_loss": 0.78065759, + "learning_rate": 3.999945243624975e-06, + "loss": 0.79907906, + "num_input_tokens_seen": 11418475, + "router_z_loss_clip": 3.015625, + "router_z_loss_mlp": 0.41772461, + "step": 537, + "time_per_iteration": 2.757281541824341 + }, + { + "auxiliary_loss_clip": 0.01427019, + "auxiliary_loss_mlp": 0.00381439, + "balance_loss_clip": 1.12819004, + "balance_loss_mlp": 0.34446031, + "epoch": 0.0323463099353675, + "flos": 22670154996480.0, + "grad_norm": 328.7442255678031, + "language_loss": 0.91313142, + "learning_rate": 3.999942323804607e-06, + "loss": 0.93121594, + "num_input_tokens_seen": 11436630, + "router_z_loss_clip": 2.98632812, + "router_z_loss_mlp": 0.37011719, + "step": 538, + "time_per_iteration": 2.649658441543579 + }, + { + "auxiliary_loss_clip": 0.01455074, + "auxiliary_loss_mlp": 0.00430085, + "balance_loss_clip": 1.14582765, + "balance_loss_mlp": 0.387909, + "epoch": 0.032406433188035474, + "flos": 26905172232960.0, + "grad_norm": 31.55741572942659, + "language_loss": 0.86308122, + "learning_rate": 3.999939328146225e-06, + "loss": 0.8819328, + "num_input_tokens_seen": 11457275, + "router_z_loss_clip": 3.09179688, + "router_z_loss_mlp": 0.42138672, + "step": 539, + "time_per_iteration": 2.6828184127807617 + }, + { + "auxiliary_loss_clip": 0.01437098, + "auxiliary_loss_mlp": 0.00425216, + "balance_loss_clip": 1.13273478, + "balance_loss_mlp": 0.38272941, + "epoch": 0.03246655644070344, + "flos": 31503928544640.0, + "grad_norm": 23.300193756156954, + "language_loss": 0.84225589, + "learning_rate": 3.999936256649943e-06, + "loss": 0.860879, + "num_input_tokens_seen": 11476925, + "router_z_loss_clip": 3.04101562, + "router_z_loss_mlp": 0.42480469, + "step": 540, + "time_per_iteration": 2.7755157947540283 + }, + { + "auxiliary_loss_clip": 0.01456472, + "auxiliary_loss_mlp": 0.00429091, + "balance_loss_clip": 1.14925909, + "balance_loss_mlp": 0.38414946, + "epoch": 0.03252667969337141, + "flos": 23218331431680.0, + "grad_norm": 176.41188597304046, + "language_loss": 0.93958241, + "learning_rate": 3.999933109315878e-06, + "loss": 0.95843804, + "num_input_tokens_seen": 11496830, + "router_z_loss_clip": 3.07421875, + "router_z_loss_mlp": 0.44946289, + "step": 541, + "time_per_iteration": 2.6530449390411377 + }, + { + "auxiliary_loss_clip": 0.01437866, + "auxiliary_loss_mlp": 0.00433644, + "balance_loss_clip": 1.1411612, + "balance_loss_mlp": 0.39130038, + "epoch": 0.032586802946039384, + "flos": 14757454926720.0, + "grad_norm": 16.364442470543636, + "language_loss": 0.96431792, + "learning_rate": 3.9999298861441496e-06, + "loss": 0.98303306, + "num_input_tokens_seen": 11515605, + "router_z_loss_clip": 2.97070312, + "router_z_loss_mlp": 0.42333984, + "step": 542, + "time_per_iteration": 2.669996976852417 + }, + { + "auxiliary_loss_clip": 0.01443739, + "auxiliary_loss_mlp": 0.00443743, + "balance_loss_clip": 1.13566113, + "balance_loss_mlp": 0.40256789, + "epoch": 0.03264692619870735, + "flos": 24280677100800.0, + "grad_norm": 60.11754299098067, + "language_loss": 0.80248559, + "learning_rate": 3.999926587134879e-06, + "loss": 0.82136047, + "num_input_tokens_seen": 11536230, + "router_z_loss_clip": 3.08398438, + "router_z_loss_mlp": 0.41186523, + "step": 543, + "time_per_iteration": 2.6673216819763184 + }, + { + "auxiliary_loss_clip": 0.01454557, + "auxiliary_loss_mlp": 0.00455767, + "balance_loss_clip": 1.1382401, + "balance_loss_mlp": 0.41278037, + "epoch": 0.03270704945137532, + "flos": 22893160584960.0, + "grad_norm": 5.319865603142345, + "language_loss": 1.01399291, + "learning_rate": 3.999923212288192e-06, + "loss": 1.03309631, + "num_input_tokens_seen": 11554715, + "router_z_loss_clip": 3.16210938, + "router_z_loss_mlp": 0.42993164, + "step": 544, + "time_per_iteration": 2.6687207221984863 + }, + { + "auxiliary_loss_clip": 0.01463236, + "auxiliary_loss_mlp": 0.00509963, + "balance_loss_clip": 1.14999413, + "balance_loss_mlp": 0.46545005, + "epoch": 0.032767172704043286, + "flos": 18041018757120.0, + "grad_norm": 20.14819768742619, + "language_loss": 0.77094567, + "learning_rate": 3.999919761604216e-06, + "loss": 0.79067767, + "num_input_tokens_seen": 11571370, + "router_z_loss_clip": 3.12890625, + "router_z_loss_mlp": 0.44506836, + "step": 545, + "time_per_iteration": 2.6066088676452637 + }, + { + "auxiliary_loss_clip": 0.01459992, + "auxiliary_loss_mlp": 0.00484205, + "balance_loss_clip": 1.14656448, + "balance_loss_mlp": 0.43993023, + "epoch": 0.03282729595671126, + "flos": 22528739151360.0, + "grad_norm": 34.310423874500415, + "language_loss": 1.01545835, + "learning_rate": 3.999916235083083e-06, + "loss": 1.03490043, + "num_input_tokens_seen": 11588560, + "router_z_loss_clip": 3.1328125, + "router_z_loss_mlp": 0.44287109, + "step": 546, + "time_per_iteration": 2.7529876232147217 + }, + { + "auxiliary_loss_clip": 0.01433524, + "auxiliary_loss_mlp": 0.00449381, + "balance_loss_clip": 1.12726116, + "balance_loss_mlp": 0.40441495, + "epoch": 0.03288741920937923, + "flos": 20410620001920.0, + "grad_norm": 26.707600559384097, + "language_loss": 0.91979563, + "learning_rate": 3.999912632724925e-06, + "loss": 0.93862474, + "num_input_tokens_seen": 11605685, + "router_z_loss_clip": 3.06640625, + "router_z_loss_mlp": 0.44995117, + "step": 547, + "time_per_iteration": 2.655493974685669 + }, + { + "auxiliary_loss_clip": 0.01429994, + "auxiliary_loss_mlp": 0.00528379, + "balance_loss_clip": 1.1212585, + "balance_loss_mlp": 0.47964653, + "epoch": 0.032947542462047195, + "flos": 20777986350720.0, + "grad_norm": 8.923209221035412, + "language_loss": 0.89867127, + "learning_rate": 3.999908954529881e-06, + "loss": 0.91825497, + "num_input_tokens_seen": 11626290, + "router_z_loss_clip": 3.0859375, + "router_z_loss_mlp": 0.48779297, + "step": 548, + "time_per_iteration": 2.7090330123901367 + }, + { + "auxiliary_loss_clip": 0.01415985, + "auxiliary_loss_mlp": 0.00486902, + "balance_loss_clip": 1.11845803, + "balance_loss_mlp": 0.44250861, + "epoch": 0.03300766571471517, + "flos": 19901263190400.0, + "grad_norm": 312.6066792198882, + "language_loss": 0.79911727, + "learning_rate": 3.999905200498087e-06, + "loss": 0.81814617, + "num_input_tokens_seen": 11643950, + "router_z_loss_clip": 2.97070312, + "router_z_loss_mlp": 0.44335938, + "step": 549, + "time_per_iteration": 2.674975633621216 + }, + { + "auxiliary_loss_clip": 0.01408982, + "auxiliary_loss_mlp": 0.00461059, + "balance_loss_clip": 1.11810076, + "balance_loss_mlp": 0.41876343, + "epoch": 0.03306778896738313, + "flos": 17967760968960.0, + "grad_norm": 22.136794942723494, + "language_loss": 0.92098647, + "learning_rate": 3.999901370629689e-06, + "loss": 0.93968689, + "num_input_tokens_seen": 11662560, + "router_z_loss_clip": 2.90820312, + "router_z_loss_mlp": 0.42333984, + "step": 550, + "time_per_iteration": 2.6124558448791504 + }, + { + "auxiliary_loss_clip": 0.01398223, + "auxiliary_loss_mlp": 0.00475955, + "balance_loss_clip": 1.11558187, + "balance_loss_mlp": 0.4373554, + "epoch": 0.033127912220051105, + "flos": 21653380707840.0, + "grad_norm": 22.723804039138805, + "language_loss": 0.88557947, + "learning_rate": 3.99989746492483e-06, + "loss": 0.90432125, + "num_input_tokens_seen": 11682265, + "router_z_loss_clip": 2.82421875, + "router_z_loss_mlp": 0.38598633, + "step": 551, + "time_per_iteration": 2.659625291824341 + }, + { + "auxiliary_loss_clip": 0.01416708, + "auxiliary_loss_mlp": 0.00489711, + "balance_loss_clip": 1.11481047, + "balance_loss_mlp": 0.44255143, + "epoch": 0.03318803547271908, + "flos": 30188376927360.0, + "grad_norm": 41.63544371367076, + "language_loss": 0.96905786, + "learning_rate": 3.999893483383658e-06, + "loss": 0.98812199, + "num_input_tokens_seen": 11699300, + "router_z_loss_clip": 3.01757812, + "router_z_loss_mlp": 0.47167969, + "step": 552, + "time_per_iteration": 2.717057943344116 + }, + { + "auxiliary_loss_clip": 0.01399877, + "auxiliary_loss_mlp": 0.00468806, + "balance_loss_clip": 1.10748768, + "balance_loss_mlp": 0.4273209, + "epoch": 0.03324815872538704, + "flos": 20376038183040.0, + "grad_norm": 173.49162987535243, + "language_loss": 1.0129168, + "learning_rate": 3.999889426006326e-06, + "loss": 1.03160357, + "num_input_tokens_seen": 11716955, + "router_z_loss_clip": 2.921875, + "router_z_loss_mlp": 0.41455078, + "step": 553, + "time_per_iteration": 2.6790990829467773 + }, + { + "auxiliary_loss_clip": 0.01392059, + "auxiliary_loss_mlp": 0.00473758, + "balance_loss_clip": 1.10543799, + "balance_loss_mlp": 0.43277425, + "epoch": 0.033308281978055014, + "flos": 24494560634880.0, + "grad_norm": 134.77969276687043, + "language_loss": 0.86991549, + "learning_rate": 3.999885292792986e-06, + "loss": 0.88857371, + "num_input_tokens_seen": 11736130, + "router_z_loss_clip": 2.8671875, + "router_z_loss_mlp": 0.41015625, + "step": 554, + "time_per_iteration": 2.68129563331604 + }, + { + "auxiliary_loss_clip": 0.01402678, + "auxiliary_loss_mlp": 0.00427516, + "balance_loss_clip": 1.1220783, + "balance_loss_mlp": 0.38524458, + "epoch": 0.03336840523072298, + "flos": 23400326666880.0, + "grad_norm": 13594.393388761804, + "language_loss": 0.90083694, + "learning_rate": 3.999881083743795e-06, + "loss": 0.91913891, + "num_input_tokens_seen": 11754425, + "router_z_loss_clip": 2.8046875, + "router_z_loss_mlp": 0.42236328, + "step": 555, + "time_per_iteration": 2.6592769622802734 + }, + { + "auxiliary_loss_clip": 0.01419177, + "auxiliary_loss_mlp": 0.00377343, + "balance_loss_clip": 1.12870145, + "balance_loss_mlp": 0.33397514, + "epoch": 0.03342852848339095, + "flos": 30550571717760.0, + "grad_norm": 5.415235348014524, + "language_loss": 0.99613839, + "learning_rate": 3.999876798858914e-06, + "loss": 1.01410365, + "num_input_tokens_seen": 11772845, + "router_z_loss_clip": 2.90625, + "router_z_loss_mlp": 0.43334961, + "step": 556, + "time_per_iteration": 2.6849286556243896 + }, + { + "auxiliary_loss_clip": 0.01420982, + "auxiliary_loss_mlp": 0.00403536, + "balance_loss_clip": 1.13188338, + "balance_loss_mlp": 0.36083519, + "epoch": 0.03348865173605892, + "flos": 22893304239360.0, + "grad_norm": 79.67036984617063, + "language_loss": 0.93315685, + "learning_rate": 3.999872438138503e-06, + "loss": 0.95140201, + "num_input_tokens_seen": 11792850, + "router_z_loss_clip": 2.890625, + "router_z_loss_mlp": 0.42700195, + "step": 557, + "time_per_iteration": 2.6483161449432373 + }, + { + "auxiliary_loss_clip": 0.01441726, + "auxiliary_loss_mlp": 0.00397586, + "balance_loss_clip": 1.14553046, + "balance_loss_mlp": 0.35540974, + "epoch": 0.03354877498872689, + "flos": 17676022705920.0, + "grad_norm": 229.2299094226992, + "language_loss": 1.05163693, + "learning_rate": 3.999868001582729e-06, + "loss": 1.07003009, + "num_input_tokens_seen": 11809670, + "router_z_loss_clip": 2.9609375, + "router_z_loss_mlp": 0.421875, + "step": 558, + "time_per_iteration": 2.83170485496521 + }, + { + "auxiliary_loss_clip": 0.01454886, + "auxiliary_loss_mlp": 0.00377803, + "balance_loss_clip": 1.15662587, + "balance_loss_mlp": 0.33965608, + "epoch": 0.03360889824139486, + "flos": 21652985658240.0, + "grad_norm": 37.95872776710402, + "language_loss": 0.86306882, + "learning_rate": 3.99986348919176e-06, + "loss": 0.8813957, + "num_input_tokens_seen": 11829665, + "router_z_loss_clip": 2.98632812, + "router_z_loss_mlp": 0.38134766, + "step": 559, + "time_per_iteration": 2.7128684520721436 + }, + { + "auxiliary_loss_clip": 0.0146213, + "auxiliary_loss_mlp": 0.00343619, + "balance_loss_clip": 1.17122161, + "balance_loss_mlp": 0.30642527, + "epoch": 0.033669021494062826, + "flos": 21795730306560.0, + "grad_norm": 12.106716410882317, + "language_loss": 0.94454145, + "learning_rate": 3.9998589009657675e-06, + "loss": 0.96259892, + "num_input_tokens_seen": 11848190, + "router_z_loss_clip": 2.91015625, + "router_z_loss_mlp": 0.37207031, + "step": 560, + "time_per_iteration": 2.630286455154419 + }, + { + "auxiliary_loss_clip": 0.01480486, + "auxiliary_loss_mlp": 0.00343919, + "balance_loss_clip": 1.18209958, + "balance_loss_mlp": 0.30746448, + "epoch": 0.0337291447467308, + "flos": 21866222747520.0, + "grad_norm": 46.79975559390302, + "language_loss": 0.88768727, + "learning_rate": 3.999854236904925e-06, + "loss": 0.90593135, + "num_input_tokens_seen": 11864795, + "router_z_loss_clip": 2.98632812, + "router_z_loss_mlp": 0.36474609, + "step": 561, + "time_per_iteration": 2.6813862323760986 + }, + { + "auxiliary_loss_clip": 0.01517033, + "auxiliary_loss_mlp": 0.00397801, + "balance_loss_clip": 1.21353054, + "balance_loss_mlp": 0.36055994, + "epoch": 0.03378926799939877, + "flos": 24245951627520.0, + "grad_norm": 46.99898596212948, + "language_loss": 0.88649303, + "learning_rate": 3.999849497009409e-06, + "loss": 0.90564132, + "num_input_tokens_seen": 11885275, + "router_z_loss_clip": 3.03710938, + "router_z_loss_mlp": 0.37207031, + "step": 562, + "time_per_iteration": 2.7265377044677734 + }, + { + "auxiliary_loss_clip": 0.0152929, + "auxiliary_loss_mlp": 0.00408223, + "balance_loss_clip": 1.22183669, + "balance_loss_mlp": 0.3703618, + "epoch": 0.033849391252066735, + "flos": 16507812677760.0, + "grad_norm": 12.913551012918083, + "language_loss": 0.90110111, + "learning_rate": 3.999844681279401e-06, + "loss": 0.92047614, + "num_input_tokens_seen": 11903595, + "router_z_loss_clip": 3.07617188, + "router_z_loss_mlp": 0.37866211, + "step": 563, + "time_per_iteration": 2.6296610832214355 + }, + { + "auxiliary_loss_clip": 0.01573137, + "auxiliary_loss_mlp": 0.00535282, + "balance_loss_clip": 1.25706983, + "balance_loss_mlp": 0.49885139, + "epoch": 0.03390951450473471, + "flos": 15669298609920.0, + "grad_norm": 44.335420908147455, + "language_loss": 0.99358177, + "learning_rate": 3.99983978971508e-06, + "loss": 1.01466608, + "num_input_tokens_seen": 11917815, + "router_z_loss_clip": 3.15820312, + "router_z_loss_mlp": 0.36401367, + "step": 564, + "time_per_iteration": 2.635957717895508 + }, + { + "auxiliary_loss_clip": 0.01605492, + "auxiliary_loss_mlp": 0.00757579, + "balance_loss_clip": 1.26272273, + "balance_loss_mlp": 0.7084173, + "epoch": 0.03396963775740267, + "flos": 22674787850880.0, + "grad_norm": 203.50803183057292, + "language_loss": 1.01487458, + "learning_rate": 3.999834822316635e-06, + "loss": 1.03850532, + "num_input_tokens_seen": 11936305, + "router_z_loss_clip": 3.42578125, + "router_z_loss_mlp": 0.49194336, + "step": 565, + "time_per_iteration": 2.669855833053589 + }, + { + "auxiliary_loss_clip": 0.01622963, + "auxiliary_loss_mlp": 0.01001888, + "balance_loss_clip": 1.3108145, + "balance_loss_mlp": 0.96602994, + "epoch": 0.034029761010070644, + "flos": 64392683063040.0, + "grad_norm": 1.8677573933637512, + "language_loss": 0.56518769, + "learning_rate": 3.9998297790842535e-06, + "loss": 0.59143615, + "num_input_tokens_seen": 11998940, + "router_z_loss_clip": 3.125, + "router_z_loss_mlp": 0.359375, + "step": 566, + "time_per_iteration": 3.2568817138671875 + }, + { + "auxiliary_loss_clip": 0.01642218, + "auxiliary_loss_mlp": 0.01026052, + "balance_loss_clip": 1.28326082, + "balance_loss_mlp": 0.96809232, + "epoch": 0.034089884262738616, + "flos": 25004204755200.0, + "grad_norm": 1498.6202311004936, + "language_loss": 0.83743101, + "learning_rate": 3.999824660018126e-06, + "loss": 0.86411369, + "num_input_tokens_seen": 12018860, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 0.57983398, + "step": 567, + "time_per_iteration": 4.146958827972412 + }, + { + "auxiliary_loss_clip": 0.01649733, + "auxiliary_loss_mlp": 0.01071359, + "balance_loss_clip": 1.30259621, + "balance_loss_mlp": 1.01673698, + "epoch": 0.03415000751540658, + "flos": 28439096584320.0, + "grad_norm": 8.853944548668016, + "language_loss": 0.8747673, + "learning_rate": 3.999819465118447e-06, + "loss": 0.90197819, + "num_input_tokens_seen": 12039675, + "router_z_loss_clip": 3.47460938, + "router_z_loss_mlp": 0.54638672, + "step": 568, + "time_per_iteration": 5.578441143035889 + }, + { + "auxiliary_loss_clip": 0.01681329, + "auxiliary_loss_mlp": 0.01012992, + "balance_loss_clip": 1.32591093, + "balance_loss_mlp": 0.95677346, + "epoch": 0.034210130768074554, + "flos": 21468727866240.0, + "grad_norm": 43.262484023601175, + "language_loss": 0.92702097, + "learning_rate": 3.999814194385413e-06, + "loss": 0.95396417, + "num_input_tokens_seen": 12057680, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 0.56176758, + "step": 569, + "time_per_iteration": 2.6144187450408936 + }, + { + "auxiliary_loss_clip": 0.01702387, + "auxiliary_loss_mlp": 0.00967973, + "balance_loss_clip": 1.33617926, + "balance_loss_mlp": 0.91161126, + "epoch": 0.03427025402074252, + "flos": 18697501676160.0, + "grad_norm": 22.446507801485726, + "language_loss": 1.01552176, + "learning_rate": 3.9998088478192255e-06, + "loss": 1.04222536, + "num_input_tokens_seen": 12076135, + "router_z_loss_clip": 3.66210938, + "router_z_loss_mlp": 0.56347656, + "step": 570, + "time_per_iteration": 2.666616201400757 + }, + { + "auxiliary_loss_clip": 0.01742455, + "auxiliary_loss_mlp": 0.0092479, + "balance_loss_clip": 1.35013866, + "balance_loss_mlp": 0.86232466, + "epoch": 0.03433037727341049, + "flos": 20849987162880.0, + "grad_norm": 1077.7680718825236, + "language_loss": 0.87356269, + "learning_rate": 3.9998034254200846e-06, + "loss": 0.90023512, + "num_input_tokens_seen": 12094785, + "router_z_loss_clip": 3.92578125, + "router_z_loss_mlp": 0.62451172, + "step": 571, + "time_per_iteration": 2.7737529277801514 + }, + { + "auxiliary_loss_clip": 0.01761361, + "auxiliary_loss_mlp": 0.00986959, + "balance_loss_clip": 1.36948776, + "balance_loss_mlp": 0.92492282, + "epoch": 0.03439050052607846, + "flos": 25410282986880.0, + "grad_norm": 74.58121092739677, + "language_loss": 0.89474499, + "learning_rate": 3.999797927188199e-06, + "loss": 0.92222822, + "num_input_tokens_seen": 12114590, + "router_z_loss_clip": 3.921875, + "router_z_loss_mlp": 0.62060547, + "step": 572, + "time_per_iteration": 2.749459981918335 + }, + { + "auxiliary_loss_clip": 0.01805608, + "auxiliary_loss_mlp": 0.00967457, + "balance_loss_clip": 1.3915472, + "balance_loss_mlp": 0.89922154, + "epoch": 0.03445062377874643, + "flos": 17640147997440.0, + "grad_norm": 33.6482832974872, + "language_loss": 0.90398079, + "learning_rate": 3.999792353123774e-06, + "loss": 0.93171138, + "num_input_tokens_seen": 12132390, + "router_z_loss_clip": 4.13671875, + "router_z_loss_mlp": 0.68261719, + "step": 573, + "time_per_iteration": 2.5988261699676514 + }, + { + "auxiliary_loss_clip": 0.01814002, + "auxiliary_loss_mlp": 0.00890432, + "balance_loss_clip": 1.39303374, + "balance_loss_mlp": 0.82558239, + "epoch": 0.0345107470314144, + "flos": 16764502245120.0, + "grad_norm": 14.868498631335719, + "language_loss": 0.85061079, + "learning_rate": 3.999786703227023e-06, + "loss": 0.87765515, + "num_input_tokens_seen": 12149035, + "router_z_loss_clip": 4.2109375, + "router_z_loss_mlp": 0.64892578, + "step": 574, + "time_per_iteration": 2.673856735229492 + }, + { + "auxiliary_loss_clip": 0.01833263, + "auxiliary_loss_mlp": 0.01003166, + "balance_loss_clip": 1.40273416, + "balance_loss_mlp": 0.93082976, + "epoch": 0.03457087028408237, + "flos": 14684448533760.0, + "grad_norm": 211.00482096691226, + "language_loss": 0.91248941, + "learning_rate": 3.9997809774981606e-06, + "loss": 0.94085371, + "num_input_tokens_seen": 12167530, + "router_z_loss_clip": 4.3046875, + "router_z_loss_mlp": 0.72314453, + "step": 575, + "time_per_iteration": 2.610645055770874 + }, + { + "auxiliary_loss_clip": 0.01868642, + "auxiliary_loss_mlp": 0.00928575, + "balance_loss_clip": 1.42889428, + "balance_loss_mlp": 0.86253327, + "epoch": 0.03463099353675034, + "flos": 20011293527040.0, + "grad_norm": 34.77446711763382, + "language_loss": 0.8858934, + "learning_rate": 3.9997751759374025e-06, + "loss": 0.91386557, + "num_input_tokens_seen": 12186340, + "router_z_loss_clip": 4.40234375, + "router_z_loss_mlp": 0.66015625, + "step": 576, + "time_per_iteration": 2.6080243587493896 + }, + { + "auxiliary_loss_clip": 0.01892458, + "auxiliary_loss_mlp": 0.00922645, + "balance_loss_clip": 1.45069718, + "balance_loss_mlp": 0.85841531, + "epoch": 0.03469111678941831, + "flos": 25301150490240.0, + "grad_norm": 30.295653150372363, + "language_loss": 0.91821396, + "learning_rate": 3.99976929854497e-06, + "loss": 0.946365, + "num_input_tokens_seen": 12204090, + "router_z_loss_clip": 4.4140625, + "router_z_loss_mlp": 0.64257812, + "step": 577, + "time_per_iteration": 2.6505420207977295 + }, + { + "auxiliary_loss_clip": 0.01917375, + "auxiliary_loss_mlp": 0.00815244, + "balance_loss_clip": 1.46010435, + "balance_loss_mlp": 0.7511574, + "epoch": 0.034751240042086275, + "flos": 23259413612160.0, + "grad_norm": 25.263367315564192, + "language_loss": 0.7807588, + "learning_rate": 3.9997633453210845e-06, + "loss": 0.80808502, + "num_input_tokens_seen": 12224850, + "router_z_loss_clip": 4.57421875, + "router_z_loss_mlp": 0.64160156, + "step": 578, + "time_per_iteration": 2.6346471309661865 + }, + { + "auxiliary_loss_clip": 0.01964185, + "auxiliary_loss_mlp": 0.00846752, + "balance_loss_clip": 1.48495173, + "balance_loss_mlp": 0.77761054, + "epoch": 0.03481136329475425, + "flos": 23769237300480.0, + "grad_norm": 27.096566289297, + "language_loss": 0.82887483, + "learning_rate": 3.999757316265973e-06, + "loss": 0.85698414, + "num_input_tokens_seen": 12244935, + "router_z_loss_clip": 4.80078125, + "router_z_loss_mlp": 0.69189453, + "step": 579, + "time_per_iteration": 2.708754777908325 + }, + { + "auxiliary_loss_clip": 0.01980446, + "auxiliary_loss_mlp": 0.00766055, + "balance_loss_clip": 1.4903897, + "balance_loss_mlp": 0.70296997, + "epoch": 0.03487148654742222, + "flos": 20157521794560.0, + "grad_norm": 6.171736537355569, + "language_loss": 0.909549, + "learning_rate": 3.999751211379863e-06, + "loss": 0.93701398, + "num_input_tokens_seen": 12262140, + "router_z_loss_clip": 4.90234375, + "router_z_loss_mlp": 0.63085938, + "step": 580, + "time_per_iteration": 2.644131660461426 + }, + { + "auxiliary_loss_clip": 0.01994816, + "auxiliary_loss_mlp": 0.00809086, + "balance_loss_clip": 1.50035203, + "balance_loss_mlp": 0.74495143, + "epoch": 0.034931609800090184, + "flos": 15669585918720.0, + "grad_norm": 414.56554159352845, + "language_loss": 0.89531243, + "learning_rate": 3.999745030662987e-06, + "loss": 0.92335141, + "num_input_tokens_seen": 12280930, + "router_z_loss_clip": 4.9375, + "router_z_loss_mlp": 0.64111328, + "step": 581, + "time_per_iteration": 2.6543874740600586 + }, + { + "auxiliary_loss_clip": 0.02020242, + "auxiliary_loss_mlp": 0.00709909, + "balance_loss_clip": 1.52640152, + "balance_loss_mlp": 0.65287942, + "epoch": 0.034991733052758156, + "flos": 16362374509440.0, + "grad_norm": 6.665103957820782, + "language_loss": 0.8334043, + "learning_rate": 3.99973877411558e-06, + "loss": 0.86070573, + "num_input_tokens_seen": 12299125, + "router_z_loss_clip": 4.94140625, + "router_z_loss_mlp": 0.56982422, + "step": 582, + "time_per_iteration": 2.6119818687438965 + }, + { + "auxiliary_loss_clip": 0.02019719, + "auxiliary_loss_mlp": 0.0073515, + "balance_loss_clip": 1.52335572, + "balance_loss_mlp": 0.67139739, + "epoch": 0.03505185630542612, + "flos": 19387309438080.0, + "grad_norm": 21.997048664800474, + "language_loss": 0.93027049, + "learning_rate": 3.999732441737877e-06, + "loss": 0.95781922, + "num_input_tokens_seen": 12316905, + "router_z_loss_clip": 4.96484375, + "router_z_loss_mlp": 0.63671875, + "step": 583, + "time_per_iteration": 2.740124464035034 + }, + { + "auxiliary_loss_clip": 0.02041112, + "auxiliary_loss_mlp": 0.00690405, + "balance_loss_clip": 1.52354217, + "balance_loss_mlp": 0.62865436, + "epoch": 0.03511197955809409, + "flos": 21323828401920.0, + "grad_norm": 296.0604880249817, + "language_loss": 0.87412864, + "learning_rate": 3.99972603353012e-06, + "loss": 0.90144378, + "num_input_tokens_seen": 12335070, + "router_z_loss_clip": 5.17578125, + "router_z_loss_mlp": 0.6171875, + "step": 584, + "time_per_iteration": 2.624398946762085 + }, + { + "auxiliary_loss_clip": 0.02014235, + "auxiliary_loss_mlp": 0.00687322, + "balance_loss_clip": 1.51288009, + "balance_loss_mlp": 0.62561977, + "epoch": 0.035172102810762065, + "flos": 14136595320960.0, + "grad_norm": 315.6845240989909, + "language_loss": 1.01652288, + "learning_rate": 3.999719549492551e-06, + "loss": 1.04353845, + "num_input_tokens_seen": 12350315, + "router_z_loss_clip": 5.01953125, + "router_z_loss_mlp": 0.6171875, + "step": 585, + "time_per_iteration": 2.643486261367798 + }, + { + "auxiliary_loss_clip": 0.02008165, + "auxiliary_loss_mlp": 0.0070608, + "balance_loss_clip": 1.50918734, + "balance_loss_mlp": 0.64451993, + "epoch": 0.03523222606343003, + "flos": 20296890564480.0, + "grad_norm": 237.65832751070636, + "language_loss": 0.93462563, + "learning_rate": 3.9997129896254165e-06, + "loss": 0.96176809, + "num_input_tokens_seen": 12366030, + "router_z_loss_clip": 4.9921875, + "router_z_loss_mlp": 0.61523438, + "step": 586, + "time_per_iteration": 2.6885077953338623 + }, + { + "auxiliary_loss_clip": 0.02020211, + "auxiliary_loss_mlp": 0.00691459, + "balance_loss_clip": 1.5188297, + "balance_loss_mlp": 0.63147265, + "epoch": 0.035292349316098, + "flos": 20375822701440.0, + "grad_norm": 14.504496014331218, + "language_loss": 0.82897758, + "learning_rate": 3.999706353928965e-06, + "loss": 0.85609436, + "num_input_tokens_seen": 12384895, + "router_z_loss_clip": 5.0078125, + "router_z_loss_mlp": 0.60009766, + "step": 587, + "time_per_iteration": 2.6590113639831543 + }, + { + "auxiliary_loss_clip": 0.02029785, + "auxiliary_loss_mlp": 0.00687763, + "balance_loss_clip": 1.51845419, + "balance_loss_mlp": 0.62467724, + "epoch": 0.03535247256876597, + "flos": 21468871520640.0, + "grad_norm": 14.244863750209356, + "language_loss": 0.83517396, + "learning_rate": 3.999699642403449e-06, + "loss": 0.86234945, + "num_input_tokens_seen": 12404980, + "router_z_loss_clip": 5.11328125, + "router_z_loss_mlp": 0.63110352, + "step": 588, + "time_per_iteration": 2.626535177230835 + }, + { + "auxiliary_loss_clip": 0.02002217, + "auxiliary_loss_mlp": 0.00705739, + "balance_loss_clip": 1.50593865, + "balance_loss_mlp": 0.63831377, + "epoch": 0.03541259582143394, + "flos": 23623044946560.0, + "grad_norm": 97.69233701336675, + "language_loss": 1.02350628, + "learning_rate": 3.99969285504912e-06, + "loss": 1.05058587, + "num_input_tokens_seen": 12423835, + "router_z_loss_clip": 4.9609375, + "router_z_loss_mlp": 0.67431641, + "step": 589, + "time_per_iteration": 2.658149480819702 + }, + { + "auxiliary_loss_clip": 0.0200716, + "auxiliary_loss_mlp": 0.00654137, + "balance_loss_clip": 1.513098, + "balance_loss_mlp": 0.59543884, + "epoch": 0.03547271907410191, + "flos": 33726367768320.0, + "grad_norm": 11.777739714064962, + "language_loss": 0.90522361, + "learning_rate": 3.99968599186624e-06, + "loss": 0.93183661, + "num_input_tokens_seen": 12443135, + "router_z_loss_clip": 4.9375, + "router_z_loss_mlp": 0.58691406, + "step": 590, + "time_per_iteration": 2.708995819091797 + }, + { + "auxiliary_loss_clip": 0.02007494, + "auxiliary_loss_mlp": 0.00627235, + "balance_loss_clip": 1.51799631, + "balance_loss_mlp": 0.57068229, + "epoch": 0.03553284232676988, + "flos": 21142695093120.0, + "grad_norm": 18.3018832348814, + "language_loss": 0.93866599, + "learning_rate": 3.999679052855065e-06, + "loss": 0.96501327, + "num_input_tokens_seen": 12462895, + "router_z_loss_clip": 4.890625, + "router_z_loss_mlp": 0.56591797, + "step": 591, + "time_per_iteration": 2.6830832958221436 + }, + { + "auxiliary_loss_clip": 0.02018357, + "auxiliary_loss_mlp": 0.00635042, + "balance_loss_clip": 1.51816225, + "balance_loss_mlp": 0.57310134, + "epoch": 0.03559296557943785, + "flos": 20046593617920.0, + "grad_norm": 36.099413574747146, + "language_loss": 0.90049314, + "learning_rate": 3.999672038015861e-06, + "loss": 0.92702723, + "num_input_tokens_seen": 12481515, + "router_z_loss_clip": 5.00390625, + "router_z_loss_mlp": 0.61914062, + "step": 592, + "time_per_iteration": 2.6144847869873047 + }, + { + "auxiliary_loss_clip": 0.01949497, + "auxiliary_loss_mlp": 0.00531676, + "balance_loss_clip": 1.50533867, + "balance_loss_mlp": 0.50573599, + "epoch": 0.035653088832105814, + "flos": 60334597244160.0, + "grad_norm": 1.2813554423647389, + "language_loss": 0.60362959, + "learning_rate": 3.999664947348893e-06, + "loss": 0.62844121, + "num_input_tokens_seen": 12548220, + "router_z_loss_clip": 4.4375, + "router_z_loss_mlp": 0.25976562, + "step": 593, + "time_per_iteration": 3.1666927337646484 + }, + { + "auxiliary_loss_clip": 0.01977755, + "auxiliary_loss_mlp": 0.00614455, + "balance_loss_clip": 1.50727212, + "balance_loss_mlp": 0.55656743, + "epoch": 0.035713212084773786, + "flos": 20113135562880.0, + "grad_norm": 50.77649937291338, + "language_loss": 0.92572582, + "learning_rate": 3.999657780854429e-06, + "loss": 0.95164788, + "num_input_tokens_seen": 12566105, + "router_z_loss_clip": 4.703125, + "router_z_loss_mlp": 0.57836914, + "step": 594, + "time_per_iteration": 2.6271414756774902 + }, + { + "auxiliary_loss_clip": 0.0195604, + "auxiliary_loss_mlp": 0.0060608, + "balance_loss_clip": 1.48770142, + "balance_loss_mlp": 0.5499559, + "epoch": 0.03577333533744176, + "flos": 26285785084800.0, + "grad_norm": 33.92128025014446, + "language_loss": 0.90670955, + "learning_rate": 3.999650538532742e-06, + "loss": 0.93233073, + "num_input_tokens_seen": 12586680, + "router_z_loss_clip": 4.6796875, + "router_z_loss_mlp": 0.56201172, + "step": 595, + "time_per_iteration": 2.6685123443603516 + }, + { + "auxiliary_loss_clip": 0.01943685, + "auxiliary_loss_mlp": 0.00649888, + "balance_loss_clip": 1.48400664, + "balance_loss_mlp": 0.58828044, + "epoch": 0.035833458590109724, + "flos": 10889732211840.0, + "grad_norm": 9.945465030226796, + "language_loss": 1.03141451, + "learning_rate": 3.999643220384106e-06, + "loss": 1.05735016, + "num_input_tokens_seen": 12601605, + "router_z_loss_clip": 4.59375, + "router_z_loss_mlp": 0.61572266, + "step": 596, + "time_per_iteration": 2.649545431137085 + }, + { + "auxiliary_loss_clip": 0.01900971, + "auxiliary_loss_mlp": 0.00629936, + "balance_loss_clip": 1.46156228, + "balance_loss_mlp": 0.57331192, + "epoch": 0.035893581842777696, + "flos": 22090198003200.0, + "grad_norm": 62.40770534230728, + "language_loss": 0.90476179, + "learning_rate": 3.999635826408799e-06, + "loss": 0.93007088, + "num_input_tokens_seen": 12620365, + "router_z_loss_clip": 4.39453125, + "router_z_loss_mlp": 0.56616211, + "step": 597, + "time_per_iteration": 2.6184678077697754 + }, + { + "auxiliary_loss_clip": 0.01856841, + "auxiliary_loss_mlp": 0.00689091, + "balance_loss_clip": 1.43695235, + "balance_loss_mlp": 0.6286279, + "epoch": 0.03595370509544566, + "flos": 23038347358080.0, + "grad_norm": 17.785870669816784, + "language_loss": 0.85853088, + "learning_rate": 3.999628356607101e-06, + "loss": 0.88399023, + "num_input_tokens_seen": 12641140, + "router_z_loss_clip": 4.1953125, + "router_z_loss_mlp": 0.60400391, + "step": 598, + "time_per_iteration": 2.6845881938934326 + }, + { + "auxiliary_loss_clip": 0.01842241, + "auxiliary_loss_mlp": 0.00670352, + "balance_loss_clip": 1.44181287, + "balance_loss_mlp": 0.61155838, + "epoch": 0.03601382834811363, + "flos": 20777734955520.0, + "grad_norm": 58.81162484315021, + "language_loss": 0.86409879, + "learning_rate": 3.999620810979295e-06, + "loss": 0.88922471, + "num_input_tokens_seen": 12661080, + "router_z_loss_clip": 4.00390625, + "router_z_loss_mlp": 0.58789062, + "step": 599, + "time_per_iteration": 2.620342254638672 + }, + { + "auxiliary_loss_clip": 0.01849226, + "auxiliary_loss_mlp": 0.00716901, + "balance_loss_clip": 1.42464352, + "balance_loss_mlp": 0.65166974, + "epoch": 0.036073951600781605, + "flos": 23951627585280.0, + "grad_norm": 55.48536596399606, + "language_loss": 0.9408803, + "learning_rate": 3.999613189525668e-06, + "loss": 0.96654159, + "num_input_tokens_seen": 12678270, + "router_z_loss_clip": 4.24609375, + "router_z_loss_mlp": 0.65234375, + "step": 600, + "time_per_iteration": 2.6679952144622803 + }, + { + "auxiliary_loss_clip": 0.01786637, + "auxiliary_loss_mlp": 0.00721602, + "balance_loss_clip": 1.38870406, + "balance_loss_mlp": 0.65918416, + "epoch": 0.03613407485344957, + "flos": 18912283050240.0, + "grad_norm": 16.97827862683609, + "language_loss": 0.8887701, + "learning_rate": 3.999605492246508e-06, + "loss": 0.91385245, + "num_input_tokens_seen": 12697295, + "router_z_loss_clip": 3.97851562, + "router_z_loss_mlp": 0.62451172, + "step": 601, + "time_per_iteration": 2.608794689178467 + }, + { + "auxiliary_loss_clip": 0.01790957, + "auxiliary_loss_mlp": 0.00692612, + "balance_loss_clip": 1.39483416, + "balance_loss_mlp": 0.62690437, + "epoch": 0.03619419810611754, + "flos": 23038526926080.0, + "grad_norm": 29.261740903186556, + "language_loss": 0.83166671, + "learning_rate": 3.999597719142107e-06, + "loss": 0.85650235, + "num_input_tokens_seen": 12716165, + "router_z_loss_clip": 3.95898438, + "router_z_loss_mlp": 0.65722656, + "step": 602, + "time_per_iteration": 2.720123529434204 + }, + { + "auxiliary_loss_clip": 0.01755859, + "auxiliary_loss_mlp": 0.00797982, + "balance_loss_clip": 1.37093163, + "balance_loss_mlp": 0.72898388, + "epoch": 0.03625432135878551, + "flos": 29457774293760.0, + "grad_norm": 7.454777674472082, + "language_loss": 0.85395163, + "learning_rate": 3.999589870212761e-06, + "loss": 0.87949002, + "num_input_tokens_seen": 12735475, + "router_z_loss_clip": 3.8515625, + "router_z_loss_mlp": 0.68994141, + "step": 603, + "time_per_iteration": 2.6800179481506348 + }, + { + "auxiliary_loss_clip": 0.01755196, + "auxiliary_loss_mlp": 0.00750192, + "balance_loss_clip": 1.37865901, + "balance_loss_mlp": 0.68386394, + "epoch": 0.03631444461145348, + "flos": 23508525409920.0, + "grad_norm": 149.98577430233928, + "language_loss": 0.92379153, + "learning_rate": 3.9995819454587664e-06, + "loss": 0.94884539, + "num_input_tokens_seen": 12754540, + "router_z_loss_clip": 3.765625, + "router_z_loss_mlp": 0.66308594, + "step": 604, + "time_per_iteration": 2.703775644302368 + }, + { + "auxiliary_loss_clip": 0.01723986, + "auxiliary_loss_mlp": 0.00798295, + "balance_loss_clip": 1.34745407, + "balance_loss_mlp": 0.72705567, + "epoch": 0.03637456786412145, + "flos": 16618130323200.0, + "grad_norm": 108.13593476923434, + "language_loss": 0.87444812, + "learning_rate": 3.999573944880424e-06, + "loss": 0.89967096, + "num_input_tokens_seen": 12773050, + "router_z_loss_clip": 3.765625, + "router_z_loss_mlp": 0.71240234, + "step": 605, + "time_per_iteration": 2.6238231658935547 + }, + { + "auxiliary_loss_clip": 0.01712726, + "auxiliary_loss_mlp": 0.00792757, + "balance_loss_clip": 1.3400563, + "balance_loss_mlp": 0.72337711, + "epoch": 0.03643469111678942, + "flos": 15851832549120.0, + "grad_norm": 208.49637246258834, + "language_loss": 0.9387697, + "learning_rate": 3.9995658684780375e-06, + "loss": 0.96382451, + "num_input_tokens_seen": 12791240, + "router_z_loss_clip": 3.72851562, + "router_z_loss_mlp": 0.69384766, + "step": 606, + "time_per_iteration": 2.6952779293060303 + }, + { + "auxiliary_loss_clip": 0.01709396, + "auxiliary_loss_mlp": 0.00779323, + "balance_loss_clip": 1.33238947, + "balance_loss_mlp": 0.70851243, + "epoch": 0.03649481436945739, + "flos": 23620387340160.0, + "grad_norm": 107.25136719834062, + "language_loss": 0.88664651, + "learning_rate": 3.999557716251912e-06, + "loss": 0.91153365, + "num_input_tokens_seen": 12812245, + "router_z_loss_clip": 3.76953125, + "router_z_loss_mlp": 0.70800781, + "step": 607, + "time_per_iteration": 2.694709539413452 + }, + { + "auxiliary_loss_clip": 0.01673821, + "auxiliary_loss_mlp": 0.00794912, + "balance_loss_clip": 1.31629944, + "balance_loss_mlp": 0.72548413, + "epoch": 0.036554937622125354, + "flos": 21755581879680.0, + "grad_norm": 73.24456522714853, + "language_loss": 0.87785721, + "learning_rate": 3.999549488202358e-06, + "loss": 0.90254462, + "num_input_tokens_seen": 12831085, + "router_z_loss_clip": 3.57617188, + "router_z_loss_mlp": 0.69384766, + "step": 608, + "time_per_iteration": 2.6421711444854736 + }, + { + "auxiliary_loss_clip": 0.01672513, + "auxiliary_loss_mlp": 0.00784633, + "balance_loss_clip": 1.31212986, + "balance_loss_mlp": 0.71143895, + "epoch": 0.036615060874793326, + "flos": 17819772935040.0, + "grad_norm": 33.54363176602559, + "language_loss": 0.89208674, + "learning_rate": 3.999541184329688e-06, + "loss": 0.91665816, + "num_input_tokens_seen": 12849115, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 0.73144531, + "step": 609, + "time_per_iteration": 4.183840990066528 + }, + { + "auxiliary_loss_clip": 0.01670426, + "auxiliary_loss_mlp": 0.00764576, + "balance_loss_clip": 1.31005907, + "balance_loss_mlp": 0.6976279, + "epoch": 0.0366751841274613, + "flos": 26753808320640.0, + "grad_norm": 123.67412266866349, + "language_loss": 0.87920654, + "learning_rate": 3.999532804634215e-06, + "loss": 0.90355659, + "num_input_tokens_seen": 12868005, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 0.66992188, + "step": 610, + "time_per_iteration": 5.556333065032959 + }, + { + "auxiliary_loss_clip": 0.01661842, + "auxiliary_loss_mlp": 0.00805105, + "balance_loss_clip": 1.30855036, + "balance_loss_mlp": 0.73062265, + "epoch": 0.03673530738012926, + "flos": 22196960202240.0, + "grad_norm": 3103.4075066069727, + "language_loss": 0.92518616, + "learning_rate": 3.9995243491162575e-06, + "loss": 0.94985569, + "num_input_tokens_seen": 12886890, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 0.74511719, + "step": 611, + "time_per_iteration": 2.666883945465088 + }, + { + "auxiliary_loss_clip": 0.01645307, + "auxiliary_loss_mlp": 0.00776172, + "balance_loss_clip": 1.30125988, + "balance_loss_mlp": 0.70955807, + "epoch": 0.036795430632797235, + "flos": 24681655601280.0, + "grad_norm": 45.88738320244028, + "language_loss": 0.80365014, + "learning_rate": 3.999515817776136e-06, + "loss": 0.82786489, + "num_input_tokens_seen": 12906130, + "router_z_loss_clip": 3.4375, + "router_z_loss_mlp": 0.66650391, + "step": 612, + "time_per_iteration": 2.702878713607788 + }, + { + "auxiliary_loss_clip": 0.01666348, + "auxiliary_loss_mlp": 0.00738106, + "balance_loss_clip": 1.31442523, + "balance_loss_mlp": 0.67320818, + "epoch": 0.0368555538854652, + "flos": 17748921358080.0, + "grad_norm": 31.380879775743193, + "language_loss": 0.87318254, + "learning_rate": 3.999507210614175e-06, + "loss": 0.89722705, + "num_input_tokens_seen": 12925260, + "router_z_loss_clip": 3.51757812, + "router_z_loss_mlp": 0.64892578, + "step": 613, + "time_per_iteration": 2.6394965648651123 + }, + { + "auxiliary_loss_clip": 0.01655801, + "auxiliary_loss_mlp": 0.00760903, + "balance_loss_clip": 1.30376768, + "balance_loss_mlp": 0.69304866, + "epoch": 0.03691567713813317, + "flos": 20594554571520.0, + "grad_norm": 222.08150868411596, + "language_loss": 1.00481391, + "learning_rate": 3.9994985276307e-06, + "loss": 1.02898097, + "num_input_tokens_seen": 12944590, + "router_z_loss_clip": 3.52539062, + "router_z_loss_mlp": 0.67822266, + "step": 614, + "time_per_iteration": 2.6039953231811523 + }, + { + "auxiliary_loss_clip": 0.01704064, + "auxiliary_loss_mlp": 0.00702621, + "balance_loss_clip": 1.33099854, + "balance_loss_mlp": 0.63810462, + "epoch": 0.036975800390801145, + "flos": 33650380546560.0, + "grad_norm": 9.233223548431452, + "language_loss": 0.80384535, + "learning_rate": 3.999489768826041e-06, + "loss": 0.82791221, + "num_input_tokens_seen": 12964785, + "router_z_loss_clip": 3.7265625, + "router_z_loss_mlp": 0.64453125, + "step": 615, + "time_per_iteration": 2.7351934909820557 + }, + { + "auxiliary_loss_clip": 0.01686097, + "auxiliary_loss_mlp": 0.00671777, + "balance_loss_clip": 1.3214612, + "balance_loss_mlp": 0.61078936, + "epoch": 0.03703592364346911, + "flos": 28293694329600.0, + "grad_norm": 30.953645878050597, + "language_loss": 0.88484204, + "learning_rate": 3.999480934200528e-06, + "loss": 0.90842074, + "num_input_tokens_seen": 12986705, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 0.61035156, + "step": 616, + "time_per_iteration": 2.6970481872558594 + }, + { + "auxiliary_loss_clip": 0.0171754, + "auxiliary_loss_mlp": 0.00619353, + "balance_loss_clip": 1.35251343, + "balance_loss_mlp": 0.56630504, + "epoch": 0.03709604689613708, + "flos": 31504215853440.0, + "grad_norm": 46.508553013642874, + "language_loss": 0.76023853, + "learning_rate": 3.999472023754499e-06, + "loss": 0.78360748, + "num_input_tokens_seen": 13010560, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 0.53100586, + "step": 617, + "time_per_iteration": 2.732340097427368 + }, + { + "auxiliary_loss_clip": 0.01727317, + "auxiliary_loss_mlp": 0.00627676, + "balance_loss_clip": 1.35073292, + "balance_loss_mlp": 0.57312572, + "epoch": 0.03715617014880505, + "flos": 19609381272960.0, + "grad_norm": 10.607218686800634, + "language_loss": 0.87950921, + "learning_rate": 3.99946303748829e-06, + "loss": 0.90305912, + "num_input_tokens_seen": 13028935, + "router_z_loss_clip": 3.76367188, + "router_z_loss_mlp": 0.54541016, + "step": 618, + "time_per_iteration": 2.7103404998779297 + }, + { + "auxiliary_loss_clip": 0.0174649, + "auxiliary_loss_mlp": 0.00582684, + "balance_loss_clip": 1.36922455, + "balance_loss_mlp": 0.52844357, + "epoch": 0.03721629340147302, + "flos": 15924192497280.0, + "grad_norm": 8.639996354184785, + "language_loss": 0.96735764, + "learning_rate": 3.999453975402242e-06, + "loss": 0.9906494, + "num_input_tokens_seen": 13046000, + "router_z_loss_clip": 3.77148438, + "router_z_loss_mlp": 0.54199219, + "step": 619, + "time_per_iteration": 2.732637882232666 + }, + { + "auxiliary_loss_clip": 0.01757406, + "auxiliary_loss_mlp": 0.00582804, + "balance_loss_clip": 1.37296748, + "balance_loss_mlp": 0.53008997, + "epoch": 0.03727641665414099, + "flos": 21104090951040.0, + "grad_norm": 119.11059251748637, + "language_loss": 1.0224694, + "learning_rate": 3.9994448374967e-06, + "loss": 1.0458715, + "num_input_tokens_seen": 13062995, + "router_z_loss_clip": 3.83984375, + "router_z_loss_mlp": 0.52709961, + "step": 620, + "time_per_iteration": 2.612511157989502 + }, + { + "auxiliary_loss_clip": 0.01769647, + "auxiliary_loss_mlp": 0.00536909, + "balance_loss_clip": 1.38318586, + "balance_loss_mlp": 0.48820043, + "epoch": 0.037336539906808956, + "flos": 24131683486080.0, + "grad_norm": 142.34343113389593, + "language_loss": 0.84162456, + "learning_rate": 3.999435623772008e-06, + "loss": 0.86469018, + "num_input_tokens_seen": 13084120, + "router_z_loss_clip": 3.86328125, + "router_z_loss_mlp": 0.48730469, + "step": 621, + "time_per_iteration": 2.6906991004943848 + }, + { + "auxiliary_loss_clip": 0.0178301, + "auxiliary_loss_mlp": 0.00521567, + "balance_loss_clip": 1.4010098, + "balance_loss_mlp": 0.47529027, + "epoch": 0.03739666315947693, + "flos": 22346384780160.0, + "grad_norm": 6.52740370431409, + "language_loss": 0.93270314, + "learning_rate": 3.999426334228518e-06, + "loss": 0.95574898, + "num_input_tokens_seen": 13100035, + "router_z_loss_clip": 3.82421875, + "router_z_loss_mlp": 0.46313477, + "step": 622, + "time_per_iteration": 2.644049644470215 + }, + { + "auxiliary_loss_clip": 0.01765069, + "auxiliary_loss_mlp": 0.00497296, + "balance_loss_clip": 1.38324809, + "balance_loss_mlp": 0.4536888, + "epoch": 0.0374567864121449, + "flos": 20449511452800.0, + "grad_norm": 19.174729412505698, + "language_loss": 0.96270442, + "learning_rate": 3.999416968866581e-06, + "loss": 0.98532814, + "num_input_tokens_seen": 13118070, + "router_z_loss_clip": 3.81445312, + "router_z_loss_mlp": 0.43579102, + "step": 623, + "time_per_iteration": 2.636193037033081 + }, + { + "auxiliary_loss_clip": 0.01802411, + "auxiliary_loss_mlp": 0.00486951, + "balance_loss_clip": 1.41435432, + "balance_loss_mlp": 0.44465533, + "epoch": 0.037516909664812866, + "flos": 19208043636480.0, + "grad_norm": 25.46390177568775, + "language_loss": 0.90066588, + "learning_rate": 3.999407527686551e-06, + "loss": 0.92355943, + "num_input_tokens_seen": 13136355, + "router_z_loss_clip": 3.8828125, + "router_z_loss_mlp": 0.42333984, + "step": 624, + "time_per_iteration": 2.6942198276519775 + }, + { + "auxiliary_loss_clip": 0.01818944, + "auxiliary_loss_mlp": 0.00473283, + "balance_loss_clip": 1.42042422, + "balance_loss_mlp": 0.43160778, + "epoch": 0.03757703291748084, + "flos": 35005218664320.0, + "grad_norm": 13.668658410713816, + "language_loss": 0.75106621, + "learning_rate": 3.999398010688788e-06, + "loss": 0.77398849, + "num_input_tokens_seen": 13155435, + "router_z_loss_clip": 3.9765625, + "router_z_loss_mlp": 0.41650391, + "step": 625, + "time_per_iteration": 2.7894694805145264 + }, + { + "auxiliary_loss_clip": 0.01836122, + "auxiliary_loss_mlp": 0.0044201, + "balance_loss_clip": 1.4400301, + "balance_loss_mlp": 0.4016462, + "epoch": 0.0376371561701488, + "flos": 25483899911040.0, + "grad_norm": 20.31428238383766, + "language_loss": 0.85251456, + "learning_rate": 3.999388417873652e-06, + "loss": 0.87529588, + "num_input_tokens_seen": 13174295, + "router_z_loss_clip": 3.95507812, + "router_z_loss_mlp": 0.40356445, + "step": 626, + "time_per_iteration": 2.7474262714385986 + }, + { + "auxiliary_loss_clip": 0.01879358, + "auxiliary_loss_mlp": 0.00442147, + "balance_loss_clip": 1.46245313, + "balance_loss_mlp": 0.40183067, + "epoch": 0.037697279422816775, + "flos": 18185630912640.0, + "grad_norm": 61.61251114623347, + "language_loss": 0.86823785, + "learning_rate": 3.999378749241506e-06, + "loss": 0.89145291, + "num_input_tokens_seen": 13192500, + "router_z_loss_clip": 4.171875, + "router_z_loss_mlp": 0.40283203, + "step": 627, + "time_per_iteration": 2.615792751312256 + }, + { + "auxiliary_loss_clip": 0.01890805, + "auxiliary_loss_mlp": 0.00479059, + "balance_loss_clip": 1.47144294, + "balance_loss_mlp": 0.4370015, + "epoch": 0.03775740267548475, + "flos": 24644272521600.0, + "grad_norm": 8.942604614822011, + "language_loss": 0.94272935, + "learning_rate": 3.999369004792719e-06, + "loss": 0.96642804, + "num_input_tokens_seen": 13213470, + "router_z_loss_clip": 4.1953125, + "router_z_loss_mlp": 0.4206543, + "step": 628, + "time_per_iteration": 2.720594882965088 + }, + { + "auxiliary_loss_clip": 0.0189625, + "auxiliary_loss_mlp": 0.00424298, + "balance_loss_clip": 1.4767189, + "balance_loss_mlp": 0.38784418, + "epoch": 0.03781752592815271, + "flos": 21288205088640.0, + "grad_norm": 9.292827658801787, + "language_loss": 0.86653376, + "learning_rate": 3.999359184527658e-06, + "loss": 0.88973922, + "num_input_tokens_seen": 13232365, + "router_z_loss_clip": 4.1953125, + "router_z_loss_mlp": 0.36450195, + "step": 629, + "time_per_iteration": 2.5990638732910156 + }, + { + "auxiliary_loss_clip": 0.0192087, + "auxiliary_loss_mlp": 0.00424266, + "balance_loss_clip": 1.49892771, + "balance_loss_mlp": 0.38826507, + "epoch": 0.037877649180820684, + "flos": 22089623385600.0, + "grad_norm": 14.134274765357928, + "language_loss": 0.84481788, + "learning_rate": 3.999349288446696e-06, + "loss": 0.86826921, + "num_input_tokens_seen": 13251920, + "router_z_loss_clip": 4.2265625, + "router_z_loss_mlp": 0.35986328, + "step": 630, + "time_per_iteration": 2.722088575363159 + }, + { + "auxiliary_loss_clip": 0.01935542, + "auxiliary_loss_mlp": 0.00425377, + "balance_loss_clip": 1.5198946, + "balance_loss_mlp": 0.39013892, + "epoch": 0.03793777243348865, + "flos": 14501339976960.0, + "grad_norm": 30.062075033987398, + "language_loss": 1.01996052, + "learning_rate": 3.99933931655021e-06, + "loss": 1.0435698, + "num_input_tokens_seen": 13267440, + "router_z_loss_clip": 4.15234375, + "router_z_loss_mlp": 0.35253906, + "step": 631, + "time_per_iteration": 2.595613479614258 + }, + { + "auxiliary_loss_clip": 0.01908467, + "auxiliary_loss_mlp": 0.00387605, + "balance_loss_clip": 1.51143909, + "balance_loss_mlp": 0.35508439, + "epoch": 0.03799789568615662, + "flos": 21908418249600.0, + "grad_norm": 319.81103059709346, + "language_loss": 0.97060728, + "learning_rate": 3.999329268838575e-06, + "loss": 0.99356794, + "num_input_tokens_seen": 13287850, + "router_z_loss_clip": 3.97265625, + "router_z_loss_mlp": 0.32519531, + "step": 632, + "time_per_iteration": 2.67881441116333 + }, + { + "auxiliary_loss_clip": 0.01943401, + "auxiliary_loss_mlp": 0.0038268, + "balance_loss_clip": 1.54486537, + "balance_loss_mlp": 0.34973049, + "epoch": 0.03805801893882459, + "flos": 24827021942400.0, + "grad_norm": 31.102081817193174, + "language_loss": 0.89858079, + "learning_rate": 3.999319145312175e-06, + "loss": 0.92184156, + "num_input_tokens_seen": 13307760, + "router_z_loss_clip": 3.99023438, + "router_z_loss_mlp": 0.3293457, + "step": 633, + "time_per_iteration": 2.617716073989868 + }, + { + "auxiliary_loss_clip": 0.01964605, + "auxiliary_loss_mlp": 0.00376556, + "balance_loss_clip": 1.56341171, + "balance_loss_mlp": 0.34019747, + "epoch": 0.03811814219149256, + "flos": 30482952364800.0, + "grad_norm": 11.492085796734942, + "language_loss": 0.76998353, + "learning_rate": 3.999308945971392e-06, + "loss": 0.79339516, + "num_input_tokens_seen": 13331230, + "router_z_loss_clip": 4.00976562, + "router_z_loss_mlp": 0.36352539, + "step": 634, + "time_per_iteration": 2.7283332347869873 + }, + { + "auxiliary_loss_clip": 0.01774151, + "auxiliary_loss_mlp": 0.00314401, + "balance_loss_clip": 1.40765691, + "balance_loss_mlp": 0.29742548, + "epoch": 0.03817826544416053, + "flos": 66992577379200.0, + "grad_norm": 1.1335412237511113, + "language_loss": 0.61858273, + "learning_rate": 3.999298670816614e-06, + "loss": 0.63946825, + "num_input_tokens_seen": 13394760, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 0.16992188, + "step": 635, + "time_per_iteration": 3.142587423324585 + }, + { + "auxiliary_loss_clip": 0.01925258, + "auxiliary_loss_mlp": 0.00345045, + "balance_loss_clip": 1.54288304, + "balance_loss_mlp": 0.31097507, + "epoch": 0.038238388696828496, + "flos": 20485350247680.0, + "grad_norm": 27.847161150631294, + "language_loss": 0.93111575, + "learning_rate": 3.9992883198482294e-06, + "loss": 0.95381874, + "num_input_tokens_seen": 13412775, + "router_z_loss_clip": 3.82226562, + "router_z_loss_mlp": 0.34057617, + "step": 636, + "time_per_iteration": 2.62214994430542 + }, + { + "auxiliary_loss_clip": 0.0197474, + "auxiliary_loss_mlp": 0.00392603, + "balance_loss_clip": 1.58592582, + "balance_loss_mlp": 0.35564813, + "epoch": 0.03829851194949647, + "flos": 17965893461760.0, + "grad_norm": 5.093947573232372, + "language_loss": 0.93186462, + "learning_rate": 3.999277893066632e-06, + "loss": 0.95553803, + "num_input_tokens_seen": 13427835, + "router_z_loss_clip": 3.88671875, + "router_z_loss_mlp": 0.36938477, + "step": 637, + "time_per_iteration": 2.567892551422119 + }, + { + "auxiliary_loss_clip": 0.01955957, + "auxiliary_loss_mlp": 0.00345063, + "balance_loss_clip": 1.58456826, + "balance_loss_mlp": 0.31127918, + "epoch": 0.03835863520216444, + "flos": 22456522857600.0, + "grad_norm": 17.486262876536127, + "language_loss": 0.92828321, + "learning_rate": 3.999267390472215e-06, + "loss": 0.95129347, + "num_input_tokens_seen": 13447295, + "router_z_loss_clip": 3.7109375, + "router_z_loss_mlp": 0.33813477, + "step": 638, + "time_per_iteration": 2.673048257827759 + }, + { + "auxiliary_loss_clip": 0.019692, + "auxiliary_loss_mlp": 0.00364066, + "balance_loss_clip": 1.58900356, + "balance_loss_mlp": 0.32837445, + "epoch": 0.038418758454832405, + "flos": 22164425458560.0, + "grad_norm": 14.723795914872971, + "language_loss": 0.79829764, + "learning_rate": 3.999256812065381e-06, + "loss": 0.8216303, + "num_input_tokens_seen": 13468455, + "router_z_loss_clip": 3.80273438, + "router_z_loss_mlp": 0.35693359, + "step": 639, + "time_per_iteration": 2.820840358734131 + }, + { + "auxiliary_loss_clip": 0.01961481, + "auxiliary_loss_mlp": 0.00390971, + "balance_loss_clip": 1.58266592, + "balance_loss_mlp": 0.3518703, + "epoch": 0.03847888170750038, + "flos": 22747435107840.0, + "grad_norm": 125.71937525632016, + "language_loss": 0.94800961, + "learning_rate": 3.999246157846526e-06, + "loss": 0.97153413, + "num_input_tokens_seen": 13489085, + "router_z_loss_clip": 3.78515625, + "router_z_loss_mlp": 0.39111328, + "step": 640, + "time_per_iteration": 2.6748476028442383 + }, + { + "auxiliary_loss_clip": 0.01950324, + "auxiliary_loss_mlp": 0.00420923, + "balance_loss_clip": 1.58119774, + "balance_loss_mlp": 0.38377696, + "epoch": 0.03853900496016834, + "flos": 22711201263360.0, + "grad_norm": 18.150582403006332, + "language_loss": 0.91801643, + "learning_rate": 3.9992354278160574e-06, + "loss": 0.94172889, + "num_input_tokens_seen": 13509120, + "router_z_loss_clip": 3.68554688, + "router_z_loss_mlp": 0.37109375, + "step": 641, + "time_per_iteration": 2.7279531955718994 + }, + { + "auxiliary_loss_clip": 0.01684038, + "auxiliary_loss_mlp": 0.00286559, + "balance_loss_clip": 1.37494826, + "balance_loss_mlp": 0.27025115, + "epoch": 0.038599128212836314, + "flos": 70399136355840.0, + "grad_norm": 0.8857152202062817, + "language_loss": 0.64668989, + "learning_rate": 3.999224621974381e-06, + "loss": 0.6663959, + "num_input_tokens_seen": 13562005, + "router_z_loss_clip": 3.09375, + "router_z_loss_mlp": 0.16308594, + "step": 642, + "time_per_iteration": 3.111320972442627 + }, + { + "auxiliary_loss_clip": 0.01911382, + "auxiliary_loss_mlp": 0.00539412, + "balance_loss_clip": 1.55162835, + "balance_loss_mlp": 0.50012088, + "epoch": 0.03865925146550429, + "flos": 23295144666240.0, + "grad_norm": 81.3274848875477, + "language_loss": 0.86299127, + "learning_rate": 3.999213740321906e-06, + "loss": 0.88749921, + "num_input_tokens_seen": 13582185, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 0.39282227, + "step": 643, + "time_per_iteration": 2.669640064239502 + }, + { + "auxiliary_loss_clip": 0.0193515, + "auxiliary_loss_mlp": 0.00547202, + "balance_loss_clip": 1.57247365, + "balance_loss_mlp": 0.50853086, + "epoch": 0.03871937471817225, + "flos": 21430446946560.0, + "grad_norm": 282.34850671061906, + "language_loss": 0.88534284, + "learning_rate": 3.999202782859046e-06, + "loss": 0.91016638, + "num_input_tokens_seen": 13599555, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 0.38671875, + "step": 644, + "time_per_iteration": 2.654322862625122 + }, + { + "auxiliary_loss_clip": 0.01974262, + "auxiliary_loss_mlp": 0.00568553, + "balance_loss_clip": 1.61236489, + "balance_loss_mlp": 0.53023887, + "epoch": 0.038779497970840224, + "flos": 34277309550720.0, + "grad_norm": 375.5361713733268, + "language_loss": 0.91128975, + "learning_rate": 3.9991917495862165e-06, + "loss": 0.93671787, + "num_input_tokens_seen": 13621160, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 0.38256836, + "step": 645, + "time_per_iteration": 2.7381434440612793 + }, + { + "auxiliary_loss_clip": 0.01999831, + "auxiliary_loss_mlp": 0.00593107, + "balance_loss_clip": 1.61336291, + "balance_loss_mlp": 0.55248082, + "epoch": 0.03883962122350819, + "flos": 22748189293440.0, + "grad_norm": 3.5665763313601313, + "language_loss": 0.88968921, + "learning_rate": 3.9991806405038345e-06, + "loss": 0.9156186, + "num_input_tokens_seen": 13641915, + "router_z_loss_clip": 3.8671875, + "router_z_loss_mlp": 0.40649414, + "step": 646, + "time_per_iteration": 2.6984643936157227 + }, + { + "auxiliary_loss_clip": 0.01975735, + "auxiliary_loss_mlp": 0.00652024, + "balance_loss_clip": 1.58888066, + "balance_loss_mlp": 0.60579473, + "epoch": 0.03889974447617616, + "flos": 21945837242880.0, + "grad_norm": 71.70577580328208, + "language_loss": 0.88167202, + "learning_rate": 3.999169455612323e-06, + "loss": 0.90794969, + "num_input_tokens_seen": 13661410, + "router_z_loss_clip": 3.8671875, + "router_z_loss_mlp": 0.46191406, + "step": 647, + "time_per_iteration": 2.7149605751037598 + }, + { + "auxiliary_loss_clip": 0.01986583, + "auxiliary_loss_mlp": 0.0058033, + "balance_loss_clip": 1.59928608, + "balance_loss_mlp": 0.53834498, + "epoch": 0.03895986772884413, + "flos": 31504826384640.0, + "grad_norm": 233.7051180166742, + "language_loss": 0.92008734, + "learning_rate": 3.999158194912106e-06, + "loss": 0.94575655, + "num_input_tokens_seen": 13681705, + "router_z_loss_clip": 3.86523438, + "router_z_loss_mlp": 0.42016602, + "step": 648, + "time_per_iteration": 2.7556378841400146 + }, + { + "auxiliary_loss_clip": 0.01996342, + "auxiliary_loss_mlp": 0.00608746, + "balance_loss_clip": 1.60296428, + "balance_loss_mlp": 0.56406641, + "epoch": 0.0390199909815121, + "flos": 19901011795200.0, + "grad_norm": 49.411917250117355, + "language_loss": 0.90366793, + "learning_rate": 3.9991468584036086e-06, + "loss": 0.92971885, + "num_input_tokens_seen": 13700400, + "router_z_loss_clip": 3.93554688, + "router_z_loss_mlp": 0.44726562, + "step": 649, + "time_per_iteration": 2.6371541023254395 + }, + { + "auxiliary_loss_clip": 0.02046406, + "auxiliary_loss_mlp": 0.00562397, + "balance_loss_clip": 1.62874031, + "balance_loss_mlp": 0.51895672, + "epoch": 0.03908011423418007, + "flos": 21612478095360.0, + "grad_norm": 65.50023558447242, + "language_loss": 0.8566522, + "learning_rate": 3.999135446087263e-06, + "loss": 0.88274014, + "num_input_tokens_seen": 13720145, + "router_z_loss_clip": 4.17382812, + "router_z_loss_mlp": 0.43457031, + "step": 650, + "time_per_iteration": 2.7042124271392822 + }, + { + "auxiliary_loss_clip": 0.02074033, + "auxiliary_loss_mlp": 0.00578286, + "balance_loss_clip": 1.63805676, + "balance_loss_mlp": 0.53408337, + "epoch": 0.039140237486848035, + "flos": 18661411486080.0, + "grad_norm": 6416.836453017863, + "language_loss": 0.85550672, + "learning_rate": 3.9991239579635e-06, + "loss": 0.88202989, + "num_input_tokens_seen": 13737500, + "router_z_loss_clip": 4.359375, + "router_z_loss_mlp": 0.44213867, + "step": 651, + "time_per_iteration": 4.092526197433472 + }, + { + "auxiliary_loss_clip": 0.02093141, + "auxiliary_loss_mlp": 0.0055894, + "balance_loss_clip": 1.64628017, + "balance_loss_mlp": 0.51123238, + "epoch": 0.03920036073951601, + "flos": 18661124177280.0, + "grad_norm": 251.2964613630038, + "language_loss": 0.96829975, + "learning_rate": 3.999112394032757e-06, + "loss": 0.99482059, + "num_input_tokens_seen": 13754750, + "router_z_loss_clip": 4.46875, + "router_z_loss_mlp": 0.47729492, + "step": 652, + "time_per_iteration": 4.000746488571167 + }, + { + "auxiliary_loss_clip": 0.020829, + "auxiliary_loss_mlp": 0.00643139, + "balance_loss_clip": 1.63088882, + "balance_loss_mlp": 0.59152174, + "epoch": 0.03926048399218398, + "flos": 31354468053120.0, + "grad_norm": 44.15598492436149, + "language_loss": 0.88257718, + "learning_rate": 3.999100754295471e-06, + "loss": 0.9098376, + "num_input_tokens_seen": 13771990, + "router_z_loss_clip": 4.51953125, + "router_z_loss_mlp": 0.51635742, + "step": 653, + "time_per_iteration": 4.055376291275024 + }, + { + "auxiliary_loss_clip": 0.02156269, + "auxiliary_loss_mlp": 0.00569803, + "balance_loss_clip": 1.6559484, + "balance_loss_mlp": 0.5228582, + "epoch": 0.039320607244851945, + "flos": 29603499770880.0, + "grad_norm": 19.97538381348395, + "language_loss": 0.94763261, + "learning_rate": 3.999089038752085e-06, + "loss": 0.97489333, + "num_input_tokens_seen": 13792750, + "router_z_loss_clip": 5.00390625, + "router_z_loss_mlp": 0.46948242, + "step": 654, + "time_per_iteration": 2.732017993927002 + }, + { + "auxiliary_loss_clip": 0.01760074, + "auxiliary_loss_mlp": 0.00357257, + "balance_loss_clip": 1.46064818, + "balance_loss_mlp": 0.33856481, + "epoch": 0.03938073049751992, + "flos": 66534609951360.0, + "grad_norm": 0.8172297183611896, + "language_loss": 0.49776557, + "learning_rate": 3.999077247403041e-06, + "loss": 0.51893884, + "num_input_tokens_seen": 13858570, + "router_z_loss_clip": 3.0, + "router_z_loss_mlp": 0.18652344, + "step": 655, + "time_per_iteration": 3.168562650680542 + }, + { + "auxiliary_loss_clip": 0.0217516, + "auxiliary_loss_mlp": 0.00553205, + "balance_loss_clip": 1.68573654, + "balance_loss_mlp": 0.51083755, + "epoch": 0.03944085375018788, + "flos": 23367827836800.0, + "grad_norm": 87.3725371008475, + "language_loss": 0.87139708, + "learning_rate": 3.9990653802487886e-06, + "loss": 0.89868075, + "num_input_tokens_seen": 13876335, + "router_z_loss_clip": 4.890625, + "router_z_loss_mlp": 0.42333984, + "step": 656, + "time_per_iteration": 2.665041923522949 + }, + { + "auxiliary_loss_clip": 0.0219256, + "auxiliary_loss_mlp": 0.00627665, + "balance_loss_clip": 1.66923535, + "balance_loss_mlp": 0.57251883, + "epoch": 0.039500977002855854, + "flos": 18548292579840.0, + "grad_norm": 17.533615763992845, + "language_loss": 0.864766, + "learning_rate": 3.999053437289776e-06, + "loss": 0.89296818, + "num_input_tokens_seen": 13892640, + "router_z_loss_clip": 5.2265625, + "router_z_loss_mlp": 0.55175781, + "step": 657, + "time_per_iteration": 2.5272276401519775 + }, + { + "auxiliary_loss_clip": 0.02198626, + "auxiliary_loss_mlp": 0.00574232, + "balance_loss_clip": 1.67796814, + "balance_loss_mlp": 0.52270973, + "epoch": 0.039561100255523826, + "flos": 25338174433920.0, + "grad_norm": 6.444703610725153, + "language_loss": 0.8865844, + "learning_rate": 3.999041418526457e-06, + "loss": 0.91431296, + "num_input_tokens_seen": 13910085, + "router_z_loss_clip": 5.203125, + "router_z_loss_mlp": 0.515625, + "step": 658, + "time_per_iteration": 2.656120777130127 + }, + { + "auxiliary_loss_clip": 0.02268436, + "auxiliary_loss_mlp": 0.00559187, + "balance_loss_clip": 1.71665215, + "balance_loss_mlp": 0.51353031, + "epoch": 0.03962122350819179, + "flos": 18219889509120.0, + "grad_norm": 52.69173536888343, + "language_loss": 0.98754567, + "learning_rate": 3.999029323959287e-06, + "loss": 1.01582193, + "num_input_tokens_seen": 13928800, + "router_z_loss_clip": 5.51171875, + "router_z_loss_mlp": 0.45654297, + "step": 659, + "time_per_iteration": 2.587963342666626 + }, + { + "auxiliary_loss_clip": 0.02405383, + "auxiliary_loss_mlp": 0.00527192, + "balance_loss_clip": 1.77901661, + "balance_loss_mlp": 0.48253635, + "epoch": 0.03968134676085976, + "flos": 20522230536960.0, + "grad_norm": 19.331217708857565, + "language_loss": 0.86348087, + "learning_rate": 3.999017153588724e-06, + "loss": 0.89280665, + "num_input_tokens_seen": 13948325, + "router_z_loss_clip": 6.2578125, + "router_z_loss_mlp": 0.44628906, + "step": 660, + "time_per_iteration": 2.6425647735595703 + }, + { + "auxiliary_loss_clip": 0.02507957, + "auxiliary_loss_mlp": 0.00552722, + "balance_loss_clip": 1.80635357, + "balance_loss_mlp": 0.50575352, + "epoch": 0.03974147001352773, + "flos": 22422587483520.0, + "grad_norm": 22.779691536193326, + "language_loss": 0.88695961, + "learning_rate": 3.999004907415231e-06, + "loss": 0.91756642, + "num_input_tokens_seen": 13969090, + "router_z_loss_clip": 7.01953125, + "router_z_loss_mlp": 0.46948242, + "step": 661, + "time_per_iteration": 2.625096082687378 + }, + { + "auxiliary_loss_clip": 0.02546688, + "auxiliary_loss_mlp": 0.00585067, + "balance_loss_clip": 1.85607219, + "balance_loss_mlp": 0.54863662, + "epoch": 0.0398015932661957, + "flos": 71128769322240.0, + "grad_norm": 1.2247738578318161, + "language_loss": 0.69276673, + "learning_rate": 3.998992585439272e-06, + "loss": 0.72408432, + "num_input_tokens_seen": 14037555, + "router_z_loss_clip": 6.90625, + "router_z_loss_mlp": 0.36523438, + "step": 662, + "time_per_iteration": 3.2324178218841553 + }, + { + "auxiliary_loss_clip": 0.0262099, + "auxiliary_loss_mlp": 0.00587252, + "balance_loss_clip": 1.81714535, + "balance_loss_mlp": 0.53620648, + "epoch": 0.03986171651886367, + "flos": 16800951571200.0, + "grad_norm": 3.426026851534422, + "language_loss": 0.89582694, + "learning_rate": 3.998980187661314e-06, + "loss": 0.92790931, + "num_input_tokens_seen": 14055765, + "router_z_loss_clip": 8.05078125, + "router_z_loss_mlp": 0.50952148, + "step": 663, + "time_per_iteration": 2.6464409828186035 + }, + { + "auxiliary_loss_clip": 0.02556614, + "auxiliary_loss_mlp": 0.00595794, + "balance_loss_clip": 1.77946091, + "balance_loss_mlp": 0.54629874, + "epoch": 0.03992183977153164, + "flos": 24535068197760.0, + "grad_norm": 25.994257837945693, + "language_loss": 0.95952493, + "learning_rate": 3.998967714081826e-06, + "loss": 0.99104899, + "num_input_tokens_seen": 14074195, + "router_z_loss_clip": 7.7734375, + "router_z_loss_mlp": 0.49462891, + "step": 664, + "time_per_iteration": 2.6639349460601807 + }, + { + "auxiliary_loss_clip": 0.02449348, + "auxiliary_loss_mlp": 0.00512162, + "balance_loss_clip": 1.75904465, + "balance_loss_mlp": 0.4702003, + "epoch": 0.03998196302419961, + "flos": 15595897167360.0, + "grad_norm": 5.051422469954378, + "language_loss": 0.90852118, + "learning_rate": 3.998955164701281e-06, + "loss": 0.93813622, + "num_input_tokens_seen": 14090215, + "router_z_loss_clip": 6.9140625, + "router_z_loss_mlp": 0.41967773, + "step": 665, + "time_per_iteration": 2.7399356365203857 + }, + { + "auxiliary_loss_clip": 0.02383126, + "auxiliary_loss_mlp": 0.00577178, + "balance_loss_clip": 1.7154125, + "balance_loss_mlp": 0.52923214, + "epoch": 0.04004208627686758, + "flos": 25305065072640.0, + "grad_norm": 94.92558934825564, + "language_loss": 0.87820464, + "learning_rate": 3.998942539520158e-06, + "loss": 0.90780765, + "num_input_tokens_seen": 14112150, + "router_z_loss_clip": 6.68359375, + "router_z_loss_mlp": 0.47973633, + "step": 666, + "time_per_iteration": 2.6586272716522217 + }, + { + "auxiliary_loss_clip": 0.02263394, + "auxiliary_loss_mlp": 0.00536666, + "balance_loss_clip": 1.66534281, + "balance_loss_mlp": 0.49472791, + "epoch": 0.04010220952953555, + "flos": 23475847011840.0, + "grad_norm": 73.0874144190832, + "language_loss": 0.93097818, + "learning_rate": 3.998929838538932e-06, + "loss": 0.95897877, + "num_input_tokens_seen": 14131475, + "router_z_loss_clip": 5.984375, + "router_z_loss_mlp": 0.41943359, + "step": 667, + "time_per_iteration": 2.675675392150879 + }, + { + "auxiliary_loss_clip": 0.02175935, + "auxiliary_loss_mlp": 0.0052408, + "balance_loss_clip": 1.62860155, + "balance_loss_mlp": 0.48393008, + "epoch": 0.04016233278220352, + "flos": 18617025254400.0, + "grad_norm": 15.27651906351232, + "language_loss": 0.88229167, + "learning_rate": 3.998917061758087e-06, + "loss": 0.9092918, + "num_input_tokens_seen": 14146165, + "router_z_loss_clip": 5.47265625, + "router_z_loss_mlp": 0.40112305, + "step": 668, + "time_per_iteration": 2.5493245124816895 + }, + { + "auxiliary_loss_clip": 0.01805453, + "auxiliary_loss_mlp": 0.00210456, + "balance_loss_clip": 1.50548339, + "balance_loss_mlp": 0.19472075, + "epoch": 0.040222456034871484, + "flos": 70906194696960.0, + "grad_norm": 2.6400101795725646, + "language_loss": 0.61212009, + "learning_rate": 3.998904209178107e-06, + "loss": 0.63227922, + "num_input_tokens_seen": 14215005, + "router_z_loss_clip": 3.0, + "router_z_loss_mlp": 0.15722656, + "step": 669, + "time_per_iteration": 3.2700488567352295 + }, + { + "auxiliary_loss_clip": 0.02112235, + "auxiliary_loss_mlp": 0.00540982, + "balance_loss_clip": 1.59481025, + "balance_loss_mlp": 0.49904376, + "epoch": 0.040282579287539456, + "flos": 23764712186880.0, + "grad_norm": 21.999949832904544, + "language_loss": 0.93456936, + "learning_rate": 3.9988912807994785e-06, + "loss": 0.96110153, + "num_input_tokens_seen": 14235510, + "router_z_loss_clip": 5.171875, + "router_z_loss_mlp": 0.41943359, + "step": 670, + "time_per_iteration": 2.687681198120117 + }, + { + "auxiliary_loss_clip": 0.02034449, + "auxiliary_loss_mlp": 0.00530567, + "balance_loss_clip": 1.5695318, + "balance_loss_mlp": 0.48772344, + "epoch": 0.04034270254020743, + "flos": 18478518410880.0, + "grad_norm": 30.833397007559316, + "language_loss": 0.81083345, + "learning_rate": 3.998878276622692e-06, + "loss": 0.83648366, + "num_input_tokens_seen": 14254565, + "router_z_loss_clip": 4.64453125, + "router_z_loss_mlp": 0.42797852, + "step": 671, + "time_per_iteration": 2.6715080738067627 + }, + { + "auxiliary_loss_clip": 0.01952451, + "auxiliary_loss_mlp": 0.00519614, + "balance_loss_clip": 1.52733755, + "balance_loss_mlp": 0.47915423, + "epoch": 0.040402825792875394, + "flos": 17201858244480.0, + "grad_norm": 13.706260668472142, + "language_loss": 1.00329661, + "learning_rate": 3.998865196648242e-06, + "loss": 1.02801716, + "num_input_tokens_seen": 14271885, + "router_z_loss_clip": 4.25390625, + "router_z_loss_mlp": 0.4050293, + "step": 672, + "time_per_iteration": 2.5770363807678223 + }, + { + "auxiliary_loss_clip": 0.01883741, + "auxiliary_loss_mlp": 0.00566891, + "balance_loss_clip": 1.4855001, + "balance_loss_mlp": 0.52242589, + "epoch": 0.040462949045543366, + "flos": 19172168928000.0, + "grad_norm": 4.146258687732022, + "language_loss": 0.97702432, + "learning_rate": 3.998852040876622e-06, + "loss": 1.00153065, + "num_input_tokens_seen": 14289670, + "router_z_loss_clip": 3.98632812, + "router_z_loss_mlp": 0.4453125, + "step": 673, + "time_per_iteration": 2.6310653686523438 + }, + { + "auxiliary_loss_clip": 0.01820583, + "auxiliary_loss_mlp": 0.0055822, + "balance_loss_clip": 1.44987381, + "balance_loss_mlp": 0.51227641, + "epoch": 0.04052307229821133, + "flos": 24019821555840.0, + "grad_norm": 26.590680117649466, + "language_loss": 0.82526392, + "learning_rate": 3.998838809308334e-06, + "loss": 0.84905195, + "num_input_tokens_seen": 14309285, + "router_z_loss_clip": 3.70507812, + "router_z_loss_mlp": 0.45947266, + "step": 674, + "time_per_iteration": 2.631953239440918 + }, + { + "auxiliary_loss_clip": 0.01814148, + "auxiliary_loss_mlp": 0.00572547, + "balance_loss_clip": 1.4414711, + "balance_loss_mlp": 0.52603102, + "epoch": 0.0405831955508793, + "flos": 16436601964800.0, + "grad_norm": 15.003727805945285, + "language_loss": 0.85965931, + "learning_rate": 3.9988255019438766e-06, + "loss": 0.88352633, + "num_input_tokens_seen": 14328300, + "router_z_loss_clip": 3.72851562, + "router_z_loss_mlp": 0.46508789, + "step": 675, + "time_per_iteration": 2.5894391536712646 + }, + { + "auxiliary_loss_clip": 0.01759414, + "auxiliary_loss_mlp": 0.00581232, + "balance_loss_clip": 1.41543889, + "balance_loss_mlp": 0.53171241, + "epoch": 0.040643318803547275, + "flos": 24279922915200.0, + "grad_norm": 68.12186848323748, + "language_loss": 0.82364386, + "learning_rate": 3.998812118783757e-06, + "loss": 0.84705031, + "num_input_tokens_seen": 14346395, + "router_z_loss_clip": 3.4375, + "router_z_loss_mlp": 0.49511719, + "step": 676, + "time_per_iteration": 2.678849458694458 + }, + { + "auxiliary_loss_clip": 0.01747955, + "auxiliary_loss_mlp": 0.00632076, + "balance_loss_clip": 1.40351784, + "balance_loss_mlp": 0.57991016, + "epoch": 0.04070344205621524, + "flos": 17712076982400.0, + "grad_norm": 46.45720877019674, + "language_loss": 0.9308188, + "learning_rate": 3.9987986598284804e-06, + "loss": 0.95461917, + "num_input_tokens_seen": 14364605, + "router_z_loss_clip": 3.44335938, + "router_z_loss_mlp": 0.52148438, + "step": 677, + "time_per_iteration": 2.7620902061462402 + }, + { + "auxiliary_loss_clip": 0.01738642, + "auxiliary_loss_mlp": 0.00608979, + "balance_loss_clip": 1.40411758, + "balance_loss_mlp": 0.55919719, + "epoch": 0.04076356530888321, + "flos": 26177658168960.0, + "grad_norm": 4.584290896147419, + "language_loss": 0.83028448, + "learning_rate": 3.998785125078559e-06, + "loss": 0.85376072, + "num_input_tokens_seen": 14385265, + "router_z_loss_clip": 3.34375, + "router_z_loss_mlp": 0.49804688, + "step": 678, + "time_per_iteration": 2.7228546142578125 + }, + { + "auxiliary_loss_clip": 0.01735919, + "auxiliary_loss_mlp": 0.00554663, + "balance_loss_clip": 1.40054727, + "balance_loss_mlp": 0.50917244, + "epoch": 0.04082368856155118, + "flos": 35773455772800.0, + "grad_norm": 10.537941339399541, + "language_loss": 0.88295138, + "learning_rate": 3.998771514534505e-06, + "loss": 0.90585721, + "num_input_tokens_seen": 14406090, + "router_z_loss_clip": 3.35742188, + "router_z_loss_mlp": 0.45483398, + "step": 679, + "time_per_iteration": 2.7209551334381104 + }, + { + "auxiliary_loss_clip": 0.01709327, + "auxiliary_loss_mlp": 0.00571791, + "balance_loss_clip": 1.39651668, + "balance_loss_mlp": 0.5257048, + "epoch": 0.04088381181421915, + "flos": 28146640049280.0, + "grad_norm": 18.583747760518797, + "language_loss": 0.84233713, + "learning_rate": 3.998757828196835e-06, + "loss": 0.86514831, + "num_input_tokens_seen": 14425130, + "router_z_loss_clip": 3.13085938, + "router_z_loss_mlp": 0.46069336, + "step": 680, + "time_per_iteration": 2.648125410079956 + }, + { + "auxiliary_loss_clip": 0.01696908, + "auxiliary_loss_mlp": 0.00593822, + "balance_loss_clip": 1.37453663, + "balance_loss_mlp": 0.5429436, + "epoch": 0.04094393506688712, + "flos": 27597673514880.0, + "grad_norm": 33.38905964533509, + "language_loss": 0.89899087, + "learning_rate": 3.9987440660660685e-06, + "loss": 0.92189819, + "num_input_tokens_seen": 14447355, + "router_z_loss_clip": 3.22460938, + "router_z_loss_mlp": 0.50854492, + "step": 681, + "time_per_iteration": 2.6421959400177 + }, + { + "auxiliary_loss_clip": 0.01701977, + "auxiliary_loss_mlp": 0.00622249, + "balance_loss_clip": 1.37764668, + "balance_loss_mlp": 0.56884283, + "epoch": 0.04100405831955509, + "flos": 23112036109440.0, + "grad_norm": 116.91096025399212, + "language_loss": 0.7770822, + "learning_rate": 3.998730228142726e-06, + "loss": 0.80032444, + "num_input_tokens_seen": 14466790, + "router_z_loss_clip": 3.24804688, + "router_z_loss_mlp": 0.53466797, + "step": 682, + "time_per_iteration": 2.660576343536377 + }, + { + "auxiliary_loss_clip": 0.01677625, + "auxiliary_loss_mlp": 0.0056175, + "balance_loss_clip": 1.37401628, + "balance_loss_mlp": 0.51649773, + "epoch": 0.04106418157222306, + "flos": 20156731695360.0, + "grad_norm": 179.7737758796279, + "language_loss": 0.8009454, + "learning_rate": 3.998716314427333e-06, + "loss": 0.82333916, + "num_input_tokens_seen": 14485195, + "router_z_loss_clip": 3.03320312, + "router_z_loss_mlp": 0.45263672, + "step": 683, + "time_per_iteration": 2.609663963317871 + }, + { + "auxiliary_loss_clip": 0.01667256, + "auxiliary_loss_mlp": 0.00552528, + "balance_loss_clip": 1.36447358, + "balance_loss_mlp": 0.50749028, + "epoch": 0.041124304824891024, + "flos": 17420697855360.0, + "grad_norm": 12.520434985300072, + "language_loss": 0.88856578, + "learning_rate": 3.998702324920417e-06, + "loss": 0.91076362, + "num_input_tokens_seen": 14503370, + "router_z_loss_clip": 3.02734375, + "router_z_loss_mlp": 0.44970703, + "step": 684, + "time_per_iteration": 2.6338376998901367 + }, + { + "auxiliary_loss_clip": 0.01663224, + "auxiliary_loss_mlp": 0.00630728, + "balance_loss_clip": 1.35930037, + "balance_loss_mlp": 0.5793013, + "epoch": 0.041184428077558996, + "flos": 25780163287680.0, + "grad_norm": 11.469153130522159, + "language_loss": 0.96790755, + "learning_rate": 3.9986882596225085e-06, + "loss": 0.99084711, + "num_input_tokens_seen": 14526415, + "router_z_loss_clip": 3.0390625, + "router_z_loss_mlp": 0.51391602, + "step": 685, + "time_per_iteration": 2.708998203277588 + }, + { + "auxiliary_loss_clip": 0.01688141, + "auxiliary_loss_mlp": 0.00614166, + "balance_loss_clip": 1.37076056, + "balance_loss_mlp": 0.56207144, + "epoch": 0.04124455133022697, + "flos": 22964766347520.0, + "grad_norm": 14.618648563099569, + "language_loss": 0.96074629, + "learning_rate": 3.998674118534141e-06, + "loss": 0.9837693, + "num_input_tokens_seen": 14546595, + "router_z_loss_clip": 3.17578125, + "router_z_loss_mlp": 0.52050781, + "step": 686, + "time_per_iteration": 2.6661434173583984 + }, + { + "auxiliary_loss_clip": 0.01688666, + "auxiliary_loss_mlp": 0.00608278, + "balance_loss_clip": 1.36904311, + "balance_loss_mlp": 0.55477679, + "epoch": 0.04130467458289493, + "flos": 21289067015040.0, + "grad_norm": 47.15214115047802, + "language_loss": 0.78831488, + "learning_rate": 3.998659901655851e-06, + "loss": 0.8112843, + "num_input_tokens_seen": 14566590, + "router_z_loss_clip": 3.19726562, + "router_z_loss_mlp": 0.53466797, + "step": 687, + "time_per_iteration": 2.7051823139190674 + }, + { + "auxiliary_loss_clip": 0.01662237, + "auxiliary_loss_mlp": 0.0057823, + "balance_loss_clip": 1.35691285, + "balance_loss_mlp": 0.53157121, + "epoch": 0.041364797835562905, + "flos": 19974233669760.0, + "grad_norm": 24.00543725917011, + "language_loss": 0.92567748, + "learning_rate": 3.998645608988177e-06, + "loss": 0.94808209, + "num_input_tokens_seen": 14585965, + "router_z_loss_clip": 3.05273438, + "router_z_loss_mlp": 0.46606445, + "step": 688, + "time_per_iteration": 2.6573781967163086 + }, + { + "auxiliary_loss_clip": 0.0165701, + "auxiliary_loss_mlp": 0.00598196, + "balance_loss_clip": 1.35321069, + "balance_loss_mlp": 0.55156147, + "epoch": 0.04142492108823087, + "flos": 21906227520000.0, + "grad_norm": 13.382936225765608, + "language_loss": 0.89131337, + "learning_rate": 3.998631240531661e-06, + "loss": 0.91386539, + "num_input_tokens_seen": 14606015, + "router_z_loss_clip": 3.0390625, + "router_z_loss_mlp": 0.46655273, + "step": 689, + "time_per_iteration": 2.713397264480591 + }, + { + "auxiliary_loss_clip": 0.01660731, + "auxiliary_loss_mlp": 0.00596836, + "balance_loss_clip": 1.35020816, + "balance_loss_mlp": 0.55249041, + "epoch": 0.04148504434089884, + "flos": 27639617621760.0, + "grad_norm": 1591.2107716678804, + "language_loss": 0.75028044, + "learning_rate": 3.998616796286848e-06, + "loss": 0.77285612, + "num_input_tokens_seen": 14629955, + "router_z_loss_clip": 3.10351562, + "router_z_loss_mlp": 0.44360352, + "step": 690, + "time_per_iteration": 2.756495952606201 + }, + { + "auxiliary_loss_clip": 0.0166896, + "auxiliary_loss_mlp": 0.00660485, + "balance_loss_clip": 1.35126138, + "balance_loss_mlp": 0.60614944, + "epoch": 0.041545167593566815, + "flos": 20518387781760.0, + "grad_norm": 109.69850579566506, + "language_loss": 0.80587423, + "learning_rate": 3.998602276254286e-06, + "loss": 0.82916868, + "num_input_tokens_seen": 14648000, + "router_z_loss_clip": 3.17578125, + "router_z_loss_mlp": 0.54321289, + "step": 691, + "time_per_iteration": 2.6761515140533447 + }, + { + "auxiliary_loss_clip": 0.01649624, + "auxiliary_loss_mlp": 0.00641622, + "balance_loss_clip": 1.33713126, + "balance_loss_mlp": 0.58945644, + "epoch": 0.04160529084623478, + "flos": 11868907939200.0, + "grad_norm": 73.97961816373382, + "language_loss": 0.90069556, + "learning_rate": 3.998587680434526e-06, + "loss": 0.92360806, + "num_input_tokens_seen": 14662235, + "router_z_loss_clip": 3.125, + "router_z_loss_mlp": 0.52148438, + "step": 692, + "time_per_iteration": 2.59477162361145 + }, + { + "auxiliary_loss_clip": 0.01667166, + "auxiliary_loss_mlp": 0.00702187, + "balance_loss_clip": 1.34054923, + "balance_loss_mlp": 0.64062703, + "epoch": 0.04166541409890275, + "flos": 14828306503680.0, + "grad_norm": 81.53478424440388, + "language_loss": 0.96929157, + "learning_rate": 3.99857300882812e-06, + "loss": 0.99298513, + "num_input_tokens_seen": 14676065, + "router_z_loss_clip": 3.26367188, + "router_z_loss_mlp": 0.61572266, + "step": 693, + "time_per_iteration": 4.127526760101318 + }, + { + "auxiliary_loss_clip": 0.01672766, + "auxiliary_loss_mlp": 0.00628943, + "balance_loss_clip": 1.34850574, + "balance_loss_mlp": 0.57579994, + "epoch": 0.04172553735157072, + "flos": 25808137004160.0, + "grad_norm": 8.49364069899407, + "language_loss": 0.89662707, + "learning_rate": 3.998558261435626e-06, + "loss": 0.91964424, + "num_input_tokens_seen": 14694955, + "router_z_loss_clip": 3.24023438, + "router_z_loss_mlp": 0.53125, + "step": 694, + "time_per_iteration": 4.069854497909546 + }, + { + "auxiliary_loss_clip": 0.01680149, + "auxiliary_loss_mlp": 0.00683672, + "balance_loss_clip": 1.34997547, + "balance_loss_mlp": 0.62499726, + "epoch": 0.04178566060423869, + "flos": 24279815174400.0, + "grad_norm": 4.905552123487546, + "language_loss": 0.9113884, + "learning_rate": 3.9985434382576015e-06, + "loss": 0.93502653, + "num_input_tokens_seen": 14715510, + "router_z_loss_clip": 3.30078125, + "router_z_loss_mlp": 0.5859375, + "step": 695, + "time_per_iteration": 4.090334177017212 + }, + { + "auxiliary_loss_clip": 0.01656277, + "auxiliary_loss_mlp": 0.00651567, + "balance_loss_clip": 1.34279442, + "balance_loss_mlp": 0.59503824, + "epoch": 0.04184578385690666, + "flos": 18222008411520.0, + "grad_norm": 136.01014630787253, + "language_loss": 0.92289627, + "learning_rate": 3.99852853929461e-06, + "loss": 0.94597471, + "num_input_tokens_seen": 14731755, + "router_z_loss_clip": 3.13671875, + "router_z_loss_mlp": 0.56542969, + "step": 696, + "time_per_iteration": 2.578969717025757 + }, + { + "auxiliary_loss_clip": 0.01673582, + "auxiliary_loss_mlp": 0.00670714, + "balance_loss_clip": 1.35682821, + "balance_loss_mlp": 0.61482871, + "epoch": 0.041905907109574626, + "flos": 22776342577920.0, + "grad_norm": 9.189305971287913, + "language_loss": 0.99314475, + "learning_rate": 3.998513564547216e-06, + "loss": 1.01658773, + "num_input_tokens_seen": 14750810, + "router_z_loss_clip": 3.16796875, + "router_z_loss_mlp": 0.55834961, + "step": 697, + "time_per_iteration": 2.6971144676208496 + }, + { + "auxiliary_loss_clip": 0.01679678, + "auxiliary_loss_mlp": 0.00629183, + "balance_loss_clip": 1.35825109, + "balance_loss_mlp": 0.57556295, + "epoch": 0.0419660303622426, + "flos": 20156947176960.0, + "grad_norm": 25.356556580341532, + "language_loss": 0.92936927, + "learning_rate": 3.998498514015987e-06, + "loss": 0.9524579, + "num_input_tokens_seen": 14768435, + "router_z_loss_clip": 3.21679688, + "router_z_loss_mlp": 0.53564453, + "step": 698, + "time_per_iteration": 2.6208012104034424 + }, + { + "auxiliary_loss_clip": 0.01712302, + "auxiliary_loss_mlp": 0.00591885, + "balance_loss_clip": 1.38408935, + "balance_loss_mlp": 0.54415369, + "epoch": 0.042026153614910564, + "flos": 23076376882560.0, + "grad_norm": 23.190362305839166, + "language_loss": 0.96935213, + "learning_rate": 3.998483387701495e-06, + "loss": 0.99239409, + "num_input_tokens_seen": 14786690, + "router_z_loss_clip": 3.28125, + "router_z_loss_mlp": 0.4777832, + "step": 699, + "time_per_iteration": 2.6588175296783447 + }, + { + "auxiliary_loss_clip": 0.01637874, + "auxiliary_loss_mlp": 0.00531743, + "balance_loss_clip": 1.34434748, + "balance_loss_mlp": 0.50217897, + "epoch": 0.042086276867578536, + "flos": 64495243370880.0, + "grad_norm": 0.9196749125804492, + "language_loss": 0.68242824, + "learning_rate": 3.998468185604312e-06, + "loss": 0.70412445, + "num_input_tokens_seen": 14853840, + "router_z_loss_clip": 2.9375, + "router_z_loss_mlp": 0.29492188, + "step": 700, + "time_per_iteration": 3.1739885807037354 + }, + { + "auxiliary_loss_clip": 0.01813639, + "auxiliary_loss_mlp": 0.00546141, + "balance_loss_clip": 1.45208645, + "balance_loss_mlp": 0.49869582, + "epoch": 0.04214640012024651, + "flos": 15487016065920.0, + "grad_norm": 45.69409073707609, + "language_loss": 0.95972288, + "learning_rate": 3.998452907725016e-06, + "loss": 0.98332071, + "num_input_tokens_seen": 14869580, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 0.47436523, + "step": 701, + "time_per_iteration": 2.6863391399383545 + }, + { + "auxiliary_loss_clip": 0.01849197, + "auxiliary_loss_mlp": 0.00590157, + "balance_loss_clip": 1.47140622, + "balance_loss_mlp": 0.54087627, + "epoch": 0.04220652337291447, + "flos": 23877040993920.0, + "grad_norm": 92.53025160928954, + "language_loss": 0.73163611, + "learning_rate": 3.998437554064184e-06, + "loss": 0.75602967, + "num_input_tokens_seen": 14891065, + "router_z_loss_clip": 3.77734375, + "router_z_loss_mlp": 0.49291992, + "step": 702, + "time_per_iteration": 2.688028573989868 + }, + { + "auxiliary_loss_clip": 0.01725033, + "auxiliary_loss_mlp": 0.00353602, + "balance_loss_clip": 1.4130336, + "balance_loss_mlp": 0.33510089, + "epoch": 0.042266646625582445, + "flos": 63795451628160.0, + "grad_norm": 0.9031037050006142, + "language_loss": 0.60757738, + "learning_rate": 3.9984221246224006e-06, + "loss": 0.62836379, + "num_input_tokens_seen": 14954815, + "router_z_loss_clip": 3.125, + "router_z_loss_mlp": 0.18457031, + "step": 703, + "time_per_iteration": 3.1882622241973877 + }, + { + "auxiliary_loss_clip": 0.01709145, + "auxiliary_loss_mlp": 0.00248237, + "balance_loss_clip": 1.40468955, + "balance_loss_mlp": 0.22954522, + "epoch": 0.04232676987825041, + "flos": 50018863345920.0, + "grad_norm": 1.0315781545076872, + "language_loss": 0.57778203, + "learning_rate": 3.9984066194002494e-06, + "loss": 0.59735584, + "num_input_tokens_seen": 15003050, + "router_z_loss_clip": 3.046875, + "router_z_loss_mlp": 0.18652344, + "step": 704, + "time_per_iteration": 3.0621256828308105 + }, + { + "auxiliary_loss_clip": 0.01926917, + "auxiliary_loss_mlp": 0.00576541, + "balance_loss_clip": 1.51088285, + "balance_loss_mlp": 0.52458984, + "epoch": 0.04238689313091838, + "flos": 21616105368960.0, + "grad_norm": 519.3402419437504, + "language_loss": 0.94500351, + "learning_rate": 3.998391038398319e-06, + "loss": 0.97003806, + "num_input_tokens_seen": 15021990, + "router_z_loss_clip": 4.1640625, + "router_z_loss_mlp": 0.51928711, + "step": 705, + "time_per_iteration": 2.6108241081237793 + }, + { + "auxiliary_loss_clip": 0.01919147, + "auxiliary_loss_mlp": 0.0053508, + "balance_loss_clip": 1.50891423, + "balance_loss_mlp": 0.48887488, + "epoch": 0.042447016383586354, + "flos": 19135109070720.0, + "grad_norm": 26.816510880888114, + "language_loss": 0.78098452, + "learning_rate": 3.998375381617201e-06, + "loss": 0.80552673, + "num_input_tokens_seen": 15040700, + "router_z_loss_clip": 4.10351562, + "router_z_loss_mlp": 0.4621582, + "step": 706, + "time_per_iteration": 2.639017343521118 + }, + { + "auxiliary_loss_clip": 0.01917584, + "auxiliary_loss_mlp": 0.00536565, + "balance_loss_clip": 1.50685763, + "balance_loss_mlp": 0.48604369, + "epoch": 0.04250713963625432, + "flos": 24426007528320.0, + "grad_norm": 5.315494052513124, + "language_loss": 0.98393273, + "learning_rate": 3.9983596490574875e-06, + "loss": 1.00847423, + "num_input_tokens_seen": 15056725, + "router_z_loss_clip": 4.11328125, + "router_z_loss_mlp": 0.50561523, + "step": 707, + "time_per_iteration": 2.6399965286254883 + }, + { + "auxiliary_loss_clip": 0.01915653, + "auxiliary_loss_mlp": 0.00545442, + "balance_loss_clip": 1.50004673, + "balance_loss_mlp": 0.49284708, + "epoch": 0.04256726288892229, + "flos": 30367391333760.0, + "grad_norm": 39.95717743980437, + "language_loss": 0.88288164, + "learning_rate": 3.998343840719776e-06, + "loss": 0.90749264, + "num_input_tokens_seen": 15077550, + "router_z_loss_clip": 4.15625, + "router_z_loss_mlp": 0.52636719, + "step": 708, + "time_per_iteration": 2.7736308574676514 + }, + { + "auxiliary_loss_clip": 0.01901422, + "auxiliary_loss_mlp": 0.0057718, + "balance_loss_clip": 1.48729539, + "balance_loss_mlp": 0.52315468, + "epoch": 0.04262738614159026, + "flos": 16362661818240.0, + "grad_norm": 58.26147947441614, + "language_loss": 0.90113723, + "learning_rate": 3.998327956604666e-06, + "loss": 0.92592323, + "num_input_tokens_seen": 15094955, + "router_z_loss_clip": 4.140625, + "router_z_loss_mlp": 0.54077148, + "step": 709, + "time_per_iteration": 2.606072187423706 + }, + { + "auxiliary_loss_clip": 0.0192841, + "auxiliary_loss_mlp": 0.00660778, + "balance_loss_clip": 1.50481486, + "balance_loss_mlp": 0.59883654, + "epoch": 0.04268750939425823, + "flos": 20412379768320.0, + "grad_norm": 34.15601489535805, + "language_loss": 0.93343544, + "learning_rate": 3.99831199671276e-06, + "loss": 0.95932728, + "num_input_tokens_seen": 15113395, + "router_z_loss_clip": 4.23828125, + "router_z_loss_mlp": 0.61914062, + "step": 710, + "time_per_iteration": 2.6763060092926025 + }, + { + "auxiliary_loss_clip": 0.01893345, + "auxiliary_loss_mlp": 0.00702682, + "balance_loss_clip": 1.48752081, + "balance_loss_mlp": 0.64240992, + "epoch": 0.0427476326469262, + "flos": 20302959962880.0, + "grad_norm": 36.07950193478578, + "language_loss": 0.903862, + "learning_rate": 3.998295961044662e-06, + "loss": 0.92982233, + "num_input_tokens_seen": 15132920, + "router_z_loss_clip": 4.0546875, + "router_z_loss_mlp": 0.60253906, + "step": 711, + "time_per_iteration": 2.6675117015838623 + }, + { + "auxiliary_loss_clip": 0.01883556, + "auxiliary_loss_mlp": 0.00784427, + "balance_loss_clip": 1.4822607, + "balance_loss_mlp": 0.71514297, + "epoch": 0.042807755899594166, + "flos": 21650794928640.0, + "grad_norm": 61.51999906686752, + "language_loss": 0.93069386, + "learning_rate": 3.9982798496009804e-06, + "loss": 0.95737368, + "num_input_tokens_seen": 15153115, + "router_z_loss_clip": 4.01367188, + "router_z_loss_mlp": 0.69287109, + "step": 712, + "time_per_iteration": 2.79653263092041 + }, + { + "auxiliary_loss_clip": 0.01904924, + "auxiliary_loss_mlp": 0.00809494, + "balance_loss_clip": 1.49445748, + "balance_loss_mlp": 0.74140179, + "epoch": 0.04286787915226214, + "flos": 21435007973760.0, + "grad_norm": 5.30412501788728, + "language_loss": 0.98289996, + "learning_rate": 3.998263662382328e-06, + "loss": 1.0100441, + "num_input_tokens_seen": 15172770, + "router_z_loss_clip": 4.1015625, + "router_z_loss_mlp": 0.68115234, + "step": 713, + "time_per_iteration": 2.673685312271118 + }, + { + "auxiliary_loss_clip": 0.01751084, + "auxiliary_loss_mlp": 0.00537148, + "balance_loss_clip": 1.46883821, + "balance_loss_mlp": 0.51588148, + "epoch": 0.04292800240493011, + "flos": 66397970615040.0, + "grad_norm": 0.9727692491409453, + "language_loss": 0.64473629, + "learning_rate": 3.9982473993893165e-06, + "loss": 0.66761863, + "num_input_tokens_seen": 15240055, + "router_z_loss_clip": 2.8125, + "router_z_loss_mlp": 0.21289062, + "step": 714, + "time_per_iteration": 3.289107322692871 + }, + { + "auxiliary_loss_clip": 0.01852733, + "auxiliary_loss_mlp": 0.00737667, + "balance_loss_clip": 1.46903765, + "balance_loss_mlp": 0.67138678, + "epoch": 0.042988125657598075, + "flos": 31650264552960.0, + "grad_norm": 39.62571548453726, + "language_loss": 0.80969656, + "learning_rate": 3.998231060622563e-06, + "loss": 0.83560061, + "num_input_tokens_seen": 15261585, + "router_z_loss_clip": 3.8359375, + "router_z_loss_mlp": 0.66259766, + "step": 715, + "time_per_iteration": 2.7501022815704346 + }, + { + "auxiliary_loss_clip": 0.01843783, + "auxiliary_loss_mlp": 0.00681385, + "balance_loss_clip": 1.45391011, + "balance_loss_mlp": 0.61310196, + "epoch": 0.04304824891026605, + "flos": 33248468292480.0, + "grad_norm": 89.95459247620057, + "language_loss": 0.78846765, + "learning_rate": 3.998214646082688e-06, + "loss": 0.81371927, + "num_input_tokens_seen": 15281160, + "router_z_loss_clip": 3.89453125, + "router_z_loss_mlp": 0.68310547, + "step": 716, + "time_per_iteration": 2.75793194770813 + }, + { + "auxiliary_loss_clip": 0.01787173, + "auxiliary_loss_mlp": 0.00623957, + "balance_loss_clip": 1.50843167, + "balance_loss_mlp": 0.60059208, + "epoch": 0.04310837216293401, + "flos": 64064782782720.0, + "grad_norm": 0.9451730020071218, + "language_loss": 0.65513599, + "learning_rate": 3.998198155770314e-06, + "loss": 0.67924732, + "num_input_tokens_seen": 15344505, + "router_z_loss_clip": 2.78125, + "router_z_loss_mlp": 0.23339844, + "step": 717, + "time_per_iteration": 3.2478103637695312 + }, + { + "auxiliary_loss_clip": 0.01658094, + "auxiliary_loss_mlp": 0.00203555, + "balance_loss_clip": 1.41420054, + "balance_loss_mlp": 0.1892494, + "epoch": 0.043168495415601985, + "flos": 61343757849600.0, + "grad_norm": 1.0063366495566757, + "language_loss": 0.58419108, + "learning_rate": 3.998181589686065e-06, + "loss": 0.60280752, + "num_input_tokens_seen": 15404050, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.14257812, + "step": 718, + "time_per_iteration": 3.0017330646514893 + }, + { + "auxiliary_loss_clip": 0.01826733, + "auxiliary_loss_mlp": 0.00581048, + "balance_loss_clip": 1.44255292, + "balance_loss_mlp": 0.52408975, + "epoch": 0.04322861866826996, + "flos": 20704261685760.0, + "grad_norm": 4.2708150121173185, + "language_loss": 0.97743702, + "learning_rate": 3.99816494783057e-06, + "loss": 1.00151491, + "num_input_tokens_seen": 15424190, + "router_z_loss_clip": 3.83789062, + "router_z_loss_mlp": 0.57006836, + "step": 719, + "time_per_iteration": 2.617546796798706 + }, + { + "auxiliary_loss_clip": 0.01849039, + "auxiliary_loss_mlp": 0.00618883, + "balance_loss_clip": 1.44409192, + "balance_loss_mlp": 0.56182963, + "epoch": 0.04328874192093792, + "flos": 30373352991360.0, + "grad_norm": 779.0729005643553, + "language_loss": 0.71113551, + "learning_rate": 3.99814823020446e-06, + "loss": 0.73581475, + "num_input_tokens_seen": 15446500, + "router_z_loss_clip": 4.04882812, + "router_z_loss_mlp": 0.57080078, + "step": 720, + "time_per_iteration": 2.728274345397949 + }, + { + "auxiliary_loss_clip": 0.01825817, + "auxiliary_loss_mlp": 0.00608812, + "balance_loss_clip": 1.44484568, + "balance_loss_mlp": 0.55070901, + "epoch": 0.043348865173605894, + "flos": 21944795748480.0, + "grad_norm": 34.830416017614795, + "language_loss": 0.82678872, + "learning_rate": 3.9981314368083684e-06, + "loss": 0.85113502, + "num_input_tokens_seen": 15465830, + "router_z_loss_clip": 3.80859375, + "router_z_loss_mlp": 0.58203125, + "step": 721, + "time_per_iteration": 2.6226534843444824 + }, + { + "auxiliary_loss_clip": 0.01872051, + "auxiliary_loss_mlp": 0.00577108, + "balance_loss_clip": 1.48478556, + "balance_loss_mlp": 0.52894735, + "epoch": 0.04340898842627386, + "flos": 15264225959040.0, + "grad_norm": 25.554006557439678, + "language_loss": 0.95385122, + "learning_rate": 3.998114567642933e-06, + "loss": 0.97834277, + "num_input_tokens_seen": 15479985, + "router_z_loss_clip": 3.86914062, + "router_z_loss_mlp": 0.48120117, + "step": 722, + "time_per_iteration": 2.646631956100464 + }, + { + "auxiliary_loss_clip": 0.01902472, + "auxiliary_loss_mlp": 0.00635194, + "balance_loss_clip": 1.50350046, + "balance_loss_mlp": 0.57828343, + "epoch": 0.04346911167894183, + "flos": 27965434913280.0, + "grad_norm": 4.6228388333301105, + "language_loss": 0.908306, + "learning_rate": 3.998097622708792e-06, + "loss": 0.93368262, + "num_input_tokens_seen": 15501545, + "router_z_loss_clip": 3.98632812, + "router_z_loss_mlp": 0.56860352, + "step": 723, + "time_per_iteration": 2.680851936340332 + }, + { + "auxiliary_loss_clip": 0.01910245, + "auxiliary_loss_mlp": 0.00576182, + "balance_loss_clip": 1.50901055, + "balance_loss_mlp": 0.52623308, + "epoch": 0.0435292349316098, + "flos": 29242202820480.0, + "grad_norm": 10.845984846170527, + "language_loss": 0.87470865, + "learning_rate": 3.99808060200659e-06, + "loss": 0.89957285, + "num_input_tokens_seen": 15521725, + "router_z_loss_clip": 4.015625, + "router_z_loss_mlp": 0.49926758, + "step": 724, + "time_per_iteration": 2.6944401264190674 + }, + { + "auxiliary_loss_clip": 0.01902456, + "auxiliary_loss_mlp": 0.00614245, + "balance_loss_clip": 1.50722921, + "balance_loss_mlp": 0.56043375, + "epoch": 0.04358935818427777, + "flos": 20558356640640.0, + "grad_norm": 11.818354124322987, + "language_loss": 0.86182839, + "learning_rate": 3.998063505536971e-06, + "loss": 0.88699538, + "num_input_tokens_seen": 15540910, + "router_z_loss_clip": 3.95117188, + "router_z_loss_mlp": 0.53833008, + "step": 725, + "time_per_iteration": 2.588808536529541 + }, + { + "auxiliary_loss_clip": 0.01886803, + "auxiliary_loss_mlp": 0.00625011, + "balance_loss_clip": 1.48490822, + "balance_loss_mlp": 0.57267785, + "epoch": 0.04364948143694574, + "flos": 14464926564480.0, + "grad_norm": 35.68773126051081, + "language_loss": 0.9403441, + "learning_rate": 3.998046333300584e-06, + "loss": 0.96546221, + "num_input_tokens_seen": 15558640, + "router_z_loss_clip": 4.01953125, + "router_z_loss_mlp": 0.52319336, + "step": 726, + "time_per_iteration": 2.616271734237671 + }, + { + "auxiliary_loss_clip": 0.01676269, + "auxiliary_loss_mlp": 0.00515434, + "balance_loss_clip": 1.35932374, + "balance_loss_mlp": 0.49254543, + "epoch": 0.043709604689613706, + "flos": 50067268922880.0, + "grad_norm": 1.00764493074828, + "language_loss": 0.55935264, + "learning_rate": 3.998029085298079e-06, + "loss": 0.58126968, + "num_input_tokens_seen": 15612975, + "router_z_loss_clip": 3.15625, + "router_z_loss_mlp": 0.22851562, + "step": 727, + "time_per_iteration": 3.2592782974243164 + }, + { + "auxiliary_loss_clip": 0.01834742, + "auxiliary_loss_mlp": 0.00671625, + "balance_loss_clip": 1.4574368, + "balance_loss_mlp": 0.61478633, + "epoch": 0.04376972794228168, + "flos": 13991588115840.0, + "grad_norm": 80.95313198153251, + "language_loss": 0.88401091, + "learning_rate": 3.998011761530112e-06, + "loss": 0.9090746, + "num_input_tokens_seen": 15631070, + "router_z_loss_clip": 3.7734375, + "router_z_loss_mlp": 0.56860352, + "step": 728, + "time_per_iteration": 2.6531593799591064 + }, + { + "auxiliary_loss_clip": 0.0180862, + "auxiliary_loss_mlp": 0.00643124, + "balance_loss_clip": 1.44599414, + "balance_loss_mlp": 0.59191215, + "epoch": 0.04382985119494965, + "flos": 22009901149440.0, + "grad_norm": 41.34957982720944, + "language_loss": 0.81811553, + "learning_rate": 3.997994361997338e-06, + "loss": 0.84263301, + "num_input_tokens_seen": 15647825, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 0.51269531, + "step": 729, + "time_per_iteration": 2.6272218227386475 + }, + { + "auxiliary_loss_clip": 0.01777496, + "auxiliary_loss_mlp": 0.00626124, + "balance_loss_clip": 1.42228436, + "balance_loss_mlp": 0.56976199, + "epoch": 0.043889974447617615, + "flos": 24206521472640.0, + "grad_norm": 24.594713735323435, + "language_loss": 1.01923561, + "learning_rate": 3.997976886700417e-06, + "loss": 1.04327178, + "num_input_tokens_seen": 15668260, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 0.56396484, + "step": 730, + "time_per_iteration": 2.672004461288452 + }, + { + "auxiliary_loss_clip": 0.0180749, + "auxiliary_loss_mlp": 0.00605042, + "balance_loss_clip": 1.4437654, + "balance_loss_mlp": 0.55211353, + "epoch": 0.04395009770028559, + "flos": 17274541415040.0, + "grad_norm": 9.708760576686043, + "language_loss": 0.9552213, + "learning_rate": 3.997959335640013e-06, + "loss": 0.97934663, + "num_input_tokens_seen": 15685630, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 0.52905273, + "step": 731, + "time_per_iteration": 2.6576879024505615 + }, + { + "auxiliary_loss_clip": 0.01807565, + "auxiliary_loss_mlp": 0.00549159, + "balance_loss_clip": 1.44214487, + "balance_loss_mlp": 0.5004738, + "epoch": 0.04401022095295355, + "flos": 12310286261760.0, + "grad_norm": 114.69460952735488, + "language_loss": 0.99384016, + "learning_rate": 3.997941708816791e-06, + "loss": 1.01740742, + "num_input_tokens_seen": 15698645, + "router_z_loss_clip": 3.65429688, + "router_z_loss_mlp": 0.48730469, + "step": 732, + "time_per_iteration": 2.587033271789551 + }, + { + "auxiliary_loss_clip": 0.018012, + "auxiliary_loss_mlp": 0.00556574, + "balance_loss_clip": 1.43126631, + "balance_loss_mlp": 0.50796074, + "epoch": 0.044070344205621524, + "flos": 20959658363520.0, + "grad_norm": 2.9809440524821635, + "language_loss": 0.92383182, + "learning_rate": 3.997924006231419e-06, + "loss": 0.94740951, + "num_input_tokens_seen": 15716775, + "router_z_loss_clip": 3.6953125, + "router_z_loss_mlp": 0.48632812, + "step": 733, + "time_per_iteration": 2.76142954826355 + }, + { + "auxiliary_loss_clip": 0.01799183, + "auxiliary_loss_mlp": 0.00526247, + "balance_loss_clip": 1.42680335, + "balance_loss_mlp": 0.47608364, + "epoch": 0.044130467458289496, + "flos": 13845288021120.0, + "grad_norm": 49.5211373737798, + "language_loss": 0.9657886, + "learning_rate": 3.9979062278845685e-06, + "loss": 0.98904294, + "num_input_tokens_seen": 15733320, + "router_z_loss_clip": 3.72460938, + "router_z_loss_mlp": 0.50170898, + "step": 734, + "time_per_iteration": 2.573207139968872 + }, + { + "auxiliary_loss_clip": 0.01795784, + "auxiliary_loss_mlp": 0.0048323, + "balance_loss_clip": 1.4242177, + "balance_loss_mlp": 0.43993297, + "epoch": 0.04419059071095746, + "flos": 28655063107200.0, + "grad_norm": 108.08018719957764, + "language_loss": 0.85673702, + "learning_rate": 3.9978883737769125e-06, + "loss": 0.87952721, + "num_input_tokens_seen": 15752705, + "router_z_loss_clip": 3.71484375, + "router_z_loss_mlp": 0.43286133, + "step": 735, + "time_per_iteration": 2.7445359230041504 + }, + { + "auxiliary_loss_clip": 0.01791788, + "auxiliary_loss_mlp": 0.00516108, + "balance_loss_clip": 1.41111648, + "balance_loss_mlp": 0.46995038, + "epoch": 0.04425071396362543, + "flos": 28183304856960.0, + "grad_norm": 12.348637740441953, + "language_loss": 0.95734715, + "learning_rate": 3.9978704439091305e-06, + "loss": 0.98042613, + "num_input_tokens_seen": 15772800, + "router_z_loss_clip": 3.80273438, + "router_z_loss_mlp": 0.46142578, + "step": 736, + "time_per_iteration": 5.5520179271698 + }, + { + "auxiliary_loss_clip": 0.01778103, + "auxiliary_loss_mlp": 0.00454534, + "balance_loss_clip": 1.40353537, + "balance_loss_mlp": 0.41130853, + "epoch": 0.0443108372162934, + "flos": 23658452778240.0, + "grad_norm": 13.812762379932344, + "language_loss": 0.90680456, + "learning_rate": 3.997852438281901e-06, + "loss": 0.92913091, + "num_input_tokens_seen": 15793665, + "router_z_loss_clip": 3.74609375, + "router_z_loss_mlp": 0.43212891, + "step": 737, + "time_per_iteration": 4.098562717437744 + }, + { + "auxiliary_loss_clip": 0.01806015, + "auxiliary_loss_mlp": 0.00471538, + "balance_loss_clip": 1.41929293, + "balance_loss_mlp": 0.42306709, + "epoch": 0.04437096046896137, + "flos": 33979861025280.0, + "grad_norm": 11.102505927743579, + "language_loss": 0.91092336, + "learning_rate": 3.997834356895906e-06, + "loss": 0.93369889, + "num_input_tokens_seen": 15813175, + "router_z_loss_clip": 3.86523438, + "router_z_loss_mlp": 0.48461914, + "step": 738, + "time_per_iteration": 4.19480299949646 + }, + { + "auxiliary_loss_clip": 0.01512159, + "auxiliary_loss_mlp": 0.00194761, + "balance_loss_clip": 1.2966361, + "balance_loss_mlp": 0.18121903, + "epoch": 0.04443108372162934, + "flos": 67397506375680.0, + "grad_norm": 0.8852143336296973, + "language_loss": 0.59093809, + "learning_rate": 3.9978161997518324e-06, + "loss": 0.60800731, + "num_input_tokens_seen": 15872050, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.13574219, + "step": 739, + "time_per_iteration": 3.100675106048584 + }, + { + "auxiliary_loss_clip": 0.01827177, + "auxiliary_loss_mlp": 0.00446427, + "balance_loss_clip": 1.4126308, + "balance_loss_mlp": 0.40465599, + "epoch": 0.04449120697429731, + "flos": 29752672953600.0, + "grad_norm": 91.91945997327304, + "language_loss": 0.98923743, + "learning_rate": 3.997797966850369e-06, + "loss": 1.0119735, + "num_input_tokens_seen": 15891085, + "router_z_loss_clip": 4.14648438, + "router_z_loss_mlp": 0.41796875, + "step": 740, + "time_per_iteration": 2.7332727909088135 + }, + { + "auxiliary_loss_clip": 0.01845553, + "auxiliary_loss_mlp": 0.00426983, + "balance_loss_clip": 1.4178102, + "balance_loss_mlp": 0.38602239, + "epoch": 0.04455133022696528, + "flos": 36502119072000.0, + "grad_norm": 26.68004100042141, + "language_loss": 0.78248048, + "learning_rate": 3.997779658192205e-06, + "loss": 0.80520582, + "num_input_tokens_seen": 15914225, + "router_z_loss_clip": 4.27734375, + "router_z_loss_mlp": 0.40966797, + "step": 741, + "time_per_iteration": 2.7543768882751465 + }, + { + "auxiliary_loss_clip": 0.01858739, + "auxiliary_loss_mlp": 0.00365452, + "balance_loss_clip": 1.4193368, + "balance_loss_mlp": 0.33071405, + "epoch": 0.044611453479633245, + "flos": 28803661672320.0, + "grad_norm": 68.27173179263607, + "language_loss": 0.92020392, + "learning_rate": 3.997761273778037e-06, + "loss": 0.94244587, + "num_input_tokens_seen": 15934540, + "router_z_loss_clip": 4.39453125, + "router_z_loss_mlp": 0.34765625, + "step": 742, + "time_per_iteration": 2.7714953422546387 + }, + { + "auxiliary_loss_clip": 0.01891007, + "auxiliary_loss_mlp": 0.00397318, + "balance_loss_clip": 1.4294064, + "balance_loss_mlp": 0.35898063, + "epoch": 0.04467157673230122, + "flos": 20010970304640.0, + "grad_norm": 6.182932563559368, + "language_loss": 0.9161191, + "learning_rate": 3.997742813608561e-06, + "loss": 0.93900234, + "num_input_tokens_seen": 15952560, + "router_z_loss_clip": 4.6171875, + "router_z_loss_mlp": 0.38330078, + "step": 743, + "time_per_iteration": 2.598271369934082 + }, + { + "auxiliary_loss_clip": 0.01891102, + "auxiliary_loss_mlp": 0.00398452, + "balance_loss_clip": 1.42897284, + "balance_loss_mlp": 0.35930318, + "epoch": 0.04473169998496919, + "flos": 18004964480640.0, + "grad_norm": 6.25588368089373, + "language_loss": 0.88164341, + "learning_rate": 3.997724277684479e-06, + "loss": 0.90453893, + "num_input_tokens_seen": 15970620, + "router_z_loss_clip": 4.62109375, + "router_z_loss_mlp": 0.39160156, + "step": 744, + "time_per_iteration": 2.652085781097412 + }, + { + "auxiliary_loss_clip": 0.01894667, + "auxiliary_loss_mlp": 0.00372913, + "balance_loss_clip": 1.4377352, + "balance_loss_mlp": 0.33488566, + "epoch": 0.044791823237637154, + "flos": 20631722169600.0, + "grad_norm": 3.970387333103113, + "language_loss": 0.92340887, + "learning_rate": 3.99770566600649e-06, + "loss": 0.94608468, + "num_input_tokens_seen": 15987325, + "router_z_loss_clip": 4.56640625, + "router_z_loss_mlp": 0.38012695, + "step": 745, + "time_per_iteration": 2.6184685230255127 + }, + { + "auxiliary_loss_clip": 0.0189821, + "auxiliary_loss_mlp": 0.00429561, + "balance_loss_clip": 1.43217802, + "balance_loss_mlp": 0.38838568, + "epoch": 0.04485194649030513, + "flos": 31176171918720.0, + "grad_norm": 127.52672196292924, + "language_loss": 0.75506973, + "learning_rate": 3.997686978575302e-06, + "loss": 0.77834749, + "num_input_tokens_seen": 16008310, + "router_z_loss_clip": 4.66015625, + "router_z_loss_mlp": 0.41162109, + "step": 746, + "time_per_iteration": 2.7507131099700928 + }, + { + "auxiliary_loss_clip": 0.01898308, + "auxiliary_loss_mlp": 0.00427273, + "balance_loss_clip": 1.4456991, + "balance_loss_mlp": 0.385717, + "epoch": 0.04491206974297309, + "flos": 26143291831680.0, + "grad_norm": 35.49829127682906, + "language_loss": 0.76436138, + "learning_rate": 3.997668215391625e-06, + "loss": 0.78761721, + "num_input_tokens_seen": 16029620, + "router_z_loss_clip": 4.53125, + "router_z_loss_mlp": 0.41577148, + "step": 747, + "time_per_iteration": 2.664830446243286 + }, + { + "auxiliary_loss_clip": 0.01915546, + "auxiliary_loss_mlp": 0.00416157, + "balance_loss_clip": 1.4569962, + "balance_loss_mlp": 0.37696111, + "epoch": 0.044972192995641064, + "flos": 20667668705280.0, + "grad_norm": 43.726412112338345, + "language_loss": 0.72856754, + "learning_rate": 3.997649376456168e-06, + "loss": 0.75188452, + "num_input_tokens_seen": 16049065, + "router_z_loss_clip": 4.5859375, + "router_z_loss_mlp": 0.39208984, + "step": 748, + "time_per_iteration": 2.7309587001800537 + }, + { + "auxiliary_loss_clip": 0.01908121, + "auxiliary_loss_mlp": 0.0048002, + "balance_loss_clip": 1.4469769, + "balance_loss_mlp": 0.43319502, + "epoch": 0.045032316248309036, + "flos": 16106834177280.0, + "grad_norm": 53.38791382826946, + "language_loss": 0.84428704, + "learning_rate": 3.997630461769647e-06, + "loss": 0.86816847, + "num_input_tokens_seen": 16066765, + "router_z_loss_clip": 4.6171875, + "router_z_loss_mlp": 0.46826172, + "step": 749, + "time_per_iteration": 2.636099100112915 + }, + { + "auxiliary_loss_clip": 0.01925955, + "auxiliary_loss_mlp": 0.00516219, + "balance_loss_clip": 1.45280886, + "balance_loss_mlp": 0.47294623, + "epoch": 0.045092439500977, + "flos": 17858843953920.0, + "grad_norm": 13.600231900421955, + "language_loss": 0.95595843, + "learning_rate": 3.997611471332778e-06, + "loss": 0.98038012, + "num_input_tokens_seen": 16085980, + "router_z_loss_clip": 4.734375, + "router_z_loss_mlp": 0.43310547, + "step": 750, + "time_per_iteration": 2.6272833347320557 + }, + { + "auxiliary_loss_clip": 0.01940517, + "auxiliary_loss_mlp": 0.00609479, + "balance_loss_clip": 1.45615649, + "balance_loss_mlp": 0.55390346, + "epoch": 0.04515256275364497, + "flos": 24462815990400.0, + "grad_norm": 10.478815296779713, + "language_loss": 0.81120455, + "learning_rate": 3.9975924051462825e-06, + "loss": 0.83670449, + "num_input_tokens_seen": 16106260, + "router_z_loss_clip": 4.84375, + "router_z_loss_mlp": 0.55566406, + "step": 751, + "time_per_iteration": 2.723679304122925 + }, + { + "auxiliary_loss_clip": 0.01970077, + "auxiliary_loss_mlp": 0.00623429, + "balance_loss_clip": 1.46576881, + "balance_loss_mlp": 0.57111967, + "epoch": 0.04521268600631294, + "flos": 20916385453440.0, + "grad_norm": 5.607757877235695, + "language_loss": 0.79174054, + "learning_rate": 3.997573263210883e-06, + "loss": 0.81767559, + "num_input_tokens_seen": 16123475, + "router_z_loss_clip": 5.04296875, + "router_z_loss_mlp": 0.52294922, + "step": 752, + "time_per_iteration": 2.68918776512146 + }, + { + "auxiliary_loss_clip": 0.02001078, + "auxiliary_loss_mlp": 0.00673054, + "balance_loss_clip": 1.4692564, + "balance_loss_mlp": 0.61898053, + "epoch": 0.04527280925898091, + "flos": 13371374954880.0, + "grad_norm": 85.27299153438244, + "language_loss": 1.01283848, + "learning_rate": 3.997554045527305e-06, + "loss": 1.03957987, + "num_input_tokens_seen": 16138335, + "router_z_loss_clip": 5.32421875, + "router_z_loss_mlp": 0.54077148, + "step": 753, + "time_per_iteration": 2.6330745220184326 + }, + { + "auxiliary_loss_clip": 0.02060231, + "auxiliary_loss_mlp": 0.00822137, + "balance_loss_clip": 1.47807848, + "balance_loss_mlp": 0.75738233, + "epoch": 0.04533293251164888, + "flos": 23254565276160.0, + "grad_norm": 62.128051058476764, + "language_loss": 0.98132205, + "learning_rate": 3.997534752096277e-06, + "loss": 1.01014578, + "num_input_tokens_seen": 16157110, + "router_z_loss_clip": 5.82421875, + "router_z_loss_mlp": 0.64746094, + "step": 754, + "time_per_iteration": 2.664883852005005 + }, + { + "auxiliary_loss_clip": 0.02027432, + "auxiliary_loss_mlp": 0.00795333, + "balance_loss_clip": 1.46249843, + "balance_loss_mlp": 0.73668188, + "epoch": 0.04539305576431685, + "flos": 12422004537600.0, + "grad_norm": 66.2395708954885, + "language_loss": 0.86686981, + "learning_rate": 3.997515382918531e-06, + "loss": 0.89509743, + "num_input_tokens_seen": 16174155, + "router_z_loss_clip": 5.64453125, + "router_z_loss_mlp": 0.58642578, + "step": 755, + "time_per_iteration": 2.632842540740967 + }, + { + "auxiliary_loss_clip": 0.02072408, + "auxiliary_loss_mlp": 0.00844809, + "balance_loss_clip": 1.47878981, + "balance_loss_mlp": 0.77657354, + "epoch": 0.04545317901698482, + "flos": 16070995382400.0, + "grad_norm": 17.939018517329966, + "language_loss": 0.85513973, + "learning_rate": 3.9974959379948015e-06, + "loss": 0.88431191, + "num_input_tokens_seen": 16192240, + "router_z_loss_clip": 5.93359375, + "router_z_loss_mlp": 0.68212891, + "step": 756, + "time_per_iteration": 2.6389565467834473 + }, + { + "auxiliary_loss_clip": 0.01669742, + "auxiliary_loss_mlp": 0.00240112, + "balance_loss_clip": 1.39981651, + "balance_loss_mlp": 0.22122931, + "epoch": 0.045513302269652785, + "flos": 66396139021440.0, + "grad_norm": 0.8282115449185338, + "language_loss": 0.62813628, + "learning_rate": 3.997476417325827e-06, + "loss": 0.6472348, + "num_input_tokens_seen": 16255775, + "router_z_loss_clip": 2.6875, + "router_z_loss_mlp": 0.18847656, + "step": 757, + "time_per_iteration": 3.2467265129089355 + }, + { + "auxiliary_loss_clip": 0.02122433, + "auxiliary_loss_mlp": 0.00943747, + "balance_loss_clip": 1.49097371, + "balance_loss_mlp": 0.87460554, + "epoch": 0.04557342552232076, + "flos": 21471169991040.0, + "grad_norm": 72.84049119495762, + "language_loss": 0.88665026, + "learning_rate": 3.997456820912346e-06, + "loss": 0.91731203, + "num_input_tokens_seen": 16277015, + "router_z_loss_clip": 6.3203125, + "router_z_loss_mlp": 0.69042969, + "step": 758, + "time_per_iteration": 2.7413125038146973 + }, + { + "auxiliary_loss_clip": 0.02090271, + "auxiliary_loss_mlp": 0.00849576, + "balance_loss_clip": 1.47047448, + "balance_loss_mlp": 0.79111612, + "epoch": 0.04563354877498873, + "flos": 23732680233600.0, + "grad_norm": 47.51341628815215, + "language_loss": 0.93559581, + "learning_rate": 3.997437148755101e-06, + "loss": 0.96499431, + "num_input_tokens_seen": 16296005, + "router_z_loss_clip": 6.20703125, + "router_z_loss_mlp": 0.58398438, + "step": 759, + "time_per_iteration": 2.6904356479644775 + }, + { + "auxiliary_loss_clip": 0.02125923, + "auxiliary_loss_mlp": 0.00990581, + "balance_loss_clip": 1.48251605, + "balance_loss_mlp": 0.91929388, + "epoch": 0.045693672027656694, + "flos": 25735741142400.0, + "grad_norm": 2661.7574243517897, + "language_loss": 0.80696011, + "learning_rate": 3.9974174008548405e-06, + "loss": 0.83812511, + "num_input_tokens_seen": 16315300, + "router_z_loss_clip": 6.43359375, + "router_z_loss_mlp": 0.71240234, + "step": 760, + "time_per_iteration": 2.6806578636169434 + }, + { + "auxiliary_loss_clip": 0.02134688, + "auxiliary_loss_mlp": 0.0099569, + "balance_loss_clip": 1.47898078, + "balance_loss_mlp": 0.93050623, + "epoch": 0.045753795280324666, + "flos": 19719016560000.0, + "grad_norm": 6.168276617121272, + "language_loss": 0.89172745, + "learning_rate": 3.9973975772123105e-06, + "loss": 0.92303133, + "num_input_tokens_seen": 16333820, + "router_z_loss_clip": 6.55859375, + "router_z_loss_mlp": 0.65209961, + "step": 761, + "time_per_iteration": 2.616067409515381 + }, + { + "auxiliary_loss_clip": 0.02099262, + "auxiliary_loss_mlp": 0.00937008, + "balance_loss_clip": 1.46195304, + "balance_loss_mlp": 0.86653161, + "epoch": 0.04581391853299264, + "flos": 23255786338560.0, + "grad_norm": 71.07298919669064, + "language_loss": 0.85291123, + "learning_rate": 3.997377677828266e-06, + "loss": 0.88327396, + "num_input_tokens_seen": 16355290, + "router_z_loss_clip": 6.375, + "router_z_loss_mlp": 0.70458984, + "step": 762, + "time_per_iteration": 2.6804826259613037 + }, + { + "auxiliary_loss_clip": 0.01614589, + "auxiliary_loss_mlp": 0.00361888, + "balance_loss_clip": 1.32801676, + "balance_loss_mlp": 0.33690152, + "epoch": 0.0458740417856606, + "flos": 64231155601920.0, + "grad_norm": 0.9999990608480717, + "language_loss": 0.59096426, + "learning_rate": 3.9973577027034585e-06, + "loss": 0.61072904, + "num_input_tokens_seen": 16415995, + "router_z_loss_clip": 2.859375, + "router_z_loss_mlp": 0.24902344, + "step": 763, + "time_per_iteration": 3.3696467876434326 + }, + { + "auxiliary_loss_clip": 0.0211924, + "auxiliary_loss_mlp": 0.01081746, + "balance_loss_clip": 1.44429326, + "balance_loss_mlp": 1.00182819, + "epoch": 0.045934165038328575, + "flos": 20770121272320.0, + "grad_norm": 2.8521362321513815, + "language_loss": 0.94714499, + "learning_rate": 3.9973376518386475e-06, + "loss": 0.97915494, + "num_input_tokens_seen": 16433120, + "router_z_loss_clip": 6.75, + "router_z_loss_mlp": 0.79931641, + "step": 764, + "time_per_iteration": 2.731343984603882 + }, + { + "auxiliary_loss_clip": 0.0207197, + "auxiliary_loss_mlp": 0.0103123, + "balance_loss_clip": 1.42662525, + "balance_loss_mlp": 0.95212245, + "epoch": 0.04599428829099654, + "flos": 30262891691520.0, + "grad_norm": 233.94235822546887, + "language_loss": 0.92292035, + "learning_rate": 3.997317525234592e-06, + "loss": 0.95395231, + "num_input_tokens_seen": 16453360, + "router_z_loss_clip": 6.453125, + "router_z_loss_mlp": 0.79101562, + "step": 765, + "time_per_iteration": 2.7207884788513184 + }, + { + "auxiliary_loss_clip": 0.02015826, + "auxiliary_loss_mlp": 0.00920282, + "balance_loss_clip": 1.4108932, + "balance_loss_mlp": 0.84308207, + "epoch": 0.04605441154366451, + "flos": 23038921975680.0, + "grad_norm": 39.689741247687344, + "language_loss": 0.96385831, + "learning_rate": 3.997297322892056e-06, + "loss": 0.99321944, + "num_input_tokens_seen": 16471160, + "router_z_loss_clip": 6.05078125, + "router_z_loss_mlp": 0.77246094, + "step": 766, + "time_per_iteration": 2.663517475128174 + }, + { + "auxiliary_loss_clip": 0.02015754, + "auxiliary_loss_mlp": 0.0091493, + "balance_loss_clip": 1.40132523, + "balance_loss_mlp": 0.84211648, + "epoch": 0.046114534796332485, + "flos": 22017407091840.0, + "grad_norm": 59.146484763623846, + "language_loss": 0.89922118, + "learning_rate": 3.997277044811806e-06, + "loss": 0.92852795, + "num_input_tokens_seen": 16488940, + "router_z_loss_clip": 6.140625, + "router_z_loss_mlp": 0.72802734, + "step": 767, + "time_per_iteration": 2.595640182495117 + }, + { + "auxiliary_loss_clip": 0.01979128, + "auxiliary_loss_mlp": 0.00840297, + "balance_loss_clip": 1.38942027, + "balance_loss_mlp": 0.76486105, + "epoch": 0.04617465804900045, + "flos": 29862380067840.0, + "grad_norm": 269.4524179954204, + "language_loss": 0.93196058, + "learning_rate": 3.99725669099461e-06, + "loss": 0.96015477, + "num_input_tokens_seen": 16509505, + "router_z_loss_clip": 5.890625, + "router_z_loss_mlp": 0.75439453, + "step": 768, + "time_per_iteration": 2.720355272293091 + }, + { + "auxiliary_loss_clip": 0.02003907, + "auxiliary_loss_mlp": 0.00896657, + "balance_loss_clip": 1.39102173, + "balance_loss_mlp": 0.82627594, + "epoch": 0.04623478130166842, + "flos": 25630056351360.0, + "grad_norm": 411.67907214764506, + "language_loss": 0.81029558, + "learning_rate": 3.9972362614412395e-06, + "loss": 0.83930117, + "num_input_tokens_seen": 16528840, + "router_z_loss_clip": 6.1171875, + "router_z_loss_mlp": 0.70410156, + "step": 769, + "time_per_iteration": 2.676751136779785 + }, + { + "auxiliary_loss_clip": 0.01979988, + "auxiliary_loss_mlp": 0.00846295, + "balance_loss_clip": 1.3919487, + "balance_loss_mlp": 0.77605677, + "epoch": 0.04629490455433639, + "flos": 20449080489600.0, + "grad_norm": 5.473746502186435, + "language_loss": 0.91380221, + "learning_rate": 3.997215756152471e-06, + "loss": 0.942065, + "num_input_tokens_seen": 16548335, + "router_z_loss_clip": 5.875, + "router_z_loss_mlp": 0.70166016, + "step": 770, + "time_per_iteration": 2.698193073272705 + }, + { + "auxiliary_loss_clip": 0.01966438, + "auxiliary_loss_mlp": 0.00780114, + "balance_loss_clip": 1.38342619, + "balance_loss_mlp": 0.70930403, + "epoch": 0.04635502780700436, + "flos": 23148736830720.0, + "grad_norm": 42.688067939268294, + "language_loss": 0.95074135, + "learning_rate": 3.99719517512908e-06, + "loss": 0.97820687, + "num_input_tokens_seen": 16567725, + "router_z_loss_clip": 5.82421875, + "router_z_loss_mlp": 0.70849609, + "step": 771, + "time_per_iteration": 2.680745840072632 + }, + { + "auxiliary_loss_clip": 0.01972238, + "auxiliary_loss_mlp": 0.0085187, + "balance_loss_clip": 1.38186872, + "balance_loss_mlp": 0.77581489, + "epoch": 0.04641515105967233, + "flos": 23292020183040.0, + "grad_norm": 201.3484719752468, + "language_loss": 0.92091644, + "learning_rate": 3.997174518371848e-06, + "loss": 0.9491576, + "num_input_tokens_seen": 16588175, + "router_z_loss_clip": 5.91015625, + "router_z_loss_mlp": 0.76025391, + "step": 772, + "time_per_iteration": 2.695600748062134 + }, + { + "auxiliary_loss_clip": 0.01959231, + "auxiliary_loss_mlp": 0.00809324, + "balance_loss_clip": 1.3804276, + "balance_loss_mlp": 0.73832315, + "epoch": 0.046475274312340296, + "flos": 25115204759040.0, + "grad_norm": 35.5093176169444, + "language_loss": 0.81066144, + "learning_rate": 3.997153785881557e-06, + "loss": 0.83834696, + "num_input_tokens_seen": 16607735, + "router_z_loss_clip": 5.79296875, + "router_z_loss_mlp": 0.70996094, + "step": 773, + "time_per_iteration": 2.762474536895752 + }, + { + "auxiliary_loss_clip": 0.01925295, + "auxiliary_loss_mlp": 0.00763685, + "balance_loss_clip": 1.37234688, + "balance_loss_mlp": 0.69544977, + "epoch": 0.04653539756500827, + "flos": 25264916645760.0, + "grad_norm": 371.81061758176304, + "language_loss": 0.84285092, + "learning_rate": 3.997132977658996e-06, + "loss": 0.86974072, + "num_input_tokens_seen": 16627225, + "router_z_loss_clip": 5.53515625, + "router_z_loss_mlp": 0.68212891, + "step": 774, + "time_per_iteration": 2.709393262863159 + }, + { + "auxiliary_loss_clip": 0.01930085, + "auxiliary_loss_mlp": 0.00758949, + "balance_loss_clip": 1.37389922, + "balance_loss_mlp": 0.6882813, + "epoch": 0.046595520817676234, + "flos": 35404150089600.0, + "grad_norm": 12.256097133067644, + "language_loss": 0.79794049, + "learning_rate": 3.997112093704952e-06, + "loss": 0.82483083, + "num_input_tokens_seen": 16647785, + "router_z_loss_clip": 5.56640625, + "router_z_loss_mlp": 0.70605469, + "step": 775, + "time_per_iteration": 2.7908260822296143 + }, + { + "auxiliary_loss_clip": 0.01906264, + "auxiliary_loss_mlp": 0.0074636, + "balance_loss_clip": 1.36411858, + "balance_loss_mlp": 0.67259359, + "epoch": 0.046655644070344206, + "flos": 18112516778880.0, + "grad_norm": 8.192144842907005, + "language_loss": 0.82359064, + "learning_rate": 3.997091134020217e-06, + "loss": 0.85011685, + "num_input_tokens_seen": 16667555, + "router_z_loss_clip": 5.421875, + "router_z_loss_mlp": 0.73730469, + "step": 776, + "time_per_iteration": 2.7027831077575684 + }, + { + "auxiliary_loss_clip": 0.01873448, + "auxiliary_loss_mlp": 0.00725034, + "balance_loss_clip": 1.35835588, + "balance_loss_mlp": 0.65212566, + "epoch": 0.04671576732301218, + "flos": 29205286617600.0, + "grad_norm": 142.77043301184582, + "language_loss": 0.79184151, + "learning_rate": 3.997070098605585e-06, + "loss": 0.81782627, + "num_input_tokens_seen": 16686875, + "router_z_loss_clip": 5.15625, + "router_z_loss_mlp": 0.72851562, + "step": 777, + "time_per_iteration": 2.7541096210479736 + }, + { + "auxiliary_loss_clip": 0.01864801, + "auxiliary_loss_mlp": 0.00731263, + "balance_loss_clip": 1.35666406, + "balance_loss_mlp": 0.66126335, + "epoch": 0.04677589057568014, + "flos": 30478319510400.0, + "grad_norm": 149.60500324324997, + "language_loss": 0.81369823, + "learning_rate": 3.997048987461856e-06, + "loss": 0.83965886, + "num_input_tokens_seen": 16706420, + "router_z_loss_clip": 5.0859375, + "router_z_loss_mlp": 0.70019531, + "step": 778, + "time_per_iteration": 5.677742004394531 + }, + { + "auxiliary_loss_clip": 0.01842519, + "auxiliary_loss_mlp": 0.00669039, + "balance_loss_clip": 1.34712625, + "balance_loss_mlp": 0.60628772, + "epoch": 0.046836013828348115, + "flos": 20557674282240.0, + "grad_norm": 25.75734806832815, + "language_loss": 0.85305798, + "learning_rate": 3.997027800589829e-06, + "loss": 0.87817347, + "num_input_tokens_seen": 16726390, + "router_z_loss_clip": 4.953125, + "router_z_loss_mlp": 0.62744141, + "step": 779, + "time_per_iteration": 4.102157831192017 + }, + { + "auxiliary_loss_clip": 0.01832032, + "auxiliary_loss_mlp": 0.00663451, + "balance_loss_clip": 1.34169757, + "balance_loss_mlp": 0.59903002, + "epoch": 0.04689613708101608, + "flos": 25447378757760.0, + "grad_norm": 6.332365112167262, + "language_loss": 0.82173133, + "learning_rate": 3.997006537990308e-06, + "loss": 0.84668612, + "num_input_tokens_seen": 16748965, + "router_z_loss_clip": 4.8984375, + "router_z_loss_mlp": 0.64453125, + "step": 780, + "time_per_iteration": 4.114478826522827 + }, + { + "auxiliary_loss_clip": 0.01798746, + "auxiliary_loss_mlp": 0.00671787, + "balance_loss_clip": 1.33579683, + "balance_loss_mlp": 0.61108589, + "epoch": 0.04695626033368405, + "flos": 23001395241600.0, + "grad_norm": 3.2436311683858876, + "language_loss": 0.81979525, + "learning_rate": 3.996985199664099e-06, + "loss": 0.84450054, + "num_input_tokens_seen": 16768620, + "router_z_loss_clip": 4.62890625, + "router_z_loss_mlp": 0.60693359, + "step": 781, + "time_per_iteration": 2.6431217193603516 + }, + { + "auxiliary_loss_clip": 0.0179338, + "auxiliary_loss_mlp": 0.00730883, + "balance_loss_clip": 1.33438683, + "balance_loss_mlp": 0.66064465, + "epoch": 0.047016383586352024, + "flos": 29133357632640.0, + "grad_norm": 17.64943166900422, + "language_loss": 0.82255447, + "learning_rate": 3.99696378561201e-06, + "loss": 0.84779716, + "num_input_tokens_seen": 16789755, + "router_z_loss_clip": 4.58984375, + "router_z_loss_mlp": 0.70214844, + "step": 782, + "time_per_iteration": 2.7442290782928467 + }, + { + "auxiliary_loss_clip": 0.01757459, + "auxiliary_loss_mlp": 0.00679999, + "balance_loss_clip": 1.32338369, + "balance_loss_mlp": 0.61572093, + "epoch": 0.04707650683901999, + "flos": 14976330451200.0, + "grad_norm": 744.582057588403, + "language_loss": 0.87187755, + "learning_rate": 3.996942295834855e-06, + "loss": 0.89625216, + "num_input_tokens_seen": 16807585, + "router_z_loss_clip": 4.3359375, + "router_z_loss_mlp": 0.64306641, + "step": 783, + "time_per_iteration": 2.5977392196655273 + }, + { + "auxiliary_loss_clip": 0.01726196, + "auxiliary_loss_mlp": 0.00683615, + "balance_loss_clip": 1.31908238, + "balance_loss_mlp": 0.61785901, + "epoch": 0.04713663009168796, + "flos": 21651118151040.0, + "grad_norm": 28.135790300750536, + "language_loss": 0.87496144, + "learning_rate": 3.996920730333448e-06, + "loss": 0.89905953, + "num_input_tokens_seen": 16827220, + "router_z_loss_clip": 4.07421875, + "router_z_loss_mlp": 0.65771484, + "step": 784, + "time_per_iteration": 2.670236349105835 + }, + { + "auxiliary_loss_clip": 0.01719863, + "auxiliary_loss_mlp": 0.00692496, + "balance_loss_clip": 1.32386923, + "balance_loss_mlp": 0.62683576, + "epoch": 0.04719675334435593, + "flos": 21325408600320.0, + "grad_norm": 76.55283764483013, + "language_loss": 0.85465741, + "learning_rate": 3.996899089108607e-06, + "loss": 0.87878096, + "num_input_tokens_seen": 16846230, + "router_z_loss_clip": 3.95703125, + "router_z_loss_mlp": 0.65673828, + "step": 785, + "time_per_iteration": 2.726367473602295 + }, + { + "auxiliary_loss_clip": 0.01708243, + "auxiliary_loss_mlp": 0.00691236, + "balance_loss_clip": 1.32614422, + "balance_loss_mlp": 0.62481254, + "epoch": 0.0472568765970239, + "flos": 17931383470080.0, + "grad_norm": 10.686150861979993, + "language_loss": 0.97648871, + "learning_rate": 3.996877372161152e-06, + "loss": 1.00048351, + "num_input_tokens_seen": 16865325, + "router_z_loss_clip": 3.82226562, + "router_z_loss_mlp": 0.6640625, + "step": 786, + "time_per_iteration": 2.645674705505371 + }, + { + "auxiliary_loss_clip": 0.01697931, + "auxiliary_loss_mlp": 0.0070949, + "balance_loss_clip": 1.31398618, + "balance_loss_mlp": 0.63820326, + "epoch": 0.04731699984969187, + "flos": 18077324428800.0, + "grad_norm": 155.24143350845583, + "language_loss": 0.86919188, + "learning_rate": 3.9968555794919065e-06, + "loss": 0.89326608, + "num_input_tokens_seen": 16882930, + "router_z_loss_clip": 3.84375, + "router_z_loss_mlp": 0.71240234, + "step": 787, + "time_per_iteration": 2.5903167724609375 + }, + { + "auxiliary_loss_clip": 0.01695468, + "auxiliary_loss_mlp": 0.0068902, + "balance_loss_clip": 1.32561815, + "balance_loss_mlp": 0.62302554, + "epoch": 0.047377123102359836, + "flos": 23185078416000.0, + "grad_norm": 11.712945113928907, + "language_loss": 0.88195384, + "learning_rate": 3.996833711101698e-06, + "loss": 0.90579879, + "num_input_tokens_seen": 16900710, + "router_z_loss_clip": 3.69726562, + "router_z_loss_mlp": 0.65966797, + "step": 788, + "time_per_iteration": 2.701110601425171 + }, + { + "auxiliary_loss_clip": 0.01670211, + "auxiliary_loss_mlp": 0.00657961, + "balance_loss_clip": 1.31850886, + "balance_loss_mlp": 0.59492296, + "epoch": 0.04743724635502781, + "flos": 22747794243840.0, + "grad_norm": 16.624783614077174, + "language_loss": 0.90065479, + "learning_rate": 3.996811766991355e-06, + "loss": 0.92393649, + "num_input_tokens_seen": 16919210, + "router_z_loss_clip": 3.515625, + "router_z_loss_mlp": 0.63085938, + "step": 789, + "time_per_iteration": 2.6665971279144287 + }, + { + "auxiliary_loss_clip": 0.01697043, + "auxiliary_loss_mlp": 0.00651673, + "balance_loss_clip": 1.33579159, + "balance_loss_mlp": 0.58987534, + "epoch": 0.04749736960769577, + "flos": 17238702620160.0, + "grad_norm": 26.998405653444987, + "language_loss": 0.88663912, + "learning_rate": 3.996789747161709e-06, + "loss": 0.91012633, + "num_input_tokens_seen": 16937125, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 0.6171875, + "step": 790, + "time_per_iteration": 2.8173024654388428 + }, + { + "auxiliary_loss_clip": 0.01684351, + "auxiliary_loss_mlp": 0.00671671, + "balance_loss_clip": 1.32096493, + "balance_loss_mlp": 0.60400748, + "epoch": 0.047557492860363745, + "flos": 40479261592320.0, + "grad_norm": 30.933860354143427, + "language_loss": 0.9445321, + "learning_rate": 3.996767651613597e-06, + "loss": 0.96809232, + "num_input_tokens_seen": 16958610, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 0.67626953, + "step": 791, + "time_per_iteration": 2.7913708686828613 + }, + { + "auxiliary_loss_clip": 0.01679159, + "auxiliary_loss_mlp": 0.00628467, + "balance_loss_clip": 1.31972027, + "balance_loss_mlp": 0.57007837, + "epoch": 0.04761761611303172, + "flos": 18698004466560.0, + "grad_norm": 42.835559807477786, + "language_loss": 0.94821858, + "learning_rate": 3.996745480347854e-06, + "loss": 0.97129482, + "num_input_tokens_seen": 16977300, + "router_z_loss_clip": 3.59179688, + "router_z_loss_mlp": 0.58349609, + "step": 792, + "time_per_iteration": 2.6417407989501953 + }, + { + "auxiliary_loss_clip": 0.01691638, + "auxiliary_loss_mlp": 0.00609187, + "balance_loss_clip": 1.32146442, + "balance_loss_mlp": 0.55339754, + "epoch": 0.04767773936569968, + "flos": 20921987975040.0, + "grad_norm": 47.9325061351851, + "language_loss": 0.79132503, + "learning_rate": 3.996723233365324e-06, + "loss": 0.81433332, + "num_input_tokens_seen": 16994950, + "router_z_loss_clip": 3.703125, + "router_z_loss_mlp": 0.55883789, + "step": 793, + "time_per_iteration": 2.6506357192993164 + }, + { + "auxiliary_loss_clip": 0.0171687, + "auxiliary_loss_mlp": 0.00638299, + "balance_loss_clip": 1.3328805, + "balance_loss_mlp": 0.57497495, + "epoch": 0.047737862618367655, + "flos": 23732680233600.0, + "grad_norm": 40.27346322317569, + "language_loss": 0.93030167, + "learning_rate": 3.996700910666847e-06, + "loss": 0.95385337, + "num_input_tokens_seen": 17014760, + "router_z_loss_clip": 3.83984375, + "router_z_loss_mlp": 0.63354492, + "step": 794, + "time_per_iteration": 2.732424259185791 + }, + { + "auxiliary_loss_clip": 0.01747249, + "auxiliary_loss_mlp": 0.00610187, + "balance_loss_clip": 1.34332323, + "balance_loss_mlp": 0.55186933, + "epoch": 0.04779798587103562, + "flos": 23695764030720.0, + "grad_norm": 32.03584557102573, + "language_loss": 0.77707183, + "learning_rate": 3.996678512253272e-06, + "loss": 0.80064619, + "num_input_tokens_seen": 17032715, + "router_z_loss_clip": 4.04296875, + "router_z_loss_mlp": 0.58349609, + "step": 795, + "time_per_iteration": 2.687696695327759 + }, + { + "auxiliary_loss_clip": 0.01744641, + "auxiliary_loss_mlp": 0.0059147, + "balance_loss_clip": 1.35113621, + "balance_loss_mlp": 0.53663391, + "epoch": 0.04785810912370359, + "flos": 23183641872000.0, + "grad_norm": 11.477493725723159, + "language_loss": 0.8668257, + "learning_rate": 3.996656038125449e-06, + "loss": 0.89018691, + "num_input_tokens_seen": 17052215, + "router_z_loss_clip": 3.93554688, + "router_z_loss_mlp": 0.54882812, + "step": 796, + "time_per_iteration": 2.651655435562134 + }, + { + "auxiliary_loss_clip": 0.01771284, + "auxiliary_loss_mlp": 0.00613483, + "balance_loss_clip": 1.36345148, + "balance_loss_mlp": 0.55297279, + "epoch": 0.047918232376371564, + "flos": 18040623707520.0, + "grad_norm": 80.39089267269694, + "language_loss": 0.89312083, + "learning_rate": 3.996633488284228e-06, + "loss": 0.91696852, + "num_input_tokens_seen": 17069225, + "router_z_loss_clip": 4.078125, + "router_z_loss_mlp": 0.60546875, + "step": 797, + "time_per_iteration": 2.743457794189453 + }, + { + "auxiliary_loss_clip": 0.01474906, + "auxiliary_loss_mlp": 0.00404169, + "balance_loss_clip": 1.24384129, + "balance_loss_mlp": 0.384619, + "epoch": 0.04797835562903953, + "flos": 62442588758400.0, + "grad_norm": 0.9467917394107902, + "language_loss": 0.64544719, + "learning_rate": 3.996610862730465e-06, + "loss": 0.66423792, + "num_input_tokens_seen": 17126680, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.1953125, + "step": 798, + "time_per_iteration": 3.0898241996765137 + }, + { + "auxiliary_loss_clip": 0.01860024, + "auxiliary_loss_mlp": 0.00636737, + "balance_loss_clip": 1.40724587, + "balance_loss_mlp": 0.57241195, + "epoch": 0.0480384788817075, + "flos": 21507296094720.0, + "grad_norm": 70.33132455897956, + "language_loss": 0.96834505, + "learning_rate": 3.996588161465018e-06, + "loss": 0.99331272, + "num_input_tokens_seen": 17144835, + "router_z_loss_clip": 4.53125, + "router_z_loss_mlp": 0.64306641, + "step": 799, + "time_per_iteration": 2.7204036712646484 + }, + { + "auxiliary_loss_clip": 0.01855174, + "auxiliary_loss_mlp": 0.00614753, + "balance_loss_clip": 1.42659497, + "balance_loss_mlp": 0.55634046, + "epoch": 0.048098602134375466, + "flos": 21726710323200.0, + "grad_norm": 128.06904460274455, + "language_loss": 0.91925037, + "learning_rate": 3.996565384488748e-06, + "loss": 0.94394964, + "num_input_tokens_seen": 17165030, + "router_z_loss_clip": 4.29296875, + "router_z_loss_mlp": 0.58422852, + "step": 800, + "time_per_iteration": 2.6985037326812744 + }, + { + "auxiliary_loss_clip": 0.01872611, + "auxiliary_loss_mlp": 0.00600719, + "balance_loss_clip": 1.42619777, + "balance_loss_mlp": 0.54228282, + "epoch": 0.04815872538704344, + "flos": 22931082368640.0, + "grad_norm": 9.512955791076795, + "language_loss": 0.9014957, + "learning_rate": 3.996542531802518e-06, + "loss": 0.92622894, + "num_input_tokens_seen": 17184895, + "router_z_loss_clip": 4.46484375, + "router_z_loss_mlp": 0.58422852, + "step": 801, + "time_per_iteration": 2.749375343322754 + }, + { + "auxiliary_loss_clip": 0.01900708, + "auxiliary_loss_mlp": 0.00639607, + "balance_loss_clip": 1.44764984, + "balance_loss_mlp": 0.57637882, + "epoch": 0.04821884863971141, + "flos": 43174716042240.0, + "grad_norm": 41.37458616582597, + "language_loss": 0.85626942, + "learning_rate": 3.996519603407196e-06, + "loss": 0.88167262, + "num_input_tokens_seen": 17208225, + "router_z_loss_clip": 4.53320312, + "router_z_loss_mlp": 0.63183594, + "step": 802, + "time_per_iteration": 2.8913662433624268 + }, + { + "auxiliary_loss_clip": 0.01872651, + "auxiliary_loss_mlp": 0.00637158, + "balance_loss_clip": 1.42879701, + "balance_loss_mlp": 0.57605088, + "epoch": 0.048278971892379376, + "flos": 18620006083200.0, + "grad_norm": 7.141341785683392, + "language_loss": 0.92570174, + "learning_rate": 3.996496599303649e-06, + "loss": 0.95079982, + "num_input_tokens_seen": 17226305, + "router_z_loss_clip": 4.4453125, + "router_z_loss_mlp": 0.61132812, + "step": 803, + "time_per_iteration": 2.679373264312744 + }, + { + "auxiliary_loss_clip": 0.0189631, + "auxiliary_loss_mlp": 0.00645835, + "balance_loss_clip": 1.45019925, + "balance_loss_mlp": 0.58255845, + "epoch": 0.04833909514504735, + "flos": 20230061310720.0, + "grad_norm": 57.341681876187224, + "language_loss": 0.93902028, + "learning_rate": 3.996473519492753e-06, + "loss": 0.96444166, + "num_input_tokens_seen": 17244545, + "router_z_loss_clip": 4.46875, + "router_z_loss_mlp": 0.63305664, + "step": 804, + "time_per_iteration": 2.8157472610473633 + }, + { + "auxiliary_loss_clip": 0.0192525, + "auxiliary_loss_mlp": 0.00621779, + "balance_loss_clip": 1.46296275, + "balance_loss_mlp": 0.56100631, + "epoch": 0.04839921839771532, + "flos": 24645170361600.0, + "grad_norm": 149.79700091360982, + "language_loss": 0.9161377, + "learning_rate": 3.99645036397538e-06, + "loss": 0.94160801, + "num_input_tokens_seen": 17265730, + "router_z_loss_clip": 4.62109375, + "router_z_loss_mlp": 0.60766602, + "step": 805, + "time_per_iteration": 2.654324769973755 + }, + { + "auxiliary_loss_clip": 0.01919477, + "auxiliary_loss_mlp": 0.00615726, + "balance_loss_clip": 1.45042217, + "balance_loss_mlp": 0.55597806, + "epoch": 0.048459341650383285, + "flos": 24827452905600.0, + "grad_norm": 10.681133091495237, + "language_loss": 0.73470742, + "learning_rate": 3.9964271327524085e-06, + "loss": 0.76005942, + "num_input_tokens_seen": 17284820, + "router_z_loss_clip": 4.69140625, + "router_z_loss_mlp": 0.59667969, + "step": 806, + "time_per_iteration": 2.680438280105591 + }, + { + "auxiliary_loss_clip": 0.01906259, + "auxiliary_loss_mlp": 0.00614998, + "balance_loss_clip": 1.44561863, + "balance_loss_mlp": 0.56094849, + "epoch": 0.04851946490305126, + "flos": 22163204396160.0, + "grad_norm": 63.87799231482107, + "language_loss": 0.82774925, + "learning_rate": 3.9964038258247214e-06, + "loss": 0.85296178, + "num_input_tokens_seen": 17305085, + "router_z_loss_clip": 4.6015625, + "router_z_loss_mlp": 0.54101562, + "step": 807, + "time_per_iteration": 2.6045193672180176 + }, + { + "auxiliary_loss_clip": 0.01928777, + "auxiliary_loss_mlp": 0.00623396, + "balance_loss_clip": 1.46732152, + "balance_loss_mlp": 0.56605655, + "epoch": 0.04857958815571922, + "flos": 19792022952960.0, + "grad_norm": 15.171736061469401, + "language_loss": 0.92958784, + "learning_rate": 3.9963804431932005e-06, + "loss": 0.95510966, + "num_input_tokens_seen": 17322715, + "router_z_loss_clip": 4.61328125, + "router_z_loss_mlp": 0.57324219, + "step": 808, + "time_per_iteration": 2.6949644088745117 + }, + { + "auxiliary_loss_clip": 0.01958761, + "auxiliary_loss_mlp": 0.00658412, + "balance_loss_clip": 1.48206675, + "balance_loss_mlp": 0.59270358, + "epoch": 0.048639711408387194, + "flos": 18697968552960.0, + "grad_norm": 8.13083308779844, + "language_loss": 0.96039653, + "learning_rate": 3.996356984858732e-06, + "loss": 0.98656821, + "num_input_tokens_seen": 17341455, + "router_z_loss_clip": 4.76171875, + "router_z_loss_mlp": 0.65722656, + "step": 809, + "time_per_iteration": 2.6232762336730957 + }, + { + "auxiliary_loss_clip": 0.01950126, + "auxiliary_loss_mlp": 0.00643776, + "balance_loss_clip": 1.49260533, + "balance_loss_mlp": 0.58259773, + "epoch": 0.048699834661055166, + "flos": 24863507182080.0, + "grad_norm": 6.0065731221634255, + "language_loss": 0.91812855, + "learning_rate": 3.996333450822208e-06, + "loss": 0.9440676, + "num_input_tokens_seen": 17360765, + "router_z_loss_clip": 4.5703125, + "router_z_loss_mlp": 0.61132812, + "step": 810, + "time_per_iteration": 2.6927523612976074 + }, + { + "auxiliary_loss_clip": 0.0194961, + "auxiliary_loss_mlp": 0.00617379, + "balance_loss_clip": 1.47997785, + "balance_loss_mlp": 0.55884743, + "epoch": 0.04875995791372313, + "flos": 20704010290560.0, + "grad_norm": 139.49603247061447, + "language_loss": 0.86343634, + "learning_rate": 3.99630984108452e-06, + "loss": 0.88910627, + "num_input_tokens_seen": 17380625, + "router_z_loss_clip": 4.70703125, + "router_z_loss_mlp": 0.58447266, + "step": 811, + "time_per_iteration": 2.6777451038360596 + }, + { + "auxiliary_loss_clip": 0.01938657, + "auxiliary_loss_mlp": 0.00619328, + "balance_loss_clip": 1.48062634, + "balance_loss_mlp": 0.56444359, + "epoch": 0.048820081166391104, + "flos": 18588297352320.0, + "grad_norm": 16.39122673326237, + "language_loss": 0.79464114, + "learning_rate": 3.9962861556465615e-06, + "loss": 0.82022101, + "num_input_tokens_seen": 17399355, + "router_z_loss_clip": 4.578125, + "router_z_loss_mlp": 0.54833984, + "step": 812, + "time_per_iteration": 2.7148749828338623 + }, + { + "auxiliary_loss_clip": 0.0194951, + "auxiliary_loss_mlp": 0.00564321, + "balance_loss_clip": 1.49686468, + "balance_loss_mlp": 0.51351362, + "epoch": 0.04888020441905907, + "flos": 22707322594560.0, + "grad_norm": 19.37989838640658, + "language_loss": 0.95163387, + "learning_rate": 3.996262394509233e-06, + "loss": 0.97677219, + "num_input_tokens_seen": 17418240, + "router_z_loss_clip": 4.52734375, + "router_z_loss_mlp": 0.50805664, + "step": 813, + "time_per_iteration": 2.621969699859619 + }, + { + "auxiliary_loss_clip": 0.01967871, + "auxiliary_loss_mlp": 0.00558417, + "balance_loss_clip": 1.4917593, + "balance_loss_mlp": 0.50546443, + "epoch": 0.04894032767172704, + "flos": 22784351310720.0, + "grad_norm": 148.51658978175917, + "language_loss": 0.82339329, + "learning_rate": 3.9962385576734335e-06, + "loss": 0.84865618, + "num_input_tokens_seen": 17436250, + "router_z_loss_clip": 4.74609375, + "router_z_loss_mlp": 0.52978516, + "step": 814, + "time_per_iteration": 2.6812689304351807 + }, + { + "auxiliary_loss_clip": 0.01992883, + "auxiliary_loss_mlp": 0.00622935, + "balance_loss_clip": 1.50228739, + "balance_loss_mlp": 0.56280619, + "epoch": 0.04900045092439501, + "flos": 25516147345920.0, + "grad_norm": 15.494807207981768, + "language_loss": 0.89238703, + "learning_rate": 3.9962146451400675e-06, + "loss": 0.91854525, + "num_input_tokens_seen": 17455750, + "router_z_loss_clip": 4.90234375, + "router_z_loss_mlp": 0.60107422, + "step": 815, + "time_per_iteration": 2.7152721881866455 + }, + { + "auxiliary_loss_clip": 0.020353, + "auxiliary_loss_mlp": 0.00605897, + "balance_loss_clip": 1.52456713, + "balance_loss_mlp": 0.54681712, + "epoch": 0.04906057417706298, + "flos": 25958136199680.0, + "grad_norm": 79.49121858297036, + "language_loss": 0.98980999, + "learning_rate": 3.996190656910043e-06, + "loss": 1.016222, + "num_input_tokens_seen": 17474995, + "router_z_loss_clip": 5.10546875, + "router_z_loss_mlp": 0.59082031, + "step": 816, + "time_per_iteration": 2.7170348167419434 + }, + { + "auxiliary_loss_clip": 0.02029663, + "auxiliary_loss_mlp": 0.00545444, + "balance_loss_clip": 1.51527441, + "balance_loss_mlp": 0.49394536, + "epoch": 0.04912069742973095, + "flos": 18624638937600.0, + "grad_norm": 24.225954841030525, + "language_loss": 0.86173624, + "learning_rate": 3.996166592984268e-06, + "loss": 0.88748729, + "num_input_tokens_seen": 17493395, + "router_z_loss_clip": 5.1328125, + "router_z_loss_mlp": 0.51513672, + "step": 817, + "time_per_iteration": 2.7165017127990723 + }, + { + "auxiliary_loss_clip": 0.02057423, + "auxiliary_loss_mlp": 0.00596363, + "balance_loss_clip": 1.54008555, + "balance_loss_mlp": 0.53923774, + "epoch": 0.049180820682398915, + "flos": 23699786353920.0, + "grad_norm": 24.612310013554886, + "language_loss": 0.89410353, + "learning_rate": 3.996142453363656e-06, + "loss": 0.92064142, + "num_input_tokens_seen": 17514565, + "router_z_loss_clip": 5.16796875, + "router_z_loss_mlp": 0.5715332, + "step": 818, + "time_per_iteration": 2.7395379543304443 + }, + { + "auxiliary_loss_clip": 0.02071008, + "auxiliary_loss_mlp": 0.00546629, + "balance_loss_clip": 1.54373157, + "balance_loss_mlp": 0.49324745, + "epoch": 0.04924094393506689, + "flos": 22420396753920.0, + "grad_norm": 6.576655520737829, + "language_loss": 0.83518481, + "learning_rate": 3.996118238049124e-06, + "loss": 0.86136115, + "num_input_tokens_seen": 17534590, + "router_z_loss_clip": 5.2734375, + "router_z_loss_mlp": 0.53369141, + "step": 819, + "time_per_iteration": 2.7592594623565674 + }, + { + "auxiliary_loss_clip": 0.02070044, + "auxiliary_loss_mlp": 0.00575626, + "balance_loss_clip": 1.53742218, + "balance_loss_mlp": 0.52233922, + "epoch": 0.04930106718773486, + "flos": 15738246766080.0, + "grad_norm": 3.521882075692675, + "language_loss": 0.90201759, + "learning_rate": 3.996093947041586e-06, + "loss": 0.92847431, + "num_input_tokens_seen": 17551900, + "router_z_loss_clip": 5.3203125, + "router_z_loss_mlp": 0.5324707, + "step": 820, + "time_per_iteration": 2.621433734893799 + }, + { + "auxiliary_loss_clip": 0.02088102, + "auxiliary_loss_mlp": 0.00608356, + "balance_loss_clip": 1.54700172, + "balance_loss_mlp": 0.55073047, + "epoch": 0.049361190440402825, + "flos": 26250628648320.0, + "grad_norm": 21.169884093128427, + "language_loss": 0.95639831, + "learning_rate": 3.996069580341966e-06, + "loss": 0.98336291, + "num_input_tokens_seen": 17571485, + "router_z_loss_clip": 5.4140625, + "router_z_loss_mlp": 0.57641602, + "step": 821, + "time_per_iteration": 5.5618486404418945 + }, + { + "auxiliary_loss_clip": 0.02095164, + "auxiliary_loss_mlp": 0.00538367, + "balance_loss_clip": 1.56291771, + "balance_loss_mlp": 0.48968166, + "epoch": 0.0494213136930708, + "flos": 21252366293760.0, + "grad_norm": 121.33176196475102, + "language_loss": 0.95208395, + "learning_rate": 3.996045137951188e-06, + "loss": 0.97841918, + "num_input_tokens_seen": 17591410, + "router_z_loss_clip": 5.31640625, + "router_z_loss_mlp": 0.48706055, + "step": 822, + "time_per_iteration": 2.7759718894958496 + }, + { + "auxiliary_loss_clip": 0.02078032, + "auxiliary_loss_mlp": 0.00559783, + "balance_loss_clip": 1.55931485, + "balance_loss_mlp": 0.50754577, + "epoch": 0.04948143694573876, + "flos": 27965506740480.0, + "grad_norm": 59.7954962767851, + "language_loss": 0.74669462, + "learning_rate": 3.996020619870178e-06, + "loss": 0.77307278, + "num_input_tokens_seen": 17612010, + "router_z_loss_clip": 5.19140625, + "router_z_loss_mlp": 0.52246094, + "step": 823, + "time_per_iteration": 4.106802701950073 + }, + { + "auxiliary_loss_clip": 0.01947618, + "auxiliary_loss_mlp": 0.00231869, + "balance_loss_clip": 1.67284644, + "balance_loss_mlp": 0.21317743, + "epoch": 0.049541560198406734, + "flos": 66180995533440.0, + "grad_norm": 1.5021650702350493, + "language_loss": 0.62632531, + "learning_rate": 3.995996026099866e-06, + "loss": 0.64812016, + "num_input_tokens_seen": 17673430, + "router_z_loss_clip": 2.75, + "router_z_loss_mlp": 0.18652344, + "step": 824, + "time_per_iteration": 3.218663215637207 + }, + { + "auxiliary_loss_clip": 0.02052215, + "auxiliary_loss_mlp": 0.00632423, + "balance_loss_clip": 1.53640842, + "balance_loss_mlp": 0.57191187, + "epoch": 0.049601683451074706, + "flos": 22892693708160.0, + "grad_norm": 4.986448762622307, + "language_loss": 0.96080399, + "learning_rate": 3.995971356641185e-06, + "loss": 0.98765039, + "num_input_tokens_seen": 17689545, + "router_z_loss_clip": 5.16015625, + "router_z_loss_mlp": 0.60522461, + "step": 825, + "time_per_iteration": 2.6487462520599365 + }, + { + "auxiliary_loss_clip": 0.01990604, + "auxiliary_loss_mlp": 0.00628302, + "balance_loss_clip": 1.50622475, + "balance_loss_mlp": 0.57134342, + "epoch": 0.04966180670374267, + "flos": 21433643256960.0, + "grad_norm": 72.5292786641759, + "language_loss": 0.73832178, + "learning_rate": 3.9959466114950695e-06, + "loss": 0.76451087, + "num_input_tokens_seen": 17705965, + "router_z_loss_clip": 4.8359375, + "router_z_loss_mlp": 0.56982422, + "step": 826, + "time_per_iteration": 2.6590113639831543 + }, + { + "auxiliary_loss_clip": 0.01982033, + "auxiliary_loss_mlp": 0.00665089, + "balance_loss_clip": 1.5020566, + "balance_loss_mlp": 0.60121661, + "epoch": 0.04972192995641064, + "flos": 23107367341440.0, + "grad_norm": 39.94237820448589, + "language_loss": 0.83888173, + "learning_rate": 3.995921790662459e-06, + "loss": 0.86535299, + "num_input_tokens_seen": 17724580, + "router_z_loss_clip": 4.796875, + "router_z_loss_mlp": 0.63891602, + "step": 827, + "time_per_iteration": 2.6934103965759277 + }, + { + "auxiliary_loss_clip": 0.01956368, + "auxiliary_loss_mlp": 0.0066146, + "balance_loss_clip": 1.48178792, + "balance_loss_mlp": 0.59997225, + "epoch": 0.04978205320907861, + "flos": 40406147458560.0, + "grad_norm": 18.315037259798938, + "language_loss": 0.84420794, + "learning_rate": 3.995896894144294e-06, + "loss": 0.87038624, + "num_input_tokens_seen": 17747755, + "router_z_loss_clip": 4.75390625, + "router_z_loss_mlp": 0.61474609, + "step": 828, + "time_per_iteration": 2.8112950325012207 + }, + { + "auxiliary_loss_clip": 0.01919099, + "auxiliary_loss_mlp": 0.00663659, + "balance_loss_clip": 1.47243714, + "balance_loss_mlp": 0.60224187, + "epoch": 0.04984217646174658, + "flos": 25228539146880.0, + "grad_norm": 13800.023793837081, + "language_loss": 0.89132357, + "learning_rate": 3.995871921941519e-06, + "loss": 0.91715109, + "num_input_tokens_seen": 17768550, + "router_z_loss_clip": 4.4609375, + "router_z_loss_mlp": 0.61376953, + "step": 829, + "time_per_iteration": 2.7020108699798584 + }, + { + "auxiliary_loss_clip": 0.01918511, + "auxiliary_loss_mlp": 0.00694476, + "balance_loss_clip": 1.46930528, + "balance_loss_mlp": 0.62590659, + "epoch": 0.04990229971441455, + "flos": 15959636242560.0, + "grad_norm": 94.3226575017001, + "language_loss": 0.82855403, + "learning_rate": 3.99584687405508e-06, + "loss": 0.85468388, + "num_input_tokens_seen": 17786080, + "router_z_loss_clip": 4.49609375, + "router_z_loss_mlp": 0.68505859, + "step": 830, + "time_per_iteration": 2.6879682540893555 + }, + { + "auxiliary_loss_clip": 0.01917334, + "auxiliary_loss_mlp": 0.00687025, + "balance_loss_clip": 1.46498466, + "balance_loss_mlp": 0.62417752, + "epoch": 0.04996242296708252, + "flos": 18405116968320.0, + "grad_norm": 4.664498017085166, + "language_loss": 0.8347764, + "learning_rate": 3.995821750485929e-06, + "loss": 0.86082006, + "num_input_tokens_seen": 17803635, + "router_z_loss_clip": 4.52734375, + "router_z_loss_mlp": 0.62890625, + "step": 831, + "time_per_iteration": 2.5884928703308105 + }, + { + "auxiliary_loss_clip": 0.01870155, + "auxiliary_loss_mlp": 0.007262, + "balance_loss_clip": 1.42827058, + "balance_loss_mlp": 0.66130227, + "epoch": 0.05002254621975049, + "flos": 17858053854720.0, + "grad_norm": 6.031280293274802, + "language_loss": 0.9801628, + "learning_rate": 3.995796551235016e-06, + "loss": 1.0061264, + "num_input_tokens_seen": 17822190, + "router_z_loss_clip": 4.4140625, + "router_z_loss_mlp": 0.64892578, + "step": 832, + "time_per_iteration": 2.655500888824463 + }, + { + "auxiliary_loss_clip": 0.01856654, + "auxiliary_loss_mlp": 0.00713304, + "balance_loss_clip": 1.43562102, + "balance_loss_mlp": 0.64640379, + "epoch": 0.050082669472418455, + "flos": 45660273367680.0, + "grad_norm": 16.10031813951759, + "language_loss": 0.87988597, + "learning_rate": 3.9957712763032974e-06, + "loss": 0.90558559, + "num_input_tokens_seen": 17846915, + "router_z_loss_clip": 4.21289062, + "router_z_loss_mlp": 0.66943359, + "step": 833, + "time_per_iteration": 2.8584516048431396 + }, + { + "auxiliary_loss_clip": 0.01828871, + "auxiliary_loss_mlp": 0.00660417, + "balance_loss_clip": 1.41829872, + "balance_loss_mlp": 0.59742665, + "epoch": 0.05014279272508643, + "flos": 37962067363200.0, + "grad_norm": 9.46645668021778, + "language_loss": 0.87605166, + "learning_rate": 3.995745925691733e-06, + "loss": 0.90094447, + "num_input_tokens_seen": 17867270, + "router_z_loss_clip": 4.10546875, + "router_z_loss_mlp": 0.62939453, + "step": 834, + "time_per_iteration": 2.7947311401367188 + }, + { + "auxiliary_loss_clip": 0.01827164, + "auxiliary_loss_mlp": 0.00636996, + "balance_loss_clip": 1.4157654, + "balance_loss_mlp": 0.58032358, + "epoch": 0.0502029159777544, + "flos": 20996179516800.0, + "grad_norm": 13.522677705756996, + "language_loss": 0.97817659, + "learning_rate": 3.995720499401282e-06, + "loss": 1.00281811, + "num_input_tokens_seen": 17884880, + "router_z_loss_clip": 4.11328125, + "router_z_loss_mlp": 0.56665039, + "step": 835, + "time_per_iteration": 2.6257901191711426 + }, + { + "auxiliary_loss_clip": 0.01804923, + "auxiliary_loss_mlp": 0.00637389, + "balance_loss_clip": 1.4009192, + "balance_loss_mlp": 0.57787991, + "epoch": 0.050263039230422364, + "flos": 15888066393600.0, + "grad_norm": 23.5666771974087, + "language_loss": 0.84499639, + "learning_rate": 3.995694997432911e-06, + "loss": 0.86941952, + "num_input_tokens_seen": 17903695, + "router_z_loss_clip": 4.04296875, + "router_z_loss_mlp": 0.59521484, + "step": 836, + "time_per_iteration": 2.7221529483795166 + }, + { + "auxiliary_loss_clip": 0.01775714, + "auxiliary_loss_mlp": 0.00590374, + "balance_loss_clip": 1.39550686, + "balance_loss_mlp": 0.5373494, + "epoch": 0.050323162483090336, + "flos": 23732752060800.0, + "grad_norm": 32.9990596450426, + "language_loss": 0.89832258, + "learning_rate": 3.9956694197875855e-06, + "loss": 0.92198348, + "num_input_tokens_seen": 17920745, + "router_z_loss_clip": 3.80078125, + "router_z_loss_mlp": 0.53027344, + "step": 837, + "time_per_iteration": 2.7224795818328857 + }, + { + "auxiliary_loss_clip": 0.01822871, + "auxiliary_loss_mlp": 0.00625346, + "balance_loss_clip": 1.42960024, + "balance_loss_mlp": 0.56698078, + "epoch": 0.0503832857357583, + "flos": 20266223328000.0, + "grad_norm": 11.684287960086067, + "language_loss": 0.79501355, + "learning_rate": 3.995643766466275e-06, + "loss": 0.81949568, + "num_input_tokens_seen": 17938220, + "router_z_loss_clip": 3.93554688, + "router_z_loss_mlp": 0.58374023, + "step": 838, + "time_per_iteration": 2.6432714462280273 + }, + { + "auxiliary_loss_clip": 0.01807311, + "auxiliary_loss_mlp": 0.00626187, + "balance_loss_clip": 1.39889097, + "balance_loss_mlp": 0.56801283, + "epoch": 0.05044340898842627, + "flos": 17785011548160.0, + "grad_norm": 11.531138834147129, + "language_loss": 0.87507713, + "learning_rate": 3.995618037469953e-06, + "loss": 0.89941216, + "num_input_tokens_seen": 17957325, + "router_z_loss_clip": 4.08984375, + "router_z_loss_mlp": 0.58154297, + "step": 839, + "time_per_iteration": 2.661890745162964 + }, + { + "auxiliary_loss_clip": 0.01789312, + "auxiliary_loss_mlp": 0.00593055, + "balance_loss_clip": 1.40167117, + "balance_loss_mlp": 0.54062659, + "epoch": 0.050503532241094246, + "flos": 22966526113920.0, + "grad_norm": 44.450854048895486, + "language_loss": 0.91081661, + "learning_rate": 3.995592232799595e-06, + "loss": 0.93464029, + "num_input_tokens_seen": 17975875, + "router_z_loss_clip": 3.88085938, + "router_z_loss_mlp": 0.52441406, + "step": 840, + "time_per_iteration": 2.6979548931121826 + }, + { + "auxiliary_loss_clip": 0.01792389, + "auxiliary_loss_mlp": 0.00602572, + "balance_loss_clip": 1.39741433, + "balance_loss_mlp": 0.54637706, + "epoch": 0.05056365549376221, + "flos": 22776989022720.0, + "grad_norm": 6.837679585794663, + "language_loss": 1.00103331, + "learning_rate": 3.99556635245618e-06, + "loss": 1.02498293, + "num_input_tokens_seen": 17994340, + "router_z_loss_clip": 3.94726562, + "router_z_loss_mlp": 0.56274414, + "step": 841, + "time_per_iteration": 2.683462619781494 + }, + { + "auxiliary_loss_clip": 0.0179708, + "auxiliary_loss_mlp": 0.0056253, + "balance_loss_clip": 1.40491152, + "balance_loss_mlp": 0.50983924, + "epoch": 0.05062377874643018, + "flos": 30916968399360.0, + "grad_norm": 15.162577944300773, + "language_loss": 0.84531176, + "learning_rate": 3.995540396440688e-06, + "loss": 0.86890781, + "num_input_tokens_seen": 18015260, + "router_z_loss_clip": 3.921875, + "router_z_loss_mlp": 0.52685547, + "step": 842, + "time_per_iteration": 2.7239091396331787 + }, + { + "auxiliary_loss_clip": 0.01823941, + "auxiliary_loss_mlp": 0.0059746, + "balance_loss_clip": 1.42272139, + "balance_loss_mlp": 0.53787887, + "epoch": 0.05068390199909815, + "flos": 19647159402240.0, + "grad_norm": 15.61960677967312, + "language_loss": 0.83642244, + "learning_rate": 3.995514364754105e-06, + "loss": 0.86063641, + "num_input_tokens_seen": 18033960, + "router_z_loss_clip": 4.01171875, + "router_z_loss_mlp": 0.59521484, + "step": 843, + "time_per_iteration": 2.6605565547943115 + }, + { + "auxiliary_loss_clip": 0.0180933, + "auxiliary_loss_mlp": 0.00575262, + "balance_loss_clip": 1.41319823, + "balance_loss_mlp": 0.52099806, + "epoch": 0.05074402525176612, + "flos": 37962103276800.0, + "grad_norm": 19.771051366402194, + "language_loss": 0.89457417, + "learning_rate": 3.995488257397417e-06, + "loss": 0.91842014, + "num_input_tokens_seen": 18056700, + "router_z_loss_clip": 3.95507812, + "router_z_loss_mlp": 0.54321289, + "step": 844, + "time_per_iteration": 2.803661823272705 + }, + { + "auxiliary_loss_clip": 0.0175222, + "auxiliary_loss_mlp": 0.0051188, + "balance_loss_clip": 1.36945617, + "balance_loss_mlp": 0.46402895, + "epoch": 0.05080414850443409, + "flos": 22054610603520.0, + "grad_norm": 27.97210265298818, + "language_loss": 0.82759351, + "learning_rate": 3.995462074371614e-06, + "loss": 0.85023445, + "num_input_tokens_seen": 18075815, + "router_z_loss_clip": 3.82421875, + "router_z_loss_mlp": 0.47851562, + "step": 845, + "time_per_iteration": 2.65384840965271 + }, + { + "auxiliary_loss_clip": 0.01748123, + "auxiliary_loss_mlp": 0.00534968, + "balance_loss_clip": 1.37789094, + "balance_loss_mlp": 0.48420846, + "epoch": 0.05086427175710206, + "flos": 20225787592320.0, + "grad_norm": 6.728449134179256, + "language_loss": 0.94173467, + "learning_rate": 3.99543581567769e-06, + "loss": 0.96456552, + "num_input_tokens_seen": 18095095, + "router_z_loss_clip": 3.70507812, + "router_z_loss_mlp": 0.50805664, + "step": 846, + "time_per_iteration": 2.6099023818969727 + }, + { + "auxiliary_loss_clip": 0.01765746, + "auxiliary_loss_mlp": 0.00569879, + "balance_loss_clip": 1.38826609, + "balance_loss_mlp": 0.51795155, + "epoch": 0.05092439500977003, + "flos": 15159223526400.0, + "grad_norm": 289.5224484625523, + "language_loss": 0.92622137, + "learning_rate": 3.9954094813166394e-06, + "loss": 0.94957757, + "num_input_tokens_seen": 18112675, + "router_z_loss_clip": 3.77539062, + "router_z_loss_mlp": 0.51953125, + "step": 847, + "time_per_iteration": 2.6253128051757812 + }, + { + "auxiliary_loss_clip": 0.01744629, + "auxiliary_loss_mlp": 0.00528142, + "balance_loss_clip": 1.37170577, + "balance_loss_mlp": 0.48012424, + "epoch": 0.050984518262437994, + "flos": 22055149307520.0, + "grad_norm": 7.9634515612335175, + "language_loss": 0.88365161, + "learning_rate": 3.995383071289462e-06, + "loss": 0.90637934, + "num_input_tokens_seen": 18130745, + "router_z_loss_clip": 3.72851562, + "router_z_loss_mlp": 0.48022461, + "step": 848, + "time_per_iteration": 2.610164165496826 + }, + { + "auxiliary_loss_clip": 0.01768101, + "auxiliary_loss_mlp": 0.00542718, + "balance_loss_clip": 1.38414812, + "balance_loss_mlp": 0.49305579, + "epoch": 0.05104464151510597, + "flos": 30225329043840.0, + "grad_norm": 11.171096467137474, + "language_loss": 0.92854655, + "learning_rate": 3.995356585597158e-06, + "loss": 0.95165467, + "num_input_tokens_seen": 18152410, + "router_z_loss_clip": 3.83984375, + "router_z_loss_mlp": 0.49658203, + "step": 849, + "time_per_iteration": 2.777496576309204 + }, + { + "auxiliary_loss_clip": 0.01791547, + "auxiliary_loss_mlp": 0.00486207, + "balance_loss_clip": 1.40575635, + "balance_loss_mlp": 0.4432677, + "epoch": 0.05110476476777394, + "flos": 18332900674560.0, + "grad_norm": 10.891672760550325, + "language_loss": 0.89826548, + "learning_rate": 3.995330024240732e-06, + "loss": 0.92104298, + "num_input_tokens_seen": 18170870, + "router_z_loss_clip": 3.85742188, + "router_z_loss_mlp": 0.42944336, + "step": 850, + "time_per_iteration": 2.6980459690093994 + }, + { + "auxiliary_loss_clip": 0.01804584, + "auxiliary_loss_mlp": 0.00527217, + "balance_loss_clip": 1.41541338, + "balance_loss_mlp": 0.47826961, + "epoch": 0.051164888020441904, + "flos": 37998732170880.0, + "grad_norm": 6.462387989099383, + "language_loss": 0.73496753, + "learning_rate": 3.995303387221192e-06, + "loss": 0.75828552, + "num_input_tokens_seen": 18191555, + "router_z_loss_clip": 3.89453125, + "router_z_loss_mlp": 0.48950195, + "step": 851, + "time_per_iteration": 2.8701047897338867 + }, + { + "auxiliary_loss_clip": 0.0180674, + "auxiliary_loss_mlp": 0.00494134, + "balance_loss_clip": 1.41060925, + "balance_loss_mlp": 0.44566375, + "epoch": 0.051225011273109876, + "flos": 23038634666880.0, + "grad_norm": 154.8695135520212, + "language_loss": 0.90431666, + "learning_rate": 3.995276674539547e-06, + "loss": 0.92732543, + "num_input_tokens_seen": 18208620, + "router_z_loss_clip": 3.95703125, + "router_z_loss_mlp": 0.484375, + "step": 852, + "time_per_iteration": 2.6970930099487305 + }, + { + "auxiliary_loss_clip": 0.01821816, + "auxiliary_loss_mlp": 0.005125, + "balance_loss_clip": 1.41785645, + "balance_loss_mlp": 0.46309948, + "epoch": 0.05128513452577785, + "flos": 18259822454400.0, + "grad_norm": 21.959710238248892, + "language_loss": 0.8631658, + "learning_rate": 3.995249886196811e-06, + "loss": 0.88650888, + "num_input_tokens_seen": 18226370, + "router_z_loss_clip": 4.03515625, + "router_z_loss_mlp": 0.49365234, + "step": 853, + "time_per_iteration": 2.612952709197998 + }, + { + "auxiliary_loss_clip": 0.01807803, + "auxiliary_loss_mlp": 0.00519624, + "balance_loss_clip": 1.39804935, + "balance_loss_mlp": 0.47039077, + "epoch": 0.05134525777844581, + "flos": 27198957571200.0, + "grad_norm": 18.5981467577841, + "language_loss": 0.8345623, + "learning_rate": 3.995223022193999e-06, + "loss": 0.8578366, + "num_input_tokens_seen": 18247075, + "router_z_loss_clip": 4.10546875, + "router_z_loss_mlp": 0.49243164, + "step": 854, + "time_per_iteration": 2.7002182006835938 + }, + { + "auxiliary_loss_clip": 0.01794944, + "auxiliary_loss_mlp": 0.00468083, + "balance_loss_clip": 1.3936404, + "balance_loss_mlp": 0.42252105, + "epoch": 0.051405381031113785, + "flos": 28362247436160.0, + "grad_norm": 17.102669687057773, + "language_loss": 0.86831594, + "learning_rate": 3.99519608253213e-06, + "loss": 0.89094615, + "num_input_tokens_seen": 18265680, + "router_z_loss_clip": 4.01171875, + "router_z_loss_mlp": 0.45581055, + "step": 855, + "time_per_iteration": 2.703603506088257 + }, + { + "auxiliary_loss_clip": 0.01416758, + "auxiliary_loss_mlp": 0.00376836, + "balance_loss_clip": 1.19377446, + "balance_loss_mlp": 0.35223138, + "epoch": 0.05146550428378175, + "flos": 65618169327360.0, + "grad_norm": 0.9659037549788141, + "language_loss": 0.65370506, + "learning_rate": 3.995169067212227e-06, + "loss": 0.67164099, + "num_input_tokens_seen": 18327015, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.24511719, + "step": 856, + "time_per_iteration": 3.125983476638794 + }, + { + "auxiliary_loss_clip": 0.01838339, + "auxiliary_loss_mlp": 0.00484257, + "balance_loss_clip": 1.42251277, + "balance_loss_mlp": 0.44076979, + "epoch": 0.05152562753644972, + "flos": 22054861998720.0, + "grad_norm": 23.398901697124433, + "language_loss": 0.81381327, + "learning_rate": 3.9951419762353116e-06, + "loss": 0.83703923, + "num_input_tokens_seen": 18345235, + "router_z_loss_clip": 4.1640625, + "router_z_loss_mlp": 0.43530273, + "step": 857, + "time_per_iteration": 2.6325879096984863 + }, + { + "auxiliary_loss_clip": 0.01847236, + "auxiliary_loss_mlp": 0.0048602, + "balance_loss_clip": 1.41629648, + "balance_loss_mlp": 0.44029117, + "epoch": 0.051585750789117694, + "flos": 18509544783360.0, + "grad_norm": 2.8991958216961797, + "language_loss": 0.94211203, + "learning_rate": 3.995114809602412e-06, + "loss": 0.96544456, + "num_input_tokens_seen": 18362350, + "router_z_loss_clip": 4.31640625, + "router_z_loss_mlp": 0.45776367, + "step": 858, + "time_per_iteration": 2.654993772506714 + }, + { + "auxiliary_loss_clip": 0.0185559, + "auxiliary_loss_mlp": 0.00450773, + "balance_loss_clip": 1.42294741, + "balance_loss_mlp": 0.40611771, + "epoch": 0.05164587404178566, + "flos": 23730238108800.0, + "grad_norm": 2.8644293644391787, + "language_loss": 0.84076244, + "learning_rate": 3.9950875673145605e-06, + "loss": 0.8638261, + "num_input_tokens_seen": 18383390, + "router_z_loss_clip": 4.328125, + "router_z_loss_mlp": 0.4465332, + "step": 859, + "time_per_iteration": 2.6572022438049316 + }, + { + "auxiliary_loss_clip": 0.01877652, + "auxiliary_loss_mlp": 0.00465733, + "balance_loss_clip": 1.42919087, + "balance_loss_mlp": 0.42157817, + "epoch": 0.05170599729445363, + "flos": 16252882876800.0, + "grad_norm": 8.120185033276039, + "language_loss": 0.9786644, + "learning_rate": 3.995060249372788e-06, + "loss": 1.00209832, + "num_input_tokens_seen": 18399220, + "router_z_loss_clip": 4.484375, + "router_z_loss_mlp": 0.44140625, + "step": 860, + "time_per_iteration": 2.6666126251220703 + }, + { + "auxiliary_loss_clip": 0.01816181, + "auxiliary_loss_mlp": 0.00453932, + "balance_loss_clip": 1.39375699, + "balance_loss_mlp": 0.41368699, + "epoch": 0.0517661205471216, + "flos": 23985922095360.0, + "grad_norm": 4.95519167257323, + "language_loss": 0.87895405, + "learning_rate": 3.99503285577813e-06, + "loss": 0.9016552, + "num_input_tokens_seen": 18419005, + "router_z_loss_clip": 4.22265625, + "router_z_loss_mlp": 0.40209961, + "step": 861, + "time_per_iteration": 2.6688601970672607 + }, + { + "auxiliary_loss_clip": 0.01849814, + "auxiliary_loss_mlp": 0.00492742, + "balance_loss_clip": 1.42095256, + "balance_loss_mlp": 0.44796747, + "epoch": 0.05182624379978957, + "flos": 29277718392960.0, + "grad_norm": 51.98539204076333, + "language_loss": 0.84816521, + "learning_rate": 3.995005386531627e-06, + "loss": 0.87159079, + "num_input_tokens_seen": 18440550, + "router_z_loss_clip": 4.2890625, + "router_z_loss_mlp": 0.44775391, + "step": 862, + "time_per_iteration": 2.7266600131988525 + }, + { + "auxiliary_loss_clip": 0.01842484, + "auxiliary_loss_mlp": 0.0047518, + "balance_loss_clip": 1.42311859, + "balance_loss_mlp": 0.43462467, + "epoch": 0.05188636705245754, + "flos": 24170826332160.0, + "grad_norm": 13.748884119792233, + "language_loss": 0.95299232, + "learning_rate": 3.9949778416343195e-06, + "loss": 0.97616899, + "num_input_tokens_seen": 18461950, + "router_z_loss_clip": 4.19921875, + "router_z_loss_mlp": 0.40576172, + "step": 863, + "time_per_iteration": 6.9993977546691895 + }, + { + "auxiliary_loss_clip": 0.01818335, + "auxiliary_loss_mlp": 0.00488351, + "balance_loss_clip": 1.39554441, + "balance_loss_mlp": 0.44514924, + "epoch": 0.051946490305125506, + "flos": 26760703731840.0, + "grad_norm": 79.08470432130815, + "language_loss": 0.83019346, + "learning_rate": 3.9949502210872525e-06, + "loss": 0.85326034, + "num_input_tokens_seen": 18480555, + "router_z_loss_clip": 4.2265625, + "router_z_loss_mlp": 0.43188477, + "step": 864, + "time_per_iteration": 2.7332234382629395 + }, + { + "auxiliary_loss_clip": 0.01825519, + "auxiliary_loss_mlp": 0.004641, + "balance_loss_clip": 1.39407909, + "balance_loss_mlp": 0.42273426, + "epoch": 0.05200661355779348, + "flos": 21502519585920.0, + "grad_norm": 28.25080088290974, + "language_loss": 0.85253006, + "learning_rate": 3.994922524891474e-06, + "loss": 0.87542629, + "num_input_tokens_seen": 18499645, + "router_z_loss_clip": 4.31054688, + "router_z_loss_mlp": 0.41357422, + "step": 865, + "time_per_iteration": 4.068542718887329 + }, + { + "auxiliary_loss_clip": 0.01813655, + "auxiliary_loss_mlp": 0.00485492, + "balance_loss_clip": 1.40094817, + "balance_loss_mlp": 0.44205266, + "epoch": 0.05206673681046144, + "flos": 18114492026880.0, + "grad_norm": 94.04546712657232, + "language_loss": 0.91081071, + "learning_rate": 3.994894753048032e-06, + "loss": 0.93380225, + "num_input_tokens_seen": 18516810, + "router_z_loss_clip": 4.12890625, + "router_z_loss_mlp": 0.43383789, + "step": 866, + "time_per_iteration": 2.5832226276397705 + }, + { + "auxiliary_loss_clip": 0.01752927, + "auxiliary_loss_mlp": 0.00432133, + "balance_loss_clip": 1.36585259, + "balance_loss_mlp": 0.39310366, + "epoch": 0.052126860063129415, + "flos": 17524191916800.0, + "grad_norm": 4.480715756466937, + "language_loss": 0.95786309, + "learning_rate": 3.9948669055579815e-06, + "loss": 0.97971368, + "num_input_tokens_seen": 18532510, + "router_z_loss_clip": 3.87109375, + "router_z_loss_mlp": 0.390625, + "step": 867, + "time_per_iteration": 2.617755889892578 + }, + { + "auxiliary_loss_clip": 0.01750036, + "auxiliary_loss_mlp": 0.00443482, + "balance_loss_clip": 1.36745274, + "balance_loss_mlp": 0.40547836, + "epoch": 0.05218698331579739, + "flos": 32598054771840.0, + "grad_norm": 17.21715381494275, + "language_loss": 0.67033482, + "learning_rate": 3.9948389824223785e-06, + "loss": 0.69227004, + "num_input_tokens_seen": 18557380, + "router_z_loss_clip": 3.828125, + "router_z_loss_mlp": 0.38012695, + "step": 868, + "time_per_iteration": 2.694946527481079 + }, + { + "auxiliary_loss_clip": 0.01763915, + "auxiliary_loss_mlp": 0.00496021, + "balance_loss_clip": 1.37502027, + "balance_loss_mlp": 0.4503873, + "epoch": 0.05224710656846535, + "flos": 22127293774080.0, + "grad_norm": 30.101169403263086, + "language_loss": 0.89680946, + "learning_rate": 3.994810983642281e-06, + "loss": 0.91940886, + "num_input_tokens_seen": 18575720, + "router_z_loss_clip": 3.88476562, + "router_z_loss_mlp": 0.45678711, + "step": 869, + "time_per_iteration": 2.612485647201538 + }, + { + "auxiliary_loss_clip": 0.01739797, + "auxiliary_loss_mlp": 0.00519835, + "balance_loss_clip": 1.36049187, + "balance_loss_mlp": 0.47682473, + "epoch": 0.052307229821133325, + "flos": 11145092976000.0, + "grad_norm": 13.603991159104815, + "language_loss": 0.94369733, + "learning_rate": 3.994782909218751e-06, + "loss": 0.96629357, + "num_input_tokens_seen": 18592185, + "router_z_loss_clip": 3.79101562, + "router_z_loss_mlp": 0.42993164, + "step": 870, + "time_per_iteration": 2.5729899406433105 + }, + { + "auxiliary_loss_clip": 0.01739533, + "auxiliary_loss_mlp": 0.00488883, + "balance_loss_clip": 1.36362672, + "balance_loss_mlp": 0.44773245, + "epoch": 0.05236735307380129, + "flos": 19128070005120.0, + "grad_norm": 6.646130373083594, + "language_loss": 0.86846924, + "learning_rate": 3.994754759152854e-06, + "loss": 0.89075339, + "num_input_tokens_seen": 18609560, + "router_z_loss_clip": 3.7578125, + "router_z_loss_mlp": 0.41113281, + "step": 871, + "time_per_iteration": 2.632363796234131 + }, + { + "auxiliary_loss_clip": 0.01733193, + "auxiliary_loss_mlp": 0.00452427, + "balance_loss_clip": 1.36935544, + "balance_loss_mlp": 0.41456571, + "epoch": 0.05242747632646926, + "flos": 20960663944320.0, + "grad_norm": 10.969765354594596, + "language_loss": 0.85358477, + "learning_rate": 3.994726533445656e-06, + "loss": 0.87544096, + "num_input_tokens_seen": 18629405, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 0.37841797, + "step": 872, + "time_per_iteration": 2.643563747406006 + }, + { + "auxiliary_loss_clip": 0.0138621, + "auxiliary_loss_mlp": 0.00408666, + "balance_loss_clip": 1.16170096, + "balance_loss_mlp": 0.38806689, + "epoch": 0.052487599579137234, + "flos": 65020542842880.0, + "grad_norm": 1.2823183081890244, + "language_loss": 0.61694765, + "learning_rate": 3.9946982320982274e-06, + "loss": 0.6348964, + "num_input_tokens_seen": 18681480, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.20605469, + "step": 873, + "time_per_iteration": 3.039125680923462 + }, + { + "auxiliary_loss_clip": 0.01727478, + "auxiliary_loss_mlp": 0.00490882, + "balance_loss_clip": 1.35732448, + "balance_loss_mlp": 0.44701278, + "epoch": 0.0525477228318052, + "flos": 23288859786240.0, + "grad_norm": 4.480880507769946, + "language_loss": 0.94957238, + "learning_rate": 3.994669855111643e-06, + "loss": 0.97175604, + "num_input_tokens_seen": 18700390, + "router_z_loss_clip": 3.70507812, + "router_z_loss_mlp": 0.43847656, + "step": 874, + "time_per_iteration": 2.6265294551849365 + }, + { + "auxiliary_loss_clip": 0.01742645, + "auxiliary_loss_mlp": 0.00453563, + "balance_loss_clip": 1.36618471, + "balance_loss_mlp": 0.410815, + "epoch": 0.05260784608447317, + "flos": 32230221546240.0, + "grad_norm": 108.55232001613368, + "language_loss": 0.79611647, + "learning_rate": 3.994641402486977e-06, + "loss": 0.81807858, + "num_input_tokens_seen": 18721280, + "router_z_loss_clip": 3.76367188, + "router_z_loss_mlp": 0.42749023, + "step": 875, + "time_per_iteration": 2.7108123302459717 + }, + { + "auxiliary_loss_clip": 0.01702986, + "auxiliary_loss_mlp": 0.0044744, + "balance_loss_clip": 1.34875894, + "balance_loss_mlp": 0.40817294, + "epoch": 0.052667969337141136, + "flos": 24463211040000.0, + "grad_norm": 185.84883256414145, + "language_loss": 0.98097688, + "learning_rate": 3.99461287422531e-06, + "loss": 1.0024811, + "num_input_tokens_seen": 18741545, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 0.39282227, + "step": 876, + "time_per_iteration": 2.727052927017212 + }, + { + "auxiliary_loss_clip": 0.01362129, + "auxiliary_loss_mlp": 0.00228833, + "balance_loss_clip": 1.14320755, + "balance_loss_mlp": 0.21319227, + "epoch": 0.05272809258980911, + "flos": 57784329567360.0, + "grad_norm": 0.8105898847675201, + "language_loss": 0.62917268, + "learning_rate": 3.994584270327722e-06, + "loss": 0.64508224, + "num_input_tokens_seen": 18801400, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.15625, + "step": 877, + "time_per_iteration": 3.1606452465057373 + }, + { + "auxiliary_loss_clip": 0.01735485, + "auxiliary_loss_mlp": 0.00509755, + "balance_loss_clip": 1.36859751, + "balance_loss_mlp": 0.46498042, + "epoch": 0.05278821584247708, + "flos": 17420805596160.0, + "grad_norm": 53.329454199134176, + "language_loss": 0.91574776, + "learning_rate": 3.994555590795299e-06, + "loss": 0.93820012, + "num_input_tokens_seen": 18819670, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 0.44750977, + "step": 878, + "time_per_iteration": 2.655395030975342 + }, + { + "auxiliary_loss_clip": 0.01721791, + "auxiliary_loss_mlp": 0.00470242, + "balance_loss_clip": 1.35854495, + "balance_loss_mlp": 0.42975807, + "epoch": 0.052848339095145046, + "flos": 26137258346880.0, + "grad_norm": 11.289471182359376, + "language_loss": 0.90511191, + "learning_rate": 3.9945268356291275e-06, + "loss": 0.92703223, + "num_input_tokens_seen": 18840580, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 0.4050293, + "step": 879, + "time_per_iteration": 2.6622626781463623 + }, + { + "auxiliary_loss_clip": 0.01729424, + "auxiliary_loss_mlp": 0.00496735, + "balance_loss_clip": 1.37161911, + "balance_loss_mlp": 0.45420146, + "epoch": 0.05290846234781302, + "flos": 16472081623680.0, + "grad_norm": 19.814622542722052, + "language_loss": 0.90951884, + "learning_rate": 3.9944980048302985e-06, + "loss": 0.93178046, + "num_input_tokens_seen": 18859295, + "router_z_loss_clip": 3.58203125, + "router_z_loss_mlp": 0.42578125, + "step": 880, + "time_per_iteration": 2.6624526977539062 + }, + { + "auxiliary_loss_clip": 0.01752044, + "auxiliary_loss_mlp": 0.00539682, + "balance_loss_clip": 1.37833142, + "balance_loss_mlp": 0.49459723, + "epoch": 0.05296858560048098, + "flos": 19865173000320.0, + "grad_norm": 20.64539096850892, + "language_loss": 0.9406693, + "learning_rate": 3.994469098399906e-06, + "loss": 0.96358651, + "num_input_tokens_seen": 18877485, + "router_z_loss_clip": 3.734375, + "router_z_loss_mlp": 0.45092773, + "step": 881, + "time_per_iteration": 2.631316661834717 + }, + { + "auxiliary_loss_clip": 0.01766423, + "auxiliary_loss_mlp": 0.00491335, + "balance_loss_clip": 1.3840996, + "balance_loss_mlp": 0.44663179, + "epoch": 0.053028708853148955, + "flos": 24388588535040.0, + "grad_norm": 69.89188300332219, + "language_loss": 0.93824089, + "learning_rate": 3.994440116339046e-06, + "loss": 0.96081853, + "num_input_tokens_seen": 18898275, + "router_z_loss_clip": 3.82226562, + "router_z_loss_mlp": 0.44702148, + "step": 882, + "time_per_iteration": 2.6870999336242676 + }, + { + "auxiliary_loss_clip": 0.01794781, + "auxiliary_loss_mlp": 0.00515078, + "balance_loss_clip": 1.40209651, + "balance_loss_mlp": 0.46613115, + "epoch": 0.05308883210581693, + "flos": 36393166143360.0, + "grad_norm": 5.288844038076498, + "language_loss": 0.77087057, + "learning_rate": 3.994411058648816e-06, + "loss": 0.79396915, + "num_input_tokens_seen": 18920665, + "router_z_loss_clip": 3.92773438, + "router_z_loss_mlp": 0.48901367, + "step": 883, + "time_per_iteration": 2.8606038093566895 + }, + { + "auxiliary_loss_clip": 0.01806552, + "auxiliary_loss_mlp": 0.00488355, + "balance_loss_clip": 1.4089551, + "balance_loss_mlp": 0.4426502, + "epoch": 0.05314895535848489, + "flos": 22855095146880.0, + "grad_norm": 6.88384230790795, + "language_loss": 0.82693535, + "learning_rate": 3.994381925330319e-06, + "loss": 0.84988439, + "num_input_tokens_seen": 18939835, + "router_z_loss_clip": 3.97460938, + "router_z_loss_mlp": 0.45654297, + "step": 884, + "time_per_iteration": 2.6392874717712402 + }, + { + "auxiliary_loss_clip": 0.01821359, + "auxiliary_loss_mlp": 0.0047535, + "balance_loss_clip": 1.42315745, + "balance_loss_mlp": 0.43844295, + "epoch": 0.053209078611152864, + "flos": 12860330204160.0, + "grad_norm": 756.2058686700101, + "language_loss": 0.92832446, + "learning_rate": 3.994352716384659e-06, + "loss": 0.95129156, + "num_input_tokens_seen": 18958405, + "router_z_loss_clip": 3.98046875, + "router_z_loss_mlp": 0.36889648, + "step": 885, + "time_per_iteration": 2.631077289581299 + }, + { + "auxiliary_loss_clip": 0.0185807, + "auxiliary_loss_mlp": 0.00483352, + "balance_loss_clip": 1.44712305, + "balance_loss_mlp": 0.44117612, + "epoch": 0.05326920186382083, + "flos": 12164596698240.0, + "grad_norm": 44.61466463436276, + "language_loss": 0.9446938, + "learning_rate": 3.994323431812945e-06, + "loss": 0.96810806, + "num_input_tokens_seen": 18975445, + "router_z_loss_clip": 4.11132812, + "router_z_loss_mlp": 0.42163086, + "step": 886, + "time_per_iteration": 2.668527841567993 + }, + { + "auxiliary_loss_clip": 0.01866275, + "auxiliary_loss_mlp": 0.0045802, + "balance_loss_clip": 1.45486283, + "balance_loss_mlp": 0.41529524, + "epoch": 0.0533293251164888, + "flos": 22704485420160.0, + "grad_norm": 48.137273334601375, + "language_loss": 0.95188934, + "learning_rate": 3.994294071616286e-06, + "loss": 0.97513223, + "num_input_tokens_seen": 18991930, + "router_z_loss_clip": 4.1171875, + "router_z_loss_mlp": 0.42700195, + "step": 887, + "time_per_iteration": 2.6415271759033203 + }, + { + "auxiliary_loss_clip": 0.01882104, + "auxiliary_loss_mlp": 0.00460157, + "balance_loss_clip": 1.45388722, + "balance_loss_mlp": 0.41721737, + "epoch": 0.053389448369156774, + "flos": 26940939200640.0, + "grad_norm": 130.41107995658203, + "language_loss": 0.79995942, + "learning_rate": 3.994264635795796e-06, + "loss": 0.82338202, + "num_input_tokens_seen": 19009790, + "router_z_loss_clip": 4.28125, + "router_z_loss_mlp": 0.4296875, + "step": 888, + "time_per_iteration": 2.6423799991607666 + }, + { + "auxiliary_loss_clip": 0.01880745, + "auxiliary_loss_mlp": 0.00424104, + "balance_loss_clip": 1.45791459, + "balance_loss_mlp": 0.38626707, + "epoch": 0.05344957162182474, + "flos": 25556331686400.0, + "grad_norm": 40.01266621964378, + "language_loss": 0.94601011, + "learning_rate": 3.994235124352592e-06, + "loss": 0.96905869, + "num_input_tokens_seen": 19030170, + "router_z_loss_clip": 4.2265625, + "router_z_loss_mlp": 0.37841797, + "step": 889, + "time_per_iteration": 2.7261788845062256 + }, + { + "auxiliary_loss_clip": 0.01904904, + "auxiliary_loss_mlp": 0.00392318, + "balance_loss_clip": 1.46991825, + "balance_loss_mlp": 0.35903525, + "epoch": 0.05350969487449271, + "flos": 19719591177600.0, + "grad_norm": 6.150673468330552, + "language_loss": 0.94277948, + "learning_rate": 3.994205537287791e-06, + "loss": 0.96575171, + "num_input_tokens_seen": 19048075, + "router_z_loss_clip": 4.35351562, + "router_z_loss_mlp": 0.33300781, + "step": 890, + "time_per_iteration": 2.6315758228302 + }, + { + "auxiliary_loss_clip": 0.01909533, + "auxiliary_loss_mlp": 0.00422282, + "balance_loss_clip": 1.47044182, + "balance_loss_mlp": 0.38272801, + "epoch": 0.053569818127160676, + "flos": 27016351804800.0, + "grad_norm": 7.292798196424232, + "language_loss": 1.02258134, + "learning_rate": 3.994175874602517e-06, + "loss": 1.04589963, + "num_input_tokens_seen": 19067465, + "router_z_loss_clip": 4.3828125, + "router_z_loss_mlp": 0.39550781, + "step": 891, + "time_per_iteration": 2.730222463607788 + }, + { + "auxiliary_loss_clip": 0.01907218, + "auxiliary_loss_mlp": 0.00400269, + "balance_loss_clip": 1.45915091, + "balance_loss_mlp": 0.3626225, + "epoch": 0.05362994137982865, + "flos": 13188338225280.0, + "grad_norm": 11.449869997346486, + "language_loss": 0.77762532, + "learning_rate": 3.994146136297893e-06, + "loss": 0.80070019, + "num_input_tokens_seen": 19085505, + "router_z_loss_clip": 4.484375, + "router_z_loss_mlp": 0.37670898, + "step": 892, + "time_per_iteration": 2.5977931022644043 + }, + { + "auxiliary_loss_clip": 0.01936303, + "auxiliary_loss_mlp": 0.00386534, + "balance_loss_clip": 1.46724665, + "balance_loss_mlp": 0.35136753, + "epoch": 0.05369006463249662, + "flos": 28658008022400.0, + "grad_norm": 73.52392366010264, + "language_loss": 0.89767838, + "learning_rate": 3.994116322375049e-06, + "loss": 0.92090678, + "num_input_tokens_seen": 19104360, + "router_z_loss_clip": 4.6953125, + "router_z_loss_mlp": 0.3515625, + "step": 893, + "time_per_iteration": 2.7308437824249268 + }, + { + "auxiliary_loss_clip": 0.01917745, + "auxiliary_loss_mlp": 0.00387407, + "balance_loss_clip": 1.45360613, + "balance_loss_mlp": 0.35266903, + "epoch": 0.053750187885164585, + "flos": 28913153304960.0, + "grad_norm": 6.6061548550390485, + "language_loss": 0.89339435, + "learning_rate": 3.994086432835114e-06, + "loss": 0.91644579, + "num_input_tokens_seen": 19124680, + "router_z_loss_clip": 4.64453125, + "router_z_loss_mlp": 0.34741211, + "step": 894, + "time_per_iteration": 2.665363073348999 + }, + { + "auxiliary_loss_clip": 0.01909769, + "auxiliary_loss_mlp": 0.00371733, + "balance_loss_clip": 1.44957817, + "balance_loss_mlp": 0.33363372, + "epoch": 0.05381031113783256, + "flos": 15158828476800.0, + "grad_norm": 22.407071972748415, + "language_loss": 0.82978785, + "learning_rate": 3.994056467679221e-06, + "loss": 0.8526029, + "num_input_tokens_seen": 19142895, + "router_z_loss_clip": 4.6015625, + "router_z_loss_mlp": 0.38085938, + "step": 895, + "time_per_iteration": 2.7008798122406006 + }, + { + "auxiliary_loss_clip": 0.01996582, + "auxiliary_loss_mlp": 0.00398709, + "balance_loss_clip": 1.48255277, + "balance_loss_mlp": 0.36094368, + "epoch": 0.05387043439050053, + "flos": 21835232288640.0, + "grad_norm": 8.770876148159605, + "language_loss": 0.93599904, + "learning_rate": 3.9940264269085065e-06, + "loss": 0.959952, + "num_input_tokens_seen": 19163125, + "router_z_loss_clip": 5.1328125, + "router_z_loss_mlp": 0.37768555, + "step": 896, + "time_per_iteration": 2.700834035873413 + }, + { + "auxiliary_loss_clip": 0.02064736, + "auxiliary_loss_mlp": 0.00410395, + "balance_loss_clip": 1.49287081, + "balance_loss_mlp": 0.36957788, + "epoch": 0.053930557643168495, + "flos": 17310308382720.0, + "grad_norm": 24.972660284814282, + "language_loss": 0.96285182, + "learning_rate": 3.9939963105241115e-06, + "loss": 0.98760319, + "num_input_tokens_seen": 19179385, + "router_z_loss_clip": 5.72265625, + "router_z_loss_mlp": 0.40820312, + "step": 897, + "time_per_iteration": 2.7015769481658936 + }, + { + "auxiliary_loss_clip": 0.02110562, + "auxiliary_loss_mlp": 0.0038739, + "balance_loss_clip": 1.51168621, + "balance_loss_mlp": 0.34750301, + "epoch": 0.05399068089583647, + "flos": 17348481561600.0, + "grad_norm": 29.640415801413177, + "language_loss": 0.95662892, + "learning_rate": 3.993966118527175e-06, + "loss": 0.98160833, + "num_input_tokens_seen": 19198725, + "router_z_loss_clip": 5.9921875, + "router_z_loss_mlp": 0.39892578, + "step": 898, + "time_per_iteration": 2.636110782623291 + }, + { + "auxiliary_loss_clip": 0.02149158, + "auxiliary_loss_mlp": 0.00400164, + "balance_loss_clip": 1.51665676, + "balance_loss_mlp": 0.36020523, + "epoch": 0.05405080414850443, + "flos": 17486952491520.0, + "grad_norm": 251.76860332533985, + "language_loss": 1.01757252, + "learning_rate": 3.993935850918845e-06, + "loss": 1.04306579, + "num_input_tokens_seen": 19212380, + "router_z_loss_clip": 6.33203125, + "router_z_loss_mlp": 0.3996582, + "step": 899, + "time_per_iteration": 2.6251611709594727 + }, + { + "auxiliary_loss_clip": 0.02213833, + "auxiliary_loss_mlp": 0.00391818, + "balance_loss_clip": 1.52035844, + "balance_loss_mlp": 0.3556262, + "epoch": 0.054110927401172404, + "flos": 24496787278080.0, + "grad_norm": 31.46391484374107, + "language_loss": 0.8095454, + "learning_rate": 3.9939055077002665e-06, + "loss": 0.83560193, + "num_input_tokens_seen": 19232235, + "router_z_loss_clip": 6.9296875, + "router_z_loss_mlp": 0.36181641, + "step": 900, + "time_per_iteration": 2.6651110649108887 + }, + { + "auxiliary_loss_clip": 0.0221555, + "auxiliary_loss_mlp": 0.00408473, + "balance_loss_clip": 1.50631166, + "balance_loss_mlp": 0.37149426, + "epoch": 0.054171050653840376, + "flos": 22930040874240.0, + "grad_norm": 5.017049363280182, + "language_loss": 0.85126376, + "learning_rate": 3.993875088872592e-06, + "loss": 0.87750405, + "num_input_tokens_seen": 19251460, + "router_z_loss_clip": 7.08203125, + "router_z_loss_mlp": 0.36987305, + "step": 901, + "time_per_iteration": 2.6414055824279785 + }, + { + "auxiliary_loss_clip": 0.02124949, + "auxiliary_loss_mlp": 0.00400023, + "balance_loss_clip": 1.48850584, + "balance_loss_mlp": 0.36402139, + "epoch": 0.05423117390650834, + "flos": 12933192942720.0, + "grad_norm": 36.957730548523486, + "language_loss": 0.92066169, + "learning_rate": 3.9938445944369745e-06, + "loss": 0.94591141, + "num_input_tokens_seen": 19269060, + "router_z_loss_clip": 6.359375, + "router_z_loss_mlp": 0.35986328, + "step": 902, + "time_per_iteration": 2.608783006668091 + }, + { + "auxiliary_loss_clip": 0.0206733, + "auxiliary_loss_mlp": 0.00424623, + "balance_loss_clip": 1.46440661, + "balance_loss_mlp": 0.38475901, + "epoch": 0.05429129715917631, + "flos": 19901335017600.0, + "grad_norm": 3.0087089055216034, + "language_loss": 0.93807304, + "learning_rate": 3.993814024394569e-06, + "loss": 0.96299255, + "num_input_tokens_seen": 19288620, + "router_z_loss_clip": 6.0234375, + "router_z_loss_mlp": 0.39868164, + "step": 903, + "time_per_iteration": 2.6678402423858643 + }, + { + "auxiliary_loss_clip": 0.01994314, + "auxiliary_loss_mlp": 0.00442839, + "balance_loss_clip": 1.42761564, + "balance_loss_mlp": 0.40440571, + "epoch": 0.05435142041184428, + "flos": 16908611610240.0, + "grad_norm": 15.121796619501287, + "language_loss": 0.82869506, + "learning_rate": 3.993783378746537e-06, + "loss": 0.85306656, + "num_input_tokens_seen": 19306615, + "router_z_loss_clip": 5.6640625, + "router_z_loss_mlp": 0.38427734, + "step": 904, + "time_per_iteration": 2.6681394577026367 + }, + { + "auxiliary_loss_clip": 0.01966773, + "auxiliary_loss_mlp": 0.00508461, + "balance_loss_clip": 1.43124723, + "balance_loss_mlp": 0.46561676, + "epoch": 0.05441154366451225, + "flos": 23948323534080.0, + "grad_norm": 17.274527042949426, + "language_loss": 0.93922746, + "learning_rate": 3.993752657494039e-06, + "loss": 0.96397984, + "num_input_tokens_seen": 19321680, + "router_z_loss_clip": 5.35546875, + "router_z_loss_mlp": 0.42797852, + "step": 905, + "time_per_iteration": 4.118563890457153 + }, + { + "auxiliary_loss_clip": 0.01928249, + "auxiliary_loss_mlp": 0.00474403, + "balance_loss_clip": 1.41293025, + "balance_loss_mlp": 0.43573138, + "epoch": 0.05447166691718022, + "flos": 19975382904960.0, + "grad_norm": 24.178947777754704, + "language_loss": 0.81103706, + "learning_rate": 3.993721860638241e-06, + "loss": 0.83506358, + "num_input_tokens_seen": 19339760, + "router_z_loss_clip": 5.1484375, + "router_z_loss_mlp": 0.38696289, + "step": 906, + "time_per_iteration": 4.057260274887085 + }, + { + "auxiliary_loss_clip": 0.01862781, + "auxiliary_loss_mlp": 0.00488076, + "balance_loss_clip": 1.36902261, + "balance_loss_mlp": 0.44613814, + "epoch": 0.05453179016984819, + "flos": 24936513575040.0, + "grad_norm": 33.82835448654488, + "language_loss": 0.95018649, + "learning_rate": 3.993690988180309e-06, + "loss": 0.9736951, + "num_input_tokens_seen": 19359585, + "router_z_loss_clip": 4.94140625, + "router_z_loss_mlp": 0.41943359, + "step": 907, + "time_per_iteration": 2.743877410888672 + }, + { + "auxiliary_loss_clip": 0.01830778, + "auxiliary_loss_mlp": 0.00532811, + "balance_loss_clip": 1.36039281, + "balance_loss_mlp": 0.48846492, + "epoch": 0.05459191342251616, + "flos": 18115102558080.0, + "grad_norm": 5.046168128917598, + "language_loss": 0.95843905, + "learning_rate": 3.9936600401214165e-06, + "loss": 0.98207504, + "num_input_tokens_seen": 19378590, + "router_z_loss_clip": 4.70703125, + "router_z_loss_mlp": 0.44335938, + "step": 908, + "time_per_iteration": 4.0955681800842285 + }, + { + "auxiliary_loss_clip": 0.01820729, + "auxiliary_loss_mlp": 0.00586616, + "balance_loss_clip": 1.3596282, + "balance_loss_mlp": 0.54451144, + "epoch": 0.054652036675184125, + "flos": 19208295031680.0, + "grad_norm": 125.93407706020687, + "language_loss": 0.99151742, + "learning_rate": 3.9936290164627345e-06, + "loss": 1.01559091, + "num_input_tokens_seen": 19397910, + "router_z_loss_clip": 4.60546875, + "router_z_loss_mlp": 0.42114258, + "step": 909, + "time_per_iteration": 2.6084134578704834 + }, + { + "auxiliary_loss_clip": 0.01817703, + "auxiliary_loss_mlp": 0.0057909, + "balance_loss_clip": 1.35444474, + "balance_loss_mlp": 0.52966583, + "epoch": 0.0547121599278521, + "flos": 16325745615360.0, + "grad_norm": 378.68557798185253, + "language_loss": 0.80332208, + "learning_rate": 3.99359791720544e-06, + "loss": 0.82729006, + "num_input_tokens_seen": 19415950, + "router_z_loss_clip": 4.640625, + "router_z_loss_mlp": 0.49438477, + "step": 910, + "time_per_iteration": 2.5913472175598145 + }, + { + "auxiliary_loss_clip": 0.0174992, + "auxiliary_loss_mlp": 0.00587603, + "balance_loss_clip": 1.31948948, + "balance_loss_mlp": 0.54483104, + "epoch": 0.05477228318052007, + "flos": 20339014239360.0, + "grad_norm": 13.949952091580005, + "language_loss": 0.91533351, + "learning_rate": 3.993566742350714e-06, + "loss": 0.93870872, + "num_input_tokens_seen": 19435275, + "router_z_loss_clip": 4.3125, + "router_z_loss_mlp": 0.42773438, + "step": 911, + "time_per_iteration": 2.622007131576538 + }, + { + "auxiliary_loss_clip": 0.01762993, + "auxiliary_loss_mlp": 0.00634464, + "balance_loss_clip": 1.3304733, + "balance_loss_mlp": 0.58716142, + "epoch": 0.054832406433188034, + "flos": 21973092687360.0, + "grad_norm": 80.79802762624257, + "language_loss": 0.84448653, + "learning_rate": 3.993535491899736e-06, + "loss": 0.86846113, + "num_input_tokens_seen": 19452090, + "router_z_loss_clip": 4.32421875, + "router_z_loss_mlp": 0.47290039, + "step": 912, + "time_per_iteration": 2.661226987838745 + }, + { + "auxiliary_loss_clip": 0.01779401, + "auxiliary_loss_mlp": 0.00605116, + "balance_loss_clip": 1.34452891, + "balance_loss_mlp": 0.56308281, + "epoch": 0.054892529685856006, + "flos": 16398931576320.0, + "grad_norm": 2.954012261646679, + "language_loss": 0.90320653, + "learning_rate": 3.993504165853694e-06, + "loss": 0.92705178, + "num_input_tokens_seen": 19470865, + "router_z_loss_clip": 4.34765625, + "router_z_loss_mlp": 0.42016602, + "step": 913, + "time_per_iteration": 2.582120656967163 + }, + { + "auxiliary_loss_clip": 0.01761157, + "auxiliary_loss_mlp": 0.00637234, + "balance_loss_clip": 1.33852255, + "balance_loss_mlp": 0.58950281, + "epoch": 0.05495265293852397, + "flos": 23912341084800.0, + "grad_norm": 2.689229029184647, + "language_loss": 0.89246714, + "learning_rate": 3.993472764213772e-06, + "loss": 0.91645104, + "num_input_tokens_seen": 19492145, + "router_z_loss_clip": 4.2265625, + "router_z_loss_mlp": 0.47705078, + "step": 914, + "time_per_iteration": 2.684616804122925 + }, + { + "auxiliary_loss_clip": 0.01760658, + "auxiliary_loss_mlp": 0.00585628, + "balance_loss_clip": 1.33318567, + "balance_loss_mlp": 0.54199731, + "epoch": 0.055012776191191944, + "flos": 23586954756480.0, + "grad_norm": 9.079867508682248, + "language_loss": 0.98997307, + "learning_rate": 3.9934412869811655e-06, + "loss": 1.01343584, + "num_input_tokens_seen": 19511015, + "router_z_loss_clip": 4.2734375, + "router_z_loss_mlp": 0.43603516, + "step": 915, + "time_per_iteration": 2.65529727935791 + }, + { + "auxiliary_loss_clip": 0.01769287, + "auxiliary_loss_mlp": 0.00549542, + "balance_loss_clip": 1.334306, + "balance_loss_mlp": 0.5078665, + "epoch": 0.055072899443859916, + "flos": 17528501548800.0, + "grad_norm": 6.024315553424879, + "language_loss": 0.95688248, + "learning_rate": 3.993409734157064e-06, + "loss": 0.98007071, + "num_input_tokens_seen": 19529040, + "router_z_loss_clip": 4.34765625, + "router_z_loss_mlp": 0.41674805, + "step": 916, + "time_per_iteration": 2.7659976482391357 + }, + { + "auxiliary_loss_clip": 0.01794558, + "auxiliary_loss_mlp": 0.00563892, + "balance_loss_clip": 1.34648132, + "balance_loss_mlp": 0.52097714, + "epoch": 0.05513302269652788, + "flos": 21687172427520.0, + "grad_norm": 11.251816023657971, + "language_loss": 0.86911815, + "learning_rate": 3.993378105742666e-06, + "loss": 0.89270264, + "num_input_tokens_seen": 19549540, + "router_z_loss_clip": 4.48046875, + "router_z_loss_mlp": 0.42895508, + "step": 917, + "time_per_iteration": 2.6677706241607666 + }, + { + "auxiliary_loss_clip": 0.01794905, + "auxiliary_loss_mlp": 0.0054934, + "balance_loss_clip": 1.35434282, + "balance_loss_mlp": 0.50628161, + "epoch": 0.05519314594919585, + "flos": 21613340021760.0, + "grad_norm": 19.389152824504357, + "language_loss": 0.87820232, + "learning_rate": 3.9933464017391705e-06, + "loss": 0.90164477, + "num_input_tokens_seen": 19567570, + "router_z_loss_clip": 4.40625, + "router_z_loss_mlp": 0.42993164, + "step": 918, + "time_per_iteration": 2.6696524620056152 + }, + { + "auxiliary_loss_clip": 0.01813262, + "auxiliary_loss_mlp": 0.00548841, + "balance_loss_clip": 1.35193563, + "balance_loss_mlp": 0.50635463, + "epoch": 0.05525326920186382, + "flos": 21798567480960.0, + "grad_norm": 3.101437172717932, + "language_loss": 0.94457006, + "learning_rate": 3.99331462214778e-06, + "loss": 0.96819115, + "num_input_tokens_seen": 19585330, + "router_z_loss_clip": 4.6171875, + "router_z_loss_mlp": 0.42504883, + "step": 919, + "time_per_iteration": 2.7370235919952393 + }, + { + "auxiliary_loss_clip": 0.01787307, + "auxiliary_loss_mlp": 0.0048915, + "balance_loss_clip": 1.34047377, + "balance_loss_mlp": 0.4493106, + "epoch": 0.05531339245453179, + "flos": 28439635288320.0, + "grad_norm": 15.251987036627693, + "language_loss": 0.95477593, + "learning_rate": 3.993282766969699e-06, + "loss": 0.97754055, + "num_input_tokens_seen": 19604970, + "router_z_loss_clip": 4.4609375, + "router_z_loss_mlp": 0.39892578, + "step": 920, + "time_per_iteration": 2.7196731567382812 + }, + { + "auxiliary_loss_clip": 0.01793841, + "auxiliary_loss_mlp": 0.00475209, + "balance_loss_clip": 1.34983087, + "balance_loss_mlp": 0.43596563, + "epoch": 0.05537351570719976, + "flos": 37375143131520.0, + "grad_norm": 3.41216067938645, + "language_loss": 0.73470169, + "learning_rate": 3.993250836206136e-06, + "loss": 0.75739223, + "num_input_tokens_seen": 19626235, + "router_z_loss_clip": 4.44140625, + "router_z_loss_mlp": 0.39233398, + "step": 921, + "time_per_iteration": 2.8482346534729004 + }, + { + "auxiliary_loss_clip": 0.01827091, + "auxiliary_loss_mlp": 0.00518619, + "balance_loss_clip": 1.37573767, + "balance_loss_mlp": 0.4750365, + "epoch": 0.05543363895986773, + "flos": 20084479488000.0, + "grad_norm": 8.503238966940122, + "language_loss": 0.79605162, + "learning_rate": 3.993218829858301e-06, + "loss": 0.81950867, + "num_input_tokens_seen": 19644305, + "router_z_loss_clip": 4.51171875, + "router_z_loss_mlp": 0.43530273, + "step": 922, + "time_per_iteration": 2.645453929901123 + }, + { + "auxiliary_loss_clip": 0.0178505, + "auxiliary_loss_mlp": 0.00494322, + "balance_loss_clip": 1.35082281, + "balance_loss_mlp": 0.45269442, + "epoch": 0.0554937622125357, + "flos": 24533200690560.0, + "grad_norm": 10.074786988647633, + "language_loss": 0.90350109, + "learning_rate": 3.993186747927408e-06, + "loss": 0.9262948, + "num_input_tokens_seen": 19662130, + "router_z_loss_clip": 4.34375, + "router_z_loss_mlp": 0.41625977, + "step": 923, + "time_per_iteration": 2.670626640319824 + }, + { + "auxiliary_loss_clip": 0.01773417, + "auxiliary_loss_mlp": 0.00451005, + "balance_loss_clip": 1.33897734, + "balance_loss_mlp": 0.40985391, + "epoch": 0.055553885465203665, + "flos": 14320063013760.0, + "grad_norm": 40.94003758638627, + "language_loss": 0.85532564, + "learning_rate": 3.993154590414675e-06, + "loss": 0.87756985, + "num_input_tokens_seen": 19680715, + "router_z_loss_clip": 4.3359375, + "router_z_loss_mlp": 0.41137695, + "step": 924, + "time_per_iteration": 2.612720489501953 + }, + { + "auxiliary_loss_clip": 0.01762343, + "auxiliary_loss_mlp": 0.00463437, + "balance_loss_clip": 1.34371734, + "balance_loss_mlp": 0.42259562, + "epoch": 0.05561400871787164, + "flos": 27381132374400.0, + "grad_norm": 5.4109943734867745, + "language_loss": 1.07817507, + "learning_rate": 3.993122357321319e-06, + "loss": 1.10043287, + "num_input_tokens_seen": 19700535, + "router_z_loss_clip": 4.1875, + "router_z_loss_mlp": 0.40795898, + "step": 925, + "time_per_iteration": 2.7028605937957764 + }, + { + "auxiliary_loss_clip": 0.0175838, + "auxiliary_loss_mlp": 0.00461304, + "balance_loss_clip": 1.3336395, + "balance_loss_mlp": 0.42153588, + "epoch": 0.05567413197053961, + "flos": 23221096778880.0, + "grad_norm": 101.2262599651773, + "language_loss": 0.87683904, + "learning_rate": 3.993090048648564e-06, + "loss": 0.89903587, + "num_input_tokens_seen": 19718825, + "router_z_loss_clip": 4.25390625, + "router_z_loss_mlp": 0.39770508, + "step": 926, + "time_per_iteration": 2.6274898052215576 + }, + { + "auxiliary_loss_clip": 0.017823, + "auxiliary_loss_mlp": 0.00513003, + "balance_loss_clip": 1.35315883, + "balance_loss_mlp": 0.46877682, + "epoch": 0.055734255223207574, + "flos": 25264952559360.0, + "grad_norm": 18.357884434419617, + "language_loss": 0.84876317, + "learning_rate": 3.993057664397634e-06, + "loss": 0.87171614, + "num_input_tokens_seen": 19739080, + "router_z_loss_clip": 4.29296875, + "router_z_loss_mlp": 0.44238281, + "step": 927, + "time_per_iteration": 2.7139382362365723 + }, + { + "auxiliary_loss_clip": 0.0153901, + "auxiliary_loss_mlp": 0.00597545, + "balance_loss_clip": 1.24272084, + "balance_loss_mlp": 0.56473869, + "epoch": 0.055794378475875546, + "flos": 66503116702080.0, + "grad_norm": 94.6342139542409, + "language_loss": 0.59725136, + "learning_rate": 3.9930252045697585e-06, + "loss": 0.61861688, + "num_input_tokens_seen": 19802960, + "router_z_loss_clip": 2.96875, + "router_z_loss_mlp": 0.328125, + "step": 928, + "time_per_iteration": 3.1806087493896484 + }, + { + "auxiliary_loss_clip": 0.01740478, + "auxiliary_loss_mlp": 0.00506131, + "balance_loss_clip": 1.33095312, + "balance_loss_mlp": 0.46519473, + "epoch": 0.05585450172854351, + "flos": 25337635729920.0, + "grad_norm": 9.22723760214616, + "language_loss": 1.03655946, + "learning_rate": 3.992992669166168e-06, + "loss": 1.05902553, + "num_input_tokens_seen": 19822765, + "router_z_loss_clip": 4.08984375, + "router_z_loss_mlp": 0.40942383, + "step": 929, + "time_per_iteration": 2.7041940689086914 + }, + { + "auxiliary_loss_clip": 0.01738628, + "auxiliary_loss_mlp": 0.00496184, + "balance_loss_clip": 1.33178711, + "balance_loss_mlp": 0.45384097, + "epoch": 0.05591462498121148, + "flos": 33911738881920.0, + "grad_norm": 4.1632716818384985, + "language_loss": 0.81096864, + "learning_rate": 3.992960058188094e-06, + "loss": 0.83331674, + "num_input_tokens_seen": 19843590, + "router_z_loss_clip": 4.0703125, + "router_z_loss_mlp": 0.42333984, + "step": 930, + "time_per_iteration": 2.8635597229003906 + }, + { + "auxiliary_loss_clip": 0.01727614, + "auxiliary_loss_mlp": 0.00441801, + "balance_loss_clip": 1.31811261, + "balance_loss_mlp": 0.40172291, + "epoch": 0.055974748233879455, + "flos": 17930880679680.0, + "grad_norm": 24.608419918063564, + "language_loss": 0.93112612, + "learning_rate": 3.992927371636776e-06, + "loss": 0.9528203, + "num_input_tokens_seen": 19860230, + "router_z_loss_clip": 4.09765625, + "router_z_loss_mlp": 0.40112305, + "step": 931, + "time_per_iteration": 2.596482515335083 + }, + { + "auxiliary_loss_clip": 0.01717981, + "auxiliary_loss_mlp": 0.00484677, + "balance_loss_clip": 1.31293285, + "balance_loss_mlp": 0.441333, + "epoch": 0.05603487148654742, + "flos": 24021976371840.0, + "grad_norm": 25.5514173118681, + "language_loss": 0.90797722, + "learning_rate": 3.9928946095134525e-06, + "loss": 0.93000388, + "num_input_tokens_seen": 19880795, + "router_z_loss_clip": 4.046875, + "router_z_loss_mlp": 0.43359375, + "step": 932, + "time_per_iteration": 2.7318124771118164 + }, + { + "auxiliary_loss_clip": 0.01708789, + "auxiliary_loss_mlp": 0.00490028, + "balance_loss_clip": 1.31256866, + "balance_loss_mlp": 0.4480184, + "epoch": 0.05609499473921539, + "flos": 17307758517120.0, + "grad_norm": 6.287785884327479, + "language_loss": 0.83031631, + "learning_rate": 3.992861771819365e-06, + "loss": 0.85230452, + "num_input_tokens_seen": 19897960, + "router_z_loss_clip": 3.95898438, + "router_z_loss_mlp": 0.42041016, + "step": 933, + "time_per_iteration": 2.5961861610412598 + }, + { + "auxiliary_loss_clip": 0.01705374, + "auxiliary_loss_mlp": 0.0043135, + "balance_loss_clip": 1.30995607, + "balance_loss_mlp": 0.39332202, + "epoch": 0.05615511799188336, + "flos": 20994742972800.0, + "grad_norm": 4.21534535401605, + "language_loss": 0.9413777, + "learning_rate": 3.99282885855576e-06, + "loss": 0.96274495, + "num_input_tokens_seen": 19913315, + "router_z_loss_clip": 3.95703125, + "router_z_loss_mlp": 0.38061523, + "step": 934, + "time_per_iteration": 2.6803839206695557 + }, + { + "auxiliary_loss_clip": 0.01692668, + "auxiliary_loss_mlp": 0.00399836, + "balance_loss_clip": 1.30016255, + "balance_loss_mlp": 0.36290526, + "epoch": 0.05621524124455133, + "flos": 17273535834240.0, + "grad_norm": 107.04591177556884, + "language_loss": 0.87611997, + "learning_rate": 3.992795869723885e-06, + "loss": 0.89704496, + "num_input_tokens_seen": 19928790, + "router_z_loss_clip": 3.93164062, + "router_z_loss_mlp": 0.36962891, + "step": 935, + "time_per_iteration": 2.605409860610962 + }, + { + "auxiliary_loss_clip": 0.0150573, + "auxiliary_loss_mlp": 0.00458334, + "balance_loss_clip": 1.19996762, + "balance_loss_mlp": 0.42724457, + "epoch": 0.0562753644972193, + "flos": 58719370458240.0, + "grad_norm": 0.8607848108686647, + "language_loss": 0.6916219, + "learning_rate": 3.99276280532499e-06, + "loss": 0.71126258, + "num_input_tokens_seen": 19988785, + "router_z_loss_clip": 3.0625, + "router_z_loss_mlp": 0.31054688, + "step": 936, + "time_per_iteration": 3.047752618789673 + }, + { + "auxiliary_loss_clip": 0.01718602, + "auxiliary_loss_mlp": 0.00502104, + "balance_loss_clip": 1.31997311, + "balance_loss_mlp": 0.46073857, + "epoch": 0.05633548774988727, + "flos": 17457039440640.0, + "grad_norm": 32.678350370678615, + "language_loss": 0.85398382, + "learning_rate": 3.992729665360331e-06, + "loss": 0.8761909, + "num_input_tokens_seen": 20007685, + "router_z_loss_clip": 3.98632812, + "router_z_loss_mlp": 0.4140625, + "step": 937, + "time_per_iteration": 2.619739294052124 + }, + { + "auxiliary_loss_clip": 0.01490642, + "auxiliary_loss_mlp": 0.00270365, + "balance_loss_clip": 1.20579696, + "balance_loss_mlp": 0.24385297, + "epoch": 0.05639561100255524, + "flos": 70654928083200.0, + "grad_norm": 0.8833337043326056, + "language_loss": 0.64388412, + "learning_rate": 3.992696449831162e-06, + "loss": 0.6614942, + "num_input_tokens_seen": 20072750, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 0.265625, + "step": 938, + "time_per_iteration": 3.084285259246826 + }, + { + "auxiliary_loss_clip": 0.01749834, + "auxiliary_loss_mlp": 0.00514616, + "balance_loss_clip": 1.32865465, + "balance_loss_mlp": 0.47081828, + "epoch": 0.056455734255223204, + "flos": 20485996692480.0, + "grad_norm": 93.44651891046269, + "language_loss": 0.88869172, + "learning_rate": 3.992663158738745e-06, + "loss": 0.91133618, + "num_input_tokens_seen": 20089070, + "router_z_loss_clip": 4.2109375, + "router_z_loss_mlp": 0.43774414, + "step": 939, + "time_per_iteration": 2.668912410736084 + }, + { + "auxiliary_loss_clip": 0.01764555, + "auxiliary_loss_mlp": 0.00505843, + "balance_loss_clip": 1.34084237, + "balance_loss_mlp": 0.46211731, + "epoch": 0.056515857507891176, + "flos": 22053569109120.0, + "grad_norm": 5.949069665058613, + "language_loss": 0.80140364, + "learning_rate": 3.992629792084341e-06, + "loss": 0.82410765, + "num_input_tokens_seen": 20108790, + "router_z_loss_clip": 4.234375, + "router_z_loss_mlp": 0.43701172, + "step": 940, + "time_per_iteration": 2.681264638900757 + }, + { + "auxiliary_loss_clip": 0.01800432, + "auxiliary_loss_mlp": 0.0053689, + "balance_loss_clip": 1.36484575, + "balance_loss_mlp": 0.49426088, + "epoch": 0.05657598076055915, + "flos": 24025316336640.0, + "grad_norm": 14.820073274207592, + "language_loss": 0.78170013, + "learning_rate": 3.992596349869216e-06, + "loss": 0.80507338, + "num_input_tokens_seen": 20128455, + "router_z_loss_clip": 4.35546875, + "router_z_loss_mlp": 0.42626953, + "step": 941, + "time_per_iteration": 2.6951587200164795 + }, + { + "auxiliary_loss_clip": 0.01815152, + "auxiliary_loss_mlp": 0.0054092, + "balance_loss_clip": 1.36693358, + "balance_loss_mlp": 0.49748033, + "epoch": 0.05663610401322711, + "flos": 20480609652480.0, + "grad_norm": 362.0262727255131, + "language_loss": 0.87207568, + "learning_rate": 3.992562832094637e-06, + "loss": 0.89563638, + "num_input_tokens_seen": 20145775, + "router_z_loss_clip": 4.48046875, + "router_z_loss_mlp": 0.43457031, + "step": 942, + "time_per_iteration": 2.6325583457946777 + }, + { + "auxiliary_loss_clip": 0.01847493, + "auxiliary_loss_mlp": 0.00565238, + "balance_loss_clip": 1.36444974, + "balance_loss_mlp": 0.52141637, + "epoch": 0.056696227265895086, + "flos": 21069042255360.0, + "grad_norm": 53.36786643870972, + "language_loss": 0.96522897, + "learning_rate": 3.9925292387618755e-06, + "loss": 0.98935628, + "num_input_tokens_seen": 20164315, + "router_z_loss_clip": 4.828125, + "router_z_loss_mlp": 0.43823242, + "step": 943, + "time_per_iteration": 2.750648021697998 + }, + { + "auxiliary_loss_clip": 0.01907178, + "auxiliary_loss_mlp": 0.00547553, + "balance_loss_clip": 1.39377451, + "balance_loss_mlp": 0.50525713, + "epoch": 0.05675635051856306, + "flos": 17821317219840.0, + "grad_norm": 26.747850149197312, + "language_loss": 0.84224367, + "learning_rate": 3.992495569872206e-06, + "loss": 0.86679101, + "num_input_tokens_seen": 20182760, + "router_z_loss_clip": 5.12890625, + "router_z_loss_mlp": 0.4230957, + "step": 944, + "time_per_iteration": 2.717059373855591 + }, + { + "auxiliary_loss_clip": 0.01940047, + "auxiliary_loss_mlp": 0.00512523, + "balance_loss_clip": 1.3940227, + "balance_loss_mlp": 0.47318375, + "epoch": 0.05681647377123102, + "flos": 23114945111040.0, + "grad_norm": 1.9456023418220305, + "language_loss": 0.86061525, + "learning_rate": 3.992461825426906e-06, + "loss": 0.88514102, + "num_input_tokens_seen": 20203830, + "router_z_loss_clip": 5.46484375, + "router_z_loss_mlp": 0.39306641, + "step": 945, + "time_per_iteration": 2.6823153495788574 + }, + { + "auxiliary_loss_clip": 0.01955139, + "auxiliary_loss_mlp": 0.00527639, + "balance_loss_clip": 1.40367174, + "balance_loss_mlp": 0.48694074, + "epoch": 0.056876597023898995, + "flos": 16070528505600.0, + "grad_norm": 11.26918952530046, + "language_loss": 0.91534138, + "learning_rate": 3.992428005427252e-06, + "loss": 0.94016922, + "num_input_tokens_seen": 20220365, + "router_z_loss_clip": 5.515625, + "router_z_loss_mlp": 0.40673828, + "step": 946, + "time_per_iteration": 2.5735859870910645 + }, + { + "auxiliary_loss_clip": 0.01980308, + "auxiliary_loss_mlp": 0.0060074, + "balance_loss_clip": 1.40488029, + "balance_loss_mlp": 0.55305636, + "epoch": 0.05693672027656696, + "flos": 16835641130880.0, + "grad_norm": 35.46331054096292, + "language_loss": 0.86846191, + "learning_rate": 3.992394109874529e-06, + "loss": 0.89427245, + "num_input_tokens_seen": 20238640, + "router_z_loss_clip": 5.75, + "router_z_loss_mlp": 0.47705078, + "step": 947, + "time_per_iteration": 4.076129674911499 + }, + { + "auxiliary_loss_clip": 0.01981743, + "auxiliary_loss_mlp": 0.00580601, + "balance_loss_clip": 1.4105978, + "balance_loss_mlp": 0.53503931, + "epoch": 0.05699684352923493, + "flos": 21389113370880.0, + "grad_norm": 11.58105383219656, + "language_loss": 0.93177497, + "learning_rate": 3.9923601387700225e-06, + "loss": 0.95739841, + "num_input_tokens_seen": 20251025, + "router_z_loss_clip": 5.71484375, + "router_z_loss_mlp": 0.45581055, + "step": 948, + "time_per_iteration": 4.054994106292725 + }, + { + "auxiliary_loss_clip": 0.01983255, + "auxiliary_loss_mlp": 0.00587047, + "balance_loss_clip": 1.40292501, + "balance_loss_mlp": 0.5408653, + "epoch": 0.057056966781902904, + "flos": 15560309767680.0, + "grad_norm": 404.44655829120086, + "language_loss": 0.9347614, + "learning_rate": 3.992326092115019e-06, + "loss": 0.96046448, + "num_input_tokens_seen": 20269775, + "router_z_loss_clip": 5.80078125, + "router_z_loss_mlp": 0.46166992, + "step": 949, + "time_per_iteration": 2.6716415882110596 + }, + { + "auxiliary_loss_clip": 0.01940038, + "auxiliary_loss_mlp": 0.00524098, + "balance_loss_clip": 1.3876574, + "balance_loss_mlp": 0.48647529, + "epoch": 0.05711709003457087, + "flos": 19937856170880.0, + "grad_norm": 3.980106878883666, + "language_loss": 0.84345758, + "learning_rate": 3.992291969910811e-06, + "loss": 0.86809897, + "num_input_tokens_seen": 20287715, + "router_z_loss_clip": 5.52734375, + "router_z_loss_mlp": 0.3762207, + "step": 950, + "time_per_iteration": 4.114667177200317 + }, + { + "auxiliary_loss_clip": 0.0189358, + "auxiliary_loss_mlp": 0.00569942, + "balance_loss_clip": 1.37363362, + "balance_loss_mlp": 0.52852905, + "epoch": 0.05717721328723884, + "flos": 30332701774080.0, + "grad_norm": 33.808354044603604, + "language_loss": 0.89216077, + "learning_rate": 3.992257772158691e-06, + "loss": 0.91679597, + "num_input_tokens_seen": 20307070, + "router_z_loss_clip": 5.19921875, + "router_z_loss_mlp": 0.41381836, + "step": 951, + "time_per_iteration": 2.7260332107543945 + }, + { + "auxiliary_loss_clip": 0.01855162, + "auxiliary_loss_mlp": 0.00603096, + "balance_loss_clip": 1.35999322, + "balance_loss_mlp": 0.55460155, + "epoch": 0.05723733653990681, + "flos": 23654358627840.0, + "grad_norm": 20.368231199540404, + "language_loss": 0.93933797, + "learning_rate": 3.992223498859958e-06, + "loss": 0.96392053, + "num_input_tokens_seen": 20324945, + "router_z_loss_clip": 4.953125, + "router_z_loss_mlp": 0.48510742, + "step": 952, + "time_per_iteration": 2.6228275299072266 + }, + { + "auxiliary_loss_clip": 0.01818815, + "auxiliary_loss_mlp": 0.00616961, + "balance_loss_clip": 1.34017062, + "balance_loss_mlp": 0.5678463, + "epoch": 0.05729745979257478, + "flos": 22055759838720.0, + "grad_norm": 63.427552374841255, + "language_loss": 0.86905164, + "learning_rate": 3.9921891500159084e-06, + "loss": 0.89340943, + "num_input_tokens_seen": 20346135, + "router_z_loss_clip": 4.78125, + "router_z_loss_mlp": 0.49072266, + "step": 953, + "time_per_iteration": 2.664486885070801 + }, + { + "auxiliary_loss_clip": 0.01801376, + "auxiliary_loss_mlp": 0.00566432, + "balance_loss_clip": 1.33606076, + "balance_loss_mlp": 0.52220577, + "epoch": 0.05735758304524275, + "flos": 19604353368960.0, + "grad_norm": 18.7892456439907, + "language_loss": 0.94109023, + "learning_rate": 3.992154725627848e-06, + "loss": 0.96476829, + "num_input_tokens_seen": 20364450, + "router_z_loss_clip": 4.65234375, + "router_z_loss_mlp": 0.44262695, + "step": 954, + "time_per_iteration": 2.6121699810028076 + }, + { + "auxiliary_loss_clip": 0.01803382, + "auxiliary_loss_mlp": 0.0057832, + "balance_loss_clip": 1.34093094, + "balance_loss_mlp": 0.53573799, + "epoch": 0.057417706297910716, + "flos": 19099018880640.0, + "grad_norm": 4.252796450519734, + "language_loss": 0.98213321, + "learning_rate": 3.9921202256970804e-06, + "loss": 1.00595021, + "num_input_tokens_seen": 20383500, + "router_z_loss_clip": 4.62109375, + "router_z_loss_mlp": 0.42578125, + "step": 955, + "time_per_iteration": 2.6320037841796875 + }, + { + "auxiliary_loss_clip": 0.01771463, + "auxiliary_loss_mlp": 0.00559407, + "balance_loss_clip": 1.33174849, + "balance_loss_mlp": 0.51677746, + "epoch": 0.05747782955057869, + "flos": 16654507822080.0, + "grad_norm": 3.2785943300703804, + "language_loss": 0.95770657, + "learning_rate": 3.992085650224914e-06, + "loss": 0.98101532, + "num_input_tokens_seen": 20400295, + "router_z_loss_clip": 4.3984375, + "router_z_loss_mlp": 0.42651367, + "step": 956, + "time_per_iteration": 2.5881903171539307 + }, + { + "auxiliary_loss_clip": 0.01781379, + "auxiliary_loss_mlp": 0.00571029, + "balance_loss_clip": 1.34326708, + "balance_loss_mlp": 0.52987814, + "epoch": 0.05753795280324665, + "flos": 14502058248960.0, + "grad_norm": 2.4355064332758043, + "language_loss": 0.8229996, + "learning_rate": 3.99205099921266e-06, + "loss": 0.8465237, + "num_input_tokens_seen": 20419085, + "router_z_loss_clip": 4.38671875, + "router_z_loss_mlp": 0.41162109, + "step": 957, + "time_per_iteration": 2.6059138774871826 + }, + { + "auxiliary_loss_clip": 0.01737133, + "auxiliary_loss_mlp": 0.0056011, + "balance_loss_clip": 1.31161427, + "balance_loss_mlp": 0.51724207, + "epoch": 0.057598076055914625, + "flos": 18076318848000.0, + "grad_norm": 16.027683514065775, + "language_loss": 0.85971862, + "learning_rate": 3.992016272661633e-06, + "loss": 0.88269103, + "num_input_tokens_seen": 20437465, + "router_z_loss_clip": 4.25390625, + "router_z_loss_mlp": 0.42871094, + "step": 958, + "time_per_iteration": 2.602760076522827 + }, + { + "auxiliary_loss_clip": 0.01721829, + "auxiliary_loss_mlp": 0.00526263, + "balance_loss_clip": 1.30836928, + "balance_loss_mlp": 0.48787761, + "epoch": 0.0576581993085826, + "flos": 22124600254080.0, + "grad_norm": 2.7762420468421043, + "language_loss": 0.94987857, + "learning_rate": 3.99198147057315e-06, + "loss": 0.97235948, + "num_input_tokens_seen": 20456235, + "router_z_loss_clip": 4.1328125, + "router_z_loss_mlp": 0.3840332, + "step": 959, + "time_per_iteration": 2.6261825561523438 + }, + { + "auxiliary_loss_clip": 0.01708294, + "auxiliary_loss_mlp": 0.00529597, + "balance_loss_clip": 1.30080438, + "balance_loss_mlp": 0.49147382, + "epoch": 0.05771832256125056, + "flos": 33181746779520.0, + "grad_norm": 6.599927021264802, + "language_loss": 0.86910421, + "learning_rate": 3.991946592948529e-06, + "loss": 0.89148307, + "num_input_tokens_seen": 20476825, + "router_z_loss_clip": 4.07421875, + "router_z_loss_mlp": 0.38110352, + "step": 960, + "time_per_iteration": 2.7559149265289307 + }, + { + "auxiliary_loss_clip": 0.01662059, + "auxiliary_loss_mlp": 0.0053954, + "balance_loss_clip": 1.26772141, + "balance_loss_mlp": 0.49686259, + "epoch": 0.057778445813918534, + "flos": 24170143973760.0, + "grad_norm": 11.22024783925195, + "language_loss": 1.00127459, + "learning_rate": 3.991911639789094e-06, + "loss": 1.02329051, + "num_input_tokens_seen": 20496965, + "router_z_loss_clip": 3.94335938, + "router_z_loss_mlp": 0.42651367, + "step": 961, + "time_per_iteration": 2.670612096786499 + }, + { + "auxiliary_loss_clip": 0.01656915, + "auxiliary_loss_mlp": 0.00532961, + "balance_loss_clip": 1.26074672, + "balance_loss_mlp": 0.49033153, + "epoch": 0.0578385690665865, + "flos": 29643037666560.0, + "grad_norm": 15.370856970853373, + "language_loss": 0.78111768, + "learning_rate": 3.991876611096169e-06, + "loss": 0.80301648, + "num_input_tokens_seen": 20518035, + "router_z_loss_clip": 3.96289062, + "router_z_loss_mlp": 0.42602539, + "step": 962, + "time_per_iteration": 2.74819016456604 + }, + { + "auxiliary_loss_clip": 0.01650034, + "auxiliary_loss_mlp": 0.00500616, + "balance_loss_clip": 1.24973655, + "balance_loss_mlp": 0.46170652, + "epoch": 0.05789869231925447, + "flos": 20885430908160.0, + "grad_norm": 6.66375047375855, + "language_loss": 0.95622104, + "learning_rate": 3.991841506871084e-06, + "loss": 0.97772753, + "num_input_tokens_seen": 20534740, + "router_z_loss_clip": 4.00585938, + "router_z_loss_mlp": 0.38891602, + "step": 963, + "time_per_iteration": 2.6615824699401855 + }, + { + "auxiliary_loss_clip": 0.01695989, + "auxiliary_loss_mlp": 0.00518673, + "balance_loss_clip": 1.2750535, + "balance_loss_mlp": 0.47921515, + "epoch": 0.057958815571922444, + "flos": 26031106679040.0, + "grad_norm": 26.12666491258496, + "language_loss": 0.95553809, + "learning_rate": 3.99180632711517e-06, + "loss": 0.97768474, + "num_input_tokens_seen": 20553485, + "router_z_loss_clip": 4.20898438, + "router_z_loss_mlp": 0.39477539, + "step": 964, + "time_per_iteration": 2.769488573074341 + }, + { + "auxiliary_loss_clip": 0.01697784, + "auxiliary_loss_mlp": 0.00475586, + "balance_loss_clip": 1.2676332, + "balance_loss_mlp": 0.43793941, + "epoch": 0.05801893882459041, + "flos": 18077683564800.0, + "grad_norm": 10.22613285020329, + "language_loss": 0.86878109, + "learning_rate": 3.99177107182976e-06, + "loss": 0.89051479, + "num_input_tokens_seen": 20572155, + "router_z_loss_clip": 4.30664062, + "router_z_loss_mlp": 0.37646484, + "step": 965, + "time_per_iteration": 2.648366689682007 + }, + { + "auxiliary_loss_clip": 0.01722978, + "auxiliary_loss_mlp": 0.00475942, + "balance_loss_clip": 1.27746153, + "balance_loss_mlp": 0.43629289, + "epoch": 0.05807906207725838, + "flos": 17748885444480.0, + "grad_norm": 6.717538864036319, + "language_loss": 0.91474187, + "learning_rate": 3.99173574101619e-06, + "loss": 0.9367311, + "num_input_tokens_seen": 20590395, + "router_z_loss_clip": 4.4609375, + "router_z_loss_mlp": 0.39648438, + "step": 966, + "time_per_iteration": 2.6468584537506104 + }, + { + "auxiliary_loss_clip": 0.01767904, + "auxiliary_loss_mlp": 0.00444916, + "balance_loss_clip": 1.3063904, + "balance_loss_mlp": 0.40784156, + "epoch": 0.058139185329926346, + "flos": 18040372312320.0, + "grad_norm": 2.276564976747255, + "language_loss": 0.85453457, + "learning_rate": 3.9917003346758035e-06, + "loss": 0.87666285, + "num_input_tokens_seen": 20608435, + "router_z_loss_clip": 4.61328125, + "router_z_loss_mlp": 0.37060547, + "step": 967, + "time_per_iteration": 2.631924867630005 + }, + { + "auxiliary_loss_clip": 0.01416589, + "auxiliary_loss_mlp": 0.00183514, + "balance_loss_clip": 1.14752614, + "balance_loss_mlp": 0.15356825, + "epoch": 0.05819930858259432, + "flos": 62363297485440.0, + "grad_norm": 1.0172049194584312, + "language_loss": 0.57393897, + "learning_rate": 3.991664852809939e-06, + "loss": 0.58994001, + "num_input_tokens_seen": 20668575, + "router_z_loss_clip": 2.6875, + "router_z_loss_mlp": 0.29882812, + "step": 968, + "time_per_iteration": 3.1238908767700195 + }, + { + "auxiliary_loss_clip": 0.01737193, + "auxiliary_loss_mlp": 0.00442892, + "balance_loss_clip": 1.28753257, + "balance_loss_mlp": 0.40333885, + "epoch": 0.05825943183526229, + "flos": 19135360465920.0, + "grad_norm": 84.4478735663587, + "language_loss": 0.89742708, + "learning_rate": 3.991629295419945e-06, + "loss": 0.91922796, + "num_input_tokens_seen": 20687355, + "router_z_loss_clip": 4.4921875, + "router_z_loss_mlp": 0.39575195, + "step": 969, + "time_per_iteration": 2.628114938735962 + }, + { + "auxiliary_loss_clip": 0.01760266, + "auxiliary_loss_mlp": 0.00419313, + "balance_loss_clip": 1.29520631, + "balance_loss_mlp": 0.37842435, + "epoch": 0.058319555087930255, + "flos": 29022465369600.0, + "grad_norm": 49.0195720245925, + "language_loss": 0.85831594, + "learning_rate": 3.991593662507167e-06, + "loss": 0.88011169, + "num_input_tokens_seen": 20705710, + "router_z_loss_clip": 4.65234375, + "router_z_loss_mlp": 0.40893555, + "step": 970, + "time_per_iteration": 2.6714987754821777 + }, + { + "auxiliary_loss_clip": 0.01749526, + "auxiliary_loss_mlp": 0.00430922, + "balance_loss_clip": 1.29003549, + "balance_loss_mlp": 0.39015213, + "epoch": 0.05837967834059823, + "flos": 18879999701760.0, + "grad_norm": 31.163886959858, + "language_loss": 1.02563953, + "learning_rate": 3.991557954072958e-06, + "loss": 1.04744399, + "num_input_tokens_seen": 20722405, + "router_z_loss_clip": 4.59765625, + "router_z_loss_mlp": 0.40771484, + "step": 971, + "time_per_iteration": 2.603003978729248 + }, + { + "auxiliary_loss_clip": 0.01708068, + "auxiliary_loss_mlp": 0.00412253, + "balance_loss_clip": 1.27216148, + "balance_loss_mlp": 0.37343812, + "epoch": 0.05843980159326619, + "flos": 25703062744320.0, + "grad_norm": 9.548571456596441, + "language_loss": 0.93512803, + "learning_rate": 3.991522170118673e-06, + "loss": 0.95633131, + "num_input_tokens_seen": 20741480, + "router_z_loss_clip": 4.35546875, + "router_z_loss_mlp": 0.38793945, + "step": 972, + "time_per_iteration": 2.699547529220581 + }, + { + "auxiliary_loss_clip": 0.01816696, + "auxiliary_loss_mlp": 0.00406327, + "balance_loss_clip": 1.32280707, + "balance_loss_mlp": 0.36860967, + "epoch": 0.058499924845934165, + "flos": 25552129795200.0, + "grad_norm": 58.4696183899547, + "language_loss": 0.95317572, + "learning_rate": 3.991486310645667e-06, + "loss": 0.97540593, + "num_input_tokens_seen": 20759685, + "router_z_loss_clip": 4.94140625, + "router_z_loss_mlp": 0.37719727, + "step": 973, + "time_per_iteration": 2.6980061531066895 + }, + { + "auxiliary_loss_clip": 0.01772396, + "auxiliary_loss_mlp": 0.00413902, + "balance_loss_clip": 1.30503571, + "balance_loss_mlp": 0.3743009, + "epoch": 0.05856004809860214, + "flos": 16436171001600.0, + "grad_norm": 10.840408437692407, + "language_loss": 0.82452202, + "learning_rate": 3.991450375655301e-06, + "loss": 0.846385, + "num_input_tokens_seen": 20778180, + "router_z_loss_clip": 4.66796875, + "router_z_loss_mlp": 0.39575195, + "step": 974, + "time_per_iteration": 2.5982847213745117 + }, + { + "auxiliary_loss_clip": 0.01781646, + "auxiliary_loss_mlp": 0.0039081, + "balance_loss_clip": 1.3084681, + "balance_loss_mlp": 0.3534497, + "epoch": 0.0586201713512701, + "flos": 39458824116480.0, + "grad_norm": 17.991482705299635, + "language_loss": 0.82773483, + "learning_rate": 3.991414365148936e-06, + "loss": 0.84945941, + "num_input_tokens_seen": 20802705, + "router_z_loss_clip": 4.73828125, + "router_z_loss_mlp": 0.37353516, + "step": 975, + "time_per_iteration": 2.8071060180664062 + }, + { + "auxiliary_loss_clip": 0.0178021, + "auxiliary_loss_mlp": 0.00444642, + "balance_loss_clip": 1.31565917, + "balance_loss_mlp": 0.40110654, + "epoch": 0.058680294603938074, + "flos": 23365170230400.0, + "grad_norm": 5.639986831676729, + "language_loss": 0.8510282, + "learning_rate": 3.99137827912794e-06, + "loss": 0.87327671, + "num_input_tokens_seen": 20822540, + "router_z_loss_clip": 4.6484375, + "router_z_loss_mlp": 0.43530273, + "step": 976, + "time_per_iteration": 2.6275224685668945 + }, + { + "auxiliary_loss_clip": 0.01791208, + "auxiliary_loss_mlp": 0.00419549, + "balance_loss_clip": 1.31809402, + "balance_loss_mlp": 0.38104409, + "epoch": 0.05874041785660604, + "flos": 32232017226240.0, + "grad_norm": 2.468698548563877, + "language_loss": 0.92671204, + "learning_rate": 3.991342117593679e-06, + "loss": 0.94881964, + "num_input_tokens_seen": 20844175, + "router_z_loss_clip": 4.73046875, + "router_z_loss_mlp": 0.38525391, + "step": 977, + "time_per_iteration": 2.7718665599823 + }, + { + "auxiliary_loss_clip": 0.01753582, + "auxiliary_loss_mlp": 0.00412122, + "balance_loss_clip": 1.30041885, + "balance_loss_mlp": 0.37163812, + "epoch": 0.05880054110927401, + "flos": 22310043194880.0, + "grad_norm": 14.809127019196689, + "language_loss": 0.8682735, + "learning_rate": 3.991305880547527e-06, + "loss": 0.88993055, + "num_input_tokens_seen": 20864730, + "router_z_loss_clip": 4.52734375, + "router_z_loss_mlp": 0.40478516, + "step": 978, + "time_per_iteration": 2.6365275382995605 + }, + { + "auxiliary_loss_clip": 0.01774499, + "auxiliary_loss_mlp": 0.00419181, + "balance_loss_clip": 1.30926323, + "balance_loss_mlp": 0.37800649, + "epoch": 0.05886066436194198, + "flos": 27380450016000.0, + "grad_norm": 10.21025917633754, + "language_loss": 0.8702389, + "learning_rate": 3.991269567990855e-06, + "loss": 0.89217579, + "num_input_tokens_seen": 20885200, + "router_z_loss_clip": 4.66015625, + "router_z_loss_mlp": 0.41137695, + "step": 979, + "time_per_iteration": 2.708183526992798 + }, + { + "auxiliary_loss_clip": 0.01398857, + "auxiliary_loss_mlp": 0.00181478, + "balance_loss_clip": 1.0927906, + "balance_loss_mlp": 0.16183192, + "epoch": 0.05892078761460995, + "flos": 59584493525760.0, + "grad_norm": 1.0912572623344694, + "language_loss": 0.59258795, + "learning_rate": 3.9912331799250415e-06, + "loss": 0.60839128, + "num_input_tokens_seen": 20940325, + "router_z_loss_clip": 3.0625, + "router_z_loss_mlp": 0.19628906, + "step": 980, + "time_per_iteration": 3.055863618850708 + }, + { + "auxiliary_loss_clip": 0.01762895, + "auxiliary_loss_mlp": 0.00374049, + "balance_loss_clip": 1.30756664, + "balance_loss_mlp": 0.3351391, + "epoch": 0.05898091086727792, + "flos": 15414081500160.0, + "grad_norm": 5.982337744485506, + "language_loss": 0.94057399, + "learning_rate": 3.9911967163514665e-06, + "loss": 0.96194345, + "num_input_tokens_seen": 20958220, + "router_z_loss_clip": 4.55078125, + "router_z_loss_mlp": 0.38891602, + "step": 981, + "time_per_iteration": 2.6406936645507812 + }, + { + "auxiliary_loss_clip": 0.01732501, + "auxiliary_loss_mlp": 0.00361432, + "balance_loss_clip": 1.28992009, + "balance_loss_mlp": 0.32416725, + "epoch": 0.059041034119945886, + "flos": 23655328295040.0, + "grad_norm": 3.499936463769145, + "language_loss": 0.85492826, + "learning_rate": 3.991160177271513e-06, + "loss": 0.87586755, + "num_input_tokens_seen": 20978920, + "router_z_loss_clip": 4.42578125, + "router_z_loss_mlp": 0.37280273, + "step": 982, + "time_per_iteration": 2.6601898670196533 + }, + { + "auxiliary_loss_clip": 0.01714556, + "auxiliary_loss_mlp": 0.00349984, + "balance_loss_clip": 1.28025103, + "balance_loss_mlp": 0.30945295, + "epoch": 0.05910115737261386, + "flos": 24754087376640.0, + "grad_norm": 31.506532804059844, + "language_loss": 0.93357801, + "learning_rate": 3.9911235626865654e-06, + "loss": 0.95422339, + "num_input_tokens_seen": 20999490, + "router_z_loss_clip": 4.34375, + "router_z_loss_mlp": 0.40527344, + "step": 983, + "time_per_iteration": 2.6741371154785156 + }, + { + "auxiliary_loss_clip": 0.01645633, + "auxiliary_loss_mlp": 0.00340075, + "balance_loss_clip": 1.25016356, + "balance_loss_mlp": 0.30266684, + "epoch": 0.05916128062528183, + "flos": 11728749070080.0, + "grad_norm": 37.933988772030695, + "language_loss": 0.92243147, + "learning_rate": 3.9910868725980125e-06, + "loss": 0.94228852, + "num_input_tokens_seen": 21017865, + "router_z_loss_clip": 3.95117188, + "router_z_loss_mlp": 0.37402344, + "step": 984, + "time_per_iteration": 2.619649648666382 + }, + { + "auxiliary_loss_clip": 0.01659682, + "auxiliary_loss_mlp": 0.00343236, + "balance_loss_clip": 1.25745797, + "balance_loss_mlp": 0.307569, + "epoch": 0.059221403877949795, + "flos": 21902995296000.0, + "grad_norm": 782.7973208884571, + "language_loss": 0.85744202, + "learning_rate": 3.9910501070072465e-06, + "loss": 0.87747121, + "num_input_tokens_seen": 21035900, + "router_z_loss_clip": 4.01367188, + "router_z_loss_mlp": 0.35644531, + "step": 985, + "time_per_iteration": 2.6285126209259033 + }, + { + "auxiliary_loss_clip": 0.01678796, + "auxiliary_loss_mlp": 0.00328507, + "balance_loss_clip": 1.26990747, + "balance_loss_mlp": 0.29305398, + "epoch": 0.05928152713061777, + "flos": 20514580940160.0, + "grad_norm": 3.7945138102849816, + "language_loss": 0.97963524, + "learning_rate": 3.991013265915661e-06, + "loss": 0.99970829, + "num_input_tokens_seen": 21053235, + "router_z_loss_clip": 4.08789062, + "router_z_loss_mlp": 0.35424805, + "step": 986, + "time_per_iteration": 2.703303813934326 + }, + { + "auxiliary_loss_clip": 0.01665464, + "auxiliary_loss_mlp": 0.0032005, + "balance_loss_clip": 1.26600242, + "balance_loss_mlp": 0.28290477, + "epoch": 0.05934165038328574, + "flos": 24495135252480.0, + "grad_norm": 2.89219661127288, + "language_loss": 0.83994293, + "learning_rate": 3.9909763493246525e-06, + "loss": 0.85979807, + "num_input_tokens_seen": 21073090, + "router_z_loss_clip": 3.99023438, + "router_z_loss_mlp": 0.37182617, + "step": 987, + "time_per_iteration": 2.682426929473877 + }, + { + "auxiliary_loss_clip": 0.01650861, + "auxiliary_loss_mlp": 0.00330608, + "balance_loss_clip": 1.25947285, + "balance_loss_mlp": 0.29226989, + "epoch": 0.059401773635953704, + "flos": 38728041914880.0, + "grad_norm": 4.113118901959257, + "language_loss": 0.80119324, + "learning_rate": 3.990939357235621e-06, + "loss": 0.82100797, + "num_input_tokens_seen": 21094895, + "router_z_loss_clip": 3.91015625, + "router_z_loss_mlp": 0.38354492, + "step": 988, + "time_per_iteration": 2.8041887283325195 + }, + { + "auxiliary_loss_clip": 0.01358978, + "auxiliary_loss_mlp": 0.00115667, + "balance_loss_clip": 1.10455847, + "balance_loss_mlp": 0.10050378, + "epoch": 0.059461896888621676, + "flos": 58023565125120.0, + "grad_norm": 1.154805644810666, + "language_loss": 0.71360099, + "learning_rate": 3.99090228964997e-06, + "loss": 0.72834748, + "num_input_tokens_seen": 21147555, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.15136719, + "step": 989, + "time_per_iteration": 2.980240821838379 + }, + { + "auxiliary_loss_clip": 0.01626309, + "auxiliary_loss_mlp": 0.00322504, + "balance_loss_clip": 1.25046539, + "balance_loss_mlp": 0.28540596, + "epoch": 0.05952202014128964, + "flos": 22127760650880.0, + "grad_norm": 16.816813001840316, + "language_loss": 0.89332926, + "learning_rate": 3.990865146569105e-06, + "loss": 0.91281736, + "num_input_tokens_seen": 21167845, + "router_z_loss_clip": 3.76171875, + "router_z_loss_mlp": 0.37109375, + "step": 990, + "time_per_iteration": 4.209578514099121 + }, + { + "auxiliary_loss_clip": 0.01620381, + "auxiliary_loss_mlp": 0.00347667, + "balance_loss_clip": 1.24760878, + "balance_loss_mlp": 0.30961585, + "epoch": 0.059582143393957614, + "flos": 20445776438400.0, + "grad_norm": 2.225494722717274, + "language_loss": 0.91901922, + "learning_rate": 3.990827927994434e-06, + "loss": 0.93869972, + "num_input_tokens_seen": 21185085, + "router_z_loss_clip": 3.72460938, + "router_z_loss_mlp": 0.38061523, + "step": 991, + "time_per_iteration": 4.055104970932007 + }, + { + "auxiliary_loss_clip": 0.01647733, + "auxiliary_loss_mlp": 0.00378909, + "balance_loss_clip": 1.265311, + "balance_loss_mlp": 0.34102407, + "epoch": 0.059642266646625586, + "flos": 20594877793920.0, + "grad_norm": 87.50052918852928, + "language_loss": 0.85411024, + "learning_rate": 3.9907906339273674e-06, + "loss": 0.87437665, + "num_input_tokens_seen": 21204230, + "router_z_loss_clip": 3.83007812, + "router_z_loss_mlp": 0.37841797, + "step": 992, + "time_per_iteration": 2.6310813426971436 + }, + { + "auxiliary_loss_clip": 0.01637106, + "auxiliary_loss_mlp": 0.00324015, + "balance_loss_clip": 1.26657367, + "balance_loss_mlp": 0.28548694, + "epoch": 0.05970238989929355, + "flos": 19352655792000.0, + "grad_norm": 13.423008164917546, + "language_loss": 0.85813022, + "learning_rate": 3.9907532643693215e-06, + "loss": 0.8777414, + "num_input_tokens_seen": 21222655, + "router_z_loss_clip": 3.70703125, + "router_z_loss_mlp": 0.38549805, + "step": 993, + "time_per_iteration": 4.132798194885254 + }, + { + "auxiliary_loss_clip": 0.0166116, + "auxiliary_loss_mlp": 0.0036086, + "balance_loss_clip": 1.29145288, + "balance_loss_mlp": 0.32609814, + "epoch": 0.05976251315196152, + "flos": 30264040926720.0, + "grad_norm": 2.9596703954082884, + "language_loss": 0.87511587, + "learning_rate": 3.990715819321712e-06, + "loss": 0.89533603, + "num_input_tokens_seen": 21242310, + "router_z_loss_clip": 3.69335938, + "router_z_loss_mlp": 0.34716797, + "step": 994, + "time_per_iteration": 2.7761309146881104 + }, + { + "auxiliary_loss_clip": 0.01681864, + "auxiliary_loss_mlp": 0.0032291, + "balance_loss_clip": 1.30787945, + "balance_loss_mlp": 0.28299823, + "epoch": 0.05982263640462949, + "flos": 23185150243200.0, + "grad_norm": 16.813937286339197, + "language_loss": 0.87366414, + "learning_rate": 3.99067829878596e-06, + "loss": 0.89371192, + "num_input_tokens_seen": 21261410, + "router_z_loss_clip": 3.73828125, + "router_z_loss_mlp": 0.39941406, + "step": 995, + "time_per_iteration": 2.6131067276000977 + }, + { + "auxiliary_loss_clip": 0.01706365, + "auxiliary_loss_mlp": 0.00338826, + "balance_loss_clip": 1.33180904, + "balance_loss_mlp": 0.29970169, + "epoch": 0.05988275965729746, + "flos": 27850879463040.0, + "grad_norm": 24.819074816642743, + "language_loss": 0.94176435, + "learning_rate": 3.990640702763487e-06, + "loss": 0.96221626, + "num_input_tokens_seen": 21280080, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 0.39111328, + "step": 996, + "time_per_iteration": 2.679871082305908 + }, + { + "auxiliary_loss_clip": 0.01731905, + "auxiliary_loss_mlp": 0.00359559, + "balance_loss_clip": 1.35744858, + "balance_loss_mlp": 0.32031548, + "epoch": 0.05994288290996543, + "flos": 24680003575680.0, + "grad_norm": 21.222392055358505, + "language_loss": 0.97970444, + "learning_rate": 3.990603031255718e-06, + "loss": 1.00061905, + "num_input_tokens_seen": 21296765, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 0.39233398, + "step": 997, + "time_per_iteration": 2.6330554485321045 + }, + { + "auxiliary_loss_clip": 0.01633977, + "auxiliary_loss_mlp": 0.00150513, + "balance_loss_clip": 1.38874149, + "balance_loss_mlp": 0.12752967, + "epoch": 0.0600030061626334, + "flos": 69929568835200.0, + "grad_norm": 1.0551008262270325, + "language_loss": 0.75483477, + "learning_rate": 3.990565284264083e-06, + "loss": 0.77267969, + "num_input_tokens_seen": 21363345, + "router_z_loss_clip": 2.453125, + "router_z_loss_mlp": 0.22949219, + "step": 998, + "time_per_iteration": 3.203890562057495 + }, + { + "auxiliary_loss_clip": 0.01776175, + "auxiliary_loss_mlp": 0.00326555, + "balance_loss_clip": 1.40509033, + "balance_loss_mlp": 0.29024369, + "epoch": 0.06006312941530137, + "flos": 26540140268160.0, + "grad_norm": 106.00589759352613, + "language_loss": 0.83826053, + "learning_rate": 3.990527461790013e-06, + "loss": 0.85928786, + "num_input_tokens_seen": 21385290, + "router_z_loss_clip": 3.71289062, + "router_z_loss_mlp": 0.36303711, + "step": 999, + "time_per_iteration": 2.685818672180176 + }, + { + "auxiliary_loss_clip": 0.01822776, + "auxiliary_loss_mlp": 0.00345271, + "balance_loss_clip": 1.41901135, + "balance_loss_mlp": 0.30252233, + "epoch": 0.060123252667969335, + "flos": 27344000689920.0, + "grad_norm": 3.2307546122738313, + "language_loss": 0.87548482, + "learning_rate": 3.990489563834943e-06, + "loss": 0.89716524, + "num_input_tokens_seen": 21407625, + "router_z_loss_clip": 4.03710938, + "router_z_loss_mlp": 0.42724609, + "step": 1000, + "time_per_iteration": 2.6541972160339355 + }, + { + "auxiliary_loss_clip": 0.01842635, + "auxiliary_loss_mlp": 0.00370844, + "balance_loss_clip": 1.43991327, + "balance_loss_mlp": 0.32881033, + "epoch": 0.06018337592063731, + "flos": 27016710940800.0, + "grad_norm": 39.40998100797067, + "language_loss": 0.93547332, + "learning_rate": 3.990451590400309e-06, + "loss": 0.9576081, + "num_input_tokens_seen": 21426835, + "router_z_loss_clip": 4.02148438, + "router_z_loss_mlp": 0.42041016, + "step": 1001, + "time_per_iteration": 2.6802408695220947 + }, + { + "auxiliary_loss_clip": 0.01827205, + "auxiliary_loss_mlp": 0.00325694, + "balance_loss_clip": 1.43629336, + "balance_loss_mlp": 0.28895375, + "epoch": 0.06024349917330528, + "flos": 25592960580480.0, + "grad_norm": 16.304013767085294, + "language_loss": 0.80052364, + "learning_rate": 3.990413541487551e-06, + "loss": 0.8220526, + "num_input_tokens_seen": 21444920, + "router_z_loss_clip": 3.91015625, + "router_z_loss_mlp": 0.3671875, + "step": 1002, + "time_per_iteration": 2.637951374053955 + }, + { + "auxiliary_loss_clip": 0.01878831, + "auxiliary_loss_mlp": 0.00368687, + "balance_loss_clip": 1.45997667, + "balance_loss_mlp": 0.32746488, + "epoch": 0.060303622425973244, + "flos": 26133271937280.0, + "grad_norm": 143.03357346808386, + "language_loss": 0.84405422, + "learning_rate": 3.990375417098112e-06, + "loss": 0.86652941, + "num_input_tokens_seen": 21463555, + "router_z_loss_clip": 4.1953125, + "router_z_loss_mlp": 0.41210938, + "step": 1003, + "time_per_iteration": 2.694754123687744 + }, + { + "auxiliary_loss_clip": 0.01843627, + "auxiliary_loss_mlp": 0.00340702, + "balance_loss_clip": 1.43888187, + "balance_loss_mlp": 0.29781002, + "epoch": 0.060363745678641216, + "flos": 20377187418240.0, + "grad_norm": 25.42022466348677, + "language_loss": 0.79460704, + "learning_rate": 3.990337217233437e-06, + "loss": 0.81645036, + "num_input_tokens_seen": 21481990, + "router_z_loss_clip": 4.05273438, + "router_z_loss_mlp": 0.42895508, + "step": 1004, + "time_per_iteration": 2.6466543674468994 + }, + { + "auxiliary_loss_clip": 0.01877194, + "auxiliary_loss_mlp": 0.00360124, + "balance_loss_clip": 1.45792222, + "balance_loss_mlp": 0.31840116, + "epoch": 0.06042386893130918, + "flos": 17749172753280.0, + "grad_norm": 8.376134772046267, + "language_loss": 0.91994119, + "learning_rate": 3.990298941894976e-06, + "loss": 0.94231427, + "num_input_tokens_seen": 21500385, + "router_z_loss_clip": 4.19140625, + "router_z_loss_mlp": 0.41748047, + "step": 1005, + "time_per_iteration": 2.634685754776001 + }, + { + "auxiliary_loss_clip": 0.01538009, + "auxiliary_loss_mlp": 0.00219056, + "balance_loss_clip": 1.3119669, + "balance_loss_mlp": 0.20093589, + "epoch": 0.06048399218397715, + "flos": 68538496872960.0, + "grad_norm": 43.177648572100075, + "language_loss": 0.58819586, + "learning_rate": 3.9902605910841794e-06, + "loss": 0.60576648, + "num_input_tokens_seen": 21561040, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.18164062, + "step": 1006, + "time_per_iteration": 3.2165729999542236 + }, + { + "auxiliary_loss_clip": 0.01868401, + "auxiliary_loss_mlp": 0.00360045, + "balance_loss_clip": 1.45409334, + "balance_loss_mlp": 0.32065862, + "epoch": 0.060544115436645125, + "flos": 23258515772160.0, + "grad_norm": 2.4604411269669657, + "language_loss": 0.82523763, + "learning_rate": 3.990222164802503e-06, + "loss": 0.84752208, + "num_input_tokens_seen": 21580655, + "router_z_loss_clip": 4.13671875, + "router_z_loss_mlp": 0.39404297, + "step": 1007, + "time_per_iteration": 2.6570353507995605 + }, + { + "auxiliary_loss_clip": 0.01896049, + "auxiliary_loss_mlp": 0.00382109, + "balance_loss_clip": 1.46747708, + "balance_loss_mlp": 0.33752495, + "epoch": 0.06060423868931309, + "flos": 23878441624320.0, + "grad_norm": 4.400931930487963, + "language_loss": 0.88007832, + "learning_rate": 3.9901836630514006e-06, + "loss": 0.90285981, + "num_input_tokens_seen": 21599650, + "router_z_loss_clip": 4.28515625, + "router_z_loss_mlp": 0.44604492, + "step": 1008, + "time_per_iteration": 2.670351505279541 + }, + { + "auxiliary_loss_clip": 0.01879731, + "auxiliary_loss_mlp": 0.00352687, + "balance_loss_clip": 1.47013474, + "balance_loss_mlp": 0.31420663, + "epoch": 0.06066436194198106, + "flos": 18728061171840.0, + "grad_norm": 6.58875470930951, + "language_loss": 0.86297059, + "learning_rate": 3.990145085832335e-06, + "loss": 0.8852948, + "num_input_tokens_seen": 21617550, + "router_z_loss_clip": 4.0859375, + "router_z_loss_mlp": 0.38476562, + "step": 1009, + "time_per_iteration": 2.6222121715545654 + }, + { + "auxiliary_loss_clip": 0.01850553, + "auxiliary_loss_mlp": 0.00341223, + "balance_loss_clip": 1.45428109, + "balance_loss_mlp": 0.30124068, + "epoch": 0.06072448519464903, + "flos": 24640465680000.0, + "grad_norm": 279.7912568692669, + "language_loss": 0.98131657, + "learning_rate": 3.990106433146769e-06, + "loss": 1.00323427, + "num_input_tokens_seen": 21635865, + "router_z_loss_clip": 3.95898438, + "router_z_loss_mlp": 0.39990234, + "step": 1010, + "time_per_iteration": 2.696638822555542 + }, + { + "auxiliary_loss_clip": 0.01873696, + "auxiliary_loss_mlp": 0.00431112, + "balance_loss_clip": 1.44980359, + "balance_loss_mlp": 0.3839286, + "epoch": 0.060784608447317, + "flos": 17378825575680.0, + "grad_norm": 223.97511486786448, + "language_loss": 0.81750584, + "learning_rate": 3.9900677049961665e-06, + "loss": 0.84055394, + "num_input_tokens_seen": 21653945, + "router_z_loss_clip": 4.24609375, + "router_z_loss_mlp": 0.47241211, + "step": 1011, + "time_per_iteration": 2.600973129272461 + }, + { + "auxiliary_loss_clip": 0.01853479, + "auxiliary_loss_mlp": 0.00393421, + "balance_loss_clip": 1.45569372, + "balance_loss_mlp": 0.35110182, + "epoch": 0.06084473169998497, + "flos": 23692208584320.0, + "grad_norm": 2.6030797472411518, + "language_loss": 0.93187892, + "learning_rate": 3.990028901381999e-06, + "loss": 0.95434785, + "num_input_tokens_seen": 21671230, + "router_z_loss_clip": 3.9765625, + "router_z_loss_mlp": 0.4230957, + "step": 1012, + "time_per_iteration": 2.7041497230529785 + }, + { + "auxiliary_loss_clip": 0.01877217, + "auxiliary_loss_mlp": 0.00416363, + "balance_loss_clip": 1.45679975, + "balance_loss_mlp": 0.371373, + "epoch": 0.06090485495265294, + "flos": 23546339452800.0, + "grad_norm": 66.06224628510309, + "language_loss": 0.83260453, + "learning_rate": 3.989990022305734e-06, + "loss": 0.85554034, + "num_input_tokens_seen": 21691155, + "router_z_loss_clip": 4.20898438, + "router_z_loss_mlp": 0.44995117, + "step": 1013, + "time_per_iteration": 2.636848211288452 + }, + { + "auxiliary_loss_clip": 0.01913739, + "auxiliary_loss_mlp": 0.00429934, + "balance_loss_clip": 1.47620547, + "balance_loss_mlp": 0.38034272, + "epoch": 0.06096497820532091, + "flos": 20339301548160.0, + "grad_norm": 45.23706536116578, + "language_loss": 0.91789669, + "learning_rate": 3.98995106776885e-06, + "loss": 0.94133341, + "num_input_tokens_seen": 21707405, + "router_z_loss_clip": 4.37890625, + "router_z_loss_mlp": 0.49560547, + "step": 1014, + "time_per_iteration": 2.6293182373046875 + }, + { + "auxiliary_loss_clip": 0.01870352, + "auxiliary_loss_mlp": 0.00425068, + "balance_loss_clip": 1.44842851, + "balance_loss_mlp": 0.37728938, + "epoch": 0.061025101457988874, + "flos": 26939035779840.0, + "grad_norm": 6.954144846840946, + "language_loss": 0.81825733, + "learning_rate": 3.98991203777282e-06, + "loss": 0.8412115, + "num_input_tokens_seen": 21728090, + "router_z_loss_clip": 4.22265625, + "router_z_loss_mlp": 0.47729492, + "step": 1015, + "time_per_iteration": 2.696056842803955 + }, + { + "auxiliary_loss_clip": 0.01909917, + "auxiliary_loss_mlp": 0.0036301, + "balance_loss_clip": 1.47450376, + "balance_loss_mlp": 0.3243621, + "epoch": 0.061085224710656846, + "flos": 25375054723200.0, + "grad_norm": 6.526687359081663, + "language_loss": 0.85430336, + "learning_rate": 3.9898729323191275e-06, + "loss": 0.87703264, + "num_input_tokens_seen": 21747950, + "router_z_loss_clip": 4.359375, + "router_z_loss_mlp": 0.38623047, + "step": 1016, + "time_per_iteration": 2.6829376220703125 + }, + { + "auxiliary_loss_clip": 0.0187294, + "auxiliary_loss_mlp": 0.00393178, + "balance_loss_clip": 1.45441532, + "balance_loss_mlp": 0.34921318, + "epoch": 0.06114534796332482, + "flos": 24824759385600.0, + "grad_norm": 127.25332078933282, + "language_loss": 0.81198764, + "learning_rate": 3.989833751409254e-06, + "loss": 0.83464879, + "num_input_tokens_seen": 21767900, + "router_z_loss_clip": 4.1875, + "router_z_loss_mlp": 0.43994141, + "step": 1017, + "time_per_iteration": 2.6310696601867676 + }, + { + "auxiliary_loss_clip": 0.01861398, + "auxiliary_loss_mlp": 0.0040916, + "balance_loss_clip": 1.44158649, + "balance_loss_mlp": 0.36054617, + "epoch": 0.061205471215992784, + "flos": 20631434860800.0, + "grad_norm": 46.068224886081566, + "language_loss": 0.93144459, + "learning_rate": 3.989794495044685e-06, + "loss": 0.9541502, + "num_input_tokens_seen": 21787375, + "router_z_loss_clip": 4.19921875, + "router_z_loss_mlp": 0.48608398, + "step": 1018, + "time_per_iteration": 2.678844690322876 + }, + { + "auxiliary_loss_clip": 0.01886766, + "auxiliary_loss_mlp": 0.00373083, + "balance_loss_clip": 1.46006417, + "balance_loss_mlp": 0.33028725, + "epoch": 0.061265594468660756, + "flos": 16508351381760.0, + "grad_norm": 374.41224165433374, + "language_loss": 0.86680174, + "learning_rate": 3.989755163226909e-06, + "loss": 0.88940024, + "num_input_tokens_seen": 21806275, + "router_z_loss_clip": 4.26757812, + "router_z_loss_mlp": 0.42773438, + "step": 1019, + "time_per_iteration": 2.6223413944244385 + }, + { + "auxiliary_loss_clip": 0.01902488, + "auxiliary_loss_mlp": 0.00392727, + "balance_loss_clip": 1.46695912, + "balance_loss_mlp": 0.35210082, + "epoch": 0.06132571772132872, + "flos": 26246211275520.0, + "grad_norm": 99.48803234602866, + "language_loss": 0.89968264, + "learning_rate": 3.989715755957418e-06, + "loss": 0.92263484, + "num_input_tokens_seen": 21826430, + "router_z_loss_clip": 4.35742188, + "router_z_loss_mlp": 0.40673828, + "step": 1020, + "time_per_iteration": 2.66105318069458 + }, + { + "auxiliary_loss_clip": 0.01898763, + "auxiliary_loss_mlp": 0.0038324, + "balance_loss_clip": 1.46687329, + "balance_loss_mlp": 0.34146923, + "epoch": 0.06138584097399669, + "flos": 37414788768000.0, + "grad_norm": 12.244898198062398, + "language_loss": 0.85054487, + "learning_rate": 3.989676273237705e-06, + "loss": 0.87336487, + "num_input_tokens_seen": 21847800, + "router_z_loss_clip": 4.31640625, + "router_z_loss_mlp": 0.41772461, + "step": 1021, + "time_per_iteration": 2.730564594268799 + }, + { + "auxiliary_loss_clip": 0.01850455, + "auxiliary_loss_mlp": 0.00390287, + "balance_loss_clip": 1.44380486, + "balance_loss_mlp": 0.35209283, + "epoch": 0.061445964226664665, + "flos": 17420661941760.0, + "grad_norm": 27.189028685072607, + "language_loss": 0.94254279, + "learning_rate": 3.9896367150692705e-06, + "loss": 0.96495026, + "num_input_tokens_seen": 21863385, + "router_z_loss_clip": 4.06640625, + "router_z_loss_mlp": 0.38183594, + "step": 1022, + "time_per_iteration": 2.6192102432250977 + }, + { + "auxiliary_loss_clip": 0.01880958, + "auxiliary_loss_mlp": 0.00383711, + "balance_loss_clip": 1.4619596, + "balance_loss_mlp": 0.34356105, + "epoch": 0.06150608747933263, + "flos": 22600021691520.0, + "grad_norm": 2.8361091294177605, + "language_loss": 0.88909113, + "learning_rate": 3.989597081453611e-06, + "loss": 0.9117378, + "num_input_tokens_seen": 21881880, + "router_z_loss_clip": 4.1875, + "router_z_loss_mlp": 0.40136719, + "step": 1023, + "time_per_iteration": 2.633117437362671 + }, + { + "auxiliary_loss_clip": 0.01456976, + "auxiliary_loss_mlp": 0.00162268, + "balance_loss_clip": 1.24416542, + "balance_loss_mlp": 0.14948836, + "epoch": 0.0615662107320006, + "flos": 56741482005120.0, + "grad_norm": 0.9635263638814625, + "language_loss": 0.64754468, + "learning_rate": 3.989557372392231e-06, + "loss": 0.66373712, + "num_input_tokens_seen": 21940550, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.12792969, + "step": 1024, + "time_per_iteration": 3.20097017288208 + }, + { + "auxiliary_loss_clip": 0.01861612, + "auxiliary_loss_mlp": 0.00395914, + "balance_loss_clip": 1.44995832, + "balance_loss_mlp": 0.35376108, + "epoch": 0.06162633398466857, + "flos": 22564793427840.0, + "grad_norm": 177.3400539591709, + "language_loss": 0.94409627, + "learning_rate": 3.989517587886636e-06, + "loss": 0.96667153, + "num_input_tokens_seen": 21958390, + "router_z_loss_clip": 4.11914062, + "router_z_loss_mlp": 0.421875, + "step": 1025, + "time_per_iteration": 2.659351348876953 + }, + { + "auxiliary_loss_clip": 0.01831959, + "auxiliary_loss_mlp": 0.00390447, + "balance_loss_clip": 1.43562269, + "balance_loss_mlp": 0.34798512, + "epoch": 0.06168645723733654, + "flos": 25593104234880.0, + "grad_norm": 3.5224436118804165, + "language_loss": 0.89430487, + "learning_rate": 3.989477727938335e-06, + "loss": 0.91652894, + "num_input_tokens_seen": 21978625, + "router_z_loss_clip": 3.96289062, + "router_z_loss_mlp": 0.42456055, + "step": 1026, + "time_per_iteration": 2.6625702381134033 + }, + { + "auxiliary_loss_clip": 0.01808867, + "auxiliary_loss_mlp": 0.00408159, + "balance_loss_clip": 1.41947174, + "balance_loss_mlp": 0.3622635, + "epoch": 0.06174658049000451, + "flos": 15997917162240.0, + "grad_norm": 5.570639554946303, + "language_loss": 0.88275379, + "learning_rate": 3.989437792548839e-06, + "loss": 0.90492404, + "num_input_tokens_seen": 21996035, + "router_z_loss_clip": 3.89453125, + "router_z_loss_mlp": 0.45898438, + "step": 1027, + "time_per_iteration": 2.693840503692627 + }, + { + "auxiliary_loss_clip": 0.01767877, + "auxiliary_loss_mlp": 0.00373479, + "balance_loss_clip": 1.40484929, + "balance_loss_mlp": 0.33084989, + "epoch": 0.06180670374267248, + "flos": 11285970117120.0, + "grad_norm": 11.56938496606523, + "language_loss": 0.90706736, + "learning_rate": 3.989397781719663e-06, + "loss": 0.92848092, + "num_input_tokens_seen": 22011625, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 0.42602539, + "step": 1028, + "time_per_iteration": 2.6674892902374268 + }, + { + "auxiliary_loss_clip": 0.01433993, + "auxiliary_loss_mlp": 0.00150846, + "balance_loss_clip": 1.22681284, + "balance_loss_mlp": 0.13568221, + "epoch": 0.06186682699534045, + "flos": 65130142216320.0, + "grad_norm": 1.0901759473695556, + "language_loss": 0.60327333, + "learning_rate": 3.989357695452323e-06, + "loss": 0.61912173, + "num_input_tokens_seen": 22066035, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.15136719, + "step": 1029, + "time_per_iteration": 3.0580804347991943 + }, + { + "auxiliary_loss_clip": 0.01743541, + "auxiliary_loss_mlp": 0.00395852, + "balance_loss_clip": 1.38180041, + "balance_loss_mlp": 0.35105315, + "epoch": 0.061926950248008414, + "flos": 21105742976640.0, + "grad_norm": 9.723089529918502, + "language_loss": 0.88497978, + "learning_rate": 3.98931753374834e-06, + "loss": 0.90637368, + "num_input_tokens_seen": 22085015, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 0.44775391, + "step": 1030, + "time_per_iteration": 2.7045562267303467 + }, + { + "auxiliary_loss_clip": 0.01708122, + "auxiliary_loss_mlp": 0.00354881, + "balance_loss_clip": 1.36777782, + "balance_loss_mlp": 0.31427795, + "epoch": 0.061987073500676386, + "flos": 17748454481280.0, + "grad_norm": 10.52847040221949, + "language_loss": 0.8979668, + "learning_rate": 3.989277296609237e-06, + "loss": 0.91859686, + "num_input_tokens_seen": 22102775, + "router_z_loss_clip": 3.40234375, + "router_z_loss_mlp": 0.40576172, + "step": 1031, + "time_per_iteration": 2.7101552486419678 + }, + { + "auxiliary_loss_clip": 0.01717851, + "auxiliary_loss_mlp": 0.00378509, + "balance_loss_clip": 1.3781321, + "balance_loss_mlp": 0.33869347, + "epoch": 0.06204719675334436, + "flos": 21836237869440.0, + "grad_norm": 27.03847660760682, + "language_loss": 0.83419991, + "learning_rate": 3.98923698403654e-06, + "loss": 0.85516351, + "num_input_tokens_seen": 22121680, + "router_z_loss_clip": 3.3984375, + "router_z_loss_mlp": 0.39868164, + "step": 1032, + "time_per_iteration": 4.214528799057007 + }, + { + "auxiliary_loss_clip": 0.01659865, + "auxiliary_loss_mlp": 0.00332538, + "balance_loss_clip": 1.33332968, + "balance_loss_mlp": 0.29136324, + "epoch": 0.06210732000601232, + "flos": 19353697286400.0, + "grad_norm": 6.631828807688025, + "language_loss": 0.9858889, + "learning_rate": 3.989196596031776e-06, + "loss": 1.00581288, + "num_input_tokens_seen": 22138155, + "router_z_loss_clip": 3.26171875, + "router_z_loss_mlp": 0.41210938, + "step": 1033, + "time_per_iteration": 4.075666189193726 + }, + { + "auxiliary_loss_clip": 0.01685666, + "auxiliary_loss_mlp": 0.00342824, + "balance_loss_clip": 1.35468292, + "balance_loss_mlp": 0.30405715, + "epoch": 0.062167443258680295, + "flos": 24749382695040.0, + "grad_norm": 51.025577116666796, + "language_loss": 0.90720797, + "learning_rate": 3.989156132596479e-06, + "loss": 0.92749286, + "num_input_tokens_seen": 22157420, + "router_z_loss_clip": 3.3125, + "router_z_loss_mlp": 0.38745117, + "step": 1034, + "time_per_iteration": 2.6525027751922607 + }, + { + "auxiliary_loss_clip": 0.01659377, + "auxiliary_loss_mlp": 0.00339913, + "balance_loss_clip": 1.34293532, + "balance_loss_mlp": 0.30074096, + "epoch": 0.06222756651134827, + "flos": 34458478773120.0, + "grad_norm": 7.819582584557896, + "language_loss": 0.87168229, + "learning_rate": 3.989115593732182e-06, + "loss": 0.89167523, + "num_input_tokens_seen": 22178620, + "router_z_loss_clip": 3.1640625, + "router_z_loss_mlp": 0.3918457, + "step": 1035, + "time_per_iteration": 4.274803400039673 + }, + { + "auxiliary_loss_clip": 0.01659602, + "auxiliary_loss_mlp": 0.00421645, + "balance_loss_clip": 1.33363366, + "balance_loss_mlp": 0.37434241, + "epoch": 0.06228768976401623, + "flos": 25666469763840.0, + "grad_norm": 16.870520550119824, + "language_loss": 0.85486513, + "learning_rate": 3.989074979440421e-06, + "loss": 0.87567759, + "num_input_tokens_seen": 22197125, + "router_z_loss_clip": 3.26171875, + "router_z_loss_mlp": 0.47290039, + "step": 1036, + "time_per_iteration": 3.036057472229004 + }, + { + "auxiliary_loss_clip": 0.01658651, + "auxiliary_loss_mlp": 0.00396918, + "balance_loss_clip": 1.33672905, + "balance_loss_mlp": 0.35538575, + "epoch": 0.062347813016684205, + "flos": 25295619795840.0, + "grad_norm": 51.35407173571347, + "language_loss": 0.91975152, + "learning_rate": 3.989034289722739e-06, + "loss": 0.9403072, + "num_input_tokens_seen": 22217575, + "router_z_loss_clip": 3.21289062, + "router_z_loss_mlp": 0.41552734, + "step": 1037, + "time_per_iteration": 2.703125238418579 + }, + { + "auxiliary_loss_clip": 0.01631469, + "auxiliary_loss_mlp": 0.00371782, + "balance_loss_clip": 1.3175211, + "balance_loss_mlp": 0.32874703, + "epoch": 0.06240793626935217, + "flos": 26907039740160.0, + "grad_norm": 102.63033574690957, + "language_loss": 0.87647635, + "learning_rate": 3.988993524580676e-06, + "loss": 0.89650881, + "num_input_tokens_seen": 22236840, + "router_z_loss_clip": 3.14257812, + "router_z_loss_mlp": 0.43066406, + "step": 1038, + "time_per_iteration": 2.6997342109680176 + }, + { + "auxiliary_loss_clip": 0.01626554, + "auxiliary_loss_mlp": 0.00396285, + "balance_loss_clip": 1.31949532, + "balance_loss_mlp": 0.35236877, + "epoch": 0.06246805952202014, + "flos": 21615782146560.0, + "grad_norm": 14.82545027577469, + "language_loss": 0.92193919, + "learning_rate": 3.98895268401578e-06, + "loss": 0.94216758, + "num_input_tokens_seen": 22256465, + "router_z_loss_clip": 3.0703125, + "router_z_loss_mlp": 0.43920898, + "step": 1039, + "time_per_iteration": 2.607518434524536 + }, + { + "auxiliary_loss_clip": 0.01624446, + "auxiliary_loss_mlp": 0.00401954, + "balance_loss_clip": 1.31115305, + "balance_loss_mlp": 0.35884771, + "epoch": 0.0625281827746881, + "flos": 19311896833920.0, + "grad_norm": 3.2250239979738278, + "language_loss": 0.88526273, + "learning_rate": 3.9889117680296e-06, + "loss": 0.90552664, + "num_input_tokens_seen": 22274025, + "router_z_loss_clip": 3.1328125, + "router_z_loss_mlp": 0.43139648, + "step": 1040, + "time_per_iteration": 2.636939287185669 + }, + { + "auxiliary_loss_clip": 0.01610067, + "auxiliary_loss_mlp": 0.0043907, + "balance_loss_clip": 1.30093145, + "balance_loss_mlp": 0.39317405, + "epoch": 0.06258830602735609, + "flos": 27745769289600.0, + "grad_norm": 24.126318215047956, + "language_loss": 0.75930858, + "learning_rate": 3.988870776623685e-06, + "loss": 0.7798, + "num_input_tokens_seen": 22292245, + "router_z_loss_clip": 3.08789062, + "router_z_loss_mlp": 0.45898438, + "step": 1041, + "time_per_iteration": 2.6684648990631104 + }, + { + "auxiliary_loss_clip": 0.01575003, + "auxiliary_loss_mlp": 0.00418086, + "balance_loss_clip": 1.2693882, + "balance_loss_mlp": 0.37219071, + "epoch": 0.06264842928002405, + "flos": 23222605150080.0, + "grad_norm": 5.778946106556664, + "language_loss": 0.87992918, + "learning_rate": 3.9888297097995905e-06, + "loss": 0.89986014, + "num_input_tokens_seen": 22311455, + "router_z_loss_clip": 3.05664062, + "router_z_loss_mlp": 0.45947266, + "step": 1042, + "time_per_iteration": 2.6777265071868896 + }, + { + "auxiliary_loss_clip": 0.01566022, + "auxiliary_loss_mlp": 0.0039418, + "balance_loss_clip": 1.26798785, + "balance_loss_mlp": 0.35393482, + "epoch": 0.06270855253269202, + "flos": 38399495189760.0, + "grad_norm": 82.22822001469639, + "language_loss": 0.82518959, + "learning_rate": 3.988788567558874e-06, + "loss": 0.84479165, + "num_input_tokens_seen": 22333750, + "router_z_loss_clip": 2.97851562, + "router_z_loss_mlp": 0.40258789, + "step": 1043, + "time_per_iteration": 2.797785997390747 + }, + { + "auxiliary_loss_clip": 0.01551051, + "auxiliary_loss_mlp": 0.00408318, + "balance_loss_clip": 1.26185131, + "balance_loss_mlp": 0.36649978, + "epoch": 0.06276867578535998, + "flos": 22453542028800.0, + "grad_norm": 10.33769875826316, + "language_loss": 0.97667933, + "learning_rate": 3.988747349903097e-06, + "loss": 0.99627304, + "num_input_tokens_seen": 22351940, + "router_z_loss_clip": 2.890625, + "router_z_loss_mlp": 0.41772461, + "step": 1044, + "time_per_iteration": 2.660820722579956 + }, + { + "auxiliary_loss_clip": 0.0153446, + "auxiliary_loss_mlp": 0.00440267, + "balance_loss_clip": 1.24242496, + "balance_loss_mlp": 0.39742348, + "epoch": 0.06282879903802796, + "flos": 22930435923840.0, + "grad_norm": 5.094887820162873, + "language_loss": 0.90001857, + "learning_rate": 3.988706056833821e-06, + "loss": 0.91976577, + "num_input_tokens_seen": 22372085, + "router_z_loss_clip": 2.91796875, + "router_z_loss_mlp": 0.42871094, + "step": 1045, + "time_per_iteration": 2.6405088901519775 + }, + { + "auxiliary_loss_clip": 0.01538443, + "auxiliary_loss_mlp": 0.00411714, + "balance_loss_clip": 1.24800789, + "balance_loss_mlp": 0.37049109, + "epoch": 0.06288892229069593, + "flos": 34819237019520.0, + "grad_norm": 572.2871477114196, + "language_loss": 0.8495332, + "learning_rate": 3.9886646883526125e-06, + "loss": 0.86903477, + "num_input_tokens_seen": 22392020, + "router_z_loss_clip": 2.90625, + "router_z_loss_mlp": 0.41210938, + "step": 1046, + "time_per_iteration": 2.743758201599121 + }, + { + "auxiliary_loss_clip": 0.01529178, + "auxiliary_loss_mlp": 0.00392568, + "balance_loss_clip": 1.24025345, + "balance_loss_mlp": 0.35375324, + "epoch": 0.06294904554336389, + "flos": 19427134642560.0, + "grad_norm": 23.076959387051286, + "language_loss": 0.85560918, + "learning_rate": 3.988623244461039e-06, + "loss": 0.87482667, + "num_input_tokens_seen": 22411180, + "router_z_loss_clip": 2.88867188, + "router_z_loss_mlp": 0.38793945, + "step": 1047, + "time_per_iteration": 2.631890296936035 + }, + { + "auxiliary_loss_clip": 0.0153781, + "auxiliary_loss_mlp": 0.00432864, + "balance_loss_clip": 1.2367475, + "balance_loss_mlp": 0.38847029, + "epoch": 0.06300916879603187, + "flos": 40661867358720.0, + "grad_norm": 11.663159420434013, + "language_loss": 0.82016432, + "learning_rate": 3.988581725160672e-06, + "loss": 0.83987111, + "num_input_tokens_seen": 22435105, + "router_z_loss_clip": 3.01171875, + "router_z_loss_mlp": 0.4440918, + "step": 1048, + "time_per_iteration": 2.8371901512145996 + }, + { + "auxiliary_loss_clip": 0.01505333, + "auxiliary_loss_mlp": 0.00419765, + "balance_loss_clip": 1.21950078, + "balance_loss_mlp": 0.3788051, + "epoch": 0.06306929204869983, + "flos": 23804142341760.0, + "grad_norm": 9.289639915963775, + "language_loss": 0.86412036, + "learning_rate": 3.988540130453087e-06, + "loss": 0.88337135, + "num_input_tokens_seen": 22452710, + "router_z_loss_clip": 2.859375, + "router_z_loss_mlp": 0.40991211, + "step": 1049, + "time_per_iteration": 2.7028040885925293 + }, + { + "auxiliary_loss_clip": 0.0150963, + "auxiliary_loss_mlp": 0.00416393, + "balance_loss_clip": 1.22100389, + "balance_loss_mlp": 0.37357295, + "epoch": 0.0631294153013678, + "flos": 18915802583040.0, + "grad_norm": 355.8530737112155, + "language_loss": 0.88772601, + "learning_rate": 3.988498460339862e-06, + "loss": 0.90698624, + "num_input_tokens_seen": 22470175, + "router_z_loss_clip": 2.88671875, + "router_z_loss_mlp": 0.4284668, + "step": 1050, + "time_per_iteration": 2.6492273807525635 + }, + { + "auxiliary_loss_clip": 0.01499065, + "auxiliary_loss_mlp": 0.00414138, + "balance_loss_clip": 1.221246, + "balance_loss_mlp": 0.37599057, + "epoch": 0.06318953855403578, + "flos": 24280174310400.0, + "grad_norm": 50.78399641545571, + "language_loss": 0.83595645, + "learning_rate": 3.988456714822575e-06, + "loss": 0.85508847, + "num_input_tokens_seen": 22490020, + "router_z_loss_clip": 2.77929688, + "router_z_loss_mlp": 0.38110352, + "step": 1051, + "time_per_iteration": 2.7040462493896484 + }, + { + "auxiliary_loss_clip": 0.0149942, + "auxiliary_loss_mlp": 0.0037517, + "balance_loss_clip": 1.216043, + "balance_loss_mlp": 0.33575985, + "epoch": 0.06324966180670374, + "flos": 22528918719360.0, + "grad_norm": 10.062722000736667, + "language_loss": 0.88306195, + "learning_rate": 3.98841489390281e-06, + "loss": 0.9018079, + "num_input_tokens_seen": 22509685, + "router_z_loss_clip": 2.83398438, + "router_z_loss_mlp": 0.39453125, + "step": 1052, + "time_per_iteration": 2.645591974258423 + }, + { + "auxiliary_loss_clip": 0.0149662, + "auxiliary_loss_mlp": 0.00409485, + "balance_loss_clip": 1.20810378, + "balance_loss_mlp": 0.36862049, + "epoch": 0.06330978505937171, + "flos": 15778107884160.0, + "grad_norm": 129.30674006392036, + "language_loss": 0.85990751, + "learning_rate": 3.988372997582155e-06, + "loss": 0.8789686, + "num_input_tokens_seen": 22527905, + "router_z_loss_clip": 2.88867188, + "router_z_loss_mlp": 0.40893555, + "step": 1053, + "time_per_iteration": 2.6938087940216064 + }, + { + "auxiliary_loss_clip": 0.01483033, + "auxiliary_loss_mlp": 0.00400075, + "balance_loss_clip": 1.20167947, + "balance_loss_mlp": 0.36214286, + "epoch": 0.06336990831203967, + "flos": 21471098163840.0, + "grad_norm": 3.3322601717178655, + "language_loss": 0.89846718, + "learning_rate": 3.988331025862195e-06, + "loss": 0.91729832, + "num_input_tokens_seen": 22546335, + "router_z_loss_clip": 2.81445312, + "router_z_loss_mlp": 0.37915039, + "step": 1054, + "time_per_iteration": 2.6541240215301514 + }, + { + "auxiliary_loss_clip": 0.01479265, + "auxiliary_loss_mlp": 0.00384223, + "balance_loss_clip": 1.19916058, + "balance_loss_mlp": 0.34671992, + "epoch": 0.06343003156470765, + "flos": 18478877546880.0, + "grad_norm": 12.357516295828102, + "language_loss": 0.9272716, + "learning_rate": 3.9882889787445225e-06, + "loss": 0.94590652, + "num_input_tokens_seen": 22563885, + "router_z_loss_clip": 2.8046875, + "router_z_loss_mlp": 0.375, + "step": 1055, + "time_per_iteration": 2.664271116256714 + }, + { + "auxiliary_loss_clip": 0.01505765, + "auxiliary_loss_mlp": 0.00393729, + "balance_loss_clip": 1.21096802, + "balance_loss_mlp": 0.35467586, + "epoch": 0.06349015481737562, + "flos": 25154886309120.0, + "grad_norm": 88.11730059923521, + "language_loss": 0.92687345, + "learning_rate": 3.988246856230734e-06, + "loss": 0.94586837, + "num_input_tokens_seen": 22583035, + "router_z_loss_clip": 2.94921875, + "router_z_loss_mlp": 0.39038086, + "step": 1056, + "time_per_iteration": 2.6961615085601807 + }, + { + "auxiliary_loss_clip": 0.01518604, + "auxiliary_loss_mlp": 0.00475803, + "balance_loss_clip": 1.22077286, + "balance_loss_mlp": 0.42645082, + "epoch": 0.06355027807004358, + "flos": 26871775562880.0, + "grad_norm": 20.327154981567332, + "language_loss": 0.88945347, + "learning_rate": 3.988204658322426e-06, + "loss": 0.90939754, + "num_input_tokens_seen": 22605055, + "router_z_loss_clip": 2.9765625, + "router_z_loss_mlp": 0.4934082, + "step": 1057, + "time_per_iteration": 2.7087011337280273 + }, + { + "auxiliary_loss_clip": 0.01497867, + "auxiliary_loss_mlp": 0.00364259, + "balance_loss_clip": 1.2158916, + "balance_loss_mlp": 0.32863915, + "epoch": 0.06361040132271156, + "flos": 21396691140480.0, + "grad_norm": 46.064800770965434, + "language_loss": 0.89666426, + "learning_rate": 3.988162385021196e-06, + "loss": 0.91528553, + "num_input_tokens_seen": 22623760, + "router_z_loss_clip": 2.8203125, + "router_z_loss_mlp": 0.35620117, + "step": 1058, + "time_per_iteration": 2.6293692588806152 + }, + { + "auxiliary_loss_clip": 0.01519517, + "auxiliary_loss_mlp": 0.00388324, + "balance_loss_clip": 1.22846603, + "balance_loss_mlp": 0.34858, + "epoch": 0.06367052457537953, + "flos": 25733765894400.0, + "grad_norm": 63.72267193090609, + "language_loss": 0.94794762, + "learning_rate": 3.988120036328651e-06, + "loss": 0.96702611, + "num_input_tokens_seen": 22643000, + "router_z_loss_clip": 2.90820312, + "router_z_loss_mlp": 0.39746094, + "step": 1059, + "time_per_iteration": 2.7277355194091797 + }, + { + "auxiliary_loss_clip": 0.01536588, + "auxiliary_loss_mlp": 0.00394358, + "balance_loss_clip": 1.23985207, + "balance_loss_mlp": 0.35354123, + "epoch": 0.0637306478280475, + "flos": 17631420992640.0, + "grad_norm": 4.157186951909997, + "language_loss": 0.99300253, + "learning_rate": 3.988077612246394e-06, + "loss": 1.01231194, + "num_input_tokens_seen": 22660460, + "router_z_loss_clip": 2.96679688, + "router_z_loss_mlp": 0.40820312, + "step": 1060, + "time_per_iteration": 2.5862419605255127 + }, + { + "auxiliary_loss_clip": 0.01547165, + "auxiliary_loss_mlp": 0.00391442, + "balance_loss_clip": 1.24869335, + "balance_loss_mlp": 0.35200712, + "epoch": 0.06379077108071547, + "flos": 13662610427520.0, + "grad_norm": 5.584768817925065, + "language_loss": 0.94537163, + "learning_rate": 3.988035112776035e-06, + "loss": 0.96475774, + "num_input_tokens_seen": 22679270, + "router_z_loss_clip": 2.98242188, + "router_z_loss_mlp": 0.39453125, + "step": 1061, + "time_per_iteration": 2.6051387786865234 + }, + { + "auxiliary_loss_clip": 0.01543514, + "auxiliary_loss_mlp": 0.00372989, + "balance_loss_clip": 1.24258363, + "balance_loss_mlp": 0.33520007, + "epoch": 0.06385089433338344, + "flos": 28478849961600.0, + "grad_norm": 8.75335593505827, + "language_loss": 0.83288771, + "learning_rate": 3.987992537919185e-06, + "loss": 0.85205275, + "num_input_tokens_seen": 22699330, + "router_z_loss_clip": 3.0078125, + "router_z_loss_mlp": 0.37817383, + "step": 1062, + "time_per_iteration": 2.7441327571868896 + }, + { + "auxiliary_loss_clip": 0.01557501, + "auxiliary_loss_mlp": 0.00391459, + "balance_loss_clip": 1.25073242, + "balance_loss_mlp": 0.35161975, + "epoch": 0.0639110175860514, + "flos": 24311057028480.0, + "grad_norm": 9.362583243499625, + "language_loss": 0.91714388, + "learning_rate": 3.987949887677459e-06, + "loss": 0.93663347, + "num_input_tokens_seen": 22717945, + "router_z_loss_clip": 3.06640625, + "router_z_loss_mlp": 0.3984375, + "step": 1063, + "time_per_iteration": 2.642925500869751 + }, + { + "auxiliary_loss_clip": 0.01573474, + "auxiliary_loss_mlp": 0.00438398, + "balance_loss_clip": 1.26112819, + "balance_loss_mlp": 0.39202562, + "epoch": 0.06397114083871938, + "flos": 22090772620800.0, + "grad_norm": 9.789677258754391, + "language_loss": 0.85331023, + "learning_rate": 3.9879071620524744e-06, + "loss": 0.87342894, + "num_input_tokens_seen": 22736790, + "router_z_loss_clip": 3.12304688, + "router_z_loss_mlp": 0.46362305, + "step": 1064, + "time_per_iteration": 2.677478313446045 + }, + { + "auxiliary_loss_clip": 0.01557572, + "auxiliary_loss_mlp": 0.0039344, + "balance_loss_clip": 1.25735438, + "balance_loss_mlp": 0.35433954, + "epoch": 0.06403126409138735, + "flos": 19572824206080.0, + "grad_norm": 146.04552299016308, + "language_loss": 0.91371113, + "learning_rate": 3.987864361045851e-06, + "loss": 0.93322122, + "num_input_tokens_seen": 22754745, + "router_z_loss_clip": 3.00195312, + "router_z_loss_mlp": 0.39086914, + "step": 1065, + "time_per_iteration": 2.607443332672119 + }, + { + "auxiliary_loss_clip": 0.01552345, + "auxiliary_loss_mlp": 0.00403413, + "balance_loss_clip": 1.24839139, + "balance_loss_mlp": 0.36371672, + "epoch": 0.06409138734405531, + "flos": 40807413267840.0, + "grad_norm": 680.7253694394888, + "language_loss": 0.74215496, + "learning_rate": 3.987821484659211e-06, + "loss": 0.76171255, + "num_input_tokens_seen": 22776780, + "router_z_loss_clip": 3.0390625, + "router_z_loss_mlp": 0.3972168, + "step": 1066, + "time_per_iteration": 2.8125345706939697 + }, + { + "auxiliary_loss_clip": 0.01560783, + "auxiliary_loss_mlp": 0.0040573, + "balance_loss_clip": 1.25480652, + "balance_loss_mlp": 0.36901328, + "epoch": 0.06415151059672328, + "flos": 20441610460800.0, + "grad_norm": 14.980010759529012, + "language_loss": 0.98019511, + "learning_rate": 3.987778532894181e-06, + "loss": 0.99986023, + "num_input_tokens_seen": 22793915, + "router_z_loss_clip": 3.05859375, + "router_z_loss_mlp": 0.36694336, + "step": 1067, + "time_per_iteration": 2.602358102798462 + }, + { + "auxiliary_loss_clip": 0.015529, + "auxiliary_loss_mlp": 0.00401857, + "balance_loss_clip": 1.25268376, + "balance_loss_mlp": 0.36559349, + "epoch": 0.06421163384939126, + "flos": 18072045129600.0, + "grad_norm": 46.00123723636975, + "language_loss": 0.89616835, + "learning_rate": 3.987735505752391e-06, + "loss": 0.91571599, + "num_input_tokens_seen": 22812670, + "router_z_loss_clip": 3.00195312, + "router_z_loss_mlp": 0.36254883, + "step": 1068, + "time_per_iteration": 2.615971088409424 + }, + { + "auxiliary_loss_clip": 0.01549241, + "auxiliary_loss_mlp": 0.0038103, + "balance_loss_clip": 1.2496841, + "balance_loss_mlp": 0.34569657, + "epoch": 0.06427175710205922, + "flos": 25119442563840.0, + "grad_norm": 29.213615004749805, + "language_loss": 0.97126901, + "learning_rate": 3.987692403235471e-06, + "loss": 0.99057174, + "num_input_tokens_seen": 22832440, + "router_z_loss_clip": 2.99609375, + "router_z_loss_mlp": 0.35327148, + "step": 1069, + "time_per_iteration": 2.694183349609375 + }, + { + "auxiliary_loss_clip": 0.01563775, + "auxiliary_loss_mlp": 0.00416793, + "balance_loss_clip": 1.25250769, + "balance_loss_mlp": 0.37471163, + "epoch": 0.06433188035472719, + "flos": 17380549428480.0, + "grad_norm": 29.67592500752994, + "language_loss": 1.02379549, + "learning_rate": 3.987649225345056e-06, + "loss": 1.04360116, + "num_input_tokens_seen": 22845495, + "router_z_loss_clip": 3.11523438, + "router_z_loss_mlp": 0.42114258, + "step": 1070, + "time_per_iteration": 2.641355037689209 + }, + { + "auxiliary_loss_clip": 0.01567864, + "auxiliary_loss_mlp": 0.00439148, + "balance_loss_clip": 1.25787437, + "balance_loss_mlp": 0.39785361, + "epoch": 0.06439200360739517, + "flos": 23546267625600.0, + "grad_norm": 53.54771654045865, + "language_loss": 0.92173779, + "learning_rate": 3.987605972082782e-06, + "loss": 0.94180787, + "num_input_tokens_seen": 22865390, + "router_z_loss_clip": 3.09960938, + "router_z_loss_mlp": 0.4128418, + "step": 1071, + "time_per_iteration": 2.8051066398620605 + }, + { + "auxiliary_loss_clip": 0.0155171, + "auxiliary_loss_mlp": 0.00379936, + "balance_loss_clip": 1.24605525, + "balance_loss_mlp": 0.34467351, + "epoch": 0.06445212686006313, + "flos": 21979772616960.0, + "grad_norm": 8.067639148397818, + "language_loss": 0.82165742, + "learning_rate": 3.987562643450292e-06, + "loss": 0.84097385, + "num_input_tokens_seen": 22885495, + "router_z_loss_clip": 3.0546875, + "router_z_loss_mlp": 0.35253906, + "step": 1072, + "time_per_iteration": 2.6741156578063965 + }, + { + "auxiliary_loss_clip": 0.01556527, + "auxiliary_loss_mlp": 0.00418543, + "balance_loss_clip": 1.24563742, + "balance_loss_mlp": 0.38106364, + "epoch": 0.0645122501127311, + "flos": 25921291824000.0, + "grad_norm": 40.777536209979765, + "language_loss": 0.8974393, + "learning_rate": 3.987519239449226e-06, + "loss": 0.91719002, + "num_input_tokens_seen": 22904845, + "router_z_loss_clip": 3.109375, + "router_z_loss_mlp": 0.37475586, + "step": 1073, + "time_per_iteration": 2.6493942737579346 + }, + { + "auxiliary_loss_clip": 0.01558547, + "auxiliary_loss_mlp": 0.00450251, + "balance_loss_clip": 1.25615835, + "balance_loss_mlp": 0.41057837, + "epoch": 0.06457237336539907, + "flos": 25626034028160.0, + "grad_norm": 87.85068016510577, + "language_loss": 0.85579813, + "learning_rate": 3.987475760081233e-06, + "loss": 0.87588608, + "num_input_tokens_seen": 22925940, + "router_z_loss_clip": 3.02539062, + "router_z_loss_mlp": 0.39672852, + "step": 1074, + "time_per_iteration": 5.51121711730957 + }, + { + "auxiliary_loss_clip": 0.01547752, + "auxiliary_loss_mlp": 0.0039766, + "balance_loss_clip": 1.24638975, + "balance_loss_mlp": 0.36206406, + "epoch": 0.06463249661806704, + "flos": 19463979018240.0, + "grad_norm": 76.75042737376367, + "language_loss": 0.86686063, + "learning_rate": 3.987432205347958e-06, + "loss": 0.88631481, + "num_input_tokens_seen": 22944375, + "router_z_loss_clip": 3.00976562, + "router_z_loss_mlp": 0.35595703, + "step": 1075, + "time_per_iteration": 2.613860607147217 + }, + { + "auxiliary_loss_clip": 0.01546384, + "auxiliary_loss_mlp": 0.00396056, + "balance_loss_clip": 1.24677181, + "balance_loss_mlp": 0.36153337, + "epoch": 0.064692619870735, + "flos": 24498044254080.0, + "grad_norm": 26.22063402936528, + "language_loss": 0.96182168, + "learning_rate": 3.987388575251055e-06, + "loss": 0.98124605, + "num_input_tokens_seen": 22959145, + "router_z_loss_clip": 2.99609375, + "router_z_loss_mlp": 0.34545898, + "step": 1076, + "time_per_iteration": 4.071648120880127 + }, + { + "auxiliary_loss_clip": 0.015302, + "auxiliary_loss_mlp": 0.00435003, + "balance_loss_clip": 1.2339946, + "balance_loss_mlp": 0.39685613, + "epoch": 0.06475274312340297, + "flos": 17018677860480.0, + "grad_norm": 210.6407695580567, + "language_loss": 0.88593298, + "learning_rate": 3.98734486979218e-06, + "loss": 0.90558499, + "num_input_tokens_seen": 22978100, + "router_z_loss_clip": 2.96289062, + "router_z_loss_mlp": 0.38110352, + "step": 1077, + "time_per_iteration": 2.6218690872192383 + }, + { + "auxiliary_loss_clip": 0.01544744, + "auxiliary_loss_mlp": 0.0045729, + "balance_loss_clip": 1.23875308, + "balance_loss_mlp": 0.41334954, + "epoch": 0.06481286637607095, + "flos": 24572379450240.0, + "grad_norm": 12.315166116433852, + "language_loss": 0.98389566, + "learning_rate": 3.987301088972986e-06, + "loss": 1.00391603, + "num_input_tokens_seen": 22997285, + "router_z_loss_clip": 3.06054688, + "router_z_loss_mlp": 0.43920898, + "step": 1078, + "time_per_iteration": 4.145873069763184 + }, + { + "auxiliary_loss_clip": 0.01541123, + "auxiliary_loss_mlp": 0.00460134, + "balance_loss_clip": 1.22873521, + "balance_loss_mlp": 0.42050847, + "epoch": 0.06487298962873891, + "flos": 21105635235840.0, + "grad_norm": 29.602996997084748, + "language_loss": 0.86135066, + "learning_rate": 3.987257232795137e-06, + "loss": 0.88136321, + "num_input_tokens_seen": 23016285, + "router_z_loss_clip": 3.125, + "router_z_loss_mlp": 0.39648438, + "step": 1079, + "time_per_iteration": 2.6874167919158936 + }, + { + "auxiliary_loss_clip": 0.01508993, + "auxiliary_loss_mlp": 0.00445486, + "balance_loss_clip": 1.21067691, + "balance_loss_mlp": 0.40698129, + "epoch": 0.06493311288140688, + "flos": 24608182331520.0, + "grad_norm": 43.58859169042818, + "language_loss": 0.76935792, + "learning_rate": 3.987213301260294e-06, + "loss": 0.7889027, + "num_input_tokens_seen": 23036420, + "router_z_loss_clip": 2.98242188, + "router_z_loss_mlp": 0.38500977, + "step": 1080, + "time_per_iteration": 2.7185263633728027 + }, + { + "auxiliary_loss_clip": 0.01502675, + "auxiliary_loss_mlp": 0.00430247, + "balance_loss_clip": 1.20162296, + "balance_loss_mlp": 0.39097995, + "epoch": 0.06499323613407486, + "flos": 25337994865920.0, + "grad_norm": 242.99721186075243, + "language_loss": 0.80292755, + "learning_rate": 3.987169294370123e-06, + "loss": 0.8222568, + "num_input_tokens_seen": 23056945, + "router_z_loss_clip": 3.00976562, + "router_z_loss_mlp": 0.39282227, + "step": 1081, + "time_per_iteration": 2.688467502593994 + }, + { + "auxiliary_loss_clip": 0.01500189, + "auxiliary_loss_mlp": 0.00433031, + "balance_loss_clip": 1.20405936, + "balance_loss_mlp": 0.39683947, + "epoch": 0.06505335938674282, + "flos": 20375714960640.0, + "grad_norm": 188.8150167244038, + "language_loss": 0.92806673, + "learning_rate": 3.987125212126294e-06, + "loss": 0.9473989, + "num_input_tokens_seen": 23074940, + "router_z_loss_clip": 2.95898438, + "router_z_loss_mlp": 0.36206055, + "step": 1082, + "time_per_iteration": 2.8112616539001465 + }, + { + "auxiliary_loss_clip": 0.01516944, + "auxiliary_loss_mlp": 0.00464151, + "balance_loss_clip": 1.20826054, + "balance_loss_mlp": 0.42493162, + "epoch": 0.06511348263941079, + "flos": 25337923038720.0, + "grad_norm": 11.397109176745477, + "language_loss": 0.91180778, + "learning_rate": 3.987081054530478e-06, + "loss": 0.93161875, + "num_input_tokens_seen": 23093420, + "router_z_loss_clip": 3.08789062, + "router_z_loss_mlp": 0.39208984, + "step": 1083, + "time_per_iteration": 2.6949079036712646 + }, + { + "auxiliary_loss_clip": 0.01517887, + "auxiliary_loss_mlp": 0.00469492, + "balance_loss_clip": 1.21159422, + "balance_loss_mlp": 0.42974758, + "epoch": 0.06517360589207877, + "flos": 20332801186560.0, + "grad_norm": 3.758280459939653, + "language_loss": 0.87138993, + "learning_rate": 3.987036821584348e-06, + "loss": 0.89126366, + "num_input_tokens_seen": 23111550, + "router_z_loss_clip": 3.06054688, + "router_z_loss_mlp": 0.39746094, + "step": 1084, + "time_per_iteration": 2.621683359146118 + }, + { + "auxiliary_loss_clip": 0.01514992, + "auxiliary_loss_mlp": 0.00496219, + "balance_loss_clip": 1.20840478, + "balance_loss_mlp": 0.456236, + "epoch": 0.06523372914474673, + "flos": 31681650061440.0, + "grad_norm": 35.04331138540899, + "language_loss": 0.75052845, + "learning_rate": 3.986992513289584e-06, + "loss": 0.77064061, + "num_input_tokens_seen": 23130335, + "router_z_loss_clip": 3.06445312, + "router_z_loss_mlp": 0.3996582, + "step": 1085, + "time_per_iteration": 2.718093156814575 + }, + { + "auxiliary_loss_clip": 0.01503572, + "auxiliary_loss_mlp": 0.00442939, + "balance_loss_clip": 1.20146847, + "balance_loss_mlp": 0.40765265, + "epoch": 0.0652938523974147, + "flos": 20778165918720.0, + "grad_norm": 14.582494034499238, + "language_loss": 0.82788557, + "learning_rate": 3.9869481296478645e-06, + "loss": 0.84735072, + "num_input_tokens_seen": 23152380, + "router_z_loss_clip": 3.02148438, + "router_z_loss_mlp": 0.35253906, + "step": 1086, + "time_per_iteration": 2.6772780418395996 + }, + { + "auxiliary_loss_clip": 0.01511313, + "auxiliary_loss_mlp": 0.00441418, + "balance_loss_clip": 1.20503306, + "balance_loss_mlp": 0.40532196, + "epoch": 0.06535397565008266, + "flos": 16690993061760.0, + "grad_norm": 36.656422257372256, + "language_loss": 0.90638804, + "learning_rate": 3.986903670660872e-06, + "loss": 0.92591536, + "num_input_tokens_seen": 23171630, + "router_z_loss_clip": 3.06054688, + "router_z_loss_mlp": 0.36108398, + "step": 1087, + "time_per_iteration": 2.585280418395996 + }, + { + "auxiliary_loss_clip": 0.01511202, + "auxiliary_loss_mlp": 0.00464397, + "balance_loss_clip": 1.20263517, + "balance_loss_mlp": 0.42646471, + "epoch": 0.06541409890275064, + "flos": 26868220116480.0, + "grad_norm": 5.206591851545119, + "language_loss": 0.85310954, + "learning_rate": 3.9868591363302945e-06, + "loss": 0.87286556, + "num_input_tokens_seen": 23192520, + "router_z_loss_clip": 3.08984375, + "router_z_loss_mlp": 0.37915039, + "step": 1088, + "time_per_iteration": 2.719813346862793 + }, + { + "auxiliary_loss_clip": 0.01533252, + "auxiliary_loss_mlp": 0.00442714, + "balance_loss_clip": 1.21480274, + "balance_loss_mlp": 0.40690309, + "epoch": 0.06547422215541861, + "flos": 20521620005760.0, + "grad_norm": 31.964932527850305, + "language_loss": 0.79606891, + "learning_rate": 3.9868145266578186e-06, + "loss": 0.81582856, + "num_input_tokens_seen": 23210710, + "router_z_loss_clip": 3.18359375, + "router_z_loss_mlp": 0.35791016, + "step": 1089, + "time_per_iteration": 2.6042380332946777 + }, + { + "auxiliary_loss_clip": 0.01507508, + "auxiliary_loss_mlp": 0.00473291, + "balance_loss_clip": 1.19788396, + "balance_loss_mlp": 0.43831497, + "epoch": 0.06553434540808657, + "flos": 22016616992640.0, + "grad_norm": 78135.55625499117, + "language_loss": 0.90796828, + "learning_rate": 3.9867698416451366e-06, + "loss": 0.92777622, + "num_input_tokens_seen": 23230305, + "router_z_loss_clip": 3.09570312, + "router_z_loss_mlp": 0.34985352, + "step": 1090, + "time_per_iteration": 2.6337783336639404 + }, + { + "auxiliary_loss_clip": 0.01496927, + "auxiliary_loss_mlp": 0.00417726, + "balance_loss_clip": 1.18738317, + "balance_loss_mlp": 0.38432378, + "epoch": 0.06559446866075455, + "flos": 24608649208320.0, + "grad_norm": 3.7839025941876936, + "language_loss": 0.80294907, + "learning_rate": 3.9867250812939434e-06, + "loss": 0.82209563, + "num_input_tokens_seen": 23249015, + "router_z_loss_clip": 3.09570312, + "router_z_loss_mlp": 0.33398438, + "step": 1091, + "time_per_iteration": 2.6386218070983887 + }, + { + "auxiliary_loss_clip": 0.01527811, + "auxiliary_loss_mlp": 0.00429907, + "balance_loss_clip": 1.20664191, + "balance_loss_mlp": 0.39340517, + "epoch": 0.06565459191342252, + "flos": 24274679529600.0, + "grad_norm": 85.63250149530734, + "language_loss": 0.88802528, + "learning_rate": 3.986680245605936e-06, + "loss": 0.90760243, + "num_input_tokens_seen": 23265105, + "router_z_loss_clip": 3.20898438, + "router_z_loss_mlp": 0.36499023, + "step": 1092, + "time_per_iteration": 2.7093825340270996 + }, + { + "auxiliary_loss_clip": 0.01530232, + "auxiliary_loss_mlp": 0.00431994, + "balance_loss_clip": 1.20471907, + "balance_loss_mlp": 0.39606392, + "epoch": 0.06571471516609048, + "flos": 24787124910720.0, + "grad_norm": 5.569353587638561, + "language_loss": 0.78017324, + "learning_rate": 3.986635334582814e-06, + "loss": 0.79979551, + "num_input_tokens_seen": 23283950, + "router_z_loss_clip": 3.25390625, + "router_z_loss_mlp": 0.35888672, + "step": 1093, + "time_per_iteration": 2.648963451385498 + }, + { + "auxiliary_loss_clip": 0.01530819, + "auxiliary_loss_mlp": 0.00467103, + "balance_loss_clip": 1.20848739, + "balance_loss_mlp": 0.43205568, + "epoch": 0.06577483841875846, + "flos": 26214071581440.0, + "grad_norm": 10.472997528471563, + "language_loss": 0.9358511, + "learning_rate": 3.986590348226282e-06, + "loss": 0.95583034, + "num_input_tokens_seen": 23305005, + "router_z_loss_clip": 3.22265625, + "router_z_loss_mlp": 0.3503418, + "step": 1094, + "time_per_iteration": 2.7091903686523438 + }, + { + "auxiliary_loss_clip": 0.01542966, + "auxiliary_loss_mlp": 0.00427117, + "balance_loss_clip": 1.21122992, + "balance_loss_mlp": 0.39011449, + "epoch": 0.06583496167142643, + "flos": 25080802508160.0, + "grad_norm": 4.432508296085686, + "language_loss": 0.8707093, + "learning_rate": 3.986545286538044e-06, + "loss": 0.89041013, + "num_input_tokens_seen": 23323220, + "router_z_loss_clip": 3.31640625, + "router_z_loss_mlp": 0.36962891, + "step": 1095, + "time_per_iteration": 2.7017223834991455 + }, + { + "auxiliary_loss_clip": 0.01545124, + "auxiliary_loss_mlp": 0.00380493, + "balance_loss_clip": 1.2118094, + "balance_loss_mlp": 0.34680438, + "epoch": 0.06589508492409439, + "flos": 25629804956160.0, + "grad_norm": 42.43572473162818, + "language_loss": 0.81397593, + "learning_rate": 3.986500149519811e-06, + "loss": 0.83323205, + "num_input_tokens_seen": 23342235, + "router_z_loss_clip": 3.33398438, + "router_z_loss_mlp": 0.33666992, + "step": 1096, + "time_per_iteration": 2.7311806678771973 + }, + { + "auxiliary_loss_clip": 0.0155012, + "auxiliary_loss_mlp": 0.00381489, + "balance_loss_clip": 1.2126838, + "balance_loss_mlp": 0.34963635, + "epoch": 0.06595520817676236, + "flos": 23621249266560.0, + "grad_norm": 14.825915176056803, + "language_loss": 0.83390749, + "learning_rate": 3.986454937173292e-06, + "loss": 0.85322356, + "num_input_tokens_seen": 23363680, + "router_z_loss_clip": 3.37304688, + "router_z_loss_mlp": 0.31835938, + "step": 1097, + "time_per_iteration": 2.66382098197937 + }, + { + "auxiliary_loss_clip": 0.015903, + "auxiliary_loss_mlp": 0.00422726, + "balance_loss_clip": 1.2359128, + "balance_loss_mlp": 0.3859143, + "epoch": 0.06601533142943034, + "flos": 33801708545280.0, + "grad_norm": 6.196072147382637, + "language_loss": 0.85043728, + "learning_rate": 3.986409649500203e-06, + "loss": 0.87056744, + "num_input_tokens_seen": 23385590, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 0.3684082, + "step": 1098, + "time_per_iteration": 2.7350857257843018 + }, + { + "auxiliary_loss_clip": 0.01579616, + "auxiliary_loss_mlp": 0.00408974, + "balance_loss_clip": 1.23240757, + "balance_loss_mlp": 0.37323463, + "epoch": 0.0660754546820983, + "flos": 20259184262400.0, + "grad_norm": 184.51648028747067, + "language_loss": 0.86960495, + "learning_rate": 3.986364286502261e-06, + "loss": 0.88949084, + "num_input_tokens_seen": 23402945, + "router_z_loss_clip": 3.4765625, + "router_z_loss_mlp": 0.35742188, + "step": 1099, + "time_per_iteration": 2.6432154178619385 + }, + { + "auxiliary_loss_clip": 0.01578707, + "auxiliary_loss_mlp": 0.00394742, + "balance_loss_clip": 1.2310977, + "balance_loss_mlp": 0.36241212, + "epoch": 0.06613557793476627, + "flos": 19354164163200.0, + "grad_norm": 9.299121306933175, + "language_loss": 0.91285431, + "learning_rate": 3.986318848181186e-06, + "loss": 0.93258882, + "num_input_tokens_seen": 23421410, + "router_z_loss_clip": 3.47851562, + "router_z_loss_mlp": 0.32324219, + "step": 1100, + "time_per_iteration": 2.6162126064300537 + }, + { + "auxiliary_loss_clip": 0.01622129, + "auxiliary_loss_mlp": 0.00405581, + "balance_loss_clip": 1.25428033, + "balance_loss_mlp": 0.36962759, + "epoch": 0.06619570118743424, + "flos": 13772568936960.0, + "grad_norm": 840.4603302899762, + "language_loss": 0.81159019, + "learning_rate": 3.986273334538702e-06, + "loss": 0.83186728, + "num_input_tokens_seen": 23438870, + "router_z_loss_clip": 3.6796875, + "router_z_loss_mlp": 0.35961914, + "step": 1101, + "time_per_iteration": 2.642754316329956 + }, + { + "auxiliary_loss_clip": 0.01622071, + "auxiliary_loss_mlp": 0.0039952, + "balance_loss_clip": 1.25738811, + "balance_loss_mlp": 0.36368519, + "epoch": 0.06625582444010221, + "flos": 17857874286720.0, + "grad_norm": 17.843400776548947, + "language_loss": 0.9459098, + "learning_rate": 3.986227745576533e-06, + "loss": 0.96612567, + "num_input_tokens_seen": 23456975, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 0.3581543, + "step": 1102, + "time_per_iteration": 2.586280345916748 + }, + { + "auxiliary_loss_clip": 0.01640467, + "auxiliary_loss_mlp": 0.00397432, + "balance_loss_clip": 1.26550233, + "balance_loss_mlp": 0.36212233, + "epoch": 0.06631594769277017, + "flos": 11838707579520.0, + "grad_norm": 43.899597911879226, + "language_loss": 0.91727132, + "learning_rate": 3.98618208129641e-06, + "loss": 0.93765026, + "num_input_tokens_seen": 23473440, + "router_z_loss_clip": 3.75, + "router_z_loss_mlp": 0.35327148, + "step": 1103, + "time_per_iteration": 2.6517715454101562 + }, + { + "auxiliary_loss_clip": 0.01649206, + "auxiliary_loss_mlp": 0.00361247, + "balance_loss_clip": 1.27190113, + "balance_loss_mlp": 0.32705826, + "epoch": 0.06637607094543815, + "flos": 19793351756160.0, + "grad_norm": 6.779525427558469, + "language_loss": 0.87057012, + "learning_rate": 3.986136341700063e-06, + "loss": 0.89067471, + "num_input_tokens_seen": 23493880, + "router_z_loss_clip": 3.76953125, + "router_z_loss_mlp": 0.34204102, + "step": 1104, + "time_per_iteration": 2.62314510345459 + }, + { + "auxiliary_loss_clip": 0.01649048, + "auxiliary_loss_mlp": 0.00358936, + "balance_loss_clip": 1.27057528, + "balance_loss_mlp": 0.32500857, + "epoch": 0.06643619419810612, + "flos": 25485659677440.0, + "grad_norm": 27.93691968056262, + "language_loss": 0.85043311, + "learning_rate": 3.986090526789227e-06, + "loss": 0.87051296, + "num_input_tokens_seen": 23514920, + "router_z_loss_clip": 3.77929688, + "router_z_loss_mlp": 0.33959961, + "step": 1105, + "time_per_iteration": 2.6693952083587646 + }, + { + "auxiliary_loss_clip": 0.01653518, + "auxiliary_loss_mlp": 0.0031932, + "balance_loss_clip": 1.26982379, + "balance_loss_mlp": 0.2877053, + "epoch": 0.06649631745077408, + "flos": 16946533393920.0, + "grad_norm": 2.8522276632720027, + "language_loss": 1.02287722, + "learning_rate": 3.986044636565639e-06, + "loss": 1.04260564, + "num_input_tokens_seen": 23531635, + "router_z_loss_clip": 3.83398438, + "router_z_loss_mlp": 0.31640625, + "step": 1106, + "time_per_iteration": 2.564732789993286 + }, + { + "auxiliary_loss_clip": 0.01658518, + "auxiliary_loss_mlp": 0.00340788, + "balance_loss_clip": 1.27290928, + "balance_loss_mlp": 0.30540663, + "epoch": 0.06655644070344206, + "flos": 17858592558720.0, + "grad_norm": 35.325263204974235, + "language_loss": 0.88242, + "learning_rate": 3.985998671031039e-06, + "loss": 0.90241301, + "num_input_tokens_seen": 23551020, + "router_z_loss_clip": 3.85546875, + "router_z_loss_mlp": 0.35375977, + "step": 1107, + "time_per_iteration": 2.6067376136779785 + }, + { + "auxiliary_loss_clip": 0.01442802, + "auxiliary_loss_mlp": 0.00194826, + "balance_loss_clip": 1.15568638, + "balance_loss_mlp": 0.17451292, + "epoch": 0.06661656395611003, + "flos": 61419350021760.0, + "grad_norm": 0.8347110073290563, + "language_loss": 0.56689417, + "learning_rate": 3.9859526301871705e-06, + "loss": 0.58327043, + "num_input_tokens_seen": 23610675, + "router_z_loss_clip": 2.875, + "router_z_loss_mlp": 0.203125, + "step": 1108, + "time_per_iteration": 3.0873939990997314 + }, + { + "auxiliary_loss_clip": 0.01642278, + "auxiliary_loss_mlp": 0.00353747, + "balance_loss_clip": 1.2612282, + "balance_loss_mlp": 0.3184377, + "epoch": 0.066676687208778, + "flos": 20662856282880.0, + "grad_norm": 54.703890366498094, + "language_loss": 0.77887332, + "learning_rate": 3.9859065140357795e-06, + "loss": 0.79883361, + "num_input_tokens_seen": 23628710, + "router_z_loss_clip": 3.8125, + "router_z_loss_mlp": 0.35302734, + "step": 1109, + "time_per_iteration": 2.6344878673553467 + }, + { + "auxiliary_loss_clip": 0.01631735, + "auxiliary_loss_mlp": 0.00349072, + "balance_loss_clip": 1.26157081, + "balance_loss_mlp": 0.31526428, + "epoch": 0.06673681046144596, + "flos": 20923280864640.0, + "grad_norm": 3.4030065813555805, + "language_loss": 0.84524399, + "learning_rate": 3.985860322578614e-06, + "loss": 0.86505204, + "num_input_tokens_seen": 23649160, + "router_z_loss_clip": 3.703125, + "router_z_loss_mlp": 0.33764648, + "step": 1110, + "time_per_iteration": 2.622086763381958 + }, + { + "auxiliary_loss_clip": 0.01622198, + "auxiliary_loss_mlp": 0.00337702, + "balance_loss_clip": 1.25308657, + "balance_loss_mlp": 0.30162913, + "epoch": 0.06679693371411394, + "flos": 31065818359680.0, + "grad_norm": 18.23724618390184, + "language_loss": 0.80302697, + "learning_rate": 3.985814055817427e-06, + "loss": 0.82262599, + "num_input_tokens_seen": 23671995, + "router_z_loss_clip": 3.6953125, + "router_z_loss_mlp": 0.36083984, + "step": 1111, + "time_per_iteration": 2.709742307662964 + }, + { + "auxiliary_loss_clip": 0.01608889, + "auxiliary_loss_mlp": 0.0033618, + "balance_loss_clip": 1.24935901, + "balance_loss_mlp": 0.30206278, + "epoch": 0.0668570569667819, + "flos": 21726135705600.0, + "grad_norm": 25.85640391990921, + "language_loss": 0.86714983, + "learning_rate": 3.985767713753971e-06, + "loss": 0.88660055, + "num_input_tokens_seen": 23690705, + "router_z_loss_clip": 3.59570312, + "router_z_loss_mlp": 0.34106445, + "step": 1112, + "time_per_iteration": 2.641934394836426 + }, + { + "auxiliary_loss_clip": 0.01598814, + "auxiliary_loss_mlp": 0.00314776, + "balance_loss_clip": 1.24116898, + "balance_loss_mlp": 0.28189793, + "epoch": 0.06691718021944987, + "flos": 22747255539840.0, + "grad_norm": 104.3728604204608, + "language_loss": 0.88341606, + "learning_rate": 3.985721296390005e-06, + "loss": 0.90255189, + "num_input_tokens_seen": 23709990, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 0.32861328, + "step": 1113, + "time_per_iteration": 2.618006706237793 + }, + { + "auxiliary_loss_clip": 0.01571733, + "auxiliary_loss_mlp": 0.00315928, + "balance_loss_clip": 1.22871113, + "balance_loss_mlp": 0.28436148, + "epoch": 0.06697730347211785, + "flos": 16545626720640.0, + "grad_norm": 11.516511632202697, + "language_loss": 0.8987301, + "learning_rate": 3.985674803727289e-06, + "loss": 0.91760671, + "num_input_tokens_seen": 23728485, + "router_z_loss_clip": 3.4296875, + "router_z_loss_mlp": 0.31591797, + "step": 1114, + "time_per_iteration": 2.6031835079193115 + }, + { + "auxiliary_loss_clip": 0.01472119, + "auxiliary_loss_mlp": 0.00238236, + "balance_loss_clip": 1.22960997, + "balance_loss_mlp": 0.22173743, + "epoch": 0.06703742672478581, + "flos": 59782326658560.0, + "grad_norm": 0.854563349565955, + "language_loss": 0.58195043, + "learning_rate": 3.985628235767584e-06, + "loss": 0.59905398, + "num_input_tokens_seen": 23786650, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.16503906, + "step": 1115, + "time_per_iteration": 3.1172804832458496 + }, + { + "auxiliary_loss_clip": 0.01569155, + "auxiliary_loss_mlp": 0.00330821, + "balance_loss_clip": 1.22810304, + "balance_loss_mlp": 0.2982291, + "epoch": 0.06709754997745378, + "flos": 16800197385600.0, + "grad_norm": 3.9785170330059443, + "language_loss": 0.97796261, + "learning_rate": 3.985581592512658e-06, + "loss": 0.99696237, + "num_input_tokens_seen": 23802555, + "router_z_loss_clip": 3.41015625, + "router_z_loss_mlp": 0.32568359, + "step": 1116, + "time_per_iteration": 5.490621089935303 + }, + { + "auxiliary_loss_clip": 0.0157432, + "auxiliary_loss_mlp": 0.00338017, + "balance_loss_clip": 1.2318356, + "balance_loss_mlp": 0.30175358, + "epoch": 0.06715767323012176, + "flos": 22123917895680.0, + "grad_norm": 74.69648050434692, + "language_loss": 0.93750024, + "learning_rate": 3.985534873964279e-06, + "loss": 0.95662367, + "num_input_tokens_seen": 23822945, + "router_z_loss_clip": 3.421875, + "router_z_loss_mlp": 0.36230469, + "step": 1117, + "time_per_iteration": 2.6785333156585693 + }, + { + "auxiliary_loss_clip": 0.01386377, + "auxiliary_loss_mlp": 0.00197589, + "balance_loss_clip": 1.16091776, + "balance_loss_mlp": 0.17956434, + "epoch": 0.06721779648278972, + "flos": 66618100137600.0, + "grad_norm": 0.8398369303680734, + "language_loss": 0.59745502, + "learning_rate": 3.985488080124218e-06, + "loss": 0.61329472, + "num_input_tokens_seen": 23874075, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.18066406, + "step": 1118, + "time_per_iteration": 4.494428396224976 + }, + { + "auxiliary_loss_clip": 0.01569084, + "auxiliary_loss_mlp": 0.00325188, + "balance_loss_clip": 1.23355997, + "balance_loss_mlp": 0.28983086, + "epoch": 0.06727791973545769, + "flos": 22382474970240.0, + "grad_norm": 6.547349853400643, + "language_loss": 0.94161958, + "learning_rate": 3.985441210994251e-06, + "loss": 0.96056235, + "num_input_tokens_seen": 23889720, + "router_z_loss_clip": 3.35546875, + "router_z_loss_mlp": 0.35351562, + "step": 1119, + "time_per_iteration": 2.605325937271118 + }, + { + "auxiliary_loss_clip": 0.01551582, + "auxiliary_loss_mlp": 0.00319682, + "balance_loss_clip": 1.22145998, + "balance_loss_mlp": 0.28706586, + "epoch": 0.06733804298812565, + "flos": 24280210224000.0, + "grad_norm": 19.742784783764762, + "language_loss": 0.91820306, + "learning_rate": 3.9853942665761545e-06, + "loss": 0.9369157, + "num_input_tokens_seen": 23909385, + "router_z_loss_clip": 3.296875, + "router_z_loss_mlp": 0.32617188, + "step": 1120, + "time_per_iteration": 4.112436771392822 + }, + { + "auxiliary_loss_clip": 0.01542665, + "auxiliary_loss_mlp": 0.00305823, + "balance_loss_clip": 1.21776271, + "balance_loss_mlp": 0.27256316, + "epoch": 0.06739816624079363, + "flos": 15918230839680.0, + "grad_norm": 17.317694518391747, + "language_loss": 0.84988981, + "learning_rate": 3.985347246871708e-06, + "loss": 0.86837465, + "num_input_tokens_seen": 23926830, + "router_z_loss_clip": 3.24609375, + "router_z_loss_mlp": 0.33227539, + "step": 1121, + "time_per_iteration": 2.6816940307617188 + }, + { + "auxiliary_loss_clip": 0.0147589, + "auxiliary_loss_mlp": 0.00159378, + "balance_loss_clip": 1.24019074, + "balance_loss_mlp": 0.14221199, + "epoch": 0.0674582894934616, + "flos": 71398567353600.0, + "grad_norm": 0.9420820738565313, + "language_loss": 0.58228141, + "learning_rate": 3.985300151882694e-06, + "loss": 0.59863406, + "num_input_tokens_seen": 23992640, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.171875, + "step": 1122, + "time_per_iteration": 3.301090955734253 + }, + { + "auxiliary_loss_clip": 0.01541368, + "auxiliary_loss_mlp": 0.00298738, + "balance_loss_clip": 1.21444511, + "balance_loss_mlp": 0.26834011, + "epoch": 0.06751841274612956, + "flos": 25264952559360.0, + "grad_norm": 6.7434611826683994, + "language_loss": 0.79904532, + "learning_rate": 3.985252981610901e-06, + "loss": 0.81744641, + "num_input_tokens_seen": 24011135, + "router_z_loss_clip": 3.26953125, + "router_z_loss_mlp": 0.30371094, + "step": 1123, + "time_per_iteration": 2.6947760581970215 + }, + { + "auxiliary_loss_clip": 0.01544659, + "auxiliary_loss_mlp": 0.00341201, + "balance_loss_clip": 1.21795201, + "balance_loss_mlp": 0.30250543, + "epoch": 0.06757853599879754, + "flos": 23802741711360.0, + "grad_norm": 7.983247594063329, + "language_loss": 0.86740446, + "learning_rate": 3.985205736058114e-06, + "loss": 0.88626307, + "num_input_tokens_seen": 24030695, + "router_z_loss_clip": 3.26953125, + "router_z_loss_mlp": 0.38696289, + "step": 1124, + "time_per_iteration": 2.680556535720825 + }, + { + "auxiliary_loss_clip": 0.01531511, + "auxiliary_loss_mlp": 0.00284997, + "balance_loss_clip": 1.2098726, + "balance_loss_mlp": 0.2545509, + "epoch": 0.0676386592514655, + "flos": 21033742164480.0, + "grad_norm": 53.99835329944499, + "language_loss": 0.79650009, + "learning_rate": 3.985158415226128e-06, + "loss": 0.8146652, + "num_input_tokens_seen": 24050680, + "router_z_loss_clip": 3.21679688, + "router_z_loss_mlp": 0.30456543, + "step": 1125, + "time_per_iteration": 2.797760009765625 + }, + { + "auxiliary_loss_clip": 0.0153063, + "auxiliary_loss_mlp": 0.00324624, + "balance_loss_clip": 1.21017039, + "balance_loss_mlp": 0.28764489, + "epoch": 0.06769878250413347, + "flos": 25556331686400.0, + "grad_norm": 48.68996196504871, + "language_loss": 0.89216411, + "learning_rate": 3.985111019116736e-06, + "loss": 0.91071671, + "num_input_tokens_seen": 24067205, + "router_z_loss_clip": 3.203125, + "router_z_loss_mlp": 0.36987305, + "step": 1126, + "time_per_iteration": 2.6467361450195312 + }, + { + "auxiliary_loss_clip": 0.0147831, + "auxiliary_loss_mlp": 0.00185745, + "balance_loss_clip": 1.23904741, + "balance_loss_mlp": 0.17325151, + "epoch": 0.06775890575680145, + "flos": 70655251305600.0, + "grad_norm": 0.781343461824998, + "language_loss": 0.59978271, + "learning_rate": 3.985063547731735e-06, + "loss": 0.61642325, + "num_input_tokens_seen": 24131320, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.125, + "step": 1127, + "time_per_iteration": 3.144613027572632 + }, + { + "auxiliary_loss_clip": 0.01546235, + "auxiliary_loss_mlp": 0.00348587, + "balance_loss_clip": 1.22098255, + "balance_loss_mlp": 0.31232375, + "epoch": 0.06781902900946941, + "flos": 24235500769920.0, + "grad_norm": 8.940820543093892, + "language_loss": 0.87959445, + "learning_rate": 3.985016001072925e-06, + "loss": 0.89854264, + "num_input_tokens_seen": 24149930, + "router_z_loss_clip": 3.25585938, + "router_z_loss_mlp": 0.36279297, + "step": 1128, + "time_per_iteration": 2.7251129150390625 + }, + { + "auxiliary_loss_clip": 0.01569021, + "auxiliary_loss_mlp": 0.00395362, + "balance_loss_clip": 1.23742604, + "balance_loss_mlp": 0.35471135, + "epoch": 0.06787915226213738, + "flos": 22417523665920.0, + "grad_norm": 42.37293701502774, + "language_loss": 0.81682754, + "learning_rate": 3.984968379142109e-06, + "loss": 0.83647132, + "num_input_tokens_seen": 24169590, + "router_z_loss_clip": 3.3125, + "router_z_loss_mlp": 0.40637207, + "step": 1129, + "time_per_iteration": 2.662787914276123 + }, + { + "auxiliary_loss_clip": 0.01562279, + "auxiliary_loss_mlp": 0.00410292, + "balance_loss_clip": 1.23244727, + "balance_loss_mlp": 0.37002271, + "epoch": 0.06793927551480534, + "flos": 37706922080640.0, + "grad_norm": 309.0301604210543, + "language_loss": 0.79519355, + "learning_rate": 3.984920681941094e-06, + "loss": 0.81491929, + "num_input_tokens_seen": 24189965, + "router_z_loss_clip": 3.29882812, + "router_z_loss_mlp": 0.40258789, + "step": 1130, + "time_per_iteration": 2.780775308609009 + }, + { + "auxiliary_loss_clip": 0.01554943, + "auxiliary_loss_mlp": 0.00421782, + "balance_loss_clip": 1.22690511, + "balance_loss_mlp": 0.38403994, + "epoch": 0.06799939876747332, + "flos": 20631398947200.0, + "grad_norm": 3.0056273591166858, + "language_loss": 0.87709939, + "learning_rate": 3.984872909471688e-06, + "loss": 0.89686668, + "num_input_tokens_seen": 24208045, + "router_z_loss_clip": 3.28125, + "router_z_loss_mlp": 0.37768555, + "step": 1131, + "time_per_iteration": 2.645106315612793 + }, + { + "auxiliary_loss_clip": 0.01546091, + "auxiliary_loss_mlp": 0.00372547, + "balance_loss_clip": 1.22206211, + "balance_loss_mlp": 0.33873951, + "epoch": 0.06805952202014129, + "flos": 14864755829760.0, + "grad_norm": 6.07236335985529, + "language_loss": 0.87335199, + "learning_rate": 3.984825061735701e-06, + "loss": 0.89253843, + "num_input_tokens_seen": 24223805, + "router_z_loss_clip": 3.24414062, + "router_z_loss_mlp": 0.33764648, + "step": 1132, + "time_per_iteration": 2.6039018630981445 + }, + { + "auxiliary_loss_clip": 0.01557666, + "auxiliary_loss_mlp": 0.003452, + "balance_loss_clip": 1.23310101, + "balance_loss_mlp": 0.31189311, + "epoch": 0.06811964527280925, + "flos": 48909434947200.0, + "grad_norm": 10.992894049198293, + "language_loss": 0.69868863, + "learning_rate": 3.9847771387349495e-06, + "loss": 0.71771729, + "num_input_tokens_seen": 24249475, + "router_z_loss_clip": 3.24414062, + "router_z_loss_mlp": 0.33239746, + "step": 1133, + "time_per_iteration": 2.8680946826934814 + }, + { + "auxiliary_loss_clip": 0.01561422, + "auxiliary_loss_mlp": 0.00387697, + "balance_loss_clip": 1.23093164, + "balance_loss_mlp": 0.34819093, + "epoch": 0.06817976852547723, + "flos": 15377273038080.0, + "grad_norm": 20.089213066606074, + "language_loss": 0.84641808, + "learning_rate": 3.9847291404712506e-06, + "loss": 0.86590922, + "num_input_tokens_seen": 24267980, + "router_z_loss_clip": 3.3046875, + "router_z_loss_mlp": 0.39526367, + "step": 1134, + "time_per_iteration": 2.6728973388671875 + }, + { + "auxiliary_loss_clip": 0.01572015, + "auxiliary_loss_mlp": 0.00355688, + "balance_loss_clip": 1.24514592, + "balance_loss_mlp": 0.32269111, + "epoch": 0.0682398917781452, + "flos": 20155690200960.0, + "grad_norm": 2.1750504208071475, + "language_loss": 0.93159044, + "learning_rate": 3.984681066946423e-06, + "loss": 0.95086741, + "num_input_tokens_seen": 24286805, + "router_z_loss_clip": 3.27148438, + "router_z_loss_mlp": 0.33007812, + "step": 1135, + "time_per_iteration": 2.7111313343048096 + }, + { + "auxiliary_loss_clip": 0.01574075, + "auxiliary_loss_mlp": 0.00378982, + "balance_loss_clip": 1.24319422, + "balance_loss_mlp": 0.3397859, + "epoch": 0.06830001503081316, + "flos": 23440618748160.0, + "grad_norm": 4.145828388770281, + "language_loss": 0.87704086, + "learning_rate": 3.984632918162291e-06, + "loss": 0.8965714, + "num_input_tokens_seen": 24305855, + "router_z_loss_clip": 3.31445312, + "router_z_loss_mlp": 0.39160156, + "step": 1136, + "time_per_iteration": 2.651824712753296 + }, + { + "auxiliary_loss_clip": 0.01575999, + "auxiliary_loss_mlp": 0.00387799, + "balance_loss_clip": 1.24775779, + "balance_loss_mlp": 0.34793541, + "epoch": 0.06836013828348114, + "flos": 34349813153280.0, + "grad_norm": 29.465991111605405, + "language_loss": 0.9107123, + "learning_rate": 3.984584694120679e-06, + "loss": 0.93035024, + "num_input_tokens_seen": 24326535, + "router_z_loss_clip": 3.28320312, + "router_z_loss_mlp": 0.39868164, + "step": 1137, + "time_per_iteration": 2.7855048179626465 + }, + { + "auxiliary_loss_clip": 0.01569601, + "auxiliary_loss_mlp": 0.00358542, + "balance_loss_clip": 1.24267805, + "balance_loss_mlp": 0.32266006, + "epoch": 0.06842026153614911, + "flos": 23148844571520.0, + "grad_norm": 12.941904901721681, + "language_loss": 0.86551988, + "learning_rate": 3.984536394823418e-06, + "loss": 0.88480127, + "num_input_tokens_seen": 24345810, + "router_z_loss_clip": 3.2734375, + "router_z_loss_mlp": 0.35888672, + "step": 1138, + "time_per_iteration": 2.6317453384399414 + }, + { + "auxiliary_loss_clip": 0.01582238, + "auxiliary_loss_mlp": 0.00371449, + "balance_loss_clip": 1.24834633, + "balance_loss_mlp": 0.33265841, + "epoch": 0.06848038478881707, + "flos": 24608972430720.0, + "grad_norm": 12.766622163507185, + "language_loss": 0.90994763, + "learning_rate": 3.984488020272336e-06, + "loss": 0.92948449, + "num_input_tokens_seen": 24366095, + "router_z_loss_clip": 3.33984375, + "router_z_loss_mlp": 0.38842773, + "step": 1139, + "time_per_iteration": 2.6876070499420166 + }, + { + "auxiliary_loss_clip": 0.01578922, + "auxiliary_loss_mlp": 0.00370103, + "balance_loss_clip": 1.2496357, + "balance_loss_mlp": 0.33400667, + "epoch": 0.06854050804148504, + "flos": 40880994278400.0, + "grad_norm": 3.755456063678699, + "language_loss": 0.81623602, + "learning_rate": 3.984439570469271e-06, + "loss": 0.83572632, + "num_input_tokens_seen": 24388665, + "router_z_loss_clip": 3.29296875, + "router_z_loss_mlp": 0.36083984, + "step": 1140, + "time_per_iteration": 2.818798780441284 + }, + { + "auxiliary_loss_clip": 0.01615117, + "auxiliary_loss_mlp": 0.00445102, + "balance_loss_clip": 1.27004397, + "balance_loss_mlp": 0.40027899, + "epoch": 0.06860063129415302, + "flos": 31686354743040.0, + "grad_norm": 27.733424929984626, + "language_loss": 0.7764231, + "learning_rate": 3.9843910454160574e-06, + "loss": 0.79702526, + "num_input_tokens_seen": 24407705, + "router_z_loss_clip": 3.453125, + "router_z_loss_mlp": 0.44848633, + "step": 1141, + "time_per_iteration": 2.7750937938690186 + }, + { + "auxiliary_loss_clip": 0.01607596, + "auxiliary_loss_mlp": 0.00404925, + "balance_loss_clip": 1.26090598, + "balance_loss_mlp": 0.36165231, + "epoch": 0.06866075454682098, + "flos": 26542007775360.0, + "grad_norm": 9.982567015359463, + "language_loss": 0.86510372, + "learning_rate": 3.984342445114538e-06, + "loss": 0.88522899, + "num_input_tokens_seen": 24428390, + "router_z_loss_clip": 3.46679688, + "router_z_loss_mlp": 0.43286133, + "step": 1142, + "time_per_iteration": 2.6843442916870117 + }, + { + "auxiliary_loss_clip": 0.01601757, + "auxiliary_loss_mlp": 0.0035194, + "balance_loss_clip": 1.27338457, + "balance_loss_mlp": 0.31782225, + "epoch": 0.06872087779948895, + "flos": 29789768724480.0, + "grad_norm": 101.96065626602888, + "language_loss": 0.75338131, + "learning_rate": 3.984293769566553e-06, + "loss": 0.77291822, + "num_input_tokens_seen": 24450810, + "router_z_loss_clip": 3.28320312, + "router_z_loss_mlp": 0.34082031, + "step": 1143, + "time_per_iteration": 2.709531784057617 + }, + { + "auxiliary_loss_clip": 0.01596759, + "auxiliary_loss_mlp": 0.00384561, + "balance_loss_clip": 1.26724696, + "balance_loss_mlp": 0.34588927, + "epoch": 0.06878100105215693, + "flos": 26941118768640.0, + "grad_norm": 8.603017299665016, + "language_loss": 0.80269861, + "learning_rate": 3.98424501877395e-06, + "loss": 0.82251179, + "num_input_tokens_seen": 24469965, + "router_z_loss_clip": 3.296875, + "router_z_loss_mlp": 0.38647461, + "step": 1144, + "time_per_iteration": 2.6498889923095703 + }, + { + "auxiliary_loss_clip": 0.01636768, + "auxiliary_loss_mlp": 0.00398318, + "balance_loss_clip": 1.29371274, + "balance_loss_mlp": 0.35637987, + "epoch": 0.06884112430482489, + "flos": 10670748946560.0, + "grad_norm": 3.999698703659843, + "language_loss": 1.00703073, + "learning_rate": 3.984196192738577e-06, + "loss": 1.02738166, + "num_input_tokens_seen": 24486370, + "router_z_loss_clip": 3.4296875, + "router_z_loss_mlp": 0.41918945, + "step": 1145, + "time_per_iteration": 2.6486732959747314 + }, + { + "auxiliary_loss_clip": 0.01637557, + "auxiliary_loss_mlp": 0.00404592, + "balance_loss_clip": 1.29367757, + "balance_loss_mlp": 0.35883927, + "epoch": 0.06890124755749286, + "flos": 20193647898240.0, + "grad_norm": 7.580126670843302, + "language_loss": 0.91712755, + "learning_rate": 3.984147291462285e-06, + "loss": 0.93754905, + "num_input_tokens_seen": 24503780, + "router_z_loss_clip": 3.4375, + "router_z_loss_mlp": 0.45751953, + "step": 1146, + "time_per_iteration": 2.632357597351074 + }, + { + "auxiliary_loss_clip": 0.01605423, + "auxiliary_loss_mlp": 0.00327038, + "balance_loss_clip": 1.28163981, + "balance_loss_mlp": 0.290178, + "epoch": 0.06896137081016084, + "flos": 20449224144000.0, + "grad_norm": 12.59067332855627, + "language_loss": 0.92195052, + "learning_rate": 3.98409831494693e-06, + "loss": 0.94127512, + "num_input_tokens_seen": 24522320, + "router_z_loss_clip": 3.23632812, + "router_z_loss_mlp": 0.36816406, + "step": 1147, + "time_per_iteration": 2.6410651206970215 + }, + { + "auxiliary_loss_clip": 0.01635895, + "auxiliary_loss_mlp": 0.00355845, + "balance_loss_clip": 1.30349255, + "balance_loss_mlp": 0.31881875, + "epoch": 0.0690214940628288, + "flos": 18368703555840.0, + "grad_norm": 16.863136346462884, + "language_loss": 0.92531264, + "learning_rate": 3.984049263194367e-06, + "loss": 0.94523001, + "num_input_tokens_seen": 24540445, + "router_z_loss_clip": 3.328125, + "router_z_loss_mlp": 0.37060547, + "step": 1148, + "time_per_iteration": 2.6643481254577637 + }, + { + "auxiliary_loss_clip": 0.01655751, + "auxiliary_loss_mlp": 0.00366252, + "balance_loss_clip": 1.31801224, + "balance_loss_mlp": 0.32653135, + "epoch": 0.06908161731549677, + "flos": 20558033418240.0, + "grad_norm": 17.20691771101131, + "language_loss": 0.76093686, + "learning_rate": 3.9840001362064575e-06, + "loss": 0.7811569, + "num_input_tokens_seen": 24557105, + "router_z_loss_clip": 3.38085938, + "router_z_loss_mlp": 0.39672852, + "step": 1149, + "time_per_iteration": 2.622317314147949 + }, + { + "auxiliary_loss_clip": 0.01672064, + "auxiliary_loss_mlp": 0.00381476, + "balance_loss_clip": 1.32701421, + "balance_loss_mlp": 0.33999118, + "epoch": 0.06914174056816474, + "flos": 27563666313600.0, + "grad_norm": 3.3630760386636465, + "language_loss": 0.90461552, + "learning_rate": 3.983950933985064e-06, + "loss": 0.92515093, + "num_input_tokens_seen": 24578240, + "router_z_loss_clip": 3.453125, + "router_z_loss_mlp": 0.41479492, + "step": 1150, + "time_per_iteration": 2.6836493015289307 + }, + { + "auxiliary_loss_clip": 0.01681058, + "auxiliary_loss_mlp": 0.00367535, + "balance_loss_clip": 1.34158707, + "balance_loss_mlp": 0.32635987, + "epoch": 0.06920186382083271, + "flos": 15304015249920.0, + "grad_norm": 36.22665717035775, + "language_loss": 0.90929663, + "learning_rate": 3.983901656532052e-06, + "loss": 0.92978257, + "num_input_tokens_seen": 24593585, + "router_z_loss_clip": 3.39453125, + "router_z_loss_mlp": 0.41162109, + "step": 1151, + "time_per_iteration": 2.5954506397247314 + }, + { + "auxiliary_loss_clip": 0.01692839, + "auxiliary_loss_mlp": 0.00356976, + "balance_loss_clip": 1.34626472, + "balance_loss_mlp": 0.31830412, + "epoch": 0.06926198707350067, + "flos": 25191227894400.0, + "grad_norm": 11.067330982503133, + "language_loss": 0.92167258, + "learning_rate": 3.983852303849291e-06, + "loss": 0.94217074, + "num_input_tokens_seen": 24613110, + "router_z_loss_clip": 3.46484375, + "router_z_loss_mlp": 0.38647461, + "step": 1152, + "time_per_iteration": 2.64985728263855 + }, + { + "auxiliary_loss_clip": 0.01733544, + "auxiliary_loss_mlp": 0.00370901, + "balance_loss_clip": 1.37383246, + "balance_loss_mlp": 0.32905805, + "epoch": 0.06932211032616864, + "flos": 13256137146240.0, + "grad_norm": 20.563340864918022, + "language_loss": 0.97641873, + "learning_rate": 3.983802875938651e-06, + "loss": 0.99746323, + "num_input_tokens_seen": 24628795, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 0.41845703, + "step": 1153, + "time_per_iteration": 2.587677001953125 + }, + { + "auxiliary_loss_clip": 0.01742356, + "auxiliary_loss_mlp": 0.00329485, + "balance_loss_clip": 1.38553739, + "balance_loss_mlp": 0.28981233, + "epoch": 0.06938223357883662, + "flos": 24827381078400.0, + "grad_norm": 10.418496116067969, + "language_loss": 0.87752032, + "learning_rate": 3.983753372802008e-06, + "loss": 0.89823872, + "num_input_tokens_seen": 24645480, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 0.39648438, + "step": 1154, + "time_per_iteration": 2.689920425415039 + }, + { + "auxiliary_loss_clip": 0.01747583, + "auxiliary_loss_mlp": 0.00328271, + "balance_loss_clip": 1.38908732, + "balance_loss_mlp": 0.28886086, + "epoch": 0.06944235683150458, + "flos": 27267977554560.0, + "grad_norm": 5.145383504792062, + "language_loss": 0.81923866, + "learning_rate": 3.983703794441237e-06, + "loss": 0.83999717, + "num_input_tokens_seen": 24664630, + "router_z_loss_clip": 3.5859375, + "router_z_loss_mlp": 0.39428711, + "step": 1155, + "time_per_iteration": 2.6897523403167725 + }, + { + "auxiliary_loss_clip": 0.01787378, + "auxiliary_loss_mlp": 0.00352341, + "balance_loss_clip": 1.40646243, + "balance_loss_mlp": 0.31099969, + "epoch": 0.06950248008417255, + "flos": 25808065176960.0, + "grad_norm": 31.149074774946293, + "language_loss": 0.76379859, + "learning_rate": 3.98365414085822e-06, + "loss": 0.78519571, + "num_input_tokens_seen": 24684210, + "router_z_loss_clip": 3.80859375, + "router_z_loss_mlp": 0.41381836, + "step": 1156, + "time_per_iteration": 2.728524923324585 + }, + { + "auxiliary_loss_clip": 0.01812423, + "auxiliary_loss_mlp": 0.00340964, + "balance_loss_clip": 1.42713916, + "balance_loss_mlp": 0.29699922, + "epoch": 0.06956260333684053, + "flos": 22271546793600.0, + "grad_norm": 12.713093012304856, + "language_loss": 0.80121899, + "learning_rate": 3.98360441205484e-06, + "loss": 0.82275283, + "num_input_tokens_seen": 24702490, + "router_z_loss_clip": 3.85351562, + "router_z_loss_mlp": 0.43920898, + "step": 1157, + "time_per_iteration": 2.6542296409606934 + }, + { + "auxiliary_loss_clip": 0.01800264, + "auxiliary_loss_mlp": 0.00342536, + "balance_loss_clip": 1.41583931, + "balance_loss_mlp": 0.30322045, + "epoch": 0.0696227265895085, + "flos": 29681390413440.0, + "grad_norm": 14.809911364438436, + "language_loss": 0.79372001, + "learning_rate": 3.983554608032982e-06, + "loss": 0.815148, + "num_input_tokens_seen": 24724340, + "router_z_loss_clip": 3.83984375, + "router_z_loss_mlp": 0.39331055, + "step": 1158, + "time_per_iteration": 4.160548210144043 + }, + { + "auxiliary_loss_clip": 0.0183108, + "auxiliary_loss_mlp": 0.00333623, + "balance_loss_clip": 1.43854368, + "balance_loss_mlp": 0.2933062, + "epoch": 0.06968284984217646, + "flos": 25523545547520.0, + "grad_norm": 9.705391720166851, + "language_loss": 0.86484182, + "learning_rate": 3.983504728794533e-06, + "loss": 0.88648885, + "num_input_tokens_seen": 24745550, + "router_z_loss_clip": 3.921875, + "router_z_loss_mlp": 0.40307617, + "step": 1159, + "time_per_iteration": 2.703179359436035 + }, + { + "auxiliary_loss_clip": 0.01851809, + "auxiliary_loss_mlp": 0.00341525, + "balance_loss_clip": 1.44974697, + "balance_loss_mlp": 0.29846621, + "epoch": 0.06974297309484444, + "flos": 20698192287360.0, + "grad_norm": 6.058266436064423, + "language_loss": 0.9052459, + "learning_rate": 3.983454774341387e-06, + "loss": 0.92717922, + "num_input_tokens_seen": 24762575, + "router_z_loss_clip": 4.02148438, + "router_z_loss_mlp": 0.43066406, + "step": 1160, + "time_per_iteration": 2.6436238288879395 + }, + { + "auxiliary_loss_clip": 0.01897252, + "auxiliary_loss_mlp": 0.00354339, + "balance_loss_clip": 1.4797399, + "balance_loss_mlp": 0.31352174, + "epoch": 0.0698030963475124, + "flos": 26505199313280.0, + "grad_norm": 22.190599681929076, + "language_loss": 0.83230537, + "learning_rate": 3.983404744675437e-06, + "loss": 0.85482132, + "num_input_tokens_seen": 24782605, + "router_z_loss_clip": 4.1796875, + "router_z_loss_mlp": 0.40820312, + "step": 1161, + "time_per_iteration": 4.166844606399536 + }, + { + "auxiliary_loss_clip": 0.01923672, + "auxiliary_loss_mlp": 0.00350492, + "balance_loss_clip": 1.49595666, + "balance_loss_mlp": 0.30922163, + "epoch": 0.06986321960018037, + "flos": 23040430346880.0, + "grad_norm": 5.133298835517874, + "language_loss": 0.89439559, + "learning_rate": 3.9833546397985794e-06, + "loss": 0.91713721, + "num_input_tokens_seen": 24802910, + "router_z_loss_clip": 4.27734375, + "router_z_loss_mlp": 0.41259766, + "step": 1162, + "time_per_iteration": 2.7280614376068115 + }, + { + "auxiliary_loss_clip": 0.01927346, + "auxiliary_loss_mlp": 0.00342161, + "balance_loss_clip": 1.49551845, + "balance_loss_mlp": 0.30055657, + "epoch": 0.06992334285284833, + "flos": 28584822061440.0, + "grad_norm": 24.826763187592892, + "language_loss": 0.85314769, + "learning_rate": 3.983304459712716e-06, + "loss": 0.87584281, + "num_input_tokens_seen": 24823305, + "router_z_loss_clip": 4.3203125, + "router_z_loss_mlp": 0.41625977, + "step": 1163, + "time_per_iteration": 4.137516975402832 + }, + { + "auxiliary_loss_clip": 0.01976647, + "auxiliary_loss_mlp": 0.00386264, + "balance_loss_clip": 1.52781284, + "balance_loss_mlp": 0.33860368, + "epoch": 0.06998346610551631, + "flos": 20595344670720.0, + "grad_norm": 5.559898458919318, + "language_loss": 0.8442781, + "learning_rate": 3.983254204419749e-06, + "loss": 0.86790717, + "num_input_tokens_seen": 24842155, + "router_z_loss_clip": 4.4921875, + "router_z_loss_mlp": 0.47680664, + "step": 1164, + "time_per_iteration": 2.6722609996795654 + }, + { + "auxiliary_loss_clip": 0.01991184, + "auxiliary_loss_mlp": 0.00358153, + "balance_loss_clip": 1.5383904, + "balance_loss_mlp": 0.31309199, + "epoch": 0.07004358935818428, + "flos": 22528810978560.0, + "grad_norm": 12.558368149832297, + "language_loss": 0.79571867, + "learning_rate": 3.983203873921583e-06, + "loss": 0.81921202, + "num_input_tokens_seen": 24862080, + "router_z_loss_clip": 4.5234375, + "router_z_loss_mlp": 0.45117188, + "step": 1165, + "time_per_iteration": 2.684443473815918 + }, + { + "auxiliary_loss_clip": 0.0197024, + "auxiliary_loss_mlp": 0.00335613, + "balance_loss_clip": 1.5304004, + "balance_loss_mlp": 0.29250675, + "epoch": 0.07010371261085224, + "flos": 28949997680640.0, + "grad_norm": 34.75302646478748, + "language_loss": 0.86790049, + "learning_rate": 3.983153468220128e-06, + "loss": 0.89095902, + "num_input_tokens_seen": 24886165, + "router_z_loss_clip": 4.40234375, + "router_z_loss_mlp": 0.4309082, + "step": 1166, + "time_per_iteration": 2.729116201400757 + }, + { + "auxiliary_loss_clip": 0.02002851, + "auxiliary_loss_mlp": 0.00324148, + "balance_loss_clip": 1.54772401, + "balance_loss_mlp": 0.28314009, + "epoch": 0.07016383586352022, + "flos": 23659171050240.0, + "grad_norm": 8.039851687618254, + "language_loss": 0.90736967, + "learning_rate": 3.983102987317295e-06, + "loss": 0.93063962, + "num_input_tokens_seen": 24905775, + "router_z_loss_clip": 4.5546875, + "router_z_loss_mlp": 0.40991211, + "step": 1167, + "time_per_iteration": 2.6508960723876953 + }, + { + "auxiliary_loss_clip": 0.02001508, + "auxiliary_loss_mlp": 0.00353214, + "balance_loss_clip": 1.54645526, + "balance_loss_mlp": 0.30901158, + "epoch": 0.07022395911618819, + "flos": 19792130693760.0, + "grad_norm": 42.72310995107356, + "language_loss": 0.98066354, + "learning_rate": 3.983052431214997e-06, + "loss": 1.00421071, + "num_input_tokens_seen": 24924295, + "router_z_loss_clip": 4.55078125, + "router_z_loss_mlp": 0.44262695, + "step": 1168, + "time_per_iteration": 2.6517162322998047 + }, + { + "auxiliary_loss_clip": 0.02005164, + "auxiliary_loss_mlp": 0.00379899, + "balance_loss_clip": 1.53919387, + "balance_loss_mlp": 0.33133364, + "epoch": 0.07028408236885615, + "flos": 21689147675520.0, + "grad_norm": 11.169331359162113, + "language_loss": 0.94846225, + "learning_rate": 3.983001799915153e-06, + "loss": 0.97231293, + "num_input_tokens_seen": 24943210, + "router_z_loss_clip": 4.6640625, + "router_z_loss_mlp": 0.48583984, + "step": 1169, + "time_per_iteration": 2.618039846420288 + }, + { + "auxiliary_loss_clip": 0.02036618, + "auxiliary_loss_mlp": 0.00376617, + "balance_loss_clip": 1.56720948, + "balance_loss_mlp": 0.33172256, + "epoch": 0.07034420562152413, + "flos": 25630271832960.0, + "grad_norm": 10.894662129386642, + "language_loss": 0.93560767, + "learning_rate": 3.982951093419681e-06, + "loss": 0.95974004, + "num_input_tokens_seen": 24960360, + "router_z_loss_clip": 4.6953125, + "router_z_loss_mlp": 0.44873047, + "step": 1170, + "time_per_iteration": 2.6453754901885986 + }, + { + "auxiliary_loss_clip": 0.0203113, + "auxiliary_loss_mlp": 0.00332593, + "balance_loss_clip": 1.5750525, + "balance_loss_mlp": 0.29218054, + "epoch": 0.0704043288741921, + "flos": 20810449267200.0, + "grad_norm": 9.179689398681868, + "language_loss": 0.82112861, + "learning_rate": 3.982900311730506e-06, + "loss": 0.8447659, + "num_input_tokens_seen": 24978290, + "router_z_loss_clip": 4.5546875, + "router_z_loss_mlp": 0.40405273, + "step": 1171, + "time_per_iteration": 2.606445074081421 + }, + { + "auxiliary_loss_clip": 0.02034779, + "auxiliary_loss_mlp": 0.0034623, + "balance_loss_clip": 1.57007992, + "balance_loss_mlp": 0.30424446, + "epoch": 0.07046445212686006, + "flos": 25593176062080.0, + "grad_norm": 3.0473501297660066, + "language_loss": 0.95480716, + "learning_rate": 3.9828494548495514e-06, + "loss": 0.97861731, + "num_input_tokens_seen": 24997055, + "router_z_loss_clip": 4.65234375, + "router_z_loss_mlp": 0.41992188, + "step": 1172, + "time_per_iteration": 2.687406063079834 + }, + { + "auxiliary_loss_clip": 0.02022102, + "auxiliary_loss_mlp": 0.00368744, + "balance_loss_clip": 1.55239677, + "balance_loss_mlp": 0.32127506, + "epoch": 0.07052457537952803, + "flos": 25556978131200.0, + "grad_norm": 96.05533046217592, + "language_loss": 0.87490821, + "learning_rate": 3.982798522778748e-06, + "loss": 0.8988167, + "num_input_tokens_seen": 25017490, + "router_z_loss_clip": 4.69921875, + "router_z_loss_mlp": 0.47460938, + "step": 1173, + "time_per_iteration": 2.7375686168670654 + }, + { + "auxiliary_loss_clip": 0.02046676, + "auxiliary_loss_mlp": 0.00380363, + "balance_loss_clip": 1.57515335, + "balance_loss_mlp": 0.33496839, + "epoch": 0.070584698632196, + "flos": 17968515154560.0, + "grad_norm": 3.0069679790741453, + "language_loss": 0.87586689, + "learning_rate": 3.9827475155200245e-06, + "loss": 0.90013731, + "num_input_tokens_seen": 25035660, + "router_z_loss_clip": 4.71484375, + "router_z_loss_mlp": 0.45361328, + "step": 1174, + "time_per_iteration": 2.5884928703308105 + }, + { + "auxiliary_loss_clip": 0.02027474, + "auxiliary_loss_mlp": 0.00389971, + "balance_loss_clip": 1.55896735, + "balance_loss_mlp": 0.34257302, + "epoch": 0.07064482188486397, + "flos": 25370888745600.0, + "grad_norm": 3.4081120625283265, + "language_loss": 0.89958864, + "learning_rate": 3.982696433075317e-06, + "loss": 0.92376304, + "num_input_tokens_seen": 25054785, + "router_z_loss_clip": 4.6796875, + "router_z_loss_mlp": 0.47436523, + "step": 1175, + "time_per_iteration": 2.712364673614502 + }, + { + "auxiliary_loss_clip": 0.02035345, + "auxiliary_loss_mlp": 0.00336522, + "balance_loss_clip": 1.57401204, + "balance_loss_mlp": 0.29045999, + "epoch": 0.07070494513753194, + "flos": 24899848767360.0, + "grad_norm": 217.93869975188045, + "language_loss": 0.91556942, + "learning_rate": 3.982645275446563e-06, + "loss": 0.93928814, + "num_input_tokens_seen": 25075180, + "router_z_loss_clip": 4.6171875, + "router_z_loss_mlp": 0.46044922, + "step": 1176, + "time_per_iteration": 2.7098910808563232 + }, + { + "auxiliary_loss_clip": 0.02028748, + "auxiliary_loss_mlp": 0.00335689, + "balance_loss_clip": 1.57773232, + "balance_loss_mlp": 0.29017478, + "epoch": 0.07076506839019991, + "flos": 22338447874560.0, + "grad_norm": 5.959343762549416, + "language_loss": 0.81497651, + "learning_rate": 3.982594042635701e-06, + "loss": 0.83862084, + "num_input_tokens_seen": 25093035, + "router_z_loss_clip": 4.51171875, + "router_z_loss_mlp": 0.45458984, + "step": 1177, + "time_per_iteration": 2.6549010276794434 + }, + { + "auxiliary_loss_clip": 0.02014092, + "auxiliary_loss_mlp": 0.00343284, + "balance_loss_clip": 1.55788565, + "balance_loss_mlp": 0.29881924, + "epoch": 0.07082519164286788, + "flos": 18660800954880.0, + "grad_norm": 3.013587511572665, + "language_loss": 0.903732, + "learning_rate": 3.982542734644673e-06, + "loss": 0.92730576, + "num_input_tokens_seen": 25112520, + "router_z_loss_clip": 4.5625, + "router_z_loss_mlp": 0.44384766, + "step": 1178, + "time_per_iteration": 2.604640007019043 + }, + { + "auxiliary_loss_clip": 0.02148813, + "auxiliary_loss_mlp": 0.00108228, + "balance_loss_clip": 1.80609822, + "balance_loss_mlp": 0.08905933, + "epoch": 0.07088531489553584, + "flos": 63654107610240.0, + "grad_norm": 0.8940067811555822, + "language_loss": 0.63398254, + "learning_rate": 3.982491351475427e-06, + "loss": 0.65655291, + "num_input_tokens_seen": 25177760, + "router_z_loss_clip": 3.4375, + "router_z_loss_mlp": 0.19140625, + "step": 1179, + "time_per_iteration": 3.2820568084716797 + }, + { + "auxiliary_loss_clip": 0.01997068, + "auxiliary_loss_mlp": 0.00334994, + "balance_loss_clip": 1.55009174, + "balance_loss_mlp": 0.28771591, + "epoch": 0.07094543814820382, + "flos": 21572688804480.0, + "grad_norm": 23865.90907202281, + "language_loss": 0.96034491, + "learning_rate": 3.98243989312991e-06, + "loss": 0.98366547, + "num_input_tokens_seen": 25195260, + "router_z_loss_clip": 4.47265625, + "router_z_loss_mlp": 0.47265625, + "step": 1180, + "time_per_iteration": 2.6100945472717285 + }, + { + "auxiliary_loss_clip": 0.0199728, + "auxiliary_loss_mlp": 0.00390421, + "balance_loss_clip": 1.54114878, + "balance_loss_mlp": 0.33990011, + "epoch": 0.07100556140087179, + "flos": 22089946608000.0, + "grad_norm": 9.57608657555911, + "language_loss": 0.94702613, + "learning_rate": 3.982388359610074e-06, + "loss": 0.9709031, + "num_input_tokens_seen": 25212740, + "router_z_loss_clip": 4.5625, + "router_z_loss_mlp": 0.50463867, + "step": 1181, + "time_per_iteration": 2.6121630668640137 + }, + { + "auxiliary_loss_clip": 0.01973638, + "auxiliary_loss_mlp": 0.00340385, + "balance_loss_clip": 1.54010332, + "balance_loss_mlp": 0.29403669, + "epoch": 0.07106568465353975, + "flos": 47922286400640.0, + "grad_norm": 9.963702778957204, + "language_loss": 0.89889759, + "learning_rate": 3.9823367509178725e-06, + "loss": 0.92203772, + "num_input_tokens_seen": 25236420, + "router_z_loss_clip": 4.33203125, + "router_z_loss_mlp": 0.46337891, + "step": 1182, + "time_per_iteration": 2.844759464263916 + }, + { + "auxiliary_loss_clip": 0.01928846, + "auxiliary_loss_mlp": 0.00300602, + "balance_loss_clip": 1.5182929, + "balance_loss_mlp": 0.25911701, + "epoch": 0.07112580790620772, + "flos": 23440798316160.0, + "grad_norm": 137.72326808533958, + "language_loss": 0.88906991, + "learning_rate": 3.982285067055262e-06, + "loss": 0.91136438, + "num_input_tokens_seen": 25255120, + "router_z_loss_clip": 4.10742188, + "router_z_loss_mlp": 0.41503906, + "step": 1183, + "time_per_iteration": 2.7407312393188477 + }, + { + "auxiliary_loss_clip": 0.0193812, + "auxiliary_loss_mlp": 0.00355591, + "balance_loss_clip": 1.51941478, + "balance_loss_mlp": 0.3101486, + "epoch": 0.0711859311588757, + "flos": 31868888682240.0, + "grad_norm": 4.035364526223806, + "language_loss": 0.87759435, + "learning_rate": 3.982233308024204e-06, + "loss": 0.90053153, + "num_input_tokens_seen": 25275150, + "router_z_loss_clip": 4.18359375, + "router_z_loss_mlp": 0.45458984, + "step": 1184, + "time_per_iteration": 2.797081708908081 + }, + { + "auxiliary_loss_clip": 0.01925962, + "auxiliary_loss_mlp": 0.00297677, + "balance_loss_clip": 1.5160321, + "balance_loss_mlp": 0.25550067, + "epoch": 0.07124605441154366, + "flos": 19610315026560.0, + "grad_norm": 4.100639930432864, + "language_loss": 0.83988082, + "learning_rate": 3.98218147382666e-06, + "loss": 0.86211723, + "num_input_tokens_seen": 25293680, + "router_z_loss_clip": 4.09179688, + "router_z_loss_mlp": 0.421875, + "step": 1185, + "time_per_iteration": 2.6754298210144043 + }, + { + "auxiliary_loss_clip": 0.01918268, + "auxiliary_loss_mlp": 0.00308039, + "balance_loss_clip": 1.50033057, + "balance_loss_mlp": 0.2645275, + "epoch": 0.07130617766421163, + "flos": 14684448533760.0, + "grad_norm": 133.5844426362042, + "language_loss": 0.73247939, + "learning_rate": 3.982129564464596e-06, + "loss": 0.75474244, + "num_input_tokens_seen": 25310050, + "router_z_loss_clip": 4.17773438, + "router_z_loss_mlp": 0.43481445, + "step": 1186, + "time_per_iteration": 2.6543211936950684 + }, + { + "auxiliary_loss_clip": 0.01906878, + "auxiliary_loss_mlp": 0.00320587, + "balance_loss_clip": 1.49768758, + "balance_loss_mlp": 0.27688473, + "epoch": 0.07136630091687961, + "flos": 26067915141120.0, + "grad_norm": 9.64702243732301, + "language_loss": 0.76121724, + "learning_rate": 3.98207757993998e-06, + "loss": 0.78349191, + "num_input_tokens_seen": 25331020, + "router_z_loss_clip": 4.08984375, + "router_z_loss_mlp": 0.43701172, + "step": 1187, + "time_per_iteration": 2.7187769412994385 + }, + { + "auxiliary_loss_clip": 0.0187841, + "auxiliary_loss_mlp": 0.0025653, + "balance_loss_clip": 1.48427939, + "balance_loss_mlp": 0.21409161, + "epoch": 0.07142642416954757, + "flos": 15669190869120.0, + "grad_norm": 21.134611372777158, + "language_loss": 0.87557077, + "learning_rate": 3.9820255202547845e-06, + "loss": 0.89692026, + "num_input_tokens_seen": 25347875, + "router_z_loss_clip": 3.94140625, + "router_z_loss_mlp": 0.42456055, + "step": 1188, + "time_per_iteration": 2.606985330581665 + }, + { + "auxiliary_loss_clip": 0.01881321, + "auxiliary_loss_mlp": 0.00308404, + "balance_loss_clip": 1.4825598, + "balance_loss_mlp": 0.26541704, + "epoch": 0.07148654742221554, + "flos": 19755322231680.0, + "grad_norm": 24.66476086861671, + "language_loss": 0.92398912, + "learning_rate": 3.981973385410981e-06, + "loss": 0.94588637, + "num_input_tokens_seen": 25366715, + "router_z_loss_clip": 3.98632812, + "router_z_loss_mlp": 0.42993164, + "step": 1189, + "time_per_iteration": 2.5759904384613037 + }, + { + "auxiliary_loss_clip": 0.0187238, + "auxiliary_loss_mlp": 0.00296084, + "balance_loss_clip": 1.47655392, + "balance_loss_mlp": 0.2529301, + "epoch": 0.07154667067488352, + "flos": 23471824688640.0, + "grad_norm": 52.94855472970312, + "language_loss": 0.83732516, + "learning_rate": 3.9819211754105494e-06, + "loss": 0.85900974, + "num_input_tokens_seen": 25385450, + "router_z_loss_clip": 3.95703125, + "router_z_loss_mlp": 0.43164062, + "step": 1190, + "time_per_iteration": 2.6727817058563232 + }, + { + "auxiliary_loss_clip": 0.01848568, + "auxiliary_loss_mlp": 0.00302119, + "balance_loss_clip": 1.45050132, + "balance_loss_mlp": 0.25691503, + "epoch": 0.07160679392755148, + "flos": 18332936588160.0, + "grad_norm": 2.524916110046282, + "language_loss": 0.83949542, + "learning_rate": 3.981868890255468e-06, + "loss": 0.86100233, + "num_input_tokens_seen": 25403940, + "router_z_loss_clip": 3.98046875, + "router_z_loss_mlp": 0.45214844, + "step": 1191, + "time_per_iteration": 2.5905675888061523 + }, + { + "auxiliary_loss_clip": 0.01844228, + "auxiliary_loss_mlp": 0.0031116, + "balance_loss_clip": 1.44942403, + "balance_loss_mlp": 0.27031875, + "epoch": 0.07166691718021945, + "flos": 17747017937280.0, + "grad_norm": 38.086326928987624, + "language_loss": 0.8390466, + "learning_rate": 3.981816529947719e-06, + "loss": 0.86060047, + "num_input_tokens_seen": 25420410, + "router_z_loss_clip": 3.94921875, + "router_z_loss_mlp": 0.40844727, + "step": 1192, + "time_per_iteration": 2.6470110416412354 + }, + { + "auxiliary_loss_clip": 0.01835179, + "auxiliary_loss_mlp": 0.002984, + "balance_loss_clip": 1.44467187, + "balance_loss_mlp": 0.25846496, + "epoch": 0.07172704043288743, + "flos": 22451925916800.0, + "grad_norm": 43.53075509293043, + "language_loss": 0.85164464, + "learning_rate": 3.9817640944892896e-06, + "loss": 0.87298042, + "num_input_tokens_seen": 25439415, + "router_z_loss_clip": 3.91015625, + "router_z_loss_mlp": 0.39941406, + "step": 1193, + "time_per_iteration": 2.6253979206085205 + }, + { + "auxiliary_loss_clip": 0.01828407, + "auxiliary_loss_mlp": 0.00274522, + "balance_loss_clip": 1.44100332, + "balance_loss_mlp": 0.23358509, + "epoch": 0.07178716368555539, + "flos": 23222210100480.0, + "grad_norm": 15.554209254832838, + "language_loss": 0.92416239, + "learning_rate": 3.981711583882166e-06, + "loss": 0.94519168, + "num_input_tokens_seen": 25458715, + "router_z_loss_clip": 3.87304688, + "router_z_loss_mlp": 0.40942383, + "step": 1194, + "time_per_iteration": 2.629704475402832 + }, + { + "auxiliary_loss_clip": 0.01804624, + "auxiliary_loss_mlp": 0.00267411, + "balance_loss_clip": 1.42643046, + "balance_loss_mlp": 0.22690405, + "epoch": 0.07184728693822336, + "flos": 25150828072320.0, + "grad_norm": 2.7532576868593286, + "language_loss": 0.87093425, + "learning_rate": 3.981658998128341e-06, + "loss": 0.89165455, + "num_input_tokens_seen": 25477985, + "router_z_loss_clip": 3.78125, + "router_z_loss_mlp": 0.40478516, + "step": 1195, + "time_per_iteration": 2.642906904220581 + }, + { + "auxiliary_loss_clip": 0.01794631, + "auxiliary_loss_mlp": 0.00280913, + "balance_loss_clip": 1.41756892, + "balance_loss_mlp": 0.24121594, + "epoch": 0.07190741019089132, + "flos": 22711237176960.0, + "grad_norm": 39.030288273948024, + "language_loss": 0.83658838, + "learning_rate": 3.981606337229808e-06, + "loss": 0.85734379, + "num_input_tokens_seen": 25497110, + "router_z_loss_clip": 3.76953125, + "router_z_loss_mlp": 0.39697266, + "step": 1196, + "time_per_iteration": 2.6311964988708496 + }, + { + "auxiliary_loss_clip": 0.01786964, + "auxiliary_loss_mlp": 0.00307228, + "balance_loss_clip": 1.4012084, + "balance_loss_mlp": 0.26512301, + "epoch": 0.0719675334435593, + "flos": 29349791032320.0, + "grad_norm": 33.47031376312383, + "language_loss": 0.81230927, + "learning_rate": 3.9815536011885655e-06, + "loss": 0.83325112, + "num_input_tokens_seen": 25516555, + "router_z_loss_clip": 3.85546875, + "router_z_loss_mlp": 0.42114258, + "step": 1197, + "time_per_iteration": 2.6679258346557617 + }, + { + "auxiliary_loss_clip": 0.01770838, + "auxiliary_loss_mlp": 0.00282168, + "balance_loss_clip": 1.39186382, + "balance_loss_mlp": 0.24559422, + "epoch": 0.07202765669622727, + "flos": 17639788861440.0, + "grad_norm": 9.64301881290966, + "language_loss": 0.9137944, + "learning_rate": 3.98150079000661e-06, + "loss": 0.93432438, + "num_input_tokens_seen": 25533895, + "router_z_loss_clip": 3.79101562, + "router_z_loss_mlp": 0.36572266, + "step": 1198, + "time_per_iteration": 2.5990424156188965 + }, + { + "auxiliary_loss_clip": 0.01760693, + "auxiliary_loss_mlp": 0.00293016, + "balance_loss_clip": 1.38390839, + "balance_loss_mlp": 0.25505954, + "epoch": 0.07208777994889523, + "flos": 21434038306560.0, + "grad_norm": 39.15808558529027, + "language_loss": 0.90738612, + "learning_rate": 3.981447903685947e-06, + "loss": 0.9279232, + "num_input_tokens_seen": 25554195, + "router_z_loss_clip": 3.76367188, + "router_z_loss_mlp": 0.37963867, + "step": 1199, + "time_per_iteration": 2.6991679668426514 + }, + { + "auxiliary_loss_clip": 0.01782043, + "auxiliary_loss_mlp": 0.0031653, + "balance_loss_clip": 1.39591277, + "balance_loss_mlp": 0.2761423, + "epoch": 0.07214790320156321, + "flos": 26940867373440.0, + "grad_norm": 81.3456418493951, + "language_loss": 0.81673861, + "learning_rate": 3.981394942228581e-06, + "loss": 0.83772433, + "num_input_tokens_seen": 25574155, + "router_z_loss_clip": 3.859375, + "router_z_loss_mlp": 0.40380859, + "step": 1200, + "time_per_iteration": 4.170860767364502 + }, + { + "auxiliary_loss_clip": 0.01769445, + "auxiliary_loss_mlp": 0.00260656, + "balance_loss_clip": 1.39191413, + "balance_loss_mlp": 0.22453588, + "epoch": 0.07220802645423118, + "flos": 23879949995520.0, + "grad_norm": 133.14806697535727, + "language_loss": 0.88566136, + "learning_rate": 3.98134190563652e-06, + "loss": 0.90596241, + "num_input_tokens_seen": 25592735, + "router_z_loss_clip": 3.77539062, + "router_z_loss_mlp": 0.36157227, + "step": 1201, + "time_per_iteration": 2.81274676322937 + }, + { + "auxiliary_loss_clip": 0.01762425, + "auxiliary_loss_mlp": 0.00302936, + "balance_loss_clip": 1.37606883, + "balance_loss_mlp": 0.26490796, + "epoch": 0.07226814970689914, + "flos": 19243631036160.0, + "grad_norm": 16.065357553284617, + "language_loss": 0.7752313, + "learning_rate": 3.981288793911775e-06, + "loss": 0.79588497, + "num_input_tokens_seen": 25611510, + "router_z_loss_clip": 3.86523438, + "router_z_loss_mlp": 0.38012695, + "step": 1202, + "time_per_iteration": 2.620327949523926 + }, + { + "auxiliary_loss_clip": 0.01778451, + "auxiliary_loss_mlp": 0.00303594, + "balance_loss_clip": 1.39132261, + "balance_loss_mlp": 0.26570928, + "epoch": 0.07232827295956712, + "flos": 19172025273600.0, + "grad_norm": 13.595155062122313, + "language_loss": 0.92762685, + "learning_rate": 3.98123560705636e-06, + "loss": 0.94844735, + "num_input_tokens_seen": 25629560, + "router_z_loss_clip": 3.87109375, + "router_z_loss_mlp": 0.37890625, + "step": 1203, + "time_per_iteration": 4.1546101570129395 + }, + { + "auxiliary_loss_clip": 0.01781765, + "auxiliary_loss_mlp": 0.00329685, + "balance_loss_clip": 1.39342701, + "balance_loss_mlp": 0.29039347, + "epoch": 0.07238839621223508, + "flos": 17639752947840.0, + "grad_norm": 17532.39370600667, + "language_loss": 0.86552793, + "learning_rate": 3.981182345072293e-06, + "loss": 0.88664246, + "num_input_tokens_seen": 25648330, + "router_z_loss_clip": 3.88671875, + "router_z_loss_mlp": 0.39306641, + "step": 1204, + "time_per_iteration": 2.6317145824432373 + }, + { + "auxiliary_loss_clip": 0.01758173, + "auxiliary_loss_mlp": 0.0031316, + "balance_loss_clip": 1.37398362, + "balance_loss_mlp": 0.27236617, + "epoch": 0.07244851946490305, + "flos": 28292401440000.0, + "grad_norm": 22.646117281179823, + "language_loss": 0.8769682, + "learning_rate": 3.981129007961593e-06, + "loss": 0.89768147, + "num_input_tokens_seen": 25669470, + "router_z_loss_clip": 3.84375, + "router_z_loss_mlp": 0.40820312, + "step": 1205, + "time_per_iteration": 4.085329294204712 + }, + { + "auxiliary_loss_clip": 0.01782191, + "auxiliary_loss_mlp": 0.00362068, + "balance_loss_clip": 1.38668084, + "balance_loss_mlp": 0.31638706, + "epoch": 0.07250864271757101, + "flos": 22564829341440.0, + "grad_norm": 2.534414062526998, + "language_loss": 0.81857502, + "learning_rate": 3.981075595726283e-06, + "loss": 0.84001756, + "num_input_tokens_seen": 25690470, + "router_z_loss_clip": 3.95117188, + "router_z_loss_mlp": 0.45678711, + "step": 1206, + "time_per_iteration": 2.640543222427368 + }, + { + "auxiliary_loss_clip": 0.01776629, + "auxiliary_loss_mlp": 0.00315558, + "balance_loss_clip": 1.38480473, + "balance_loss_mlp": 0.27569401, + "epoch": 0.072568765970239, + "flos": 21762405463680.0, + "grad_norm": 4.860574613242271, + "language_loss": 0.82871318, + "learning_rate": 3.981022108368387e-06, + "loss": 0.84963512, + "num_input_tokens_seen": 25709205, + "router_z_loss_clip": 3.9140625, + "router_z_loss_mlp": 0.39868164, + "step": 1207, + "time_per_iteration": 2.6494531631469727 + }, + { + "auxiliary_loss_clip": 0.01773117, + "auxiliary_loss_mlp": 0.00322019, + "balance_loss_clip": 1.38668418, + "balance_loss_mlp": 0.28220338, + "epoch": 0.07262888922290696, + "flos": 25519702792320.0, + "grad_norm": 17.485206470674285, + "language_loss": 0.86022907, + "learning_rate": 3.9809685458899345e-06, + "loss": 0.88118052, + "num_input_tokens_seen": 25728485, + "router_z_loss_clip": 3.859375, + "router_z_loss_mlp": 0.39794922, + "step": 1208, + "time_per_iteration": 2.7001681327819824 + }, + { + "auxiliary_loss_clip": 0.01763255, + "auxiliary_loss_mlp": 0.00293591, + "balance_loss_clip": 1.37545252, + "balance_loss_mlp": 0.25444287, + "epoch": 0.07268901247557492, + "flos": 21246548290560.0, + "grad_norm": 17.789517354924158, + "language_loss": 0.84867996, + "learning_rate": 3.980914908292955e-06, + "loss": 0.86924839, + "num_input_tokens_seen": 25747730, + "router_z_loss_clip": 3.87890625, + "router_z_loss_mlp": 0.39160156, + "step": 1209, + "time_per_iteration": 2.6945884227752686 + }, + { + "auxiliary_loss_clip": 0.0175133, + "auxiliary_loss_mlp": 0.00265919, + "balance_loss_clip": 1.36699462, + "balance_loss_mlp": 0.22860655, + "epoch": 0.0727491357282429, + "flos": 25479302970240.0, + "grad_norm": 4.116735519190709, + "language_loss": 0.88198376, + "learning_rate": 3.980861195579486e-06, + "loss": 0.90215629, + "num_input_tokens_seen": 25768050, + "router_z_loss_clip": 3.84570312, + "router_z_loss_mlp": 0.37280273, + "step": 1210, + "time_per_iteration": 2.654256582260132 + }, + { + "auxiliary_loss_clip": 0.0175576, + "auxiliary_loss_mlp": 0.00279568, + "balance_loss_clip": 1.37871587, + "balance_loss_mlp": 0.24323305, + "epoch": 0.07280925898091087, + "flos": 24462169545600.0, + "grad_norm": 12391.021023178184, + "language_loss": 0.91680288, + "learning_rate": 3.98080740775156e-06, + "loss": 0.93715608, + "num_input_tokens_seen": 25787985, + "router_z_loss_clip": 3.77148438, + "router_z_loss_mlp": 0.36303711, + "step": 1211, + "time_per_iteration": 2.7109155654907227 + }, + { + "auxiliary_loss_clip": 0.01752315, + "auxiliary_loss_mlp": 0.00325872, + "balance_loss_clip": 1.36802518, + "balance_loss_mlp": 0.28455377, + "epoch": 0.07286938223357883, + "flos": 18288191220480.0, + "grad_norm": 84.89166911499264, + "language_loss": 0.99883854, + "learning_rate": 3.98075354481122e-06, + "loss": 1.01962042, + "num_input_tokens_seen": 25803620, + "router_z_loss_clip": 3.84765625, + "router_z_loss_mlp": 0.41333008, + "step": 1212, + "time_per_iteration": 2.6248910427093506 + }, + { + "auxiliary_loss_clip": 0.01754823, + "auxiliary_loss_mlp": 0.00306164, + "balance_loss_clip": 1.37429309, + "balance_loss_mlp": 0.26730192, + "epoch": 0.07292950548624681, + "flos": 21214803646080.0, + "grad_norm": 96.34855686279157, + "language_loss": 0.80296028, + "learning_rate": 3.9806996067605055e-06, + "loss": 0.82357013, + "num_input_tokens_seen": 25823315, + "router_z_loss_clip": 3.80859375, + "router_z_loss_mlp": 0.38842773, + "step": 1213, + "time_per_iteration": 2.636127471923828 + }, + { + "auxiliary_loss_clip": 0.01730965, + "auxiliary_loss_mlp": 0.00298987, + "balance_loss_clip": 1.35703671, + "balance_loss_mlp": 0.26064903, + "epoch": 0.07298962873891478, + "flos": 24642009964800.0, + "grad_norm": 4.9462898932934145, + "language_loss": 0.91152471, + "learning_rate": 3.980645593601465e-06, + "loss": 0.93182421, + "num_input_tokens_seen": 25842605, + "router_z_loss_clip": 3.7421875, + "router_z_loss_mlp": 0.38378906, + "step": 1214, + "time_per_iteration": 2.6319620609283447 + }, + { + "auxiliary_loss_clip": 0.01729426, + "auxiliary_loss_mlp": 0.00311594, + "balance_loss_clip": 1.35041261, + "balance_loss_mlp": 0.27359009, + "epoch": 0.07304975199158274, + "flos": 27052765217280.0, + "grad_norm": 26.378948035232142, + "language_loss": 0.92744732, + "learning_rate": 3.980591505336144e-06, + "loss": 0.9478575, + "num_input_tokens_seen": 25863030, + "router_z_loss_clip": 3.79296875, + "router_z_loss_mlp": 0.37988281, + "step": 1215, + "time_per_iteration": 2.664077043533325 + }, + { + "auxiliary_loss_clip": 0.0171555, + "auxiliary_loss_mlp": 0.00323434, + "balance_loss_clip": 1.34287202, + "balance_loss_mlp": 0.28199649, + "epoch": 0.07310987524425071, + "flos": 33549544091520.0, + "grad_norm": 280.32436371514217, + "language_loss": 0.8826263, + "learning_rate": 3.980537341966595e-06, + "loss": 0.90301609, + "num_input_tokens_seen": 25888015, + "router_z_loss_clip": 3.72460938, + "router_z_loss_mlp": 0.41430664, + "step": 1216, + "time_per_iteration": 2.767228603363037 + }, + { + "auxiliary_loss_clip": 0.01692427, + "auxiliary_loss_mlp": 0.00284548, + "balance_loss_clip": 1.33269238, + "balance_loss_mlp": 0.24685387, + "epoch": 0.07316999849691869, + "flos": 28110944908800.0, + "grad_norm": 8.511851482279422, + "language_loss": 0.85677612, + "learning_rate": 3.980483103494872e-06, + "loss": 0.87654591, + "num_input_tokens_seen": 25908660, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 0.37719727, + "step": 1217, + "time_per_iteration": 2.6749603748321533 + }, + { + "auxiliary_loss_clip": 0.0168151, + "auxiliary_loss_mlp": 0.00283002, + "balance_loss_clip": 1.32201362, + "balance_loss_mlp": 0.2453322, + "epoch": 0.07323012174958665, + "flos": 14392602529920.0, + "grad_norm": 236.37909431363445, + "language_loss": 0.93766129, + "learning_rate": 3.98042878992303e-06, + "loss": 0.95730639, + "num_input_tokens_seen": 25927215, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 0.37670898, + "step": 1218, + "time_per_iteration": 2.6020755767822266 + }, + { + "auxiliary_loss_clip": 0.01671514, + "auxiliary_loss_mlp": 0.00294243, + "balance_loss_clip": 1.31294298, + "balance_loss_mlp": 0.25542864, + "epoch": 0.07329024500225462, + "flos": 21616428591360.0, + "grad_norm": 6.493228085037835, + "language_loss": 0.94410551, + "learning_rate": 3.9803744012531305e-06, + "loss": 0.96376312, + "num_input_tokens_seen": 25945500, + "router_z_loss_clip": 3.58789062, + "router_z_loss_mlp": 0.38818359, + "step": 1219, + "time_per_iteration": 2.6060001850128174 + }, + { + "auxiliary_loss_clip": 0.01658501, + "auxiliary_loss_mlp": 0.00288324, + "balance_loss_clip": 1.30809283, + "balance_loss_mlp": 0.24996282, + "epoch": 0.0733503682549226, + "flos": 13224141106560.0, + "grad_norm": 116.40844388325014, + "language_loss": 0.93935961, + "learning_rate": 3.980319937487235e-06, + "loss": 0.95882785, + "num_input_tokens_seen": 25963105, + "router_z_loss_clip": 3.50195312, + "router_z_loss_mlp": 0.38378906, + "step": 1220, + "time_per_iteration": 2.634206771850586 + }, + { + "auxiliary_loss_clip": 0.01658668, + "auxiliary_loss_mlp": 0.0029836, + "balance_loss_clip": 1.30339968, + "balance_loss_mlp": 0.25763839, + "epoch": 0.07341049150759056, + "flos": 20886975192960.0, + "grad_norm": 49.95680182421531, + "language_loss": 0.8629232, + "learning_rate": 3.98026539862741e-06, + "loss": 0.8824935, + "num_input_tokens_seen": 25981690, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 0.40698242, + "step": 1221, + "time_per_iteration": 2.5993690490722656 + }, + { + "auxiliary_loss_clip": 0.01655561, + "auxiliary_loss_mlp": 0.0029513, + "balance_loss_clip": 1.30015111, + "balance_loss_mlp": 0.25512385, + "epoch": 0.07347061476025853, + "flos": 15413614623360.0, + "grad_norm": 19.204668160790856, + "language_loss": 1.00942874, + "learning_rate": 3.980210784675722e-06, + "loss": 1.02893555, + "num_input_tokens_seen": 25999890, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 0.40039062, + "step": 1222, + "time_per_iteration": 2.6222894191741943 + }, + { + "auxiliary_loss_clip": 0.01657953, + "auxiliary_loss_mlp": 0.00263511, + "balance_loss_clip": 1.29812622, + "balance_loss_mlp": 0.22574517, + "epoch": 0.0735307380129265, + "flos": 11108859131520.0, + "grad_norm": 6.166627177130085, + "language_loss": 0.99910802, + "learning_rate": 3.980156095634242e-06, + "loss": 1.01832271, + "num_input_tokens_seen": 26016445, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 0.37768555, + "step": 1223, + "time_per_iteration": 2.863445281982422 + }, + { + "auxiliary_loss_clip": 0.01660513, + "auxiliary_loss_mlp": 0.00258814, + "balance_loss_clip": 1.30219626, + "balance_loss_mlp": 0.22057119, + "epoch": 0.07359086126559447, + "flos": 23732392924800.0, + "grad_norm": 85.62252114613987, + "language_loss": 0.91951895, + "learning_rate": 3.980101331505045e-06, + "loss": 0.93871218, + "num_input_tokens_seen": 26036080, + "router_z_loss_clip": 3.5859375, + "router_z_loss_mlp": 0.38256836, + "step": 1224, + "time_per_iteration": 2.7737972736358643 + }, + { + "auxiliary_loss_clip": 0.01687969, + "auxiliary_loss_mlp": 0.00290875, + "balance_loss_clip": 1.31576633, + "balance_loss_mlp": 0.24841282, + "epoch": 0.07365098451826244, + "flos": 20993270515200.0, + "grad_norm": 2.8043529514526355, + "language_loss": 0.90741557, + "learning_rate": 3.9800464922902076e-06, + "loss": 0.92720401, + "num_input_tokens_seen": 26055805, + "router_z_loss_clip": 3.71875, + "router_z_loss_mlp": 0.42456055, + "step": 1225, + "time_per_iteration": 2.6662776470184326 + }, + { + "auxiliary_loss_clip": 0.01702346, + "auxiliary_loss_mlp": 0.00316492, + "balance_loss_clip": 1.32743192, + "balance_loss_mlp": 0.27712899, + "epoch": 0.0737111077709304, + "flos": 19933582452480.0, + "grad_norm": 13.040113648083024, + "language_loss": 0.96725172, + "learning_rate": 3.979991577991808e-06, + "loss": 0.98744011, + "num_input_tokens_seen": 26073905, + "router_z_loss_clip": 3.75, + "router_z_loss_mlp": 0.39355469, + "step": 1226, + "time_per_iteration": 2.6351284980773926 + }, + { + "auxiliary_loss_clip": 0.01723308, + "auxiliary_loss_mlp": 0.0026638, + "balance_loss_clip": 1.33294439, + "balance_loss_mlp": 0.22603986, + "epoch": 0.07377123102359838, + "flos": 16581537342720.0, + "grad_norm": 13.750423025944169, + "language_loss": 0.90023571, + "learning_rate": 3.97993658861193e-06, + "loss": 0.92013264, + "num_input_tokens_seen": 26091700, + "router_z_loss_clip": 3.90234375, + "router_z_loss_mlp": 0.40356445, + "step": 1227, + "time_per_iteration": 2.5911548137664795 + }, + { + "auxiliary_loss_clip": 0.01722762, + "auxiliary_loss_mlp": 0.00251349, + "balance_loss_clip": 1.33773184, + "balance_loss_mlp": 0.21105593, + "epoch": 0.07383135427626634, + "flos": 28328563457280.0, + "grad_norm": 8.900785442979558, + "language_loss": 0.9091475, + "learning_rate": 3.9798815241526575e-06, + "loss": 0.92888862, + "num_input_tokens_seen": 26114105, + "router_z_loss_clip": 3.84960938, + "router_z_loss_mlp": 0.40283203, + "step": 1228, + "time_per_iteration": 2.7142958641052246 + }, + { + "auxiliary_loss_clip": 0.01770463, + "auxiliary_loss_mlp": 0.00267699, + "balance_loss_clip": 1.36320162, + "balance_loss_mlp": 0.227001, + "epoch": 0.07389147752893431, + "flos": 20047168235520.0, + "grad_norm": 31.11147435652375, + "language_loss": 0.87942225, + "learning_rate": 3.97982638461608e-06, + "loss": 0.89980388, + "num_input_tokens_seen": 26131165, + "router_z_loss_clip": 4.06640625, + "router_z_loss_mlp": 0.40722656, + "step": 1229, + "time_per_iteration": 2.6391875743865967 + }, + { + "auxiliary_loss_clip": 0.01792303, + "auxiliary_loss_mlp": 0.0030875, + "balance_loss_clip": 1.37106538, + "balance_loss_mlp": 0.26497626, + "epoch": 0.07395160078160229, + "flos": 18114132890880.0, + "grad_norm": 15.452870379079627, + "language_loss": 0.87003779, + "learning_rate": 3.979771170004287e-06, + "loss": 0.89104831, + "num_input_tokens_seen": 26150040, + "router_z_loss_clip": 4.2109375, + "router_z_loss_mlp": 0.43774414, + "step": 1230, + "time_per_iteration": 2.607898235321045 + }, + { + "auxiliary_loss_clip": 0.01844256, + "auxiliary_loss_mlp": 0.0033389, + "balance_loss_clip": 1.40002084, + "balance_loss_mlp": 0.290999, + "epoch": 0.07401172403427025, + "flos": 23586918842880.0, + "grad_norm": 2056.2328742958885, + "language_loss": 0.8793844, + "learning_rate": 3.979715880319372e-06, + "loss": 0.90116584, + "num_input_tokens_seen": 26169380, + "router_z_loss_clip": 4.43359375, + "router_z_loss_mlp": 0.42871094, + "step": 1231, + "time_per_iteration": 2.64737606048584 + }, + { + "auxiliary_loss_clip": 0.01850298, + "auxiliary_loss_mlp": 0.00339964, + "balance_loss_clip": 1.39627111, + "balance_loss_mlp": 0.30024299, + "epoch": 0.07407184728693822, + "flos": 26359904799360.0, + "grad_norm": 30.354822010416864, + "language_loss": 1.02926707, + "learning_rate": 3.979660515563434e-06, + "loss": 1.05116963, + "num_input_tokens_seen": 26189420, + "router_z_loss_clip": 4.54296875, + "router_z_loss_mlp": 0.39746094, + "step": 1232, + "time_per_iteration": 2.6508588790893555 + }, + { + "auxiliary_loss_clip": 0.01884588, + "auxiliary_loss_mlp": 0.00391555, + "balance_loss_clip": 1.41314435, + "balance_loss_mlp": 0.35119033, + "epoch": 0.0741319705396062, + "flos": 22200443821440.0, + "grad_norm": 32.68244041767896, + "language_loss": 0.87297297, + "learning_rate": 3.979605075738569e-06, + "loss": 0.89573443, + "num_input_tokens_seen": 26209300, + "router_z_loss_clip": 4.71484375, + "router_z_loss_mlp": 0.40380859, + "step": 1233, + "time_per_iteration": 2.636453628540039 + }, + { + "auxiliary_loss_clip": 0.01924915, + "auxiliary_loss_mlp": 0.00457656, + "balance_loss_clip": 1.43229795, + "balance_loss_mlp": 0.4144786, + "epoch": 0.07419209379227416, + "flos": 39200482523520.0, + "grad_norm": 12.729354984209062, + "language_loss": 0.77940959, + "learning_rate": 3.979549560846883e-06, + "loss": 0.80323529, + "num_input_tokens_seen": 26228110, + "router_z_loss_clip": 4.92578125, + "router_z_loss_mlp": 0.43164062, + "step": 1234, + "time_per_iteration": 2.7591991424560547 + }, + { + "auxiliary_loss_clip": 0.01948153, + "auxiliary_loss_mlp": 0.00442524, + "balance_loss_clip": 1.44320703, + "balance_loss_mlp": 0.4005383, + "epoch": 0.07425221704494213, + "flos": 22781657790720.0, + "grad_norm": 175.93980104519426, + "language_loss": 0.84322345, + "learning_rate": 3.979493970890478e-06, + "loss": 0.86713016, + "num_input_tokens_seen": 26247020, + "router_z_loss_clip": 5.0546875, + "router_z_loss_mlp": 0.41967773, + "step": 1235, + "time_per_iteration": 2.6482746601104736 + }, + { + "auxiliary_loss_clip": 0.01983477, + "auxiliary_loss_mlp": 0.00442699, + "balance_loss_clip": 1.45647144, + "balance_loss_mlp": 0.40157139, + "epoch": 0.0743123402976101, + "flos": 22272983337600.0, + "grad_norm": 11.588088671415957, + "language_loss": 0.88405347, + "learning_rate": 3.979438305871464e-06, + "loss": 0.9083153, + "num_input_tokens_seen": 26265750, + "router_z_loss_clip": 5.265625, + "router_z_loss_mlp": 0.41113281, + "step": 1236, + "time_per_iteration": 2.6108901500701904 + }, + { + "auxiliary_loss_clip": 0.01998804, + "auxiliary_loss_mlp": 0.00545399, + "balance_loss_clip": 1.45789552, + "balance_loss_mlp": 0.49771485, + "epoch": 0.07437246355027807, + "flos": 29315029645440.0, + "grad_norm": 5.045829172421548, + "language_loss": 0.84263647, + "learning_rate": 3.979382565791951e-06, + "loss": 0.86807847, + "num_input_tokens_seen": 26287905, + "router_z_loss_clip": 5.41015625, + "router_z_loss_mlp": 0.47631836, + "step": 1237, + "time_per_iteration": 2.687476873397827 + }, + { + "auxiliary_loss_clip": 0.02015388, + "auxiliary_loss_mlp": 0.00473895, + "balance_loss_clip": 1.45919621, + "balance_loss_mlp": 0.43064633, + "epoch": 0.07443258680294604, + "flos": 31944732249600.0, + "grad_norm": 9.555855448702177, + "language_loss": 0.82000387, + "learning_rate": 3.979326750654053e-06, + "loss": 0.84489673, + "num_input_tokens_seen": 26311795, + "router_z_loss_clip": 5.5546875, + "router_z_loss_mlp": 0.43237305, + "step": 1238, + "time_per_iteration": 2.7219672203063965 + }, + { + "auxiliary_loss_clip": 0.02051824, + "auxiliary_loss_mlp": 0.00531211, + "balance_loss_clip": 1.48416519, + "balance_loss_mlp": 0.48476708, + "epoch": 0.074492710055614, + "flos": 22675290641280.0, + "grad_norm": 24.192768947576294, + "language_loss": 0.92108226, + "learning_rate": 3.9792708604598854e-06, + "loss": 0.94691265, + "num_input_tokens_seen": 26330330, + "router_z_loss_clip": 5.67578125, + "router_z_loss_mlp": 0.46386719, + "step": 1239, + "time_per_iteration": 2.643127918243408 + }, + { + "auxiliary_loss_clip": 0.02057622, + "auxiliary_loss_mlp": 0.00512899, + "balance_loss_clip": 1.47692084, + "balance_loss_mlp": 0.46683636, + "epoch": 0.07455283330828198, + "flos": 21284901037440.0, + "grad_norm": 75.01082286181317, + "language_loss": 0.95546561, + "learning_rate": 3.979214895211569e-06, + "loss": 0.98117089, + "num_input_tokens_seen": 26348865, + "router_z_loss_clip": 5.80859375, + "router_z_loss_mlp": 0.46069336, + "step": 1240, + "time_per_iteration": 2.5958359241485596 + }, + { + "auxiliary_loss_clip": 0.02087433, + "auxiliary_loss_mlp": 0.00565152, + "balance_loss_clip": 1.49864912, + "balance_loss_mlp": 0.517111, + "epoch": 0.07461295656094995, + "flos": 24388408967040.0, + "grad_norm": 29.334731538265935, + "language_loss": 0.95643449, + "learning_rate": 3.979158854911225e-06, + "loss": 0.98296034, + "num_input_tokens_seen": 26368210, + "router_z_loss_clip": 5.88671875, + "router_z_loss_mlp": 0.47998047, + "step": 1241, + "time_per_iteration": 2.682462215423584 + }, + { + "auxiliary_loss_clip": 0.02123784, + "auxiliary_loss_mlp": 0.00233252, + "balance_loss_clip": 1.59085, + "balance_loss_mlp": 0.20521365, + "epoch": 0.07467307981361791, + "flos": 62109660574080.0, + "grad_norm": 0.9188999348400004, + "language_loss": 0.63106978, + "learning_rate": 3.979102739560979e-06, + "loss": 0.6546402, + "num_input_tokens_seen": 26424890, + "router_z_loss_clip": 5.3125, + "router_z_loss_mlp": 0.28125, + "step": 1242, + "time_per_iteration": 4.556946039199829 + }, + { + "auxiliary_loss_clip": 0.02083826, + "auxiliary_loss_mlp": 0.00544255, + "balance_loss_clip": 1.48711562, + "balance_loss_mlp": 0.4930191, + "epoch": 0.07473320306628589, + "flos": 24863148046080.0, + "grad_norm": 16.37931004705845, + "language_loss": 0.74394864, + "learning_rate": 3.9790465491629595e-06, + "loss": 0.77022946, + "num_input_tokens_seen": 26446405, + "router_z_loss_clip": 5.97265625, + "router_z_loss_mlp": 0.51196289, + "step": 1243, + "time_per_iteration": 4.109212160110474 + }, + { + "auxiliary_loss_clip": 0.02072201, + "auxiliary_loss_mlp": 0.00498868, + "balance_loss_clip": 1.48642945, + "balance_loss_mlp": 0.45609522, + "epoch": 0.07479332631895386, + "flos": 24897442556160.0, + "grad_norm": 4.406390036972256, + "language_loss": 0.81601471, + "learning_rate": 3.978990283719296e-06, + "loss": 0.84172535, + "num_input_tokens_seen": 26466070, + "router_z_loss_clip": 5.86328125, + "router_z_loss_mlp": 0.42797852, + "step": 1244, + "time_per_iteration": 2.686453342437744 + }, + { + "auxiliary_loss_clip": 0.02100165, + "auxiliary_loss_mlp": 0.00469691, + "balance_loss_clip": 1.50296664, + "balance_loss_mlp": 0.42739576, + "epoch": 0.07485344957162182, + "flos": 17815247821440.0, + "grad_norm": 8.287687264290682, + "language_loss": 0.77038962, + "learning_rate": 3.978933943232123e-06, + "loss": 0.79608822, + "num_input_tokens_seen": 26479350, + "router_z_loss_clip": 5.96875, + "router_z_loss_mlp": 0.42260742, + "step": 1245, + "time_per_iteration": 2.6932711601257324 + }, + { + "auxiliary_loss_clip": 0.02083214, + "auxiliary_loss_mlp": 0.00511406, + "balance_loss_clip": 1.501302, + "balance_loss_mlp": 0.46632069, + "epoch": 0.0749135728242898, + "flos": 25010202326400.0, + "grad_norm": 5.026412155457402, + "language_loss": 0.94894791, + "learning_rate": 3.978877527703576e-06, + "loss": 0.97489411, + "num_input_tokens_seen": 26498255, + "router_z_loss_clip": 5.82421875, + "router_z_loss_mlp": 0.45092773, + "step": 1246, + "time_per_iteration": 4.063573360443115 + }, + { + "auxiliary_loss_clip": 0.02061325, + "auxiliary_loss_mlp": 0.00519109, + "balance_loss_clip": 1.48485541, + "balance_loss_mlp": 0.4711864, + "epoch": 0.07497369607695777, + "flos": 17822071405440.0, + "grad_norm": 6.11090591072476, + "language_loss": 0.9845742, + "learning_rate": 3.9788210371357945e-06, + "loss": 1.01037848, + "num_input_tokens_seen": 26515375, + "router_z_loss_clip": 5.76171875, + "router_z_loss_mlp": 0.47949219, + "step": 1247, + "time_per_iteration": 2.6021323204040527 + }, + { + "auxiliary_loss_clip": 0.0206292, + "auxiliary_loss_mlp": 0.00492644, + "balance_loss_clip": 1.5020957, + "balance_loss_mlp": 0.44648594, + "epoch": 0.07503381932962573, + "flos": 15121086261120.0, + "grad_norm": 4.85421606279229, + "language_loss": 0.71860194, + "learning_rate": 3.978764471530921e-06, + "loss": 0.74415761, + "num_input_tokens_seen": 26533595, + "router_z_loss_clip": 5.6171875, + "router_z_loss_mlp": 0.46191406, + "step": 1248, + "time_per_iteration": 4.044569492340088 + }, + { + "auxiliary_loss_clip": 0.02053743, + "auxiliary_loss_mlp": 0.00479746, + "balance_loss_clip": 1.50054634, + "balance_loss_mlp": 0.43892905, + "epoch": 0.0750939425822937, + "flos": 12816734071680.0, + "grad_norm": 31.157364854126744, + "language_loss": 0.82510203, + "learning_rate": 3.978707830891102e-06, + "loss": 0.85043693, + "num_input_tokens_seen": 26549405, + "router_z_loss_clip": 5.53515625, + "router_z_loss_mlp": 0.40820312, + "step": 1249, + "time_per_iteration": 2.6047630310058594 + }, + { + "auxiliary_loss_clip": 0.02062647, + "auxiliary_loss_mlp": 0.0048002, + "balance_loss_clip": 1.50360537, + "balance_loss_mlp": 0.43362403, + "epoch": 0.07515406583496168, + "flos": 24206844695040.0, + "grad_norm": 12.62034212670182, + "language_loss": 0.90865469, + "learning_rate": 3.978651115218482e-06, + "loss": 0.93408138, + "num_input_tokens_seen": 26567200, + "router_z_loss_clip": 5.5859375, + "router_z_loss_mlp": 0.46362305, + "step": 1250, + "time_per_iteration": 2.6986653804779053 + }, + { + "auxiliary_loss_clip": 0.02021262, + "auxiliary_loss_mlp": 0.00482564, + "balance_loss_clip": 1.49205637, + "balance_loss_mlp": 0.43623954, + "epoch": 0.07521418908762964, + "flos": 26688164215680.0, + "grad_norm": 180.82353292519102, + "language_loss": 0.73793209, + "learning_rate": 3.978594324515215e-06, + "loss": 0.76297039, + "num_input_tokens_seen": 26586190, + "router_z_loss_clip": 5.2890625, + "router_z_loss_mlp": 0.46362305, + "step": 1251, + "time_per_iteration": 2.650257110595703 + }, + { + "auxiliary_loss_clip": 0.01939101, + "auxiliary_loss_mlp": 0.00090447, + "balance_loss_clip": 1.57252979, + "balance_loss_mlp": 0.07280371, + "epoch": 0.0752743123402976, + "flos": 59095140589440.0, + "grad_norm": 0.9407650897128139, + "language_loss": 0.70348328, + "learning_rate": 3.9785374587834515e-06, + "loss": 0.72377872, + "num_input_tokens_seen": 26650710, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 0.17675781, + "step": 1252, + "time_per_iteration": 3.1591384410858154 + }, + { + "auxiliary_loss_clip": 0.01985061, + "auxiliary_loss_mlp": 0.00428676, + "balance_loss_clip": 1.47543526, + "balance_loss_mlp": 0.38361502, + "epoch": 0.07533443559296558, + "flos": 23477032160640.0, + "grad_norm": 20.758675179068167, + "language_loss": 0.85778308, + "learning_rate": 3.97848051802535e-06, + "loss": 0.88192046, + "num_input_tokens_seen": 26669000, + "router_z_loss_clip": 5.09375, + "router_z_loss_mlp": 0.45043945, + "step": 1253, + "time_per_iteration": 2.6204352378845215 + }, + { + "auxiliary_loss_clip": 0.01989333, + "auxiliary_loss_mlp": 0.00430266, + "balance_loss_clip": 1.48151541, + "balance_loss_mlp": 0.38675439, + "epoch": 0.07539455884563355, + "flos": 20879110114560.0, + "grad_norm": 307.95804209728215, + "language_loss": 1.04643893, + "learning_rate": 3.978423502243069e-06, + "loss": 1.07063484, + "num_input_tokens_seen": 26683075, + "router_z_loss_clip": 5.0859375, + "router_z_loss_mlp": 0.43530273, + "step": 1254, + "time_per_iteration": 2.5903306007385254 + }, + { + "auxiliary_loss_clip": 0.01962063, + "auxiliary_loss_mlp": 0.00445455, + "balance_loss_clip": 1.46999669, + "balance_loss_mlp": 0.3995592, + "epoch": 0.07545468209830151, + "flos": 27672906551040.0, + "grad_norm": 122.40374423241059, + "language_loss": 0.93612808, + "learning_rate": 3.97836641143877e-06, + "loss": 0.96020317, + "num_input_tokens_seen": 26701875, + "router_z_loss_clip": 4.9140625, + "router_z_loss_mlp": 0.45947266, + "step": 1255, + "time_per_iteration": 2.7531492710113525 + }, + { + "auxiliary_loss_clip": 0.01951354, + "auxiliary_loss_mlp": 0.0039811, + "balance_loss_clip": 1.46657276, + "balance_loss_mlp": 0.35593367, + "epoch": 0.0755148053509695, + "flos": 14136990370560.0, + "grad_norm": 132.4509159294308, + "language_loss": 0.86846274, + "learning_rate": 3.978309245614618e-06, + "loss": 0.8919574, + "num_input_tokens_seen": 26719050, + "router_z_loss_clip": 4.8515625, + "router_z_loss_mlp": 0.42163086, + "step": 1256, + "time_per_iteration": 2.693870782852173 + }, + { + "auxiliary_loss_clip": 0.01843558, + "auxiliary_loss_mlp": 0.00099102, + "balance_loss_clip": 1.50573826, + "balance_loss_mlp": 0.0853695, + "epoch": 0.07557492860363746, + "flos": 58235257929600.0, + "grad_norm": 0.7789917263767719, + "language_loss": 0.57786608, + "learning_rate": 3.9782520047727825e-06, + "loss": 0.59729266, + "num_input_tokens_seen": 26780650, + "router_z_loss_clip": 3.375, + "router_z_loss_mlp": 0.13769531, + "step": 1257, + "time_per_iteration": 3.2493221759796143 + }, + { + "auxiliary_loss_clip": 0.01935169, + "auxiliary_loss_mlp": 0.00411377, + "balance_loss_clip": 1.4522388, + "balance_loss_mlp": 0.36734146, + "epoch": 0.07563505185630542, + "flos": 24644380262400.0, + "grad_norm": 28.439571061385895, + "language_loss": 0.96720612, + "learning_rate": 3.978194688915432e-06, + "loss": 0.99067158, + "num_input_tokens_seen": 26798725, + "router_z_loss_clip": 4.8359375, + "router_z_loss_mlp": 0.44018555, + "step": 1258, + "time_per_iteration": 2.674708604812622 + }, + { + "auxiliary_loss_clip": 0.01879693, + "auxiliary_loss_mlp": 0.00387259, + "balance_loss_clip": 1.42166996, + "balance_loss_mlp": 0.34834915, + "epoch": 0.07569517510897339, + "flos": 15522998515200.0, + "grad_norm": 33.760609803398125, + "language_loss": 0.87107396, + "learning_rate": 3.978137298044741e-06, + "loss": 0.89374346, + "num_input_tokens_seen": 26817005, + "router_z_loss_clip": 4.58203125, + "router_z_loss_mlp": 0.38891602, + "step": 1259, + "time_per_iteration": 2.6118059158325195 + }, + { + "auxiliary_loss_clip": 0.018978, + "auxiliary_loss_mlp": 0.00384913, + "balance_loss_clip": 1.43256068, + "balance_loss_mlp": 0.34583625, + "epoch": 0.07575529836164137, + "flos": 22928532503040.0, + "grad_norm": 6.334528540629632, + "language_loss": 0.81544089, + "learning_rate": 3.978079832162885e-06, + "loss": 0.83826804, + "num_input_tokens_seen": 26836655, + "router_z_loss_clip": 4.65234375, + "router_z_loss_mlp": 0.39086914, + "step": 1260, + "time_per_iteration": 2.6686007976531982 + }, + { + "auxiliary_loss_clip": 0.0186867, + "auxiliary_loss_mlp": 0.00409607, + "balance_loss_clip": 1.42425132, + "balance_loss_mlp": 0.36793119, + "epoch": 0.07581542161430933, + "flos": 19500428344320.0, + "grad_norm": 19282.728988933814, + "language_loss": 0.91563696, + "learning_rate": 3.978022291272044e-06, + "loss": 0.9384197, + "num_input_tokens_seen": 26854925, + "router_z_loss_clip": 4.44140625, + "router_z_loss_mlp": 0.41699219, + "step": 1261, + "time_per_iteration": 2.629772663116455 + }, + { + "auxiliary_loss_clip": 0.0185738, + "auxiliary_loss_mlp": 0.00406583, + "balance_loss_clip": 1.41467881, + "balance_loss_mlp": 0.36011517, + "epoch": 0.0758755448669773, + "flos": 24973465691520.0, + "grad_norm": 111.3396820062762, + "language_loss": 0.87286377, + "learning_rate": 3.977964675374399e-06, + "loss": 0.89550334, + "num_input_tokens_seen": 26876170, + "router_z_loss_clip": 4.421875, + "router_z_loss_mlp": 0.46459961, + "step": 1262, + "time_per_iteration": 2.732150077819824 + }, + { + "auxiliary_loss_clip": 0.01815647, + "auxiliary_loss_mlp": 0.00428914, + "balance_loss_clip": 1.38262391, + "balance_loss_mlp": 0.38399595, + "epoch": 0.07593566811964528, + "flos": 22747973811840.0, + "grad_norm": 22.509841609017162, + "language_loss": 0.90741086, + "learning_rate": 3.977906984472136e-06, + "loss": 0.92985654, + "num_input_tokens_seen": 26895005, + "router_z_loss_clip": 4.328125, + "router_z_loss_mlp": 0.44897461, + "step": 1263, + "time_per_iteration": 2.649317979812622 + }, + { + "auxiliary_loss_clip": 0.01811943, + "auxiliary_loss_mlp": 0.00407305, + "balance_loss_clip": 1.37618589, + "balance_loss_mlp": 0.36374646, + "epoch": 0.07599579137231324, + "flos": 23112395245440.0, + "grad_norm": 15.581912491732353, + "language_loss": 0.81774187, + "learning_rate": 3.977849218567442e-06, + "loss": 0.83993435, + "num_input_tokens_seen": 26913930, + "router_z_loss_clip": 4.359375, + "router_z_loss_mlp": 0.43579102, + "step": 1264, + "time_per_iteration": 2.665125846862793 + }, + { + "auxiliary_loss_clip": 0.01773768, + "auxiliary_loss_mlp": 0.00387493, + "balance_loss_clip": 1.35008764, + "balance_loss_mlp": 0.34543562, + "epoch": 0.07605591462498121, + "flos": 14502058248960.0, + "grad_norm": 9.018273664800596, + "language_loss": 0.92410636, + "learning_rate": 3.977791377662507e-06, + "loss": 0.94571888, + "num_input_tokens_seen": 26931485, + "router_z_loss_clip": 4.24609375, + "router_z_loss_mlp": 0.42041016, + "step": 1265, + "time_per_iteration": 2.584578037261963 + }, + { + "auxiliary_loss_clip": 0.01775309, + "auxiliary_loss_mlp": 0.00414332, + "balance_loss_clip": 1.34737098, + "balance_loss_mlp": 0.37148857, + "epoch": 0.07611603787764919, + "flos": 23514199758720.0, + "grad_norm": 207.75085167469493, + "language_loss": 0.72564304, + "learning_rate": 3.977733461759524e-06, + "loss": 0.74753952, + "num_input_tokens_seen": 26951670, + "router_z_loss_clip": 4.28125, + "router_z_loss_mlp": 0.42822266, + "step": 1266, + "time_per_iteration": 2.727354049682617 + }, + { + "auxiliary_loss_clip": 0.01745726, + "auxiliary_loss_mlp": 0.0039794, + "balance_loss_clip": 1.32969236, + "balance_loss_mlp": 0.35700351, + "epoch": 0.07617616113031715, + "flos": 21507188353920.0, + "grad_norm": 91.4630059630357, + "language_loss": 0.9122858, + "learning_rate": 3.977675470860691e-06, + "loss": 0.93372244, + "num_input_tokens_seen": 26970335, + "router_z_loss_clip": 4.1640625, + "router_z_loss_mlp": 0.40917969, + "step": 1267, + "time_per_iteration": 2.6626882553100586 + }, + { + "auxiliary_loss_clip": 0.01732228, + "auxiliary_loss_mlp": 0.00398932, + "balance_loss_clip": 1.32293785, + "balance_loss_mlp": 0.36018917, + "epoch": 0.07623628438298512, + "flos": 14573161221120.0, + "grad_norm": 33.55488548490917, + "language_loss": 0.79705495, + "learning_rate": 3.977617404968205e-06, + "loss": 0.81836653, + "num_input_tokens_seen": 26986025, + "router_z_loss_clip": 4.09179688, + "router_z_loss_mlp": 0.38745117, + "step": 1268, + "time_per_iteration": 2.6123392581939697 + }, + { + "auxiliary_loss_clip": 0.01741528, + "auxiliary_loss_mlp": 0.00374701, + "balance_loss_clip": 1.32704854, + "balance_loss_mlp": 0.33652985, + "epoch": 0.07629640763565308, + "flos": 14720395069440.0, + "grad_norm": 70.11496768498584, + "language_loss": 0.89901024, + "learning_rate": 3.977559264084269e-06, + "loss": 0.92017251, + "num_input_tokens_seen": 27004045, + "router_z_loss_clip": 4.14453125, + "router_z_loss_mlp": 0.3815918, + "step": 1269, + "time_per_iteration": 2.6489317417144775 + }, + { + "auxiliary_loss_clip": 0.01727575, + "auxiliary_loss_mlp": 0.00397976, + "balance_loss_clip": 1.3155452, + "balance_loss_mlp": 0.35820818, + "epoch": 0.07635653088832106, + "flos": 14902929008640.0, + "grad_norm": 19.570112889270128, + "language_loss": 0.97329545, + "learning_rate": 3.977501048211088e-06, + "loss": 0.99455094, + "num_input_tokens_seen": 27022070, + "router_z_loss_clip": 4.11328125, + "router_z_loss_mlp": 0.3972168, + "step": 1270, + "time_per_iteration": 2.7451910972595215 + }, + { + "auxiliary_loss_clip": 0.01732058, + "auxiliary_loss_mlp": 0.00431138, + "balance_loss_clip": 1.31979036, + "balance_loss_mlp": 0.38533753, + "epoch": 0.07641665414098903, + "flos": 26651571235200.0, + "grad_norm": 8.923158329237609, + "language_loss": 0.78309023, + "learning_rate": 3.977442757350869e-06, + "loss": 0.80472219, + "num_input_tokens_seen": 27041755, + "router_z_loss_clip": 4.12109375, + "router_z_loss_mlp": 0.45800781, + "step": 1271, + "time_per_iteration": 2.65997052192688 + }, + { + "auxiliary_loss_clip": 0.01697876, + "auxiliary_loss_mlp": 0.00344314, + "balance_loss_clip": 1.30313241, + "balance_loss_mlp": 0.30919528, + "epoch": 0.07647677739365699, + "flos": 25192808092800.0, + "grad_norm": 4.737226967339089, + "language_loss": 0.88084912, + "learning_rate": 3.977384391505823e-06, + "loss": 0.90127099, + "num_input_tokens_seen": 27061540, + "router_z_loss_clip": 3.94726562, + "router_z_loss_mlp": 0.35107422, + "step": 1272, + "time_per_iteration": 2.7631027698516846 + }, + { + "auxiliary_loss_clip": 0.01673589, + "auxiliary_loss_mlp": 0.00309038, + "balance_loss_clip": 1.27907014, + "balance_loss_mlp": 0.27580267, + "epoch": 0.07653690064632497, + "flos": 20558141159040.0, + "grad_norm": 4.346671735743657, + "language_loss": 0.88363796, + "learning_rate": 3.977325950678162e-06, + "loss": 0.90346426, + "num_input_tokens_seen": 27081395, + "router_z_loss_clip": 3.94921875, + "router_z_loss_mlp": 0.33251953, + "step": 1273, + "time_per_iteration": 2.6227657794952393 + }, + { + "auxiliary_loss_clip": 0.01669931, + "auxiliary_loss_mlp": 0.00339851, + "balance_loss_clip": 1.2711041, + "balance_loss_mlp": 0.30153733, + "epoch": 0.07659702389899294, + "flos": 22269320150400.0, + "grad_norm": 5.044524737465733, + "language_loss": 0.87016308, + "learning_rate": 3.977267434870103e-06, + "loss": 0.89026093, + "num_input_tokens_seen": 27101175, + "router_z_loss_clip": 3.9921875, + "router_z_loss_mlp": 0.3828125, + "step": 1274, + "time_per_iteration": 2.651512861251831 + }, + { + "auxiliary_loss_clip": 0.01646599, + "auxiliary_loss_mlp": 0.00305376, + "balance_loss_clip": 1.25978887, + "balance_loss_mlp": 0.27161574, + "epoch": 0.0766571471516609, + "flos": 32636120209920.0, + "grad_norm": 14.363001335113136, + "language_loss": 0.79308766, + "learning_rate": 3.977208844083865e-06, + "loss": 0.81260741, + "num_input_tokens_seen": 27124505, + "router_z_loss_clip": 3.87109375, + "router_z_loss_mlp": 0.33764648, + "step": 1275, + "time_per_iteration": 2.74676251411438 + }, + { + "auxiliary_loss_clip": 0.01631204, + "auxiliary_loss_mlp": 0.00310552, + "balance_loss_clip": 1.23960042, + "balance_loss_mlp": 0.27395451, + "epoch": 0.07671727040432888, + "flos": 15267386355840.0, + "grad_norm": 25.13753843577571, + "language_loss": 0.88959205, + "learning_rate": 3.9771501783216685e-06, + "loss": 0.90900958, + "num_input_tokens_seen": 27140960, + "router_z_loss_clip": 3.91992188, + "router_z_loss_mlp": 0.36572266, + "step": 1276, + "time_per_iteration": 2.6248066425323486 + }, + { + "auxiliary_loss_clip": 0.0161095, + "auxiliary_loss_mlp": 0.00308203, + "balance_loss_clip": 1.22672296, + "balance_loss_mlp": 0.27260691, + "epoch": 0.07677739365699685, + "flos": 28184094956160.0, + "grad_norm": 961.320283048435, + "language_loss": 0.69782436, + "learning_rate": 3.97709143758574e-06, + "loss": 0.71701586, + "num_input_tokens_seen": 27160985, + "router_z_loss_clip": 3.84179688, + "router_z_loss_mlp": 0.35595703, + "step": 1277, + "time_per_iteration": 2.6630594730377197 + }, + { + "auxiliary_loss_clip": 0.01618516, + "auxiliary_loss_mlp": 0.00312157, + "balance_loss_clip": 1.23010349, + "balance_loss_mlp": 0.27546433, + "epoch": 0.07683751690966481, + "flos": 18296128126080.0, + "grad_norm": 6.618897011855969, + "language_loss": 0.83482885, + "learning_rate": 3.977032621878305e-06, + "loss": 0.85413551, + "num_input_tokens_seen": 27178390, + "router_z_loss_clip": 3.88476562, + "router_z_loss_mlp": 0.3671875, + "step": 1278, + "time_per_iteration": 2.647878646850586 + }, + { + "auxiliary_loss_clip": 0.01592786, + "auxiliary_loss_mlp": 0.0030171, + "balance_loss_clip": 1.21005392, + "balance_loss_mlp": 0.26766348, + "epoch": 0.07689764016233278, + "flos": 21981101420160.0, + "grad_norm": 28.326049654561977, + "language_loss": 0.95236957, + "learning_rate": 3.976973731201596e-06, + "loss": 0.97131455, + "num_input_tokens_seen": 27197505, + "router_z_loss_clip": 3.828125, + "router_z_loss_mlp": 0.34057617, + "step": 1279, + "time_per_iteration": 2.647526502609253 + }, + { + "auxiliary_loss_clip": 0.0158287, + "auxiliary_loss_mlp": 0.00304841, + "balance_loss_clip": 1.19835341, + "balance_loss_mlp": 0.2701987, + "epoch": 0.07695776341500075, + "flos": 22235995307520.0, + "grad_norm": 5.0587063161056784, + "language_loss": 0.88756126, + "learning_rate": 3.976914765557845e-06, + "loss": 0.90643835, + "num_input_tokens_seen": 27214260, + "router_z_loss_clip": 3.84179688, + "router_z_loss_mlp": 0.34643555, + "step": 1280, + "time_per_iteration": 2.614304780960083 + }, + { + "auxiliary_loss_clip": 0.01565731, + "auxiliary_loss_mlp": 0.00302362, + "balance_loss_clip": 1.188761, + "balance_loss_mlp": 0.26910272, + "epoch": 0.07701788666766872, + "flos": 16143750380160.0, + "grad_norm": 3.314127578716393, + "language_loss": 0.84418833, + "learning_rate": 3.9768557249492875e-06, + "loss": 0.86286926, + "num_input_tokens_seen": 27232525, + "router_z_loss_clip": 3.76757812, + "router_z_loss_mlp": 0.33251953, + "step": 1281, + "time_per_iteration": 2.612200975418091 + }, + { + "auxiliary_loss_clip": 0.01561521, + "auxiliary_loss_mlp": 0.0030691, + "balance_loss_clip": 1.17886806, + "balance_loss_mlp": 0.26800001, + "epoch": 0.07707800992033668, + "flos": 19463045264640.0, + "grad_norm": 12955.784200158134, + "language_loss": 0.85814333, + "learning_rate": 3.9767966093781634e-06, + "loss": 0.87682772, + "num_input_tokens_seen": 27249800, + "router_z_loss_clip": 3.82617188, + "router_z_loss_mlp": 0.38916016, + "step": 1282, + "time_per_iteration": 2.6317756175994873 + }, + { + "auxiliary_loss_clip": 0.01551849, + "auxiliary_loss_mlp": 0.00306627, + "balance_loss_clip": 1.17753339, + "balance_loss_mlp": 0.27198493, + "epoch": 0.07713813317300466, + "flos": 18990281433600.0, + "grad_norm": 4.915151956411521, + "language_loss": 0.88332635, + "learning_rate": 3.976737418846713e-06, + "loss": 0.90191114, + "num_input_tokens_seen": 27268895, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 0.34643555, + "step": 1283, + "time_per_iteration": 2.714460849761963 + }, + { + "auxiliary_loss_clip": 0.0154992, + "auxiliary_loss_mlp": 0.00317281, + "balance_loss_clip": 1.17429495, + "balance_loss_mlp": 0.28073195, + "epoch": 0.07719825642567263, + "flos": 18113953322880.0, + "grad_norm": 43.001294932244875, + "language_loss": 0.81510997, + "learning_rate": 3.976678153357181e-06, + "loss": 0.83378196, + "num_input_tokens_seen": 27288180, + "router_z_loss_clip": 3.7578125, + "router_z_loss_mlp": 0.36547852, + "step": 1284, + "time_per_iteration": 4.036075830459595 + }, + { + "auxiliary_loss_clip": 0.01540166, + "auxiliary_loss_mlp": 0.00298823, + "balance_loss_clip": 1.16643167, + "balance_loss_mlp": 0.26372784, + "epoch": 0.0772583796783406, + "flos": 42194426993280.0, + "grad_norm": 46.02181373526052, + "language_loss": 0.81287575, + "learning_rate": 3.976618812911817e-06, + "loss": 0.83126563, + "num_input_tokens_seen": 27311815, + "router_z_loss_clip": 3.74023438, + "router_z_loss_mlp": 0.35083008, + "step": 1285, + "time_per_iteration": 4.1603734493255615 + }, + { + "auxiliary_loss_clip": 0.01522623, + "auxiliary_loss_mlp": 0.00290177, + "balance_loss_clip": 1.15279865, + "balance_loss_mlp": 0.25710821, + "epoch": 0.07731850293100857, + "flos": 24753692327040.0, + "grad_norm": 9.036012574273268, + "language_loss": 0.91345906, + "learning_rate": 3.9765593975128685e-06, + "loss": 0.9315871, + "num_input_tokens_seen": 27331890, + "router_z_loss_clip": 3.70117188, + "router_z_loss_mlp": 0.33056641, + "step": 1286, + "time_per_iteration": 2.68825101852417 + }, + { + "auxiliary_loss_clip": 0.01538433, + "auxiliary_loss_mlp": 0.00314877, + "balance_loss_clip": 1.15773606, + "balance_loss_mlp": 0.27744561, + "epoch": 0.07737862618367654, + "flos": 17565884628480.0, + "grad_norm": 6.638437210453716, + "language_loss": 0.89166927, + "learning_rate": 3.97649990716259e-06, + "loss": 0.91020238, + "num_input_tokens_seen": 27348320, + "router_z_loss_clip": 3.80664062, + "router_z_loss_mlp": 0.37426758, + "step": 1287, + "time_per_iteration": 2.6894731521606445 + }, + { + "auxiliary_loss_clip": 0.01508099, + "auxiliary_loss_mlp": 0.00232291, + "balance_loss_clip": 1.13968885, + "balance_loss_mlp": 0.1978398, + "epoch": 0.0774387494363445, + "flos": 25627147349760.0, + "grad_norm": 3.1202048357135714, + "language_loss": 0.92406225, + "learning_rate": 3.976440341863237e-06, + "loss": 0.94146615, + "num_input_tokens_seen": 27367670, + "router_z_loss_clip": 3.68359375, + "router_z_loss_mlp": 0.34448242, + "step": 1288, + "time_per_iteration": 4.105735778808594 + }, + { + "auxiliary_loss_clip": 0.01530784, + "auxiliary_loss_mlp": 0.00261543, + "balance_loss_clip": 1.15744758, + "balance_loss_mlp": 0.22487432, + "epoch": 0.07749887268901248, + "flos": 12239865648000.0, + "grad_norm": 9.695149628619848, + "language_loss": 0.97393548, + "learning_rate": 3.976380701617068e-06, + "loss": 0.99185878, + "num_input_tokens_seen": 27385485, + "router_z_loss_clip": 3.73242188, + "router_z_loss_mlp": 0.36645508, + "step": 1289, + "time_per_iteration": 2.660024642944336 + }, + { + "auxiliary_loss_clip": 0.01499395, + "auxiliary_loss_mlp": 0.0026983, + "balance_loss_clip": 1.13270521, + "balance_loss_mlp": 0.23583147, + "epoch": 0.07755899594168045, + "flos": 25081736261760.0, + "grad_norm": 15.763323708444984, + "language_loss": 0.91781104, + "learning_rate": 3.976320986426344e-06, + "loss": 0.9355033, + "num_input_tokens_seen": 27405110, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 0.34008789, + "step": 1290, + "time_per_iteration": 4.107405662536621 + }, + { + "auxiliary_loss_clip": 0.01480336, + "auxiliary_loss_mlp": 0.00270968, + "balance_loss_clip": 1.11924624, + "balance_loss_mlp": 0.23832861, + "epoch": 0.07761911919434841, + "flos": 14246410176000.0, + "grad_norm": 15.119856203535962, + "language_loss": 1.00472176, + "learning_rate": 3.9762611962933315e-06, + "loss": 1.02223468, + "num_input_tokens_seen": 27422855, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 0.32592773, + "step": 1291, + "time_per_iteration": 2.6115522384643555 + }, + { + "auxiliary_loss_clip": 0.01448861, + "auxiliary_loss_mlp": 0.00115257, + "balance_loss_clip": 1.14143157, + "balance_loss_mlp": 0.09799597, + "epoch": 0.07767924244701638, + "flos": 67237202954880.0, + "grad_norm": 0.873794344290375, + "language_loss": 0.65275633, + "learning_rate": 3.9762013312202955e-06, + "loss": 0.66839755, + "num_input_tokens_seen": 27487190, + "router_z_loss_clip": 3.078125, + "router_z_loss_mlp": 0.17285156, + "step": 1292, + "time_per_iteration": 3.2827842235565186 + }, + { + "auxiliary_loss_clip": 0.01473027, + "auxiliary_loss_mlp": 0.00255291, + "balance_loss_clip": 1.11182308, + "balance_loss_mlp": 0.22343849, + "epoch": 0.07773936569968436, + "flos": 28550635292160.0, + "grad_norm": 403.31357504539426, + "language_loss": 0.94402105, + "learning_rate": 3.9761413912095075e-06, + "loss": 0.96130419, + "num_input_tokens_seen": 27510465, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 0.31835938, + "step": 1293, + "time_per_iteration": 2.6694436073303223 + }, + { + "auxiliary_loss_clip": 0.01478372, + "auxiliary_loss_mlp": 0.00318, + "balance_loss_clip": 1.11369538, + "balance_loss_mlp": 0.28202266, + "epoch": 0.07779948895235232, + "flos": 27490264871040.0, + "grad_norm": 4.516317694182326, + "language_loss": 0.92960036, + "learning_rate": 3.976081376263239e-06, + "loss": 0.94756407, + "num_input_tokens_seen": 27528645, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 0.35961914, + "step": 1294, + "time_per_iteration": 2.738605260848999 + }, + { + "auxiliary_loss_clip": 0.01493806, + "auxiliary_loss_mlp": 0.00347296, + "balance_loss_clip": 1.12264013, + "balance_loss_mlp": 0.31005472, + "epoch": 0.07785961220502029, + "flos": 18223301301120.0, + "grad_norm": 12.451545536122813, + "language_loss": 0.89866936, + "learning_rate": 3.976021286383768e-06, + "loss": 0.9170804, + "num_input_tokens_seen": 27546165, + "router_z_loss_clip": 3.71289062, + "router_z_loss_mlp": 0.37255859, + "step": 1295, + "time_per_iteration": 2.5776052474975586 + }, + { + "auxiliary_loss_clip": 0.01485882, + "auxiliary_loss_mlp": 0.00327894, + "balance_loss_clip": 1.12294316, + "balance_loss_mlp": 0.29520714, + "epoch": 0.07791973545768827, + "flos": 24608218245120.0, + "grad_norm": 64.63187690010908, + "language_loss": 0.95203745, + "learning_rate": 3.975961121573371e-06, + "loss": 0.97017527, + "num_input_tokens_seen": 27566520, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 0.3269043, + "step": 1296, + "time_per_iteration": 2.6654083728790283 + }, + { + "auxiliary_loss_clip": 0.0147747, + "auxiliary_loss_mlp": 0.00321517, + "balance_loss_clip": 1.11305833, + "balance_loss_mlp": 0.28666016, + "epoch": 0.07797985871035623, + "flos": 14282069402880.0, + "grad_norm": 25.87711742882511, + "language_loss": 1.04803872, + "learning_rate": 3.9759008818343305e-06, + "loss": 1.06602848, + "num_input_tokens_seen": 27581960, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 0.34887695, + "step": 1297, + "time_per_iteration": 2.6322638988494873 + }, + { + "auxiliary_loss_clip": 0.01469857, + "auxiliary_loss_mlp": 0.00302587, + "balance_loss_clip": 1.11044669, + "balance_loss_mlp": 0.27035236, + "epoch": 0.0780399819630242, + "flos": 26610453141120.0, + "grad_norm": 72.34628357240062, + "language_loss": 0.84144515, + "learning_rate": 3.97584056716893e-06, + "loss": 0.8591696, + "num_input_tokens_seen": 27601415, + "router_z_loss_clip": 3.59570312, + "router_z_loss_mlp": 0.32226562, + "step": 1298, + "time_per_iteration": 2.629021406173706 + }, + { + "auxiliary_loss_clip": 0.01466746, + "auxiliary_loss_mlp": 0.00267258, + "balance_loss_clip": 1.11132181, + "balance_loss_mlp": 0.23750377, + "epoch": 0.07810010521569218, + "flos": 21834514016640.0, + "grad_norm": 6.277747504722253, + "language_loss": 0.86946702, + "learning_rate": 3.9757801775794575e-06, + "loss": 0.88680708, + "num_input_tokens_seen": 27621490, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 0.29736328, + "step": 1299, + "time_per_iteration": 2.708911657333374 + }, + { + "auxiliary_loss_clip": 0.01455982, + "auxiliary_loss_mlp": 0.00315179, + "balance_loss_clip": 1.10262775, + "balance_loss_mlp": 0.28455424, + "epoch": 0.07816022846836014, + "flos": 25081233471360.0, + "grad_norm": 2.659201632890322, + "language_loss": 0.94243062, + "learning_rate": 3.975719713068202e-06, + "loss": 0.96014214, + "num_input_tokens_seen": 27640600, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 0.30651855, + "step": 1300, + "time_per_iteration": 2.676058769226074 + }, + { + "auxiliary_loss_clip": 0.01466151, + "auxiliary_loss_mlp": 0.00307926, + "balance_loss_clip": 1.10686314, + "balance_loss_mlp": 0.27428484, + "epoch": 0.0782203517210281, + "flos": 40917515431680.0, + "grad_norm": 3.2940222042143663, + "language_loss": 0.81058365, + "learning_rate": 3.975659173637458e-06, + "loss": 0.82832444, + "num_input_tokens_seen": 27663070, + "router_z_loss_clip": 3.59570312, + "router_z_loss_mlp": 0.33642578, + "step": 1301, + "time_per_iteration": 2.834796905517578 + }, + { + "auxiliary_loss_clip": 0.01454787, + "auxiliary_loss_mlp": 0.00300937, + "balance_loss_clip": 1.10203028, + "balance_loss_mlp": 0.26968068, + "epoch": 0.07828047497369607, + "flos": 41172014269440.0, + "grad_norm": 16.901336868381307, + "language_loss": 0.77863818, + "learning_rate": 3.97559855928952e-06, + "loss": 0.79619545, + "num_input_tokens_seen": 27686425, + "router_z_loss_clip": 3.52734375, + "router_z_loss_mlp": 0.31262207, + "step": 1302, + "time_per_iteration": 2.8743605613708496 + }, + { + "auxiliary_loss_clip": 0.01434724, + "auxiliary_loss_mlp": 0.00276383, + "balance_loss_clip": 1.089697, + "balance_loss_mlp": 0.24553189, + "epoch": 0.07834059822636405, + "flos": 23508130360320.0, + "grad_norm": 2.7184775746163004, + "language_loss": 0.90487152, + "learning_rate": 3.9755378700266864e-06, + "loss": 0.92198259, + "num_input_tokens_seen": 27704900, + "router_z_loss_clip": 3.453125, + "router_z_loss_mlp": 0.30895996, + "step": 1303, + "time_per_iteration": 2.6830248832702637 + }, + { + "auxiliary_loss_clip": 0.01426043, + "auxiliary_loss_mlp": 0.00314533, + "balance_loss_clip": 1.07919967, + "balance_loss_mlp": 0.28277561, + "epoch": 0.07840072147903202, + "flos": 20193899293440.0, + "grad_norm": 2.22191932793665, + "language_loss": 0.81473172, + "learning_rate": 3.9754771058512585e-06, + "loss": 0.83213747, + "num_input_tokens_seen": 27724890, + "router_z_loss_clip": 3.47070312, + "router_z_loss_mlp": 0.31762695, + "step": 1304, + "time_per_iteration": 2.664872884750366 + }, + { + "auxiliary_loss_clip": 0.01410942, + "auxiliary_loss_mlp": 0.00329632, + "balance_loss_clip": 1.06785464, + "balance_loss_mlp": 0.29673055, + "epoch": 0.07846084473169998, + "flos": 21360816432000.0, + "grad_norm": 22.67342063662987, + "language_loss": 0.82748127, + "learning_rate": 3.975416266765542e-06, + "loss": 0.84488702, + "num_input_tokens_seen": 27743115, + "router_z_loss_clip": 3.43164062, + "router_z_loss_mlp": 0.32910156, + "step": 1305, + "time_per_iteration": 2.6626453399658203 + }, + { + "auxiliary_loss_clip": 0.01418873, + "auxiliary_loss_mlp": 0.00318543, + "balance_loss_clip": 1.07806051, + "balance_loss_mlp": 0.28685689, + "epoch": 0.07852096798436796, + "flos": 25410965345280.0, + "grad_norm": 3.3206413140308517, + "language_loss": 0.93043554, + "learning_rate": 3.975355352771841e-06, + "loss": 0.94780964, + "num_input_tokens_seen": 27763570, + "router_z_loss_clip": 3.40820312, + "router_z_loss_mlp": 0.31689453, + "step": 1306, + "time_per_iteration": 2.6679725646972656 + }, + { + "auxiliary_loss_clip": 0.01410053, + "auxiliary_loss_mlp": 0.00308313, + "balance_loss_clip": 1.07601142, + "balance_loss_mlp": 0.27879685, + "epoch": 0.07858109123703592, + "flos": 24571481610240.0, + "grad_norm": 120.56965652071403, + "language_loss": 0.96196103, + "learning_rate": 3.975294363872468e-06, + "loss": 0.97914469, + "num_input_tokens_seen": 27780030, + "router_z_loss_clip": 3.33984375, + "router_z_loss_mlp": 0.29492188, + "step": 1307, + "time_per_iteration": 2.717865467071533 + }, + { + "auxiliary_loss_clip": 0.01408236, + "auxiliary_loss_mlp": 0.00322812, + "balance_loss_clip": 1.07271171, + "balance_loss_mlp": 0.2894097, + "epoch": 0.07864121448970389, + "flos": 20698874645760.0, + "grad_norm": 17.687448734907633, + "language_loss": 0.90909654, + "learning_rate": 3.975233300069735e-06, + "loss": 0.92640698, + "num_input_tokens_seen": 27796225, + "router_z_loss_clip": 3.35742188, + "router_z_loss_mlp": 0.33374023, + "step": 1308, + "time_per_iteration": 2.647608995437622 + }, + { + "auxiliary_loss_clip": 0.01394271, + "auxiliary_loss_mlp": 0.00300571, + "balance_loss_clip": 1.06297922, + "balance_loss_mlp": 0.27044648, + "epoch": 0.07870133774237187, + "flos": 22966526113920.0, + "grad_norm": 8.147203224519316, + "language_loss": 0.82811677, + "learning_rate": 3.975172161365958e-06, + "loss": 0.84506518, + "num_input_tokens_seen": 27815975, + "router_z_loss_clip": 3.31445312, + "router_z_loss_mlp": 0.30126953, + "step": 1309, + "time_per_iteration": 2.6644253730773926 + }, + { + "auxiliary_loss_clip": 0.01407384, + "auxiliary_loss_mlp": 0.00330953, + "balance_loss_clip": 1.07292914, + "balance_loss_mlp": 0.3006376, + "epoch": 0.07876146099503983, + "flos": 18842832103680.0, + "grad_norm": 2.9965015254121434, + "language_loss": 0.8733964, + "learning_rate": 3.975110947763453e-06, + "loss": 0.89077973, + "num_input_tokens_seen": 27832255, + "router_z_loss_clip": 3.34570312, + "router_z_loss_mlp": 0.30334473, + "step": 1310, + "time_per_iteration": 2.6906721591949463 + }, + { + "auxiliary_loss_clip": 0.01395347, + "auxiliary_loss_mlp": 0.00312646, + "balance_loss_clip": 1.07704675, + "balance_loss_mlp": 0.2825579, + "epoch": 0.0788215842477078, + "flos": 23805794367360.0, + "grad_norm": 10.04567384592467, + "language_loss": 0.81175137, + "learning_rate": 3.9750496592645435e-06, + "loss": 0.82883132, + "num_input_tokens_seen": 27852180, + "router_z_loss_clip": 3.18164062, + "router_z_loss_mlp": 0.30078125, + "step": 1311, + "time_per_iteration": 2.6528451442718506 + }, + { + "auxiliary_loss_clip": 0.01400381, + "auxiliary_loss_mlp": 0.00336469, + "balance_loss_clip": 1.07760525, + "balance_loss_mlp": 0.30487901, + "epoch": 0.07888170750037576, + "flos": 21579907438080.0, + "grad_norm": 11.478228168800827, + "language_loss": 0.92468333, + "learning_rate": 3.974988295871553e-06, + "loss": 0.94205177, + "num_input_tokens_seen": 27871435, + "router_z_loss_clip": 3.22851562, + "router_z_loss_mlp": 0.31591797, + "step": 1312, + "time_per_iteration": 2.6705079078674316 + }, + { + "auxiliary_loss_clip": 0.01385438, + "auxiliary_loss_mlp": 0.00316402, + "balance_loss_clip": 1.06337905, + "balance_loss_mlp": 0.28726742, + "epoch": 0.07894183075304374, + "flos": 19864849777920.0, + "grad_norm": 26.338500635792823, + "language_loss": 0.88949043, + "learning_rate": 3.9749268575868085e-06, + "loss": 0.90650886, + "num_input_tokens_seen": 27890625, + "router_z_loss_clip": 3.22070312, + "router_z_loss_mlp": 0.29125977, + "step": 1313, + "time_per_iteration": 2.6854147911071777 + }, + { + "auxiliary_loss_clip": 0.01390935, + "auxiliary_loss_mlp": 0.00331473, + "balance_loss_clip": 1.06350064, + "balance_loss_mlp": 0.29692662, + "epoch": 0.07900195400571171, + "flos": 16143463071360.0, + "grad_norm": 4.080048582420998, + "language_loss": 0.83872008, + "learning_rate": 3.97486534441264e-06, + "loss": 0.85594422, + "num_input_tokens_seen": 27906530, + "router_z_loss_clip": 3.2734375, + "router_z_loss_mlp": 0.34545898, + "step": 1314, + "time_per_iteration": 2.6966142654418945 + }, + { + "auxiliary_loss_clip": 0.01392291, + "auxiliary_loss_mlp": 0.00291491, + "balance_loss_clip": 1.06610084, + "balance_loss_mlp": 0.25799349, + "epoch": 0.07906207725837967, + "flos": 23730417676800.0, + "grad_norm": 18.15723668597912, + "language_loss": 0.85387945, + "learning_rate": 3.974803756351379e-06, + "loss": 0.87071723, + "num_input_tokens_seen": 27926725, + "router_z_loss_clip": 3.26171875, + "router_z_loss_mlp": 0.33520508, + "step": 1315, + "time_per_iteration": 2.659820079803467 + }, + { + "auxiliary_loss_clip": 0.01390301, + "auxiliary_loss_mlp": 0.00296508, + "balance_loss_clip": 1.06603003, + "balance_loss_mlp": 0.25924331, + "epoch": 0.07912220051104765, + "flos": 24315905364480.0, + "grad_norm": 3.577516834993414, + "language_loss": 0.79986447, + "learning_rate": 3.974742093405362e-06, + "loss": 0.81673259, + "num_input_tokens_seen": 27947875, + "router_z_loss_clip": 3.24414062, + "router_z_loss_mlp": 0.37304688, + "step": 1316, + "time_per_iteration": 2.769587993621826 + }, + { + "auxiliary_loss_clip": 0.01394711, + "auxiliary_loss_mlp": 0.00291121, + "balance_loss_clip": 1.0727675, + "balance_loss_mlp": 0.25590658, + "epoch": 0.07918232376371562, + "flos": 18880035615360.0, + "grad_norm": 5.184978455922231, + "language_loss": 0.78060043, + "learning_rate": 3.974680355576927e-06, + "loss": 0.79745877, + "num_input_tokens_seen": 27965040, + "router_z_loss_clip": 3.21679688, + "router_z_loss_mlp": 0.35229492, + "step": 1317, + "time_per_iteration": 2.615846872329712 + }, + { + "auxiliary_loss_clip": 0.01408987, + "auxiliary_loss_mlp": 0.00305417, + "balance_loss_clip": 1.0773716, + "balance_loss_mlp": 0.26691246, + "epoch": 0.07924244701638358, + "flos": 27376284038400.0, + "grad_norm": 176.31192396419831, + "language_loss": 0.85277057, + "learning_rate": 3.974618542868415e-06, + "loss": 0.86991465, + "num_input_tokens_seen": 27985330, + "router_z_loss_clip": 3.31445312, + "router_z_loss_mlp": 0.38525391, + "step": 1318, + "time_per_iteration": 2.681126594543457 + }, + { + "auxiliary_loss_clip": 0.01388766, + "auxiliary_loss_mlp": 0.00264503, + "balance_loss_clip": 1.0707655, + "balance_loss_mlp": 0.2302184, + "epoch": 0.07930257026905156, + "flos": 25120340403840.0, + "grad_norm": 756.8473818355302, + "language_loss": 0.97091597, + "learning_rate": 3.97455665528217e-06, + "loss": 0.98744857, + "num_input_tokens_seen": 28007615, + "router_z_loss_clip": 3.17578125, + "router_z_loss_mlp": 0.3425293, + "step": 1319, + "time_per_iteration": 2.671539783477783 + }, + { + "auxiliary_loss_clip": 0.01382685, + "auxiliary_loss_mlp": 0.00233903, + "balance_loss_clip": 1.06473112, + "balance_loss_mlp": 0.1955649, + "epoch": 0.07936269352171953, + "flos": 21834478103040.0, + "grad_norm": 11.650460679611053, + "language_loss": 0.88113636, + "learning_rate": 3.974494692820539e-06, + "loss": 0.89730227, + "num_input_tokens_seen": 28027765, + "router_z_loss_clip": 3.18164062, + "router_z_loss_mlp": 0.38330078, + "step": 1320, + "time_per_iteration": 2.648799419403076 + }, + { + "auxiliary_loss_clip": 0.01404987, + "auxiliary_loss_mlp": 0.00278176, + "balance_loss_clip": 1.0882796, + "balance_loss_mlp": 0.23855065, + "epoch": 0.07942281677438749, + "flos": 16939889377920.0, + "grad_norm": 260.1573593843893, + "language_loss": 0.77232158, + "learning_rate": 3.974432655485872e-06, + "loss": 0.78915322, + "num_input_tokens_seen": 28044225, + "router_z_loss_clip": 3.1640625, + "router_z_loss_mlp": 0.39648438, + "step": 1321, + "time_per_iteration": 2.579237937927246 + }, + { + "auxiliary_loss_clip": 0.0139272, + "auxiliary_loss_mlp": 0.00241047, + "balance_loss_clip": 1.0747391, + "balance_loss_mlp": 0.20261356, + "epoch": 0.07948294002705546, + "flos": 18986941468800.0, + "grad_norm": 8.838144935149005, + "language_loss": 0.92992288, + "learning_rate": 3.9743705432805195e-06, + "loss": 0.94626057, + "num_input_tokens_seen": 28062915, + "router_z_loss_clip": 3.18164062, + "router_z_loss_mlp": 0.38452148, + "step": 1322, + "time_per_iteration": 2.669523000717163 + }, + { + "auxiliary_loss_clip": 0.01403796, + "auxiliary_loss_mlp": 0.0026753, + "balance_loss_clip": 1.08187318, + "balance_loss_mlp": 0.22726148, + "epoch": 0.07954306327972344, + "flos": 21653452535040.0, + "grad_norm": 12.326743069990895, + "language_loss": 0.98932981, + "learning_rate": 3.974308356206838e-06, + "loss": 1.0060432, + "num_input_tokens_seen": 28082175, + "router_z_loss_clip": 3.22070312, + "router_z_loss_mlp": 0.40283203, + "step": 1323, + "time_per_iteration": 2.6597750186920166 + }, + { + "auxiliary_loss_clip": 0.01414834, + "auxiliary_loss_mlp": 0.00253188, + "balance_loss_clip": 1.09591353, + "balance_loss_mlp": 0.21253768, + "epoch": 0.0796031865323914, + "flos": 23220270766080.0, + "grad_norm": 2.385538940534498, + "language_loss": 0.89007491, + "learning_rate": 3.974246094267187e-06, + "loss": 0.90675509, + "num_input_tokens_seen": 28102645, + "router_z_loss_clip": 3.19335938, + "router_z_loss_mlp": 0.40625, + "step": 1324, + "time_per_iteration": 2.6459662914276123 + }, + { + "auxiliary_loss_clip": 0.01419459, + "auxiliary_loss_mlp": 0.00262011, + "balance_loss_clip": 1.092273, + "balance_loss_mlp": 0.21993056, + "epoch": 0.07966330978505937, + "flos": 23294534135040.0, + "grad_norm": 3.8664466737200485, + "language_loss": 0.86864859, + "learning_rate": 3.974183757463925e-06, + "loss": 0.88546336, + "num_input_tokens_seen": 28122805, + "router_z_loss_clip": 3.26953125, + "router_z_loss_mlp": 0.42089844, + "step": 1325, + "time_per_iteration": 2.6315460205078125 + }, + { + "auxiliary_loss_clip": 0.01422942, + "auxiliary_loss_mlp": 0.00251991, + "balance_loss_clip": 1.10237932, + "balance_loss_mlp": 0.21412989, + "epoch": 0.07972343303772735, + "flos": 18363783392640.0, + "grad_norm": 11.667800206315418, + "language_loss": 0.95391053, + "learning_rate": 3.974121345799418e-06, + "loss": 0.97065985, + "num_input_tokens_seen": 28140530, + "router_z_loss_clip": 3.20703125, + "router_z_loss_mlp": 0.37866211, + "step": 1326, + "time_per_iteration": 4.018853187561035 + }, + { + "auxiliary_loss_clip": 0.01433172, + "auxiliary_loss_mlp": 0.00260051, + "balance_loss_clip": 1.1066407, + "balance_loss_mlp": 0.21584773, + "epoch": 0.07978355629039531, + "flos": 21762513204480.0, + "grad_norm": 6.975556626986303, + "language_loss": 0.90548259, + "learning_rate": 3.974058859276032e-06, + "loss": 0.92241478, + "num_input_tokens_seen": 28159640, + "router_z_loss_clip": 3.26171875, + "router_z_loss_mlp": 0.44213867, + "step": 1327, + "time_per_iteration": 4.1199305057525635 + }, + { + "auxiliary_loss_clip": 0.01445254, + "auxiliary_loss_mlp": 0.00280698, + "balance_loss_clip": 1.10936821, + "balance_loss_mlp": 0.23797376, + "epoch": 0.07984367954306328, + "flos": 18551309322240.0, + "grad_norm": 5.054045915624102, + "language_loss": 0.88045537, + "learning_rate": 3.9739962978961354e-06, + "loss": 0.89771485, + "num_input_tokens_seen": 28177050, + "router_z_loss_clip": 3.359375, + "router_z_loss_mlp": 0.42724609, + "step": 1328, + "time_per_iteration": 2.6364290714263916 + }, + { + "auxiliary_loss_clip": 0.0144175, + "auxiliary_loss_mlp": 0.00266718, + "balance_loss_clip": 1.11200893, + "balance_loss_mlp": 0.22623454, + "epoch": 0.07990380279573125, + "flos": 16904050583040.0, + "grad_norm": 14.179001909603684, + "language_loss": 0.87453425, + "learning_rate": 3.973933661662101e-06, + "loss": 0.89161897, + "num_input_tokens_seen": 28193245, + "router_z_loss_clip": 3.29492188, + "router_z_loss_mlp": 0.40478516, + "step": 1329, + "time_per_iteration": 2.580385446548462 + }, + { + "auxiliary_loss_clip": 0.01431004, + "auxiliary_loss_mlp": 0.00262007, + "balance_loss_clip": 1.11454558, + "balance_loss_mlp": 0.22238135, + "epoch": 0.07996392604839922, + "flos": 24098358643200.0, + "grad_norm": 59.21744736908394, + "language_loss": 0.87752497, + "learning_rate": 3.973870950576305e-06, + "loss": 0.89445502, + "num_input_tokens_seen": 28213570, + "router_z_loss_clip": 3.1640625, + "router_z_loss_mlp": 0.39599609, + "step": 1330, + "time_per_iteration": 4.077152490615845 + }, + { + "auxiliary_loss_clip": 0.01444203, + "auxiliary_loss_mlp": 0.00267575, + "balance_loss_clip": 1.11664915, + "balance_loss_mlp": 0.23004797, + "epoch": 0.08002404930106718, + "flos": 14278729438080.0, + "grad_norm": 65.28312445244964, + "language_loss": 0.96788812, + "learning_rate": 3.9738081646411255e-06, + "loss": 0.98500586, + "num_input_tokens_seen": 28229980, + "router_z_loss_clip": 3.27929688, + "router_z_loss_mlp": 0.37524414, + "step": 1331, + "time_per_iteration": 2.6203792095184326 + }, + { + "auxiliary_loss_clip": 0.01440922, + "auxiliary_loss_mlp": 0.00298727, + "balance_loss_clip": 1.11194932, + "balance_loss_mlp": 0.26015094, + "epoch": 0.08008417255373516, + "flos": 40406219285760.0, + "grad_norm": 49.18814412866486, + "language_loss": 0.81172788, + "learning_rate": 3.973745303858942e-06, + "loss": 0.82912439, + "num_input_tokens_seen": 28253840, + "router_z_loss_clip": 3.2890625, + "router_z_loss_mlp": 0.38598633, + "step": 1332, + "time_per_iteration": 2.8272128105163574 + }, + { + "auxiliary_loss_clip": 0.01426554, + "auxiliary_loss_mlp": 0.00248876, + "balance_loss_clip": 1.10953546, + "balance_loss_mlp": 0.21194528, + "epoch": 0.08014429580640313, + "flos": 18478913460480.0, + "grad_norm": 32.2903565935653, + "language_loss": 0.88781768, + "learning_rate": 3.973682368232138e-06, + "loss": 0.90457195, + "num_input_tokens_seen": 28271675, + "router_z_loss_clip": 3.171875, + "router_z_loss_mlp": 0.36938477, + "step": 1333, + "time_per_iteration": 4.1850972175598145 + }, + { + "auxiliary_loss_clip": 0.01436825, + "auxiliary_loss_mlp": 0.00237983, + "balance_loss_clip": 1.12227488, + "balance_loss_mlp": 0.19864425, + "epoch": 0.0802044190590711, + "flos": 22053461368320.0, + "grad_norm": 69.7683104693559, + "language_loss": 0.83835208, + "learning_rate": 3.9736193577631015e-06, + "loss": 0.85510015, + "num_input_tokens_seen": 28291850, + "router_z_loss_clip": 3.14453125, + "router_z_loss_mlp": 0.39331055, + "step": 1334, + "time_per_iteration": 2.6641104221343994 + }, + { + "auxiliary_loss_clip": 0.01456593, + "auxiliary_loss_mlp": 0.0027576, + "balance_loss_clip": 1.12998819, + "balance_loss_mlp": 0.23572937, + "epoch": 0.08026454231173906, + "flos": 24572128055040.0, + "grad_norm": 8.306882742668991, + "language_loss": 0.87945241, + "learning_rate": 3.973556272454221e-06, + "loss": 0.8967759, + "num_input_tokens_seen": 28310780, + "router_z_loss_clip": 3.26953125, + "router_z_loss_mlp": 0.39990234, + "step": 1335, + "time_per_iteration": 2.698497772216797 + }, + { + "auxiliary_loss_clip": 0.02042788, + "auxiliary_loss_mlp": 0.00177893, + "balance_loss_clip": 1.5445745, + "balance_loss_mlp": 0.14775665, + "epoch": 0.08032466556440704, + "flos": 52581841459200.0, + "grad_norm": 0.7601829488643302, + "language_loss": 0.56162977, + "learning_rate": 3.973493112307889e-06, + "loss": 0.58383656, + "num_input_tokens_seen": 28369985, + "router_z_loss_clip": 5.0, + "router_z_loss_mlp": 0.30078125, + "step": 1336, + "time_per_iteration": 3.205627918243408 + }, + { + "auxiliary_loss_clip": 0.01451414, + "auxiliary_loss_mlp": 0.0026925, + "balance_loss_clip": 1.13348436, + "balance_loss_mlp": 0.23112664, + "epoch": 0.080384788817075, + "flos": 23842602829440.0, + "grad_norm": 5.4611488041055845, + "language_loss": 0.76377583, + "learning_rate": 3.9734298773265005e-06, + "loss": 0.78098238, + "num_input_tokens_seen": 28388670, + "router_z_loss_clip": 3.17773438, + "router_z_loss_mlp": 0.38134766, + "step": 1337, + "time_per_iteration": 2.701704740524292 + }, + { + "auxiliary_loss_clip": 0.01420685, + "auxiliary_loss_mlp": 0.00254874, + "balance_loss_clip": 1.11420918, + "balance_loss_mlp": 0.22035086, + "epoch": 0.08044491206974297, + "flos": 25300719527040.0, + "grad_norm": 14.069328534570172, + "language_loss": 0.95055366, + "learning_rate": 3.973366567512453e-06, + "loss": 0.96730924, + "num_input_tokens_seen": 28411845, + "router_z_loss_clip": 3.06640625, + "router_z_loss_mlp": 0.3449707, + "step": 1338, + "time_per_iteration": 2.7330636978149414 + }, + { + "auxiliary_loss_clip": 0.01436469, + "auxiliary_loss_mlp": 0.00294185, + "balance_loss_clip": 1.11614764, + "balance_loss_mlp": 0.25618076, + "epoch": 0.08050503532241095, + "flos": 22376549226240.0, + "grad_norm": 5.8244769519333754, + "language_loss": 0.94220161, + "learning_rate": 3.973303182868147e-06, + "loss": 0.95950818, + "num_input_tokens_seen": 28427875, + "router_z_loss_clip": 3.20507812, + "router_z_loss_mlp": 0.37988281, + "step": 1339, + "time_per_iteration": 2.6214189529418945 + }, + { + "auxiliary_loss_clip": 0.0141966, + "auxiliary_loss_mlp": 0.00257269, + "balance_loss_clip": 1.11751604, + "balance_loss_mlp": 0.22595277, + "epoch": 0.08056515857507891, + "flos": 18369421827840.0, + "grad_norm": 3.3326363771219927, + "language_loss": 0.97063345, + "learning_rate": 3.973239723395988e-06, + "loss": 0.98740268, + "num_input_tokens_seen": 28446615, + "router_z_loss_clip": 3.02148438, + "router_z_loss_mlp": 0.31286621, + "step": 1340, + "time_per_iteration": 2.6067183017730713 + }, + { + "auxiliary_loss_clip": 0.01681847, + "auxiliary_loss_mlp": 0.0012862, + "balance_loss_clip": 1.373528, + "balance_loss_mlp": 0.11145389, + "epoch": 0.08062528182774688, + "flos": 51348130980480.0, + "grad_norm": 0.9098652215587222, + "language_loss": 0.65231824, + "learning_rate": 3.97317618909838e-06, + "loss": 0.67042291, + "num_input_tokens_seen": 28505290, + "router_z_loss_clip": 3.078125, + "router_z_loss_mlp": 0.171875, + "step": 1341, + "time_per_iteration": 3.0832440853118896 + }, + { + "auxiliary_loss_clip": 0.01434552, + "auxiliary_loss_mlp": 0.00300324, + "balance_loss_clip": 1.11568832, + "balance_loss_mlp": 0.26222479, + "epoch": 0.08068540508041486, + "flos": 17599712261760.0, + "grad_norm": 2.7079291988594205, + "language_loss": 0.97051221, + "learning_rate": 3.973112579977733e-06, + "loss": 0.98786098, + "num_input_tokens_seen": 28522735, + "router_z_loss_clip": 3.18945312, + "router_z_loss_mlp": 0.38110352, + "step": 1342, + "time_per_iteration": 2.657400608062744 + }, + { + "auxiliary_loss_clip": 0.01426119, + "auxiliary_loss_mlp": 0.00297171, + "balance_loss_clip": 1.12599373, + "balance_loss_mlp": 0.26326814, + "epoch": 0.08074552833308282, + "flos": 10561185486720.0, + "grad_norm": 2.675041880325655, + "language_loss": 0.83563262, + "learning_rate": 3.973048896036459e-06, + "loss": 0.85286546, + "num_input_tokens_seen": 28539460, + "router_z_loss_clip": 3.0, + "router_z_loss_mlp": 0.33911133, + "step": 1343, + "time_per_iteration": 2.638948678970337 + }, + { + "auxiliary_loss_clip": 0.01565932, + "auxiliary_loss_mlp": 0.00292391, + "balance_loss_clip": 1.29619789, + "balance_loss_mlp": 0.27760869, + "epoch": 0.08080565158575079, + "flos": 60840254954880.0, + "grad_norm": 0.79597231301682, + "language_loss": 0.57763076, + "learning_rate": 3.972985137276974e-06, + "loss": 0.596214, + "num_input_tokens_seen": 28599855, + "router_z_loss_clip": 2.6875, + "router_z_loss_mlp": 0.14746094, + "step": 1344, + "time_per_iteration": 3.0435681343078613 + }, + { + "auxiliary_loss_clip": 0.01420147, + "auxiliary_loss_mlp": 0.00311191, + "balance_loss_clip": 1.1183486, + "balance_loss_mlp": 0.28022087, + "epoch": 0.08086577483841875, + "flos": 18332361970560.0, + "grad_norm": 5.039585428395724, + "language_loss": 0.97421706, + "learning_rate": 3.972921303701695e-06, + "loss": 0.99153042, + "num_input_tokens_seen": 28617585, + "router_z_loss_clip": 3.015625, + "router_z_loss_mlp": 0.31005859, + "step": 1345, + "time_per_iteration": 2.6152117252349854 + }, + { + "auxiliary_loss_clip": 0.01405817, + "auxiliary_loss_mlp": 0.00274243, + "balance_loss_clip": 1.10885477, + "balance_loss_mlp": 0.24331996, + "epoch": 0.08092589809108673, + "flos": 21543601766400.0, + "grad_norm": 5.838819945360532, + "language_loss": 0.9270404, + "learning_rate": 3.972857395313042e-06, + "loss": 0.94384098, + "num_input_tokens_seen": 28636355, + "router_z_loss_clip": 2.97070312, + "router_z_loss_mlp": 0.30932617, + "step": 1346, + "time_per_iteration": 2.6433396339416504 + }, + { + "auxiliary_loss_clip": 0.01413063, + "auxiliary_loss_mlp": 0.00303431, + "balance_loss_clip": 1.11475408, + "balance_loss_mlp": 0.2691468, + "epoch": 0.0809860213437547, + "flos": 22128012046080.0, + "grad_norm": 77.41764165493458, + "language_loss": 0.98336738, + "learning_rate": 3.972793412113439e-06, + "loss": 1.00053239, + "num_input_tokens_seen": 28656260, + "router_z_loss_clip": 2.97851562, + "router_z_loss_mlp": 0.34277344, + "step": 1347, + "time_per_iteration": 2.744241237640381 + }, + { + "auxiliary_loss_clip": 0.01400941, + "auxiliary_loss_mlp": 0.00301265, + "balance_loss_clip": 1.10825872, + "balance_loss_mlp": 0.26955515, + "epoch": 0.08104614459642266, + "flos": 21725489260800.0, + "grad_norm": 1.6436972827826741, + "language_loss": 0.95333678, + "learning_rate": 3.972729354105312e-06, + "loss": 0.97035885, + "num_input_tokens_seen": 28675865, + "router_z_loss_clip": 2.92578125, + "router_z_loss_mlp": 0.31665039, + "step": 1348, + "time_per_iteration": 2.7137255668640137 + }, + { + "auxiliary_loss_clip": 0.01407239, + "auxiliary_loss_mlp": 0.00324102, + "balance_loss_clip": 1.11742091, + "balance_loss_mlp": 0.29306, + "epoch": 0.08110626784909064, + "flos": 23951878980480.0, + "grad_norm": 32.97636595817586, + "language_loss": 0.82468939, + "learning_rate": 3.97266522129109e-06, + "loss": 0.84200275, + "num_input_tokens_seen": 28696255, + "router_z_loss_clip": 2.89648438, + "router_z_loss_mlp": 0.31054688, + "step": 1349, + "time_per_iteration": 2.7487282752990723 + }, + { + "auxiliary_loss_clip": 0.01413591, + "auxiliary_loss_mlp": 0.00327915, + "balance_loss_clip": 1.11875904, + "balance_loss_mlp": 0.29332051, + "epoch": 0.0811663911017586, + "flos": 19025689265280.0, + "grad_norm": 10.447957278290486, + "language_loss": 0.95505327, + "learning_rate": 3.972601013673205e-06, + "loss": 0.97246826, + "num_input_tokens_seen": 28713905, + "router_z_loss_clip": 2.9453125, + "router_z_loss_mlp": 0.34594727, + "step": 1350, + "time_per_iteration": 2.617375135421753 + }, + { + "auxiliary_loss_clip": 0.01421181, + "auxiliary_loss_mlp": 0.00315214, + "balance_loss_clip": 1.13039172, + "balance_loss_mlp": 0.28369457, + "epoch": 0.08122651435442657, + "flos": 15341290588800.0, + "grad_norm": 6.912938196762292, + "language_loss": 0.91077328, + "learning_rate": 3.972536731254092e-06, + "loss": 0.92813718, + "num_input_tokens_seen": 28732075, + "router_z_loss_clip": 2.91210938, + "router_z_loss_mlp": 0.31518555, + "step": 1351, + "time_per_iteration": 2.691523790359497 + }, + { + "auxiliary_loss_clip": 0.01423061, + "auxiliary_loss_mlp": 0.00322645, + "balance_loss_clip": 1.12792492, + "balance_loss_mlp": 0.28518909, + "epoch": 0.08128663760709455, + "flos": 23221563655680.0, + "grad_norm": 32.31038849588147, + "language_loss": 0.82267827, + "learning_rate": 3.972472374036189e-06, + "loss": 0.84013534, + "num_input_tokens_seen": 28751150, + "router_z_loss_clip": 2.95117188, + "router_z_loss_mlp": 0.37451172, + "step": 1352, + "time_per_iteration": 2.667226791381836 + }, + { + "auxiliary_loss_clip": 0.01454504, + "auxiliary_loss_mlp": 0.00352808, + "balance_loss_clip": 1.15080142, + "balance_loss_mlp": 0.31509066, + "epoch": 0.08134676085976252, + "flos": 22965628273920.0, + "grad_norm": 13.358210995339473, + "language_loss": 0.89220464, + "learning_rate": 3.972407942021935e-06, + "loss": 0.91027784, + "num_input_tokens_seen": 28773360, + "router_z_loss_clip": 3.03320312, + "router_z_loss_mlp": 0.37695312, + "step": 1353, + "time_per_iteration": 2.763735055923462 + }, + { + "auxiliary_loss_clip": 0.0144267, + "auxiliary_loss_mlp": 0.00734404, + "balance_loss_clip": 1.18967795, + "balance_loss_mlp": 0.71266013, + "epoch": 0.08140688411243048, + "flos": 64322115816960.0, + "grad_norm": 0.9461380544002201, + "language_loss": 0.59893882, + "learning_rate": 3.972343435213775e-06, + "loss": 0.62070954, + "num_input_tokens_seen": 28833390, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.21777344, + "step": 1354, + "time_per_iteration": 3.2026994228363037 + }, + { + "auxiliary_loss_clip": 0.01431726, + "auxiliary_loss_mlp": 0.00324446, + "balance_loss_clip": 1.14842105, + "balance_loss_mlp": 0.29149652, + "epoch": 0.08146700736509845, + "flos": 22491858862080.0, + "grad_norm": 8.061344233741078, + "language_loss": 0.90319955, + "learning_rate": 3.972278853614154e-06, + "loss": 0.92076129, + "num_input_tokens_seen": 28852430, + "router_z_loss_clip": 2.8359375, + "router_z_loss_mlp": 0.32958984, + "step": 1355, + "time_per_iteration": 2.636833429336548 + }, + { + "auxiliary_loss_clip": 0.01448948, + "auxiliary_loss_mlp": 0.00354532, + "balance_loss_clip": 1.16368151, + "balance_loss_mlp": 0.31791103, + "epoch": 0.08152713061776642, + "flos": 20447823513600.0, + "grad_norm": 649.2016172846198, + "language_loss": 0.80451214, + "learning_rate": 3.972214197225521e-06, + "loss": 0.82254696, + "num_input_tokens_seen": 28870685, + "router_z_loss_clip": 2.85546875, + "router_z_loss_mlp": 0.36621094, + "step": 1356, + "time_per_iteration": 2.641563653945923 + }, + { + "auxiliary_loss_clip": 0.01447019, + "auxiliary_loss_mlp": 0.00379513, + "balance_loss_clip": 1.15043473, + "balance_loss_mlp": 0.34029353, + "epoch": 0.08158725387043439, + "flos": 23550218121600.0, + "grad_norm": 1424.8314398088262, + "language_loss": 0.76557982, + "learning_rate": 3.972149466050329e-06, + "loss": 0.78384519, + "num_input_tokens_seen": 28889860, + "router_z_loss_clip": 2.96484375, + "router_z_loss_mlp": 0.39257812, + "step": 1357, + "time_per_iteration": 2.7008700370788574 + }, + { + "auxiliary_loss_clip": 0.01442217, + "auxiliary_loss_mlp": 0.00366685, + "balance_loss_clip": 1.1464138, + "balance_loss_mlp": 0.32894379, + "epoch": 0.08164737712310235, + "flos": 22017335264640.0, + "grad_norm": 21.227015740965125, + "language_loss": 0.91807985, + "learning_rate": 3.97208466009103e-06, + "loss": 0.93616885, + "num_input_tokens_seen": 28905865, + "router_z_loss_clip": 2.95703125, + "router_z_loss_mlp": 0.37744141, + "step": 1358, + "time_per_iteration": 2.679595470428467 + }, + { + "auxiliary_loss_clip": 0.01453959, + "auxiliary_loss_mlp": 0.00418901, + "balance_loss_clip": 1.15256476, + "balance_loss_mlp": 0.37510327, + "epoch": 0.08170750037577033, + "flos": 23367827836800.0, + "grad_norm": 3.3955513623216826, + "language_loss": 1.07820153, + "learning_rate": 3.972019779350084e-06, + "loss": 1.09693003, + "num_input_tokens_seen": 28925250, + "router_z_loss_clip": 3.015625, + "router_z_loss_mlp": 0.43774414, + "step": 1359, + "time_per_iteration": 2.6239664554595947 + }, + { + "auxiliary_loss_clip": 0.01461177, + "auxiliary_loss_mlp": 0.00408633, + "balance_loss_clip": 1.16645861, + "balance_loss_mlp": 0.36965173, + "epoch": 0.0817676236284383, + "flos": 28397978490240.0, + "grad_norm": 7.555162744047138, + "language_loss": 0.92664993, + "learning_rate": 3.971954823829951e-06, + "loss": 0.94534802, + "num_input_tokens_seen": 28943445, + "router_z_loss_clip": 2.94921875, + "router_z_loss_mlp": 0.38989258, + "step": 1360, + "time_per_iteration": 2.6898250579833984 + }, + { + "auxiliary_loss_clip": 0.01481577, + "auxiliary_loss_mlp": 0.00424362, + "balance_loss_clip": 1.18043351, + "balance_loss_mlp": 0.38528496, + "epoch": 0.08182774688110626, + "flos": 19208905562880.0, + "grad_norm": 25.29377455752081, + "language_loss": 0.8328377, + "learning_rate": 3.971889793533093e-06, + "loss": 0.85189712, + "num_input_tokens_seen": 28962695, + "router_z_loss_clip": 3.01171875, + "router_z_loss_mlp": 0.390625, + "step": 1361, + "time_per_iteration": 2.6112399101257324 + }, + { + "auxiliary_loss_clip": 0.01476786, + "auxiliary_loss_mlp": 0.00372993, + "balance_loss_clip": 1.18250513, + "balance_loss_mlp": 0.33274817, + "epoch": 0.08188787013377424, + "flos": 22784099915520.0, + "grad_norm": 5.30383032689686, + "language_loss": 0.85654593, + "learning_rate": 3.971824688461976e-06, + "loss": 0.87504375, + "num_input_tokens_seen": 28982120, + "router_z_loss_clip": 2.9453125, + "router_z_loss_mlp": 0.40209961, + "step": 1362, + "time_per_iteration": 2.72064208984375 + }, + { + "auxiliary_loss_clip": 0.01501628, + "auxiliary_loss_mlp": 0.00371492, + "balance_loss_clip": 1.20607579, + "balance_loss_mlp": 0.33212864, + "epoch": 0.08194799338644221, + "flos": 16468095214080.0, + "grad_norm": 21.194398991352116, + "language_loss": 0.81935596, + "learning_rate": 3.971759508619069e-06, + "loss": 0.8380872, + "num_input_tokens_seen": 28998100, + "router_z_loss_clip": 2.95898438, + "router_z_loss_mlp": 0.39379883, + "step": 1363, + "time_per_iteration": 2.5952396392822266 + }, + { + "auxiliary_loss_clip": 0.01494139, + "auxiliary_loss_mlp": 0.00376529, + "balance_loss_clip": 1.19424534, + "balance_loss_mlp": 0.33604527, + "epoch": 0.08200811663911017, + "flos": 23913633974400.0, + "grad_norm": 2.70565991622576, + "language_loss": 0.84861469, + "learning_rate": 3.971694254006844e-06, + "loss": 0.86732137, + "num_input_tokens_seen": 29017095, + "router_z_loss_clip": 2.99804688, + "router_z_loss_mlp": 0.4050293, + "step": 1364, + "time_per_iteration": 2.684079170227051 + }, + { + "auxiliary_loss_clip": 0.01512381, + "auxiliary_loss_mlp": 0.00372853, + "balance_loss_clip": 1.20360422, + "balance_loss_mlp": 0.33217889, + "epoch": 0.08206823989177814, + "flos": 17896550256000.0, + "grad_norm": 10.3001611154545, + "language_loss": 0.87720716, + "learning_rate": 3.971628924627776e-06, + "loss": 0.89605945, + "num_input_tokens_seen": 29037240, + "router_z_loss_clip": 3.08984375, + "router_z_loss_mlp": 0.40625, + "step": 1365, + "time_per_iteration": 2.7238385677337646 + }, + { + "auxiliary_loss_clip": 0.01505445, + "auxiliary_loss_mlp": 0.0035495, + "balance_loss_clip": 1.200032, + "balance_loss_mlp": 0.31644556, + "epoch": 0.08212836314444612, + "flos": 22088186841600.0, + "grad_norm": 3.371759335043087, + "language_loss": 0.86607736, + "learning_rate": 3.97156352048434e-06, + "loss": 0.88468134, + "num_input_tokens_seen": 29056250, + "router_z_loss_clip": 3.05273438, + "router_z_loss_mlp": 0.38476562, + "step": 1366, + "time_per_iteration": 2.6997082233428955 + }, + { + "auxiliary_loss_clip": 0.01503157, + "auxiliary_loss_mlp": 0.00340469, + "balance_loss_clip": 1.18876421, + "balance_loss_mlp": 0.29972345, + "epoch": 0.08218848639711408, + "flos": 17597485618560.0, + "grad_norm": 19.524977014807764, + "language_loss": 0.89101708, + "learning_rate": 3.97149804157902e-06, + "loss": 0.90945339, + "num_input_tokens_seen": 29073380, + "router_z_loss_clip": 3.14648438, + "router_z_loss_mlp": 0.4074707, + "step": 1367, + "time_per_iteration": 2.5843591690063477 + }, + { + "auxiliary_loss_clip": 0.01524178, + "auxiliary_loss_mlp": 0.00380291, + "balance_loss_clip": 1.2023375, + "balance_loss_mlp": 0.3390688, + "epoch": 0.08224860964978205, + "flos": 17857838373120.0, + "grad_norm": 3.1394002588550505, + "language_loss": 0.92075825, + "learning_rate": 3.9714324879142946e-06, + "loss": 0.939803, + "num_input_tokens_seen": 29091330, + "router_z_loss_clip": 3.21875, + "router_z_loss_mlp": 0.41186523, + "step": 1368, + "time_per_iteration": 2.613765239715576 + }, + { + "auxiliary_loss_clip": 0.01503181, + "auxiliary_loss_mlp": 0.00313359, + "balance_loss_clip": 1.19737434, + "balance_loss_mlp": 0.27804875, + "epoch": 0.08230873290245003, + "flos": 25227533566080.0, + "grad_norm": 1319.882525717178, + "language_loss": 0.86825335, + "learning_rate": 3.971366859492653e-06, + "loss": 0.8864187, + "num_input_tokens_seen": 29110375, + "router_z_loss_clip": 3.0546875, + "router_z_loss_mlp": 0.35327148, + "step": 1369, + "time_per_iteration": 5.457366466522217 + }, + { + "auxiliary_loss_clip": 0.01545534, + "auxiliary_loss_mlp": 0.00319879, + "balance_loss_clip": 1.21872437, + "balance_loss_mlp": 0.28435433, + "epoch": 0.08236885615511799, + "flos": 31759935753600.0, + "grad_norm": 7.091798063172654, + "language_loss": 0.82685709, + "learning_rate": 3.971301156316582e-06, + "loss": 0.84551126, + "num_input_tokens_seen": 29129395, + "router_z_loss_clip": 3.26757812, + "router_z_loss_mlp": 0.35522461, + "step": 1370, + "time_per_iteration": 2.75970721244812 + }, + { + "auxiliary_loss_clip": 0.01550499, + "auxiliary_loss_mlp": 0.00354941, + "balance_loss_clip": 1.22069907, + "balance_loss_mlp": 0.31247887, + "epoch": 0.08242897940778596, + "flos": 23185832601600.0, + "grad_norm": 16.80090765670434, + "language_loss": 0.80525839, + "learning_rate": 3.971235378388573e-06, + "loss": 0.82431281, + "num_input_tokens_seen": 29148650, + "router_z_loss_clip": 3.30078125, + "router_z_loss_mlp": 0.42480469, + "step": 1371, + "time_per_iteration": 2.651773452758789 + }, + { + "auxiliary_loss_clip": 0.01510917, + "auxiliary_loss_mlp": 0.00340208, + "balance_loss_clip": 1.18743753, + "balance_loss_mlp": 0.29962936, + "epoch": 0.08248910266045394, + "flos": 34491480393600.0, + "grad_norm": 17.008377014679116, + "language_loss": 0.77367759, + "learning_rate": 3.971169525711122e-06, + "loss": 0.79218882, + "num_input_tokens_seen": 29170785, + "router_z_loss_clip": 3.23828125, + "router_z_loss_mlp": 0.40576172, + "step": 1372, + "time_per_iteration": 4.184276580810547 + }, + { + "auxiliary_loss_clip": 0.01536004, + "auxiliary_loss_mlp": 0.00381441, + "balance_loss_clip": 1.19820213, + "balance_loss_mlp": 0.33654669, + "epoch": 0.0825492259131219, + "flos": 13436228960640.0, + "grad_norm": 12.89747305238341, + "language_loss": 0.98249853, + "learning_rate": 3.9711035982867246e-06, + "loss": 1.00167298, + "num_input_tokens_seen": 29185210, + "router_z_loss_clip": 3.375, + "router_z_loss_mlp": 0.44897461, + "step": 1373, + "time_per_iteration": 2.591956615447998 + }, + { + "auxiliary_loss_clip": 0.01504488, + "auxiliary_loss_mlp": 0.00361955, + "balance_loss_clip": 1.17351198, + "balance_loss_mlp": 0.321042, + "epoch": 0.08260934916578987, + "flos": 25812446636160.0, + "grad_norm": 16.96708677937049, + "language_loss": 0.90338355, + "learning_rate": 3.971037596117882e-06, + "loss": 0.92204797, + "num_input_tokens_seen": 29205210, + "router_z_loss_clip": 3.31054688, + "router_z_loss_mlp": 0.40917969, + "step": 1374, + "time_per_iteration": 2.6778225898742676 + }, + { + "auxiliary_loss_clip": 0.01286433, + "auxiliary_loss_mlp": 0.00312579, + "balance_loss_clip": 1.06154203, + "balance_loss_mlp": 0.28883225, + "epoch": 0.08266947241845783, + "flos": 63460009491840.0, + "grad_norm": 1.1604104936875914, + "language_loss": 0.60793674, + "learning_rate": 3.970971519207095e-06, + "loss": 0.62392688, + "num_input_tokens_seen": 29265350, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.23730469, + "step": 1375, + "time_per_iteration": 4.589948415756226 + }, + { + "auxiliary_loss_clip": 0.01279185, + "auxiliary_loss_mlp": 0.00299179, + "balance_loss_clip": 1.05391312, + "balance_loss_mlp": 0.27581388, + "epoch": 0.08272959567112581, + "flos": 69993704568960.0, + "grad_norm": 0.9207949082836084, + "language_loss": 0.62195438, + "learning_rate": 3.970905367556871e-06, + "loss": 0.63773805, + "num_input_tokens_seen": 29321475, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.23339844, + "step": 1376, + "time_per_iteration": 3.1141345500946045 + }, + { + "auxiliary_loss_clip": 0.01506086, + "auxiliary_loss_mlp": 0.00375788, + "balance_loss_clip": 1.17959213, + "balance_loss_mlp": 0.33511329, + "epoch": 0.08278971892379378, + "flos": 20413205781120.0, + "grad_norm": 29.75791082188071, + "language_loss": 0.87200284, + "learning_rate": 3.970839141169718e-06, + "loss": 0.89082158, + "num_input_tokens_seen": 29341405, + "router_z_loss_clip": 3.26171875, + "router_z_loss_mlp": 0.40673828, + "step": 1377, + "time_per_iteration": 2.783621311187744 + }, + { + "auxiliary_loss_clip": 0.0149548, + "auxiliary_loss_mlp": 0.00343829, + "balance_loss_clip": 1.17049563, + "balance_loss_mlp": 0.30665916, + "epoch": 0.08284984217646174, + "flos": 26250233598720.0, + "grad_norm": 27.82418002448848, + "language_loss": 0.90899485, + "learning_rate": 3.970772840048147e-06, + "loss": 0.92738795, + "num_input_tokens_seen": 29361955, + "router_z_loss_clip": 3.25, + "router_z_loss_mlp": 0.37182617, + "step": 1378, + "time_per_iteration": 2.7616915702819824 + }, + { + "auxiliary_loss_clip": 0.01490664, + "auxiliary_loss_mlp": 0.0035293, + "balance_loss_clip": 1.16233206, + "balance_loss_mlp": 0.31287569, + "epoch": 0.08290996542912972, + "flos": 27194683852800.0, + "grad_norm": 25.91353255953872, + "language_loss": 0.93953145, + "learning_rate": 3.970706464194672e-06, + "loss": 0.95796728, + "num_input_tokens_seen": 29382395, + "router_z_loss_clip": 3.2890625, + "router_z_loss_mlp": 0.40087891, + "step": 1379, + "time_per_iteration": 2.6669485569000244 + }, + { + "auxiliary_loss_clip": 0.0148992, + "auxiliary_loss_mlp": 0.00340641, + "balance_loss_clip": 1.171103, + "balance_loss_mlp": 0.3070004, + "epoch": 0.08297008868179769, + "flos": 38618191146240.0, + "grad_norm": 4.957387895827511, + "language_loss": 0.86753297, + "learning_rate": 3.970640013611812e-06, + "loss": 0.88583863, + "num_input_tokens_seen": 29404460, + "router_z_loss_clip": 3.18945312, + "router_z_loss_mlp": 0.33642578, + "step": 1380, + "time_per_iteration": 2.7512216567993164 + }, + { + "auxiliary_loss_clip": 0.01476404, + "auxiliary_loss_mlp": 0.00359361, + "balance_loss_clip": 1.1538167, + "balance_loss_mlp": 0.32138044, + "epoch": 0.08303021193446565, + "flos": 19974736460160.0, + "grad_norm": 2.4440670162000764, + "language_loss": 0.9310627, + "learning_rate": 3.970573488302083e-06, + "loss": 0.94942033, + "num_input_tokens_seen": 29422675, + "router_z_loss_clip": 3.2265625, + "router_z_loss_mlp": 0.37963867, + "step": 1381, + "time_per_iteration": 2.583623170852661 + }, + { + "auxiliary_loss_clip": 0.01479991, + "auxiliary_loss_mlp": 0.00362033, + "balance_loss_clip": 1.15537977, + "balance_loss_mlp": 0.32410049, + "epoch": 0.08309033518713363, + "flos": 13662646341120.0, + "grad_norm": 17.340877554445722, + "language_loss": 0.9950496, + "learning_rate": 3.970506888268011e-06, + "loss": 1.01346993, + "num_input_tokens_seen": 29439840, + "router_z_loss_clip": 3.24414062, + "router_z_loss_mlp": 0.37963867, + "step": 1382, + "time_per_iteration": 2.6213760375976562 + }, + { + "auxiliary_loss_clip": 0.01476485, + "auxiliary_loss_mlp": 0.00338399, + "balance_loss_clip": 1.1549325, + "balance_loss_mlp": 0.30173057, + "epoch": 0.0831504584398016, + "flos": 17968551068160.0, + "grad_norm": 41.384821818849886, + "language_loss": 0.82987273, + "learning_rate": 3.970440213512121e-06, + "loss": 0.84802151, + "num_input_tokens_seen": 29457360, + "router_z_loss_clip": 3.21484375, + "router_z_loss_mlp": 0.36694336, + "step": 1383, + "time_per_iteration": 2.5799930095672607 + }, + { + "auxiliary_loss_clip": 0.01461771, + "auxiliary_loss_mlp": 0.00334515, + "balance_loss_clip": 1.14639688, + "balance_loss_mlp": 0.29879999, + "epoch": 0.08321058169246956, + "flos": 22601386408320.0, + "grad_norm": 3.64615711127402, + "language_loss": 0.91532779, + "learning_rate": 3.97037346403694e-06, + "loss": 0.9332906, + "num_input_tokens_seen": 29477040, + "router_z_loss_clip": 3.15234375, + "router_z_loss_mlp": 0.35693359, + "step": 1384, + "time_per_iteration": 2.6343994140625 + }, + { + "auxiliary_loss_clip": 0.0147155, + "auxiliary_loss_mlp": 0.00420901, + "balance_loss_clip": 1.14142644, + "balance_loss_mlp": 0.37803364, + "epoch": 0.08327070494513754, + "flos": 22850426378880.0, + "grad_norm": 11.649242793535327, + "language_loss": 0.95609069, + "learning_rate": 3.970306639845e-06, + "loss": 0.97501516, + "num_input_tokens_seen": 29492010, + "router_z_loss_clip": 3.30078125, + "router_z_loss_mlp": 0.42871094, + "step": 1385, + "time_per_iteration": 2.6666338443756104 + }, + { + "auxiliary_loss_clip": 0.01457364, + "auxiliary_loss_mlp": 0.00396145, + "balance_loss_clip": 1.13607764, + "balance_loss_mlp": 0.35322946, + "epoch": 0.0833308281978055, + "flos": 22782986593920.0, + "grad_norm": 34.959581184296546, + "language_loss": 0.7881304, + "learning_rate": 3.970239740938835e-06, + "loss": 0.80666554, + "num_input_tokens_seen": 29511850, + "router_z_loss_clip": 3.21484375, + "router_z_loss_mlp": 0.4296875, + "step": 1386, + "time_per_iteration": 2.64005184173584 + }, + { + "auxiliary_loss_clip": 0.014624, + "auxiliary_loss_mlp": 0.00374258, + "balance_loss_clip": 1.1418376, + "balance_loss_mlp": 0.33525279, + "epoch": 0.08339095145047347, + "flos": 20812604083200.0, + "grad_norm": 16.610477528073023, + "language_loss": 0.88818771, + "learning_rate": 3.97017276732098e-06, + "loss": 0.90655428, + "num_input_tokens_seen": 29531415, + "router_z_loss_clip": 3.20507812, + "router_z_loss_mlp": 0.39038086, + "step": 1387, + "time_per_iteration": 2.710378885269165 + }, + { + "auxiliary_loss_clip": 0.01457109, + "auxiliary_loss_mlp": 0.00386503, + "balance_loss_clip": 1.13884139, + "balance_loss_mlp": 0.34859487, + "epoch": 0.08345107470314143, + "flos": 18515326872960.0, + "grad_norm": 4.152718201045154, + "language_loss": 0.84819818, + "learning_rate": 3.970105718993978e-06, + "loss": 0.86663425, + "num_input_tokens_seen": 29549525, + "router_z_loss_clip": 3.18359375, + "router_z_loss_mlp": 0.37915039, + "step": 1388, + "time_per_iteration": 2.605311870574951 + }, + { + "auxiliary_loss_clip": 0.01443011, + "auxiliary_loss_mlp": 0.00357603, + "balance_loss_clip": 1.13340139, + "balance_loss_mlp": 0.32167298, + "epoch": 0.08351119795580941, + "flos": 18807567926400.0, + "grad_norm": 8.309567451449135, + "language_loss": 0.87633842, + "learning_rate": 3.970038595960369e-06, + "loss": 0.89434457, + "num_input_tokens_seen": 29568705, + "router_z_loss_clip": 3.09375, + "router_z_loss_mlp": 0.359375, + "step": 1389, + "time_per_iteration": 2.6574862003326416 + }, + { + "auxiliary_loss_clip": 0.0146468, + "auxiliary_loss_mlp": 0.00400138, + "balance_loss_clip": 1.1459204, + "balance_loss_mlp": 0.35874861, + "epoch": 0.08357132120847738, + "flos": 18441817689600.0, + "grad_norm": 59.022413469291216, + "language_loss": 0.96220088, + "learning_rate": 3.969971398222699e-06, + "loss": 0.98084909, + "num_input_tokens_seen": 29585855, + "router_z_loss_clip": 3.18945312, + "router_z_loss_mlp": 0.41381836, + "step": 1390, + "time_per_iteration": 2.714029550552368 + }, + { + "auxiliary_loss_clip": 0.01470727, + "auxiliary_loss_mlp": 0.00374149, + "balance_loss_clip": 1.15050161, + "balance_loss_mlp": 0.33318818, + "epoch": 0.08363144446114534, + "flos": 25922333318400.0, + "grad_norm": 9.874281619070665, + "language_loss": 0.92696488, + "learning_rate": 3.969904125783517e-06, + "loss": 0.94541365, + "num_input_tokens_seen": 29607280, + "router_z_loss_clip": 3.20117188, + "router_z_loss_mlp": 0.40942383, + "step": 1391, + "time_per_iteration": 2.803018093109131 + }, + { + "auxiliary_loss_clip": 0.01486396, + "auxiliary_loss_mlp": 0.00446116, + "balance_loss_clip": 1.16030455, + "balance_loss_mlp": 0.40105504, + "epoch": 0.08369156771381332, + "flos": 18041306065920.0, + "grad_norm": 42.72536792030382, + "language_loss": 0.99160397, + "learning_rate": 3.969836778645371e-06, + "loss": 1.01092911, + "num_input_tokens_seen": 29624130, + "router_z_loss_clip": 3.26171875, + "router_z_loss_mlp": 0.45019531, + "step": 1392, + "time_per_iteration": 2.6316065788269043 + }, + { + "auxiliary_loss_clip": 0.01484904, + "auxiliary_loss_mlp": 0.00403862, + "balance_loss_clip": 1.15944028, + "balance_loss_mlp": 0.36223447, + "epoch": 0.08375169096648129, + "flos": 22675111073280.0, + "grad_norm": 947.9845413577557, + "language_loss": 0.88693172, + "learning_rate": 3.969769356810819e-06, + "loss": 0.90581942, + "num_input_tokens_seen": 29643210, + "router_z_loss_clip": 3.25585938, + "router_z_loss_mlp": 0.41625977, + "step": 1393, + "time_per_iteration": 2.6955533027648926 + }, + { + "auxiliary_loss_clip": 0.01474758, + "auxiliary_loss_mlp": 0.00392328, + "balance_loss_clip": 1.15612888, + "balance_loss_mlp": 0.35465834, + "epoch": 0.08381181421914925, + "flos": 26103215232000.0, + "grad_norm": 196.96000809576648, + "language_loss": 0.91408396, + "learning_rate": 3.969701860282415e-06, + "loss": 0.93275481, + "num_input_tokens_seen": 29663920, + "router_z_loss_clip": 3.18359375, + "router_z_loss_mlp": 0.37670898, + "step": 1394, + "time_per_iteration": 2.679044008255005 + }, + { + "auxiliary_loss_clip": 0.01473909, + "auxiliary_loss_mlp": 0.00401305, + "balance_loss_clip": 1.15419221, + "balance_loss_mlp": 0.36201355, + "epoch": 0.08387193747181723, + "flos": 20629782835200.0, + "grad_norm": 48.92272220059523, + "language_loss": 0.88366318, + "learning_rate": 3.969634289062719e-06, + "loss": 0.90241534, + "num_input_tokens_seen": 29683825, + "router_z_loss_clip": 3.19921875, + "router_z_loss_mlp": 0.39282227, + "step": 1395, + "time_per_iteration": 2.7173666954040527 + }, + { + "auxiliary_loss_clip": 0.01491892, + "auxiliary_loss_mlp": 0.00438842, + "balance_loss_clip": 1.16492701, + "balance_loss_mlp": 0.39482969, + "epoch": 0.0839320607244852, + "flos": 13443196199040.0, + "grad_norm": 3.486364621210599, + "language_loss": 0.91763014, + "learning_rate": 3.969566643154293e-06, + "loss": 0.93693751, + "num_input_tokens_seen": 29698775, + "router_z_loss_clip": 3.27148438, + "router_z_loss_mlp": 0.44018555, + "step": 1396, + "time_per_iteration": 2.5968189239501953 + }, + { + "auxiliary_loss_clip": 0.01489128, + "auxiliary_loss_mlp": 0.00427251, + "balance_loss_clip": 1.16639566, + "balance_loss_mlp": 0.38605267, + "epoch": 0.08399218397715316, + "flos": 23477247642240.0, + "grad_norm": 3.3466701554884732, + "language_loss": 0.83904576, + "learning_rate": 3.969498922559703e-06, + "loss": 0.85820961, + "num_input_tokens_seen": 29719430, + "router_z_loss_clip": 3.22460938, + "router_z_loss_mlp": 0.41210938, + "step": 1397, + "time_per_iteration": 2.758316993713379 + }, + { + "auxiliary_loss_clip": 0.0150117, + "auxiliary_loss_mlp": 0.00421882, + "balance_loss_clip": 1.18098354, + "balance_loss_mlp": 0.38108853, + "epoch": 0.08405230722982113, + "flos": 25920717206400.0, + "grad_norm": 117.623737339325, + "language_loss": 0.85628992, + "learning_rate": 3.969431127281516e-06, + "loss": 0.87552047, + "num_input_tokens_seen": 29739685, + "router_z_loss_clip": 3.203125, + "router_z_loss_mlp": 0.40771484, + "step": 1398, + "time_per_iteration": 2.6821324825286865 + }, + { + "auxiliary_loss_clip": 0.01477498, + "auxiliary_loss_mlp": 0.00420109, + "balance_loss_clip": 1.1636008, + "balance_loss_mlp": 0.38141394, + "epoch": 0.0841124304824891, + "flos": 17967437746560.0, + "grad_norm": 307.11899152085994, + "language_loss": 1.00626695, + "learning_rate": 3.969363257322304e-06, + "loss": 1.02524304, + "num_input_tokens_seen": 29756165, + "router_z_loss_clip": 3.140625, + "router_z_loss_mlp": 0.38720703, + "step": 1399, + "time_per_iteration": 2.6626136302948 + }, + { + "auxiliary_loss_clip": 0.01516433, + "auxiliary_loss_mlp": 0.00478644, + "balance_loss_clip": 1.17932439, + "balance_loss_mlp": 0.43045929, + "epoch": 0.08417255373515707, + "flos": 25629661301760.0, + "grad_norm": 52.06691807792565, + "language_loss": 0.89229429, + "learning_rate": 3.96929531268464e-06, + "loss": 0.91224504, + "num_input_tokens_seen": 29776425, + "router_z_loss_clip": 3.37304688, + "router_z_loss_mlp": 0.48217773, + "step": 1400, + "time_per_iteration": 2.6726112365722656 + }, + { + "auxiliary_loss_clip": 0.01509501, + "auxiliary_loss_mlp": 0.0046302, + "balance_loss_clip": 1.17792201, + "balance_loss_mlp": 0.418531, + "epoch": 0.08423267698782504, + "flos": 26249730808320.0, + "grad_norm": 3.8812751806786823, + "language_loss": 0.92343247, + "learning_rate": 3.969227293371099e-06, + "loss": 0.94315767, + "num_input_tokens_seen": 29796440, + "router_z_loss_clip": 3.31445312, + "router_z_loss_mlp": 0.44458008, + "step": 1401, + "time_per_iteration": 2.671091318130493 + }, + { + "auxiliary_loss_clip": 0.01480182, + "auxiliary_loss_mlp": 0.00463384, + "balance_loss_clip": 1.15472519, + "balance_loss_mlp": 0.41956222, + "epoch": 0.08429280024049302, + "flos": 20119707751680.0, + "grad_norm": 9.292077422187699, + "language_loss": 0.93297279, + "learning_rate": 3.969159199384263e-06, + "loss": 0.95240831, + "num_input_tokens_seen": 29814755, + "router_z_loss_clip": 3.25585938, + "router_z_loss_mlp": 0.43774414, + "step": 1402, + "time_per_iteration": 2.6246485710144043 + }, + { + "auxiliary_loss_clip": 0.01479151, + "auxiliary_loss_mlp": 0.00450614, + "balance_loss_clip": 1.15982437, + "balance_loss_mlp": 0.41151339, + "epoch": 0.08435292349316098, + "flos": 42924526836480.0, + "grad_norm": 5.146818398454941, + "language_loss": 0.94136912, + "learning_rate": 3.9690910307267125e-06, + "loss": 0.96066672, + "num_input_tokens_seen": 29834785, + "router_z_loss_clip": 3.19335938, + "router_z_loss_mlp": 0.390625, + "step": 1403, + "time_per_iteration": 2.8343868255615234 + }, + { + "auxiliary_loss_clip": 0.01491251, + "auxiliary_loss_mlp": 0.0052383, + "balance_loss_clip": 1.16804576, + "balance_loss_mlp": 0.47676599, + "epoch": 0.08441304674582895, + "flos": 22857285876480.0, + "grad_norm": 1917.7035852091574, + "language_loss": 0.87639654, + "learning_rate": 3.969022787401033e-06, + "loss": 0.89654732, + "num_input_tokens_seen": 29854695, + "router_z_loss_clip": 3.23242188, + "router_z_loss_mlp": 0.47070312, + "step": 1404, + "time_per_iteration": 2.644843578338623 + }, + { + "auxiliary_loss_clip": 0.01491258, + "auxiliary_loss_mlp": 0.00554098, + "balance_loss_clip": 1.16087317, + "balance_loss_mlp": 0.50758219, + "epoch": 0.08447316999849692, + "flos": 18697501676160.0, + "grad_norm": 9.603147835275461, + "language_loss": 0.92833388, + "learning_rate": 3.968954469409811e-06, + "loss": 0.94878745, + "num_input_tokens_seen": 29872180, + "router_z_loss_clip": 3.30273438, + "router_z_loss_mlp": 0.46557617, + "step": 1405, + "time_per_iteration": 2.6111621856689453 + }, + { + "auxiliary_loss_clip": 0.01488433, + "auxiliary_loss_mlp": 0.0064152, + "balance_loss_clip": 1.16277635, + "balance_loss_mlp": 0.59250075, + "epoch": 0.08453329325116489, + "flos": 25483971738240.0, + "grad_norm": 275.78596606624666, + "language_loss": 0.85675609, + "learning_rate": 3.968886076755639e-06, + "loss": 0.87805557, + "num_input_tokens_seen": 29893205, + "router_z_loss_clip": 3.25390625, + "router_z_loss_mlp": 0.49023438, + "step": 1406, + "time_per_iteration": 2.707791805267334 + }, + { + "auxiliary_loss_clip": 0.0150664, + "auxiliary_loss_mlp": 0.00721485, + "balance_loss_clip": 1.17429471, + "balance_loss_mlp": 0.6713928, + "epoch": 0.08459341650383286, + "flos": 20920048640640.0, + "grad_norm": 23.086204912749324, + "language_loss": 0.85250294, + "learning_rate": 3.96881760944111e-06, + "loss": 0.87478423, + "num_input_tokens_seen": 29911970, + "router_z_loss_clip": 3.32617188, + "router_z_loss_mlp": 0.5012207, + "step": 1407, + "time_per_iteration": 2.610388994216919 + }, + { + "auxiliary_loss_clip": 0.01507452, + "auxiliary_loss_mlp": 0.00786237, + "balance_loss_clip": 1.17099369, + "balance_loss_mlp": 0.73221099, + "epoch": 0.08465353975650082, + "flos": 13043079624960.0, + "grad_norm": 57.22063493213088, + "language_loss": 0.98215747, + "learning_rate": 3.968749067468819e-06, + "loss": 1.00509429, + "num_input_tokens_seen": 29929925, + "router_z_loss_clip": 3.36523438, + "router_z_loss_mlp": 0.53979492, + "step": 1408, + "time_per_iteration": 2.6865179538726807 + }, + { + "auxiliary_loss_clip": 0.01578148, + "auxiliary_loss_mlp": 0.00681473, + "balance_loss_clip": 1.34453368, + "balance_loss_mlp": 0.64237195, + "epoch": 0.0847136630091688, + "flos": 60877422552960.0, + "grad_norm": 0.9466890930103423, + "language_loss": 0.62134963, + "learning_rate": 3.968680450841368e-06, + "loss": 0.64394581, + "num_input_tokens_seen": 29985950, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.390625, + "step": 1409, + "time_per_iteration": 3.2589778900146484 + }, + { + "auxiliary_loss_clip": 0.01512313, + "auxiliary_loss_mlp": 0.00888072, + "balance_loss_clip": 1.17792308, + "balance_loss_mlp": 0.83423698, + "epoch": 0.08477378626183676, + "flos": 22046530043520.0, + "grad_norm": 3.436600939512624, + "language_loss": 0.93820202, + "learning_rate": 3.968611759561355e-06, + "loss": 0.96220589, + "num_input_tokens_seen": 30004330, + "router_z_loss_clip": 3.34375, + "router_z_loss_mlp": 0.53808594, + "step": 1410, + "time_per_iteration": 2.6580398082733154 + }, + { + "auxiliary_loss_clip": 0.01518692, + "auxiliary_loss_mlp": 0.00887568, + "balance_loss_clip": 1.16877747, + "balance_loss_mlp": 0.82801116, + "epoch": 0.08483390951450473, + "flos": 16690059308160.0, + "grad_norm": 3.9595279578017806, + "language_loss": 0.80310565, + "learning_rate": 3.968542993631388e-06, + "loss": 0.82716823, + "num_input_tokens_seen": 30022555, + "router_z_loss_clip": 3.49414062, + "router_z_loss_mlp": 0.59570312, + "step": 1411, + "time_per_iteration": 5.423904657363892 + }, + { + "auxiliary_loss_clip": 0.01470354, + "auxiliary_loss_mlp": 0.00506167, + "balance_loss_clip": 1.24291337, + "balance_loss_mlp": 0.46763849, + "epoch": 0.08489403276717271, + "flos": 51584640082560.0, + "grad_norm": 0.9027719414639797, + "language_loss": 0.5690847, + "learning_rate": 3.968474153054073e-06, + "loss": 0.5888499, + "num_input_tokens_seen": 30077220, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.38476562, + "step": 1412, + "time_per_iteration": 3.097105026245117 + }, + { + "auxiliary_loss_clip": 0.01495942, + "auxiliary_loss_mlp": 0.00691523, + "balance_loss_clip": 1.14618254, + "balance_loss_mlp": 0.64026308, + "epoch": 0.08495415601984067, + "flos": 17092330698240.0, + "grad_norm": 148.88458336940633, + "language_loss": 0.98071611, + "learning_rate": 3.96840523783202e-06, + "loss": 1.00259078, + "num_input_tokens_seen": 30094600, + "router_z_loss_clip": 3.49804688, + "router_z_loss_mlp": 0.51245117, + "step": 1413, + "time_per_iteration": 2.6239163875579834 + }, + { + "auxiliary_loss_clip": 0.01494976, + "auxiliary_loss_mlp": 0.00676632, + "balance_loss_clip": 1.14531684, + "balance_loss_mlp": 0.62468112, + "epoch": 0.08501427927250864, + "flos": 23148413608320.0, + "grad_norm": 5.253088800144623, + "language_loss": 0.9449749, + "learning_rate": 3.968336247967844e-06, + "loss": 0.96669096, + "num_input_tokens_seen": 30114475, + "router_z_loss_clip": 3.49804688, + "router_z_loss_mlp": 0.51928711, + "step": 1414, + "time_per_iteration": 2.692439079284668 + }, + { + "auxiliary_loss_clip": 0.01492321, + "auxiliary_loss_mlp": 0.00628795, + "balance_loss_clip": 1.13486242, + "balance_loss_mlp": 0.57958543, + "epoch": 0.08507440252517662, + "flos": 19063467394560.0, + "grad_norm": 5.732840028320461, + "language_loss": 0.82910991, + "learning_rate": 3.96826718346416e-06, + "loss": 0.85032105, + "num_input_tokens_seen": 30133350, + "router_z_loss_clip": 3.57617188, + "router_z_loss_mlp": 0.49194336, + "step": 1415, + "time_per_iteration": 4.076260328292847 + }, + { + "auxiliary_loss_clip": 0.01497348, + "auxiliary_loss_mlp": 0.00596743, + "balance_loss_clip": 1.12750661, + "balance_loss_mlp": 0.54567426, + "epoch": 0.08513452577784458, + "flos": 60182296600320.0, + "grad_norm": 11.99697523076555, + "language_loss": 0.76938081, + "learning_rate": 3.968198044323587e-06, + "loss": 0.79032171, + "num_input_tokens_seen": 30159005, + "router_z_loss_clip": 3.69921875, + "router_z_loss_mlp": 0.51049805, + "step": 1416, + "time_per_iteration": 2.974874973297119 + }, + { + "auxiliary_loss_clip": 0.01504847, + "auxiliary_loss_mlp": 0.00552343, + "balance_loss_clip": 1.128811, + "balance_loss_mlp": 0.50330061, + "epoch": 0.08519464903051255, + "flos": 27308485117440.0, + "grad_norm": 17.624798077327075, + "language_loss": 0.83111322, + "learning_rate": 3.968128830548748e-06, + "loss": 0.85168517, + "num_input_tokens_seen": 30179450, + "router_z_loss_clip": 3.7578125, + "router_z_loss_mlp": 0.49047852, + "step": 1417, + "time_per_iteration": 4.14927339553833 + }, + { + "auxiliary_loss_clip": 0.01517473, + "auxiliary_loss_mlp": 0.00551271, + "balance_loss_clip": 1.13932395, + "balance_loss_mlp": 0.50027335, + "epoch": 0.08525477228318051, + "flos": 20266438809600.0, + "grad_norm": 77.08634862179153, + "language_loss": 0.92682022, + "learning_rate": 3.968059542142265e-06, + "loss": 0.94750762, + "num_input_tokens_seen": 30197235, + "router_z_loss_clip": 3.78320312, + "router_z_loss_mlp": 0.51025391, + "step": 1418, + "time_per_iteration": 2.651712656021118 + }, + { + "auxiliary_loss_clip": 0.01429892, + "auxiliary_loss_mlp": 0.01093657, + "balance_loss_clip": 1.1568867, + "balance_loss_mlp": 1.0429213, + "epoch": 0.08531489553584849, + "flos": 67615017183360.0, + "grad_norm": 1.091315559581165, + "language_loss": 0.56647778, + "learning_rate": 3.9679901791067685e-06, + "loss": 0.59171319, + "num_input_tokens_seen": 30257410, + "router_z_loss_clip": 2.734375, + "router_z_loss_mlp": 0.5078125, + "step": 1419, + "time_per_iteration": 3.086315393447876 + }, + { + "auxiliary_loss_clip": 0.01502746, + "auxiliary_loss_mlp": 0.00511685, + "balance_loss_clip": 1.11740446, + "balance_loss_mlp": 0.46683809, + "epoch": 0.08537501878851646, + "flos": 27526965592320.0, + "grad_norm": 13.627144618741106, + "language_loss": 0.78429788, + "learning_rate": 3.967920741444886e-06, + "loss": 0.80444217, + "num_input_tokens_seen": 30277865, + "router_z_loss_clip": 3.85546875, + "router_z_loss_mlp": 0.44799805, + "step": 1420, + "time_per_iteration": 2.7210195064544678 + }, + { + "auxiliary_loss_clip": 0.01533569, + "auxiliary_loss_mlp": 0.00467096, + "balance_loss_clip": 1.13367939, + "balance_loss_mlp": 0.42553943, + "epoch": 0.08543514204118442, + "flos": 22784243569920.0, + "grad_norm": 10.652391466836573, + "language_loss": 0.94743413, + "learning_rate": 3.967851229159252e-06, + "loss": 0.96744078, + "num_input_tokens_seen": 30298545, + "router_z_loss_clip": 4.0, + "router_z_loss_mlp": 0.41552734, + "step": 1421, + "time_per_iteration": 2.736368179321289 + }, + { + "auxiliary_loss_clip": 0.0144046, + "auxiliary_loss_mlp": 0.00572, + "balance_loss_clip": 1.15733194, + "balance_loss_mlp": 0.54739541, + "epoch": 0.0854952652938524, + "flos": 60990721027200.0, + "grad_norm": 0.8035149817353388, + "language_loss": 0.63610911, + "learning_rate": 3.967781642252502e-06, + "loss": 0.65623379, + "num_input_tokens_seen": 30361725, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 0.24609375, + "step": 1422, + "time_per_iteration": 3.1159443855285645 + }, + { + "auxiliary_loss_clip": 0.01523658, + "auxiliary_loss_mlp": 0.00456332, + "balance_loss_clip": 1.13433599, + "balance_loss_mlp": 0.41749397, + "epoch": 0.08555538854652037, + "flos": 28038046256640.0, + "grad_norm": 6.82050859032842, + "language_loss": 0.89552706, + "learning_rate": 3.967711980727276e-06, + "loss": 0.91532695, + "num_input_tokens_seen": 30382180, + "router_z_loss_clip": 3.89453125, + "router_z_loss_mlp": 0.38842773, + "step": 1423, + "time_per_iteration": 2.670178174972534 + }, + { + "auxiliary_loss_clip": 0.01545768, + "auxiliary_loss_mlp": 0.00415143, + "balance_loss_clip": 1.1478734, + "balance_loss_mlp": 0.37961873, + "epoch": 0.08561551179918833, + "flos": 23509279595520.0, + "grad_norm": 41.0583568058291, + "language_loss": 0.83026731, + "learning_rate": 3.967642244586213e-06, + "loss": 0.8498764, + "num_input_tokens_seen": 30402980, + "router_z_loss_clip": 3.98046875, + "router_z_loss_mlp": 0.35571289, + "step": 1424, + "time_per_iteration": 2.6902871131896973 + }, + { + "auxiliary_loss_clip": 0.01551748, + "auxiliary_loss_mlp": 0.00386208, + "balance_loss_clip": 1.15483809, + "balance_loss_mlp": 0.3531158, + "epoch": 0.08567563505185631, + "flos": 17926930183680.0, + "grad_norm": 12.249714471230456, + "language_loss": 0.82475638, + "learning_rate": 3.96757243383196e-06, + "loss": 0.844136, + "num_input_tokens_seen": 30420800, + "router_z_loss_clip": 3.97070312, + "router_z_loss_mlp": 0.33105469, + "step": 1425, + "time_per_iteration": 2.5962626934051514 + }, + { + "auxiliary_loss_clip": 0.01546463, + "auxiliary_loss_mlp": 0.00400786, + "balance_loss_clip": 1.15391159, + "balance_loss_mlp": 0.36437944, + "epoch": 0.08573575830452428, + "flos": 19719519350400.0, + "grad_norm": 5.503482996069108, + "language_loss": 1.00503898, + "learning_rate": 3.9675025484671624e-06, + "loss": 1.02451158, + "num_input_tokens_seen": 30439620, + "router_z_loss_clip": 3.9296875, + "router_z_loss_mlp": 0.36425781, + "step": 1426, + "time_per_iteration": 2.652249574661255 + }, + { + "auxiliary_loss_clip": 0.01546222, + "auxiliary_loss_mlp": 0.00444703, + "balance_loss_clip": 1.15647817, + "balance_loss_mlp": 0.40770048, + "epoch": 0.08579588155719224, + "flos": 17931563038080.0, + "grad_norm": 780.6686840872288, + "language_loss": 0.84651613, + "learning_rate": 3.967432588494471e-06, + "loss": 0.86642545, + "num_input_tokens_seen": 30457300, + "router_z_loss_clip": 3.90039062, + "router_z_loss_mlp": 0.36987305, + "step": 1427, + "time_per_iteration": 2.6352133750915527 + }, + { + "auxiliary_loss_clip": 0.01558936, + "auxiliary_loss_mlp": 0.00612597, + "balance_loss_clip": 1.17077947, + "balance_loss_mlp": 0.57411641, + "epoch": 0.08585600480986022, + "flos": 16033324993920.0, + "grad_norm": 45.58893189663297, + "language_loss": 0.9032799, + "learning_rate": 3.96736255391654e-06, + "loss": 0.92499518, + "num_input_tokens_seen": 30471580, + "router_z_loss_clip": 3.88085938, + "router_z_loss_mlp": 0.38427734, + "step": 1428, + "time_per_iteration": 2.6484854221343994 + }, + { + "auxiliary_loss_clip": 0.01571678, + "auxiliary_loss_mlp": 0.00627901, + "balance_loss_clip": 1.17429042, + "balance_loss_mlp": 0.5867734, + "epoch": 0.08591612806252819, + "flos": 28657433404800.0, + "grad_norm": 20.679888670668813, + "language_loss": 0.89883065, + "learning_rate": 3.967292444736023e-06, + "loss": 0.92082649, + "num_input_tokens_seen": 30492720, + "router_z_loss_clip": 3.97265625, + "router_z_loss_mlp": 0.41137695, + "step": 1429, + "time_per_iteration": 2.6823489665985107 + }, + { + "auxiliary_loss_clip": 0.01568537, + "auxiliary_loss_mlp": 0.00821139, + "balance_loss_clip": 1.17744303, + "balance_loss_mlp": 0.77572048, + "epoch": 0.08597625131519615, + "flos": 20959119659520.0, + "grad_norm": 1910.1090961771852, + "language_loss": 0.93997419, + "learning_rate": 3.967222260955578e-06, + "loss": 0.96387088, + "num_input_tokens_seen": 30509535, + "router_z_loss_clip": 3.91796875, + "router_z_loss_mlp": 0.45410156, + "step": 1430, + "time_per_iteration": 2.6118879318237305 + }, + { + "auxiliary_loss_clip": 0.01567433, + "auxiliary_loss_mlp": 0.00907472, + "balance_loss_clip": 1.17795038, + "balance_loss_mlp": 0.86214888, + "epoch": 0.08603637456786412, + "flos": 23256360956160.0, + "grad_norm": 13.615381743991968, + "language_loss": 0.87828445, + "learning_rate": 3.96715200257787e-06, + "loss": 0.90303355, + "num_input_tokens_seen": 30529490, + "router_z_loss_clip": 3.89453125, + "router_z_loss_mlp": 0.45336914, + "step": 1431, + "time_per_iteration": 2.6730010509490967 + }, + { + "auxiliary_loss_clip": 0.01583713, + "auxiliary_loss_mlp": 0.00998183, + "balance_loss_clip": 1.19190609, + "balance_loss_mlp": 0.94837701, + "epoch": 0.0860964978205321, + "flos": 28694170039680.0, + "grad_norm": 43.756851987733626, + "language_loss": 0.81969738, + "learning_rate": 3.967081669605559e-06, + "loss": 0.84551644, + "num_input_tokens_seen": 30550205, + "router_z_loss_clip": 3.91601562, + "router_z_loss_mlp": 0.49755859, + "step": 1432, + "time_per_iteration": 2.6992886066436768 + }, + { + "auxiliary_loss_clip": 0.01560809, + "auxiliary_loss_mlp": 0.01055311, + "balance_loss_clip": 1.18237913, + "balance_loss_mlp": 1.00424194, + "epoch": 0.08615662107320006, + "flos": 19318397195520.0, + "grad_norm": 5.912445912875783, + "language_loss": 0.81196034, + "learning_rate": 3.967011262041315e-06, + "loss": 0.83812153, + "num_input_tokens_seen": 30568830, + "router_z_loss_clip": 3.78515625, + "router_z_loss_mlp": 0.51074219, + "step": 1433, + "time_per_iteration": 2.6771671772003174 + }, + { + "auxiliary_loss_clip": 0.01577512, + "auxiliary_loss_mlp": 0.01104613, + "balance_loss_clip": 1.19040751, + "balance_loss_mlp": 1.04791701, + "epoch": 0.08621674432586802, + "flos": 15851688894720.0, + "grad_norm": 342.91953573954845, + "language_loss": 0.94460464, + "learning_rate": 3.9669407798878065e-06, + "loss": 0.97142589, + "num_input_tokens_seen": 30585730, + "router_z_loss_clip": 3.87109375, + "router_z_loss_mlp": 0.56689453, + "step": 1434, + "time_per_iteration": 2.639725923538208 + }, + { + "auxiliary_loss_clip": 0.01569051, + "auxiliary_loss_mlp": 0.01090521, + "balance_loss_clip": 1.18526292, + "balance_loss_mlp": 1.03656721, + "epoch": 0.086276867578536, + "flos": 14100648785280.0, + "grad_norm": 103.31636671846091, + "language_loss": 0.8596074, + "learning_rate": 3.966870223147707e-06, + "loss": 0.88620311, + "num_input_tokens_seen": 30603180, + "router_z_loss_clip": 3.83398438, + "router_z_loss_mlp": 0.53955078, + "step": 1435, + "time_per_iteration": 2.7046473026275635 + }, + { + "auxiliary_loss_clip": 0.0154036, + "auxiliary_loss_mlp": 0.0144339, + "balance_loss_clip": 1.29662931, + "balance_loss_mlp": 1.41287231, + "epoch": 0.08633699083120397, + "flos": 70184857772160.0, + "grad_norm": 0.9812531490652936, + "language_loss": 0.58532405, + "learning_rate": 3.96679959182369e-06, + "loss": 0.61516154, + "num_input_tokens_seen": 30668895, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.3046875, + "step": 1436, + "time_per_iteration": 3.2617053985595703 + }, + { + "auxiliary_loss_clip": 0.01562005, + "auxiliary_loss_mlp": 0.00987248, + "balance_loss_clip": 1.19172621, + "balance_loss_mlp": 0.93615472, + "epoch": 0.08639711408387193, + "flos": 30298874140800.0, + "grad_norm": 20.640088491902095, + "language_loss": 0.80583286, + "learning_rate": 3.966728885918437e-06, + "loss": 0.83132541, + "num_input_tokens_seen": 30688955, + "router_z_loss_clip": 3.70117188, + "router_z_loss_mlp": 0.51098633, + "step": 1437, + "time_per_iteration": 2.741683006286621 + }, + { + "auxiliary_loss_clip": 0.01570457, + "auxiliary_loss_mlp": 0.00938521, + "balance_loss_clip": 1.19378304, + "balance_loss_mlp": 0.88976395, + "epoch": 0.08645723733653991, + "flos": 20297680663680.0, + "grad_norm": 5.532559750722417, + "language_loss": 0.78448057, + "learning_rate": 3.966658105434627e-06, + "loss": 0.80957037, + "num_input_tokens_seen": 30706095, + "router_z_loss_clip": 3.765625, + "router_z_loss_mlp": 0.48730469, + "step": 1438, + "time_per_iteration": 2.600903034210205 + }, + { + "auxiliary_loss_clip": 0.01566197, + "auxiliary_loss_mlp": 0.00919743, + "balance_loss_clip": 1.19831443, + "balance_loss_mlp": 0.86469233, + "epoch": 0.08651736058920788, + "flos": 32890583134080.0, + "grad_norm": 13.912638113822572, + "language_loss": 0.71318328, + "learning_rate": 3.966587250374945e-06, + "loss": 0.73804265, + "num_input_tokens_seen": 30729025, + "router_z_loss_clip": 3.68164062, + "router_z_loss_mlp": 0.55029297, + "step": 1439, + "time_per_iteration": 2.822282075881958 + }, + { + "auxiliary_loss_clip": 0.01546718, + "auxiliary_loss_mlp": 0.00702491, + "balance_loss_clip": 1.17904437, + "balance_loss_mlp": 0.65616584, + "epoch": 0.08657748384187584, + "flos": 22637368857600.0, + "grad_norm": 72.2508745101841, + "language_loss": 0.94360763, + "learning_rate": 3.966516320742077e-06, + "loss": 0.96609974, + "num_input_tokens_seen": 30746155, + "router_z_loss_clip": 3.68164062, + "router_z_loss_mlp": 0.46386719, + "step": 1440, + "time_per_iteration": 2.797118663787842 + }, + { + "auxiliary_loss_clip": 0.01574159, + "auxiliary_loss_mlp": 0.00632237, + "balance_loss_clip": 1.19959438, + "balance_loss_mlp": 0.58345616, + "epoch": 0.08663760709454381, + "flos": 23658380951040.0, + "grad_norm": 44.01009224805087, + "language_loss": 0.90897381, + "learning_rate": 3.9664453165387124e-06, + "loss": 0.93103784, + "num_input_tokens_seen": 30761410, + "router_z_loss_clip": 3.74414062, + "router_z_loss_mlp": 0.48803711, + "step": 1441, + "time_per_iteration": 2.7048444747924805 + }, + { + "auxiliary_loss_clip": 0.01562712, + "auxiliary_loss_mlp": 0.00339397, + "balance_loss_clip": 1.32322001, + "balance_loss_mlp": 0.3208003, + "epoch": 0.08669773034721179, + "flos": 62686564911360.0, + "grad_norm": 0.8251012677919664, + "language_loss": 0.60425186, + "learning_rate": 3.966374237767545e-06, + "loss": 0.62327296, + "num_input_tokens_seen": 30823010, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.18554688, + "step": 1442, + "time_per_iteration": 3.2543179988861084 + }, + { + "auxiliary_loss_clip": 0.01569746, + "auxiliary_loss_mlp": 0.00547648, + "balance_loss_clip": 1.2011044, + "balance_loss_mlp": 0.50513756, + "epoch": 0.08675785359987975, + "flos": 20667489137280.0, + "grad_norm": 2.473713864904694, + "language_loss": 0.90083337, + "learning_rate": 3.96630308443127e-06, + "loss": 0.92200732, + "num_input_tokens_seen": 30841980, + "router_z_loss_clip": 3.6875, + "router_z_loss_mlp": 0.42504883, + "step": 1443, + "time_per_iteration": 2.5958163738250732 + }, + { + "auxiliary_loss_clip": 0.0158384, + "auxiliary_loss_mlp": 0.00475403, + "balance_loss_clip": 1.22192049, + "balance_loss_mlp": 0.43525285, + "epoch": 0.08681797685254772, + "flos": 26941118768640.0, + "grad_norm": 15.460596296428054, + "language_loss": 0.89236176, + "learning_rate": 3.966231856532584e-06, + "loss": 0.91295421, + "num_input_tokens_seen": 30863280, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 0.40136719, + "step": 1444, + "time_per_iteration": 2.720736026763916 + }, + { + "auxiliary_loss_clip": 0.01576157, + "auxiliary_loss_mlp": 0.00517931, + "balance_loss_clip": 1.20932555, + "balance_loss_mlp": 0.47430074, + "epoch": 0.0868781001052157, + "flos": 17712831168000.0, + "grad_norm": 51.26186637250275, + "language_loss": 0.94343007, + "learning_rate": 3.966160554074189e-06, + "loss": 0.96437091, + "num_input_tokens_seen": 30881710, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 0.43603516, + "step": 1445, + "time_per_iteration": 2.586970090866089 + }, + { + "auxiliary_loss_clip": 0.01578771, + "auxiliary_loss_mlp": 0.00451015, + "balance_loss_clip": 1.21820855, + "balance_loss_mlp": 0.41165262, + "epoch": 0.08693822335788366, + "flos": 19896522595200.0, + "grad_norm": 3.941530536675627, + "language_loss": 0.91298515, + "learning_rate": 3.96608917705879e-06, + "loss": 0.93328303, + "num_input_tokens_seen": 30900225, + "router_z_loss_clip": 3.60742188, + "router_z_loss_mlp": 0.39355469, + "step": 1446, + "time_per_iteration": 2.6339218616485596 + }, + { + "auxiliary_loss_clip": 0.01581026, + "auxiliary_loss_mlp": 0.00407267, + "balance_loss_clip": 1.32373238, + "balance_loss_mlp": 0.3909595, + "epoch": 0.08699834661055163, + "flos": 67023747406080.0, + "grad_norm": 0.7625732369971318, + "language_loss": 0.5475384, + "learning_rate": 3.966017725489091e-06, + "loss": 0.56742132, + "num_input_tokens_seen": 30959580, + "router_z_loss_clip": 2.578125, + "router_z_loss_mlp": 0.16308594, + "step": 1447, + "time_per_iteration": 3.190419912338257 + }, + { + "auxiliary_loss_clip": 0.01581667, + "auxiliary_loss_mlp": 0.00423207, + "balance_loss_clip": 1.2257477, + "balance_loss_mlp": 0.38227063, + "epoch": 0.0870584698632196, + "flos": 13480507451520.0, + "grad_norm": 25.514377247628772, + "language_loss": 0.91820055, + "learning_rate": 3.965946199367804e-06, + "loss": 0.93824923, + "num_input_tokens_seen": 30976775, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 0.40942383, + "step": 1448, + "time_per_iteration": 2.696981191635132 + }, + { + "auxiliary_loss_clip": 0.0157822, + "auxiliary_loss_mlp": 0.00456349, + "balance_loss_clip": 1.21800101, + "balance_loss_mlp": 0.41245613, + "epoch": 0.08711859311588757, + "flos": 16107013745280.0, + "grad_norm": 9.401953666086182, + "language_loss": 0.91845506, + "learning_rate": 3.965874598697638e-06, + "loss": 0.93880081, + "num_input_tokens_seen": 30990495, + "router_z_loss_clip": 3.60351562, + "router_z_loss_mlp": 0.4387207, + "step": 1449, + "time_per_iteration": 2.571150302886963 + }, + { + "auxiliary_loss_clip": 0.01604003, + "auxiliary_loss_mlp": 0.00407474, + "balance_loss_clip": 1.25340414, + "balance_loss_mlp": 0.3684206, + "epoch": 0.08717871636855554, + "flos": 38472357928320.0, + "grad_norm": 35.20826794401027, + "language_loss": 0.78185683, + "learning_rate": 3.965802923481313e-06, + "loss": 0.80197155, + "num_input_tokens_seen": 31014080, + "router_z_loss_clip": 3.50585938, + "router_z_loss_mlp": 0.39038086, + "step": 1450, + "time_per_iteration": 2.772542715072632 + }, + { + "auxiliary_loss_clip": 0.01581481, + "auxiliary_loss_mlp": 0.00406654, + "balance_loss_clip": 1.22777724, + "balance_loss_mlp": 0.3656458, + "epoch": 0.0872388396212235, + "flos": 17600574188160.0, + "grad_norm": 27.311863243292056, + "language_loss": 0.89296091, + "learning_rate": 3.965731173721542e-06, + "loss": 0.91284221, + "num_input_tokens_seen": 31031210, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 0.41015625, + "step": 1451, + "time_per_iteration": 2.7174925804138184 + }, + { + "auxiliary_loss_clip": 0.01579928, + "auxiliary_loss_mlp": 0.00345203, + "balance_loss_clip": 1.23532534, + "balance_loss_mlp": 0.30989337, + "epoch": 0.08729896287389148, + "flos": 25259385951360.0, + "grad_norm": 4.645548795457372, + "language_loss": 0.80492789, + "learning_rate": 3.965659349421049e-06, + "loss": 0.82417917, + "num_input_tokens_seen": 31049710, + "router_z_loss_clip": 3.44726562, + "router_z_loss_mlp": 0.35302734, + "step": 1452, + "time_per_iteration": 2.705221652984619 + }, + { + "auxiliary_loss_clip": 0.0159542, + "auxiliary_loss_mlp": 0.00406579, + "balance_loss_clip": 1.23951316, + "balance_loss_mlp": 0.36325788, + "epoch": 0.08735908612655945, + "flos": 15632454234240.0, + "grad_norm": 7.6860130390421855, + "language_loss": 0.91024005, + "learning_rate": 3.965587450582556e-06, + "loss": 0.93026006, + "num_input_tokens_seen": 31066160, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 0.43334961, + "step": 1453, + "time_per_iteration": 5.48798680305481 + }, + { + "auxiliary_loss_clip": 0.01592778, + "auxiliary_loss_mlp": 0.00339243, + "balance_loss_clip": 1.24938381, + "balance_loss_mlp": 0.30491054, + "epoch": 0.08741920937922741, + "flos": 20339660684160.0, + "grad_norm": 4.6883716046616035, + "language_loss": 0.79788315, + "learning_rate": 3.96551547720879e-06, + "loss": 0.8172034, + "num_input_tokens_seen": 31085270, + "router_z_loss_clip": 3.43164062, + "router_z_loss_mlp": 0.34326172, + "step": 1454, + "time_per_iteration": 2.71899676322937 + }, + { + "auxiliary_loss_clip": 0.01458203, + "auxiliary_loss_mlp": 0.0021438, + "balance_loss_clip": 1.21420407, + "balance_loss_mlp": 0.19940762, + "epoch": 0.08747933263189539, + "flos": 62819795433600.0, + "grad_norm": 0.7527579905524507, + "language_loss": 0.58253145, + "learning_rate": 3.96544342930248e-06, + "loss": 0.59925723, + "num_input_tokens_seen": 31148445, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.14941406, + "step": 1455, + "time_per_iteration": 3.1643145084381104 + }, + { + "auxiliary_loss_clip": 0.01583321, + "auxiliary_loss_mlp": 0.00348986, + "balance_loss_clip": 1.23727179, + "balance_loss_mlp": 0.31043416, + "epoch": 0.08753945588456336, + "flos": 33035877648000.0, + "grad_norm": 1.987938260369758, + "language_loss": 0.83422053, + "learning_rate": 3.965371306866359e-06, + "loss": 0.85354364, + "num_input_tokens_seen": 31168770, + "router_z_loss_clip": 3.45703125, + "router_z_loss_mlp": 0.38549805, + "step": 1456, + "time_per_iteration": 2.7493720054626465 + }, + { + "auxiliary_loss_clip": 0.01595597, + "auxiliary_loss_mlp": 0.00382875, + "balance_loss_clip": 1.24868667, + "balance_loss_mlp": 0.34344065, + "epoch": 0.08759957913723132, + "flos": 35547182046720.0, + "grad_norm": 5.613722847915186, + "language_loss": 0.79174274, + "learning_rate": 3.96529910990316e-06, + "loss": 0.81152743, + "num_input_tokens_seen": 31189270, + "router_z_loss_clip": 3.46875, + "router_z_loss_mlp": 0.39453125, + "step": 1457, + "time_per_iteration": 4.139661550521851 + }, + { + "auxiliary_loss_clip": 0.01594869, + "auxiliary_loss_mlp": 0.00392641, + "balance_loss_clip": 1.2512207, + "balance_loss_mlp": 0.35325384, + "epoch": 0.0876597023898993, + "flos": 23911120022400.0, + "grad_norm": 6.599538582868752, + "language_loss": 0.91490519, + "learning_rate": 3.965226838415622e-06, + "loss": 0.9347803, + "num_input_tokens_seen": 31210385, + "router_z_loss_clip": 3.43359375, + "router_z_loss_mlp": 0.39379883, + "step": 1458, + "time_per_iteration": 2.6788337230682373 + }, + { + "auxiliary_loss_clip": 0.01591275, + "auxiliary_loss_mlp": 0.00389736, + "balance_loss_clip": 1.25290251, + "balance_loss_mlp": 0.3504447, + "epoch": 0.08771982564256726, + "flos": 18114025150080.0, + "grad_norm": 6.235376130127792, + "language_loss": 0.87044156, + "learning_rate": 3.965154492406486e-06, + "loss": 0.89025164, + "num_input_tokens_seen": 31229745, + "router_z_loss_clip": 3.38476562, + "router_z_loss_mlp": 0.39282227, + "step": 1459, + "time_per_iteration": 4.084290027618408 + }, + { + "auxiliary_loss_clip": 0.01589365, + "auxiliary_loss_mlp": 0.00425835, + "balance_loss_clip": 1.24368417, + "balance_loss_mlp": 0.38115507, + "epoch": 0.08777994889523523, + "flos": 17712005155200.0, + "grad_norm": 5.4607513263334715, + "language_loss": 0.91788113, + "learning_rate": 3.9650820718784945e-06, + "loss": 0.9380331, + "num_input_tokens_seen": 31248280, + "router_z_loss_clip": 3.45898438, + "router_z_loss_mlp": 0.44677734, + "step": 1460, + "time_per_iteration": 2.717517614364624 + }, + { + "auxiliary_loss_clip": 0.01585435, + "auxiliary_loss_mlp": 0.00405065, + "balance_loss_clip": 1.2388885, + "balance_loss_mlp": 0.36610729, + "epoch": 0.0878400721479032, + "flos": 12819930382080.0, + "grad_norm": 9.69070630027994, + "language_loss": 0.88599145, + "learning_rate": 3.965009576834394e-06, + "loss": 0.90589654, + "num_input_tokens_seen": 31262190, + "router_z_loss_clip": 3.46679688, + "router_z_loss_mlp": 0.38964844, + "step": 1461, + "time_per_iteration": 2.557520866394043 + }, + { + "auxiliary_loss_clip": 0.01598293, + "auxiliary_loss_mlp": 0.0040446, + "balance_loss_clip": 1.24687982, + "balance_loss_mlp": 0.3632139, + "epoch": 0.08790019540057117, + "flos": 26392690938240.0, + "grad_norm": 3.3952066529514253, + "language_loss": 0.80208695, + "learning_rate": 3.964937007276932e-06, + "loss": 0.82211447, + "num_input_tokens_seen": 31283690, + "router_z_loss_clip": 3.51757812, + "router_z_loss_mlp": 0.41235352, + "step": 1462, + "time_per_iteration": 2.6768417358398438 + }, + { + "auxiliary_loss_clip": 0.01616374, + "auxiliary_loss_mlp": 0.00419098, + "balance_loss_clip": 1.25791562, + "balance_loss_mlp": 0.3736077, + "epoch": 0.08796031865323914, + "flos": 19134031662720.0, + "grad_norm": 11.47304298145174, + "language_loss": 0.81623977, + "learning_rate": 3.9648643632088634e-06, + "loss": 0.83659452, + "num_input_tokens_seen": 31302505, + "router_z_loss_clip": 3.58398438, + "router_z_loss_mlp": 0.45507812, + "step": 1463, + "time_per_iteration": 2.662208080291748 + }, + { + "auxiliary_loss_clip": 0.01605593, + "auxiliary_loss_mlp": 0.00420011, + "balance_loss_clip": 1.23618209, + "balance_loss_mlp": 0.37337613, + "epoch": 0.0880204419059071, + "flos": 26064287867520.0, + "grad_norm": 7.011016933900526, + "language_loss": 0.89989328, + "learning_rate": 3.964791644632941e-06, + "loss": 0.92014933, + "num_input_tokens_seen": 31323070, + "router_z_loss_clip": 3.69335938, + "router_z_loss_mlp": 0.46704102, + "step": 1464, + "time_per_iteration": 2.7780041694641113 + }, + { + "auxiliary_loss_clip": 0.01615123, + "auxiliary_loss_mlp": 0.00417952, + "balance_loss_clip": 1.25952578, + "balance_loss_mlp": 0.37372512, + "epoch": 0.08808056515857508, + "flos": 22377842115840.0, + "grad_norm": 3.4124844519847404, + "language_loss": 0.85021961, + "learning_rate": 3.964718851551923e-06, + "loss": 0.87055039, + "num_input_tokens_seen": 31341880, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 0.44213867, + "step": 1465, + "time_per_iteration": 2.6788649559020996 + }, + { + "auxiliary_loss_clip": 0.01625608, + "auxiliary_loss_mlp": 0.00377846, + "balance_loss_clip": 1.2641933, + "balance_loss_mlp": 0.33574146, + "epoch": 0.08814068841124305, + "flos": 23185293897600.0, + "grad_norm": 6.35475347199947, + "language_loss": 0.90895057, + "learning_rate": 3.9646459839685675e-06, + "loss": 0.92898506, + "num_input_tokens_seen": 31361995, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 0.42089844, + "step": 1466, + "time_per_iteration": 2.6616625785827637 + }, + { + "auxiliary_loss_clip": 0.01624978, + "auxiliary_loss_mlp": 0.00377253, + "balance_loss_clip": 1.26290488, + "balance_loss_mlp": 0.33784205, + "epoch": 0.08820081166391101, + "flos": 25155281358720.0, + "grad_norm": 242.78436003807883, + "language_loss": 0.92217487, + "learning_rate": 3.964573041885641e-06, + "loss": 0.9421972, + "num_input_tokens_seen": 31381515, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 0.39428711, + "step": 1467, + "time_per_iteration": 2.6709775924682617 + }, + { + "auxiliary_loss_clip": 0.01629565, + "auxiliary_loss_mlp": 0.00347062, + "balance_loss_clip": 1.26833236, + "balance_loss_mlp": 0.30898675, + "epoch": 0.08826093491657899, + "flos": 22231685675520.0, + "grad_norm": 459.563549449036, + "language_loss": 0.82179737, + "learning_rate": 3.964500025305907e-06, + "loss": 0.84156358, + "num_input_tokens_seen": 31400345, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 0.38085938, + "step": 1468, + "time_per_iteration": 2.6433968544006348 + }, + { + "auxiliary_loss_clip": 0.01628283, + "auxiliary_loss_mlp": 0.00307358, + "balance_loss_clip": 1.26499581, + "balance_loss_mlp": 0.27183408, + "epoch": 0.08832105816924696, + "flos": 22126826897280.0, + "grad_norm": 11.19509994202962, + "language_loss": 0.8544035, + "learning_rate": 3.9644269342321355e-06, + "loss": 0.87375993, + "num_input_tokens_seen": 31419620, + "router_z_loss_clip": 3.6328125, + "router_z_loss_mlp": 0.35498047, + "step": 1469, + "time_per_iteration": 2.625145673751831 + }, + { + "auxiliary_loss_clip": 0.0159723, + "auxiliary_loss_mlp": 0.00348327, + "balance_loss_clip": 1.23295629, + "balance_loss_mlp": 0.31056118, + "epoch": 0.08838118142191492, + "flos": 17566495159680.0, + "grad_norm": 9.565210086393156, + "language_loss": 0.86388314, + "learning_rate": 3.9643537686670974e-06, + "loss": 0.88333869, + "num_input_tokens_seen": 31437970, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 0.37768555, + "step": 1470, + "time_per_iteration": 2.5956976413726807 + }, + { + "auxiliary_loss_clip": 0.01605832, + "auxiliary_loss_mlp": 0.00362015, + "balance_loss_clip": 1.23745584, + "balance_loss_mlp": 0.32196051, + "epoch": 0.0884413046745829, + "flos": 20777196251520.0, + "grad_norm": 24.248940037979935, + "language_loss": 0.90666354, + "learning_rate": 3.964280528613569e-06, + "loss": 0.92634195, + "num_input_tokens_seen": 31457040, + "router_z_loss_clip": 3.68554688, + "router_z_loss_mlp": 0.40039062, + "step": 1471, + "time_per_iteration": 2.734083890914917 + }, + { + "auxiliary_loss_clip": 0.01583128, + "auxiliary_loss_mlp": 0.00300949, + "balance_loss_clip": 1.22102046, + "balance_loss_mlp": 0.2661404, + "epoch": 0.08850142792725087, + "flos": 22125462180480.0, + "grad_norm": 1.865924153659744, + "language_loss": 0.8799125, + "learning_rate": 3.964207214074324e-06, + "loss": 0.89875329, + "num_input_tokens_seen": 31477520, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 0.34814453, + "step": 1472, + "time_per_iteration": 2.660503387451172 + }, + { + "auxiliary_loss_clip": 0.01568122, + "auxiliary_loss_mlp": 0.00337664, + "balance_loss_clip": 1.20431137, + "balance_loss_mlp": 0.29701388, + "epoch": 0.08856155117991883, + "flos": 22418744728320.0, + "grad_norm": 995.3046801656517, + "language_loss": 0.92719686, + "learning_rate": 3.964133825052146e-06, + "loss": 0.94625479, + "num_input_tokens_seen": 31495575, + "router_z_loss_clip": 3.640625, + "router_z_loss_mlp": 0.40649414, + "step": 1473, + "time_per_iteration": 2.6887805461883545 + }, + { + "auxiliary_loss_clip": 0.01569712, + "auxiliary_loss_mlp": 0.0033602, + "balance_loss_clip": 1.20492148, + "balance_loss_mlp": 0.29782534, + "epoch": 0.0886216744325868, + "flos": 29937002572800.0, + "grad_norm": 5.858810318885965, + "language_loss": 0.83431828, + "learning_rate": 3.964060361549816e-06, + "loss": 0.85337555, + "num_input_tokens_seen": 31520020, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 0.38208008, + "step": 1474, + "time_per_iteration": 2.7197086811065674 + }, + { + "auxiliary_loss_clip": 0.01561453, + "auxiliary_loss_mlp": 0.0034727, + "balance_loss_clip": 1.19673407, + "balance_loss_mlp": 0.30671513, + "epoch": 0.08868179768525478, + "flos": 23982833525760.0, + "grad_norm": 25.952968433194332, + "language_loss": 0.85512388, + "learning_rate": 3.963986823570121e-06, + "loss": 0.87421119, + "num_input_tokens_seen": 31539265, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 0.40527344, + "step": 1475, + "time_per_iteration": 2.6799991130828857 + }, + { + "auxiliary_loss_clip": 0.01567188, + "auxiliary_loss_mlp": 0.00342718, + "balance_loss_clip": 1.19416332, + "balance_loss_mlp": 0.30008855, + "epoch": 0.08874192093792274, + "flos": 43177553216640.0, + "grad_norm": 19.96664790518494, + "language_loss": 0.79734194, + "learning_rate": 3.963913211115848e-06, + "loss": 0.816441, + "num_input_tokens_seen": 31563425, + "router_z_loss_clip": 3.72851562, + "router_z_loss_mlp": 0.42578125, + "step": 1476, + "time_per_iteration": 2.8424692153930664 + }, + { + "auxiliary_loss_clip": 0.01571038, + "auxiliary_loss_mlp": 0.00337963, + "balance_loss_clip": 1.20601892, + "balance_loss_mlp": 0.29931554, + "epoch": 0.0888020441905907, + "flos": 32852445868800.0, + "grad_norm": 49.864469092350475, + "language_loss": 0.81114447, + "learning_rate": 3.9638395241897895e-06, + "loss": 0.83023453, + "num_input_tokens_seen": 31584525, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 0.38696289, + "step": 1477, + "time_per_iteration": 2.837495803833008 + }, + { + "auxiliary_loss_clip": 0.01559138, + "auxiliary_loss_mlp": 0.00354624, + "balance_loss_clip": 1.18700182, + "balance_loss_mlp": 0.31249577, + "epoch": 0.08886216744325869, + "flos": 23149347361920.0, + "grad_norm": 2.130205359540668, + "language_loss": 0.94125962, + "learning_rate": 3.963765762794739e-06, + "loss": 0.96039724, + "num_input_tokens_seen": 31603325, + "router_z_loss_clip": 3.72265625, + "router_z_loss_mlp": 0.42089844, + "step": 1478, + "time_per_iteration": 2.713108777999878 + }, + { + "auxiliary_loss_clip": 0.01547915, + "auxiliary_loss_mlp": 0.00358919, + "balance_loss_clip": 1.17697155, + "balance_loss_mlp": 0.32163057, + "epoch": 0.08892229069592665, + "flos": 23331593992320.0, + "grad_norm": 2.1952947899098225, + "language_loss": 0.82691729, + "learning_rate": 3.963691926933495e-06, + "loss": 0.84598565, + "num_input_tokens_seen": 31624820, + "router_z_loss_clip": 3.70898438, + "router_z_loss_mlp": 0.37255859, + "step": 1479, + "time_per_iteration": 2.7240498065948486 + }, + { + "auxiliary_loss_clip": 0.01562655, + "auxiliary_loss_mlp": 0.00339286, + "balance_loss_clip": 1.19045353, + "balance_loss_mlp": 0.29610798, + "epoch": 0.08898241394859462, + "flos": 26213784272640.0, + "grad_norm": 3.9987522432269453, + "language_loss": 0.85996449, + "learning_rate": 3.9636180166088555e-06, + "loss": 0.87898386, + "num_input_tokens_seen": 31646080, + "router_z_loss_clip": 3.72460938, + "router_z_loss_mlp": 0.43188477, + "step": 1480, + "time_per_iteration": 2.693326711654663 + }, + { + "auxiliary_loss_clip": 0.01577628, + "auxiliary_loss_mlp": 0.00339982, + "balance_loss_clip": 1.20067549, + "balance_loss_mlp": 0.29422992, + "epoch": 0.0890425372012626, + "flos": 23550613171200.0, + "grad_norm": 121.29383422049143, + "language_loss": 0.75861752, + "learning_rate": 3.963544031823624e-06, + "loss": 0.77779365, + "num_input_tokens_seen": 31665770, + "router_z_loss_clip": 3.76953125, + "router_z_loss_mlp": 0.45800781, + "step": 1481, + "time_per_iteration": 2.693154811859131 + }, + { + "auxiliary_loss_clip": 0.01564301, + "auxiliary_loss_mlp": 0.00303104, + "balance_loss_clip": 1.20162559, + "balance_loss_mlp": 0.26746073, + "epoch": 0.08910266045393056, + "flos": 23002795872000.0, + "grad_norm": 4616.726736567042, + "language_loss": 1.02828932, + "learning_rate": 3.9634699725806065e-06, + "loss": 1.04696345, + "num_input_tokens_seen": 31683805, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 0.35644531, + "step": 1482, + "time_per_iteration": 2.661295175552368 + }, + { + "auxiliary_loss_clip": 0.01567796, + "auxiliary_loss_mlp": 0.00334588, + "balance_loss_clip": 1.19999325, + "balance_loss_mlp": 0.29482016, + "epoch": 0.08916278370659853, + "flos": 31936508035200.0, + "grad_norm": 11.300069846144542, + "language_loss": 0.85775232, + "learning_rate": 3.96339583888261e-06, + "loss": 0.87677616, + "num_input_tokens_seen": 31704630, + "router_z_loss_clip": 3.68359375, + "router_z_loss_mlp": 0.39770508, + "step": 1483, + "time_per_iteration": 2.729128837585449 + }, + { + "auxiliary_loss_clip": 0.01580363, + "auxiliary_loss_mlp": 0.00331863, + "balance_loss_clip": 1.20794618, + "balance_loss_mlp": 0.29002064, + "epoch": 0.08922290695926649, + "flos": 17530404969600.0, + "grad_norm": 7.879587425760029, + "language_loss": 0.94488662, + "learning_rate": 3.963321630732448e-06, + "loss": 0.96400881, + "num_input_tokens_seen": 31723255, + "router_z_loss_clip": 3.72460938, + "router_z_loss_mlp": 0.41870117, + "step": 1484, + "time_per_iteration": 2.641637086868286 + }, + { + "auxiliary_loss_clip": 0.01574408, + "auxiliary_loss_mlp": 0.00310707, + "balance_loss_clip": 1.203022, + "balance_loss_mlp": 0.27294153, + "epoch": 0.08928303021193447, + "flos": 32125075459200.0, + "grad_norm": 20.467661454623748, + "language_loss": 0.86243308, + "learning_rate": 3.963247348132932e-06, + "loss": 0.8812843, + "num_input_tokens_seen": 31747045, + "router_z_loss_clip": 3.71289062, + "router_z_loss_mlp": 0.37768555, + "step": 1485, + "time_per_iteration": 2.7051448822021484 + }, + { + "auxiliary_loss_clip": 0.01572866, + "auxiliary_loss_mlp": 0.00303224, + "balance_loss_clip": 1.20184731, + "balance_loss_mlp": 0.26059484, + "epoch": 0.08934315346460243, + "flos": 22125210785280.0, + "grad_norm": 3.355033484221029, + "language_loss": 0.88766825, + "learning_rate": 3.96317299108688e-06, + "loss": 0.90642917, + "num_input_tokens_seen": 31766615, + "router_z_loss_clip": 3.703125, + "router_z_loss_mlp": 0.42651367, + "step": 1486, + "time_per_iteration": 2.6984832286834717 + }, + { + "auxiliary_loss_clip": 0.01568498, + "auxiliary_loss_mlp": 0.00288281, + "balance_loss_clip": 1.20885468, + "balance_loss_mlp": 0.24929953, + "epoch": 0.0894032767172704, + "flos": 22565583527040.0, + "grad_norm": 1.7244405284250541, + "language_loss": 0.81739998, + "learning_rate": 3.963098559597111e-06, + "loss": 0.83596778, + "num_input_tokens_seen": 31785855, + "router_z_loss_clip": 3.59570312, + "router_z_loss_mlp": 0.38964844, + "step": 1487, + "time_per_iteration": 2.689114809036255 + }, + { + "auxiliary_loss_clip": 0.01574206, + "auxiliary_loss_mlp": 0.00279755, + "balance_loss_clip": 1.21371031, + "balance_loss_mlp": 0.24089323, + "epoch": 0.08946339996993838, + "flos": 20193396503040.0, + "grad_norm": 4.543520103385851, + "language_loss": 0.91147989, + "learning_rate": 3.963024053666449e-06, + "loss": 0.93001944, + "num_input_tokens_seen": 31804210, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 0.38891602, + "step": 1488, + "time_per_iteration": 2.6531174182891846 + }, + { + "auxiliary_loss_clip": 0.01577609, + "auxiliary_loss_mlp": 0.00296511, + "balance_loss_clip": 1.21665323, + "balance_loss_mlp": 0.25869787, + "epoch": 0.08952352322260634, + "flos": 48360181104000.0, + "grad_norm": 2.8942813101283535, + "language_loss": 0.8072167, + "learning_rate": 3.962949473297718e-06, + "loss": 0.82595789, + "num_input_tokens_seen": 31826150, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 0.37817383, + "step": 1489, + "time_per_iteration": 2.851170539855957 + }, + { + "auxiliary_loss_clip": 0.01590006, + "auxiliary_loss_mlp": 0.00285174, + "balance_loss_clip": 1.23171496, + "balance_loss_mlp": 0.24483365, + "epoch": 0.08958364647527431, + "flos": 31793081028480.0, + "grad_norm": 7.273568883768564, + "language_loss": 0.96688199, + "learning_rate": 3.962874818493745e-06, + "loss": 0.98563373, + "num_input_tokens_seen": 31848060, + "router_z_loss_clip": 3.58789062, + "router_z_loss_mlp": 0.40332031, + "step": 1490, + "time_per_iteration": 2.752030372619629 + }, + { + "auxiliary_loss_clip": 0.01582263, + "auxiliary_loss_mlp": 0.00312939, + "balance_loss_clip": 1.2222966, + "balance_loss_mlp": 0.27402878, + "epoch": 0.08964376972794229, + "flos": 23368186972800.0, + "grad_norm": 35.86472995713647, + "language_loss": 0.82015389, + "learning_rate": 3.9628000892573635e-06, + "loss": 0.83910584, + "num_input_tokens_seen": 31870040, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 0.38916016, + "step": 1491, + "time_per_iteration": 2.686924934387207 + }, + { + "auxiliary_loss_clip": 0.01615572, + "auxiliary_loss_mlp": 0.00271151, + "balance_loss_clip": 1.26250625, + "balance_loss_mlp": 0.23381504, + "epoch": 0.08970389298061025, + "flos": 23294785530240.0, + "grad_norm": 2.712787447824314, + "language_loss": 0.84336603, + "learning_rate": 3.9627252855914055e-06, + "loss": 0.86223328, + "num_input_tokens_seen": 31890400, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 0.37329102, + "step": 1492, + "time_per_iteration": 2.6261963844299316 + }, + { + "auxiliary_loss_clip": 0.01616953, + "auxiliary_loss_mlp": 0.00314642, + "balance_loss_clip": 1.25760388, + "balance_loss_mlp": 0.2733717, + "epoch": 0.08976401623327822, + "flos": 33761703772800.0, + "grad_norm": 6.083717042201034, + "language_loss": 0.78103268, + "learning_rate": 3.962650407498707e-06, + "loss": 0.80034858, + "num_input_tokens_seen": 31913435, + "router_z_loss_clip": 3.58984375, + "router_z_loss_mlp": 0.41308594, + "step": 1493, + "time_per_iteration": 2.780771493911743 + }, + { + "auxiliary_loss_clip": 0.01656158, + "auxiliary_loss_mlp": 0.0032934, + "balance_loss_clip": 1.30003917, + "balance_loss_mlp": 0.28785479, + "epoch": 0.08982413948594618, + "flos": 23911335504000.0, + "grad_norm": 6.488057531062658, + "language_loss": 0.9253701, + "learning_rate": 3.962575454982109e-06, + "loss": 0.94522512, + "num_input_tokens_seen": 31932435, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 0.41503906, + "step": 1494, + "time_per_iteration": 2.7523915767669678 + }, + { + "auxiliary_loss_clip": 0.01671075, + "auxiliary_loss_mlp": 0.00332138, + "balance_loss_clip": 1.30932033, + "balance_loss_mlp": 0.29589862, + "epoch": 0.08988426273861416, + "flos": 16837544551680.0, + "grad_norm": 2.6123256035253926, + "language_loss": 0.89621484, + "learning_rate": 3.962500428044454e-06, + "loss": 0.91624701, + "num_input_tokens_seen": 31950125, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 0.36206055, + "step": 1495, + "time_per_iteration": 4.064457893371582 + }, + { + "auxiliary_loss_clip": 0.01702574, + "auxiliary_loss_mlp": 0.00368731, + "balance_loss_clip": 1.32747948, + "balance_loss_mlp": 0.32922471, + "epoch": 0.08994438599128213, + "flos": 14793365548800.0, + "grad_norm": 175.7652820081294, + "language_loss": 0.81182015, + "learning_rate": 3.962425326688585e-06, + "loss": 0.83253324, + "num_input_tokens_seen": 31968050, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 0.39501953, + "step": 1496, + "time_per_iteration": 4.0558555126190186 + }, + { + "auxiliary_loss_clip": 0.01726287, + "auxiliary_loss_mlp": 0.00360627, + "balance_loss_clip": 1.34267735, + "balance_loss_mlp": 0.32190776, + "epoch": 0.09000450924395009, + "flos": 17384320356480.0, + "grad_norm": 26.57186645173995, + "language_loss": 0.8609606, + "learning_rate": 3.962350150917351e-06, + "loss": 0.88182974, + "num_input_tokens_seen": 31985675, + "router_z_loss_clip": 3.8359375, + "router_z_loss_mlp": 0.38745117, + "step": 1497, + "time_per_iteration": 2.579988479614258 + }, + { + "auxiliary_loss_clip": 0.01725993, + "auxiliary_loss_mlp": 0.00400293, + "balance_loss_clip": 1.33496189, + "balance_loss_mlp": 0.34950966, + "epoch": 0.09006463249661807, + "flos": 24280317964800.0, + "grad_norm": 52.490545434077504, + "language_loss": 0.89416736, + "learning_rate": 3.9622749007336035e-06, + "loss": 0.91543025, + "num_input_tokens_seen": 32005180, + "router_z_loss_clip": 3.91015625, + "router_z_loss_mlp": 0.50756836, + "step": 1498, + "time_per_iteration": 2.6758151054382324 + }, + { + "auxiliary_loss_clip": 0.01759891, + "auxiliary_loss_mlp": 0.00374046, + "balance_loss_clip": 1.37216735, + "balance_loss_mlp": 0.33086801, + "epoch": 0.09012475574928604, + "flos": 13661928069120.0, + "grad_norm": 3.952955149616601, + "language_loss": 0.87263429, + "learning_rate": 3.962199576140195e-06, + "loss": 0.89397365, + "num_input_tokens_seen": 32022970, + "router_z_loss_clip": 3.87304688, + "router_z_loss_mlp": 0.43188477, + "step": 1499, + "time_per_iteration": 4.077389478683472 + }, + { + "auxiliary_loss_clip": 0.01773848, + "auxiliary_loss_mlp": 0.00372398, + "balance_loss_clip": 1.38102996, + "balance_loss_mlp": 0.32902998, + "epoch": 0.090184879001954, + "flos": 23327751237120.0, + "grad_norm": 7.215151261617818, + "language_loss": 0.9849751, + "learning_rate": 3.962124177139981e-06, + "loss": 1.00643754, + "num_input_tokens_seen": 32043055, + "router_z_loss_clip": 3.92773438, + "router_z_loss_mlp": 0.43383789, + "step": 1500, + "time_per_iteration": 2.6519615650177 + }, + { + "auxiliary_loss_clip": 0.01782728, + "auxiliary_loss_mlp": 0.00424648, + "balance_loss_clip": 1.36813414, + "balance_loss_mlp": 0.37312621, + "epoch": 0.09024500225462198, + "flos": 23002688131200.0, + "grad_norm": 35.5678122782809, + "language_loss": 0.8169477, + "learning_rate": 3.962048703735822e-06, + "loss": 0.83902144, + "num_input_tokens_seen": 32061900, + "router_z_loss_clip": 4.14453125, + "router_z_loss_mlp": 0.51513672, + "step": 1501, + "time_per_iteration": 2.6926915645599365 + }, + { + "auxiliary_loss_clip": 0.01675956, + "auxiliary_loss_mlp": 0.00110863, + "balance_loss_clip": 1.38541698, + "balance_loss_mlp": 0.09045473, + "epoch": 0.09030512550728995, + "flos": 62189203242240.0, + "grad_norm": 0.8179451123860338, + "language_loss": 0.58089972, + "learning_rate": 3.96197315593058e-06, + "loss": 0.59876794, + "num_input_tokens_seen": 32122745, + "router_z_loss_clip": 2.90625, + "router_z_loss_mlp": 0.20410156, + "step": 1502, + "time_per_iteration": 4.5393595695495605 + }, + { + "auxiliary_loss_clip": 0.01816271, + "auxiliary_loss_mlp": 0.00410913, + "balance_loss_clip": 1.39728427, + "balance_loss_mlp": 0.36673456, + "epoch": 0.09036524875995791, + "flos": 38800689171840.0, + "grad_norm": 3.3062648926156704, + "language_loss": 0.7925148, + "learning_rate": 3.961897533727119e-06, + "loss": 0.81478655, + "num_input_tokens_seen": 32145125, + "router_z_loss_clip": 4.19140625, + "router_z_loss_mlp": 0.44189453, + "step": 1503, + "time_per_iteration": 2.825369358062744 + }, + { + "auxiliary_loss_clip": 0.0183539, + "auxiliary_loss_mlp": 0.00409794, + "balance_loss_clip": 1.40867186, + "balance_loss_mlp": 0.36304, + "epoch": 0.09042537201262588, + "flos": 21690081429120.0, + "grad_norm": 4.26176357684905, + "language_loss": 0.92152816, + "learning_rate": 3.961821837128306e-06, + "loss": 0.94398004, + "num_input_tokens_seen": 32166255, + "router_z_loss_clip": 4.265625, + "router_z_loss_mlp": 0.46728516, + "step": 1504, + "time_per_iteration": 2.6720006465911865 + }, + { + "auxiliary_loss_clip": 0.01830553, + "auxiliary_loss_mlp": 0.00450672, + "balance_loss_clip": 1.40082777, + "balance_loss_mlp": 0.40048489, + "epoch": 0.09048549526529386, + "flos": 22267021680000.0, + "grad_norm": 200.18885387245447, + "language_loss": 0.80855888, + "learning_rate": 3.961746066137014e-06, + "loss": 0.83137119, + "num_input_tokens_seen": 32184010, + "router_z_loss_clip": 4.296875, + "router_z_loss_mlp": 0.50219727, + "step": 1505, + "time_per_iteration": 2.5953216552734375 + }, + { + "auxiliary_loss_clip": 0.01792472, + "auxiliary_loss_mlp": 0.00455161, + "balance_loss_clip": 1.37919021, + "balance_loss_mlp": 0.40912265, + "epoch": 0.09054561851796182, + "flos": 14610939350400.0, + "grad_norm": 12.295315050612254, + "language_loss": 0.91298413, + "learning_rate": 3.961670220756114e-06, + "loss": 0.93546045, + "num_input_tokens_seen": 32201635, + "router_z_loss_clip": 4.12890625, + "router_z_loss_mlp": 0.46020508, + "step": 1506, + "time_per_iteration": 2.6337547302246094 + }, + { + "auxiliary_loss_clip": 0.01809916, + "auxiliary_loss_mlp": 0.00417883, + "balance_loss_clip": 1.39665592, + "balance_loss_mlp": 0.3760885, + "epoch": 0.09060574177062979, + "flos": 27636169916160.0, + "grad_norm": 2.468973553449486, + "language_loss": 0.84710658, + "learning_rate": 3.961594300988482e-06, + "loss": 0.86938465, + "num_input_tokens_seen": 32221940, + "router_z_loss_clip": 4.12304688, + "router_z_loss_mlp": 0.41821289, + "step": 1507, + "time_per_iteration": 2.6579830646514893 + }, + { + "auxiliary_loss_clip": 0.01730123, + "auxiliary_loss_mlp": 0.0014326, + "balance_loss_clip": 1.43544436, + "balance_loss_mlp": 0.12456796, + "epoch": 0.09066586502329776, + "flos": 66085797513600.0, + "grad_norm": 0.7533481289751136, + "language_loss": 0.57411951, + "learning_rate": 3.961518306836998e-06, + "loss": 0.59285331, + "num_input_tokens_seen": 32276495, + "router_z_loss_clip": 2.9375, + "router_z_loss_mlp": 0.18652344, + "step": 1508, + "time_per_iteration": 3.042431592941284 + }, + { + "auxiliary_loss_clip": 0.01792571, + "auxiliary_loss_mlp": 0.00449326, + "balance_loss_clip": 1.37518454, + "balance_loss_mlp": 0.40724522, + "epoch": 0.09072598827596573, + "flos": 18916449027840.0, + "grad_norm": 10.80070953798028, + "language_loss": 0.91211253, + "learning_rate": 3.961442238304543e-06, + "loss": 0.93453151, + "num_input_tokens_seen": 32294130, + "router_z_loss_clip": 4.17578125, + "router_z_loss_mlp": 0.42138672, + "step": 1509, + "time_per_iteration": 2.6387927532196045 + }, + { + "auxiliary_loss_clip": 0.01801171, + "auxiliary_loss_mlp": 0.00455922, + "balance_loss_clip": 1.37930453, + "balance_loss_mlp": 0.41069421, + "epoch": 0.0907861115286337, + "flos": 24821742643200.0, + "grad_norm": 34.337139873825734, + "language_loss": 0.93186909, + "learning_rate": 3.961366095394002e-06, + "loss": 0.95444006, + "num_input_tokens_seen": 32313555, + "router_z_loss_clip": 4.21484375, + "router_z_loss_mlp": 0.4519043, + "step": 1510, + "time_per_iteration": 2.6705751419067383 + }, + { + "auxiliary_loss_clip": 0.01799356, + "auxiliary_loss_mlp": 0.00483077, + "balance_loss_clip": 1.38159049, + "balance_loss_mlp": 0.44013757, + "epoch": 0.09084623478130167, + "flos": 21652842003840.0, + "grad_norm": 4.373573658731089, + "language_loss": 0.93755358, + "learning_rate": 3.961289878108262e-06, + "loss": 0.96037793, + "num_input_tokens_seen": 32331430, + "router_z_loss_clip": 4.17578125, + "router_z_loss_mlp": 0.42895508, + "step": 1511, + "time_per_iteration": 2.6582579612731934 + }, + { + "auxiliary_loss_clip": 0.01785887, + "auxiliary_loss_mlp": 0.0048177, + "balance_loss_clip": 1.36791825, + "balance_loss_mlp": 0.4371618, + "epoch": 0.09090635803396964, + "flos": 27639258485760.0, + "grad_norm": 2.832251317020543, + "language_loss": 0.89805901, + "learning_rate": 3.9612135864502135e-06, + "loss": 0.92073554, + "num_input_tokens_seen": 32353705, + "router_z_loss_clip": 4.1796875, + "router_z_loss_mlp": 0.44628906, + "step": 1512, + "time_per_iteration": 2.6895296573638916 + }, + { + "auxiliary_loss_clip": 0.0180561, + "auxiliary_loss_mlp": 0.00437455, + "balance_loss_clip": 1.39308035, + "balance_loss_mlp": 0.394611, + "epoch": 0.0909664812866376, + "flos": 17669127294720.0, + "grad_norm": 186.96223430803812, + "language_loss": 0.94236714, + "learning_rate": 3.961137220422749e-06, + "loss": 0.96479774, + "num_input_tokens_seen": 32370520, + "router_z_loss_clip": 4.12890625, + "router_z_loss_mlp": 0.42797852, + "step": 1513, + "time_per_iteration": 2.629394769668579 + }, + { + "auxiliary_loss_clip": 0.01779137, + "auxiliary_loss_mlp": 0.00497217, + "balance_loss_clip": 1.35778809, + "balance_loss_mlp": 0.45587537, + "epoch": 0.09102660453930557, + "flos": 23951448017280.0, + "grad_norm": 263.76130801197286, + "language_loss": 0.92893934, + "learning_rate": 3.961060780028764e-06, + "loss": 0.95170289, + "num_input_tokens_seen": 32389105, + "router_z_loss_clip": 4.21484375, + "router_z_loss_mlp": 0.41357422, + "step": 1514, + "time_per_iteration": 2.731889009475708 + }, + { + "auxiliary_loss_clip": 0.01761006, + "auxiliary_loss_mlp": 0.00442167, + "balance_loss_clip": 1.35008144, + "balance_loss_mlp": 0.40072942, + "epoch": 0.09108672779197355, + "flos": 25812949426560.0, + "grad_norm": 12.330847545217834, + "language_loss": 0.95575231, + "learning_rate": 3.960984265271159e-06, + "loss": 0.97778404, + "num_input_tokens_seen": 32408065, + "router_z_loss_clip": 4.11132812, + "router_z_loss_mlp": 0.41430664, + "step": 1515, + "time_per_iteration": 2.8291573524475098 + }, + { + "auxiliary_loss_clip": 0.01759858, + "auxiliary_loss_mlp": 0.00524765, + "balance_loss_clip": 1.34196186, + "balance_loss_mlp": 0.47817805, + "epoch": 0.09114685104464151, + "flos": 29639482220160.0, + "grad_norm": 342.3887823075521, + "language_loss": 0.92929971, + "learning_rate": 3.9609076761528335e-06, + "loss": 0.95214593, + "num_input_tokens_seen": 32427225, + "router_z_loss_clip": 4.17578125, + "router_z_loss_mlp": 0.46582031, + "step": 1516, + "time_per_iteration": 2.7146244049072266 + }, + { + "auxiliary_loss_clip": 0.01728076, + "auxiliary_loss_mlp": 0.00471359, + "balance_loss_clip": 1.32002854, + "balance_loss_mlp": 0.42799073, + "epoch": 0.09120697429730948, + "flos": 33729635905920.0, + "grad_norm": 8.730817973817866, + "language_loss": 0.85792905, + "learning_rate": 3.960831012676692e-06, + "loss": 0.8799234, + "num_input_tokens_seen": 32450510, + "router_z_loss_clip": 4.08398438, + "router_z_loss_mlp": 0.43359375, + "step": 1517, + "time_per_iteration": 2.7968058586120605 + }, + { + "auxiliary_loss_clip": 0.01734379, + "auxiliary_loss_mlp": 0.00537622, + "balance_loss_clip": 1.31959426, + "balance_loss_mlp": 0.49113011, + "epoch": 0.09126709754997746, + "flos": 18401381953920.0, + "grad_norm": 6.455216979365299, + "language_loss": 0.84012085, + "learning_rate": 3.960754274845642e-06, + "loss": 0.86284089, + "num_input_tokens_seen": 32468425, + "router_z_loss_clip": 4.15234375, + "router_z_loss_mlp": 0.46484375, + "step": 1518, + "time_per_iteration": 2.6377646923065186 + }, + { + "auxiliary_loss_clip": 0.01702512, + "auxiliary_loss_mlp": 0.00512369, + "balance_loss_clip": 1.29633868, + "balance_loss_mlp": 0.46761751, + "epoch": 0.09132722080264542, + "flos": 22091957769600.0, + "grad_norm": 4.481546695648526, + "language_loss": 0.94656217, + "learning_rate": 3.960677462662594e-06, + "loss": 0.96871096, + "num_input_tokens_seen": 32487510, + "router_z_loss_clip": 4.05664062, + "router_z_loss_mlp": 0.44750977, + "step": 1519, + "time_per_iteration": 2.7401487827301025 + }, + { + "auxiliary_loss_clip": 0.01709386, + "auxiliary_loss_mlp": 0.00514006, + "balance_loss_clip": 1.29701912, + "balance_loss_mlp": 0.46813428, + "epoch": 0.09138734405531339, + "flos": 21033131633280.0, + "grad_norm": 17.587547218197223, + "language_loss": 0.81025362, + "learning_rate": 3.96060057613046e-06, + "loss": 0.83248752, + "num_input_tokens_seen": 32507250, + "router_z_loss_clip": 4.125, + "router_z_loss_mlp": 0.45898438, + "step": 1520, + "time_per_iteration": 2.641139268875122 + }, + { + "auxiliary_loss_clip": 0.01675844, + "auxiliary_loss_mlp": 0.00533127, + "balance_loss_clip": 1.27065396, + "balance_loss_mlp": 0.48427501, + "epoch": 0.09144746730798137, + "flos": 20083940784000.0, + "grad_norm": 371.9197341888047, + "language_loss": 0.95201653, + "learning_rate": 3.960523615252156e-06, + "loss": 0.97410619, + "num_input_tokens_seen": 32526045, + "router_z_loss_clip": 4.05273438, + "router_z_loss_mlp": 0.48803711, + "step": 1521, + "time_per_iteration": 2.713355541229248 + }, + { + "auxiliary_loss_clip": 0.01669803, + "auxiliary_loss_mlp": 0.00542446, + "balance_loss_clip": 1.2612685, + "balance_loss_mlp": 0.49585882, + "epoch": 0.09150759056064933, + "flos": 22778210085120.0, + "grad_norm": 7.659337601814755, + "language_loss": 0.9075892, + "learning_rate": 3.960446580030599e-06, + "loss": 0.92971164, + "num_input_tokens_seen": 32546575, + "router_z_loss_clip": 4.08789062, + "router_z_loss_mlp": 0.46582031, + "step": 1522, + "time_per_iteration": 2.680223226547241 + }, + { + "auxiliary_loss_clip": 0.01675039, + "auxiliary_loss_mlp": 0.0054828, + "balance_loss_clip": 1.2633152, + "balance_loss_mlp": 0.50238472, + "epoch": 0.0915677138133173, + "flos": 27564205017600.0, + "grad_norm": 40.88806898134373, + "language_loss": 0.8713423, + "learning_rate": 3.960369470468711e-06, + "loss": 0.89357555, + "num_input_tokens_seen": 32568795, + "router_z_loss_clip": 4.12890625, + "router_z_loss_mlp": 0.45874023, + "step": 1523, + "time_per_iteration": 2.7535557746887207 + }, + { + "auxiliary_loss_clip": 0.01697219, + "auxiliary_loss_mlp": 0.00535494, + "balance_loss_clip": 1.28180909, + "balance_loss_mlp": 0.49057603, + "epoch": 0.09162783706598528, + "flos": 17674765729920.0, + "grad_norm": 180.54293566040218, + "language_loss": 0.81970835, + "learning_rate": 3.960292286569418e-06, + "loss": 0.84203541, + "num_input_tokens_seen": 32587010, + "router_z_loss_clip": 4.16015625, + "router_z_loss_mlp": 0.44946289, + "step": 1524, + "time_per_iteration": 2.6478545665740967 + }, + { + "auxiliary_loss_clip": 0.01699438, + "auxiliary_loss_mlp": 0.00542212, + "balance_loss_clip": 1.29479074, + "balance_loss_mlp": 0.49681699, + "epoch": 0.09168796031865324, + "flos": 18478195188480.0, + "grad_norm": 47.74393968052814, + "language_loss": 0.94465744, + "learning_rate": 3.960215028335644e-06, + "loss": 0.96707398, + "num_input_tokens_seen": 32602375, + "router_z_loss_clip": 4.04296875, + "router_z_loss_mlp": 0.45410156, + "step": 1525, + "time_per_iteration": 2.611694574356079 + }, + { + "auxiliary_loss_clip": 0.01682704, + "auxiliary_loss_mlp": 0.00602508, + "balance_loss_clip": 1.27430856, + "balance_loss_mlp": 0.5536803, + "epoch": 0.0917480835713212, + "flos": 29387605075200.0, + "grad_norm": 17.444547939193942, + "language_loss": 0.8180362, + "learning_rate": 3.96013769577032e-06, + "loss": 0.84088832, + "num_input_tokens_seen": 32621460, + "router_z_loss_clip": 4.078125, + "router_z_loss_mlp": 0.48828125, + "step": 1526, + "time_per_iteration": 2.7148869037628174 + }, + { + "auxiliary_loss_clip": 0.01644479, + "auxiliary_loss_mlp": 0.00599545, + "balance_loss_clip": 1.23871422, + "balance_loss_mlp": 0.54904819, + "epoch": 0.09180820682398917, + "flos": 19829262378240.0, + "grad_norm": 7.337035611845298, + "language_loss": 0.8390677, + "learning_rate": 3.960060288876378e-06, + "loss": 0.86150795, + "num_input_tokens_seen": 32640440, + "router_z_loss_clip": 4.05859375, + "router_z_loss_mlp": 0.50415039, + "step": 1527, + "time_per_iteration": 2.6500484943389893 + }, + { + "auxiliary_loss_clip": 0.01642783, + "auxiliary_loss_mlp": 0.00656761, + "balance_loss_clip": 1.23855948, + "balance_loss_mlp": 0.60719419, + "epoch": 0.09186833007665715, + "flos": 23841848643840.0, + "grad_norm": 4.703988269676352, + "language_loss": 0.87835938, + "learning_rate": 3.959982807656753e-06, + "loss": 0.90135479, + "num_input_tokens_seen": 32660020, + "router_z_loss_clip": 4.04296875, + "router_z_loss_mlp": 0.49609375, + "step": 1528, + "time_per_iteration": 2.768157958984375 + }, + { + "auxiliary_loss_clip": 0.01654742, + "auxiliary_loss_mlp": 0.00693401, + "balance_loss_clip": 1.23855722, + "balance_loss_mlp": 0.63660967, + "epoch": 0.09192845332932512, + "flos": 12932726065920.0, + "grad_norm": 32.63815762770739, + "language_loss": 0.86390901, + "learning_rate": 3.959905252114384e-06, + "loss": 0.88739043, + "num_input_tokens_seen": 32678170, + "router_z_loss_clip": 4.1640625, + "router_z_loss_mlp": 0.56835938, + "step": 1529, + "time_per_iteration": 2.6292948722839355 + }, + { + "auxiliary_loss_clip": 0.01656884, + "auxiliary_loss_mlp": 0.0064774, + "balance_loss_clip": 1.24080634, + "balance_loss_mlp": 0.59390533, + "epoch": 0.09198857658199308, + "flos": 24568177559040.0, + "grad_norm": 88.35197366551327, + "language_loss": 0.873698, + "learning_rate": 3.959827622252211e-06, + "loss": 0.89674419, + "num_input_tokens_seen": 32697540, + "router_z_loss_clip": 4.15625, + "router_z_loss_mlp": 0.53833008, + "step": 1530, + "time_per_iteration": 2.6976447105407715 + }, + { + "auxiliary_loss_clip": 0.01636188, + "auxiliary_loss_mlp": 0.0063498, + "balance_loss_clip": 1.2365458, + "balance_loss_mlp": 0.58579385, + "epoch": 0.09204869983466106, + "flos": 20266941600000.0, + "grad_norm": 314.41043627810313, + "language_loss": 0.90581191, + "learning_rate": 3.959749918073179e-06, + "loss": 0.92852366, + "num_input_tokens_seen": 32716805, + "router_z_loss_clip": 3.99804688, + "router_z_loss_mlp": 0.4921875, + "step": 1531, + "time_per_iteration": 2.651965618133545 + }, + { + "auxiliary_loss_clip": 0.01652492, + "auxiliary_loss_mlp": 0.00679882, + "balance_loss_clip": 1.24188805, + "balance_loss_mlp": 0.62969446, + "epoch": 0.09210882308732903, + "flos": 20885646389760.0, + "grad_norm": 62.946539908423745, + "language_loss": 0.86835289, + "learning_rate": 3.959672139580233e-06, + "loss": 0.8916766, + "num_input_tokens_seen": 32736385, + "router_z_loss_clip": 4.10742188, + "router_z_loss_mlp": 0.5012207, + "step": 1532, + "time_per_iteration": 2.651139736175537 + }, + { + "auxiliary_loss_clip": 0.01633058, + "auxiliary_loss_mlp": 0.00613392, + "balance_loss_clip": 1.22748733, + "balance_loss_mlp": 0.56604242, + "epoch": 0.09216894633999699, + "flos": 30956326727040.0, + "grad_norm": 75.54649267875463, + "language_loss": 0.90606219, + "learning_rate": 3.9595942867763235e-06, + "loss": 0.9285267, + "num_input_tokens_seen": 32757140, + "router_z_loss_clip": 4.05859375, + "router_z_loss_mlp": 0.47338867, + "step": 1533, + "time_per_iteration": 2.776776075363159 + }, + { + "auxiliary_loss_clip": 0.01655275, + "auxiliary_loss_mlp": 0.00661887, + "balance_loss_clip": 1.24497104, + "balance_loss_mlp": 0.61243916, + "epoch": 0.09222906959266497, + "flos": 13151565676800.0, + "grad_norm": 16.055698880649206, + "language_loss": 0.97866738, + "learning_rate": 3.959516359664402e-06, + "loss": 1.00183892, + "num_input_tokens_seen": 32774860, + "router_z_loss_clip": 4.10546875, + "router_z_loss_mlp": 0.49462891, + "step": 1534, + "time_per_iteration": 2.681025266647339 + }, + { + "auxiliary_loss_clip": 0.01659167, + "auxiliary_loss_mlp": 0.00706021, + "balance_loss_clip": 1.24455714, + "balance_loss_mlp": 0.65602505, + "epoch": 0.09228919284533293, + "flos": 25994477784960.0, + "grad_norm": 4.398359631916404, + "language_loss": 0.83252072, + "learning_rate": 3.959438358247424e-06, + "loss": 0.85617262, + "num_input_tokens_seen": 32795250, + "router_z_loss_clip": 4.14453125, + "router_z_loss_mlp": 0.5, + "step": 1535, + "time_per_iteration": 2.6962666511535645 + }, + { + "auxiliary_loss_clip": 0.01639613, + "auxiliary_loss_mlp": 0.00653107, + "balance_loss_clip": 1.23045468, + "balance_loss_mlp": 0.6034447, + "epoch": 0.0923493160980009, + "flos": 18660800954880.0, + "grad_norm": 11.297827703421754, + "language_loss": 0.86581314, + "learning_rate": 3.959360282528346e-06, + "loss": 0.88874042, + "num_input_tokens_seen": 32813805, + "router_z_loss_clip": 4.08984375, + "router_z_loss_mlp": 0.49658203, + "step": 1536, + "time_per_iteration": 2.664775848388672 + }, + { + "auxiliary_loss_clip": 0.01662177, + "auxiliary_loss_mlp": 0.00701084, + "balance_loss_clip": 1.24532378, + "balance_loss_mlp": 0.65113533, + "epoch": 0.09240943935066886, + "flos": 21140576190720.0, + "grad_norm": 18.306664001246855, + "language_loss": 0.95891291, + "learning_rate": 3.959282132510131e-06, + "loss": 0.9825455, + "num_input_tokens_seen": 32830960, + "router_z_loss_clip": 4.171875, + "router_z_loss_mlp": 0.49926758, + "step": 1537, + "time_per_iteration": 4.145832777023315 + }, + { + "auxiliary_loss_clip": 0.01658308, + "auxiliary_loss_mlp": 0.00634429, + "balance_loss_clip": 1.24047887, + "balance_loss_mlp": 0.58478999, + "epoch": 0.09246956260333684, + "flos": 20592435669120.0, + "grad_norm": 92.64681478904845, + "language_loss": 0.87369573, + "learning_rate": 3.959203908195741e-06, + "loss": 0.89662313, + "num_input_tokens_seen": 32848275, + "router_z_loss_clip": 4.17382812, + "router_z_loss_mlp": 0.49682617, + "step": 1538, + "time_per_iteration": 4.041836738586426 + }, + { + "auxiliary_loss_clip": 0.01565601, + "auxiliary_loss_mlp": 0.00313331, + "balance_loss_clip": 1.25927317, + "balance_loss_mlp": 0.28996646, + "epoch": 0.09252968585600481, + "flos": 67558710614400.0, + "grad_norm": 2.2845906255469743, + "language_loss": 0.57442313, + "learning_rate": 3.959125609588142e-06, + "loss": 0.59321243, + "num_input_tokens_seen": 32917730, + "router_z_loss_clip": 3.0625, + "router_z_loss_mlp": 0.23339844, + "step": 1539, + "time_per_iteration": 3.273526430130005 + }, + { + "auxiliary_loss_clip": 0.01628467, + "auxiliary_loss_mlp": 0.00656435, + "balance_loss_clip": 1.21253383, + "balance_loss_mlp": 0.60770178, + "epoch": 0.09258980910867277, + "flos": 17383853479680.0, + "grad_norm": 8.888998114133742, + "language_loss": 0.79205966, + "learning_rate": 3.959047236690304e-06, + "loss": 0.81490862, + "num_input_tokens_seen": 32934910, + "router_z_loss_clip": 4.15625, + "router_z_loss_mlp": 0.48754883, + "step": 1540, + "time_per_iteration": 2.6476352214813232 + }, + { + "auxiliary_loss_clip": 0.01629932, + "auxiliary_loss_mlp": 0.00602607, + "balance_loss_clip": 1.21026552, + "balance_loss_mlp": 0.55413616, + "epoch": 0.09264993236134075, + "flos": 19865927185920.0, + "grad_norm": 34.944447123019785, + "language_loss": 0.87133998, + "learning_rate": 3.958968789505198e-06, + "loss": 0.89366537, + "num_input_tokens_seen": 32953840, + "router_z_loss_clip": 4.19726562, + "router_z_loss_mlp": 0.48461914, + "step": 1541, + "time_per_iteration": 4.154497861862183 + }, + { + "auxiliary_loss_clip": 0.01482631, + "auxiliary_loss_mlp": 0.00291397, + "balance_loss_clip": 1.19269955, + "balance_loss_mlp": 0.2718465, + "epoch": 0.09271005561400872, + "flos": 62284401262080.0, + "grad_norm": 0.9151811691425403, + "language_loss": 0.62236893, + "learning_rate": 3.9588902680358e-06, + "loss": 0.64010918, + "num_input_tokens_seen": 33011410, + "router_z_loss_clip": 2.90625, + "router_z_loss_mlp": 0.1953125, + "step": 1542, + "time_per_iteration": 3.132094144821167 + }, + { + "auxiliary_loss_clip": 0.01625259, + "auxiliary_loss_mlp": 0.00630527, + "balance_loss_clip": 1.20626616, + "balance_loss_mlp": 0.57893336, + "epoch": 0.09277017886667668, + "flos": 23329870139520.0, + "grad_norm": 8.859439562296323, + "language_loss": 0.87736726, + "learning_rate": 3.958811672285086e-06, + "loss": 0.89992511, + "num_input_tokens_seen": 33031675, + "router_z_loss_clip": 4.1875, + "router_z_loss_mlp": 0.51611328, + "step": 1543, + "time_per_iteration": 2.6965577602386475 + }, + { + "auxiliary_loss_clip": 0.01606999, + "auxiliary_loss_mlp": 0.00647212, + "balance_loss_clip": 1.19383538, + "balance_loss_mlp": 0.59771681, + "epoch": 0.09283030211934466, + "flos": 54745169875200.0, + "grad_norm": 16.406634187051356, + "language_loss": 0.77680957, + "learning_rate": 3.958733002256038e-06, + "loss": 0.79935169, + "num_input_tokens_seen": 33056355, + "router_z_loss_clip": 4.13671875, + "router_z_loss_mlp": 0.49438477, + "step": 1544, + "time_per_iteration": 4.575198411941528 + }, + { + "auxiliary_loss_clip": 0.01628789, + "auxiliary_loss_mlp": 0.00619671, + "balance_loss_clip": 1.20869875, + "balance_loss_mlp": 0.56977046, + "epoch": 0.09289042537201263, + "flos": 30334784762880.0, + "grad_norm": 7.921700233245406, + "language_loss": 0.81350303, + "learning_rate": 3.958654257951637e-06, + "loss": 0.83598763, + "num_input_tokens_seen": 33079520, + "router_z_loss_clip": 4.203125, + "router_z_loss_mlp": 0.49853516, + "step": 1545, + "time_per_iteration": 2.7150626182556152 + }, + { + "auxiliary_loss_clip": 0.01602438, + "auxiliary_loss_mlp": 0.00590741, + "balance_loss_clip": 1.20135772, + "balance_loss_mlp": 0.54203242, + "epoch": 0.09295054862468059, + "flos": 17746838369280.0, + "grad_norm": 71.81283007322989, + "language_loss": 0.86549926, + "learning_rate": 3.9585754393748706e-06, + "loss": 0.88743103, + "num_input_tokens_seen": 33096135, + "router_z_loss_clip": 4.01757812, + "router_z_loss_mlp": 0.48730469, + "step": 1546, + "time_per_iteration": 2.6454825401306152 + }, + { + "auxiliary_loss_clip": 0.01619346, + "auxiliary_loss_mlp": 0.00621272, + "balance_loss_clip": 1.2106967, + "balance_loss_mlp": 0.5710851, + "epoch": 0.09301067187734856, + "flos": 23658021815040.0, + "grad_norm": 3.593977388948039, + "language_loss": 0.89920056, + "learning_rate": 3.9584965465287275e-06, + "loss": 0.92160666, + "num_input_tokens_seen": 33115245, + "router_z_loss_clip": 4.0859375, + "router_z_loss_mlp": 0.50219727, + "step": 1547, + "time_per_iteration": 2.631695032119751 + }, + { + "auxiliary_loss_clip": 0.01589727, + "auxiliary_loss_mlp": 0.00604264, + "balance_loss_clip": 1.1914252, + "balance_loss_mlp": 0.55481565, + "epoch": 0.09307079513001654, + "flos": 27527719777920.0, + "grad_norm": 635.9342212026423, + "language_loss": 0.76053756, + "learning_rate": 3.958417579416199e-06, + "loss": 0.78247744, + "num_input_tokens_seen": 33136640, + "router_z_loss_clip": 3.984375, + "router_z_loss_mlp": 0.49487305, + "step": 1548, + "time_per_iteration": 2.699474811553955 + }, + { + "auxiliary_loss_clip": 0.01607152, + "auxiliary_loss_mlp": 0.00569635, + "balance_loss_clip": 1.19668519, + "balance_loss_mlp": 0.5242399, + "epoch": 0.0931309183826845, + "flos": 20627340710400.0, + "grad_norm": 13.33747818995187, + "language_loss": 0.89217889, + "learning_rate": 3.9583385380402795e-06, + "loss": 0.91394675, + "num_input_tokens_seen": 33155060, + "router_z_loss_clip": 4.1015625, + "router_z_loss_mlp": 0.45410156, + "step": 1549, + "time_per_iteration": 2.651573896408081 + }, + { + "auxiliary_loss_clip": 0.0159192, + "auxiliary_loss_mlp": 0.00550111, + "balance_loss_clip": 1.19058537, + "balance_loss_mlp": 0.50199813, + "epoch": 0.09319104163535247, + "flos": 29020921084800.0, + "grad_norm": 10.880066456355165, + "language_loss": 0.82360852, + "learning_rate": 3.958259422403966e-06, + "loss": 0.84502882, + "num_input_tokens_seen": 33175420, + "router_z_loss_clip": 4.01171875, + "router_z_loss_mlp": 0.48144531, + "step": 1550, + "time_per_iteration": 2.749711275100708 + }, + { + "auxiliary_loss_clip": 0.01589303, + "auxiliary_loss_mlp": 0.00545529, + "balance_loss_clip": 1.19195426, + "balance_loss_mlp": 0.49887088, + "epoch": 0.09325116488802045, + "flos": 25301545539840.0, + "grad_norm": 27.730445852207158, + "language_loss": 0.90889001, + "learning_rate": 3.95818023251026e-06, + "loss": 0.93023831, + "num_input_tokens_seen": 33194120, + "router_z_loss_clip": 3.96679688, + "router_z_loss_mlp": 0.46655273, + "step": 1551, + "time_per_iteration": 2.7047722339630127 + }, + { + "auxiliary_loss_clip": 0.01495563, + "auxiliary_loss_mlp": 0.002173, + "balance_loss_clip": 1.23523831, + "balance_loss_mlp": 0.20204151, + "epoch": 0.09331128814068841, + "flos": 61536203942400.0, + "grad_norm": 0.7452644886119041, + "language_loss": 0.61796463, + "learning_rate": 3.958100968362163e-06, + "loss": 0.63509321, + "num_input_tokens_seen": 33261080, + "router_z_loss_clip": 2.59375, + "router_z_loss_mlp": 0.15234375, + "step": 1552, + "time_per_iteration": 3.268558979034424 + }, + { + "auxiliary_loss_clip": 0.01513505, + "auxiliary_loss_mlp": 0.00211826, + "balance_loss_clip": 1.24946463, + "balance_loss_mlp": 0.19570903, + "epoch": 0.09337141139335638, + "flos": 53293700171520.0, + "grad_norm": 0.8049983232334689, + "language_loss": 0.59318334, + "learning_rate": 3.958021629962681e-06, + "loss": 0.61043668, + "num_input_tokens_seen": 33330235, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.16113281, + "step": 1553, + "time_per_iteration": 3.2934842109680176 + }, + { + "auxiliary_loss_clip": 0.01600356, + "auxiliary_loss_mlp": 0.00551251, + "balance_loss_clip": 1.18251085, + "balance_loss_mlp": 0.50330538, + "epoch": 0.09343153464602436, + "flos": 23476852592640.0, + "grad_norm": 54.272013237657355, + "language_loss": 0.93816429, + "learning_rate": 3.957942217314823e-06, + "loss": 0.95968044, + "num_input_tokens_seen": 33349035, + "router_z_loss_clip": 4.17773438, + "router_z_loss_mlp": 0.47973633, + "step": 1554, + "time_per_iteration": 2.648390293121338 + }, + { + "auxiliary_loss_clip": 0.01581516, + "auxiliary_loss_mlp": 0.00490826, + "balance_loss_clip": 1.17801082, + "balance_loss_mlp": 0.44802943, + "epoch": 0.09349165789869232, + "flos": 19353481804800.0, + "grad_norm": 67.96675484368957, + "language_loss": 0.86747026, + "learning_rate": 3.957862730421599e-06, + "loss": 0.88819373, + "num_input_tokens_seen": 33368060, + "router_z_loss_clip": 4.02929688, + "router_z_loss_mlp": 0.42822266, + "step": 1555, + "time_per_iteration": 2.69736385345459 + }, + { + "auxiliary_loss_clip": 0.01492957, + "auxiliary_loss_mlp": 0.00228128, + "balance_loss_clip": 1.23023033, + "balance_loss_mlp": 0.21181993, + "epoch": 0.09355178115136029, + "flos": 67502580635520.0, + "grad_norm": 0.8555562176413279, + "language_loss": 0.60012031, + "learning_rate": 3.957783169286024e-06, + "loss": 0.61733115, + "num_input_tokens_seen": 33430825, + "router_z_loss_clip": 2.625, + "router_z_loss_mlp": 0.16308594, + "step": 1556, + "time_per_iteration": 3.1395955085754395 + }, + { + "auxiliary_loss_clip": 0.01585635, + "auxiliary_loss_mlp": 0.00490798, + "balance_loss_clip": 1.17867541, + "balance_loss_mlp": 0.44788292, + "epoch": 0.09361190440402825, + "flos": 37341638720640.0, + "grad_norm": 182.6493193179392, + "language_loss": 0.90195459, + "learning_rate": 3.9577035339111155e-06, + "loss": 0.92271888, + "num_input_tokens_seen": 33454855, + "router_z_loss_clip": 4.0703125, + "router_z_loss_mlp": 0.42895508, + "step": 1557, + "time_per_iteration": 2.818901777267456 + }, + { + "auxiliary_loss_clip": 0.01555467, + "auxiliary_loss_mlp": 0.00529103, + "balance_loss_clip": 1.15301824, + "balance_loss_mlp": 0.48127645, + "epoch": 0.09367202765669623, + "flos": 24899705112960.0, + "grad_norm": 61.8930760389845, + "language_loss": 0.82229632, + "learning_rate": 3.957623824299893e-06, + "loss": 0.84314203, + "num_input_tokens_seen": 33476000, + "router_z_loss_clip": 4.0234375, + "router_z_loss_mlp": 0.47802734, + "step": 1558, + "time_per_iteration": 2.7207813262939453 + }, + { + "auxiliary_loss_clip": 0.01570947, + "auxiliary_loss_mlp": 0.00499843, + "balance_loss_clip": 1.16724968, + "balance_loss_mlp": 0.45373327, + "epoch": 0.0937321509093642, + "flos": 15705568368000.0, + "grad_norm": 20.696220525003472, + "language_loss": 0.86067146, + "learning_rate": 3.957544040455379e-06, + "loss": 0.88137931, + "num_input_tokens_seen": 33493845, + "router_z_loss_clip": 4.03320312, + "router_z_loss_mlp": 0.46118164, + "step": 1559, + "time_per_iteration": 2.661522150039673 + }, + { + "auxiliary_loss_clip": 0.01564957, + "auxiliary_loss_mlp": 0.00517802, + "balance_loss_clip": 1.16092741, + "balance_loss_mlp": 0.4689739, + "epoch": 0.09379227416203216, + "flos": 20483698222080.0, + "grad_norm": 4.882125126426394, + "language_loss": 0.82246792, + "learning_rate": 3.957464182380599e-06, + "loss": 0.84329557, + "num_input_tokens_seen": 33510850, + "router_z_loss_clip": 4.04101562, + "router_z_loss_mlp": 0.48828125, + "step": 1560, + "time_per_iteration": 2.708096742630005 + }, + { + "auxiliary_loss_clip": 0.01541136, + "auxiliary_loss_mlp": 0.00531468, + "balance_loss_clip": 1.14612556, + "balance_loss_mlp": 0.48209196, + "epoch": 0.09385239741470014, + "flos": 24352498344960.0, + "grad_norm": 816.4133272574475, + "language_loss": 0.85582119, + "learning_rate": 3.95738425007858e-06, + "loss": 0.87654728, + "num_input_tokens_seen": 33530430, + "router_z_loss_clip": 3.94921875, + "router_z_loss_mlp": 0.49389648, + "step": 1561, + "time_per_iteration": 2.6832385063171387 + }, + { + "auxiliary_loss_clip": 0.01533228, + "auxiliary_loss_mlp": 0.00482349, + "balance_loss_clip": 1.14608204, + "balance_loss_mlp": 0.438885, + "epoch": 0.0939125206673681, + "flos": 33291489807360.0, + "grad_norm": 3.420960250163326, + "language_loss": 0.68902409, + "learning_rate": 3.957304243552354e-06, + "loss": 0.70917988, + "num_input_tokens_seen": 33551975, + "router_z_loss_clip": 3.87109375, + "router_z_loss_mlp": 0.43457031, + "step": 1562, + "time_per_iteration": 2.790388584136963 + }, + { + "auxiliary_loss_clip": 0.01506302, + "auxiliary_loss_mlp": 0.00506693, + "balance_loss_clip": 1.13592994, + "balance_loss_mlp": 0.46423072, + "epoch": 0.09397264392003607, + "flos": 19244923925760.0, + "grad_norm": 7.522424022735563, + "language_loss": 0.9164809, + "learning_rate": 3.957224162804956e-06, + "loss": 0.93661082, + "num_input_tokens_seen": 33569850, + "router_z_loss_clip": 3.703125, + "router_z_loss_mlp": 0.42480469, + "step": 1563, + "time_per_iteration": 2.6451759338378906 + }, + { + "auxiliary_loss_clip": 0.01520821, + "auxiliary_loss_mlp": 0.00518274, + "balance_loss_clip": 1.13607562, + "balance_loss_mlp": 0.47099608, + "epoch": 0.09403276717270405, + "flos": 19317930318720.0, + "grad_norm": 12.675444127368275, + "language_loss": 0.82647002, + "learning_rate": 3.9571440078394205e-06, + "loss": 0.84686095, + "num_input_tokens_seen": 33590510, + "router_z_loss_clip": 3.84375, + "router_z_loss_mlp": 0.47290039, + "step": 1564, + "time_per_iteration": 2.781106472015381 + }, + { + "auxiliary_loss_clip": 0.0149551, + "auxiliary_loss_mlp": 0.00518837, + "balance_loss_clip": 1.1335746, + "balance_loss_mlp": 0.47706646, + "epoch": 0.09409289042537201, + "flos": 23583471137280.0, + "grad_norm": 144.45693104976684, + "language_loss": 0.8541857, + "learning_rate": 3.9570637786587895e-06, + "loss": 0.87432915, + "num_input_tokens_seen": 33608810, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 0.41772461, + "step": 1565, + "time_per_iteration": 2.7218289375305176 + }, + { + "auxiliary_loss_clip": 0.01488816, + "auxiliary_loss_mlp": 0.00479026, + "balance_loss_clip": 1.12428498, + "balance_loss_mlp": 0.43708813, + "epoch": 0.09415301367803998, + "flos": 20078446003200.0, + "grad_norm": 31.19361825357456, + "language_loss": 0.84114218, + "learning_rate": 3.956983475266103e-06, + "loss": 0.86082059, + "num_input_tokens_seen": 33627265, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 0.41943359, + "step": 1566, + "time_per_iteration": 2.6999051570892334 + }, + { + "auxiliary_loss_clip": 0.0148521, + "auxiliary_loss_mlp": 0.00480907, + "balance_loss_clip": 1.12038589, + "balance_loss_mlp": 0.4394697, + "epoch": 0.09421313693070796, + "flos": 21062075016960.0, + "grad_norm": 303.8233521217873, + "language_loss": 0.8423934, + "learning_rate": 3.956903097664407e-06, + "loss": 0.86205453, + "num_input_tokens_seen": 33644810, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 0.4140625, + "step": 1567, + "time_per_iteration": 2.6433186531066895 + }, + { + "auxiliary_loss_clip": 0.01485648, + "auxiliary_loss_mlp": 0.00511538, + "balance_loss_clip": 1.12570798, + "balance_loss_mlp": 0.46986234, + "epoch": 0.09427326018337592, + "flos": 24316156759680.0, + "grad_norm": 100.51565541178083, + "language_loss": 0.88190681, + "learning_rate": 3.956822645856749e-06, + "loss": 0.90187865, + "num_input_tokens_seen": 33665665, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 0.41699219, + "step": 1568, + "time_per_iteration": 2.7152044773101807 + }, + { + "auxiliary_loss_clip": 0.0149411, + "auxiliary_loss_mlp": 0.00502308, + "balance_loss_clip": 1.12584877, + "balance_loss_mlp": 0.45655572, + "epoch": 0.09433338343604389, + "flos": 20263888944000.0, + "grad_norm": 20.483043568742048, + "language_loss": 0.81969607, + "learning_rate": 3.9567421198461814e-06, + "loss": 0.83966023, + "num_input_tokens_seen": 33684760, + "router_z_loss_clip": 3.6875, + "router_z_loss_mlp": 0.45727539, + "step": 1569, + "time_per_iteration": 2.6175484657287598 + }, + { + "auxiliary_loss_clip": 0.01466198, + "auxiliary_loss_mlp": 0.00465416, + "balance_loss_clip": 1.11272573, + "balance_loss_mlp": 0.42359719, + "epoch": 0.09439350668871185, + "flos": 12742973493120.0, + "grad_norm": 6.487320274238727, + "language_loss": 0.9258256, + "learning_rate": 3.956661519635756e-06, + "loss": 0.94514173, + "num_input_tokens_seen": 33700750, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 0.41845703, + "step": 1570, + "time_per_iteration": 2.617433786392212 + }, + { + "auxiliary_loss_clip": 0.0147314, + "auxiliary_loss_mlp": 0.00503594, + "balance_loss_clip": 1.1193608, + "balance_loss_mlp": 0.46020186, + "epoch": 0.09445362994137983, + "flos": 25962266263680.0, + "grad_norm": 39.08927261475672, + "language_loss": 0.8249743, + "learning_rate": 3.95658084522853e-06, + "loss": 0.84474164, + "num_input_tokens_seen": 33724430, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 0.43359375, + "step": 1571, + "time_per_iteration": 2.7181520462036133 + }, + { + "auxiliary_loss_clip": 0.01458332, + "auxiliary_loss_mlp": 0.00457139, + "balance_loss_clip": 1.1175611, + "balance_loss_mlp": 0.41718, + "epoch": 0.0945137531940478, + "flos": 19715353372800.0, + "grad_norm": 5.4681301376066545, + "language_loss": 0.84105802, + "learning_rate": 3.956500096627561e-06, + "loss": 0.86021268, + "num_input_tokens_seen": 33743455, + "router_z_loss_clip": 3.40429688, + "router_z_loss_mlp": 0.3996582, + "step": 1572, + "time_per_iteration": 2.618077278137207 + }, + { + "auxiliary_loss_clip": 0.01482239, + "auxiliary_loss_mlp": 0.00513872, + "balance_loss_clip": 1.1227057, + "balance_loss_mlp": 0.46595007, + "epoch": 0.09457387644671576, + "flos": 23617047375360.0, + "grad_norm": 427.41612822045556, + "language_loss": 0.92771113, + "learning_rate": 3.956419273835913e-06, + "loss": 0.94767225, + "num_input_tokens_seen": 33763435, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 0.47998047, + "step": 1573, + "time_per_iteration": 2.696810483932495 + }, + { + "auxiliary_loss_clip": 0.01471767, + "auxiliary_loss_mlp": 0.00448053, + "balance_loss_clip": 1.12313032, + "balance_loss_mlp": 0.40342149, + "epoch": 0.09463399969938374, + "flos": 26907291135360.0, + "grad_norm": 3749.8828682398394, + "language_loss": 0.87966132, + "learning_rate": 3.95633837685665e-06, + "loss": 0.89885956, + "num_input_tokens_seen": 33784325, + "router_z_loss_clip": 3.48242188, + "router_z_loss_mlp": 0.4465332, + "step": 1574, + "time_per_iteration": 2.6723339557647705 + }, + { + "auxiliary_loss_clip": 0.01467599, + "auxiliary_loss_mlp": 0.00465044, + "balance_loss_clip": 1.11964607, + "balance_loss_mlp": 0.42188984, + "epoch": 0.0946941229520517, + "flos": 23659566099840.0, + "grad_norm": 35.68293558082962, + "language_loss": 0.86506128, + "learning_rate": 3.95625740569284e-06, + "loss": 0.88438761, + "num_input_tokens_seen": 33802510, + "router_z_loss_clip": 3.48046875, + "router_z_loss_mlp": 0.43139648, + "step": 1575, + "time_per_iteration": 2.671769142150879 + }, + { + "auxiliary_loss_clip": 0.0145967, + "auxiliary_loss_mlp": 0.00444835, + "balance_loss_clip": 1.1150347, + "balance_loss_mlp": 0.40304002, + "epoch": 0.09475424620471967, + "flos": 24134053783680.0, + "grad_norm": 117.55989651134243, + "language_loss": 0.9370051, + "learning_rate": 3.956176360347553e-06, + "loss": 0.95605016, + "num_input_tokens_seen": 33819980, + "router_z_loss_clip": 3.44726562, + "router_z_loss_mlp": 0.41796875, + "step": 1576, + "time_per_iteration": 2.7179057598114014 + }, + { + "auxiliary_loss_clip": 0.01411443, + "auxiliary_loss_mlp": 0.00172979, + "balance_loss_clip": 1.16829157, + "balance_loss_mlp": 0.15972319, + "epoch": 0.09481436945738765, + "flos": 68426168065920.0, + "grad_norm": 1.0014544854988967, + "language_loss": 0.65732551, + "learning_rate": 3.956095240823862e-06, + "loss": 0.67316973, + "num_input_tokens_seen": 33878925, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.1328125, + "step": 1577, + "time_per_iteration": 3.1825499534606934 + }, + { + "auxiliary_loss_clip": 0.01456388, + "auxiliary_loss_mlp": 0.00414046, + "balance_loss_clip": 1.11309326, + "balance_loss_mlp": 0.37318096, + "epoch": 0.09487449271005562, + "flos": 16654076858880.0, + "grad_norm": 41.863794204615424, + "language_loss": 0.8529796, + "learning_rate": 3.956014047124844e-06, + "loss": 0.87168396, + "num_input_tokens_seen": 33897600, + "router_z_loss_clip": 3.43164062, + "router_z_loss_mlp": 0.40893555, + "step": 1578, + "time_per_iteration": 2.780531167984009 + }, + { + "auxiliary_loss_clip": 0.01466738, + "auxiliary_loss_mlp": 0.00390814, + "balance_loss_clip": 1.12128162, + "balance_loss_mlp": 0.34684959, + "epoch": 0.09493461596272358, + "flos": 24275685110400.0, + "grad_norm": 21.292668316234014, + "language_loss": 0.82940769, + "learning_rate": 3.955932779253578e-06, + "loss": 0.84798312, + "num_input_tokens_seen": 33917365, + "router_z_loss_clip": 3.45703125, + "router_z_loss_mlp": 0.43969727, + "step": 1579, + "time_per_iteration": 4.191045761108398 + }, + { + "auxiliary_loss_clip": 0.01464207, + "auxiliary_loss_mlp": 0.00423602, + "balance_loss_clip": 1.12283206, + "balance_loss_mlp": 0.37780175, + "epoch": 0.09499473921539155, + "flos": 21870173243520.0, + "grad_norm": 6.908559414710728, + "language_loss": 0.79794967, + "learning_rate": 3.955851437213144e-06, + "loss": 0.81682771, + "num_input_tokens_seen": 33936680, + "router_z_loss_clip": 3.41210938, + "router_z_loss_mlp": 0.45776367, + "step": 1580, + "time_per_iteration": 4.097251892089844 + }, + { + "auxiliary_loss_clip": 0.01447281, + "auxiliary_loss_mlp": 0.00380162, + "balance_loss_clip": 1.11353004, + "balance_loss_mlp": 0.3392016, + "epoch": 0.09505486246805953, + "flos": 33547137880320.0, + "grad_norm": 7.320494110706318, + "language_loss": 0.83431184, + "learning_rate": 3.955770021006627e-06, + "loss": 0.85258627, + "num_input_tokens_seen": 33960685, + "router_z_loss_clip": 3.3359375, + "router_z_loss_mlp": 0.40991211, + "step": 1581, + "time_per_iteration": 2.8278415203094482 + }, + { + "auxiliary_loss_clip": 0.01459828, + "auxiliary_loss_mlp": 0.00396564, + "balance_loss_clip": 1.11631131, + "balance_loss_mlp": 0.35484061, + "epoch": 0.09511498572072749, + "flos": 21215342350080.0, + "grad_norm": 5.498188881887209, + "language_loss": 0.94164562, + "learning_rate": 3.955688530637116e-06, + "loss": 0.96020955, + "num_input_tokens_seen": 33980015, + "router_z_loss_clip": 3.4296875, + "router_z_loss_mlp": 0.41748047, + "step": 1582, + "time_per_iteration": 2.626692056655884 + }, + { + "auxiliary_loss_clip": 0.01465287, + "auxiliary_loss_mlp": 0.00400317, + "balance_loss_clip": 1.11928856, + "balance_loss_mlp": 0.35706824, + "epoch": 0.09517510897339546, + "flos": 14611262572800.0, + "grad_norm": 13.814871522144895, + "language_loss": 0.75378948, + "learning_rate": 3.955606966107699e-06, + "loss": 0.77244556, + "num_input_tokens_seen": 33997705, + "router_z_loss_clip": 3.46289062, + "router_z_loss_mlp": 0.43261719, + "step": 1583, + "time_per_iteration": 4.055368423461914 + }, + { + "auxiliary_loss_clip": 0.01463242, + "auxiliary_loss_mlp": 0.00396962, + "balance_loss_clip": 1.12047172, + "balance_loss_mlp": 0.35488129, + "epoch": 0.09523523222606343, + "flos": 27817339138560.0, + "grad_norm": 23.22407884464331, + "language_loss": 0.78388214, + "learning_rate": 3.95552532742147e-06, + "loss": 0.80248421, + "num_input_tokens_seen": 34017465, + "router_z_loss_clip": 3.421875, + "router_z_loss_mlp": 0.42089844, + "step": 1584, + "time_per_iteration": 2.697338581085205 + }, + { + "auxiliary_loss_clip": 0.01452479, + "auxiliary_loss_mlp": 0.00339452, + "balance_loss_clip": 1.11809778, + "balance_loss_mlp": 0.30058995, + "epoch": 0.0952953554787314, + "flos": 20706272847360.0, + "grad_norm": 15.914812722230907, + "language_loss": 0.86134779, + "learning_rate": 3.955443614581525e-06, + "loss": 0.8792671, + "num_input_tokens_seen": 34038550, + "router_z_loss_clip": 3.34179688, + "router_z_loss_mlp": 0.38842773, + "step": 1585, + "time_per_iteration": 2.69438099861145 + }, + { + "auxiliary_loss_clip": 0.01482448, + "auxiliary_loss_mlp": 0.00378674, + "balance_loss_clip": 1.12254703, + "balance_loss_mlp": 0.33406585, + "epoch": 0.09535547873139937, + "flos": 24787627701120.0, + "grad_norm": 109.99262021094316, + "language_loss": 0.7914027, + "learning_rate": 3.955361827590961e-06, + "loss": 0.81001401, + "num_input_tokens_seen": 34058665, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 0.44604492, + "step": 1586, + "time_per_iteration": 4.076338529586792 + }, + { + "auxiliary_loss_clip": 0.01409773, + "auxiliary_loss_mlp": 0.00279769, + "balance_loss_clip": 1.18431008, + "balance_loss_mlp": 0.26431933, + "epoch": 0.09541560198406734, + "flos": 71912194905600.0, + "grad_norm": 0.9105395878185389, + "language_loss": 0.54989076, + "learning_rate": 3.955279966452883e-06, + "loss": 0.56678617, + "num_input_tokens_seen": 34109655, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.15429688, + "step": 1587, + "time_per_iteration": 2.9562792778015137 + }, + { + "auxiliary_loss_clip": 0.01460014, + "auxiliary_loss_mlp": 0.00359325, + "balance_loss_clip": 1.11448169, + "balance_loss_mlp": 0.31631458, + "epoch": 0.09547572523673531, + "flos": 28982604251520.0, + "grad_norm": 10.426169845069333, + "language_loss": 0.86589348, + "learning_rate": 3.955198031170391e-06, + "loss": 0.88408691, + "num_input_tokens_seen": 34131115, + "router_z_loss_clip": 3.45703125, + "router_z_loss_mlp": 0.43017578, + "step": 1588, + "time_per_iteration": 2.7309257984161377 + }, + { + "auxiliary_loss_clip": 0.014602, + "auxiliary_loss_mlp": 0.00339123, + "balance_loss_clip": 1.11950755, + "balance_loss_mlp": 0.29954541, + "epoch": 0.09553584848940327, + "flos": 24133910129280.0, + "grad_norm": 15.35885723515526, + "language_loss": 0.86943209, + "learning_rate": 3.955116021746594e-06, + "loss": 0.8874253, + "num_input_tokens_seen": 34151925, + "router_z_loss_clip": 3.40820312, + "router_z_loss_mlp": 0.39599609, + "step": 1589, + "time_per_iteration": 2.7566306591033936 + }, + { + "auxiliary_loss_clip": 0.01464313, + "auxiliary_loss_mlp": 0.00370477, + "balance_loss_clip": 1.11987281, + "balance_loss_mlp": 0.32803842, + "epoch": 0.09559597174207124, + "flos": 42851376789120.0, + "grad_norm": 20.92898834993177, + "language_loss": 0.70740473, + "learning_rate": 3.955033938184601e-06, + "loss": 0.72575271, + "num_input_tokens_seen": 34175395, + "router_z_loss_clip": 3.44140625, + "router_z_loss_mlp": 0.42456055, + "step": 1590, + "time_per_iteration": 2.8768157958984375 + }, + { + "auxiliary_loss_clip": 0.01465584, + "auxiliary_loss_mlp": 0.00383055, + "balance_loss_clip": 1.12031651, + "balance_loss_mlp": 0.34195149, + "epoch": 0.09565609499473922, + "flos": 32670845683200.0, + "grad_norm": 2.0550934775046947, + "language_loss": 0.88488436, + "learning_rate": 3.954951780487526e-06, + "loss": 0.90337068, + "num_input_tokens_seen": 34197760, + "router_z_loss_clip": 3.44921875, + "router_z_loss_mlp": 0.41064453, + "step": 1591, + "time_per_iteration": 2.800288677215576 + }, + { + "auxiliary_loss_clip": 0.01485254, + "auxiliary_loss_mlp": 0.00374049, + "balance_loss_clip": 1.12966633, + "balance_loss_mlp": 0.32829696, + "epoch": 0.09571621824740718, + "flos": 18478410670080.0, + "grad_norm": 34.44222560297388, + "language_loss": 0.859339, + "learning_rate": 3.9548695486584835e-06, + "loss": 0.87793207, + "num_input_tokens_seen": 34215330, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 0.45776367, + "step": 1592, + "time_per_iteration": 2.6514580249786377 + }, + { + "auxiliary_loss_clip": 0.01471372, + "auxiliary_loss_mlp": 0.00394593, + "balance_loss_clip": 1.12136495, + "balance_loss_mlp": 0.35122448, + "epoch": 0.09577634150007515, + "flos": 29387497334400.0, + "grad_norm": 11.410260545778037, + "language_loss": 0.80087972, + "learning_rate": 3.954787242700592e-06, + "loss": 0.81953937, + "num_input_tokens_seen": 34237745, + "router_z_loss_clip": 3.50195312, + "router_z_loss_mlp": 0.43359375, + "step": 1593, + "time_per_iteration": 2.7255873680114746 + }, + { + "auxiliary_loss_clip": 0.01476229, + "auxiliary_loss_mlp": 0.00377183, + "balance_loss_clip": 1.1287142, + "balance_loss_mlp": 0.33433884, + "epoch": 0.09583646475274313, + "flos": 22747830157440.0, + "grad_norm": 10.648045066655213, + "language_loss": 0.76151621, + "learning_rate": 3.954704862616971e-06, + "loss": 0.78005028, + "num_input_tokens_seen": 34256565, + "router_z_loss_clip": 3.47265625, + "router_z_loss_mlp": 0.4284668, + "step": 1594, + "time_per_iteration": 2.6383283138275146 + }, + { + "auxiliary_loss_clip": 0.01470646, + "auxiliary_loss_mlp": 0.00354301, + "balance_loss_clip": 1.12303591, + "balance_loss_mlp": 0.31279218, + "epoch": 0.0958965880054111, + "flos": 23218367345280.0, + "grad_norm": 28.02014346170689, + "language_loss": 0.89696717, + "learning_rate": 3.954622408410747e-06, + "loss": 0.91521668, + "num_input_tokens_seen": 34275970, + "router_z_loss_clip": 3.4765625, + "router_z_loss_mlp": 0.4152832, + "step": 1595, + "time_per_iteration": 2.680110454559326 + }, + { + "auxiliary_loss_clip": 0.01454898, + "auxiliary_loss_mlp": 0.00371408, + "balance_loss_clip": 1.11202383, + "balance_loss_mlp": 0.32849285, + "epoch": 0.09595671125807906, + "flos": 21324438933120.0, + "grad_norm": 9.600881169712252, + "language_loss": 0.93615103, + "learning_rate": 3.954539880085045e-06, + "loss": 0.95441407, + "num_input_tokens_seen": 34295490, + "router_z_loss_clip": 3.42773438, + "router_z_loss_mlp": 0.42895508, + "step": 1596, + "time_per_iteration": 2.6818222999572754 + }, + { + "auxiliary_loss_clip": 0.01478502, + "auxiliary_loss_mlp": 0.00349746, + "balance_loss_clip": 1.13161671, + "balance_loss_mlp": 0.31150377, + "epoch": 0.09601683451074704, + "flos": 39603472185600.0, + "grad_norm": 11.433726438916572, + "language_loss": 0.74922872, + "learning_rate": 3.9544572776429945e-06, + "loss": 0.76751119, + "num_input_tokens_seen": 34319990, + "router_z_loss_clip": 3.47265625, + "router_z_loss_mlp": 0.38232422, + "step": 1597, + "time_per_iteration": 2.805889368057251 + }, + { + "auxiliary_loss_clip": 0.01464834, + "auxiliary_loss_mlp": 0.00371706, + "balance_loss_clip": 1.11700821, + "balance_loss_mlp": 0.33229494, + "epoch": 0.096076957763415, + "flos": 23732716147200.0, + "grad_norm": 8.125810718576458, + "language_loss": 0.84064847, + "learning_rate": 3.954374601087729e-06, + "loss": 0.85901386, + "num_input_tokens_seen": 34339225, + "router_z_loss_clip": 3.47851562, + "router_z_loss_mlp": 0.39428711, + "step": 1598, + "time_per_iteration": 2.6935274600982666 + }, + { + "auxiliary_loss_clip": 0.01477652, + "auxiliary_loss_mlp": 0.00432739, + "balance_loss_clip": 1.12353015, + "balance_loss_mlp": 0.39030051, + "epoch": 0.09613708101608297, + "flos": 34678108483200.0, + "grad_norm": 11.647002242074175, + "language_loss": 0.74882346, + "learning_rate": 3.954291850422382e-06, + "loss": 0.76792741, + "num_input_tokens_seen": 34361020, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 0.42431641, + "step": 1599, + "time_per_iteration": 2.756392240524292 + }, + { + "auxiliary_loss_clip": 0.01454713, + "auxiliary_loss_mlp": 0.00359831, + "balance_loss_clip": 1.12457955, + "balance_loss_mlp": 0.32044381, + "epoch": 0.09619720426875093, + "flos": 20740028653440.0, + "grad_norm": 30.18276578540757, + "language_loss": 0.89875239, + "learning_rate": 3.954209025650093e-06, + "loss": 0.91689777, + "num_input_tokens_seen": 34378630, + "router_z_loss_clip": 3.30078125, + "router_z_loss_mlp": 0.39355469, + "step": 1600, + "time_per_iteration": 2.6578125953674316 + }, + { + "auxiliary_loss_clip": 0.01432234, + "auxiliary_loss_mlp": 0.00368497, + "balance_loss_clip": 1.09863353, + "balance_loss_mlp": 0.32806119, + "epoch": 0.09625732752141891, + "flos": 13042720488960.0, + "grad_norm": 238.33126297537575, + "language_loss": 0.88368344, + "learning_rate": 3.954126126774001e-06, + "loss": 0.90169072, + "num_input_tokens_seen": 34397110, + "router_z_loss_clip": 3.3359375, + "router_z_loss_mlp": 0.40454102, + "step": 1601, + "time_per_iteration": 2.715336799621582 + }, + { + "auxiliary_loss_clip": 0.01437705, + "auxiliary_loss_mlp": 0.0040907, + "balance_loss_clip": 1.09881997, + "balance_loss_mlp": 0.36510539, + "epoch": 0.09631745077408688, + "flos": 22273629782400.0, + "grad_norm": 330.1765393079444, + "language_loss": 0.88512212, + "learning_rate": 3.954043153797251e-06, + "loss": 0.90358984, + "num_input_tokens_seen": 34414165, + "router_z_loss_clip": 3.390625, + "router_z_loss_mlp": 0.43920898, + "step": 1602, + "time_per_iteration": 2.699397087097168 + }, + { + "auxiliary_loss_clip": 0.01404657, + "auxiliary_loss_mlp": 0.00363434, + "balance_loss_clip": 1.08432114, + "balance_loss_mlp": 0.32342711, + "epoch": 0.09637757402675484, + "flos": 24754266944640.0, + "grad_norm": 58.49719159649841, + "language_loss": 0.72358978, + "learning_rate": 3.953960106722989e-06, + "loss": 0.74127078, + "num_input_tokens_seen": 34434445, + "router_z_loss_clip": 3.20117188, + "router_z_loss_mlp": 0.40014648, + "step": 1603, + "time_per_iteration": 2.689565896987915 + }, + { + "auxiliary_loss_clip": 0.01414849, + "auxiliary_loss_mlp": 0.00388815, + "balance_loss_clip": 1.08947968, + "balance_loss_mlp": 0.34830758, + "epoch": 0.09643769727942282, + "flos": 22525758322560.0, + "grad_norm": 13.61654815514293, + "language_loss": 0.82450479, + "learning_rate": 3.953876985554364e-06, + "loss": 0.8425414, + "num_input_tokens_seen": 34453095, + "router_z_loss_clip": 3.2578125, + "router_z_loss_mlp": 0.4050293, + "step": 1604, + "time_per_iteration": 2.721919059753418 + }, + { + "auxiliary_loss_clip": 0.01396028, + "auxiliary_loss_mlp": 0.00375856, + "balance_loss_clip": 1.08172405, + "balance_loss_mlp": 0.3398551, + "epoch": 0.09649782053209079, + "flos": 30921026636160.0, + "grad_norm": 26.34210016506659, + "language_loss": 0.84968126, + "learning_rate": 3.953793790294527e-06, + "loss": 0.86740005, + "num_input_tokens_seen": 34473680, + "router_z_loss_clip": 3.140625, + "router_z_loss_mlp": 0.36035156, + "step": 1605, + "time_per_iteration": 2.7258007526397705 + }, + { + "auxiliary_loss_clip": 0.01405068, + "auxiliary_loss_mlp": 0.00399818, + "balance_loss_clip": 1.07626534, + "balance_loss_mlp": 0.36076462, + "epoch": 0.09655794378475875, + "flos": 25337635729920.0, + "grad_norm": 7.224346524094109, + "language_loss": 0.83657813, + "learning_rate": 3.953710520946634e-06, + "loss": 0.85462701, + "num_input_tokens_seen": 34492610, + "router_z_loss_clip": 3.28710938, + "router_z_loss_mlp": 0.39038086, + "step": 1606, + "time_per_iteration": 2.7228784561157227 + }, + { + "auxiliary_loss_clip": 0.01381636, + "auxiliary_loss_mlp": 0.00376243, + "balance_loss_clip": 1.06844008, + "balance_loss_mlp": 0.34083745, + "epoch": 0.09661806703742673, + "flos": 22346061557760.0, + "grad_norm": 187.0862912371859, + "language_loss": 0.82656562, + "learning_rate": 3.953627177513843e-06, + "loss": 0.84414434, + "num_input_tokens_seen": 34511855, + "router_z_loss_clip": 3.1328125, + "router_z_loss_mlp": 0.35400391, + "step": 1607, + "time_per_iteration": 2.6270315647125244 + }, + { + "auxiliary_loss_clip": 0.01385431, + "auxiliary_loss_mlp": 0.00353894, + "balance_loss_clip": 1.06754148, + "balance_loss_mlp": 0.31686792, + "epoch": 0.0966781902900947, + "flos": 17457578144640.0, + "grad_norm": 360.39419267827736, + "language_loss": 0.94055772, + "learning_rate": 3.953543759999312e-06, + "loss": 0.95795095, + "num_input_tokens_seen": 34528905, + "router_z_loss_clip": 3.18164062, + "router_z_loss_mlp": 0.37036133, + "step": 1608, + "time_per_iteration": 2.611989736557007 + }, + { + "auxiliary_loss_clip": 0.01388022, + "auxiliary_loss_mlp": 0.00394951, + "balance_loss_clip": 1.07045162, + "balance_loss_mlp": 0.35534942, + "epoch": 0.09673831354276266, + "flos": 36903995412480.0, + "grad_norm": 51.08193485225469, + "language_loss": 0.78378534, + "learning_rate": 3.953460268406207e-06, + "loss": 0.80161512, + "num_input_tokens_seen": 34548480, + "router_z_loss_clip": 3.17773438, + "router_z_loss_mlp": 0.39624023, + "step": 1609, + "time_per_iteration": 2.739863157272339 + }, + { + "auxiliary_loss_clip": 0.01375351, + "auxiliary_loss_mlp": 0.00363272, + "balance_loss_clip": 1.0613271, + "balance_loss_mlp": 0.32760417, + "epoch": 0.09679843679543064, + "flos": 20701388597760.0, + "grad_norm": 10.563222755965997, + "language_loss": 0.92151344, + "learning_rate": 3.953376702737693e-06, + "loss": 0.93889964, + "num_input_tokens_seen": 34565410, + "router_z_loss_clip": 3.13867188, + "router_z_loss_mlp": 0.35693359, + "step": 1610, + "time_per_iteration": 2.644197940826416 + }, + { + "auxiliary_loss_clip": 0.0138217, + "auxiliary_loss_mlp": 0.00350981, + "balance_loss_clip": 1.07012129, + "balance_loss_mlp": 0.31483662, + "epoch": 0.0968585600480986, + "flos": 23514415240320.0, + "grad_norm": 39.89982408706452, + "language_loss": 0.73902893, + "learning_rate": 3.953293062996939e-06, + "loss": 0.75636041, + "num_input_tokens_seen": 34584840, + "router_z_loss_clip": 3.12109375, + "router_z_loss_mlp": 0.36132812, + "step": 1611, + "time_per_iteration": 2.6438217163085938 + }, + { + "auxiliary_loss_clip": 0.01375535, + "auxiliary_loss_mlp": 0.0037727, + "balance_loss_clip": 1.06224203, + "balance_loss_mlp": 0.33869368, + "epoch": 0.09691868330076657, + "flos": 20121072468480.0, + "grad_norm": 10.446907198892651, + "language_loss": 0.87943256, + "learning_rate": 3.953209349187115e-06, + "loss": 0.89696062, + "num_input_tokens_seen": 34603360, + "router_z_loss_clip": 3.13476562, + "router_z_loss_mlp": 0.38574219, + "step": 1612, + "time_per_iteration": 2.6622846126556396 + }, + { + "auxiliary_loss_clip": 0.01378831, + "auxiliary_loss_mlp": 0.00355607, + "balance_loss_clip": 1.07094121, + "balance_loss_mlp": 0.32001123, + "epoch": 0.09697880655343454, + "flos": 16544692967040.0, + "grad_norm": 15.982028498510683, + "language_loss": 0.88025498, + "learning_rate": 3.953125561311398e-06, + "loss": 0.89759934, + "num_input_tokens_seen": 34620760, + "router_z_loss_clip": 3.08007812, + "router_z_loss_mlp": 0.35620117, + "step": 1613, + "time_per_iteration": 2.595209836959839 + }, + { + "auxiliary_loss_clip": 0.0137308, + "auxiliary_loss_mlp": 0.00349218, + "balance_loss_clip": 1.07069135, + "balance_loss_mlp": 0.31152433, + "epoch": 0.09703892980610251, + "flos": 26104184899200.0, + "grad_norm": 11.895974929224586, + "language_loss": 0.91014051, + "learning_rate": 3.953041699372964e-06, + "loss": 0.92736346, + "num_input_tokens_seen": 34640695, + "router_z_loss_clip": 3.0234375, + "router_z_loss_mlp": 0.37670898, + "step": 1614, + "time_per_iteration": 2.6693644523620605 + }, + { + "auxiliary_loss_clip": 0.01294948, + "auxiliary_loss_mlp": 0.00129416, + "balance_loss_clip": 1.07847786, + "balance_loss_mlp": 0.11616013, + "epoch": 0.09709905305877048, + "flos": 60443622000000.0, + "grad_norm": 2.2876741084324825, + "language_loss": 0.54809356, + "learning_rate": 3.952957763374992e-06, + "loss": 0.56233716, + "num_input_tokens_seen": 34702395, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.1328125, + "step": 1615, + "time_per_iteration": 3.119640350341797 + }, + { + "auxiliary_loss_clip": 0.0130366, + "auxiliary_loss_mlp": 0.0013799, + "balance_loss_clip": 1.08367229, + "balance_loss_mlp": 0.12602146, + "epoch": 0.09715917631143844, + "flos": 57639932893440.0, + "grad_norm": 5.855854192864963, + "language_loss": 0.58044302, + "learning_rate": 3.952873753320666e-06, + "loss": 0.5948596, + "num_input_tokens_seen": 34768910, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.11962891, + "step": 1616, + "time_per_iteration": 3.2822763919830322 + }, + { + "auxiliary_loss_clip": 0.01366331, + "auxiliary_loss_mlp": 0.00322466, + "balance_loss_clip": 1.06608224, + "balance_loss_mlp": 0.28713194, + "epoch": 0.09721929956410642, + "flos": 20558212986240.0, + "grad_norm": 4.957679542592365, + "language_loss": 0.74371392, + "learning_rate": 3.952789669213172e-06, + "loss": 0.76060188, + "num_input_tokens_seen": 34787680, + "router_z_loss_clip": 3.00195312, + "router_z_loss_mlp": 0.35302734, + "step": 1617, + "time_per_iteration": 2.674182653427124 + }, + { + "auxiliary_loss_clip": 0.0137517, + "auxiliary_loss_mlp": 0.00344133, + "balance_loss_clip": 1.06713128, + "balance_loss_mlp": 0.30686823, + "epoch": 0.09727942281677439, + "flos": 27344359825920.0, + "grad_norm": 55.12159380015707, + "language_loss": 0.86654687, + "learning_rate": 3.952705511055698e-06, + "loss": 0.88373989, + "num_input_tokens_seen": 34808330, + "router_z_loss_clip": 3.08007812, + "router_z_loss_mlp": 0.37255859, + "step": 1618, + "time_per_iteration": 2.7114498615264893 + }, + { + "auxiliary_loss_clip": 0.01364984, + "auxiliary_loss_mlp": 0.00305408, + "balance_loss_clip": 1.06822395, + "balance_loss_mlp": 0.27310205, + "epoch": 0.09733954606944235, + "flos": 24900028335360.0, + "grad_norm": 30.072303136014003, + "language_loss": 0.97694063, + "learning_rate": 3.952621278851435e-06, + "loss": 0.99364454, + "num_input_tokens_seen": 34830020, + "router_z_loss_clip": 2.96875, + "router_z_loss_mlp": 0.32324219, + "step": 1619, + "time_per_iteration": 2.683267593383789 + }, + { + "auxiliary_loss_clip": 0.01368553, + "auxiliary_loss_mlp": 0.00304764, + "balance_loss_clip": 1.06991768, + "balance_loss_mlp": 0.27098033, + "epoch": 0.09739966932211033, + "flos": 31503928544640.0, + "grad_norm": 5.0350542866414365, + "language_loss": 0.94946963, + "learning_rate": 3.9525369726035784e-06, + "loss": 0.96620274, + "num_input_tokens_seen": 34850330, + "router_z_loss_clip": 2.98632812, + "router_z_loss_mlp": 0.33764648, + "step": 1620, + "time_per_iteration": 2.731321096420288 + }, + { + "auxiliary_loss_clip": 0.01381503, + "auxiliary_loss_mlp": 0.00333597, + "balance_loss_clip": 1.0778439, + "balance_loss_mlp": 0.29730994, + "epoch": 0.0974597925747783, + "flos": 23878764846720.0, + "grad_norm": 26.31097156913841, + "language_loss": 0.84652954, + "learning_rate": 3.952452592315324e-06, + "loss": 0.86368048, + "num_input_tokens_seen": 34871640, + "router_z_loss_clip": 3.03320312, + "router_z_loss_mlp": 0.36279297, + "step": 1621, + "time_per_iteration": 4.102027177810669 + }, + { + "auxiliary_loss_clip": 0.01382471, + "auxiliary_loss_mlp": 0.00314216, + "balance_loss_clip": 1.07660162, + "balance_loss_mlp": 0.27633166, + "epoch": 0.09751991582744626, + "flos": 17019575700480.0, + "grad_norm": 11.276804478880166, + "language_loss": 0.82661992, + "learning_rate": 3.952368137989871e-06, + "loss": 0.8435868, + "num_input_tokens_seen": 34888100, + "router_z_loss_clip": 3.05664062, + "router_z_loss_mlp": 0.37890625, + "step": 1622, + "time_per_iteration": 2.6767466068267822 + }, + { + "auxiliary_loss_clip": 0.01394598, + "auxiliary_loss_mlp": 0.00319706, + "balance_loss_clip": 1.08397174, + "balance_loss_mlp": 0.28198802, + "epoch": 0.09758003908011423, + "flos": 28402826826240.0, + "grad_norm": 17.2295521800851, + "language_loss": 0.90989459, + "learning_rate": 3.9522836096304225e-06, + "loss": 0.9270376, + "num_input_tokens_seen": 34910485, + "router_z_loss_clip": 3.10742188, + "router_z_loss_mlp": 0.37744141, + "step": 1623, + "time_per_iteration": 4.288774251937866 + }, + { + "auxiliary_loss_clip": 0.0140463, + "auxiliary_loss_mlp": 0.00326162, + "balance_loss_clip": 1.09507561, + "balance_loss_mlp": 0.29182965, + "epoch": 0.09764016233278221, + "flos": 18144297336960.0, + "grad_norm": 37.17631508535162, + "language_loss": 0.89013153, + "learning_rate": 3.952199007240184e-06, + "loss": 0.90743947, + "num_input_tokens_seen": 34928615, + "router_z_loss_clip": 3.09960938, + "router_z_loss_mlp": 0.34326172, + "step": 1624, + "time_per_iteration": 2.604285478591919 + }, + { + "auxiliary_loss_clip": 0.01400579, + "auxiliary_loss_mlp": 0.00311985, + "balance_loss_clip": 1.09476292, + "balance_loss_mlp": 0.27724695, + "epoch": 0.09770028558545017, + "flos": 15265842071040.0, + "grad_norm": 51.66483367253249, + "language_loss": 0.93351501, + "learning_rate": 3.952114330822364e-06, + "loss": 0.95064062, + "num_input_tokens_seen": 34946045, + "router_z_loss_clip": 3.05859375, + "router_z_loss_mlp": 0.34716797, + "step": 1625, + "time_per_iteration": 2.6437041759490967 + }, + { + "auxiliary_loss_clip": 0.01428175, + "auxiliary_loss_mlp": 0.0032931, + "balance_loss_clip": 1.11567664, + "balance_loss_mlp": 0.28997111, + "epoch": 0.09776040883811814, + "flos": 23472435219840.0, + "grad_norm": 10.913433525491849, + "language_loss": 0.93571645, + "learning_rate": 3.952029580380172e-06, + "loss": 0.9532913, + "num_input_tokens_seen": 34962865, + "router_z_loss_clip": 3.12695312, + "router_z_loss_mlp": 0.39331055, + "step": 1626, + "time_per_iteration": 4.204698801040649 + }, + { + "auxiliary_loss_clip": 0.01441592, + "auxiliary_loss_mlp": 0.00308147, + "balance_loss_clip": 1.12127328, + "balance_loss_mlp": 0.26938018, + "epoch": 0.09782053209078612, + "flos": 24499480798080.0, + "grad_norm": 56.141400213673954, + "language_loss": 0.89405572, + "learning_rate": 3.9519447559168234e-06, + "loss": 0.91155308, + "num_input_tokens_seen": 34983505, + "router_z_loss_clip": 3.203125, + "router_z_loss_mlp": 0.38769531, + "step": 1627, + "time_per_iteration": 2.7062103748321533 + }, + { + "auxiliary_loss_clip": 0.01451777, + "auxiliary_loss_mlp": 0.0032704, + "balance_loss_clip": 1.13677704, + "balance_loss_mlp": 0.2882497, + "epoch": 0.09788065534345408, + "flos": 21580158833280.0, + "grad_norm": 188.67721871091837, + "language_loss": 0.90784442, + "learning_rate": 3.951859857435534e-06, + "loss": 0.9256326, + "num_input_tokens_seen": 35001825, + "router_z_loss_clip": 3.14648438, + "router_z_loss_mlp": 0.38793945, + "step": 1628, + "time_per_iteration": 4.093711614608765 + }, + { + "auxiliary_loss_clip": 0.01448772, + "auxiliary_loss_mlp": 0.00312192, + "balance_loss_clip": 1.13319123, + "balance_loss_mlp": 0.27635786, + "epoch": 0.09794077859612205, + "flos": 23842459175040.0, + "grad_norm": 390.511768140594, + "language_loss": 0.81458747, + "learning_rate": 3.951774884939523e-06, + "loss": 0.83219713, + "num_input_tokens_seen": 35023075, + "router_z_loss_clip": 3.15429688, + "router_z_loss_mlp": 0.35864258, + "step": 1629, + "time_per_iteration": 2.682356357574463 + }, + { + "auxiliary_loss_clip": 0.01462408, + "auxiliary_loss_mlp": 0.00296461, + "balance_loss_clip": 1.1371609, + "balance_loss_mlp": 0.25652623, + "epoch": 0.09800090184879003, + "flos": 23659889322240.0, + "grad_norm": 304.26986508420475, + "language_loss": 0.84501708, + "learning_rate": 3.951689838432013e-06, + "loss": 0.86260575, + "num_input_tokens_seen": 35043480, + "router_z_loss_clip": 3.25585938, + "router_z_loss_mlp": 0.39941406, + "step": 1630, + "time_per_iteration": 2.676539659500122 + }, + { + "auxiliary_loss_clip": 0.01482923, + "auxiliary_loss_mlp": 0.0033944, + "balance_loss_clip": 1.14890969, + "balance_loss_mlp": 0.30055392, + "epoch": 0.09806102510145799, + "flos": 17055773631360.0, + "grad_norm": 67.37776359726827, + "language_loss": 0.93010575, + "learning_rate": 3.951604717916228e-06, + "loss": 0.94832933, + "num_input_tokens_seen": 35061490, + "router_z_loss_clip": 3.33984375, + "router_z_loss_mlp": 0.38916016, + "step": 1631, + "time_per_iteration": 2.669018507003784 + }, + { + "auxiliary_loss_clip": 0.01456454, + "auxiliary_loss_mlp": 0.00302459, + "balance_loss_clip": 1.13027453, + "balance_loss_mlp": 0.26660094, + "epoch": 0.09812114835412596, + "flos": 23878477537920.0, + "grad_norm": 5.085698129476178, + "language_loss": 0.88871658, + "learning_rate": 3.9515195233953975e-06, + "loss": 0.90630567, + "num_input_tokens_seen": 35079670, + "router_z_loss_clip": 3.25976562, + "router_z_loss_mlp": 0.35864258, + "step": 1632, + "time_per_iteration": 2.668764591217041 + }, + { + "auxiliary_loss_clip": 0.01447343, + "auxiliary_loss_mlp": 0.00303951, + "balance_loss_clip": 1.12104309, + "balance_loss_mlp": 0.26697278, + "epoch": 0.09818127160679392, + "flos": 20595488325120.0, + "grad_norm": 5.160438317551523, + "language_loss": 0.83777547, + "learning_rate": 3.951434254872751e-06, + "loss": 0.85528845, + "num_input_tokens_seen": 35099205, + "router_z_loss_clip": 3.26171875, + "router_z_loss_mlp": 0.36987305, + "step": 1633, + "time_per_iteration": 2.747514247894287 + }, + { + "auxiliary_loss_clip": 0.01445719, + "auxiliary_loss_mlp": 0.00336213, + "balance_loss_clip": 1.11935377, + "balance_loss_mlp": 0.29513341, + "epoch": 0.0982413948594619, + "flos": 15487339288320.0, + "grad_norm": 12.407250287034092, + "language_loss": 0.80728757, + "learning_rate": 3.951348912351521e-06, + "loss": 0.82510686, + "num_input_tokens_seen": 35115270, + "router_z_loss_clip": 3.26171875, + "router_z_loss_mlp": 0.41088867, + "step": 1634, + "time_per_iteration": 2.598177909851074 + }, + { + "auxiliary_loss_clip": 0.0144646, + "auxiliary_loss_mlp": 0.00291074, + "balance_loss_clip": 1.11214113, + "balance_loss_mlp": 0.25135383, + "epoch": 0.09830151811212987, + "flos": 24207958016640.0, + "grad_norm": 5.187586179991366, + "language_loss": 0.85152233, + "learning_rate": 3.951263495834947e-06, + "loss": 0.86889756, + "num_input_tokens_seen": 35134065, + "router_z_loss_clip": 3.34375, + "router_z_loss_mlp": 0.3972168, + "step": 1635, + "time_per_iteration": 2.729297399520874 + }, + { + "auxiliary_loss_clip": 0.01465532, + "auxiliary_loss_mlp": 0.00293758, + "balance_loss_clip": 1.13040185, + "balance_loss_mlp": 0.24857731, + "epoch": 0.09836164136479783, + "flos": 20594590485120.0, + "grad_norm": 22.188453190597773, + "language_loss": 0.831532, + "learning_rate": 3.951178005326264e-06, + "loss": 0.84912479, + "num_input_tokens_seen": 35154870, + "router_z_loss_clip": 3.34960938, + "router_z_loss_mlp": 0.45166016, + "step": 1636, + "time_per_iteration": 2.774968147277832 + }, + { + "auxiliary_loss_clip": 0.01461645, + "auxiliary_loss_mlp": 0.00320959, + "balance_loss_clip": 1.12905419, + "balance_loss_mlp": 0.27692294, + "epoch": 0.09842176461746581, + "flos": 19934157070080.0, + "grad_norm": 4.79021418135691, + "language_loss": 0.77162182, + "learning_rate": 3.951092440828715e-06, + "loss": 0.78944778, + "num_input_tokens_seen": 35171850, + "router_z_loss_clip": 3.328125, + "router_z_loss_mlp": 0.43994141, + "step": 1637, + "time_per_iteration": 2.685391426086426 + }, + { + "auxiliary_loss_clip": 0.01447096, + "auxiliary_loss_mlp": 0.00292509, + "balance_loss_clip": 1.11828542, + "balance_loss_mlp": 0.25429046, + "epoch": 0.09848188787013377, + "flos": 21214659991680.0, + "grad_norm": 6.175645093288549, + "language_loss": 0.84843856, + "learning_rate": 3.951006802345545e-06, + "loss": 0.86583465, + "num_input_tokens_seen": 35188795, + "router_z_loss_clip": 3.29101562, + "router_z_loss_mlp": 0.38208008, + "step": 1638, + "time_per_iteration": 2.6604514122009277 + }, + { + "auxiliary_loss_clip": 0.01435713, + "auxiliary_loss_mlp": 0.00299368, + "balance_loss_clip": 1.10743308, + "balance_loss_mlp": 0.26391506, + "epoch": 0.09854201112280174, + "flos": 30154226071680.0, + "grad_norm": 6.420272583261749, + "language_loss": 0.78366297, + "learning_rate": 3.950921089880003e-06, + "loss": 0.80101383, + "num_input_tokens_seen": 35212100, + "router_z_loss_clip": 3.27929688, + "router_z_loss_mlp": 0.35449219, + "step": 1639, + "time_per_iteration": 2.8032875061035156 + }, + { + "auxiliary_loss_clip": 0.01445628, + "auxiliary_loss_mlp": 0.00292475, + "balance_loss_clip": 1.11155391, + "balance_loss_mlp": 0.25318322, + "epoch": 0.09860213437546972, + "flos": 21795730306560.0, + "grad_norm": 26.462372245390814, + "language_loss": 0.95435041, + "learning_rate": 3.950835303435337e-06, + "loss": 0.97173142, + "num_input_tokens_seen": 35230390, + "router_z_loss_clip": 3.34179688, + "router_z_loss_mlp": 0.39306641, + "step": 1640, + "time_per_iteration": 2.7309720516204834 + }, + { + "auxiliary_loss_clip": 0.01427115, + "auxiliary_loss_mlp": 0.00284209, + "balance_loss_clip": 1.09873629, + "balance_loss_mlp": 0.24413131, + "epoch": 0.09866225762813768, + "flos": 21835555511040.0, + "grad_norm": 882.1241222844299, + "language_loss": 0.89556414, + "learning_rate": 3.950749443014801e-06, + "loss": 0.91267741, + "num_input_tokens_seen": 35250405, + "router_z_loss_clip": 3.28515625, + "router_z_loss_mlp": 0.40063477, + "step": 1641, + "time_per_iteration": 2.6903560161590576 + }, + { + "auxiliary_loss_clip": 0.014426, + "auxiliary_loss_mlp": 0.00311693, + "balance_loss_clip": 1.11048245, + "balance_loss_mlp": 0.26961216, + "epoch": 0.09872238088080565, + "flos": 17599855916160.0, + "grad_norm": 12.390994715658211, + "language_loss": 0.96133423, + "learning_rate": 3.95066350862165e-06, + "loss": 0.97887719, + "num_input_tokens_seen": 35262820, + "router_z_loss_clip": 3.32226562, + "router_z_loss_mlp": 0.42089844, + "step": 1642, + "time_per_iteration": 2.650209903717041 + }, + { + "auxiliary_loss_clip": 0.01417788, + "auxiliary_loss_mlp": 0.00272326, + "balance_loss_clip": 1.09144521, + "balance_loss_mlp": 0.23248588, + "epoch": 0.09878250413347361, + "flos": 27636134002560.0, + "grad_norm": 11.404019135932062, + "language_loss": 0.85627437, + "learning_rate": 3.950577500259144e-06, + "loss": 0.8731755, + "num_input_tokens_seen": 35284490, + "router_z_loss_clip": 3.25976562, + "router_z_loss_mlp": 0.39868164, + "step": 1643, + "time_per_iteration": 2.7236762046813965 + }, + { + "auxiliary_loss_clip": 0.0143033, + "auxiliary_loss_mlp": 0.00278208, + "balance_loss_clip": 1.10411024, + "balance_loss_mlp": 0.23836799, + "epoch": 0.0988426273861416, + "flos": 16544728880640.0, + "grad_norm": 2.2323141187370847, + "language_loss": 0.90144938, + "learning_rate": 3.950491417930543e-06, + "loss": 0.91853476, + "num_input_tokens_seen": 35302815, + "router_z_loss_clip": 3.26171875, + "router_z_loss_mlp": 0.39794922, + "step": 1644, + "time_per_iteration": 2.7793643474578857 + }, + { + "auxiliary_loss_clip": 0.01405208, + "auxiliary_loss_mlp": 0.00243818, + "balance_loss_clip": 1.08328223, + "balance_loss_mlp": 0.20431212, + "epoch": 0.09890275063880956, + "flos": 21215270522880.0, + "grad_norm": 154.41265567661176, + "language_loss": 0.76154464, + "learning_rate": 3.9504052616391124e-06, + "loss": 0.77803487, + "num_input_tokens_seen": 35321175, + "router_z_loss_clip": 3.22070312, + "router_z_loss_mlp": 0.39501953, + "step": 1645, + "time_per_iteration": 2.614414691925049 + }, + { + "auxiliary_loss_clip": 0.01417552, + "auxiliary_loss_mlp": 0.00101531, + "balance_loss_clip": 1.15992928, + "balance_loss_mlp": 0.09065877, + "epoch": 0.09896287389147752, + "flos": 59379372910080.0, + "grad_norm": 0.9210233144219142, + "language_loss": 0.60677552, + "learning_rate": 3.950319031388119e-06, + "loss": 0.62196636, + "num_input_tokens_seen": 35381740, + "router_z_loss_clip": 2.578125, + "router_z_loss_mlp": 0.10888672, + "step": 1646, + "time_per_iteration": 3.06249737739563 + }, + { + "auxiliary_loss_clip": 0.01414766, + "auxiliary_loss_mlp": 0.00280914, + "balance_loss_clip": 1.08509278, + "balance_loss_mlp": 0.23930976, + "epoch": 0.0990229971441455, + "flos": 29642678530560.0, + "grad_norm": 437.3844928219182, + "language_loss": 0.80162436, + "learning_rate": 3.950232727180833e-06, + "loss": 0.8185811, + "num_input_tokens_seen": 35403760, + "router_z_loss_clip": 3.296875, + "router_z_loss_mlp": 0.41625977, + "step": 1647, + "time_per_iteration": 2.755734443664551 + }, + { + "auxiliary_loss_clip": 0.01417598, + "auxiliary_loss_mlp": 0.00268623, + "balance_loss_clip": 1.09185374, + "balance_loss_mlp": 0.23176306, + "epoch": 0.09908312039681347, + "flos": 21834873152640.0, + "grad_norm": 5.233240041146046, + "language_loss": 0.91865826, + "learning_rate": 3.950146349020525e-06, + "loss": 0.93552041, + "num_input_tokens_seen": 35424050, + "router_z_loss_clip": 3.25390625, + "router_z_loss_mlp": 0.36889648, + "step": 1648, + "time_per_iteration": 2.6644535064697266 + }, + { + "auxiliary_loss_clip": 0.01383242, + "auxiliary_loss_mlp": 0.00116059, + "balance_loss_clip": 1.14814699, + "balance_loss_mlp": 0.10609303, + "epoch": 0.09914324364948143, + "flos": 57564304807680.0, + "grad_norm": 0.7517902942788768, + "language_loss": 0.55603468, + "learning_rate": 3.950059896910473e-06, + "loss": 0.57102764, + "num_input_tokens_seen": 35481690, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.09960938, + "step": 1649, + "time_per_iteration": 3.0834810733795166 + }, + { + "auxiliary_loss_clip": 0.01412644, + "auxiliary_loss_mlp": 0.00279201, + "balance_loss_clip": 1.09079838, + "balance_loss_mlp": 0.24327101, + "epoch": 0.09920336690214941, + "flos": 34123934476800.0, + "grad_norm": 729.2221105261593, + "language_loss": 0.97997546, + "learning_rate": 3.949973370853954e-06, + "loss": 0.99689388, + "num_input_tokens_seen": 35498635, + "router_z_loss_clip": 3.21484375, + "router_z_loss_mlp": 0.359375, + "step": 1650, + "time_per_iteration": 2.684771776199341 + }, + { + "auxiliary_loss_clip": 0.01398386, + "auxiliary_loss_mlp": 0.00121662, + "balance_loss_clip": 1.17629623, + "balance_loss_mlp": 0.11160024, + "epoch": 0.09926349015481738, + "flos": 71216428464000.0, + "grad_norm": 0.7993799667827964, + "language_loss": 0.63431954, + "learning_rate": 3.94988677085425e-06, + "loss": 0.64951998, + "num_input_tokens_seen": 35565720, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.10058594, + "step": 1651, + "time_per_iteration": 3.2925758361816406 + }, + { + "auxiliary_loss_clip": 0.0140087, + "auxiliary_loss_mlp": 0.00262476, + "balance_loss_clip": 1.08953261, + "balance_loss_mlp": 0.22485337, + "epoch": 0.09932361340748534, + "flos": 23148700917120.0, + "grad_norm": 3.756383771269253, + "language_loss": 0.94219649, + "learning_rate": 3.949800096914643e-06, + "loss": 0.95883, + "num_input_tokens_seen": 35586000, + "router_z_loss_clip": 3.11328125, + "router_z_loss_mlp": 0.3762207, + "step": 1652, + "time_per_iteration": 2.666222095489502 + }, + { + "auxiliary_loss_clip": 0.01425063, + "auxiliary_loss_mlp": 0.00310801, + "balance_loss_clip": 1.10903072, + "balance_loss_mlp": 0.27069882, + "epoch": 0.09938373666015332, + "flos": 19828651847040.0, + "grad_norm": 58.590945464554004, + "language_loss": 0.86843073, + "learning_rate": 3.949713349038422e-06, + "loss": 0.88578939, + "num_input_tokens_seen": 35604355, + "router_z_loss_clip": 3.16210938, + "router_z_loss_mlp": 0.40136719, + "step": 1653, + "time_per_iteration": 2.698317766189575 + }, + { + "auxiliary_loss_clip": 0.01410764, + "auxiliary_loss_mlp": 0.00281301, + "balance_loss_clip": 1.10168052, + "balance_loss_mlp": 0.24634862, + "epoch": 0.09944385991282129, + "flos": 22090664880000.0, + "grad_norm": 2.415372573811635, + "language_loss": 0.86359215, + "learning_rate": 3.949626527228875e-06, + "loss": 0.88051271, + "num_input_tokens_seen": 35625495, + "router_z_loss_clip": 3.09375, + "router_z_loss_mlp": 0.34960938, + "step": 1654, + "time_per_iteration": 2.663205862045288 + }, + { + "auxiliary_loss_clip": 0.01393019, + "auxiliary_loss_mlp": 0.00259011, + "balance_loss_clip": 1.10055566, + "balance_loss_mlp": 0.22672895, + "epoch": 0.09950398316548925, + "flos": 19828867328640.0, + "grad_norm": 123.67061679825424, + "language_loss": 0.86177742, + "learning_rate": 3.949539631489295e-06, + "loss": 0.87829775, + "num_input_tokens_seen": 35645030, + "router_z_loss_clip": 2.92382812, + "router_z_loss_mlp": 0.32275391, + "step": 1655, + "time_per_iteration": 2.724569082260132 + }, + { + "auxiliary_loss_clip": 0.01435015, + "auxiliary_loss_mlp": 0.00303977, + "balance_loss_clip": 1.12446809, + "balance_loss_mlp": 0.26373199, + "epoch": 0.09956410641815722, + "flos": 25003701964800.0, + "grad_norm": 15.390113460269587, + "language_loss": 0.8770988, + "learning_rate": 3.9494526618229765e-06, + "loss": 0.89448869, + "num_input_tokens_seen": 35664305, + "router_z_loss_clip": 3.10546875, + "router_z_loss_mlp": 0.40258789, + "step": 1656, + "time_per_iteration": 2.645508050918579 + }, + { + "auxiliary_loss_clip": 0.0142248, + "auxiliary_loss_mlp": 0.0029147, + "balance_loss_clip": 1.12607574, + "balance_loss_mlp": 0.25673288, + "epoch": 0.0996242296708252, + "flos": 19317714837120.0, + "grad_norm": 3.204614845971538, + "language_loss": 0.94127679, + "learning_rate": 3.949365618233217e-06, + "loss": 0.95841628, + "num_input_tokens_seen": 35684060, + "router_z_loss_clip": 2.96484375, + "router_z_loss_mlp": 0.34741211, + "step": 1657, + "time_per_iteration": 2.644312620162964 + }, + { + "auxiliary_loss_clip": 0.01442547, + "auxiliary_loss_mlp": 0.00315656, + "balance_loss_clip": 1.13442957, + "balance_loss_mlp": 0.27636468, + "epoch": 0.09968435292349316, + "flos": 21871609787520.0, + "grad_norm": 9.11912395004817, + "language_loss": 0.91288, + "learning_rate": 3.9492785007233195e-06, + "loss": 0.930462, + "num_input_tokens_seen": 35703250, + "router_z_loss_clip": 3.078125, + "router_z_loss_mlp": 0.39306641, + "step": 1658, + "time_per_iteration": 2.6545588970184326 + }, + { + "auxiliary_loss_clip": 0.01549162, + "auxiliary_loss_mlp": 0.00207984, + "balance_loss_clip": 1.34527731, + "balance_loss_mlp": 0.19272523, + "epoch": 0.09974447617616113, + "flos": 65384533313280.0, + "grad_norm": 0.9352830289529805, + "language_loss": 0.60266578, + "learning_rate": 3.949191309296585e-06, + "loss": 0.62023729, + "num_input_tokens_seen": 35762165, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.15234375, + "step": 1659, + "time_per_iteration": 3.163952112197876 + }, + { + "auxiliary_loss_clip": 0.01413159, + "auxiliary_loss_mlp": 0.00257895, + "balance_loss_clip": 1.12765956, + "balance_loss_mlp": 0.22270395, + "epoch": 0.0998045994288291, + "flos": 23659817495040.0, + "grad_norm": 107.55994449957241, + "language_loss": 0.93383938, + "learning_rate": 3.949104043956321e-06, + "loss": 0.95054996, + "num_input_tokens_seen": 35781520, + "router_z_loss_clip": 2.85351562, + "router_z_loss_mlp": 0.35131836, + "step": 1660, + "time_per_iteration": 2.7075650691986084 + }, + { + "auxiliary_loss_clip": 0.01442641, + "auxiliary_loss_mlp": 0.00259285, + "balance_loss_clip": 1.14798045, + "balance_loss_mlp": 0.22168659, + "epoch": 0.09986472268149707, + "flos": 19609704495360.0, + "grad_norm": 41.6734041499813, + "language_loss": 0.87648177, + "learning_rate": 3.949016704705836e-06, + "loss": 0.89350104, + "num_input_tokens_seen": 35799565, + "router_z_loss_clip": 2.9453125, + "router_z_loss_mlp": 0.3762207, + "step": 1661, + "time_per_iteration": 2.6705164909362793 + }, + { + "auxiliary_loss_clip": 0.01430529, + "auxiliary_loss_mlp": 0.00293332, + "balance_loss_clip": 1.13189209, + "balance_loss_mlp": 0.25530419, + "epoch": 0.09992484593416504, + "flos": 26213317395840.0, + "grad_norm": 426.2581226746184, + "language_loss": 0.92841876, + "learning_rate": 3.948929291548443e-06, + "loss": 0.94565737, + "num_input_tokens_seen": 35821085, + "router_z_loss_clip": 2.984375, + "router_z_loss_mlp": 0.38061523, + "step": 1662, + "time_per_iteration": 2.758939504623413 + }, + { + "auxiliary_loss_clip": 0.0143714, + "auxiliary_loss_mlp": 0.00275315, + "balance_loss_clip": 1.14108229, + "balance_loss_mlp": 0.23549867, + "epoch": 0.09998496918683301, + "flos": 17493632421120.0, + "grad_norm": 17.1901505029231, + "language_loss": 0.9703455, + "learning_rate": 3.9488418044874546e-06, + "loss": 0.98747009, + "num_input_tokens_seen": 35839840, + "router_z_loss_clip": 2.95898438, + "router_z_loss_mlp": 0.39868164, + "step": 1663, + "time_per_iteration": 2.6607539653778076 + }, + { + "auxiliary_loss_clip": 0.01437242, + "auxiliary_loss_mlp": 0.002511, + "balance_loss_clip": 1.14134049, + "balance_loss_mlp": 0.21312006, + "epoch": 0.10004509243950098, + "flos": 22784925928320.0, + "grad_norm": 157.49998123032466, + "language_loss": 0.79141688, + "learning_rate": 3.948754243526191e-06, + "loss": 0.80830038, + "num_input_tokens_seen": 35861545, + "router_z_loss_clip": 2.9609375, + "router_z_loss_mlp": 0.37963867, + "step": 1664, + "time_per_iteration": 4.176863670349121 + }, + { + "auxiliary_loss_clip": 0.01426042, + "auxiliary_loss_mlp": 0.00242867, + "balance_loss_clip": 1.13227177, + "balance_loss_mlp": 0.20715141, + "epoch": 0.10010521569216894, + "flos": 16253385667200.0, + "grad_norm": 176.79789016053664, + "language_loss": 0.88678372, + "learning_rate": 3.94866660866797e-06, + "loss": 0.90347278, + "num_input_tokens_seen": 35878295, + "router_z_loss_clip": 2.9375, + "router_z_loss_mlp": 0.35742188, + "step": 1665, + "time_per_iteration": 4.265687942504883 + }, + { + "auxiliary_loss_clip": 0.01424719, + "auxiliary_loss_mlp": 0.00226468, + "balance_loss_clip": 1.13280511, + "balance_loss_mlp": 0.19087201, + "epoch": 0.10016533894483691, + "flos": 23402589223680.0, + "grad_norm": 25.438376289177135, + "language_loss": 0.76097465, + "learning_rate": 3.9485788999161165e-06, + "loss": 0.77748656, + "num_input_tokens_seen": 35898990, + "router_z_loss_clip": 2.91992188, + "router_z_loss_mlp": 0.35571289, + "step": 1666, + "time_per_iteration": 2.6964516639709473 + }, + { + "auxiliary_loss_clip": 0.0143677, + "auxiliary_loss_mlp": 0.00239459, + "balance_loss_clip": 1.13127327, + "balance_loss_mlp": 0.19852208, + "epoch": 0.10022546219750489, + "flos": 19354164163200.0, + "grad_norm": 6.191282033078129, + "language_loss": 0.87066084, + "learning_rate": 3.948491117273956e-06, + "loss": 0.8874231, + "num_input_tokens_seen": 35916225, + "router_z_loss_clip": 3.0546875, + "router_z_loss_mlp": 0.40917969, + "step": 1667, + "time_per_iteration": 2.668174982070923 + }, + { + "auxiliary_loss_clip": 0.01429857, + "auxiliary_loss_mlp": 0.00234921, + "balance_loss_clip": 1.13258195, + "balance_loss_mlp": 0.19760823, + "epoch": 0.10028558545017285, + "flos": 27085766837760.0, + "grad_norm": 114.51418818226877, + "language_loss": 0.8688609, + "learning_rate": 3.948403260744817e-06, + "loss": 0.88550866, + "num_input_tokens_seen": 35934630, + "router_z_loss_clip": 2.97460938, + "router_z_loss_mlp": 0.37304688, + "step": 1668, + "time_per_iteration": 4.189879894256592 + }, + { + "auxiliary_loss_clip": 0.01434699, + "auxiliary_loss_mlp": 0.00227556, + "balance_loss_clip": 1.1325829, + "balance_loss_mlp": 0.19012442, + "epoch": 0.10034570870284082, + "flos": 25847136195840.0, + "grad_norm": 22.691598928189748, + "language_loss": 0.8759082, + "learning_rate": 3.948315330332031e-06, + "loss": 0.89253074, + "num_input_tokens_seen": 35953855, + "router_z_loss_clip": 3.01953125, + "router_z_loss_mlp": 0.37451172, + "step": 1669, + "time_per_iteration": 2.672041893005371 + }, + { + "auxiliary_loss_clip": 0.01439864, + "auxiliary_loss_mlp": 0.0023979, + "balance_loss_clip": 1.13469648, + "balance_loss_mlp": 0.20016462, + "epoch": 0.1004058319555088, + "flos": 26249587153920.0, + "grad_norm": 10.14360407598895, + "language_loss": 0.95060164, + "learning_rate": 3.948227326038933e-06, + "loss": 0.96739817, + "num_input_tokens_seen": 35974555, + "router_z_loss_clip": 3.04882812, + "router_z_loss_mlp": 0.39624023, + "step": 1670, + "time_per_iteration": 4.0912933349609375 + }, + { + "auxiliary_loss_clip": 0.01434845, + "auxiliary_loss_mlp": 0.00185353, + "balance_loss_clip": 1.14061391, + "balance_loss_mlp": 0.15278493, + "epoch": 0.10046595520817676, + "flos": 25374480105600.0, + "grad_norm": 17.57773720044362, + "language_loss": 0.83600575, + "learning_rate": 3.9481392478688586e-06, + "loss": 0.85220778, + "num_input_tokens_seen": 35996830, + "router_z_loss_clip": 2.93945312, + "router_z_loss_mlp": 0.32568359, + "step": 1671, + "time_per_iteration": 2.6996774673461914 + }, + { + "auxiliary_loss_clip": 0.01464616, + "auxiliary_loss_mlp": 0.00123662, + "balance_loss_clip": 1.27898288, + "balance_loss_mlp": 0.11383937, + "epoch": 0.10052607846084473, + "flos": 67461821677440.0, + "grad_norm": 0.8019365244581269, + "language_loss": 0.60874546, + "learning_rate": 3.948051095825149e-06, + "loss": 0.62462819, + "num_input_tokens_seen": 36054465, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.09814453, + "step": 1672, + "time_per_iteration": 3.0950546264648438 + }, + { + "auxiliary_loss_clip": 0.01421163, + "auxiliary_loss_mlp": 0.0021351, + "balance_loss_clip": 1.11952817, + "balance_loss_mlp": 0.17908221, + "epoch": 0.10058620171351271, + "flos": 21360493209600.0, + "grad_norm": 30.17956306927547, + "language_loss": 0.85099995, + "learning_rate": 3.947962869911147e-06, + "loss": 0.86734664, + "num_input_tokens_seen": 36073480, + "router_z_loss_clip": 3.01367188, + "router_z_loss_mlp": 0.34448242, + "step": 1673, + "time_per_iteration": 2.6489686965942383 + }, + { + "auxiliary_loss_clip": 0.01411727, + "auxiliary_loss_mlp": 0.00195937, + "balance_loss_clip": 1.1093303, + "balance_loss_mlp": 0.15965018, + "epoch": 0.10064632496618067, + "flos": 16800125558400.0, + "grad_norm": 20.23488339568771, + "language_loss": 0.85137451, + "learning_rate": 3.947874570130197e-06, + "loss": 0.86745119, + "num_input_tokens_seen": 36091830, + "router_z_loss_clip": 3.02148438, + "router_z_loss_mlp": 0.36303711, + "step": 1674, + "time_per_iteration": 2.609600782394409 + }, + { + "auxiliary_loss_clip": 0.01412876, + "auxiliary_loss_mlp": 0.00228919, + "balance_loss_clip": 1.10891938, + "balance_loss_mlp": 0.19542141, + "epoch": 0.10070644821884864, + "flos": 23624445576960.0, + "grad_norm": 33.302508536933516, + "language_loss": 0.88327658, + "learning_rate": 3.947786196485649e-06, + "loss": 0.89969456, + "num_input_tokens_seen": 36111400, + "router_z_loss_clip": 3.0390625, + "router_z_loss_mlp": 0.33496094, + "step": 1675, + "time_per_iteration": 2.696009874343872 + }, + { + "auxiliary_loss_clip": 0.01419315, + "auxiliary_loss_mlp": 0.00206404, + "balance_loss_clip": 1.10375834, + "balance_loss_mlp": 0.17281038, + "epoch": 0.1007665714715166, + "flos": 24462564595200.0, + "grad_norm": 5.336583009923454, + "language_loss": 0.89189374, + "learning_rate": 3.947697748980853e-06, + "loss": 0.90815091, + "num_input_tokens_seen": 36129345, + "router_z_loss_clip": 3.15625, + "router_z_loss_mlp": 0.33569336, + "step": 1676, + "time_per_iteration": 2.6659927368164062 + }, + { + "auxiliary_loss_clip": 0.0141732, + "auxiliary_loss_mlp": 0.00210008, + "balance_loss_clip": 1.10244513, + "balance_loss_mlp": 0.17865562, + "epoch": 0.10082669472418458, + "flos": 16799119977600.0, + "grad_norm": 11.763357457203323, + "language_loss": 0.92486691, + "learning_rate": 3.947609227619163e-06, + "loss": 0.94114012, + "num_input_tokens_seen": 36146255, + "router_z_loss_clip": 3.14648438, + "router_z_loss_mlp": 0.31347656, + "step": 1677, + "time_per_iteration": 2.6620006561279297 + }, + { + "auxiliary_loss_clip": 0.01414404, + "auxiliary_loss_mlp": 0.00226389, + "balance_loss_clip": 1.09996927, + "balance_loss_mlp": 0.19236687, + "epoch": 0.10088681797685255, + "flos": 13553513844480.0, + "grad_norm": 2.41861870268386, + "language_loss": 0.94003248, + "learning_rate": 3.947520632403936e-06, + "loss": 0.95644039, + "num_input_tokens_seen": 36164050, + "router_z_loss_clip": 3.14257812, + "router_z_loss_mlp": 0.34008789, + "step": 1678, + "time_per_iteration": 2.5857696533203125 + }, + { + "auxiliary_loss_clip": 0.0141754, + "auxiliary_loss_mlp": 0.00240343, + "balance_loss_clip": 1.10247672, + "balance_loss_mlp": 0.20846653, + "epoch": 0.10094694122952051, + "flos": 25265706744960.0, + "grad_norm": 3.37059595808477, + "language_loss": 0.97837138, + "learning_rate": 3.947431963338532e-06, + "loss": 0.99495029, + "num_input_tokens_seen": 36183530, + "router_z_loss_clip": 3.15039062, + "router_z_loss_mlp": 0.31884766, + "step": 1679, + "time_per_iteration": 2.7101662158966064 + }, + { + "auxiliary_loss_clip": 0.01296312, + "auxiliary_loss_mlp": 0.00132391, + "balance_loss_clip": 1.10422802, + "balance_loss_mlp": 0.11961204, + "epoch": 0.10100706448218849, + "flos": 69854299885440.0, + "grad_norm": 0.8173892981822501, + "language_loss": 0.53013223, + "learning_rate": 3.947343220426312e-06, + "loss": 0.54441923, + "num_input_tokens_seen": 36248550, + "router_z_loss_clip": 1.921875, + "router_z_loss_mlp": 0.12792969, + "step": 1680, + "time_per_iteration": 3.205596446990967 + }, + { + "auxiliary_loss_clip": 0.01414649, + "auxiliary_loss_mlp": 0.00226822, + "balance_loss_clip": 1.09729505, + "balance_loss_mlp": 0.19377714, + "epoch": 0.10106718773485646, + "flos": 20007163463040.0, + "grad_norm": 4.803159303337, + "language_loss": 0.84584606, + "learning_rate": 3.947254403670641e-06, + "loss": 0.86226082, + "num_input_tokens_seen": 36266065, + "router_z_loss_clip": 3.17382812, + "router_z_loss_mlp": 0.33056641, + "step": 1681, + "time_per_iteration": 2.784868001937866 + }, + { + "auxiliary_loss_clip": 0.0143211, + "auxiliary_loss_mlp": 0.00300444, + "balance_loss_clip": 1.11296535, + "balance_loss_mlp": 0.26489624, + "epoch": 0.10112731098752442, + "flos": 13479825093120.0, + "grad_norm": 133.00649642392773, + "language_loss": 1.03123105, + "learning_rate": 3.947165513074889e-06, + "loss": 1.04855669, + "num_input_tokens_seen": 36280960, + "router_z_loss_clip": 3.18945312, + "router_z_loss_mlp": 0.35571289, + "step": 1682, + "time_per_iteration": 2.7600817680358887 + }, + { + "auxiliary_loss_clip": 0.01423609, + "auxiliary_loss_mlp": 0.00257474, + "balance_loss_clip": 1.1057204, + "balance_loss_mlp": 0.22459617, + "epoch": 0.1011874342401924, + "flos": 18515901490560.0, + "grad_norm": 3.7287758085842624, + "language_loss": 0.93201721, + "learning_rate": 3.947076548642425e-06, + "loss": 0.94882798, + "num_input_tokens_seen": 36299010, + "router_z_loss_clip": 3.1796875, + "router_z_loss_mlp": 0.32885742, + "step": 1683, + "time_per_iteration": 2.761664628982544 + }, + { + "auxiliary_loss_clip": 0.01423101, + "auxiliary_loss_mlp": 0.00304024, + "balance_loss_clip": 1.1126833, + "balance_loss_mlp": 0.27311268, + "epoch": 0.10124755749286037, + "flos": 20702861055360.0, + "grad_norm": 5.602867533156251, + "language_loss": 0.82046485, + "learning_rate": 3.946987510376624e-06, + "loss": 0.83773601, + "num_input_tokens_seen": 36318400, + "router_z_loss_clip": 3.1015625, + "router_z_loss_mlp": 0.30908203, + "step": 1684, + "time_per_iteration": 2.8779306411743164 + }, + { + "auxiliary_loss_clip": 0.01329211, + "auxiliary_loss_mlp": 0.00130437, + "balance_loss_clip": 1.14411998, + "balance_loss_mlp": 0.11813499, + "epoch": 0.10130768074552833, + "flos": 56109456247680.0, + "grad_norm": 0.7564894431992994, + "language_loss": 0.61162949, + "learning_rate": 3.9468983982808615e-06, + "loss": 0.62622607, + "num_input_tokens_seen": 36381815, + "router_z_loss_clip": 1.8515625, + "router_z_loss_mlp": 0.12304688, + "step": 1685, + "time_per_iteration": 3.36984920501709 + }, + { + "auxiliary_loss_clip": 0.01437008, + "auxiliary_loss_mlp": 0.00321937, + "balance_loss_clip": 1.11840606, + "balance_loss_mlp": 0.28863022, + "epoch": 0.1013678039981963, + "flos": 33402346156800.0, + "grad_norm": 16.8453613964189, + "language_loss": 0.6915592, + "learning_rate": 3.946809212358516e-06, + "loss": 0.70914865, + "num_input_tokens_seen": 36404320, + "router_z_loss_clip": 3.1875, + "router_z_loss_mlp": 0.33300781, + "step": 1686, + "time_per_iteration": 2.8667733669281006 + }, + { + "auxiliary_loss_clip": 0.014281, + "auxiliary_loss_mlp": 0.00303839, + "balance_loss_clip": 1.11823153, + "balance_loss_mlp": 0.27313024, + "epoch": 0.10142792725086427, + "flos": 31905338008320.0, + "grad_norm": 87.20751824406162, + "language_loss": 0.86326063, + "learning_rate": 3.946719952612972e-06, + "loss": 0.88057995, + "num_input_tokens_seen": 36427510, + "router_z_loss_clip": 3.09960938, + "router_z_loss_mlp": 0.30737305, + "step": 1687, + "time_per_iteration": 2.754265546798706 + }, + { + "auxiliary_loss_clip": 0.01463459, + "auxiliary_loss_mlp": 0.00293834, + "balance_loss_clip": 1.1423552, + "balance_loss_mlp": 0.26322144, + "epoch": 0.10148805050353224, + "flos": 28475905046400.0, + "grad_norm": 2.3945908658288007, + "language_loss": 0.79567862, + "learning_rate": 3.94663061904761e-06, + "loss": 0.81325155, + "num_input_tokens_seen": 36448230, + "router_z_loss_clip": 3.20898438, + "router_z_loss_mlp": 0.3059082, + "step": 1688, + "time_per_iteration": 2.678931713104248 + }, + { + "auxiliary_loss_clip": 0.01447083, + "auxiliary_loss_mlp": 0.00325439, + "balance_loss_clip": 1.13314104, + "balance_loss_mlp": 0.2938723, + "epoch": 0.1015481737562002, + "flos": 25148888737920.0, + "grad_norm": 14.020804627921958, + "language_loss": 0.94305789, + "learning_rate": 3.94654121166582e-06, + "loss": 0.96078306, + "num_input_tokens_seen": 36464395, + "router_z_loss_clip": 3.13671875, + "router_z_loss_mlp": 0.31542969, + "step": 1689, + "time_per_iteration": 2.7358720302581787 + }, + { + "auxiliary_loss_clip": 0.01461619, + "auxiliary_loss_mlp": 0.00299861, + "balance_loss_clip": 1.14937055, + "balance_loss_mlp": 0.26914099, + "epoch": 0.10160829700886818, + "flos": 30882781630080.0, + "grad_norm": 11.997471921879947, + "language_loss": 0.95998096, + "learning_rate": 3.946451730470993e-06, + "loss": 0.97759575, + "num_input_tokens_seen": 36486475, + "router_z_loss_clip": 3.12304688, + "router_z_loss_mlp": 0.30712891, + "step": 1690, + "time_per_iteration": 2.723723888397217 + }, + { + "auxiliary_loss_clip": 0.01472717, + "auxiliary_loss_mlp": 0.00344132, + "balance_loss_clip": 1.15350783, + "balance_loss_mlp": 0.31103909, + "epoch": 0.10166842026153615, + "flos": 20412020632320.0, + "grad_norm": 14.412676678804287, + "language_loss": 0.90871477, + "learning_rate": 3.946362175466521e-06, + "loss": 0.92688322, + "num_input_tokens_seen": 36505310, + "router_z_loss_clip": 3.19140625, + "router_z_loss_mlp": 0.33081055, + "step": 1691, + "time_per_iteration": 2.6711792945861816 + }, + { + "auxiliary_loss_clip": 0.01474346, + "auxiliary_loss_mlp": 0.00372493, + "balance_loss_clip": 1.1555202, + "balance_loss_mlp": 0.33620518, + "epoch": 0.10172854351420411, + "flos": 33476968661760.0, + "grad_norm": 5.172169120212253, + "language_loss": 0.74186301, + "learning_rate": 3.946272546655801e-06, + "loss": 0.76033139, + "num_input_tokens_seen": 36529820, + "router_z_loss_clip": 3.18359375, + "router_z_loss_mlp": 0.36279297, + "step": 1692, + "time_per_iteration": 2.734665632247925 + }, + { + "auxiliary_loss_clip": 0.01439562, + "auxiliary_loss_mlp": 0.00300195, + "balance_loss_clip": 1.13034093, + "balance_loss_mlp": 0.2680797, + "epoch": 0.1017886667668721, + "flos": 23550325862400.0, + "grad_norm": 4.570299163859535, + "language_loss": 0.84220147, + "learning_rate": 3.94618284404223e-06, + "loss": 0.85959899, + "num_input_tokens_seen": 36549000, + "router_z_loss_clip": 3.09179688, + "router_z_loss_mlp": 0.32104492, + "step": 1693, + "time_per_iteration": 2.660723924636841 + }, + { + "auxiliary_loss_clip": 0.01444867, + "auxiliary_loss_mlp": 0.00344537, + "balance_loss_clip": 1.12629056, + "balance_loss_mlp": 0.31015658, + "epoch": 0.10184879001954006, + "flos": 23296078419840.0, + "grad_norm": 9.345539190684303, + "language_loss": 0.92098272, + "learning_rate": 3.9460930676292105e-06, + "loss": 0.93887675, + "num_input_tokens_seen": 36567515, + "router_z_loss_clip": 3.18359375, + "router_z_loss_mlp": 0.34399414, + "step": 1694, + "time_per_iteration": 2.640963315963745 + }, + { + "auxiliary_loss_clip": 0.01428722, + "auxiliary_loss_mlp": 0.00323056, + "balance_loss_clip": 1.11391139, + "balance_loss_mlp": 0.28984481, + "epoch": 0.10190891327220802, + "flos": 18333116156160.0, + "grad_norm": 2.939321379974881, + "language_loss": 0.8741678, + "learning_rate": 3.946003217420147e-06, + "loss": 0.89168555, + "num_input_tokens_seen": 36586190, + "router_z_loss_clip": 3.14453125, + "router_z_loss_mlp": 0.33227539, + "step": 1695, + "time_per_iteration": 2.622555732727051 + }, + { + "auxiliary_loss_clip": 0.01415742, + "auxiliary_loss_mlp": 0.00298006, + "balance_loss_clip": 1.1001513, + "balance_loss_mlp": 0.26496127, + "epoch": 0.10196903652487599, + "flos": 26465374108800.0, + "grad_norm": 18.009717742828766, + "language_loss": 0.92779464, + "learning_rate": 3.945913293418447e-06, + "loss": 0.9449321, + "num_input_tokens_seen": 36607495, + "router_z_loss_clip": 3.15820312, + "router_z_loss_mlp": 0.33056641, + "step": 1696, + "time_per_iteration": 2.6884772777557373 + }, + { + "auxiliary_loss_clip": 0.01430817, + "auxiliary_loss_mlp": 0.00303471, + "balance_loss_clip": 1.11552036, + "balance_loss_mlp": 0.27391857, + "epoch": 0.10202915977754397, + "flos": 21869526798720.0, + "grad_norm": 426.3906322388595, + "language_loss": 0.88028038, + "learning_rate": 3.945823295627519e-06, + "loss": 0.89762318, + "num_input_tokens_seen": 36628555, + "router_z_loss_clip": 3.15234375, + "router_z_loss_mlp": 0.29528809, + "step": 1697, + "time_per_iteration": 2.7094011306762695 + }, + { + "auxiliary_loss_clip": 0.01453221, + "auxiliary_loss_mlp": 0.00314835, + "balance_loss_clip": 1.12871981, + "balance_loss_mlp": 0.28243411, + "epoch": 0.10208928303021193, + "flos": 22309755886080.0, + "grad_norm": 35.08912082176857, + "language_loss": 0.89441675, + "learning_rate": 3.9457332240507775e-06, + "loss": 0.91209733, + "num_input_tokens_seen": 36646250, + "router_z_loss_clip": 3.2421875, + "router_z_loss_mlp": 0.32397461, + "step": 1698, + "time_per_iteration": 2.6961843967437744 + }, + { + "auxiliary_loss_clip": 0.01463414, + "auxiliary_loss_mlp": 0.00335751, + "balance_loss_clip": 1.13443375, + "balance_loss_mlp": 0.30272973, + "epoch": 0.1021494062828799, + "flos": 22125569921280.0, + "grad_norm": 10.313537078099635, + "language_loss": 0.82790053, + "learning_rate": 3.945643078691637e-06, + "loss": 0.84589213, + "num_input_tokens_seen": 36666675, + "router_z_loss_clip": 3.29101562, + "router_z_loss_mlp": 0.33032227, + "step": 1699, + "time_per_iteration": 2.633845090866089 + }, + { + "auxiliary_loss_clip": 0.01453232, + "auxiliary_loss_mlp": 0.00332058, + "balance_loss_clip": 1.13075733, + "balance_loss_mlp": 0.2996096, + "epoch": 0.10220952953554788, + "flos": 19646728439040.0, + "grad_norm": 1.8595135487559091, + "language_loss": 0.86757338, + "learning_rate": 3.945552859553516e-06, + "loss": 0.88542628, + "num_input_tokens_seen": 36685225, + "router_z_loss_clip": 3.22460938, + "router_z_loss_mlp": 0.32446289, + "step": 1700, + "time_per_iteration": 2.6122031211853027 + }, + { + "auxiliary_loss_clip": 0.01456532, + "auxiliary_loss_mlp": 0.00341514, + "balance_loss_clip": 1.12906694, + "balance_loss_mlp": 0.30634719, + "epoch": 0.10226965278821584, + "flos": 29787290686080.0, + "grad_norm": 2.896818235789939, + "language_loss": 0.83932233, + "learning_rate": 3.945462566639836e-06, + "loss": 0.85730278, + "num_input_tokens_seen": 36705985, + "router_z_loss_clip": 3.27539062, + "router_z_loss_mlp": 0.35180664, + "step": 1701, + "time_per_iteration": 2.697561740875244 + }, + { + "auxiliary_loss_clip": 0.01487409, + "auxiliary_loss_mlp": 0.00376085, + "balance_loss_clip": 1.1499728, + "balance_loss_mlp": 0.33874881, + "epoch": 0.10232977604088381, + "flos": 27016818681600.0, + "grad_norm": 41.79406622165582, + "language_loss": 0.85265207, + "learning_rate": 3.945372199954019e-06, + "loss": 0.87128699, + "num_input_tokens_seen": 36725815, + "router_z_loss_clip": 3.375, + "router_z_loss_mlp": 0.37304688, + "step": 1702, + "time_per_iteration": 2.748521327972412 + }, + { + "auxiliary_loss_clip": 0.01485064, + "auxiliary_loss_mlp": 0.00346643, + "balance_loss_clip": 1.16014636, + "balance_loss_mlp": 0.31338388, + "epoch": 0.10238989929355179, + "flos": 20777519473920.0, + "grad_norm": 8.435986546849765, + "language_loss": 1.02211738, + "learning_rate": 3.945281759499494e-06, + "loss": 1.04043436, + "num_input_tokens_seen": 36742345, + "router_z_loss_clip": 3.24804688, + "router_z_loss_mlp": 0.33227539, + "step": 1703, + "time_per_iteration": 2.657196283340454 + }, + { + "auxiliary_loss_clip": 0.014011, + "auxiliary_loss_mlp": 0.00233996, + "balance_loss_clip": 1.20626974, + "balance_loss_mlp": 0.21263407, + "epoch": 0.10245002254621975, + "flos": 57698322451200.0, + "grad_norm": 0.9166106437099278, + "language_loss": 0.55300939, + "learning_rate": 3.94519124527969e-06, + "loss": 0.56936038, + "num_input_tokens_seen": 36798775, + "router_z_loss_clip": 1.953125, + "router_z_loss_mlp": 0.21386719, + "step": 1704, + "time_per_iteration": 3.086576461791992 + }, + { + "auxiliary_loss_clip": 0.01487249, + "auxiliary_loss_mlp": 0.00404888, + "balance_loss_clip": 1.16411316, + "balance_loss_mlp": 0.36717057, + "epoch": 0.10251014579888772, + "flos": 16800125558400.0, + "grad_norm": 12.420975702376198, + "language_loss": 0.92600834, + "learning_rate": 3.945100657298039e-06, + "loss": 0.94492972, + "num_input_tokens_seen": 36816295, + "router_z_loss_clip": 3.23632812, + "router_z_loss_mlp": 0.37695312, + "step": 1705, + "time_per_iteration": 2.623239040374756 + }, + { + "auxiliary_loss_clip": 0.01408798, + "auxiliary_loss_mlp": 0.00224156, + "balance_loss_clip": 1.21146464, + "balance_loss_mlp": 0.20374699, + "epoch": 0.1025702690515557, + "flos": 68565500922240.0, + "grad_norm": 0.7625945072221837, + "language_loss": 0.60557574, + "learning_rate": 3.9450099955579765e-06, + "loss": 0.62190527, + "num_input_tokens_seen": 36882030, + "router_z_loss_clip": 1.96875, + "router_z_loss_mlp": 0.20410156, + "step": 1706, + "time_per_iteration": 4.550852298736572 + }, + { + "auxiliary_loss_clip": 0.01491073, + "auxiliary_loss_mlp": 0.00382191, + "balance_loss_clip": 1.15817165, + "balance_loss_mlp": 0.34618956, + "epoch": 0.10263039230422366, + "flos": 14866623336960.0, + "grad_norm": 6.226344804669652, + "language_loss": 0.94599545, + "learning_rate": 3.94491926006294e-06, + "loss": 0.96472812, + "num_input_tokens_seen": 36899245, + "router_z_loss_clip": 3.33398438, + "router_z_loss_mlp": 0.36010742, + "step": 1707, + "time_per_iteration": 4.080641746520996 + }, + { + "auxiliary_loss_clip": 0.01471034, + "auxiliary_loss_mlp": 0.00353264, + "balance_loss_clip": 1.15367365, + "balance_loss_mlp": 0.32138765, + "epoch": 0.10269051555689163, + "flos": 25337599816320.0, + "grad_norm": 65.92205469271558, + "language_loss": 0.78483182, + "learning_rate": 3.944828450816369e-06, + "loss": 0.80307484, + "num_input_tokens_seen": 36920950, + "router_z_loss_clip": 3.17382812, + "router_z_loss_mlp": 0.31860352, + "step": 1708, + "time_per_iteration": 2.7636330127716064 + }, + { + "auxiliary_loss_clip": 0.01496377, + "auxiliary_loss_mlp": 0.0035135, + "balance_loss_clip": 1.17134738, + "balance_loss_mlp": 0.3171367, + "epoch": 0.10275063880955959, + "flos": 21068826773760.0, + "grad_norm": 16.255664032896288, + "language_loss": 0.97282398, + "learning_rate": 3.944737567821709e-06, + "loss": 0.99130124, + "num_input_tokens_seen": 36938900, + "router_z_loss_clip": 3.25, + "router_z_loss_mlp": 0.34228516, + "step": 1709, + "time_per_iteration": 2.669799327850342 + }, + { + "auxiliary_loss_clip": 0.01489755, + "auxiliary_loss_mlp": 0.00361515, + "balance_loss_clip": 1.17187619, + "balance_loss_mlp": 0.3277548, + "epoch": 0.10281076206222757, + "flos": 30366780802560.0, + "grad_norm": 7.491993838787577, + "language_loss": 0.94368768, + "learning_rate": 3.944646611082406e-06, + "loss": 0.9622004, + "num_input_tokens_seen": 36957010, + "router_z_loss_clip": 3.1796875, + "router_z_loss_mlp": 0.33764648, + "step": 1710, + "time_per_iteration": 4.161514759063721 + }, + { + "auxiliary_loss_clip": 0.01482634, + "auxiliary_loss_mlp": 0.00352932, + "balance_loss_clip": 1.16162252, + "balance_loss_mlp": 0.31859973, + "epoch": 0.10287088531489554, + "flos": 22418313765120.0, + "grad_norm": 4.7032082268272735, + "language_loss": 0.84358144, + "learning_rate": 3.944555580601908e-06, + "loss": 0.86193711, + "num_input_tokens_seen": 36977690, + "router_z_loss_clip": 3.20703125, + "router_z_loss_mlp": 0.34326172, + "step": 1711, + "time_per_iteration": 2.72757887840271 + }, + { + "auxiliary_loss_clip": 0.01469805, + "auxiliary_loss_mlp": 0.00363638, + "balance_loss_clip": 1.15177464, + "balance_loss_mlp": 0.32861468, + "epoch": 0.1029310085675635, + "flos": 25115994858240.0, + "grad_norm": 191.12944304550513, + "language_loss": 0.79383457, + "learning_rate": 3.944464476383668e-06, + "loss": 0.81216896, + "num_input_tokens_seen": 36997300, + "router_z_loss_clip": 3.18164062, + "router_z_loss_mlp": 0.3503418, + "step": 1712, + "time_per_iteration": 4.049677133560181 + }, + { + "auxiliary_loss_clip": 0.01471269, + "auxiliary_loss_mlp": 0.00318981, + "balance_loss_clip": 1.15653777, + "balance_loss_mlp": 0.28498232, + "epoch": 0.10299113182023148, + "flos": 19865639877120.0, + "grad_norm": 3.563271369616205, + "language_loss": 0.93426603, + "learning_rate": 3.94437329843114e-06, + "loss": 0.95216846, + "num_input_tokens_seen": 37016110, + "router_z_loss_clip": 3.14453125, + "router_z_loss_mlp": 0.33959961, + "step": 1713, + "time_per_iteration": 2.741966485977173 + }, + { + "auxiliary_loss_clip": 0.0147867, + "auxiliary_loss_mlp": 0.0036055, + "balance_loss_clip": 1.15600443, + "balance_loss_mlp": 0.32514477, + "epoch": 0.10305125507289944, + "flos": 20447608032000.0, + "grad_norm": 17.664611134235038, + "language_loss": 0.7881524, + "learning_rate": 3.944282046747782e-06, + "loss": 0.80654454, + "num_input_tokens_seen": 37036405, + "router_z_loss_clip": 3.22851562, + "router_z_loss_mlp": 0.35400391, + "step": 1714, + "time_per_iteration": 2.6604840755462646 + }, + { + "auxiliary_loss_clip": 0.01459778, + "auxiliary_loss_mlp": 0.00385334, + "balance_loss_clip": 1.13968778, + "balance_loss_mlp": 0.34752065, + "epoch": 0.10311137832556741, + "flos": 26250772302720.0, + "grad_norm": 8.74199316690998, + "language_loss": 0.97704434, + "learning_rate": 3.944190721337053e-06, + "loss": 0.99549544, + "num_input_tokens_seen": 37057580, + "router_z_loss_clip": 3.19726562, + "router_z_loss_mlp": 0.37792969, + "step": 1715, + "time_per_iteration": 2.672192335128784 + }, + { + "auxiliary_loss_clip": 0.01463325, + "auxiliary_loss_mlp": 0.00331049, + "balance_loss_clip": 1.14660215, + "balance_loss_mlp": 0.29345083, + "epoch": 0.10317150157823539, + "flos": 35298932175360.0, + "grad_norm": 14.919969256491244, + "language_loss": 0.83315682, + "learning_rate": 3.944099322202418e-06, + "loss": 0.85110056, + "num_input_tokens_seen": 37079120, + "router_z_loss_clip": 3.171875, + "router_z_loss_mlp": 0.37597656, + "step": 1716, + "time_per_iteration": 2.76080584526062 + }, + { + "auxiliary_loss_clip": 0.01485743, + "auxiliary_loss_mlp": 0.00394856, + "balance_loss_clip": 1.15672457, + "balance_loss_mlp": 0.35441989, + "epoch": 0.10323162483090335, + "flos": 25739943033600.0, + "grad_norm": 7.058526159856095, + "language_loss": 0.93469548, + "learning_rate": 3.944007849347342e-06, + "loss": 0.95350152, + "num_input_tokens_seen": 37099710, + "router_z_loss_clip": 3.28710938, + "router_z_loss_mlp": 0.40405273, + "step": 1717, + "time_per_iteration": 2.6621499061584473 + }, + { + "auxiliary_loss_clip": 0.0146078, + "auxiliary_loss_mlp": 0.00400768, + "balance_loss_clip": 1.13858485, + "balance_loss_mlp": 0.36078477, + "epoch": 0.10329174808357132, + "flos": 16289870906880.0, + "grad_norm": 7.9528746026420665, + "language_loss": 0.93365324, + "learning_rate": 3.943916302775292e-06, + "loss": 0.95226872, + "num_input_tokens_seen": 37117775, + "router_z_loss_clip": 3.22070312, + "router_z_loss_mlp": 0.39990234, + "step": 1718, + "time_per_iteration": 2.7176437377929688 + }, + { + "auxiliary_loss_clip": 0.0146603, + "auxiliary_loss_mlp": 0.00398268, + "balance_loss_clip": 1.14542365, + "balance_loss_mlp": 0.35985917, + "epoch": 0.10335187133623928, + "flos": 36687166963200.0, + "grad_norm": 10.07476906888366, + "language_loss": 0.78950119, + "learning_rate": 3.943824682489742e-06, + "loss": 0.80814421, + "num_input_tokens_seen": 37140280, + "router_z_loss_clip": 3.2109375, + "router_z_loss_mlp": 0.38427734, + "step": 1719, + "time_per_iteration": 2.7405381202697754 + }, + { + "auxiliary_loss_clip": 0.01473594, + "auxiliary_loss_mlp": 0.00423049, + "balance_loss_clip": 1.14802992, + "balance_loss_mlp": 0.38499749, + "epoch": 0.10341199458890726, + "flos": 14975648092800.0, + "grad_norm": 6.912032760363442, + "language_loss": 0.98751557, + "learning_rate": 3.9437329884941665e-06, + "loss": 1.00648189, + "num_input_tokens_seen": 37158350, + "router_z_loss_clip": 3.25, + "router_z_loss_mlp": 0.38012695, + "step": 1720, + "time_per_iteration": 2.6464953422546387 + }, + { + "auxiliary_loss_clip": 0.01499929, + "auxiliary_loss_mlp": 0.00433222, + "balance_loss_clip": 1.16659033, + "balance_loss_mlp": 0.39259592, + "epoch": 0.10347211784157523, + "flos": 21031587348480.0, + "grad_norm": 5.3935762883715075, + "language_loss": 0.85919183, + "learning_rate": 3.943641220792039e-06, + "loss": 0.87852335, + "num_input_tokens_seen": 37177120, + "router_z_loss_clip": 3.33007812, + "router_z_loss_mlp": 0.40649414, + "step": 1721, + "time_per_iteration": 2.632091760635376 + }, + { + "auxiliary_loss_clip": 0.0150303, + "auxiliary_loss_mlp": 0.00454445, + "balance_loss_clip": 1.17036772, + "balance_loss_mlp": 0.41172075, + "epoch": 0.1035322410942432, + "flos": 19792094780160.0, + "grad_norm": 26.124846446410498, + "language_loss": 0.86820292, + "learning_rate": 3.9435493793868434e-06, + "loss": 0.88777769, + "num_input_tokens_seen": 37195895, + "router_z_loss_clip": 3.32617188, + "router_z_loss_mlp": 0.42700195, + "step": 1722, + "time_per_iteration": 2.6710479259490967 + }, + { + "auxiliary_loss_clip": 0.0131594, + "auxiliary_loss_mlp": 0.0021099, + "balance_loss_clip": 1.12796068, + "balance_loss_mlp": 0.19983213, + "epoch": 0.10359236434691117, + "flos": 52698874947840.0, + "grad_norm": 1.0005180675917151, + "language_loss": 0.67111003, + "learning_rate": 3.943457464282059e-06, + "loss": 0.68637931, + "num_input_tokens_seen": 37247270, + "router_z_loss_clip": 1.875, + "router_z_loss_mlp": 0.11181641, + "step": 1723, + "time_per_iteration": 2.9467110633850098 + }, + { + "auxiliary_loss_clip": 0.01528722, + "auxiliary_loss_mlp": 0.00438229, + "balance_loss_clip": 1.19561613, + "balance_loss_mlp": 0.3970775, + "epoch": 0.10365248759957914, + "flos": 18405404277120.0, + "grad_norm": 4.770418731197657, + "language_loss": 0.8820169, + "learning_rate": 3.9433654754811745e-06, + "loss": 0.90168643, + "num_input_tokens_seen": 37265595, + "router_z_loss_clip": 3.33203125, + "router_z_loss_mlp": 0.41162109, + "step": 1724, + "time_per_iteration": 2.6420791149139404 + }, + { + "auxiliary_loss_clip": 0.01535596, + "auxiliary_loss_mlp": 0.00461173, + "balance_loss_clip": 1.19737482, + "balance_loss_mlp": 0.42030802, + "epoch": 0.1037126108522471, + "flos": 47553555335040.0, + "grad_norm": 136.54881706307236, + "language_loss": 0.81538224, + "learning_rate": 3.943273412987676e-06, + "loss": 0.83534992, + "num_input_tokens_seen": 37286660, + "router_z_loss_clip": 3.38085938, + "router_z_loss_mlp": 0.40820312, + "step": 1725, + "time_per_iteration": 2.8589065074920654 + }, + { + "auxiliary_loss_clip": 0.01558511, + "auxiliary_loss_mlp": 0.00437395, + "balance_loss_clip": 1.22125304, + "balance_loss_mlp": 0.39688724, + "epoch": 0.10377273410491508, + "flos": 22816670572800.0, + "grad_norm": 7.2656130321275985, + "language_loss": 0.84472722, + "learning_rate": 3.943181276805054e-06, + "loss": 0.86468625, + "num_input_tokens_seen": 37304915, + "router_z_loss_clip": 3.36914062, + "router_z_loss_mlp": 0.40478516, + "step": 1726, + "time_per_iteration": 2.668335437774658 + }, + { + "auxiliary_loss_clip": 0.01566532, + "auxiliary_loss_mlp": 0.00432072, + "balance_loss_clip": 1.22737098, + "balance_loss_mlp": 0.38860872, + "epoch": 0.10383285735758305, + "flos": 26138694890880.0, + "grad_norm": 4.963619203083205, + "language_loss": 0.82461321, + "learning_rate": 3.9430890669368035e-06, + "loss": 0.84459925, + "num_input_tokens_seen": 37325265, + "router_z_loss_clip": 3.390625, + "router_z_loss_mlp": 0.43432617, + "step": 1727, + "time_per_iteration": 2.9124534130096436 + }, + { + "auxiliary_loss_clip": 0.01565128, + "auxiliary_loss_mlp": 0.003756, + "balance_loss_clip": 1.23016262, + "balance_loss_mlp": 0.33428231, + "epoch": 0.10389298061025101, + "flos": 17091791994240.0, + "grad_norm": 7.506658546973763, + "language_loss": 0.90746099, + "learning_rate": 3.942996783386422e-06, + "loss": 0.92686826, + "num_input_tokens_seen": 37341650, + "router_z_loss_clip": 3.34960938, + "router_z_loss_mlp": 0.41308594, + "step": 1728, + "time_per_iteration": 2.6007304191589355 + }, + { + "auxiliary_loss_clip": 0.01572954, + "auxiliary_loss_mlp": 0.00381895, + "balance_loss_clip": 1.23956895, + "balance_loss_mlp": 0.34000459, + "epoch": 0.10395310386291898, + "flos": 20776513893120.0, + "grad_norm": 14.486373543647078, + "language_loss": 0.77515733, + "learning_rate": 3.942904426157406e-06, + "loss": 0.79470587, + "num_input_tokens_seen": 37360270, + "router_z_loss_clip": 3.33398438, + "router_z_loss_mlp": 0.41918945, + "step": 1729, + "time_per_iteration": 2.7050485610961914 + }, + { + "auxiliary_loss_clip": 0.01587969, + "auxiliary_loss_mlp": 0.00359456, + "balance_loss_clip": 1.24423742, + "balance_loss_mlp": 0.31384614, + "epoch": 0.10401322711558696, + "flos": 12820540913280.0, + "grad_norm": 53.714932989829535, + "language_loss": 0.952214, + "learning_rate": 3.9428119952532605e-06, + "loss": 0.97168827, + "num_input_tokens_seen": 37375225, + "router_z_loss_clip": 3.4375, + "router_z_loss_mlp": 0.45629883, + "step": 1730, + "time_per_iteration": 2.567613363265991 + }, + { + "auxiliary_loss_clip": 0.01585584, + "auxiliary_loss_mlp": 0.00379136, + "balance_loss_clip": 1.24647939, + "balance_loss_mlp": 0.33660209, + "epoch": 0.10407335036825492, + "flos": 23184683366400.0, + "grad_norm": 30.78783770511746, + "language_loss": 0.83378559, + "learning_rate": 3.942719490677489e-06, + "loss": 0.85343277, + "num_input_tokens_seen": 37395165, + "router_z_loss_clip": 3.38867188, + "router_z_loss_mlp": 0.42529297, + "step": 1731, + "time_per_iteration": 2.675737142562866 + }, + { + "auxiliary_loss_clip": 0.01592693, + "auxiliary_loss_mlp": 0.00349519, + "balance_loss_clip": 1.24333, + "balance_loss_mlp": 0.30770087, + "epoch": 0.10413347362092289, + "flos": 26104184899200.0, + "grad_norm": 38.85001036042507, + "language_loss": 0.90364116, + "learning_rate": 3.9426269124336e-06, + "loss": 0.92306328, + "num_input_tokens_seen": 37414845, + "router_z_loss_clip": 3.4921875, + "router_z_loss_mlp": 0.41845703, + "step": 1732, + "time_per_iteration": 2.6506969928741455 + }, + { + "auxiliary_loss_clip": 0.01598534, + "auxiliary_loss_mlp": 0.00372415, + "balance_loss_clip": 1.24979663, + "balance_loss_mlp": 0.32995307, + "epoch": 0.10419359687359087, + "flos": 12641059630080.0, + "grad_norm": 8.244051030442067, + "language_loss": 0.92917335, + "learning_rate": 3.942534260525104e-06, + "loss": 0.94888288, + "num_input_tokens_seen": 37432490, + "router_z_loss_clip": 3.49023438, + "router_z_loss_mlp": 0.42431641, + "step": 1733, + "time_per_iteration": 2.6240134239196777 + }, + { + "auxiliary_loss_clip": 0.01582947, + "auxiliary_loss_mlp": 0.00373633, + "balance_loss_clip": 1.23463225, + "balance_loss_mlp": 0.32962102, + "epoch": 0.10425372012625883, + "flos": 12125094716160.0, + "grad_norm": 9.398022619894, + "language_loss": 0.85588163, + "learning_rate": 3.942441534955514e-06, + "loss": 0.87544751, + "num_input_tokens_seen": 37449435, + "router_z_loss_clip": 3.48046875, + "router_z_loss_mlp": 0.44018555, + "step": 1734, + "time_per_iteration": 2.6137728691101074 + }, + { + "auxiliary_loss_clip": 0.015765, + "auxiliary_loss_mlp": 0.00342236, + "balance_loss_clip": 1.23770082, + "balance_loss_mlp": 0.30225328, + "epoch": 0.1043138433789268, + "flos": 25337563902720.0, + "grad_norm": 3.2159614994933547, + "language_loss": 0.8199057, + "learning_rate": 3.9423487357283465e-06, + "loss": 0.83909309, + "num_input_tokens_seen": 37469105, + "router_z_loss_clip": 3.38476562, + "router_z_loss_mlp": 0.3996582, + "step": 1735, + "time_per_iteration": 2.6779568195343018 + }, + { + "auxiliary_loss_clip": 0.01612614, + "auxiliary_loss_mlp": 0.00349545, + "balance_loss_clip": 1.25769794, + "balance_loss_mlp": 0.30460298, + "epoch": 0.10437396663159478, + "flos": 29167149352320.0, + "grad_norm": 18.007387948388846, + "language_loss": 0.84907115, + "learning_rate": 3.94225586284712e-06, + "loss": 0.8686927, + "num_input_tokens_seen": 37490540, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 0.44921875, + "step": 1736, + "time_per_iteration": 2.7234153747558594 + }, + { + "auxiliary_loss_clip": 0.01633939, + "auxiliary_loss_mlp": 0.0034326, + "balance_loss_clip": 1.28547168, + "balance_loss_mlp": 0.29884294, + "epoch": 0.10443408988426274, + "flos": 25080946162560.0, + "grad_norm": 2.0585507570629287, + "language_loss": 0.77182263, + "learning_rate": 3.942162916315356e-06, + "loss": 0.79159462, + "num_input_tokens_seen": 37511905, + "router_z_loss_clip": 3.48632812, + "router_z_loss_mlp": 0.44433594, + "step": 1737, + "time_per_iteration": 2.644218921661377 + }, + { + "auxiliary_loss_clip": 0.01628862, + "auxiliary_loss_mlp": 0.0035896, + "balance_loss_clip": 1.27622736, + "balance_loss_mlp": 0.31418532, + "epoch": 0.1044942131369307, + "flos": 26759662237440.0, + "grad_norm": 218.03672651605248, + "language_loss": 0.9278326, + "learning_rate": 3.942069896136581e-06, + "loss": 0.94771081, + "num_input_tokens_seen": 37533635, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 0.44799805, + "step": 1738, + "time_per_iteration": 2.6946380138397217 + }, + { + "auxiliary_loss_clip": 0.01655126, + "auxiliary_loss_mlp": 0.0035218, + "balance_loss_clip": 1.300475, + "balance_loss_mlp": 0.30654699, + "epoch": 0.10455433638959867, + "flos": 18442571875200.0, + "grad_norm": 2.2849147879749356, + "language_loss": 0.83059168, + "learning_rate": 3.9419768023143196e-06, + "loss": 0.85066473, + "num_input_tokens_seen": 37552035, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 0.45703125, + "step": 1739, + "time_per_iteration": 2.6532387733459473 + }, + { + "auxiliary_loss_clip": 0.01684166, + "auxiliary_loss_mlp": 0.0034767, + "balance_loss_clip": 1.3190062, + "balance_loss_mlp": 0.30394423, + "epoch": 0.10461445964226665, + "flos": 23218977876480.0, + "grad_norm": 3.3121362405484787, + "language_loss": 0.86222488, + "learning_rate": 3.941883634852104e-06, + "loss": 0.88254321, + "num_input_tokens_seen": 37571540, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 0.43701172, + "step": 1740, + "time_per_iteration": 2.716984272003174 + }, + { + "auxiliary_loss_clip": 0.01672187, + "auxiliary_loss_mlp": 0.0036453, + "balance_loss_clip": 1.31213522, + "balance_loss_mlp": 0.32321253, + "epoch": 0.10467458289493461, + "flos": 24345243797760.0, + "grad_norm": 22.97544195510155, + "language_loss": 0.93669802, + "learning_rate": 3.941790393753467e-06, + "loss": 0.95706517, + "num_input_tokens_seen": 37588265, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 0.41333008, + "step": 1741, + "time_per_iteration": 2.7056710720062256 + }, + { + "auxiliary_loss_clip": 0.01670901, + "auxiliary_loss_mlp": 0.00354977, + "balance_loss_clip": 1.31115997, + "balance_loss_mlp": 0.30903405, + "epoch": 0.10473470614760258, + "flos": 21287953693440.0, + "grad_norm": 14.175035145142978, + "language_loss": 0.84021676, + "learning_rate": 3.941697079021942e-06, + "loss": 0.86047554, + "num_input_tokens_seen": 37606860, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 0.45947266, + "step": 1742, + "time_per_iteration": 2.667635202407837 + }, + { + "auxiliary_loss_clip": 0.01657407, + "auxiliary_loss_mlp": 0.00340346, + "balance_loss_clip": 1.31108665, + "balance_loss_mlp": 0.30014819, + "epoch": 0.10479482940027056, + "flos": 21687208341120.0, + "grad_norm": 2.197745873710425, + "language_loss": 0.9459815, + "learning_rate": 3.94160369066107e-06, + "loss": 0.96595907, + "num_input_tokens_seen": 37625210, + "router_z_loss_clip": 3.46484375, + "router_z_loss_mlp": 0.40185547, + "step": 1743, + "time_per_iteration": 2.6571171283721924 + }, + { + "auxiliary_loss_clip": 0.0165929, + "auxiliary_loss_mlp": 0.00347903, + "balance_loss_clip": 1.3098222, + "balance_loss_mlp": 0.3076342, + "epoch": 0.10485495265293852, + "flos": 21573694385280.0, + "grad_norm": 3.330117996187679, + "language_loss": 0.83216488, + "learning_rate": 3.941510228674391e-06, + "loss": 0.85223681, + "num_input_tokens_seen": 37644110, + "router_z_loss_clip": 3.4921875, + "router_z_loss_mlp": 0.40283203, + "step": 1744, + "time_per_iteration": 2.693143367767334 + }, + { + "auxiliary_loss_clip": 0.01654802, + "auxiliary_loss_mlp": 0.00318819, + "balance_loss_clip": 1.31073022, + "balance_loss_mlp": 0.28060049, + "epoch": 0.10491507590560649, + "flos": 37961923708800.0, + "grad_norm": 2.157220932673442, + "language_loss": 0.8924619, + "learning_rate": 3.941416693065451e-06, + "loss": 0.91219819, + "num_input_tokens_seen": 37665800, + "router_z_loss_clip": 3.44335938, + "router_z_loss_mlp": 0.38232422, + "step": 1745, + "time_per_iteration": 2.80916166305542 + }, + { + "auxiliary_loss_clip": 0.01638523, + "auxiliary_loss_mlp": 0.00362725, + "balance_loss_clip": 1.298347, + "balance_loss_mlp": 0.32283789, + "epoch": 0.10497519915827447, + "flos": 26396282298240.0, + "grad_norm": 3.2043601876571843, + "language_loss": 0.91712874, + "learning_rate": 3.941323083837794e-06, + "loss": 0.93714118, + "num_input_tokens_seen": 37685095, + "router_z_loss_clip": 3.40234375, + "router_z_loss_mlp": 0.39868164, + "step": 1746, + "time_per_iteration": 2.7198946475982666 + }, + { + "auxiliary_loss_clip": 0.01652269, + "auxiliary_loss_mlp": 0.00349176, + "balance_loss_clip": 1.31304872, + "balance_loss_mlp": 0.30766743, + "epoch": 0.10503532241094243, + "flos": 40662190581120.0, + "grad_norm": 7.493146446017974, + "language_loss": 0.7653659, + "learning_rate": 3.941229400994971e-06, + "loss": 0.78538036, + "num_input_tokens_seen": 37707445, + "router_z_loss_clip": 3.39257812, + "router_z_loss_mlp": 0.4152832, + "step": 1747, + "time_per_iteration": 2.8258371353149414 + }, + { + "auxiliary_loss_clip": 0.01635086, + "auxiliary_loss_mlp": 0.00327726, + "balance_loss_clip": 1.29716003, + "balance_loss_mlp": 0.28738546, + "epoch": 0.1050954456636104, + "flos": 29789409588480.0, + "grad_norm": 9.619571107583479, + "language_loss": 0.93550444, + "learning_rate": 3.941135644540535e-06, + "loss": 0.9551326, + "num_input_tokens_seen": 37728325, + "router_z_loss_clip": 3.3828125, + "router_z_loss_mlp": 0.40332031, + "step": 1748, + "time_per_iteration": 2.7179088592529297 + }, + { + "auxiliary_loss_clip": 0.01642907, + "auxiliary_loss_mlp": 0.00319631, + "balance_loss_clip": 1.29663587, + "balance_loss_mlp": 0.27661985, + "epoch": 0.10515556891627838, + "flos": 23948754497280.0, + "grad_norm": 12.593562486728725, + "language_loss": 0.79039842, + "learning_rate": 3.941041814478041e-06, + "loss": 0.81002378, + "num_input_tokens_seen": 37748910, + "router_z_loss_clip": 3.46484375, + "router_z_loss_mlp": 0.43017578, + "step": 1749, + "time_per_iteration": 4.172596454620361 + }, + { + "auxiliary_loss_clip": 0.01645279, + "auxiliary_loss_mlp": 0.00304084, + "balance_loss_clip": 1.30683625, + "balance_loss_mlp": 0.26481614, + "epoch": 0.10521569216894634, + "flos": 18259606972800.0, + "grad_norm": 15.558889682317055, + "language_loss": 0.91207302, + "learning_rate": 3.940947910811047e-06, + "loss": 0.9315666, + "num_input_tokens_seen": 37765745, + "router_z_loss_clip": 3.38476562, + "router_z_loss_mlp": 0.39282227, + "step": 1750, + "time_per_iteration": 4.028273582458496 + }, + { + "auxiliary_loss_clip": 0.01658375, + "auxiliary_loss_mlp": 0.00295728, + "balance_loss_clip": 1.31654418, + "balance_loss_mlp": 0.25593632, + "epoch": 0.10527581542161431, + "flos": 15630909949440.0, + "grad_norm": 274.7052338540931, + "language_loss": 1.02233803, + "learning_rate": 3.940853933543114e-06, + "loss": 1.04187894, + "num_input_tokens_seen": 37780520, + "router_z_loss_clip": 3.421875, + "router_z_loss_mlp": 0.39794922, + "step": 1751, + "time_per_iteration": 2.654942512512207 + }, + { + "auxiliary_loss_clip": 0.01701578, + "auxiliary_loss_mlp": 0.00292643, + "balance_loss_clip": 1.34819889, + "balance_loss_mlp": 0.25361401, + "epoch": 0.10533593867428227, + "flos": 18296559089280.0, + "grad_norm": 33.456783425964694, + "language_loss": 0.86443907, + "learning_rate": 3.940759882677805e-06, + "loss": 0.88438129, + "num_input_tokens_seen": 37799515, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 0.39038086, + "step": 1752, + "time_per_iteration": 4.137614727020264 + }, + { + "auxiliary_loss_clip": 0.01711838, + "auxiliary_loss_mlp": 0.00286283, + "balance_loss_clip": 1.35606897, + "balance_loss_mlp": 0.24613284, + "epoch": 0.10539606192695025, + "flos": 29023219555200.0, + "grad_norm": 6.345608017836026, + "language_loss": 0.83368587, + "learning_rate": 3.940665758218686e-06, + "loss": 0.85366714, + "num_input_tokens_seen": 37818695, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 0.40161133, + "step": 1753, + "time_per_iteration": 2.6869330406188965 + }, + { + "auxiliary_loss_clip": 0.01729438, + "auxiliary_loss_mlp": 0.00299034, + "balance_loss_clip": 1.35467029, + "balance_loss_mlp": 0.25390178, + "epoch": 0.10545618517961822, + "flos": 19969313506560.0, + "grad_norm": 2.9128279787075866, + "language_loss": 0.91589999, + "learning_rate": 3.940571560169328e-06, + "loss": 0.9361847, + "num_input_tokens_seen": 37837860, + "router_z_loss_clip": 3.74414062, + "router_z_loss_mlp": 0.45117188, + "step": 1754, + "time_per_iteration": 2.6418986320495605 + }, + { + "auxiliary_loss_clip": 0.01741594, + "auxiliary_loss_mlp": 0.00288061, + "balance_loss_clip": 1.38498521, + "balance_loss_mlp": 0.24485929, + "epoch": 0.10551630843228618, + "flos": 16143427157760.0, + "grad_norm": 8.033709684639213, + "language_loss": 0.80845946, + "learning_rate": 3.940477288533302e-06, + "loss": 0.82875597, + "num_input_tokens_seen": 37856260, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 0.43188477, + "step": 1755, + "time_per_iteration": 4.052709341049194 + }, + { + "auxiliary_loss_clip": 0.01757224, + "auxiliary_loss_mlp": 0.00287052, + "balance_loss_clip": 1.3872968, + "balance_loss_mlp": 0.24454184, + "epoch": 0.10557643168495416, + "flos": 23440115957760.0, + "grad_norm": 43.002468541496725, + "language_loss": 0.85647726, + "learning_rate": 3.940382943314182e-06, + "loss": 0.87691998, + "num_input_tokens_seen": 37876960, + "router_z_loss_clip": 3.70507812, + "router_z_loss_mlp": 0.42456055, + "step": 1756, + "time_per_iteration": 2.6470117568969727 + }, + { + "auxiliary_loss_clip": 0.01752411, + "auxiliary_loss_mlp": 0.00268172, + "balance_loss_clip": 1.38460732, + "balance_loss_mlp": 0.22618632, + "epoch": 0.10563655493762213, + "flos": 21799034357760.0, + "grad_norm": 3.976043726062851, + "language_loss": 0.86173648, + "learning_rate": 3.940288524515547e-06, + "loss": 0.88194227, + "num_input_tokens_seen": 37897070, + "router_z_loss_clip": 3.67773438, + "router_z_loss_mlp": 0.41992188, + "step": 1757, + "time_per_iteration": 2.6458230018615723 + }, + { + "auxiliary_loss_clip": 0.01759251, + "auxiliary_loss_mlp": 0.0028966, + "balance_loss_clip": 1.37784207, + "balance_loss_mlp": 0.24798459, + "epoch": 0.10569667819029009, + "flos": 53800863275520.0, + "grad_norm": 4.871918184727479, + "language_loss": 0.85340738, + "learning_rate": 3.940194032140976e-06, + "loss": 0.87389648, + "num_input_tokens_seen": 37923635, + "router_z_loss_clip": 3.81445312, + "router_z_loss_mlp": 0.41674805, + "step": 1758, + "time_per_iteration": 2.9245893955230713 + }, + { + "auxiliary_loss_clip": 0.01737635, + "auxiliary_loss_mlp": 0.00281741, + "balance_loss_clip": 1.36661971, + "balance_loss_mlp": 0.24166249, + "epoch": 0.10575680144295807, + "flos": 22925515760640.0, + "grad_norm": 8.717656175096629, + "language_loss": 0.99208629, + "learning_rate": 3.940099466194054e-06, + "loss": 1.01228011, + "num_input_tokens_seen": 37942650, + "router_z_loss_clip": 3.71289062, + "router_z_loss_mlp": 0.40063477, + "step": 1759, + "time_per_iteration": 2.6256330013275146 + }, + { + "auxiliary_loss_clip": 0.01713552, + "auxiliary_loss_mlp": 0.002799, + "balance_loss_clip": 1.35401869, + "balance_loss_mlp": 0.24120498, + "epoch": 0.10581692469562604, + "flos": 14136667148160.0, + "grad_norm": 26.46173487594867, + "language_loss": 0.86705029, + "learning_rate": 3.940004826678365e-06, + "loss": 0.88698483, + "num_input_tokens_seen": 37960660, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 0.38647461, + "step": 1760, + "time_per_iteration": 2.6640896797180176 + }, + { + "auxiliary_loss_clip": 0.0170733, + "auxiliary_loss_mlp": 0.00288845, + "balance_loss_clip": 1.34804416, + "balance_loss_mlp": 0.24976845, + "epoch": 0.105877047948294, + "flos": 25958674903680.0, + "grad_norm": 20.080540378596147, + "language_loss": 1.00639331, + "learning_rate": 3.939910113597498e-06, + "loss": 1.02635515, + "num_input_tokens_seen": 37978625, + "router_z_loss_clip": 3.59179688, + "router_z_loss_mlp": 0.390625, + "step": 1761, + "time_per_iteration": 2.740861415863037 + }, + { + "auxiliary_loss_clip": 0.01682569, + "auxiliary_loss_mlp": 0.00319183, + "balance_loss_clip": 1.33622932, + "balance_loss_mlp": 0.28191769, + "epoch": 0.10593717120096197, + "flos": 30664768032000.0, + "grad_norm": 87.81189016518454, + "language_loss": 0.87708509, + "learning_rate": 3.9398153269550464e-06, + "loss": 0.89710271, + "num_input_tokens_seen": 38000005, + "router_z_loss_clip": 3.46484375, + "router_z_loss_mlp": 0.37231445, + "step": 1762, + "time_per_iteration": 2.761380195617676 + }, + { + "auxiliary_loss_clip": 0.01863446, + "auxiliary_loss_mlp": 0.00117205, + "balance_loss_clip": 1.47991347, + "balance_loss_mlp": 0.10695274, + "epoch": 0.10599729445362994, + "flos": 66436682497920.0, + "grad_norm": 0.8057422092373937, + "language_loss": 0.60650402, + "learning_rate": 3.939720466754602e-06, + "loss": 0.62631053, + "num_input_tokens_seen": 38066165, + "router_z_loss_clip": 3.84375, + "router_z_loss_mlp": 0.10253906, + "step": 1763, + "time_per_iteration": 3.274625539779663 + }, + { + "auxiliary_loss_clip": 0.01650158, + "auxiliary_loss_mlp": 0.00328583, + "balance_loss_clip": 1.30935001, + "balance_loss_mlp": 0.29301125, + "epoch": 0.10605741770629791, + "flos": 23948179879680.0, + "grad_norm": 9.202227912979748, + "language_loss": 0.87587559, + "learning_rate": 3.939625532999763e-06, + "loss": 0.8956629, + "num_input_tokens_seen": 38086150, + "router_z_loss_clip": 3.40820312, + "router_z_loss_mlp": 0.35571289, + "step": 1764, + "time_per_iteration": 2.697500467300415 + }, + { + "auxiliary_loss_clip": 0.01641061, + "auxiliary_loss_mlp": 0.00299009, + "balance_loss_clip": 1.30888319, + "balance_loss_mlp": 0.26267403, + "epoch": 0.10611754095896588, + "flos": 19387524919680.0, + "grad_norm": 67.13536636283474, + "language_loss": 0.85348725, + "learning_rate": 3.9395305256941314e-06, + "loss": 0.87288797, + "num_input_tokens_seen": 38104205, + "router_z_loss_clip": 3.32617188, + "router_z_loss_mlp": 0.36328125, + "step": 1765, + "time_per_iteration": 2.682814359664917 + }, + { + "auxiliary_loss_clip": 0.01625933, + "auxiliary_loss_mlp": 0.00317842, + "balance_loss_clip": 1.29344821, + "balance_loss_mlp": 0.28324705, + "epoch": 0.10617766421163385, + "flos": 22237755073920.0, + "grad_norm": 2.6863882634616036, + "language_loss": 0.83934575, + "learning_rate": 3.939435444841306e-06, + "loss": 0.85878348, + "num_input_tokens_seen": 38122005, + "router_z_loss_clip": 3.32421875, + "router_z_loss_mlp": 0.34594727, + "step": 1766, + "time_per_iteration": 2.7067835330963135 + }, + { + "auxiliary_loss_clip": 0.015995, + "auxiliary_loss_mlp": 0.00323429, + "balance_loss_clip": 1.28106248, + "balance_loss_mlp": 0.28981227, + "epoch": 0.10623778746430182, + "flos": 28404407024640.0, + "grad_norm": 24.726945863101065, + "language_loss": 0.82537717, + "learning_rate": 3.939340290444895e-06, + "loss": 0.8446064, + "num_input_tokens_seen": 38143365, + "router_z_loss_clip": 3.18164062, + "router_z_loss_mlp": 0.3359375, + "step": 1767, + "time_per_iteration": 2.6797306537628174 + }, + { + "auxiliary_loss_clip": 0.01467311, + "auxiliary_loss_mlp": 0.00193077, + "balance_loss_clip": 1.24390388, + "balance_loss_mlp": 0.17762774, + "epoch": 0.10629791071696978, + "flos": 64234639221120.0, + "grad_norm": 1.1153906039358974, + "language_loss": 0.57504141, + "learning_rate": 3.939245062508506e-06, + "loss": 0.59164524, + "num_input_tokens_seen": 38210035, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.15429688, + "step": 1768, + "time_per_iteration": 3.2208104133605957 + }, + { + "auxiliary_loss_clip": 0.01510407, + "auxiliary_loss_mlp": 0.00313131, + "balance_loss_clip": 1.19781613, + "balance_loss_mlp": 0.28133741, + "epoch": 0.10635803396963776, + "flos": 22747578762240.0, + "grad_norm": 23.22499291320026, + "language_loss": 0.91387397, + "learning_rate": 3.939149761035749e-06, + "loss": 0.93210936, + "num_input_tokens_seen": 38231230, + "router_z_loss_clip": 3.12695312, + "router_z_loss_mlp": 0.31774902, + "step": 1769, + "time_per_iteration": 2.7244365215301514 + }, + { + "auxiliary_loss_clip": 0.01481427, + "auxiliary_loss_mlp": 0.00310284, + "balance_loss_clip": 1.17454815, + "balance_loss_mlp": 0.27723953, + "epoch": 0.10641815722230573, + "flos": 31395586147200.0, + "grad_norm": 2.12958368562284, + "language_loss": 0.68461412, + "learning_rate": 3.9390543860302395e-06, + "loss": 0.70253122, + "num_input_tokens_seen": 38253890, + "router_z_loss_clip": 3.07617188, + "router_z_loss_mlp": 0.33007812, + "step": 1770, + "time_per_iteration": 3.024176836013794 + }, + { + "auxiliary_loss_clip": 0.01387852, + "auxiliary_loss_mlp": 0.00100124, + "balance_loss_clip": 1.15792489, + "balance_loss_mlp": 0.08419771, + "epoch": 0.1064782804749737, + "flos": 58552527784320.0, + "grad_norm": 0.8959836845230595, + "language_loss": 0.57012165, + "learning_rate": 3.9389589374955925e-06, + "loss": 0.58500135, + "num_input_tokens_seen": 38304290, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.15917969, + "step": 1771, + "time_per_iteration": 3.2143800258636475 + }, + { + "auxiliary_loss_clip": 0.01441214, + "auxiliary_loss_mlp": 0.00350587, + "balance_loss_clip": 1.13185406, + "balance_loss_mlp": 0.3165175, + "epoch": 0.10653840372764166, + "flos": 23987825516160.0, + "grad_norm": 9.729089096675889, + "language_loss": 0.94739389, + "learning_rate": 3.938863415435429e-06, + "loss": 0.96531188, + "num_input_tokens_seen": 38324725, + "router_z_loss_clip": 3.09179688, + "router_z_loss_mlp": 0.34082031, + "step": 1772, + "time_per_iteration": 2.7913882732391357 + }, + { + "auxiliary_loss_clip": 0.01434425, + "auxiliary_loss_mlp": 0.00356702, + "balance_loss_clip": 1.12479115, + "balance_loss_mlp": 0.32270345, + "epoch": 0.10659852698030964, + "flos": 18294655668480.0, + "grad_norm": 12.224388743052401, + "language_loss": 0.90300584, + "learning_rate": 3.93876781985337e-06, + "loss": 0.92091703, + "num_input_tokens_seen": 38340735, + "router_z_loss_clip": 3.09765625, + "router_z_loss_mlp": 0.34008789, + "step": 1773, + "time_per_iteration": 2.698160409927368 + }, + { + "auxiliary_loss_clip": 0.01415572, + "auxiliary_loss_mlp": 0.00284989, + "balance_loss_clip": 1.11411023, + "balance_loss_mlp": 0.25769031, + "epoch": 0.1066586502329776, + "flos": 32160591031680.0, + "grad_norm": 2.034992932550508, + "language_loss": 0.92722976, + "learning_rate": 3.938672150753041e-06, + "loss": 0.94423532, + "num_input_tokens_seen": 38361315, + "router_z_loss_clip": 3.01171875, + "router_z_loss_mlp": 0.27294922, + "step": 1774, + "time_per_iteration": 2.7041797637939453 + }, + { + "auxiliary_loss_clip": 0.01411655, + "auxiliary_loss_mlp": 0.00299293, + "balance_loss_clip": 1.10590029, + "balance_loss_mlp": 0.26753521, + "epoch": 0.10671877348564557, + "flos": 17785155202560.0, + "grad_norm": 194.33838225348612, + "language_loss": 0.90750575, + "learning_rate": 3.9385764081380704e-06, + "loss": 0.92461514, + "num_input_tokens_seen": 38377425, + "router_z_loss_clip": 3.05859375, + "router_z_loss_mlp": 0.31762695, + "step": 1775, + "time_per_iteration": 2.5868654251098633 + }, + { + "auxiliary_loss_clip": 0.01304781, + "auxiliary_loss_mlp": 0.00088594, + "balance_loss_clip": 1.0759201, + "balance_loss_mlp": 0.0785807, + "epoch": 0.10677889673831355, + "flos": 63510177813120.0, + "grad_norm": 0.8286941972070695, + "language_loss": 0.57709074, + "learning_rate": 3.9384805920120876e-06, + "loss": 0.59102452, + "num_input_tokens_seen": 38440275, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.10009766, + "step": 1776, + "time_per_iteration": 3.182332992553711 + }, + { + "auxiliary_loss_clip": 0.01401687, + "auxiliary_loss_mlp": 0.00324942, + "balance_loss_clip": 1.09254098, + "balance_loss_mlp": 0.29034728, + "epoch": 0.10683901999098151, + "flos": 22017694400640.0, + "grad_norm": 3.207762622350708, + "language_loss": 0.89230108, + "learning_rate": 3.938384702378727e-06, + "loss": 0.90956736, + "num_input_tokens_seen": 38461820, + "router_z_loss_clip": 3.09179688, + "router_z_loss_mlp": 0.34594727, + "step": 1777, + "time_per_iteration": 2.6986420154571533 + }, + { + "auxiliary_loss_clip": 0.01389856, + "auxiliary_loss_mlp": 0.00287281, + "balance_loss_clip": 1.08882618, + "balance_loss_mlp": 0.25869507, + "epoch": 0.10689914324364948, + "flos": 25042952551680.0, + "grad_norm": 22.125395543454207, + "language_loss": 0.93993527, + "learning_rate": 3.938288739241625e-06, + "loss": 0.95670664, + "num_input_tokens_seen": 38482235, + "router_z_loss_clip": 3.00585938, + "router_z_loss_mlp": 0.28613281, + "step": 1778, + "time_per_iteration": 2.6574933528900146 + }, + { + "auxiliary_loss_clip": 0.01384805, + "auxiliary_loss_mlp": 0.00315935, + "balance_loss_clip": 1.07933569, + "balance_loss_mlp": 0.28424942, + "epoch": 0.10695926649631746, + "flos": 16435129507200.0, + "grad_norm": 10.754016814933884, + "language_loss": 0.93168962, + "learning_rate": 3.938192702604417e-06, + "loss": 0.94869697, + "num_input_tokens_seen": 38500690, + "router_z_loss_clip": 3.05664062, + "router_z_loss_mlp": 0.31677246, + "step": 1779, + "time_per_iteration": 2.5876429080963135 + }, + { + "auxiliary_loss_clip": 0.01382948, + "auxiliary_loss_mlp": 0.00273382, + "balance_loss_clip": 1.08044362, + "balance_loss_mlp": 0.24454525, + "epoch": 0.10701938974898542, + "flos": 16979211792000.0, + "grad_norm": 3.873722912315816, + "language_loss": 0.73245776, + "learning_rate": 3.9380965924707495e-06, + "loss": 0.74902105, + "num_input_tokens_seen": 38518405, + "router_z_loss_clip": 3.02539062, + "router_z_loss_mlp": 0.2878418, + "step": 1780, + "time_per_iteration": 2.6332502365112305 + }, + { + "auxiliary_loss_clip": 0.01380599, + "auxiliary_loss_mlp": 0.00291917, + "balance_loss_clip": 1.07162881, + "balance_loss_mlp": 0.26025462, + "epoch": 0.10707951300165339, + "flos": 15888102307200.0, + "grad_norm": 38.296689111228126, + "language_loss": 1.00835276, + "learning_rate": 3.938000408844265e-06, + "loss": 1.02507794, + "num_input_tokens_seen": 38535060, + "router_z_loss_clip": 3.09179688, + "router_z_loss_mlp": 0.31640625, + "step": 1781, + "time_per_iteration": 2.5870883464813232 + }, + { + "auxiliary_loss_clip": 0.01385451, + "auxiliary_loss_mlp": 0.00295995, + "balance_loss_clip": 1.08034861, + "balance_loss_mlp": 0.26299787, + "epoch": 0.10713963625432135, + "flos": 14247164361600.0, + "grad_norm": 4.104005449789552, + "language_loss": 0.86690247, + "learning_rate": 3.9379041517286105e-06, + "loss": 0.88371694, + "num_input_tokens_seen": 38552855, + "router_z_loss_clip": 3.05078125, + "router_z_loss_mlp": 0.33007812, + "step": 1782, + "time_per_iteration": 2.6141035556793213 + }, + { + "auxiliary_loss_clip": 0.01386131, + "auxiliary_loss_mlp": 0.00293206, + "balance_loss_clip": 1.08323598, + "balance_loss_mlp": 0.26094785, + "epoch": 0.10719975950698933, + "flos": 16756780821120.0, + "grad_norm": 71.47114223340824, + "language_loss": 0.8806839, + "learning_rate": 3.937807821127436e-06, + "loss": 0.89747733, + "num_input_tokens_seen": 38570075, + "router_z_loss_clip": 3.02734375, + "router_z_loss_mlp": 0.3223877, + "step": 1783, + "time_per_iteration": 2.7039520740509033 + }, + { + "auxiliary_loss_clip": 0.01389, + "auxiliary_loss_mlp": 0.00308699, + "balance_loss_clip": 1.08492899, + "balance_loss_mlp": 0.27548766, + "epoch": 0.1072598827596573, + "flos": 22710626645760.0, + "grad_norm": 48.84198781954945, + "language_loss": 0.97773558, + "learning_rate": 3.937711417044395e-06, + "loss": 0.99471259, + "num_input_tokens_seen": 38587970, + "router_z_loss_clip": 3.04296875, + "router_z_loss_mlp": 0.33227539, + "step": 1784, + "time_per_iteration": 2.6580519676208496 + }, + { + "auxiliary_loss_clip": 0.0139393, + "auxiliary_loss_mlp": 0.00286184, + "balance_loss_clip": 1.08680058, + "balance_loss_mlp": 0.25378338, + "epoch": 0.10732000601232526, + "flos": 23258264376960.0, + "grad_norm": 24.873085342389736, + "language_loss": 1.11884212, + "learning_rate": 3.937614939483143e-06, + "loss": 1.13564336, + "num_input_tokens_seen": 38605840, + "router_z_loss_clip": 3.07226562, + "router_z_loss_mlp": 0.32421875, + "step": 1785, + "time_per_iteration": 2.6441102027893066 + }, + { + "auxiliary_loss_clip": 0.01383589, + "auxiliary_loss_mlp": 0.00252598, + "balance_loss_clip": 1.08451617, + "balance_loss_mlp": 0.22392821, + "epoch": 0.10738012926499324, + "flos": 24207060176640.0, + "grad_norm": 10.798155680158187, + "language_loss": 0.89920998, + "learning_rate": 3.937518388447339e-06, + "loss": 0.91557193, + "num_input_tokens_seen": 38627070, + "router_z_loss_clip": 2.99414062, + "router_z_loss_mlp": 0.28649902, + "step": 1786, + "time_per_iteration": 2.672955274581909 + }, + { + "auxiliary_loss_clip": 0.01397942, + "auxiliary_loss_mlp": 0.00288884, + "balance_loss_clip": 1.0907495, + "balance_loss_mlp": 0.25529066, + "epoch": 0.1074402525176612, + "flos": 20923065383040.0, + "grad_norm": 83.39734802068014, + "language_loss": 0.869241, + "learning_rate": 3.937421763940642e-06, + "loss": 0.88610923, + "num_input_tokens_seen": 38645840, + "router_z_loss_clip": 3.07226562, + "router_z_loss_mlp": 0.33569336, + "step": 1787, + "time_per_iteration": 2.6503984928131104 + }, + { + "auxiliary_loss_clip": 0.01374866, + "auxiliary_loss_mlp": 0.00308939, + "balance_loss_clip": 1.07301688, + "balance_loss_mlp": 0.27594215, + "epoch": 0.10750037577032917, + "flos": 16946928443520.0, + "grad_norm": 38.795870721172356, + "language_loss": 0.92538303, + "learning_rate": 3.937325065966719e-06, + "loss": 0.94222105, + "num_input_tokens_seen": 38664770, + "router_z_loss_clip": 3.01757812, + "router_z_loss_mlp": 0.33007812, + "step": 1788, + "time_per_iteration": 2.65669322013855 + }, + { + "auxiliary_loss_clip": 0.01390519, + "auxiliary_loss_mlp": 0.00271547, + "balance_loss_clip": 1.09290147, + "balance_loss_mlp": 0.24086252, + "epoch": 0.10756049902299715, + "flos": 20266546550400.0, + "grad_norm": 2.4419012950144388, + "language_loss": 0.8723377, + "learning_rate": 3.9372282945292335e-06, + "loss": 0.88895833, + "num_input_tokens_seen": 38683865, + "router_z_loss_clip": 2.97460938, + "router_z_loss_mlp": 0.30664062, + "step": 1789, + "time_per_iteration": 2.6223630905151367 + }, + { + "auxiliary_loss_clip": 0.01379965, + "auxiliary_loss_mlp": 0.00278745, + "balance_loss_clip": 1.07419562, + "balance_loss_mlp": 0.24627212, + "epoch": 0.10762062227566511, + "flos": 23586523793280.0, + "grad_norm": 3.529426570472208, + "language_loss": 0.86947161, + "learning_rate": 3.937131449631859e-06, + "loss": 0.88605875, + "num_input_tokens_seen": 38702485, + "router_z_loss_clip": 3.06054688, + "router_z_loss_mlp": 0.32470703, + "step": 1790, + "time_per_iteration": 2.689379930496216 + }, + { + "auxiliary_loss_clip": 0.01386043, + "auxiliary_loss_mlp": 0.0029117, + "balance_loss_clip": 1.07635701, + "balance_loss_mlp": 0.25817335, + "epoch": 0.10768074552833308, + "flos": 24310626065280.0, + "grad_norm": 3.5458341488307172, + "language_loss": 0.88728356, + "learning_rate": 3.9370345312782645e-06, + "loss": 0.90405571, + "num_input_tokens_seen": 38722475, + "router_z_loss_clip": 3.09570312, + "router_z_loss_mlp": 0.32983398, + "step": 1791, + "time_per_iteration": 4.071853160858154 + }, + { + "auxiliary_loss_clip": 0.01375607, + "auxiliary_loss_mlp": 0.00256091, + "balance_loss_clip": 1.08217645, + "balance_loss_mlp": 0.22559699, + "epoch": 0.10774086878100106, + "flos": 25299965341440.0, + "grad_norm": 47.54786146047729, + "language_loss": 0.78196716, + "learning_rate": 3.936937539472126e-06, + "loss": 0.79828411, + "num_input_tokens_seen": 38743285, + "router_z_loss_clip": 2.93164062, + "router_z_loss_mlp": 0.30493164, + "step": 1792, + "time_per_iteration": 4.2336485385894775 + }, + { + "auxiliary_loss_clip": 0.01377868, + "auxiliary_loss_mlp": 0.00309846, + "balance_loss_clip": 1.06617689, + "balance_loss_mlp": 0.27656317, + "epoch": 0.10780099203366902, + "flos": 22054035985920.0, + "grad_norm": 8.47381438303947, + "language_loss": 0.85015178, + "learning_rate": 3.9368404742171236e-06, + "loss": 0.86702889, + "num_input_tokens_seen": 38763035, + "router_z_loss_clip": 3.1171875, + "router_z_loss_mlp": 0.33300781, + "step": 1793, + "time_per_iteration": 2.6233699321746826 + }, + { + "auxiliary_loss_clip": 0.01395847, + "auxiliary_loss_mlp": 0.00268225, + "balance_loss_clip": 1.10036004, + "balance_loss_mlp": 0.23820816, + "epoch": 0.10786111528633699, + "flos": 22747471021440.0, + "grad_norm": 4.701263541175931, + "language_loss": 0.90830886, + "learning_rate": 3.936743335516936e-06, + "loss": 0.92494953, + "num_input_tokens_seen": 38784900, + "router_z_loss_clip": 2.953125, + "router_z_loss_mlp": 0.30004883, + "step": 1794, + "time_per_iteration": 4.1916680335998535 + }, + { + "auxiliary_loss_clip": 0.01397203, + "auxiliary_loss_mlp": 0.00320966, + "balance_loss_clip": 1.08993363, + "balance_loss_mlp": 0.28520328, + "epoch": 0.10792123853900495, + "flos": 20851064570880.0, + "grad_norm": 96.65656289813673, + "language_loss": 0.82907444, + "learning_rate": 3.936646123375246e-06, + "loss": 0.84625614, + "num_input_tokens_seen": 38804695, + "router_z_loss_clip": 3.078125, + "router_z_loss_mlp": 0.35766602, + "step": 1795, + "time_per_iteration": 2.669577121734619 + }, + { + "auxiliary_loss_clip": 0.01403106, + "auxiliary_loss_mlp": 0.00275299, + "balance_loss_clip": 1.10466099, + "balance_loss_mlp": 0.24196775, + "epoch": 0.10798136179167293, + "flos": 17748705876480.0, + "grad_norm": 433.13951906292334, + "language_loss": 0.95062566, + "learning_rate": 3.936548837795741e-06, + "loss": 0.96740961, + "num_input_tokens_seen": 38822395, + "router_z_loss_clip": 2.98242188, + "router_z_loss_mlp": 0.33349609, + "step": 1796, + "time_per_iteration": 2.5828678607940674 + }, + { + "auxiliary_loss_clip": 0.0142448, + "auxiliary_loss_mlp": 0.0028301, + "balance_loss_clip": 1.12576127, + "balance_loss_mlp": 0.24667516, + "epoch": 0.1080414850443409, + "flos": 13589639948160.0, + "grad_norm": 437.7938292108712, + "language_loss": 0.89110261, + "learning_rate": 3.936451478782111e-06, + "loss": 0.9081775, + "num_input_tokens_seen": 38839865, + "router_z_loss_clip": 2.99023438, + "router_z_loss_mlp": 0.36328125, + "step": 1797, + "time_per_iteration": 4.118465185165405 + }, + { + "auxiliary_loss_clip": 0.01404743, + "auxiliary_loss_mlp": 0.00273057, + "balance_loss_clip": 1.1114254, + "balance_loss_mlp": 0.24286154, + "epoch": 0.10810160829700886, + "flos": 16253421580800.0, + "grad_norm": 2.3054153173628174, + "language_loss": 0.89840937, + "learning_rate": 3.936354046338046e-06, + "loss": 0.91518736, + "num_input_tokens_seen": 38857300, + "router_z_loss_clip": 2.93554688, + "router_z_loss_mlp": 0.30212402, + "step": 1798, + "time_per_iteration": 2.586261034011841 + }, + { + "auxiliary_loss_clip": 0.01429342, + "auxiliary_loss_mlp": 0.00280687, + "balance_loss_clip": 1.13677311, + "balance_loss_mlp": 0.24566326, + "epoch": 0.10816173154967684, + "flos": 15158002464000.0, + "grad_norm": 2595.5447062725198, + "language_loss": 0.96062791, + "learning_rate": 3.936256540467242e-06, + "loss": 0.97772825, + "num_input_tokens_seen": 38874960, + "router_z_loss_clip": 2.92773438, + "router_z_loss_mlp": 0.35009766, + "step": 1799, + "time_per_iteration": 2.6031785011291504 + }, + { + "auxiliary_loss_clip": 0.01442261, + "auxiliary_loss_mlp": 0.00258559, + "balance_loss_clip": 1.15770769, + "balance_loss_mlp": 0.22849467, + "epoch": 0.10822185480234481, + "flos": 17785334770560.0, + "grad_norm": 98.14875183093585, + "language_loss": 0.87112027, + "learning_rate": 3.9361589611733955e-06, + "loss": 0.88812852, + "num_input_tokens_seen": 38893610, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 0.30078125, + "step": 1800, + "time_per_iteration": 2.6057732105255127 + }, + { + "auxiliary_loss_clip": 0.01467966, + "auxiliary_loss_mlp": 0.00315666, + "balance_loss_clip": 1.1698432, + "balance_loss_mlp": 0.28092852, + "epoch": 0.10828197805501277, + "flos": 25556654908800.0, + "grad_norm": 6.807032902799721, + "language_loss": 0.79651642, + "learning_rate": 3.9360613084602075e-06, + "loss": 0.81435275, + "num_input_tokens_seen": 38913485, + "router_z_loss_clip": 2.984375, + "router_z_loss_mlp": 0.34741211, + "step": 1801, + "time_per_iteration": 2.6665329933166504 + }, + { + "auxiliary_loss_clip": 0.01472321, + "auxiliary_loss_mlp": 0.00328669, + "balance_loss_clip": 1.17027164, + "balance_loss_mlp": 0.29364514, + "epoch": 0.10834210130768075, + "flos": 28984435845120.0, + "grad_norm": 2.8411098187515536, + "language_loss": 0.73854452, + "learning_rate": 3.935963582331381e-06, + "loss": 0.75655442, + "num_input_tokens_seen": 38935650, + "router_z_loss_clip": 3.02148438, + "router_z_loss_mlp": 0.35009766, + "step": 1802, + "time_per_iteration": 2.7500131130218506 + }, + { + "auxiliary_loss_clip": 0.01474939, + "auxiliary_loss_mlp": 0.00314696, + "balance_loss_clip": 1.1837461, + "balance_loss_mlp": 0.28236601, + "epoch": 0.10840222456034872, + "flos": 20264212166400.0, + "grad_norm": 34.54018930936695, + "language_loss": 0.89551222, + "learning_rate": 3.935865782790621e-06, + "loss": 0.91340864, + "num_input_tokens_seen": 38954130, + "router_z_loss_clip": 2.90820312, + "router_z_loss_mlp": 0.32324219, + "step": 1803, + "time_per_iteration": 2.604130983352661 + }, + { + "auxiliary_loss_clip": 0.01478203, + "auxiliary_loss_mlp": 0.00324797, + "balance_loss_clip": 1.17036879, + "balance_loss_mlp": 0.28839049, + "epoch": 0.10846234781301668, + "flos": 19863054097920.0, + "grad_norm": 11.917340415905127, + "language_loss": 0.97469795, + "learning_rate": 3.9357679098416365e-06, + "loss": 0.99272788, + "num_input_tokens_seen": 38972905, + "router_z_loss_clip": 3.08203125, + "router_z_loss_mlp": 0.36425781, + "step": 1804, + "time_per_iteration": 2.6387641429901123 + }, + { + "auxiliary_loss_clip": 0.01496578, + "auxiliary_loss_mlp": 0.00342665, + "balance_loss_clip": 1.18304646, + "balance_loss_mlp": 0.30604413, + "epoch": 0.10852247106568465, + "flos": 26469037296000.0, + "grad_norm": 33.78534704626988, + "language_loss": 0.84067535, + "learning_rate": 3.935669963488139e-06, + "loss": 0.8590678, + "num_input_tokens_seen": 38993255, + "router_z_loss_clip": 3.13476562, + "router_z_loss_mlp": 0.36645508, + "step": 1805, + "time_per_iteration": 2.660825729370117 + }, + { + "auxiliary_loss_clip": 0.01508224, + "auxiliary_loss_mlp": 0.00353214, + "balance_loss_clip": 1.20527029, + "balance_loss_mlp": 0.31783253, + "epoch": 0.10858259431835263, + "flos": 30081506987520.0, + "grad_norm": 6.971086073440231, + "language_loss": 0.92360806, + "learning_rate": 3.935571943733843e-06, + "loss": 0.94222248, + "num_input_tokens_seen": 39012610, + "router_z_loss_clip": 3.03125, + "router_z_loss_mlp": 0.35424805, + "step": 1806, + "time_per_iteration": 2.8033337593078613 + }, + { + "auxiliary_loss_clip": 0.01484505, + "auxiliary_loss_mlp": 0.00304765, + "balance_loss_clip": 1.19085264, + "balance_loss_mlp": 0.26876372, + "epoch": 0.10864271757102059, + "flos": 19063180085760.0, + "grad_norm": 27.975771254139342, + "language_loss": 0.89652562, + "learning_rate": 3.9354738505824635e-06, + "loss": 0.91441834, + "num_input_tokens_seen": 39030120, + "router_z_loss_clip": 2.93554688, + "router_z_loss_mlp": 0.35986328, + "step": 1807, + "time_per_iteration": 2.603841781616211 + }, + { + "auxiliary_loss_clip": 0.01510018, + "auxiliary_loss_mlp": 0.00305211, + "balance_loss_clip": 1.20350218, + "balance_loss_mlp": 0.27025837, + "epoch": 0.10870284082368856, + "flos": 24715052271360.0, + "grad_norm": 7.6441670455931385, + "language_loss": 0.8495543, + "learning_rate": 3.9353756840377225e-06, + "loss": 0.8677066, + "num_input_tokens_seen": 39049875, + "router_z_loss_clip": 3.06445312, + "router_z_loss_mlp": 0.34960938, + "step": 1808, + "time_per_iteration": 2.731492042541504 + }, + { + "auxiliary_loss_clip": 0.0151618, + "auxiliary_loss_mlp": 0.00316619, + "balance_loss_clip": 1.20883238, + "balance_loss_mlp": 0.27937794, + "epoch": 0.10876296407635654, + "flos": 20627663932800.0, + "grad_norm": 24.977472720099854, + "language_loss": 0.86508191, + "learning_rate": 3.935277444103342e-06, + "loss": 0.88340986, + "num_input_tokens_seen": 39068935, + "router_z_loss_clip": 3.07421875, + "router_z_loss_mlp": 0.37231445, + "step": 1809, + "time_per_iteration": 2.6619210243225098 + }, + { + "auxiliary_loss_clip": 0.0153999, + "auxiliary_loss_mlp": 0.00361241, + "balance_loss_clip": 1.2304461, + "balance_loss_mlp": 0.32259336, + "epoch": 0.1088230873290245, + "flos": 21579835610880.0, + "grad_norm": 9.73462218335589, + "language_loss": 0.9343105, + "learning_rate": 3.935179130783046e-06, + "loss": 0.95332277, + "num_input_tokens_seen": 39087370, + "router_z_loss_clip": 3.09570312, + "router_z_loss_mlp": 0.38647461, + "step": 1810, + "time_per_iteration": 2.6820244789123535 + }, + { + "auxiliary_loss_clip": 0.0154735, + "auxiliary_loss_mlp": 0.00326603, + "balance_loss_clip": 1.23460555, + "balance_loss_mlp": 0.2843551, + "epoch": 0.10888321058169247, + "flos": 26469037296000.0, + "grad_norm": 11.394632188952714, + "language_loss": 0.73688769, + "learning_rate": 3.935080744080564e-06, + "loss": 0.75562727, + "num_input_tokens_seen": 39106635, + "router_z_loss_clip": 3.12695312, + "router_z_loss_mlp": 0.42236328, + "step": 1811, + "time_per_iteration": 2.6823573112487793 + }, + { + "auxiliary_loss_clip": 0.01587271, + "auxiliary_loss_mlp": 0.00337602, + "balance_loss_clip": 1.27420378, + "balance_loss_mlp": 0.29480544, + "epoch": 0.10894333383436045, + "flos": 25848608653440.0, + "grad_norm": 1910.3878602498169, + "language_loss": 0.82111073, + "learning_rate": 3.934982283999626e-06, + "loss": 0.84035939, + "num_input_tokens_seen": 39126335, + "router_z_loss_clip": 3.1328125, + "router_z_loss_mlp": 0.42797852, + "step": 1812, + "time_per_iteration": 2.6844756603240967 + }, + { + "auxiliary_loss_clip": 0.015848, + "auxiliary_loss_mlp": 0.00343341, + "balance_loss_clip": 1.26620245, + "balance_loss_mlp": 0.29975769, + "epoch": 0.10900345708702841, + "flos": 19537093152000.0, + "grad_norm": 3.204041933392418, + "language_loss": 0.80224597, + "learning_rate": 3.934883750543966e-06, + "loss": 0.82152736, + "num_input_tokens_seen": 39144820, + "router_z_loss_clip": 3.18359375, + "router_z_loss_mlp": 0.4362793, + "step": 1813, + "time_per_iteration": 2.653700113296509 + }, + { + "auxiliary_loss_clip": 0.0159923, + "auxiliary_loss_mlp": 0.00327981, + "balance_loss_clip": 1.27784538, + "balance_loss_mlp": 0.2868059, + "epoch": 0.10906358033969638, + "flos": 23623296341760.0, + "grad_norm": 5.225977549526657, + "language_loss": 0.88955015, + "learning_rate": 3.93478514371732e-06, + "loss": 0.90882224, + "num_input_tokens_seen": 39165945, + "router_z_loss_clip": 3.21289062, + "router_z_loss_mlp": 0.41162109, + "step": 1814, + "time_per_iteration": 2.692347526550293 + }, + { + "auxiliary_loss_clip": 0.01623156, + "auxiliary_loss_mlp": 0.00359196, + "balance_loss_clip": 1.29374838, + "balance_loss_mlp": 0.31697163, + "epoch": 0.10912370359236434, + "flos": 21214731818880.0, + "grad_norm": 12.541549781049843, + "language_loss": 0.93313444, + "learning_rate": 3.934686463523429e-06, + "loss": 0.95295799, + "num_input_tokens_seen": 39183520, + "router_z_loss_clip": 3.29882812, + "router_z_loss_mlp": 0.42211914, + "step": 1815, + "time_per_iteration": 2.6972737312316895 + }, + { + "auxiliary_loss_clip": 0.01623073, + "auxiliary_loss_mlp": 0.00365482, + "balance_loss_clip": 1.29678762, + "balance_loss_mlp": 0.32538027, + "epoch": 0.10918382684503232, + "flos": 13553190622080.0, + "grad_norm": 4.264451448830845, + "language_loss": 0.81292301, + "learning_rate": 3.9345877099660315e-06, + "loss": 0.83280861, + "num_input_tokens_seen": 39201190, + "router_z_loss_clip": 3.26171875, + "router_z_loss_mlp": 0.40063477, + "step": 1816, + "time_per_iteration": 2.660128355026245 + }, + { + "auxiliary_loss_clip": 0.01635908, + "auxiliary_loss_mlp": 0.00410412, + "balance_loss_clip": 1.29705024, + "balance_loss_mlp": 0.36952287, + "epoch": 0.10924395009770028, + "flos": 27964321591680.0, + "grad_norm": 4.613955592607701, + "language_loss": 0.8324784, + "learning_rate": 3.9344888830488744e-06, + "loss": 0.85294157, + "num_input_tokens_seen": 39221210, + "router_z_loss_clip": 3.38671875, + "router_z_loss_mlp": 0.40893555, + "step": 1817, + "time_per_iteration": 2.6948177814483643 + }, + { + "auxiliary_loss_clip": 0.01614317, + "auxiliary_loss_mlp": 0.00326715, + "balance_loss_clip": 1.28303671, + "balance_loss_mlp": 0.28713739, + "epoch": 0.10930407335036825, + "flos": 25593750679680.0, + "grad_norm": 4.241227629910632, + "language_loss": 0.73909134, + "learning_rate": 3.934389982775706e-06, + "loss": 0.75850165, + "num_input_tokens_seen": 39242025, + "router_z_loss_clip": 3.3125, + "router_z_loss_mlp": 0.39575195, + "step": 1818, + "time_per_iteration": 2.640061855316162 + }, + { + "auxiliary_loss_clip": 0.01623247, + "auxiliary_loss_mlp": 0.00391462, + "balance_loss_clip": 1.28983951, + "balance_loss_mlp": 0.34950054, + "epoch": 0.10936419660303623, + "flos": 18406194376320.0, + "grad_norm": 6.573320252767443, + "language_loss": 0.80576319, + "learning_rate": 3.934291009150275e-06, + "loss": 0.82591027, + "num_input_tokens_seen": 39259870, + "router_z_loss_clip": 3.33789062, + "router_z_loss_mlp": 0.41943359, + "step": 1819, + "time_per_iteration": 2.6848671436309814 + }, + { + "auxiliary_loss_clip": 0.0162179, + "auxiliary_loss_mlp": 0.00359719, + "balance_loss_clip": 1.29034734, + "balance_loss_mlp": 0.32178682, + "epoch": 0.1094243198557042, + "flos": 23840052963840.0, + "grad_norm": 8.12649100918384, + "language_loss": 0.81334555, + "learning_rate": 3.934191962176335e-06, + "loss": 0.8331607, + "num_input_tokens_seen": 39278500, + "router_z_loss_clip": 3.31835938, + "router_z_loss_mlp": 0.37939453, + "step": 1820, + "time_per_iteration": 2.6963491439819336 + }, + { + "auxiliary_loss_clip": 0.01629089, + "auxiliary_loss_mlp": 0.00406508, + "balance_loss_clip": 1.29039383, + "balance_loss_mlp": 0.3658815, + "epoch": 0.10948444310837216, + "flos": 14643940970880.0, + "grad_norm": 145.2705431942514, + "language_loss": 0.9071455, + "learning_rate": 3.934092841857642e-06, + "loss": 0.92750144, + "num_input_tokens_seen": 39294800, + "router_z_loss_clip": 3.38671875, + "router_z_loss_mlp": 0.40625, + "step": 1821, + "time_per_iteration": 2.6103506088256836 + }, + { + "auxiliary_loss_clip": 0.01604937, + "auxiliary_loss_mlp": 0.00399406, + "balance_loss_clip": 1.26750946, + "balance_loss_mlp": 0.36240363, + "epoch": 0.10954456636104014, + "flos": 27818811596160.0, + "grad_norm": 12.563522602177912, + "language_loss": 0.82315767, + "learning_rate": 3.933993648197955e-06, + "loss": 0.84320116, + "num_input_tokens_seen": 39314625, + "router_z_loss_clip": 3.375, + "router_z_loss_mlp": 0.37011719, + "step": 1822, + "time_per_iteration": 2.676347017288208 + }, + { + "auxiliary_loss_clip": 0.01614525, + "auxiliary_loss_mlp": 0.00374395, + "balance_loss_clip": 1.27959442, + "balance_loss_mlp": 0.33820325, + "epoch": 0.1096046896137081, + "flos": 33620934372480.0, + "grad_norm": 89.37402637073949, + "language_loss": 0.85689342, + "learning_rate": 3.933894381201034e-06, + "loss": 0.87678266, + "num_input_tokens_seen": 39336465, + "router_z_loss_clip": 3.34960938, + "router_z_loss_mlp": 0.36206055, + "step": 1823, + "time_per_iteration": 2.744638681411743 + }, + { + "auxiliary_loss_clip": 0.01609962, + "auxiliary_loss_mlp": 0.00367224, + "balance_loss_clip": 1.27928019, + "balance_loss_mlp": 0.33208162, + "epoch": 0.10966481286637607, + "flos": 26980010219520.0, + "grad_norm": 34.91677701077163, + "language_loss": 0.85110998, + "learning_rate": 3.933795040870645e-06, + "loss": 0.87088192, + "num_input_tokens_seen": 39357930, + "router_z_loss_clip": 3.30859375, + "router_z_loss_mlp": 0.3515625, + "step": 1824, + "time_per_iteration": 2.6609127521514893 + }, + { + "auxiliary_loss_clip": 0.01623299, + "auxiliary_loss_mlp": 0.00389844, + "balance_loss_clip": 1.28175902, + "balance_loss_mlp": 0.35176849, + "epoch": 0.10972493611904403, + "flos": 23036551678080.0, + "grad_norm": 9.102866931322795, + "language_loss": 0.95714837, + "learning_rate": 3.933695627210554e-06, + "loss": 0.97727984, + "num_input_tokens_seen": 39376380, + "router_z_loss_clip": 3.4140625, + "router_z_loss_mlp": 0.38061523, + "step": 1825, + "time_per_iteration": 2.6535887718200684 + }, + { + "auxiliary_loss_clip": 0.01592187, + "auxiliary_loss_mlp": 0.00414589, + "balance_loss_clip": 1.24365628, + "balance_loss_mlp": 0.37737176, + "epoch": 0.10978505937171201, + "flos": 38104632443520.0, + "grad_norm": 7.218148193288993, + "language_loss": 0.81585062, + "learning_rate": 3.933596140224532e-06, + "loss": 0.83591843, + "num_input_tokens_seen": 39399935, + "router_z_loss_clip": 3.48242188, + "router_z_loss_mlp": 0.37231445, + "step": 1826, + "time_per_iteration": 2.7624993324279785 + }, + { + "auxiliary_loss_clip": 0.01729088, + "auxiliary_loss_mlp": 0.00125958, + "balance_loss_clip": 1.39071679, + "balance_loss_mlp": 0.11441834, + "epoch": 0.10984518262437998, + "flos": 59849694616320.0, + "grad_norm": 0.8109961898326862, + "language_loss": 0.55054653, + "learning_rate": 3.93349657991635e-06, + "loss": 0.56909692, + "num_input_tokens_seen": 39460685, + "router_z_loss_clip": 3.375, + "router_z_loss_mlp": 0.11523438, + "step": 1827, + "time_per_iteration": 3.209993600845337 + }, + { + "auxiliary_loss_clip": 0.01706902, + "auxiliary_loss_mlp": 0.00121979, + "balance_loss_clip": 1.37636495, + "balance_loss_mlp": 0.10900941, + "epoch": 0.10990530587704794, + "flos": 66719837410560.0, + "grad_norm": 1.1199575484257127, + "language_loss": 0.55318272, + "learning_rate": 3.933396946289784e-06, + "loss": 0.57147151, + "num_input_tokens_seen": 39524765, + "router_z_loss_clip": 3.3125, + "router_z_loss_mlp": 0.12988281, + "step": 1828, + "time_per_iteration": 3.1910574436187744 + }, + { + "auxiliary_loss_clip": 0.01566081, + "auxiliary_loss_mlp": 0.0036008, + "balance_loss_clip": 1.24322033, + "balance_loss_mlp": 0.32651085, + "epoch": 0.10996542912971592, + "flos": 25447199189760.0, + "grad_norm": 4.017678019569364, + "language_loss": 0.9279502, + "learning_rate": 3.933297239348612e-06, + "loss": 0.94721174, + "num_input_tokens_seen": 39543640, + "router_z_loss_clip": 3.2265625, + "router_z_loss_mlp": 0.33569336, + "step": 1829, + "time_per_iteration": 2.816012144088745 + }, + { + "auxiliary_loss_clip": 0.01544855, + "auxiliary_loss_mlp": 0.00388557, + "balance_loss_clip": 1.22472119, + "balance_loss_mlp": 0.35300922, + "epoch": 0.11002555238238389, + "flos": 44018186186880.0, + "grad_norm": 173.20911181698102, + "language_loss": 0.94620395, + "learning_rate": 3.933197459096614e-06, + "loss": 0.96553808, + "num_input_tokens_seen": 39567525, + "router_z_loss_clip": 3.203125, + "router_z_loss_mlp": 0.35546875, + "step": 1830, + "time_per_iteration": 2.882603883743286 + }, + { + "auxiliary_loss_clip": 0.01646177, + "auxiliary_loss_mlp": 0.00142622, + "balance_loss_clip": 1.34722471, + "balance_loss_mlp": 0.12965249, + "epoch": 0.11008567563505185, + "flos": 54065133590400.0, + "grad_norm": 0.6818301990539664, + "language_loss": 0.55377603, + "learning_rate": 3.9330976055375756e-06, + "loss": 0.57166404, + "num_input_tokens_seen": 39628470, + "router_z_loss_clip": 3.0, + "router_z_loss_mlp": 0.12988281, + "step": 1831, + "time_per_iteration": 3.1484909057617188 + }, + { + "auxiliary_loss_clip": 0.01522746, + "auxiliary_loss_mlp": 0.00376712, + "balance_loss_clip": 1.21257639, + "balance_loss_mlp": 0.34071082, + "epoch": 0.11014579888771983, + "flos": 24243150366720.0, + "grad_norm": 71.01357958629484, + "language_loss": 0.97846997, + "learning_rate": 3.932997678675282e-06, + "loss": 0.99746454, + "num_input_tokens_seen": 39646670, + "router_z_loss_clip": 3.10546875, + "router_z_loss_mlp": 0.36010742, + "step": 1832, + "time_per_iteration": 2.7099859714508057 + }, + { + "auxiliary_loss_clip": 0.01588613, + "auxiliary_loss_mlp": 0.0030973, + "balance_loss_clip": 1.31844795, + "balance_loss_mlp": 0.29275453, + "epoch": 0.1102059221403878, + "flos": 57743965658880.0, + "grad_norm": 0.7233894276619168, + "language_loss": 0.59755969, + "learning_rate": 3.932897678513523e-06, + "loss": 0.61654305, + "num_input_tokens_seen": 39712915, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.16992188, + "step": 1833, + "time_per_iteration": 3.1736154556274414 + }, + { + "auxiliary_loss_clip": 0.01517251, + "auxiliary_loss_mlp": 0.00395085, + "balance_loss_clip": 1.20315528, + "balance_loss_mlp": 0.35987055, + "epoch": 0.11026604539305576, + "flos": 16795923667200.0, + "grad_norm": 169.53829158831732, + "language_loss": 0.9125731, + "learning_rate": 3.93279760505609e-06, + "loss": 0.93169641, + "num_input_tokens_seen": 39730650, + "router_z_loss_clip": 3.14257812, + "router_z_loss_mlp": 0.35180664, + "step": 1834, + "time_per_iteration": 5.501774311065674 + }, + { + "auxiliary_loss_clip": 0.01503822, + "auxiliary_loss_mlp": 0.00385618, + "balance_loss_clip": 1.19490921, + "balance_loss_mlp": 0.35214394, + "epoch": 0.11032616864572373, + "flos": 23988076911360.0, + "grad_norm": 143.68551988049956, + "language_loss": 0.99950361, + "learning_rate": 3.932697458306779e-06, + "loss": 1.01839805, + "num_input_tokens_seen": 39751065, + "router_z_loss_clip": 3.08984375, + "router_z_loss_mlp": 0.3347168, + "step": 1835, + "time_per_iteration": 2.7429637908935547 + }, + { + "auxiliary_loss_clip": 0.01502036, + "auxiliary_loss_mlp": 0.00376736, + "balance_loss_clip": 1.20072079, + "balance_loss_mlp": 0.34346437, + "epoch": 0.1103862918983917, + "flos": 19683141851520.0, + "grad_norm": 12.775869127227542, + "language_loss": 0.73760641, + "learning_rate": 3.932597238269386e-06, + "loss": 0.75639415, + "num_input_tokens_seen": 39769245, + "router_z_loss_clip": 3.01367188, + "router_z_loss_mlp": 0.33239746, + "step": 1836, + "time_per_iteration": 2.6511008739471436 + }, + { + "auxiliary_loss_clip": 0.01486623, + "auxiliary_loss_mlp": 0.00378279, + "balance_loss_clip": 1.18981481, + "balance_loss_mlp": 0.34370798, + "epoch": 0.11044641515105967, + "flos": 32160878340480.0, + "grad_norm": 32.27895800765118, + "language_loss": 0.78578794, + "learning_rate": 3.932496944947711e-06, + "loss": 0.80443692, + "num_input_tokens_seen": 39790830, + "router_z_loss_clip": 2.96875, + "router_z_loss_mlp": 0.34594727, + "step": 1837, + "time_per_iteration": 4.077972650527954 + }, + { + "auxiliary_loss_clip": 0.01505597, + "auxiliary_loss_mlp": 0.00408937, + "balance_loss_clip": 1.2091229, + "balance_loss_mlp": 0.37405628, + "epoch": 0.11050653840372764, + "flos": 16689233295360.0, + "grad_norm": 16.859323590609453, + "language_loss": 0.86386251, + "learning_rate": 3.93239657834556e-06, + "loss": 0.88300782, + "num_input_tokens_seen": 39809475, + "router_z_loss_clip": 2.95898438, + "router_z_loss_mlp": 0.34887695, + "step": 1838, + "time_per_iteration": 2.6017768383026123 + }, + { + "auxiliary_loss_clip": 0.01492602, + "auxiliary_loss_mlp": 0.00437073, + "balance_loss_clip": 1.20116532, + "balance_loss_mlp": 0.39968851, + "epoch": 0.11056666165639562, + "flos": 21208877902080.0, + "grad_norm": 17.785028472712543, + "language_loss": 0.78523737, + "learning_rate": 3.932296138466736e-06, + "loss": 0.80453408, + "num_input_tokens_seen": 39826355, + "router_z_loss_clip": 2.91210938, + "router_z_loss_mlp": 0.3737793, + "step": 1839, + "time_per_iteration": 4.129225730895996 + }, + { + "auxiliary_loss_clip": 0.01512603, + "auxiliary_loss_mlp": 0.00497813, + "balance_loss_clip": 1.20934319, + "balance_loss_mlp": 0.45687687, + "epoch": 0.11062678490906358, + "flos": 19165488998400.0, + "grad_norm": 27.369073127944052, + "language_loss": 0.85577941, + "learning_rate": 3.93219562531505e-06, + "loss": 0.87588352, + "num_input_tokens_seen": 39845335, + "router_z_loss_clip": 3.02734375, + "router_z_loss_mlp": 0.40917969, + "step": 1840, + "time_per_iteration": 2.6889336109161377 + }, + { + "auxiliary_loss_clip": 0.01516334, + "auxiliary_loss_mlp": 0.00517685, + "balance_loss_clip": 1.22048616, + "balance_loss_mlp": 0.476915, + "epoch": 0.11068690816173155, + "flos": 24895287740160.0, + "grad_norm": 322.4156717713936, + "language_loss": 0.93081105, + "learning_rate": 3.932095038894311e-06, + "loss": 0.95115125, + "num_input_tokens_seen": 39865065, + "router_z_loss_clip": 2.95703125, + "router_z_loss_mlp": 0.40771484, + "step": 1841, + "time_per_iteration": 2.674248695373535 + }, + { + "auxiliary_loss_clip": 0.01517465, + "auxiliary_loss_mlp": 0.00519808, + "balance_loss_clip": 1.21889591, + "balance_loss_mlp": 0.47863328, + "epoch": 0.11074703141439952, + "flos": 16472368932480.0, + "grad_norm": 6.001106881364176, + "language_loss": 0.97516751, + "learning_rate": 3.931994379208334e-06, + "loss": 0.99554026, + "num_input_tokens_seen": 39882780, + "router_z_loss_clip": 2.98828125, + "router_z_loss_mlp": 0.41162109, + "step": 1842, + "time_per_iteration": 2.6109797954559326 + }, + { + "auxiliary_loss_clip": 0.01519917, + "auxiliary_loss_mlp": 0.00578539, + "balance_loss_clip": 1.22115874, + "balance_loss_mlp": 0.53903341, + "epoch": 0.11080715466706749, + "flos": 19172420323200.0, + "grad_norm": 9.11035780431033, + "language_loss": 0.92763209, + "learning_rate": 3.931893646260937e-06, + "loss": 0.94861674, + "num_input_tokens_seen": 39900295, + "router_z_loss_clip": 2.9921875, + "router_z_loss_mlp": 0.39501953, + "step": 1843, + "time_per_iteration": 2.605214834213257 + }, + { + "auxiliary_loss_clip": 0.01525653, + "auxiliary_loss_mlp": 0.00557528, + "balance_loss_clip": 1.22174239, + "balance_loss_mlp": 0.5154947, + "epoch": 0.11086727791973545, + "flos": 27704687109120.0, + "grad_norm": 13.668610342905762, + "language_loss": 0.80902016, + "learning_rate": 3.931792840055941e-06, + "loss": 0.82985198, + "num_input_tokens_seen": 39922075, + "router_z_loss_clip": 3.04101562, + "router_z_loss_mlp": 0.42041016, + "step": 1844, + "time_per_iteration": 2.67594313621521 + }, + { + "auxiliary_loss_clip": 0.01534229, + "auxiliary_loss_mlp": 0.00579375, + "balance_loss_clip": 1.22458482, + "balance_loss_mlp": 0.53436118, + "epoch": 0.11092740117240343, + "flos": 18514967736960.0, + "grad_norm": 2.993921245963958, + "language_loss": 0.82913709, + "learning_rate": 3.931691960597165e-06, + "loss": 0.85027313, + "num_input_tokens_seen": 39940115, + "router_z_loss_clip": 3.09960938, + "router_z_loss_mlp": 0.44995117, + "step": 1845, + "time_per_iteration": 2.5866506099700928 + }, + { + "auxiliary_loss_clip": 0.01550872, + "auxiliary_loss_mlp": 0.00626795, + "balance_loss_clip": 1.24162066, + "balance_loss_mlp": 0.58001685, + "epoch": 0.1109875244250714, + "flos": 20522446018560.0, + "grad_norm": 81.0829551717652, + "language_loss": 0.82336473, + "learning_rate": 3.9315910078884375e-06, + "loss": 0.84514147, + "num_input_tokens_seen": 39959920, + "router_z_loss_clip": 3.09179688, + "router_z_loss_mlp": 0.46801758, + "step": 1846, + "time_per_iteration": 2.699831247329712 + }, + { + "auxiliary_loss_clip": 0.01581326, + "auxiliary_loss_mlp": 0.00646595, + "balance_loss_clip": 1.25892127, + "balance_loss_mlp": 0.59755254, + "epoch": 0.11104764767773936, + "flos": 14098601710080.0, + "grad_norm": 194.61970163304517, + "language_loss": 0.95756525, + "learning_rate": 3.931489981933584e-06, + "loss": 0.97984439, + "num_input_tokens_seen": 39974755, + "router_z_loss_clip": 3.22265625, + "router_z_loss_mlp": 0.49023438, + "step": 1847, + "time_per_iteration": 2.6291682720184326 + }, + { + "auxiliary_loss_clip": 0.01589729, + "auxiliary_loss_mlp": 0.00602754, + "balance_loss_clip": 1.26432705, + "balance_loss_mlp": 0.55511844, + "epoch": 0.11110777093040733, + "flos": 20594518657920.0, + "grad_norm": 12.755682546890265, + "language_loss": 0.83710957, + "learning_rate": 3.931388882736438e-06, + "loss": 0.85903442, + "num_input_tokens_seen": 39993355, + "router_z_loss_clip": 3.2578125, + "router_z_loss_mlp": 0.47558594, + "step": 1848, + "time_per_iteration": 2.6921894550323486 + }, + { + "auxiliary_loss_clip": 0.01633343, + "auxiliary_loss_mlp": 0.0059155, + "balance_loss_clip": 1.30315542, + "balance_loss_mlp": 0.54868245, + "epoch": 0.11116789418307531, + "flos": 21870065502720.0, + "grad_norm": 79.53701685844075, + "language_loss": 0.83417308, + "learning_rate": 3.931287710300832e-06, + "loss": 0.85642207, + "num_input_tokens_seen": 40012410, + "router_z_loss_clip": 3.30273438, + "router_z_loss_mlp": 0.4284668, + "step": 1849, + "time_per_iteration": 2.6991817951202393 + }, + { + "auxiliary_loss_clip": 0.01635501, + "auxiliary_loss_mlp": 0.00625994, + "balance_loss_clip": 1.29677737, + "balance_loss_mlp": 0.57783389, + "epoch": 0.11122801743574327, + "flos": 15523106256000.0, + "grad_norm": 46.654569461353205, + "language_loss": 0.79757488, + "learning_rate": 3.931186464630601e-06, + "loss": 0.82018977, + "num_input_tokens_seen": 40029315, + "router_z_loss_clip": 3.38867188, + "router_z_loss_mlp": 0.48193359, + "step": 1850, + "time_per_iteration": 2.6482081413269043 + }, + { + "auxiliary_loss_clip": 0.01672229, + "auxiliary_loss_mlp": 0.00675386, + "balance_loss_clip": 1.32253122, + "balance_loss_mlp": 0.62581885, + "epoch": 0.11128814068841124, + "flos": 14392279307520.0, + "grad_norm": 12.87232261205664, + "language_loss": 0.90954936, + "learning_rate": 3.931085145729588e-06, + "loss": 0.93302548, + "num_input_tokens_seen": 40045765, + "router_z_loss_clip": 3.49609375, + "router_z_loss_mlp": 0.49609375, + "step": 1851, + "time_per_iteration": 2.6153407096862793 + }, + { + "auxiliary_loss_clip": 0.01671796, + "auxiliary_loss_mlp": 0.0060093, + "balance_loss_clip": 1.32091832, + "balance_loss_mlp": 0.55648881, + "epoch": 0.11134826394107922, + "flos": 16653933204480.0, + "grad_norm": 2.9525943341220704, + "language_loss": 0.94772923, + "learning_rate": 3.930983753601631e-06, + "loss": 0.97045648, + "num_input_tokens_seen": 40061660, + "router_z_loss_clip": 3.50585938, + "router_z_loss_mlp": 0.44458008, + "step": 1852, + "time_per_iteration": 2.740330696105957 + }, + { + "auxiliary_loss_clip": 0.01662207, + "auxiliary_loss_mlp": 0.00597708, + "balance_loss_clip": 1.31359351, + "balance_loss_mlp": 0.55569845, + "epoch": 0.11140838719374718, + "flos": 16690993061760.0, + "grad_norm": 25.41324538280786, + "language_loss": 0.78251535, + "learning_rate": 3.930882288250578e-06, + "loss": 0.80511451, + "num_input_tokens_seen": 40080180, + "router_z_loss_clip": 3.484375, + "router_z_loss_mlp": 0.42016602, + "step": 1853, + "time_per_iteration": 2.6732494831085205 + }, + { + "auxiliary_loss_clip": 0.01703721, + "auxiliary_loss_mlp": 0.00332303, + "balance_loss_clip": 1.39928019, + "balance_loss_mlp": 0.30426455, + "epoch": 0.11146851044641515, + "flos": 60976355587200.0, + "grad_norm": 0.7919067087660426, + "language_loss": 0.5370115, + "learning_rate": 3.930780749680273e-06, + "loss": 0.55737174, + "num_input_tokens_seen": 40138910, + "router_z_loss_clip": 3.046875, + "router_z_loss_mlp": 0.28125, + "step": 1854, + "time_per_iteration": 3.13718843460083 + }, + { + "auxiliary_loss_clip": 0.0168083, + "auxiliary_loss_mlp": 0.00593079, + "balance_loss_clip": 1.30855775, + "balance_loss_mlp": 0.54303479, + "epoch": 0.11152863369908313, + "flos": 22193835719040.0, + "grad_norm": 2.8997223117781594, + "language_loss": 0.9235673, + "learning_rate": 3.9306791378945705e-06, + "loss": 0.94630641, + "num_input_tokens_seen": 40157745, + "router_z_loss_clip": 3.72460938, + "router_z_loss_mlp": 0.50073242, + "step": 1855, + "time_per_iteration": 2.6760520935058594 + }, + { + "auxiliary_loss_clip": 0.0166198, + "auxiliary_loss_mlp": 0.00545183, + "balance_loss_clip": 1.30192661, + "balance_loss_mlp": 0.50021702, + "epoch": 0.11158875695175109, + "flos": 19537524115200.0, + "grad_norm": 2.2107208361359683, + "language_loss": 0.88537341, + "learning_rate": 3.9305774528973205e-06, + "loss": 0.90744501, + "num_input_tokens_seen": 40175375, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 0.44946289, + "step": 1856, + "time_per_iteration": 2.663083791732788 + }, + { + "auxiliary_loss_clip": 0.01657056, + "auxiliary_loss_mlp": 0.00478138, + "balance_loss_clip": 1.2954824, + "balance_loss_mlp": 0.43968105, + "epoch": 0.11164888020441906, + "flos": 25442709989760.0, + "grad_norm": 4.748360323084261, + "language_loss": 0.88185579, + "learning_rate": 3.93047569469238e-06, + "loss": 0.90320766, + "num_input_tokens_seen": 40195715, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 0.38476562, + "step": 1857, + "time_per_iteration": 2.669299840927124 + }, + { + "auxiliary_loss_clip": 0.01643477, + "auxiliary_loss_mlp": 0.00491009, + "balance_loss_clip": 1.28665543, + "balance_loss_mlp": 0.44775957, + "epoch": 0.11170900345708702, + "flos": 15632741543040.0, + "grad_norm": 11.695428267679812, + "language_loss": 0.91223681, + "learning_rate": 3.930373863283608e-06, + "loss": 0.93358159, + "num_input_tokens_seen": 40213975, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 0.43261719, + "step": 1858, + "time_per_iteration": 2.620676040649414 + }, + { + "auxiliary_loss_clip": 0.01676075, + "auxiliary_loss_mlp": 0.00500113, + "balance_loss_clip": 1.30905843, + "balance_loss_mlp": 0.45447946, + "epoch": 0.111769126709755, + "flos": 23039424766080.0, + "grad_norm": 17.893183287751707, + "language_loss": 0.99418414, + "learning_rate": 3.930271958674866e-06, + "loss": 1.01594603, + "num_input_tokens_seen": 40233905, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 0.45629883, + "step": 1859, + "time_per_iteration": 2.7012901306152344 + }, + { + "auxiliary_loss_clip": 0.01676723, + "auxiliary_loss_mlp": 0.00477543, + "balance_loss_clip": 1.30480719, + "balance_loss_mlp": 0.43074188, + "epoch": 0.11182924996242297, + "flos": 20850705434880.0, + "grad_norm": 31.39701333402126, + "language_loss": 0.91769266, + "learning_rate": 3.930169980870018e-06, + "loss": 0.93923533, + "num_input_tokens_seen": 40252810, + "router_z_loss_clip": 3.72070312, + "router_z_loss_mlp": 0.46826172, + "step": 1860, + "time_per_iteration": 2.631378650665283 + }, + { + "auxiliary_loss_clip": 0.01689087, + "auxiliary_loss_mlp": 0.00436957, + "balance_loss_clip": 1.3155818, + "balance_loss_mlp": 0.39494798, + "epoch": 0.11188937321509093, + "flos": 17455315587840.0, + "grad_norm": 11.278647419398805, + "language_loss": 0.83187759, + "learning_rate": 3.930067929872931e-06, + "loss": 0.85313809, + "num_input_tokens_seen": 40272000, + "router_z_loss_clip": 3.73632812, + "router_z_loss_mlp": 0.42016602, + "step": 1861, + "time_per_iteration": 2.6338207721710205 + }, + { + "auxiliary_loss_clip": 0.01708253, + "auxiliary_loss_mlp": 0.00446855, + "balance_loss_clip": 1.31779063, + "balance_loss_mlp": 0.40248567, + "epoch": 0.11194949646775891, + "flos": 24095916518400.0, + "grad_norm": 16.661477955858906, + "language_loss": 0.95724958, + "learning_rate": 3.929965805687474e-06, + "loss": 0.97880065, + "num_input_tokens_seen": 40290660, + "router_z_loss_clip": 3.90429688, + "router_z_loss_mlp": 0.44384766, + "step": 1862, + "time_per_iteration": 2.657116413116455 + }, + { + "auxiliary_loss_clip": 0.01715665, + "auxiliary_loss_mlp": 0.00463638, + "balance_loss_clip": 1.30590034, + "balance_loss_mlp": 0.41898268, + "epoch": 0.11200961972042688, + "flos": 25153880728320.0, + "grad_norm": 441.5929409415987, + "language_loss": 0.94876027, + "learning_rate": 3.92986360831752e-06, + "loss": 0.97055328, + "num_input_tokens_seen": 40307820, + "router_z_loss_clip": 4.09960938, + "router_z_loss_mlp": 0.4465332, + "step": 1863, + "time_per_iteration": 2.658836841583252 + }, + { + "auxiliary_loss_clip": 0.01763403, + "auxiliary_loss_mlp": 0.00459417, + "balance_loss_clip": 1.33859301, + "balance_loss_mlp": 0.4129256, + "epoch": 0.11206974297309484, + "flos": 21288312829440.0, + "grad_norm": 41.20300317362616, + "language_loss": 0.70874095, + "learning_rate": 3.929761337766945e-06, + "loss": 0.73096919, + "num_input_tokens_seen": 40327430, + "router_z_loss_clip": 4.2578125, + "router_z_loss_mlp": 0.46508789, + "step": 1864, + "time_per_iteration": 2.616436243057251 + }, + { + "auxiliary_loss_clip": 0.01775251, + "auxiliary_loss_mlp": 0.00577093, + "balance_loss_clip": 1.33516932, + "balance_loss_mlp": 0.52504623, + "epoch": 0.11212986622576282, + "flos": 18915982151040.0, + "grad_norm": 35.183866966243826, + "language_loss": 0.80569696, + "learning_rate": 3.929658994039627e-06, + "loss": 0.82922041, + "num_input_tokens_seen": 40344545, + "router_z_loss_clip": 4.3984375, + "router_z_loss_mlp": 0.52075195, + "step": 1865, + "time_per_iteration": 2.7222490310668945 + }, + { + "auxiliary_loss_clip": 0.0178983, + "auxiliary_loss_mlp": 0.0056548, + "balance_loss_clip": 1.33985639, + "balance_loss_mlp": 0.50868839, + "epoch": 0.11218998947843078, + "flos": 22054754257920.0, + "grad_norm": 73.71999564243977, + "language_loss": 0.94617701, + "learning_rate": 3.929556577139446e-06, + "loss": 0.96973008, + "num_input_tokens_seen": 40362300, + "router_z_loss_clip": 4.49609375, + "router_z_loss_mlp": 0.56884766, + "step": 1866, + "time_per_iteration": 2.740328788757324 + }, + { + "auxiliary_loss_clip": 0.01749259, + "auxiliary_loss_mlp": 0.00563376, + "balance_loss_clip": 1.32157123, + "balance_loss_mlp": 0.50603616, + "epoch": 0.11225011273109875, + "flos": 24571697091840.0, + "grad_norm": 12.458967143221573, + "language_loss": 0.8782382, + "learning_rate": 3.929454087070286e-06, + "loss": 0.90136456, + "num_input_tokens_seen": 40384720, + "router_z_loss_clip": 4.27734375, + "router_z_loss_mlp": 0.57299805, + "step": 1867, + "time_per_iteration": 2.735369920730591 + }, + { + "auxiliary_loss_clip": 0.01803705, + "auxiliary_loss_mlp": 0.00568062, + "balance_loss_clip": 1.34728563, + "balance_loss_mlp": 0.51391745, + "epoch": 0.11231023598376672, + "flos": 28438665621120.0, + "grad_norm": 19.127531992181996, + "language_loss": 0.92490119, + "learning_rate": 3.929351523836035e-06, + "loss": 0.94861877, + "num_input_tokens_seen": 40404000, + "router_z_loss_clip": 4.5625, + "router_z_loss_mlp": 0.54150391, + "step": 1868, + "time_per_iteration": 2.736497163772583 + }, + { + "auxiliary_loss_clip": 0.01778644, + "auxiliary_loss_mlp": 0.00575402, + "balance_loss_clip": 1.34154415, + "balance_loss_mlp": 0.52395177, + "epoch": 0.1123703592364347, + "flos": 14426466076800.0, + "grad_norm": 6.254195777842118, + "language_loss": 0.75587708, + "learning_rate": 3.9292488874405795e-06, + "loss": 0.77941757, + "num_input_tokens_seen": 40418665, + "router_z_loss_clip": 4.3671875, + "router_z_loss_mlp": 0.51391602, + "step": 1869, + "time_per_iteration": 2.5943546295166016 + }, + { + "auxiliary_loss_clip": 0.01761751, + "auxiliary_loss_mlp": 0.00573342, + "balance_loss_clip": 1.32416689, + "balance_loss_mlp": 0.51802856, + "epoch": 0.11243048248910266, + "flos": 22236282616320.0, + "grad_norm": 12.641171635282825, + "language_loss": 0.83496541, + "learning_rate": 3.929146177887814e-06, + "loss": 0.8583163, + "num_input_tokens_seen": 40437870, + "router_z_loss_clip": 4.37890625, + "router_z_loss_mlp": 0.55297852, + "step": 1870, + "time_per_iteration": 2.7942755222320557 + }, + { + "auxiliary_loss_clip": 0.01755736, + "auxiliary_loss_mlp": 0.00585868, + "balance_loss_clip": 1.32654965, + "balance_loss_mlp": 0.52788508, + "epoch": 0.11249060574177062, + "flos": 18584167288320.0, + "grad_norm": 10.180587496262952, + "language_loss": 0.85909432, + "learning_rate": 3.929043395181631e-06, + "loss": 0.8825103, + "num_input_tokens_seen": 40455570, + "router_z_loss_clip": 4.296875, + "router_z_loss_mlp": 0.58007812, + "step": 1871, + "time_per_iteration": 2.6427886486053467 + }, + { + "auxiliary_loss_clip": 0.01754863, + "auxiliary_loss_mlp": 0.00633823, + "balance_loss_clip": 1.32364035, + "balance_loss_mlp": 0.57641137, + "epoch": 0.1125507289944386, + "flos": 22856567604480.0, + "grad_norm": 4.382185254390835, + "language_loss": 0.88828522, + "learning_rate": 3.928940539325929e-06, + "loss": 0.91217208, + "num_input_tokens_seen": 40473600, + "router_z_loss_clip": 4.3125, + "router_z_loss_mlp": 0.57348633, + "step": 1872, + "time_per_iteration": 2.7204928398132324 + }, + { + "auxiliary_loss_clip": 0.01759948, + "auxiliary_loss_mlp": 0.00627755, + "balance_loss_clip": 1.3266809, + "balance_loss_mlp": 0.5666244, + "epoch": 0.11261085224710657, + "flos": 19676390094720.0, + "grad_norm": 18.195923961961665, + "language_loss": 0.87984651, + "learning_rate": 3.9288376103246095e-06, + "loss": 0.9037236, + "num_input_tokens_seen": 40490025, + "router_z_loss_clip": 4.33203125, + "router_z_loss_mlp": 0.61132812, + "step": 1873, + "time_per_iteration": 2.6613609790802 + }, + { + "auxiliary_loss_clip": 0.0175678, + "auxiliary_loss_mlp": 0.00652545, + "balance_loss_clip": 1.32274449, + "balance_loss_mlp": 0.58898246, + "epoch": 0.11267097549977453, + "flos": 26063246373120.0, + "grad_norm": 113.09618485303261, + "language_loss": 0.9674232, + "learning_rate": 3.928734608181575e-06, + "loss": 0.99151647, + "num_input_tokens_seen": 40511580, + "router_z_loss_clip": 4.33984375, + "router_z_loss_mlp": 0.63623047, + "step": 1874, + "time_per_iteration": 2.7091379165649414 + }, + { + "auxiliary_loss_clip": 0.01720419, + "auxiliary_loss_mlp": 0.00590552, + "balance_loss_clip": 1.30049121, + "balance_loss_mlp": 0.53426129, + "epoch": 0.11273109875244251, + "flos": 21068036674560.0, + "grad_norm": 4.882780919674207, + "language_loss": 0.79103369, + "learning_rate": 3.928631532900729e-06, + "loss": 0.81414342, + "num_input_tokens_seen": 40530155, + "router_z_loss_clip": 4.19921875, + "router_z_loss_mlp": 0.56298828, + "step": 1875, + "time_per_iteration": 2.6748838424682617 + }, + { + "auxiliary_loss_clip": 0.01755929, + "auxiliary_loss_mlp": 0.00657582, + "balance_loss_clip": 1.3427825, + "balance_loss_mlp": 0.60045737, + "epoch": 0.11279122200511048, + "flos": 27088999061760.0, + "grad_norm": 118.5655034890092, + "language_loss": 0.77524608, + "learning_rate": 3.928528384485984e-06, + "loss": 0.79938114, + "num_input_tokens_seen": 40549500, + "router_z_loss_clip": 4.12890625, + "router_z_loss_mlp": 0.57202148, + "step": 1876, + "time_per_iteration": 5.505356550216675 + }, + { + "auxiliary_loss_clip": 0.01728044, + "auxiliary_loss_mlp": 0.00613004, + "balance_loss_clip": 1.3276279, + "balance_loss_mlp": 0.55721432, + "epoch": 0.11285134525777844, + "flos": 20187901722240.0, + "grad_norm": 15.4388596520176, + "language_loss": 0.82620615, + "learning_rate": 3.9284251629412475e-06, + "loss": 0.84961665, + "num_input_tokens_seen": 40567475, + "router_z_loss_clip": 4.00195312, + "router_z_loss_mlp": 0.55834961, + "step": 1877, + "time_per_iteration": 2.650360584259033 + }, + { + "auxiliary_loss_clip": 0.01717025, + "auxiliary_loss_mlp": 0.00674631, + "balance_loss_clip": 1.3061887, + "balance_loss_mlp": 0.61454976, + "epoch": 0.11291146851044641, + "flos": 12458453863680.0, + "grad_norm": 75.81159603542575, + "language_loss": 0.95667845, + "learning_rate": 3.928321868270436e-06, + "loss": 0.98059499, + "num_input_tokens_seen": 40583280, + "router_z_loss_clip": 4.1015625, + "router_z_loss_mlp": 0.60107422, + "step": 1878, + "time_per_iteration": 2.794698715209961 + }, + { + "auxiliary_loss_clip": 0.01724233, + "auxiliary_loss_mlp": 0.00619817, + "balance_loss_clip": 1.32018054, + "balance_loss_mlp": 0.56347901, + "epoch": 0.11297159176311439, + "flos": 23842315520640.0, + "grad_norm": 18.391473131070985, + "language_loss": 0.90035439, + "learning_rate": 3.928218500477466e-06, + "loss": 0.92379487, + "num_input_tokens_seen": 40603080, + "router_z_loss_clip": 4.03320312, + "router_z_loss_mlp": 0.56323242, + "step": 1879, + "time_per_iteration": 4.118160009384155 + }, + { + "auxiliary_loss_clip": 0.01733838, + "auxiliary_loss_mlp": 0.00702556, + "balance_loss_clip": 1.32535028, + "balance_loss_mlp": 0.63765836, + "epoch": 0.11303171501578235, + "flos": 29930538124800.0, + "grad_norm": 64.92923010692152, + "language_loss": 0.77249712, + "learning_rate": 3.928115059566259e-06, + "loss": 0.79686105, + "num_input_tokens_seen": 40623255, + "router_z_loss_clip": 4.08007812, + "router_z_loss_mlp": 0.64868164, + "step": 1880, + "time_per_iteration": 2.703927516937256 + }, + { + "auxiliary_loss_clip": 0.01743003, + "auxiliary_loss_mlp": 0.00640585, + "balance_loss_clip": 1.33831167, + "balance_loss_mlp": 0.58355534, + "epoch": 0.11309183826845032, + "flos": 16180558842240.0, + "grad_norm": 26.446195002159403, + "language_loss": 0.78471828, + "learning_rate": 3.928011545540734e-06, + "loss": 0.80855417, + "num_input_tokens_seen": 40641570, + "router_z_loss_clip": 4.046875, + "router_z_loss_mlp": 0.57006836, + "step": 1881, + "time_per_iteration": 4.1128716468811035 + }, + { + "auxiliary_loss_clip": 0.01757191, + "auxiliary_loss_mlp": 0.0065549, + "balance_loss_clip": 1.33913684, + "balance_loss_mlp": 0.59235662, + "epoch": 0.1131519615211183, + "flos": 12020702814720.0, + "grad_norm": 51.601637197549564, + "language_loss": 0.81918985, + "learning_rate": 3.927907958404819e-06, + "loss": 0.84331667, + "num_input_tokens_seen": 40658775, + "router_z_loss_clip": 4.18359375, + "router_z_loss_mlp": 0.63110352, + "step": 1882, + "time_per_iteration": 2.617844820022583 + }, + { + "auxiliary_loss_clip": 0.01741314, + "auxiliary_loss_mlp": 0.00641319, + "balance_loss_clip": 1.34026408, + "balance_loss_mlp": 0.57809085, + "epoch": 0.11321208477378626, + "flos": 26250125857920.0, + "grad_norm": 8.47375127554542, + "language_loss": 0.8819741, + "learning_rate": 3.92780429816244e-06, + "loss": 0.90580046, + "num_input_tokens_seen": 40679555, + "router_z_loss_clip": 4.00585938, + "router_z_loss_mlp": 0.63183594, + "step": 1883, + "time_per_iteration": 2.688058376312256 + }, + { + "auxiliary_loss_clip": 0.01720172, + "auxiliary_loss_mlp": 0.00604242, + "balance_loss_clip": 1.32307911, + "balance_loss_mlp": 0.54726005, + "epoch": 0.11327220802645423, + "flos": 13626376583040.0, + "grad_norm": 15.547420083553092, + "language_loss": 0.85141528, + "learning_rate": 3.927700564817529e-06, + "loss": 0.87465948, + "num_input_tokens_seen": 40697295, + "router_z_loss_clip": 3.97460938, + "router_z_loss_mlp": 0.56982422, + "step": 1884, + "time_per_iteration": 2.6467769145965576 + }, + { + "auxiliary_loss_clip": 0.01853828, + "auxiliary_loss_mlp": 0.00428225, + "balance_loss_clip": 1.50167513, + "balance_loss_mlp": 0.40209466, + "epoch": 0.1133323312791222, + "flos": 57191802814080.0, + "grad_norm": 0.7989833517751878, + "language_loss": 0.55413187, + "learning_rate": 3.927596758374019e-06, + "loss": 0.5769524, + "num_input_tokens_seen": 40758095, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 0.26171875, + "step": 1885, + "time_per_iteration": 3.01548433303833 + }, + { + "auxiliary_loss_clip": 0.01714426, + "auxiliary_loss_mlp": 0.00627603, + "balance_loss_clip": 1.32890725, + "balance_loss_mlp": 0.57138395, + "epoch": 0.11339245453179017, + "flos": 24351708245760.0, + "grad_norm": 104.7832019248302, + "language_loss": 0.96166122, + "learning_rate": 3.927492878835848e-06, + "loss": 0.98508149, + "num_input_tokens_seen": 40777140, + "router_z_loss_clip": 3.8515625, + "router_z_loss_mlp": 0.56201172, + "step": 1886, + "time_per_iteration": 2.657252073287964 + }, + { + "auxiliary_loss_clip": 0.0171322, + "auxiliary_loss_mlp": 0.00609814, + "balance_loss_clip": 1.33694005, + "balance_loss_mlp": 0.55609787, + "epoch": 0.11345257778445814, + "flos": 22670693700480.0, + "grad_norm": 23.56868225058732, + "language_loss": 0.90837932, + "learning_rate": 3.927388926206953e-06, + "loss": 0.93160963, + "num_input_tokens_seen": 40797505, + "router_z_loss_clip": 3.75976562, + "router_z_loss_mlp": 0.53735352, + "step": 1887, + "time_per_iteration": 2.655362129211426 + }, + { + "auxiliary_loss_clip": 0.01700472, + "auxiliary_loss_mlp": 0.00644564, + "balance_loss_clip": 1.32485676, + "balance_loss_mlp": 0.58615154, + "epoch": 0.11351270103712612, + "flos": 20988242611200.0, + "grad_norm": 5.144868233175181, + "language_loss": 0.86318523, + "learning_rate": 3.927284900491277e-06, + "loss": 0.8866356, + "num_input_tokens_seen": 40812970, + "router_z_loss_clip": 3.75976562, + "router_z_loss_mlp": 0.58496094, + "step": 1888, + "time_per_iteration": 2.6479663848876953 + }, + { + "auxiliary_loss_clip": 0.01691717, + "auxiliary_loss_mlp": 0.00617442, + "balance_loss_clip": 1.32095528, + "balance_loss_mlp": 0.55931532, + "epoch": 0.11357282428979408, + "flos": 37347923600640.0, + "grad_norm": 4.053085303352004, + "language_loss": 0.73163694, + "learning_rate": 3.927180801692764e-06, + "loss": 0.75472856, + "num_input_tokens_seen": 40837745, + "router_z_loss_clip": 3.70507812, + "router_z_loss_mlp": 0.58154297, + "step": 1889, + "time_per_iteration": 2.842867851257324 + }, + { + "auxiliary_loss_clip": 0.01711575, + "auxiliary_loss_mlp": 0.00616312, + "balance_loss_clip": 1.34253609, + "balance_loss_mlp": 0.55911577, + "epoch": 0.11363294754246205, + "flos": 21757018423680.0, + "grad_norm": 5.719702482591755, + "language_loss": 0.8865037, + "learning_rate": 3.927076629815362e-06, + "loss": 0.90978259, + "num_input_tokens_seen": 40856490, + "router_z_loss_clip": 3.69140625, + "router_z_loss_mlp": 0.57250977, + "step": 1890, + "time_per_iteration": 2.7731456756591797 + }, + { + "auxiliary_loss_clip": 0.01707824, + "auxiliary_loss_mlp": 0.00643546, + "balance_loss_clip": 1.33679557, + "balance_loss_mlp": 0.58508623, + "epoch": 0.11369307079513001, + "flos": 22601637803520.0, + "grad_norm": 18.007245552310994, + "language_loss": 0.7180357, + "learning_rate": 3.926972384863022e-06, + "loss": 0.74154943, + "num_input_tokens_seen": 40874070, + "router_z_loss_clip": 3.71289062, + "router_z_loss_mlp": 0.5847168, + "step": 1891, + "time_per_iteration": 2.667884111404419 + }, + { + "auxiliary_loss_clip": 0.01709949, + "auxiliary_loss_mlp": 0.00653874, + "balance_loss_clip": 1.33724332, + "balance_loss_mlp": 0.58935803, + "epoch": 0.11375319404779799, + "flos": 21944257044480.0, + "grad_norm": 82.69689388177135, + "language_loss": 0.93715262, + "learning_rate": 3.9268680668396956e-06, + "loss": 0.96079087, + "num_input_tokens_seen": 40892425, + "router_z_loss_clip": 3.72851562, + "router_z_loss_mlp": 0.64501953, + "step": 1892, + "time_per_iteration": 2.671290159225464 + }, + { + "auxiliary_loss_clip": 0.01704305, + "auxiliary_loss_mlp": 0.00636287, + "balance_loss_clip": 1.32284474, + "balance_loss_mlp": 0.57000637, + "epoch": 0.11381331730046595, + "flos": 26395456285440.0, + "grad_norm": 5.3375868111432805, + "language_loss": 0.80255097, + "learning_rate": 3.926763675749339e-06, + "loss": 0.82595688, + "num_input_tokens_seen": 40912190, + "router_z_loss_clip": 3.81640625, + "router_z_loss_mlp": 0.66308594, + "step": 1893, + "time_per_iteration": 2.6738500595092773 + }, + { + "auxiliary_loss_clip": 0.01689084, + "auxiliary_loss_mlp": 0.00627885, + "balance_loss_clip": 1.3247788, + "balance_loss_mlp": 0.56541908, + "epoch": 0.11387344055313392, + "flos": 23804716959360.0, + "grad_norm": 23.366490131309735, + "language_loss": 0.85045999, + "learning_rate": 3.92665921159591e-06, + "loss": 0.87362969, + "num_input_tokens_seen": 40928395, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 0.625, + "step": 1894, + "time_per_iteration": 2.6925628185272217 + }, + { + "auxiliary_loss_clip": 0.01688916, + "auxiliary_loss_mlp": 0.00613962, + "balance_loss_clip": 1.32583427, + "balance_loss_mlp": 0.55650365, + "epoch": 0.1139335638058019, + "flos": 34522865902080.0, + "grad_norm": 15.132480094852998, + "language_loss": 0.86617565, + "learning_rate": 3.926554674383371e-06, + "loss": 0.88920438, + "num_input_tokens_seen": 40946555, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 0.57421875, + "step": 1895, + "time_per_iteration": 2.763258934020996 + }, + { + "auxiliary_loss_clip": 0.01990852, + "auxiliary_loss_mlp": 0.00241221, + "balance_loss_clip": 1.64146852, + "balance_loss_mlp": 0.22081225, + "epoch": 0.11399368705846986, + "flos": 70587811520640.0, + "grad_norm": 0.7855820347153732, + "language_loss": 0.6307497, + "learning_rate": 3.926450064115686e-06, + "loss": 0.65307045, + "num_input_tokens_seen": 41004910, + "router_z_loss_clip": 3.5, + "router_z_loss_mlp": 0.20410156, + "step": 1896, + "time_per_iteration": 3.2090044021606445 + }, + { + "auxiliary_loss_clip": 0.01671959, + "auxiliary_loss_mlp": 0.0055691, + "balance_loss_clip": 1.3175621, + "balance_loss_mlp": 0.50212157, + "epoch": 0.11405381031113783, + "flos": 21324259365120.0, + "grad_norm": 9.64710605543728, + "language_loss": 0.88512725, + "learning_rate": 3.926345380796821e-06, + "loss": 0.90741593, + "num_input_tokens_seen": 41026385, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 0.54833984, + "step": 1897, + "time_per_iteration": 2.728876829147339 + }, + { + "auxiliary_loss_clip": 0.0168213, + "auxiliary_loss_mlp": 0.00602323, + "balance_loss_clip": 1.32743144, + "balance_loss_mlp": 0.54500711, + "epoch": 0.11411393356380581, + "flos": 19719627091200.0, + "grad_norm": 19.42738256911801, + "language_loss": 0.86541563, + "learning_rate": 3.9262406244307465e-06, + "loss": 0.88826019, + "num_input_tokens_seen": 41045315, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 0.57324219, + "step": 1898, + "time_per_iteration": 2.7851269245147705 + }, + { + "auxiliary_loss_clip": 0.0166109, + "auxiliary_loss_mlp": 0.00588624, + "balance_loss_clip": 1.30676842, + "balance_loss_mlp": 0.52892387, + "epoch": 0.11417405681647377, + "flos": 17530440883200.0, + "grad_norm": 12.695465204799858, + "language_loss": 0.81557614, + "learning_rate": 3.926135795021435e-06, + "loss": 0.83807325, + "num_input_tokens_seen": 41063390, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 0.59643555, + "step": 1899, + "time_per_iteration": 2.766625165939331 + }, + { + "auxiliary_loss_clip": 0.01928492, + "auxiliary_loss_mlp": 0.00254966, + "balance_loss_clip": 1.60081649, + "balance_loss_mlp": 0.23617911, + "epoch": 0.11423418006914174, + "flos": 59674666619520.0, + "grad_norm": 0.9078883697283056, + "language_loss": 0.6346494, + "learning_rate": 3.92603089257286e-06, + "loss": 0.65648389, + "num_input_tokens_seen": 41124180, + "router_z_loss_clip": 3.28125, + "router_z_loss_mlp": 0.1875, + "step": 1900, + "time_per_iteration": 3.0836589336395264 + }, + { + "auxiliary_loss_clip": 0.01618102, + "auxiliary_loss_mlp": 0.00524626, + "balance_loss_clip": 1.28376675, + "balance_loss_mlp": 0.47391483, + "epoch": 0.1142943033218097, + "flos": 22963114321920.0, + "grad_norm": 13.493780917059022, + "language_loss": 0.82974625, + "learning_rate": 3.925925917089001e-06, + "loss": 0.85117352, + "num_input_tokens_seen": 41143485, + "router_z_loss_clip": 3.34179688, + "router_z_loss_mlp": 0.50732422, + "step": 1901, + "time_per_iteration": 2.716153621673584 + }, + { + "auxiliary_loss_clip": 0.0162837, + "auxiliary_loss_mlp": 0.00571108, + "balance_loss_clip": 1.29270256, + "balance_loss_mlp": 0.5137918, + "epoch": 0.11435442657447768, + "flos": 18256267008000.0, + "grad_norm": 31.748997825112514, + "language_loss": 0.90583575, + "learning_rate": 3.925820868573839e-06, + "loss": 0.92783058, + "num_input_tokens_seen": 41161695, + "router_z_loss_clip": 3.35546875, + "router_z_loss_mlp": 0.57324219, + "step": 1902, + "time_per_iteration": 2.646496534347534 + }, + { + "auxiliary_loss_clip": 0.01646353, + "auxiliary_loss_mlp": 0.00571692, + "balance_loss_clip": 1.29822576, + "balance_loss_mlp": 0.51342291, + "epoch": 0.11441454982714565, + "flos": 24061191045120.0, + "grad_norm": 54.892278907975, + "language_loss": 0.8338443, + "learning_rate": 3.925715747031356e-06, + "loss": 0.85602474, + "num_input_tokens_seen": 41181715, + "router_z_loss_clip": 3.47851562, + "router_z_loss_mlp": 0.58276367, + "step": 1903, + "time_per_iteration": 2.696735382080078 + }, + { + "auxiliary_loss_clip": 0.01636497, + "auxiliary_loss_mlp": 0.00553512, + "balance_loss_clip": 1.29948294, + "balance_loss_mlp": 0.50232333, + "epoch": 0.11447467307981361, + "flos": 25337707557120.0, + "grad_norm": 770.788206538944, + "language_loss": 0.82237184, + "learning_rate": 3.925610552465539e-06, + "loss": 0.8442719, + "num_input_tokens_seen": 41201770, + "router_z_loss_clip": 3.36914062, + "router_z_loss_mlp": 0.51245117, + "step": 1904, + "time_per_iteration": 2.6629276275634766 + }, + { + "auxiliary_loss_clip": 0.01649494, + "auxiliary_loss_mlp": 0.00581103, + "balance_loss_clip": 1.31106985, + "balance_loss_mlp": 0.52426434, + "epoch": 0.11453479633248159, + "flos": 21726063878400.0, + "grad_norm": 6.886829512253228, + "language_loss": 0.98280287, + "learning_rate": 3.9255052848803764e-06, + "loss": 1.00510895, + "num_input_tokens_seen": 41220590, + "router_z_loss_clip": 3.38671875, + "router_z_loss_mlp": 0.56860352, + "step": 1905, + "time_per_iteration": 2.6310532093048096 + }, + { + "auxiliary_loss_clip": 0.01637985, + "auxiliary_loss_mlp": 0.006069, + "balance_loss_clip": 1.28378057, + "balance_loss_mlp": 0.54362321, + "epoch": 0.11459491958514956, + "flos": 12969714096000.0, + "grad_norm": 15.94529979519172, + "language_loss": 0.85887551, + "learning_rate": 3.925399944279861e-06, + "loss": 0.88132441, + "num_input_tokens_seen": 41237250, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 0.6328125, + "step": 1906, + "time_per_iteration": 2.5976357460021973 + }, + { + "auxiliary_loss_clip": 0.01638647, + "auxiliary_loss_mlp": 0.0057, + "balance_loss_clip": 1.29434359, + "balance_loss_mlp": 0.51478261, + "epoch": 0.11465504283781752, + "flos": 22711273090560.0, + "grad_norm": 14.39447499876485, + "language_loss": 0.89750141, + "learning_rate": 3.925294530667986e-06, + "loss": 0.91958791, + "num_input_tokens_seen": 41256680, + "router_z_loss_clip": 3.44335938, + "router_z_loss_mlp": 0.55151367, + "step": 1907, + "time_per_iteration": 2.6454720497131348 + }, + { + "auxiliary_loss_clip": 0.01652733, + "auxiliary_loss_mlp": 0.00576222, + "balance_loss_clip": 1.30351698, + "balance_loss_mlp": 0.51790476, + "epoch": 0.1147151660904855, + "flos": 23398387332480.0, + "grad_norm": 95.03590844576158, + "language_loss": 0.90662968, + "learning_rate": 3.92518904404875e-06, + "loss": 0.92891932, + "num_input_tokens_seen": 41270955, + "router_z_loss_clip": 3.49414062, + "router_z_loss_mlp": 0.58325195, + "step": 1908, + "time_per_iteration": 2.6210763454437256 + }, + { + "auxiliary_loss_clip": 0.01822972, + "auxiliary_loss_mlp": 0.00216322, + "balance_loss_clip": 1.52485347, + "balance_loss_mlp": 0.19810694, + "epoch": 0.11477528934315347, + "flos": 63011843498880.0, + "grad_norm": 0.9589290721778688, + "language_loss": 0.60801494, + "learning_rate": 3.925083484426153e-06, + "loss": 0.62840784, + "num_input_tokens_seen": 41319180, + "router_z_loss_clip": 2.984375, + "router_z_loss_mlp": 0.18261719, + "step": 1909, + "time_per_iteration": 2.8814597129821777 + }, + { + "auxiliary_loss_clip": 0.01658055, + "auxiliary_loss_mlp": 0.00563737, + "balance_loss_clip": 1.31287205, + "balance_loss_mlp": 0.50732672, + "epoch": 0.11483541259582143, + "flos": 16325601960960.0, + "grad_norm": 4.404829145885058, + "language_loss": 0.84660721, + "learning_rate": 3.924977851804197e-06, + "loss": 0.86882514, + "num_input_tokens_seen": 41337480, + "router_z_loss_clip": 3.45117188, + "router_z_loss_mlp": 0.56420898, + "step": 1910, + "time_per_iteration": 2.6967506408691406 + }, + { + "auxiliary_loss_clip": 0.01657105, + "auxiliary_loss_mlp": 0.00565699, + "balance_loss_clip": 1.30532002, + "balance_loss_mlp": 0.51112461, + "epoch": 0.1148955358484894, + "flos": 21580410228480.0, + "grad_norm": 3.520374691284433, + "language_loss": 0.82293338, + "learning_rate": 3.9248721461868875e-06, + "loss": 0.84516138, + "num_input_tokens_seen": 41354650, + "router_z_loss_clip": 3.515625, + "router_z_loss_mlp": 0.54541016, + "step": 1911, + "time_per_iteration": 2.7137577533721924 + }, + { + "auxiliary_loss_clip": 0.01654663, + "auxiliary_loss_mlp": 0.00522755, + "balance_loss_clip": 1.31361187, + "balance_loss_mlp": 0.4730683, + "epoch": 0.11495565910115738, + "flos": 27673696650240.0, + "grad_norm": 19.24418539694855, + "language_loss": 0.83476162, + "learning_rate": 3.9247663675782336e-06, + "loss": 0.85653585, + "num_input_tokens_seen": 41376935, + "router_z_loss_clip": 3.41210938, + "router_z_loss_mlp": 0.49707031, + "step": 1912, + "time_per_iteration": 2.7902281284332275 + }, + { + "auxiliary_loss_clip": 0.01650601, + "auxiliary_loss_mlp": 0.00515653, + "balance_loss_clip": 1.3058989, + "balance_loss_mlp": 0.46303397, + "epoch": 0.11501578235382534, + "flos": 20632368614400.0, + "grad_norm": 13.742813178946793, + "language_loss": 0.85280716, + "learning_rate": 3.924660515982246e-06, + "loss": 0.87446976, + "num_input_tokens_seen": 41396105, + "router_z_loss_clip": 3.44726562, + "router_z_loss_mlp": 0.52563477, + "step": 1913, + "time_per_iteration": 2.72263503074646 + }, + { + "auxiliary_loss_clip": 0.0166418, + "auxiliary_loss_mlp": 0.00504654, + "balance_loss_clip": 1.31831813, + "balance_loss_mlp": 0.45275012, + "epoch": 0.1150759056064933, + "flos": 19829046896640.0, + "grad_norm": 94.04147068244441, + "language_loss": 0.75963807, + "learning_rate": 3.924554591402939e-06, + "loss": 0.78132641, + "num_input_tokens_seen": 41415600, + "router_z_loss_clip": 3.4609375, + "router_z_loss_mlp": 0.51928711, + "step": 1914, + "time_per_iteration": 2.705019235610962 + }, + { + "auxiliary_loss_clip": 0.01787993, + "auxiliary_loss_mlp": 0.00353935, + "balance_loss_clip": 1.4968679, + "balance_loss_mlp": 0.33056957, + "epoch": 0.11513602885916129, + "flos": 70045776311040.0, + "grad_norm": 1.7776036246784046, + "language_loss": 0.60993826, + "learning_rate": 3.92444859384433e-06, + "loss": 0.63135749, + "num_input_tokens_seen": 41478760, + "router_z_loss_clip": 2.90625, + "router_z_loss_mlp": 0.23339844, + "step": 1915, + "time_per_iteration": 3.258871555328369 + }, + { + "auxiliary_loss_clip": 0.0165776, + "auxiliary_loss_mlp": 0.00604279, + "balance_loss_clip": 1.30976987, + "balance_loss_mlp": 0.54703414, + "epoch": 0.11519615211182925, + "flos": 15741730385280.0, + "grad_norm": 71.99436300110091, + "language_loss": 0.99692947, + "learning_rate": 3.924342523310436e-06, + "loss": 1.01954985, + "num_input_tokens_seen": 41495720, + "router_z_loss_clip": 3.48046875, + "router_z_loss_mlp": 0.57275391, + "step": 1916, + "time_per_iteration": 2.666714668273926 + }, + { + "auxiliary_loss_clip": 0.01651976, + "auxiliary_loss_mlp": 0.00580742, + "balance_loss_clip": 1.29288077, + "balance_loss_mlp": 0.52347374, + "epoch": 0.11525627536449722, + "flos": 20667632791680.0, + "grad_norm": 2.32259654164924, + "language_loss": 0.78835481, + "learning_rate": 3.9242363798052806e-06, + "loss": 0.81068206, + "num_input_tokens_seen": 41513585, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 0.57275391, + "step": 1917, + "time_per_iteration": 2.782750129699707 + }, + { + "auxiliary_loss_clip": 0.01668259, + "auxiliary_loss_mlp": 0.00625637, + "balance_loss_clip": 1.31655335, + "balance_loss_mlp": 0.56827319, + "epoch": 0.1153163986171652, + "flos": 20303283185280.0, + "grad_norm": 217.8253763216717, + "language_loss": 0.80325037, + "learning_rate": 3.92413016333289e-06, + "loss": 0.8261894, + "num_input_tokens_seen": 41533390, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 0.57348633, + "step": 1918, + "time_per_iteration": 4.023033380508423 + }, + { + "auxiliary_loss_clip": 0.01673993, + "auxiliary_loss_mlp": 0.00568665, + "balance_loss_clip": 1.31614339, + "balance_loss_mlp": 0.51747644, + "epoch": 0.11537652186983316, + "flos": 17639321984640.0, + "grad_norm": 6.428376963377782, + "language_loss": 0.92650265, + "learning_rate": 3.92402387389729e-06, + "loss": 0.94892919, + "num_input_tokens_seen": 41551015, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 0.51196289, + "step": 1919, + "time_per_iteration": 4.039808750152588 + }, + { + "auxiliary_loss_clip": 0.01662466, + "auxiliary_loss_mlp": 0.00585124, + "balance_loss_clip": 1.30723834, + "balance_loss_mlp": 0.53171861, + "epoch": 0.11543664512250112, + "flos": 21069401391360.0, + "grad_norm": 5.263475273263496, + "language_loss": 0.91403365, + "learning_rate": 3.923917511502512e-06, + "loss": 0.93650955, + "num_input_tokens_seen": 41568055, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 0.53393555, + "step": 1920, + "time_per_iteration": 2.697622537612915 + }, + { + "auxiliary_loss_clip": 0.0167394, + "auxiliary_loss_mlp": 0.0060343, + "balance_loss_clip": 1.3223418, + "balance_loss_mlp": 0.55128783, + "epoch": 0.11549676837516909, + "flos": 22747542848640.0, + "grad_norm": 15.797380629692773, + "language_loss": 0.85608256, + "learning_rate": 3.923811076152589e-06, + "loss": 0.8788563, + "num_input_tokens_seen": 41587435, + "router_z_loss_clip": 3.51757812, + "router_z_loss_mlp": 0.52148438, + "step": 1921, + "time_per_iteration": 4.066251516342163 + }, + { + "auxiliary_loss_clip": 0.01692994, + "auxiliary_loss_mlp": 0.00685517, + "balance_loss_clip": 1.31157327, + "balance_loss_mlp": 0.62359929, + "epoch": 0.11555689162783707, + "flos": 19168972617600.0, + "grad_norm": 2.4748444860897783, + "language_loss": 0.83602208, + "learning_rate": 3.923704567851557e-06, + "loss": 0.85980713, + "num_input_tokens_seen": 41604975, + "router_z_loss_clip": 3.81054688, + "router_z_loss_mlp": 0.62011719, + "step": 1922, + "time_per_iteration": 2.646246910095215 + }, + { + "auxiliary_loss_clip": 0.01673025, + "auxiliary_loss_mlp": 0.00620342, + "balance_loss_clip": 1.31596959, + "balance_loss_mlp": 0.56428975, + "epoch": 0.11561701488050503, + "flos": 24572056227840.0, + "grad_norm": 4.752854472641086, + "language_loss": 0.88896304, + "learning_rate": 3.923597986603456e-06, + "loss": 0.91189671, + "num_input_tokens_seen": 41626155, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 0.56030273, + "step": 1923, + "time_per_iteration": 4.101696729660034 + }, + { + "auxiliary_loss_clip": 0.0169913, + "auxiliary_loss_mlp": 0.0063469, + "balance_loss_clip": 1.32676733, + "balance_loss_mlp": 0.57398909, + "epoch": 0.115677138133173, + "flos": 17092546179840.0, + "grad_norm": 1679.1180069374766, + "language_loss": 0.87661779, + "learning_rate": 3.9234913324123264e-06, + "loss": 0.89995599, + "num_input_tokens_seen": 41644805, + "router_z_loss_clip": 3.72460938, + "router_z_loss_mlp": 0.60644531, + "step": 1924, + "time_per_iteration": 2.6217706203460693 + }, + { + "auxiliary_loss_clip": 0.01808058, + "auxiliary_loss_mlp": 0.0037216, + "balance_loss_clip": 1.52008367, + "balance_loss_mlp": 0.34412193, + "epoch": 0.11573726138584098, + "flos": 62703875266560.0, + "grad_norm": 0.9361943685531844, + "language_loss": 0.60954827, + "learning_rate": 3.923384605282212e-06, + "loss": 0.63135046, + "num_input_tokens_seen": 41709345, + "router_z_loss_clip": 2.875, + "router_z_loss_mlp": 0.28125, + "step": 1925, + "time_per_iteration": 3.1561315059661865 + }, + { + "auxiliary_loss_clip": 0.01732258, + "auxiliary_loss_mlp": 0.00641434, + "balance_loss_clip": 1.35215044, + "balance_loss_mlp": 0.58233035, + "epoch": 0.11579738463850894, + "flos": 22601135013120.0, + "grad_norm": 8.131277558715853, + "language_loss": 0.80813849, + "learning_rate": 3.923277805217161e-06, + "loss": 0.83187544, + "num_input_tokens_seen": 41730210, + "router_z_loss_clip": 3.80664062, + "router_z_loss_mlp": 0.59106445, + "step": 1926, + "time_per_iteration": 2.7139699459075928 + }, + { + "auxiliary_loss_clip": 0.01771838, + "auxiliary_loss_mlp": 0.00616886, + "balance_loss_clip": 1.36981416, + "balance_loss_mlp": 0.55573165, + "epoch": 0.11585750789117691, + "flos": 21726135705600.0, + "grad_norm": 21.28348240427446, + "language_loss": 0.81719959, + "learning_rate": 3.923170932221222e-06, + "loss": 0.84108686, + "num_input_tokens_seen": 41750270, + "router_z_loss_clip": 4.01367188, + "router_z_loss_mlp": 0.61181641, + "step": 1927, + "time_per_iteration": 2.6903817653656006 + }, + { + "auxiliary_loss_clip": 0.01769326, + "auxiliary_loss_mlp": 0.00656905, + "balance_loss_clip": 1.37348104, + "balance_loss_mlp": 0.59553653, + "epoch": 0.11591763114384489, + "flos": 26287544851200.0, + "grad_norm": 9.400642227139691, + "language_loss": 0.92323732, + "learning_rate": 3.92306398629845e-06, + "loss": 0.94749963, + "num_input_tokens_seen": 41772975, + "router_z_loss_clip": 3.9609375, + "router_z_loss_mlp": 0.61303711, + "step": 1928, + "time_per_iteration": 2.7419395446777344 + }, + { + "auxiliary_loss_clip": 0.01768864, + "auxiliary_loss_mlp": 0.0066234, + "balance_loss_clip": 1.37058663, + "balance_loss_mlp": 0.60063696, + "epoch": 0.11597775439651285, + "flos": 23000461488000.0, + "grad_norm": 2.5664928193416374, + "language_loss": 0.83891857, + "learning_rate": 3.922956967452898e-06, + "loss": 0.86323059, + "num_input_tokens_seen": 41791765, + "router_z_loss_clip": 3.97851562, + "router_z_loss_mlp": 0.61645508, + "step": 1929, + "time_per_iteration": 2.640810966491699 + }, + { + "auxiliary_loss_clip": 0.01761664, + "auxiliary_loss_mlp": 0.00665458, + "balance_loss_clip": 1.37132096, + "balance_loss_mlp": 0.60935843, + "epoch": 0.11603787764918082, + "flos": 31941715507200.0, + "grad_norm": 3.3939980312742817, + "language_loss": 0.82377291, + "learning_rate": 3.922849875688626e-06, + "loss": 0.8480441, + "num_input_tokens_seen": 41815615, + "router_z_loss_clip": 3.90039062, + "router_z_loss_mlp": 0.56054688, + "step": 1930, + "time_per_iteration": 2.759760856628418 + }, + { + "auxiliary_loss_clip": 0.01766848, + "auxiliary_loss_mlp": 0.0062782, + "balance_loss_clip": 1.38046765, + "balance_loss_mlp": 0.57188749, + "epoch": 0.1160980009018488, + "flos": 22271654534400.0, + "grad_norm": 7.956732632522512, + "language_loss": 0.78631318, + "learning_rate": 3.922742711009693e-06, + "loss": 0.81025982, + "num_input_tokens_seen": 41834810, + "router_z_loss_clip": 3.86132812, + "router_z_loss_mlp": 0.55957031, + "step": 1931, + "time_per_iteration": 2.6793019771575928 + }, + { + "auxiliary_loss_clip": 0.01774123, + "auxiliary_loss_mlp": 0.00670236, + "balance_loss_clip": 1.37664711, + "balance_loss_mlp": 0.61091757, + "epoch": 0.11615812415451676, + "flos": 22783633038720.0, + "grad_norm": 8.951706299283444, + "language_loss": 0.87210429, + "learning_rate": 3.922635473420164e-06, + "loss": 0.89654791, + "num_input_tokens_seen": 41854975, + "router_z_loss_clip": 3.96875, + "router_z_loss_mlp": 0.59350586, + "step": 1932, + "time_per_iteration": 2.69088077545166 + }, + { + "auxiliary_loss_clip": 0.01811042, + "auxiliary_loss_mlp": 0.00316513, + "balance_loss_clip": 1.48718071, + "balance_loss_mlp": 0.29171798, + "epoch": 0.11621824740718473, + "flos": 67146096107520.0, + "grad_norm": 0.7796113695804527, + "language_loss": 0.6132291, + "learning_rate": 3.922528162924105e-06, + "loss": 0.63450462, + "num_input_tokens_seen": 41911105, + "router_z_loss_clip": 3.25, + "router_z_loss_mlp": 0.24707031, + "step": 1933, + "time_per_iteration": 3.0618674755096436 + }, + { + "auxiliary_loss_clip": 0.0178179, + "auxiliary_loss_mlp": 0.00627433, + "balance_loss_clip": 1.37709999, + "balance_loss_mlp": 0.57021284, + "epoch": 0.11627837065985269, + "flos": 20375930442240.0, + "grad_norm": 11.83009940899843, + "language_loss": 0.92728949, + "learning_rate": 3.922420779525586e-06, + "loss": 0.95138174, + "num_input_tokens_seen": 41931750, + "router_z_loss_clip": 4.04882812, + "router_z_loss_mlp": 0.57250977, + "step": 1934, + "time_per_iteration": 2.68870210647583 + }, + { + "auxiliary_loss_clip": 0.01790605, + "auxiliary_loss_mlp": 0.00698104, + "balance_loss_clip": 1.37114429, + "balance_loss_mlp": 0.6312272, + "epoch": 0.11633849391252067, + "flos": 21725812483200.0, + "grad_norm": 934.7222755998406, + "language_loss": 0.74774718, + "learning_rate": 3.9223133232286776e-06, + "loss": 0.77263427, + "num_input_tokens_seen": 41949400, + "router_z_loss_clip": 4.19726562, + "router_z_loss_mlp": 0.66796875, + "step": 1935, + "time_per_iteration": 2.694038152694702 + }, + { + "auxiliary_loss_clip": 0.01779276, + "auxiliary_loss_mlp": 0.00668296, + "balance_loss_clip": 1.37529993, + "balance_loss_mlp": 0.60788059, + "epoch": 0.11639861716518864, + "flos": 18805341283200.0, + "grad_norm": 67.00456238394733, + "language_loss": 0.81070447, + "learning_rate": 3.922205794037456e-06, + "loss": 0.83518022, + "num_input_tokens_seen": 41968100, + "router_z_loss_clip": 4.04296875, + "router_z_loss_mlp": 0.60424805, + "step": 1936, + "time_per_iteration": 2.6756670475006104 + }, + { + "auxiliary_loss_clip": 0.01771072, + "auxiliary_loss_mlp": 0.0064326, + "balance_loss_clip": 1.3626188, + "balance_loss_mlp": 0.58556259, + "epoch": 0.1164587404178566, + "flos": 21214983214080.0, + "grad_norm": 7.875149828544808, + "language_loss": 0.908077, + "learning_rate": 3.922098191955998e-06, + "loss": 0.93222022, + "num_input_tokens_seen": 41986375, + "router_z_loss_clip": 4.08984375, + "router_z_loss_mlp": 0.57739258, + "step": 1937, + "time_per_iteration": 2.7393155097961426 + }, + { + "auxiliary_loss_clip": 0.01757528, + "auxiliary_loss_mlp": 0.00644985, + "balance_loss_clip": 1.36324298, + "balance_loss_mlp": 0.58785945, + "epoch": 0.11651886367052458, + "flos": 27818632028160.0, + "grad_norm": 9.316170604279959, + "language_loss": 0.81292868, + "learning_rate": 3.921990516988384e-06, + "loss": 0.83695382, + "num_input_tokens_seen": 42006055, + "router_z_loss_clip": 3.9453125, + "router_z_loss_mlp": 0.57104492, + "step": 1938, + "time_per_iteration": 2.712216377258301 + }, + { + "auxiliary_loss_clip": 0.01780941, + "auxiliary_loss_mlp": 0.00616709, + "balance_loss_clip": 1.37645078, + "balance_loss_mlp": 0.55963147, + "epoch": 0.11657898692319255, + "flos": 22889569224960.0, + "grad_norm": 30.76918893819415, + "language_loss": 0.8424567, + "learning_rate": 3.921882769138696e-06, + "loss": 0.86643314, + "num_input_tokens_seen": 42024995, + "router_z_loss_clip": 4.046875, + "router_z_loss_mlp": 0.57006836, + "step": 1939, + "time_per_iteration": 2.7060909271240234 + }, + { + "auxiliary_loss_clip": 0.0177362, + "auxiliary_loss_mlp": 0.00643073, + "balance_loss_clip": 1.37638533, + "balance_loss_mlp": 0.58795083, + "epoch": 0.11663911017586051, + "flos": 24315905364480.0, + "grad_norm": 132.7882142430819, + "language_loss": 0.91907585, + "learning_rate": 3.9217749484110215e-06, + "loss": 0.94324279, + "num_input_tokens_seen": 42042640, + "router_z_loss_clip": 3.97265625, + "router_z_loss_mlp": 0.55126953, + "step": 1940, + "time_per_iteration": 2.734760046005249 + }, + { + "auxiliary_loss_clip": 0.0178655, + "auxiliary_loss_mlp": 0.00650426, + "balance_loss_clip": 1.39379084, + "balance_loss_mlp": 0.59470731, + "epoch": 0.11669923342852849, + "flos": 42340152470400.0, + "grad_norm": 3.4921947228825023, + "language_loss": 0.81201434, + "learning_rate": 3.921667054809449e-06, + "loss": 0.83638406, + "num_input_tokens_seen": 42067005, + "router_z_loss_clip": 3.92578125, + "router_z_loss_mlp": 0.55688477, + "step": 1941, + "time_per_iteration": 2.9949352741241455 + }, + { + "auxiliary_loss_clip": 0.01788201, + "auxiliary_loss_mlp": 0.00663008, + "balance_loss_clip": 1.38701844, + "balance_loss_mlp": 0.60678911, + "epoch": 0.11675935668119646, + "flos": 14642288945280.0, + "grad_norm": 7.965972463687534, + "language_loss": 0.94957578, + "learning_rate": 3.921559088338068e-06, + "loss": 0.97408783, + "num_input_tokens_seen": 42082295, + "router_z_loss_clip": 4.0078125, + "router_z_loss_mlp": 0.56225586, + "step": 1942, + "time_per_iteration": 2.766167640686035 + }, + { + "auxiliary_loss_clip": 0.01799466, + "auxiliary_loss_mlp": 0.00577087, + "balance_loss_clip": 1.40223622, + "balance_loss_mlp": 0.52873528, + "epoch": 0.11681947993386442, + "flos": 35116470063360.0, + "grad_norm": 3.998693573435623, + "language_loss": 0.73761117, + "learning_rate": 3.921451049000975e-06, + "loss": 0.76137674, + "num_input_tokens_seen": 42105295, + "router_z_loss_clip": 3.97070312, + "router_z_loss_mlp": 0.48339844, + "step": 1943, + "time_per_iteration": 2.8701040744781494 + }, + { + "auxiliary_loss_clip": 0.01788888, + "auxiliary_loss_mlp": 0.00625234, + "balance_loss_clip": 1.38636732, + "balance_loss_mlp": 0.56827545, + "epoch": 0.11687960318653239, + "flos": 38983259024640.0, + "grad_norm": 3.192010214528672, + "language_loss": 0.75866193, + "learning_rate": 3.921342936802265e-06, + "loss": 0.78280318, + "num_input_tokens_seen": 42125520, + "router_z_loss_clip": 4.02734375, + "router_z_loss_mlp": 0.56982422, + "step": 1944, + "time_per_iteration": 2.966899871826172 + }, + { + "auxiliary_loss_clip": 0.01776145, + "auxiliary_loss_mlp": 0.00598364, + "balance_loss_clip": 1.38127017, + "balance_loss_mlp": 0.54722303, + "epoch": 0.11693972643920036, + "flos": 25994980575360.0, + "grad_norm": 9.65528747835021, + "language_loss": 0.87851322, + "learning_rate": 3.921234751746038e-06, + "loss": 0.9022584, + "num_input_tokens_seen": 42146335, + "router_z_loss_clip": 3.94726562, + "router_z_loss_mlp": 0.51196289, + "step": 1945, + "time_per_iteration": 2.672325372695923 + }, + { + "auxiliary_loss_clip": 0.01784969, + "auxiliary_loss_mlp": 0.00570737, + "balance_loss_clip": 1.38640642, + "balance_loss_mlp": 0.52112192, + "epoch": 0.11699984969186833, + "flos": 27272107618560.0, + "grad_norm": 1500.2242084542074, + "language_loss": 0.81017005, + "learning_rate": 3.9211264938363975e-06, + "loss": 0.83372712, + "num_input_tokens_seen": 42165320, + "router_z_loss_clip": 3.98242188, + "router_z_loss_mlp": 0.49658203, + "step": 1946, + "time_per_iteration": 2.752255916595459 + }, + { + "auxiliary_loss_clip": 0.01782918, + "auxiliary_loss_mlp": 0.00582788, + "balance_loss_clip": 1.39419913, + "balance_loss_mlp": 0.53043175, + "epoch": 0.1170599729445363, + "flos": 15267853232640.0, + "grad_norm": 5.313038079143364, + "language_loss": 0.74473661, + "learning_rate": 3.921018163077448e-06, + "loss": 0.76839364, + "num_input_tokens_seen": 42182955, + "router_z_loss_clip": 3.89257812, + "router_z_loss_mlp": 0.52368164, + "step": 1947, + "time_per_iteration": 2.5974977016448975 + }, + { + "auxiliary_loss_clip": 0.01817315, + "auxiliary_loss_mlp": 0.0063317, + "balance_loss_clip": 1.40481591, + "balance_loss_mlp": 0.57289803, + "epoch": 0.11712009619720427, + "flos": 17164439251200.0, + "grad_norm": 40.21854387612632, + "language_loss": 0.89940149, + "learning_rate": 3.920909759473295e-06, + "loss": 0.92390633, + "num_input_tokens_seen": 42200760, + "router_z_loss_clip": 4.125, + "router_z_loss_mlp": 0.60302734, + "step": 1948, + "time_per_iteration": 2.664835214614868 + }, + { + "auxiliary_loss_clip": 0.01747261, + "auxiliary_loss_mlp": 0.0027056, + "balance_loss_clip": 1.48090696, + "balance_loss_mlp": 0.25129548, + "epoch": 0.11718021944987224, + "flos": 70940991997440.0, + "grad_norm": 0.829951883030092, + "language_loss": 0.65126789, + "learning_rate": 3.920801283028054e-06, + "loss": 0.67144614, + "num_input_tokens_seen": 42265745, + "router_z_loss_clip": 2.65625, + "router_z_loss_mlp": 0.19238281, + "step": 1949, + "time_per_iteration": 3.1453447341918945 + }, + { + "auxiliary_loss_clip": 0.01782267, + "auxiliary_loss_mlp": 0.00594337, + "balance_loss_clip": 1.3876574, + "balance_loss_mlp": 0.54250479, + "epoch": 0.1172403427025402, + "flos": 27453456408960.0, + "grad_norm": 4.495927920913043, + "language_loss": 0.76340055, + "learning_rate": 3.920692733745835e-06, + "loss": 0.7871666, + "num_input_tokens_seen": 42286245, + "router_z_loss_clip": 3.93945312, + "router_z_loss_mlp": 0.51806641, + "step": 1950, + "time_per_iteration": 2.710724115371704 + }, + { + "auxiliary_loss_clip": 0.01821903, + "auxiliary_loss_mlp": 0.00621216, + "balance_loss_clip": 1.40860891, + "balance_loss_mlp": 0.5634473, + "epoch": 0.11730046595520818, + "flos": 15668723992320.0, + "grad_norm": 6.571310751910141, + "language_loss": 0.82690823, + "learning_rate": 3.920584111630755e-06, + "loss": 0.85133946, + "num_input_tokens_seen": 42302710, + "router_z_loss_clip": 4.140625, + "router_z_loss_mlp": 0.57788086, + "step": 1951, + "time_per_iteration": 2.595916986465454 + }, + { + "auxiliary_loss_clip": 0.01794216, + "auxiliary_loss_mlp": 0.00611017, + "balance_loss_clip": 1.39229703, + "balance_loss_mlp": 0.55587125, + "epoch": 0.11736058920787615, + "flos": 25630164092160.0, + "grad_norm": 3.6415155275570945, + "language_loss": 0.82260293, + "learning_rate": 3.9204754166869325e-06, + "loss": 0.84665525, + "num_input_tokens_seen": 42324115, + "router_z_loss_clip": 4.0234375, + "router_z_loss_mlp": 0.55200195, + "step": 1952, + "time_per_iteration": 2.681856632232666 + }, + { + "auxiliary_loss_clip": 0.01810857, + "auxiliary_loss_mlp": 0.00564085, + "balance_loss_clip": 1.40595686, + "balance_loss_mlp": 0.51373065, + "epoch": 0.11742071246054411, + "flos": 21434289701760.0, + "grad_norm": 7.896293010906656, + "language_loss": 0.78067684, + "learning_rate": 3.920366648918491e-06, + "loss": 0.80442631, + "num_input_tokens_seen": 42342505, + "router_z_loss_clip": 4.046875, + "router_z_loss_mlp": 0.50390625, + "step": 1953, + "time_per_iteration": 2.6890859603881836 + }, + { + "auxiliary_loss_clip": 0.01783727, + "auxiliary_loss_mlp": 0.00645554, + "balance_loss_clip": 1.37344086, + "balance_loss_mlp": 0.58556819, + "epoch": 0.11748083571321208, + "flos": 15997845335040.0, + "grad_norm": 212.67865253860748, + "language_loss": 0.86848998, + "learning_rate": 3.920257808329552e-06, + "loss": 0.89278281, + "num_input_tokens_seen": 42360525, + "router_z_loss_clip": 4.1015625, + "router_z_loss_mlp": 0.59985352, + "step": 1954, + "time_per_iteration": 2.701418161392212 + }, + { + "auxiliary_loss_clip": 0.01798114, + "auxiliary_loss_mlp": 0.00609097, + "balance_loss_clip": 1.37913465, + "balance_loss_mlp": 0.54965913, + "epoch": 0.11754095896588006, + "flos": 16180056051840.0, + "grad_norm": 23.193245867152633, + "language_loss": 0.91114515, + "learning_rate": 3.920148894924246e-06, + "loss": 0.93521726, + "num_input_tokens_seen": 42377045, + "router_z_loss_clip": 4.19140625, + "router_z_loss_mlp": 0.59448242, + "step": 1955, + "time_per_iteration": 2.7172789573669434 + }, + { + "auxiliary_loss_clip": 0.01800644, + "auxiliary_loss_mlp": 0.00592839, + "balance_loss_clip": 1.38858283, + "balance_loss_mlp": 0.53440285, + "epoch": 0.11760108221854802, + "flos": 13261596013440.0, + "grad_norm": 96.0705487860443, + "language_loss": 0.83777106, + "learning_rate": 3.920039908706701e-06, + "loss": 0.8617059, + "num_input_tokens_seen": 42393960, + "router_z_loss_clip": 4.11914062, + "router_z_loss_mlp": 0.58398438, + "step": 1956, + "time_per_iteration": 2.653679847717285 + }, + { + "auxiliary_loss_clip": 0.01785293, + "auxiliary_loss_mlp": 0.00565304, + "balance_loss_clip": 1.39144897, + "balance_loss_mlp": 0.5159272, + "epoch": 0.11766120547121599, + "flos": 24498439303680.0, + "grad_norm": 19.250621891688173, + "language_loss": 0.85633063, + "learning_rate": 3.91993084968105e-06, + "loss": 0.87983656, + "num_input_tokens_seen": 42413160, + "router_z_loss_clip": 3.94335938, + "router_z_loss_mlp": 0.4934082, + "step": 1957, + "time_per_iteration": 2.8523097038269043 + }, + { + "auxiliary_loss_clip": 0.01795964, + "auxiliary_loss_mlp": 0.00626798, + "balance_loss_clip": 1.38574207, + "balance_loss_mlp": 0.56972086, + "epoch": 0.11772132872388397, + "flos": 17784005967360.0, + "grad_norm": 4.7177334437267, + "language_loss": 0.83362293, + "learning_rate": 3.919821717851428e-06, + "loss": 0.85785055, + "num_input_tokens_seen": 42432590, + "router_z_loss_clip": 4.10546875, + "router_z_loss_mlp": 0.57104492, + "step": 1958, + "time_per_iteration": 2.676231622695923 + }, + { + "auxiliary_loss_clip": 0.01777601, + "auxiliary_loss_mlp": 0.00581681, + "balance_loss_clip": 1.37786853, + "balance_loss_mlp": 0.5239594, + "epoch": 0.11778145197655193, + "flos": 13217030213760.0, + "grad_norm": 47.26295303034747, + "language_loss": 0.84330511, + "learning_rate": 3.919712513221976e-06, + "loss": 0.86689794, + "num_input_tokens_seen": 42450135, + "router_z_loss_clip": 4.00195312, + "router_z_loss_mlp": 0.57763672, + "step": 1959, + "time_per_iteration": 2.6990253925323486 + }, + { + "auxiliary_loss_clip": 0.01756819, + "auxiliary_loss_mlp": 0.00571793, + "balance_loss_clip": 1.36498189, + "balance_loss_mlp": 0.51667023, + "epoch": 0.1178415752292199, + "flos": 20230204965120.0, + "grad_norm": 86.44643977548232, + "language_loss": 0.74515551, + "learning_rate": 3.919603235796832e-06, + "loss": 0.76844162, + "num_input_tokens_seen": 42470050, + "router_z_loss_clip": 3.9140625, + "router_z_loss_mlp": 0.55078125, + "step": 1960, + "time_per_iteration": 4.21378231048584 + }, + { + "auxiliary_loss_clip": 0.0176958, + "auxiliary_loss_mlp": 0.00537774, + "balance_loss_clip": 1.36212695, + "balance_loss_mlp": 0.48310435, + "epoch": 0.11790169848188788, + "flos": 13040134709760.0, + "grad_norm": 53.72672242314129, + "language_loss": 0.89169872, + "learning_rate": 3.9194938855801406e-06, + "loss": 0.91477227, + "num_input_tokens_seen": 42484335, + "router_z_loss_clip": 4.078125, + "router_z_loss_mlp": 0.546875, + "step": 1961, + "time_per_iteration": 4.1107776165008545 + }, + { + "auxiliary_loss_clip": 0.01739942, + "auxiliary_loss_mlp": 0.00524803, + "balance_loss_clip": 1.35698009, + "balance_loss_mlp": 0.47347173, + "epoch": 0.11796182173455584, + "flos": 22265728790400.0, + "grad_norm": 21.791295169491356, + "language_loss": 0.98113585, + "learning_rate": 3.919384462576049e-06, + "loss": 1.00378335, + "num_input_tokens_seen": 42502720, + "router_z_loss_clip": 3.828125, + "router_z_loss_mlp": 0.51293945, + "step": 1962, + "time_per_iteration": 2.6620912551879883 + }, + { + "auxiliary_loss_clip": 0.01742221, + "auxiliary_loss_mlp": 0.00501832, + "balance_loss_clip": 1.35679555, + "balance_loss_mlp": 0.45269382, + "epoch": 0.1180219449872238, + "flos": 10635017892480.0, + "grad_norm": 509.2482214006204, + "language_loss": 0.95044506, + "learning_rate": 3.919274966788707e-06, + "loss": 0.97288561, + "num_input_tokens_seen": 42519460, + "router_z_loss_clip": 3.85546875, + "router_z_loss_mlp": 0.4909668, + "step": 1963, + "time_per_iteration": 4.113672256469727 + }, + { + "auxiliary_loss_clip": 0.01741811, + "auxiliary_loss_mlp": 0.00489811, + "balance_loss_clip": 1.34877682, + "balance_loss_mlp": 0.43676245, + "epoch": 0.11808206823989177, + "flos": 20923532259840.0, + "grad_norm": 74.16154417803935, + "language_loss": 0.89193356, + "learning_rate": 3.919165398222265e-06, + "loss": 0.91424978, + "num_input_tokens_seen": 42539420, + "router_z_loss_clip": 3.92773438, + "router_z_loss_mlp": 0.53076172, + "step": 1964, + "time_per_iteration": 2.692570209503174 + }, + { + "auxiliary_loss_clip": 0.01760356, + "auxiliary_loss_mlp": 0.00456978, + "balance_loss_clip": 1.36798882, + "balance_loss_mlp": 0.41036725, + "epoch": 0.11814219149255975, + "flos": 20777770869120.0, + "grad_norm": 16.87805373458288, + "language_loss": 0.89665926, + "learning_rate": 3.919055756880879e-06, + "loss": 0.91883266, + "num_input_tokens_seen": 42558225, + "router_z_loss_clip": 3.91601562, + "router_z_loss_mlp": 0.46655273, + "step": 1965, + "time_per_iteration": 4.160141229629517 + }, + { + "auxiliary_loss_clip": 0.01751525, + "auxiliary_loss_mlp": 0.00465822, + "balance_loss_clip": 1.35395741, + "balance_loss_mlp": 0.4149434, + "epoch": 0.11820231474522772, + "flos": 48759938542080.0, + "grad_norm": 3.890609141706635, + "language_loss": 0.79338348, + "learning_rate": 3.918946042768707e-06, + "loss": 0.81555694, + "num_input_tokens_seen": 42580790, + "router_z_loss_clip": 3.9765625, + "router_z_loss_mlp": 0.50952148, + "step": 1966, + "time_per_iteration": 2.9449069499969482 + }, + { + "auxiliary_loss_clip": 0.01737613, + "auxiliary_loss_mlp": 0.00395164, + "balance_loss_clip": 1.34824157, + "balance_loss_mlp": 0.35143781, + "epoch": 0.11826243799789568, + "flos": 16690598012160.0, + "grad_norm": 11.591240086599106, + "language_loss": 0.79381371, + "learning_rate": 3.918836255889908e-06, + "loss": 0.8151415, + "num_input_tokens_seen": 42597355, + "router_z_loss_clip": 3.88867188, + "router_z_loss_mlp": 0.43774414, + "step": 1967, + "time_per_iteration": 2.6649765968322754 + }, + { + "auxiliary_loss_clip": 0.017486, + "auxiliary_loss_mlp": 0.00413676, + "balance_loss_clip": 1.36228669, + "balance_loss_mlp": 0.3640846, + "epoch": 0.11832256125056366, + "flos": 16909868586240.0, + "grad_norm": 17.886787894362595, + "language_loss": 0.94988602, + "learning_rate": 3.9187263962486456e-06, + "loss": 0.97150874, + "num_input_tokens_seen": 42616060, + "router_z_loss_clip": 3.86132812, + "router_z_loss_mlp": 0.49584961, + "step": 1968, + "time_per_iteration": 2.658308506011963 + }, + { + "auxiliary_loss_clip": 0.01720651, + "auxiliary_loss_mlp": 0.00395966, + "balance_loss_clip": 1.33987451, + "balance_loss_mlp": 0.35171592, + "epoch": 0.11838268450323162, + "flos": 22820405587200.0, + "grad_norm": 3.787328807302539, + "language_loss": 0.74344444, + "learning_rate": 3.918616463849087e-06, + "loss": 0.76461065, + "num_input_tokens_seen": 42636285, + "router_z_loss_clip": 3.80078125, + "router_z_loss_mlp": 0.44287109, + "step": 1969, + "time_per_iteration": 2.7411253452301025 + }, + { + "auxiliary_loss_clip": 0.01707681, + "auxiliary_loss_mlp": 0.0043051, + "balance_loss_clip": 1.33684444, + "balance_loss_mlp": 0.38199186, + "epoch": 0.11844280775589959, + "flos": 33545844990720.0, + "grad_norm": 3.240812274592002, + "language_loss": 0.88753033, + "learning_rate": 3.918506458695399e-06, + "loss": 0.90891218, + "num_input_tokens_seen": 42658320, + "router_z_loss_clip": 3.7109375, + "router_z_loss_mlp": 0.48510742, + "step": 1970, + "time_per_iteration": 2.7830018997192383 + }, + { + "auxiliary_loss_clip": 0.01604112, + "auxiliary_loss_mlp": 0.00237742, + "balance_loss_clip": 1.34109199, + "balance_loss_mlp": 0.21981309, + "epoch": 0.11850293100856757, + "flos": 66350998604160.0, + "grad_norm": 0.8751610642813445, + "language_loss": 0.66168559, + "learning_rate": 3.918396380791754e-06, + "loss": 0.68010414, + "num_input_tokens_seen": 42721500, + "router_z_loss_clip": 2.625, + "router_z_loss_mlp": 0.1796875, + "step": 1971, + "time_per_iteration": 3.124837875366211 + }, + { + "auxiliary_loss_clip": 0.01663295, + "auxiliary_loss_mlp": 0.00389748, + "balance_loss_clip": 1.30431843, + "balance_loss_mlp": 0.34678534, + "epoch": 0.11856305426123553, + "flos": 24681045070080.0, + "grad_norm": 3.3693958968051314, + "language_loss": 0.86832565, + "learning_rate": 3.918286230142327e-06, + "loss": 0.88885611, + "num_input_tokens_seen": 42739825, + "router_z_loss_clip": 3.59179688, + "router_z_loss_mlp": 0.4296875, + "step": 1972, + "time_per_iteration": 2.7367348670959473 + }, + { + "auxiliary_loss_clip": 0.01642799, + "auxiliary_loss_mlp": 0.00378416, + "balance_loss_clip": 1.29048038, + "balance_loss_mlp": 0.33452311, + "epoch": 0.1186231775139035, + "flos": 24280102483200.0, + "grad_norm": 5.0315733692862175, + "language_loss": 0.78670645, + "learning_rate": 3.918176006751292e-06, + "loss": 0.80691862, + "num_input_tokens_seen": 42758695, + "router_z_loss_clip": 3.5234375, + "router_z_loss_mlp": 0.43920898, + "step": 1973, + "time_per_iteration": 2.724130153656006 + }, + { + "auxiliary_loss_clip": 0.01635275, + "auxiliary_loss_mlp": 0.00389158, + "balance_loss_clip": 1.28355837, + "balance_loss_mlp": 0.34674379, + "epoch": 0.11868330076657148, + "flos": 21757413473280.0, + "grad_norm": 1.935074741880182, + "language_loss": 0.76805389, + "learning_rate": 3.918065710622832e-06, + "loss": 0.78829819, + "num_input_tokens_seen": 42778510, + "router_z_loss_clip": 3.515625, + "router_z_loss_mlp": 0.42431641, + "step": 1974, + "time_per_iteration": 2.70682692527771 + }, + { + "auxiliary_loss_clip": 0.01637969, + "auxiliary_loss_mlp": 0.00374947, + "balance_loss_clip": 1.27883077, + "balance_loss_mlp": 0.3297188, + "epoch": 0.11874342401923944, + "flos": 17193274894080.0, + "grad_norm": 89.76861819338684, + "language_loss": 0.85188556, + "learning_rate": 3.917955341761128e-06, + "loss": 0.8720147, + "num_input_tokens_seen": 42793995, + "router_z_loss_clip": 3.58984375, + "router_z_loss_mlp": 0.45214844, + "step": 1975, + "time_per_iteration": 2.724034309387207 + }, + { + "auxiliary_loss_clip": 0.01611779, + "auxiliary_loss_mlp": 0.00362235, + "balance_loss_clip": 1.27519798, + "balance_loss_mlp": 0.32001099, + "epoch": 0.11880354727190741, + "flos": 15229572312960.0, + "grad_norm": 2.7779441585997975, + "language_loss": 0.81700099, + "learning_rate": 3.917844900170364e-06, + "loss": 0.83674115, + "num_input_tokens_seen": 42809000, + "router_z_loss_clip": 3.3671875, + "router_z_loss_mlp": 0.42211914, + "step": 1976, + "time_per_iteration": 2.6696720123291016 + }, + { + "auxiliary_loss_clip": 0.01604672, + "auxiliary_loss_mlp": 0.00362759, + "balance_loss_clip": 1.26026082, + "balance_loss_mlp": 0.32120305, + "epoch": 0.11886367052457537, + "flos": 27309706179840.0, + "grad_norm": 1.973244759994322, + "language_loss": 0.79469442, + "learning_rate": 3.91773438585473e-06, + "loss": 0.81436872, + "num_input_tokens_seen": 42831585, + "router_z_loss_clip": 3.44726562, + "router_z_loss_mlp": 0.41552734, + "step": 1977, + "time_per_iteration": 2.8059372901916504 + }, + { + "auxiliary_loss_clip": 0.01618286, + "auxiliary_loss_mlp": 0.00394106, + "balance_loss_clip": 1.26154613, + "balance_loss_mlp": 0.35006988, + "epoch": 0.11892379377724335, + "flos": 21798280172160.0, + "grad_norm": 38.91924888632274, + "language_loss": 0.81887102, + "learning_rate": 3.9176237988184165e-06, + "loss": 0.83899486, + "num_input_tokens_seen": 42848420, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 0.44042969, + "step": 1978, + "time_per_iteration": 2.7566325664520264 + }, + { + "auxiliary_loss_clip": 0.01590761, + "auxiliary_loss_mlp": 0.00361329, + "balance_loss_clip": 1.25682807, + "balance_loss_mlp": 0.32122684, + "epoch": 0.11898391702991132, + "flos": 13991013498240.0, + "grad_norm": 2.7428667915447966, + "language_loss": 0.78527105, + "learning_rate": 3.917513139065616e-06, + "loss": 0.80479193, + "num_input_tokens_seen": 42866645, + "router_z_loss_clip": 3.33789062, + "router_z_loss_mlp": 0.40112305, + "step": 1979, + "time_per_iteration": 2.702584743499756 + }, + { + "auxiliary_loss_clip": 0.01598727, + "auxiliary_loss_mlp": 0.00340386, + "balance_loss_clip": 1.25963163, + "balance_loss_mlp": 0.30078417, + "epoch": 0.11904404028257928, + "flos": 32234567091840.0, + "grad_norm": 10.219824057816563, + "language_loss": 1.03051174, + "learning_rate": 3.917402406600525e-06, + "loss": 1.04990292, + "num_input_tokens_seen": 42888515, + "router_z_loss_clip": 3.38867188, + "router_z_loss_mlp": 0.39599609, + "step": 1980, + "time_per_iteration": 2.774045944213867 + }, + { + "auxiliary_loss_clip": 0.01597991, + "auxiliary_loss_mlp": 0.00368582, + "balance_loss_clip": 1.25351739, + "balance_loss_mlp": 0.32502288, + "epoch": 0.11910416353524726, + "flos": 23586272398080.0, + "grad_norm": 14.630210447654045, + "language_loss": 0.91074049, + "learning_rate": 3.917291601427342e-06, + "loss": 0.93040621, + "num_input_tokens_seen": 42909035, + "router_z_loss_clip": 3.4453125, + "router_z_loss_mlp": 0.43530273, + "step": 1981, + "time_per_iteration": 2.661867141723633 + }, + { + "auxiliary_loss_clip": 0.01590517, + "auxiliary_loss_mlp": 0.00380912, + "balance_loss_clip": 1.25586689, + "balance_loss_mlp": 0.33833009, + "epoch": 0.11916428678791523, + "flos": 25333038789120.0, + "grad_norm": 166.23384718823678, + "language_loss": 0.91056794, + "learning_rate": 3.91718072355027e-06, + "loss": 0.93028224, + "num_input_tokens_seen": 42927555, + "router_z_loss_clip": 3.34570312, + "router_z_loss_mlp": 0.42578125, + "step": 1982, + "time_per_iteration": 2.7405333518981934 + }, + { + "auxiliary_loss_clip": 0.01581916, + "auxiliary_loss_mlp": 0.00370408, + "balance_loss_clip": 1.24882925, + "balance_loss_mlp": 0.32978135, + "epoch": 0.11922441004058319, + "flos": 19788431592960.0, + "grad_norm": 6.6037418652459285, + "language_loss": 0.88808882, + "learning_rate": 3.917069772973513e-06, + "loss": 0.90761209, + "num_input_tokens_seen": 42945300, + "router_z_loss_clip": 3.33203125, + "router_z_loss_mlp": 0.40625, + "step": 1983, + "time_per_iteration": 2.659723997116089 + }, + { + "auxiliary_loss_clip": 0.01598207, + "auxiliary_loss_mlp": 0.00367266, + "balance_loss_clip": 1.25513732, + "balance_loss_mlp": 0.32642478, + "epoch": 0.11928453329325117, + "flos": 21536347219200.0, + "grad_norm": 32.39621773485324, + "language_loss": 0.88360161, + "learning_rate": 3.916958749701277e-06, + "loss": 0.9032563, + "num_input_tokens_seen": 42961295, + "router_z_loss_clip": 3.4296875, + "router_z_loss_mlp": 0.40820312, + "step": 1984, + "time_per_iteration": 2.6931474208831787 + }, + { + "auxiliary_loss_clip": 0.01600806, + "auxiliary_loss_mlp": 0.00364828, + "balance_loss_clip": 1.25914824, + "balance_loss_mlp": 0.32274759, + "epoch": 0.11934465654591914, + "flos": 20815010294400.0, + "grad_norm": 2.7800155379269245, + "language_loss": 0.88973498, + "learning_rate": 3.9168476537377745e-06, + "loss": 0.9093914, + "num_input_tokens_seen": 42980330, + "router_z_loss_clip": 3.421875, + "router_z_loss_mlp": 0.4206543, + "step": 1985, + "time_per_iteration": 2.6295671463012695 + }, + { + "auxiliary_loss_clip": 0.01592769, + "auxiliary_loss_mlp": 0.00356639, + "balance_loss_clip": 1.26204228, + "balance_loss_mlp": 0.31582189, + "epoch": 0.1194047797985871, + "flos": 19060486565760.0, + "grad_norm": 7.719108086373187, + "language_loss": 0.8050521, + "learning_rate": 3.916736485087216e-06, + "loss": 0.8245461, + "num_input_tokens_seen": 42996125, + "router_z_loss_clip": 3.30859375, + "router_z_loss_mlp": 0.40820312, + "step": 1986, + "time_per_iteration": 2.6566152572631836 + }, + { + "auxiliary_loss_clip": 0.01612723, + "auxiliary_loss_mlp": 0.00354126, + "balance_loss_clip": 1.27331305, + "balance_loss_mlp": 0.31416684, + "epoch": 0.11946490305125507, + "flos": 27190805184000.0, + "grad_norm": 78.34466963959444, + "language_loss": 0.81263101, + "learning_rate": 3.916625243753819e-06, + "loss": 0.83229953, + "num_input_tokens_seen": 43014180, + "router_z_loss_clip": 3.40039062, + "router_z_loss_mlp": 0.39941406, + "step": 1987, + "time_per_iteration": 2.700974225997925 + }, + { + "auxiliary_loss_clip": 0.0162828, + "auxiliary_loss_mlp": 0.0043218, + "balance_loss_clip": 1.27401686, + "balance_loss_mlp": 0.38547349, + "epoch": 0.11952502630392305, + "flos": 21140791672320.0, + "grad_norm": 15.901285822012056, + "language_loss": 0.79763025, + "learning_rate": 3.916513929741799e-06, + "loss": 0.81823486, + "num_input_tokens_seen": 43032120, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 0.46704102, + "step": 1988, + "time_per_iteration": 2.6459853649139404 + }, + { + "auxiliary_loss_clip": 0.01614828, + "auxiliary_loss_mlp": 0.00385718, + "balance_loss_clip": 1.27737701, + "balance_loss_mlp": 0.34084803, + "epoch": 0.11958514955659101, + "flos": 22124241118080.0, + "grad_norm": 37.22381269326038, + "language_loss": 0.86806595, + "learning_rate": 3.91640254305538e-06, + "loss": 0.88807142, + "num_input_tokens_seen": 43052215, + "router_z_loss_clip": 3.375, + "router_z_loss_mlp": 0.44873047, + "step": 1989, + "time_per_iteration": 2.714582920074463 + }, + { + "auxiliary_loss_clip": 0.01630511, + "auxiliary_loss_mlp": 0.00433331, + "balance_loss_clip": 1.28364778, + "balance_loss_mlp": 0.38862711, + "epoch": 0.11964527280925898, + "flos": 17421452040960.0, + "grad_norm": 276.91924065757047, + "language_loss": 0.84253955, + "learning_rate": 3.916291083698784e-06, + "loss": 0.86317801, + "num_input_tokens_seen": 43069720, + "router_z_loss_clip": 3.47265625, + "router_z_loss_mlp": 0.44702148, + "step": 1990, + "time_per_iteration": 2.668959379196167 + }, + { + "auxiliary_loss_clip": 0.01580699, + "auxiliary_loss_mlp": 0.00089546, + "balance_loss_clip": 1.37270463, + "balance_loss_mlp": 0.08034267, + "epoch": 0.11970539606192696, + "flos": 70679741402880.0, + "grad_norm": 0.8625000334843518, + "language_loss": 0.55352938, + "learning_rate": 3.916179551676238e-06, + "loss": 0.57023185, + "num_input_tokens_seen": 43123130, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.09179688, + "step": 1991, + "time_per_iteration": 3.134777069091797 + }, + { + "auxiliary_loss_clip": 0.01616759, + "auxiliary_loss_mlp": 0.00365401, + "balance_loss_clip": 1.2847054, + "balance_loss_mlp": 0.32782644, + "epoch": 0.11976551931459492, + "flos": 21215019127680.0, + "grad_norm": 3.2941151598836416, + "language_loss": 0.84734547, + "learning_rate": 3.916067946991971e-06, + "loss": 0.86716706, + "num_input_tokens_seen": 43140015, + "router_z_loss_clip": 3.3203125, + "router_z_loss_mlp": 0.37548828, + "step": 1992, + "time_per_iteration": 2.6415200233459473 + }, + { + "auxiliary_loss_clip": 0.01609997, + "auxiliary_loss_mlp": 0.00418577, + "balance_loss_clip": 1.27155375, + "balance_loss_mlp": 0.3757574, + "epoch": 0.11982564256726289, + "flos": 25989306226560.0, + "grad_norm": 9.454851072798192, + "language_loss": 0.84291065, + "learning_rate": 3.915956269650216e-06, + "loss": 0.86319637, + "num_input_tokens_seen": 43160105, + "router_z_loss_clip": 3.38867188, + "router_z_loss_mlp": 0.42797852, + "step": 1993, + "time_per_iteration": 2.6415982246398926 + }, + { + "auxiliary_loss_clip": 0.01608359, + "auxiliary_loss_mlp": 0.00375776, + "balance_loss_clip": 1.27148807, + "balance_loss_mlp": 0.33782023, + "epoch": 0.11988576581993086, + "flos": 21650866755840.0, + "grad_norm": 6.924494587883622, + "language_loss": 0.885979, + "learning_rate": 3.915844519655208e-06, + "loss": 0.90582037, + "num_input_tokens_seen": 43179835, + "router_z_loss_clip": 3.36914062, + "router_z_loss_mlp": 0.37963867, + "step": 1994, + "time_per_iteration": 2.6363766193389893 + }, + { + "auxiliary_loss_clip": 0.01633313, + "auxiliary_loss_mlp": 0.00371836, + "balance_loss_clip": 1.29159069, + "balance_loss_mlp": 0.33626381, + "epoch": 0.11994588907259883, + "flos": 17857407409920.0, + "grad_norm": 9.56406624343577, + "language_loss": 0.96738195, + "learning_rate": 3.915732697011183e-06, + "loss": 0.98743343, + "num_input_tokens_seen": 43197210, + "router_z_loss_clip": 3.4140625, + "router_z_loss_mlp": 0.35571289, + "step": 1995, + "time_per_iteration": 2.6397545337677 + }, + { + "auxiliary_loss_clip": 0.01632349, + "auxiliary_loss_mlp": 0.00403699, + "balance_loss_clip": 1.28681552, + "balance_loss_mlp": 0.36142725, + "epoch": 0.1200060123252668, + "flos": 24462744163200.0, + "grad_norm": 13.445292079631123, + "language_loss": 0.81188786, + "learning_rate": 3.9156208017223825e-06, + "loss": 0.83224833, + "num_input_tokens_seen": 43215050, + "router_z_loss_clip": 3.45117188, + "router_z_loss_mlp": 0.42285156, + "step": 1996, + "time_per_iteration": 2.6725575923919678 + }, + { + "auxiliary_loss_clip": 0.01642375, + "auxiliary_loss_mlp": 0.00354913, + "balance_loss_clip": 1.3034637, + "balance_loss_mlp": 0.31564572, + "epoch": 0.12006613557793476, + "flos": 18732191235840.0, + "grad_norm": 27.23634212712201, + "language_loss": 0.92744505, + "learning_rate": 3.915508833793048e-06, + "loss": 0.94741786, + "num_input_tokens_seen": 43233900, + "router_z_loss_clip": 3.38867188, + "router_z_loss_mlp": 0.39257812, + "step": 1997, + "time_per_iteration": 2.675361394882202 + }, + { + "auxiliary_loss_clip": 0.01673091, + "auxiliary_loss_mlp": 0.00367663, + "balance_loss_clip": 1.3207078, + "balance_loss_mlp": 0.32944405, + "epoch": 0.12012625883060274, + "flos": 22267739952000.0, + "grad_norm": 8.944693923428655, + "language_loss": 0.86022878, + "learning_rate": 3.915396793227428e-06, + "loss": 0.88063639, + "num_input_tokens_seen": 43252105, + "router_z_loss_clip": 3.52734375, + "router_z_loss_mlp": 0.38208008, + "step": 1998, + "time_per_iteration": 2.713930606842041 + }, + { + "auxiliary_loss_clip": 0.01659711, + "auxiliary_loss_mlp": 0.00332687, + "balance_loss_clip": 1.31691742, + "balance_loss_mlp": 0.29659075, + "epoch": 0.1201863820832707, + "flos": 21758885930880.0, + "grad_norm": 7.689945568161542, + "language_loss": 0.78891212, + "learning_rate": 3.915284680029769e-06, + "loss": 0.8088361, + "num_input_tokens_seen": 43270315, + "router_z_loss_clip": 3.42773438, + "router_z_loss_mlp": 0.36108398, + "step": 1999, + "time_per_iteration": 2.680948495864868 + }, + { + "auxiliary_loss_clip": 0.01688411, + "auxiliary_loss_mlp": 0.00368746, + "balance_loss_clip": 1.32791388, + "balance_loss_mlp": 0.3265698, + "epoch": 0.12024650533593867, + "flos": 21907987286400.0, + "grad_norm": 12.76163814403361, + "language_loss": 0.83535087, + "learning_rate": 3.915172494204323e-06, + "loss": 0.8559224, + "num_input_tokens_seen": 43289935, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 0.42211914, + "step": 2000, + "time_per_iteration": 2.691784620285034 + }, + { + "auxiliary_loss_clip": 0.01704384, + "auxiliary_loss_mlp": 0.00332477, + "balance_loss_clip": 1.33916473, + "balance_loss_mlp": 0.2962369, + "epoch": 0.12030662858860665, + "flos": 21689219502720.0, + "grad_norm": 3.0896033462379697, + "language_loss": 0.90799725, + "learning_rate": 3.915060235755344e-06, + "loss": 0.92836589, + "num_input_tokens_seen": 43309325, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 0.36254883, + "step": 2001, + "time_per_iteration": 2.738615036010742 + }, + { + "auxiliary_loss_clip": 0.01699772, + "auxiliary_loss_mlp": 0.00348187, + "balance_loss_clip": 1.33698761, + "balance_loss_mlp": 0.31049299, + "epoch": 0.12036675184127461, + "flos": 12933228856320.0, + "grad_norm": 10.810892707382656, + "language_loss": 0.83120286, + "learning_rate": 3.91494790468709e-06, + "loss": 0.85168248, + "num_input_tokens_seen": 43327010, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 0.37695312, + "step": 2002, + "time_per_iteration": 2.651668071746826 + }, + { + "auxiliary_loss_clip": 0.01714319, + "auxiliary_loss_mlp": 0.00356142, + "balance_loss_clip": 1.33512449, + "balance_loss_mlp": 0.31561086, + "epoch": 0.12042687509394258, + "flos": 20851028657280.0, + "grad_norm": 9.159407766200552, + "language_loss": 0.85453761, + "learning_rate": 3.9148355010038185e-06, + "loss": 0.87524223, + "num_input_tokens_seen": 43345650, + "router_z_loss_clip": 3.79296875, + "router_z_loss_mlp": 0.40527344, + "step": 2003, + "time_per_iteration": 4.183563947677612 + }, + { + "auxiliary_loss_clip": 0.01711321, + "auxiliary_loss_mlp": 0.00333584, + "balance_loss_clip": 1.34493327, + "balance_loss_mlp": 0.29438794, + "epoch": 0.12048699834661056, + "flos": 23878513451520.0, + "grad_norm": 2.975325651086925, + "language_loss": 0.7869935, + "learning_rate": 3.914723024709793e-06, + "loss": 0.80744261, + "num_input_tokens_seen": 43365555, + "router_z_loss_clip": 3.6640625, + "router_z_loss_mlp": 0.39208984, + "step": 2004, + "time_per_iteration": 4.133337497711182 + }, + { + "auxiliary_loss_clip": 0.01707368, + "auxiliary_loss_mlp": 0.00359485, + "balance_loss_clip": 1.33640206, + "balance_loss_mlp": 0.31647438, + "epoch": 0.12054712159927852, + "flos": 19756363726080.0, + "grad_norm": 31.27541006995268, + "language_loss": 0.83329254, + "learning_rate": 3.914610475809279e-06, + "loss": 0.85396111, + "num_input_tokens_seen": 43384990, + "router_z_loss_clip": 3.7109375, + "router_z_loss_mlp": 0.42993164, + "step": 2005, + "time_per_iteration": 4.0789501667022705 + }, + { + "auxiliary_loss_clip": 0.01811753, + "auxiliary_loss_mlp": 0.00089319, + "balance_loss_clip": 1.55455613, + "balance_loss_mlp": 0.07839921, + "epoch": 0.12060724485194649, + "flos": 51672763123200.0, + "grad_norm": 4.5589941026928695, + "language_loss": 0.58246225, + "learning_rate": 3.914497854306543e-06, + "loss": 0.60147297, + "num_input_tokens_seen": 43436335, + "router_z_loss_clip": 2.5625, + "router_z_loss_mlp": 0.109375, + "step": 2006, + "time_per_iteration": 2.910663366317749 + }, + { + "auxiliary_loss_clip": 0.01718538, + "auxiliary_loss_mlp": 0.00332277, + "balance_loss_clip": 1.35086215, + "balance_loss_mlp": 0.29217526, + "epoch": 0.12066736810461445, + "flos": 18990425088000.0, + "grad_norm": 3.1303569453186393, + "language_loss": 0.82841963, + "learning_rate": 3.9143851602058575e-06, + "loss": 0.8489278, + "num_input_tokens_seen": 43456495, + "router_z_loss_clip": 3.67773438, + "router_z_loss_mlp": 0.40112305, + "step": 2007, + "time_per_iteration": 2.655571460723877 + }, + { + "auxiliary_loss_clip": 0.01738483, + "auxiliary_loss_mlp": 0.00359876, + "balance_loss_clip": 1.35317898, + "balance_loss_mlp": 0.31796181, + "epoch": 0.12072749135728243, + "flos": 16471973882880.0, + "grad_norm": 43.84610844280674, + "language_loss": 0.93761486, + "learning_rate": 3.914272393511494e-06, + "loss": 0.95859843, + "num_input_tokens_seen": 43473085, + "router_z_loss_clip": 3.85546875, + "router_z_loss_mlp": 0.41918945, + "step": 2008, + "time_per_iteration": 4.191547870635986 + }, + { + "auxiliary_loss_clip": 0.01738813, + "auxiliary_loss_mlp": 0.00365951, + "balance_loss_clip": 1.36029565, + "balance_loss_mlp": 0.32544327, + "epoch": 0.1207876146099504, + "flos": 18077108947200.0, + "grad_norm": 5.6596940104906235, + "language_loss": 0.90523648, + "learning_rate": 3.91415955422773e-06, + "loss": 0.92628407, + "num_input_tokens_seen": 43491135, + "router_z_loss_clip": 3.78710938, + "router_z_loss_mlp": 0.40551758, + "step": 2009, + "time_per_iteration": 2.6039528846740723 + }, + { + "auxiliary_loss_clip": 0.01734669, + "auxiliary_loss_mlp": 0.00378997, + "balance_loss_clip": 1.35602605, + "balance_loss_mlp": 0.33903801, + "epoch": 0.12084773786261836, + "flos": 21871573873920.0, + "grad_norm": 3.1658567331342407, + "language_loss": 0.90652239, + "learning_rate": 3.914046642358844e-06, + "loss": 0.92765903, + "num_input_tokens_seen": 43510440, + "router_z_loss_clip": 3.78125, + "router_z_loss_mlp": 0.39941406, + "step": 2010, + "time_per_iteration": 2.673386812210083 + }, + { + "auxiliary_loss_clip": 0.01753014, + "auxiliary_loss_mlp": 0.00386663, + "balance_loss_clip": 1.36024988, + "balance_loss_mlp": 0.34219792, + "epoch": 0.12090786111528634, + "flos": 18333044328960.0, + "grad_norm": 77.3860974396215, + "language_loss": 0.88378572, + "learning_rate": 3.9139336579091174e-06, + "loss": 0.90518254, + "num_input_tokens_seen": 43530145, + "router_z_loss_clip": 3.93359375, + "router_z_loss_mlp": 0.44433594, + "step": 2011, + "time_per_iteration": 2.6294357776641846 + }, + { + "auxiliary_loss_clip": 0.01754642, + "auxiliary_loss_mlp": 0.00358574, + "balance_loss_clip": 1.36587548, + "balance_loss_mlp": 0.31711259, + "epoch": 0.1209679843679543, + "flos": 21105850717440.0, + "grad_norm": 7084.483233650414, + "language_loss": 1.03533351, + "learning_rate": 3.913820600882834e-06, + "loss": 1.05646563, + "num_input_tokens_seen": 43549315, + "router_z_loss_clip": 3.890625, + "router_z_loss_mlp": 0.41430664, + "step": 2012, + "time_per_iteration": 2.6867923736572266 + }, + { + "auxiliary_loss_clip": 0.01762264, + "auxiliary_loss_mlp": 0.00338564, + "balance_loss_clip": 1.37436604, + "balance_loss_mlp": 0.2986052, + "epoch": 0.12102810762062227, + "flos": 29241053585280.0, + "grad_norm": 16.482056652401038, + "language_loss": 0.85399854, + "learning_rate": 3.913707471284283e-06, + "loss": 0.87500679, + "num_input_tokens_seen": 43569240, + "router_z_loss_clip": 3.8828125, + "router_z_loss_mlp": 0.39916992, + "step": 2013, + "time_per_iteration": 2.7254762649536133 + }, + { + "auxiliary_loss_clip": 0.01768462, + "auxiliary_loss_mlp": 0.00409403, + "balance_loss_clip": 1.36984849, + "balance_loss_mlp": 0.3657245, + "epoch": 0.12108823087329025, + "flos": 17930701111680.0, + "grad_norm": 59.586262703705124, + "language_loss": 0.84439778, + "learning_rate": 3.9135942691177515e-06, + "loss": 0.86617637, + "num_input_tokens_seen": 43587710, + "router_z_loss_clip": 3.99414062, + "router_z_loss_mlp": 0.43676758, + "step": 2014, + "time_per_iteration": 2.67704439163208 + }, + { + "auxiliary_loss_clip": 0.0176095, + "auxiliary_loss_mlp": 0.00356461, + "balance_loss_clip": 1.37730551, + "balance_loss_mlp": 0.31659785, + "epoch": 0.12114835412595822, + "flos": 22091850028800.0, + "grad_norm": 18.208118513666314, + "language_loss": 0.93590772, + "learning_rate": 3.913480994387535e-06, + "loss": 0.95708179, + "num_input_tokens_seen": 43606000, + "router_z_loss_clip": 3.8359375, + "router_z_loss_mlp": 0.39892578, + "step": 2015, + "time_per_iteration": 2.7172884941101074 + }, + { + "auxiliary_loss_clip": 0.01760855, + "auxiliary_loss_mlp": 0.00351671, + "balance_loss_clip": 1.37268567, + "balance_loss_mlp": 0.31140256, + "epoch": 0.12120847737862618, + "flos": 20412343854720.0, + "grad_norm": 8.942722721751693, + "language_loss": 0.76109332, + "learning_rate": 3.913367647097926e-06, + "loss": 0.78221858, + "num_input_tokens_seen": 43624815, + "router_z_loss_clip": 3.8828125, + "router_z_loss_mlp": 0.40258789, + "step": 2016, + "time_per_iteration": 2.670454978942871 + }, + { + "auxiliary_loss_clip": 0.01757119, + "auxiliary_loss_mlp": 0.00430485, + "balance_loss_clip": 1.36272502, + "balance_loss_mlp": 0.38375527, + "epoch": 0.12126860063129415, + "flos": 22309037614080.0, + "grad_norm": 13.268745307348686, + "language_loss": 0.8977176, + "learning_rate": 3.913254227253225e-06, + "loss": 0.91959369, + "num_input_tokens_seen": 43643960, + "router_z_loss_clip": 3.94726562, + "router_z_loss_mlp": 0.46728516, + "step": 2017, + "time_per_iteration": 2.761678695678711 + }, + { + "auxiliary_loss_clip": 0.01762393, + "auxiliary_loss_mlp": 0.00389481, + "balance_loss_clip": 1.3683424, + "balance_loss_mlp": 0.34716183, + "epoch": 0.12132872388396213, + "flos": 13699275235200.0, + "grad_norm": 44.883157597221015, + "language_loss": 0.80019444, + "learning_rate": 3.913140734857731e-06, + "loss": 0.82171321, + "num_input_tokens_seen": 43662650, + "router_z_loss_clip": 3.94335938, + "router_z_loss_mlp": 0.42358398, + "step": 2018, + "time_per_iteration": 2.623199224472046 + }, + { + "auxiliary_loss_clip": 0.01762792, + "auxiliary_loss_mlp": 0.0038846, + "balance_loss_clip": 1.37043333, + "balance_loss_mlp": 0.34852508, + "epoch": 0.12138884713663009, + "flos": 26466954307200.0, + "grad_norm": 1.7324546925939657, + "language_loss": 0.77129614, + "learning_rate": 3.91302716991575e-06, + "loss": 0.79280865, + "num_input_tokens_seen": 43684205, + "router_z_loss_clip": 3.92578125, + "router_z_loss_mlp": 0.39941406, + "step": 2019, + "time_per_iteration": 2.7677927017211914 + }, + { + "auxiliary_loss_clip": 0.01778574, + "auxiliary_loss_mlp": 0.00428088, + "balance_loss_clip": 1.37607646, + "balance_loss_mlp": 0.38395691, + "epoch": 0.12144897038929806, + "flos": 26141603892480.0, + "grad_norm": 630.4597621471653, + "language_loss": 0.98692352, + "learning_rate": 3.912913532431586e-06, + "loss": 1.00899017, + "num_input_tokens_seen": 43706320, + "router_z_loss_clip": 4.0234375, + "router_z_loss_mlp": 0.44116211, + "step": 2020, + "time_per_iteration": 2.6932878494262695 + }, + { + "auxiliary_loss_clip": 0.01774626, + "auxiliary_loss_mlp": 0.00395161, + "balance_loss_clip": 1.37995887, + "balance_loss_mlp": 0.35098228, + "epoch": 0.12150909364196603, + "flos": 24717530309760.0, + "grad_norm": 14.373613196358898, + "language_loss": 0.83206069, + "learning_rate": 3.912799822409549e-06, + "loss": 0.85375857, + "num_input_tokens_seen": 43724805, + "router_z_loss_clip": 3.9453125, + "router_z_loss_mlp": 0.44189453, + "step": 2021, + "time_per_iteration": 2.735373020172119 + }, + { + "auxiliary_loss_clip": 0.01766796, + "auxiliary_loss_mlp": 0.00399005, + "balance_loss_clip": 1.37315297, + "balance_loss_mlp": 0.35735345, + "epoch": 0.121569216894634, + "flos": 25186990089600.0, + "grad_norm": 57.88323672498597, + "language_loss": 0.85934895, + "learning_rate": 3.912686039853952e-06, + "loss": 0.88100696, + "num_input_tokens_seen": 43742320, + "router_z_loss_clip": 3.93554688, + "router_z_loss_mlp": 0.41650391, + "step": 2022, + "time_per_iteration": 2.6768858432769775 + }, + { + "auxiliary_loss_clip": 0.01786282, + "auxiliary_loss_mlp": 0.00396876, + "balance_loss_clip": 1.37968898, + "balance_loss_mlp": 0.35448474, + "epoch": 0.12162934014730196, + "flos": 13444094039040.0, + "grad_norm": 131.2990893911295, + "language_loss": 0.90355802, + "learning_rate": 3.912572184769108e-06, + "loss": 0.92538965, + "num_input_tokens_seen": 43760665, + "router_z_loss_clip": 4.0703125, + "router_z_loss_mlp": 0.42382812, + "step": 2023, + "time_per_iteration": 2.6642773151397705 + }, + { + "auxiliary_loss_clip": 0.01816985, + "auxiliary_loss_mlp": 0.00460605, + "balance_loss_clip": 1.39537764, + "balance_loss_mlp": 0.41599733, + "epoch": 0.12168946339996994, + "flos": 16946138344320.0, + "grad_norm": 53.47701099902678, + "language_loss": 0.94140965, + "learning_rate": 3.912458257159335e-06, + "loss": 0.96418548, + "num_input_tokens_seen": 43779020, + "router_z_loss_clip": 4.22070312, + "router_z_loss_mlp": 0.44604492, + "step": 2024, + "time_per_iteration": 2.655184030532837 + }, + { + "auxiliary_loss_clip": 0.01787935, + "auxiliary_loss_mlp": 0.0041131, + "balance_loss_clip": 1.38293242, + "balance_loss_mlp": 0.36818033, + "epoch": 0.12174958665263791, + "flos": 29821585196160.0, + "grad_norm": 11.566926015490681, + "language_loss": 0.78531778, + "learning_rate": 3.912344257028954e-06, + "loss": 0.80731022, + "num_input_tokens_seen": 43798850, + "router_z_loss_clip": 4.046875, + "router_z_loss_mlp": 0.4309082, + "step": 2025, + "time_per_iteration": 2.7235443592071533 + }, + { + "auxiliary_loss_clip": 0.01816719, + "auxiliary_loss_mlp": 0.00416955, + "balance_loss_clip": 1.40478957, + "balance_loss_mlp": 0.37244266, + "epoch": 0.12180970990530587, + "flos": 24641902224000.0, + "grad_norm": 16.851193805552278, + "language_loss": 0.81205708, + "learning_rate": 3.912230184382286e-06, + "loss": 0.83439386, + "num_input_tokens_seen": 43820130, + "router_z_loss_clip": 4.12109375, + "router_z_loss_mlp": 0.4453125, + "step": 2026, + "time_per_iteration": 2.694554567337036 + }, + { + "auxiliary_loss_clip": 0.01822299, + "auxiliary_loss_mlp": 0.00392278, + "balance_loss_clip": 1.41021633, + "balance_loss_mlp": 0.34948194, + "epoch": 0.12186983315797385, + "flos": 20521691832960.0, + "grad_norm": 3.026219364426958, + "language_loss": 0.96441638, + "learning_rate": 3.912116039223659e-06, + "loss": 0.98656225, + "num_input_tokens_seen": 43838485, + "router_z_loss_clip": 4.12109375, + "router_z_loss_mlp": 0.42822266, + "step": 2027, + "time_per_iteration": 2.6492908000946045 + }, + { + "auxiliary_loss_clip": 0.01811527, + "auxiliary_loss_mlp": 0.00420448, + "balance_loss_clip": 1.40301228, + "balance_loss_mlp": 0.37514806, + "epoch": 0.12192995641064182, + "flos": 27818344719360.0, + "grad_norm": 37.6614853711704, + "language_loss": 0.82814842, + "learning_rate": 3.912001821557399e-06, + "loss": 0.85046816, + "num_input_tokens_seen": 43859080, + "router_z_loss_clip": 4.0859375, + "router_z_loss_mlp": 0.45288086, + "step": 2028, + "time_per_iteration": 2.7446517944335938 + }, + { + "auxiliary_loss_clip": 0.01847529, + "auxiliary_loss_mlp": 0.00385365, + "balance_loss_clip": 1.42352498, + "balance_loss_mlp": 0.34035128, + "epoch": 0.12199007966330978, + "flos": 22017119783040.0, + "grad_norm": 2.2147416673985565, + "language_loss": 0.8376087, + "learning_rate": 3.911887531387839e-06, + "loss": 0.85993767, + "num_input_tokens_seen": 43879030, + "router_z_loss_clip": 4.24023438, + "router_z_loss_mlp": 0.45043945, + "step": 2029, + "time_per_iteration": 2.7855300903320312 + }, + { + "auxiliary_loss_clip": 0.01875314, + "auxiliary_loss_mlp": 0.00373294, + "balance_loss_clip": 1.44147325, + "balance_loss_mlp": 0.32801801, + "epoch": 0.12205020291597775, + "flos": 23295216493440.0, + "grad_norm": 2.790689213094286, + "language_loss": 0.85698032, + "learning_rate": 3.911773168719313e-06, + "loss": 0.87946641, + "num_input_tokens_seen": 43898505, + "router_z_loss_clip": 4.34179688, + "router_z_loss_mlp": 0.45288086, + "step": 2030, + "time_per_iteration": 2.815398693084717 + }, + { + "auxiliary_loss_clip": 0.01896898, + "auxiliary_loss_mlp": 0.00398681, + "balance_loss_clip": 1.45480943, + "balance_loss_mlp": 0.3450129, + "epoch": 0.12211032616864573, + "flos": 26031609469440.0, + "grad_norm": 60.51771077451318, + "language_loss": 0.82735497, + "learning_rate": 3.911658733556155e-06, + "loss": 0.85031068, + "num_input_tokens_seen": 43917945, + "router_z_loss_clip": 4.421875, + "router_z_loss_mlp": 0.53686523, + "step": 2031, + "time_per_iteration": 2.6838607788085938 + }, + { + "auxiliary_loss_clip": 0.01886854, + "auxiliary_loss_mlp": 0.00386467, + "balance_loss_clip": 1.45095205, + "balance_loss_mlp": 0.33892685, + "epoch": 0.12217044942131369, + "flos": 20410943224320.0, + "grad_norm": 67.98607540315224, + "language_loss": 0.81142521, + "learning_rate": 3.911544225902707e-06, + "loss": 0.83415842, + "num_input_tokens_seen": 43937385, + "router_z_loss_clip": 4.3671875, + "router_z_loss_mlp": 0.4753418, + "step": 2032, + "time_per_iteration": 2.63751482963562 + }, + { + "auxiliary_loss_clip": 0.01900101, + "auxiliary_loss_mlp": 0.00398489, + "balance_loss_clip": 1.45925212, + "balance_loss_mlp": 0.3520208, + "epoch": 0.12223057267398166, + "flos": 22857142222080.0, + "grad_norm": 13.507721918288947, + "language_loss": 0.94006735, + "learning_rate": 3.911429645763311e-06, + "loss": 0.96305323, + "num_input_tokens_seen": 43958130, + "router_z_loss_clip": 4.41015625, + "router_z_loss_mlp": 0.46459961, + "step": 2033, + "time_per_iteration": 2.657071590423584 + }, + { + "auxiliary_loss_clip": 0.01949918, + "auxiliary_loss_mlp": 0.00405613, + "balance_loss_clip": 1.47692037, + "balance_loss_mlp": 0.35235041, + "epoch": 0.12229069592664964, + "flos": 20047563285120.0, + "grad_norm": 8.04409813220032, + "language_loss": 0.7276063, + "learning_rate": 3.911314993142311e-06, + "loss": 0.75116158, + "num_input_tokens_seen": 43976800, + "router_z_loss_clip": 4.734375, + "router_z_loss_mlp": 0.53295898, + "step": 2034, + "time_per_iteration": 2.6342110633850098 + }, + { + "auxiliary_loss_clip": 0.0191957, + "auxiliary_loss_mlp": 0.0037162, + "balance_loss_clip": 1.46885216, + "balance_loss_mlp": 0.32221979, + "epoch": 0.1223508191793176, + "flos": 22274240313600.0, + "grad_norm": 15.36725132232068, + "language_loss": 0.80696404, + "learning_rate": 3.911200268044055e-06, + "loss": 0.82987589, + "num_input_tokens_seen": 43996620, + "router_z_loss_clip": 4.5078125, + "router_z_loss_mlp": 0.49414062, + "step": 2035, + "time_per_iteration": 2.6698505878448486 + }, + { + "auxiliary_loss_clip": 0.0196221, + "auxiliary_loss_mlp": 0.00408656, + "balance_loss_clip": 1.48058319, + "balance_loss_mlp": 0.35591739, + "epoch": 0.12241094243198557, + "flos": 21285978445440.0, + "grad_norm": 21.625540642824298, + "language_loss": 0.77652425, + "learning_rate": 3.911085470472892e-06, + "loss": 0.80023289, + "num_input_tokens_seen": 44016175, + "router_z_loss_clip": 4.8125, + "router_z_loss_mlp": 0.52783203, + "step": 2036, + "time_per_iteration": 2.769523859024048 + }, + { + "auxiliary_loss_clip": 0.01943328, + "auxiliary_loss_mlp": 0.00395587, + "balance_loss_clip": 1.47608852, + "balance_loss_mlp": 0.34246743, + "epoch": 0.12247106568465355, + "flos": 17382381022080.0, + "grad_norm": 12.872081961977104, + "language_loss": 0.89003801, + "learning_rate": 3.910970600433178e-06, + "loss": 0.91342711, + "num_input_tokens_seen": 44035060, + "router_z_loss_clip": 4.66796875, + "router_z_loss_mlp": 0.53076172, + "step": 2037, + "time_per_iteration": 2.6673941612243652 + }, + { + "auxiliary_loss_clip": 0.01979068, + "auxiliary_loss_mlp": 0.00396783, + "balance_loss_clip": 1.49738395, + "balance_loss_mlp": 0.3431389, + "epoch": 0.12253118893732151, + "flos": 27045438842880.0, + "grad_norm": 3.8415831493524353, + "language_loss": 0.89598948, + "learning_rate": 3.910855657929267e-06, + "loss": 0.91974801, + "num_input_tokens_seen": 44053330, + "router_z_loss_clip": 4.8203125, + "router_z_loss_mlp": 0.53662109, + "step": 2038, + "time_per_iteration": 2.699010133743286 + }, + { + "auxiliary_loss_clip": 0.02030504, + "auxiliary_loss_mlp": 0.00163056, + "balance_loss_clip": 1.72166944, + "balance_loss_mlp": 0.10545367, + "epoch": 0.12259131218998948, + "flos": 53861518368000.0, + "grad_norm": 0.8127998182082139, + "language_loss": 0.58381355, + "learning_rate": 3.910740642965518e-06, + "loss": 0.60574913, + "num_input_tokens_seen": 44107575, + "router_z_loss_clip": 3.09375, + "router_z_loss_mlp": 0.57421875, + "step": 2039, + "time_per_iteration": 3.0729787349700928 + }, + { + "auxiliary_loss_clip": 0.02006546, + "auxiliary_loss_mlp": 0.00368289, + "balance_loss_clip": 1.51543725, + "balance_loss_mlp": 0.31135482, + "epoch": 0.12265143544265744, + "flos": 17891917401600.0, + "grad_norm": 29.697478144298458, + "language_loss": 0.88885707, + "learning_rate": 3.910625555546292e-06, + "loss": 0.9126054, + "num_input_tokens_seen": 44126075, + "router_z_loss_clip": 4.91796875, + "router_z_loss_mlp": 0.56884766, + "step": 2040, + "time_per_iteration": 2.669980764389038 + }, + { + "auxiliary_loss_clip": 0.01964803, + "auxiliary_loss_mlp": 0.00368202, + "balance_loss_clip": 1.49581778, + "balance_loss_mlp": 0.31665611, + "epoch": 0.12271155869532542, + "flos": 21799932197760.0, + "grad_norm": 14.094835995497059, + "language_loss": 0.88352859, + "learning_rate": 3.910510395675953e-06, + "loss": 0.90685868, + "num_input_tokens_seen": 44145605, + "router_z_loss_clip": 4.69140625, + "router_z_loss_mlp": 0.515625, + "step": 2041, + "time_per_iteration": 2.710817575454712 + }, + { + "auxiliary_loss_clip": 0.02011324, + "auxiliary_loss_mlp": 0.00423752, + "balance_loss_clip": 1.5201025, + "balance_loss_mlp": 0.37025109, + "epoch": 0.12277168194799339, + "flos": 19828759587840.0, + "grad_norm": 30.856014992781862, + "language_loss": 0.74871147, + "learning_rate": 3.9103951633588694e-06, + "loss": 0.77306223, + "num_input_tokens_seen": 44164770, + "router_z_loss_clip": 4.91015625, + "router_z_loss_mlp": 0.53515625, + "step": 2042, + "time_per_iteration": 2.7029571533203125 + }, + { + "auxiliary_loss_clip": 0.01974424, + "auxiliary_loss_mlp": 0.00387545, + "balance_loss_clip": 1.49793458, + "balance_loss_mlp": 0.33747724, + "epoch": 0.12283180520066135, + "flos": 23221024951680.0, + "grad_norm": 3.81319594633366, + "language_loss": 0.86901999, + "learning_rate": 3.910279858599409e-06, + "loss": 0.8926397, + "num_input_tokens_seen": 44184025, + "router_z_loss_clip": 4.765625, + "router_z_loss_mlp": 0.50097656, + "step": 2043, + "time_per_iteration": 2.7868995666503906 + }, + { + "auxiliary_loss_clip": 0.0198587, + "auxiliary_loss_mlp": 0.0038744, + "balance_loss_clip": 1.49904442, + "balance_loss_mlp": 0.3337481, + "epoch": 0.12289192845332933, + "flos": 18588476920320.0, + "grad_norm": 8.442433340672633, + "language_loss": 0.86138409, + "learning_rate": 3.910164481401946e-06, + "loss": 0.88511717, + "num_input_tokens_seen": 44202950, + "router_z_loss_clip": 4.8671875, + "router_z_loss_mlp": 0.53686523, + "step": 2044, + "time_per_iteration": 2.692396879196167 + }, + { + "auxiliary_loss_clip": 0.01987572, + "auxiliary_loss_mlp": 0.00394512, + "balance_loss_clip": 1.50802231, + "balance_loss_mlp": 0.34158298, + "epoch": 0.1229520517059973, + "flos": 25769532862080.0, + "grad_norm": 3.045405250600599, + "language_loss": 0.83070517, + "learning_rate": 3.910049031770853e-06, + "loss": 0.85452604, + "num_input_tokens_seen": 44221115, + "router_z_loss_clip": 4.796875, + "router_z_loss_mlp": 0.52954102, + "step": 2045, + "time_per_iteration": 4.1421825885772705 + }, + { + "auxiliary_loss_clip": 0.01951534, + "auxiliary_loss_mlp": 0.00413587, + "balance_loss_clip": 1.48543489, + "balance_loss_mlp": 0.36220759, + "epoch": 0.12301217495866526, + "flos": 20887154760960.0, + "grad_norm": 39.237385245264974, + "language_loss": 0.76563686, + "learning_rate": 3.90993350971051e-06, + "loss": 0.78928804, + "num_input_tokens_seen": 44240575, + "router_z_loss_clip": 4.6640625, + "router_z_loss_mlp": 0.51391602, + "step": 2046, + "time_per_iteration": 4.081751585006714 + }, + { + "auxiliary_loss_clip": 0.01936632, + "auxiliary_loss_mlp": 0.00387261, + "balance_loss_clip": 1.47501516, + "balance_loss_mlp": 0.33969659, + "epoch": 0.12307229821133324, + "flos": 22378811783040.0, + "grad_norm": 3.0855459325776278, + "language_loss": 0.79870903, + "learning_rate": 3.909817915225297e-06, + "loss": 0.82194793, + "num_input_tokens_seen": 44257145, + "router_z_loss_clip": 4.6171875, + "router_z_loss_mlp": 0.47607422, + "step": 2047, + "time_per_iteration": 4.081492185592651 + }, + { + "auxiliary_loss_clip": 0.01932108, + "auxiliary_loss_mlp": 0.00407873, + "balance_loss_clip": 1.47784257, + "balance_loss_mlp": 0.3556115, + "epoch": 0.1231324214640012, + "flos": 23367396873600.0, + "grad_norm": 4.566384644948009, + "language_loss": 0.84182286, + "learning_rate": 3.909702248319597e-06, + "loss": 0.86522257, + "num_input_tokens_seen": 44278035, + "router_z_loss_clip": 4.546875, + "router_z_loss_mlp": 0.52294922, + "step": 2048, + "time_per_iteration": 2.784001111984253 + }, + { + "auxiliary_loss_clip": 0.01935365, + "auxiliary_loss_mlp": 0.00386043, + "balance_loss_clip": 1.49011409, + "balance_loss_mlp": 0.33704829, + "epoch": 0.12319254471666917, + "flos": 23767154311680.0, + "grad_norm": 8.607285925133759, + "language_loss": 0.92037117, + "learning_rate": 3.909586508997797e-06, + "loss": 0.94358528, + "num_input_tokens_seen": 44296980, + "router_z_loss_clip": 4.453125, + "router_z_loss_mlp": 0.48950195, + "step": 2049, + "time_per_iteration": 2.7165257930755615 + }, + { + "auxiliary_loss_clip": 0.01910118, + "auxiliary_loss_mlp": 0.00408525, + "balance_loss_clip": 1.4636395, + "balance_loss_mlp": 0.35924417, + "epoch": 0.12325266796933713, + "flos": 23550146294400.0, + "grad_norm": 13.943077043192451, + "language_loss": 0.82887417, + "learning_rate": 3.909470697264285e-06, + "loss": 0.85206062, + "num_input_tokens_seen": 44318005, + "router_z_loss_clip": 4.46875, + "router_z_loss_mlp": 0.49291992, + "step": 2050, + "time_per_iteration": 4.099667549133301 + }, + { + "auxiliary_loss_clip": 0.01916274, + "auxiliary_loss_mlp": 0.00430178, + "balance_loss_clip": 1.46171367, + "balance_loss_mlp": 0.3757475, + "epoch": 0.12331279122200511, + "flos": 24423996366720.0, + "grad_norm": 86.64768259303388, + "language_loss": 0.88433063, + "learning_rate": 3.909354813123452e-06, + "loss": 0.90779519, + "num_input_tokens_seen": 44335260, + "router_z_loss_clip": 4.54296875, + "router_z_loss_mlp": 0.54418945, + "step": 2051, + "time_per_iteration": 2.694953441619873 + }, + { + "auxiliary_loss_clip": 0.0189139, + "auxiliary_loss_mlp": 0.00390438, + "balance_loss_clip": 1.4580543, + "balance_loss_mlp": 0.34087071, + "epoch": 0.12337291447467308, + "flos": 25484294960640.0, + "grad_norm": 5.801912214934079, + "language_loss": 0.8567881, + "learning_rate": 3.909238856579693e-06, + "loss": 0.87960637, + "num_input_tokens_seen": 44355315, + "router_z_loss_clip": 4.3359375, + "router_z_loss_mlp": 0.49536133, + "step": 2052, + "time_per_iteration": 2.693880558013916 + }, + { + "auxiliary_loss_clip": 0.01882668, + "auxiliary_loss_mlp": 0.00391559, + "balance_loss_clip": 1.44145727, + "balance_loss_mlp": 0.34139621, + "epoch": 0.12343303772734104, + "flos": 23550002640000.0, + "grad_norm": 4.436952559144917, + "language_loss": 0.82103848, + "learning_rate": 3.909122827637406e-06, + "loss": 0.84378076, + "num_input_tokens_seen": 44373020, + "router_z_loss_clip": 4.41796875, + "router_z_loss_mlp": 0.50146484, + "step": 2053, + "time_per_iteration": 2.6890225410461426 + }, + { + "auxiliary_loss_clip": 0.01855337, + "auxiliary_loss_mlp": 0.00418763, + "balance_loss_clip": 1.41887665, + "balance_loss_mlp": 0.36852807, + "epoch": 0.12349316098000902, + "flos": 47557074867840.0, + "grad_norm": 1.9255231796273566, + "language_loss": 0.79452527, + "learning_rate": 3.909006726300991e-06, + "loss": 0.81726629, + "num_input_tokens_seen": 44397525, + "router_z_loss_clip": 4.37109375, + "router_z_loss_mlp": 0.50219727, + "step": 2054, + "time_per_iteration": 2.948261260986328 + }, + { + "auxiliary_loss_clip": 0.0184305, + "auxiliary_loss_mlp": 0.00388776, + "balance_loss_clip": 1.40926933, + "balance_loss_mlp": 0.34152189, + "epoch": 0.12355328423267699, + "flos": 25045969294080.0, + "grad_norm": 10.91568054629515, + "language_loss": 0.89208633, + "learning_rate": 3.908890552574849e-06, + "loss": 0.91440463, + "num_input_tokens_seen": 44415890, + "router_z_loss_clip": 4.34375, + "router_z_loss_mlp": 0.47265625, + "step": 2055, + "time_per_iteration": 2.7117483615875244 + }, + { + "auxiliary_loss_clip": 0.01817763, + "auxiliary_loss_mlp": 0.00421369, + "balance_loss_clip": 1.3977623, + "balance_loss_mlp": 0.37454316, + "epoch": 0.12361340748534495, + "flos": 27709140395520.0, + "grad_norm": 24.173134082624497, + "language_loss": 0.84100187, + "learning_rate": 3.908774306463384e-06, + "loss": 0.86339325, + "num_input_tokens_seen": 44436625, + "router_z_loss_clip": 4.203125, + "router_z_loss_mlp": 0.46826172, + "step": 2056, + "time_per_iteration": 2.737187623977661 + }, + { + "auxiliary_loss_clip": 0.01819137, + "auxiliary_loss_mlp": 0.00405996, + "balance_loss_clip": 1.39532304, + "balance_loss_mlp": 0.35638082, + "epoch": 0.12367353073801293, + "flos": 26140598311680.0, + "grad_norm": 9.832387690324614, + "language_loss": 0.90314275, + "learning_rate": 3.908657987971009e-06, + "loss": 0.92539406, + "num_input_tokens_seen": 44455265, + "router_z_loss_clip": 4.24023438, + "router_z_loss_mlp": 0.49584961, + "step": 2057, + "time_per_iteration": 2.700202226638794 + }, + { + "auxiliary_loss_clip": 0.0180609, + "auxiliary_loss_mlp": 0.00386127, + "balance_loss_clip": 1.39021182, + "balance_loss_mlp": 0.33746624, + "epoch": 0.1237336539906809, + "flos": 25156035544320.0, + "grad_norm": 4.584495861410982, + "language_loss": 0.83662587, + "learning_rate": 3.90854159710213e-06, + "loss": 0.85854799, + "num_input_tokens_seen": 44475815, + "router_z_loss_clip": 4.1484375, + "router_z_loss_mlp": 0.48608398, + "step": 2058, + "time_per_iteration": 2.7029194831848145 + }, + { + "auxiliary_loss_clip": 0.01808693, + "auxiliary_loss_mlp": 0.00395209, + "balance_loss_clip": 1.39141035, + "balance_loss_mlp": 0.34681019, + "epoch": 0.12379377724334886, + "flos": 15304589867520.0, + "grad_norm": 2.7520352996354114, + "language_loss": 0.9058882, + "learning_rate": 3.9084251338611624e-06, + "loss": 0.92792726, + "num_input_tokens_seen": 44494045, + "router_z_loss_clip": 4.16796875, + "router_z_loss_mlp": 0.48388672, + "step": 2059, + "time_per_iteration": 2.663249969482422 + }, + { + "auxiliary_loss_clip": 0.01781805, + "auxiliary_loss_mlp": 0.00391762, + "balance_loss_clip": 1.36414659, + "balance_loss_mlp": 0.34431642, + "epoch": 0.12385390049601683, + "flos": 21316717509120.0, + "grad_norm": 75.19768123926501, + "language_loss": 0.89420092, + "learning_rate": 3.908308598252523e-06, + "loss": 0.91593659, + "num_input_tokens_seen": 44509120, + "router_z_loss_clip": 4.19140625, + "router_z_loss_mlp": 0.47412109, + "step": 2060, + "time_per_iteration": 2.666761875152588 + }, + { + "auxiliary_loss_clip": 0.01770446, + "auxiliary_loss_mlp": 0.00411806, + "balance_loss_clip": 1.35978365, + "balance_loss_mlp": 0.36290622, + "epoch": 0.1239140237486848, + "flos": 15116309752320.0, + "grad_norm": 61.43247958533111, + "language_loss": 0.921507, + "learning_rate": 3.9081919902806306e-06, + "loss": 0.94332951, + "num_input_tokens_seen": 44525780, + "router_z_loss_clip": 4.10546875, + "router_z_loss_mlp": 0.48950195, + "step": 2061, + "time_per_iteration": 2.6435422897338867 + }, + { + "auxiliary_loss_clip": 0.01749322, + "auxiliary_loss_mlp": 0.00385184, + "balance_loss_clip": 1.35607731, + "balance_loss_mlp": 0.34048033, + "epoch": 0.12397414700135277, + "flos": 21976791788160.0, + "grad_norm": 12.732646878342084, + "language_loss": 0.90746045, + "learning_rate": 3.908075309949906e-06, + "loss": 0.92880547, + "num_input_tokens_seen": 44543125, + "router_z_loss_clip": 3.94140625, + "router_z_loss_mlp": 0.44702148, + "step": 2062, + "time_per_iteration": 2.682983636856079 + }, + { + "auxiliary_loss_clip": 0.01760842, + "auxiliary_loss_mlp": 0.00377206, + "balance_loss_clip": 1.35988343, + "balance_loss_mlp": 0.3310717, + "epoch": 0.12403427025402074, + "flos": 13400892956160.0, + "grad_norm": 4.036626663712904, + "language_loss": 0.85537976, + "learning_rate": 3.907958557264774e-06, + "loss": 0.87676024, + "num_input_tokens_seen": 44560275, + "router_z_loss_clip": 4.00976562, + "router_z_loss_mlp": 0.46142578, + "step": 2063, + "time_per_iteration": 2.7223594188690186 + }, + { + "auxiliary_loss_clip": 0.01746242, + "auxiliary_loss_mlp": 0.00378511, + "balance_loss_clip": 1.34462476, + "balance_loss_mlp": 0.3311134, + "epoch": 0.12409439350668872, + "flos": 15304374385920.0, + "grad_norm": 10.59341689024465, + "language_loss": 0.86875141, + "learning_rate": 3.907841732229663e-06, + "loss": 0.88999891, + "num_input_tokens_seen": 44577640, + "router_z_loss_clip": 4.015625, + "router_z_loss_mlp": 0.47436523, + "step": 2064, + "time_per_iteration": 2.6331048011779785 + }, + { + "auxiliary_loss_clip": 0.0173812, + "auxiliary_loss_mlp": 0.00403707, + "balance_loss_clip": 1.33657312, + "balance_loss_mlp": 0.35898, + "epoch": 0.12415451675935668, + "flos": 25009376313600.0, + "grad_norm": 15.176325939830342, + "language_loss": 0.97590244, + "learning_rate": 3.907724834849002e-06, + "loss": 0.99732071, + "num_input_tokens_seen": 44594860, + "router_z_loss_clip": 4.01367188, + "router_z_loss_mlp": 0.44726562, + "step": 2065, + "time_per_iteration": 2.7307064533233643 + }, + { + "auxiliary_loss_clip": 0.01722509, + "auxiliary_loss_mlp": 0.00402492, + "balance_loss_clip": 1.32592869, + "balance_loss_mlp": 0.35697755, + "epoch": 0.12421464001202465, + "flos": 23659673840640.0, + "grad_norm": 35.06853528406464, + "language_loss": 0.86829662, + "learning_rate": 3.907607865127225e-06, + "loss": 0.88954669, + "num_input_tokens_seen": 44614780, + "router_z_loss_clip": 3.96484375, + "router_z_loss_mlp": 0.45483398, + "step": 2066, + "time_per_iteration": 2.7077105045318604 + }, + { + "auxiliary_loss_clip": 0.01640537, + "auxiliary_loss_mlp": 0.00181686, + "balance_loss_clip": 1.38238072, + "balance_loss_mlp": 0.16461532, + "epoch": 0.12427476326469263, + "flos": 65732904345600.0, + "grad_norm": 0.873693752439367, + "language_loss": 0.63664752, + "learning_rate": 3.907490823068766e-06, + "loss": 0.65486979, + "num_input_tokens_seen": 44671240, + "router_z_loss_clip": 2.578125, + "router_z_loss_mlp": 0.17089844, + "step": 2067, + "time_per_iteration": 3.1525774002075195 + }, + { + "auxiliary_loss_clip": 0.01726075, + "auxiliary_loss_mlp": 0.00392461, + "balance_loss_clip": 1.32444191, + "balance_loss_mlp": 0.34801936, + "epoch": 0.12433488651736059, + "flos": 24535427333760.0, + "grad_norm": 31.962669120902927, + "language_loss": 1.00577569, + "learning_rate": 3.907373708678063e-06, + "loss": 1.02696097, + "num_input_tokens_seen": 44691050, + "router_z_loss_clip": 4.01171875, + "router_z_loss_mlp": 0.44433594, + "step": 2068, + "time_per_iteration": 2.6813771724700928 + }, + { + "auxiliary_loss_clip": 0.01713896, + "auxiliary_loss_mlp": 0.00400623, + "balance_loss_clip": 1.31890702, + "balance_loss_mlp": 0.35806513, + "epoch": 0.12439500977002856, + "flos": 21031659175680.0, + "grad_norm": 4.688460957845058, + "language_loss": 0.8693254, + "learning_rate": 3.9072565219595596e-06, + "loss": 0.89047062, + "num_input_tokens_seen": 44709850, + "router_z_loss_clip": 3.95117188, + "router_z_loss_mlp": 0.42578125, + "step": 2069, + "time_per_iteration": 2.640092372894287 + }, + { + "auxiliary_loss_clip": 0.0171443, + "auxiliary_loss_mlp": 0.00406793, + "balance_loss_clip": 1.31456923, + "balance_loss_mlp": 0.35715446, + "epoch": 0.12445513302269653, + "flos": 26830621555200.0, + "grad_norm": 6.839066480456285, + "language_loss": 0.81538022, + "learning_rate": 3.907139262917696e-06, + "loss": 0.83659244, + "num_input_tokens_seen": 44731475, + "router_z_loss_clip": 4.00195312, + "router_z_loss_mlp": 0.49633789, + "step": 2070, + "time_per_iteration": 2.7099668979644775 + }, + { + "auxiliary_loss_clip": 0.01711394, + "auxiliary_loss_mlp": 0.00375883, + "balance_loss_clip": 1.31867635, + "balance_loss_mlp": 0.33056006, + "epoch": 0.1245152562753645, + "flos": 18368919037440.0, + "grad_norm": 9.882925816215396, + "language_loss": 0.89791441, + "learning_rate": 3.907021931556922e-06, + "loss": 0.91878718, + "num_input_tokens_seen": 44749685, + "router_z_loss_clip": 3.92773438, + "router_z_loss_mlp": 0.45336914, + "step": 2071, + "time_per_iteration": 2.6528475284576416 + }, + { + "auxiliary_loss_clip": 0.01690362, + "auxiliary_loss_mlp": 0.00400887, + "balance_loss_clip": 1.30400801, + "balance_loss_mlp": 0.35601687, + "epoch": 0.12457537952803246, + "flos": 33107986200960.0, + "grad_norm": 19.814249211283823, + "language_loss": 0.83038598, + "learning_rate": 3.906904527881684e-06, + "loss": 0.85129851, + "num_input_tokens_seen": 44772165, + "router_z_loss_clip": 3.859375, + "router_z_loss_mlp": 0.44848633, + "step": 2072, + "time_per_iteration": 2.7795817852020264 + }, + { + "auxiliary_loss_clip": 0.01687993, + "auxiliary_loss_mlp": 0.00389975, + "balance_loss_clip": 1.29302573, + "balance_loss_mlp": 0.34484261, + "epoch": 0.12463550278070043, + "flos": 22270217990400.0, + "grad_norm": 3.2584239256339265, + "language_loss": 0.81820977, + "learning_rate": 3.9067870518964355e-06, + "loss": 0.8389895, + "num_input_tokens_seen": 44790580, + "router_z_loss_clip": 3.95117188, + "router_z_loss_mlp": 0.45166016, + "step": 2073, + "time_per_iteration": 2.7300331592559814 + }, + { + "auxiliary_loss_clip": 0.01680669, + "auxiliary_loss_mlp": 0.00359832, + "balance_loss_clip": 1.29280007, + "balance_loss_mlp": 0.3176792, + "epoch": 0.12469562603336841, + "flos": 14679025580160.0, + "grad_norm": 42.49441958192599, + "language_loss": 0.94735849, + "learning_rate": 3.906669503605631e-06, + "loss": 0.96776354, + "num_input_tokens_seen": 44806730, + "router_z_loss_clip": 3.8828125, + "router_z_loss_mlp": 0.42138672, + "step": 2074, + "time_per_iteration": 2.6676025390625 + }, + { + "auxiliary_loss_clip": 0.01683008, + "auxiliary_loss_mlp": 0.00448367, + "balance_loss_clip": 1.28231716, + "balance_loss_mlp": 0.39798966, + "epoch": 0.12475574928603637, + "flos": 24644775312000.0, + "grad_norm": 48.479385495813354, + "language_loss": 0.90834355, + "learning_rate": 3.906551883013728e-06, + "loss": 0.92965734, + "num_input_tokens_seen": 44825550, + "router_z_loss_clip": 4.00390625, + "router_z_loss_mlp": 0.50366211, + "step": 2075, + "time_per_iteration": 2.6708788871765137 + }, + { + "auxiliary_loss_clip": 0.01675622, + "auxiliary_loss_mlp": 0.00390652, + "balance_loss_clip": 1.28606749, + "balance_loss_mlp": 0.34232485, + "epoch": 0.12481587253870434, + "flos": 21762980081280.0, + "grad_norm": 22.528077385205204, + "language_loss": 0.79129732, + "learning_rate": 3.9064341901251865e-06, + "loss": 0.8119601, + "num_input_tokens_seen": 44844155, + "router_z_loss_clip": 3.890625, + "router_z_loss_mlp": 0.48339844, + "step": 2076, + "time_per_iteration": 2.6896884441375732 + }, + { + "auxiliary_loss_clip": 0.01699708, + "auxiliary_loss_mlp": 0.00406741, + "balance_loss_clip": 1.31020296, + "balance_loss_mlp": 0.36332482, + "epoch": 0.12487599579137232, + "flos": 21432529935360.0, + "grad_norm": 2.900477239244331, + "language_loss": 0.81243014, + "learning_rate": 3.906316424944469e-06, + "loss": 0.83349454, + "num_input_tokens_seen": 44863780, + "router_z_loss_clip": 3.8984375, + "router_z_loss_mlp": 0.43408203, + "step": 2077, + "time_per_iteration": 2.6749234199523926 + }, + { + "auxiliary_loss_clip": 0.01667923, + "auxiliary_loss_mlp": 0.00445008, + "balance_loss_clip": 1.28414071, + "balance_loss_mlp": 0.39680004, + "epoch": 0.12493611904404028, + "flos": 16107624276480.0, + "grad_norm": 140.94893306817133, + "language_loss": 0.8956449, + "learning_rate": 3.906198587476043e-06, + "loss": 0.91677427, + "num_input_tokens_seen": 44881480, + "router_z_loss_clip": 3.84179688, + "router_z_loss_mlp": 0.48193359, + "step": 2078, + "time_per_iteration": 2.687915086746216 + }, + { + "auxiliary_loss_clip": 0.01685985, + "auxiliary_loss_mlp": 0.00423459, + "balance_loss_clip": 1.29154086, + "balance_loss_mlp": 0.3750366, + "epoch": 0.12499624229670825, + "flos": 21580266574080.0, + "grad_norm": 76.72788183895335, + "language_loss": 0.8157469, + "learning_rate": 3.906080677724374e-06, + "loss": 0.83684134, + "num_input_tokens_seen": 44900390, + "router_z_loss_clip": 3.94140625, + "router_z_loss_mlp": 0.48413086, + "step": 2079, + "time_per_iteration": 2.7145512104034424 + }, + { + "auxiliary_loss_clip": 0.01711822, + "auxiliary_loss_mlp": 0.00444153, + "balance_loss_clip": 1.31295455, + "balance_loss_mlp": 0.39844859, + "epoch": 0.1250563655493762, + "flos": 25699040421120.0, + "grad_norm": 14.901555346051175, + "language_loss": 0.91956753, + "learning_rate": 3.905962695693935e-06, + "loss": 0.9411273, + "num_input_tokens_seen": 44920375, + "router_z_loss_clip": 3.98632812, + "router_z_loss_mlp": 0.45678711, + "step": 2080, + "time_per_iteration": 2.723886013031006 + }, + { + "auxiliary_loss_clip": 0.01698945, + "auxiliary_loss_mlp": 0.00398199, + "balance_loss_clip": 1.30922639, + "balance_loss_mlp": 0.35161245, + "epoch": 0.12511648880204418, + "flos": 16909509450240.0, + "grad_norm": 18.279877199647796, + "language_loss": 0.91025496, + "learning_rate": 3.9058446413892e-06, + "loss": 0.93122643, + "num_input_tokens_seen": 44938415, + "router_z_loss_clip": 3.89648438, + "router_z_loss_mlp": 0.46630859, + "step": 2081, + "time_per_iteration": 2.666071891784668 + }, + { + "auxiliary_loss_clip": 0.01689778, + "auxiliary_loss_mlp": 0.00388622, + "balance_loss_clip": 1.30481625, + "balance_loss_mlp": 0.34658846, + "epoch": 0.12517661205471217, + "flos": 17567500740480.0, + "grad_norm": 3.487910934127648, + "language_loss": 0.81309235, + "learning_rate": 3.905726514814646e-06, + "loss": 0.83387631, + "num_input_tokens_seen": 44957135, + "router_z_loss_clip": 3.8515625, + "router_z_loss_mlp": 0.42016602, + "step": 2082, + "time_per_iteration": 2.7497057914733887 + }, + { + "auxiliary_loss_clip": 0.01716233, + "auxiliary_loss_mlp": 0.00410481, + "balance_loss_clip": 1.31175113, + "balance_loss_mlp": 0.36415654, + "epoch": 0.12523673530738014, + "flos": 16033791870720.0, + "grad_norm": 12.881752486950456, + "language_loss": 0.87881494, + "learning_rate": 3.9056083159747495e-06, + "loss": 0.90008211, + "num_input_tokens_seen": 44974480, + "router_z_loss_clip": 4.04492188, + "router_z_loss_mlp": 0.46337891, + "step": 2083, + "time_per_iteration": 2.695235252380371 + }, + { + "auxiliary_loss_clip": 0.0171126, + "auxiliary_loss_mlp": 0.00405519, + "balance_loss_clip": 1.30931175, + "balance_loss_mlp": 0.35833639, + "epoch": 0.1252968585600481, + "flos": 18807747494400.0, + "grad_norm": 4.802378001144312, + "language_loss": 0.96305227, + "learning_rate": 3.9054900448739966e-06, + "loss": 0.98422003, + "num_input_tokens_seen": 44990310, + "router_z_loss_clip": 4.01953125, + "router_z_loss_mlp": 0.47192383, + "step": 2084, + "time_per_iteration": 2.6263034343719482 + }, + { + "auxiliary_loss_clip": 0.01719845, + "auxiliary_loss_mlp": 0.00371309, + "balance_loss_clip": 1.31958985, + "balance_loss_mlp": 0.32915649, + "epoch": 0.12535698181271607, + "flos": 27271568914560.0, + "grad_norm": 79.96079346506664, + "language_loss": 0.86879981, + "learning_rate": 3.905371701516869e-06, + "loss": 0.88971138, + "num_input_tokens_seen": 45010720, + "router_z_loss_clip": 3.99804688, + "router_z_loss_mlp": 0.42114258, + "step": 2085, + "time_per_iteration": 2.827605724334717 + }, + { + "auxiliary_loss_clip": 0.01681477, + "auxiliary_loss_mlp": 0.0037234, + "balance_loss_clip": 1.28363645, + "balance_loss_mlp": 0.33056876, + "epoch": 0.12541710506538403, + "flos": 22054107813120.0, + "grad_norm": 219.47050385690088, + "language_loss": 0.94437766, + "learning_rate": 3.905253285907856e-06, + "loss": 0.96491587, + "num_input_tokens_seen": 45030360, + "router_z_loss_clip": 3.98046875, + "router_z_loss_mlp": 0.41772461, + "step": 2086, + "time_per_iteration": 2.6515567302703857 + }, + { + "auxiliary_loss_clip": 0.01704364, + "auxiliary_loss_mlp": 0.00316115, + "balance_loss_clip": 1.30843055, + "balance_loss_mlp": 0.2761327, + "epoch": 0.125477228318052, + "flos": 12603173760000.0, + "grad_norm": 3.4339998040483146, + "language_loss": 0.91749728, + "learning_rate": 3.905134798051447e-06, + "loss": 0.93770206, + "num_input_tokens_seen": 45045085, + "router_z_loss_clip": 3.95703125, + "router_z_loss_mlp": 0.39990234, + "step": 2087, + "time_per_iteration": 4.096821546554565 + }, + { + "auxiliary_loss_clip": 0.01718061, + "auxiliary_loss_mlp": 0.00351611, + "balance_loss_clip": 1.31077933, + "balance_loss_mlp": 0.3073608, + "epoch": 0.12553735157071996, + "flos": 23878549365120.0, + "grad_norm": 1.8913959827349356, + "language_loss": 0.81052017, + "learning_rate": 3.905016237952136e-06, + "loss": 0.83121687, + "num_input_tokens_seen": 45065145, + "router_z_loss_clip": 4.07226562, + "router_z_loss_mlp": 0.44238281, + "step": 2088, + "time_per_iteration": 2.6884617805480957 + }, + { + "auxiliary_loss_clip": 0.0164792, + "auxiliary_loss_mlp": 0.00234942, + "balance_loss_clip": 1.33214402, + "balance_loss_mlp": 0.21911052, + "epoch": 0.12559747482338796, + "flos": 69920841830400.0, + "grad_norm": 0.7667830775052864, + "language_loss": 0.61813539, + "learning_rate": 3.904897605614418e-06, + "loss": 0.63696402, + "num_input_tokens_seen": 45126230, + "router_z_loss_clip": 3.15625, + "router_z_loss_mlp": 0.15820312, + "step": 2089, + "time_per_iteration": 4.502696990966797 + }, + { + "auxiliary_loss_clip": 0.01706935, + "auxiliary_loss_mlp": 0.00355111, + "balance_loss_clip": 1.29543042, + "balance_loss_mlp": 0.31260142, + "epoch": 0.12565759807605592, + "flos": 24279563779200.0, + "grad_norm": 3.6279139319812046, + "language_loss": 0.85941708, + "learning_rate": 3.904778901042793e-06, + "loss": 0.88003755, + "num_input_tokens_seen": 45145545, + "router_z_loss_clip": 4.11328125, + "router_z_loss_mlp": 0.42553711, + "step": 2090, + "time_per_iteration": 4.126194953918457 + }, + { + "auxiliary_loss_clip": 0.0161513, + "auxiliary_loss_mlp": 0.00176243, + "balance_loss_clip": 1.29992962, + "balance_loss_mlp": 0.16231941, + "epoch": 0.12571772132872389, + "flos": 56451180286080.0, + "grad_norm": 0.7559432867246049, + "language_loss": 0.58621824, + "learning_rate": 3.90466012424176e-06, + "loss": 0.60413194, + "num_input_tokens_seen": 45206845, + "router_z_loss_clip": 3.15625, + "router_z_loss_mlp": 0.13964844, + "step": 2091, + "time_per_iteration": 3.111359119415283 + }, + { + "auxiliary_loss_clip": 0.01732001, + "auxiliary_loss_mlp": 0.00320606, + "balance_loss_clip": 1.31456399, + "balance_loss_mlp": 0.27907377, + "epoch": 0.12577784458139185, + "flos": 41245846675200.0, + "grad_norm": 46.657890669289465, + "language_loss": 0.71219409, + "learning_rate": 3.904541275215825e-06, + "loss": 0.7327202, + "num_input_tokens_seen": 45228495, + "router_z_loss_clip": 4.171875, + "router_z_loss_mlp": 0.41552734, + "step": 2092, + "time_per_iteration": 4.238163232803345 + }, + { + "auxiliary_loss_clip": 0.01739254, + "auxiliary_loss_mlp": 0.00362456, + "balance_loss_clip": 1.31518829, + "balance_loss_mlp": 0.31782418, + "epoch": 0.12583796783405982, + "flos": 19755501799680.0, + "grad_norm": 14.845391907035642, + "language_loss": 0.88387609, + "learning_rate": 3.904422353969493e-06, + "loss": 0.90489316, + "num_input_tokens_seen": 45245720, + "router_z_loss_clip": 4.23828125, + "router_z_loss_mlp": 0.44604492, + "step": 2093, + "time_per_iteration": 2.8204591274261475 + }, + { + "auxiliary_loss_clip": 0.01711548, + "auxiliary_loss_mlp": 0.00320118, + "balance_loss_clip": 1.30734444, + "balance_loss_mlp": 0.27975422, + "epoch": 0.12589809108672778, + "flos": 22602104680320.0, + "grad_norm": 3.849064650104178, + "language_loss": 0.81710386, + "learning_rate": 3.904303360507276e-06, + "loss": 0.83742052, + "num_input_tokens_seen": 45265650, + "router_z_loss_clip": 4.0390625, + "router_z_loss_mlp": 0.40380859, + "step": 2094, + "time_per_iteration": 2.697709798812866 + }, + { + "auxiliary_loss_clip": 0.01729273, + "auxiliary_loss_mlp": 0.0032501, + "balance_loss_clip": 1.32296181, + "balance_loss_mlp": 0.28140366, + "epoch": 0.12595821433939577, + "flos": 45222845541120.0, + "grad_norm": 15.759362376272625, + "language_loss": 0.83316499, + "learning_rate": 3.9041842948336835e-06, + "loss": 0.85370785, + "num_input_tokens_seen": 45287790, + "router_z_loss_clip": 4.06445312, + "router_z_loss_mlp": 0.43603516, + "step": 2095, + "time_per_iteration": 2.8614957332611084 + }, + { + "auxiliary_loss_clip": 0.01730099, + "auxiliary_loss_mlp": 0.00338554, + "balance_loss_clip": 1.31732702, + "balance_loss_mlp": 0.29356402, + "epoch": 0.12601833759206374, + "flos": 14319811618560.0, + "grad_norm": 4.748756204444818, + "language_loss": 0.92814052, + "learning_rate": 3.904065156953232e-06, + "loss": 0.94882703, + "num_input_tokens_seen": 45305720, + "router_z_loss_clip": 4.12304688, + "router_z_loss_mlp": 0.45019531, + "step": 2096, + "time_per_iteration": 2.6677074432373047 + }, + { + "auxiliary_loss_clip": 0.0172964, + "auxiliary_loss_mlp": 0.00320261, + "balance_loss_clip": 1.32489789, + "balance_loss_mlp": 0.27832347, + "epoch": 0.1260784608447317, + "flos": 21288241002240.0, + "grad_norm": 20.765127297429633, + "language_loss": 0.84060937, + "learning_rate": 3.903945946870439e-06, + "loss": 0.86110842, + "num_input_tokens_seen": 45325290, + "router_z_loss_clip": 4.04296875, + "router_z_loss_mlp": 0.41943359, + "step": 2097, + "time_per_iteration": 2.663586378097534 + }, + { + "auxiliary_loss_clip": 0.01761472, + "auxiliary_loss_mlp": 0.00345266, + "balance_loss_clip": 1.34870267, + "balance_loss_mlp": 0.30139658, + "epoch": 0.12613858409739967, + "flos": 26251311006720.0, + "grad_norm": 17.188028424074336, + "language_loss": 0.93151975, + "learning_rate": 3.9038266645898246e-06, + "loss": 0.95258719, + "num_input_tokens_seen": 45344465, + "router_z_loss_clip": 4.13085938, + "router_z_loss_mlp": 0.43823242, + "step": 2098, + "time_per_iteration": 2.7051756381988525 + }, + { + "auxiliary_loss_clip": 0.01762013, + "auxiliary_loss_mlp": 0.00348689, + "balance_loss_clip": 1.34839749, + "balance_loss_mlp": 0.29924107, + "epoch": 0.12619870735006763, + "flos": 21579979265280.0, + "grad_norm": 2.146667909868135, + "language_loss": 0.77820039, + "learning_rate": 3.903707310115912e-06, + "loss": 0.79930747, + "num_input_tokens_seen": 45362465, + "router_z_loss_clip": 4.13671875, + "router_z_loss_mlp": 0.49462891, + "step": 2099, + "time_per_iteration": 2.7075464725494385 + }, + { + "auxiliary_loss_clip": 0.01756204, + "auxiliary_loss_mlp": 0.00350586, + "balance_loss_clip": 1.34743357, + "balance_loss_mlp": 0.30337888, + "epoch": 0.1262588306027356, + "flos": 23367037737600.0, + "grad_norm": 88.94749039985169, + "language_loss": 0.88204587, + "learning_rate": 3.903587883453228e-06, + "loss": 0.90311372, + "num_input_tokens_seen": 45382700, + "router_z_loss_clip": 4.09375, + "router_z_loss_mlp": 0.47216797, + "step": 2100, + "time_per_iteration": 2.7222235202789307 + }, + { + "auxiliary_loss_clip": 0.01805004, + "auxiliary_loss_mlp": 0.00366676, + "balance_loss_clip": 1.38769174, + "balance_loss_mlp": 0.31954086, + "epoch": 0.12631895385540357, + "flos": 23949185460480.0, + "grad_norm": 17.015151668349162, + "language_loss": 0.8767693, + "learning_rate": 3.903468384606302e-06, + "loss": 0.89848608, + "num_input_tokens_seen": 45401005, + "router_z_loss_clip": 4.171875, + "router_z_loss_mlp": 0.47119141, + "step": 2101, + "time_per_iteration": 2.7332987785339355 + }, + { + "auxiliary_loss_clip": 0.0170296, + "auxiliary_loss_mlp": 0.0011635, + "balance_loss_clip": 1.4436419, + "balance_loss_mlp": 0.06923802, + "epoch": 0.12637907710807156, + "flos": 70282138780800.0, + "grad_norm": 0.744726053726032, + "language_loss": 0.57040519, + "learning_rate": 3.903348813579662e-06, + "loss": 0.58859825, + "num_input_tokens_seen": 45466555, + "router_z_loss_clip": 2.59375, + "router_z_loss_mlp": 0.47070312, + "step": 2102, + "time_per_iteration": 3.211390495300293 + }, + { + "auxiliary_loss_clip": 0.01830239, + "auxiliary_loss_mlp": 0.00368598, + "balance_loss_clip": 1.40774679, + "balance_loss_mlp": 0.31867269, + "epoch": 0.12643920036073952, + "flos": 18915084311040.0, + "grad_norm": 26.38028617236145, + "language_loss": 0.99246216, + "learning_rate": 3.903229170377845e-06, + "loss": 1.01445055, + "num_input_tokens_seen": 45485165, + "router_z_loss_clip": 4.2265625, + "router_z_loss_mlp": 0.4987793, + "step": 2103, + "time_per_iteration": 2.686946392059326 + }, + { + "auxiliary_loss_clip": 0.01872387, + "auxiliary_loss_mlp": 0.00371786, + "balance_loss_clip": 1.44541466, + "balance_loss_mlp": 0.32374462, + "epoch": 0.1264993236134075, + "flos": 27782470010880.0, + "grad_norm": 3.422017288889648, + "language_loss": 0.83287525, + "learning_rate": 3.903109455005387e-06, + "loss": 0.855317, + "num_input_tokens_seen": 45504630, + "router_z_loss_clip": 4.26953125, + "router_z_loss_mlp": 0.48022461, + "step": 2104, + "time_per_iteration": 2.7553439140319824 + }, + { + "auxiliary_loss_clip": 0.01899143, + "auxiliary_loss_mlp": 0.00408695, + "balance_loss_clip": 1.4546032, + "balance_loss_mlp": 0.36155921, + "epoch": 0.12655944686607545, + "flos": 24754697907840.0, + "grad_norm": 2.8458516400502267, + "language_loss": 0.87583524, + "learning_rate": 3.902989667466828e-06, + "loss": 0.89891356, + "num_input_tokens_seen": 45524885, + "router_z_loss_clip": 4.4453125, + "router_z_loss_mlp": 0.47119141, + "step": 2105, + "time_per_iteration": 2.718404769897461 + }, + { + "auxiliary_loss_clip": 0.01904987, + "auxiliary_loss_mlp": 0.0042953, + "balance_loss_clip": 1.44875479, + "balance_loss_mlp": 0.37865168, + "epoch": 0.12661957011874342, + "flos": 24133048202880.0, + "grad_norm": 321.0046116578536, + "language_loss": 0.8996309, + "learning_rate": 3.90286980776671e-06, + "loss": 0.92297608, + "num_input_tokens_seen": 45545000, + "router_z_loss_clip": 4.56640625, + "router_z_loss_mlp": 0.5090332, + "step": 2106, + "time_per_iteration": 2.9100565910339355 + }, + { + "auxiliary_loss_clip": 0.01871835, + "auxiliary_loss_mlp": 0.00398222, + "balance_loss_clip": 1.43068004, + "balance_loss_mlp": 0.35015723, + "epoch": 0.12667969337141138, + "flos": 24569614103040.0, + "grad_norm": 125.37959747523752, + "language_loss": 0.7970767, + "learning_rate": 3.902749875909578e-06, + "loss": 0.81977725, + "num_input_tokens_seen": 45564210, + "router_z_loss_clip": 4.41015625, + "router_z_loss_mlp": 0.48022461, + "step": 2107, + "time_per_iteration": 2.691894054412842 + }, + { + "auxiliary_loss_clip": 0.01874767, + "auxiliary_loss_mlp": 0.00396223, + "balance_loss_clip": 1.43763816, + "balance_loss_mlp": 0.35087615, + "epoch": 0.12673981662407935, + "flos": 22961677777920.0, + "grad_norm": 12.734446934443623, + "language_loss": 0.85792273, + "learning_rate": 3.90262987189998e-06, + "loss": 0.8806327, + "num_input_tokens_seen": 45583030, + "router_z_loss_clip": 4.375, + "router_z_loss_mlp": 0.45361328, + "step": 2108, + "time_per_iteration": 2.7000575065612793 + }, + { + "auxiliary_loss_clip": 0.01882169, + "auxiliary_loss_mlp": 0.00443327, + "balance_loss_clip": 1.43769467, + "balance_loss_mlp": 0.39533347, + "epoch": 0.12679993987674734, + "flos": 17274864637440.0, + "grad_norm": 5.030709777228479, + "language_loss": 0.82092148, + "learning_rate": 3.902509795742467e-06, + "loss": 0.84417641, + "num_input_tokens_seen": 45602265, + "router_z_loss_clip": 4.4453125, + "router_z_loss_mlp": 0.48022461, + "step": 2109, + "time_per_iteration": 2.6058220863342285 + }, + { + "auxiliary_loss_clip": 0.01919924, + "auxiliary_loss_mlp": 0.0043401, + "balance_loss_clip": 1.46468365, + "balance_loss_mlp": 0.3858729, + "epoch": 0.1268600631294153, + "flos": 17275080119040.0, + "grad_norm": 4.52743301064182, + "language_loss": 0.88976061, + "learning_rate": 3.902389647441592e-06, + "loss": 0.91329998, + "num_input_tokens_seen": 45620595, + "router_z_loss_clip": 4.5625, + "router_z_loss_mlp": 0.48193359, + "step": 2110, + "time_per_iteration": 2.7262282371520996 + }, + { + "auxiliary_loss_clip": 0.01922748, + "auxiliary_loss_mlp": 0.00471652, + "balance_loss_clip": 1.46128762, + "balance_loss_mlp": 0.42270452, + "epoch": 0.12692018638208327, + "flos": 24061047390720.0, + "grad_norm": 2.6833899331736233, + "language_loss": 0.85108054, + "learning_rate": 3.90226942700191e-06, + "loss": 0.87502456, + "num_input_tokens_seen": 45641140, + "router_z_loss_clip": 4.6171875, + "router_z_loss_mlp": 0.48974609, + "step": 2111, + "time_per_iteration": 2.6392388343811035 + }, + { + "auxiliary_loss_clip": 0.01926495, + "auxiliary_loss_mlp": 0.00437575, + "balance_loss_clip": 1.45815408, + "balance_loss_mlp": 0.38858032, + "epoch": 0.12698030963475124, + "flos": 31831900652160.0, + "grad_norm": 24.625637092623357, + "language_loss": 0.85261023, + "learning_rate": 3.902149134427982e-06, + "loss": 0.87625092, + "num_input_tokens_seen": 45662315, + "router_z_loss_clip": 4.6796875, + "router_z_loss_mlp": 0.49023438, + "step": 2112, + "time_per_iteration": 2.756848096847534 + }, + { + "auxiliary_loss_clip": 0.01889376, + "auxiliary_loss_mlp": 0.00448293, + "balance_loss_clip": 1.44081557, + "balance_loss_mlp": 0.40103859, + "epoch": 0.1270404328874192, + "flos": 25187744275200.0, + "grad_norm": 4.145187832131962, + "language_loss": 0.91119504, + "learning_rate": 3.902028769724367e-06, + "loss": 0.93457174, + "num_input_tokens_seen": 45680335, + "router_z_loss_clip": 4.484375, + "router_z_loss_mlp": 0.47265625, + "step": 2113, + "time_per_iteration": 2.658616781234741 + }, + { + "auxiliary_loss_clip": 0.01949149, + "auxiliary_loss_mlp": 0.00460247, + "balance_loss_clip": 1.47604668, + "balance_loss_mlp": 0.40350366, + "epoch": 0.12710055614008717, + "flos": 15997342544640.0, + "grad_norm": 8.453803976940339, + "language_loss": 0.79926491, + "learning_rate": 3.9019083328956315e-06, + "loss": 0.82335883, + "num_input_tokens_seen": 45696240, + "router_z_loss_clip": 4.73046875, + "router_z_loss_mlp": 0.56738281, + "step": 2114, + "time_per_iteration": 2.6448779106140137 + }, + { + "auxiliary_loss_clip": 0.01876034, + "auxiliary_loss_mlp": 0.00430875, + "balance_loss_clip": 1.43540645, + "balance_loss_mlp": 0.38166547, + "epoch": 0.12716067939275516, + "flos": 15085642515840.0, + "grad_norm": 16.1134788944923, + "language_loss": 0.89567077, + "learning_rate": 3.901787823946341e-06, + "loss": 0.9187398, + "num_input_tokens_seen": 45713695, + "router_z_loss_clip": 4.40234375, + "router_z_loss_mlp": 0.49169922, + "step": 2115, + "time_per_iteration": 2.6348748207092285 + }, + { + "auxiliary_loss_clip": 0.01880549, + "auxiliary_loss_mlp": 0.0043009, + "balance_loss_clip": 1.43923485, + "balance_loss_mlp": 0.3832173, + "epoch": 0.12722080264542313, + "flos": 28366736636160.0, + "grad_norm": 53.450882083490065, + "language_loss": 0.93583977, + "learning_rate": 3.901667242881065e-06, + "loss": 0.95894623, + "num_input_tokens_seen": 45736655, + "router_z_loss_clip": 4.4140625, + "router_z_loss_mlp": 0.46923828, + "step": 2116, + "time_per_iteration": 2.839796543121338 + }, + { + "auxiliary_loss_clip": 0.01851017, + "auxiliary_loss_mlp": 0.00450155, + "balance_loss_clip": 1.42157745, + "balance_loss_mlp": 0.40812194, + "epoch": 0.1272809258980911, + "flos": 32379897519360.0, + "grad_norm": 20.31689017739344, + "language_loss": 0.75771403, + "learning_rate": 3.9015465897043775e-06, + "loss": 0.78072578, + "num_input_tokens_seen": 45758195, + "router_z_loss_clip": 4.29296875, + "router_z_loss_mlp": 0.42016602, + "step": 2117, + "time_per_iteration": 2.7671260833740234 + }, + { + "auxiliary_loss_clip": 0.01860101, + "auxiliary_loss_mlp": 0.00455165, + "balance_loss_clip": 1.41694832, + "balance_loss_mlp": 0.40791088, + "epoch": 0.12734104915075906, + "flos": 16034402401920.0, + "grad_norm": 17.71580929895811, + "language_loss": 0.91157782, + "learning_rate": 3.901425864420852e-06, + "loss": 0.93473053, + "num_input_tokens_seen": 45774280, + "router_z_loss_clip": 4.4296875, + "router_z_loss_mlp": 0.47241211, + "step": 2118, + "time_per_iteration": 2.650200366973877 + }, + { + "auxiliary_loss_clip": 0.01864874, + "auxiliary_loss_mlp": 0.00419558, + "balance_loss_clip": 1.42709148, + "balance_loss_mlp": 0.37518784, + "epoch": 0.12740117240342702, + "flos": 18260325244800.0, + "grad_norm": 3.207592072248894, + "language_loss": 0.9446193, + "learning_rate": 3.901305067035068e-06, + "loss": 0.96746367, + "num_input_tokens_seen": 45792760, + "router_z_loss_clip": 4.37890625, + "router_z_loss_mlp": 0.4440918, + "step": 2119, + "time_per_iteration": 2.678295850753784 + }, + { + "auxiliary_loss_clip": 0.01874503, + "auxiliary_loss_mlp": 0.00432211, + "balance_loss_clip": 1.43054414, + "balance_loss_mlp": 0.38509959, + "epoch": 0.127461295656095, + "flos": 12121790664960.0, + "grad_norm": 913.2025515079109, + "language_loss": 0.95669526, + "learning_rate": 3.901184197551605e-06, + "loss": 0.97976232, + "num_input_tokens_seen": 45804300, + "router_z_loss_clip": 4.43359375, + "router_z_loss_mlp": 0.47119141, + "step": 2120, + "time_per_iteration": 2.6011974811553955 + }, + { + "auxiliary_loss_clip": 0.01882495, + "auxiliary_loss_mlp": 0.00406709, + "balance_loss_clip": 1.43211985, + "balance_loss_mlp": 0.3616243, + "epoch": 0.12752141890876295, + "flos": 23149095966720.0, + "grad_norm": 3.576438013037645, + "language_loss": 0.83597523, + "learning_rate": 3.901063255975046e-06, + "loss": 0.85886729, + "num_input_tokens_seen": 45823780, + "router_z_loss_clip": 4.50390625, + "router_z_loss_mlp": 0.45092773, + "step": 2121, + "time_per_iteration": 2.7812507152557373 + }, + { + "auxiliary_loss_clip": 0.01870306, + "auxiliary_loss_mlp": 0.00439022, + "balance_loss_clip": 1.4280076, + "balance_loss_mlp": 0.39217302, + "epoch": 0.12758154216143094, + "flos": 21615997628160.0, + "grad_norm": 66.86927813558697, + "language_loss": 0.90969205, + "learning_rate": 3.900942242309978e-06, + "loss": 0.93278533, + "num_input_tokens_seen": 45840495, + "router_z_loss_clip": 4.42578125, + "router_z_loss_mlp": 0.46850586, + "step": 2122, + "time_per_iteration": 2.63783860206604 + }, + { + "auxiliary_loss_clip": 0.01878657, + "auxiliary_loss_mlp": 0.00442612, + "balance_loss_clip": 1.42971826, + "balance_loss_mlp": 0.39416558, + "epoch": 0.1276416654140989, + "flos": 15924874855680.0, + "grad_norm": 30.69898237085341, + "language_loss": 0.85788035, + "learning_rate": 3.90082115656099e-06, + "loss": 0.88109297, + "num_input_tokens_seen": 45857735, + "router_z_loss_clip": 4.48828125, + "router_z_loss_mlp": 0.48461914, + "step": 2123, + "time_per_iteration": 2.8146605491638184 + }, + { + "auxiliary_loss_clip": 0.01878321, + "auxiliary_loss_mlp": 0.00412049, + "balance_loss_clip": 1.4380126, + "balance_loss_mlp": 0.36677346, + "epoch": 0.12770178866676687, + "flos": 22382690451840.0, + "grad_norm": 11.750155678117526, + "language_loss": 0.85184264, + "learning_rate": 3.900699998732673e-06, + "loss": 0.87474638, + "num_input_tokens_seen": 45876485, + "router_z_loss_clip": 4.40625, + "router_z_loss_mlp": 0.45288086, + "step": 2124, + "time_per_iteration": 2.6366004943847656 + }, + { + "auxiliary_loss_clip": 0.01857431, + "auxiliary_loss_mlp": 0.00394344, + "balance_loss_clip": 1.41424346, + "balance_loss_mlp": 0.35045126, + "epoch": 0.12776191191943484, + "flos": 21652482867840.0, + "grad_norm": 3.7578796059147965, + "language_loss": 0.82128447, + "learning_rate": 3.900578768829623e-06, + "loss": 0.84380221, + "num_input_tokens_seen": 45894645, + "router_z_loss_clip": 4.42578125, + "router_z_loss_mlp": 0.43896484, + "step": 2125, + "time_per_iteration": 2.688483953475952 + }, + { + "auxiliary_loss_clip": 0.01863992, + "auxiliary_loss_mlp": 0.00412159, + "balance_loss_clip": 1.42043209, + "balance_loss_mlp": 0.36829045, + "epoch": 0.1278220351721028, + "flos": 25735561574400.0, + "grad_norm": 3.1930173913833677, + "language_loss": 0.84304464, + "learning_rate": 3.900457466856434e-06, + "loss": 0.8658061, + "num_input_tokens_seen": 45913755, + "router_z_loss_clip": 4.44140625, + "router_z_loss_mlp": 0.4387207, + "step": 2126, + "time_per_iteration": 2.6996278762817383 + }, + { + "auxiliary_loss_clip": 0.01862803, + "auxiliary_loss_mlp": 0.00419595, + "balance_loss_clip": 1.42413998, + "balance_loss_mlp": 0.37820518, + "epoch": 0.12788215842477077, + "flos": 41243224982400.0, + "grad_norm": 10.148381240003673, + "language_loss": 0.75616562, + "learning_rate": 3.9003360928177085e-06, + "loss": 0.77898961, + "num_input_tokens_seen": 45936095, + "router_z_loss_clip": 4.37890625, + "router_z_loss_mlp": 0.41381836, + "step": 2127, + "time_per_iteration": 2.9792075157165527 + }, + { + "auxiliary_loss_clip": 0.01860319, + "auxiliary_loss_mlp": 0.00116748, + "balance_loss_clip": 1.56499767, + "balance_loss_mlp": 0.0864211, + "epoch": 0.12794228167743876, + "flos": 70877430881280.0, + "grad_norm": 0.8212479864869062, + "language_loss": 0.62315154, + "learning_rate": 3.900214646718047e-06, + "loss": 0.64292228, + "num_input_tokens_seen": 46004655, + "router_z_loss_clip": 2.953125, + "router_z_loss_mlp": 0.30273438, + "step": 2128, + "time_per_iteration": 3.2068824768066406 + }, + { + "auxiliary_loss_clip": 0.01867829, + "auxiliary_loss_mlp": 0.00435067, + "balance_loss_clip": 1.42545891, + "balance_loss_mlp": 0.38752615, + "epoch": 0.12800240493010673, + "flos": 16289727252480.0, + "grad_norm": 691.7688551879252, + "language_loss": 0.8594743, + "learning_rate": 3.900093128562056e-06, + "loss": 0.88250327, + "num_input_tokens_seen": 46023610, + "router_z_loss_clip": 4.421875, + "router_z_loss_mlp": 0.47558594, + "step": 2129, + "time_per_iteration": 4.040445804595947 + }, + { + "auxiliary_loss_clip": 0.01846811, + "auxiliary_loss_mlp": 0.00422801, + "balance_loss_clip": 1.40798473, + "balance_loss_mlp": 0.37561789, + "epoch": 0.1280625281827747, + "flos": 20631542601600.0, + "grad_norm": 28.7605328028082, + "language_loss": 0.87154984, + "learning_rate": 3.899971538354343e-06, + "loss": 0.89424598, + "num_input_tokens_seen": 46041725, + "router_z_loss_clip": 4.38085938, + "router_z_loss_mlp": 0.47216797, + "step": 2130, + "time_per_iteration": 2.701592206954956 + }, + { + "auxiliary_loss_clip": 0.0185648, + "auxiliary_loss_mlp": 0.00447598, + "balance_loss_clip": 1.41309142, + "balance_loss_mlp": 0.40303731, + "epoch": 0.12812265143544266, + "flos": 22638230784000.0, + "grad_norm": 5.385552921053677, + "language_loss": 0.78741562, + "learning_rate": 3.899849876099518e-06, + "loss": 0.81045645, + "num_input_tokens_seen": 46061095, + "router_z_loss_clip": 4.4375, + "router_z_loss_mlp": 0.44555664, + "step": 2131, + "time_per_iteration": 4.095390319824219 + }, + { + "auxiliary_loss_clip": 0.01843978, + "auxiliary_loss_mlp": 0.00443391, + "balance_loss_clip": 1.41774273, + "balance_loss_mlp": 0.39995104, + "epoch": 0.12818277468811062, + "flos": 34714701463680.0, + "grad_norm": 8.136697697140965, + "language_loss": 0.78422701, + "learning_rate": 3.899728141802197e-06, + "loss": 0.80710071, + "num_input_tokens_seen": 46082670, + "router_z_loss_clip": 4.26171875, + "router_z_loss_mlp": 0.43408203, + "step": 2132, + "time_per_iteration": 4.28913950920105 + }, + { + "auxiliary_loss_clip": 0.01826986, + "auxiliary_loss_mlp": 0.00403817, + "balance_loss_clip": 1.4112184, + "balance_loss_mlp": 0.36195081, + "epoch": 0.1282428979407786, + "flos": 23112107936640.0, + "grad_norm": 3.7219319701229336, + "language_loss": 0.86790287, + "learning_rate": 3.8996063354669935e-06, + "loss": 0.89021087, + "num_input_tokens_seen": 46102410, + "router_z_loss_clip": 4.16015625, + "router_z_loss_mlp": 0.41894531, + "step": 2133, + "time_per_iteration": 2.645785093307495 + }, + { + "auxiliary_loss_clip": 0.01850686, + "auxiliary_loss_mlp": 0.00457326, + "balance_loss_clip": 1.41480219, + "balance_loss_mlp": 0.41288447, + "epoch": 0.12830302119344655, + "flos": 20886508316160.0, + "grad_norm": 16.447563226023828, + "language_loss": 0.8849923, + "learning_rate": 3.899484457098528e-06, + "loss": 0.90807247, + "num_input_tokens_seen": 46121145, + "router_z_loss_clip": 4.359375, + "router_z_loss_mlp": 0.4440918, + "step": 2134, + "time_per_iteration": 2.640983819961548 + }, + { + "auxiliary_loss_clip": 0.01830474, + "auxiliary_loss_mlp": 0.00444988, + "balance_loss_clip": 1.40820003, + "balance_loss_mlp": 0.39942592, + "epoch": 0.12836314444611455, + "flos": 21397768548480.0, + "grad_norm": 17.497241013409685, + "language_loss": 0.89519072, + "learning_rate": 3.899362506701421e-06, + "loss": 0.91794538, + "num_input_tokens_seen": 46140740, + "router_z_loss_clip": 4.23046875, + "router_z_loss_mlp": 0.45556641, + "step": 2135, + "time_per_iteration": 4.031800985336304 + }, + { + "auxiliary_loss_clip": 0.01814895, + "auxiliary_loss_mlp": 0.00432271, + "balance_loss_clip": 1.40296388, + "balance_loss_mlp": 0.38985634, + "epoch": 0.1284232676987825, + "flos": 13662466773120.0, + "grad_norm": 9.990883622697485, + "language_loss": 0.84238231, + "learning_rate": 3.899240484280298e-06, + "loss": 0.86485398, + "num_input_tokens_seen": 46156805, + "router_z_loss_clip": 4.12109375, + "router_z_loss_mlp": 0.42407227, + "step": 2136, + "time_per_iteration": 2.6686925888061523 + }, + { + "auxiliary_loss_clip": 0.01744958, + "auxiliary_loss_mlp": 0.00230887, + "balance_loss_clip": 1.50894976, + "balance_loss_mlp": 0.21124144, + "epoch": 0.12848339095145048, + "flos": 59994737735040.0, + "grad_norm": 0.9157893838301836, + "language_loss": 0.59662038, + "learning_rate": 3.899118389839785e-06, + "loss": 0.61637884, + "num_input_tokens_seen": 46222085, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.19628906, + "step": 2137, + "time_per_iteration": 3.2991943359375 + }, + { + "auxiliary_loss_clip": 0.01855914, + "auxiliary_loss_mlp": 0.00468289, + "balance_loss_clip": 1.42923236, + "balance_loss_mlp": 0.42418137, + "epoch": 0.12854351420411844, + "flos": 13881378211200.0, + "grad_norm": 2.176059162167412, + "language_loss": 0.90502697, + "learning_rate": 3.898996223384512e-06, + "loss": 0.92826903, + "num_input_tokens_seen": 46239970, + "router_z_loss_clip": 4.26171875, + "router_z_loss_mlp": 0.44116211, + "step": 2138, + "time_per_iteration": 2.648012399673462 + }, + { + "auxiliary_loss_clip": 0.01872676, + "auxiliary_loss_mlp": 0.00497255, + "balance_loss_clip": 1.43520129, + "balance_loss_mlp": 0.44866493, + "epoch": 0.1286036374567864, + "flos": 22637943475200.0, + "grad_norm": 451.16768205963155, + "language_loss": 0.85783231, + "learning_rate": 3.898873984919113e-06, + "loss": 0.8815316, + "num_input_tokens_seen": 46257740, + "router_z_loss_clip": 4.375, + "router_z_loss_mlp": 0.48632812, + "step": 2139, + "time_per_iteration": 2.8073408603668213 + }, + { + "auxiliary_loss_clip": 0.01838827, + "auxiliary_loss_mlp": 0.00409904, + "balance_loss_clip": 1.42780602, + "balance_loss_mlp": 0.36839515, + "epoch": 0.12866376070945437, + "flos": 16324775948160.0, + "grad_norm": 111.748254402506, + "language_loss": 0.89996672, + "learning_rate": 3.8987516744482215e-06, + "loss": 0.922454, + "num_input_tokens_seen": 46275445, + "router_z_loss_clip": 4.10742188, + "router_z_loss_mlp": 0.4152832, + "step": 2140, + "time_per_iteration": 2.643249273300171 + }, + { + "auxiliary_loss_clip": 0.01811667, + "auxiliary_loss_mlp": 0.00446993, + "balance_loss_clip": 1.40908754, + "balance_loss_mlp": 0.40512669, + "epoch": 0.12872388396212234, + "flos": 11874546374400.0, + "grad_norm": 22.491088816815292, + "language_loss": 0.92005253, + "learning_rate": 3.898629291976476e-06, + "loss": 0.94263911, + "num_input_tokens_seen": 46291710, + "router_z_loss_clip": 4.03125, + "router_z_loss_mlp": 0.41894531, + "step": 2141, + "time_per_iteration": 2.6784236431121826 + }, + { + "auxiliary_loss_clip": 0.01807008, + "auxiliary_loss_mlp": 0.00453182, + "balance_loss_clip": 1.41078246, + "balance_loss_mlp": 0.40819228, + "epoch": 0.12878400721479033, + "flos": 28366700722560.0, + "grad_norm": 18.201279704919, + "language_loss": 0.76158154, + "learning_rate": 3.898506837508518e-06, + "loss": 0.78418344, + "num_input_tokens_seen": 46311335, + "router_z_loss_clip": 3.95898438, + "router_z_loss_mlp": 0.44970703, + "step": 2142, + "time_per_iteration": 2.7159788608551025 + }, + { + "auxiliary_loss_clip": 0.0182741, + "auxiliary_loss_mlp": 0.00475293, + "balance_loss_clip": 1.42543662, + "balance_loss_mlp": 0.42534417, + "epoch": 0.1288441304674583, + "flos": 25885632597120.0, + "grad_norm": 485.58166862508403, + "language_loss": 0.88744438, + "learning_rate": 3.89838431104899e-06, + "loss": 0.91047144, + "num_input_tokens_seen": 46330985, + "router_z_loss_clip": 4.01757812, + "router_z_loss_mlp": 0.49926758, + "step": 2143, + "time_per_iteration": 2.7095143795013428 + }, + { + "auxiliary_loss_clip": 0.01820282, + "auxiliary_loss_mlp": 0.00464024, + "balance_loss_clip": 1.42267621, + "balance_loss_mlp": 0.41633987, + "epoch": 0.12890425372012626, + "flos": 20813789232000.0, + "grad_norm": 7.284826436223412, + "language_loss": 0.86425102, + "learning_rate": 3.898261712602539e-06, + "loss": 0.88709414, + "num_input_tokens_seen": 46351295, + "router_z_loss_clip": 3.97265625, + "router_z_loss_mlp": 0.47729492, + "step": 2144, + "time_per_iteration": 2.66925311088562 + }, + { + "auxiliary_loss_clip": 0.01777986, + "auxiliary_loss_mlp": 0.00451539, + "balance_loss_clip": 1.39586389, + "balance_loss_mlp": 0.40645421, + "epoch": 0.12896437697279423, + "flos": 22565870835840.0, + "grad_norm": 22.000421952332097, + "language_loss": 0.84740174, + "learning_rate": 3.898139042173813e-06, + "loss": 0.86969697, + "num_input_tokens_seen": 46368600, + "router_z_loss_clip": 3.8203125, + "router_z_loss_mlp": 0.45043945, + "step": 2145, + "time_per_iteration": 2.7036099433898926 + }, + { + "auxiliary_loss_clip": 0.01759627, + "auxiliary_loss_mlp": 0.00440991, + "balance_loss_clip": 1.38627028, + "balance_loss_mlp": 0.39392725, + "epoch": 0.1290245002254622, + "flos": 17493776075520.0, + "grad_norm": 4.715395168394478, + "language_loss": 0.89599967, + "learning_rate": 3.898016299767465e-06, + "loss": 0.91800582, + "num_input_tokens_seen": 46387370, + "router_z_loss_clip": 3.734375, + "router_z_loss_mlp": 0.47094727, + "step": 2146, + "time_per_iteration": 2.657203197479248 + }, + { + "auxiliary_loss_clip": 0.01734711, + "auxiliary_loss_mlp": 0.00446417, + "balance_loss_clip": 1.37188435, + "balance_loss_mlp": 0.40168983, + "epoch": 0.12908462347813016, + "flos": 36315957859200.0, + "grad_norm": 5.77019151459533, + "language_loss": 0.7744388, + "learning_rate": 3.897893485388149e-06, + "loss": 0.7962501, + "num_input_tokens_seen": 46409570, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 0.44726562, + "step": 2147, + "time_per_iteration": 2.821298122406006 + }, + { + "auxiliary_loss_clip": 0.01726245, + "auxiliary_loss_mlp": 0.0047571, + "balance_loss_clip": 1.36647344, + "balance_loss_mlp": 0.4265486, + "epoch": 0.12914474673079815, + "flos": 22528703237760.0, + "grad_norm": 4.762964990354122, + "language_loss": 0.77224517, + "learning_rate": 3.897770599040521e-06, + "loss": 0.79426467, + "num_input_tokens_seen": 46429320, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 0.49194336, + "step": 2148, + "time_per_iteration": 2.75254225730896 + }, + { + "auxiliary_loss_clip": 0.01690865, + "auxiliary_loss_mlp": 0.00483768, + "balance_loss_clip": 1.34275866, + "balance_loss_mlp": 0.43582216, + "epoch": 0.12920486998346611, + "flos": 21471888263040.0, + "grad_norm": 4.588498531273309, + "language_loss": 0.82972169, + "learning_rate": 3.897647640729242e-06, + "loss": 0.85146803, + "num_input_tokens_seen": 46450155, + "router_z_loss_clip": 3.484375, + "router_z_loss_mlp": 0.47924805, + "step": 2149, + "time_per_iteration": 2.6784353256225586 + }, + { + "auxiliary_loss_clip": 0.01695018, + "auxiliary_loss_mlp": 0.00450009, + "balance_loss_clip": 1.34649622, + "balance_loss_mlp": 0.40387499, + "epoch": 0.12926499323613408, + "flos": 27308556944640.0, + "grad_norm": 29.78124015872229, + "language_loss": 0.83015358, + "learning_rate": 3.897524610458975e-06, + "loss": 0.85160387, + "num_input_tokens_seen": 46470280, + "router_z_loss_clip": 3.48046875, + "router_z_loss_mlp": 0.4621582, + "step": 2150, + "time_per_iteration": 2.727299451828003 + }, + { + "auxiliary_loss_clip": 0.01673302, + "auxiliary_loss_mlp": 0.00456217, + "balance_loss_clip": 1.32677722, + "balance_loss_mlp": 0.41170382, + "epoch": 0.12932511648880204, + "flos": 22091131756800.0, + "grad_norm": 410.62531179703853, + "language_loss": 0.77656949, + "learning_rate": 3.8974015082343835e-06, + "loss": 0.79786462, + "num_input_tokens_seen": 46487605, + "router_z_loss_clip": 3.46484375, + "router_z_loss_mlp": 0.44506836, + "step": 2151, + "time_per_iteration": 2.652137517929077 + }, + { + "auxiliary_loss_clip": 0.01665846, + "auxiliary_loss_mlp": 0.00460785, + "balance_loss_clip": 1.32561648, + "balance_loss_mlp": 0.41584271, + "epoch": 0.12938523974147, + "flos": 20302780394880.0, + "grad_norm": 3.4018003286765963, + "language_loss": 0.8965103, + "learning_rate": 3.897278334060137e-06, + "loss": 0.91777658, + "num_input_tokens_seen": 46505100, + "router_z_loss_clip": 3.40429688, + "router_z_loss_mlp": 0.44970703, + "step": 2152, + "time_per_iteration": 2.6721339225769043 + }, + { + "auxiliary_loss_clip": 0.01651161, + "auxiliary_loss_mlp": 0.00407457, + "balance_loss_clip": 1.31799555, + "balance_loss_mlp": 0.36528087, + "epoch": 0.12944536299413797, + "flos": 19499961467520.0, + "grad_norm": 5.988853955469221, + "language_loss": 0.84988821, + "learning_rate": 3.897155087940906e-06, + "loss": 0.87047434, + "num_input_tokens_seen": 46524020, + "router_z_loss_clip": 3.33203125, + "router_z_loss_mlp": 0.421875, + "step": 2153, + "time_per_iteration": 2.665348529815674 + }, + { + "auxiliary_loss_clip": 0.01670566, + "auxiliary_loss_mlp": 0.004234, + "balance_loss_clip": 1.33499193, + "balance_loss_mlp": 0.38072309, + "epoch": 0.12950548624680594, + "flos": 27707919333120.0, + "grad_norm": 11.382817267989774, + "language_loss": 0.86426175, + "learning_rate": 3.897031769881364e-06, + "loss": 0.88520145, + "num_input_tokens_seen": 46544640, + "router_z_loss_clip": 3.35742188, + "router_z_loss_mlp": 0.42675781, + "step": 2154, + "time_per_iteration": 2.751298427581787 + }, + { + "auxiliary_loss_clip": 0.01653418, + "auxiliary_loss_mlp": 0.00452661, + "balance_loss_clip": 1.32102323, + "balance_loss_mlp": 0.40943563, + "epoch": 0.12956560949947393, + "flos": 17565740974080.0, + "grad_norm": 4.0826583565111445, + "language_loss": 0.89364982, + "learning_rate": 3.896908379886188e-06, + "loss": 0.91471064, + "num_input_tokens_seen": 46561395, + "router_z_loss_clip": 3.32617188, + "router_z_loss_mlp": 0.43237305, + "step": 2155, + "time_per_iteration": 2.6321706771850586 + }, + { + "auxiliary_loss_clip": 0.01626097, + "auxiliary_loss_mlp": 0.00453152, + "balance_loss_clip": 1.2906692, + "balance_loss_mlp": 0.41045088, + "epoch": 0.1296257327521419, + "flos": 20740711011840.0, + "grad_norm": 90.27107559925862, + "language_loss": 0.84624463, + "learning_rate": 3.896784917960055e-06, + "loss": 0.86703712, + "num_input_tokens_seen": 46579395, + "router_z_loss_clip": 3.35546875, + "router_z_loss_mlp": 0.42675781, + "step": 2156, + "time_per_iteration": 2.688981533050537 + }, + { + "auxiliary_loss_clip": 0.01628206, + "auxiliary_loss_mlp": 0.00427089, + "balance_loss_clip": 1.30503094, + "balance_loss_mlp": 0.38567519, + "epoch": 0.12968585600480986, + "flos": 16395735265920.0, + "grad_norm": 11.742380929726915, + "language_loss": 0.90689915, + "learning_rate": 3.896661384107648e-06, + "loss": 0.92745209, + "num_input_tokens_seen": 46597090, + "router_z_loss_clip": 3.23242188, + "router_z_loss_mlp": 0.4140625, + "step": 2157, + "time_per_iteration": 2.6291191577911377 + }, + { + "auxiliary_loss_clip": 0.01625416, + "auxiliary_loss_mlp": 0.00432718, + "balance_loss_clip": 1.29532659, + "balance_loss_mlp": 0.38915873, + "epoch": 0.12974597925747783, + "flos": 28329533124480.0, + "grad_norm": 173.26795372566554, + "language_loss": 0.86296451, + "learning_rate": 3.896537778333651e-06, + "loss": 0.88354588, + "num_input_tokens_seen": 46617355, + "router_z_loss_clip": 3.29882812, + "router_z_loss_mlp": 0.43554688, + "step": 2158, + "time_per_iteration": 2.7158915996551514 + }, + { + "auxiliary_loss_clip": 0.01604328, + "auxiliary_loss_mlp": 0.00489566, + "balance_loss_clip": 1.27445376, + "balance_loss_mlp": 0.44226357, + "epoch": 0.1298061025101458, + "flos": 9683025782400.0, + "grad_norm": 45.33372519514227, + "language_loss": 0.81698406, + "learning_rate": 3.896414100642752e-06, + "loss": 0.83792293, + "num_input_tokens_seen": 46633130, + "router_z_loss_clip": 3.296875, + "router_z_loss_mlp": 0.47314453, + "step": 2159, + "time_per_iteration": 2.7019472122192383 + }, + { + "auxiliary_loss_clip": 0.01589649, + "auxiliary_loss_mlp": 0.00441269, + "balance_loss_clip": 1.27203035, + "balance_loss_mlp": 0.4010244, + "epoch": 0.12986622576281376, + "flos": 27709535445120.0, + "grad_norm": 32.51788720524774, + "language_loss": 0.89362848, + "learning_rate": 3.89629035103964e-06, + "loss": 0.91393763, + "num_input_tokens_seen": 46650575, + "router_z_loss_clip": 3.17578125, + "router_z_loss_mlp": 0.40234375, + "step": 2160, + "time_per_iteration": 2.7197370529174805 + }, + { + "auxiliary_loss_clip": 0.01598905, + "auxiliary_loss_mlp": 0.00423213, + "balance_loss_clip": 1.2806952, + "balance_loss_mlp": 0.37836659, + "epoch": 0.12992634901548175, + "flos": 18802719590400.0, + "grad_norm": 276.1469797367235, + "language_loss": 0.86583245, + "learning_rate": 3.896166529529008e-06, + "loss": 0.88605356, + "num_input_tokens_seen": 46668780, + "router_z_loss_clip": 3.1796875, + "router_z_loss_mlp": 0.44848633, + "step": 2161, + "time_per_iteration": 2.71525502204895 + }, + { + "auxiliary_loss_clip": 0.01603232, + "auxiliary_loss_mlp": 0.00456297, + "balance_loss_clip": 1.28134727, + "balance_loss_mlp": 0.41030651, + "epoch": 0.12998647226814972, + "flos": 29127575543040.0, + "grad_norm": 2.8414930949012005, + "language_loss": 0.8906157, + "learning_rate": 3.896042636115551e-06, + "loss": 0.91121101, + "num_input_tokens_seen": 46687550, + "router_z_loss_clip": 3.21875, + "router_z_loss_mlp": 0.45947266, + "step": 2162, + "time_per_iteration": 2.7103567123413086 + }, + { + "auxiliary_loss_clip": 0.01599689, + "auxiliary_loss_mlp": 0.00441556, + "balance_loss_clip": 1.27351594, + "balance_loss_mlp": 0.39952308, + "epoch": 0.13004659552081768, + "flos": 19573686132480.0, + "grad_norm": 229.5492161885517, + "language_loss": 0.79769391, + "learning_rate": 3.895918670803968e-06, + "loss": 0.81810635, + "num_input_tokens_seen": 46706730, + "router_z_loss_clip": 3.25585938, + "router_z_loss_mlp": 0.42041016, + "step": 2163, + "time_per_iteration": 2.6889004707336426 + }, + { + "auxiliary_loss_clip": 0.01589461, + "auxiliary_loss_mlp": 0.00454405, + "balance_loss_clip": 1.26793075, + "balance_loss_mlp": 0.40917701, + "epoch": 0.13010671877348565, + "flos": 22490709626880.0, + "grad_norm": 37.164932538998485, + "language_loss": 0.87828308, + "learning_rate": 3.895794633598958e-06, + "loss": 0.89872169, + "num_input_tokens_seen": 46724250, + "router_z_loss_clip": 3.21875, + "router_z_loss_mlp": 0.45214844, + "step": 2164, + "time_per_iteration": 2.6591014862060547 + }, + { + "auxiliary_loss_clip": 0.01583285, + "auxiliary_loss_mlp": 0.00479302, + "balance_loss_clip": 1.259619, + "balance_loss_mlp": 0.4348844, + "epoch": 0.1301668420261536, + "flos": 23878226142720.0, + "grad_norm": 86.437227601016, + "language_loss": 0.79446328, + "learning_rate": 3.8956705245052256e-06, + "loss": 0.81508917, + "num_input_tokens_seen": 46744105, + "router_z_loss_clip": 3.23242188, + "router_z_loss_mlp": 0.44458008, + "step": 2165, + "time_per_iteration": 2.7069926261901855 + }, + { + "auxiliary_loss_clip": 0.01598496, + "auxiliary_loss_mlp": 0.00433174, + "balance_loss_clip": 1.27131283, + "balance_loss_mlp": 0.38673019, + "epoch": 0.13022696527882158, + "flos": 23150065633920.0, + "grad_norm": 230.4066382092491, + "language_loss": 0.8238278, + "learning_rate": 3.8955463435274765e-06, + "loss": 0.84414446, + "num_input_tokens_seen": 46764250, + "router_z_loss_clip": 3.27148438, + "router_z_loss_mlp": 0.46411133, + "step": 2166, + "time_per_iteration": 2.6700475215911865 + }, + { + "auxiliary_loss_clip": 0.01568292, + "auxiliary_loss_mlp": 0.00428638, + "balance_loss_clip": 1.25126505, + "balance_loss_mlp": 0.38805878, + "epoch": 0.13028708853148954, + "flos": 26908548111360.0, + "grad_norm": 188.65731273129157, + "language_loss": 0.88631666, + "learning_rate": 3.895422090670421e-06, + "loss": 0.90628594, + "num_input_tokens_seen": 46786865, + "router_z_loss_clip": 3.171875, + "router_z_loss_mlp": 0.40576172, + "step": 2167, + "time_per_iteration": 2.806652069091797 + }, + { + "auxiliary_loss_clip": 0.01567749, + "auxiliary_loss_mlp": 0.00437976, + "balance_loss_clip": 1.25153232, + "balance_loss_mlp": 0.39482185, + "epoch": 0.13034721178415754, + "flos": 21251468453760.0, + "grad_norm": 42.159399595956, + "language_loss": 0.89860308, + "learning_rate": 3.89529776593877e-06, + "loss": 0.91866028, + "num_input_tokens_seen": 46807030, + "router_z_loss_clip": 3.16210938, + "router_z_loss_mlp": 0.43188477, + "step": 2168, + "time_per_iteration": 2.6503350734710693 + }, + { + "auxiliary_loss_clip": 0.01552491, + "auxiliary_loss_mlp": 0.00415648, + "balance_loss_clip": 1.23271501, + "balance_loss_mlp": 0.37478325, + "epoch": 0.1304073350368255, + "flos": 18767239931520.0, + "grad_norm": 6.64503790927978, + "language_loss": 0.86463511, + "learning_rate": 3.8951733693372375e-06, + "loss": 0.88431656, + "num_input_tokens_seen": 46826280, + "router_z_loss_clip": 3.19921875, + "router_z_loss_mlp": 0.40844727, + "step": 2169, + "time_per_iteration": 2.680546998977661 + }, + { + "auxiliary_loss_clip": 0.01555653, + "auxiliary_loss_mlp": 0.00407948, + "balance_loss_clip": 1.23892069, + "balance_loss_mlp": 0.36701128, + "epoch": 0.13046745828949347, + "flos": 28364653647360.0, + "grad_norm": 10.30477413057772, + "language_loss": 0.75914431, + "learning_rate": 3.8950489008705406e-06, + "loss": 0.77878034, + "num_input_tokens_seen": 46846505, + "router_z_loss_clip": 3.16992188, + "router_z_loss_mlp": 0.40942383, + "step": 2170, + "time_per_iteration": 2.702849864959717 + }, + { + "auxiliary_loss_clip": 0.01550119, + "auxiliary_loss_mlp": 0.00392287, + "balance_loss_clip": 1.23607767, + "balance_loss_mlp": 0.35378233, + "epoch": 0.13052758154216143, + "flos": 29605044055680.0, + "grad_norm": 486.2605619014117, + "language_loss": 0.72714663, + "learning_rate": 3.8949243605434e-06, + "loss": 0.74657065, + "num_input_tokens_seen": 46867380, + "router_z_loss_clip": 3.14257812, + "router_z_loss_mlp": 0.38500977, + "step": 2171, + "time_per_iteration": 4.0728068351745605 + }, + { + "auxiliary_loss_clip": 0.01544479, + "auxiliary_loss_mlp": 0.00400007, + "balance_loss_clip": 1.22711372, + "balance_loss_mlp": 0.35988128, + "epoch": 0.1305877047948294, + "flos": 19390864884480.0, + "grad_norm": 26.121974399572995, + "language_loss": 0.82225037, + "learning_rate": 3.894799748360537e-06, + "loss": 0.84169519, + "num_input_tokens_seen": 46886810, + "router_z_loss_clip": 3.17382812, + "router_z_loss_mlp": 0.40161133, + "step": 2172, + "time_per_iteration": 2.6557199954986572 + }, + { + "auxiliary_loss_clip": 0.01541205, + "auxiliary_loss_mlp": 0.0041749, + "balance_loss_clip": 1.23090601, + "balance_loss_mlp": 0.3789134, + "epoch": 0.13064782804749736, + "flos": 16873527000960.0, + "grad_norm": 9.122068002291355, + "language_loss": 0.82722831, + "learning_rate": 3.894675064326678e-06, + "loss": 0.84681523, + "num_input_tokens_seen": 46905620, + "router_z_loss_clip": 3.10546875, + "router_z_loss_mlp": 0.38574219, + "step": 2173, + "time_per_iteration": 2.693657875061035 + }, + { + "auxiliary_loss_clip": 0.01540236, + "auxiliary_loss_mlp": 0.0040706, + "balance_loss_clip": 1.22635663, + "balance_loss_mlp": 0.36574227, + "epoch": 0.13070795130016533, + "flos": 24499085748480.0, + "grad_norm": 7.321632271148574, + "language_loss": 0.79413223, + "learning_rate": 3.894550308446551e-06, + "loss": 0.81360519, + "num_input_tokens_seen": 46925120, + "router_z_loss_clip": 3.13671875, + "router_z_loss_mlp": 0.41308594, + "step": 2174, + "time_per_iteration": 5.572129249572754 + }, + { + "auxiliary_loss_clip": 0.01515071, + "auxiliary_loss_mlp": 0.00650605, + "balance_loss_clip": 1.27576423, + "balance_loss_mlp": 0.6248557, + "epoch": 0.13076807455283332, + "flos": 71054505953280.0, + "grad_norm": 0.8791697768683808, + "language_loss": 0.58768988, + "learning_rate": 3.894425480724886e-06, + "loss": 0.60934663, + "num_input_tokens_seen": 46988195, + "router_z_loss_clip": 2.390625, + "router_z_loss_mlp": 0.2578125, + "step": 2175, + "time_per_iteration": 3.23691725730896 + }, + { + "auxiliary_loss_clip": 0.01509982, + "auxiliary_loss_mlp": 0.00418481, + "balance_loss_clip": 1.19759011, + "balance_loss_mlp": 0.37845045, + "epoch": 0.13082819780550128, + "flos": 20264499475200.0, + "grad_norm": 2.7047287850236974, + "language_loss": 0.85819411, + "learning_rate": 3.894300581166417e-06, + "loss": 0.87747872, + "num_input_tokens_seen": 47004720, + "router_z_loss_clip": 3.12109375, + "router_z_loss_mlp": 0.40039062, + "step": 2176, + "time_per_iteration": 2.6656627655029297 + }, + { + "auxiliary_loss_clip": 0.0151674, + "auxiliary_loss_mlp": 0.0034344, + "balance_loss_clip": 1.21031559, + "balance_loss_mlp": 0.3084397, + "epoch": 0.13088832105816925, + "flos": 34203441231360.0, + "grad_norm": 3.263474343284716, + "language_loss": 0.80767012, + "learning_rate": 3.894175609775881e-06, + "loss": 0.82627189, + "num_input_tokens_seen": 47024255, + "router_z_loss_clip": 3.06445312, + "router_z_loss_mlp": 0.34985352, + "step": 2177, + "time_per_iteration": 4.122455835342407 + }, + { + "auxiliary_loss_clip": 0.0152154, + "auxiliary_loss_mlp": 0.00341388, + "balance_loss_clip": 1.21573722, + "balance_loss_mlp": 0.30691272, + "epoch": 0.13094844431083721, + "flos": 17894970057600.0, + "grad_norm": 1.8654335804426165, + "language_loss": 0.88926554, + "learning_rate": 3.894050566558015e-06, + "loss": 0.90789473, + "num_input_tokens_seen": 47042465, + "router_z_loss_clip": 3.05859375, + "router_z_loss_mlp": 0.3449707, + "step": 2178, + "time_per_iteration": 2.6145846843719482 + }, + { + "auxiliary_loss_clip": 0.01492929, + "auxiliary_loss_mlp": 0.00361116, + "balance_loss_clip": 1.19518447, + "balance_loss_mlp": 0.32660457, + "epoch": 0.13100856756350518, + "flos": 17311313963520.0, + "grad_norm": 20.876971709083342, + "language_loss": 0.82829499, + "learning_rate": 3.893925451517562e-06, + "loss": 0.84683549, + "num_input_tokens_seen": 47060370, + "router_z_loss_clip": 2.97460938, + "router_z_loss_mlp": 0.34484863, + "step": 2179, + "time_per_iteration": 2.6292593479156494 + }, + { + "auxiliary_loss_clip": 0.01512644, + "auxiliary_loss_mlp": 0.00317075, + "balance_loss_clip": 1.21766996, + "balance_loss_mlp": 0.28666505, + "epoch": 0.13106869081617314, + "flos": 22200551562240.0, + "grad_norm": 4.43649030559081, + "language_loss": 0.90314484, + "learning_rate": 3.893800264659266e-06, + "loss": 0.92144209, + "num_input_tokens_seen": 47081415, + "router_z_loss_clip": 2.94921875, + "router_z_loss_mlp": 0.30444336, + "step": 2180, + "time_per_iteration": 2.680825710296631 + }, + { + "auxiliary_loss_clip": 0.01506549, + "auxiliary_loss_mlp": 0.00320719, + "balance_loss_clip": 1.21493912, + "balance_loss_mlp": 0.28927162, + "epoch": 0.13112881406884114, + "flos": 21763123735680.0, + "grad_norm": 125.50890600209725, + "language_loss": 0.94848144, + "learning_rate": 3.8936750059878746e-06, + "loss": 0.96675414, + "num_input_tokens_seen": 47099860, + "router_z_loss_clip": 2.91601562, + "router_z_loss_mlp": 0.31420898, + "step": 2181, + "time_per_iteration": 2.690551280975342 + }, + { + "auxiliary_loss_clip": 0.01509996, + "auxiliary_loss_mlp": 0.0035176, + "balance_loss_clip": 1.21411467, + "balance_loss_mlp": 0.32189792, + "epoch": 0.1311889373215091, + "flos": 23331091201920.0, + "grad_norm": 54.069024631043035, + "language_loss": 0.76681268, + "learning_rate": 3.893549675508137e-06, + "loss": 0.78543019, + "num_input_tokens_seen": 47118540, + "router_z_loss_clip": 2.95703125, + "router_z_loss_mlp": 0.2989502, + "step": 2182, + "time_per_iteration": 2.646402359008789 + }, + { + "auxiliary_loss_clip": 0.01499584, + "auxiliary_loss_mlp": 0.00367912, + "balance_loss_clip": 1.20597744, + "balance_loss_mlp": 0.33312693, + "epoch": 0.13124906057417707, + "flos": 21467363149440.0, + "grad_norm": 4.154810934320002, + "language_loss": 0.85222089, + "learning_rate": 3.893424273224806e-06, + "loss": 0.87089586, + "num_input_tokens_seen": 47136710, + "router_z_loss_clip": 2.93554688, + "router_z_loss_mlp": 0.34790039, + "step": 2183, + "time_per_iteration": 2.6701717376708984 + }, + { + "auxiliary_loss_clip": 0.01501266, + "auxiliary_loss_mlp": 0.00334045, + "balance_loss_clip": 1.2154119, + "balance_loss_mlp": 0.30644816, + "epoch": 0.13130918382684503, + "flos": 23255319461760.0, + "grad_norm": 7.481997217838419, + "language_loss": 0.91929758, + "learning_rate": 3.893298799142636e-06, + "loss": 0.93765068, + "num_input_tokens_seen": 47157155, + "router_z_loss_clip": 2.85546875, + "router_z_loss_mlp": 0.27600098, + "step": 2184, + "time_per_iteration": 2.6474099159240723 + }, + { + "auxiliary_loss_clip": 0.01526164, + "auxiliary_loss_mlp": 0.00382464, + "balance_loss_clip": 1.23997688, + "balance_loss_mlp": 0.35101622, + "epoch": 0.131369307079513, + "flos": 20850274471680.0, + "grad_norm": 3.747045430057401, + "language_loss": 0.88129628, + "learning_rate": 3.893173253266387e-06, + "loss": 0.90038252, + "num_input_tokens_seen": 47176820, + "router_z_loss_clip": 2.86132812, + "router_z_loss_mlp": 0.31445312, + "step": 2185, + "time_per_iteration": 2.7743515968322754 + }, + { + "auxiliary_loss_clip": 0.01522269, + "auxiliary_loss_mlp": 0.00367139, + "balance_loss_clip": 1.2372694, + "balance_loss_mlp": 0.33676463, + "epoch": 0.13142943033218096, + "flos": 17858341163520.0, + "grad_norm": 14606.935367415379, + "language_loss": 0.80836439, + "learning_rate": 3.893047635600818e-06, + "loss": 0.82725847, + "num_input_tokens_seen": 47195855, + "router_z_loss_clip": 2.84765625, + "router_z_loss_mlp": 0.30322266, + "step": 2186, + "time_per_iteration": 2.6173899173736572 + }, + { + "auxiliary_loss_clip": 0.01525106, + "auxiliary_loss_mlp": 0.00395585, + "balance_loss_clip": 1.24039841, + "balance_loss_mlp": 0.36268288, + "epoch": 0.13148955358484893, + "flos": 20996035862400.0, + "grad_norm": 3.5067819573199586, + "language_loss": 0.88195169, + "learning_rate": 3.892921946150693e-06, + "loss": 0.90115857, + "num_input_tokens_seen": 47214535, + "router_z_loss_clip": 2.84570312, + "router_z_loss_mlp": 0.32910156, + "step": 2187, + "time_per_iteration": 2.6669094562530518 + }, + { + "auxiliary_loss_clip": 0.01596754, + "auxiliary_loss_mlp": 0.0020029, + "balance_loss_clip": 1.39462495, + "balance_loss_mlp": 0.18693808, + "epoch": 0.13154967683751692, + "flos": 70172467580160.0, + "grad_norm": 0.8368093262268099, + "language_loss": 0.587924, + "learning_rate": 3.892796184920778e-06, + "loss": 0.60589445, + "num_input_tokens_seen": 47270300, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.13378906, + "step": 2188, + "time_per_iteration": 3.1455326080322266 + }, + { + "auxiliary_loss_clip": 0.01507433, + "auxiliary_loss_mlp": 0.00396615, + "balance_loss_clip": 1.22872186, + "balance_loss_mlp": 0.36612105, + "epoch": 0.1316098000901849, + "flos": 20376145923840.0, + "grad_norm": 8.484822081955246, + "language_loss": 0.81055248, + "learning_rate": 3.892670351915842e-06, + "loss": 0.829593, + "num_input_tokens_seen": 47290720, + "router_z_loss_clip": 2.78710938, + "router_z_loss_mlp": 0.30493164, + "step": 2189, + "time_per_iteration": 2.672227621078491 + }, + { + "auxiliary_loss_clip": 0.01518846, + "auxiliary_loss_mlp": 0.00443616, + "balance_loss_clip": 1.23531187, + "balance_loss_mlp": 0.41116709, + "epoch": 0.13166992334285285, + "flos": 23221132692480.0, + "grad_norm": 2.085468847642438, + "language_loss": 0.77342689, + "learning_rate": 3.892544447140657e-06, + "loss": 0.79305148, + "num_input_tokens_seen": 47311820, + "router_z_loss_clip": 2.83203125, + "router_z_loss_mlp": 0.32421875, + "step": 2190, + "time_per_iteration": 2.6519052982330322 + }, + { + "auxiliary_loss_clip": 0.01501935, + "auxiliary_loss_mlp": 0.00415993, + "balance_loss_clip": 1.22276151, + "balance_loss_mlp": 0.3846167, + "epoch": 0.13173004659552082, + "flos": 23330947547520.0, + "grad_norm": 80.86204830725327, + "language_loss": 0.78894478, + "learning_rate": 3.892418470599996e-06, + "loss": 0.80812407, + "num_input_tokens_seen": 47331605, + "router_z_loss_clip": 2.79492188, + "router_z_loss_mlp": 0.3137207, + "step": 2191, + "time_per_iteration": 2.718846082687378 + }, + { + "auxiliary_loss_clip": 0.01510904, + "auxiliary_loss_mlp": 0.0042851, + "balance_loss_clip": 1.2307024, + "balance_loss_mlp": 0.39599019, + "epoch": 0.13179016984818878, + "flos": 21251504367360.0, + "grad_norm": 18.29927495970653, + "language_loss": 0.87209249, + "learning_rate": 3.892292422298637e-06, + "loss": 0.89148664, + "num_input_tokens_seen": 47350455, + "router_z_loss_clip": 2.80273438, + "router_z_loss_mlp": 0.32519531, + "step": 2192, + "time_per_iteration": 2.690819025039673 + }, + { + "auxiliary_loss_clip": 0.01521608, + "auxiliary_loss_mlp": 0.00418408, + "balance_loss_clip": 1.24085259, + "balance_loss_mlp": 0.38662666, + "epoch": 0.13185029310085675, + "flos": 17778690754560.0, + "grad_norm": 60.83550502000043, + "language_loss": 0.90697491, + "learning_rate": 3.892166302241361e-06, + "loss": 0.92637503, + "num_input_tokens_seen": 47368225, + "router_z_loss_clip": 2.80859375, + "router_z_loss_mlp": 0.31787109, + "step": 2193, + "time_per_iteration": 2.601816415786743 + }, + { + "auxiliary_loss_clip": 0.01662666, + "auxiliary_loss_mlp": 0.00252662, + "balance_loss_clip": 1.46294916, + "balance_loss_mlp": 0.23730765, + "epoch": 0.1319104163535247, + "flos": 69851785933440.0, + "grad_norm": 0.8034213597428385, + "language_loss": 0.54208124, + "learning_rate": 3.8920401104329475e-06, + "loss": 0.56123447, + "num_input_tokens_seen": 47427125, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.15332031, + "step": 2194, + "time_per_iteration": 3.0759031772613525 + }, + { + "auxiliary_loss_clip": 0.01508462, + "auxiliary_loss_mlp": 0.00405315, + "balance_loss_clip": 1.22791409, + "balance_loss_mlp": 0.3744399, + "epoch": 0.1319705396061927, + "flos": 25193095401600.0, + "grad_norm": 5.0166519753038274, + "language_loss": 0.79507476, + "learning_rate": 3.891913846878185e-06, + "loss": 0.81421256, + "num_input_tokens_seen": 47450275, + "router_z_loss_clip": 2.80273438, + "router_z_loss_mlp": 0.30859375, + "step": 2195, + "time_per_iteration": 2.657094717025757 + }, + { + "auxiliary_loss_clip": 0.01509431, + "auxiliary_loss_mlp": 0.00437768, + "balance_loss_clip": 1.22584629, + "balance_loss_mlp": 0.40210068, + "epoch": 0.13203066285886067, + "flos": 20740459616640.0, + "grad_norm": 5.540263298466166, + "language_loss": 0.84302479, + "learning_rate": 3.891787511581859e-06, + "loss": 0.86249685, + "num_input_tokens_seen": 47469155, + "router_z_loss_clip": 2.83398438, + "router_z_loss_mlp": 0.35644531, + "step": 2196, + "time_per_iteration": 2.7006309032440186 + }, + { + "auxiliary_loss_clip": 0.01513508, + "auxiliary_loss_mlp": 0.00408859, + "balance_loss_clip": 1.23032725, + "balance_loss_mlp": 0.37629104, + "epoch": 0.13209078611152864, + "flos": 22054395121920.0, + "grad_norm": 73.65476141643053, + "language_loss": 0.82924628, + "learning_rate": 3.89166110454876e-06, + "loss": 0.84846991, + "num_input_tokens_seen": 47488405, + "router_z_loss_clip": 2.83398438, + "router_z_loss_mlp": 0.32568359, + "step": 2197, + "time_per_iteration": 2.615997314453125 + }, + { + "auxiliary_loss_clip": 0.01506355, + "auxiliary_loss_mlp": 0.00395182, + "balance_loss_clip": 1.22221172, + "balance_loss_mlp": 0.36301982, + "epoch": 0.1321509093641966, + "flos": 16284950743680.0, + "grad_norm": 11.998549475256134, + "language_loss": 0.86611676, + "learning_rate": 3.891534625783685e-06, + "loss": 0.88513213, + "num_input_tokens_seen": 47505650, + "router_z_loss_clip": 2.84570312, + "router_z_loss_mlp": 0.32177734, + "step": 2198, + "time_per_iteration": 2.630207061767578 + }, + { + "auxiliary_loss_clip": 0.01502465, + "auxiliary_loss_mlp": 0.00406697, + "balance_loss_clip": 1.22183228, + "balance_loss_mlp": 0.37393788, + "epoch": 0.13221103261686457, + "flos": 16983018633600.0, + "grad_norm": 2.6909682703335207, + "language_loss": 0.9156608, + "learning_rate": 3.891408075291425e-06, + "loss": 0.93475246, + "num_input_tokens_seen": 47521540, + "router_z_loss_clip": 2.80664062, + "router_z_loss_mlp": 0.32739258, + "step": 2199, + "time_per_iteration": 2.642869710922241 + }, + { + "auxiliary_loss_clip": 0.01488353, + "auxiliary_loss_mlp": 0.00374772, + "balance_loss_clip": 1.20825672, + "balance_loss_mlp": 0.34393251, + "epoch": 0.13227115586953253, + "flos": 34233605677440.0, + "grad_norm": 50.5771509691961, + "language_loss": 0.75087154, + "learning_rate": 3.8912814530767826e-06, + "loss": 0.76950276, + "num_input_tokens_seen": 47543625, + "router_z_loss_clip": 2.80273438, + "router_z_loss_mlp": 0.30810547, + "step": 2200, + "time_per_iteration": 2.721544027328491 + }, + { + "auxiliary_loss_clip": 0.014981, + "auxiliary_loss_mlp": 0.0035917, + "balance_loss_clip": 1.22060156, + "balance_loss_mlp": 0.32633987, + "epoch": 0.13233127912220052, + "flos": 20704656735360.0, + "grad_norm": 6.102791821254887, + "language_loss": 0.91429007, + "learning_rate": 3.891154759144557e-06, + "loss": 0.93286276, + "num_input_tokens_seen": 47563740, + "router_z_loss_clip": 2.77734375, + "router_z_loss_mlp": 0.32836914, + "step": 2201, + "time_per_iteration": 2.64933443069458 + }, + { + "auxiliary_loss_clip": 0.01525206, + "auxiliary_loss_mlp": 0.00369605, + "balance_loss_clip": 1.23581553, + "balance_loss_mlp": 0.33553487, + "epoch": 0.1323914023748685, + "flos": 25805048434560.0, + "grad_norm": 4.14872891737483, + "language_loss": 0.93231416, + "learning_rate": 3.891027993499554e-06, + "loss": 0.95126224, + "num_input_tokens_seen": 47582655, + "router_z_loss_clip": 2.89648438, + "router_z_loss_mlp": 0.34082031, + "step": 2202, + "time_per_iteration": 2.743112087249756 + }, + { + "auxiliary_loss_clip": 0.01533376, + "auxiliary_loss_mlp": 0.00343626, + "balance_loss_clip": 1.24354792, + "balance_loss_mlp": 0.31248829, + "epoch": 0.13245152562753645, + "flos": 21251540280960.0, + "grad_norm": 6.347976136651485, + "language_loss": 0.78070605, + "learning_rate": 3.89090115614658e-06, + "loss": 0.79947609, + "num_input_tokens_seen": 47600875, + "router_z_loss_clip": 2.8984375, + "router_z_loss_mlp": 0.31176758, + "step": 2203, + "time_per_iteration": 2.74862003326416 + }, + { + "auxiliary_loss_clip": 0.01529506, + "auxiliary_loss_mlp": 0.0036182, + "balance_loss_clip": 1.23740554, + "balance_loss_mlp": 0.33087322, + "epoch": 0.13251164888020442, + "flos": 26610955931520.0, + "grad_norm": 24.608412848296513, + "language_loss": 0.80658054, + "learning_rate": 3.890774247090444e-06, + "loss": 0.82549381, + "num_input_tokens_seen": 47619250, + "router_z_loss_clip": 2.91992188, + "router_z_loss_mlp": 0.30932617, + "step": 2204, + "time_per_iteration": 2.7778241634368896 + }, + { + "auxiliary_loss_clip": 0.01536044, + "auxiliary_loss_mlp": 0.00355514, + "balance_loss_clip": 1.24503493, + "balance_loss_mlp": 0.32490075, + "epoch": 0.13257177213287238, + "flos": 29826541272960.0, + "grad_norm": 39.9189360512908, + "language_loss": 0.85276902, + "learning_rate": 3.89064726633596e-06, + "loss": 0.87168461, + "num_input_tokens_seen": 47639445, + "router_z_loss_clip": 2.90820312, + "router_z_loss_mlp": 0.30615234, + "step": 2205, + "time_per_iteration": 2.829871892929077 + }, + { + "auxiliary_loss_clip": 0.01551663, + "auxiliary_loss_mlp": 0.00327907, + "balance_loss_clip": 1.25769544, + "balance_loss_mlp": 0.2971867, + "epoch": 0.13263189538554035, + "flos": 21288456483840.0, + "grad_norm": 6.905555175618172, + "language_loss": 0.86851752, + "learning_rate": 3.890520213887941e-06, + "loss": 0.88731319, + "num_input_tokens_seen": 47658740, + "router_z_loss_clip": 2.94335938, + "router_z_loss_mlp": 0.30737305, + "step": 2206, + "time_per_iteration": 2.6661744117736816 + }, + { + "auxiliary_loss_clip": 0.01558428, + "auxiliary_loss_mlp": 0.00348669, + "balance_loss_clip": 1.25526679, + "balance_loss_mlp": 0.31533766, + "epoch": 0.13269201863820831, + "flos": 16874101618560.0, + "grad_norm": 13.978254388666263, + "language_loss": 0.81164211, + "learning_rate": 3.890393089751208e-06, + "loss": 0.83071309, + "num_input_tokens_seen": 47676880, + "router_z_loss_clip": 3.02929688, + "router_z_loss_mlp": 0.33325195, + "step": 2207, + "time_per_iteration": 2.6961610317230225 + }, + { + "auxiliary_loss_clip": 0.01582709, + "auxiliary_loss_mlp": 0.00321959, + "balance_loss_clip": 1.28289127, + "balance_loss_mlp": 0.28817505, + "epoch": 0.1327521418908763, + "flos": 23768914078080.0, + "grad_norm": 19.51679921604977, + "language_loss": 0.9149633, + "learning_rate": 3.890265893930578e-06, + "loss": 0.93401003, + "num_input_tokens_seen": 47696635, + "router_z_loss_clip": 2.99804688, + "router_z_loss_mlp": 0.33789062, + "step": 2208, + "time_per_iteration": 2.6587164402008057 + }, + { + "auxiliary_loss_clip": 0.01588153, + "auxiliary_loss_mlp": 0.00393216, + "balance_loss_clip": 1.29302943, + "balance_loss_mlp": 0.35981393, + "epoch": 0.13281226514354427, + "flos": 26505594362880.0, + "grad_norm": 3.943063040618238, + "language_loss": 0.9122014, + "learning_rate": 3.890138626430876e-06, + "loss": 0.93201512, + "num_input_tokens_seen": 47717760, + "router_z_loss_clip": 2.953125, + "router_z_loss_mlp": 0.33374023, + "step": 2209, + "time_per_iteration": 2.7247605323791504 + }, + { + "auxiliary_loss_clip": 0.0160108, + "auxiliary_loss_mlp": 0.00443807, + "balance_loss_clip": 1.29567242, + "balance_loss_mlp": 0.40756696, + "epoch": 0.13287238839621224, + "flos": 24498762526080.0, + "grad_norm": 2.406358566978983, + "language_loss": 0.87716651, + "learning_rate": 3.890011287256929e-06, + "loss": 0.89761537, + "num_input_tokens_seen": 47737685, + "router_z_loss_clip": 3.0546875, + "router_z_loss_mlp": 0.36230469, + "step": 2210, + "time_per_iteration": 2.7119290828704834 + }, + { + "auxiliary_loss_clip": 0.01811033, + "auxiliary_loss_mlp": 0.00525734, + "balance_loss_clip": 1.59973061, + "balance_loss_mlp": 0.49655119, + "epoch": 0.1329325116488802, + "flos": 67694344369920.0, + "grad_norm": 0.793506187976699, + "language_loss": 0.57992315, + "learning_rate": 3.889883876413563e-06, + "loss": 0.6032908, + "num_input_tokens_seen": 47802415, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.29101562, + "step": 2211, + "time_per_iteration": 3.2393884658813477 + }, + { + "auxiliary_loss_clip": 0.0180064, + "auxiliary_loss_mlp": 0.00343597, + "balance_loss_clip": 1.58880424, + "balance_loss_mlp": 0.3204231, + "epoch": 0.13299263490154817, + "flos": 72261894741120.0, + "grad_norm": 0.7792858491279445, + "language_loss": 0.54891938, + "learning_rate": 3.889756393905611e-06, + "loss": 0.57036167, + "num_input_tokens_seen": 47871485, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.23144531, + "step": 2212, + "time_per_iteration": 3.209986686706543 + }, + { + "auxiliary_loss_clip": 0.01609738, + "auxiliary_loss_mlp": 0.00432394, + "balance_loss_clip": 1.30029213, + "balance_loss_mlp": 0.39768004, + "epoch": 0.13305275815421613, + "flos": 17931275729280.0, + "grad_norm": 47.14730013865663, + "language_loss": 0.82769728, + "learning_rate": 3.889628839737908e-06, + "loss": 0.8481186, + "num_input_tokens_seen": 47888315, + "router_z_loss_clip": 3.09375, + "router_z_loss_mlp": 0.34667969, + "step": 2213, + "time_per_iteration": 4.035093069076538 + }, + { + "auxiliary_loss_clip": 0.01629546, + "auxiliary_loss_mlp": 0.00434839, + "balance_loss_clip": 1.31983948, + "balance_loss_mlp": 0.39919567, + "epoch": 0.13311288140688413, + "flos": 22340889999360.0, + "grad_norm": 5.876158964549202, + "language_loss": 0.84844929, + "learning_rate": 3.889501213915291e-06, + "loss": 0.86909312, + "num_input_tokens_seen": 47906600, + "router_z_loss_clip": 3.09765625, + "router_z_loss_mlp": 0.35668945, + "step": 2214, + "time_per_iteration": 2.6538705825805664 + }, + { + "auxiliary_loss_clip": 0.01602761, + "auxiliary_loss_mlp": 0.0049601, + "balance_loss_clip": 1.293365, + "balance_loss_mlp": 0.45361924, + "epoch": 0.1331730046595521, + "flos": 31868888682240.0, + "grad_norm": 4.213597187349654, + "language_loss": 0.7598083, + "learning_rate": 3.889373516442597e-06, + "loss": 0.78079605, + "num_input_tokens_seen": 47927630, + "router_z_loss_clip": 3.09765625, + "router_z_loss_mlp": 0.42431641, + "step": 2215, + "time_per_iteration": 2.775247573852539 + }, + { + "auxiliary_loss_clip": 0.01621253, + "auxiliary_loss_mlp": 0.0044466, + "balance_loss_clip": 1.30882001, + "balance_loss_mlp": 0.40632188, + "epoch": 0.13323312791222006, + "flos": 22566589107840.0, + "grad_norm": 2.2264488027549962, + "language_loss": 0.86730617, + "learning_rate": 3.889245747324671e-06, + "loss": 0.88796526, + "num_input_tokens_seen": 47947935, + "router_z_loss_clip": 3.125, + "router_z_loss_mlp": 0.38330078, + "step": 2216, + "time_per_iteration": 5.620861768722534 + }, + { + "auxiliary_loss_clip": 0.01625086, + "auxiliary_loss_mlp": 0.00440844, + "balance_loss_clip": 1.31634903, + "balance_loss_mlp": 0.4018144, + "epoch": 0.13329325116488802, + "flos": 15085319293440.0, + "grad_norm": 12.679864657715088, + "language_loss": 0.93800384, + "learning_rate": 3.889117906566356e-06, + "loss": 0.95866317, + "num_input_tokens_seen": 47965515, + "router_z_loss_clip": 3.08789062, + "router_z_loss_mlp": 0.39038086, + "step": 2217, + "time_per_iteration": 2.6848409175872803 + }, + { + "auxiliary_loss_clip": 0.01633584, + "auxiliary_loss_mlp": 0.00519945, + "balance_loss_clip": 1.31734228, + "balance_loss_mlp": 0.48086828, + "epoch": 0.133353374417556, + "flos": 27453671890560.0, + "grad_norm": 12.62786802789947, + "language_loss": 0.81359899, + "learning_rate": 3.888989994172501e-06, + "loss": 0.83513433, + "num_input_tokens_seen": 47985675, + "router_z_loss_clip": 3.16796875, + "router_z_loss_mlp": 0.39038086, + "step": 2218, + "time_per_iteration": 2.78733491897583 + }, + { + "auxiliary_loss_clip": 0.01614286, + "auxiliary_loss_mlp": 0.00443358, + "balance_loss_clip": 1.30020642, + "balance_loss_mlp": 0.40301752, + "epoch": 0.13341349767022395, + "flos": 24094695456000.0, + "grad_norm": 522.514868969117, + "language_loss": 0.92187911, + "learning_rate": 3.8888620101479565e-06, + "loss": 0.94245553, + "num_input_tokens_seen": 48004985, + "router_z_loss_clip": 3.14257812, + "router_z_loss_mlp": 0.40332031, + "step": 2219, + "time_per_iteration": 4.197736024856567 + }, + { + "auxiliary_loss_clip": 0.01635528, + "auxiliary_loss_mlp": 0.00401287, + "balance_loss_clip": 1.31905437, + "balance_loss_mlp": 0.3664782, + "epoch": 0.13347362092289192, + "flos": 24133335511680.0, + "grad_norm": 53.460264860587124, + "language_loss": 0.83549392, + "learning_rate": 3.888733954497574e-06, + "loss": 0.85586202, + "num_input_tokens_seen": 48024965, + "router_z_loss_clip": 3.1640625, + "router_z_loss_mlp": 0.34838867, + "step": 2220, + "time_per_iteration": 2.6320626735687256 + }, + { + "auxiliary_loss_clip": 0.0161538, + "auxiliary_loss_mlp": 0.00451384, + "balance_loss_clip": 1.31147313, + "balance_loss_mlp": 0.41454858, + "epoch": 0.1335337441755599, + "flos": 18436538390400.0, + "grad_norm": 4.175446198787223, + "language_loss": 0.8805027, + "learning_rate": 3.888605827226212e-06, + "loss": 0.90117037, + "num_input_tokens_seen": 48040890, + "router_z_loss_clip": 3.04101562, + "router_z_loss_mlp": 0.36816406, + "step": 2221, + "time_per_iteration": 2.659736156463623 + }, + { + "auxiliary_loss_clip": 0.01598258, + "auxiliary_loss_mlp": 0.00208975, + "balance_loss_clip": 1.38396025, + "balance_loss_mlp": 0.19657719, + "epoch": 0.13359386742822787, + "flos": 50611997652480.0, + "grad_norm": 0.9865411645579303, + "language_loss": 0.69075716, + "learning_rate": 3.8884776283387275e-06, + "loss": 0.70882952, + "num_input_tokens_seen": 48091855, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.12353516, + "step": 2222, + "time_per_iteration": 2.986023426055908 + }, + { + "auxiliary_loss_clip": 0.01616356, + "auxiliary_loss_mlp": 0.00417566, + "balance_loss_clip": 1.31336594, + "balance_loss_mlp": 0.38285199, + "epoch": 0.13365399068089584, + "flos": 22778569221120.0, + "grad_norm": 4.498523299375341, + "language_loss": 0.72695112, + "learning_rate": 3.888349357839982e-06, + "loss": 0.74729037, + "num_input_tokens_seen": 48111350, + "router_z_loss_clip": 3.02929688, + "router_z_loss_mlp": 0.34667969, + "step": 2223, + "time_per_iteration": 2.713348627090454 + }, + { + "auxiliary_loss_clip": 0.01608336, + "auxiliary_loss_mlp": 0.00448269, + "balance_loss_clip": 1.30005884, + "balance_loss_mlp": 0.40897769, + "epoch": 0.1337141139335638, + "flos": 12531603911040.0, + "grad_norm": 4.455303214597745, + "language_loss": 0.89373797, + "learning_rate": 3.88822101573484e-06, + "loss": 0.91430402, + "num_input_tokens_seen": 48129840, + "router_z_loss_clip": 3.08007812, + "router_z_loss_mlp": 0.39306641, + "step": 2224, + "time_per_iteration": 2.647359609603882 + }, + { + "auxiliary_loss_clip": 0.01605605, + "auxiliary_loss_mlp": 0.00445814, + "balance_loss_clip": 1.29358292, + "balance_loss_mlp": 0.40268457, + "epoch": 0.13377423718623177, + "flos": 23038957889280.0, + "grad_norm": 15.854242732186917, + "language_loss": 0.75309408, + "learning_rate": 3.888092602028167e-06, + "loss": 0.77360821, + "num_input_tokens_seen": 48149240, + "router_z_loss_clip": 3.11914062, + "router_z_loss_mlp": 0.43188477, + "step": 2225, + "time_per_iteration": 2.6947176456451416 + }, + { + "auxiliary_loss_clip": 0.015983, + "auxiliary_loss_mlp": 0.00422781, + "balance_loss_clip": 1.28761256, + "balance_loss_mlp": 0.38601679, + "epoch": 0.13383436043889974, + "flos": 16216397637120.0, + "grad_norm": 2.607003427469622, + "language_loss": 0.97944796, + "learning_rate": 3.887964116724835e-06, + "loss": 0.99965882, + "num_input_tokens_seen": 48166330, + "router_z_loss_clip": 3.10546875, + "router_z_loss_mlp": 0.36767578, + "step": 2226, + "time_per_iteration": 2.6507370471954346 + }, + { + "auxiliary_loss_clip": 0.01610364, + "auxiliary_loss_mlp": 0.00429588, + "balance_loss_clip": 1.29584742, + "balance_loss_mlp": 0.38896164, + "epoch": 0.1338944836915677, + "flos": 24279671520000.0, + "grad_norm": 12.543298083756024, + "language_loss": 0.8216064, + "learning_rate": 3.887835559829712e-06, + "loss": 0.84200585, + "num_input_tokens_seen": 48187600, + "router_z_loss_clip": 3.14648438, + "router_z_loss_mlp": 0.40649414, + "step": 2227, + "time_per_iteration": 2.7082040309906006 + }, + { + "auxiliary_loss_clip": 0.01604913, + "auxiliary_loss_mlp": 0.004555, + "balance_loss_clip": 1.29562604, + "balance_loss_mlp": 0.41210791, + "epoch": 0.1339546069442357, + "flos": 17598742594560.0, + "grad_norm": 2.960823765475233, + "language_loss": 0.89635676, + "learning_rate": 3.8877069313476764e-06, + "loss": 0.91696084, + "num_input_tokens_seen": 48204400, + "router_z_loss_clip": 3.09375, + "router_z_loss_mlp": 0.43383789, + "step": 2228, + "time_per_iteration": 2.6851024627685547 + }, + { + "auxiliary_loss_clip": 0.01596183, + "auxiliary_loss_mlp": 0.00391853, + "balance_loss_clip": 1.29018331, + "balance_loss_mlp": 0.35437351, + "epoch": 0.13401473019690366, + "flos": 18990065952000.0, + "grad_norm": 19.463068068199583, + "language_loss": 0.87158692, + "learning_rate": 3.8875782312836054e-06, + "loss": 0.89146727, + "num_input_tokens_seen": 48222180, + "router_z_loss_clip": 3.06054688, + "router_z_loss_mlp": 0.375, + "step": 2229, + "time_per_iteration": 2.6916606426239014 + }, + { + "auxiliary_loss_clip": 0.01611457, + "auxiliary_loss_mlp": 0.00402227, + "balance_loss_clip": 1.29857826, + "balance_loss_mlp": 0.3643899, + "epoch": 0.13407485344957162, + "flos": 26943812288640.0, + "grad_norm": 28.615340212951335, + "language_loss": 0.8011694, + "learning_rate": 3.887449459642378e-06, + "loss": 0.82130623, + "num_input_tokens_seen": 48243245, + "router_z_loss_clip": 3.12695312, + "router_z_loss_mlp": 0.37841797, + "step": 2230, + "time_per_iteration": 2.7726480960845947 + }, + { + "auxiliary_loss_clip": 0.016007, + "auxiliary_loss_mlp": 0.00371155, + "balance_loss_clip": 1.28641117, + "balance_loss_mlp": 0.33410469, + "epoch": 0.1341349767022396, + "flos": 20339373375360.0, + "grad_norm": 2.7602174316772317, + "language_loss": 0.85824049, + "learning_rate": 3.8873206164288785e-06, + "loss": 0.87795901, + "num_input_tokens_seen": 48262600, + "router_z_loss_clip": 3.14257812, + "router_z_loss_mlp": 0.37060547, + "step": 2231, + "time_per_iteration": 2.761098623275757 + }, + { + "auxiliary_loss_clip": 0.01602233, + "auxiliary_loss_mlp": 0.00458154, + "balance_loss_clip": 1.28190243, + "balance_loss_mlp": 0.41264009, + "epoch": 0.13419509995490755, + "flos": 29862020931840.0, + "grad_norm": 5.06449609013117, + "language_loss": 0.79259902, + "learning_rate": 3.887191701647992e-06, + "loss": 0.81320292, + "num_input_tokens_seen": 48285075, + "router_z_loss_clip": 3.20703125, + "router_z_loss_mlp": 0.45507812, + "step": 2232, + "time_per_iteration": 2.7455995082855225 + }, + { + "auxiliary_loss_clip": 0.01604641, + "auxiliary_loss_mlp": 0.00403894, + "balance_loss_clip": 1.29058886, + "balance_loss_mlp": 0.36202806, + "epoch": 0.13425522320757552, + "flos": 26942986275840.0, + "grad_norm": 6.0943671304911256, + "language_loss": 0.76255077, + "learning_rate": 3.8870627153046066e-06, + "loss": 0.78263617, + "num_input_tokens_seen": 48301285, + "router_z_loss_clip": 3.13867188, + "router_z_loss_mlp": 0.41894531, + "step": 2233, + "time_per_iteration": 2.732440710067749 + }, + { + "auxiliary_loss_clip": 0.0160772, + "auxiliary_loss_mlp": 0.00399973, + "balance_loss_clip": 1.29598355, + "balance_loss_mlp": 0.36025292, + "epoch": 0.1343153464602435, + "flos": 15777281871360.0, + "grad_norm": 17.69639398068632, + "language_loss": 0.89476693, + "learning_rate": 3.886933657403615e-06, + "loss": 0.91484392, + "num_input_tokens_seen": 48317835, + "router_z_loss_clip": 3.11914062, + "router_z_loss_mlp": 0.3972168, + "step": 2234, + "time_per_iteration": 2.705817222595215 + }, + { + "auxiliary_loss_clip": 0.01612115, + "auxiliary_loss_mlp": 0.00398986, + "balance_loss_clip": 1.2938627, + "balance_loss_mlp": 0.35955149, + "epoch": 0.13437546971291148, + "flos": 24314756129280.0, + "grad_norm": 7.773827299931583, + "language_loss": 0.86939394, + "learning_rate": 3.886804527949909e-06, + "loss": 0.88950491, + "num_input_tokens_seen": 48335670, + "router_z_loss_clip": 3.18164062, + "router_z_loss_mlp": 0.39428711, + "step": 2235, + "time_per_iteration": 2.6558005809783936 + }, + { + "auxiliary_loss_clip": 0.01592731, + "auxiliary_loss_mlp": 0.00388372, + "balance_loss_clip": 1.28452063, + "balance_loss_mlp": 0.34910437, + "epoch": 0.13443559296557944, + "flos": 26650673395200.0, + "grad_norm": 5.349589835337512, + "language_loss": 0.91360945, + "learning_rate": 3.8866753269483864e-06, + "loss": 0.93342042, + "num_input_tokens_seen": 48357805, + "router_z_loss_clip": 3.08203125, + "router_z_loss_mlp": 0.39233398, + "step": 2236, + "time_per_iteration": 2.72800350189209 + }, + { + "auxiliary_loss_clip": 0.01608679, + "auxiliary_loss_mlp": 0.00436731, + "balance_loss_clip": 1.29183078, + "balance_loss_mlp": 0.39529377, + "epoch": 0.1344957162182474, + "flos": 21796197183360.0, + "grad_norm": 4.891539076869965, + "language_loss": 0.82867354, + "learning_rate": 3.886546054403946e-06, + "loss": 0.84912765, + "num_input_tokens_seen": 48377845, + "router_z_loss_clip": 3.16796875, + "router_z_loss_mlp": 0.41430664, + "step": 2237, + "time_per_iteration": 2.663424491882324 + }, + { + "auxiliary_loss_clip": 0.01633886, + "auxiliary_loss_mlp": 0.00400472, + "balance_loss_clip": 1.31834579, + "balance_loss_mlp": 0.35700822, + "epoch": 0.13455583947091537, + "flos": 19865568049920.0, + "grad_norm": 7.946838432505047, + "language_loss": 0.86774635, + "learning_rate": 3.886416710321491e-06, + "loss": 0.88808995, + "num_input_tokens_seen": 48394735, + "router_z_loss_clip": 3.15625, + "router_z_loss_mlp": 0.43481445, + "step": 2238, + "time_per_iteration": 2.6577165126800537 + }, + { + "auxiliary_loss_clip": 0.01610859, + "auxiliary_loss_mlp": 0.003817, + "balance_loss_clip": 1.3045218, + "balance_loss_mlp": 0.34331429, + "epoch": 0.13461596272358334, + "flos": 30846835094400.0, + "grad_norm": 9.395647743910972, + "language_loss": 0.73465872, + "learning_rate": 3.886287294705924e-06, + "loss": 0.75458431, + "num_input_tokens_seen": 48414200, + "router_z_loss_clip": 3.06445312, + "router_z_loss_mlp": 0.38378906, + "step": 2239, + "time_per_iteration": 2.7247626781463623 + }, + { + "auxiliary_loss_clip": 0.0163558, + "auxiliary_loss_mlp": 0.00405092, + "balance_loss_clip": 1.31769514, + "balance_loss_mlp": 0.36623013, + "epoch": 0.1346760859762513, + "flos": 12494436312960.0, + "grad_norm": 10.309963126258848, + "language_loss": 0.91402972, + "learning_rate": 3.8861578075621555e-06, + "loss": 0.93443644, + "num_input_tokens_seen": 48431065, + "router_z_loss_clip": 3.17773438, + "router_z_loss_mlp": 0.38891602, + "step": 2240, + "time_per_iteration": 2.6023077964782715 + }, + { + "auxiliary_loss_clip": 0.01627036, + "auxiliary_loss_mlp": 0.00388854, + "balance_loss_clip": 1.31401145, + "balance_loss_mlp": 0.34717843, + "epoch": 0.1347362092289193, + "flos": 21836022387840.0, + "grad_norm": 3.0330924478637336, + "language_loss": 0.85158211, + "learning_rate": 3.886028248895093e-06, + "loss": 0.87174094, + "num_input_tokens_seen": 48450335, + "router_z_loss_clip": 3.13085938, + "router_z_loss_mlp": 0.41650391, + "step": 2241, + "time_per_iteration": 2.6687700748443604 + }, + { + "auxiliary_loss_clip": 0.01634353, + "auxiliary_loss_mlp": 0.00368491, + "balance_loss_clip": 1.32877016, + "balance_loss_mlp": 0.33003414, + "epoch": 0.13479633248158726, + "flos": 23509459163520.0, + "grad_norm": 67.55839539641968, + "language_loss": 0.90563482, + "learning_rate": 3.88589861870965e-06, + "loss": 0.92566323, + "num_input_tokens_seen": 48468555, + "router_z_loss_clip": 3.05664062, + "router_z_loss_mlp": 0.38452148, + "step": 2242, + "time_per_iteration": 2.667794942855835 + }, + { + "auxiliary_loss_clip": 0.01636017, + "auxiliary_loss_mlp": 0.00370875, + "balance_loss_clip": 1.32142007, + "balance_loss_mlp": 0.33067805, + "epoch": 0.13485645573425523, + "flos": 29344332165120.0, + "grad_norm": 6.287714615197704, + "language_loss": 0.74839354, + "learning_rate": 3.885768917010744e-06, + "loss": 0.76846248, + "num_input_tokens_seen": 48488515, + "router_z_loss_clip": 3.14257812, + "router_z_loss_mlp": 0.40185547, + "step": 2243, + "time_per_iteration": 2.7412161827087402 + }, + { + "auxiliary_loss_clip": 0.01617073, + "auxiliary_loss_mlp": 0.00345745, + "balance_loss_clip": 1.31119215, + "balance_loss_mlp": 0.30781221, + "epoch": 0.1349165789869232, + "flos": 28037112503040.0, + "grad_norm": 2.3511482396371775, + "language_loss": 0.78273624, + "learning_rate": 3.8856391438032895e-06, + "loss": 0.80236447, + "num_input_tokens_seen": 48510515, + "router_z_loss_clip": 3.05859375, + "router_z_loss_mlp": 0.37915039, + "step": 2244, + "time_per_iteration": 2.6759209632873535 + }, + { + "auxiliary_loss_clip": 0.01645138, + "auxiliary_loss_mlp": 0.00340352, + "balance_loss_clip": 1.32900214, + "balance_loss_mlp": 0.30294403, + "epoch": 0.13497670223959116, + "flos": 22853730430080.0, + "grad_norm": 4.885967132029266, + "language_loss": 0.91612893, + "learning_rate": 3.88550929909221e-06, + "loss": 0.93598384, + "num_input_tokens_seen": 48529940, + "router_z_loss_clip": 3.1640625, + "router_z_loss_mlp": 0.37402344, + "step": 2245, + "time_per_iteration": 2.729450225830078 + }, + { + "auxiliary_loss_clip": 0.01643296, + "auxiliary_loss_mlp": 0.00342618, + "balance_loss_clip": 1.33324146, + "balance_loss_mlp": 0.30456656, + "epoch": 0.13503682549225912, + "flos": 16504580453760.0, + "grad_norm": 4.167645119209793, + "language_loss": 0.84119469, + "learning_rate": 3.88537938288243e-06, + "loss": 0.86105382, + "num_input_tokens_seen": 48548190, + "router_z_loss_clip": 3.09765625, + "router_z_loss_mlp": 0.38037109, + "step": 2246, + "time_per_iteration": 2.6621875762939453 + }, + { + "auxiliary_loss_clip": 0.01585746, + "auxiliary_loss_mlp": 0.00308068, + "balance_loss_clip": 1.39875674, + "balance_loss_mlp": 0.29433474, + "epoch": 0.1350969487449271, + "flos": 70756303242240.0, + "grad_norm": 0.7678822874401862, + "language_loss": 0.6070075, + "learning_rate": 3.885249395178874e-06, + "loss": 0.62594569, + "num_input_tokens_seen": 48613165, + "router_z_loss_clip": 1.875, + "router_z_loss_mlp": 0.13769531, + "step": 2247, + "time_per_iteration": 3.28216814994812 + }, + { + "auxiliary_loss_clip": 0.0165823, + "auxiliary_loss_mlp": 0.00341481, + "balance_loss_clip": 1.33784199, + "balance_loss_mlp": 0.30247605, + "epoch": 0.13515707199759508, + "flos": 23075981832960.0, + "grad_norm": 30.014423367760006, + "language_loss": 0.88795584, + "learning_rate": 3.885119335986473e-06, + "loss": 0.90795302, + "num_input_tokens_seen": 48631705, + "router_z_loss_clip": 3.203125, + "router_z_loss_mlp": 0.38989258, + "step": 2248, + "time_per_iteration": 2.7126874923706055 + }, + { + "auxiliary_loss_clip": 0.01647462, + "auxiliary_loss_mlp": 0.00331543, + "balance_loss_clip": 1.33886743, + "balance_loss_mlp": 0.2924189, + "epoch": 0.13521719525026304, + "flos": 23186371305600.0, + "grad_norm": 2.0358391031148915, + "language_loss": 0.83031321, + "learning_rate": 3.884989205310157e-06, + "loss": 0.85010326, + "num_input_tokens_seen": 48649740, + "router_z_loss_clip": 3.08789062, + "router_z_loss_mlp": 0.39086914, + "step": 2249, + "time_per_iteration": 2.7020082473754883 + }, + { + "auxiliary_loss_clip": 0.01674203, + "auxiliary_loss_mlp": 0.00310838, + "balance_loss_clip": 1.36113513, + "balance_loss_mlp": 0.27166533, + "epoch": 0.135277318502931, + "flos": 24790931752320.0, + "grad_norm": 7.520936200690906, + "language_loss": 0.88644302, + "learning_rate": 3.884859003154862e-06, + "loss": 0.90629339, + "num_input_tokens_seen": 48671565, + "router_z_loss_clip": 3.12890625, + "router_z_loss_mlp": 0.39160156, + "step": 2250, + "time_per_iteration": 2.8134584426879883 + }, + { + "auxiliary_loss_clip": 0.01687419, + "auxiliary_loss_mlp": 0.00345239, + "balance_loss_clip": 1.36754525, + "balance_loss_mlp": 0.30682993, + "epoch": 0.13533744175559898, + "flos": 21908525990400.0, + "grad_norm": 3.0033754364104994, + "language_loss": 0.91127849, + "learning_rate": 3.884728729525524e-06, + "loss": 0.9316051, + "num_input_tokens_seen": 48690425, + "router_z_loss_clip": 3.19921875, + "router_z_loss_mlp": 0.3840332, + "step": 2251, + "time_per_iteration": 2.660628080368042 + }, + { + "auxiliary_loss_clip": 0.01678791, + "auxiliary_loss_mlp": 0.0031174, + "balance_loss_clip": 1.35808086, + "balance_loss_mlp": 0.26994541, + "epoch": 0.13539756500826694, + "flos": 21211643249280.0, + "grad_norm": 6.249349685778207, + "language_loss": 0.91480994, + "learning_rate": 3.884598384427084e-06, + "loss": 0.93471527, + "num_input_tokens_seen": 48707505, + "router_z_loss_clip": 3.20898438, + "router_z_loss_mlp": 0.41772461, + "step": 2252, + "time_per_iteration": 2.6970295906066895 + }, + { + "auxiliary_loss_clip": 0.01713993, + "auxiliary_loss_mlp": 0.00086872, + "balance_loss_clip": 1.49385357, + "balance_loss_mlp": 0.06608181, + "epoch": 0.1354576882609349, + "flos": 63242103634560.0, + "grad_norm": 0.767120661851963, + "language_loss": 0.61130828, + "learning_rate": 3.884467967864485e-06, + "loss": 0.62931687, + "num_input_tokens_seen": 48775895, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.20800781, + "step": 2253, + "time_per_iteration": 3.2262625694274902 + }, + { + "auxiliary_loss_clip": 0.01711893, + "auxiliary_loss_mlp": 0.00309842, + "balance_loss_clip": 1.39152193, + "balance_loss_mlp": 0.26811862, + "epoch": 0.1355178115136029, + "flos": 25483037984640.0, + "grad_norm": 2.54632927031721, + "language_loss": 0.95238829, + "learning_rate": 3.884337479842671e-06, + "loss": 0.97260571, + "num_input_tokens_seen": 48798370, + "router_z_loss_clip": 3.20507812, + "router_z_loss_mlp": 0.41723633, + "step": 2254, + "time_per_iteration": 2.7124781608581543 + }, + { + "auxiliary_loss_clip": 0.0169538, + "auxiliary_loss_mlp": 0.00307763, + "balance_loss_clip": 1.38067114, + "balance_loss_mlp": 0.26673135, + "epoch": 0.13557793476627086, + "flos": 21616967295360.0, + "grad_norm": 4.976946498557635, + "language_loss": 0.92510128, + "learning_rate": 3.884206920366591e-06, + "loss": 0.94513261, + "num_input_tokens_seen": 48817955, + "router_z_loss_clip": 3.1484375, + "router_z_loss_mlp": 0.41015625, + "step": 2255, + "time_per_iteration": 2.7385644912719727 + }, + { + "auxiliary_loss_clip": 0.01721348, + "auxiliary_loss_mlp": 0.00284596, + "balance_loss_clip": 1.40846407, + "balance_loss_mlp": 0.24442284, + "epoch": 0.13563805801893883, + "flos": 24928253447040.0, + "grad_norm": 7.492365996922788, + "language_loss": 0.8449896, + "learning_rate": 3.884076289441196e-06, + "loss": 0.86504906, + "num_input_tokens_seen": 48836330, + "router_z_loss_clip": 3.12890625, + "router_z_loss_mlp": 0.40161133, + "step": 2256, + "time_per_iteration": 4.071101903915405 + }, + { + "auxiliary_loss_clip": 0.01740813, + "auxiliary_loss_mlp": 0.00322762, + "balance_loss_clip": 1.41182208, + "balance_loss_mlp": 0.28182524, + "epoch": 0.1356981812716068, + "flos": 14750272206720.0, + "grad_norm": 6.8632490355233715, + "language_loss": 0.89808488, + "learning_rate": 3.88394558707144e-06, + "loss": 0.91872066, + "num_input_tokens_seen": 48851890, + "router_z_loss_clip": 3.2890625, + "router_z_loss_mlp": 0.40917969, + "step": 2257, + "time_per_iteration": 2.6415019035339355 + }, + { + "auxiliary_loss_clip": 0.01723612, + "auxiliary_loss_mlp": 0.00296858, + "balance_loss_clip": 1.40039444, + "balance_loss_mlp": 0.25461012, + "epoch": 0.13575830452427476, + "flos": 11108571822720.0, + "grad_norm": 31.16406450417965, + "language_loss": 0.91930431, + "learning_rate": 3.883814813262277e-06, + "loss": 0.93950897, + "num_input_tokens_seen": 48865510, + "router_z_loss_clip": 3.23046875, + "router_z_loss_mlp": 0.42260742, + "step": 2258, + "time_per_iteration": 3.9550912380218506 + }, + { + "auxiliary_loss_clip": 0.01721371, + "auxiliary_loss_mlp": 0.00297159, + "balance_loss_clip": 1.39894581, + "balance_loss_mlp": 0.25662816, + "epoch": 0.13581842777694272, + "flos": 17960290940160.0, + "grad_norm": 21.411228013274496, + "language_loss": 0.90668863, + "learning_rate": 3.883683968018669e-06, + "loss": 0.92687392, + "num_input_tokens_seen": 48882360, + "router_z_loss_clip": 3.22460938, + "router_z_loss_mlp": 0.40527344, + "step": 2259, + "time_per_iteration": 4.002089738845825 + }, + { + "auxiliary_loss_clip": 0.01726892, + "auxiliary_loss_mlp": 0.00260637, + "balance_loss_clip": 1.41025352, + "balance_loss_mlp": 0.22263339, + "epoch": 0.1358785510296107, + "flos": 22857142222080.0, + "grad_norm": 4.335778561342026, + "language_loss": 0.81083423, + "learning_rate": 3.8835530513455755e-06, + "loss": 0.83070952, + "num_input_tokens_seen": 48902700, + "router_z_loss_clip": 3.16796875, + "router_z_loss_mlp": 0.37988281, + "step": 2260, + "time_per_iteration": 2.6961915493011475 + }, + { + "auxiliary_loss_clip": 0.01741186, + "auxiliary_loss_mlp": 0.00283703, + "balance_loss_clip": 1.42500925, + "balance_loss_mlp": 0.24584226, + "epoch": 0.13593867428227868, + "flos": 25739404329600.0, + "grad_norm": 168.10289749968092, + "language_loss": 0.83893788, + "learning_rate": 3.883422063247961e-06, + "loss": 0.85918677, + "num_input_tokens_seen": 48922525, + "router_z_loss_clip": 3.15820312, + "router_z_loss_mlp": 0.37866211, + "step": 2261, + "time_per_iteration": 4.219906330108643 + }, + { + "auxiliary_loss_clip": 0.01730501, + "auxiliary_loss_mlp": 0.00280292, + "balance_loss_clip": 1.42029619, + "balance_loss_mlp": 0.24307534, + "epoch": 0.13599879753494665, + "flos": 31249214225280.0, + "grad_norm": 162.23505727950769, + "language_loss": 0.71261948, + "learning_rate": 3.883291003730794e-06, + "loss": 0.73272741, + "num_input_tokens_seen": 48942510, + "router_z_loss_clip": 3.10546875, + "router_z_loss_mlp": 0.37207031, + "step": 2262, + "time_per_iteration": 2.7387874126434326 + }, + { + "auxiliary_loss_clip": 0.01740068, + "auxiliary_loss_mlp": 0.00314197, + "balance_loss_clip": 1.42636013, + "balance_loss_mlp": 0.27838629, + "epoch": 0.1360589207876146, + "flos": 23915034604800.0, + "grad_norm": 172.32622438737866, + "language_loss": 0.93591285, + "learning_rate": 3.883159872799043e-06, + "loss": 0.95645547, + "num_input_tokens_seen": 48962625, + "router_z_loss_clip": 3.13671875, + "router_z_loss_mlp": 0.35839844, + "step": 2263, + "time_per_iteration": 2.748283863067627 + }, + { + "auxiliary_loss_clip": 0.01739632, + "auxiliary_loss_mlp": 0.00299105, + "balance_loss_clip": 1.4289521, + "balance_loss_mlp": 0.25909835, + "epoch": 0.13611904404028258, + "flos": 19974197756160.0, + "grad_norm": 37.94167393000394, + "language_loss": 0.96888369, + "learning_rate": 3.8830286704576815e-06, + "loss": 0.98927104, + "num_input_tokens_seen": 48982525, + "router_z_loss_clip": 3.10742188, + "router_z_loss_mlp": 0.39990234, + "step": 2264, + "time_per_iteration": 2.6471424102783203 + }, + { + "auxiliary_loss_clip": 0.01733124, + "auxiliary_loss_mlp": 0.00318671, + "balance_loss_clip": 1.42359829, + "balance_loss_mlp": 0.2813589, + "epoch": 0.13617916729295054, + "flos": 15340644144000.0, + "grad_norm": 47.392278766567884, + "language_loss": 0.81526411, + "learning_rate": 3.882897396711683e-06, + "loss": 0.83578217, + "num_input_tokens_seen": 48997605, + "router_z_loss_clip": 3.09179688, + "router_z_loss_mlp": 0.37329102, + "step": 2265, + "time_per_iteration": 2.6827893257141113 + }, + { + "auxiliary_loss_clip": 0.01739002, + "auxiliary_loss_mlp": 0.00271807, + "balance_loss_clip": 1.43006039, + "balance_loss_mlp": 0.23363648, + "epoch": 0.1362392905456185, + "flos": 27451445247360.0, + "grad_norm": 28.544922001778595, + "language_loss": 0.75360972, + "learning_rate": 3.882766051566027e-06, + "loss": 0.77371776, + "num_input_tokens_seen": 49018535, + "router_z_loss_clip": 3.0859375, + "router_z_loss_mlp": 0.3815918, + "step": 2266, + "time_per_iteration": 2.714566230773926 + }, + { + "auxiliary_loss_clip": 0.0174319, + "auxiliary_loss_mlp": 0.00301941, + "balance_loss_clip": 1.43010616, + "balance_loss_mlp": 0.26038456, + "epoch": 0.1362994137982865, + "flos": 25009017177600.0, + "grad_norm": 8.446508157478693, + "language_loss": 0.83757091, + "learning_rate": 3.882634635025694e-06, + "loss": 0.85802227, + "num_input_tokens_seen": 49038865, + "router_z_loss_clip": 3.12890625, + "router_z_loss_mlp": 0.41552734, + "step": 2267, + "time_per_iteration": 2.7205007076263428 + }, + { + "auxiliary_loss_clip": 0.01744654, + "auxiliary_loss_mlp": 0.00289301, + "balance_loss_clip": 1.42825198, + "balance_loss_mlp": 0.24962792, + "epoch": 0.13635953705095447, + "flos": 20303031790080.0, + "grad_norm": 7.46225905453687, + "language_loss": 0.89355028, + "learning_rate": 3.882503147095667e-06, + "loss": 0.91388983, + "num_input_tokens_seen": 49058010, + "router_z_loss_clip": 3.16210938, + "router_z_loss_mlp": 0.39672852, + "step": 2268, + "time_per_iteration": 2.618335008621216 + }, + { + "auxiliary_loss_clip": 0.01751331, + "auxiliary_loss_mlp": 0.002961, + "balance_loss_clip": 1.43787932, + "balance_loss_mlp": 0.255831, + "epoch": 0.13641966030362243, + "flos": 31358418549120.0, + "grad_norm": 43.41173343281043, + "language_loss": 0.85545588, + "learning_rate": 3.882371587780931e-06, + "loss": 0.87593013, + "num_input_tokens_seen": 49080330, + "router_z_loss_clip": 3.13476562, + "router_z_loss_mlp": 0.40258789, + "step": 2269, + "time_per_iteration": 2.710521697998047 + }, + { + "auxiliary_loss_clip": 0.01735048, + "auxiliary_loss_mlp": 0.00295405, + "balance_loss_clip": 1.4235003, + "balance_loss_mlp": 0.25453985, + "epoch": 0.1364797835562904, + "flos": 20478095700480.0, + "grad_norm": 9.897315906865742, + "language_loss": 0.88210857, + "learning_rate": 3.882239957086477e-06, + "loss": 0.90241307, + "num_input_tokens_seen": 49097035, + "router_z_loss_clip": 3.11523438, + "router_z_loss_mlp": 0.40917969, + "step": 2270, + "time_per_iteration": 2.6368048191070557 + }, + { + "auxiliary_loss_clip": 0.01742018, + "auxiliary_loss_mlp": 0.00328399, + "balance_loss_clip": 1.42704451, + "balance_loss_mlp": 0.28734294, + "epoch": 0.13653990680895836, + "flos": 13078343802240.0, + "grad_norm": 5.017469089174887, + "language_loss": 0.84668469, + "learning_rate": 3.882108255017295e-06, + "loss": 0.86738884, + "num_input_tokens_seen": 49113945, + "router_z_loss_clip": 3.14648438, + "router_z_loss_mlp": 0.41040039, + "step": 2271, + "time_per_iteration": 2.612729787826538 + }, + { + "auxiliary_loss_clip": 0.01736909, + "auxiliary_loss_mlp": 0.00327012, + "balance_loss_clip": 1.42554927, + "balance_loss_mlp": 0.28540829, + "epoch": 0.13660003006162633, + "flos": 16946712961920.0, + "grad_norm": 3.626413875493014, + "language_loss": 0.87713838, + "learning_rate": 3.881976481578379e-06, + "loss": 0.89777756, + "num_input_tokens_seen": 49132855, + "router_z_loss_clip": 3.11328125, + "router_z_loss_mlp": 0.41625977, + "step": 2272, + "time_per_iteration": 2.616694927215576 + }, + { + "auxiliary_loss_clip": 0.01811654, + "auxiliary_loss_mlp": 0.00137848, + "balance_loss_clip": 1.53115749, + "balance_loss_mlp": 0.12335243, + "epoch": 0.1366601533142943, + "flos": 68682749892480.0, + "grad_norm": 0.727302690083977, + "language_loss": 0.60572517, + "learning_rate": 3.8818446367747255e-06, + "loss": 0.62522018, + "num_input_tokens_seen": 49198310, + "router_z_loss_clip": 2.8125, + "router_z_loss_mlp": 0.14453125, + "step": 2273, + "time_per_iteration": 3.250225782394409 + }, + { + "auxiliary_loss_clip": 0.01704164, + "auxiliary_loss_mlp": 0.00315045, + "balance_loss_clip": 1.39727592, + "balance_loss_mlp": 0.27160525, + "epoch": 0.13672027656696228, + "flos": 19244241567360.0, + "grad_norm": 11.072227576530562, + "language_loss": 0.85491383, + "learning_rate": 3.881712720611336e-06, + "loss": 0.87510598, + "num_input_tokens_seen": 49217250, + "router_z_loss_clip": 3.06640625, + "router_z_loss_mlp": 0.43432617, + "step": 2274, + "time_per_iteration": 2.640986204147339 + }, + { + "auxiliary_loss_clip": 0.01690059, + "auxiliary_loss_mlp": 0.00354835, + "balance_loss_clip": 1.38865674, + "balance_loss_mlp": 0.31659275, + "epoch": 0.13678039981963025, + "flos": 24534924543360.0, + "grad_norm": 7.537891747889789, + "language_loss": 0.85110444, + "learning_rate": 3.881580733093211e-06, + "loss": 0.87155342, + "num_input_tokens_seen": 49236615, + "router_z_loss_clip": 3.015625, + "router_z_loss_mlp": 0.38232422, + "step": 2275, + "time_per_iteration": 2.6740224361419678 + }, + { + "auxiliary_loss_clip": 0.01703761, + "auxiliary_loss_mlp": 0.00347841, + "balance_loss_clip": 1.39837861, + "balance_loss_mlp": 0.30766797, + "epoch": 0.13684052307229821, + "flos": 15669334523520.0, + "grad_norm": 176.7944160985356, + "language_loss": 0.89733207, + "learning_rate": 3.881448674225356e-06, + "loss": 0.91784811, + "num_input_tokens_seen": 49253935, + "router_z_loss_clip": 3.05664062, + "router_z_loss_mlp": 0.40185547, + "step": 2276, + "time_per_iteration": 2.7013766765594482 + }, + { + "auxiliary_loss_clip": 0.01677465, + "auxiliary_loss_mlp": 0.00348976, + "balance_loss_clip": 1.37171948, + "balance_loss_mlp": 0.30820638, + "epoch": 0.13690064632496618, + "flos": 28364689560960.0, + "grad_norm": 6.600546384164188, + "language_loss": 0.82908988, + "learning_rate": 3.881316544012779e-06, + "loss": 0.84935427, + "num_input_tokens_seen": 49273605, + "router_z_loss_clip": 3.0625, + "router_z_loss_mlp": 0.4074707, + "step": 2277, + "time_per_iteration": 2.8744826316833496 + }, + { + "auxiliary_loss_clip": 0.01706424, + "auxiliary_loss_mlp": 0.00362518, + "balance_loss_clip": 1.39746642, + "balance_loss_mlp": 0.32277328, + "epoch": 0.13696076957763414, + "flos": 23404779953280.0, + "grad_norm": 33.597015001327236, + "language_loss": 0.87765014, + "learning_rate": 3.88118434246049e-06, + "loss": 0.89833957, + "num_input_tokens_seen": 49291785, + "router_z_loss_clip": 3.08984375, + "router_z_loss_mlp": 0.3972168, + "step": 2278, + "time_per_iteration": 2.693669557571411 + }, + { + "auxiliary_loss_clip": 0.01694892, + "auxiliary_loss_mlp": 0.00327992, + "balance_loss_clip": 1.38306856, + "balance_loss_mlp": 0.28636366, + "epoch": 0.1370208928303021, + "flos": 37196595601920.0, + "grad_norm": 16.60111835588534, + "language_loss": 0.82618892, + "learning_rate": 3.881052069573502e-06, + "loss": 0.84641773, + "num_input_tokens_seen": 49311405, + "router_z_loss_clip": 3.12109375, + "router_z_loss_mlp": 0.41601562, + "step": 2279, + "time_per_iteration": 2.772155523300171 + }, + { + "auxiliary_loss_clip": 0.01697846, + "auxiliary_loss_mlp": 0.00323736, + "balance_loss_clip": 1.38707042, + "balance_loss_mlp": 0.2835623, + "epoch": 0.13708101608297008, + "flos": 26976311118720.0, + "grad_norm": 18.74074016764787, + "language_loss": 0.83749127, + "learning_rate": 3.880919725356831e-06, + "loss": 0.85770702, + "num_input_tokens_seen": 49331835, + "router_z_loss_clip": 3.109375, + "router_z_loss_mlp": 0.40185547, + "step": 2280, + "time_per_iteration": 2.6925134658813477 + }, + { + "auxiliary_loss_clip": 0.01726125, + "auxiliary_loss_mlp": 0.00362877, + "balance_loss_clip": 1.41533172, + "balance_loss_mlp": 0.32318014, + "epoch": 0.13714113933563807, + "flos": 32556864850560.0, + "grad_norm": 6.844636792640624, + "language_loss": 0.84932792, + "learning_rate": 3.880787309815496e-06, + "loss": 0.87021798, + "num_input_tokens_seen": 49352290, + "router_z_loss_clip": 3.10742188, + "router_z_loss_mlp": 0.39697266, + "step": 2281, + "time_per_iteration": 2.758948564529419 + }, + { + "auxiliary_loss_clip": 0.01721064, + "auxiliary_loss_mlp": 0.0033469, + "balance_loss_clip": 1.4046092, + "balance_loss_mlp": 0.295279, + "epoch": 0.13720126258830603, + "flos": 16101267569280.0, + "grad_norm": 6.90506497429327, + "language_loss": 0.90604949, + "learning_rate": 3.880654822954518e-06, + "loss": 0.92660695, + "num_input_tokens_seen": 49370285, + "router_z_loss_clip": 3.1640625, + "router_z_loss_mlp": 0.39428711, + "step": 2282, + "time_per_iteration": 2.608741283416748 + }, + { + "auxiliary_loss_clip": 0.01727841, + "auxiliary_loss_mlp": 0.00373573, + "balance_loss_clip": 1.41768861, + "balance_loss_mlp": 0.33211195, + "epoch": 0.137261385840974, + "flos": 18953544798720.0, + "grad_norm": 4.16002700806474, + "language_loss": 0.78833055, + "learning_rate": 3.8805222647789195e-06, + "loss": 0.80934465, + "num_input_tokens_seen": 49389610, + "router_z_loss_clip": 3.10546875, + "router_z_loss_mlp": 0.41455078, + "step": 2283, + "time_per_iteration": 2.66733717918396 + }, + { + "auxiliary_loss_clip": 0.0172209, + "auxiliary_loss_mlp": 0.00331278, + "balance_loss_clip": 1.41356301, + "balance_loss_mlp": 0.28991193, + "epoch": 0.13732150909364196, + "flos": 23295360147840.0, + "grad_norm": 27.589505943992513, + "language_loss": 0.91841602, + "learning_rate": 3.880389635293729e-06, + "loss": 0.9389497, + "num_input_tokens_seen": 49408390, + "router_z_loss_clip": 3.08984375, + "router_z_loss_mlp": 0.41381836, + "step": 2284, + "time_per_iteration": 2.655518054962158 + }, + { + "auxiliary_loss_clip": 0.01728103, + "auxiliary_loss_mlp": 0.00402843, + "balance_loss_clip": 1.40869045, + "balance_loss_mlp": 0.35854456, + "epoch": 0.13738163234630993, + "flos": 29351263489920.0, + "grad_norm": 19.65830084499441, + "language_loss": 0.82795978, + "learning_rate": 3.880256934503974e-06, + "loss": 0.84926921, + "num_input_tokens_seen": 49427725, + "router_z_loss_clip": 3.19726562, + "router_z_loss_mlp": 0.44311523, + "step": 2285, + "time_per_iteration": 2.7559046745300293 + }, + { + "auxiliary_loss_clip": 0.01712256, + "auxiliary_loss_mlp": 0.00390329, + "balance_loss_clip": 1.39832366, + "balance_loss_mlp": 0.34896368, + "epoch": 0.1374417555989779, + "flos": 26651319840000.0, + "grad_norm": 13.808260770955913, + "language_loss": 0.8201161, + "learning_rate": 3.880124162414689e-06, + "loss": 0.84114194, + "num_input_tokens_seen": 49449000, + "router_z_loss_clip": 3.13671875, + "router_z_loss_mlp": 0.41381836, + "step": 2286, + "time_per_iteration": 2.68526554107666 + }, + { + "auxiliary_loss_clip": 0.01743714, + "auxiliary_loss_mlp": 0.0033834, + "balance_loss_clip": 1.42121363, + "balance_loss_mlp": 0.29072827, + "epoch": 0.1375018788516459, + "flos": 28403401443840.0, + "grad_norm": 4.880562342215428, + "language_loss": 0.94983089, + "learning_rate": 3.879991319030908e-06, + "loss": 0.97065145, + "num_input_tokens_seen": 49468360, + "router_z_loss_clip": 3.2265625, + "router_z_loss_mlp": 0.47607422, + "step": 2287, + "time_per_iteration": 2.735827922821045 + }, + { + "auxiliary_loss_clip": 0.01752692, + "auxiliary_loss_mlp": 0.00360393, + "balance_loss_clip": 1.43236089, + "balance_loss_mlp": 0.31857491, + "epoch": 0.13756200210431385, + "flos": 37413783187200.0, + "grad_norm": 17.292672356005877, + "language_loss": 0.75064814, + "learning_rate": 3.879858404357666e-06, + "loss": 0.77177894, + "num_input_tokens_seen": 49493450, + "router_z_loss_clip": 3.203125, + "router_z_loss_mlp": 0.41821289, + "step": 2288, + "time_per_iteration": 2.8927557468414307 + }, + { + "auxiliary_loss_clip": 0.01733083, + "auxiliary_loss_mlp": 0.00347856, + "balance_loss_clip": 1.41397572, + "balance_loss_mlp": 0.30386761, + "epoch": 0.13762212535698182, + "flos": 22711021695360.0, + "grad_norm": 48.78721831489223, + "language_loss": 0.97330827, + "learning_rate": 3.879725418400005e-06, + "loss": 0.99411762, + "num_input_tokens_seen": 49511220, + "router_z_loss_clip": 3.19335938, + "router_z_loss_mlp": 0.43969727, + "step": 2289, + "time_per_iteration": 2.6906795501708984 + }, + { + "auxiliary_loss_clip": 0.01733606, + "auxiliary_loss_mlp": 0.00309337, + "balance_loss_clip": 1.41860378, + "balance_loss_mlp": 0.26508641, + "epoch": 0.13768224860964978, + "flos": 23952130375680.0, + "grad_norm": 7.006691013093668, + "language_loss": 0.80991888, + "learning_rate": 3.879592361162969e-06, + "loss": 0.83034837, + "num_input_tokens_seen": 49529820, + "router_z_loss_clip": 3.15039062, + "router_z_loss_mlp": 0.44287109, + "step": 2290, + "time_per_iteration": 2.699404716491699 + }, + { + "auxiliary_loss_clip": 0.01691771, + "auxiliary_loss_mlp": 0.00073373, + "balance_loss_clip": 1.44319534, + "balance_loss_mlp": 0.03846836, + "epoch": 0.13774237186231775, + "flos": 63590438753280.0, + "grad_norm": 0.7183001898460868, + "language_loss": 0.51825833, + "learning_rate": 3.8794592326516015e-06, + "loss": 0.53590977, + "num_input_tokens_seen": 49595325, + "router_z_loss_clip": 2.5, + "router_z_loss_mlp": 0.34960938, + "step": 2291, + "time_per_iteration": 3.2160511016845703 + }, + { + "auxiliary_loss_clip": 0.01722475, + "auxiliary_loss_mlp": 0.00328149, + "balance_loss_clip": 1.40784371, + "balance_loss_mlp": 0.28630635, + "epoch": 0.1378024951149857, + "flos": 24279456038400.0, + "grad_norm": 4.523887314175646, + "language_loss": 0.77549624, + "learning_rate": 3.879326032870952e-06, + "loss": 0.79600251, + "num_input_tokens_seen": 49615850, + "router_z_loss_clip": 3.1484375, + "router_z_loss_mlp": 0.41821289, + "step": 2292, + "time_per_iteration": 2.78210186958313 + }, + { + "auxiliary_loss_clip": 0.01734337, + "auxiliary_loss_mlp": 0.00338729, + "balance_loss_clip": 1.41377878, + "balance_loss_mlp": 0.29576588, + "epoch": 0.13786261836765368, + "flos": 14021537080320.0, + "grad_norm": 6.361987942899286, + "language_loss": 0.89421207, + "learning_rate": 3.879192761826071e-06, + "loss": 0.91494274, + "num_input_tokens_seen": 49631860, + "router_z_loss_clip": 3.20507812, + "router_z_loss_mlp": 0.42944336, + "step": 2293, + "time_per_iteration": 2.6341216564178467 + }, + { + "auxiliary_loss_clip": 0.01716172, + "auxiliary_loss_mlp": 0.00334573, + "balance_loss_clip": 1.39678299, + "balance_loss_mlp": 0.29129964, + "epoch": 0.13792274162032167, + "flos": 28878679226880.0, + "grad_norm": 4.96587203884443, + "language_loss": 0.84903502, + "learning_rate": 3.879059419522011e-06, + "loss": 0.86954248, + "num_input_tokens_seen": 49652145, + "router_z_loss_clip": 3.19335938, + "router_z_loss_mlp": 0.43261719, + "step": 2294, + "time_per_iteration": 2.7460155487060547 + }, + { + "auxiliary_loss_clip": 0.01709893, + "auxiliary_loss_mlp": 0.00340479, + "balance_loss_clip": 1.39471698, + "balance_loss_mlp": 0.29846984, + "epoch": 0.13798286487298964, + "flos": 21141150808320.0, + "grad_norm": 34.48939666128053, + "language_loss": 0.88074201, + "learning_rate": 3.878926005963831e-06, + "loss": 0.90124577, + "num_input_tokens_seen": 49669880, + "router_z_loss_clip": 3.15234375, + "router_z_loss_mlp": 0.42016602, + "step": 2295, + "time_per_iteration": 2.6310698986053467 + }, + { + "auxiliary_loss_clip": 0.0172946, + "auxiliary_loss_mlp": 0.00354658, + "balance_loss_clip": 1.40933859, + "balance_loss_mlp": 0.31171906, + "epoch": 0.1380429881256576, + "flos": 22487477402880.0, + "grad_norm": 39.19919527120189, + "language_loss": 0.84933639, + "learning_rate": 3.878792521156588e-06, + "loss": 0.87017763, + "num_input_tokens_seen": 49687255, + "router_z_loss_clip": 3.20117188, + "router_z_loss_mlp": 0.42919922, + "step": 2296, + "time_per_iteration": 2.67309308052063 + }, + { + "auxiliary_loss_clip": 0.01728347, + "auxiliary_loss_mlp": 0.00346332, + "balance_loss_clip": 1.40845037, + "balance_loss_mlp": 0.30310631, + "epoch": 0.13810311137832557, + "flos": 21393674398080.0, + "grad_norm": 2.9252574518194634, + "language_loss": 0.83854431, + "learning_rate": 3.8786589651053446e-06, + "loss": 0.85929108, + "num_input_tokens_seen": 49706650, + "router_z_loss_clip": 3.19726562, + "router_z_loss_mlp": 0.43237305, + "step": 2297, + "time_per_iteration": 2.7222647666931152 + }, + { + "auxiliary_loss_clip": 0.01728057, + "auxiliary_loss_mlp": 0.00326588, + "balance_loss_clip": 1.40978229, + "balance_loss_mlp": 0.28646231, + "epoch": 0.13816323463099353, + "flos": 25989844930560.0, + "grad_norm": 2.53191593005944, + "language_loss": 0.76338178, + "learning_rate": 3.878525337815164e-06, + "loss": 0.78392828, + "num_input_tokens_seen": 49725715, + "router_z_loss_clip": 3.18359375, + "router_z_loss_mlp": 0.40136719, + "step": 2298, + "time_per_iteration": 4.020915746688843 + }, + { + "auxiliary_loss_clip": 0.01725038, + "auxiliary_loss_mlp": 0.00366753, + "balance_loss_clip": 1.40024281, + "balance_loss_mlp": 0.32235941, + "epoch": 0.1382233578836615, + "flos": 19244313394560.0, + "grad_norm": 2.032312426022189, + "language_loss": 0.92791325, + "learning_rate": 3.878391639291116e-06, + "loss": 0.9488312, + "num_input_tokens_seen": 49744710, + "router_z_loss_clip": 3.25, + "router_z_loss_mlp": 0.44384766, + "step": 2299, + "time_per_iteration": 2.7385354042053223 + }, + { + "auxiliary_loss_clip": 0.01735572, + "auxiliary_loss_mlp": 0.00315968, + "balance_loss_clip": 1.41294074, + "balance_loss_mlp": 0.27212322, + "epoch": 0.1382834811363295, + "flos": 25666290195840.0, + "grad_norm": 22.603018952368355, + "language_loss": 0.83358324, + "learning_rate": 3.878257869538267e-06, + "loss": 0.85409856, + "num_input_tokens_seen": 49764300, + "router_z_loss_clip": 3.22460938, + "router_z_loss_mlp": 0.43847656, + "step": 2300, + "time_per_iteration": 2.6817822456359863 + }, + { + "auxiliary_loss_clip": 0.01733898, + "auxiliary_loss_mlp": 0.00299292, + "balance_loss_clip": 1.41570377, + "balance_loss_mlp": 0.25637668, + "epoch": 0.13834360438899745, + "flos": 19784193788160.0, + "grad_norm": 12.587972417867086, + "language_loss": 0.93122494, + "learning_rate": 3.878124028561692e-06, + "loss": 0.95155692, + "num_input_tokens_seen": 49778380, + "router_z_loss_clip": 3.1875, + "router_z_loss_mlp": 0.42895508, + "step": 2301, + "time_per_iteration": 5.430131435394287 + }, + { + "auxiliary_loss_clip": 0.01740478, + "auxiliary_loss_mlp": 0.0034479, + "balance_loss_clip": 1.41849768, + "balance_loss_mlp": 0.30244705, + "epoch": 0.13840372764166542, + "flos": 26651858544000.0, + "grad_norm": 6.872056206854722, + "language_loss": 0.92295885, + "learning_rate": 3.877990116366466e-06, + "loss": 0.94381154, + "num_input_tokens_seen": 49797460, + "router_z_loss_clip": 3.22070312, + "router_z_loss_mlp": 0.4230957, + "step": 2302, + "time_per_iteration": 2.7313973903656006 + }, + { + "auxiliary_loss_clip": 0.01707113, + "auxiliary_loss_mlp": 0.00105995, + "balance_loss_clip": 1.47101235, + "balance_loss_mlp": 0.06689429, + "epoch": 0.13846385089433338, + "flos": 70510998286080.0, + "grad_norm": 0.7617015544967195, + "language_loss": 0.65824926, + "learning_rate": 3.877856132957667e-06, + "loss": 0.67638034, + "num_input_tokens_seen": 49868005, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.390625, + "step": 2303, + "time_per_iteration": 4.653339385986328 + }, + { + "auxiliary_loss_clip": 0.01761544, + "auxiliary_loss_mlp": 0.00298012, + "balance_loss_clip": 1.4360075, + "balance_loss_mlp": 0.25717112, + "epoch": 0.13852397414700135, + "flos": 17348732956800.0, + "grad_norm": 67.54570059512443, + "language_loss": 0.83863068, + "learning_rate": 3.877722078340374e-06, + "loss": 0.85922629, + "num_input_tokens_seen": 49885825, + "router_z_loss_clip": 3.25585938, + "router_z_loss_mlp": 0.40844727, + "step": 2304, + "time_per_iteration": 2.62591552734375 + }, + { + "auxiliary_loss_clip": 0.01771703, + "auxiliary_loss_mlp": 0.00303932, + "balance_loss_clip": 1.44084692, + "balance_loss_mlp": 0.26356822, + "epoch": 0.13858409739966931, + "flos": 21543781334400.0, + "grad_norm": 4.704670060317913, + "language_loss": 0.84059846, + "learning_rate": 3.877587952519672e-06, + "loss": 0.86135483, + "num_input_tokens_seen": 49905975, + "router_z_loss_clip": 3.3046875, + "router_z_loss_mlp": 0.40380859, + "step": 2305, + "time_per_iteration": 2.691819906234741 + }, + { + "auxiliary_loss_clip": 0.01791838, + "auxiliary_loss_mlp": 0.0035628, + "balance_loss_clip": 1.45667458, + "balance_loss_mlp": 0.30964553, + "epoch": 0.13864422065233728, + "flos": 21579907438080.0, + "grad_norm": 11.761922827058488, + "language_loss": 0.94843209, + "learning_rate": 3.877453755500647e-06, + "loss": 0.96991324, + "num_input_tokens_seen": 49925800, + "router_z_loss_clip": 3.3515625, + "router_z_loss_mlp": 0.46655273, + "step": 2306, + "time_per_iteration": 2.6301190853118896 + }, + { + "auxiliary_loss_clip": 0.01749834, + "auxiliary_loss_mlp": 0.00055483, + "balance_loss_clip": 1.49697828, + "balance_loss_mlp": 0.02553809, + "epoch": 0.13870434390500527, + "flos": 53371156872960.0, + "grad_norm": 0.8480063924496607, + "language_loss": 0.5845781, + "learning_rate": 3.877319487288387e-06, + "loss": 0.60263127, + "num_input_tokens_seen": 49977620, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.29882812, + "step": 2307, + "time_per_iteration": 3.166128635406494 + }, + { + "auxiliary_loss_clip": 0.01861496, + "auxiliary_loss_mlp": 0.00375112, + "balance_loss_clip": 1.49340355, + "balance_loss_mlp": 0.32859692, + "epoch": 0.13876446715767324, + "flos": 22565906749440.0, + "grad_norm": 7.964671322011736, + "language_loss": 0.86287397, + "learning_rate": 3.877185147887984e-06, + "loss": 0.88524008, + "num_input_tokens_seen": 49996650, + "router_z_loss_clip": 3.68359375, + "router_z_loss_mlp": 0.46484375, + "step": 2308, + "time_per_iteration": 2.6811676025390625 + }, + { + "auxiliary_loss_clip": 0.01891951, + "auxiliary_loss_mlp": 0.00356229, + "balance_loss_clip": 1.52612126, + "balance_loss_mlp": 0.31412396, + "epoch": 0.1388245904103412, + "flos": 20705231352960.0, + "grad_norm": 6.488551298485882, + "language_loss": 0.85698849, + "learning_rate": 3.877050737304533e-06, + "loss": 0.87947023, + "num_input_tokens_seen": 50015640, + "router_z_loss_clip": 3.65820312, + "router_z_loss_mlp": 0.42089844, + "step": 2309, + "time_per_iteration": 2.6998629570007324 + }, + { + "auxiliary_loss_clip": 0.01925409, + "auxiliary_loss_mlp": 0.00392009, + "balance_loss_clip": 1.53719926, + "balance_loss_mlp": 0.34923679, + "epoch": 0.13888471366300917, + "flos": 20554729367040.0, + "grad_norm": 22.183255974732976, + "language_loss": 0.76913357, + "learning_rate": 3.876916255543129e-06, + "loss": 0.79230779, + "num_input_tokens_seen": 50033500, + "router_z_loss_clip": 3.88476562, + "router_z_loss_mlp": 0.42773438, + "step": 2310, + "time_per_iteration": 2.660141706466675 + }, + { + "auxiliary_loss_clip": 0.01971124, + "auxiliary_loss_mlp": 0.0040137, + "balance_loss_clip": 1.56523359, + "balance_loss_mlp": 0.35745305, + "epoch": 0.13894483691567713, + "flos": 13838033473920.0, + "grad_norm": 3.571705419098448, + "language_loss": 0.91221684, + "learning_rate": 3.8767817026088725e-06, + "loss": 0.93594176, + "num_input_tokens_seen": 50050075, + "router_z_loss_clip": 4.05859375, + "router_z_loss_mlp": 0.43945312, + "step": 2311, + "time_per_iteration": 2.5867483615875244 + }, + { + "auxiliary_loss_clip": 0.01966215, + "auxiliary_loss_mlp": 0.00388726, + "balance_loss_clip": 1.55660105, + "balance_loss_mlp": 0.34576306, + "epoch": 0.1390049601683451, + "flos": 28031186759040.0, + "grad_norm": 3.623504820829164, + "language_loss": 0.88195312, + "learning_rate": 3.876647078506866e-06, + "loss": 0.9055025, + "num_input_tokens_seen": 50070080, + "router_z_loss_clip": 4.09375, + "router_z_loss_mlp": 0.4296875, + "step": 2312, + "time_per_iteration": 2.7510104179382324 + }, + { + "auxiliary_loss_clip": 0.01991648, + "auxiliary_loss_mlp": 0.00472129, + "balance_loss_clip": 1.57087708, + "balance_loss_mlp": 0.4284265, + "epoch": 0.13906508342101306, + "flos": 26756860976640.0, + "grad_norm": 7.888174054819899, + "language_loss": 0.93511891, + "learning_rate": 3.876512383242215e-06, + "loss": 0.95975661, + "num_input_tokens_seen": 50090040, + "router_z_loss_clip": 4.2109375, + "router_z_loss_mlp": 0.43676758, + "step": 2313, + "time_per_iteration": 2.6738502979278564 + }, + { + "auxiliary_loss_clip": 0.02014479, + "auxiliary_loss_mlp": 0.00434077, + "balance_loss_clip": 1.58464479, + "balance_loss_mlp": 0.38718003, + "epoch": 0.13912520667368106, + "flos": 24535104111360.0, + "grad_norm": 7.594127547465398, + "language_loss": 0.88928771, + "learning_rate": 3.876377616820024e-06, + "loss": 0.9137733, + "num_input_tokens_seen": 50110595, + "router_z_loss_clip": 4.30078125, + "router_z_loss_mlp": 0.46899414, + "step": 2314, + "time_per_iteration": 2.717693328857422 + }, + { + "auxiliary_loss_clip": 0.02015357, + "auxiliary_loss_mlp": 0.00482565, + "balance_loss_clip": 1.58206487, + "balance_loss_mlp": 0.43631226, + "epoch": 0.13918532992634902, + "flos": 19383215287680.0, + "grad_norm": 31.820317228997382, + "language_loss": 0.93154335, + "learning_rate": 3.876242779245409e-06, + "loss": 0.95652258, + "num_input_tokens_seen": 50125430, + "router_z_loss_clip": 4.3359375, + "router_z_loss_mlp": 0.46240234, + "step": 2315, + "time_per_iteration": 2.622575521469116 + }, + { + "auxiliary_loss_clip": 0.02032928, + "auxiliary_loss_mlp": 0.00531646, + "balance_loss_clip": 1.59063399, + "balance_loss_mlp": 0.48465356, + "epoch": 0.139245453179017, + "flos": 21323756574720.0, + "grad_norm": 5.6011943662800645, + "language_loss": 0.86876619, + "learning_rate": 3.876107870523477e-06, + "loss": 0.89441192, + "num_input_tokens_seen": 50144120, + "router_z_loss_clip": 4.42578125, + "router_z_loss_mlp": 0.47045898, + "step": 2316, + "time_per_iteration": 2.6522772312164307 + }, + { + "auxiliary_loss_clip": 0.02063999, + "auxiliary_loss_mlp": 0.00553569, + "balance_loss_clip": 1.61072147, + "balance_loss_mlp": 0.50431168, + "epoch": 0.13930557643168495, + "flos": 19500607912320.0, + "grad_norm": 4.419107648026177, + "language_loss": 0.82260925, + "learning_rate": 3.875972890659349e-06, + "loss": 0.84878492, + "num_input_tokens_seen": 50162500, + "router_z_loss_clip": 4.53125, + "router_z_loss_mlp": 0.49291992, + "step": 2317, + "time_per_iteration": 2.6538596153259277 + }, + { + "auxiliary_loss_clip": 0.020831, + "auxiliary_loss_mlp": 0.00526248, + "balance_loss_clip": 1.61954033, + "balance_loss_mlp": 0.4777537, + "epoch": 0.13936569968435292, + "flos": 25410821690880.0, + "grad_norm": 39.29366775197261, + "language_loss": 0.87803257, + "learning_rate": 3.875837839658139e-06, + "loss": 0.90412605, + "num_input_tokens_seen": 50182415, + "router_z_loss_clip": 4.62890625, + "router_z_loss_mlp": 0.48510742, + "step": 2318, + "time_per_iteration": 2.7538936138153076 + }, + { + "auxiliary_loss_clip": 0.01956781, + "auxiliary_loss_mlp": 0.00141006, + "balance_loss_clip": 1.667202, + "balance_loss_mlp": 0.12202819, + "epoch": 0.13942582293702088, + "flos": 70771063731840.0, + "grad_norm": 0.8435574320031615, + "language_loss": 0.59259385, + "learning_rate": 3.87570271752497e-06, + "loss": 0.6135717, + "num_input_tokens_seen": 50245160, + "router_z_loss_clip": 2.90625, + "router_z_loss_mlp": 0.18945312, + "step": 2319, + "time_per_iteration": 3.2219576835632324 + }, + { + "auxiliary_loss_clip": 0.02041471, + "auxiliary_loss_mlp": 0.00471772, + "balance_loss_clip": 1.59567428, + "balance_loss_mlp": 0.42628184, + "epoch": 0.13948594618968888, + "flos": 35590885920000.0, + "grad_norm": 7.943269784462708, + "language_loss": 0.74114347, + "learning_rate": 3.875567524264967e-06, + "loss": 0.76627588, + "num_input_tokens_seen": 50268215, + "router_z_loss_clip": 4.45703125, + "router_z_loss_mlp": 0.45483398, + "step": 2320, + "time_per_iteration": 2.7902398109436035 + }, + { + "auxiliary_loss_clip": 0.02064874, + "auxiliary_loss_mlp": 0.00494552, + "balance_loss_clip": 1.61355841, + "balance_loss_mlp": 0.44806001, + "epoch": 0.13954606944235684, + "flos": 21105204272640.0, + "grad_norm": 5.844751817752818, + "language_loss": 0.74904084, + "learning_rate": 3.875432259883256e-06, + "loss": 0.77463508, + "num_input_tokens_seen": 50288575, + "router_z_loss_clip": 4.51171875, + "router_z_loss_mlp": 0.46484375, + "step": 2321, + "time_per_iteration": 2.663471221923828 + }, + { + "auxiliary_loss_clip": 0.02068644, + "auxiliary_loss_mlp": 0.00484691, + "balance_loss_clip": 1.60742033, + "balance_loss_mlp": 0.43903407, + "epoch": 0.1396061926950248, + "flos": 25044425009280.0, + "grad_norm": 44.97992675602325, + "language_loss": 0.92569709, + "learning_rate": 3.875296924384965e-06, + "loss": 0.95123041, + "num_input_tokens_seen": 50308735, + "router_z_loss_clip": 4.609375, + "router_z_loss_mlp": 0.45678711, + "step": 2322, + "time_per_iteration": 2.7132582664489746 + }, + { + "auxiliary_loss_clip": 0.02098242, + "auxiliary_loss_mlp": 0.0046136, + "balance_loss_clip": 1.6317966, + "balance_loss_mlp": 0.41484424, + "epoch": 0.13966631594769277, + "flos": 37634023428480.0, + "grad_norm": 17.46903950478155, + "language_loss": 0.7180748, + "learning_rate": 3.875161517775226e-06, + "loss": 0.74367082, + "num_input_tokens_seen": 50331025, + "router_z_loss_clip": 4.66796875, + "router_z_loss_mlp": 0.46533203, + "step": 2323, + "time_per_iteration": 2.8142175674438477 + }, + { + "auxiliary_loss_clip": 0.0207826, + "auxiliary_loss_mlp": 0.00494412, + "balance_loss_clip": 1.61026907, + "balance_loss_mlp": 0.44625127, + "epoch": 0.13972643920036074, + "flos": 16690993061760.0, + "grad_norm": 33.7020484288853, + "language_loss": 0.97969031, + "learning_rate": 3.875026040059175e-06, + "loss": 1.00541711, + "num_input_tokens_seen": 50349725, + "router_z_loss_clip": 4.68359375, + "router_z_loss_mlp": 0.48120117, + "step": 2324, + "time_per_iteration": 2.604588031768799 + }, + { + "auxiliary_loss_clip": 0.02089966, + "auxiliary_loss_mlp": 0.00419522, + "balance_loss_clip": 1.62247849, + "balance_loss_mlp": 0.37341198, + "epoch": 0.1397865624530287, + "flos": 23331055288320.0, + "grad_norm": 17.336120014140587, + "language_loss": 0.80208284, + "learning_rate": 3.8748904912419485e-06, + "loss": 0.82717776, + "num_input_tokens_seen": 50367965, + "router_z_loss_clip": 4.67578125, + "router_z_loss_mlp": 0.46118164, + "step": 2325, + "time_per_iteration": 2.6371679306030273 + }, + { + "auxiliary_loss_clip": 0.02125297, + "auxiliary_loss_mlp": 0.00430376, + "balance_loss_clip": 1.63803983, + "balance_loss_mlp": 0.38421804, + "epoch": 0.13984668570569667, + "flos": 22778317825920.0, + "grad_norm": 46.88244748697015, + "language_loss": 0.88785732, + "learning_rate": 3.874754871328688e-06, + "loss": 0.91341406, + "num_input_tokens_seen": 50385605, + "router_z_loss_clip": 4.8671875, + "router_z_loss_mlp": 0.46118164, + "step": 2326, + "time_per_iteration": 2.6156606674194336 + }, + { + "auxiliary_loss_clip": 0.02138633, + "auxiliary_loss_mlp": 0.0046333, + "balance_loss_clip": 1.65406132, + "balance_loss_mlp": 0.41888878, + "epoch": 0.13990680895836466, + "flos": 19464553635840.0, + "grad_norm": 2.6135810138222157, + "language_loss": 0.92901158, + "learning_rate": 3.874619180324534e-06, + "loss": 0.95503128, + "num_input_tokens_seen": 50403985, + "router_z_loss_clip": 4.84765625, + "router_z_loss_mlp": 0.44482422, + "step": 2327, + "time_per_iteration": 2.631580114364624 + }, + { + "auxiliary_loss_clip": 0.02149381, + "auxiliary_loss_mlp": 0.00470044, + "balance_loss_clip": 1.65263152, + "balance_loss_mlp": 0.42390972, + "epoch": 0.13996693221103262, + "flos": 20303283185280.0, + "grad_norm": 23.547553533120677, + "language_loss": 0.90065622, + "learning_rate": 3.874483418234632e-06, + "loss": 0.92685044, + "num_input_tokens_seen": 50421590, + "router_z_loss_clip": 4.9609375, + "router_z_loss_mlp": 0.46118164, + "step": 2328, + "time_per_iteration": 2.5938303470611572 + }, + { + "auxiliary_loss_clip": 0.02124869, + "auxiliary_loss_mlp": 0.00478792, + "balance_loss_clip": 1.63643241, + "balance_loss_mlp": 0.4330875, + "epoch": 0.1400270554637006, + "flos": 26617707688320.0, + "grad_norm": 25.617212924096126, + "language_loss": 0.79347759, + "learning_rate": 3.874347585064131e-06, + "loss": 0.81951427, + "num_input_tokens_seen": 50443945, + "router_z_loss_clip": 4.8828125, + "router_z_loss_mlp": 0.45703125, + "step": 2329, + "time_per_iteration": 2.6913211345672607 + }, + { + "auxiliary_loss_clip": 0.02136505, + "auxiliary_loss_mlp": 0.00448635, + "balance_loss_clip": 1.64422739, + "balance_loss_mlp": 0.40159535, + "epoch": 0.14008717871636855, + "flos": 19391475415680.0, + "grad_norm": 6.028791536012523, + "language_loss": 0.84277952, + "learning_rate": 3.874211680818183e-06, + "loss": 0.86863101, + "num_input_tokens_seen": 50462065, + "router_z_loss_clip": 4.9140625, + "router_z_loss_mlp": 0.47070312, + "step": 2330, + "time_per_iteration": 2.6378204822540283 + }, + { + "auxiliary_loss_clip": 0.02135177, + "auxiliary_loss_mlp": 0.00448244, + "balance_loss_clip": 1.64123321, + "balance_loss_mlp": 0.40363634, + "epoch": 0.14014730196903652, + "flos": 15304266645120.0, + "grad_norm": 124.29133595590609, + "language_loss": 0.79233456, + "learning_rate": 3.87407570550194e-06, + "loss": 0.81816882, + "num_input_tokens_seen": 50479565, + "router_z_loss_clip": 4.93359375, + "router_z_loss_mlp": 0.44604492, + "step": 2331, + "time_per_iteration": 2.7307207584381104 + }, + { + "auxiliary_loss_clip": 0.02128656, + "auxiliary_loss_mlp": 0.0040661, + "balance_loss_clip": 1.64735413, + "balance_loss_mlp": 0.36581695, + "epoch": 0.14020742522170448, + "flos": 14939701557120.0, + "grad_norm": 8.531332065999765, + "language_loss": 0.78101063, + "learning_rate": 3.873939659120557e-06, + "loss": 0.80636322, + "num_input_tokens_seen": 50497305, + "router_z_loss_clip": 4.8125, + "router_z_loss_mlp": 0.40795898, + "step": 2332, + "time_per_iteration": 2.6431050300598145 + }, + { + "auxiliary_loss_clip": 0.0205468, + "auxiliary_loss_mlp": 0.00108062, + "balance_loss_clip": 1.73468852, + "balance_loss_mlp": 0.09041937, + "epoch": 0.14026754847437245, + "flos": 48824580044160.0, + "grad_norm": 1.3786048769294796, + "language_loss": 0.55892992, + "learning_rate": 3.873803541679196e-06, + "loss": 0.58055735, + "num_input_tokens_seen": 50549735, + "router_z_loss_clip": 3.203125, + "router_z_loss_mlp": 0.17675781, + "step": 2333, + "time_per_iteration": 3.010904312133789 + }, + { + "auxiliary_loss_clip": 0.02105501, + "auxiliary_loss_mlp": 0.00383396, + "balance_loss_clip": 1.6255132, + "balance_loss_mlp": 0.34224448, + "epoch": 0.14032767172704044, + "flos": 25773267876480.0, + "grad_norm": 13.395522076625811, + "language_loss": 0.87654328, + "learning_rate": 3.873667353183016e-06, + "loss": 0.90143228, + "num_input_tokens_seen": 50570100, + "router_z_loss_clip": 4.796875, + "router_z_loss_mlp": 0.41162109, + "step": 2334, + "time_per_iteration": 2.70176100730896 + }, + { + "auxiliary_loss_clip": 0.02094586, + "auxiliary_loss_mlp": 0.0041486, + "balance_loss_clip": 1.6170584, + "balance_loss_mlp": 0.37044269, + "epoch": 0.1403877949797084, + "flos": 21216312017280.0, + "grad_norm": 1.8910572150981582, + "language_loss": 0.85997134, + "learning_rate": 3.8735310936371825e-06, + "loss": 0.88506579, + "num_input_tokens_seen": 50589185, + "router_z_loss_clip": 4.7734375, + "router_z_loss_mlp": 0.44433594, + "step": 2335, + "time_per_iteration": 2.6413543224334717 + }, + { + "auxiliary_loss_clip": 0.02036472, + "auxiliary_loss_mlp": 0.00460885, + "balance_loss_clip": 1.58144999, + "balance_loss_mlp": 0.41110307, + "epoch": 0.14044791823237637, + "flos": 22747973811840.0, + "grad_norm": 40.659141386231646, + "language_loss": 0.88444376, + "learning_rate": 3.873394763046862e-06, + "loss": 0.90941727, + "num_input_tokens_seen": 50609645, + "router_z_loss_clip": 4.55078125, + "router_z_loss_mlp": 0.49707031, + "step": 2336, + "time_per_iteration": 2.759342908859253 + }, + { + "auxiliary_loss_clip": 0.02074772, + "auxiliary_loss_mlp": 0.00416953, + "balance_loss_clip": 1.60831463, + "balance_loss_mlp": 0.37055698, + "epoch": 0.14050804148504434, + "flos": 22964443125120.0, + "grad_norm": 8.284414062387494, + "language_loss": 0.86074984, + "learning_rate": 3.873258361417225e-06, + "loss": 0.88566709, + "num_input_tokens_seen": 50628385, + "router_z_loss_clip": 4.66796875, + "router_z_loss_mlp": 0.46411133, + "step": 2337, + "time_per_iteration": 2.6402621269226074 + }, + { + "auxiliary_loss_clip": 0.02096601, + "auxiliary_loss_mlp": 0.00440076, + "balance_loss_clip": 1.62700009, + "balance_loss_mlp": 0.39248723, + "epoch": 0.1405681647377123, + "flos": 22200336080640.0, + "grad_norm": 50.81895528868738, + "language_loss": 0.85825264, + "learning_rate": 3.873121888753442e-06, + "loss": 0.88361937, + "num_input_tokens_seen": 50647260, + "router_z_loss_clip": 4.69140625, + "router_z_loss_mlp": 0.47631836, + "step": 2338, + "time_per_iteration": 2.6338813304901123 + }, + { + "auxiliary_loss_clip": 0.02082928, + "auxiliary_loss_mlp": 0.00391347, + "balance_loss_clip": 1.60980248, + "balance_loss_mlp": 0.34614325, + "epoch": 0.14062828799038027, + "flos": 23732787974400.0, + "grad_norm": 36.28339122645138, + "language_loss": 0.92307603, + "learning_rate": 3.87298534506069e-06, + "loss": 0.94781882, + "num_input_tokens_seen": 50666130, + "router_z_loss_clip": 4.73828125, + "router_z_loss_mlp": 0.45214844, + "step": 2339, + "time_per_iteration": 2.6442949771881104 + }, + { + "auxiliary_loss_clip": 0.02059916, + "auxiliary_loss_mlp": 0.00367875, + "balance_loss_clip": 1.60308719, + "balance_loss_mlp": 0.3268432, + "epoch": 0.14068841124304826, + "flos": 39202493685120.0, + "grad_norm": 9.838141493739467, + "language_loss": 0.71504223, + "learning_rate": 3.872848730344146e-06, + "loss": 0.73932016, + "num_input_tokens_seen": 50687440, + "router_z_loss_clip": 4.5625, + "router_z_loss_mlp": 0.41015625, + "step": 2340, + "time_per_iteration": 4.135826826095581 + }, + { + "auxiliary_loss_clip": 0.0207862, + "auxiliary_loss_mlp": 0.00424127, + "balance_loss_clip": 1.6196661, + "balance_loss_mlp": 0.37837428, + "epoch": 0.14074853449571623, + "flos": 20192283181440.0, + "grad_norm": 12.995165935408608, + "language_loss": 0.86857504, + "learning_rate": 3.87271204460899e-06, + "loss": 0.89360255, + "num_input_tokens_seen": 50704030, + "router_z_loss_clip": 4.59375, + "router_z_loss_mlp": 0.45727539, + "step": 2341, + "time_per_iteration": 2.6327874660491943 + }, + { + "auxiliary_loss_clip": 0.0206525, + "auxiliary_loss_mlp": 0.00420583, + "balance_loss_clip": 1.60622334, + "balance_loss_mlp": 0.3744725, + "epoch": 0.1408086577483842, + "flos": 18405871153920.0, + "grad_norm": 87.27164423831798, + "language_loss": 0.87307137, + "learning_rate": 3.8725752878604066e-06, + "loss": 0.89792967, + "num_input_tokens_seen": 50723305, + "router_z_loss_clip": 4.59375, + "router_z_loss_mlp": 0.4609375, + "step": 2342, + "time_per_iteration": 2.6445467472076416 + }, + { + "auxiliary_loss_clip": 0.02082878, + "auxiliary_loss_mlp": 0.00388842, + "balance_loss_clip": 1.63145471, + "balance_loss_mlp": 0.34575993, + "epoch": 0.14086878100105216, + "flos": 25264593423360.0, + "grad_norm": 3.1388579377937753, + "language_loss": 0.84251189, + "learning_rate": 3.87243846010358e-06, + "loss": 0.8672291, + "num_input_tokens_seen": 50743270, + "router_z_loss_clip": 4.515625, + "router_z_loss_mlp": 0.43066406, + "step": 2343, + "time_per_iteration": 4.072537183761597 + }, + { + "auxiliary_loss_clip": 0.01670557, + "auxiliary_loss_mlp": 0.00229138, + "balance_loss_clip": 1.44151235, + "balance_loss_mlp": 0.21626341, + "epoch": 0.14092890425372012, + "flos": 65978388869760.0, + "grad_norm": 0.8218595230577952, + "language_loss": 0.6127317, + "learning_rate": 3.872301561343699e-06, + "loss": 0.63172865, + "num_input_tokens_seen": 50802710, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.12890625, + "step": 2344, + "time_per_iteration": 4.485307693481445 + }, + { + "auxiliary_loss_clip": 0.02033222, + "auxiliary_loss_mlp": 0.00415453, + "balance_loss_clip": 1.59605706, + "balance_loss_mlp": 0.3728233, + "epoch": 0.1409890275063881, + "flos": 23694973931520.0, + "grad_norm": 6.724294118531236, + "language_loss": 0.70419282, + "learning_rate": 3.872164591585956e-06, + "loss": 0.72867954, + "num_input_tokens_seen": 50822625, + "router_z_loss_clip": 4.3671875, + "router_z_loss_mlp": 0.42651367, + "step": 2345, + "time_per_iteration": 2.6330220699310303 + }, + { + "auxiliary_loss_clip": 0.01990281, + "auxiliary_loss_mlp": 0.00386722, + "balance_loss_clip": 1.55611944, + "balance_loss_mlp": 0.34290054, + "epoch": 0.14104915075905605, + "flos": 23623152687360.0, + "grad_norm": 36.090470136190845, + "language_loss": 0.83182681, + "learning_rate": 3.8720275508355435e-06, + "loss": 0.85559678, + "num_input_tokens_seen": 50842330, + "router_z_loss_clip": 4.33984375, + "router_z_loss_mlp": 0.43823242, + "step": 2346, + "time_per_iteration": 4.011506080627441 + }, + { + "auxiliary_loss_clip": 0.01940948, + "auxiliary_loss_mlp": 0.00409171, + "balance_loss_clip": 1.53149128, + "balance_loss_mlp": 0.36132064, + "epoch": 0.14110927401172405, + "flos": 20595165102720.0, + "grad_norm": 8.792522701452404, + "language_loss": 0.83181393, + "learning_rate": 3.8718904390976585e-06, + "loss": 0.85531509, + "num_input_tokens_seen": 50861035, + "router_z_loss_clip": 4.09375, + "router_z_loss_mlp": 0.47802734, + "step": 2347, + "time_per_iteration": 2.6255335807800293 + }, + { + "auxiliary_loss_clip": 0.01938959, + "auxiliary_loss_mlp": 0.00410601, + "balance_loss_clip": 1.53087556, + "balance_loss_mlp": 0.36732781, + "epoch": 0.141169397264392, + "flos": 28548049512960.0, + "grad_norm": 540.9514366630341, + "language_loss": 0.8413018, + "learning_rate": 3.8717532563775e-06, + "loss": 0.86479741, + "num_input_tokens_seen": 50880105, + "router_z_loss_clip": 4.08007812, + "router_z_loss_mlp": 0.43286133, + "step": 2348, + "time_per_iteration": 2.743312358856201 + }, + { + "auxiliary_loss_clip": 0.0192801, + "auxiliary_loss_mlp": 0.00411596, + "balance_loss_clip": 1.52277517, + "balance_loss_mlp": 0.36634457, + "epoch": 0.14122952051705998, + "flos": 17092258871040.0, + "grad_norm": 34.876575289301215, + "language_loss": 0.92166328, + "learning_rate": 3.871616002680272e-06, + "loss": 0.94505942, + "num_input_tokens_seen": 50897720, + "router_z_loss_clip": 4.05664062, + "router_z_loss_mlp": 0.45288086, + "step": 2349, + "time_per_iteration": 2.641409158706665 + }, + { + "auxiliary_loss_clip": 0.01876492, + "auxiliary_loss_mlp": 0.00397231, + "balance_loss_clip": 1.49277854, + "balance_loss_mlp": 0.35579389, + "epoch": 0.14128964376972794, + "flos": 28946801370240.0, + "grad_norm": 25.23571791026447, + "language_loss": 0.93530399, + "learning_rate": 3.871478678011177e-06, + "loss": 0.95804119, + "num_input_tokens_seen": 50918385, + "router_z_loss_clip": 3.83789062, + "router_z_loss_mlp": 0.41430664, + "step": 2350, + "time_per_iteration": 2.7005515098571777 + }, + { + "auxiliary_loss_clip": 0.01837938, + "auxiliary_loss_mlp": 0.00406417, + "balance_loss_clip": 1.47458291, + "balance_loss_mlp": 0.36128461, + "epoch": 0.1413497670223959, + "flos": 18989778643200.0, + "grad_norm": 2.69384665636976, + "language_loss": 0.87426722, + "learning_rate": 3.871341282375423e-06, + "loss": 0.89671075, + "num_input_tokens_seen": 50938270, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 0.45141602, + "step": 2351, + "time_per_iteration": 2.662097215652466 + }, + { + "auxiliary_loss_clip": 0.01820379, + "auxiliary_loss_mlp": 0.0040464, + "balance_loss_clip": 1.45459998, + "balance_loss_mlp": 0.36472824, + "epoch": 0.14140989027506387, + "flos": 29862236413440.0, + "grad_norm": 4.688304809743304, + "language_loss": 0.89524376, + "learning_rate": 3.871203815778219e-06, + "loss": 0.91749394, + "num_input_tokens_seen": 50958155, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 0.39916992, + "step": 2352, + "time_per_iteration": 2.7027509212493896 + }, + { + "auxiliary_loss_clip": 0.01471364, + "auxiliary_loss_mlp": 0.00259536, + "balance_loss_clip": 1.26722121, + "balance_loss_mlp": 0.24513583, + "epoch": 0.14147001352773186, + "flos": 62079532041600.0, + "grad_norm": 5.362709869596716, + "language_loss": 0.62034833, + "learning_rate": 3.87106627822478e-06, + "loss": 0.63765734, + "num_input_tokens_seen": 51020705, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.14355469, + "step": 2353, + "time_per_iteration": 3.09131121635437 + }, + { + "auxiliary_loss_clip": 0.01808988, + "auxiliary_loss_mlp": 0.00430917, + "balance_loss_clip": 1.4499681, + "balance_loss_mlp": 0.39071941, + "epoch": 0.14153013678039983, + "flos": 22017514832640.0, + "grad_norm": 5.219364425302211, + "language_loss": 0.92351568, + "learning_rate": 3.8709286697203196e-06, + "loss": 0.94591469, + "num_input_tokens_seen": 51039995, + "router_z_loss_clip": 3.58984375, + "router_z_loss_mlp": 0.40185547, + "step": 2354, + "time_per_iteration": 2.6497347354888916 + }, + { + "auxiliary_loss_clip": 0.0180633, + "auxiliary_loss_mlp": 0.00407942, + "balance_loss_clip": 1.4441396, + "balance_loss_mlp": 0.36793587, + "epoch": 0.1415902600330678, + "flos": 19720093968000.0, + "grad_norm": 8.222446717949463, + "language_loss": 0.79457772, + "learning_rate": 3.870790990270057e-06, + "loss": 0.81672037, + "num_input_tokens_seen": 51059075, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 0.40014648, + "step": 2355, + "time_per_iteration": 2.7837769985198975 + }, + { + "auxiliary_loss_clip": 0.01570487, + "auxiliary_loss_mlp": 0.00371067, + "balance_loss_clip": 1.35978174, + "balance_loss_mlp": 0.35161239, + "epoch": 0.14165038328573576, + "flos": 65900929190400.0, + "grad_norm": 0.6735003149634003, + "language_loss": 0.51615012, + "learning_rate": 3.870653239879212e-06, + "loss": 0.53556567, + "num_input_tokens_seen": 51120380, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.19433594, + "step": 2356, + "time_per_iteration": 3.071575880050659 + }, + { + "auxiliary_loss_clip": 0.0181862, + "auxiliary_loss_mlp": 0.00369091, + "balance_loss_clip": 1.45944858, + "balance_loss_mlp": 0.33008525, + "epoch": 0.14171050653840372, + "flos": 12130158533760.0, + "grad_norm": 10.699405528032358, + "language_loss": 0.78852111, + "learning_rate": 3.8705154185530095e-06, + "loss": 0.81039822, + "num_input_tokens_seen": 51136950, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 0.38989258, + "step": 2357, + "time_per_iteration": 2.620875835418701 + }, + { + "auxiliary_loss_clip": 0.01803306, + "auxiliary_loss_mlp": 0.00374961, + "balance_loss_clip": 1.44701469, + "balance_loss_mlp": 0.33519322, + "epoch": 0.1417706297910717, + "flos": 20412487509120.0, + "grad_norm": 16.5363197681133, + "language_loss": 0.89480424, + "learning_rate": 3.870377526296674e-06, + "loss": 0.916587, + "num_input_tokens_seen": 51155175, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 0.39770508, + "step": 2358, + "time_per_iteration": 2.6388328075408936 + }, + { + "auxiliary_loss_clip": 0.01787049, + "auxiliary_loss_mlp": 0.00411493, + "balance_loss_clip": 1.42879152, + "balance_loss_mlp": 0.36724269, + "epoch": 0.14183075304373965, + "flos": 22380607463040.0, + "grad_norm": 107.35664589963177, + "language_loss": 0.76729172, + "learning_rate": 3.870239563115436e-06, + "loss": 0.7892772, + "num_input_tokens_seen": 51174500, + "router_z_loss_clip": 3.58203125, + "router_z_loss_mlp": 0.44238281, + "step": 2359, + "time_per_iteration": 2.6506779193878174 + }, + { + "auxiliary_loss_clip": 0.01797141, + "auxiliary_loss_mlp": 0.00346592, + "balance_loss_clip": 1.43993688, + "balance_loss_mlp": 0.30875525, + "epoch": 0.14189087629640765, + "flos": 21580913018880.0, + "grad_norm": 4.3025988163657045, + "language_loss": 0.83552408, + "learning_rate": 3.870101529014526e-06, + "loss": 0.85696149, + "num_input_tokens_seen": 51194270, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 0.37866211, + "step": 2360, + "time_per_iteration": 2.6694557666778564 + }, + { + "auxiliary_loss_clip": 0.01780456, + "auxiliary_loss_mlp": 0.00393872, + "balance_loss_clip": 1.4302392, + "balance_loss_mlp": 0.35167181, + "epoch": 0.1419509995490756, + "flos": 20008564093440.0, + "grad_norm": 18.53570245932095, + "language_loss": 0.88981366, + "learning_rate": 3.869963423999178e-06, + "loss": 0.9115569, + "num_input_tokens_seen": 51211850, + "router_z_loss_clip": 3.50585938, + "router_z_loss_mlp": 0.42211914, + "step": 2361, + "time_per_iteration": 2.6207611560821533 + }, + { + "auxiliary_loss_clip": 0.01780153, + "auxiliary_loss_mlp": 0.00398142, + "balance_loss_clip": 1.43254113, + "balance_loss_mlp": 0.3578974, + "epoch": 0.14201112280174358, + "flos": 31941464112000.0, + "grad_norm": 9.995260919987503, + "language_loss": 0.81404436, + "learning_rate": 3.86982524807463e-06, + "loss": 0.83582735, + "num_input_tokens_seen": 51233545, + "router_z_loss_clip": 3.4765625, + "router_z_loss_mlp": 0.40234375, + "step": 2362, + "time_per_iteration": 2.6999425888061523 + }, + { + "auxiliary_loss_clip": 0.01759006, + "auxiliary_loss_mlp": 0.0039743, + "balance_loss_clip": 1.41712737, + "balance_loss_mlp": 0.3564699, + "epoch": 0.14207124605441154, + "flos": 41464147582080.0, + "grad_norm": 25.58773738037776, + "language_loss": 0.79121161, + "learning_rate": 3.869687001246122e-06, + "loss": 0.81277591, + "num_input_tokens_seen": 51257615, + "router_z_loss_clip": 3.41796875, + "router_z_loss_mlp": 0.40966797, + "step": 2363, + "time_per_iteration": 2.7948222160339355 + }, + { + "auxiliary_loss_clip": 0.0177647, + "auxiliary_loss_mlp": 0.00367624, + "balance_loss_clip": 1.43155575, + "balance_loss_mlp": 0.32728389, + "epoch": 0.1421313693070795, + "flos": 31905086613120.0, + "grad_norm": 23.199158576795803, + "language_loss": 0.78158778, + "learning_rate": 3.8695486835188946e-06, + "loss": 0.8030287, + "num_input_tokens_seen": 51279645, + "router_z_loss_clip": 3.45117188, + "router_z_loss_mlp": 0.40307617, + "step": 2364, + "time_per_iteration": 2.7205965518951416 + }, + { + "auxiliary_loss_clip": 0.01757075, + "auxiliary_loss_mlp": 0.00378757, + "balance_loss_clip": 1.41780233, + "balance_loss_mlp": 0.34096786, + "epoch": 0.14219149255974747, + "flos": 26871165031680.0, + "grad_norm": 329.28910810495034, + "language_loss": 0.95872331, + "learning_rate": 3.869410294898195e-06, + "loss": 0.98008168, + "num_input_tokens_seen": 51299775, + "router_z_loss_clip": 3.39257812, + "router_z_loss_mlp": 0.37817383, + "step": 2365, + "time_per_iteration": 2.690016269683838 + }, + { + "auxiliary_loss_clip": 0.01752763, + "auxiliary_loss_mlp": 0.00382856, + "balance_loss_clip": 1.40990853, + "balance_loss_mlp": 0.34413719, + "epoch": 0.14225161581241544, + "flos": 27454426076160.0, + "grad_norm": 8.404680241420886, + "language_loss": 0.73477799, + "learning_rate": 3.869271835389268e-06, + "loss": 0.75613415, + "num_input_tokens_seen": 51319430, + "router_z_loss_clip": 3.42382812, + "router_z_loss_mlp": 0.38696289, + "step": 2366, + "time_per_iteration": 2.6327381134033203 + }, + { + "auxiliary_loss_clip": 0.01764242, + "auxiliary_loss_mlp": 0.00374986, + "balance_loss_clip": 1.42452884, + "balance_loss_mlp": 0.33590925, + "epoch": 0.14231173906508343, + "flos": 10561436881920.0, + "grad_norm": 6.434289115218777, + "language_loss": 0.87306643, + "learning_rate": 3.8691333049973665e-06, + "loss": 0.89445865, + "num_input_tokens_seen": 51336045, + "router_z_loss_clip": 3.3984375, + "router_z_loss_mlp": 0.390625, + "step": 2367, + "time_per_iteration": 2.6381852626800537 + }, + { + "auxiliary_loss_clip": 0.01767899, + "auxiliary_loss_mlp": 0.00348111, + "balance_loss_clip": 1.42072153, + "balance_loss_mlp": 0.30903459, + "epoch": 0.1423718623177514, + "flos": 28360882719360.0, + "grad_norm": 25.832668353102697, + "language_loss": 0.88672334, + "learning_rate": 3.868994703727742e-06, + "loss": 0.90788341, + "num_input_tokens_seen": 51357030, + "router_z_loss_clip": 3.46875, + "router_z_loss_mlp": 0.390625, + "step": 2368, + "time_per_iteration": 2.6883957386016846 + }, + { + "auxiliary_loss_clip": 0.01774214, + "auxiliary_loss_mlp": 0.00378691, + "balance_loss_clip": 1.42986274, + "balance_loss_mlp": 0.3401868, + "epoch": 0.14243198557041936, + "flos": 19354235990400.0, + "grad_norm": 4.616907820275121, + "language_loss": 0.92658049, + "learning_rate": 3.868856031585652e-06, + "loss": 0.94810957, + "num_input_tokens_seen": 51374890, + "router_z_loss_clip": 3.4453125, + "router_z_loss_mlp": 0.38476562, + "step": 2369, + "time_per_iteration": 2.7533013820648193 + }, + { + "auxiliary_loss_clip": 0.01756007, + "auxiliary_loss_mlp": 0.00365948, + "balance_loss_clip": 1.41179323, + "balance_loss_mlp": 0.32794419, + "epoch": 0.14249210882308733, + "flos": 28806857982720.0, + "grad_norm": 2.9828542658169948, + "language_loss": 0.81392866, + "learning_rate": 3.868717288576354e-06, + "loss": 0.83514822, + "num_input_tokens_seen": 51398100, + "router_z_loss_clip": 3.44140625, + "router_z_loss_mlp": 0.37988281, + "step": 2370, + "time_per_iteration": 2.676750421524048 + }, + { + "auxiliary_loss_clip": 0.01751973, + "auxiliary_loss_mlp": 0.00366553, + "balance_loss_clip": 1.41430831, + "balance_loss_mlp": 0.32871571, + "epoch": 0.1425522320757553, + "flos": 21835016807040.0, + "grad_norm": 14.589047947275677, + "language_loss": 0.88186145, + "learning_rate": 3.868578474705109e-06, + "loss": 0.90304673, + "num_input_tokens_seen": 51418745, + "router_z_loss_clip": 3.37695312, + "router_z_loss_mlp": 0.37817383, + "step": 2371, + "time_per_iteration": 2.7274887561798096 + }, + { + "auxiliary_loss_clip": 0.01779486, + "auxiliary_loss_mlp": 0.00374675, + "balance_loss_clip": 1.43787217, + "balance_loss_mlp": 0.3339054, + "epoch": 0.14261235532842326, + "flos": 17311457617920.0, + "grad_norm": 5.262649518158599, + "language_loss": 0.88381112, + "learning_rate": 3.868439589977181e-06, + "loss": 0.90535271, + "num_input_tokens_seen": 51437455, + "router_z_loss_clip": 3.41796875, + "router_z_loss_mlp": 0.40771484, + "step": 2372, + "time_per_iteration": 2.583505392074585 + }, + { + "auxiliary_loss_clip": 0.01760009, + "auxiliary_loss_mlp": 0.00382406, + "balance_loss_clip": 1.41863215, + "balance_loss_mlp": 0.34325802, + "epoch": 0.14267247858109125, + "flos": 18806741913600.0, + "grad_norm": 48.06618907907713, + "language_loss": 0.90282965, + "learning_rate": 3.868300634397836e-06, + "loss": 0.92425376, + "num_input_tokens_seen": 51455710, + "router_z_loss_clip": 3.41601562, + "router_z_loss_mlp": 0.39135742, + "step": 2373, + "time_per_iteration": 2.623603105545044 + }, + { + "auxiliary_loss_clip": 0.01784183, + "auxiliary_loss_mlp": 0.00367861, + "balance_loss_clip": 1.43486762, + "balance_loss_mlp": 0.33069167, + "epoch": 0.14273260183375922, + "flos": 11358904682880.0, + "grad_norm": 4.223651037681332, + "language_loss": 0.92535776, + "learning_rate": 3.8681616079723445e-06, + "loss": 0.94687819, + "num_input_tokens_seen": 51471270, + "router_z_loss_clip": 3.49414062, + "router_z_loss_mlp": 0.37158203, + "step": 2374, + "time_per_iteration": 2.6366302967071533 + }, + { + "auxiliary_loss_clip": 0.01778329, + "auxiliary_loss_mlp": 0.00389582, + "balance_loss_clip": 1.42927122, + "balance_loss_mlp": 0.3485744, + "epoch": 0.14279272508642718, + "flos": 27567688636800.0, + "grad_norm": 3.4394784391948057, + "language_loss": 0.84618247, + "learning_rate": 3.868022510705977e-06, + "loss": 0.86786157, + "num_input_tokens_seen": 51492705, + "router_z_loss_clip": 3.4921875, + "router_z_loss_mlp": 0.41015625, + "step": 2375, + "time_per_iteration": 2.6902639865875244 + }, + { + "auxiliary_loss_clip": 0.0178427, + "auxiliary_loss_mlp": 0.00343937, + "balance_loss_clip": 1.43798435, + "balance_loss_mlp": 0.30648115, + "epoch": 0.14285284833909515, + "flos": 16252559654400.0, + "grad_norm": 11.33482313462612, + "language_loss": 0.84746844, + "learning_rate": 3.867883342604009e-06, + "loss": 0.86875045, + "num_input_tokens_seen": 51510780, + "router_z_loss_clip": 3.4609375, + "router_z_loss_mlp": 0.37451172, + "step": 2376, + "time_per_iteration": 2.568498373031616 + }, + { + "auxiliary_loss_clip": 0.01783164, + "auxiliary_loss_mlp": 0.00363178, + "balance_loss_clip": 1.43492997, + "balance_loss_mlp": 0.32600805, + "epoch": 0.1429129715917631, + "flos": 19755609540480.0, + "grad_norm": 7.0043441061109055, + "language_loss": 1.00391674, + "learning_rate": 3.867744103671717e-06, + "loss": 1.02538037, + "num_input_tokens_seen": 51531400, + "router_z_loss_clip": 3.48242188, + "router_z_loss_mlp": 0.37182617, + "step": 2377, + "time_per_iteration": 2.650938034057617 + }, + { + "auxiliary_loss_clip": 0.01773079, + "auxiliary_loss_mlp": 0.00357865, + "balance_loss_clip": 1.42280698, + "balance_loss_mlp": 0.3164992, + "epoch": 0.14297309484443108, + "flos": 21137092571520.0, + "grad_norm": 282.80613083252666, + "language_loss": 0.97854519, + "learning_rate": 3.867604793914382e-06, + "loss": 0.99985462, + "num_input_tokens_seen": 51548215, + "router_z_loss_clip": 3.50390625, + "router_z_loss_mlp": 0.41381836, + "step": 2378, + "time_per_iteration": 2.6553452014923096 + }, + { + "auxiliary_loss_clip": 0.0177129, + "auxiliary_loss_mlp": 0.00338589, + "balance_loss_clip": 1.41915929, + "balance_loss_mlp": 0.30039483, + "epoch": 0.14303321809709904, + "flos": 23586667447680.0, + "grad_norm": 3.1218105197579544, + "language_loss": 0.80126053, + "learning_rate": 3.8674654133372864e-06, + "loss": 0.82235932, + "num_input_tokens_seen": 51566820, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 0.38208008, + "step": 2379, + "time_per_iteration": 2.663740634918213 + }, + { + "auxiliary_loss_clip": 0.01792043, + "auxiliary_loss_mlp": 0.00353282, + "balance_loss_clip": 1.43882346, + "balance_loss_mlp": 0.31380028, + "epoch": 0.14309334134976703, + "flos": 15888281875200.0, + "grad_norm": 335.0049147347527, + "language_loss": 0.84888309, + "learning_rate": 3.867325961945714e-06, + "loss": 0.87033629, + "num_input_tokens_seen": 51585075, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 0.39453125, + "step": 2380, + "time_per_iteration": 2.6369264125823975 + }, + { + "auxiliary_loss_clip": 0.01833814, + "auxiliary_loss_mlp": 0.00364252, + "balance_loss_clip": 1.46819806, + "balance_loss_mlp": 0.32333899, + "epoch": 0.143153464602435, + "flos": 16325601960960.0, + "grad_norm": 10.921680456110401, + "language_loss": 0.95441324, + "learning_rate": 3.867186439744955e-06, + "loss": 0.97639394, + "num_input_tokens_seen": 51603185, + "router_z_loss_clip": 3.65429688, + "router_z_loss_mlp": 0.40917969, + "step": 2381, + "time_per_iteration": 2.6100270748138428 + }, + { + "auxiliary_loss_clip": 0.01840909, + "auxiliary_loss_mlp": 0.00343158, + "balance_loss_clip": 1.47377086, + "balance_loss_mlp": 0.30236414, + "epoch": 0.14321358785510296, + "flos": 17092079303040.0, + "grad_norm": 213.88943368808603, + "language_loss": 0.81535631, + "learning_rate": 3.867046846740299e-06, + "loss": 0.83719695, + "num_input_tokens_seen": 51620880, + "router_z_loss_clip": 3.66796875, + "router_z_loss_mlp": 0.40771484, + "step": 2382, + "time_per_iteration": 4.0442914962768555 + }, + { + "auxiliary_loss_clip": 0.01842518, + "auxiliary_loss_mlp": 0.00365934, + "balance_loss_clip": 1.47344756, + "balance_loss_mlp": 0.32611817, + "epoch": 0.14327371110777093, + "flos": 26322916769280.0, + "grad_norm": 39.32595587675109, + "language_loss": 0.82171184, + "learning_rate": 3.866907182937039e-06, + "loss": 0.84379637, + "num_input_tokens_seen": 51640170, + "router_z_loss_clip": 3.69140625, + "router_z_loss_mlp": 0.39794922, + "step": 2383, + "time_per_iteration": 2.6947667598724365 + }, + { + "auxiliary_loss_clip": 0.01868796, + "auxiliary_loss_mlp": 0.00386162, + "balance_loss_clip": 1.49248052, + "balance_loss_mlp": 0.34110087, + "epoch": 0.1433338343604389, + "flos": 18076462502400.0, + "grad_norm": 32.11274620152352, + "language_loss": 0.95956588, + "learning_rate": 3.866767448340471e-06, + "loss": 0.98211551, + "num_input_tokens_seen": 51656580, + "router_z_loss_clip": 3.7578125, + "router_z_loss_mlp": 0.45043945, + "step": 2384, + "time_per_iteration": 2.6118345260620117 + }, + { + "auxiliary_loss_clip": 0.01862857, + "auxiliary_loss_mlp": 0.00408285, + "balance_loss_clip": 1.486871, + "balance_loss_mlp": 0.36124513, + "epoch": 0.14339395761310686, + "flos": 15522783033600.0, + "grad_norm": 23.268996123529075, + "language_loss": 0.87953472, + "learning_rate": 3.866627642955895e-06, + "loss": 0.90224612, + "num_input_tokens_seen": 51674645, + "router_z_loss_clip": 3.7578125, + "router_z_loss_mlp": 0.47045898, + "step": 2385, + "time_per_iteration": 4.064507722854614 + }, + { + "auxiliary_loss_clip": 0.01905547, + "auxiliary_loss_mlp": 0.00366629, + "balance_loss_clip": 1.51066089, + "balance_loss_mlp": 0.32435691, + "epoch": 0.14345408086577485, + "flos": 28548767784960.0, + "grad_norm": 3.0406915829310965, + "language_loss": 0.81431013, + "learning_rate": 3.866487766788612e-06, + "loss": 0.8370319, + "num_input_tokens_seen": 51695770, + "router_z_loss_clip": 3.9453125, + "router_z_loss_mlp": 0.4230957, + "step": 2386, + "time_per_iteration": 4.154547214508057 + }, + { + "auxiliary_loss_clip": 0.01904887, + "auxiliary_loss_mlp": 0.00373229, + "balance_loss_clip": 1.52083826, + "balance_loss_mlp": 0.32888326, + "epoch": 0.14351420411844282, + "flos": 20230061310720.0, + "grad_norm": 16.363844209280597, + "language_loss": 0.84211659, + "learning_rate": 3.866347819843925e-06, + "loss": 0.86489773, + "num_input_tokens_seen": 51714165, + "router_z_loss_clip": 3.84179688, + "router_z_loss_mlp": 0.44360352, + "step": 2387, + "time_per_iteration": 2.609616994857788 + }, + { + "auxiliary_loss_clip": 0.01930062, + "auxiliary_loss_mlp": 0.00357174, + "balance_loss_clip": 1.53863335, + "balance_loss_mlp": 0.31452096, + "epoch": 0.14357432737111078, + "flos": 19865029345920.0, + "grad_norm": 39.692086733002924, + "language_loss": 0.8902775, + "learning_rate": 3.866207802127143e-06, + "loss": 0.91314983, + "num_input_tokens_seen": 51734440, + "router_z_loss_clip": 3.90820312, + "router_z_loss_mlp": 0.42651367, + "step": 2388, + "time_per_iteration": 4.02367901802063 + }, + { + "auxiliary_loss_clip": 0.01956346, + "auxiliary_loss_mlp": 0.00405264, + "balance_loss_clip": 1.55022836, + "balance_loss_mlp": 0.36230069, + "epoch": 0.14363445062377875, + "flos": 28256814040320.0, + "grad_norm": 16.79098415104295, + "language_loss": 0.88824296, + "learning_rate": 3.866067713643573e-06, + "loss": 0.9118591, + "num_input_tokens_seen": 51753730, + "router_z_loss_clip": 4.0625, + "router_z_loss_mlp": 0.42993164, + "step": 2389, + "time_per_iteration": 2.702955961227417 + }, + { + "auxiliary_loss_clip": 0.0196807, + "auxiliary_loss_mlp": 0.00410461, + "balance_loss_clip": 1.55828047, + "balance_loss_mlp": 0.36370677, + "epoch": 0.1436945738764467, + "flos": 18186672407040.0, + "grad_norm": 34.88833990220932, + "language_loss": 0.91085166, + "learning_rate": 3.8659275543985285e-06, + "loss": 0.93463695, + "num_input_tokens_seen": 51771195, + "router_z_loss_clip": 4.09570312, + "router_z_loss_mlp": 0.4675293, + "step": 2390, + "time_per_iteration": 2.5904757976531982 + }, + { + "auxiliary_loss_clip": 0.01981039, + "auxiliary_loss_mlp": 0.00378743, + "balance_loss_clip": 1.55858874, + "balance_loss_mlp": 0.33337238, + "epoch": 0.14375469712911468, + "flos": 27307910499840.0, + "grad_norm": 40.63468257236911, + "language_loss": 0.80678785, + "learning_rate": 3.865787324397324e-06, + "loss": 0.83038568, + "num_input_tokens_seen": 51792290, + "router_z_loss_clip": 4.23046875, + "router_z_loss_mlp": 0.45410156, + "step": 2391, + "time_per_iteration": 2.685671806335449 + }, + { + "auxiliary_loss_clip": 0.01762629, + "auxiliary_loss_mlp": 0.00159422, + "balance_loss_clip": 1.50587845, + "balance_loss_mlp": 0.14435354, + "epoch": 0.14381482038178264, + "flos": 56891445287040.0, + "grad_norm": 0.8952846955429091, + "language_loss": 0.61896211, + "learning_rate": 3.865647023645277e-06, + "loss": 0.63818264, + "num_input_tokens_seen": 51843675, + "router_z_loss_clip": 2.5625, + "router_z_loss_mlp": 0.15039062, + "step": 2392, + "time_per_iteration": 2.9671523571014404 + }, + { + "auxiliary_loss_clip": 0.01989099, + "auxiliary_loss_mlp": 0.00397518, + "balance_loss_clip": 1.56482434, + "balance_loss_mlp": 0.35183722, + "epoch": 0.14387494363445064, + "flos": 14282177143680.0, + "grad_norm": 11.840894828039522, + "language_loss": 0.85670269, + "learning_rate": 3.865506652147709e-06, + "loss": 0.88056886, + "num_input_tokens_seen": 51860285, + "router_z_loss_clip": 4.25, + "router_z_loss_mlp": 0.45727539, + "step": 2393, + "time_per_iteration": 2.6457254886627197 + }, + { + "auxiliary_loss_clip": 0.01969779, + "auxiliary_loss_mlp": 0.00408238, + "balance_loss_clip": 1.55305719, + "balance_loss_mlp": 0.36613351, + "epoch": 0.1439350668871186, + "flos": 26761493831040.0, + "grad_norm": 26.748057841034655, + "language_loss": 0.82969052, + "learning_rate": 3.865366209909941e-06, + "loss": 0.85347062, + "num_input_tokens_seen": 51880105, + "router_z_loss_clip": 4.16992188, + "router_z_loss_mlp": 0.42089844, + "step": 2394, + "time_per_iteration": 2.743086099624634 + }, + { + "auxiliary_loss_clip": 0.01993561, + "auxiliary_loss_mlp": 0.00395187, + "balance_loss_clip": 1.56672835, + "balance_loss_mlp": 0.35038793, + "epoch": 0.14399519013978657, + "flos": 40700040537600.0, + "grad_norm": 4.41345706136014, + "language_loss": 0.91500854, + "learning_rate": 3.8652256969372994e-06, + "loss": 0.93889606, + "num_input_tokens_seen": 51905175, + "router_z_loss_clip": 4.26757812, + "router_z_loss_mlp": 0.44824219, + "step": 2395, + "time_per_iteration": 2.888425827026367 + }, + { + "auxiliary_loss_clip": 0.01996581, + "auxiliary_loss_mlp": 0.00392601, + "balance_loss_clip": 1.56460333, + "balance_loss_mlp": 0.34708679, + "epoch": 0.14405531339245453, + "flos": 20557530627840.0, + "grad_norm": 2.822293253579182, + "language_loss": 0.86897254, + "learning_rate": 3.865085113235113e-06, + "loss": 0.89286435, + "num_input_tokens_seen": 51924490, + "router_z_loss_clip": 4.32421875, + "router_z_loss_mlp": 0.45556641, + "step": 2396, + "time_per_iteration": 2.620025873184204 + }, + { + "auxiliary_loss_clip": 0.02013514, + "auxiliary_loss_mlp": 0.00379534, + "balance_loss_clip": 1.5828526, + "balance_loss_mlp": 0.33621341, + "epoch": 0.1441154366451225, + "flos": 19572931946880.0, + "grad_norm": 3.9101165647065943, + "language_loss": 0.90137064, + "learning_rate": 3.864944458808712e-06, + "loss": 0.92530107, + "num_input_tokens_seen": 51940490, + "router_z_loss_clip": 4.30859375, + "router_z_loss_mlp": 0.43310547, + "step": 2397, + "time_per_iteration": 2.6658072471618652 + }, + { + "auxiliary_loss_clip": 0.0202295, + "auxiliary_loss_mlp": 0.00378338, + "balance_loss_clip": 1.58125758, + "balance_loss_mlp": 0.3357566, + "epoch": 0.14417555989779046, + "flos": 18515721922560.0, + "grad_norm": 1.72657615346728, + "language_loss": 0.85516918, + "learning_rate": 3.86480373366343e-06, + "loss": 0.8791821, + "num_input_tokens_seen": 51957910, + "router_z_loss_clip": 4.41796875, + "router_z_loss_mlp": 0.42602539, + "step": 2398, + "time_per_iteration": 2.6088039875030518 + }, + { + "auxiliary_loss_clip": 0.01985428, + "auxiliary_loss_mlp": 0.0037477, + "balance_loss_clip": 1.55962074, + "balance_loss_mlp": 0.3313542, + "epoch": 0.14423568315045843, + "flos": 26031681296640.0, + "grad_norm": 5.397865341436297, + "language_loss": 0.70474309, + "learning_rate": 3.864662937804603e-06, + "loss": 0.72834504, + "num_input_tokens_seen": 51978010, + "router_z_loss_clip": 4.2578125, + "router_z_loss_mlp": 0.43432617, + "step": 2399, + "time_per_iteration": 2.6856799125671387 + }, + { + "auxiliary_loss_clip": 0.02000048, + "auxiliary_loss_mlp": 0.0038467, + "balance_loss_clip": 1.57548606, + "balance_loss_mlp": 0.34242195, + "epoch": 0.14429580640312642, + "flos": 21288743792640.0, + "grad_norm": 7.140133738073261, + "language_loss": 0.87982303, + "learning_rate": 3.864522071237571e-06, + "loss": 0.90367019, + "num_input_tokens_seen": 51998515, + "router_z_loss_clip": 4.24023438, + "router_z_loss_mlp": 0.42236328, + "step": 2400, + "time_per_iteration": 2.643049955368042 + }, + { + "auxiliary_loss_clip": 0.0196364, + "auxiliary_loss_mlp": 0.00343506, + "balance_loss_clip": 1.54349065, + "balance_loss_mlp": 0.30094877, + "epoch": 0.14435592965579438, + "flos": 25627865621760.0, + "grad_norm": 8.092756271382756, + "language_loss": 0.80937696, + "learning_rate": 3.864381133967676e-06, + "loss": 0.83244836, + "num_input_tokens_seen": 52019270, + "router_z_loss_clip": 4.20117188, + "router_z_loss_mlp": 0.42553711, + "step": 2401, + "time_per_iteration": 2.6657564640045166 + }, + { + "auxiliary_loss_clip": 0.01953315, + "auxiliary_loss_mlp": 0.00328671, + "balance_loss_clip": 1.54539299, + "balance_loss_mlp": 0.28701976, + "epoch": 0.14441605290846235, + "flos": 22965053656320.0, + "grad_norm": 21.731361269867882, + "language_loss": 0.86387753, + "learning_rate": 3.86424012600026e-06, + "loss": 0.88669741, + "num_input_tokens_seen": 52039315, + "router_z_loss_clip": 4.08007812, + "router_z_loss_mlp": 0.41699219, + "step": 2402, + "time_per_iteration": 2.780905246734619 + }, + { + "auxiliary_loss_clip": 0.01933597, + "auxiliary_loss_mlp": 0.00362055, + "balance_loss_clip": 1.53447413, + "balance_loss_mlp": 0.31935406, + "epoch": 0.14447617616113032, + "flos": 17347655548800.0, + "grad_norm": 40.011975048933046, + "language_loss": 0.89684606, + "learning_rate": 3.864099047340673e-06, + "loss": 0.91980261, + "num_input_tokens_seen": 52056555, + "router_z_loss_clip": 3.9921875, + "router_z_loss_mlp": 0.42700195, + "step": 2403, + "time_per_iteration": 2.6401710510253906 + }, + { + "auxiliary_loss_clip": 0.01945862, + "auxiliary_loss_mlp": 0.00373547, + "balance_loss_clip": 1.5433166, + "balance_loss_mlp": 0.33020258, + "epoch": 0.14453629941379828, + "flos": 24060185464320.0, + "grad_norm": 3.275541944880875, + "language_loss": 0.75479782, + "learning_rate": 3.863957897994262e-06, + "loss": 0.77799189, + "num_input_tokens_seen": 52075800, + "router_z_loss_clip": 4.02539062, + "router_z_loss_mlp": 0.43334961, + "step": 2404, + "time_per_iteration": 2.665656566619873 + }, + { + "auxiliary_loss_clip": 0.01964149, + "auxiliary_loss_mlp": 0.00371528, + "balance_loss_clip": 1.54704273, + "balance_loss_mlp": 0.32804018, + "epoch": 0.14459642266646625, + "flos": 14429554646400.0, + "grad_norm": 10.821297317715707, + "language_loss": 0.80543625, + "learning_rate": 3.863816677966381e-06, + "loss": 0.82879305, + "num_input_tokens_seen": 52092585, + "router_z_loss_clip": 4.1640625, + "router_z_loss_mlp": 0.43481445, + "step": 2405, + "time_per_iteration": 2.5817453861236572 + }, + { + "auxiliary_loss_clip": 0.0191782, + "auxiliary_loss_mlp": 0.00345224, + "balance_loss_clip": 1.52187777, + "balance_loss_mlp": 0.3051694, + "epoch": 0.14465654591913424, + "flos": 9867032179200.0, + "grad_norm": 11.358528036896125, + "language_loss": 0.7975536, + "learning_rate": 3.863675387262386e-06, + "loss": 0.82018399, + "num_input_tokens_seen": 52108990, + "router_z_loss_clip": 3.9609375, + "router_z_loss_mlp": 0.40014648, + "step": 2406, + "time_per_iteration": 2.609281301498413 + }, + { + "auxiliary_loss_clip": 0.01913331, + "auxiliary_loss_mlp": 0.00412383, + "balance_loss_clip": 1.51612663, + "balance_loss_mlp": 0.36851397, + "epoch": 0.1447166691718022, + "flos": 24972926987520.0, + "grad_norm": 8.564345447161083, + "language_loss": 0.8308816, + "learning_rate": 3.8635340258876325e-06, + "loss": 0.85413879, + "num_input_tokens_seen": 52125385, + "router_z_loss_clip": 3.97265625, + "router_z_loss_mlp": 0.4387207, + "step": 2407, + "time_per_iteration": 2.680971384048462 + }, + { + "auxiliary_loss_clip": 0.01873664, + "auxiliary_loss_mlp": 0.00346411, + "balance_loss_clip": 1.4928422, + "balance_loss_mlp": 0.30719098, + "epoch": 0.14477679242447017, + "flos": 21908023200000.0, + "grad_norm": 3.6568476621556907, + "language_loss": 0.83740807, + "learning_rate": 3.8633925938474826e-06, + "loss": 0.85960877, + "num_input_tokens_seen": 52144985, + "router_z_loss_clip": 3.80664062, + "router_z_loss_mlp": 0.39233398, + "step": 2408, + "time_per_iteration": 2.698587656021118 + }, + { + "auxiliary_loss_clip": 0.01856084, + "auxiliary_loss_mlp": 0.00409487, + "balance_loss_clip": 1.47945261, + "balance_loss_mlp": 0.36673906, + "epoch": 0.14483691567713813, + "flos": 20740746925440.0, + "grad_norm": 61.966759737185065, + "language_loss": 0.8914237, + "learning_rate": 3.863251091147299e-06, + "loss": 0.91407943, + "num_input_tokens_seen": 52163885, + "router_z_loss_clip": 3.765625, + "router_z_loss_mlp": 0.42724609, + "step": 2409, + "time_per_iteration": 2.6334686279296875 + }, + { + "auxiliary_loss_clip": 0.01851472, + "auxiliary_loss_mlp": 0.00375725, + "balance_loss_clip": 1.47214603, + "balance_loss_mlp": 0.33502728, + "epoch": 0.1448970389298061, + "flos": 35407705536000.0, + "grad_norm": 3.021720198671276, + "language_loss": 0.83693218, + "learning_rate": 3.863109517792446e-06, + "loss": 0.85920417, + "num_input_tokens_seen": 52184325, + "router_z_loss_clip": 3.79101562, + "router_z_loss_mlp": 0.40698242, + "step": 2410, + "time_per_iteration": 2.7589001655578613 + }, + { + "auxiliary_loss_clip": 0.01827099, + "auxiliary_loss_mlp": 0.00370853, + "balance_loss_clip": 1.46402705, + "balance_loss_mlp": 0.33242032, + "epoch": 0.14495716218247406, + "flos": 15414368808960.0, + "grad_norm": 4.5319304439509205, + "language_loss": 0.87651128, + "learning_rate": 3.8629678737882945e-06, + "loss": 0.89849079, + "num_input_tokens_seen": 52202740, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 0.38427734, + "step": 2411, + "time_per_iteration": 2.58463978767395 + }, + { + "auxiliary_loss_clip": 0.01803531, + "auxiliary_loss_mlp": 0.0043411, + "balance_loss_clip": 1.44635355, + "balance_loss_mlp": 0.39398423, + "epoch": 0.14501728543514203, + "flos": 33693222493440.0, + "grad_norm": 13.367023216544364, + "language_loss": 0.78341079, + "learning_rate": 3.862826159140214e-06, + "loss": 0.80578721, + "num_input_tokens_seen": 52223100, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 0.40161133, + "step": 2412, + "time_per_iteration": 2.735686779022217 + }, + { + "auxiliary_loss_clip": 0.01787869, + "auxiliary_loss_mlp": 0.00405399, + "balance_loss_clip": 1.44090044, + "balance_loss_mlp": 0.36858737, + "epoch": 0.14507740868781002, + "flos": 15596112648960.0, + "grad_norm": 3.257091823027264, + "language_loss": 0.82785404, + "learning_rate": 3.862684373853579e-06, + "loss": 0.84978676, + "num_input_tokens_seen": 52239690, + "router_z_loss_clip": 3.47460938, + "router_z_loss_mlp": 0.36816406, + "step": 2413, + "time_per_iteration": 2.574233055114746 + }, + { + "auxiliary_loss_clip": 0.01526728, + "auxiliary_loss_mlp": 0.00146514, + "balance_loss_clip": 1.3108052, + "balance_loss_mlp": 0.13435462, + "epoch": 0.145137531940478, + "flos": 66675343438080.0, + "grad_norm": 1.0579816518967537, + "language_loss": 0.58784711, + "learning_rate": 3.8625425179337656e-06, + "loss": 0.60457945, + "num_input_tokens_seen": 52296705, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.12158203, + "step": 2414, + "time_per_iteration": 3.015807867050171 + }, + { + "auxiliary_loss_clip": 0.01524396, + "auxiliary_loss_mlp": 0.00167644, + "balance_loss_clip": 1.31194592, + "balance_loss_mlp": 0.15724865, + "epoch": 0.14519765519314595, + "flos": 67521578929920.0, + "grad_norm": 0.8388172853507377, + "language_loss": 0.61602587, + "learning_rate": 3.862400591386154e-06, + "loss": 0.63294625, + "num_input_tokens_seen": 52361830, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.10400391, + "step": 2415, + "time_per_iteration": 3.173332929611206 + }, + { + "auxiliary_loss_clip": 0.01687723, + "auxiliary_loss_mlp": 0.00404728, + "balance_loss_clip": 1.36368835, + "balance_loss_mlp": 0.36622381, + "epoch": 0.14525777844581392, + "flos": 17198913329280.0, + "grad_norm": 9.306326006444426, + "language_loss": 0.80068541, + "learning_rate": 3.8622585942161245e-06, + "loss": 0.82160991, + "num_input_tokens_seen": 52379420, + "router_z_loss_clip": 3.2421875, + "router_z_loss_mlp": 0.38549805, + "step": 2416, + "time_per_iteration": 2.6195569038391113 + }, + { + "auxiliary_loss_clip": 0.01482566, + "auxiliary_loss_mlp": 0.00209679, + "balance_loss_clip": 1.2708292, + "balance_loss_mlp": 0.19699517, + "epoch": 0.14531790169848188, + "flos": 65404609015680.0, + "grad_norm": 0.7281077376760037, + "language_loss": 0.60357046, + "learning_rate": 3.8621165264290635e-06, + "loss": 0.62049294, + "num_input_tokens_seen": 52446290, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.12695312, + "step": 2417, + "time_per_iteration": 3.153162717819214 + }, + { + "auxiliary_loss_clip": 0.01647566, + "auxiliary_loss_mlp": 0.00464211, + "balance_loss_clip": 1.32781172, + "balance_loss_mlp": 0.42432371, + "epoch": 0.14537802495114985, + "flos": 32562467372160.0, + "grad_norm": 17.33098192830113, + "language_loss": 0.85923439, + "learning_rate": 3.861974388030356e-06, + "loss": 0.88035214, + "num_input_tokens_seen": 52467295, + "router_z_loss_clip": 3.19921875, + "router_z_loss_mlp": 0.39916992, + "step": 2418, + "time_per_iteration": 2.748457670211792 + }, + { + "auxiliary_loss_clip": 0.01635654, + "auxiliary_loss_mlp": 0.00456069, + "balance_loss_clip": 1.32627416, + "balance_loss_mlp": 0.42189133, + "epoch": 0.1454381482038178, + "flos": 20226685432320.0, + "grad_norm": 6.879132254755813, + "language_loss": 0.77730578, + "learning_rate": 3.861832179025394e-06, + "loss": 0.79822302, + "num_input_tokens_seen": 52487295, + "router_z_loss_clip": 3.09375, + "router_z_loss_mlp": 0.3416748, + "step": 2419, + "time_per_iteration": 2.740342140197754 + }, + { + "auxiliary_loss_clip": 0.01639496, + "auxiliary_loss_mlp": 0.00444252, + "balance_loss_clip": 1.32498145, + "balance_loss_mlp": 0.40949056, + "epoch": 0.1454982714564858, + "flos": 22893124671360.0, + "grad_norm": 5.665089792536025, + "language_loss": 0.98082501, + "learning_rate": 3.861689899419569e-06, + "loss": 1.00166249, + "num_input_tokens_seen": 52504220, + "router_z_loss_clip": 3.14648438, + "router_z_loss_mlp": 0.34765625, + "step": 2420, + "time_per_iteration": 2.731821298599243 + }, + { + "auxiliary_loss_clip": 0.01627975, + "auxiliary_loss_mlp": 0.00452102, + "balance_loss_clip": 1.32084846, + "balance_loss_mlp": 0.41712597, + "epoch": 0.14555839470915377, + "flos": 20229845829120.0, + "grad_norm": 6.659441200517906, + "language_loss": 0.88514543, + "learning_rate": 3.861547549218276e-06, + "loss": 0.90594625, + "num_input_tokens_seen": 52521900, + "router_z_loss_clip": 3.07226562, + "router_z_loss_mlp": 0.34960938, + "step": 2421, + "time_per_iteration": 2.6730456352233887 + }, + { + "auxiliary_loss_clip": 0.0160957, + "auxiliary_loss_mlp": 0.00461438, + "balance_loss_clip": 1.30253482, + "balance_loss_mlp": 0.4257943, + "epoch": 0.14561851796182174, + "flos": 22236282616320.0, + "grad_norm": 23.475026272286634, + "language_loss": 0.86136413, + "learning_rate": 3.861405128426914e-06, + "loss": 0.88207418, + "num_input_tokens_seen": 52540495, + "router_z_loss_clip": 3.06835938, + "router_z_loss_mlp": 0.35668945, + "step": 2422, + "time_per_iteration": 2.7132580280303955 + }, + { + "auxiliary_loss_clip": 0.0144891, + "auxiliary_loss_mlp": 0.00131355, + "balance_loss_clip": 1.23413551, + "balance_loss_mlp": 0.12243789, + "epoch": 0.1456786412144897, + "flos": 52636786289280.0, + "grad_norm": 0.9157392098813985, + "language_loss": 0.63366085, + "learning_rate": 3.861262637050883e-06, + "loss": 0.64946347, + "num_input_tokens_seen": 52603305, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.08935547, + "step": 2423, + "time_per_iteration": 3.1558289527893066 + }, + { + "auxiliary_loss_clip": 0.01608988, + "auxiliary_loss_mlp": 0.00549941, + "balance_loss_clip": 1.30252528, + "balance_loss_mlp": 0.51379699, + "epoch": 0.14573876446715767, + "flos": 23221671396480.0, + "grad_norm": 256.27471738754525, + "language_loss": 0.87042552, + "learning_rate": 3.861120075095585e-06, + "loss": 0.8920148, + "num_input_tokens_seen": 52623435, + "router_z_loss_clip": 3.06445312, + "router_z_loss_mlp": 0.36132812, + "step": 2424, + "time_per_iteration": 4.153766393661499 + }, + { + "auxiliary_loss_clip": 0.01618282, + "auxiliary_loss_mlp": 0.0053708, + "balance_loss_clip": 1.30931282, + "balance_loss_mlp": 0.49776453, + "epoch": 0.14579888771982563, + "flos": 18114384286080.0, + "grad_norm": 18.658564631669613, + "language_loss": 0.85264325, + "learning_rate": 3.860977442566429e-06, + "loss": 0.87419689, + "num_input_tokens_seen": 52642255, + "router_z_loss_clip": 3.08789062, + "router_z_loss_mlp": 0.39306641, + "step": 2425, + "time_per_iteration": 2.613293170928955 + }, + { + "auxiliary_loss_clip": 0.01617538, + "auxiliary_loss_mlp": 0.00553011, + "balance_loss_clip": 1.30527663, + "balance_loss_mlp": 0.51290882, + "epoch": 0.14585901097249362, + "flos": 23001107932800.0, + "grad_norm": 44.84661269710091, + "language_loss": 0.9051888, + "learning_rate": 3.860834739468821e-06, + "loss": 0.92689431, + "num_input_tokens_seen": 52658700, + "router_z_loss_clip": 3.12304688, + "router_z_loss_mlp": 0.40136719, + "step": 2426, + "time_per_iteration": 2.6549105644226074 + }, + { + "auxiliary_loss_clip": 0.01609023, + "auxiliary_loss_mlp": 0.00502745, + "balance_loss_clip": 1.29771686, + "balance_loss_mlp": 0.46593302, + "epoch": 0.1459191342251616, + "flos": 21908669644800.0, + "grad_norm": 5.006199808961258, + "language_loss": 0.94188672, + "learning_rate": 3.860691965808173e-06, + "loss": 0.96300447, + "num_input_tokens_seen": 52678140, + "router_z_loss_clip": 3.11523438, + "router_z_loss_mlp": 0.36816406, + "step": 2427, + "time_per_iteration": 4.08456563949585 + }, + { + "auxiliary_loss_clip": 0.01603691, + "auxiliary_loss_mlp": 0.00539492, + "balance_loss_clip": 1.29076529, + "balance_loss_mlp": 0.49929434, + "epoch": 0.14597925747782955, + "flos": 14975504438400.0, + "grad_norm": 55.03266341130837, + "language_loss": 0.7804482, + "learning_rate": 3.8605491215899e-06, + "loss": 0.80188, + "num_input_tokens_seen": 52696825, + "router_z_loss_clip": 3.12695312, + "router_z_loss_mlp": 0.40209961, + "step": 2428, + "time_per_iteration": 4.062969207763672 + }, + { + "auxiliary_loss_clip": 0.01606852, + "auxiliary_loss_mlp": 0.00537626, + "balance_loss_clip": 1.29211533, + "balance_loss_mlp": 0.49940798, + "epoch": 0.14603938073049752, + "flos": 21068898600960.0, + "grad_norm": 25.84155161362217, + "language_loss": 0.8835879, + "learning_rate": 3.860406206819417e-06, + "loss": 0.90503263, + "num_input_tokens_seen": 52715125, + "router_z_loss_clip": 3.14257812, + "router_z_loss_mlp": 0.38208008, + "step": 2429, + "time_per_iteration": 2.6535232067108154 + }, + { + "auxiliary_loss_clip": 0.01600901, + "auxiliary_loss_mlp": 0.00510743, + "balance_loss_clip": 1.29073954, + "balance_loss_mlp": 0.47428912, + "epoch": 0.14609950398316549, + "flos": 19864777950720.0, + "grad_norm": 25.13788080417407, + "language_loss": 0.83297211, + "learning_rate": 3.860263221502145e-06, + "loss": 0.85408854, + "num_input_tokens_seen": 52734015, + "router_z_loss_clip": 3.1015625, + "router_z_loss_mlp": 0.36499023, + "step": 2430, + "time_per_iteration": 4.0280537605285645 + }, + { + "auxiliary_loss_clip": 0.01624085, + "auxiliary_loss_mlp": 0.00540022, + "balance_loss_clip": 1.30452919, + "balance_loss_mlp": 0.50289977, + "epoch": 0.14615962723583345, + "flos": 22418852469120.0, + "grad_norm": 5.418015849039973, + "language_loss": 0.91539216, + "learning_rate": 3.860120165643504e-06, + "loss": 0.93703324, + "num_input_tokens_seen": 52753025, + "router_z_loss_clip": 3.1953125, + "router_z_loss_mlp": 0.37158203, + "step": 2431, + "time_per_iteration": 2.6182899475097656 + }, + { + "auxiliary_loss_clip": 0.01626289, + "auxiliary_loss_mlp": 0.0049305, + "balance_loss_clip": 1.29826939, + "balance_loss_mlp": 0.45373505, + "epoch": 0.14621975048850142, + "flos": 22346241125760.0, + "grad_norm": 3.5085942427230656, + "language_loss": 0.86390316, + "learning_rate": 3.859977039248921e-06, + "loss": 0.88509655, + "num_input_tokens_seen": 52773420, + "router_z_loss_clip": 3.27929688, + "router_z_loss_mlp": 0.39306641, + "step": 2432, + "time_per_iteration": 2.648545503616333 + }, + { + "auxiliary_loss_clip": 0.01633579, + "auxiliary_loss_mlp": 0.00528693, + "balance_loss_clip": 1.30801368, + "balance_loss_mlp": 0.48975909, + "epoch": 0.1462798737411694, + "flos": 24389163152640.0, + "grad_norm": 9.95977124781716, + "language_loss": 0.87824857, + "learning_rate": 3.859833842323822e-06, + "loss": 0.89987135, + "num_input_tokens_seen": 52792870, + "router_z_loss_clip": 3.25195312, + "router_z_loss_mlp": 0.38964844, + "step": 2433, + "time_per_iteration": 2.7057135105133057 + }, + { + "auxiliary_loss_clip": 0.01626861, + "auxiliary_loss_mlp": 0.00482402, + "balance_loss_clip": 1.30688667, + "balance_loss_mlp": 0.44649625, + "epoch": 0.14633999699383737, + "flos": 19244672530560.0, + "grad_norm": 11.671656497308854, + "language_loss": 0.8679235, + "learning_rate": 3.859690574873638e-06, + "loss": 0.88901615, + "num_input_tokens_seen": 52811615, + "router_z_loss_clip": 3.19921875, + "router_z_loss_mlp": 0.35913086, + "step": 2434, + "time_per_iteration": 2.6057655811309814 + }, + { + "auxiliary_loss_clip": 0.01452155, + "auxiliary_loss_mlp": 0.00239953, + "balance_loss_clip": 1.22696102, + "balance_loss_mlp": 0.22860418, + "epoch": 0.14640012024650534, + "flos": 62660638270080.0, + "grad_norm": 0.9120752209568954, + "language_loss": 0.58460754, + "learning_rate": 3.8595472369038e-06, + "loss": 0.60152858, + "num_input_tokens_seen": 52873230, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.11328125, + "step": 2435, + "time_per_iteration": 3.0956122875213623 + }, + { + "auxiliary_loss_clip": 0.01624844, + "auxiliary_loss_mlp": 0.00473727, + "balance_loss_clip": 1.30112565, + "balance_loss_mlp": 0.43834567, + "epoch": 0.1464602434991733, + "flos": 12276243146880.0, + "grad_norm": 40.346732921935896, + "language_loss": 0.94926429, + "learning_rate": 3.859403828419744e-06, + "loss": 0.97025001, + "num_input_tokens_seen": 52889325, + "router_z_loss_clip": 3.23828125, + "router_z_loss_mlp": 0.35400391, + "step": 2436, + "time_per_iteration": 2.5820891857147217 + }, + { + "auxiliary_loss_clip": 0.01642271, + "auxiliary_loss_mlp": 0.00544176, + "balance_loss_clip": 1.31358778, + "balance_loss_mlp": 0.5038597, + "epoch": 0.14652036675184127, + "flos": 20922311197440.0, + "grad_norm": 8.045499417585694, + "language_loss": 0.80211848, + "learning_rate": 3.85926034942691e-06, + "loss": 0.82398295, + "num_input_tokens_seen": 52909705, + "router_z_loss_clip": 3.28515625, + "router_z_loss_mlp": 0.40332031, + "step": 2437, + "time_per_iteration": 2.684666156768799 + }, + { + "auxiliary_loss_clip": 0.01628367, + "auxiliary_loss_mlp": 0.00461795, + "balance_loss_clip": 1.29916334, + "balance_loss_mlp": 0.42250371, + "epoch": 0.14658049000450923, + "flos": 27703681528320.0, + "grad_norm": 20.42496793216016, + "language_loss": 0.78786564, + "learning_rate": 3.859116799930736e-06, + "loss": 0.80876732, + "num_input_tokens_seen": 52930300, + "router_z_loss_clip": 3.29492188, + "router_z_loss_mlp": 0.39306641, + "step": 2438, + "time_per_iteration": 2.7619826793670654 + }, + { + "auxiliary_loss_clip": 0.01656095, + "auxiliary_loss_mlp": 0.00469984, + "balance_loss_clip": 1.32182932, + "balance_loss_mlp": 0.43329161, + "epoch": 0.14664061325717723, + "flos": 24936513575040.0, + "grad_norm": 3.8551481201298734, + "language_loss": 0.80649745, + "learning_rate": 3.858973179936668e-06, + "loss": 0.82775825, + "num_input_tokens_seen": 52949955, + "router_z_loss_clip": 3.34375, + "router_z_loss_mlp": 0.36669922, + "step": 2439, + "time_per_iteration": 2.686614990234375 + }, + { + "auxiliary_loss_clip": 0.01635439, + "auxiliary_loss_mlp": 0.00415878, + "balance_loss_clip": 1.30665374, + "balance_loss_mlp": 0.37789762, + "epoch": 0.1467007365098452, + "flos": 40297661406720.0, + "grad_norm": 24.752089340466576, + "language_loss": 0.79378921, + "learning_rate": 3.85882948945015e-06, + "loss": 0.81430233, + "num_input_tokens_seen": 52972905, + "router_z_loss_clip": 3.28515625, + "router_z_loss_mlp": 0.37963867, + "step": 2440, + "time_per_iteration": 2.790498733520508 + }, + { + "auxiliary_loss_clip": 0.01612895, + "auxiliary_loss_mlp": 0.00422945, + "balance_loss_clip": 1.28836679, + "balance_loss_mlp": 0.38641936, + "epoch": 0.14676085976251316, + "flos": 26541074021760.0, + "grad_norm": 1.4720986998048202, + "language_loss": 0.87044477, + "learning_rate": 3.85868572847663e-06, + "loss": 0.89080316, + "num_input_tokens_seen": 52994850, + "router_z_loss_clip": 3.24609375, + "router_z_loss_mlp": 0.36499023, + "step": 2441, + "time_per_iteration": 2.6785061359405518 + }, + { + "auxiliary_loss_clip": 0.01640292, + "auxiliary_loss_mlp": 0.00444358, + "balance_loss_clip": 1.304322, + "balance_loss_mlp": 0.40551916, + "epoch": 0.14682098301518112, + "flos": 23550110380800.0, + "grad_norm": 36.636410065655106, + "language_loss": 0.81636333, + "learning_rate": 3.858541897021563e-06, + "loss": 0.83720988, + "num_input_tokens_seen": 53014740, + "router_z_loss_clip": 3.36132812, + "router_z_loss_mlp": 0.38818359, + "step": 2442, + "time_per_iteration": 2.6433072090148926 + }, + { + "auxiliary_loss_clip": 0.01622206, + "auxiliary_loss_mlp": 0.00437106, + "balance_loss_clip": 1.28583813, + "balance_loss_mlp": 0.39824328, + "epoch": 0.1468811062678491, + "flos": 11651073909120.0, + "grad_norm": 5.256031133961432, + "language_loss": 0.92289931, + "learning_rate": 3.8583979950904e-06, + "loss": 0.94349241, + "num_input_tokens_seen": 53029780, + "router_z_loss_clip": 3.36328125, + "router_z_loss_mlp": 0.38842773, + "step": 2443, + "time_per_iteration": 2.632796049118042 + }, + { + "auxiliary_loss_clip": 0.01628511, + "auxiliary_loss_mlp": 0.00402201, + "balance_loss_clip": 1.29727435, + "balance_loss_mlp": 0.36691481, + "epoch": 0.14694122952051705, + "flos": 23002616304000.0, + "grad_norm": 5.436685729397422, + "language_loss": 0.90654802, + "learning_rate": 3.858254022688599e-06, + "loss": 0.92685515, + "num_input_tokens_seen": 53048620, + "router_z_loss_clip": 3.30859375, + "router_z_loss_mlp": 0.35302734, + "step": 2444, + "time_per_iteration": 2.6500275135040283 + }, + { + "auxiliary_loss_clip": 0.01642299, + "auxiliary_loss_mlp": 0.00372631, + "balance_loss_clip": 1.30441093, + "balance_loss_mlp": 0.33684427, + "epoch": 0.14700135277318502, + "flos": 26502972670080.0, + "grad_norm": 28.952639969668795, + "language_loss": 0.78250134, + "learning_rate": 3.85810997982162e-06, + "loss": 0.80265069, + "num_input_tokens_seen": 53070055, + "router_z_loss_clip": 3.37890625, + "router_z_loss_mlp": 0.35791016, + "step": 2445, + "time_per_iteration": 2.680816173553467 + }, + { + "auxiliary_loss_clip": 0.01499433, + "auxiliary_loss_mlp": 0.00227165, + "balance_loss_clip": 1.28249657, + "balance_loss_mlp": 0.21381362, + "epoch": 0.147061476025853, + "flos": 59449434387840.0, + "grad_norm": 0.8314020279283899, + "language_loss": 0.62566644, + "learning_rate": 3.857965866494923e-06, + "loss": 0.64293242, + "num_input_tokens_seen": 53126945, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.13378906, + "step": 2446, + "time_per_iteration": 3.040093421936035 + }, + { + "auxiliary_loss_clip": 0.01615849, + "auxiliary_loss_mlp": 0.0036199, + "balance_loss_clip": 1.28785634, + "balance_loss_mlp": 0.32765758, + "epoch": 0.14712159927852098, + "flos": 28330897841280.0, + "grad_norm": 168.5745105581572, + "language_loss": 0.81941146, + "learning_rate": 3.857821682713975e-06, + "loss": 0.83918983, + "num_input_tokens_seen": 53149130, + "router_z_loss_clip": 3.28320312, + "router_z_loss_mlp": 0.34350586, + "step": 2447, + "time_per_iteration": 2.711287260055542 + }, + { + "auxiliary_loss_clip": 0.01617077, + "auxiliary_loss_mlp": 0.00400508, + "balance_loss_clip": 1.29264891, + "balance_loss_mlp": 0.36338618, + "epoch": 0.14718172253118894, + "flos": 27089825074560.0, + "grad_norm": 38.995425830165296, + "language_loss": 0.93901086, + "learning_rate": 3.857677428484242e-06, + "loss": 0.95918673, + "num_input_tokens_seen": 53167120, + "router_z_loss_clip": 3.2421875, + "router_z_loss_mlp": 0.37133789, + "step": 2448, + "time_per_iteration": 2.7602267265319824 + }, + { + "auxiliary_loss_clip": 0.01455175, + "auxiliary_loss_mlp": 0.00181904, + "balance_loss_clip": 1.24937224, + "balance_loss_mlp": 0.16941111, + "epoch": 0.1472418457838569, + "flos": 66706764860160.0, + "grad_norm": 0.7453985158857575, + "language_loss": 0.56667739, + "learning_rate": 3.857533103811195e-06, + "loss": 0.58304816, + "num_input_tokens_seen": 53227945, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.125, + "step": 2449, + "time_per_iteration": 3.064450740814209 + }, + { + "auxiliary_loss_clip": 0.01609355, + "auxiliary_loss_mlp": 0.00361545, + "balance_loss_clip": 1.29101348, + "balance_loss_mlp": 0.32573426, + "epoch": 0.14730196903652487, + "flos": 19573578391680.0, + "grad_norm": 4.121024637974723, + "language_loss": 0.9166441, + "learning_rate": 3.857388708700307e-06, + "loss": 0.93635309, + "num_input_tokens_seen": 53244615, + "router_z_loss_clip": 3.18359375, + "router_z_loss_mlp": 0.35839844, + "step": 2450, + "time_per_iteration": 2.6369552612304688 + }, + { + "auxiliary_loss_clip": 0.01600476, + "auxiliary_loss_mlp": 0.00374141, + "balance_loss_clip": 1.28184557, + "balance_loss_mlp": 0.33480185, + "epoch": 0.14736209228919284, + "flos": 16071031296000.0, + "grad_norm": 227.27998717514515, + "language_loss": 0.84079111, + "learning_rate": 3.857244243157052e-06, + "loss": 0.86053729, + "num_input_tokens_seen": 53262205, + "router_z_loss_clip": 3.18945312, + "router_z_loss_mlp": 0.39355469, + "step": 2451, + "time_per_iteration": 2.6080753803253174 + }, + { + "auxiliary_loss_clip": 0.01584178, + "auxiliary_loss_mlp": 0.00370436, + "balance_loss_clip": 1.27595162, + "balance_loss_mlp": 0.33534086, + "epoch": 0.1474222155418608, + "flos": 23039460679680.0, + "grad_norm": 6.82664270847563, + "language_loss": 0.85971248, + "learning_rate": 3.85709970718691e-06, + "loss": 0.87925863, + "num_input_tokens_seen": 53282445, + "router_z_loss_clip": 3.08203125, + "router_z_loss_mlp": 0.35083008, + "step": 2452, + "time_per_iteration": 2.626696825027466 + }, + { + "auxiliary_loss_clip": 0.01588116, + "auxiliary_loss_mlp": 0.00356505, + "balance_loss_clip": 1.27803755, + "balance_loss_mlp": 0.32226759, + "epoch": 0.1474823387945288, + "flos": 17018641946880.0, + "grad_norm": 2.4895493554001145, + "language_loss": 0.78355277, + "learning_rate": 3.856955100795361e-06, + "loss": 0.80299896, + "num_input_tokens_seen": 53299060, + "router_z_loss_clip": 3.09765625, + "router_z_loss_mlp": 0.3425293, + "step": 2453, + "time_per_iteration": 2.5891456604003906 + }, + { + "auxiliary_loss_clip": 0.01576921, + "auxiliary_loss_mlp": 0.00383691, + "balance_loss_clip": 1.25857353, + "balance_loss_mlp": 0.34592554, + "epoch": 0.14754246204719676, + "flos": 17895041884800.0, + "grad_norm": 7.82085192995453, + "language_loss": 0.84399265, + "learning_rate": 3.856810423987889e-06, + "loss": 0.86359876, + "num_input_tokens_seen": 53315970, + "router_z_loss_clip": 3.18359375, + "router_z_loss_mlp": 0.37792969, + "step": 2454, + "time_per_iteration": 2.599808931350708 + }, + { + "auxiliary_loss_clip": 0.01560374, + "auxiliary_loss_mlp": 0.00341085, + "balance_loss_clip": 1.25317168, + "balance_loss_mlp": 0.30861247, + "epoch": 0.14760258529986472, + "flos": 13079097987840.0, + "grad_norm": 1.8163254146491146, + "language_loss": 0.8873539, + "learning_rate": 3.856665676769979e-06, + "loss": 0.90636843, + "num_input_tokens_seen": 53332940, + "router_z_loss_clip": 3.0703125, + "router_z_loss_mlp": 0.32470703, + "step": 2455, + "time_per_iteration": 2.590508222579956 + }, + { + "auxiliary_loss_clip": 0.01573932, + "auxiliary_loss_mlp": 0.00369374, + "balance_loss_clip": 1.25997353, + "balance_loss_mlp": 0.33466068, + "epoch": 0.1476627085525327, + "flos": 30806399358720.0, + "grad_norm": 6.88079383058879, + "language_loss": 0.90353501, + "learning_rate": 3.85652085914712e-06, + "loss": 0.92296803, + "num_input_tokens_seen": 53353295, + "router_z_loss_clip": 3.140625, + "router_z_loss_mlp": 0.34716797, + "step": 2456, + "time_per_iteration": 2.69640851020813 + }, + { + "auxiliary_loss_clip": 0.01543405, + "auxiliary_loss_mlp": 0.00312464, + "balance_loss_clip": 1.24504697, + "balance_loss_mlp": 0.2808018, + "epoch": 0.14772283180520066, + "flos": 21689434984320.0, + "grad_norm": 9.67383785333568, + "language_loss": 0.89389837, + "learning_rate": 3.856375971124805e-06, + "loss": 0.91245705, + "num_input_tokens_seen": 53373410, + "router_z_loss_clip": 2.98242188, + "router_z_loss_mlp": 0.31689453, + "step": 2457, + "time_per_iteration": 2.6199755668640137 + }, + { + "auxiliary_loss_clip": 0.01547731, + "auxiliary_loss_mlp": 0.00317204, + "balance_loss_clip": 1.24855065, + "balance_loss_mlp": 0.28504151, + "epoch": 0.14778295505786862, + "flos": 18770400328320.0, + "grad_norm": 6.206287917888965, + "language_loss": 0.82055318, + "learning_rate": 3.856231012708527e-06, + "loss": 0.83920258, + "num_input_tokens_seen": 53391430, + "router_z_loss_clip": 2.99023438, + "router_z_loss_mlp": 0.32128906, + "step": 2458, + "time_per_iteration": 2.6181480884552 + }, + { + "auxiliary_loss_clip": 0.01557336, + "auxiliary_loss_mlp": 0.00344438, + "balance_loss_clip": 1.24836731, + "balance_loss_mlp": 0.30855626, + "epoch": 0.1478430783105366, + "flos": 22893555634560.0, + "grad_norm": 335.5896793670183, + "language_loss": 0.90951377, + "learning_rate": 3.856085983903782e-06, + "loss": 0.92853153, + "num_input_tokens_seen": 53409960, + "router_z_loss_clip": 3.08789062, + "router_z_loss_mlp": 0.35913086, + "step": 2459, + "time_per_iteration": 2.6861929893493652 + }, + { + "auxiliary_loss_clip": 0.01534149, + "auxiliary_loss_mlp": 0.00326284, + "balance_loss_clip": 1.23634291, + "balance_loss_mlp": 0.29412162, + "epoch": 0.14790320156320458, + "flos": 15085319293440.0, + "grad_norm": 5.018673776356329, + "language_loss": 0.80742931, + "learning_rate": 3.855940884716071e-06, + "loss": 0.82603359, + "num_input_tokens_seen": 53426160, + "router_z_loss_clip": 2.97460938, + "router_z_loss_mlp": 0.3215332, + "step": 2460, + "time_per_iteration": 2.569617509841919 + }, + { + "auxiliary_loss_clip": 0.01566871, + "auxiliary_loss_mlp": 0.00331369, + "balance_loss_clip": 1.25053966, + "balance_loss_mlp": 0.29677412, + "epoch": 0.14796332481587254, + "flos": 26504768350080.0, + "grad_norm": 3.3749445794675026, + "language_loss": 0.8735944, + "learning_rate": 3.855795715150896e-06, + "loss": 0.89257681, + "num_input_tokens_seen": 53448530, + "router_z_loss_clip": 3.16601562, + "router_z_loss_mlp": 0.34594727, + "step": 2461, + "time_per_iteration": 2.763770341873169 + }, + { + "auxiliary_loss_clip": 0.01548533, + "auxiliary_loss_mlp": 0.00322656, + "balance_loss_clip": 1.24675655, + "balance_loss_mlp": 0.28930128, + "epoch": 0.1480234480685405, + "flos": 17563191108480.0, + "grad_norm": 6.424820318841996, + "language_loss": 0.75674993, + "learning_rate": 3.855650475213761e-06, + "loss": 0.77546185, + "num_input_tokens_seen": 53465915, + "router_z_loss_clip": 3.015625, + "router_z_loss_mlp": 0.33349609, + "step": 2462, + "time_per_iteration": 2.6681056022644043 + }, + { + "auxiliary_loss_clip": 0.01542891, + "auxiliary_loss_mlp": 0.00335617, + "balance_loss_clip": 1.24031544, + "balance_loss_mlp": 0.30021143, + "epoch": 0.14808357132120847, + "flos": 53582203232640.0, + "grad_norm": 3.2747585938813493, + "language_loss": 0.73050928, + "learning_rate": 3.8555051649101745e-06, + "loss": 0.74929434, + "num_input_tokens_seen": 53496055, + "router_z_loss_clip": 3.0234375, + "router_z_loss_mlp": 0.35449219, + "step": 2463, + "time_per_iteration": 2.9880292415618896 + }, + { + "auxiliary_loss_clip": 0.01545092, + "auxiliary_loss_mlp": 0.00317081, + "balance_loss_clip": 1.23889279, + "balance_loss_mlp": 0.28484654, + "epoch": 0.14814369457387644, + "flos": 19829190551040.0, + "grad_norm": 2.775371980376823, + "language_loss": 0.83397096, + "learning_rate": 3.855359784245646e-06, + "loss": 0.85259271, + "num_input_tokens_seen": 53513790, + "router_z_loss_clip": 3.06640625, + "router_z_loss_mlp": 0.32250977, + "step": 2464, + "time_per_iteration": 2.626296043395996 + }, + { + "auxiliary_loss_clip": 0.01517346, + "auxiliary_loss_mlp": 0.00284037, + "balance_loss_clip": 1.22637498, + "balance_loss_mlp": 0.25502157, + "epoch": 0.1482038178265444, + "flos": 23914962777600.0, + "grad_norm": 2.9901439681316733, + "language_loss": 0.86422062, + "learning_rate": 3.855214333225688e-06, + "loss": 0.88223445, + "num_input_tokens_seen": 53533410, + "router_z_loss_clip": 2.91015625, + "router_z_loss_mlp": 0.29016113, + "step": 2465, + "time_per_iteration": 2.6636416912078857 + }, + { + "auxiliary_loss_clip": 0.01544309, + "auxiliary_loss_mlp": 0.00308295, + "balance_loss_clip": 1.23659933, + "balance_loss_mlp": 0.276299, + "epoch": 0.1482639410792124, + "flos": 24170503109760.0, + "grad_norm": 18.980502148221706, + "language_loss": 0.82083958, + "learning_rate": 3.855068811855817e-06, + "loss": 0.8393656, + "num_input_tokens_seen": 53554775, + "router_z_loss_clip": 3.07617188, + "router_z_loss_mlp": 0.32006836, + "step": 2466, + "time_per_iteration": 2.6584107875823975 + }, + { + "auxiliary_loss_clip": 0.01325923, + "auxiliary_loss_mlp": 0.00074163, + "balance_loss_clip": 1.13118994, + "balance_loss_mlp": 0.06429276, + "epoch": 0.14832406433188036, + "flos": 66191051341440.0, + "grad_norm": 0.7785810398500818, + "language_loss": 0.60094774, + "learning_rate": 3.854923220141551e-06, + "loss": 0.61494863, + "num_input_tokens_seen": 53609675, + "router_z_loss_clip": 1.953125, + "router_z_loss_mlp": 0.09863281, + "step": 2467, + "time_per_iteration": 4.47997522354126 + }, + { + "auxiliary_loss_clip": 0.01546701, + "auxiliary_loss_mlp": 0.00308748, + "balance_loss_clip": 1.24208391, + "balance_loss_mlp": 0.27663311, + "epoch": 0.14838418758454833, + "flos": 25411252654080.0, + "grad_norm": 12.46212472057566, + "language_loss": 0.96241748, + "learning_rate": 3.85477755808841e-06, + "loss": 0.98097193, + "num_input_tokens_seen": 53626950, + "router_z_loss_clip": 3.046875, + "router_z_loss_mlp": 0.32128906, + "step": 2468, + "time_per_iteration": 2.6732184886932373 + }, + { + "auxiliary_loss_clip": 0.01571752, + "auxiliary_loss_mlp": 0.00307303, + "balance_loss_clip": 1.25737298, + "balance_loss_mlp": 0.27418655, + "epoch": 0.1484443108372163, + "flos": 23289901280640.0, + "grad_norm": 9.60751845626421, + "language_loss": 0.82821751, + "learning_rate": 3.854631825701919e-06, + "loss": 0.84700811, + "num_input_tokens_seen": 53644200, + "router_z_loss_clip": 3.14257812, + "router_z_loss_mlp": 0.33129883, + "step": 2469, + "time_per_iteration": 2.609269618988037 + }, + { + "auxiliary_loss_clip": 0.01558435, + "auxiliary_loss_mlp": 0.00290291, + "balance_loss_clip": 1.24974203, + "balance_loss_mlp": 0.25874832, + "epoch": 0.14850443408988426, + "flos": 14647675985280.0, + "grad_norm": 167.46675395117833, + "language_loss": 0.81615615, + "learning_rate": 3.854486022987603e-06, + "loss": 0.83464336, + "num_input_tokens_seen": 53659650, + "router_z_loss_clip": 3.0859375, + "router_z_loss_mlp": 0.31518555, + "step": 2470, + "time_per_iteration": 5.462946176528931 + }, + { + "auxiliary_loss_clip": 0.01554916, + "auxiliary_loss_mlp": 0.00278917, + "balance_loss_clip": 1.2519958, + "balance_loss_mlp": 0.24684966, + "epoch": 0.14856455734255222, + "flos": 23548314700800.0, + "grad_norm": 6.1487229628716555, + "language_loss": 0.77934361, + "learning_rate": 3.8543401499509905e-06, + "loss": 0.79768199, + "num_input_tokens_seen": 53680275, + "router_z_loss_clip": 3.02929688, + "router_z_loss_mlp": 0.3203125, + "step": 2471, + "time_per_iteration": 2.632833242416382 + }, + { + "auxiliary_loss_clip": 0.01566739, + "auxiliary_loss_mlp": 0.00315349, + "balance_loss_clip": 1.25171185, + "balance_loss_mlp": 0.28228074, + "epoch": 0.1486246805952202, + "flos": 18077288515200.0, + "grad_norm": 190.23979086722719, + "language_loss": 0.95600599, + "learning_rate": 3.854194206597615e-06, + "loss": 0.97482687, + "num_input_tokens_seen": 53698270, + "router_z_loss_clip": 3.15039062, + "router_z_loss_mlp": 0.33032227, + "step": 2472, + "time_per_iteration": 3.9915647506713867 + }, + { + "auxiliary_loss_clip": 0.01569873, + "auxiliary_loss_mlp": 0.00292529, + "balance_loss_clip": 1.25750101, + "balance_loss_mlp": 0.25953197, + "epoch": 0.14868480384788818, + "flos": 19353625459200.0, + "grad_norm": 14.82720353246185, + "language_loss": 0.89305174, + "learning_rate": 3.854048192933008e-06, + "loss": 0.91167581, + "num_input_tokens_seen": 53716845, + "router_z_loss_clip": 3.12304688, + "router_z_loss_mlp": 0.32983398, + "step": 2473, + "time_per_iteration": 2.5711441040039062 + }, + { + "auxiliary_loss_clip": 0.01558636, + "auxiliary_loss_mlp": 0.00289657, + "balance_loss_clip": 1.24572551, + "balance_loss_mlp": 0.25918737, + "epoch": 0.14874492710055615, + "flos": 22200192426240.0, + "grad_norm": 57.07355075763097, + "language_loss": 0.88195264, + "learning_rate": 3.853902108962709e-06, + "loss": 0.90043557, + "num_input_tokens_seen": 53734970, + "router_z_loss_clip": 3.125, + "router_z_loss_mlp": 0.30456543, + "step": 2474, + "time_per_iteration": 2.6235275268554688 + }, + { + "auxiliary_loss_clip": 0.01582245, + "auxiliary_loss_mlp": 0.00355248, + "balance_loss_clip": 1.26073575, + "balance_loss_mlp": 0.31910419, + "epoch": 0.1488050503532241, + "flos": 21103444506240.0, + "grad_norm": 3.6648964368455954, + "language_loss": 0.87978381, + "learning_rate": 3.853755954692255e-06, + "loss": 0.89915872, + "num_input_tokens_seen": 53753415, + "router_z_loss_clip": 3.21679688, + "router_z_loss_mlp": 0.36157227, + "step": 2475, + "time_per_iteration": 2.5865561962127686 + }, + { + "auxiliary_loss_clip": 0.01585071, + "auxiliary_loss_mlp": 0.00304587, + "balance_loss_clip": 1.26173913, + "balance_loss_mlp": 0.27187639, + "epoch": 0.14886517360589208, + "flos": 12786569625600.0, + "grad_norm": 188.62960135640387, + "language_loss": 0.8840704, + "learning_rate": 3.85360973012719e-06, + "loss": 0.90296698, + "num_input_tokens_seen": 53770305, + "router_z_loss_clip": 3.234375, + "router_z_loss_mlp": 0.32714844, + "step": 2476, + "time_per_iteration": 2.585855484008789 + }, + { + "auxiliary_loss_clip": 0.01569625, + "auxiliary_loss_mlp": 0.00279169, + "balance_loss_clip": 1.25515366, + "balance_loss_mlp": 0.24846017, + "epoch": 0.14892529685856004, + "flos": 29022860419200.0, + "grad_norm": 12099.088897209576, + "language_loss": 0.83214116, + "learning_rate": 3.853463435273058e-06, + "loss": 0.85062909, + "num_input_tokens_seen": 53788895, + "router_z_loss_clip": 3.14453125, + "router_z_loss_mlp": 0.30712891, + "step": 2477, + "time_per_iteration": 2.6612794399261475 + }, + { + "auxiliary_loss_clip": 0.01364918, + "auxiliary_loss_mlp": 0.0008118, + "balance_loss_clip": 1.16748261, + "balance_loss_mlp": 0.06401383, + "epoch": 0.148985420111228, + "flos": 61926121054080.0, + "grad_norm": 0.7845629868148587, + "language_loss": 0.59985578, + "learning_rate": 3.853317070135407e-06, + "loss": 0.61431676, + "num_input_tokens_seen": 53850260, + "router_z_loss_clip": 1.9765625, + "router_z_loss_mlp": 0.171875, + "step": 2478, + "time_per_iteration": 3.148921489715576 + }, + { + "auxiliary_loss_clip": 0.01588782, + "auxiliary_loss_mlp": 0.00287077, + "balance_loss_clip": 1.26392519, + "balance_loss_mlp": 0.25522476, + "epoch": 0.149045543363896, + "flos": 23915106432000.0, + "grad_norm": 4.428772815430691, + "language_loss": 0.78747118, + "learning_rate": 3.853170634719787e-06, + "loss": 0.80622977, + "num_input_tokens_seen": 53867520, + "router_z_loss_clip": 3.24804688, + "router_z_loss_mlp": 0.31860352, + "step": 2479, + "time_per_iteration": 2.676405191421509 + }, + { + "auxiliary_loss_clip": 0.01596338, + "auxiliary_loss_mlp": 0.00322482, + "balance_loss_clip": 1.26893413, + "balance_loss_mlp": 0.28910339, + "epoch": 0.14910566661656396, + "flos": 23654394541440.0, + "grad_norm": 3.87379940899696, + "language_loss": 0.86007911, + "learning_rate": 3.853024129031751e-06, + "loss": 0.87926733, + "num_input_tokens_seen": 53886620, + "router_z_loss_clip": 3.2734375, + "router_z_loss_mlp": 0.33349609, + "step": 2480, + "time_per_iteration": 2.623211622238159 + }, + { + "auxiliary_loss_clip": 0.01624366, + "auxiliary_loss_mlp": 0.00299422, + "balance_loss_clip": 1.28487372, + "balance_loss_mlp": 0.26879716, + "epoch": 0.14916578986923193, + "flos": 20515299212160.0, + "grad_norm": 4.406775484455945, + "language_loss": 0.9151746, + "learning_rate": 3.852877553076854e-06, + "loss": 0.93441254, + "num_input_tokens_seen": 53902230, + "router_z_loss_clip": 3.390625, + "router_z_loss_mlp": 0.30651855, + "step": 2481, + "time_per_iteration": 2.682844400405884 + }, + { + "auxiliary_loss_clip": 0.01625743, + "auxiliary_loss_mlp": 0.00307336, + "balance_loss_clip": 1.28582084, + "balance_loss_mlp": 0.27448177, + "epoch": 0.1492259131218999, + "flos": 22491822948480.0, + "grad_norm": 89.13939956583506, + "language_loss": 0.85025942, + "learning_rate": 3.8527309068606546e-06, + "loss": 0.86959022, + "num_input_tokens_seen": 53919475, + "router_z_loss_clip": 3.40234375, + "router_z_loss_mlp": 0.32800293, + "step": 2482, + "time_per_iteration": 2.6415438652038574 + }, + { + "auxiliary_loss_clip": 0.0164181, + "auxiliary_loss_mlp": 0.00276053, + "balance_loss_clip": 1.293064, + "balance_loss_mlp": 0.24343684, + "epoch": 0.14928603637456786, + "flos": 23185868515200.0, + "grad_norm": 9.643467203083583, + "language_loss": 0.89552939, + "learning_rate": 3.852584190388713e-06, + "loss": 0.91470802, + "num_input_tokens_seen": 53939150, + "router_z_loss_clip": 3.48632812, + "router_z_loss_mlp": 0.32641602, + "step": 2483, + "time_per_iteration": 2.7075886726379395 + }, + { + "auxiliary_loss_clip": 0.01628216, + "auxiliary_loss_mlp": 0.00269768, + "balance_loss_clip": 1.28980017, + "balance_loss_mlp": 0.24060898, + "epoch": 0.14934615962723582, + "flos": 21653237053440.0, + "grad_norm": 1.5141872351610104, + "language_loss": 0.76758659, + "learning_rate": 3.852437403666595e-06, + "loss": 0.78656644, + "num_input_tokens_seen": 53958735, + "router_z_loss_clip": 3.38671875, + "router_z_loss_mlp": 0.29174805, + "step": 2484, + "time_per_iteration": 2.63010835647583 + }, + { + "auxiliary_loss_clip": 0.01639742, + "auxiliary_loss_mlp": 0.00290324, + "balance_loss_clip": 1.29734516, + "balance_loss_mlp": 0.25685039, + "epoch": 0.1494062828799038, + "flos": 27010066924800.0, + "grad_norm": 3.7297601773064155, + "language_loss": 0.91307777, + "learning_rate": 3.852290546699863e-06, + "loss": 0.93237841, + "num_input_tokens_seen": 53975065, + "router_z_loss_clip": 3.42773438, + "router_z_loss_mlp": 0.33447266, + "step": 2485, + "time_per_iteration": 2.67417311668396 + }, + { + "auxiliary_loss_clip": 0.0165301, + "auxiliary_loss_mlp": 0.0026006, + "balance_loss_clip": 1.30327702, + "balance_loss_mlp": 0.22763523, + "epoch": 0.14946640613257178, + "flos": 21214947300480.0, + "grad_norm": 52.62297084528538, + "language_loss": 0.92717242, + "learning_rate": 3.8521436194940894e-06, + "loss": 0.94630313, + "num_input_tokens_seen": 53993330, + "router_z_loss_clip": 3.5, + "router_z_loss_mlp": 0.32446289, + "step": 2486, + "time_per_iteration": 2.597712755203247 + }, + { + "auxiliary_loss_clip": 0.01616514, + "auxiliary_loss_mlp": 0.00237, + "balance_loss_clip": 1.28329265, + "balance_loss_mlp": 0.20840198, + "epoch": 0.14952652938523975, + "flos": 13370872164480.0, + "grad_norm": 125.83621675804288, + "language_loss": 0.80113089, + "learning_rate": 3.851996622054842e-06, + "loss": 0.81966603, + "num_input_tokens_seen": 54010515, + "router_z_loss_clip": 3.33203125, + "router_z_loss_mlp": 0.28552246, + "step": 2487, + "time_per_iteration": 2.593411445617676 + }, + { + "auxiliary_loss_clip": 0.01629921, + "auxiliary_loss_mlp": 0.00261316, + "balance_loss_clip": 1.29360318, + "balance_loss_mlp": 0.22860458, + "epoch": 0.1495866526379077, + "flos": 35517699959040.0, + "grad_norm": 10.607304822189466, + "language_loss": 0.78926998, + "learning_rate": 3.8518495543877e-06, + "loss": 0.80818236, + "num_input_tokens_seen": 54031315, + "router_z_loss_clip": 3.36523438, + "router_z_loss_mlp": 0.3269043, + "step": 2488, + "time_per_iteration": 2.7316718101501465 + }, + { + "auxiliary_loss_clip": 0.01624984, + "auxiliary_loss_mlp": 0.00305617, + "balance_loss_clip": 1.28619993, + "balance_loss_mlp": 0.27149954, + "epoch": 0.14964677589057568, + "flos": 17632749795840.0, + "grad_norm": 7.869198219612591, + "language_loss": 0.77973109, + "learning_rate": 3.851702416498235e-06, + "loss": 0.7990371, + "num_input_tokens_seen": 54045965, + "router_z_loss_clip": 3.38085938, + "router_z_loss_mlp": 0.34106445, + "step": 2489, + "time_per_iteration": 2.6027779579162598 + }, + { + "auxiliary_loss_clip": 0.01638582, + "auxiliary_loss_mlp": 0.00256353, + "balance_loss_clip": 1.29780877, + "balance_loss_mlp": 0.2238563, + "epoch": 0.14970689914324364, + "flos": 20185280029440.0, + "grad_norm": 366.0743527532123, + "language_loss": 0.92903459, + "learning_rate": 3.8515552083920295e-06, + "loss": 0.94798398, + "num_input_tokens_seen": 54059960, + "router_z_loss_clip": 3.40820312, + "router_z_loss_mlp": 0.32495117, + "step": 2490, + "time_per_iteration": 2.5710806846618652 + }, + { + "auxiliary_loss_clip": 0.01605017, + "auxiliary_loss_mlp": 0.00256177, + "balance_loss_clip": 1.26925695, + "balance_loss_mlp": 0.22208276, + "epoch": 0.1497670223959116, + "flos": 37228699382400.0, + "grad_norm": 3.5425547441319027, + "language_loss": 0.8661859, + "learning_rate": 3.851407930074666e-06, + "loss": 0.88479787, + "num_input_tokens_seen": 54079330, + "router_z_loss_clip": 3.36328125, + "router_z_loss_mlp": 0.34057617, + "step": 2491, + "time_per_iteration": 2.721687078475952 + }, + { + "auxiliary_loss_clip": 0.01601996, + "auxiliary_loss_mlp": 0.00267855, + "balance_loss_clip": 1.2723, + "balance_loss_mlp": 0.23399997, + "epoch": 0.1498271456485796, + "flos": 24455848752000.0, + "grad_norm": 3.304164621645694, + "language_loss": 0.96676511, + "learning_rate": 3.851260581551727e-06, + "loss": 0.98546362, + "num_input_tokens_seen": 54097555, + "router_z_loss_clip": 3.30078125, + "router_z_loss_mlp": 0.33862305, + "step": 2492, + "time_per_iteration": 2.623558282852173 + }, + { + "auxiliary_loss_clip": 0.01598484, + "auxiliary_loss_mlp": 0.00268035, + "balance_loss_clip": 1.27145219, + "balance_loss_mlp": 0.23212895, + "epoch": 0.14988726890124757, + "flos": 16253601148800.0, + "grad_norm": 5.094206394411597, + "language_loss": 0.89801371, + "learning_rate": 3.851113162828802e-06, + "loss": 0.91667891, + "num_input_tokens_seen": 54115600, + "router_z_loss_clip": 3.27148438, + "router_z_loss_mlp": 0.35913086, + "step": 2493, + "time_per_iteration": 2.576873779296875 + }, + { + "auxiliary_loss_clip": 0.01590926, + "auxiliary_loss_mlp": 0.00278372, + "balance_loss_clip": 1.26681423, + "balance_loss_mlp": 0.24120273, + "epoch": 0.14994739215391553, + "flos": 20666555383680.0, + "grad_norm": 37.58936596639462, + "language_loss": 0.86877656, + "learning_rate": 3.85096567391148e-06, + "loss": 0.88746953, + "num_input_tokens_seen": 54135220, + "router_z_loss_clip": 3.2421875, + "router_z_loss_mlp": 0.37207031, + "step": 2494, + "time_per_iteration": 2.6444461345672607 + }, + { + "auxiliary_loss_clip": 0.01568131, + "auxiliary_loss_mlp": 0.00242941, + "balance_loss_clip": 1.25324011, + "balance_loss_mlp": 0.2084896, + "epoch": 0.1500075154065835, + "flos": 70652375239680.0, + "grad_norm": 104.57574075176639, + "language_loss": 0.74512506, + "learning_rate": 3.850818114805354e-06, + "loss": 0.76323581, + "num_input_tokens_seen": 54161065, + "router_z_loss_clip": 3.14648438, + "router_z_loss_mlp": 0.34472656, + "step": 2495, + "time_per_iteration": 3.054676055908203 + }, + { + "auxiliary_loss_clip": 0.01450753, + "auxiliary_loss_mlp": 0.00082425, + "balance_loss_clip": 1.24505997, + "balance_loss_mlp": 0.06602141, + "epoch": 0.15006763865925146, + "flos": 68011937447040.0, + "grad_norm": 0.8669193022401698, + "language_loss": 0.59335506, + "learning_rate": 3.850670485516019e-06, + "loss": 0.60868686, + "num_input_tokens_seen": 54225095, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.1640625, + "step": 2496, + "time_per_iteration": 3.142096996307373 + }, + { + "auxiliary_loss_clip": 0.01594813, + "auxiliary_loss_mlp": 0.00268259, + "balance_loss_clip": 1.27118826, + "balance_loss_mlp": 0.2326152, + "epoch": 0.15012776191191943, + "flos": 18916269459840.0, + "grad_norm": 8.079476178037334, + "language_loss": 0.74926746, + "learning_rate": 3.850522786049075e-06, + "loss": 0.7678982, + "num_input_tokens_seen": 54243750, + "router_z_loss_clip": 3.23632812, + "router_z_loss_mlp": 0.35644531, + "step": 2497, + "time_per_iteration": 2.665170431137085 + }, + { + "auxiliary_loss_clip": 0.01587707, + "auxiliary_loss_mlp": 0.00252835, + "balance_loss_clip": 1.26921391, + "balance_loss_mlp": 0.22088695, + "epoch": 0.1501878851645874, + "flos": 23701330638720.0, + "grad_norm": 2.0506151961532537, + "language_loss": 0.81187433, + "learning_rate": 3.850375016410121e-06, + "loss": 0.83027977, + "num_input_tokens_seen": 54266185, + "router_z_loss_clip": 3.1875, + "router_z_loss_mlp": 0.31933594, + "step": 2498, + "time_per_iteration": 2.709078311920166 + }, + { + "auxiliary_loss_clip": 0.01589785, + "auxiliary_loss_mlp": 0.00234432, + "balance_loss_clip": 1.26903987, + "balance_loss_mlp": 0.20176834, + "epoch": 0.15024800841725539, + "flos": 20412523422720.0, + "grad_norm": 29.24965262041153, + "language_loss": 0.79991865, + "learning_rate": 3.850227176604761e-06, + "loss": 0.81816083, + "num_input_tokens_seen": 54283940, + "router_z_loss_clip": 3.20703125, + "router_z_loss_mlp": 0.3269043, + "step": 2499, + "time_per_iteration": 2.5754408836364746 + }, + { + "auxiliary_loss_clip": 0.01570721, + "auxiliary_loss_mlp": 0.00242156, + "balance_loss_clip": 1.25895286, + "balance_loss_mlp": 0.20925438, + "epoch": 0.15030813166992335, + "flos": 31831002812160.0, + "grad_norm": 40.63285611098953, + "language_loss": 0.78963661, + "learning_rate": 3.850079266638601e-06, + "loss": 0.80776536, + "num_input_tokens_seen": 54304830, + "router_z_loss_clip": 3.11328125, + "router_z_loss_mlp": 0.32885742, + "step": 2500, + "time_per_iteration": 2.7095417976379395 + }, + { + "auxiliary_loss_clip": 0.01562254, + "auxiliary_loss_mlp": 0.00234233, + "balance_loss_clip": 1.25282264, + "balance_loss_mlp": 0.20185606, + "epoch": 0.15036825492259132, + "flos": 35657822914560.0, + "grad_norm": 1297.8321854101105, + "language_loss": 0.73788387, + "learning_rate": 3.849931286517249e-06, + "loss": 0.75584877, + "num_input_tokens_seen": 54325595, + "router_z_loss_clip": 3.09765625, + "router_z_loss_mlp": 0.32373047, + "step": 2501, + "time_per_iteration": 2.7395715713500977 + }, + { + "auxiliary_loss_clip": 0.0154255, + "auxiliary_loss_mlp": 0.00225286, + "balance_loss_clip": 1.23447847, + "balance_loss_mlp": 0.19121641, + "epoch": 0.15042837817525928, + "flos": 18838163335680.0, + "grad_norm": 18.088775367675087, + "language_loss": 0.92135644, + "learning_rate": 3.849783236246318e-06, + "loss": 0.93903482, + "num_input_tokens_seen": 54342180, + "router_z_loss_clip": 3.08203125, + "router_z_loss_mlp": 0.34106445, + "step": 2502, + "time_per_iteration": 2.605713367462158 + }, + { + "auxiliary_loss_clip": 0.01555657, + "auxiliary_loss_mlp": 0.00243045, + "balance_loss_clip": 1.24765432, + "balance_loss_mlp": 0.20921384, + "epoch": 0.15048850142792725, + "flos": 19535548867200.0, + "grad_norm": 16.224585492607996, + "language_loss": 0.83266205, + "learning_rate": 3.849635115831421e-06, + "loss": 0.85064912, + "num_input_tokens_seen": 54360255, + "router_z_loss_clip": 3.08007812, + "router_z_loss_mlp": 0.33813477, + "step": 2503, + "time_per_iteration": 2.6008641719818115 + }, + { + "auxiliary_loss_clip": 0.01542385, + "auxiliary_loss_mlp": 0.00216759, + "balance_loss_clip": 1.24204433, + "balance_loss_mlp": 0.18702865, + "epoch": 0.1505486246805952, + "flos": 22017550746240.0, + "grad_norm": 168.39530036590995, + "language_loss": 0.94323653, + "learning_rate": 3.849486925278176e-06, + "loss": 0.96082795, + "num_input_tokens_seen": 54378260, + "router_z_loss_clip": 2.99804688, + "router_z_loss_mlp": 0.29711914, + "step": 2504, + "time_per_iteration": 2.619610548019409 + }, + { + "auxiliary_loss_clip": 0.01544203, + "auxiliary_loss_mlp": 0.00218379, + "balance_loss_clip": 1.2394321, + "balance_loss_mlp": 0.18569203, + "epoch": 0.15060874793326318, + "flos": 20743153136640.0, + "grad_norm": 9.221524369359267, + "language_loss": 0.88750851, + "learning_rate": 3.8493386645922e-06, + "loss": 0.90513438, + "num_input_tokens_seen": 54399745, + "router_z_loss_clip": 3.05078125, + "router_z_loss_mlp": 0.32666016, + "step": 2505, + "time_per_iteration": 2.6584722995758057 + }, + { + "auxiliary_loss_clip": 0.01536385, + "auxiliary_loss_mlp": 0.00230294, + "balance_loss_clip": 1.23773146, + "balance_loss_mlp": 0.19836934, + "epoch": 0.15066887118593117, + "flos": 16471902055680.0, + "grad_norm": 2.390167478194995, + "language_loss": 0.83467561, + "learning_rate": 3.849190333779117e-06, + "loss": 0.85234237, + "num_input_tokens_seen": 54417105, + "router_z_loss_clip": 2.98632812, + "router_z_loss_mlp": 0.31933594, + "step": 2506, + "time_per_iteration": 2.570402145385742 + }, + { + "auxiliary_loss_clip": 0.01546421, + "auxiliary_loss_mlp": 0.00237708, + "balance_loss_clip": 1.2358681, + "balance_loss_mlp": 0.20494929, + "epoch": 0.15072899443859913, + "flos": 19859319083520.0, + "grad_norm": 3.922555026375878, + "language_loss": 0.91204834, + "learning_rate": 3.849041932844552e-06, + "loss": 0.92988962, + "num_input_tokens_seen": 54433920, + "router_z_loss_clip": 3.10351562, + "router_z_loss_mlp": 0.32763672, + "step": 2507, + "time_per_iteration": 2.6086461544036865 + }, + { + "auxiliary_loss_clip": 0.01531268, + "auxiliary_loss_mlp": 0.00234611, + "balance_loss_clip": 1.23072898, + "balance_loss_mlp": 0.2025203, + "epoch": 0.1507891176912671, + "flos": 20776226584320.0, + "grad_norm": 14.990724293909658, + "language_loss": 0.76193988, + "learning_rate": 3.848893461794131e-06, + "loss": 0.77959859, + "num_input_tokens_seen": 54451540, + "router_z_loss_clip": 3.00585938, + "router_z_loss_mlp": 0.32104492, + "step": 2508, + "time_per_iteration": 2.58901309967041 + }, + { + "auxiliary_loss_clip": 0.0154761, + "auxiliary_loss_mlp": 0.00239725, + "balance_loss_clip": 1.23886132, + "balance_loss_mlp": 0.20527393, + "epoch": 0.15084924094393506, + "flos": 23586631534080.0, + "grad_norm": 2.1232111206102813, + "language_loss": 0.85818493, + "learning_rate": 3.8487449206334845e-06, + "loss": 0.87605834, + "num_input_tokens_seen": 54470800, + "router_z_loss_clip": 3.08984375, + "router_z_loss_mlp": 0.34472656, + "step": 2509, + "time_per_iteration": 4.161874294281006 + }, + { + "auxiliary_loss_clip": 0.0155794, + "auxiliary_loss_mlp": 0.00237386, + "balance_loss_clip": 1.24415421, + "balance_loss_mlp": 0.20457938, + "epoch": 0.15090936419660303, + "flos": 18911313383040.0, + "grad_norm": 9.826453223536552, + "language_loss": 0.908952, + "learning_rate": 3.848596309368246e-06, + "loss": 0.92690527, + "num_input_tokens_seen": 54486525, + "router_z_loss_clip": 3.13476562, + "router_z_loss_mlp": 0.328125, + "step": 2510, + "time_per_iteration": 2.6408462524414062 + }, + { + "auxiliary_loss_clip": 0.01570842, + "auxiliary_loss_mlp": 0.00240406, + "balance_loss_clip": 1.25548065, + "balance_loss_mlp": 0.20538205, + "epoch": 0.150969487449271, + "flos": 17928223073280.0, + "grad_norm": 8.359064558418252, + "language_loss": 0.83340919, + "learning_rate": 3.8484476280040495e-06, + "loss": 0.85152161, + "num_input_tokens_seen": 54503795, + "router_z_loss_clip": 3.15039062, + "router_z_loss_mlp": 0.3503418, + "step": 2511, + "time_per_iteration": 2.666086435317993 + }, + { + "auxiliary_loss_clip": 0.01555481, + "auxiliary_loss_mlp": 0.00244282, + "balance_loss_clip": 1.24898553, + "balance_loss_mlp": 0.21319203, + "epoch": 0.151029610701939, + "flos": 24243078539520.0, + "grad_norm": 40.59272651019958, + "language_loss": 0.79566783, + "learning_rate": 3.848298876546534e-06, + "loss": 0.81366545, + "num_input_tokens_seen": 54523025, + "router_z_loss_clip": 3.06445312, + "router_z_loss_mlp": 0.31054688, + "step": 2512, + "time_per_iteration": 5.474567651748657 + }, + { + "auxiliary_loss_clip": 0.01564395, + "auxiliary_loss_mlp": 0.00228505, + "balance_loss_clip": 1.25920343, + "balance_loss_mlp": 0.1968666, + "epoch": 0.15108973395460695, + "flos": 30262496641920.0, + "grad_norm": 28.64030331798118, + "language_loss": 0.82314253, + "learning_rate": 3.84815005500134e-06, + "loss": 0.84107149, + "num_input_tokens_seen": 54545025, + "router_z_loss_clip": 3.05078125, + "router_z_loss_mlp": 0.31665039, + "step": 2513, + "time_per_iteration": 2.7438831329345703 + }, + { + "auxiliary_loss_clip": 0.01434861, + "auxiliary_loss_mlp": 0.00242872, + "balance_loss_clip": 1.23072898, + "balance_loss_mlp": 0.22894879, + "epoch": 0.15114985720727492, + "flos": 60437624428800.0, + "grad_norm": 0.8611693719913415, + "language_loss": 0.64273036, + "learning_rate": 3.84800116337411e-06, + "loss": 0.65950775, + "num_input_tokens_seen": 54604545, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.13964844, + "step": 2514, + "time_per_iteration": 3.084643602371216 + }, + { + "auxiliary_loss_clip": 0.01577903, + "auxiliary_loss_mlp": 0.00241994, + "balance_loss_clip": 1.26363897, + "balance_loss_mlp": 0.2106415, + "epoch": 0.15120998045994288, + "flos": 20521691832960.0, + "grad_norm": 12.463132858349073, + "language_loss": 0.79227948, + "learning_rate": 3.8478522016704916e-06, + "loss": 0.81047845, + "num_input_tokens_seen": 54620590, + "router_z_loss_clip": 3.14453125, + "router_z_loss_mlp": 0.3137207, + "step": 2515, + "time_per_iteration": 4.032611131668091 + }, + { + "auxiliary_loss_clip": 0.01579375, + "auxiliary_loss_mlp": 0.00228658, + "balance_loss_clip": 1.26846027, + "balance_loss_mlp": 0.19644743, + "epoch": 0.15127010371261085, + "flos": 21178893024000.0, + "grad_norm": 257.0006272322974, + "language_loss": 0.8435123, + "learning_rate": 3.8477031698961325e-06, + "loss": 0.86159265, + "num_input_tokens_seen": 54640410, + "router_z_loss_clip": 3.10351562, + "router_z_loss_mlp": 0.32177734, + "step": 2516, + "time_per_iteration": 2.6606013774871826 + }, + { + "auxiliary_loss_clip": 0.01467249, + "auxiliary_loss_mlp": 0.00122547, + "balance_loss_clip": 1.25531423, + "balance_loss_mlp": 0.11043525, + "epoch": 0.1513302269652788, + "flos": 65320648974720.0, + "grad_norm": 0.7378599727952849, + "language_loss": 0.54866964, + "learning_rate": 3.8475540680566835e-06, + "loss": 0.56456757, + "num_input_tokens_seen": 54701430, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.12109375, + "step": 2517, + "time_per_iteration": 3.124859571456909 + }, + { + "auxiliary_loss_clip": 0.0159449, + "auxiliary_loss_mlp": 0.00222864, + "balance_loss_clip": 1.27775824, + "balance_loss_mlp": 0.18729232, + "epoch": 0.15139035021794678, + "flos": 19135827342720.0, + "grad_norm": 8.328798327243442, + "language_loss": 0.85610127, + "learning_rate": 3.8474048961577995e-06, + "loss": 0.87427479, + "num_input_tokens_seen": 54720845, + "router_z_loss_clip": 3.16601562, + "router_z_loss_mlp": 0.35571289, + "step": 2518, + "time_per_iteration": 2.622445583343506 + }, + { + "auxiliary_loss_clip": 0.01611652, + "auxiliary_loss_mlp": 0.00234718, + "balance_loss_clip": 1.2888, + "balance_loss_mlp": 0.19759651, + "epoch": 0.15145047347061477, + "flos": 26578564842240.0, + "grad_norm": 8.978993990265556, + "language_loss": 0.76849484, + "learning_rate": 3.847255654205137e-06, + "loss": 0.78695858, + "num_input_tokens_seen": 54740495, + "router_z_loss_clip": 3.2265625, + "router_z_loss_mlp": 0.37133789, + "step": 2519, + "time_per_iteration": 2.699012041091919 + }, + { + "auxiliary_loss_clip": 0.01611398, + "auxiliary_loss_mlp": 0.00225545, + "balance_loss_clip": 1.29162693, + "balance_loss_mlp": 0.19183244, + "epoch": 0.15151059672328274, + "flos": 20302959962880.0, + "grad_norm": 61.45689820302245, + "language_loss": 0.87648678, + "learning_rate": 3.847106342204354e-06, + "loss": 0.89485615, + "num_input_tokens_seen": 54758415, + "router_z_loss_clip": 3.19921875, + "router_z_loss_mlp": 0.33740234, + "step": 2520, + "time_per_iteration": 2.6811656951904297 + }, + { + "auxiliary_loss_clip": 0.0161782, + "auxiliary_loss_mlp": 0.0024508, + "balance_loss_clip": 1.29340506, + "balance_loss_mlp": 0.20822001, + "epoch": 0.1515707199759507, + "flos": 27228367831680.0, + "grad_norm": 30.889466296905333, + "language_loss": 0.84168392, + "learning_rate": 3.846956960161114e-06, + "loss": 0.86031294, + "num_input_tokens_seen": 54779355, + "router_z_loss_clip": 3.24414062, + "router_z_loss_mlp": 0.3684082, + "step": 2521, + "time_per_iteration": 2.7189838886260986 + }, + { + "auxiliary_loss_clip": 0.01618224, + "auxiliary_loss_mlp": 0.00247881, + "balance_loss_clip": 1.29107726, + "balance_loss_mlp": 0.20844644, + "epoch": 0.15163084322861867, + "flos": 23587349806080.0, + "grad_norm": 18.740129060944867, + "language_loss": 0.91395271, + "learning_rate": 3.84680750808108e-06, + "loss": 0.93261385, + "num_input_tokens_seen": 54799465, + "router_z_loss_clip": 3.27148438, + "router_z_loss_mlp": 0.39453125, + "step": 2522, + "time_per_iteration": 2.6862030029296875 + }, + { + "auxiliary_loss_clip": 0.01440721, + "auxiliary_loss_mlp": 0.00124767, + "balance_loss_clip": 1.22685742, + "balance_loss_mlp": 0.10874529, + "epoch": 0.15169096648128663, + "flos": 66889622021760.0, + "grad_norm": 0.8237152892074878, + "language_loss": 0.58199298, + "learning_rate": 3.846657985969922e-06, + "loss": 0.59764791, + "num_input_tokens_seen": 54857665, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.16015625, + "step": 2523, + "time_per_iteration": 3.0839016437530518 + }, + { + "auxiliary_loss_clip": 0.01625573, + "auxiliary_loss_mlp": 0.00209027, + "balance_loss_clip": 1.2976073, + "balance_loss_mlp": 0.17073706, + "epoch": 0.1517510897339546, + "flos": 29095435848960.0, + "grad_norm": 1.9383839463572936, + "language_loss": 0.82129604, + "learning_rate": 3.8465083938333066e-06, + "loss": 0.83964205, + "num_input_tokens_seen": 54879895, + "router_z_loss_clip": 3.28125, + "router_z_loss_mlp": 0.38305664, + "step": 2524, + "time_per_iteration": 2.6963484287261963 + }, + { + "auxiliary_loss_clip": 0.01640849, + "auxiliary_loss_mlp": 0.00250713, + "balance_loss_clip": 1.30471408, + "balance_loss_mlp": 0.21237499, + "epoch": 0.1518112129866226, + "flos": 18406553512320.0, + "grad_norm": 22.47027800107615, + "language_loss": 0.8259356, + "learning_rate": 3.8463587316769085e-06, + "loss": 0.84485126, + "num_input_tokens_seen": 54898245, + "router_z_loss_clip": 3.36328125, + "router_z_loss_mlp": 0.38305664, + "step": 2525, + "time_per_iteration": 2.7734124660491943 + }, + { + "auxiliary_loss_clip": 0.01635398, + "auxiliary_loss_mlp": 0.00262942, + "balance_loss_clip": 1.29702294, + "balance_loss_mlp": 0.22250581, + "epoch": 0.15187133623929056, + "flos": 19425410789760.0, + "grad_norm": 6.939908492055646, + "language_loss": 0.87264699, + "learning_rate": 3.846208999506402e-06, + "loss": 0.89163041, + "num_input_tokens_seen": 54917060, + "router_z_loss_clip": 3.38085938, + "router_z_loss_mlp": 0.40405273, + "step": 2526, + "time_per_iteration": 2.608898639678955 + }, + { + "auxiliary_loss_clip": 0.01630783, + "auxiliary_loss_mlp": 0.00238119, + "balance_loss_clip": 1.29695153, + "balance_loss_mlp": 0.2003772, + "epoch": 0.15193145949195852, + "flos": 17566207850880.0, + "grad_norm": 20.16195856476102, + "language_loss": 0.92105526, + "learning_rate": 3.846059197327466e-06, + "loss": 0.93974435, + "num_input_tokens_seen": 54936365, + "router_z_loss_clip": 3.33984375, + "router_z_loss_mlp": 0.37744141, + "step": 2527, + "time_per_iteration": 2.650644302368164 + }, + { + "auxiliary_loss_clip": 0.01630969, + "auxiliary_loss_mlp": 0.00258386, + "balance_loss_clip": 1.29788852, + "balance_loss_mlp": 0.2212165, + "epoch": 0.15199158274462649, + "flos": 36176265866880.0, + "grad_norm": 141.8763667575267, + "language_loss": 0.74725777, + "learning_rate": 3.845909325145779e-06, + "loss": 0.76615131, + "num_input_tokens_seen": 54961365, + "router_z_loss_clip": 3.33007812, + "router_z_loss_mlp": 0.37182617, + "step": 2528, + "time_per_iteration": 2.7457351684570312 + }, + { + "auxiliary_loss_clip": 0.01622804, + "auxiliary_loss_mlp": 0.0023328, + "balance_loss_clip": 1.28689814, + "balance_loss_mlp": 0.19205755, + "epoch": 0.15205170599729445, + "flos": 23074042498560.0, + "grad_norm": 28.76182727348651, + "language_loss": 0.94781882, + "learning_rate": 3.845759382967026e-06, + "loss": 0.96637964, + "num_input_tokens_seen": 54980750, + "router_z_loss_clip": 3.35742188, + "router_z_loss_mlp": 0.41210938, + "step": 2529, + "time_per_iteration": 2.70662784576416 + }, + { + "auxiliary_loss_clip": 0.01626107, + "auxiliary_loss_mlp": 0.00252221, + "balance_loss_clip": 1.29531932, + "balance_loss_mlp": 0.21173792, + "epoch": 0.15211182924996242, + "flos": 21908382336000.0, + "grad_norm": 18.565547915021952, + "language_loss": 0.91219878, + "learning_rate": 3.845609370796893e-06, + "loss": 0.93098211, + "num_input_tokens_seen": 54999675, + "router_z_loss_clip": 3.3046875, + "router_z_loss_mlp": 0.40478516, + "step": 2530, + "time_per_iteration": 2.6448428630828857 + }, + { + "auxiliary_loss_clip": 0.01616085, + "auxiliary_loss_mlp": 0.00252873, + "balance_loss_clip": 1.28461266, + "balance_loss_mlp": 0.21460673, + "epoch": 0.15217195250263038, + "flos": 13881521865600.0, + "grad_norm": 5.936961852443786, + "language_loss": 0.89857721, + "learning_rate": 3.845459288641066e-06, + "loss": 0.91726685, + "num_input_tokens_seen": 55018295, + "router_z_loss_clip": 3.31445312, + "router_z_loss_mlp": 0.3828125, + "step": 2531, + "time_per_iteration": 2.6350133419036865 + }, + { + "auxiliary_loss_clip": 0.01620508, + "auxiliary_loss_mlp": 0.00233963, + "balance_loss_clip": 1.2866087, + "balance_loss_mlp": 0.19617343, + "epoch": 0.15223207575529837, + "flos": 24535319592960.0, + "grad_norm": 14.184620266236404, + "language_loss": 0.87394327, + "learning_rate": 3.8453091365052394e-06, + "loss": 0.892488, + "num_input_tokens_seen": 55037975, + "router_z_loss_clip": 3.33789062, + "router_z_loss_mlp": 0.37841797, + "step": 2532, + "time_per_iteration": 2.7181296348571777 + }, + { + "auxiliary_loss_clip": 0.0161876, + "auxiliary_loss_mlp": 0.00258474, + "balance_loss_clip": 1.28644562, + "balance_loss_mlp": 0.21799043, + "epoch": 0.15229219900796634, + "flos": 25556798563200.0, + "grad_norm": 78.57601574113427, + "language_loss": 0.93418705, + "learning_rate": 3.845158914395105e-06, + "loss": 0.95295942, + "num_input_tokens_seen": 55057135, + "router_z_loss_clip": 3.32421875, + "router_z_loss_mlp": 0.40478516, + "step": 2533, + "time_per_iteration": 2.642825126647949 + }, + { + "auxiliary_loss_clip": 0.01628306, + "auxiliary_loss_mlp": 0.0025612, + "balance_loss_clip": 1.28857446, + "balance_loss_mlp": 0.21585152, + "epoch": 0.1523523222606343, + "flos": 18217806520320.0, + "grad_norm": 101.29676945424707, + "language_loss": 0.90375048, + "learning_rate": 3.84500862231636e-06, + "loss": 0.92259479, + "num_input_tokens_seen": 55075525, + "router_z_loss_clip": 3.40234375, + "router_z_loss_mlp": 0.40283203, + "step": 2534, + "time_per_iteration": 2.6192991733551025 + }, + { + "auxiliary_loss_clip": 0.01640767, + "auxiliary_loss_mlp": 0.00253091, + "balance_loss_clip": 1.2933526, + "balance_loss_mlp": 0.21160659, + "epoch": 0.15241244551330227, + "flos": 13260087642240.0, + "grad_norm": 5.237232206398613, + "language_loss": 0.8798399, + "learning_rate": 3.844858260274702e-06, + "loss": 0.8987785, + "num_input_tokens_seen": 55090845, + "router_z_loss_clip": 3.47265625, + "router_z_loss_mlp": 0.41479492, + "step": 2535, + "time_per_iteration": 2.6173343658447266 + }, + { + "auxiliary_loss_clip": 0.01629169, + "auxiliary_loss_mlp": 0.00281706, + "balance_loss_clip": 1.28631902, + "balance_loss_mlp": 0.23790854, + "epoch": 0.15247256876597023, + "flos": 19715568854400.0, + "grad_norm": 798.6603842417262, + "language_loss": 0.86047918, + "learning_rate": 3.844707828275835e-06, + "loss": 0.87958789, + "num_input_tokens_seen": 55108750, + "router_z_loss_clip": 3.42773438, + "router_z_loss_mlp": 0.43774414, + "step": 2536, + "time_per_iteration": 2.5932931900024414 + }, + { + "auxiliary_loss_clip": 0.01611211, + "auxiliary_loss_mlp": 0.00254777, + "balance_loss_clip": 1.2795465, + "balance_loss_mlp": 0.21128914, + "epoch": 0.1525326920186382, + "flos": 20375858615040.0, + "grad_norm": 20.9788629796572, + "language_loss": 0.8418116, + "learning_rate": 3.844557326325461e-06, + "loss": 0.86047149, + "num_input_tokens_seen": 55126750, + "router_z_loss_clip": 3.31640625, + "router_z_loss_mlp": 0.43457031, + "step": 2537, + "time_per_iteration": 2.6311850547790527 + }, + { + "auxiliary_loss_clip": 0.01624565, + "auxiliary_loss_mlp": 0.00263381, + "balance_loss_clip": 1.28702021, + "balance_loss_mlp": 0.21753323, + "epoch": 0.15259281527130616, + "flos": 13589963170560.0, + "grad_norm": 9.886280834137322, + "language_loss": 0.87735939, + "learning_rate": 3.8444067544292896e-06, + "loss": 0.8962388, + "num_input_tokens_seen": 55144690, + "router_z_loss_clip": 3.37695312, + "router_z_loss_mlp": 0.45874023, + "step": 2538, + "time_per_iteration": 2.5921809673309326 + }, + { + "auxiliary_loss_clip": 0.01612354, + "auxiliary_loss_mlp": 0.0023165, + "balance_loss_clip": 1.27915776, + "balance_loss_mlp": 0.18906876, + "epoch": 0.15265293852397416, + "flos": 22860374446080.0, + "grad_norm": 10.75103225888784, + "language_loss": 0.95733005, + "learning_rate": 3.844256112593029e-06, + "loss": 0.97577006, + "num_input_tokens_seen": 55166055, + "router_z_loss_clip": 3.33398438, + "router_z_loss_mlp": 0.42602539, + "step": 2539, + "time_per_iteration": 2.721728801727295 + }, + { + "auxiliary_loss_clip": 0.01635738, + "auxiliary_loss_mlp": 0.00252977, + "balance_loss_clip": 1.30021596, + "balance_loss_mlp": 0.20760572, + "epoch": 0.15271306177664212, + "flos": 29238108670080.0, + "grad_norm": 8.476933983522327, + "language_loss": 0.99781525, + "learning_rate": 3.844105400822391e-06, + "loss": 1.01670241, + "num_input_tokens_seen": 55186285, + "router_z_loss_clip": 3.35546875, + "router_z_loss_mlp": 0.45410156, + "step": 2540, + "time_per_iteration": 2.7217564582824707 + }, + { + "auxiliary_loss_clip": 0.01626099, + "auxiliary_loss_mlp": 0.00250871, + "balance_loss_clip": 1.28776598, + "balance_loss_mlp": 0.21017267, + "epoch": 0.1527731850293101, + "flos": 31246269310080.0, + "grad_norm": 15.218628472577134, + "language_loss": 0.816257, + "learning_rate": 3.843954619123092e-06, + "loss": 0.83502674, + "num_input_tokens_seen": 55207915, + "router_z_loss_clip": 3.38671875, + "router_z_loss_mlp": 0.40698242, + "step": 2541, + "time_per_iteration": 2.7615857124328613 + }, + { + "auxiliary_loss_clip": 0.01608346, + "auxiliary_loss_mlp": 0.00273214, + "balance_loss_clip": 1.27522755, + "balance_loss_mlp": 0.2297266, + "epoch": 0.15283330828197805, + "flos": 22382079920640.0, + "grad_norm": 167.74467643550037, + "language_loss": 0.8777352, + "learning_rate": 3.84380376750085e-06, + "loss": 0.89655077, + "num_input_tokens_seen": 55227860, + "router_z_loss_clip": 3.33007812, + "router_z_loss_mlp": 0.43505859, + "step": 2542, + "time_per_iteration": 2.68774676322937 + }, + { + "auxiliary_loss_clip": 0.01636098, + "auxiliary_loss_mlp": 0.00251484, + "balance_loss_clip": 1.29186249, + "balance_loss_mlp": 0.20446792, + "epoch": 0.15289343153464602, + "flos": 25520133755520.0, + "grad_norm": 2.814139679603823, + "language_loss": 0.86786318, + "learning_rate": 3.843652845961383e-06, + "loss": 0.88673902, + "num_input_tokens_seen": 55247330, + "router_z_loss_clip": 3.44335938, + "router_z_loss_mlp": 0.47021484, + "step": 2543, + "time_per_iteration": 2.6428306102752686 + }, + { + "auxiliary_loss_clip": 0.01614323, + "auxiliary_loss_mlp": 0.00215879, + "balance_loss_clip": 1.28177977, + "balance_loss_mlp": 0.17310674, + "epoch": 0.15295355478731398, + "flos": 22710016114560.0, + "grad_norm": 8.918182586822097, + "language_loss": 0.94078326, + "learning_rate": 3.843501854510416e-06, + "loss": 0.95908529, + "num_input_tokens_seen": 55266195, + "router_z_loss_clip": 3.33007812, + "router_z_loss_mlp": 0.42797852, + "step": 2544, + "time_per_iteration": 2.681633472442627 + }, + { + "auxiliary_loss_clip": 0.01632535, + "auxiliary_loss_mlp": 0.00266262, + "balance_loss_clip": 1.28883576, + "balance_loss_mlp": 0.21583624, + "epoch": 0.15301367803998198, + "flos": 23251907669760.0, + "grad_norm": 2.423267833349333, + "language_loss": 0.9081493, + "learning_rate": 3.843350793153673e-06, + "loss": 0.92713726, + "num_input_tokens_seen": 55283305, + "router_z_loss_clip": 3.4375, + "router_z_loss_mlp": 0.50439453, + "step": 2545, + "time_per_iteration": 2.6305911540985107 + }, + { + "auxiliary_loss_clip": 0.01629965, + "auxiliary_loss_mlp": 0.0023342, + "balance_loss_clip": 1.29757571, + "balance_loss_mlp": 0.18578373, + "epoch": 0.15307380129264994, + "flos": 25886279041920.0, + "grad_norm": 15.931722474072362, + "language_loss": 0.78658283, + "learning_rate": 3.843199661896884e-06, + "loss": 0.80521667, + "num_input_tokens_seen": 55303035, + "router_z_loss_clip": 3.32421875, + "router_z_loss_mlp": 0.4765625, + "step": 2546, + "time_per_iteration": 2.7246267795562744 + }, + { + "auxiliary_loss_clip": 0.01616977, + "auxiliary_loss_mlp": 0.00267973, + "balance_loss_clip": 1.28204763, + "balance_loss_mlp": 0.22098053, + "epoch": 0.1531339245453179, + "flos": 46973239205760.0, + "grad_norm": 2.6994356513097184, + "language_loss": 0.83969665, + "learning_rate": 3.843048460745779e-06, + "loss": 0.8585462, + "num_input_tokens_seen": 55327570, + "router_z_loss_clip": 3.35351562, + "router_z_loss_mlp": 0.46948242, + "step": 2547, + "time_per_iteration": 2.934541940689087 + }, + { + "auxiliary_loss_clip": 0.01634875, + "auxiliary_loss_mlp": 0.00249529, + "balance_loss_clip": 1.29406452, + "balance_loss_mlp": 0.20308527, + "epoch": 0.15319404779798587, + "flos": 35882049565440.0, + "grad_norm": 21.148930899683883, + "language_loss": 0.8371681, + "learning_rate": 3.842897189706092e-06, + "loss": 0.85601217, + "num_input_tokens_seen": 55351090, + "router_z_loss_clip": 3.41210938, + "router_z_loss_mlp": 0.46459961, + "step": 2548, + "time_per_iteration": 2.8117611408233643 + }, + { + "auxiliary_loss_clip": 0.01595455, + "auxiliary_loss_mlp": 0.00226833, + "balance_loss_clip": 1.26647949, + "balance_loss_mlp": 0.17755231, + "epoch": 0.15325417105065384, + "flos": 25664638170240.0, + "grad_norm": 1.9856712484717172, + "language_loss": 0.87923318, + "learning_rate": 3.842745848783558e-06, + "loss": 0.89745611, + "num_input_tokens_seen": 55371050, + "router_z_loss_clip": 3.2890625, + "router_z_loss_mlp": 0.49267578, + "step": 2549, + "time_per_iteration": 2.6900737285614014 + }, + { + "auxiliary_loss_clip": 0.01583488, + "auxiliary_loss_mlp": 0.00231177, + "balance_loss_clip": 1.25284767, + "balance_loss_mlp": 0.17948794, + "epoch": 0.1533142943033218, + "flos": 18770831291520.0, + "grad_norm": 3.098756463030102, + "language_loss": 0.8176403, + "learning_rate": 3.842594437983917e-06, + "loss": 0.835787, + "num_input_tokens_seen": 55390375, + "router_z_loss_clip": 3.30859375, + "router_z_loss_mlp": 0.51708984, + "step": 2550, + "time_per_iteration": 2.61804461479187 + }, + { + "auxiliary_loss_clip": 0.01592688, + "auxiliary_loss_mlp": 0.00229981, + "balance_loss_clip": 1.25931847, + "balance_loss_mlp": 0.18170102, + "epoch": 0.15337441755598977, + "flos": 23107367341440.0, + "grad_norm": 47.74865767098378, + "language_loss": 0.86577916, + "learning_rate": 3.8424429573129115e-06, + "loss": 0.88400578, + "num_input_tokens_seen": 55408890, + "router_z_loss_clip": 3.3359375, + "router_z_loss_mlp": 0.48266602, + "step": 2551, + "time_per_iteration": 4.052828073501587 + }, + { + "auxiliary_loss_clip": 0.01485482, + "auxiliary_loss_mlp": 0.00099328, + "balance_loss_clip": 1.25168014, + "balance_loss_mlp": 0.07911022, + "epoch": 0.15343454080865776, + "flos": 59861079227520.0, + "grad_norm": 0.9186994025996846, + "language_loss": 0.56737566, + "learning_rate": 3.842291406776283e-06, + "loss": 0.58322382, + "num_input_tokens_seen": 55463815, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.20214844, + "step": 2552, + "time_per_iteration": 3.0606324672698975 + }, + { + "auxiliary_loss_clip": 0.01570777, + "auxiliary_loss_mlp": 0.00279082, + "balance_loss_clip": 1.24603093, + "balance_loss_mlp": 0.23568988, + "epoch": 0.15349466406132573, + "flos": 11910887959680.0, + "grad_norm": 6.884758469108168, + "language_loss": 0.972067, + "learning_rate": 3.84213978637978e-06, + "loss": 0.99056554, + "num_input_tokens_seen": 55481050, + "router_z_loss_clip": 3.24804688, + "router_z_loss_mlp": 0.43408203, + "step": 2553, + "time_per_iteration": 2.6267240047454834 + }, + { + "auxiliary_loss_clip": 0.01561026, + "auxiliary_loss_mlp": 0.00260332, + "balance_loss_clip": 1.24011159, + "balance_loss_mlp": 0.21517539, + "epoch": 0.1535547873139937, + "flos": 24096922099200.0, + "grad_norm": 9.921957576485191, + "language_loss": 0.85563958, + "learning_rate": 3.841988096129152e-06, + "loss": 0.87385321, + "num_input_tokens_seen": 55500050, + "router_z_loss_clip": 3.21484375, + "router_z_loss_mlp": 0.4519043, + "step": 2554, + "time_per_iteration": 4.165403127670288 + }, + { + "auxiliary_loss_clip": 0.01541127, + "auxiliary_loss_mlp": 0.00307741, + "balance_loss_clip": 1.2266022, + "balance_loss_mlp": 0.26809162, + "epoch": 0.15361491056666166, + "flos": 17566459246080.0, + "grad_norm": 39.73989395282854, + "language_loss": 0.86981714, + "learning_rate": 3.841836336030151e-06, + "loss": 0.88830578, + "num_input_tokens_seen": 55518125, + "router_z_loss_clip": 3.14648438, + "router_z_loss_mlp": 0.39599609, + "step": 2555, + "time_per_iteration": 2.583627462387085 + }, + { + "auxiliary_loss_clip": 0.01524074, + "auxiliary_loss_mlp": 0.00262746, + "balance_loss_clip": 1.21455228, + "balance_loss_mlp": 0.22283429, + "epoch": 0.15367503381932962, + "flos": 25046041121280.0, + "grad_norm": 36.16690594957559, + "language_loss": 0.83966595, + "learning_rate": 3.8416845060885305e-06, + "loss": 0.85753411, + "num_input_tokens_seen": 55540960, + "router_z_loss_clip": 3.1015625, + "router_z_loss_mlp": 0.39916992, + "step": 2556, + "time_per_iteration": 2.734755039215088 + }, + { + "auxiliary_loss_clip": 0.01516425, + "auxiliary_loss_mlp": 0.00280889, + "balance_loss_clip": 1.21158671, + "balance_loss_mlp": 0.24376762, + "epoch": 0.15373515707199759, + "flos": 21507332008320.0, + "grad_norm": 903.3952187479553, + "language_loss": 0.98869383, + "learning_rate": 3.84153260631005e-06, + "loss": 1.00666702, + "num_input_tokens_seen": 55559210, + "router_z_loss_clip": 3.04882812, + "router_z_loss_mlp": 0.37109375, + "step": 2557, + "time_per_iteration": 4.029478073120117 + }, + { + "auxiliary_loss_clip": 0.01509724, + "auxiliary_loss_mlp": 0.00275112, + "balance_loss_clip": 1.20548248, + "balance_loss_mlp": 0.23837146, + "epoch": 0.15379528032466555, + "flos": 25994729180160.0, + "grad_norm": 30.307277811038187, + "language_loss": 0.7766034, + "learning_rate": 3.841380636700468e-06, + "loss": 0.79445171, + "num_input_tokens_seen": 55578925, + "router_z_loss_clip": 3.04296875, + "router_z_loss_mlp": 0.36743164, + "step": 2558, + "time_per_iteration": 2.6918795108795166 + }, + { + "auxiliary_loss_clip": 0.01509371, + "auxiliary_loss_mlp": 0.00299406, + "balance_loss_clip": 1.20460677, + "balance_loss_mlp": 0.26307061, + "epoch": 0.15385540357733354, + "flos": 19277315015040.0, + "grad_norm": 4.803617757527174, + "language_loss": 1.00611627, + "learning_rate": 3.841228597265548e-06, + "loss": 1.02420402, + "num_input_tokens_seen": 55597255, + "router_z_loss_clip": 3.05078125, + "router_z_loss_mlp": 0.36303711, + "step": 2559, + "time_per_iteration": 2.6553070545196533 + }, + { + "auxiliary_loss_clip": 0.01495525, + "auxiliary_loss_mlp": 0.00310812, + "balance_loss_clip": 1.19795132, + "balance_loss_mlp": 0.27671832, + "epoch": 0.1539155268300015, + "flos": 28549126920960.0, + "grad_norm": 7.653277802502302, + "language_loss": 0.74739462, + "learning_rate": 3.841076488011055e-06, + "loss": 0.76545799, + "num_input_tokens_seen": 55619515, + "router_z_loss_clip": 2.97460938, + "router_z_loss_mlp": 0.34106445, + "step": 2560, + "time_per_iteration": 2.739847421646118 + }, + { + "auxiliary_loss_clip": 0.01485899, + "auxiliary_loss_mlp": 0.00309504, + "balance_loss_clip": 1.18430352, + "balance_loss_mlp": 0.27393243, + "epoch": 0.15397565008266947, + "flos": 23547883737600.0, + "grad_norm": 6.991742381012541, + "language_loss": 0.95442653, + "learning_rate": 3.8409243089427574e-06, + "loss": 0.97238052, + "num_input_tokens_seen": 55640050, + "router_z_loss_clip": 3.01367188, + "router_z_loss_mlp": 0.35571289, + "step": 2561, + "time_per_iteration": 2.6655654907226562 + }, + { + "auxiliary_loss_clip": 0.01489339, + "auxiliary_loss_mlp": 0.00313716, + "balance_loss_clip": 1.1975311, + "balance_loss_mlp": 0.28152919, + "epoch": 0.15403577333533744, + "flos": 17129821518720.0, + "grad_norm": 21.602920206214563, + "language_loss": 0.90192831, + "learning_rate": 3.840772060066425e-06, + "loss": 0.91995889, + "num_input_tokens_seen": 55658695, + "router_z_loss_clip": 2.91796875, + "router_z_loss_mlp": 0.32177734, + "step": 2562, + "time_per_iteration": 2.620293140411377 + }, + { + "auxiliary_loss_clip": 0.01485022, + "auxiliary_loss_mlp": 0.00350451, + "balance_loss_clip": 1.18550348, + "balance_loss_mlp": 0.31576061, + "epoch": 0.1540958965880054, + "flos": 17894503180800.0, + "grad_norm": 4.98717428674485, + "language_loss": 0.8547194, + "learning_rate": 3.840619741387832e-06, + "loss": 0.87307417, + "num_input_tokens_seen": 55676340, + "router_z_loss_clip": 2.99609375, + "router_z_loss_mlp": 0.34741211, + "step": 2563, + "time_per_iteration": 2.605706214904785 + }, + { + "auxiliary_loss_clip": 0.01479408, + "auxiliary_loss_mlp": 0.00332988, + "balance_loss_clip": 1.18054247, + "balance_loss_mlp": 0.30077717, + "epoch": 0.15415601984067337, + "flos": 32161057908480.0, + "grad_norm": 23.61477204818768, + "language_loss": 0.86962521, + "learning_rate": 3.8404673529127534e-06, + "loss": 0.8877492, + "num_input_tokens_seen": 55698890, + "router_z_loss_clip": 2.9921875, + "router_z_loss_mlp": 0.32202148, + "step": 2564, + "time_per_iteration": 2.7559595108032227 + }, + { + "auxiliary_loss_clip": 0.01475131, + "auxiliary_loss_mlp": 0.00325261, + "balance_loss_clip": 1.1828655, + "balance_loss_mlp": 0.29080963, + "epoch": 0.15421614309334136, + "flos": 24024418496640.0, + "grad_norm": 2.8614464463678955, + "language_loss": 0.78839713, + "learning_rate": 3.840314894646969e-06, + "loss": 0.80640107, + "num_input_tokens_seen": 55718535, + "router_z_loss_clip": 2.921875, + "router_z_loss_mlp": 0.34448242, + "step": 2565, + "time_per_iteration": 2.6494717597961426 + }, + { + "auxiliary_loss_clip": 0.01475977, + "auxiliary_loss_mlp": 0.00332215, + "balance_loss_clip": 1.18491316, + "balance_loss_mlp": 0.30019492, + "epoch": 0.15427626634600933, + "flos": 24386290064640.0, + "grad_norm": 50.414544979775535, + "language_loss": 0.79050028, + "learning_rate": 3.840162366596259e-06, + "loss": 0.80858219, + "num_input_tokens_seen": 55738970, + "router_z_loss_clip": 2.9140625, + "router_z_loss_mlp": 0.3203125, + "step": 2566, + "time_per_iteration": 2.673837661743164 + }, + { + "auxiliary_loss_clip": 0.01458597, + "auxiliary_loss_mlp": 0.00315778, + "balance_loss_clip": 1.1742897, + "balance_loss_mlp": 0.28442585, + "epoch": 0.1543363895986773, + "flos": 23331522165120.0, + "grad_norm": 6.988010211601425, + "language_loss": 0.9099223, + "learning_rate": 3.840009768766408e-06, + "loss": 0.92766607, + "num_input_tokens_seen": 55759585, + "router_z_loss_clip": 2.84179688, + "router_z_loss_mlp": 0.31347656, + "step": 2567, + "time_per_iteration": 2.635493755340576 + }, + { + "auxiliary_loss_clip": 0.01455228, + "auxiliary_loss_mlp": 0.00315324, + "balance_loss_clip": 1.16958523, + "balance_loss_mlp": 0.28216028, + "epoch": 0.15439651285134526, + "flos": 24274284480000.0, + "grad_norm": 2.752623766419202, + "language_loss": 0.86643553, + "learning_rate": 3.839857101163202e-06, + "loss": 0.88414109, + "num_input_tokens_seen": 55779250, + "router_z_loss_clip": 2.859375, + "router_z_loss_mlp": 0.33154297, + "step": 2568, + "time_per_iteration": 2.68095064163208 + }, + { + "auxiliary_loss_clip": 0.01467577, + "auxiliary_loss_mlp": 0.00343389, + "balance_loss_clip": 1.18275833, + "balance_loss_mlp": 0.31148869, + "epoch": 0.15445663610401322, + "flos": 22456163721600.0, + "grad_norm": 40.824314213216915, + "language_loss": 0.77444482, + "learning_rate": 3.83970436379243e-06, + "loss": 0.7925545, + "num_input_tokens_seen": 55800470, + "router_z_loss_clip": 2.84960938, + "router_z_loss_mlp": 0.3190918, + "step": 2569, + "time_per_iteration": 2.681640625 + }, + { + "auxiliary_loss_clip": 0.01452415, + "auxiliary_loss_mlp": 0.00344314, + "balance_loss_clip": 1.16895247, + "balance_loss_mlp": 0.31403527, + "epoch": 0.1545167593566812, + "flos": 22049510872320.0, + "grad_norm": 3.413883933694002, + "language_loss": 0.83541799, + "learning_rate": 3.839551556659884e-06, + "loss": 0.85338533, + "num_input_tokens_seen": 55817795, + "router_z_loss_clip": 2.83398438, + "router_z_loss_mlp": 0.30249023, + "step": 2570, + "time_per_iteration": 2.693608283996582 + }, + { + "auxiliary_loss_clip": 0.01457221, + "auxiliary_loss_mlp": 0.0029191, + "balance_loss_clip": 1.17364204, + "balance_loss_mlp": 0.26170245, + "epoch": 0.15457688260934915, + "flos": 19318253541120.0, + "grad_norm": 134.60965151379412, + "language_loss": 0.86954844, + "learning_rate": 3.839398679771359e-06, + "loss": 0.88703978, + "num_input_tokens_seen": 55836125, + "router_z_loss_clip": 2.83789062, + "router_z_loss_mlp": 0.30200195, + "step": 2571, + "time_per_iteration": 2.586700916290283 + }, + { + "auxiliary_loss_clip": 0.0145876, + "auxiliary_loss_mlp": 0.00307149, + "balance_loss_clip": 1.17471766, + "balance_loss_mlp": 0.27369875, + "epoch": 0.15463700586201715, + "flos": 24133981956480.0, + "grad_norm": 10.681428776470964, + "language_loss": 0.89696288, + "learning_rate": 3.839245733132652e-06, + "loss": 0.91462195, + "num_input_tokens_seen": 55855280, + "router_z_loss_clip": 2.84179688, + "router_z_loss_mlp": 0.33447266, + "step": 2572, + "time_per_iteration": 2.7055938243865967 + }, + { + "auxiliary_loss_clip": 0.01465159, + "auxiliary_loss_mlp": 0.00346388, + "balance_loss_clip": 1.17677069, + "balance_loss_mlp": 0.31360537, + "epoch": 0.1546971291146851, + "flos": 22420935457920.0, + "grad_norm": 14.416738600053371, + "language_loss": 0.95889413, + "learning_rate": 3.839092716749563e-06, + "loss": 0.97700959, + "num_input_tokens_seen": 55875695, + "router_z_loss_clip": 2.8828125, + "router_z_loss_mlp": 0.328125, + "step": 2573, + "time_per_iteration": 2.6784615516662598 + }, + { + "auxiliary_loss_clip": 0.01465327, + "auxiliary_loss_mlp": 0.00295428, + "balance_loss_clip": 1.17368364, + "balance_loss_mlp": 0.26507744, + "epoch": 0.15475725236735308, + "flos": 17530225401600.0, + "grad_norm": 431.44591558706276, + "language_loss": 0.78680819, + "learning_rate": 3.838939630627893e-06, + "loss": 0.80441576, + "num_input_tokens_seen": 55894575, + "router_z_loss_clip": 2.91601562, + "router_z_loss_mlp": 0.3034668, + "step": 2574, + "time_per_iteration": 2.7049922943115234 + }, + { + "auxiliary_loss_clip": 0.01461284, + "auxiliary_loss_mlp": 0.00269148, + "balance_loss_clip": 1.17161691, + "balance_loss_mlp": 0.23872612, + "epoch": 0.15481737562002104, + "flos": 22561740771840.0, + "grad_norm": 11.52945808412478, + "language_loss": 0.88106215, + "learning_rate": 3.838786474773448e-06, + "loss": 0.89836645, + "num_input_tokens_seen": 55912855, + "router_z_loss_clip": 2.8984375, + "router_z_loss_mlp": 0.30419922, + "step": 2575, + "time_per_iteration": 2.6296682357788086 + }, + { + "auxiliary_loss_clip": 0.01480177, + "auxiliary_loss_mlp": 0.00314932, + "balance_loss_clip": 1.18693161, + "balance_loss_mlp": 0.28410494, + "epoch": 0.154877498872689, + "flos": 24900567039360.0, + "grad_norm": 21.16407426982429, + "language_loss": 0.89805096, + "learning_rate": 3.838633249192036e-06, + "loss": 0.91600204, + "num_input_tokens_seen": 55932375, + "router_z_loss_clip": 2.93164062, + "router_z_loss_mlp": 0.30786133, + "step": 2576, + "time_per_iteration": 2.6712937355041504 + }, + { + "auxiliary_loss_clip": 0.01486819, + "auxiliary_loss_mlp": 0.00346065, + "balance_loss_clip": 1.19123673, + "balance_loss_mlp": 0.31397378, + "epoch": 0.15493762212535697, + "flos": 28147501975680.0, + "grad_norm": 12.330311065628313, + "language_loss": 0.89937872, + "learning_rate": 3.838479953889465e-06, + "loss": 0.91770756, + "num_input_tokens_seen": 55953970, + "router_z_loss_clip": 2.95507812, + "router_z_loss_mlp": 0.32104492, + "step": 2577, + "time_per_iteration": 2.6715023517608643 + }, + { + "auxiliary_loss_clip": 0.01509158, + "auxiliary_loss_mlp": 0.00372982, + "balance_loss_clip": 1.20803463, + "balance_loss_mlp": 0.34288186, + "epoch": 0.15499774537802496, + "flos": 25411073086080.0, + "grad_norm": 13.952922026592379, + "language_loss": 0.87261856, + "learning_rate": 3.8383265888715525e-06, + "loss": 0.89143991, + "num_input_tokens_seen": 55973120, + "router_z_loss_clip": 3.01171875, + "router_z_loss_mlp": 0.30114746, + "step": 2578, + "time_per_iteration": 2.6950864791870117 + }, + { + "auxiliary_loss_clip": 0.01499788, + "auxiliary_loss_mlp": 0.00308255, + "balance_loss_clip": 1.19632804, + "balance_loss_mlp": 0.27723712, + "epoch": 0.15505786863069293, + "flos": 22091562720000.0, + "grad_norm": 3.2017028943249892, + "language_loss": 0.88576221, + "learning_rate": 3.83817315414411e-06, + "loss": 0.90384269, + "num_input_tokens_seen": 55993260, + "router_z_loss_clip": 3.03320312, + "router_z_loss_mlp": 0.31054688, + "step": 2579, + "time_per_iteration": 2.598050355911255 + }, + { + "auxiliary_loss_clip": 0.01502329, + "auxiliary_loss_mlp": 0.00293024, + "balance_loss_clip": 1.19733095, + "balance_loss_mlp": 0.25888291, + "epoch": 0.1551179918833609, + "flos": 18917131386240.0, + "grad_norm": 154.83660107877932, + "language_loss": 0.8632561, + "learning_rate": 3.838019649712958e-06, + "loss": 0.88120967, + "num_input_tokens_seen": 56012130, + "router_z_loss_clip": 3.05078125, + "router_z_loss_mlp": 0.34106445, + "step": 2580, + "time_per_iteration": 2.6332767009735107 + }, + { + "auxiliary_loss_clip": 0.01485671, + "auxiliary_loss_mlp": 0.00115499, + "balance_loss_clip": 1.24617839, + "balance_loss_mlp": 0.10090782, + "epoch": 0.15517811513602886, + "flos": 66239172587520.0, + "grad_norm": 0.8945207642247979, + "language_loss": 0.5908972, + "learning_rate": 3.8378660755839166e-06, + "loss": 0.60690892, + "num_input_tokens_seen": 56079045, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.14550781, + "step": 2581, + "time_per_iteration": 3.2989563941955566 + }, + { + "auxiliary_loss_clip": 0.01525063, + "auxiliary_loss_mlp": 0.00292339, + "balance_loss_clip": 1.21906114, + "balance_loss_mlp": 0.26046211, + "epoch": 0.15523823838869683, + "flos": 24021078531840.0, + "grad_norm": 3.9832219091750147, + "language_loss": 0.91446042, + "learning_rate": 3.8377124317628095e-06, + "loss": 0.93263447, + "num_input_tokens_seen": 56098745, + "router_z_loss_clip": 3.05859375, + "router_z_loss_mlp": 0.3190918, + "step": 2582, + "time_per_iteration": 2.6220510005950928 + }, + { + "auxiliary_loss_clip": 0.01540849, + "auxiliary_loss_mlp": 0.00308837, + "balance_loss_clip": 1.22973299, + "balance_loss_mlp": 0.27588743, + "epoch": 0.1552983616413648, + "flos": 20485062938880.0, + "grad_norm": 58.792296300678316, + "language_loss": 0.86030918, + "learning_rate": 3.8375587182554625e-06, + "loss": 0.87880599, + "num_input_tokens_seen": 56117655, + "router_z_loss_clip": 3.11328125, + "router_z_loss_mlp": 0.3293457, + "step": 2583, + "time_per_iteration": 2.6678519248962402 + }, + { + "auxiliary_loss_clip": 0.01526271, + "auxiliary_loss_mlp": 0.00378041, + "balance_loss_clip": 1.22040606, + "balance_loss_mlp": 0.34382787, + "epoch": 0.15535848489403276, + "flos": 32123710742400.0, + "grad_norm": 21.634238965583553, + "language_loss": 0.82757521, + "learning_rate": 3.837404935067705e-06, + "loss": 0.84661829, + "num_input_tokens_seen": 56141960, + "router_z_loss_clip": 3.05664062, + "router_z_loss_mlp": 0.34204102, + "step": 2584, + "time_per_iteration": 2.7165346145629883 + }, + { + "auxiliary_loss_clip": 0.0155321, + "auxiliary_loss_mlp": 0.00350123, + "balance_loss_clip": 1.24243534, + "balance_loss_mlp": 0.31734061, + "epoch": 0.15541860814670075, + "flos": 19098444263040.0, + "grad_norm": 27.808968027693233, + "language_loss": 0.84162283, + "learning_rate": 3.837251082205368e-06, + "loss": 0.86065614, + "num_input_tokens_seen": 56161430, + "router_z_loss_clip": 3.10742188, + "router_z_loss_mlp": 0.328125, + "step": 2585, + "time_per_iteration": 2.68513560295105 + }, + { + "auxiliary_loss_clip": 0.01541487, + "auxiliary_loss_mlp": 0.00326095, + "balance_loss_clip": 1.23273206, + "balance_loss_mlp": 0.29250151, + "epoch": 0.1554787313993687, + "flos": 19172097100800.0, + "grad_norm": 31.06605321888724, + "language_loss": 0.70684701, + "learning_rate": 3.837097159674286e-06, + "loss": 0.72552288, + "num_input_tokens_seen": 56179390, + "router_z_loss_clip": 3.08398438, + "router_z_loss_mlp": 0.33569336, + "step": 2586, + "time_per_iteration": 2.703218698501587 + }, + { + "auxiliary_loss_clip": 0.01556801, + "auxiliary_loss_mlp": 0.00376244, + "balance_loss_clip": 1.24627686, + "balance_loss_mlp": 0.34141108, + "epoch": 0.15553885465203668, + "flos": 16143822207360.0, + "grad_norm": 16.72471161076826, + "language_loss": 0.88593817, + "learning_rate": 3.836943167480296e-06, + "loss": 0.90526855, + "num_input_tokens_seen": 56198020, + "router_z_loss_clip": 3.10351562, + "router_z_loss_mlp": 0.34790039, + "step": 2587, + "time_per_iteration": 2.722289562225342 + }, + { + "auxiliary_loss_clip": 0.01566025, + "auxiliary_loss_mlp": 0.0039617, + "balance_loss_clip": 1.24897683, + "balance_loss_mlp": 0.35699767, + "epoch": 0.15559897790470464, + "flos": 25337779384320.0, + "grad_norm": 2.7725611997411184, + "language_loss": 0.95930409, + "learning_rate": 3.836789105629236e-06, + "loss": 0.97892606, + "num_input_tokens_seen": 56218165, + "router_z_loss_clip": 3.171875, + "router_z_loss_mlp": 0.39160156, + "step": 2588, + "time_per_iteration": 2.710186004638672 + }, + { + "auxiliary_loss_clip": 0.01567545, + "auxiliary_loss_mlp": 0.00341813, + "balance_loss_clip": 1.2569778, + "balance_loss_mlp": 0.30779094, + "epoch": 0.1556591011573726, + "flos": 23148772744320.0, + "grad_norm": 21.950528294190818, + "language_loss": 0.73083258, + "learning_rate": 3.83663497412695e-06, + "loss": 0.74992621, + "num_input_tokens_seen": 56237160, + "router_z_loss_clip": 3.10742188, + "router_z_loss_mlp": 0.34008789, + "step": 2589, + "time_per_iteration": 2.6287307739257812 + }, + { + "auxiliary_loss_clip": 0.01573898, + "auxiliary_loss_mlp": 0.00335027, + "balance_loss_clip": 1.26334667, + "balance_loss_mlp": 0.30131432, + "epoch": 0.15571922441004057, + "flos": 25370888745600.0, + "grad_norm": 6.498655226574372, + "language_loss": 0.8839978, + "learning_rate": 3.836480772979281e-06, + "loss": 0.90308702, + "num_input_tokens_seen": 56257610, + "router_z_loss_clip": 3.10546875, + "router_z_loss_mlp": 0.33691406, + "step": 2590, + "time_per_iteration": 2.6553735733032227 + }, + { + "auxiliary_loss_clip": 0.01578203, + "auxiliary_loss_mlp": 0.00333866, + "balance_loss_clip": 1.2677635, + "balance_loss_mlp": 0.30141741, + "epoch": 0.15577934766270854, + "flos": 14501375890560.0, + "grad_norm": 12.63559943265338, + "language_loss": 0.88026792, + "learning_rate": 3.836326502192077e-06, + "loss": 0.89938861, + "num_input_tokens_seen": 56275215, + "router_z_loss_clip": 3.10351562, + "router_z_loss_mlp": 0.32446289, + "step": 2591, + "time_per_iteration": 2.639390468597412 + }, + { + "auxiliary_loss_clip": 0.01576345, + "auxiliary_loss_mlp": 0.00316978, + "balance_loss_clip": 1.26633525, + "balance_loss_mlp": 0.28524399, + "epoch": 0.15583947091537653, + "flos": 37414537372800.0, + "grad_norm": 8.264318151868066, + "language_loss": 0.74604517, + "learning_rate": 3.836172161771189e-06, + "loss": 0.76497835, + "num_input_tokens_seen": 56297130, + "router_z_loss_clip": 3.09960938, + "router_z_loss_mlp": 0.31762695, + "step": 2592, + "time_per_iteration": 2.8106682300567627 + }, + { + "auxiliary_loss_clip": 0.01595681, + "auxiliary_loss_mlp": 0.00328823, + "balance_loss_clip": 1.27481735, + "balance_loss_mlp": 0.29554003, + "epoch": 0.1558995941680445, + "flos": 21834729498240.0, + "grad_norm": 11.934889752563848, + "language_loss": 0.90146577, + "learning_rate": 3.836017751722467e-06, + "loss": 0.9207108, + "num_input_tokens_seen": 56314995, + "router_z_loss_clip": 3.20898438, + "router_z_loss_mlp": 0.33276367, + "step": 2593, + "time_per_iteration": 4.023046970367432 + }, + { + "auxiliary_loss_clip": 0.0159261, + "auxiliary_loss_mlp": 0.00363401, + "balance_loss_clip": 1.28128147, + "balance_loss_mlp": 0.32787675, + "epoch": 0.15595971742071246, + "flos": 19792633484160.0, + "grad_norm": 2.7458738174247186, + "language_loss": 0.78902453, + "learning_rate": 3.8358632720517695e-06, + "loss": 0.80858457, + "num_input_tokens_seen": 56334005, + "router_z_loss_clip": 3.1171875, + "router_z_loss_mlp": 0.35522461, + "step": 2594, + "time_per_iteration": 2.635490655899048 + }, + { + "auxiliary_loss_clip": 0.0158967, + "auxiliary_loss_mlp": 0.0030033, + "balance_loss_clip": 1.2820003, + "balance_loss_mlp": 0.26730919, + "epoch": 0.15601984067338043, + "flos": 26722135503360.0, + "grad_norm": 3.0463256708396598, + "language_loss": 0.87282169, + "learning_rate": 3.835708722764952e-06, + "loss": 0.89172173, + "num_input_tokens_seen": 56353795, + "router_z_loss_clip": 3.07617188, + "router_z_loss_mlp": 0.33007812, + "step": 2595, + "time_per_iteration": 2.770974636077881 + }, + { + "auxiliary_loss_clip": 0.01579849, + "auxiliary_loss_mlp": 0.00317172, + "balance_loss_clip": 1.26811373, + "balance_loss_mlp": 0.28281629, + "epoch": 0.1560799639260484, + "flos": 18369278173440.0, + "grad_norm": 11.410284020540507, + "language_loss": 0.93354243, + "learning_rate": 3.835554103867876e-06, + "loss": 0.95251262, + "num_input_tokens_seen": 56373195, + "router_z_loss_clip": 3.11523438, + "router_z_loss_mlp": 0.34326172, + "step": 2596, + "time_per_iteration": 4.137763500213623 + }, + { + "auxiliary_loss_clip": 0.01586004, + "auxiliary_loss_mlp": 0.00297567, + "balance_loss_clip": 1.27753186, + "balance_loss_mlp": 0.26440316, + "epoch": 0.15614008717871636, + "flos": 22598980197120.0, + "grad_norm": 3.6540057159934807, + "language_loss": 0.73644364, + "learning_rate": 3.835399415366404e-06, + "loss": 0.75527942, + "num_input_tokens_seen": 56391525, + "router_z_loss_clip": 3.08398438, + "router_z_loss_mlp": 0.33178711, + "step": 2597, + "time_per_iteration": 2.6384782791137695 + }, + { + "auxiliary_loss_clip": 0.01596133, + "auxiliary_loss_mlp": 0.00326302, + "balance_loss_clip": 1.28960323, + "balance_loss_mlp": 0.29485494, + "epoch": 0.15620021043138435, + "flos": 22746860490240.0, + "grad_norm": 5.201426088448285, + "language_loss": 0.85769761, + "learning_rate": 3.8352446572664035e-06, + "loss": 0.87692189, + "num_input_tokens_seen": 56410715, + "router_z_loss_clip": 3.06445312, + "router_z_loss_mlp": 0.31494141, + "step": 2598, + "time_per_iteration": 2.692850351333618 + }, + { + "auxiliary_loss_clip": 0.01600591, + "auxiliary_loss_mlp": 0.00331565, + "balance_loss_clip": 1.29030657, + "balance_loss_mlp": 0.29656544, + "epoch": 0.15626033368405232, + "flos": 13114936782720.0, + "grad_norm": 28.65367191081519, + "language_loss": 0.87953258, + "learning_rate": 3.8350898295737405e-06, + "loss": 0.89885414, + "num_input_tokens_seen": 56429170, + "router_z_loss_clip": 3.1015625, + "router_z_loss_mlp": 0.35009766, + "step": 2599, + "time_per_iteration": 4.153528690338135 + }, + { + "auxiliary_loss_clip": 0.01611025, + "auxiliary_loss_mlp": 0.00322718, + "balance_loss_clip": 1.2950604, + "balance_loss_mlp": 0.2852383, + "epoch": 0.15632045693672028, + "flos": 16472297105280.0, + "grad_norm": 33.551668115540636, + "language_loss": 0.88049442, + "learning_rate": 3.834934932294287e-06, + "loss": 0.89983189, + "num_input_tokens_seen": 56445685, + "router_z_loss_clip": 3.15625, + "router_z_loss_mlp": 0.375, + "step": 2600, + "time_per_iteration": 2.6539618968963623 + }, + { + "auxiliary_loss_clip": 0.01616843, + "auxiliary_loss_mlp": 0.00332869, + "balance_loss_clip": 1.30248582, + "balance_loss_mlp": 0.29784557, + "epoch": 0.15638058018938825, + "flos": 20850346298880.0, + "grad_norm": 14.637595730480623, + "language_loss": 0.94794333, + "learning_rate": 3.834779965433917e-06, + "loss": 0.96744049, + "num_input_tokens_seen": 56465900, + "router_z_loss_clip": 3.14453125, + "router_z_loss_mlp": 0.3503418, + "step": 2601, + "time_per_iteration": 2.680041790008545 + }, + { + "auxiliary_loss_clip": 0.01639848, + "auxiliary_loss_mlp": 0.00311866, + "balance_loss_clip": 1.31595731, + "balance_loss_mlp": 0.27517292, + "epoch": 0.1564407034420562, + "flos": 21872220318720.0, + "grad_norm": 6.253866759679896, + "language_loss": 0.86380893, + "learning_rate": 3.834624928998508e-06, + "loss": 0.88332605, + "num_input_tokens_seen": 56485020, + "router_z_loss_clip": 3.23632812, + "router_z_loss_mlp": 0.36669922, + "step": 2602, + "time_per_iteration": 2.663860321044922 + }, + { + "auxiliary_loss_clip": 0.01631214, + "auxiliary_loss_mlp": 0.00273266, + "balance_loss_clip": 1.30994987, + "balance_loss_mlp": 0.23862353, + "epoch": 0.15650082669472418, + "flos": 21834549930240.0, + "grad_norm": 3.432073245431192, + "language_loss": 0.80171072, + "learning_rate": 3.8344698229939376e-06, + "loss": 0.82075548, + "num_input_tokens_seen": 56505205, + "router_z_loss_clip": 3.21289062, + "router_z_loss_mlp": 0.34619141, + "step": 2603, + "time_per_iteration": 2.6073925495147705 + }, + { + "auxiliary_loss_clip": 0.01621949, + "auxiliary_loss_mlp": 0.002787, + "balance_loss_clip": 1.30390775, + "balance_loss_mlp": 0.24617946, + "epoch": 0.15656094994739214, + "flos": 13800542653440.0, + "grad_norm": 2.782779524868727, + "language_loss": 0.97018343, + "learning_rate": 3.8343146474260865e-06, + "loss": 0.98918986, + "num_input_tokens_seen": 56521495, + "router_z_loss_clip": 3.1796875, + "router_z_loss_mlp": 0.32519531, + "step": 2604, + "time_per_iteration": 2.633498191833496 + }, + { + "auxiliary_loss_clip": 0.01638548, + "auxiliary_loss_mlp": 0.00246522, + "balance_loss_clip": 1.31166792, + "balance_loss_mlp": 0.2111406, + "epoch": 0.15662107320006013, + "flos": 27308197808640.0, + "grad_norm": 510.61866400522115, + "language_loss": 0.91254258, + "learning_rate": 3.834159402300841e-06, + "loss": 0.93139327, + "num_input_tokens_seen": 56540665, + "router_z_loss_clip": 3.27148438, + "router_z_loss_mlp": 0.35375977, + "step": 2605, + "time_per_iteration": 2.6894805431365967 + }, + { + "auxiliary_loss_clip": 0.01642257, + "auxiliary_loss_mlp": 0.00259192, + "balance_loss_clip": 1.3119278, + "balance_loss_mlp": 0.22276208, + "epoch": 0.1566811964527281, + "flos": 26685075646080.0, + "grad_norm": 36.90407084438398, + "language_loss": 0.82168567, + "learning_rate": 3.834004087624087e-06, + "loss": 0.84070015, + "num_input_tokens_seen": 56560805, + "router_z_loss_clip": 3.30273438, + "router_z_loss_mlp": 0.36376953, + "step": 2606, + "time_per_iteration": 2.708345651626587 + }, + { + "auxiliary_loss_clip": 0.01657621, + "auxiliary_loss_mlp": 0.00241075, + "balance_loss_clip": 1.32816553, + "balance_loss_mlp": 0.20593223, + "epoch": 0.15674131970539606, + "flos": 16103422385280.0, + "grad_norm": 9.335648599823031, + "language_loss": 0.83225858, + "learning_rate": 3.8338487034017145e-06, + "loss": 0.85124558, + "num_input_tokens_seen": 56576335, + "router_z_loss_clip": 3.30078125, + "router_z_loss_mlp": 0.35131836, + "step": 2607, + "time_per_iteration": 2.5973219871520996 + }, + { + "auxiliary_loss_clip": 0.01641805, + "auxiliary_loss_mlp": 0.0022514, + "balance_loss_clip": 1.31427228, + "balance_loss_mlp": 0.19126114, + "epoch": 0.15680144295806403, + "flos": 19169690889600.0, + "grad_norm": 27.805256096089785, + "language_loss": 0.88044912, + "learning_rate": 3.833693249639615e-06, + "loss": 0.89911854, + "num_input_tokens_seen": 56595880, + "router_z_loss_clip": 3.27539062, + "router_z_loss_mlp": 0.33886719, + "step": 2608, + "time_per_iteration": 2.6723620891571045 + }, + { + "auxiliary_loss_clip": 0.01648134, + "auxiliary_loss_mlp": 0.0026406, + "balance_loss_clip": 1.31379414, + "balance_loss_mlp": 0.22624652, + "epoch": 0.156861566210732, + "flos": 20813430096000.0, + "grad_norm": 5.41946598638552, + "language_loss": 0.79876554, + "learning_rate": 3.833537726343684e-06, + "loss": 0.81788743, + "num_input_tokens_seen": 56615130, + "router_z_loss_clip": 3.34179688, + "router_z_loss_mlp": 0.37817383, + "step": 2609, + "time_per_iteration": 2.6655771732330322 + }, + { + "auxiliary_loss_clip": 0.01653146, + "auxiliary_loss_mlp": 0.00253563, + "balance_loss_clip": 1.31731677, + "balance_loss_mlp": 0.21844402, + "epoch": 0.15692168946339996, + "flos": 20047922421120.0, + "grad_norm": 21.26977573189445, + "language_loss": 0.83414245, + "learning_rate": 3.833382133519818e-06, + "loss": 0.8532095, + "num_input_tokens_seen": 56634005, + "router_z_loss_clip": 3.359375, + "router_z_loss_mlp": 0.35131836, + "step": 2610, + "time_per_iteration": 2.622344732284546 + }, + { + "auxiliary_loss_clip": 0.01651632, + "auxiliary_loss_mlp": 0.00261938, + "balance_loss_clip": 1.31205845, + "balance_loss_mlp": 0.22016656, + "epoch": 0.15698181271606793, + "flos": 21398019943680.0, + "grad_norm": 2.6199104824434496, + "language_loss": 0.80716926, + "learning_rate": 3.833226471173919e-06, + "loss": 0.82630491, + "num_input_tokens_seen": 56653480, + "router_z_loss_clip": 3.39453125, + "router_z_loss_mlp": 0.41748047, + "step": 2611, + "time_per_iteration": 2.721489429473877 + }, + { + "auxiliary_loss_clip": 0.01650468, + "auxiliary_loss_mlp": 0.00262909, + "balance_loss_clip": 1.31496191, + "balance_loss_mlp": 0.22833827, + "epoch": 0.15704193596873592, + "flos": 20845785271680.0, + "grad_norm": 11.648019938953155, + "language_loss": 0.79038036, + "learning_rate": 3.833070739311887e-06, + "loss": 0.80951416, + "num_input_tokens_seen": 56672270, + "router_z_loss_clip": 3.35546875, + "router_z_loss_mlp": 0.34594727, + "step": 2612, + "time_per_iteration": 2.6574203968048096 + }, + { + "auxiliary_loss_clip": 0.01649243, + "auxiliary_loss_mlp": 0.002369, + "balance_loss_clip": 1.31278896, + "balance_loss_mlp": 0.20373577, + "epoch": 0.15710205922140388, + "flos": 21762908254080.0, + "grad_norm": 187.5337019008567, + "language_loss": 0.84351325, + "learning_rate": 3.83291493793963e-06, + "loss": 0.86237466, + "num_input_tokens_seen": 56691510, + "router_z_loss_clip": 3.36132812, + "router_z_loss_mlp": 0.33154297, + "step": 2613, + "time_per_iteration": 2.7396273612976074 + }, + { + "auxiliary_loss_clip": 0.01643194, + "auxiliary_loss_mlp": 0.00258654, + "balance_loss_clip": 1.30705953, + "balance_loss_mlp": 0.22041139, + "epoch": 0.15716218247407185, + "flos": 25007760201600.0, + "grad_norm": 11.17782383318674, + "language_loss": 0.74144739, + "learning_rate": 3.832759067063055e-06, + "loss": 0.76046586, + "num_input_tokens_seen": 56712230, + "router_z_loss_clip": 3.36132812, + "router_z_loss_mlp": 0.38208008, + "step": 2614, + "time_per_iteration": 2.682082414627075 + }, + { + "auxiliary_loss_clip": 0.01658328, + "auxiliary_loss_mlp": 0.00226303, + "balance_loss_clip": 1.31434512, + "balance_loss_mlp": 0.19225693, + "epoch": 0.1572223057267398, + "flos": 20191780391040.0, + "grad_norm": 55.7729619456281, + "language_loss": 0.83776665, + "learning_rate": 3.832603126688072e-06, + "loss": 0.85661304, + "num_input_tokens_seen": 56727490, + "router_z_loss_clip": 3.43945312, + "router_z_loss_mlp": 0.34057617, + "step": 2615, + "time_per_iteration": 2.6215434074401855 + }, + { + "auxiliary_loss_clip": 0.01659563, + "auxiliary_loss_mlp": 0.00256509, + "balance_loss_clip": 1.31827569, + "balance_loss_mlp": 0.22084117, + "epoch": 0.15728242897940778, + "flos": 20959514709120.0, + "grad_norm": 9.629710574978073, + "language_loss": 0.78766906, + "learning_rate": 3.832447116820594e-06, + "loss": 0.80682981, + "num_input_tokens_seen": 56747385, + "router_z_loss_clip": 3.4140625, + "router_z_loss_mlp": 0.35693359, + "step": 2616, + "time_per_iteration": 2.636673927307129 + }, + { + "auxiliary_loss_clip": 0.01656929, + "auxiliary_loss_mlp": 0.00248591, + "balance_loss_clip": 1.31584907, + "balance_loss_mlp": 0.21251872, + "epoch": 0.15734255223207574, + "flos": 23038275530880.0, + "grad_norm": 39.09485207485857, + "language_loss": 0.7952913, + "learning_rate": 3.832291037466539e-06, + "loss": 0.81434655, + "num_input_tokens_seen": 56768055, + "router_z_loss_clip": 3.41015625, + "router_z_loss_mlp": 0.36108398, + "step": 2617, + "time_per_iteration": 2.7147884368896484 + }, + { + "auxiliary_loss_clip": 0.01676724, + "auxiliary_loss_mlp": 0.00257395, + "balance_loss_clip": 1.33182907, + "balance_loss_mlp": 0.22232336, + "epoch": 0.15740267548474374, + "flos": 20551281661440.0, + "grad_norm": 26.424849103071082, + "language_loss": 0.80752236, + "learning_rate": 3.8321348886318235e-06, + "loss": 0.82686353, + "num_input_tokens_seen": 56785110, + "router_z_loss_clip": 3.4453125, + "router_z_loss_mlp": 0.35058594, + "step": 2618, + "time_per_iteration": 2.596367120742798 + }, + { + "auxiliary_loss_clip": 0.01696111, + "auxiliary_loss_mlp": 0.0026809, + "balance_loss_clip": 1.33625603, + "balance_loss_mlp": 0.22839305, + "epoch": 0.1574627987374117, + "flos": 22666922772480.0, + "grad_norm": 3.56764192313242, + "language_loss": 0.86945373, + "learning_rate": 3.8319786703223695e-06, + "loss": 0.88909572, + "num_input_tokens_seen": 56804975, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 0.39697266, + "step": 2619, + "time_per_iteration": 2.647993564605713 + }, + { + "auxiliary_loss_clip": 0.01686203, + "auxiliary_loss_mlp": 0.00250304, + "balance_loss_clip": 1.33280981, + "balance_loss_mlp": 0.21532822, + "epoch": 0.15752292199007967, + "flos": 16800664262400.0, + "grad_norm": 5.078227502989648, + "language_loss": 0.81878126, + "learning_rate": 3.831822382544101e-06, + "loss": 0.83814633, + "num_input_tokens_seen": 56822470, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 0.35009766, + "step": 2620, + "time_per_iteration": 2.5828206539154053 + }, + { + "auxiliary_loss_clip": 0.01684453, + "auxiliary_loss_mlp": 0.00245744, + "balance_loss_clip": 1.33075905, + "balance_loss_mlp": 0.20702486, + "epoch": 0.15758304524274763, + "flos": 29826002568960.0, + "grad_norm": 2.591791935082625, + "language_loss": 0.78352451, + "learning_rate": 3.831666025302944e-06, + "loss": 0.80282652, + "num_input_tokens_seen": 56842100, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 0.38720703, + "step": 2621, + "time_per_iteration": 2.725764513015747 + }, + { + "auxiliary_loss_clip": 0.01695707, + "auxiliary_loss_mlp": 0.00254706, + "balance_loss_clip": 1.33514118, + "balance_loss_mlp": 0.21863329, + "epoch": 0.1576431684954156, + "flos": 53577426723840.0, + "grad_norm": 30.46959782951778, + "language_loss": 0.80646968, + "learning_rate": 3.831509598604828e-06, + "loss": 0.82597381, + "num_input_tokens_seen": 56865920, + "router_z_loss_clip": 3.60351562, + "router_z_loss_mlp": 0.36083984, + "step": 2622, + "time_per_iteration": 2.9106900691986084 + }, + { + "auxiliary_loss_clip": 0.01695764, + "auxiliary_loss_mlp": 0.00247053, + "balance_loss_clip": 1.34150732, + "balance_loss_mlp": 0.21348388, + "epoch": 0.15770329174808356, + "flos": 20813609664000.0, + "grad_norm": 6.952143479317169, + "language_loss": 0.93389189, + "learning_rate": 3.831353102455684e-06, + "loss": 0.95332009, + "num_input_tokens_seen": 56885265, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 0.33544922, + "step": 2623, + "time_per_iteration": 2.6047191619873047 + }, + { + "auxiliary_loss_clip": 0.01688175, + "auxiliary_loss_mlp": 0.0025232, + "balance_loss_clip": 1.33443284, + "balance_loss_mlp": 0.21767816, + "epoch": 0.15776341500075153, + "flos": 24974004395520.0, + "grad_norm": 252.55605854039928, + "language_loss": 0.85329765, + "learning_rate": 3.831196536861448e-06, + "loss": 0.8727026, + "num_input_tokens_seen": 56906710, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 0.34667969, + "step": 2624, + "time_per_iteration": 2.6776247024536133 + }, + { + "auxiliary_loss_clip": 0.01689593, + "auxiliary_loss_mlp": 0.00251853, + "balance_loss_clip": 1.32877612, + "balance_loss_mlp": 0.21485044, + "epoch": 0.15782353825341952, + "flos": 21907915459200.0, + "grad_norm": 163.35626486849284, + "language_loss": 0.87918651, + "learning_rate": 3.831039901828054e-06, + "loss": 0.898601, + "num_input_tokens_seen": 56924275, + "router_z_loss_clip": 3.60742188, + "router_z_loss_mlp": 0.37011719, + "step": 2625, + "time_per_iteration": 2.6763527393341064 + }, + { + "auxiliary_loss_clip": 0.01677082, + "auxiliary_loss_mlp": 0.00224205, + "balance_loss_clip": 1.32228518, + "balance_loss_mlp": 0.19006389, + "epoch": 0.15788366150608749, + "flos": 26177191292160.0, + "grad_norm": 36.51602713488243, + "language_loss": 0.8802532, + "learning_rate": 3.830883197361445e-06, + "loss": 0.89926606, + "num_input_tokens_seen": 56941525, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 0.34130859, + "step": 2626, + "time_per_iteration": 2.6627957820892334 + }, + { + "auxiliary_loss_clip": 0.01673849, + "auxiliary_loss_mlp": 0.00248605, + "balance_loss_clip": 1.32187295, + "balance_loss_mlp": 0.21293791, + "epoch": 0.15794378475875545, + "flos": 27709822753920.0, + "grad_norm": 2.1084359455671406, + "language_loss": 0.82395166, + "learning_rate": 3.830726423467561e-06, + "loss": 0.84317625, + "num_input_tokens_seen": 56962145, + "router_z_loss_clip": 3.52148438, + "router_z_loss_mlp": 0.35668945, + "step": 2627, + "time_per_iteration": 2.667367696762085 + }, + { + "auxiliary_loss_clip": 0.01662976, + "auxiliary_loss_mlp": 0.00256148, + "balance_loss_clip": 1.31007564, + "balance_loss_mlp": 0.222984, + "epoch": 0.15800390801142342, + "flos": 12130158533760.0, + "grad_norm": 11.37584649812117, + "language_loss": 0.93822759, + "learning_rate": 3.830569580152348e-06, + "loss": 0.9574188, + "num_input_tokens_seen": 56977505, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 0.33154297, + "step": 2628, + "time_per_iteration": 2.5871810913085938 + }, + { + "auxiliary_loss_clip": 0.01649001, + "auxiliary_loss_mlp": 0.00236664, + "balance_loss_clip": 1.30186307, + "balance_loss_mlp": 0.20462099, + "epoch": 0.15806403126409138, + "flos": 20704728562560.0, + "grad_norm": 33.7366911159071, + "language_loss": 0.83724153, + "learning_rate": 3.830412667421752e-06, + "loss": 0.85609818, + "num_input_tokens_seen": 56996770, + "router_z_loss_clip": 3.47070312, + "router_z_loss_mlp": 0.3203125, + "step": 2629, + "time_per_iteration": 2.632408380508423 + }, + { + "auxiliary_loss_clip": 0.01645039, + "auxiliary_loss_mlp": 0.00229443, + "balance_loss_clip": 1.29784453, + "balance_loss_mlp": 0.1968272, + "epoch": 0.15812415451675935, + "flos": 17821712269440.0, + "grad_norm": 919.5490562472952, + "language_loss": 0.83190012, + "learning_rate": 3.8302556852817245e-06, + "loss": 0.85064495, + "num_input_tokens_seen": 57014970, + "router_z_loss_clip": 3.47070312, + "router_z_loss_mlp": 0.32617188, + "step": 2630, + "time_per_iteration": 2.69235897064209 + }, + { + "auxiliary_loss_clip": 0.01634841, + "auxiliary_loss_mlp": 0.00267264, + "balance_loss_clip": 1.28532994, + "balance_loss_mlp": 0.23274116, + "epoch": 0.15818427776942734, + "flos": 20084048524800.0, + "grad_norm": 92.57361178130363, + "language_loss": 0.92002684, + "learning_rate": 3.8300986337382184e-06, + "loss": 0.93904793, + "num_input_tokens_seen": 57034045, + "router_z_loss_clip": 3.49609375, + "router_z_loss_mlp": 0.3449707, + "step": 2631, + "time_per_iteration": 2.6491079330444336 + }, + { + "auxiliary_loss_clip": 0.01621319, + "auxiliary_loss_mlp": 0.00272597, + "balance_loss_clip": 1.27777445, + "balance_loss_mlp": 0.23781152, + "epoch": 0.1582444010220953, + "flos": 21214911386880.0, + "grad_norm": 2.4289931335920336, + "language_loss": 0.88437468, + "learning_rate": 3.8299415127971895e-06, + "loss": 0.90331388, + "num_input_tokens_seen": 57053695, + "router_z_loss_clip": 3.43554688, + "router_z_loss_mlp": 0.34790039, + "step": 2632, + "time_per_iteration": 2.6516783237457275 + }, + { + "auxiliary_loss_clip": 0.01614379, + "auxiliary_loss_mlp": 0.00254858, + "balance_loss_clip": 1.27052891, + "balance_loss_mlp": 0.22400703, + "epoch": 0.15830452427476327, + "flos": 17858341163520.0, + "grad_norm": 1086.3276714529295, + "language_loss": 0.90639508, + "learning_rate": 3.829784322464594e-06, + "loss": 0.92508745, + "num_input_tokens_seen": 57071290, + "router_z_loss_clip": 3.44140625, + "router_z_loss_mlp": 0.30883789, + "step": 2633, + "time_per_iteration": 2.6136326789855957 + }, + { + "auxiliary_loss_clip": 0.01615842, + "auxiliary_loss_mlp": 0.00274609, + "balance_loss_clip": 1.27264607, + "balance_loss_mlp": 0.24137366, + "epoch": 0.15836464752743123, + "flos": 24534960456960.0, + "grad_norm": 5.963081096991649, + "language_loss": 0.83877909, + "learning_rate": 3.829627062746394e-06, + "loss": 0.85768366, + "num_input_tokens_seen": 57091465, + "router_z_loss_clip": 3.43164062, + "router_z_loss_mlp": 0.33227539, + "step": 2634, + "time_per_iteration": 2.6822216510772705 + }, + { + "auxiliary_loss_clip": 0.01595213, + "auxiliary_loss_mlp": 0.00270926, + "balance_loss_clip": 1.24889565, + "balance_loss_mlp": 0.23914506, + "epoch": 0.1584247707800992, + "flos": 20120821073280.0, + "grad_norm": 8.437586342943668, + "language_loss": 0.95932919, + "learning_rate": 3.829469733648552e-06, + "loss": 0.97799063, + "num_input_tokens_seen": 57110075, + "router_z_loss_clip": 3.46289062, + "router_z_loss_mlp": 0.31762695, + "step": 2635, + "time_per_iteration": 4.1620330810546875 + }, + { + "auxiliary_loss_clip": 0.01597383, + "auxiliary_loss_mlp": 0.00306145, + "balance_loss_clip": 1.25848711, + "balance_loss_mlp": 0.27434021, + "epoch": 0.15848489403276717, + "flos": 20375966355840.0, + "grad_norm": 172.2723016430246, + "language_loss": 0.8423236, + "learning_rate": 3.829312335177034e-06, + "loss": 0.86135888, + "num_input_tokens_seen": 57128945, + "router_z_loss_clip": 3.390625, + "router_z_loss_mlp": 0.31811523, + "step": 2636, + "time_per_iteration": 2.648315191268921 + }, + { + "auxiliary_loss_clip": 0.01613965, + "auxiliary_loss_mlp": 0.00290471, + "balance_loss_clip": 1.2687459, + "balance_loss_mlp": 0.25742584, + "epoch": 0.15854501728543513, + "flos": 39346890359040.0, + "grad_norm": 19.50841596235931, + "language_loss": 0.79897535, + "learning_rate": 3.82915486733781e-06, + "loss": 0.81801975, + "num_input_tokens_seen": 57152385, + "router_z_loss_clip": 3.45507812, + "router_z_loss_mlp": 0.33056641, + "step": 2637, + "time_per_iteration": 2.8147454261779785 + }, + { + "auxiliary_loss_clip": 0.01574872, + "auxiliary_loss_mlp": 0.00272426, + "balance_loss_clip": 1.24458027, + "balance_loss_mlp": 0.2407399, + "epoch": 0.15860514053810312, + "flos": 24864225454080.0, + "grad_norm": 3.8410829649171396, + "language_loss": 0.84517193, + "learning_rate": 3.82899733013685e-06, + "loss": 0.86364496, + "num_input_tokens_seen": 57172620, + "router_z_loss_clip": 3.3046875, + "router_z_loss_mlp": 0.31713867, + "step": 2638, + "time_per_iteration": 4.168790340423584 + }, + { + "auxiliary_loss_clip": 0.01576488, + "auxiliary_loss_mlp": 0.00280258, + "balance_loss_clip": 1.24106956, + "balance_loss_mlp": 0.24854819, + "epoch": 0.1586652637907711, + "flos": 26177694082560.0, + "grad_norm": 1.7974579543169713, + "language_loss": 0.81622612, + "learning_rate": 3.828839723580128e-06, + "loss": 0.83479363, + "num_input_tokens_seen": 57194680, + "router_z_loss_clip": 3.35351562, + "router_z_loss_mlp": 0.31689453, + "step": 2639, + "time_per_iteration": 4.10056471824646 + }, + { + "auxiliary_loss_clip": 0.0158749, + "auxiliary_loss_mlp": 0.00312686, + "balance_loss_clip": 1.25399828, + "balance_loss_mlp": 0.28151238, + "epoch": 0.15872538704343905, + "flos": 19792058866560.0, + "grad_norm": 12.02060547710641, + "language_loss": 0.87061715, + "learning_rate": 3.82868204767362e-06, + "loss": 0.88961899, + "num_input_tokens_seen": 57214675, + "router_z_loss_clip": 3.33984375, + "router_z_loss_mlp": 0.31164551, + "step": 2640, + "time_per_iteration": 2.629162549972534 + }, + { + "auxiliary_loss_clip": 0.01576248, + "auxiliary_loss_mlp": 0.00292534, + "balance_loss_clip": 1.24464881, + "balance_loss_mlp": 0.26217121, + "epoch": 0.15878551029610702, + "flos": 28475366342400.0, + "grad_norm": 11.902807807471113, + "language_loss": 0.72325945, + "learning_rate": 3.828524302423306e-06, + "loss": 0.74194723, + "num_input_tokens_seen": 57235830, + "router_z_loss_clip": 3.31640625, + "router_z_loss_mlp": 0.30358887, + "step": 2641, + "time_per_iteration": 4.065160512924194 + }, + { + "auxiliary_loss_clip": 0.01583132, + "auxiliary_loss_mlp": 0.00296364, + "balance_loss_clip": 1.24132681, + "balance_loss_mlp": 0.26389116, + "epoch": 0.15884563354877498, + "flos": 24206701040640.0, + "grad_norm": 71.73898257856055, + "language_loss": 0.84056884, + "learning_rate": 3.828366487835167e-06, + "loss": 0.85936379, + "num_input_tokens_seen": 57255970, + "router_z_loss_clip": 3.41601562, + "router_z_loss_mlp": 0.32446289, + "step": 2642, + "time_per_iteration": 2.648829460144043 + }, + { + "auxiliary_loss_clip": 0.01592669, + "auxiliary_loss_mlp": 0.00296621, + "balance_loss_clip": 1.25519586, + "balance_loss_mlp": 0.26300442, + "epoch": 0.15890575680144295, + "flos": 23949795991680.0, + "grad_norm": 10.21000418656979, + "language_loss": 0.76032066, + "learning_rate": 3.828208603915186e-06, + "loss": 0.77921361, + "num_input_tokens_seen": 57274435, + "router_z_loss_clip": 3.37304688, + "router_z_loss_mlp": 0.33618164, + "step": 2643, + "time_per_iteration": 2.6624574661254883 + }, + { + "auxiliary_loss_clip": 0.01584298, + "auxiliary_loss_mlp": 0.00262387, + "balance_loss_clip": 1.25193501, + "balance_loss_mlp": 0.23201287, + "epoch": 0.15896588005411091, + "flos": 21215019127680.0, + "grad_norm": 4.201785397662325, + "language_loss": 0.8680833, + "learning_rate": 3.828050650669353e-06, + "loss": 0.88655013, + "num_input_tokens_seen": 57293115, + "router_z_loss_clip": 3.32421875, + "router_z_loss_mlp": 0.30371094, + "step": 2644, + "time_per_iteration": 2.6380221843719482 + }, + { + "auxiliary_loss_clip": 0.01585512, + "auxiliary_loss_mlp": 0.00332104, + "balance_loss_clip": 1.25574005, + "balance_loss_mlp": 0.30013174, + "epoch": 0.1590260033067789, + "flos": 24352390604160.0, + "grad_norm": 34.19428091317874, + "language_loss": 0.88873821, + "learning_rate": 3.827892628103657e-06, + "loss": 0.90791434, + "num_input_tokens_seen": 57312565, + "router_z_loss_clip": 3.296875, + "router_z_loss_mlp": 0.31982422, + "step": 2645, + "time_per_iteration": 2.701603651046753 + }, + { + "auxiliary_loss_clip": 0.01567159, + "auxiliary_loss_mlp": 0.00306443, + "balance_loss_clip": 1.23598957, + "balance_loss_mlp": 0.27325484, + "epoch": 0.15908612655944687, + "flos": 32048944583040.0, + "grad_norm": 10.249123123465564, + "language_loss": 0.77741045, + "learning_rate": 3.827734536224087e-06, + "loss": 0.79614645, + "num_input_tokens_seen": 57333360, + "router_z_loss_clip": 3.3125, + "router_z_loss_mlp": 0.33203125, + "step": 2646, + "time_per_iteration": 2.722123622894287 + }, + { + "auxiliary_loss_clip": 0.01563278, + "auxiliary_loss_mlp": 0.00299807, + "balance_loss_clip": 1.23718596, + "balance_loss_mlp": 0.26962343, + "epoch": 0.15914624981211484, + "flos": 17785370684160.0, + "grad_norm": 10.423381618675766, + "language_loss": 0.70291907, + "learning_rate": 3.827576375036642e-06, + "loss": 0.72154987, + "num_input_tokens_seen": 57350575, + "router_z_loss_clip": 3.26367188, + "router_z_loss_mlp": 0.30175781, + "step": 2647, + "time_per_iteration": 2.6305108070373535 + }, + { + "auxiliary_loss_clip": 0.01571646, + "auxiliary_loss_mlp": 0.00263591, + "balance_loss_clip": 1.2492609, + "balance_loss_mlp": 0.23338288, + "epoch": 0.1592063730647828, + "flos": 17712507945600.0, + "grad_norm": 67.61574811986122, + "language_loss": 0.9643929, + "learning_rate": 3.827418144547318e-06, + "loss": 0.98274529, + "num_input_tokens_seen": 57367570, + "router_z_loss_clip": 3.22460938, + "router_z_loss_mlp": 0.30200195, + "step": 2648, + "time_per_iteration": 2.6609647274017334 + }, + { + "auxiliary_loss_clip": 0.01573025, + "auxiliary_loss_mlp": 0.00312912, + "balance_loss_clip": 1.25080729, + "balance_loss_mlp": 0.28333664, + "epoch": 0.15926649631745077, + "flos": 18803545603200.0, + "grad_norm": 3.582292185688921, + "language_loss": 0.97373873, + "learning_rate": 3.827259844762114e-06, + "loss": 0.99259818, + "num_input_tokens_seen": 57383980, + "router_z_loss_clip": 3.22070312, + "router_z_loss_mlp": 0.29577637, + "step": 2649, + "time_per_iteration": 2.6487786769866943 + }, + { + "auxiliary_loss_clip": 0.01591808, + "auxiliary_loss_mlp": 0.00346136, + "balance_loss_clip": 1.24882817, + "balance_loss_mlp": 0.31125581, + "epoch": 0.15932661957011873, + "flos": 17566243764480.0, + "grad_norm": 3.3458307383052537, + "language_loss": 0.82949138, + "learning_rate": 3.827101475687033e-06, + "loss": 0.84887081, + "num_input_tokens_seen": 57400840, + "router_z_loss_clip": 3.4296875, + "router_z_loss_mlp": 0.34887695, + "step": 2650, + "time_per_iteration": 2.6652326583862305 + }, + { + "auxiliary_loss_clip": 0.0157642, + "auxiliary_loss_mlp": 0.00292448, + "balance_loss_clip": 1.25345993, + "balance_loss_mlp": 0.2608695, + "epoch": 0.15938674282278673, + "flos": 13334351011200.0, + "grad_norm": 25.65695111808917, + "language_loss": 0.79484904, + "learning_rate": 3.826943037328082e-06, + "loss": 0.81353766, + "num_input_tokens_seen": 57419230, + "router_z_loss_clip": 3.23242188, + "router_z_loss_mlp": 0.3157959, + "step": 2651, + "time_per_iteration": 2.622145175933838 + }, + { + "auxiliary_loss_clip": 0.01574955, + "auxiliary_loss_mlp": 0.0032264, + "balance_loss_clip": 1.25005174, + "balance_loss_mlp": 0.29083544, + "epoch": 0.1594468660754547, + "flos": 22488842119680.0, + "grad_norm": 13.629683634613663, + "language_loss": 0.85702252, + "learning_rate": 3.8267845296912674e-06, + "loss": 0.87599844, + "num_input_tokens_seen": 57439315, + "router_z_loss_clip": 3.24609375, + "router_z_loss_mlp": 0.31787109, + "step": 2652, + "time_per_iteration": 2.6910243034362793 + }, + { + "auxiliary_loss_clip": 0.01587214, + "auxiliary_loss_mlp": 0.0029708, + "balance_loss_clip": 1.26191378, + "balance_loss_mlp": 0.26825476, + "epoch": 0.15950698932812266, + "flos": 15007320910080.0, + "grad_norm": 7.27668784656326, + "language_loss": 0.79208094, + "learning_rate": 3.826625952782601e-06, + "loss": 0.81092387, + "num_input_tokens_seen": 57454635, + "router_z_loss_clip": 3.25390625, + "router_z_loss_mlp": 0.28820801, + "step": 2653, + "time_per_iteration": 2.583361864089966 + }, + { + "auxiliary_loss_clip": 0.01571586, + "auxiliary_loss_mlp": 0.00294018, + "balance_loss_clip": 1.24422383, + "balance_loss_mlp": 0.26235652, + "epoch": 0.15956711258079062, + "flos": 30155052084480.0, + "grad_norm": 9.729465956418146, + "language_loss": 0.86591345, + "learning_rate": 3.826467306608095e-06, + "loss": 0.88456953, + "num_input_tokens_seen": 57476805, + "router_z_loss_clip": 3.26953125, + "router_z_loss_mlp": 0.31652832, + "step": 2654, + "time_per_iteration": 2.774886131286621 + }, + { + "auxiliary_loss_clip": 0.01569108, + "auxiliary_loss_mlp": 0.00306644, + "balance_loss_clip": 1.24487507, + "balance_loss_mlp": 0.27686539, + "epoch": 0.1596272358334586, + "flos": 21032700670080.0, + "grad_norm": 8.589924780455577, + "language_loss": 0.86640328, + "learning_rate": 3.826308591173765e-06, + "loss": 0.88516068, + "num_input_tokens_seen": 57496400, + "router_z_loss_clip": 3.23828125, + "router_z_loss_mlp": 0.29785156, + "step": 2655, + "time_per_iteration": 2.6170763969421387 + }, + { + "auxiliary_loss_clip": 0.01567749, + "auxiliary_loss_mlp": 0.00329643, + "balance_loss_clip": 1.24568069, + "balance_loss_mlp": 0.29979303, + "epoch": 0.15968735908612655, + "flos": 15268032800640.0, + "grad_norm": 36.55571346816118, + "language_loss": 0.80531311, + "learning_rate": 3.826149806485631e-06, + "loss": 0.82428706, + "num_input_tokens_seen": 57513700, + "router_z_loss_clip": 3.22070312, + "router_z_loss_mlp": 0.29821777, + "step": 2656, + "time_per_iteration": 2.659684658050537 + }, + { + "auxiliary_loss_clip": 0.01563948, + "auxiliary_loss_mlp": 0.00295472, + "balance_loss_clip": 1.24710619, + "balance_loss_mlp": 0.26657584, + "epoch": 0.15974748233879452, + "flos": 52665726695040.0, + "grad_norm": 6.863134475577112, + "language_loss": 0.83095384, + "learning_rate": 3.825990952549713e-06, + "loss": 0.8495481, + "num_input_tokens_seen": 57536180, + "router_z_loss_clip": 3.16796875, + "router_z_loss_mlp": 0.2890625, + "step": 2657, + "time_per_iteration": 2.9867470264434814 + }, + { + "auxiliary_loss_clip": 0.01589598, + "auxiliary_loss_mlp": 0.00336382, + "balance_loss_clip": 1.2624377, + "balance_loss_mlp": 0.30500564, + "epoch": 0.1598076055914625, + "flos": 18733232730240.0, + "grad_norm": 424.18736051083096, + "language_loss": 0.80669785, + "learning_rate": 3.825832029372035e-06, + "loss": 0.82595766, + "num_input_tokens_seen": 57555025, + "router_z_loss_clip": 3.27539062, + "router_z_loss_mlp": 0.3137207, + "step": 2658, + "time_per_iteration": 2.6460344791412354 + }, + { + "auxiliary_loss_clip": 0.01578292, + "auxiliary_loss_mlp": 0.00332919, + "balance_loss_clip": 1.25049782, + "balance_loss_mlp": 0.2992304, + "epoch": 0.15986772884413047, + "flos": 34349238535680.0, + "grad_norm": 15.534975021043405, + "language_loss": 0.81617999, + "learning_rate": 3.825673036958624e-06, + "loss": 0.8352921, + "num_input_tokens_seen": 57577660, + "router_z_loss_clip": 3.27734375, + "router_z_loss_mlp": 0.33691406, + "step": 2659, + "time_per_iteration": 2.7632296085357666 + }, + { + "auxiliary_loss_clip": 0.01574144, + "auxiliary_loss_mlp": 0.00373306, + "balance_loss_clip": 1.24541855, + "balance_loss_mlp": 0.33980861, + "epoch": 0.15992785209679844, + "flos": 22054969739520.0, + "grad_norm": 12.74794777171614, + "language_loss": 0.98307019, + "learning_rate": 3.825513975315508e-06, + "loss": 1.00254464, + "num_input_tokens_seen": 57596335, + "router_z_loss_clip": 3.28515625, + "router_z_loss_mlp": 0.33520508, + "step": 2660, + "time_per_iteration": 2.6263175010681152 + }, + { + "auxiliary_loss_clip": 0.01590633, + "auxiliary_loss_mlp": 0.00339649, + "balance_loss_clip": 1.2573601, + "balance_loss_mlp": 0.30703324, + "epoch": 0.1599879753494664, + "flos": 33066652625280.0, + "grad_norm": 7.270379550388851, + "language_loss": 0.84005153, + "learning_rate": 3.82535484444872e-06, + "loss": 0.85935426, + "num_input_tokens_seen": 57616830, + "router_z_loss_clip": 3.33203125, + "router_z_loss_mlp": 0.32617188, + "step": 2661, + "time_per_iteration": 2.7240869998931885 + }, + { + "auxiliary_loss_clip": 0.01575904, + "auxiliary_loss_mlp": 0.0035141, + "balance_loss_clip": 1.24956727, + "balance_loss_mlp": 0.32027221, + "epoch": 0.16004809860213437, + "flos": 28038010343040.0, + "grad_norm": 6.043222471438579, + "language_loss": 0.78394473, + "learning_rate": 3.825195644364292e-06, + "loss": 0.80321789, + "num_input_tokens_seen": 57635515, + "router_z_loss_clip": 3.26171875, + "router_z_loss_mlp": 0.31091309, + "step": 2662, + "time_per_iteration": 2.700563669204712 + }, + { + "auxiliary_loss_clip": 0.01571625, + "auxiliary_loss_mlp": 0.00321206, + "balance_loss_clip": 1.24054813, + "balance_loss_mlp": 0.28849518, + "epoch": 0.16010822185480234, + "flos": 22780113505920.0, + "grad_norm": 34.49566461666907, + "language_loss": 0.89167869, + "learning_rate": 3.825036375068263e-06, + "loss": 0.91060698, + "num_input_tokens_seen": 57654250, + "router_z_loss_clip": 3.30859375, + "router_z_loss_mlp": 0.32739258, + "step": 2663, + "time_per_iteration": 2.6508285999298096 + }, + { + "auxiliary_loss_clip": 0.01575064, + "auxiliary_loss_mlp": 0.0034158, + "balance_loss_clip": 1.24843001, + "balance_loss_mlp": 0.30908376, + "epoch": 0.16016834510747033, + "flos": 20084012611200.0, + "grad_norm": 3.093260048421951, + "language_loss": 0.89060497, + "learning_rate": 3.824877036566672e-06, + "loss": 0.90977144, + "num_input_tokens_seen": 57672645, + "router_z_loss_clip": 3.27148438, + "router_z_loss_mlp": 0.32495117, + "step": 2664, + "time_per_iteration": 2.643136739730835 + }, + { + "auxiliary_loss_clip": 0.01558862, + "auxiliary_loss_mlp": 0.00325449, + "balance_loss_clip": 1.23547769, + "balance_loss_mlp": 0.29327431, + "epoch": 0.1602284683601383, + "flos": 21173829206400.0, + "grad_norm": 84.10507895827095, + "language_loss": 0.99097407, + "learning_rate": 3.824717628865561e-06, + "loss": 1.00981724, + "num_input_tokens_seen": 57691055, + "router_z_loss_clip": 3.23242188, + "router_z_loss_mlp": 0.32141113, + "step": 2665, + "time_per_iteration": 2.623013496398926 + }, + { + "auxiliary_loss_clip": 0.01572031, + "auxiliary_loss_mlp": 0.00306682, + "balance_loss_clip": 1.24411285, + "balance_loss_mlp": 0.2755447, + "epoch": 0.16028859161280626, + "flos": 14647568244480.0, + "grad_norm": 104.06287967914122, + "language_loss": 0.90355217, + "learning_rate": 3.824558151970974e-06, + "loss": 0.92233932, + "num_input_tokens_seen": 57707235, + "router_z_loss_clip": 3.28320312, + "router_z_loss_mlp": 0.3112793, + "step": 2666, + "time_per_iteration": 2.582685947418213 + }, + { + "auxiliary_loss_clip": 0.01570422, + "auxiliary_loss_mlp": 0.00342745, + "balance_loss_clip": 1.24499702, + "balance_loss_mlp": 0.31067723, + "epoch": 0.16034871486547422, + "flos": 20990325600000.0, + "grad_norm": 21.760050002744773, + "language_loss": 0.87986791, + "learning_rate": 3.8243986058889595e-06, + "loss": 0.89899957, + "num_input_tokens_seen": 57724190, + "router_z_loss_clip": 3.25390625, + "router_z_loss_mlp": 0.32080078, + "step": 2667, + "time_per_iteration": 2.63266921043396 + }, + { + "auxiliary_loss_clip": 0.01590987, + "auxiliary_loss_mlp": 0.00352187, + "balance_loss_clip": 1.26556277, + "balance_loss_mlp": 0.31873661, + "epoch": 0.1604088381181422, + "flos": 21397732634880.0, + "grad_norm": 5.894846609074103, + "language_loss": 0.810862, + "learning_rate": 3.824238990625567e-06, + "loss": 0.83029377, + "num_input_tokens_seen": 57743620, + "router_z_loss_clip": 3.25390625, + "router_z_loss_mlp": 0.33447266, + "step": 2668, + "time_per_iteration": 2.607182741165161 + }, + { + "auxiliary_loss_clip": 0.01558201, + "auxiliary_loss_mlp": 0.00358836, + "balance_loss_clip": 1.23460722, + "balance_loss_mlp": 0.32716221, + "epoch": 0.16046896137081015, + "flos": 23877040993920.0, + "grad_norm": 69.74517656543513, + "language_loss": 0.8304494, + "learning_rate": 3.824079306186848e-06, + "loss": 0.84961975, + "num_input_tokens_seen": 57764810, + "router_z_loss_clip": 3.23632812, + "router_z_loss_mlp": 0.31652832, + "step": 2669, + "time_per_iteration": 2.6844518184661865 + }, + { + "auxiliary_loss_clip": 0.01614687, + "auxiliary_loss_mlp": 0.00258412, + "balance_loss_clip": 1.34819472, + "balance_loss_mlp": 0.24854124, + "epoch": 0.16052908462347812, + "flos": 59806709015040.0, + "grad_norm": 0.8106444371479665, + "language_loss": 0.55457699, + "learning_rate": 3.823919552578861e-06, + "loss": 0.57330793, + "num_input_tokens_seen": 57824390, + "router_z_loss_clip": 2.65625, + "router_z_loss_mlp": 0.09863281, + "step": 2670, + "time_per_iteration": 3.0058343410491943 + }, + { + "auxiliary_loss_clip": 0.01556094, + "auxiliary_loss_mlp": 0.00356893, + "balance_loss_clip": 1.23311639, + "balance_loss_mlp": 0.32430115, + "epoch": 0.1605892078761461, + "flos": 18296559089280.0, + "grad_norm": 3.917739041216195, + "language_loss": 0.85281831, + "learning_rate": 3.82375972980766e-06, + "loss": 0.87194812, + "num_input_tokens_seen": 57843665, + "router_z_loss_clip": 3.22851562, + "router_z_loss_mlp": 0.32617188, + "step": 2671, + "time_per_iteration": 2.6324734687805176 + }, + { + "auxiliary_loss_clip": 0.01563785, + "auxiliary_loss_mlp": 0.00349293, + "balance_loss_clip": 1.24001789, + "balance_loss_mlp": 0.31708288, + "epoch": 0.16064933112881408, + "flos": 32160734686080.0, + "grad_norm": 42.28605023878604, + "language_loss": 0.72794318, + "learning_rate": 3.8235998378793086e-06, + "loss": 0.74707395, + "num_input_tokens_seen": 57863305, + "router_z_loss_clip": 3.23632812, + "router_z_loss_mlp": 0.32214355, + "step": 2672, + "time_per_iteration": 2.7562615871429443 + }, + { + "auxiliary_loss_clip": 0.01546132, + "auxiliary_loss_mlp": 0.00348406, + "balance_loss_clip": 1.22496939, + "balance_loss_mlp": 0.31545621, + "epoch": 0.16070945438148204, + "flos": 19828795501440.0, + "grad_norm": 2.840082842040135, + "language_loss": 0.93799025, + "learning_rate": 3.8234398767998675e-06, + "loss": 0.95693564, + "num_input_tokens_seen": 57883025, + "router_z_loss_clip": 3.21289062, + "router_z_loss_mlp": 0.32958984, + "step": 2673, + "time_per_iteration": 2.6335413455963135 + }, + { + "auxiliary_loss_clip": 0.01551736, + "auxiliary_loss_mlp": 0.0032407, + "balance_loss_clip": 1.2321229, + "balance_loss_mlp": 0.2938858, + "epoch": 0.16076957763415, + "flos": 18913144976640.0, + "grad_norm": 31.29970428900479, + "language_loss": 0.81051505, + "learning_rate": 3.823279846575403e-06, + "loss": 0.8292731, + "num_input_tokens_seen": 57901430, + "router_z_loss_clip": 3.19140625, + "router_z_loss_mlp": 0.30200195, + "step": 2674, + "time_per_iteration": 2.6648240089416504 + }, + { + "auxiliary_loss_clip": 0.01519245, + "auxiliary_loss_mlp": 0.00323296, + "balance_loss_clip": 1.20640838, + "balance_loss_mlp": 0.29410189, + "epoch": 0.16082970088681797, + "flos": 16764358590720.0, + "grad_norm": 5.0200210509485474, + "language_loss": 0.89212847, + "learning_rate": 3.823119747211986e-06, + "loss": 0.91055393, + "num_input_tokens_seen": 57919550, + "router_z_loss_clip": 3.12695312, + "router_z_loss_mlp": 0.29174805, + "step": 2675, + "time_per_iteration": 2.6344492435455322 + }, + { + "auxiliary_loss_clip": 0.01540878, + "auxiliary_loss_mlp": 0.00357451, + "balance_loss_clip": 1.22136497, + "balance_loss_mlp": 0.32411981, + "epoch": 0.16088982413948594, + "flos": 35150261783040.0, + "grad_norm": 92.23674115353234, + "language_loss": 0.88970441, + "learning_rate": 3.822959578715685e-06, + "loss": 0.90868771, + "num_input_tokens_seen": 57939890, + "router_z_loss_clip": 3.19726562, + "router_z_loss_mlp": 0.33325195, + "step": 2676, + "time_per_iteration": 2.8063149452209473 + }, + { + "auxiliary_loss_clip": 0.01528491, + "auxiliary_loss_mlp": 0.0032605, + "balance_loss_clip": 1.21991777, + "balance_loss_mlp": 0.29805934, + "epoch": 0.1609499473921539, + "flos": 18625105814400.0, + "grad_norm": 1644.3029219872897, + "language_loss": 0.80902827, + "learning_rate": 3.822799341092573e-06, + "loss": 0.82757366, + "num_input_tokens_seen": 57957410, + "router_z_loss_clip": 3.0859375, + "router_z_loss_mlp": 0.27978516, + "step": 2677, + "time_per_iteration": 4.018671035766602 + }, + { + "auxiliary_loss_clip": 0.01514041, + "auxiliary_loss_mlp": 0.00309619, + "balance_loss_clip": 1.20559525, + "balance_loss_mlp": 0.27874357, + "epoch": 0.1610100706448219, + "flos": 33145728416640.0, + "grad_norm": 100.98841025221115, + "language_loss": 0.82716465, + "learning_rate": 3.822639034348728e-06, + "loss": 0.84540129, + "num_input_tokens_seen": 57977900, + "router_z_loss_clip": 3.08398438, + "router_z_loss_mlp": 0.30883789, + "step": 2678, + "time_per_iteration": 2.785815954208374 + }, + { + "auxiliary_loss_clip": 0.01508048, + "auxiliary_loss_mlp": 0.00354232, + "balance_loss_clip": 1.19319367, + "balance_loss_mlp": 0.3232491, + "epoch": 0.16107019389748986, + "flos": 34676707852800.0, + "grad_norm": 223.4515652288639, + "language_loss": 0.76176667, + "learning_rate": 3.822478658490228e-06, + "loss": 0.78038949, + "num_input_tokens_seen": 57998210, + "router_z_loss_clip": 3.1484375, + "router_z_loss_mlp": 0.30993652, + "step": 2679, + "time_per_iteration": 2.7988789081573486 + }, + { + "auxiliary_loss_clip": 0.01545061, + "auxiliary_loss_mlp": 0.00093612, + "balance_loss_clip": 1.28559136, + "balance_loss_mlp": 0.08393177, + "epoch": 0.16113031715015783, + "flos": 65713403260800.0, + "grad_norm": 0.7656348627607401, + "language_loss": 0.51709282, + "learning_rate": 3.822318213523154e-06, + "loss": 0.53347957, + "num_input_tokens_seen": 58059420, + "router_z_loss_clip": 2.59375, + "router_z_loss_mlp": 0.09667969, + "step": 2680, + "time_per_iteration": 4.674199104309082 + }, + { + "auxiliary_loss_clip": 0.01501354, + "auxiliary_loss_mlp": 0.00363516, + "balance_loss_clip": 1.18978953, + "balance_loss_mlp": 0.33142501, + "epoch": 0.1611904404028258, + "flos": 20810413353600.0, + "grad_norm": 28.31040329341951, + "language_loss": 0.87912714, + "learning_rate": 3.8221576994535925e-06, + "loss": 0.89777583, + "num_input_tokens_seen": 58078370, + "router_z_loss_clip": 3.1171875, + "router_z_loss_mlp": 0.32080078, + "step": 2681, + "time_per_iteration": 4.036627292633057 + }, + { + "auxiliary_loss_clip": 0.01501736, + "auxiliary_loss_mlp": 0.00338649, + "balance_loss_clip": 1.19294286, + "balance_loss_mlp": 0.30923998, + "epoch": 0.16125056365549376, + "flos": 27013335062400.0, + "grad_norm": 25.53976665953396, + "language_loss": 0.75182319, + "learning_rate": 3.821997116287627e-06, + "loss": 0.77022696, + "num_input_tokens_seen": 58097395, + "router_z_loss_clip": 3.08984375, + "router_z_loss_mlp": 0.29431152, + "step": 2682, + "time_per_iteration": 2.7181761264801025 + }, + { + "auxiliary_loss_clip": 0.01504543, + "auxiliary_loss_mlp": 0.00334228, + "balance_loss_clip": 1.19047189, + "balance_loss_mlp": 0.30517656, + "epoch": 0.16131068690816172, + "flos": 19276524915840.0, + "grad_norm": 79.47713798211194, + "language_loss": 0.94243979, + "learning_rate": 3.821836464031348e-06, + "loss": 0.96082753, + "num_input_tokens_seen": 58115630, + "router_z_loss_clip": 3.140625, + "router_z_loss_mlp": 0.29016113, + "step": 2683, + "time_per_iteration": 2.6084911823272705 + }, + { + "auxiliary_loss_clip": 0.01485201, + "auxiliary_loss_mlp": 0.00380955, + "balance_loss_clip": 1.17942047, + "balance_loss_mlp": 0.35136747, + "epoch": 0.16137081016082971, + "flos": 35337931367040.0, + "grad_norm": 3.2506449100138823, + "language_loss": 0.79911184, + "learning_rate": 3.821675742690849e-06, + "loss": 0.8177734, + "num_input_tokens_seen": 58138655, + "router_z_loss_clip": 3.05664062, + "router_z_loss_mlp": 0.29614258, + "step": 2684, + "time_per_iteration": 4.145632266998291 + }, + { + "auxiliary_loss_clip": 0.01494012, + "auxiliary_loss_mlp": 0.00352254, + "balance_loss_clip": 1.18417799, + "balance_loss_mlp": 0.32090202, + "epoch": 0.16143093341349768, + "flos": 34235257703040.0, + "grad_norm": 5.650808995633419, + "language_loss": 0.78644603, + "learning_rate": 3.821514952272223e-06, + "loss": 0.80490863, + "num_input_tokens_seen": 58157440, + "router_z_loss_clip": 3.1015625, + "router_z_loss_mlp": 0.31347656, + "step": 2685, + "time_per_iteration": 2.7523105144500732 + }, + { + "auxiliary_loss_clip": 0.01493039, + "auxiliary_loss_mlp": 0.00365117, + "balance_loss_clip": 1.18247318, + "balance_loss_mlp": 0.33479014, + "epoch": 0.16149105666616564, + "flos": 27999262546560.0, + "grad_norm": 88.80787229962928, + "language_loss": 0.77555597, + "learning_rate": 3.821354092781567e-06, + "loss": 0.7941376, + "num_input_tokens_seen": 58176660, + "router_z_loss_clip": 3.10742188, + "router_z_loss_mlp": 0.3034668, + "step": 2686, + "time_per_iteration": 2.6700022220611572 + }, + { + "auxiliary_loss_clip": 0.01489696, + "auxiliary_loss_mlp": 0.00368774, + "balance_loss_clip": 1.17607188, + "balance_loss_mlp": 0.33768457, + "epoch": 0.1615511799188336, + "flos": 19422214479360.0, + "grad_norm": 4.186947625028718, + "language_loss": 0.88581407, + "learning_rate": 3.821193164224981e-06, + "loss": 0.9043988, + "num_input_tokens_seen": 58195085, + "router_z_loss_clip": 3.13476562, + "router_z_loss_mlp": 0.31079102, + "step": 2687, + "time_per_iteration": 2.725583553314209 + }, + { + "auxiliary_loss_clip": 0.01485894, + "auxiliary_loss_mlp": 0.00354146, + "balance_loss_clip": 1.17319274, + "balance_loss_mlp": 0.32172066, + "epoch": 0.16161130317150157, + "flos": 22854915578880.0, + "grad_norm": 37.2355222020744, + "language_loss": 0.79072726, + "learning_rate": 3.821032166608568e-06, + "loss": 0.80912769, + "num_input_tokens_seen": 58213540, + "router_z_loss_clip": 3.12890625, + "router_z_loss_mlp": 0.32446289, + "step": 2688, + "time_per_iteration": 2.6536107063293457 + }, + { + "auxiliary_loss_clip": 0.01475126, + "auxiliary_loss_mlp": 0.00368322, + "balance_loss_clip": 1.166134, + "balance_loss_mlp": 0.3367793, + "epoch": 0.16167142642416954, + "flos": 26110577520000.0, + "grad_norm": 695.9941335939913, + "language_loss": 0.79992294, + "learning_rate": 3.8208710999384325e-06, + "loss": 0.81835735, + "num_input_tokens_seen": 58236995, + "router_z_loss_clip": 3.09179688, + "router_z_loss_mlp": 0.31518555, + "step": 2689, + "time_per_iteration": 2.770781993865967 + }, + { + "auxiliary_loss_clip": 0.01491304, + "auxiliary_loss_mlp": 0.00341294, + "balance_loss_clip": 1.18214345, + "balance_loss_mlp": 0.31281522, + "epoch": 0.1617315496768375, + "flos": 22779646629120.0, + "grad_norm": 7.5926790475690495, + "language_loss": 0.92410958, + "learning_rate": 3.820709964220683e-06, + "loss": 0.9424355, + "num_input_tokens_seen": 58257230, + "router_z_loss_clip": 3.08789062, + "router_z_loss_mlp": 0.28491211, + "step": 2690, + "time_per_iteration": 2.6786348819732666 + }, + { + "auxiliary_loss_clip": 0.01487629, + "auxiliary_loss_mlp": 0.00360201, + "balance_loss_clip": 1.17780983, + "balance_loss_mlp": 0.33172178, + "epoch": 0.1617916729295055, + "flos": 22017299351040.0, + "grad_norm": 7.711460672445254, + "language_loss": 0.92270446, + "learning_rate": 3.8205487594614284e-06, + "loss": 0.94118279, + "num_input_tokens_seen": 58277080, + "router_z_loss_clip": 3.09765625, + "router_z_loss_mlp": 0.28479004, + "step": 2691, + "time_per_iteration": 2.643651008605957 + }, + { + "auxiliary_loss_clip": 0.01497395, + "auxiliary_loss_mlp": 0.0035929, + "balance_loss_clip": 1.18159211, + "balance_loss_mlp": 0.32679379, + "epoch": 0.16185179618217346, + "flos": 23438248450560.0, + "grad_norm": 5.860582725108861, + "language_loss": 0.88634086, + "learning_rate": 3.820387485666784e-06, + "loss": 0.90490764, + "num_input_tokens_seen": 58294815, + "router_z_loss_clip": 3.16210938, + "router_z_loss_mlp": 0.32470703, + "step": 2692, + "time_per_iteration": 2.652879238128662 + }, + { + "auxiliary_loss_clip": 0.01503198, + "auxiliary_loss_mlp": 0.0033079, + "balance_loss_clip": 1.18503058, + "balance_loss_mlp": 0.29888949, + "epoch": 0.16191191943484143, + "flos": 25666110627840.0, + "grad_norm": 90.94622664893245, + "language_loss": 0.9040885, + "learning_rate": 3.820226142842862e-06, + "loss": 0.92242843, + "num_input_tokens_seen": 58313215, + "router_z_loss_clip": 3.1796875, + "router_z_loss_mlp": 0.31872559, + "step": 2693, + "time_per_iteration": 2.666032552719116 + }, + { + "auxiliary_loss_clip": 0.01474381, + "auxiliary_loss_mlp": 0.00274449, + "balance_loss_clip": 1.16376472, + "balance_loss_mlp": 0.24951032, + "epoch": 0.1619720426875094, + "flos": 23477355383040.0, + "grad_norm": 4.46592753887704, + "language_loss": 0.91420865, + "learning_rate": 3.820064730995783e-06, + "loss": 0.93169695, + "num_input_tokens_seen": 58333215, + "router_z_loss_clip": 3.10351562, + "router_z_loss_mlp": 0.24938965, + "step": 2694, + "time_per_iteration": 2.8285701274871826 + }, + { + "auxiliary_loss_clip": 0.01491694, + "auxiliary_loss_mlp": 0.00375932, + "balance_loss_clip": 1.17725515, + "balance_loss_mlp": 0.34659445, + "epoch": 0.16203216594017736, + "flos": 24133658734080.0, + "grad_norm": 29.479188773876324, + "language_loss": 0.76774526, + "learning_rate": 3.819903250131667e-06, + "loss": 0.78642154, + "num_input_tokens_seen": 58351160, + "router_z_loss_clip": 3.15039062, + "router_z_loss_mlp": 0.29333496, + "step": 2695, + "time_per_iteration": 2.6555259227752686 + }, + { + "auxiliary_loss_clip": 0.01490895, + "auxiliary_loss_mlp": 0.00348358, + "balance_loss_clip": 1.17483401, + "balance_loss_mlp": 0.3176139, + "epoch": 0.16209228919284532, + "flos": 22340889999360.0, + "grad_norm": 61.2603312191152, + "language_loss": 0.89614302, + "learning_rate": 3.819741700256637e-06, + "loss": 0.91453552, + "num_input_tokens_seen": 58368505, + "router_z_loss_clip": 3.15820312, + "router_z_loss_mlp": 0.30749512, + "step": 2696, + "time_per_iteration": 2.689598560333252 + }, + { + "auxiliary_loss_clip": 0.01500437, + "auxiliary_loss_mlp": 0.00391026, + "balance_loss_clip": 1.17920876, + "balance_loss_mlp": 0.35814798, + "epoch": 0.1621524124455133, + "flos": 15815131827840.0, + "grad_norm": 4.770994033786437, + "language_loss": 1.00805831, + "learning_rate": 3.8195800813768194e-06, + "loss": 1.02697289, + "num_input_tokens_seen": 58385085, + "router_z_loss_clip": 3.2109375, + "router_z_loss_mlp": 0.32885742, + "step": 2697, + "time_per_iteration": 2.5794692039489746 + }, + { + "auxiliary_loss_clip": 0.01473411, + "auxiliary_loss_mlp": 0.00318192, + "balance_loss_clip": 1.16698444, + "balance_loss_mlp": 0.29063115, + "epoch": 0.16221253569818128, + "flos": 30186688988160.0, + "grad_norm": 13.425801390947946, + "language_loss": 0.86699176, + "learning_rate": 3.819418393498343e-06, + "loss": 0.88490772, + "num_input_tokens_seen": 58406985, + "router_z_loss_clip": 3.06445312, + "router_z_loss_mlp": 0.2755127, + "step": 2698, + "time_per_iteration": 2.7508957386016846 + }, + { + "auxiliary_loss_clip": 0.0148537, + "auxiliary_loss_mlp": 0.00357338, + "balance_loss_clip": 1.18238366, + "balance_loss_mlp": 0.32761919, + "epoch": 0.16227265895084925, + "flos": 24605991601920.0, + "grad_norm": 3.924991351855062, + "language_loss": 0.82409328, + "learning_rate": 3.819256636627339e-06, + "loss": 0.84252036, + "num_input_tokens_seen": 58426205, + "router_z_loss_clip": 3.02734375, + "router_z_loss_mlp": 0.29699707, + "step": 2699, + "time_per_iteration": 2.72969913482666 + }, + { + "auxiliary_loss_clip": 0.01480903, + "auxiliary_loss_mlp": 0.0032273, + "balance_loss_clip": 1.17293429, + "balance_loss_mlp": 0.29547843, + "epoch": 0.1623327822035172, + "flos": 19573326996480.0, + "grad_norm": 3.505079491547812, + "language_loss": 0.93516052, + "learning_rate": 3.81909481076994e-06, + "loss": 0.95319676, + "num_input_tokens_seen": 58443830, + "router_z_loss_clip": 3.07617188, + "router_z_loss_mlp": 0.27258301, + "step": 2700, + "time_per_iteration": 2.744044065475464 + }, + { + "auxiliary_loss_clip": 0.01468434, + "auxiliary_loss_mlp": 0.00299343, + "balance_loss_clip": 1.16150403, + "balance_loss_mlp": 0.27064919, + "epoch": 0.16239290545618518, + "flos": 26468462678400.0, + "grad_norm": 144.93768435477512, + "language_loss": 0.85072631, + "learning_rate": 3.818932915932284e-06, + "loss": 0.86840409, + "num_input_tokens_seen": 58464405, + "router_z_loss_clip": 3.06445312, + "router_z_loss_mlp": 0.28674316, + "step": 2701, + "time_per_iteration": 2.73068904876709 + }, + { + "auxiliary_loss_clip": 0.01476671, + "auxiliary_loss_mlp": 0.00344652, + "balance_loss_clip": 1.16599941, + "balance_loss_mlp": 0.31282273, + "epoch": 0.16245302870885314, + "flos": 15851940289920.0, + "grad_norm": 41.42322094182128, + "language_loss": 0.80711383, + "learning_rate": 3.818770952120511e-06, + "loss": 0.82532704, + "num_input_tokens_seen": 58483295, + "router_z_loss_clip": 3.10742188, + "router_z_loss_mlp": 0.31811523, + "step": 2702, + "time_per_iteration": 2.639944076538086 + }, + { + "auxiliary_loss_clip": 0.01481008, + "auxiliary_loss_mlp": 0.00313635, + "balance_loss_clip": 1.172719, + "balance_loss_mlp": 0.28390419, + "epoch": 0.1625131519615211, + "flos": 14756521173120.0, + "grad_norm": 2.83468396014323, + "language_loss": 0.81229645, + "learning_rate": 3.81860891934076e-06, + "loss": 0.83024287, + "num_input_tokens_seen": 58501205, + "router_z_loss_clip": 3.08398438, + "router_z_loss_mlp": 0.29760742, + "step": 2703, + "time_per_iteration": 2.582578182220459 + }, + { + "auxiliary_loss_clip": 0.01488281, + "auxiliary_loss_mlp": 0.00321263, + "balance_loss_clip": 1.17740321, + "balance_loss_mlp": 0.28907627, + "epoch": 0.1625732752141891, + "flos": 28220508368640.0, + "grad_norm": 3.1986356517563084, + "language_loss": 0.76752788, + "learning_rate": 3.818446817599176e-06, + "loss": 0.78562331, + "num_input_tokens_seen": 58522315, + "router_z_loss_clip": 3.10742188, + "router_z_loss_mlp": 0.32177734, + "step": 2704, + "time_per_iteration": 2.7193050384521484 + }, + { + "auxiliary_loss_clip": 0.0149688, + "auxiliary_loss_mlp": 0.00056253, + "balance_loss_clip": 1.25535989, + "balance_loss_mlp": 0.04576214, + "epoch": 0.16263339846685707, + "flos": 67327947688320.0, + "grad_norm": 0.774855822536366, + "language_loss": 0.53273785, + "learning_rate": 3.818284646901907e-06, + "loss": 0.54826915, + "num_input_tokens_seen": 58586695, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.10498047, + "step": 2705, + "time_per_iteration": 3.117077350616455 + }, + { + "auxiliary_loss_clip": 0.01493922, + "auxiliary_loss_mlp": 0.00326283, + "balance_loss_clip": 1.18135762, + "balance_loss_mlp": 0.29630196, + "epoch": 0.16269352171952503, + "flos": 14319165173760.0, + "grad_norm": 4.05398422709656, + "language_loss": 0.8509953, + "learning_rate": 3.818122407255102e-06, + "loss": 0.86919737, + "num_input_tokens_seen": 58602435, + "router_z_loss_clip": 3.12695312, + "router_z_loss_mlp": 0.29968262, + "step": 2706, + "time_per_iteration": 2.5767924785614014 + }, + { + "auxiliary_loss_clip": 0.01483845, + "auxiliary_loss_mlp": 0.00371876, + "balance_loss_clip": 1.17664731, + "balance_loss_mlp": 0.34164411, + "epoch": 0.162753644972193, + "flos": 28361205941760.0, + "grad_norm": 3.1104604119048274, + "language_loss": 0.79787266, + "learning_rate": 3.817960098664914e-06, + "loss": 0.81642985, + "num_input_tokens_seen": 58621275, + "router_z_loss_clip": 3.07421875, + "router_z_loss_mlp": 0.30224609, + "step": 2707, + "time_per_iteration": 2.675731897354126 + }, + { + "auxiliary_loss_clip": 0.01527578, + "auxiliary_loss_mlp": 0.00359955, + "balance_loss_clip": 1.21470177, + "balance_loss_mlp": 0.32888907, + "epoch": 0.16281376822486096, + "flos": 19937856170880.0, + "grad_norm": 7.252683704048497, + "language_loss": 0.90741777, + "learning_rate": 3.817797721137495e-06, + "loss": 0.92629313, + "num_input_tokens_seen": 58637550, + "router_z_loss_clip": 3.12890625, + "router_z_loss_mlp": 0.31054688, + "step": 2708, + "time_per_iteration": 2.578489303588867 + }, + { + "auxiliary_loss_clip": 0.01521617, + "auxiliary_loss_mlp": 0.00334664, + "balance_loss_clip": 1.20661044, + "balance_loss_mlp": 0.30185804, + "epoch": 0.16287389147752893, + "flos": 21251719848960.0, + "grad_norm": 4.278918636421453, + "language_loss": 0.94295567, + "learning_rate": 3.817635274679006e-06, + "loss": 0.96151853, + "num_input_tokens_seen": 58654135, + "router_z_loss_clip": 3.15039062, + "router_z_loss_mlp": 0.328125, + "step": 2709, + "time_per_iteration": 2.643327474594116 + }, + { + "auxiliary_loss_clip": 0.01525925, + "auxiliary_loss_mlp": 0.00361664, + "balance_loss_clip": 1.20881009, + "balance_loss_mlp": 0.33035967, + "epoch": 0.1629340147301969, + "flos": 19244672530560.0, + "grad_norm": 8.831786939509964, + "language_loss": 0.96190214, + "learning_rate": 3.817472759295605e-06, + "loss": 0.98077798, + "num_input_tokens_seen": 58674320, + "router_z_loss_clip": 3.16992188, + "router_z_loss_mlp": 0.31323242, + "step": 2710, + "time_per_iteration": 2.6302928924560547 + }, + { + "auxiliary_loss_clip": 0.01532237, + "auxiliary_loss_mlp": 0.00317204, + "balance_loss_clip": 1.21722209, + "balance_loss_mlp": 0.2853992, + "epoch": 0.16299413798286488, + "flos": 21249816428160.0, + "grad_norm": 10.49339854084623, + "language_loss": 0.89061755, + "learning_rate": 3.817310174993453e-06, + "loss": 0.90911186, + "num_input_tokens_seen": 58691000, + "router_z_loss_clip": 3.1484375, + "router_z_loss_mlp": 0.31762695, + "step": 2711, + "time_per_iteration": 2.6194944381713867 + }, + { + "auxiliary_loss_clip": 0.0153955, + "auxiliary_loss_mlp": 0.00319387, + "balance_loss_clip": 1.21803784, + "balance_loss_mlp": 0.28817818, + "epoch": 0.16305426123553285, + "flos": 18770579896320.0, + "grad_norm": 7.595074077114739, + "language_loss": 0.89767945, + "learning_rate": 3.817147521778719e-06, + "loss": 0.91626883, + "num_input_tokens_seen": 58710230, + "router_z_loss_clip": 3.21679688, + "router_z_loss_mlp": 0.31176758, + "step": 2712, + "time_per_iteration": 2.690333366394043 + }, + { + "auxiliary_loss_clip": 0.01536871, + "auxiliary_loss_mlp": 0.00327091, + "balance_loss_clip": 1.22118711, + "balance_loss_mlp": 0.29600126, + "epoch": 0.16311438448820081, + "flos": 22087648137600.0, + "grad_norm": 51.28455592055306, + "language_loss": 0.83963013, + "learning_rate": 3.816984799657568e-06, + "loss": 0.85826981, + "num_input_tokens_seen": 58728610, + "router_z_loss_clip": 3.15625, + "router_z_loss_mlp": 0.31079102, + "step": 2713, + "time_per_iteration": 2.690988063812256 + }, + { + "auxiliary_loss_clip": 0.01546404, + "auxiliary_loss_mlp": 0.00309744, + "balance_loss_clip": 1.23403871, + "balance_loss_mlp": 0.2797513, + "epoch": 0.16317450774086878, + "flos": 16467700164480.0, + "grad_norm": 59.14708318536763, + "language_loss": 0.88047516, + "learning_rate": 3.8168220086361715e-06, + "loss": 0.89903665, + "num_input_tokens_seen": 58744385, + "router_z_loss_clip": 3.12304688, + "router_z_loss_mlp": 0.29980469, + "step": 2714, + "time_per_iteration": 2.6443002223968506 + }, + { + "auxiliary_loss_clip": 0.01537956, + "auxiliary_loss_mlp": 0.00336149, + "balance_loss_clip": 1.22752881, + "balance_loss_mlp": 0.30596554, + "epoch": 0.16323463099353674, + "flos": 24352929308160.0, + "grad_norm": 31.848344689164975, + "language_loss": 0.83808923, + "learning_rate": 3.816659148720702e-06, + "loss": 0.85683024, + "num_input_tokens_seen": 58763905, + "router_z_loss_clip": 3.1015625, + "router_z_loss_mlp": 0.30200195, + "step": 2715, + "time_per_iteration": 2.684872627258301 + }, + { + "auxiliary_loss_clip": 0.0151838, + "auxiliary_loss_mlp": 0.00317532, + "balance_loss_clip": 1.20830309, + "balance_loss_mlp": 0.287682, + "epoch": 0.1632947542462047, + "flos": 24900782520960.0, + "grad_norm": 69.27278249030107, + "language_loss": 0.90019464, + "learning_rate": 3.816496219917336e-06, + "loss": 0.91855371, + "num_input_tokens_seen": 58785580, + "router_z_loss_clip": 3.1015625, + "router_z_loss_mlp": 0.29858398, + "step": 2716, + "time_per_iteration": 2.7345104217529297 + }, + { + "auxiliary_loss_clip": 0.01517208, + "auxiliary_loss_mlp": 0.00352637, + "balance_loss_clip": 1.20465088, + "balance_loss_mlp": 0.32091558, + "epoch": 0.1633548774988727, + "flos": 24900279730560.0, + "grad_norm": 7573.817811784336, + "language_loss": 0.92966998, + "learning_rate": 3.816333222232251e-06, + "loss": 0.94836837, + "num_input_tokens_seen": 58806075, + "router_z_loss_clip": 3.12695312, + "router_z_loss_mlp": 0.31750488, + "step": 2717, + "time_per_iteration": 2.7030417919158936 + }, + { + "auxiliary_loss_clip": 0.01501098, + "auxiliary_loss_mlp": 0.00414825, + "balance_loss_clip": 1.19563258, + "balance_loss_mlp": 0.38385475, + "epoch": 0.16341500075154067, + "flos": 30441798357120.0, + "grad_norm": 8.630009765511026, + "language_loss": 0.84394771, + "learning_rate": 3.816170155671629e-06, + "loss": 0.86310697, + "num_input_tokens_seen": 58827405, + "router_z_loss_clip": 3.05273438, + "router_z_loss_mlp": 0.30957031, + "step": 2718, + "time_per_iteration": 2.75041127204895 + }, + { + "auxiliary_loss_clip": 0.01494856, + "auxiliary_loss_mlp": 0.00486355, + "balance_loss_clip": 1.19174278, + "balance_loss_mlp": 0.45378727, + "epoch": 0.16347512400420863, + "flos": 22784530878720.0, + "grad_norm": 19.929234099519352, + "language_loss": 0.8170321, + "learning_rate": 3.816007020241652e-06, + "loss": 0.83684421, + "num_input_tokens_seen": 58847205, + "router_z_loss_clip": 3.02929688, + "router_z_loss_mlp": 0.32568359, + "step": 2719, + "time_per_iteration": 4.039881944656372 + }, + { + "auxiliary_loss_clip": 0.01490797, + "auxiliary_loss_mlp": 0.00524836, + "balance_loss_clip": 1.19029224, + "balance_loss_mlp": 0.49202994, + "epoch": 0.1635352472568766, + "flos": 22633274707200.0, + "grad_norm": 9.227010051049746, + "language_loss": 0.77038616, + "learning_rate": 3.815843815948507e-06, + "loss": 0.79054248, + "num_input_tokens_seen": 58866865, + "router_z_loss_clip": 3.00585938, + "router_z_loss_mlp": 0.328125, + "step": 2720, + "time_per_iteration": 2.6892387866973877 + }, + { + "auxiliary_loss_clip": 0.01478013, + "auxiliary_loss_mlp": 0.0060964, + "balance_loss_clip": 1.1804626, + "balance_loss_mlp": 0.57151645, + "epoch": 0.16359537050954456, + "flos": 15522998515200.0, + "grad_norm": 45.44461887361915, + "language_loss": 0.83546692, + "learning_rate": 3.8156805427983824e-06, + "loss": 0.85634345, + "num_input_tokens_seen": 58885200, + "router_z_loss_clip": 2.97460938, + "router_z_loss_mlp": 0.38110352, + "step": 2721, + "time_per_iteration": 2.578968048095703 + }, + { + "auxiliary_loss_clip": 0.01496874, + "auxiliary_loss_mlp": 0.00604759, + "balance_loss_clip": 1.18958092, + "balance_loss_mlp": 0.56635022, + "epoch": 0.16365549376221253, + "flos": 22090162089600.0, + "grad_norm": 4.765991255892765, + "language_loss": 0.83918065, + "learning_rate": 3.8155172007974695e-06, + "loss": 0.86019701, + "num_input_tokens_seen": 58906385, + "router_z_loss_clip": 3.07421875, + "router_z_loss_mlp": 0.38427734, + "step": 2722, + "time_per_iteration": 2.6746537685394287 + }, + { + "auxiliary_loss_clip": 0.01509678, + "auxiliary_loss_mlp": 0.00663292, + "balance_loss_clip": 1.20350432, + "balance_loss_mlp": 0.6205914, + "epoch": 0.1637156170148805, + "flos": 24060400945920.0, + "grad_norm": 26.9548465627625, + "language_loss": 0.92477477, + "learning_rate": 3.8153537899519624e-06, + "loss": 0.94650447, + "num_input_tokens_seen": 58925040, + "router_z_loss_clip": 3.0625, + "router_z_loss_mlp": 0.42700195, + "step": 2723, + "time_per_iteration": 5.510082006454468 + }, + { + "auxiliary_loss_clip": 0.01486114, + "auxiliary_loss_mlp": 0.0067569, + "balance_loss_clip": 1.19219112, + "balance_loss_mlp": 0.63692343, + "epoch": 0.1637757402675485, + "flos": 26685362954880.0, + "grad_norm": 4.1000060520368145, + "language_loss": 0.76033413, + "learning_rate": 3.815190310268058e-06, + "loss": 0.78195214, + "num_input_tokens_seen": 58944790, + "router_z_loss_clip": 2.93945312, + "router_z_loss_mlp": 0.38793945, + "step": 2724, + "time_per_iteration": 2.698376417160034 + }, + { + "auxiliary_loss_clip": 0.01494247, + "auxiliary_loss_mlp": 0.00722343, + "balance_loss_clip": 1.20127964, + "balance_loss_mlp": 0.68078679, + "epoch": 0.16383586352021645, + "flos": 16106941918080.0, + "grad_norm": 16.709666065525667, + "language_loss": 0.77340174, + "learning_rate": 3.815026761751955e-06, + "loss": 0.79556763, + "num_input_tokens_seen": 58962500, + "router_z_loss_clip": 2.93164062, + "router_z_loss_mlp": 0.4152832, + "step": 2725, + "time_per_iteration": 2.6056911945343018 + }, + { + "auxiliary_loss_clip": 0.01488647, + "auxiliary_loss_mlp": 0.00687636, + "balance_loss_clip": 1.19494009, + "balance_loss_mlp": 0.64593613, + "epoch": 0.16389598677288442, + "flos": 19165991788800.0, + "grad_norm": 16.77786495887254, + "language_loss": 0.92969573, + "learning_rate": 3.814863144409855e-06, + "loss": 0.95145857, + "num_input_tokens_seen": 58980355, + "router_z_loss_clip": 2.93554688, + "router_z_loss_mlp": 0.41748047, + "step": 2726, + "time_per_iteration": 4.062068462371826 + }, + { + "auxiliary_loss_clip": 0.01512188, + "auxiliary_loss_mlp": 0.00767425, + "balance_loss_clip": 1.21410418, + "balance_loss_mlp": 0.72529638, + "epoch": 0.16395611002555238, + "flos": 21507008785920.0, + "grad_norm": 44.43119539631893, + "language_loss": 0.78856421, + "learning_rate": 3.814699458247963e-06, + "loss": 0.81136036, + "num_input_tokens_seen": 58999505, + "router_z_loss_clip": 2.98046875, + "router_z_loss_mlp": 0.42138672, + "step": 2727, + "time_per_iteration": 2.622298002243042 + }, + { + "auxiliary_loss_clip": 0.01508921, + "auxiliary_loss_mlp": 0.0078251, + "balance_loss_clip": 1.21662354, + "balance_loss_mlp": 0.74154997, + "epoch": 0.16401623327822035, + "flos": 21470918595840.0, + "grad_norm": 67.47763260228845, + "language_loss": 0.88574898, + "learning_rate": 3.8145357032724855e-06, + "loss": 0.90866327, + "num_input_tokens_seen": 59017930, + "router_z_loss_clip": 2.92773438, + "router_z_loss_mlp": 0.40966797, + "step": 2728, + "time_per_iteration": 2.61509108543396 + }, + { + "auxiliary_loss_clip": 0.01500354, + "auxiliary_loss_mlp": 0.00742427, + "balance_loss_clip": 1.20129681, + "balance_loss_mlp": 0.6977948, + "epoch": 0.1640763565308883, + "flos": 13626232928640.0, + "grad_norm": 19.596142921214614, + "language_loss": 0.92963934, + "learning_rate": 3.814371879489633e-06, + "loss": 0.95206714, + "num_input_tokens_seen": 59035130, + "router_z_loss_clip": 2.9921875, + "router_z_loss_mlp": 0.44702148, + "step": 2729, + "time_per_iteration": 2.592879295349121 + }, + { + "auxiliary_loss_clip": 0.01511738, + "auxiliary_loss_mlp": 0.00723265, + "balance_loss_clip": 1.21525383, + "balance_loss_mlp": 0.6802305, + "epoch": 0.16413647978355628, + "flos": 15451464579840.0, + "grad_norm": 5.415165963883723, + "language_loss": 0.81626475, + "learning_rate": 3.814207986905616e-06, + "loss": 0.83861476, + "num_input_tokens_seen": 59053080, + "router_z_loss_clip": 2.96875, + "router_z_loss_mlp": 0.43041992, + "step": 2730, + "time_per_iteration": 2.5956099033355713 + }, + { + "auxiliary_loss_clip": 0.0150523, + "auxiliary_loss_mlp": 0.00787382, + "balance_loss_clip": 1.20745564, + "balance_loss_mlp": 0.74027008, + "epoch": 0.16419660303622427, + "flos": 45878682015360.0, + "grad_norm": 4.182912979124772, + "language_loss": 0.81178451, + "learning_rate": 3.814044025526651e-06, + "loss": 0.83471054, + "num_input_tokens_seen": 59075610, + "router_z_loss_clip": 2.97851562, + "router_z_loss_mlp": 0.47167969, + "step": 2731, + "time_per_iteration": 2.8144614696502686 + }, + { + "auxiliary_loss_clip": 0.01499421, + "auxiliary_loss_mlp": 0.00751234, + "balance_loss_clip": 1.20190048, + "balance_loss_mlp": 0.70479012, + "epoch": 0.16425672628889224, + "flos": 18952826526720.0, + "grad_norm": 508.96703258864056, + "language_loss": 0.85445988, + "learning_rate": 3.8138799953589548e-06, + "loss": 0.87696648, + "num_input_tokens_seen": 59094555, + "router_z_loss_clip": 2.9765625, + "router_z_loss_mlp": 0.46459961, + "step": 2732, + "time_per_iteration": 2.631650924682617 + }, + { + "auxiliary_loss_clip": 0.015121, + "auxiliary_loss_mlp": 0.00771298, + "balance_loss_clip": 1.21442461, + "balance_loss_mlp": 0.72521174, + "epoch": 0.1643168495415602, + "flos": 24312996362880.0, + "grad_norm": 11.22616128029311, + "language_loss": 0.74435639, + "learning_rate": 3.8137158964087473e-06, + "loss": 0.7671904, + "num_input_tokens_seen": 59113515, + "router_z_loss_clip": 2.98046875, + "router_z_loss_mlp": 0.46118164, + "step": 2733, + "time_per_iteration": 2.767143964767456 + }, + { + "auxiliary_loss_clip": 0.01499214, + "auxiliary_loss_mlp": 0.00769566, + "balance_loss_clip": 1.20607233, + "balance_loss_mlp": 0.72445738, + "epoch": 0.16437697279422817, + "flos": 26428421992320.0, + "grad_norm": 3.2040007811817697, + "language_loss": 0.8586145, + "learning_rate": 3.8135517286822508e-06, + "loss": 0.88130236, + "num_input_tokens_seen": 59133275, + "router_z_loss_clip": 2.9296875, + "router_z_loss_mlp": 0.45092773, + "step": 2734, + "time_per_iteration": 2.649622678756714 + }, + { + "auxiliary_loss_clip": 0.01498412, + "auxiliary_loss_mlp": 0.00705137, + "balance_loss_clip": 1.20352888, + "balance_loss_mlp": 0.66274655, + "epoch": 0.16443709604689613, + "flos": 34532239351680.0, + "grad_norm": 3.608109154713085, + "language_loss": 0.90270656, + "learning_rate": 3.8133874921856914e-06, + "loss": 0.9247421, + "num_input_tokens_seen": 59154095, + "router_z_loss_clip": 2.9453125, + "router_z_loss_mlp": 0.42407227, + "step": 2735, + "time_per_iteration": 2.803454637527466 + }, + { + "auxiliary_loss_clip": 0.01502671, + "auxiliary_loss_mlp": 0.00634138, + "balance_loss_clip": 1.21297789, + "balance_loss_mlp": 0.59606278, + "epoch": 0.1644972192995641, + "flos": 23258048895360.0, + "grad_norm": 21.164701853935167, + "language_loss": 0.86329299, + "learning_rate": 3.813223186925296e-06, + "loss": 0.88466108, + "num_input_tokens_seen": 59173795, + "router_z_loss_clip": 2.89648438, + "router_z_loss_mlp": 0.38110352, + "step": 2736, + "time_per_iteration": 2.5941431522369385 + }, + { + "auxiliary_loss_clip": 0.01520267, + "auxiliary_loss_mlp": 0.00555395, + "balance_loss_clip": 1.23267269, + "balance_loss_mlp": 0.51906013, + "epoch": 0.1645573425522321, + "flos": 26979543342720.0, + "grad_norm": 1233.5714948751527, + "language_loss": 0.85806489, + "learning_rate": 3.8130588129072964e-06, + "loss": 0.87882155, + "num_input_tokens_seen": 59191610, + "router_z_loss_clip": 2.875, + "router_z_loss_mlp": 0.36303711, + "step": 2737, + "time_per_iteration": 2.7108564376831055 + }, + { + "auxiliary_loss_clip": 0.01516658, + "auxiliary_loss_mlp": 0.00526131, + "balance_loss_clip": 1.22384238, + "balance_loss_mlp": 0.49094027, + "epoch": 0.16461746580490005, + "flos": 28731768600960.0, + "grad_norm": 2357.4209483279783, + "language_loss": 0.92703778, + "learning_rate": 3.8128943701379246e-06, + "loss": 0.94746566, + "num_input_tokens_seen": 59213000, + "router_z_loss_clip": 2.92773438, + "router_z_loss_mlp": 0.35180664, + "step": 2738, + "time_per_iteration": 2.7346370220184326 + }, + { + "auxiliary_loss_clip": 0.01499073, + "auxiliary_loss_mlp": 0.00494643, + "balance_loss_clip": 1.20406806, + "balance_loss_mlp": 0.46116859, + "epoch": 0.16467758905756802, + "flos": 24930156867840.0, + "grad_norm": 53.364908275663474, + "language_loss": 0.76337326, + "learning_rate": 3.8127298586234167e-06, + "loss": 0.78331041, + "num_input_tokens_seen": 59232340, + "router_z_loss_clip": 2.94726562, + "router_z_loss_mlp": 0.3347168, + "step": 2739, + "time_per_iteration": 2.6788041591644287 + }, + { + "auxiliary_loss_clip": 0.01507913, + "auxiliary_loss_mlp": 0.00483519, + "balance_loss_clip": 1.21390259, + "balance_loss_mlp": 0.44992566, + "epoch": 0.16473771231023598, + "flos": 24826519152000.0, + "grad_norm": 7.415243957160203, + "language_loss": 0.88038731, + "learning_rate": 3.8125652783700104e-06, + "loss": 0.90030158, + "num_input_tokens_seen": 59253950, + "router_z_loss_clip": 2.94140625, + "router_z_loss_mlp": 0.3359375, + "step": 2740, + "time_per_iteration": 2.663074254989624 + }, + { + "auxiliary_loss_clip": 0.0152237, + "auxiliary_loss_mlp": 0.00506633, + "balance_loss_clip": 1.22777677, + "balance_loss_mlp": 0.47175264, + "epoch": 0.16479783556290395, + "flos": 39896072375040.0, + "grad_norm": 65.14322682502963, + "language_loss": 0.7745229, + "learning_rate": 3.8124006293839475e-06, + "loss": 0.79481292, + "num_input_tokens_seen": 59275545, + "router_z_loss_clip": 2.94140625, + "router_z_loss_mlp": 0.34887695, + "step": 2741, + "time_per_iteration": 2.8516757488250732 + }, + { + "auxiliary_loss_clip": 0.01519222, + "auxiliary_loss_mlp": 0.00464428, + "balance_loss_clip": 1.22273457, + "balance_loss_mlp": 0.43155015, + "epoch": 0.16485795881557191, + "flos": 19897061299200.0, + "grad_norm": 7.524343480413302, + "language_loss": 0.85168797, + "learning_rate": 3.812235911671472e-06, + "loss": 0.87152445, + "num_input_tokens_seen": 59293480, + "router_z_loss_clip": 2.96484375, + "router_z_loss_mlp": 0.32885742, + "step": 2742, + "time_per_iteration": 2.623344659805298 + }, + { + "auxiliary_loss_clip": 0.01531458, + "auxiliary_loss_mlp": 0.00425975, + "balance_loss_clip": 1.23353493, + "balance_loss_mlp": 0.3955524, + "epoch": 0.16491808206823988, + "flos": 20556129997440.0, + "grad_norm": 16.06150438959695, + "language_loss": 0.89797938, + "learning_rate": 3.8120711252388274e-06, + "loss": 0.91755372, + "num_input_tokens_seen": 59313435, + "router_z_loss_clip": 2.9765625, + "router_z_loss_mlp": 0.30444336, + "step": 2743, + "time_per_iteration": 2.627279043197632 + }, + { + "auxiliary_loss_clip": 0.01501605, + "auxiliary_loss_mlp": 0.00458106, + "balance_loss_clip": 1.20581722, + "balance_loss_mlp": 0.42534706, + "epoch": 0.16497820532090787, + "flos": 23800802376960.0, + "grad_norm": 8.222120306605389, + "language_loss": 0.90509182, + "learning_rate": 3.811906270092265e-06, + "loss": 0.92468894, + "num_input_tokens_seen": 59331535, + "router_z_loss_clip": 2.95507812, + "router_z_loss_mlp": 0.32739258, + "step": 2744, + "time_per_iteration": 2.6992557048797607 + }, + { + "auxiliary_loss_clip": 0.01496332, + "auxiliary_loss_mlp": 0.00414858, + "balance_loss_clip": 1.2007879, + "balance_loss_mlp": 0.38786942, + "epoch": 0.16503832857357584, + "flos": 25482642935040.0, + "grad_norm": 8.250385815604746, + "language_loss": 0.8796792, + "learning_rate": 3.811741346238036e-06, + "loss": 0.89879107, + "num_input_tokens_seen": 59350680, + "router_z_loss_clip": 2.95117188, + "router_z_loss_mlp": 0.26965332, + "step": 2745, + "time_per_iteration": 2.610992670059204 + }, + { + "auxiliary_loss_clip": 0.01518094, + "auxiliary_loss_mlp": 0.00469632, + "balance_loss_clip": 1.22074008, + "balance_loss_mlp": 0.43583575, + "epoch": 0.1650984518262438, + "flos": 17676058619520.0, + "grad_norm": 419.22654607005035, + "language_loss": 0.8147589, + "learning_rate": 3.8115763536823923e-06, + "loss": 0.83463615, + "num_input_tokens_seen": 59367020, + "router_z_loss_clip": 2.97265625, + "router_z_loss_mlp": 0.3380127, + "step": 2746, + "time_per_iteration": 2.596585750579834 + }, + { + "auxiliary_loss_clip": 0.0152074, + "auxiliary_loss_mlp": 0.00427491, + "balance_loss_clip": 1.22142982, + "balance_loss_mlp": 0.39494729, + "epoch": 0.16515857507891177, + "flos": 18698327688960.0, + "grad_norm": 259.5125276363661, + "language_loss": 0.85699075, + "learning_rate": 3.811411292431592e-06, + "loss": 0.87647307, + "num_input_tokens_seen": 59386075, + "router_z_loss_clip": 2.99023438, + "router_z_loss_mlp": 0.32568359, + "step": 2747, + "time_per_iteration": 2.592313766479492 + }, + { + "auxiliary_loss_clip": 0.01532142, + "auxiliary_loss_mlp": 0.00426771, + "balance_loss_clip": 1.23334384, + "balance_loss_mlp": 0.39699274, + "epoch": 0.16521869833157973, + "flos": 15010481306880.0, + "grad_norm": 17.751421239793256, + "language_loss": 0.76698685, + "learning_rate": 3.8112461624918945e-06, + "loss": 0.78657597, + "num_input_tokens_seen": 59402690, + "router_z_loss_clip": 2.98828125, + "router_z_loss_mlp": 0.29760742, + "step": 2748, + "time_per_iteration": 2.62016224861145 + }, + { + "auxiliary_loss_clip": 0.01529937, + "auxiliary_loss_mlp": 0.00450328, + "balance_loss_clip": 1.2282759, + "balance_loss_mlp": 0.42057347, + "epoch": 0.1652788215842477, + "flos": 22121152548480.0, + "grad_norm": 2.5734466565648746, + "language_loss": 0.9402017, + "learning_rate": 3.811080963869561e-06, + "loss": 0.96000433, + "num_input_tokens_seen": 59421130, + "router_z_loss_clip": 3.01171875, + "router_z_loss_mlp": 0.29736328, + "step": 2749, + "time_per_iteration": 2.5881943702697754 + }, + { + "auxiliary_loss_clip": 0.01557474, + "auxiliary_loss_mlp": 0.00517101, + "balance_loss_clip": 1.25423551, + "balance_loss_mlp": 0.480838, + "epoch": 0.16533894483691566, + "flos": 18333080242560.0, + "grad_norm": 21.682374896252586, + "language_loss": 0.84844917, + "learning_rate": 3.8109156965708557e-06, + "loss": 0.86919492, + "num_input_tokens_seen": 59438970, + "router_z_loss_clip": 3.03320312, + "router_z_loss_mlp": 0.36254883, + "step": 2750, + "time_per_iteration": 2.6236283779144287 + }, + { + "auxiliary_loss_clip": 0.01547834, + "auxiliary_loss_mlp": 0.00480992, + "balance_loss_clip": 1.24434686, + "balance_loss_mlp": 0.4478282, + "epoch": 0.16539906808958366, + "flos": 22382115834240.0, + "grad_norm": 10.05253584043569, + "language_loss": 1.01491499, + "learning_rate": 3.8107503606020455e-06, + "loss": 1.03520322, + "num_input_tokens_seen": 59458510, + "router_z_loss_clip": 3.03710938, + "router_z_loss_mlp": 0.33178711, + "step": 2751, + "time_per_iteration": 2.5954554080963135 + }, + { + "auxiliary_loss_clip": 0.01540075, + "auxiliary_loss_mlp": 0.00472519, + "balance_loss_clip": 1.23961258, + "balance_loss_mlp": 0.43966442, + "epoch": 0.16545919134225162, + "flos": 22711093522560.0, + "grad_norm": 36.96308707823127, + "language_loss": 0.77336812, + "learning_rate": 3.8105849559693997e-06, + "loss": 0.79349411, + "num_input_tokens_seen": 59477110, + "router_z_loss_clip": 3.00585938, + "router_z_loss_mlp": 0.32824707, + "step": 2752, + "time_per_iteration": 2.6553573608398438 + }, + { + "auxiliary_loss_clip": 0.01680246, + "auxiliary_loss_mlp": 0.00306021, + "balance_loss_clip": 1.42792106, + "balance_loss_mlp": 0.28418145, + "epoch": 0.1655193145949196, + "flos": 67802974076160.0, + "grad_norm": 0.7586558537862054, + "language_loss": 0.53750372, + "learning_rate": 3.810419482679192e-06, + "loss": 0.55736637, + "num_input_tokens_seen": 59541155, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.21875, + "step": 2753, + "time_per_iteration": 3.200726270675659 + }, + { + "auxiliary_loss_clip": 0.01561789, + "auxiliary_loss_mlp": 0.00515637, + "balance_loss_clip": 1.26514244, + "balance_loss_mlp": 0.47925472, + "epoch": 0.16557943784758755, + "flos": 24280389792000.0, + "grad_norm": 8.498209702674462, + "language_loss": 0.81139821, + "learning_rate": 3.8102539407376954e-06, + "loss": 0.83217251, + "num_input_tokens_seen": 59561155, + "router_z_loss_clip": 2.96289062, + "router_z_loss_mlp": 0.36401367, + "step": 2754, + "time_per_iteration": 2.6174275875091553 + }, + { + "auxiliary_loss_clip": 0.01581704, + "auxiliary_loss_mlp": 0.00541582, + "balance_loss_clip": 1.27486062, + "balance_loss_mlp": 0.5073455, + "epoch": 0.16563956110025552, + "flos": 20083617561600.0, + "grad_norm": 4.179186631994736, + "language_loss": 0.94976014, + "learning_rate": 3.810088330151188e-06, + "loss": 0.97099298, + "num_input_tokens_seen": 59580460, + "router_z_loss_clip": 3.06835938, + "router_z_loss_mlp": 0.34228516, + "step": 2755, + "time_per_iteration": 2.645301103591919 + }, + { + "auxiliary_loss_clip": 0.01574657, + "auxiliary_loss_mlp": 0.00527446, + "balance_loss_clip": 1.27316022, + "balance_loss_mlp": 0.49306589, + "epoch": 0.16569968435292348, + "flos": 28034454896640.0, + "grad_norm": 29.937220770831704, + "language_loss": 0.77596354, + "learning_rate": 3.80992265092595e-06, + "loss": 0.79698455, + "num_input_tokens_seen": 59600025, + "router_z_loss_clip": 3.01367188, + "router_z_loss_mlp": 0.34350586, + "step": 2756, + "time_per_iteration": 2.713578224182129 + }, + { + "auxiliary_loss_clip": 0.01579459, + "auxiliary_loss_mlp": 0.00542375, + "balance_loss_clip": 1.28080463, + "balance_loss_mlp": 0.50787616, + "epoch": 0.16575980760559147, + "flos": 26250233598720.0, + "grad_norm": 174.6002239115907, + "language_loss": 0.81242263, + "learning_rate": 3.8097569030682636e-06, + "loss": 0.83364099, + "num_input_tokens_seen": 59620600, + "router_z_loss_clip": 2.98632812, + "router_z_loss_mlp": 0.34472656, + "step": 2757, + "time_per_iteration": 2.6382925510406494 + }, + { + "auxiliary_loss_clip": 0.01593656, + "auxiliary_loss_mlp": 0.00520746, + "balance_loss_clip": 1.28837538, + "balance_loss_mlp": 0.48743927, + "epoch": 0.16581993085825944, + "flos": 26943955943040.0, + "grad_norm": 7.540760953872778, + "language_loss": 0.89776254, + "learning_rate": 3.8095910865844137e-06, + "loss": 0.91890657, + "num_input_tokens_seen": 59641385, + "router_z_loss_clip": 3.04882812, + "router_z_loss_mlp": 0.33300781, + "step": 2758, + "time_per_iteration": 2.6746013164520264 + }, + { + "auxiliary_loss_clip": 0.01601131, + "auxiliary_loss_mlp": 0.00489292, + "balance_loss_clip": 1.2964828, + "balance_loss_mlp": 0.4580833, + "epoch": 0.1658800541109274, + "flos": 21653632103040.0, + "grad_norm": 159.05122660516275, + "language_loss": 0.85809779, + "learning_rate": 3.809425201480689e-06, + "loss": 0.87900198, + "num_input_tokens_seen": 59659865, + "router_z_loss_clip": 3.04492188, + "router_z_loss_mlp": 0.31164551, + "step": 2759, + "time_per_iteration": 2.6031949520111084 + }, + { + "auxiliary_loss_clip": 0.01582353, + "auxiliary_loss_mlp": 0.00505667, + "balance_loss_clip": 1.2721498, + "balance_loss_mlp": 0.47076252, + "epoch": 0.16594017736359537, + "flos": 16435488643200.0, + "grad_norm": 5.340647119360419, + "language_loss": 0.85851657, + "learning_rate": 3.8092592477633793e-06, + "loss": 0.8793968, + "num_input_tokens_seen": 59678780, + "router_z_loss_clip": 3.10546875, + "router_z_loss_mlp": 0.34838867, + "step": 2760, + "time_per_iteration": 2.5691797733306885 + }, + { + "auxiliary_loss_clip": 0.0159188, + "auxiliary_loss_mlp": 0.00533651, + "balance_loss_clip": 1.28510141, + "balance_loss_mlp": 0.49941453, + "epoch": 0.16600030061626334, + "flos": 22637297030400.0, + "grad_norm": 9.237520839732298, + "language_loss": 0.79982007, + "learning_rate": 3.8090932254387774e-06, + "loss": 0.82107532, + "num_input_tokens_seen": 59698795, + "router_z_loss_clip": 3.0703125, + "router_z_loss_mlp": 0.34277344, + "step": 2761, + "time_per_iteration": 2.655872106552124 + }, + { + "auxiliary_loss_clip": 0.01577531, + "auxiliary_loss_mlp": 0.00520251, + "balance_loss_clip": 1.26571012, + "balance_loss_mlp": 0.48558456, + "epoch": 0.1660604238689313, + "flos": 26396569607040.0, + "grad_norm": 56.81007174309186, + "language_loss": 0.93187624, + "learning_rate": 3.8089271345131788e-06, + "loss": 0.9528541, + "num_input_tokens_seen": 59718795, + "router_z_loss_clip": 3.1171875, + "router_z_loss_mlp": 0.34692383, + "step": 2762, + "time_per_iteration": 4.067171096801758 + }, + { + "auxiliary_loss_clip": 0.01568703, + "auxiliary_loss_mlp": 0.0048465, + "balance_loss_clip": 1.25455451, + "balance_loss_mlp": 0.45279688, + "epoch": 0.16612054712159927, + "flos": 23039999383680.0, + "grad_norm": 3.122237771536203, + "language_loss": 0.9585017, + "learning_rate": 3.8087609749928822e-06, + "loss": 0.97903526, + "num_input_tokens_seen": 59737555, + "router_z_loss_clip": 3.14453125, + "router_z_loss_mlp": 0.31860352, + "step": 2763, + "time_per_iteration": 2.6473898887634277 + }, + { + "auxiliary_loss_clip": 0.01639962, + "auxiliary_loss_mlp": 0.00203403, + "balance_loss_clip": 1.38814592, + "balance_loss_mlp": 0.18795316, + "epoch": 0.16618067037426726, + "flos": 59241225202560.0, + "grad_norm": 0.8102432604774885, + "language_loss": 0.59855008, + "learning_rate": 3.8085947468841885e-06, + "loss": 0.61698371, + "num_input_tokens_seen": 59800915, + "router_z_loss_clip": 2.515625, + "router_z_loss_mlp": 0.15429688, + "step": 2764, + "time_per_iteration": 3.158813238143921 + }, + { + "auxiliary_loss_clip": 0.01604923, + "auxiliary_loss_mlp": 0.00535246, + "balance_loss_clip": 1.29501128, + "balance_loss_mlp": 0.50065142, + "epoch": 0.16624079362693522, + "flos": 27198813916800.0, + "grad_norm": 12.571611601855002, + "language_loss": 0.86643255, + "learning_rate": 3.808428450193401e-06, + "loss": 0.88783419, + "num_input_tokens_seen": 59822910, + "router_z_loss_clip": 3.09765625, + "router_z_loss_mlp": 0.34594727, + "step": 2765, + "time_per_iteration": 4.181353569030762 + }, + { + "auxiliary_loss_clip": 0.01597185, + "auxiliary_loss_mlp": 0.00530132, + "balance_loss_clip": 1.29039037, + "balance_loss_mlp": 0.49420235, + "epoch": 0.1663009168796032, + "flos": 10925068216320.0, + "grad_norm": 2.562859459698781, + "language_loss": 0.79835808, + "learning_rate": 3.8082620849268244e-06, + "loss": 0.81963122, + "num_input_tokens_seen": 59838805, + "router_z_loss_clip": 3.06445312, + "router_z_loss_mlp": 0.359375, + "step": 2766, + "time_per_iteration": 2.701281785964966 + }, + { + "auxiliary_loss_clip": 0.01579458, + "auxiliary_loss_mlp": 0.00486304, + "balance_loss_clip": 1.2717545, + "balance_loss_mlp": 0.45561963, + "epoch": 0.16636104013227115, + "flos": 17894431353600.0, + "grad_norm": 25.151213035663982, + "language_loss": 0.95959461, + "learning_rate": 3.808095651090769e-06, + "loss": 0.98025227, + "num_input_tokens_seen": 59855345, + "router_z_loss_clip": 3.07421875, + "router_z_loss_mlp": 0.30712891, + "step": 2767, + "time_per_iteration": 2.545445680618286 + }, + { + "auxiliary_loss_clip": 0.01615571, + "auxiliary_loss_mlp": 0.00208724, + "balance_loss_clip": 1.37825775, + "balance_loss_mlp": 0.19413239, + "epoch": 0.16642116338493912, + "flos": 66726050463360.0, + "grad_norm": 0.6343364974405218, + "language_loss": 0.52650172, + "learning_rate": 3.8079291486915447e-06, + "loss": 0.54474467, + "num_input_tokens_seen": 59917710, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.14550781, + "step": 2768, + "time_per_iteration": 4.549867630004883 + }, + { + "auxiliary_loss_clip": 0.015795, + "auxiliary_loss_mlp": 0.00464358, + "balance_loss_clip": 1.26934862, + "balance_loss_mlp": 0.43250555, + "epoch": 0.16648128663760708, + "flos": 19026048401280.0, + "grad_norm": 18.37728515768898, + "language_loss": 0.95549893, + "learning_rate": 3.8077625777354667e-06, + "loss": 0.97593749, + "num_input_tokens_seen": 59935105, + "router_z_loss_clip": 3.1015625, + "router_z_loss_mlp": 0.31884766, + "step": 2769, + "time_per_iteration": 2.5749552249908447 + }, + { + "auxiliary_loss_clip": 0.0159671, + "auxiliary_loss_mlp": 0.0016211, + "balance_loss_clip": 1.35810804, + "balance_loss_mlp": 0.14894941, + "epoch": 0.16654140989027508, + "flos": 70134976759680.0, + "grad_norm": 0.842726617912059, + "language_loss": 0.57276988, + "learning_rate": 3.80759593822885e-06, + "loss": 0.59035808, + "num_input_tokens_seen": 59984085, + "router_z_loss_clip": 2.390625, + "router_z_loss_mlp": 0.13183594, + "step": 2770, + "time_per_iteration": 2.9766881465911865 + }, + { + "auxiliary_loss_clip": 0.01583239, + "auxiliary_loss_mlp": 0.00173811, + "balance_loss_clip": 1.34685087, + "balance_loss_mlp": 0.16131769, + "epoch": 0.16660153314294304, + "flos": 70272406195200.0, + "grad_norm": 0.8384171893313379, + "language_loss": 0.56193471, + "learning_rate": 3.807429230178015e-06, + "loss": 0.57950521, + "num_input_tokens_seen": 60043470, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.12451172, + "step": 2771, + "time_per_iteration": 2.9132072925567627 + }, + { + "auxiliary_loss_clip": 0.01552938, + "auxiliary_loss_mlp": 0.00420022, + "balance_loss_clip": 1.24767756, + "balance_loss_mlp": 0.39231783, + "epoch": 0.166661656395611, + "flos": 23075048079360.0, + "grad_norm": 86.22638596179698, + "language_loss": 0.7947619, + "learning_rate": 3.8072624535892817e-06, + "loss": 0.81449157, + "num_input_tokens_seen": 60063045, + "router_z_loss_clip": 3.05273438, + "router_z_loss_mlp": 0.27734375, + "step": 2772, + "time_per_iteration": 2.617250442504883 + }, + { + "auxiliary_loss_clip": 0.01566926, + "auxiliary_loss_mlp": 0.00419063, + "balance_loss_clip": 1.26243353, + "balance_loss_mlp": 0.39225245, + "epoch": 0.16672177964827897, + "flos": 28366341586560.0, + "grad_norm": 481.57444742963145, + "language_loss": 0.92570812, + "learning_rate": 3.807095608468975e-06, + "loss": 0.94556797, + "num_input_tokens_seen": 60081945, + "router_z_loss_clip": 3.04296875, + "router_z_loss_mlp": 0.26806641, + "step": 2773, + "time_per_iteration": 2.7371411323547363 + }, + { + "auxiliary_loss_clip": 0.01570316, + "auxiliary_loss_mlp": 0.00430611, + "balance_loss_clip": 1.26422417, + "balance_loss_mlp": 0.40219134, + "epoch": 0.16678190290094694, + "flos": 19091010147840.0, + "grad_norm": 5.978239333596565, + "language_loss": 0.86509436, + "learning_rate": 3.8069286948234224e-06, + "loss": 0.8851037, + "num_input_tokens_seen": 60096820, + "router_z_loss_clip": 3.0625, + "router_z_loss_mlp": 0.28430176, + "step": 2774, + "time_per_iteration": 2.6032941341400146 + }, + { + "auxiliary_loss_clip": 0.01538553, + "auxiliary_loss_mlp": 0.00424004, + "balance_loss_clip": 1.23546624, + "balance_loss_mlp": 0.3956798, + "epoch": 0.1668420261536149, + "flos": 21799106184960.0, + "grad_norm": 8.832182760926356, + "language_loss": 0.90142351, + "learning_rate": 3.806761712658952e-06, + "loss": 0.92104906, + "num_input_tokens_seen": 60116140, + "router_z_loss_clip": 3.03320312, + "router_z_loss_mlp": 0.28308105, + "step": 2775, + "time_per_iteration": 2.6223034858703613 + }, + { + "auxiliary_loss_clip": 0.01547029, + "auxiliary_loss_mlp": 0.00441895, + "balance_loss_clip": 1.24255562, + "balance_loss_mlp": 0.41115141, + "epoch": 0.16690214940628287, + "flos": 19062533640960.0, + "grad_norm": 3.8027990523989175, + "language_loss": 0.86457282, + "learning_rate": 3.806594661981897e-06, + "loss": 0.884462, + "num_input_tokens_seen": 60134235, + "router_z_loss_clip": 3.046875, + "router_z_loss_mlp": 0.30737305, + "step": 2776, + "time_per_iteration": 2.7541089057922363 + }, + { + "auxiliary_loss_clip": 0.01569447, + "auxiliary_loss_mlp": 0.0038257, + "balance_loss_clip": 1.26347387, + "balance_loss_mlp": 0.35722601, + "epoch": 0.16696227265895086, + "flos": 18588548747520.0, + "grad_norm": 44.109686125714234, + "language_loss": 0.84417802, + "learning_rate": 3.8064275427985906e-06, + "loss": 0.86369824, + "num_input_tokens_seen": 60153275, + "router_z_loss_clip": 3.05664062, + "router_z_loss_mlp": 0.25366211, + "step": 2777, + "time_per_iteration": 2.6745004653930664 + }, + { + "auxiliary_loss_clip": 0.01551495, + "auxiliary_loss_mlp": 0.00400781, + "balance_loss_clip": 1.240134, + "balance_loss_mlp": 0.37357736, + "epoch": 0.16702239591161883, + "flos": 23294139085440.0, + "grad_norm": 2.926498193492357, + "language_loss": 0.91054499, + "learning_rate": 3.806260355115371e-06, + "loss": 0.93006778, + "num_input_tokens_seen": 60173215, + "router_z_loss_clip": 3.11328125, + "router_z_loss_mlp": 0.27246094, + "step": 2778, + "time_per_iteration": 2.645562171936035 + }, + { + "auxiliary_loss_clip": 0.01531061, + "auxiliary_loss_mlp": 0.0037395, + "balance_loss_clip": 1.22234726, + "balance_loss_mlp": 0.34642416, + "epoch": 0.1670825191642868, + "flos": 24425648392320.0, + "grad_norm": 15.960819624421113, + "language_loss": 0.82165581, + "learning_rate": 3.8060930989385778e-06, + "loss": 0.84070593, + "num_input_tokens_seen": 60190515, + "router_z_loss_clip": 3.08789062, + "router_z_loss_mlp": 0.27526855, + "step": 2779, + "time_per_iteration": 2.6379306316375732 + }, + { + "auxiliary_loss_clip": 0.01515334, + "auxiliary_loss_mlp": 0.00316151, + "balance_loss_clip": 1.19882321, + "balance_loss_mlp": 0.28930533, + "epoch": 0.16714264241695476, + "flos": 26797512193920.0, + "grad_norm": 10.668924561281605, + "language_loss": 0.74301565, + "learning_rate": 3.805925774274554e-06, + "loss": 0.76133054, + "num_input_tokens_seen": 60211655, + "router_z_loss_clip": 3.16210938, + "router_z_loss_mlp": 0.26818848, + "step": 2780, + "time_per_iteration": 2.6986050605773926 + }, + { + "auxiliary_loss_clip": 0.01527653, + "auxiliary_loss_mlp": 0.00325159, + "balance_loss_clip": 1.21148992, + "balance_loss_mlp": 0.30024457, + "epoch": 0.16720276566962272, + "flos": 21835304115840.0, + "grad_norm": 2.5041010820845755, + "language_loss": 0.86352217, + "learning_rate": 3.805758381129643e-06, + "loss": 0.88205028, + "num_input_tokens_seen": 60230860, + "router_z_loss_clip": 3.16015625, + "router_z_loss_mlp": 0.24890137, + "step": 2781, + "time_per_iteration": 2.6219475269317627 + }, + { + "auxiliary_loss_clip": 0.01512222, + "auxiliary_loss_mlp": 0.00321511, + "balance_loss_clip": 1.20203257, + "balance_loss_mlp": 0.29575032, + "epoch": 0.1672628889222907, + "flos": 21470415805440.0, + "grad_norm": 23.297196733120938, + "language_loss": 0.80978429, + "learning_rate": 3.805590919510193e-06, + "loss": 0.82812166, + "num_input_tokens_seen": 60250535, + "router_z_loss_clip": 3.1015625, + "router_z_loss_mlp": 0.25769043, + "step": 2782, + "time_per_iteration": 2.6472222805023193 + }, + { + "auxiliary_loss_clip": 0.015128, + "auxiliary_loss_mlp": 0.00309962, + "balance_loss_clip": 1.19345582, + "balance_loss_mlp": 0.28322306, + "epoch": 0.16732301217495865, + "flos": 30774008269440.0, + "grad_norm": 253.98528690377086, + "language_loss": 0.76301354, + "learning_rate": 3.8054233894225547e-06, + "loss": 0.78124118, + "num_input_tokens_seen": 60269530, + "router_z_loss_clip": 3.19335938, + "router_z_loss_mlp": 0.26745605, + "step": 2783, + "time_per_iteration": 2.7067902088165283 + }, + { + "auxiliary_loss_clip": 0.0151243, + "auxiliary_loss_mlp": 0.00295296, + "balance_loss_clip": 1.19831514, + "balance_loss_mlp": 0.26928422, + "epoch": 0.16738313542762664, + "flos": 23474625949440.0, + "grad_norm": 32.268752284769626, + "language_loss": 0.78705752, + "learning_rate": 3.805255790873081e-06, + "loss": 0.80513477, + "num_input_tokens_seen": 60289900, + "router_z_loss_clip": 3.14648438, + "router_z_loss_mlp": 0.2598877, + "step": 2784, + "time_per_iteration": 2.641408681869507 + }, + { + "auxiliary_loss_clip": 0.01503695, + "auxiliary_loss_mlp": 0.00331044, + "balance_loss_clip": 1.18442655, + "balance_loss_mlp": 0.30336326, + "epoch": 0.1674432586802946, + "flos": 29789086366080.0, + "grad_norm": 237.84675070714448, + "language_loss": 0.69798136, + "learning_rate": 3.805088123868126e-06, + "loss": 0.71632874, + "num_input_tokens_seen": 60310025, + "router_z_loss_clip": 3.19140625, + "router_z_loss_mlp": 0.27661133, + "step": 2785, + "time_per_iteration": 2.6646230220794678 + }, + { + "auxiliary_loss_clip": 0.01465874, + "auxiliary_loss_mlp": 0.00085867, + "balance_loss_clip": 1.21767473, + "balance_loss_mlp": 0.07737903, + "epoch": 0.16750338193296258, + "flos": 66136073575680.0, + "grad_norm": 0.7651810220694134, + "language_loss": 0.58054984, + "learning_rate": 3.8049203884140492e-06, + "loss": 0.59606731, + "num_input_tokens_seen": 60377800, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.08496094, + "step": 2786, + "time_per_iteration": 3.1776509284973145 + }, + { + "auxiliary_loss_clip": 0.01500163, + "auxiliary_loss_mlp": 0.0030499, + "balance_loss_clip": 1.18208218, + "balance_loss_mlp": 0.27866888, + "epoch": 0.16756350518563054, + "flos": 25696777864320.0, + "grad_norm": 327.87107599919943, + "language_loss": 0.84408772, + "learning_rate": 3.80475258451721e-06, + "loss": 0.86213923, + "num_input_tokens_seen": 60398215, + "router_z_loss_clip": 3.1796875, + "router_z_loss_mlp": 0.26306152, + "step": 2787, + "time_per_iteration": 2.7246758937835693 + }, + { + "auxiliary_loss_clip": 0.01484567, + "auxiliary_loss_mlp": 0.00279647, + "balance_loss_clip": 1.17085564, + "balance_loss_mlp": 0.25480354, + "epoch": 0.1676236284382985, + "flos": 23836102467840.0, + "grad_norm": 2.2608941604488004, + "language_loss": 0.85842586, + "learning_rate": 3.804584712183972e-06, + "loss": 0.876068, + "num_input_tokens_seen": 60416910, + "router_z_loss_clip": 3.13867188, + "router_z_loss_mlp": 0.24829102, + "step": 2788, + "time_per_iteration": 2.675745725631714 + }, + { + "auxiliary_loss_clip": 0.0150304, + "auxiliary_loss_mlp": 0.00087819, + "balance_loss_clip": 1.26242805, + "balance_loss_mlp": 0.07637501, + "epoch": 0.16768375169096647, + "flos": 59874902985600.0, + "grad_norm": 0.8455351134761456, + "language_loss": 0.59354115, + "learning_rate": 3.8044167714207013e-06, + "loss": 0.60944974, + "num_input_tokens_seen": 60468660, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.11425781, + "step": 2789, + "time_per_iteration": 2.992363691329956 + }, + { + "auxiliary_loss_clip": 0.0149192, + "auxiliary_loss_mlp": 0.00281453, + "balance_loss_clip": 1.17521858, + "balance_loss_mlp": 0.25703883, + "epoch": 0.16774387494363446, + "flos": 38435657207040.0, + "grad_norm": 4.2473990238099955, + "language_loss": 0.75751221, + "learning_rate": 3.804248762233765e-06, + "loss": 0.77524596, + "num_input_tokens_seen": 60492370, + "router_z_loss_clip": 3.16796875, + "router_z_loss_mlp": 0.24401855, + "step": 2790, + "time_per_iteration": 2.816970109939575 + }, + { + "auxiliary_loss_clip": 0.01497149, + "auxiliary_loss_mlp": 0.00281097, + "balance_loss_clip": 1.18015349, + "balance_loss_mlp": 0.25669488, + "epoch": 0.16780399819630243, + "flos": 22637620252800.0, + "grad_norm": 56.11641076431466, + "language_loss": 0.85745847, + "learning_rate": 3.8040806846295356e-06, + "loss": 0.87524092, + "num_input_tokens_seen": 60512655, + "router_z_loss_clip": 3.16601562, + "router_z_loss_mlp": 0.24401855, + "step": 2791, + "time_per_iteration": 2.673689603805542 + }, + { + "auxiliary_loss_clip": 0.0151221, + "auxiliary_loss_mlp": 0.00263106, + "balance_loss_clip": 1.19389415, + "balance_loss_mlp": 0.23814318, + "epoch": 0.1678641214489704, + "flos": 32891516887680.0, + "grad_norm": 14.241501356933936, + "language_loss": 0.80406058, + "learning_rate": 3.8039125386143853e-06, + "loss": 0.82181376, + "num_input_tokens_seen": 60533090, + "router_z_loss_clip": 3.1796875, + "router_z_loss_mlp": 0.24951172, + "step": 2792, + "time_per_iteration": 2.7420248985290527 + }, + { + "auxiliary_loss_clip": 0.01518821, + "auxiliary_loss_mlp": 0.00270292, + "balance_loss_clip": 1.20173311, + "balance_loss_mlp": 0.24620029, + "epoch": 0.16792424470163836, + "flos": 19974916028160.0, + "grad_norm": 26.244529015290727, + "language_loss": 0.80911475, + "learning_rate": 3.803744324194691e-06, + "loss": 0.82700586, + "num_input_tokens_seen": 60553190, + "router_z_loss_clip": 3.171875, + "router_z_loss_mlp": 0.2409668, + "step": 2793, + "time_per_iteration": 2.6261637210845947 + }, + { + "auxiliary_loss_clip": 0.01523985, + "auxiliary_loss_mlp": 0.00274408, + "balance_loss_clip": 1.20541143, + "balance_loss_mlp": 0.2488492, + "epoch": 0.16798436795430632, + "flos": 19719878486400.0, + "grad_norm": 604.4563994635255, + "language_loss": 0.84471512, + "learning_rate": 3.803576041376831e-06, + "loss": 0.86269903, + "num_input_tokens_seen": 60571995, + "router_z_loss_clip": 3.18554688, + "router_z_loss_mlp": 0.25549316, + "step": 2794, + "time_per_iteration": 2.5863535404205322 + }, + { + "auxiliary_loss_clip": 0.01472357, + "auxiliary_loss_mlp": 0.00279302, + "balance_loss_clip": 1.15909767, + "balance_loss_mlp": 0.25377926, + "epoch": 0.1680444912069743, + "flos": 28104839596800.0, + "grad_norm": 8.294355255018363, + "language_loss": 0.80171728, + "learning_rate": 3.803407690167187e-06, + "loss": 0.81923389, + "num_input_tokens_seen": 60591275, + "router_z_loss_clip": 3.13085938, + "router_z_loss_mlp": 0.25524902, + "step": 2795, + "time_per_iteration": 2.6292593479156494 + }, + { + "auxiliary_loss_clip": 0.01504711, + "auxiliary_loss_mlp": 0.00252945, + "balance_loss_clip": 1.18623018, + "balance_loss_mlp": 0.22879352, + "epoch": 0.16810461445964225, + "flos": 18075205526400.0, + "grad_norm": 149.06251058494846, + "language_loss": 0.91055453, + "learning_rate": 3.803239270572142e-06, + "loss": 0.9281311, + "num_input_tokens_seen": 60609235, + "router_z_loss_clip": 3.18359375, + "router_z_loss_mlp": 0.24169922, + "step": 2796, + "time_per_iteration": 2.5971531867980957 + }, + { + "auxiliary_loss_clip": 0.01525732, + "auxiliary_loss_mlp": 0.00291347, + "balance_loss_clip": 1.20983219, + "balance_loss_mlp": 0.26611087, + "epoch": 0.16816473771231025, + "flos": 23878657105920.0, + "grad_norm": 1191.2003497130668, + "language_loss": 0.88844228, + "learning_rate": 3.8030707825980838e-06, + "loss": 0.90661305, + "num_input_tokens_seen": 60629880, + "router_z_loss_clip": 3.16210938, + "router_z_loss_mlp": 0.25231934, + "step": 2797, + "time_per_iteration": 2.6292848587036133 + }, + { + "auxiliary_loss_clip": 0.01573033, + "auxiliary_loss_mlp": 0.00310382, + "balance_loss_clip": 1.26075554, + "balance_loss_mlp": 0.28679004, + "epoch": 0.1682248609649782, + "flos": 22783597125120.0, + "grad_norm": 8.040820399661333, + "language_loss": 0.80318642, + "learning_rate": 3.802902226251401e-06, + "loss": 0.82202065, + "num_input_tokens_seen": 60651175, + "router_z_loss_clip": 3.12304688, + "router_z_loss_mlp": 0.23620605, + "step": 2798, + "time_per_iteration": 2.697493553161621 + }, + { + "auxiliary_loss_clip": 0.01575002, + "auxiliary_loss_mlp": 0.00310083, + "balance_loss_clip": 1.2593956, + "balance_loss_mlp": 0.28507242, + "epoch": 0.16828498421764618, + "flos": 20705123612160.0, + "grad_norm": 26.91652603900832, + "language_loss": 0.86144269, + "learning_rate": 3.8027336015384845e-06, + "loss": 0.88029361, + "num_input_tokens_seen": 60670210, + "router_z_loss_clip": 3.15429688, + "router_z_loss_mlp": 0.25036621, + "step": 2799, + "time_per_iteration": 2.6669394969940186 + }, + { + "auxiliary_loss_clip": 0.01576658, + "auxiliary_loss_mlp": 0.00314405, + "balance_loss_clip": 1.26029146, + "balance_loss_mlp": 0.28637904, + "epoch": 0.16834510747031414, + "flos": 29420606695680.0, + "grad_norm": 31.1364554509091, + "language_loss": 0.77877808, + "learning_rate": 3.8025649084657296e-06, + "loss": 0.79768872, + "num_input_tokens_seen": 60690895, + "router_z_loss_clip": 3.16210938, + "router_z_loss_mlp": 0.28039551, + "step": 2800, + "time_per_iteration": 2.7169628143310547 + }, + { + "auxiliary_loss_clip": 0.0158241, + "auxiliary_loss_mlp": 0.00324991, + "balance_loss_clip": 1.27189386, + "balance_loss_mlp": 0.29641628, + "epoch": 0.1684052307229821, + "flos": 18145374744960.0, + "grad_norm": 35.31192006779947, + "language_loss": 0.9041118, + "learning_rate": 3.8023961470395326e-06, + "loss": 0.92318583, + "num_input_tokens_seen": 60708280, + "router_z_loss_clip": 3.10351562, + "router_z_loss_mlp": 0.28564453, + "step": 2801, + "time_per_iteration": 2.6092381477355957 + }, + { + "auxiliary_loss_clip": 0.01614615, + "auxiliary_loss_mlp": 0.00335256, + "balance_loss_clip": 1.29622948, + "balance_loss_mlp": 0.308505, + "epoch": 0.16846535397565007, + "flos": 16574929240320.0, + "grad_norm": 181.47337450471304, + "language_loss": 0.93160427, + "learning_rate": 3.8022273172662933e-06, + "loss": 0.95110297, + "num_input_tokens_seen": 60724150, + "router_z_loss_clip": 3.18359375, + "router_z_loss_mlp": 0.2677002, + "step": 2802, + "time_per_iteration": 2.662832498550415 + }, + { + "auxiliary_loss_clip": 0.0163436, + "auxiliary_loss_mlp": 0.0034435, + "balance_loss_clip": 1.31581306, + "balance_loss_mlp": 0.31696722, + "epoch": 0.16852547722831807, + "flos": 30408868563840.0, + "grad_norm": 7.681828039676108, + "language_loss": 0.88171291, + "learning_rate": 3.802058419152413e-06, + "loss": 0.90149999, + "num_input_tokens_seen": 60746485, + "router_z_loss_clip": 3.18945312, + "router_z_loss_mlp": 0.27368164, + "step": 2803, + "time_per_iteration": 2.7021379470825195 + }, + { + "auxiliary_loss_clip": 0.0162498, + "auxiliary_loss_mlp": 0.00336877, + "balance_loss_clip": 1.30701327, + "balance_loss_mlp": 0.30931574, + "epoch": 0.16858560048098603, + "flos": 33507420416640.0, + "grad_norm": 32.47858904236908, + "language_loss": 0.85075468, + "learning_rate": 3.801889452704297e-06, + "loss": 0.87037331, + "num_input_tokens_seen": 60762875, + "router_z_loss_clip": 3.17382812, + "router_z_loss_mlp": 0.27539062, + "step": 2804, + "time_per_iteration": 4.215130090713501 + }, + { + "auxiliary_loss_clip": 0.01728264, + "auxiliary_loss_mlp": 0.00133016, + "balance_loss_clip": 1.47415257, + "balance_loss_mlp": 0.12157159, + "epoch": 0.168645723733654, + "flos": 67370502326400.0, + "grad_norm": 0.8460022134653995, + "language_loss": 0.55344337, + "learning_rate": 3.8017204179283526e-06, + "loss": 0.57205617, + "num_input_tokens_seen": 60825510, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.11425781, + "step": 2805, + "time_per_iteration": 3.1155099868774414 + }, + { + "auxiliary_loss_clip": 0.01660068, + "auxiliary_loss_mlp": 0.00404778, + "balance_loss_clip": 1.33746064, + "balance_loss_mlp": 0.37424862, + "epoch": 0.16870584698632196, + "flos": 21324618501120.0, + "grad_norm": 216.68695019328382, + "language_loss": 0.77917558, + "learning_rate": 3.8015513148309892e-06, + "loss": 0.79982412, + "num_input_tokens_seen": 60844440, + "router_z_loss_clip": 3.2265625, + "router_z_loss_mlp": 0.30529785, + "step": 2806, + "time_per_iteration": 2.616473436355591 + }, + { + "auxiliary_loss_clip": 0.01634981, + "auxiliary_loss_mlp": 0.00363363, + "balance_loss_clip": 1.31464696, + "balance_loss_mlp": 0.3341094, + "epoch": 0.16876597023898993, + "flos": 20740746925440.0, + "grad_norm": 2.3559913442754365, + "language_loss": 0.76223707, + "learning_rate": 3.80138214341862e-06, + "loss": 0.78222048, + "num_input_tokens_seen": 60863210, + "router_z_loss_clip": 3.20507812, + "router_z_loss_mlp": 0.29248047, + "step": 2807, + "time_per_iteration": 4.067338228225708 + }, + { + "auxiliary_loss_clip": 0.01655668, + "auxiliary_loss_mlp": 0.00390862, + "balance_loss_clip": 1.33546805, + "balance_loss_mlp": 0.36008215, + "epoch": 0.1688260934916579, + "flos": 20303498666880.0, + "grad_norm": 6.406218688776993, + "language_loss": 0.79034853, + "learning_rate": 3.8012129036976587e-06, + "loss": 0.8108139, + "num_input_tokens_seen": 60882510, + "router_z_loss_clip": 3.19921875, + "router_z_loss_mlp": 0.30761719, + "step": 2808, + "time_per_iteration": 2.625635862350464 + }, + { + "auxiliary_loss_clip": 0.01670111, + "auxiliary_loss_mlp": 0.00385115, + "balance_loss_clip": 1.34275007, + "balance_loss_mlp": 0.35278583, + "epoch": 0.16888621674432586, + "flos": 20340702178560.0, + "grad_norm": 90.9861267908468, + "language_loss": 0.89700931, + "learning_rate": 3.8010435956745236e-06, + "loss": 0.91756153, + "num_input_tokens_seen": 60901105, + "router_z_loss_clip": 3.2734375, + "router_z_loss_mlp": 0.32324219, + "step": 2809, + "time_per_iteration": 2.5992331504821777 + }, + { + "auxiliary_loss_clip": 0.01675969, + "auxiliary_loss_mlp": 0.00402916, + "balance_loss_clip": 1.35013485, + "balance_loss_mlp": 0.37223184, + "epoch": 0.16894633999699385, + "flos": 16244802316800.0, + "grad_norm": 7.5278243519433925, + "language_loss": 0.95603281, + "learning_rate": 3.8008742193556358e-06, + "loss": 0.97682166, + "num_input_tokens_seen": 60915340, + "router_z_loss_clip": 3.26171875, + "router_z_loss_mlp": 0.30688477, + "step": 2810, + "time_per_iteration": 3.968993902206421 + }, + { + "auxiliary_loss_clip": 0.01660066, + "auxiliary_loss_mlp": 0.00402561, + "balance_loss_clip": 1.3315798, + "balance_loss_mlp": 0.37027878, + "epoch": 0.16900646324966181, + "flos": 19610171372160.0, + "grad_norm": 19.405482780028084, + "language_loss": 0.99006462, + "learning_rate": 3.800704774747416e-06, + "loss": 1.01069093, + "num_input_tokens_seen": 60933735, + "router_z_loss_clip": 3.2890625, + "router_z_loss_mlp": 0.32348633, + "step": 2811, + "time_per_iteration": 2.5867760181427 + }, + { + "auxiliary_loss_clip": 0.01647623, + "auxiliary_loss_mlp": 0.00372266, + "balance_loss_clip": 1.32019138, + "balance_loss_mlp": 0.34127158, + "epoch": 0.16906658650232978, + "flos": 22018089450240.0, + "grad_norm": 65.2263568623519, + "language_loss": 0.86907345, + "learning_rate": 3.800535261856291e-06, + "loss": 0.88927233, + "num_input_tokens_seen": 60953105, + "router_z_loss_clip": 3.2734375, + "router_z_loss_mlp": 0.30957031, + "step": 2812, + "time_per_iteration": 2.5909907817840576 + }, + { + "auxiliary_loss_clip": 0.01667008, + "auxiliary_loss_mlp": 0.00381654, + "balance_loss_clip": 1.33975077, + "balance_loss_mlp": 0.35251892, + "epoch": 0.16912670975499774, + "flos": 11763690024960.0, + "grad_norm": 16.548029630438844, + "language_loss": 0.83302999, + "learning_rate": 3.8003656806886887e-06, + "loss": 0.85351658, + "num_input_tokens_seen": 60969150, + "router_z_loss_clip": 3.2734375, + "router_z_loss_mlp": 0.29150391, + "step": 2813, + "time_per_iteration": 2.5343430042266846 + }, + { + "auxiliary_loss_clip": 0.01659957, + "auxiliary_loss_mlp": 0.00396537, + "balance_loss_clip": 1.33452737, + "balance_loss_mlp": 0.364088, + "epoch": 0.1691868330076657, + "flos": 17161386595200.0, + "grad_norm": 131.7533502246331, + "language_loss": 0.77569634, + "learning_rate": 3.8001960312510396e-06, + "loss": 0.79626125, + "num_input_tokens_seen": 60982825, + "router_z_loss_clip": 3.25585938, + "router_z_loss_mlp": 0.32470703, + "step": 2814, + "time_per_iteration": 2.567167282104492 + }, + { + "auxiliary_loss_clip": 0.01675933, + "auxiliary_loss_mlp": 0.00407834, + "balance_loss_clip": 1.34716904, + "balance_loss_mlp": 0.37679175, + "epoch": 0.16924695626033368, + "flos": 22416553998720.0, + "grad_norm": 2.2792626266000338, + "language_loss": 0.68135762, + "learning_rate": 3.800026313549776e-06, + "loss": 0.70219529, + "num_input_tokens_seen": 61000875, + "router_z_loss_clip": 3.2890625, + "router_z_loss_mlp": 0.31054688, + "step": 2815, + "time_per_iteration": 2.644531011581421 + }, + { + "auxiliary_loss_clip": 0.01672263, + "auxiliary_loss_mlp": 0.00403161, + "balance_loss_clip": 1.34141111, + "balance_loss_mlp": 0.37116539, + "epoch": 0.16930707951300164, + "flos": 25739655724800.0, + "grad_norm": 1.7443031059889547, + "language_loss": 0.88414025, + "learning_rate": 3.7998565275913342e-06, + "loss": 0.90489453, + "num_input_tokens_seen": 61021940, + "router_z_loss_clip": 3.30664062, + "router_z_loss_mlp": 0.31982422, + "step": 2816, + "time_per_iteration": 2.611846446990967 + }, + { + "auxiliary_loss_clip": 0.01677454, + "auxiliary_loss_mlp": 0.00419093, + "balance_loss_clip": 1.34849858, + "balance_loss_mlp": 0.384451, + "epoch": 0.16936720276566963, + "flos": 22747040058240.0, + "grad_norm": 47.46971541708254, + "language_loss": 0.96680909, + "learning_rate": 3.799686673382153e-06, + "loss": 0.98777455, + "num_input_tokens_seen": 61040285, + "router_z_loss_clip": 3.28710938, + "router_z_loss_mlp": 0.34643555, + "step": 2817, + "time_per_iteration": 2.584468126296997 + }, + { + "auxiliary_loss_clip": 0.01649988, + "auxiliary_loss_mlp": 0.00384509, + "balance_loss_clip": 1.32484341, + "balance_loss_mlp": 0.35253704, + "epoch": 0.1694273260183376, + "flos": 19573973441280.0, + "grad_norm": 17.06018620342636, + "language_loss": 0.86834371, + "learning_rate": 3.799516750928672e-06, + "loss": 0.88868868, + "num_input_tokens_seen": 61059020, + "router_z_loss_clip": 3.25390625, + "router_z_loss_mlp": 0.31982422, + "step": 2818, + "time_per_iteration": 2.583502769470215 + }, + { + "auxiliary_loss_clip": 0.01659341, + "auxiliary_loss_mlp": 0.00408215, + "balance_loss_clip": 1.33772683, + "balance_loss_mlp": 0.37526602, + "epoch": 0.16948744927100556, + "flos": 12457843332480.0, + "grad_norm": 44.55527737448798, + "language_loss": 0.89862347, + "learning_rate": 3.799346760237336e-06, + "loss": 0.91929907, + "num_input_tokens_seen": 61074245, + "router_z_loss_clip": 3.21875, + "router_z_loss_mlp": 0.3293457, + "step": 2819, + "time_per_iteration": 2.5876431465148926 + }, + { + "auxiliary_loss_clip": 0.01715909, + "auxiliary_loss_mlp": 0.00078152, + "balance_loss_clip": 1.45182276, + "balance_loss_mlp": 0.06918719, + "epoch": 0.16954757252367353, + "flos": 71291694435840.0, + "grad_norm": 0.9265422276775296, + "language_loss": 0.60369706, + "learning_rate": 3.7991767013145902e-06, + "loss": 0.6216377, + "num_input_tokens_seen": 61127080, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.08984375, + "step": 2820, + "time_per_iteration": 3.0801053047180176 + }, + { + "auxiliary_loss_clip": 0.01656389, + "auxiliary_loss_mlp": 0.00387422, + "balance_loss_clip": 1.33244133, + "balance_loss_mlp": 0.35421038, + "epoch": 0.1696076957763415, + "flos": 29606516513280.0, + "grad_norm": 554.7585460850772, + "language_loss": 0.86952692, + "learning_rate": 3.7990065741668844e-06, + "loss": 0.88996506, + "num_input_tokens_seen": 61146955, + "router_z_loss_clip": 3.23828125, + "router_z_loss_mlp": 0.33203125, + "step": 2821, + "time_per_iteration": 2.754221200942993 + }, + { + "auxiliary_loss_clip": 0.01668593, + "auxiliary_loss_mlp": 0.00394684, + "balance_loss_clip": 1.34350967, + "balance_loss_mlp": 0.35925505, + "epoch": 0.16966781902900946, + "flos": 24388588535040.0, + "grad_norm": 54.604614263795426, + "language_loss": 0.86286175, + "learning_rate": 3.7988363788006685e-06, + "loss": 0.88349456, + "num_input_tokens_seen": 61166605, + "router_z_loss_clip": 3.25195312, + "router_z_loss_mlp": 0.35473633, + "step": 2822, + "time_per_iteration": 2.6833648681640625 + }, + { + "auxiliary_loss_clip": 0.01655116, + "auxiliary_loss_mlp": 0.00424732, + "balance_loss_clip": 1.33501482, + "balance_loss_mlp": 0.39316529, + "epoch": 0.16972794228167745, + "flos": 23038814234880.0, + "grad_norm": 56.78750161072761, + "language_loss": 0.81057334, + "learning_rate": 3.7986661152223967e-06, + "loss": 0.83137178, + "num_input_tokens_seen": 61186535, + "router_z_loss_clip": 3.203125, + "router_z_loss_mlp": 0.31591797, + "step": 2823, + "time_per_iteration": 2.6734437942504883 + }, + { + "auxiliary_loss_clip": 0.01663379, + "auxiliary_loss_mlp": 0.00411163, + "balance_loss_clip": 1.34112644, + "balance_loss_mlp": 0.37737906, + "epoch": 0.16978806553434542, + "flos": 35228691129600.0, + "grad_norm": 5.211912583678025, + "language_loss": 0.64940435, + "learning_rate": 3.7984957834385257e-06, + "loss": 0.6701498, + "num_input_tokens_seen": 61208965, + "router_z_loss_clip": 3.2265625, + "router_z_loss_mlp": 0.33837891, + "step": 2824, + "time_per_iteration": 2.7787842750549316 + }, + { + "auxiliary_loss_clip": 0.01646402, + "auxiliary_loss_mlp": 0.00398883, + "balance_loss_clip": 1.32455254, + "balance_loss_mlp": 0.36564767, + "epoch": 0.16984818878701338, + "flos": 32014290936960.0, + "grad_norm": 18.740832071887603, + "language_loss": 0.8000986, + "learning_rate": 3.7983253834555144e-06, + "loss": 0.82055146, + "num_input_tokens_seen": 61230670, + "router_z_loss_clip": 3.22070312, + "router_z_loss_mlp": 0.33276367, + "step": 2825, + "time_per_iteration": 2.7301807403564453 + }, + { + "auxiliary_loss_clip": 0.01630732, + "auxiliary_loss_mlp": 0.00475312, + "balance_loss_clip": 1.31052089, + "balance_loss_mlp": 0.43847629, + "epoch": 0.16990831203968135, + "flos": 22818609907200.0, + "grad_norm": 24.04716773255392, + "language_loss": 0.93173462, + "learning_rate": 3.7981549152798245e-06, + "loss": 0.95279509, + "num_input_tokens_seen": 61249510, + "router_z_loss_clip": 3.19921875, + "router_z_loss_mlp": 0.3684082, + "step": 2826, + "time_per_iteration": 2.6366043090820312 + }, + { + "auxiliary_loss_clip": 0.01636179, + "auxiliary_loss_mlp": 0.00427333, + "balance_loss_clip": 1.31419909, + "balance_loss_mlp": 0.39414543, + "epoch": 0.1699684352923493, + "flos": 23039604334080.0, + "grad_norm": 2.3689432902324294, + "language_loss": 0.87891853, + "learning_rate": 3.7979843789179196e-06, + "loss": 0.89955372, + "num_input_tokens_seen": 61269440, + "router_z_loss_clip": 3.22070312, + "router_z_loss_mlp": 0.33203125, + "step": 2827, + "time_per_iteration": 2.6390578746795654 + }, + { + "auxiliary_loss_clip": 0.01643637, + "auxiliary_loss_mlp": 0.00435449, + "balance_loss_clip": 1.3171525, + "balance_loss_mlp": 0.39904234, + "epoch": 0.17002855854501728, + "flos": 21434110133760.0, + "grad_norm": 4.010884877575574, + "language_loss": 0.80720967, + "learning_rate": 3.797813774376267e-06, + "loss": 0.82800055, + "num_input_tokens_seen": 61288195, + "router_z_loss_clip": 3.26367188, + "router_z_loss_mlp": 0.36376953, + "step": 2828, + "time_per_iteration": 2.6938681602478027 + }, + { + "auxiliary_loss_clip": 0.01764534, + "auxiliary_loss_mlp": 0.00078511, + "balance_loss_clip": 1.51312447, + "balance_loss_mlp": 0.0680206, + "epoch": 0.17008868179768524, + "flos": 71453509205760.0, + "grad_norm": 0.781746573654529, + "language_loss": 0.56404388, + "learning_rate": 3.797643101661336e-06, + "loss": 0.58247435, + "num_input_tokens_seen": 61350850, + "router_z_loss_clip": 2.515625, + "router_z_loss_mlp": 0.10498047, + "step": 2829, + "time_per_iteration": 3.189039468765259 + }, + { + "auxiliary_loss_clip": 0.01654784, + "auxiliary_loss_mlp": 0.00441777, + "balance_loss_clip": 1.33113551, + "balance_loss_mlp": 0.40889949, + "epoch": 0.17014880505035324, + "flos": 24900315644160.0, + "grad_norm": 65.75571949180538, + "language_loss": 0.90996832, + "learning_rate": 3.7974723607795983e-06, + "loss": 0.93093395, + "num_input_tokens_seen": 61370765, + "router_z_loss_clip": 3.24023438, + "router_z_loss_mlp": 0.32885742, + "step": 2830, + "time_per_iteration": 2.6856963634490967 + }, + { + "auxiliary_loss_clip": 0.01676789, + "auxiliary_loss_mlp": 0.00463686, + "balance_loss_clip": 1.34546256, + "balance_loss_mlp": 0.42646927, + "epoch": 0.1702089283030212, + "flos": 29862415981440.0, + "grad_norm": 13.277333253177126, + "language_loss": 0.85197097, + "learning_rate": 3.797301551737529e-06, + "loss": 0.87337571, + "num_input_tokens_seen": 61388935, + "router_z_loss_clip": 3.31445312, + "router_z_loss_mlp": 0.37207031, + "step": 2831, + "time_per_iteration": 2.6245081424713135 + }, + { + "auxiliary_loss_clip": 0.01641537, + "auxiliary_loss_mlp": 0.0045993, + "balance_loss_clip": 1.3210541, + "balance_loss_mlp": 0.42397696, + "epoch": 0.17026905155568917, + "flos": 17744180762880.0, + "grad_norm": 13.562530573707724, + "language_loss": 0.85494024, + "learning_rate": 3.7971306745416044e-06, + "loss": 0.87595499, + "num_input_tokens_seen": 61407350, + "router_z_loss_clip": 3.20703125, + "router_z_loss_mlp": 0.359375, + "step": 2832, + "time_per_iteration": 2.627206802368164 + }, + { + "auxiliary_loss_clip": 0.01652525, + "auxiliary_loss_mlp": 0.00464651, + "balance_loss_clip": 1.32758057, + "balance_loss_mlp": 0.42860177, + "epoch": 0.17032917480835713, + "flos": 23148665003520.0, + "grad_norm": 8.35840670056133, + "language_loss": 0.94780076, + "learning_rate": 3.7969597291983046e-06, + "loss": 0.96897256, + "num_input_tokens_seen": 61429010, + "router_z_loss_clip": 3.24804688, + "router_z_loss_mlp": 0.3605957, + "step": 2833, + "time_per_iteration": 2.584639310836792 + }, + { + "auxiliary_loss_clip": 0.01656563, + "auxiliary_loss_mlp": 0.00399127, + "balance_loss_clip": 1.33496213, + "balance_loss_mlp": 0.36410373, + "epoch": 0.1703892980610251, + "flos": 39202565512320.0, + "grad_norm": 8.038519835931204, + "language_loss": 0.80757254, + "learning_rate": 3.7967887157141115e-06, + "loss": 0.82812947, + "num_input_tokens_seen": 61450040, + "router_z_loss_clip": 3.21679688, + "router_z_loss_mlp": 0.35009766, + "step": 2834, + "time_per_iteration": 2.823976516723633 + }, + { + "auxiliary_loss_clip": 0.01653492, + "auxiliary_loss_mlp": 0.00398106, + "balance_loss_clip": 1.32546937, + "balance_loss_mlp": 0.36441779, + "epoch": 0.17044942131369306, + "flos": 23039101543680.0, + "grad_norm": 5.063417193159619, + "language_loss": 0.93216622, + "learning_rate": 3.7966176340955106e-06, + "loss": 0.95268214, + "num_input_tokens_seen": 61468585, + "router_z_loss_clip": 3.27929688, + "router_z_loss_mlp": 0.33691406, + "step": 2835, + "time_per_iteration": 2.684882164001465 + }, + { + "auxiliary_loss_clip": 0.01629655, + "auxiliary_loss_mlp": 0.00446716, + "balance_loss_clip": 1.29893351, + "balance_loss_mlp": 0.40706676, + "epoch": 0.17050954456636103, + "flos": 17054983532160.0, + "grad_norm": 1987.830848429452, + "language_loss": 0.84194738, + "learning_rate": 3.796446484348989e-06, + "loss": 0.86271107, + "num_input_tokens_seen": 61486330, + "router_z_loss_clip": 3.3046875, + "router_z_loss_mlp": 0.39648438, + "step": 2836, + "time_per_iteration": 2.635378122329712 + }, + { + "auxiliary_loss_clip": 0.01640859, + "auxiliary_loss_mlp": 0.0047696, + "balance_loss_clip": 1.31160736, + "balance_loss_mlp": 0.44134012, + "epoch": 0.17056966781902902, + "flos": 16836969934080.0, + "grad_norm": 9.71112704236635, + "language_loss": 0.88252681, + "learning_rate": 3.796275266481036e-06, + "loss": 0.903705, + "num_input_tokens_seen": 61503950, + "router_z_loss_clip": 3.29296875, + "router_z_loss_mlp": 0.35620117, + "step": 2837, + "time_per_iteration": 2.606732130050659 + }, + { + "auxiliary_loss_clip": 0.01626567, + "auxiliary_loss_mlp": 0.00378911, + "balance_loss_clip": 1.30357409, + "balance_loss_mlp": 0.34620047, + "epoch": 0.17062979107169698, + "flos": 17712543859200.0, + "grad_norm": 12.361487907693673, + "language_loss": 0.90404415, + "learning_rate": 3.7961039804981456e-06, + "loss": 0.92409891, + "num_input_tokens_seen": 61523550, + "router_z_loss_clip": 3.23242188, + "router_z_loss_mlp": 0.32714844, + "step": 2838, + "time_per_iteration": 2.630979299545288 + }, + { + "auxiliary_loss_clip": 0.01642016, + "auxiliary_loss_mlp": 0.00414983, + "balance_loss_clip": 1.31354284, + "balance_loss_mlp": 0.3816998, + "epoch": 0.17068991432436495, + "flos": 22525040050560.0, + "grad_norm": 13.836299511764548, + "language_loss": 0.99272883, + "learning_rate": 3.795932626406812e-06, + "loss": 1.01329887, + "num_input_tokens_seen": 61542720, + "router_z_loss_clip": 3.28320312, + "router_z_loss_mlp": 0.33300781, + "step": 2839, + "time_per_iteration": 2.5943171977996826 + }, + { + "auxiliary_loss_clip": 0.01638842, + "auxiliary_loss_mlp": 0.0040927, + "balance_loss_clip": 1.31083298, + "balance_loss_mlp": 0.37407941, + "epoch": 0.17075003757703291, + "flos": 25882939077120.0, + "grad_norm": 3.833333150688125, + "language_loss": 0.89484662, + "learning_rate": 3.7957612042135336e-06, + "loss": 0.91532779, + "num_input_tokens_seen": 61563040, + "router_z_loss_clip": 3.28125, + "router_z_loss_mlp": 0.35205078, + "step": 2840, + "time_per_iteration": 2.631910800933838 + }, + { + "auxiliary_loss_clip": 0.01628894, + "auxiliary_loss_mlp": 0.00369787, + "balance_loss_clip": 1.29778123, + "balance_loss_mlp": 0.33628887, + "epoch": 0.17081016082970088, + "flos": 20120713332480.0, + "grad_norm": 81.37501012014637, + "language_loss": 0.82781237, + "learning_rate": 3.79558971392481e-06, + "loss": 0.84779918, + "num_input_tokens_seen": 61581890, + "router_z_loss_clip": 3.31054688, + "router_z_loss_mlp": 0.3347168, + "step": 2841, + "time_per_iteration": 2.61309814453125 + }, + { + "auxiliary_loss_clip": 0.01623514, + "auxiliary_loss_mlp": 0.00356261, + "balance_loss_clip": 1.29632163, + "balance_loss_mlp": 0.32612506, + "epoch": 0.17087028408236885, + "flos": 24936477661440.0, + "grad_norm": 20.446512603463738, + "language_loss": 0.83108056, + "learning_rate": 3.7954181555471443e-06, + "loss": 0.8508783, + "num_input_tokens_seen": 61602095, + "router_z_loss_clip": 3.27148438, + "router_z_loss_mlp": 0.30175781, + "step": 2842, + "time_per_iteration": 2.7079126834869385 + }, + { + "auxiliary_loss_clip": 0.0164326, + "auxiliary_loss_mlp": 0.00373944, + "balance_loss_clip": 1.31246006, + "balance_loss_mlp": 0.34190035, + "epoch": 0.17093040733503684, + "flos": 19057864872960.0, + "grad_norm": 9.800974212912745, + "language_loss": 0.91800606, + "learning_rate": 3.795246529087043e-06, + "loss": 0.93817818, + "num_input_tokens_seen": 61620400, + "router_z_loss_clip": 3.30859375, + "router_z_loss_mlp": 0.3203125, + "step": 2843, + "time_per_iteration": 2.6590230464935303 + }, + { + "auxiliary_loss_clip": 0.01643642, + "auxiliary_loss_mlp": 0.00375031, + "balance_loss_clip": 1.31347764, + "balance_loss_mlp": 0.34227204, + "epoch": 0.1709905305877048, + "flos": 13078954333440.0, + "grad_norm": 24.490727227864625, + "language_loss": 0.76118588, + "learning_rate": 3.7950748345510126e-06, + "loss": 0.78137261, + "num_input_tokens_seen": 61637680, + "router_z_loss_clip": 3.3046875, + "router_z_loss_mlp": 0.32739258, + "step": 2844, + "time_per_iteration": 2.6607186794281006 + }, + { + "auxiliary_loss_clip": 0.01630198, + "auxiliary_loss_mlp": 0.00370138, + "balance_loss_clip": 1.29966712, + "balance_loss_mlp": 0.33635366, + "epoch": 0.17105065384037277, + "flos": 19209336526080.0, + "grad_norm": 186.08842537310363, + "language_loss": 0.84722114, + "learning_rate": 3.7949030719455646e-06, + "loss": 0.86722445, + "num_input_tokens_seen": 61655630, + "router_z_loss_clip": 3.3046875, + "router_z_loss_mlp": 0.33789062, + "step": 2845, + "time_per_iteration": 2.629296064376831 + }, + { + "auxiliary_loss_clip": 0.01614087, + "auxiliary_loss_mlp": 0.00349518, + "balance_loss_clip": 1.28028154, + "balance_loss_mlp": 0.31592497, + "epoch": 0.17111077709304073, + "flos": 18515183218560.0, + "grad_norm": 13.082785880506599, + "language_loss": 0.86483365, + "learning_rate": 3.7947312412772127e-06, + "loss": 0.88446969, + "num_input_tokens_seen": 61673475, + "router_z_loss_clip": 3.3359375, + "router_z_loss_mlp": 0.33618164, + "step": 2846, + "time_per_iteration": 3.971649646759033 + }, + { + "auxiliary_loss_clip": 0.01630911, + "auxiliary_loss_mlp": 0.00329462, + "balance_loss_clip": 1.29876733, + "balance_loss_mlp": 0.29648834, + "epoch": 0.1711709003457087, + "flos": 25082670015360.0, + "grad_norm": 400.72037222133144, + "language_loss": 0.85869235, + "learning_rate": 3.794559342552472e-06, + "loss": 0.87829608, + "num_input_tokens_seen": 61693370, + "router_z_loss_clip": 3.32226562, + "router_z_loss_mlp": 0.32958984, + "step": 2847, + "time_per_iteration": 2.633068323135376 + }, + { + "auxiliary_loss_clip": 0.0162445, + "auxiliary_loss_mlp": 0.00387494, + "balance_loss_clip": 1.28304482, + "balance_loss_mlp": 0.35120708, + "epoch": 0.17123102359837666, + "flos": 17566387418880.0, + "grad_norm": 15.425640227217771, + "language_loss": 0.94748604, + "learning_rate": 3.7943873757778614e-06, + "loss": 0.96760553, + "num_input_tokens_seen": 61710820, + "router_z_loss_clip": 3.41015625, + "router_z_loss_mlp": 0.36303711, + "step": 2848, + "time_per_iteration": 2.626005172729492 + }, + { + "auxiliary_loss_clip": 0.01600459, + "auxiliary_loss_mlp": 0.0039117, + "balance_loss_clip": 1.26286459, + "balance_loss_mlp": 0.35776731, + "epoch": 0.17129114685104463, + "flos": 26173635845760.0, + "grad_norm": 10.1262811475921, + "language_loss": 0.81883603, + "learning_rate": 3.794215340959902e-06, + "loss": 0.83875227, + "num_input_tokens_seen": 61729855, + "router_z_loss_clip": 3.37890625, + "router_z_loss_mlp": 0.33422852, + "step": 2849, + "time_per_iteration": 2.6590452194213867 + }, + { + "auxiliary_loss_clip": 0.01572823, + "auxiliary_loss_mlp": 0.00082247, + "balance_loss_clip": 1.33486319, + "balance_loss_mlp": 0.07032644, + "epoch": 0.17135127010371262, + "flos": 69269710037760.0, + "grad_norm": 0.9335737949777978, + "language_loss": 0.57459354, + "learning_rate": 3.7940432381051163e-06, + "loss": 0.59114426, + "num_input_tokens_seen": 61790290, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.11914062, + "step": 2850, + "time_per_iteration": 4.603308916091919 + }, + { + "auxiliary_loss_clip": 0.01612907, + "auxiliary_loss_mlp": 0.00370566, + "balance_loss_clip": 1.27980733, + "balance_loss_mlp": 0.337044, + "epoch": 0.1714113933563806, + "flos": 23550110380800.0, + "grad_norm": 28.298466022075466, + "language_loss": 0.86533904, + "learning_rate": 3.793871067220031e-06, + "loss": 0.8851738, + "num_input_tokens_seen": 61809265, + "router_z_loss_clip": 3.33203125, + "router_z_loss_mlp": 0.33520508, + "step": 2851, + "time_per_iteration": 2.5969908237457275 + }, + { + "auxiliary_loss_clip": 0.01615864, + "auxiliary_loss_mlp": 0.00368225, + "balance_loss_clip": 1.27923226, + "balance_loss_mlp": 0.33379769, + "epoch": 0.17147151660904855, + "flos": 21142443697920.0, + "grad_norm": 54.052901534375486, + "language_loss": 1.01103592, + "learning_rate": 3.7936988283111764e-06, + "loss": 1.03087687, + "num_input_tokens_seen": 61828980, + "router_z_loss_clip": 3.3671875, + "router_z_loss_mlp": 0.34423828, + "step": 2852, + "time_per_iteration": 4.044406414031982 + }, + { + "auxiliary_loss_clip": 0.01618915, + "auxiliary_loss_mlp": 0.00435226, + "balance_loss_clip": 1.27672994, + "balance_loss_mlp": 0.39536253, + "epoch": 0.17153163986171652, + "flos": 18624890332800.0, + "grad_norm": 2.1055841302276557, + "language_loss": 0.7540015, + "learning_rate": 3.7935265213850817e-06, + "loss": 0.77454293, + "num_input_tokens_seen": 61847915, + "router_z_loss_clip": 3.42382812, + "router_z_loss_mlp": 0.39892578, + "step": 2853, + "time_per_iteration": 2.6490702629089355 + }, + { + "auxiliary_loss_clip": 0.01613396, + "auxiliary_loss_mlp": 0.00403401, + "balance_loss_clip": 1.27666473, + "balance_loss_mlp": 0.36792457, + "epoch": 0.17159176311438448, + "flos": 18223265387520.0, + "grad_norm": 7.127158032635832, + "language_loss": 0.76272076, + "learning_rate": 3.7933541464482815e-06, + "loss": 0.78288871, + "num_input_tokens_seen": 61865570, + "router_z_loss_clip": 3.37109375, + "router_z_loss_mlp": 0.35498047, + "step": 2854, + "time_per_iteration": 2.607835292816162 + }, + { + "auxiliary_loss_clip": 0.01620031, + "auxiliary_loss_mlp": 0.00396779, + "balance_loss_clip": 1.28297758, + "balance_loss_mlp": 0.36468828, + "epoch": 0.17165188636705245, + "flos": 20738987159040.0, + "grad_norm": 27.280916722840864, + "language_loss": 0.94863307, + "learning_rate": 3.7931817035073124e-06, + "loss": 0.96880126, + "num_input_tokens_seen": 61883340, + "router_z_loss_clip": 3.37109375, + "router_z_loss_mlp": 0.32128906, + "step": 2855, + "time_per_iteration": 2.6052746772766113 + }, + { + "auxiliary_loss_clip": 0.01611442, + "auxiliary_loss_mlp": 0.00396367, + "balance_loss_clip": 1.27662611, + "balance_loss_mlp": 0.36387029, + "epoch": 0.17171200961972044, + "flos": 24899884680960.0, + "grad_norm": 5.145771858813035, + "language_loss": 0.909863, + "learning_rate": 3.7930091925687134e-06, + "loss": 0.92994106, + "num_input_tokens_seen": 61900610, + "router_z_loss_clip": 3.34570312, + "router_z_loss_mlp": 0.32458496, + "step": 2856, + "time_per_iteration": 2.645092725753784 + }, + { + "auxiliary_loss_clip": 0.01608803, + "auxiliary_loss_mlp": 0.0041388, + "balance_loss_clip": 1.27200615, + "balance_loss_mlp": 0.37952423, + "epoch": 0.1717721328723884, + "flos": 20157234485760.0, + "grad_norm": 9.834272703315236, + "language_loss": 0.93375814, + "learning_rate": 3.792836613639026e-06, + "loss": 0.95398498, + "num_input_tokens_seen": 61916795, + "router_z_loss_clip": 3.36523438, + "router_z_loss_mlp": 0.34350586, + "step": 2857, + "time_per_iteration": 2.6408448219299316 + }, + { + "auxiliary_loss_clip": 0.01598982, + "auxiliary_loss_mlp": 0.00386329, + "balance_loss_clip": 1.2617979, + "balance_loss_mlp": 0.35275951, + "epoch": 0.17183225612505637, + "flos": 23361650697600.0, + "grad_norm": 5.767570431749359, + "language_loss": 0.84936011, + "learning_rate": 3.7926639667247947e-06, + "loss": 0.86921322, + "num_input_tokens_seen": 61936665, + "router_z_loss_clip": 3.375, + "router_z_loss_mlp": 0.33532715, + "step": 2858, + "time_per_iteration": 2.6481873989105225 + }, + { + "auxiliary_loss_clip": 0.0160334, + "auxiliary_loss_mlp": 0.0039149, + "balance_loss_clip": 1.26237941, + "balance_loss_mlp": 0.35413021, + "epoch": 0.17189237937772434, + "flos": 18114240631680.0, + "grad_norm": 7.252808194461094, + "language_loss": 0.83874846, + "learning_rate": 3.7924912518325663e-06, + "loss": 0.85869682, + "num_input_tokens_seen": 61954415, + "router_z_loss_clip": 3.40820312, + "router_z_loss_mlp": 0.37329102, + "step": 2859, + "time_per_iteration": 2.60473895072937 + }, + { + "auxiliary_loss_clip": 0.01591254, + "auxiliary_loss_mlp": 0.00411685, + "balance_loss_clip": 1.25407791, + "balance_loss_mlp": 0.37716195, + "epoch": 0.1719525026303923, + "flos": 23258408031360.0, + "grad_norm": 29.671229301623022, + "language_loss": 0.81750619, + "learning_rate": 3.7923184689688902e-06, + "loss": 0.83753562, + "num_input_tokens_seen": 61973940, + "router_z_loss_clip": 3.37109375, + "router_z_loss_mlp": 0.34521484, + "step": 2860, + "time_per_iteration": 2.6374683380126953 + }, + { + "auxiliary_loss_clip": 0.01579502, + "auxiliary_loss_mlp": 0.00403119, + "balance_loss_clip": 1.24171948, + "balance_loss_mlp": 0.36761808, + "epoch": 0.17201262588306027, + "flos": 20810413353600.0, + "grad_norm": 10.7246006357813, + "language_loss": 0.88041478, + "learning_rate": 3.792145618140317e-06, + "loss": 0.90024102, + "num_input_tokens_seen": 61991845, + "router_z_loss_clip": 3.375, + "router_z_loss_mlp": 0.35498047, + "step": 2861, + "time_per_iteration": 2.581437349319458 + }, + { + "auxiliary_loss_clip": 0.01596656, + "auxiliary_loss_mlp": 0.0040611, + "balance_loss_clip": 1.25437844, + "balance_loss_mlp": 0.37118199, + "epoch": 0.17207274913572823, + "flos": 20375858615040.0, + "grad_norm": 11.668236617794646, + "language_loss": 0.93676561, + "learning_rate": 3.7919726993534038e-06, + "loss": 0.95679331, + "num_input_tokens_seen": 62009395, + "router_z_loss_clip": 3.41992188, + "router_z_loss_mlp": 0.34912109, + "step": 2862, + "time_per_iteration": 2.6143763065338135 + }, + { + "auxiliary_loss_clip": 0.01590826, + "auxiliary_loss_mlp": 0.00391797, + "balance_loss_clip": 1.25542021, + "balance_loss_mlp": 0.35789347, + "epoch": 0.17213287238839622, + "flos": 26797727675520.0, + "grad_norm": 6.172739402101407, + "language_loss": 0.83415174, + "learning_rate": 3.7917997126147054e-06, + "loss": 0.85397792, + "num_input_tokens_seen": 62029005, + "router_z_loss_clip": 3.34960938, + "router_z_loss_mlp": 0.33886719, + "step": 2863, + "time_per_iteration": 2.737022876739502 + }, + { + "auxiliary_loss_clip": 0.01590477, + "auxiliary_loss_mlp": 0.00401119, + "balance_loss_clip": 1.25725436, + "balance_loss_mlp": 0.36311495, + "epoch": 0.1721929956410642, + "flos": 26030819370240.0, + "grad_norm": 1.8819394461679213, + "language_loss": 0.77633798, + "learning_rate": 3.7916266579307823e-06, + "loss": 0.79625386, + "num_input_tokens_seen": 62048730, + "router_z_loss_clip": 3.33007812, + "router_z_loss_mlp": 0.37988281, + "step": 2864, + "time_per_iteration": 2.7099649906158447 + }, + { + "auxiliary_loss_clip": 0.01582786, + "auxiliary_loss_mlp": 0.00432173, + "balance_loss_clip": 1.24684215, + "balance_loss_mlp": 0.39636245, + "epoch": 0.17225311889373215, + "flos": 22273091078400.0, + "grad_norm": 15.269430613730918, + "language_loss": 0.80278313, + "learning_rate": 3.7914535353081973e-06, + "loss": 0.82293278, + "num_input_tokens_seen": 62069000, + "router_z_loss_clip": 3.35742188, + "router_z_loss_mlp": 0.35791016, + "step": 2865, + "time_per_iteration": 2.6830692291259766 + }, + { + "auxiliary_loss_clip": 0.01576678, + "auxiliary_loss_mlp": 0.0039839, + "balance_loss_clip": 1.23932767, + "balance_loss_mlp": 0.36227, + "epoch": 0.17231324214640012, + "flos": 21287774125440.0, + "grad_norm": 16.62298810063414, + "language_loss": 0.8651793, + "learning_rate": 3.7912803447535145e-06, + "loss": 0.88493001, + "num_input_tokens_seen": 62086750, + "router_z_loss_clip": 3.37109375, + "router_z_loss_mlp": 0.36132812, + "step": 2866, + "time_per_iteration": 2.661207675933838 + }, + { + "auxiliary_loss_clip": 0.01592461, + "auxiliary_loss_mlp": 0.00424143, + "balance_loss_clip": 1.25185013, + "balance_loss_mlp": 0.38234797, + "epoch": 0.17237336539906808, + "flos": 19680735640320.0, + "grad_norm": 2.3027466197440725, + "language_loss": 0.85206628, + "learning_rate": 3.7911070862733016e-06, + "loss": 0.87223232, + "num_input_tokens_seen": 62106240, + "router_z_loss_clip": 3.40820312, + "router_z_loss_mlp": 0.41748047, + "step": 2867, + "time_per_iteration": 2.6447298526763916 + }, + { + "auxiliary_loss_clip": 0.01583327, + "auxiliary_loss_mlp": 0.00428947, + "balance_loss_clip": 1.24739957, + "balance_loss_mlp": 0.39153942, + "epoch": 0.17243348865173605, + "flos": 17529650784000.0, + "grad_norm": 28.360011264946444, + "language_loss": 0.85568464, + "learning_rate": 3.7909337598741276e-06, + "loss": 0.8758074, + "num_input_tokens_seen": 62124895, + "router_z_loss_clip": 3.36523438, + "router_z_loss_mlp": 0.37451172, + "step": 2868, + "time_per_iteration": 2.700200319290161 + }, + { + "auxiliary_loss_clip": 0.01590867, + "auxiliary_loss_mlp": 0.00437063, + "balance_loss_clip": 1.25185227, + "balance_loss_mlp": 0.39882106, + "epoch": 0.17249361190440402, + "flos": 18259858368000.0, + "grad_norm": 10.913980959844753, + "language_loss": 0.89937437, + "learning_rate": 3.7907603655625674e-06, + "loss": 0.91965365, + "num_input_tokens_seen": 62143510, + "router_z_loss_clip": 3.39453125, + "router_z_loss_mlp": 0.38256836, + "step": 2869, + "time_per_iteration": 2.572721242904663 + }, + { + "auxiliary_loss_clip": 0.01577495, + "auxiliary_loss_mlp": 0.00369882, + "balance_loss_clip": 1.24228406, + "balance_loss_mlp": 0.33471549, + "epoch": 0.172553735157072, + "flos": 21174367910400.0, + "grad_norm": 48.51235025531055, + "language_loss": 0.83528829, + "learning_rate": 3.7905869033451932e-06, + "loss": 0.85476208, + "num_input_tokens_seen": 62162285, + "router_z_loss_clip": 3.3515625, + "router_z_loss_mlp": 0.35180664, + "step": 2870, + "time_per_iteration": 2.657640218734741 + }, + { + "auxiliary_loss_clip": 0.01587733, + "auxiliary_loss_mlp": 0.0040183, + "balance_loss_clip": 1.25555539, + "balance_loss_mlp": 0.36921456, + "epoch": 0.17261385840973997, + "flos": 22273270646400.0, + "grad_norm": 7.677152564749584, + "language_loss": 0.82970178, + "learning_rate": 3.7904133732285857e-06, + "loss": 0.84959745, + "num_input_tokens_seen": 62180970, + "router_z_loss_clip": 3.32421875, + "router_z_loss_mlp": 0.32592773, + "step": 2871, + "time_per_iteration": 2.5949013233184814 + }, + { + "auxiliary_loss_clip": 0.01593417, + "auxiliary_loss_mlp": 0.00371812, + "balance_loss_clip": 1.2535646, + "balance_loss_mlp": 0.33838564, + "epoch": 0.17267398166240794, + "flos": 27922233830400.0, + "grad_norm": 27.564721315137902, + "language_loss": 0.83275104, + "learning_rate": 3.7902397752193228e-06, + "loss": 0.85240334, + "num_input_tokens_seen": 62198965, + "router_z_loss_clip": 3.40234375, + "router_z_loss_mlp": 0.33422852, + "step": 2872, + "time_per_iteration": 2.750235080718994 + }, + { + "auxiliary_loss_clip": 0.01577911, + "auxiliary_loss_mlp": 0.00362736, + "balance_loss_clip": 1.24798131, + "balance_loss_mlp": 0.32654357, + "epoch": 0.1727341049150759, + "flos": 21945118970880.0, + "grad_norm": 12.922535113781615, + "language_loss": 0.87429404, + "learning_rate": 3.790066109323988e-06, + "loss": 0.8937006, + "num_input_tokens_seen": 62219890, + "router_z_loss_clip": 3.3046875, + "router_z_loss_mlp": 0.36206055, + "step": 2873, + "time_per_iteration": 2.6698966026306152 + }, + { + "auxiliary_loss_clip": 0.01585722, + "auxiliary_loss_mlp": 0.00339382, + "balance_loss_clip": 1.2517271, + "balance_loss_mlp": 0.30450135, + "epoch": 0.17279422816774387, + "flos": 18107883924480.0, + "grad_norm": 8.626853250133276, + "language_loss": 0.81331134, + "learning_rate": 3.7898923755491678e-06, + "loss": 0.83256233, + "num_input_tokens_seen": 62237140, + "router_z_loss_clip": 3.33984375, + "router_z_loss_mlp": 0.34912109, + "step": 2874, + "time_per_iteration": 2.619825839996338 + }, + { + "auxiliary_loss_clip": 0.01596486, + "auxiliary_loss_mlp": 0.00386172, + "balance_loss_clip": 1.25883949, + "balance_loss_mlp": 0.34571266, + "epoch": 0.17285435142041183, + "flos": 21835447770240.0, + "grad_norm": 3.035800439321808, + "language_loss": 0.88006592, + "learning_rate": 3.7897185739014487e-06, + "loss": 0.89989251, + "num_input_tokens_seen": 62255405, + "router_z_loss_clip": 3.37890625, + "router_z_loss_mlp": 0.40478516, + "step": 2875, + "time_per_iteration": 2.583841562271118 + }, + { + "auxiliary_loss_clip": 0.01580444, + "auxiliary_loss_mlp": 0.00371598, + "balance_loss_clip": 1.24768186, + "balance_loss_mlp": 0.33433324, + "epoch": 0.17291447467307983, + "flos": 18368452160640.0, + "grad_norm": 4.674852513174099, + "language_loss": 0.97543585, + "learning_rate": 3.7895447043874217e-06, + "loss": 0.99495631, + "num_input_tokens_seen": 62271280, + "router_z_loss_clip": 3.328125, + "router_z_loss_mlp": 0.37304688, + "step": 2876, + "time_per_iteration": 2.574932336807251 + }, + { + "auxiliary_loss_clip": 0.01601962, + "auxiliary_loss_mlp": 0.00367482, + "balance_loss_clip": 1.27176642, + "balance_loss_mlp": 0.3329111, + "epoch": 0.1729745979257478, + "flos": 18624638937600.0, + "grad_norm": 6.499217159425447, + "language_loss": 0.91084963, + "learning_rate": 3.789370767013681e-06, + "loss": 0.93054402, + "num_input_tokens_seen": 62289140, + "router_z_loss_clip": 3.30078125, + "router_z_loss_mlp": 0.34570312, + "step": 2877, + "time_per_iteration": 2.628335952758789 + }, + { + "auxiliary_loss_clip": 0.0160922, + "auxiliary_loss_mlp": 0.00359238, + "balance_loss_clip": 1.27446008, + "balance_loss_mlp": 0.32285553, + "epoch": 0.17303472117841576, + "flos": 22998234844800.0, + "grad_norm": 45.291922993575035, + "language_loss": 0.84763825, + "learning_rate": 3.7891967617868204e-06, + "loss": 0.8673228, + "num_input_tokens_seen": 62307490, + "router_z_loss_clip": 3.3515625, + "router_z_loss_mlp": 0.36352539, + "step": 2878, + "time_per_iteration": 2.60723876953125 + }, + { + "auxiliary_loss_clip": 0.01590195, + "auxiliary_loss_mlp": 0.0035414, + "balance_loss_clip": 1.25985241, + "balance_loss_mlp": 0.31806698, + "epoch": 0.17309484443108372, + "flos": 25664386775040.0, + "grad_norm": 6.531736077439721, + "language_loss": 0.76387012, + "learning_rate": 3.78902268871344e-06, + "loss": 0.78331351, + "num_input_tokens_seen": 62328570, + "router_z_loss_clip": 3.30273438, + "router_z_loss_mlp": 0.36035156, + "step": 2879, + "time_per_iteration": 2.6278786659240723 + }, + { + "auxiliary_loss_clip": 0.01608715, + "auxiliary_loss_mlp": 0.00378227, + "balance_loss_clip": 1.27330804, + "balance_loss_mlp": 0.34027085, + "epoch": 0.1731549676837517, + "flos": 13552903313280.0, + "grad_norm": 68.48929197781315, + "language_loss": 0.91919184, + "learning_rate": 3.78884854780014e-06, + "loss": 0.93906128, + "num_input_tokens_seen": 62345735, + "router_z_loss_clip": 3.35742188, + "router_z_loss_mlp": 0.37963867, + "step": 2880, + "time_per_iteration": 2.612574577331543 + }, + { + "auxiliary_loss_clip": 0.01618108, + "auxiliary_loss_mlp": 0.00389651, + "balance_loss_clip": 1.28299117, + "balance_loss_mlp": 0.35107481, + "epoch": 0.17321509093641965, + "flos": 22857070394880.0, + "grad_norm": 10.968661474908936, + "language_loss": 0.87553382, + "learning_rate": 3.7886743390535236e-06, + "loss": 0.89561146, + "num_input_tokens_seen": 62365525, + "router_z_loss_clip": 3.34960938, + "router_z_loss_mlp": 0.38598633, + "step": 2881, + "time_per_iteration": 2.730792284011841 + }, + { + "auxiliary_loss_clip": 0.01598009, + "auxiliary_loss_mlp": 0.00368109, + "balance_loss_clip": 1.2681396, + "balance_loss_mlp": 0.33267987, + "epoch": 0.17327521418908762, + "flos": 24352785653760.0, + "grad_norm": 10.183450504537923, + "language_loss": 0.83834422, + "learning_rate": 3.788500062480197e-06, + "loss": 0.8580054, + "num_input_tokens_seen": 62385160, + "router_z_loss_clip": 3.296875, + "router_z_loss_mlp": 0.35424805, + "step": 2882, + "time_per_iteration": 2.7336947917938232 + }, + { + "auxiliary_loss_clip": 0.01611117, + "auxiliary_loss_mlp": 0.00366005, + "balance_loss_clip": 1.28305721, + "balance_loss_mlp": 0.33002803, + "epoch": 0.1733353374417556, + "flos": 33105651816960.0, + "grad_norm": 3.228718968165747, + "language_loss": 0.81622279, + "learning_rate": 3.788325718086769e-06, + "loss": 0.83599401, + "num_input_tokens_seen": 62405280, + "router_z_loss_clip": 3.27929688, + "router_z_loss_mlp": 0.35961914, + "step": 2883, + "time_per_iteration": 2.7368886470794678 + }, + { + "auxiliary_loss_clip": 0.01598902, + "auxiliary_loss_mlp": 0.00371454, + "balance_loss_clip": 1.26809013, + "balance_loss_mlp": 0.33330733, + "epoch": 0.17339546069442358, + "flos": 24388947671040.0, + "grad_norm": 13.028930845886626, + "language_loss": 0.91194785, + "learning_rate": 3.7881513058798503e-06, + "loss": 0.93165141, + "num_input_tokens_seen": 62423665, + "router_z_loss_clip": 3.30664062, + "router_z_loss_mlp": 0.38183594, + "step": 2884, + "time_per_iteration": 2.621738910675049 + }, + { + "auxiliary_loss_clip": 0.01631193, + "auxiliary_loss_mlp": 0.00387395, + "balance_loss_clip": 1.29682231, + "balance_loss_mlp": 0.35170352, + "epoch": 0.17345558394709154, + "flos": 27454174680960.0, + "grad_norm": 16.70027813332801, + "language_loss": 0.80197388, + "learning_rate": 3.787976825866055e-06, + "loss": 0.82215977, + "num_input_tokens_seen": 62445170, + "router_z_loss_clip": 3.34765625, + "router_z_loss_mlp": 0.35693359, + "step": 2885, + "time_per_iteration": 2.695791006088257 + }, + { + "auxiliary_loss_clip": 0.01609248, + "auxiliary_loss_mlp": 0.00318444, + "balance_loss_clip": 1.28542733, + "balance_loss_mlp": 0.28713953, + "epoch": 0.1735157071997595, + "flos": 24682158391680.0, + "grad_norm": 6.932721830328607, + "language_loss": 0.75935137, + "learning_rate": 3.7878022780519998e-06, + "loss": 0.77862829, + "num_input_tokens_seen": 62466135, + "router_z_loss_clip": 3.234375, + "router_z_loss_mlp": 0.31274414, + "step": 2886, + "time_per_iteration": 2.6621205806732178 + }, + { + "auxiliary_loss_clip": 0.0161782, + "auxiliary_loss_mlp": 0.0036193, + "balance_loss_clip": 1.28891516, + "balance_loss_mlp": 0.32385454, + "epoch": 0.17357583045242747, + "flos": 21688932193920.0, + "grad_norm": 9.930228298911933, + "language_loss": 0.77582335, + "learning_rate": 3.7876276624443024e-06, + "loss": 0.79562086, + "num_input_tokens_seen": 62483910, + "router_z_loss_clip": 3.2890625, + "router_z_loss_mlp": 0.38061523, + "step": 2887, + "time_per_iteration": 2.6286046504974365 + }, + { + "auxiliary_loss_clip": 0.01608346, + "auxiliary_loss_mlp": 0.00374176, + "balance_loss_clip": 1.28270197, + "balance_loss_mlp": 0.33605278, + "epoch": 0.17363595370509544, + "flos": 15375728753280.0, + "grad_norm": 66.80689679060514, + "language_loss": 0.91617334, + "learning_rate": 3.787452979049585e-06, + "loss": 0.93599856, + "num_input_tokens_seen": 62501530, + "router_z_loss_clip": 3.25585938, + "router_z_loss_mlp": 0.38110352, + "step": 2888, + "time_per_iteration": 4.02417516708374 + }, + { + "auxiliary_loss_clip": 0.0163217, + "auxiliary_loss_mlp": 0.00386811, + "balance_loss_clip": 1.29867196, + "balance_loss_mlp": 0.34628007, + "epoch": 0.1736960769577634, + "flos": 23440941970560.0, + "grad_norm": 19.88501214419633, + "language_loss": 0.88021713, + "learning_rate": 3.7872782278744718e-06, + "loss": 0.90040696, + "num_input_tokens_seen": 62521295, + "router_z_loss_clip": 3.33203125, + "router_z_loss_mlp": 0.40527344, + "step": 2889, + "time_per_iteration": 2.6377077102661133 + }, + { + "auxiliary_loss_clip": 0.01596061, + "auxiliary_loss_mlp": 0.00360905, + "balance_loss_clip": 1.27159023, + "balance_loss_mlp": 0.32633463, + "epoch": 0.1737562002104314, + "flos": 18587830475520.0, + "grad_norm": 133.64726117882464, + "language_loss": 0.91790432, + "learning_rate": 3.7871034089255883e-06, + "loss": 0.93747401, + "num_input_tokens_seen": 62539615, + "router_z_loss_clip": 3.24414062, + "router_z_loss_mlp": 0.34570312, + "step": 2890, + "time_per_iteration": 2.64852237701416 + }, + { + "auxiliary_loss_clip": 0.01608835, + "auxiliary_loss_mlp": 0.00386094, + "balance_loss_clip": 1.27760363, + "balance_loss_mlp": 0.34890032, + "epoch": 0.17381632346309936, + "flos": 15998060816640.0, + "grad_norm": 15.781227661519257, + "language_loss": 0.89875042, + "learning_rate": 3.7869285222095653e-06, + "loss": 0.91869974, + "num_input_tokens_seen": 62556820, + "router_z_loss_clip": 3.31054688, + "router_z_loss_mlp": 0.37207031, + "step": 2891, + "time_per_iteration": 2.5430305004119873 + }, + { + "auxiliary_loss_clip": 0.01590069, + "auxiliary_loss_mlp": 0.00364138, + "balance_loss_clip": 1.26050878, + "balance_loss_mlp": 0.32951951, + "epoch": 0.17387644671576732, + "flos": 13369830670080.0, + "grad_norm": 8.993123411689751, + "language_loss": 0.88440019, + "learning_rate": 3.7867535677330334e-06, + "loss": 0.90394223, + "num_input_tokens_seen": 62572450, + "router_z_loss_clip": 3.296875, + "router_z_loss_mlp": 0.34643555, + "step": 2892, + "time_per_iteration": 3.9830968379974365 + }, + { + "auxiliary_loss_clip": 0.01611171, + "auxiliary_loss_mlp": 0.00396143, + "balance_loss_clip": 1.28262115, + "balance_loss_mlp": 0.35461077, + "epoch": 0.1739365699684353, + "flos": 26615516958720.0, + "grad_norm": 12.351561806973393, + "language_loss": 0.81906676, + "learning_rate": 3.786578545502627e-06, + "loss": 0.83913994, + "num_input_tokens_seen": 62592580, + "router_z_loss_clip": 3.28320312, + "router_z_loss_mlp": 0.41503906, + "step": 2893, + "time_per_iteration": 2.6654129028320312 + }, + { + "auxiliary_loss_clip": 0.01602249, + "auxiliary_loss_mlp": 0.00401749, + "balance_loss_clip": 1.2743336, + "balance_loss_mlp": 0.3629581, + "epoch": 0.17399669322110325, + "flos": 23367971491200.0, + "grad_norm": 24.058606751004426, + "language_loss": 0.8996911, + "learning_rate": 3.7864034555249828e-06, + "loss": 0.91973102, + "num_input_tokens_seen": 62611220, + "router_z_loss_clip": 3.27929688, + "router_z_loss_mlp": 0.38793945, + "step": 2894, + "time_per_iteration": 4.076308250427246 + }, + { + "auxiliary_loss_clip": 0.0161018, + "auxiliary_loss_mlp": 0.00363714, + "balance_loss_clip": 1.28257573, + "balance_loss_mlp": 0.32296783, + "epoch": 0.17405681647377122, + "flos": 22054107813120.0, + "grad_norm": 10.093248332996742, + "language_loss": 0.81053376, + "learning_rate": 3.786228297806741e-06, + "loss": 0.83027267, + "num_input_tokens_seen": 62629185, + "router_z_loss_clip": 3.27734375, + "router_z_loss_mlp": 0.4074707, + "step": 2895, + "time_per_iteration": 2.703037977218628 + }, + { + "auxiliary_loss_clip": 0.01532734, + "auxiliary_loss_mlp": 0.00160578, + "balance_loss_clip": 1.27944827, + "balance_loss_mlp": 0.14856122, + "epoch": 0.1741169397264392, + "flos": 61457559114240.0, + "grad_norm": 0.8615490908528676, + "language_loss": 0.62389958, + "learning_rate": 3.7860530723545435e-06, + "loss": 0.64083266, + "num_input_tokens_seen": 62691895, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.12011719, + "step": 2896, + "time_per_iteration": 3.1833369731903076 + }, + { + "auxiliary_loss_clip": 0.01604582, + "auxiliary_loss_mlp": 0.00371149, + "balance_loss_clip": 1.27812648, + "balance_loss_mlp": 0.33345526, + "epoch": 0.17417706297910718, + "flos": 27017680608000.0, + "grad_norm": 4.568816182369421, + "language_loss": 0.8203221, + "learning_rate": 3.785877779175034e-06, + "loss": 0.84007943, + "num_input_tokens_seen": 62713790, + "router_z_loss_clip": 3.26757812, + "router_z_loss_mlp": 0.37695312, + "step": 2897, + "time_per_iteration": 2.663837432861328 + }, + { + "auxiliary_loss_clip": 0.01607266, + "auxiliary_loss_mlp": 0.00374937, + "balance_loss_clip": 1.28561187, + "balance_loss_mlp": 0.33729118, + "epoch": 0.17423718623177514, + "flos": 33508856960640.0, + "grad_norm": 37.498714508948694, + "language_loss": 0.7570107, + "learning_rate": 3.7857024182748606e-06, + "loss": 0.7768327, + "num_input_tokens_seen": 62736285, + "router_z_loss_clip": 3.21679688, + "router_z_loss_mlp": 0.3762207, + "step": 2898, + "time_per_iteration": 2.7769758701324463 + }, + { + "auxiliary_loss_clip": 0.0161216, + "auxiliary_loss_mlp": 0.00400165, + "balance_loss_clip": 1.28530169, + "balance_loss_mlp": 0.36082608, + "epoch": 0.1742973094844431, + "flos": 27198634348800.0, + "grad_norm": 4.268951667534003, + "language_loss": 0.85133278, + "learning_rate": 3.7855269896606717e-06, + "loss": 0.87145603, + "num_input_tokens_seen": 62756240, + "router_z_loss_clip": 3.26953125, + "router_z_loss_mlp": 0.39331055, + "step": 2899, + "time_per_iteration": 2.6533725261688232 + }, + { + "auxiliary_loss_clip": 0.01582769, + "auxiliary_loss_mlp": 0.00361439, + "balance_loss_clip": 1.2630254, + "balance_loss_mlp": 0.3262009, + "epoch": 0.17435743273711107, + "flos": 22710734386560.0, + "grad_norm": 38.374200433209104, + "language_loss": 0.77501225, + "learning_rate": 3.785351493339121e-06, + "loss": 0.79445434, + "num_input_tokens_seen": 62775910, + "router_z_loss_clip": 3.20117188, + "router_z_loss_mlp": 0.35205078, + "step": 2900, + "time_per_iteration": 2.613215684890747 + }, + { + "auxiliary_loss_clip": 0.01597426, + "auxiliary_loss_mlp": 0.00357499, + "balance_loss_clip": 1.27692652, + "balance_loss_mlp": 0.32114041, + "epoch": 0.17441755598977904, + "flos": 41646466039680.0, + "grad_norm": 6.686397054162063, + "language_loss": 0.75523388, + "learning_rate": 3.785175929316863e-06, + "loss": 0.77478313, + "num_input_tokens_seen": 62799385, + "router_z_loss_clip": 3.203125, + "router_z_loss_mlp": 0.36352539, + "step": 2901, + "time_per_iteration": 2.758744239807129 + }, + { + "auxiliary_loss_clip": 0.01617854, + "auxiliary_loss_mlp": 0.00396284, + "balance_loss_clip": 1.29500806, + "balance_loss_mlp": 0.3600443, + "epoch": 0.174477679242447, + "flos": 26287077974400.0, + "grad_norm": 3.325677044410505, + "language_loss": 0.79899657, + "learning_rate": 3.7850002976005543e-06, + "loss": 0.81913793, + "num_input_tokens_seen": 62819380, + "router_z_loss_clip": 3.2265625, + "router_z_loss_mlp": 0.36230469, + "step": 2902, + "time_per_iteration": 2.6566319465637207 + }, + { + "auxiliary_loss_clip": 0.01601601, + "auxiliary_loss_mlp": 0.00379343, + "balance_loss_clip": 1.28042245, + "balance_loss_mlp": 0.34210208, + "epoch": 0.174537802495115, + "flos": 17858412990720.0, + "grad_norm": 52.36683900420012, + "language_loss": 0.86866605, + "learning_rate": 3.7848245981968558e-06, + "loss": 0.88847554, + "num_input_tokens_seen": 62836205, + "router_z_loss_clip": 3.2109375, + "router_z_loss_mlp": 0.37231445, + "step": 2903, + "time_per_iteration": 2.5493059158325195 + }, + { + "auxiliary_loss_clip": 0.0159092, + "auxiliary_loss_mlp": 0.00409244, + "balance_loss_clip": 1.27330947, + "balance_loss_mlp": 0.37271842, + "epoch": 0.17459792574778296, + "flos": 16940715390720.0, + "grad_norm": 1.751282112256498, + "language_loss": 0.78762889, + "learning_rate": 3.784648831112429e-06, + "loss": 0.80763054, + "num_input_tokens_seen": 62854045, + "router_z_loss_clip": 3.17578125, + "router_z_loss_mlp": 0.36572266, + "step": 2904, + "time_per_iteration": 2.5679984092712402 + }, + { + "auxiliary_loss_clip": 0.01589778, + "auxiliary_loss_mlp": 0.00326536, + "balance_loss_clip": 1.2674185, + "balance_loss_mlp": 0.29434934, + "epoch": 0.17465804900045093, + "flos": 25520026014720.0, + "grad_norm": 2.4991700814130833, + "language_loss": 0.71851599, + "learning_rate": 3.7844729963539406e-06, + "loss": 0.73767912, + "num_input_tokens_seen": 62873075, + "router_z_loss_clip": 3.22070312, + "router_z_loss_mlp": 0.32177734, + "step": 2905, + "time_per_iteration": 2.6622025966644287 + }, + { + "auxiliary_loss_clip": 0.01601111, + "auxiliary_loss_mlp": 0.00398787, + "balance_loss_clip": 1.27539778, + "balance_loss_mlp": 0.36183202, + "epoch": 0.1747181722531189, + "flos": 24129708238080.0, + "grad_norm": 48.261952315982604, + "language_loss": 0.84766102, + "learning_rate": 3.7842970939280566e-06, + "loss": 0.86766005, + "num_input_tokens_seen": 62892675, + "router_z_loss_clip": 3.25585938, + "router_z_loss_mlp": 0.36938477, + "step": 2906, + "time_per_iteration": 2.6521172523498535 + }, + { + "auxiliary_loss_clip": 0.01599127, + "auxiliary_loss_mlp": 0.00383298, + "balance_loss_clip": 1.27790368, + "balance_loss_mlp": 0.34708241, + "epoch": 0.17477829550578686, + "flos": 17748813617280.0, + "grad_norm": 10.63724007889585, + "language_loss": 0.86949801, + "learning_rate": 3.784121123841449e-06, + "loss": 0.88932228, + "num_input_tokens_seen": 62910675, + "router_z_loss_clip": 3.20703125, + "router_z_loss_mlp": 0.36206055, + "step": 2907, + "time_per_iteration": 2.6125190258026123 + }, + { + "auxiliary_loss_clip": 0.01585183, + "auxiliary_loss_mlp": 0.00361715, + "balance_loss_clip": 1.26188719, + "balance_loss_mlp": 0.32712024, + "epoch": 0.17483841875845482, + "flos": 15377344865280.0, + "grad_norm": 21.354094184309663, + "language_loss": 0.88914657, + "learning_rate": 3.7839450861007886e-06, + "loss": 0.90861547, + "num_input_tokens_seen": 62928130, + "router_z_loss_clip": 3.23046875, + "router_z_loss_mlp": 0.34619141, + "step": 2908, + "time_per_iteration": 2.6068763732910156 + }, + { + "auxiliary_loss_clip": 0.01587302, + "auxiliary_loss_mlp": 0.00390461, + "balance_loss_clip": 1.26719642, + "balance_loss_mlp": 0.35620067, + "epoch": 0.17489854201112282, + "flos": 17163254102400.0, + "grad_norm": 14.529207281734951, + "language_loss": 0.89763999, + "learning_rate": 3.7837689807127518e-06, + "loss": 0.91741759, + "num_input_tokens_seen": 62944290, + "router_z_loss_clip": 3.20507812, + "router_z_loss_mlp": 0.3425293, + "step": 2909, + "time_per_iteration": 2.538262128829956 + }, + { + "auxiliary_loss_clip": 0.01598266, + "auxiliary_loss_mlp": 0.00436541, + "balance_loss_clip": 1.2728889, + "balance_loss_mlp": 0.39631939, + "epoch": 0.17495866526379078, + "flos": 19755286318080.0, + "grad_norm": 5.124005520305537, + "language_loss": 0.82317436, + "learning_rate": 3.783592807684017e-06, + "loss": 0.84352243, + "num_input_tokens_seen": 62963505, + "router_z_loss_clip": 3.25585938, + "router_z_loss_mlp": 0.40234375, + "step": 2910, + "time_per_iteration": 2.6428537368774414 + }, + { + "auxiliary_loss_clip": 0.01578194, + "auxiliary_loss_mlp": 0.00378694, + "balance_loss_clip": 1.25941813, + "balance_loss_mlp": 0.3385438, + "epoch": 0.17501878851645875, + "flos": 28511133310080.0, + "grad_norm": 3.90100587792205, + "language_loss": 0.91507351, + "learning_rate": 3.7834165670212645e-06, + "loss": 0.93464231, + "num_input_tokens_seen": 62985020, + "router_z_loss_clip": 3.1875, + "router_z_loss_mlp": 0.40185547, + "step": 2911, + "time_per_iteration": 2.660956382751465 + }, + { + "auxiliary_loss_clip": 0.01582655, + "auxiliary_loss_mlp": 0.00393829, + "balance_loss_clip": 1.25783026, + "balance_loss_mlp": 0.35615936, + "epoch": 0.1750789117691267, + "flos": 17931203902080.0, + "grad_norm": 147.70845487285666, + "language_loss": 0.9506216, + "learning_rate": 3.7832402587311764e-06, + "loss": 0.97038651, + "num_input_tokens_seen": 63001745, + "router_z_loss_clip": 3.25, + "router_z_loss_mlp": 0.37646484, + "step": 2912, + "time_per_iteration": 2.610936403274536 + }, + { + "auxiliary_loss_clip": 0.01575759, + "auxiliary_loss_mlp": 0.00379939, + "balance_loss_clip": 1.2592423, + "balance_loss_mlp": 0.34353295, + "epoch": 0.17513903502179468, + "flos": 18259427404800.0, + "grad_norm": 26.16938017364483, + "language_loss": 0.80345166, + "learning_rate": 3.783063882820439e-06, + "loss": 0.82300872, + "num_input_tokens_seen": 63019750, + "router_z_loss_clip": 3.16210938, + "router_z_loss_mlp": 0.36425781, + "step": 2913, + "time_per_iteration": 2.6082494258880615 + }, + { + "auxiliary_loss_clip": 0.01592171, + "auxiliary_loss_mlp": 0.00365247, + "balance_loss_clip": 1.27639866, + "balance_loss_mlp": 0.33134434, + "epoch": 0.17519915827446264, + "flos": 20704728562560.0, + "grad_norm": 5.242876795117796, + "language_loss": 0.77149445, + "learning_rate": 3.782887439295741e-06, + "loss": 0.79106867, + "num_input_tokens_seen": 63039500, + "router_z_loss_clip": 3.15625, + "router_z_loss_mlp": 0.33911133, + "step": 2914, + "time_per_iteration": 2.6174118518829346 + }, + { + "auxiliary_loss_clip": 0.01600625, + "auxiliary_loss_mlp": 0.00430517, + "balance_loss_clip": 1.28212202, + "balance_loss_mlp": 0.39141595, + "epoch": 0.1752592815271306, + "flos": 20523415685760.0, + "grad_norm": 5.795631581172803, + "language_loss": 0.98740137, + "learning_rate": 3.782710928163772e-06, + "loss": 1.00771284, + "num_input_tokens_seen": 63059785, + "router_z_loss_clip": 3.1875, + "router_z_loss_mlp": 0.39086914, + "step": 2915, + "time_per_iteration": 2.6912147998809814 + }, + { + "auxiliary_loss_clip": 0.01587949, + "auxiliary_loss_mlp": 0.00414755, + "balance_loss_clip": 1.26779032, + "balance_loss_mlp": 0.37830073, + "epoch": 0.1753194047797986, + "flos": 21799178012160.0, + "grad_norm": 350.89470061078777, + "language_loss": 0.86085725, + "learning_rate": 3.782534349431226e-06, + "loss": 0.88088429, + "num_input_tokens_seen": 63079385, + "router_z_loss_clip": 3.20117188, + "router_z_loss_mlp": 0.36474609, + "step": 2916, + "time_per_iteration": 2.681788921356201 + }, + { + "auxiliary_loss_clip": 0.01618591, + "auxiliary_loss_mlp": 0.00408516, + "balance_loss_clip": 1.29518652, + "balance_loss_mlp": 0.37179971, + "epoch": 0.17537952803246656, + "flos": 20668351063680.0, + "grad_norm": 491.97340489743493, + "language_loss": 0.79733551, + "learning_rate": 3.782357703104799e-06, + "loss": 0.81760657, + "num_input_tokens_seen": 63098970, + "router_z_loss_clip": 3.23046875, + "router_z_loss_mlp": 0.36743164, + "step": 2917, + "time_per_iteration": 2.674717426300049 + }, + { + "auxiliary_loss_clip": 0.01589015, + "auxiliary_loss_mlp": 0.00373005, + "balance_loss_clip": 1.27175236, + "balance_loss_mlp": 0.33709908, + "epoch": 0.17543965128513453, + "flos": 23295072839040.0, + "grad_norm": 5.496399629848053, + "language_loss": 0.8346073, + "learning_rate": 3.7821809891911897e-06, + "loss": 0.85422754, + "num_input_tokens_seen": 63118750, + "router_z_loss_clip": 3.17382812, + "router_z_loss_mlp": 0.35888672, + "step": 2918, + "time_per_iteration": 2.602198600769043 + }, + { + "auxiliary_loss_clip": 0.01604583, + "auxiliary_loss_mlp": 0.00388511, + "balance_loss_clip": 1.27981532, + "balance_loss_mlp": 0.35205692, + "epoch": 0.1754997745378025, + "flos": 29095615416960.0, + "grad_norm": 25.443771634742056, + "language_loss": 0.81094688, + "learning_rate": 3.782004207697098e-06, + "loss": 0.83087778, + "num_input_tokens_seen": 63136865, + "router_z_loss_clip": 3.24804688, + "router_z_loss_mlp": 0.36450195, + "step": 2919, + "time_per_iteration": 2.681159257888794 + }, + { + "auxiliary_loss_clip": 0.01605154, + "auxiliary_loss_mlp": 0.0043776, + "balance_loss_clip": 1.28187251, + "balance_loss_mlp": 0.40037608, + "epoch": 0.17555989779047046, + "flos": 30371844620160.0, + "grad_norm": 21.530003988813743, + "language_loss": 0.79337358, + "learning_rate": 3.781827358629228e-06, + "loss": 0.81380284, + "num_input_tokens_seen": 63158325, + "router_z_loss_clip": 3.23242188, + "router_z_loss_mlp": 0.3737793, + "step": 2920, + "time_per_iteration": 2.659468650817871 + }, + { + "auxiliary_loss_clip": 0.01602124, + "auxiliary_loss_mlp": 0.00417619, + "balance_loss_clip": 1.28167009, + "balance_loss_mlp": 0.38185662, + "epoch": 0.17562002104313842, + "flos": 23287746464640.0, + "grad_norm": 75.49101778596429, + "language_loss": 0.86763257, + "learning_rate": 3.7816504419942873e-06, + "loss": 0.88783002, + "num_input_tokens_seen": 63173115, + "router_z_loss_clip": 3.20898438, + "router_z_loss_mlp": 0.35766602, + "step": 2921, + "time_per_iteration": 2.675539970397949 + }, + { + "auxiliary_loss_clip": 0.01608216, + "auxiliary_loss_mlp": 0.00463809, + "balance_loss_clip": 1.28321838, + "balance_loss_mlp": 0.42568564, + "epoch": 0.1756801442958064, + "flos": 24790500789120.0, + "grad_norm": 26.40520267773833, + "language_loss": 0.92113149, + "learning_rate": 3.7814734577989823e-06, + "loss": 0.94185174, + "num_input_tokens_seen": 63192880, + "router_z_loss_clip": 3.25195312, + "router_z_loss_mlp": 0.38110352, + "step": 2922, + "time_per_iteration": 2.6112544536590576 + }, + { + "auxiliary_loss_clip": 0.01610331, + "auxiliary_loss_mlp": 0.00428057, + "balance_loss_clip": 1.28202629, + "balance_loss_mlp": 0.38878909, + "epoch": 0.17574026754847438, + "flos": 25771651764480.0, + "grad_norm": 7.002267340918597, + "language_loss": 0.73321843, + "learning_rate": 3.7812964060500253e-06, + "loss": 0.75360233, + "num_input_tokens_seen": 63214395, + "router_z_loss_clip": 3.28320312, + "router_z_loss_mlp": 0.39282227, + "step": 2923, + "time_per_iteration": 2.6469991207122803 + }, + { + "auxiliary_loss_clip": 0.0163022, + "auxiliary_loss_mlp": 0.00443538, + "balance_loss_clip": 1.30487692, + "balance_loss_mlp": 0.40353128, + "epoch": 0.17580039080114235, + "flos": 17456608477440.0, + "grad_norm": 9.667956862944681, + "language_loss": 0.89992672, + "learning_rate": 3.78111928675413e-06, + "loss": 0.92066431, + "num_input_tokens_seen": 63231020, + "router_z_loss_clip": 3.25195312, + "router_z_loss_mlp": 0.40014648, + "step": 2924, + "time_per_iteration": 2.554562568664551 + }, + { + "auxiliary_loss_clip": 0.01632631, + "auxiliary_loss_mlp": 0.00442953, + "balance_loss_clip": 1.30426228, + "balance_loss_mlp": 0.40337533, + "epoch": 0.1758605140538103, + "flos": 14864648088960.0, + "grad_norm": 21.73680363197503, + "language_loss": 0.79385144, + "learning_rate": 3.7809420999180126e-06, + "loss": 0.81460726, + "num_input_tokens_seen": 63246245, + "router_z_loss_clip": 3.27929688, + "router_z_loss_mlp": 0.39550781, + "step": 2925, + "time_per_iteration": 2.560873508453369 + }, + { + "auxiliary_loss_clip": 0.01628887, + "auxiliary_loss_mlp": 0.00437879, + "balance_loss_clip": 1.30445957, + "balance_loss_mlp": 0.40028062, + "epoch": 0.17592063730647828, + "flos": 23004268329600.0, + "grad_norm": 2.4194001496581854, + "language_loss": 0.74484652, + "learning_rate": 3.7807648455483934e-06, + "loss": 0.76551414, + "num_input_tokens_seen": 63267790, + "router_z_loss_clip": 3.24414062, + "router_z_loss_mlp": 0.37548828, + "step": 2926, + "time_per_iteration": 2.603431224822998 + }, + { + "auxiliary_loss_clip": 0.01632783, + "auxiliary_loss_mlp": 0.00442581, + "balance_loss_clip": 1.29832661, + "balance_loss_mlp": 0.40111953, + "epoch": 0.17598076055914624, + "flos": 20741501111040.0, + "grad_norm": 146.88174429117282, + "language_loss": 0.93348622, + "learning_rate": 3.7805875236519918e-06, + "loss": 0.95423979, + "num_input_tokens_seen": 63286830, + "router_z_loss_clip": 3.34570312, + "router_z_loss_mlp": 0.41430664, + "step": 2927, + "time_per_iteration": 2.6514618396759033 + }, + { + "auxiliary_loss_clip": 0.01641799, + "auxiliary_loss_mlp": 0.00390354, + "balance_loss_clip": 1.31493592, + "balance_loss_mlp": 0.35656977, + "epoch": 0.1760408838118142, + "flos": 34092441227520.0, + "grad_norm": 21.36636927406695, + "language_loss": 0.76367545, + "learning_rate": 3.7804101342355336e-06, + "loss": 0.78399694, + "num_input_tokens_seen": 63308870, + "router_z_loss_clip": 3.265625, + "router_z_loss_mlp": 0.33789062, + "step": 2928, + "time_per_iteration": 2.792037010192871 + }, + { + "auxiliary_loss_clip": 0.01647981, + "auxiliary_loss_mlp": 0.004187, + "balance_loss_clip": 1.31929374, + "balance_loss_mlp": 0.3821741, + "epoch": 0.1761010070644822, + "flos": 24168384207360.0, + "grad_norm": 25.618610409814156, + "language_loss": 0.88052809, + "learning_rate": 3.780232677305744e-06, + "loss": 0.90119493, + "num_input_tokens_seen": 63329005, + "router_z_loss_clip": 3.29101562, + "router_z_loss_mlp": 0.36499023, + "step": 2929, + "time_per_iteration": 2.637108564376831 + }, + { + "auxiliary_loss_clip": 0.01657872, + "auxiliary_loss_mlp": 0.00438965, + "balance_loss_clip": 1.32631516, + "balance_loss_mlp": 0.40167686, + "epoch": 0.17616113031715017, + "flos": 26576697335040.0, + "grad_norm": 38.91506179530911, + "language_loss": 0.86022699, + "learning_rate": 3.7800551528693535e-06, + "loss": 0.88119531, + "num_input_tokens_seen": 63349390, + "router_z_loss_clip": 3.31445312, + "router_z_loss_mlp": 0.37304688, + "step": 2930, + "time_per_iteration": 4.185786724090576 + }, + { + "auxiliary_loss_clip": 0.01645231, + "auxiliary_loss_mlp": 0.00402437, + "balance_loss_clip": 1.31648839, + "balance_loss_mlp": 0.36603099, + "epoch": 0.17622125356981813, + "flos": 25666685245440.0, + "grad_norm": 13.05541651622911, + "language_loss": 0.84028715, + "learning_rate": 3.7798775609330927e-06, + "loss": 0.86076379, + "num_input_tokens_seen": 63368835, + "router_z_loss_clip": 3.29101562, + "router_z_loss_mlp": 0.36425781, + "step": 2931, + "time_per_iteration": 2.6340599060058594 + }, + { + "auxiliary_loss_clip": 0.01674262, + "auxiliary_loss_mlp": 0.00403149, + "balance_loss_clip": 1.34208357, + "balance_loss_mlp": 0.37008068, + "epoch": 0.1762813768224861, + "flos": 16508530949760.0, + "grad_norm": 5.4447707557350835, + "language_loss": 0.83436322, + "learning_rate": 3.779699901503696e-06, + "loss": 0.85513735, + "num_input_tokens_seen": 63385220, + "router_z_loss_clip": 3.32226562, + "router_z_loss_mlp": 0.33056641, + "step": 2932, + "time_per_iteration": 2.669804334640503 + }, + { + "auxiliary_loss_clip": 0.01650202, + "auxiliary_loss_mlp": 0.00442855, + "balance_loss_clip": 1.31151271, + "balance_loss_mlp": 0.40451756, + "epoch": 0.17634150007515406, + "flos": 11211850402560.0, + "grad_norm": 6.97909541168818, + "language_loss": 0.96766657, + "learning_rate": 3.7795221745879016e-06, + "loss": 0.98859715, + "num_input_tokens_seen": 63400865, + "router_z_loss_clip": 3.38671875, + "router_z_loss_mlp": 0.38330078, + "step": 2933, + "time_per_iteration": 2.6442039012908936 + }, + { + "auxiliary_loss_clip": 0.01676802, + "auxiliary_loss_mlp": 0.00445503, + "balance_loss_clip": 1.34833312, + "balance_loss_mlp": 0.4096688, + "epoch": 0.17640162332782203, + "flos": 23659925235840.0, + "grad_norm": 22.062400279232012, + "language_loss": 0.92080224, + "learning_rate": 3.779344380192448e-06, + "loss": 0.9420253, + "num_input_tokens_seen": 63421390, + "router_z_loss_clip": 3.28515625, + "router_z_loss_mlp": 0.3581543, + "step": 2934, + "time_per_iteration": 4.027655601501465 + }, + { + "auxiliary_loss_clip": 0.01676797, + "auxiliary_loss_mlp": 0.00419326, + "balance_loss_clip": 1.34497797, + "balance_loss_mlp": 0.38294357, + "epoch": 0.17646174658049, + "flos": 53796984606720.0, + "grad_norm": 8.430966167994338, + "language_loss": 0.76824796, + "learning_rate": 3.779166518324077e-06, + "loss": 0.78920925, + "num_input_tokens_seen": 63444715, + "router_z_loss_clip": 3.31445312, + "router_z_loss_mlp": 0.36401367, + "step": 2935, + "time_per_iteration": 2.8843462467193604 + }, + { + "auxiliary_loss_clip": 0.0167338, + "auxiliary_loss_mlp": 0.00418557, + "balance_loss_clip": 1.33619153, + "balance_loss_mlp": 0.38312793, + "epoch": 0.17652186983315798, + "flos": 24243868638720.0, + "grad_norm": 215.89171324135654, + "language_loss": 0.7804361, + "learning_rate": 3.7789885889895325e-06, + "loss": 0.80135548, + "num_input_tokens_seen": 63465525, + "router_z_loss_clip": 3.37109375, + "router_z_loss_mlp": 0.35449219, + "step": 2936, + "time_per_iteration": 2.638343095779419 + }, + { + "auxiliary_loss_clip": 0.01672947, + "auxiliary_loss_mlp": 0.00422136, + "balance_loss_clip": 1.34283781, + "balance_loss_mlp": 0.38665906, + "epoch": 0.17658199308582595, + "flos": 27454282421760.0, + "grad_norm": 5.151603016907603, + "language_loss": 0.78139842, + "learning_rate": 3.7788105921955634e-06, + "loss": 0.80234921, + "num_input_tokens_seen": 63485815, + "router_z_loss_clip": 3.30078125, + "router_z_loss_mlp": 0.35498047, + "step": 2937, + "time_per_iteration": 4.051487684249878 + }, + { + "auxiliary_loss_clip": 0.01677436, + "auxiliary_loss_mlp": 0.00401206, + "balance_loss_clip": 1.34153032, + "balance_loss_mlp": 0.36513281, + "epoch": 0.17664211633849392, + "flos": 22418672901120.0, + "grad_norm": 113.96221877319367, + "language_loss": 0.84845877, + "learning_rate": 3.7786325279489184e-06, + "loss": 0.86924517, + "num_input_tokens_seen": 63503905, + "router_z_loss_clip": 3.359375, + "router_z_loss_mlp": 0.36083984, + "step": 2938, + "time_per_iteration": 2.638329267501831 + }, + { + "auxiliary_loss_clip": 0.01692726, + "auxiliary_loss_mlp": 0.00382023, + "balance_loss_clip": 1.35435128, + "balance_loss_mlp": 0.34547395, + "epoch": 0.17670223959116188, + "flos": 24715124098560.0, + "grad_norm": 9.834567763018205, + "language_loss": 0.80581206, + "learning_rate": 3.7784543962563495e-06, + "loss": 0.82655954, + "num_input_tokens_seen": 63521985, + "router_z_loss_clip": 3.3828125, + "router_z_loss_mlp": 0.36523438, + "step": 2939, + "time_per_iteration": 2.5841493606567383 + }, + { + "auxiliary_loss_clip": 0.0169262, + "auxiliary_loss_mlp": 0.00374332, + "balance_loss_clip": 1.36250257, + "balance_loss_mlp": 0.34212148, + "epoch": 0.17676236284382985, + "flos": 22527051212160.0, + "grad_norm": 18.31124762552899, + "language_loss": 0.83250117, + "learning_rate": 3.7782761971246115e-06, + "loss": 0.85317075, + "num_input_tokens_seen": 63539830, + "router_z_loss_clip": 3.29882812, + "router_z_loss_mlp": 0.3215332, + "step": 2940, + "time_per_iteration": 2.6149258613586426 + }, + { + "auxiliary_loss_clip": 0.01696319, + "auxiliary_loss_mlp": 0.00385421, + "balance_loss_clip": 1.36533344, + "balance_loss_mlp": 0.35225686, + "epoch": 0.1768224860964978, + "flos": 12385160161920.0, + "grad_norm": 86.14131534271506, + "language_loss": 0.96077275, + "learning_rate": 3.7780979305604616e-06, + "loss": 0.98159015, + "num_input_tokens_seen": 63555495, + "router_z_loss_clip": 3.31445312, + "router_z_loss_mlp": 0.33178711, + "step": 2941, + "time_per_iteration": 2.5285208225250244 + }, + { + "auxiliary_loss_clip": 0.01680912, + "auxiliary_loss_mlp": 0.00368907, + "balance_loss_clip": 1.34848094, + "balance_loss_mlp": 0.33238173, + "epoch": 0.1768826093491658, + "flos": 24353360271360.0, + "grad_norm": 67.75715478845214, + "language_loss": 0.83356899, + "learning_rate": 3.7779195965706607e-06, + "loss": 0.85406715, + "num_input_tokens_seen": 63575290, + "router_z_loss_clip": 3.32617188, + "router_z_loss_mlp": 0.36572266, + "step": 2942, + "time_per_iteration": 2.63105845451355 + }, + { + "auxiliary_loss_clip": 0.01695853, + "auxiliary_loss_mlp": 0.00364749, + "balance_loss_clip": 1.36694086, + "balance_loss_mlp": 0.32853323, + "epoch": 0.17694273260183377, + "flos": 23587062497280.0, + "grad_norm": 11.366194466393583, + "language_loss": 0.8774178, + "learning_rate": 3.77774119516197e-06, + "loss": 0.89802384, + "num_input_tokens_seen": 63594670, + "router_z_loss_clip": 3.29101562, + "router_z_loss_mlp": 0.36206055, + "step": 2943, + "time_per_iteration": 2.6408491134643555 + }, + { + "auxiliary_loss_clip": 0.01697855, + "auxiliary_loss_mlp": 0.00365484, + "balance_loss_clip": 1.36424184, + "balance_loss_mlp": 0.32862401, + "epoch": 0.17700285585450173, + "flos": 26760991040640.0, + "grad_norm": 274.3347478158071, + "language_loss": 0.87030655, + "learning_rate": 3.777562726341155e-06, + "loss": 0.89093995, + "num_input_tokens_seen": 63614780, + "router_z_loss_clip": 3.33398438, + "router_z_loss_mlp": 0.36816406, + "step": 2944, + "time_per_iteration": 2.624326467514038 + }, + { + "auxiliary_loss_clip": 0.01704861, + "auxiliary_loss_mlp": 0.00361897, + "balance_loss_clip": 1.37440419, + "balance_loss_mlp": 0.32696825, + "epoch": 0.1770629791071697, + "flos": 42776323320960.0, + "grad_norm": 2.5124722969935656, + "language_loss": 0.79054618, + "learning_rate": 3.7773841901149835e-06, + "loss": 0.81121373, + "num_input_tokens_seen": 63637190, + "router_z_loss_clip": 3.3046875, + "router_z_loss_mlp": 0.34936523, + "step": 2945, + "time_per_iteration": 2.759526014328003 + }, + { + "auxiliary_loss_clip": 0.01703649, + "auxiliary_loss_mlp": 0.00344105, + "balance_loss_clip": 1.37417841, + "balance_loss_mlp": 0.31151336, + "epoch": 0.17712310235983766, + "flos": 17345572560000.0, + "grad_norm": 36.61454995864865, + "language_loss": 0.87209976, + "learning_rate": 3.7772055864902256e-06, + "loss": 0.89257729, + "num_input_tokens_seen": 63652140, + "router_z_loss_clip": 3.29101562, + "router_z_loss_mlp": 0.32543945, + "step": 2946, + "time_per_iteration": 2.5537919998168945 + }, + { + "auxiliary_loss_clip": 0.01707089, + "auxiliary_loss_mlp": 0.00331169, + "balance_loss_clip": 1.38010812, + "balance_loss_mlp": 0.29788613, + "epoch": 0.17718322561250563, + "flos": 23878477537920.0, + "grad_norm": 10.783045203490603, + "language_loss": 0.83512717, + "learning_rate": 3.7770269154736535e-06, + "loss": 0.85550976, + "num_input_tokens_seen": 63671700, + "router_z_loss_clip": 3.26757812, + "router_z_loss_mlp": 0.33276367, + "step": 2947, + "time_per_iteration": 2.5929367542266846 + }, + { + "auxiliary_loss_clip": 0.01727974, + "auxiliary_loss_mlp": 0.00353401, + "balance_loss_clip": 1.40152514, + "balance_loss_mlp": 0.31697044, + "epoch": 0.1772433488651736, + "flos": 36466352104320.0, + "grad_norm": 54.89672201836535, + "language_loss": 0.78584141, + "learning_rate": 3.7768481770720424e-06, + "loss": 0.80665517, + "num_input_tokens_seen": 63691685, + "router_z_loss_clip": 3.26757812, + "router_z_loss_mlp": 0.36450195, + "step": 2948, + "time_per_iteration": 2.7245473861694336 + }, + { + "auxiliary_loss_clip": 0.01719247, + "auxiliary_loss_mlp": 0.00331097, + "balance_loss_clip": 1.39323151, + "balance_loss_mlp": 0.29812342, + "epoch": 0.1773034721178416, + "flos": 26684716510080.0, + "grad_norm": 11.237604013678606, + "language_loss": 0.86718899, + "learning_rate": 3.776669371292171e-06, + "loss": 0.88769239, + "num_input_tokens_seen": 63711720, + "router_z_loss_clip": 3.25976562, + "router_z_loss_mlp": 0.32958984, + "step": 2949, + "time_per_iteration": 2.6410107612609863 + }, + { + "auxiliary_loss_clip": 0.01627314, + "auxiliary_loss_mlp": 0.00149781, + "balance_loss_clip": 1.39291215, + "balance_loss_mlp": 0.13847953, + "epoch": 0.17736359537050955, + "flos": 57117467617920.0, + "grad_norm": 0.7861432250145216, + "language_loss": 0.6513325, + "learning_rate": 3.7764904981408186e-06, + "loss": 0.66910338, + "num_input_tokens_seen": 63776280, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.11279297, + "step": 2950, + "time_per_iteration": 3.1811716556549072 + }, + { + "auxiliary_loss_clip": 0.01736788, + "auxiliary_loss_mlp": 0.0034283, + "balance_loss_clip": 1.41344702, + "balance_loss_mlp": 0.30747265, + "epoch": 0.17742371862317752, + "flos": 27198203385600.0, + "grad_norm": 10.759356774054934, + "language_loss": 0.89189374, + "learning_rate": 3.7763115576247686e-06, + "loss": 0.91268992, + "num_input_tokens_seen": 63797535, + "router_z_loss_clip": 3.24023438, + "router_z_loss_mlp": 0.35375977, + "step": 2951, + "time_per_iteration": 2.6922221183776855 + }, + { + "auxiliary_loss_clip": 0.01729113, + "auxiliary_loss_mlp": 0.00321605, + "balance_loss_clip": 1.39563727, + "balance_loss_mlp": 0.2859613, + "epoch": 0.17748384187584548, + "flos": 20959694277120.0, + "grad_norm": 1011.9644537042604, + "language_loss": 0.86944306, + "learning_rate": 3.776132549750806e-06, + "loss": 0.88995028, + "num_input_tokens_seen": 63817045, + "router_z_loss_clip": 3.33203125, + "router_z_loss_mlp": 0.35644531, + "step": 2952, + "time_per_iteration": 2.5570600032806396 + }, + { + "auxiliary_loss_clip": 0.01738004, + "auxiliary_loss_mlp": 0.00289055, + "balance_loss_clip": 1.41097569, + "balance_loss_mlp": 0.25469866, + "epoch": 0.17754396512851345, + "flos": 25009986844800.0, + "grad_norm": 12.462222984757997, + "language_loss": 0.87606007, + "learning_rate": 3.7759534745257194e-06, + "loss": 0.8963306, + "num_input_tokens_seen": 63837665, + "router_z_loss_clip": 3.26757812, + "router_z_loss_mlp": 0.34375, + "step": 2953, + "time_per_iteration": 2.639462471008301 + }, + { + "auxiliary_loss_clip": 0.01735225, + "auxiliary_loss_mlp": 0.00328721, + "balance_loss_clip": 1.39782691, + "balance_loss_mlp": 0.29305398, + "epoch": 0.1776040883811814, + "flos": 32051566275840.0, + "grad_norm": 3.371151765940661, + "language_loss": 0.93892044, + "learning_rate": 3.7757743319562994e-06, + "loss": 0.95955998, + "num_input_tokens_seen": 63858455, + "router_z_loss_clip": 3.375, + "router_z_loss_mlp": 0.35668945, + "step": 2954, + "time_per_iteration": 2.7158682346343994 + }, + { + "auxiliary_loss_clip": 0.01729755, + "auxiliary_loss_mlp": 0.00333638, + "balance_loss_clip": 1.40050173, + "balance_loss_mlp": 0.29983044, + "epoch": 0.17766421163384938, + "flos": 21574125348480.0, + "grad_norm": 8.437973349051529, + "language_loss": 0.89335573, + "learning_rate": 3.7755951220493386e-06, + "loss": 0.91398966, + "num_input_tokens_seen": 63876935, + "router_z_loss_clip": 3.29296875, + "router_z_loss_mlp": 0.33813477, + "step": 2955, + "time_per_iteration": 2.6209347248077393 + }, + { + "auxiliary_loss_clip": 0.01737991, + "auxiliary_loss_mlp": 0.00319681, + "balance_loss_clip": 1.40009212, + "balance_loss_mlp": 0.2837511, + "epoch": 0.17772433488651737, + "flos": 22419319345920.0, + "grad_norm": 33.24802141760314, + "language_loss": 0.78516418, + "learning_rate": 3.7754158448116327e-06, + "loss": 0.80574095, + "num_input_tokens_seen": 63896815, + "router_z_loss_clip": 3.375, + "router_z_loss_mlp": 0.35913086, + "step": 2956, + "time_per_iteration": 2.6015970706939697 + }, + { + "auxiliary_loss_clip": 0.01744026, + "auxiliary_loss_mlp": 0.00307125, + "balance_loss_clip": 1.41112041, + "balance_loss_mlp": 0.27195811, + "epoch": 0.17778445813918534, + "flos": 25629445820160.0, + "grad_norm": 4.115659050266342, + "language_loss": 0.89783955, + "learning_rate": 3.7752365002499795e-06, + "loss": 0.91835111, + "num_input_tokens_seen": 63916140, + "router_z_loss_clip": 3.33007812, + "router_z_loss_mlp": 0.35180664, + "step": 2957, + "time_per_iteration": 2.655021905899048 + }, + { + "auxiliary_loss_clip": 0.01738989, + "auxiliary_loss_mlp": 0.0031842, + "balance_loss_clip": 1.40319526, + "balance_loss_mlp": 0.2835632, + "epoch": 0.1778445813918533, + "flos": 25628871202560.0, + "grad_norm": 28.016264880930855, + "language_loss": 0.81137729, + "learning_rate": 3.7750570883711807e-06, + "loss": 0.83195144, + "num_input_tokens_seen": 63935220, + "router_z_loss_clip": 3.35546875, + "router_z_loss_mlp": 0.34838867, + "step": 2958, + "time_per_iteration": 2.6327555179595947 + }, + { + "auxiliary_loss_clip": 0.01755487, + "auxiliary_loss_mlp": 0.00323676, + "balance_loss_clip": 1.41350555, + "balance_loss_mlp": 0.2887243, + "epoch": 0.17790470464452127, + "flos": 22345522853760.0, + "grad_norm": 66.3605069460464, + "language_loss": 0.87486207, + "learning_rate": 3.7748776091820397e-06, + "loss": 0.89565372, + "num_input_tokens_seen": 63954550, + "router_z_loss_clip": 3.41796875, + "router_z_loss_mlp": 0.34985352, + "step": 2959, + "time_per_iteration": 2.6341238021850586 + }, + { + "auxiliary_loss_clip": 0.01740333, + "auxiliary_loss_mlp": 0.00349226, + "balance_loss_clip": 1.39871073, + "balance_loss_mlp": 0.31122184, + "epoch": 0.17796482789718923, + "flos": 18765875214720.0, + "grad_norm": 19.947908678351737, + "language_loss": 0.61702979, + "learning_rate": 3.774698062689362e-06, + "loss": 0.63792551, + "num_input_tokens_seen": 63972425, + "router_z_loss_clip": 3.41601562, + "router_z_loss_mlp": 0.38012695, + "step": 2960, + "time_per_iteration": 2.6058034896850586 + }, + { + "auxiliary_loss_clip": 0.01751233, + "auxiliary_loss_mlp": 0.00314273, + "balance_loss_clip": 1.41222453, + "balance_loss_mlp": 0.27579197, + "epoch": 0.1780249511498572, + "flos": 23440941970560.0, + "grad_norm": 14.328636854777201, + "language_loss": 0.94532275, + "learning_rate": 3.7745184488999548e-06, + "loss": 0.96597779, + "num_input_tokens_seen": 63992165, + "router_z_loss_clip": 3.39453125, + "router_z_loss_mlp": 0.38500977, + "step": 2961, + "time_per_iteration": 2.641568899154663 + }, + { + "auxiliary_loss_clip": 0.01732506, + "auxiliary_loss_mlp": 0.00317439, + "balance_loss_clip": 1.3925246, + "balance_loss_mlp": 0.27848125, + "epoch": 0.1780850744025252, + "flos": 23367468700800.0, + "grad_norm": 9.9922042343844, + "language_loss": 0.84823644, + "learning_rate": 3.774338767820631e-06, + "loss": 0.86873591, + "num_input_tokens_seen": 64013470, + "router_z_loss_clip": 3.39648438, + "router_z_loss_mlp": 0.38989258, + "step": 2962, + "time_per_iteration": 2.59748911857605 + }, + { + "auxiliary_loss_clip": 0.01743214, + "auxiliary_loss_mlp": 0.00329258, + "balance_loss_clip": 1.40280378, + "balance_loss_mlp": 0.29287505, + "epoch": 0.17814519765519315, + "flos": 13771994319360.0, + "grad_norm": 3.4282332768667527, + "language_loss": 0.81049562, + "learning_rate": 3.774159019458203e-06, + "loss": 0.83122039, + "num_input_tokens_seen": 64030975, + "router_z_loss_clip": 3.40625, + "router_z_loss_mlp": 0.36352539, + "step": 2963, + "time_per_iteration": 2.6482770442962646 + }, + { + "auxiliary_loss_clip": 0.01749487, + "auxiliary_loss_mlp": 0.00338113, + "balance_loss_clip": 1.40777266, + "balance_loss_mlp": 0.29910779, + "epoch": 0.17820532090786112, + "flos": 21976396738560.0, + "grad_norm": 2.897777777567761, + "language_loss": 0.84387374, + "learning_rate": 3.7739792038194877e-06, + "loss": 0.86474967, + "num_input_tokens_seen": 64050075, + "router_z_loss_clip": 3.41601562, + "router_z_loss_mlp": 0.39013672, + "step": 2964, + "time_per_iteration": 2.578401565551758 + }, + { + "auxiliary_loss_clip": 0.01729338, + "auxiliary_loss_mlp": 0.00321472, + "balance_loss_clip": 1.38907647, + "balance_loss_mlp": 0.28415996, + "epoch": 0.17826544416052909, + "flos": 24790752184320.0, + "grad_norm": 2.1681078261603846, + "language_loss": 0.86494362, + "learning_rate": 3.7737993209113027e-06, + "loss": 0.88545173, + "num_input_tokens_seen": 64071920, + "router_z_loss_clip": 3.40234375, + "router_z_loss_mlp": 0.37329102, + "step": 2965, + "time_per_iteration": 2.6014368534088135 + }, + { + "auxiliary_loss_clip": 0.01738834, + "auxiliary_loss_mlp": 0.00321579, + "balance_loss_clip": 1.39769292, + "balance_loss_mlp": 0.2841236, + "epoch": 0.17832556741319705, + "flos": 13879582531200.0, + "grad_norm": 2.477764050587281, + "language_loss": 1.02825379, + "learning_rate": 3.7736193707404698e-06, + "loss": 1.04885793, + "num_input_tokens_seen": 64086835, + "router_z_loss_clip": 3.41210938, + "router_z_loss_mlp": 0.37451172, + "step": 2966, + "time_per_iteration": 2.59429669380188 + }, + { + "auxiliary_loss_clip": 0.01732891, + "auxiliary_loss_mlp": 0.00327143, + "balance_loss_clip": 1.39397597, + "balance_loss_mlp": 0.28723198, + "epoch": 0.17838569066586502, + "flos": 36641703323520.0, + "grad_norm": 12.094449908993102, + "language_loss": 0.79137981, + "learning_rate": 3.7734393533138127e-06, + "loss": 0.81198013, + "num_input_tokens_seen": 64107360, + "router_z_loss_clip": 3.38867188, + "router_z_loss_mlp": 0.39916992, + "step": 2967, + "time_per_iteration": 2.6967883110046387 + }, + { + "auxiliary_loss_clip": 0.01745885, + "auxiliary_loss_mlp": 0.00304427, + "balance_loss_clip": 1.40467668, + "balance_loss_mlp": 0.26873618, + "epoch": 0.17844581391853298, + "flos": 18727271072640.0, + "grad_norm": 5.043658559497623, + "language_loss": 0.83195406, + "learning_rate": 3.773259268638157e-06, + "loss": 0.85245723, + "num_input_tokens_seen": 64124690, + "router_z_loss_clip": 3.41015625, + "router_z_loss_mlp": 0.35717773, + "step": 2968, + "time_per_iteration": 2.6440539360046387 + }, + { + "auxiliary_loss_clip": 0.0173193, + "auxiliary_loss_mlp": 0.00319039, + "balance_loss_clip": 1.3951993, + "balance_loss_mlp": 0.28144023, + "epoch": 0.17850593717120097, + "flos": 27378259286400.0, + "grad_norm": 3.989225412152135, + "language_loss": 0.81231725, + "learning_rate": 3.7730791167203333e-06, + "loss": 0.83282691, + "num_input_tokens_seen": 64146315, + "router_z_loss_clip": 3.3671875, + "router_z_loss_mlp": 0.37597656, + "step": 2969, + "time_per_iteration": 2.6635966300964355 + }, + { + "auxiliary_loss_clip": 0.01582409, + "auxiliary_loss_mlp": 0.00183316, + "balance_loss_clip": 1.34134841, + "balance_loss_mlp": 0.17091796, + "epoch": 0.17856606042386894, + "flos": 66996025084800.0, + "grad_norm": 0.8162667103462051, + "language_loss": 0.69044936, + "learning_rate": 3.772898897567171e-06, + "loss": 0.70810652, + "num_input_tokens_seen": 64210875, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.12402344, + "step": 2970, + "time_per_iteration": 3.1826894283294678 + }, + { + "auxiliary_loss_clip": 0.01709007, + "auxiliary_loss_mlp": 0.00326818, + "balance_loss_clip": 1.37270594, + "balance_loss_mlp": 0.29405957, + "epoch": 0.1786261836765369, + "flos": 36977001805440.0, + "grad_norm": 12.770457510303977, + "language_loss": 0.75235891, + "learning_rate": 3.772718611185505e-06, + "loss": 0.77271712, + "num_input_tokens_seen": 64230740, + "router_z_loss_clip": 3.36132812, + "router_z_loss_mlp": 0.32788086, + "step": 2971, + "time_per_iteration": 2.7497899532318115 + }, + { + "auxiliary_loss_clip": 0.01714477, + "auxiliary_loss_mlp": 0.00319479, + "balance_loss_clip": 1.37695849, + "balance_loss_mlp": 0.28452706, + "epoch": 0.17868630692920487, + "flos": 24825441744000.0, + "grad_norm": 13.889424513544334, + "language_loss": 0.94690609, + "learning_rate": 3.7725382575821717e-06, + "loss": 0.9672457, + "num_input_tokens_seen": 64252300, + "router_z_loss_clip": 3.37695312, + "router_z_loss_mlp": 0.34912109, + "step": 2972, + "time_per_iteration": 2.690767765045166 + }, + { + "auxiliary_loss_clip": 0.01722765, + "auxiliary_loss_mlp": 0.00328235, + "balance_loss_clip": 1.38340521, + "balance_loss_mlp": 0.29087442, + "epoch": 0.17874643018187283, + "flos": 16981977139200.0, + "grad_norm": 8.055810466751607, + "language_loss": 0.95602608, + "learning_rate": 3.77235783676401e-06, + "loss": 0.9765361, + "num_input_tokens_seen": 64270105, + "router_z_loss_clip": 3.39453125, + "router_z_loss_mlp": 0.3737793, + "step": 2973, + "time_per_iteration": 4.099715232849121 + }, + { + "auxiliary_loss_clip": 0.01712391, + "auxiliary_loss_mlp": 0.00335766, + "balance_loss_clip": 1.37017941, + "balance_loss_mlp": 0.29854932, + "epoch": 0.1788065534345408, + "flos": 21032233793280.0, + "grad_norm": 22.90800371097213, + "language_loss": 0.83586377, + "learning_rate": 3.7721773487378615e-06, + "loss": 0.8563453, + "num_input_tokens_seen": 64287250, + "router_z_loss_clip": 3.41992188, + "router_z_loss_mlp": 0.37255859, + "step": 2974, + "time_per_iteration": 2.623253345489502 + }, + { + "auxiliary_loss_clip": 0.01705575, + "auxiliary_loss_mlp": 0.00322176, + "balance_loss_clip": 1.37818599, + "balance_loss_mlp": 0.28665155, + "epoch": 0.17886667668720876, + "flos": 23987717775360.0, + "grad_norm": 213.7282657397749, + "language_loss": 0.83928984, + "learning_rate": 3.7719967935105705e-06, + "loss": 0.8595674, + "num_input_tokens_seen": 64307140, + "router_z_loss_clip": 3.27539062, + "router_z_loss_mlp": 0.35546875, + "step": 2975, + "time_per_iteration": 2.6972732543945312 + }, + { + "auxiliary_loss_clip": 0.0173197, + "auxiliary_loss_mlp": 0.00347597, + "balance_loss_clip": 1.40147626, + "balance_loss_mlp": 0.31200093, + "epoch": 0.17892679993987676, + "flos": 25739476156800.0, + "grad_norm": 12.268018027700547, + "language_loss": 0.77136731, + "learning_rate": 3.7718161710889833e-06, + "loss": 0.79216301, + "num_input_tokens_seen": 64328760, + "router_z_loss_clip": 3.30273438, + "router_z_loss_mlp": 0.35595703, + "step": 2976, + "time_per_iteration": 4.171847343444824 + }, + { + "auxiliary_loss_clip": 0.01731515, + "auxiliary_loss_mlp": 0.00303757, + "balance_loss_clip": 1.39559615, + "balance_loss_mlp": 0.26801774, + "epoch": 0.17898692319254472, + "flos": 25699686865920.0, + "grad_norm": 53.69721796196957, + "language_loss": 0.82290632, + "learning_rate": 3.7716354814799495e-06, + "loss": 0.84325904, + "num_input_tokens_seen": 64348800, + "router_z_loss_clip": 3.359375, + "router_z_loss_mlp": 0.35717773, + "step": 2977, + "time_per_iteration": 2.6970789432525635 + }, + { + "auxiliary_loss_clip": 0.01725318, + "auxiliary_loss_mlp": 0.00305957, + "balance_loss_clip": 1.39877892, + "balance_loss_mlp": 0.27076614, + "epoch": 0.1790470464452127, + "flos": 19317786664320.0, + "grad_norm": 14.271263707593098, + "language_loss": 0.87599075, + "learning_rate": 3.7714547246903203e-06, + "loss": 0.89630353, + "num_input_tokens_seen": 64367955, + "router_z_loss_clip": 3.265625, + "router_z_loss_mlp": 0.35205078, + "step": 2978, + "time_per_iteration": 2.718693256378174 + }, + { + "auxiliary_loss_clip": 0.01736702, + "auxiliary_loss_mlp": 0.003478, + "balance_loss_clip": 1.40664399, + "balance_loss_mlp": 0.31361043, + "epoch": 0.17910716969788065, + "flos": 30044267562240.0, + "grad_norm": 3.96724576160593, + "language_loss": 0.8139168, + "learning_rate": 3.7712739007269508e-06, + "loss": 0.83476174, + "num_input_tokens_seen": 64389805, + "router_z_loss_clip": 3.30078125, + "router_z_loss_mlp": 0.34204102, + "step": 2979, + "time_per_iteration": 4.12359881401062 + }, + { + "auxiliary_loss_clip": 0.01723404, + "auxiliary_loss_mlp": 0.00336503, + "balance_loss_clip": 1.39784646, + "balance_loss_mlp": 0.30073985, + "epoch": 0.17916729295054862, + "flos": 19427709260160.0, + "grad_norm": 87.9157679879234, + "language_loss": 0.74933898, + "learning_rate": 3.7710930095966976e-06, + "loss": 0.76993799, + "num_input_tokens_seen": 64408220, + "router_z_loss_clip": 3.2578125, + "router_z_loss_mlp": 0.35766602, + "step": 2980, + "time_per_iteration": 2.595059394836426 + }, + { + "auxiliary_loss_clip": 0.01700824, + "auxiliary_loss_mlp": 0.00354776, + "balance_loss_clip": 1.37049806, + "balance_loss_mlp": 0.31681943, + "epoch": 0.17922741620321658, + "flos": 14611549881600.0, + "grad_norm": 3.95538549600532, + "language_loss": 0.77905303, + "learning_rate": 3.7709120513064196e-06, + "loss": 0.79960901, + "num_input_tokens_seen": 64426380, + "router_z_loss_clip": 3.30078125, + "router_z_loss_mlp": 0.37963867, + "step": 2981, + "time_per_iteration": 2.579810857772827 + }, + { + "auxiliary_loss_clip": 0.01707679, + "auxiliary_loss_mlp": 0.00329053, + "balance_loss_clip": 1.37563455, + "balance_loss_mlp": 0.29224092, + "epoch": 0.17928753945588458, + "flos": 17165301177600.0, + "grad_norm": 50.23924997308542, + "language_loss": 0.90267283, + "learning_rate": 3.7707310258629796e-06, + "loss": 0.92304015, + "num_input_tokens_seen": 64444355, + "router_z_loss_clip": 3.31640625, + "router_z_loss_mlp": 0.36791992, + "step": 2982, + "time_per_iteration": 2.562204122543335 + }, + { + "auxiliary_loss_clip": 0.01703467, + "auxiliary_loss_mlp": 0.00308121, + "balance_loss_clip": 1.37842298, + "balance_loss_mlp": 0.27550513, + "epoch": 0.17934766270855254, + "flos": 31395622060800.0, + "grad_norm": 19.187454847675095, + "language_loss": 0.88739014, + "learning_rate": 3.7705499332732413e-06, + "loss": 0.90750599, + "num_input_tokens_seen": 64467800, + "router_z_loss_clip": 3.25, + "router_z_loss_mlp": 0.32592773, + "step": 2983, + "time_per_iteration": 2.6947436332702637 + }, + { + "auxiliary_loss_clip": 0.01713181, + "auxiliary_loss_mlp": 0.00342694, + "balance_loss_clip": 1.37918186, + "balance_loss_mlp": 0.30571482, + "epoch": 0.1794077859612205, + "flos": 20814184281600.0, + "grad_norm": 7.513000495462913, + "language_loss": 0.95339924, + "learning_rate": 3.7703687735440718e-06, + "loss": 0.97395802, + "num_input_tokens_seen": 64487230, + "router_z_loss_clip": 3.33984375, + "router_z_loss_mlp": 0.36987305, + "step": 2984, + "time_per_iteration": 2.593282699584961 + }, + { + "auxiliary_loss_clip": 0.01697895, + "auxiliary_loss_mlp": 0.00354002, + "balance_loss_clip": 1.36893857, + "balance_loss_mlp": 0.31773835, + "epoch": 0.17946790921388847, + "flos": 28986447006720.0, + "grad_norm": 328.4096078659858, + "language_loss": 0.9513039, + "learning_rate": 3.7701875466823416e-06, + "loss": 0.97182292, + "num_input_tokens_seen": 64509165, + "router_z_loss_clip": 3.2890625, + "router_z_loss_mlp": 0.36303711, + "step": 2985, + "time_per_iteration": 2.6815977096557617 + }, + { + "auxiliary_loss_clip": 0.01696503, + "auxiliary_loss_mlp": 0.00300552, + "balance_loss_clip": 1.37294292, + "balance_loss_mlp": 0.26769772, + "epoch": 0.17952803246655644, + "flos": 20737406960640.0, + "grad_norm": 12.560622880019952, + "language_loss": 0.77010977, + "learning_rate": 3.770006252694922e-06, + "loss": 0.79008037, + "num_input_tokens_seen": 64527940, + "router_z_loss_clip": 3.23242188, + "router_z_loss_mlp": 0.32861328, + "step": 2986, + "time_per_iteration": 2.567049980163574 + }, + { + "auxiliary_loss_clip": 0.01696403, + "auxiliary_loss_mlp": 0.00321083, + "balance_loss_clip": 1.36649585, + "balance_loss_mlp": 0.28775212, + "epoch": 0.1795881557192244, + "flos": 28255988027520.0, + "grad_norm": 70.71938500588794, + "language_loss": 0.8672213, + "learning_rate": 3.769824891588688e-06, + "loss": 0.88739622, + "num_input_tokens_seen": 64545230, + "router_z_loss_clip": 3.30078125, + "router_z_loss_mlp": 0.33325195, + "step": 2987, + "time_per_iteration": 2.7324111461639404 + }, + { + "auxiliary_loss_clip": 0.01712289, + "auxiliary_loss_mlp": 0.00346523, + "balance_loss_clip": 1.37701678, + "balance_loss_mlp": 0.30873358, + "epoch": 0.17964827897189237, + "flos": 18552027594240.0, + "grad_norm": 8.425555809036089, + "language_loss": 0.84660536, + "learning_rate": 3.7696434633705164e-06, + "loss": 0.86719346, + "num_input_tokens_seen": 64563820, + "router_z_loss_clip": 3.35546875, + "router_z_loss_mlp": 0.37768555, + "step": 2988, + "time_per_iteration": 2.556929588317871 + }, + { + "auxiliary_loss_clip": 0.01489245, + "auxiliary_loss_mlp": 0.00175781, + "balance_loss_clip": 1.2391988, + "balance_loss_mlp": 0.16543382, + "epoch": 0.17970840222456036, + "flos": 58165088711040.0, + "grad_norm": 0.7378195025042584, + "language_loss": 0.624349, + "learning_rate": 3.7694619680472875e-06, + "loss": 0.64099932, + "num_input_tokens_seen": 64621315, + "router_z_loss_clip": 2.5, + "router_z_loss_mlp": 0.10351562, + "step": 2989, + "time_per_iteration": 3.0365383625030518 + }, + { + "auxiliary_loss_clip": 0.0167958, + "auxiliary_loss_mlp": 0.00320419, + "balance_loss_clip": 1.35192096, + "balance_loss_mlp": 0.28721875, + "epoch": 0.17976852547722832, + "flos": 20300805146880.0, + "grad_norm": 11.45524792244918, + "language_loss": 0.78051841, + "learning_rate": 3.7692804056258837e-06, + "loss": 0.80051845, + "num_input_tokens_seen": 64639885, + "router_z_loss_clip": 3.2734375, + "router_z_loss_mlp": 0.33215332, + "step": 2990, + "time_per_iteration": 2.5805442333221436 + }, + { + "auxiliary_loss_clip": 0.01690764, + "auxiliary_loss_mlp": 0.00364355, + "balance_loss_clip": 1.35487235, + "balance_loss_mlp": 0.32880697, + "epoch": 0.1798286487298963, + "flos": 39669367685760.0, + "grad_norm": 224.03424478886822, + "language_loss": 0.75368762, + "learning_rate": 3.7690987761131893e-06, + "loss": 0.77423882, + "num_input_tokens_seen": 64661220, + "router_z_loss_clip": 3.35742188, + "router_z_loss_mlp": 0.35571289, + "step": 2991, + "time_per_iteration": 2.7790701389312744 + }, + { + "auxiliary_loss_clip": 0.01685172, + "auxiliary_loss_mlp": 0.00332463, + "balance_loss_clip": 1.35609412, + "balance_loss_mlp": 0.29569882, + "epoch": 0.17988877198256426, + "flos": 25520313323520.0, + "grad_norm": 78.71793802308387, + "language_loss": 0.87999076, + "learning_rate": 3.7689170795160924e-06, + "loss": 0.90016711, + "num_input_tokens_seen": 64682530, + "router_z_loss_clip": 3.2890625, + "router_z_loss_mlp": 0.36767578, + "step": 2992, + "time_per_iteration": 2.7004621028900146 + }, + { + "auxiliary_loss_clip": 0.01679679, + "auxiliary_loss_mlp": 0.00299524, + "balance_loss_clip": 1.35797906, + "balance_loss_mlp": 0.26507223, + "epoch": 0.17994889523523222, + "flos": 18807496099200.0, + "grad_norm": 8.702798553513622, + "language_loss": 0.88998121, + "learning_rate": 3.7687353158414822e-06, + "loss": 0.90977323, + "num_input_tokens_seen": 64701025, + "router_z_loss_clip": 3.21679688, + "router_z_loss_mlp": 0.34411621, + "step": 2993, + "time_per_iteration": 2.6051950454711914 + }, + { + "auxiliary_loss_clip": 0.01655502, + "auxiliary_loss_mlp": 0.00345757, + "balance_loss_clip": 1.32746911, + "balance_loss_mlp": 0.30930272, + "epoch": 0.18000901848790019, + "flos": 21104450087040.0, + "grad_norm": 5.913629477648111, + "language_loss": 0.84598565, + "learning_rate": 3.7685534850962517e-06, + "loss": 0.86599827, + "num_input_tokens_seen": 64719570, + "router_z_loss_clip": 3.28125, + "router_z_loss_mlp": 0.36450195, + "step": 2994, + "time_per_iteration": 2.609543561935425 + }, + { + "auxiliary_loss_clip": 0.01662038, + "auxiliary_loss_mlp": 0.00315141, + "balance_loss_clip": 1.33400416, + "balance_loss_mlp": 0.28037998, + "epoch": 0.18006914174056818, + "flos": 19646441130240.0, + "grad_norm": 19.75128363653461, + "language_loss": 0.89762259, + "learning_rate": 3.768371587287296e-06, + "loss": 0.9173944, + "num_input_tokens_seen": 64738110, + "router_z_loss_clip": 3.27734375, + "router_z_loss_mlp": 0.34765625, + "step": 2995, + "time_per_iteration": 2.6026792526245117 + }, + { + "auxiliary_loss_clip": 0.01694431, + "auxiliary_loss_mlp": 0.00330936, + "balance_loss_clip": 1.36711013, + "balance_loss_mlp": 0.29627013, + "epoch": 0.18012926499323614, + "flos": 19499889640320.0, + "grad_norm": 2.151278568227547, + "language_loss": 0.89566374, + "learning_rate": 3.768189622421512e-06, + "loss": 0.9159174, + "num_input_tokens_seen": 64756345, + "router_z_loss_clip": 3.27148438, + "router_z_loss_mlp": 0.34667969, + "step": 2996, + "time_per_iteration": 2.5955255031585693 + }, + { + "auxiliary_loss_clip": 0.01673255, + "auxiliary_loss_mlp": 0.00304325, + "balance_loss_clip": 1.3561064, + "balance_loss_mlp": 0.27120847, + "epoch": 0.1801893882459041, + "flos": 19464553635840.0, + "grad_norm": 13.890924310523276, + "language_loss": 0.9211309, + "learning_rate": 3.7680075905058006e-06, + "loss": 0.94090664, + "num_input_tokens_seen": 64776375, + "router_z_loss_clip": 3.17382812, + "router_z_loss_mlp": 0.33105469, + "step": 2997, + "time_per_iteration": 2.6080853939056396 + }, + { + "auxiliary_loss_clip": 0.01685063, + "auxiliary_loss_mlp": 0.00328987, + "balance_loss_clip": 1.35412836, + "balance_loss_mlp": 0.29483312, + "epoch": 0.18024951149857207, + "flos": 26870590414080.0, + "grad_norm": 2.9129340973406768, + "language_loss": 0.92107046, + "learning_rate": 3.7678254915470643e-06, + "loss": 0.94121099, + "num_input_tokens_seen": 64796210, + "router_z_loss_clip": 3.30859375, + "router_z_loss_mlp": 0.3416748, + "step": 2998, + "time_per_iteration": 2.6586430072784424 + }, + { + "auxiliary_loss_clip": 0.01662252, + "auxiliary_loss_mlp": 0.00288985, + "balance_loss_clip": 1.34279728, + "balance_loss_mlp": 0.25708452, + "epoch": 0.18030963475124004, + "flos": 30226621933440.0, + "grad_norm": 44.86701455215213, + "language_loss": 0.91554213, + "learning_rate": 3.7676433255522084e-06, + "loss": 0.93505454, + "num_input_tokens_seen": 64818590, + "router_z_loss_clip": 3.19726562, + "router_z_loss_mlp": 0.3190918, + "step": 2999, + "time_per_iteration": 2.7043211460113525 + }, + { + "auxiliary_loss_clip": 0.01682654, + "auxiliary_loss_mlp": 0.00333092, + "balance_loss_clip": 1.35433412, + "balance_loss_mlp": 0.30047619, + "epoch": 0.180369758003908, + "flos": 22307493329280.0, + "grad_norm": 22.19195405369308, + "language_loss": 0.80710542, + "learning_rate": 3.76746109252814e-06, + "loss": 0.82726288, + "num_input_tokens_seen": 64838350, + "router_z_loss_clip": 3.28125, + "router_z_loss_mlp": 0.32617188, + "step": 3000, + "time_per_iteration": 2.6410958766937256 + }, + { + "auxiliary_loss_clip": 0.0168598, + "auxiliary_loss_mlp": 0.00323721, + "balance_loss_clip": 1.3685329, + "balance_loss_mlp": 0.28948399, + "epoch": 0.18042988125657597, + "flos": 23732033788800.0, + "grad_norm": 3.6694151905245174, + "language_loss": 0.76644146, + "learning_rate": 3.76727879248177e-06, + "loss": 0.78653854, + "num_input_tokens_seen": 64858065, + "router_z_loss_clip": 3.17578125, + "router_z_loss_mlp": 0.34228516, + "step": 3001, + "time_per_iteration": 2.6299209594726562 + }, + { + "auxiliary_loss_clip": 0.01665649, + "auxiliary_loss_mlp": 0.00301173, + "balance_loss_clip": 1.34528089, + "balance_loss_mlp": 0.26824784, + "epoch": 0.18049000450924396, + "flos": 24093582134400.0, + "grad_norm": 2006.06929397038, + "language_loss": 0.94246328, + "learning_rate": 3.767096425420011e-06, + "loss": 0.9621315, + "num_input_tokens_seen": 64877305, + "router_z_loss_clip": 3.20117188, + "router_z_loss_mlp": 0.3293457, + "step": 3002, + "time_per_iteration": 2.676598310470581 + }, + { + "auxiliary_loss_clip": 0.01698899, + "auxiliary_loss_mlp": 0.00302597, + "balance_loss_clip": 1.37676084, + "balance_loss_mlp": 0.270291, + "epoch": 0.18055012776191193, + "flos": 22163168482560.0, + "grad_norm": 27.942912998788085, + "language_loss": 0.89463246, + "learning_rate": 3.7669139913497788e-06, + "loss": 0.91464746, + "num_input_tokens_seen": 64896955, + "router_z_loss_clip": 3.22265625, + "router_z_loss_mlp": 0.32299805, + "step": 3003, + "time_per_iteration": 2.592668294906616 + }, + { + "auxiliary_loss_clip": 0.01676738, + "auxiliary_loss_mlp": 0.0030622, + "balance_loss_clip": 1.35590661, + "balance_loss_mlp": 0.27238861, + "epoch": 0.1806102510145799, + "flos": 28913512440960.0, + "grad_norm": 11.006163318859617, + "language_loss": 0.7503584, + "learning_rate": 3.7667314902779907e-06, + "loss": 0.77018797, + "num_input_tokens_seen": 64917080, + "router_z_loss_clip": 3.20507812, + "router_z_loss_mlp": 0.33813477, + "step": 3004, + "time_per_iteration": 2.6520204544067383 + }, + { + "auxiliary_loss_clip": 0.01659004, + "auxiliary_loss_mlp": 0.00292867, + "balance_loss_clip": 1.33897948, + "balance_loss_mlp": 0.25901124, + "epoch": 0.18067037426724786, + "flos": 19025689265280.0, + "grad_norm": 65.28706522446724, + "language_loss": 0.90715241, + "learning_rate": 3.7665489222115677e-06, + "loss": 0.92667115, + "num_input_tokens_seen": 64935215, + "router_z_loss_clip": 3.203125, + "router_z_loss_mlp": 0.33837891, + "step": 3005, + "time_per_iteration": 2.5417354106903076 + }, + { + "auxiliary_loss_clip": 0.01675851, + "auxiliary_loss_mlp": 0.00258744, + "balance_loss_clip": 1.36522603, + "balance_loss_mlp": 0.22553219, + "epoch": 0.18073049751991582, + "flos": 27453635976960.0, + "grad_norm": 2.1086146418220304, + "language_loss": 0.90258849, + "learning_rate": 3.766366287157432e-06, + "loss": 0.92193437, + "num_input_tokens_seen": 64956275, + "router_z_loss_clip": 3.109375, + "router_z_loss_mlp": 0.33227539, + "step": 3006, + "time_per_iteration": 2.6629040241241455 + }, + { + "auxiliary_loss_clip": 0.01676902, + "auxiliary_loss_mlp": 0.00270779, + "balance_loss_clip": 1.36334813, + "balance_loss_mlp": 0.23978503, + "epoch": 0.1807906207725838, + "flos": 28729039167360.0, + "grad_norm": 28.04779476156249, + "language_loss": 0.82791805, + "learning_rate": 3.7661835851225103e-06, + "loss": 0.84739488, + "num_input_tokens_seen": 64979390, + "router_z_loss_clip": 3.13671875, + "router_z_loss_mlp": 0.31005859, + "step": 3007, + "time_per_iteration": 2.6661691665649414 + }, + { + "auxiliary_loss_clip": 0.01457761, + "auxiliary_loss_mlp": 0.00079587, + "balance_loss_clip": 1.24021363, + "balance_loss_mlp": 0.06623592, + "epoch": 0.18085074402525175, + "flos": 64466515468800.0, + "grad_norm": 0.8395286573026242, + "language_loss": 0.56903189, + "learning_rate": 3.7660008161137294e-06, + "loss": 0.58440542, + "num_input_tokens_seen": 65043135, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.13378906, + "step": 3008, + "time_per_iteration": 3.3074886798858643 + }, + { + "auxiliary_loss_clip": 0.01651575, + "auxiliary_loss_mlp": 0.00283196, + "balance_loss_clip": 1.34233356, + "balance_loss_mlp": 0.24945962, + "epoch": 0.18091086727791975, + "flos": 23476960333440.0, + "grad_norm": 4.526457623270812, + "language_loss": 0.7566669, + "learning_rate": 3.765817980138021e-06, + "loss": 0.77601463, + "num_input_tokens_seen": 65062845, + "router_z_loss_clip": 3.09179688, + "router_z_loss_mlp": 0.33740234, + "step": 3009, + "time_per_iteration": 2.597426176071167 + }, + { + "auxiliary_loss_clip": 0.016532, + "auxiliary_loss_mlp": 0.00261022, + "balance_loss_clip": 1.34916711, + "balance_loss_mlp": 0.22962236, + "epoch": 0.1809709905305877, + "flos": 24170467196160.0, + "grad_norm": 2.390700977920598, + "language_loss": 0.83269376, + "learning_rate": 3.7656350772023177e-06, + "loss": 0.85183597, + "num_input_tokens_seen": 65082110, + "router_z_loss_clip": 3.04101562, + "router_z_loss_mlp": 0.31420898, + "step": 3010, + "time_per_iteration": 2.7888777256011963 + }, + { + "auxiliary_loss_clip": 0.01640787, + "auxiliary_loss_mlp": 0.00233289, + "balance_loss_clip": 1.34223425, + "balance_loss_mlp": 0.20012541, + "epoch": 0.18103111378325568, + "flos": 21650902669440.0, + "grad_norm": 6.964429054075578, + "language_loss": 0.72664809, + "learning_rate": 3.7654521073135553e-06, + "loss": 0.74538887, + "num_input_tokens_seen": 65101985, + "router_z_loss_clip": 2.98632812, + "router_z_loss_mlp": 0.33154297, + "step": 3011, + "time_per_iteration": 2.6180331707000732 + }, + { + "auxiliary_loss_clip": 0.01646313, + "auxiliary_loss_mlp": 0.00271565, + "balance_loss_clip": 1.34096861, + "balance_loss_mlp": 0.23980737, + "epoch": 0.18109123703592364, + "flos": 53686918356480.0, + "grad_norm": 3.1153833235035484, + "language_loss": 0.75622672, + "learning_rate": 3.7652690704786723e-06, + "loss": 0.77540553, + "num_input_tokens_seen": 65129295, + "router_z_loss_clip": 3.05273438, + "router_z_loss_mlp": 0.31738281, + "step": 3012, + "time_per_iteration": 2.8878352642059326 + }, + { + "auxiliary_loss_clip": 0.01628623, + "auxiliary_loss_mlp": 0.00273187, + "balance_loss_clip": 1.32895041, + "balance_loss_mlp": 0.23883133, + "epoch": 0.1811513602885916, + "flos": 35845564325760.0, + "grad_norm": 12.810328237979878, + "language_loss": 0.70167935, + "learning_rate": 3.765085966704609e-06, + "loss": 0.7206974, + "num_input_tokens_seen": 65150625, + "router_z_loss_clip": 2.99414062, + "router_z_loss_mlp": 0.34350586, + "step": 3013, + "time_per_iteration": 2.721540927886963 + }, + { + "auxiliary_loss_clip": 0.01640371, + "auxiliary_loss_mlp": 0.00238664, + "balance_loss_clip": 1.33793116, + "balance_loss_mlp": 0.20881385, + "epoch": 0.18121148354125957, + "flos": 23732572492800.0, + "grad_norm": 14.987145821908094, + "language_loss": 0.82502991, + "learning_rate": 3.764902795998309e-06, + "loss": 0.84382027, + "num_input_tokens_seen": 65170880, + "router_z_loss_clip": 3.02148438, + "router_z_loss_mlp": 0.29858398, + "step": 3014, + "time_per_iteration": 2.599924325942993 + }, + { + "auxiliary_loss_clip": 0.01630344, + "auxiliary_loss_mlp": 0.00261756, + "balance_loss_clip": 1.32219374, + "balance_loss_mlp": 0.22775801, + "epoch": 0.18127160679392756, + "flos": 28728320895360.0, + "grad_norm": 6.522643119014666, + "language_loss": 0.74301332, + "learning_rate": 3.7647195583667184e-06, + "loss": 0.76193428, + "num_input_tokens_seen": 65192530, + "router_z_loss_clip": 3.08398438, + "router_z_loss_mlp": 0.33959961, + "step": 3015, + "time_per_iteration": 4.063644647598267 + }, + { + "auxiliary_loss_clip": 0.01641756, + "auxiliary_loss_mlp": 0.00234972, + "balance_loss_clip": 1.34390068, + "balance_loss_mlp": 0.19978154, + "epoch": 0.18133173004659553, + "flos": 20485062938880.0, + "grad_norm": 31.766011942606454, + "language_loss": 0.84133601, + "learning_rate": 3.764536253816785e-06, + "loss": 0.86010331, + "num_input_tokens_seen": 65211675, + "router_z_loss_clip": 2.9765625, + "router_z_loss_mlp": 0.3515625, + "step": 3016, + "time_per_iteration": 2.6370136737823486 + }, + { + "auxiliary_loss_clip": 0.0164514, + "auxiliary_loss_mlp": 0.00251869, + "balance_loss_clip": 1.34361911, + "balance_loss_mlp": 0.21915779, + "epoch": 0.1813918532992635, + "flos": 22852078404480.0, + "grad_norm": 5.223894471365019, + "language_loss": 0.89790022, + "learning_rate": 3.7643528823554602e-06, + "loss": 0.91687024, + "num_input_tokens_seen": 65231185, + "router_z_loss_clip": 3.01757812, + "router_z_loss_mlp": 0.32714844, + "step": 3017, + "time_per_iteration": 2.6008665561676025 + }, + { + "auxiliary_loss_clip": 0.01629628, + "auxiliary_loss_mlp": 0.00213637, + "balance_loss_clip": 1.33357525, + "balance_loss_mlp": 0.1813077, + "epoch": 0.18145197655193146, + "flos": 36065122208640.0, + "grad_norm": 23.724683614973813, + "language_loss": 0.7380234, + "learning_rate": 3.764169443989697e-06, + "loss": 0.75645614, + "num_input_tokens_seen": 65251645, + "router_z_loss_clip": 2.95703125, + "router_z_loss_mlp": 0.32324219, + "step": 3018, + "time_per_iteration": 4.182072162628174 + }, + { + "auxiliary_loss_clip": 0.01611859, + "auxiliary_loss_mlp": 0.00219539, + "balance_loss_clip": 1.31341958, + "balance_loss_mlp": 0.18637478, + "epoch": 0.18151209980459942, + "flos": 24023951619840.0, + "grad_norm": 2.470574539886916, + "language_loss": 0.84734225, + "learning_rate": 3.7639859387264518e-06, + "loss": 0.86565626, + "num_input_tokens_seen": 65271125, + "router_z_loss_clip": 2.984375, + "router_z_loss_mlp": 0.33154297, + "step": 3019, + "time_per_iteration": 2.601876974105835 + }, + { + "auxiliary_loss_clip": 0.01592414, + "auxiliary_loss_mlp": 0.00240348, + "balance_loss_clip": 1.29247987, + "balance_loss_mlp": 0.20627841, + "epoch": 0.1815722230572674, + "flos": 23951627585280.0, + "grad_norm": 10.297823279830373, + "language_loss": 0.89034975, + "learning_rate": 3.7638023665726834e-06, + "loss": 0.90867734, + "num_input_tokens_seen": 65290600, + "router_z_loss_clip": 2.99804688, + "router_z_loss_mlp": 0.34082031, + "step": 3020, + "time_per_iteration": 2.6422231197357178 + }, + { + "auxiliary_loss_clip": 0.01599681, + "auxiliary_loss_mlp": 0.0022762, + "balance_loss_clip": 1.29366398, + "balance_loss_mlp": 0.19445556, + "epoch": 0.18163234630993536, + "flos": 24386469632640.0, + "grad_norm": 2.5844281565568217, + "language_loss": 0.86744654, + "learning_rate": 3.763618727535352e-06, + "loss": 0.88571954, + "num_input_tokens_seen": 65311040, + "router_z_loss_clip": 3.05859375, + "router_z_loss_mlp": 0.33154297, + "step": 3021, + "time_per_iteration": 2.6211998462677 + }, + { + "auxiliary_loss_clip": 0.01566757, + "auxiliary_loss_mlp": 0.00192223, + "balance_loss_clip": 1.26606131, + "balance_loss_mlp": 0.15872476, + "epoch": 0.18169246956260335, + "flos": 24681332378880.0, + "grad_norm": 8.45266447611984, + "language_loss": 0.90282154, + "learning_rate": 3.763435021621422e-06, + "loss": 0.92041135, + "num_input_tokens_seen": 65332115, + "router_z_loss_clip": 3.00585938, + "router_z_loss_mlp": 0.33496094, + "step": 3022, + "time_per_iteration": 4.022431135177612 + }, + { + "auxiliary_loss_clip": 0.01572966, + "auxiliary_loss_mlp": 0.00203187, + "balance_loss_clip": 1.26314616, + "balance_loss_mlp": 0.16930732, + "epoch": 0.1817525928152713, + "flos": 24243294021120.0, + "grad_norm": 2.4853845226181566, + "language_loss": 0.78706867, + "learning_rate": 3.763251248837859e-06, + "loss": 0.80483019, + "num_input_tokens_seen": 65352210, + "router_z_loss_clip": 3.09960938, + "router_z_loss_mlp": 0.33862305, + "step": 3023, + "time_per_iteration": 2.695844888687134 + }, + { + "auxiliary_loss_clip": 0.01566108, + "auxiliary_loss_mlp": 0.00214371, + "balance_loss_clip": 1.25170684, + "balance_loss_mlp": 0.17906076, + "epoch": 0.18181271606793928, + "flos": 16472081623680.0, + "grad_norm": 12.689207528060845, + "language_loss": 0.80776989, + "learning_rate": 3.7630674091916317e-06, + "loss": 0.8255747, + "num_input_tokens_seen": 65370600, + "router_z_loss_clip": 3.14257812, + "router_z_loss_mlp": 0.35302734, + "step": 3024, + "time_per_iteration": 2.6130528450012207 + }, + { + "auxiliary_loss_clip": 0.01569997, + "auxiliary_loss_mlp": 0.00208506, + "balance_loss_clip": 1.2528584, + "balance_loss_mlp": 0.17491266, + "epoch": 0.18187283932060724, + "flos": 18581042805120.0, + "grad_norm": 8.947148205378314, + "language_loss": 0.97816902, + "learning_rate": 3.7628835026897123e-06, + "loss": 0.9959541, + "num_input_tokens_seen": 65387270, + "router_z_loss_clip": 3.16796875, + "router_z_loss_mlp": 0.3359375, + "step": 3025, + "time_per_iteration": 2.6164488792419434 + }, + { + "auxiliary_loss_clip": 0.01550449, + "auxiliary_loss_mlp": 0.00215074, + "balance_loss_clip": 1.23061681, + "balance_loss_mlp": 0.18155271, + "epoch": 0.1819329625732752, + "flos": 20266833859200.0, + "grad_norm": 5.084657501051491, + "language_loss": 0.85703528, + "learning_rate": 3.7626995293390735e-06, + "loss": 0.87469053, + "num_input_tokens_seen": 65406550, + "router_z_loss_clip": 3.19726562, + "router_z_loss_mlp": 0.33520508, + "step": 3026, + "time_per_iteration": 2.5609233379364014 + }, + { + "auxiliary_loss_clip": 0.01555259, + "auxiliary_loss_mlp": 0.00214967, + "balance_loss_clip": 1.22758722, + "balance_loss_mlp": 0.18299495, + "epoch": 0.18199308582594317, + "flos": 25915186512000.0, + "grad_norm": 6.087223560368189, + "language_loss": 0.82150435, + "learning_rate": 3.762515489146692e-06, + "loss": 0.83920658, + "num_input_tokens_seen": 65425955, + "router_z_loss_clip": 3.27539062, + "router_z_loss_mlp": 0.31982422, + "step": 3027, + "time_per_iteration": 2.6569607257843018 + }, + { + "auxiliary_loss_clip": 0.01537206, + "auxiliary_loss_mlp": 0.0021924, + "balance_loss_clip": 1.20712817, + "balance_loss_mlp": 0.18640974, + "epoch": 0.18205320907861114, + "flos": 15377524433280.0, + "grad_norm": 4.148316799096969, + "language_loss": 0.9392786, + "learning_rate": 3.762331382119546e-06, + "loss": 0.95684302, + "num_input_tokens_seen": 65442820, + "router_z_loss_clip": 3.30078125, + "router_z_loss_mlp": 0.328125, + "step": 3028, + "time_per_iteration": 2.540600061416626 + }, + { + "auxiliary_loss_clip": 0.01543849, + "auxiliary_loss_mlp": 0.00231834, + "balance_loss_clip": 1.20713091, + "balance_loss_mlp": 0.20186469, + "epoch": 0.18211333233127913, + "flos": 25624310175360.0, + "grad_norm": 22.35304574870908, + "language_loss": 0.90079319, + "learning_rate": 3.7621472082646183e-06, + "loss": 0.91855001, + "num_input_tokens_seen": 65461825, + "router_z_loss_clip": 3.3671875, + "router_z_loss_mlp": 0.29980469, + "step": 3029, + "time_per_iteration": 2.6343696117401123 + }, + { + "auxiliary_loss_clip": 0.01549099, + "auxiliary_loss_mlp": 0.00207912, + "balance_loss_clip": 1.21343362, + "balance_loss_mlp": 0.17298374, + "epoch": 0.1821734555839471, + "flos": 14976007228800.0, + "grad_norm": 47.87984202987346, + "language_loss": 0.89879811, + "learning_rate": 3.761962967588891e-06, + "loss": 0.91636825, + "num_input_tokens_seen": 65479480, + "router_z_loss_clip": 3.35351562, + "router_z_loss_mlp": 0.34960938, + "step": 3030, + "time_per_iteration": 2.557075262069702 + }, + { + "auxiliary_loss_clip": 0.01530729, + "auxiliary_loss_mlp": 0.00200664, + "balance_loss_clip": 1.1906873, + "balance_loss_mlp": 0.16685635, + "epoch": 0.18223357883661506, + "flos": 20194007034240.0, + "grad_norm": 20.100231624213162, + "language_loss": 0.93519866, + "learning_rate": 3.761778660099352e-06, + "loss": 0.95251262, + "num_input_tokens_seen": 65497775, + "router_z_loss_clip": 3.40039062, + "router_z_loss_mlp": 0.33837891, + "step": 3031, + "time_per_iteration": 2.6064417362213135 + }, + { + "auxiliary_loss_clip": 0.01536648, + "auxiliary_loss_mlp": 0.00196139, + "balance_loss_clip": 1.19581473, + "balance_loss_mlp": 0.1639998, + "epoch": 0.18229370208928303, + "flos": 15231978524160.0, + "grad_norm": 3.4775715214089935, + "language_loss": 0.8847568, + "learning_rate": 3.76159428580299e-06, + "loss": 0.90208465, + "num_input_tokens_seen": 65516505, + "router_z_loss_clip": 3.40820312, + "router_z_loss_mlp": 0.3215332, + "step": 3032, + "time_per_iteration": 2.6153273582458496 + }, + { + "auxiliary_loss_clip": 0.01524865, + "auxiliary_loss_mlp": 0.00256709, + "balance_loss_clip": 1.173051, + "balance_loss_mlp": 0.22514214, + "epoch": 0.182353825341951, + "flos": 23840483927040.0, + "grad_norm": 10.635545684782272, + "language_loss": 0.91162407, + "learning_rate": 3.761409844706795e-06, + "loss": 0.92943978, + "num_input_tokens_seen": 65536160, + "router_z_loss_clip": 3.51171875, + "router_z_loss_mlp": 0.31567383, + "step": 3033, + "time_per_iteration": 2.6098062992095947 + }, + { + "auxiliary_loss_clip": 0.01452129, + "auxiliary_loss_mlp": 0.00111752, + "balance_loss_clip": 1.13004816, + "balance_loss_mlp": 0.10121351, + "epoch": 0.18241394859461896, + "flos": 61190957393280.0, + "grad_norm": 0.9288711447928368, + "language_loss": 0.6333729, + "learning_rate": 3.7612253368177625e-06, + "loss": 0.64901173, + "num_input_tokens_seen": 65589375, + "router_z_loss_clip": 3.21875, + "router_z_loss_mlp": 0.10546875, + "step": 3034, + "time_per_iteration": 3.0390024185180664 + }, + { + "auxiliary_loss_clip": 0.01532379, + "auxiliary_loss_mlp": 0.00268305, + "balance_loss_clip": 1.17863894, + "balance_loss_mlp": 0.23599888, + "epoch": 0.18247407184728695, + "flos": 18471694826880.0, + "grad_norm": 8.327328192466585, + "language_loss": 0.8887347, + "learning_rate": 3.7610407621428893e-06, + "loss": 0.9067415, + "num_input_tokens_seen": 65606720, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 0.32299805, + "step": 3035, + "time_per_iteration": 2.5623927116394043 + }, + { + "auxiliary_loss_clip": 0.01539332, + "auxiliary_loss_mlp": 0.00252808, + "balance_loss_clip": 1.18825841, + "balance_loss_mlp": 0.22469863, + "epoch": 0.18253419509995492, + "flos": 21795191602560.0, + "grad_norm": 275.4651971801066, + "language_loss": 0.90366232, + "learning_rate": 3.7608561206891735e-06, + "loss": 0.92158365, + "num_input_tokens_seen": 65625495, + "router_z_loss_clip": 3.50585938, + "router_z_loss_mlp": 0.28100586, + "step": 3036, + "time_per_iteration": 2.6810507774353027 + }, + { + "auxiliary_loss_clip": 0.01539592, + "auxiliary_loss_mlp": 0.00246177, + "balance_loss_clip": 1.18787551, + "balance_loss_mlp": 0.21929517, + "epoch": 0.18259431835262288, + "flos": 20149764456960.0, + "grad_norm": 17.017284671938828, + "language_loss": 0.85899162, + "learning_rate": 3.760671412463617e-06, + "loss": 0.87684929, + "num_input_tokens_seen": 65643515, + "router_z_loss_clip": 3.51367188, + "router_z_loss_mlp": 0.26904297, + "step": 3037, + "time_per_iteration": 2.5536224842071533 + }, + { + "auxiliary_loss_clip": 0.01522676, + "auxiliary_loss_mlp": 0.00343535, + "balance_loss_clip": 1.16861236, + "balance_loss_mlp": 0.31254059, + "epoch": 0.18265444160529085, + "flos": 16981653916800.0, + "grad_norm": 21.853940020894015, + "language_loss": 0.91725051, + "learning_rate": 3.7604866374732246e-06, + "loss": 0.93591261, + "num_input_tokens_seen": 65658155, + "router_z_loss_clip": 3.54101562, + "router_z_loss_mlp": 0.31005859, + "step": 3038, + "time_per_iteration": 2.6585006713867188 + }, + { + "auxiliary_loss_clip": 0.01533899, + "auxiliary_loss_mlp": 0.00283749, + "balance_loss_clip": 1.18551874, + "balance_loss_mlp": 0.25741538, + "epoch": 0.1827145648579588, + "flos": 34423250509440.0, + "grad_norm": 3.751234473651476, + "language_loss": 0.76615125, + "learning_rate": 3.7603017957250023e-06, + "loss": 0.78432775, + "num_input_tokens_seen": 65679310, + "router_z_loss_clip": 3.484375, + "router_z_loss_mlp": 0.26318359, + "step": 3039, + "time_per_iteration": 2.6795053482055664 + }, + { + "auxiliary_loss_clip": 0.015226, + "auxiliary_loss_mlp": 0.00305527, + "balance_loss_clip": 1.17529881, + "balance_loss_mlp": 0.27733353, + "epoch": 0.18277468811062678, + "flos": 53287017264000.0, + "grad_norm": 132.1370017285496, + "language_loss": 0.81015903, + "learning_rate": 3.7601168872259593e-06, + "loss": 0.82844025, + "num_input_tokens_seen": 65705235, + "router_z_loss_clip": 3.47265625, + "router_z_loss_mlp": 0.28173828, + "step": 3040, + "time_per_iteration": 2.900322914123535 + }, + { + "auxiliary_loss_clip": 0.01532154, + "auxiliary_loss_mlp": 0.00314611, + "balance_loss_clip": 1.19415569, + "balance_loss_mlp": 0.28788412, + "epoch": 0.18283481136329474, + "flos": 31650659602560.0, + "grad_norm": 58.77434475039216, + "language_loss": 0.68659317, + "learning_rate": 3.7599319119831075e-06, + "loss": 0.70506078, + "num_input_tokens_seen": 65727575, + "router_z_loss_clip": 3.37695312, + "router_z_loss_mlp": 0.26696777, + "step": 3041, + "time_per_iteration": 2.676438808441162 + }, + { + "auxiliary_loss_clip": 0.01535405, + "auxiliary_loss_mlp": 0.00307351, + "balance_loss_clip": 1.1921407, + "balance_loss_mlp": 0.27885988, + "epoch": 0.18289493461596273, + "flos": 53137664513280.0, + "grad_norm": 44.839220510283674, + "language_loss": 0.66380012, + "learning_rate": 3.7597468700034616e-06, + "loss": 0.68222767, + "num_input_tokens_seen": 65751370, + "router_z_loss_clip": 3.4296875, + "router_z_loss_mlp": 0.28527832, + "step": 3042, + "time_per_iteration": 2.8691580295562744 + }, + { + "auxiliary_loss_clip": 0.01507125, + "auxiliary_loss_mlp": 0.00391083, + "balance_loss_clip": 1.15712988, + "balance_loss_mlp": 0.36426112, + "epoch": 0.1829550578686307, + "flos": 25589369220480.0, + "grad_norm": 22.513259382506913, + "language_loss": 0.92922699, + "learning_rate": 3.7595617612940374e-06, + "loss": 0.94820911, + "num_input_tokens_seen": 65771040, + "router_z_loss_clip": 3.49804688, + "router_z_loss_mlp": 0.26831055, + "step": 3043, + "time_per_iteration": 2.6782615184783936 + }, + { + "auxiliary_loss_clip": 0.01516849, + "auxiliary_loss_mlp": 0.00349954, + "balance_loss_clip": 1.17193842, + "balance_loss_mlp": 0.32086667, + "epoch": 0.18301518112129866, + "flos": 22601422321920.0, + "grad_norm": 7.130446727406232, + "language_loss": 0.79814345, + "learning_rate": 3.7593765858618552e-06, + "loss": 0.8168115, + "num_input_tokens_seen": 65789345, + "router_z_loss_clip": 3.45117188, + "router_z_loss_mlp": 0.29077148, + "step": 3044, + "time_per_iteration": 2.595714807510376 + }, + { + "auxiliary_loss_clip": 0.01524968, + "auxiliary_loss_mlp": 0.00394175, + "balance_loss_clip": 1.18041754, + "balance_loss_mlp": 0.3638007, + "epoch": 0.18307530437396663, + "flos": 34020799551360.0, + "grad_norm": 10.12515332767027, + "language_loss": 0.70910883, + "learning_rate": 3.7591913437139365e-06, + "loss": 0.72830027, + "num_input_tokens_seen": 65810990, + "router_z_loss_clip": 3.4453125, + "router_z_loss_mlp": 0.30371094, + "step": 3045, + "time_per_iteration": 2.7020103931427 + }, + { + "auxiliary_loss_clip": 0.01535054, + "auxiliary_loss_mlp": 0.00313165, + "balance_loss_clip": 1.19381785, + "balance_loss_mlp": 0.28496039, + "epoch": 0.1831354276266346, + "flos": 21279765392640.0, + "grad_norm": 4.093416381513643, + "language_loss": 0.88761044, + "learning_rate": 3.7590060348573066e-06, + "loss": 0.90609264, + "num_input_tokens_seen": 65827230, + "router_z_loss_clip": 3.41015625, + "router_z_loss_mlp": 0.28186035, + "step": 3046, + "time_per_iteration": 2.555715560913086 + }, + { + "auxiliary_loss_clip": 0.01499988, + "auxiliary_loss_mlp": 0.00371524, + "balance_loss_clip": 1.15956402, + "balance_loss_mlp": 0.33874139, + "epoch": 0.18319555087930256, + "flos": 21032952065280.0, + "grad_norm": 3.3708243403215166, + "language_loss": 0.877698, + "learning_rate": 3.7588206592989903e-06, + "loss": 0.89641315, + "num_input_tokens_seen": 65845900, + "router_z_loss_clip": 3.40429688, + "router_z_loss_mlp": 0.32788086, + "step": 3047, + "time_per_iteration": 2.6175742149353027 + }, + { + "auxiliary_loss_clip": 0.01524021, + "auxiliary_loss_mlp": 0.00361434, + "balance_loss_clip": 1.1801064, + "balance_loss_mlp": 0.33370563, + "epoch": 0.18325567413197055, + "flos": 34382958428160.0, + "grad_norm": 12.606682037121889, + "language_loss": 0.84488767, + "learning_rate": 3.7586352170460194e-06, + "loss": 0.86374223, + "num_input_tokens_seen": 65868730, + "router_z_loss_clip": 3.44140625, + "router_z_loss_mlp": 0.27722168, + "step": 3048, + "time_per_iteration": 2.72282338142395 + }, + { + "auxiliary_loss_clip": 0.01511098, + "auxiliary_loss_mlp": 0.00382315, + "balance_loss_clip": 1.17021632, + "balance_loss_mlp": 0.35394287, + "epoch": 0.18331579738463852, + "flos": 20558464381440.0, + "grad_norm": 3.3494763780074455, + "language_loss": 0.94696087, + "learning_rate": 3.758449708105424e-06, + "loss": 0.965895, + "num_input_tokens_seen": 65888420, + "router_z_loss_clip": 3.40820312, + "router_z_loss_mlp": 0.28344727, + "step": 3049, + "time_per_iteration": 2.6326675415039062 + }, + { + "auxiliary_loss_clip": 0.01518786, + "auxiliary_loss_mlp": 0.00336595, + "balance_loss_clip": 1.17543769, + "balance_loss_mlp": 0.30669704, + "epoch": 0.18337592063730648, + "flos": 19607872901760.0, + "grad_norm": 13.068060450919587, + "language_loss": 0.90371454, + "learning_rate": 3.75826413248424e-06, + "loss": 0.92226833, + "num_input_tokens_seen": 65905840, + "router_z_loss_clip": 3.43164062, + "router_z_loss_mlp": 0.2989502, + "step": 3050, + "time_per_iteration": 2.548203945159912 + }, + { + "auxiliary_loss_clip": 0.01508053, + "auxiliary_loss_mlp": 0.00420147, + "balance_loss_clip": 1.16921198, + "balance_loss_mlp": 0.38860396, + "epoch": 0.18343604388997445, + "flos": 20850885002880.0, + "grad_norm": 20.892119590154767, + "language_loss": 1.0634172, + "learning_rate": 3.7580784901895035e-06, + "loss": 1.08269906, + "num_input_tokens_seen": 65922845, + "router_z_loss_clip": 3.39257812, + "router_z_loss_mlp": 0.31567383, + "step": 3051, + "time_per_iteration": 2.616490602493286 + }, + { + "auxiliary_loss_clip": 0.0150332, + "auxiliary_loss_mlp": 0.00334232, + "balance_loss_clip": 1.16466808, + "balance_loss_mlp": 0.30650395, + "epoch": 0.1834961671426424, + "flos": 24394370624640.0, + "grad_norm": 4.514240941947403, + "language_loss": 0.92719942, + "learning_rate": 3.7578927812282542e-06, + "loss": 0.94557488, + "num_input_tokens_seen": 65945555, + "router_z_loss_clip": 3.38867188, + "router_z_loss_mlp": 0.27734375, + "step": 3052, + "time_per_iteration": 2.655106544494629 + }, + { + "auxiliary_loss_clip": 0.01507711, + "auxiliary_loss_mlp": 0.00361111, + "balance_loss_clip": 1.17524457, + "balance_loss_mlp": 0.33248913, + "epoch": 0.18355629039531038, + "flos": 21251612108160.0, + "grad_norm": 2.154978564720787, + "language_loss": 0.80751288, + "learning_rate": 3.7577070056075356e-06, + "loss": 0.82620108, + "num_input_tokens_seen": 65963965, + "router_z_loss_clip": 3.32421875, + "router_z_loss_mlp": 0.28662109, + "step": 3053, + "time_per_iteration": 2.6018311977386475 + }, + { + "auxiliary_loss_clip": 0.01512497, + "auxiliary_loss_mlp": 0.00438294, + "balance_loss_clip": 1.1777308, + "balance_loss_mlp": 0.40584546, + "epoch": 0.18361641364797834, + "flos": 28656499651200.0, + "grad_norm": 7.501178405587123, + "language_loss": 0.70897812, + "learning_rate": 3.7575211633343902e-06, + "loss": 0.728486, + "num_input_tokens_seen": 65985965, + "router_z_loss_clip": 3.34960938, + "router_z_loss_mlp": 0.32446289, + "step": 3054, + "time_per_iteration": 2.649409532546997 + }, + { + "auxiliary_loss_clip": 0.01511643, + "auxiliary_loss_mlp": 0.00396537, + "balance_loss_clip": 1.18099594, + "balance_loss_mlp": 0.36690125, + "epoch": 0.18367653690064634, + "flos": 20918827578240.0, + "grad_norm": 2.4464338155383376, + "language_loss": 0.85562807, + "learning_rate": 3.7573352544158663e-06, + "loss": 0.87470996, + "num_input_tokens_seen": 66005645, + "router_z_loss_clip": 3.30664062, + "router_z_loss_mlp": 0.29614258, + "step": 3055, + "time_per_iteration": 2.657930850982666 + }, + { + "auxiliary_loss_clip": 0.01507194, + "auxiliary_loss_mlp": 0.00347019, + "balance_loss_clip": 1.17737567, + "balance_loss_mlp": 0.31924361, + "epoch": 0.1837366601533143, + "flos": 28765596234240.0, + "grad_norm": 15.959901365618911, + "language_loss": 0.76733142, + "learning_rate": 3.757149278859014e-06, + "loss": 0.78587353, + "num_input_tokens_seen": 66025675, + "router_z_loss_clip": 3.29296875, + "router_z_loss_mlp": 0.2779541, + "step": 3056, + "time_per_iteration": 2.6315054893493652 + }, + { + "auxiliary_loss_clip": 0.0149491, + "auxiliary_loss_mlp": 0.00350059, + "balance_loss_clip": 1.17263365, + "balance_loss_mlp": 0.32339215, + "epoch": 0.18379678340598227, + "flos": 21251432540160.0, + "grad_norm": 5.2901480736551445, + "language_loss": 0.85555905, + "learning_rate": 3.7569632366708842e-06, + "loss": 0.87400877, + "num_input_tokens_seen": 66046125, + "router_z_loss_clip": 3.22460938, + "router_z_loss_mlp": 0.26647949, + "step": 3057, + "time_per_iteration": 4.03260064125061 + }, + { + "auxiliary_loss_clip": 0.01507422, + "auxiliary_loss_mlp": 0.00329852, + "balance_loss_clip": 1.18009496, + "balance_loss_mlp": 0.30114681, + "epoch": 0.18385690665865023, + "flos": 20449619193600.0, + "grad_norm": 3.9438564556305855, + "language_loss": 0.92049032, + "learning_rate": 3.756777127858533e-06, + "loss": 0.93886304, + "num_input_tokens_seen": 66064375, + "router_z_loss_clip": 3.27148438, + "router_z_loss_mlp": 0.28686523, + "step": 3058, + "time_per_iteration": 2.6942460536956787 + }, + { + "auxiliary_loss_clip": 0.0150065, + "auxiliary_loss_mlp": 0.00361643, + "balance_loss_clip": 1.17639399, + "balance_loss_mlp": 0.33272249, + "epoch": 0.1839170299113182, + "flos": 26140562398080.0, + "grad_norm": 17.464637340676852, + "language_loss": 0.92989159, + "learning_rate": 3.756590952429017e-06, + "loss": 0.94851446, + "num_input_tokens_seen": 66084590, + "router_z_loss_clip": 3.2421875, + "router_z_loss_mlp": 0.28930664, + "step": 3059, + "time_per_iteration": 2.6272411346435547 + }, + { + "auxiliary_loss_clip": 0.01508019, + "auxiliary_loss_mlp": 0.00281497, + "balance_loss_clip": 1.1859473, + "balance_loss_mlp": 0.25206465, + "epoch": 0.18397715316398616, + "flos": 31758032332800.0, + "grad_norm": 120.29838418827718, + "language_loss": 0.78195632, + "learning_rate": 3.756404710389396e-06, + "loss": 0.79985142, + "num_input_tokens_seen": 66107105, + "router_z_loss_clip": 3.22070312, + "router_z_loss_mlp": 0.29443359, + "step": 3060, + "time_per_iteration": 4.124142169952393 + }, + { + "auxiliary_loss_clip": 0.01515102, + "auxiliary_loss_mlp": 0.00327383, + "balance_loss_clip": 1.1929487, + "balance_loss_mlp": 0.29693681, + "epoch": 0.18403727641665413, + "flos": 24611989173120.0, + "grad_norm": 37.123678917385796, + "language_loss": 0.78681952, + "learning_rate": 3.7562184017467323e-06, + "loss": 0.80524433, + "num_input_tokens_seen": 66129295, + "router_z_loss_clip": 3.22070312, + "router_z_loss_mlp": 0.30444336, + "step": 3061, + "time_per_iteration": 2.645393133163452 + }, + { + "auxiliary_loss_clip": 0.01531172, + "auxiliary_loss_mlp": 0.00281633, + "balance_loss_clip": 1.21432424, + "balance_loss_mlp": 0.25206938, + "epoch": 0.18409739966932212, + "flos": 23439900476160.0, + "grad_norm": 14.703830509584694, + "language_loss": 0.87885475, + "learning_rate": 3.7560320265080906e-06, + "loss": 0.89698279, + "num_input_tokens_seen": 66146910, + "router_z_loss_clip": 3.16992188, + "router_z_loss_mlp": 0.2956543, + "step": 3062, + "time_per_iteration": 2.6376729011535645 + }, + { + "auxiliary_loss_clip": 0.0153072, + "auxiliary_loss_mlp": 0.003261, + "balance_loss_clip": 1.20866191, + "balance_loss_mlp": 0.29831272, + "epoch": 0.18415752292199009, + "flos": 21872112577920.0, + "grad_norm": 9.588343245182154, + "language_loss": 0.83274961, + "learning_rate": 3.7558455846805383e-06, + "loss": 0.85131776, + "num_input_tokens_seen": 66165370, + "router_z_loss_clip": 3.22070312, + "router_z_loss_mlp": 0.27807617, + "step": 3063, + "time_per_iteration": 2.6708319187164307 + }, + { + "auxiliary_loss_clip": 0.01524029, + "auxiliary_loss_mlp": 0.00253417, + "balance_loss_clip": 1.20539212, + "balance_loss_mlp": 0.22456875, + "epoch": 0.18421764617465805, + "flos": 25410678036480.0, + "grad_norm": 2.6304513275061354, + "language_loss": 0.73307568, + "learning_rate": 3.7556590762711463e-06, + "loss": 0.75085014, + "num_input_tokens_seen": 66186210, + "router_z_loss_clip": 3.1875, + "router_z_loss_mlp": 0.28857422, + "step": 3064, + "time_per_iteration": 4.002074241638184 + }, + { + "auxiliary_loss_clip": 0.01530815, + "auxiliary_loss_mlp": 0.00286609, + "balance_loss_clip": 1.21132636, + "balance_loss_mlp": 0.25666338, + "epoch": 0.18427776942732602, + "flos": 27198131558400.0, + "grad_norm": 2.0957779810111776, + "language_loss": 0.77010643, + "learning_rate": 3.7554725012869853e-06, + "loss": 0.78828073, + "num_input_tokens_seen": 66204800, + "router_z_loss_clip": 3.19335938, + "router_z_loss_mlp": 0.29907227, + "step": 3065, + "time_per_iteration": 2.663722038269043 + }, + { + "auxiliary_loss_clip": 0.01538587, + "auxiliary_loss_mlp": 0.00298987, + "balance_loss_clip": 1.22327256, + "balance_loss_mlp": 0.26932755, + "epoch": 0.18433789267999398, + "flos": 27852351920640.0, + "grad_norm": 14.850803833542345, + "language_loss": 0.82790166, + "learning_rate": 3.7552858597351318e-06, + "loss": 0.84627736, + "num_input_tokens_seen": 66222195, + "router_z_loss_clip": 3.15429688, + "router_z_loss_mlp": 0.29638672, + "step": 3066, + "time_per_iteration": 2.6276907920837402 + }, + { + "auxiliary_loss_clip": 0.0155798, + "auxiliary_loss_mlp": 0.00259755, + "balance_loss_clip": 1.24285328, + "balance_loss_mlp": 0.23258701, + "epoch": 0.18439801593266195, + "flos": 17856940533120.0, + "grad_norm": 10.66565467096761, + "language_loss": 0.89436281, + "learning_rate": 3.7550991516226622e-06, + "loss": 0.91254008, + "num_input_tokens_seen": 66239505, + "router_z_loss_clip": 3.15429688, + "router_z_loss_mlp": 0.27172852, + "step": 3067, + "time_per_iteration": 2.563652992248535 + }, + { + "auxiliary_loss_clip": 0.01607479, + "auxiliary_loss_mlp": 0.00629785, + "balance_loss_clip": 1.28700876, + "balance_loss_mlp": 0.61567092, + "epoch": 0.18445813918532994, + "flos": 56389522590720.0, + "grad_norm": 0.9182244779711894, + "language_loss": 0.59430492, + "learning_rate": 3.754912376956657e-06, + "loss": 0.61667752, + "num_input_tokens_seen": 66295695, + "router_z_loss_clip": 3.203125, + "router_z_loss_mlp": 0.14160156, + "step": 3068, + "time_per_iteration": 2.9921112060546875 + }, + { + "auxiliary_loss_clip": 0.01572683, + "auxiliary_loss_mlp": 0.00251538, + "balance_loss_clip": 1.25611138, + "balance_loss_mlp": 0.22259408, + "epoch": 0.1845182624379979, + "flos": 20957180325120.0, + "grad_norm": 10.637392938624341, + "language_loss": 0.83823407, + "learning_rate": 3.7547255357441987e-06, + "loss": 0.85647631, + "num_input_tokens_seen": 66315315, + "router_z_loss_clip": 3.16601562, + "router_z_loss_mlp": 0.28918457, + "step": 3069, + "time_per_iteration": 2.601224899291992 + }, + { + "auxiliary_loss_clip": 0.01588986, + "auxiliary_loss_mlp": 0.00253962, + "balance_loss_clip": 1.27510524, + "balance_loss_mlp": 0.2214891, + "epoch": 0.18457838569066587, + "flos": 20485170679680.0, + "grad_norm": 44.36376174698488, + "language_loss": 0.92129409, + "learning_rate": 3.7545386279923718e-06, + "loss": 0.93972355, + "num_input_tokens_seen": 66333675, + "router_z_loss_clip": 3.13476562, + "router_z_loss_mlp": 0.32470703, + "step": 3070, + "time_per_iteration": 2.6287648677825928 + }, + { + "auxiliary_loss_clip": 0.01609731, + "auxiliary_loss_mlp": 0.00262295, + "balance_loss_clip": 1.28948689, + "balance_loss_mlp": 0.22991753, + "epoch": 0.18463850894333383, + "flos": 25010022758400.0, + "grad_norm": 8.82375874693897, + "language_loss": 0.85198462, + "learning_rate": 3.754351653708265e-06, + "loss": 0.87070489, + "num_input_tokens_seen": 66354075, + "router_z_loss_clip": 3.203125, + "router_z_loss_mlp": 0.32373047, + "step": 3071, + "time_per_iteration": 2.710223436355591 + }, + { + "auxiliary_loss_clip": 0.01619681, + "auxiliary_loss_mlp": 0.0027835, + "balance_loss_clip": 1.30217505, + "balance_loss_mlp": 0.24797508, + "epoch": 0.1846986321960018, + "flos": 16800628348800.0, + "grad_norm": 33.72595325429311, + "language_loss": 0.8729825, + "learning_rate": 3.7541646128989674e-06, + "loss": 0.89196277, + "num_input_tokens_seen": 66372520, + "router_z_loss_clip": 3.17382812, + "router_z_loss_mlp": 0.30395508, + "step": 3072, + "time_per_iteration": 2.581709146499634 + }, + { + "auxiliary_loss_clip": 0.01622678, + "auxiliary_loss_mlp": 0.00283665, + "balance_loss_clip": 1.29768562, + "balance_loss_mlp": 0.25133556, + "epoch": 0.18475875544866976, + "flos": 20814327936000.0, + "grad_norm": 2.5826192403305406, + "language_loss": 0.93197834, + "learning_rate": 3.7539775055715715e-06, + "loss": 0.95104182, + "num_input_tokens_seen": 66390745, + "router_z_loss_clip": 3.25195312, + "router_z_loss_mlp": 0.32324219, + "step": 3073, + "time_per_iteration": 2.5655696392059326 + }, + { + "auxiliary_loss_clip": 0.01646229, + "auxiliary_loss_mlp": 0.00256326, + "balance_loss_clip": 1.3219862, + "balance_loss_mlp": 0.2251054, + "epoch": 0.18481887870133773, + "flos": 22601422321920.0, + "grad_norm": 22.803176044839347, + "language_loss": 1.0104413, + "learning_rate": 3.7537903317331732e-06, + "loss": 1.02946675, + "num_input_tokens_seen": 66410525, + "router_z_loss_clip": 3.2421875, + "router_z_loss_mlp": 0.31237793, + "step": 3074, + "time_per_iteration": 2.6393306255340576 + }, + { + "auxiliary_loss_clip": 0.01622781, + "auxiliary_loss_mlp": 0.00272142, + "balance_loss_clip": 1.30226135, + "balance_loss_mlp": 0.23831087, + "epoch": 0.18487900195400572, + "flos": 29458815788160.0, + "grad_norm": 3.7651780498563587, + "language_loss": 0.71848583, + "learning_rate": 3.75360309139087e-06, + "loss": 0.73743504, + "num_input_tokens_seen": 66432535, + "router_z_loss_clip": 3.20507812, + "router_z_loss_mlp": 0.33813477, + "step": 3075, + "time_per_iteration": 2.731102705001831 + }, + { + "auxiliary_loss_clip": 0.01644452, + "auxiliary_loss_mlp": 0.00264533, + "balance_loss_clip": 1.32234311, + "balance_loss_mlp": 0.23388433, + "epoch": 0.1849391252066737, + "flos": 20628777254400.0, + "grad_norm": 53.663054589538426, + "language_loss": 0.8111192, + "learning_rate": 3.753415784551761e-06, + "loss": 0.83020908, + "num_input_tokens_seen": 66450620, + "router_z_loss_clip": 3.22460938, + "router_z_loss_mlp": 0.30627441, + "step": 3076, + "time_per_iteration": 2.744987726211548 + }, + { + "auxiliary_loss_clip": 0.01637985, + "auxiliary_loss_mlp": 0.00255599, + "balance_loss_clip": 1.31661785, + "balance_loss_mlp": 0.22424677, + "epoch": 0.18499924845934165, + "flos": 14428549065600.0, + "grad_norm": 34.71133263912376, + "language_loss": 0.9111377, + "learning_rate": 3.7532284112229507e-06, + "loss": 0.9300735, + "num_input_tokens_seen": 66467865, + "router_z_loss_clip": 3.21484375, + "router_z_loss_mlp": 0.31347656, + "step": 3077, + "time_per_iteration": 2.5895066261291504 + }, + { + "auxiliary_loss_clip": 0.01628087, + "auxiliary_loss_mlp": 0.00249541, + "balance_loss_clip": 1.30823159, + "balance_loss_mlp": 0.21966726, + "epoch": 0.18505937171200962, + "flos": 23727652329600.0, + "grad_norm": 49.11950615877729, + "language_loss": 0.84906769, + "learning_rate": 3.7530409714115424e-06, + "loss": 0.86784393, + "num_input_tokens_seen": 66486245, + "router_z_loss_clip": 3.19921875, + "router_z_loss_mlp": 0.29858398, + "step": 3078, + "time_per_iteration": 2.626169443130493 + }, + { + "auxiliary_loss_clip": 0.01666263, + "auxiliary_loss_mlp": 0.00273188, + "balance_loss_clip": 1.34567809, + "balance_loss_mlp": 0.24221763, + "epoch": 0.18511949496467758, + "flos": 25957489754880.0, + "grad_norm": 998.3502174207551, + "language_loss": 0.84222996, + "learning_rate": 3.7528534651246453e-06, + "loss": 0.86162454, + "num_input_tokens_seen": 66506510, + "router_z_loss_clip": 3.20898438, + "router_z_loss_mlp": 0.30981445, + "step": 3079, + "time_per_iteration": 2.606353759765625 + }, + { + "auxiliary_loss_clip": 0.01631679, + "auxiliary_loss_mlp": 0.00273747, + "balance_loss_clip": 1.31730044, + "balance_loss_mlp": 0.24349129, + "epoch": 0.18517961821734555, + "flos": 42413553912960.0, + "grad_norm": 19.33584671295054, + "language_loss": 0.86888385, + "learning_rate": 3.752665892369369e-06, + "loss": 0.88793814, + "num_input_tokens_seen": 66530960, + "router_z_loss_clip": 3.14257812, + "router_z_loss_mlp": 0.30236816, + "step": 3080, + "time_per_iteration": 2.788346767425537 + }, + { + "auxiliary_loss_clip": 0.01637691, + "auxiliary_loss_mlp": 0.00276854, + "balance_loss_clip": 1.31555009, + "balance_loss_mlp": 0.24333228, + "epoch": 0.18523974147001354, + "flos": 24097568544000.0, + "grad_norm": 18.569887118270916, + "language_loss": 0.83369905, + "learning_rate": 3.7524782531528266e-06, + "loss": 0.85284448, + "num_input_tokens_seen": 66550275, + "router_z_loss_clip": 3.22265625, + "router_z_loss_mlp": 0.33496094, + "step": 3081, + "time_per_iteration": 2.658571243286133 + }, + { + "auxiliary_loss_clip": 0.01661257, + "auxiliary_loss_mlp": 0.00281242, + "balance_loss_clip": 1.33963203, + "balance_loss_mlp": 0.24745817, + "epoch": 0.1852998647226815, + "flos": 27375278457600.0, + "grad_norm": 20.21335762303755, + "language_loss": 0.80306554, + "learning_rate": 3.7522905474821334e-06, + "loss": 0.82249051, + "num_input_tokens_seen": 66569040, + "router_z_loss_clip": 3.21875, + "router_z_loss_mlp": 0.33789062, + "step": 3082, + "time_per_iteration": 2.6207902431488037 + }, + { + "auxiliary_loss_clip": 0.01673668, + "auxiliary_loss_mlp": 0.0031532, + "balance_loss_clip": 1.34993315, + "balance_loss_mlp": 0.28055903, + "epoch": 0.18535998797534947, + "flos": 18332757020160.0, + "grad_norm": 34.48322051706046, + "language_loss": 0.78025854, + "learning_rate": 3.752102775364407e-06, + "loss": 0.80014837, + "num_input_tokens_seen": 66587775, + "router_z_loss_clip": 3.24023438, + "router_z_loss_mlp": 0.34765625, + "step": 3083, + "time_per_iteration": 2.588942766189575 + }, + { + "auxiliary_loss_clip": 0.01655624, + "auxiliary_loss_mlp": 0.00295205, + "balance_loss_clip": 1.33394718, + "balance_loss_mlp": 0.26232731, + "epoch": 0.18542011122801744, + "flos": 37845859887360.0, + "grad_norm": 5.541538689232545, + "language_loss": 0.75980663, + "learning_rate": 3.751914936806767e-06, + "loss": 0.77931488, + "num_input_tokens_seen": 66610800, + "router_z_loss_clip": 3.21289062, + "router_z_loss_mlp": 0.32910156, + "step": 3084, + "time_per_iteration": 2.7301909923553467 + }, + { + "auxiliary_loss_clip": 0.01656585, + "auxiliary_loss_mlp": 0.00296004, + "balance_loss_clip": 1.33580422, + "balance_loss_mlp": 0.2652007, + "epoch": 0.1854802344806854, + "flos": 25186128163200.0, + "grad_norm": 15.226640048535147, + "language_loss": 0.82346809, + "learning_rate": 3.7517270318163377e-06, + "loss": 0.84299397, + "num_input_tokens_seen": 66630960, + "router_z_loss_clip": 3.21289062, + "router_z_loss_mlp": 0.30810547, + "step": 3085, + "time_per_iteration": 2.678938150405884 + }, + { + "auxiliary_loss_clip": 0.0166076, + "auxiliary_loss_mlp": 0.00307056, + "balance_loss_clip": 1.33820057, + "balance_loss_mlp": 0.2752271, + "epoch": 0.18554035773335337, + "flos": 26684788337280.0, + "grad_norm": 2.272015834789527, + "language_loss": 0.8073827, + "learning_rate": 3.751539060400244e-06, + "loss": 0.82706082, + "num_input_tokens_seen": 66650585, + "router_z_loss_clip": 3.22851562, + "router_z_loss_mlp": 0.31811523, + "step": 3086, + "time_per_iteration": 2.630436420440674 + }, + { + "auxiliary_loss_clip": 0.01653726, + "auxiliary_loss_mlp": 0.00301985, + "balance_loss_clip": 1.33216238, + "balance_loss_mlp": 0.27101445, + "epoch": 0.18560048098602133, + "flos": 22346887570560.0, + "grad_norm": 7.536261808767377, + "language_loss": 0.79234982, + "learning_rate": 3.7513510225656132e-06, + "loss": 0.81190693, + "num_input_tokens_seen": 66670045, + "router_z_loss_clip": 3.21875, + "router_z_loss_mlp": 0.30981445, + "step": 3087, + "time_per_iteration": 2.627048969268799 + }, + { + "auxiliary_loss_clip": 0.01656228, + "auxiliary_loss_mlp": 0.00313633, + "balance_loss_clip": 1.33613563, + "balance_loss_mlp": 0.28056473, + "epoch": 0.18566060423868933, + "flos": 17748526308480.0, + "grad_norm": 3.247123598900782, + "language_loss": 0.80494082, + "learning_rate": 3.7511629183195764e-06, + "loss": 0.82463944, + "num_input_tokens_seen": 66688790, + "router_z_loss_clip": 3.20507812, + "router_z_loss_mlp": 0.33056641, + "step": 3088, + "time_per_iteration": 2.570807933807373 + }, + { + "auxiliary_loss_clip": 0.01675206, + "auxiliary_loss_mlp": 0.00334502, + "balance_loss_clip": 1.3531971, + "balance_loss_mlp": 0.30086082, + "epoch": 0.1857207274913573, + "flos": 24677274142080.0, + "grad_norm": 1180.1543405635794, + "language_loss": 1.00281715, + "learning_rate": 3.7509747476692663e-06, + "loss": 1.02291417, + "num_input_tokens_seen": 66708090, + "router_z_loss_clip": 3.21875, + "router_z_loss_mlp": 0.33618164, + "step": 3089, + "time_per_iteration": 2.6639456748962402 + }, + { + "auxiliary_loss_clip": 0.01664038, + "auxiliary_loss_mlp": 0.00311069, + "balance_loss_clip": 1.33855033, + "balance_loss_mlp": 0.27790463, + "epoch": 0.18578085074402526, + "flos": 28147825198080.0, + "grad_norm": 3.16832276232297, + "language_loss": 0.65056133, + "learning_rate": 3.7507865106218176e-06, + "loss": 0.67031246, + "num_input_tokens_seen": 66727320, + "router_z_loss_clip": 3.25390625, + "router_z_loss_mlp": 0.33154297, + "step": 3090, + "time_per_iteration": 2.681551218032837 + }, + { + "auxiliary_loss_clip": 0.01643296, + "auxiliary_loss_mlp": 0.00320944, + "balance_loss_clip": 1.32096982, + "balance_loss_mlp": 0.28873318, + "epoch": 0.18584097399669322, + "flos": 23951878980480.0, + "grad_norm": 2.037289682048942, + "language_loss": 0.86448622, + "learning_rate": 3.7505982071843695e-06, + "loss": 0.88412857, + "num_input_tokens_seen": 66747505, + "router_z_loss_clip": 3.22265625, + "router_z_loss_mlp": 0.32177734, + "step": 3091, + "time_per_iteration": 2.6657636165618896 + }, + { + "auxiliary_loss_clip": 0.01650816, + "auxiliary_loss_mlp": 0.00345887, + "balance_loss_clip": 1.32416439, + "balance_loss_mlp": 0.31174541, + "epoch": 0.18590109724936119, + "flos": 17201678676480.0, + "grad_norm": 65.93320914315568, + "language_loss": 0.92212653, + "learning_rate": 3.7504098373640617e-06, + "loss": 0.94209355, + "num_input_tokens_seen": 66766425, + "router_z_loss_clip": 3.26953125, + "router_z_loss_mlp": 0.34130859, + "step": 3092, + "time_per_iteration": 2.5892868041992188 + }, + { + "auxiliary_loss_clip": 0.01659514, + "auxiliary_loss_mlp": 0.00338185, + "balance_loss_clip": 1.32853985, + "balance_loss_mlp": 0.30404329, + "epoch": 0.18596122050202915, + "flos": 17234644383360.0, + "grad_norm": 534.2992507356756, + "language_loss": 1.01749563, + "learning_rate": 3.750221401168038e-06, + "loss": 1.03747261, + "num_input_tokens_seen": 66781130, + "router_z_loss_clip": 3.30859375, + "router_z_loss_mlp": 0.34130859, + "step": 3093, + "time_per_iteration": 2.5600461959838867 + }, + { + "auxiliary_loss_clip": 0.0164149, + "auxiliary_loss_mlp": 0.00359868, + "balance_loss_clip": 1.32020283, + "balance_loss_mlp": 0.32777715, + "epoch": 0.18602134375469712, + "flos": 19020733188480.0, + "grad_norm": 44.38019003382586, + "language_loss": 0.82982826, + "learning_rate": 3.750032898603443e-06, + "loss": 0.84984183, + "num_input_tokens_seen": 66797535, + "router_z_loss_clip": 3.21875, + "router_z_loss_mlp": 0.32104492, + "step": 3094, + "time_per_iteration": 2.5621337890625 + }, + { + "auxiliary_loss_clip": 0.01650664, + "auxiliary_loss_mlp": 0.0033597, + "balance_loss_clip": 1.32793999, + "balance_loss_mlp": 0.30449882, + "epoch": 0.1860814670073651, + "flos": 50950094417280.0, + "grad_norm": 103.00390043862315, + "language_loss": 0.75940788, + "learning_rate": 3.749844329677425e-06, + "loss": 0.77927423, + "num_input_tokens_seen": 66821720, + "router_z_loss_clip": 3.23242188, + "router_z_loss_mlp": 0.31469727, + "step": 3095, + "time_per_iteration": 2.8782858848571777 + }, + { + "auxiliary_loss_clip": 0.01651775, + "auxiliary_loss_mlp": 0.00376687, + "balance_loss_clip": 1.31888914, + "balance_loss_mlp": 0.34135303, + "epoch": 0.18614159026003307, + "flos": 19390972625280.0, + "grad_norm": 8.645657819694993, + "language_loss": 0.87413257, + "learning_rate": 3.749655694397135e-06, + "loss": 0.89441717, + "num_input_tokens_seen": 66839060, + "router_z_loss_clip": 3.328125, + "router_z_loss_mlp": 0.35302734, + "step": 3096, + "time_per_iteration": 2.5710058212280273 + }, + { + "auxiliary_loss_clip": 0.0165442, + "auxiliary_loss_mlp": 0.00368992, + "balance_loss_clip": 1.3217485, + "balance_loss_mlp": 0.33353949, + "epoch": 0.18620171351270104, + "flos": 21798782962560.0, + "grad_norm": 43.14234731923158, + "language_loss": 0.82141632, + "learning_rate": 3.7494669927697255e-06, + "loss": 0.84165043, + "num_input_tokens_seen": 66857760, + "router_z_loss_clip": 3.328125, + "router_z_loss_mlp": 0.35424805, + "step": 3097, + "time_per_iteration": 2.595466375350952 + }, + { + "auxiliary_loss_clip": 0.01667801, + "auxiliary_loss_mlp": 0.00388953, + "balance_loss_clip": 1.33644772, + "balance_loss_mlp": 0.35402519, + "epoch": 0.186261836765369, + "flos": 16362877299840.0, + "grad_norm": 10.359216165902295, + "language_loss": 0.73551846, + "learning_rate": 3.749278224802352e-06, + "loss": 0.75608605, + "num_input_tokens_seen": 66876460, + "router_z_loss_clip": 3.31445312, + "router_z_loss_mlp": 0.34912109, + "step": 3098, + "time_per_iteration": 2.566417932510376 + }, + { + "auxiliary_loss_clip": 0.01672423, + "auxiliary_loss_mlp": 0.00425081, + "balance_loss_clip": 1.33346868, + "balance_loss_mlp": 0.38319075, + "epoch": 0.18632196001803697, + "flos": 23370054480000.0, + "grad_norm": 18.66550426789156, + "language_loss": 0.75581026, + "learning_rate": 3.7490893905021733e-06, + "loss": 0.77678531, + "num_input_tokens_seen": 66897960, + "router_z_loss_clip": 3.38867188, + "router_z_loss_mlp": 0.41894531, + "step": 3099, + "time_per_iteration": 4.084285020828247 + }, + { + "auxiliary_loss_clip": 0.01683319, + "auxiliary_loss_mlp": 0.00433986, + "balance_loss_clip": 1.34544551, + "balance_loss_mlp": 0.39469451, + "epoch": 0.18638208327070493, + "flos": 22492002516480.0, + "grad_norm": 4.045755484108542, + "language_loss": 0.78052115, + "learning_rate": 3.7489004898763494e-06, + "loss": 0.80169415, + "num_input_tokens_seen": 66917675, + "router_z_loss_clip": 3.38476562, + "router_z_loss_mlp": 0.39282227, + "step": 3100, + "time_per_iteration": 2.6126701831817627 + }, + { + "auxiliary_loss_clip": 0.01700443, + "auxiliary_loss_mlp": 0.00452558, + "balance_loss_clip": 1.35660517, + "balance_loss_mlp": 0.40983325, + "epoch": 0.18644220652337293, + "flos": 29165245931520.0, + "grad_norm": 286.22405897435686, + "language_loss": 0.85808468, + "learning_rate": 3.7487115229320444e-06, + "loss": 0.87961471, + "num_input_tokens_seen": 66936000, + "router_z_loss_clip": 3.43359375, + "router_z_loss_mlp": 0.42700195, + "step": 3101, + "time_per_iteration": 2.6593501567840576 + }, + { + "auxiliary_loss_clip": 0.01691024, + "auxiliary_loss_mlp": 0.00427232, + "balance_loss_clip": 1.34987283, + "balance_loss_mlp": 0.38677257, + "epoch": 0.1865023297760409, + "flos": 24243796811520.0, + "grad_norm": 41.57034909205917, + "language_loss": 0.81701523, + "learning_rate": 3.7485224896764222e-06, + "loss": 0.83819783, + "num_input_tokens_seen": 66955700, + "router_z_loss_clip": 3.41210938, + "router_z_loss_mlp": 0.40454102, + "step": 3102, + "time_per_iteration": 4.114346742630005 + }, + { + "auxiliary_loss_clip": 0.01686224, + "auxiliary_loss_mlp": 0.0045867, + "balance_loss_clip": 1.34484947, + "balance_loss_mlp": 0.41594553, + "epoch": 0.18656245302870886, + "flos": 19128716449920.0, + "grad_norm": 35.20736077242844, + "language_loss": 0.82689422, + "learning_rate": 3.7483333901166525e-06, + "loss": 0.84834313, + "num_input_tokens_seen": 66972815, + "router_z_loss_clip": 3.41601562, + "router_z_loss_mlp": 0.42749023, + "step": 3103, + "time_per_iteration": 2.6716675758361816 + }, + { + "auxiliary_loss_clip": 0.01700646, + "auxiliary_loss_mlp": 0.00467027, + "balance_loss_clip": 1.35464787, + "balance_loss_mlp": 0.42303836, + "epoch": 0.18662257628137682, + "flos": 17786088956160.0, + "grad_norm": 25.84075737771718, + "language_loss": 0.85729289, + "learning_rate": 3.7481442242599054e-06, + "loss": 0.87896967, + "num_input_tokens_seen": 66992280, + "router_z_loss_clip": 3.46484375, + "router_z_loss_mlp": 0.44018555, + "step": 3104, + "time_per_iteration": 2.567841053009033 + }, + { + "auxiliary_loss_clip": 0.01698843, + "auxiliary_loss_mlp": 0.00485334, + "balance_loss_clip": 1.35364544, + "balance_loss_mlp": 0.44022554, + "epoch": 0.1866826995340448, + "flos": 24024382583040.0, + "grad_norm": 4.352019525951, + "language_loss": 0.9217062, + "learning_rate": 3.747954992113354e-06, + "loss": 0.94354802, + "num_input_tokens_seen": 67012220, + "router_z_loss_clip": 3.453125, + "router_z_loss_mlp": 0.45117188, + "step": 3105, + "time_per_iteration": 2.7265758514404297 + }, + { + "auxiliary_loss_clip": 0.01682993, + "auxiliary_loss_mlp": 0.00488468, + "balance_loss_clip": 1.32703757, + "balance_loss_mlp": 0.44214332, + "epoch": 0.18674282278671275, + "flos": 26141244756480.0, + "grad_norm": 136.91393057563036, + "language_loss": 0.94989121, + "learning_rate": 3.7477656936841742e-06, + "loss": 0.97160578, + "num_input_tokens_seen": 67032030, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 0.46313477, + "step": 3106, + "time_per_iteration": 2.6285946369171143 + }, + { + "auxiliary_loss_clip": 0.01716439, + "auxiliary_loss_mlp": 0.00506679, + "balance_loss_clip": 1.3561151, + "balance_loss_mlp": 0.45944789, + "epoch": 0.18680294603938072, + "flos": 19201938324480.0, + "grad_norm": 255.61588346492675, + "language_loss": 0.84183848, + "learning_rate": 3.7475763289795445e-06, + "loss": 0.86406964, + "num_input_tokens_seen": 67048920, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 0.47216797, + "step": 3107, + "time_per_iteration": 3.9831018447875977 + }, + { + "auxiliary_loss_clip": 0.01703067, + "auxiliary_loss_mlp": 0.00551313, + "balance_loss_clip": 1.34396195, + "balance_loss_mlp": 0.49881315, + "epoch": 0.1868630692920487, + "flos": 28544889116160.0, + "grad_norm": 7.365904566217898, + "language_loss": 0.81877816, + "learning_rate": 3.7473868980066446e-06, + "loss": 0.84132195, + "num_input_tokens_seen": 67068645, + "router_z_loss_clip": 3.58984375, + "router_z_loss_mlp": 0.52490234, + "step": 3108, + "time_per_iteration": 2.6588711738586426 + }, + { + "auxiliary_loss_clip": 0.01718029, + "auxiliary_loss_mlp": 0.0052768, + "balance_loss_clip": 1.35255516, + "balance_loss_mlp": 0.47546685, + "epoch": 0.18692319254471668, + "flos": 17238020261760.0, + "grad_norm": 25.62521045093071, + "language_loss": 0.80681783, + "learning_rate": 3.747197400772658e-06, + "loss": 0.82927495, + "num_input_tokens_seen": 67087075, + "router_z_loss_clip": 3.65429688, + "router_z_loss_mlp": 0.5222168, + "step": 3109, + "time_per_iteration": 2.550187826156616 + }, + { + "auxiliary_loss_clip": 0.01723372, + "auxiliary_loss_mlp": 0.00507979, + "balance_loss_clip": 1.35829604, + "balance_loss_mlp": 0.456862, + "epoch": 0.18698331579738464, + "flos": 23185186156800.0, + "grad_norm": 14.426702453136896, + "language_loss": 0.90136915, + "learning_rate": 3.747007837284772e-06, + "loss": 0.92368269, + "num_input_tokens_seen": 67108040, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 0.51098633, + "step": 3110, + "time_per_iteration": 2.7198309898376465 + }, + { + "auxiliary_loss_clip": 0.0171439, + "auxiliary_loss_mlp": 0.00546005, + "balance_loss_clip": 1.34636474, + "balance_loss_mlp": 0.49469697, + "epoch": 0.1870434390500526, + "flos": 25516721963520.0, + "grad_norm": 13.59083231314776, + "language_loss": 0.89388096, + "learning_rate": 3.7468182075501737e-06, + "loss": 0.91648483, + "num_input_tokens_seen": 67127605, + "router_z_loss_clip": 3.68164062, + "router_z_loss_mlp": 0.51293945, + "step": 3111, + "time_per_iteration": 2.6117324829101562 + }, + { + "auxiliary_loss_clip": 0.01723638, + "auxiliary_loss_mlp": 0.00524833, + "balance_loss_clip": 1.34981394, + "balance_loss_mlp": 0.47505161, + "epoch": 0.18710356230272057, + "flos": 19500823393920.0, + "grad_norm": 7.4288106980061945, + "language_loss": 0.83689344, + "learning_rate": 3.7466285115760536e-06, + "loss": 0.85937822, + "num_input_tokens_seen": 67145785, + "router_z_loss_clip": 3.7421875, + "router_z_loss_mlp": 0.49829102, + "step": 3112, + "time_per_iteration": 2.61617374420166 + }, + { + "auxiliary_loss_clip": 0.01712126, + "auxiliary_loss_mlp": 0.00533277, + "balance_loss_clip": 1.3341136, + "balance_loss_mlp": 0.48521161, + "epoch": 0.18716368555538854, + "flos": 26760847386240.0, + "grad_norm": 10.66984050648729, + "language_loss": 0.71814096, + "learning_rate": 3.7464387493696046e-06, + "loss": 0.74059498, + "num_input_tokens_seen": 67165930, + "router_z_loss_clip": 3.77734375, + "router_z_loss_mlp": 0.48046875, + "step": 3113, + "time_per_iteration": 2.650690793991089 + }, + { + "auxiliary_loss_clip": 0.01734602, + "auxiliary_loss_mlp": 0.00580481, + "balance_loss_clip": 1.34600663, + "balance_loss_mlp": 0.52504909, + "epoch": 0.1872238088080565, + "flos": 25189827264000.0, + "grad_norm": 27.495445565186536, + "language_loss": 0.87814927, + "learning_rate": 3.746248920938024e-06, + "loss": 0.90130013, + "num_input_tokens_seen": 67185830, + "router_z_loss_clip": 3.88085938, + "router_z_loss_mlp": 0.55517578, + "step": 3114, + "time_per_iteration": 2.6658849716186523 + }, + { + "auxiliary_loss_clip": 0.01721143, + "auxiliary_loss_mlp": 0.00608043, + "balance_loss_clip": 1.32654834, + "balance_loss_mlp": 0.54877222, + "epoch": 0.1872839320607245, + "flos": 24134305178880.0, + "grad_norm": 7.214666887042284, + "language_loss": 0.64978838, + "learning_rate": 3.74605902628851e-06, + "loss": 0.67308021, + "num_input_tokens_seen": 67206930, + "router_z_loss_clip": 3.94335938, + "router_z_loss_mlp": 0.59204102, + "step": 3115, + "time_per_iteration": 2.6355226039886475 + }, + { + "auxiliary_loss_clip": 0.01748832, + "auxiliary_loss_mlp": 0.00545443, + "balance_loss_clip": 1.35340738, + "balance_loss_mlp": 0.49396843, + "epoch": 0.18734405531339246, + "flos": 21173793292800.0, + "grad_norm": 5.508442552968976, + "language_loss": 0.77868479, + "learning_rate": 3.745869065428261e-06, + "loss": 0.80162752, + "num_input_tokens_seen": 67226290, + "router_z_loss_clip": 3.95507812, + "router_z_loss_mlp": 0.51513672, + "step": 3116, + "time_per_iteration": 2.6324305534362793 + }, + { + "auxiliary_loss_clip": 0.01751178, + "auxiliary_loss_mlp": 0.0052754, + "balance_loss_clip": 1.34480417, + "balance_loss_mlp": 0.47601774, + "epoch": 0.18740417856606043, + "flos": 17237697039360.0, + "grad_norm": 7.915268138544755, + "language_loss": 0.86085224, + "learning_rate": 3.7456790383644833e-06, + "loss": 0.88363945, + "num_input_tokens_seen": 67244410, + "router_z_loss_clip": 4.0625, + "router_z_loss_mlp": 0.51513672, + "step": 3117, + "time_per_iteration": 2.605804681777954 + }, + { + "auxiliary_loss_clip": 0.01752969, + "auxiliary_loss_mlp": 0.00576902, + "balance_loss_clip": 1.3463378, + "balance_loss_mlp": 0.5221135, + "epoch": 0.1874643018187284, + "flos": 32558049999360.0, + "grad_norm": 40.72247025046098, + "language_loss": 0.88354445, + "learning_rate": 3.745488945104381e-06, + "loss": 0.90684319, + "num_input_tokens_seen": 67264470, + "router_z_loss_clip": 4.0703125, + "router_z_loss_mlp": 0.5480957, + "step": 3118, + "time_per_iteration": 2.7086985111236572 + }, + { + "auxiliary_loss_clip": 0.01715843, + "auxiliary_loss_mlp": 0.00540873, + "balance_loss_clip": 1.3060267, + "balance_loss_mlp": 0.49032873, + "epoch": 0.18752442507139636, + "flos": 23258156636160.0, + "grad_norm": 186.65513159591933, + "language_loss": 0.82141459, + "learning_rate": 3.7452987856551636e-06, + "loss": 0.84398174, + "num_input_tokens_seen": 67284315, + "router_z_loss_clip": 4.09960938, + "router_z_loss_mlp": 0.50585938, + "step": 3119, + "time_per_iteration": 2.73049259185791 + }, + { + "auxiliary_loss_clip": 0.01744478, + "auxiliary_loss_mlp": 0.00524367, + "balance_loss_clip": 1.32349443, + "balance_loss_mlp": 0.47513381, + "epoch": 0.18758454832406432, + "flos": 21760933006080.0, + "grad_norm": 10.159702496633102, + "language_loss": 0.86584151, + "learning_rate": 3.7451085600240406e-06, + "loss": 0.88853002, + "num_input_tokens_seen": 67302780, + "router_z_loss_clip": 4.20703125, + "router_z_loss_mlp": 0.49267578, + "step": 3120, + "time_per_iteration": 2.623866081237793 + }, + { + "auxiliary_loss_clip": 0.01742199, + "auxiliary_loss_mlp": 0.00517546, + "balance_loss_clip": 1.31460452, + "balance_loss_mlp": 0.46750218, + "epoch": 0.1876446715767323, + "flos": 29570210841600.0, + "grad_norm": 6.2013043520552396, + "language_loss": 0.90806007, + "learning_rate": 3.7449182682182263e-06, + "loss": 0.93065751, + "num_input_tokens_seen": 67323405, + "router_z_loss_clip": 4.28125, + "router_z_loss_mlp": 0.5, + "step": 3121, + "time_per_iteration": 2.693110466003418 + }, + { + "auxiliary_loss_clip": 0.01787672, + "auxiliary_loss_mlp": 0.00542846, + "balance_loss_clip": 1.34576845, + "balance_loss_mlp": 0.49184823, + "epoch": 0.18770479482940028, + "flos": 30339992234880.0, + "grad_norm": 31.033603838031425, + "language_loss": 0.78074563, + "learning_rate": 3.744727910244937e-06, + "loss": 0.8040508, + "num_input_tokens_seen": 67345800, + "router_z_loss_clip": 4.421875, + "router_z_loss_mlp": 0.50976562, + "step": 3122, + "time_per_iteration": 2.6971802711486816 + }, + { + "auxiliary_loss_clip": 0.01764462, + "auxiliary_loss_mlp": 0.00530373, + "balance_loss_clip": 1.32902455, + "balance_loss_mlp": 0.48092464, + "epoch": 0.18776491808206824, + "flos": 14465357527680.0, + "grad_norm": 4.321147745477317, + "language_loss": 0.77292508, + "learning_rate": 3.7445374861113905e-06, + "loss": 0.7958734, + "num_input_tokens_seen": 67363575, + "router_z_loss_clip": 4.35546875, + "router_z_loss_mlp": 0.49511719, + "step": 3123, + "time_per_iteration": 2.5785975456237793 + }, + { + "auxiliary_loss_clip": 0.01767734, + "auxiliary_loss_mlp": 0.00488694, + "balance_loss_clip": 1.32811284, + "balance_loss_mlp": 0.44177362, + "epoch": 0.1878250413347362, + "flos": 24498547044480.0, + "grad_norm": 48.374349847369565, + "language_loss": 0.8068195, + "learning_rate": 3.7443469958248066e-06, + "loss": 0.82938373, + "num_input_tokens_seen": 67381765, + "router_z_loss_clip": 4.390625, + "router_z_loss_mlp": 0.46923828, + "step": 3124, + "time_per_iteration": 2.6318585872650146 + }, + { + "auxiliary_loss_clip": 0.01752127, + "auxiliary_loss_mlp": 0.00562982, + "balance_loss_clip": 1.30620027, + "balance_loss_mlp": 0.5112927, + "epoch": 0.18788516458740417, + "flos": 39786185692800.0, + "grad_norm": 15.218568735909521, + "language_loss": 0.86856413, + "learning_rate": 3.7441564393924106e-06, + "loss": 0.89171529, + "num_input_tokens_seen": 67405000, + "router_z_loss_clip": 4.45703125, + "router_z_loss_mlp": 0.51733398, + "step": 3125, + "time_per_iteration": 2.862583637237549 + }, + { + "auxiliary_loss_clip": 0.01939333, + "auxiliary_loss_mlp": 0.00723333, + "balance_loss_clip": 1.38223183, + "balance_loss_mlp": 0.68289757, + "epoch": 0.18794528784007214, + "flos": 64699250664960.0, + "grad_norm": 1.0481687503649295, + "language_loss": 0.63828248, + "learning_rate": 3.7439658168214273e-06, + "loss": 0.66490912, + "num_input_tokens_seen": 67467140, + "router_z_loss_clip": 5.5625, + "router_z_loss_mlp": 0.40429688, + "step": 3126, + "time_per_iteration": 3.166856527328491 + }, + { + "auxiliary_loss_clip": 0.01766415, + "auxiliary_loss_mlp": 0.00525707, + "balance_loss_clip": 1.32589114, + "balance_loss_mlp": 0.47728425, + "epoch": 0.1880054110927401, + "flos": 28622061486720.0, + "grad_norm": 3.6620536353428688, + "language_loss": 0.85888076, + "learning_rate": 3.7437751281190857e-06, + "loss": 0.88180196, + "num_input_tokens_seen": 67487980, + "router_z_loss_clip": 4.4140625, + "router_z_loss_mlp": 0.48413086, + "step": 3127, + "time_per_iteration": 2.7034950256347656 + }, + { + "auxiliary_loss_clip": 0.01933443, + "auxiliary_loss_mlp": 0.00490453, + "balance_loss_clip": 1.35019016, + "balance_loss_mlp": 0.46336871, + "epoch": 0.1880655343454081, + "flos": 64488958490880.0, + "grad_norm": 1.0074141727830415, + "language_loss": 0.61944276, + "learning_rate": 3.7435843732926164e-06, + "loss": 0.6436817, + "num_input_tokens_seen": 67552500, + "router_z_loss_clip": 5.84375, + "router_z_loss_mlp": 0.27148438, + "step": 3128, + "time_per_iteration": 3.1770265102386475 + }, + { + "auxiliary_loss_clip": 0.01735192, + "auxiliary_loss_mlp": 0.00454923, + "balance_loss_clip": 1.31358731, + "balance_loss_mlp": 0.4123894, + "epoch": 0.18812565759807606, + "flos": 32124464928000.0, + "grad_norm": 249.8593586899356, + "language_loss": 0.79409349, + "learning_rate": 3.7433935523492536e-06, + "loss": 0.81599474, + "num_input_tokens_seen": 67573295, + "router_z_loss_clip": 4.2109375, + "router_z_loss_mlp": 0.42553711, + "step": 3129, + "time_per_iteration": 2.7239208221435547 + }, + { + "auxiliary_loss_clip": 0.01733419, + "auxiliary_loss_mlp": 0.00443824, + "balance_loss_clip": 1.32632768, + "balance_loss_mlp": 0.40045533, + "epoch": 0.18818578085074403, + "flos": 20624539449600.0, + "grad_norm": 3.200542563102962, + "language_loss": 0.92112917, + "learning_rate": 3.7432026652962314e-06, + "loss": 0.94290161, + "num_input_tokens_seen": 67590010, + "router_z_loss_clip": 4.06835938, + "router_z_loss_mlp": 0.43383789, + "step": 3130, + "time_per_iteration": 2.598921060562134 + }, + { + "auxiliary_loss_clip": 0.0171425, + "auxiliary_loss_mlp": 0.00416584, + "balance_loss_clip": 1.30755246, + "balance_loss_mlp": 0.37760258, + "epoch": 0.188245904103412, + "flos": 28840506048000.0, + "grad_norm": 5.470956120316693, + "language_loss": 0.82556313, + "learning_rate": 3.7430117121407897e-06, + "loss": 0.8468715, + "num_input_tokens_seen": 67611110, + "router_z_loss_clip": 4.06835938, + "router_z_loss_mlp": 0.38989258, + "step": 3131, + "time_per_iteration": 2.6537117958068848 + }, + { + "auxiliary_loss_clip": 0.01702177, + "auxiliary_loss_mlp": 0.00432302, + "balance_loss_clip": 1.31361103, + "balance_loss_mlp": 0.39251003, + "epoch": 0.18830602735607996, + "flos": 29420319386880.0, + "grad_norm": 27.450373763479938, + "language_loss": 0.88559705, + "learning_rate": 3.74282069289017e-06, + "loss": 0.90694189, + "num_input_tokens_seen": 67631990, + "router_z_loss_clip": 3.88085938, + "router_z_loss_mlp": 0.39819336, + "step": 3132, + "time_per_iteration": 2.756167411804199 + }, + { + "auxiliary_loss_clip": 0.01691329, + "auxiliary_loss_mlp": 0.0046265, + "balance_loss_clip": 1.30717707, + "balance_loss_mlp": 0.41982996, + "epoch": 0.18836615060874792, + "flos": 28872933050880.0, + "grad_norm": 24.080784711086867, + "language_loss": 0.86716044, + "learning_rate": 3.742629607551614e-06, + "loss": 0.88870019, + "num_input_tokens_seen": 67650490, + "router_z_loss_clip": 3.84570312, + "router_z_loss_mlp": 0.42822266, + "step": 3133, + "time_per_iteration": 2.6437807083129883 + }, + { + "auxiliary_loss_clip": 0.01684743, + "auxiliary_loss_mlp": 0.00440287, + "balance_loss_clip": 1.31290007, + "balance_loss_mlp": 0.39961284, + "epoch": 0.18842627386141592, + "flos": 22601673717120.0, + "grad_norm": 2.581375290161202, + "language_loss": 0.876647, + "learning_rate": 3.7424384561323698e-06, + "loss": 0.8978973, + "num_input_tokens_seen": 67668860, + "router_z_loss_clip": 3.71875, + "router_z_loss_mlp": 0.40673828, + "step": 3134, + "time_per_iteration": 2.67865252494812 + }, + { + "auxiliary_loss_clip": 0.01681223, + "auxiliary_loss_mlp": 0.00418853, + "balance_loss_clip": 1.31104696, + "balance_loss_mlp": 0.37918055, + "epoch": 0.18848639711408388, + "flos": 24573600512640.0, + "grad_norm": 15.99826296251864, + "language_loss": 0.87127078, + "learning_rate": 3.742247238639684e-06, + "loss": 0.89227152, + "num_input_tokens_seen": 67690220, + "router_z_loss_clip": 3.70117188, + "router_z_loss_mlp": 0.39697266, + "step": 3135, + "time_per_iteration": 2.6381287574768066 + }, + { + "auxiliary_loss_clip": 0.01684208, + "auxiliary_loss_mlp": 0.00389484, + "balance_loss_clip": 1.32177138, + "balance_loss_mlp": 0.35486591, + "epoch": 0.18854652036675185, + "flos": 34166920078080.0, + "grad_norm": 4.954205564589724, + "language_loss": 0.84542209, + "learning_rate": 3.7420559550808083e-06, + "loss": 0.86615896, + "num_input_tokens_seen": 67709820, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 0.34643555, + "step": 3136, + "time_per_iteration": 2.7060134410858154 + }, + { + "auxiliary_loss_clip": 0.0169789, + "auxiliary_loss_mlp": 0.00448964, + "balance_loss_clip": 1.33808458, + "balance_loss_mlp": 0.4105069, + "epoch": 0.1886066436194198, + "flos": 24200236592640.0, + "grad_norm": 7.025106882528393, + "language_loss": 0.88601649, + "learning_rate": 3.741864605462996e-06, + "loss": 0.90748501, + "num_input_tokens_seen": 67729490, + "router_z_loss_clip": 3.60351562, + "router_z_loss_mlp": 0.38452148, + "step": 3137, + "time_per_iteration": 2.60099720954895 + }, + { + "auxiliary_loss_clip": 0.01712367, + "auxiliary_loss_mlp": 0.00466973, + "balance_loss_clip": 1.35512495, + "balance_loss_mlp": 0.42954099, + "epoch": 0.18866676687208778, + "flos": 21251109317760.0, + "grad_norm": 4.566958245055196, + "language_loss": 0.85802186, + "learning_rate": 3.741673189793504e-06, + "loss": 0.87981522, + "num_input_tokens_seen": 67749665, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 0.37451172, + "step": 3138, + "time_per_iteration": 2.6089391708374023 + }, + { + "auxiliary_loss_clip": 0.0169109, + "auxiliary_loss_mlp": 0.00439093, + "balance_loss_clip": 1.33560061, + "balance_loss_mlp": 0.40066001, + "epoch": 0.18872689012475574, + "flos": 37308673013760.0, + "grad_norm": 13.546352358803889, + "language_loss": 0.70658588, + "learning_rate": 3.7414817080795896e-06, + "loss": 0.72788775, + "num_input_tokens_seen": 67776230, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 0.38427734, + "step": 3139, + "time_per_iteration": 2.758195638656616 + }, + { + "auxiliary_loss_clip": 0.01672774, + "auxiliary_loss_mlp": 0.00460229, + "balance_loss_clip": 1.32042575, + "balance_loss_mlp": 0.42174792, + "epoch": 0.1887870133774237, + "flos": 21652303299840.0, + "grad_norm": 91.98992531998626, + "language_loss": 0.77013695, + "learning_rate": 3.741290160328514e-06, + "loss": 0.79146701, + "num_input_tokens_seen": 67795080, + "router_z_loss_clip": 3.52148438, + "router_z_loss_mlp": 0.38500977, + "step": 3140, + "time_per_iteration": 2.5791168212890625 + }, + { + "auxiliary_loss_clip": 0.01659691, + "auxiliary_loss_mlp": 0.00432422, + "balance_loss_clip": 1.30910778, + "balance_loss_mlp": 0.39572984, + "epoch": 0.1888471366300917, + "flos": 15924659374080.0, + "grad_norm": 14.615255164886163, + "language_loss": 0.94353902, + "learning_rate": 3.7410985465475412e-06, + "loss": 0.96446013, + "num_input_tokens_seen": 67813110, + "router_z_loss_clip": 3.50585938, + "router_z_loss_mlp": 0.36645508, + "step": 3141, + "time_per_iteration": 3.9974100589752197 + }, + { + "auxiliary_loss_clip": 0.0168594, + "auxiliary_loss_mlp": 0.00455319, + "balance_loss_clip": 1.33716142, + "balance_loss_mlp": 0.4157179, + "epoch": 0.18890725988275966, + "flos": 18551955767040.0, + "grad_norm": 49.767196238323, + "language_loss": 0.82598871, + "learning_rate": 3.7409068667439378e-06, + "loss": 0.84740138, + "num_input_tokens_seen": 67831070, + "router_z_loss_clip": 3.48632812, + "router_z_loss_mlp": 0.39575195, + "step": 3142, + "time_per_iteration": 2.5662922859191895 + }, + { + "auxiliary_loss_clip": 0.01699314, + "auxiliary_loss_mlp": 0.00487268, + "balance_loss_clip": 1.35408139, + "balance_loss_mlp": 0.45255435, + "epoch": 0.18896738313542763, + "flos": 28840865184000.0, + "grad_norm": 60.26195208321762, + "language_loss": 0.84146488, + "learning_rate": 3.740715120924971e-06, + "loss": 0.86333066, + "num_input_tokens_seen": 67852170, + "router_z_loss_clip": 3.45117188, + "router_z_loss_mlp": 0.34716797, + "step": 3143, + "time_per_iteration": 2.6825618743896484 + }, + { + "auxiliary_loss_clip": 0.01682691, + "auxiliary_loss_mlp": 0.00458, + "balance_loss_clip": 1.33822024, + "balance_loss_mlp": 0.42173609, + "epoch": 0.1890275063880956, + "flos": 22412747157120.0, + "grad_norm": 5.944887681619039, + "language_loss": 0.777179, + "learning_rate": 3.740523309097912e-06, + "loss": 0.79858589, + "num_input_tokens_seen": 67869945, + "router_z_loss_clip": 3.4453125, + "router_z_loss_mlp": 0.36254883, + "step": 3144, + "time_per_iteration": 4.012053728103638 + }, + { + "auxiliary_loss_clip": 0.01685102, + "auxiliary_loss_mlp": 0.00424817, + "balance_loss_clip": 1.34271133, + "balance_loss_mlp": 0.38793316, + "epoch": 0.18908762964076356, + "flos": 24243904552320.0, + "grad_norm": 358.6020331637659, + "language_loss": 0.82951605, + "learning_rate": 3.7403314312700356e-06, + "loss": 0.8506152, + "num_input_tokens_seen": 67890240, + "router_z_loss_clip": 3.42382812, + "router_z_loss_mlp": 0.36865234, + "step": 3145, + "time_per_iteration": 2.6142802238464355 + }, + { + "auxiliary_loss_clip": 0.01684259, + "auxiliary_loss_mlp": 0.0045995, + "balance_loss_clip": 1.34727931, + "balance_loss_mlp": 0.4238776, + "epoch": 0.18914775289343153, + "flos": 16982910892800.0, + "grad_norm": 24.256245211400067, + "language_loss": 0.84361869, + "learning_rate": 3.740139487448616e-06, + "loss": 0.86506081, + "num_input_tokens_seen": 67907825, + "router_z_loss_clip": 3.36523438, + "router_z_loss_mlp": 0.36108398, + "step": 3146, + "time_per_iteration": 2.5728037357330322 + }, + { + "auxiliary_loss_clip": 0.0169326, + "auxiliary_loss_mlp": 0.00451143, + "balance_loss_clip": 1.35486603, + "balance_loss_mlp": 0.41475987, + "epoch": 0.1892078761460995, + "flos": 21543781334400.0, + "grad_norm": 8.667830695849041, + "language_loss": 0.85222363, + "learning_rate": 3.7399474776409326e-06, + "loss": 0.87366766, + "num_input_tokens_seen": 67926670, + "router_z_loss_clip": 3.38085938, + "router_z_loss_mlp": 0.36401367, + "step": 3147, + "time_per_iteration": 2.6194007396698 + }, + { + "auxiliary_loss_clip": 0.01688237, + "auxiliary_loss_mlp": 0.00493153, + "balance_loss_clip": 1.35143888, + "balance_loss_mlp": 0.45569712, + "epoch": 0.18926799939876748, + "flos": 23001538896000.0, + "grad_norm": 34.37827438415028, + "language_loss": 0.73301387, + "learning_rate": 3.739755401854267e-06, + "loss": 0.75482774, + "num_input_tokens_seen": 67943645, + "router_z_loss_clip": 3.36914062, + "router_z_loss_mlp": 0.37475586, + "step": 3148, + "time_per_iteration": 2.6175315380096436 + }, + { + "auxiliary_loss_clip": 0.01682207, + "auxiliary_loss_mlp": 0.00494901, + "balance_loss_clip": 1.34355259, + "balance_loss_mlp": 0.45601457, + "epoch": 0.18932812265143545, + "flos": 22273019251200.0, + "grad_norm": 7.522618279025313, + "language_loss": 0.81988382, + "learning_rate": 3.739563260095902e-06, + "loss": 0.8416549, + "num_input_tokens_seen": 67962345, + "router_z_loss_clip": 3.3828125, + "router_z_loss_mlp": 0.38891602, + "step": 3149, + "time_per_iteration": 4.02577543258667 + }, + { + "auxiliary_loss_clip": 0.01682246, + "auxiliary_loss_mlp": 0.00458202, + "balance_loss_clip": 1.34741235, + "balance_loss_mlp": 0.42360789, + "epoch": 0.1893882459041034, + "flos": 18624423456000.0, + "grad_norm": 17.840704409322555, + "language_loss": 0.88037741, + "learning_rate": 3.7393710523731245e-06, + "loss": 0.90178192, + "num_input_tokens_seen": 67979760, + "router_z_loss_clip": 3.3515625, + "router_z_loss_mlp": 0.34594727, + "step": 3150, + "time_per_iteration": 2.6240153312683105 + }, + { + "auxiliary_loss_clip": 0.01692824, + "auxiliary_loss_mlp": 0.00477347, + "balance_loss_clip": 1.35719061, + "balance_loss_mlp": 0.44105953, + "epoch": 0.18944836915677138, + "flos": 22892981016960.0, + "grad_norm": 4.225318384464344, + "language_loss": 0.91111046, + "learning_rate": 3.7391787786932215e-06, + "loss": 0.93281221, + "num_input_tokens_seen": 67996895, + "router_z_loss_clip": 3.359375, + "router_z_loss_mlp": 0.36230469, + "step": 3151, + "time_per_iteration": 2.6164162158966064 + }, + { + "auxiliary_loss_clip": 0.01726362, + "auxiliary_loss_mlp": 0.00470049, + "balance_loss_clip": 1.38360023, + "balance_loss_mlp": 0.4327361, + "epoch": 0.18950849240943934, + "flos": 26796542526720.0, + "grad_norm": 5.1562454968299285, + "language_loss": 0.80787349, + "learning_rate": 3.7389864390634857e-06, + "loss": 0.82983756, + "num_input_tokens_seen": 68018365, + "router_z_loss_clip": 3.42773438, + "router_z_loss_mlp": 0.37304688, + "step": 3152, + "time_per_iteration": 2.623049020767212 + }, + { + "auxiliary_loss_clip": 0.01711428, + "auxiliary_loss_mlp": 0.00472568, + "balance_loss_clip": 1.37572014, + "balance_loss_mlp": 0.43499294, + "epoch": 0.1895686156621073, + "flos": 24971239048320.0, + "grad_norm": 119.27796595724571, + "language_loss": 0.81564045, + "learning_rate": 3.738794033491209e-06, + "loss": 0.83748049, + "num_input_tokens_seen": 68037985, + "router_z_loss_clip": 3.35742188, + "router_z_loss_mlp": 0.37573242, + "step": 3153, + "time_per_iteration": 2.6324825286865234 + }, + { + "auxiliary_loss_clip": 0.01699784, + "auxiliary_loss_mlp": 0.00488908, + "balance_loss_clip": 1.36432338, + "balance_loss_mlp": 0.45176214, + "epoch": 0.1896287389147753, + "flos": 21944544353280.0, + "grad_norm": 10.716313949959078, + "language_loss": 0.85321635, + "learning_rate": 3.7386015619836887e-06, + "loss": 0.87510324, + "num_input_tokens_seen": 68057975, + "router_z_loss_clip": 3.35546875, + "router_z_loss_mlp": 0.37158203, + "step": 3154, + "time_per_iteration": 2.633535623550415 + }, + { + "auxiliary_loss_clip": 0.01719398, + "auxiliary_loss_mlp": 0.00499323, + "balance_loss_clip": 1.38208508, + "balance_loss_mlp": 0.45945951, + "epoch": 0.18968886216744327, + "flos": 18179058723840.0, + "grad_norm": 10.827747172857457, + "language_loss": 0.79158169, + "learning_rate": 3.738409024548223e-06, + "loss": 0.81376886, + "num_input_tokens_seen": 68074175, + "router_z_loss_clip": 3.37109375, + "router_z_loss_mlp": 0.39892578, + "step": 3155, + "time_per_iteration": 2.5578527450561523 + }, + { + "auxiliary_loss_clip": 0.01706976, + "auxiliary_loss_mlp": 0.00481855, + "balance_loss_clip": 1.37100351, + "balance_loss_mlp": 0.44361269, + "epoch": 0.18974898542011123, + "flos": 20412487509120.0, + "grad_norm": 11.8894964010461, + "language_loss": 0.79751754, + "learning_rate": 3.7382164211921136e-06, + "loss": 0.81940585, + "num_input_tokens_seen": 68095230, + "router_z_loss_clip": 3.359375, + "router_z_loss_mlp": 0.38208008, + "step": 3156, + "time_per_iteration": 2.603217363357544 + }, + { + "auxiliary_loss_clip": 0.01682323, + "auxiliary_loss_mlp": 0.00507131, + "balance_loss_clip": 1.35021579, + "balance_loss_mlp": 0.46917507, + "epoch": 0.1898091086727792, + "flos": 23985024255360.0, + "grad_norm": 6.864596717716565, + "language_loss": 0.73516095, + "learning_rate": 3.7380237519226623e-06, + "loss": 0.75705552, + "num_input_tokens_seen": 68113805, + "router_z_loss_clip": 3.3203125, + "router_z_loss_mlp": 0.37939453, + "step": 3157, + "time_per_iteration": 2.6270220279693604 + }, + { + "auxiliary_loss_clip": 0.01734374, + "auxiliary_loss_mlp": 0.00545288, + "balance_loss_clip": 1.39743471, + "balance_loss_mlp": 0.50346923, + "epoch": 0.18986923192544716, + "flos": 27637067756160.0, + "grad_norm": 14.539530760746285, + "language_loss": 0.87216377, + "learning_rate": 3.737831016747176e-06, + "loss": 0.8949604, + "num_input_tokens_seen": 68133190, + "router_z_loss_clip": 3.37109375, + "router_z_loss_mlp": 0.41821289, + "step": 3158, + "time_per_iteration": 2.636578321456909 + }, + { + "auxiliary_loss_clip": 0.01724401, + "auxiliary_loss_mlp": 0.00569128, + "balance_loss_clip": 1.38515735, + "balance_loss_mlp": 0.52687973, + "epoch": 0.18992935517811513, + "flos": 25484151306240.0, + "grad_norm": 80.11180810353741, + "language_loss": 0.78750932, + "learning_rate": 3.737638215672964e-06, + "loss": 0.81044465, + "num_input_tokens_seen": 68152330, + "router_z_loss_clip": 3.390625, + "router_z_loss_mlp": 0.42260742, + "step": 3159, + "time_per_iteration": 2.6683781147003174 + }, + { + "auxiliary_loss_clip": 0.01742414, + "auxiliary_loss_mlp": 0.00582249, + "balance_loss_clip": 1.4096148, + "balance_loss_mlp": 0.53678286, + "epoch": 0.1899894784307831, + "flos": 17420805596160.0, + "grad_norm": 22.18055371203775, + "language_loss": 0.92426836, + "learning_rate": 3.7374453487073366e-06, + "loss": 0.94751501, + "num_input_tokens_seen": 68170185, + "router_z_loss_clip": 3.33007812, + "router_z_loss_mlp": 0.45458984, + "step": 3160, + "time_per_iteration": 2.5529603958129883 + }, + { + "auxiliary_loss_clip": 0.01728278, + "auxiliary_loss_mlp": 0.00513645, + "balance_loss_clip": 1.39832497, + "balance_loss_mlp": 0.47661859, + "epoch": 0.19004960168345109, + "flos": 27492240119040.0, + "grad_norm": 21.50903482356712, + "language_loss": 0.78881133, + "learning_rate": 3.7372524158576074e-06, + "loss": 0.81123054, + "num_input_tokens_seen": 68191665, + "router_z_loss_clip": 3.296875, + "router_z_loss_mlp": 0.37011719, + "step": 3161, + "time_per_iteration": 2.6545088291168213 + }, + { + "auxiliary_loss_clip": 0.01721999, + "auxiliary_loss_mlp": 0.00592185, + "balance_loss_clip": 1.39157426, + "balance_loss_mlp": 0.54810095, + "epoch": 0.19010972493611905, + "flos": 38654676385920.0, + "grad_norm": 3.244544897909727, + "language_loss": 0.85762691, + "learning_rate": 3.7370594171310926e-06, + "loss": 0.88076878, + "num_input_tokens_seen": 68214635, + "router_z_loss_clip": 3.3046875, + "router_z_loss_mlp": 0.44116211, + "step": 3162, + "time_per_iteration": 2.720372200012207 + }, + { + "auxiliary_loss_clip": 0.017322, + "auxiliary_loss_mlp": 0.00565331, + "balance_loss_clip": 1.39665806, + "balance_loss_mlp": 0.52267808, + "epoch": 0.19016984818878702, + "flos": 19244744357760.0, + "grad_norm": 35.066218610506546, + "language_loss": 0.82353741, + "learning_rate": 3.73686635253511e-06, + "loss": 0.84651268, + "num_input_tokens_seen": 68232150, + "router_z_loss_clip": 3.35546875, + "router_z_loss_mlp": 0.42675781, + "step": 3163, + "time_per_iteration": 2.568563938140869 + }, + { + "auxiliary_loss_clip": 0.017303, + "auxiliary_loss_mlp": 0.00540716, + "balance_loss_clip": 1.39999807, + "balance_loss_mlp": 0.50082821, + "epoch": 0.19022997144145498, + "flos": 37596891744000.0, + "grad_norm": 24.98464141287195, + "language_loss": 0.79829139, + "learning_rate": 3.736673222076982e-06, + "loss": 0.82100153, + "num_input_tokens_seen": 68253370, + "router_z_loss_clip": 3.3046875, + "router_z_loss_mlp": 0.39868164, + "step": 3164, + "time_per_iteration": 2.7046163082122803 + }, + { + "auxiliary_loss_clip": 0.01737076, + "auxiliary_loss_mlp": 0.0054128, + "balance_loss_clip": 1.40142763, + "balance_loss_mlp": 0.50155962, + "epoch": 0.19029009469412295, + "flos": 61530921665280.0, + "grad_norm": 110.19614337506819, + "language_loss": 0.72108704, + "learning_rate": 3.7364800257640313e-06, + "loss": 0.74387056, + "num_input_tokens_seen": 68278895, + "router_z_loss_clip": 3.35351562, + "router_z_loss_mlp": 0.39697266, + "step": 3165, + "time_per_iteration": 2.9933810234069824 + }, + { + "auxiliary_loss_clip": 0.01736881, + "auxiliary_loss_mlp": 0.00582006, + "balance_loss_clip": 1.40213513, + "balance_loss_mlp": 0.53916168, + "epoch": 0.1903502179467909, + "flos": 13954851480960.0, + "grad_norm": 20.500342837392825, + "language_loss": 0.81278324, + "learning_rate": 3.7362867636035835e-06, + "loss": 0.83597219, + "num_input_tokens_seen": 68294880, + "router_z_loss_clip": 3.34765625, + "router_z_loss_mlp": 0.4284668, + "step": 3166, + "time_per_iteration": 2.6156959533691406 + }, + { + "auxiliary_loss_clip": 0.01700595, + "auxiliary_loss_mlp": 0.00305785, + "balance_loss_clip": 1.38142514, + "balance_loss_mlp": 0.29195657, + "epoch": 0.1904103411994589, + "flos": 66899641916160.0, + "grad_norm": 0.8115192165274928, + "language_loss": 0.50529623, + "learning_rate": 3.736093435602968e-06, + "loss": 0.52535999, + "num_input_tokens_seen": 68359665, + "router_z_loss_clip": 3.1875, + "router_z_loss_mlp": 0.13867188, + "step": 3167, + "time_per_iteration": 3.141352891921997 + }, + { + "auxiliary_loss_clip": 0.01699917, + "auxiliary_loss_mlp": 0.00507817, + "balance_loss_clip": 1.3670541, + "balance_loss_mlp": 0.47057557, + "epoch": 0.19047046445212687, + "flos": 21908741472000.0, + "grad_norm": 3.4659399671662197, + "language_loss": 0.79691911, + "learning_rate": 3.7359000417695156e-06, + "loss": 0.81899655, + "num_input_tokens_seen": 68378950, + "router_z_loss_clip": 3.33007812, + "router_z_loss_mlp": 0.37207031, + "step": 3168, + "time_per_iteration": 2.6150496006011963 + }, + { + "auxiliary_loss_clip": 0.01656833, + "auxiliary_loss_mlp": 0.00325207, + "balance_loss_clip": 1.34073234, + "balance_loss_mlp": 0.30947137, + "epoch": 0.19053058770479483, + "flos": 59255156701440.0, + "grad_norm": 0.8571464627917788, + "language_loss": 0.60157406, + "learning_rate": 3.73570658211056e-06, + "loss": 0.62139446, + "num_input_tokens_seen": 68434235, + "router_z_loss_clip": 3.15625, + "router_z_loss_mlp": 0.15722656, + "step": 3169, + "time_per_iteration": 3.0184872150421143 + }, + { + "auxiliary_loss_clip": 0.01698343, + "auxiliary_loss_mlp": 0.00585109, + "balance_loss_clip": 1.35985923, + "balance_loss_mlp": 0.54543638, + "epoch": 0.1905907109574628, + "flos": 23951304362880.0, + "grad_norm": 154.06182242227857, + "language_loss": 0.84933531, + "learning_rate": 3.735513056633436e-06, + "loss": 0.87216985, + "num_input_tokens_seen": 68453830, + "router_z_loss_clip": 3.37890625, + "router_z_loss_mlp": 0.39672852, + "step": 3170, + "time_per_iteration": 2.661673069000244 + }, + { + "auxiliary_loss_clip": 0.01680322, + "auxiliary_loss_mlp": 0.00512149, + "balance_loss_clip": 1.3440311, + "balance_loss_mlp": 0.47517067, + "epoch": 0.19065083421013077, + "flos": 20812316774400.0, + "grad_norm": 133.88367285757852, + "language_loss": 0.83186501, + "learning_rate": 3.7353194653454834e-06, + "loss": 0.85378969, + "num_input_tokens_seen": 68473005, + "router_z_loss_clip": 3.36523438, + "router_z_loss_mlp": 0.36938477, + "step": 3171, + "time_per_iteration": 2.578706979751587 + }, + { + "auxiliary_loss_clip": 0.01664233, + "auxiliary_loss_mlp": 0.00523108, + "balance_loss_clip": 1.32850647, + "balance_loss_mlp": 0.48319691, + "epoch": 0.19071095746279873, + "flos": 31284981192960.0, + "grad_norm": 9.470600559681218, + "language_loss": 0.86916053, + "learning_rate": 3.7351258082540426e-06, + "loss": 0.89103401, + "num_input_tokens_seen": 68493470, + "router_z_loss_clip": 3.35742188, + "router_z_loss_mlp": 0.39916992, + "step": 3172, + "time_per_iteration": 2.707645893096924 + }, + { + "auxiliary_loss_clip": 0.01662084, + "auxiliary_loss_mlp": 0.00467493, + "balance_loss_clip": 1.32937312, + "balance_loss_mlp": 0.43256474, + "epoch": 0.1907710807154667, + "flos": 14356117290240.0, + "grad_norm": 7.421585674796623, + "language_loss": 0.86654681, + "learning_rate": 3.7349320853664576e-06, + "loss": 0.88784254, + "num_input_tokens_seen": 68511290, + "router_z_loss_clip": 3.32617188, + "router_z_loss_mlp": 0.34936523, + "step": 3173, + "time_per_iteration": 2.573373556137085 + }, + { + "auxiliary_loss_clip": 0.01643711, + "auxiliary_loss_mlp": 0.0047709, + "balance_loss_clip": 1.31029963, + "balance_loss_mlp": 0.44027787, + "epoch": 0.1908312039681347, + "flos": 26907039740160.0, + "grad_norm": 3.257679032550108, + "language_loss": 0.85263026, + "learning_rate": 3.7347382966900735e-06, + "loss": 0.87383819, + "num_input_tokens_seen": 68532575, + "router_z_loss_clip": 3.3359375, + "router_z_loss_mlp": 0.36816406, + "step": 3174, + "time_per_iteration": 2.6593265533447266 + }, + { + "auxiliary_loss_clip": 0.01657578, + "auxiliary_loss_mlp": 0.00445548, + "balance_loss_clip": 1.32520199, + "balance_loss_mlp": 0.41140658, + "epoch": 0.19089132722080265, + "flos": 14494695960960.0, + "grad_norm": 54.74568498638887, + "language_loss": 0.87250876, + "learning_rate": 3.7345444422322395e-06, + "loss": 0.89354002, + "num_input_tokens_seen": 68548760, + "router_z_loss_clip": 3.3203125, + "router_z_loss_mlp": 0.34155273, + "step": 3175, + "time_per_iteration": 2.61513352394104 + }, + { + "auxiliary_loss_clip": 0.01656494, + "auxiliary_loss_mlp": 0.00486098, + "balance_loss_clip": 1.32302237, + "balance_loss_mlp": 0.44821298, + "epoch": 0.19095145047347062, + "flos": 13952876232960.0, + "grad_norm": 5.834205291434537, + "language_loss": 0.93476737, + "learning_rate": 3.7343505220003067e-06, + "loss": 0.95619321, + "num_input_tokens_seen": 68563100, + "router_z_loss_clip": 3.33398438, + "router_z_loss_mlp": 0.37890625, + "step": 3176, + "time_per_iteration": 2.5750370025634766 + }, + { + "auxiliary_loss_clip": 0.01643382, + "auxiliary_loss_mlp": 0.00453435, + "balance_loss_clip": 1.3091687, + "balance_loss_mlp": 0.41679049, + "epoch": 0.19101157372613858, + "flos": 25301832848640.0, + "grad_norm": 3.8591835531981067, + "language_loss": 0.87430769, + "learning_rate": 3.7341565360016285e-06, + "loss": 0.89527595, + "num_input_tokens_seen": 68581650, + "router_z_loss_clip": 3.34375, + "router_z_loss_mlp": 0.36621094, + "step": 3177, + "time_per_iteration": 2.634519100189209 + }, + { + "auxiliary_loss_clip": 0.01644322, + "auxiliary_loss_mlp": 0.0044323, + "balance_loss_clip": 1.31537127, + "balance_loss_mlp": 0.4094218, + "epoch": 0.19107169697880655, + "flos": 20558212986240.0, + "grad_norm": 22.533891966428826, + "language_loss": 0.8448416, + "learning_rate": 3.73396248424356e-06, + "loss": 0.86571711, + "num_input_tokens_seen": 68600360, + "router_z_loss_clip": 3.28710938, + "router_z_loss_mlp": 0.33862305, + "step": 3178, + "time_per_iteration": 2.625331163406372 + }, + { + "auxiliary_loss_clip": 0.01640721, + "auxiliary_loss_mlp": 0.00424712, + "balance_loss_clip": 1.31341505, + "balance_loss_mlp": 0.39104712, + "epoch": 0.19113182023147451, + "flos": 22163204396160.0, + "grad_norm": 13.038697754679694, + "language_loss": 0.87683761, + "learning_rate": 3.7337683667334606e-06, + "loss": 0.89749193, + "num_input_tokens_seen": 68617885, + "router_z_loss_clip": 3.26953125, + "router_z_loss_mlp": 0.33666992, + "step": 3179, + "time_per_iteration": 2.6003575325012207 + }, + { + "auxiliary_loss_clip": 0.01657215, + "auxiliary_loss_mlp": 0.00386645, + "balance_loss_clip": 1.32963037, + "balance_loss_mlp": 0.35610369, + "epoch": 0.19119194348414248, + "flos": 18581796990720.0, + "grad_norm": 3.491195817568203, + "language_loss": 0.89037251, + "learning_rate": 3.733574183478691e-06, + "loss": 0.91081113, + "num_input_tokens_seen": 68634550, + "router_z_loss_clip": 3.27734375, + "router_z_loss_mlp": 0.30517578, + "step": 3180, + "time_per_iteration": 2.589942216873169 + }, + { + "auxiliary_loss_clip": 0.01637488, + "auxiliary_loss_mlp": 0.00402802, + "balance_loss_clip": 1.30825853, + "balance_loss_mlp": 0.37023354, + "epoch": 0.19125206673681047, + "flos": 19026623018880.0, + "grad_norm": 4.0711558769185965, + "language_loss": 0.85761929, + "learning_rate": 3.733379934486615e-06, + "loss": 0.87802219, + "num_input_tokens_seen": 68651895, + "router_z_loss_clip": 3.29101562, + "router_z_loss_mlp": 0.32556152, + "step": 3181, + "time_per_iteration": 2.6085383892059326 + }, + { + "auxiliary_loss_clip": 0.01653187, + "auxiliary_loss_mlp": 0.00387023, + "balance_loss_clip": 1.32197189, + "balance_loss_mlp": 0.35753012, + "epoch": 0.19131218998947844, + "flos": 21690153256320.0, + "grad_norm": 128.23821637245732, + "language_loss": 0.80774218, + "learning_rate": 3.7331856197645973e-06, + "loss": 0.82814425, + "num_input_tokens_seen": 68671500, + "router_z_loss_clip": 3.31054688, + "router_z_loss_mlp": 0.29516602, + "step": 3182, + "time_per_iteration": 2.5882487297058105 + }, + { + "auxiliary_loss_clip": 0.01628729, + "auxiliary_loss_mlp": 0.00384122, + "balance_loss_clip": 1.3030479, + "balance_loss_mlp": 0.35353342, + "epoch": 0.1913723132421464, + "flos": 18442500048000.0, + "grad_norm": 2.06982825414926, + "language_loss": 0.70666963, + "learning_rate": 3.7329912393200084e-06, + "loss": 0.72679818, + "num_input_tokens_seen": 68690570, + "router_z_loss_clip": 3.25585938, + "router_z_loss_mlp": 0.30578613, + "step": 3183, + "time_per_iteration": 3.947877883911133 + }, + { + "auxiliary_loss_clip": 0.01623501, + "auxiliary_loss_mlp": 0.00383398, + "balance_loss_clip": 1.29447615, + "balance_loss_mlp": 0.35056737, + "epoch": 0.19143243649481437, + "flos": 27160102033920.0, + "grad_norm": 2.821852268310014, + "language_loss": 0.7835089, + "learning_rate": 3.7327967931602173e-06, + "loss": 0.8035779, + "num_input_tokens_seen": 68709735, + "router_z_loss_clip": 3.29101562, + "router_z_loss_mlp": 0.32836914, + "step": 3184, + "time_per_iteration": 2.613609790802002 + }, + { + "auxiliary_loss_clip": 0.01632295, + "auxiliary_loss_mlp": 0.00338942, + "balance_loss_clip": 1.30450284, + "balance_loss_mlp": 0.30735135, + "epoch": 0.19149255974748233, + "flos": 21718952985600.0, + "grad_norm": 4.244454214301981, + "language_loss": 0.95121968, + "learning_rate": 3.732602281292598e-06, + "loss": 0.97093201, + "num_input_tokens_seen": 68727565, + "router_z_loss_clip": 3.28125, + "router_z_loss_mlp": 0.31591797, + "step": 3185, + "time_per_iteration": 2.5876574516296387 + }, + { + "auxiliary_loss_clip": 0.01604428, + "auxiliary_loss_mlp": 0.0031529, + "balance_loss_clip": 1.27808881, + "balance_loss_mlp": 0.28729957, + "epoch": 0.1915526830001503, + "flos": 22963293889920.0, + "grad_norm": 2.2611775775228935, + "language_loss": 0.7927134, + "learning_rate": 3.7324077037245267e-06, + "loss": 0.81191051, + "num_input_tokens_seen": 68748110, + "router_z_loss_clip": 3.26367188, + "router_z_loss_mlp": 0.27966309, + "step": 3186, + "time_per_iteration": 4.122814178466797 + }, + { + "auxiliary_loss_clip": 0.01608653, + "auxiliary_loss_mlp": 0.00294895, + "balance_loss_clip": 1.28335381, + "balance_loss_mlp": 0.26711908, + "epoch": 0.1916128062528183, + "flos": 26140741966080.0, + "grad_norm": 20.10331539568603, + "language_loss": 0.91689736, + "learning_rate": 3.7322130604633825e-06, + "loss": 0.93593287, + "num_input_tokens_seen": 68769765, + "router_z_loss_clip": 3.25195312, + "router_z_loss_mlp": 0.27807617, + "step": 3187, + "time_per_iteration": 2.615020275115967 + }, + { + "auxiliary_loss_clip": 0.01625526, + "auxiliary_loss_mlp": 0.00149605, + "balance_loss_clip": 1.28951526, + "balance_loss_mlp": 0.14016354, + "epoch": 0.19167292950548626, + "flos": 54925767457920.0, + "grad_norm": 0.9244426911560535, + "language_loss": 0.5581308, + "learning_rate": 3.732018351516544e-06, + "loss": 0.57588208, + "num_input_tokens_seen": 68826815, + "router_z_loss_clip": 3.359375, + "router_z_loss_mlp": 0.09423828, + "step": 3188, + "time_per_iteration": 3.146901845932007 + }, + { + "auxiliary_loss_clip": 0.01578909, + "auxiliary_loss_mlp": 0.0032034, + "balance_loss_clip": 1.25893736, + "balance_loss_mlp": 0.29158688, + "epoch": 0.19173305275815422, + "flos": 29935601942400.0, + "grad_norm": 6.987586508764683, + "language_loss": 0.77237087, + "learning_rate": 3.731823576891397e-06, + "loss": 0.79136336, + "num_input_tokens_seen": 68847585, + "router_z_loss_clip": 3.19726562, + "router_z_loss_mlp": 0.28771973, + "step": 3189, + "time_per_iteration": 2.675635576248169 + }, + { + "auxiliary_loss_clip": 0.01586177, + "auxiliary_loss_mlp": 0.00250596, + "balance_loss_clip": 1.26246405, + "balance_loss_mlp": 0.22519284, + "epoch": 0.1917931760108222, + "flos": 24752471264640.0, + "grad_norm": 5.529098131234571, + "language_loss": 0.80960274, + "learning_rate": 3.7316287365953266e-06, + "loss": 0.8279705, + "num_input_tokens_seen": 68866620, + "router_z_loss_clip": 3.24023438, + "router_z_loss_mlp": 0.2545166, + "step": 3190, + "time_per_iteration": 2.646608352661133 + }, + { + "auxiliary_loss_clip": 0.01589356, + "auxiliary_loss_mlp": 0.00283156, + "balance_loss_clip": 1.26543069, + "balance_loss_mlp": 0.25727558, + "epoch": 0.19185329926349015, + "flos": 18843550375680.0, + "grad_norm": 11.718236291407097, + "language_loss": 0.93292677, + "learning_rate": 3.73143383063572e-06, + "loss": 0.95165187, + "num_input_tokens_seen": 68885515, + "router_z_loss_clip": 3.23828125, + "router_z_loss_mlp": 0.2590332, + "step": 3191, + "time_per_iteration": 2.563753843307495 + }, + { + "auxiliary_loss_clip": 0.01580256, + "auxiliary_loss_mlp": 0.00286726, + "balance_loss_clip": 1.26097918, + "balance_loss_mlp": 0.26052403, + "epoch": 0.19191342251615812, + "flos": 22086858038400.0, + "grad_norm": 3.363389266911956, + "language_loss": 0.96057326, + "learning_rate": 3.73123885901997e-06, + "loss": 0.9792431, + "num_input_tokens_seen": 68903225, + "router_z_loss_clip": 3.1953125, + "router_z_loss_mlp": 0.26220703, + "step": 3192, + "time_per_iteration": 3.994459867477417 + }, + { + "auxiliary_loss_clip": 0.01571292, + "auxiliary_loss_mlp": 0.0028291, + "balance_loss_clip": 1.2452451, + "balance_loss_mlp": 0.2560637, + "epoch": 0.19197354576882608, + "flos": 22199115018240.0, + "grad_norm": 30.930871743137494, + "language_loss": 0.84659785, + "learning_rate": 3.7310438217554687e-06, + "loss": 0.86513984, + "num_input_tokens_seen": 68922860, + "router_z_loss_clip": 3.26171875, + "router_z_loss_mlp": 0.26843262, + "step": 3193, + "time_per_iteration": 2.6646499633789062 + }, + { + "auxiliary_loss_clip": 0.01577765, + "auxiliary_loss_mlp": 0.0027571, + "balance_loss_clip": 1.24861217, + "balance_loss_mlp": 0.24957965, + "epoch": 0.19203366902149407, + "flos": 24896185580160.0, + "grad_norm": 1.9357922858737837, + "language_loss": 0.82536793, + "learning_rate": 3.730848718849612e-06, + "loss": 0.84390271, + "num_input_tokens_seen": 68943000, + "router_z_loss_clip": 3.29296875, + "router_z_loss_mlp": 0.2611084, + "step": 3194, + "time_per_iteration": 2.647264003753662 + }, + { + "auxiliary_loss_clip": 0.01498183, + "auxiliary_loss_mlp": 0.0005207, + "balance_loss_clip": 1.17927778, + "balance_loss_mlp": 0.04167522, + "epoch": 0.19209379227416204, + "flos": 68416722789120.0, + "grad_norm": 0.7479671120193637, + "language_loss": 0.68280375, + "learning_rate": 3.7306535503097985e-06, + "loss": 0.69830626, + "num_input_tokens_seen": 69000255, + "router_z_loss_clip": 3.1875, + "router_z_loss_mlp": 0.10400391, + "step": 3195, + "time_per_iteration": 3.0740277767181396 + }, + { + "auxiliary_loss_clip": 0.01571938, + "auxiliary_loss_mlp": 0.00292027, + "balance_loss_clip": 1.24250722, + "balance_loss_mlp": 0.26557493, + "epoch": 0.19215391552683, + "flos": 22055185221120.0, + "grad_norm": 3.51951529921557, + "language_loss": 0.82061028, + "learning_rate": 3.730458316143429e-06, + "loss": 0.83924997, + "num_input_tokens_seen": 69019665, + "router_z_loss_clip": 3.29296875, + "router_z_loss_mlp": 0.26477051, + "step": 3196, + "time_per_iteration": 2.6202383041381836 + }, + { + "auxiliary_loss_clip": 0.0155831, + "auxiliary_loss_mlp": 0.00270302, + "balance_loss_clip": 1.22946107, + "balance_loss_mlp": 0.24414796, + "epoch": 0.19221403877949797, + "flos": 20302959962880.0, + "grad_norm": 12.42846778506849, + "language_loss": 0.91353506, + "learning_rate": 3.7302630163579068e-06, + "loss": 0.93182123, + "num_input_tokens_seen": 69039055, + "router_z_loss_clip": 3.28515625, + "router_z_loss_mlp": 0.26123047, + "step": 3197, + "time_per_iteration": 2.641179323196411 + }, + { + "auxiliary_loss_clip": 0.015395, + "auxiliary_loss_mlp": 0.00296278, + "balance_loss_clip": 1.21631181, + "balance_loss_mlp": 0.26938394, + "epoch": 0.19227416203216594, + "flos": 23185329811200.0, + "grad_norm": 7.319071579042978, + "language_loss": 0.89620799, + "learning_rate": 3.7300676509606373e-06, + "loss": 0.9145658, + "num_input_tokens_seen": 69056370, + "router_z_loss_clip": 3.2265625, + "router_z_loss_mlp": 0.2689209, + "step": 3198, + "time_per_iteration": 2.631232738494873 + }, + { + "auxiliary_loss_clip": 0.01552381, + "auxiliary_loss_mlp": 0.00318306, + "balance_loss_clip": 1.23324513, + "balance_loss_mlp": 0.28938597, + "epoch": 0.1923342852848339, + "flos": 25776607841280.0, + "grad_norm": 5008.620430211642, + "language_loss": 0.8664639, + "learning_rate": 3.729872219959029e-06, + "loss": 0.88517076, + "num_input_tokens_seen": 69075915, + "router_z_loss_clip": 3.18945312, + "router_z_loss_mlp": 0.28955078, + "step": 3199, + "time_per_iteration": 2.7067389488220215 + }, + { + "auxiliary_loss_clip": 0.01545267, + "auxiliary_loss_mlp": 0.00292914, + "balance_loss_clip": 1.22488368, + "balance_loss_mlp": 0.26781997, + "epoch": 0.19239440853750187, + "flos": 17128349061120.0, + "grad_norm": 4.831782627042346, + "language_loss": 0.94012517, + "learning_rate": 3.7296767233604934e-06, + "loss": 0.958507, + "num_input_tokens_seen": 69094145, + "router_z_loss_clip": 3.20507812, + "router_z_loss_mlp": 0.25085449, + "step": 3200, + "time_per_iteration": 2.560321807861328 + }, + { + "auxiliary_loss_clip": 0.01536913, + "auxiliary_loss_mlp": 0.00269732, + "balance_loss_clip": 1.22186017, + "balance_loss_mlp": 0.24672502, + "epoch": 0.19245453179016986, + "flos": 16435093593600.0, + "grad_norm": 7.5696250507158584, + "language_loss": 0.86634266, + "learning_rate": 3.729481161172443e-06, + "loss": 0.88440907, + "num_input_tokens_seen": 69111110, + "router_z_loss_clip": 3.14648438, + "router_z_loss_mlp": 0.23022461, + "step": 3201, + "time_per_iteration": 2.546917200088501 + }, + { + "auxiliary_loss_clip": 0.01541743, + "auxiliary_loss_mlp": 0.00305686, + "balance_loss_clip": 1.22569704, + "balance_loss_mlp": 0.27935275, + "epoch": 0.19251465504283782, + "flos": 20230276792320.0, + "grad_norm": 11.262665574878952, + "language_loss": 0.7813201, + "learning_rate": 3.7292855334022927e-06, + "loss": 0.79979432, + "num_input_tokens_seen": 69130280, + "router_z_loss_clip": 3.16210938, + "router_z_loss_mlp": 0.26318359, + "step": 3202, + "time_per_iteration": 2.5913736820220947 + }, + { + "auxiliary_loss_clip": 0.01520481, + "auxiliary_loss_mlp": 0.00246281, + "balance_loss_clip": 1.20529473, + "balance_loss_mlp": 0.22431117, + "epoch": 0.1925747782955058, + "flos": 19464374067840.0, + "grad_norm": 1.8181907214211948, + "language_loss": 0.99412167, + "learning_rate": 3.7290898400574627e-06, + "loss": 1.01178932, + "num_input_tokens_seen": 69149570, + "router_z_loss_clip": 3.1484375, + "router_z_loss_mlp": 0.21984863, + "step": 3203, + "time_per_iteration": 2.5787971019744873 + }, + { + "auxiliary_loss_clip": 0.01531795, + "auxiliary_loss_mlp": 0.00288486, + "balance_loss_clip": 1.21618617, + "balance_loss_mlp": 0.26091325, + "epoch": 0.19263490154817375, + "flos": 17785586165760.0, + "grad_norm": 9.223906728087742, + "language_loss": 0.93070364, + "learning_rate": 3.7288940811453725e-06, + "loss": 0.94890642, + "num_input_tokens_seen": 69168190, + "router_z_loss_clip": 3.15820312, + "router_z_loss_mlp": 0.27563477, + "step": 3204, + "time_per_iteration": 2.563422441482544 + }, + { + "auxiliary_loss_clip": 0.01536477, + "auxiliary_loss_mlp": 0.00280499, + "balance_loss_clip": 1.2221663, + "balance_loss_mlp": 0.25464231, + "epoch": 0.19269502480084172, + "flos": 17457075354240.0, + "grad_norm": 7.90045946676244, + "language_loss": 0.83433664, + "learning_rate": 3.7286982566734454e-06, + "loss": 0.8525064, + "num_input_tokens_seen": 69186950, + "router_z_loss_clip": 3.14453125, + "router_z_loss_mlp": 0.25866699, + "step": 3205, + "time_per_iteration": 2.568598747253418 + }, + { + "auxiliary_loss_clip": 0.01529023, + "auxiliary_loss_mlp": 0.00289945, + "balance_loss_clip": 1.2153759, + "balance_loss_mlp": 0.2654475, + "epoch": 0.19275514805350968, + "flos": 21506901045120.0, + "grad_norm": 2.918520140873357, + "language_loss": 0.93835187, + "learning_rate": 3.728502366649107e-06, + "loss": 0.9565416, + "num_input_tokens_seen": 69204850, + "router_z_loss_clip": 3.13476562, + "router_z_loss_mlp": 0.24499512, + "step": 3206, + "time_per_iteration": 2.6173465251922607 + }, + { + "auxiliary_loss_clip": 0.01507756, + "auxiliary_loss_mlp": 0.00122668, + "balance_loss_clip": 1.18335152, + "balance_loss_mlp": 0.11327467, + "epoch": 0.19281527130617768, + "flos": 47695979738880.0, + "grad_norm": 0.8302323192630588, + "language_loss": 0.60544157, + "learning_rate": 3.728306411079786e-06, + "loss": 0.62174582, + "num_input_tokens_seen": 69259200, + "router_z_loss_clip": 3.25, + "router_z_loss_mlp": 0.09375, + "step": 3207, + "time_per_iteration": 2.9312236309051514 + }, + { + "auxiliary_loss_clip": 0.01511495, + "auxiliary_loss_mlp": 0.00266999, + "balance_loss_clip": 1.19941664, + "balance_loss_mlp": 0.24179861, + "epoch": 0.19287539455884564, + "flos": 11801252672640.0, + "grad_norm": 9.03414216578795, + "language_loss": 0.87089837, + "learning_rate": 3.7281103899729125e-06, + "loss": 0.88868332, + "num_input_tokens_seen": 69275835, + "router_z_loss_clip": 3.12304688, + "router_z_loss_mlp": 0.25183105, + "step": 3208, + "time_per_iteration": 2.5573394298553467 + }, + { + "auxiliary_loss_clip": 0.0152567, + "auxiliary_loss_mlp": 0.00282935, + "balance_loss_clip": 1.21338844, + "balance_loss_mlp": 0.25691134, + "epoch": 0.1929355178115136, + "flos": 20631434860800.0, + "grad_norm": 7.985640367104113, + "language_loss": 0.70671535, + "learning_rate": 3.7279143033359195e-06, + "loss": 0.72480136, + "num_input_tokens_seen": 69294810, + "router_z_loss_clip": 3.1171875, + "router_z_loss_mlp": 0.26037598, + "step": 3209, + "time_per_iteration": 2.59397292137146 + }, + { + "auxiliary_loss_clip": 0.01523835, + "auxiliary_loss_mlp": 0.00304598, + "balance_loss_clip": 1.2110033, + "balance_loss_mlp": 0.27912319, + "epoch": 0.19299564106418157, + "flos": 40807916058240.0, + "grad_norm": 2.7675692869286768, + "language_loss": 0.91212916, + "learning_rate": 3.727718151176243e-06, + "loss": 0.93041348, + "num_input_tokens_seen": 69316065, + "router_z_loss_clip": 3.12695312, + "router_z_loss_mlp": 0.25488281, + "step": 3210, + "time_per_iteration": 2.8131754398345947 + }, + { + "auxiliary_loss_clip": 0.01527552, + "auxiliary_loss_mlp": 0.00251844, + "balance_loss_clip": 1.21480989, + "balance_loss_mlp": 0.22967061, + "epoch": 0.19305576431684954, + "flos": 11361418634880.0, + "grad_norm": 49.43777214939405, + "language_loss": 0.90687996, + "learning_rate": 3.7275219335013217e-06, + "loss": 0.92467391, + "num_input_tokens_seen": 69332900, + "router_z_loss_clip": 3.12695312, + "router_z_loss_mlp": 0.22180176, + "step": 3211, + "time_per_iteration": 2.5791499614715576 + }, + { + "auxiliary_loss_clip": 0.01420028, + "auxiliary_loss_mlp": 0.00063104, + "balance_loss_clip": 1.1288532, + "balance_loss_mlp": 0.04660593, + "epoch": 0.1931158875695175, + "flos": 54511895975040.0, + "grad_norm": 0.9865258043191949, + "language_loss": 0.6354875, + "learning_rate": 3.7273256503185953e-06, + "loss": 0.6503188, + "num_input_tokens_seen": 69382535, + "router_z_loss_clip": 2.90625, + "router_z_loss_mlp": 0.16503906, + "step": 3212, + "time_per_iteration": 3.007046699523926 + }, + { + "auxiliary_loss_clip": 0.01525118, + "auxiliary_loss_mlp": 0.00293143, + "balance_loss_clip": 1.2150358, + "balance_loss_mlp": 0.27007562, + "epoch": 0.19317601082218547, + "flos": 19828436365440.0, + "grad_norm": 7.176983289105826, + "language_loss": 0.8222028, + "learning_rate": 3.7271293016355074e-06, + "loss": 0.84038544, + "num_input_tokens_seen": 69400600, + "router_z_loss_clip": 3.09960938, + "router_z_loss_mlp": 0.23071289, + "step": 3213, + "time_per_iteration": 2.647026300430298 + }, + { + "auxiliary_loss_clip": 0.01531024, + "auxiliary_loss_mlp": 0.00298517, + "balance_loss_clip": 1.21970582, + "balance_loss_mlp": 0.27368575, + "epoch": 0.19323613407485346, + "flos": 13152068467200.0, + "grad_norm": 4.383594630789381, + "language_loss": 0.80707854, + "learning_rate": 3.726932887459503e-06, + "loss": 0.82537401, + "num_input_tokens_seen": 69417350, + "router_z_loss_clip": 3.11523438, + "router_z_loss_mlp": 0.24841309, + "step": 3214, + "time_per_iteration": 2.7162423133850098 + }, + { + "auxiliary_loss_clip": 0.01531078, + "auxiliary_loss_mlp": 0.00309725, + "balance_loss_clip": 1.22213387, + "balance_loss_mlp": 0.28093591, + "epoch": 0.19329625732752143, + "flos": 14027247342720.0, + "grad_norm": 38.00908272136673, + "language_loss": 0.84704304, + "learning_rate": 3.72673640779803e-06, + "loss": 0.8654511, + "num_input_tokens_seen": 69431845, + "router_z_loss_clip": 3.08984375, + "router_z_loss_mlp": 0.2878418, + "step": 3215, + "time_per_iteration": 2.5573880672454834 + }, + { + "auxiliary_loss_clip": 0.01561551, + "auxiliary_loss_mlp": 0.00294325, + "balance_loss_clip": 1.2487669, + "balance_loss_mlp": 0.26898113, + "epoch": 0.1933563805801894, + "flos": 23441732069760.0, + "grad_norm": 43.10569843756784, + "language_loss": 0.93581194, + "learning_rate": 3.72653986265854e-06, + "loss": 0.95437062, + "num_input_tokens_seen": 69453275, + "router_z_loss_clip": 3.12695312, + "router_z_loss_mlp": 0.25354004, + "step": 3216, + "time_per_iteration": 2.59704327583313 + }, + { + "auxiliary_loss_clip": 0.0155366, + "auxiliary_loss_mlp": 0.00287761, + "balance_loss_clip": 1.24043894, + "balance_loss_mlp": 0.26365662, + "epoch": 0.19341650383285736, + "flos": 20485314334080.0, + "grad_norm": 3.4903914627740993, + "language_loss": 0.87523758, + "learning_rate": 3.726343252048485e-06, + "loss": 0.89365184, + "num_input_tokens_seen": 69471830, + "router_z_loss_clip": 3.13085938, + "router_z_loss_mlp": 0.24108887, + "step": 3217, + "time_per_iteration": 2.5640950202941895 + }, + { + "auxiliary_loss_clip": 0.01536309, + "auxiliary_loss_mlp": 0.00347293, + "balance_loss_clip": 1.22123694, + "balance_loss_mlp": 0.31625053, + "epoch": 0.19347662708552532, + "flos": 17858484817920.0, + "grad_norm": 70.59647263118362, + "language_loss": 0.77558446, + "learning_rate": 3.7261465759753206e-06, + "loss": 0.79442048, + "num_input_tokens_seen": 69489320, + "router_z_loss_clip": 3.15039062, + "router_z_loss_mlp": 0.31054688, + "step": 3218, + "time_per_iteration": 2.589664936065674 + }, + { + "auxiliary_loss_clip": 0.01568165, + "auxiliary_loss_mlp": 0.00342923, + "balance_loss_clip": 1.2515831, + "balance_loss_mlp": 0.31334701, + "epoch": 0.1935367503381933, + "flos": 18187247024640.0, + "grad_norm": 4.300051209379901, + "language_loss": 0.86463904, + "learning_rate": 3.7259498344465053e-06, + "loss": 0.8837499, + "num_input_tokens_seen": 69506665, + "router_z_loss_clip": 3.16992188, + "router_z_loss_mlp": 0.29614258, + "step": 3219, + "time_per_iteration": 2.57383394241333 + }, + { + "auxiliary_loss_clip": 0.01584395, + "auxiliary_loss_mlp": 0.0030114, + "balance_loss_clip": 1.2677269, + "balance_loss_mlp": 0.27634415, + "epoch": 0.19359687359086128, + "flos": 15957122290560.0, + "grad_norm": 4.287716622493703, + "language_loss": 0.96456349, + "learning_rate": 3.7257530274694993e-06, + "loss": 0.98341876, + "num_input_tokens_seen": 69523835, + "router_z_loss_clip": 3.16796875, + "router_z_loss_mlp": 0.24829102, + "step": 3220, + "time_per_iteration": 2.578460693359375 + }, + { + "auxiliary_loss_clip": 0.01606903, + "auxiliary_loss_mlp": 0.00274851, + "balance_loss_clip": 1.29085171, + "balance_loss_mlp": 0.25056759, + "epoch": 0.19365699684352924, + "flos": 21215198695680.0, + "grad_norm": 6.073350953552274, + "language_loss": 0.90303695, + "learning_rate": 3.725556155051766e-06, + "loss": 0.92185444, + "num_input_tokens_seen": 69542620, + "router_z_loss_clip": 3.16210938, + "router_z_loss_mlp": 0.24267578, + "step": 3221, + "time_per_iteration": 2.66359806060791 + }, + { + "auxiliary_loss_clip": 0.0160858, + "auxiliary_loss_mlp": 0.00253991, + "balance_loss_clip": 1.29554391, + "balance_loss_mlp": 0.23078066, + "epoch": 0.1937171200961972, + "flos": 17311098481920.0, + "grad_norm": 160.08196496133073, + "language_loss": 0.93217432, + "learning_rate": 3.7253592172007702e-06, + "loss": 0.95080006, + "num_input_tokens_seen": 69561130, + "router_z_loss_clip": 3.13476562, + "router_z_loss_mlp": 0.23193359, + "step": 3222, + "time_per_iteration": 2.6556615829467773 + }, + { + "auxiliary_loss_clip": 0.01615682, + "auxiliary_loss_mlp": 0.00252913, + "balance_loss_clip": 1.30022717, + "balance_loss_mlp": 0.22747336, + "epoch": 0.19377724334886517, + "flos": 22635968227200.0, + "grad_norm": 9.60117720472628, + "language_loss": 0.85160828, + "learning_rate": 3.72516221392398e-06, + "loss": 0.87029421, + "num_input_tokens_seen": 69580425, + "router_z_loss_clip": 3.15625, + "router_z_loss_mlp": 0.25463867, + "step": 3223, + "time_per_iteration": 2.6099977493286133 + }, + { + "auxiliary_loss_clip": 0.01617165, + "auxiliary_loss_mlp": 0.00308245, + "balance_loss_clip": 1.30267155, + "balance_loss_mlp": 0.2793963, + "epoch": 0.19383736660153314, + "flos": 15077813351040.0, + "grad_norm": 23.97655141826144, + "language_loss": 0.84178293, + "learning_rate": 3.7249651452288653e-06, + "loss": 0.86103702, + "num_input_tokens_seen": 69597085, + "router_z_loss_clip": 3.14648438, + "router_z_loss_mlp": 0.28833008, + "step": 3224, + "time_per_iteration": 2.5219767093658447 + }, + { + "auxiliary_loss_clip": 0.0161095, + "auxiliary_loss_mlp": 0.00242923, + "balance_loss_clip": 1.2951715, + "balance_loss_mlp": 0.21772206, + "epoch": 0.1938974898542011, + "flos": 47119934350080.0, + "grad_norm": 4.889514434809209, + "language_loss": 0.801862, + "learning_rate": 3.7247680111229e-06, + "loss": 0.82040071, + "num_input_tokens_seen": 69618885, + "router_z_loss_clip": 3.16015625, + "router_z_loss_mlp": 0.2520752, + "step": 3225, + "time_per_iteration": 2.8198182582855225 + }, + { + "auxiliary_loss_clip": 0.01625276, + "auxiliary_loss_mlp": 0.00241733, + "balance_loss_clip": 1.3116461, + "balance_loss_mlp": 0.21551889, + "epoch": 0.19395761310686907, + "flos": 25812554376960.0, + "grad_norm": 13.425299724030042, + "language_loss": 0.78042305, + "learning_rate": 3.7245708116135585e-06, + "loss": 0.79909313, + "num_input_tokens_seen": 69638200, + "router_z_loss_clip": 3.13476562, + "router_z_loss_mlp": 0.26196289, + "step": 3226, + "time_per_iteration": 4.014801740646362 + }, + { + "auxiliary_loss_clip": 0.01652959, + "auxiliary_loss_mlp": 0.00259572, + "balance_loss_clip": 1.33562589, + "balance_loss_mlp": 0.23044896, + "epoch": 0.19401773635953706, + "flos": 23039604334080.0, + "grad_norm": 1.6251928367387278, + "language_loss": 0.83147204, + "learning_rate": 3.7243735467083193e-06, + "loss": 0.85059738, + "num_input_tokens_seen": 69657550, + "router_z_loss_clip": 3.171875, + "router_z_loss_mlp": 0.29125977, + "step": 3227, + "time_per_iteration": 2.607297658920288 + }, + { + "auxiliary_loss_clip": 0.0164526, + "auxiliary_loss_mlp": 0.00236606, + "balance_loss_clip": 1.33034325, + "balance_loss_mlp": 0.21111855, + "epoch": 0.19407785961220503, + "flos": 15920780705280.0, + "grad_norm": 4.537221649175126, + "language_loss": 0.76813185, + "learning_rate": 3.724176216414662e-06, + "loss": 0.78695053, + "num_input_tokens_seen": 69675005, + "router_z_loss_clip": 3.1484375, + "router_z_loss_mlp": 0.25488281, + "step": 3228, + "time_per_iteration": 3.99920654296875 + }, + { + "auxiliary_loss_clip": 0.01646969, + "auxiliary_loss_mlp": 0.00244338, + "balance_loss_clip": 1.33159196, + "balance_loss_mlp": 0.22060341, + "epoch": 0.194137982864873, + "flos": 25921722787200.0, + "grad_norm": 48.87272818523903, + "language_loss": 0.80981684, + "learning_rate": 3.72397882074007e-06, + "loss": 0.82872987, + "num_input_tokens_seen": 69696455, + "router_z_loss_clip": 3.15820312, + "router_z_loss_mlp": 0.23742676, + "step": 3229, + "time_per_iteration": 4.061859846115112 + }, + { + "auxiliary_loss_clip": 0.01666606, + "auxiliary_loss_mlp": 0.00237628, + "balance_loss_clip": 1.3459748, + "balance_loss_mlp": 0.20993581, + "epoch": 0.19419810611754096, + "flos": 13261344618240.0, + "grad_norm": 182.51295499116637, + "language_loss": 0.73077691, + "learning_rate": 3.7237813596920285e-06, + "loss": 0.74981922, + "num_input_tokens_seen": 69714245, + "router_z_loss_clip": 3.20507812, + "router_z_loss_mlp": 0.2767334, + "step": 3230, + "time_per_iteration": 2.5338315963745117 + }, + { + "auxiliary_loss_clip": 0.01657417, + "auxiliary_loss_mlp": 0.00211156, + "balance_loss_clip": 1.34110689, + "balance_loss_mlp": 0.18384495, + "epoch": 0.19425822937020892, + "flos": 15705568368000.0, + "grad_norm": 82.72635826933988, + "language_loss": 0.89025158, + "learning_rate": 3.7235838332780254e-06, + "loss": 0.90893734, + "num_input_tokens_seen": 69731515, + "router_z_loss_clip": 3.16210938, + "router_z_loss_mlp": 0.27307129, + "step": 3231, + "time_per_iteration": 2.6019997596740723 + }, + { + "auxiliary_loss_clip": 0.01670709, + "auxiliary_loss_mlp": 0.00250838, + "balance_loss_clip": 1.34881997, + "balance_loss_mlp": 0.22206089, + "epoch": 0.1943183526228769, + "flos": 23105392093440.0, + "grad_norm": 2.274731211728245, + "language_loss": 0.95267725, + "learning_rate": 3.72338624150555e-06, + "loss": 0.97189271, + "num_input_tokens_seen": 69748885, + "router_z_loss_clip": 3.21875, + "router_z_loss_mlp": 0.2878418, + "step": 3232, + "time_per_iteration": 2.6186575889587402 + }, + { + "auxiliary_loss_clip": 0.01683839, + "auxiliary_loss_mlp": 0.00233035, + "balance_loss_clip": 1.36116242, + "balance_loss_mlp": 0.20688051, + "epoch": 0.19437847587554485, + "flos": 24712610146560.0, + "grad_norm": 11.33676279920379, + "language_loss": 0.91370392, + "learning_rate": 3.723188584382096e-06, + "loss": 0.93287265, + "num_input_tokens_seen": 69767540, + "router_z_loss_clip": 3.22851562, + "router_z_loss_mlp": 0.26184082, + "step": 3233, + "time_per_iteration": 2.725860357284546 + }, + { + "auxiliary_loss_clip": 0.0168499, + "auxiliary_loss_mlp": 0.00235845, + "balance_loss_clip": 1.36028886, + "balance_loss_mlp": 0.20960662, + "epoch": 0.19443859912821285, + "flos": 23116130259840.0, + "grad_norm": 4.8888295552587415, + "language_loss": 0.94255215, + "learning_rate": 3.722990861915158e-06, + "loss": 0.96176052, + "num_input_tokens_seen": 69789340, + "router_z_loss_clip": 3.25, + "router_z_loss_mlp": 0.2623291, + "step": 3234, + "time_per_iteration": 4.124841213226318 + }, + { + "auxiliary_loss_clip": 0.01684681, + "auxiliary_loss_mlp": 0.00265449, + "balance_loss_clip": 1.35862708, + "balance_loss_mlp": 0.23521727, + "epoch": 0.1944987223808808, + "flos": 15084385539840.0, + "grad_norm": 4.748176247558305, + "language_loss": 0.90344107, + "learning_rate": 3.722793074112234e-06, + "loss": 0.9229424, + "num_input_tokens_seen": 69806470, + "router_z_loss_clip": 3.2578125, + "router_z_loss_mlp": 0.30249023, + "step": 3235, + "time_per_iteration": 2.5719330310821533 + }, + { + "auxiliary_loss_clip": 0.01715049, + "auxiliary_loss_mlp": 0.00215271, + "balance_loss_clip": 1.39019072, + "balance_loss_mlp": 0.18805593, + "epoch": 0.19455884563354878, + "flos": 17126876603520.0, + "grad_norm": 19.095545863884347, + "language_loss": 0.86214787, + "learning_rate": 3.7225952209808233e-06, + "loss": 0.88145107, + "num_input_tokens_seen": 69822655, + "router_z_loss_clip": 3.24609375, + "router_z_loss_mlp": 0.2722168, + "step": 3236, + "time_per_iteration": 2.6139965057373047 + }, + { + "auxiliary_loss_clip": 0.01731205, + "auxiliary_loss_mlp": 0.00223825, + "balance_loss_clip": 1.39889121, + "balance_loss_mlp": 0.19558483, + "epoch": 0.19461896888621674, + "flos": 20193396503040.0, + "grad_norm": 8.616125287420768, + "language_loss": 0.84036893, + "learning_rate": 3.72239730252843e-06, + "loss": 0.85991919, + "num_input_tokens_seen": 69841895, + "router_z_loss_clip": 3.3203125, + "router_z_loss_mlp": 0.28222656, + "step": 3237, + "time_per_iteration": 2.563685894012451 + }, + { + "auxiliary_loss_clip": 0.01697933, + "auxiliary_loss_mlp": 0.00242972, + "balance_loss_clip": 1.36904168, + "balance_loss_mlp": 0.21443306, + "epoch": 0.1946790921388847, + "flos": 25301365971840.0, + "grad_norm": 249.32142840880246, + "language_loss": 0.81131625, + "learning_rate": 3.7221993187625583e-06, + "loss": 0.83072525, + "num_input_tokens_seen": 69862220, + "router_z_loss_clip": 3.2890625, + "router_z_loss_mlp": 0.28540039, + "step": 3238, + "time_per_iteration": 2.6591615676879883 + }, + { + "auxiliary_loss_clip": 0.01704329, + "auxiliary_loss_mlp": 0.00200079, + "balance_loss_clip": 1.37938893, + "balance_loss_mlp": 0.17373402, + "epoch": 0.19473921539155267, + "flos": 20193396503040.0, + "grad_norm": 11.472237620737886, + "language_loss": 0.83058816, + "learning_rate": 3.7220012696907155e-06, + "loss": 0.84963226, + "num_input_tokens_seen": 69881830, + "router_z_loss_clip": 3.25195312, + "router_z_loss_mlp": 0.26379395, + "step": 3239, + "time_per_iteration": 2.5623295307159424 + }, + { + "auxiliary_loss_clip": 0.01694685, + "auxiliary_loss_mlp": 0.00232481, + "balance_loss_clip": 1.37377417, + "balance_loss_mlp": 0.20549163, + "epoch": 0.19479933864422067, + "flos": 20887549810560.0, + "grad_norm": 141.88924090250637, + "language_loss": 0.80549228, + "learning_rate": 3.721803155320412e-06, + "loss": 0.82476383, + "num_input_tokens_seen": 69900515, + "router_z_loss_clip": 3.20703125, + "router_z_loss_mlp": 0.2701416, + "step": 3240, + "time_per_iteration": 2.6704821586608887 + }, + { + "auxiliary_loss_clip": 0.01677755, + "auxiliary_loss_mlp": 0.00198554, + "balance_loss_clip": 1.35479772, + "balance_loss_mlp": 0.17063521, + "epoch": 0.19485946189688863, + "flos": 23295072839040.0, + "grad_norm": 4.764016391708516, + "language_loss": 0.78066552, + "learning_rate": 3.7216049756591606e-06, + "loss": 0.79942864, + "num_input_tokens_seen": 69920060, + "router_z_loss_clip": 3.22851562, + "router_z_loss_mlp": 0.27929688, + "step": 3241, + "time_per_iteration": 2.599961996078491 + }, + { + "auxiliary_loss_clip": 0.01676667, + "auxiliary_loss_mlp": 0.00217073, + "balance_loss_clip": 1.35208106, + "balance_loss_mlp": 0.18963075, + "epoch": 0.1949195851495566, + "flos": 23295036925440.0, + "grad_norm": 652.8906793645195, + "language_loss": 0.8882848, + "learning_rate": 3.7214067307144754e-06, + "loss": 0.90722215, + "num_input_tokens_seen": 69939820, + "router_z_loss_clip": 3.2421875, + "router_z_loss_mlp": 0.2746582, + "step": 3242, + "time_per_iteration": 2.6265292167663574 + }, + { + "auxiliary_loss_clip": 0.01607915, + "auxiliary_loss_mlp": 0.00054748, + "balance_loss_clip": 1.31007528, + "balance_loss_mlp": 0.04196914, + "epoch": 0.19497970840222456, + "flos": 64962871557120.0, + "grad_norm": 0.8195852625520489, + "language_loss": 0.5745995, + "learning_rate": 3.721208420493875e-06, + "loss": 0.5912261, + "num_input_tokens_seen": 70002145, + "router_z_loss_clip": 2.96875, + "router_z_loss_mlp": 0.12792969, + "step": 3243, + "time_per_iteration": 3.100541591644287 + }, + { + "auxiliary_loss_clip": 0.0166141, + "auxiliary_loss_mlp": 0.00209324, + "balance_loss_clip": 1.33653462, + "balance_loss_mlp": 0.18331209, + "epoch": 0.19503983165489253, + "flos": 19644717277440.0, + "grad_norm": 27.70060724365806, + "language_loss": 0.91429317, + "learning_rate": 3.7210100450048784e-06, + "loss": 0.93300056, + "num_input_tokens_seen": 70020510, + "router_z_loss_clip": 3.25, + "router_z_loss_mlp": 0.26025391, + "step": 3244, + "time_per_iteration": 2.6034953594207764 + }, + { + "auxiliary_loss_clip": 0.016588, + "auxiliary_loss_mlp": 0.00238076, + "balance_loss_clip": 1.33229518, + "balance_loss_mlp": 0.2112059, + "epoch": 0.1950999549075605, + "flos": 21141976821120.0, + "grad_norm": 30.40523846212217, + "language_loss": 0.84722865, + "learning_rate": 3.7208116042550088e-06, + "loss": 0.86619747, + "num_input_tokens_seen": 70040760, + "router_z_loss_clip": 3.26757812, + "router_z_loss_mlp": 0.26879883, + "step": 3245, + "time_per_iteration": 2.5805068016052246 + }, + { + "auxiliary_loss_clip": 0.01651127, + "auxiliary_loss_mlp": 0.00233567, + "balance_loss_clip": 1.32023668, + "balance_loss_mlp": 0.20632738, + "epoch": 0.19516007816022846, + "flos": 20884820376960.0, + "grad_norm": 6.987449608936733, + "language_loss": 0.92598778, + "learning_rate": 3.7206130982517906e-06, + "loss": 0.94483471, + "num_input_tokens_seen": 70058720, + "router_z_loss_clip": 3.31054688, + "router_z_loss_mlp": 0.27282715, + "step": 3246, + "time_per_iteration": 2.5855815410614014 + }, + { + "auxiliary_loss_clip": 0.01666601, + "auxiliary_loss_mlp": 0.00253858, + "balance_loss_clip": 1.33901501, + "balance_loss_mlp": 0.22639214, + "epoch": 0.19522020141289645, + "flos": 16910515031040.0, + "grad_norm": 6.410970970995804, + "language_loss": 0.85470837, + "learning_rate": 3.7204145270027514e-06, + "loss": 0.87391299, + "num_input_tokens_seen": 70076470, + "router_z_loss_clip": 3.27539062, + "router_z_loss_mlp": 0.27478027, + "step": 3247, + "time_per_iteration": 2.555216073989868 + }, + { + "auxiliary_loss_clip": 0.01669246, + "auxiliary_loss_mlp": 0.00232711, + "balance_loss_clip": 1.33948183, + "balance_loss_mlp": 0.20658062, + "epoch": 0.19528032466556441, + "flos": 26724829023360.0, + "grad_norm": 20.109160932901432, + "language_loss": 0.81755388, + "learning_rate": 3.720215890515421e-06, + "loss": 0.83657348, + "num_input_tokens_seen": 70096220, + "router_z_loss_clip": 3.296875, + "router_z_loss_mlp": 0.26123047, + "step": 3248, + "time_per_iteration": 2.6028952598571777 + }, + { + "auxiliary_loss_clip": 0.01654535, + "auxiliary_loss_mlp": 0.00255235, + "balance_loss_clip": 1.31965685, + "balance_loss_mlp": 0.22847281, + "epoch": 0.19534044791823238, + "flos": 21032808410880.0, + "grad_norm": 3.9015438136738982, + "language_loss": 0.85646415, + "learning_rate": 3.7200171887973316e-06, + "loss": 0.87556183, + "num_input_tokens_seen": 70114800, + "router_z_loss_clip": 3.35351562, + "router_z_loss_mlp": 0.2677002, + "step": 3249, + "time_per_iteration": 2.561229705810547 + }, + { + "auxiliary_loss_clip": 0.0164282, + "auxiliary_loss_mlp": 0.00253993, + "balance_loss_clip": 1.3118726, + "balance_loss_mlp": 0.22720672, + "epoch": 0.19540057117090034, + "flos": 22344050396160.0, + "grad_norm": 7.707379802710601, + "language_loss": 0.8037318, + "learning_rate": 3.7198184218560176e-06, + "loss": 0.8226999, + "num_input_tokens_seen": 70134930, + "router_z_loss_clip": 3.30859375, + "router_z_loss_mlp": 0.26806641, + "step": 3250, + "time_per_iteration": 2.607150077819824 + }, + { + "auxiliary_loss_clip": 0.01651784, + "auxiliary_loss_mlp": 0.00222811, + "balance_loss_clip": 1.31708229, + "balance_loss_mlp": 0.19526163, + "epoch": 0.1954606944235683, + "flos": 20301631159680.0, + "grad_norm": 50.738659312622794, + "language_loss": 0.8640238, + "learning_rate": 3.719619589699017e-06, + "loss": 0.8827697, + "num_input_tokens_seen": 70152045, + "router_z_loss_clip": 3.34765625, + "router_z_loss_mlp": 0.27563477, + "step": 3251, + "time_per_iteration": 2.5346450805664062 + }, + { + "auxiliary_loss_clip": 0.01626174, + "auxiliary_loss_mlp": 0.00268794, + "balance_loss_clip": 1.29893017, + "balance_loss_mlp": 0.24137592, + "epoch": 0.19552081767623627, + "flos": 17346865449600.0, + "grad_norm": 4.593978402027078, + "language_loss": 0.91841972, + "learning_rate": 3.7194206923338695e-06, + "loss": 0.93736941, + "num_input_tokens_seen": 70169240, + "router_z_loss_clip": 3.26953125, + "router_z_loss_mlp": 0.27380371, + "step": 3252, + "time_per_iteration": 2.5551297664642334 + }, + { + "auxiliary_loss_clip": 0.01598297, + "auxiliary_loss_mlp": 0.0027466, + "balance_loss_clip": 1.2698493, + "balance_loss_mlp": 0.24552572, + "epoch": 0.19558094092890424, + "flos": 31977626129280.0, + "grad_norm": 7.008590108362895, + "language_loss": 0.78824085, + "learning_rate": 3.719221729768117e-06, + "loss": 0.80697036, + "num_input_tokens_seen": 70192690, + "router_z_loss_clip": 3.28710938, + "router_z_loss_mlp": 0.29150391, + "step": 3253, + "time_per_iteration": 2.6741738319396973 + }, + { + "auxiliary_loss_clip": 0.01598262, + "auxiliary_loss_mlp": 0.00263282, + "balance_loss_clip": 1.26974499, + "balance_loss_mlp": 0.2355302, + "epoch": 0.19564106418157223, + "flos": 22268889187200.0, + "grad_norm": 12.051633557383417, + "language_loss": 0.85108376, + "learning_rate": 3.7190227020093037e-06, + "loss": 0.86969924, + "num_input_tokens_seen": 70209685, + "router_z_loss_clip": 3.28515625, + "router_z_loss_mlp": 0.27758789, + "step": 3254, + "time_per_iteration": 2.637686252593994 + }, + { + "auxiliary_loss_clip": 0.01556163, + "auxiliary_loss_mlp": 0.00053273, + "balance_loss_clip": 1.23647332, + "balance_loss_mlp": 0.03810933, + "epoch": 0.1957011874342402, + "flos": 54364554385920.0, + "grad_norm": 0.859020874271997, + "language_loss": 0.54955333, + "learning_rate": 3.7188236090649774e-06, + "loss": 0.56564772, + "num_input_tokens_seen": 70265050, + "router_z_loss_clip": 3.203125, + "router_z_loss_mlp": 0.15136719, + "step": 3255, + "time_per_iteration": 3.1449332237243652 + }, + { + "auxiliary_loss_clip": 0.01597168, + "auxiliary_loss_mlp": 0.00314119, + "balance_loss_clip": 1.26381254, + "balance_loss_mlp": 0.28549671, + "epoch": 0.19576131068690816, + "flos": 16506699356160.0, + "grad_norm": 323.8292888862172, + "language_loss": 0.8376385, + "learning_rate": 3.718624450942688e-06, + "loss": 0.85675132, + "num_input_tokens_seen": 70281830, + "router_z_loss_clip": 3.33398438, + "router_z_loss_mlp": 0.28625488, + "step": 3256, + "time_per_iteration": 2.5308613777160645 + }, + { + "auxiliary_loss_clip": 0.01592761, + "auxiliary_loss_mlp": 0.00247345, + "balance_loss_clip": 1.25916219, + "balance_loss_mlp": 0.22283517, + "epoch": 0.19582143393957613, + "flos": 14719676797440.0, + "grad_norm": 24.582982927365926, + "language_loss": 0.89647067, + "learning_rate": 3.718425227649987e-06, + "loss": 0.91487163, + "num_input_tokens_seen": 70297420, + "router_z_loss_clip": 3.33984375, + "router_z_loss_mlp": 0.24511719, + "step": 3257, + "time_per_iteration": 2.5857226848602295 + }, + { + "auxiliary_loss_clip": 0.01594991, + "auxiliary_loss_mlp": 0.0022352, + "balance_loss_clip": 1.2615484, + "balance_loss_mlp": 0.19847424, + "epoch": 0.1958815571922441, + "flos": 24425504737920.0, + "grad_norm": 37.01552989354877, + "language_loss": 0.82487923, + "learning_rate": 3.7182259391944292e-06, + "loss": 0.84306431, + "num_input_tokens_seen": 70319210, + "router_z_loss_clip": 3.33203125, + "router_z_loss_mlp": 0.25061035, + "step": 3258, + "time_per_iteration": 2.6177713871002197 + }, + { + "auxiliary_loss_clip": 0.01579942, + "auxiliary_loss_mlp": 0.00272231, + "balance_loss_clip": 1.24838233, + "balance_loss_mlp": 0.24596961, + "epoch": 0.19594168044491206, + "flos": 24900279730560.0, + "grad_norm": 11.72590324452655, + "language_loss": 0.82417285, + "learning_rate": 3.7180265855835714e-06, + "loss": 0.84269464, + "num_input_tokens_seen": 70339045, + "router_z_loss_clip": 3.31445312, + "router_z_loss_mlp": 0.2623291, + "step": 3259, + "time_per_iteration": 2.6536808013916016 + }, + { + "auxiliary_loss_clip": 0.01594172, + "auxiliary_loss_mlp": 0.00287591, + "balance_loss_clip": 1.25965738, + "balance_loss_mlp": 0.26028055, + "epoch": 0.19600180369758005, + "flos": 12057008486400.0, + "grad_norm": 7.262466502680778, + "language_loss": 0.87930322, + "learning_rate": 3.7178271668249735e-06, + "loss": 0.89812082, + "num_input_tokens_seen": 70356505, + "router_z_loss_clip": 3.34375, + "router_z_loss_mlp": 0.27319336, + "step": 3260, + "time_per_iteration": 2.5647459030151367 + }, + { + "auxiliary_loss_clip": 0.01591344, + "auxiliary_loss_mlp": 0.00269369, + "balance_loss_clip": 1.25700498, + "balance_loss_mlp": 0.24350038, + "epoch": 0.19606192695024802, + "flos": 20850202644480.0, + "grad_norm": 2.724765949332248, + "language_loss": 0.91394007, + "learning_rate": 3.7176276829261975e-06, + "loss": 0.93254721, + "num_input_tokens_seen": 70375410, + "router_z_loss_clip": 3.34179688, + "router_z_loss_mlp": 0.25891113, + "step": 3261, + "time_per_iteration": 2.6633474826812744 + }, + { + "auxiliary_loss_clip": 0.01605398, + "auxiliary_loss_mlp": 0.00290633, + "balance_loss_clip": 1.2678467, + "balance_loss_mlp": 0.26277351, + "epoch": 0.19612205020291598, + "flos": 28475509996800.0, + "grad_norm": 34.69001845062122, + "language_loss": 0.82035899, + "learning_rate": 3.717428133894807e-06, + "loss": 0.83931929, + "num_input_tokens_seen": 70396315, + "router_z_loss_clip": 3.375, + "router_z_loss_mlp": 0.27880859, + "step": 3262, + "time_per_iteration": 2.7044639587402344 + }, + { + "auxiliary_loss_clip": 0.01594504, + "auxiliary_loss_mlp": 0.00268086, + "balance_loss_clip": 1.25316107, + "balance_loss_mlp": 0.24197899, + "epoch": 0.19618217345558395, + "flos": 25556618995200.0, + "grad_norm": 2.283297995293271, + "language_loss": 0.8979274, + "learning_rate": 3.71722851973837e-06, + "loss": 0.91655332, + "num_input_tokens_seen": 70417945, + "router_z_loss_clip": 3.41601562, + "router_z_loss_mlp": 0.26098633, + "step": 3263, + "time_per_iteration": 2.66318416595459 + }, + { + "auxiliary_loss_clip": 0.01593306, + "auxiliary_loss_mlp": 0.0027123, + "balance_loss_clip": 1.25414801, + "balance_loss_mlp": 0.24481362, + "epoch": 0.1962422967082519, + "flos": 25264413855360.0, + "grad_norm": 1.690568253237969, + "language_loss": 0.81223977, + "learning_rate": 3.717028840464455e-06, + "loss": 0.83088517, + "num_input_tokens_seen": 70438690, + "router_z_loss_clip": 3.390625, + "router_z_loss_mlp": 0.26428223, + "step": 3264, + "time_per_iteration": 2.6438686847686768 + }, + { + "auxiliary_loss_clip": 0.01586305, + "auxiliary_loss_mlp": 0.00280257, + "balance_loss_clip": 1.24798751, + "balance_loss_mlp": 0.25484174, + "epoch": 0.19630241996091988, + "flos": 18807352444800.0, + "grad_norm": 74.70703631031417, + "language_loss": 0.88453114, + "learning_rate": 3.7168290960806344e-06, + "loss": 0.90319681, + "num_input_tokens_seen": 70455385, + "router_z_loss_clip": 3.3828125, + "router_z_loss_mlp": 0.2545166, + "step": 3265, + "time_per_iteration": 2.600177526473999 + }, + { + "auxiliary_loss_clip": 0.01509854, + "auxiliary_loss_mlp": 0.00118852, + "balance_loss_clip": 1.1622293, + "balance_loss_mlp": 0.10569097, + "epoch": 0.19636254321358784, + "flos": 62321137896960.0, + "grad_norm": 0.8339496170310328, + "language_loss": 0.52962291, + "learning_rate": 3.716629286594483e-06, + "loss": 0.54591, + "num_input_tokens_seen": 70514280, + "router_z_loss_clip": 3.46875, + "router_z_loss_mlp": 0.13183594, + "step": 3266, + "time_per_iteration": 3.160677671432495 + }, + { + "auxiliary_loss_clip": 0.01569376, + "auxiliary_loss_mlp": 0.00231915, + "balance_loss_clip": 1.23266482, + "balance_loss_mlp": 0.20585546, + "epoch": 0.19642266646625584, + "flos": 21069329564160.0, + "grad_norm": 117.83953483340771, + "language_loss": 0.89349329, + "learning_rate": 3.7164294120135767e-06, + "loss": 0.91150624, + "num_input_tokens_seen": 70531800, + "router_z_loss_clip": 3.36523438, + "router_z_loss_mlp": 0.26074219, + "step": 3267, + "time_per_iteration": 2.5824849605560303 + }, + { + "auxiliary_loss_clip": 0.01574902, + "auxiliary_loss_mlp": 0.00229888, + "balance_loss_clip": 1.23787069, + "balance_loss_mlp": 0.20539078, + "epoch": 0.1964827897189238, + "flos": 14538651229440.0, + "grad_norm": 55.96121150539209, + "language_loss": 0.94693041, + "learning_rate": 3.7162294723454953e-06, + "loss": 0.96497834, + "num_input_tokens_seen": 70550615, + "router_z_loss_clip": 3.37109375, + "router_z_loss_mlp": 0.24487305, + "step": 3268, + "time_per_iteration": 3.924222230911255 + }, + { + "auxiliary_loss_clip": 0.01580856, + "auxiliary_loss_mlp": 0.00236368, + "balance_loss_clip": 1.24035406, + "balance_loss_mlp": 0.21330056, + "epoch": 0.19654291297159177, + "flos": 19244636616960.0, + "grad_norm": 13.958782838057187, + "language_loss": 0.77618098, + "learning_rate": 3.7160294675978197e-06, + "loss": 0.79435325, + "num_input_tokens_seen": 70568690, + "router_z_loss_clip": 3.40625, + "router_z_loss_mlp": 0.23095703, + "step": 3269, + "time_per_iteration": 2.566984176635742 + }, + { + "auxiliary_loss_clip": 0.01568333, + "auxiliary_loss_mlp": 0.00232781, + "balance_loss_clip": 1.23283708, + "balance_loss_mlp": 0.2069488, + "epoch": 0.19660303622425973, + "flos": 25775710001280.0, + "grad_norm": 9.828762208481296, + "language_loss": 0.88941944, + "learning_rate": 3.715829397778135e-06, + "loss": 0.90743059, + "num_input_tokens_seen": 70588665, + "router_z_loss_clip": 3.35351562, + "router_z_loss_mlp": 0.25805664, + "step": 3270, + "time_per_iteration": 2.6500797271728516 + }, + { + "auxiliary_loss_clip": 0.01579069, + "auxiliary_loss_mlp": 0.00213741, + "balance_loss_clip": 1.24414897, + "balance_loss_mlp": 0.18778956, + "epoch": 0.1966631594769277, + "flos": 20595093275520.0, + "grad_norm": 14.859535110237664, + "language_loss": 0.91404366, + "learning_rate": 3.715629262894028e-06, + "loss": 0.93197179, + "num_input_tokens_seen": 70606900, + "router_z_loss_clip": 3.3515625, + "router_z_loss_mlp": 0.25927734, + "step": 3271, + "time_per_iteration": 5.381535530090332 + }, + { + "auxiliary_loss_clip": 0.01606197, + "auxiliary_loss_mlp": 0.00236038, + "balance_loss_clip": 1.26541901, + "balance_loss_mlp": 0.21176746, + "epoch": 0.19672328272959566, + "flos": 23623188600960.0, + "grad_norm": 24.964557214334917, + "language_loss": 0.86506951, + "learning_rate": 3.715429062953087e-06, + "loss": 0.88349193, + "num_input_tokens_seen": 70625955, + "router_z_loss_clip": 3.40429688, + "router_z_loss_mlp": 0.24279785, + "step": 3272, + "time_per_iteration": 2.581650495529175 + }, + { + "auxiliary_loss_clip": 0.016054, + "auxiliary_loss_mlp": 0.00244732, + "balance_loss_clip": 1.26355278, + "balance_loss_mlp": 0.21558538, + "epoch": 0.19678340598226365, + "flos": 23110922787840.0, + "grad_norm": 3.952600402349236, + "language_loss": 0.89029813, + "learning_rate": 3.7152287979629043e-06, + "loss": 0.90879941, + "num_input_tokens_seen": 70646090, + "router_z_loss_clip": 3.421875, + "router_z_loss_mlp": 0.29174805, + "step": 3273, + "time_per_iteration": 2.5990960597991943 + }, + { + "auxiliary_loss_clip": 0.01632096, + "auxiliary_loss_mlp": 0.0023012, + "balance_loss_clip": 1.28645742, + "balance_loss_mlp": 0.20512211, + "epoch": 0.19684352923493162, + "flos": 24534852716160.0, + "grad_norm": 15.225643396958727, + "language_loss": 0.86697149, + "learning_rate": 3.7150284679310735e-06, + "loss": 0.88559365, + "num_input_tokens_seen": 70666065, + "router_z_loss_clip": 3.453125, + "router_z_loss_mlp": 0.25036621, + "step": 3274, + "time_per_iteration": 2.6105189323425293 + }, + { + "auxiliary_loss_clip": 0.01629337, + "auxiliary_loss_mlp": 0.00267273, + "balance_loss_clip": 1.28881109, + "balance_loss_mlp": 0.2388058, + "epoch": 0.19690365248759958, + "flos": 21796448578560.0, + "grad_norm": 7.28893316008536, + "language_loss": 0.905047, + "learning_rate": 3.7148280728651914e-06, + "loss": 0.92401314, + "num_input_tokens_seen": 70681580, + "router_z_loss_clip": 3.40625, + "router_z_loss_mlp": 0.28479004, + "step": 3275, + "time_per_iteration": 2.592250347137451 + }, + { + "auxiliary_loss_clip": 0.01648171, + "auxiliary_loss_mlp": 0.00230493, + "balance_loss_clip": 1.2998004, + "balance_loss_mlp": 0.20337246, + "epoch": 0.19696377574026755, + "flos": 19056643810560.0, + "grad_norm": 95.67508139728109, + "language_loss": 0.88158441, + "learning_rate": 3.7146276127728563e-06, + "loss": 0.90037107, + "num_input_tokens_seen": 70697745, + "router_z_loss_clip": 3.484375, + "router_z_loss_mlp": 0.27099609, + "step": 3276, + "time_per_iteration": 4.026085615158081 + }, + { + "auxiliary_loss_clip": 0.01654294, + "auxiliary_loss_mlp": 0.00233652, + "balance_loss_clip": 1.30787325, + "balance_loss_mlp": 0.20588838, + "epoch": 0.19702389899293551, + "flos": 22820656982400.0, + "grad_norm": 68.74576517792558, + "language_loss": 0.97608268, + "learning_rate": 3.7144270876616713e-06, + "loss": 0.9949621, + "num_input_tokens_seen": 70715110, + "router_z_loss_clip": 3.46875, + "router_z_loss_mlp": 0.27770996, + "step": 3277, + "time_per_iteration": 2.561866283416748 + }, + { + "auxiliary_loss_clip": 0.01649894, + "auxiliary_loss_mlp": 0.00244971, + "balance_loss_clip": 1.29940057, + "balance_loss_mlp": 0.21577683, + "epoch": 0.19708402224560348, + "flos": 22894237992960.0, + "grad_norm": 46.74227684443058, + "language_loss": 0.73361373, + "learning_rate": 3.714226497539239e-06, + "loss": 0.75256228, + "num_input_tokens_seen": 70734715, + "router_z_loss_clip": 3.50585938, + "router_z_loss_mlp": 0.29162598, + "step": 3278, + "time_per_iteration": 2.591475009918213 + }, + { + "auxiliary_loss_clip": 0.01655038, + "auxiliary_loss_mlp": 0.00246773, + "balance_loss_clip": 1.3059113, + "balance_loss_mlp": 0.21732855, + "epoch": 0.19714414549827144, + "flos": 25662519267840.0, + "grad_norm": 154.38355946251596, + "language_loss": 0.85475028, + "learning_rate": 3.714025842413166e-06, + "loss": 0.87376845, + "num_input_tokens_seen": 70752650, + "router_z_loss_clip": 3.4921875, + "router_z_loss_mlp": 0.29443359, + "step": 3279, + "time_per_iteration": 2.6198086738586426 + }, + { + "auxiliary_loss_clip": 0.01695197, + "auxiliary_loss_mlp": 0.00235821, + "balance_loss_clip": 1.34679103, + "balance_loss_mlp": 0.20877242, + "epoch": 0.19720426875093944, + "flos": 23915824704000.0, + "grad_norm": 5.044738938900593, + "language_loss": 0.88805658, + "learning_rate": 3.713825122291061e-06, + "loss": 0.90736675, + "num_input_tokens_seen": 70772365, + "router_z_loss_clip": 3.48242188, + "router_z_loss_mlp": 0.27050781, + "step": 3280, + "time_per_iteration": 2.5816550254821777 + }, + { + "auxiliary_loss_clip": 0.01713167, + "auxiliary_loss_mlp": 0.00213185, + "balance_loss_clip": 1.35716152, + "balance_loss_mlp": 0.1839671, + "epoch": 0.1972643920036074, + "flos": 13881952828800.0, + "grad_norm": 124.44930068522754, + "language_loss": 0.84941959, + "learning_rate": 3.713624337180536e-06, + "loss": 0.8686831, + "num_input_tokens_seen": 70790340, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 0.29211426, + "step": 3281, + "time_per_iteration": 2.544574737548828 + }, + { + "auxiliary_loss_clip": 0.01716477, + "auxiliary_loss_mlp": 0.00219945, + "balance_loss_clip": 1.37145531, + "balance_loss_mlp": 0.19258657, + "epoch": 0.19732451525627537, + "flos": 19863592801920.0, + "grad_norm": 12.622962000113143, + "language_loss": 0.86611956, + "learning_rate": 3.7134234870892045e-06, + "loss": 0.88548386, + "num_input_tokens_seen": 70809295, + "router_z_loss_clip": 3.44921875, + "router_z_loss_mlp": 0.27355957, + "step": 3282, + "time_per_iteration": 2.5634145736694336 + }, + { + "auxiliary_loss_clip": 0.01716792, + "auxiliary_loss_mlp": 0.00228697, + "balance_loss_clip": 1.37278748, + "balance_loss_mlp": 0.19964564, + "epoch": 0.19738463850894333, + "flos": 24973429777920.0, + "grad_norm": 7.243081746527281, + "language_loss": 0.7910558, + "learning_rate": 3.7132225720246826e-06, + "loss": 0.81051069, + "num_input_tokens_seen": 70828765, + "router_z_loss_clip": 3.43945312, + "router_z_loss_mlp": 0.29064941, + "step": 3283, + "time_per_iteration": 2.657886505126953 + }, + { + "auxiliary_loss_clip": 0.01736662, + "auxiliary_loss_mlp": 0.00233106, + "balance_loss_clip": 1.38794756, + "balance_loss_mlp": 0.20308878, + "epoch": 0.1974447617616113, + "flos": 18368883123840.0, + "grad_norm": 3.7591961800815237, + "language_loss": 0.86467797, + "learning_rate": 3.7130215919945886e-06, + "loss": 0.88437563, + "num_input_tokens_seen": 70846805, + "router_z_loss_clip": 3.48828125, + "router_z_loss_mlp": 0.30029297, + "step": 3284, + "time_per_iteration": 2.6388537883758545 + }, + { + "auxiliary_loss_clip": 0.01751444, + "auxiliary_loss_mlp": 0.00224701, + "balance_loss_clip": 1.40368629, + "balance_loss_mlp": 0.19451755, + "epoch": 0.19750488501427926, + "flos": 22892945103360.0, + "grad_norm": 110.83966114522237, + "language_loss": 0.93079031, + "learning_rate": 3.7128205470065445e-06, + "loss": 0.95055181, + "num_input_tokens_seen": 70863805, + "router_z_loss_clip": 3.4765625, + "router_z_loss_mlp": 0.30175781, + "step": 3285, + "time_per_iteration": 2.5931429862976074 + }, + { + "auxiliary_loss_clip": 0.01740417, + "auxiliary_loss_mlp": 0.0021716, + "balance_loss_clip": 1.39478922, + "balance_loss_mlp": 0.18721455, + "epoch": 0.19756500826694723, + "flos": 21871502046720.0, + "grad_norm": 3.3545109221201104, + "language_loss": 0.97753358, + "learning_rate": 3.712619437068174e-06, + "loss": 0.99710935, + "num_input_tokens_seen": 70882660, + "router_z_loss_clip": 3.45898438, + "router_z_loss_mlp": 0.29931641, + "step": 3286, + "time_per_iteration": 2.6027541160583496 + }, + { + "auxiliary_loss_clip": 0.01740278, + "auxiliary_loss_mlp": 0.00242743, + "balance_loss_clip": 1.39507473, + "balance_loss_mlp": 0.20652735, + "epoch": 0.19762513151961522, + "flos": 15158972131200.0, + "grad_norm": 630.9313052211288, + "language_loss": 0.88830584, + "learning_rate": 3.712418262187102e-06, + "loss": 0.90813601, + "num_input_tokens_seen": 70898765, + "router_z_loss_clip": 3.453125, + "router_z_loss_mlp": 0.36230469, + "step": 3287, + "time_per_iteration": 2.587345838546753 + }, + { + "auxiliary_loss_clip": 0.01735259, + "auxiliary_loss_mlp": 0.00243358, + "balance_loss_clip": 1.38841462, + "balance_loss_mlp": 0.21112339, + "epoch": 0.1976852547722832, + "flos": 16979175878400.0, + "grad_norm": 3.862111390240117, + "language_loss": 0.91753525, + "learning_rate": 3.7122170223709584e-06, + "loss": 0.93732142, + "num_input_tokens_seen": 70916370, + "router_z_loss_clip": 3.46679688, + "router_z_loss_mlp": 0.32250977, + "step": 3288, + "time_per_iteration": 2.5336194038391113 + }, + { + "auxiliary_loss_clip": 0.01730471, + "auxiliary_loss_mlp": 0.00237552, + "balance_loss_clip": 1.38795483, + "balance_loss_mlp": 0.20436421, + "epoch": 0.19774537802495115, + "flos": 20302924049280.0, + "grad_norm": 13.162279079117496, + "language_loss": 0.78869843, + "learning_rate": 3.712015717627374e-06, + "loss": 0.80837864, + "num_input_tokens_seen": 70934870, + "router_z_loss_clip": 3.41992188, + "router_z_loss_mlp": 0.33203125, + "step": 3289, + "time_per_iteration": 2.629972219467163 + }, + { + "auxiliary_loss_clip": 0.01739909, + "auxiliary_loss_mlp": 0.00222653, + "balance_loss_clip": 1.39211178, + "balance_loss_mlp": 0.19218348, + "epoch": 0.19780550127761912, + "flos": 27235478724480.0, + "grad_norm": 3.0284548510970453, + "language_loss": 0.86550552, + "learning_rate": 3.7118143479639813e-06, + "loss": 0.88513112, + "num_input_tokens_seen": 70955140, + "router_z_loss_clip": 3.47460938, + "router_z_loss_mlp": 0.3046875, + "step": 3290, + "time_per_iteration": 2.6082053184509277 + }, + { + "auxiliary_loss_clip": 0.01742699, + "auxiliary_loss_mlp": 0.0010688, + "balance_loss_clip": 1.45337915, + "balance_loss_mlp": 0.0966282, + "epoch": 0.19786562453028708, + "flos": 63550972684800.0, + "grad_norm": 0.9105546651491299, + "language_loss": 0.60590959, + "learning_rate": 3.711612913388418e-06, + "loss": 0.62440538, + "num_input_tokens_seen": 71012005, + "router_z_loss_clip": 2.890625, + "router_z_loss_mlp": 0.10253906, + "step": 3291, + "time_per_iteration": 3.11200213432312 + }, + { + "auxiliary_loss_clip": 0.0171837, + "auxiliary_loss_mlp": 0.0025278, + "balance_loss_clip": 1.37536407, + "balance_loss_mlp": 0.21661225, + "epoch": 0.19792574778295505, + "flos": 26286647011200.0, + "grad_norm": 6.936450996173116, + "language_loss": 0.90199518, + "learning_rate": 3.7114114139083204e-06, + "loss": 0.92170674, + "num_input_tokens_seen": 71031140, + "router_z_loss_clip": 3.43164062, + "router_z_loss_mlp": 0.36157227, + "step": 3292, + "time_per_iteration": 2.6358885765075684 + }, + { + "auxiliary_loss_clip": 0.01711311, + "auxiliary_loss_mlp": 0.00235949, + "balance_loss_clip": 1.37358785, + "balance_loss_mlp": 0.20698127, + "epoch": 0.19798587103562304, + "flos": 19938107566080.0, + "grad_norm": 6.387644399729946, + "language_loss": 0.8901372, + "learning_rate": 3.7112098495313313e-06, + "loss": 0.90960979, + "num_input_tokens_seen": 71050250, + "router_z_loss_clip": 3.37890625, + "router_z_loss_mlp": 0.28930664, + "step": 3293, + "time_per_iteration": 2.604734420776367 + }, + { + "auxiliary_loss_clip": 0.01698447, + "auxiliary_loss_mlp": 0.00324164, + "balance_loss_clip": 1.35732806, + "balance_loss_mlp": 0.29014122, + "epoch": 0.198045994288291, + "flos": 20120282369280.0, + "grad_norm": 11.302691434676905, + "language_loss": 0.72843885, + "learning_rate": 3.711008220265093e-06, + "loss": 0.74866492, + "num_input_tokens_seen": 71068665, + "router_z_loss_clip": 3.41015625, + "router_z_loss_mlp": 0.34057617, + "step": 3294, + "time_per_iteration": 2.6524415016174316 + }, + { + "auxiliary_loss_clip": 0.0170503, + "auxiliary_loss_mlp": 0.00246196, + "balance_loss_clip": 1.36619186, + "balance_loss_mlp": 0.21522534, + "epoch": 0.19810611754095897, + "flos": 17967653228160.0, + "grad_norm": 153.68008581875495, + "language_loss": 0.97740352, + "learning_rate": 3.710806526117251e-06, + "loss": 0.99691582, + "num_input_tokens_seen": 71085320, + "router_z_loss_clip": 3.38867188, + "router_z_loss_mlp": 0.30981445, + "step": 3295, + "time_per_iteration": 2.543225049972534 + }, + { + "auxiliary_loss_clip": 0.0169991, + "auxiliary_loss_mlp": 0.00247386, + "balance_loss_clip": 1.36094975, + "balance_loss_mlp": 0.21753648, + "epoch": 0.19816624079362694, + "flos": 15084996071040.0, + "grad_norm": 10.439087509340725, + "language_loss": 0.88308656, + "learning_rate": 3.7106047670954544e-06, + "loss": 0.90255952, + "num_input_tokens_seen": 71102020, + "router_z_loss_clip": 3.38867188, + "router_z_loss_mlp": 0.29858398, + "step": 3296, + "time_per_iteration": 2.5369954109191895 + }, + { + "auxiliary_loss_clip": 0.01700846, + "auxiliary_loss_mlp": 0.00288074, + "balance_loss_clip": 1.36334109, + "balance_loss_mlp": 0.25545788, + "epoch": 0.1982263640462949, + "flos": 24900315644160.0, + "grad_norm": 7.633848241817781, + "language_loss": 0.74273217, + "learning_rate": 3.710402943207354e-06, + "loss": 0.7626214, + "num_input_tokens_seen": 71123390, + "router_z_loss_clip": 3.375, + "router_z_loss_mlp": 0.32592773, + "step": 3297, + "time_per_iteration": 2.668119192123413 + }, + { + "auxiliary_loss_clip": 0.01706412, + "auxiliary_loss_mlp": 0.00256837, + "balance_loss_clip": 1.36589777, + "balance_loss_mlp": 0.22654626, + "epoch": 0.19828648729896287, + "flos": 20376181837440.0, + "grad_norm": 8.157638779079988, + "language_loss": 0.89080775, + "learning_rate": 3.7102010544606016e-06, + "loss": 0.91044021, + "num_input_tokens_seen": 71141800, + "router_z_loss_clip": 3.40625, + "router_z_loss_mlp": 0.30322266, + "step": 3298, + "time_per_iteration": 2.5893654823303223 + }, + { + "auxiliary_loss_clip": 0.01687409, + "auxiliary_loss_mlp": 0.00281477, + "balance_loss_clip": 1.3476038, + "balance_loss_mlp": 0.24845564, + "epoch": 0.19834661055163083, + "flos": 18880035615360.0, + "grad_norm": 14.265885134114791, + "language_loss": 0.91875201, + "learning_rate": 3.7099991008628544e-06, + "loss": 0.9384408, + "num_input_tokens_seen": 71159505, + "router_z_loss_clip": 3.39648438, + "router_z_loss_mlp": 0.33032227, + "step": 3299, + "time_per_iteration": 2.640726327896118 + }, + { + "auxiliary_loss_clip": 0.01773688, + "auxiliary_loss_mlp": 0.00086313, + "balance_loss_clip": 1.49320865, + "balance_loss_mlp": 0.07749163, + "epoch": 0.19840673380429882, + "flos": 60259184640000.0, + "grad_norm": 0.7508830124101357, + "language_loss": 0.5337733, + "learning_rate": 3.7097970824217706e-06, + "loss": 0.55237329, + "num_input_tokens_seen": 71223265, + "router_z_loss_clip": 2.8125, + "router_z_loss_mlp": 0.08837891, + "step": 3300, + "time_per_iteration": 3.0891201496124268 + }, + { + "auxiliary_loss_clip": 0.01683655, + "auxiliary_loss_mlp": 0.0028706, + "balance_loss_clip": 1.34750783, + "balance_loss_mlp": 0.25411052, + "epoch": 0.1984668570569668, + "flos": 19902017376000.0, + "grad_norm": 3.6401044588337967, + "language_loss": 0.80211008, + "learning_rate": 3.7095949991450093e-06, + "loss": 0.82181722, + "num_input_tokens_seen": 71242385, + "router_z_loss_clip": 3.359375, + "router_z_loss_mlp": 0.32958984, + "step": 3301, + "time_per_iteration": 2.6006088256835938 + }, + { + "auxiliary_loss_clip": 0.01662954, + "auxiliary_loss_mlp": 0.00263136, + "balance_loss_clip": 1.33083558, + "balance_loss_mlp": 0.23433554, + "epoch": 0.19852698030963475, + "flos": 15630766295040.0, + "grad_norm": 7.479622966378967, + "language_loss": 0.97151214, + "learning_rate": 3.709392851040235e-06, + "loss": 0.99077308, + "num_input_tokens_seen": 71258990, + "router_z_loss_clip": 3.32226562, + "router_z_loss_mlp": 0.2878418, + "step": 3302, + "time_per_iteration": 2.611330509185791 + }, + { + "auxiliary_loss_clip": 0.0166016, + "auxiliary_loss_mlp": 0.00256904, + "balance_loss_clip": 1.32933974, + "balance_loss_mlp": 0.22672033, + "epoch": 0.19858710356230272, + "flos": 43143007311360.0, + "grad_norm": 10.686118776709614, + "language_loss": 0.82607174, + "learning_rate": 3.709190638115111e-06, + "loss": 0.84524238, + "num_input_tokens_seen": 71282770, + "router_z_loss_clip": 3.30664062, + "router_z_loss_mlp": 0.30175781, + "step": 3303, + "time_per_iteration": 2.7769429683685303 + }, + { + "auxiliary_loss_clip": 0.01636269, + "auxiliary_loss_mlp": 0.00263882, + "balance_loss_clip": 1.30631685, + "balance_loss_mlp": 0.23518831, + "epoch": 0.19864722681497068, + "flos": 35144084643840.0, + "grad_norm": 8.281393734692466, + "language_loss": 0.83481812, + "learning_rate": 3.7089883603773084e-06, + "loss": 0.85381961, + "num_input_tokens_seen": 71301410, + "router_z_loss_clip": 3.30078125, + "router_z_loss_mlp": 0.28686523, + "step": 3304, + "time_per_iteration": 2.72213077545166 + }, + { + "auxiliary_loss_clip": 0.01629099, + "auxiliary_loss_mlp": 0.00252444, + "balance_loss_clip": 1.3002938, + "balance_loss_mlp": 0.22285652, + "epoch": 0.19870735006763865, + "flos": 19426200888960.0, + "grad_norm": 23.41635539005961, + "language_loss": 0.92207527, + "learning_rate": 3.7087860178344955e-06, + "loss": 0.94089067, + "num_input_tokens_seen": 71319670, + "router_z_loss_clip": 3.29101562, + "router_z_loss_mlp": 0.29614258, + "step": 3305, + "time_per_iteration": 2.5579800605773926 + }, + { + "auxiliary_loss_clip": 0.01625723, + "auxiliary_loss_mlp": 0.00302223, + "balance_loss_clip": 1.29301739, + "balance_loss_mlp": 0.27300435, + "epoch": 0.19876747332030664, + "flos": 23547380947200.0, + "grad_norm": 40.70781241757538, + "language_loss": 0.75657457, + "learning_rate": 3.7085836104943445e-06, + "loss": 0.77585399, + "num_input_tokens_seen": 71339850, + "router_z_loss_clip": 3.33007812, + "router_z_loss_mlp": 0.29187012, + "step": 3306, + "time_per_iteration": 2.6503536701202393 + }, + { + "auxiliary_loss_clip": 0.01604146, + "auxiliary_loss_mlp": 0.0032183, + "balance_loss_clip": 1.27577662, + "balance_loss_mlp": 0.29296976, + "epoch": 0.1988275965729746, + "flos": 19829406032640.0, + "grad_norm": 6.690909754332828, + "language_loss": 0.83644748, + "learning_rate": 3.7083811383645332e-06, + "loss": 0.85570717, + "num_input_tokens_seen": 71359795, + "router_z_loss_clip": 3.28320312, + "router_z_loss_mlp": 0.28869629, + "step": 3307, + "time_per_iteration": 2.6274518966674805 + }, + { + "auxiliary_loss_clip": 0.0160666, + "auxiliary_loss_mlp": 0.00315734, + "balance_loss_clip": 1.27453637, + "balance_loss_mlp": 0.28835118, + "epoch": 0.19888771982564257, + "flos": 23513625141120.0, + "grad_norm": 18.171112408422946, + "language_loss": 0.82150424, + "learning_rate": 3.708178601452737e-06, + "loss": 0.84072816, + "num_input_tokens_seen": 71378885, + "router_z_loss_clip": 3.3203125, + "router_z_loss_mlp": 0.27404785, + "step": 3308, + "time_per_iteration": 2.6042158603668213 + }, + { + "auxiliary_loss_clip": 0.01596576, + "auxiliary_loss_mlp": 0.00351054, + "balance_loss_clip": 1.25541186, + "balance_loss_mlp": 0.32194272, + "epoch": 0.19894784307831054, + "flos": 18150510389760.0, + "grad_norm": 306.08984528305217, + "language_loss": 0.83588636, + "learning_rate": 3.7079759997666374e-06, + "loss": 0.85536265, + "num_input_tokens_seen": 71397285, + "router_z_loss_clip": 3.40820312, + "router_z_loss_mlp": 0.29089355, + "step": 3309, + "time_per_iteration": 2.683746814727783 + }, + { + "auxiliary_loss_clip": 0.01596646, + "auxiliary_loss_mlp": 0.00371989, + "balance_loss_clip": 1.25151408, + "balance_loss_mlp": 0.34399891, + "epoch": 0.1990079663309785, + "flos": 24276044246400.0, + "grad_norm": 4.172722762901281, + "language_loss": 0.93227172, + "learning_rate": 3.707773333313917e-06, + "loss": 0.95195812, + "num_input_tokens_seen": 71415775, + "router_z_loss_clip": 3.44726562, + "router_z_loss_mlp": 0.2800293, + "step": 3310, + "time_per_iteration": 4.0087034702301025 + }, + { + "auxiliary_loss_clip": 0.01595356, + "auxiliary_loss_mlp": 0.0035237, + "balance_loss_clip": 1.24066317, + "balance_loss_mlp": 0.32683563, + "epoch": 0.19906808958364647, + "flos": 34897666366080.0, + "grad_norm": 7.433018810100654, + "language_loss": 0.70414293, + "learning_rate": 3.70757060210226e-06, + "loss": 0.72362018, + "num_input_tokens_seen": 71437315, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 0.25512695, + "step": 3311, + "time_per_iteration": 2.744691848754883 + }, + { + "auxiliary_loss_clip": 0.01593955, + "auxiliary_loss_mlp": 0.00384696, + "balance_loss_clip": 1.23552847, + "balance_loss_mlp": 0.35560927, + "epoch": 0.19912821283631443, + "flos": 24024885373440.0, + "grad_norm": 2.514730262116759, + "language_loss": 0.84682864, + "learning_rate": 3.707367806139355e-06, + "loss": 0.86661518, + "num_input_tokens_seen": 71456320, + "router_z_loss_clip": 3.58203125, + "router_z_loss_mlp": 0.29101562, + "step": 3312, + "time_per_iteration": 2.5754823684692383 + }, + { + "auxiliary_loss_clip": 0.01595006, + "auxiliary_loss_mlp": 0.00438479, + "balance_loss_clip": 1.23171496, + "balance_loss_mlp": 0.40889117, + "epoch": 0.19918833608898243, + "flos": 19859031774720.0, + "grad_norm": 10.902971306348944, + "language_loss": 0.90011317, + "learning_rate": 3.7071649454328915e-06, + "loss": 0.92044806, + "num_input_tokens_seen": 71475360, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 0.29589844, + "step": 3313, + "time_per_iteration": 5.489395380020142 + }, + { + "auxiliary_loss_clip": 0.01588437, + "auxiliary_loss_mlp": 0.00413274, + "balance_loss_clip": 1.21301162, + "balance_loss_mlp": 0.3861773, + "epoch": 0.1992484593416504, + "flos": 29095794984960.0, + "grad_norm": 7.682048378198502, + "language_loss": 0.87474686, + "learning_rate": 3.7069620199905625e-06, + "loss": 0.89476395, + "num_input_tokens_seen": 71496155, + "router_z_loss_clip": 3.75390625, + "router_z_loss_mlp": 0.27111816, + "step": 3314, + "time_per_iteration": 2.6807971000671387 + }, + { + "auxiliary_loss_clip": 0.01585371, + "auxiliary_loss_mlp": 0.00391069, + "balance_loss_clip": 1.20168972, + "balance_loss_mlp": 0.36464036, + "epoch": 0.19930858259431836, + "flos": 23295001011840.0, + "grad_norm": 4.936649381620733, + "language_loss": 0.93458748, + "learning_rate": 3.7067590298200627e-06, + "loss": 0.9543519, + "num_input_tokens_seen": 71517295, + "router_z_loss_clip": 3.83984375, + "router_z_loss_mlp": 0.26428223, + "step": 3315, + "time_per_iteration": 2.643106460571289 + }, + { + "auxiliary_loss_clip": 0.01593602, + "auxiliary_loss_mlp": 0.00396192, + "balance_loss_clip": 1.20489264, + "balance_loss_mlp": 0.37008524, + "epoch": 0.19936870584698632, + "flos": 25378825651200.0, + "grad_norm": 4.554801574021615, + "language_loss": 0.78209412, + "learning_rate": 3.7065559749290892e-06, + "loss": 0.80199212, + "num_input_tokens_seen": 71540000, + "router_z_loss_clip": 3.88867188, + "router_z_loss_mlp": 0.26123047, + "step": 3316, + "time_per_iteration": 2.6912410259246826 + }, + { + "auxiliary_loss_clip": 0.01704241, + "auxiliary_loss_mlp": 0.0017224, + "balance_loss_clip": 1.33988428, + "balance_loss_mlp": 0.1609392, + "epoch": 0.1994288290996543, + "flos": 62168053109760.0, + "grad_norm": 0.8424836031286608, + "language_loss": 0.66263014, + "learning_rate": 3.706352855325342e-06, + "loss": 0.68139499, + "num_input_tokens_seen": 71607880, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 0.11279297, + "step": 3317, + "time_per_iteration": 3.1789515018463135 + }, + { + "auxiliary_loss_clip": 0.01581061, + "auxiliary_loss_mlp": 0.00354117, + "balance_loss_clip": 1.19659793, + "balance_loss_mlp": 0.32766449, + "epoch": 0.19948895235232225, + "flos": 19025832919680.0, + "grad_norm": 3.608630906279698, + "language_loss": 0.79290491, + "learning_rate": 3.7061496710165233e-06, + "loss": 0.81225669, + "num_input_tokens_seen": 71625695, + "router_z_loss_clip": 3.84960938, + "router_z_loss_mlp": 0.26452637, + "step": 3318, + "time_per_iteration": 4.025881290435791 + }, + { + "auxiliary_loss_clip": 0.01583877, + "auxiliary_loss_mlp": 0.00328978, + "balance_loss_clip": 1.1956749, + "balance_loss_mlp": 0.30297869, + "epoch": 0.19954907560499022, + "flos": 37815803182080.0, + "grad_norm": 35.982699549367446, + "language_loss": 0.85352486, + "learning_rate": 3.7059464220103385e-06, + "loss": 0.87265337, + "num_input_tokens_seen": 71648520, + "router_z_loss_clip": 3.88085938, + "router_z_loss_mlp": 0.25964355, + "step": 3319, + "time_per_iteration": 2.7836551666259766 + }, + { + "auxiliary_loss_clip": 0.01572318, + "auxiliary_loss_mlp": 0.00343487, + "balance_loss_clip": 1.18881881, + "balance_loss_mlp": 0.31500751, + "epoch": 0.1996091988576582, + "flos": 49565199594240.0, + "grad_norm": 4.2297402216123325, + "language_loss": 0.83273607, + "learning_rate": 3.7057431083144945e-06, + "loss": 0.85189408, + "num_input_tokens_seen": 71672185, + "router_z_loss_clip": 3.83007812, + "router_z_loss_mlp": 0.28442383, + "step": 3320, + "time_per_iteration": 2.8594417572021484 + }, + { + "auxiliary_loss_clip": 0.01564143, + "auxiliary_loss_mlp": 0.00385496, + "balance_loss_clip": 1.1853652, + "balance_loss_mlp": 0.35818481, + "epoch": 0.19966932211032618, + "flos": 22635788659200.0, + "grad_norm": 3.295923545560843, + "language_loss": 0.86883295, + "learning_rate": 3.705539729936701e-06, + "loss": 0.88832927, + "num_input_tokens_seen": 71692890, + "router_z_loss_clip": 3.78515625, + "router_z_loss_mlp": 0.27294922, + "step": 3321, + "time_per_iteration": 2.6478614807128906 + }, + { + "auxiliary_loss_clip": 0.01671173, + "auxiliary_loss_mlp": 0.00104199, + "balance_loss_clip": 1.33721757, + "balance_loss_mlp": 0.0940423, + "epoch": 0.19972944536299414, + "flos": 54082117745280.0, + "grad_norm": 0.9342064908490068, + "language_loss": 0.65404201, + "learning_rate": 3.7053362868846696e-06, + "loss": 0.67179573, + "num_input_tokens_seen": 71745815, + "router_z_loss_clip": 3.34375, + "router_z_loss_mlp": 0.1015625, + "step": 3322, + "time_per_iteration": 2.966479778289795 + }, + { + "auxiliary_loss_clip": 0.01691782, + "auxiliary_loss_mlp": 0.00210978, + "balance_loss_clip": 1.34446049, + "balance_loss_mlp": 0.19910431, + "epoch": 0.1997895686156621, + "flos": 69355031817600.0, + "grad_norm": 1.3034215182918623, + "language_loss": 0.56620556, + "learning_rate": 3.7051327791661153e-06, + "loss": 0.58523315, + "num_input_tokens_seen": 71806915, + "router_z_loss_clip": 3.46875, + "router_z_loss_mlp": 0.11865234, + "step": 3323, + "time_per_iteration": 3.2528433799743652 + }, + { + "auxiliary_loss_clip": 0.01561051, + "auxiliary_loss_mlp": 0.00403117, + "balance_loss_clip": 1.18227172, + "balance_loss_mlp": 0.37512687, + "epoch": 0.19984969186833007, + "flos": 18552063507840.0, + "grad_norm": 14.035140574822844, + "language_loss": 0.88788182, + "learning_rate": 3.7049292067887555e-06, + "loss": 0.90752351, + "num_input_tokens_seen": 71824645, + "router_z_loss_clip": 3.78515625, + "router_z_loss_mlp": 0.27990723, + "step": 3324, + "time_per_iteration": 2.620413064956665 + }, + { + "auxiliary_loss_clip": 0.01549292, + "auxiliary_loss_mlp": 0.00387552, + "balance_loss_clip": 1.17324889, + "balance_loss_mlp": 0.35793999, + "epoch": 0.19990981512099804, + "flos": 26429678968320.0, + "grad_norm": 33.84582193987661, + "language_loss": 0.60351312, + "learning_rate": 3.7047255697603092e-06, + "loss": 0.62288159, + "num_input_tokens_seen": 71845125, + "router_z_loss_clip": 3.7578125, + "router_z_loss_mlp": 0.29614258, + "step": 3325, + "time_per_iteration": 2.629257917404175 + }, + { + "auxiliary_loss_clip": 0.01549427, + "auxiliary_loss_mlp": 0.00421685, + "balance_loss_clip": 1.17834485, + "balance_loss_mlp": 0.39147726, + "epoch": 0.19996993837366603, + "flos": 16325997010560.0, + "grad_norm": 14.631101618667708, + "language_loss": 0.92372203, + "learning_rate": 3.7045218680884984e-06, + "loss": 0.94343317, + "num_input_tokens_seen": 71863500, + "router_z_loss_clip": 3.71679688, + "router_z_loss_mlp": 0.30224609, + "step": 3326, + "time_per_iteration": 2.5861263275146484 + }, + { + "auxiliary_loss_clip": 0.01531438, + "auxiliary_loss_mlp": 0.00508936, + "balance_loss_clip": 1.16712594, + "balance_loss_mlp": 0.47820407, + "epoch": 0.200030061626334, + "flos": 20844169159680.0, + "grad_norm": 5.712766190258668, + "language_loss": 0.79595602, + "learning_rate": 3.7043181017810476e-06, + "loss": 0.81635982, + "num_input_tokens_seen": 71881845, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 0.30761719, + "step": 3327, + "time_per_iteration": 2.5910654067993164 + }, + { + "auxiliary_loss_clip": 0.01531077, + "auxiliary_loss_mlp": 0.00447128, + "balance_loss_clip": 1.16439843, + "balance_loss_mlp": 0.41563246, + "epoch": 0.20009018487900196, + "flos": 23762629198080.0, + "grad_norm": 13.395911085146682, + "language_loss": 0.83659095, + "learning_rate": 3.7041142708456833e-06, + "loss": 0.85637295, + "num_input_tokens_seen": 71900940, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 0.31518555, + "step": 3328, + "time_per_iteration": 2.6371023654937744 + }, + { + "auxiliary_loss_clip": 0.01533503, + "auxiliary_loss_mlp": 0.0044544, + "balance_loss_clip": 1.16094589, + "balance_loss_mlp": 0.41616213, + "epoch": 0.20015030813166992, + "flos": 28111555440000.0, + "grad_norm": 9.32326841338631, + "language_loss": 0.74682915, + "learning_rate": 3.7039103752901353e-06, + "loss": 0.76661861, + "num_input_tokens_seen": 71921925, + "router_z_loss_clip": 3.72460938, + "router_z_loss_mlp": 0.29248047, + "step": 3329, + "time_per_iteration": 2.7418808937072754 + }, + { + "auxiliary_loss_clip": 0.01527665, + "auxiliary_loss_mlp": 0.00448887, + "balance_loss_clip": 1.16034532, + "balance_loss_mlp": 0.41702259, + "epoch": 0.2002104313843379, + "flos": 26067160955520.0, + "grad_norm": 8.924714763284829, + "language_loss": 0.86036003, + "learning_rate": 3.7037064151221353e-06, + "loss": 0.88012546, + "num_input_tokens_seen": 71941855, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 0.31848145, + "step": 3330, + "time_per_iteration": 2.7741734981536865 + }, + { + "auxiliary_loss_clip": 0.01522699, + "auxiliary_loss_mlp": 0.00457109, + "balance_loss_clip": 1.15919459, + "balance_loss_mlp": 0.42678258, + "epoch": 0.20027055463700585, + "flos": 22966633854720.0, + "grad_norm": 44.498109570884026, + "language_loss": 0.84106565, + "learning_rate": 3.703502390349417e-06, + "loss": 0.86086369, + "num_input_tokens_seen": 71960915, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 0.3034668, + "step": 3331, + "time_per_iteration": 2.756143808364868 + }, + { + "auxiliary_loss_clip": 0.01517393, + "auxiliary_loss_mlp": 0.00505927, + "balance_loss_clip": 1.15742302, + "balance_loss_mlp": 0.47211909, + "epoch": 0.20033067788967382, + "flos": 17165660313600.0, + "grad_norm": 1649.7640189605431, + "language_loss": 0.87632197, + "learning_rate": 3.7032983009797176e-06, + "loss": 0.89655513, + "num_input_tokens_seen": 71979220, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 0.33789062, + "step": 3332, + "time_per_iteration": 2.692411184310913 + }, + { + "auxiliary_loss_clip": 0.01567493, + "auxiliary_loss_mlp": 0.00349, + "balance_loss_clip": 1.28823555, + "balance_loss_mlp": 0.33507681, + "epoch": 0.2003908011423418, + "flos": 60825566292480.0, + "grad_norm": 0.9259355977721593, + "language_loss": 0.61901093, + "learning_rate": 3.703094147020776e-06, + "loss": 0.63817596, + "num_input_tokens_seen": 72033950, + "router_z_loss_clip": 2.796875, + "router_z_loss_mlp": 0.13964844, + "step": 3333, + "time_per_iteration": 3.038001775741577 + }, + { + "auxiliary_loss_clip": 0.01519179, + "auxiliary_loss_mlp": 0.00500474, + "balance_loss_clip": 1.15896297, + "balance_loss_mlp": 0.46683359, + "epoch": 0.20045092439500978, + "flos": 24206234163840.0, + "grad_norm": 66.62227776734161, + "language_loss": 0.89682627, + "learning_rate": 3.7028899284803334e-06, + "loss": 0.91702282, + "num_input_tokens_seen": 72051395, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 0.33642578, + "step": 3334, + "time_per_iteration": 2.6374704837799072 + }, + { + "auxiliary_loss_clip": 0.01529288, + "auxiliary_loss_mlp": 0.0047696, + "balance_loss_clip": 1.16898465, + "balance_loss_mlp": 0.44093537, + "epoch": 0.20051104764767774, + "flos": 29387605075200.0, + "grad_norm": 10.686283717365457, + "language_loss": 0.84153485, + "learning_rate": 3.702685645366134e-06, + "loss": 0.8615973, + "num_input_tokens_seen": 72071305, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 0.36035156, + "step": 3335, + "time_per_iteration": 2.7275431156158447 + }, + { + "auxiliary_loss_clip": 0.01514967, + "auxiliary_loss_mlp": 0.00501732, + "balance_loss_clip": 1.16363955, + "balance_loss_mlp": 0.46816313, + "epoch": 0.2005711709003457, + "flos": 23513804709120.0, + "grad_norm": 2.173876061963717, + "language_loss": 0.85632032, + "learning_rate": 3.7024812976859243e-06, + "loss": 0.87648726, + "num_input_tokens_seen": 72090165, + "router_z_loss_clip": 3.51367188, + "router_z_loss_mlp": 0.33544922, + "step": 3336, + "time_per_iteration": 2.682661294937134 + }, + { + "auxiliary_loss_clip": 0.01510042, + "auxiliary_loss_mlp": 0.00427259, + "balance_loss_clip": 1.15812051, + "balance_loss_mlp": 0.39473909, + "epoch": 0.20063129415301367, + "flos": 22523388024960.0, + "grad_norm": 7.607943727113308, + "language_loss": 0.86350936, + "learning_rate": 3.7022768854474532e-06, + "loss": 0.88288236, + "num_input_tokens_seen": 72107210, + "router_z_loss_clip": 3.51757812, + "router_z_loss_mlp": 0.32519531, + "step": 3337, + "time_per_iteration": 2.6008236408233643 + }, + { + "auxiliary_loss_clip": 0.01518032, + "auxiliary_loss_mlp": 0.0039276, + "balance_loss_clip": 1.16708767, + "balance_loss_mlp": 0.36084771, + "epoch": 0.20069141740568164, + "flos": 25958243940480.0, + "grad_norm": 317.1963663038814, + "language_loss": 0.75748283, + "learning_rate": 3.7020724086584724e-06, + "loss": 0.77659082, + "num_input_tokens_seen": 72126315, + "router_z_loss_clip": 3.51367188, + "router_z_loss_mlp": 0.31896973, + "step": 3338, + "time_per_iteration": 2.7257776260375977 + }, + { + "auxiliary_loss_clip": 0.01504782, + "auxiliary_loss_mlp": 0.00417458, + "balance_loss_clip": 1.15869117, + "balance_loss_mlp": 0.3872022, + "epoch": 0.2007515406583496, + "flos": 24790608529920.0, + "grad_norm": 677.7815077373339, + "language_loss": 0.78648484, + "learning_rate": 3.701867867326735e-06, + "loss": 0.80570716, + "num_input_tokens_seen": 72146470, + "router_z_loss_clip": 3.4609375, + "router_z_loss_mlp": 0.30273438, + "step": 3339, + "time_per_iteration": 2.689835786819458 + }, + { + "auxiliary_loss_clip": 0.01509634, + "auxiliary_loss_mlp": 0.00414509, + "balance_loss_clip": 1.16250086, + "balance_loss_mlp": 0.38160771, + "epoch": 0.2008116639110176, + "flos": 37925582123520.0, + "grad_norm": 9.40711996023775, + "language_loss": 0.77309477, + "learning_rate": 3.7016632614599974e-06, + "loss": 0.79233623, + "num_input_tokens_seen": 72166600, + "router_z_loss_clip": 3.47070312, + "router_z_loss_mlp": 0.32885742, + "step": 3340, + "time_per_iteration": 2.831068277359009 + }, + { + "auxiliary_loss_clip": 0.01508913, + "auxiliary_loss_mlp": 0.00376285, + "balance_loss_clip": 1.16042948, + "balance_loss_mlp": 0.34571967, + "epoch": 0.20087178716368556, + "flos": 20740531443840.0, + "grad_norm": 6.766968220932683, + "language_loss": 0.82865059, + "learning_rate": 3.701458591066019e-06, + "loss": 0.84750253, + "num_input_tokens_seen": 72185160, + "router_z_loss_clip": 3.48828125, + "router_z_loss_mlp": 0.30566406, + "step": 3341, + "time_per_iteration": 2.643202543258667 + }, + { + "auxiliary_loss_clip": 0.01507529, + "auxiliary_loss_mlp": 0.00363777, + "balance_loss_clip": 1.16245139, + "balance_loss_mlp": 0.33409435, + "epoch": 0.20093191041635353, + "flos": 23842279607040.0, + "grad_norm": 3.157524871586432, + "language_loss": 0.79988825, + "learning_rate": 3.70125385615256e-06, + "loss": 0.81860131, + "num_input_tokens_seen": 72205160, + "router_z_loss_clip": 3.44921875, + "router_z_loss_mlp": 0.29663086, + "step": 3342, + "time_per_iteration": 2.631683111190796 + }, + { + "auxiliary_loss_clip": 0.01517439, + "auxiliary_loss_mlp": 0.00377055, + "balance_loss_clip": 1.17156649, + "balance_loss_mlp": 0.34741932, + "epoch": 0.2009920336690215, + "flos": 21792067119360.0, + "grad_norm": 4.184161842415689, + "language_loss": 0.81668246, + "learning_rate": 3.701049056727384e-06, + "loss": 0.83562744, + "num_input_tokens_seen": 72223555, + "router_z_loss_clip": 3.4609375, + "router_z_loss_mlp": 0.29638672, + "step": 3343, + "time_per_iteration": 2.6480438709259033 + }, + { + "auxiliary_loss_clip": 0.01517147, + "auxiliary_loss_mlp": 0.00341978, + "balance_loss_clip": 1.1715318, + "balance_loss_mlp": 0.31036386, + "epoch": 0.20105215692168946, + "flos": 26359222440960.0, + "grad_norm": 134.9078972057988, + "language_loss": 0.89974582, + "learning_rate": 3.7008441927982574e-06, + "loss": 0.91833711, + "num_input_tokens_seen": 72242465, + "router_z_loss_clip": 3.45703125, + "router_z_loss_mlp": 0.31616211, + "step": 3344, + "time_per_iteration": 2.669680118560791 + }, + { + "auxiliary_loss_clip": 0.01518323, + "auxiliary_loss_mlp": 0.00332701, + "balance_loss_clip": 1.17399549, + "balance_loss_mlp": 0.30084848, + "epoch": 0.20111228017435742, + "flos": 18807280617600.0, + "grad_norm": 99.58664664894089, + "language_loss": 0.92340553, + "learning_rate": 3.700639264372948e-06, + "loss": 0.94191575, + "num_input_tokens_seen": 72260655, + "router_z_loss_clip": 3.44335938, + "router_z_loss_mlp": 0.31835938, + "step": 3345, + "time_per_iteration": 2.6143481731414795 + }, + { + "auxiliary_loss_clip": 0.01521331, + "auxiliary_loss_mlp": 0.00320309, + "balance_loss_clip": 1.17720163, + "balance_loss_mlp": 0.29314113, + "epoch": 0.20117240342702541, + "flos": 19975059682560.0, + "grad_norm": 9.367958661174363, + "language_loss": 0.75286436, + "learning_rate": 3.7004342714592283e-06, + "loss": 0.77128077, + "num_input_tokens_seen": 72279055, + "router_z_loss_clip": 3.43945312, + "router_z_loss_mlp": 0.27185059, + "step": 3346, + "time_per_iteration": 2.614147901535034 + }, + { + "auxiliary_loss_clip": 0.01508834, + "auxiliary_loss_mlp": 0.00307114, + "balance_loss_clip": 1.1719538, + "balance_loss_mlp": 0.2783961, + "epoch": 0.20123252667969338, + "flos": 23142703345920.0, + "grad_norm": 7.673286117842933, + "language_loss": 0.80577469, + "learning_rate": 3.70022921406487e-06, + "loss": 0.82393426, + "num_input_tokens_seen": 72297895, + "router_z_loss_clip": 3.3671875, + "router_z_loss_mlp": 0.2869873, + "step": 3347, + "time_per_iteration": 2.7201085090637207 + }, + { + "auxiliary_loss_clip": 0.01520355, + "auxiliary_loss_mlp": 0.0029426, + "balance_loss_clip": 1.18578732, + "balance_loss_mlp": 0.26641315, + "epoch": 0.20129264993236134, + "flos": 23221671396480.0, + "grad_norm": 2.0396528566841767, + "language_loss": 0.93233061, + "learning_rate": 3.70002409219765e-06, + "loss": 0.95047677, + "num_input_tokens_seen": 72318385, + "router_z_loss_clip": 3.34570312, + "router_z_loss_mlp": 0.27844238, + "step": 3348, + "time_per_iteration": 2.63254976272583 + }, + { + "auxiliary_loss_clip": 0.0151194, + "auxiliary_loss_mlp": 0.00318252, + "balance_loss_clip": 1.1773535, + "balance_loss_mlp": 0.29028562, + "epoch": 0.2013527731850293, + "flos": 21871466133120.0, + "grad_norm": 237.23598117162115, + "language_loss": 0.77857113, + "learning_rate": 3.699818905865346e-06, + "loss": 0.79687303, + "num_input_tokens_seen": 72338235, + "router_z_loss_clip": 3.34765625, + "router_z_loss_mlp": 0.27978516, + "step": 3349, + "time_per_iteration": 2.599344253540039 + }, + { + "auxiliary_loss_clip": 0.01514325, + "auxiliary_loss_mlp": 0.00273531, + "balance_loss_clip": 1.18013287, + "balance_loss_mlp": 0.24509984, + "epoch": 0.20141289643769728, + "flos": 18040803275520.0, + "grad_norm": 1.614194178611948, + "language_loss": 0.78708875, + "learning_rate": 3.6996136550757377e-06, + "loss": 0.80496728, + "num_input_tokens_seen": 72357825, + "router_z_loss_clip": 3.3359375, + "router_z_loss_mlp": 0.28442383, + "step": 3350, + "time_per_iteration": 2.5658459663391113 + }, + { + "auxiliary_loss_clip": 0.01502884, + "auxiliary_loss_mlp": 0.00270649, + "balance_loss_clip": 1.16854692, + "balance_loss_mlp": 0.24201477, + "epoch": 0.20147301969036524, + "flos": 23951412103680.0, + "grad_norm": 22.350809425950196, + "language_loss": 0.84767377, + "learning_rate": 3.69940833983661e-06, + "loss": 0.86540902, + "num_input_tokens_seen": 72376335, + "router_z_loss_clip": 3.34179688, + "router_z_loss_mlp": 0.28662109, + "step": 3351, + "time_per_iteration": 2.603834390640259 + }, + { + "auxiliary_loss_clip": 0.01512502, + "auxiliary_loss_mlp": 0.00295265, + "balance_loss_clip": 1.18027329, + "balance_loss_mlp": 0.26577234, + "epoch": 0.2015331429430332, + "flos": 25588471380480.0, + "grad_norm": 2.7912079088564217, + "language_loss": 0.86898744, + "learning_rate": 3.699202960155748e-06, + "loss": 0.88706505, + "num_input_tokens_seen": 72395440, + "router_z_loss_clip": 3.32421875, + "router_z_loss_mlp": 0.29492188, + "step": 3352, + "time_per_iteration": 4.127259016036987 + }, + { + "auxiliary_loss_clip": 0.01507006, + "auxiliary_loss_mlp": 0.00276564, + "balance_loss_clip": 1.17614794, + "balance_loss_mlp": 0.24877609, + "epoch": 0.2015932661957012, + "flos": 26724972677760.0, + "grad_norm": 6.577881360054237, + "language_loss": 0.87050712, + "learning_rate": 3.6989975160409396e-06, + "loss": 0.88834274, + "num_input_tokens_seen": 72414670, + "router_z_loss_clip": 3.31054688, + "router_z_loss_mlp": 0.27807617, + "step": 3353, + "time_per_iteration": 2.6864709854125977 + }, + { + "auxiliary_loss_clip": 0.01510269, + "auxiliary_loss_mlp": 0.00244704, + "balance_loss_clip": 1.18215239, + "balance_loss_mlp": 0.21581987, + "epoch": 0.20165338944836916, + "flos": 15633136592640.0, + "grad_norm": 726.9587299641073, + "language_loss": 0.97875476, + "learning_rate": 3.6987920074999747e-06, + "loss": 0.99630451, + "num_input_tokens_seen": 72432210, + "router_z_loss_clip": 3.28515625, + "router_z_loss_mlp": 0.28869629, + "step": 3354, + "time_per_iteration": 2.575960636138916 + }, + { + "auxiliary_loss_clip": 0.01597888, + "auxiliary_loss_mlp": 0.00122063, + "balance_loss_clip": 1.30263436, + "balance_loss_mlp": 0.11214524, + "epoch": 0.20171351270103713, + "flos": 57912529207680.0, + "grad_norm": 0.8228369603789506, + "language_loss": 0.55678731, + "learning_rate": 3.6985864345406465e-06, + "loss": 0.57398683, + "num_input_tokens_seen": 72489225, + "router_z_loss_clip": 2.9375, + "router_z_loss_mlp": 0.09912109, + "step": 3355, + "time_per_iteration": 5.961647987365723 + }, + { + "auxiliary_loss_clip": 0.01506457, + "auxiliary_loss_mlp": 0.00236028, + "balance_loss_clip": 1.18018508, + "balance_loss_mlp": 0.21021903, + "epoch": 0.2017736359537051, + "flos": 20814363849600.0, + "grad_norm": 5.963360373944326, + "language_loss": 0.89915085, + "learning_rate": 3.698380797170751e-06, + "loss": 0.91657567, + "num_input_tokens_seen": 72508715, + "router_z_loss_clip": 3.26367188, + "router_z_loss_mlp": 0.25793457, + "step": 3356, + "time_per_iteration": 2.62278151512146 + }, + { + "auxiliary_loss_clip": 0.01491746, + "auxiliary_loss_mlp": 0.00333766, + "balance_loss_clip": 1.16633165, + "balance_loss_mlp": 0.30186585, + "epoch": 0.20183375920637306, + "flos": 17092043389440.0, + "grad_norm": 21.369250909035507, + "language_loss": 0.81281453, + "learning_rate": 3.698175095398085e-06, + "loss": 0.83106959, + "num_input_tokens_seen": 72525135, + "router_z_loss_clip": 3.25, + "router_z_loss_mlp": 0.31884766, + "step": 3357, + "time_per_iteration": 2.620171070098877 + }, + { + "auxiliary_loss_clip": 0.01497693, + "auxiliary_loss_mlp": 0.00285772, + "balance_loss_clip": 1.17324197, + "balance_loss_mlp": 0.25637469, + "epoch": 0.20189388245904102, + "flos": 18661339658880.0, + "grad_norm": 3.2318712401182994, + "language_loss": 0.80791003, + "learning_rate": 3.6979693292304493e-06, + "loss": 0.82574469, + "num_input_tokens_seen": 72543690, + "router_z_loss_clip": 3.24609375, + "router_z_loss_mlp": 0.29394531, + "step": 3358, + "time_per_iteration": 2.6616175174713135 + }, + { + "auxiliary_loss_clip": 0.01494767, + "auxiliary_loss_mlp": 0.00308913, + "balance_loss_clip": 1.17580438, + "balance_loss_mlp": 0.28311577, + "epoch": 0.20195400571170902, + "flos": 16797539779200.0, + "grad_norm": 9.617034052988245, + "language_loss": 0.89514023, + "learning_rate": 3.6977634986756463e-06, + "loss": 0.91317701, + "num_input_tokens_seen": 72560725, + "router_z_loss_clip": 3.18554688, + "router_z_loss_mlp": 0.2578125, + "step": 3359, + "time_per_iteration": 2.5680928230285645 + }, + { + "auxiliary_loss_clip": 0.01541946, + "auxiliary_loss_mlp": 0.00069522, + "balance_loss_clip": 1.23367977, + "balance_loss_mlp": 0.0575535, + "epoch": 0.20201412896437698, + "flos": 67174716268800.0, + "grad_norm": 0.7841451856825833, + "language_loss": 0.58765346, + "learning_rate": 3.697557603741482e-06, + "loss": 0.60376817, + "num_input_tokens_seen": 72621940, + "router_z_loss_clip": 3.078125, + "router_z_loss_mlp": 0.11962891, + "step": 3360, + "time_per_iteration": 4.528645992279053 + }, + { + "auxiliary_loss_clip": 0.01496429, + "auxiliary_loss_mlp": 0.00287216, + "balance_loss_clip": 1.17936158, + "balance_loss_mlp": 0.26072752, + "epoch": 0.20207425221704495, + "flos": 21325013550720.0, + "grad_norm": 25.45651006361267, + "language_loss": 0.71807045, + "learning_rate": 3.697351644435763e-06, + "loss": 0.7359069, + "num_input_tokens_seen": 72639135, + "router_z_loss_clip": 3.16992188, + "router_z_loss_mlp": 0.26477051, + "step": 3361, + "time_per_iteration": 2.6669304370880127 + }, + { + "auxiliary_loss_clip": 0.01503622, + "auxiliary_loss_mlp": 0.00312401, + "balance_loss_clip": 1.18721151, + "balance_loss_mlp": 0.28662819, + "epoch": 0.2021343754697129, + "flos": 22527158952960.0, + "grad_norm": 80.34154106419717, + "language_loss": 0.85824645, + "learning_rate": 3.6971456207662993e-06, + "loss": 0.87640667, + "num_input_tokens_seen": 72658525, + "router_z_loss_clip": 3.1640625, + "router_z_loss_mlp": 0.25793457, + "step": 3362, + "time_per_iteration": 2.661623477935791 + }, + { + "auxiliary_loss_clip": 0.01505652, + "auxiliary_loss_mlp": 0.00282386, + "balance_loss_clip": 1.1877079, + "balance_loss_mlp": 0.25598109, + "epoch": 0.20219449872238088, + "flos": 19062785036160.0, + "grad_norm": 2.8500539898704607, + "language_loss": 0.83720404, + "learning_rate": 3.6969395327409035e-06, + "loss": 0.85508442, + "num_input_tokens_seen": 72678085, + "router_z_loss_clip": 3.18164062, + "router_z_loss_mlp": 0.2644043, + "step": 3363, + "time_per_iteration": 2.6145687103271484 + }, + { + "auxiliary_loss_clip": 0.01510939, + "auxiliary_loss_mlp": 0.00320953, + "balance_loss_clip": 1.19561934, + "balance_loss_mlp": 0.2960856, + "epoch": 0.20225462197504884, + "flos": 24717027519360.0, + "grad_norm": 34.14082761948568, + "language_loss": 0.80552053, + "learning_rate": 3.696733380367391e-06, + "loss": 0.82383949, + "num_input_tokens_seen": 72698695, + "router_z_loss_clip": 3.15234375, + "router_z_loss_mlp": 0.24890137, + "step": 3364, + "time_per_iteration": 2.631901264190674 + }, + { + "auxiliary_loss_clip": 0.01515703, + "auxiliary_loss_mlp": 0.00316571, + "balance_loss_clip": 1.19852555, + "balance_loss_mlp": 0.29020232, + "epoch": 0.2023147452277168, + "flos": 22018304931840.0, + "grad_norm": 3.030024924426711, + "language_loss": 0.80118322, + "learning_rate": 3.6965271636535783e-06, + "loss": 0.81950593, + "num_input_tokens_seen": 72717880, + "router_z_loss_clip": 3.16992188, + "router_z_loss_mlp": 0.26391602, + "step": 3365, + "time_per_iteration": 2.6476588249206543 + }, + { + "auxiliary_loss_clip": 0.01522185, + "auxiliary_loss_mlp": 0.00333287, + "balance_loss_clip": 1.20441651, + "balance_loss_mlp": 0.30806217, + "epoch": 0.2023748684803848, + "flos": 17745365911680.0, + "grad_norm": 8.108945897119938, + "language_loss": 0.91959721, + "learning_rate": 3.696320882607286e-06, + "loss": 0.93815196, + "num_input_tokens_seen": 72736410, + "router_z_loss_clip": 3.17773438, + "router_z_loss_mlp": 0.25219727, + "step": 3366, + "time_per_iteration": 2.631880760192871 + }, + { + "auxiliary_loss_clip": 0.01520155, + "auxiliary_loss_mlp": 0.00262881, + "balance_loss_clip": 1.1992054, + "balance_loss_mlp": 0.23762105, + "epoch": 0.20243499173305277, + "flos": 31138932493440.0, + "grad_norm": 6.546396227672464, + "language_loss": 0.75997448, + "learning_rate": 3.696114537236335e-06, + "loss": 0.77780485, + "num_input_tokens_seen": 72758295, + "router_z_loss_clip": 3.20898438, + "router_z_loss_mlp": 0.25231934, + "step": 3367, + "time_per_iteration": 2.7376925945281982 + }, + { + "auxiliary_loss_clip": 0.01525276, + "auxiliary_loss_mlp": 0.00312198, + "balance_loss_clip": 1.19821644, + "balance_loss_mlp": 0.28307551, + "epoch": 0.20249511498572073, + "flos": 33839235279360.0, + "grad_norm": 12.747926838855287, + "language_loss": 0.74108064, + "learning_rate": 3.6959081275485512e-06, + "loss": 0.75945538, + "num_input_tokens_seen": 72782495, + "router_z_loss_clip": 3.26953125, + "router_z_loss_mlp": 0.29138184, + "step": 3368, + "time_per_iteration": 2.819646120071411 + }, + { + "auxiliary_loss_clip": 0.01534908, + "auxiliary_loss_mlp": 0.00237421, + "balance_loss_clip": 1.21162701, + "balance_loss_mlp": 0.21369854, + "epoch": 0.2025552382383887, + "flos": 21215629658880.0, + "grad_norm": 12.827577633490813, + "language_loss": 0.85566616, + "learning_rate": 3.6957016535517615e-06, + "loss": 0.87338936, + "num_input_tokens_seen": 72801885, + "router_z_loss_clip": 3.23632812, + "router_z_loss_mlp": 0.23730469, + "step": 3369, + "time_per_iteration": 2.6154844760894775 + }, + { + "auxiliary_loss_clip": 0.01543912, + "auxiliary_loss_mlp": 0.00260122, + "balance_loss_clip": 1.21384406, + "balance_loss_mlp": 0.23542157, + "epoch": 0.20261536149105666, + "flos": 14647388676480.0, + "grad_norm": 5.2475378444922205, + "language_loss": 0.77221775, + "learning_rate": 3.695495115253795e-06, + "loss": 0.79025805, + "num_input_tokens_seen": 72816990, + "router_z_loss_clip": 3.30273438, + "router_z_loss_mlp": 0.24719238, + "step": 3370, + "time_per_iteration": 2.5543203353881836 + }, + { + "auxiliary_loss_clip": 0.01671191, + "auxiliary_loss_mlp": 0.00097779, + "balance_loss_clip": 1.3596853, + "balance_loss_mlp": 0.08995858, + "epoch": 0.20267548474372463, + "flos": 66783649921920.0, + "grad_norm": 0.6849595341462454, + "language_loss": 0.5826382, + "learning_rate": 3.6952885126624834e-06, + "loss": 0.60032791, + "num_input_tokens_seen": 72879240, + "router_z_loss_clip": 3.125, + "router_z_loss_mlp": 0.078125, + "step": 3371, + "time_per_iteration": 3.165196657180786 + }, + { + "auxiliary_loss_clip": 0.01548251, + "auxiliary_loss_mlp": 0.00216743, + "balance_loss_clip": 1.21448016, + "balance_loss_mlp": 0.19358087, + "epoch": 0.2027356079963926, + "flos": 24680793674880.0, + "grad_norm": 6.037760275745413, + "language_loss": 0.97728312, + "learning_rate": 3.6950818457856617e-06, + "loss": 0.99493301, + "num_input_tokens_seen": 72899030, + "router_z_loss_clip": 3.3359375, + "router_z_loss_mlp": 0.23156738, + "step": 3372, + "time_per_iteration": 2.611786365509033 + }, + { + "auxiliary_loss_clip": 0.01551281, + "auxiliary_loss_mlp": 0.00253058, + "balance_loss_clip": 1.21597433, + "balance_loss_mlp": 0.22778599, + "epoch": 0.20279573124906058, + "flos": 26392762765440.0, + "grad_norm": 49.567010792146085, + "language_loss": 0.85446185, + "learning_rate": 3.694875114631167e-06, + "loss": 0.87250525, + "num_input_tokens_seen": 72919190, + "router_z_loss_clip": 3.35351562, + "router_z_loss_mlp": 0.25280762, + "step": 3373, + "time_per_iteration": 2.6386327743530273 + }, + { + "auxiliary_loss_clip": 0.01561693, + "auxiliary_loss_mlp": 0.00238562, + "balance_loss_clip": 1.22808337, + "balance_loss_mlp": 0.21612637, + "epoch": 0.20285585450172855, + "flos": 33799984692480.0, + "grad_norm": 54.81005446443365, + "language_loss": 0.77972323, + "learning_rate": 3.6946683192068377e-06, + "loss": 0.7977258, + "num_input_tokens_seen": 72939720, + "router_z_loss_clip": 3.33398438, + "router_z_loss_mlp": 0.2244873, + "step": 3374, + "time_per_iteration": 2.7467145919799805 + }, + { + "auxiliary_loss_clip": 0.0169152, + "auxiliary_loss_mlp": 0.00121682, + "balance_loss_clip": 1.39716697, + "balance_loss_mlp": 0.11209805, + "epoch": 0.20291597775439651, + "flos": 71164823598720.0, + "grad_norm": 1.0436133620611483, + "language_loss": 0.6257416, + "learning_rate": 3.694461459520516e-06, + "loss": 0.64387357, + "num_input_tokens_seen": 73000015, + "router_z_loss_clip": 2.9375, + "router_z_loss_mlp": 0.09570312, + "step": 3375, + "time_per_iteration": 3.0800652503967285 + }, + { + "auxiliary_loss_clip": 0.01567303, + "auxiliary_loss_mlp": 0.00217794, + "balance_loss_clip": 1.23085845, + "balance_loss_mlp": 0.19321296, + "epoch": 0.20297610100706448, + "flos": 19494287118720.0, + "grad_norm": 3.3206856613344953, + "language_loss": 0.87056077, + "learning_rate": 3.6942545355800463e-06, + "loss": 0.88841176, + "num_input_tokens_seen": 73017675, + "router_z_loss_clip": 3.3671875, + "router_z_loss_mlp": 0.24609375, + "step": 3376, + "time_per_iteration": 2.5829644203186035 + }, + { + "auxiliary_loss_clip": 0.01562887, + "auxiliary_loss_mlp": 0.00261626, + "balance_loss_clip": 1.22263026, + "balance_loss_mlp": 0.23393402, + "epoch": 0.20303622425973245, + "flos": 25044245441280.0, + "grad_norm": 2.799308150810884, + "language_loss": 0.88729787, + "learning_rate": 3.6940475473932743e-06, + "loss": 0.90554303, + "num_input_tokens_seen": 73036135, + "router_z_loss_clip": 3.40039062, + "router_z_loss_mlp": 0.2767334, + "step": 3377, + "time_per_iteration": 2.6033878326416016 + }, + { + "auxiliary_loss_clip": 0.01585014, + "auxiliary_loss_mlp": 0.00240337, + "balance_loss_clip": 1.24871802, + "balance_loss_mlp": 0.21356282, + "epoch": 0.2030963475124004, + "flos": 21979988098560.0, + "grad_norm": 28.6956874020484, + "language_loss": 0.8586973, + "learning_rate": 3.69384049496805e-06, + "loss": 0.87695086, + "num_input_tokens_seen": 73054075, + "router_z_loss_clip": 3.36328125, + "router_z_loss_mlp": 0.26806641, + "step": 3378, + "time_per_iteration": 2.5993027687072754 + }, + { + "auxiliary_loss_clip": 0.015947, + "auxiliary_loss_mlp": 0.0022938, + "balance_loss_clip": 1.25321138, + "balance_loss_mlp": 0.20230751, + "epoch": 0.2031564707650684, + "flos": 19500392430720.0, + "grad_norm": 1.9801851521290597, + "language_loss": 0.89292645, + "learning_rate": 3.6936333783122242e-06, + "loss": 0.91116726, + "num_input_tokens_seen": 73073530, + "router_z_loss_clip": 3.4140625, + "router_z_loss_mlp": 0.27087402, + "step": 3379, + "time_per_iteration": 2.6782355308532715 + }, + { + "auxiliary_loss_clip": 0.01604663, + "auxiliary_loss_mlp": 0.00219142, + "balance_loss_clip": 1.26123857, + "balance_loss_mlp": 0.19432312, + "epoch": 0.20321659401773637, + "flos": 22747075971840.0, + "grad_norm": 4.36700379299658, + "language_loss": 0.92286992, + "learning_rate": 3.6934261974336505e-06, + "loss": 0.94110799, + "num_input_tokens_seen": 73092820, + "router_z_loss_clip": 3.43554688, + "router_z_loss_mlp": 0.24804688, + "step": 3380, + "time_per_iteration": 2.6686646938323975 + }, + { + "auxiliary_loss_clip": 0.01596012, + "auxiliary_loss_mlp": 0.00273094, + "balance_loss_clip": 1.25996852, + "balance_loss_mlp": 0.24314886, + "epoch": 0.20327671727040433, + "flos": 22455840499200.0, + "grad_norm": 5.09450527223547, + "language_loss": 0.86512101, + "learning_rate": 3.693218952340186e-06, + "loss": 0.88381207, + "num_input_tokens_seen": 73113385, + "router_z_loss_clip": 3.359375, + "router_z_loss_mlp": 0.29956055, + "step": 3381, + "time_per_iteration": 2.6566500663757324 + }, + { + "auxiliary_loss_clip": 0.01608725, + "auxiliary_loss_mlp": 0.00223643, + "balance_loss_clip": 1.27324712, + "balance_loss_mlp": 0.19550988, + "epoch": 0.2033368405230723, + "flos": 19535010163200.0, + "grad_norm": 2.5919786411866346, + "language_loss": 0.86449647, + "learning_rate": 3.6930116430396895e-06, + "loss": 0.88282019, + "num_input_tokens_seen": 73131195, + "router_z_loss_clip": 3.35742188, + "router_z_loss_mlp": 0.28161621, + "step": 3382, + "time_per_iteration": 2.601025342941284 + }, + { + "auxiliary_loss_clip": 0.01598668, + "auxiliary_loss_mlp": 0.00240579, + "balance_loss_clip": 1.2686615, + "balance_loss_mlp": 0.21035984, + "epoch": 0.20339696377574026, + "flos": 13809233744640.0, + "grad_norm": 2.1146859180999975, + "language_loss": 0.86342019, + "learning_rate": 3.6928042695400214e-06, + "loss": 0.88181263, + "num_input_tokens_seen": 73148850, + "router_z_loss_clip": 3.30078125, + "router_z_loss_mlp": 0.30236816, + "step": 3383, + "time_per_iteration": 2.6038742065429688 + }, + { + "auxiliary_loss_clip": 0.01611558, + "auxiliary_loss_mlp": 0.00221967, + "balance_loss_clip": 1.28001082, + "balance_loss_mlp": 0.19433458, + "epoch": 0.20345708702840823, + "flos": 20339409288960.0, + "grad_norm": 9.197926958184379, + "language_loss": 0.85287344, + "learning_rate": 3.6925968318490464e-06, + "loss": 0.87120867, + "num_input_tokens_seen": 73166775, + "router_z_loss_clip": 3.31640625, + "router_z_loss_mlp": 0.27685547, + "step": 3384, + "time_per_iteration": 2.6110074520111084 + }, + { + "auxiliary_loss_clip": 0.01595453, + "auxiliary_loss_mlp": 0.0025757, + "balance_loss_clip": 1.26830411, + "balance_loss_mlp": 0.22873281, + "epoch": 0.2035172102810762, + "flos": 20333950421760.0, + "grad_norm": 20.205458880514904, + "language_loss": 0.88728279, + "learning_rate": 3.6923893299746293e-06, + "loss": 0.90581298, + "num_input_tokens_seen": 73183215, + "router_z_loss_clip": 3.27148438, + "router_z_loss_mlp": 0.28845215, + "step": 3385, + "time_per_iteration": 2.621403217315674 + }, + { + "auxiliary_loss_clip": 0.01600082, + "auxiliary_loss_mlp": 0.00223345, + "balance_loss_clip": 1.28044963, + "balance_loss_mlp": 0.19666633, + "epoch": 0.2035773335337442, + "flos": 23330983461120.0, + "grad_norm": 417.981607225826, + "language_loss": 0.7760759, + "learning_rate": 3.692181763924639e-06, + "loss": 0.79431009, + "num_input_tokens_seen": 73203290, + "router_z_loss_clip": 3.19335938, + "router_z_loss_mlp": 0.26696777, + "step": 3386, + "time_per_iteration": 2.596284866333008 + }, + { + "auxiliary_loss_clip": 0.01602364, + "auxiliary_loss_mlp": 0.00245257, + "balance_loss_clip": 1.28420913, + "balance_loss_mlp": 0.21680161, + "epoch": 0.20363745678641215, + "flos": 28330287310080.0, + "grad_norm": 5.111261424118402, + "language_loss": 0.86794835, + "learning_rate": 3.691974133706947e-06, + "loss": 0.88642454, + "num_input_tokens_seen": 73226185, + "router_z_loss_clip": 3.18554688, + "router_z_loss_mlp": 0.28466797, + "step": 3387, + "time_per_iteration": 2.6831936836242676 + }, + { + "auxiliary_loss_clip": 0.01622614, + "auxiliary_loss_mlp": 0.00232352, + "balance_loss_clip": 1.30398202, + "balance_loss_mlp": 0.20442149, + "epoch": 0.20369758003908012, + "flos": 18915658928640.0, + "grad_norm": 5.361596552323315, + "language_loss": 0.88992524, + "learning_rate": 3.6917664393294262e-06, + "loss": 0.90847486, + "num_input_tokens_seen": 73243300, + "router_z_loss_clip": 3.18554688, + "router_z_loss_mlp": 0.27905273, + "step": 3388, + "time_per_iteration": 2.6128056049346924 + }, + { + "auxiliary_loss_clip": 0.01610407, + "auxiliary_loss_mlp": 0.00242753, + "balance_loss_clip": 1.29408455, + "balance_loss_mlp": 0.21641986, + "epoch": 0.20375770329174808, + "flos": 19206499351680.0, + "grad_norm": 16.864635970179933, + "language_loss": 0.79363906, + "learning_rate": 3.6915586807999527e-06, + "loss": 0.81217074, + "num_input_tokens_seen": 73261490, + "router_z_loss_clip": 3.16210938, + "router_z_loss_mlp": 0.26342773, + "step": 3389, + "time_per_iteration": 2.678162097930908 + }, + { + "auxiliary_loss_clip": 0.01642832, + "auxiliary_loss_mlp": 0.00252772, + "balance_loss_clip": 1.31946445, + "balance_loss_mlp": 0.22486468, + "epoch": 0.20381782654441605, + "flos": 19391008538880.0, + "grad_norm": 19.53617162703793, + "language_loss": 0.93168974, + "learning_rate": 3.691350858126404e-06, + "loss": 0.9506458, + "num_input_tokens_seen": 73280180, + "router_z_loss_clip": 3.234375, + "router_z_loss_mlp": 0.27905273, + "step": 3390, + "time_per_iteration": 2.561232805252075 + }, + { + "auxiliary_loss_clip": 0.01638442, + "auxiliary_loss_mlp": 0.00254923, + "balance_loss_clip": 1.31780708, + "balance_loss_mlp": 0.22915001, + "epoch": 0.203877949797084, + "flos": 24827704300800.0, + "grad_norm": 9.215542849925786, + "language_loss": 0.8014183, + "learning_rate": 3.691142971316662e-06, + "loss": 0.82035196, + "num_input_tokens_seen": 73300680, + "router_z_loss_clip": 3.20703125, + "router_z_loss_mlp": 0.2578125, + "step": 3391, + "time_per_iteration": 2.7119133472442627 + }, + { + "auxiliary_loss_clip": 0.01650216, + "auxiliary_loss_mlp": 0.00255684, + "balance_loss_clip": 1.32348835, + "balance_loss_mlp": 0.22790802, + "epoch": 0.20393807304975198, + "flos": 18003707504640.0, + "grad_norm": 81.13360015909967, + "language_loss": 0.96719253, + "learning_rate": 3.6909350203786086e-06, + "loss": 0.98625159, + "num_input_tokens_seen": 73316760, + "router_z_loss_clip": 3.26757812, + "router_z_loss_mlp": 0.27819824, + "step": 3392, + "time_per_iteration": 2.551901340484619 + }, + { + "auxiliary_loss_clip": 0.01628498, + "auxiliary_loss_mlp": 0.00260821, + "balance_loss_clip": 1.30063677, + "balance_loss_mlp": 0.23340249, + "epoch": 0.20399819630241997, + "flos": 24206988349440.0, + "grad_norm": 343.009814005169, + "language_loss": 0.85664582, + "learning_rate": 3.69072700532013e-06, + "loss": 0.875539, + "num_input_tokens_seen": 73339385, + "router_z_loss_clip": 3.27929688, + "router_z_loss_mlp": 0.27429199, + "step": 3393, + "time_per_iteration": 2.6687848567962646 + }, + { + "auxiliary_loss_clip": 0.0164115, + "auxiliary_loss_mlp": 0.00229031, + "balance_loss_clip": 1.31089234, + "balance_loss_mlp": 0.20267418, + "epoch": 0.20405831955508794, + "flos": 20777124424320.0, + "grad_norm": 572.2904127198888, + "language_loss": 0.93745506, + "learning_rate": 3.6905189261491137e-06, + "loss": 0.95615685, + "num_input_tokens_seen": 73357235, + "router_z_loss_clip": 3.30078125, + "router_z_loss_mlp": 0.26367188, + "step": 3394, + "time_per_iteration": 4.007560968399048 + }, + { + "auxiliary_loss_clip": 0.01654198, + "auxiliary_loss_mlp": 0.00269416, + "balance_loss_clip": 1.32305634, + "balance_loss_mlp": 0.24342832, + "epoch": 0.2041184428077559, + "flos": 15486908325120.0, + "grad_norm": 8.836536611023096, + "language_loss": 0.93967533, + "learning_rate": 3.69031078287345e-06, + "loss": 0.95891148, + "num_input_tokens_seen": 73374435, + "router_z_loss_clip": 3.31054688, + "router_z_loss_mlp": 0.25976562, + "step": 3395, + "time_per_iteration": 2.6000635623931885 + }, + { + "auxiliary_loss_clip": 0.01659335, + "auxiliary_loss_mlp": 0.00274768, + "balance_loss_clip": 1.32388389, + "balance_loss_mlp": 0.24811271, + "epoch": 0.20417856606042387, + "flos": 15588463052160.0, + "grad_norm": 35.41051793387967, + "language_loss": 0.93267965, + "learning_rate": 3.690102575501033e-06, + "loss": 0.95202076, + "num_input_tokens_seen": 73391025, + "router_z_loss_clip": 3.35546875, + "router_z_loss_mlp": 0.26660156, + "step": 3396, + "time_per_iteration": 2.5839202404022217 + }, + { + "auxiliary_loss_clip": 0.01647255, + "auxiliary_loss_mlp": 0.0025339, + "balance_loss_clip": 1.3145597, + "balance_loss_mlp": 0.22824919, + "epoch": 0.20423868931309183, + "flos": 24279348297600.0, + "grad_norm": 1.784467159728919, + "language_loss": 0.85187602, + "learning_rate": 3.6898943040397556e-06, + "loss": 0.87088245, + "num_input_tokens_seen": 73409270, + "router_z_loss_clip": 3.328125, + "router_z_loss_mlp": 0.25146484, + "step": 3397, + "time_per_iteration": 5.472602367401123 + }, + { + "auxiliary_loss_clip": 0.01650926, + "auxiliary_loss_mlp": 0.00314472, + "balance_loss_clip": 1.31654835, + "balance_loss_mlp": 0.28639823, + "epoch": 0.2042988125657598, + "flos": 18614870438400.0, + "grad_norm": 42676.41699550635, + "language_loss": 0.96768296, + "learning_rate": 3.689685968497518e-06, + "loss": 0.98733693, + "num_input_tokens_seen": 73425225, + "router_z_loss_clip": 3.34179688, + "router_z_loss_mlp": 0.28088379, + "step": 3398, + "time_per_iteration": 2.566080331802368 + }, + { + "auxiliary_loss_clip": 0.01655344, + "auxiliary_loss_mlp": 0.00291827, + "balance_loss_clip": 1.31904101, + "balance_loss_mlp": 0.26390821, + "epoch": 0.2043589358184278, + "flos": 17851230270720.0, + "grad_norm": 221.27310823486246, + "language_loss": 0.86389589, + "learning_rate": 3.6894775688822186e-06, + "loss": 0.88336754, + "num_input_tokens_seen": 73440940, + "router_z_loss_clip": 3.3671875, + "router_z_loss_mlp": 0.27929688, + "step": 3399, + "time_per_iteration": 2.602679491043091 + }, + { + "auxiliary_loss_clip": 0.0165212, + "auxiliary_loss_mlp": 0.00258034, + "balance_loss_clip": 1.31164551, + "balance_loss_mlp": 0.23223668, + "epoch": 0.20441905907109575, + "flos": 21435223455360.0, + "grad_norm": 2.8727257365162533, + "language_loss": 0.83160019, + "learning_rate": 3.6892691052017603e-06, + "loss": 0.85070169, + "num_input_tokens_seen": 73458805, + "router_z_loss_clip": 3.40429688, + "router_z_loss_mlp": 0.25805664, + "step": 3400, + "time_per_iteration": 2.6926517486572266 + }, + { + "auxiliary_loss_clip": 0.01653535, + "auxiliary_loss_mlp": 0.00266974, + "balance_loss_clip": 1.3145498, + "balance_loss_mlp": 0.24148686, + "epoch": 0.20447918232376372, + "flos": 27707703851520.0, + "grad_norm": 77.54622684439366, + "language_loss": 0.857126, + "learning_rate": 3.6890605774640487e-06, + "loss": 0.87633109, + "num_input_tokens_seen": 73479380, + "router_z_loss_clip": 3.39257812, + "router_z_loss_mlp": 0.25476074, + "step": 3401, + "time_per_iteration": 2.7303481101989746 + }, + { + "auxiliary_loss_clip": 0.01649737, + "auxiliary_loss_mlp": 0.00305939, + "balance_loss_clip": 1.3101964, + "balance_loss_mlp": 0.27719805, + "epoch": 0.20453930557643168, + "flos": 30524214113280.0, + "grad_norm": 21.619341932435333, + "language_loss": 0.75505161, + "learning_rate": 3.688851985676991e-06, + "loss": 0.77460837, + "num_input_tokens_seen": 73505105, + "router_z_loss_clip": 3.39453125, + "router_z_loss_mlp": 0.28747559, + "step": 3402, + "time_per_iteration": 4.172479629516602 + }, + { + "auxiliary_loss_clip": 0.0166139, + "auxiliary_loss_mlp": 0.00314098, + "balance_loss_clip": 1.32149839, + "balance_loss_mlp": 0.28529733, + "epoch": 0.20459942882909965, + "flos": 18987767481600.0, + "grad_norm": 3.782809885665757, + "language_loss": 0.89789522, + "learning_rate": 3.688643329848496e-06, + "loss": 0.9176501, + "num_input_tokens_seen": 73523700, + "router_z_loss_clip": 3.3984375, + "router_z_loss_mlp": 0.28808594, + "step": 3403, + "time_per_iteration": 2.5721402168273926 + }, + { + "auxiliary_loss_clip": 0.01655339, + "auxiliary_loss_mlp": 0.00343152, + "balance_loss_clip": 1.31937504, + "balance_loss_mlp": 0.31634146, + "epoch": 0.20465955208176762, + "flos": 20339050152960.0, + "grad_norm": 3.259970943546447, + "language_loss": 0.91736883, + "learning_rate": 3.6884346099864772e-06, + "loss": 0.93735373, + "num_input_tokens_seen": 73542625, + "router_z_loss_clip": 3.35742188, + "router_z_loss_mlp": 0.26831055, + "step": 3404, + "time_per_iteration": 2.6321499347686768 + }, + { + "auxiliary_loss_clip": 0.01649667, + "auxiliary_loss_mlp": 0.00331272, + "balance_loss_clip": 1.31059027, + "balance_loss_mlp": 0.30409181, + "epoch": 0.20471967533443558, + "flos": 21251288885760.0, + "grad_norm": 4.7994159641283956, + "language_loss": 0.92647988, + "learning_rate": 3.6882258260988487e-06, + "loss": 0.9462893, + "num_input_tokens_seen": 73561450, + "router_z_loss_clip": 3.39257812, + "router_z_loss_mlp": 0.27148438, + "step": 3405, + "time_per_iteration": 2.656306266784668 + }, + { + "auxiliary_loss_clip": 0.01644965, + "auxiliary_loss_mlp": 0.0033092, + "balance_loss_clip": 1.30679595, + "balance_loss_mlp": 0.30394292, + "epoch": 0.20477979858710357, + "flos": 14501555458560.0, + "grad_norm": 5.208504489349091, + "language_loss": 0.91892326, + "learning_rate": 3.6880169781935276e-06, + "loss": 0.93868208, + "num_input_tokens_seen": 73577155, + "router_z_loss_clip": 3.38085938, + "router_z_loss_mlp": 0.26965332, + "step": 3406, + "time_per_iteration": 2.631866693496704 + }, + { + "auxiliary_loss_clip": 0.01635769, + "auxiliary_loss_mlp": 0.00381906, + "balance_loss_clip": 1.30233002, + "balance_loss_mlp": 0.35484517, + "epoch": 0.20483992183977154, + "flos": 11400310085760.0, + "grad_norm": 6.3971463467254, + "language_loss": 0.76506352, + "learning_rate": 3.6878080662784336e-06, + "loss": 0.78524029, + "num_input_tokens_seen": 73594900, + "router_z_loss_clip": 3.33398438, + "router_z_loss_mlp": 0.27062988, + "step": 3407, + "time_per_iteration": 2.636451244354248 + }, + { + "auxiliary_loss_clip": 0.01628363, + "auxiliary_loss_mlp": 0.00394079, + "balance_loss_clip": 1.29657114, + "balance_loss_mlp": 0.36774603, + "epoch": 0.2049000450924395, + "flos": 19060271084160.0, + "grad_norm": 35.91726013818885, + "language_loss": 0.91911602, + "learning_rate": 3.6875990903614886e-06, + "loss": 0.93934047, + "num_input_tokens_seen": 73613810, + "router_z_loss_clip": 3.31445312, + "router_z_loss_mlp": 0.26330566, + "step": 3408, + "time_per_iteration": 2.61816668510437 + }, + { + "auxiliary_loss_clip": 0.01634087, + "auxiliary_loss_mlp": 0.00354034, + "balance_loss_clip": 1.29848421, + "balance_loss_mlp": 0.326985, + "epoch": 0.20496016834510747, + "flos": 14574561851520.0, + "grad_norm": 524.091563396857, + "language_loss": 0.76707935, + "learning_rate": 3.6873900504506166e-06, + "loss": 0.78696066, + "num_input_tokens_seen": 73631495, + "router_z_loss_clip": 3.35742188, + "router_z_loss_mlp": 0.27050781, + "step": 3409, + "time_per_iteration": 2.6266613006591797 + }, + { + "auxiliary_loss_clip": 0.01620664, + "auxiliary_loss_mlp": 0.00389432, + "balance_loss_clip": 1.29065633, + "balance_loss_mlp": 0.36132252, + "epoch": 0.20502029159777543, + "flos": 22126647329280.0, + "grad_norm": 2.5733695051172374, + "language_loss": 0.85703129, + "learning_rate": 3.687180946553745e-06, + "loss": 0.8771323, + "num_input_tokens_seen": 73652840, + "router_z_loss_clip": 3.29882812, + "router_z_loss_mlp": 0.28076172, + "step": 3410, + "time_per_iteration": 2.64896559715271 + }, + { + "auxiliary_loss_clip": 0.01632537, + "auxiliary_loss_mlp": 0.00374717, + "balance_loss_clip": 1.30517364, + "balance_loss_mlp": 0.34739411, + "epoch": 0.2050804148504434, + "flos": 25367907916800.0, + "grad_norm": 2.8881756770158913, + "language_loss": 0.81938505, + "learning_rate": 3.686971778678803e-06, + "loss": 0.83945763, + "num_input_tokens_seen": 73672150, + "router_z_loss_clip": 3.27539062, + "router_z_loss_mlp": 0.27331543, + "step": 3411, + "time_per_iteration": 2.639554977416992 + }, + { + "auxiliary_loss_clip": 0.01630737, + "auxiliary_loss_mlp": 0.00359529, + "balance_loss_clip": 1.30510497, + "balance_loss_mlp": 0.33219397, + "epoch": 0.2051405381031114, + "flos": 23620171858560.0, + "grad_norm": 21.217442003131445, + "language_loss": 0.79906112, + "learning_rate": 3.686762546833722e-06, + "loss": 0.81896377, + "num_input_tokens_seen": 73691940, + "router_z_loss_clip": 3.2578125, + "router_z_loss_mlp": 0.27307129, + "step": 3412, + "time_per_iteration": 2.658635377883911 + }, + { + "auxiliary_loss_clip": 0.01611325, + "auxiliary_loss_mlp": 0.00347606, + "balance_loss_clip": 1.28489017, + "balance_loss_mlp": 0.32123655, + "epoch": 0.20520066135577936, + "flos": 19565533745280.0, + "grad_norm": 295.601258342676, + "language_loss": 0.90155691, + "learning_rate": 3.6865532510264362e-06, + "loss": 0.92114627, + "num_input_tokens_seen": 73709080, + "router_z_loss_clip": 3.26757812, + "router_z_loss_mlp": 0.2635498, + "step": 3413, + "time_per_iteration": 2.721214771270752 + }, + { + "auxiliary_loss_clip": 0.01615723, + "auxiliary_loss_mlp": 0.00346379, + "balance_loss_clip": 1.29449153, + "balance_loss_mlp": 0.32073668, + "epoch": 0.20526078460844732, + "flos": 17676345928320.0, + "grad_norm": 5.0327704699043405, + "language_loss": 0.89967704, + "learning_rate": 3.6863438912648823e-06, + "loss": 0.91929805, + "num_input_tokens_seen": 73727670, + "router_z_loss_clip": 3.21289062, + "router_z_loss_mlp": 0.25646973, + "step": 3414, + "time_per_iteration": 2.6033012866973877 + }, + { + "auxiliary_loss_clip": 0.01595973, + "auxiliary_loss_mlp": 0.00370183, + "balance_loss_clip": 1.27703285, + "balance_loss_mlp": 0.34356323, + "epoch": 0.2053209078611153, + "flos": 21500328856320.0, + "grad_norm": 10.63310112067813, + "language_loss": 0.86197746, + "learning_rate": 3.6861344675569986e-06, + "loss": 0.881639, + "num_input_tokens_seen": 73747170, + "router_z_loss_clip": 3.18554688, + "router_z_loss_mlp": 0.26635742, + "step": 3415, + "time_per_iteration": 2.6020355224609375 + }, + { + "auxiliary_loss_clip": 0.01615826, + "auxiliary_loss_mlp": 0.00364803, + "balance_loss_clip": 1.29708862, + "balance_loss_mlp": 0.33854145, + "epoch": 0.20538103111378325, + "flos": 25663524848640.0, + "grad_norm": 2.277293005570793, + "language_loss": 0.78654754, + "learning_rate": 3.6859249799107275e-06, + "loss": 0.80635381, + "num_input_tokens_seen": 73767690, + "router_z_loss_clip": 3.18945312, + "router_z_loss_mlp": 0.26245117, + "step": 3416, + "time_per_iteration": 2.6249454021453857 + }, + { + "auxiliary_loss_clip": 0.01593092, + "auxiliary_loss_mlp": 0.00339258, + "balance_loss_clip": 1.27881312, + "balance_loss_mlp": 0.31509417, + "epoch": 0.20544115436645122, + "flos": 23148952312320.0, + "grad_norm": 9.862127523498302, + "language_loss": 0.85359675, + "learning_rate": 3.6857154283340115e-06, + "loss": 0.87292033, + "num_input_tokens_seen": 73786900, + "router_z_loss_clip": 3.14257812, + "router_z_loss_mlp": 0.24121094, + "step": 3417, + "time_per_iteration": 2.64970326423645 + }, + { + "auxiliary_loss_clip": 0.01596697, + "auxiliary_loss_mlp": 0.003335, + "balance_loss_clip": 1.28046095, + "balance_loss_mlp": 0.30789363, + "epoch": 0.20550127761911918, + "flos": 19390433921280.0, + "grad_norm": 67.74697726379345, + "language_loss": 0.94924796, + "learning_rate": 3.685505812834798e-06, + "loss": 0.96854997, + "num_input_tokens_seen": 73804515, + "router_z_loss_clip": 3.15820312, + "router_z_loss_mlp": 0.25598145, + "step": 3418, + "time_per_iteration": 2.559479236602783 + }, + { + "auxiliary_loss_clip": 0.01581149, + "auxiliary_loss_mlp": 0.00395809, + "balance_loss_clip": 1.26559734, + "balance_loss_mlp": 0.36650687, + "epoch": 0.20556140087178718, + "flos": 22893124671360.0, + "grad_norm": 20.825222298144745, + "language_loss": 0.72692871, + "learning_rate": 3.685296133421035e-06, + "loss": 0.74669838, + "num_input_tokens_seen": 73822910, + "router_z_loss_clip": 3.15234375, + "router_z_loss_mlp": 0.29296875, + "step": 3419, + "time_per_iteration": 2.610783815383911 + }, + { + "auxiliary_loss_clip": 0.01599869, + "auxiliary_loss_mlp": 0.00373111, + "balance_loss_clip": 1.2835573, + "balance_loss_mlp": 0.34608638, + "epoch": 0.20562152412445514, + "flos": 19789652655360.0, + "grad_norm": 2.7608884073767523, + "language_loss": 0.92610961, + "learning_rate": 3.685086390100674e-06, + "loss": 0.94583941, + "num_input_tokens_seen": 73841160, + "router_z_loss_clip": 3.1640625, + "router_z_loss_mlp": 0.27026367, + "step": 3420, + "time_per_iteration": 2.640362501144409 + }, + { + "auxiliary_loss_clip": 0.01603708, + "auxiliary_loss_mlp": 0.00360196, + "balance_loss_clip": 1.28931236, + "balance_loss_mlp": 0.3322649, + "epoch": 0.2056816473771231, + "flos": 31501989210240.0, + "grad_norm": 55.96737164770995, + "language_loss": 0.77535105, + "learning_rate": 3.684876582881668e-06, + "loss": 0.79499006, + "num_input_tokens_seen": 73862795, + "router_z_loss_clip": 3.14453125, + "router_z_loss_mlp": 0.27941895, + "step": 3421, + "time_per_iteration": 2.7539775371551514 + }, + { + "auxiliary_loss_clip": 0.01599289, + "auxiliary_loss_mlp": 0.00365219, + "balance_loss_clip": 1.2847743, + "balance_loss_mlp": 0.34018552, + "epoch": 0.20574177062979107, + "flos": 23258372117760.0, + "grad_norm": 2323.579834697472, + "language_loss": 0.7795496, + "learning_rate": 3.6846667117719732e-06, + "loss": 0.79919469, + "num_input_tokens_seen": 73881525, + "router_z_loss_clip": 3.14257812, + "router_z_loss_mlp": 0.25048828, + "step": 3422, + "time_per_iteration": 2.6117100715637207 + }, + { + "auxiliary_loss_clip": 0.019372, + "auxiliary_loss_mlp": 0.00191137, + "balance_loss_clip": 1.66549277, + "balance_loss_mlp": 0.17807183, + "epoch": 0.20580189388245904, + "flos": 70312518708480.0, + "grad_norm": 0.7490472599166385, + "language_loss": 0.55186224, + "learning_rate": 3.684456776779548e-06, + "loss": 0.57314557, + "num_input_tokens_seen": 73937775, + "router_z_loss_clip": 2.71875, + "router_z_loss_mlp": 0.13085938, + "step": 3423, + "time_per_iteration": 3.162081003189087 + }, + { + "auxiliary_loss_clip": 0.01596263, + "auxiliary_loss_mlp": 0.00394264, + "balance_loss_clip": 1.27944934, + "balance_loss_mlp": 0.36726344, + "epoch": 0.205862017135127, + "flos": 30737846252160.0, + "grad_norm": 8.66890325869687, + "language_loss": 0.79612827, + "learning_rate": 3.684246777912353e-06, + "loss": 0.81603354, + "num_input_tokens_seen": 73958250, + "router_z_loss_clip": 3.16992188, + "router_z_loss_mlp": 0.26989746, + "step": 3424, + "time_per_iteration": 2.677460193634033 + }, + { + "auxiliary_loss_clip": 0.01623967, + "auxiliary_loss_mlp": 0.0038267, + "balance_loss_clip": 1.3021934, + "balance_loss_mlp": 0.35659844, + "epoch": 0.20592214038779497, + "flos": 21324546673920.0, + "grad_norm": 2.574371186720462, + "language_loss": 0.79404241, + "learning_rate": 3.684036715178351e-06, + "loss": 0.81410879, + "num_input_tokens_seen": 73977775, + "router_z_loss_clip": 3.21875, + "router_z_loss_mlp": 0.26074219, + "step": 3425, + "time_per_iteration": 2.614572048187256 + }, + { + "auxiliary_loss_clip": 0.01623601, + "auxiliary_loss_mlp": 0.00364461, + "balance_loss_clip": 1.30400586, + "balance_loss_mlp": 0.3375439, + "epoch": 0.20598226364046296, + "flos": 22891652213760.0, + "grad_norm": 8.314073934996204, + "language_loss": 0.93471134, + "learning_rate": 3.683826588585508e-06, + "loss": 0.95459193, + "num_input_tokens_seen": 73996590, + "router_z_loss_clip": 3.1953125, + "router_z_loss_mlp": 0.2689209, + "step": 3426, + "time_per_iteration": 2.6819353103637695 + }, + { + "auxiliary_loss_clip": 0.01636193, + "auxiliary_loss_mlp": 0.00384769, + "balance_loss_clip": 1.31070876, + "balance_loss_mlp": 0.35747048, + "epoch": 0.20604238689313092, + "flos": 23878549365120.0, + "grad_norm": 4.675669870223246, + "language_loss": 0.82711005, + "learning_rate": 3.6836163981417926e-06, + "loss": 0.84731966, + "num_input_tokens_seen": 74015935, + "router_z_loss_clip": 3.25195312, + "router_z_loss_mlp": 0.27307129, + "step": 3427, + "time_per_iteration": 2.660818576812744 + }, + { + "auxiliary_loss_clip": 0.01626741, + "auxiliary_loss_mlp": 0.00375657, + "balance_loss_clip": 1.29882133, + "balance_loss_mlp": 0.34904966, + "epoch": 0.2061025101457989, + "flos": 22491535639680.0, + "grad_norm": 12.120466228516872, + "language_loss": 0.8118363, + "learning_rate": 3.683406143855174e-06, + "loss": 0.8318603, + "num_input_tokens_seen": 74036575, + "router_z_loss_clip": 3.27734375, + "router_z_loss_mlp": 0.26623535, + "step": 3428, + "time_per_iteration": 2.716301202774048 + }, + { + "auxiliary_loss_clip": 0.01622234, + "auxiliary_loss_mlp": 0.00387349, + "balance_loss_clip": 1.29597163, + "balance_loss_mlp": 0.35938269, + "epoch": 0.20616263339846685, + "flos": 22778928357120.0, + "grad_norm": 13.043607647072895, + "language_loss": 0.79431707, + "learning_rate": 3.6831958257336256e-06, + "loss": 0.81441295, + "num_input_tokens_seen": 74055365, + "router_z_loss_clip": 3.26171875, + "router_z_loss_mlp": 0.27990723, + "step": 3429, + "time_per_iteration": 2.64923357963562 + }, + { + "auxiliary_loss_clip": 0.01628854, + "auxiliary_loss_mlp": 0.00404414, + "balance_loss_clip": 1.30584359, + "balance_loss_mlp": 0.37587526, + "epoch": 0.20622275665113482, + "flos": 20882198684160.0, + "grad_norm": 43.03457891061135, + "language_loss": 0.90849483, + "learning_rate": 3.6829854437851237e-06, + "loss": 0.92882746, + "num_input_tokens_seen": 74074875, + "router_z_loss_clip": 3.2265625, + "router_z_loss_mlp": 0.28540039, + "step": 3430, + "time_per_iteration": 2.714949131011963 + }, + { + "auxiliary_loss_clip": 0.01614169, + "auxiliary_loss_mlp": 0.00399461, + "balance_loss_clip": 1.28958392, + "balance_loss_mlp": 0.37079161, + "epoch": 0.20628287990380278, + "flos": 19354415558400.0, + "grad_norm": 1.6863102550098865, + "language_loss": 0.75216895, + "learning_rate": 3.6827749980176444e-06, + "loss": 0.77230525, + "num_input_tokens_seen": 74094505, + "router_z_loss_clip": 3.24609375, + "router_z_loss_mlp": 0.28674316, + "step": 3431, + "time_per_iteration": 2.6162221431732178 + }, + { + "auxiliary_loss_clip": 0.01979379, + "auxiliary_loss_mlp": 0.00135157, + "balance_loss_clip": 1.68527603, + "balance_loss_mlp": 0.1239995, + "epoch": 0.20634300315647078, + "flos": 71517932248320.0, + "grad_norm": 0.7982807750735834, + "language_loss": 0.60388893, + "learning_rate": 3.6825644884391693e-06, + "loss": 0.62503433, + "num_input_tokens_seen": 74158500, + "router_z_loss_clip": 2.9375, + "router_z_loss_mlp": 0.11181641, + "step": 3432, + "time_per_iteration": 3.2824318408966064 + }, + { + "auxiliary_loss_clip": 0.01630191, + "auxiliary_loss_mlp": 0.00403265, + "balance_loss_clip": 1.30411184, + "balance_loss_mlp": 0.3750959, + "epoch": 0.20640312640913874, + "flos": 21723944976000.0, + "grad_norm": 3.6011748710983533, + "language_loss": 0.78365314, + "learning_rate": 3.682353915057679e-06, + "loss": 0.80398774, + "num_input_tokens_seen": 74176685, + "router_z_loss_clip": 3.2578125, + "router_z_loss_mlp": 0.28198242, + "step": 3433, + "time_per_iteration": 2.6188950538635254 + }, + { + "auxiliary_loss_clip": 0.01618594, + "auxiliary_loss_mlp": 0.00364079, + "balance_loss_clip": 1.29421425, + "balance_loss_mlp": 0.33627987, + "epoch": 0.2064632496618067, + "flos": 20554621626240.0, + "grad_norm": 10.199312787621931, + "language_loss": 0.91035402, + "learning_rate": 3.6821432778811604e-06, + "loss": 0.93018079, + "num_input_tokens_seen": 74194935, + "router_z_loss_clip": 3.24609375, + "router_z_loss_mlp": 0.27783203, + "step": 3434, + "time_per_iteration": 2.636929988861084 + }, + { + "auxiliary_loss_clip": 0.01624412, + "auxiliary_loss_mlp": 0.00399533, + "balance_loss_clip": 1.29842997, + "balance_loss_mlp": 0.37156606, + "epoch": 0.20652337291447467, + "flos": 29823273135360.0, + "grad_norm": 7.4466518976917895, + "language_loss": 0.75249171, + "learning_rate": 3.6819325769176004e-06, + "loss": 0.77273118, + "num_input_tokens_seen": 74215400, + "router_z_loss_clip": 3.2578125, + "router_z_loss_mlp": 0.27990723, + "step": 3435, + "time_per_iteration": 2.695666790008545 + }, + { + "auxiliary_loss_clip": 0.01615234, + "auxiliary_loss_mlp": 0.00366358, + "balance_loss_clip": 1.29581654, + "balance_loss_mlp": 0.33860642, + "epoch": 0.20658349616714264, + "flos": 26213640618240.0, + "grad_norm": 3.8371164130045314, + "language_loss": 0.95477957, + "learning_rate": 3.681721812174988e-06, + "loss": 0.97459549, + "num_input_tokens_seen": 74234090, + "router_z_loss_clip": 3.1953125, + "router_z_loss_mlp": 0.27758789, + "step": 3436, + "time_per_iteration": 2.6524510383605957 + }, + { + "auxiliary_loss_clip": 0.01607378, + "auxiliary_loss_mlp": 0.00348222, + "balance_loss_clip": 1.28746057, + "balance_loss_mlp": 0.32128114, + "epoch": 0.2066436194198106, + "flos": 25994370044160.0, + "grad_norm": 3.150885301141018, + "language_loss": 0.83754802, + "learning_rate": 3.6815109836613163e-06, + "loss": 0.857104, + "num_input_tokens_seen": 74253345, + "router_z_loss_clip": 3.19921875, + "router_z_loss_mlp": 0.26940918, + "step": 3437, + "time_per_iteration": 4.058243274688721 + }, + { + "auxiliary_loss_clip": 0.0158777, + "auxiliary_loss_mlp": 0.00348412, + "balance_loss_clip": 1.27042508, + "balance_loss_mlp": 0.31989723, + "epoch": 0.20670374267247857, + "flos": 21361067827200.0, + "grad_norm": 14.743508904785255, + "language_loss": 0.84167385, + "learning_rate": 3.6813000913845795e-06, + "loss": 0.86103565, + "num_input_tokens_seen": 74271615, + "router_z_loss_clip": 3.17578125, + "router_z_loss_mlp": 0.28527832, + "step": 3438, + "time_per_iteration": 2.5773627758026123 + }, + { + "auxiliary_loss_clip": 0.0195629, + "auxiliary_loss_mlp": 0.0013994, + "balance_loss_clip": 1.65516067, + "balance_loss_mlp": 0.12744719, + "epoch": 0.20676386592514656, + "flos": 66383281952640.0, + "grad_norm": 0.8067213537012692, + "language_loss": 0.66886163, + "learning_rate": 3.6810891353527747e-06, + "loss": 0.68982399, + "num_input_tokens_seen": 74331390, + "router_z_loss_clip": 3.0, + "router_z_loss_mlp": 0.125, + "step": 3439, + "time_per_iteration": 4.422634840011597 + }, + { + "auxiliary_loss_clip": 0.01577008, + "auxiliary_loss_mlp": 0.00292686, + "balance_loss_clip": 1.26101398, + "balance_loss_mlp": 0.26352757, + "epoch": 0.20682398917781453, + "flos": 17274577328640.0, + "grad_norm": 14.549635434568739, + "language_loss": 0.915043, + "learning_rate": 3.6808781155739014e-06, + "loss": 0.9337399, + "num_input_tokens_seen": 74347335, + "router_z_loss_clip": 3.15625, + "router_z_loss_mlp": 0.29150391, + "step": 3440, + "time_per_iteration": 4.1038658618927 + }, + { + "auxiliary_loss_clip": 0.01561346, + "auxiliary_loss_mlp": 0.00273275, + "balance_loss_clip": 1.25119352, + "balance_loss_mlp": 0.24517736, + "epoch": 0.2068841124304825, + "flos": 18077288515200.0, + "grad_norm": 9.99898717846034, + "language_loss": 0.92073292, + "learning_rate": 3.6806670320559614e-06, + "loss": 0.93907917, + "num_input_tokens_seen": 74366310, + "router_z_loss_clip": 3.09765625, + "router_z_loss_mlp": 0.28088379, + "step": 3441, + "time_per_iteration": 2.6067090034484863 + }, + { + "auxiliary_loss_clip": 0.01559821, + "auxiliary_loss_mlp": 0.00252097, + "balance_loss_clip": 1.25086105, + "balance_loss_mlp": 0.22430936, + "epoch": 0.20694423568315046, + "flos": 27347017432320.0, + "grad_norm": 6.950946598884631, + "language_loss": 0.91382182, + "learning_rate": 3.680455884806959e-06, + "loss": 0.93194097, + "num_input_tokens_seen": 74387100, + "router_z_loss_clip": 3.08984375, + "router_z_loss_mlp": 0.27807617, + "step": 3442, + "time_per_iteration": 2.6761128902435303 + }, + { + "auxiliary_loss_clip": 0.01557453, + "auxiliary_loss_mlp": 0.00261859, + "balance_loss_clip": 1.24520755, + "balance_loss_mlp": 0.2311033, + "epoch": 0.20700435893581842, + "flos": 20229845829120.0, + "grad_norm": 4.806985466567603, + "language_loss": 0.79234052, + "learning_rate": 3.6802446738349014e-06, + "loss": 0.81053364, + "num_input_tokens_seen": 74404460, + "router_z_loss_clip": 3.12109375, + "router_z_loss_mlp": 0.30737305, + "step": 3443, + "time_per_iteration": 2.657576560974121 + }, + { + "auxiliary_loss_clip": 0.0154284, + "auxiliary_loss_mlp": 0.00263485, + "balance_loss_clip": 1.24033225, + "balance_loss_mlp": 0.23775993, + "epoch": 0.2070644821884864, + "flos": 20631111638400.0, + "grad_norm": 6.095224429213428, + "language_loss": 0.92028964, + "learning_rate": 3.680033399147797e-06, + "loss": 0.93835294, + "num_input_tokens_seen": 74423790, + "router_z_loss_clip": 3.02539062, + "router_z_loss_mlp": 0.25769043, + "step": 3444, + "time_per_iteration": 4.159827470779419 + }, + { + "auxiliary_loss_clip": 0.01824833, + "auxiliary_loss_mlp": 0.00117273, + "balance_loss_clip": 1.57938385, + "balance_loss_mlp": 0.10520913, + "epoch": 0.20712460544115438, + "flos": 65941077617280.0, + "grad_norm": 1.299388323095974, + "language_loss": 0.56714767, + "learning_rate": 3.6798220607536585e-06, + "loss": 0.58656865, + "num_input_tokens_seen": 74488130, + "router_z_loss_clip": 2.453125, + "router_z_loss_mlp": 0.12060547, + "step": 3445, + "time_per_iteration": 3.089937448501587 + }, + { + "auxiliary_loss_clip": 0.01552392, + "auxiliary_loss_mlp": 0.00242712, + "balance_loss_clip": 1.25052214, + "balance_loss_mlp": 0.21496055, + "epoch": 0.20718472869382235, + "flos": 19425734012160.0, + "grad_norm": 4.9941269192228015, + "language_loss": 0.84043962, + "learning_rate": 3.6796106586604987e-06, + "loss": 0.85839069, + "num_input_tokens_seen": 74506720, + "router_z_loss_clip": 3.0234375, + "router_z_loss_mlp": 0.27770996, + "step": 3446, + "time_per_iteration": 2.657238721847534 + }, + { + "auxiliary_loss_clip": 0.01539615, + "auxiliary_loss_mlp": 0.00317822, + "balance_loss_clip": 1.23502886, + "balance_loss_mlp": 0.28670806, + "epoch": 0.2072448519464903, + "flos": 24499049834880.0, + "grad_norm": 15.051926211485783, + "language_loss": 0.73498869, + "learning_rate": 3.679399192876334e-06, + "loss": 0.75356311, + "num_input_tokens_seen": 74525330, + "router_z_loss_clip": 3.04492188, + "router_z_loss_mlp": 0.31079102, + "step": 3447, + "time_per_iteration": 2.7353036403656006 + }, + { + "auxiliary_loss_clip": 0.01547214, + "auxiliary_loss_mlp": 0.00263206, + "balance_loss_clip": 1.24945951, + "balance_loss_mlp": 0.23349863, + "epoch": 0.20730497519915828, + "flos": 23075694524160.0, + "grad_norm": 4.123866600566712, + "language_loss": 0.93480545, + "learning_rate": 3.679187663409184e-06, + "loss": 0.95290959, + "num_input_tokens_seen": 74544535, + "router_z_loss_clip": 2.98046875, + "router_z_loss_mlp": 0.29736328, + "step": 3448, + "time_per_iteration": 2.630033493041992 + }, + { + "auxiliary_loss_clip": 0.0153343, + "auxiliary_loss_mlp": 0.00260254, + "balance_loss_clip": 1.23837566, + "balance_loss_mlp": 0.22890174, + "epoch": 0.20736509845182624, + "flos": 21069042255360.0, + "grad_norm": 3.9479438323139466, + "language_loss": 0.8394224, + "learning_rate": 3.6789760702670696e-06, + "loss": 0.85735929, + "num_input_tokens_seen": 74562300, + "router_z_loss_clip": 2.953125, + "router_z_loss_mlp": 0.31359863, + "step": 3449, + "time_per_iteration": 2.6151719093322754 + }, + { + "auxiliary_loss_clip": 0.01531342, + "auxiliary_loss_mlp": 0.0028821, + "balance_loss_clip": 1.23594165, + "balance_loss_mlp": 0.25478372, + "epoch": 0.2074252217044942, + "flos": 17633288499840.0, + "grad_norm": 3.41647008663153, + "language_loss": 0.86091793, + "learning_rate": 3.6787644134580134e-06, + "loss": 0.87911344, + "num_input_tokens_seen": 74580080, + "router_z_loss_clip": 2.953125, + "router_z_loss_mlp": 0.33422852, + "step": 3450, + "time_per_iteration": 2.608027219772339 + }, + { + "auxiliary_loss_clip": 0.01530017, + "auxiliary_loss_mlp": 0.00251156, + "balance_loss_clip": 1.23736644, + "balance_loss_mlp": 0.22041172, + "epoch": 0.20748534495716217, + "flos": 23546985897600.0, + "grad_norm": 290.4984777867855, + "language_loss": 0.88355505, + "learning_rate": 3.6785526929900436e-06, + "loss": 0.90136671, + "num_input_tokens_seen": 74598980, + "router_z_loss_clip": 2.92773438, + "router_z_loss_mlp": 0.30749512, + "step": 3451, + "time_per_iteration": 2.6280319690704346 + }, + { + "auxiliary_loss_clip": 0.0172223, + "auxiliary_loss_mlp": 0.00063115, + "balance_loss_clip": 1.51969755, + "balance_loss_mlp": 0.05143248, + "epoch": 0.20754546820983016, + "flos": 52252935598080.0, + "grad_norm": 0.7779057981045974, + "language_loss": 0.56402302, + "learning_rate": 3.6783409088711875e-06, + "loss": 0.58187652, + "num_input_tokens_seen": 74655275, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.11669922, + "step": 3452, + "time_per_iteration": 3.044283866882324 + }, + { + "auxiliary_loss_clip": 0.01530528, + "auxiliary_loss_mlp": 0.00260037, + "balance_loss_clip": 1.2421329, + "balance_loss_mlp": 0.22770774, + "epoch": 0.20760559146249813, + "flos": 20412379768320.0, + "grad_norm": 7.9543338709598315, + "language_loss": 0.96564621, + "learning_rate": 3.6781290611094755e-06, + "loss": 0.98355186, + "num_input_tokens_seen": 74674560, + "router_z_loss_clip": 2.88476562, + "router_z_loss_mlp": 0.32324219, + "step": 3453, + "time_per_iteration": 2.658588171005249 + }, + { + "auxiliary_loss_clip": 0.01541158, + "auxiliary_loss_mlp": 0.00285819, + "balance_loss_clip": 1.25256503, + "balance_loss_mlp": 0.25239253, + "epoch": 0.2076657147151661, + "flos": 23186012169600.0, + "grad_norm": 56.34865306124164, + "language_loss": 0.86570168, + "learning_rate": 3.6779171497129407e-06, + "loss": 0.88397145, + "num_input_tokens_seen": 74694500, + "router_z_loss_clip": 2.88671875, + "router_z_loss_mlp": 0.33422852, + "step": 3454, + "time_per_iteration": 2.746335983276367 + }, + { + "auxiliary_loss_clip": 0.01527543, + "auxiliary_loss_mlp": 0.00214838, + "balance_loss_clip": 1.24100876, + "balance_loss_mlp": 0.18546452, + "epoch": 0.20772583796783406, + "flos": 18293219124480.0, + "grad_norm": 141.10677190198854, + "language_loss": 0.87963068, + "learning_rate": 3.6777051746896202e-06, + "loss": 0.89705443, + "num_input_tokens_seen": 74710485, + "router_z_loss_clip": 2.86328125, + "router_z_loss_mlp": 0.2935791, + "step": 3455, + "time_per_iteration": 2.657067060470581 + }, + { + "auxiliary_loss_clip": 0.01529163, + "auxiliary_loss_mlp": 0.00242774, + "balance_loss_clip": 1.24341655, + "balance_loss_mlp": 0.21161285, + "epoch": 0.20778596122050202, + "flos": 17602800831360.0, + "grad_norm": 8.093492405228416, + "language_loss": 0.8911888, + "learning_rate": 3.6774931360475516e-06, + "loss": 0.90890825, + "num_input_tokens_seen": 74727450, + "router_z_loss_clip": 2.85742188, + "router_z_loss_mlp": 0.31201172, + "step": 3456, + "time_per_iteration": 2.607266902923584 + }, + { + "auxiliary_loss_clip": 0.01522489, + "auxiliary_loss_mlp": 0.00255058, + "balance_loss_clip": 1.23261213, + "balance_loss_mlp": 0.21865159, + "epoch": 0.20784608447317, + "flos": 23805578885760.0, + "grad_norm": 1.9875749830715173, + "language_loss": 0.86231649, + "learning_rate": 3.6772810337947745e-06, + "loss": 0.88009191, + "num_input_tokens_seen": 74746725, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.36401367, + "step": 3457, + "time_per_iteration": 2.7015886306762695 + }, + { + "auxiliary_loss_clip": 0.0151803, + "auxiliary_loss_mlp": 0.00239295, + "balance_loss_clip": 1.23102736, + "balance_loss_mlp": 0.20517752, + "epoch": 0.20790620772583795, + "flos": 17639286071040.0, + "grad_norm": 4.908145904256742, + "language_loss": 0.92116439, + "learning_rate": 3.677068867939333e-06, + "loss": 0.93873763, + "num_input_tokens_seen": 74765255, + "router_z_loss_clip": 2.8671875, + "router_z_loss_mlp": 0.34082031, + "step": 3458, + "time_per_iteration": 2.6327426433563232 + }, + { + "auxiliary_loss_clip": 0.01522829, + "auxiliary_loss_mlp": 0.0023263, + "balance_loss_clip": 1.23837709, + "balance_loss_mlp": 0.20120651, + "epoch": 0.20796633097850595, + "flos": 27673481168640.0, + "grad_norm": 11.016511292540851, + "language_loss": 0.82655764, + "learning_rate": 3.676856638489272e-06, + "loss": 0.84411216, + "num_input_tokens_seen": 74785710, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 0.31396484, + "step": 3459, + "time_per_iteration": 2.7089052200317383 + }, + { + "auxiliary_loss_clip": 0.01523641, + "auxiliary_loss_mlp": 0.00223303, + "balance_loss_clip": 1.23591876, + "balance_loss_mlp": 0.19259456, + "epoch": 0.2080264542311739, + "flos": 19245606284160.0, + "grad_norm": 12.664261057694537, + "language_loss": 0.84410602, + "learning_rate": 3.6766443454526382e-06, + "loss": 0.86157548, + "num_input_tokens_seen": 74804490, + "router_z_loss_clip": 2.87304688, + "router_z_loss_mlp": 0.30712891, + "step": 3460, + "time_per_iteration": 2.6089885234832764 + }, + { + "auxiliary_loss_clip": 0.01502653, + "auxiliary_loss_mlp": 0.0023224, + "balance_loss_clip": 1.21884668, + "balance_loss_mlp": 0.20231888, + "epoch": 0.20808657748384188, + "flos": 27525924097920.0, + "grad_norm": 4.47772688058867, + "language_loss": 0.80849522, + "learning_rate": 3.6764319888374836e-06, + "loss": 0.82584417, + "num_input_tokens_seen": 74826340, + "router_z_loss_clip": 2.83984375, + "router_z_loss_mlp": 0.29907227, + "step": 3461, + "time_per_iteration": 2.6895523071289062 + }, + { + "auxiliary_loss_clip": 0.01507396, + "auxiliary_loss_mlp": 0.00262131, + "balance_loss_clip": 1.21937275, + "balance_loss_mlp": 0.22863282, + "epoch": 0.20814670073650984, + "flos": 26906931999360.0, + "grad_norm": 13.538259324522214, + "language_loss": 0.96118182, + "learning_rate": 3.6762195686518604e-06, + "loss": 0.97887707, + "num_input_tokens_seen": 74844960, + "router_z_loss_clip": 2.88476562, + "router_z_loss_mlp": 0.3347168, + "step": 3462, + "time_per_iteration": 2.6575088500976562 + }, + { + "auxiliary_loss_clip": 0.01658133, + "auxiliary_loss_mlp": 0.00098021, + "balance_loss_clip": 1.45861745, + "balance_loss_mlp": 0.0837159, + "epoch": 0.2082068239891778, + "flos": 70175735717760.0, + "grad_norm": 0.7468089376822732, + "language_loss": 0.58884764, + "learning_rate": 3.6760070849038226e-06, + "loss": 0.60640913, + "num_input_tokens_seen": 74909075, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.14257812, + "step": 3463, + "time_per_iteration": 3.221330165863037 + }, + { + "auxiliary_loss_clip": 0.01494031, + "auxiliary_loss_mlp": 0.00250351, + "balance_loss_clip": 1.2094084, + "balance_loss_mlp": 0.21816432, + "epoch": 0.20826694724184577, + "flos": 24608074590720.0, + "grad_norm": 142.2321810307915, + "language_loss": 0.77303743, + "learning_rate": 3.675794537601429e-06, + "loss": 0.79048133, + "num_input_tokens_seen": 74928125, + "router_z_loss_clip": 2.84570312, + "router_z_loss_mlp": 0.32177734, + "step": 3464, + "time_per_iteration": 2.6308348178863525 + }, + { + "auxiliary_loss_clip": 0.01515384, + "auxiliary_loss_mlp": 0.00247369, + "balance_loss_clip": 1.22611904, + "balance_loss_mlp": 0.21780525, + "epoch": 0.20832707049451377, + "flos": 12892829034240.0, + "grad_norm": 14.905240692430773, + "language_loss": 0.91202343, + "learning_rate": 3.6755819267527373e-06, + "loss": 0.92965096, + "num_input_tokens_seen": 74945090, + "router_z_loss_clip": 2.88867188, + "router_z_loss_mlp": 0.29614258, + "step": 3465, + "time_per_iteration": 2.6086678504943848 + }, + { + "auxiliary_loss_clip": 0.01502501, + "auxiliary_loss_mlp": 0.00234264, + "balance_loss_clip": 1.21559024, + "balance_loss_mlp": 0.20396104, + "epoch": 0.20838719374718173, + "flos": 22198827709440.0, + "grad_norm": 10.340112984963907, + "language_loss": 0.90856552, + "learning_rate": 3.6753692523658113e-06, + "loss": 0.92593324, + "num_input_tokens_seen": 74963630, + "router_z_loss_clip": 2.8671875, + "router_z_loss_mlp": 0.30322266, + "step": 3466, + "time_per_iteration": 2.7197132110595703 + }, + { + "auxiliary_loss_clip": 0.01494578, + "auxiliary_loss_mlp": 0.00205797, + "balance_loss_clip": 1.20628679, + "balance_loss_mlp": 0.17784223, + "epoch": 0.2084473169998497, + "flos": 15158648908800.0, + "grad_norm": 2.482656317275437, + "language_loss": 0.89261448, + "learning_rate": 3.675156514448716e-06, + "loss": 0.90961826, + "num_input_tokens_seen": 74981875, + "router_z_loss_clip": 2.88671875, + "router_z_loss_mlp": 0.27941895, + "step": 3467, + "time_per_iteration": 2.646552801132202 + }, + { + "auxiliary_loss_clip": 0.01498714, + "auxiliary_loss_mlp": 0.00202872, + "balance_loss_clip": 1.21592093, + "balance_loss_mlp": 0.17689598, + "epoch": 0.20850744025251766, + "flos": 17456788045440.0, + "grad_norm": 4.953790343819207, + "language_loss": 0.88777995, + "learning_rate": 3.674943713009518e-06, + "loss": 0.90479583, + "num_input_tokens_seen": 74999155, + "router_z_loss_clip": 2.82617188, + "router_z_loss_mlp": 0.25976562, + "step": 3468, + "time_per_iteration": 2.5931687355041504 + }, + { + "auxiliary_loss_clip": 0.01495698, + "auxiliary_loss_mlp": 0.00232745, + "balance_loss_clip": 1.20579147, + "balance_loss_mlp": 0.20196518, + "epoch": 0.20856756350518563, + "flos": 25698968593920.0, + "grad_norm": 2.4463649288419425, + "language_loss": 0.96888745, + "learning_rate": 3.6747308480562856e-06, + "loss": 0.98617196, + "num_input_tokens_seen": 75017850, + "router_z_loss_clip": 2.89648438, + "router_z_loss_mlp": 0.30761719, + "step": 3469, + "time_per_iteration": 2.6519081592559814 + }, + { + "auxiliary_loss_clip": 0.01501217, + "auxiliary_loss_mlp": 0.00197717, + "balance_loss_clip": 1.20796692, + "balance_loss_mlp": 0.16913083, + "epoch": 0.2086276867578536, + "flos": 37889060970240.0, + "grad_norm": 51.27594386544996, + "language_loss": 0.83493388, + "learning_rate": 3.674517919597092e-06, + "loss": 0.85192323, + "num_input_tokens_seen": 75039270, + "router_z_loss_clip": 2.93359375, + "router_z_loss_mlp": 0.28588867, + "step": 3470, + "time_per_iteration": 2.808732748031616 + }, + { + "auxiliary_loss_clip": 0.01490091, + "auxiliary_loss_mlp": 0.00212821, + "balance_loss_clip": 1.20005763, + "balance_loss_mlp": 0.18525934, + "epoch": 0.20868781001052156, + "flos": 25557049958400.0, + "grad_norm": 6.610273730900486, + "language_loss": 0.81131488, + "learning_rate": 3.674304927640011e-06, + "loss": 0.82834399, + "num_input_tokens_seen": 75059350, + "router_z_loss_clip": 2.90039062, + "router_z_loss_mlp": 0.27587891, + "step": 3471, + "time_per_iteration": 2.74190092086792 + }, + { + "auxiliary_loss_clip": 0.01498807, + "auxiliary_loss_mlp": 0.00228271, + "balance_loss_clip": 1.19936085, + "balance_loss_mlp": 0.19819482, + "epoch": 0.20874793326318955, + "flos": 27529192235520.0, + "grad_norm": 8.704251116889646, + "language_loss": 0.84175128, + "learning_rate": 3.67409187219312e-06, + "loss": 0.85902202, + "num_input_tokens_seen": 75080150, + "router_z_loss_clip": 2.99609375, + "router_z_loss_mlp": 0.30078125, + "step": 3472, + "time_per_iteration": 2.7337372303009033 + }, + { + "auxiliary_loss_clip": 0.01497449, + "auxiliary_loss_mlp": 0.00221215, + "balance_loss_clip": 1.20417047, + "balance_loss_mlp": 0.19348675, + "epoch": 0.20880805651585752, + "flos": 18548795370240.0, + "grad_norm": 4.007316835636918, + "language_loss": 0.92276502, + "learning_rate": 3.6738787532644966e-06, + "loss": 0.93995166, + "num_input_tokens_seen": 75097920, + "router_z_loss_clip": 2.93359375, + "router_z_loss_mlp": 0.27746582, + "step": 3473, + "time_per_iteration": 2.625120162963867 + }, + { + "auxiliary_loss_clip": 0.01615163, + "auxiliary_loss_mlp": 0.00078627, + "balance_loss_clip": 1.41480267, + "balance_loss_mlp": 0.06923302, + "epoch": 0.20886817976852548, + "flos": 65946644225280.0, + "grad_norm": 0.8768414329438375, + "language_loss": 0.63612819, + "learning_rate": 3.6736655708622235e-06, + "loss": 0.65306604, + "num_input_tokens_seen": 75152410, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.09375, + "step": 3474, + "time_per_iteration": 3.0562610626220703 + }, + { + "auxiliary_loss_clip": 0.01485888, + "auxiliary_loss_mlp": 0.00216708, + "balance_loss_clip": 1.19228137, + "balance_loss_mlp": 0.18945722, + "epoch": 0.20892830302119345, + "flos": 36539178929280.0, + "grad_norm": 7.946646497610969, + "language_loss": 0.79150808, + "learning_rate": 3.6734523249943844e-06, + "loss": 0.80853403, + "num_input_tokens_seen": 75173265, + "router_z_loss_clip": 2.93554688, + "router_z_loss_mlp": 0.27270508, + "step": 3475, + "time_per_iteration": 2.773393392562866 + }, + { + "auxiliary_loss_clip": 0.01480855, + "auxiliary_loss_mlp": 0.00243083, + "balance_loss_clip": 1.18371022, + "balance_loss_mlp": 0.21478267, + "epoch": 0.2089884262738614, + "flos": 20956749361920.0, + "grad_norm": 4.279477986365849, + "language_loss": 0.77272713, + "learning_rate": 3.673239015669065e-06, + "loss": 0.78996652, + "num_input_tokens_seen": 75193640, + "router_z_loss_clip": 2.96875, + "router_z_loss_mlp": 0.28295898, + "step": 3476, + "time_per_iteration": 2.654015302658081 + }, + { + "auxiliary_loss_clip": 0.01469564, + "auxiliary_loss_mlp": 0.00189927, + "balance_loss_clip": 1.17942548, + "balance_loss_mlp": 0.1625091, + "epoch": 0.20904854952652938, + "flos": 22784028088320.0, + "grad_norm": 22.029054802019886, + "language_loss": 0.97070217, + "learning_rate": 3.6730256428943544e-06, + "loss": 0.98729706, + "num_input_tokens_seen": 75212545, + "router_z_loss_clip": 2.90429688, + "router_z_loss_mlp": 0.27404785, + "step": 3477, + "time_per_iteration": 2.597456932067871 + }, + { + "auxiliary_loss_clip": 0.01482688, + "auxiliary_loss_mlp": 0.00216822, + "balance_loss_clip": 1.18844676, + "balance_loss_mlp": 0.18969023, + "epoch": 0.20910867277919734, + "flos": 27303277645440.0, + "grad_norm": 35.464782932038545, + "language_loss": 0.76445377, + "learning_rate": 3.672812206678344e-06, + "loss": 0.78144884, + "num_input_tokens_seen": 75230865, + "router_z_loss_clip": 2.94140625, + "router_z_loss_mlp": 0.27124023, + "step": 3478, + "time_per_iteration": 2.6702702045440674 + }, + { + "auxiliary_loss_clip": 0.01482825, + "auxiliary_loss_mlp": 0.00218926, + "balance_loss_clip": 1.18798089, + "balance_loss_mlp": 0.19162694, + "epoch": 0.20916879603186533, + "flos": 14319237000960.0, + "grad_norm": 5.148566983734678, + "language_loss": 0.91909617, + "learning_rate": 3.672598707029127e-06, + "loss": 0.93611377, + "num_input_tokens_seen": 75248285, + "router_z_loss_clip": 2.94726562, + "router_z_loss_mlp": 0.27294922, + "step": 3479, + "time_per_iteration": 3.9548451900482178 + }, + { + "auxiliary_loss_clip": 0.01477847, + "auxiliary_loss_mlp": 0.00253086, + "balance_loss_clip": 1.18404937, + "balance_loss_mlp": 0.22514299, + "epoch": 0.2092289192845333, + "flos": 22273019251200.0, + "grad_norm": 8.477505895438382, + "language_loss": 0.84408134, + "learning_rate": 3.6723851439548003e-06, + "loss": 0.86139071, + "num_input_tokens_seen": 75266310, + "router_z_loss_clip": 2.9375, + "router_z_loss_mlp": 0.27954102, + "step": 3480, + "time_per_iteration": 2.6153805255889893 + }, + { + "auxiliary_loss_clip": 0.01480879, + "auxiliary_loss_mlp": 0.00200692, + "balance_loss_clip": 1.18732297, + "balance_loss_mlp": 0.17454913, + "epoch": 0.20928904253720126, + "flos": 14830712714880.0, + "grad_norm": 164.24451642077236, + "language_loss": 0.83763111, + "learning_rate": 3.67217151746346e-06, + "loss": 0.85444671, + "num_input_tokens_seen": 75284175, + "router_z_loss_clip": 2.93554688, + "router_z_loss_mlp": 0.26171875, + "step": 3481, + "time_per_iteration": 2.6292145252227783 + }, + { + "auxiliary_loss_clip": 0.01480962, + "auxiliary_loss_mlp": 0.00195333, + "balance_loss_clip": 1.18230557, + "balance_loss_mlp": 0.16865417, + "epoch": 0.20934916578986923, + "flos": 23259162216960.0, + "grad_norm": 26.412597338915898, + "language_loss": 0.91040158, + "learning_rate": 3.671957827563209e-06, + "loss": 0.92716455, + "num_input_tokens_seen": 75303465, + "router_z_loss_clip": 2.98828125, + "router_z_loss_mlp": 0.26672363, + "step": 3482, + "time_per_iteration": 5.551847696304321 + }, + { + "auxiliary_loss_clip": 0.01490848, + "auxiliary_loss_mlp": 0.00196787, + "balance_loss_clip": 1.19272411, + "balance_loss_mlp": 0.17132419, + "epoch": 0.2094092890425372, + "flos": 32014398677760.0, + "grad_norm": 14.797961797233237, + "language_loss": 0.78600037, + "learning_rate": 3.6717440742621494e-06, + "loss": 0.80287671, + "num_input_tokens_seen": 75325290, + "router_z_loss_clip": 2.98046875, + "router_z_loss_mlp": 0.25463867, + "step": 3483, + "time_per_iteration": 2.7233521938323975 + }, + { + "auxiliary_loss_clip": 0.01472007, + "auxiliary_loss_mlp": 0.00244405, + "balance_loss_clip": 1.1781522, + "balance_loss_mlp": 0.21782085, + "epoch": 0.20946941229520516, + "flos": 20010647082240.0, + "grad_norm": 5.749147764887854, + "language_loss": 0.82755196, + "learning_rate": 3.6715302575683865e-06, + "loss": 0.84471601, + "num_input_tokens_seen": 75343895, + "router_z_loss_clip": 2.9375, + "router_z_loss_mlp": 0.26586914, + "step": 3484, + "time_per_iteration": 2.6083261966705322 + }, + { + "auxiliary_loss_clip": 0.01480855, + "auxiliary_loss_mlp": 0.00253267, + "balance_loss_clip": 1.1821003, + "balance_loss_mlp": 0.22558701, + "epoch": 0.20952953554787315, + "flos": 30740072895360.0, + "grad_norm": 11.361404794956952, + "language_loss": 0.75073242, + "learning_rate": 3.6713163774900292e-06, + "loss": 0.76807368, + "num_input_tokens_seen": 75367100, + "router_z_loss_clip": 2.98632812, + "router_z_loss_mlp": 0.27685547, + "step": 3485, + "time_per_iteration": 2.71386456489563 + }, + { + "auxiliary_loss_clip": 0.01492268, + "auxiliary_loss_mlp": 0.00230229, + "balance_loss_clip": 1.19099259, + "balance_loss_mlp": 0.20083177, + "epoch": 0.20958965880054112, + "flos": 27049209770880.0, + "grad_norm": 29.6434983057903, + "language_loss": 0.88840181, + "learning_rate": 3.6711024340351875e-06, + "loss": 0.90562671, + "num_input_tokens_seen": 75389925, + "router_z_loss_clip": 3.01171875, + "router_z_loss_mlp": 0.29418945, + "step": 3486, + "time_per_iteration": 2.6766676902770996 + }, + { + "auxiliary_loss_clip": 0.01487948, + "auxiliary_loss_mlp": 0.00230185, + "balance_loss_clip": 1.18558085, + "balance_loss_mlp": 0.20228976, + "epoch": 0.20964978205320908, + "flos": 34204123589760.0, + "grad_norm": 31.46009411164163, + "language_loss": 0.92274857, + "learning_rate": 3.6708884272119737e-06, + "loss": 0.93992984, + "num_input_tokens_seen": 75408575, + "router_z_loss_clip": 3.0234375, + "router_z_loss_mlp": 0.2791748, + "step": 3487, + "time_per_iteration": 4.210130453109741 + }, + { + "auxiliary_loss_clip": 0.01487901, + "auxiliary_loss_mlp": 0.0020866, + "balance_loss_clip": 1.18521476, + "balance_loss_mlp": 0.18076494, + "epoch": 0.20970990530587705, + "flos": 23477391296640.0, + "grad_norm": 7.854326500418065, + "language_loss": 0.81484342, + "learning_rate": 3.670674357028504e-06, + "loss": 0.83180904, + "num_input_tokens_seen": 75427155, + "router_z_loss_clip": 3.02929688, + "router_z_loss_mlp": 0.27893066, + "step": 3488, + "time_per_iteration": 2.6238105297088623 + }, + { + "auxiliary_loss_clip": 0.01509061, + "auxiliary_loss_mlp": 0.00227107, + "balance_loss_clip": 1.20090771, + "balance_loss_mlp": 0.19956946, + "epoch": 0.209770028558545, + "flos": 18551452976640.0, + "grad_norm": 20.4358991811247, + "language_loss": 0.87407899, + "learning_rate": 3.6704602234928945e-06, + "loss": 0.89144075, + "num_input_tokens_seen": 75444450, + "router_z_loss_clip": 3.08007812, + "router_z_loss_mlp": 0.27563477, + "step": 3489, + "time_per_iteration": 2.593170166015625 + }, + { + "auxiliary_loss_clip": 0.01501359, + "auxiliary_loss_mlp": 0.00222404, + "balance_loss_clip": 1.19152629, + "balance_loss_mlp": 0.19379413, + "epoch": 0.20983015181121298, + "flos": 21617003208960.0, + "grad_norm": 8.804924387027768, + "language_loss": 0.80008614, + "learning_rate": 3.670246026613266e-06, + "loss": 0.8173238, + "num_input_tokens_seen": 75462625, + "router_z_loss_clip": 3.09765625, + "router_z_loss_mlp": 0.28613281, + "step": 3490, + "time_per_iteration": 2.588587522506714 + }, + { + "auxiliary_loss_clip": 0.01490922, + "auxiliary_loss_mlp": 0.00185676, + "balance_loss_clip": 1.19164777, + "balance_loss_mlp": 0.15984355, + "epoch": 0.20989027506388094, + "flos": 16614718531200.0, + "grad_norm": 26.151648460501747, + "language_loss": 0.7758534, + "learning_rate": 3.6700317663977415e-06, + "loss": 0.79261935, + "num_input_tokens_seen": 75480640, + "router_z_loss_clip": 2.99609375, + "router_z_loss_mlp": 0.25817871, + "step": 3491, + "time_per_iteration": 2.5811173915863037 + }, + { + "auxiliary_loss_clip": 0.01505034, + "auxiliary_loss_mlp": 0.00197941, + "balance_loss_clip": 1.19101465, + "balance_loss_mlp": 0.16840082, + "epoch": 0.20995039831654894, + "flos": 23216823060480.0, + "grad_norm": 61.74098755375713, + "language_loss": 0.87104452, + "learning_rate": 3.669817442854444e-06, + "loss": 0.88807428, + "num_input_tokens_seen": 75494900, + "router_z_loss_clip": 3.14257812, + "router_z_loss_mlp": 0.29504395, + "step": 3492, + "time_per_iteration": 2.651395082473755 + }, + { + "auxiliary_loss_clip": 0.01508371, + "auxiliary_loss_mlp": 0.00210292, + "balance_loss_clip": 1.19839001, + "balance_loss_mlp": 0.1828979, + "epoch": 0.2100105215692169, + "flos": 18147493647360.0, + "grad_norm": 1.9903957774159589, + "language_loss": 0.9237057, + "learning_rate": 3.669603055991502e-06, + "loss": 0.94089234, + "num_input_tokens_seen": 75513370, + "router_z_loss_clip": 3.09960938, + "router_z_loss_mlp": 0.27404785, + "step": 3493, + "time_per_iteration": 2.7487635612487793 + }, + { + "auxiliary_loss_clip": 0.01533556, + "auxiliary_loss_mlp": 0.00204379, + "balance_loss_clip": 1.21669531, + "balance_loss_mlp": 0.1763171, + "epoch": 0.21007064482188487, + "flos": 15961611490560.0, + "grad_norm": 18.04546089057488, + "language_loss": 0.78289151, + "learning_rate": 3.6693886058170455e-06, + "loss": 0.80027086, + "num_input_tokens_seen": 75532480, + "router_z_loss_clip": 3.171875, + "router_z_loss_mlp": 0.28051758, + "step": 3494, + "time_per_iteration": 2.6804728507995605 + }, + { + "auxiliary_loss_clip": 0.01544451, + "auxiliary_loss_mlp": 0.00245147, + "balance_loss_clip": 1.22331512, + "balance_loss_mlp": 0.21317488, + "epoch": 0.21013076807455283, + "flos": 32234315696640.0, + "grad_norm": 39.84490826542003, + "language_loss": 0.86445463, + "learning_rate": 3.6691740923392053e-06, + "loss": 0.88235068, + "num_input_tokens_seen": 75552745, + "router_z_loss_clip": 3.21289062, + "router_z_loss_mlp": 0.32006836, + "step": 3495, + "time_per_iteration": 2.79950213432312 + }, + { + "auxiliary_loss_clip": 0.0152981, + "auxiliary_loss_mlp": 0.00221033, + "balance_loss_clip": 1.21053052, + "balance_loss_mlp": 0.19322111, + "epoch": 0.2101908913272208, + "flos": 23696625957120.0, + "grad_norm": 24.408553154962004, + "language_loss": 0.83914042, + "learning_rate": 3.668959515566116e-06, + "loss": 0.8566488, + "num_input_tokens_seen": 75574355, + "router_z_loss_clip": 3.19140625, + "router_z_loss_mlp": 0.27807617, + "step": 3496, + "time_per_iteration": 2.6924169063568115 + }, + { + "auxiliary_loss_clip": 0.01535553, + "auxiliary_loss_mlp": 0.00231215, + "balance_loss_clip": 1.2144016, + "balance_loss_mlp": 0.20219994, + "epoch": 0.21025101457988876, + "flos": 20375786787840.0, + "grad_norm": 8.449118682189528, + "language_loss": 0.88033271, + "learning_rate": 3.668744875505915e-06, + "loss": 0.89800036, + "num_input_tokens_seen": 75592215, + "router_z_loss_clip": 3.20898438, + "router_z_loss_mlp": 0.28991699, + "step": 3497, + "time_per_iteration": 2.5901620388031006 + }, + { + "auxiliary_loss_clip": 0.01538673, + "auxiliary_loss_mlp": 0.00251983, + "balance_loss_clip": 1.21785069, + "balance_loss_mlp": 0.22189459, + "epoch": 0.21031113783255675, + "flos": 25775638174080.0, + "grad_norm": 11.388320156954588, + "language_loss": 0.75902945, + "learning_rate": 3.668530172166741e-06, + "loss": 0.77693605, + "num_input_tokens_seen": 75610740, + "router_z_loss_clip": 3.20703125, + "router_z_loss_mlp": 0.30078125, + "step": 3498, + "time_per_iteration": 2.648350238800049 + }, + { + "auxiliary_loss_clip": 0.01528122, + "auxiliary_loss_mlp": 0.00226798, + "balance_loss_clip": 1.20507419, + "balance_loss_mlp": 0.19639927, + "epoch": 0.21037126108522472, + "flos": 22018197191040.0, + "grad_norm": 4.54234971675551, + "language_loss": 0.89614248, + "learning_rate": 3.6683154055567352e-06, + "loss": 0.9136917, + "num_input_tokens_seen": 75631005, + "router_z_loss_clip": 3.23242188, + "router_z_loss_mlp": 0.30395508, + "step": 3499, + "time_per_iteration": 2.629744529724121 + }, + { + "auxiliary_loss_clip": 0.01556102, + "auxiliary_loss_mlp": 0.00215606, + "balance_loss_clip": 1.22362602, + "balance_loss_mlp": 0.18468267, + "epoch": 0.21043138433789269, + "flos": 25334403505920.0, + "grad_norm": 402.3850211550297, + "language_loss": 0.83963621, + "learning_rate": 3.668100575684043e-06, + "loss": 0.85735327, + "num_input_tokens_seen": 75650655, + "router_z_loss_clip": 3.3203125, + "router_z_loss_mlp": 0.30908203, + "step": 3500, + "time_per_iteration": 2.7254345417022705 + }, + { + "auxiliary_loss_clip": 0.01557779, + "auxiliary_loss_mlp": 0.00226362, + "balance_loss_clip": 1.22231936, + "balance_loss_mlp": 0.19570157, + "epoch": 0.21049150759056065, + "flos": 25556654908800.0, + "grad_norm": 354.4054396455617, + "language_loss": 0.80452132, + "learning_rate": 3.6678856825568094e-06, + "loss": 0.82236272, + "num_input_tokens_seen": 75669895, + "router_z_loss_clip": 3.35351562, + "router_z_loss_mlp": 0.30664062, + "step": 3501, + "time_per_iteration": 2.6668622493743896 + }, + { + "auxiliary_loss_clip": 0.01547741, + "auxiliary_loss_mlp": 0.0019994, + "balance_loss_clip": 1.22049856, + "balance_loss_mlp": 0.17144856, + "epoch": 0.21055163084322862, + "flos": 24495602129280.0, + "grad_norm": 3.3899259846031553, + "language_loss": 0.80962002, + "learning_rate": 3.667670726183183e-06, + "loss": 0.82709682, + "num_input_tokens_seen": 75689535, + "router_z_loss_clip": 3.2734375, + "router_z_loss_mlp": 0.28503418, + "step": 3502, + "time_per_iteration": 2.6850807666778564 + }, + { + "auxiliary_loss_clip": 0.01558205, + "auxiliary_loss_mlp": 0.00239932, + "balance_loss_clip": 1.2299875, + "balance_loss_mlp": 0.20996287, + "epoch": 0.21061175409589658, + "flos": 25739045193600.0, + "grad_norm": 38.60052175615511, + "language_loss": 0.82800674, + "learning_rate": 3.667455706571316e-06, + "loss": 0.84598804, + "num_input_tokens_seen": 75709265, + "router_z_loss_clip": 3.28320312, + "router_z_loss_mlp": 0.29980469, + "step": 3503, + "time_per_iteration": 2.647056818008423 + }, + { + "auxiliary_loss_clip": 0.01535799, + "auxiliary_loss_mlp": 0.00253065, + "balance_loss_clip": 1.2103188, + "balance_loss_mlp": 0.22078341, + "epoch": 0.21067187734856455, + "flos": 18989168112000.0, + "grad_norm": 18.134579816138285, + "language_loss": 0.88127548, + "learning_rate": 3.6672406237293617e-06, + "loss": 0.89916408, + "num_input_tokens_seen": 75727050, + "router_z_loss_clip": 3.2578125, + "router_z_loss_mlp": 0.32299805, + "step": 3504, + "time_per_iteration": 2.6129584312438965 + }, + { + "auxiliary_loss_clip": 0.01527364, + "auxiliary_loss_mlp": 0.00263775, + "balance_loss_clip": 1.20505333, + "balance_loss_mlp": 0.23266172, + "epoch": 0.21073200060123254, + "flos": 24681368292480.0, + "grad_norm": 7.343272605920992, + "language_loss": 0.83080912, + "learning_rate": 3.6670254776654754e-06, + "loss": 0.84872055, + "num_input_tokens_seen": 75747175, + "router_z_loss_clip": 3.22460938, + "router_z_loss_mlp": 0.31103516, + "step": 3505, + "time_per_iteration": 2.6567957401275635 + }, + { + "auxiliary_loss_clip": 0.01538722, + "auxiliary_loss_mlp": 0.0022108, + "balance_loss_clip": 1.21839213, + "balance_loss_mlp": 0.19350719, + "epoch": 0.2107921238539005, + "flos": 28549342402560.0, + "grad_norm": 140.83561714918048, + "language_loss": 0.69307148, + "learning_rate": 3.6668102683878163e-06, + "loss": 0.71066952, + "num_input_tokens_seen": 75767690, + "router_z_loss_clip": 3.203125, + "router_z_loss_mlp": 0.27575684, + "step": 3506, + "time_per_iteration": 2.6852028369903564 + }, + { + "auxiliary_loss_clip": 0.01538043, + "auxiliary_loss_mlp": 0.00245583, + "balance_loss_clip": 1.21938384, + "balance_loss_mlp": 0.21711631, + "epoch": 0.21085224710656847, + "flos": 25885848078720.0, + "grad_norm": 52.73467750653108, + "language_loss": 0.89146876, + "learning_rate": 3.6665949959045443e-06, + "loss": 0.90930498, + "num_input_tokens_seen": 75787255, + "router_z_loss_clip": 3.18945312, + "router_z_loss_mlp": 0.28466797, + "step": 3507, + "time_per_iteration": 2.66064715385437 + }, + { + "auxiliary_loss_clip": 0.01538427, + "auxiliary_loss_mlp": 0.00272655, + "balance_loss_clip": 1.21901286, + "balance_loss_mlp": 0.24075416, + "epoch": 0.21091237035923643, + "flos": 14976294537600.0, + "grad_norm": 10.175504902393257, + "language_loss": 0.82136476, + "learning_rate": 3.666379660223824e-06, + "loss": 0.83947563, + "num_input_tokens_seen": 75805890, + "router_z_loss_clip": 3.1953125, + "router_z_loss_mlp": 0.3190918, + "step": 3508, + "time_per_iteration": 2.6710569858551025 + }, + { + "auxiliary_loss_clip": 0.01528845, + "auxiliary_loss_mlp": 0.00259464, + "balance_loss_clip": 1.21056652, + "balance_loss_mlp": 0.22627652, + "epoch": 0.2109724936119044, + "flos": 16362518163840.0, + "grad_norm": 19.2577446774282, + "language_loss": 0.94984043, + "learning_rate": 3.6661642613538192e-06, + "loss": 0.96772349, + "num_input_tokens_seen": 75821620, + "router_z_loss_clip": 3.18554688, + "router_z_loss_mlp": 0.33190918, + "step": 3509, + "time_per_iteration": 2.6409966945648193 + }, + { + "auxiliary_loss_clip": 0.0149752, + "auxiliary_loss_mlp": 0.00230924, + "balance_loss_clip": 1.19069064, + "balance_loss_mlp": 0.20102611, + "epoch": 0.21103261686457236, + "flos": 31502492000640.0, + "grad_norm": 10.838864593052367, + "language_loss": 0.75497186, + "learning_rate": 3.6659487993026987e-06, + "loss": 0.77225626, + "num_input_tokens_seen": 75842490, + "router_z_loss_clip": 3.06640625, + "router_z_loss_mlp": 0.29882812, + "step": 3510, + "time_per_iteration": 2.7410929203033447 + }, + { + "auxiliary_loss_clip": 0.014979, + "auxiliary_loss_mlp": 0.00267964, + "balance_loss_clip": 1.192047, + "balance_loss_mlp": 0.23781613, + "epoch": 0.21109274011724033, + "flos": 27344072517120.0, + "grad_norm": 7.129628729414589, + "language_loss": 0.79466194, + "learning_rate": 3.6657332740786327e-06, + "loss": 0.81232059, + "num_input_tokens_seen": 75865985, + "router_z_loss_clip": 3.05664062, + "router_z_loss_mlp": 0.30163574, + "step": 3511, + "time_per_iteration": 2.7011308670043945 + }, + { + "auxiliary_loss_clip": 0.01492957, + "auxiliary_loss_mlp": 0.00282699, + "balance_loss_clip": 1.18841696, + "balance_loss_mlp": 0.24972582, + "epoch": 0.21115286336990832, + "flos": 17820383466240.0, + "grad_norm": 35.824160281195475, + "language_loss": 0.80935407, + "learning_rate": 3.665517685689794e-06, + "loss": 0.82711065, + "num_input_tokens_seen": 75882745, + "router_z_loss_clip": 3.04492188, + "router_z_loss_mlp": 0.32983398, + "step": 3512, + "time_per_iteration": 2.594832181930542 + }, + { + "auxiliary_loss_clip": 0.01500483, + "auxiliary_loss_mlp": 0.00288051, + "balance_loss_clip": 1.19742107, + "balance_loss_mlp": 0.25612646, + "epoch": 0.2112129866225763, + "flos": 27197987904000.0, + "grad_norm": 4.785938310653663, + "language_loss": 0.79294688, + "learning_rate": 3.6653020341443584e-06, + "loss": 0.81083226, + "num_input_tokens_seen": 75904305, + "router_z_loss_clip": 3.03125, + "router_z_loss_mlp": 0.31933594, + "step": 3513, + "time_per_iteration": 2.708507776260376 + }, + { + "auxiliary_loss_clip": 0.01526241, + "auxiliary_loss_mlp": 0.0029864, + "balance_loss_clip": 1.22208738, + "balance_loss_mlp": 0.26976743, + "epoch": 0.21127310987524425, + "flos": 23731279603200.0, + "grad_norm": 18.36364981717912, + "language_loss": 0.80223513, + "learning_rate": 3.665086319450502e-06, + "loss": 0.82048392, + "num_input_tokens_seen": 75923710, + "router_z_loss_clip": 3.04296875, + "router_z_loss_mlp": 0.28881836, + "step": 3514, + "time_per_iteration": 2.6958229541778564 + }, + { + "auxiliary_loss_clip": 0.01516106, + "auxiliary_loss_mlp": 0.00376062, + "balance_loss_clip": 1.21598077, + "balance_loss_mlp": 0.34485298, + "epoch": 0.21133323312791222, + "flos": 18332505624960.0, + "grad_norm": 96.25954307651718, + "language_loss": 0.84451127, + "learning_rate": 3.6648705416164062e-06, + "loss": 0.863433, + "num_input_tokens_seen": 75942625, + "router_z_loss_clip": 3.00390625, + "router_z_loss_mlp": 0.31176758, + "step": 3515, + "time_per_iteration": 2.7391021251678467 + }, + { + "auxiliary_loss_clip": 0.01519823, + "auxiliary_loss_mlp": 0.00406673, + "balance_loss_clip": 1.22169757, + "balance_loss_mlp": 0.37398604, + "epoch": 0.21139335638058018, + "flos": 17931203902080.0, + "grad_norm": 130.405471773741, + "language_loss": 0.76774585, + "learning_rate": 3.6646547006502518e-06, + "loss": 0.78701079, + "num_input_tokens_seen": 75959930, + "router_z_loss_clip": 2.984375, + "router_z_loss_mlp": 0.3269043, + "step": 3516, + "time_per_iteration": 2.641533613204956 + }, + { + "auxiliary_loss_clip": 0.01538098, + "auxiliary_loss_mlp": 0.00372015, + "balance_loss_clip": 1.23154581, + "balance_loss_mlp": 0.34049639, + "epoch": 0.21145347963324815, + "flos": 24572092141440.0, + "grad_norm": 9104.606487295456, + "language_loss": 0.91043049, + "learning_rate": 3.664438796560225e-06, + "loss": 0.92953157, + "num_input_tokens_seen": 75980335, + "router_z_loss_clip": 3.0625, + "router_z_loss_mlp": 0.31518555, + "step": 3517, + "time_per_iteration": 2.746525764465332 + }, + { + "auxiliary_loss_clip": 0.0154429, + "auxiliary_loss_mlp": 0.00415685, + "balance_loss_clip": 1.23841453, + "balance_loss_mlp": 0.38136476, + "epoch": 0.21151360288591614, + "flos": 35845959375360.0, + "grad_norm": 453.92933228073844, + "language_loss": 0.71328694, + "learning_rate": 3.664222829354512e-06, + "loss": 0.73288667, + "num_input_tokens_seen": 76002095, + "router_z_loss_clip": 3.0625, + "router_z_loss_mlp": 0.34338379, + "step": 3518, + "time_per_iteration": 2.752349376678467 + }, + { + "auxiliary_loss_clip": 0.01555475, + "auxiliary_loss_mlp": 0.00463259, + "balance_loss_clip": 1.24923182, + "balance_loss_mlp": 0.42928439, + "epoch": 0.2115737261385841, + "flos": 24641579001600.0, + "grad_norm": 6.511846865051982, + "language_loss": 0.95659697, + "learning_rate": 3.664006799041303e-06, + "loss": 0.97678429, + "num_input_tokens_seen": 76020425, + "router_z_loss_clip": 3.0625, + "router_z_loss_mlp": 0.33959961, + "step": 3519, + "time_per_iteration": 2.639662504196167 + }, + { + "auxiliary_loss_clip": 0.0156777, + "auxiliary_loss_mlp": 0.0047406, + "balance_loss_clip": 1.2578584, + "balance_loss_mlp": 0.43853587, + "epoch": 0.21163384939125207, + "flos": 25226887121280.0, + "grad_norm": 7.415547956023514, + "language_loss": 0.88037896, + "learning_rate": 3.6637907056287886e-06, + "loss": 0.90079725, + "num_input_tokens_seen": 76041210, + "router_z_loss_clip": 3.09570312, + "router_z_loss_mlp": 0.35546875, + "step": 3520, + "time_per_iteration": 2.699019432067871 + }, + { + "auxiliary_loss_clip": 0.0156633, + "auxiliary_loss_mlp": 0.00458917, + "balance_loss_clip": 1.25914204, + "balance_loss_mlp": 0.42701644, + "epoch": 0.21169397264392004, + "flos": 26067520091520.0, + "grad_norm": 3.8671903082137256, + "language_loss": 0.83037317, + "learning_rate": 3.6635745491251642e-06, + "loss": 0.85062557, + "num_input_tokens_seen": 76062685, + "router_z_loss_clip": 3.0703125, + "router_z_loss_mlp": 0.31933594, + "step": 3521, + "time_per_iteration": 4.166273593902588 + }, + { + "auxiliary_loss_clip": 0.01577077, + "auxiliary_loss_mlp": 0.00486296, + "balance_loss_clip": 1.2651813, + "balance_loss_mlp": 0.45377594, + "epoch": 0.211754095896588, + "flos": 23108265181440.0, + "grad_norm": 3.256129224582501, + "language_loss": 0.80915916, + "learning_rate": 3.663358329538626e-06, + "loss": 0.82979286, + "num_input_tokens_seen": 76082300, + "router_z_loss_clip": 3.11914062, + "router_z_loss_mlp": 0.32495117, + "step": 3522, + "time_per_iteration": 2.657609701156616 + }, + { + "auxiliary_loss_clip": 0.01568454, + "auxiliary_loss_mlp": 0.0051276, + "balance_loss_clip": 1.25672102, + "balance_loss_mlp": 0.47647256, + "epoch": 0.21181421914925597, + "flos": 27922341571200.0, + "grad_norm": 5.931239695254143, + "language_loss": 0.7791158, + "learning_rate": 3.663142046877374e-06, + "loss": 0.79992795, + "num_input_tokens_seen": 76101135, + "router_z_loss_clip": 3.11523438, + "router_z_loss_mlp": 0.36279297, + "step": 3523, + "time_per_iteration": 2.639361619949341 + }, + { + "auxiliary_loss_clip": 0.01584483, + "auxiliary_loss_mlp": 0.00490012, + "balance_loss_clip": 1.27363229, + "balance_loss_mlp": 0.45505947, + "epoch": 0.21187434240192393, + "flos": 17128636369920.0, + "grad_norm": 15.140942372848487, + "language_loss": 0.8680563, + "learning_rate": 3.6629257011496085e-06, + "loss": 0.88880128, + "num_input_tokens_seen": 76119320, + "router_z_loss_clip": 3.109375, + "router_z_loss_mlp": 0.34936523, + "step": 3524, + "time_per_iteration": 3.956183910369873 + }, + { + "auxiliary_loss_clip": 0.01573636, + "auxiliary_loss_mlp": 0.00530113, + "balance_loss_clip": 1.2591753, + "balance_loss_mlp": 0.49094087, + "epoch": 0.21193446565459192, + "flos": 22347318533760.0, + "grad_norm": 6.4569390805065074, + "language_loss": 0.87437773, + "learning_rate": 3.6627092923635338e-06, + "loss": 0.89541531, + "num_input_tokens_seen": 76137445, + "router_z_loss_clip": 3.14453125, + "router_z_loss_mlp": 0.3918457, + "step": 3525, + "time_per_iteration": 4.136544704437256 + }, + { + "auxiliary_loss_clip": 0.01585764, + "auxiliary_loss_mlp": 0.00531254, + "balance_loss_clip": 1.27133441, + "balance_loss_mlp": 0.49375117, + "epoch": 0.2119945889072599, + "flos": 27199316707200.0, + "grad_norm": 10.714436092308228, + "language_loss": 0.79705751, + "learning_rate": 3.662492820527356e-06, + "loss": 0.81822777, + "num_input_tokens_seen": 76159500, + "router_z_loss_clip": 3.14648438, + "router_z_loss_mlp": 0.375, + "step": 3526, + "time_per_iteration": 2.702420949935913 + }, + { + "auxiliary_loss_clip": 0.01581238, + "auxiliary_loss_mlp": 0.00574719, + "balance_loss_clip": 1.26688337, + "balance_loss_mlp": 0.53464133, + "epoch": 0.21205471215992786, + "flos": 20991869884800.0, + "grad_norm": 5.9199236975499785, + "language_loss": 0.81740439, + "learning_rate": 3.662276285649284e-06, + "loss": 0.83896399, + "num_input_tokens_seen": 76177990, + "router_z_loss_clip": 3.14453125, + "router_z_loss_mlp": 0.40063477, + "step": 3527, + "time_per_iteration": 2.633538007736206 + }, + { + "auxiliary_loss_clip": 0.01581897, + "auxiliary_loss_mlp": 0.00574969, + "balance_loss_clip": 1.2673223, + "balance_loss_mlp": 0.53582108, + "epoch": 0.21211483541259582, + "flos": 20777663128320.0, + "grad_norm": 111.26718897438622, + "language_loss": 0.83883709, + "learning_rate": 3.662059687737528e-06, + "loss": 0.86040574, + "num_input_tokens_seen": 76197125, + "router_z_loss_clip": 3.14453125, + "router_z_loss_mlp": 0.39160156, + "step": 3528, + "time_per_iteration": 2.6740565299987793 + }, + { + "auxiliary_loss_clip": 0.01578632, + "auxiliary_loss_mlp": 0.00534272, + "balance_loss_clip": 1.26310039, + "balance_loss_mlp": 0.49777019, + "epoch": 0.21217495866526379, + "flos": 18989994124800.0, + "grad_norm": 7.078812931316298, + "language_loss": 0.87107188, + "learning_rate": 3.6618430268003024e-06, + "loss": 0.89220095, + "num_input_tokens_seen": 76216215, + "router_z_loss_clip": 3.15429688, + "router_z_loss_mlp": 0.36523438, + "step": 3529, + "time_per_iteration": 4.106525182723999 + }, + { + "auxiliary_loss_clip": 0.01581274, + "auxiliary_loss_mlp": 0.00596017, + "balance_loss_clip": 1.26610243, + "balance_loss_mlp": 0.55384088, + "epoch": 0.21223508191793175, + "flos": 20667309569280.0, + "grad_norm": 5.2021284433646935, + "language_loss": 0.84080982, + "learning_rate": 3.6616263028458235e-06, + "loss": 0.86258268, + "num_input_tokens_seen": 76237010, + "router_z_loss_clip": 3.15625, + "router_z_loss_mlp": 0.421875, + "step": 3530, + "time_per_iteration": 2.6949081420898438 + }, + { + "auxiliary_loss_clip": 0.01578875, + "auxiliary_loss_mlp": 0.00608957, + "balance_loss_clip": 1.26700258, + "balance_loss_mlp": 0.5665189, + "epoch": 0.21229520517059972, + "flos": 21616464504960.0, + "grad_norm": 11.377219446846189, + "language_loss": 0.89893597, + "learning_rate": 3.661409515882308e-06, + "loss": 0.92081428, + "num_input_tokens_seen": 76255965, + "router_z_loss_clip": 3.1171875, + "router_z_loss_mlp": 0.42456055, + "step": 3531, + "time_per_iteration": 2.7014217376708984 + }, + { + "auxiliary_loss_clip": 0.01590062, + "auxiliary_loss_mlp": 0.0063157, + "balance_loss_clip": 1.27010214, + "balance_loss_mlp": 0.58607936, + "epoch": 0.2123553284232677, + "flos": 13991049411840.0, + "grad_norm": 20.188967616155526, + "language_loss": 0.83662504, + "learning_rate": 3.661192665917977e-06, + "loss": 0.85884136, + "num_input_tokens_seen": 76272150, + "router_z_loss_clip": 3.203125, + "router_z_loss_mlp": 0.45483398, + "step": 3532, + "time_per_iteration": 2.623870611190796 + }, + { + "auxiliary_loss_clip": 0.0159624, + "auxiliary_loss_mlp": 0.00596399, + "balance_loss_clip": 1.27719057, + "balance_loss_mlp": 0.5519343, + "epoch": 0.21241545167593567, + "flos": 18296774570880.0, + "grad_norm": 16.077896471226165, + "language_loss": 0.80648291, + "learning_rate": 3.660975752961054e-06, + "loss": 0.82840931, + "num_input_tokens_seen": 76291425, + "router_z_loss_clip": 3.19140625, + "router_z_loss_mlp": 0.44482422, + "step": 3533, + "time_per_iteration": 2.6571602821350098 + }, + { + "auxiliary_loss_clip": 0.01613153, + "auxiliary_loss_mlp": 0.00684336, + "balance_loss_clip": 1.29096937, + "balance_loss_mlp": 0.63553202, + "epoch": 0.21247557492860364, + "flos": 34713121265280.0, + "grad_norm": 26.05169960462859, + "language_loss": 0.77599633, + "learning_rate": 3.6607587770197634e-06, + "loss": 0.79897118, + "num_input_tokens_seen": 76313975, + "router_z_loss_clip": 3.22460938, + "router_z_loss_mlp": 0.48803711, + "step": 3534, + "time_per_iteration": 2.7503480911254883 + }, + { + "auxiliary_loss_clip": 0.01598569, + "auxiliary_loss_mlp": 0.0061579, + "balance_loss_clip": 1.28294849, + "balance_loss_mlp": 0.57060945, + "epoch": 0.2125356981812716, + "flos": 22053820504320.0, + "grad_norm": 3.462957302647503, + "language_loss": 0.79321229, + "learning_rate": 3.6605417381023346e-06, + "loss": 0.8153559, + "num_input_tokens_seen": 76330955, + "router_z_loss_clip": 3.15429688, + "router_z_loss_mlp": 0.45166016, + "step": 3535, + "time_per_iteration": 2.6188313961029053 + }, + { + "auxiliary_loss_clip": 0.01595813, + "auxiliary_loss_mlp": 0.00685888, + "balance_loss_clip": 1.27403021, + "balance_loss_mlp": 0.63620126, + "epoch": 0.21259582143393957, + "flos": 28548336821760.0, + "grad_norm": 19.126589326487302, + "language_loss": 0.75741333, + "learning_rate": 3.660324636216996e-06, + "loss": 0.78023034, + "num_input_tokens_seen": 76352680, + "router_z_loss_clip": 3.21875, + "router_z_loss_mlp": 0.49707031, + "step": 3536, + "time_per_iteration": 2.7014362812042236 + }, + { + "auxiliary_loss_clip": 0.01608657, + "auxiliary_loss_mlp": 0.00734843, + "balance_loss_clip": 1.27982819, + "balance_loss_mlp": 0.6812939, + "epoch": 0.21265594468660753, + "flos": 20120892900480.0, + "grad_norm": 15.160573951972586, + "language_loss": 0.92015445, + "learning_rate": 3.660107471371981e-06, + "loss": 0.94358939, + "num_input_tokens_seen": 76370750, + "router_z_loss_clip": 3.28515625, + "router_z_loss_mlp": 0.53564453, + "step": 3537, + "time_per_iteration": 2.679823637008667 + }, + { + "auxiliary_loss_clip": 0.01595316, + "auxiliary_loss_mlp": 0.00651314, + "balance_loss_clip": 1.27577579, + "balance_loss_mlp": 0.60653949, + "epoch": 0.21271606793927553, + "flos": 23076161400960.0, + "grad_norm": 132.46391691158146, + "language_loss": 0.86416423, + "learning_rate": 3.659890243575524e-06, + "loss": 0.88663054, + "num_input_tokens_seen": 76390610, + "router_z_loss_clip": 3.19335938, + "router_z_loss_mlp": 0.44775391, + "step": 3538, + "time_per_iteration": 2.68831467628479 + }, + { + "auxiliary_loss_clip": 0.01599395, + "auxiliary_loss_mlp": 0.00668754, + "balance_loss_clip": 1.27373612, + "balance_loss_mlp": 0.62250042, + "epoch": 0.2127761911919435, + "flos": 26388201738240.0, + "grad_norm": 103.5069674492121, + "language_loss": 0.92263675, + "learning_rate": 3.659672952835863e-06, + "loss": 0.94531822, + "num_input_tokens_seen": 76408860, + "router_z_loss_clip": 3.25585938, + "router_z_loss_mlp": 0.46264648, + "step": 3539, + "time_per_iteration": 2.6827430725097656 + }, + { + "auxiliary_loss_clip": 0.01603163, + "auxiliary_loss_mlp": 0.00611129, + "balance_loss_clip": 1.28095055, + "balance_loss_mlp": 0.56332618, + "epoch": 0.21283631444461146, + "flos": 20228265630720.0, + "grad_norm": 3.6687371724914826, + "language_loss": 0.64338869, + "learning_rate": 3.659455599161237e-06, + "loss": 0.66553164, + "num_input_tokens_seen": 76424980, + "router_z_loss_clip": 3.22265625, + "router_z_loss_mlp": 0.47802734, + "step": 3540, + "time_per_iteration": 2.75003981590271 + }, + { + "auxiliary_loss_clip": 0.0160631, + "auxiliary_loss_mlp": 0.00584412, + "balance_loss_clip": 1.28448749, + "balance_loss_mlp": 0.54204488, + "epoch": 0.21289643769727942, + "flos": 13516992691200.0, + "grad_norm": 8.276254849339201, + "language_loss": 0.84061462, + "learning_rate": 3.659238182559888e-06, + "loss": 0.86252183, + "num_input_tokens_seen": 76443135, + "router_z_loss_clip": 3.21875, + "router_z_loss_mlp": 0.42382812, + "step": 3541, + "time_per_iteration": 2.611401081085205 + }, + { + "auxiliary_loss_clip": 0.01611458, + "auxiliary_loss_mlp": 0.00633496, + "balance_loss_clip": 1.2889514, + "balance_loss_mlp": 0.58798158, + "epoch": 0.2129565609499474, + "flos": 24827021942400.0, + "grad_norm": 271.8847615453416, + "language_loss": 0.74647337, + "learning_rate": 3.6590207030400615e-06, + "loss": 0.76892292, + "num_input_tokens_seen": 76462470, + "router_z_loss_clip": 3.22265625, + "router_z_loss_mlp": 0.45556641, + "step": 3542, + "time_per_iteration": 2.7024431228637695 + }, + { + "auxiliary_loss_clip": 0.01615127, + "auxiliary_loss_mlp": 0.00618678, + "balance_loss_clip": 1.29320455, + "balance_loss_mlp": 0.57440352, + "epoch": 0.21301668420261535, + "flos": 23659242877440.0, + "grad_norm": 45.609277626604346, + "language_loss": 0.82277358, + "learning_rate": 3.658803160610004e-06, + "loss": 0.84511167, + "num_input_tokens_seen": 76481995, + "router_z_loss_clip": 3.22070312, + "router_z_loss_mlp": 0.44287109, + "step": 3543, + "time_per_iteration": 2.6531195640563965 + }, + { + "auxiliary_loss_clip": 0.01616016, + "auxiliary_loss_mlp": 0.00659962, + "balance_loss_clip": 1.2910974, + "balance_loss_mlp": 0.60901177, + "epoch": 0.21307680745528332, + "flos": 16362805472640.0, + "grad_norm": 34.83856056682472, + "language_loss": 0.73383784, + "learning_rate": 3.6585855552779634e-06, + "loss": 0.75659758, + "num_input_tokens_seen": 76500245, + "router_z_loss_clip": 3.24804688, + "router_z_loss_mlp": 0.50927734, + "step": 3544, + "time_per_iteration": 2.63032865524292 + }, + { + "auxiliary_loss_clip": 0.01608176, + "auxiliary_loss_mlp": 0.00682209, + "balance_loss_clip": 1.28621614, + "balance_loss_mlp": 0.63462031, + "epoch": 0.2131369307079513, + "flos": 19099054794240.0, + "grad_norm": 24.570260596662447, + "language_loss": 0.74626791, + "learning_rate": 3.6583678870521934e-06, + "loss": 0.76917171, + "num_input_tokens_seen": 76519535, + "router_z_loss_clip": 3.22460938, + "router_z_loss_mlp": 0.47558594, + "step": 3545, + "time_per_iteration": 2.612578868865967 + }, + { + "auxiliary_loss_clip": 0.01631618, + "auxiliary_loss_mlp": 0.00626375, + "balance_loss_clip": 1.3050437, + "balance_loss_mlp": 0.57921571, + "epoch": 0.21319705396061928, + "flos": 30372275583360.0, + "grad_norm": 7.185184840338022, + "language_loss": 0.77602899, + "learning_rate": 3.658150155940946e-06, + "loss": 0.7986089, + "num_input_tokens_seen": 76542065, + "router_z_loss_clip": 3.265625, + "router_z_loss_mlp": 0.47143555, + "step": 3546, + "time_per_iteration": 2.7468297481536865 + }, + { + "auxiliary_loss_clip": 0.01612409, + "auxiliary_loss_mlp": 0.00648089, + "balance_loss_clip": 1.28975141, + "balance_loss_mlp": 0.59988129, + "epoch": 0.21325717721328724, + "flos": 21756192410880.0, + "grad_norm": 2.079554332889995, + "language_loss": 0.85735279, + "learning_rate": 3.657932361952479e-06, + "loss": 0.8799578, + "num_input_tokens_seen": 76560540, + "router_z_loss_clip": 3.23242188, + "router_z_loss_mlp": 0.48193359, + "step": 3547, + "time_per_iteration": 2.731757640838623 + }, + { + "auxiliary_loss_clip": 0.01615674, + "auxiliary_loss_mlp": 0.00659372, + "balance_loss_clip": 1.28627634, + "balance_loss_mlp": 0.61314309, + "epoch": 0.2133173004659552, + "flos": 28730870760960.0, + "grad_norm": 10.680871888319698, + "language_loss": 0.83509248, + "learning_rate": 3.6577145050950504e-06, + "loss": 0.85784292, + "num_input_tokens_seen": 76581760, + "router_z_loss_clip": 3.296875, + "router_z_loss_mlp": 0.4621582, + "step": 3548, + "time_per_iteration": 2.8374104499816895 + }, + { + "auxiliary_loss_clip": 0.01597379, + "auxiliary_loss_mlp": 0.00599635, + "balance_loss_clip": 1.27274036, + "balance_loss_mlp": 0.55633843, + "epoch": 0.21337742371862317, + "flos": 16837077674880.0, + "grad_norm": 5.771211318013403, + "language_loss": 0.7896198, + "learning_rate": 3.657496585376922e-06, + "loss": 0.81158996, + "num_input_tokens_seen": 76599940, + "router_z_loss_clip": 3.24609375, + "router_z_loss_mlp": 0.43310547, + "step": 3549, + "time_per_iteration": 2.618839740753174 + }, + { + "auxiliary_loss_clip": 0.01600251, + "auxiliary_loss_mlp": 0.0062073, + "balance_loss_clip": 1.27925646, + "balance_loss_mlp": 0.57562095, + "epoch": 0.21343754697129114, + "flos": 24424930120320.0, + "grad_norm": 10.168011860006528, + "language_loss": 0.86470902, + "learning_rate": 3.657278602806357e-06, + "loss": 0.88691884, + "num_input_tokens_seen": 76619580, + "router_z_loss_clip": 3.2109375, + "router_z_loss_mlp": 0.45092773, + "step": 3550, + "time_per_iteration": 2.7149343490600586 + }, + { + "auxiliary_loss_clip": 0.01626127, + "auxiliary_loss_mlp": 0.00655214, + "balance_loss_clip": 1.299106, + "balance_loss_mlp": 0.61029577, + "epoch": 0.21349767022395913, + "flos": 19277817805440.0, + "grad_norm": 2.754757517747534, + "language_loss": 0.9227618, + "learning_rate": 3.657060557391621e-06, + "loss": 0.94557512, + "num_input_tokens_seen": 76638195, + "router_z_loss_clip": 3.26757812, + "router_z_loss_mlp": 0.44921875, + "step": 3551, + "time_per_iteration": 2.6197052001953125 + }, + { + "auxiliary_loss_clip": 0.01610374, + "auxiliary_loss_mlp": 0.00618287, + "balance_loss_clip": 1.28685582, + "balance_loss_mlp": 0.57561004, + "epoch": 0.2135577934766271, + "flos": 17347547808000.0, + "grad_norm": 164.5561652624733, + "language_loss": 0.88956332, + "learning_rate": 3.656842449140983e-06, + "loss": 0.91184998, + "num_input_tokens_seen": 76656695, + "router_z_loss_clip": 3.23632812, + "router_z_loss_mlp": 0.42700195, + "step": 3552, + "time_per_iteration": 2.6898481845855713 + }, + { + "auxiliary_loss_clip": 0.01615719, + "auxiliary_loss_mlp": 0.00641799, + "balance_loss_clip": 1.28691602, + "balance_loss_mlp": 0.5951643, + "epoch": 0.21361791672929506, + "flos": 24057204635520.0, + "grad_norm": 60.68013624817904, + "language_loss": 0.81090081, + "learning_rate": 3.656624278062713e-06, + "loss": 0.83347595, + "num_input_tokens_seen": 76677430, + "router_z_loss_clip": 3.28710938, + "router_z_loss_mlp": 0.46655273, + "step": 3553, + "time_per_iteration": 2.680987596511841 + }, + { + "auxiliary_loss_clip": 0.01621022, + "auxiliary_loss_mlp": 0.00636929, + "balance_loss_clip": 1.29349518, + "balance_loss_mlp": 0.59010327, + "epoch": 0.21367803998196302, + "flos": 22162306556160.0, + "grad_norm": 2.4852681813628412, + "language_loss": 0.7648114, + "learning_rate": 3.6564060441650843e-06, + "loss": 0.78739095, + "num_input_tokens_seen": 76697615, + "router_z_loss_clip": 3.27539062, + "router_z_loss_mlp": 0.46875, + "step": 3554, + "time_per_iteration": 2.7070655822753906 + }, + { + "auxiliary_loss_clip": 0.01615229, + "auxiliary_loss_mlp": 0.0058122, + "balance_loss_clip": 1.28855956, + "balance_loss_mlp": 0.53801858, + "epoch": 0.213738163234631, + "flos": 20886867452160.0, + "grad_norm": 181.47876599521854, + "language_loss": 0.73288745, + "learning_rate": 3.6561877474563724e-06, + "loss": 0.75485194, + "num_input_tokens_seen": 76715685, + "router_z_loss_clip": 3.26757812, + "router_z_loss_mlp": 0.43188477, + "step": 3555, + "time_per_iteration": 2.676180839538574 + }, + { + "auxiliary_loss_clip": 0.01611665, + "auxiliary_loss_mlp": 0.00583947, + "balance_loss_clip": 1.28573143, + "balance_loss_mlp": 0.54067409, + "epoch": 0.21379828648729896, + "flos": 28403114135040.0, + "grad_norm": 203.5868596792386, + "language_loss": 0.7100777, + "learning_rate": 3.6559693879448553e-06, + "loss": 0.73203385, + "num_input_tokens_seen": 76735405, + "router_z_loss_clip": 3.26171875, + "router_z_loss_mlp": 0.43286133, + "step": 3556, + "time_per_iteration": 2.7333500385284424 + }, + { + "auxiliary_loss_clip": 0.01620226, + "auxiliary_loss_mlp": 0.0055627, + "balance_loss_clip": 1.29113889, + "balance_loss_mlp": 0.51440394, + "epoch": 0.21385840973996692, + "flos": 25479662106240.0, + "grad_norm": 664.7459288170553, + "language_loss": 0.78061938, + "learning_rate": 3.6557509656388125e-06, + "loss": 0.80238432, + "num_input_tokens_seen": 76754395, + "router_z_loss_clip": 3.2890625, + "router_z_loss_mlp": 0.41894531, + "step": 3557, + "time_per_iteration": 2.696516275405884 + }, + { + "auxiliary_loss_clip": 0.01611461, + "auxiliary_loss_mlp": 0.0056113, + "balance_loss_clip": 1.28610384, + "balance_loss_mlp": 0.51711786, + "epoch": 0.2139185329926349, + "flos": 28074280101120.0, + "grad_norm": 47.40082189320505, + "language_loss": 0.75722682, + "learning_rate": 3.655532480546528e-06, + "loss": 0.77895272, + "num_input_tokens_seen": 76777210, + "router_z_loss_clip": 3.25585938, + "router_z_loss_mlp": 0.43994141, + "step": 3558, + "time_per_iteration": 2.7421951293945312 + }, + { + "auxiliary_loss_clip": 0.01604195, + "auxiliary_loss_mlp": 0.0063334, + "balance_loss_clip": 1.27351284, + "balance_loss_mlp": 0.58718228, + "epoch": 0.21397865624530288, + "flos": 19608698914560.0, + "grad_norm": 12.451226471490383, + "language_loss": 0.85294104, + "learning_rate": 3.655313932676286e-06, + "loss": 0.87531638, + "num_input_tokens_seen": 76795830, + "router_z_loss_clip": 3.3046875, + "router_z_loss_mlp": 0.46142578, + "step": 3559, + "time_per_iteration": 2.6936323642730713 + }, + { + "auxiliary_loss_clip": 0.01613819, + "auxiliary_loss_mlp": 0.00566056, + "balance_loss_clip": 1.28635943, + "balance_loss_mlp": 0.52678847, + "epoch": 0.21403877949797084, + "flos": 24681476033280.0, + "grad_norm": 5.457140000666865, + "language_loss": 0.73492026, + "learning_rate": 3.655095322036373e-06, + "loss": 0.75671899, + "num_input_tokens_seen": 76814700, + "router_z_loss_clip": 3.27539062, + "router_z_loss_mlp": 0.39282227, + "step": 3560, + "time_per_iteration": 2.8082358837127686 + }, + { + "auxiliary_loss_clip": 0.01611615, + "auxiliary_loss_mlp": 0.00527259, + "balance_loss_clip": 1.28539073, + "balance_loss_mlp": 0.48615605, + "epoch": 0.2140989027506388, + "flos": 19861150677120.0, + "grad_norm": 12.329174368508658, + "language_loss": 0.79341072, + "learning_rate": 3.65487664863508e-06, + "loss": 0.81479937, + "num_input_tokens_seen": 76833400, + "router_z_loss_clip": 3.2578125, + "router_z_loss_mlp": 0.41088867, + "step": 3561, + "time_per_iteration": 2.8042449951171875 + }, + { + "auxiliary_loss_clip": 0.01599742, + "auxiliary_loss_mlp": 0.00541028, + "balance_loss_clip": 1.27621269, + "balance_loss_mlp": 0.50052029, + "epoch": 0.21415902600330677, + "flos": 19135324552320.0, + "grad_norm": 25.146478973999574, + "language_loss": 0.85120016, + "learning_rate": 3.654657912480698e-06, + "loss": 0.87260783, + "num_input_tokens_seen": 76850645, + "router_z_loss_clip": 3.234375, + "router_z_loss_mlp": 0.40551758, + "step": 3562, + "time_per_iteration": 2.625723361968994 + }, + { + "auxiliary_loss_clip": 0.01609645, + "auxiliary_loss_mlp": 0.00498843, + "balance_loss_clip": 1.28910136, + "balance_loss_mlp": 0.46071953, + "epoch": 0.21421914925597474, + "flos": 22272624201600.0, + "grad_norm": 2.1470309161166465, + "language_loss": 0.88472712, + "learning_rate": 3.6544391135815237e-06, + "loss": 0.90581203, + "num_input_tokens_seen": 76870135, + "router_z_loss_clip": 3.20703125, + "router_z_loss_mlp": 0.38110352, + "step": 3563, + "time_per_iteration": 4.006052494049072 + }, + { + "auxiliary_loss_clip": 0.01609517, + "auxiliary_loss_mlp": 0.00486364, + "balance_loss_clip": 1.2903173, + "balance_loss_mlp": 0.44993401, + "epoch": 0.2142792725086427, + "flos": 33875109987840.0, + "grad_norm": 18.783197066617884, + "language_loss": 0.82118702, + "learning_rate": 3.6542202519458507e-06, + "loss": 0.8421458, + "num_input_tokens_seen": 76893905, + "router_z_loss_clip": 3.19335938, + "router_z_loss_mlp": 0.36425781, + "step": 3564, + "time_per_iteration": 2.721467971801758 + }, + { + "auxiliary_loss_clip": 0.01611963, + "auxiliary_loss_mlp": 0.00462994, + "balance_loss_clip": 1.29568875, + "balance_loss_mlp": 0.42599124, + "epoch": 0.2143393957613107, + "flos": 19860216923520.0, + "grad_norm": 18.116011642723535, + "language_loss": 0.93642807, + "learning_rate": 3.654001327581981e-06, + "loss": 0.95717764, + "num_input_tokens_seen": 76914205, + "router_z_loss_clip": 3.16015625, + "router_z_loss_mlp": 0.37011719, + "step": 3565, + "time_per_iteration": 2.6702754497528076 + }, + { + "auxiliary_loss_clip": 0.01743221, + "auxiliary_loss_mlp": 0.00225471, + "balance_loss_clip": 1.48064268, + "balance_loss_mlp": 0.21345477, + "epoch": 0.21439951901397866, + "flos": 68530093090560.0, + "grad_norm": 0.8274634318473537, + "language_loss": 0.52509737, + "learning_rate": 3.653782340498215e-06, + "loss": 0.54478431, + "num_input_tokens_seen": 76975650, + "router_z_loss_clip": 2.625, + "router_z_loss_mlp": 0.12011719, + "step": 3566, + "time_per_iteration": 4.469447612762451 + }, + { + "auxiliary_loss_clip": 0.01603152, + "auxiliary_loss_mlp": 0.00424292, + "balance_loss_clip": 1.29048538, + "balance_loss_mlp": 0.3909373, + "epoch": 0.21445964226664663, + "flos": 19682998197120.0, + "grad_norm": 30.57776267841068, + "language_loss": 0.72807431, + "learning_rate": 3.6535632907028566e-06, + "loss": 0.74834877, + "num_input_tokens_seen": 76992615, + "router_z_loss_clip": 3.125, + "router_z_loss_mlp": 0.33374023, + "step": 3567, + "time_per_iteration": 4.13170313835144 + }, + { + "auxiliary_loss_clip": 0.01617737, + "auxiliary_loss_mlp": 0.0044175, + "balance_loss_clip": 1.30540657, + "balance_loss_mlp": 0.40801346, + "epoch": 0.2145197655193146, + "flos": 31107259676160.0, + "grad_norm": 2.1115443391644275, + "language_loss": 0.79030573, + "learning_rate": 3.6533441782042126e-06, + "loss": 0.81090063, + "num_input_tokens_seen": 77017005, + "router_z_loss_clip": 3.12695312, + "router_z_loss_mlp": 0.3371582, + "step": 3568, + "time_per_iteration": 2.7216739654541016 + }, + { + "auxiliary_loss_clip": 0.01609355, + "auxiliary_loss_mlp": 0.00484986, + "balance_loss_clip": 1.29591012, + "balance_loss_mlp": 0.44812635, + "epoch": 0.21457988877198256, + "flos": 20120785159680.0, + "grad_norm": 7.244945462412931, + "language_loss": 0.83599573, + "learning_rate": 3.6531250030105917e-06, + "loss": 0.85693914, + "num_input_tokens_seen": 77034990, + "router_z_loss_clip": 3.13671875, + "router_z_loss_mlp": 0.36865234, + "step": 3569, + "time_per_iteration": 2.6145713329315186 + }, + { + "auxiliary_loss_clip": 0.01597671, + "auxiliary_loss_mlp": 0.00435858, + "balance_loss_clip": 1.28060377, + "balance_loss_mlp": 0.39935666, + "epoch": 0.21464001202465052, + "flos": 18588045957120.0, + "grad_norm": 3.9143759532378075, + "language_loss": 0.77891064, + "learning_rate": 3.6529057651303053e-06, + "loss": 0.79924595, + "num_input_tokens_seen": 77052610, + "router_z_loss_clip": 3.171875, + "router_z_loss_mlp": 0.36499023, + "step": 3570, + "time_per_iteration": 2.5947957038879395 + }, + { + "auxiliary_loss_clip": 0.01605448, + "auxiliary_loss_mlp": 0.00434065, + "balance_loss_clip": 1.28878891, + "balance_loss_mlp": 0.39854047, + "epoch": 0.21470013527731852, + "flos": 21835160461440.0, + "grad_norm": 11.880048098882025, + "language_loss": 0.88286018, + "learning_rate": 3.6526864645716666e-06, + "loss": 0.90325534, + "num_input_tokens_seen": 77072475, + "router_z_loss_clip": 3.16992188, + "router_z_loss_mlp": 0.35522461, + "step": 3571, + "time_per_iteration": 2.6347668170928955 + }, + { + "auxiliary_loss_clip": 0.01609458, + "auxiliary_loss_mlp": 0.00409241, + "balance_loss_clip": 1.29536915, + "balance_loss_mlp": 0.37333542, + "epoch": 0.21476025852998648, + "flos": 17603195880960.0, + "grad_norm": 11.20668068463244, + "language_loss": 0.90071762, + "learning_rate": 3.652467101342991e-06, + "loss": 0.92090464, + "num_input_tokens_seen": 77089930, + "router_z_loss_clip": 3.13867188, + "router_z_loss_mlp": 0.35888672, + "step": 3572, + "time_per_iteration": 4.1089417934417725 + }, + { + "auxiliary_loss_clip": 0.01622002, + "auxiliary_loss_mlp": 0.00390946, + "balance_loss_clip": 1.30426836, + "balance_loss_mlp": 0.35737681, + "epoch": 0.21482038178265445, + "flos": 24828135264000.0, + "grad_norm": 110.60516493026257, + "language_loss": 0.73189861, + "learning_rate": 3.652247675452598e-06, + "loss": 0.75202811, + "num_input_tokens_seen": 77108970, + "router_z_loss_clip": 3.17773438, + "router_z_loss_mlp": 0.33569336, + "step": 3573, + "time_per_iteration": 2.673861026763916 + }, + { + "auxiliary_loss_clip": 0.01638209, + "auxiliary_loss_mlp": 0.00388955, + "balance_loss_clip": 1.32073927, + "balance_loss_mlp": 0.35700718, + "epoch": 0.2148805050353224, + "flos": 23258228463360.0, + "grad_norm": 1.6114417897002828, + "language_loss": 0.81473088, + "learning_rate": 3.652028186908807e-06, + "loss": 0.83500254, + "num_input_tokens_seen": 77126045, + "router_z_loss_clip": 3.171875, + "router_z_loss_mlp": 0.31958008, + "step": 3574, + "time_per_iteration": 2.713569402694702 + }, + { + "auxiliary_loss_clip": 0.01615701, + "auxiliary_loss_mlp": 0.00432443, + "balance_loss_clip": 1.30130589, + "balance_loss_mlp": 0.3948679, + "epoch": 0.21494062828799038, + "flos": 21321098968320.0, + "grad_norm": 35.682494162341726, + "language_loss": 0.79578614, + "learning_rate": 3.6518086357199416e-06, + "loss": 0.81626749, + "num_input_tokens_seen": 77144600, + "router_z_loss_clip": 3.140625, + "router_z_loss_mlp": 0.37597656, + "step": 3575, + "time_per_iteration": 2.6348962783813477 + }, + { + "auxiliary_loss_clip": 0.01618482, + "auxiliary_loss_mlp": 0.00346297, + "balance_loss_clip": 1.31248116, + "balance_loss_mlp": 0.31644681, + "epoch": 0.21500075154065834, + "flos": 18843334894080.0, + "grad_norm": 4.70306139093771, + "language_loss": 0.77017117, + "learning_rate": 3.6515890218943277e-06, + "loss": 0.789819, + "num_input_tokens_seen": 77162965, + "router_z_loss_clip": 3.0625, + "router_z_loss_mlp": 0.29858398, + "step": 3576, + "time_per_iteration": 2.618419885635376 + }, + { + "auxiliary_loss_clip": 0.01628386, + "auxiliary_loss_mlp": 0.00357671, + "balance_loss_clip": 1.31679893, + "balance_loss_mlp": 0.32473364, + "epoch": 0.2150608747933263, + "flos": 18441997257600.0, + "grad_norm": 140.24231158390728, + "language_loss": 0.96199095, + "learning_rate": 3.651369345440292e-06, + "loss": 0.98185146, + "num_input_tokens_seen": 77179960, + "router_z_loss_clip": 3.1171875, + "router_z_loss_mlp": 0.32946777, + "step": 3577, + "time_per_iteration": 2.6062519550323486 + }, + { + "auxiliary_loss_clip": 0.01757007, + "auxiliary_loss_mlp": 0.00077283, + "balance_loss_clip": 1.49158573, + "balance_loss_mlp": 0.06741296, + "epoch": 0.2151209980459943, + "flos": 66598242894720.0, + "grad_norm": 0.7923399700887624, + "language_loss": 0.5619089, + "learning_rate": 3.6511496063661654e-06, + "loss": 0.58025175, + "num_input_tokens_seen": 77239500, + "router_z_loss_clip": 2.65625, + "router_z_loss_mlp": 0.09863281, + "step": 3578, + "time_per_iteration": 3.0878729820251465 + }, + { + "auxiliary_loss_clip": 0.01624158, + "auxiliary_loss_mlp": 0.00341301, + "balance_loss_clip": 1.31240582, + "balance_loss_mlp": 0.3105208, + "epoch": 0.21518112129866226, + "flos": 21575885114880.0, + "grad_norm": 2.5537150796915746, + "language_loss": 0.92943764, + "learning_rate": 3.6509298046802807e-06, + "loss": 0.94909221, + "num_input_tokens_seen": 77254680, + "router_z_loss_clip": 3.1171875, + "router_z_loss_mlp": 0.30786133, + "step": 3579, + "time_per_iteration": 2.6308815479278564 + }, + { + "auxiliary_loss_clip": 0.01611754, + "auxiliary_loss_mlp": 0.00359959, + "balance_loss_clip": 1.30226254, + "balance_loss_mlp": 0.32770085, + "epoch": 0.21524124455133023, + "flos": 20047635112320.0, + "grad_norm": 2.861230724015034, + "language_loss": 0.85691249, + "learning_rate": 3.650709940390972e-06, + "loss": 0.87662971, + "num_input_tokens_seen": 77274060, + "router_z_loss_clip": 3.09765625, + "router_z_loss_mlp": 0.32250977, + "step": 3580, + "time_per_iteration": 2.639620065689087 + }, + { + "auxiliary_loss_clip": 0.01619946, + "auxiliary_loss_mlp": 0.00307822, + "balance_loss_clip": 1.31015444, + "balance_loss_mlp": 0.27785242, + "epoch": 0.2153013678039982, + "flos": 23951807153280.0, + "grad_norm": 245.56357021240936, + "language_loss": 0.80181307, + "learning_rate": 3.6504900135065775e-06, + "loss": 0.82109076, + "num_input_tokens_seen": 77293255, + "router_z_loss_clip": 3.09960938, + "router_z_loss_mlp": 0.29956055, + "step": 3581, + "time_per_iteration": 2.6372814178466797 + }, + { + "auxiliary_loss_clip": 0.01612019, + "auxiliary_loss_mlp": 0.00364072, + "balance_loss_clip": 1.30388212, + "balance_loss_mlp": 0.33102736, + "epoch": 0.21536149105666616, + "flos": 20594841880320.0, + "grad_norm": 9.879129195384373, + "language_loss": 0.79593885, + "learning_rate": 3.6502700240354357e-06, + "loss": 0.81569976, + "num_input_tokens_seen": 77312390, + "router_z_loss_clip": 3.08007812, + "router_z_loss_mlp": 0.33032227, + "step": 3582, + "time_per_iteration": 2.6042470932006836 + }, + { + "auxiliary_loss_clip": 0.01629051, + "auxiliary_loss_mlp": 0.00336263, + "balance_loss_clip": 1.32163811, + "balance_loss_mlp": 0.30641347, + "epoch": 0.21542161430933413, + "flos": 12860042895360.0, + "grad_norm": 25.544822826624216, + "language_loss": 0.92148799, + "learning_rate": 3.650049971985889e-06, + "loss": 0.94114113, + "num_input_tokens_seen": 77330985, + "router_z_loss_clip": 3.07226562, + "router_z_loss_mlp": 0.29858398, + "step": 3583, + "time_per_iteration": 2.5743021965026855 + }, + { + "auxiliary_loss_clip": 0.01608495, + "auxiliary_loss_mlp": 0.00319018, + "balance_loss_clip": 1.3014946, + "balance_loss_mlp": 0.28752255, + "epoch": 0.21548173756200212, + "flos": 26103933504000.0, + "grad_norm": 50.85353086599087, + "language_loss": 0.9232893, + "learning_rate": 3.6498298573662824e-06, + "loss": 0.94256443, + "num_input_tokens_seen": 77350770, + "router_z_loss_clip": 3.06640625, + "router_z_loss_mlp": 0.31494141, + "step": 3584, + "time_per_iteration": 2.643791437149048 + }, + { + "auxiliary_loss_clip": 0.01632622, + "auxiliary_loss_mlp": 0.00336199, + "balance_loss_clip": 1.32591414, + "balance_loss_mlp": 0.30637279, + "epoch": 0.21554186081467008, + "flos": 22163779013760.0, + "grad_norm": 111.01140886041367, + "language_loss": 0.97755063, + "learning_rate": 3.6496096801849625e-06, + "loss": 0.99723887, + "num_input_tokens_seen": 77370510, + "router_z_loss_clip": 3.06640625, + "router_z_loss_mlp": 0.29833984, + "step": 3585, + "time_per_iteration": 2.6343488693237305 + }, + { + "auxiliary_loss_clip": 0.01607208, + "auxiliary_loss_mlp": 0.00321859, + "balance_loss_clip": 1.30417967, + "balance_loss_mlp": 0.29218817, + "epoch": 0.21560198406733805, + "flos": 22966741595520.0, + "grad_norm": 37.09758843206317, + "language_loss": 0.81669307, + "learning_rate": 3.649389440450277e-06, + "loss": 0.83598375, + "num_input_tokens_seen": 77390645, + "router_z_loss_clip": 3.02929688, + "router_z_loss_mlp": 0.29650879, + "step": 3586, + "time_per_iteration": 2.7761101722717285 + }, + { + "auxiliary_loss_clip": 0.01618581, + "auxiliary_loss_mlp": 0.00335566, + "balance_loss_clip": 1.3100487, + "balance_loss_mlp": 0.30512005, + "epoch": 0.215662107320006, + "flos": 22784064001920.0, + "grad_norm": 5.179206637889916, + "language_loss": 0.87364805, + "learning_rate": 3.6491691381705804e-06, + "loss": 0.89318955, + "num_input_tokens_seen": 77409655, + "router_z_loss_clip": 3.08789062, + "router_z_loss_mlp": 0.3046875, + "step": 3587, + "time_per_iteration": 2.7846498489379883 + }, + { + "auxiliary_loss_clip": 0.01590963, + "auxiliary_loss_mlp": 0.00322595, + "balance_loss_clip": 1.2886883, + "balance_loss_mlp": 0.29307848, + "epoch": 0.21572223057267398, + "flos": 30883859038080.0, + "grad_norm": 12.709707346691976, + "language_loss": 0.81413066, + "learning_rate": 3.648948773354224e-06, + "loss": 0.83326626, + "num_input_tokens_seen": 77430560, + "router_z_loss_clip": 3.02539062, + "router_z_loss_mlp": 0.29504395, + "step": 3588, + "time_per_iteration": 2.7178738117218018 + }, + { + "auxiliary_loss_clip": 0.0159708, + "auxiliary_loss_mlp": 0.00332506, + "balance_loss_clip": 1.29345131, + "balance_loss_mlp": 0.29996216, + "epoch": 0.21578235382534194, + "flos": 26910487445760.0, + "grad_norm": 2.069396974763748, + "language_loss": 0.87841856, + "learning_rate": 3.6487283460095643e-06, + "loss": 0.89771444, + "num_input_tokens_seen": 77455000, + "router_z_loss_clip": 3.0390625, + "router_z_loss_mlp": 0.32568359, + "step": 3589, + "time_per_iteration": 2.6888983249664307 + }, + { + "auxiliary_loss_clip": 0.01604977, + "auxiliary_loss_mlp": 0.00331383, + "balance_loss_clip": 1.30385602, + "balance_loss_mlp": 0.30320179, + "epoch": 0.2158424770780099, + "flos": 24425720219520.0, + "grad_norm": 30.299339912258606, + "language_loss": 0.81366181, + "learning_rate": 3.648507856144961e-06, + "loss": 0.83302546, + "num_input_tokens_seen": 77475075, + "router_z_loss_clip": 3.01171875, + "router_z_loss_mlp": 0.28173828, + "step": 3590, + "time_per_iteration": 2.673574447631836 + }, + { + "auxiliary_loss_clip": 0.01580707, + "auxiliary_loss_mlp": 0.00352271, + "balance_loss_clip": 1.27869153, + "balance_loss_mlp": 0.32156304, + "epoch": 0.2159026003306779, + "flos": 23949975559680.0, + "grad_norm": 31.024732360451562, + "language_loss": 0.91058779, + "learning_rate": 3.648287303768775e-06, + "loss": 0.92991751, + "num_input_tokens_seen": 77495945, + "router_z_loss_clip": 3.01953125, + "router_z_loss_mlp": 0.30712891, + "step": 3591, + "time_per_iteration": 2.6728835105895996 + }, + { + "auxiliary_loss_clip": 0.01597792, + "auxiliary_loss_mlp": 0.0030179, + "balance_loss_clip": 1.29531598, + "balance_loss_mlp": 0.27074832, + "epoch": 0.21596272358334587, + "flos": 30040963511040.0, + "grad_norm": 44.18946965478503, + "language_loss": 0.75404805, + "learning_rate": 3.6480666888893686e-06, + "loss": 0.77304387, + "num_input_tokens_seen": 77517140, + "router_z_loss_clip": 3.02734375, + "router_z_loss_mlp": 0.31079102, + "step": 3592, + "time_per_iteration": 2.6805336475372314 + }, + { + "auxiliary_loss_clip": 0.01584708, + "auxiliary_loss_mlp": 0.00333337, + "balance_loss_clip": 1.28172755, + "balance_loss_mlp": 0.30323684, + "epoch": 0.21602284683601383, + "flos": 20376217751040.0, + "grad_norm": 4.895406907456475, + "language_loss": 0.94500703, + "learning_rate": 3.647846011515108e-06, + "loss": 0.9641875, + "num_input_tokens_seen": 77536085, + "router_z_loss_clip": 3.03320312, + "router_z_loss_mlp": 0.30102539, + "step": 3593, + "time_per_iteration": 2.630737781524658 + }, + { + "auxiliary_loss_clip": 0.01582085, + "auxiliary_loss_mlp": 0.00319524, + "balance_loss_clip": 1.28340971, + "balance_loss_mlp": 0.28769544, + "epoch": 0.2160829700886818, + "flos": 20777339905920.0, + "grad_norm": 17.05854244068264, + "language_loss": 0.86483097, + "learning_rate": 3.6476252716543625e-06, + "loss": 0.88384706, + "num_input_tokens_seen": 77553675, + "router_z_loss_clip": 2.99023438, + "router_z_loss_mlp": 0.31835938, + "step": 3594, + "time_per_iteration": 2.7454886436462402 + }, + { + "auxiliary_loss_clip": 0.01600923, + "auxiliary_loss_mlp": 0.00315912, + "balance_loss_clip": 1.29909515, + "balance_loss_mlp": 0.28819564, + "epoch": 0.21614309334134976, + "flos": 22309755886080.0, + "grad_norm": 3.8498528696883967, + "language_loss": 0.86016357, + "learning_rate": 3.6474044693155007e-06, + "loss": 0.87933195, + "num_input_tokens_seen": 77573360, + "router_z_loss_clip": 3.01757812, + "router_z_loss_mlp": 0.27697754, + "step": 3595, + "time_per_iteration": 2.640489101409912 + }, + { + "auxiliary_loss_clip": 0.01602087, + "auxiliary_loss_mlp": 0.00343905, + "balance_loss_clip": 1.29777539, + "balance_loss_mlp": 0.31238642, + "epoch": 0.21620321659401773, + "flos": 19609524927360.0, + "grad_norm": 269.0310978174164, + "language_loss": 0.87357748, + "learning_rate": 3.647183604506897e-06, + "loss": 0.89303732, + "num_input_tokens_seen": 77591865, + "router_z_loss_clip": 3.04492188, + "router_z_loss_mlp": 0.31518555, + "step": 3596, + "time_per_iteration": 2.6259968280792236 + }, + { + "auxiliary_loss_clip": 0.01590429, + "auxiliary_loss_mlp": 0.00336159, + "balance_loss_clip": 1.29005289, + "balance_loss_mlp": 0.30579594, + "epoch": 0.2162633398466857, + "flos": 18844555956480.0, + "grad_norm": 1.9049586985351887, + "language_loss": 0.9040032, + "learning_rate": 3.6469626772369253e-06, + "loss": 0.92326909, + "num_input_tokens_seen": 77611600, + "router_z_loss_clip": 3.00390625, + "router_z_loss_mlp": 0.30358887, + "step": 3597, + "time_per_iteration": 2.6933882236480713 + }, + { + "auxiliary_loss_clip": 0.01584057, + "auxiliary_loss_mlp": 0.00351418, + "balance_loss_clip": 1.28141689, + "balance_loss_mlp": 0.32051891, + "epoch": 0.21632346309935369, + "flos": 18768820129920.0, + "grad_norm": 5.9334144670770215, + "language_loss": 0.86746895, + "learning_rate": 3.6467416875139642e-06, + "loss": 0.88682365, + "num_input_tokens_seen": 77630665, + "router_z_loss_clip": 3.0234375, + "router_z_loss_mlp": 0.30883789, + "step": 3598, + "time_per_iteration": 2.637389898300171 + }, + { + "auxiliary_loss_clip": 0.01559019, + "auxiliary_loss_mlp": 0.00338031, + "balance_loss_clip": 1.25699127, + "balance_loss_mlp": 0.30586845, + "epoch": 0.21638358635202165, + "flos": 26324173745280.0, + "grad_norm": 113.12075916965986, + "language_loss": 0.89655286, + "learning_rate": 3.6465206353463934e-06, + "loss": 0.91552335, + "num_input_tokens_seen": 77650835, + "router_z_loss_clip": 3.01367188, + "router_z_loss_mlp": 0.3215332, + "step": 3599, + "time_per_iteration": 2.731397867202759 + }, + { + "auxiliary_loss_clip": 0.01585792, + "auxiliary_loss_mlp": 0.00276876, + "balance_loss_clip": 1.28496027, + "balance_loss_mlp": 0.25007752, + "epoch": 0.21644370960468962, + "flos": 20740854666240.0, + "grad_norm": 7.911784792153688, + "language_loss": 0.83855212, + "learning_rate": 3.6462995207425947e-06, + "loss": 0.85717887, + "num_input_tokens_seen": 77669000, + "router_z_loss_clip": 3.0078125, + "router_z_loss_mlp": 0.26806641, + "step": 3600, + "time_per_iteration": 2.6961028575897217 + }, + { + "auxiliary_loss_clip": 0.01581557, + "auxiliary_loss_mlp": 0.00298126, + "balance_loss_clip": 1.28253198, + "balance_loss_mlp": 0.27273422, + "epoch": 0.21650383285735758, + "flos": 23952238116480.0, + "grad_norm": 2.3133056860684396, + "language_loss": 0.87551039, + "learning_rate": 3.6460783437109533e-06, + "loss": 0.89430726, + "num_input_tokens_seen": 77688745, + "router_z_loss_clip": 2.98632812, + "router_z_loss_mlp": 0.25390625, + "step": 3601, + "time_per_iteration": 2.6702797412872314 + }, + { + "auxiliary_loss_clip": 0.01572629, + "auxiliary_loss_mlp": 0.00334606, + "balance_loss_clip": 1.26482129, + "balance_loss_mlp": 0.30334905, + "epoch": 0.21656395611002555, + "flos": 23696087253120.0, + "grad_norm": 8.98951499991499, + "language_loss": 0.89949334, + "learning_rate": 3.6458571042598565e-06, + "loss": 0.91856575, + "num_input_tokens_seen": 77708445, + "router_z_loss_clip": 3.078125, + "router_z_loss_mlp": 0.3125, + "step": 3602, + "time_per_iteration": 2.72745418548584 + }, + { + "auxiliary_loss_clip": 0.01586391, + "auxiliary_loss_mlp": 0.00296696, + "balance_loss_clip": 1.28147399, + "balance_loss_mlp": 0.26805001, + "epoch": 0.2166240793626935, + "flos": 20666052593280.0, + "grad_norm": 6.130699066643353, + "language_loss": 0.81232178, + "learning_rate": 3.645635802397693e-06, + "loss": 0.83115268, + "num_input_tokens_seen": 77728465, + "router_z_loss_clip": 3.05078125, + "router_z_loss_mlp": 0.28625488, + "step": 3603, + "time_per_iteration": 2.6937286853790283 + }, + { + "auxiliary_loss_clip": 0.01592004, + "auxiliary_loss_mlp": 0.0029874, + "balance_loss_clip": 1.28992546, + "balance_loss_mlp": 0.27198949, + "epoch": 0.2166842026153615, + "flos": 21580410228480.0, + "grad_norm": 2.798811588946276, + "language_loss": 0.8123793, + "learning_rate": 3.645414438132855e-06, + "loss": 0.83128679, + "num_input_tokens_seen": 77746735, + "router_z_loss_clip": 3.02539062, + "router_z_loss_mlp": 0.26757812, + "step": 3604, + "time_per_iteration": 2.638385534286499 + }, + { + "auxiliary_loss_clip": 0.01584435, + "auxiliary_loss_mlp": 0.00318493, + "balance_loss_clip": 1.28471923, + "balance_loss_mlp": 0.29143226, + "epoch": 0.21674432586802947, + "flos": 25629948610560.0, + "grad_norm": 11.822759585450322, + "language_loss": 0.85824043, + "learning_rate": 3.6451930114737366e-06, + "loss": 0.87726974, + "num_input_tokens_seen": 77768105, + "router_z_loss_clip": 2.99609375, + "router_z_loss_mlp": 0.27075195, + "step": 3605, + "time_per_iteration": 4.071672439575195 + }, + { + "auxiliary_loss_clip": 0.01611744, + "auxiliary_loss_mlp": 0.00042082, + "balance_loss_clip": 1.37092054, + "balance_loss_mlp": 0.02548792, + "epoch": 0.21680444912069743, + "flos": 56417783616000.0, + "grad_norm": 0.7103392438815617, + "language_loss": 0.58334601, + "learning_rate": 3.6449715224287347e-06, + "loss": 0.59988427, + "num_input_tokens_seen": 77833750, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.16601562, + "step": 3606, + "time_per_iteration": 3.211568832397461 + }, + { + "auxiliary_loss_clip": 0.01557575, + "auxiliary_loss_mlp": 0.00370442, + "balance_loss_clip": 1.25399697, + "balance_loss_mlp": 0.34238037, + "epoch": 0.2168645723733654, + "flos": 23878944414720.0, + "grad_norm": 3.9003917202347, + "language_loss": 0.79156017, + "learning_rate": 3.644749971006248e-06, + "loss": 0.81084037, + "num_input_tokens_seen": 77853780, + "router_z_loss_clip": 3.03320312, + "router_z_loss_mlp": 0.28100586, + "step": 3607, + "time_per_iteration": 2.6477012634277344 + }, + { + "auxiliary_loss_clip": 0.01563207, + "auxiliary_loss_mlp": 0.00386717, + "balance_loss_clip": 1.26087523, + "balance_loss_mlp": 0.35605663, + "epoch": 0.21692469562603336, + "flos": 16946174257920.0, + "grad_norm": 4.005072169982187, + "language_loss": 0.83983433, + "learning_rate": 3.6445283572146765e-06, + "loss": 0.85933363, + "num_input_tokens_seen": 77872575, + "router_z_loss_clip": 3.02539062, + "router_z_loss_mlp": 0.30664062, + "step": 3608, + "time_per_iteration": 4.08153510093689 + }, + { + "auxiliary_loss_clip": 0.01558851, + "auxiliary_loss_mlp": 0.00383946, + "balance_loss_clip": 1.25543809, + "balance_loss_mlp": 0.3561821, + "epoch": 0.21698481887870133, + "flos": 25119047514240.0, + "grad_norm": 3.239181229483383, + "language_loss": 0.81626153, + "learning_rate": 3.6443066810624255e-06, + "loss": 0.83568949, + "num_input_tokens_seen": 77892700, + "router_z_loss_clip": 3.02929688, + "router_z_loss_mlp": 0.27746582, + "step": 3609, + "time_per_iteration": 2.6214609146118164 + }, + { + "auxiliary_loss_clip": 0.01556379, + "auxiliary_loss_mlp": 0.00322462, + "balance_loss_clip": 1.25432277, + "balance_loss_mlp": 0.29435202, + "epoch": 0.2170449421313693, + "flos": 17894682748800.0, + "grad_norm": 1327.7567291259452, + "language_loss": 0.94945294, + "learning_rate": 3.6440849425579e-06, + "loss": 0.96824139, + "num_input_tokens_seen": 77911060, + "router_z_loss_clip": 3.0234375, + "router_z_loss_mlp": 0.28100586, + "step": 3610, + "time_per_iteration": 4.094866514205933 + }, + { + "auxiliary_loss_clip": 0.01563675, + "auxiliary_loss_mlp": 0.00395896, + "balance_loss_clip": 1.25885713, + "balance_loss_mlp": 0.36744028, + "epoch": 0.2171050653840373, + "flos": 22638446265600.0, + "grad_norm": 55.431700528637236, + "language_loss": 0.8280986, + "learning_rate": 3.6438631417095095e-06, + "loss": 0.84769434, + "num_input_tokens_seen": 77929930, + "router_z_loss_clip": 3.05078125, + "router_z_loss_mlp": 0.2845459, + "step": 3611, + "time_per_iteration": 2.646890878677368 + }, + { + "auxiliary_loss_clip": 0.01553765, + "auxiliary_loss_mlp": 0.00365081, + "balance_loss_clip": 1.25075793, + "balance_loss_mlp": 0.33881894, + "epoch": 0.21716518863670525, + "flos": 19499997381120.0, + "grad_norm": 4.652950267164679, + "language_loss": 0.70067012, + "learning_rate": 3.6436412785256637e-06, + "loss": 0.71985853, + "num_input_tokens_seen": 77949060, + "router_z_loss_clip": 3.03125, + "router_z_loss_mlp": 0.26257324, + "step": 3612, + "time_per_iteration": 2.602277994155884 + }, + { + "auxiliary_loss_clip": 0.01536523, + "auxiliary_loss_mlp": 0.00403988, + "balance_loss_clip": 1.23250353, + "balance_loss_mlp": 0.37683168, + "epoch": 0.21722531188937322, + "flos": 19792022952960.0, + "grad_norm": 198.38230887964176, + "language_loss": 0.83389562, + "learning_rate": 3.643419353014776e-06, + "loss": 0.85330069, + "num_input_tokens_seen": 77967920, + "router_z_loss_clip": 3.04296875, + "router_z_loss_mlp": 0.27172852, + "step": 3613, + "time_per_iteration": 2.635981559753418 + }, + { + "auxiliary_loss_clip": 0.01534403, + "auxiliary_loss_mlp": 0.00358383, + "balance_loss_clip": 1.23171639, + "balance_loss_mlp": 0.33400464, + "epoch": 0.21728543514204118, + "flos": 13334386924800.0, + "grad_norm": 282.2804790611201, + "language_loss": 0.805839, + "learning_rate": 3.643197365185261e-06, + "loss": 0.82476687, + "num_input_tokens_seen": 77985330, + "router_z_loss_clip": 3.02734375, + "router_z_loss_mlp": 0.24389648, + "step": 3614, + "time_per_iteration": 4.127549886703491 + }, + { + "auxiliary_loss_clip": 0.01510351, + "auxiliary_loss_mlp": 0.00364931, + "balance_loss_clip": 1.2087276, + "balance_loss_mlp": 0.33760819, + "epoch": 0.21734555839470915, + "flos": 15231870783360.0, + "grad_norm": 6.183239903385005, + "language_loss": 0.7923674, + "learning_rate": 3.6429753150455378e-06, + "loss": 0.81112027, + "num_input_tokens_seen": 78003105, + "router_z_loss_clip": 3.01757812, + "router_z_loss_mlp": 0.27319336, + "step": 3615, + "time_per_iteration": 2.6451001167297363 + }, + { + "auxiliary_loss_clip": 0.01526353, + "auxiliary_loss_mlp": 0.00367161, + "balance_loss_clip": 1.22114015, + "balance_loss_mlp": 0.34000546, + "epoch": 0.2174056816473771, + "flos": 19973982274560.0, + "grad_norm": 3.306767416996963, + "language_loss": 0.98737121, + "learning_rate": 3.6427532026040263e-06, + "loss": 1.00630641, + "num_input_tokens_seen": 78019655, + "router_z_loss_clip": 3.04882812, + "router_z_loss_mlp": 0.27148438, + "step": 3616, + "time_per_iteration": 2.662276268005371 + }, + { + "auxiliary_loss_clip": 0.01500849, + "auxiliary_loss_mlp": 0.00322974, + "balance_loss_clip": 1.19858265, + "balance_loss_mlp": 0.29732016, + "epoch": 0.21746580490004508, + "flos": 16687293960960.0, + "grad_norm": 53.219548055313005, + "language_loss": 0.90027666, + "learning_rate": 3.642531027869148e-06, + "loss": 0.91851485, + "num_input_tokens_seen": 78036025, + "router_z_loss_clip": 3.02148438, + "router_z_loss_mlp": 0.2565918, + "step": 3617, + "time_per_iteration": 2.640181303024292 + }, + { + "auxiliary_loss_clip": 0.01514932, + "auxiliary_loss_mlp": 0.00324952, + "balance_loss_clip": 1.20930672, + "balance_loss_mlp": 0.29811791, + "epoch": 0.21752592815271307, + "flos": 25772298209280.0, + "grad_norm": 122.80807014486898, + "language_loss": 0.82387662, + "learning_rate": 3.642308790849329e-06, + "loss": 0.84227544, + "num_input_tokens_seen": 78055645, + "router_z_loss_clip": 3.0546875, + "router_z_loss_mlp": 0.26843262, + "step": 3618, + "time_per_iteration": 2.7579987049102783 + }, + { + "auxiliary_loss_clip": 0.01511759, + "auxiliary_loss_mlp": 0.00351525, + "balance_loss_clip": 1.20727944, + "balance_loss_mlp": 0.32491758, + "epoch": 0.21758605140538104, + "flos": 11254692349440.0, + "grad_norm": 8.121300650501102, + "language_loss": 0.79000998, + "learning_rate": 3.642086491552996e-06, + "loss": 0.80864286, + "num_input_tokens_seen": 78071660, + "router_z_loss_clip": 3.04296875, + "router_z_loss_mlp": 0.26623535, + "step": 3619, + "time_per_iteration": 2.757826328277588 + }, + { + "auxiliary_loss_clip": 0.01494259, + "auxiliary_loss_mlp": 0.00337156, + "balance_loss_clip": 1.18929112, + "balance_loss_mlp": 0.31318319, + "epoch": 0.217646174658049, + "flos": 19242625455360.0, + "grad_norm": 2.0717208756951093, + "language_loss": 0.83535737, + "learning_rate": 3.641864129988579e-06, + "loss": 0.85367155, + "num_input_tokens_seen": 78091265, + "router_z_loss_clip": 3.05078125, + "router_z_loss_mlp": 0.23962402, + "step": 3620, + "time_per_iteration": 2.673243522644043 + }, + { + "auxiliary_loss_clip": 0.01492603, + "auxiliary_loss_mlp": 0.00311423, + "balance_loss_clip": 1.19221616, + "balance_loss_mlp": 0.28678209, + "epoch": 0.21770629791071697, + "flos": 21945083057280.0, + "grad_norm": 7.199974518599563, + "language_loss": 0.8570466, + "learning_rate": 3.641641706164509e-06, + "loss": 0.87508678, + "num_input_tokens_seen": 78110095, + "router_z_loss_clip": 3.0078125, + "router_z_loss_mlp": 0.24658203, + "step": 3621, + "time_per_iteration": 2.6308305263519287 + }, + { + "auxiliary_loss_clip": 0.01474822, + "auxiliary_loss_mlp": 0.00298996, + "balance_loss_clip": 1.17260945, + "balance_loss_mlp": 0.27336639, + "epoch": 0.21776642116338493, + "flos": 24936764970240.0, + "grad_norm": 15.797736676534301, + "language_loss": 0.94199121, + "learning_rate": 3.641419220089221e-06, + "loss": 0.95972937, + "num_input_tokens_seen": 78129475, + "router_z_loss_clip": 3.0234375, + "router_z_loss_mlp": 0.25598145, + "step": 3622, + "time_per_iteration": 2.6791775226593018 + }, + { + "auxiliary_loss_clip": 0.01475598, + "auxiliary_loss_mlp": 0.00371809, + "balance_loss_clip": 1.17092824, + "balance_loss_mlp": 0.34268624, + "epoch": 0.2178265444160529, + "flos": 17821317219840.0, + "grad_norm": 20.797157275913374, + "language_loss": 0.85582066, + "learning_rate": 3.641196671771152e-06, + "loss": 0.87429476, + "num_input_tokens_seen": 78146880, + "router_z_loss_clip": 3.05078125, + "router_z_loss_mlp": 0.29125977, + "step": 3623, + "time_per_iteration": 2.6242311000823975 + }, + { + "auxiliary_loss_clip": 0.01473738, + "auxiliary_loss_mlp": 0.00282092, + "balance_loss_clip": 1.16955161, + "balance_loss_mlp": 0.25654545, + "epoch": 0.2178866676687209, + "flos": 17712902995200.0, + "grad_norm": 10.534770842798665, + "language_loss": 0.94053268, + "learning_rate": 3.640974061218741e-06, + "loss": 0.95809102, + "num_input_tokens_seen": 78165065, + "router_z_loss_clip": 3.04101562, + "router_z_loss_mlp": 0.25561523, + "step": 3624, + "time_per_iteration": 2.6518168449401855 + }, + { + "auxiliary_loss_clip": 0.0148315, + "auxiliary_loss_mlp": 0.00332319, + "balance_loss_clip": 1.1710602, + "balance_loss_mlp": 0.30275527, + "epoch": 0.21794679092138886, + "flos": 16945851035520.0, + "grad_norm": 8.321961132450257, + "language_loss": 0.88432467, + "learning_rate": 3.640751388440429e-06, + "loss": 0.90247935, + "num_input_tokens_seen": 78180005, + "router_z_loss_clip": 3.12109375, + "router_z_loss_mlp": 0.29577637, + "step": 3625, + "time_per_iteration": 2.595280885696411 + }, + { + "auxiliary_loss_clip": 0.01454073, + "auxiliary_loss_mlp": 0.00060412, + "balance_loss_clip": 1.20475459, + "balance_loss_mlp": 0.05354583, + "epoch": 0.21800691417405682, + "flos": 63718566566400.0, + "grad_norm": 0.8039210779528595, + "language_loss": 0.60797095, + "learning_rate": 3.64052865344466e-06, + "loss": 0.62311578, + "num_input_tokens_seen": 78245350, + "router_z_loss_clip": 2.5, + "router_z_loss_mlp": 0.06884766, + "step": 3626, + "time_per_iteration": 3.2164504528045654 + }, + { + "auxiliary_loss_clip": 0.01476105, + "auxiliary_loss_mlp": 0.00319917, + "balance_loss_clip": 1.15810919, + "balance_loss_mlp": 0.29118714, + "epoch": 0.21806703742672479, + "flos": 21616392677760.0, + "grad_norm": 80.05716598415836, + "language_loss": 0.97005838, + "learning_rate": 3.6403058562398795e-06, + "loss": 0.98801857, + "num_input_tokens_seen": 78264165, + "router_z_loss_clip": 3.17773438, + "router_z_loss_mlp": 0.28710938, + "step": 3627, + "time_per_iteration": 2.6412737369537354 + }, + { + "auxiliary_loss_clip": 0.01474004, + "auxiliary_loss_mlp": 0.00279736, + "balance_loss_clip": 1.15360701, + "balance_loss_mlp": 0.25335547, + "epoch": 0.21812716067939275, + "flos": 19354882435200.0, + "grad_norm": 5.984024173833941, + "language_loss": 0.80515563, + "learning_rate": 3.6400829968345365e-06, + "loss": 0.82269305, + "num_input_tokens_seen": 78283745, + "router_z_loss_clip": 3.20117188, + "router_z_loss_mlp": 0.26403809, + "step": 3628, + "time_per_iteration": 2.6414167881011963 + }, + { + "auxiliary_loss_clip": 0.01470077, + "auxiliary_loss_mlp": 0.00284279, + "balance_loss_clip": 1.14756417, + "balance_loss_mlp": 0.25676596, + "epoch": 0.21818728393206072, + "flos": 23548063305600.0, + "grad_norm": 8.00374301430814, + "language_loss": 0.85298753, + "learning_rate": 3.6398600752370826e-06, + "loss": 0.87053108, + "num_input_tokens_seen": 78302900, + "router_z_loss_clip": 3.2265625, + "router_z_loss_mlp": 0.27502441, + "step": 3629, + "time_per_iteration": 2.7501423358917236 + }, + { + "auxiliary_loss_clip": 0.01476061, + "auxiliary_loss_mlp": 0.00261044, + "balance_loss_clip": 1.15055919, + "balance_loss_mlp": 0.23530687, + "epoch": 0.21824740718472868, + "flos": 30225652266240.0, + "grad_norm": 179.89535502976875, + "language_loss": 0.79891992, + "learning_rate": 3.63963709145597e-06, + "loss": 0.81629092, + "num_input_tokens_seen": 78326470, + "router_z_loss_clip": 3.25390625, + "router_z_loss_mlp": 0.25732422, + "step": 3630, + "time_per_iteration": 2.7024085521698 + }, + { + "auxiliary_loss_clip": 0.01466415, + "auxiliary_loss_mlp": 0.0026295, + "balance_loss_clip": 1.13904953, + "balance_loss_mlp": 0.23530491, + "epoch": 0.21830753043739667, + "flos": 26134672567680.0, + "grad_norm": 3.148414843381209, + "language_loss": 0.84773189, + "learning_rate": 3.6394140454996544e-06, + "loss": 0.86502552, + "num_input_tokens_seen": 78345810, + "router_z_loss_clip": 3.2734375, + "router_z_loss_mlp": 0.27648926, + "step": 3631, + "time_per_iteration": 2.66243839263916 + }, + { + "auxiliary_loss_clip": 0.01463087, + "auxiliary_loss_mlp": 0.00271391, + "balance_loss_clip": 1.13489938, + "balance_loss_mlp": 0.24100481, + "epoch": 0.21836765369006464, + "flos": 21720712752000.0, + "grad_norm": 3.128521113035861, + "language_loss": 0.8404448, + "learning_rate": 3.639190937376594e-06, + "loss": 0.85778958, + "num_input_tokens_seen": 78364085, + "router_z_loss_clip": 3.28320312, + "router_z_loss_mlp": 0.30383301, + "step": 3632, + "time_per_iteration": 2.62469744682312 + }, + { + "auxiliary_loss_clip": 0.01483347, + "auxiliary_loss_mlp": 0.00259056, + "balance_loss_clip": 1.15066552, + "balance_loss_mlp": 0.23057702, + "epoch": 0.2184277769427326, + "flos": 19937604775680.0, + "grad_norm": 16.15847755517661, + "language_loss": 0.91857946, + "learning_rate": 3.638967767095249e-06, + "loss": 0.93600345, + "num_input_tokens_seen": 78381385, + "router_z_loss_clip": 3.328125, + "router_z_loss_mlp": 0.28479004, + "step": 3633, + "time_per_iteration": 2.845135450363159 + }, + { + "auxiliary_loss_clip": 0.01473281, + "auxiliary_loss_mlp": 0.00283533, + "balance_loss_clip": 1.14247918, + "balance_loss_mlp": 0.25538787, + "epoch": 0.21848790019540057, + "flos": 20340235301760.0, + "grad_norm": 11.467027853750524, + "language_loss": 0.86116087, + "learning_rate": 3.6387445346640823e-06, + "loss": 0.87872899, + "num_input_tokens_seen": 78400500, + "router_z_loss_clip": 3.30859375, + "router_z_loss_mlp": 0.28161621, + "step": 3634, + "time_per_iteration": 2.664198398590088 + }, + { + "auxiliary_loss_clip": 0.0146853, + "auxiliary_loss_mlp": 0.00267856, + "balance_loss_clip": 1.13478589, + "balance_loss_mlp": 0.24027067, + "epoch": 0.21854802344806853, + "flos": 15450818135040.0, + "grad_norm": 2.5295402711664545, + "language_loss": 0.81908536, + "learning_rate": 3.638521240091558e-06, + "loss": 0.83644921, + "num_input_tokens_seen": 78418340, + "router_z_loss_clip": 3.33398438, + "router_z_loss_mlp": 0.27612305, + "step": 3635, + "time_per_iteration": 2.683044672012329 + }, + { + "auxiliary_loss_clip": 0.01466096, + "auxiliary_loss_mlp": 0.00249068, + "balance_loss_clip": 1.13058686, + "balance_loss_mlp": 0.22128071, + "epoch": 0.2186081467007365, + "flos": 16320717711360.0, + "grad_norm": 3.076503544580779, + "language_loss": 0.95559514, + "learning_rate": 3.6382978833861445e-06, + "loss": 0.97274679, + "num_input_tokens_seen": 78434375, + "router_z_loss_clip": 3.35742188, + "router_z_loss_mlp": 0.27758789, + "step": 3636, + "time_per_iteration": 2.614664077758789 + }, + { + "auxiliary_loss_clip": 0.01459183, + "auxiliary_loss_mlp": 0.0026344, + "balance_loss_clip": 1.12749934, + "balance_loss_mlp": 0.23497277, + "epoch": 0.2186682699534045, + "flos": 21689255416320.0, + "grad_norm": 4.253278726280932, + "language_loss": 0.83852023, + "learning_rate": 3.638074464556311e-06, + "loss": 0.85574651, + "num_input_tokens_seen": 78451735, + "router_z_loss_clip": 3.31835938, + "router_z_loss_mlp": 0.2845459, + "step": 3637, + "time_per_iteration": 2.6767051219940186 + }, + { + "auxiliary_loss_clip": 0.01485066, + "auxiliary_loss_mlp": 0.00307206, + "balance_loss_clip": 1.14823139, + "balance_loss_mlp": 0.27382761, + "epoch": 0.21872839320607246, + "flos": 17739260599680.0, + "grad_norm": 85.31997841805858, + "language_loss": 0.99180686, + "learning_rate": 3.63785098361053e-06, + "loss": 1.00972962, + "num_input_tokens_seen": 78462730, + "router_z_loss_clip": 3.36523438, + "router_z_loss_mlp": 0.33374023, + "step": 3638, + "time_per_iteration": 2.610776901245117 + }, + { + "auxiliary_loss_clip": 0.0147862, + "auxiliary_loss_mlp": 0.0028786, + "balance_loss_clip": 1.14642251, + "balance_loss_mlp": 0.25777155, + "epoch": 0.21878851645874042, + "flos": 18652289431680.0, + "grad_norm": 2.7578090612849078, + "language_loss": 0.99263585, + "learning_rate": 3.637627440557275e-06, + "loss": 1.01030064, + "num_input_tokens_seen": 78476300, + "router_z_loss_clip": 3.3203125, + "router_z_loss_mlp": 0.30065918, + "step": 3639, + "time_per_iteration": 2.6057791709899902 + }, + { + "auxiliary_loss_clip": 0.01495865, + "auxiliary_loss_mlp": 0.00265454, + "balance_loss_clip": 1.15387738, + "balance_loss_mlp": 0.23727328, + "epoch": 0.2188486397114084, + "flos": 25557301353600.0, + "grad_norm": 10.085462622413498, + "language_loss": 0.86392325, + "learning_rate": 3.637403835405024e-06, + "loss": 0.88153642, + "num_input_tokens_seen": 78496135, + "router_z_loss_clip": 3.421875, + "router_z_loss_mlp": 0.28173828, + "step": 3640, + "time_per_iteration": 2.7305824756622314 + }, + { + "auxiliary_loss_clip": 0.01496772, + "auxiliary_loss_mlp": 0.00353767, + "balance_loss_clip": 1.15911663, + "balance_loss_mlp": 0.31893423, + "epoch": 0.21890876296407635, + "flos": 17892061056000.0, + "grad_norm": 2532.3504278844507, + "language_loss": 0.79868865, + "learning_rate": 3.637180168162255e-06, + "loss": 0.81719404, + "num_input_tokens_seen": 78513855, + "router_z_loss_clip": 3.37890625, + "router_z_loss_mlp": 0.34814453, + "step": 3641, + "time_per_iteration": 2.6700491905212402 + }, + { + "auxiliary_loss_clip": 0.01525157, + "auxiliary_loss_mlp": 0.00322348, + "balance_loss_clip": 1.17823148, + "balance_loss_mlp": 0.29013729, + "epoch": 0.21896888621674432, + "flos": 17749100926080.0, + "grad_norm": 12.461890884511147, + "language_loss": 0.87278593, + "learning_rate": 3.63695643883745e-06, + "loss": 0.89126098, + "num_input_tokens_seen": 78531740, + "router_z_loss_clip": 3.46679688, + "router_z_loss_mlp": 0.32226562, + "step": 3642, + "time_per_iteration": 2.67175555229187 + }, + { + "auxiliary_loss_clip": 0.01496852, + "auxiliary_loss_mlp": 0.00301082, + "balance_loss_clip": 1.15004599, + "balance_loss_mlp": 0.26880044, + "epoch": 0.21902900946941228, + "flos": 23076161400960.0, + "grad_norm": 7.5319830973416, + "language_loss": 0.79980826, + "learning_rate": 3.6367326474390928e-06, + "loss": 0.81778765, + "num_input_tokens_seen": 78549600, + "router_z_loss_clip": 3.46875, + "router_z_loss_mlp": 0.32299805, + "step": 3643, + "time_per_iteration": 2.645763397216797 + }, + { + "auxiliary_loss_clip": 0.01522684, + "auxiliary_loss_mlp": 0.00308077, + "balance_loss_clip": 1.16307104, + "balance_loss_mlp": 0.27722591, + "epoch": 0.21908913272208028, + "flos": 48178545004800.0, + "grad_norm": 15.28325632444656, + "language_loss": 0.75391221, + "learning_rate": 3.6365087939756696e-06, + "loss": 0.77221978, + "num_input_tokens_seen": 78573350, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 0.30834961, + "step": 3644, + "time_per_iteration": 2.8755621910095215 + }, + { + "auxiliary_loss_clip": 0.01530973, + "auxiliary_loss_mlp": 0.00317677, + "balance_loss_clip": 1.16815829, + "balance_loss_mlp": 0.28472719, + "epoch": 0.21914925597474824, + "flos": 22236749493120.0, + "grad_norm": 186.38821355849947, + "language_loss": 0.88407528, + "learning_rate": 3.636284878455669e-06, + "loss": 0.90256178, + "num_input_tokens_seen": 78591005, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 0.32958984, + "step": 3645, + "time_per_iteration": 2.639883518218994 + }, + { + "auxiliary_loss_clip": 0.01528067, + "auxiliary_loss_mlp": 0.00349339, + "balance_loss_clip": 1.17261732, + "balance_loss_mlp": 0.31977522, + "epoch": 0.2192093792274162, + "flos": 22125605834880.0, + "grad_norm": 3.1650002445162437, + "language_loss": 0.88617051, + "learning_rate": 3.636060900887582e-06, + "loss": 0.9049446, + "num_input_tokens_seen": 78610645, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 0.29541016, + "step": 3646, + "time_per_iteration": 2.676959276199341 + }, + { + "auxiliary_loss_clip": 0.01526124, + "auxiliary_loss_mlp": 0.00318893, + "balance_loss_clip": 1.17214346, + "balance_loss_mlp": 0.29049695, + "epoch": 0.21926950248008417, + "flos": 15669442264320.0, + "grad_norm": 115.50145798923924, + "language_loss": 0.89224422, + "learning_rate": 3.635836861279901e-06, + "loss": 0.91069442, + "num_input_tokens_seen": 78628340, + "router_z_loss_clip": 3.54101562, + "router_z_loss_mlp": 0.28417969, + "step": 3647, + "time_per_iteration": 4.060911178588867 + }, + { + "auxiliary_loss_clip": 0.01526715, + "auxiliary_loss_mlp": 0.00308123, + "balance_loss_clip": 1.17320442, + "balance_loss_mlp": 0.28029931, + "epoch": 0.21932962573275214, + "flos": 30262496641920.0, + "grad_norm": 7.346363070088784, + "language_loss": 0.77557892, + "learning_rate": 3.635612759641123e-06, + "loss": 0.79392731, + "num_input_tokens_seen": 78649355, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 0.27832031, + "step": 3648, + "time_per_iteration": 2.7174322605133057 + }, + { + "auxiliary_loss_clip": 0.01546703, + "auxiliary_loss_mlp": 0.00358211, + "balance_loss_clip": 1.18365908, + "balance_loss_mlp": 0.32457009, + "epoch": 0.2193897489854201, + "flos": 10780132838400.0, + "grad_norm": 3.8617938494841764, + "language_loss": 0.81729341, + "learning_rate": 3.635388595979745e-06, + "loss": 0.83634257, + "num_input_tokens_seen": 78664915, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 0.33642578, + "step": 3649, + "time_per_iteration": 2.65700364112854 + }, + { + "auxiliary_loss_clip": 0.01543353, + "auxiliary_loss_mlp": 0.00334243, + "balance_loss_clip": 1.1911056, + "balance_loss_mlp": 0.30433309, + "epoch": 0.21944987223808807, + "flos": 19133313390720.0, + "grad_norm": 5.4947272753916945, + "language_loss": 0.92733514, + "learning_rate": 3.635164370304267e-06, + "loss": 0.94611096, + "num_input_tokens_seen": 78681475, + "router_z_loss_clip": 3.5234375, + "router_z_loss_mlp": 0.2989502, + "step": 3650, + "time_per_iteration": 4.070281505584717 + }, + { + "auxiliary_loss_clip": 0.0154735, + "auxiliary_loss_mlp": 0.00319668, + "balance_loss_clip": 1.19203889, + "balance_loss_mlp": 0.28802982, + "epoch": 0.21950999549075606, + "flos": 22711093522560.0, + "grad_norm": 7.316677456886712, + "language_loss": 0.91237068, + "learning_rate": 3.6349400826231927e-06, + "loss": 0.93104088, + "num_input_tokens_seen": 78702300, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 0.31616211, + "step": 3651, + "time_per_iteration": 2.668449640274048 + }, + { + "auxiliary_loss_clip": 0.01544067, + "auxiliary_loss_mlp": 0.0031907, + "balance_loss_clip": 1.19180608, + "balance_loss_mlp": 0.29032844, + "epoch": 0.21957011874342403, + "flos": 10561329141120.0, + "grad_norm": 29.706014275918317, + "language_loss": 0.83289635, + "learning_rate": 3.634715732945027e-06, + "loss": 0.85152781, + "num_input_tokens_seen": 78720230, + "router_z_loss_clip": 3.5234375, + "router_z_loss_mlp": 0.28747559, + "step": 3652, + "time_per_iteration": 4.025532007217407 + }, + { + "auxiliary_loss_clip": 0.01466882, + "auxiliary_loss_mlp": 0.0012509, + "balance_loss_clip": 1.21648204, + "balance_loss_mlp": 0.11769893, + "epoch": 0.219630241996092, + "flos": 65747913252480.0, + "grad_norm": 0.7334509122641438, + "language_loss": 0.51426327, + "learning_rate": 3.6344913212782764e-06, + "loss": 0.53018302, + "num_input_tokens_seen": 78780200, + "router_z_loss_clip": 2.5, + "router_z_loss_mlp": 0.07373047, + "step": 3653, + "time_per_iteration": 3.1085758209228516 + }, + { + "auxiliary_loss_clip": 0.01549517, + "auxiliary_loss_mlp": 0.00324187, + "balance_loss_clip": 1.1953429, + "balance_loss_mlp": 0.2941106, + "epoch": 0.21969036524875996, + "flos": 23696518216320.0, + "grad_norm": 3.3551344997472445, + "language_loss": 0.80652672, + "learning_rate": 3.6342668476314514e-06, + "loss": 0.82526374, + "num_input_tokens_seen": 78800575, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 0.30078125, + "step": 3654, + "time_per_iteration": 2.6303417682647705 + }, + { + "auxiliary_loss_clip": 0.01534793, + "auxiliary_loss_mlp": 0.0033485, + "balance_loss_clip": 1.18015969, + "balance_loss_mlp": 0.303188, + "epoch": 0.21975048850142792, + "flos": 19640910435840.0, + "grad_norm": 95.05182995387307, + "language_loss": 0.79085732, + "learning_rate": 3.634042312013064e-06, + "loss": 0.80955374, + "num_input_tokens_seen": 78819585, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 0.31689453, + "step": 3655, + "time_per_iteration": 2.5903360843658447 + }, + { + "auxiliary_loss_clip": 0.01517805, + "auxiliary_loss_mlp": 0.00306166, + "balance_loss_clip": 1.16896844, + "balance_loss_mlp": 0.27777046, + "epoch": 0.21981061175409589, + "flos": 22448550038400.0, + "grad_norm": 348.9580756303383, + "language_loss": 0.86485106, + "learning_rate": 3.6338177144316276e-06, + "loss": 0.88309073, + "num_input_tokens_seen": 78837330, + "router_z_loss_clip": 3.48828125, + "router_z_loss_mlp": 0.28417969, + "step": 3656, + "time_per_iteration": 2.6252734661102295 + }, + { + "auxiliary_loss_clip": 0.01549753, + "auxiliary_loss_mlp": 0.00307802, + "balance_loss_clip": 1.19306862, + "balance_loss_mlp": 0.27795249, + "epoch": 0.21987073500676388, + "flos": 18151049093760.0, + "grad_norm": 4.913705309308351, + "language_loss": 0.92876476, + "learning_rate": 3.63359305489566e-06, + "loss": 0.94734037, + "num_input_tokens_seen": 78854955, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 0.29870605, + "step": 3657, + "time_per_iteration": 3.977330446243286 + }, + { + "auxiliary_loss_clip": 0.01521146, + "auxiliary_loss_mlp": 0.00344456, + "balance_loss_clip": 1.16878676, + "balance_loss_mlp": 0.31217372, + "epoch": 0.21993085825943184, + "flos": 25626177682560.0, + "grad_norm": 179.05664953194702, + "language_loss": 0.88201547, + "learning_rate": 3.6333683334136803e-06, + "loss": 0.90067148, + "num_input_tokens_seen": 78874965, + "router_z_loss_clip": 3.5234375, + "router_z_loss_mlp": 0.32275391, + "step": 3658, + "time_per_iteration": 2.6576168537139893 + }, + { + "auxiliary_loss_clip": 0.01455089, + "auxiliary_loss_mlp": 0.0007329, + "balance_loss_clip": 1.17938018, + "balance_loss_mlp": 0.06609001, + "epoch": 0.2199909815120998, + "flos": 70923217743360.0, + "grad_norm": 0.7877157342308037, + "language_loss": 0.57986796, + "learning_rate": 3.6331435499942095e-06, + "loss": 0.59515178, + "num_input_tokens_seen": 78937740, + "router_z_loss_clip": 2.75, + "router_z_loss_mlp": 0.07177734, + "step": 3659, + "time_per_iteration": 3.222860813140869 + }, + { + "auxiliary_loss_clip": 0.01525743, + "auxiliary_loss_mlp": 0.00277454, + "balance_loss_clip": 1.16858852, + "balance_loss_mlp": 0.25039399, + "epoch": 0.22005110476476777, + "flos": 21543529939200.0, + "grad_norm": 4.133740285734391, + "language_loss": 0.8123244, + "learning_rate": 3.632918704645772e-06, + "loss": 0.83035642, + "num_input_tokens_seen": 78955055, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 0.27038574, + "step": 3660, + "time_per_iteration": 2.643080711364746 + }, + { + "auxiliary_loss_clip": 0.01522395, + "auxiliary_loss_mlp": 0.00280333, + "balance_loss_clip": 1.16250086, + "balance_loss_mlp": 0.25597811, + "epoch": 0.22011122801743574, + "flos": 22054502862720.0, + "grad_norm": 1.8506001459858434, + "language_loss": 0.86928678, + "learning_rate": 3.632693797376893e-06, + "loss": 0.88731408, + "num_input_tokens_seen": 78974895, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 0.24365234, + "step": 3661, + "time_per_iteration": 2.715698003768921 + }, + { + "auxiliary_loss_clip": 0.01546378, + "auxiliary_loss_mlp": 0.00248976, + "balance_loss_clip": 1.18486845, + "balance_loss_mlp": 0.22435948, + "epoch": 0.2201713512701037, + "flos": 26687589598080.0, + "grad_norm": 2.3508647921349213, + "language_loss": 0.78548348, + "learning_rate": 3.632468828196102e-06, + "loss": 0.80343699, + "num_input_tokens_seen": 78994990, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 0.24621582, + "step": 3662, + "time_per_iteration": 2.71220064163208 + }, + { + "auxiliary_loss_clip": 0.01532182, + "auxiliary_loss_mlp": 0.00302943, + "balance_loss_clip": 1.17915702, + "balance_loss_mlp": 0.27674121, + "epoch": 0.22023147452277167, + "flos": 22162198815360.0, + "grad_norm": 1.7961677842534731, + "language_loss": 0.83944595, + "learning_rate": 3.632243797111929e-06, + "loss": 0.85779721, + "num_input_tokens_seen": 79014405, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 0.26184082, + "step": 3663, + "time_per_iteration": 2.6385631561279297 + }, + { + "auxiliary_loss_clip": 0.0155674, + "auxiliary_loss_mlp": 0.00320358, + "balance_loss_clip": 1.19525719, + "balance_loss_mlp": 0.29237962, + "epoch": 0.22029159777543966, + "flos": 22523280284160.0, + "grad_norm": 3.3494031345199176, + "language_loss": 0.8724978, + "learning_rate": 3.632018704132908e-06, + "loss": 0.89126873, + "num_input_tokens_seen": 79032375, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 0.2800293, + "step": 3664, + "time_per_iteration": 2.625037908554077 + }, + { + "auxiliary_loss_clip": 0.01535083, + "auxiliary_loss_mlp": 0.00355846, + "balance_loss_clip": 1.17787862, + "balance_loss_mlp": 0.32373053, + "epoch": 0.22035172102810763, + "flos": 13042469093760.0, + "grad_norm": 384.55832231825053, + "language_loss": 0.8577944, + "learning_rate": 3.6317935492675742e-06, + "loss": 0.87670362, + "num_input_tokens_seen": 79049635, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 0.32128906, + "step": 3665, + "time_per_iteration": 2.625831127166748 + }, + { + "auxiliary_loss_clip": 0.01552814, + "auxiliary_loss_mlp": 0.00315692, + "balance_loss_clip": 1.19230723, + "balance_loss_mlp": 0.2868433, + "epoch": 0.2204118442807756, + "flos": 12165817760640.0, + "grad_norm": 7.28471252235349, + "language_loss": 1.0463388, + "learning_rate": 3.631568332524466e-06, + "loss": 1.0650239, + "num_input_tokens_seen": 79062890, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 0.28845215, + "step": 3666, + "time_per_iteration": 2.5694448947906494 + }, + { + "auxiliary_loss_clip": 0.01540949, + "auxiliary_loss_mlp": 0.00278159, + "balance_loss_clip": 1.19062483, + "balance_loss_mlp": 0.25189722, + "epoch": 0.22047196753344356, + "flos": 40108806673920.0, + "grad_norm": 2.542331521339245, + "language_loss": 0.84623539, + "learning_rate": 3.631343053912122e-06, + "loss": 0.86442649, + "num_input_tokens_seen": 79085495, + "router_z_loss_clip": 3.5, + "router_z_loss_mlp": 0.26245117, + "step": 3667, + "time_per_iteration": 2.775805711746216 + }, + { + "auxiliary_loss_clip": 0.01549659, + "auxiliary_loss_mlp": 0.00328319, + "balance_loss_clip": 1.19482636, + "balance_loss_mlp": 0.29730076, + "epoch": 0.22053209078611152, + "flos": 20701137202560.0, + "grad_norm": 6.6077892324985195, + "language_loss": 0.84722066, + "learning_rate": 3.631117713439087e-06, + "loss": 0.86600041, + "num_input_tokens_seen": 79101820, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 0.31030273, + "step": 3668, + "time_per_iteration": 2.6298768520355225 + }, + { + "auxiliary_loss_clip": 0.01535656, + "auxiliary_loss_mlp": 0.00271464, + "balance_loss_clip": 1.18987036, + "balance_loss_mlp": 0.2437951, + "epoch": 0.2205922140387795, + "flos": 24716309247360.0, + "grad_norm": 5.451919598688381, + "language_loss": 0.77815115, + "learning_rate": 3.630892311113904e-06, + "loss": 0.79622245, + "num_input_tokens_seen": 79123320, + "router_z_loss_clip": 3.45507812, + "router_z_loss_mlp": 0.27697754, + "step": 3669, + "time_per_iteration": 2.6486165523529053 + }, + { + "auxiliary_loss_clip": 0.01534398, + "auxiliary_loss_mlp": 0.00296183, + "balance_loss_clip": 1.18771958, + "balance_loss_mlp": 0.27112502, + "epoch": 0.22065233729144745, + "flos": 23477247642240.0, + "grad_norm": 3.6751953844631573, + "language_loss": 0.90529603, + "learning_rate": 3.6306668469451215e-06, + "loss": 0.92360187, + "num_input_tokens_seen": 79141615, + "router_z_loss_clip": 3.46875, + "router_z_loss_mlp": 0.25048828, + "step": 3670, + "time_per_iteration": 2.6147139072418213 + }, + { + "auxiliary_loss_clip": 0.01571282, + "auxiliary_loss_mlp": 0.00282816, + "balance_loss_clip": 1.21981442, + "balance_loss_mlp": 0.25542152, + "epoch": 0.22071246054411545, + "flos": 35225566646400.0, + "grad_norm": 2.4633340471157505, + "language_loss": 0.83720291, + "learning_rate": 3.6304413209412886e-06, + "loss": 0.85574389, + "num_input_tokens_seen": 79164910, + "router_z_loss_clip": 3.515625, + "router_z_loss_mlp": 0.27404785, + "step": 3671, + "time_per_iteration": 2.6902925968170166 + }, + { + "auxiliary_loss_clip": 0.01572812, + "auxiliary_loss_mlp": 0.00244554, + "balance_loss_clip": 1.22162938, + "balance_loss_mlp": 0.2215583, + "epoch": 0.2207725837967834, + "flos": 18150294908160.0, + "grad_norm": 2.204238601109321, + "language_loss": 0.88393527, + "learning_rate": 3.6302157331109573e-06, + "loss": 0.90210891, + "num_input_tokens_seen": 79179685, + "router_z_loss_clip": 3.51171875, + "router_z_loss_mlp": 0.22998047, + "step": 3672, + "time_per_iteration": 2.5950493812561035 + }, + { + "auxiliary_loss_clip": 0.01552066, + "auxiliary_loss_mlp": 0.00272158, + "balance_loss_clip": 1.2056942, + "balance_loss_mlp": 0.24626553, + "epoch": 0.22083270704945138, + "flos": 20479675898880.0, + "grad_norm": 1.7615580898166965, + "language_loss": 0.80253083, + "learning_rate": 3.629990083462682e-06, + "loss": 0.82077307, + "num_input_tokens_seen": 79196285, + "router_z_loss_clip": 3.46484375, + "router_z_loss_mlp": 0.25878906, + "step": 3673, + "time_per_iteration": 2.6458864212036133 + }, + { + "auxiliary_loss_clip": 0.01555227, + "auxiliary_loss_mlp": 0.00287149, + "balance_loss_clip": 1.21455431, + "balance_loss_mlp": 0.25923008, + "epoch": 0.22089283030211934, + "flos": 34125801984000.0, + "grad_norm": 11.781788948197653, + "language_loss": 0.85621011, + "learning_rate": 3.6297643720050203e-06, + "loss": 0.87463385, + "num_input_tokens_seen": 79216060, + "router_z_loss_clip": 3.40625, + "router_z_loss_mlp": 0.27929688, + "step": 3674, + "time_per_iteration": 2.7116451263427734 + }, + { + "auxiliary_loss_clip": 0.01559789, + "auxiliary_loss_mlp": 0.00293964, + "balance_loss_clip": 1.21605468, + "balance_loss_mlp": 0.26558033, + "epoch": 0.2209529535547873, + "flos": 18077216688000.0, + "grad_norm": 1.8237942258806847, + "language_loss": 0.84058917, + "learning_rate": 3.6295385987465293e-06, + "loss": 0.85912669, + "num_input_tokens_seen": 79235145, + "router_z_loss_clip": 3.43359375, + "router_z_loss_mlp": 0.28393555, + "step": 3675, + "time_per_iteration": 2.6111960411071777 + }, + { + "auxiliary_loss_clip": 0.01561282, + "auxiliary_loss_mlp": 0.00284995, + "balance_loss_clip": 1.22122908, + "balance_loss_mlp": 0.25822049, + "epoch": 0.22101307680745527, + "flos": 27235335070080.0, + "grad_norm": 7.848373884358802, + "language_loss": 0.87316263, + "learning_rate": 3.629312763695772e-06, + "loss": 0.8916254, + "num_input_tokens_seen": 79256960, + "router_z_loss_clip": 3.40625, + "router_z_loss_mlp": 0.26782227, + "step": 3676, + "time_per_iteration": 2.6955549716949463 + }, + { + "auxiliary_loss_clip": 0.01549374, + "auxiliary_loss_mlp": 0.00304344, + "balance_loss_clip": 1.21278012, + "balance_loss_mlp": 0.27621022, + "epoch": 0.22107320006012326, + "flos": 16543256423040.0, + "grad_norm": 3.747263336852701, + "language_loss": 0.86282802, + "learning_rate": 3.6290868668613107e-06, + "loss": 0.88136518, + "num_input_tokens_seen": 79274860, + "router_z_loss_clip": 3.36523438, + "router_z_loss_mlp": 0.28125, + "step": 3677, + "time_per_iteration": 2.59421968460083 + }, + { + "auxiliary_loss_clip": 0.01574273, + "auxiliary_loss_mlp": 0.00288401, + "balance_loss_clip": 1.22534668, + "balance_loss_mlp": 0.26159057, + "epoch": 0.22113332331279123, + "flos": 22054466949120.0, + "grad_norm": 11.14396513509158, + "language_loss": 0.91137254, + "learning_rate": 3.628860908251712e-06, + "loss": 0.92999923, + "num_input_tokens_seen": 79294005, + "router_z_loss_clip": 3.49023438, + "router_z_loss_mlp": 0.26794434, + "step": 3678, + "time_per_iteration": 2.643198251724243 + }, + { + "auxiliary_loss_clip": 0.01575697, + "auxiliary_loss_mlp": 0.00275592, + "balance_loss_clip": 1.22563601, + "balance_loss_mlp": 0.24915087, + "epoch": 0.2211934465654592, + "flos": 26612787525120.0, + "grad_norm": 1.787366371080943, + "language_loss": 0.95562923, + "learning_rate": 3.6286348878755452e-06, + "loss": 0.97414219, + "num_input_tokens_seen": 79314005, + "router_z_loss_clip": 3.50195312, + "router_z_loss_mlp": 0.2644043, + "step": 3679, + "time_per_iteration": 2.693951368331909 + }, + { + "auxiliary_loss_clip": 0.01613782, + "auxiliary_loss_mlp": 0.00287671, + "balance_loss_clip": 1.25679982, + "balance_loss_mlp": 0.26180291, + "epoch": 0.22125356981812716, + "flos": 16360363347840.0, + "grad_norm": 18.093333379184173, + "language_loss": 0.94428951, + "learning_rate": 3.6284088057413803e-06, + "loss": 0.96330404, + "num_input_tokens_seen": 79331030, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 0.25915527, + "step": 3680, + "time_per_iteration": 2.6228039264678955 + }, + { + "auxiliary_loss_clip": 0.01623027, + "auxiliary_loss_mlp": 0.00276076, + "balance_loss_clip": 1.26449943, + "balance_loss_mlp": 0.24900314, + "epoch": 0.22131369307079513, + "flos": 21651118151040.0, + "grad_norm": 24.435980862523223, + "language_loss": 0.88085759, + "learning_rate": 3.6281826618577894e-06, + "loss": 0.89984852, + "num_input_tokens_seen": 79348560, + "router_z_loss_clip": 3.59179688, + "router_z_loss_mlp": 0.27062988, + "step": 3681, + "time_per_iteration": 2.6269021034240723 + }, + { + "auxiliary_loss_clip": 0.01608963, + "auxiliary_loss_mlp": 0.00278049, + "balance_loss_clip": 1.25551772, + "balance_loss_mlp": 0.25238347, + "epoch": 0.2213738163234631, + "flos": 19609524927360.0, + "grad_norm": 8.5076292668002, + "language_loss": 0.88539588, + "learning_rate": 3.62795645623335e-06, + "loss": 0.904266, + "num_input_tokens_seen": 79367175, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 0.25683594, + "step": 3682, + "time_per_iteration": 2.5884249210357666 + }, + { + "auxiliary_loss_clip": 0.01619954, + "auxiliary_loss_mlp": 0.00307035, + "balance_loss_clip": 1.26633716, + "balance_loss_mlp": 0.28039202, + "epoch": 0.22143393957613106, + "flos": 23623404082560.0, + "grad_norm": 2.768790432028993, + "language_loss": 0.8463006, + "learning_rate": 3.627730188876638e-06, + "loss": 0.86557055, + "num_input_tokens_seen": 79388435, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 0.26635742, + "step": 3683, + "time_per_iteration": 2.6336348056793213 + }, + { + "auxiliary_loss_clip": 0.01608906, + "auxiliary_loss_mlp": 0.00294743, + "balance_loss_clip": 1.25589383, + "balance_loss_mlp": 0.26802772, + "epoch": 0.22149406282879905, + "flos": 26177801823360.0, + "grad_norm": 8.180395690943065, + "language_loss": 0.8207382, + "learning_rate": 3.627503859796234e-06, + "loss": 0.83977461, + "num_input_tokens_seen": 79407910, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 0.26696777, + "step": 3684, + "time_per_iteration": 2.6592659950256348 + }, + { + "auxiliary_loss_clip": 0.01625066, + "auxiliary_loss_mlp": 0.00287154, + "balance_loss_clip": 1.2682054, + "balance_loss_mlp": 0.25909197, + "epoch": 0.221554186081467, + "flos": 14538758970240.0, + "grad_norm": 2.8508009955158307, + "language_loss": 0.87378955, + "learning_rate": 3.6272774690007207e-06, + "loss": 0.89291167, + "num_input_tokens_seen": 79424020, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 0.28063965, + "step": 3685, + "time_per_iteration": 2.6591267585754395 + }, + { + "auxiliary_loss_clip": 0.01647171, + "auxiliary_loss_mlp": 0.00262565, + "balance_loss_clip": 1.28293014, + "balance_loss_mlp": 0.23700607, + "epoch": 0.22161430933413498, + "flos": 22238257864320.0, + "grad_norm": 1.3897668354112773, + "language_loss": 0.91917908, + "learning_rate": 3.6270510164986823e-06, + "loss": 0.93827653, + "num_input_tokens_seen": 79445605, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 0.2557373, + "step": 3686, + "time_per_iteration": 2.652676582336426 + }, + { + "auxiliary_loss_clip": 0.01629834, + "auxiliary_loss_mlp": 0.00379878, + "balance_loss_clip": 1.27197981, + "balance_loss_mlp": 0.34922922, + "epoch": 0.22167443258680294, + "flos": 23476529370240.0, + "grad_norm": 5.822234974423388, + "language_loss": 0.86127198, + "learning_rate": 3.626824502298707e-06, + "loss": 0.88136911, + "num_input_tokens_seen": 79463850, + "router_z_loss_clip": 3.57617188, + "router_z_loss_mlp": 0.30639648, + "step": 3687, + "time_per_iteration": 2.6901378631591797 + }, + { + "auxiliary_loss_clip": 0.01642433, + "auxiliary_loss_mlp": 0.00331323, + "balance_loss_clip": 1.28225315, + "balance_loss_mlp": 0.3009485, + "epoch": 0.2217345558394709, + "flos": 23221132692480.0, + "grad_norm": 13.64467537306214, + "language_loss": 0.93890786, + "learning_rate": 3.626597926409383e-06, + "loss": 0.95864534, + "num_input_tokens_seen": 79482845, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 0.30371094, + "step": 3688, + "time_per_iteration": 2.617197036743164 + }, + { + "auxiliary_loss_clip": 0.01657546, + "auxiliary_loss_mlp": 0.00372826, + "balance_loss_clip": 1.30028629, + "balance_loss_mlp": 0.33963817, + "epoch": 0.22179467909213887, + "flos": 20011078045440.0, + "grad_norm": 24.42787567537049, + "language_loss": 0.87923419, + "learning_rate": 3.6263712888393027e-06, + "loss": 0.89953792, + "num_input_tokens_seen": 79501550, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 0.33178711, + "step": 3689, + "time_per_iteration": 4.033870458602905 + }, + { + "auxiliary_loss_clip": 0.01672387, + "auxiliary_loss_mlp": 0.00348362, + "balance_loss_clip": 1.31137133, + "balance_loss_mlp": 0.31888172, + "epoch": 0.22185480234480687, + "flos": 19683034110720.0, + "grad_norm": 2.9699809532193893, + "language_loss": 0.78957111, + "learning_rate": 3.626144589597061e-06, + "loss": 0.80977857, + "num_input_tokens_seen": 79519680, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 0.2947998, + "step": 3690, + "time_per_iteration": 2.629692554473877 + }, + { + "auxiliary_loss_clip": 0.01688025, + "auxiliary_loss_mlp": 0.00370755, + "balance_loss_clip": 1.32806337, + "balance_loss_mlp": 0.33840209, + "epoch": 0.22191492559747483, + "flos": 21981316901760.0, + "grad_norm": 17.317343953410802, + "language_loss": 0.81772292, + "learning_rate": 3.6259178286912528e-06, + "loss": 0.83831072, + "num_input_tokens_seen": 79539000, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 0.32324219, + "step": 3691, + "time_per_iteration": 2.6158835887908936 + }, + { + "auxiliary_loss_clip": 0.01698609, + "auxiliary_loss_mlp": 0.00353021, + "balance_loss_clip": 1.33793259, + "balance_loss_mlp": 0.32047677, + "epoch": 0.2219750488501428, + "flos": 23222066446080.0, + "grad_norm": 5.607678534291826, + "language_loss": 0.81631404, + "learning_rate": 3.625691006130477e-06, + "loss": 0.83683038, + "num_input_tokens_seen": 79559695, + "router_z_loss_clip": 3.60742188, + "router_z_loss_mlp": 0.32543945, + "step": 3692, + "time_per_iteration": 4.0299012660980225 + }, + { + "auxiliary_loss_clip": 0.01697891, + "auxiliary_loss_mlp": 0.00374294, + "balance_loss_clip": 1.34024358, + "balance_loss_mlp": 0.34189284, + "epoch": 0.22203517210281076, + "flos": 22453685683200.0, + "grad_norm": 11.286296493580691, + "language_loss": 0.93502867, + "learning_rate": 3.6254641219233362e-06, + "loss": 0.95575052, + "num_input_tokens_seen": 79579095, + "router_z_loss_clip": 3.57617188, + "router_z_loss_mlp": 0.32397461, + "step": 3693, + "time_per_iteration": 2.6183626651763916 + }, + { + "auxiliary_loss_clip": 0.01696813, + "auxiliary_loss_mlp": 0.00323026, + "balance_loss_clip": 1.33565855, + "balance_loss_mlp": 0.29379582, + "epoch": 0.22209529535547873, + "flos": 17564555825280.0, + "grad_norm": 11.638232903147339, + "language_loss": 0.94132411, + "learning_rate": 3.6252371760784325e-06, + "loss": 0.96152258, + "num_input_tokens_seen": 79596430, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 0.29248047, + "step": 3694, + "time_per_iteration": 2.5532171726226807 + }, + { + "auxiliary_loss_clip": 0.01696223, + "auxiliary_loss_mlp": 0.00341683, + "balance_loss_clip": 1.32694256, + "balance_loss_mlp": 0.3092584, + "epoch": 0.2221554186081467, + "flos": 21469015175040.0, + "grad_norm": 1.9933793717720325, + "language_loss": 0.7982434, + "learning_rate": 3.6250101686043725e-06, + "loss": 0.81862247, + "num_input_tokens_seen": 79615825, + "router_z_loss_clip": 3.6953125, + "router_z_loss_mlp": 0.32421875, + "step": 3695, + "time_per_iteration": 4.054062366485596 + }, + { + "auxiliary_loss_clip": 0.01712687, + "auxiliary_loss_mlp": 0.00365992, + "balance_loss_clip": 1.35601735, + "balance_loss_mlp": 0.33487841, + "epoch": 0.22221554186081466, + "flos": 27673445255040.0, + "grad_norm": 50.33291954235224, + "language_loss": 0.77830058, + "learning_rate": 3.6247830995097637e-06, + "loss": 0.79908735, + "num_input_tokens_seen": 79637875, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 0.3112793, + "step": 3696, + "time_per_iteration": 2.6508381366729736 + }, + { + "auxiliary_loss_clip": 0.01721258, + "auxiliary_loss_mlp": 0.00355809, + "balance_loss_clip": 1.35634518, + "balance_loss_mlp": 0.32448041, + "epoch": 0.22227566511348265, + "flos": 25958926298880.0, + "grad_norm": 23.22595493798862, + "language_loss": 0.9437325, + "learning_rate": 3.624555968803217e-06, + "loss": 0.96450323, + "num_input_tokens_seen": 79656970, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 0.31347656, + "step": 3697, + "time_per_iteration": 2.628314256668091 + }, + { + "auxiliary_loss_clip": 0.01733714, + "auxiliary_loss_mlp": 0.00316092, + "balance_loss_clip": 1.36933804, + "balance_loss_mlp": 0.28569365, + "epoch": 0.22233578836615062, + "flos": 39203678833920.0, + "grad_norm": 1.5316105139934753, + "language_loss": 0.73240703, + "learning_rate": 3.624328776493346e-06, + "loss": 0.75290513, + "num_input_tokens_seen": 79680275, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 0.30395508, + "step": 3698, + "time_per_iteration": 2.74393892288208 + }, + { + "auxiliary_loss_clip": 0.01728017, + "auxiliary_loss_mlp": 0.00347417, + "balance_loss_clip": 1.36542499, + "balance_loss_mlp": 0.31413391, + "epoch": 0.22239591161881858, + "flos": 36283782251520.0, + "grad_norm": 79.84971668382472, + "language_loss": 0.90833133, + "learning_rate": 3.6241015225887637e-06, + "loss": 0.92908561, + "num_input_tokens_seen": 79701255, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 0.33276367, + "step": 3699, + "time_per_iteration": 4.102436304092407 + }, + { + "auxiliary_loss_clip": 0.01722163, + "auxiliary_loss_mlp": 0.00320874, + "balance_loss_clip": 1.36665392, + "balance_loss_mlp": 0.29054725, + "epoch": 0.22245603487148655, + "flos": 19719591177600.0, + "grad_norm": 2.054063251891168, + "language_loss": 0.87160373, + "learning_rate": 3.62387420709809e-06, + "loss": 0.89203405, + "num_input_tokens_seen": 79721315, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 0.30273438, + "step": 3700, + "time_per_iteration": 2.6151325702667236 + }, + { + "auxiliary_loss_clip": 0.01728938, + "auxiliary_loss_mlp": 0.00365476, + "balance_loss_clip": 1.37781692, + "balance_loss_mlp": 0.33104861, + "epoch": 0.2225161581241545, + "flos": 46280450615040.0, + "grad_norm": 16.31634392870071, + "language_loss": 0.80071652, + "learning_rate": 3.623646830029943e-06, + "loss": 0.82166064, + "num_input_tokens_seen": 79742705, + "router_z_loss_clip": 3.50976562, + "router_z_loss_mlp": 0.34399414, + "step": 3701, + "time_per_iteration": 2.809668779373169 + }, + { + "auxiliary_loss_clip": 0.01752385, + "auxiliary_loss_mlp": 0.0031881, + "balance_loss_clip": 1.4018718, + "balance_loss_mlp": 0.28678989, + "epoch": 0.22257628137682248, + "flos": 23696194993920.0, + "grad_norm": 31.486857422584386, + "language_loss": 0.8939867, + "learning_rate": 3.6234193913929454e-06, + "loss": 0.91469866, + "num_input_tokens_seen": 79763000, + "router_z_loss_clip": 3.50976562, + "router_z_loss_mlp": 0.32006836, + "step": 3702, + "time_per_iteration": 2.6198415756225586 + }, + { + "auxiliary_loss_clip": 0.01733855, + "auxiliary_loss_mlp": 0.00303546, + "balance_loss_clip": 1.39256454, + "balance_loss_mlp": 0.27383897, + "epoch": 0.22263640462949044, + "flos": 19353984595200.0, + "grad_norm": 101.47959777847205, + "language_loss": 0.86793268, + "learning_rate": 3.623191891195723e-06, + "loss": 0.88830674, + "num_input_tokens_seen": 79781335, + "router_z_loss_clip": 3.4140625, + "router_z_loss_mlp": 0.29736328, + "step": 3703, + "time_per_iteration": 2.614367961883545 + }, + { + "auxiliary_loss_clip": 0.0172518, + "auxiliary_loss_mlp": 0.00306889, + "balance_loss_clip": 1.38021469, + "balance_loss_mlp": 0.27551284, + "epoch": 0.22269652788215843, + "flos": 20776047016320.0, + "grad_norm": 22.16151273097123, + "language_loss": 0.83292335, + "learning_rate": 3.6229643294469005e-06, + "loss": 0.85324407, + "num_input_tokens_seen": 79800150, + "router_z_loss_clip": 3.4453125, + "router_z_loss_mlp": 0.31396484, + "step": 3704, + "time_per_iteration": 2.631744623184204 + }, + { + "auxiliary_loss_clip": 0.01740922, + "auxiliary_loss_mlp": 0.00297918, + "balance_loss_clip": 1.39778256, + "balance_loss_mlp": 0.26801986, + "epoch": 0.2227566511348264, + "flos": 47958843467520.0, + "grad_norm": 9.415190522015516, + "language_loss": 0.71521211, + "learning_rate": 3.6227367061551074e-06, + "loss": 0.73560047, + "num_input_tokens_seen": 79822390, + "router_z_loss_clip": 3.43164062, + "router_z_loss_mlp": 0.29907227, + "step": 3705, + "time_per_iteration": 2.8331565856933594 + }, + { + "auxiliary_loss_clip": 0.01713981, + "auxiliary_loss_mlp": 0.00064361, + "balance_loss_clip": 1.4706037, + "balance_loss_mlp": 0.0575418, + "epoch": 0.22281677438749437, + "flos": 66218953230720.0, + "grad_norm": 1.0600923250007073, + "language_loss": 0.64869475, + "learning_rate": 3.6225090213289766e-06, + "loss": 0.66647816, + "num_input_tokens_seen": 79873350, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.06835938, + "step": 3706, + "time_per_iteration": 3.0434606075286865 + }, + { + "auxiliary_loss_clip": 0.01760707, + "auxiliary_loss_mlp": 0.00309879, + "balance_loss_clip": 1.41355371, + "balance_loss_mlp": 0.27659565, + "epoch": 0.22287689764016233, + "flos": 21871609787520.0, + "grad_norm": 12.079676041156633, + "language_loss": 0.88239419, + "learning_rate": 3.622281274977141e-06, + "loss": 0.90310007, + "num_input_tokens_seen": 79891715, + "router_z_loss_clip": 3.47070312, + "router_z_loss_mlp": 0.33276367, + "step": 3707, + "time_per_iteration": 2.7407302856445312 + }, + { + "auxiliary_loss_clip": 0.01747122, + "auxiliary_loss_mlp": 0.00286677, + "balance_loss_clip": 1.40257502, + "balance_loss_mlp": 0.25776899, + "epoch": 0.2229370208928303, + "flos": 27672475587840.0, + "grad_norm": 536.1346735333221, + "language_loss": 0.86869705, + "learning_rate": 3.6220534671082367e-06, + "loss": 0.88903505, + "num_input_tokens_seen": 79911175, + "router_z_loss_clip": 3.44335938, + "router_z_loss_mlp": 0.2890625, + "step": 3708, + "time_per_iteration": 2.645179510116577 + }, + { + "auxiliary_loss_clip": 0.01768878, + "auxiliary_loss_mlp": 0.00292178, + "balance_loss_clip": 1.41839325, + "balance_loss_mlp": 0.25946712, + "epoch": 0.22299714414549826, + "flos": 30154657034880.0, + "grad_norm": 5.761874765795653, + "language_loss": 0.89920354, + "learning_rate": 3.6218255977309024e-06, + "loss": 0.91981405, + "num_input_tokens_seen": 79931875, + "router_z_loss_clip": 3.50390625, + "router_z_loss_mlp": 0.32714844, + "step": 3709, + "time_per_iteration": 2.674984931945801 + }, + { + "auxiliary_loss_clip": 0.01753318, + "auxiliary_loss_mlp": 0.00312178, + "balance_loss_clip": 1.40389371, + "balance_loss_mlp": 0.27786934, + "epoch": 0.22305726739816625, + "flos": 23143134309120.0, + "grad_norm": 3.1592723537410716, + "language_loss": 0.78936088, + "learning_rate": 3.6215976668537787e-06, + "loss": 0.81001586, + "num_input_tokens_seen": 79952445, + "router_z_loss_clip": 3.49609375, + "router_z_loss_mlp": 0.34289551, + "step": 3710, + "time_per_iteration": 2.633990526199341 + }, + { + "auxiliary_loss_clip": 0.01743961, + "auxiliary_loss_mlp": 0.0028421, + "balance_loss_clip": 1.40137339, + "balance_loss_mlp": 0.25409758, + "epoch": 0.22311739065083422, + "flos": 19172061187200.0, + "grad_norm": 6.357035540803494, + "language_loss": 0.97972631, + "learning_rate": 3.6213696744855096e-06, + "loss": 1.00000811, + "num_input_tokens_seen": 79971030, + "router_z_loss_clip": 3.4296875, + "router_z_loss_mlp": 0.30102539, + "step": 3711, + "time_per_iteration": 2.5900700092315674 + }, + { + "auxiliary_loss_clip": 0.01745471, + "auxiliary_loss_mlp": 0.00321092, + "balance_loss_clip": 1.40614247, + "balance_loss_mlp": 0.28659284, + "epoch": 0.22317751390350218, + "flos": 13617757319040.0, + "grad_norm": 33.553328462623845, + "language_loss": 0.99660897, + "learning_rate": 3.6211416206347395e-06, + "loss": 1.01727462, + "num_input_tokens_seen": 79982085, + "router_z_loss_clip": 3.39257812, + "router_z_loss_mlp": 0.3449707, + "step": 3712, + "time_per_iteration": 2.5452916622161865 + }, + { + "auxiliary_loss_clip": 0.0174944, + "auxiliary_loss_mlp": 0.00277402, + "balance_loss_clip": 1.41678905, + "balance_loss_mlp": 0.24478626, + "epoch": 0.22323763715617015, + "flos": 11029065068160.0, + "grad_norm": 5.096945967082815, + "language_loss": 0.86157012, + "learning_rate": 3.620913505310117e-06, + "loss": 0.88183856, + "num_input_tokens_seen": 79997460, + "router_z_loss_clip": 3.33007812, + "router_z_loss_mlp": 0.32653809, + "step": 3713, + "time_per_iteration": 2.5520167350769043 + }, + { + "auxiliary_loss_clip": 0.01746571, + "auxiliary_loss_mlp": 0.00291547, + "balance_loss_clip": 1.40656614, + "balance_loss_mlp": 0.26095751, + "epoch": 0.22329776040883811, + "flos": 41351531466240.0, + "grad_norm": 76.01407686239082, + "language_loss": 0.69798458, + "learning_rate": 3.6206853285202917e-06, + "loss": 0.71836579, + "num_input_tokens_seen": 80022450, + "router_z_loss_clip": 3.40039062, + "router_z_loss_mlp": 0.30615234, + "step": 3714, + "time_per_iteration": 2.7956221103668213 + }, + { + "auxiliary_loss_clip": 0.0175136, + "auxiliary_loss_mlp": 0.00290504, + "balance_loss_clip": 1.41132259, + "balance_loss_mlp": 0.25977197, + "epoch": 0.22335788366150608, + "flos": 25119478477440.0, + "grad_norm": 5.1323160327537165, + "language_loss": 0.86282134, + "learning_rate": 3.6204570902739164e-06, + "loss": 0.88323998, + "num_input_tokens_seen": 80042100, + "router_z_loss_clip": 3.40039062, + "router_z_loss_mlp": 0.30761719, + "step": 3715, + "time_per_iteration": 2.644925832748413 + }, + { + "auxiliary_loss_clip": 0.01729833, + "auxiliary_loss_mlp": 0.00285261, + "balance_loss_clip": 1.40240288, + "balance_loss_mlp": 0.25469548, + "epoch": 0.22341800691417404, + "flos": 16983377769600.0, + "grad_norm": 385.06320342901046, + "language_loss": 0.8525815, + "learning_rate": 3.620228790579645e-06, + "loss": 0.8727324, + "num_input_tokens_seen": 80059690, + "router_z_loss_clip": 3.27734375, + "router_z_loss_mlp": 0.30566406, + "step": 3716, + "time_per_iteration": 2.586158514022827 + }, + { + "auxiliary_loss_clip": 0.01753008, + "auxiliary_loss_mlp": 0.00277589, + "balance_loss_clip": 1.41590047, + "balance_loss_mlp": 0.24630815, + "epoch": 0.22347813016684204, + "flos": 14136738975360.0, + "grad_norm": 14.075362001549422, + "language_loss": 0.88795424, + "learning_rate": 3.6200004294461367e-06, + "loss": 0.90826023, + "num_input_tokens_seen": 80076060, + "router_z_loss_clip": 3.37109375, + "router_z_loss_mlp": 0.31262207, + "step": 3717, + "time_per_iteration": 2.5936636924743652 + }, + { + "auxiliary_loss_clip": 0.01759981, + "auxiliary_loss_mlp": 0.00293332, + "balance_loss_clip": 1.41568518, + "balance_loss_mlp": 0.25711617, + "epoch": 0.22353825341951, + "flos": 23583147914880.0, + "grad_norm": 1684.0348916195737, + "language_loss": 0.7658022, + "learning_rate": 3.6197720068820497e-06, + "loss": 0.78633535, + "num_input_tokens_seen": 80094760, + "router_z_loss_clip": 3.44140625, + "router_z_loss_mlp": 0.36206055, + "step": 3718, + "time_per_iteration": 2.6291565895080566 + }, + { + "auxiliary_loss_clip": 0.01760422, + "auxiliary_loss_mlp": 0.00290654, + "balance_loss_clip": 1.42101526, + "balance_loss_mlp": 0.25384185, + "epoch": 0.22359837667217797, + "flos": 29824206888960.0, + "grad_norm": 2.9258486960805072, + "language_loss": 0.8650775, + "learning_rate": 3.619543522896045e-06, + "loss": 0.88558829, + "num_input_tokens_seen": 80114475, + "router_z_loss_clip": 3.39257812, + "router_z_loss_mlp": 0.36816406, + "step": 3719, + "time_per_iteration": 2.658836603164673 + }, + { + "auxiliary_loss_clip": 0.01748132, + "auxiliary_loss_mlp": 0.00285808, + "balance_loss_clip": 1.40807343, + "balance_loss_mlp": 0.24863864, + "epoch": 0.22365849992484593, + "flos": 17603088140160.0, + "grad_norm": 18.19607218504418, + "language_loss": 0.95700097, + "learning_rate": 3.6193149774967885e-06, + "loss": 0.97734034, + "num_input_tokens_seen": 80132920, + "router_z_loss_clip": 3.40429688, + "router_z_loss_mlp": 0.37158203, + "step": 3720, + "time_per_iteration": 2.607684373855591 + }, + { + "auxiliary_loss_clip": 0.01748168, + "auxiliary_loss_mlp": 0.00256905, + "balance_loss_clip": 1.41860306, + "balance_loss_mlp": 0.2235266, + "epoch": 0.2237186231775139, + "flos": 22710949868160.0, + "grad_norm": 1.883720739458755, + "language_loss": 0.79906577, + "learning_rate": 3.619086370692945e-06, + "loss": 0.81911647, + "num_input_tokens_seen": 80152845, + "router_z_loss_clip": 3.29882812, + "router_z_loss_mlp": 0.33374023, + "step": 3721, + "time_per_iteration": 2.6033802032470703 + }, + { + "auxiliary_loss_clip": 0.01753427, + "auxiliary_loss_mlp": 0.00281108, + "balance_loss_clip": 1.41026485, + "balance_loss_mlp": 0.24584566, + "epoch": 0.22377874643018186, + "flos": 13371518609280.0, + "grad_norm": 2.5992682267625895, + "language_loss": 0.88445222, + "learning_rate": 3.6188577024931844e-06, + "loss": 0.90479761, + "num_input_tokens_seen": 80170680, + "router_z_loss_clip": 3.43359375, + "router_z_loss_mlp": 0.35253906, + "step": 3722, + "time_per_iteration": 2.6264517307281494 + }, + { + "auxiliary_loss_clip": 0.01716987, + "auxiliary_loss_mlp": 0.00275511, + "balance_loss_clip": 1.38306284, + "balance_loss_mlp": 0.24468386, + "epoch": 0.22383886968284986, + "flos": 17894970057600.0, + "grad_norm": 4.237143029277335, + "language_loss": 0.90874046, + "learning_rate": 3.618628972906178e-06, + "loss": 0.92866552, + "num_input_tokens_seen": 80189030, + "router_z_loss_clip": 3.33984375, + "router_z_loss_mlp": 0.30834961, + "step": 3723, + "time_per_iteration": 2.566830635070801 + }, + { + "auxiliary_loss_clip": 0.01707523, + "auxiliary_loss_mlp": 0.00300966, + "balance_loss_clip": 1.37428427, + "balance_loss_mlp": 0.26658565, + "epoch": 0.22389899293551782, + "flos": 23879123982720.0, + "grad_norm": 41.43982533131782, + "language_loss": 0.93413842, + "learning_rate": 3.6184001819405984e-06, + "loss": 0.95422328, + "num_input_tokens_seen": 80208365, + "router_z_loss_clip": 3.33203125, + "router_z_loss_mlp": 0.34375, + "step": 3724, + "time_per_iteration": 2.6416313648223877 + }, + { + "auxiliary_loss_clip": 0.0172444, + "auxiliary_loss_mlp": 0.00282778, + "balance_loss_clip": 1.38867891, + "balance_loss_mlp": 0.24773084, + "epoch": 0.2239591161881858, + "flos": 27272430840960.0, + "grad_norm": 7.476115715257439, + "language_loss": 0.87355769, + "learning_rate": 3.618171329605121e-06, + "loss": 0.89362991, + "num_input_tokens_seen": 80228685, + "router_z_loss_clip": 3.35546875, + "router_z_loss_mlp": 0.3503418, + "step": 3725, + "time_per_iteration": 2.640695095062256 + }, + { + "auxiliary_loss_clip": 0.01709947, + "auxiliary_loss_mlp": 0.0027517, + "balance_loss_clip": 1.37383604, + "balance_loss_mlp": 0.24431907, + "epoch": 0.22401923944085375, + "flos": 22236857233920.0, + "grad_norm": 2.1957370696906846, + "language_loss": 0.8588782, + "learning_rate": 3.6179424159084254e-06, + "loss": 0.87872934, + "num_input_tokens_seen": 80247635, + "router_z_loss_clip": 3.36132812, + "router_z_loss_mlp": 0.30859375, + "step": 3726, + "time_per_iteration": 2.6212878227233887 + }, + { + "auxiliary_loss_clip": 0.01706481, + "auxiliary_loss_mlp": 0.00308717, + "balance_loss_clip": 1.36788344, + "balance_loss_mlp": 0.27307314, + "epoch": 0.22407936269352172, + "flos": 12053668521600.0, + "grad_norm": 6.265205322844015, + "language_loss": 0.86108208, + "learning_rate": 3.6177134408591914e-06, + "loss": 0.88123411, + "num_input_tokens_seen": 80260045, + "router_z_loss_clip": 3.38867188, + "router_z_loss_mlp": 0.35644531, + "step": 3727, + "time_per_iteration": 2.750154972076416 + }, + { + "auxiliary_loss_clip": 0.016848, + "auxiliary_loss_mlp": 0.00297886, + "balance_loss_clip": 1.34830165, + "balance_loss_mlp": 0.26226667, + "epoch": 0.22413948594618968, + "flos": 19353553632000.0, + "grad_norm": 2.9916809250902845, + "language_loss": 0.9560867, + "learning_rate": 3.6174844044661013e-06, + "loss": 0.97591352, + "num_input_tokens_seen": 80277680, + "router_z_loss_clip": 3.36523438, + "router_z_loss_mlp": 0.35668945, + "step": 3728, + "time_per_iteration": 2.674445390701294 + }, + { + "auxiliary_loss_clip": 0.01666801, + "auxiliary_loss_mlp": 0.00289349, + "balance_loss_clip": 1.33292699, + "balance_loss_mlp": 0.25725788, + "epoch": 0.22419960919885765, + "flos": 24170000319360.0, + "grad_norm": 3.487450119551203, + "language_loss": 0.91883874, + "learning_rate": 3.6172553067378406e-06, + "loss": 0.93840027, + "num_input_tokens_seen": 80294795, + "router_z_loss_clip": 3.33984375, + "router_z_loss_mlp": 0.32104492, + "step": 3729, + "time_per_iteration": 2.6321234703063965 + }, + { + "auxiliary_loss_clip": 0.01651864, + "auxiliary_loss_mlp": 0.00260111, + "balance_loss_clip": 1.32876813, + "balance_loss_mlp": 0.23073794, + "epoch": 0.22425973245152564, + "flos": 27378977558400.0, + "grad_norm": 1.8051955359619953, + "language_loss": 0.93593144, + "learning_rate": 3.6170261476830964e-06, + "loss": 0.95505118, + "num_input_tokens_seen": 80315425, + "router_z_loss_clip": 3.23242188, + "router_z_loss_mlp": 0.2935791, + "step": 3730, + "time_per_iteration": 2.642503499984741 + }, + { + "auxiliary_loss_clip": 0.01644362, + "auxiliary_loss_mlp": 0.00287921, + "balance_loss_clip": 1.31437838, + "balance_loss_mlp": 0.25578254, + "epoch": 0.2243198557041936, + "flos": 13735652734080.0, + "grad_norm": 23.69721076733346, + "language_loss": 0.80117583, + "learning_rate": 3.616796927310559e-06, + "loss": 0.82049865, + "num_input_tokens_seen": 80333905, + "router_z_loss_clip": 3.30078125, + "router_z_loss_mlp": 0.32128906, + "step": 3731, + "time_per_iteration": 2.6223978996276855 + }, + { + "auxiliary_loss_clip": 0.01629544, + "auxiliary_loss_mlp": 0.00284443, + "balance_loss_clip": 1.29849982, + "balance_loss_mlp": 0.25284061, + "epoch": 0.22437997895686157, + "flos": 19530700531200.0, + "grad_norm": 128.58797139745107, + "language_loss": 0.83693612, + "learning_rate": 3.6165676456289195e-06, + "loss": 0.856076, + "num_input_tokens_seen": 80352165, + "router_z_loss_clip": 3.31445312, + "router_z_loss_mlp": 0.31555176, + "step": 3732, + "time_per_iteration": 4.01387357711792 + }, + { + "auxiliary_loss_clip": 0.01622681, + "auxiliary_loss_mlp": 0.00261306, + "balance_loss_clip": 1.28904772, + "balance_loss_mlp": 0.23097873, + "epoch": 0.22444010220952954, + "flos": 23696230907520.0, + "grad_norm": 1.6243654256547913, + "language_loss": 0.94479221, + "learning_rate": 3.616338302646873e-06, + "loss": 0.96363205, + "num_input_tokens_seen": 80371305, + "router_z_loss_clip": 3.3359375, + "router_z_loss_mlp": 0.3034668, + "step": 3733, + "time_per_iteration": 2.6539969444274902 + }, + { + "auxiliary_loss_clip": 0.01596926, + "auxiliary_loss_mlp": 0.00281174, + "balance_loss_clip": 1.26911247, + "balance_loss_mlp": 0.25057322, + "epoch": 0.2245002254621975, + "flos": 22382905933440.0, + "grad_norm": 6.296600100130652, + "language_loss": 0.90009594, + "learning_rate": 3.6161088983731166e-06, + "loss": 0.91887701, + "num_input_tokens_seen": 80391020, + "router_z_loss_clip": 3.27734375, + "router_z_loss_mlp": 0.30615234, + "step": 3734, + "time_per_iteration": 4.120077133178711 + }, + { + "auxiliary_loss_clip": 0.01586135, + "auxiliary_loss_mlp": 0.00239123, + "balance_loss_clip": 1.25241518, + "balance_loss_mlp": 0.21006027, + "epoch": 0.22456034871486547, + "flos": 26942303917440.0, + "grad_norm": 14.868012265174627, + "language_loss": 0.82364428, + "learning_rate": 3.6158794328163482e-06, + "loss": 0.84189683, + "num_input_tokens_seen": 80411365, + "router_z_loss_clip": 3.33984375, + "router_z_loss_mlp": 0.29077148, + "step": 3735, + "time_per_iteration": 2.6559417247772217 + }, + { + "auxiliary_loss_clip": 0.01574622, + "auxiliary_loss_mlp": 0.00240886, + "balance_loss_clip": 1.23917282, + "balance_loss_mlp": 0.2146365, + "epoch": 0.22462047196753343, + "flos": 28983538005120.0, + "grad_norm": 4.090248746192041, + "language_loss": 0.90533888, + "learning_rate": 3.6156499059852702e-06, + "loss": 0.92349398, + "num_input_tokens_seen": 80431075, + "router_z_loss_clip": 3.35546875, + "router_z_loss_mlp": 0.26220703, + "step": 3736, + "time_per_iteration": 2.703596353530884 + }, + { + "auxiliary_loss_clip": 0.01589858, + "auxiliary_loss_mlp": 0.0028323, + "balance_loss_clip": 1.24937797, + "balance_loss_mlp": 0.25446469, + "epoch": 0.22468059522020142, + "flos": 20011329440640.0, + "grad_norm": 15.264752636765813, + "language_loss": 0.93908215, + "learning_rate": 3.615420317888586e-06, + "loss": 0.95781296, + "num_input_tokens_seen": 80449240, + "router_z_loss_clip": 3.40625, + "router_z_loss_mlp": 0.28747559, + "step": 3737, + "time_per_iteration": 4.047317743301392 + }, + { + "auxiliary_loss_clip": 0.01580588, + "auxiliary_loss_mlp": 0.00300648, + "balance_loss_clip": 1.24001813, + "balance_loss_mlp": 0.26821095, + "epoch": 0.2247407184728694, + "flos": 29314239546240.0, + "grad_norm": 3.1243568449216528, + "language_loss": 0.85876656, + "learning_rate": 3.6151906685350006e-06, + "loss": 0.87757885, + "num_input_tokens_seen": 80467900, + "router_z_loss_clip": 3.40625, + "router_z_loss_mlp": 0.32446289, + "step": 3738, + "time_per_iteration": 2.658029079437256 + }, + { + "auxiliary_loss_clip": 0.01575859, + "auxiliary_loss_mlp": 0.00268173, + "balance_loss_clip": 1.23830271, + "balance_loss_mlp": 0.2392171, + "epoch": 0.22480084172553735, + "flos": 22310366417280.0, + "grad_norm": 13.242282304508533, + "language_loss": 0.84311473, + "learning_rate": 3.614960957933224e-06, + "loss": 0.86155498, + "num_input_tokens_seen": 80487100, + "router_z_loss_clip": 3.37304688, + "router_z_loss_mlp": 0.28955078, + "step": 3739, + "time_per_iteration": 2.6988308429718018 + }, + { + "auxiliary_loss_clip": 0.01565207, + "auxiliary_loss_mlp": 0.00265972, + "balance_loss_clip": 1.22739661, + "balance_loss_mlp": 0.23357069, + "epoch": 0.22486096497820532, + "flos": 25591272641280.0, + "grad_norm": 18.970612564592223, + "language_loss": 0.81274772, + "learning_rate": 3.6147311860919655e-06, + "loss": 0.83105946, + "num_input_tokens_seen": 80508625, + "router_z_loss_clip": 3.375, + "router_z_loss_mlp": 0.32446289, + "step": 3740, + "time_per_iteration": 2.6940972805023193 + }, + { + "auxiliary_loss_clip": 0.01563965, + "auxiliary_loss_mlp": 0.00247245, + "balance_loss_clip": 1.2300539, + "balance_loss_mlp": 0.22020839, + "epoch": 0.22492108823087328, + "flos": 17639824775040.0, + "grad_norm": 6.6661159020940985, + "language_loss": 0.83149922, + "learning_rate": 3.614501353019939e-06, + "loss": 0.84961128, + "num_input_tokens_seen": 80527345, + "router_z_loss_clip": 3.34375, + "router_z_loss_mlp": 0.27050781, + "step": 3741, + "time_per_iteration": 2.5805277824401855 + }, + { + "auxiliary_loss_clip": 0.01547146, + "auxiliary_loss_mlp": 0.00263119, + "balance_loss_clip": 1.20943642, + "balance_loss_mlp": 0.2343777, + "epoch": 0.22498121148354125, + "flos": 16034653797120.0, + "grad_norm": 6.624517730269669, + "language_loss": 0.96296012, + "learning_rate": 3.6142714587258592e-06, + "loss": 0.98106277, + "num_input_tokens_seen": 80545545, + "router_z_loss_clip": 3.37890625, + "router_z_loss_mlp": 0.28771973, + "step": 3742, + "time_per_iteration": 3.9571785926818848 + }, + { + "auxiliary_loss_clip": 0.01542076, + "auxiliary_loss_mlp": 0.00261836, + "balance_loss_clip": 1.20222473, + "balance_loss_mlp": 0.23543085, + "epoch": 0.22504133473620924, + "flos": 24023772051840.0, + "grad_norm": 32.35585523370647, + "language_loss": 0.8777771, + "learning_rate": 3.614041503218444e-06, + "loss": 0.89581627, + "num_input_tokens_seen": 80565040, + "router_z_loss_clip": 3.3984375, + "router_z_loss_mlp": 0.26403809, + "step": 3743, + "time_per_iteration": 2.6378064155578613 + }, + { + "auxiliary_loss_clip": 0.01578814, + "auxiliary_loss_mlp": 0.00282835, + "balance_loss_clip": 1.22328711, + "balance_loss_mlp": 0.25262767, + "epoch": 0.2251014579888772, + "flos": 16763963541120.0, + "grad_norm": 4.199372369551484, + "language_loss": 0.7343837, + "learning_rate": 3.6138114865064134e-06, + "loss": 0.7530002, + "num_input_tokens_seen": 80582815, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 0.30163574, + "step": 3744, + "time_per_iteration": 2.5655248165130615 + }, + { + "auxiliary_loss_clip": 0.01542687, + "auxiliary_loss_mlp": 0.0026768, + "balance_loss_clip": 1.19179702, + "balance_loss_mlp": 0.24072722, + "epoch": 0.22516158124154517, + "flos": 13991013498240.0, + "grad_norm": 93.14452409530932, + "language_loss": 0.8547467, + "learning_rate": 3.613581408598489e-06, + "loss": 0.87285042, + "num_input_tokens_seen": 80600865, + "router_z_loss_clip": 3.51367188, + "router_z_loss_mlp": 0.26965332, + "step": 3745, + "time_per_iteration": 2.590364694595337 + }, + { + "auxiliary_loss_clip": 0.01555736, + "auxiliary_loss_mlp": 0.00259223, + "balance_loss_clip": 1.19764459, + "balance_loss_mlp": 0.23321182, + "epoch": 0.22522170449421314, + "flos": 14390016750720.0, + "grad_norm": 3.6015674738140118, + "language_loss": 0.86922884, + "learning_rate": 3.6133512695033965e-06, + "loss": 0.88737845, + "num_input_tokens_seen": 80617455, + "router_z_loss_clip": 3.58007812, + "router_z_loss_mlp": 0.26000977, + "step": 3746, + "time_per_iteration": 2.5995683670043945 + }, + { + "auxiliary_loss_clip": 0.01547277, + "auxiliary_loss_mlp": 0.00290854, + "balance_loss_clip": 1.18504667, + "balance_loss_mlp": 0.26328075, + "epoch": 0.2252818277468811, + "flos": 23805542972160.0, + "grad_norm": 3.2540980932069905, + "language_loss": 0.94966918, + "learning_rate": 3.613121069229862e-06, + "loss": 0.96805042, + "num_input_tokens_seen": 80635125, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 0.27575684, + "step": 3747, + "time_per_iteration": 2.6372578144073486 + }, + { + "auxiliary_loss_clip": 0.01546306, + "auxiliary_loss_mlp": 0.00268358, + "balance_loss_clip": 1.18590236, + "balance_loss_mlp": 0.24060571, + "epoch": 0.22534195099954907, + "flos": 24718033100160.0, + "grad_norm": 2.114602440126379, + "language_loss": 0.83550823, + "learning_rate": 3.6128908077866145e-06, + "loss": 0.85365486, + "num_input_tokens_seen": 80656370, + "router_z_loss_clip": 3.60351562, + "router_z_loss_mlp": 0.27770996, + "step": 3748, + "time_per_iteration": 2.652848958969116 + }, + { + "auxiliary_loss_clip": 0.01518413, + "auxiliary_loss_mlp": 0.00289816, + "balance_loss_clip": 1.16724515, + "balance_loss_mlp": 0.26287472, + "epoch": 0.22540207425221703, + "flos": 21032341534080.0, + "grad_norm": 13.006428734948704, + "language_loss": 0.8602258, + "learning_rate": 3.6126604851823864e-06, + "loss": 0.87830806, + "num_input_tokens_seen": 80676495, + "router_z_loss_clip": 3.51367188, + "router_z_loss_mlp": 0.26940918, + "step": 3749, + "time_per_iteration": 2.6743314266204834 + }, + { + "auxiliary_loss_clip": 0.01494517, + "auxiliary_loss_mlp": 0.00263137, + "balance_loss_clip": 1.15013492, + "balance_loss_mlp": 0.23548055, + "epoch": 0.22546219750488503, + "flos": 19390362094080.0, + "grad_norm": 2.998229513291716, + "language_loss": 0.87159061, + "learning_rate": 3.6124301014259108e-06, + "loss": 0.88916719, + "num_input_tokens_seen": 80694755, + "router_z_loss_clip": 3.44726562, + "router_z_loss_mlp": 0.2767334, + "step": 3750, + "time_per_iteration": 2.7304012775421143 + }, + { + "auxiliary_loss_clip": 0.01501942, + "auxiliary_loss_mlp": 0.00268673, + "balance_loss_clip": 1.15010595, + "balance_loss_mlp": 0.24101678, + "epoch": 0.225522320757553, + "flos": 25192628524800.0, + "grad_norm": 10.307570585188342, + "language_loss": 0.90418935, + "learning_rate": 3.6121996565259244e-06, + "loss": 0.9218955, + "num_input_tokens_seen": 80713670, + "router_z_loss_clip": 3.52148438, + "router_z_loss_mlp": 0.2767334, + "step": 3751, + "time_per_iteration": 2.7133405208587646 + }, + { + "auxiliary_loss_clip": 0.01489346, + "auxiliary_loss_mlp": 0.00235388, + "balance_loss_clip": 1.14491463, + "balance_loss_mlp": 0.20911434, + "epoch": 0.22558244401022096, + "flos": 17163110448000.0, + "grad_norm": 3.322885603348074, + "language_loss": 0.9160583, + "learning_rate": 3.611969150491165e-06, + "loss": 0.93330568, + "num_input_tokens_seen": 80731450, + "router_z_loss_clip": 3.44335938, + "router_z_loss_mlp": 0.2623291, + "step": 3752, + "time_per_iteration": 2.666675567626953 + }, + { + "auxiliary_loss_clip": 0.01479082, + "auxiliary_loss_mlp": 0.00254369, + "balance_loss_clip": 1.13345361, + "balance_loss_mlp": 0.22838143, + "epoch": 0.22564256726288892, + "flos": 15231008856960.0, + "grad_norm": 66.5203879579358, + "language_loss": 0.84538692, + "learning_rate": 3.611738583330375e-06, + "loss": 0.86272144, + "num_input_tokens_seen": 80748415, + "router_z_loss_clip": 3.45703125, + "router_z_loss_mlp": 0.25976562, + "step": 3753, + "time_per_iteration": 2.5854814052581787 + }, + { + "auxiliary_loss_clip": 0.01472189, + "auxiliary_loss_mlp": 0.0024686, + "balance_loss_clip": 1.13263881, + "balance_loss_mlp": 0.22046663, + "epoch": 0.2257026905155569, + "flos": 34568652764160.0, + "grad_norm": 2.748066550416106, + "language_loss": 0.8437615, + "learning_rate": 3.611507955052295e-06, + "loss": 0.86095202, + "num_input_tokens_seen": 80770835, + "router_z_loss_clip": 3.39648438, + "router_z_loss_mlp": 0.26403809, + "step": 3754, + "time_per_iteration": 2.7454335689544678 + }, + { + "auxiliary_loss_clip": 0.01494609, + "auxiliary_loss_mlp": 0.00258218, + "balance_loss_clip": 1.14446044, + "balance_loss_mlp": 0.23300573, + "epoch": 0.22576281376822485, + "flos": 19938430788480.0, + "grad_norm": 2.532103702961982, + "language_loss": 0.77478129, + "learning_rate": 3.6112772656656727e-06, + "loss": 0.79230952, + "num_input_tokens_seen": 80787840, + "router_z_loss_clip": 3.5, + "router_z_loss_mlp": 0.25195312, + "step": 3755, + "time_per_iteration": 2.611109733581543 + }, + { + "auxiliary_loss_clip": 0.0146902, + "auxiliary_loss_mlp": 0.00254977, + "balance_loss_clip": 1.11967146, + "balance_loss_mlp": 0.2274162, + "epoch": 0.22582293702089282, + "flos": 24602005192320.0, + "grad_norm": 2.7168041333282806, + "language_loss": 0.8662923, + "learning_rate": 3.6110465151792547e-06, + "loss": 0.88353229, + "num_input_tokens_seen": 80806335, + "router_z_loss_clip": 3.49609375, + "router_z_loss_mlp": 0.2755127, + "step": 3756, + "time_per_iteration": 2.6402969360351562 + }, + { + "auxiliary_loss_clip": 0.01476135, + "auxiliary_loss_mlp": 0.00263765, + "balance_loss_clip": 1.13475609, + "balance_loss_mlp": 0.23526214, + "epoch": 0.2258830602735608, + "flos": 23035438356480.0, + "grad_norm": 3.452118910324784, + "language_loss": 0.89350921, + "learning_rate": 3.6108157036017916e-06, + "loss": 0.91090822, + "num_input_tokens_seen": 80825355, + "router_z_loss_clip": 3.4140625, + "router_z_loss_mlp": 0.28479004, + "step": 3757, + "time_per_iteration": 2.6229939460754395 + }, + { + "auxiliary_loss_clip": 0.01484057, + "auxiliary_loss_mlp": 0.00282326, + "balance_loss_clip": 1.13879275, + "balance_loss_mlp": 0.25311926, + "epoch": 0.22594318352622877, + "flos": 22158427887360.0, + "grad_norm": 33.213973999448974, + "language_loss": 0.82079387, + "learning_rate": 3.6105848309420358e-06, + "loss": 0.8384577, + "num_input_tokens_seen": 80842570, + "router_z_loss_clip": 3.45703125, + "router_z_loss_mlp": 0.29211426, + "step": 3758, + "time_per_iteration": 2.6361501216888428 + }, + { + "auxiliary_loss_clip": 0.01478545, + "auxiliary_loss_mlp": 0.00255803, + "balance_loss_clip": 1.12851083, + "balance_loss_mlp": 0.22830112, + "epoch": 0.22600330677889674, + "flos": 20594303176320.0, + "grad_norm": 3.7980699276349865, + "language_loss": 0.84744501, + "learning_rate": 3.6103538972087412e-06, + "loss": 0.86478847, + "num_input_tokens_seen": 80858745, + "router_z_loss_clip": 3.5, + "router_z_loss_mlp": 0.27526855, + "step": 3759, + "time_per_iteration": 2.5729997158050537 + }, + { + "auxiliary_loss_clip": 0.01467708, + "auxiliary_loss_mlp": 0.00248822, + "balance_loss_clip": 1.119766, + "balance_loss_mlp": 0.22210692, + "epoch": 0.2260634300315647, + "flos": 35659798162560.0, + "grad_norm": 2.2022061986511683, + "language_loss": 0.84686321, + "learning_rate": 3.6101229024106655e-06, + "loss": 0.86402851, + "num_input_tokens_seen": 80880085, + "router_z_loss_clip": 3.47851562, + "router_z_loss_mlp": 0.26708984, + "step": 3760, + "time_per_iteration": 2.702336311340332 + }, + { + "auxiliary_loss_clip": 0.01604861, + "auxiliary_loss_mlp": 0.00126645, + "balance_loss_clip": 1.34817743, + "balance_loss_mlp": 0.11734698, + "epoch": 0.22612355328423267, + "flos": 72090455126400.0, + "grad_norm": 0.8789520094836868, + "language_loss": 0.60041964, + "learning_rate": 3.609891846556569e-06, + "loss": 0.61773479, + "num_input_tokens_seen": 80937660, + "router_z_loss_clip": 2.5625, + "router_z_loss_mlp": 0.09277344, + "step": 3761, + "time_per_iteration": 3.076446771621704 + }, + { + "auxiliary_loss_clip": 0.01469807, + "auxiliary_loss_mlp": 0.0024709, + "balance_loss_clip": 1.12058938, + "balance_loss_mlp": 0.21968368, + "epoch": 0.22618367653690064, + "flos": 22783776693120.0, + "grad_norm": 1105.949890832068, + "language_loss": 0.86608762, + "learning_rate": 3.609660729655211e-06, + "loss": 0.88325655, + "num_input_tokens_seen": 80956265, + "router_z_loss_clip": 3.49023438, + "router_z_loss_mlp": 0.27416992, + "step": 3762, + "time_per_iteration": 2.6155059337615967 + }, + { + "auxiliary_loss_clip": 0.01487241, + "auxiliary_loss_mlp": 0.00238571, + "balance_loss_clip": 1.13581181, + "balance_loss_mlp": 0.21089014, + "epoch": 0.22624379978956863, + "flos": 20448254476800.0, + "grad_norm": 39.59204086053606, + "language_loss": 0.8986603, + "learning_rate": 3.6094295517153573e-06, + "loss": 0.91591841, + "num_input_tokens_seen": 80975185, + "router_z_loss_clip": 3.515625, + "router_z_loss_mlp": 0.2767334, + "step": 3763, + "time_per_iteration": 2.5660717487335205 + }, + { + "auxiliary_loss_clip": 0.01491278, + "auxiliary_loss_mlp": 0.00251748, + "balance_loss_clip": 1.13375616, + "balance_loss_mlp": 0.22434179, + "epoch": 0.2263039230422366, + "flos": 17494314779520.0, + "grad_norm": 2.7505557506183895, + "language_loss": 0.98194957, + "learning_rate": 3.6091983127457743e-06, + "loss": 0.99937987, + "num_input_tokens_seen": 80992830, + "router_z_loss_clip": 3.57617188, + "router_z_loss_mlp": 0.27429199, + "step": 3764, + "time_per_iteration": 2.574394702911377 + }, + { + "auxiliary_loss_clip": 0.01472884, + "auxiliary_loss_mlp": 0.00198136, + "balance_loss_clip": 1.12239766, + "balance_loss_mlp": 0.1741989, + "epoch": 0.22636404629490456, + "flos": 28329748606080.0, + "grad_norm": 7.40099722990444, + "language_loss": 0.82427269, + "learning_rate": 3.6089670127552293e-06, + "loss": 0.84098285, + "num_input_tokens_seen": 81013675, + "router_z_loss_clip": 3.50390625, + "router_z_loss_mlp": 0.23913574, + "step": 3765, + "time_per_iteration": 2.678882122039795 + }, + { + "auxiliary_loss_clip": 0.01498731, + "auxiliary_loss_mlp": 0.00196275, + "balance_loss_clip": 1.14206707, + "balance_loss_mlp": 0.17203987, + "epoch": 0.22642416954757252, + "flos": 17489143221120.0, + "grad_norm": 85.58254028116588, + "language_loss": 0.99533308, + "learning_rate": 3.608735651752494e-06, + "loss": 1.01228309, + "num_input_tokens_seen": 81030345, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 0.2421875, + "step": 3766, + "time_per_iteration": 2.592388153076172 + }, + { + "auxiliary_loss_clip": 0.0148388, + "auxiliary_loss_mlp": 0.00190062, + "balance_loss_clip": 1.13614202, + "balance_loss_mlp": 0.16313274, + "epoch": 0.2264842928002405, + "flos": 24384530298240.0, + "grad_norm": 83.31321765837072, + "language_loss": 0.8187238, + "learning_rate": 3.6085042297463417e-06, + "loss": 0.83546317, + "num_input_tokens_seen": 81051000, + "router_z_loss_clip": 3.47851562, + "router_z_loss_mlp": 0.26928711, + "step": 3767, + "time_per_iteration": 2.6785457134246826 + }, + { + "auxiliary_loss_clip": 0.01480999, + "auxiliary_loss_mlp": 0.00221725, + "balance_loss_clip": 1.13329673, + "balance_loss_mlp": 0.19591665, + "epoch": 0.22654441605290845, + "flos": 19830519354240.0, + "grad_norm": 15853.12459299125, + "language_loss": 0.79736352, + "learning_rate": 3.6082727467455477e-06, + "loss": 0.81439078, + "num_input_tokens_seen": 81071205, + "router_z_loss_clip": 3.47851562, + "router_z_loss_mlp": 0.25793457, + "step": 3768, + "time_per_iteration": 2.6148195266723633 + }, + { + "auxiliary_loss_clip": 0.01494432, + "auxiliary_loss_mlp": 0.00205481, + "balance_loss_clip": 1.14576077, + "balance_loss_mlp": 0.18023278, + "epoch": 0.22660453930557642, + "flos": 27454569730560.0, + "grad_norm": 7.01395108341076, + "language_loss": 0.8604964, + "learning_rate": 3.6080412027588905e-06, + "loss": 0.87749553, + "num_input_tokens_seen": 81091880, + "router_z_loss_clip": 3.48242188, + "router_z_loss_mlp": 0.25244141, + "step": 3769, + "time_per_iteration": 2.6502463817596436 + }, + { + "auxiliary_loss_clip": 0.01483367, + "auxiliary_loss_mlp": 0.00201625, + "balance_loss_clip": 1.1342144, + "balance_loss_mlp": 0.17488611, + "epoch": 0.2266646625582444, + "flos": 23988148738560.0, + "grad_norm": 5.193023306427383, + "language_loss": 0.78396207, + "learning_rate": 3.6078095977951488e-06, + "loss": 0.80081201, + "num_input_tokens_seen": 81113290, + "router_z_loss_clip": 3.48828125, + "router_z_loss_mlp": 0.26745605, + "step": 3770, + "time_per_iteration": 2.640263557434082 + }, + { + "auxiliary_loss_clip": 0.01489396, + "auxiliary_loss_mlp": 0.001862, + "balance_loss_clip": 1.13894057, + "balance_loss_mlp": 0.16139276, + "epoch": 0.22672478581091238, + "flos": 26028054023040.0, + "grad_norm": 1309.76988638453, + "language_loss": 0.87205958, + "learning_rate": 3.6075779318631067e-06, + "loss": 0.88881552, + "num_input_tokens_seen": 81133535, + "router_z_loss_clip": 3.50195312, + "router_z_loss_mlp": 0.24829102, + "step": 3771, + "time_per_iteration": 2.6380138397216797 + }, + { + "auxiliary_loss_clip": 0.01460856, + "auxiliary_loss_mlp": 0.00190784, + "balance_loss_clip": 1.12229681, + "balance_loss_mlp": 0.16679901, + "epoch": 0.22678490906358034, + "flos": 23841812730240.0, + "grad_norm": 13.853751436531944, + "language_loss": 0.84655017, + "learning_rate": 3.6073462049715486e-06, + "loss": 0.86306655, + "num_input_tokens_seen": 81154650, + "router_z_loss_clip": 3.38476562, + "router_z_loss_mlp": 0.23974609, + "step": 3772, + "time_per_iteration": 2.6567471027374268 + }, + { + "auxiliary_loss_clip": 0.01540068, + "auxiliary_loss_mlp": 0.00055507, + "balance_loss_clip": 1.29785252, + "balance_loss_mlp": 0.04797298, + "epoch": 0.2268450323162483, + "flos": 65048088574080.0, + "grad_norm": 0.6519527005091391, + "language_loss": 0.53768098, + "learning_rate": 3.607114417129261e-06, + "loss": 0.55363679, + "num_input_tokens_seen": 81221240, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.07519531, + "step": 3773, + "time_per_iteration": 3.1942477226257324 + }, + { + "auxiliary_loss_clip": 0.01463465, + "auxiliary_loss_mlp": 0.00197421, + "balance_loss_clip": 1.12790728, + "balance_loss_mlp": 0.17342424, + "epoch": 0.22690515556891627, + "flos": 22526081544960.0, + "grad_norm": 15.714425528651002, + "language_loss": 0.77801788, + "learning_rate": 3.6068825683450334e-06, + "loss": 0.79462677, + "num_input_tokens_seen": 81241520, + "router_z_loss_clip": 3.35351562, + "router_z_loss_mlp": 0.23999023, + "step": 3774, + "time_per_iteration": 4.2597901821136475 + }, + { + "auxiliary_loss_clip": 0.01465873, + "auxiliary_loss_mlp": 0.0019841, + "balance_loss_clip": 1.1298455, + "balance_loss_mlp": 0.17224374, + "epoch": 0.22696527882158424, + "flos": 18223444955520.0, + "grad_norm": 2.8794006614258225, + "language_loss": 0.82740462, + "learning_rate": 3.606650658627658e-06, + "loss": 0.84404743, + "num_input_tokens_seen": 81256825, + "router_z_loss_clip": 3.359375, + "router_z_loss_mlp": 0.26159668, + "step": 3775, + "time_per_iteration": 2.589857816696167 + }, + { + "auxiliary_loss_clip": 0.01441417, + "auxiliary_loss_mlp": 0.00174584, + "balance_loss_clip": 1.11505008, + "balance_loss_mlp": 0.15059876, + "epoch": 0.22702540207425223, + "flos": 17019252478080.0, + "grad_norm": 3.5694158333676933, + "language_loss": 0.93185186, + "learning_rate": 3.606418687985928e-06, + "loss": 0.94801188, + "num_input_tokens_seen": 81275695, + "router_z_loss_clip": 3.26171875, + "router_z_loss_mlp": 0.23962402, + "step": 3776, + "time_per_iteration": 4.018434286117554 + }, + { + "auxiliary_loss_clip": 0.01447866, + "auxiliary_loss_mlp": 0.00213538, + "balance_loss_clip": 1.11782956, + "balance_loss_mlp": 0.18926661, + "epoch": 0.2270855253269202, + "flos": 21325731822720.0, + "grad_norm": 2.4656605359535906, + "language_loss": 0.90881091, + "learning_rate": 3.606186656428641e-06, + "loss": 0.92542493, + "num_input_tokens_seen": 81294920, + "router_z_loss_clip": 3.30078125, + "router_z_loss_mlp": 0.24267578, + "step": 3777, + "time_per_iteration": 2.6232762336730957 + }, + { + "auxiliary_loss_clip": 0.01462126, + "auxiliary_loss_mlp": 0.00220729, + "balance_loss_clip": 1.13179731, + "balance_loss_mlp": 0.1959458, + "epoch": 0.22714564857958816, + "flos": 23550469516800.0, + "grad_norm": 5.715832934674001, + "language_loss": 0.84445029, + "learning_rate": 3.6059545639645955e-06, + "loss": 0.86127883, + "num_input_tokens_seen": 81314275, + "router_z_loss_clip": 3.30859375, + "router_z_loss_mlp": 0.2479248, + "step": 3778, + "time_per_iteration": 2.651299238204956 + }, + { + "auxiliary_loss_clip": 0.01447511, + "auxiliary_loss_mlp": 0.00231076, + "balance_loss_clip": 1.11835694, + "balance_loss_mlp": 0.20443246, + "epoch": 0.22720577183225613, + "flos": 25989880844160.0, + "grad_norm": 4.409052486835874, + "language_loss": 0.76447302, + "learning_rate": 3.605722410602591e-06, + "loss": 0.78125894, + "num_input_tokens_seen": 81333890, + "router_z_loss_clip": 3.29101562, + "router_z_loss_mlp": 0.26647949, + "step": 3779, + "time_per_iteration": 2.7251527309417725 + }, + { + "auxiliary_loss_clip": 0.01441671, + "auxiliary_loss_mlp": 0.00205789, + "balance_loss_clip": 1.12188244, + "balance_loss_mlp": 0.17933664, + "epoch": 0.2272658950849241, + "flos": 20814076540800.0, + "grad_norm": 13.372673067637644, + "language_loss": 0.78403294, + "learning_rate": 3.6054901963514323e-06, + "loss": 0.80050755, + "num_input_tokens_seen": 81353640, + "router_z_loss_clip": 3.19726562, + "router_z_loss_mlp": 0.26452637, + "step": 3780, + "time_per_iteration": 4.226995944976807 + }, + { + "auxiliary_loss_clip": 0.01457722, + "auxiliary_loss_mlp": 0.00241159, + "balance_loss_clip": 1.13814187, + "balance_loss_mlp": 0.2146354, + "epoch": 0.22732601833759206, + "flos": 23909324342400.0, + "grad_norm": 34.868951577999894, + "language_loss": 0.96005428, + "learning_rate": 3.6052579212199246e-06, + "loss": 0.97704315, + "num_input_tokens_seen": 81371595, + "router_z_loss_clip": 3.19726562, + "router_z_loss_mlp": 0.26538086, + "step": 3781, + "time_per_iteration": 2.66371488571167 + }, + { + "auxiliary_loss_clip": 0.01444655, + "auxiliary_loss_mlp": 0.00240751, + "balance_loss_clip": 1.11860919, + "balance_loss_mlp": 0.21237871, + "epoch": 0.22738614159026002, + "flos": 15924407978880.0, + "grad_norm": 26.395089601407683, + "language_loss": 0.86673003, + "learning_rate": 3.6050255852168753e-06, + "loss": 0.88358414, + "num_input_tokens_seen": 81388435, + "router_z_loss_clip": 3.26367188, + "router_z_loss_mlp": 0.28369141, + "step": 3782, + "time_per_iteration": 2.617506265640259 + }, + { + "auxiliary_loss_clip": 0.01440137, + "auxiliary_loss_mlp": 0.00239254, + "balance_loss_clip": 1.11913311, + "balance_loss_mlp": 0.21442321, + "epoch": 0.22744626484292801, + "flos": 24205515891840.0, + "grad_norm": 4.946512366229814, + "language_loss": 0.88735139, + "learning_rate": 3.604793188351095e-06, + "loss": 0.9041453, + "num_input_tokens_seen": 81410195, + "router_z_loss_clip": 3.21289062, + "router_z_loss_mlp": 0.24841309, + "step": 3783, + "time_per_iteration": 2.6770713329315186 + }, + { + "auxiliary_loss_clip": 0.0145269, + "auxiliary_loss_mlp": 0.00260614, + "balance_loss_clip": 1.12478542, + "balance_loss_mlp": 0.23455521, + "epoch": 0.22750638809559598, + "flos": 24791614110720.0, + "grad_norm": 712.6147564479653, + "language_loss": 0.8250649, + "learning_rate": 3.6045607306313964e-06, + "loss": 0.8421979, + "num_input_tokens_seen": 81430060, + "router_z_loss_clip": 3.27539062, + "router_z_loss_mlp": 0.26062012, + "step": 3784, + "time_per_iteration": 4.01696515083313 + }, + { + "auxiliary_loss_clip": 0.01460735, + "auxiliary_loss_mlp": 0.00235702, + "balance_loss_clip": 1.13296795, + "balance_loss_mlp": 0.20705557, + "epoch": 0.22756651134826394, + "flos": 22236498097920.0, + "grad_norm": 9.835490130864995, + "language_loss": 0.78016859, + "learning_rate": 3.604328212066594e-06, + "loss": 0.79713297, + "num_input_tokens_seen": 81447375, + "router_z_loss_clip": 3.27929688, + "router_z_loss_mlp": 0.28662109, + "step": 3785, + "time_per_iteration": 2.588087558746338 + }, + { + "auxiliary_loss_clip": 0.01578094, + "auxiliary_loss_mlp": 0.00112157, + "balance_loss_clip": 1.33320618, + "balance_loss_mlp": 0.1003793, + "epoch": 0.2276266346009319, + "flos": 62707466626560.0, + "grad_norm": 0.8739821698525461, + "language_loss": 0.62171841, + "learning_rate": 3.6040956326655047e-06, + "loss": 0.63862097, + "num_input_tokens_seen": 81505235, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.11767578, + "step": 3786, + "time_per_iteration": 3.0779669284820557 + }, + { + "auxiliary_loss_clip": 0.01481345, + "auxiliary_loss_mlp": 0.00241773, + "balance_loss_clip": 1.14729977, + "balance_loss_mlp": 0.21570194, + "epoch": 0.22768675785359987, + "flos": 18613936684800.0, + "grad_norm": 4.2170069054870485, + "language_loss": 0.96787018, + "learning_rate": 3.6038629924369486e-06, + "loss": 0.98510134, + "num_input_tokens_seen": 81518685, + "router_z_loss_clip": 3.34765625, + "router_z_loss_mlp": 0.26062012, + "step": 3787, + "time_per_iteration": 2.566694736480713 + }, + { + "auxiliary_loss_clip": 0.01464301, + "auxiliary_loss_mlp": 0.00259188, + "balance_loss_clip": 1.13692188, + "balance_loss_mlp": 0.23259225, + "epoch": 0.22774688110626784, + "flos": 26870195364480.0, + "grad_norm": 2.3676336910294573, + "language_loss": 0.78211159, + "learning_rate": 3.6036302913897474e-06, + "loss": 0.79934657, + "num_input_tokens_seen": 81538940, + "router_z_loss_clip": 3.27539062, + "router_z_loss_mlp": 0.26611328, + "step": 3788, + "time_per_iteration": 2.6707265377044678 + }, + { + "auxiliary_loss_clip": 0.01486565, + "auxiliary_loss_mlp": 0.00254891, + "balance_loss_clip": 1.15287161, + "balance_loss_mlp": 0.22747239, + "epoch": 0.2278070043589358, + "flos": 15553593924480.0, + "grad_norm": 2.3874569399524868, + "language_loss": 0.76778829, + "learning_rate": 3.6033975295327243e-06, + "loss": 0.78520286, + "num_input_tokens_seen": 81555525, + "router_z_loss_clip": 3.3359375, + "router_z_loss_mlp": 0.27441406, + "step": 3789, + "time_per_iteration": 2.5700623989105225 + }, + { + "auxiliary_loss_clip": 0.01496191, + "auxiliary_loss_mlp": 0.0025867, + "balance_loss_clip": 1.16265082, + "balance_loss_mlp": 0.23174061, + "epoch": 0.2278671276116038, + "flos": 22416805393920.0, + "grad_norm": 26.160126235566878, + "language_loss": 0.81849235, + "learning_rate": 3.6031647068747065e-06, + "loss": 0.83604097, + "num_input_tokens_seen": 81576305, + "router_z_loss_clip": 3.33398438, + "router_z_loss_mlp": 0.26940918, + "step": 3790, + "time_per_iteration": 2.6664602756500244 + }, + { + "auxiliary_loss_clip": 0.01496293, + "auxiliary_loss_mlp": 0.00236631, + "balance_loss_clip": 1.16245019, + "balance_loss_mlp": 0.20852157, + "epoch": 0.22792725086427176, + "flos": 20631363033600.0, + "grad_norm": 6.61602102945212, + "language_loss": 0.99717367, + "learning_rate": 3.602931823424522e-06, + "loss": 1.01450288, + "num_input_tokens_seen": 81594115, + "router_z_loss_clip": 3.34179688, + "router_z_loss_mlp": 0.28100586, + "step": 3791, + "time_per_iteration": 2.5914418697357178 + }, + { + "auxiliary_loss_clip": 0.01509863, + "auxiliary_loss_mlp": 0.00264067, + "balance_loss_clip": 1.17673326, + "balance_loss_mlp": 0.23431244, + "epoch": 0.22798737411693973, + "flos": 31428946903680.0, + "grad_norm": 7.37214409417311, + "language_loss": 0.90857595, + "learning_rate": 3.6026988791910026e-06, + "loss": 0.92631525, + "num_input_tokens_seen": 81615355, + "router_z_loss_clip": 3.3359375, + "router_z_loss_mlp": 0.29785156, + "step": 3792, + "time_per_iteration": 2.6922590732574463 + }, + { + "auxiliary_loss_clip": 0.01549448, + "auxiliary_loss_mlp": 0.00089354, + "balance_loss_clip": 1.32915473, + "balance_loss_mlp": 0.07609776, + "epoch": 0.2280474973696077, + "flos": 52396685827200.0, + "grad_norm": 1.1340433642462766, + "language_loss": 0.65839267, + "learning_rate": 3.602465874182981e-06, + "loss": 0.67478073, + "num_input_tokens_seen": 81662075, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.1328125, + "step": 3793, + "time_per_iteration": 2.9107420444488525 + }, + { + "auxiliary_loss_clip": 0.01537376, + "auxiliary_loss_mlp": 0.00317057, + "balance_loss_clip": 1.1983161, + "balance_loss_mlp": 0.28417861, + "epoch": 0.22810762062227566, + "flos": 26396066816640.0, + "grad_norm": 324.3256417574254, + "language_loss": 0.86184204, + "learning_rate": 3.602232808409293e-06, + "loss": 0.88038635, + "num_input_tokens_seen": 81681625, + "router_z_loss_clip": 3.390625, + "router_z_loss_mlp": 0.32861328, + "step": 3794, + "time_per_iteration": 2.684657335281372 + }, + { + "auxiliary_loss_clip": 0.01527243, + "auxiliary_loss_mlp": 0.00300376, + "balance_loss_clip": 1.18967474, + "balance_loss_mlp": 0.27057418, + "epoch": 0.22816774387494362, + "flos": 25630271832960.0, + "grad_norm": 62.848019560234285, + "language_loss": 0.87247902, + "learning_rate": 3.6019996818787755e-06, + "loss": 0.89075524, + "num_input_tokens_seen": 81701170, + "router_z_loss_clip": 3.37695312, + "router_z_loss_mlp": 0.2980957, + "step": 3795, + "time_per_iteration": 2.6837921142578125 + }, + { + "auxiliary_loss_clip": 0.01513221, + "auxiliary_loss_mlp": 0.00285835, + "balance_loss_clip": 1.18411303, + "balance_loss_mlp": 0.25900108, + "epoch": 0.22822786712761162, + "flos": 22451602694400.0, + "grad_norm": 310.3910527622936, + "language_loss": 0.84874344, + "learning_rate": 3.6017664946002704e-06, + "loss": 0.86673403, + "num_input_tokens_seen": 81721265, + "router_z_loss_clip": 3.2890625, + "router_z_loss_mlp": 0.26843262, + "step": 3796, + "time_per_iteration": 2.616368293762207 + }, + { + "auxiliary_loss_clip": 0.0154804, + "auxiliary_loss_mlp": 0.00298538, + "balance_loss_clip": 1.21022463, + "balance_loss_mlp": 0.26828295, + "epoch": 0.22828799038027958, + "flos": 12202554395520.0, + "grad_norm": 3.2248013044324124, + "language_loss": 1.04688621, + "learning_rate": 3.6015332465826188e-06, + "loss": 1.06535196, + "num_input_tokens_seen": 81736565, + "router_z_loss_clip": 3.3828125, + "router_z_loss_mlp": 0.30200195, + "step": 3797, + "time_per_iteration": 2.6025280952453613 + }, + { + "auxiliary_loss_clip": 0.01536033, + "auxiliary_loss_mlp": 0.00283494, + "balance_loss_clip": 1.20216739, + "balance_loss_mlp": 0.25440741, + "epoch": 0.22834811363294755, + "flos": 22085708803200.0, + "grad_norm": 176.68408993618127, + "language_loss": 0.87116992, + "learning_rate": 3.601299937834666e-06, + "loss": 0.88936526, + "num_input_tokens_seen": 81756240, + "router_z_loss_clip": 3.33984375, + "router_z_loss_mlp": 0.29125977, + "step": 3798, + "time_per_iteration": 2.644073009490967 + }, + { + "auxiliary_loss_clip": 0.01534388, + "auxiliary_loss_mlp": 0.00329668, + "balance_loss_clip": 1.19908309, + "balance_loss_mlp": 0.29767185, + "epoch": 0.2284082368856155, + "flos": 24860634094080.0, + "grad_norm": 2.518291054392234, + "language_loss": 0.88611174, + "learning_rate": 3.6010665683652596e-06, + "loss": 0.90475225, + "num_input_tokens_seen": 81775720, + "router_z_loss_clip": 3.3515625, + "router_z_loss_mlp": 0.31982422, + "step": 3799, + "time_per_iteration": 2.6403379440307617 + }, + { + "auxiliary_loss_clip": 0.01546335, + "auxiliary_loss_mlp": 0.00264631, + "balance_loss_clip": 1.21076083, + "balance_loss_mlp": 0.23726091, + "epoch": 0.22846836013828348, + "flos": 23292882109440.0, + "grad_norm": 10.010092907979706, + "language_loss": 0.82764274, + "learning_rate": 3.6008331381832484e-06, + "loss": 0.84575242, + "num_input_tokens_seen": 81795830, + "router_z_loss_clip": 3.35742188, + "router_z_loss_mlp": 0.27392578, + "step": 3800, + "time_per_iteration": 2.5893261432647705 + }, + { + "auxiliary_loss_clip": 0.01532576, + "auxiliary_loss_mlp": 0.00272501, + "balance_loss_clip": 1.2019701, + "balance_loss_mlp": 0.24360511, + "epoch": 0.22852848339095144, + "flos": 27416288810880.0, + "grad_norm": 58.83597301498925, + "language_loss": 0.73710966, + "learning_rate": 3.600599647297484e-06, + "loss": 0.75516045, + "num_input_tokens_seen": 81815745, + "router_z_loss_clip": 3.3046875, + "router_z_loss_mlp": 0.28918457, + "step": 3801, + "time_per_iteration": 2.677828550338745 + }, + { + "auxiliary_loss_clip": 0.01587788, + "auxiliary_loss_mlp": 0.00279791, + "balance_loss_clip": 1.24426794, + "balance_loss_mlp": 0.25160986, + "epoch": 0.2285886066436194, + "flos": 26321157002880.0, + "grad_norm": 10.03252907367889, + "language_loss": 0.88022298, + "learning_rate": 3.60036609571682e-06, + "loss": 0.89889872, + "num_input_tokens_seen": 81835155, + "router_z_loss_clip": 3.4375, + "router_z_loss_mlp": 0.28173828, + "step": 3802, + "time_per_iteration": 2.610530376434326 + }, + { + "auxiliary_loss_clip": 0.01622384, + "auxiliary_loss_mlp": 0.00289162, + "balance_loss_clip": 1.26447868, + "balance_loss_mlp": 0.25673699, + "epoch": 0.2286487298962874, + "flos": 29716475022720.0, + "grad_norm": 5.161547203267033, + "language_loss": 0.86699682, + "learning_rate": 3.600132483450114e-06, + "loss": 0.88611233, + "num_input_tokens_seen": 81855655, + "router_z_loss_clip": 3.58007812, + "router_z_loss_mlp": 0.32470703, + "step": 3803, + "time_per_iteration": 2.6577377319335938 + }, + { + "auxiliary_loss_clip": 0.01635849, + "auxiliary_loss_mlp": 0.00275, + "balance_loss_clip": 1.26880074, + "balance_loss_mlp": 0.24307598, + "epoch": 0.22870885314895537, + "flos": 21287199507840.0, + "grad_norm": 5.5224944268367455, + "language_loss": 0.92798424, + "learning_rate": 3.5998988105062235e-06, + "loss": 0.94709271, + "num_input_tokens_seen": 81876385, + "router_z_loss_clip": 3.66992188, + "router_z_loss_mlp": 0.3190918, + "step": 3804, + "time_per_iteration": 2.6192097663879395 + }, + { + "auxiliary_loss_clip": 0.01637202, + "auxiliary_loss_mlp": 0.0029951, + "balance_loss_clip": 1.26698458, + "balance_loss_mlp": 0.2688489, + "epoch": 0.22876897640162333, + "flos": 14939450161920.0, + "grad_norm": 72.61022764030776, + "language_loss": 0.85961944, + "learning_rate": 3.59966507689401e-06, + "loss": 0.87898654, + "num_input_tokens_seen": 81893225, + "router_z_loss_clip": 3.70703125, + "router_z_loss_mlp": 0.30712891, + "step": 3805, + "time_per_iteration": 2.619967222213745 + }, + { + "auxiliary_loss_clip": 0.01682763, + "auxiliary_loss_mlp": 0.00328826, + "balance_loss_clip": 1.29840386, + "balance_loss_mlp": 0.29616219, + "epoch": 0.2288290996542913, + "flos": 18113917409280.0, + "grad_norm": 73.30421704403635, + "language_loss": 0.88582981, + "learning_rate": 3.5994312826223363e-06, + "loss": 0.90594572, + "num_input_tokens_seen": 81911350, + "router_z_loss_clip": 3.84375, + "router_z_loss_mlp": 0.32666016, + "step": 3806, + "time_per_iteration": 2.5871024131774902 + }, + { + "auxiliary_loss_clip": 0.01708374, + "auxiliary_loss_mlp": 0.00272325, + "balance_loss_clip": 1.3144151, + "balance_loss_mlp": 0.23999512, + "epoch": 0.22888922290695926, + "flos": 39855457071360.0, + "grad_norm": 28.008932547167074, + "language_loss": 0.79271215, + "learning_rate": 3.5991974277000684e-06, + "loss": 0.81251913, + "num_input_tokens_seen": 81935420, + "router_z_loss_clip": 3.9375, + "router_z_loss_mlp": 0.32299805, + "step": 3807, + "time_per_iteration": 2.744049072265625 + }, + { + "auxiliary_loss_clip": 0.0171314, + "auxiliary_loss_mlp": 0.00299618, + "balance_loss_clip": 1.31703544, + "balance_loss_mlp": 0.26781267, + "epoch": 0.22894934615962723, + "flos": 23403774372480.0, + "grad_norm": 2.3153668310295186, + "language_loss": 0.745327, + "learning_rate": 3.5989635121360733e-06, + "loss": 0.76545465, + "num_input_tokens_seen": 81953845, + "router_z_loss_clip": 3.9609375, + "router_z_loss_mlp": 0.31774902, + "step": 3808, + "time_per_iteration": 2.5916390419006348 + }, + { + "auxiliary_loss_clip": 0.01726959, + "auxiliary_loss_mlp": 0.00294458, + "balance_loss_clip": 1.32122016, + "balance_loss_mlp": 0.2615087, + "epoch": 0.22900946941229522, + "flos": 18843011671680.0, + "grad_norm": 2.1288330982280472, + "language_loss": 0.81988668, + "learning_rate": 3.598729535939222e-06, + "loss": 0.84010088, + "num_input_tokens_seen": 81972100, + "router_z_loss_clip": 4.05859375, + "router_z_loss_mlp": 0.32958984, + "step": 3809, + "time_per_iteration": 2.592005491256714 + }, + { + "auxiliary_loss_clip": 0.01710172, + "auxiliary_loss_mlp": 0.00291151, + "balance_loss_clip": 1.31287169, + "balance_loss_mlp": 0.26080048, + "epoch": 0.22906959266496318, + "flos": 22929394429440.0, + "grad_norm": 4.741205062903025, + "language_loss": 0.86548162, + "learning_rate": 3.5984954991183862e-06, + "loss": 0.88549483, + "num_input_tokens_seen": 81992760, + "router_z_loss_clip": 3.97265625, + "router_z_loss_mlp": 0.3034668, + "step": 3810, + "time_per_iteration": 2.6395115852355957 + }, + { + "auxiliary_loss_clip": 0.01745461, + "auxiliary_loss_mlp": 0.00288113, + "balance_loss_clip": 1.34165359, + "balance_loss_mlp": 0.25528285, + "epoch": 0.22912971591763115, + "flos": 19354523299200.0, + "grad_norm": 4.150973826416358, + "language_loss": 0.86122358, + "learning_rate": 3.598261401682441e-06, + "loss": 0.88155937, + "num_input_tokens_seen": 82009080, + "router_z_loss_clip": 4.0390625, + "router_z_loss_mlp": 0.328125, + "step": 3811, + "time_per_iteration": 2.5900938510894775 + }, + { + "auxiliary_loss_clip": 0.01726433, + "auxiliary_loss_mlp": 0.00301015, + "balance_loss_clip": 1.32037854, + "balance_loss_mlp": 0.26980636, + "epoch": 0.22918983917029911, + "flos": 19933546538880.0, + "grad_norm": 3.60620158171666, + "language_loss": 0.90086806, + "learning_rate": 3.5980272436402632e-06, + "loss": 0.92114258, + "num_input_tokens_seen": 82026705, + "router_z_loss_clip": 4.05664062, + "router_z_loss_mlp": 0.31201172, + "step": 3812, + "time_per_iteration": 2.5985283851623535 + }, + { + "auxiliary_loss_clip": 0.01741628, + "auxiliary_loss_mlp": 0.00326192, + "balance_loss_clip": 1.33623874, + "balance_loss_mlp": 0.29524538, + "epoch": 0.22924996242296708, + "flos": 16690885320960.0, + "grad_norm": 3.1728373343400027, + "language_loss": 0.94890583, + "learning_rate": 3.5977930250007324e-06, + "loss": 0.96958405, + "num_input_tokens_seen": 82043245, + "router_z_loss_clip": 4.0546875, + "router_z_loss_mlp": 0.30932617, + "step": 3813, + "time_per_iteration": 2.5441997051239014 + }, + { + "auxiliary_loss_clip": 0.01720725, + "auxiliary_loss_mlp": 0.00334028, + "balance_loss_clip": 1.3231467, + "balance_loss_mlp": 0.30222341, + "epoch": 0.22931008567563504, + "flos": 33036164956800.0, + "grad_norm": 31.602049187358634, + "language_loss": 0.76296902, + "learning_rate": 3.5975587457727298e-06, + "loss": 0.78351653, + "num_input_tokens_seen": 82066870, + "router_z_loss_clip": 3.97460938, + "router_z_loss_mlp": 0.31787109, + "step": 3814, + "time_per_iteration": 2.7498624324798584 + }, + { + "auxiliary_loss_clip": 0.01717085, + "auxiliary_loss_mlp": 0.00280211, + "balance_loss_clip": 1.31670189, + "balance_loss_mlp": 0.24735676, + "epoch": 0.229370208928303, + "flos": 23330696152320.0, + "grad_norm": 7.992459309920149, + "language_loss": 0.76270485, + "learning_rate": 3.597324405965139e-06, + "loss": 0.78267777, + "num_input_tokens_seen": 82083180, + "router_z_loss_clip": 4.00390625, + "router_z_loss_mlp": 0.32836914, + "step": 3815, + "time_per_iteration": 2.59753680229187 + }, + { + "auxiliary_loss_clip": 0.01722675, + "auxiliary_loss_mlp": 0.00317691, + "balance_loss_clip": 1.32510972, + "balance_loss_mlp": 0.28312021, + "epoch": 0.229430332180971, + "flos": 28617213150720.0, + "grad_norm": 9.298115920921022, + "language_loss": 0.90096962, + "learning_rate": 3.597090005586848e-06, + "loss": 0.92137331, + "num_input_tokens_seen": 82102950, + "router_z_loss_clip": 3.97460938, + "router_z_loss_mlp": 0.34570312, + "step": 3816, + "time_per_iteration": 4.144388437271118 + }, + { + "auxiliary_loss_clip": 0.01715904, + "auxiliary_loss_mlp": 0.00309111, + "balance_loss_clip": 1.32122886, + "balance_loss_mlp": 0.27699572, + "epoch": 0.22949045543363897, + "flos": 17238199829760.0, + "grad_norm": 3.2765136115425486, + "language_loss": 0.98105812, + "learning_rate": 3.596855544646742e-06, + "loss": 1.0013082, + "num_input_tokens_seen": 82119510, + "router_z_loss_clip": 3.9453125, + "router_z_loss_mlp": 0.32104492, + "step": 3817, + "time_per_iteration": 2.5716075897216797 + }, + { + "auxiliary_loss_clip": 0.01686057, + "auxiliary_loss_mlp": 0.00336664, + "balance_loss_clip": 1.3010509, + "balance_loss_mlp": 0.30543089, + "epoch": 0.22955057868630693, + "flos": 27489438858240.0, + "grad_norm": 3.046691292552805, + "language_loss": 0.82034242, + "learning_rate": 3.5966210231537154e-06, + "loss": 0.84056962, + "num_input_tokens_seen": 82140095, + "router_z_loss_clip": 3.84960938, + "router_z_loss_mlp": 0.31225586, + "step": 3818, + "time_per_iteration": 4.065706491470337 + }, + { + "auxiliary_loss_clip": 0.01680708, + "auxiliary_loss_mlp": 0.00367283, + "balance_loss_clip": 1.30041564, + "balance_loss_mlp": 0.33457211, + "epoch": 0.2296107019389749, + "flos": 23476421629440.0, + "grad_norm": 5.742223213919984, + "language_loss": 0.81604946, + "learning_rate": 3.596386441116659e-06, + "loss": 0.83652937, + "num_input_tokens_seen": 82159510, + "router_z_loss_clip": 3.80273438, + "router_z_loss_mlp": 0.3269043, + "step": 3819, + "time_per_iteration": 2.688422441482544 + }, + { + "auxiliary_loss_clip": 0.01672863, + "auxiliary_loss_mlp": 0.00343884, + "balance_loss_clip": 1.30121374, + "balance_loss_mlp": 0.31095839, + "epoch": 0.22967082519164286, + "flos": 31285160760960.0, + "grad_norm": 9.991675958048456, + "language_loss": 0.86893058, + "learning_rate": 3.5961517985444684e-06, + "loss": 0.88909805, + "num_input_tokens_seen": 82179580, + "router_z_loss_clip": 3.71875, + "router_z_loss_mlp": 0.3293457, + "step": 3820, + "time_per_iteration": 2.7714638710021973 + }, + { + "auxiliary_loss_clip": 0.01664681, + "auxiliary_loss_mlp": 0.00382529, + "balance_loss_clip": 1.29442072, + "balance_loss_mlp": 0.34616998, + "epoch": 0.22973094844431083, + "flos": 14642935390080.0, + "grad_norm": 2.978612404594324, + "language_loss": 0.75874436, + "learning_rate": 3.595917095446042e-06, + "loss": 0.77921641, + "num_input_tokens_seen": 82195585, + "router_z_loss_clip": 3.703125, + "router_z_loss_mlp": 0.36303711, + "step": 3821, + "time_per_iteration": 2.642810583114624 + }, + { + "auxiliary_loss_clip": 0.01654071, + "auxiliary_loss_mlp": 0.00358299, + "balance_loss_clip": 1.28819036, + "balance_loss_mlp": 0.32673243, + "epoch": 0.2297910716969788, + "flos": 22823853292800.0, + "grad_norm": 27.680267535752098, + "language_loss": 0.87942874, + "learning_rate": 3.5956823318302796e-06, + "loss": 0.89955246, + "num_input_tokens_seen": 82217530, + "router_z_loss_clip": 3.66015625, + "router_z_loss_mlp": 0.31567383, + "step": 3822, + "time_per_iteration": 4.121351480484009 + }, + { + "auxiliary_loss_clip": 0.01639652, + "auxiliary_loss_mlp": 0.00362421, + "balance_loss_clip": 1.28446388, + "balance_loss_mlp": 0.32942411, + "epoch": 0.2298511949496468, + "flos": 23039029716480.0, + "grad_norm": 18.796079892317774, + "language_loss": 0.72759765, + "learning_rate": 3.5954475077060833e-06, + "loss": 0.74761832, + "num_input_tokens_seen": 82237980, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 0.32983398, + "step": 3823, + "time_per_iteration": 2.623065710067749 + }, + { + "auxiliary_loss_clip": 0.01615789, + "auxiliary_loss_mlp": 0.00326492, + "balance_loss_clip": 1.37119973, + "balance_loss_mlp": 0.31247327, + "epoch": 0.22991131820231475, + "flos": 66890914911360.0, + "grad_norm": 0.8326668672781224, + "language_loss": 0.57068962, + "learning_rate": 3.595212623082357e-06, + "loss": 0.59011245, + "num_input_tokens_seen": 82301785, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.140625, + "step": 3824, + "time_per_iteration": 3.1868696212768555 + }, + { + "auxiliary_loss_clip": 0.01583706, + "auxiliary_loss_mlp": 0.00349511, + "balance_loss_clip": 1.24801302, + "balance_loss_mlp": 0.31882608, + "epoch": 0.22997144145498272, + "flos": 17887248633600.0, + "grad_norm": 2.1120881025767955, + "language_loss": 0.79923916, + "learning_rate": 3.594977677968009e-06, + "loss": 0.81857133, + "num_input_tokens_seen": 82317355, + "router_z_loss_clip": 3.35546875, + "router_z_loss_mlp": 0.30664062, + "step": 3825, + "time_per_iteration": 2.552391290664673 + }, + { + "auxiliary_loss_clip": 0.01585987, + "auxiliary_loss_mlp": 0.00385327, + "balance_loss_clip": 1.25231576, + "balance_loss_mlp": 0.35213926, + "epoch": 0.23003156470765068, + "flos": 24676843178880.0, + "grad_norm": 52.40753069588256, + "language_loss": 0.9418394, + "learning_rate": 3.5947426723719473e-06, + "loss": 0.96155262, + "num_input_tokens_seen": 82336645, + "router_z_loss_clip": 3.33789062, + "router_z_loss_mlp": 0.33203125, + "step": 3826, + "time_per_iteration": 2.6896347999572754 + }, + { + "auxiliary_loss_clip": 0.01586305, + "auxiliary_loss_mlp": 0.0040504, + "balance_loss_clip": 1.25150514, + "balance_loss_mlp": 0.36715513, + "epoch": 0.23009168796031865, + "flos": 15814126247040.0, + "grad_norm": 32.76298135644684, + "language_loss": 0.93496799, + "learning_rate": 3.594507606303083e-06, + "loss": 0.95488131, + "num_input_tokens_seen": 82354225, + "router_z_loss_clip": 3.34765625, + "router_z_loss_mlp": 0.37890625, + "step": 3827, + "time_per_iteration": 4.0243189334869385 + }, + { + "auxiliary_loss_clip": 0.01559938, + "auxiliary_loss_mlp": 0.00445952, + "balance_loss_clip": 1.23246908, + "balance_loss_mlp": 0.40759015, + "epoch": 0.2301518112129866, + "flos": 16212842190720.0, + "grad_norm": 3.6689640330061577, + "language_loss": 0.93316716, + "learning_rate": 3.5942724797703314e-06, + "loss": 0.95322609, + "num_input_tokens_seen": 82370240, + "router_z_loss_clip": 3.27539062, + "router_z_loss_mlp": 0.38378906, + "step": 3828, + "time_per_iteration": 2.572387218475342 + }, + { + "auxiliary_loss_clip": 0.01550184, + "auxiliary_loss_mlp": 0.00372985, + "balance_loss_clip": 1.22506833, + "balance_loss_mlp": 0.33936799, + "epoch": 0.2302119344656546, + "flos": 20595452411520.0, + "grad_norm": 15.907263742799122, + "language_loss": 0.77885407, + "learning_rate": 3.594037292782607e-06, + "loss": 0.79808581, + "num_input_tokens_seen": 82389145, + "router_z_loss_clip": 3.25195312, + "router_z_loss_mlp": 0.33618164, + "step": 3829, + "time_per_iteration": 2.583833932876587 + }, + { + "auxiliary_loss_clip": 0.01523718, + "auxiliary_loss_mlp": 0.00389387, + "balance_loss_clip": 1.21057415, + "balance_loss_mlp": 0.35650897, + "epoch": 0.23027205771832257, + "flos": 26796901662720.0, + "grad_norm": 7.753957548504881, + "language_loss": 0.91551393, + "learning_rate": 3.5938020453488293e-06, + "loss": 0.93464494, + "num_input_tokens_seen": 82409185, + "router_z_loss_clip": 3.12890625, + "router_z_loss_mlp": 0.32885742, + "step": 3830, + "time_per_iteration": 2.632283926010132 + }, + { + "auxiliary_loss_clip": 0.01514876, + "auxiliary_loss_mlp": 0.0034917, + "balance_loss_clip": 1.20565343, + "balance_loss_mlp": 0.31626865, + "epoch": 0.23033218097099054, + "flos": 43873143068160.0, + "grad_norm": 5.837360026030787, + "language_loss": 0.73600936, + "learning_rate": 3.5935667374779177e-06, + "loss": 0.75464982, + "num_input_tokens_seen": 82432070, + "router_z_loss_clip": 3.09375, + "router_z_loss_mlp": 0.32885742, + "step": 3831, + "time_per_iteration": 2.842564105987549 + }, + { + "auxiliary_loss_clip": 0.01509839, + "auxiliary_loss_mlp": 0.00399993, + "balance_loss_clip": 1.19554758, + "balance_loss_mlp": 0.36449289, + "epoch": 0.2303923042236585, + "flos": 26067663745920.0, + "grad_norm": 9.065490226263991, + "language_loss": 0.84183788, + "learning_rate": 3.5933313691787957e-06, + "loss": 0.86093616, + "num_input_tokens_seen": 82450625, + "router_z_loss_clip": 3.14453125, + "router_z_loss_mlp": 0.35522461, + "step": 3832, + "time_per_iteration": 2.6516799926757812 + }, + { + "auxiliary_loss_clip": 0.01467074, + "auxiliary_loss_mlp": 0.00407116, + "balance_loss_clip": 1.16796291, + "balance_loss_mlp": 0.37290311, + "epoch": 0.23045242747632647, + "flos": 18296379521280.0, + "grad_norm": 59.04356698156164, + "language_loss": 0.94544077, + "learning_rate": 3.593095940460389e-06, + "loss": 0.96418273, + "num_input_tokens_seen": 82468575, + "router_z_loss_clip": 2.99023438, + "router_z_loss_mlp": 0.34228516, + "step": 3833, + "time_per_iteration": 2.5972213745117188 + }, + { + "auxiliary_loss_clip": 0.01473096, + "auxiliary_loss_mlp": 0.00411363, + "balance_loss_clip": 1.1681056, + "balance_loss_mlp": 0.37450361, + "epoch": 0.23051255072899443, + "flos": 25520528805120.0, + "grad_norm": 3.0994795522221166, + "language_loss": 0.83798862, + "learning_rate": 3.592860451331624e-06, + "loss": 0.85683322, + "num_input_tokens_seen": 82488655, + "router_z_loss_clip": 3.04882812, + "router_z_loss_mlp": 0.36816406, + "step": 3834, + "time_per_iteration": 2.6614646911621094 + }, + { + "auxiliary_loss_clip": 0.01438258, + "auxiliary_loss_mlp": 0.0041101, + "balance_loss_clip": 1.14316475, + "balance_loss_mlp": 0.37601, + "epoch": 0.2305726739816624, + "flos": 21215198695680.0, + "grad_norm": 2.9920172433264614, + "language_loss": 0.9582963, + "learning_rate": 3.592624901801432e-06, + "loss": 0.976789, + "num_input_tokens_seen": 82507220, + "router_z_loss_clip": 2.95117188, + "router_z_loss_mlp": 0.34985352, + "step": 3835, + "time_per_iteration": 2.6086299419403076 + }, + { + "auxiliary_loss_clip": 0.01439442, + "auxiliary_loss_mlp": 0.00409891, + "balance_loss_clip": 1.14746451, + "balance_loss_mlp": 0.37510532, + "epoch": 0.2306327972343304, + "flos": 23331127115520.0, + "grad_norm": 62.031530529009196, + "language_loss": 0.92512608, + "learning_rate": 3.5923892918787432e-06, + "loss": 0.94361937, + "num_input_tokens_seen": 82527920, + "router_z_loss_clip": 2.92578125, + "router_z_loss_mlp": 0.34790039, + "step": 3836, + "time_per_iteration": 2.618842124938965 + }, + { + "auxiliary_loss_clip": 0.01435394, + "auxiliary_loss_mlp": 0.00428894, + "balance_loss_clip": 1.14295268, + "balance_loss_mlp": 0.3941564, + "epoch": 0.23069292048699835, + "flos": 20666734951680.0, + "grad_norm": 2.6031159319443775, + "language_loss": 0.86546838, + "learning_rate": 3.5921536215724934e-06, + "loss": 0.88411129, + "num_input_tokens_seen": 82549040, + "router_z_loss_clip": 2.921875, + "router_z_loss_mlp": 0.34716797, + "step": 3837, + "time_per_iteration": 2.7157907485961914 + }, + { + "auxiliary_loss_clip": 0.01468477, + "auxiliary_loss_mlp": 0.00147442, + "balance_loss_clip": 1.26969779, + "balance_loss_mlp": 0.1356639, + "epoch": 0.23075304373966632, + "flos": 70454832393600.0, + "grad_norm": 0.9772372181541251, + "language_loss": 0.65615225, + "learning_rate": 3.5919178908916184e-06, + "loss": 0.67231143, + "num_input_tokens_seen": 82604070, + "router_z_loss_clip": 1.984375, + "router_z_loss_mlp": 0.11767578, + "step": 3838, + "time_per_iteration": 3.0955965518951416 + }, + { + "auxiliary_loss_clip": 0.01413751, + "auxiliary_loss_mlp": 0.00400222, + "balance_loss_clip": 1.12745798, + "balance_loss_mlp": 0.36796394, + "epoch": 0.23081316699233428, + "flos": 16617986668800.0, + "grad_norm": 9.318951434495228, + "language_loss": 0.82238513, + "learning_rate": 3.591682099845058e-06, + "loss": 0.84052485, + "num_input_tokens_seen": 82619665, + "router_z_loss_clip": 2.8671875, + "router_z_loss_mlp": 0.32202148, + "step": 3839, + "time_per_iteration": 2.6256251335144043 + }, + { + "auxiliary_loss_clip": 0.01407861, + "auxiliary_loss_mlp": 0.00408014, + "balance_loss_clip": 1.12268281, + "balance_loss_mlp": 0.37270442, + "epoch": 0.23087329024500225, + "flos": 13298081253120.0, + "grad_norm": 20.068595315099323, + "language_loss": 0.79343551, + "learning_rate": 3.591446248441752e-06, + "loss": 0.81159425, + "num_input_tokens_seen": 82637530, + "router_z_loss_clip": 2.8515625, + "router_z_loss_mlp": 0.35302734, + "step": 3840, + "time_per_iteration": 2.6267831325531006 + }, + { + "auxiliary_loss_clip": 0.01410534, + "auxiliary_loss_mlp": 0.00445238, + "balance_loss_clip": 1.12444711, + "balance_loss_mlp": 0.4068765, + "epoch": 0.23093341349767021, + "flos": 17785729820160.0, + "grad_norm": 3422.678944921393, + "language_loss": 0.85435545, + "learning_rate": 3.591210336690645e-06, + "loss": 0.87291312, + "num_input_tokens_seen": 82656130, + "router_z_loss_clip": 2.86132812, + "router_z_loss_mlp": 0.38354492, + "step": 3841, + "time_per_iteration": 2.6340572834014893 + }, + { + "auxiliary_loss_clip": 0.01386594, + "auxiliary_loss_mlp": 0.00373744, + "balance_loss_clip": 1.10743558, + "balance_loss_mlp": 0.34301153, + "epoch": 0.23099353675033818, + "flos": 23988076911360.0, + "grad_norm": 19.489356456138427, + "language_loss": 0.90224278, + "learning_rate": 3.590974364600683e-06, + "loss": 0.91984618, + "num_input_tokens_seen": 82675295, + "router_z_loss_clip": 2.79296875, + "router_z_loss_mlp": 0.30712891, + "step": 3842, + "time_per_iteration": 2.6355743408203125 + }, + { + "auxiliary_loss_clip": 0.01400999, + "auxiliary_loss_mlp": 0.00388384, + "balance_loss_clip": 1.12165689, + "balance_loss_mlp": 0.35641241, + "epoch": 0.23105366000300617, + "flos": 35995168471680.0, + "grad_norm": 3.739288758117505, + "language_loss": 0.72183323, + "learning_rate": 3.5907383321808135e-06, + "loss": 0.73972702, + "num_input_tokens_seen": 82703260, + "router_z_loss_clip": 2.79492188, + "router_z_loss_mlp": 0.31958008, + "step": 3843, + "time_per_iteration": 2.819784164428711 + }, + { + "auxiliary_loss_clip": 0.01385939, + "auxiliary_loss_mlp": 0.00416082, + "balance_loss_clip": 1.10552216, + "balance_loss_mlp": 0.38062924, + "epoch": 0.23111378325567414, + "flos": 31245335556480.0, + "grad_norm": 14.49094016202683, + "language_loss": 0.83434343, + "learning_rate": 3.590502239439987e-06, + "loss": 0.85236365, + "num_input_tokens_seen": 82725060, + "router_z_loss_clip": 2.80273438, + "router_z_loss_mlp": 0.35449219, + "step": 3844, + "time_per_iteration": 2.6809773445129395 + }, + { + "auxiliary_loss_clip": 0.01401291, + "auxiliary_loss_mlp": 0.00487767, + "balance_loss_clip": 1.1176616, + "balance_loss_mlp": 0.44807065, + "epoch": 0.2311739065083421, + "flos": 19208223204480.0, + "grad_norm": 57.73295418574718, + "language_loss": 0.84471834, + "learning_rate": 3.590266086387156e-06, + "loss": 0.86360896, + "num_input_tokens_seen": 82742960, + "router_z_loss_clip": 2.83789062, + "router_z_loss_mlp": 0.3972168, + "step": 3845, + "time_per_iteration": 2.602656364440918 + }, + { + "auxiliary_loss_clip": 0.01403874, + "auxiliary_loss_mlp": 0.0045598, + "balance_loss_clip": 1.12005186, + "balance_loss_mlp": 0.41907299, + "epoch": 0.23123402976101007, + "flos": 23360178240000.0, + "grad_norm": 3.86277212373702, + "language_loss": 0.82519698, + "learning_rate": 3.590029873031276e-06, + "loss": 0.84379548, + "num_input_tokens_seen": 82760205, + "router_z_loss_clip": 2.8359375, + "router_z_loss_mlp": 0.36914062, + "step": 3846, + "time_per_iteration": 2.581758737564087 + }, + { + "auxiliary_loss_clip": 0.0142243, + "auxiliary_loss_mlp": 0.00487723, + "balance_loss_clip": 1.1349932, + "balance_loss_mlp": 0.44671541, + "epoch": 0.23129415301367803, + "flos": 13735365425280.0, + "grad_norm": 1.9351935870896522, + "language_loss": 0.79531229, + "learning_rate": 3.589793599381304e-06, + "loss": 0.81441379, + "num_input_tokens_seen": 82778590, + "router_z_loss_clip": 2.87695312, + "router_z_loss_mlp": 0.41015625, + "step": 3847, + "time_per_iteration": 2.5760838985443115 + }, + { + "auxiliary_loss_clip": 0.01402315, + "auxiliary_loss_mlp": 0.00219273, + "balance_loss_clip": 1.21649718, + "balance_loss_mlp": 0.20401409, + "epoch": 0.231354276266346, + "flos": 69737015001600.0, + "grad_norm": 0.769502429711866, + "language_loss": 0.60847056, + "learning_rate": 3.589557265446198e-06, + "loss": 0.62468636, + "num_input_tokens_seen": 82833925, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.15234375, + "step": 3848, + "time_per_iteration": 3.000399589538574 + }, + { + "auxiliary_loss_clip": 0.01413613, + "auxiliary_loss_mlp": 0.00506104, + "balance_loss_clip": 1.13069916, + "balance_loss_mlp": 0.46497688, + "epoch": 0.231414399519014, + "flos": 18835900778880.0, + "grad_norm": 11.863643549778564, + "language_loss": 0.87480986, + "learning_rate": 3.589320871234923e-06, + "loss": 0.89400697, + "num_input_tokens_seen": 82850625, + "router_z_loss_clip": 2.828125, + "router_z_loss_mlp": 0.41137695, + "step": 3849, + "time_per_iteration": 2.600926160812378 + }, + { + "auxiliary_loss_clip": 0.01441006, + "auxiliary_loss_mlp": 0.00559546, + "balance_loss_clip": 1.14792323, + "balance_loss_mlp": 0.51307869, + "epoch": 0.23147452277168196, + "flos": 36135470995200.0, + "grad_norm": 13.56552653560103, + "language_loss": 0.78588045, + "learning_rate": 3.5890844167564405e-06, + "loss": 0.80588591, + "num_input_tokens_seen": 82872105, + "router_z_loss_clip": 2.93164062, + "router_z_loss_mlp": 0.46508789, + "step": 3850, + "time_per_iteration": 2.703482151031494 + }, + { + "auxiliary_loss_clip": 0.01421209, + "auxiliary_loss_mlp": 0.00583498, + "balance_loss_clip": 1.13228464, + "balance_loss_mlp": 0.5360049, + "epoch": 0.23153464602434992, + "flos": 20812927305600.0, + "grad_norm": 6.632644562568718, + "language_loss": 0.84451443, + "learning_rate": 3.588847902019718e-06, + "loss": 0.86456144, + "num_input_tokens_seen": 82890595, + "router_z_loss_clip": 2.890625, + "router_z_loss_mlp": 0.47485352, + "step": 3851, + "time_per_iteration": 2.5967869758605957 + }, + { + "auxiliary_loss_clip": 0.01419735, + "auxiliary_loss_mlp": 0.00559549, + "balance_loss_clip": 1.12682176, + "balance_loss_mlp": 0.51041144, + "epoch": 0.2315947692770179, + "flos": 19939256801280.0, + "grad_norm": 7.128407186335358, + "language_loss": 0.75948274, + "learning_rate": 3.588611327033723e-06, + "loss": 0.7792756, + "num_input_tokens_seen": 82908910, + "router_z_loss_clip": 2.93164062, + "router_z_loss_mlp": 0.4909668, + "step": 3852, + "time_per_iteration": 2.5646016597747803 + }, + { + "auxiliary_loss_clip": 0.01437059, + "auxiliary_loss_mlp": 0.00520789, + "balance_loss_clip": 1.14139032, + "balance_loss_mlp": 0.47565588, + "epoch": 0.23165489252968585, + "flos": 12855553695360.0, + "grad_norm": 21.462240782163338, + "language_loss": 0.75076616, + "learning_rate": 3.588374691807428e-06, + "loss": 0.77034461, + "num_input_tokens_seen": 82925405, + "router_z_loss_clip": 2.95898438, + "router_z_loss_mlp": 0.45141602, + "step": 3853, + "time_per_iteration": 2.597895383834839 + }, + { + "auxiliary_loss_clip": 0.01465847, + "auxiliary_loss_mlp": 0.00535809, + "balance_loss_clip": 1.16989088, + "balance_loss_mlp": 0.48807761, + "epoch": 0.23171501578235382, + "flos": 30628282792320.0, + "grad_norm": 24.353038918382964, + "language_loss": 0.86582476, + "learning_rate": 3.5881379963498053e-06, + "loss": 0.88584125, + "num_input_tokens_seen": 82945615, + "router_z_loss_clip": 2.95898438, + "router_z_loss_mlp": 0.47753906, + "step": 3854, + "time_per_iteration": 2.6793253421783447 + }, + { + "auxiliary_loss_clip": 0.01465794, + "auxiliary_loss_mlp": 0.00526728, + "balance_loss_clip": 1.15848422, + "balance_loss_mlp": 0.47794712, + "epoch": 0.23177513903502178, + "flos": 23842782397440.0, + "grad_norm": 24.66245290716232, + "language_loss": 0.74803782, + "learning_rate": 3.587901240669831e-06, + "loss": 0.76796299, + "num_input_tokens_seen": 82967570, + "router_z_loss_clip": 3.07421875, + "router_z_loss_mlp": 0.48803711, + "step": 3855, + "time_per_iteration": 2.659219980239868 + }, + { + "auxiliary_loss_clip": 0.01448953, + "auxiliary_loss_mlp": 0.00532536, + "balance_loss_clip": 1.14742708, + "balance_loss_mlp": 0.48768905, + "epoch": 0.23183526228768978, + "flos": 29570282668800.0, + "grad_norm": 17.106602927614528, + "language_loss": 0.7810297, + "learning_rate": 3.5876644247764815e-06, + "loss": 0.80084455, + "num_input_tokens_seen": 82987435, + "router_z_loss_clip": 3.01757812, + "router_z_loss_mlp": 0.44848633, + "step": 3856, + "time_per_iteration": 2.6995129585266113 + }, + { + "auxiliary_loss_clip": 0.01440016, + "auxiliary_loss_mlp": 0.00580546, + "balance_loss_clip": 1.13458836, + "balance_loss_mlp": 0.53369677, + "epoch": 0.23189538554035774, + "flos": 34458694254720.0, + "grad_norm": 2.91834406866174, + "language_loss": 0.83261752, + "learning_rate": 3.5874275486787387e-06, + "loss": 0.85282314, + "num_input_tokens_seen": 83010505, + "router_z_loss_clip": 3.0546875, + "router_z_loss_mlp": 0.46875, + "step": 3857, + "time_per_iteration": 2.7494008541107178 + }, + { + "auxiliary_loss_clip": 0.014752, + "auxiliary_loss_mlp": 0.00516679, + "balance_loss_clip": 1.16538775, + "balance_loss_mlp": 0.46761233, + "epoch": 0.2319555087930257, + "flos": 18003815245440.0, + "grad_norm": 3.660252918202163, + "language_loss": 0.97188991, + "learning_rate": 3.587190612385584e-06, + "loss": 0.99180871, + "num_input_tokens_seen": 83026705, + "router_z_loss_clip": 3.09960938, + "router_z_loss_mlp": 0.4909668, + "step": 3858, + "time_per_iteration": 4.023346424102783 + }, + { + "auxiliary_loss_clip": 0.0147247, + "auxiliary_loss_mlp": 0.00497449, + "balance_loss_clip": 1.16436124, + "balance_loss_mlp": 0.45293635, + "epoch": 0.23201563204569367, + "flos": 23143852581120.0, + "grad_norm": 2.2905369744355544, + "language_loss": 0.83435965, + "learning_rate": 3.5869536159060026e-06, + "loss": 0.85405886, + "num_input_tokens_seen": 83046500, + "router_z_loss_clip": 3.08203125, + "router_z_loss_mlp": 0.44482422, + "step": 3859, + "time_per_iteration": 2.589754819869995 + }, + { + "auxiliary_loss_clip": 0.01459842, + "auxiliary_loss_mlp": 0.00532099, + "balance_loss_clip": 1.15122652, + "balance_loss_mlp": 0.48670369, + "epoch": 0.23207575529836164, + "flos": 20667991927680.0, + "grad_norm": 3.1997030226697083, + "language_loss": 0.88831747, + "learning_rate": 3.58671655924898e-06, + "loss": 0.90823686, + "num_input_tokens_seen": 83065280, + "router_z_loss_clip": 3.08203125, + "router_z_loss_mlp": 0.45410156, + "step": 3860, + "time_per_iteration": 2.6342296600341797 + }, + { + "auxiliary_loss_clip": 0.01464441, + "auxiliary_loss_mlp": 0.00469904, + "balance_loss_clip": 1.15824533, + "balance_loss_mlp": 0.42989695, + "epoch": 0.2321358785510296, + "flos": 16472189364480.0, + "grad_norm": 9.94350977514072, + "language_loss": 0.91551816, + "learning_rate": 3.586479442423508e-06, + "loss": 0.93486166, + "num_input_tokens_seen": 83082310, + "router_z_loss_clip": 3.0625, + "router_z_loss_mlp": 0.40014648, + "step": 3861, + "time_per_iteration": 3.9328665733337402 + }, + { + "auxiliary_loss_clip": 0.01485871, + "auxiliary_loss_mlp": 0.00446362, + "balance_loss_clip": 1.16955948, + "balance_loss_mlp": 0.40537792, + "epoch": 0.2321960018036976, + "flos": 21616320850560.0, + "grad_norm": 3.1704372012580637, + "language_loss": 0.91320801, + "learning_rate": 3.586242265438576e-06, + "loss": 0.93253028, + "num_input_tokens_seen": 83102065, + "router_z_loss_clip": 3.16210938, + "router_z_loss_mlp": 0.40966797, + "step": 3862, + "time_per_iteration": 2.601963520050049 + }, + { + "auxiliary_loss_clip": 0.01468829, + "auxiliary_loss_mlp": 0.00433543, + "balance_loss_clip": 1.15992713, + "balance_loss_mlp": 0.39537245, + "epoch": 0.23225612505636556, + "flos": 22271474966400.0, + "grad_norm": 22.589469095175428, + "language_loss": 0.80228698, + "learning_rate": 3.5860050283031773e-06, + "loss": 0.8213107, + "num_input_tokens_seen": 83121445, + "router_z_loss_clip": 3.08789062, + "router_z_loss_mlp": 0.3815918, + "step": 3863, + "time_per_iteration": 2.6034138202667236 + }, + { + "auxiliary_loss_clip": 0.01481181, + "auxiliary_loss_mlp": 0.00375875, + "balance_loss_clip": 1.16672933, + "balance_loss_mlp": 0.34359351, + "epoch": 0.23231624830903352, + "flos": 17052325925760.0, + "grad_norm": 6.011408603416355, + "language_loss": 0.8142789, + "learning_rate": 3.58576773102631e-06, + "loss": 0.8328495, + "num_input_tokens_seen": 83138175, + "router_z_loss_clip": 3.1484375, + "router_z_loss_mlp": 0.32299805, + "step": 3864, + "time_per_iteration": 2.573729991912842 + }, + { + "auxiliary_loss_clip": 0.01503191, + "auxiliary_loss_mlp": 0.00447438, + "balance_loss_clip": 1.18139279, + "balance_loss_mlp": 0.40759784, + "epoch": 0.2323763715617015, + "flos": 34640043045120.0, + "grad_norm": 6.475105030320255, + "language_loss": 0.77081662, + "learning_rate": 3.5855303736169714e-06, + "loss": 0.7903229, + "num_input_tokens_seen": 83161975, + "router_z_loss_clip": 3.21289062, + "router_z_loss_mlp": 0.3984375, + "step": 3865, + "time_per_iteration": 4.195216655731201 + }, + { + "auxiliary_loss_clip": 0.01515262, + "auxiliary_loss_mlp": 0.00407075, + "balance_loss_clip": 1.18790758, + "balance_loss_mlp": 0.36773598, + "epoch": 0.23243649481436945, + "flos": 25551698832000.0, + "grad_norm": 8.690797802150158, + "language_loss": 1.04475856, + "learning_rate": 3.5852929560841617e-06, + "loss": 1.06398201, + "num_input_tokens_seen": 83180905, + "router_z_loss_clip": 3.26953125, + "router_z_loss_mlp": 0.39331055, + "step": 3866, + "time_per_iteration": 2.6873841285705566 + }, + { + "auxiliary_loss_clip": 0.01506616, + "auxiliary_loss_mlp": 0.00412907, + "balance_loss_clip": 1.18787265, + "balance_loss_mlp": 0.37602413, + "epoch": 0.23249661806703742, + "flos": 20483482740480.0, + "grad_norm": 16.55336281660246, + "language_loss": 0.81595457, + "learning_rate": 3.5850554784368846e-06, + "loss": 0.83514977, + "num_input_tokens_seen": 83196390, + "router_z_loss_clip": 3.1875, + "router_z_loss_mlp": 0.36865234, + "step": 3867, + "time_per_iteration": 2.641598701477051 + }, + { + "auxiliary_loss_clip": 0.01491443, + "auxiliary_loss_mlp": 0.00393825, + "balance_loss_clip": 1.1706624, + "balance_loss_mlp": 0.35706037, + "epoch": 0.23255674131970538, + "flos": 20376612800640.0, + "grad_norm": 28.88283394645634, + "language_loss": 0.88434041, + "learning_rate": 3.584817940684145e-06, + "loss": 0.90319312, + "num_input_tokens_seen": 83216165, + "router_z_loss_clip": 3.203125, + "router_z_loss_mlp": 0.3671875, + "step": 3868, + "time_per_iteration": 2.600832223892212 + }, + { + "auxiliary_loss_clip": 0.01499517, + "auxiliary_loss_mlp": 0.00351573, + "balance_loss_clip": 1.17711627, + "balance_loss_mlp": 0.31826597, + "epoch": 0.23261686457237338, + "flos": 17056096853760.0, + "grad_norm": 5.4040744742976985, + "language_loss": 0.8227948, + "learning_rate": 3.58458034283495e-06, + "loss": 0.84130561, + "num_input_tokens_seen": 83233845, + "router_z_loss_clip": 3.2265625, + "router_z_loss_mlp": 0.33300781, + "step": 3869, + "time_per_iteration": 4.027331113815308 + }, + { + "auxiliary_loss_clip": 0.01511296, + "auxiliary_loss_mlp": 0.00392207, + "balance_loss_clip": 1.19231784, + "balance_loss_mlp": 0.3572548, + "epoch": 0.23267698782504134, + "flos": 29169878785920.0, + "grad_norm": 2.393801602651625, + "language_loss": 0.87253606, + "learning_rate": 3.5843426848983097e-06, + "loss": 0.8915711, + "num_input_tokens_seen": 83254930, + "router_z_loss_clip": 3.19335938, + "router_z_loss_mlp": 0.34960938, + "step": 3870, + "time_per_iteration": 2.7029871940612793 + }, + { + "auxiliary_loss_clip": 0.01528549, + "auxiliary_loss_mlp": 0.00425756, + "balance_loss_clip": 1.19548178, + "balance_loss_mlp": 0.38838363, + "epoch": 0.2327371110777093, + "flos": 21174655219200.0, + "grad_norm": 483.7557305808067, + "language_loss": 0.80113006, + "learning_rate": 3.5841049668832357e-06, + "loss": 0.82067311, + "num_input_tokens_seen": 83272095, + "router_z_loss_clip": 3.33398438, + "router_z_loss_mlp": 0.37365723, + "step": 3871, + "time_per_iteration": 2.561936378479004 + }, + { + "auxiliary_loss_clip": 0.01518446, + "auxiliary_loss_mlp": 0.00376531, + "balance_loss_clip": 1.18750668, + "balance_loss_mlp": 0.33824122, + "epoch": 0.23279723433037727, + "flos": 24863112132480.0, + "grad_norm": 8.550990807151678, + "language_loss": 0.76674926, + "learning_rate": 3.5838671887987433e-06, + "loss": 0.78569907, + "num_input_tokens_seen": 83290980, + "router_z_loss_clip": 3.30664062, + "router_z_loss_mlp": 0.38305664, + "step": 3872, + "time_per_iteration": 2.641291856765747 + }, + { + "auxiliary_loss_clip": 0.01518912, + "auxiliary_loss_mlp": 0.00390294, + "balance_loss_clip": 1.1905818, + "balance_loss_mlp": 0.35503209, + "epoch": 0.23285735758304524, + "flos": 38800617344640.0, + "grad_norm": 1.6446157696032937, + "language_loss": 0.8568064, + "learning_rate": 3.5836293506538474e-06, + "loss": 0.87589842, + "num_input_tokens_seen": 83315175, + "router_z_loss_clip": 3.28710938, + "router_z_loss_mlp": 0.3527832, + "step": 3873, + "time_per_iteration": 2.7859528064727783 + }, + { + "auxiliary_loss_clip": 0.01470829, + "auxiliary_loss_mlp": 0.00130258, + "balance_loss_clip": 1.27159762, + "balance_loss_mlp": 0.12019652, + "epoch": 0.2329174808357132, + "flos": 53944113692160.0, + "grad_norm": 0.8422233868496591, + "language_loss": 0.60677892, + "learning_rate": 3.5833914524575687e-06, + "loss": 0.6227898, + "num_input_tokens_seen": 83372060, + "router_z_loss_clip": 1.9921875, + "router_z_loss_mlp": 0.10058594, + "step": 3874, + "time_per_iteration": 3.028733730316162 + }, + { + "auxiliary_loss_clip": 0.01506636, + "auxiliary_loss_mlp": 0.00346939, + "balance_loss_clip": 1.18330979, + "balance_loss_mlp": 0.31496736, + "epoch": 0.23297760408838117, + "flos": 21216024708480.0, + "grad_norm": 4.386811172320261, + "language_loss": 0.91494286, + "learning_rate": 3.583153494218927e-06, + "loss": 0.93347859, + "num_input_tokens_seen": 83389795, + "router_z_loss_clip": 3.23632812, + "router_z_loss_mlp": 0.31982422, + "step": 3875, + "time_per_iteration": 2.6044623851776123 + }, + { + "auxiliary_loss_clip": 0.01508442, + "auxiliary_loss_mlp": 0.00361785, + "balance_loss_clip": 1.1867516, + "balance_loss_mlp": 0.32792968, + "epoch": 0.23303772734104916, + "flos": 28403006394240.0, + "grad_norm": 6.228929072294961, + "language_loss": 0.68952233, + "learning_rate": 3.5829154759469464e-06, + "loss": 0.70822465, + "num_input_tokens_seen": 83410005, + "router_z_loss_clip": 3.21679688, + "router_z_loss_mlp": 0.33874512, + "step": 3876, + "time_per_iteration": 2.6666629314422607 + }, + { + "auxiliary_loss_clip": 0.0153161, + "auxiliary_loss_mlp": 0.00364566, + "balance_loss_clip": 1.20668232, + "balance_loss_mlp": 0.33190301, + "epoch": 0.23309785059371713, + "flos": 24314720215680.0, + "grad_norm": 9.68407982865053, + "language_loss": 0.78422785, + "learning_rate": 3.5826773976506523e-06, + "loss": 0.80318964, + "num_input_tokens_seen": 83430250, + "router_z_loss_clip": 3.25, + "router_z_loss_mlp": 0.32617188, + "step": 3877, + "time_per_iteration": 2.6487319469451904 + }, + { + "auxiliary_loss_clip": 0.01520727, + "auxiliary_loss_mlp": 0.00337759, + "balance_loss_clip": 1.19071329, + "balance_loss_mlp": 0.30635971, + "epoch": 0.2331579738463851, + "flos": 15992925171840.0, + "grad_norm": 857.9270381883572, + "language_loss": 0.89301956, + "learning_rate": 3.582439259339073e-06, + "loss": 0.9116044, + "num_input_tokens_seen": 83447950, + "router_z_loss_clip": 3.29492188, + "router_z_loss_mlp": 0.31408691, + "step": 3878, + "time_per_iteration": 2.5793309211730957 + }, + { + "auxiliary_loss_clip": 0.01535064, + "auxiliary_loss_mlp": 0.00374088, + "balance_loss_clip": 1.20047045, + "balance_loss_mlp": 0.3397316, + "epoch": 0.23321809709905306, + "flos": 36426957863040.0, + "grad_norm": 1.8077024783088067, + "language_loss": 0.80507064, + "learning_rate": 3.5822010610212374e-06, + "loss": 0.82416213, + "num_input_tokens_seen": 83467785, + "router_z_loss_clip": 3.34765625, + "router_z_loss_mlp": 0.34350586, + "step": 3879, + "time_per_iteration": 2.7233481407165527 + }, + { + "auxiliary_loss_clip": 0.0152801, + "auxiliary_loss_mlp": 0.00347914, + "balance_loss_clip": 1.19555938, + "balance_loss_mlp": 0.31576347, + "epoch": 0.23327822035172102, + "flos": 21324762155520.0, + "grad_norm": 4.049167064397569, + "language_loss": 0.95763439, + "learning_rate": 3.5819628027061795e-06, + "loss": 0.97639358, + "num_input_tokens_seen": 83485390, + "router_z_loss_clip": 3.32226562, + "router_z_loss_mlp": 0.3215332, + "step": 3880, + "time_per_iteration": 2.61074161529541 + }, + { + "auxiliary_loss_clip": 0.01533362, + "auxiliary_loss_mlp": 0.00361031, + "balance_loss_clip": 1.20314157, + "balance_loss_mlp": 0.32731897, + "epoch": 0.233338343604389, + "flos": 19171881619200.0, + "grad_norm": 2418.88008758777, + "language_loss": 0.78795087, + "learning_rate": 3.5817244844029334e-06, + "loss": 0.80689484, + "num_input_tokens_seen": 83504890, + "router_z_loss_clip": 3.30273438, + "router_z_loss_mlp": 0.3371582, + "step": 3881, + "time_per_iteration": 2.6547446250915527 + }, + { + "auxiliary_loss_clip": 0.01537589, + "auxiliary_loss_mlp": 0.00319899, + "balance_loss_clip": 1.21122742, + "balance_loss_mlp": 0.29021594, + "epoch": 0.23339846685705698, + "flos": 26908368543360.0, + "grad_norm": 7.137942466611316, + "language_loss": 0.73574126, + "learning_rate": 3.581486106120537e-06, + "loss": 0.75431615, + "num_input_tokens_seen": 83526475, + "router_z_loss_clip": 3.265625, + "router_z_loss_mlp": 0.29650879, + "step": 3882, + "time_per_iteration": 2.6802635192871094 + }, + { + "auxiliary_loss_clip": 0.01552831, + "auxiliary_loss_mlp": 0.00309094, + "balance_loss_clip": 1.22341371, + "balance_loss_mlp": 0.27752763, + "epoch": 0.23345859010972494, + "flos": 32343160884480.0, + "grad_norm": 7.19393593533623, + "language_loss": 0.8375268, + "learning_rate": 3.5812476678680287e-06, + "loss": 0.85614604, + "num_input_tokens_seen": 83546620, + "router_z_loss_clip": 3.29882812, + "router_z_loss_mlp": 0.31567383, + "step": 3883, + "time_per_iteration": 2.7177138328552246 + }, + { + "auxiliary_loss_clip": 0.0146364, + "auxiliary_loss_mlp": 0.00100657, + "balance_loss_clip": 1.2681303, + "balance_loss_mlp": 0.09107301, + "epoch": 0.2335187133623929, + "flos": 58484229050880.0, + "grad_norm": 0.7681811969274355, + "language_loss": 0.59314609, + "learning_rate": 3.58100916965445e-06, + "loss": 0.60878909, + "num_input_tokens_seen": 83616160, + "router_z_loss_clip": 1.953125, + "router_z_loss_mlp": 0.09570312, + "step": 3884, + "time_per_iteration": 3.292201042175293 + }, + { + "auxiliary_loss_clip": 0.01549048, + "auxiliary_loss_mlp": 0.00248486, + "balance_loss_clip": 1.21497607, + "balance_loss_mlp": 0.21947059, + "epoch": 0.23357883661506088, + "flos": 24502317972480.0, + "grad_norm": 2.26561415972392, + "language_loss": 0.85807645, + "learning_rate": 3.5807706114888455e-06, + "loss": 0.87605178, + "num_input_tokens_seen": 83636795, + "router_z_loss_clip": 3.33789062, + "router_z_loss_mlp": 0.29052734, + "step": 3885, + "time_per_iteration": 2.6868319511413574 + }, + { + "auxiliary_loss_clip": 0.01549705, + "auxiliary_loss_mlp": 0.00307875, + "balance_loss_clip": 1.21884155, + "balance_loss_mlp": 0.276833, + "epoch": 0.23363895986772884, + "flos": 18948516894720.0, + "grad_norm": 19.236034654201397, + "language_loss": 0.93934977, + "learning_rate": 3.580531993380261e-06, + "loss": 0.95792556, + "num_input_tokens_seen": 83654050, + "router_z_loss_clip": 3.31054688, + "router_z_loss_mlp": 0.3104248, + "step": 3886, + "time_per_iteration": 2.592223882675171 + }, + { + "auxiliary_loss_clip": 0.01582786, + "auxiliary_loss_mlp": 0.00283644, + "balance_loss_clip": 1.24247432, + "balance_loss_mlp": 0.25303078, + "epoch": 0.2336990831203968, + "flos": 31686821619840.0, + "grad_norm": 3.0421341141889515, + "language_loss": 0.79404575, + "learning_rate": 3.5802933153377445e-06, + "loss": 0.81271005, + "num_input_tokens_seen": 83673720, + "router_z_loss_clip": 3.40625, + "router_z_loss_mlp": 0.30566406, + "step": 3887, + "time_per_iteration": 2.67746901512146 + }, + { + "auxiliary_loss_clip": 0.01594643, + "auxiliary_loss_mlp": 0.00308224, + "balance_loss_clip": 1.24935484, + "balance_loss_mlp": 0.2748934, + "epoch": 0.23375920637306477, + "flos": 27709750926720.0, + "grad_norm": 26.781479898230245, + "language_loss": 0.92225468, + "learning_rate": 3.5800545773703475e-06, + "loss": 0.94128335, + "num_input_tokens_seen": 83693470, + "router_z_loss_clip": 3.45117188, + "router_z_loss_mlp": 0.33349609, + "step": 3888, + "time_per_iteration": 2.667341709136963 + }, + { + "auxiliary_loss_clip": 0.01583932, + "auxiliary_loss_mlp": 0.00264804, + "balance_loss_clip": 1.24117899, + "balance_loss_mlp": 0.2349301, + "epoch": 0.23381932962573276, + "flos": 17675627656320.0, + "grad_norm": 25.14247936784745, + "language_loss": 0.98027706, + "learning_rate": 3.5798157794871225e-06, + "loss": 0.9987644, + "num_input_tokens_seen": 83711620, + "router_z_loss_clip": 3.4296875, + "router_z_loss_mlp": 0.29846191, + "step": 3889, + "time_per_iteration": 2.587050676345825 + }, + { + "auxiliary_loss_clip": 0.01565128, + "auxiliary_loss_mlp": 0.00273003, + "balance_loss_clip": 1.23403776, + "balance_loss_mlp": 0.24067374, + "epoch": 0.23387945287840073, + "flos": 14390842763520.0, + "grad_norm": 10.23434922795639, + "language_loss": 0.8841154, + "learning_rate": 3.579576921697125e-06, + "loss": 0.9024967, + "num_input_tokens_seen": 83727890, + "router_z_loss_clip": 3.31445312, + "router_z_loss_mlp": 0.32324219, + "step": 3890, + "time_per_iteration": 2.5834460258483887 + }, + { + "auxiliary_loss_clip": 0.01554162, + "auxiliary_loss_mlp": 0.00266516, + "balance_loss_clip": 1.22774434, + "balance_loss_mlp": 0.23385252, + "epoch": 0.2339395761310687, + "flos": 46097988503040.0, + "grad_norm": 91.72431315299482, + "language_loss": 0.80513477, + "learning_rate": 3.579338004009412e-06, + "loss": 0.82334161, + "num_input_tokens_seen": 83749370, + "router_z_loss_clip": 3.26171875, + "router_z_loss_mlp": 0.3269043, + "step": 3891, + "time_per_iteration": 2.813584566116333 + }, + { + "auxiliary_loss_clip": 0.01566615, + "auxiliary_loss_mlp": 0.00263504, + "balance_loss_clip": 1.23669791, + "balance_loss_mlp": 0.23184188, + "epoch": 0.23399969938373666, + "flos": 22382044007040.0, + "grad_norm": 90.87057371080303, + "language_loss": 0.90019822, + "learning_rate": 3.5790990264330433e-06, + "loss": 0.91849947, + "num_input_tokens_seen": 83769560, + "router_z_loss_clip": 3.30273438, + "router_z_loss_mlp": 0.31640625, + "step": 3892, + "time_per_iteration": 2.670623540878296 + }, + { + "auxiliary_loss_clip": 0.01573988, + "auxiliary_loss_mlp": 0.00258747, + "balance_loss_clip": 1.23813069, + "balance_loss_mlp": 0.22620277, + "epoch": 0.23405982263640462, + "flos": 43508542066560.0, + "grad_norm": 14.914749340877671, + "language_loss": 0.72299528, + "learning_rate": 3.578859988977082e-06, + "loss": 0.74132258, + "num_input_tokens_seen": 83795635, + "router_z_loss_clip": 3.35742188, + "router_z_loss_mlp": 0.32543945, + "step": 3893, + "time_per_iteration": 2.8169593811035156 + }, + { + "auxiliary_loss_clip": 0.01587422, + "auxiliary_loss_mlp": 0.00251067, + "balance_loss_clip": 1.24884081, + "balance_loss_mlp": 0.2194528, + "epoch": 0.2341199458890726, + "flos": 22564685687040.0, + "grad_norm": 2.399695096691209, + "language_loss": 0.87677372, + "learning_rate": 3.5786208916505916e-06, + "loss": 0.89515865, + "num_input_tokens_seen": 83814090, + "router_z_loss_clip": 3.38671875, + "router_z_loss_mlp": 0.31591797, + "step": 3894, + "time_per_iteration": 2.5976035594940186 + }, + { + "auxiliary_loss_clip": 0.01566574, + "auxiliary_loss_mlp": 0.00268501, + "balance_loss_clip": 1.23478985, + "balance_loss_mlp": 0.23617128, + "epoch": 0.23418006914174055, + "flos": 25633970933760.0, + "grad_norm": 5.193771558812267, + "language_loss": 0.88060844, + "learning_rate": 3.5783817344626383e-06, + "loss": 0.89895928, + "num_input_tokens_seen": 83836870, + "router_z_loss_clip": 3.31640625, + "router_z_loss_mlp": 0.32324219, + "step": 3895, + "time_per_iteration": 2.687689781188965 + }, + { + "auxiliary_loss_clip": 0.01578566, + "auxiliary_loss_mlp": 0.00287456, + "balance_loss_clip": 1.241799, + "balance_loss_mlp": 0.25410175, + "epoch": 0.23424019239440855, + "flos": 13545936074880.0, + "grad_norm": 21.868334041643614, + "language_loss": 0.92191064, + "learning_rate": 3.578142517422292e-06, + "loss": 0.94057083, + "num_input_tokens_seen": 83853275, + "router_z_loss_clip": 3.37109375, + "router_z_loss_mlp": 0.33349609, + "step": 3896, + "time_per_iteration": 2.6185250282287598 + }, + { + "auxiliary_loss_clip": 0.01571084, + "auxiliary_loss_mlp": 0.00259424, + "balance_loss_clip": 1.23473108, + "balance_loss_mlp": 0.22738072, + "epoch": 0.2343003156470765, + "flos": 22419498913920.0, + "grad_norm": 21.44322702814386, + "language_loss": 0.89202905, + "learning_rate": 3.577903240538623e-06, + "loss": 0.91033411, + "num_input_tokens_seen": 83872340, + "router_z_loss_clip": 3.36132812, + "router_z_loss_mlp": 0.3203125, + "step": 3897, + "time_per_iteration": 2.627413749694824 + }, + { + "auxiliary_loss_clip": 0.01566192, + "auxiliary_loss_mlp": 0.00267953, + "balance_loss_clip": 1.2369715, + "balance_loss_mlp": 0.23457402, + "epoch": 0.23436043889974448, + "flos": 14790815683200.0, + "grad_norm": 2.7420126972733248, + "language_loss": 0.86911112, + "learning_rate": 3.577663903820705e-06, + "loss": 0.8874526, + "num_input_tokens_seen": 83888795, + "router_z_loss_clip": 3.29296875, + "router_z_loss_mlp": 0.33398438, + "step": 3898, + "time_per_iteration": 2.637450695037842 + }, + { + "auxiliary_loss_clip": 0.01574462, + "auxiliary_loss_mlp": 0.0022852, + "balance_loss_clip": 1.24930453, + "balance_loss_mlp": 0.1982166, + "epoch": 0.23442056215241244, + "flos": 22965700101120.0, + "grad_norm": 1.9103512585332605, + "language_loss": 0.82676625, + "learning_rate": 3.577424507277614e-06, + "loss": 0.84479606, + "num_input_tokens_seen": 83906820, + "router_z_loss_clip": 3.24804688, + "router_z_loss_mlp": 0.30297852, + "step": 3899, + "time_per_iteration": 2.669923782348633 + }, + { + "auxiliary_loss_clip": 0.01588296, + "auxiliary_loss_mlp": 0.0026412, + "balance_loss_clip": 1.24891829, + "balance_loss_mlp": 0.2306221, + "epoch": 0.2344806854050804, + "flos": 23071887682560.0, + "grad_norm": 16.727755607081637, + "language_loss": 0.83395565, + "learning_rate": 3.5771850509184277e-06, + "loss": 0.85247976, + "num_input_tokens_seen": 83926370, + "router_z_loss_clip": 3.3984375, + "router_z_loss_mlp": 0.33496094, + "step": 3900, + "time_per_iteration": 4.065730571746826 + }, + { + "auxiliary_loss_clip": 0.01560803, + "auxiliary_loss_mlp": 0.00255706, + "balance_loss_clip": 1.23531055, + "balance_loss_mlp": 0.2193234, + "epoch": 0.23454080865774837, + "flos": 16327074418560.0, + "grad_norm": 16.746783477497107, + "language_loss": 0.73628408, + "learning_rate": 3.5769455347522256e-06, + "loss": 0.75444913, + "num_input_tokens_seen": 83944600, + "router_z_loss_clip": 3.25390625, + "router_z_loss_mlp": 0.36352539, + "step": 3901, + "time_per_iteration": 2.5840258598327637 + }, + { + "auxiliary_loss_clip": 0.01514717, + "auxiliary_loss_mlp": 0.00101665, + "balance_loss_clip": 1.33315003, + "balance_loss_mlp": 0.08468911, + "epoch": 0.23460093191041637, + "flos": 67760958142080.0, + "grad_norm": 0.7420805585575118, + "language_loss": 0.58182955, + "learning_rate": 3.576705958788091e-06, + "loss": 0.59799337, + "num_input_tokens_seen": 84005100, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 0.16992188, + "step": 3902, + "time_per_iteration": 3.0634796619415283 + }, + { + "auxiliary_loss_clip": 0.0158465, + "auxiliary_loss_mlp": 0.00217457, + "balance_loss_clip": 1.25498676, + "balance_loss_mlp": 0.18231453, + "epoch": 0.23466105516308433, + "flos": 20077619990400.0, + "grad_norm": 31.02036356202509, + "language_loss": 0.91951847, + "learning_rate": 3.576466323035108e-06, + "loss": 0.93753952, + "num_input_tokens_seen": 84023775, + "router_z_loss_clip": 3.296875, + "router_z_loss_mlp": 0.35131836, + "step": 3903, + "time_per_iteration": 4.025574445724487 + }, + { + "auxiliary_loss_clip": 0.01554509, + "auxiliary_loss_mlp": 0.00246712, + "balance_loss_clip": 1.23120761, + "balance_loss_mlp": 0.21180737, + "epoch": 0.2347211784157523, + "flos": 24535714642560.0, + "grad_norm": 16.82525038503733, + "language_loss": 0.89066505, + "learning_rate": 3.5762266275023645e-06, + "loss": 0.90867722, + "num_input_tokens_seen": 84042605, + "router_z_loss_clip": 3.23632812, + "router_z_loss_mlp": 0.34936523, + "step": 3904, + "time_per_iteration": 2.63858699798584 + }, + { + "auxiliary_loss_clip": 0.0158458, + "auxiliary_loss_mlp": 0.00255746, + "balance_loss_clip": 1.25399172, + "balance_loss_mlp": 0.217289, + "epoch": 0.23478130166842026, + "flos": 23805040181760.0, + "grad_norm": 33.03334815152209, + "language_loss": 0.81339985, + "learning_rate": 3.57598687219895e-06, + "loss": 0.83180308, + "num_input_tokens_seen": 84061520, + "router_z_loss_clip": 3.30273438, + "router_z_loss_mlp": 0.38452148, + "step": 3905, + "time_per_iteration": 2.6143176555633545 + }, + { + "auxiliary_loss_clip": 0.01561623, + "auxiliary_loss_mlp": 0.00248992, + "balance_loss_clip": 1.2428596, + "balance_loss_mlp": 0.21816465, + "epoch": 0.23484142492108823, + "flos": 24093618048000.0, + "grad_norm": 7.284202622819403, + "language_loss": 0.77291584, + "learning_rate": 3.5757470571339543e-06, + "loss": 0.79102194, + "num_input_tokens_seen": 84081800, + "router_z_loss_clip": 3.18554688, + "router_z_loss_mlp": 0.30810547, + "step": 3906, + "time_per_iteration": 2.64517879486084 + }, + { + "auxiliary_loss_clip": 0.01550132, + "auxiliary_loss_mlp": 0.00253157, + "balance_loss_clip": 1.23197889, + "balance_loss_mlp": 0.2165361, + "epoch": 0.2349015481737562, + "flos": 29095830898560.0, + "grad_norm": 7.350384070206175, + "language_loss": 0.84044558, + "learning_rate": 3.575507182316473e-06, + "loss": 0.85847849, + "num_input_tokens_seen": 84102340, + "router_z_loss_clip": 3.18359375, + "router_z_loss_mlp": 0.36645508, + "step": 3907, + "time_per_iteration": 4.168622732162476 + }, + { + "auxiliary_loss_clip": 0.01573689, + "auxiliary_loss_mlp": 0.00259579, + "balance_loss_clip": 1.24849987, + "balance_loss_mlp": 0.2226243, + "epoch": 0.23496167142642416, + "flos": 18916305373440.0, + "grad_norm": 27.04044802041194, + "language_loss": 0.80529988, + "learning_rate": 3.575267247755601e-06, + "loss": 0.8236326, + "num_input_tokens_seen": 84120370, + "router_z_loss_clip": 3.25390625, + "router_z_loss_mlp": 0.36938477, + "step": 3908, + "time_per_iteration": 2.608916759490967 + }, + { + "auxiliary_loss_clip": 0.01486048, + "auxiliary_loss_mlp": 0.00105317, + "balance_loss_clip": 1.31350732, + "balance_loss_mlp": 0.09005806, + "epoch": 0.23502179467909215, + "flos": 55868062896000.0, + "grad_norm": 1.0581299642734718, + "language_loss": 0.73459184, + "learning_rate": 3.5750272534604367e-06, + "loss": 0.75050551, + "num_input_tokens_seen": 84165515, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.15234375, + "step": 3909, + "time_per_iteration": 2.8471546173095703 + }, + { + "auxiliary_loss_clip": 0.01540532, + "auxiliary_loss_mlp": 0.00260073, + "balance_loss_clip": 1.22871268, + "balance_loss_mlp": 0.22752905, + "epoch": 0.23508191793176011, + "flos": 23401763210880.0, + "grad_norm": 2.9206773803145456, + "language_loss": 0.94145197, + "learning_rate": 3.5747871994400822e-06, + "loss": 0.95945805, + "num_input_tokens_seen": 84184540, + "router_z_loss_clip": 3.12304688, + "router_z_loss_mlp": 0.32568359, + "step": 3910, + "time_per_iteration": 2.6145718097686768 + }, + { + "auxiliary_loss_clip": 0.01554889, + "auxiliary_loss_mlp": 0.00252051, + "balance_loss_clip": 1.24737096, + "balance_loss_mlp": 0.21957824, + "epoch": 0.23514204118442808, + "flos": 20047671025920.0, + "grad_norm": 14.916258518016361, + "language_loss": 0.85056615, + "learning_rate": 3.5745470857036386e-06, + "loss": 0.86863554, + "num_input_tokens_seen": 84202025, + "router_z_loss_clip": 3.07421875, + "router_z_loss_mlp": 0.32470703, + "step": 3911, + "time_per_iteration": 2.6060173511505127 + }, + { + "auxiliary_loss_clip": 0.01566302, + "auxiliary_loss_mlp": 0.00259928, + "balance_loss_clip": 1.2565794, + "balance_loss_mlp": 0.22373646, + "epoch": 0.23520216443709605, + "flos": 21580589796480.0, + "grad_norm": 15.7874487418103, + "language_loss": 0.864685, + "learning_rate": 3.5743069122602122e-06, + "loss": 0.88294727, + "num_input_tokens_seen": 84221895, + "router_z_loss_clip": 3.09960938, + "router_z_loss_mlp": 0.36181641, + "step": 3912, + "time_per_iteration": 4.077056169509888 + }, + { + "auxiliary_loss_clip": 0.01543309, + "auxiliary_loss_mlp": 0.00240574, + "balance_loss_clip": 1.23798919, + "balance_loss_mlp": 0.20876943, + "epoch": 0.235262287689764, + "flos": 23185796688000.0, + "grad_norm": 6.192101228155215, + "language_loss": 0.80841827, + "learning_rate": 3.574066679118909e-06, + "loss": 0.82625711, + "num_input_tokens_seen": 84240455, + "router_z_loss_clip": 3.05078125, + "router_z_loss_mlp": 0.31835938, + "step": 3913, + "time_per_iteration": 2.773421049118042 + }, + { + "auxiliary_loss_clip": 0.01554037, + "auxiliary_loss_mlp": 0.00243949, + "balance_loss_clip": 1.2464658, + "balance_loss_mlp": 0.21030819, + "epoch": 0.23532241094243198, + "flos": 23185222070400.0, + "grad_norm": 8.748131638046283, + "language_loss": 0.84424198, + "learning_rate": 3.57382638628884e-06, + "loss": 0.86222184, + "num_input_tokens_seen": 84261605, + "router_z_loss_clip": 3.07421875, + "router_z_loss_mlp": 0.33642578, + "step": 3914, + "time_per_iteration": 2.6995866298675537 + }, + { + "auxiliary_loss_clip": 0.01545427, + "auxiliary_loss_mlp": 0.00251648, + "balance_loss_clip": 1.24605656, + "balance_loss_mlp": 0.21853197, + "epoch": 0.23538253419509997, + "flos": 17019324305280.0, + "grad_norm": 2.063064134486771, + "language_loss": 0.99065745, + "learning_rate": 3.5735860337791174e-06, + "loss": 1.00862825, + "num_input_tokens_seen": 84278675, + "router_z_loss_clip": 2.98828125, + "router_z_loss_mlp": 0.33105469, + "step": 3915, + "time_per_iteration": 2.558112621307373 + }, + { + "auxiliary_loss_clip": 0.01492564, + "auxiliary_loss_mlp": 0.00106461, + "balance_loss_clip": 1.32614124, + "balance_loss_mlp": 0.0959233, + "epoch": 0.23544265744776793, + "flos": 63448588967040.0, + "grad_norm": 0.8061902987727162, + "language_loss": 0.5936991, + "learning_rate": 3.573345621598854e-06, + "loss": 0.60968935, + "num_input_tokens_seen": 84329765, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.10546875, + "step": 3916, + "time_per_iteration": 3.0434319972991943 + }, + { + "auxiliary_loss_clip": 0.01492433, + "auxiliary_loss_mlp": 0.00159461, + "balance_loss_clip": 1.32925057, + "balance_loss_mlp": 0.14715844, + "epoch": 0.2355027807004359, + "flos": 70515343831680.0, + "grad_norm": 0.7655231812963825, + "language_loss": 0.49393615, + "learning_rate": 3.5731051497571675e-06, + "loss": 0.51045513, + "num_input_tokens_seen": 84393680, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.12255859, + "step": 3917, + "time_per_iteration": 3.137397289276123 + }, + { + "auxiliary_loss_clip": 0.0155895, + "auxiliary_loss_mlp": 0.00270919, + "balance_loss_clip": 1.25439644, + "balance_loss_mlp": 0.23899487, + "epoch": 0.23556290395310386, + "flos": 21434289701760.0, + "grad_norm": 2.1371563650068857, + "language_loss": 0.83699918, + "learning_rate": 3.5728646182631756e-06, + "loss": 0.8552978, + "num_input_tokens_seen": 84412640, + "router_z_loss_clip": 3.04296875, + "router_z_loss_mlp": 0.3190918, + "step": 3918, + "time_per_iteration": 2.5929818153381348 + }, + { + "auxiliary_loss_clip": 0.01544257, + "auxiliary_loss_mlp": 0.00256567, + "balance_loss_clip": 1.24220395, + "balance_loss_mlp": 0.22438005, + "epoch": 0.23562302720577183, + "flos": 18186421011840.0, + "grad_norm": 4.381977124747024, + "language_loss": 0.78792334, + "learning_rate": 3.5726240271259995e-06, + "loss": 0.80593157, + "num_input_tokens_seen": 84431605, + "router_z_loss_clip": 3.02148438, + "router_z_loss_mlp": 0.32202148, + "step": 3919, + "time_per_iteration": 2.61092209815979 + }, + { + "auxiliary_loss_clip": 0.01535642, + "auxiliary_loss_mlp": 0.00256333, + "balance_loss_clip": 1.24478459, + "balance_loss_mlp": 0.22598219, + "epoch": 0.2356831504584398, + "flos": 33730497832320.0, + "grad_norm": 10.733740379184598, + "language_loss": 0.75733346, + "learning_rate": 3.5723833763547634e-06, + "loss": 0.77525318, + "num_input_tokens_seen": 84454210, + "router_z_loss_clip": 2.91210938, + "router_z_loss_mlp": 0.30322266, + "step": 3920, + "time_per_iteration": 2.8123748302459717 + }, + { + "auxiliary_loss_clip": 0.01556207, + "auxiliary_loss_mlp": 0.00290161, + "balance_loss_clip": 1.26170325, + "balance_loss_mlp": 0.25935712, + "epoch": 0.23574327371110776, + "flos": 24932778560640.0, + "grad_norm": 5.393722228254328, + "language_loss": 0.83442342, + "learning_rate": 3.5721426659585916e-06, + "loss": 0.85288709, + "num_input_tokens_seen": 84475540, + "router_z_loss_clip": 2.94140625, + "router_z_loss_mlp": 0.30810547, + "step": 3921, + "time_per_iteration": 2.6680800914764404 + }, + { + "auxiliary_loss_clip": 0.01545339, + "auxiliary_loss_mlp": 0.00257986, + "balance_loss_clip": 1.25230217, + "balance_loss_mlp": 0.22341537, + "epoch": 0.23580339696377575, + "flos": 17822107319040.0, + "grad_norm": 2.3089907563975576, + "language_loss": 0.86559844, + "learning_rate": 3.571901895946612e-06, + "loss": 0.88363171, + "num_input_tokens_seen": 84494580, + "router_z_loss_clip": 2.93164062, + "router_z_loss_mlp": 0.34545898, + "step": 3922, + "time_per_iteration": 2.5993423461914062 + }, + { + "auxiliary_loss_clip": 0.01549468, + "auxiliary_loss_mlp": 0.00267292, + "balance_loss_clip": 1.25801146, + "balance_loss_mlp": 0.23501037, + "epoch": 0.23586352021644372, + "flos": 26286611097600.0, + "grad_norm": 7.836508164740288, + "language_loss": 0.87763554, + "learning_rate": 3.571661066327956e-06, + "loss": 0.89580315, + "num_input_tokens_seen": 84513850, + "router_z_loss_clip": 2.9140625, + "router_z_loss_mlp": 0.32299805, + "step": 3923, + "time_per_iteration": 2.657937526702881 + }, + { + "auxiliary_loss_clip": 0.01581766, + "auxiliary_loss_mlp": 0.00242517, + "balance_loss_clip": 1.29043818, + "balance_loss_mlp": 0.20978169, + "epoch": 0.23592364346911168, + "flos": 14246697484800.0, + "grad_norm": 15.539235542254026, + "language_loss": 0.81207317, + "learning_rate": 3.571420177111754e-06, + "loss": 0.83031607, + "num_input_tokens_seen": 84532315, + "router_z_loss_clip": 2.91796875, + "router_z_loss_mlp": 0.32739258, + "step": 3924, + "time_per_iteration": 2.6227288246154785 + }, + { + "auxiliary_loss_clip": 0.01592422, + "auxiliary_loss_mlp": 0.00278589, + "balance_loss_clip": 1.29647946, + "balance_loss_mlp": 0.24301709, + "epoch": 0.23598376672177965, + "flos": 18587938216320.0, + "grad_norm": 1.6616119983045639, + "language_loss": 0.90760148, + "learning_rate": 3.5711792283071416e-06, + "loss": 0.92631155, + "num_input_tokens_seen": 84550970, + "router_z_loss_clip": 2.95898438, + "router_z_loss_mlp": 0.35571289, + "step": 3925, + "time_per_iteration": 2.6524109840393066 + }, + { + "auxiliary_loss_clip": 0.01584101, + "auxiliary_loss_mlp": 0.00267022, + "balance_loss_clip": 1.29239333, + "balance_loss_mlp": 0.23078194, + "epoch": 0.2360438899744476, + "flos": 22675542036480.0, + "grad_norm": 22.822191881419794, + "language_loss": 0.68925118, + "learning_rate": 3.5709382199232564e-06, + "loss": 0.70776242, + "num_input_tokens_seen": 84571655, + "router_z_loss_clip": 2.91796875, + "router_z_loss_mlp": 0.36230469, + "step": 3926, + "time_per_iteration": 2.6537747383117676 + }, + { + "auxiliary_loss_clip": 0.01609847, + "auxiliary_loss_mlp": 0.00316836, + "balance_loss_clip": 1.31829071, + "balance_loss_mlp": 0.28135914, + "epoch": 0.23610401322711558, + "flos": 29570139014400.0, + "grad_norm": 5.729943695238413, + "language_loss": 0.78991246, + "learning_rate": 3.570697151969235e-06, + "loss": 0.80917937, + "num_input_tokens_seen": 84593130, + "router_z_loss_clip": 2.91601562, + "router_z_loss_mlp": 0.35498047, + "step": 3927, + "time_per_iteration": 2.7240242958068848 + }, + { + "auxiliary_loss_clip": 0.01612617, + "auxiliary_loss_mlp": 0.00275788, + "balance_loss_clip": 1.31724811, + "balance_loss_mlp": 0.23892856, + "epoch": 0.23616413647978354, + "flos": 17858520731520.0, + "grad_norm": 23.753180391826476, + "language_loss": 0.81736124, + "learning_rate": 3.570456024454221e-06, + "loss": 0.8362453, + "num_input_tokens_seen": 84612410, + "router_z_loss_clip": 2.953125, + "router_z_loss_mlp": 0.36889648, + "step": 3928, + "time_per_iteration": 2.6664931774139404 + }, + { + "auxiliary_loss_clip": 0.0164579, + "auxiliary_loss_mlp": 0.00296392, + "balance_loss_clip": 1.34759724, + "balance_loss_mlp": 0.25772041, + "epoch": 0.23622425973245154, + "flos": 11034847157760.0, + "grad_norm": 26.802330543272586, + "language_loss": 0.93908441, + "learning_rate": 3.5702148373873576e-06, + "loss": 0.95850623, + "num_input_tokens_seen": 84627610, + "router_z_loss_clip": 2.984375, + "router_z_loss_mlp": 0.38671875, + "step": 3929, + "time_per_iteration": 2.60994815826416 + }, + { + "auxiliary_loss_clip": 0.01644935, + "auxiliary_loss_mlp": 0.00351134, + "balance_loss_clip": 1.34153318, + "balance_loss_mlp": 0.30964893, + "epoch": 0.2362843829851195, + "flos": 23404061681280.0, + "grad_norm": 12.508779584627433, + "language_loss": 0.80607295, + "learning_rate": 3.569973590777789e-06, + "loss": 0.82603365, + "num_input_tokens_seen": 84648415, + "router_z_loss_clip": 3.03515625, + "router_z_loss_mlp": 0.41479492, + "step": 3930, + "time_per_iteration": 2.6540067195892334 + }, + { + "auxiliary_loss_clip": 0.01649709, + "auxiliary_loss_mlp": 0.00312186, + "balance_loss_clip": 1.35591149, + "balance_loss_mlp": 0.27203614, + "epoch": 0.23634450623778747, + "flos": 39529855261440.0, + "grad_norm": 160.3118174208196, + "language_loss": 0.82710195, + "learning_rate": 3.569732284634665e-06, + "loss": 0.84672087, + "num_input_tokens_seen": 84670080, + "router_z_loss_clip": 2.93554688, + "router_z_loss_mlp": 0.40185547, + "step": 3931, + "time_per_iteration": 2.768934726715088 + }, + { + "auxiliary_loss_clip": 0.01660748, + "auxiliary_loss_mlp": 0.00310808, + "balance_loss_clip": 1.35857034, + "balance_loss_mlp": 0.27013344, + "epoch": 0.23640462949045543, + "flos": 24207167917440.0, + "grad_norm": 11.467637750819025, + "language_loss": 0.89064109, + "learning_rate": 3.569490918967136e-06, + "loss": 0.9103567, + "num_input_tokens_seen": 84686465, + "router_z_loss_clip": 3.02148438, + "router_z_loss_mlp": 0.40673828, + "step": 3932, + "time_per_iteration": 2.6669669151306152 + }, + { + "auxiliary_loss_clip": 0.0165614, + "auxiliary_loss_mlp": 0.00359892, + "balance_loss_clip": 1.35943317, + "balance_loss_mlp": 0.32069585, + "epoch": 0.2364647527431234, + "flos": 26177622255360.0, + "grad_norm": 13.497292911386934, + "language_loss": 0.92114955, + "learning_rate": 3.5692494937843537e-06, + "loss": 0.94130981, + "num_input_tokens_seen": 84708825, + "router_z_loss_clip": 2.96679688, + "router_z_loss_mlp": 0.3918457, + "step": 3933, + "time_per_iteration": 2.6621618270874023 + }, + { + "auxiliary_loss_clip": 0.01653134, + "auxiliary_loss_mlp": 0.00383917, + "balance_loss_clip": 1.34688485, + "balance_loss_mlp": 0.3440769, + "epoch": 0.23652487599579136, + "flos": 22637009721600.0, + "grad_norm": 2.775742774450453, + "language_loss": 0.90382171, + "learning_rate": 3.5690080090954727e-06, + "loss": 0.92419225, + "num_input_tokens_seen": 84726165, + "router_z_loss_clip": 3.06054688, + "router_z_loss_mlp": 0.39868164, + "step": 3934, + "time_per_iteration": 2.6503539085388184 + }, + { + "auxiliary_loss_clip": 0.01642164, + "auxiliary_loss_mlp": 0.00348172, + "balance_loss_clip": 1.34210443, + "balance_loss_mlp": 0.30952418, + "epoch": 0.23658499924845935, + "flos": 21762261809280.0, + "grad_norm": 6.584901106765564, + "language_loss": 0.86059344, + "learning_rate": 3.5687664649096515e-06, + "loss": 0.8804968, + "num_input_tokens_seen": 84745815, + "router_z_loss_clip": 2.99804688, + "router_z_loss_mlp": 0.38671875, + "step": 3935, + "time_per_iteration": 2.633786916732788 + }, + { + "auxiliary_loss_clip": 0.01621958, + "auxiliary_loss_mlp": 0.00351701, + "balance_loss_clip": 1.33459544, + "balance_loss_mlp": 0.31591401, + "epoch": 0.23664512250112732, + "flos": 21798998444160.0, + "grad_norm": 3.688213620248369, + "language_loss": 0.87282515, + "learning_rate": 3.5685248612360487e-06, + "loss": 0.89256179, + "num_input_tokens_seen": 84765415, + "router_z_loss_clip": 2.875, + "router_z_loss_mlp": 0.35791016, + "step": 3936, + "time_per_iteration": 2.6274874210357666 + }, + { + "auxiliary_loss_clip": 0.01615905, + "auxiliary_loss_mlp": 0.00386871, + "balance_loss_clip": 1.32150865, + "balance_loss_mlp": 0.34958249, + "epoch": 0.23670524575379528, + "flos": 22637871648000.0, + "grad_norm": 4.4074001977466954, + "language_loss": 0.84176922, + "learning_rate": 3.568283198083826e-06, + "loss": 0.86179703, + "num_input_tokens_seen": 84787080, + "router_z_loss_clip": 2.94726562, + "router_z_loss_mlp": 0.37255859, + "step": 3937, + "time_per_iteration": 2.643369674682617 + }, + { + "auxiliary_loss_clip": 0.01652543, + "auxiliary_loss_mlp": 0.00457271, + "balance_loss_clip": 1.35462093, + "balance_loss_mlp": 0.417503, + "epoch": 0.23676536900646325, + "flos": 16725000263040.0, + "grad_norm": 121.01535407188125, + "language_loss": 0.92565048, + "learning_rate": 3.568041475462147e-06, + "loss": 0.94674861, + "num_input_tokens_seen": 84805395, + "router_z_loss_clip": 2.97851562, + "router_z_loss_mlp": 0.39770508, + "step": 3938, + "time_per_iteration": 2.582885980606079 + }, + { + "auxiliary_loss_clip": 0.01644519, + "auxiliary_loss_mlp": 0.00487901, + "balance_loss_clip": 1.34440875, + "balance_loss_mlp": 0.44663098, + "epoch": 0.23682549225913122, + "flos": 11135611785600.0, + "grad_norm": 47.63368999001478, + "language_loss": 1.01336813, + "learning_rate": 3.5677996933801785e-06, + "loss": 1.03469241, + "num_input_tokens_seen": 84818090, + "router_z_loss_clip": 3.00195312, + "router_z_loss_mlp": 0.41259766, + "step": 3939, + "time_per_iteration": 2.525876760482788 + }, + { + "auxiliary_loss_clip": 0.01645792, + "auxiliary_loss_mlp": 0.00459089, + "balance_loss_clip": 1.33927917, + "balance_loss_mlp": 0.41877213, + "epoch": 0.23688561551179918, + "flos": 22559226819840.0, + "grad_norm": 2.005170846304699, + "language_loss": 0.89588594, + "learning_rate": 3.567557851847088e-06, + "loss": 0.91693473, + "num_input_tokens_seen": 84837695, + "router_z_loss_clip": 3.06640625, + "router_z_loss_mlp": 0.40332031, + "step": 3940, + "time_per_iteration": 2.6353864669799805 + }, + { + "auxiliary_loss_clip": 0.01648145, + "auxiliary_loss_mlp": 0.00444867, + "balance_loss_clip": 1.33388793, + "balance_loss_mlp": 0.40531337, + "epoch": 0.23694573876446715, + "flos": 18514895909760.0, + "grad_norm": 5.406575191533512, + "language_loss": 0.99016082, + "learning_rate": 3.5673159508720464e-06, + "loss": 1.01109099, + "num_input_tokens_seen": 84854630, + "router_z_loss_clip": 3.140625, + "router_z_loss_mlp": 0.39550781, + "step": 3941, + "time_per_iteration": 2.5834462642669678 + }, + { + "auxiliary_loss_clip": 0.01638895, + "auxiliary_loss_mlp": 0.00448999, + "balance_loss_clip": 1.325899, + "balance_loss_mlp": 0.40818125, + "epoch": 0.23700586201713514, + "flos": 15335723980800.0, + "grad_norm": 29.92943228445727, + "language_loss": 0.93808508, + "learning_rate": 3.5670739904642274e-06, + "loss": 0.95896399, + "num_input_tokens_seen": 84871805, + "router_z_loss_clip": 3.13085938, + "router_z_loss_mlp": 0.40820312, + "step": 3942, + "time_per_iteration": 4.1977598667144775 + }, + { + "auxiliary_loss_clip": 0.01648596, + "auxiliary_loss_mlp": 0.00480081, + "balance_loss_clip": 1.33617735, + "balance_loss_mlp": 0.43995482, + "epoch": 0.2370659852698031, + "flos": 23947605262080.0, + "grad_norm": 5.860146191166942, + "language_loss": 0.89329016, + "learning_rate": 3.5668319706328065e-06, + "loss": 0.91457689, + "num_input_tokens_seen": 84889815, + "router_z_loss_clip": 3.12695312, + "router_z_loss_mlp": 0.40136719, + "step": 3943, + "time_per_iteration": 2.661034345626831 + }, + { + "auxiliary_loss_clip": 0.01654389, + "auxiliary_loss_mlp": 0.00436482, + "balance_loss_clip": 1.32853842, + "balance_loss_mlp": 0.39666647, + "epoch": 0.23712610852247107, + "flos": 15332527670400.0, + "grad_norm": 97.690761144732, + "language_loss": 0.78468108, + "learning_rate": 3.566589891386959e-06, + "loss": 0.80558985, + "num_input_tokens_seen": 84904380, + "router_z_loss_clip": 3.26171875, + "router_z_loss_mlp": 0.39794922, + "step": 3944, + "time_per_iteration": 2.6178956031799316 + }, + { + "auxiliary_loss_clip": 0.0165479, + "auxiliary_loss_mlp": 0.00423613, + "balance_loss_clip": 1.32936287, + "balance_loss_mlp": 0.38703954, + "epoch": 0.23718623177513903, + "flos": 19682567233920.0, + "grad_norm": 3.488330412869094, + "language_loss": 0.86114967, + "learning_rate": 3.566347752735866e-06, + "loss": 0.88193369, + "num_input_tokens_seen": 84922935, + "router_z_loss_clip": 3.25390625, + "router_z_loss_mlp": 0.3659668, + "step": 3945, + "time_per_iteration": 4.039342164993286 + }, + { + "auxiliary_loss_clip": 0.01670172, + "auxiliary_loss_mlp": 0.0042583, + "balance_loss_clip": 1.34577441, + "balance_loss_mlp": 0.38873214, + "epoch": 0.237246355027807, + "flos": 24973322037120.0, + "grad_norm": 7.270644829318704, + "language_loss": 0.70462501, + "learning_rate": 3.5661055546887094e-06, + "loss": 0.72558498, + "num_input_tokens_seen": 84943685, + "router_z_loss_clip": 3.24414062, + "router_z_loss_mlp": 0.37084961, + "step": 3946, + "time_per_iteration": 2.676464080810547 + }, + { + "auxiliary_loss_clip": 0.01634354, + "auxiliary_loss_mlp": 0.00384909, + "balance_loss_clip": 1.32062054, + "balance_loss_mlp": 0.34714314, + "epoch": 0.23730647828047496, + "flos": 15377416692480.0, + "grad_norm": 5.727668811218663, + "language_loss": 0.83991826, + "learning_rate": 3.5658632972546734e-06, + "loss": 0.860111, + "num_input_tokens_seen": 84959505, + "router_z_loss_clip": 3.13476562, + "router_z_loss_mlp": 0.37768555, + "step": 3947, + "time_per_iteration": 2.6729941368103027 + }, + { + "auxiliary_loss_clip": 0.01650991, + "auxiliary_loss_mlp": 0.00338129, + "balance_loss_clip": 1.33138084, + "balance_loss_mlp": 0.30110276, + "epoch": 0.23736660153314296, + "flos": 28150662372480.0, + "grad_norm": 107.73161182776765, + "language_loss": 0.87244987, + "learning_rate": 3.565620980442944e-06, + "loss": 0.89234114, + "num_input_tokens_seen": 84982130, + "router_z_loss_clip": 3.1953125, + "router_z_loss_mlp": 0.37036133, + "step": 3948, + "time_per_iteration": 2.7135183811187744 + }, + { + "auxiliary_loss_clip": 0.01628378, + "auxiliary_loss_mlp": 0.00389448, + "balance_loss_clip": 1.31541967, + "balance_loss_mlp": 0.35292196, + "epoch": 0.23742672478581092, + "flos": 22086570729600.0, + "grad_norm": 25.556796644547752, + "language_loss": 0.87699342, + "learning_rate": 3.5653786042627107e-06, + "loss": 0.89717168, + "num_input_tokens_seen": 85000640, + "router_z_loss_clip": 3.12890625, + "router_z_loss_mlp": 0.36474609, + "step": 3949, + "time_per_iteration": 2.6440789699554443 + }, + { + "auxiliary_loss_clip": 0.01608256, + "auxiliary_loss_mlp": 0.00339505, + "balance_loss_clip": 1.29179096, + "balance_loss_mlp": 0.30154908, + "epoch": 0.2374868480384789, + "flos": 19537093152000.0, + "grad_norm": 2.64894777950134, + "language_loss": 0.80734909, + "learning_rate": 3.565136168723163e-06, + "loss": 0.82682675, + "num_input_tokens_seen": 85018970, + "router_z_loss_clip": 3.16210938, + "router_z_loss_mlp": 0.37939453, + "step": 3950, + "time_per_iteration": 4.183924674987793 + }, + { + "auxiliary_loss_clip": 0.01570186, + "auxiliary_loss_mlp": 0.00302515, + "balance_loss_clip": 1.26669753, + "balance_loss_mlp": 0.26582262, + "epoch": 0.23754697129114685, + "flos": 19422501788160.0, + "grad_norm": 11.545321788611842, + "language_loss": 0.78510273, + "learning_rate": 3.564893673833495e-06, + "loss": 0.80382979, + "num_input_tokens_seen": 85035905, + "router_z_loss_clip": 3.03515625, + "router_z_loss_mlp": 0.36694336, + "step": 3951, + "time_per_iteration": 2.6240272521972656 + }, + { + "auxiliary_loss_clip": 0.01566021, + "auxiliary_loss_mlp": 0.00320402, + "balance_loss_clip": 1.2649008, + "balance_loss_mlp": 0.28661782, + "epoch": 0.23760709454381482, + "flos": 19501002961920.0, + "grad_norm": 14.373065667816087, + "language_loss": 0.81967396, + "learning_rate": 3.564651119602903e-06, + "loss": 0.83853817, + "num_input_tokens_seen": 85054560, + "router_z_loss_clip": 3.01367188, + "router_z_loss_mlp": 0.33789062, + "step": 3952, + "time_per_iteration": 2.609994411468506 + }, + { + "auxiliary_loss_clip": 0.01546732, + "auxiliary_loss_mlp": 0.00285538, + "balance_loss_clip": 1.24812555, + "balance_loss_mlp": 0.25416243, + "epoch": 0.23766721779648278, + "flos": 27636600879360.0, + "grad_norm": 1855.2640625322074, + "language_loss": 0.78238112, + "learning_rate": 3.564408506040583e-06, + "loss": 0.80070382, + "num_input_tokens_seen": 85074425, + "router_z_loss_clip": 2.9921875, + "router_z_loss_mlp": 0.31347656, + "step": 3953, + "time_per_iteration": 2.6848113536834717 + }, + { + "auxiliary_loss_clip": 0.01533833, + "auxiliary_loss_mlp": 0.00271547, + "balance_loss_clip": 1.23571539, + "balance_loss_mlp": 0.23644015, + "epoch": 0.23772734104915075, + "flos": 23404348990080.0, + "grad_norm": 3.4353021497600142, + "language_loss": 0.90531409, + "learning_rate": 3.5641658331557356e-06, + "loss": 0.92336792, + "num_input_tokens_seen": 85092865, + "router_z_loss_clip": 2.98632812, + "router_z_loss_mlp": 0.35119629, + "step": 3954, + "time_per_iteration": 4.022048234939575 + }, + { + "auxiliary_loss_clip": 0.01533009, + "auxiliary_loss_mlp": 0.00287061, + "balance_loss_clip": 1.23560345, + "balance_loss_mlp": 0.2527523, + "epoch": 0.23778746430181874, + "flos": 15705496540800.0, + "grad_norm": 11.091313293914116, + "language_loss": 0.76538479, + "learning_rate": 3.5639231009575634e-06, + "loss": 0.78358549, + "num_input_tokens_seen": 85110175, + "router_z_loss_clip": 2.97265625, + "router_z_loss_mlp": 0.34301758, + "step": 3955, + "time_per_iteration": 2.6496119499206543 + }, + { + "auxiliary_loss_clip": 0.01513011, + "auxiliary_loss_mlp": 0.00270749, + "balance_loss_clip": 1.22067261, + "balance_loss_mlp": 0.2366786, + "epoch": 0.2378475875544867, + "flos": 19426452284160.0, + "grad_norm": 1.514591256806729, + "language_loss": 0.89229727, + "learning_rate": 3.5636803094552704e-06, + "loss": 0.91013491, + "num_input_tokens_seen": 85129925, + "router_z_loss_clip": 2.92382812, + "router_z_loss_mlp": 0.34082031, + "step": 3956, + "time_per_iteration": 2.6496944427490234 + }, + { + "auxiliary_loss_clip": 0.01495179, + "auxiliary_loss_mlp": 0.00298973, + "balance_loss_clip": 1.21028054, + "balance_loss_mlp": 0.26752606, + "epoch": 0.23790771080715467, + "flos": 22268565964800.0, + "grad_norm": 32.466568387755466, + "language_loss": 0.92815328, + "learning_rate": 3.5634374586580635e-06, + "loss": 0.94609487, + "num_input_tokens_seen": 85147755, + "router_z_loss_clip": 2.84960938, + "router_z_loss_mlp": 0.31445312, + "step": 3957, + "time_per_iteration": 2.6973254680633545 + }, + { + "auxiliary_loss_clip": 0.01475926, + "auxiliary_loss_mlp": 0.002825, + "balance_loss_clip": 1.18812442, + "balance_loss_mlp": 0.25200653, + "epoch": 0.23796783405982264, + "flos": 20047311889920.0, + "grad_norm": 24.846990447340048, + "language_loss": 0.77539384, + "learning_rate": 3.563194548575151e-06, + "loss": 0.79297811, + "num_input_tokens_seen": 85165270, + "router_z_loss_clip": 2.88085938, + "router_z_loss_mlp": 0.30493164, + "step": 3958, + "time_per_iteration": 2.6372926235198975 + }, + { + "auxiliary_loss_clip": 0.01488619, + "auxiliary_loss_mlp": 0.00249739, + "balance_loss_clip": 1.19796634, + "balance_loss_mlp": 0.21776709, + "epoch": 0.2380279573124906, + "flos": 14245943299200.0, + "grad_norm": 39.59851545745892, + "language_loss": 0.76945907, + "learning_rate": 3.562951579215745e-06, + "loss": 0.78684258, + "num_input_tokens_seen": 85181555, + "router_z_loss_clip": 2.90625, + "router_z_loss_mlp": 0.31958008, + "step": 3959, + "time_per_iteration": 2.723525047302246 + }, + { + "auxiliary_loss_clip": 0.01478225, + "auxiliary_loss_mlp": 0.00258809, + "balance_loss_clip": 1.18916094, + "balance_loss_mlp": 0.22836268, + "epoch": 0.23808808056515857, + "flos": 21179180332800.0, + "grad_norm": 11.601468047184996, + "language_loss": 0.79523379, + "learning_rate": 3.5627085505890586e-06, + "loss": 0.81260413, + "num_input_tokens_seen": 85199455, + "router_z_loss_clip": 2.88867188, + "router_z_loss_mlp": 0.30444336, + "step": 3960, + "time_per_iteration": 2.6715517044067383 + }, + { + "auxiliary_loss_clip": 0.01471644, + "auxiliary_loss_mlp": 0.00238101, + "balance_loss_clip": 1.18275928, + "balance_loss_mlp": 0.20811982, + "epoch": 0.23814820381782653, + "flos": 22528308188160.0, + "grad_norm": 27.015769932336003, + "language_loss": 0.82552099, + "learning_rate": 3.562465462704307e-06, + "loss": 0.84261835, + "num_input_tokens_seen": 85219170, + "router_z_loss_clip": 2.88867188, + "router_z_loss_mlp": 0.29968262, + "step": 3961, + "time_per_iteration": 2.6212189197540283 + }, + { + "auxiliary_loss_clip": 0.01475616, + "auxiliary_loss_mlp": 0.00262092, + "balance_loss_clip": 1.18285549, + "balance_loss_mlp": 0.23019123, + "epoch": 0.23820832707049452, + "flos": 22304332932480.0, + "grad_norm": 6.477860557403541, + "language_loss": 0.74430829, + "learning_rate": 3.5622223155707085e-06, + "loss": 0.76168537, + "num_input_tokens_seen": 85238480, + "router_z_loss_clip": 2.92382812, + "router_z_loss_mlp": 0.31860352, + "step": 3962, + "time_per_iteration": 2.6233973503112793 + }, + { + "auxiliary_loss_clip": 0.01442909, + "auxiliary_loss_mlp": 0.00226516, + "balance_loss_clip": 1.16206479, + "balance_loss_mlp": 0.1965825, + "epoch": 0.2382684503231625, + "flos": 24864225454080.0, + "grad_norm": 3.4707071771198255, + "language_loss": 0.82333398, + "learning_rate": 3.561979109197483e-06, + "loss": 0.84002823, + "num_input_tokens_seen": 85259180, + "router_z_loss_clip": 2.80859375, + "router_z_loss_mlp": 0.29919434, + "step": 3963, + "time_per_iteration": 2.689762830734253 + }, + { + "auxiliary_loss_clip": 0.01470324, + "auxiliary_loss_mlp": 0.0026217, + "balance_loss_clip": 1.18095839, + "balance_loss_mlp": 0.23022181, + "epoch": 0.23832857357583045, + "flos": 21871609787520.0, + "grad_norm": 22.221095597183197, + "language_loss": 0.86760092, + "learning_rate": 3.5617358435938538e-06, + "loss": 0.88492584, + "num_input_tokens_seen": 85278550, + "router_z_loss_clip": 2.89257812, + "router_z_loss_mlp": 0.31933594, + "step": 3964, + "time_per_iteration": 2.6445839405059814 + }, + { + "auxiliary_loss_clip": 0.01447027, + "auxiliary_loss_mlp": 0.00258091, + "balance_loss_clip": 1.15965831, + "balance_loss_mlp": 0.22833669, + "epoch": 0.23838869682849842, + "flos": 21288061434240.0, + "grad_norm": 2.420332916241352, + "language_loss": 0.80662549, + "learning_rate": 3.561492518769045e-06, + "loss": 0.82367671, + "num_input_tokens_seen": 85297345, + "router_z_loss_clip": 2.875, + "router_z_loss_mlp": 0.29711914, + "step": 3965, + "time_per_iteration": 2.627964973449707 + }, + { + "auxiliary_loss_clip": 0.01460842, + "auxiliary_loss_mlp": 0.00255318, + "balance_loss_clip": 1.17231417, + "balance_loss_mlp": 0.22568239, + "epoch": 0.23844882008116638, + "flos": 16180594755840.0, + "grad_norm": 4.099865366198051, + "language_loss": 0.87728095, + "learning_rate": 3.561249134732282e-06, + "loss": 0.89444256, + "num_input_tokens_seen": 85315105, + "router_z_loss_clip": 2.8828125, + "router_z_loss_mlp": 0.29663086, + "step": 3966, + "time_per_iteration": 2.6249730587005615 + }, + { + "auxiliary_loss_clip": 0.01441323, + "auxiliary_loss_mlp": 0.00251915, + "balance_loss_clip": 1.15775037, + "balance_loss_mlp": 0.22320952, + "epoch": 0.23850894333383435, + "flos": 21069724613760.0, + "grad_norm": 2.78017290886812, + "language_loss": 0.75793874, + "learning_rate": 3.561005691492797e-06, + "loss": 0.77487123, + "num_input_tokens_seen": 85334735, + "router_z_loss_clip": 2.83789062, + "router_z_loss_mlp": 0.28735352, + "step": 3967, + "time_per_iteration": 2.7137513160705566 + }, + { + "auxiliary_loss_clip": 0.01446857, + "auxiliary_loss_mlp": 0.00238927, + "balance_loss_clip": 1.16647339, + "balance_loss_mlp": 0.21068624, + "epoch": 0.23856906658650234, + "flos": 17201606849280.0, + "grad_norm": 5.815829030838157, + "language_loss": 0.77416205, + "learning_rate": 3.5607621890598185e-06, + "loss": 0.79101992, + "num_input_tokens_seen": 85352875, + "router_z_loss_clip": 2.80859375, + "router_z_loss_mlp": 0.28271484, + "step": 3968, + "time_per_iteration": 2.6641108989715576 + }, + { + "auxiliary_loss_clip": 0.01411662, + "auxiliary_loss_mlp": 0.00243993, + "balance_loss_clip": 1.13764894, + "balance_loss_mlp": 0.21500111, + "epoch": 0.2386291898391703, + "flos": 29494223619840.0, + "grad_norm": 342.5621656772212, + "language_loss": 0.83635175, + "learning_rate": 3.5605186274425823e-06, + "loss": 0.85290825, + "num_input_tokens_seen": 85372205, + "router_z_loss_clip": 2.7421875, + "router_z_loss_mlp": 0.29003906, + "step": 3969, + "time_per_iteration": 2.7201578617095947 + }, + { + "auxiliary_loss_clip": 0.014072, + "auxiliary_loss_mlp": 0.00201002, + "balance_loss_clip": 1.13388729, + "balance_loss_mlp": 0.1745494, + "epoch": 0.23868931309183827, + "flos": 21142443697920.0, + "grad_norm": 3.903847908375964, + "language_loss": 0.84533596, + "learning_rate": 3.5602750066503225e-06, + "loss": 0.86141801, + "num_input_tokens_seen": 85389705, + "router_z_loss_clip": 2.73242188, + "router_z_loss_mlp": 0.26464844, + "step": 3970, + "time_per_iteration": 2.6480729579925537 + }, + { + "auxiliary_loss_clip": 0.01422776, + "auxiliary_loss_mlp": 0.00257367, + "balance_loss_clip": 1.14847374, + "balance_loss_mlp": 0.22830355, + "epoch": 0.23874943634450624, + "flos": 25659394784640.0, + "grad_norm": 15.270517772228047, + "language_loss": 0.93937302, + "learning_rate": 3.5600313266922793e-06, + "loss": 0.95617437, + "num_input_tokens_seen": 85407855, + "router_z_loss_clip": 2.74609375, + "router_z_loss_mlp": 0.29077148, + "step": 3971, + "time_per_iteration": 2.708425998687744 + }, + { + "auxiliary_loss_clip": 0.01355738, + "auxiliary_loss_mlp": 0.00077352, + "balance_loss_clip": 1.20065081, + "balance_loss_mlp": 0.06853019, + "epoch": 0.2388095595971742, + "flos": 58986618624000.0, + "grad_norm": 0.7490609841624442, + "language_loss": 0.62479568, + "learning_rate": 3.5597875875776915e-06, + "loss": 0.63912654, + "num_input_tokens_seen": 85470885, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.08837891, + "step": 3972, + "time_per_iteration": 3.212846279144287 + }, + { + "auxiliary_loss_clip": 0.01428399, + "auxiliary_loss_mlp": 0.00245764, + "balance_loss_clip": 1.15253651, + "balance_loss_mlp": 0.21755911, + "epoch": 0.23886968284984217, + "flos": 16800341040000.0, + "grad_norm": 5.510743683047225, + "language_loss": 0.89110172, + "learning_rate": 3.5595437893158013e-06, + "loss": 0.90784335, + "num_input_tokens_seen": 85488460, + "router_z_loss_clip": 2.75976562, + "router_z_loss_mlp": 0.2824707, + "step": 3973, + "time_per_iteration": 2.629711151123047 + }, + { + "auxiliary_loss_clip": 0.01428039, + "auxiliary_loss_mlp": 0.00260507, + "balance_loss_clip": 1.15457368, + "balance_loss_mlp": 0.23335147, + "epoch": 0.23892980610251013, + "flos": 22382654538240.0, + "grad_norm": 1.7892156681987128, + "language_loss": 0.85115939, + "learning_rate": 3.5592999319158546e-06, + "loss": 0.86804485, + "num_input_tokens_seen": 85508590, + "router_z_loss_clip": 2.73632812, + "router_z_loss_mlp": 0.27185059, + "step": 3974, + "time_per_iteration": 2.627185106277466 + }, + { + "auxiliary_loss_clip": 0.01438368, + "auxiliary_loss_mlp": 0.00246295, + "balance_loss_clip": 1.1560638, + "balance_loss_mlp": 0.21894856, + "epoch": 0.23898992935517813, + "flos": 12823198519680.0, + "grad_norm": 4.488444165019723, + "language_loss": 0.93234682, + "learning_rate": 3.5590560153870984e-06, + "loss": 0.94919342, + "num_input_tokens_seen": 85525970, + "router_z_loss_clip": 2.82226562, + "router_z_loss_mlp": 0.27331543, + "step": 3975, + "time_per_iteration": 2.6112895011901855 + }, + { + "auxiliary_loss_clip": 0.01437789, + "auxiliary_loss_mlp": 0.00225662, + "balance_loss_clip": 1.15942562, + "balance_loss_mlp": 0.19879206, + "epoch": 0.2390500526078461, + "flos": 22345666508160.0, + "grad_norm": 3.5207995961601575, + "language_loss": 0.94903219, + "learning_rate": 3.5588120397387816e-06, + "loss": 0.96566677, + "num_input_tokens_seen": 85543700, + "router_z_loss_clip": 2.78320312, + "router_z_loss_mlp": 0.2689209, + "step": 3976, + "time_per_iteration": 2.615145683288574 + }, + { + "auxiliary_loss_clip": 0.01445834, + "auxiliary_loss_mlp": 0.00241606, + "balance_loss_clip": 1.16587532, + "balance_loss_mlp": 0.21528485, + "epoch": 0.23911017586051406, + "flos": 22635142214400.0, + "grad_norm": 11.909951959059281, + "language_loss": 0.79477876, + "learning_rate": 3.5585680049801566e-06, + "loss": 0.8116532, + "num_input_tokens_seen": 85562765, + "router_z_loss_clip": 2.796875, + "router_z_loss_mlp": 0.26330566, + "step": 3977, + "time_per_iteration": 2.6397438049316406 + }, + { + "auxiliary_loss_clip": 0.01454776, + "auxiliary_loss_mlp": 0.00254609, + "balance_loss_clip": 1.16945338, + "balance_loss_mlp": 0.2245442, + "epoch": 0.23917029911318202, + "flos": 23653281219840.0, + "grad_norm": 3.551392295607186, + "language_loss": 0.78315997, + "learning_rate": 3.5583239111204764e-06, + "loss": 0.80025375, + "num_input_tokens_seen": 85581755, + "router_z_loss_clip": 2.85351562, + "router_z_loss_mlp": 0.30053711, + "step": 3978, + "time_per_iteration": 2.621234893798828 + }, + { + "auxiliary_loss_clip": 0.01496589, + "auxiliary_loss_mlp": 0.00228627, + "balance_loss_clip": 1.2032913, + "balance_loss_mlp": 0.19944505, + "epoch": 0.23923042236585, + "flos": 22783597125120.0, + "grad_norm": 183.0761118855229, + "language_loss": 0.89345855, + "learning_rate": 3.558079758168997e-06, + "loss": 0.91071075, + "num_input_tokens_seen": 85599455, + "router_z_loss_clip": 2.93359375, + "router_z_loss_mlp": 0.29199219, + "step": 3979, + "time_per_iteration": 2.6349189281463623 + }, + { + "auxiliary_loss_clip": 0.01492647, + "auxiliary_loss_mlp": 0.00240628, + "balance_loss_clip": 1.20275784, + "balance_loss_mlp": 0.21009868, + "epoch": 0.23929054561851795, + "flos": 28147717457280.0, + "grad_norm": 93.95361243140225, + "language_loss": 0.89174962, + "learning_rate": 3.557835546134977e-06, + "loss": 0.90908241, + "num_input_tokens_seen": 85619970, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.30529785, + "step": 3980, + "time_per_iteration": 2.731947898864746 + }, + { + "auxiliary_loss_clip": 0.01496884, + "auxiliary_loss_mlp": 0.00223731, + "balance_loss_clip": 1.20652723, + "balance_loss_mlp": 0.19218794, + "epoch": 0.23935066887118592, + "flos": 21686525982720.0, + "grad_norm": 8.299072590549539, + "language_loss": 0.90774637, + "learning_rate": 3.5575912750276775e-06, + "loss": 0.92495251, + "num_input_tokens_seen": 85638850, + "router_z_loss_clip": 2.90625, + "router_z_loss_mlp": 0.31542969, + "step": 3981, + "time_per_iteration": 2.6254470348358154 + }, + { + "auxiliary_loss_clip": 0.01505853, + "auxiliary_loss_mlp": 0.00255128, + "balance_loss_clip": 1.21459556, + "balance_loss_mlp": 0.2216306, + "epoch": 0.2394107921238539, + "flos": 32122274198400.0, + "grad_norm": 2.7316425743621875, + "language_loss": 0.84854817, + "learning_rate": 3.5573469448563607e-06, + "loss": 0.86615801, + "num_input_tokens_seen": 85656285, + "router_z_loss_clip": 2.9140625, + "router_z_loss_mlp": 0.3347168, + "step": 3982, + "time_per_iteration": 2.7111430168151855 + }, + { + "auxiliary_loss_clip": 0.01502883, + "auxiliary_loss_mlp": 0.00245068, + "balance_loss_clip": 1.21609163, + "balance_loss_mlp": 0.21405007, + "epoch": 0.23947091537652188, + "flos": 17019180650880.0, + "grad_norm": 7.510907698285992, + "language_loss": 0.85020339, + "learning_rate": 3.5571025556302915e-06, + "loss": 0.86768293, + "num_input_tokens_seen": 85673020, + "router_z_loss_clip": 2.8671875, + "router_z_loss_mlp": 0.31005859, + "step": 3983, + "time_per_iteration": 2.616969108581543 + }, + { + "auxiliary_loss_clip": 0.0151528, + "auxiliary_loss_mlp": 0.00268483, + "balance_loss_clip": 1.22330046, + "balance_loss_mlp": 0.23410329, + "epoch": 0.23953103862918984, + "flos": 20593584904320.0, + "grad_norm": 2.082014653407247, + "language_loss": 0.79926229, + "learning_rate": 3.556858107358737e-06, + "loss": 0.81709993, + "num_input_tokens_seen": 85692565, + "router_z_loss_clip": 2.921875, + "router_z_loss_mlp": 0.34375, + "step": 3984, + "time_per_iteration": 4.0893638134002686 + }, + { + "auxiliary_loss_clip": 0.01509697, + "auxiliary_loss_mlp": 0.00261698, + "balance_loss_clip": 1.21801066, + "balance_loss_mlp": 0.23075168, + "epoch": 0.2395911618818578, + "flos": 20704405340160.0, + "grad_norm": 12.754420812163612, + "language_loss": 0.87867707, + "learning_rate": 3.5566136000509674e-06, + "loss": 0.89639103, + "num_input_tokens_seen": 85709730, + "router_z_loss_clip": 2.91601562, + "router_z_loss_mlp": 0.30932617, + "step": 3985, + "time_per_iteration": 2.676412343978882 + }, + { + "auxiliary_loss_clip": 0.01516537, + "auxiliary_loss_mlp": 0.00242394, + "balance_loss_clip": 1.22189903, + "balance_loss_mlp": 0.21001635, + "epoch": 0.23965128513452577, + "flos": 27053519402880.0, + "grad_norm": 33.16018614193123, + "language_loss": 0.81226176, + "learning_rate": 3.556369033716254e-06, + "loss": 0.82985109, + "num_input_tokens_seen": 85730045, + "router_z_loss_clip": 2.9453125, + "router_z_loss_mlp": 0.32373047, + "step": 3986, + "time_per_iteration": 2.7242431640625 + }, + { + "auxiliary_loss_clip": 0.01532144, + "auxiliary_loss_mlp": 0.00283033, + "balance_loss_clip": 1.23358738, + "balance_loss_mlp": 0.24963051, + "epoch": 0.23971140838719374, + "flos": 23144319457920.0, + "grad_norm": 6.573784297303717, + "language_loss": 0.94496262, + "learning_rate": 3.556124408363871e-06, + "loss": 0.96311438, + "num_input_tokens_seen": 85747590, + "router_z_loss_clip": 2.984375, + "router_z_loss_mlp": 0.33422852, + "step": 3987, + "time_per_iteration": 4.047123908996582 + }, + { + "auxiliary_loss_clip": 0.01538512, + "auxiliary_loss_mlp": 0.0023808, + "balance_loss_clip": 1.2503531, + "balance_loss_mlp": 0.20823035, + "epoch": 0.23977153163986173, + "flos": 18034554309120.0, + "grad_norm": 39.797067360883126, + "language_loss": 0.88829052, + "learning_rate": 3.5558797240030945e-06, + "loss": 0.90605646, + "num_input_tokens_seen": 85763460, + "router_z_loss_clip": 2.8828125, + "router_z_loss_mlp": 0.29882812, + "step": 3988, + "time_per_iteration": 2.5939865112304688 + }, + { + "auxiliary_loss_clip": 0.01519394, + "auxiliary_loss_mlp": 0.00276943, + "balance_loss_clip": 1.23009777, + "balance_loss_mlp": 0.24417222, + "epoch": 0.2398316548925297, + "flos": 18113378705280.0, + "grad_norm": 1.7075536099014303, + "language_loss": 0.92908394, + "learning_rate": 3.5556349806432035e-06, + "loss": 0.94704723, + "num_input_tokens_seen": 85782050, + "router_z_loss_clip": 2.89257812, + "router_z_loss_mlp": 0.32800293, + "step": 3989, + "time_per_iteration": 2.5944008827209473 + }, + { + "auxiliary_loss_clip": 0.01515515, + "auxiliary_loss_mlp": 0.00232092, + "balance_loss_clip": 1.22836828, + "balance_loss_mlp": 0.20257606, + "epoch": 0.23989177814519766, + "flos": 12567730014720.0, + "grad_norm": 101.71186192355533, + "language_loss": 0.91692352, + "learning_rate": 3.555390178293477e-06, + "loss": 0.93439955, + "num_input_tokens_seen": 85797400, + "router_z_loss_clip": 2.8671875, + "router_z_loss_mlp": 0.29541016, + "step": 3990, + "time_per_iteration": 2.6023075580596924 + }, + { + "auxiliary_loss_clip": 0.01515019, + "auxiliary_loss_mlp": 0.00250389, + "balance_loss_clip": 1.22752619, + "balance_loss_mlp": 0.21982333, + "epoch": 0.23995190139786562, + "flos": 25264593423360.0, + "grad_norm": 1.8001088674027574, + "language_loss": 0.80636954, + "learning_rate": 3.5551453169631994e-06, + "loss": 0.82402354, + "num_input_tokens_seen": 85818995, + "router_z_loss_clip": 2.87304688, + "router_z_loss_mlp": 0.3059082, + "step": 3991, + "time_per_iteration": 2.659005880355835 + }, + { + "auxiliary_loss_clip": 0.01360896, + "auxiliary_loss_mlp": 0.00107439, + "balance_loss_clip": 1.19947219, + "balance_loss_mlp": 0.09671061, + "epoch": 0.2400120246505336, + "flos": 61960379650560.0, + "grad_norm": 0.8553043211173433, + "language_loss": 0.63141495, + "learning_rate": 3.554900396661656e-06, + "loss": 0.64609826, + "num_input_tokens_seen": 85876695, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.10742188, + "step": 3992, + "time_per_iteration": 4.4299726486206055 + }, + { + "auxiliary_loss_clip": 0.01360402, + "auxiliary_loss_mlp": 0.00109359, + "balance_loss_clip": 1.19582486, + "balance_loss_mlp": 0.0978668, + "epoch": 0.24007214790320155, + "flos": 66708560540160.0, + "grad_norm": 0.7527208033166083, + "language_loss": 0.62805325, + "learning_rate": 3.5546554173981334e-06, + "loss": 0.6427508, + "num_input_tokens_seen": 85940990, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.11474609, + "step": 3993, + "time_per_iteration": 3.1985270977020264 + }, + { + "auxiliary_loss_clip": 0.0154231, + "auxiliary_loss_mlp": 0.00269525, + "balance_loss_clip": 1.25286222, + "balance_loss_mlp": 0.23731467, + "epoch": 0.24013227115586952, + "flos": 25809070757760.0, + "grad_norm": 23.043453896550425, + "language_loss": 0.82550782, + "learning_rate": 3.5544103791819218e-06, + "loss": 0.84362614, + "num_input_tokens_seen": 85961165, + "router_z_loss_clip": 2.89453125, + "router_z_loss_mlp": 0.32177734, + "step": 3994, + "time_per_iteration": 2.6905274391174316 + }, + { + "auxiliary_loss_clip": 0.01525478, + "auxiliary_loss_mlp": 0.00248531, + "balance_loss_clip": 1.23757696, + "balance_loss_mlp": 0.21684489, + "epoch": 0.2401923944085375, + "flos": 25557480921600.0, + "grad_norm": 8.859938911076036, + "language_loss": 0.85656154, + "learning_rate": 3.5541652820223124e-06, + "loss": 0.87430155, + "num_input_tokens_seen": 85982710, + "router_z_loss_clip": 2.88085938, + "router_z_loss_mlp": 0.31665039, + "step": 3995, + "time_per_iteration": 2.6377129554748535 + }, + { + "auxiliary_loss_clip": 0.01321272, + "auxiliary_loss_mlp": 0.000829, + "balance_loss_clip": 1.16401136, + "balance_loss_mlp": 0.07426915, + "epoch": 0.24025251766120548, + "flos": 54941138478720.0, + "grad_norm": 0.8797027980534742, + "language_loss": 0.63524348, + "learning_rate": 3.5539201259286006e-06, + "loss": 0.64928514, + "num_input_tokens_seen": 86046935, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.08642578, + "step": 3996, + "time_per_iteration": 3.1851038932800293 + }, + { + "auxiliary_loss_clip": 0.01521266, + "auxiliary_loss_mlp": 0.00235083, + "balance_loss_clip": 1.23336935, + "balance_loss_mlp": 0.20665173, + "epoch": 0.24031264091387344, + "flos": 20631075724800.0, + "grad_norm": 5.537774957588464, + "language_loss": 0.79404795, + "learning_rate": 3.5536749109100808e-06, + "loss": 0.81161147, + "num_input_tokens_seen": 86064355, + "router_z_loss_clip": 2.87695312, + "router_z_loss_mlp": 0.28430176, + "step": 3997, + "time_per_iteration": 4.008476495742798 + }, + { + "auxiliary_loss_clip": 0.0150619, + "auxiliary_loss_mlp": 0.00210122, + "balance_loss_clip": 1.22635341, + "balance_loss_mlp": 0.18017697, + "epoch": 0.2403727641665414, + "flos": 20886256920960.0, + "grad_norm": 3.233083621421874, + "language_loss": 0.9484967, + "learning_rate": 3.5534296369760535e-06, + "loss": 0.96565986, + "num_input_tokens_seen": 86081340, + "router_z_loss_clip": 2.8046875, + "router_z_loss_mlp": 0.29919434, + "step": 3998, + "time_per_iteration": 2.592376947402954 + }, + { + "auxiliary_loss_clip": 0.0149374, + "auxiliary_loss_mlp": 0.00265953, + "balance_loss_clip": 1.20979762, + "balance_loss_mlp": 0.23383847, + "epoch": 0.24043288741920937, + "flos": 22820046451200.0, + "grad_norm": 205.43370281249335, + "language_loss": 0.83863509, + "learning_rate": 3.5531843041358183e-06, + "loss": 0.85623205, + "num_input_tokens_seen": 86102260, + "router_z_loss_clip": 2.83789062, + "router_z_loss_mlp": 0.32080078, + "step": 3999, + "time_per_iteration": 2.6365256309509277 + }, + { + "auxiliary_loss_clip": 0.01495974, + "auxiliary_loss_mlp": 0.00240861, + "balance_loss_clip": 1.22068083, + "balance_loss_mlp": 0.21258414, + "epoch": 0.24049301067187734, + "flos": 27959652823680.0, + "grad_norm": 320.1400145577789, + "language_loss": 0.80222535, + "learning_rate": 3.552938912398679e-06, + "loss": 0.81959373, + "num_input_tokens_seen": 86123400, + "router_z_loss_clip": 2.75, + "router_z_loss_mlp": 0.28295898, + "step": 4000, + "time_per_iteration": 2.6764094829559326 + }, + { + "auxiliary_loss_clip": 0.01480468, + "auxiliary_loss_mlp": 0.00273726, + "balance_loss_clip": 1.20321667, + "balance_loss_mlp": 0.2427312, + "epoch": 0.24055313392454533, + "flos": 27451409333760.0, + "grad_norm": 31.309971904986266, + "language_loss": 0.7288897, + "learning_rate": 3.5526934617739397e-06, + "loss": 0.74643165, + "num_input_tokens_seen": 86144060, + "router_z_loss_clip": 2.77148438, + "router_z_loss_mlp": 0.30957031, + "step": 4001, + "time_per_iteration": 2.7127201557159424 + }, + { + "auxiliary_loss_clip": 0.01496248, + "auxiliary_loss_mlp": 0.00270949, + "balance_loss_clip": 1.21928477, + "balance_loss_mlp": 0.24175429, + "epoch": 0.2406132571772133, + "flos": 25556618995200.0, + "grad_norm": 3.340568788889956, + "language_loss": 0.91458923, + "learning_rate": 3.5524479522709095e-06, + "loss": 0.93226123, + "num_input_tokens_seen": 86163005, + "router_z_loss_clip": 2.76953125, + "router_z_loss_mlp": 0.29211426, + "step": 4002, + "time_per_iteration": 2.6655960083007812 + }, + { + "auxiliary_loss_clip": 0.01465055, + "auxiliary_loss_mlp": 0.00244879, + "balance_loss_clip": 1.1938051, + "balance_loss_mlp": 0.21734208, + "epoch": 0.24067338042988126, + "flos": 24791398629120.0, + "grad_norm": 81.37571598942792, + "language_loss": 0.90693623, + "learning_rate": 3.552202383898897e-06, + "loss": 0.92403561, + "num_input_tokens_seen": 86182580, + "router_z_loss_clip": 2.71289062, + "router_z_loss_mlp": 0.27563477, + "step": 4003, + "time_per_iteration": 2.6436946392059326 + }, + { + "auxiliary_loss_clip": 0.01501058, + "auxiliary_loss_mlp": 0.00287109, + "balance_loss_clip": 1.22187805, + "balance_loss_mlp": 0.25969076, + "epoch": 0.24073350368254923, + "flos": 21177923356800.0, + "grad_norm": 393.25979736817106, + "language_loss": 0.95673907, + "learning_rate": 3.551956756667215e-06, + "loss": 0.97462082, + "num_input_tokens_seen": 86200665, + "router_z_loss_clip": 2.79296875, + "router_z_loss_mlp": 0.27441406, + "step": 4004, + "time_per_iteration": 2.6052279472351074 + }, + { + "auxiliary_loss_clip": 0.01477048, + "auxiliary_loss_mlp": 0.00275946, + "balance_loss_clip": 1.20478272, + "balance_loss_mlp": 0.24659631, + "epoch": 0.2407936269352172, + "flos": 22494300986880.0, + "grad_norm": 201.7028671500432, + "language_loss": 0.8600027, + "learning_rate": 3.551711070585177e-06, + "loss": 0.8775326, + "num_input_tokens_seen": 86221640, + "router_z_loss_clip": 2.72265625, + "router_z_loss_mlp": 0.29333496, + "step": 4005, + "time_per_iteration": 2.693500518798828 + }, + { + "auxiliary_loss_clip": 0.01466334, + "auxiliary_loss_mlp": 0.00269573, + "balance_loss_clip": 1.19920468, + "balance_loss_mlp": 0.24304859, + "epoch": 0.24085375018788516, + "flos": 18551129754240.0, + "grad_norm": 27.381339812084644, + "language_loss": 0.85162103, + "learning_rate": 3.5514653256620995e-06, + "loss": 0.86898005, + "num_input_tokens_seen": 86240795, + "router_z_loss_clip": 2.67382812, + "router_z_loss_mlp": 0.26501465, + "step": 4006, + "time_per_iteration": 2.7737812995910645 + }, + { + "auxiliary_loss_clip": 0.01471403, + "auxiliary_loss_mlp": 0.00327318, + "balance_loss_clip": 1.196679, + "balance_loss_mlp": 0.29506022, + "epoch": 0.24091387344055312, + "flos": 24170539023360.0, + "grad_norm": 1.8127224714865762, + "language_loss": 0.7868005, + "learning_rate": 3.551219521907302e-06, + "loss": 0.80478764, + "num_input_tokens_seen": 86262000, + "router_z_loss_clip": 2.74414062, + "router_z_loss_mlp": 0.32226562, + "step": 4007, + "time_per_iteration": 2.7195253372192383 + }, + { + "auxiliary_loss_clip": 0.01479124, + "auxiliary_loss_mlp": 0.0031145, + "balance_loss_clip": 1.20970583, + "balance_loss_mlp": 0.28332835, + "epoch": 0.24097399669322112, + "flos": 11036319615360.0, + "grad_norm": 2.4931790095153823, + "language_loss": 0.83009696, + "learning_rate": 3.5509736593301042e-06, + "loss": 0.84800267, + "num_input_tokens_seen": 86279680, + "router_z_loss_clip": 2.69726562, + "router_z_loss_mlp": 0.28137207, + "step": 4008, + "time_per_iteration": 2.605273485183716 + }, + { + "auxiliary_loss_clip": 0.01473602, + "auxiliary_loss_mlp": 0.00321273, + "balance_loss_clip": 1.20357525, + "balance_loss_mlp": 0.2902433, + "epoch": 0.24103411994588908, + "flos": 17165085696000.0, + "grad_norm": 12.248530909853292, + "language_loss": 0.83389544, + "learning_rate": 3.5507277379398295e-06, + "loss": 0.85184419, + "num_input_tokens_seen": 86297180, + "router_z_loss_clip": 2.70117188, + "router_z_loss_mlp": 0.31030273, + "step": 4009, + "time_per_iteration": 2.6008353233337402 + }, + { + "auxiliary_loss_clip": 0.01465699, + "auxiliary_loss_mlp": 0.00348324, + "balance_loss_clip": 1.19841075, + "balance_loss_mlp": 0.31784236, + "epoch": 0.24109424319855705, + "flos": 20667956014080.0, + "grad_norm": 33.118222941660704, + "language_loss": 0.87025714, + "learning_rate": 3.550481757745804e-06, + "loss": 0.88839734, + "num_input_tokens_seen": 86317660, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.30493164, + "step": 4010, + "time_per_iteration": 2.626300096511841 + }, + { + "auxiliary_loss_clip": 0.01478667, + "auxiliary_loss_mlp": 0.00334837, + "balance_loss_clip": 1.20592046, + "balance_loss_mlp": 0.30157775, + "epoch": 0.241154366451225, + "flos": 28181796485760.0, + "grad_norm": 9.657541087390337, + "language_loss": 0.77107638, + "learning_rate": 3.5502357187573555e-06, + "loss": 0.78921139, + "num_input_tokens_seen": 86338325, + "router_z_loss_clip": 2.73046875, + "router_z_loss_mlp": 0.33251953, + "step": 4011, + "time_per_iteration": 2.7033228874206543 + }, + { + "auxiliary_loss_clip": 0.01481032, + "auxiliary_loss_mlp": 0.00362706, + "balance_loss_clip": 1.21088171, + "balance_loss_mlp": 0.33116302, + "epoch": 0.24121448970389298, + "flos": 21689722293120.0, + "grad_norm": 319.7125193687515, + "language_loss": 0.77801669, + "learning_rate": 3.5499896209838118e-06, + "loss": 0.79645413, + "num_input_tokens_seen": 86357615, + "router_z_loss_clip": 2.70117188, + "router_z_loss_mlp": 0.31494141, + "step": 4012, + "time_per_iteration": 2.7173452377319336 + }, + { + "auxiliary_loss_clip": 0.01503069, + "auxiliary_loss_mlp": 0.00366104, + "balance_loss_clip": 1.22667623, + "balance_loss_mlp": 0.33050781, + "epoch": 0.24127461295656094, + "flos": 39676191269760.0, + "grad_norm": 226.70463881660595, + "language_loss": 0.80997932, + "learning_rate": 3.5497434644345073e-06, + "loss": 0.82867098, + "num_input_tokens_seen": 86380355, + "router_z_loss_clip": 2.765625, + "router_z_loss_mlp": 0.35546875, + "step": 4013, + "time_per_iteration": 2.864473581314087 + }, + { + "auxiliary_loss_clip": 0.01525062, + "auxiliary_loss_mlp": 0.00354063, + "balance_loss_clip": 1.24722195, + "balance_loss_mlp": 0.32201958, + "epoch": 0.2413347362092289, + "flos": 19135863256320.0, + "grad_norm": 2.832949217862195, + "language_loss": 0.97509915, + "learning_rate": 3.5494972491187753e-06, + "loss": 0.9938904, + "num_input_tokens_seen": 86399125, + "router_z_loss_clip": 2.77734375, + "router_z_loss_mlp": 0.32055664, + "step": 4014, + "time_per_iteration": 2.65303111076355 + }, + { + "auxiliary_loss_clip": 0.01518794, + "auxiliary_loss_mlp": 0.00348939, + "balance_loss_clip": 1.23692012, + "balance_loss_mlp": 0.31732506, + "epoch": 0.2413948594618969, + "flos": 26939430829440.0, + "grad_norm": 5.776382086421996, + "language_loss": 1.04411352, + "learning_rate": 3.549250975045952e-06, + "loss": 1.06279087, + "num_input_tokens_seen": 86418625, + "router_z_loss_clip": 2.8203125, + "router_z_loss_mlp": 0.31616211, + "step": 4015, + "time_per_iteration": 2.676043748855591 + }, + { + "auxiliary_loss_clip": 0.01556943, + "auxiliary_loss_mlp": 0.00389868, + "balance_loss_clip": 1.27391219, + "balance_loss_mlp": 0.35603654, + "epoch": 0.24145498271456486, + "flos": 25228108183680.0, + "grad_norm": 2.6841854221133414, + "language_loss": 0.88839149, + "learning_rate": 3.5490046422253768e-06, + "loss": 0.90785956, + "num_input_tokens_seen": 86438375, + "router_z_loss_clip": 2.83007812, + "router_z_loss_mlp": 0.33813477, + "step": 4016, + "time_per_iteration": 2.6809616088867188 + }, + { + "auxiliary_loss_clip": 0.01555219, + "auxiliary_loss_mlp": 0.00333865, + "balance_loss_clip": 1.2772187, + "balance_loss_mlp": 0.30375263, + "epoch": 0.24151510596723283, + "flos": 40661759617920.0, + "grad_norm": 4.528299992829508, + "language_loss": 0.77027035, + "learning_rate": 3.54875825066639e-06, + "loss": 0.78916115, + "num_input_tokens_seen": 86463230, + "router_z_loss_clip": 2.78320312, + "router_z_loss_mlp": 0.30151367, + "step": 4017, + "time_per_iteration": 2.8239240646362305 + }, + { + "auxiliary_loss_clip": 0.01585729, + "auxiliary_loss_mlp": 0.00365415, + "balance_loss_clip": 1.28597271, + "balance_loss_mlp": 0.32972404, + "epoch": 0.2415752292199008, + "flos": 18146667634560.0, + "grad_norm": 25.79908216408453, + "language_loss": 0.91970849, + "learning_rate": 3.5485118003783353e-06, + "loss": 0.93921995, + "num_input_tokens_seen": 86481230, + "router_z_loss_clip": 2.99609375, + "router_z_loss_mlp": 0.35693359, + "step": 4018, + "time_per_iteration": 2.605440139770508 + }, + { + "auxiliary_loss_clip": 0.01474609, + "auxiliary_loss_mlp": 0.00086238, + "balance_loss_clip": 1.29541183, + "balance_loss_mlp": 0.07841775, + "epoch": 0.24163535247256876, + "flos": 67288409792640.0, + "grad_norm": 0.8580887066501497, + "language_loss": 0.60503095, + "learning_rate": 3.548265291370558e-06, + "loss": 0.62063938, + "num_input_tokens_seen": 86541260, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.078125, + "step": 4019, + "time_per_iteration": 3.173668622970581 + }, + { + "auxiliary_loss_clip": 0.0159065, + "auxiliary_loss_mlp": 0.00383293, + "balance_loss_clip": 1.29534733, + "balance_loss_mlp": 0.35077298, + "epoch": 0.24169547572523672, + "flos": 24929941386240.0, + "grad_norm": 5.092596183938838, + "language_loss": 0.79514277, + "learning_rate": 3.5480187236524055e-06, + "loss": 0.81488216, + "num_input_tokens_seen": 86559580, + "router_z_loss_clip": 2.953125, + "router_z_loss_mlp": 0.32519531, + "step": 4020, + "time_per_iteration": 2.6483712196350098 + }, + { + "auxiliary_loss_clip": 0.01615371, + "auxiliary_loss_mlp": 0.004128, + "balance_loss_clip": 1.31255746, + "balance_loss_mlp": 0.37665591, + "epoch": 0.24175559897790472, + "flos": 18728312567040.0, + "grad_norm": 3.2464722178492997, + "language_loss": 0.89343703, + "learning_rate": 3.5477720972332285e-06, + "loss": 0.91371876, + "num_input_tokens_seen": 86577560, + "router_z_loss_clip": 3.03125, + "router_z_loss_mlp": 0.36132812, + "step": 4021, + "time_per_iteration": 2.5894668102264404 + }, + { + "auxiliary_loss_clip": 0.01627271, + "auxiliary_loss_mlp": 0.00399852, + "balance_loss_clip": 1.31680799, + "balance_loss_mlp": 0.36218238, + "epoch": 0.24181572223057268, + "flos": 23039281111680.0, + "grad_norm": 7.549929140657058, + "language_loss": 0.82998955, + "learning_rate": 3.547525412122378e-06, + "loss": 0.85026085, + "num_input_tokens_seen": 86595350, + "router_z_loss_clip": 3.10546875, + "router_z_loss_mlp": 0.37646484, + "step": 4022, + "time_per_iteration": 2.5951194763183594 + }, + { + "auxiliary_loss_clip": 0.01636957, + "auxiliary_loss_mlp": 0.00439849, + "balance_loss_clip": 1.31789923, + "balance_loss_mlp": 0.39896074, + "epoch": 0.24187584548324065, + "flos": 20376145923840.0, + "grad_norm": 7.561232859376897, + "language_loss": 0.83350331, + "learning_rate": 3.5472786683292083e-06, + "loss": 0.85427135, + "num_input_tokens_seen": 86614805, + "router_z_loss_clip": 3.19140625, + "router_z_loss_mlp": 0.40869141, + "step": 4023, + "time_per_iteration": 2.655895233154297 + }, + { + "auxiliary_loss_clip": 0.01631751, + "auxiliary_loss_mlp": 0.00403891, + "balance_loss_clip": 1.31449032, + "balance_loss_mlp": 0.36564913, + "epoch": 0.2419359687359086, + "flos": 21397517153280.0, + "grad_norm": 2.1899396332086187, + "language_loss": 0.90049273, + "learning_rate": 3.5470318658630766e-06, + "loss": 0.9208492, + "num_input_tokens_seen": 86633700, + "router_z_loss_clip": 3.16992188, + "router_z_loss_mlp": 0.38256836, + "step": 4024, + "time_per_iteration": 2.6063201427459717 + }, + { + "auxiliary_loss_clip": 0.0163505, + "auxiliary_loss_mlp": 0.00394875, + "balance_loss_clip": 1.31734872, + "balance_loss_mlp": 0.3584919, + "epoch": 0.24199609198857658, + "flos": 18369385914240.0, + "grad_norm": 2.7135802243744065, + "language_loss": 0.90361714, + "learning_rate": 3.5467850047333424e-06, + "loss": 0.9239164, + "num_input_tokens_seen": 86650905, + "router_z_loss_clip": 3.171875, + "router_z_loss_mlp": 0.36401367, + "step": 4025, + "time_per_iteration": 2.6469478607177734 + }, + { + "auxiliary_loss_clip": 0.01634767, + "auxiliary_loss_mlp": 0.00392658, + "balance_loss_clip": 1.31463265, + "balance_loss_mlp": 0.35708624, + "epoch": 0.24205621524124454, + "flos": 19463871277440.0, + "grad_norm": 3.807417277597529, + "language_loss": 0.79531276, + "learning_rate": 3.546538084949365e-06, + "loss": 0.81558698, + "num_input_tokens_seen": 86669185, + "router_z_loss_clip": 3.20117188, + "router_z_loss_mlp": 0.35546875, + "step": 4026, + "time_per_iteration": 4.0419745445251465 + }, + { + "auxiliary_loss_clip": 0.01630671, + "auxiliary_loss_mlp": 0.0041835, + "balance_loss_clip": 1.32209706, + "balance_loss_mlp": 0.37948778, + "epoch": 0.2421163384939125, + "flos": 14976330451200.0, + "grad_norm": 14.369806801897061, + "language_loss": 0.71087444, + "learning_rate": 3.546291106520509e-06, + "loss": 0.73136467, + "num_input_tokens_seen": 86686805, + "router_z_loss_clip": 3.08398438, + "router_z_loss_mlp": 0.38867188, + "step": 4027, + "time_per_iteration": 2.6133980751037598 + }, + { + "auxiliary_loss_clip": 0.0164711, + "auxiliary_loss_mlp": 0.00410443, + "balance_loss_clip": 1.32540178, + "balance_loss_mlp": 0.37141353, + "epoch": 0.2421764617465805, + "flos": 18662057930880.0, + "grad_norm": 2.7674177674863922, + "language_loss": 0.78581321, + "learning_rate": 3.5460440694561388e-06, + "loss": 0.80638874, + "num_input_tokens_seen": 86705520, + "router_z_loss_clip": 3.21484375, + "router_z_loss_mlp": 0.39038086, + "step": 4028, + "time_per_iteration": 2.674367904663086 + }, + { + "auxiliary_loss_clip": 0.01544092, + "auxiliary_loss_mlp": 0.00074855, + "balance_loss_clip": 1.33819389, + "balance_loss_mlp": 0.06565236, + "epoch": 0.24223658499924847, + "flos": 64347327164160.0, + "grad_norm": 0.8549094930137469, + "language_loss": 0.55439085, + "learning_rate": 3.545796973765623e-06, + "loss": 0.57058036, + "num_input_tokens_seen": 86767320, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.09179688, + "step": 4029, + "time_per_iteration": 4.540846347808838 + }, + { + "auxiliary_loss_clip": 0.01617844, + "auxiliary_loss_mlp": 0.00408794, + "balance_loss_clip": 1.30667186, + "balance_loss_mlp": 0.3730073, + "epoch": 0.24229670825191643, + "flos": 25775243124480.0, + "grad_norm": 5.488057818348033, + "language_loss": 0.80174839, + "learning_rate": 3.54554981945833e-06, + "loss": 0.82201475, + "num_input_tokens_seen": 86788110, + "router_z_loss_clip": 3.109375, + "router_z_loss_mlp": 0.35766602, + "step": 4030, + "time_per_iteration": 2.6995301246643066 + }, + { + "auxiliary_loss_clip": 0.01638867, + "auxiliary_loss_mlp": 0.00444825, + "balance_loss_clip": 1.32291567, + "balance_loss_mlp": 0.40586767, + "epoch": 0.2423568315045844, + "flos": 20667094087680.0, + "grad_norm": 160.6522362737671, + "language_loss": 0.83747768, + "learning_rate": 3.5453026065436343e-06, + "loss": 0.85831469, + "num_input_tokens_seen": 86807640, + "router_z_loss_clip": 3.15820312, + "router_z_loss_mlp": 0.38964844, + "step": 4031, + "time_per_iteration": 2.687688112258911 + }, + { + "auxiliary_loss_clip": 0.0162627, + "auxiliary_loss_mlp": 0.00423719, + "balance_loss_clip": 1.30040359, + "balance_loss_mlp": 0.38330728, + "epoch": 0.24241695475725236, + "flos": 22416805393920.0, + "grad_norm": 3.578725892548548, + "language_loss": 0.75508857, + "learning_rate": 3.5450553350309083e-06, + "loss": 0.77558851, + "num_input_tokens_seen": 86826795, + "router_z_loss_clip": 3.25976562, + "router_z_loss_mlp": 0.40380859, + "step": 4032, + "time_per_iteration": 2.621288776397705 + }, + { + "auxiliary_loss_clip": 0.01649014, + "auxiliary_loss_mlp": 0.00428162, + "balance_loss_clip": 1.32856894, + "balance_loss_mlp": 0.39065832, + "epoch": 0.24247707800992033, + "flos": 17128995505920.0, + "grad_norm": 10.539217935540377, + "language_loss": 0.87637466, + "learning_rate": 3.5448080049295286e-06, + "loss": 0.8971464, + "num_input_tokens_seen": 86843175, + "router_z_loss_clip": 3.20898438, + "router_z_loss_mlp": 0.37475586, + "step": 4033, + "time_per_iteration": 2.6018104553222656 + }, + { + "auxiliary_loss_clip": 0.01642308, + "auxiliary_loss_mlp": 0.00393317, + "balance_loss_clip": 1.32119632, + "balance_loss_mlp": 0.35345381, + "epoch": 0.2425372012625883, + "flos": 31613743399680.0, + "grad_norm": 7.49211548424063, + "language_loss": 0.74672133, + "learning_rate": 3.5445606162488754e-06, + "loss": 0.76707762, + "num_input_tokens_seen": 86863185, + "router_z_loss_clip": 3.20898438, + "router_z_loss_mlp": 0.3984375, + "step": 4034, + "time_per_iteration": 4.149355173110962 + }, + { + "auxiliary_loss_clip": 0.01652773, + "auxiliary_loss_mlp": 0.0044026, + "balance_loss_clip": 1.32992697, + "balance_loss_mlp": 0.40189832, + "epoch": 0.24259732451525629, + "flos": 16326032924160.0, + "grad_norm": 2.8203308671644467, + "language_loss": 1.02710199, + "learning_rate": 3.5443131689983283e-06, + "loss": 1.04803228, + "num_input_tokens_seen": 86880040, + "router_z_loss_clip": 3.22460938, + "router_z_loss_mlp": 0.38354492, + "step": 4035, + "time_per_iteration": 2.6441774368286133 + }, + { + "auxiliary_loss_clip": 0.01651524, + "auxiliary_loss_mlp": 0.00431818, + "balance_loss_clip": 1.33247519, + "balance_loss_mlp": 0.39226428, + "epoch": 0.24265744776792425, + "flos": 22856639431680.0, + "grad_norm": 9.808424142605672, + "language_loss": 0.84412038, + "learning_rate": 3.5440656631872715e-06, + "loss": 0.86495376, + "num_input_tokens_seen": 86900610, + "router_z_loss_clip": 3.19140625, + "router_z_loss_mlp": 0.39575195, + "step": 4036, + "time_per_iteration": 2.6744649410247803 + }, + { + "auxiliary_loss_clip": 0.01654809, + "auxiliary_loss_mlp": 0.00396976, + "balance_loss_clip": 1.32816863, + "balance_loss_mlp": 0.35551465, + "epoch": 0.24271757102059222, + "flos": 21871573873920.0, + "grad_norm": 208.85647006991022, + "language_loss": 0.80662501, + "learning_rate": 3.5438180988250898e-06, + "loss": 0.82714283, + "num_input_tokens_seen": 86919385, + "router_z_loss_clip": 3.265625, + "router_z_loss_mlp": 0.41430664, + "step": 4037, + "time_per_iteration": 2.6264445781707764 + }, + { + "auxiliary_loss_clip": 0.01638349, + "auxiliary_loss_mlp": 0.00385149, + "balance_loss_clip": 1.31586456, + "balance_loss_mlp": 0.34645322, + "epoch": 0.24277769427326018, + "flos": 19208582340480.0, + "grad_norm": 294.2207455447814, + "language_loss": 0.86380303, + "learning_rate": 3.543570475921171e-06, + "loss": 0.88403797, + "num_input_tokens_seen": 86938885, + "router_z_loss_clip": 3.22070312, + "router_z_loss_mlp": 0.38623047, + "step": 4038, + "time_per_iteration": 2.645948886871338 + }, + { + "auxiliary_loss_clip": 0.01621172, + "auxiliary_loss_mlp": 0.00396858, + "balance_loss_clip": 1.29863667, + "balance_loss_mlp": 0.3559449, + "epoch": 0.24283781752592815, + "flos": 19499889640320.0, + "grad_norm": 17.81797288427343, + "language_loss": 0.78902888, + "learning_rate": 3.543322794484905e-06, + "loss": 0.80920917, + "num_input_tokens_seen": 86957705, + "router_z_loss_clip": 3.22265625, + "router_z_loss_mlp": 0.40869141, + "step": 4039, + "time_per_iteration": 4.016241788864136 + }, + { + "auxiliary_loss_clip": 0.0164204, + "auxiliary_loss_mlp": 0.00427717, + "balance_loss_clip": 1.31436002, + "balance_loss_mlp": 0.38503975, + "epoch": 0.2428979407785961, + "flos": 19902196944000.0, + "grad_norm": 16.239942266194465, + "language_loss": 0.84406078, + "learning_rate": 3.5430750545256843e-06, + "loss": 0.86475837, + "num_input_tokens_seen": 86975845, + "router_z_loss_clip": 3.27929688, + "router_z_loss_mlp": 0.42651367, + "step": 4040, + "time_per_iteration": 2.6640191078186035 + }, + { + "auxiliary_loss_clip": 0.01655402, + "auxiliary_loss_mlp": 0.00337098, + "balance_loss_clip": 1.33219814, + "balance_loss_mlp": 0.2996659, + "epoch": 0.2429580640312641, + "flos": 24715878284160.0, + "grad_norm": 3.025987311901787, + "language_loss": 0.87836266, + "learning_rate": 3.5428272560529027e-06, + "loss": 0.89828771, + "num_input_tokens_seen": 86994800, + "router_z_loss_clip": 3.23242188, + "router_z_loss_mlp": 0.37426758, + "step": 4041, + "time_per_iteration": 2.6808066368103027 + }, + { + "auxiliary_loss_clip": 0.01636036, + "auxiliary_loss_mlp": 0.00389701, + "balance_loss_clip": 1.31724751, + "balance_loss_mlp": 0.35048071, + "epoch": 0.24301818728393207, + "flos": 25630343660160.0, + "grad_norm": 2.903359448513012, + "language_loss": 0.81794715, + "learning_rate": 3.542579399075957e-06, + "loss": 0.8382045, + "num_input_tokens_seen": 87016845, + "router_z_loss_clip": 3.18945312, + "router_z_loss_mlp": 0.39208984, + "step": 4042, + "time_per_iteration": 2.665313720703125 + }, + { + "auxiliary_loss_clip": 0.01642039, + "auxiliary_loss_mlp": 0.0038113, + "balance_loss_clip": 1.32389355, + "balance_loss_mlp": 0.34179121, + "epoch": 0.24307831053660003, + "flos": 26141388410880.0, + "grad_norm": 3.8025967647269128, + "language_loss": 0.86710137, + "learning_rate": 3.542331483604246e-06, + "loss": 0.8873331, + "num_input_tokens_seen": 87036270, + "router_z_loss_clip": 3.18164062, + "router_z_loss_mlp": 0.39355469, + "step": 4043, + "time_per_iteration": 2.6969072818756104 + }, + { + "auxiliary_loss_clip": 0.01630551, + "auxiliary_loss_mlp": 0.00398971, + "balance_loss_clip": 1.30158758, + "balance_loss_mlp": 0.35975143, + "epoch": 0.243138433789268, + "flos": 14972415868800.0, + "grad_norm": 343.23369440830544, + "language_loss": 0.81866592, + "learning_rate": 3.5420835096471706e-06, + "loss": 0.83896112, + "num_input_tokens_seen": 87049920, + "router_z_loss_clip": 3.2890625, + "router_z_loss_mlp": 0.39257812, + "step": 4044, + "time_per_iteration": 2.6264166831970215 + }, + { + "auxiliary_loss_clip": 0.01626098, + "auxiliary_loss_mlp": 0.00358241, + "balance_loss_clip": 1.30834985, + "balance_loss_mlp": 0.31949806, + "epoch": 0.24319855704193596, + "flos": 25191694771200.0, + "grad_norm": 2.982591802394121, + "language_loss": 0.90360993, + "learning_rate": 3.5418354772141337e-06, + "loss": 0.92345333, + "num_input_tokens_seen": 87068230, + "router_z_loss_clip": 3.1796875, + "router_z_loss_mlp": 0.38745117, + "step": 4045, + "time_per_iteration": 2.681691884994507 + }, + { + "auxiliary_loss_clip": 0.01613263, + "auxiliary_loss_mlp": 0.00350835, + "balance_loss_clip": 1.29409838, + "balance_loss_mlp": 0.3125689, + "epoch": 0.24325868029460393, + "flos": 22127221946880.0, + "grad_norm": 251.1124119243344, + "language_loss": 0.93659747, + "learning_rate": 3.541587386314541e-06, + "loss": 0.95623851, + "num_input_tokens_seen": 87086435, + "router_z_loss_clip": 3.18945312, + "router_z_loss_mlp": 0.38256836, + "step": 4046, + "time_per_iteration": 2.6540229320526123 + }, + { + "auxiliary_loss_clip": 0.01626368, + "auxiliary_loss_mlp": 0.00389399, + "balance_loss_clip": 1.30363488, + "balance_loss_mlp": 0.34812844, + "epoch": 0.2433188035472719, + "flos": 23582106420480.0, + "grad_norm": 14.346983871100031, + "language_loss": 0.79286879, + "learning_rate": 3.5413392369578e-06, + "loss": 0.81302649, + "num_input_tokens_seen": 87105340, + "router_z_loss_clip": 3.2265625, + "router_z_loss_mlp": 0.41259766, + "step": 4047, + "time_per_iteration": 2.664977550506592 + }, + { + "auxiliary_loss_clip": 0.01619084, + "auxiliary_loss_mlp": 0.00364173, + "balance_loss_clip": 1.2967273, + "balance_loss_mlp": 0.32445198, + "epoch": 0.2433789267999399, + "flos": 24462815990400.0, + "grad_norm": 2.9132014661097294, + "language_loss": 0.80298364, + "learning_rate": 3.5410910291533213e-06, + "loss": 0.82281625, + "num_input_tokens_seen": 87125780, + "router_z_loss_clip": 3.22265625, + "router_z_loss_mlp": 0.39770508, + "step": 4048, + "time_per_iteration": 2.6766440868377686 + }, + { + "auxiliary_loss_clip": 0.01632401, + "auxiliary_loss_mlp": 0.00353289, + "balance_loss_clip": 1.30529225, + "balance_loss_mlp": 0.31487948, + "epoch": 0.24343905005260785, + "flos": 16727909264640.0, + "grad_norm": 24.376232773704785, + "language_loss": 0.80592018, + "learning_rate": 3.5408427629105155e-06, + "loss": 0.82577711, + "num_input_tokens_seen": 87144470, + "router_z_loss_clip": 3.26953125, + "router_z_loss_mlp": 0.3840332, + "step": 4049, + "time_per_iteration": 2.648660182952881 + }, + { + "auxiliary_loss_clip": 0.01620866, + "auxiliary_loss_mlp": 0.00341922, + "balance_loss_clip": 1.30119562, + "balance_loss_mlp": 0.30298811, + "epoch": 0.24349917330527582, + "flos": 20043756443520.0, + "grad_norm": 219.07682909842322, + "language_loss": 0.80543625, + "learning_rate": 3.5405944382387985e-06, + "loss": 0.82506406, + "num_input_tokens_seen": 87162830, + "router_z_loss_clip": 3.20117188, + "router_z_loss_mlp": 0.38916016, + "step": 4050, + "time_per_iteration": 2.641662359237671 + }, + { + "auxiliary_loss_clip": 0.01614013, + "auxiliary_loss_mlp": 0.00310429, + "balance_loss_clip": 1.29336119, + "balance_loss_mlp": 0.27218711, + "epoch": 0.24355929655794378, + "flos": 17420554200960.0, + "grad_norm": 7.7001967244823, + "language_loss": 0.8316865, + "learning_rate": 3.5403460551475854e-06, + "loss": 0.85093093, + "num_input_tokens_seen": 87180905, + "router_z_loss_clip": 3.20703125, + "router_z_loss_mlp": 0.3828125, + "step": 4051, + "time_per_iteration": 2.681220293045044 + }, + { + "auxiliary_loss_clip": 0.01632796, + "auxiliary_loss_mlp": 0.00393848, + "balance_loss_clip": 1.30073822, + "balance_loss_mlp": 0.35393625, + "epoch": 0.24361941981061175, + "flos": 25410929431680.0, + "grad_norm": 38.816229818503025, + "language_loss": 0.79124653, + "learning_rate": 3.540097613646296e-06, + "loss": 0.81151295, + "num_input_tokens_seen": 87202290, + "router_z_loss_clip": 3.31835938, + "router_z_loss_mlp": 0.39892578, + "step": 4052, + "time_per_iteration": 2.821835994720459 + }, + { + "auxiliary_loss_clip": 0.01631439, + "auxiliary_loss_mlp": 0.00354311, + "balance_loss_clip": 1.31202209, + "balance_loss_mlp": 0.31656969, + "epoch": 0.2436795430632797, + "flos": 22820800636800.0, + "grad_norm": 6.675292255985764, + "language_loss": 0.8632583, + "learning_rate": 3.539849113744351e-06, + "loss": 0.88311577, + "num_input_tokens_seen": 87221650, + "router_z_loss_clip": 3.19335938, + "router_z_loss_mlp": 0.37768555, + "step": 4053, + "time_per_iteration": 2.7326395511627197 + }, + { + "auxiliary_loss_clip": 0.0161879, + "auxiliary_loss_mlp": 0.00361161, + "balance_loss_clip": 1.29348612, + "balance_loss_mlp": 0.32215583, + "epoch": 0.2437396663159477, + "flos": 15157786982400.0, + "grad_norm": 6.29863228357168, + "language_loss": 0.83742344, + "learning_rate": 3.539600555451172e-06, + "loss": 0.85722303, + "num_input_tokens_seen": 87238515, + "router_z_loss_clip": 3.25195312, + "router_z_loss_mlp": 0.38989258, + "step": 4054, + "time_per_iteration": 2.618770122528076 + }, + { + "auxiliary_loss_clip": 0.01651357, + "auxiliary_loss_mlp": 0.00387666, + "balance_loss_clip": 1.31672871, + "balance_loss_mlp": 0.34584761, + "epoch": 0.24379978956861567, + "flos": 22091131756800.0, + "grad_norm": 7.97909212853329, + "language_loss": 0.89997321, + "learning_rate": 3.5393519387761866e-06, + "loss": 0.92036343, + "num_input_tokens_seen": 87256290, + "router_z_loss_clip": 3.34570312, + "router_z_loss_mlp": 0.41821289, + "step": 4055, + "time_per_iteration": 2.6917226314544678 + }, + { + "auxiliary_loss_clip": 0.0162168, + "auxiliary_loss_mlp": 0.00351616, + "balance_loss_clip": 1.28798747, + "balance_loss_mlp": 0.30910605, + "epoch": 0.24385991282128364, + "flos": 31467766527360.0, + "grad_norm": 32.5416955023098, + "language_loss": 0.64655161, + "learning_rate": 3.5391032637288217e-06, + "loss": 0.66628462, + "num_input_tokens_seen": 87277085, + "router_z_loss_clip": 3.3359375, + "router_z_loss_mlp": 0.42504883, + "step": 4056, + "time_per_iteration": 2.770857095718384 + }, + { + "auxiliary_loss_clip": 0.01677059, + "auxiliary_loss_mlp": 0.0035839, + "balance_loss_clip": 1.34126019, + "balance_loss_mlp": 0.31447369, + "epoch": 0.2439200360739516, + "flos": 23838795987840.0, + "grad_norm": 11.68339687262265, + "language_loss": 0.88752818, + "learning_rate": 3.538854530318506e-06, + "loss": 0.90788269, + "num_input_tokens_seen": 87293020, + "router_z_loss_clip": 3.35351562, + "router_z_loss_mlp": 0.43969727, + "step": 4057, + "time_per_iteration": 2.6745340824127197 + }, + { + "auxiliary_loss_clip": 0.01672642, + "auxiliary_loss_mlp": 0.00366565, + "balance_loss_clip": 1.33638358, + "balance_loss_mlp": 0.32596266, + "epoch": 0.24398015932661957, + "flos": 19169978198400.0, + "grad_norm": 9.027204546362029, + "language_loss": 0.85394537, + "learning_rate": 3.538605738554673e-06, + "loss": 0.87433743, + "num_input_tokens_seen": 87311445, + "router_z_loss_clip": 3.36523438, + "router_z_loss_mlp": 0.40576172, + "step": 4058, + "time_per_iteration": 2.658832311630249 + }, + { + "auxiliary_loss_clip": 0.01678353, + "auxiliary_loss_mlp": 0.00353384, + "balance_loss_clip": 1.3300699, + "balance_loss_mlp": 0.31094587, + "epoch": 0.24404028257928753, + "flos": 25262474520960.0, + "grad_norm": 17.421361725824354, + "language_loss": 0.91296279, + "learning_rate": 3.538356888446756e-06, + "loss": 0.93328023, + "num_input_tokens_seen": 87332055, + "router_z_loss_clip": 3.48242188, + "router_z_loss_mlp": 0.42431641, + "step": 4059, + "time_per_iteration": 2.749457597732544 + }, + { + "auxiliary_loss_clip": 0.016906, + "auxiliary_loss_mlp": 0.00362655, + "balance_loss_clip": 1.34981775, + "balance_loss_mlp": 0.32298195, + "epoch": 0.2441004058319555, + "flos": 26467600752000.0, + "grad_norm": 9449.228278831311, + "language_loss": 0.78562701, + "learning_rate": 3.5381079800041913e-06, + "loss": 0.80615956, + "num_input_tokens_seen": 87351295, + "router_z_loss_clip": 3.41015625, + "router_z_loss_mlp": 0.39672852, + "step": 4060, + "time_per_iteration": 2.807614326477051 + }, + { + "auxiliary_loss_clip": 0.01702156, + "auxiliary_loss_mlp": 0.00414233, + "balance_loss_clip": 1.35087037, + "balance_loss_mlp": 0.36974409, + "epoch": 0.2441605290846235, + "flos": 26760524163840.0, + "grad_norm": 3300.2049392401477, + "language_loss": 0.82141334, + "learning_rate": 3.5378590132364182e-06, + "loss": 0.84257722, + "num_input_tokens_seen": 87370650, + "router_z_loss_clip": 3.515625, + "router_z_loss_mlp": 0.4453125, + "step": 4061, + "time_per_iteration": 2.72826886177063 + }, + { + "auxiliary_loss_clip": 0.01720814, + "auxiliary_loss_mlp": 0.00360998, + "balance_loss_clip": 1.36935711, + "balance_loss_mlp": 0.31557935, + "epoch": 0.24422065233729146, + "flos": 21105850717440.0, + "grad_norm": 5.184566758326707, + "language_loss": 0.82850248, + "learning_rate": 3.5376099881528768e-06, + "loss": 0.84932065, + "num_input_tokens_seen": 87389020, + "router_z_loss_clip": 3.515625, + "router_z_loss_mlp": 0.45385742, + "step": 4062, + "time_per_iteration": 2.6990461349487305 + }, + { + "auxiliary_loss_clip": 0.01699404, + "auxiliary_loss_mlp": 0.00341968, + "balance_loss_clip": 1.35635757, + "balance_loss_mlp": 0.30146098, + "epoch": 0.24428077558995942, + "flos": 25263156879360.0, + "grad_norm": 2.716712954470212, + "language_loss": 0.8957181, + "learning_rate": 3.537360904763011e-06, + "loss": 0.91613179, + "num_input_tokens_seen": 87409695, + "router_z_loss_clip": 3.43164062, + "router_z_loss_mlp": 0.4050293, + "step": 4063, + "time_per_iteration": 2.6793694496154785 + }, + { + "auxiliary_loss_clip": 0.0171584, + "auxiliary_loss_mlp": 0.00362117, + "balance_loss_clip": 1.35005546, + "balance_loss_mlp": 0.31982154, + "epoch": 0.24434089884262739, + "flos": 20485278420480.0, + "grad_norm": 9.667026749703734, + "language_loss": 0.79313439, + "learning_rate": 3.5371117630762656e-06, + "loss": 0.81391394, + "num_input_tokens_seen": 87428250, + "router_z_loss_clip": 3.65820312, + "router_z_loss_mlp": 0.42285156, + "step": 4064, + "time_per_iteration": 2.651258707046509 + }, + { + "auxiliary_loss_clip": 0.01704108, + "auxiliary_loss_mlp": 0.00360596, + "balance_loss_clip": 1.35152292, + "balance_loss_mlp": 0.31675038, + "epoch": 0.24440102209529535, + "flos": 23621895711360.0, + "grad_norm": 3.2805419588388856, + "language_loss": 0.75577211, + "learning_rate": 3.536862563102088e-06, + "loss": 0.7764191, + "num_input_tokens_seen": 87449380, + "router_z_loss_clip": 3.52539062, + "router_z_loss_mlp": 0.43847656, + "step": 4065, + "time_per_iteration": 2.7352805137634277 + }, + { + "auxiliary_loss_clip": 0.01679526, + "auxiliary_loss_mlp": 0.00367896, + "balance_loss_clip": 1.32755089, + "balance_loss_mlp": 0.32011673, + "epoch": 0.24446114534796332, + "flos": 20554729367040.0, + "grad_norm": 37.87336671302953, + "language_loss": 0.91744608, + "learning_rate": 3.5366133048499282e-06, + "loss": 0.93792033, + "num_input_tokens_seen": 87465365, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 0.47827148, + "step": 4066, + "time_per_iteration": 2.6320135593414307 + }, + { + "auxiliary_loss_clip": 0.01593239, + "auxiliary_loss_mlp": 0.00165235, + "balance_loss_clip": 1.39066124, + "balance_loss_mlp": 0.15093008, + "epoch": 0.24452126860063128, + "flos": 60389575009920.0, + "grad_norm": 0.7350235095971702, + "language_loss": 0.52088296, + "learning_rate": 3.5363639883292374e-06, + "loss": 0.53846776, + "num_input_tokens_seen": 87522525, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.14257812, + "step": 4067, + "time_per_iteration": 3.0438199043273926 + }, + { + "auxiliary_loss_clip": 0.01700334, + "auxiliary_loss_mlp": 0.00335543, + "balance_loss_clip": 1.3478663, + "balance_loss_mlp": 0.29451129, + "epoch": 0.24458139185329927, + "flos": 15121660878720.0, + "grad_norm": 33.26523606196786, + "language_loss": 0.81912065, + "learning_rate": 3.5361146135494706e-06, + "loss": 0.83947939, + "num_input_tokens_seen": 87539170, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 0.41064453, + "step": 4068, + "time_per_iteration": 2.620424509048462 + }, + { + "auxiliary_loss_clip": 0.01703145, + "auxiliary_loss_mlp": 0.00307511, + "balance_loss_clip": 1.35665369, + "balance_loss_mlp": 0.2666218, + "epoch": 0.24464151510596724, + "flos": 27998723842560.0, + "grad_norm": 5.332173264784658, + "language_loss": 0.84044564, + "learning_rate": 3.5358651805200835e-06, + "loss": 0.86055225, + "num_input_tokens_seen": 87558875, + "router_z_loss_clip": 3.46875, + "router_z_loss_mlp": 0.40917969, + "step": 4069, + "time_per_iteration": 4.095356702804565 + }, + { + "auxiliary_loss_clip": 0.01681753, + "auxiliary_loss_mlp": 0.00331061, + "balance_loss_clip": 1.33749247, + "balance_loss_mlp": 0.28869432, + "epoch": 0.2447016383586352, + "flos": 19792884879360.0, + "grad_norm": 2.2545112977960238, + "language_loss": 0.86880082, + "learning_rate": 3.5356156892505347e-06, + "loss": 0.88892901, + "num_input_tokens_seen": 87576485, + "router_z_loss_clip": 3.44335938, + "router_z_loss_mlp": 0.42382812, + "step": 4070, + "time_per_iteration": 2.6493489742279053 + }, + { + "auxiliary_loss_clip": 0.01686764, + "auxiliary_loss_mlp": 0.0031284, + "balance_loss_clip": 1.33608198, + "balance_loss_mlp": 0.27280974, + "epoch": 0.24476176161130317, + "flos": 26067340523520.0, + "grad_norm": 3.1998643619847718, + "language_loss": 0.89118576, + "learning_rate": 3.5353661397502854e-06, + "loss": 0.91118181, + "num_input_tokens_seen": 87598620, + "router_z_loss_clip": 3.5078125, + "router_z_loss_mlp": 0.40063477, + "step": 4071, + "time_per_iteration": 2.695143938064575 + }, + { + "auxiliary_loss_clip": 0.01643533, + "auxiliary_loss_mlp": 0.00338322, + "balance_loss_clip": 1.29758096, + "balance_loss_mlp": 0.29767177, + "epoch": 0.24482188486397113, + "flos": 18843550375680.0, + "grad_norm": 7.082563541770859, + "language_loss": 0.86913276, + "learning_rate": 3.535116532028798e-06, + "loss": 0.8889513, + "num_input_tokens_seen": 87616595, + "router_z_loss_clip": 3.45703125, + "router_z_loss_mlp": 0.40625, + "step": 4072, + "time_per_iteration": 4.085922002792358 + }, + { + "auxiliary_loss_clip": 0.01681664, + "auxiliary_loss_mlp": 0.00310393, + "balance_loss_clip": 1.33177543, + "balance_loss_mlp": 0.2702198, + "epoch": 0.2448820081166391, + "flos": 21251791676160.0, + "grad_norm": 6.579187034653531, + "language_loss": 0.76599199, + "learning_rate": 3.5348668660955382e-06, + "loss": 0.78591251, + "num_input_tokens_seen": 87635755, + "router_z_loss_clip": 3.49804688, + "router_z_loss_mlp": 0.40185547, + "step": 4073, + "time_per_iteration": 2.700016498565674 + }, + { + "auxiliary_loss_clip": 0.0168248, + "auxiliary_loss_mlp": 0.00312356, + "balance_loss_clip": 1.33674979, + "balance_loss_mlp": 0.27280283, + "epoch": 0.2449421313693071, + "flos": 23950586090880.0, + "grad_norm": 4.99629472936577, + "language_loss": 0.76435769, + "learning_rate": 3.5346171419599728e-06, + "loss": 0.78430605, + "num_input_tokens_seen": 87652885, + "router_z_loss_clip": 3.45703125, + "router_z_loss_mlp": 0.39575195, + "step": 4074, + "time_per_iteration": 2.6851930618286133 + }, + { + "auxiliary_loss_clip": 0.01493718, + "auxiliary_loss_mlp": 0.00182872, + "balance_loss_clip": 1.31435633, + "balance_loss_mlp": 0.16894864, + "epoch": 0.24500225462197506, + "flos": 60687669980160.0, + "grad_norm": 0.9072845897780876, + "language_loss": 0.68515682, + "learning_rate": 3.5343673596315718e-06, + "loss": 0.70192277, + "num_input_tokens_seen": 87713220, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.13964844, + "step": 4075, + "time_per_iteration": 3.23795485496521 + }, + { + "auxiliary_loss_clip": 0.01640555, + "auxiliary_loss_mlp": 0.00360257, + "balance_loss_clip": 1.3034656, + "balance_loss_mlp": 0.32034534, + "epoch": 0.24506237787464302, + "flos": 26284204886400.0, + "grad_norm": 58.985263983326654, + "language_loss": 0.86912954, + "learning_rate": 3.5341175191198063e-06, + "loss": 0.88913763, + "num_input_tokens_seen": 87732680, + "router_z_loss_clip": 3.37109375, + "router_z_loss_mlp": 0.39916992, + "step": 4076, + "time_per_iteration": 4.147852897644043 + }, + { + "auxiliary_loss_clip": 0.01654063, + "auxiliary_loss_mlp": 0.00354476, + "balance_loss_clip": 1.30275679, + "balance_loss_mlp": 0.31125027, + "epoch": 0.245122501127311, + "flos": 20552287242240.0, + "grad_norm": 21.501932079705817, + "language_loss": 0.88386375, + "learning_rate": 3.533867620434151e-06, + "loss": 0.90394914, + "num_input_tokens_seen": 87751880, + "router_z_loss_clip": 3.515625, + "router_z_loss_mlp": 0.43212891, + "step": 4077, + "time_per_iteration": 2.697634696960449 + }, + { + "auxiliary_loss_clip": 0.01663002, + "auxiliary_loss_mlp": 0.00329453, + "balance_loss_clip": 1.31272531, + "balance_loss_mlp": 0.28610888, + "epoch": 0.24518262437997895, + "flos": 29132603447040.0, + "grad_norm": 29.648568041383562, + "language_loss": 0.69991684, + "learning_rate": 3.533617663584082e-06, + "loss": 0.71984136, + "num_input_tokens_seen": 87771795, + "router_z_loss_clip": 3.50585938, + "router_z_loss_mlp": 0.43334961, + "step": 4078, + "time_per_iteration": 2.7280759811401367 + }, + { + "auxiliary_loss_clip": 0.01626605, + "auxiliary_loss_mlp": 0.00317804, + "balance_loss_clip": 1.28407025, + "balance_loss_mlp": 0.28008634, + "epoch": 0.24524274763264692, + "flos": 23476924419840.0, + "grad_norm": 3.4996030849645807, + "language_loss": 0.80390996, + "learning_rate": 3.5333676485790765e-06, + "loss": 0.82335407, + "num_input_tokens_seen": 87793640, + "router_z_loss_clip": 3.42773438, + "router_z_loss_mlp": 0.37695312, + "step": 4079, + "time_per_iteration": 2.727792739868164 + }, + { + "auxiliary_loss_clip": 0.01643981, + "auxiliary_loss_mlp": 0.0034326, + "balance_loss_clip": 1.30307317, + "balance_loss_mlp": 0.30544716, + "epoch": 0.24530287088531488, + "flos": 17201175886080.0, + "grad_norm": 8.665821652051525, + "language_loss": 0.83709198, + "learning_rate": 3.5331175754286173e-06, + "loss": 0.85696435, + "num_input_tokens_seen": 87812390, + "router_z_loss_clip": 3.40820312, + "router_z_loss_mlp": 0.37841797, + "step": 4080, + "time_per_iteration": 2.627474308013916 + }, + { + "auxiliary_loss_clip": 0.01600778, + "auxiliary_loss_mlp": 0.00334341, + "balance_loss_clip": 1.26621485, + "balance_loss_mlp": 0.29700503, + "epoch": 0.24536299413798288, + "flos": 14867449349760.0, + "grad_norm": 3.5238721234300625, + "language_loss": 0.8967607, + "learning_rate": 3.532867444142186e-06, + "loss": 0.91611195, + "num_input_tokens_seen": 87830640, + "router_z_loss_clip": 3.34570312, + "router_z_loss_mlp": 0.37353516, + "step": 4081, + "time_per_iteration": 2.6587443351745605 + }, + { + "auxiliary_loss_clip": 0.01621193, + "auxiliary_loss_mlp": 0.00320958, + "balance_loss_clip": 1.28541923, + "balance_loss_mlp": 0.28312111, + "epoch": 0.24542311739065084, + "flos": 35262051886080.0, + "grad_norm": 6.149470609982144, + "language_loss": 0.80525708, + "learning_rate": 3.532617254729267e-06, + "loss": 0.82467854, + "num_input_tokens_seen": 87850450, + "router_z_loss_clip": 3.359375, + "router_z_loss_mlp": 0.37841797, + "step": 4082, + "time_per_iteration": 4.157889366149902 + }, + { + "auxiliary_loss_clip": 0.01650843, + "auxiliary_loss_mlp": 0.00334914, + "balance_loss_clip": 1.3079443, + "balance_loss_mlp": 0.2950266, + "epoch": 0.2454832406433188, + "flos": 21503130117120.0, + "grad_norm": 6.327889263559792, + "language_loss": 0.77003419, + "learning_rate": 3.5323670071993485e-06, + "loss": 0.78989172, + "num_input_tokens_seen": 87868810, + "router_z_loss_clip": 3.43164062, + "router_z_loss_mlp": 0.39892578, + "step": 4083, + "time_per_iteration": 2.6434004306793213 + }, + { + "auxiliary_loss_clip": 0.01642518, + "auxiliary_loss_mlp": 0.00347934, + "balance_loss_clip": 1.29663515, + "balance_loss_mlp": 0.30702111, + "epoch": 0.24554336389598677, + "flos": 14756664827520.0, + "grad_norm": 3.9438764169614116, + "language_loss": 0.82520747, + "learning_rate": 3.532116701561919e-06, + "loss": 0.84511197, + "num_input_tokens_seen": 87885685, + "router_z_loss_clip": 3.45898438, + "router_z_loss_mlp": 0.40917969, + "step": 4084, + "time_per_iteration": 2.619974136352539 + }, + { + "auxiliary_loss_clip": 0.01608016, + "auxiliary_loss_mlp": 0.00317599, + "balance_loss_clip": 1.2708565, + "balance_loss_mlp": 0.27742499, + "epoch": 0.24560348714865474, + "flos": 14976402278400.0, + "grad_norm": 8.364700557550242, + "language_loss": 0.91903043, + "learning_rate": 3.531866337826471e-06, + "loss": 0.93828654, + "num_input_tokens_seen": 87903715, + "router_z_loss_clip": 3.37304688, + "router_z_loss_mlp": 0.40185547, + "step": 4085, + "time_per_iteration": 2.6119370460510254 + }, + { + "auxiliary_loss_clip": 0.0162692, + "auxiliary_loss_mlp": 0.00346695, + "balance_loss_clip": 1.28602362, + "balance_loss_mlp": 0.30876249, + "epoch": 0.2456636104013227, + "flos": 22675326554880.0, + "grad_norm": 6.0274868134171244, + "language_loss": 0.86455661, + "learning_rate": 3.5316159160024982e-06, + "loss": 0.88429272, + "num_input_tokens_seen": 87923375, + "router_z_loss_clip": 3.41015625, + "router_z_loss_mlp": 0.37939453, + "step": 4086, + "time_per_iteration": 2.69118070602417 + }, + { + "auxiliary_loss_clip": 0.01612097, + "auxiliary_loss_mlp": 0.00323966, + "balance_loss_clip": 1.27446938, + "balance_loss_mlp": 0.28796464, + "epoch": 0.2457237336539907, + "flos": 27417869009280.0, + "grad_norm": 1.6881962128297683, + "language_loss": 0.80459458, + "learning_rate": 3.531365436099496e-06, + "loss": 0.82395518, + "num_input_tokens_seen": 87943115, + "router_z_loss_clip": 3.37695312, + "router_z_loss_mlp": 0.36010742, + "step": 4087, + "time_per_iteration": 2.761948823928833 + }, + { + "auxiliary_loss_clip": 0.01659255, + "auxiliary_loss_mlp": 0.00375422, + "balance_loss_clip": 1.30153334, + "balance_loss_mlp": 0.3354634, + "epoch": 0.24578385690665866, + "flos": 20412379768320.0, + "grad_norm": 5.260589205122971, + "language_loss": 0.87534219, + "learning_rate": 3.5311148981269635e-06, + "loss": 0.89568895, + "num_input_tokens_seen": 87959505, + "router_z_loss_clip": 3.57617188, + "router_z_loss_mlp": 0.3996582, + "step": 4088, + "time_per_iteration": 2.6609017848968506 + }, + { + "auxiliary_loss_clip": 0.0162987, + "auxiliary_loss_mlp": 0.00340543, + "balance_loss_clip": 1.28189683, + "balance_loss_mlp": 0.30244383, + "epoch": 0.24584398015932662, + "flos": 23915393740800.0, + "grad_norm": 7.44917209199784, + "language_loss": 0.82691753, + "learning_rate": 3.5308643020944e-06, + "loss": 0.84662163, + "num_input_tokens_seen": 87979725, + "router_z_loss_clip": 3.48046875, + "router_z_loss_mlp": 0.38110352, + "step": 4089, + "time_per_iteration": 2.725630760192871 + }, + { + "auxiliary_loss_clip": 0.01629656, + "auxiliary_loss_mlp": 0.00342092, + "balance_loss_clip": 1.27485704, + "balance_loss_mlp": 0.3040646, + "epoch": 0.2459041034119946, + "flos": 41496359103360.0, + "grad_norm": 32.784828436748356, + "language_loss": 0.86701292, + "learning_rate": 3.530613648011309e-06, + "loss": 0.88673037, + "num_input_tokens_seen": 87998270, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 0.38012695, + "step": 4090, + "time_per_iteration": 2.8051397800445557 + }, + { + "auxiliary_loss_clip": 0.01655695, + "auxiliary_loss_mlp": 0.00341459, + "balance_loss_clip": 1.29525399, + "balance_loss_mlp": 0.30030793, + "epoch": 0.24596422666466256, + "flos": 19936814676480.0, + "grad_norm": 10.72643422794924, + "language_loss": 0.81976682, + "learning_rate": 3.5303629358871946e-06, + "loss": 0.83973837, + "num_input_tokens_seen": 88016760, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 0.41162109, + "step": 4091, + "time_per_iteration": 2.6356942653656006 + }, + { + "auxiliary_loss_clip": 0.01658048, + "auxiliary_loss_mlp": 0.00339047, + "balance_loss_clip": 1.2961092, + "balance_loss_mlp": 0.30054224, + "epoch": 0.24602434991733052, + "flos": 21544391865600.0, + "grad_norm": 136.983693700579, + "language_loss": 0.8320356, + "learning_rate": 3.5301121657315653e-06, + "loss": 0.85200661, + "num_input_tokens_seen": 88036465, + "router_z_loss_clip": 3.62109375, + "router_z_loss_mlp": 0.38549805, + "step": 4092, + "time_per_iteration": 2.657351493835449 + }, + { + "auxiliary_loss_clip": 0.01660216, + "auxiliary_loss_mlp": 0.00373482, + "balance_loss_clip": 1.28746927, + "balance_loss_mlp": 0.33407182, + "epoch": 0.24608447316999849, + "flos": 23185078416000.0, + "grad_norm": 51.45237817894789, + "language_loss": 0.89129144, + "learning_rate": 3.5298613375539287e-06, + "loss": 0.91162848, + "num_input_tokens_seen": 88053270, + "router_z_loss_clip": 3.73046875, + "router_z_loss_mlp": 0.39428711, + "step": 4093, + "time_per_iteration": 2.674542188644409 + }, + { + "auxiliary_loss_clip": 0.01668889, + "auxiliary_loss_mlp": 0.00370208, + "balance_loss_clip": 1.29081655, + "balance_loss_mlp": 0.32414562, + "epoch": 0.24614459642266648, + "flos": 19641951930240.0, + "grad_norm": 7.534603615635604, + "language_loss": 0.93290472, + "learning_rate": 3.529610451363797e-06, + "loss": 0.95329571, + "num_input_tokens_seen": 88072305, + "router_z_loss_clip": 3.78515625, + "router_z_loss_mlp": 0.4609375, + "step": 4094, + "time_per_iteration": 2.6294965744018555 + }, + { + "auxiliary_loss_clip": 0.01501504, + "auxiliary_loss_mlp": 0.00175475, + "balance_loss_clip": 1.30555201, + "balance_loss_mlp": 0.16164683, + "epoch": 0.24620471967533444, + "flos": 61739816186880.0, + "grad_norm": 1.3061354965899592, + "language_loss": 0.56896842, + "learning_rate": 3.5293595071706833e-06, + "loss": 0.58573824, + "num_input_tokens_seen": 88137995, + "router_z_loss_clip": 1.9609375, + "router_z_loss_mlp": 0.13867188, + "step": 4095, + "time_per_iteration": 3.221538543701172 + }, + { + "auxiliary_loss_clip": 0.01526037, + "auxiliary_loss_mlp": 0.00238153, + "balance_loss_clip": 1.32304013, + "balance_loss_mlp": 0.22270302, + "epoch": 0.2462648429280024, + "flos": 69154436315520.0, + "grad_norm": 0.6860879517498415, + "language_loss": 0.56359452, + "learning_rate": 3.5291085049841042e-06, + "loss": 0.58123636, + "num_input_tokens_seen": 88208490, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.15429688, + "step": 4096, + "time_per_iteration": 3.260131359100342 + }, + { + "auxiliary_loss_clip": 0.01648998, + "auxiliary_loss_mlp": 0.0034901, + "balance_loss_clip": 1.27948403, + "balance_loss_mlp": 0.30657101, + "epoch": 0.24632496618067037, + "flos": 29459605887360.0, + "grad_norm": 20.535292674275055, + "language_loss": 0.83947718, + "learning_rate": 3.5288574448135773e-06, + "loss": 0.85945725, + "num_input_tokens_seen": 88228050, + "router_z_loss_clip": 3.69335938, + "router_z_loss_mlp": 0.42456055, + "step": 4097, + "time_per_iteration": 2.7194676399230957 + }, + { + "auxiliary_loss_clip": 0.01644025, + "auxiliary_loss_mlp": 0.00356552, + "balance_loss_clip": 1.26546001, + "balance_loss_mlp": 0.31451833, + "epoch": 0.24638508943333834, + "flos": 24316444068480.0, + "grad_norm": 35.486127284077924, + "language_loss": 0.81548774, + "learning_rate": 3.5286063266686235e-06, + "loss": 0.83549356, + "num_input_tokens_seen": 88248090, + "router_z_loss_clip": 3.78320312, + "router_z_loss_mlp": 0.42041016, + "step": 4098, + "time_per_iteration": 2.849799394607544 + }, + { + "auxiliary_loss_clip": 0.01640582, + "auxiliary_loss_mlp": 0.00344412, + "balance_loss_clip": 1.26842737, + "balance_loss_mlp": 0.30397645, + "epoch": 0.2464452126860063, + "flos": 26613254401920.0, + "grad_norm": 16.075970307857546, + "language_loss": 0.75530541, + "learning_rate": 3.528355150558764e-06, + "loss": 0.77515537, + "num_input_tokens_seen": 88267545, + "router_z_loss_clip": 3.72265625, + "router_z_loss_mlp": 0.40405273, + "step": 4099, + "time_per_iteration": 2.814713954925537 + }, + { + "auxiliary_loss_clip": 0.01641718, + "auxiliary_loss_mlp": 0.00335346, + "balance_loss_clip": 1.27401161, + "balance_loss_mlp": 0.29641211, + "epoch": 0.24650533593867427, + "flos": 31212405763200.0, + "grad_norm": 4.568995210169731, + "language_loss": 0.71435982, + "learning_rate": 3.5281039164935237e-06, + "loss": 0.73413044, + "num_input_tokens_seen": 88289785, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 0.38916016, + "step": 4100, + "time_per_iteration": 2.782658815383911 + }, + { + "auxiliary_loss_clip": 0.01483643, + "auxiliary_loss_mlp": 0.00188695, + "balance_loss_clip": 1.29511964, + "balance_loss_mlp": 0.17248254, + "epoch": 0.24656545919134226, + "flos": 68494002900480.0, + "grad_norm": 0.7077342202645737, + "language_loss": 0.61420721, + "learning_rate": 3.5278526244824304e-06, + "loss": 0.6309306, + "num_input_tokens_seen": 88357320, + "router_z_loss_clip": 1.890625, + "router_z_loss_mlp": 0.16210938, + "step": 4101, + "time_per_iteration": 3.2875173091888428 + }, + { + "auxiliary_loss_clip": 0.01625483, + "auxiliary_loss_mlp": 0.00338431, + "balance_loss_clip": 1.25559545, + "balance_loss_mlp": 0.29883015, + "epoch": 0.24662558244401023, + "flos": 20084192179200.0, + "grad_norm": 7.298211494756958, + "language_loss": 0.78749192, + "learning_rate": 3.527601274535012e-06, + "loss": 0.80713111, + "num_input_tokens_seen": 88377040, + "router_z_loss_clip": 3.69921875, + "router_z_loss_mlp": 0.39575195, + "step": 4102, + "time_per_iteration": 2.630338668823242 + }, + { + "auxiliary_loss_clip": 0.01621932, + "auxiliary_loss_mlp": 0.00322387, + "balance_loss_clip": 1.2472657, + "balance_loss_mlp": 0.28497928, + "epoch": 0.2466857056966782, + "flos": 30701361012480.0, + "grad_norm": 96.5505603023039, + "language_loss": 0.81840634, + "learning_rate": 3.5273498666608004e-06, + "loss": 0.8378495, + "num_input_tokens_seen": 88395085, + "router_z_loss_clip": 3.7421875, + "router_z_loss_mlp": 0.37402344, + "step": 4103, + "time_per_iteration": 2.6935436725616455 + }, + { + "auxiliary_loss_clip": 0.01630983, + "auxiliary_loss_mlp": 0.00315578, + "balance_loss_clip": 1.2552824, + "balance_loss_mlp": 0.27597648, + "epoch": 0.24674582894934616, + "flos": 22528523669760.0, + "grad_norm": 8.341043070306116, + "language_loss": 0.85758579, + "learning_rate": 3.5270984008693288e-06, + "loss": 0.87705135, + "num_input_tokens_seen": 88413205, + "router_z_loss_clip": 3.7578125, + "router_z_loss_mlp": 0.39599609, + "step": 4104, + "time_per_iteration": 2.625530242919922 + }, + { + "auxiliary_loss_clip": 0.01616517, + "auxiliary_loss_mlp": 0.00308269, + "balance_loss_clip": 1.24409103, + "balance_loss_mlp": 0.27117145, + "epoch": 0.24680595220201412, + "flos": 20704297599360.0, + "grad_norm": 15.119404904229249, + "language_loss": 0.89451385, + "learning_rate": 3.526846877170133e-06, + "loss": 0.91376173, + "num_input_tokens_seen": 88431525, + "router_z_loss_clip": 3.72265625, + "router_z_loss_mlp": 0.37109375, + "step": 4105, + "time_per_iteration": 2.680436134338379 + }, + { + "auxiliary_loss_clip": 0.01610958, + "auxiliary_loss_mlp": 0.00338641, + "balance_loss_clip": 1.24424136, + "balance_loss_mlp": 0.30161488, + "epoch": 0.2468660754546821, + "flos": 21831174051840.0, + "grad_norm": 98.66766138659817, + "language_loss": 0.83992261, + "learning_rate": 3.52659529557275e-06, + "loss": 0.85941863, + "num_input_tokens_seen": 88451210, + "router_z_loss_clip": 3.66796875, + "router_z_loss_mlp": 0.37036133, + "step": 4106, + "time_per_iteration": 2.6495416164398193 + }, + { + "auxiliary_loss_clip": 0.01617868, + "auxiliary_loss_mlp": 0.00364904, + "balance_loss_clip": 1.24129784, + "balance_loss_mlp": 0.32642344, + "epoch": 0.24692619870735008, + "flos": 15267709578240.0, + "grad_norm": 14.343963145145667, + "language_loss": 0.8030206, + "learning_rate": 3.5263436560867205e-06, + "loss": 0.82284826, + "num_input_tokens_seen": 88467790, + "router_z_loss_clip": 3.76757812, + "router_z_loss_mlp": 0.38476562, + "step": 4107, + "time_per_iteration": 2.6064810752868652 + }, + { + "auxiliary_loss_clip": 0.01627781, + "auxiliary_loss_mlp": 0.00332309, + "balance_loss_clip": 1.25384164, + "balance_loss_mlp": 0.29287487, + "epoch": 0.24698632196001805, + "flos": 29680097523840.0, + "grad_norm": 13.286853741541114, + "language_loss": 0.74721944, + "learning_rate": 3.526091958721587e-06, + "loss": 0.76682037, + "num_input_tokens_seen": 88490330, + "router_z_loss_clip": 3.74023438, + "router_z_loss_mlp": 0.39453125, + "step": 4108, + "time_per_iteration": 2.6833479404449463 + }, + { + "auxiliary_loss_clip": 0.01586324, + "auxiliary_loss_mlp": 0.0035055, + "balance_loss_clip": 1.22097564, + "balance_loss_mlp": 0.31133023, + "epoch": 0.247046445212686, + "flos": 39165469741440.0, + "grad_norm": 2.6725233528591823, + "language_loss": 0.80024087, + "learning_rate": 3.5258402034868936e-06, + "loss": 0.81960964, + "num_input_tokens_seen": 88512435, + "router_z_loss_clip": 3.65429688, + "router_z_loss_mlp": 0.39208984, + "step": 4109, + "time_per_iteration": 2.8353703022003174 + }, + { + "auxiliary_loss_clip": 0.01579438, + "auxiliary_loss_mlp": 0.00352227, + "balance_loss_clip": 1.2160387, + "balance_loss_mlp": 0.31522477, + "epoch": 0.24710656846535398, + "flos": 22998845376000.0, + "grad_norm": 4.132501333207132, + "language_loss": 0.84114712, + "learning_rate": 3.5255883903921866e-06, + "loss": 0.86046374, + "num_input_tokens_seen": 88529780, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 0.36987305, + "step": 4110, + "time_per_iteration": 2.6308388710021973 + }, + { + "auxiliary_loss_clip": 0.01615343, + "auxiliary_loss_mlp": 0.00331821, + "balance_loss_clip": 1.24703717, + "balance_loss_mlp": 0.29312548, + "epoch": 0.24716669171802194, + "flos": 26432803451520.0, + "grad_norm": 71.16425149046033, + "language_loss": 0.88919067, + "learning_rate": 3.5253365194470144e-06, + "loss": 0.90866226, + "num_input_tokens_seen": 88547200, + "router_z_loss_clip": 3.6796875, + "router_z_loss_mlp": 0.38696289, + "step": 4111, + "time_per_iteration": 4.085323333740234 + }, + { + "auxiliary_loss_clip": 0.01603114, + "auxiliary_loss_mlp": 0.00358025, + "balance_loss_clip": 1.23195708, + "balance_loss_mlp": 0.31949612, + "epoch": 0.2472268149706899, + "flos": 23329870139520.0, + "grad_norm": 11.44956199850168, + "language_loss": 0.8055141, + "learning_rate": 3.5250845906609294e-06, + "loss": 0.82512558, + "num_input_tokens_seen": 88566415, + "router_z_loss_clip": 3.7109375, + "router_z_loss_mlp": 0.38549805, + "step": 4112, + "time_per_iteration": 2.643277883529663 + }, + { + "auxiliary_loss_clip": 0.01600087, + "auxiliary_loss_mlp": 0.00344439, + "balance_loss_clip": 1.22714186, + "balance_loss_mlp": 0.30698308, + "epoch": 0.24728693822335787, + "flos": 23768734510080.0, + "grad_norm": 3.3616311445321077, + "language_loss": 0.89675593, + "learning_rate": 3.5248326040434835e-06, + "loss": 0.91620123, + "num_input_tokens_seen": 88585225, + "router_z_loss_clip": 3.73046875, + "router_z_loss_mlp": 0.37426758, + "step": 4113, + "time_per_iteration": 2.6380510330200195 + }, + { + "auxiliary_loss_clip": 0.01563954, + "auxiliary_loss_mlp": 0.00299357, + "balance_loss_clip": 1.20575559, + "balance_loss_mlp": 0.26118588, + "epoch": 0.24734706147602586, + "flos": 19317499355520.0, + "grad_norm": 2.560508202444428, + "language_loss": 0.95158076, + "learning_rate": 3.5245805596042322e-06, + "loss": 0.97021389, + "num_input_tokens_seen": 88603280, + "router_z_loss_clip": 3.58007812, + "router_z_loss_mlp": 0.3815918, + "step": 4114, + "time_per_iteration": 4.060837984085083 + }, + { + "auxiliary_loss_clip": 0.01577795, + "auxiliary_loss_mlp": 0.0030376, + "balance_loss_clip": 1.21452498, + "balance_loss_mlp": 0.26928455, + "epoch": 0.24740718472869383, + "flos": 28036932935040.0, + "grad_norm": 2.593634770959704, + "language_loss": 0.81411016, + "learning_rate": 3.524328457352734e-06, + "loss": 0.83292568, + "num_input_tokens_seen": 88624925, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 0.3449707, + "step": 4115, + "time_per_iteration": 2.7037220001220703 + }, + { + "auxiliary_loss_clip": 0.01404145, + "auxiliary_loss_mlp": 0.00076857, + "balance_loss_clip": 1.24002099, + "balance_loss_mlp": 0.05959515, + "epoch": 0.2474673079813618, + "flos": 68107569408000.0, + "grad_norm": 1.5225242893981896, + "language_loss": 0.57638657, + "learning_rate": 3.5240762972985475e-06, + "loss": 0.59119654, + "num_input_tokens_seen": 88691475, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.17285156, + "step": 4116, + "time_per_iteration": 3.2344231605529785 + }, + { + "auxiliary_loss_clip": 0.01558125, + "auxiliary_loss_mlp": 0.00283354, + "balance_loss_clip": 1.2040813, + "balance_loss_mlp": 0.24730554, + "epoch": 0.24752743123402976, + "flos": 29462119839360.0, + "grad_norm": 14.660377114872066, + "language_loss": 0.8925041, + "learning_rate": 3.523824079451235e-06, + "loss": 0.91091889, + "num_input_tokens_seen": 88713425, + "router_z_loss_clip": 3.54101562, + "router_z_loss_mlp": 0.3605957, + "step": 4117, + "time_per_iteration": 2.7813124656677246 + }, + { + "auxiliary_loss_clip": 0.01378716, + "auxiliary_loss_mlp": 0.00087638, + "balance_loss_clip": 1.20988894, + "balance_loss_mlp": 0.07485904, + "epoch": 0.24758755448669773, + "flos": 58350459824640.0, + "grad_norm": 0.8751892768462821, + "language_loss": 0.63379711, + "learning_rate": 3.5235718038203602e-06, + "loss": 0.64846063, + "num_input_tokens_seen": 88769995, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.12792969, + "step": 4118, + "time_per_iteration": 4.359997749328613 + }, + { + "auxiliary_loss_clip": 0.01583993, + "auxiliary_loss_mlp": 0.00320558, + "balance_loss_clip": 1.22221804, + "balance_loss_mlp": 0.28367478, + "epoch": 0.2476476777393657, + "flos": 20484416494080.0, + "grad_norm": 20.689716289313967, + "language_loss": 0.8559221, + "learning_rate": 3.523319470415491e-06, + "loss": 0.87496758, + "num_input_tokens_seen": 88789970, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 0.3684082, + "step": 4119, + "time_per_iteration": 2.708580732345581 + }, + { + "auxiliary_loss_clip": 0.01558951, + "auxiliary_loss_mlp": 0.00287547, + "balance_loss_clip": 1.20591235, + "balance_loss_mlp": 0.25052103, + "epoch": 0.24770780099203366, + "flos": 20485853038080.0, + "grad_norm": 10.007822824854461, + "language_loss": 0.81251168, + "learning_rate": 3.5230670792461943e-06, + "loss": 0.83097667, + "num_input_tokens_seen": 88810000, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 0.37036133, + "step": 4120, + "time_per_iteration": 2.650531530380249 + }, + { + "auxiliary_loss_clip": 0.01560447, + "auxiliary_loss_mlp": 0.00324771, + "balance_loss_clip": 1.20536828, + "balance_loss_mlp": 0.28929389, + "epoch": 0.24776792424470165, + "flos": 15153405523200.0, + "grad_norm": 16.67719211000628, + "language_loss": 0.95121467, + "learning_rate": 3.522814630322041e-06, + "loss": 0.97006691, + "num_input_tokens_seen": 88827515, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 0.35498047, + "step": 4121, + "time_per_iteration": 2.5972020626068115 + }, + { + "auxiliary_loss_clip": 0.01559296, + "auxiliary_loss_mlp": 0.00308306, + "balance_loss_clip": 1.20173848, + "balance_loss_mlp": 0.27089852, + "epoch": 0.2478280474973696, + "flos": 21725453347200.0, + "grad_norm": 3.444485956588, + "language_loss": 0.78672338, + "learning_rate": 3.5225621236526045e-06, + "loss": 0.80539942, + "num_input_tokens_seen": 88845025, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 0.37402344, + "step": 4122, + "time_per_iteration": 2.647784948348999 + }, + { + "auxiliary_loss_clip": 0.01550757, + "auxiliary_loss_mlp": 0.00282857, + "balance_loss_clip": 1.19222713, + "balance_loss_mlp": 0.24578337, + "epoch": 0.24788817075003758, + "flos": 20412200200320.0, + "grad_norm": 2.2510618426609623, + "language_loss": 0.86900854, + "learning_rate": 3.5223095592474596e-06, + "loss": 0.88734466, + "num_input_tokens_seen": 88861740, + "router_z_loss_clip": 3.58789062, + "router_z_loss_mlp": 0.37060547, + "step": 4123, + "time_per_iteration": 2.631120443344116 + }, + { + "auxiliary_loss_clip": 0.01566763, + "auxiliary_loss_mlp": 0.00296299, + "balance_loss_clip": 1.21172571, + "balance_loss_mlp": 0.26139462, + "epoch": 0.24794829400270554, + "flos": 22594455083520.0, + "grad_norm": 3.529432966033417, + "language_loss": 0.79914916, + "learning_rate": 3.5220569371161846e-06, + "loss": 0.81777978, + "num_input_tokens_seen": 88879740, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 0.34912109, + "step": 4124, + "time_per_iteration": 4.1663713455200195 + }, + { + "auxiliary_loss_clip": 0.01560277, + "auxiliary_loss_mlp": 0.00297022, + "balance_loss_clip": 1.20577228, + "balance_loss_mlp": 0.26364291, + "epoch": 0.2480084172553735, + "flos": 39676047615360.0, + "grad_norm": 6.504756330539726, + "language_loss": 0.79161829, + "learning_rate": 3.521804257268357e-06, + "loss": 0.81019127, + "num_input_tokens_seen": 88904095, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 0.33398438, + "step": 4125, + "time_per_iteration": 2.7839128971099854 + }, + { + "auxiliary_loss_clip": 0.01582479, + "auxiliary_loss_mlp": 0.00349939, + "balance_loss_clip": 1.21150875, + "balance_loss_mlp": 0.31288838, + "epoch": 0.24806854050804147, + "flos": 22053712763520.0, + "grad_norm": 30.57992046959528, + "language_loss": 0.77504444, + "learning_rate": 3.5215515197135595e-06, + "loss": 0.79436862, + "num_input_tokens_seen": 88920740, + "router_z_loss_clip": 3.71484375, + "router_z_loss_mlp": 0.37060547, + "step": 4126, + "time_per_iteration": 2.621971845626831 + }, + { + "auxiliary_loss_clip": 0.01588919, + "auxiliary_loss_mlp": 0.00328616, + "balance_loss_clip": 1.22084582, + "balance_loss_mlp": 0.29237676, + "epoch": 0.24812866376070947, + "flos": 15486764670720.0, + "grad_norm": 3.494661066054162, + "language_loss": 0.8892861, + "learning_rate": 3.5212987244613764e-06, + "loss": 0.90846145, + "num_input_tokens_seen": 88938510, + "router_z_loss_clip": 3.68164062, + "router_z_loss_mlp": 0.36230469, + "step": 4127, + "time_per_iteration": 2.619274854660034 + }, + { + "auxiliary_loss_clip": 0.01593439, + "auxiliary_loss_mlp": 0.00300869, + "balance_loss_clip": 1.22469234, + "balance_loss_mlp": 0.26434332, + "epoch": 0.24818878701337743, + "flos": 14757419013120.0, + "grad_norm": 10.847052260671141, + "language_loss": 0.91142428, + "learning_rate": 3.5210458715213927e-06, + "loss": 0.93036735, + "num_input_tokens_seen": 88955235, + "router_z_loss_clip": 3.6875, + "router_z_loss_mlp": 0.36499023, + "step": 4128, + "time_per_iteration": 2.594078302383423 + }, + { + "auxiliary_loss_clip": 0.01588823, + "auxiliary_loss_mlp": 0.00328826, + "balance_loss_clip": 1.21915174, + "balance_loss_mlp": 0.29127467, + "epoch": 0.2482489102660454, + "flos": 27089501852160.0, + "grad_norm": 70.58542331293015, + "language_loss": 0.73438179, + "learning_rate": 3.5207929609031973e-06, + "loss": 0.75355828, + "num_input_tokens_seen": 88975210, + "router_z_loss_clip": 3.6953125, + "router_z_loss_mlp": 0.37548828, + "step": 4129, + "time_per_iteration": 2.6914682388305664 + }, + { + "auxiliary_loss_clip": 0.01575628, + "auxiliary_loss_mlp": 0.0034094, + "balance_loss_clip": 1.20976615, + "balance_loss_mlp": 0.30434281, + "epoch": 0.24830903351871336, + "flos": 26467528924800.0, + "grad_norm": 35.98961304762489, + "language_loss": 0.81956077, + "learning_rate": 3.5205399926163806e-06, + "loss": 0.83872646, + "num_input_tokens_seen": 88996120, + "router_z_loss_clip": 3.66015625, + "router_z_loss_mlp": 0.36621094, + "step": 4130, + "time_per_iteration": 2.686846971511841 + }, + { + "auxiliary_loss_clip": 0.01597437, + "auxiliary_loss_mlp": 0.00357673, + "balance_loss_clip": 1.22617579, + "balance_loss_mlp": 0.31926346, + "epoch": 0.24836915677138133, + "flos": 10228436870400.0, + "grad_norm": 33.63542345259908, + "language_loss": 0.86513841, + "learning_rate": 3.520286966670535e-06, + "loss": 0.88468957, + "num_input_tokens_seen": 89008685, + "router_z_loss_clip": 3.71289062, + "router_z_loss_mlp": 0.3840332, + "step": 4131, + "time_per_iteration": 2.600212574005127 + }, + { + "auxiliary_loss_clip": 0.01606668, + "auxiliary_loss_mlp": 0.00311862, + "balance_loss_clip": 1.23475814, + "balance_loss_mlp": 0.27605128, + "epoch": 0.2484292800240493, + "flos": 30080429579520.0, + "grad_norm": 40.67960033110541, + "language_loss": 0.88529718, + "learning_rate": 3.520033883075255e-06, + "loss": 0.90448248, + "num_input_tokens_seen": 89031160, + "router_z_loss_clip": 3.71679688, + "router_z_loss_mlp": 0.35791016, + "step": 4132, + "time_per_iteration": 2.76163649559021 + }, + { + "auxiliary_loss_clip": 0.01624537, + "auxiliary_loss_mlp": 0.00358648, + "balance_loss_clip": 1.24673653, + "balance_loss_mlp": 0.31587541, + "epoch": 0.24848940327671726, + "flos": 13442944803840.0, + "grad_norm": 22.434479328811985, + "language_loss": 0.77397722, + "learning_rate": 3.5197807418401386e-06, + "loss": 0.79380906, + "num_input_tokens_seen": 89047235, + "router_z_loss_clip": 3.77929688, + "router_z_loss_mlp": 0.42797852, + "step": 4133, + "time_per_iteration": 2.6229987144470215 + }, + { + "auxiliary_loss_clip": 0.0163632, + "auxiliary_loss_mlp": 0.00339809, + "balance_loss_clip": 1.24445057, + "balance_loss_mlp": 0.30073196, + "epoch": 0.24854952652938525, + "flos": 19970247260160.0, + "grad_norm": 318.8768316898681, + "language_loss": 0.71988219, + "learning_rate": 3.5195275429747834e-06, + "loss": 0.73964345, + "num_input_tokens_seen": 89064790, + "router_z_loss_clip": 3.91796875, + "router_z_loss_mlp": 0.390625, + "step": 4134, + "time_per_iteration": 2.6628546714782715 + }, + { + "auxiliary_loss_clip": 0.01632215, + "auxiliary_loss_mlp": 0.00311941, + "balance_loss_clip": 1.24456239, + "balance_loss_mlp": 0.27531958, + "epoch": 0.24860964978205322, + "flos": 18150187167360.0, + "grad_norm": 98.16990109395587, + "language_loss": 0.8436414, + "learning_rate": 3.5192742864887914e-06, + "loss": 0.86308295, + "num_input_tokens_seen": 89083250, + "router_z_loss_clip": 3.87695312, + "router_z_loss_mlp": 0.36621094, + "step": 4135, + "time_per_iteration": 2.6176228523254395 + }, + { + "auxiliary_loss_clip": 0.01618182, + "auxiliary_loss_mlp": 0.00358081, + "balance_loss_clip": 1.23771203, + "balance_loss_mlp": 0.31957677, + "epoch": 0.24866977303472118, + "flos": 11728641329280.0, + "grad_norm": 6.034747879130095, + "language_loss": 0.91327989, + "learning_rate": 3.5190209723917662e-06, + "loss": 0.93304253, + "num_input_tokens_seen": 89100905, + "router_z_loss_clip": 3.8046875, + "router_z_loss_mlp": 0.38500977, + "step": 4136, + "time_per_iteration": 2.636693000793457 + }, + { + "auxiliary_loss_clip": 0.0165105, + "auxiliary_loss_mlp": 0.00315477, + "balance_loss_clip": 1.2525146, + "balance_loss_mlp": 0.27702013, + "epoch": 0.24872989628738915, + "flos": 34823582565120.0, + "grad_norm": 3.9493585424694375, + "language_loss": 0.78993183, + "learning_rate": 3.518767600693314e-06, + "loss": 0.80959713, + "num_input_tokens_seen": 89122630, + "router_z_loss_clip": 3.98632812, + "router_z_loss_mlp": 0.38452148, + "step": 4137, + "time_per_iteration": 2.8064165115356445 + }, + { + "auxiliary_loss_clip": 0.0165912, + "auxiliary_loss_mlp": 0.00344801, + "balance_loss_clip": 1.25828147, + "balance_loss_mlp": 0.30474666, + "epoch": 0.2487900195400571, + "flos": 13699347062400.0, + "grad_norm": 48.897771210315916, + "language_loss": 0.74513113, + "learning_rate": 3.518514171403042e-06, + "loss": 0.76517034, + "num_input_tokens_seen": 89141050, + "router_z_loss_clip": 4.0078125, + "router_z_loss_mlp": 0.40063477, + "step": 4138, + "time_per_iteration": 2.6587345600128174 + }, + { + "auxiliary_loss_clip": 0.01681615, + "auxiliary_loss_mlp": 0.00332011, + "balance_loss_clip": 1.27690828, + "balance_loss_mlp": 0.2916702, + "epoch": 0.24885014279272508, + "flos": 25337815297920.0, + "grad_norm": 4.99907537749592, + "language_loss": 0.88505793, + "learning_rate": 3.51826068453056e-06, + "loss": 0.90519416, + "num_input_tokens_seen": 89160810, + "router_z_loss_clip": 4.046875, + "router_z_loss_mlp": 0.40332031, + "step": 4139, + "time_per_iteration": 2.680232524871826 + }, + { + "auxiliary_loss_clip": 0.01670372, + "auxiliary_loss_mlp": 0.00368882, + "balance_loss_clip": 1.26457274, + "balance_loss_mlp": 0.32897103, + "epoch": 0.24891026604539307, + "flos": 20631434860800.0, + "grad_norm": 3.733303265796573, + "language_loss": 0.85245162, + "learning_rate": 3.518007140085481e-06, + "loss": 0.87284416, + "num_input_tokens_seen": 89180610, + "router_z_loss_clip": 4.06640625, + "router_z_loss_mlp": 0.39868164, + "step": 4140, + "time_per_iteration": 2.649062395095825 + }, + { + "auxiliary_loss_clip": 0.01378362, + "auxiliary_loss_mlp": 0.000798, + "balance_loss_clip": 1.19434214, + "balance_loss_mlp": 0.06644814, + "epoch": 0.24897038929806103, + "flos": 66960294030720.0, + "grad_norm": 0.7916896047721058, + "language_loss": 0.6031363, + "learning_rate": 3.51775353807742e-06, + "loss": 0.61771792, + "num_input_tokens_seen": 89241880, + "router_z_loss_clip": 1.84375, + "router_z_loss_mlp": 0.13378906, + "step": 4141, + "time_per_iteration": 3.223546028137207 + }, + { + "auxiliary_loss_clip": 0.01667314, + "auxiliary_loss_mlp": 0.00338031, + "balance_loss_clip": 1.26513076, + "balance_loss_mlp": 0.29833466, + "epoch": 0.249030512550729, + "flos": 36392555612160.0, + "grad_norm": 2.2462992065935645, + "language_loss": 0.79525411, + "learning_rate": 3.5174998785159913e-06, + "loss": 0.81530756, + "num_input_tokens_seen": 89263340, + "router_z_loss_clip": 4.02148438, + "router_z_loss_mlp": 0.39672852, + "step": 4142, + "time_per_iteration": 2.818804979324341 + }, + { + "auxiliary_loss_clip": 0.01677843, + "auxiliary_loss_mlp": 0.00368829, + "balance_loss_clip": 1.27150941, + "balance_loss_mlp": 0.32708192, + "epoch": 0.24909063580339696, + "flos": 20154576879360.0, + "grad_norm": 8.88068899723159, + "language_loss": 0.86440003, + "learning_rate": 3.5172461614108157e-06, + "loss": 0.88486671, + "num_input_tokens_seen": 89282870, + "router_z_loss_clip": 4.06054688, + "router_z_loss_mlp": 0.41723633, + "step": 4143, + "time_per_iteration": 2.615807294845581 + }, + { + "auxiliary_loss_clip": 0.01673431, + "auxiliary_loss_mlp": 0.00350429, + "balance_loss_clip": 1.26996374, + "balance_loss_mlp": 0.31030327, + "epoch": 0.24915075905606493, + "flos": 26396569607040.0, + "grad_norm": 10.378688528502895, + "language_loss": 0.64769375, + "learning_rate": 3.5169923867715137e-06, + "loss": 0.66793239, + "num_input_tokens_seen": 89303830, + "router_z_loss_clip": 4.03515625, + "router_z_loss_mlp": 0.40136719, + "step": 4144, + "time_per_iteration": 2.78159236907959 + }, + { + "auxiliary_loss_clip": 0.01682066, + "auxiliary_loss_mlp": 0.00320384, + "balance_loss_clip": 1.27979279, + "balance_loss_mlp": 0.28249896, + "epoch": 0.2492108823087329, + "flos": 27527216987520.0, + "grad_norm": 2.287360875936286, + "language_loss": 0.84512997, + "learning_rate": 3.516738554607708e-06, + "loss": 0.8651545, + "num_input_tokens_seen": 89324350, + "router_z_loss_clip": 4.015625, + "router_z_loss_mlp": 0.37866211, + "step": 4145, + "time_per_iteration": 2.8103249073028564 + }, + { + "auxiliary_loss_clip": 0.01687868, + "auxiliary_loss_mlp": 0.0036843, + "balance_loss_clip": 1.28472412, + "balance_loss_mlp": 0.32506174, + "epoch": 0.24927100556140086, + "flos": 16691388111360.0, + "grad_norm": 88.56428761737007, + "language_loss": 0.76088572, + "learning_rate": 3.5164846649290253e-06, + "loss": 0.78144872, + "num_input_tokens_seen": 89342875, + "router_z_loss_clip": 4.03125, + "router_z_loss_mlp": 0.43359375, + "step": 4146, + "time_per_iteration": 2.6481122970581055 + }, + { + "auxiliary_loss_clip": 0.01433006, + "auxiliary_loss_mlp": 0.0007412, + "balance_loss_clip": 1.24524558, + "balance_loss_mlp": 0.05809785, + "epoch": 0.24933112881406885, + "flos": 62772464286720.0, + "grad_norm": 0.9391876255772994, + "language_loss": 0.67174792, + "learning_rate": 3.5162307177450915e-06, + "loss": 0.6868192, + "num_input_tokens_seen": 89404925, + "router_z_loss_clip": 1.875, + "router_z_loss_mlp": 0.16015625, + "step": 4147, + "time_per_iteration": 3.293832302093506 + }, + { + "auxiliary_loss_clip": 0.01676762, + "auxiliary_loss_mlp": 0.00356581, + "balance_loss_clip": 1.27315474, + "balance_loss_mlp": 0.31650287, + "epoch": 0.24939125206673682, + "flos": 26651894457600.0, + "grad_norm": 1.6993454730732647, + "language_loss": 0.95997477, + "learning_rate": 3.5159767130655366e-06, + "loss": 0.98030818, + "num_input_tokens_seen": 89425090, + "router_z_loss_clip": 4.03710938, + "router_z_loss_mlp": 0.40087891, + "step": 4148, + "time_per_iteration": 2.700181245803833 + }, + { + "auxiliary_loss_clip": 0.01702047, + "auxiliary_loss_mlp": 0.0036396, + "balance_loss_clip": 1.28868723, + "balance_loss_mlp": 0.32319051, + "epoch": 0.24945137531940478, + "flos": 20704333512960.0, + "grad_norm": 68.05735301965714, + "language_loss": 0.75568092, + "learning_rate": 3.5157226508999935e-06, + "loss": 0.77634096, + "num_input_tokens_seen": 89442615, + "router_z_loss_clip": 4.1328125, + "router_z_loss_mlp": 0.40795898, + "step": 4149, + "time_per_iteration": 2.679426431655884 + }, + { + "auxiliary_loss_clip": 0.01679139, + "auxiliary_loss_mlp": 0.00361903, + "balance_loss_clip": 1.28914356, + "balance_loss_mlp": 0.32289767, + "epoch": 0.24951149857207275, + "flos": 23768662682880.0, + "grad_norm": 12.978639419053055, + "language_loss": 0.76382625, + "learning_rate": 3.515468531258095e-06, + "loss": 0.78423667, + "num_input_tokens_seen": 89463025, + "router_z_loss_clip": 3.90429688, + "router_z_loss_mlp": 0.38989258, + "step": 4150, + "time_per_iteration": 2.6783668994903564 + }, + { + "auxiliary_loss_clip": 0.0165879, + "auxiliary_loss_mlp": 0.00374914, + "balance_loss_clip": 1.26337111, + "balance_loss_mlp": 0.33433527, + "epoch": 0.2495716218247407, + "flos": 15664881237120.0, + "grad_norm": 15.940898924040004, + "language_loss": 0.78920627, + "learning_rate": 3.515214354149478e-06, + "loss": 0.80954325, + "num_input_tokens_seen": 89480225, + "router_z_loss_clip": 3.95507812, + "router_z_loss_mlp": 0.40600586, + "step": 4151, + "time_per_iteration": 2.6855456829071045 + }, + { + "auxiliary_loss_clip": 0.01670458, + "auxiliary_loss_mlp": 0.00378467, + "balance_loss_clip": 1.27448392, + "balance_loss_mlp": 0.33693409, + "epoch": 0.24963174507740868, + "flos": 24052499953920.0, + "grad_norm": 24.948380849339397, + "language_loss": 0.73644125, + "learning_rate": 3.514960119583781e-06, + "loss": 0.75693047, + "num_input_tokens_seen": 89496985, + "router_z_loss_clip": 3.9609375, + "router_z_loss_mlp": 0.41503906, + "step": 4152, + "time_per_iteration": 2.7001969814300537 + }, + { + "auxiliary_loss_clip": 0.01693832, + "auxiliary_loss_mlp": 0.00368962, + "balance_loss_clip": 1.29443991, + "balance_loss_mlp": 0.32809728, + "epoch": 0.24969186833007664, + "flos": 21799501234560.0, + "grad_norm": 6.18107207295018, + "language_loss": 0.83690363, + "learning_rate": 3.514705827570645e-06, + "loss": 0.85753155, + "num_input_tokens_seen": 89514420, + "router_z_loss_clip": 3.99609375, + "router_z_loss_mlp": 0.40869141, + "step": 4153, + "time_per_iteration": 4.045004367828369 + }, + { + "auxiliary_loss_clip": 0.01688524, + "auxiliary_loss_mlp": 0.00369848, + "balance_loss_clip": 1.29476607, + "balance_loss_mlp": 0.33055663, + "epoch": 0.24975199158274464, + "flos": 19938143479680.0, + "grad_norm": 3.0147549487533696, + "language_loss": 0.83342552, + "learning_rate": 3.514451478119711e-06, + "loss": 0.85400921, + "num_input_tokens_seen": 89532925, + "router_z_loss_clip": 3.93945312, + "router_z_loss_mlp": 0.39257812, + "step": 4154, + "time_per_iteration": 2.659853935241699 + }, + { + "auxiliary_loss_clip": 0.01699165, + "auxiliary_loss_mlp": 0.00382485, + "balance_loss_clip": 1.29056191, + "balance_loss_mlp": 0.33775753, + "epoch": 0.2498121148354126, + "flos": 25338389915520.0, + "grad_norm": 19.498646034506738, + "language_loss": 0.76917291, + "learning_rate": 3.5141970712406258e-06, + "loss": 0.78998947, + "num_input_tokens_seen": 89552855, + "router_z_loss_clip": 4.08789062, + "router_z_loss_mlp": 0.44775391, + "step": 4155, + "time_per_iteration": 2.6661174297332764 + }, + { + "auxiliary_loss_clip": 0.01713254, + "auxiliary_loss_mlp": 0.00394735, + "balance_loss_clip": 1.29966712, + "balance_loss_mlp": 0.35353652, + "epoch": 0.24987223808808057, + "flos": 20558787603840.0, + "grad_norm": 3.388049981669927, + "language_loss": 0.82391047, + "learning_rate": 3.513942606943036e-06, + "loss": 0.84499037, + "num_input_tokens_seen": 89572830, + "router_z_loss_clip": 4.13671875, + "router_z_loss_mlp": 0.41210938, + "step": 4156, + "time_per_iteration": 4.044252872467041 + }, + { + "auxiliary_loss_clip": 0.01722149, + "auxiliary_loss_mlp": 0.00382439, + "balance_loss_clip": 1.31101, + "balance_loss_mlp": 0.33880812, + "epoch": 0.24993236134074853, + "flos": 19749037351680.0, + "grad_norm": 8.458180837666884, + "language_loss": 0.85453051, + "learning_rate": 3.513688085236591e-06, + "loss": 0.87557638, + "num_input_tokens_seen": 89590345, + "router_z_loss_clip": 4.11523438, + "router_z_loss_mlp": 0.43652344, + "step": 4157, + "time_per_iteration": 2.656287431716919 + }, + { + "auxiliary_loss_clip": 0.01697187, + "auxiliary_loss_mlp": 0.00382409, + "balance_loss_clip": 1.28843164, + "balance_loss_mlp": 0.33977976, + "epoch": 0.2499924845934165, + "flos": 18770292587520.0, + "grad_norm": 8.385051716522147, + "language_loss": 0.86735475, + "learning_rate": 3.513433506130942e-06, + "loss": 0.88815069, + "num_input_tokens_seen": 89610295, + "router_z_loss_clip": 4.08789062, + "router_z_loss_mlp": 0.42602539, + "step": 4158, + "time_per_iteration": 2.6642305850982666 + }, + { + "auxiliary_loss_clip": 0.01746591, + "auxiliary_loss_mlp": 0.00377001, + "balance_loss_clip": 1.32438135, + "balance_loss_mlp": 0.33539721, + "epoch": 0.25005260784608446, + "flos": 16872198197760.0, + "grad_norm": 5.182674637914498, + "language_loss": 0.82302916, + "learning_rate": 3.5131788696357427e-06, + "loss": 0.84426504, + "num_input_tokens_seen": 89627795, + "router_z_loss_clip": 4.2265625, + "router_z_loss_mlp": 0.41601562, + "step": 4159, + "time_per_iteration": 2.6599841117858887 + }, + { + "auxiliary_loss_clip": 0.01736726, + "auxiliary_loss_mlp": 0.00380718, + "balance_loss_clip": 1.31545138, + "balance_loss_mlp": 0.33498913, + "epoch": 0.2501127310987524, + "flos": 22124923476480.0, + "grad_norm": 4.360900478936708, + "language_loss": 0.77357417, + "learning_rate": 3.512924175760649e-06, + "loss": 0.79474854, + "num_input_tokens_seen": 89648090, + "router_z_loss_clip": 4.2109375, + "router_z_loss_mlp": 0.45703125, + "step": 4160, + "time_per_iteration": 4.100468158721924 + }, + { + "auxiliary_loss_clip": 0.0147425, + "auxiliary_loss_mlp": 0.00069325, + "balance_loss_clip": 1.28157544, + "balance_loss_mlp": 0.0516822, + "epoch": 0.2501728543514204, + "flos": 69458061980160.0, + "grad_norm": 1.1044878035630226, + "language_loss": 0.566706, + "learning_rate": 3.5126694245153186e-06, + "loss": 0.58214176, + "num_input_tokens_seen": 89710345, + "router_z_loss_clip": 1.921875, + "router_z_loss_mlp": 0.17675781, + "step": 4161, + "time_per_iteration": 3.2302401065826416 + }, + { + "auxiliary_loss_clip": 0.01750971, + "auxiliary_loss_mlp": 0.00400915, + "balance_loss_clip": 1.32357037, + "balance_loss_mlp": 0.35790455, + "epoch": 0.25023297760408836, + "flos": 16289978647680.0, + "grad_norm": 27.10750325978225, + "language_loss": 0.8851552, + "learning_rate": 3.5124146159094125e-06, + "loss": 0.90667403, + "num_input_tokens_seen": 89729390, + "router_z_loss_clip": 4.27734375, + "router_z_loss_mlp": 0.4296875, + "step": 4162, + "time_per_iteration": 2.740518093109131 + }, + { + "auxiliary_loss_clip": 0.01757411, + "auxiliary_loss_mlp": 0.00400136, + "balance_loss_clip": 1.32371676, + "balance_loss_mlp": 0.358055, + "epoch": 0.2502931008567563, + "flos": 12237998140800.0, + "grad_norm": 4.924283023869376, + "language_loss": 0.94599736, + "learning_rate": 3.5121597499525927e-06, + "loss": 0.96757287, + "num_input_tokens_seen": 89742805, + "router_z_loss_clip": 4.34179688, + "router_z_loss_mlp": 0.42089844, + "step": 4163, + "time_per_iteration": 2.5928537845611572 + }, + { + "auxiliary_loss_clip": 0.01773059, + "auxiliary_loss_mlp": 0.00403892, + "balance_loss_clip": 1.33595204, + "balance_loss_mlp": 0.36114311, + "epoch": 0.25035322410942434, + "flos": 23181882105600.0, + "grad_norm": 3.712350454006176, + "language_loss": 0.89638305, + "learning_rate": 3.5119048266545232e-06, + "loss": 0.91815257, + "num_input_tokens_seen": 89761145, + "router_z_loss_clip": 4.3671875, + "router_z_loss_mlp": 0.42749023, + "step": 4164, + "time_per_iteration": 2.680172920227051 + }, + { + "auxiliary_loss_clip": 0.01740181, + "auxiliary_loss_mlp": 0.00399107, + "balance_loss_clip": 1.31957436, + "balance_loss_mlp": 0.35597757, + "epoch": 0.2504133473620923, + "flos": 20917534688640.0, + "grad_norm": 14.820452076107669, + "language_loss": 0.78638607, + "learning_rate": 3.5116498460248716e-06, + "loss": 0.80777895, + "num_input_tokens_seen": 89780905, + "router_z_loss_clip": 4.2109375, + "router_z_loss_mlp": 0.43139648, + "step": 4165, + "time_per_iteration": 2.6597282886505127 + }, + { + "auxiliary_loss_clip": 0.01765597, + "auxiliary_loss_mlp": 0.00430017, + "balance_loss_clip": 1.32789409, + "balance_loss_mlp": 0.38321573, + "epoch": 0.2504734706147603, + "flos": 20776549806720.0, + "grad_norm": 8.285484897117435, + "language_loss": 0.79103994, + "learning_rate": 3.5113948080733062e-06, + "loss": 0.81299615, + "num_input_tokens_seen": 89799230, + "router_z_loss_clip": 4.375, + "router_z_loss_mlp": 0.46777344, + "step": 4166, + "time_per_iteration": 2.6459267139434814 + }, + { + "auxiliary_loss_clip": 0.01725853, + "auxiliary_loss_mlp": 0.00375757, + "balance_loss_clip": 1.30199277, + "balance_loss_mlp": 0.33365217, + "epoch": 0.25053359386742824, + "flos": 24349373861760.0, + "grad_norm": 12.23596311459132, + "language_loss": 0.87565714, + "learning_rate": 3.5111397128094973e-06, + "loss": 0.8966732, + "num_input_tokens_seen": 89818240, + "router_z_loss_clip": 4.2421875, + "router_z_loss_mlp": 0.42138672, + "step": 4167, + "time_per_iteration": 4.156676530838013 + }, + { + "auxiliary_loss_clip": 0.0172705, + "auxiliary_loss_mlp": 0.00371574, + "balance_loss_clip": 1.31781411, + "balance_loss_mlp": 0.33159083, + "epoch": 0.2505937171200962, + "flos": 21214336769280.0, + "grad_norm": 58.219360444293386, + "language_loss": 0.85679185, + "learning_rate": 3.51088456024312e-06, + "loss": 0.87777817, + "num_input_tokens_seen": 89834485, + "router_z_loss_clip": 4.09765625, + "router_z_loss_mlp": 0.3996582, + "step": 4168, + "time_per_iteration": 2.630072832107544 + }, + { + "auxiliary_loss_clip": 0.01735012, + "auxiliary_loss_mlp": 0.00395198, + "balance_loss_clip": 1.30172074, + "balance_loss_mlp": 0.34784758, + "epoch": 0.25065384037276417, + "flos": 41427231379200.0, + "grad_norm": 27.149577690611306, + "language_loss": 0.78424931, + "learning_rate": 3.510629350383849e-06, + "loss": 0.80555141, + "num_input_tokens_seen": 89855645, + "router_z_loss_clip": 4.328125, + "router_z_loss_mlp": 0.47412109, + "step": 4169, + "time_per_iteration": 2.7959632873535156 + }, + { + "auxiliary_loss_clip": 0.01721828, + "auxiliary_loss_mlp": 0.00345627, + "balance_loss_clip": 1.30208993, + "balance_loss_mlp": 0.30366546, + "epoch": 0.25071396362543213, + "flos": 26102389219200.0, + "grad_norm": 75.86475815591345, + "language_loss": 0.84749293, + "learning_rate": 3.510374083241361e-06, + "loss": 0.86816752, + "num_input_tokens_seen": 89874895, + "router_z_loss_clip": 4.1953125, + "router_z_loss_mlp": 0.41967773, + "step": 4170, + "time_per_iteration": 2.7124831676483154 + }, + { + "auxiliary_loss_clip": 0.01733714, + "auxiliary_loss_mlp": 0.00361093, + "balance_loss_clip": 1.30724955, + "balance_loss_mlp": 0.31979859, + "epoch": 0.2507740868781001, + "flos": 19098982967040.0, + "grad_norm": 4.287048929377042, + "language_loss": 0.83214402, + "learning_rate": 3.5101187588253368e-06, + "loss": 0.85309207, + "num_input_tokens_seen": 89891700, + "router_z_loss_clip": 4.25, + "router_z_loss_mlp": 0.4128418, + "step": 4171, + "time_per_iteration": 2.702528953552246 + }, + { + "auxiliary_loss_clip": 0.01562006, + "auxiliary_loss_mlp": 0.00115442, + "balance_loss_clip": 1.34955525, + "balance_loss_mlp": 0.10075565, + "epoch": 0.25083421013076806, + "flos": 64341868296960.0, + "grad_norm": 1.4656045544706184, + "language_loss": 0.60125053, + "learning_rate": 3.509863377145458e-06, + "loss": 0.618025, + "num_input_tokens_seen": 89955775, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.14648438, + "step": 4172, + "time_per_iteration": 3.1474382877349854 + }, + { + "auxiliary_loss_clip": 0.0172086, + "auxiliary_loss_mlp": 0.00355263, + "balance_loss_clip": 1.29701805, + "balance_loss_mlp": 0.31375426, + "epoch": 0.25089433338343603, + "flos": 24279599692800.0, + "grad_norm": 379.96847912480564, + "language_loss": 0.84558362, + "learning_rate": 3.509607938211409e-06, + "loss": 0.86634487, + "num_input_tokens_seen": 89977150, + "router_z_loss_clip": 4.23828125, + "router_z_loss_mlp": 0.41552734, + "step": 4173, + "time_per_iteration": 2.7111175060272217 + }, + { + "auxiliary_loss_clip": 0.01766929, + "auxiliary_loss_mlp": 0.00361936, + "balance_loss_clip": 1.32879353, + "balance_loss_mlp": 0.3186388, + "epoch": 0.250954456636104, + "flos": 14721472477440.0, + "grad_norm": 2.108061658194272, + "language_loss": 0.91189992, + "learning_rate": 3.509352442032875e-06, + "loss": 0.93318856, + "num_input_tokens_seen": 89994925, + "router_z_loss_clip": 4.37890625, + "router_z_loss_mlp": 0.43310547, + "step": 4174, + "time_per_iteration": 2.747072219848633 + }, + { + "auxiliary_loss_clip": 0.01746269, + "auxiliary_loss_mlp": 0.00382044, + "balance_loss_clip": 1.32358027, + "balance_loss_mlp": 0.3358624, + "epoch": 0.25101457988877196, + "flos": 22273593868800.0, + "grad_norm": 1683.8000366236479, + "language_loss": 0.78481686, + "learning_rate": 3.509096888619545e-06, + "loss": 0.80610001, + "num_input_tokens_seen": 90013235, + "router_z_loss_clip": 4.23046875, + "router_z_loss_mlp": 0.46191406, + "step": 4175, + "time_per_iteration": 2.7203521728515625 + }, + { + "auxiliary_loss_clip": 0.01723251, + "auxiliary_loss_mlp": 0.00368967, + "balance_loss_clip": 1.29564548, + "balance_loss_mlp": 0.32431072, + "epoch": 0.2510747031414399, + "flos": 25188929424000.0, + "grad_norm": 2.8406437051574653, + "language_loss": 0.86460221, + "learning_rate": 3.50884127798111e-06, + "loss": 0.88552439, + "num_input_tokens_seen": 90032150, + "router_z_loss_clip": 4.27734375, + "router_z_loss_mlp": 0.4465332, + "step": 4176, + "time_per_iteration": 2.6984965801239014 + }, + { + "auxiliary_loss_clip": 0.01709031, + "auxiliary_loss_mlp": 0.00347616, + "balance_loss_clip": 1.29871988, + "balance_loss_mlp": 0.30779988, + "epoch": 0.25113482639410795, + "flos": 20704189858560.0, + "grad_norm": 9.894091325548603, + "language_loss": 0.88829297, + "learning_rate": 3.5085856101272623e-06, + "loss": 0.90885949, + "num_input_tokens_seen": 90049085, + "router_z_loss_clip": 4.10351562, + "router_z_loss_mlp": 0.39819336, + "step": 4177, + "time_per_iteration": 2.6713273525238037 + }, + { + "auxiliary_loss_clip": 0.01740001, + "auxiliary_loss_mlp": 0.00366186, + "balance_loss_clip": 1.32228446, + "balance_loss_mlp": 0.32277048, + "epoch": 0.2511949496467759, + "flos": 21506936958720.0, + "grad_norm": 13.24306334502241, + "language_loss": 0.90009606, + "learning_rate": 3.508329885067698e-06, + "loss": 0.92115796, + "num_input_tokens_seen": 90067695, + "router_z_loss_clip": 4.171875, + "router_z_loss_mlp": 0.43383789, + "step": 4178, + "time_per_iteration": 2.6550631523132324 + }, + { + "auxiliary_loss_clip": 0.01724731, + "auxiliary_loss_mlp": 0.00339863, + "balance_loss_clip": 1.30957627, + "balance_loss_mlp": 0.29823545, + "epoch": 0.2512550728994439, + "flos": 20701999128960.0, + "grad_norm": 16.460488558139318, + "language_loss": 0.83062792, + "learning_rate": 3.508074102812112e-06, + "loss": 0.85127389, + "num_input_tokens_seen": 90083890, + "router_z_loss_clip": 4.15234375, + "router_z_loss_mlp": 0.41650391, + "step": 4179, + "time_per_iteration": 2.656933546066284 + }, + { + "auxiliary_loss_clip": 0.01721718, + "auxiliary_loss_mlp": 0.00365529, + "balance_loss_clip": 1.31100845, + "balance_loss_mlp": 0.32232744, + "epoch": 0.25131519615211184, + "flos": 18478626151680.0, + "grad_norm": 25.137492552808496, + "language_loss": 0.78887159, + "learning_rate": 3.507818263370206e-06, + "loss": 0.80974406, + "num_input_tokens_seen": 90100995, + "router_z_loss_clip": 4.10742188, + "router_z_loss_mlp": 0.43188477, + "step": 4180, + "time_per_iteration": 2.676567554473877 + }, + { + "auxiliary_loss_clip": 0.017452, + "auxiliary_loss_mlp": 0.00355899, + "balance_loss_clip": 1.33292937, + "balance_loss_mlp": 0.31701279, + "epoch": 0.2513753194047798, + "flos": 20484955198080.0, + "grad_norm": 48.8172449956689, + "language_loss": 0.91048241, + "learning_rate": 3.5075623667516796e-06, + "loss": 0.93149334, + "num_input_tokens_seen": 90120365, + "router_z_loss_clip": 4.12109375, + "router_z_loss_mlp": 0.38842773, + "step": 4181, + "time_per_iteration": 2.659618854522705 + }, + { + "auxiliary_loss_clip": 0.01725769, + "auxiliary_loss_mlp": 0.00361301, + "balance_loss_clip": 1.3215152, + "balance_loss_mlp": 0.3193866, + "epoch": 0.25143544265744777, + "flos": 37670077704960.0, + "grad_norm": 41.31115987201298, + "language_loss": 0.74679357, + "learning_rate": 3.507306412966238e-06, + "loss": 0.76766431, + "num_input_tokens_seen": 90142610, + "router_z_loss_clip": 4.04492188, + "router_z_loss_mlp": 0.41918945, + "step": 4182, + "time_per_iteration": 2.825866222381592 + }, + { + "auxiliary_loss_clip": 0.01496348, + "auxiliary_loss_mlp": 0.0014551, + "balance_loss_clip": 1.27973676, + "balance_loss_mlp": 0.1359729, + "epoch": 0.25149556591011574, + "flos": 69367457923200.0, + "grad_norm": 0.8748226155282093, + "language_loss": 0.70048219, + "learning_rate": 3.5070504020235853e-06, + "loss": 0.71690077, + "num_input_tokens_seen": 90200555, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.09521484, + "step": 4183, + "time_per_iteration": 3.179516315460205 + }, + { + "auxiliary_loss_clip": 0.01713858, + "auxiliary_loss_mlp": 0.00335354, + "balance_loss_clip": 1.30644178, + "balance_loss_mlp": 0.2945841, + "epoch": 0.2515556891627837, + "flos": 13990402967040.0, + "grad_norm": 13.194161648142764, + "language_loss": 0.81278026, + "learning_rate": 3.506794333933431e-06, + "loss": 0.83327246, + "num_input_tokens_seen": 90218120, + "router_z_loss_clip": 4.07421875, + "router_z_loss_mlp": 0.4074707, + "step": 4184, + "time_per_iteration": 2.6457557678222656 + }, + { + "auxiliary_loss_clip": 0.01703966, + "auxiliary_loss_mlp": 0.00336853, + "balance_loss_clip": 1.31213653, + "balance_loss_mlp": 0.29667962, + "epoch": 0.25161581241545167, + "flos": 22163527618560.0, + "grad_norm": 4.824601058984388, + "language_loss": 0.90101665, + "learning_rate": 3.506538208705484e-06, + "loss": 0.92142493, + "num_input_tokens_seen": 90236790, + "router_z_loss_clip": 3.921875, + "router_z_loss_mlp": 0.40209961, + "step": 4185, + "time_per_iteration": 2.668301820755005 + }, + { + "auxiliary_loss_clip": 0.01425414, + "auxiliary_loss_mlp": 0.00159628, + "balance_loss_clip": 1.23715138, + "balance_loss_mlp": 0.15195122, + "epoch": 0.25167593566811963, + "flos": 69358407696000.0, + "grad_norm": 0.805679358016476, + "language_loss": 0.61374801, + "learning_rate": 3.5062820263494574e-06, + "loss": 0.62959844, + "num_input_tokens_seen": 90297070, + "router_z_loss_clip": 1.890625, + "router_z_loss_mlp": 0.07666016, + "step": 4186, + "time_per_iteration": 3.0358080863952637 + }, + { + "auxiliary_loss_clip": 0.01684407, + "auxiliary_loss_mlp": 0.00353376, + "balance_loss_clip": 1.30350399, + "balance_loss_mlp": 0.31353626, + "epoch": 0.2517360589207876, + "flos": 13261452359040.0, + "grad_norm": 7.956608087473492, + "language_loss": 0.86799961, + "learning_rate": 3.5060257868750656e-06, + "loss": 0.88837743, + "num_input_tokens_seen": 90315255, + "router_z_loss_clip": 3.81054688, + "router_z_loss_mlp": 0.39819336, + "step": 4187, + "time_per_iteration": 2.6614584922790527 + }, + { + "auxiliary_loss_clip": 0.01673042, + "auxiliary_loss_mlp": 0.00312322, + "balance_loss_clip": 1.30131435, + "balance_loss_mlp": 0.27500916, + "epoch": 0.25179618217345556, + "flos": 20376828282240.0, + "grad_norm": 1.9757761599786032, + "language_loss": 0.85114944, + "learning_rate": 3.5057694902920244e-06, + "loss": 0.87100315, + "num_input_tokens_seen": 90334990, + "router_z_loss_clip": 3.71875, + "router_z_loss_mlp": 0.37304688, + "step": 4188, + "time_per_iteration": 2.6738290786743164 + }, + { + "auxiliary_loss_clip": 0.01677668, + "auxiliary_loss_mlp": 0.00314982, + "balance_loss_clip": 1.30055034, + "balance_loss_mlp": 0.27578634, + "epoch": 0.25185630542612353, + "flos": 27664718250240.0, + "grad_norm": 2.3627564519292568, + "language_loss": 0.81015015, + "learning_rate": 3.5055131366100534e-06, + "loss": 0.83007669, + "num_input_tokens_seen": 90351825, + "router_z_loss_clip": 3.77148438, + "router_z_loss_mlp": 0.39160156, + "step": 4189, + "time_per_iteration": 2.8758323192596436 + }, + { + "auxiliary_loss_clip": 0.01650569, + "auxiliary_loss_mlp": 0.00296342, + "balance_loss_clip": 1.28457665, + "balance_loss_mlp": 0.26043588, + "epoch": 0.25191642867879155, + "flos": 20996430912000.0, + "grad_norm": 7.851090411967549, + "language_loss": 0.91027111, + "learning_rate": 3.5052567258388745e-06, + "loss": 0.92974019, + "num_input_tokens_seen": 90369860, + "router_z_loss_clip": 3.6640625, + "router_z_loss_mlp": 0.35888672, + "step": 4190, + "time_per_iteration": 2.817430257797241 + }, + { + "auxiliary_loss_clip": 0.01625086, + "auxiliary_loss_mlp": 0.00318007, + "balance_loss_clip": 1.26379275, + "balance_loss_mlp": 0.27759552, + "epoch": 0.2519765519314595, + "flos": 21105671149440.0, + "grad_norm": 5.153109194520077, + "language_loss": 0.83802366, + "learning_rate": 3.5050002579882082e-06, + "loss": 0.8574546, + "num_input_tokens_seen": 90389245, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 0.40405273, + "step": 4191, + "time_per_iteration": 2.6803903579711914 + }, + { + "auxiliary_loss_clip": 0.01372878, + "auxiliary_loss_mlp": 0.0011357, + "balance_loss_clip": 1.18128633, + "balance_loss_mlp": 0.1053211, + "epoch": 0.2520366751841275, + "flos": 62744993360640.0, + "grad_norm": 0.7210883294475887, + "language_loss": 0.5648126, + "learning_rate": 3.5047437330677823e-06, + "loss": 0.5796771, + "num_input_tokens_seen": 90456735, + "router_z_loss_clip": 1.921875, + "router_z_loss_mlp": 0.08251953, + "step": 4192, + "time_per_iteration": 3.2181878089904785 + }, + { + "auxiliary_loss_clip": 0.01609207, + "auxiliary_loss_mlp": 0.00297123, + "balance_loss_clip": 1.26611495, + "balance_loss_mlp": 0.2615037, + "epoch": 0.25209679843679544, + "flos": 22230716008320.0, + "grad_norm": 2.5153332872618144, + "language_loss": 0.82166004, + "learning_rate": 3.504487151087323e-06, + "loss": 0.84072334, + "num_input_tokens_seen": 90474165, + "router_z_loss_clip": 3.4296875, + "router_z_loss_mlp": 0.35595703, + "step": 4193, + "time_per_iteration": 2.6363346576690674 + }, + { + "auxiliary_loss_clip": 0.01604959, + "auxiliary_loss_mlp": 0.00360085, + "balance_loss_clip": 1.25498414, + "balance_loss_mlp": 0.32036462, + "epoch": 0.2521569216894634, + "flos": 12166643773440.0, + "grad_norm": 15.25054986623241, + "language_loss": 0.91243666, + "learning_rate": 3.5042305120565598e-06, + "loss": 0.93208712, + "num_input_tokens_seen": 90491660, + "router_z_loss_clip": 3.5, + "router_z_loss_mlp": 0.3972168, + "step": 4194, + "time_per_iteration": 2.616605520248413 + }, + { + "auxiliary_loss_clip": 0.01589619, + "auxiliary_loss_mlp": 0.00330798, + "balance_loss_clip": 1.24687767, + "balance_loss_mlp": 0.29503548, + "epoch": 0.2522170449421314, + "flos": 23699786353920.0, + "grad_norm": 8.071640701383187, + "language_loss": 0.91762942, + "learning_rate": 3.5039738159852253e-06, + "loss": 0.93683362, + "num_input_tokens_seen": 90514025, + "router_z_loss_clip": 3.42773438, + "router_z_loss_mlp": 0.35766602, + "step": 4195, + "time_per_iteration": 4.136380195617676 + }, + { + "auxiliary_loss_clip": 0.01615715, + "auxiliary_loss_mlp": 0.00306963, + "balance_loss_clip": 1.26009738, + "balance_loss_mlp": 0.26504895, + "epoch": 0.25227716819479934, + "flos": 20955456472320.0, + "grad_norm": 6.70556622227512, + "language_loss": 0.9156577, + "learning_rate": 3.503717062883053e-06, + "loss": 0.93488455, + "num_input_tokens_seen": 90533530, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 0.41918945, + "step": 4196, + "time_per_iteration": 2.66886305809021 + }, + { + "auxiliary_loss_clip": 0.0158347, + "auxiliary_loss_mlp": 0.00341941, + "balance_loss_clip": 1.23952961, + "balance_loss_mlp": 0.30667895, + "epoch": 0.2523372914474673, + "flos": 23331342597120.0, + "grad_norm": 42.29913244984571, + "language_loss": 0.88880169, + "learning_rate": 3.5034602527597786e-06, + "loss": 0.90805578, + "num_input_tokens_seen": 90554025, + "router_z_loss_clip": 3.44140625, + "router_z_loss_mlp": 0.3527832, + "step": 4197, + "time_per_iteration": 2.747697591781616 + }, + { + "auxiliary_loss_clip": 0.01583687, + "auxiliary_loss_mlp": 0.00327252, + "balance_loss_clip": 1.23915565, + "balance_loss_mlp": 0.28645813, + "epoch": 0.25239741470013527, + "flos": 36970321875840.0, + "grad_norm": 3.4742372347917962, + "language_loss": 0.79797244, + "learning_rate": 3.5032033856251405e-06, + "loss": 0.81708187, + "num_input_tokens_seen": 90576930, + "router_z_loss_clip": 3.44335938, + "router_z_loss_mlp": 0.4074707, + "step": 4198, + "time_per_iteration": 4.180483341217041 + }, + { + "auxiliary_loss_clip": 0.01574, + "auxiliary_loss_mlp": 0.00371439, + "balance_loss_clip": 1.22474766, + "balance_loss_mlp": 0.32661596, + "epoch": 0.25245753795280323, + "flos": 18515757836160.0, + "grad_norm": 4.734608765369558, + "language_loss": 0.82854933, + "learning_rate": 3.50294646148888e-06, + "loss": 0.84800375, + "num_input_tokens_seen": 90595710, + "router_z_loss_clip": 3.4921875, + "router_z_loss_mlp": 0.44799805, + "step": 4199, + "time_per_iteration": 2.6905980110168457 + }, + { + "auxiliary_loss_clip": 0.01555098, + "auxiliary_loss_mlp": 0.00320783, + "balance_loss_clip": 1.21420097, + "balance_loss_mlp": 0.28122896, + "epoch": 0.2525176612054712, + "flos": 32344884737280.0, + "grad_norm": 28.722720742043276, + "language_loss": 0.80940276, + "learning_rate": 3.502689480360739e-06, + "loss": 0.8281616, + "num_input_tokens_seen": 90617945, + "router_z_loss_clip": 3.40820312, + "router_z_loss_mlp": 0.39526367, + "step": 4200, + "time_per_iteration": 2.7869579792022705 + }, + { + "auxiliary_loss_clip": 0.01571189, + "auxiliary_loss_mlp": 0.00319041, + "balance_loss_clip": 1.22936296, + "balance_loss_mlp": 0.28113225, + "epoch": 0.25257778445813917, + "flos": 45258217459200.0, + "grad_norm": 3.506313192125652, + "language_loss": 0.87721264, + "learning_rate": 3.5024324422504616e-06, + "loss": 0.89611501, + "num_input_tokens_seen": 90640855, + "router_z_loss_clip": 3.41796875, + "router_z_loss_mlp": 0.37915039, + "step": 4201, + "time_per_iteration": 2.8783254623413086 + }, + { + "auxiliary_loss_clip": 0.01568816, + "auxiliary_loss_mlp": 0.00335331, + "balance_loss_clip": 1.22597408, + "balance_loss_mlp": 0.29830423, + "epoch": 0.25263790771080713, + "flos": 23367791923200.0, + "grad_norm": 5.749991287640342, + "language_loss": 0.81828117, + "learning_rate": 3.5021753471677965e-06, + "loss": 0.83732271, + "num_input_tokens_seen": 90661350, + "router_z_loss_clip": 3.42773438, + "router_z_loss_mlp": 0.37036133, + "step": 4202, + "time_per_iteration": 4.1148083209991455 + }, + { + "auxiliary_loss_clip": 0.01555945, + "auxiliary_loss_mlp": 0.00323972, + "balance_loss_clip": 1.22036195, + "balance_loss_mlp": 0.28837645, + "epoch": 0.25269803096347515, + "flos": 18515039564160.0, + "grad_norm": 9.629399845386747, + "language_loss": 0.81411541, + "learning_rate": 3.501918195122491e-06, + "loss": 0.83291459, + "num_input_tokens_seen": 90680540, + "router_z_loss_clip": 3.359375, + "router_z_loss_mlp": 0.35620117, + "step": 4203, + "time_per_iteration": 2.6485085487365723 + }, + { + "auxiliary_loss_clip": 0.01554237, + "auxiliary_loss_mlp": 0.00353593, + "balance_loss_clip": 1.21712935, + "balance_loss_mlp": 0.3138963, + "epoch": 0.2527581542161431, + "flos": 24610552629120.0, + "grad_norm": 120.57566395971459, + "language_loss": 0.82850516, + "learning_rate": 3.501660986124297e-06, + "loss": 0.84758347, + "num_input_tokens_seen": 90703460, + "router_z_loss_clip": 3.37695312, + "router_z_loss_mlp": 0.39697266, + "step": 4204, + "time_per_iteration": 2.7450177669525146 + }, + { + "auxiliary_loss_clip": 0.01539411, + "auxiliary_loss_mlp": 0.00287728, + "balance_loss_clip": 1.20598948, + "balance_loss_mlp": 0.24915197, + "epoch": 0.2528182774688111, + "flos": 12641275111680.0, + "grad_norm": 8.807417309699238, + "language_loss": 0.82680202, + "learning_rate": 3.5014037201829684e-06, + "loss": 0.84507334, + "num_input_tokens_seen": 90718815, + "router_z_loss_clip": 3.3359375, + "router_z_loss_mlp": 0.38574219, + "step": 4205, + "time_per_iteration": 2.723830461502075 + }, + { + "auxiliary_loss_clip": 0.01526451, + "auxiliary_loss_mlp": 0.00282255, + "balance_loss_clip": 1.20809531, + "balance_loss_mlp": 0.24866211, + "epoch": 0.25287840072147905, + "flos": 46936789879680.0, + "grad_norm": 75.79627030497613, + "language_loss": 0.80593145, + "learning_rate": 3.50114639730826e-06, + "loss": 0.82401848, + "num_input_tokens_seen": 90742125, + "router_z_loss_clip": 3.18359375, + "router_z_loss_mlp": 0.3359375, + "step": 4206, + "time_per_iteration": 2.8919711112976074 + }, + { + "auxiliary_loss_clip": 0.01530199, + "auxiliary_loss_mlp": 0.0028257, + "balance_loss_clip": 1.20517254, + "balance_loss_mlp": 0.24842826, + "epoch": 0.252938523974147, + "flos": 18879712392960.0, + "grad_norm": 11.022642687157369, + "language_loss": 0.86206937, + "learning_rate": 3.5008890175099296e-06, + "loss": 0.88019705, + "num_input_tokens_seen": 90760785, + "router_z_loss_clip": 3.25390625, + "router_z_loss_mlp": 0.34130859, + "step": 4207, + "time_per_iteration": 2.6290981769561768 + }, + { + "auxiliary_loss_clip": 0.01516154, + "auxiliary_loss_mlp": 0.00303017, + "balance_loss_clip": 1.195081, + "balance_loss_mlp": 0.2696389, + "epoch": 0.252998647226815, + "flos": 21434720664960.0, + "grad_norm": 321.42915364295374, + "language_loss": 0.82478523, + "learning_rate": 3.5006315807977375e-06, + "loss": 0.84297693, + "num_input_tokens_seen": 90780045, + "router_z_loss_clip": 3.20898438, + "router_z_loss_mlp": 0.33349609, + "step": 4208, + "time_per_iteration": 2.647425889968872 + }, + { + "auxiliary_loss_clip": 0.01525648, + "auxiliary_loss_mlp": 0.0028523, + "balance_loss_clip": 1.20681, + "balance_loss_mlp": 0.25111198, + "epoch": 0.25305877047948294, + "flos": 25442171285760.0, + "grad_norm": 29.090275724091324, + "language_loss": 0.75740939, + "learning_rate": 3.5003740871814456e-06, + "loss": 0.77551818, + "num_input_tokens_seen": 90797980, + "router_z_loss_clip": 3.19140625, + "router_z_loss_mlp": 0.34106445, + "step": 4209, + "time_per_iteration": 4.050754070281982 + }, + { + "auxiliary_loss_clip": 0.01275699, + "auxiliary_loss_mlp": 0.00086552, + "balance_loss_clip": 1.07934809, + "balance_loss_mlp": 0.07820731, + "epoch": 0.2531188937321509, + "flos": 60185603629440.0, + "grad_norm": 0.7519981503578049, + "language_loss": 0.55076367, + "learning_rate": 3.5001165366708175e-06, + "loss": 0.56438619, + "num_input_tokens_seen": 90864865, + "router_z_loss_clip": 1.9609375, + "router_z_loss_mlp": 0.08349609, + "step": 4210, + "time_per_iteration": 3.2179529666900635 + }, + { + "auxiliary_loss_clip": 0.01521211, + "auxiliary_loss_mlp": 0.00302474, + "balance_loss_clip": 1.19806159, + "balance_loss_mlp": 0.26740283, + "epoch": 0.25317901698481887, + "flos": 19682387665920.0, + "grad_norm": 2.4354892209208345, + "language_loss": 0.85134113, + "learning_rate": 3.4998589292756204e-06, + "loss": 0.869578, + "num_input_tokens_seen": 90882885, + "router_z_loss_clip": 3.23046875, + "router_z_loss_mlp": 0.3503418, + "step": 4211, + "time_per_iteration": 2.642101764678955 + }, + { + "auxiliary_loss_clip": 0.0153171, + "auxiliary_loss_mlp": 0.00291641, + "balance_loss_clip": 1.21180427, + "balance_loss_mlp": 0.25907263, + "epoch": 0.25323914023748684, + "flos": 24424355502720.0, + "grad_norm": 2.839678746800825, + "language_loss": 0.82798111, + "learning_rate": 3.499601265005622e-06, + "loss": 0.84621465, + "num_input_tokens_seen": 90902985, + "router_z_loss_clip": 3.19921875, + "router_z_loss_mlp": 0.32592773, + "step": 4212, + "time_per_iteration": 2.6925222873687744 + }, + { + "auxiliary_loss_clip": 0.01531661, + "auxiliary_loss_mlp": 0.00312455, + "balance_loss_clip": 1.20815301, + "balance_loss_mlp": 0.27564344, + "epoch": 0.2532992634901548, + "flos": 25447450584960.0, + "grad_norm": 65.12943478300109, + "language_loss": 0.62873435, + "learning_rate": 3.4993435438705938e-06, + "loss": 0.64717555, + "num_input_tokens_seen": 90923550, + "router_z_loss_clip": 3.24023438, + "router_z_loss_mlp": 0.36816406, + "step": 4213, + "time_per_iteration": 2.6699957847595215 + }, + { + "auxiliary_loss_clip": 0.01523382, + "auxiliary_loss_mlp": 0.00301076, + "balance_loss_clip": 1.2063601, + "balance_loss_mlp": 0.26385936, + "epoch": 0.25335938674282277, + "flos": 18880538405760.0, + "grad_norm": 8.896055469047226, + "language_loss": 0.72874963, + "learning_rate": 3.499085765880308e-06, + "loss": 0.7469942, + "num_input_tokens_seen": 90943260, + "router_z_loss_clip": 3.16796875, + "router_z_loss_mlp": 0.37231445, + "step": 4214, + "time_per_iteration": 2.61181902885437 + }, + { + "auxiliary_loss_clip": 0.01295669, + "auxiliary_loss_mlp": 0.00135991, + "balance_loss_clip": 1.10108078, + "balance_loss_mlp": 0.12759836, + "epoch": 0.25341950999549073, + "flos": 53062649936640.0, + "grad_norm": 3.2014141401046796, + "language_loss": 0.57677394, + "learning_rate": 3.4988279310445396e-06, + "loss": 0.5910905, + "num_input_tokens_seen": 90996295, + "router_z_loss_clip": 1.953125, + "router_z_loss_mlp": 0.08398438, + "step": 4215, + "time_per_iteration": 2.9706528186798096 + }, + { + "auxiliary_loss_clip": 0.01538172, + "auxiliary_loss_mlp": 0.00292261, + "balance_loss_clip": 1.21908438, + "balance_loss_mlp": 0.25811917, + "epoch": 0.2534796332481587, + "flos": 39020247054720.0, + "grad_norm": 172.05271943626448, + "language_loss": 0.90038514, + "learning_rate": 3.498570039373066e-06, + "loss": 0.91868949, + "num_input_tokens_seen": 91017545, + "router_z_loss_clip": 3.19335938, + "router_z_loss_mlp": 0.34155273, + "step": 4216, + "time_per_iteration": 2.8323018550872803 + }, + { + "auxiliary_loss_clip": 0.01528391, + "auxiliary_loss_mlp": 0.00285296, + "balance_loss_clip": 1.21001101, + "balance_loss_mlp": 0.25206086, + "epoch": 0.2535397565008267, + "flos": 23586990670080.0, + "grad_norm": 4.074770662778696, + "language_loss": 0.86563402, + "learning_rate": 3.498312090875666e-06, + "loss": 0.88377088, + "num_input_tokens_seen": 91037715, + "router_z_loss_clip": 3.18554688, + "router_z_loss_mlp": 0.33227539, + "step": 4217, + "time_per_iteration": 2.680891990661621 + }, + { + "auxiliary_loss_clip": 0.01540522, + "auxiliary_loss_mlp": 0.00304752, + "balance_loss_clip": 1.22231889, + "balance_loss_mlp": 0.27022874, + "epoch": 0.2535998797534947, + "flos": 19281373251840.0, + "grad_norm": 5.752200767529421, + "language_loss": 0.83950114, + "learning_rate": 3.4980540855621218e-06, + "loss": 0.85795391, + "num_input_tokens_seen": 91055295, + "router_z_loss_clip": 3.18359375, + "router_z_loss_mlp": 0.34521484, + "step": 4218, + "time_per_iteration": 2.6433663368225098 + }, + { + "auxiliary_loss_clip": 0.01543577, + "auxiliary_loss_mlp": 0.0030859, + "balance_loss_clip": 1.22418165, + "balance_loss_mlp": 0.27192146, + "epoch": 0.25366000300616265, + "flos": 24024382583040.0, + "grad_norm": 9.055378812375205, + "language_loss": 0.81815314, + "learning_rate": 3.4977960234422167e-06, + "loss": 0.83667481, + "num_input_tokens_seen": 91075485, + "router_z_loss_clip": 3.19140625, + "router_z_loss_mlp": 0.36645508, + "step": 4219, + "time_per_iteration": 2.7160122394561768 + }, + { + "auxiliary_loss_clip": 0.0156381, + "auxiliary_loss_mlp": 0.00311778, + "balance_loss_clip": 1.23722303, + "balance_loss_mlp": 0.2755388, + "epoch": 0.2537201262588306, + "flos": 16289368116480.0, + "grad_norm": 18.78032902138606, + "language_loss": 0.86756265, + "learning_rate": 3.497537904525736e-06, + "loss": 0.88631856, + "num_input_tokens_seen": 91093620, + "router_z_loss_clip": 3.26953125, + "router_z_loss_mlp": 0.36230469, + "step": 4220, + "time_per_iteration": 2.619300603866577 + }, + { + "auxiliary_loss_clip": 0.01555862, + "auxiliary_loss_mlp": 0.00334232, + "balance_loss_clip": 1.2344842, + "balance_loss_mlp": 0.29784924, + "epoch": 0.2537802495114986, + "flos": 23294677789440.0, + "grad_norm": 5.322390171449813, + "language_loss": 0.78420442, + "learning_rate": 3.497279728822468e-06, + "loss": 0.80310535, + "num_input_tokens_seen": 91114110, + "router_z_loss_clip": 3.21289062, + "router_z_loss_mlp": 0.36352539, + "step": 4221, + "time_per_iteration": 2.788115978240967 + }, + { + "auxiliary_loss_clip": 0.01553838, + "auxiliary_loss_mlp": 0.00338031, + "balance_loss_clip": 1.22697818, + "balance_loss_mlp": 0.29962146, + "epoch": 0.25384037276416654, + "flos": 17639142416640.0, + "grad_norm": 7.085697036148153, + "language_loss": 0.69304383, + "learning_rate": 3.497021496342202e-06, + "loss": 0.71196252, + "num_input_tokens_seen": 91133135, + "router_z_loss_clip": 3.27148438, + "router_z_loss_mlp": 0.38427734, + "step": 4222, + "time_per_iteration": 2.655081033706665 + }, + { + "auxiliary_loss_clip": 0.01566229, + "auxiliary_loss_mlp": 0.00358182, + "balance_loss_clip": 1.24218881, + "balance_loss_mlp": 0.32077387, + "epoch": 0.2539004960168345, + "flos": 21507044699520.0, + "grad_norm": 5.033238237654, + "language_loss": 0.8004297, + "learning_rate": 3.496763207094731e-06, + "loss": 0.81967378, + "num_input_tokens_seen": 91151805, + "router_z_loss_clip": 3.23632812, + "router_z_loss_mlp": 0.37402344, + "step": 4223, + "time_per_iteration": 2.730522394180298 + }, + { + "auxiliary_loss_clip": 0.01569542, + "auxiliary_loss_mlp": 0.0034086, + "balance_loss_clip": 1.24866748, + "balance_loss_mlp": 0.30619407, + "epoch": 0.2539606192695025, + "flos": 23950909313280.0, + "grad_norm": 2.50144211063995, + "language_loss": 0.85496986, + "learning_rate": 3.49650486108985e-06, + "loss": 0.87407386, + "num_input_tokens_seen": 91172270, + "router_z_loss_clip": 3.2109375, + "router_z_loss_mlp": 0.34643555, + "step": 4224, + "time_per_iteration": 2.69619083404541 + }, + { + "auxiliary_loss_clip": 0.01556332, + "auxiliary_loss_mlp": 0.00346988, + "balance_loss_clip": 1.23694634, + "balance_loss_mlp": 0.30915052, + "epoch": 0.25402074252217044, + "flos": 24169784837760.0, + "grad_norm": 3.7296491149198823, + "language_loss": 0.8279196, + "learning_rate": 3.496246458337354e-06, + "loss": 0.84695274, + "num_input_tokens_seen": 91192080, + "router_z_loss_clip": 3.1953125, + "router_z_loss_mlp": 0.37792969, + "step": 4225, + "time_per_iteration": 2.7021117210388184 + }, + { + "auxiliary_loss_clip": 0.01555082, + "auxiliary_loss_mlp": 0.00359136, + "balance_loss_clip": 1.2422998, + "balance_loss_mlp": 0.32208586, + "epoch": 0.2540808657748384, + "flos": 22303758314880.0, + "grad_norm": 7.718800424302671, + "language_loss": 0.89968348, + "learning_rate": 3.4959879988470426e-06, + "loss": 0.91882569, + "num_input_tokens_seen": 91211450, + "router_z_loss_clip": 3.1328125, + "router_z_loss_mlp": 0.37036133, + "step": 4226, + "time_per_iteration": 2.6514675617218018 + }, + { + "auxiliary_loss_clip": 0.0153839, + "auxiliary_loss_mlp": 0.00333168, + "balance_loss_clip": 1.22410047, + "balance_loss_mlp": 0.29723793, + "epoch": 0.25414098902750637, + "flos": 27599541022080.0, + "grad_norm": 11.675907811798416, + "language_loss": 0.77429795, + "learning_rate": 3.4957294826287164e-06, + "loss": 0.79301351, + "num_input_tokens_seen": 91231835, + "router_z_loss_clip": 3.14648438, + "router_z_loss_mlp": 0.35913086, + "step": 4227, + "time_per_iteration": 2.7726974487304688 + }, + { + "auxiliary_loss_clip": 0.0137435, + "auxiliary_loss_mlp": 0.00197491, + "balance_loss_clip": 1.20258451, + "balance_loss_mlp": 0.1893373, + "epoch": 0.25420111228017434, + "flos": 58170834887040.0, + "grad_norm": 1.17377216590894, + "language_loss": 0.61718476, + "learning_rate": 3.4954709096921785e-06, + "loss": 0.63290322, + "num_input_tokens_seen": 91288755, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.08154297, + "step": 4228, + "time_per_iteration": 3.0211005210876465 + }, + { + "auxiliary_loss_clip": 0.01555417, + "auxiliary_loss_mlp": 0.00334805, + "balance_loss_clip": 1.23528361, + "balance_loss_mlp": 0.29777879, + "epoch": 0.2542612355328423, + "flos": 11464409905920.0, + "grad_norm": 115.09788547368345, + "language_loss": 0.95322537, + "learning_rate": 3.4952122800472336e-06, + "loss": 0.97212756, + "num_input_tokens_seen": 91302485, + "router_z_loss_clip": 3.20117188, + "router_z_loss_mlp": 0.37036133, + "step": 4229, + "time_per_iteration": 2.5889062881469727 + }, + { + "auxiliary_loss_clip": 0.01563052, + "auxiliary_loss_mlp": 0.003312, + "balance_loss_clip": 1.25449443, + "balance_loss_mlp": 0.29488844, + "epoch": 0.2543213587855103, + "flos": 22965879669120.0, + "grad_norm": 12.378293995279423, + "language_loss": 0.83237767, + "learning_rate": 3.4949535937036892e-06, + "loss": 0.85132015, + "num_input_tokens_seen": 91321120, + "router_z_loss_clip": 3.08789062, + "router_z_loss_mlp": 0.36303711, + "step": 4230, + "time_per_iteration": 2.6581215858459473 + }, + { + "auxiliary_loss_clip": 0.01543072, + "auxiliary_loss_mlp": 0.00356755, + "balance_loss_clip": 1.23031211, + "balance_loss_mlp": 0.31820303, + "epoch": 0.2543814820381783, + "flos": 18253178438400.0, + "grad_norm": 11.142742653261285, + "language_loss": 0.80322611, + "learning_rate": 3.4946948506713544e-06, + "loss": 0.82222444, + "num_input_tokens_seen": 91338575, + "router_z_loss_clip": 3.12695312, + "router_z_loss_mlp": 0.38574219, + "step": 4231, + "time_per_iteration": 2.6209990978240967 + }, + { + "auxiliary_loss_clip": 0.01556348, + "auxiliary_loss_mlp": 0.00347078, + "balance_loss_clip": 1.23863435, + "balance_loss_mlp": 0.31234065, + "epoch": 0.25444160529084625, + "flos": 15632705629440.0, + "grad_norm": 7.895121350920693, + "language_loss": 0.80303478, + "learning_rate": 3.4944360509600416e-06, + "loss": 0.82206905, + "num_input_tokens_seen": 91357355, + "router_z_loss_clip": 3.17382812, + "router_z_loss_mlp": 0.34765625, + "step": 4232, + "time_per_iteration": 2.654867172241211 + }, + { + "auxiliary_loss_clip": 0.01543942, + "auxiliary_loss_mlp": 0.00359716, + "balance_loss_clip": 1.2314595, + "balance_loss_mlp": 0.32400137, + "epoch": 0.2545017285435142, + "flos": 24601610142720.0, + "grad_norm": 3.3498167264767993, + "language_loss": 0.9206326, + "learning_rate": 3.4941771945795637e-06, + "loss": 0.93966913, + "num_input_tokens_seen": 91376515, + "router_z_loss_clip": 3.12695312, + "router_z_loss_mlp": 0.35693359, + "step": 4233, + "time_per_iteration": 2.6950345039367676 + }, + { + "auxiliary_loss_clip": 0.0153012, + "auxiliary_loss_mlp": 0.00313456, + "balance_loss_clip": 1.22450542, + "balance_loss_mlp": 0.28081632, + "epoch": 0.2545618517961822, + "flos": 24679069822080.0, + "grad_norm": 7.002956704915364, + "language_loss": 0.78518867, + "learning_rate": 3.493918281539737e-06, + "loss": 0.80362439, + "num_input_tokens_seen": 91397595, + "router_z_loss_clip": 3.0546875, + "router_z_loss_mlp": 0.32641602, + "step": 4234, + "time_per_iteration": 2.7019336223602295 + }, + { + "auxiliary_loss_clip": 0.01541375, + "auxiliary_loss_mlp": 0.00355886, + "balance_loss_clip": 1.22939312, + "balance_loss_mlp": 0.32174408, + "epoch": 0.25462197504885015, + "flos": 23915106432000.0, + "grad_norm": 10.402531013342383, + "language_loss": 0.8060208, + "learning_rate": 3.493659311850379e-06, + "loss": 0.82499343, + "num_input_tokens_seen": 91417775, + "router_z_loss_clip": 3.1171875, + "router_z_loss_mlp": 0.34179688, + "step": 4235, + "time_per_iteration": 2.7035655975341797 + }, + { + "auxiliary_loss_clip": 0.01569076, + "auxiliary_loss_mlp": 0.00388091, + "balance_loss_clip": 1.2464385, + "balance_loss_mlp": 0.34891897, + "epoch": 0.2546820983015181, + "flos": 24789387467520.0, + "grad_norm": 3.7740590285915814, + "language_loss": 0.75472283, + "learning_rate": 3.4934002855213106e-06, + "loss": 0.7742945, + "num_input_tokens_seen": 91437665, + "router_z_loss_clip": 3.22460938, + "router_z_loss_mlp": 0.39160156, + "step": 4236, + "time_per_iteration": 2.866189956665039 + }, + { + "auxiliary_loss_clip": 0.01566775, + "auxiliary_loss_mlp": 0.00353041, + "balance_loss_clip": 1.24700987, + "balance_loss_mlp": 0.31861368, + "epoch": 0.2547422215541861, + "flos": 18734130570240.0, + "grad_norm": 25.45997715702305, + "language_loss": 0.72652656, + "learning_rate": 3.493141202562354e-06, + "loss": 0.74572468, + "num_input_tokens_seen": 91456705, + "router_z_loss_clip": 3.19921875, + "router_z_loss_mlp": 0.34448242, + "step": 4237, + "time_per_iteration": 4.105910062789917 + }, + { + "auxiliary_loss_clip": 0.01557582, + "auxiliary_loss_mlp": 0.00351021, + "balance_loss_clip": 1.23008311, + "balance_loss_mlp": 0.3124209, + "epoch": 0.25480234480685404, + "flos": 21032449274880.0, + "grad_norm": 5.970043901183035, + "language_loss": 0.80425054, + "learning_rate": 3.492882062983333e-06, + "loss": 0.8233366, + "num_input_tokens_seen": 91475535, + "router_z_loss_clip": 3.27148438, + "router_z_loss_mlp": 0.38623047, + "step": 4238, + "time_per_iteration": 2.638857126235962 + }, + { + "auxiliary_loss_clip": 0.01557818, + "auxiliary_loss_mlp": 0.00347302, + "balance_loss_clip": 1.2334609, + "balance_loss_mlp": 0.30934554, + "epoch": 0.254862468059522, + "flos": 25082167224960.0, + "grad_norm": 31.52544064120883, + "language_loss": 0.8644948, + "learning_rate": 3.492622866794074e-06, + "loss": 0.88354599, + "num_input_tokens_seen": 91499140, + "router_z_loss_clip": 3.24414062, + "router_z_loss_mlp": 0.37963867, + "step": 4239, + "time_per_iteration": 2.6800105571746826 + }, + { + "auxiliary_loss_clip": 0.01561366, + "auxiliary_loss_mlp": 0.00368979, + "balance_loss_clip": 1.24142289, + "balance_loss_mlp": 0.32913893, + "epoch": 0.25492259131219, + "flos": 20558392554240.0, + "grad_norm": 68.89486892408794, + "language_loss": 0.82912207, + "learning_rate": 3.492363614004407e-06, + "loss": 0.84842545, + "num_input_tokens_seen": 91518335, + "router_z_loss_clip": 3.19921875, + "router_z_loss_mlp": 0.39819336, + "step": 4240, + "time_per_iteration": 2.6444733142852783 + }, + { + "auxiliary_loss_clip": 0.01574133, + "auxiliary_loss_mlp": 0.00396973, + "balance_loss_clip": 1.24367714, + "balance_loss_mlp": 0.3557744, + "epoch": 0.25498271456485794, + "flos": 25042485674880.0, + "grad_norm": 3.4659288683785596, + "language_loss": 0.90255392, + "learning_rate": 3.492104304624162e-06, + "loss": 0.92226499, + "num_input_tokens_seen": 91537655, + "router_z_loss_clip": 3.30078125, + "router_z_loss_mlp": 0.41210938, + "step": 4241, + "time_per_iteration": 4.037973403930664 + }, + { + "auxiliary_loss_clip": 0.01578156, + "auxiliary_loss_mlp": 0.00363639, + "balance_loss_clip": 1.25023651, + "balance_loss_mlp": 0.32825744, + "epoch": 0.2550428378175259, + "flos": 26178412354560.0, + "grad_norm": 25.210081573691333, + "language_loss": 0.79022247, + "learning_rate": 3.4918449386631725e-06, + "loss": 0.80964041, + "num_input_tokens_seen": 91557545, + "router_z_loss_clip": 3.28320312, + "router_z_loss_mlp": 0.35375977, + "step": 4242, + "time_per_iteration": 2.801414728164673 + }, + { + "auxiliary_loss_clip": 0.0157073, + "auxiliary_loss_mlp": 0.00341454, + "balance_loss_clip": 1.24361038, + "balance_loss_mlp": 0.30683526, + "epoch": 0.2551029610701939, + "flos": 15267170874240.0, + "grad_norm": 13.06068062012221, + "language_loss": 0.8026762, + "learning_rate": 3.491585516131273e-06, + "loss": 0.82179809, + "num_input_tokens_seen": 91574405, + "router_z_loss_clip": 3.26953125, + "router_z_loss_mlp": 0.34643555, + "step": 4243, + "time_per_iteration": 2.6196041107177734 + }, + { + "auxiliary_loss_clip": 0.01570652, + "auxiliary_loss_mlp": 0.00374247, + "balance_loss_clip": 1.24193931, + "balance_loss_mlp": 0.33488452, + "epoch": 0.2551630843228619, + "flos": 18112193556480.0, + "grad_norm": 104.11673399723135, + "language_loss": 0.87608498, + "learning_rate": 3.491326037038301e-06, + "loss": 0.89553404, + "num_input_tokens_seen": 91593755, + "router_z_loss_clip": 3.28710938, + "router_z_loss_mlp": 0.39379883, + "step": 4244, + "time_per_iteration": 4.106152057647705 + }, + { + "auxiliary_loss_clip": 0.0127739, + "auxiliary_loss_mlp": 0.00184849, + "balance_loss_clip": 1.10293484, + "balance_loss_mlp": 0.17645696, + "epoch": 0.25522320757552985, + "flos": 70520192167680.0, + "grad_norm": 0.6923643241181449, + "language_loss": 0.57200611, + "learning_rate": 3.4910665013940967e-06, + "loss": 0.5866285, + "num_input_tokens_seen": 91660335, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.08398438, + "step": 4245, + "time_per_iteration": 3.2411646842956543 + }, + { + "auxiliary_loss_clip": 0.01584353, + "auxiliary_loss_mlp": 0.00405443, + "balance_loss_clip": 1.24506044, + "balance_loss_mlp": 0.36054856, + "epoch": 0.2552833308281978, + "flos": 22893088757760.0, + "grad_norm": 121.86447046187399, + "language_loss": 0.73609692, + "learning_rate": 3.4908069092085015e-06, + "loss": 0.75599492, + "num_input_tokens_seen": 91678500, + "router_z_loss_clip": 3.39453125, + "router_z_loss_mlp": 0.44897461, + "step": 4246, + "time_per_iteration": 2.6821351051330566 + }, + { + "auxiliary_loss_clip": 0.01572623, + "auxiliary_loss_mlp": 0.00402169, + "balance_loss_clip": 1.24350858, + "balance_loss_mlp": 0.36316413, + "epoch": 0.2553434540808658, + "flos": 22053605022720.0, + "grad_norm": 3.0970510035740286, + "language_loss": 0.85547251, + "learning_rate": 3.4905472604913585e-06, + "loss": 0.87522042, + "num_input_tokens_seen": 91696430, + "router_z_loss_clip": 3.29101562, + "router_z_loss_mlp": 0.38964844, + "step": 4247, + "time_per_iteration": 2.657200336456299 + }, + { + "auxiliary_loss_clip": 0.01620953, + "auxiliary_loss_mlp": 0.00454791, + "balance_loss_clip": 1.26221287, + "balance_loss_mlp": 0.40968269, + "epoch": 0.25540357733353375, + "flos": 16544190176640.0, + "grad_norm": 12.93149386933323, + "language_loss": 0.90837109, + "learning_rate": 3.490287555252514e-06, + "loss": 0.92912853, + "num_input_tokens_seen": 91713270, + "router_z_loss_clip": 3.58398438, + "router_z_loss_mlp": 0.45117188, + "step": 4248, + "time_per_iteration": 2.7281994819641113 + }, + { + "auxiliary_loss_clip": 0.01601338, + "auxiliary_loss_mlp": 0.0040274, + "balance_loss_clip": 1.25351977, + "balance_loss_mlp": 0.36182764, + "epoch": 0.2554637005862017, + "flos": 17565022702080.0, + "grad_norm": 58.06943548327985, + "language_loss": 0.90978038, + "learning_rate": 3.4900277935018166e-06, + "loss": 0.92982119, + "num_input_tokens_seen": 91728865, + "router_z_loss_clip": 3.4765625, + "router_z_loss_mlp": 0.40917969, + "step": 4249, + "time_per_iteration": 2.6962203979492188 + }, + { + "auxiliary_loss_clip": 0.01283519, + "auxiliary_loss_mlp": 0.00151265, + "balance_loss_clip": 1.10314035, + "balance_loss_mlp": 0.1431583, + "epoch": 0.2555238238388697, + "flos": 72244763953920.0, + "grad_norm": 0.7234700626928396, + "language_loss": 0.55921382, + "learning_rate": 3.489767975249115e-06, + "loss": 0.57356167, + "num_input_tokens_seen": 91787470, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.08105469, + "step": 4250, + "time_per_iteration": 3.152906894683838 + }, + { + "auxiliary_loss_clip": 0.01600296, + "auxiliary_loss_mlp": 0.00401073, + "balance_loss_clip": 1.24474072, + "balance_loss_mlp": 0.36094749, + "epoch": 0.25558394709153764, + "flos": 24389414547840.0, + "grad_norm": 40.99894601352705, + "language_loss": 0.87354898, + "learning_rate": 3.4895081005042632e-06, + "loss": 0.89356267, + "num_input_tokens_seen": 91805640, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 0.40112305, + "step": 4251, + "time_per_iteration": 2.67527437210083 + }, + { + "auxiliary_loss_clip": 0.01284076, + "auxiliary_loss_mlp": 0.00138508, + "balance_loss_clip": 1.10090721, + "balance_loss_mlp": 0.12987719, + "epoch": 0.2556440703442056, + "flos": 69231213636480.0, + "grad_norm": 0.8541731565296798, + "language_loss": 0.66198182, + "learning_rate": 3.4892481692771146e-06, + "loss": 0.67620766, + "num_input_tokens_seen": 91869695, + "router_z_loss_clip": 1.8359375, + "router_z_loss_mlp": 0.08642578, + "step": 4252, + "time_per_iteration": 4.534997224807739 + }, + { + "auxiliary_loss_clip": 0.01595341, + "auxiliary_loss_mlp": 0.00377438, + "balance_loss_clip": 1.25456691, + "balance_loss_mlp": 0.34174639, + "epoch": 0.2557041935968736, + "flos": 24863902231680.0, + "grad_norm": 10.682323469473737, + "language_loss": 0.80098146, + "learning_rate": 3.4889881815775267e-06, + "loss": 0.82070929, + "num_input_tokens_seen": 91889920, + "router_z_loss_clip": 3.40820312, + "router_z_loss_mlp": 0.35668945, + "step": 4253, + "time_per_iteration": 2.663130521774292 + }, + { + "auxiliary_loss_clip": 0.0161196, + "auxiliary_loss_mlp": 0.00369733, + "balance_loss_clip": 1.26501918, + "balance_loss_mlp": 0.33220643, + "epoch": 0.25576431684954154, + "flos": 22492110257280.0, + "grad_norm": 14.924121978140311, + "language_loss": 0.78818291, + "learning_rate": 3.488728137415357e-06, + "loss": 0.80799985, + "num_input_tokens_seen": 91908665, + "router_z_loss_clip": 3.47070312, + "router_z_loss_mlp": 0.37548828, + "step": 4254, + "time_per_iteration": 2.632819175720215 + }, + { + "auxiliary_loss_clip": 0.01609731, + "auxiliary_loss_mlp": 0.00412729, + "balance_loss_clip": 1.25645936, + "balance_loss_mlp": 0.37195963, + "epoch": 0.2558244401022095, + "flos": 19826748426240.0, + "grad_norm": 3.0176308131749168, + "language_loss": 0.86101925, + "learning_rate": 3.4884680368004675e-06, + "loss": 0.88124382, + "num_input_tokens_seen": 91927855, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 0.4074707, + "step": 4255, + "time_per_iteration": 2.6358561515808105 + }, + { + "auxiliary_loss_clip": 0.01590787, + "auxiliary_loss_mlp": 0.00375015, + "balance_loss_clip": 1.24432445, + "balance_loss_mlp": 0.33681998, + "epoch": 0.2558845633548775, + "flos": 23220486247680.0, + "grad_norm": 13.130333463635614, + "language_loss": 0.89639199, + "learning_rate": 3.488207879742721e-06, + "loss": 0.91604996, + "num_input_tokens_seen": 91948500, + "router_z_loss_clip": 3.46289062, + "router_z_loss_mlp": 0.38183594, + "step": 4256, + "time_per_iteration": 2.6508142948150635 + }, + { + "auxiliary_loss_clip": 0.01630649, + "auxiliary_loss_mlp": 0.00411186, + "balance_loss_clip": 1.27241397, + "balance_loss_mlp": 0.37160835, + "epoch": 0.2559446866075455, + "flos": 16837867774080.0, + "grad_norm": 2.0854665398401577, + "language_loss": 0.82245088, + "learning_rate": 3.4879476662519826e-06, + "loss": 0.84286922, + "num_input_tokens_seen": 91968375, + "router_z_loss_clip": 3.58398438, + "router_z_loss_mlp": 0.39599609, + "step": 4257, + "time_per_iteration": 2.65822696685791 + }, + { + "auxiliary_loss_clip": 0.01314279, + "auxiliary_loss_mlp": 0.00136997, + "balance_loss_clip": 1.12814426, + "balance_loss_mlp": 0.130178, + "epoch": 0.25600480986021346, + "flos": 57593786895360.0, + "grad_norm": 0.7917209406458895, + "language_loss": 0.64963633, + "learning_rate": 3.4876873963381196e-06, + "loss": 0.66414911, + "num_input_tokens_seen": 92028490, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.06835938, + "step": 4258, + "time_per_iteration": 3.1186623573303223 + }, + { + "auxiliary_loss_clip": 0.01617232, + "auxiliary_loss_mlp": 0.00394954, + "balance_loss_clip": 1.26560569, + "balance_loss_mlp": 0.35282522, + "epoch": 0.2560649331128814, + "flos": 27819529868160.0, + "grad_norm": 36.88756584068166, + "language_loss": 0.82149684, + "learning_rate": 3.4874270700110013e-06, + "loss": 0.84161872, + "num_input_tokens_seen": 92048060, + "router_z_loss_clip": 3.515625, + "router_z_loss_mlp": 0.42138672, + "step": 4259, + "time_per_iteration": 2.7915873527526855 + }, + { + "auxiliary_loss_clip": 0.01299911, + "auxiliary_loss_mlp": 0.00128024, + "balance_loss_clip": 1.11840034, + "balance_loss_mlp": 0.12101467, + "epoch": 0.2561250563655494, + "flos": 70950509101440.0, + "grad_norm": 0.7887197038478593, + "language_loss": 0.58340627, + "learning_rate": 3.4871666872804994e-06, + "loss": 0.59768564, + "num_input_tokens_seen": 92118180, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 0.0703125, + "step": 4260, + "time_per_iteration": 3.2170069217681885 + }, + { + "auxiliary_loss_clip": 0.0160055, + "auxiliary_loss_mlp": 0.00371382, + "balance_loss_clip": 1.25319278, + "balance_loss_mlp": 0.33344954, + "epoch": 0.25618517961821735, + "flos": 27012329481600.0, + "grad_norm": 655.9381536059058, + "language_loss": 0.82652152, + "learning_rate": 3.4869062481564875e-06, + "loss": 0.84624088, + "num_input_tokens_seen": 92137570, + "router_z_loss_clip": 3.47265625, + "router_z_loss_mlp": 0.37939453, + "step": 4261, + "time_per_iteration": 2.7703909873962402 + }, + { + "auxiliary_loss_clip": 0.01591277, + "auxiliary_loss_mlp": 0.00342695, + "balance_loss_clip": 1.2477951, + "balance_loss_mlp": 0.30731374, + "epoch": 0.2562453028708853, + "flos": 23068296322560.0, + "grad_norm": 9.727791640840573, + "language_loss": 0.86148041, + "learning_rate": 3.486645752648842e-06, + "loss": 0.88082016, + "num_input_tokens_seen": 92157625, + "router_z_loss_clip": 3.43554688, + "router_z_loss_mlp": 0.35400391, + "step": 4262, + "time_per_iteration": 2.7573652267456055 + }, + { + "auxiliary_loss_clip": 0.01581384, + "auxiliary_loss_mlp": 0.00406145, + "balance_loss_clip": 1.24133468, + "balance_loss_mlp": 0.36609069, + "epoch": 0.2563054261235533, + "flos": 15120942606720.0, + "grad_norm": 3.9580665729128177, + "language_loss": 0.82364696, + "learning_rate": 3.4863852007674405e-06, + "loss": 0.84352225, + "num_input_tokens_seen": 92175350, + "router_z_loss_clip": 3.40039062, + "router_z_loss_mlp": 0.40039062, + "step": 4263, + "time_per_iteration": 2.6295158863067627 + }, + { + "auxiliary_loss_clip": 0.0160586, + "auxiliary_loss_mlp": 0.0033018, + "balance_loss_clip": 1.26005793, + "balance_loss_mlp": 0.2941789, + "epoch": 0.25636554937622125, + "flos": 27854865872640.0, + "grad_norm": 24.479787640707276, + "language_loss": 0.87377059, + "learning_rate": 3.486124592522163e-06, + "loss": 0.89313102, + "num_input_tokens_seen": 92196070, + "router_z_loss_clip": 3.45703125, + "router_z_loss_mlp": 0.36035156, + "step": 4264, + "time_per_iteration": 2.7303380966186523 + }, + { + "auxiliary_loss_clip": 0.01601949, + "auxiliary_loss_mlp": 0.00351841, + "balance_loss_clip": 1.25344443, + "balance_loss_mlp": 0.31331247, + "epoch": 0.2564256726288892, + "flos": 28906509288960.0, + "grad_norm": 3.652980672540202, + "language_loss": 0.80776316, + "learning_rate": 3.4858639279228924e-06, + "loss": 0.82730103, + "num_input_tokens_seen": 92216310, + "router_z_loss_clip": 3.484375, + "router_z_loss_mlp": 0.38525391, + "step": 4265, + "time_per_iteration": 2.7210803031921387 + }, + { + "auxiliary_loss_clip": 0.01596456, + "auxiliary_loss_mlp": 0.00341342, + "balance_loss_clip": 1.24985456, + "balance_loss_mlp": 0.30221707, + "epoch": 0.2564857958815572, + "flos": 18514931823360.0, + "grad_norm": 3.7265312932983075, + "language_loss": 0.88204211, + "learning_rate": 3.485603206979513e-06, + "loss": 0.90142012, + "num_input_tokens_seen": 92234510, + "router_z_loss_clip": 3.46679688, + "router_z_loss_mlp": 0.39111328, + "step": 4266, + "time_per_iteration": 2.6343631744384766 + }, + { + "auxiliary_loss_clip": 0.01574934, + "auxiliary_loss_mlp": 0.00348282, + "balance_loss_clip": 1.2363081, + "balance_loss_mlp": 0.31096992, + "epoch": 0.25654591913422514, + "flos": 25808280658560.0, + "grad_norm": 140.73430313468518, + "language_loss": 0.83885169, + "learning_rate": 3.4853424297019103e-06, + "loss": 0.85808384, + "num_input_tokens_seen": 92254070, + "router_z_loss_clip": 3.38476562, + "router_z_loss_mlp": 0.37329102, + "step": 4267, + "time_per_iteration": 2.696073055267334 + }, + { + "auxiliary_loss_clip": 0.01593715, + "auxiliary_loss_mlp": 0.00326718, + "balance_loss_clip": 1.26043797, + "balance_loss_mlp": 0.28985846, + "epoch": 0.2566060423868931, + "flos": 19099665325440.0, + "grad_norm": 18.95924583779467, + "language_loss": 0.8391819, + "learning_rate": 3.4850815960999736e-06, + "loss": 0.85838628, + "num_input_tokens_seen": 92275060, + "router_z_loss_clip": 3.33007812, + "router_z_loss_mlp": 0.36865234, + "step": 4268, + "time_per_iteration": 2.686746597290039 + }, + { + "auxiliary_loss_clip": 0.01561113, + "auxiliary_loss_mlp": 0.0038754, + "balance_loss_clip": 1.22384131, + "balance_loss_mlp": 0.34624612, + "epoch": 0.25666616563956113, + "flos": 23842674656640.0, + "grad_norm": 7.499204100822425, + "language_loss": 0.73866612, + "learning_rate": 3.484820706183595e-06, + "loss": 0.75815266, + "num_input_tokens_seen": 92293610, + "router_z_loss_clip": 3.37109375, + "router_z_loss_mlp": 0.4128418, + "step": 4269, + "time_per_iteration": 2.7734789848327637 + }, + { + "auxiliary_loss_clip": 0.01572622, + "auxiliary_loss_mlp": 0.0037158, + "balance_loss_clip": 1.23302507, + "balance_loss_mlp": 0.33345681, + "epoch": 0.2567262888922291, + "flos": 14604259420800.0, + "grad_norm": 18.997769613455297, + "language_loss": 0.89150369, + "learning_rate": 3.484559759962666e-06, + "loss": 0.91094571, + "num_input_tokens_seen": 92308305, + "router_z_loss_clip": 3.39453125, + "router_z_loss_mlp": 0.38134766, + "step": 4270, + "time_per_iteration": 2.7033731937408447 + }, + { + "auxiliary_loss_clip": 0.01604338, + "auxiliary_loss_mlp": 0.00382277, + "balance_loss_clip": 1.25166416, + "balance_loss_mlp": 0.33959994, + "epoch": 0.25678641214489706, + "flos": 32923117877760.0, + "grad_norm": 10.76505531285129, + "language_loss": 0.76590908, + "learning_rate": 3.4842987574470816e-06, + "loss": 0.78577518, + "num_input_tokens_seen": 92329875, + "router_z_loss_clip": 3.52539062, + "router_z_loss_mlp": 0.42675781, + "step": 4271, + "time_per_iteration": 2.7528584003448486 + }, + { + "auxiliary_loss_clip": 0.01603635, + "auxiliary_loss_mlp": 0.0041348, + "balance_loss_clip": 1.24735165, + "balance_loss_mlp": 0.37075499, + "epoch": 0.256846535397565, + "flos": 24098933260800.0, + "grad_norm": 54.62397528591798, + "language_loss": 0.90503711, + "learning_rate": 3.4840376986467403e-06, + "loss": 0.92520821, + "num_input_tokens_seen": 92348780, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 0.42700195, + "step": 4272, + "time_per_iteration": 2.693373441696167 + }, + { + "auxiliary_loss_clip": 0.01601696, + "auxiliary_loss_mlp": 0.00384962, + "balance_loss_clip": 1.25500405, + "balance_loss_mlp": 0.34316683, + "epoch": 0.256906658650233, + "flos": 19718441942400.0, + "grad_norm": 119.8238468301448, + "language_loss": 0.89091432, + "learning_rate": 3.483776583571541e-06, + "loss": 0.91078091, + "num_input_tokens_seen": 92368175, + "router_z_loss_clip": 3.46484375, + "router_z_loss_mlp": 0.41796875, + "step": 4273, + "time_per_iteration": 2.635831117630005 + }, + { + "auxiliary_loss_clip": 0.01569391, + "auxiliary_loss_mlp": 0.00348969, + "balance_loss_clip": 1.23402607, + "balance_loss_mlp": 0.3118712, + "epoch": 0.25696678190290095, + "flos": 22926018551040.0, + "grad_norm": 3.846553842427074, + "language_loss": 0.82876581, + "learning_rate": 3.4835154122313846e-06, + "loss": 0.84794939, + "num_input_tokens_seen": 92387755, + "router_z_loss_clip": 3.35351562, + "router_z_loss_mlp": 0.37109375, + "step": 4274, + "time_per_iteration": 2.711742401123047 + }, + { + "auxiliary_loss_clip": 0.01556831, + "auxiliary_loss_mlp": 0.00350051, + "balance_loss_clip": 1.22377217, + "balance_loss_mlp": 0.31455049, + "epoch": 0.2570269051555689, + "flos": 27307838672640.0, + "grad_norm": 184.48437825694162, + "language_loss": 0.90028185, + "learning_rate": 3.4832541846361743e-06, + "loss": 0.91935074, + "num_input_tokens_seen": 92409850, + "router_z_loss_clip": 3.33203125, + "router_z_loss_mlp": 0.35498047, + "step": 4275, + "time_per_iteration": 2.764719247817993 + }, + { + "auxiliary_loss_clip": 0.01597012, + "auxiliary_loss_mlp": 0.00404477, + "balance_loss_clip": 1.2432121, + "balance_loss_mlp": 0.35912991, + "epoch": 0.2570870284082369, + "flos": 27563414918400.0, + "grad_norm": 12.402653841018454, + "language_loss": 0.83852744, + "learning_rate": 3.4829929007958175e-06, + "loss": 0.85854232, + "num_input_tokens_seen": 92431250, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 0.45336914, + "step": 4276, + "time_per_iteration": 2.738550901412964 + }, + { + "auxiliary_loss_clip": 0.01579367, + "auxiliary_loss_mlp": 0.0040313, + "balance_loss_clip": 1.23732352, + "balance_loss_mlp": 0.36162144, + "epoch": 0.25714715166090485, + "flos": 28730834847360.0, + "grad_norm": 5.314268200947001, + "language_loss": 0.85130978, + "learning_rate": 3.4827315607202214e-06, + "loss": 0.8711347, + "num_input_tokens_seen": 92452065, + "router_z_loss_clip": 3.41796875, + "router_z_loss_mlp": 0.41503906, + "step": 4277, + "time_per_iteration": 2.7512271404266357 + }, + { + "auxiliary_loss_clip": 0.01596766, + "auxiliary_loss_mlp": 0.00381432, + "balance_loss_clip": 1.25220597, + "balance_loss_mlp": 0.33980396, + "epoch": 0.2572072749135728, + "flos": 20116152305280.0, + "grad_norm": 10.27914051228256, + "language_loss": 0.84481692, + "learning_rate": 3.482470164419295e-06, + "loss": 0.86459887, + "num_input_tokens_seen": 92470025, + "router_z_loss_clip": 3.44335938, + "router_z_loss_mlp": 0.41650391, + "step": 4278, + "time_per_iteration": 2.641101598739624 + }, + { + "auxiliary_loss_clip": 0.01611673, + "auxiliary_loss_mlp": 0.0039117, + "balance_loss_clip": 1.25830817, + "balance_loss_mlp": 0.3509016, + "epoch": 0.2572673981662408, + "flos": 26030855283840.0, + "grad_norm": 5.104598185502512, + "language_loss": 0.81155163, + "learning_rate": 3.482208711902952e-06, + "loss": 0.83158004, + "num_input_tokens_seen": 92489825, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 0.40283203, + "step": 4279, + "time_per_iteration": 2.6795058250427246 + }, + { + "auxiliary_loss_clip": 0.0161259, + "auxiliary_loss_mlp": 0.00418857, + "balance_loss_clip": 1.26013613, + "balance_loss_mlp": 0.37196046, + "epoch": 0.25732752141890874, + "flos": 16106618695680.0, + "grad_norm": 2.897643250747758, + "language_loss": 0.91896927, + "learning_rate": 3.4819472031811065e-06, + "loss": 0.93928373, + "num_input_tokens_seen": 92507270, + "router_z_loss_clip": 3.52539062, + "router_z_loss_mlp": 0.46899414, + "step": 4280, + "time_per_iteration": 4.12591814994812 + }, + { + "auxiliary_loss_clip": 0.0160431, + "auxiliary_loss_mlp": 0.00408189, + "balance_loss_clip": 1.25457716, + "balance_loss_mlp": 0.36610764, + "epoch": 0.2573876446715767, + "flos": 22524429519360.0, + "grad_norm": 125.36500864329933, + "language_loss": 0.85371846, + "learning_rate": 3.4816856382636744e-06, + "loss": 0.87384343, + "num_input_tokens_seen": 92526300, + "router_z_loss_clip": 3.49414062, + "router_z_loss_mlp": 0.42089844, + "step": 4281, + "time_per_iteration": 2.8399553298950195 + }, + { + "auxiliary_loss_clip": 0.01604141, + "auxiliary_loss_mlp": 0.00411512, + "balance_loss_clip": 1.26104164, + "balance_loss_mlp": 0.36654642, + "epoch": 0.2574477679242447, + "flos": 23950837486080.0, + "grad_norm": 61.39868000406039, + "language_loss": 0.91258442, + "learning_rate": 3.4814240171605737e-06, + "loss": 0.93274093, + "num_input_tokens_seen": 92546465, + "router_z_loss_clip": 3.42578125, + "router_z_loss_mlp": 0.44970703, + "step": 4282, + "time_per_iteration": 2.6627583503723145 + }, + { + "auxiliary_loss_clip": 0.01604249, + "auxiliary_loss_mlp": 0.00420262, + "balance_loss_clip": 1.2580719, + "balance_loss_mlp": 0.37987396, + "epoch": 0.2575078911769127, + "flos": 21981711951360.0, + "grad_norm": 4.7154341544829, + "language_loss": 0.77212214, + "learning_rate": 3.4811623398817267e-06, + "loss": 0.79236728, + "num_input_tokens_seen": 92567260, + "router_z_loss_clip": 3.45898438, + "router_z_loss_mlp": 0.40380859, + "step": 4283, + "time_per_iteration": 4.092426538467407 + }, + { + "auxiliary_loss_clip": 0.01609844, + "auxiliary_loss_mlp": 0.00384799, + "balance_loss_clip": 1.27080762, + "balance_loss_mlp": 0.34646088, + "epoch": 0.25756801442958066, + "flos": 21945406279680.0, + "grad_norm": 14.062485300645683, + "language_loss": 0.85539663, + "learning_rate": 3.4809006064370553e-06, + "loss": 0.87534308, + "num_input_tokens_seen": 92585425, + "router_z_loss_clip": 3.38671875, + "router_z_loss_mlp": 0.38330078, + "step": 4284, + "time_per_iteration": 2.658836603164673 + }, + { + "auxiliary_loss_clip": 0.01613571, + "auxiliary_loss_mlp": 0.00379593, + "balance_loss_clip": 1.26241946, + "balance_loss_mlp": 0.33827484, + "epoch": 0.2576281376822486, + "flos": 35261980058880.0, + "grad_norm": 150.05128645686236, + "language_loss": 0.77642357, + "learning_rate": 3.4806388168364835e-06, + "loss": 0.79635519, + "num_input_tokens_seen": 92604770, + "router_z_loss_clip": 3.5078125, + "router_z_loss_mlp": 0.41308594, + "step": 4285, + "time_per_iteration": 2.7907333374023438 + }, + { + "auxiliary_loss_clip": 0.01622956, + "auxiliary_loss_mlp": 0.00398892, + "balance_loss_clip": 1.2759769, + "balance_loss_mlp": 0.35941017, + "epoch": 0.2576882609349166, + "flos": 14132285688960.0, + "grad_norm": 4.138459867012965, + "language_loss": 0.65171874, + "learning_rate": 3.4803769710899402e-06, + "loss": 0.67193723, + "num_input_tokens_seen": 92622635, + "router_z_loss_clip": 3.47265625, + "router_z_loss_mlp": 0.39453125, + "step": 4286, + "time_per_iteration": 2.663689613342285 + }, + { + "auxiliary_loss_clip": 0.01625226, + "auxiliary_loss_mlp": 0.00409983, + "balance_loss_clip": 1.2739948, + "balance_loss_mlp": 0.36725861, + "epoch": 0.25774838418758456, + "flos": 23258336204160.0, + "grad_norm": 1.8153738326139868, + "language_loss": 0.70541805, + "learning_rate": 3.480115069207354e-06, + "loss": 0.72577018, + "num_input_tokens_seen": 92642960, + "router_z_loss_clip": 3.50976562, + "router_z_loss_mlp": 0.42724609, + "step": 4287, + "time_per_iteration": 4.211211681365967 + }, + { + "auxiliary_loss_clip": 0.01617148, + "auxiliary_loss_mlp": 0.00408724, + "balance_loss_clip": 1.26280737, + "balance_loss_mlp": 0.36444989, + "epoch": 0.2578085074402525, + "flos": 22601745544320.0, + "grad_norm": 42.03122823544241, + "language_loss": 0.77232003, + "learning_rate": 3.4798531111986557e-06, + "loss": 0.79257882, + "num_input_tokens_seen": 92662455, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 0.44287109, + "step": 4288, + "time_per_iteration": 2.686741352081299 + }, + { + "auxiliary_loss_clip": 0.01619896, + "auxiliary_loss_mlp": 0.00404427, + "balance_loss_clip": 1.26980758, + "balance_loss_mlp": 0.36325249, + "epoch": 0.2578686306929205, + "flos": 24571840746240.0, + "grad_norm": 23.642214276729618, + "language_loss": 0.82053757, + "learning_rate": 3.4795910970737786e-06, + "loss": 0.84078074, + "num_input_tokens_seen": 92683520, + "router_z_loss_clip": 3.50195312, + "router_z_loss_mlp": 0.41186523, + "step": 4289, + "time_per_iteration": 2.7459123134613037 + }, + { + "auxiliary_loss_clip": 0.01636609, + "auxiliary_loss_mlp": 0.00432989, + "balance_loss_clip": 1.27812791, + "balance_loss_mlp": 0.38993108, + "epoch": 0.25792875394558845, + "flos": 18113953322880.0, + "grad_norm": 1224.7041528629234, + "language_loss": 0.91947466, + "learning_rate": 3.4793290268426592e-06, + "loss": 0.94017065, + "num_input_tokens_seen": 92701450, + "router_z_loss_clip": 3.5859375, + "router_z_loss_mlp": 0.43066406, + "step": 4290, + "time_per_iteration": 2.6794793605804443 + }, + { + "auxiliary_loss_clip": 0.01640249, + "auxiliary_loss_mlp": 0.00435662, + "balance_loss_clip": 1.28351521, + "balance_loss_mlp": 0.38971841, + "epoch": 0.2579888771982564, + "flos": 17712902995200.0, + "grad_norm": 11.973228876341132, + "language_loss": 0.79227549, + "learning_rate": 3.4790669005152354e-06, + "loss": 0.81303465, + "num_input_tokens_seen": 92720355, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 0.45947266, + "step": 4291, + "time_per_iteration": 2.6781442165374756 + }, + { + "auxiliary_loss_clip": 0.01630446, + "auxiliary_loss_mlp": 0.0041568, + "balance_loss_clip": 1.26933157, + "balance_loss_mlp": 0.37224039, + "epoch": 0.2580490004509244, + "flos": 16434878112000.0, + "grad_norm": 140.8880845760531, + "language_loss": 0.86449242, + "learning_rate": 3.4788047181014458e-06, + "loss": 0.88495368, + "num_input_tokens_seen": 92736755, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 0.43408203, + "step": 4292, + "time_per_iteration": 2.5928194522857666 + }, + { + "auxiliary_loss_clip": 0.01625425, + "auxiliary_loss_mlp": 0.00427696, + "balance_loss_clip": 1.2706238, + "balance_loss_mlp": 0.384161, + "epoch": 0.25810912370359235, + "flos": 33835141128960.0, + "grad_norm": 182.34580909256167, + "language_loss": 0.73902482, + "learning_rate": 3.4785424796112337e-06, + "loss": 0.75955606, + "num_input_tokens_seen": 92757655, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 0.43505859, + "step": 4293, + "time_per_iteration": 2.7577500343322754 + }, + { + "auxiliary_loss_clip": 0.01613756, + "auxiliary_loss_mlp": 0.00378855, + "balance_loss_clip": 1.26349521, + "balance_loss_mlp": 0.33954015, + "epoch": 0.2581692469562603, + "flos": 25192197561600.0, + "grad_norm": 16.64140203140871, + "language_loss": 0.81131637, + "learning_rate": 3.478280185054542e-06, + "loss": 0.83124256, + "num_input_tokens_seen": 92776100, + "router_z_loss_clip": 3.50195312, + "router_z_loss_mlp": 0.39331055, + "step": 4294, + "time_per_iteration": 4.058233737945557 + }, + { + "auxiliary_loss_clip": 0.01637403, + "auxiliary_loss_mlp": 0.00390434, + "balance_loss_clip": 1.28039443, + "balance_loss_mlp": 0.34966433, + "epoch": 0.2582293702089283, + "flos": 34932212271360.0, + "grad_norm": 2.9349595077038333, + "language_loss": 0.8725425, + "learning_rate": 3.478017834441318e-06, + "loss": 0.89282089, + "num_input_tokens_seen": 92798880, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 0.40771484, + "step": 4295, + "time_per_iteration": 2.753767728805542 + }, + { + "auxiliary_loss_clip": 0.01647666, + "auxiliary_loss_mlp": 0.00450913, + "balance_loss_clip": 1.27451801, + "balance_loss_mlp": 0.40103593, + "epoch": 0.2582894934615963, + "flos": 26833746038400.0, + "grad_norm": 102.74124785454518, + "language_loss": 0.78106725, + "learning_rate": 3.4777554277815096e-06, + "loss": 0.80205303, + "num_input_tokens_seen": 92817750, + "router_z_loss_clip": 3.73632812, + "router_z_loss_mlp": 0.4987793, + "step": 4296, + "time_per_iteration": 2.6904144287109375 + }, + { + "auxiliary_loss_clip": 0.01628487, + "auxiliary_loss_mlp": 0.00438152, + "balance_loss_clip": 1.26937652, + "balance_loss_mlp": 0.39089775, + "epoch": 0.25834961671426426, + "flos": 23515241253120.0, + "grad_norm": 3.4303382615519435, + "language_loss": 0.91841668, + "learning_rate": 3.477492965085067e-06, + "loss": 0.9390831, + "num_input_tokens_seen": 92837995, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 0.47290039, + "step": 4297, + "time_per_iteration": 2.654670476913452 + }, + { + "auxiliary_loss_clip": 0.01617403, + "auxiliary_loss_mlp": 0.00429378, + "balance_loss_clip": 1.25857663, + "balance_loss_mlp": 0.3871063, + "epoch": 0.25840973996693223, + "flos": 22451028076800.0, + "grad_norm": 3.0165916962340984, + "language_loss": 0.8894074, + "learning_rate": 3.477230446361943e-06, + "loss": 0.90987515, + "num_input_tokens_seen": 92857245, + "router_z_loss_clip": 3.59179688, + "router_z_loss_mlp": 0.42285156, + "step": 4298, + "time_per_iteration": 2.635442018508911 + }, + { + "auxiliary_loss_clip": 0.01604089, + "auxiliary_loss_mlp": 0.00404133, + "balance_loss_clip": 1.25603473, + "balance_loss_mlp": 0.36214763, + "epoch": 0.2584698632196002, + "flos": 11290854366720.0, + "grad_norm": 4.6054608122235425, + "language_loss": 0.9096446, + "learning_rate": 3.4769678716220927e-06, + "loss": 0.92972684, + "num_input_tokens_seen": 92873265, + "router_z_loss_clip": 3.484375, + "router_z_loss_mlp": 0.41992188, + "step": 4299, + "time_per_iteration": 2.598755359649658 + }, + { + "auxiliary_loss_clip": 0.01600372, + "auxiliary_loss_mlp": 0.00392036, + "balance_loss_clip": 1.25373793, + "balance_loss_mlp": 0.35064644, + "epoch": 0.25852998647226816, + "flos": 17929982839680.0, + "grad_norm": 3.3078295769091697, + "language_loss": 0.89459771, + "learning_rate": 3.4767052408754726e-06, + "loss": 0.91452175, + "num_input_tokens_seen": 92890880, + "router_z_loss_clip": 3.46679688, + "router_z_loss_mlp": 0.41381836, + "step": 4300, + "time_per_iteration": 2.622708320617676 + }, + { + "auxiliary_loss_clip": 0.01603688, + "auxiliary_loss_mlp": 0.00438602, + "balance_loss_clip": 1.25486732, + "balance_loss_mlp": 0.39540058, + "epoch": 0.2585901097249361, + "flos": 33256117889280.0, + "grad_norm": 18.96064488499278, + "language_loss": 0.74269748, + "learning_rate": 3.4764425541320417e-06, + "loss": 0.76312035, + "num_input_tokens_seen": 92910770, + "router_z_loss_clip": 3.48632812, + "router_z_loss_mlp": 0.43237305, + "step": 4301, + "time_per_iteration": 2.7964694499969482 + }, + { + "auxiliary_loss_clip": 0.01596575, + "auxiliary_loss_mlp": 0.00436663, + "balance_loss_clip": 1.2460959, + "balance_loss_mlp": 0.39191133, + "epoch": 0.2586502329776041, + "flos": 18441278985600.0, + "grad_norm": 42.41336524681273, + "language_loss": 0.90777385, + "learning_rate": 3.4761798114017617e-06, + "loss": 0.92810619, + "num_input_tokens_seen": 92929520, + "router_z_loss_clip": 3.50390625, + "router_z_loss_mlp": 0.44750977, + "step": 4302, + "time_per_iteration": 2.5983996391296387 + }, + { + "auxiliary_loss_clip": 0.01592039, + "auxiliary_loss_mlp": 0.00431757, + "balance_loss_clip": 1.24890995, + "balance_loss_mlp": 0.39031965, + "epoch": 0.25871035623027205, + "flos": 17968120104960.0, + "grad_norm": 16.105681164555772, + "language_loss": 0.96994853, + "learning_rate": 3.475917012694595e-06, + "loss": 0.99018651, + "num_input_tokens_seen": 92947890, + "router_z_loss_clip": 3.43164062, + "router_z_loss_mlp": 0.41430664, + "step": 4303, + "time_per_iteration": 2.6844773292541504 + }, + { + "auxiliary_loss_clip": 0.01586734, + "auxiliary_loss_mlp": 0.00387979, + "balance_loss_clip": 1.24113822, + "balance_loss_mlp": 0.34198844, + "epoch": 0.25877047948294, + "flos": 27777729415680.0, + "grad_norm": 194.4802105937529, + "language_loss": 0.73129588, + "learning_rate": 3.475654158020507e-06, + "loss": 0.75104302, + "num_input_tokens_seen": 92967690, + "router_z_loss_clip": 3.45703125, + "router_z_loss_mlp": 0.4597168, + "step": 4304, + "time_per_iteration": 2.71366286277771 + }, + { + "auxiliary_loss_clip": 0.0161088, + "auxiliary_loss_mlp": 0.0042917, + "balance_loss_clip": 1.25213313, + "balance_loss_mlp": 0.38425225, + "epoch": 0.258830602735608, + "flos": 27125843437440.0, + "grad_norm": 55.338316121111596, + "language_loss": 0.79223841, + "learning_rate": 3.4753912473894657e-06, + "loss": 0.81263888, + "num_input_tokens_seen": 92986830, + "router_z_loss_clip": 3.5859375, + "router_z_loss_mlp": 0.44921875, + "step": 4305, + "time_per_iteration": 2.708440065383911 + }, + { + "auxiliary_loss_clip": 0.01592599, + "auxiliary_loss_mlp": 0.00450774, + "balance_loss_clip": 1.24355602, + "balance_loss_mlp": 0.40270904, + "epoch": 0.25889072598827595, + "flos": 17891486438400.0, + "grad_norm": 8.268687918225556, + "language_loss": 0.83387464, + "learning_rate": 3.4751282808114403e-06, + "loss": 0.85430837, + "num_input_tokens_seen": 93002740, + "router_z_loss_clip": 3.48828125, + "router_z_loss_mlp": 0.48071289, + "step": 4306, + "time_per_iteration": 2.658363103866577 + }, + { + "auxiliary_loss_clip": 0.01279781, + "auxiliary_loss_mlp": 0.00081462, + "balance_loss_clip": 1.11517692, + "balance_loss_mlp": 0.0735467, + "epoch": 0.2589508492409439, + "flos": 53934955724160.0, + "grad_norm": 0.7954183830100308, + "language_loss": 0.56544894, + "learning_rate": 3.474865258296403e-06, + "loss": 0.57906139, + "num_input_tokens_seen": 93058645, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.07910156, + "step": 4307, + "time_per_iteration": 3.1115903854370117 + }, + { + "auxiliary_loss_clip": 0.01576248, + "auxiliary_loss_mlp": 0.00362544, + "balance_loss_clip": 1.2506566, + "balance_loss_mlp": 0.32358682, + "epoch": 0.2590109724936119, + "flos": 22125785402880.0, + "grad_norm": 5.070811505358024, + "language_loss": 0.76528203, + "learning_rate": 3.474602179854327e-06, + "loss": 0.78466994, + "num_input_tokens_seen": 93077140, + "router_z_loss_clip": 3.25976562, + "router_z_loss_mlp": 0.38964844, + "step": 4308, + "time_per_iteration": 2.66080904006958 + }, + { + "auxiliary_loss_clip": 0.01594706, + "auxiliary_loss_mlp": 0.00388852, + "balance_loss_clip": 1.25092006, + "balance_loss_mlp": 0.34607965, + "epoch": 0.2590710957462799, + "flos": 13474294398720.0, + "grad_norm": 10.983139668061368, + "language_loss": 0.90052921, + "learning_rate": 3.4743390454951886e-06, + "loss": 0.9203648, + "num_input_tokens_seen": 93093580, + "router_z_loss_clip": 3.43945312, + "router_z_loss_mlp": 0.42773438, + "step": 4309, + "time_per_iteration": 2.6558055877685547 + }, + { + "auxiliary_loss_clip": 0.01584926, + "auxiliary_loss_mlp": 0.00369823, + "balance_loss_clip": 1.25293779, + "balance_loss_mlp": 0.33100802, + "epoch": 0.25913121899894787, + "flos": 22307098279680.0, + "grad_norm": 23.14402985940009, + "language_loss": 0.88763458, + "learning_rate": 3.474075855228966e-06, + "loss": 0.90718204, + "num_input_tokens_seen": 93112345, + "router_z_loss_clip": 3.3203125, + "router_z_loss_mlp": 0.38818359, + "step": 4310, + "time_per_iteration": 2.6646313667297363 + }, + { + "auxiliary_loss_clip": 0.01596395, + "auxiliary_loss_mlp": 0.00432543, + "balance_loss_clip": 1.26152253, + "balance_loss_mlp": 0.38984227, + "epoch": 0.25919134225161583, + "flos": 25811728364160.0, + "grad_norm": 9.310200179315895, + "language_loss": 0.84004331, + "learning_rate": 3.473812609065639e-06, + "loss": 0.86033267, + "num_input_tokens_seen": 93131545, + "router_z_loss_clip": 3.3515625, + "router_z_loss_mlp": 0.42675781, + "step": 4311, + "time_per_iteration": 2.6830973625183105 + }, + { + "auxiliary_loss_clip": 0.01619912, + "auxiliary_loss_mlp": 0.00412675, + "balance_loss_clip": 1.27518725, + "balance_loss_mlp": 0.36832952, + "epoch": 0.2592514655042838, + "flos": 31212262108800.0, + "grad_norm": 2.833986316170463, + "language_loss": 0.77660483, + "learning_rate": 3.4735493070151904e-06, + "loss": 0.79693067, + "num_input_tokens_seen": 93150730, + "router_z_loss_clip": 3.44921875, + "router_z_loss_mlp": 0.44335938, + "step": 4312, + "time_per_iteration": 2.7557132244110107 + }, + { + "auxiliary_loss_clip": 0.01604011, + "auxiliary_loss_mlp": 0.00407215, + "balance_loss_clip": 1.26379502, + "balance_loss_mlp": 0.36348861, + "epoch": 0.25931158875695176, + "flos": 18474998878080.0, + "grad_norm": 98.00910946601336, + "language_loss": 0.76556087, + "learning_rate": 3.4732859490876044e-06, + "loss": 0.78567314, + "num_input_tokens_seen": 93167895, + "router_z_loss_clip": 3.40625, + "router_z_loss_mlp": 0.43652344, + "step": 4313, + "time_per_iteration": 2.681765079498291 + }, + { + "auxiliary_loss_clip": 0.01592302, + "auxiliary_loss_mlp": 0.00393752, + "balance_loss_clip": 1.2581594, + "balance_loss_mlp": 0.35136092, + "epoch": 0.2593717120096197, + "flos": 19207935895680.0, + "grad_norm": 5.939091800278994, + "language_loss": 0.85633421, + "learning_rate": 3.473022535292867e-06, + "loss": 0.87619478, + "num_input_tokens_seen": 93187650, + "router_z_loss_clip": 3.34375, + "router_z_loss_mlp": 0.42407227, + "step": 4314, + "time_per_iteration": 2.6464781761169434 + }, + { + "auxiliary_loss_clip": 0.01608404, + "auxiliary_loss_mlp": 0.00426106, + "balance_loss_clip": 1.26118195, + "balance_loss_mlp": 0.38259488, + "epoch": 0.2594318352622877, + "flos": 31248100903680.0, + "grad_norm": 4.229394872690892, + "language_loss": 0.72468436, + "learning_rate": 3.472759065640968e-06, + "loss": 0.74502945, + "num_input_tokens_seen": 93207370, + "router_z_loss_clip": 3.47460938, + "router_z_loss_mlp": 0.43530273, + "step": 4315, + "time_per_iteration": 2.726323366165161 + }, + { + "auxiliary_loss_clip": 0.01601574, + "auxiliary_loss_mlp": 0.00384952, + "balance_loss_clip": 1.26553917, + "balance_loss_mlp": 0.34437346, + "epoch": 0.25949195851495566, + "flos": 22237144542720.0, + "grad_norm": 1.7731085239065583, + "language_loss": 0.84760374, + "learning_rate": 3.4724955401418976e-06, + "loss": 0.86746895, + "num_input_tokens_seen": 93227925, + "router_z_loss_clip": 3.36328125, + "router_z_loss_mlp": 0.40576172, + "step": 4316, + "time_per_iteration": 2.620793342590332 + }, + { + "auxiliary_loss_clip": 0.01623841, + "auxiliary_loss_mlp": 0.00446893, + "balance_loss_clip": 1.27619672, + "balance_loss_mlp": 0.39954293, + "epoch": 0.2595520817676236, + "flos": 28075716645120.0, + "grad_norm": 6.548440562813901, + "language_loss": 0.81215823, + "learning_rate": 3.4722319588056487e-06, + "loss": 0.8328656, + "num_input_tokens_seen": 93250020, + "router_z_loss_clip": 3.47851562, + "router_z_loss_mlp": 0.47363281, + "step": 4317, + "time_per_iteration": 2.7009530067443848 + }, + { + "auxiliary_loss_clip": 0.0159178, + "auxiliary_loss_mlp": 0.00449044, + "balance_loss_clip": 1.25915325, + "balance_loss_mlp": 0.40333965, + "epoch": 0.2596122050202916, + "flos": 20190954378240.0, + "grad_norm": 93.44735429904904, + "language_loss": 0.82745624, + "learning_rate": 3.4719683216422163e-06, + "loss": 0.84786445, + "num_input_tokens_seen": 93269070, + "router_z_loss_clip": 3.3203125, + "router_z_loss_mlp": 0.45703125, + "step": 4318, + "time_per_iteration": 2.6476423740386963 + }, + { + "auxiliary_loss_clip": 0.01585567, + "auxiliary_loss_mlp": 0.00406372, + "balance_loss_clip": 1.25922859, + "balance_loss_mlp": 0.36467263, + "epoch": 0.25967232827295955, + "flos": 22527949052160.0, + "grad_norm": 3.4522883704824414, + "language_loss": 0.80410099, + "learning_rate": 3.471704628661598e-06, + "loss": 0.82402033, + "num_input_tokens_seen": 93290250, + "router_z_loss_clip": 3.26367188, + "router_z_loss_mlp": 0.41699219, + "step": 4319, + "time_per_iteration": 2.661287546157837 + }, + { + "auxiliary_loss_clip": 0.01576422, + "auxiliary_loss_mlp": 0.00433006, + "balance_loss_clip": 1.2508738, + "balance_loss_mlp": 0.3883259, + "epoch": 0.2597324515256275, + "flos": 21068252156160.0, + "grad_norm": 10.712097905220364, + "language_loss": 0.82053566, + "learning_rate": 3.4714408798737925e-06, + "loss": 0.84062999, + "num_input_tokens_seen": 93310090, + "router_z_loss_clip": 3.25976562, + "router_z_loss_mlp": 0.4465332, + "step": 4320, + "time_per_iteration": 2.663148880004883 + }, + { + "auxiliary_loss_clip": 0.01569728, + "auxiliary_loss_mlp": 0.00406678, + "balance_loss_clip": 1.24328756, + "balance_loss_mlp": 0.36106858, + "epoch": 0.2597925747782955, + "flos": 22050013662720.0, + "grad_norm": 5.945908854569021, + "language_loss": 0.77153206, + "learning_rate": 3.471177075288801e-06, + "loss": 0.79129612, + "num_input_tokens_seen": 93329570, + "router_z_loss_clip": 3.265625, + "router_z_loss_mlp": 0.45556641, + "step": 4321, + "time_per_iteration": 2.663271427154541 + }, + { + "auxiliary_loss_clip": 0.01589012, + "auxiliary_loss_mlp": 0.00431169, + "balance_loss_clip": 1.2539854, + "balance_loss_mlp": 0.38572633, + "epoch": 0.2598526980309635, + "flos": 19536949497600.0, + "grad_norm": 11.358765896560024, + "language_loss": 0.80499792, + "learning_rate": 3.4709132149166277e-06, + "loss": 0.82519972, + "num_input_tokens_seen": 93347920, + "router_z_loss_clip": 3.3515625, + "router_z_loss_mlp": 0.4543457, + "step": 4322, + "time_per_iteration": 4.150545120239258 + }, + { + "auxiliary_loss_clip": 0.0157569, + "auxiliary_loss_mlp": 0.00445891, + "balance_loss_clip": 1.24828589, + "balance_loss_mlp": 0.39837414, + "epoch": 0.25991282128363147, + "flos": 24495207079680.0, + "grad_norm": 18.618925739278914, + "language_loss": 0.79090673, + "learning_rate": 3.470649298767278e-06, + "loss": 0.81112254, + "num_input_tokens_seen": 93367145, + "router_z_loss_clip": 3.2734375, + "router_z_loss_mlp": 0.47509766, + "step": 4323, + "time_per_iteration": 2.6640539169311523 + }, + { + "auxiliary_loss_clip": 0.01593962, + "auxiliary_loss_mlp": 0.00455599, + "balance_loss_clip": 1.25926042, + "balance_loss_mlp": 0.40886915, + "epoch": 0.25997294453629943, + "flos": 24201457655040.0, + "grad_norm": 4.112078007119001, + "language_loss": 0.72384733, + "learning_rate": 3.4703853268507597e-06, + "loss": 0.74434298, + "num_input_tokens_seen": 93386555, + "router_z_loss_clip": 3.34765625, + "router_z_loss_mlp": 0.46679688, + "step": 4324, + "time_per_iteration": 2.7136664390563965 + }, + { + "auxiliary_loss_clip": 0.01581835, + "auxiliary_loss_mlp": 0.00411698, + "balance_loss_clip": 1.25407541, + "balance_loss_mlp": 0.3700223, + "epoch": 0.2600330677889674, + "flos": 31431460855680.0, + "grad_norm": 22.01950689656422, + "language_loss": 0.75527906, + "learning_rate": 3.470121299177082e-06, + "loss": 0.77521437, + "num_input_tokens_seen": 93405590, + "router_z_loss_clip": 3.27929688, + "router_z_loss_mlp": 0.41625977, + "step": 4325, + "time_per_iteration": 4.164029836654663 + }, + { + "auxiliary_loss_clip": 0.01574096, + "auxiliary_loss_mlp": 0.00431408, + "balance_loss_clip": 1.25117552, + "balance_loss_mlp": 0.38587043, + "epoch": 0.26009319104163536, + "flos": 32266527217920.0, + "grad_norm": 13.647746112935126, + "language_loss": 0.78311121, + "learning_rate": 3.469857215756257e-06, + "loss": 0.80316627, + "num_input_tokens_seen": 93424750, + "router_z_loss_clip": 3.22851562, + "router_z_loss_mlp": 0.45532227, + "step": 4326, + "time_per_iteration": 2.7986278533935547 + }, + { + "auxiliary_loss_clip": 0.01577629, + "auxiliary_loss_mlp": 0.00416787, + "balance_loss_clip": 1.25449705, + "balance_loss_mlp": 0.37277511, + "epoch": 0.26015331429430333, + "flos": 26286754752000.0, + "grad_norm": 5.431525880549466, + "language_loss": 0.91785192, + "learning_rate": 3.4695930765982997e-06, + "loss": 0.93779612, + "num_input_tokens_seen": 93443465, + "router_z_loss_clip": 3.23242188, + "router_z_loss_mlp": 0.44042969, + "step": 4327, + "time_per_iteration": 2.726407051086426 + }, + { + "auxiliary_loss_clip": 0.01597404, + "auxiliary_loss_mlp": 0.00473068, + "balance_loss_clip": 1.26838315, + "balance_loss_mlp": 0.42424011, + "epoch": 0.2602134375469713, + "flos": 21142335957120.0, + "grad_norm": 6.840347219264271, + "language_loss": 0.85066736, + "learning_rate": 3.4693288817132255e-06, + "loss": 0.8713721, + "num_input_tokens_seen": 93462580, + "router_z_loss_clip": 3.2890625, + "router_z_loss_mlp": 0.48803711, + "step": 4328, + "time_per_iteration": 2.655184745788574 + }, + { + "auxiliary_loss_clip": 0.01576222, + "auxiliary_loss_mlp": 0.00424598, + "balance_loss_clip": 1.25083423, + "balance_loss_mlp": 0.38034743, + "epoch": 0.26027356079963926, + "flos": 25921327737600.0, + "grad_norm": 22.781754071934714, + "language_loss": 0.92214918, + "learning_rate": 3.4690646311110525e-06, + "loss": 0.94215745, + "num_input_tokens_seen": 93482790, + "router_z_loss_clip": 3.25585938, + "router_z_loss_mlp": 0.44213867, + "step": 4329, + "time_per_iteration": 4.126258373260498 + }, + { + "auxiliary_loss_clip": 0.01569635, + "auxiliary_loss_mlp": 0.00403015, + "balance_loss_clip": 1.25131428, + "balance_loss_mlp": 0.3617689, + "epoch": 0.2603336840523072, + "flos": 26359222440960.0, + "grad_norm": 11.805415607317057, + "language_loss": 0.82169896, + "learning_rate": 3.468800324801802e-06, + "loss": 0.84142548, + "num_input_tokens_seen": 93498795, + "router_z_loss_clip": 3.18359375, + "router_z_loss_mlp": 0.41235352, + "step": 4330, + "time_per_iteration": 2.694737672805786 + }, + { + "auxiliary_loss_clip": 0.01586112, + "auxiliary_loss_mlp": 0.00451207, + "balance_loss_clip": 1.25692773, + "balance_loss_mlp": 0.40562153, + "epoch": 0.2603938073049752, + "flos": 23513661054720.0, + "grad_norm": 51.03110825523836, + "language_loss": 0.80559319, + "learning_rate": 3.4685359627954958e-06, + "loss": 0.82596642, + "num_input_tokens_seen": 93518335, + "router_z_loss_clip": 3.29101562, + "router_z_loss_mlp": 0.45629883, + "step": 4331, + "time_per_iteration": 2.6537725925445557 + }, + { + "auxiliary_loss_clip": 0.01609368, + "auxiliary_loss_mlp": 0.00410992, + "balance_loss_clip": 1.28021526, + "balance_loss_mlp": 0.36805314, + "epoch": 0.26045393055764315, + "flos": 25374300537600.0, + "grad_norm": 23.555371069240355, + "language_loss": 0.73883736, + "learning_rate": 3.4682715451021584e-06, + "loss": 0.75904101, + "num_input_tokens_seen": 93539170, + "router_z_loss_clip": 3.29296875, + "router_z_loss_mlp": 0.4296875, + "step": 4332, + "time_per_iteration": 2.697411298751831 + }, + { + "auxiliary_loss_clip": 0.01586065, + "auxiliary_loss_mlp": 0.00425533, + "balance_loss_clip": 1.25465679, + "balance_loss_mlp": 0.37780195, + "epoch": 0.2605140538103111, + "flos": 27635272076160.0, + "grad_norm": 6.087085659935331, + "language_loss": 0.83345699, + "learning_rate": 3.4680070717318174e-06, + "loss": 0.85357296, + "num_input_tokens_seen": 93558480, + "router_z_loss_clip": 3.31445312, + "router_z_loss_mlp": 0.47705078, + "step": 4333, + "time_per_iteration": 2.7642416954040527 + }, + { + "auxiliary_loss_clip": 0.01593222, + "auxiliary_loss_mlp": 0.00419103, + "balance_loss_clip": 1.26704526, + "balance_loss_mlp": 0.37840497, + "epoch": 0.2605741770629791, + "flos": 13769839503360.0, + "grad_norm": 10.871069290868418, + "language_loss": 0.84198534, + "learning_rate": 3.467742542694501e-06, + "loss": 0.86210859, + "num_input_tokens_seen": 93575220, + "router_z_loss_clip": 3.26171875, + "router_z_loss_mlp": 0.40698242, + "step": 4334, + "time_per_iteration": 2.7419238090515137 + }, + { + "auxiliary_loss_clip": 0.01590673, + "auxiliary_loss_mlp": 0.00419634, + "balance_loss_clip": 1.26041031, + "balance_loss_mlp": 0.37509763, + "epoch": 0.26063430031564705, + "flos": 26031681296640.0, + "grad_norm": 2.3907817648259426, + "language_loss": 0.83854085, + "learning_rate": 3.46747795800024e-06, + "loss": 0.85864395, + "num_input_tokens_seen": 93597015, + "router_z_loss_clip": 3.30273438, + "router_z_loss_mlp": 0.44555664, + "step": 4335, + "time_per_iteration": 2.6968600749969482 + }, + { + "auxiliary_loss_clip": 0.01434337, + "auxiliary_loss_mlp": 0.00096329, + "balance_loss_clip": 1.20947778, + "balance_loss_mlp": 0.08655354, + "epoch": 0.26069442356831507, + "flos": 62443809820800.0, + "grad_norm": 0.8402256951789285, + "language_loss": 0.61005062, + "learning_rate": 3.467213317659068e-06, + "loss": 0.62535727, + "num_input_tokens_seen": 93657775, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.09765625, + "step": 4336, + "time_per_iteration": 3.0961384773254395 + }, + { + "auxiliary_loss_clip": 0.0161221, + "auxiliary_loss_mlp": 0.00420248, + "balance_loss_clip": 1.27358568, + "balance_loss_mlp": 0.37771386, + "epoch": 0.26075454682098304, + "flos": 13626376583040.0, + "grad_norm": 2.799172392868859, + "language_loss": 0.84572655, + "learning_rate": 3.46694862168102e-06, + "loss": 0.86605108, + "num_input_tokens_seen": 93676145, + "router_z_loss_clip": 3.38867188, + "router_z_loss_mlp": 0.42504883, + "step": 4337, + "time_per_iteration": 4.061982154846191 + }, + { + "auxiliary_loss_clip": 0.01608895, + "auxiliary_loss_mlp": 0.00409573, + "balance_loss_clip": 1.27347529, + "balance_loss_mlp": 0.36684799, + "epoch": 0.260814670073651, + "flos": 12126531260160.0, + "grad_norm": 628.2514120574647, + "language_loss": 0.82055104, + "learning_rate": 3.4666838700761334e-06, + "loss": 0.84073573, + "num_input_tokens_seen": 93692480, + "router_z_loss_clip": 3.35351562, + "router_z_loss_mlp": 0.42675781, + "step": 4338, + "time_per_iteration": 2.613847017288208 + }, + { + "auxiliary_loss_clip": 0.0160815, + "auxiliary_loss_mlp": 0.00455261, + "balance_loss_clip": 1.27008176, + "balance_loss_mlp": 0.40972298, + "epoch": 0.26087479332631897, + "flos": 15122522805120.0, + "grad_norm": 22.27445067746055, + "language_loss": 0.86933112, + "learning_rate": 3.466419062854447e-06, + "loss": 0.88996518, + "num_input_tokens_seen": 93710165, + "router_z_loss_clip": 3.37695312, + "router_z_loss_mlp": 0.45532227, + "step": 4339, + "time_per_iteration": 2.6236824989318848 + }, + { + "auxiliary_loss_clip": 0.0160059, + "auxiliary_loss_mlp": 0.00399235, + "balance_loss_clip": 1.27053785, + "balance_loss_mlp": 0.3610163, + "epoch": 0.26093491657898693, + "flos": 24680937329280.0, + "grad_norm": 17.83804118828831, + "language_loss": 0.80930722, + "learning_rate": 3.4661542000260033e-06, + "loss": 0.82930547, + "num_input_tokens_seen": 93730185, + "router_z_loss_clip": 3.30273438, + "router_z_loss_mlp": 0.38208008, + "step": 4340, + "time_per_iteration": 2.656724691390991 + }, + { + "auxiliary_loss_clip": 0.01605632, + "auxiliary_loss_mlp": 0.00415437, + "balance_loss_clip": 1.26772189, + "balance_loss_mlp": 0.37435716, + "epoch": 0.2609950398316549, + "flos": 25116138512640.0, + "grad_norm": 2.2506691994915733, + "language_loss": 0.87632167, + "learning_rate": 3.465889281600845e-06, + "loss": 0.8965323, + "num_input_tokens_seen": 93747690, + "router_z_loss_clip": 3.37890625, + "router_z_loss_mlp": 0.41137695, + "step": 4341, + "time_per_iteration": 2.660353422164917 + }, + { + "auxiliary_loss_clip": 0.01589287, + "auxiliary_loss_mlp": 0.00439066, + "balance_loss_clip": 1.25824142, + "balance_loss_mlp": 0.39529243, + "epoch": 0.26105516308432286, + "flos": 28548588216960.0, + "grad_norm": 8.598968899034103, + "language_loss": 0.82364178, + "learning_rate": 3.4656243075890183e-06, + "loss": 0.84392524, + "num_input_tokens_seen": 93767405, + "router_z_loss_clip": 3.3125, + "router_z_loss_mlp": 0.43774414, + "step": 4342, + "time_per_iteration": 2.7670674324035645 + }, + { + "auxiliary_loss_clip": 0.01594681, + "auxiliary_loss_mlp": 0.00433157, + "balance_loss_clip": 1.26438308, + "balance_loss_mlp": 0.38661808, + "epoch": 0.2611152863369908, + "flos": 39530609447040.0, + "grad_norm": 73.19985743565881, + "language_loss": 0.70387244, + "learning_rate": 3.4653592780005707e-06, + "loss": 0.72415078, + "num_input_tokens_seen": 93789950, + "router_z_loss_clip": 3.30273438, + "router_z_loss_mlp": 0.46533203, + "step": 4343, + "time_per_iteration": 2.7839484214782715 + }, + { + "auxiliary_loss_clip": 0.01596289, + "auxiliary_loss_mlp": 0.00413977, + "balance_loss_clip": 1.26153946, + "balance_loss_mlp": 0.37425691, + "epoch": 0.2611754095896588, + "flos": 13735329511680.0, + "grad_norm": 2701.0898486327687, + "language_loss": 0.79664528, + "learning_rate": 3.465094192845553e-06, + "loss": 0.8167479, + "num_input_tokens_seen": 93807835, + "router_z_loss_clip": 3.34765625, + "router_z_loss_mlp": 0.39697266, + "step": 4344, + "time_per_iteration": 2.609745979309082 + }, + { + "auxiliary_loss_clip": 0.01602399, + "auxiliary_loss_mlp": 0.00444106, + "balance_loss_clip": 1.26933086, + "balance_loss_mlp": 0.3980912, + "epoch": 0.26123553284232676, + "flos": 21506649649920.0, + "grad_norm": 3.0385612342096993, + "language_loss": 0.92440724, + "learning_rate": 3.4648290521340165e-06, + "loss": 0.94487232, + "num_input_tokens_seen": 93825670, + "router_z_loss_clip": 3.33007812, + "router_z_loss_mlp": 0.46020508, + "step": 4345, + "time_per_iteration": 2.679110288619995 + }, + { + "auxiliary_loss_clip": 0.01593604, + "auxiliary_loss_mlp": 0.00414481, + "balance_loss_clip": 1.2656002, + "balance_loss_mlp": 0.37290066, + "epoch": 0.2612956560949947, + "flos": 21139786091520.0, + "grad_norm": 22.146764244372324, + "language_loss": 0.82104564, + "learning_rate": 3.464563855876015e-06, + "loss": 0.84112644, + "num_input_tokens_seen": 93844045, + "router_z_loss_clip": 3.27539062, + "router_z_loss_mlp": 0.41552734, + "step": 4346, + "time_per_iteration": 2.6394100189208984 + }, + { + "auxiliary_loss_clip": 0.01605986, + "auxiliary_loss_mlp": 0.00403328, + "balance_loss_clip": 1.267712, + "balance_loss_mlp": 0.36220074, + "epoch": 0.2613557793476627, + "flos": 25119011600640.0, + "grad_norm": 7.83741586146149, + "language_loss": 0.80571508, + "learning_rate": 3.464298604081606e-06, + "loss": 0.82580829, + "num_input_tokens_seen": 93864380, + "router_z_loss_clip": 3.38476562, + "router_z_loss_mlp": 0.41137695, + "step": 4347, + "time_per_iteration": 2.6952731609344482 + }, + { + "auxiliary_loss_clip": 0.0158921, + "auxiliary_loss_mlp": 0.00406489, + "balance_loss_clip": 1.25962901, + "balance_loss_mlp": 0.3681038, + "epoch": 0.26141590260033065, + "flos": 26067699659520.0, + "grad_norm": 186.0934521535218, + "language_loss": 0.79181105, + "learning_rate": 3.4640332967608476e-06, + "loss": 0.81176805, + "num_input_tokens_seen": 93885475, + "router_z_loss_clip": 3.29296875, + "router_z_loss_mlp": 0.3840332, + "step": 4348, + "time_per_iteration": 2.7072737216949463 + }, + { + "auxiliary_loss_clip": 0.01605126, + "auxiliary_loss_mlp": 0.00426008, + "balance_loss_clip": 1.26639664, + "balance_loss_mlp": 0.38428435, + "epoch": 0.2614760258529987, + "flos": 25701518459520.0, + "grad_norm": 21.29350366992205, + "language_loss": 0.96784019, + "learning_rate": 3.463767933923799e-06, + "loss": 0.98815155, + "num_input_tokens_seen": 93905545, + "router_z_loss_clip": 3.38867188, + "router_z_loss_mlp": 0.41748047, + "step": 4349, + "time_per_iteration": 2.684600830078125 + }, + { + "auxiliary_loss_clip": 0.01593976, + "auxiliary_loss_mlp": 0.00396517, + "balance_loss_clip": 1.25685024, + "balance_loss_mlp": 0.35867995, + "epoch": 0.26153614910566664, + "flos": 17457147181440.0, + "grad_norm": 3.210882687861878, + "language_loss": 0.84925383, + "learning_rate": 3.463502515580524e-06, + "loss": 0.86915874, + "num_input_tokens_seen": 93924185, + "router_z_loss_clip": 3.37304688, + "router_z_loss_mlp": 0.37841797, + "step": 4350, + "time_per_iteration": 2.67560076713562 + }, + { + "auxiliary_loss_clip": 0.01597224, + "auxiliary_loss_mlp": 0.00413685, + "balance_loss_clip": 1.25719774, + "balance_loss_mlp": 0.37601489, + "epoch": 0.2615962723583346, + "flos": 17712831168000.0, + "grad_norm": 10.69151414604349, + "language_loss": 0.6798712, + "learning_rate": 3.4632370417410866e-06, + "loss": 0.69998032, + "num_input_tokens_seen": 93942825, + "router_z_loss_clip": 3.40234375, + "router_z_loss_mlp": 0.37695312, + "step": 4351, + "time_per_iteration": 2.6551854610443115 + }, + { + "auxiliary_loss_clip": 0.01615624, + "auxiliary_loss_mlp": 0.00438704, + "balance_loss_clip": 1.26244843, + "balance_loss_mlp": 0.39574108, + "epoch": 0.26165639561100257, + "flos": 23257725672960.0, + "grad_norm": 5.7130367530858495, + "language_loss": 0.89691973, + "learning_rate": 3.462971512415555e-06, + "loss": 0.91746294, + "num_input_tokens_seen": 93962045, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 0.4296875, + "step": 4352, + "time_per_iteration": 2.642085075378418 + }, + { + "auxiliary_loss_clip": 0.01452806, + "auxiliary_loss_mlp": 0.00142172, + "balance_loss_clip": 1.19030929, + "balance_loss_mlp": 0.13058463, + "epoch": 0.26171651886367053, + "flos": 66737970800640.0, + "grad_norm": 0.7903744642486397, + "language_loss": 0.70106006, + "learning_rate": 3.462705927613996e-06, + "loss": 0.71700978, + "num_input_tokens_seen": 94021175, + "router_z_loss_clip": 2.625, + "router_z_loss_mlp": 0.11572266, + "step": 4353, + "time_per_iteration": 3.059678792953491 + }, + { + "auxiliary_loss_clip": 0.0161771, + "auxiliary_loss_mlp": 0.00377098, + "balance_loss_clip": 1.26782155, + "balance_loss_mlp": 0.33766392, + "epoch": 0.2617766421163385, + "flos": 22349581090560.0, + "grad_norm": 3.567774847989798, + "language_loss": 0.83331227, + "learning_rate": 3.4624402873464816e-06, + "loss": 0.8532604, + "num_input_tokens_seen": 94043370, + "router_z_loss_clip": 3.50195312, + "router_z_loss_mlp": 0.39404297, + "step": 4354, + "time_per_iteration": 2.673826217651367 + }, + { + "auxiliary_loss_clip": 0.01611028, + "auxiliary_loss_mlp": 0.00401768, + "balance_loss_clip": 1.25300574, + "balance_loss_mlp": 0.36302531, + "epoch": 0.26183676536900646, + "flos": 26067125041920.0, + "grad_norm": 39.416878313967764, + "language_loss": 0.75324029, + "learning_rate": 3.462174591623085e-06, + "loss": 0.77336824, + "num_input_tokens_seen": 94063510, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 0.38745117, + "step": 4355, + "time_per_iteration": 2.7014710903167725 + }, + { + "auxiliary_loss_clip": 0.01597082, + "auxiliary_loss_mlp": 0.0037993, + "balance_loss_clip": 1.24688053, + "balance_loss_mlp": 0.33670494, + "epoch": 0.26189688862167443, + "flos": 20996466825600.0, + "grad_norm": 221.80664695221512, + "language_loss": 0.73810726, + "learning_rate": 3.4619088404538815e-06, + "loss": 0.75787735, + "num_input_tokens_seen": 94083865, + "router_z_loss_clip": 3.5, + "router_z_loss_mlp": 0.43212891, + "step": 4356, + "time_per_iteration": 2.7149598598480225 + }, + { + "auxiliary_loss_clip": 0.01425486, + "auxiliary_loss_mlp": 0.00099175, + "balance_loss_clip": 1.18707776, + "balance_loss_mlp": 0.08715858, + "epoch": 0.2619570118743424, + "flos": 65798261141760.0, + "grad_norm": 0.7693750162069145, + "language_loss": 0.53090072, + "learning_rate": 3.4616430338489487e-06, + "loss": 0.54614735, + "num_input_tokens_seen": 94144095, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.12011719, + "step": 4357, + "time_per_iteration": 3.0530319213867188 + }, + { + "auxiliary_loss_clip": 0.01605364, + "auxiliary_loss_mlp": 0.00393669, + "balance_loss_clip": 1.25234079, + "balance_loss_mlp": 0.35430631, + "epoch": 0.26201713512701036, + "flos": 28766817296640.0, + "grad_norm": 17.609466924667792, + "language_loss": 0.90785289, + "learning_rate": 3.4613771718183654e-06, + "loss": 0.92784321, + "num_input_tokens_seen": 94163035, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 0.39404297, + "step": 4358, + "time_per_iteration": 2.688920736312866 + }, + { + "auxiliary_loss_clip": 0.01595565, + "auxiliary_loss_mlp": 0.00453183, + "balance_loss_clip": 1.23778796, + "balance_loss_mlp": 0.40578526, + "epoch": 0.2620772583796783, + "flos": 26432516142720.0, + "grad_norm": 5.937081696078898, + "language_loss": 0.73830462, + "learning_rate": 3.4611112543722127e-06, + "loss": 0.7587921, + "num_input_tokens_seen": 94182520, + "router_z_loss_clip": 3.57617188, + "router_z_loss_mlp": 0.47363281, + "step": 4359, + "time_per_iteration": 2.664734125137329 + }, + { + "auxiliary_loss_clip": 0.0160437, + "auxiliary_loss_mlp": 0.004025, + "balance_loss_clip": 1.25245309, + "balance_loss_mlp": 0.36275613, + "epoch": 0.2621373816323463, + "flos": 20156552127360.0, + "grad_norm": 82.24860916503191, + "language_loss": 0.8299247, + "learning_rate": 3.4608452815205757e-06, + "loss": 0.84999341, + "num_input_tokens_seen": 94201795, + "router_z_loss_clip": 3.51757812, + "router_z_loss_mlp": 0.39770508, + "step": 4360, + "time_per_iteration": 2.6236424446105957 + }, + { + "auxiliary_loss_clip": 0.01593245, + "auxiliary_loss_mlp": 0.0037216, + "balance_loss_clip": 1.24547696, + "balance_loss_mlp": 0.33339328, + "epoch": 0.26219750488501425, + "flos": 28621235473920.0, + "grad_norm": 185.87731922635396, + "language_loss": 0.73393184, + "learning_rate": 3.4605792532735387e-06, + "loss": 0.75358588, + "num_input_tokens_seen": 94222390, + "router_z_loss_clip": 3.47851562, + "router_z_loss_mlp": 0.38769531, + "step": 4361, + "time_per_iteration": 2.694024085998535 + }, + { + "auxiliary_loss_clip": 0.0160537, + "auxiliary_loss_mlp": 0.00422127, + "balance_loss_clip": 1.25430942, + "balance_loss_mlp": 0.37990308, + "epoch": 0.2622576281376823, + "flos": 15042549173760.0, + "grad_norm": 5.907545738207341, + "language_loss": 0.89360207, + "learning_rate": 3.46031316964119e-06, + "loss": 0.91387701, + "num_input_tokens_seen": 94239980, + "router_z_loss_clip": 3.51367188, + "router_z_loss_mlp": 0.42211914, + "step": 4362, + "time_per_iteration": 2.6335391998291016 + }, + { + "auxiliary_loss_clip": 0.01587155, + "auxiliary_loss_mlp": 0.00389422, + "balance_loss_clip": 1.24581385, + "balance_loss_mlp": 0.34259671, + "epoch": 0.26231775139035024, + "flos": 26396174557440.0, + "grad_norm": 29.241391893927197, + "language_loss": 0.72831702, + "learning_rate": 3.4600470306336197e-06, + "loss": 0.74808276, + "num_input_tokens_seen": 94260715, + "router_z_loss_clip": 3.41015625, + "router_z_loss_mlp": 0.46875, + "step": 4363, + "time_per_iteration": 2.6818857192993164 + }, + { + "auxiliary_loss_clip": 0.01402475, + "auxiliary_loss_mlp": 0.00099538, + "balance_loss_clip": 1.18080544, + "balance_loss_mlp": 0.08628213, + "epoch": 0.2623778746430182, + "flos": 65408918647680.0, + "grad_norm": 2.4978947748342613, + "language_loss": 0.60720742, + "learning_rate": 3.4597808362609194e-06, + "loss": 0.62222755, + "num_input_tokens_seen": 94321285, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.1328125, + "step": 4364, + "time_per_iteration": 4.593913793563843 + }, + { + "auxiliary_loss_clip": 0.0158554, + "auxiliary_loss_mlp": 0.00373655, + "balance_loss_clip": 1.24395728, + "balance_loss_mlp": 0.33014402, + "epoch": 0.26243799789568617, + "flos": 12604215254400.0, + "grad_norm": 3.8533919999684376, + "language_loss": 0.80207872, + "learning_rate": 3.459514586533184e-06, + "loss": 0.82167065, + "num_input_tokens_seen": 94335420, + "router_z_loss_clip": 3.41796875, + "router_z_loss_mlp": 0.43554688, + "step": 4365, + "time_per_iteration": 2.638861894607544 + }, + { + "auxiliary_loss_clip": 0.01593626, + "auxiliary_loss_mlp": 0.00352879, + "balance_loss_clip": 1.25105584, + "balance_loss_mlp": 0.31418362, + "epoch": 0.26249812114835414, + "flos": 28623821253120.0, + "grad_norm": 11.802399362668922, + "language_loss": 0.8288877, + "learning_rate": 3.459248281460509e-06, + "loss": 0.84835279, + "num_input_tokens_seen": 94357440, + "router_z_loss_clip": 3.421875, + "router_z_loss_mlp": 0.38696289, + "step": 4366, + "time_per_iteration": 2.6985559463500977 + }, + { + "auxiliary_loss_clip": 0.01604727, + "auxiliary_loss_mlp": 0.00359689, + "balance_loss_clip": 1.25908136, + "balance_loss_mlp": 0.32051647, + "epoch": 0.2625582444010221, + "flos": 14465393441280.0, + "grad_norm": 36.15860051653756, + "language_loss": 0.81951666, + "learning_rate": 3.4589819210529927e-06, + "loss": 0.8391608, + "num_input_tokens_seen": 94375690, + "router_z_loss_clip": 3.45507812, + "router_z_loss_mlp": 0.3918457, + "step": 4367, + "time_per_iteration": 2.6854851245880127 + }, + { + "auxiliary_loss_clip": 0.01598255, + "auxiliary_loss_mlp": 0.00349349, + "balance_loss_clip": 1.25806236, + "balance_loss_mlp": 0.30724403, + "epoch": 0.26261836765369007, + "flos": 16613174246400.0, + "grad_norm": 9.296935961972375, + "language_loss": 0.74417782, + "learning_rate": 3.458715505320736e-06, + "loss": 0.76365387, + "num_input_tokens_seen": 94393190, + "router_z_loss_clip": 3.40234375, + "router_z_loss_mlp": 0.42138672, + "step": 4368, + "time_per_iteration": 4.02173376083374 + }, + { + "auxiliary_loss_clip": 0.01596189, + "auxiliary_loss_mlp": 0.00383913, + "balance_loss_clip": 1.24771821, + "balance_loss_mlp": 0.33827996, + "epoch": 0.26267849090635803, + "flos": 20519932066560.0, + "grad_norm": 9.50455581045785, + "language_loss": 0.84595597, + "learning_rate": 3.458449034273841e-06, + "loss": 0.86575705, + "num_input_tokens_seen": 94410975, + "router_z_loss_clip": 3.484375, + "router_z_loss_mlp": 0.45605469, + "step": 4369, + "time_per_iteration": 2.6540353298187256 + }, + { + "auxiliary_loss_clip": 0.01610584, + "auxiliary_loss_mlp": 0.00365747, + "balance_loss_clip": 1.26062822, + "balance_loss_mlp": 0.32640785, + "epoch": 0.262738614159026, + "flos": 21323936142720.0, + "grad_norm": 8.488637794856688, + "language_loss": 0.88776124, + "learning_rate": 3.4581825079224133e-06, + "loss": 0.90752459, + "num_input_tokens_seen": 94429985, + "router_z_loss_clip": 3.50195312, + "router_z_loss_mlp": 0.39355469, + "step": 4370, + "time_per_iteration": 2.669826030731201 + }, + { + "auxiliary_loss_clip": 0.01617663, + "auxiliary_loss_mlp": 0.00425765, + "balance_loss_clip": 1.25833035, + "balance_loss_mlp": 0.37991661, + "epoch": 0.26279873741169396, + "flos": 17603590930560.0, + "grad_norm": 16.1981727452957, + "language_loss": 0.76926953, + "learning_rate": 3.4579159262765575e-06, + "loss": 0.78970379, + "num_input_tokens_seen": 94448660, + "router_z_loss_clip": 3.58984375, + "router_z_loss_mlp": 0.45800781, + "step": 4371, + "time_per_iteration": 4.075396776199341 + }, + { + "auxiliary_loss_clip": 0.01444178, + "auxiliary_loss_mlp": 0.00107062, + "balance_loss_clip": 1.19938016, + "balance_loss_mlp": 0.0970008, + "epoch": 0.2628588606643619, + "flos": 60949746587520.0, + "grad_norm": 0.7200117470464382, + "language_loss": 0.55954444, + "learning_rate": 3.457649289346384e-06, + "loss": 0.57505691, + "num_input_tokens_seen": 94515630, + "router_z_loss_clip": 2.453125, + "router_z_loss_mlp": 0.10058594, + "step": 4372, + "time_per_iteration": 3.353477954864502 + }, + { + "auxiliary_loss_clip": 0.01608575, + "auxiliary_loss_mlp": 0.00358184, + "balance_loss_clip": 1.25803757, + "balance_loss_mlp": 0.31534058, + "epoch": 0.2629189839170299, + "flos": 27016315891200.0, + "grad_norm": 3.6508751177348, + "language_loss": 0.8308627, + "learning_rate": 3.4573825971420042e-06, + "loss": 0.85053027, + "num_input_tokens_seen": 94535385, + "router_z_loss_clip": 3.50976562, + "router_z_loss_mlp": 0.42871094, + "step": 4373, + "time_per_iteration": 2.717026472091675 + }, + { + "auxiliary_loss_clip": 0.01600023, + "auxiliary_loss_mlp": 0.00355614, + "balance_loss_clip": 1.25338936, + "balance_loss_mlp": 0.31484419, + "epoch": 0.26297910716969786, + "flos": 17019863009280.0, + "grad_norm": 1010.8596292285557, + "language_loss": 0.76675153, + "learning_rate": 3.4571158496735294e-06, + "loss": 0.78630787, + "num_input_tokens_seen": 94552650, + "router_z_loss_clip": 3.46875, + "router_z_loss_mlp": 0.40771484, + "step": 4374, + "time_per_iteration": 2.6224489212036133 + }, + { + "auxiliary_loss_clip": 0.0162273, + "auxiliary_loss_mlp": 0.00341257, + "balance_loss_clip": 1.26611972, + "balance_loss_mlp": 0.30110729, + "epoch": 0.2630392304223659, + "flos": 24897370728960.0, + "grad_norm": 4.9265379581341, + "language_loss": 0.85816771, + "learning_rate": 3.4568490469510756e-06, + "loss": 0.8778075, + "num_input_tokens_seen": 94574075, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 0.40136719, + "step": 4375, + "time_per_iteration": 2.72318434715271 + }, + { + "auxiliary_loss_clip": 0.01598343, + "auxiliary_loss_mlp": 0.00345231, + "balance_loss_clip": 1.25150204, + "balance_loss_mlp": 0.30670267, + "epoch": 0.26309935367503384, + "flos": 32854026067200.0, + "grad_norm": 8.58512537889125, + "language_loss": 0.72630352, + "learning_rate": 3.4565821889847603e-06, + "loss": 0.74573922, + "num_input_tokens_seen": 94594255, + "router_z_loss_clip": 3.46875, + "router_z_loss_mlp": 0.38549805, + "step": 4376, + "time_per_iteration": 2.8121542930603027 + }, + { + "auxiliary_loss_clip": 0.0162395, + "auxiliary_loss_mlp": 0.00384045, + "balance_loss_clip": 1.26842117, + "balance_loss_mlp": 0.3423928, + "epoch": 0.2631594769277018, + "flos": 15887958652800.0, + "grad_norm": 14.622921557832578, + "language_loss": 0.75015295, + "learning_rate": 3.4563152757847026e-06, + "loss": 0.77023292, + "num_input_tokens_seen": 94611410, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 0.41674805, + "step": 4377, + "time_per_iteration": 2.7024106979370117 + }, + { + "auxiliary_loss_clip": 0.01603406, + "auxiliary_loss_mlp": 0.00348568, + "balance_loss_clip": 1.25489414, + "balance_loss_mlp": 0.30875248, + "epoch": 0.2632196001803698, + "flos": 50804943557760.0, + "grad_norm": 28.891185012586693, + "language_loss": 0.84862113, + "learning_rate": 3.4560483073610233e-06, + "loss": 0.86814094, + "num_input_tokens_seen": 94636575, + "router_z_loss_clip": 3.48242188, + "router_z_loss_mlp": 0.39819336, + "step": 4378, + "time_per_iteration": 2.9192392826080322 + }, + { + "auxiliary_loss_clip": 0.01593389, + "auxiliary_loss_mlp": 0.00349063, + "balance_loss_clip": 1.2545321, + "balance_loss_mlp": 0.31172633, + "epoch": 0.26327972343303774, + "flos": 13733031041280.0, + "grad_norm": 2.25301397337326, + "language_loss": 0.83861887, + "learning_rate": 3.455781283723846e-06, + "loss": 0.85804343, + "num_input_tokens_seen": 94654345, + "router_z_loss_clip": 3.38867188, + "router_z_loss_mlp": 0.37353516, + "step": 4379, + "time_per_iteration": 4.116482496261597 + }, + { + "auxiliary_loss_clip": 0.01622029, + "auxiliary_loss_mlp": 0.00384969, + "balance_loss_clip": 1.26492608, + "balance_loss_mlp": 0.34205407, + "epoch": 0.2633398466857057, + "flos": 23769057732480.0, + "grad_norm": 18.9534499346279, + "language_loss": 0.85572368, + "learning_rate": 3.4555142048832975e-06, + "loss": 0.8757937, + "num_input_tokens_seen": 94673985, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 0.42895508, + "step": 4380, + "time_per_iteration": 2.691673755645752 + }, + { + "auxiliary_loss_clip": 0.01594592, + "auxiliary_loss_mlp": 0.00338465, + "balance_loss_clip": 1.24240088, + "balance_loss_mlp": 0.29969791, + "epoch": 0.26339996993837367, + "flos": 27600223380480.0, + "grad_norm": 5.60441707904213, + "language_loss": 0.71773791, + "learning_rate": 3.4552470708495036e-06, + "loss": 0.73706853, + "num_input_tokens_seen": 94693145, + "router_z_loss_clip": 3.5234375, + "router_z_loss_mlp": 0.38769531, + "step": 4381, + "time_per_iteration": 2.6739449501037598 + }, + { + "auxiliary_loss_clip": 0.01592409, + "auxiliary_loss_mlp": 0.00330871, + "balance_loss_clip": 1.24904776, + "balance_loss_mlp": 0.28983968, + "epoch": 0.26346009319104163, + "flos": 16946317912320.0, + "grad_norm": 4.409276731995388, + "language_loss": 0.88812619, + "learning_rate": 3.454979881632595e-06, + "loss": 0.907359, + "num_input_tokens_seen": 94710185, + "router_z_loss_clip": 3.4375, + "router_z_loss_mlp": 0.41040039, + "step": 4382, + "time_per_iteration": 2.6391866207122803 + }, + { + "auxiliary_loss_clip": 0.01604257, + "auxiliary_loss_mlp": 0.00384775, + "balance_loss_clip": 1.25388157, + "balance_loss_mlp": 0.34195471, + "epoch": 0.2635202164437096, + "flos": 37232218915200.0, + "grad_norm": 80.48621024341162, + "language_loss": 0.76218367, + "learning_rate": 3.4547126372427035e-06, + "loss": 0.78207397, + "num_input_tokens_seen": 94730280, + "router_z_loss_clip": 3.50585938, + "router_z_loss_mlp": 0.42797852, + "step": 4383, + "time_per_iteration": 2.755786895751953 + }, + { + "auxiliary_loss_clip": 0.01603106, + "auxiliary_loss_mlp": 0.00362329, + "balance_loss_clip": 1.25436664, + "balance_loss_mlp": 0.32461131, + "epoch": 0.26358033969637756, + "flos": 20996359084800.0, + "grad_norm": 11.457913830100518, + "language_loss": 0.74340796, + "learning_rate": 3.4544453376899638e-06, + "loss": 0.76306236, + "num_input_tokens_seen": 94748560, + "router_z_loss_clip": 3.48632812, + "router_z_loss_mlp": 0.37744141, + "step": 4384, + "time_per_iteration": 2.634168863296509 + }, + { + "auxiliary_loss_clip": 0.01588677, + "auxiliary_loss_mlp": 0.00352312, + "balance_loss_clip": 1.24986625, + "balance_loss_mlp": 0.31254393, + "epoch": 0.26364046294904553, + "flos": 27746092512000.0, + "grad_norm": 10.605461962296028, + "language_loss": 0.75145555, + "learning_rate": 3.45417798298451e-06, + "loss": 0.77086538, + "num_input_tokens_seen": 94767570, + "router_z_loss_clip": 3.38867188, + "router_z_loss_mlp": 0.39770508, + "step": 4385, + "time_per_iteration": 2.6744024753570557 + }, + { + "auxiliary_loss_clip": 0.01593112, + "auxiliary_loss_mlp": 0.0037309, + "balance_loss_clip": 1.25106788, + "balance_loss_mlp": 0.33317852, + "epoch": 0.2637005862017135, + "flos": 22893088757760.0, + "grad_norm": 1.8728132422185797, + "language_loss": 0.88271332, + "learning_rate": 3.453910573136482e-06, + "loss": 0.90237534, + "num_input_tokens_seen": 94784985, + "router_z_loss_clip": 3.421875, + "router_z_loss_mlp": 0.39916992, + "step": 4386, + "time_per_iteration": 2.6595780849456787 + }, + { + "auxiliary_loss_clip": 0.01588997, + "auxiliary_loss_mlp": 0.003341, + "balance_loss_clip": 1.25372493, + "balance_loss_mlp": 0.29697776, + "epoch": 0.26376070945438146, + "flos": 15048834053760.0, + "grad_norm": 22.7332080326983, + "language_loss": 0.84224373, + "learning_rate": 3.4536431081560196e-06, + "loss": 0.86147463, + "num_input_tokens_seen": 94802545, + "router_z_loss_clip": 3.35351562, + "router_z_loss_mlp": 0.37109375, + "step": 4387, + "time_per_iteration": 2.624607563018799 + }, + { + "auxiliary_loss_clip": 0.01588959, + "auxiliary_loss_mlp": 0.00346927, + "balance_loss_clip": 1.2509594, + "balance_loss_mlp": 0.30878007, + "epoch": 0.2638208327070494, + "flos": 21141833166720.0, + "grad_norm": 22.939609459723986, + "language_loss": 0.81708795, + "learning_rate": 3.453375588053264e-06, + "loss": 0.83644676, + "num_input_tokens_seen": 94820730, + "router_z_loss_clip": 3.3828125, + "router_z_loss_mlp": 0.3815918, + "step": 4388, + "time_per_iteration": 2.679506540298462 + }, + { + "auxiliary_loss_clip": 0.01570917, + "auxiliary_loss_mlp": 0.00349363, + "balance_loss_clip": 1.24104393, + "balance_loss_mlp": 0.31028616, + "epoch": 0.26388095595971744, + "flos": 21725597001600.0, + "grad_norm": 24.366060921065614, + "language_loss": 0.92738712, + "learning_rate": 3.4531080128383617e-06, + "loss": 0.94658995, + "num_input_tokens_seen": 94839175, + "router_z_loss_clip": 3.29882812, + "router_z_loss_mlp": 0.390625, + "step": 4389, + "time_per_iteration": 2.653801441192627 + }, + { + "auxiliary_loss_clip": 0.01391482, + "auxiliary_loss_mlp": 0.000795, + "balance_loss_clip": 1.18577051, + "balance_loss_mlp": 0.06891456, + "epoch": 0.2639410792123854, + "flos": 65515537192320.0, + "grad_norm": 0.8013620600417956, + "language_loss": 0.60044217, + "learning_rate": 3.452840382521457e-06, + "loss": 0.615152, + "num_input_tokens_seen": 94898865, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.10595703, + "step": 4390, + "time_per_iteration": 3.158923625946045 + }, + { + "auxiliary_loss_clip": 0.01594783, + "auxiliary_loss_mlp": 0.00364634, + "balance_loss_clip": 1.24718022, + "balance_loss_mlp": 0.32481843, + "epoch": 0.2640012024650534, + "flos": 23948574929280.0, + "grad_norm": 9.140828030675603, + "language_loss": 0.82767463, + "learning_rate": 3.4525726971127e-06, + "loss": 0.84726882, + "num_input_tokens_seen": 94917490, + "router_z_loss_clip": 3.4765625, + "router_z_loss_mlp": 0.3984375, + "step": 4391, + "time_per_iteration": 2.7188563346862793 + }, + { + "auxiliary_loss_clip": 0.0138959, + "auxiliary_loss_mlp": 0.00076307, + "balance_loss_clip": 1.18095422, + "balance_loss_mlp": 0.06314629, + "epoch": 0.26406132571772134, + "flos": 56441163369600.0, + "grad_norm": 0.9164135787524301, + "language_loss": 0.58860999, + "learning_rate": 3.45230495662224e-06, + "loss": 0.60326898, + "num_input_tokens_seen": 94969065, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.13183594, + "step": 4392, + "time_per_iteration": 3.118898630142212 + }, + { + "auxiliary_loss_clip": 0.01583723, + "auxiliary_loss_mlp": 0.00383535, + "balance_loss_clip": 1.24660981, + "balance_loss_mlp": 0.34257454, + "epoch": 0.2641214489703893, + "flos": 22090557139200.0, + "grad_norm": 171.42948722094826, + "language_loss": 0.76012051, + "learning_rate": 3.4520371610602306e-06, + "loss": 0.77979308, + "num_input_tokens_seen": 94988540, + "router_z_loss_clip": 3.36914062, + "router_z_loss_mlp": 0.40966797, + "step": 4393, + "time_per_iteration": 2.661250352859497 + }, + { + "auxiliary_loss_clip": 0.01623831, + "auxiliary_loss_mlp": 0.00384944, + "balance_loss_clip": 1.2655344, + "balance_loss_mlp": 0.34126598, + "epoch": 0.26418157222305727, + "flos": 16544764794240.0, + "grad_norm": 11.708284844087663, + "language_loss": 0.91368806, + "learning_rate": 3.4517693104368267e-06, + "loss": 0.93377578, + "num_input_tokens_seen": 95004810, + "router_z_loss_clip": 3.58203125, + "router_z_loss_mlp": 0.43676758, + "step": 4394, + "time_per_iteration": 2.6276464462280273 + }, + { + "auxiliary_loss_clip": 0.01600912, + "auxiliary_loss_mlp": 0.00402194, + "balance_loss_clip": 1.25420022, + "balance_loss_mlp": 0.35656053, + "epoch": 0.26424169547572524, + "flos": 18002486442240.0, + "grad_norm": 1482.765293011046, + "language_loss": 0.77639329, + "learning_rate": 3.4515014047621856e-06, + "loss": 0.79642433, + "num_input_tokens_seen": 95024085, + "router_z_loss_clip": 3.46875, + "router_z_loss_mlp": 0.45629883, + "step": 4395, + "time_per_iteration": 2.619351625442505 + }, + { + "auxiliary_loss_clip": 0.01588415, + "auxiliary_loss_mlp": 0.00340152, + "balance_loss_clip": 1.25052786, + "balance_loss_mlp": 0.30238697, + "epoch": 0.2643018187283932, + "flos": 16983162288000.0, + "grad_norm": 6.6543545038175225, + "language_loss": 0.94116664, + "learning_rate": 3.4512334440464655e-06, + "loss": 0.96045232, + "num_input_tokens_seen": 95042515, + "router_z_loss_clip": 3.37890625, + "router_z_loss_mlp": 0.37768555, + "step": 4396, + "time_per_iteration": 2.6286208629608154 + }, + { + "auxiliary_loss_clip": 0.01428902, + "auxiliary_loss_mlp": 0.00104129, + "balance_loss_clip": 1.2148546, + "balance_loss_mlp": 0.09483024, + "epoch": 0.26436194198106117, + "flos": 59664359416320.0, + "grad_norm": 51.951619106525534, + "language_loss": 0.5497418, + "learning_rate": 3.4509654282998277e-06, + "loss": 0.56507212, + "num_input_tokens_seen": 95094835, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.09277344, + "step": 4397, + "time_per_iteration": 2.955902576446533 + }, + { + "auxiliary_loss_clip": 0.01610627, + "auxiliary_loss_mlp": 0.00362872, + "balance_loss_clip": 1.26934361, + "balance_loss_mlp": 0.3245101, + "epoch": 0.26442206523372913, + "flos": 32921322197760.0, + "grad_norm": 12.10005954139989, + "language_loss": 0.84692836, + "learning_rate": 3.450697357532435e-06, + "loss": 0.8666634, + "num_input_tokens_seen": 95113480, + "router_z_loss_clip": 3.41210938, + "router_z_loss_mlp": 0.38354492, + "step": 4398, + "time_per_iteration": 2.7408740520477295 + }, + { + "auxiliary_loss_clip": 0.01623277, + "auxiliary_loss_mlp": 0.00371295, + "balance_loss_clip": 1.27660108, + "balance_loss_mlp": 0.33064455, + "epoch": 0.2644821884863971, + "flos": 21031300039680.0, + "grad_norm": 69.63306269543864, + "language_loss": 0.72652328, + "learning_rate": 3.4504292317544534e-06, + "loss": 0.74646902, + "num_input_tokens_seen": 95132580, + "router_z_loss_clip": 3.47265625, + "router_z_loss_mlp": 0.40625, + "step": 4399, + "time_per_iteration": 2.647465705871582 + }, + { + "auxiliary_loss_clip": 0.01605932, + "auxiliary_loss_mlp": 0.0035963, + "balance_loss_clip": 1.27070642, + "balance_loss_mlp": 0.3201957, + "epoch": 0.26454231173906506, + "flos": 20776801201920.0, + "grad_norm": 11.332896086100925, + "language_loss": 0.91432726, + "learning_rate": 3.4501610509760504e-06, + "loss": 0.93398285, + "num_input_tokens_seen": 95152375, + "router_z_loss_clip": 3.35351562, + "router_z_loss_mlp": 0.39453125, + "step": 4400, + "time_per_iteration": 2.6431796550750732 + }, + { + "auxiliary_loss_clip": 0.01630613, + "auxiliary_loss_mlp": 0.00354536, + "balance_loss_clip": 1.27521586, + "balance_loss_mlp": 0.31469631, + "epoch": 0.264602434991733, + "flos": 16618669027200.0, + "grad_norm": 1.618481801358154, + "language_loss": 0.82527089, + "learning_rate": 3.4498928152073944e-06, + "loss": 0.8451224, + "num_input_tokens_seen": 95170265, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 0.39892578, + "step": 4401, + "time_per_iteration": 2.6326613426208496 + }, + { + "auxiliary_loss_clip": 0.01647247, + "auxiliary_loss_mlp": 0.00358207, + "balance_loss_clip": 1.29132462, + "balance_loss_mlp": 0.31791455, + "epoch": 0.26466255824440105, + "flos": 19062677295360.0, + "grad_norm": 102.91629804216895, + "language_loss": 0.941486, + "learning_rate": 3.4496245244586577e-06, + "loss": 0.96154058, + "num_input_tokens_seen": 95188655, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 0.40307617, + "step": 4402, + "time_per_iteration": 2.655700922012329 + }, + { + "auxiliary_loss_clip": 0.01635588, + "auxiliary_loss_mlp": 0.00383494, + "balance_loss_clip": 1.28456175, + "balance_loss_mlp": 0.34384471, + "epoch": 0.264722681497069, + "flos": 22638554006400.0, + "grad_norm": 5.817912925504186, + "language_loss": 0.82817233, + "learning_rate": 3.4493561787400137e-06, + "loss": 0.84836316, + "num_input_tokens_seen": 95209615, + "router_z_loss_clip": 3.51367188, + "router_z_loss_mlp": 0.39624023, + "step": 4403, + "time_per_iteration": 2.7161617279052734 + }, + { + "auxiliary_loss_clip": 0.01621117, + "auxiliary_loss_mlp": 0.00361903, + "balance_loss_clip": 1.27246547, + "balance_loss_mlp": 0.32292202, + "epoch": 0.264782804749737, + "flos": 22492253911680.0, + "grad_norm": 7.951867323753539, + "language_loss": 0.95772052, + "learning_rate": 3.4490877780616387e-06, + "loss": 0.97755075, + "num_input_tokens_seen": 95227810, + "router_z_loss_clip": 3.48632812, + "router_z_loss_mlp": 0.3894043, + "step": 4404, + "time_per_iteration": 2.703840970993042 + }, + { + "auxiliary_loss_clip": 0.01647617, + "auxiliary_loss_mlp": 0.00380297, + "balance_loss_clip": 1.29062009, + "balance_loss_mlp": 0.3413631, + "epoch": 0.26484292800240494, + "flos": 16800269212800.0, + "grad_norm": 154.77053076397786, + "language_loss": 0.82240272, + "learning_rate": 3.448819322433709e-06, + "loss": 0.84268188, + "num_input_tokens_seen": 95245890, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 0.38964844, + "step": 4405, + "time_per_iteration": 2.66880202293396 + }, + { + "auxiliary_loss_clip": 0.01665182, + "auxiliary_loss_mlp": 0.00376446, + "balance_loss_clip": 1.30851531, + "balance_loss_mlp": 0.33565244, + "epoch": 0.2649030512550729, + "flos": 20449583280000.0, + "grad_norm": 6.358367720175698, + "language_loss": 0.76658416, + "learning_rate": 3.4485508118664066e-06, + "loss": 0.78700048, + "num_input_tokens_seen": 95264955, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 0.40771484, + "step": 4406, + "time_per_iteration": 4.046116828918457 + }, + { + "auxiliary_loss_clip": 0.01644135, + "auxiliary_loss_mlp": 0.00364583, + "balance_loss_clip": 1.29360342, + "balance_loss_mlp": 0.32715079, + "epoch": 0.2649631745077409, + "flos": 22416123035520.0, + "grad_norm": 4.451931143264114, + "language_loss": 0.89456713, + "learning_rate": 3.448282246369912e-06, + "loss": 0.91465431, + "num_input_tokens_seen": 95284245, + "router_z_loss_clip": 3.50390625, + "router_z_loss_mlp": 0.37426758, + "step": 4407, + "time_per_iteration": 2.6576969623565674 + }, + { + "auxiliary_loss_clip": 0.01661058, + "auxiliary_loss_mlp": 0.00373256, + "balance_loss_clip": 1.30885744, + "balance_loss_mlp": 0.3305552, + "epoch": 0.26502329776040884, + "flos": 35116110927360.0, + "grad_norm": 13.79497465849659, + "language_loss": 0.81820953, + "learning_rate": 3.4480136259544084e-06, + "loss": 0.83855265, + "num_input_tokens_seen": 95307125, + "router_z_loss_clip": 3.5234375, + "router_z_loss_mlp": 0.42700195, + "step": 4408, + "time_per_iteration": 2.768663167953491 + }, + { + "auxiliary_loss_clip": 0.01665107, + "auxiliary_loss_mlp": 0.00341423, + "balance_loss_clip": 1.31174171, + "balance_loss_mlp": 0.30129707, + "epoch": 0.2650834210130768, + "flos": 38687498438400.0, + "grad_norm": 65.89275684587346, + "language_loss": 0.75916332, + "learning_rate": 3.447744950630084e-06, + "loss": 0.77922857, + "num_input_tokens_seen": 95329150, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 0.40136719, + "step": 4409, + "time_per_iteration": 2.782188892364502 + }, + { + "auxiliary_loss_clip": 0.01649175, + "auxiliary_loss_mlp": 0.00365, + "balance_loss_clip": 1.30138433, + "balance_loss_mlp": 0.32260871, + "epoch": 0.26514354426574477, + "flos": 24716847951360.0, + "grad_norm": 2.298126350193679, + "language_loss": 0.79863763, + "learning_rate": 3.4474762204071253e-06, + "loss": 0.81877935, + "num_input_tokens_seen": 95349880, + "router_z_loss_clip": 3.47851562, + "router_z_loss_mlp": 0.42407227, + "step": 4410, + "time_per_iteration": 4.194639682769775 + }, + { + "auxiliary_loss_clip": 0.01661276, + "auxiliary_loss_mlp": 0.00370911, + "balance_loss_clip": 1.30788457, + "balance_loss_mlp": 0.33304971, + "epoch": 0.26520366751841273, + "flos": 20340055733760.0, + "grad_norm": 3016.7396988165387, + "language_loss": 0.79149592, + "learning_rate": 3.4472074352957244e-06, + "loss": 0.81181777, + "num_input_tokens_seen": 95368570, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 0.37866211, + "step": 4411, + "time_per_iteration": 2.685380220413208 + }, + { + "auxiliary_loss_clip": 0.01665414, + "auxiliary_loss_mlp": 0.00365644, + "balance_loss_clip": 1.3137666, + "balance_loss_mlp": 0.32575637, + "epoch": 0.2652637907710807, + "flos": 22343870828160.0, + "grad_norm": 6.344420228011144, + "language_loss": 0.86301613, + "learning_rate": 3.446938595306071e-06, + "loss": 0.88332671, + "num_input_tokens_seen": 95387065, + "router_z_loss_clip": 3.515625, + "router_z_loss_mlp": 0.39892578, + "step": 4412, + "time_per_iteration": 2.684338092803955 + }, + { + "auxiliary_loss_clip": 0.01662976, + "auxiliary_loss_mlp": 0.00381177, + "balance_loss_clip": 1.31664872, + "balance_loss_mlp": 0.33845234, + "epoch": 0.26532391402374866, + "flos": 19354235990400.0, + "grad_norm": 19.09315943351729, + "language_loss": 0.81205696, + "learning_rate": 3.4466697004483622e-06, + "loss": 0.83249843, + "num_input_tokens_seen": 95406345, + "router_z_loss_clip": 3.46289062, + "router_z_loss_mlp": 0.42724609, + "step": 4413, + "time_per_iteration": 4.190816164016724 + }, + { + "auxiliary_loss_clip": 0.01560515, + "auxiliary_loss_mlp": 0.00081885, + "balance_loss_clip": 1.34129024, + "balance_loss_mlp": 0.07220526, + "epoch": 0.26538403727641663, + "flos": 44787611422080.0, + "grad_norm": 2.6152054337818296, + "language_loss": 0.56897759, + "learning_rate": 3.446400750732793e-06, + "loss": 0.58540159, + "num_input_tokens_seen": 95463595, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.09667969, + "step": 4414, + "time_per_iteration": 3.086897134780884 + }, + { + "auxiliary_loss_clip": 0.01670697, + "auxiliary_loss_mlp": 0.00316586, + "balance_loss_clip": 1.32437074, + "balance_loss_mlp": 0.28335014, + "epoch": 0.26544416052908465, + "flos": 28182119708160.0, + "grad_norm": 4.432633040028551, + "language_loss": 0.7979036, + "learning_rate": 3.4461317461695625e-06, + "loss": 0.81777644, + "num_input_tokens_seen": 95484115, + "router_z_loss_clip": 3.46289062, + "router_z_loss_mlp": 0.33203125, + "step": 4415, + "time_per_iteration": 2.7279345989227295 + }, + { + "auxiliary_loss_clip": 0.01706293, + "auxiliary_loss_mlp": 0.00341173, + "balance_loss_clip": 1.34350157, + "balance_loss_mlp": 0.29904473, + "epoch": 0.2655042837817526, + "flos": 17565274097280.0, + "grad_norm": 3.5749072695502946, + "language_loss": 0.93746585, + "learning_rate": 3.4458626867688707e-06, + "loss": 0.95794052, + "num_input_tokens_seen": 95501435, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 0.42138672, + "step": 4416, + "time_per_iteration": 2.6295835971832275 + }, + { + "auxiliary_loss_clip": 0.01679806, + "auxiliary_loss_mlp": 0.00394389, + "balance_loss_clip": 1.33170605, + "balance_loss_mlp": 0.35283297, + "epoch": 0.2655644070344206, + "flos": 23404636298880.0, + "grad_norm": 33.0397158480753, + "language_loss": 0.82156336, + "learning_rate": 3.4455935725409217e-06, + "loss": 0.84230536, + "num_input_tokens_seen": 95520135, + "router_z_loss_clip": 3.48242188, + "router_z_loss_mlp": 0.41601562, + "step": 4417, + "time_per_iteration": 2.722395896911621 + }, + { + "auxiliary_loss_clip": 0.01673118, + "auxiliary_loss_mlp": 0.00365148, + "balance_loss_clip": 1.32694411, + "balance_loss_mlp": 0.32695347, + "epoch": 0.26562453028708854, + "flos": 26468462678400.0, + "grad_norm": 18.254020775525536, + "language_loss": 0.85134661, + "learning_rate": 3.4453244034959196e-06, + "loss": 0.87172925, + "num_input_tokens_seen": 95541705, + "router_z_loss_clip": 3.46289062, + "router_z_loss_mlp": 0.38183594, + "step": 4418, + "time_per_iteration": 2.802130937576294 + }, + { + "auxiliary_loss_clip": 0.01686134, + "auxiliary_loss_mlp": 0.00381115, + "balance_loss_clip": 1.3326745, + "balance_loss_mlp": 0.34096509, + "epoch": 0.2656846535397565, + "flos": 19207576759680.0, + "grad_norm": 1338.003638675426, + "language_loss": 0.74313211, + "learning_rate": 3.445055179644071e-06, + "loss": 0.76380467, + "num_input_tokens_seen": 95560300, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 0.40161133, + "step": 4419, + "time_per_iteration": 2.642695188522339 + }, + { + "auxiliary_loss_clip": 0.01695314, + "auxiliary_loss_mlp": 0.00395932, + "balance_loss_clip": 1.33759522, + "balance_loss_mlp": 0.35227793, + "epoch": 0.2657447767924245, + "flos": 30551325903360.0, + "grad_norm": 1.9715654798980515, + "language_loss": 0.83187604, + "learning_rate": 3.444785900995585e-06, + "loss": 0.85278857, + "num_input_tokens_seen": 95580150, + "router_z_loss_clip": 3.57617188, + "router_z_loss_mlp": 0.43603516, + "step": 4420, + "time_per_iteration": 2.768308401107788 + }, + { + "auxiliary_loss_clip": 0.0169135, + "auxiliary_loss_mlp": 0.00390772, + "balance_loss_clip": 1.33756638, + "balance_loss_mlp": 0.34690344, + "epoch": 0.26580490004509244, + "flos": 20922742160640.0, + "grad_norm": 66.77281440717351, + "language_loss": 0.88456047, + "learning_rate": 3.444516567560673e-06, + "loss": 0.9053818, + "num_input_tokens_seen": 95597570, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 0.43823242, + "step": 4421, + "time_per_iteration": 2.645838975906372 + }, + { + "auxiliary_loss_clip": 0.01691712, + "auxiliary_loss_mlp": 0.00380703, + "balance_loss_clip": 1.34299827, + "balance_loss_mlp": 0.34036285, + "epoch": 0.2658650232977604, + "flos": 43945682584320.0, + "grad_norm": 27.242346534237473, + "language_loss": 0.71819156, + "learning_rate": 3.444247179349548e-06, + "loss": 0.73891568, + "num_input_tokens_seen": 95619415, + "router_z_loss_clip": 3.48828125, + "router_z_loss_mlp": 0.40380859, + "step": 4422, + "time_per_iteration": 4.398077487945557 + }, + { + "auxiliary_loss_clip": 0.01713157, + "auxiliary_loss_mlp": 0.003808, + "balance_loss_clip": 1.35866356, + "balance_loss_mlp": 0.34262967, + "epoch": 0.26592514655042837, + "flos": 29716439109120.0, + "grad_norm": 3.2531377722438743, + "language_loss": 0.81198126, + "learning_rate": 3.4439777363724252e-06, + "loss": 0.83292079, + "num_input_tokens_seen": 95639155, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 0.3815918, + "step": 4423, + "time_per_iteration": 2.691225290298462 + }, + { + "auxiliary_loss_clip": 0.0171078, + "auxiliary_loss_mlp": 0.00359497, + "balance_loss_clip": 1.35023379, + "balance_loss_mlp": 0.32156426, + "epoch": 0.26598526980309634, + "flos": 46677730014720.0, + "grad_norm": 107.92113502015309, + "language_loss": 0.83775306, + "learning_rate": 3.443708238639522e-06, + "loss": 0.85845584, + "num_input_tokens_seen": 95663320, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 0.37890625, + "step": 4424, + "time_per_iteration": 2.865556240081787 + }, + { + "auxiliary_loss_clip": 0.01720516, + "auxiliary_loss_mlp": 0.00370531, + "balance_loss_clip": 1.35926199, + "balance_loss_mlp": 0.33028615, + "epoch": 0.2660453930557643, + "flos": 11509442582400.0, + "grad_norm": 2.8169754673398706, + "language_loss": 0.8712703, + "learning_rate": 3.4434386861610573e-06, + "loss": 0.89218074, + "num_input_tokens_seen": 95680260, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 0.40185547, + "step": 4425, + "time_per_iteration": 2.717451572418213 + }, + { + "auxiliary_loss_clip": 0.01707991, + "auxiliary_loss_mlp": 0.00346806, + "balance_loss_clip": 1.34944868, + "balance_loss_mlp": 0.31073329, + "epoch": 0.26610551630843227, + "flos": 24791578197120.0, + "grad_norm": 29.326340744807283, + "language_loss": 0.87234187, + "learning_rate": 3.4431690789472532e-06, + "loss": 0.89288986, + "num_input_tokens_seen": 95701140, + "router_z_loss_clip": 3.58398438, + "router_z_loss_mlp": 0.36108398, + "step": 4426, + "time_per_iteration": 2.7240235805511475 + }, + { + "auxiliary_loss_clip": 0.01722144, + "auxiliary_loss_mlp": 0.00396775, + "balance_loss_clip": 1.35879016, + "balance_loss_mlp": 0.3560057, + "epoch": 0.26616563956110023, + "flos": 27636385397760.0, + "grad_norm": 10.245466372152071, + "language_loss": 0.81480241, + "learning_rate": 3.442899417008333e-06, + "loss": 0.83599162, + "num_input_tokens_seen": 95722060, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 0.40795898, + "step": 4427, + "time_per_iteration": 2.719569206237793 + }, + { + "auxiliary_loss_clip": 0.01732347, + "auxiliary_loss_mlp": 0.00372393, + "balance_loss_clip": 1.37612939, + "balance_loss_mlp": 0.33388823, + "epoch": 0.26622576281376825, + "flos": 28362893880960.0, + "grad_norm": 134.32217586471037, + "language_loss": 0.81191504, + "learning_rate": 3.4426297003545227e-06, + "loss": 0.83296239, + "num_input_tokens_seen": 95742495, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 0.38500977, + "step": 4428, + "time_per_iteration": 2.7029848098754883 + }, + { + "auxiliary_loss_clip": 0.01729139, + "auxiliary_loss_mlp": 0.00399247, + "balance_loss_clip": 1.3653276, + "balance_loss_mlp": 0.361696, + "epoch": 0.2662858860664362, + "flos": 18041341979520.0, + "grad_norm": 3.559619986218037, + "language_loss": 0.90190566, + "learning_rate": 3.4423599289960495e-06, + "loss": 0.92318952, + "num_input_tokens_seen": 95761510, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 0.37548828, + "step": 4429, + "time_per_iteration": 2.6381616592407227 + }, + { + "auxiliary_loss_clip": 0.01764834, + "auxiliary_loss_mlp": 0.00371279, + "balance_loss_clip": 1.3990221, + "balance_loss_mlp": 0.3300091, + "epoch": 0.2663460093191042, + "flos": 22745818995840.0, + "grad_norm": 14.965997464731696, + "language_loss": 0.78619695, + "learning_rate": 3.442090102943143e-06, + "loss": 0.80755806, + "num_input_tokens_seen": 95782385, + "router_z_loss_clip": 3.66210938, + "router_z_loss_mlp": 0.41259766, + "step": 4430, + "time_per_iteration": 2.7359862327575684 + }, + { + "auxiliary_loss_clip": 0.01748496, + "auxiliary_loss_mlp": 0.00388265, + "balance_loss_clip": 1.38243437, + "balance_loss_mlp": 0.34725672, + "epoch": 0.26640613257177215, + "flos": 16508782344960.0, + "grad_norm": 20.40977637072055, + "language_loss": 0.89106119, + "learning_rate": 3.441820222206035e-06, + "loss": 0.91242874, + "num_input_tokens_seen": 95800595, + "router_z_loss_clip": 3.66015625, + "router_z_loss_mlp": 0.41064453, + "step": 4431, + "time_per_iteration": 2.633929491043091 + }, + { + "auxiliary_loss_clip": 0.01754074, + "auxiliary_loss_mlp": 0.00381877, + "balance_loss_clip": 1.37961435, + "balance_loss_mlp": 0.34196621, + "epoch": 0.2664662558244401, + "flos": 23075945919360.0, + "grad_norm": 3.6592495184331066, + "language_loss": 0.83306289, + "learning_rate": 3.44155028679496e-06, + "loss": 0.85442233, + "num_input_tokens_seen": 95818480, + "router_z_loss_clip": 3.74414062, + "router_z_loss_mlp": 0.39916992, + "step": 4432, + "time_per_iteration": 2.6431877613067627 + }, + { + "auxiliary_loss_clip": 0.01759343, + "auxiliary_loss_mlp": 0.00406631, + "balance_loss_clip": 1.39300656, + "balance_loss_mlp": 0.36576644, + "epoch": 0.2665263790771081, + "flos": 23769273214080.0, + "grad_norm": 13.483417166048925, + "language_loss": 0.87939644, + "learning_rate": 3.441280296720154e-06, + "loss": 0.90105623, + "num_input_tokens_seen": 95837205, + "router_z_loss_clip": 3.66210938, + "router_z_loss_mlp": 0.40869141, + "step": 4433, + "time_per_iteration": 2.7074031829833984 + }, + { + "auxiliary_loss_clip": 0.01767544, + "auxiliary_loss_mlp": 0.00396169, + "balance_loss_clip": 1.40131783, + "balance_loss_mlp": 0.35775954, + "epoch": 0.26658650232977604, + "flos": 28001273708160.0, + "grad_norm": 12.273698615442134, + "language_loss": 0.82364643, + "learning_rate": 3.441010251991854e-06, + "loss": 0.84528351, + "num_input_tokens_seen": 95858395, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 0.38427734, + "step": 4434, + "time_per_iteration": 2.7192370891571045 + }, + { + "auxiliary_loss_clip": 0.01765668, + "auxiliary_loss_mlp": 0.00337562, + "balance_loss_clip": 1.40214968, + "balance_loss_mlp": 0.30065489, + "epoch": 0.266646625582444, + "flos": 22163635359360.0, + "grad_norm": 194.27621476208552, + "language_loss": 0.89281571, + "learning_rate": 3.440740152620301e-06, + "loss": 0.91384798, + "num_input_tokens_seen": 95877875, + "router_z_loss_clip": 3.63476562, + "router_z_loss_mlp": 0.36889648, + "step": 4435, + "time_per_iteration": 2.686112642288208 + }, + { + "auxiliary_loss_clip": 0.01779304, + "auxiliary_loss_mlp": 0.0036657, + "balance_loss_clip": 1.40237665, + "balance_loss_mlp": 0.32274842, + "epoch": 0.266706748835112, + "flos": 27853537069440.0, + "grad_norm": 5.654495295303697, + "language_loss": 0.93507802, + "learning_rate": 3.4404699986157376e-06, + "loss": 0.95653677, + "num_input_tokens_seen": 95895820, + "router_z_loss_clip": 3.77148438, + "router_z_loss_mlp": 0.43847656, + "step": 4436, + "time_per_iteration": 2.739887237548828 + }, + { + "auxiliary_loss_clip": 0.01743841, + "auxiliary_loss_mlp": 0.00360209, + "balance_loss_clip": 1.38872135, + "balance_loss_mlp": 0.32151347, + "epoch": 0.26676687208777994, + "flos": 25812123413760.0, + "grad_norm": 14.256208074931841, + "language_loss": 0.82260609, + "learning_rate": 3.440199789988407e-06, + "loss": 0.84364659, + "num_input_tokens_seen": 95918025, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 0.38696289, + "step": 4437, + "time_per_iteration": 2.7306854724884033 + }, + { + "auxiliary_loss_clip": 0.0175288, + "auxiliary_loss_mlp": 0.00394381, + "balance_loss_clip": 1.39313245, + "balance_loss_mlp": 0.3524195, + "epoch": 0.2668269953404479, + "flos": 36064583504640.0, + "grad_norm": 7.619170753496409, + "language_loss": 0.73111832, + "learning_rate": 3.439929526748556e-06, + "loss": 0.75259089, + "num_input_tokens_seen": 95937725, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 0.41967773, + "step": 4438, + "time_per_iteration": 2.800873041152954 + }, + { + "auxiliary_loss_clip": 0.01753735, + "auxiliary_loss_mlp": 0.00397802, + "balance_loss_clip": 1.39476538, + "balance_loss_mlp": 0.35650772, + "epoch": 0.26688711859311587, + "flos": 26570987072640.0, + "grad_norm": 93.20551592953444, + "language_loss": 0.81169271, + "learning_rate": 3.4396592089064334e-06, + "loss": 0.83320808, + "num_input_tokens_seen": 95956335, + "router_z_loss_clip": 3.58789062, + "router_z_loss_mlp": 0.41308594, + "step": 4439, + "time_per_iteration": 2.7216665744781494 + }, + { + "auxiliary_loss_clip": 0.01786916, + "auxiliary_loss_mlp": 0.00360374, + "balance_loss_clip": 1.41924095, + "balance_loss_mlp": 0.32012898, + "epoch": 0.26694724184578383, + "flos": 26761565658240.0, + "grad_norm": 11.122130703391738, + "language_loss": 0.76121038, + "learning_rate": 3.4393888364722897e-06, + "loss": 0.78268325, + "num_input_tokens_seen": 95977135, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 0.40234375, + "step": 4440, + "time_per_iteration": 2.7328786849975586 + }, + { + "auxiliary_loss_clip": 0.0175465, + "auxiliary_loss_mlp": 0.00359095, + "balance_loss_clip": 1.39617252, + "balance_loss_mlp": 0.32025683, + "epoch": 0.2670073650984518, + "flos": 20959586536320.0, + "grad_norm": 7.144158743098327, + "language_loss": 0.74710035, + "learning_rate": 3.439118409456376e-06, + "loss": 0.76823771, + "num_input_tokens_seen": 95995435, + "router_z_loss_clip": 3.58789062, + "router_z_loss_mlp": 0.38867188, + "step": 4441, + "time_per_iteration": 2.6313068866729736 + }, + { + "auxiliary_loss_clip": 0.01786913, + "auxiliary_loss_mlp": 0.00347142, + "balance_loss_clip": 1.42233169, + "balance_loss_mlp": 0.30832708, + "epoch": 0.2670674883511198, + "flos": 28366054277760.0, + "grad_norm": 4.910556405614218, + "language_loss": 0.8046118, + "learning_rate": 3.4388479278689486e-06, + "loss": 0.82595229, + "num_input_tokens_seen": 96016340, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 0.38818359, + "step": 4442, + "time_per_iteration": 2.736863613128662 + }, + { + "auxiliary_loss_clip": 0.01572611, + "auxiliary_loss_mlp": 0.00081311, + "balance_loss_clip": 1.33654523, + "balance_loss_mlp": 0.07134517, + "epoch": 0.2671276116037878, + "flos": 58971319430400.0, + "grad_norm": 77.75508826375645, + "language_loss": 0.6122694, + "learning_rate": 3.4385773917202637e-06, + "loss": 0.62880862, + "num_input_tokens_seen": 96071205, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.09960938, + "step": 4443, + "time_per_iteration": 3.0510714054107666 + }, + { + "auxiliary_loss_clip": 0.01781453, + "auxiliary_loss_mlp": 0.0033858, + "balance_loss_clip": 1.41875398, + "balance_loss_mlp": 0.30079019, + "epoch": 0.26718773485645575, + "flos": 43945072053120.0, + "grad_norm": 957.5126358590235, + "language_loss": 0.81968659, + "learning_rate": 3.4383068010205793e-06, + "loss": 0.84088689, + "num_input_tokens_seen": 96094240, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 0.37768555, + "step": 4444, + "time_per_iteration": 2.847710609436035 + }, + { + "auxiliary_loss_clip": 0.01775713, + "auxiliary_loss_mlp": 0.00362103, + "balance_loss_clip": 1.41267347, + "balance_loss_mlp": 0.32111874, + "epoch": 0.2672478581091237, + "flos": 25228323665280.0, + "grad_norm": 186.15953464878817, + "language_loss": 0.85374832, + "learning_rate": 3.438036155780158e-06, + "loss": 0.87512648, + "num_input_tokens_seen": 96114105, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 0.40966797, + "step": 4445, + "time_per_iteration": 2.6781675815582275 + }, + { + "auxiliary_loss_clip": 0.01790274, + "auxiliary_loss_mlp": 0.0034569, + "balance_loss_clip": 1.42550921, + "balance_loss_mlp": 0.30835325, + "epoch": 0.2673079813617917, + "flos": 15268176455040.0, + "grad_norm": 4.288001138192591, + "language_loss": 0.96481597, + "learning_rate": 3.43776545600926e-06, + "loss": 0.9861756, + "num_input_tokens_seen": 96132140, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 0.37329102, + "step": 4446, + "time_per_iteration": 2.606088399887085 + }, + { + "auxiliary_loss_clip": 0.0178934, + "auxiliary_loss_mlp": 0.0034773, + "balance_loss_clip": 1.42279673, + "balance_loss_mlp": 0.31199145, + "epoch": 0.26736810461445965, + "flos": 25812733944960.0, + "grad_norm": 21.588094182953714, + "language_loss": 0.73234665, + "learning_rate": 3.437494701718153e-06, + "loss": 0.7537173, + "num_input_tokens_seen": 96152090, + "router_z_loss_clip": 3.6640625, + "router_z_loss_mlp": 0.35717773, + "step": 4447, + "time_per_iteration": 2.746830940246582 + }, + { + "auxiliary_loss_clip": 0.01802767, + "auxiliary_loss_mlp": 0.00328992, + "balance_loss_clip": 1.43480897, + "balance_loss_mlp": 0.29227582, + "epoch": 0.2674282278671276, + "flos": 24312709054080.0, + "grad_norm": 9.008351583015127, + "language_loss": 0.8982963, + "learning_rate": 3.4372238929171026e-06, + "loss": 0.9196139, + "num_input_tokens_seen": 96170015, + "router_z_loss_clip": 3.68359375, + "router_z_loss_mlp": 0.3671875, + "step": 4448, + "time_per_iteration": 4.102078437805176 + }, + { + "auxiliary_loss_clip": 0.01782475, + "auxiliary_loss_mlp": 0.00327441, + "balance_loss_clip": 1.42072201, + "balance_loss_mlp": 0.29165423, + "epoch": 0.2674883511197956, + "flos": 22815521337600.0, + "grad_norm": 39.76491722339182, + "language_loss": 0.90724409, + "learning_rate": 3.436953029616378e-06, + "loss": 0.9283433, + "num_input_tokens_seen": 96188065, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 0.35766602, + "step": 4449, + "time_per_iteration": 2.7006447315216064 + }, + { + "auxiliary_loss_clip": 0.01771308, + "auxiliary_loss_mlp": 0.00349603, + "balance_loss_clip": 1.40084016, + "balance_loss_mlp": 0.31126535, + "epoch": 0.26754847437246354, + "flos": 25370170473600.0, + "grad_norm": 14.33238402684806, + "language_loss": 0.9034735, + "learning_rate": 3.4366821118262506e-06, + "loss": 0.92468262, + "num_input_tokens_seen": 96205780, + "router_z_loss_clip": 3.703125, + "router_z_loss_mlp": 0.38330078, + "step": 4450, + "time_per_iteration": 2.6777524948120117 + }, + { + "auxiliary_loss_clip": 0.01766435, + "auxiliary_loss_mlp": 0.00320998, + "balance_loss_clip": 1.41061568, + "balance_loss_mlp": 0.28385222, + "epoch": 0.2676085976251315, + "flos": 20230420446720.0, + "grad_norm": 1.9881956311920828, + "language_loss": 0.8626948, + "learning_rate": 3.4364111395569937e-06, + "loss": 0.88356912, + "num_input_tokens_seen": 96224990, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 0.37109375, + "step": 4451, + "time_per_iteration": 2.695317506790161 + }, + { + "auxiliary_loss_clip": 0.01785034, + "auxiliary_loss_mlp": 0.00314527, + "balance_loss_clip": 1.42931795, + "balance_loss_mlp": 0.27673724, + "epoch": 0.26766872087779947, + "flos": 28038225824640.0, + "grad_norm": 2.8883458846529817, + "language_loss": 0.92019361, + "learning_rate": 3.436140112818882e-06, + "loss": 0.94118923, + "num_input_tokens_seen": 96245345, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 0.37817383, + "step": 4452, + "time_per_iteration": 4.181818962097168 + }, + { + "auxiliary_loss_clip": 0.01750966, + "auxiliary_loss_mlp": 0.00304844, + "balance_loss_clip": 1.40393758, + "balance_loss_mlp": 0.26943928, + "epoch": 0.26772884413046744, + "flos": 18325179250560.0, + "grad_norm": 66.5727138171409, + "language_loss": 0.92458737, + "learning_rate": 3.435869031622194e-06, + "loss": 0.94514549, + "num_input_tokens_seen": 96259000, + "router_z_loss_clip": 3.46679688, + "router_z_loss_mlp": 0.35400391, + "step": 4453, + "time_per_iteration": 2.6070289611816406 + }, + { + "auxiliary_loss_clip": 0.01780198, + "auxiliary_loss_mlp": 0.00305377, + "balance_loss_clip": 1.41953754, + "balance_loss_mlp": 0.26878011, + "epoch": 0.2677889673831354, + "flos": 22127509255680.0, + "grad_norm": 6.405915311322177, + "language_loss": 0.8398279, + "learning_rate": 3.435597895977208e-06, + "loss": 0.86068368, + "num_input_tokens_seen": 96277000, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 0.3659668, + "step": 4454, + "time_per_iteration": 2.6772449016571045 + }, + { + "auxiliary_loss_clip": 0.01761056, + "auxiliary_loss_mlp": 0.0031654, + "balance_loss_clip": 1.40583742, + "balance_loss_mlp": 0.27941856, + "epoch": 0.2678490906358034, + "flos": 23729699404800.0, + "grad_norm": 233.98706347356833, + "language_loss": 0.77264929, + "learning_rate": 3.435326705894206e-06, + "loss": 0.79342526, + "num_input_tokens_seen": 96297010, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 0.37133789, + "step": 4455, + "time_per_iteration": 4.074885845184326 + }, + { + "auxiliary_loss_clip": 0.01761384, + "auxiliary_loss_mlp": 0.00285488, + "balance_loss_clip": 1.41740203, + "balance_loss_mlp": 0.2490097, + "epoch": 0.2679092138884714, + "flos": 21762872340480.0, + "grad_norm": 5.017561767758398, + "language_loss": 0.79117537, + "learning_rate": 3.435055461383471e-06, + "loss": 0.81164408, + "num_input_tokens_seen": 96315780, + "router_z_loss_clip": 3.44335938, + "router_z_loss_mlp": 0.36474609, + "step": 4456, + "time_per_iteration": 2.6519463062286377 + }, + { + "auxiliary_loss_clip": 0.01770818, + "auxiliary_loss_mlp": 0.00333291, + "balance_loss_clip": 1.4104718, + "balance_loss_mlp": 0.29612172, + "epoch": 0.26796933714113935, + "flos": 19861186590720.0, + "grad_norm": 22.019987963505788, + "language_loss": 0.772807, + "learning_rate": 3.4347841624552896e-06, + "loss": 0.7938481, + "num_input_tokens_seen": 96333465, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 0.37133789, + "step": 4457, + "time_per_iteration": 2.646531105041504 + }, + { + "auxiliary_loss_clip": 0.01784807, + "auxiliary_loss_mlp": 0.00320674, + "balance_loss_clip": 1.41868258, + "balance_loss_mlp": 0.28255093, + "epoch": 0.2680294603938073, + "flos": 20047886507520.0, + "grad_norm": 16.210136382448397, + "language_loss": 0.85734677, + "learning_rate": 3.4345128091199493e-06, + "loss": 0.87840158, + "num_input_tokens_seen": 96352005, + "router_z_loss_clip": 3.66015625, + "router_z_loss_mlp": 0.3815918, + "step": 4458, + "time_per_iteration": 2.7151293754577637 + }, + { + "auxiliary_loss_clip": 0.01662258, + "auxiliary_loss_mlp": 0.00054031, + "balance_loss_clip": 1.42428493, + "balance_loss_mlp": 0.04253971, + "epoch": 0.2680895836464753, + "flos": 72113763052800.0, + "grad_norm": 0.8319328967421279, + "language_loss": 0.58521545, + "learning_rate": 3.434241401387739e-06, + "loss": 0.60237837, + "num_input_tokens_seen": 96406265, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.11474609, + "step": 4459, + "time_per_iteration": 3.1260011196136475 + }, + { + "auxiliary_loss_clip": 0.01768002, + "auxiliary_loss_mlp": 0.00295442, + "balance_loss_clip": 1.41497231, + "balance_loss_mlp": 0.25801033, + "epoch": 0.26814970689914325, + "flos": 20449044576000.0, + "grad_norm": 18.251251138733224, + "language_loss": 0.91461968, + "learning_rate": 3.4339699392689507e-06, + "loss": 0.9352541, + "num_input_tokens_seen": 96425225, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 0.37451172, + "step": 4460, + "time_per_iteration": 2.6385910511016846 + }, + { + "auxiliary_loss_clip": 0.01752205, + "auxiliary_loss_mlp": 0.00282046, + "balance_loss_clip": 1.40861535, + "balance_loss_mlp": 0.24640219, + "epoch": 0.2682098301518112, + "flos": 17566674727680.0, + "grad_norm": 38.99845780083298, + "language_loss": 0.76697206, + "learning_rate": 3.4336984227738796e-06, + "loss": 0.78731453, + "num_input_tokens_seen": 96443780, + "router_z_loss_clip": 3.43554688, + "router_z_loss_mlp": 0.35644531, + "step": 4461, + "time_per_iteration": 2.630237102508545 + }, + { + "auxiliary_loss_clip": 0.0176419, + "auxiliary_loss_mlp": 0.0030782, + "balance_loss_clip": 1.41325641, + "balance_loss_mlp": 0.27291578, + "epoch": 0.2682699534044792, + "flos": 18333259810560.0, + "grad_norm": 2.540563100055694, + "language_loss": 0.74809861, + "learning_rate": 3.43342685191282e-06, + "loss": 0.76881874, + "num_input_tokens_seen": 96464530, + "router_z_loss_clip": 3.50976562, + "router_z_loss_mlp": 0.34912109, + "step": 4462, + "time_per_iteration": 2.671095132827759 + }, + { + "auxiliary_loss_clip": 0.01781115, + "auxiliary_loss_mlp": 0.00299002, + "balance_loss_clip": 1.42416835, + "balance_loss_mlp": 0.26123697, + "epoch": 0.26833007665714714, + "flos": 25301294144640.0, + "grad_norm": 11.784904295892973, + "language_loss": 0.7622776, + "learning_rate": 3.4331552266960705e-06, + "loss": 0.78307879, + "num_input_tokens_seen": 96483345, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 0.37768555, + "step": 4463, + "time_per_iteration": 2.843959331512451 + }, + { + "auxiliary_loss_clip": 0.0176902, + "auxiliary_loss_mlp": 0.003234, + "balance_loss_clip": 1.41163754, + "balance_loss_mlp": 0.28429893, + "epoch": 0.2683901999098151, + "flos": 16099759198080.0, + "grad_norm": 14.257168781003871, + "language_loss": 0.85564226, + "learning_rate": 3.432883547133931e-06, + "loss": 0.87656647, + "num_input_tokens_seen": 96498305, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 0.39086914, + "step": 4464, + "time_per_iteration": 4.185925245285034 + }, + { + "auxiliary_loss_clip": 0.01756228, + "auxiliary_loss_mlp": 0.00306774, + "balance_loss_clip": 1.40784335, + "balance_loss_mlp": 0.27153599, + "epoch": 0.2684503231624831, + "flos": 27308054154240.0, + "grad_norm": 102.28070255080105, + "language_loss": 0.77004272, + "learning_rate": 3.432611813236704e-06, + "loss": 0.79067278, + "num_input_tokens_seen": 96519740, + "router_z_loss_clip": 3.484375, + "router_z_loss_mlp": 0.3527832, + "step": 4465, + "time_per_iteration": 2.710642099380493 + }, + { + "auxiliary_loss_clip": 0.01637127, + "auxiliary_loss_mlp": 0.00074215, + "balance_loss_clip": 1.41617489, + "balance_loss_mlp": 0.06424913, + "epoch": 0.26851044641515104, + "flos": 71858007239040.0, + "grad_norm": 0.6763758809177138, + "language_loss": 0.52398598, + "learning_rate": 3.4323400250146943e-06, + "loss": 0.54109943, + "num_input_tokens_seen": 96588870, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.09960938, + "step": 4466, + "time_per_iteration": 3.2841269969940186 + }, + { + "auxiliary_loss_clip": 0.01739963, + "auxiliary_loss_mlp": 0.00330925, + "balance_loss_clip": 1.40518951, + "balance_loss_mlp": 0.29375508, + "epoch": 0.268570569667819, + "flos": 18733771434240.0, + "grad_norm": 6626.189657638971, + "language_loss": 0.79805493, + "learning_rate": 3.4320681824782057e-06, + "loss": 0.81876385, + "num_input_tokens_seen": 96605100, + "router_z_loss_clip": 3.3515625, + "router_z_loss_mlp": 0.37182617, + "step": 4467, + "time_per_iteration": 2.6106374263763428 + }, + { + "auxiliary_loss_clip": 0.01767249, + "auxiliary_loss_mlp": 0.00324373, + "balance_loss_clip": 1.41641057, + "balance_loss_mlp": 0.28815705, + "epoch": 0.268630692920487, + "flos": 18178376365440.0, + "grad_norm": 12.602406983964583, + "language_loss": 0.89848471, + "learning_rate": 3.4317962856375493e-06, + "loss": 0.91940087, + "num_input_tokens_seen": 96621410, + "router_z_loss_clip": 3.5078125, + "router_z_loss_mlp": 0.36181641, + "step": 4468, + "time_per_iteration": 2.6108169555664062 + }, + { + "auxiliary_loss_clip": 0.01607251, + "auxiliary_loss_mlp": 0.00068945, + "balance_loss_clip": 1.38639426, + "balance_loss_mlp": 0.05912209, + "epoch": 0.268690816173155, + "flos": 68731768978560.0, + "grad_norm": 0.8441198338643263, + "language_loss": 0.59129828, + "learning_rate": 3.4315243345030334e-06, + "loss": 0.60806024, + "num_input_tokens_seen": 96684810, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.09814453, + "step": 4469, + "time_per_iteration": 3.2028920650482178 + }, + { + "auxiliary_loss_clip": 0.01735918, + "auxiliary_loss_mlp": 0.00330745, + "balance_loss_clip": 1.39865398, + "balance_loss_mlp": 0.29333708, + "epoch": 0.26875093942582295, + "flos": 23293636295040.0, + "grad_norm": 10.924709919227064, + "language_loss": 0.86566275, + "learning_rate": 3.431252329084972e-06, + "loss": 0.88632941, + "num_input_tokens_seen": 96701920, + "router_z_loss_clip": 3.37695312, + "router_z_loss_mlp": 0.37426758, + "step": 4470, + "time_per_iteration": 2.6719727516174316 + }, + { + "auxiliary_loss_clip": 0.01721326, + "auxiliary_loss_mlp": 0.00301509, + "balance_loss_clip": 1.39632893, + "balance_loss_mlp": 0.26569846, + "epoch": 0.2688110626784909, + "flos": 21543458112000.0, + "grad_norm": 4.803991781136725, + "language_loss": 0.87866032, + "learning_rate": 3.4309802693936786e-06, + "loss": 0.89888871, + "num_input_tokens_seen": 96721260, + "router_z_loss_clip": 3.24804688, + "router_z_loss_mlp": 0.3581543, + "step": 4471, + "time_per_iteration": 2.7073724269866943 + }, + { + "auxiliary_loss_clip": 0.01706195, + "auxiliary_loss_mlp": 0.00298573, + "balance_loss_clip": 1.38725162, + "balance_loss_mlp": 0.26431227, + "epoch": 0.2688711859311589, + "flos": 28400600183040.0, + "grad_norm": 4.156181291209035, + "language_loss": 0.78360713, + "learning_rate": 3.43070815543947e-06, + "loss": 0.80365479, + "num_input_tokens_seen": 96740385, + "router_z_loss_clip": 3.19140625, + "router_z_loss_mlp": 0.3425293, + "step": 4472, + "time_per_iteration": 2.676076650619507 + }, + { + "auxiliary_loss_clip": 0.01706889, + "auxiliary_loss_mlp": 0.00316085, + "balance_loss_clip": 1.38579977, + "balance_loss_mlp": 0.2809898, + "epoch": 0.26893130918382685, + "flos": 25994944661760.0, + "grad_norm": 12.349018296914668, + "language_loss": 0.73826396, + "learning_rate": 3.4304359872326656e-06, + "loss": 0.75849366, + "num_input_tokens_seen": 96761860, + "router_z_loss_clip": 3.20898438, + "router_z_loss_mlp": 0.35131836, + "step": 4473, + "time_per_iteration": 2.6946754455566406 + }, + { + "auxiliary_loss_clip": 0.01694648, + "auxiliary_loss_mlp": 0.00276977, + "balance_loss_clip": 1.37541938, + "balance_loss_mlp": 0.24629208, + "epoch": 0.2689914324364948, + "flos": 20339624770560.0, + "grad_norm": 34.630055758193194, + "language_loss": 0.89245272, + "learning_rate": 3.4301637647835843e-06, + "loss": 0.91216898, + "num_input_tokens_seen": 96781890, + "router_z_loss_clip": 3.19335938, + "router_z_loss_mlp": 0.30664062, + "step": 4474, + "time_per_iteration": 2.6751511096954346 + }, + { + "auxiliary_loss_clip": 0.01711676, + "auxiliary_loss_mlp": 0.00286971, + "balance_loss_clip": 1.39995587, + "balance_loss_mlp": 0.25594082, + "epoch": 0.2690515556891628, + "flos": 19464553635840.0, + "grad_norm": 16.903318376535246, + "language_loss": 0.76115656, + "learning_rate": 3.4298914881025494e-06, + "loss": 0.78114307, + "num_input_tokens_seen": 96800390, + "router_z_loss_clip": 3.11914062, + "router_z_loss_mlp": 0.30993652, + "step": 4475, + "time_per_iteration": 2.6512725353240967 + }, + { + "auxiliary_loss_clip": 0.01684704, + "auxiliary_loss_mlp": 0.00286093, + "balance_loss_clip": 1.37024283, + "balance_loss_mlp": 0.2488279, + "epoch": 0.26911167894183075, + "flos": 18146631720960.0, + "grad_norm": 20.100159250195027, + "language_loss": 0.79686952, + "learning_rate": 3.4296191571998863e-06, + "loss": 0.81657743, + "num_input_tokens_seen": 96816685, + "router_z_loss_clip": 3.14257812, + "router_z_loss_mlp": 0.37255859, + "step": 4476, + "time_per_iteration": 2.624776601791382 + }, + { + "auxiliary_loss_clip": 0.01696074, + "auxiliary_loss_mlp": 0.00303159, + "balance_loss_clip": 1.38118839, + "balance_loss_mlp": 0.26966119, + "epoch": 0.2691718021944987, + "flos": 19975131509760.0, + "grad_norm": 23.78184398543077, + "language_loss": 0.85709059, + "learning_rate": 3.429346772085922e-06, + "loss": 0.87708294, + "num_input_tokens_seen": 96836285, + "router_z_loss_clip": 3.15039062, + "router_z_loss_mlp": 0.33496094, + "step": 4477, + "time_per_iteration": 2.6755316257476807 + }, + { + "auxiliary_loss_clip": 0.01698122, + "auxiliary_loss_mlp": 0.00288514, + "balance_loss_clip": 1.37971854, + "balance_loss_mlp": 0.25229815, + "epoch": 0.2692319254471667, + "flos": 37447215770880.0, + "grad_norm": 181.54917570871226, + "language_loss": 0.73722064, + "learning_rate": 3.429074332770984e-06, + "loss": 0.75708699, + "num_input_tokens_seen": 96857745, + "router_z_loss_clip": 3.18164062, + "router_z_loss_mlp": 0.36230469, + "step": 4478, + "time_per_iteration": 2.805828094482422 + }, + { + "auxiliary_loss_clip": 0.01682559, + "auxiliary_loss_mlp": 0.00298534, + "balance_loss_clip": 1.37139797, + "balance_loss_mlp": 0.26169872, + "epoch": 0.26929204869983464, + "flos": 22127796564480.0, + "grad_norm": 2.5756406746945992, + "language_loss": 0.87561667, + "learning_rate": 3.4288018392654047e-06, + "loss": 0.89542758, + "num_input_tokens_seen": 96877295, + "router_z_loss_clip": 3.11328125, + "router_z_loss_mlp": 0.36791992, + "step": 4479, + "time_per_iteration": 2.6652278900146484 + }, + { + "auxiliary_loss_clip": 0.01704959, + "auxiliary_loss_mlp": 0.00295705, + "balance_loss_clip": 1.38935804, + "balance_loss_mlp": 0.26084787, + "epoch": 0.2693521719525026, + "flos": 19792813052160.0, + "grad_norm": 6.56392758113658, + "language_loss": 0.86834037, + "learning_rate": 3.4285292915795166e-06, + "loss": 0.88834697, + "num_input_tokens_seen": 96896160, + "router_z_loss_clip": 3.15625, + "router_z_loss_mlp": 0.34814453, + "step": 4480, + "time_per_iteration": 2.666550636291504 + }, + { + "auxiliary_loss_clip": 0.0168871, + "auxiliary_loss_mlp": 0.0028692, + "balance_loss_clip": 1.37468195, + "balance_loss_mlp": 0.252635, + "epoch": 0.2694122952051706, + "flos": 20994383836800.0, + "grad_norm": 58.51533988753343, + "language_loss": 0.82221937, + "learning_rate": 3.4282566897236543e-06, + "loss": 0.84197569, + "num_input_tokens_seen": 96915410, + "router_z_loss_clip": 3.140625, + "router_z_loss_mlp": 0.3425293, + "step": 4481, + "time_per_iteration": 2.644855499267578 + }, + { + "auxiliary_loss_clip": 0.0169266, + "auxiliary_loss_mlp": 0.00294961, + "balance_loss_clip": 1.38109052, + "balance_loss_mlp": 0.25936559, + "epoch": 0.2694724184578386, + "flos": 25849291011840.0, + "grad_norm": 20.60393661477104, + "language_loss": 0.80874395, + "learning_rate": 3.4279840337081547e-06, + "loss": 0.8286202, + "num_input_tokens_seen": 96937865, + "router_z_loss_clip": 3.11523438, + "router_z_loss_mlp": 0.35571289, + "step": 4482, + "time_per_iteration": 2.749721050262451 + }, + { + "auxiliary_loss_clip": 0.01681001, + "auxiliary_loss_mlp": 0.00286486, + "balance_loss_clip": 1.37541044, + "balance_loss_mlp": 0.25007966, + "epoch": 0.26953254171050656, + "flos": 21726961718400.0, + "grad_norm": 2.938147658778023, + "language_loss": 0.80795681, + "learning_rate": 3.4277113235433584e-06, + "loss": 0.82763165, + "num_input_tokens_seen": 96957710, + "router_z_loss_clip": 3.06054688, + "router_z_loss_mlp": 0.36401367, + "step": 4483, + "time_per_iteration": 2.6267523765563965 + }, + { + "auxiliary_loss_clip": 0.01706408, + "auxiliary_loss_mlp": 0.00308111, + "balance_loss_clip": 1.38593841, + "balance_loss_mlp": 0.27137083, + "epoch": 0.2695926649631745, + "flos": 19682926369920.0, + "grad_norm": 7.157626379297422, + "language_loss": 0.94431788, + "learning_rate": 3.427438559239605e-06, + "loss": 0.96446306, + "num_input_tokens_seen": 96975890, + "router_z_loss_clip": 3.203125, + "router_z_loss_mlp": 0.36743164, + "step": 4484, + "time_per_iteration": 2.6421194076538086 + }, + { + "auxiliary_loss_clip": 0.01659697, + "auxiliary_loss_mlp": 0.00270112, + "balance_loss_clip": 1.35131443, + "balance_loss_mlp": 0.23635209, + "epoch": 0.2696527882158425, + "flos": 32886596724480.0, + "grad_norm": 4.832069808662705, + "language_loss": 0.72256792, + "learning_rate": 3.427165740807239e-06, + "loss": 0.74186599, + "num_input_tokens_seen": 96998595, + "router_z_loss_clip": 3.08203125, + "router_z_loss_mlp": 0.33789062, + "step": 4485, + "time_per_iteration": 2.741579294204712 + }, + { + "auxiliary_loss_clip": 0.01677425, + "auxiliary_loss_mlp": 0.00299118, + "balance_loss_clip": 1.36963844, + "balance_loss_mlp": 0.26392734, + "epoch": 0.26971291146851045, + "flos": 12124843320960.0, + "grad_norm": 29.566356653867278, + "language_loss": 0.81364131, + "learning_rate": 3.426892868256604e-06, + "loss": 0.83340681, + "num_input_tokens_seen": 97013715, + "router_z_loss_clip": 3.07617188, + "router_z_loss_mlp": 0.35205078, + "step": 4486, + "time_per_iteration": 2.5904295444488525 + }, + { + "auxiliary_loss_clip": 0.01690448, + "auxiliary_loss_mlp": 0.00274593, + "balance_loss_clip": 1.38178396, + "balance_loss_mlp": 0.23880598, + "epoch": 0.2697730347211784, + "flos": 22634459856000.0, + "grad_norm": 35.99737293295793, + "language_loss": 0.91206032, + "learning_rate": 3.4266199415980495e-06, + "loss": 0.93171072, + "num_input_tokens_seen": 97031570, + "router_z_loss_clip": 3.08789062, + "router_z_loss_mlp": 0.35791016, + "step": 4487, + "time_per_iteration": 2.6221632957458496 + }, + { + "auxiliary_loss_clip": 0.01733248, + "auxiliary_loss_mlp": 0.0027786, + "balance_loss_clip": 1.40866137, + "balance_loss_mlp": 0.24328962, + "epoch": 0.2698331579738464, + "flos": 23513050523520.0, + "grad_norm": 28.78178003156099, + "language_loss": 0.81006193, + "learning_rate": 3.4263469608419234e-06, + "loss": 0.83017302, + "num_input_tokens_seen": 97049815, + "router_z_loss_clip": 3.24804688, + "router_z_loss_mlp": 0.34545898, + "step": 4488, + "time_per_iteration": 2.633152484893799 + }, + { + "auxiliary_loss_clip": 0.01676429, + "auxiliary_loss_mlp": 0.00288868, + "balance_loss_clip": 1.37065637, + "balance_loss_mlp": 0.25091141, + "epoch": 0.26989328122651435, + "flos": 24641040297600.0, + "grad_norm": 18.437308599081437, + "language_loss": 0.88005269, + "learning_rate": 3.426073925998578e-06, + "loss": 0.89970565, + "num_input_tokens_seen": 97067570, + "router_z_loss_clip": 3.05664062, + "router_z_loss_mlp": 0.37963867, + "step": 4489, + "time_per_iteration": 2.656083822250366 + }, + { + "auxiliary_loss_clip": 0.01667246, + "auxiliary_loss_mlp": 0.00273887, + "balance_loss_clip": 1.35664487, + "balance_loss_mlp": 0.23900652, + "epoch": 0.2699534044791823, + "flos": 10772555068800.0, + "grad_norm": 26.817061084326124, + "language_loss": 0.99627924, + "learning_rate": 3.4258008370783656e-06, + "loss": 1.01569057, + "num_input_tokens_seen": 97082180, + "router_z_loss_clip": 3.10546875, + "router_z_loss_mlp": 0.34887695, + "step": 4490, + "time_per_iteration": 2.599663496017456 + }, + { + "auxiliary_loss_clip": 0.01681697, + "auxiliary_loss_mlp": 0.00282812, + "balance_loss_clip": 1.37665784, + "balance_loss_mlp": 0.24812186, + "epoch": 0.2700135277318503, + "flos": 36171597098880.0, + "grad_norm": 24.995421529669034, + "language_loss": 0.78603721, + "learning_rate": 3.4255276940916434e-06, + "loss": 0.8056823, + "num_input_tokens_seen": 97103470, + "router_z_loss_clip": 3.04882812, + "router_z_loss_mlp": 0.34692383, + "step": 4491, + "time_per_iteration": 4.181100130081177 + }, + { + "auxiliary_loss_clip": 0.01684412, + "auxiliary_loss_mlp": 0.00260792, + "balance_loss_clip": 1.37476671, + "balance_loss_mlp": 0.22834361, + "epoch": 0.27007365098451824, + "flos": 17418614866560.0, + "grad_norm": 53.92831361287904, + "language_loss": 0.82157719, + "learning_rate": 3.4252544970487676e-06, + "loss": 0.84102923, + "num_input_tokens_seen": 97118100, + "router_z_loss_clip": 3.09570312, + "router_z_loss_mlp": 0.32421875, + "step": 4492, + "time_per_iteration": 2.5942978858947754 + }, + { + "auxiliary_loss_clip": 0.01656345, + "auxiliary_loss_mlp": 0.00252656, + "balance_loss_clip": 1.35415673, + "balance_loss_mlp": 0.21937284, + "epoch": 0.2701337742371862, + "flos": 23185688947200.0, + "grad_norm": 2.529860711929969, + "language_loss": 0.9497028, + "learning_rate": 3.4249812459600986e-06, + "loss": 0.9687928, + "num_input_tokens_seen": 97136765, + "router_z_loss_clip": 3.0234375, + "router_z_loss_mlp": 0.33276367, + "step": 4493, + "time_per_iteration": 2.639547348022461 + }, + { + "auxiliary_loss_clip": 0.01682025, + "auxiliary_loss_mlp": 0.00268539, + "balance_loss_clip": 1.37612128, + "balance_loss_mlp": 0.23451675, + "epoch": 0.2701938974898542, + "flos": 24389450461440.0, + "grad_norm": 1.9574098045686603, + "language_loss": 0.76822555, + "learning_rate": 3.424707940835998e-06, + "loss": 0.78773123, + "num_input_tokens_seen": 97157470, + "router_z_loss_clip": 3.05859375, + "router_z_loss_mlp": 0.34008789, + "step": 4494, + "time_per_iteration": 4.083948135375977 + }, + { + "auxiliary_loss_clip": 0.01675873, + "auxiliary_loss_mlp": 0.00268968, + "balance_loss_clip": 1.37010264, + "balance_loss_mlp": 0.23415874, + "epoch": 0.2702540207425222, + "flos": 26214322976640.0, + "grad_norm": 4.0136051907628385, + "language_loss": 0.92234242, + "learning_rate": 3.42443458168683e-06, + "loss": 0.94179088, + "num_input_tokens_seen": 97176905, + "router_z_loss_clip": 3.06054688, + "router_z_loss_mlp": 0.34838867, + "step": 4495, + "time_per_iteration": 2.6980011463165283 + }, + { + "auxiliary_loss_clip": 0.01680658, + "auxiliary_loss_mlp": 0.00302762, + "balance_loss_clip": 1.36451626, + "balance_loss_mlp": 0.26716647, + "epoch": 0.27031414399519016, + "flos": 22926377687040.0, + "grad_norm": 7.201571547512301, + "language_loss": 0.82061613, + "learning_rate": 3.424161168522959e-06, + "loss": 0.84045029, + "num_input_tokens_seen": 97196380, + "router_z_loss_clip": 3.16601562, + "router_z_loss_mlp": 0.35620117, + "step": 4496, + "time_per_iteration": 2.64264178276062 + }, + { + "auxiliary_loss_clip": 0.01565977, + "auxiliary_loss_mlp": 0.00054981, + "balance_loss_clip": 1.31791544, + "balance_loss_mlp": 0.04029403, + "epoch": 0.2703742672478581, + "flos": 63019780404480.0, + "grad_norm": 0.6655178149962462, + "language_loss": 0.49771115, + "learning_rate": 3.423887701354754e-06, + "loss": 0.51392072, + "num_input_tokens_seen": 97260100, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.14648438, + "step": 4497, + "time_per_iteration": 4.574505805969238 + }, + { + "auxiliary_loss_clip": 0.01667637, + "auxiliary_loss_mlp": 0.00253031, + "balance_loss_clip": 1.36435997, + "balance_loss_mlp": 0.2204397, + "epoch": 0.2704343905005261, + "flos": 18840820942080.0, + "grad_norm": 14.128998143477038, + "language_loss": 0.79037404, + "learning_rate": 3.4236141801925847e-06, + "loss": 0.80958074, + "num_input_tokens_seen": 97277935, + "router_z_loss_clip": 3.03515625, + "router_z_loss_mlp": 0.32568359, + "step": 4498, + "time_per_iteration": 2.6200194358825684 + }, + { + "auxiliary_loss_clip": 0.01581378, + "auxiliary_loss_mlp": 0.00076427, + "balance_loss_clip": 1.33077693, + "balance_loss_mlp": 0.06693798, + "epoch": 0.27049451375319405, + "flos": 71233412618880.0, + "grad_norm": 0.7788683687025697, + "language_loss": 0.59258878, + "learning_rate": 3.4233406050468237e-06, + "loss": 0.60916686, + "num_input_tokens_seen": 97338845, + "router_z_loss_clip": 2.5, + "router_z_loss_mlp": 0.09472656, + "step": 4499, + "time_per_iteration": 3.1540472507476807 + }, + { + "auxiliary_loss_clip": 0.01663261, + "auxiliary_loss_mlp": 0.00277904, + "balance_loss_clip": 1.36053824, + "balance_loss_mlp": 0.243524, + "epoch": 0.270554637005862, + "flos": 24278594112000.0, + "grad_norm": 10.72484320307758, + "language_loss": 0.79905742, + "learning_rate": 3.4230669759278438e-06, + "loss": 0.81846905, + "num_input_tokens_seen": 97356640, + "router_z_loss_clip": 3.02929688, + "router_z_loss_mlp": 0.34399414, + "step": 4500, + "time_per_iteration": 2.662043809890747 + }, + { + "auxiliary_loss_clip": 0.01636725, + "auxiliary_loss_mlp": 0.00246854, + "balance_loss_clip": 1.3433311, + "balance_loss_mlp": 0.21292754, + "epoch": 0.27061476025853, + "flos": 17632318832640.0, + "grad_norm": 3.5457561509121778, + "language_loss": 0.89406443, + "learning_rate": 3.4227932928460215e-06, + "loss": 0.91290021, + "num_input_tokens_seen": 97372585, + "router_z_loss_clip": 2.9375, + "router_z_loss_mlp": 0.33911133, + "step": 4501, + "time_per_iteration": 2.6114330291748047 + }, + { + "auxiliary_loss_clip": 0.01662413, + "auxiliary_loss_mlp": 0.00317017, + "balance_loss_clip": 1.34897351, + "balance_loss_mlp": 0.27910829, + "epoch": 0.27067488351119795, + "flos": 22710123855360.0, + "grad_norm": 3.587713853830485, + "language_loss": 0.78681165, + "learning_rate": 3.422519555811735e-06, + "loss": 0.80660594, + "num_input_tokens_seen": 97393315, + "router_z_loss_clip": 3.13671875, + "router_z_loss_mlp": 0.37915039, + "step": 4502, + "time_per_iteration": 2.688307523727417 + }, + { + "auxiliary_loss_clip": 0.01638532, + "auxiliary_loss_mlp": 0.002648, + "balance_loss_clip": 1.33166909, + "balance_loss_mlp": 0.22932306, + "epoch": 0.2707350067638659, + "flos": 41719616087040.0, + "grad_norm": 41.73281732953292, + "language_loss": 0.74053931, + "learning_rate": 3.4222457648353642e-06, + "loss": 0.75957257, + "num_input_tokens_seen": 97417860, + "router_z_loss_clip": 3.06445312, + "router_z_loss_mlp": 0.35473633, + "step": 4503, + "time_per_iteration": 2.889307737350464 + }, + { + "auxiliary_loss_clip": 0.01600911, + "auxiliary_loss_mlp": 0.00284945, + "balance_loss_clip": 1.31221223, + "balance_loss_mlp": 0.2519713, + "epoch": 0.2707951300165339, + "flos": 20193037367040.0, + "grad_norm": 4.631348426877167, + "language_loss": 0.73773015, + "learning_rate": 3.4219719199272918e-06, + "loss": 0.7565887, + "num_input_tokens_seen": 97436780, + "router_z_loss_clip": 2.88476562, + "router_z_loss_mlp": 0.3293457, + "step": 4504, + "time_per_iteration": 2.5923943519592285 + }, + { + "auxiliary_loss_clip": 0.01643696, + "auxiliary_loss_mlp": 0.00276331, + "balance_loss_clip": 1.3393935, + "balance_loss_mlp": 0.24416858, + "epoch": 0.27085525326920185, + "flos": 21433966479360.0, + "grad_norm": 45.758168622599094, + "language_loss": 0.82134485, + "learning_rate": 3.421698021097902e-06, + "loss": 0.84054512, + "num_input_tokens_seen": 97456190, + "router_z_loss_clip": 3.04492188, + "router_z_loss_mlp": 0.32177734, + "step": 4505, + "time_per_iteration": 2.629586696624756 + }, + { + "auxiliary_loss_clip": 0.01618632, + "auxiliary_loss_mlp": 0.0023327, + "balance_loss_clip": 1.32128811, + "balance_loss_mlp": 0.20208445, + "epoch": 0.2709153765218698, + "flos": 17675232606720.0, + "grad_norm": 93.03845038139863, + "language_loss": 0.83357722, + "learning_rate": 3.42142406835758e-06, + "loss": 0.8520962, + "num_input_tokens_seen": 97474545, + "router_z_loss_clip": 2.9765625, + "router_z_loss_mlp": 0.31176758, + "step": 4506, + "time_per_iteration": 4.037301778793335 + }, + { + "auxiliary_loss_clip": 0.01604797, + "auxiliary_loss_mlp": 0.00256585, + "balance_loss_clip": 1.31236601, + "balance_loss_mlp": 0.22220516, + "epoch": 0.2709754997745378, + "flos": 24456243801600.0, + "grad_norm": 241.97278426955253, + "language_loss": 0.87722474, + "learning_rate": 3.421150061716715e-06, + "loss": 0.89583862, + "num_input_tokens_seen": 97494520, + "router_z_loss_clip": 2.92578125, + "router_z_loss_mlp": 0.34350586, + "step": 4507, + "time_per_iteration": 2.6379196643829346 + }, + { + "auxiliary_loss_clip": 0.01489475, + "auxiliary_loss_mlp": 0.00054432, + "balance_loss_clip": 1.23242295, + "balance_loss_mlp": 0.04360763, + "epoch": 0.2710356230272058, + "flos": 65210798206080.0, + "grad_norm": 0.7155295055090518, + "language_loss": 0.50395942, + "learning_rate": 3.420876001185698e-06, + "loss": 0.51939845, + "num_input_tokens_seen": 97552455, + "router_z_loss_clip": 2.5625, + "router_z_loss_mlp": 0.10839844, + "step": 4508, + "time_per_iteration": 3.0530025959014893 + }, + { + "auxiliary_loss_clip": 0.01589289, + "auxiliary_loss_mlp": 0.00201446, + "balance_loss_clip": 1.30300808, + "balance_loss_mlp": 0.16999874, + "epoch": 0.27109574627987376, + "flos": 25484438615040.0, + "grad_norm": 297.20728185365516, + "language_loss": 0.79585135, + "learning_rate": 3.4206018867749197e-06, + "loss": 0.81375867, + "num_input_tokens_seen": 97572650, + "router_z_loss_clip": 2.85742188, + "router_z_loss_mlp": 0.31445312, + "step": 4509, + "time_per_iteration": 2.719332695007324 + }, + { + "auxiliary_loss_clip": 0.01584801, + "auxiliary_loss_mlp": 0.00215445, + "balance_loss_clip": 1.29736161, + "balance_loss_mlp": 0.18313883, + "epoch": 0.2711558695325417, + "flos": 19682782715520.0, + "grad_norm": 9.667674131157119, + "language_loss": 0.76468623, + "learning_rate": 3.4203277184947757e-06, + "loss": 0.78268874, + "num_input_tokens_seen": 97591150, + "router_z_loss_clip": 2.87890625, + "router_z_loss_mlp": 0.32275391, + "step": 4510, + "time_per_iteration": 2.6951777935028076 + }, + { + "auxiliary_loss_clip": 0.01612475, + "auxiliary_loss_mlp": 0.00216543, + "balance_loss_clip": 1.31533873, + "balance_loss_mlp": 0.17899203, + "epoch": 0.2712159927852097, + "flos": 18587758648320.0, + "grad_norm": 32.48281965278007, + "language_loss": 0.82016909, + "learning_rate": 3.4200534963556627e-06, + "loss": 0.83845925, + "num_input_tokens_seen": 97607410, + "router_z_loss_clip": 2.97265625, + "router_z_loss_mlp": 0.37573242, + "step": 4511, + "time_per_iteration": 2.5786476135253906 + }, + { + "auxiliary_loss_clip": 0.01592467, + "auxiliary_loss_mlp": 0.00247218, + "balance_loss_clip": 1.29618835, + "balance_loss_mlp": 0.21472114, + "epoch": 0.27127611603787766, + "flos": 25630235919360.0, + "grad_norm": 279.7785326352359, + "language_loss": 0.89946163, + "learning_rate": 3.419779220367979e-06, + "loss": 0.91785848, + "num_input_tokens_seen": 97626870, + "router_z_loss_clip": 2.96289062, + "router_z_loss_mlp": 0.32495117, + "step": 4512, + "time_per_iteration": 2.6616363525390625 + }, + { + "auxiliary_loss_clip": 0.01577477, + "auxiliary_loss_mlp": 0.00197582, + "balance_loss_clip": 1.28884339, + "balance_loss_mlp": 0.16351163, + "epoch": 0.2713362392905456, + "flos": 23148952312320.0, + "grad_norm": 153.42615274126126, + "language_loss": 0.87076247, + "learning_rate": 3.419504890542124e-06, + "loss": 0.88851309, + "num_input_tokens_seen": 97646595, + "router_z_loss_clip": 2.890625, + "router_z_loss_mlp": 0.34106445, + "step": 4513, + "time_per_iteration": 2.624129295349121 + }, + { + "auxiliary_loss_clip": 0.01585788, + "auxiliary_loss_mlp": 0.00215651, + "balance_loss_clip": 1.29194307, + "balance_loss_mlp": 0.17967391, + "epoch": 0.2713963625432136, + "flos": 18366045949440.0, + "grad_norm": 15.258985277576064, + "language_loss": 0.96288073, + "learning_rate": 3.4192305068885026e-06, + "loss": 0.9808951, + "num_input_tokens_seen": 97665485, + "router_z_loss_clip": 2.94140625, + "router_z_loss_mlp": 0.35961914, + "step": 4514, + "time_per_iteration": 2.645146608352661 + }, + { + "auxiliary_loss_clip": 0.01594056, + "auxiliary_loss_mlp": 0.0021771, + "balance_loss_clip": 1.30066657, + "balance_loss_mlp": 0.18354481, + "epoch": 0.27145648579588155, + "flos": 22491751121280.0, + "grad_norm": 6.454196491718074, + "language_loss": 0.97536933, + "learning_rate": 3.418956069417517e-06, + "loss": 0.993487, + "num_input_tokens_seen": 97683800, + "router_z_loss_clip": 2.93554688, + "router_z_loss_mlp": 0.34204102, + "step": 4515, + "time_per_iteration": 2.650865077972412 + }, + { + "auxiliary_loss_clip": 0.01623932, + "auxiliary_loss_mlp": 0.00233635, + "balance_loss_clip": 1.31792414, + "balance_loss_mlp": 0.19503519, + "epoch": 0.2715166090485495, + "flos": 19239177749760.0, + "grad_norm": 17.985410175775158, + "language_loss": 0.83682394, + "learning_rate": 3.4186815781395756e-06, + "loss": 0.85539961, + "num_input_tokens_seen": 97700505, + "router_z_loss_clip": 3.05859375, + "router_z_loss_mlp": 0.38549805, + "step": 4516, + "time_per_iteration": 2.653933525085449 + }, + { + "auxiliary_loss_clip": 0.01587757, + "auxiliary_loss_mlp": 0.00200057, + "balance_loss_clip": 1.3017118, + "balance_loss_mlp": 0.16343635, + "epoch": 0.2715767323012175, + "flos": 17709598944000.0, + "grad_norm": 15.82888663250305, + "language_loss": 0.83089334, + "learning_rate": 3.4184070330650866e-06, + "loss": 0.84877151, + "num_input_tokens_seen": 97717410, + "router_z_loss_clip": 2.86328125, + "router_z_loss_mlp": 0.36572266, + "step": 4517, + "time_per_iteration": 2.8193275928497314 + }, + { + "auxiliary_loss_clip": 0.01596604, + "auxiliary_loss_mlp": 0.00210912, + "balance_loss_clip": 1.30403149, + "balance_loss_mlp": 0.17340913, + "epoch": 0.27163685555388545, + "flos": 22382834106240.0, + "grad_norm": 106.97815448376242, + "language_loss": 0.8964029, + "learning_rate": 3.4181324342044607e-06, + "loss": 0.91447806, + "num_input_tokens_seen": 97734545, + "router_z_loss_clip": 2.92578125, + "router_z_loss_mlp": 0.37475586, + "step": 4518, + "time_per_iteration": 2.6116461753845215 + }, + { + "auxiliary_loss_clip": 0.01581281, + "auxiliary_loss_mlp": 0.00207656, + "balance_loss_clip": 1.29104161, + "balance_loss_mlp": 0.17067704, + "epoch": 0.2716969788065534, + "flos": 22346708002560.0, + "grad_norm": 7.485683560264491, + "language_loss": 0.76281214, + "learning_rate": 3.41785778156811e-06, + "loss": 0.78070152, + "num_input_tokens_seen": 97754000, + "router_z_loss_clip": 2.8984375, + "router_z_loss_mlp": 0.36987305, + "step": 4519, + "time_per_iteration": 2.678581476211548 + }, + { + "auxiliary_loss_clip": 0.01567893, + "auxiliary_loss_mlp": 0.00206799, + "balance_loss_clip": 1.28632236, + "balance_loss_mlp": 0.17075002, + "epoch": 0.2717571020592214, + "flos": 25228467319680.0, + "grad_norm": 13.208768356617812, + "language_loss": 0.8048104, + "learning_rate": 3.417583075166451e-06, + "loss": 0.82255727, + "num_input_tokens_seen": 97772080, + "router_z_loss_clip": 2.81640625, + "router_z_loss_mlp": 0.36083984, + "step": 4520, + "time_per_iteration": 2.648935317993164 + }, + { + "auxiliary_loss_clip": 0.01598637, + "auxiliary_loss_mlp": 0.00214173, + "balance_loss_clip": 1.30178118, + "balance_loss_mlp": 0.17659777, + "epoch": 0.2718172253118894, + "flos": 20189769229440.0, + "grad_norm": 4.186069540744834, + "language_loss": 0.83829969, + "learning_rate": 3.4173083150099e-06, + "loss": 0.85642779, + "num_input_tokens_seen": 97789370, + "router_z_loss_clip": 2.96875, + "router_z_loss_mlp": 0.3762207, + "step": 4521, + "time_per_iteration": 2.6424646377563477 + }, + { + "auxiliary_loss_clip": 0.01586136, + "auxiliary_loss_mlp": 0.00228377, + "balance_loss_clip": 1.29025292, + "balance_loss_mlp": 0.19259053, + "epoch": 0.27187734856455736, + "flos": 14319129260160.0, + "grad_norm": 14.642421876621789, + "language_loss": 0.86559737, + "learning_rate": 3.417033501108875e-06, + "loss": 0.88374257, + "num_input_tokens_seen": 97807385, + "router_z_loss_clip": 2.95898438, + "router_z_loss_mlp": 0.35791016, + "step": 4522, + "time_per_iteration": 2.652073383331299 + }, + { + "auxiliary_loss_clip": 0.01591828, + "auxiliary_loss_mlp": 0.00226403, + "balance_loss_clip": 1.30702603, + "balance_loss_mlp": 0.18997248, + "epoch": 0.27193747181722533, + "flos": 21107682311040.0, + "grad_norm": 21.90759340674125, + "language_loss": 0.79146367, + "learning_rate": 3.416758633473798e-06, + "loss": 0.80964595, + "num_input_tokens_seen": 97827930, + "router_z_loss_clip": 2.84960938, + "router_z_loss_mlp": 0.36425781, + "step": 4523, + "time_per_iteration": 2.6658334732055664 + }, + { + "auxiliary_loss_clip": 0.01562874, + "auxiliary_loss_mlp": 0.00193414, + "balance_loss_clip": 1.28031731, + "balance_loss_mlp": 0.16060755, + "epoch": 0.2719975950698933, + "flos": 19682782715520.0, + "grad_norm": 12.924650959984083, + "language_loss": 0.80908144, + "learning_rate": 3.4164837121150915e-06, + "loss": 0.82664436, + "num_input_tokens_seen": 97847440, + "router_z_loss_clip": 2.82226562, + "router_z_loss_mlp": 0.32788086, + "step": 4524, + "time_per_iteration": 2.6434154510498047 + }, + { + "auxiliary_loss_clip": 0.01555893, + "auxiliary_loss_mlp": 0.00215255, + "balance_loss_clip": 1.27634561, + "balance_loss_mlp": 0.18085107, + "epoch": 0.27205771832256126, + "flos": 24754482426240.0, + "grad_norm": 12.800930601429771, + "language_loss": 0.8150292, + "learning_rate": 3.4162087370431803e-06, + "loss": 0.83274066, + "num_input_tokens_seen": 97867620, + "router_z_loss_clip": 2.79296875, + "router_z_loss_mlp": 0.34423828, + "step": 4525, + "time_per_iteration": 2.7041895389556885 + }, + { + "auxiliary_loss_clip": 0.01580622, + "auxiliary_loss_mlp": 0.00199854, + "balance_loss_clip": 1.30059242, + "balance_loss_mlp": 0.16418707, + "epoch": 0.2721178415752292, + "flos": 21755581879680.0, + "grad_norm": 25.182707188554623, + "language_loss": 0.8877486, + "learning_rate": 3.4159337082684926e-06, + "loss": 0.90555334, + "num_input_tokens_seen": 97884345, + "router_z_loss_clip": 2.80273438, + "router_z_loss_mlp": 0.35668945, + "step": 4526, + "time_per_iteration": 2.5921270847320557 + }, + { + "auxiliary_loss_clip": 0.01579248, + "auxiliary_loss_mlp": 0.00251242, + "balance_loss_clip": 1.28871465, + "balance_loss_mlp": 0.21495487, + "epoch": 0.2721779648278972, + "flos": 12676826597760.0, + "grad_norm": 27.065362799928746, + "language_loss": 0.87825394, + "learning_rate": 3.4156586258014566e-06, + "loss": 0.89655888, + "num_input_tokens_seen": 97901500, + "router_z_loss_clip": 2.90625, + "router_z_loss_mlp": 0.36303711, + "step": 4527, + "time_per_iteration": 2.6026248931884766 + }, + { + "auxiliary_loss_clip": 0.0156393, + "auxiliary_loss_mlp": 0.00224202, + "balance_loss_clip": 1.28189707, + "balance_loss_mlp": 0.18824852, + "epoch": 0.27223808808056515, + "flos": 16253206099200.0, + "grad_norm": 3.933387745037403, + "language_loss": 0.89046276, + "learning_rate": 3.415383489652503e-06, + "loss": 0.90834403, + "num_input_tokens_seen": 97917800, + "router_z_loss_clip": 2.82226562, + "router_z_loss_mlp": 0.35961914, + "step": 4528, + "time_per_iteration": 2.6785829067230225 + }, + { + "auxiliary_loss_clip": 0.0159149, + "auxiliary_loss_mlp": 0.00203177, + "balance_loss_clip": 1.30633378, + "balance_loss_mlp": 0.16908276, + "epoch": 0.2722982113332331, + "flos": 27745805203200.0, + "grad_norm": 8.53623919862263, + "language_loss": 0.83549362, + "learning_rate": 3.4151082998320666e-06, + "loss": 0.85344028, + "num_input_tokens_seen": 97937225, + "router_z_loss_clip": 2.84765625, + "router_z_loss_mlp": 0.34082031, + "step": 4529, + "time_per_iteration": 2.6636667251586914 + }, + { + "auxiliary_loss_clip": 0.01571559, + "auxiliary_loss_mlp": 0.00229147, + "balance_loss_clip": 1.2881186, + "balance_loss_mlp": 0.19526795, + "epoch": 0.2723583345859011, + "flos": 21726243446400.0, + "grad_norm": 14.76010063563752, + "language_loss": 0.89296508, + "learning_rate": 3.4148330563505805e-06, + "loss": 0.91097206, + "num_input_tokens_seen": 97956845, + "router_z_loss_clip": 2.83789062, + "router_z_loss_mlp": 0.33911133, + "step": 4530, + "time_per_iteration": 2.612440824508667 + }, + { + "auxiliary_loss_clip": 0.01570555, + "auxiliary_loss_mlp": 0.002156, + "balance_loss_clip": 1.28752685, + "balance_loss_mlp": 0.18176824, + "epoch": 0.27241845783856905, + "flos": 17347260499200.0, + "grad_norm": 9.007807897527684, + "language_loss": 0.98103809, + "learning_rate": 3.4145577592184838e-06, + "loss": 0.99889964, + "num_input_tokens_seen": 97972465, + "router_z_loss_clip": 2.828125, + "router_z_loss_mlp": 0.33837891, + "step": 4531, + "time_per_iteration": 2.5635035037994385 + }, + { + "auxiliary_loss_clip": 0.01574109, + "auxiliary_loss_mlp": 0.00253767, + "balance_loss_clip": 1.28733099, + "balance_loss_mlp": 0.22117475, + "epoch": 0.272478581091237, + "flos": 24754302858240.0, + "grad_norm": 4.049562540688933, + "language_loss": 0.82821423, + "learning_rate": 3.4142824084462155e-06, + "loss": 0.84649301, + "num_input_tokens_seen": 97990770, + "router_z_loss_clip": 2.8671875, + "router_z_loss_mlp": 0.32592773, + "step": 4532, + "time_per_iteration": 2.6456568241119385 + }, + { + "auxiliary_loss_clip": 0.01568879, + "auxiliary_loss_mlp": 0.0020716, + "balance_loss_clip": 1.29179931, + "balance_loss_mlp": 0.17549773, + "epoch": 0.272538704343905, + "flos": 17890624512000.0, + "grad_norm": 3.6818997040403305, + "language_loss": 0.9597581, + "learning_rate": 3.4140070040442162e-06, + "loss": 0.97751856, + "num_input_tokens_seen": 98005775, + "router_z_loss_clip": 2.7734375, + "router_z_loss_mlp": 0.31640625, + "step": 4533, + "time_per_iteration": 3.9501445293426514 + }, + { + "auxiliary_loss_clip": 0.01610925, + "auxiliary_loss_mlp": 0.00235166, + "balance_loss_clip": 1.31797981, + "balance_loss_mlp": 0.20159701, + "epoch": 0.272598827596573, + "flos": 22932016122240.0, + "grad_norm": 4.620463960491849, + "language_loss": 0.77031398, + "learning_rate": 3.413731546022929e-06, + "loss": 0.78877485, + "num_input_tokens_seen": 98025750, + "router_z_loss_clip": 2.92773438, + "router_z_loss_mlp": 0.33569336, + "step": 4534, + "time_per_iteration": 2.6633293628692627 + }, + { + "auxiliary_loss_clip": 0.01597553, + "auxiliary_loss_mlp": 0.00253482, + "balance_loss_clip": 1.2975142, + "balance_loss_mlp": 0.21759978, + "epoch": 0.27265895084924097, + "flos": 24238409771520.0, + "grad_norm": 2.4704819573654664, + "language_loss": 0.97477418, + "learning_rate": 3.4134560343928005e-06, + "loss": 0.99328452, + "num_input_tokens_seen": 98044955, + "router_z_loss_clip": 3.0, + "router_z_loss_mlp": 0.35888672, + "step": 4535, + "time_per_iteration": 2.6271297931671143 + }, + { + "auxiliary_loss_clip": 0.01610417, + "auxiliary_loss_mlp": 0.0025068, + "balance_loss_clip": 1.31044078, + "balance_loss_mlp": 0.2171587, + "epoch": 0.27271907410190893, + "flos": 27013155494400.0, + "grad_norm": 3.928089233974998, + "language_loss": 0.78289747, + "learning_rate": 3.4131804691642778e-06, + "loss": 0.80150843, + "num_input_tokens_seen": 98065860, + "router_z_loss_clip": 3.00195312, + "router_z_loss_mlp": 0.33544922, + "step": 4536, + "time_per_iteration": 4.0961244106292725 + }, + { + "auxiliary_loss_clip": 0.01621163, + "auxiliary_loss_mlp": 0.00265989, + "balance_loss_clip": 1.32249188, + "balance_loss_mlp": 0.23089348, + "epoch": 0.2727791973545769, + "flos": 34452588942720.0, + "grad_norm": 4.245097272879593, + "language_loss": 0.78587723, + "learning_rate": 3.41290485034781e-06, + "loss": 0.80474877, + "num_input_tokens_seen": 98085450, + "router_z_loss_clip": 2.984375, + "router_z_loss_mlp": 0.35083008, + "step": 4537, + "time_per_iteration": 2.7355167865753174 + }, + { + "auxiliary_loss_clip": 0.01604007, + "auxiliary_loss_mlp": 0.00258939, + "balance_loss_clip": 1.308429, + "balance_loss_mlp": 0.22532186, + "epoch": 0.27283932060724486, + "flos": 15041723160960.0, + "grad_norm": 7.078910352249782, + "language_loss": 0.84412825, + "learning_rate": 3.4126291779538485e-06, + "loss": 0.86275774, + "num_input_tokens_seen": 98099115, + "router_z_loss_clip": 2.953125, + "router_z_loss_mlp": 0.3359375, + "step": 4538, + "time_per_iteration": 2.684952974319458 + }, + { + "auxiliary_loss_clip": 0.0157595, + "auxiliary_loss_mlp": 0.00243181, + "balance_loss_clip": 1.28280413, + "balance_loss_mlp": 0.21068488, + "epoch": 0.2728994438599128, + "flos": 21652411040640.0, + "grad_norm": 8.696028760497006, + "language_loss": 0.9532699, + "learning_rate": 3.412353451992847e-06, + "loss": 0.97146124, + "num_input_tokens_seen": 98118415, + "router_z_loss_clip": 2.93359375, + "router_z_loss_mlp": 0.32519531, + "step": 4539, + "time_per_iteration": 4.230685710906982 + }, + { + "auxiliary_loss_clip": 0.01565665, + "auxiliary_loss_mlp": 0.00233023, + "balance_loss_clip": 1.27626729, + "balance_loss_mlp": 0.19568679, + "epoch": 0.2729595671125808, + "flos": 17488424949120.0, + "grad_norm": 49.308237499078786, + "language_loss": 0.93591702, + "learning_rate": 3.4120776724752607e-06, + "loss": 0.95390385, + "num_input_tokens_seen": 98136300, + "router_z_loss_clip": 2.89453125, + "router_z_loss_mlp": 0.37329102, + "step": 4540, + "time_per_iteration": 2.659888505935669 + }, + { + "auxiliary_loss_clip": 0.01555021, + "auxiliary_loss_mlp": 0.00246336, + "balance_loss_clip": 1.26966333, + "balance_loss_mlp": 0.20988151, + "epoch": 0.27301969036524876, + "flos": 19318145800320.0, + "grad_norm": 39.50255514683836, + "language_loss": 0.88286555, + "learning_rate": 3.4118018394115476e-06, + "loss": 0.90087914, + "num_input_tokens_seen": 98154580, + "router_z_loss_clip": 2.85742188, + "router_z_loss_mlp": 0.36450195, + "step": 4541, + "time_per_iteration": 2.584987163543701 + }, + { + "auxiliary_loss_clip": 0.01548189, + "auxiliary_loss_mlp": 0.00252421, + "balance_loss_clip": 1.26388788, + "balance_loss_mlp": 0.21508458, + "epoch": 0.2730798136179167, + "flos": 21065666376960.0, + "grad_norm": 60.49664232164366, + "language_loss": 0.86837971, + "learning_rate": 3.4115259528121678e-06, + "loss": 0.8863858, + "num_input_tokens_seen": 98173115, + "router_z_loss_clip": 2.84179688, + "router_z_loss_mlp": 0.37329102, + "step": 4542, + "time_per_iteration": 2.608703136444092 + }, + { + "auxiliary_loss_clip": 0.01550708, + "auxiliary_loss_mlp": 0.00226694, + "balance_loss_clip": 1.26698625, + "balance_loss_mlp": 0.19424537, + "epoch": 0.2731399368705847, + "flos": 19171737964800.0, + "grad_norm": 482.30258575543763, + "language_loss": 0.99043071, + "learning_rate": 3.411250012687582e-06, + "loss": 1.0082047, + "num_input_tokens_seen": 98190260, + "router_z_loss_clip": 2.8359375, + "router_z_loss_mlp": 0.32470703, + "step": 4543, + "time_per_iteration": 2.640106678009033 + }, + { + "auxiliary_loss_clip": 0.01587291, + "auxiliary_loss_mlp": 0.00250202, + "balance_loss_clip": 1.28601694, + "balance_loss_mlp": 0.21639435, + "epoch": 0.27320006012325265, + "flos": 18290130554880.0, + "grad_norm": 51.07060010193714, + "language_loss": 0.74302918, + "learning_rate": 3.410974019048255e-06, + "loss": 0.7614041, + "num_input_tokens_seen": 98207115, + "router_z_loss_clip": 3.01757812, + "router_z_loss_mlp": 0.33813477, + "step": 4544, + "time_per_iteration": 2.6073198318481445 + }, + { + "auxiliary_loss_clip": 0.01561144, + "auxiliary_loss_mlp": 0.00235551, + "balance_loss_clip": 1.27134192, + "balance_loss_mlp": 0.20193404, + "epoch": 0.2732601833759206, + "flos": 34860929731200.0, + "grad_norm": 6.343290049931752, + "language_loss": 0.78909522, + "learning_rate": 3.410697971904651e-06, + "loss": 0.80706221, + "num_input_tokens_seen": 98230610, + "router_z_loss_clip": 2.90039062, + "router_z_loss_mlp": 0.33618164, + "step": 4545, + "time_per_iteration": 2.7494277954101562 + }, + { + "auxiliary_loss_clip": 0.01417486, + "auxiliary_loss_mlp": 0.00040026, + "balance_loss_clip": 1.16310596, + "balance_loss_mlp": 0.02247802, + "epoch": 0.2733203066285886, + "flos": 53910824762880.0, + "grad_norm": 0.803259201801223, + "language_loss": 0.61814582, + "learning_rate": 3.4104218712672383e-06, + "loss": 0.63272095, + "num_input_tokens_seen": 98293585, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.17578125, + "step": 4546, + "time_per_iteration": 3.1723639965057373 + }, + { + "auxiliary_loss_clip": 0.0155984, + "auxiliary_loss_mlp": 0.00241295, + "balance_loss_clip": 1.27459109, + "balance_loss_mlp": 0.2072245, + "epoch": 0.2733804298812566, + "flos": 20660378244480.0, + "grad_norm": 85.27070750515162, + "language_loss": 0.70286882, + "learning_rate": 3.410145717146488e-06, + "loss": 0.72088015, + "num_input_tokens_seen": 98311680, + "router_z_loss_clip": 2.84960938, + "router_z_loss_mlp": 0.34082031, + "step": 4547, + "time_per_iteration": 2.606595277786255 + }, + { + "auxiliary_loss_clip": 0.01560127, + "auxiliary_loss_mlp": 0.00183877, + "balance_loss_clip": 1.27357268, + "balance_loss_mlp": 0.15250131, + "epoch": 0.27344055313392457, + "flos": 25884339707520.0, + "grad_norm": 8.405477558205277, + "language_loss": 0.85866594, + "learning_rate": 3.4098695095528694e-06, + "loss": 0.87610596, + "num_input_tokens_seen": 98330770, + "router_z_loss_clip": 2.86132812, + "router_z_loss_mlp": 0.3137207, + "step": 4548, + "time_per_iteration": 4.0167882442474365 + }, + { + "auxiliary_loss_clip": 0.01587427, + "auxiliary_loss_mlp": 0.00216485, + "balance_loss_clip": 1.2914114, + "balance_loss_mlp": 0.1845842, + "epoch": 0.27350067638659253, + "flos": 22929753565440.0, + "grad_norm": 4.76584582241432, + "language_loss": 0.89299512, + "learning_rate": 3.4095932484968585e-06, + "loss": 0.91103423, + "num_input_tokens_seen": 98349860, + "router_z_loss_clip": 2.9609375, + "router_z_loss_mlp": 0.3190918, + "step": 4549, + "time_per_iteration": 2.6597344875335693 + }, + { + "auxiliary_loss_clip": 0.01552492, + "auxiliary_loss_mlp": 0.00214591, + "balance_loss_clip": 1.26523459, + "balance_loss_mlp": 0.17720735, + "epoch": 0.2735607996392605, + "flos": 16574821499520.0, + "grad_norm": 20.245757536829103, + "language_loss": 0.79713255, + "learning_rate": 3.4093169339889305e-06, + "loss": 0.81480336, + "num_input_tokens_seen": 98367040, + "router_z_loss_clip": 2.87109375, + "router_z_loss_mlp": 0.3737793, + "step": 4550, + "time_per_iteration": 2.592839479446411 + }, + { + "auxiliary_loss_clip": 0.01574346, + "auxiliary_loss_mlp": 0.00234094, + "balance_loss_clip": 1.28493273, + "balance_loss_mlp": 0.20317113, + "epoch": 0.27362092289192846, + "flos": 19645291895040.0, + "grad_norm": 6.956141497705398, + "language_loss": 0.86810625, + "learning_rate": 3.409040566039563e-06, + "loss": 0.88619065, + "num_input_tokens_seen": 98384010, + "router_z_loss_clip": 2.8984375, + "router_z_loss_mlp": 0.30932617, + "step": 4551, + "time_per_iteration": 2.588219165802002 + }, + { + "auxiliary_loss_clip": 0.01553227, + "auxiliary_loss_mlp": 0.00214298, + "balance_loss_clip": 1.26951385, + "balance_loss_mlp": 0.18001331, + "epoch": 0.27368104614459643, + "flos": 17639142416640.0, + "grad_norm": 24.714237349373025, + "language_loss": 0.82119995, + "learning_rate": 3.4087641446592362e-06, + "loss": 0.83887517, + "num_input_tokens_seen": 98399625, + "router_z_loss_clip": 2.8359375, + "router_z_loss_mlp": 0.3425293, + "step": 4552, + "time_per_iteration": 2.6232831478118896 + }, + { + "auxiliary_loss_clip": 0.01555495, + "auxiliary_loss_mlp": 0.00202436, + "balance_loss_clip": 1.26928425, + "balance_loss_mlp": 0.16979647, + "epoch": 0.2737411693972644, + "flos": 21580015178880.0, + "grad_norm": 32.11255804101521, + "language_loss": 0.79995441, + "learning_rate": 3.408487669858431e-06, + "loss": 0.81753367, + "num_input_tokens_seen": 98417310, + "router_z_loss_clip": 2.86132812, + "router_z_loss_mlp": 0.32617188, + "step": 4553, + "time_per_iteration": 2.6763126850128174 + }, + { + "auxiliary_loss_clip": 0.01557217, + "auxiliary_loss_mlp": 0.0021954, + "balance_loss_clip": 1.27436233, + "balance_loss_mlp": 0.18711527, + "epoch": 0.27380129264993236, + "flos": 25484043565440.0, + "grad_norm": 168.10114577567407, + "language_loss": 0.68187088, + "learning_rate": 3.4082111416476337e-06, + "loss": 0.69963849, + "num_input_tokens_seen": 98438670, + "router_z_loss_clip": 2.83007812, + "router_z_loss_mlp": 0.32446289, + "step": 4554, + "time_per_iteration": 2.6798346042633057 + }, + { + "auxiliary_loss_clip": 0.01548404, + "auxiliary_loss_mlp": 0.00224483, + "balance_loss_clip": 1.25889504, + "balance_loss_mlp": 0.19027036, + "epoch": 0.2738614159026003, + "flos": 18661196004480.0, + "grad_norm": 43.154530756939145, + "language_loss": 0.83406091, + "learning_rate": 3.4079345600373275e-06, + "loss": 0.85178977, + "num_input_tokens_seen": 98456060, + "router_z_loss_clip": 2.89648438, + "router_z_loss_mlp": 0.34204102, + "step": 4555, + "time_per_iteration": 2.563953399658203 + }, + { + "auxiliary_loss_clip": 0.01556879, + "auxiliary_loss_mlp": 0.00243326, + "balance_loss_clip": 1.26807499, + "balance_loss_mlp": 0.21259362, + "epoch": 0.2739215391552683, + "flos": 23477139901440.0, + "grad_norm": 5.188196555011678, + "language_loss": 0.85815299, + "learning_rate": 3.407657925038002e-06, + "loss": 0.87615502, + "num_input_tokens_seen": 98473765, + "router_z_loss_clip": 2.88867188, + "router_z_loss_mlp": 0.30712891, + "step": 4556, + "time_per_iteration": 2.6894569396972656 + }, + { + "auxiliary_loss_clip": 0.01578891, + "auxiliary_loss_mlp": 0.00285515, + "balance_loss_clip": 1.27357841, + "balance_loss_mlp": 0.25077695, + "epoch": 0.27398166240793626, + "flos": 17128636369920.0, + "grad_norm": 13.735363744192822, + "language_loss": 0.90234208, + "learning_rate": 3.4073812366601473e-06, + "loss": 0.92098618, + "num_input_tokens_seen": 98490590, + "router_z_loss_clip": 3.05664062, + "router_z_loss_mlp": 0.34716797, + "step": 4557, + "time_per_iteration": 2.6907575130462646 + }, + { + "auxiliary_loss_clip": 0.01574506, + "auxiliary_loss_mlp": 0.00238983, + "balance_loss_clip": 1.2814914, + "balance_loss_mlp": 0.20934774, + "epoch": 0.2740417856606042, + "flos": 23404744039680.0, + "grad_norm": 7.670273866644852, + "language_loss": 0.82653081, + "learning_rate": 3.4071044949142547e-06, + "loss": 0.84466565, + "num_input_tokens_seen": 98510590, + "router_z_loss_clip": 2.92773438, + "router_z_loss_mlp": 0.29626465, + "step": 4558, + "time_per_iteration": 2.6964802742004395 + }, + { + "auxiliary_loss_clip": 0.01540571, + "auxiliary_loss_mlp": 0.00236606, + "balance_loss_clip": 1.2568326, + "balance_loss_mlp": 0.20580187, + "epoch": 0.2741019089132722, + "flos": 12780428400000.0, + "grad_norm": 3.4186623044187914, + "language_loss": 0.74825639, + "learning_rate": 3.406827699810819e-06, + "loss": 0.76602811, + "num_input_tokens_seen": 98527875, + "router_z_loss_clip": 2.8359375, + "router_z_loss_mlp": 0.30786133, + "step": 4559, + "time_per_iteration": 2.6025753021240234 + }, + { + "auxiliary_loss_clip": 0.01543561, + "auxiliary_loss_mlp": 0.00218461, + "balance_loss_clip": 1.25338006, + "balance_loss_mlp": 0.18875408, + "epoch": 0.27416203216594015, + "flos": 20631542601600.0, + "grad_norm": 8.436019213709066, + "language_loss": 0.78768599, + "learning_rate": 3.4065508513603353e-06, + "loss": 0.8053062, + "num_input_tokens_seen": 98547575, + "router_z_loss_clip": 2.89648438, + "router_z_loss_mlp": 0.296875, + "step": 4560, + "time_per_iteration": 2.6664299964904785 + }, + { + "auxiliary_loss_clip": 0.01560351, + "auxiliary_loss_mlp": 0.00234128, + "balance_loss_clip": 1.26830733, + "balance_loss_mlp": 0.20346749, + "epoch": 0.27422215541860817, + "flos": 26541576812160.0, + "grad_norm": 148.49572921329076, + "language_loss": 0.89215982, + "learning_rate": 3.406273949573303e-06, + "loss": 0.91010463, + "num_input_tokens_seen": 98566290, + "router_z_loss_clip": 2.91992188, + "router_z_loss_mlp": 0.30639648, + "step": 4561, + "time_per_iteration": 2.6673169136047363 + }, + { + "auxiliary_loss_clip": 0.01566875, + "auxiliary_loss_mlp": 0.00242383, + "balance_loss_clip": 1.27127349, + "balance_loss_mlp": 0.21394014, + "epoch": 0.27428227867127614, + "flos": 23331163029120.0, + "grad_norm": 8.59651917189317, + "language_loss": 0.81945741, + "learning_rate": 3.4059969944602214e-06, + "loss": 0.83754998, + "num_input_tokens_seen": 98586255, + "router_z_loss_clip": 2.95703125, + "router_z_loss_mlp": 0.28442383, + "step": 4562, + "time_per_iteration": 2.648052930831909 + }, + { + "auxiliary_loss_clip": 0.01559518, + "auxiliary_loss_mlp": 0.00251725, + "balance_loss_clip": 1.26429749, + "balance_loss_mlp": 0.22061148, + "epoch": 0.2743424019239441, + "flos": 23035115134080.0, + "grad_norm": 2.096790067883379, + "language_loss": 0.81354201, + "learning_rate": 3.4057199860315928e-06, + "loss": 0.83165443, + "num_input_tokens_seen": 98606030, + "router_z_loss_clip": 2.95117188, + "router_z_loss_mlp": 0.3112793, + "step": 4563, + "time_per_iteration": 2.6519968509674072 + }, + { + "auxiliary_loss_clip": 0.01585975, + "auxiliary_loss_mlp": 0.00275068, + "balance_loss_clip": 1.27970743, + "balance_loss_mlp": 0.24285811, + "epoch": 0.27440252517661207, + "flos": 21981101420160.0, + "grad_norm": 3.0384243622099882, + "language_loss": 0.75125164, + "learning_rate": 3.4054429242979213e-06, + "loss": 0.76986206, + "num_input_tokens_seen": 98625225, + "router_z_loss_clip": 3.06054688, + "router_z_loss_mlp": 0.32202148, + "step": 4564, + "time_per_iteration": 2.7213807106018066 + }, + { + "auxiliary_loss_clip": 0.01568673, + "auxiliary_loss_mlp": 0.00213148, + "balance_loss_clip": 1.27170277, + "balance_loss_mlp": 0.1846216, + "epoch": 0.27446264842928003, + "flos": 40187451502080.0, + "grad_norm": 10.898261233037164, + "language_loss": 0.8656745, + "learning_rate": 3.4051658092697135e-06, + "loss": 0.88349271, + "num_input_tokens_seen": 98649470, + "router_z_loss_clip": 2.96875, + "router_z_loss_mlp": 0.28527832, + "step": 4565, + "time_per_iteration": 2.8316197395324707 + }, + { + "auxiliary_loss_clip": 0.01585748, + "auxiliary_loss_mlp": 0.00210357, + "balance_loss_clip": 1.28903079, + "balance_loss_mlp": 0.17967269, + "epoch": 0.274522771681948, + "flos": 13479681438720.0, + "grad_norm": 30.805194613072974, + "language_loss": 0.78944337, + "learning_rate": 3.404888640957477e-06, + "loss": 0.80740446, + "num_input_tokens_seen": 98666915, + "router_z_loss_clip": 2.96875, + "router_z_loss_mlp": 0.30664062, + "step": 4566, + "time_per_iteration": 2.6731512546539307 + }, + { + "auxiliary_loss_clip": 0.01541336, + "auxiliary_loss_mlp": 0.00200881, + "balance_loss_clip": 1.24842954, + "balance_loss_mlp": 0.17155552, + "epoch": 0.27458289493461596, + "flos": 28622133313920.0, + "grad_norm": 4.864807161841828, + "language_loss": 0.65775865, + "learning_rate": 3.404611419371723e-06, + "loss": 0.67518079, + "num_input_tokens_seen": 98688240, + "router_z_loss_clip": 2.9296875, + "router_z_loss_mlp": 0.29321289, + "step": 4567, + "time_per_iteration": 2.757650852203369 + }, + { + "auxiliary_loss_clip": 0.01560498, + "auxiliary_loss_mlp": 0.00242634, + "balance_loss_clip": 1.26447725, + "balance_loss_mlp": 0.20961335, + "epoch": 0.2746430181872839, + "flos": 20119815492480.0, + "grad_norm": 6.293623505219834, + "language_loss": 0.88528204, + "learning_rate": 3.4043341445229627e-06, + "loss": 0.9033134, + "num_input_tokens_seen": 98708245, + "router_z_loss_clip": 2.95703125, + "router_z_loss_mlp": 0.33032227, + "step": 4568, + "time_per_iteration": 2.762847423553467 + }, + { + "auxiliary_loss_clip": 0.01574402, + "auxiliary_loss_mlp": 0.00231094, + "balance_loss_clip": 1.26895523, + "balance_loss_mlp": 0.19924147, + "epoch": 0.2747031414399519, + "flos": 20193468330240.0, + "grad_norm": 21.698888943414957, + "language_loss": 0.75778788, + "learning_rate": 3.4040568164217117e-06, + "loss": 0.77584291, + "num_input_tokens_seen": 98724575, + "router_z_loss_clip": 3.05273438, + "router_z_loss_mlp": 0.31860352, + "step": 4569, + "time_per_iteration": 2.6298258304595947 + }, + { + "auxiliary_loss_clip": 0.01538891, + "auxiliary_loss_mlp": 0.00234025, + "balance_loss_clip": 1.24009252, + "balance_loss_mlp": 0.20183812, + "epoch": 0.27476326469261986, + "flos": 13516346246400.0, + "grad_norm": 4.31692128628831, + "language_loss": 0.79828942, + "learning_rate": 3.4037794350784848e-06, + "loss": 0.81601858, + "num_input_tokens_seen": 98740700, + "router_z_loss_clip": 2.99023438, + "router_z_loss_mlp": 0.32177734, + "step": 4570, + "time_per_iteration": 2.61742901802063 + }, + { + "auxiliary_loss_clip": 0.01559462, + "auxiliary_loss_mlp": 0.00143068, + "balance_loss_clip": 1.24628294, + "balance_loss_mlp": 0.13515237, + "epoch": 0.2748233879452878, + "flos": 65937127121280.0, + "grad_norm": 0.7138605174132817, + "language_loss": 0.55784851, + "learning_rate": 3.4035020005038014e-06, + "loss": 0.57487381, + "num_input_tokens_seen": 98803030, + "router_z_loss_clip": 3.125, + "router_z_loss_mlp": 0.07910156, + "step": 4571, + "time_per_iteration": 3.244525671005249 + }, + { + "auxiliary_loss_clip": 0.01567733, + "auxiliary_loss_mlp": 0.00269095, + "balance_loss_clip": 1.26961446, + "balance_loss_mlp": 0.23855332, + "epoch": 0.2748835111979558, + "flos": 17384212615680.0, + "grad_norm": 1.8936855702083126, + "language_loss": 0.86809409, + "learning_rate": 3.4032245127081812e-06, + "loss": 0.88646233, + "num_input_tokens_seen": 98820505, + "router_z_loss_clip": 2.98046875, + "router_z_loss_mlp": 0.30517578, + "step": 4572, + "time_per_iteration": 2.6719746589660645 + }, + { + "auxiliary_loss_clip": 0.01541569, + "auxiliary_loss_mlp": 0.00182776, + "balance_loss_clip": 1.2499969, + "balance_loss_mlp": 0.15410665, + "epoch": 0.27494363445062375, + "flos": 23587565287680.0, + "grad_norm": 33.204937483664864, + "language_loss": 0.85948527, + "learning_rate": 3.402946971702147e-06, + "loss": 0.87672871, + "num_input_tokens_seen": 98842150, + "router_z_loss_clip": 2.9140625, + "router_z_loss_mlp": 0.28686523, + "step": 4573, + "time_per_iteration": 2.634859561920166 + }, + { + "auxiliary_loss_clip": 0.01533638, + "auxiliary_loss_mlp": 0.00221935, + "balance_loss_clip": 1.24083972, + "balance_loss_mlp": 0.19337264, + "epoch": 0.2750037577032918, + "flos": 17164582905600.0, + "grad_norm": 4.785212972862184, + "language_loss": 0.85274392, + "learning_rate": 3.402669377496223e-06, + "loss": 0.8702997, + "num_input_tokens_seen": 98861050, + "router_z_loss_clip": 2.9296875, + "router_z_loss_mlp": 0.28564453, + "step": 4574, + "time_per_iteration": 2.578932762145996 + }, + { + "auxiliary_loss_clip": 0.01560866, + "auxiliary_loss_mlp": 0.00249093, + "balance_loss_clip": 1.26251721, + "balance_loss_mlp": 0.22063813, + "epoch": 0.27506388095595974, + "flos": 24491903028480.0, + "grad_norm": 8.60742642731296, + "language_loss": 0.81396818, + "learning_rate": 3.402391730100936e-06, + "loss": 0.83206773, + "num_input_tokens_seen": 98879695, + "router_z_loss_clip": 2.984375, + "router_z_loss_mlp": 0.28442383, + "step": 4575, + "time_per_iteration": 4.026278257369995 + }, + { + "auxiliary_loss_clip": 0.01552218, + "auxiliary_loss_mlp": 0.00231002, + "balance_loss_clip": 1.25690234, + "balance_loss_mlp": 0.20165282, + "epoch": 0.2751240042086277, + "flos": 38764706722560.0, + "grad_norm": 12.464961256452915, + "language_loss": 0.78146648, + "learning_rate": 3.402114029526814e-06, + "loss": 0.7992987, + "num_input_tokens_seen": 98902035, + "router_z_loss_clip": 2.95507812, + "router_z_loss_mlp": 0.29345703, + "step": 4576, + "time_per_iteration": 2.7828738689422607 + }, + { + "auxiliary_loss_clip": 0.0155861, + "auxiliary_loss_mlp": 0.00226401, + "balance_loss_clip": 1.25946987, + "balance_loss_mlp": 0.19590728, + "epoch": 0.27518412746129567, + "flos": 26907039740160.0, + "grad_norm": 3.781584753355348, + "language_loss": 0.79547864, + "learning_rate": 3.4018362757843866e-06, + "loss": 0.81332874, + "num_input_tokens_seen": 98921835, + "router_z_loss_clip": 2.9921875, + "router_z_loss_mlp": 0.30493164, + "step": 4577, + "time_per_iteration": 2.682868480682373 + }, + { + "auxiliary_loss_clip": 0.01584812, + "auxiliary_loss_mlp": 0.00234274, + "balance_loss_clip": 1.28283978, + "balance_loss_mlp": 0.20335117, + "epoch": 0.27524425071396363, + "flos": 24900531125760.0, + "grad_norm": 59.06258941851503, + "language_loss": 0.82773811, + "learning_rate": 3.401558468884188e-06, + "loss": 0.84592891, + "num_input_tokens_seen": 98939610, + "router_z_loss_clip": 3.02148438, + "router_z_loss_mlp": 0.30932617, + "step": 4578, + "time_per_iteration": 2.6752095222473145 + }, + { + "auxiliary_loss_clip": 0.01568351, + "auxiliary_loss_mlp": 0.00255424, + "balance_loss_clip": 1.26870418, + "balance_loss_mlp": 0.22278449, + "epoch": 0.2753043739666316, + "flos": 26288047641600.0, + "grad_norm": 26.07687039856782, + "language_loss": 0.73417926, + "learning_rate": 3.4012806088367516e-06, + "loss": 0.75241697, + "num_input_tokens_seen": 98962250, + "router_z_loss_clip": 2.99609375, + "router_z_loss_mlp": 0.32641602, + "step": 4579, + "time_per_iteration": 4.184458494186401 + }, + { + "auxiliary_loss_clip": 0.01547117, + "auxiliary_loss_mlp": 0.00267841, + "balance_loss_clip": 1.24732852, + "balance_loss_mlp": 0.2333895, + "epoch": 0.27536449721929956, + "flos": 24206772867840.0, + "grad_norm": 183.74328617105823, + "language_loss": 0.86277461, + "learning_rate": 3.4010026956526137e-06, + "loss": 0.88092422, + "num_input_tokens_seen": 98981845, + "router_z_loss_clip": 3.0, + "router_z_loss_mlp": 0.34448242, + "step": 4580, + "time_per_iteration": 2.6636626720428467 + }, + { + "auxiliary_loss_clip": 0.0155742, + "auxiliary_loss_mlp": 0.0025084, + "balance_loss_clip": 1.25918639, + "balance_loss_mlp": 0.21798617, + "epoch": 0.27542462047196753, + "flos": 19537272720000.0, + "grad_norm": 1.4910340166500349, + "language_loss": 0.74840295, + "learning_rate": 3.4007247293423137e-06, + "loss": 0.76648557, + "num_input_tokens_seen": 99001855, + "router_z_loss_clip": 2.98046875, + "router_z_loss_mlp": 0.328125, + "step": 4581, + "time_per_iteration": 4.140495538711548 + }, + { + "auxiliary_loss_clip": 0.01550674, + "auxiliary_loss_mlp": 0.00240921, + "balance_loss_clip": 1.25018132, + "balance_loss_mlp": 0.2078758, + "epoch": 0.2754847437246355, + "flos": 14319165173760.0, + "grad_norm": 10.781380615845523, + "language_loss": 0.85684592, + "learning_rate": 3.400446709916392e-06, + "loss": 0.87476182, + "num_input_tokens_seen": 99019880, + "router_z_loss_clip": 3.00585938, + "router_z_loss_mlp": 0.33056641, + "step": 4582, + "time_per_iteration": 2.716481924057007 + }, + { + "auxiliary_loss_clip": 0.01556011, + "auxiliary_loss_mlp": 0.00195127, + "balance_loss_clip": 1.25652575, + "balance_loss_mlp": 0.16769692, + "epoch": 0.27554486697730346, + "flos": 18838773866880.0, + "grad_norm": 1.8510802448190473, + "language_loss": 0.9054963, + "learning_rate": 3.4001686373853895e-06, + "loss": 0.92300773, + "num_input_tokens_seen": 99037570, + "router_z_loss_clip": 2.99609375, + "router_z_loss_mlp": 0.27441406, + "step": 4583, + "time_per_iteration": 2.647376537322998 + }, + { + "auxiliary_loss_clip": 0.01581235, + "auxiliary_loss_mlp": 0.00255033, + "balance_loss_clip": 1.26916552, + "balance_loss_mlp": 0.22470617, + "epoch": 0.2756049902299714, + "flos": 22382295402240.0, + "grad_norm": 5.2913647695758215, + "language_loss": 0.76768696, + "learning_rate": 3.3998905117598528e-06, + "loss": 0.7860496, + "num_input_tokens_seen": 99056875, + "router_z_loss_clip": 3.11914062, + "router_z_loss_mlp": 0.3034668, + "step": 4584, + "time_per_iteration": 2.6411688327789307 + }, + { + "auxiliary_loss_clip": 0.01589055, + "auxiliary_loss_mlp": 0.00223729, + "balance_loss_clip": 1.2789104, + "balance_loss_mlp": 0.19437999, + "epoch": 0.2756651134826394, + "flos": 19573901614080.0, + "grad_norm": 171.144612461705, + "language_loss": 0.83416164, + "learning_rate": 3.399612333050327e-06, + "loss": 0.8522895, + "num_input_tokens_seen": 99074685, + "router_z_loss_clip": 3.09765625, + "router_z_loss_mlp": 0.29345703, + "step": 4585, + "time_per_iteration": 2.6393234729766846 + }, + { + "auxiliary_loss_clip": 0.01571638, + "auxiliary_loss_mlp": 0.00253059, + "balance_loss_clip": 1.25662816, + "balance_loss_mlp": 0.2205389, + "epoch": 0.27572523673530736, + "flos": 23586559706880.0, + "grad_norm": 71.47468639620806, + "language_loss": 0.80204725, + "learning_rate": 3.399334101267362e-06, + "loss": 0.8202942, + "num_input_tokens_seen": 99095300, + "router_z_loss_clip": 3.1484375, + "router_z_loss_mlp": 0.32519531, + "step": 4586, + "time_per_iteration": 2.6717560291290283 + }, + { + "auxiliary_loss_clip": 0.01564564, + "auxiliary_loss_mlp": 0.00244752, + "balance_loss_clip": 1.25843489, + "balance_loss_mlp": 0.21452026, + "epoch": 0.2757853599879754, + "flos": 22820118278400.0, + "grad_norm": 279.62898056645224, + "language_loss": 0.86335826, + "learning_rate": 3.3990558164215073e-06, + "loss": 0.88145143, + "num_input_tokens_seen": 99115965, + "router_z_loss_clip": 3.06054688, + "router_z_loss_mlp": 0.30249023, + "step": 4587, + "time_per_iteration": 2.655555009841919 + }, + { + "auxiliary_loss_clip": 0.01542043, + "auxiliary_loss_mlp": 0.00230618, + "balance_loss_clip": 1.2402252, + "balance_loss_mlp": 0.20142324, + "epoch": 0.27584548324064334, + "flos": 18551704371840.0, + "grad_norm": 6.193081232064679, + "language_loss": 0.88156557, + "learning_rate": 3.398777478523316e-06, + "loss": 0.89929211, + "num_input_tokens_seen": 99134265, + "router_z_loss_clip": 3.015625, + "router_z_loss_mlp": 0.29211426, + "step": 4588, + "time_per_iteration": 2.679084539413452 + }, + { + "auxiliary_loss_clip": 0.01548361, + "auxiliary_loss_mlp": 0.00239055, + "balance_loss_clip": 1.24653399, + "balance_loss_mlp": 0.21132722, + "epoch": 0.2759056064933113, + "flos": 23769883745280.0, + "grad_norm": 1.956529467711396, + "language_loss": 0.80575848, + "learning_rate": 3.398499087583342e-06, + "loss": 0.82363272, + "num_input_tokens_seen": 99156185, + "router_z_loss_clip": 3.01953125, + "router_z_loss_mlp": 0.27734375, + "step": 4589, + "time_per_iteration": 2.677325963973999 + }, + { + "auxiliary_loss_clip": 0.01565342, + "auxiliary_loss_mlp": 0.00254723, + "balance_loss_clip": 1.25678253, + "balance_loss_mlp": 0.22499266, + "epoch": 0.27596572974597927, + "flos": 24281898163200.0, + "grad_norm": 17.179852967305877, + "language_loss": 0.94184494, + "learning_rate": 3.398220643612143e-06, + "loss": 0.96004558, + "num_input_tokens_seen": 99176735, + "router_z_loss_clip": 3.08789062, + "router_z_loss_mlp": 0.29736328, + "step": 4590, + "time_per_iteration": 4.126562595367432 + }, + { + "auxiliary_loss_clip": 0.01579523, + "auxiliary_loss_mlp": 0.00276757, + "balance_loss_clip": 1.2651031, + "balance_loss_mlp": 0.24769396, + "epoch": 0.27602585299864724, + "flos": 35040985632000.0, + "grad_norm": 14.111477365743127, + "language_loss": 0.77559626, + "learning_rate": 3.397942146620277e-06, + "loss": 0.79415905, + "num_input_tokens_seen": 99199765, + "router_z_loss_clip": 3.14453125, + "router_z_loss_mlp": 0.29064941, + "step": 4591, + "time_per_iteration": 2.7779150009155273 + }, + { + "auxiliary_loss_clip": 0.01544526, + "auxiliary_loss_mlp": 0.00269327, + "balance_loss_clip": 1.24699283, + "balance_loss_mlp": 0.23809452, + "epoch": 0.2760859762513152, + "flos": 24309405002880.0, + "grad_norm": 553.2485182405113, + "language_loss": 0.86157554, + "learning_rate": 3.3976635966183046e-06, + "loss": 0.87971413, + "num_input_tokens_seen": 99218435, + "router_z_loss_clip": 2.97460938, + "router_z_loss_mlp": 0.31225586, + "step": 4592, + "time_per_iteration": 2.6586720943450928 + }, + { + "auxiliary_loss_clip": 0.01465138, + "auxiliary_loss_mlp": 0.0008516, + "balance_loss_clip": 1.18611026, + "balance_loss_mlp": 0.07476512, + "epoch": 0.27614609950398317, + "flos": 71260739890560.0, + "grad_norm": 0.6920349547374054, + "language_loss": 0.61460841, + "learning_rate": 3.3973849936167886e-06, + "loss": 0.63011134, + "num_input_tokens_seen": 99276200, + "router_z_loss_clip": 2.78125, + "router_z_loss_mlp": 0.10400391, + "step": 4593, + "time_per_iteration": 3.1123058795928955 + }, + { + "auxiliary_loss_clip": 0.01554453, + "auxiliary_loss_mlp": 0.00273624, + "balance_loss_clip": 1.24871111, + "balance_loss_mlp": 0.24518113, + "epoch": 0.27620622275665113, + "flos": 29674854138240.0, + "grad_norm": 3.5755904911077616, + "language_loss": 0.82169378, + "learning_rate": 3.3971063376262937e-06, + "loss": 0.83997452, + "num_input_tokens_seen": 99297625, + "router_z_loss_clip": 3.06054688, + "router_z_loss_mlp": 0.28417969, + "step": 4594, + "time_per_iteration": 2.6748263835906982 + }, + { + "auxiliary_loss_clip": 0.01554257, + "auxiliary_loss_mlp": 0.00258631, + "balance_loss_clip": 1.24984527, + "balance_loss_mlp": 0.23054521, + "epoch": 0.2762663460093191, + "flos": 15378063137280.0, + "grad_norm": 24.147520501970828, + "language_loss": 0.97446072, + "learning_rate": 3.3968276286573866e-06, + "loss": 0.99258959, + "num_input_tokens_seen": 99315790, + "router_z_loss_clip": 3.04492188, + "router_z_loss_mlp": 0.28100586, + "step": 4595, + "time_per_iteration": 2.5887420177459717 + }, + { + "auxiliary_loss_clip": 0.01576256, + "auxiliary_loss_mlp": 0.00272136, + "balance_loss_clip": 1.26428056, + "balance_loss_mlp": 0.24004443, + "epoch": 0.27632646926198706, + "flos": 20704082117760.0, + "grad_norm": 62.121422483627335, + "language_loss": 0.7627759, + "learning_rate": 3.3965488667206353e-06, + "loss": 0.78125983, + "num_input_tokens_seen": 99334615, + "router_z_loss_clip": 3.12109375, + "router_z_loss_mlp": 0.32104492, + "step": 4596, + "time_per_iteration": 2.6254708766937256 + }, + { + "auxiliary_loss_clip": 0.01577308, + "auxiliary_loss_mlp": 0.00274628, + "balance_loss_clip": 1.26431072, + "balance_loss_mlp": 0.24591047, + "epoch": 0.276386592514655, + "flos": 32813374849920.0, + "grad_norm": 4.03019144248856, + "language_loss": 0.69006264, + "learning_rate": 3.3962700518266113e-06, + "loss": 0.70858204, + "num_input_tokens_seen": 99356685, + "router_z_loss_clip": 3.1328125, + "router_z_loss_mlp": 0.28723145, + "step": 4597, + "time_per_iteration": 2.719001531600952 + }, + { + "auxiliary_loss_clip": 0.01544764, + "auxiliary_loss_mlp": 0.00251833, + "balance_loss_clip": 1.24715638, + "balance_loss_mlp": 0.22423568, + "epoch": 0.276446715767323, + "flos": 18551704371840.0, + "grad_norm": 3.1210994758290025, + "language_loss": 0.90733171, + "learning_rate": 3.395991183985887e-06, + "loss": 0.92529768, + "num_input_tokens_seen": 99374810, + "router_z_loss_clip": 2.9765625, + "router_z_loss_mlp": 0.27612305, + "step": 4598, + "time_per_iteration": 2.6177024841308594 + }, + { + "auxiliary_loss_clip": 0.01596747, + "auxiliary_loss_mlp": 0.00266645, + "balance_loss_clip": 1.2749362, + "balance_loss_mlp": 0.23625827, + "epoch": 0.27650683901999096, + "flos": 22819615488000.0, + "grad_norm": 3.2681317333661553, + "language_loss": 0.8700099, + "learning_rate": 3.395712263209037e-06, + "loss": 0.8886438, + "num_input_tokens_seen": 99391290, + "router_z_loss_clip": 3.21289062, + "router_z_loss_mlp": 0.30383301, + "step": 4599, + "time_per_iteration": 2.610305070877075 + }, + { + "auxiliary_loss_clip": 0.01574816, + "auxiliary_loss_mlp": 0.00313987, + "balance_loss_clip": 1.25939584, + "balance_loss_mlp": 0.28418446, + "epoch": 0.276566962272659, + "flos": 21361534704000.0, + "grad_norm": 4.270466681588254, + "language_loss": 0.87004256, + "learning_rate": 3.395433289506639e-06, + "loss": 0.88893056, + "num_input_tokens_seen": 99409120, + "router_z_loss_clip": 3.15625, + "router_z_loss_mlp": 0.29760742, + "step": 4600, + "time_per_iteration": 2.633394718170166 + }, + { + "auxiliary_loss_clip": 0.01568736, + "auxiliary_loss_mlp": 0.00326036, + "balance_loss_clip": 1.25808847, + "balance_loss_mlp": 0.29630488, + "epoch": 0.27662708552532694, + "flos": 17710604524800.0, + "grad_norm": 35.42518767289026, + "language_loss": 0.80994654, + "learning_rate": 3.3951542628892694e-06, + "loss": 0.82889426, + "num_input_tokens_seen": 99426180, + "router_z_loss_clip": 3.10742188, + "router_z_loss_mlp": 0.29711914, + "step": 4601, + "time_per_iteration": 2.625255584716797 + }, + { + "auxiliary_loss_clip": 0.01599379, + "auxiliary_loss_mlp": 0.00313667, + "balance_loss_clip": 1.28081119, + "balance_loss_mlp": 0.28281522, + "epoch": 0.2766872087779949, + "flos": 21252725429760.0, + "grad_norm": 1.784662101693216, + "language_loss": 0.8779704, + "learning_rate": 3.3948751833675113e-06, + "loss": 0.89710087, + "num_input_tokens_seen": 99447720, + "router_z_loss_clip": 3.18554688, + "router_z_loss_mlp": 0.30859375, + "step": 4602, + "time_per_iteration": 2.67585825920105 + }, + { + "auxiliary_loss_clip": 0.01579147, + "auxiliary_loss_mlp": 0.00367879, + "balance_loss_clip": 1.26756072, + "balance_loss_mlp": 0.3339994, + "epoch": 0.2767473320306629, + "flos": 12931900053120.0, + "grad_norm": 7.9571264333128475, + "language_loss": 0.85257864, + "learning_rate": 3.3945960509519455e-06, + "loss": 0.87204885, + "num_input_tokens_seen": 99464720, + "router_z_loss_clip": 3.1171875, + "router_z_loss_mlp": 0.33862305, + "step": 4603, + "time_per_iteration": 2.7897605895996094 + }, + { + "auxiliary_loss_clip": 0.01592217, + "auxiliary_loss_mlp": 0.00352924, + "balance_loss_clip": 1.2744571, + "balance_loss_mlp": 0.32477909, + "epoch": 0.27680745528333084, + "flos": 15012851604480.0, + "grad_norm": 6.454686226042073, + "language_loss": 0.87110883, + "learning_rate": 3.3943168656531585e-06, + "loss": 0.89056027, + "num_input_tokens_seen": 99482310, + "router_z_loss_clip": 3.17578125, + "router_z_loss_mlp": 0.28137207, + "step": 4604, + "time_per_iteration": 2.6881730556488037 + }, + { + "auxiliary_loss_clip": 0.01578487, + "auxiliary_loss_mlp": 0.00321918, + "balance_loss_clip": 1.26448548, + "balance_loss_mlp": 0.29130489, + "epoch": 0.2768675785359988, + "flos": 22637835734400.0, + "grad_norm": 13.351706690390365, + "language_loss": 0.7570765, + "learning_rate": 3.3940376274817363e-06, + "loss": 0.77608061, + "num_input_tokens_seen": 99501255, + "router_z_loss_clip": 3.140625, + "router_z_loss_mlp": 0.30615234, + "step": 4605, + "time_per_iteration": 2.6751046180725098 + }, + { + "auxiliary_loss_clip": 0.0149603, + "auxiliary_loss_mlp": 0.00130662, + "balance_loss_clip": 1.21722198, + "balance_loss_mlp": 0.12184059, + "epoch": 0.27692770178866677, + "flos": 66130542881280.0, + "grad_norm": 0.699221176498872, + "language_loss": 0.57032561, + "learning_rate": 3.3937583364482673e-06, + "loss": 0.58659256, + "num_input_tokens_seen": 99568925, + "router_z_loss_clip": 2.78125, + "router_z_loss_mlp": 0.08837891, + "step": 4606, + "time_per_iteration": 3.2314839363098145 + }, + { + "auxiliary_loss_clip": 0.01606027, + "auxiliary_loss_mlp": 0.00342882, + "balance_loss_clip": 1.28120494, + "balance_loss_mlp": 0.31148258, + "epoch": 0.27698782504133473, + "flos": 26464979059200.0, + "grad_norm": 9.557620257868262, + "language_loss": 0.76025939, + "learning_rate": 3.3934789925633424e-06, + "loss": 0.77974844, + "num_input_tokens_seen": 99588455, + "router_z_loss_clip": 3.25, + "router_z_loss_mlp": 0.31420898, + "step": 4607, + "time_per_iteration": 2.6667613983154297 + }, + { + "auxiliary_loss_clip": 0.01584137, + "auxiliary_loss_mlp": 0.00331055, + "balance_loss_clip": 1.27396917, + "balance_loss_mlp": 0.30224198, + "epoch": 0.2770479482940027, + "flos": 25884806584320.0, + "grad_norm": 42.289663712935145, + "language_loss": 0.75252306, + "learning_rate": 3.393199595837555e-06, + "loss": 0.77167499, + "num_input_tokens_seen": 99609355, + "router_z_loss_clip": 3.09960938, + "router_z_loss_mlp": 0.28796387, + "step": 4608, + "time_per_iteration": 2.679741144180298 + }, + { + "auxiliary_loss_clip": 0.01573651, + "auxiliary_loss_mlp": 0.00327693, + "balance_loss_clip": 1.25963879, + "balance_loss_mlp": 0.29541135, + "epoch": 0.27710807154667066, + "flos": 22857249962880.0, + "grad_norm": 36.69369011424916, + "language_loss": 0.80163956, + "learning_rate": 3.392920146281499e-06, + "loss": 0.82065296, + "num_input_tokens_seen": 99628780, + "router_z_loss_clip": 3.140625, + "router_z_loss_mlp": 0.32275391, + "step": 4609, + "time_per_iteration": 2.637974739074707 + }, + { + "auxiliary_loss_clip": 0.01593924, + "auxiliary_loss_mlp": 0.00323209, + "balance_loss_clip": 1.27589703, + "balance_loss_mlp": 0.29214314, + "epoch": 0.27716819479933863, + "flos": 17711071401600.0, + "grad_norm": 12.174152792573059, + "language_loss": 0.90147734, + "learning_rate": 3.3926406439057714e-06, + "loss": 0.92064863, + "num_input_tokens_seen": 99644545, + "router_z_loss_clip": 3.18164062, + "router_z_loss_mlp": 0.31079102, + "step": 4610, + "time_per_iteration": 2.7526721954345703 + }, + { + "auxiliary_loss_clip": 0.0159957, + "auxiliary_loss_mlp": 0.00317456, + "balance_loss_clip": 1.27541697, + "balance_loss_mlp": 0.28677145, + "epoch": 0.2772283180520066, + "flos": 19646046080640.0, + "grad_norm": 68.29123998369232, + "language_loss": 0.79367256, + "learning_rate": 3.3923610887209705e-06, + "loss": 0.81284285, + "num_input_tokens_seen": 99663125, + "router_z_loss_clip": 3.24023438, + "router_z_loss_mlp": 0.30688477, + "step": 4611, + "time_per_iteration": 2.6406233310699463 + }, + { + "auxiliary_loss_clip": 0.01583327, + "auxiliary_loss_mlp": 0.0027775, + "balance_loss_clip": 1.2707541, + "balance_loss_mlp": 0.24764964, + "epoch": 0.27728844130467456, + "flos": 21032628842880.0, + "grad_norm": 1.5966653729018665, + "language_loss": 0.81046247, + "learning_rate": 3.392081480737698e-06, + "loss": 0.82907319, + "num_input_tokens_seen": 99682645, + "router_z_loss_clip": 3.12695312, + "router_z_loss_mlp": 0.30102539, + "step": 4612, + "time_per_iteration": 2.695617914199829 + }, + { + "auxiliary_loss_clip": 0.01618339, + "auxiliary_loss_mlp": 0.00337004, + "balance_loss_clip": 1.2900331, + "balance_loss_mlp": 0.30577078, + "epoch": 0.2773485645573425, + "flos": 18989204025600.0, + "grad_norm": 5.549447105402128, + "language_loss": 0.75465161, + "learning_rate": 3.3918018199665563e-06, + "loss": 0.77420503, + "num_input_tokens_seen": 99700520, + "router_z_loss_clip": 3.28320312, + "router_z_loss_mlp": 0.3125, + "step": 4613, + "time_per_iteration": 2.6503517627716064 + }, + { + "auxiliary_loss_clip": 0.01580183, + "auxiliary_loss_mlp": 0.0028275, + "balance_loss_clip": 1.26779532, + "balance_loss_mlp": 0.25170782, + "epoch": 0.27740868781001055, + "flos": 21468440557440.0, + "grad_norm": 10.460367033417189, + "language_loss": 0.85215008, + "learning_rate": 3.39152210641815e-06, + "loss": 0.87077934, + "num_input_tokens_seen": 99720355, + "router_z_loss_clip": 3.12304688, + "router_z_loss_mlp": 0.31079102, + "step": 4614, + "time_per_iteration": 2.633056402206421 + }, + { + "auxiliary_loss_clip": 0.01601937, + "auxiliary_loss_mlp": 0.00293702, + "balance_loss_clip": 1.27660966, + "balance_loss_mlp": 0.26380435, + "epoch": 0.2774688110626785, + "flos": 19827825834240.0, + "grad_norm": 34.519143047106844, + "language_loss": 0.91765571, + "learning_rate": 3.3912423401030865e-06, + "loss": 0.93661213, + "num_input_tokens_seen": 99736090, + "router_z_loss_clip": 3.25585938, + "router_z_loss_mlp": 0.29907227, + "step": 4615, + "time_per_iteration": 2.652782440185547 + }, + { + "auxiliary_loss_clip": 0.0159202, + "auxiliary_loss_mlp": 0.00279342, + "balance_loss_clip": 1.27205348, + "balance_loss_mlp": 0.24952741, + "epoch": 0.2775289343153465, + "flos": 18216226321920.0, + "grad_norm": 15.855423503002251, + "language_loss": 0.76201063, + "learning_rate": 3.3909625210319735e-06, + "loss": 0.78072423, + "num_input_tokens_seen": 99751805, + "router_z_loss_clip": 3.19921875, + "router_z_loss_mlp": 0.29846191, + "step": 4616, + "time_per_iteration": 2.6125993728637695 + }, + { + "auxiliary_loss_clip": 0.01616348, + "auxiliary_loss_mlp": 0.0033009, + "balance_loss_clip": 1.28769088, + "balance_loss_mlp": 0.29759404, + "epoch": 0.27758905756801444, + "flos": 16472476673280.0, + "grad_norm": 2.6021579603519385, + "language_loss": 0.90389383, + "learning_rate": 3.3906826492154226e-06, + "loss": 0.9233582, + "num_input_tokens_seen": 99770610, + "router_z_loss_clip": 3.2890625, + "router_z_loss_mlp": 0.32519531, + "step": 4617, + "time_per_iteration": 4.029827833175659 + }, + { + "auxiliary_loss_clip": 0.01612508, + "auxiliary_loss_mlp": 0.0031352, + "balance_loss_clip": 1.28824675, + "balance_loss_mlp": 0.28040391, + "epoch": 0.2776491808206824, + "flos": 18728240739840.0, + "grad_norm": 9.347561743591502, + "language_loss": 0.83301258, + "learning_rate": 3.3904027246640458e-06, + "loss": 0.85227287, + "num_input_tokens_seen": 99787305, + "router_z_loss_clip": 3.24414062, + "router_z_loss_mlp": 0.33081055, + "step": 4618, + "time_per_iteration": 2.6006414890289307 + }, + { + "auxiliary_loss_clip": 0.01608787, + "auxiliary_loss_mlp": 0.00279806, + "balance_loss_clip": 1.2868216, + "balance_loss_mlp": 0.25158954, + "epoch": 0.27770930407335037, + "flos": 28038189911040.0, + "grad_norm": 6.199086347798929, + "language_loss": 0.92086506, + "learning_rate": 3.390122747388459e-06, + "loss": 0.93975103, + "num_input_tokens_seen": 99808940, + "router_z_loss_clip": 3.21875, + "router_z_loss_mlp": 0.28210449, + "step": 4619, + "time_per_iteration": 2.692800283432007 + }, + { + "auxiliary_loss_clip": 0.01585512, + "auxiliary_loss_mlp": 0.00283077, + "balance_loss_clip": 1.2729212, + "balance_loss_mlp": 0.25406125, + "epoch": 0.27776942732601834, + "flos": 23549823072000.0, + "grad_norm": 33.29519430612466, + "language_loss": 0.82342708, + "learning_rate": 3.3898427173992778e-06, + "loss": 0.84211296, + "num_input_tokens_seen": 99829575, + "router_z_loss_clip": 3.125, + "router_z_loss_mlp": 0.29003906, + "step": 4620, + "time_per_iteration": 2.6956193447113037 + }, + { + "auxiliary_loss_clip": 0.01574375, + "auxiliary_loss_mlp": 0.00315389, + "balance_loss_clip": 1.26078653, + "balance_loss_mlp": 0.28355998, + "epoch": 0.2778295505786863, + "flos": 23908713811200.0, + "grad_norm": 18.652174699886842, + "language_loss": 0.82735336, + "learning_rate": 3.389562634707122e-06, + "loss": 0.84625101, + "num_input_tokens_seen": 99847575, + "router_z_loss_clip": 3.13476562, + "router_z_loss_mlp": 0.31799316, + "step": 4621, + "time_per_iteration": 4.007266998291016 + }, + { + "auxiliary_loss_clip": 0.01598988, + "auxiliary_loss_mlp": 0.00279542, + "balance_loss_clip": 1.27712619, + "balance_loss_mlp": 0.2485718, + "epoch": 0.27788967383135427, + "flos": 25554571920000.0, + "grad_norm": 3.370325993982739, + "language_loss": 0.94443864, + "learning_rate": 3.389282499322611e-06, + "loss": 0.96322405, + "num_input_tokens_seen": 99864995, + "router_z_loss_clip": 3.21875, + "router_z_loss_mlp": 0.30957031, + "step": 4622, + "time_per_iteration": 2.637813091278076 + }, + { + "auxiliary_loss_clip": 0.01575449, + "auxiliary_loss_mlp": 0.00311614, + "balance_loss_clip": 1.25880432, + "balance_loss_mlp": 0.28040501, + "epoch": 0.27794979708402223, + "flos": 16252631481600.0, + "grad_norm": 36.25405837134604, + "language_loss": 0.90045464, + "learning_rate": 3.389002311256369e-06, + "loss": 0.91932523, + "num_input_tokens_seen": 99881540, + "router_z_loss_clip": 3.16796875, + "router_z_loss_mlp": 0.31225586, + "step": 4623, + "time_per_iteration": 4.141775608062744 + }, + { + "auxiliary_loss_clip": 0.01608817, + "auxiliary_loss_mlp": 0.00304766, + "balance_loss_clip": 1.28664196, + "balance_loss_mlp": 0.27424854, + "epoch": 0.2780099203366902, + "flos": 20667632791680.0, + "grad_norm": 21.36671290110447, + "language_loss": 0.88555604, + "learning_rate": 3.3887220705190204e-06, + "loss": 0.90469182, + "num_input_tokens_seen": 99899595, + "router_z_loss_clip": 3.22265625, + "router_z_loss_mlp": 0.30493164, + "step": 4624, + "time_per_iteration": 2.686006784439087 + }, + { + "auxiliary_loss_clip": 0.01596086, + "auxiliary_loss_mlp": 0.00267632, + "balance_loss_clip": 1.27988625, + "balance_loss_mlp": 0.23620847, + "epoch": 0.27807004358935816, + "flos": 17739583822080.0, + "grad_norm": 210.86693173980385, + "language_loss": 0.84688556, + "learning_rate": 3.388441777121191e-06, + "loss": 0.86552274, + "num_input_tokens_seen": 99913020, + "router_z_loss_clip": 3.16210938, + "router_z_loss_mlp": 0.31445312, + "step": 4625, + "time_per_iteration": 2.646076202392578 + }, + { + "auxiliary_loss_clip": 0.01583808, + "auxiliary_loss_mlp": 0.00275669, + "balance_loss_clip": 1.27165687, + "balance_loss_mlp": 0.24379244, + "epoch": 0.2781301668420261, + "flos": 16727119165440.0, + "grad_norm": 68.9408105294936, + "language_loss": 0.77647233, + "learning_rate": 3.388161431073511e-06, + "loss": 0.79506707, + "num_input_tokens_seen": 99931405, + "router_z_loss_clip": 3.12109375, + "router_z_loss_mlp": 0.31872559, + "step": 4626, + "time_per_iteration": 2.6243302822113037 + }, + { + "auxiliary_loss_clip": 0.01619271, + "auxiliary_loss_mlp": 0.00299992, + "balance_loss_clip": 1.29222524, + "balance_loss_mlp": 0.26878339, + "epoch": 0.27819029009469415, + "flos": 13844749317120.0, + "grad_norm": 2.7969993226072294, + "language_loss": 1.02144611, + "learning_rate": 3.38788103238661e-06, + "loss": 1.04063869, + "num_input_tokens_seen": 99948100, + "router_z_loss_clip": 3.26953125, + "router_z_loss_mlp": 0.31225586, + "step": 4627, + "time_per_iteration": 2.607041120529175 + }, + { + "auxiliary_loss_clip": 0.0159841, + "auxiliary_loss_mlp": 0.00299188, + "balance_loss_clip": 1.2749207, + "balance_loss_mlp": 0.27014881, + "epoch": 0.2782504133473621, + "flos": 27089286370560.0, + "grad_norm": 3.1215468305476413, + "language_loss": 0.91142803, + "learning_rate": 3.387600581071121e-06, + "loss": 0.93040407, + "num_input_tokens_seen": 99966470, + "router_z_loss_clip": 3.234375, + "router_z_loss_mlp": 0.29016113, + "step": 4628, + "time_per_iteration": 2.694209575653076 + }, + { + "auxiliary_loss_clip": 0.01582885, + "auxiliary_loss_mlp": 0.0030818, + "balance_loss_clip": 1.26539481, + "balance_loss_mlp": 0.27487323, + "epoch": 0.2783105366000301, + "flos": 21068826773760.0, + "grad_norm": 5.889525070693907, + "language_loss": 0.84191138, + "learning_rate": 3.387320077137679e-06, + "loss": 0.86082202, + "num_input_tokens_seen": 99985930, + "router_z_loss_clip": 3.17578125, + "router_z_loss_mlp": 0.33276367, + "step": 4629, + "time_per_iteration": 2.7062413692474365 + }, + { + "auxiliary_loss_clip": 0.01574135, + "auxiliary_loss_mlp": 0.00307547, + "balance_loss_clip": 1.26568246, + "balance_loss_mlp": 0.27811375, + "epoch": 0.27837065985269804, + "flos": 26501823434880.0, + "grad_norm": 8.882886865317023, + "language_loss": 0.89122188, + "learning_rate": 3.3870395205969208e-06, + "loss": 0.91003871, + "num_input_tokens_seen": 100006235, + "router_z_loss_clip": 3.08789062, + "router_z_loss_mlp": 0.29431152, + "step": 4630, + "time_per_iteration": 2.7005350589752197 + }, + { + "auxiliary_loss_clip": 0.01568982, + "auxiliary_loss_mlp": 0.00291474, + "balance_loss_clip": 1.2619797, + "balance_loss_mlp": 0.26083672, + "epoch": 0.278430783105366, + "flos": 20223201813120.0, + "grad_norm": 125.04851336576725, + "language_loss": 0.90355325, + "learning_rate": 3.386758911459485e-06, + "loss": 0.92215776, + "num_input_tokens_seen": 100023655, + "router_z_loss_clip": 3.0703125, + "router_z_loss_mlp": 0.30615234, + "step": 4631, + "time_per_iteration": 2.6541709899902344 + }, + { + "auxiliary_loss_clip": 0.01578673, + "auxiliary_loss_mlp": 0.0034988, + "balance_loss_clip": 1.26995945, + "balance_loss_mlp": 0.31790826, + "epoch": 0.278490906358034, + "flos": 25592888753280.0, + "grad_norm": 78.68601412124104, + "language_loss": 0.79526138, + "learning_rate": 3.3864782497360126e-06, + "loss": 0.81454688, + "num_input_tokens_seen": 100043280, + "router_z_loss_clip": 3.0859375, + "router_z_loss_mlp": 0.31982422, + "step": 4632, + "time_per_iteration": 4.065192699432373 + }, + { + "auxiliary_loss_clip": 0.01603373, + "auxiliary_loss_mlp": 0.00296513, + "balance_loss_clip": 1.29069543, + "balance_loss_mlp": 0.26597169, + "epoch": 0.27855102961070194, + "flos": 16171544528640.0, + "grad_norm": 43.06459144275631, + "language_loss": 0.87519211, + "learning_rate": 3.386197535437145e-06, + "loss": 0.89419097, + "num_input_tokens_seen": 100057690, + "router_z_loss_clip": 3.12695312, + "router_z_loss_mlp": 0.30517578, + "step": 4633, + "time_per_iteration": 2.5964860916137695 + }, + { + "auxiliary_loss_clip": 0.01602117, + "auxiliary_loss_mlp": 0.00329289, + "balance_loss_clip": 1.28624964, + "balance_loss_mlp": 0.2972818, + "epoch": 0.2786111528633699, + "flos": 22927598749440.0, + "grad_norm": 47.658037151536945, + "language_loss": 0.92635447, + "learning_rate": 3.385916768573529e-06, + "loss": 0.94566846, + "num_input_tokens_seen": 100075875, + "router_z_loss_clip": 3.16015625, + "router_z_loss_mlp": 0.31994629, + "step": 4634, + "time_per_iteration": 2.663015604019165 + }, + { + "auxiliary_loss_clip": 0.01611334, + "auxiliary_loss_mlp": 0.00355778, + "balance_loss_clip": 1.29139757, + "balance_loss_mlp": 0.32168448, + "epoch": 0.27867127611603787, + "flos": 23404205335680.0, + "grad_norm": 7.674788406087617, + "language_loss": 0.83177418, + "learning_rate": 3.38563594915581e-06, + "loss": 0.85144532, + "num_input_tokens_seen": 100092930, + "router_z_loss_clip": 3.19726562, + "router_z_loss_mlp": 0.34082031, + "step": 4635, + "time_per_iteration": 2.734494686126709 + }, + { + "auxiliary_loss_clip": 0.01604974, + "auxiliary_loss_mlp": 0.00363121, + "balance_loss_clip": 1.28768635, + "balance_loss_mlp": 0.33245987, + "epoch": 0.27873139936870583, + "flos": 19829010983040.0, + "grad_norm": 5.527385940079714, + "language_loss": 0.72194642, + "learning_rate": 3.385355077194637e-06, + "loss": 0.74162734, + "num_input_tokens_seen": 100110790, + "router_z_loss_clip": 3.17382812, + "router_z_loss_mlp": 0.30639648, + "step": 4636, + "time_per_iteration": 2.650726795196533 + }, + { + "auxiliary_loss_clip": 0.01569717, + "auxiliary_loss_mlp": 0.00370269, + "balance_loss_clip": 1.26028883, + "balance_loss_mlp": 0.33827296, + "epoch": 0.2787915226213738, + "flos": 17707659609600.0, + "grad_norm": 16.21700259165247, + "language_loss": 0.93640453, + "learning_rate": 3.3850741527006604e-06, + "loss": 0.95580435, + "num_input_tokens_seen": 100126970, + "router_z_loss_clip": 3.09570312, + "router_z_loss_mlp": 0.31982422, + "step": 4637, + "time_per_iteration": 2.607879400253296 + }, + { + "auxiliary_loss_clip": 0.01584217, + "auxiliary_loss_mlp": 0.00326159, + "balance_loss_clip": 1.27805281, + "balance_loss_mlp": 0.29723868, + "epoch": 0.27885164587404176, + "flos": 22090557139200.0, + "grad_norm": 11.459380443733988, + "language_loss": 0.83055186, + "learning_rate": 3.384793175684533e-06, + "loss": 0.84965563, + "num_input_tokens_seen": 100146720, + "router_z_loss_clip": 3.06054688, + "router_z_loss_mlp": 0.28918457, + "step": 4638, + "time_per_iteration": 2.6357314586639404 + }, + { + "auxiliary_loss_clip": 0.01599448, + "auxiliary_loss_mlp": 0.00317989, + "balance_loss_clip": 1.28620887, + "balance_loss_mlp": 0.28832984, + "epoch": 0.27891176912670973, + "flos": 19207684500480.0, + "grad_norm": 3.1586450689574246, + "language_loss": 0.79116398, + "learning_rate": 3.38451214615691e-06, + "loss": 0.81033838, + "num_input_tokens_seen": 100165920, + "router_z_loss_clip": 3.13476562, + "router_z_loss_mlp": 0.29663086, + "step": 4639, + "time_per_iteration": 2.6228363513946533 + }, + { + "auxiliary_loss_clip": 0.01602918, + "auxiliary_loss_mlp": 0.00338244, + "balance_loss_clip": 1.28529024, + "balance_loss_mlp": 0.30534187, + "epoch": 0.27897189237937775, + "flos": 27600007898880.0, + "grad_norm": 3.3646375847196306, + "language_loss": 0.75042534, + "learning_rate": 3.384231064128447e-06, + "loss": 0.7698369, + "num_input_tokens_seen": 100185525, + "router_z_loss_clip": 3.17578125, + "router_z_loss_mlp": 0.32910156, + "step": 4640, + "time_per_iteration": 2.680684804916382 + }, + { + "auxiliary_loss_clip": 0.01589465, + "auxiliary_loss_mlp": 0.00293235, + "balance_loss_clip": 1.27726936, + "balance_loss_mlp": 0.26505345, + "epoch": 0.2790320156320457, + "flos": 21178210665600.0, + "grad_norm": 8.587089712339315, + "language_loss": 0.79932547, + "learning_rate": 3.383949929609804e-06, + "loss": 0.81815243, + "num_input_tokens_seen": 100204850, + "router_z_loss_clip": 3.125, + "router_z_loss_mlp": 0.28198242, + "step": 4641, + "time_per_iteration": 2.6388065814971924 + }, + { + "auxiliary_loss_clip": 0.0159008, + "auxiliary_loss_mlp": 0.00307602, + "balance_loss_clip": 1.27904952, + "balance_loss_mlp": 0.27589187, + "epoch": 0.2790921388847137, + "flos": 22783920347520.0, + "grad_norm": 3.7395298864133086, + "language_loss": 0.84073955, + "learning_rate": 3.383668742611641e-06, + "loss": 0.8597163, + "num_input_tokens_seen": 100224520, + "router_z_loss_clip": 3.10742188, + "router_z_loss_mlp": 0.31713867, + "step": 4642, + "time_per_iteration": 2.6814684867858887 + }, + { + "auxiliary_loss_clip": 0.01583364, + "auxiliary_loss_mlp": 0.00365986, + "balance_loss_clip": 1.27534604, + "balance_loss_mlp": 0.33563581, + "epoch": 0.27915226213738165, + "flos": 23400649889280.0, + "grad_norm": 9.89059585062372, + "language_loss": 0.91325057, + "learning_rate": 3.3833875031446205e-06, + "loss": 0.93274409, + "num_input_tokens_seen": 100243935, + "router_z_loss_clip": 3.078125, + "router_z_loss_mlp": 0.30322266, + "step": 4643, + "time_per_iteration": 2.6561787128448486 + }, + { + "auxiliary_loss_clip": 0.01580578, + "auxiliary_loss_mlp": 0.00320976, + "balance_loss_clip": 1.26607442, + "balance_loss_mlp": 0.29036266, + "epoch": 0.2792123853900496, + "flos": 22747794243840.0, + "grad_norm": 2.240433272018533, + "language_loss": 0.88273239, + "learning_rate": 3.383106211219407e-06, + "loss": 0.90174794, + "num_input_tokens_seen": 100262290, + "router_z_loss_clip": 3.140625, + "router_z_loss_mlp": 0.30615234, + "step": 4644, + "time_per_iteration": 2.612318277359009 + }, + { + "auxiliary_loss_clip": 0.01595326, + "auxiliary_loss_mlp": 0.00313591, + "balance_loss_clip": 1.28029823, + "balance_loss_mlp": 0.28269216, + "epoch": 0.2792725086427176, + "flos": 15049372757760.0, + "grad_norm": 4.4783898912065645, + "language_loss": 0.86636418, + "learning_rate": 3.3828248668466673e-06, + "loss": 0.88545334, + "num_input_tokens_seen": 100280015, + "router_z_loss_clip": 3.14648438, + "router_z_loss_mlp": 0.30908203, + "step": 4645, + "time_per_iteration": 2.6397786140441895 + }, + { + "auxiliary_loss_clip": 0.01345419, + "auxiliary_loss_mlp": 0.00121615, + "balance_loss_clip": 1.11451316, + "balance_loss_mlp": 0.11498653, + "epoch": 0.27933263189538554, + "flos": 62544861757440.0, + "grad_norm": 0.7693157033174948, + "language_loss": 0.62047958, + "learning_rate": 3.3825434700370705e-06, + "loss": 0.63514996, + "num_input_tokens_seen": 100338935, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.06640625, + "step": 4646, + "time_per_iteration": 3.0961034297943115 + }, + { + "auxiliary_loss_clip": 0.01599694, + "auxiliary_loss_mlp": 0.00279291, + "balance_loss_clip": 1.28531325, + "balance_loss_mlp": 0.24886864, + "epoch": 0.2793927551480535, + "flos": 25118365155840.0, + "grad_norm": 9.400963403858782, + "language_loss": 0.92530847, + "learning_rate": 3.3822620208012865e-06, + "loss": 0.94409835, + "num_input_tokens_seen": 100359905, + "router_z_loss_clip": 3.140625, + "router_z_loss_mlp": 0.30407715, + "step": 4647, + "time_per_iteration": 2.6632654666900635 + }, + { + "auxiliary_loss_clip": 0.0159555, + "auxiliary_loss_mlp": 0.00335325, + "balance_loss_clip": 1.28249013, + "balance_loss_mlp": 0.30504617, + "epoch": 0.27945287840072147, + "flos": 21324582587520.0, + "grad_norm": 21.792047087840604, + "language_loss": 0.9149158, + "learning_rate": 3.381980519149988e-06, + "loss": 0.93422461, + "num_input_tokens_seen": 100376955, + "router_z_loss_clip": 3.12890625, + "router_z_loss_mlp": 0.30297852, + "step": 4648, + "time_per_iteration": 2.690333127975464 + }, + { + "auxiliary_loss_clip": 0.01594285, + "auxiliary_loss_mlp": 0.00326331, + "balance_loss_clip": 1.2798692, + "balance_loss_mlp": 0.29237992, + "epoch": 0.27951300165338944, + "flos": 27450547407360.0, + "grad_norm": 6.03952927016642, + "language_loss": 0.80887538, + "learning_rate": 3.38169896509385e-06, + "loss": 0.82808161, + "num_input_tokens_seen": 100397545, + "router_z_loss_clip": 3.14257812, + "router_z_loss_mlp": 0.33935547, + "step": 4649, + "time_per_iteration": 2.724867105484009 + }, + { + "auxiliary_loss_clip": 0.0158349, + "auxiliary_loss_mlp": 0.00296855, + "balance_loss_clip": 1.28237367, + "balance_loss_mlp": 0.26409596, + "epoch": 0.2795731249060574, + "flos": 15159008044800.0, + "grad_norm": 2.309082706861934, + "language_loss": 0.8973282, + "learning_rate": 3.381417358643549e-06, + "loss": 0.91613162, + "num_input_tokens_seen": 100415080, + "router_z_loss_clip": 3.01171875, + "router_z_loss_mlp": 0.32739258, + "step": 4650, + "time_per_iteration": 2.72514009475708 + }, + { + "auxiliary_loss_clip": 0.01318539, + "auxiliary_loss_mlp": 0.0010631, + "balance_loss_clip": 1.09032428, + "balance_loss_mlp": 0.10001602, + "epoch": 0.27963324815872537, + "flos": 60120103178880.0, + "grad_norm": 0.8065755201521717, + "language_loss": 0.58598304, + "learning_rate": 3.3811356998097624e-06, + "loss": 0.60023153, + "num_input_tokens_seen": 100471105, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.06298828, + "step": 4651, + "time_per_iteration": 3.1868698596954346 + }, + { + "auxiliary_loss_clip": 0.01598704, + "auxiliary_loss_mlp": 0.002965, + "balance_loss_clip": 1.28565764, + "balance_loss_mlp": 0.26426566, + "epoch": 0.27969337141139333, + "flos": 21765960910080.0, + "grad_norm": 2.409292006228867, + "language_loss": 0.81082332, + "learning_rate": 3.3808539886031726e-06, + "loss": 0.82977533, + "num_input_tokens_seen": 100492520, + "router_z_loss_clip": 3.1328125, + "router_z_loss_mlp": 0.32275391, + "step": 4652, + "time_per_iteration": 2.676682710647583 + }, + { + "auxiliary_loss_clip": 0.01599309, + "auxiliary_loss_mlp": 0.00273548, + "balance_loss_clip": 1.29238999, + "balance_loss_mlp": 0.24128976, + "epoch": 0.27975349466406135, + "flos": 39851398834560.0, + "grad_norm": 2.282969532173594, + "language_loss": 0.85929394, + "learning_rate": 3.380572225034461e-06, + "loss": 0.87802249, + "num_input_tokens_seen": 100512870, + "router_z_loss_clip": 3.06835938, + "router_z_loss_mlp": 0.32250977, + "step": 4653, + "time_per_iteration": 2.820068120956421 + }, + { + "auxiliary_loss_clip": 0.01587644, + "auxiliary_loss_mlp": 0.00284054, + "balance_loss_clip": 1.28299057, + "balance_loss_mlp": 0.25478843, + "epoch": 0.2798136179167293, + "flos": 21579799697280.0, + "grad_norm": 6.011123518874792, + "language_loss": 0.87384433, + "learning_rate": 3.380290409114312e-06, + "loss": 0.89256132, + "num_input_tokens_seen": 100531655, + "router_z_loss_clip": 3.04296875, + "router_z_loss_mlp": 0.29284668, + "step": 4654, + "time_per_iteration": 2.6798083782196045 + }, + { + "auxiliary_loss_clip": 0.01635706, + "auxiliary_loss_mlp": 0.00267069, + "balance_loss_clip": 1.31629753, + "balance_loss_mlp": 0.23596719, + "epoch": 0.2798737411693973, + "flos": 21537676022400.0, + "grad_norm": 118.2723476332369, + "language_loss": 0.89278573, + "learning_rate": 3.3800085408534127e-06, + "loss": 0.9118135, + "num_input_tokens_seen": 100548005, + "router_z_loss_clip": 3.1953125, + "router_z_loss_mlp": 0.31091309, + "step": 4655, + "time_per_iteration": 2.687196731567383 + }, + { + "auxiliary_loss_clip": 0.01612749, + "auxiliary_loss_mlp": 0.0026334, + "balance_loss_clip": 1.30090249, + "balance_loss_mlp": 0.23322769, + "epoch": 0.27993386442206525, + "flos": 26981051713920.0, + "grad_norm": 202.88922219888045, + "language_loss": 0.87620574, + "learning_rate": 3.3797266202624506e-06, + "loss": 0.8949666, + "num_input_tokens_seen": 100567980, + "router_z_loss_clip": 3.11914062, + "router_z_loss_mlp": 0.30102539, + "step": 4656, + "time_per_iteration": 2.7527873516082764 + }, + { + "auxiliary_loss_clip": 0.0160799, + "auxiliary_loss_mlp": 0.00253687, + "balance_loss_clip": 1.30306864, + "balance_loss_mlp": 0.22533867, + "epoch": 0.2799939876747332, + "flos": 24349876652160.0, + "grad_norm": 55.75280247176119, + "language_loss": 0.88150108, + "learning_rate": 3.3794446473521176e-06, + "loss": 0.90011787, + "num_input_tokens_seen": 100588630, + "router_z_loss_clip": 3.04882812, + "router_z_loss_mlp": 0.28356934, + "step": 4657, + "time_per_iteration": 2.7641730308532715 + }, + { + "auxiliary_loss_clip": 0.01633502, + "auxiliary_loss_mlp": 0.00275981, + "balance_loss_clip": 1.32360387, + "balance_loss_mlp": 0.24665537, + "epoch": 0.2800541109274012, + "flos": 33656988648960.0, + "grad_norm": 67.93379707015656, + "language_loss": 0.72181153, + "learning_rate": 3.379162622133105e-06, + "loss": 0.74090642, + "num_input_tokens_seen": 100608775, + "router_z_loss_clip": 3.09765625, + "router_z_loss_mlp": 0.29321289, + "step": 4658, + "time_per_iteration": 2.749008893966675 + }, + { + "auxiliary_loss_clip": 0.0164013, + "auxiliary_loss_mlp": 0.00253387, + "balance_loss_clip": 1.32042897, + "balance_loss_mlp": 0.22210626, + "epoch": 0.28011423418006914, + "flos": 21614417429760.0, + "grad_norm": 2.147522034004586, + "language_loss": 0.87796485, + "learning_rate": 3.3788805446161073e-06, + "loss": 0.8969, + "num_input_tokens_seen": 100627975, + "router_z_loss_clip": 3.1953125, + "router_z_loss_mlp": 0.31298828, + "step": 4659, + "time_per_iteration": 4.165915250778198 + }, + { + "auxiliary_loss_clip": 0.01620776, + "auxiliary_loss_mlp": 0.00243709, + "balance_loss_clip": 1.31906676, + "balance_loss_mlp": 0.21369207, + "epoch": 0.2801743574327371, + "flos": 23112431159040.0, + "grad_norm": 6.023846862849659, + "language_loss": 0.86748183, + "learning_rate": 3.3785984148118215e-06, + "loss": 0.88612676, + "num_input_tokens_seen": 100645430, + "router_z_loss_clip": 3.015625, + "router_z_loss_mlp": 0.29980469, + "step": 4660, + "time_per_iteration": 2.637861490249634 + }, + { + "auxiliary_loss_clip": 0.01656307, + "auxiliary_loss_mlp": 0.00255534, + "balance_loss_clip": 1.34179986, + "balance_loss_mlp": 0.22675736, + "epoch": 0.2802344806854051, + "flos": 12641418766080.0, + "grad_norm": 21.37424897571569, + "language_loss": 0.90345782, + "learning_rate": 3.3783162327309453e-06, + "loss": 0.92257631, + "num_input_tokens_seen": 100663775, + "router_z_loss_clip": 3.14453125, + "router_z_loss_mlp": 0.28759766, + "step": 4661, + "time_per_iteration": 2.6170201301574707 + }, + { + "auxiliary_loss_clip": 0.01696332, + "auxiliary_loss_mlp": 0.00270203, + "balance_loss_clip": 1.3813374, + "balance_loss_mlp": 0.2392564, + "epoch": 0.28029460393807304, + "flos": 37267878142080.0, + "grad_norm": 4.4410778757899845, + "language_loss": 0.85267138, + "learning_rate": 3.3780339983841794e-06, + "loss": 0.87233675, + "num_input_tokens_seen": 100686085, + "router_z_loss_clip": 3.15039062, + "router_z_loss_mlp": 0.30957031, + "step": 4662, + "time_per_iteration": 2.835235595703125 + }, + { + "auxiliary_loss_clip": 0.01659932, + "auxiliary_loss_mlp": 0.00279634, + "balance_loss_clip": 1.3438859, + "balance_loss_mlp": 0.24456219, + "epoch": 0.280354727190741, + "flos": 20741106061440.0, + "grad_norm": 12.22859604157899, + "language_loss": 0.77876139, + "learning_rate": 3.377751711782227e-06, + "loss": 0.79815704, + "num_input_tokens_seen": 100705135, + "router_z_loss_clip": 3.16015625, + "router_z_loss_mlp": 0.35083008, + "step": 4663, + "time_per_iteration": 4.022468328475952 + }, + { + "auxiliary_loss_clip": 0.01674705, + "auxiliary_loss_mlp": 0.00258098, + "balance_loss_clip": 1.35780787, + "balance_loss_mlp": 0.22715141, + "epoch": 0.28041485044340897, + "flos": 21471026336640.0, + "grad_norm": 59.48558963338177, + "language_loss": 0.87360942, + "learning_rate": 3.377469372935791e-06, + "loss": 0.89293742, + "num_input_tokens_seen": 100724960, + "router_z_loss_clip": 3.16992188, + "router_z_loss_mlp": 0.30957031, + "step": 4664, + "time_per_iteration": 2.6585397720336914 + }, + { + "auxiliary_loss_clip": 0.01651125, + "auxiliary_loss_mlp": 0.00246852, + "balance_loss_clip": 1.34845352, + "balance_loss_mlp": 0.21609627, + "epoch": 0.28047497369607693, + "flos": 14794263388800.0, + "grad_norm": 2.486962505327289, + "language_loss": 0.86085814, + "learning_rate": 3.377186981855578e-06, + "loss": 0.87983793, + "num_input_tokens_seen": 100741995, + "router_z_loss_clip": 3.02929688, + "router_z_loss_mlp": 0.30749512, + "step": 4665, + "time_per_iteration": 4.002792596817017 + }, + { + "auxiliary_loss_clip": 0.0165657, + "auxiliary_loss_mlp": 0.00238621, + "balance_loss_clip": 1.34856427, + "balance_loss_mlp": 0.20891395, + "epoch": 0.2805350969487449, + "flos": 23070738447360.0, + "grad_norm": 38.928299230022056, + "language_loss": 0.8765465, + "learning_rate": 3.3769045385522968e-06, + "loss": 0.89549839, + "num_input_tokens_seen": 100758985, + "router_z_loss_clip": 3.08007812, + "router_z_loss_mlp": 0.29711914, + "step": 4666, + "time_per_iteration": 2.6525614261627197 + }, + { + "auxiliary_loss_clip": 0.01690248, + "auxiliary_loss_mlp": 0.00283673, + "balance_loss_clip": 1.37830782, + "balance_loss_mlp": 0.25370368, + "epoch": 0.2805952202014129, + "flos": 20479855466880.0, + "grad_norm": 19.345630329293527, + "language_loss": 0.92719686, + "learning_rate": 3.376622043036658e-06, + "loss": 0.94693613, + "num_input_tokens_seen": 100777820, + "router_z_loss_clip": 3.11914062, + "router_z_loss_mlp": 0.29968262, + "step": 4667, + "time_per_iteration": 2.7392404079437256 + }, + { + "auxiliary_loss_clip": 0.01682229, + "auxiliary_loss_mlp": 0.00258197, + "balance_loss_clip": 1.36356843, + "balance_loss_mlp": 0.22858526, + "epoch": 0.2806553434540809, + "flos": 27417330305280.0, + "grad_norm": 69.1957259778857, + "language_loss": 0.86132646, + "learning_rate": 3.376339495319373e-06, + "loss": 0.88073069, + "num_input_tokens_seen": 100798205, + "router_z_loss_clip": 3.18554688, + "router_z_loss_mlp": 0.29602051, + "step": 4668, + "time_per_iteration": 2.7748167514801025 + }, + { + "auxiliary_loss_clip": 0.01698897, + "auxiliary_loss_mlp": 0.00264513, + "balance_loss_clip": 1.37199473, + "balance_loss_mlp": 0.22905988, + "epoch": 0.28071546670674885, + "flos": 26505019745280.0, + "grad_norm": 8.043220347628944, + "language_loss": 0.81349754, + "learning_rate": 3.3760568954111563e-06, + "loss": 0.83313167, + "num_input_tokens_seen": 100819800, + "router_z_loss_clip": 3.2734375, + "router_z_loss_mlp": 0.35449219, + "step": 4669, + "time_per_iteration": 2.7716145515441895 + }, + { + "auxiliary_loss_clip": 0.01665705, + "auxiliary_loss_mlp": 0.00255011, + "balance_loss_clip": 1.35506904, + "balance_loss_mlp": 0.22327772, + "epoch": 0.2807755899594168, + "flos": 20558679863040.0, + "grad_norm": 963.2182609046885, + "language_loss": 0.87411159, + "learning_rate": 3.375774243322725e-06, + "loss": 0.89331877, + "num_input_tokens_seen": 100837880, + "router_z_loss_clip": 3.10351562, + "router_z_loss_mlp": 0.31738281, + "step": 4670, + "time_per_iteration": 2.6526458263397217 + }, + { + "auxiliary_loss_clip": 0.01674316, + "auxiliary_loss_mlp": 0.00253883, + "balance_loss_clip": 1.36345243, + "balance_loss_mlp": 0.2204091, + "epoch": 0.2808357132120848, + "flos": 24313319585280.0, + "grad_norm": 40.388095305024486, + "language_loss": 0.89394146, + "learning_rate": 3.3754915390647955e-06, + "loss": 0.91322351, + "num_input_tokens_seen": 100856350, + "router_z_loss_clip": 3.10742188, + "router_z_loss_mlp": 0.33447266, + "step": 4671, + "time_per_iteration": 2.6808464527130127 + }, + { + "auxiliary_loss_clip": 0.01693974, + "auxiliary_loss_mlp": 0.0023103, + "balance_loss_clip": 1.38039494, + "balance_loss_mlp": 0.20084664, + "epoch": 0.28089583646475275, + "flos": 26432408401920.0, + "grad_norm": 5.325396187758985, + "language_loss": 0.81257355, + "learning_rate": 3.37520878264809e-06, + "loss": 0.83182359, + "num_input_tokens_seen": 100876135, + "router_z_loss_clip": 3.13671875, + "router_z_loss_mlp": 0.30200195, + "step": 4672, + "time_per_iteration": 2.677990436553955 + }, + { + "auxiliary_loss_clip": 0.01684123, + "auxiliary_loss_mlp": 0.00261831, + "balance_loss_clip": 1.36937022, + "balance_loss_mlp": 0.22735575, + "epoch": 0.2809559597174207, + "flos": 23111820627840.0, + "grad_norm": 75.45867886544858, + "language_loss": 0.85301149, + "learning_rate": 3.3749259740833286e-06, + "loss": 0.87247097, + "num_input_tokens_seen": 100894790, + "router_z_loss_clip": 3.15039062, + "router_z_loss_mlp": 0.3449707, + "step": 4673, + "time_per_iteration": 2.6267406940460205 + }, + { + "auxiliary_loss_clip": 0.01674223, + "auxiliary_loss_mlp": 0.00255654, + "balance_loss_clip": 1.36602974, + "balance_loss_mlp": 0.22380091, + "epoch": 0.2810160829700887, + "flos": 20923496346240.0, + "grad_norm": 9.332460671539687, + "language_loss": 0.79087758, + "learning_rate": 3.374643113381237e-06, + "loss": 0.81017631, + "num_input_tokens_seen": 100915100, + "router_z_loss_clip": 3.08007812, + "router_z_loss_mlp": 0.31860352, + "step": 4674, + "time_per_iteration": 4.024905204772949 + }, + { + "auxiliary_loss_clip": 0.01657641, + "auxiliary_loss_mlp": 0.0022761, + "balance_loss_clip": 1.35396099, + "balance_loss_mlp": 0.19661546, + "epoch": 0.28107620622275664, + "flos": 14355901808640.0, + "grad_norm": 11.516676175197956, + "language_loss": 0.8464402, + "learning_rate": 3.374360200552541e-06, + "loss": 0.86529273, + "num_input_tokens_seen": 100932795, + "router_z_loss_clip": 3.03515625, + "router_z_loss_mlp": 0.31030273, + "step": 4675, + "time_per_iteration": 2.6272220611572266 + }, + { + "auxiliary_loss_clip": 0.0166156, + "auxiliary_loss_mlp": 0.00225744, + "balance_loss_clip": 1.35475457, + "balance_loss_mlp": 0.19470204, + "epoch": 0.2811363294754246, + "flos": 20919078973440.0, + "grad_norm": 19.224872172372645, + "language_loss": 0.78491974, + "learning_rate": 3.374077235607968e-06, + "loss": 0.80379283, + "num_input_tokens_seen": 100950505, + "router_z_loss_clip": 3.0625, + "router_z_loss_mlp": 0.31054688, + "step": 4676, + "time_per_iteration": 2.694166898727417 + }, + { + "auxiliary_loss_clip": 0.01661827, + "auxiliary_loss_mlp": 0.00229228, + "balance_loss_clip": 1.35355949, + "balance_loss_mlp": 0.19966441, + "epoch": 0.28119645272809257, + "flos": 20594841880320.0, + "grad_norm": 179.06059161888058, + "language_loss": 0.77235931, + "learning_rate": 3.3737942185582487e-06, + "loss": 0.7912699, + "num_input_tokens_seen": 100968790, + "router_z_loss_clip": 3.08398438, + "router_z_loss_mlp": 0.29541016, + "step": 4677, + "time_per_iteration": 2.622537851333618 + }, + { + "auxiliary_loss_clip": 0.01628561, + "auxiliary_loss_mlp": 0.00237767, + "balance_loss_clip": 1.33231544, + "balance_loss_mlp": 0.20739228, + "epoch": 0.28125657598076054, + "flos": 25337420248320.0, + "grad_norm": 17.650262076095398, + "language_loss": 0.69706202, + "learning_rate": 3.3735111494141153e-06, + "loss": 0.7157253, + "num_input_tokens_seen": 100990205, + "router_z_loss_clip": 2.96289062, + "router_z_loss_mlp": 0.3034668, + "step": 4678, + "time_per_iteration": 2.6600399017333984 + }, + { + "auxiliary_loss_clip": 0.01608766, + "auxiliary_loss_mlp": 0.00224259, + "balance_loss_clip": 1.31726384, + "balance_loss_mlp": 0.19419467, + "epoch": 0.2813166992334285, + "flos": 24827093769600.0, + "grad_norm": 50.077661588530745, + "language_loss": 0.77269304, + "learning_rate": 3.3732280281863013e-06, + "loss": 0.79102325, + "num_input_tokens_seen": 101009815, + "router_z_loss_clip": 2.91796875, + "router_z_loss_mlp": 0.30078125, + "step": 4679, + "time_per_iteration": 2.6743009090423584 + }, + { + "auxiliary_loss_clip": 0.01644159, + "auxiliary_loss_mlp": 0.00220089, + "balance_loss_clip": 1.34293056, + "balance_loss_mlp": 0.18778336, + "epoch": 0.2813768224860965, + "flos": 21760753438080.0, + "grad_norm": 3.054637002136198, + "language_loss": 0.82242954, + "learning_rate": 3.3729448548855422e-06, + "loss": 0.84107202, + "num_input_tokens_seen": 101026780, + "router_z_loss_clip": 3.015625, + "router_z_loss_mlp": 0.32299805, + "step": 4680, + "time_per_iteration": 2.637751340866089 + }, + { + "auxiliary_loss_clip": 0.01603688, + "auxiliary_loss_mlp": 0.00225466, + "balance_loss_clip": 1.31495237, + "balance_loss_mlp": 0.19618854, + "epoch": 0.2814369457387645, + "flos": 24316803204480.0, + "grad_norm": 57.16248163343582, + "language_loss": 0.84591174, + "learning_rate": 3.3726616295225774e-06, + "loss": 0.86420333, + "num_input_tokens_seen": 101046215, + "router_z_loss_clip": 2.88476562, + "router_z_loss_mlp": 0.29309082, + "step": 4681, + "time_per_iteration": 2.6839046478271484 + }, + { + "auxiliary_loss_clip": 0.0164386, + "auxiliary_loss_mlp": 0.00257883, + "balance_loss_clip": 1.33828688, + "balance_loss_mlp": 0.22681752, + "epoch": 0.28149706899143245, + "flos": 18515326872960.0, + "grad_norm": 55.90385149030006, + "language_loss": 0.83569509, + "learning_rate": 3.372378352108146e-06, + "loss": 0.85471255, + "num_input_tokens_seen": 101063365, + "router_z_loss_clip": 3.0546875, + "router_z_loss_mlp": 0.31066895, + "step": 4682, + "time_per_iteration": 2.636246919631958 + }, + { + "auxiliary_loss_clip": 0.0163367, + "auxiliary_loss_mlp": 0.00230523, + "balance_loss_clip": 1.33894205, + "balance_loss_mlp": 0.20236562, + "epoch": 0.2815571922441004, + "flos": 24863255786880.0, + "grad_norm": 16.4274315057748, + "language_loss": 0.87505263, + "learning_rate": 3.3720950226529894e-06, + "loss": 0.89369458, + "num_input_tokens_seen": 101083835, + "router_z_loss_clip": 2.94921875, + "router_z_loss_mlp": 0.28149414, + "step": 4683, + "time_per_iteration": 2.652150869369507 + }, + { + "auxiliary_loss_clip": 0.01625457, + "auxiliary_loss_mlp": 0.00256421, + "balance_loss_clip": 1.32566202, + "balance_loss_mlp": 0.22602242, + "epoch": 0.2816173154967684, + "flos": 19901622326400.0, + "grad_norm": 3682.7688563605384, + "language_loss": 0.82746804, + "learning_rate": 3.371811641167852e-06, + "loss": 0.84628689, + "num_input_tokens_seen": 101101740, + "router_z_loss_clip": 3.0, + "router_z_loss_mlp": 0.30395508, + "step": 4684, + "time_per_iteration": 2.6531569957733154 + }, + { + "auxiliary_loss_clip": 0.01601331, + "auxiliary_loss_mlp": 0.00196203, + "balance_loss_clip": 1.31345677, + "balance_loss_mlp": 0.16570926, + "epoch": 0.28167743874943635, + "flos": 17491333950720.0, + "grad_norm": 1.986915627223217, + "language_loss": 0.85059953, + "learning_rate": 3.3715282076634807e-06, + "loss": 0.86857486, + "num_input_tokens_seen": 101120480, + "router_z_loss_clip": 2.87890625, + "router_z_loss_mlp": 0.30493164, + "step": 4685, + "time_per_iteration": 2.612409830093384 + }, + { + "auxiliary_loss_clip": 0.01618363, + "auxiliary_loss_mlp": 0.00262594, + "balance_loss_clip": 1.32211316, + "balance_loss_mlp": 0.23090824, + "epoch": 0.2817375620021043, + "flos": 25302120157440.0, + "grad_norm": 2.3967575770447476, + "language_loss": 0.82967412, + "learning_rate": 3.3712447221506218e-06, + "loss": 0.84848362, + "num_input_tokens_seen": 101142910, + "router_z_loss_clip": 2.96289062, + "router_z_loss_mlp": 0.31665039, + "step": 4686, + "time_per_iteration": 2.6875386238098145 + }, + { + "auxiliary_loss_clip": 0.01609973, + "auxiliary_loss_mlp": 0.00285256, + "balance_loss_clip": 1.31171894, + "balance_loss_mlp": 0.24927905, + "epoch": 0.2817976852547723, + "flos": 18693227957760.0, + "grad_norm": 3.1787296748717178, + "language_loss": 0.75641596, + "learning_rate": 3.370961184640025e-06, + "loss": 0.77536827, + "num_input_tokens_seen": 101160030, + "router_z_loss_clip": 2.97851562, + "router_z_loss_mlp": 0.35986328, + "step": 4687, + "time_per_iteration": 2.6094439029693604 + }, + { + "auxiliary_loss_clip": 0.01587667, + "auxiliary_loss_mlp": 0.0027306, + "balance_loss_clip": 1.29992533, + "balance_loss_mlp": 0.24082616, + "epoch": 0.28185780850744024, + "flos": 22742263549440.0, + "grad_norm": 2.9333868868963364, + "language_loss": 0.81783873, + "learning_rate": 3.3706775951424433e-06, + "loss": 0.83644599, + "num_input_tokens_seen": 101177675, + "router_z_loss_clip": 2.87890625, + "router_z_loss_mlp": 0.32202148, + "step": 4688, + "time_per_iteration": 2.6644248962402344 + }, + { + "auxiliary_loss_clip": 0.01624791, + "auxiliary_loss_mlp": 0.00246188, + "balance_loss_clip": 1.32582569, + "balance_loss_mlp": 0.21804251, + "epoch": 0.2819179317601082, + "flos": 14933919467520.0, + "grad_norm": 9.43176201369757, + "language_loss": 0.86257523, + "learning_rate": 3.37039395366863e-06, + "loss": 0.88128507, + "num_input_tokens_seen": 101192225, + "router_z_loss_clip": 2.98828125, + "router_z_loss_mlp": 0.28161621, + "step": 4689, + "time_per_iteration": 2.6134145259857178 + }, + { + "auxiliary_loss_clip": 0.01613291, + "auxiliary_loss_mlp": 0.00279853, + "balance_loss_clip": 1.31959069, + "balance_loss_mlp": 0.24957344, + "epoch": 0.2819780550127762, + "flos": 23145325038720.0, + "grad_norm": 24.25863405412492, + "language_loss": 0.84549701, + "learning_rate": 3.37011026022934e-06, + "loss": 0.86442846, + "num_input_tokens_seen": 101210870, + "router_z_loss_clip": 2.93554688, + "router_z_loss_mlp": 0.30297852, + "step": 4690, + "time_per_iteration": 2.6709845066070557 + }, + { + "auxiliary_loss_clip": 0.01597002, + "auxiliary_loss_mlp": 0.00286513, + "balance_loss_clip": 1.30673528, + "balance_loss_mlp": 0.2578311, + "epoch": 0.28203817826544414, + "flos": 21616356764160.0, + "grad_norm": 3.4096801956659735, + "language_loss": 0.96184105, + "learning_rate": 3.369826514835332e-06, + "loss": 0.98067617, + "num_input_tokens_seen": 101229965, + "router_z_loss_clip": 2.90429688, + "router_z_loss_mlp": 0.28710938, + "step": 4691, + "time_per_iteration": 2.697049379348755 + }, + { + "auxiliary_loss_clip": 0.01616103, + "auxiliary_loss_mlp": 0.00268462, + "balance_loss_clip": 1.3186121, + "balance_loss_mlp": 0.23875535, + "epoch": 0.2820983015181121, + "flos": 24026788794240.0, + "grad_norm": 31.19083363063642, + "language_loss": 0.87851799, + "learning_rate": 3.3695427174973654e-06, + "loss": 0.89736366, + "num_input_tokens_seen": 101250980, + "router_z_loss_clip": 2.9765625, + "router_z_loss_mlp": 0.29663086, + "step": 4692, + "time_per_iteration": 2.6714847087860107 + }, + { + "auxiliary_loss_clip": 0.01608532, + "auxiliary_loss_mlp": 0.00305365, + "balance_loss_clip": 1.31909978, + "balance_loss_mlp": 0.27561063, + "epoch": 0.2821584247707801, + "flos": 30007925976960.0, + "grad_norm": 4.525560818070334, + "language_loss": 0.80368114, + "learning_rate": 3.3692588682262022e-06, + "loss": 0.82282007, + "num_input_tokens_seen": 101273335, + "router_z_loss_clip": 2.89453125, + "router_z_loss_mlp": 0.29760742, + "step": 4693, + "time_per_iteration": 2.721050500869751 + }, + { + "auxiliary_loss_clip": 0.01604025, + "auxiliary_loss_mlp": 0.0029195, + "balance_loss_clip": 1.31114423, + "balance_loss_mlp": 0.26167127, + "epoch": 0.2822185480234481, + "flos": 21396762967680.0, + "grad_norm": 2.8311450666874833, + "language_loss": 0.85904908, + "learning_rate": 3.3689749670326046e-06, + "loss": 0.87800884, + "num_input_tokens_seen": 101292110, + "router_z_loss_clip": 2.9296875, + "router_z_loss_mlp": 0.30273438, + "step": 4694, + "time_per_iteration": 2.7414908409118652 + }, + { + "auxiliary_loss_clip": 0.01617627, + "auxiliary_loss_mlp": 0.00293478, + "balance_loss_clip": 1.31991625, + "balance_loss_mlp": 0.26460534, + "epoch": 0.28227867127611606, + "flos": 27452809964160.0, + "grad_norm": 9.731992252037038, + "language_loss": 0.72721547, + "learning_rate": 3.3686910139273392e-06, + "loss": 0.74632645, + "num_input_tokens_seen": 101312815, + "router_z_loss_clip": 2.98242188, + "router_z_loss_mlp": 0.28857422, + "step": 4695, + "time_per_iteration": 2.7868998050689697 + }, + { + "auxiliary_loss_clip": 0.01633638, + "auxiliary_loss_mlp": 0.00343873, + "balance_loss_clip": 1.33331573, + "balance_loss_mlp": 0.31109065, + "epoch": 0.282338794528784, + "flos": 22593736811520.0, + "grad_norm": 13.943455410931433, + "language_loss": 0.84865189, + "learning_rate": 3.3684070089211736e-06, + "loss": 0.86842704, + "num_input_tokens_seen": 101329045, + "router_z_loss_clip": 3.00195312, + "router_z_loss_mlp": 0.32763672, + "step": 4696, + "time_per_iteration": 2.7816009521484375 + }, + { + "auxiliary_loss_clip": 0.01584245, + "auxiliary_loss_mlp": 0.00325356, + "balance_loss_clip": 1.29527271, + "balance_loss_mlp": 0.29548216, + "epoch": 0.282398917781452, + "flos": 42010923386880.0, + "grad_norm": 48.82236989271834, + "language_loss": 0.68643045, + "learning_rate": 3.368122952024877e-06, + "loss": 0.70552647, + "num_input_tokens_seen": 101352715, + "router_z_loss_clip": 2.88867188, + "router_z_loss_mlp": 0.29833984, + "step": 4697, + "time_per_iteration": 2.86149001121521 + }, + { + "auxiliary_loss_clip": 0.0158217, + "auxiliary_loss_mlp": 0.00307672, + "balance_loss_clip": 1.29461598, + "balance_loss_mlp": 0.27932429, + "epoch": 0.28245904103411995, + "flos": 23224724052480.0, + "grad_norm": 151.0128702357655, + "language_loss": 0.78714895, + "learning_rate": 3.3678388432492214e-06, + "loss": 0.80604738, + "num_input_tokens_seen": 101374640, + "router_z_loss_clip": 2.875, + "router_z_loss_mlp": 0.28369141, + "step": 4698, + "time_per_iteration": 2.755028247833252 + }, + { + "auxiliary_loss_clip": 0.016012, + "auxiliary_loss_mlp": 0.00285391, + "balance_loss_clip": 1.30688918, + "balance_loss_mlp": 0.2548492, + "epoch": 0.2825191642867879, + "flos": 25374623760000.0, + "grad_norm": 3.534610631394604, + "language_loss": 0.82200825, + "learning_rate": 3.3675546826049788e-06, + "loss": 0.8408742, + "num_input_tokens_seen": 101393595, + "router_z_loss_clip": 2.94335938, + "router_z_loss_mlp": 0.30541992, + "step": 4699, + "time_per_iteration": 2.6520562171936035 + }, + { + "auxiliary_loss_clip": 0.01628401, + "auxiliary_loss_mlp": 0.00311691, + "balance_loss_clip": 1.32533312, + "balance_loss_mlp": 0.28091121, + "epoch": 0.2825792875394559, + "flos": 17236799199360.0, + "grad_norm": 11.106100989706265, + "language_loss": 0.94904602, + "learning_rate": 3.3672704701029265e-06, + "loss": 0.96844697, + "num_input_tokens_seen": 101409265, + "router_z_loss_clip": 3.03125, + "router_z_loss_mlp": 0.30761719, + "step": 4700, + "time_per_iteration": 2.619940757751465 + }, + { + "auxiliary_loss_clip": 0.01608649, + "auxiliary_loss_mlp": 0.00277641, + "balance_loss_clip": 1.30747867, + "balance_loss_mlp": 0.25031805, + "epoch": 0.28263941079212385, + "flos": 26723967096960.0, + "grad_norm": 7.796887182821752, + "language_loss": 0.90085959, + "learning_rate": 3.3669862057538402e-06, + "loss": 0.9197225, + "num_input_tokens_seen": 101428365, + "router_z_loss_clip": 3.01367188, + "router_z_loss_mlp": 0.2734375, + "step": 4701, + "time_per_iteration": 4.11996865272522 + }, + { + "auxiliary_loss_clip": 0.01640716, + "auxiliary_loss_mlp": 0.0029967, + "balance_loss_clip": 1.33063745, + "balance_loss_mlp": 0.26972461, + "epoch": 0.2826995340447918, + "flos": 25921327737600.0, + "grad_norm": 8.710553034497698, + "language_loss": 0.82103479, + "learning_rate": 3.3667018895685004e-06, + "loss": 0.8404386, + "num_input_tokens_seen": 101447280, + "router_z_loss_clip": 3.09960938, + "router_z_loss_mlp": 0.29943848, + "step": 4702, + "time_per_iteration": 2.701606273651123 + }, + { + "auxiliary_loss_clip": 0.01625542, + "auxiliary_loss_mlp": 0.00305274, + "balance_loss_clip": 1.32742643, + "balance_loss_mlp": 0.27741483, + "epoch": 0.2827596572974598, + "flos": 22379709623040.0, + "grad_norm": 4.979133582492381, + "language_loss": 0.84778261, + "learning_rate": 3.3664175215576886e-06, + "loss": 0.86709082, + "num_input_tokens_seen": 101465435, + "router_z_loss_clip": 2.984375, + "router_z_loss_mlp": 0.27844238, + "step": 4703, + "time_per_iteration": 2.8046929836273193 + }, + { + "auxiliary_loss_clip": 0.01624231, + "auxiliary_loss_mlp": 0.00327248, + "balance_loss_clip": 1.32174087, + "balance_loss_mlp": 0.29546687, + "epoch": 0.28281978055012774, + "flos": 33547137880320.0, + "grad_norm": 14.798348708245324, + "language_loss": 0.76076365, + "learning_rate": 3.3661331017321867e-06, + "loss": 0.78027844, + "num_input_tokens_seen": 101486355, + "router_z_loss_clip": 3.0234375, + "router_z_loss_mlp": 0.31762695, + "step": 4704, + "time_per_iteration": 2.7232794761657715 + }, + { + "auxiliary_loss_clip": 0.016388, + "auxiliary_loss_mlp": 0.00338723, + "balance_loss_clip": 1.32434821, + "balance_loss_mlp": 0.3053925, + "epoch": 0.2828799038027957, + "flos": 23440870143360.0, + "grad_norm": 4.0721411801949365, + "language_loss": 0.77615917, + "learning_rate": 3.3658486301027807e-06, + "loss": 0.79593444, + "num_input_tokens_seen": 101505875, + "router_z_loss_clip": 3.14257812, + "router_z_loss_mlp": 0.33325195, + "step": 4705, + "time_per_iteration": 4.069917440414429 + }, + { + "auxiliary_loss_clip": 0.01472091, + "auxiliary_loss_mlp": 0.00268633, + "balance_loss_clip": 1.25661004, + "balance_loss_mlp": 0.25337416, + "epoch": 0.2829400270554637, + "flos": 69873690251520.0, + "grad_norm": 0.7191338470940035, + "language_loss": 0.59051615, + "learning_rate": 3.3655641066802577e-06, + "loss": 0.60792339, + "num_input_tokens_seen": 101565045, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.15234375, + "step": 4706, + "time_per_iteration": 3.23539662361145 + }, + { + "auxiliary_loss_clip": 0.01627421, + "auxiliary_loss_mlp": 0.00356863, + "balance_loss_clip": 1.32206213, + "balance_loss_mlp": 0.32673913, + "epoch": 0.2830001503081317, + "flos": 24789028331520.0, + "grad_norm": 1.5433311862065702, + "language_loss": 0.86172593, + "learning_rate": 3.365279531475407e-06, + "loss": 0.88156879, + "num_input_tokens_seen": 101585825, + "router_z_loss_clip": 3.05273438, + "router_z_loss_mlp": 0.30126953, + "step": 4707, + "time_per_iteration": 2.656975746154785 + }, + { + "auxiliary_loss_clip": 0.01613262, + "auxiliary_loss_mlp": 0.00345227, + "balance_loss_clip": 1.30934143, + "balance_loss_mlp": 0.31287348, + "epoch": 0.28306027356079966, + "flos": 27669387018240.0, + "grad_norm": 11.232725228398703, + "language_loss": 0.87092268, + "learning_rate": 3.36499490449902e-06, + "loss": 0.89050758, + "num_input_tokens_seen": 101606105, + "router_z_loss_clip": 3.04101562, + "router_z_loss_mlp": 0.32348633, + "step": 4708, + "time_per_iteration": 4.1207544803619385 + }, + { + "auxiliary_loss_clip": 0.01430991, + "auxiliary_loss_mlp": 0.00305249, + "balance_loss_clip": 1.22730792, + "balance_loss_mlp": 0.28979927, + "epoch": 0.2831203968134676, + "flos": 60527938199040.0, + "grad_norm": 6.908157066103251, + "language_loss": 0.62797505, + "learning_rate": 3.3647102257618895e-06, + "loss": 0.6453374, + "num_input_tokens_seen": 101656875, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.15429688, + "step": 4709, + "time_per_iteration": 3.005810260772705 + }, + { + "auxiliary_loss_clip": 0.01614144, + "auxiliary_loss_mlp": 0.00336154, + "balance_loss_clip": 1.30951631, + "balance_loss_mlp": 0.30525488, + "epoch": 0.2831805200661356, + "flos": 22054790171520.0, + "grad_norm": 6.8076593262392215, + "language_loss": 0.78794026, + "learning_rate": 3.3644254952748103e-06, + "loss": 0.8074432, + "num_input_tokens_seen": 101676225, + "router_z_loss_clip": 3.04296875, + "router_z_loss_mlp": 0.30883789, + "step": 4710, + "time_per_iteration": 2.66361403465271 + }, + { + "auxiliary_loss_clip": 0.01638491, + "auxiliary_loss_mlp": 0.00341773, + "balance_loss_clip": 1.33143508, + "balance_loss_mlp": 0.3090862, + "epoch": 0.28324064331880355, + "flos": 22600668136320.0, + "grad_norm": 23.299886379806146, + "language_loss": 0.87041718, + "learning_rate": 3.364140713048579e-06, + "loss": 0.89021981, + "num_input_tokens_seen": 101693710, + "router_z_loss_clip": 3.07226562, + "router_z_loss_mlp": 0.32666016, + "step": 4711, + "time_per_iteration": 2.670058488845825 + }, + { + "auxiliary_loss_clip": 0.0162569, + "auxiliary_loss_mlp": 0.00367168, + "balance_loss_clip": 1.31859636, + "balance_loss_mlp": 0.33498132, + "epoch": 0.2833007665714715, + "flos": 30404127968640.0, + "grad_norm": 9.591739854435675, + "language_loss": 0.7717135, + "learning_rate": 3.363855879093996e-06, + "loss": 0.79164207, + "num_input_tokens_seen": 101714010, + "router_z_loss_clip": 3.06640625, + "router_z_loss_mlp": 0.32177734, + "step": 4712, + "time_per_iteration": 2.804062604904175 + }, + { + "auxiliary_loss_clip": 0.01641787, + "auxiliary_loss_mlp": 0.00364611, + "balance_loss_clip": 1.33097219, + "balance_loss_mlp": 0.33283031, + "epoch": 0.2833608898241395, + "flos": 23549499849600.0, + "grad_norm": 80.04840149261338, + "language_loss": 0.89691377, + "learning_rate": 3.3635709934218605e-06, + "loss": 0.91697776, + "num_input_tokens_seen": 101732995, + "router_z_loss_clip": 3.11328125, + "router_z_loss_mlp": 0.31787109, + "step": 4713, + "time_per_iteration": 2.7151596546173096 + }, + { + "auxiliary_loss_clip": 0.01680615, + "auxiliary_loss_mlp": 0.00362322, + "balance_loss_clip": 1.35769987, + "balance_loss_mlp": 0.32853818, + "epoch": 0.28342101307680745, + "flos": 20266726118400.0, + "grad_norm": 25.37827047740509, + "language_loss": 0.83040041, + "learning_rate": 3.3632860560429766e-06, + "loss": 0.85082984, + "num_input_tokens_seen": 101751385, + "router_z_loss_clip": 3.23046875, + "router_z_loss_mlp": 0.33764648, + "step": 4714, + "time_per_iteration": 2.751180648803711 + }, + { + "auxiliary_loss_clip": 0.01644597, + "auxiliary_loss_mlp": 0.00359398, + "balance_loss_clip": 1.33058047, + "balance_loss_mlp": 0.32609135, + "epoch": 0.2834811363294754, + "flos": 30847050576000.0, + "grad_norm": 6.336650456278102, + "language_loss": 0.82404578, + "learning_rate": 3.3630010669681494e-06, + "loss": 0.84408569, + "num_input_tokens_seen": 101773825, + "router_z_loss_clip": 3.140625, + "router_z_loss_mlp": 0.33325195, + "step": 4715, + "time_per_iteration": 2.8691794872283936 + }, + { + "auxiliary_loss_clip": 0.0162932, + "auxiliary_loss_mlp": 0.0033, + "balance_loss_clip": 1.31813455, + "balance_loss_mlp": 0.29786116, + "epoch": 0.2835412595821434, + "flos": 22711021695360.0, + "grad_norm": 72.33621248752729, + "language_loss": 0.81234157, + "learning_rate": 3.3627160262081845e-06, + "loss": 0.83193475, + "num_input_tokens_seen": 101791920, + "router_z_loss_clip": 3.109375, + "router_z_loss_mlp": 0.32128906, + "step": 4716, + "time_per_iteration": 4.063138723373413 + }, + { + "auxiliary_loss_clip": 0.01637748, + "auxiliary_loss_mlp": 0.00370397, + "balance_loss_clip": 1.31979907, + "balance_loss_mlp": 0.33570671, + "epoch": 0.28360138283481134, + "flos": 18077719478400.0, + "grad_norm": 11.928890607737554, + "language_loss": 0.83308464, + "learning_rate": 3.3624309337738917e-06, + "loss": 0.8531661, + "num_input_tokens_seen": 101809515, + "router_z_loss_clip": 3.18164062, + "router_z_loss_mlp": 0.34692383, + "step": 4717, + "time_per_iteration": 2.6545255184173584 + }, + { + "auxiliary_loss_clip": 0.01646356, + "auxiliary_loss_mlp": 0.00408443, + "balance_loss_clip": 1.32293224, + "balance_loss_mlp": 0.3754932, + "epoch": 0.2836615060874793, + "flos": 17854785717120.0, + "grad_norm": 43.37671053489301, + "language_loss": 0.73054206, + "learning_rate": 3.3621457896760813e-06, + "loss": 0.75109005, + "num_input_tokens_seen": 101827735, + "router_z_loss_clip": 3.234375, + "router_z_loss_mlp": 0.3293457, + "step": 4718, + "time_per_iteration": 2.695160388946533 + }, + { + "auxiliary_loss_clip": 0.01622051, + "auxiliary_loss_mlp": 0.00376113, + "balance_loss_clip": 1.3099432, + "balance_loss_mlp": 0.34173262, + "epoch": 0.2837216293401473, + "flos": 25740302169600.0, + "grad_norm": 1.9613817921563024, + "language_loss": 0.79504067, + "learning_rate": 3.361860593925566e-06, + "loss": 0.81502229, + "num_input_tokens_seen": 101845970, + "router_z_loss_clip": 3.12109375, + "router_z_loss_mlp": 0.34375, + "step": 4719, + "time_per_iteration": 2.7974069118499756 + }, + { + "auxiliary_loss_clip": 0.01660217, + "auxiliary_loss_mlp": 0.00373299, + "balance_loss_clip": 1.33689141, + "balance_loss_mlp": 0.34256691, + "epoch": 0.2837817525928153, + "flos": 20923532259840.0, + "grad_norm": 33.921076365597564, + "language_loss": 0.85820121, + "learning_rate": 3.3615753465331605e-06, + "loss": 0.87853634, + "num_input_tokens_seen": 101865040, + "router_z_loss_clip": 3.234375, + "router_z_loss_mlp": 0.30761719, + "step": 4720, + "time_per_iteration": 2.6898224353790283 + }, + { + "auxiliary_loss_clip": 0.0166991, + "auxiliary_loss_mlp": 0.00424739, + "balance_loss_clip": 1.34136808, + "balance_loss_mlp": 0.3868542, + "epoch": 0.28384187584548326, + "flos": 18916700423040.0, + "grad_norm": 27.639367740142806, + "language_loss": 0.86362374, + "learning_rate": 3.3612900475096817e-06, + "loss": 0.88457024, + "num_input_tokens_seen": 101883735, + "router_z_loss_clip": 3.28710938, + "router_z_loss_mlp": 0.37866211, + "step": 4721, + "time_per_iteration": 2.6308140754699707 + }, + { + "auxiliary_loss_clip": 0.01652298, + "auxiliary_loss_mlp": 0.00432028, + "balance_loss_clip": 1.3315804, + "balance_loss_mlp": 0.39731473, + "epoch": 0.2839019990981512, + "flos": 27343964776320.0, + "grad_norm": 5.675479148334503, + "language_loss": 0.8884697, + "learning_rate": 3.3610046968659474e-06, + "loss": 0.90931296, + "num_input_tokens_seen": 101903025, + "router_z_loss_clip": 3.20507812, + "router_z_loss_mlp": 0.34716797, + "step": 4722, + "time_per_iteration": 2.665184736251831 + }, + { + "auxiliary_loss_clip": 0.0165571, + "auxiliary_loss_mlp": 0.00418037, + "balance_loss_clip": 1.32827926, + "balance_loss_mlp": 0.38329929, + "epoch": 0.2839621223508192, + "flos": 18114312458880.0, + "grad_norm": 10.777808646051835, + "language_loss": 0.76082671, + "learning_rate": 3.3607192946127785e-06, + "loss": 0.78156412, + "num_input_tokens_seen": 101922255, + "router_z_loss_clip": 3.2734375, + "router_z_loss_mlp": 0.34741211, + "step": 4723, + "time_per_iteration": 2.6743991374969482 + }, + { + "auxiliary_loss_clip": 0.01649796, + "auxiliary_loss_mlp": 0.00423899, + "balance_loss_clip": 1.32952523, + "balance_loss_mlp": 0.39094913, + "epoch": 0.28402224560348716, + "flos": 26358360514560.0, + "grad_norm": 36.89805844279536, + "language_loss": 0.83698797, + "learning_rate": 3.360433840760998e-06, + "loss": 0.85772491, + "num_input_tokens_seen": 101943100, + "router_z_loss_clip": 3.20117188, + "router_z_loss_mlp": 0.3293457, + "step": 4724, + "time_per_iteration": 2.641019105911255 + }, + { + "auxiliary_loss_clip": 0.01681194, + "auxiliary_loss_mlp": 0.00449209, + "balance_loss_clip": 1.34770525, + "balance_loss_mlp": 0.41475749, + "epoch": 0.2840823688561551, + "flos": 24060795995520.0, + "grad_norm": 2.2576141937711496, + "language_loss": 0.97874498, + "learning_rate": 3.36014833532143e-06, + "loss": 1.000049, + "num_input_tokens_seen": 101963160, + "router_z_loss_clip": 3.33789062, + "router_z_loss_mlp": 0.34448242, + "step": 4725, + "time_per_iteration": 2.6407768726348877 + }, + { + "auxiliary_loss_clip": 0.01684952, + "auxiliary_loss_mlp": 0.00478397, + "balance_loss_clip": 1.34878635, + "balance_loss_mlp": 0.43979704, + "epoch": 0.2841424921088231, + "flos": 29459821368960.0, + "grad_norm": 4.311832322493749, + "language_loss": 0.92909765, + "learning_rate": 3.3598627783049e-06, + "loss": 0.95073116, + "num_input_tokens_seen": 101984300, + "router_z_loss_clip": 3.3671875, + "router_z_loss_mlp": 0.38598633, + "step": 4726, + "time_per_iteration": 2.685722589492798 + }, + { + "auxiliary_loss_clip": 0.01689871, + "auxiliary_loss_mlp": 0.00447824, + "balance_loss_clip": 1.35041809, + "balance_loss_mlp": 0.41096491, + "epoch": 0.28420261536149105, + "flos": 48100367053440.0, + "grad_norm": 70.1651328154248, + "language_loss": 0.84216112, + "learning_rate": 3.359577169722238e-06, + "loss": 0.86353803, + "num_input_tokens_seen": 102005765, + "router_z_loss_clip": 3.390625, + "router_z_loss_mlp": 0.36889648, + "step": 4727, + "time_per_iteration": 2.8862416744232178 + }, + { + "auxiliary_loss_clip": 0.01678133, + "auxiliary_loss_mlp": 0.00462783, + "balance_loss_clip": 1.35101616, + "balance_loss_mlp": 0.42613822, + "epoch": 0.284262738614159, + "flos": 25666146541440.0, + "grad_norm": 57.33930053976764, + "language_loss": 0.74120742, + "learning_rate": 3.3592915095842733e-06, + "loss": 0.76261657, + "num_input_tokens_seen": 102022755, + "router_z_loss_clip": 3.27539062, + "router_z_loss_mlp": 0.36645508, + "step": 4728, + "time_per_iteration": 2.655653476715088 + }, + { + "auxiliary_loss_clip": 0.01679475, + "auxiliary_loss_mlp": 0.00440545, + "balance_loss_clip": 1.34363747, + "balance_loss_mlp": 0.40826344, + "epoch": 0.284322861866827, + "flos": 19718980646400.0, + "grad_norm": 16.548196432894038, + "language_loss": 0.82207078, + "learning_rate": 3.3590057979018386e-06, + "loss": 0.84327096, + "num_input_tokens_seen": 102041850, + "router_z_loss_clip": 3.36132812, + "router_z_loss_mlp": 0.32299805, + "step": 4729, + "time_per_iteration": 2.640824317932129 + }, + { + "auxiliary_loss_clip": 0.01685122, + "auxiliary_loss_mlp": 0.0047972, + "balance_loss_clip": 1.35071135, + "balance_loss_mlp": 0.44264546, + "epoch": 0.28438298511949495, + "flos": 23915250086400.0, + "grad_norm": 33.03200505205498, + "language_loss": 0.73445129, + "learning_rate": 3.3587200346857674e-06, + "loss": 0.7560997, + "num_input_tokens_seen": 102059500, + "router_z_loss_clip": 3.34570312, + "router_z_loss_mlp": 0.37084961, + "step": 4730, + "time_per_iteration": 2.648637294769287 + }, + { + "auxiliary_loss_clip": 0.01719756, + "auxiliary_loss_mlp": 0.00551905, + "balance_loss_clip": 1.37237728, + "balance_loss_mlp": 0.50910872, + "epoch": 0.2844431083721629, + "flos": 26067340523520.0, + "grad_norm": 52.62437878941061, + "language_loss": 0.81529248, + "learning_rate": 3.3584342199468965e-06, + "loss": 0.83800906, + "num_input_tokens_seen": 102080460, + "router_z_loss_clip": 3.4765625, + "router_z_loss_mlp": 0.42797852, + "step": 4731, + "time_per_iteration": 2.7473647594451904 + }, + { + "auxiliary_loss_clip": 0.01688423, + "auxiliary_loss_mlp": 0.00471841, + "balance_loss_clip": 1.35306919, + "balance_loss_mlp": 0.43440884, + "epoch": 0.2845032316248309, + "flos": 25810435474560.0, + "grad_norm": 2.3797141618486024, + "language_loss": 0.89230806, + "learning_rate": 3.3581483536960638e-06, + "loss": 0.91391069, + "num_input_tokens_seen": 102100950, + "router_z_loss_clip": 3.35351562, + "router_z_loss_mlp": 0.37451172, + "step": 4732, + "time_per_iteration": 2.7198784351348877 + }, + { + "auxiliary_loss_clip": 0.01677734, + "auxiliary_loss_mlp": 0.00517084, + "balance_loss_clip": 1.34523416, + "balance_loss_mlp": 0.47724438, + "epoch": 0.2845633548774989, + "flos": 19823192979840.0, + "grad_norm": 556.4611932214924, + "language_loss": 0.85835612, + "learning_rate": 3.357862435944109e-06, + "loss": 0.88030434, + "num_input_tokens_seen": 102119345, + "router_z_loss_clip": 3.32617188, + "router_z_loss_mlp": 0.39868164, + "step": 4733, + "time_per_iteration": 2.6868326663970947 + }, + { + "auxiliary_loss_clip": 0.01703913, + "auxiliary_loss_mlp": 0.00526034, + "balance_loss_clip": 1.3662113, + "balance_loss_mlp": 0.48624146, + "epoch": 0.28462347813016686, + "flos": 23182815859200.0, + "grad_norm": 6.179714114532041, + "language_loss": 0.79664147, + "learning_rate": 3.357576466701875e-06, + "loss": 0.81894088, + "num_input_tokens_seen": 102139050, + "router_z_loss_clip": 3.37890625, + "router_z_loss_mlp": 0.39819336, + "step": 4734, + "time_per_iteration": 2.728217601776123 + }, + { + "auxiliary_loss_clip": 0.01663983, + "auxiliary_loss_mlp": 0.0045438, + "balance_loss_clip": 1.33523583, + "balance_loss_mlp": 0.41804507, + "epoch": 0.2846836013828348, + "flos": 18660477732480.0, + "grad_norm": 3.4907970988250683, + "language_loss": 0.80195856, + "learning_rate": 3.3572904459802056e-06, + "loss": 0.82314211, + "num_input_tokens_seen": 102157935, + "router_z_loss_clip": 3.28710938, + "router_z_loss_mlp": 0.36328125, + "step": 4735, + "time_per_iteration": 2.6204795837402344 + }, + { + "auxiliary_loss_clip": 0.01673121, + "auxiliary_loss_mlp": 0.00452085, + "balance_loss_clip": 1.33877921, + "balance_loss_mlp": 0.41805047, + "epoch": 0.2847437246355028, + "flos": 14173511523840.0, + "grad_norm": 5.277331044899071, + "language_loss": 0.85466617, + "learning_rate": 3.357004373789946e-06, + "loss": 0.87591827, + "num_input_tokens_seen": 102175325, + "router_z_loss_clip": 3.33984375, + "router_z_loss_mlp": 0.34020996, + "step": 4736, + "time_per_iteration": 2.65429949760437 + }, + { + "auxiliary_loss_clip": 0.01690696, + "auxiliary_loss_mlp": 0.0042369, + "balance_loss_clip": 1.35440588, + "balance_loss_mlp": 0.38883293, + "epoch": 0.28480384788817076, + "flos": 29278364837760.0, + "grad_norm": 52.409289487851055, + "language_loss": 0.6969009, + "learning_rate": 3.3567182501419453e-06, + "loss": 0.71804476, + "num_input_tokens_seen": 102196625, + "router_z_loss_clip": 3.36328125, + "router_z_loss_mlp": 0.34851074, + "step": 4737, + "time_per_iteration": 2.736737012863159 + }, + { + "auxiliary_loss_clip": 0.016614, + "auxiliary_loss_mlp": 0.00417948, + "balance_loss_clip": 1.32993078, + "balance_loss_mlp": 0.38447374, + "epoch": 0.2848639711408387, + "flos": 22601314581120.0, + "grad_norm": 10.03821632265725, + "language_loss": 0.91728246, + "learning_rate": 3.356432075047052e-06, + "loss": 0.9380759, + "num_input_tokens_seen": 102214975, + "router_z_loss_clip": 3.31445312, + "router_z_loss_mlp": 0.3347168, + "step": 4738, + "time_per_iteration": 2.6771883964538574 + }, + { + "auxiliary_loss_clip": 0.016737, + "auxiliary_loss_mlp": 0.00409548, + "balance_loss_clip": 1.34243917, + "balance_loss_mlp": 0.37745661, + "epoch": 0.2849240943935067, + "flos": 17599460866560.0, + "grad_norm": 5.083780003124386, + "language_loss": 0.97753686, + "learning_rate": 3.356145848516118e-06, + "loss": 0.9983694, + "num_input_tokens_seen": 102231885, + "router_z_loss_clip": 3.31640625, + "router_z_loss_mlp": 0.32092285, + "step": 4739, + "time_per_iteration": 2.621105909347534 + }, + { + "auxiliary_loss_clip": 0.0163544, + "auxiliary_loss_mlp": 0.00398734, + "balance_loss_clip": 1.31303287, + "balance_loss_mlp": 0.36750126, + "epoch": 0.28498421764617465, + "flos": 24862573428480.0, + "grad_norm": 2.004535449589495, + "language_loss": 0.76820028, + "learning_rate": 3.355859570559998e-06, + "loss": 0.78854197, + "num_input_tokens_seen": 102252725, + "router_z_loss_clip": 3.22460938, + "router_z_loss_mlp": 0.3125, + "step": 4740, + "time_per_iteration": 2.7275452613830566 + }, + { + "auxiliary_loss_clip": 0.01667668, + "auxiliary_loss_mlp": 0.00400953, + "balance_loss_clip": 1.3356818, + "balance_loss_mlp": 0.36690688, + "epoch": 0.2850443408988426, + "flos": 22782555630720.0, + "grad_norm": 9.149122077533956, + "language_loss": 0.84876359, + "learning_rate": 3.3555732411895477e-06, + "loss": 0.86944985, + "num_input_tokens_seen": 102271730, + "router_z_loss_clip": 3.31835938, + "router_z_loss_mlp": 0.34082031, + "step": 4741, + "time_per_iteration": 2.686898946762085 + }, + { + "auxiliary_loss_clip": 0.01652195, + "auxiliary_loss_mlp": 0.00386765, + "balance_loss_clip": 1.31984651, + "balance_loss_mlp": 0.35548449, + "epoch": 0.2851044641515106, + "flos": 18844053166080.0, + "grad_norm": 115.25523814839043, + "language_loss": 0.83979428, + "learning_rate": 3.3552868604156235e-06, + "loss": 0.86018389, + "num_input_tokens_seen": 102291325, + "router_z_loss_clip": 3.32226562, + "router_z_loss_mlp": 0.31286621, + "step": 4742, + "time_per_iteration": 2.829045057296753 + }, + { + "auxiliary_loss_clip": 0.01642266, + "auxiliary_loss_mlp": 0.00397029, + "balance_loss_clip": 1.31850743, + "balance_loss_mlp": 0.36319703, + "epoch": 0.28516458740417855, + "flos": 18880502492160.0, + "grad_norm": 4.651365881113107, + "language_loss": 0.6352371, + "learning_rate": 3.355000428249086e-06, + "loss": 0.65563011, + "num_input_tokens_seen": 102309000, + "router_z_loss_clip": 3.23242188, + "router_z_loss_mlp": 0.33837891, + "step": 4743, + "time_per_iteration": 2.692364454269409 + }, + { + "auxiliary_loss_clip": 0.01673313, + "auxiliary_loss_mlp": 0.00366421, + "balance_loss_clip": 1.35053217, + "balance_loss_mlp": 0.33545026, + "epoch": 0.2852247106568465, + "flos": 25299821687040.0, + "grad_norm": 3.7721856710939146, + "language_loss": 0.80997211, + "learning_rate": 3.354713944700797e-06, + "loss": 0.83036941, + "num_input_tokens_seen": 102329240, + "router_z_loss_clip": 3.23046875, + "router_z_loss_mlp": 0.30944824, + "step": 4744, + "time_per_iteration": 4.150479793548584 + }, + { + "auxiliary_loss_clip": 0.01625699, + "auxiliary_loss_mlp": 0.0035636, + "balance_loss_clip": 1.30040932, + "balance_loss_mlp": 0.32542509, + "epoch": 0.2852848339095145, + "flos": 11655383541120.0, + "grad_norm": 2.5885334714699337, + "language_loss": 0.84211826, + "learning_rate": 3.3544274097816185e-06, + "loss": 0.86193883, + "num_input_tokens_seen": 102344440, + "router_z_loss_clip": 3.25, + "router_z_loss_mlp": 0.30969238, + "step": 4745, + "time_per_iteration": 2.6903557777404785 + }, + { + "auxiliary_loss_clip": 0.01660049, + "auxiliary_loss_mlp": 0.00353006, + "balance_loss_clip": 1.33082187, + "balance_loss_mlp": 0.32348967, + "epoch": 0.2853449571621825, + "flos": 12933228856320.0, + "grad_norm": 458.5000759223159, + "language_loss": 0.88189304, + "learning_rate": 3.3541408235024173e-06, + "loss": 0.90202355, + "num_input_tokens_seen": 102360985, + "router_z_loss_clip": 3.29101562, + "router_z_loss_mlp": 0.29516602, + "step": 4746, + "time_per_iteration": 2.592804431915283 + }, + { + "auxiliary_loss_clip": 0.01682021, + "auxiliary_loss_mlp": 0.00403192, + "balance_loss_clip": 1.34150887, + "balance_loss_mlp": 0.36938453, + "epoch": 0.28540508041485046, + "flos": 20010575255040.0, + "grad_norm": 11.869548954391876, + "language_loss": 0.87527508, + "learning_rate": 3.3538541858740604e-06, + "loss": 0.89612716, + "num_input_tokens_seen": 102380320, + "router_z_loss_clip": 3.40429688, + "router_z_loss_mlp": 0.33813477, + "step": 4747, + "time_per_iteration": 4.056623220443726 + }, + { + "auxiliary_loss_clip": 0.01443868, + "auxiliary_loss_mlp": 0.00086535, + "balance_loss_clip": 1.21674323, + "balance_loss_mlp": 0.07747462, + "epoch": 0.28546520366751843, + "flos": 68139349966080.0, + "grad_norm": 0.7553747870637408, + "language_loss": 0.59956741, + "learning_rate": 3.3535674969074173e-06, + "loss": 0.61487144, + "num_input_tokens_seen": 102439140, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.09082031, + "step": 4748, + "time_per_iteration": 3.142232894897461 + }, + { + "auxiliary_loss_clip": 0.01641018, + "auxiliary_loss_mlp": 0.0035461, + "balance_loss_clip": 1.31471443, + "balance_loss_mlp": 0.32471216, + "epoch": 0.2855253269201864, + "flos": 13251540205440.0, + "grad_norm": 8.089513093174133, + "language_loss": 0.90290046, + "learning_rate": 3.3532807566133592e-06, + "loss": 0.92285681, + "num_input_tokens_seen": 102450990, + "router_z_loss_clip": 3.26367188, + "router_z_loss_mlp": 0.29882812, + "step": 4749, + "time_per_iteration": 2.6235690116882324 + }, + { + "auxiliary_loss_clip": 0.01657807, + "auxiliary_loss_mlp": 0.00340109, + "balance_loss_clip": 1.32656264, + "balance_loss_mlp": 0.30985385, + "epoch": 0.28558545017285436, + "flos": 28620876337920.0, + "grad_norm": 3.694778315074223, + "language_loss": 0.76693666, + "learning_rate": 3.3529939650027587e-06, + "loss": 0.78691578, + "num_input_tokens_seen": 102471820, + "router_z_loss_clip": 3.30859375, + "router_z_loss_mlp": 0.30236816, + "step": 4750, + "time_per_iteration": 4.106104373931885 + }, + { + "auxiliary_loss_clip": 0.01656803, + "auxiliary_loss_mlp": 0.0033863, + "balance_loss_clip": 1.32879901, + "balance_loss_mlp": 0.30954331, + "epoch": 0.2856455734255223, + "flos": 34130470752000.0, + "grad_norm": 8.789912390883961, + "language_loss": 0.86551785, + "learning_rate": 3.3527071220864917e-06, + "loss": 0.88547218, + "num_input_tokens_seen": 102492625, + "router_z_loss_clip": 3.27929688, + "router_z_loss_mlp": 0.29077148, + "step": 4751, + "time_per_iteration": 2.7182555198669434 + }, + { + "auxiliary_loss_clip": 0.01641722, + "auxiliary_loss_mlp": 0.00342539, + "balance_loss_clip": 1.31488681, + "balance_loss_mlp": 0.31160438, + "epoch": 0.2857056966781903, + "flos": 39786149779200.0, + "grad_norm": 9.624447152484379, + "language_loss": 0.86576116, + "learning_rate": 3.3524202278754353e-06, + "loss": 0.88560379, + "num_input_tokens_seen": 102514145, + "router_z_loss_clip": 3.26367188, + "router_z_loss_mlp": 0.3092041, + "step": 4752, + "time_per_iteration": 2.8040924072265625 + }, + { + "auxiliary_loss_clip": 0.01618184, + "auxiliary_loss_mlp": 0.00328896, + "balance_loss_clip": 1.30217719, + "balance_loss_mlp": 0.29768687, + "epoch": 0.28576581993085826, + "flos": 21872292145920.0, + "grad_norm": 30.008528234611813, + "language_loss": 0.84944707, + "learning_rate": 3.3521332823804676e-06, + "loss": 0.86891782, + "num_input_tokens_seen": 102532365, + "router_z_loss_clip": 3.15820312, + "router_z_loss_mlp": 0.31225586, + "step": 4753, + "time_per_iteration": 2.666517496109009 + }, + { + "auxiliary_loss_clip": 0.01670862, + "auxiliary_loss_mlp": 0.0035002, + "balance_loss_clip": 1.33373332, + "balance_loss_mlp": 0.3175593, + "epoch": 0.2858259431835262, + "flos": 19091656592640.0, + "grad_norm": 12.196293430503038, + "language_loss": 0.97219503, + "learning_rate": 3.3518462856124704e-06, + "loss": 0.99240381, + "num_input_tokens_seen": 102548425, + "router_z_loss_clip": 3.37304688, + "router_z_loss_mlp": 0.32434082, + "step": 4754, + "time_per_iteration": 2.628031015396118 + }, + { + "auxiliary_loss_clip": 0.01650731, + "auxiliary_loss_mlp": 0.00277255, + "balance_loss_clip": 1.32245409, + "balance_loss_mlp": 0.25151801, + "epoch": 0.2858860664361942, + "flos": 20334309557760.0, + "grad_norm": 11.729621378802253, + "language_loss": 0.86892104, + "learning_rate": 3.3515592375823267e-06, + "loss": 0.88820088, + "num_input_tokens_seen": 102566370, + "router_z_loss_clip": 3.28515625, + "router_z_loss_mlp": 0.25744629, + "step": 4755, + "time_per_iteration": 2.665055990219116 + }, + { + "auxiliary_loss_clip": 0.01625849, + "auxiliary_loss_mlp": 0.00317417, + "balance_loss_clip": 1.30061221, + "balance_loss_mlp": 0.28875953, + "epoch": 0.28594618968886215, + "flos": 24461738582400.0, + "grad_norm": 2.9883454009622588, + "language_loss": 0.89202553, + "learning_rate": 3.351272138300922e-06, + "loss": 0.91145819, + "num_input_tokens_seen": 102588715, + "router_z_loss_clip": 3.25195312, + "router_z_loss_mlp": 0.28686523, + "step": 4756, + "time_per_iteration": 2.7143256664276123 + }, + { + "auxiliary_loss_clip": 0.01473114, + "auxiliary_loss_mlp": 0.00142423, + "balance_loss_clip": 1.22799516, + "balance_loss_mlp": 0.13026337, + "epoch": 0.2860063129415301, + "flos": 71652850709760.0, + "grad_norm": 0.852295904037903, + "language_loss": 0.60485053, + "learning_rate": 3.350984987779142e-06, + "loss": 0.62100595, + "num_input_tokens_seen": 102656715, + "router_z_loss_clip": 2.453125, + "router_z_loss_mlp": 0.12158203, + "step": 4757, + "time_per_iteration": 3.2938787937164307 + }, + { + "auxiliary_loss_clip": 0.01680445, + "auxiliary_loss_mlp": 0.00310904, + "balance_loss_clip": 1.3455708, + "balance_loss_mlp": 0.28269893, + "epoch": 0.2860664361941981, + "flos": 20558679863040.0, + "grad_norm": 4.1818602296924325, + "language_loss": 0.7452535, + "learning_rate": 3.3506977860278756e-06, + "loss": 0.765167, + "num_input_tokens_seen": 102676545, + "router_z_loss_clip": 3.34765625, + "router_z_loss_mlp": 0.28210449, + "step": 4758, + "time_per_iteration": 4.044320344924927 + }, + { + "auxiliary_loss_clip": 0.01659889, + "auxiliary_loss_mlp": 0.00309473, + "balance_loss_clip": 1.32836509, + "balance_loss_mlp": 0.28055304, + "epoch": 0.2861265594468661, + "flos": 35996389534080.0, + "grad_norm": 1.3786935989320641, + "language_loss": 0.68704075, + "learning_rate": 3.3504105330580143e-06, + "loss": 0.70673436, + "num_input_tokens_seen": 102702875, + "router_z_loss_clip": 3.31445312, + "router_z_loss_mlp": 0.28930664, + "step": 4759, + "time_per_iteration": 2.7863552570343018 + }, + { + "auxiliary_loss_clip": 0.01657324, + "auxiliary_loss_mlp": 0.00291857, + "balance_loss_clip": 1.32922268, + "balance_loss_mlp": 0.26187578, + "epoch": 0.28618668269953407, + "flos": 20047419630720.0, + "grad_norm": 5.202456839164399, + "language_loss": 0.80678689, + "learning_rate": 3.3501232288804496e-06, + "loss": 0.82627875, + "num_input_tokens_seen": 102723160, + "router_z_loss_clip": 3.27734375, + "router_z_loss_mlp": 0.29992676, + "step": 4760, + "time_per_iteration": 2.679760217666626 + }, + { + "auxiliary_loss_clip": 0.01632986, + "auxiliary_loss_mlp": 0.00281558, + "balance_loss_clip": 1.31194329, + "balance_loss_mlp": 0.25424671, + "epoch": 0.28624680595220203, + "flos": 24971849579520.0, + "grad_norm": 23.984817874841426, + "language_loss": 0.78277826, + "learning_rate": 3.3498358735060773e-06, + "loss": 0.80192375, + "num_input_tokens_seen": 102743855, + "router_z_loss_clip": 3.2109375, + "router_z_loss_mlp": 0.27319336, + "step": 4761, + "time_per_iteration": 2.667949676513672 + }, + { + "auxiliary_loss_clip": 0.01675902, + "auxiliary_loss_mlp": 0.00328906, + "balance_loss_clip": 1.34229481, + "balance_loss_mlp": 0.29741102, + "epoch": 0.28630692920487, + "flos": 22492253911680.0, + "grad_norm": 9.823831461277473, + "language_loss": 0.81806457, + "learning_rate": 3.349548466945793e-06, + "loss": 0.83811271, + "num_input_tokens_seen": 102761370, + "router_z_loss_clip": 3.33789062, + "router_z_loss_mlp": 0.31469727, + "step": 4762, + "time_per_iteration": 2.6710431575775146 + }, + { + "auxiliary_loss_clip": 0.01652345, + "auxiliary_loss_mlp": 0.00350796, + "balance_loss_clip": 1.32453847, + "balance_loss_mlp": 0.31768003, + "epoch": 0.28636705245753796, + "flos": 21249888255360.0, + "grad_norm": 2.8241413257704373, + "language_loss": 0.80020863, + "learning_rate": 3.349261009210496e-06, + "loss": 0.82024002, + "num_input_tokens_seen": 102780885, + "router_z_loss_clip": 3.27539062, + "router_z_loss_mlp": 0.33105469, + "step": 4763, + "time_per_iteration": 2.647678852081299 + }, + { + "auxiliary_loss_clip": 0.01670702, + "auxiliary_loss_mlp": 0.00303097, + "balance_loss_clip": 1.33634388, + "balance_loss_mlp": 0.27138737, + "epoch": 0.28642717571020593, + "flos": 24095772864000.0, + "grad_norm": 6.731645448575013, + "language_loss": 0.84453988, + "learning_rate": 3.348973500311086e-06, + "loss": 0.86427784, + "num_input_tokens_seen": 102801000, + "router_z_loss_clip": 3.34375, + "router_z_loss_mlp": 0.31713867, + "step": 4764, + "time_per_iteration": 2.665630578994751 + }, + { + "auxiliary_loss_clip": 0.01687639, + "auxiliary_loss_mlp": 0.0030719, + "balance_loss_clip": 1.34561205, + "balance_loss_mlp": 0.27621984, + "epoch": 0.2864872989628739, + "flos": 22601386408320.0, + "grad_norm": 4.040443755256701, + "language_loss": 0.79863954, + "learning_rate": 3.348685940258466e-06, + "loss": 0.81858784, + "num_input_tokens_seen": 102820230, + "router_z_loss_clip": 3.41992188, + "router_z_loss_mlp": 0.30981445, + "step": 4765, + "time_per_iteration": 2.6523478031158447 + }, + { + "auxiliary_loss_clip": 0.01686934, + "auxiliary_loss_mlp": 0.00293848, + "balance_loss_clip": 1.34787929, + "balance_loss_mlp": 0.26218569, + "epoch": 0.28654742221554186, + "flos": 32745073138560.0, + "grad_norm": 6.051850583241571, + "language_loss": 0.817572, + "learning_rate": 3.3483983290635395e-06, + "loss": 0.83737987, + "num_input_tokens_seen": 102842670, + "router_z_loss_clip": 3.38867188, + "router_z_loss_mlp": 0.31665039, + "step": 4766, + "time_per_iteration": 2.80873441696167 + }, + { + "auxiliary_loss_clip": 0.01669734, + "auxiliary_loss_mlp": 0.00261962, + "balance_loss_clip": 1.33112037, + "balance_loss_mlp": 0.23361361, + "epoch": 0.2866075454682098, + "flos": 26981626331520.0, + "grad_norm": 1.6388542575784177, + "language_loss": 0.84339178, + "learning_rate": 3.348110666737214e-06, + "loss": 0.86270869, + "num_input_tokens_seen": 102864480, + "router_z_loss_clip": 3.3828125, + "router_z_loss_mlp": 0.28344727, + "step": 4767, + "time_per_iteration": 2.7050046920776367 + }, + { + "auxiliary_loss_clip": 0.01677774, + "auxiliary_loss_mlp": 0.00302355, + "balance_loss_clip": 1.34827065, + "balance_loss_mlp": 0.27071714, + "epoch": 0.2866676687208778, + "flos": 23253847004160.0, + "grad_norm": 3.895020177199508, + "language_loss": 0.6969856, + "learning_rate": 3.3478229532903956e-06, + "loss": 0.71678686, + "num_input_tokens_seen": 102883740, + "router_z_loss_clip": 3.296875, + "router_z_loss_mlp": 0.31616211, + "step": 4768, + "time_per_iteration": 2.6171796321868896 + }, + { + "auxiliary_loss_clip": 0.01686857, + "auxiliary_loss_mlp": 0.00310207, + "balance_loss_clip": 1.34253561, + "balance_loss_mlp": 0.27875975, + "epoch": 0.28672779197354575, + "flos": 21579727870080.0, + "grad_norm": 1.9284736349868663, + "language_loss": 0.77705914, + "learning_rate": 3.3475351887339967e-06, + "loss": 0.79702979, + "num_input_tokens_seen": 102902945, + "router_z_loss_clip": 3.44335938, + "router_z_loss_mlp": 0.3145752, + "step": 4769, + "time_per_iteration": 2.681192636489868 + }, + { + "auxiliary_loss_clip": 0.01679064, + "auxiliary_loss_mlp": 0.00294678, + "balance_loss_clip": 1.33691299, + "balance_loss_mlp": 0.26635385, + "epoch": 0.2867879152262137, + "flos": 19865568049920.0, + "grad_norm": 3.0359543880802358, + "language_loss": 0.81082606, + "learning_rate": 3.3472473730789288e-06, + "loss": 0.83056343, + "num_input_tokens_seen": 102922405, + "router_z_loss_clip": 3.421875, + "router_z_loss_mlp": 0.28344727, + "step": 4770, + "time_per_iteration": 2.648583173751831 + }, + { + "auxiliary_loss_clip": 0.01642054, + "auxiliary_loss_mlp": 0.00306987, + "balance_loss_clip": 1.31379163, + "balance_loss_mlp": 0.27711278, + "epoch": 0.2868480384788817, + "flos": 28213325648640.0, + "grad_norm": 40.99671387033889, + "language_loss": 0.73237789, + "learning_rate": 3.3469595063361045e-06, + "loss": 0.75186831, + "num_input_tokens_seen": 102938980, + "router_z_loss_clip": 3.27929688, + "router_z_loss_mlp": 0.29907227, + "step": 4771, + "time_per_iteration": 2.703432321548462 + }, + { + "auxiliary_loss_clip": 0.0145368, + "auxiliary_loss_mlp": 0.00115222, + "balance_loss_clip": 1.20026374, + "balance_loss_mlp": 0.10816433, + "epoch": 0.2869081617315497, + "flos": 65424286690560.0, + "grad_norm": 0.8233291929627559, + "language_loss": 0.56518191, + "learning_rate": 3.3466715885164414e-06, + "loss": 0.58087093, + "num_input_tokens_seen": 103000405, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.07080078, + "step": 4772, + "time_per_iteration": 3.078024387359619 + }, + { + "auxiliary_loss_clip": 0.01649319, + "auxiliary_loss_mlp": 0.00278669, + "balance_loss_clip": 1.31290674, + "balance_loss_mlp": 0.25058341, + "epoch": 0.28696828498421767, + "flos": 18660729127680.0, + "grad_norm": 9.987008598473734, + "language_loss": 0.92112017, + "learning_rate": 3.346383619630856e-06, + "loss": 0.9404, + "num_input_tokens_seen": 103017970, + "router_z_loss_clip": 3.36328125, + "router_z_loss_mlp": 0.28088379, + "step": 4773, + "time_per_iteration": 2.6300106048583984 + }, + { + "auxiliary_loss_clip": 0.01649684, + "auxiliary_loss_mlp": 0.00290662, + "balance_loss_clip": 1.30729628, + "balance_loss_mlp": 0.25952432, + "epoch": 0.28702840823688563, + "flos": 23659745667840.0, + "grad_norm": 50.01136461617431, + "language_loss": 0.8476274, + "learning_rate": 3.34609559969027e-06, + "loss": 0.8670308, + "num_input_tokens_seen": 103036385, + "router_z_loss_clip": 3.42382812, + "router_z_loss_mlp": 0.31152344, + "step": 4774, + "time_per_iteration": 2.6683661937713623 + }, + { + "auxiliary_loss_clip": 0.01674586, + "auxiliary_loss_mlp": 0.00311903, + "balance_loss_clip": 1.32749319, + "balance_loss_mlp": 0.28093201, + "epoch": 0.2870885314895536, + "flos": 13804744544640.0, + "grad_norm": 30.95612836902878, + "language_loss": 0.8029207, + "learning_rate": 3.3458075287056034e-06, + "loss": 0.82278562, + "num_input_tokens_seen": 103052170, + "router_z_loss_clip": 3.46875, + "router_z_loss_mlp": 0.30981445, + "step": 4775, + "time_per_iteration": 2.6364853382110596 + }, + { + "auxiliary_loss_clip": 0.01660686, + "auxiliary_loss_mlp": 0.00289553, + "balance_loss_clip": 1.31991279, + "balance_loss_mlp": 0.2611689, + "epoch": 0.28714865474222157, + "flos": 17786771314560.0, + "grad_norm": 2.006411866678454, + "language_loss": 0.93145108, + "learning_rate": 3.34551940668778e-06, + "loss": 0.95095354, + "num_input_tokens_seen": 103070510, + "router_z_loss_clip": 3.41015625, + "router_z_loss_mlp": 0.28405762, + "step": 4776, + "time_per_iteration": 2.6232988834381104 + }, + { + "auxiliary_loss_clip": 0.01672578, + "auxiliary_loss_mlp": 0.00266974, + "balance_loss_clip": 1.32477593, + "balance_loss_mlp": 0.23988923, + "epoch": 0.28720877799488953, + "flos": 15997486199040.0, + "grad_norm": 4.316749534407035, + "language_loss": 0.82511544, + "learning_rate": 3.345231233647726e-06, + "loss": 0.84451097, + "num_input_tokens_seen": 103089590, + "router_z_loss_clip": 3.4765625, + "router_z_loss_mlp": 0.27087402, + "step": 4777, + "time_per_iteration": 2.656352996826172 + }, + { + "auxiliary_loss_clip": 0.01708032, + "auxiliary_loss_mlp": 0.00306882, + "balance_loss_clip": 1.34554982, + "balance_loss_mlp": 0.27502948, + "epoch": 0.2872689012475575, + "flos": 20923137210240.0, + "grad_norm": 6.4693903740004375, + "language_loss": 0.88194549, + "learning_rate": 3.3449430095963696e-06, + "loss": 0.90209466, + "num_input_tokens_seen": 103109080, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 0.31884766, + "step": 4778, + "time_per_iteration": 2.6604461669921875 + }, + { + "auxiliary_loss_clip": 0.01673888, + "auxiliary_loss_mlp": 0.002766, + "balance_loss_clip": 1.3239857, + "balance_loss_mlp": 0.24793009, + "epoch": 0.28732902450022546, + "flos": 21325121291520.0, + "grad_norm": 5.011305570582617, + "language_loss": 0.80356604, + "learning_rate": 3.3446547345446386e-06, + "loss": 0.82307088, + "num_input_tokens_seen": 103127755, + "router_z_loss_clip": 3.49609375, + "router_z_loss_mlp": 0.28649902, + "step": 4779, + "time_per_iteration": 2.6423301696777344 + }, + { + "auxiliary_loss_clip": 0.01695165, + "auxiliary_loss_mlp": 0.00309672, + "balance_loss_clip": 1.3351326, + "balance_loss_mlp": 0.28025091, + "epoch": 0.2873891477528934, + "flos": 20850382212480.0, + "grad_norm": 12.225589067643705, + "language_loss": 0.83389133, + "learning_rate": 3.3443664085034656e-06, + "loss": 0.85393965, + "num_input_tokens_seen": 103147035, + "router_z_loss_clip": 3.6015625, + "router_z_loss_mlp": 0.29406738, + "step": 4780, + "time_per_iteration": 2.6619162559509277 + }, + { + "auxiliary_loss_clip": 0.01667874, + "auxiliary_loss_mlp": 0.00284101, + "balance_loss_clip": 1.31709576, + "balance_loss_mlp": 0.25581264, + "epoch": 0.2874492710055614, + "flos": 17420051410560.0, + "grad_norm": 4.249654516878094, + "language_loss": 0.86969936, + "learning_rate": 3.344078031483784e-06, + "loss": 0.88921916, + "num_input_tokens_seen": 103165410, + "router_z_loss_clip": 3.5078125, + "router_z_loss_mlp": 0.28259277, + "step": 4781, + "time_per_iteration": 2.648014783859253 + }, + { + "auxiliary_loss_clip": 0.01683341, + "auxiliary_loss_mlp": 0.00325633, + "balance_loss_clip": 1.32571673, + "balance_loss_mlp": 0.29363674, + "epoch": 0.28750939425822936, + "flos": 13406818700160.0, + "grad_norm": 5.974564792894227, + "language_loss": 0.92900705, + "learning_rate": 3.3437896034965283e-06, + "loss": 0.9490968, + "num_input_tokens_seen": 103183710, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 0.31982422, + "step": 4782, + "time_per_iteration": 2.689769744873047 + }, + { + "auxiliary_loss_clip": 0.01722175, + "auxiliary_loss_mlp": 0.0030027, + "balance_loss_clip": 1.35292161, + "balance_loss_mlp": 0.26903713, + "epoch": 0.2875695175108973, + "flos": 21870029589120.0, + "grad_norm": 70.12287953806398, + "language_loss": 0.77562463, + "learning_rate": 3.3435011245526357e-06, + "loss": 0.79584908, + "num_input_tokens_seen": 103203790, + "router_z_loss_clip": 3.69726562, + "router_z_loss_mlp": 0.31237793, + "step": 4783, + "time_per_iteration": 2.719494104385376 + }, + { + "auxiliary_loss_clip": 0.01661883, + "auxiliary_loss_mlp": 0.00324803, + "balance_loss_clip": 1.3119719, + "balance_loss_mlp": 0.29483342, + "epoch": 0.2876296407635653, + "flos": 26245457089920.0, + "grad_norm": 9.68682042705836, + "language_loss": 0.82716024, + "learning_rate": 3.343212594663047e-06, + "loss": 0.84702718, + "num_input_tokens_seen": 103223925, + "router_z_loss_clip": 3.50195312, + "router_z_loss_mlp": 0.29980469, + "step": 4784, + "time_per_iteration": 2.7448737621307373 + }, + { + "auxiliary_loss_clip": 0.01643667, + "auxiliary_loss_mlp": 0.00272484, + "balance_loss_clip": 1.29573107, + "balance_loss_mlp": 0.24430273, + "epoch": 0.28768976401623325, + "flos": 25373654092800.0, + "grad_norm": 2.3625077222097524, + "language_loss": 0.81009507, + "learning_rate": 3.3429240138387015e-06, + "loss": 0.82925659, + "num_input_tokens_seen": 103244760, + "router_z_loss_clip": 3.47851562, + "router_z_loss_mlp": 0.28186035, + "step": 4785, + "time_per_iteration": 2.6764068603515625 + }, + { + "auxiliary_loss_clip": 0.01641456, + "auxiliary_loss_mlp": 0.00243396, + "balance_loss_clip": 1.29124331, + "balance_loss_mlp": 0.21576294, + "epoch": 0.28774988726890127, + "flos": 30664372982400.0, + "grad_norm": 17.042018825673708, + "language_loss": 0.89909041, + "learning_rate": 3.3426353820905425e-06, + "loss": 0.91793895, + "num_input_tokens_seen": 103261995, + "router_z_loss_clip": 3.50195312, + "router_z_loss_mlp": 0.27648926, + "step": 4786, + "time_per_iteration": 4.065742015838623 + }, + { + "auxiliary_loss_clip": 0.01625866, + "auxiliary_loss_mlp": 0.00298757, + "balance_loss_clip": 1.27702427, + "balance_loss_mlp": 0.26969364, + "epoch": 0.28781001052156924, + "flos": 20595452411520.0, + "grad_norm": 4.835435076941311, + "language_loss": 0.8515864, + "learning_rate": 3.342346699429516e-06, + "loss": 0.87083256, + "num_input_tokens_seen": 103279780, + "router_z_loss_clip": 3.48632812, + "router_z_loss_mlp": 0.29077148, + "step": 4787, + "time_per_iteration": 2.6337380409240723 + }, + { + "auxiliary_loss_clip": 0.01663861, + "auxiliary_loss_mlp": 0.00314687, + "balance_loss_clip": 1.30284643, + "balance_loss_mlp": 0.28505141, + "epoch": 0.2878701337742372, + "flos": 26542330997760.0, + "grad_norm": 10.311346404534666, + "language_loss": 0.90249598, + "learning_rate": 3.3420579658665677e-06, + "loss": 0.9222815, + "num_input_tokens_seen": 103300580, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 0.29626465, + "step": 4788, + "time_per_iteration": 2.813897132873535 + }, + { + "auxiliary_loss_clip": 0.01650028, + "auxiliary_loss_mlp": 0.00292405, + "balance_loss_clip": 1.2937634, + "balance_loss_mlp": 0.26329422, + "epoch": 0.28793025702690517, + "flos": 28146855530880.0, + "grad_norm": 45.1170240152689, + "language_loss": 0.81358898, + "learning_rate": 3.3417691814126468e-06, + "loss": 0.8330133, + "num_input_tokens_seen": 103320430, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 0.29125977, + "step": 4789, + "time_per_iteration": 4.286560773849487 + }, + { + "auxiliary_loss_clip": 0.01598901, + "auxiliary_loss_mlp": 0.00242069, + "balance_loss_clip": 1.26793861, + "balance_loss_mlp": 0.21626063, + "epoch": 0.28799038027957313, + "flos": 23805471144960.0, + "grad_norm": 9.920060553375782, + "language_loss": 0.90827483, + "learning_rate": 3.341480346078704e-06, + "loss": 0.92668462, + "num_input_tokens_seen": 103337695, + "router_z_loss_clip": 3.31054688, + "router_z_loss_mlp": 0.2578125, + "step": 4790, + "time_per_iteration": 2.6528830528259277 + }, + { + "auxiliary_loss_clip": 0.01634669, + "auxiliary_loss_mlp": 0.00302728, + "balance_loss_clip": 1.28570485, + "balance_loss_mlp": 0.27192438, + "epoch": 0.2880505035322411, + "flos": 22344122223360.0, + "grad_norm": 10.897009097718461, + "language_loss": 0.84922725, + "learning_rate": 3.3411914598756922e-06, + "loss": 0.8686012, + "num_input_tokens_seen": 103357010, + "router_z_loss_clip": 3.4921875, + "router_z_loss_mlp": 0.30810547, + "step": 4791, + "time_per_iteration": 2.6346404552459717 + }, + { + "auxiliary_loss_clip": 0.01622634, + "auxiliary_loss_mlp": 0.00290359, + "balance_loss_clip": 1.27290916, + "balance_loss_mlp": 0.25844651, + "epoch": 0.28811062678490906, + "flos": 18004246208640.0, + "grad_norm": 41.09127223671159, + "language_loss": 0.78944194, + "learning_rate": 3.3409025228145654e-06, + "loss": 0.80857182, + "num_input_tokens_seen": 103375600, + "router_z_loss_clip": 3.5, + "router_z_loss_mlp": 0.31896973, + "step": 4792, + "time_per_iteration": 4.0592734813690186 + }, + { + "auxiliary_loss_clip": 0.01607942, + "auxiliary_loss_mlp": 0.00276573, + "balance_loss_clip": 1.26671064, + "balance_loss_mlp": 0.25019157, + "epoch": 0.28817075003757703, + "flos": 22090880361600.0, + "grad_norm": 17.75419520868001, + "language_loss": 0.88692939, + "learning_rate": 3.3406135349062812e-06, + "loss": 0.90577453, + "num_input_tokens_seen": 103395225, + "router_z_loss_clip": 3.41601562, + "router_z_loss_mlp": 0.26391602, + "step": 4793, + "time_per_iteration": 2.6383016109466553 + }, + { + "auxiliary_loss_clip": 0.01612502, + "auxiliary_loss_mlp": 0.00281107, + "balance_loss_clip": 1.26984143, + "balance_loss_mlp": 0.25522679, + "epoch": 0.288230873290245, + "flos": 41683130847360.0, + "grad_norm": 12.159592890495626, + "language_loss": 0.82287252, + "learning_rate": 3.340324496161797e-06, + "loss": 0.84180856, + "num_input_tokens_seen": 103417245, + "router_z_loss_clip": 3.4296875, + "router_z_loss_mlp": 0.25891113, + "step": 4794, + "time_per_iteration": 2.9083666801452637 + }, + { + "auxiliary_loss_clip": 0.0160476, + "auxiliary_loss_mlp": 0.00281194, + "balance_loss_clip": 1.2637006, + "balance_loss_mlp": 0.25239331, + "epoch": 0.28829099654291296, + "flos": 18624423456000.0, + "grad_norm": 2.910094433452658, + "language_loss": 0.89856994, + "learning_rate": 3.340035406592074e-06, + "loss": 0.91742945, + "num_input_tokens_seen": 103435500, + "router_z_loss_clip": 3.41015625, + "router_z_loss_mlp": 0.2878418, + "step": 4795, + "time_per_iteration": 2.683037757873535 + }, + { + "auxiliary_loss_clip": 0.01585295, + "auxiliary_loss_mlp": 0.00240303, + "balance_loss_clip": 1.25398135, + "balance_loss_mlp": 0.21617499, + "epoch": 0.2883511197955809, + "flos": 24674832017280.0, + "grad_norm": 7.665158179075347, + "language_loss": 0.79001808, + "learning_rate": 3.339746266208074e-06, + "loss": 0.80827403, + "num_input_tokens_seen": 103451040, + "router_z_loss_clip": 3.30859375, + "router_z_loss_mlp": 0.24121094, + "step": 4796, + "time_per_iteration": 2.6798603534698486 + }, + { + "auxiliary_loss_clip": 0.01617628, + "auxiliary_loss_mlp": 0.00272055, + "balance_loss_clip": 1.27667665, + "balance_loss_mlp": 0.24232399, + "epoch": 0.2884112430482489, + "flos": 23112143850240.0, + "grad_norm": 2.1965696795318856, + "language_loss": 0.81033301, + "learning_rate": 3.3394570750207614e-06, + "loss": 0.82922983, + "num_input_tokens_seen": 103471330, + "router_z_loss_clip": 3.41015625, + "router_z_loss_mlp": 0.29699707, + "step": 4797, + "time_per_iteration": 2.668360710144043 + }, + { + "auxiliary_loss_clip": 0.01592366, + "auxiliary_loss_mlp": 0.00254092, + "balance_loss_clip": 1.25612426, + "balance_loss_mlp": 0.22525528, + "epoch": 0.28847136630091685, + "flos": 16873347432960.0, + "grad_norm": 73.94060012211794, + "language_loss": 0.81609476, + "learning_rate": 3.3391678330411017e-06, + "loss": 0.83455932, + "num_input_tokens_seen": 103488060, + "router_z_loss_clip": 3.36328125, + "router_z_loss_mlp": 0.28796387, + "step": 4798, + "time_per_iteration": 2.643634557723999 + }, + { + "auxiliary_loss_clip": 0.01637858, + "auxiliary_loss_mlp": 0.00263752, + "balance_loss_clip": 1.29387975, + "balance_loss_mlp": 0.23499903, + "epoch": 0.2885314895535849, + "flos": 25657527277440.0, + "grad_norm": 5.030926605102282, + "language_loss": 0.74981374, + "learning_rate": 3.3388785402800642e-06, + "loss": 0.76882982, + "num_input_tokens_seen": 103503600, + "router_z_loss_clip": 3.43945312, + "router_z_loss_mlp": 0.2878418, + "step": 4799, + "time_per_iteration": 2.6968932151794434 + }, + { + "auxiliary_loss_clip": 0.01610547, + "auxiliary_loss_mlp": 0.00289617, + "balance_loss_clip": 1.27320933, + "balance_loss_mlp": 0.2598151, + "epoch": 0.28859161280625284, + "flos": 21107251347840.0, + "grad_norm": 2.0686137486137146, + "language_loss": 0.89502317, + "learning_rate": 3.3385891967486178e-06, + "loss": 0.91402483, + "num_input_tokens_seen": 103524195, + "router_z_loss_clip": 3.37109375, + "router_z_loss_mlp": 0.2980957, + "step": 4800, + "time_per_iteration": 4.116274356842041 + }, + { + "auxiliary_loss_clip": 0.0158209, + "auxiliary_loss_mlp": 0.00236079, + "balance_loss_clip": 1.25681961, + "balance_loss_mlp": 0.21009117, + "epoch": 0.2886517360589208, + "flos": 26469540086400.0, + "grad_norm": 28.830285294923623, + "language_loss": 0.9635098, + "learning_rate": 3.3382998024577347e-06, + "loss": 0.98169154, + "num_input_tokens_seen": 103545235, + "router_z_loss_clip": 3.25195312, + "router_z_loss_mlp": 0.26000977, + "step": 4801, + "time_per_iteration": 2.696899890899658 + }, + { + "auxiliary_loss_clip": 0.01562938, + "auxiliary_loss_mlp": 0.00241659, + "balance_loss_clip": 1.24283099, + "balance_loss_mlp": 0.21581489, + "epoch": 0.28871185931158877, + "flos": 25265275781760.0, + "grad_norm": 82.82088692858193, + "language_loss": 0.80222648, + "learning_rate": 3.33801035741839e-06, + "loss": 0.82027245, + "num_input_tokens_seen": 103563305, + "router_z_loss_clip": 3.20117188, + "router_z_loss_mlp": 0.25817871, + "step": 4802, + "time_per_iteration": 2.6930692195892334 + }, + { + "auxiliary_loss_clip": 0.01397537, + "auxiliary_loss_mlp": 0.00055171, + "balance_loss_clip": 1.15106225, + "balance_loss_mlp": 0.0474466, + "epoch": 0.28877198256425674, + "flos": 66665431284480.0, + "grad_norm": 0.7872175937567806, + "language_loss": 0.62818098, + "learning_rate": 3.337720861641558e-06, + "loss": 0.64270806, + "num_input_tokens_seen": 103625025, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.07714844, + "step": 4803, + "time_per_iteration": 3.0930697917938232 + }, + { + "auxiliary_loss_clip": 0.01601592, + "auxiliary_loss_mlp": 0.00246875, + "balance_loss_clip": 1.27128816, + "balance_loss_mlp": 0.2200053, + "epoch": 0.2888321058169247, + "flos": 20303031790080.0, + "grad_norm": 23.64662922413243, + "language_loss": 0.79756731, + "learning_rate": 3.3374313151382165e-06, + "loss": 0.81605196, + "num_input_tokens_seen": 103644235, + "router_z_loss_clip": 3.296875, + "router_z_loss_mlp": 0.26855469, + "step": 4804, + "time_per_iteration": 2.6821911334991455 + }, + { + "auxiliary_loss_clip": 0.01589767, + "auxiliary_loss_mlp": 0.00241795, + "balance_loss_clip": 1.26114416, + "balance_loss_mlp": 0.21351878, + "epoch": 0.28889222906959267, + "flos": 25516721963520.0, + "grad_norm": 1.958763590811428, + "language_loss": 0.74934655, + "learning_rate": 3.337141717919346e-06, + "loss": 0.76766217, + "num_input_tokens_seen": 103664700, + "router_z_loss_clip": 3.2890625, + "router_z_loss_mlp": 0.28283691, + "step": 4805, + "time_per_iteration": 2.671834707260132 + }, + { + "auxiliary_loss_clip": 0.01602693, + "auxiliary_loss_mlp": 0.00236796, + "balance_loss_clip": 1.26799297, + "balance_loss_mlp": 0.20789948, + "epoch": 0.28895235232226063, + "flos": 32671312560000.0, + "grad_norm": 25.202732228764393, + "language_loss": 0.76788545, + "learning_rate": 3.3368520699959272e-06, + "loss": 0.78628039, + "num_input_tokens_seen": 103686595, + "router_z_loss_clip": 3.34570312, + "router_z_loss_mlp": 0.28881836, + "step": 4806, + "time_per_iteration": 2.727370262145996 + }, + { + "auxiliary_loss_clip": 0.01584635, + "auxiliary_loss_mlp": 0.00245318, + "balance_loss_clip": 1.26524901, + "balance_loss_mlp": 0.21708894, + "epoch": 0.2890124755749286, + "flos": 29714679342720.0, + "grad_norm": 1.6747753336095474, + "language_loss": 0.77108669, + "learning_rate": 3.3365623713789443e-06, + "loss": 0.78938627, + "num_input_tokens_seen": 103707525, + "router_z_loss_clip": 3.19140625, + "router_z_loss_mlp": 0.28222656, + "step": 4807, + "time_per_iteration": 2.6979117393493652 + }, + { + "auxiliary_loss_clip": 0.01569129, + "auxiliary_loss_mlp": 0.0026319, + "balance_loss_clip": 1.25403309, + "balance_loss_mlp": 0.23607016, + "epoch": 0.28907259882759656, + "flos": 22674464628480.0, + "grad_norm": 10.334349585692953, + "language_loss": 0.86822766, + "learning_rate": 3.336272622079382e-06, + "loss": 0.88655084, + "num_input_tokens_seen": 103727905, + "router_z_loss_clip": 3.15039062, + "router_z_loss_mlp": 0.2713623, + "step": 4808, + "time_per_iteration": 2.7061989307403564 + }, + { + "auxiliary_loss_clip": 0.01594575, + "auxiliary_loss_mlp": 0.00241956, + "balance_loss_clip": 1.27660847, + "balance_loss_mlp": 0.21257126, + "epoch": 0.2891327220802645, + "flos": 22566050403840.0, + "grad_norm": 10.08881012835152, + "language_loss": 0.84944999, + "learning_rate": 3.3359828221082276e-06, + "loss": 0.86781538, + "num_input_tokens_seen": 103748335, + "router_z_loss_clip": 3.17773438, + "router_z_loss_mlp": 0.2935791, + "step": 4809, + "time_per_iteration": 2.759411096572876 + }, + { + "auxiliary_loss_clip": 0.01595753, + "auxiliary_loss_mlp": 0.0026893, + "balance_loss_clip": 1.27048254, + "balance_loss_mlp": 0.23958036, + "epoch": 0.2891928453329325, + "flos": 21652806090240.0, + "grad_norm": 233.1375924690938, + "language_loss": 0.86557746, + "learning_rate": 3.3356929714764714e-06, + "loss": 0.8842243, + "num_input_tokens_seen": 103767020, + "router_z_loss_clip": 3.25390625, + "router_z_loss_mlp": 0.29370117, + "step": 4810, + "time_per_iteration": 2.636070966720581 + }, + { + "auxiliary_loss_clip": 0.01597768, + "auxiliary_loss_mlp": 0.00237317, + "balance_loss_clip": 1.28222716, + "balance_loss_mlp": 0.20887412, + "epoch": 0.28925296858560046, + "flos": 23222102359680.0, + "grad_norm": 132.10036904199978, + "language_loss": 0.82876122, + "learning_rate": 3.3354030701951032e-06, + "loss": 0.84711218, + "num_input_tokens_seen": 103786355, + "router_z_loss_clip": 3.15625, + "router_z_loss_mlp": 0.2845459, + "step": 4811, + "time_per_iteration": 2.636136293411255 + }, + { + "auxiliary_loss_clip": 0.01632703, + "auxiliary_loss_mlp": 0.00252371, + "balance_loss_clip": 1.30275273, + "balance_loss_mlp": 0.2211501, + "epoch": 0.2893130918382685, + "flos": 28621666437120.0, + "grad_norm": 17.481864445837573, + "language_loss": 0.83418995, + "learning_rate": 3.335113118275117e-06, + "loss": 0.8530407, + "num_input_tokens_seen": 103809345, + "router_z_loss_clip": 3.29492188, + "router_z_loss_mlp": 0.31237793, + "step": 4812, + "time_per_iteration": 2.680908441543579 + }, + { + "auxiliary_loss_clip": 0.01454647, + "auxiliary_loss_mlp": 0.00081054, + "balance_loss_clip": 1.23036754, + "balance_loss_mlp": 0.07433056, + "epoch": 0.28937321509093644, + "flos": 72301288982400.0, + "grad_norm": 0.8097895392727522, + "language_loss": 0.59659529, + "learning_rate": 3.3348231157275085e-06, + "loss": 0.6119523, + "num_input_tokens_seen": 103871180, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.06738281, + "step": 4813, + "time_per_iteration": 3.2664732933044434 + }, + { + "auxiliary_loss_clip": 0.0160387, + "auxiliary_loss_mlp": 0.00277395, + "balance_loss_clip": 1.29045105, + "balance_loss_mlp": 0.24831973, + "epoch": 0.2894333383436044, + "flos": 16216397637120.0, + "grad_norm": 2.363460538940302, + "language_loss": 0.90440464, + "learning_rate": 3.3345330625632725e-06, + "loss": 0.92321736, + "num_input_tokens_seen": 103889040, + "router_z_loss_clip": 3.1328125, + "router_z_loss_mlp": 0.29064941, + "step": 4814, + "time_per_iteration": 2.617704153060913 + }, + { + "auxiliary_loss_clip": 0.01586083, + "auxiliary_loss_mlp": 0.00288101, + "balance_loss_clip": 1.27895033, + "balance_loss_mlp": 0.2580964, + "epoch": 0.2894934615962724, + "flos": 24828278918400.0, + "grad_norm": 3.9419578685944634, + "language_loss": 0.80413449, + "learning_rate": 3.3342429587934094e-06, + "loss": 0.82287639, + "num_input_tokens_seen": 103910380, + "router_z_loss_clip": 3.0703125, + "router_z_loss_mlp": 0.30041504, + "step": 4815, + "time_per_iteration": 2.6765856742858887 + }, + { + "auxiliary_loss_clip": 0.01583924, + "auxiliary_loss_mlp": 0.00243495, + "balance_loss_clip": 1.27972627, + "balance_loss_mlp": 0.21606499, + "epoch": 0.28955358484894034, + "flos": 20449978329600.0, + "grad_norm": 1.917147202938252, + "language_loss": 0.77486467, + "learning_rate": 3.3339528044289198e-06, + "loss": 0.79313886, + "num_input_tokens_seen": 103929955, + "router_z_loss_clip": 3.04296875, + "router_z_loss_mlp": 0.27404785, + "step": 4816, + "time_per_iteration": 2.636842966079712 + }, + { + "auxiliary_loss_clip": 0.01625698, + "auxiliary_loss_mlp": 0.00322508, + "balance_loss_clip": 1.30101562, + "balance_loss_mlp": 0.28908166, + "epoch": 0.2896137081016083, + "flos": 22565188477440.0, + "grad_norm": 2.542180714341053, + "language_loss": 0.85157263, + "learning_rate": 3.3336625994808055e-06, + "loss": 0.87105465, + "num_input_tokens_seen": 103948020, + "router_z_loss_clip": 3.24609375, + "router_z_loss_mlp": 0.33398438, + "step": 4817, + "time_per_iteration": 2.732513427734375 + }, + { + "auxiliary_loss_clip": 0.01599643, + "auxiliary_loss_mlp": 0.00294656, + "balance_loss_clip": 1.28001785, + "balance_loss_mlp": 0.26299381, + "epoch": 0.28967383135427627, + "flos": 26687948734080.0, + "grad_norm": 3.3082428432894715, + "language_loss": 0.83311236, + "learning_rate": 3.3333723439600723e-06, + "loss": 0.85205537, + "num_input_tokens_seen": 103968740, + "router_z_loss_clip": 3.19140625, + "router_z_loss_mlp": 0.31665039, + "step": 4818, + "time_per_iteration": 2.73789119720459 + }, + { + "auxiliary_loss_clip": 0.01607485, + "auxiliary_loss_mlp": 0.00285724, + "balance_loss_clip": 1.29191935, + "balance_loss_mlp": 0.25291806, + "epoch": 0.28973395460694423, + "flos": 15558262692480.0, + "grad_norm": 2.724967322174943, + "language_loss": 0.87565136, + "learning_rate": 3.3330820378777263e-06, + "loss": 0.89458346, + "num_input_tokens_seen": 103986005, + "router_z_loss_clip": 3.15625, + "router_z_loss_mlp": 0.32800293, + "step": 4819, + "time_per_iteration": 2.6208438873291016 + }, + { + "auxiliary_loss_clip": 0.01608801, + "auxiliary_loss_mlp": 0.00307827, + "balance_loss_clip": 1.29608893, + "balance_loss_mlp": 0.27566403, + "epoch": 0.2897940778596122, + "flos": 18697465762560.0, + "grad_norm": 20.21590245976609, + "language_loss": 0.87885189, + "learning_rate": 3.332791681244776e-06, + "loss": 0.89801812, + "num_input_tokens_seen": 104005070, + "router_z_loss_clip": 3.12695312, + "router_z_loss_mlp": 0.32177734, + "step": 4820, + "time_per_iteration": 2.6680848598480225 + }, + { + "auxiliary_loss_clip": 0.01633195, + "auxiliary_loss_mlp": 0.0030273, + "balance_loss_clip": 1.30922461, + "balance_loss_mlp": 0.27218905, + "epoch": 0.28985420111228016, + "flos": 18770292587520.0, + "grad_norm": 30.81024535643987, + "language_loss": 0.80478859, + "learning_rate": 3.332501274072231e-06, + "loss": 0.82414788, + "num_input_tokens_seen": 104022945, + "router_z_loss_clip": 3.2421875, + "router_z_loss_mlp": 0.30541992, + "step": 4821, + "time_per_iteration": 2.652608871459961 + }, + { + "auxiliary_loss_clip": 0.01629125, + "auxiliary_loss_mlp": 0.00321887, + "balance_loss_clip": 1.30606699, + "balance_loss_mlp": 0.28775722, + "epoch": 0.28991432436494813, + "flos": 23069840607360.0, + "grad_norm": 1.692796420877047, + "language_loss": 0.78658867, + "learning_rate": 3.332210816371104e-06, + "loss": 0.80609882, + "num_input_tokens_seen": 104042080, + "router_z_loss_clip": 3.234375, + "router_z_loss_mlp": 0.34143066, + "step": 4822, + "time_per_iteration": 2.6353065967559814 + }, + { + "auxiliary_loss_clip": 0.01610408, + "auxiliary_loss_mlp": 0.00276937, + "balance_loss_clip": 1.29201293, + "balance_loss_mlp": 0.24720654, + "epoch": 0.2899744476176161, + "flos": 17603195880960.0, + "grad_norm": 30.53333565960886, + "language_loss": 0.72391796, + "learning_rate": 3.3319203081524102e-06, + "loss": 0.74279141, + "num_input_tokens_seen": 104060975, + "router_z_loss_clip": 3.18554688, + "router_z_loss_mlp": 0.29711914, + "step": 4823, + "time_per_iteration": 2.6165010929107666 + }, + { + "auxiliary_loss_clip": 0.01592048, + "auxiliary_loss_mlp": 0.00301793, + "balance_loss_clip": 1.28302622, + "balance_loss_mlp": 0.27182376, + "epoch": 0.29003457087028406, + "flos": 22309360836480.0, + "grad_norm": 2.2463158085212016, + "language_loss": 0.87091196, + "learning_rate": 3.331629749427164e-06, + "loss": 0.88985032, + "num_input_tokens_seen": 104081395, + "router_z_loss_clip": 3.09375, + "router_z_loss_mlp": 0.29980469, + "step": 4824, + "time_per_iteration": 2.6951851844787598 + }, + { + "auxiliary_loss_clip": 0.01634006, + "auxiliary_loss_mlp": 0.00296321, + "balance_loss_clip": 1.30878782, + "balance_loss_mlp": 0.26294214, + "epoch": 0.2900946941229521, + "flos": 21944975316480.0, + "grad_norm": 10.813071944621353, + "language_loss": 0.79662025, + "learning_rate": 3.331339140206385e-06, + "loss": 0.81592357, + "num_input_tokens_seen": 104099995, + "router_z_loss_clip": 3.25, + "router_z_loss_mlp": 0.33349609, + "step": 4825, + "time_per_iteration": 2.680131673812866 + }, + { + "auxiliary_loss_clip": 0.01619788, + "auxiliary_loss_mlp": 0.00318059, + "balance_loss_clip": 1.29726756, + "balance_loss_mlp": 0.28527653, + "epoch": 0.29015481737562004, + "flos": 17932173569280.0, + "grad_norm": 48.36906058988632, + "language_loss": 0.8257221, + "learning_rate": 3.331048480501092e-06, + "loss": 0.84510064, + "num_input_tokens_seen": 104118930, + "router_z_loss_clip": 3.22460938, + "router_z_loss_mlp": 0.328125, + "step": 4826, + "time_per_iteration": 2.6892335414886475 + }, + { + "auxiliary_loss_clip": 0.01624866, + "auxiliary_loss_mlp": 0.00318043, + "balance_loss_clip": 1.29952872, + "balance_loss_mlp": 0.28661966, + "epoch": 0.290214940628288, + "flos": 22783525297920.0, + "grad_norm": 3.1437969848418517, + "language_loss": 0.7661615, + "learning_rate": 3.3307577703223073e-06, + "loss": 0.78559059, + "num_input_tokens_seen": 104136940, + "router_z_loss_clip": 3.25390625, + "router_z_loss_mlp": 0.31445312, + "step": 4827, + "time_per_iteration": 2.7381391525268555 + }, + { + "auxiliary_loss_clip": 0.01613012, + "auxiliary_loss_mlp": 0.00307927, + "balance_loss_clip": 1.30014324, + "balance_loss_mlp": 0.2754786, + "epoch": 0.290275063880956, + "flos": 20006481104640.0, + "grad_norm": 1.7735862724061389, + "language_loss": 0.86312288, + "learning_rate": 3.3304670096810545e-06, + "loss": 0.88233227, + "num_input_tokens_seen": 104154280, + "router_z_loss_clip": 3.13085938, + "router_z_loss_mlp": 0.32470703, + "step": 4828, + "time_per_iteration": 4.106780767440796 + }, + { + "auxiliary_loss_clip": 0.01587902, + "auxiliary_loss_mlp": 0.00293388, + "balance_loss_clip": 1.28251517, + "balance_loss_mlp": 0.26270345, + "epoch": 0.29033518713362394, + "flos": 22053605022720.0, + "grad_norm": 4.176256488719725, + "language_loss": 0.85667294, + "learning_rate": 3.33017619858836e-06, + "loss": 0.87548578, + "num_input_tokens_seen": 104172605, + "router_z_loss_clip": 3.0546875, + "router_z_loss_mlp": 0.30688477, + "step": 4829, + "time_per_iteration": 2.622544288635254 + }, + { + "auxiliary_loss_clip": 0.01592658, + "auxiliary_loss_mlp": 0.00268923, + "balance_loss_clip": 1.28586483, + "balance_loss_mlp": 0.24142139, + "epoch": 0.2903953103862919, + "flos": 25630056351360.0, + "grad_norm": 2.0296812207273325, + "language_loss": 0.87396967, + "learning_rate": 3.329885337055249e-06, + "loss": 0.89258552, + "num_input_tokens_seen": 104194120, + "router_z_loss_clip": 3.0703125, + "router_z_loss_mlp": 0.27490234, + "step": 4830, + "time_per_iteration": 2.676800489425659 + }, + { + "auxiliary_loss_clip": 0.01627058, + "auxiliary_loss_mlp": 0.00329696, + "balance_loss_clip": 1.31066072, + "balance_loss_mlp": 0.29631752, + "epoch": 0.29045543363895987, + "flos": 16945851035520.0, + "grad_norm": 135.88901008841094, + "language_loss": 0.87789178, + "learning_rate": 3.3295944250927546e-06, + "loss": 0.89745939, + "num_input_tokens_seen": 104210875, + "router_z_loss_clip": 3.16210938, + "router_z_loss_mlp": 0.3338623, + "step": 4831, + "time_per_iteration": 4.213700771331787 + }, + { + "auxiliary_loss_clip": 0.0159734, + "auxiliary_loss_mlp": 0.00281121, + "balance_loss_clip": 1.2922405, + "balance_loss_mlp": 0.25046048, + "epoch": 0.29051555689162784, + "flos": 26395492199040.0, + "grad_norm": 10.8383879246354, + "language_loss": 0.80703014, + "learning_rate": 3.3293034627119055e-06, + "loss": 0.82581472, + "num_input_tokens_seen": 104229875, + "router_z_loss_clip": 3.04492188, + "router_z_loss_mlp": 0.30688477, + "step": 4832, + "time_per_iteration": 2.7080366611480713 + }, + { + "auxiliary_loss_clip": 0.01582505, + "auxiliary_loss_mlp": 0.00304957, + "balance_loss_clip": 1.27031338, + "balance_loss_mlp": 0.27603707, + "epoch": 0.2905756801442958, + "flos": 21103875469440.0, + "grad_norm": 242.73364488234517, + "language_loss": 0.81474078, + "learning_rate": 3.329012449923736e-06, + "loss": 0.83361542, + "num_input_tokens_seen": 104250405, + "router_z_loss_clip": 3.12304688, + "router_z_loss_mlp": 0.28930664, + "step": 4833, + "time_per_iteration": 2.7506818771362305 + }, + { + "auxiliary_loss_clip": 0.01580332, + "auxiliary_loss_mlp": 0.00289654, + "balance_loss_clip": 1.27149081, + "balance_loss_mlp": 0.2601018, + "epoch": 0.29063580339696377, + "flos": 15706071158400.0, + "grad_norm": 3.3672665753678586, + "language_loss": 0.73633504, + "learning_rate": 3.3287213867392813e-06, + "loss": 0.75503486, + "num_input_tokens_seen": 104269185, + "router_z_loss_clip": 3.0859375, + "router_z_loss_mlp": 0.29528809, + "step": 4834, + "time_per_iteration": 2.814420461654663 + }, + { + "auxiliary_loss_clip": 0.01583481, + "auxiliary_loss_mlp": 0.00279231, + "balance_loss_clip": 1.27226496, + "balance_loss_mlp": 0.24747351, + "epoch": 0.29069592664963173, + "flos": 24644990793600.0, + "grad_norm": 2.6084360107463933, + "language_loss": 0.76715267, + "learning_rate": 3.3284302731695783e-06, + "loss": 0.78577983, + "num_input_tokens_seen": 104289400, + "router_z_loss_clip": 3.11132812, + "router_z_loss_mlp": 0.31738281, + "step": 4835, + "time_per_iteration": 4.108250141143799 + }, + { + "auxiliary_loss_clip": 0.01568995, + "auxiliary_loss_mlp": 0.0029273, + "balance_loss_clip": 1.26424408, + "balance_loss_mlp": 0.26166445, + "epoch": 0.2907560499022997, + "flos": 24973753000320.0, + "grad_norm": 3.4932471983634414, + "language_loss": 0.86801314, + "learning_rate": 3.3281391092256668e-06, + "loss": 0.88663042, + "num_input_tokens_seen": 104310485, + "router_z_loss_clip": 3.04882812, + "router_z_loss_mlp": 0.31079102, + "step": 4836, + "time_per_iteration": 2.697174310684204 + }, + { + "auxiliary_loss_clip": 0.01573095, + "auxiliary_loss_mlp": 0.00279948, + "balance_loss_clip": 1.271981, + "balance_loss_mlp": 0.24969263, + "epoch": 0.29081617315496766, + "flos": 18657496903680.0, + "grad_norm": 62.29199338589051, + "language_loss": 0.86402881, + "learning_rate": 3.3278478949185865e-06, + "loss": 0.88255924, + "num_input_tokens_seen": 104327330, + "router_z_loss_clip": 3.01171875, + "router_z_loss_mlp": 0.30236816, + "step": 4837, + "time_per_iteration": 2.618614912033081 + }, + { + "auxiliary_loss_clip": 0.0156619, + "auxiliary_loss_mlp": 0.00298499, + "balance_loss_clip": 1.26220489, + "balance_loss_mlp": 0.26681301, + "epoch": 0.2908762964076356, + "flos": 35331035955840.0, + "grad_norm": 14.619181300918413, + "language_loss": 0.73421693, + "learning_rate": 3.327556630259381e-06, + "loss": 0.75286388, + "num_input_tokens_seen": 104350350, + "router_z_loss_clip": 3.03710938, + "router_z_loss_mlp": 0.31665039, + "step": 4838, + "time_per_iteration": 2.8038620948791504 + }, + { + "auxiliary_loss_clip": 0.01568258, + "auxiliary_loss_mlp": 0.00289305, + "balance_loss_clip": 1.26272178, + "balance_loss_mlp": 0.25943065, + "epoch": 0.29093641966030365, + "flos": 23076305055360.0, + "grad_norm": 4.3222222011141165, + "language_loss": 0.78650403, + "learning_rate": 3.327265315259095e-06, + "loss": 0.8050797, + "num_input_tokens_seen": 104369995, + "router_z_loss_clip": 3.05859375, + "router_z_loss_mlp": 0.29858398, + "step": 4839, + "time_per_iteration": 2.6770036220550537 + }, + { + "auxiliary_loss_clip": 0.0158025, + "auxiliary_loss_mlp": 0.00302923, + "balance_loss_clip": 1.2756592, + "balance_loss_mlp": 0.27045065, + "epoch": 0.2909965429129716, + "flos": 35955415094400.0, + "grad_norm": 3.177759521372114, + "language_loss": 0.83013737, + "learning_rate": 3.326973949928776e-06, + "loss": 0.8489691, + "num_input_tokens_seen": 104392285, + "router_z_loss_clip": 3.05078125, + "router_z_loss_mlp": 0.32446289, + "step": 4840, + "time_per_iteration": 2.8860912322998047 + }, + { + "auxiliary_loss_clip": 0.01581242, + "auxiliary_loss_mlp": 0.00279988, + "balance_loss_clip": 1.27577853, + "balance_loss_mlp": 0.24858877, + "epoch": 0.2910566661656396, + "flos": 30880231764480.0, + "grad_norm": 103.12975403831918, + "language_loss": 0.70267689, + "learning_rate": 3.326682534279471e-06, + "loss": 0.72128922, + "num_input_tokens_seen": 104412640, + "router_z_loss_clip": 3.05273438, + "router_z_loss_mlp": 0.31396484, + "step": 4841, + "time_per_iteration": 2.7527477741241455 + }, + { + "auxiliary_loss_clip": 0.01561867, + "auxiliary_loss_mlp": 0.00272769, + "balance_loss_clip": 1.26434779, + "balance_loss_mlp": 0.24299042, + "epoch": 0.29111678941830754, + "flos": 30010188533760.0, + "grad_norm": 2.1470009958189222, + "language_loss": 0.76646912, + "learning_rate": 3.326391068322232e-06, + "loss": 0.78481549, + "num_input_tokens_seen": 104435245, + "router_z_loss_clip": 2.97460938, + "router_z_loss_mlp": 0.29760742, + "step": 4842, + "time_per_iteration": 4.1642467975616455 + }, + { + "auxiliary_loss_clip": 0.01568762, + "auxiliary_loss_mlp": 0.00278663, + "balance_loss_clip": 1.26454294, + "balance_loss_mlp": 0.24840814, + "epoch": 0.2911769126709755, + "flos": 22857393617280.0, + "grad_norm": 2.4205394423608584, + "language_loss": 0.80978823, + "learning_rate": 3.3260995520681098e-06, + "loss": 0.82826257, + "num_input_tokens_seen": 104455395, + "router_z_loss_clip": 3.04101562, + "router_z_loss_mlp": 0.3026123, + "step": 4843, + "time_per_iteration": 2.7080800533294678 + }, + { + "auxiliary_loss_clip": 0.01577086, + "auxiliary_loss_mlp": 0.00264048, + "balance_loss_clip": 1.2671206, + "balance_loss_mlp": 0.23336369, + "epoch": 0.2912370359236435, + "flos": 21650507619840.0, + "grad_norm": 3.791888943379148, + "language_loss": 0.67201537, + "learning_rate": 3.3258079855281602e-06, + "loss": 0.69042671, + "num_input_tokens_seen": 104473350, + "router_z_loss_clip": 3.1015625, + "router_z_loss_mlp": 0.30688477, + "step": 4844, + "time_per_iteration": 2.692915439605713 + }, + { + "auxiliary_loss_clip": 0.01577903, + "auxiliary_loss_mlp": 0.00289838, + "balance_loss_clip": 1.27152145, + "balance_loss_mlp": 0.2560299, + "epoch": 0.29129715917631144, + "flos": 22893340152960.0, + "grad_norm": 2.560732944449527, + "language_loss": 0.94854546, + "learning_rate": 3.3255163687134396e-06, + "loss": 0.96722293, + "num_input_tokens_seen": 104492265, + "router_z_loss_clip": 3.06054688, + "router_z_loss_mlp": 0.33837891, + "step": 4845, + "time_per_iteration": 2.7472190856933594 + }, + { + "auxiliary_loss_clip": 0.01547994, + "auxiliary_loss_mlp": 0.00295216, + "balance_loss_clip": 1.2566328, + "balance_loss_mlp": 0.26376811, + "epoch": 0.2913572824289794, + "flos": 22674464628480.0, + "grad_norm": 44.13642092755913, + "language_loss": 0.73461169, + "learning_rate": 3.3252247016350046e-06, + "loss": 0.75304377, + "num_input_tokens_seen": 104510755, + "router_z_loss_clip": 2.91210938, + "router_z_loss_mlp": 0.31445312, + "step": 4846, + "time_per_iteration": 2.6958000659942627 + }, + { + "auxiliary_loss_clip": 0.01534262, + "auxiliary_loss_mlp": 0.00253265, + "balance_loss_clip": 1.24763036, + "balance_loss_mlp": 0.22552454, + "epoch": 0.29141740568164737, + "flos": 23107403255040.0, + "grad_norm": 5.2253917170121955, + "language_loss": 0.76103383, + "learning_rate": 3.3249329843039166e-06, + "loss": 0.77890909, + "num_input_tokens_seen": 104530830, + "router_z_loss_clip": 2.86914062, + "router_z_loss_mlp": 0.27758789, + "step": 4847, + "time_per_iteration": 2.684823989868164 + }, + { + "auxiliary_loss_clip": 0.0154264, + "auxiliary_loss_mlp": 0.00272411, + "balance_loss_clip": 1.24918795, + "balance_loss_mlp": 0.24251309, + "epoch": 0.29147752893431533, + "flos": 23587026583680.0, + "grad_norm": 2.623932573418787, + "language_loss": 0.80926484, + "learning_rate": 3.324641216731237e-06, + "loss": 0.82741535, + "num_input_tokens_seen": 104550115, + "router_z_loss_clip": 2.93359375, + "router_z_loss_mlp": 0.29882812, + "step": 4848, + "time_per_iteration": 2.7317447662353516 + }, + { + "auxiliary_loss_clip": 0.01568333, + "auxiliary_loss_mlp": 0.00292211, + "balance_loss_clip": 1.2686348, + "balance_loss_mlp": 0.26256335, + "epoch": 0.2915376521869833, + "flos": 20591968792320.0, + "grad_norm": 17.577846442460558, + "language_loss": 0.85034895, + "learning_rate": 3.3243493989280295e-06, + "loss": 0.86895442, + "num_input_tokens_seen": 104566255, + "router_z_loss_clip": 2.99804688, + "router_z_loss_mlp": 0.29663086, + "step": 4849, + "time_per_iteration": 2.6301426887512207 + }, + { + "auxiliary_loss_clip": 0.01595768, + "auxiliary_loss_mlp": 0.0029886, + "balance_loss_clip": 1.28440905, + "balance_loss_mlp": 0.26600593, + "epoch": 0.29159777543965126, + "flos": 20811490761600.0, + "grad_norm": 4.050617406388676, + "language_loss": 0.8564505, + "learning_rate": 3.3240575309053596e-06, + "loss": 0.87539673, + "num_input_tokens_seen": 104585235, + "router_z_loss_clip": 3.1171875, + "router_z_loss_mlp": 0.32861328, + "step": 4850, + "time_per_iteration": 2.6469013690948486 + }, + { + "auxiliary_loss_clip": 0.01586234, + "auxiliary_loss_mlp": 0.00280953, + "balance_loss_clip": 1.28378403, + "balance_loss_mlp": 0.24988672, + "epoch": 0.29165789869231923, + "flos": 24244155947520.0, + "grad_norm": 17.82449392684173, + "language_loss": 0.83497405, + "learning_rate": 3.323765612674296e-06, + "loss": 0.85364592, + "num_input_tokens_seen": 104605315, + "router_z_loss_clip": 3.02148438, + "router_z_loss_mlp": 0.31030273, + "step": 4851, + "time_per_iteration": 2.638295888900757 + }, + { + "auxiliary_loss_clip": 0.01573609, + "auxiliary_loss_mlp": 0.00261814, + "balance_loss_clip": 1.27817523, + "balance_loss_mlp": 0.23346606, + "epoch": 0.29171802194498725, + "flos": 28949925853440.0, + "grad_norm": 1.9361715836929503, + "language_loss": 0.82053888, + "learning_rate": 3.3234736442459078e-06, + "loss": 0.83889306, + "num_input_tokens_seen": 104626055, + "router_z_loss_clip": 2.95507812, + "router_z_loss_mlp": 0.28393555, + "step": 4852, + "time_per_iteration": 2.7473742961883545 + }, + { + "auxiliary_loss_clip": 0.01616088, + "auxiliary_loss_mlp": 0.00273195, + "balance_loss_clip": 1.30166507, + "balance_loss_mlp": 0.24251074, + "epoch": 0.2917781451976552, + "flos": 22598226011520.0, + "grad_norm": 12.791631457804728, + "language_loss": 0.83584166, + "learning_rate": 3.3231816256312665e-06, + "loss": 0.85473454, + "num_input_tokens_seen": 104646005, + "router_z_loss_clip": 3.14453125, + "router_z_loss_mlp": 0.30664062, + "step": 4853, + "time_per_iteration": 2.7051782608032227 + }, + { + "auxiliary_loss_clip": 0.01569866, + "auxiliary_loss_mlp": 0.00269121, + "balance_loss_clip": 1.26828516, + "balance_loss_mlp": 0.23641036, + "epoch": 0.2918382684503232, + "flos": 21574448570880.0, + "grad_norm": 4.195435542249867, + "language_loss": 0.93571937, + "learning_rate": 3.322889556841445e-06, + "loss": 0.95410931, + "num_input_tokens_seen": 104661620, + "router_z_loss_clip": 3.01757812, + "router_z_loss_mlp": 0.3269043, + "step": 4854, + "time_per_iteration": 2.6016831398010254 + }, + { + "auxiliary_loss_clip": 0.01602876, + "auxiliary_loss_mlp": 0.00265811, + "balance_loss_clip": 1.29563642, + "balance_loss_mlp": 0.23267131, + "epoch": 0.29189839170299114, + "flos": 24353503925760.0, + "grad_norm": 12.959596729799504, + "language_loss": 0.90871692, + "learning_rate": 3.322597437887519e-06, + "loss": 0.92740381, + "num_input_tokens_seen": 104681445, + "router_z_loss_clip": 3.07226562, + "router_z_loss_mlp": 0.33154297, + "step": 4855, + "time_per_iteration": 2.676748752593994 + }, + { + "auxiliary_loss_clip": 0.01391324, + "auxiliary_loss_mlp": 0.00133189, + "balance_loss_clip": 1.13202024, + "balance_loss_mlp": 0.12322346, + "epoch": 0.2919585149556591, + "flos": 71316726215040.0, + "grad_norm": 0.7962725211885296, + "language_loss": 0.60009313, + "learning_rate": 3.322305268780566e-06, + "loss": 0.61533827, + "num_input_tokens_seen": 104747945, + "router_z_loss_clip": 2.59375, + "router_z_loss_mlp": 0.09960938, + "step": 4856, + "time_per_iteration": 3.255371332168579 + }, + { + "auxiliary_loss_clip": 0.01577311, + "auxiliary_loss_mlp": 0.00249907, + "balance_loss_clip": 1.28431904, + "balance_loss_mlp": 0.21681458, + "epoch": 0.2920186382083271, + "flos": 15633208419840.0, + "grad_norm": 11.359451870640555, + "language_loss": 0.76261055, + "learning_rate": 3.322013049531664e-06, + "loss": 0.78088272, + "num_input_tokens_seen": 104766225, + "router_z_loss_clip": 2.9296875, + "router_z_loss_mlp": 0.33105469, + "step": 4857, + "time_per_iteration": 2.618434190750122 + }, + { + "auxiliary_loss_clip": 0.01595119, + "auxiliary_loss_mlp": 0.00274457, + "balance_loss_clip": 1.29268241, + "balance_loss_mlp": 0.2450359, + "epoch": 0.29207876146099504, + "flos": 28366018364160.0, + "grad_norm": 25.411934396594226, + "language_loss": 0.90066671, + "learning_rate": 3.321720780151895e-06, + "loss": 0.91936243, + "num_input_tokens_seen": 104785345, + "router_z_loss_clip": 3.01953125, + "router_z_loss_mlp": 0.29467773, + "step": 4858, + "time_per_iteration": 2.7145328521728516 + }, + { + "auxiliary_loss_clip": 0.01614129, + "auxiliary_loss_mlp": 0.00268406, + "balance_loss_clip": 1.31104529, + "balance_loss_mlp": 0.2351228, + "epoch": 0.292138884713663, + "flos": 21870963342720.0, + "grad_norm": 2.235097327405344, + "language_loss": 0.84422982, + "learning_rate": 3.321428460652342e-06, + "loss": 0.86305511, + "num_input_tokens_seen": 104804560, + "router_z_loss_clip": 3.02929688, + "router_z_loss_mlp": 0.33251953, + "step": 4859, + "time_per_iteration": 2.711402177810669 + }, + { + "auxiliary_loss_clip": 0.01612589, + "auxiliary_loss_mlp": 0.00286582, + "balance_loss_clip": 1.30119109, + "balance_loss_mlp": 0.25353739, + "epoch": 0.29219900796633097, + "flos": 20992552243200.0, + "grad_norm": 7.123776477872965, + "language_loss": 0.79723531, + "learning_rate": 3.3211360910440885e-06, + "loss": 0.81622708, + "num_input_tokens_seen": 104821105, + "router_z_loss_clip": 3.11132812, + "router_z_loss_mlp": 0.33056641, + "step": 4860, + "time_per_iteration": 2.7873964309692383 + }, + { + "auxiliary_loss_clip": 0.01619301, + "auxiliary_loss_mlp": 0.00255276, + "balance_loss_clip": 1.31606317, + "balance_loss_mlp": 0.22375695, + "epoch": 0.29225913121899894, + "flos": 35004608133120.0, + "grad_norm": 13.87536757520556, + "language_loss": 0.81999749, + "learning_rate": 3.320843671338222e-06, + "loss": 0.83874333, + "num_input_tokens_seen": 104841440, + "router_z_loss_clip": 3.03320312, + "router_z_loss_mlp": 0.31494141, + "step": 4861, + "time_per_iteration": 2.8162572383880615 + }, + { + "auxiliary_loss_clip": 0.01639005, + "auxiliary_loss_mlp": 0.0024486, + "balance_loss_clip": 1.32986903, + "balance_loss_mlp": 0.21164876, + "epoch": 0.2923192544716669, + "flos": 13515663888000.0, + "grad_norm": 159.49943672448535, + "language_loss": 0.98120123, + "learning_rate": 3.320551201545832e-06, + "loss": 1.00003982, + "num_input_tokens_seen": 104858210, + "router_z_loss_clip": 3.09179688, + "router_z_loss_mlp": 0.33203125, + "step": 4862, + "time_per_iteration": 2.6080386638641357 + }, + { + "auxiliary_loss_clip": 0.01590607, + "auxiliary_loss_mlp": 0.0025822, + "balance_loss_clip": 1.29491806, + "balance_loss_mlp": 0.22596201, + "epoch": 0.29237937772433487, + "flos": 19463512141440.0, + "grad_norm": 27.17185689422184, + "language_loss": 0.81215799, + "learning_rate": 3.320258681678008e-06, + "loss": 0.83064628, + "num_input_tokens_seen": 104875620, + "router_z_loss_clip": 2.95507812, + "router_z_loss_mlp": 0.32299805, + "step": 4863, + "time_per_iteration": 2.62457013130188 + }, + { + "auxiliary_loss_clip": 0.01585761, + "auxiliary_loss_mlp": 0.00225091, + "balance_loss_clip": 1.29397345, + "balance_loss_mlp": 0.19257085, + "epoch": 0.29243950097700283, + "flos": 20850597694080.0, + "grad_norm": 6.0708105272629, + "language_loss": 0.85696501, + "learning_rate": 3.319966111745842e-06, + "loss": 0.87507355, + "num_input_tokens_seen": 104894600, + "router_z_loss_clip": 2.91796875, + "router_z_loss_mlp": 0.32543945, + "step": 4864, + "time_per_iteration": 2.6734812259674072 + }, + { + "auxiliary_loss_clip": 0.01599777, + "auxiliary_loss_mlp": 0.00241702, + "balance_loss_clip": 1.30576277, + "balance_loss_mlp": 0.2077989, + "epoch": 0.29249962422967085, + "flos": 23584225322880.0, + "grad_norm": 3.2930103635377006, + "language_loss": 0.87697041, + "learning_rate": 3.319673491760429e-06, + "loss": 0.89538515, + "num_input_tokens_seen": 104914530, + "router_z_loss_clip": 2.9375, + "router_z_loss_mlp": 0.33935547, + "step": 4865, + "time_per_iteration": 2.676692247390747 + }, + { + "auxiliary_loss_clip": 0.01621299, + "auxiliary_loss_mlp": 0.00253523, + "balance_loss_clip": 1.31580913, + "balance_loss_mlp": 0.21964367, + "epoch": 0.2925597474823388, + "flos": 22273342473600.0, + "grad_norm": 9.349034200563327, + "language_loss": 0.93262506, + "learning_rate": 3.3193808217328645e-06, + "loss": 0.95137334, + "num_input_tokens_seen": 104933460, + "router_z_loss_clip": 3.05273438, + "router_z_loss_mlp": 0.33911133, + "step": 4866, + "time_per_iteration": 2.631282329559326 + }, + { + "auxiliary_loss_clip": 0.0158654, + "auxiliary_loss_mlp": 0.00238703, + "balance_loss_clip": 1.29215837, + "balance_loss_mlp": 0.20737515, + "epoch": 0.2926198707350068, + "flos": 34456108475520.0, + "grad_norm": 23.1504506729568, + "language_loss": 0.82083607, + "learning_rate": 3.3190881016742476e-06, + "loss": 0.8390885, + "num_input_tokens_seen": 104954495, + "router_z_loss_clip": 2.9453125, + "router_z_loss_mlp": 0.31298828, + "step": 4867, + "time_per_iteration": 2.7648160457611084 + }, + { + "auxiliary_loss_clip": 0.01589795, + "auxiliary_loss_mlp": 0.0025033, + "balance_loss_clip": 1.29170275, + "balance_loss_mlp": 0.21595022, + "epoch": 0.29267999398767475, + "flos": 20704153944960.0, + "grad_norm": 431.9207819175144, + "language_loss": 0.79193532, + "learning_rate": 3.3187953315956776e-06, + "loss": 0.81033653, + "num_input_tokens_seen": 104971915, + "router_z_loss_clip": 2.97851562, + "router_z_loss_mlp": 0.34375, + "step": 4868, + "time_per_iteration": 2.6399261951446533 + }, + { + "auxiliary_loss_clip": 0.01563624, + "auxiliary_loss_mlp": 0.0024095, + "balance_loss_clip": 1.27770591, + "balance_loss_mlp": 0.21057546, + "epoch": 0.2927401172403427, + "flos": 18368667642240.0, + "grad_norm": 3.686849990221056, + "language_loss": 0.79627752, + "learning_rate": 3.3185025115082566e-06, + "loss": 0.81432331, + "num_input_tokens_seen": 104991335, + "router_z_loss_clip": 2.86132812, + "router_z_loss_mlp": 0.30395508, + "step": 4869, + "time_per_iteration": 2.650019407272339 + }, + { + "auxiliary_loss_clip": 0.0157325, + "auxiliary_loss_mlp": 0.0023359, + "balance_loss_clip": 1.28253591, + "balance_loss_mlp": 0.19868596, + "epoch": 0.2928002404930107, + "flos": 26104041244800.0, + "grad_norm": 9.510378483080297, + "language_loss": 0.82524168, + "learning_rate": 3.318209641423088e-06, + "loss": 0.84331006, + "num_input_tokens_seen": 105012015, + "router_z_loss_clip": 2.90429688, + "router_z_loss_mlp": 0.34887695, + "step": 4870, + "time_per_iteration": 2.730355978012085 + }, + { + "auxiliary_loss_clip": 0.01588317, + "auxiliary_loss_mlp": 0.00264624, + "balance_loss_clip": 1.29045367, + "balance_loss_mlp": 0.23124561, + "epoch": 0.29286036374567864, + "flos": 21324726241920.0, + "grad_norm": 6.00621640419454, + "language_loss": 0.77302265, + "learning_rate": 3.3179167213512777e-06, + "loss": 0.79155207, + "num_input_tokens_seen": 105031460, + "router_z_loss_clip": 2.98046875, + "router_z_loss_mlp": 0.33398438, + "step": 4871, + "time_per_iteration": 4.191540718078613 + }, + { + "auxiliary_loss_clip": 0.01561923, + "auxiliary_loss_mlp": 0.00234273, + "balance_loss_clip": 1.27241158, + "balance_loss_mlp": 0.20294437, + "epoch": 0.2929204869983466, + "flos": 29569492569600.0, + "grad_norm": 2.8572046450462216, + "language_loss": 0.84662962, + "learning_rate": 3.317623751303933e-06, + "loss": 0.8645916, + "num_input_tokens_seen": 105052965, + "router_z_loss_clip": 2.89257812, + "router_z_loss_mlp": 0.31347656, + "step": 4872, + "time_per_iteration": 2.718592882156372 + }, + { + "auxiliary_loss_clip": 0.01609723, + "auxiliary_loss_mlp": 0.00269416, + "balance_loss_clip": 1.30285883, + "balance_loss_mlp": 0.23680004, + "epoch": 0.2929806102510146, + "flos": 19058259922560.0, + "grad_norm": 3.7030218909441066, + "language_loss": 0.78123587, + "learning_rate": 3.317330731292164e-06, + "loss": 0.80002725, + "num_input_tokens_seen": 105071840, + "router_z_loss_clip": 3.06835938, + "router_z_loss_mlp": 0.32641602, + "step": 4873, + "time_per_iteration": 2.656299591064453 + }, + { + "auxiliary_loss_clip": 0.01583732, + "auxiliary_loss_mlp": 0.00275961, + "balance_loss_clip": 1.28719759, + "balance_loss_mlp": 0.24353588, + "epoch": 0.29304073350368254, + "flos": 21944221130880.0, + "grad_norm": 14.459734128343657, + "language_loss": 0.8505441, + "learning_rate": 3.3170376613270812e-06, + "loss": 0.86914098, + "num_input_tokens_seen": 105089445, + "router_z_loss_clip": 2.96679688, + "router_z_loss_mlp": 0.32397461, + "step": 4874, + "time_per_iteration": 4.185588836669922 + }, + { + "auxiliary_loss_clip": 0.01600913, + "auxiliary_loss_mlp": 0.00261548, + "balance_loss_clip": 1.29444075, + "balance_loss_mlp": 0.22866964, + "epoch": 0.2931008567563505, + "flos": 15450818135040.0, + "grad_norm": 77.0191525701655, + "language_loss": 0.85980272, + "learning_rate": 3.3167445414197985e-06, + "loss": 0.87842727, + "num_input_tokens_seen": 105106210, + "router_z_loss_clip": 3.06445312, + "router_z_loss_mlp": 0.32910156, + "step": 4875, + "time_per_iteration": 2.6289570331573486 + }, + { + "auxiliary_loss_clip": 0.01567314, + "auxiliary_loss_mlp": 0.00245722, + "balance_loss_clip": 1.2759335, + "balance_loss_mlp": 0.21442959, + "epoch": 0.29316098000901847, + "flos": 16983162288000.0, + "grad_norm": 2.9196948242157665, + "language_loss": 0.76263922, + "learning_rate": 3.316451371581431e-06, + "loss": 0.78076959, + "num_input_tokens_seen": 105124200, + "router_z_loss_clip": 2.91015625, + "router_z_loss_mlp": 0.31311035, + "step": 4876, + "time_per_iteration": 2.617654323577881 + }, + { + "auxiliary_loss_clip": 0.01579555, + "auxiliary_loss_mlp": 0.00249227, + "balance_loss_clip": 1.28276205, + "balance_loss_mlp": 0.21665952, + "epoch": 0.29322110326168643, + "flos": 16357705741440.0, + "grad_norm": 84.99190085639044, + "language_loss": 0.90022069, + "learning_rate": 3.316158151823096e-06, + "loss": 0.91850853, + "num_input_tokens_seen": 105140400, + "router_z_loss_clip": 2.96875, + "router_z_loss_mlp": 0.32568359, + "step": 4877, + "time_per_iteration": 4.060422658920288 + }, + { + "auxiliary_loss_clip": 0.01596862, + "auxiliary_loss_mlp": 0.00263144, + "balance_loss_clip": 1.29449081, + "balance_loss_mlp": 0.23214975, + "epoch": 0.29328122651435445, + "flos": 13990869843840.0, + "grad_norm": 121.85008833822694, + "language_loss": 0.79009748, + "learning_rate": 3.315864882155911e-06, + "loss": 0.80869758, + "num_input_tokens_seen": 105157535, + "router_z_loss_clip": 3.02539062, + "router_z_loss_mlp": 0.30957031, + "step": 4878, + "time_per_iteration": 2.626777410507202 + }, + { + "auxiliary_loss_clip": 0.01603554, + "auxiliary_loss_mlp": 0.00260925, + "balance_loss_clip": 1.30484915, + "balance_loss_mlp": 0.22852355, + "epoch": 0.2933413497670224, + "flos": 25264593423360.0, + "grad_norm": 4.676252663622893, + "language_loss": 0.81738377, + "learning_rate": 3.3155715625909982e-06, + "loss": 0.83602858, + "num_input_tokens_seen": 105175185, + "router_z_loss_clip": 2.98632812, + "router_z_loss_mlp": 0.32421875, + "step": 4879, + "time_per_iteration": 2.776411771774292 + }, + { + "auxiliary_loss_clip": 0.01626965, + "auxiliary_loss_mlp": 0.00263191, + "balance_loss_clip": 1.31708217, + "balance_loss_mlp": 0.23238684, + "epoch": 0.2934014730196904, + "flos": 32123746656000.0, + "grad_norm": 3.393169267782962, + "language_loss": 0.74953312, + "learning_rate": 3.3152781931394803e-06, + "loss": 0.76843464, + "num_input_tokens_seen": 105194540, + "router_z_loss_clip": 3.09960938, + "router_z_loss_mlp": 0.30810547, + "step": 4880, + "time_per_iteration": 2.7478859424591064 + }, + { + "auxiliary_loss_clip": 0.01599791, + "auxiliary_loss_mlp": 0.00266532, + "balance_loss_clip": 1.29189992, + "balance_loss_mlp": 0.23210463, + "epoch": 0.29346159627235835, + "flos": 24352498344960.0, + "grad_norm": 7.115627434233218, + "language_loss": 0.81892186, + "learning_rate": 3.314984773812481e-06, + "loss": 0.83758509, + "num_input_tokens_seen": 105213215, + "router_z_loss_clip": 3.08007812, + "router_z_loss_mlp": 0.34448242, + "step": 4881, + "time_per_iteration": 2.6834983825683594 + }, + { + "auxiliary_loss_clip": 0.01619698, + "auxiliary_loss_mlp": 0.00276802, + "balance_loss_clip": 1.31173456, + "balance_loss_mlp": 0.24299452, + "epoch": 0.2935217195250263, + "flos": 22746752749440.0, + "grad_norm": 9.904588074905043, + "language_loss": 0.88614523, + "learning_rate": 3.314691304621127e-06, + "loss": 0.90511024, + "num_input_tokens_seen": 105231585, + "router_z_loss_clip": 3.08203125, + "router_z_loss_mlp": 0.33789062, + "step": 4882, + "time_per_iteration": 2.6612088680267334 + }, + { + "auxiliary_loss_clip": 0.01636963, + "auxiliary_loss_mlp": 0.00271059, + "balance_loss_clip": 1.31880379, + "balance_loss_mlp": 0.23801431, + "epoch": 0.2935818427776943, + "flos": 21725561088000.0, + "grad_norm": 5.004506428808566, + "language_loss": 0.84553063, + "learning_rate": 3.314397785576548e-06, + "loss": 0.86461079, + "num_input_tokens_seen": 105250120, + "router_z_loss_clip": 3.18359375, + "router_z_loss_mlp": 0.33056641, + "step": 4883, + "time_per_iteration": 2.704188585281372 + }, + { + "auxiliary_loss_clip": 0.01633524, + "auxiliary_loss_mlp": 0.00260989, + "balance_loss_clip": 1.32149863, + "balance_loss_mlp": 0.22470142, + "epoch": 0.29364196603036224, + "flos": 23804968354560.0, + "grad_norm": 26.724161491589474, + "language_loss": 0.99148232, + "learning_rate": 3.3141042166898726e-06, + "loss": 1.01042747, + "num_input_tokens_seen": 105266065, + "router_z_loss_clip": 3.11914062, + "router_z_loss_mlp": 0.36279297, + "step": 4884, + "time_per_iteration": 2.708307981491089 + }, + { + "auxiliary_loss_clip": 0.01609257, + "auxiliary_loss_mlp": 0.00261951, + "balance_loss_clip": 1.30433774, + "balance_loss_mlp": 0.22955039, + "epoch": 0.2937020892830302, + "flos": 23470064922240.0, + "grad_norm": 19.61680200977082, + "language_loss": 0.81169522, + "learning_rate": 3.313810597972234e-06, + "loss": 0.83040732, + "num_input_tokens_seen": 105282155, + "router_z_loss_clip": 3.04882812, + "router_z_loss_mlp": 0.32409668, + "step": 4885, + "time_per_iteration": 4.126938581466675 + }, + { + "auxiliary_loss_clip": 0.01590925, + "auxiliary_loss_mlp": 0.00241976, + "balance_loss_clip": 1.28907287, + "balance_loss_mlp": 0.20916972, + "epoch": 0.2937622125356982, + "flos": 24272740195200.0, + "grad_norm": 8.852823358963004, + "language_loss": 0.91809684, + "learning_rate": 3.3135169294347655e-06, + "loss": 0.93642581, + "num_input_tokens_seen": 105299225, + "router_z_loss_clip": 3.01757812, + "router_z_loss_mlp": 0.32788086, + "step": 4886, + "time_per_iteration": 2.7778303623199463 + }, + { + "auxiliary_loss_clip": 0.01567158, + "auxiliary_loss_mlp": 0.00239144, + "balance_loss_clip": 1.26939094, + "balance_loss_mlp": 0.20690984, + "epoch": 0.29382233578836614, + "flos": 20662461233280.0, + "grad_norm": 3.1583114443597164, + "language_loss": 0.84297299, + "learning_rate": 3.313223211088603e-06, + "loss": 0.86103594, + "num_input_tokens_seen": 105315710, + "router_z_loss_clip": 2.97851562, + "router_z_loss_mlp": 0.3223877, + "step": 4887, + "time_per_iteration": 2.674790382385254 + }, + { + "auxiliary_loss_clip": 0.01602078, + "auxiliary_loss_mlp": 0.00241221, + "balance_loss_clip": 1.29781556, + "balance_loss_mlp": 0.20803355, + "epoch": 0.2938824590410341, + "flos": 16545052103040.0, + "grad_norm": 70.45312626741318, + "language_loss": 0.8845064, + "learning_rate": 3.3129294429448855e-06, + "loss": 0.90293944, + "num_input_tokens_seen": 105333505, + "router_z_loss_clip": 3.03710938, + "router_z_loss_mlp": 0.33178711, + "step": 4888, + "time_per_iteration": 2.6701881885528564 + }, + { + "auxiliary_loss_clip": 0.01556964, + "auxiliary_loss_mlp": 0.00233421, + "balance_loss_clip": 1.26061451, + "balance_loss_mlp": 0.20032865, + "epoch": 0.29394258229370207, + "flos": 37925474382720.0, + "grad_norm": 2.072068413199802, + "language_loss": 0.6124239, + "learning_rate": 3.3126356250147517e-06, + "loss": 0.63032764, + "num_input_tokens_seen": 105355605, + "router_z_loss_clip": 2.96289062, + "router_z_loss_mlp": 0.33129883, + "step": 4889, + "time_per_iteration": 2.80428409576416 + }, + { + "auxiliary_loss_clip": 0.01560569, + "auxiliary_loss_mlp": 0.0024648, + "balance_loss_clip": 1.26463056, + "balance_loss_mlp": 0.21541393, + "epoch": 0.29400270554637004, + "flos": 20044690197120.0, + "grad_norm": 4.9476499333792505, + "language_loss": 0.91354263, + "learning_rate": 3.3123417573093434e-06, + "loss": 0.93161309, + "num_input_tokens_seen": 105374225, + "router_z_loss_clip": 2.9609375, + "router_z_loss_mlp": 0.31079102, + "step": 4890, + "time_per_iteration": 2.6812074184417725 + }, + { + "auxiliary_loss_clip": 0.01585664, + "auxiliary_loss_mlp": 0.00255856, + "balance_loss_clip": 1.27331817, + "balance_loss_mlp": 0.22088045, + "epoch": 0.294062828799038, + "flos": 15266380775040.0, + "grad_norm": 2.763896941661604, + "language_loss": 0.80707657, + "learning_rate": 3.3120478398398046e-06, + "loss": 0.82549179, + "num_input_tokens_seen": 105391565, + "router_z_loss_clip": 3.11914062, + "router_z_loss_mlp": 0.35009766, + "step": 4891, + "time_per_iteration": 2.691605806350708 + }, + { + "auxiliary_loss_clip": 0.01565829, + "auxiliary_loss_mlp": 0.00254881, + "balance_loss_clip": 1.2630744, + "balance_loss_mlp": 0.22256364, + "epoch": 0.294122952051706, + "flos": 22747147799040.0, + "grad_norm": 195.97269192635505, + "language_loss": 0.84506822, + "learning_rate": 3.3117538726172797e-06, + "loss": 0.86327541, + "num_input_tokens_seen": 105409840, + "router_z_loss_clip": 3.0234375, + "router_z_loss_mlp": 0.32324219, + "step": 4892, + "time_per_iteration": 2.65470552444458 + }, + { + "auxiliary_loss_clip": 0.01544291, + "auxiliary_loss_mlp": 0.00255681, + "balance_loss_clip": 1.25093508, + "balance_loss_mlp": 0.22187361, + "epoch": 0.294183075304374, + "flos": 24972891073920.0, + "grad_norm": 14.981574963847866, + "language_loss": 0.82390839, + "learning_rate": 3.3114598556529164e-06, + "loss": 0.8419081, + "num_input_tokens_seen": 105428645, + "router_z_loss_clip": 2.93359375, + "router_z_loss_mlp": 0.33813477, + "step": 4893, + "time_per_iteration": 2.6904594898223877 + }, + { + "auxiliary_loss_clip": 0.01560109, + "auxiliary_loss_mlp": 0.00244409, + "balance_loss_clip": 1.26016784, + "balance_loss_mlp": 0.21153116, + "epoch": 0.29424319855704195, + "flos": 30952986762240.0, + "grad_norm": 7.943945494125445, + "language_loss": 0.89851779, + "learning_rate": 3.311165788957864e-06, + "loss": 0.91656291, + "num_input_tokens_seen": 105447480, + "router_z_loss_clip": 2.99609375, + "router_z_loss_mlp": 0.32885742, + "step": 4894, + "time_per_iteration": 2.706162929534912 + }, + { + "auxiliary_loss_clip": 0.01564191, + "auxiliary_loss_mlp": 0.00246161, + "balance_loss_clip": 1.25974727, + "balance_loss_mlp": 0.21259144, + "epoch": 0.2943033218097099, + "flos": 15231583474560.0, + "grad_norm": 554.7390429639485, + "language_loss": 0.97925097, + "learning_rate": 3.310871672543274e-06, + "loss": 0.99735451, + "num_input_tokens_seen": 105464600, + "router_z_loss_clip": 3.04492188, + "router_z_loss_mlp": 0.33569336, + "step": 4895, + "time_per_iteration": 2.644888401031494 + }, + { + "auxiliary_loss_clip": 0.01572693, + "auxiliary_loss_mlp": 0.00284124, + "balance_loss_clip": 1.26775539, + "balance_loss_mlp": 0.25038785, + "epoch": 0.2943634450623779, + "flos": 21725884310400.0, + "grad_norm": 3.745031108136354, + "language_loss": 0.96112728, + "learning_rate": 3.3105775064202982e-06, + "loss": 0.97969544, + "num_input_tokens_seen": 105481510, + "router_z_loss_clip": 3.04492188, + "router_z_loss_mlp": 0.33740234, + "step": 4896, + "time_per_iteration": 2.74212384223938 + }, + { + "auxiliary_loss_clip": 0.01555809, + "auxiliary_loss_mlp": 0.00266837, + "balance_loss_clip": 1.25291657, + "balance_loss_mlp": 0.2322664, + "epoch": 0.29442356831504585, + "flos": 22602104680320.0, + "grad_norm": 27.79940233167017, + "language_loss": 0.79677582, + "learning_rate": 3.3102832906000924e-06, + "loss": 0.8150022, + "num_input_tokens_seen": 105501390, + "router_z_loss_clip": 3.02734375, + "router_z_loss_mlp": 0.34545898, + "step": 4897, + "time_per_iteration": 2.6942689418792725 + }, + { + "auxiliary_loss_clip": 0.01572964, + "auxiliary_loss_mlp": 0.00251935, + "balance_loss_clip": 1.26245177, + "balance_loss_mlp": 0.21912819, + "epoch": 0.2944836915677138, + "flos": 20011401267840.0, + "grad_norm": 28.666296256917633, + "language_loss": 0.83659971, + "learning_rate": 3.309989025093813e-06, + "loss": 0.85484862, + "num_input_tokens_seen": 105519600, + "router_z_loss_clip": 3.10546875, + "router_z_loss_mlp": 0.328125, + "step": 4898, + "time_per_iteration": 2.6339564323425293 + }, + { + "auxiliary_loss_clip": 0.01586863, + "auxiliary_loss_mlp": 0.00312794, + "balance_loss_clip": 1.27552974, + "balance_loss_mlp": 0.27564791, + "epoch": 0.2945438148203818, + "flos": 20045875345920.0, + "grad_norm": 3.398844834666672, + "language_loss": 0.82648867, + "learning_rate": 3.309694709912618e-06, + "loss": 0.84548527, + "num_input_tokens_seen": 105535970, + "router_z_loss_clip": 3.11328125, + "router_z_loss_mlp": 0.37133789, + "step": 4899, + "time_per_iteration": 2.6374709606170654 + }, + { + "auxiliary_loss_clip": 0.01552376, + "auxiliary_loss_mlp": 0.00305971, + "balance_loss_clip": 1.25357175, + "balance_loss_mlp": 0.27235356, + "epoch": 0.29460393807304974, + "flos": 23733542160000.0, + "grad_norm": 87.18935639466329, + "language_loss": 0.86218584, + "learning_rate": 3.3094003450676685e-06, + "loss": 0.88076931, + "num_input_tokens_seen": 105556735, + "router_z_loss_clip": 2.9921875, + "router_z_loss_mlp": 0.3359375, + "step": 4900, + "time_per_iteration": 2.6559653282165527 + }, + { + "auxiliary_loss_clip": 0.01567218, + "auxiliary_loss_mlp": 0.00284819, + "balance_loss_clip": 1.26311111, + "balance_loss_mlp": 0.25296587, + "epoch": 0.2946640613257177, + "flos": 14976079056000.0, + "grad_norm": 36.04462282724581, + "language_loss": 0.8727411, + "learning_rate": 3.3091059305701268e-06, + "loss": 0.89126152, + "num_input_tokens_seen": 105574875, + "router_z_loss_clip": 3.0390625, + "router_z_loss_mlp": 0.31835938, + "step": 4901, + "time_per_iteration": 2.6325416564941406 + }, + { + "auxiliary_loss_clip": 0.01576879, + "auxiliary_loss_mlp": 0.00277652, + "balance_loss_clip": 1.2828455, + "balance_loss_mlp": 0.24724174, + "epoch": 0.2947241845783857, + "flos": 24243904552320.0, + "grad_norm": 4.442421748483573, + "language_loss": 0.66384214, + "learning_rate": 3.308811466431157e-06, + "loss": 0.68238747, + "num_input_tokens_seen": 105594225, + "router_z_loss_clip": 2.94140625, + "router_z_loss_mlp": 0.30407715, + "step": 4902, + "time_per_iteration": 2.6815147399902344 + }, + { + "auxiliary_loss_clip": 0.0156737, + "auxiliary_loss_mlp": 0.00296715, + "balance_loss_clip": 1.26849294, + "balance_loss_mlp": 0.26574424, + "epoch": 0.29478430783105364, + "flos": 19938394874880.0, + "grad_norm": 3.8208432734531836, + "language_loss": 0.82519406, + "learning_rate": 3.308516952661925e-06, + "loss": 0.84383494, + "num_input_tokens_seen": 105614000, + "router_z_loss_clip": 2.99023438, + "router_z_loss_mlp": 0.30981445, + "step": 4903, + "time_per_iteration": 2.7038252353668213 + }, + { + "auxiliary_loss_clip": 0.01616479, + "auxiliary_loss_mlp": 0.00297667, + "balance_loss_clip": 1.30352068, + "balance_loss_mlp": 0.26598161, + "epoch": 0.2948444310837216, + "flos": 27381347856000.0, + "grad_norm": 666.165085415943, + "language_loss": 0.70741051, + "learning_rate": 3.3082223892736e-06, + "loss": 0.72655201, + "num_input_tokens_seen": 105634575, + "router_z_loss_clip": 3.12890625, + "router_z_loss_mlp": 0.31713867, + "step": 4904, + "time_per_iteration": 2.7333292961120605 + }, + { + "auxiliary_loss_clip": 0.0160765, + "auxiliary_loss_mlp": 0.00308697, + "balance_loss_clip": 1.29696178, + "balance_loss_mlp": 0.27529496, + "epoch": 0.2949045543363896, + "flos": 23405462311680.0, + "grad_norm": 3.399832230751802, + "language_loss": 0.81180233, + "learning_rate": 3.3079277762773496e-06, + "loss": 0.83096588, + "num_input_tokens_seen": 105654385, + "router_z_loss_clip": 3.109375, + "router_z_loss_mlp": 0.33422852, + "step": 4905, + "time_per_iteration": 2.817603349685669 + }, + { + "auxiliary_loss_clip": 0.0161021, + "auxiliary_loss_mlp": 0.00306947, + "balance_loss_clip": 1.29880846, + "balance_loss_mlp": 0.27442691, + "epoch": 0.2949646775890576, + "flos": 23951483930880.0, + "grad_norm": 27.08106214973461, + "language_loss": 0.88693029, + "learning_rate": 3.3076331136843476e-06, + "loss": 0.90610194, + "num_input_tokens_seen": 105673570, + "router_z_loss_clip": 3.1171875, + "router_z_loss_mlp": 0.32495117, + "step": 4906, + "time_per_iteration": 2.666475296020508 + }, + { + "auxiliary_loss_clip": 0.01602357, + "auxiliary_loss_mlp": 0.00265874, + "balance_loss_clip": 1.29971695, + "balance_loss_mlp": 0.23502308, + "epoch": 0.29502480084172555, + "flos": 22784315397120.0, + "grad_norm": 11.488511977439128, + "language_loss": 0.93487805, + "learning_rate": 3.3073384015057667e-06, + "loss": 0.95356041, + "num_input_tokens_seen": 105691940, + "router_z_loss_clip": 3.02734375, + "router_z_loss_mlp": 0.30859375, + "step": 4907, + "time_per_iteration": 2.7563719749450684 + }, + { + "auxiliary_loss_clip": 0.01624353, + "auxiliary_loss_mlp": 0.00302169, + "balance_loss_clip": 1.30920923, + "balance_loss_mlp": 0.26926696, + "epoch": 0.2950849240943935, + "flos": 19646656611840.0, + "grad_norm": 11.44433723251566, + "language_loss": 0.89514732, + "learning_rate": 3.307043639752782e-06, + "loss": 0.9144125, + "num_input_tokens_seen": 105709825, + "router_z_loss_clip": 3.15429688, + "router_z_loss_mlp": 0.32910156, + "step": 4908, + "time_per_iteration": 2.652764320373535 + }, + { + "auxiliary_loss_clip": 0.01654949, + "auxiliary_loss_mlp": 0.00082233, + "balance_loss_clip": 1.34268427, + "balance_loss_mlp": 0.07198129, + "epoch": 0.2951450473470615, + "flos": 71002829260800.0, + "grad_norm": 0.7920966601209799, + "language_loss": 0.57292002, + "learning_rate": 3.3067488284365728e-06, + "loss": 0.5902918, + "num_input_tokens_seen": 105766880, + "router_z_loss_clip": 3.125, + "router_z_loss_mlp": 0.10253906, + "step": 4909, + "time_per_iteration": 3.0749146938323975 + }, + { + "auxiliary_loss_clip": 0.01591594, + "auxiliary_loss_mlp": 0.00303792, + "balance_loss_clip": 1.28472471, + "balance_loss_mlp": 0.27086619, + "epoch": 0.29520517059972945, + "flos": 22966310632320.0, + "grad_norm": 4.519770571304, + "language_loss": 0.92212129, + "learning_rate": 3.3064539675683163e-06, + "loss": 0.94107509, + "num_input_tokens_seen": 105786875, + "router_z_loss_clip": 3.07226562, + "router_z_loss_mlp": 0.32958984, + "step": 4910, + "time_per_iteration": 2.7072455883026123 + }, + { + "auxiliary_loss_clip": 0.0160599, + "auxiliary_loss_mlp": 0.00305976, + "balance_loss_clip": 1.29235601, + "balance_loss_mlp": 0.27750924, + "epoch": 0.2952652938523974, + "flos": 20485673470080.0, + "grad_norm": 3.7948636123712816, + "language_loss": 0.80481577, + "learning_rate": 3.3061590571591946e-06, + "loss": 0.82393551, + "num_input_tokens_seen": 105805315, + "router_z_loss_clip": 3.13867188, + "router_z_loss_mlp": 0.28466797, + "step": 4911, + "time_per_iteration": 2.6546552181243896 + }, + { + "auxiliary_loss_clip": 0.01607194, + "auxiliary_loss_mlp": 0.00250682, + "balance_loss_clip": 1.30395615, + "balance_loss_mlp": 0.22185689, + "epoch": 0.2953254171050654, + "flos": 19646584784640.0, + "grad_norm": 175.54456370892424, + "language_loss": 0.9602294, + "learning_rate": 3.3058640972203904e-06, + "loss": 0.9788081, + "num_input_tokens_seen": 105825125, + "router_z_loss_clip": 3.03125, + "router_z_loss_mlp": 0.28857422, + "step": 4912, + "time_per_iteration": 2.6215813159942627 + }, + { + "auxiliary_loss_clip": 0.015909, + "auxiliary_loss_mlp": 0.00309109, + "balance_loss_clip": 1.28518355, + "balance_loss_mlp": 0.27661273, + "epoch": 0.29538554035773334, + "flos": 22747973811840.0, + "grad_norm": 1.5758892268386497, + "language_loss": 0.88258433, + "learning_rate": 3.3055690877630894e-06, + "loss": 0.90158445, + "num_input_tokens_seen": 105846085, + "router_z_loss_clip": 3.05664062, + "router_z_loss_mlp": 0.32495117, + "step": 4913, + "time_per_iteration": 4.183511257171631 + }, + { + "auxiliary_loss_clip": 0.01593531, + "auxiliary_loss_mlp": 0.00300485, + "balance_loss_clip": 1.29208779, + "balance_loss_mlp": 0.27214882, + "epoch": 0.2954456636104013, + "flos": 21871861182720.0, + "grad_norm": 1.852993788676443, + "language_loss": 0.83459616, + "learning_rate": 3.3052740287984765e-06, + "loss": 0.85353631, + "num_input_tokens_seen": 105865400, + "router_z_loss_clip": 3.01171875, + "router_z_loss_mlp": 0.28344727, + "step": 4914, + "time_per_iteration": 2.6378183364868164 + }, + { + "auxiliary_loss_clip": 0.01589841, + "auxiliary_loss_mlp": 0.00290538, + "balance_loss_clip": 1.28439045, + "balance_loss_mlp": 0.25928137, + "epoch": 0.2955057868630693, + "flos": 40442560871040.0, + "grad_norm": 38.62096331755229, + "language_loss": 0.87942666, + "learning_rate": 3.3049789203377424e-06, + "loss": 0.89823043, + "num_input_tokens_seen": 105887920, + "router_z_loss_clip": 3.05273438, + "router_z_loss_mlp": 0.31225586, + "step": 4915, + "time_per_iteration": 2.7893927097320557 + }, + { + "auxiliary_loss_clip": 0.01602097, + "auxiliary_loss_mlp": 0.00300702, + "balance_loss_clip": 1.29442477, + "balance_loss_mlp": 0.27191314, + "epoch": 0.29556591011573724, + "flos": 22564506119040.0, + "grad_norm": 150.3059342064791, + "language_loss": 0.90921313, + "learning_rate": 3.3046837623920772e-06, + "loss": 0.92824113, + "num_input_tokens_seen": 105904035, + "router_z_loss_clip": 3.078125, + "router_z_loss_mlp": 0.2878418, + "step": 4916, + "time_per_iteration": 4.11733603477478 + }, + { + "auxiliary_loss_clip": 0.01594235, + "auxiliary_loss_mlp": 0.00266542, + "balance_loss_clip": 1.28754973, + "balance_loss_mlp": 0.23751502, + "epoch": 0.2956260333684052, + "flos": 22089300163200.0, + "grad_norm": 183.88559603613297, + "language_loss": 0.76850086, + "learning_rate": 3.3043885549726723e-06, + "loss": 0.78710866, + "num_input_tokens_seen": 105922685, + "router_z_loss_clip": 3.06445312, + "router_z_loss_mlp": 0.29040527, + "step": 4917, + "time_per_iteration": 2.6759583950042725 + }, + { + "auxiliary_loss_clip": 0.0160035, + "auxiliary_loss_mlp": 0.00293559, + "balance_loss_clip": 1.28847826, + "balance_loss_mlp": 0.26397097, + "epoch": 0.2956861566210732, + "flos": 16435488643200.0, + "grad_norm": 8.533540732928461, + "language_loss": 0.96433914, + "learning_rate": 3.3040932980907226e-06, + "loss": 0.98327827, + "num_input_tokens_seen": 105940425, + "router_z_loss_clip": 3.11914062, + "router_z_loss_mlp": 0.29589844, + "step": 4918, + "time_per_iteration": 2.6252222061157227 + }, + { + "auxiliary_loss_clip": 0.01613226, + "auxiliary_loss_mlp": 0.00312972, + "balance_loss_clip": 1.29670823, + "balance_loss_mlp": 0.27859169, + "epoch": 0.2957462798737412, + "flos": 25812087500160.0, + "grad_norm": 23.457157713718594, + "language_loss": 0.79637992, + "learning_rate": 3.303797991757425e-06, + "loss": 0.81564188, + "num_input_tokens_seen": 105960550, + "router_z_loss_clip": 3.16992188, + "router_z_loss_mlp": 0.34399414, + "step": 4919, + "time_per_iteration": 4.086320638656616 + }, + { + "auxiliary_loss_clip": 0.01604819, + "auxiliary_loss_mlp": 0.00272211, + "balance_loss_clip": 1.29391265, + "balance_loss_mlp": 0.24133614, + "epoch": 0.29580640312640916, + "flos": 16690849407360.0, + "grad_norm": 3.6834424393955145, + "language_loss": 0.82658887, + "learning_rate": 3.3035026359839763e-06, + "loss": 0.84535921, + "num_input_tokens_seen": 105978820, + "router_z_loss_clip": 3.10546875, + "router_z_loss_mlp": 0.30883789, + "step": 4920, + "time_per_iteration": 2.6230740547180176 + }, + { + "auxiliary_loss_clip": 0.01635977, + "auxiliary_loss_mlp": 0.00303199, + "balance_loss_clip": 1.31319344, + "balance_loss_mlp": 0.27415985, + "epoch": 0.2958665263790771, + "flos": 23945594100480.0, + "grad_norm": 9.423401882495394, + "language_loss": 0.7804265, + "learning_rate": 3.3032072307815774e-06, + "loss": 0.79981828, + "num_input_tokens_seen": 105997545, + "router_z_loss_clip": 3.22460938, + "router_z_loss_mlp": 0.2902832, + "step": 4921, + "time_per_iteration": 2.6944007873535156 + }, + { + "auxiliary_loss_clip": 0.01650793, + "auxiliary_loss_mlp": 0.00306487, + "balance_loss_clip": 1.32203031, + "balance_loss_mlp": 0.2718451, + "epoch": 0.2959266496317451, + "flos": 18478410670080.0, + "grad_norm": 24.727532860372353, + "language_loss": 0.84502423, + "learning_rate": 3.3029117761614298e-06, + "loss": 0.86459708, + "num_input_tokens_seen": 106015320, + "router_z_loss_clip": 3.28515625, + "router_z_loss_mlp": 0.34643555, + "step": 4922, + "time_per_iteration": 2.665149211883545 + }, + { + "auxiliary_loss_clip": 0.01649781, + "auxiliary_loss_mlp": 0.00307338, + "balance_loss_clip": 1.32230449, + "balance_loss_mlp": 0.27565229, + "epoch": 0.29598677288441305, + "flos": 25957489754880.0, + "grad_norm": 13.845178702256723, + "language_loss": 0.83494866, + "learning_rate": 3.302616272134737e-06, + "loss": 0.85451984, + "num_input_tokens_seen": 106034555, + "router_z_loss_clip": 3.27148438, + "router_z_loss_mlp": 0.31665039, + "step": 4923, + "time_per_iteration": 2.694350481033325 + }, + { + "auxiliary_loss_clip": 0.01658387, + "auxiliary_loss_mlp": 0.00285673, + "balance_loss_clip": 1.33211541, + "balance_loss_mlp": 0.25591865, + "epoch": 0.296046896137081, + "flos": 25155999630720.0, + "grad_norm": 8.089433610443926, + "language_loss": 0.92656386, + "learning_rate": 3.3023207187127042e-06, + "loss": 0.94600451, + "num_input_tokens_seen": 106054200, + "router_z_loss_clip": 3.26171875, + "router_z_loss_mlp": 0.29760742, + "step": 4924, + "time_per_iteration": 2.7178127765655518 + }, + { + "auxiliary_loss_clip": 0.01652166, + "auxiliary_loss_mlp": 0.00259242, + "balance_loss_clip": 1.32675529, + "balance_loss_mlp": 0.23122747, + "epoch": 0.296107019389749, + "flos": 21761148487680.0, + "grad_norm": 9.092421568828517, + "language_loss": 0.86735064, + "learning_rate": 3.3020251159065396e-06, + "loss": 0.88646472, + "num_input_tokens_seen": 106074700, + "router_z_loss_clip": 3.25585938, + "router_z_loss_mlp": 0.27978516, + "step": 4925, + "time_per_iteration": 2.721133232116699 + }, + { + "auxiliary_loss_clip": 0.01688705, + "auxiliary_loss_mlp": 0.00268822, + "balance_loss_clip": 1.3541553, + "balance_loss_mlp": 0.2379947, + "epoch": 0.29616714264241695, + "flos": 17960039544960.0, + "grad_norm": 3.1811037333663292, + "language_loss": 0.95747721, + "learning_rate": 3.301729463727452e-06, + "loss": 0.97705245, + "num_input_tokens_seen": 106091415, + "router_z_loss_clip": 3.34570312, + "router_z_loss_mlp": 0.30834961, + "step": 4926, + "time_per_iteration": 2.881401300430298 + }, + { + "auxiliary_loss_clip": 0.01692702, + "auxiliary_loss_mlp": 0.00289708, + "balance_loss_clip": 1.35046053, + "balance_loss_mlp": 0.25928593, + "epoch": 0.2962272658950849, + "flos": 15012779777280.0, + "grad_norm": 6.1646601781246035, + "language_loss": 0.93989438, + "learning_rate": 3.3014337621866527e-06, + "loss": 0.95971847, + "num_input_tokens_seen": 106109135, + "router_z_loss_clip": 3.421875, + "router_z_loss_mlp": 0.30395508, + "step": 4927, + "time_per_iteration": 4.056729555130005 + }, + { + "auxiliary_loss_clip": 0.01682744, + "auxiliary_loss_mlp": 0.0025731, + "balance_loss_clip": 1.3491497, + "balance_loss_mlp": 0.22927216, + "epoch": 0.2962873891477529, + "flos": 14720861946240.0, + "grad_norm": 1.6235061446641497, + "language_loss": 0.85556185, + "learning_rate": 3.3011380112953553e-06, + "loss": 0.87496239, + "num_input_tokens_seen": 106125750, + "router_z_loss_clip": 3.3359375, + "router_z_loss_mlp": 0.28039551, + "step": 4928, + "time_per_iteration": 2.638834238052368 + }, + { + "auxiliary_loss_clip": 0.01711647, + "auxiliary_loss_mlp": 0.00335023, + "balance_loss_clip": 1.36009383, + "balance_loss_mlp": 0.29947478, + "epoch": 0.29634751240042084, + "flos": 26723787528960.0, + "grad_norm": 103.2197233480817, + "language_loss": 0.81449342, + "learning_rate": 3.300842211064773e-06, + "loss": 0.8349601, + "num_input_tokens_seen": 106142835, + "router_z_loss_clip": 3.51171875, + "router_z_loss_mlp": 0.35546875, + "step": 4929, + "time_per_iteration": 2.713148355484009 + }, + { + "auxiliary_loss_clip": 0.01730431, + "auxiliary_loss_mlp": 0.00310093, + "balance_loss_clip": 1.37953722, + "balance_loss_mlp": 0.28062445, + "epoch": 0.2964076356530888, + "flos": 14571293713920.0, + "grad_norm": 26.086242054240792, + "language_loss": 0.81994665, + "learning_rate": 3.3005463615061246e-06, + "loss": 0.84035188, + "num_input_tokens_seen": 106160680, + "router_z_loss_clip": 3.5078125, + "router_z_loss_mlp": 0.29467773, + "step": 4930, + "time_per_iteration": 2.636342763900757 + }, + { + "auxiliary_loss_clip": 0.01756445, + "auxiliary_loss_mlp": 0.00084774, + "balance_loss_clip": 1.35945892, + "balance_loss_mlp": 0.07371178, + "epoch": 0.29646775890575683, + "flos": 63104315063040.0, + "grad_norm": 0.8395289189806204, + "language_loss": 0.60482299, + "learning_rate": 3.3002504626306275e-06, + "loss": 0.62323523, + "num_input_tokens_seen": 106224415, + "router_z_loss_clip": 3.96875, + "router_z_loss_mlp": 0.11083984, + "step": 4931, + "time_per_iteration": 3.077744722366333 + }, + { + "auxiliary_loss_clip": 0.0169344, + "auxiliary_loss_mlp": 0.00050582, + "balance_loss_clip": 1.31790805, + "balance_loss_mlp": 0.042333, + "epoch": 0.2965278821584248, + "flos": 63067686168960.0, + "grad_norm": 0.7490603683206308, + "language_loss": 0.52329993, + "learning_rate": 3.2999545144495023e-06, + "loss": 0.54074013, + "num_input_tokens_seen": 106279140, + "router_z_loss_clip": 3.75, + "router_z_loss_mlp": 0.08251953, + "step": 4932, + "time_per_iteration": 3.013166666030884 + }, + { + "auxiliary_loss_clip": 0.01717327, + "auxiliary_loss_mlp": 0.00306046, + "balance_loss_clip": 1.36932659, + "balance_loss_mlp": 0.2762194, + "epoch": 0.29658800541109276, + "flos": 23768734510080.0, + "grad_norm": 5.006329736980159, + "language_loss": 0.88136572, + "learning_rate": 3.299658516973972e-06, + "loss": 0.90159947, + "num_input_tokens_seen": 106298190, + "router_z_loss_clip": 3.47851562, + "router_z_loss_mlp": 0.29846191, + "step": 4933, + "time_per_iteration": 2.686816692352295 + }, + { + "auxiliary_loss_clip": 0.01704717, + "auxiliary_loss_mlp": 0.00329146, + "balance_loss_clip": 1.36000872, + "balance_loss_mlp": 0.2993196, + "epoch": 0.2966481286637607, + "flos": 23988543788160.0, + "grad_norm": 12.471326253350428, + "language_loss": 0.81125712, + "learning_rate": 3.299362470215261e-06, + "loss": 0.83159572, + "num_input_tokens_seen": 106319065, + "router_z_loss_clip": 3.45117188, + "router_z_loss_mlp": 0.2980957, + "step": 4934, + "time_per_iteration": 2.852874755859375 + }, + { + "auxiliary_loss_clip": 0.01725719, + "auxiliary_loss_mlp": 0.00360179, + "balance_loss_clip": 1.37859702, + "balance_loss_mlp": 0.3284691, + "epoch": 0.2967082519164287, + "flos": 17165157523200.0, + "grad_norm": 13.5330203989292, + "language_loss": 0.70213258, + "learning_rate": 3.299066374184594e-06, + "loss": 0.72299159, + "num_input_tokens_seen": 106338040, + "router_z_loss_clip": 3.46875, + "router_z_loss_mlp": 0.31726074, + "step": 4935, + "time_per_iteration": 2.677365303039551 + }, + { + "auxiliary_loss_clip": 0.01738119, + "auxiliary_loss_mlp": 0.00343072, + "balance_loss_clip": 1.38659739, + "balance_loss_mlp": 0.31059954, + "epoch": 0.29676837516909665, + "flos": 29387712816000.0, + "grad_norm": 1.9738736223028253, + "language_loss": 0.85467857, + "learning_rate": 3.2987702288932e-06, + "loss": 0.87549049, + "num_input_tokens_seen": 106358900, + "router_z_loss_clip": 3.51367188, + "router_z_loss_mlp": 0.32470703, + "step": 4936, + "time_per_iteration": 2.6857948303222656 + }, + { + "auxiliary_loss_clip": 0.01720873, + "auxiliary_loss_mlp": 0.00359349, + "balance_loss_clip": 1.37080598, + "balance_loss_mlp": 0.32627994, + "epoch": 0.2968284984217646, + "flos": 34751222616960.0, + "grad_norm": 4.919400437168214, + "language_loss": 0.80583334, + "learning_rate": 3.298474034352309e-06, + "loss": 0.8266356, + "num_input_tokens_seen": 106381805, + "router_z_loss_clip": 3.5, + "router_z_loss_mlp": 0.33056641, + "step": 4937, + "time_per_iteration": 2.7490267753601074 + }, + { + "auxiliary_loss_clip": 0.01711689, + "auxiliary_loss_mlp": 0.0035362, + "balance_loss_clip": 1.36515403, + "balance_loss_mlp": 0.32143331, + "epoch": 0.2968886216744326, + "flos": 21544104556800.0, + "grad_norm": 83.78442434392386, + "language_loss": 0.82916129, + "learning_rate": 3.2981777905731526e-06, + "loss": 0.84981441, + "num_input_tokens_seen": 106402365, + "router_z_loss_clip": 3.4609375, + "router_z_loss_mlp": 0.32177734, + "step": 4938, + "time_per_iteration": 2.6939804553985596 + }, + { + "auxiliary_loss_clip": 0.01728143, + "auxiliary_loss_mlp": 0.00373097, + "balance_loss_clip": 1.37632906, + "balance_loss_mlp": 0.34050527, + "epoch": 0.29694874492710055, + "flos": 12787323811200.0, + "grad_norm": 62.41011748030156, + "language_loss": 0.85228348, + "learning_rate": 3.297881497566964e-06, + "loss": 0.87329578, + "num_input_tokens_seen": 106419800, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 0.32568359, + "step": 4939, + "time_per_iteration": 2.642885446548462 + }, + { + "auxiliary_loss_clip": 0.01711265, + "auxiliary_loss_mlp": 0.00379214, + "balance_loss_clip": 1.36799002, + "balance_loss_mlp": 0.34593093, + "epoch": 0.2970088681797685, + "flos": 24569973239040.0, + "grad_norm": 25.005099773318666, + "language_loss": 0.84399682, + "learning_rate": 3.297585155344979e-06, + "loss": 0.86490154, + "num_input_tokens_seen": 106440300, + "router_z_loss_clip": 3.43164062, + "router_z_loss_mlp": 0.33300781, + "step": 4940, + "time_per_iteration": 2.6727726459503174 + }, + { + "auxiliary_loss_clip": 0.01701915, + "auxiliary_loss_mlp": 0.00349592, + "balance_loss_clip": 1.35936463, + "balance_loss_mlp": 0.31568897, + "epoch": 0.2970689914324365, + "flos": 23659171050240.0, + "grad_norm": 2.662894199931893, + "language_loss": 0.82166958, + "learning_rate": 3.297288763918435e-06, + "loss": 0.84218466, + "num_input_tokens_seen": 106460035, + "router_z_loss_clip": 3.42578125, + "router_z_loss_mlp": 0.33911133, + "step": 4941, + "time_per_iteration": 2.6557960510253906 + }, + { + "auxiliary_loss_clip": 0.01754728, + "auxiliary_loss_mlp": 0.00355255, + "balance_loss_clip": 1.3927412, + "balance_loss_mlp": 0.32254425, + "epoch": 0.29712911468510445, + "flos": 39670301439360.0, + "grad_norm": 7.414456211423165, + "language_loss": 0.8236354, + "learning_rate": 3.2969923232985712e-06, + "loss": 0.84473521, + "num_input_tokens_seen": 106481095, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 0.32714844, + "step": 4942, + "time_per_iteration": 2.784954786300659 + }, + { + "auxiliary_loss_clip": 0.01716824, + "auxiliary_loss_mlp": 0.00417176, + "balance_loss_clip": 1.36406326, + "balance_loss_mlp": 0.37883782, + "epoch": 0.2971892379377724, + "flos": 26395312631040.0, + "grad_norm": 2.5255736836105753, + "language_loss": 0.77059448, + "learning_rate": 3.2966958334966287e-06, + "loss": 0.79193449, + "num_input_tokens_seen": 106501590, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 0.38354492, + "step": 4943, + "time_per_iteration": 2.6953606605529785 + }, + { + "auxiliary_loss_clip": 0.01728877, + "auxiliary_loss_mlp": 0.0035889, + "balance_loss_clip": 1.37924385, + "balance_loss_mlp": 0.32734677, + "epoch": 0.2972493611904404, + "flos": 17603195880960.0, + "grad_norm": 24.355685272639125, + "language_loss": 0.8683399, + "learning_rate": 3.2963992945238497e-06, + "loss": 0.8892175, + "num_input_tokens_seen": 106519430, + "router_z_loss_clip": 3.49609375, + "router_z_loss_mlp": 0.31542969, + "step": 4944, + "time_per_iteration": 2.621790647506714 + }, + { + "auxiliary_loss_clip": 0.01685646, + "auxiliary_loss_mlp": 0.00328918, + "balance_loss_clip": 1.34927201, + "balance_loss_mlp": 0.29902041, + "epoch": 0.2973094844431084, + "flos": 20412774817920.0, + "grad_norm": 14.222754648308573, + "language_loss": 0.90426636, + "learning_rate": 3.2961027063914795e-06, + "loss": 0.92441201, + "num_input_tokens_seen": 106535870, + "router_z_loss_clip": 3.36328125, + "router_z_loss_mlp": 0.29907227, + "step": 4945, + "time_per_iteration": 2.6186470985412598 + }, + { + "auxiliary_loss_clip": 0.01695984, + "auxiliary_loss_mlp": 0.00336795, + "balance_loss_clip": 1.36048913, + "balance_loss_mlp": 0.30698121, + "epoch": 0.29736960769577636, + "flos": 17493488766720.0, + "grad_norm": 7.789959551905341, + "language_loss": 0.7390101, + "learning_rate": 3.2958060691107654e-06, + "loss": 0.7593379, + "num_input_tokens_seen": 106553560, + "router_z_loss_clip": 3.35546875, + "router_z_loss_mlp": 0.29797363, + "step": 4946, + "time_per_iteration": 2.622807502746582 + }, + { + "auxiliary_loss_clip": 0.01683624, + "auxiliary_loss_mlp": 0.00324586, + "balance_loss_clip": 1.34923196, + "balance_loss_mlp": 0.29202995, + "epoch": 0.2974297309484443, + "flos": 26103969417600.0, + "grad_norm": 112.10609749797582, + "language_loss": 0.80707502, + "learning_rate": 3.2955093826929547e-06, + "loss": 0.82715702, + "num_input_tokens_seen": 106574115, + "router_z_loss_clip": 3.34179688, + "router_z_loss_mlp": 0.3260498, + "step": 4947, + "time_per_iteration": 2.786508321762085 + }, + { + "auxiliary_loss_clip": 0.01694662, + "auxiliary_loss_mlp": 0.00366281, + "balance_loss_clip": 1.35385048, + "balance_loss_mlp": 0.32927811, + "epoch": 0.2974898542011123, + "flos": 25666433850240.0, + "grad_norm": 2.333096832391043, + "language_loss": 0.80562615, + "learning_rate": 3.2952126471492985e-06, + "loss": 0.82623553, + "num_input_tokens_seen": 106593070, + "router_z_loss_clip": 3.41015625, + "router_z_loss_mlp": 0.36987305, + "step": 4948, + "time_per_iteration": 2.740069627761841 + }, + { + "auxiliary_loss_clip": 0.0169712, + "auxiliary_loss_mlp": 0.00339116, + "balance_loss_clip": 1.36234105, + "balance_loss_mlp": 0.3093375, + "epoch": 0.29754997745378026, + "flos": 18661339658880.0, + "grad_norm": 12.560784986578224, + "language_loss": 0.89112902, + "learning_rate": 3.2949158624910497e-06, + "loss": 0.91149139, + "num_input_tokens_seen": 106610695, + "router_z_loss_clip": 3.34765625, + "router_z_loss_mlp": 0.29772949, + "step": 4949, + "time_per_iteration": 2.6404497623443604 + }, + { + "auxiliary_loss_clip": 0.01684274, + "auxiliary_loss_mlp": 0.00354539, + "balance_loss_clip": 1.34668064, + "balance_loss_mlp": 0.32211423, + "epoch": 0.2976101007064482, + "flos": 22274599449600.0, + "grad_norm": 11.434321101350271, + "language_loss": 0.77820522, + "learning_rate": 3.2946190287294603e-06, + "loss": 0.79859328, + "num_input_tokens_seen": 106631300, + "router_z_loss_clip": 3.37304688, + "router_z_loss_mlp": 0.32470703, + "step": 4950, + "time_per_iteration": 2.737156867980957 + }, + { + "auxiliary_loss_clip": 0.01666781, + "auxiliary_loss_mlp": 0.00333309, + "balance_loss_clip": 1.34051967, + "balance_loss_mlp": 0.30467492, + "epoch": 0.2976702239591162, + "flos": 21945657674880.0, + "grad_norm": 60.689704056939306, + "language_loss": 0.88564229, + "learning_rate": 3.294322145875789e-06, + "loss": 0.90564322, + "num_input_tokens_seen": 106650065, + "router_z_loss_clip": 3.26171875, + "router_z_loss_mlp": 0.28662109, + "step": 4951, + "time_per_iteration": 2.6988019943237305 + }, + { + "auxiliary_loss_clip": 0.01686661, + "auxiliary_loss_mlp": 0.00339625, + "balance_loss_clip": 1.34840012, + "balance_loss_mlp": 0.30624673, + "epoch": 0.29773034721178415, + "flos": 24637197542400.0, + "grad_norm": 4.13527628266465, + "language_loss": 0.81783247, + "learning_rate": 3.2940252139412912e-06, + "loss": 0.83809537, + "num_input_tokens_seen": 106668230, + "router_z_loss_clip": 3.37890625, + "router_z_loss_mlp": 0.33349609, + "step": 4952, + "time_per_iteration": 2.6742208003997803 + }, + { + "auxiliary_loss_clip": 0.01699122, + "auxiliary_loss_mlp": 0.00354355, + "balance_loss_clip": 1.36175394, + "balance_loss_mlp": 0.32059479, + "epoch": 0.2977904704644521, + "flos": 20557566541440.0, + "grad_norm": 2.0767210987190334, + "language_loss": 0.90701425, + "learning_rate": 3.293728232937228e-06, + "loss": 0.927549, + "num_input_tokens_seen": 106687785, + "router_z_loss_clip": 3.37304688, + "router_z_loss_mlp": 0.33764648, + "step": 4953, + "time_per_iteration": 2.6548681259155273 + }, + { + "auxiliary_loss_clip": 0.01704368, + "auxiliary_loss_mlp": 0.00342496, + "balance_loss_clip": 1.36177695, + "balance_loss_mlp": 0.31115553, + "epoch": 0.2978505937171201, + "flos": 18916449027840.0, + "grad_norm": 2.1466075183501694, + "language_loss": 0.83146346, + "learning_rate": 3.2934312028748597e-06, + "loss": 0.85193217, + "num_input_tokens_seen": 106706875, + "router_z_loss_clip": 3.421875, + "router_z_loss_mlp": 0.31335449, + "step": 4954, + "time_per_iteration": 2.636214017868042 + }, + { + "auxiliary_loss_clip": 0.01691459, + "auxiliary_loss_mlp": 0.00356159, + "balance_loss_clip": 1.35653162, + "balance_loss_mlp": 0.32406762, + "epoch": 0.29791071696978805, + "flos": 19317750750720.0, + "grad_norm": 5.496937605806224, + "language_loss": 0.81063008, + "learning_rate": 3.293134123765452e-06, + "loss": 0.83110631, + "num_input_tokens_seen": 106725105, + "router_z_loss_clip": 3.34960938, + "router_z_loss_mlp": 0.32043457, + "step": 4955, + "time_per_iteration": 4.060130596160889 + }, + { + "auxiliary_loss_clip": 0.01694551, + "auxiliary_loss_mlp": 0.00364888, + "balance_loss_clip": 1.36031723, + "balance_loss_mlp": 0.33170056, + "epoch": 0.297970840222456, + "flos": 18806813740800.0, + "grad_norm": 6.964146340119644, + "language_loss": 0.78587067, + "learning_rate": 3.2928369956202684e-06, + "loss": 0.80646509, + "num_input_tokens_seen": 106744780, + "router_z_loss_clip": 3.34179688, + "router_z_loss_mlp": 0.33166504, + "step": 4956, + "time_per_iteration": 2.663576126098633 + }, + { + "auxiliary_loss_clip": 0.01720805, + "auxiliary_loss_mlp": 0.00397301, + "balance_loss_clip": 1.3787148, + "balance_loss_mlp": 0.36177617, + "epoch": 0.298030963475124, + "flos": 22852760762880.0, + "grad_norm": 32.935383236697355, + "language_loss": 0.84394133, + "learning_rate": 3.2925398184505754e-06, + "loss": 0.86512232, + "num_input_tokens_seen": 106764670, + "router_z_loss_clip": 3.41992188, + "router_z_loss_mlp": 0.35546875, + "step": 4957, + "time_per_iteration": 2.6786117553710938 + }, + { + "auxiliary_loss_clip": 0.01708245, + "auxiliary_loss_mlp": 0.00331537, + "balance_loss_clip": 1.37632608, + "balance_loss_mlp": 0.3008762, + "epoch": 0.298091086727792, + "flos": 21868485304320.0, + "grad_norm": 5.816596348544819, + "language_loss": 0.75882053, + "learning_rate": 3.2922425922676437e-06, + "loss": 0.77921832, + "num_input_tokens_seen": 106783695, + "router_z_loss_clip": 3.32226562, + "router_z_loss_mlp": 0.30639648, + "step": 4958, + "time_per_iteration": 4.065936088562012 + }, + { + "auxiliary_loss_clip": 0.01683528, + "auxiliary_loss_mlp": 0.00356702, + "balance_loss_clip": 1.35480547, + "balance_loss_mlp": 0.32546926, + "epoch": 0.29815120998045996, + "flos": 21175014355200.0, + "grad_norm": 2.363870600071229, + "language_loss": 0.83223379, + "learning_rate": 3.291945317082743e-06, + "loss": 0.85263604, + "num_input_tokens_seen": 106803150, + "router_z_loss_clip": 3.28710938, + "router_z_loss_mlp": 0.3125, + "step": 4959, + "time_per_iteration": 2.6780941486358643 + }, + { + "auxiliary_loss_clip": 0.01690392, + "auxiliary_loss_mlp": 0.00346485, + "balance_loss_clip": 1.3516134, + "balance_loss_mlp": 0.31743366, + "epoch": 0.29821133323312793, + "flos": 19896271200000.0, + "grad_norm": 6.086673121639434, + "language_loss": 0.85522449, + "learning_rate": 3.291647992907147e-06, + "loss": 0.8755933, + "num_input_tokens_seen": 106820705, + "router_z_loss_clip": 3.38867188, + "router_z_loss_mlp": 0.29040527, + "step": 4960, + "time_per_iteration": 2.700474262237549 + }, + { + "auxiliary_loss_clip": 0.0168649, + "auxiliary_loss_mlp": 0.00408454, + "balance_loss_clip": 1.3477627, + "balance_loss_mlp": 0.36994928, + "epoch": 0.2982714564857959, + "flos": 12750766744320.0, + "grad_norm": 23.379369036407812, + "language_loss": 0.82705277, + "learning_rate": 3.291350619752129e-06, + "loss": 0.84800225, + "num_input_tokens_seen": 106837335, + "router_z_loss_clip": 3.38867188, + "router_z_loss_mlp": 0.38500977, + "step": 4961, + "time_per_iteration": 4.192182302474976 + }, + { + "auxiliary_loss_clip": 0.01700282, + "auxiliary_loss_mlp": 0.00375028, + "balance_loss_clip": 1.36463237, + "balance_loss_mlp": 0.34133989, + "epoch": 0.29833157973846386, + "flos": 22271905929600.0, + "grad_norm": 265.8479635417841, + "language_loss": 0.68132955, + "learning_rate": 3.291053197628967e-06, + "loss": 0.70208269, + "num_input_tokens_seen": 106856250, + "router_z_loss_clip": 3.35742188, + "router_z_loss_mlp": 0.33666992, + "step": 4962, + "time_per_iteration": 2.657818555831909 + }, + { + "auxiliary_loss_clip": 0.01669434, + "auxiliary_loss_mlp": 0.00323708, + "balance_loss_clip": 1.34221232, + "balance_loss_mlp": 0.29190347, + "epoch": 0.2983917029911318, + "flos": 15372999319680.0, + "grad_norm": 62.37006561410585, + "language_loss": 0.88467997, + "learning_rate": 3.2907557265489375e-06, + "loss": 0.90461135, + "num_input_tokens_seen": 106873370, + "router_z_loss_clip": 3.2734375, + "router_z_loss_mlp": 0.31811523, + "step": 4963, + "time_per_iteration": 2.605969190597534 + }, + { + "auxiliary_loss_clip": 0.01712563, + "auxiliary_loss_mlp": 0.00346866, + "balance_loss_clip": 1.37072539, + "balance_loss_mlp": 0.31744492, + "epoch": 0.2984518262437998, + "flos": 15377632174080.0, + "grad_norm": 9.146345555641421, + "language_loss": 0.75778848, + "learning_rate": 3.290458206523322e-06, + "loss": 0.77838278, + "num_input_tokens_seen": 106890330, + "router_z_loss_clip": 3.41796875, + "router_z_loss_mlp": 0.29443359, + "step": 4964, + "time_per_iteration": 2.618959903717041 + }, + { + "auxiliary_loss_clip": 0.01650455, + "auxiliary_loss_mlp": 0.00318648, + "balance_loss_clip": 1.32756376, + "balance_loss_mlp": 0.28946561, + "epoch": 0.29851194949646775, + "flos": 18108458542080.0, + "grad_norm": 9.454379289133874, + "language_loss": 0.77372408, + "learning_rate": 3.2901606375634015e-06, + "loss": 0.79341513, + "num_input_tokens_seen": 106909190, + "router_z_loss_clip": 3.22851562, + "router_z_loss_mlp": 0.29199219, + "step": 4965, + "time_per_iteration": 2.617427349090576 + }, + { + "auxiliary_loss_clip": 0.0164847, + "auxiliary_loss_mlp": 0.00331337, + "balance_loss_clip": 1.32997966, + "balance_loss_mlp": 0.3005099, + "epoch": 0.2985720727491357, + "flos": 22018233104640.0, + "grad_norm": 7.415887358225868, + "language_loss": 0.74740005, + "learning_rate": 3.289863019680461e-06, + "loss": 0.76719815, + "num_input_tokens_seen": 106927825, + "router_z_loss_clip": 3.18359375, + "router_z_loss_mlp": 0.30822754, + "step": 4966, + "time_per_iteration": 2.6575918197631836 + }, + { + "auxiliary_loss_clip": 0.01676717, + "auxiliary_loss_mlp": 0.00385841, + "balance_loss_clip": 1.35216451, + "balance_loss_mlp": 0.35322571, + "epoch": 0.2986321960018037, + "flos": 13041355772160.0, + "grad_norm": 153.47733363732107, + "language_loss": 0.83440101, + "learning_rate": 3.289565352885785e-06, + "loss": 0.85502666, + "num_input_tokens_seen": 106943155, + "router_z_loss_clip": 3.24804688, + "router_z_loss_mlp": 0.32617188, + "step": 4967, + "time_per_iteration": 2.5925536155700684 + }, + { + "auxiliary_loss_clip": 0.01663894, + "auxiliary_loss_mlp": 0.00360224, + "balance_loss_clip": 1.33939183, + "balance_loss_mlp": 0.3293013, + "epoch": 0.29869231925447165, + "flos": 14465034305280.0, + "grad_norm": 48.10295530623364, + "language_loss": 0.78370941, + "learning_rate": 3.2892676371906614e-06, + "loss": 0.80395061, + "num_input_tokens_seen": 106960295, + "router_z_loss_clip": 3.2421875, + "router_z_loss_mlp": 0.30932617, + "step": 4968, + "time_per_iteration": 2.620279312133789 + }, + { + "auxiliary_loss_clip": 0.01669872, + "auxiliary_loss_mlp": 0.00343684, + "balance_loss_clip": 1.34404039, + "balance_loss_mlp": 0.30942309, + "epoch": 0.2987524425071396, + "flos": 31650228639360.0, + "grad_norm": 98.41468214947817, + "language_loss": 0.82691276, + "learning_rate": 3.2889698726063805e-06, + "loss": 0.84704834, + "num_input_tokens_seen": 106982870, + "router_z_loss_clip": 3.2578125, + "router_z_loss_mlp": 0.34277344, + "step": 4969, + "time_per_iteration": 4.149555683135986 + }, + { + "auxiliary_loss_clip": 0.01659601, + "auxiliary_loss_mlp": 0.0029979, + "balance_loss_clip": 1.33852386, + "balance_loss_mlp": 0.27231205, + "epoch": 0.2988125657598076, + "flos": 21433427775360.0, + "grad_norm": 37.36652711622734, + "language_loss": 0.77078378, + "learning_rate": 3.2886720591442327e-06, + "loss": 0.79037774, + "num_input_tokens_seen": 107002405, + "router_z_loss_clip": 3.21289062, + "router_z_loss_mlp": 0.27514648, + "step": 4970, + "time_per_iteration": 2.6591849327087402 + }, + { + "auxiliary_loss_clip": 0.01697458, + "auxiliary_loss_mlp": 0.00354223, + "balance_loss_clip": 1.36034024, + "balance_loss_mlp": 0.32067734, + "epoch": 0.2988726890124756, + "flos": 18076965292800.0, + "grad_norm": 12.371556101898067, + "language_loss": 0.91362405, + "learning_rate": 3.2883741968155103e-06, + "loss": 0.93414092, + "num_input_tokens_seen": 107017310, + "router_z_loss_clip": 3.37109375, + "router_z_loss_mlp": 0.33496094, + "step": 4971, + "time_per_iteration": 2.9182960987091064 + }, + { + "auxiliary_loss_clip": 0.01659505, + "auxiliary_loss_mlp": 0.00310547, + "balance_loss_clip": 1.34335136, + "balance_loss_mlp": 0.28031504, + "epoch": 0.29893281226514357, + "flos": 21755653706880.0, + "grad_norm": 74.36891516350391, + "language_loss": 0.85126305, + "learning_rate": 3.2880762856315107e-06, + "loss": 0.87096357, + "num_input_tokens_seen": 107034645, + "router_z_loss_clip": 3.16015625, + "router_z_loss_mlp": 0.30212402, + "step": 4972, + "time_per_iteration": 2.771169424057007 + }, + { + "auxiliary_loss_clip": 0.01674858, + "auxiliary_loss_mlp": 0.00350461, + "balance_loss_clip": 1.34933317, + "balance_loss_mlp": 0.31665373, + "epoch": 0.29899293551781153, + "flos": 16836718538880.0, + "grad_norm": 266.2677418236052, + "language_loss": 0.91842389, + "learning_rate": 3.2877783256035285e-06, + "loss": 0.93867707, + "num_input_tokens_seen": 107051125, + "router_z_loss_clip": 3.25976562, + "router_z_loss_mlp": 0.33813477, + "step": 4973, + "time_per_iteration": 2.663741111755371 + }, + { + "auxiliary_loss_clip": 0.01706555, + "auxiliary_loss_mlp": 0.00308146, + "balance_loss_clip": 1.37542152, + "balance_loss_mlp": 0.27650821, + "epoch": 0.2990530587704795, + "flos": 11729215946880.0, + "grad_norm": 9.372460173876998, + "language_loss": 0.82528132, + "learning_rate": 3.287480316742863e-06, + "loss": 0.84542835, + "num_input_tokens_seen": 107068815, + "router_z_loss_clip": 3.3125, + "router_z_loss_mlp": 0.31640625, + "step": 4974, + "time_per_iteration": 2.6302711963653564 + }, + { + "auxiliary_loss_clip": 0.01687826, + "auxiliary_loss_mlp": 0.00300713, + "balance_loss_clip": 1.36573255, + "balance_loss_mlp": 0.27167404, + "epoch": 0.29911318202314746, + "flos": 28039877850240.0, + "grad_norm": 19.07694509376567, + "language_loss": 0.78437209, + "learning_rate": 3.287182259060815e-06, + "loss": 0.80425745, + "num_input_tokens_seen": 107090420, + "router_z_loss_clip": 3.22460938, + "router_z_loss_mlp": 0.29052734, + "step": 4975, + "time_per_iteration": 2.7536585330963135 + }, + { + "auxiliary_loss_clip": 0.01659983, + "auxiliary_loss_mlp": 0.00321823, + "balance_loss_clip": 1.33849978, + "balance_loss_mlp": 0.28772882, + "epoch": 0.2991733052758154, + "flos": 18733555952640.0, + "grad_norm": 7.593827750029448, + "language_loss": 0.82728332, + "learning_rate": 3.286884152568687e-06, + "loss": 0.84710139, + "num_input_tokens_seen": 107107255, + "router_z_loss_clip": 3.21484375, + "router_z_loss_mlp": 0.34057617, + "step": 4976, + "time_per_iteration": 2.6658823490142822 + }, + { + "auxiliary_loss_clip": 0.01647791, + "auxiliary_loss_mlp": 0.0033883, + "balance_loss_clip": 1.3317616, + "balance_loss_mlp": 0.30716839, + "epoch": 0.2992334285284834, + "flos": 15559160532480.0, + "grad_norm": 65.10500317731223, + "language_loss": 0.94397801, + "learning_rate": 3.2865859972777827e-06, + "loss": 0.96384424, + "num_input_tokens_seen": 107123840, + "router_z_loss_clip": 3.16015625, + "router_z_loss_mlp": 0.31665039, + "step": 4977, + "time_per_iteration": 2.7095606327056885 + }, + { + "auxiliary_loss_clip": 0.01666869, + "auxiliary_loss_mlp": 0.00303712, + "balance_loss_clip": 1.34636617, + "balance_loss_mlp": 0.2714541, + "epoch": 0.29929355178115136, + "flos": 21797561900160.0, + "grad_norm": 6.605630584820512, + "language_loss": 0.75482094, + "learning_rate": 3.2862877931994088e-06, + "loss": 0.77452672, + "num_input_tokens_seen": 107143475, + "router_z_loss_clip": 3.203125, + "router_z_loss_mlp": 0.3223877, + "step": 4978, + "time_per_iteration": 2.7401673793792725 + }, + { + "auxiliary_loss_clip": 0.01676154, + "auxiliary_loss_mlp": 0.00320566, + "balance_loss_clip": 1.35202265, + "balance_loss_mlp": 0.2897979, + "epoch": 0.2993536750338193, + "flos": 21178533888000.0, + "grad_norm": 3.230585652342804, + "language_loss": 0.83918703, + "learning_rate": 3.2859895403448726e-06, + "loss": 0.85915422, + "num_input_tokens_seen": 107161725, + "router_z_loss_clip": 3.24414062, + "router_z_loss_mlp": 0.30773926, + "step": 4979, + "time_per_iteration": 2.6197946071624756 + }, + { + "auxiliary_loss_clip": 0.01660663, + "auxiliary_loss_mlp": 0.00339504, + "balance_loss_clip": 1.33979928, + "balance_loss_mlp": 0.30960613, + "epoch": 0.2994137982864873, + "flos": 32122130544000.0, + "grad_norm": 8.187358813677962, + "language_loss": 0.75501168, + "learning_rate": 3.285691238725484e-06, + "loss": 0.77501333, + "num_input_tokens_seen": 107183935, + "router_z_loss_clip": 3.20898438, + "router_z_loss_mlp": 0.29907227, + "step": 4980, + "time_per_iteration": 2.720956563949585 + }, + { + "auxiliary_loss_clip": 0.01632354, + "auxiliary_loss_mlp": 0.00269343, + "balance_loss_clip": 1.32113278, + "balance_loss_mlp": 0.24217474, + "epoch": 0.29947392153915525, + "flos": 21105419754240.0, + "grad_norm": 5.642963770437911, + "language_loss": 0.79236364, + "learning_rate": 3.285392888352555e-06, + "loss": 0.81138062, + "num_input_tokens_seen": 107204285, + "router_z_loss_clip": 3.11523438, + "router_z_loss_mlp": 0.27160645, + "step": 4981, + "time_per_iteration": 2.7009975910186768 + }, + { + "auxiliary_loss_clip": 0.01660525, + "auxiliary_loss_mlp": 0.00370159, + "balance_loss_clip": 1.33084178, + "balance_loss_mlp": 0.33544526, + "epoch": 0.2995340447918232, + "flos": 21542632099200.0, + "grad_norm": 2.103036377819465, + "language_loss": 0.90376019, + "learning_rate": 3.2850944892373987e-06, + "loss": 0.92406702, + "num_input_tokens_seen": 107225265, + "router_z_loss_clip": 3.30078125, + "router_z_loss_mlp": 0.34716797, + "step": 4982, + "time_per_iteration": 2.6899919509887695 + }, + { + "auxiliary_loss_clip": 0.01657403, + "auxiliary_loss_mlp": 0.00326814, + "balance_loss_clip": 1.34037805, + "balance_loss_mlp": 0.29317355, + "epoch": 0.2995941680444912, + "flos": 16725143917440.0, + "grad_norm": 41.121788176537265, + "language_loss": 0.93816894, + "learning_rate": 3.2847960413913307e-06, + "loss": 0.95801115, + "num_input_tokens_seen": 107241335, + "router_z_loss_clip": 3.16796875, + "router_z_loss_mlp": 0.33642578, + "step": 4983, + "time_per_iteration": 2.6623270511627197 + }, + { + "auxiliary_loss_clip": 0.01659323, + "auxiliary_loss_mlp": 0.00332192, + "balance_loss_clip": 1.33900368, + "balance_loss_mlp": 0.30162647, + "epoch": 0.2996542912971592, + "flos": 20923496346240.0, + "grad_norm": 12.95384173923672, + "language_loss": 0.84014714, + "learning_rate": 3.284497544825668e-06, + "loss": 0.8600623, + "num_input_tokens_seen": 107259375, + "router_z_loss_clip": 3.203125, + "router_z_loss_mlp": 0.30541992, + "step": 4984, + "time_per_iteration": 2.6239659786224365 + }, + { + "auxiliary_loss_clip": 0.01657784, + "auxiliary_loss_mlp": 0.00291065, + "balance_loss_clip": 1.33363974, + "balance_loss_mlp": 0.25880659, + "epoch": 0.29971441454982717, + "flos": 25079868754560.0, + "grad_norm": 1.773444121738521, + "language_loss": 0.84749371, + "learning_rate": 3.2841989995517303e-06, + "loss": 0.86698216, + "num_input_tokens_seen": 107279890, + "router_z_loss_clip": 3.24414062, + "router_z_loss_mlp": 0.32250977, + "step": 4985, + "time_per_iteration": 2.685504198074341 + }, + { + "auxiliary_loss_clip": 0.01676028, + "auxiliary_loss_mlp": 0.00341582, + "balance_loss_clip": 1.34401655, + "balance_loss_mlp": 0.30734521, + "epoch": 0.29977453780249513, + "flos": 52555911840000.0, + "grad_norm": 10.394232960241824, + "language_loss": 0.78879648, + "learning_rate": 3.283900405580837e-06, + "loss": 0.8089726, + "num_input_tokens_seen": 107303430, + "router_z_loss_clip": 3.31640625, + "router_z_loss_mlp": 0.34277344, + "step": 4986, + "time_per_iteration": 2.9662270545959473 + }, + { + "auxiliary_loss_clip": 0.01682903, + "auxiliary_loss_mlp": 0.00311108, + "balance_loss_clip": 1.35201192, + "balance_loss_mlp": 0.2781111, + "epoch": 0.2998346610551631, + "flos": 22237144542720.0, + "grad_norm": 2.4230602280886866, + "language_loss": 0.8256802, + "learning_rate": 3.283601762924312e-06, + "loss": 0.84562033, + "num_input_tokens_seen": 107323700, + "router_z_loss_clip": 3.30664062, + "router_z_loss_mlp": 0.32958984, + "step": 4987, + "time_per_iteration": 2.637967348098755 + }, + { + "auxiliary_loss_clip": 0.01655969, + "auxiliary_loss_mlp": 0.00309042, + "balance_loss_clip": 1.32787645, + "balance_loss_mlp": 0.2782979, + "epoch": 0.29989478430783106, + "flos": 16873203778560.0, + "grad_norm": 10.668973591842143, + "language_loss": 0.86789912, + "learning_rate": 3.2833030715934793e-06, + "loss": 0.88754928, + "num_input_tokens_seen": 107341965, + "router_z_loss_clip": 3.28320312, + "router_z_loss_mlp": 0.30749512, + "step": 4988, + "time_per_iteration": 2.599900722503662 + }, + { + "auxiliary_loss_clip": 0.0168316, + "auxiliary_loss_mlp": 0.00325076, + "balance_loss_clip": 1.35022473, + "balance_loss_mlp": 0.29257968, + "epoch": 0.29995490756049903, + "flos": 23768878164480.0, + "grad_norm": 6.731550311590487, + "language_loss": 0.76265168, + "learning_rate": 3.2830043315996658e-06, + "loss": 0.78273404, + "num_input_tokens_seen": 107362615, + "router_z_loss_clip": 3.33203125, + "router_z_loss_mlp": 0.32495117, + "step": 4989, + "time_per_iteration": 2.6515250205993652 + }, + { + "auxiliary_loss_clip": 0.0168226, + "auxiliary_loss_mlp": 0.0034487, + "balance_loss_clip": 1.34617054, + "balance_loss_mlp": 0.30991817, + "epoch": 0.300015030813167, + "flos": 14465321614080.0, + "grad_norm": 10.756527976583039, + "language_loss": 0.91291529, + "learning_rate": 3.282705542954199e-06, + "loss": 0.93318659, + "num_input_tokens_seen": 107378980, + "router_z_loss_clip": 3.359375, + "router_z_loss_mlp": 0.34960938, + "step": 4990, + "time_per_iteration": 2.6346352100372314 + }, + { + "auxiliary_loss_clip": 0.01707629, + "auxiliary_loss_mlp": 0.00340612, + "balance_loss_clip": 1.36520207, + "balance_loss_mlp": 0.3055402, + "epoch": 0.30007515406583496, + "flos": 25191982080000.0, + "grad_norm": 7.178048812923887, + "language_loss": 0.72436082, + "learning_rate": 3.28240670566841e-06, + "loss": 0.74484324, + "num_input_tokens_seen": 107397640, + "router_z_loss_clip": 3.42382812, + "router_z_loss_mlp": 0.35058594, + "step": 4991, + "time_per_iteration": 2.724764585494995 + }, + { + "auxiliary_loss_clip": 0.01691934, + "auxiliary_loss_mlp": 0.00326868, + "balance_loss_clip": 1.35565495, + "balance_loss_mlp": 0.29348969, + "epoch": 0.3001352773185029, + "flos": 19391188106880.0, + "grad_norm": 4.184707140041777, + "language_loss": 0.87251562, + "learning_rate": 3.28210781975363e-06, + "loss": 0.89270365, + "num_input_tokens_seen": 107416020, + "router_z_loss_clip": 3.36328125, + "router_z_loss_mlp": 0.33398438, + "step": 4992, + "time_per_iteration": 2.6661415100097656 + }, + { + "auxiliary_loss_clip": 0.01688964, + "auxiliary_loss_mlp": 0.00310974, + "balance_loss_clip": 1.35439634, + "balance_loss_mlp": 0.27945489, + "epoch": 0.3001954005711709, + "flos": 21543853161600.0, + "grad_norm": 148.43595987562165, + "language_loss": 0.89925432, + "learning_rate": 3.281808885221193e-06, + "loss": 0.91925371, + "num_input_tokens_seen": 107436340, + "router_z_loss_clip": 3.34570312, + "router_z_loss_mlp": 0.31494141, + "step": 4993, + "time_per_iteration": 2.6864328384399414 + }, + { + "auxiliary_loss_clip": 0.01719055, + "auxiliary_loss_mlp": 0.00360724, + "balance_loss_clip": 1.37434721, + "balance_loss_mlp": 0.3226012, + "epoch": 0.30025552382383885, + "flos": 17384320356480.0, + "grad_norm": 6.576550322099237, + "language_loss": 0.94676054, + "learning_rate": 3.2815099020824345e-06, + "loss": 0.96755832, + "num_input_tokens_seen": 107454585, + "router_z_loss_clip": 3.4453125, + "router_z_loss_mlp": 0.38085938, + "step": 4994, + "time_per_iteration": 2.6785879135131836 + }, + { + "auxiliary_loss_clip": 0.01711434, + "auxiliary_loss_mlp": 0.00328793, + "balance_loss_clip": 1.36500406, + "balance_loss_mlp": 0.29653525, + "epoch": 0.3003156470765068, + "flos": 29533330552320.0, + "grad_norm": 10.257352334571467, + "language_loss": 0.86412692, + "learning_rate": 3.2812108703486924e-06, + "loss": 0.88452923, + "num_input_tokens_seen": 107477180, + "router_z_loss_clip": 3.46679688, + "router_z_loss_mlp": 0.32275391, + "step": 4995, + "time_per_iteration": 2.7360002994537354 + }, + { + "auxiliary_loss_clip": 0.01710779, + "auxiliary_loss_mlp": 0.00296442, + "balance_loss_clip": 1.36548114, + "balance_loss_mlp": 0.26587716, + "epoch": 0.3003757703291748, + "flos": 43646402465280.0, + "grad_norm": 3.5796208087272485, + "language_loss": 0.72241712, + "learning_rate": 3.2809117900313055e-06, + "loss": 0.74248934, + "num_input_tokens_seen": 107500250, + "router_z_loss_clip": 3.453125, + "router_z_loss_mlp": 0.30566406, + "step": 4996, + "time_per_iteration": 2.8499486446380615 + }, + { + "auxiliary_loss_clip": 0.01697402, + "auxiliary_loss_mlp": 0.00329553, + "balance_loss_clip": 1.36001289, + "balance_loss_mlp": 0.29674622, + "epoch": 0.30043589358184275, + "flos": 22528380015360.0, + "grad_norm": 10.53163906149369, + "language_loss": 0.81874996, + "learning_rate": 3.280612661141615e-06, + "loss": 0.83901954, + "num_input_tokens_seen": 107520070, + "router_z_loss_clip": 3.37304688, + "router_z_loss_mlp": 0.328125, + "step": 4997, + "time_per_iteration": 4.112249374389648 + }, + { + "auxiliary_loss_clip": 0.01715089, + "auxiliary_loss_mlp": 0.00327925, + "balance_loss_clip": 1.36983418, + "balance_loss_mlp": 0.29442692, + "epoch": 0.30049601683451077, + "flos": 20995892208000.0, + "grad_norm": 3.9715840663230835, + "language_loss": 0.84117079, + "learning_rate": 3.2803134836909646e-06, + "loss": 0.861601, + "num_input_tokens_seen": 107539285, + "router_z_loss_clip": 3.44921875, + "router_z_loss_mlp": 0.33496094, + "step": 4998, + "time_per_iteration": 2.675410270690918 + }, + { + "auxiliary_loss_clip": 0.01721982, + "auxiliary_loss_mlp": 0.00321721, + "balance_loss_clip": 1.37504089, + "balance_loss_mlp": 0.29101297, + "epoch": 0.30055614008717874, + "flos": 23916004272000.0, + "grad_norm": 10.479033523189317, + "language_loss": 0.78989244, + "learning_rate": 3.2800142576906985e-06, + "loss": 0.81032956, + "num_input_tokens_seen": 107560260, + "router_z_loss_clip": 3.46875, + "router_z_loss_mlp": 0.30737305, + "step": 4999, + "time_per_iteration": 2.747399091720581 + }, + { + "auxiliary_loss_clip": 0.0172623, + "auxiliary_loss_mlp": 0.00325133, + "balance_loss_clip": 1.37625778, + "balance_loss_mlp": 0.28979951, + "epoch": 0.3006162633398467, + "flos": 19169798630400.0, + "grad_norm": 14.056966546728663, + "language_loss": 0.82011443, + "learning_rate": 3.2797149831521626e-06, + "loss": 0.84062803, + "num_input_tokens_seen": 107579260, + "router_z_loss_clip": 3.49804688, + "router_z_loss_mlp": 0.35327148, + "step": 5000, + "time_per_iteration": 4.122182607650757 + }, + { + "auxiliary_loss_clip": 0.01721291, + "auxiliary_loss_mlp": 0.00312279, + "balance_loss_clip": 1.37576985, + "balance_loss_mlp": 0.2811175, + "epoch": 0.30067638659251467, + "flos": 14679241061760.0, + "grad_norm": 5.8313290589078335, + "language_loss": 0.86561656, + "learning_rate": 3.2794156600867073e-06, + "loss": 0.88595223, + "num_input_tokens_seen": 107595245, + "router_z_loss_clip": 3.45703125, + "router_z_loss_mlp": 0.31152344, + "step": 5001, + "time_per_iteration": 2.67290997505188 + }, + { + "auxiliary_loss_clip": 0.01758843, + "auxiliary_loss_mlp": 0.00331558, + "balance_loss_clip": 1.41155815, + "balance_loss_mlp": 0.29837042, + "epoch": 0.30073650984518263, + "flos": 23368007404800.0, + "grad_norm": 47.77280351128787, + "language_loss": 0.87144363, + "learning_rate": 3.2791162885056815e-06, + "loss": 0.89234769, + "num_input_tokens_seen": 107613985, + "router_z_loss_clip": 3.47265625, + "router_z_loss_mlp": 0.33203125, + "step": 5002, + "time_per_iteration": 2.635960102081299 + }, + { + "auxiliary_loss_clip": 0.01754752, + "auxiliary_loss_mlp": 0.00310328, + "balance_loss_clip": 1.39504528, + "balance_loss_mlp": 0.27659163, + "epoch": 0.3007966330978506, + "flos": 22966633854720.0, + "grad_norm": 3.5274273634667255, + "language_loss": 0.7579124, + "learning_rate": 3.2788168684204376e-06, + "loss": 0.7785632, + "num_input_tokens_seen": 107631435, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 0.33740234, + "step": 5003, + "time_per_iteration": 4.087014675140381 + }, + { + "auxiliary_loss_clip": 0.01718029, + "auxiliary_loss_mlp": 0.00320253, + "balance_loss_clip": 1.36763215, + "balance_loss_mlp": 0.28787535, + "epoch": 0.30085675635051856, + "flos": 27818452460160.0, + "grad_norm": 5.072252406270049, + "language_loss": 0.7743358, + "learning_rate": 3.27851739984233e-06, + "loss": 0.79471862, + "num_input_tokens_seen": 107650530, + "router_z_loss_clip": 3.50585938, + "router_z_loss_mlp": 0.32348633, + "step": 5004, + "time_per_iteration": 2.7146992683410645 + }, + { + "auxiliary_loss_clip": 0.01747196, + "auxiliary_loss_mlp": 0.00343222, + "balance_loss_clip": 1.38922048, + "balance_loss_mlp": 0.30736393, + "epoch": 0.3009168796031865, + "flos": 10882729059840.0, + "grad_norm": 5.9531154662068335, + "language_loss": 0.88306785, + "learning_rate": 3.278217882782715e-06, + "loss": 0.90397203, + "num_input_tokens_seen": 107662240, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 0.35839844, + "step": 5005, + "time_per_iteration": 2.6417694091796875 + }, + { + "auxiliary_loss_clip": 0.01728636, + "auxiliary_loss_mlp": 0.00298763, + "balance_loss_clip": 1.38282931, + "balance_loss_mlp": 0.26770869, + "epoch": 0.3009770028558545, + "flos": 23805399317760.0, + "grad_norm": 11.067783974671691, + "language_loss": 0.82049155, + "learning_rate": 3.2779183172529497e-06, + "loss": 0.8407656, + "num_input_tokens_seen": 107680330, + "router_z_loss_clip": 3.45703125, + "router_z_loss_mlp": 0.31066895, + "step": 5006, + "time_per_iteration": 2.6451570987701416 + }, + { + "auxiliary_loss_clip": 0.01695053, + "auxiliary_loss_mlp": 0.00328943, + "balance_loss_clip": 1.35923886, + "balance_loss_mlp": 0.29630321, + "epoch": 0.30103712610852246, + "flos": 26468211283200.0, + "grad_norm": 2.774300457650169, + "language_loss": 0.78321826, + "learning_rate": 3.2776187032643932e-06, + "loss": 0.80345815, + "num_input_tokens_seen": 107700020, + "router_z_loss_clip": 3.35742188, + "router_z_loss_mlp": 0.32641602, + "step": 5007, + "time_per_iteration": 2.68097186088562 + }, + { + "auxiliary_loss_clip": 0.01727418, + "auxiliary_loss_mlp": 0.00331311, + "balance_loss_clip": 1.37650716, + "balance_loss_mlp": 0.29411763, + "epoch": 0.3010972493611904, + "flos": 22856459863680.0, + "grad_norm": 7.783775968336854, + "language_loss": 0.83831722, + "learning_rate": 3.2773190408284075e-06, + "loss": 0.85890454, + "num_input_tokens_seen": 107718575, + "router_z_loss_clip": 3.50585938, + "router_z_loss_mlp": 0.37207031, + "step": 5008, + "time_per_iteration": 2.644542932510376 + }, + { + "auxiliary_loss_clip": 0.0174071, + "auxiliary_loss_mlp": 0.00377858, + "balance_loss_clip": 1.38738561, + "balance_loss_mlp": 0.3403067, + "epoch": 0.3011573726138584, + "flos": 24053685102720.0, + "grad_norm": 8.247217436154942, + "language_loss": 0.90940523, + "learning_rate": 3.2770193299563564e-06, + "loss": 0.93059087, + "num_input_tokens_seen": 107738635, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 0.37573242, + "step": 5009, + "time_per_iteration": 2.6476383209228516 + }, + { + "auxiliary_loss_clip": 0.01728671, + "auxiliary_loss_mlp": 0.00376503, + "balance_loss_clip": 1.3757751, + "balance_loss_mlp": 0.34064525, + "epoch": 0.30121749586652635, + "flos": 20259687052800.0, + "grad_norm": 6.048461281922027, + "language_loss": 0.91159689, + "learning_rate": 3.276719570659604e-06, + "loss": 0.93264866, + "num_input_tokens_seen": 107753415, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 0.35864258, + "step": 5010, + "time_per_iteration": 2.6815600395202637 + }, + { + "auxiliary_loss_clip": 0.01715341, + "auxiliary_loss_mlp": 0.00345556, + "balance_loss_clip": 1.36675942, + "balance_loss_mlp": 0.31208175, + "epoch": 0.3012776191191944, + "flos": 26943058103040.0, + "grad_norm": 26.32035322248561, + "language_loss": 0.92299318, + "learning_rate": 3.2764197629495176e-06, + "loss": 0.94360209, + "num_input_tokens_seen": 107773840, + "router_z_loss_clip": 3.48828125, + "router_z_loss_mlp": 0.33422852, + "step": 5011, + "time_per_iteration": 4.133784294128418 + }, + { + "auxiliary_loss_clip": 0.01720126, + "auxiliary_loss_mlp": 0.00320168, + "balance_loss_clip": 1.37171352, + "balance_loss_mlp": 0.28771937, + "epoch": 0.30133774237186234, + "flos": 20412307941120.0, + "grad_norm": 134.61118847974026, + "language_loss": 0.79049039, + "learning_rate": 3.2761199068374656e-06, + "loss": 0.8108933, + "num_input_tokens_seen": 107792020, + "router_z_loss_clip": 3.48632812, + "router_z_loss_mlp": 0.32446289, + "step": 5012, + "time_per_iteration": 2.645742177963257 + }, + { + "auxiliary_loss_clip": 0.01713601, + "auxiliary_loss_mlp": 0.00349841, + "balance_loss_clip": 1.36996698, + "balance_loss_mlp": 0.31572306, + "epoch": 0.3013978656245303, + "flos": 19792453916160.0, + "grad_norm": 43.40790410931366, + "language_loss": 0.95440465, + "learning_rate": 3.275820002334819e-06, + "loss": 0.97503906, + "num_input_tokens_seen": 107809595, + "router_z_loss_clip": 3.43945312, + "router_z_loss_mlp": 0.34106445, + "step": 5013, + "time_per_iteration": 2.6414270401000977 + }, + { + "auxiliary_loss_clip": 0.01719333, + "auxiliary_loss_mlp": 0.00365829, + "balance_loss_clip": 1.36974883, + "balance_loss_mlp": 0.32758641, + "epoch": 0.30145798887719827, + "flos": 16249650652800.0, + "grad_norm": 39.91372373469943, + "language_loss": 0.90036809, + "learning_rate": 3.2755200494529496e-06, + "loss": 0.92121971, + "num_input_tokens_seen": 107827230, + "router_z_loss_clip": 3.49804688, + "router_z_loss_mlp": 0.3828125, + "step": 5014, + "time_per_iteration": 2.6194136142730713 + }, + { + "auxiliary_loss_clip": 0.01703318, + "auxiliary_loss_mlp": 0.00337547, + "balance_loss_clip": 1.36410832, + "balance_loss_mlp": 0.30464554, + "epoch": 0.30151811212986623, + "flos": 24571733005440.0, + "grad_norm": 2.364963849294682, + "language_loss": 0.74670434, + "learning_rate": 3.2752200482032323e-06, + "loss": 0.76711297, + "num_input_tokens_seen": 107847195, + "router_z_loss_clip": 3.39257812, + "router_z_loss_mlp": 0.32873535, + "step": 5015, + "time_per_iteration": 2.7236123085021973 + }, + { + "auxiliary_loss_clip": 0.01689977, + "auxiliary_loss_mlp": 0.00332771, + "balance_loss_clip": 1.35040426, + "balance_loss_mlp": 0.29851013, + "epoch": 0.3015782353825342, + "flos": 21872076664320.0, + "grad_norm": 35.4442736143837, + "language_loss": 0.82249701, + "learning_rate": 3.2749199985970436e-06, + "loss": 0.84272456, + "num_input_tokens_seen": 107866420, + "router_z_loss_clip": 3.39648438, + "router_z_loss_mlp": 0.34277344, + "step": 5016, + "time_per_iteration": 2.683513879776001 + }, + { + "auxiliary_loss_clip": 0.01703553, + "auxiliary_loss_mlp": 0.00350149, + "balance_loss_clip": 1.36030126, + "balance_loss_mlp": 0.31438616, + "epoch": 0.30163835863520216, + "flos": 28769331248640.0, + "grad_norm": 297.29508393605505, + "language_loss": 0.71986508, + "learning_rate": 3.2746199006457603e-06, + "loss": 0.7404021, + "num_input_tokens_seen": 107889090, + "router_z_loss_clip": 3.43359375, + "router_z_loss_mlp": 0.35717773, + "step": 5017, + "time_per_iteration": 2.7585113048553467 + }, + { + "auxiliary_loss_clip": 0.01727802, + "auxiliary_loss_mlp": 0.00341306, + "balance_loss_clip": 1.37879956, + "balance_loss_mlp": 0.30914325, + "epoch": 0.30169848188787013, + "flos": 22966202891520.0, + "grad_norm": 21.00277780380629, + "language_loss": 0.74886215, + "learning_rate": 3.2743197543607628e-06, + "loss": 0.76955318, + "num_input_tokens_seen": 107907520, + "router_z_loss_clip": 3.48632812, + "router_z_loss_mlp": 0.3215332, + "step": 5018, + "time_per_iteration": 2.7499818801879883 + }, + { + "auxiliary_loss_clip": 0.01691003, + "auxiliary_loss_mlp": 0.00309482, + "balance_loss_clip": 1.35711861, + "balance_loss_mlp": 0.2781533, + "epoch": 0.3017586051405381, + "flos": 21835268202240.0, + "grad_norm": 3.5423418358641996, + "language_loss": 0.82930624, + "learning_rate": 3.2740195597534327e-06, + "loss": 0.84931099, + "num_input_tokens_seen": 107925650, + "router_z_loss_clip": 3.34179688, + "router_z_loss_mlp": 0.31347656, + "step": 5019, + "time_per_iteration": 2.8014278411865234 + }, + { + "auxiliary_loss_clip": 0.01695461, + "auxiliary_loss_mlp": 0.0032915, + "balance_loss_clip": 1.35117388, + "balance_loss_mlp": 0.29169479, + "epoch": 0.30181872839320606, + "flos": 22160403135360.0, + "grad_norm": 9.33474981092558, + "language_loss": 0.76284552, + "learning_rate": 3.2737193168351527e-06, + "loss": 0.78309166, + "num_input_tokens_seen": 107943975, + "router_z_loss_clip": 3.44335938, + "router_z_loss_mlp": 0.37475586, + "step": 5020, + "time_per_iteration": 2.674314498901367 + }, + { + "auxiliary_loss_clip": 0.01758082, + "auxiliary_loss_mlp": 0.0035312, + "balance_loss_clip": 1.39264846, + "balance_loss_mlp": 0.32086208, + "epoch": 0.301878851645874, + "flos": 18114168804480.0, + "grad_norm": 18.924779812829176, + "language_loss": 0.84685081, + "learning_rate": 3.2734190256173085e-06, + "loss": 0.86796284, + "num_input_tokens_seen": 107962950, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 0.32250977, + "step": 5021, + "time_per_iteration": 2.706972122192383 + }, + { + "auxiliary_loss_clip": 0.01727384, + "auxiliary_loss_mlp": 0.00309722, + "balance_loss_clip": 1.37036753, + "balance_loss_mlp": 0.27665353, + "epoch": 0.301938974898542, + "flos": 17602226213760.0, + "grad_norm": 2.6508545074569856, + "language_loss": 0.83589041, + "learning_rate": 3.2731186861112877e-06, + "loss": 0.85626149, + "num_input_tokens_seen": 107979700, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 0.33032227, + "step": 5022, + "time_per_iteration": 2.6335978507995605 + }, + { + "auxiliary_loss_clip": 0.01733012, + "auxiliary_loss_mlp": 0.0033621, + "balance_loss_clip": 1.3746438, + "balance_loss_mlp": 0.30347469, + "epoch": 0.30199909815120995, + "flos": 11181219079680.0, + "grad_norm": 1.91572862461492, + "language_loss": 0.74512047, + "learning_rate": 3.2728182983284793e-06, + "loss": 0.7658127, + "num_input_tokens_seen": 107996645, + "router_z_loss_clip": 3.58007812, + "router_z_loss_mlp": 0.32714844, + "step": 5023, + "time_per_iteration": 2.640486717224121 + }, + { + "auxiliary_loss_clip": 0.01727792, + "auxiliary_loss_mlp": 0.00300615, + "balance_loss_clip": 1.36529016, + "balance_loss_mlp": 0.26795131, + "epoch": 0.302059221403878, + "flos": 21907843632000.0, + "grad_norm": 2.5118044101033066, + "language_loss": 0.77036607, + "learning_rate": 3.2725178622802724e-06, + "loss": 0.79065013, + "num_input_tokens_seen": 108015020, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 0.3269043, + "step": 5024, + "time_per_iteration": 2.6287717819213867 + }, + { + "auxiliary_loss_clip": 0.01703377, + "auxiliary_loss_mlp": 0.00294333, + "balance_loss_clip": 1.35528588, + "balance_loss_mlp": 0.25911853, + "epoch": 0.30211934465654594, + "flos": 26396390039040.0, + "grad_norm": 2.8148181488349877, + "language_loss": 0.79657149, + "learning_rate": 3.272217377978061e-06, + "loss": 0.81654865, + "num_input_tokens_seen": 108036430, + "router_z_loss_clip": 3.48046875, + "router_z_loss_mlp": 0.35229492, + "step": 5025, + "time_per_iteration": 2.665497064590454 + }, + { + "auxiliary_loss_clip": 0.01718012, + "auxiliary_loss_mlp": 0.00318234, + "balance_loss_clip": 1.3644706, + "balance_loss_mlp": 0.28392592, + "epoch": 0.3021794679092139, + "flos": 23400470321280.0, + "grad_norm": 19.087753369737943, + "language_loss": 0.72601795, + "learning_rate": 3.2719168454332387e-06, + "loss": 0.74638045, + "num_input_tokens_seen": 108054250, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 0.34326172, + "step": 5026, + "time_per_iteration": 2.701030731201172 + }, + { + "auxiliary_loss_clip": 0.01711553, + "auxiliary_loss_mlp": 0.00288556, + "balance_loss_clip": 1.35355639, + "balance_loss_mlp": 0.25529629, + "epoch": 0.30223959116188187, + "flos": 20260979942400.0, + "grad_norm": 42.18497862430351, + "language_loss": 0.89787143, + "learning_rate": 3.2716162646572034e-06, + "loss": 0.91787255, + "num_input_tokens_seen": 108071495, + "router_z_loss_clip": 3.58007812, + "router_z_loss_mlp": 0.33276367, + "step": 5027, + "time_per_iteration": 2.6296753883361816 + }, + { + "auxiliary_loss_clip": 0.01699864, + "auxiliary_loss_mlp": 0.00295428, + "balance_loss_clip": 1.3492291, + "balance_loss_mlp": 0.26188284, + "epoch": 0.30229971441454984, + "flos": 26687840993280.0, + "grad_norm": 75.35000688746985, + "language_loss": 0.82110631, + "learning_rate": 3.271315635661351e-06, + "loss": 0.84105927, + "num_input_tokens_seen": 108092135, + "router_z_loss_clip": 3.50585938, + "router_z_loss_mlp": 0.33569336, + "step": 5028, + "time_per_iteration": 2.686184883117676 + }, + { + "auxiliary_loss_clip": 0.01698529, + "auxiliary_loss_mlp": 0.00293606, + "balance_loss_clip": 1.34654367, + "balance_loss_mlp": 0.25977415, + "epoch": 0.3023598376672178, + "flos": 34345323953280.0, + "grad_norm": 38.42331342577075, + "language_loss": 0.8354497, + "learning_rate": 3.2710149584570826e-06, + "loss": 0.85537106, + "num_input_tokens_seen": 108112945, + "router_z_loss_clip": 3.52539062, + "router_z_loss_mlp": 0.33862305, + "step": 5029, + "time_per_iteration": 2.7305829524993896 + }, + { + "auxiliary_loss_clip": 0.01735173, + "auxiliary_loss_mlp": 0.00319435, + "balance_loss_clip": 1.36588573, + "balance_loss_mlp": 0.28095415, + "epoch": 0.30241996091988577, + "flos": 23112143850240.0, + "grad_norm": 3.8956889257423013, + "language_loss": 0.89389646, + "learning_rate": 3.2707142330557993e-06, + "loss": 0.9144426, + "num_input_tokens_seen": 108130325, + "router_z_loss_clip": 3.69726562, + "router_z_loss_mlp": 0.38476562, + "step": 5030, + "time_per_iteration": 2.6335182189941406 + }, + { + "auxiliary_loss_clip": 0.01723231, + "auxiliary_loss_mlp": 0.00281763, + "balance_loss_clip": 1.3625586, + "balance_loss_mlp": 0.24638139, + "epoch": 0.30248008417255373, + "flos": 19390002958080.0, + "grad_norm": 5.722549567813594, + "language_loss": 0.76277351, + "learning_rate": 3.270413459468905e-06, + "loss": 0.78282344, + "num_input_tokens_seen": 108150300, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 0.35400391, + "step": 5031, + "time_per_iteration": 2.658543348312378 + }, + { + "auxiliary_loss_clip": 0.01719766, + "auxiliary_loss_mlp": 0.00278056, + "balance_loss_clip": 1.3509798, + "balance_loss_mlp": 0.24229363, + "epoch": 0.3025402074252217, + "flos": 23769704177280.0, + "grad_norm": 15.225132401950619, + "language_loss": 0.8781023, + "learning_rate": 3.2701126377078047e-06, + "loss": 0.89808059, + "num_input_tokens_seen": 108170330, + "router_z_loss_clip": 3.68554688, + "router_z_loss_mlp": 0.35742188, + "step": 5032, + "time_per_iteration": 2.6790599822998047 + }, + { + "auxiliary_loss_clip": 0.01721841, + "auxiliary_loss_mlp": 0.00320415, + "balance_loss_clip": 1.35872877, + "balance_loss_mlp": 0.28272104, + "epoch": 0.30260033067788966, + "flos": 25994118648960.0, + "grad_norm": 18.27178497464597, + "language_loss": 0.81584972, + "learning_rate": 3.269811767783906e-06, + "loss": 0.83627224, + "num_input_tokens_seen": 108191265, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 0.37744141, + "step": 5033, + "time_per_iteration": 2.711526870727539 + }, + { + "auxiliary_loss_clip": 0.01685281, + "auxiliary_loss_mlp": 0.00268887, + "balance_loss_clip": 1.33443975, + "balance_loss_mlp": 0.23474541, + "epoch": 0.3026604539305576, + "flos": 25374551932800.0, + "grad_norm": 3.5308632377996823, + "language_loss": 0.80328631, + "learning_rate": 3.2695108497086185e-06, + "loss": 0.82282799, + "num_input_tokens_seen": 108211615, + "router_z_loss_clip": 3.5078125, + "router_z_loss_mlp": 0.34155273, + "step": 5034, + "time_per_iteration": 2.671795606613159 + }, + { + "auxiliary_loss_clip": 0.01726979, + "auxiliary_loss_mlp": 0.00275612, + "balance_loss_clip": 1.35879481, + "balance_loss_mlp": 0.23813248, + "epoch": 0.3027205771832256, + "flos": 25812733944960.0, + "grad_norm": 2.7502961568949726, + "language_loss": 0.79537642, + "learning_rate": 3.269209883493352e-06, + "loss": 0.81540227, + "num_input_tokens_seen": 108231080, + "router_z_loss_clip": 3.68359375, + "router_z_loss_mlp": 0.37475586, + "step": 5035, + "time_per_iteration": 2.7078192234039307 + }, + { + "auxiliary_loss_clip": 0.01701185, + "auxiliary_loss_mlp": 0.00263252, + "balance_loss_clip": 1.34740114, + "balance_loss_mlp": 0.22779962, + "epoch": 0.30278070043589356, + "flos": 27344539393920.0, + "grad_norm": 9.382495955022954, + "language_loss": 0.93496794, + "learning_rate": 3.2689088691495196e-06, + "loss": 0.95461226, + "num_input_tokens_seen": 108251125, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 0.35473633, + "step": 5036, + "time_per_iteration": 2.7551615238189697 + }, + { + "auxiliary_loss_clip": 0.01698353, + "auxiliary_loss_mlp": 0.00268642, + "balance_loss_clip": 1.34341669, + "balance_loss_mlp": 0.23252192, + "epoch": 0.3028408236885616, + "flos": 24786227070720.0, + "grad_norm": 5.444388563434392, + "language_loss": 0.83113027, + "learning_rate": 3.268607806688536e-06, + "loss": 0.85080028, + "num_input_tokens_seen": 108272545, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 0.36083984, + "step": 5037, + "time_per_iteration": 2.7718801498413086 + }, + { + "auxiliary_loss_clip": 0.01716359, + "auxiliary_loss_mlp": 0.00301495, + "balance_loss_clip": 1.34728754, + "balance_loss_mlp": 0.2664476, + "epoch": 0.30290094694122954, + "flos": 12932474670720.0, + "grad_norm": 2.772135000453805, + "language_loss": 0.87076163, + "learning_rate": 3.268306696121816e-06, + "loss": 0.89094019, + "num_input_tokens_seen": 108289725, + "router_z_loss_clip": 3.68945312, + "router_z_loss_mlp": 0.3503418, + "step": 5038, + "time_per_iteration": 2.6879334449768066 + }, + { + "auxiliary_loss_clip": 0.01704447, + "auxiliary_loss_mlp": 0.00281648, + "balance_loss_clip": 1.34840775, + "balance_loss_mlp": 0.24638617, + "epoch": 0.3029610701938975, + "flos": 25916443488000.0, + "grad_norm": 15.279938817951566, + "language_loss": 0.8045398, + "learning_rate": 3.2680055374607804e-06, + "loss": 0.82440078, + "num_input_tokens_seen": 108310690, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 0.3527832, + "step": 5039, + "time_per_iteration": 4.296742677688599 + }, + { + "auxiliary_loss_clip": 0.01674228, + "auxiliary_loss_mlp": 0.0024772, + "balance_loss_clip": 1.32457376, + "balance_loss_mlp": 0.216392, + "epoch": 0.3030211934465655, + "flos": 21980993679360.0, + "grad_norm": 9.597897944337266, + "language_loss": 0.85101956, + "learning_rate": 3.267704330716847e-06, + "loss": 0.87023902, + "num_input_tokens_seen": 108328905, + "router_z_loss_clip": 3.49609375, + "router_z_loss_mlp": 0.31323242, + "step": 5040, + "time_per_iteration": 2.6590168476104736 + }, + { + "auxiliary_loss_clip": 0.01698168, + "auxiliary_loss_mlp": 0.00274268, + "balance_loss_clip": 1.3369025, + "balance_loss_mlp": 0.23871985, + "epoch": 0.30308131669923344, + "flos": 20991977625600.0, + "grad_norm": 104.00263289305478, + "language_loss": 0.87291014, + "learning_rate": 3.267403075901438e-06, + "loss": 0.89263451, + "num_input_tokens_seen": 108346680, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 0.35546875, + "step": 5041, + "time_per_iteration": 2.661561965942383 + }, + { + "auxiliary_loss_clip": 0.01754313, + "auxiliary_loss_mlp": 0.00050635, + "balance_loss_clip": 1.50181675, + "balance_loss_mlp": 0.03547137, + "epoch": 0.3031414399519014, + "flos": 60548875827840.0, + "grad_norm": 3.0495101800397357, + "language_loss": 0.59305155, + "learning_rate": 3.267101773025978e-06, + "loss": 0.61110103, + "num_input_tokens_seen": 108413885, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.15136719, + "step": 5042, + "time_per_iteration": 4.6341187953948975 + }, + { + "auxiliary_loss_clip": 0.01717315, + "auxiliary_loss_mlp": 0.0026997, + "balance_loss_clip": 1.35316348, + "balance_loss_mlp": 0.23756903, + "epoch": 0.30320156320456937, + "flos": 21907664064000.0, + "grad_norm": 4.481557276230008, + "language_loss": 0.78915846, + "learning_rate": 3.266800422101892e-06, + "loss": 0.80903137, + "num_input_tokens_seen": 108433640, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 0.32397461, + "step": 5043, + "time_per_iteration": 2.6353514194488525 + }, + { + "auxiliary_loss_clip": 0.01704955, + "auxiliary_loss_mlp": 0.00270294, + "balance_loss_clip": 1.35057855, + "balance_loss_mlp": 0.23529419, + "epoch": 0.30326168645723733, + "flos": 21652770176640.0, + "grad_norm": 11.824032253737897, + "language_loss": 0.76046658, + "learning_rate": 3.266499023140606e-06, + "loss": 0.78021908, + "num_input_tokens_seen": 108452640, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 0.35009766, + "step": 5044, + "time_per_iteration": 2.6414406299591064 + }, + { + "auxiliary_loss_clip": 0.01705091, + "auxiliary_loss_mlp": 0.00275346, + "balance_loss_clip": 1.35046339, + "balance_loss_mlp": 0.24129997, + "epoch": 0.3033218097099053, + "flos": 21871286565120.0, + "grad_norm": 22.806482039488614, + "language_loss": 0.82081127, + "learning_rate": 3.2661975761535513e-06, + "loss": 0.84061575, + "num_input_tokens_seen": 108472470, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 0.34057617, + "step": 5045, + "time_per_iteration": 4.039073705673218 + }, + { + "auxiliary_loss_clip": 0.0169761, + "auxiliary_loss_mlp": 0.00251017, + "balance_loss_clip": 1.34342873, + "balance_loss_mlp": 0.21499217, + "epoch": 0.30338193296257326, + "flos": 27089717333760.0, + "grad_norm": 13.59011574760537, + "language_loss": 0.78183222, + "learning_rate": 3.2658960811521564e-06, + "loss": 0.80131853, + "num_input_tokens_seen": 108493025, + "router_z_loss_clip": 3.54101562, + "router_z_loss_mlp": 0.36010742, + "step": 5046, + "time_per_iteration": 2.710190534591675 + }, + { + "auxiliary_loss_clip": 0.0173002, + "auxiliary_loss_mlp": 0.00268963, + "balance_loss_clip": 1.35574532, + "balance_loss_mlp": 0.23114944, + "epoch": 0.30344205621524123, + "flos": 19534363718400.0, + "grad_norm": 12.832081918181148, + "language_loss": 0.8762337, + "learning_rate": 3.2655945381478564e-06, + "loss": 0.89622355, + "num_input_tokens_seen": 108513480, + "router_z_loss_clip": 3.74023438, + "router_z_loss_mlp": 0.37792969, + "step": 5047, + "time_per_iteration": 2.800795555114746 + }, + { + "auxiliary_loss_clip": 0.01725061, + "auxiliary_loss_mlp": 0.00266837, + "balance_loss_clip": 1.36229467, + "balance_loss_mlp": 0.2345078, + "epoch": 0.3035021794679092, + "flos": 23910976368000.0, + "grad_norm": 2.061800633387559, + "language_loss": 0.77328014, + "learning_rate": 3.265292947152084e-06, + "loss": 0.79319906, + "num_input_tokens_seen": 108533155, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 0.32287598, + "step": 5048, + "time_per_iteration": 2.684461832046509 + }, + { + "auxiliary_loss_clip": 0.01689925, + "auxiliary_loss_mlp": 0.00253282, + "balance_loss_clip": 1.33996153, + "balance_loss_mlp": 0.21926013, + "epoch": 0.30356230272057716, + "flos": 16143606725760.0, + "grad_norm": 17.458672167188546, + "language_loss": 0.81157267, + "learning_rate": 3.2649913081762763e-06, + "loss": 0.83100474, + "num_input_tokens_seen": 108551900, + "router_z_loss_clip": 3.49609375, + "router_z_loss_mlp": 0.34033203, + "step": 5049, + "time_per_iteration": 2.7144885063171387 + }, + { + "auxiliary_loss_clip": 0.01703953, + "auxiliary_loss_mlp": 0.00271666, + "balance_loss_clip": 1.34200525, + "balance_loss_mlp": 0.23552197, + "epoch": 0.3036224259732452, + "flos": 28914697589760.0, + "grad_norm": 386.2512468907635, + "language_loss": 0.87627423, + "learning_rate": 3.2646896212318717e-06, + "loss": 0.89603043, + "num_input_tokens_seen": 108574005, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 0.36132812, + "step": 5050, + "time_per_iteration": 2.741415023803711 + }, + { + "auxiliary_loss_clip": 0.0175467, + "auxiliary_loss_mlp": 0.00258078, + "balance_loss_clip": 1.37837243, + "balance_loss_mlp": 0.21902472, + "epoch": 0.30368254922591315, + "flos": 21105599322240.0, + "grad_norm": 2.7022949184552862, + "language_loss": 0.80814093, + "learning_rate": 3.2643878863303106e-06, + "loss": 0.82826841, + "num_input_tokens_seen": 108592715, + "router_z_loss_clip": 3.765625, + "router_z_loss_mlp": 0.39038086, + "step": 5051, + "time_per_iteration": 2.7059969902038574 + }, + { + "auxiliary_loss_clip": 0.01706168, + "auxiliary_loss_mlp": 0.00272332, + "balance_loss_clip": 1.34525895, + "balance_loss_mlp": 0.23847628, + "epoch": 0.3037426724785811, + "flos": 23002293081600.0, + "grad_norm": 22.57106117504861, + "language_loss": 0.81524903, + "learning_rate": 3.264086103483033e-06, + "loss": 0.83503401, + "num_input_tokens_seen": 108611770, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 0.33886719, + "step": 5052, + "time_per_iteration": 2.6755216121673584 + }, + { + "auxiliary_loss_clip": 0.01727333, + "auxiliary_loss_mlp": 0.00300306, + "balance_loss_clip": 1.35398507, + "balance_loss_mlp": 0.26432893, + "epoch": 0.3038027957312491, + "flos": 15632705629440.0, + "grad_norm": 45.46222419954672, + "language_loss": 0.90903413, + "learning_rate": 3.2637842727014836e-06, + "loss": 0.92931056, + "num_input_tokens_seen": 108629070, + "router_z_loss_clip": 3.734375, + "router_z_loss_mlp": 0.35986328, + "step": 5053, + "time_per_iteration": 4.099026203155518 + }, + { + "auxiliary_loss_clip": 0.01708596, + "auxiliary_loss_mlp": 0.00271575, + "balance_loss_clip": 1.34312212, + "balance_loss_mlp": 0.23376164, + "epoch": 0.30386291898391704, + "flos": 12713994195840.0, + "grad_norm": 2.697546057335955, + "language_loss": 0.768924, + "learning_rate": 3.2634823939971083e-06, + "loss": 0.78872561, + "num_input_tokens_seen": 108646315, + "router_z_loss_clip": 3.65429688, + "router_z_loss_mlp": 0.37792969, + "step": 5054, + "time_per_iteration": 2.614152431488037 + }, + { + "auxiliary_loss_clip": 0.01718269, + "auxiliary_loss_mlp": 0.00285457, + "balance_loss_clip": 1.35242319, + "balance_loss_mlp": 0.25019479, + "epoch": 0.303923042236585, + "flos": 26359437922560.0, + "grad_norm": 75.505274193201, + "language_loss": 0.74808049, + "learning_rate": 3.2631804673813545e-06, + "loss": 0.76811779, + "num_input_tokens_seen": 108665920, + "router_z_loss_clip": 3.65820312, + "router_z_loss_mlp": 0.35253906, + "step": 5055, + "time_per_iteration": 2.6913764476776123 + }, + { + "auxiliary_loss_clip": 0.01695379, + "auxiliary_loss_mlp": 0.00279837, + "balance_loss_clip": 1.34272397, + "balance_loss_mlp": 0.24738786, + "epoch": 0.30398316548925297, + "flos": 19719232041600.0, + "grad_norm": 26.476816939715214, + "language_loss": 0.74013674, + "learning_rate": 3.2628784928656707e-06, + "loss": 0.75988889, + "num_input_tokens_seen": 108683485, + "router_z_loss_clip": 3.52734375, + "router_z_loss_mlp": 0.32446289, + "step": 5056, + "time_per_iteration": 2.645218849182129 + }, + { + "auxiliary_loss_clip": 0.01692734, + "auxiliary_loss_mlp": 0.00267353, + "balance_loss_clip": 1.33769035, + "balance_loss_mlp": 0.23192434, + "epoch": 0.30404328874192094, + "flos": 24239846315520.0, + "grad_norm": 350.99599910570345, + "language_loss": 0.86252248, + "learning_rate": 3.262576470461507e-06, + "loss": 0.88212335, + "num_input_tokens_seen": 108702700, + "router_z_loss_clip": 3.55078125, + "router_z_loss_mlp": 0.35449219, + "step": 5057, + "time_per_iteration": 2.675706624984741 + }, + { + "auxiliary_loss_clip": 0.01702663, + "auxiliary_loss_mlp": 0.00285542, + "balance_loss_clip": 1.34175372, + "balance_loss_mlp": 0.2489206, + "epoch": 0.3041034119945889, + "flos": 24498942094080.0, + "grad_norm": 35.33363619159725, + "language_loss": 0.94384646, + "learning_rate": 3.2622744001803176e-06, + "loss": 0.96372855, + "num_input_tokens_seen": 108721860, + "router_z_loss_clip": 3.609375, + "router_z_loss_mlp": 0.3659668, + "step": 5058, + "time_per_iteration": 2.7421422004699707 + }, + { + "auxiliary_loss_clip": 0.01673422, + "auxiliary_loss_mlp": 0.00276341, + "balance_loss_clip": 1.32050133, + "balance_loss_mlp": 0.23988676, + "epoch": 0.30416353524725687, + "flos": 28288881907200.0, + "grad_norm": 86.6072241297052, + "language_loss": 0.78847158, + "learning_rate": 3.2619722820335564e-06, + "loss": 0.80796921, + "num_input_tokens_seen": 108743215, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 0.36425781, + "step": 5059, + "time_per_iteration": 2.681610107421875 + }, + { + "auxiliary_loss_clip": 0.01671024, + "auxiliary_loss_mlp": 0.00275782, + "balance_loss_clip": 1.31723785, + "balance_loss_mlp": 0.24016264, + "epoch": 0.30422365849992483, + "flos": 23660392112640.0, + "grad_norm": 3.168197006093855, + "language_loss": 0.77937061, + "learning_rate": 3.26167011603268e-06, + "loss": 0.79883868, + "num_input_tokens_seen": 108765505, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 0.35644531, + "step": 5060, + "time_per_iteration": 2.7104897499084473 + }, + { + "auxiliary_loss_clip": 0.01671868, + "auxiliary_loss_mlp": 0.0028499, + "balance_loss_clip": 1.32042551, + "balance_loss_mlp": 0.24801157, + "epoch": 0.3042837817525928, + "flos": 22998773548800.0, + "grad_norm": 76.19916930633615, + "language_loss": 0.8245827, + "learning_rate": 3.2613679021891463e-06, + "loss": 0.84415132, + "num_input_tokens_seen": 108783370, + "router_z_loss_clip": 3.515625, + "router_z_loss_mlp": 0.36987305, + "step": 5061, + "time_per_iteration": 2.6696934700012207 + }, + { + "auxiliary_loss_clip": 0.01701769, + "auxiliary_loss_mlp": 0.00304697, + "balance_loss_clip": 1.33029675, + "balance_loss_mlp": 0.26430863, + "epoch": 0.30434390500526076, + "flos": 22082332924800.0, + "grad_norm": 11.810597480865635, + "language_loss": 0.89286911, + "learning_rate": 3.261065640514415e-06, + "loss": 0.91293377, + "num_input_tokens_seen": 108797430, + "router_z_loss_clip": 3.7109375, + "router_z_loss_mlp": 0.40405273, + "step": 5062, + "time_per_iteration": 2.630356550216675 + }, + { + "auxiliary_loss_clip": 0.01667679, + "auxiliary_loss_mlp": 0.0028771, + "balance_loss_clip": 1.31301808, + "balance_loss_mlp": 0.2517091, + "epoch": 0.3044040282579287, + "flos": 25483504861440.0, + "grad_norm": 8.293970348350475, + "language_loss": 0.82513773, + "learning_rate": 3.2607633310199483e-06, + "loss": 0.84469163, + "num_input_tokens_seen": 108816945, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 0.35986328, + "step": 5063, + "time_per_iteration": 2.7010204792022705 + }, + { + "auxiliary_loss_clip": 0.01679065, + "auxiliary_loss_mlp": 0.00284683, + "balance_loss_clip": 1.31701529, + "balance_loss_mlp": 0.2477282, + "epoch": 0.30446415151059675, + "flos": 21945478106880.0, + "grad_norm": 41.497354615023326, + "language_loss": 0.89978385, + "learning_rate": 3.26046097371721e-06, + "loss": 0.91942132, + "num_input_tokens_seen": 108836615, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 0.36987305, + "step": 5064, + "time_per_iteration": 2.688836097717285 + }, + { + "auxiliary_loss_clip": 0.01658797, + "auxiliary_loss_mlp": 0.0027288, + "balance_loss_clip": 1.30101669, + "balance_loss_mlp": 0.23799953, + "epoch": 0.3045242747632647, + "flos": 16435416816000.0, + "grad_norm": 15.004709595720298, + "language_loss": 0.83435589, + "learning_rate": 3.2601585686176655e-06, + "loss": 0.85367262, + "num_input_tokens_seen": 108855165, + "router_z_loss_clip": 3.57617188, + "router_z_loss_mlp": 0.34863281, + "step": 5065, + "time_per_iteration": 2.874338150024414 + }, + { + "auxiliary_loss_clip": 0.01683736, + "auxiliary_loss_mlp": 0.00281032, + "balance_loss_clip": 1.31594324, + "balance_loss_mlp": 0.24464929, + "epoch": 0.3045843980159327, + "flos": 31540341957120.0, + "grad_norm": 245.76264539240603, + "language_loss": 0.69777107, + "learning_rate": 3.2598561157327814e-06, + "loss": 0.71741867, + "num_input_tokens_seen": 108874690, + "router_z_loss_clip": 3.67773438, + "router_z_loss_mlp": 0.36425781, + "step": 5066, + "time_per_iteration": 2.797149181365967 + }, + { + "auxiliary_loss_clip": 0.0170354, + "auxiliary_loss_mlp": 0.00282225, + "balance_loss_clip": 1.32695138, + "balance_loss_mlp": 0.24674864, + "epoch": 0.30464452126860064, + "flos": 17853636481920.0, + "grad_norm": 25.331825483445815, + "language_loss": 0.90886879, + "learning_rate": 3.2595536150740265e-06, + "loss": 0.92872649, + "num_input_tokens_seen": 108893140, + "router_z_loss_clip": 3.76757812, + "router_z_loss_mlp": 0.35473633, + "step": 5067, + "time_per_iteration": 2.6480965614318848 + }, + { + "auxiliary_loss_clip": 0.01669987, + "auxiliary_loss_mlp": 0.00314031, + "balance_loss_clip": 1.30796981, + "balance_loss_mlp": 0.27669472, + "epoch": 0.3047046445212686, + "flos": 20631398947200.0, + "grad_norm": 14.283558712056534, + "language_loss": 0.69276083, + "learning_rate": 3.259251066652873e-06, + "loss": 0.71260095, + "num_input_tokens_seen": 108911880, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 0.37353516, + "step": 5068, + "time_per_iteration": 2.606750965118408 + }, + { + "auxiliary_loss_clip": 0.01664318, + "auxiliary_loss_mlp": 0.00276996, + "balance_loss_clip": 1.31033111, + "balance_loss_mlp": 0.23815808, + "epoch": 0.3047647677739366, + "flos": 21287594557440.0, + "grad_norm": 406.99789753427166, + "language_loss": 0.82798809, + "learning_rate": 3.258948470480793e-06, + "loss": 0.84740126, + "num_input_tokens_seen": 108930440, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 0.38818359, + "step": 5069, + "time_per_iteration": 2.692472457885742 + }, + { + "auxiliary_loss_clip": 0.01679176, + "auxiliary_loss_mlp": 0.003009, + "balance_loss_clip": 1.31773758, + "balance_loss_mlp": 0.26463646, + "epoch": 0.30482489102660454, + "flos": 20995928121600.0, + "grad_norm": 3.2504784757531575, + "language_loss": 0.82782066, + "learning_rate": 3.258645826569261e-06, + "loss": 0.84762144, + "num_input_tokens_seen": 108949125, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 0.36279297, + "step": 5070, + "time_per_iteration": 2.642348289489746 + }, + { + "auxiliary_loss_clip": 0.01711694, + "auxiliary_loss_mlp": 0.00316045, + "balance_loss_clip": 1.32983994, + "balance_loss_mlp": 0.27804118, + "epoch": 0.3048850142792725, + "flos": 26290812988800.0, + "grad_norm": 2.375550364663542, + "language_loss": 0.87931466, + "learning_rate": 3.2583431349297527e-06, + "loss": 0.89959204, + "num_input_tokens_seen": 108972190, + "router_z_loss_clip": 3.8203125, + "router_z_loss_mlp": 0.37963867, + "step": 5071, + "time_per_iteration": 2.7070114612579346 + }, + { + "auxiliary_loss_clip": 0.01685969, + "auxiliary_loss_mlp": 0.00315429, + "balance_loss_clip": 1.31780839, + "balance_loss_mlp": 0.27649528, + "epoch": 0.30494513753194047, + "flos": 22346241125760.0, + "grad_norm": 3.016786335465832, + "language_loss": 0.84018189, + "learning_rate": 3.2580403955737467e-06, + "loss": 0.86019588, + "num_input_tokens_seen": 108990325, + "router_z_loss_clip": 3.68164062, + "router_z_loss_mlp": 0.3894043, + "step": 5072, + "time_per_iteration": 2.63873291015625 + }, + { + "auxiliary_loss_clip": 0.01691306, + "auxiliary_loss_mlp": 0.00270172, + "balance_loss_clip": 1.31983995, + "balance_loss_mlp": 0.2312142, + "epoch": 0.30500526078460843, + "flos": 19537667769600.0, + "grad_norm": 8.899947370532892, + "language_loss": 0.78552371, + "learning_rate": 3.257737608512723e-06, + "loss": 0.80513847, + "num_input_tokens_seen": 109009505, + "router_z_loss_clip": 3.71679688, + "router_z_loss_mlp": 0.3894043, + "step": 5073, + "time_per_iteration": 2.653093099594116 + }, + { + "auxiliary_loss_clip": 0.01714172, + "auxiliary_loss_mlp": 0.00307423, + "balance_loss_clip": 1.33085001, + "balance_loss_mlp": 0.27247104, + "epoch": 0.3050653840372764, + "flos": 14465321614080.0, + "grad_norm": 16.737960431523828, + "language_loss": 0.85949981, + "learning_rate": 3.257434773758163e-06, + "loss": 0.8797158, + "num_input_tokens_seen": 109026350, + "router_z_loss_clip": 3.83203125, + "router_z_loss_mlp": 0.34936523, + "step": 5074, + "time_per_iteration": 2.585664987564087 + }, + { + "auxiliary_loss_clip": 0.01702124, + "auxiliary_loss_mlp": 0.00299576, + "balance_loss_clip": 1.33506703, + "balance_loss_mlp": 0.26302597, + "epoch": 0.30512550728994436, + "flos": 24243796811520.0, + "grad_norm": 22.300047326971367, + "language_loss": 0.82098347, + "learning_rate": 3.25713189132155e-06, + "loss": 0.84100056, + "num_input_tokens_seen": 109044165, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 0.36572266, + "step": 5075, + "time_per_iteration": 2.696277379989624 + }, + { + "auxiliary_loss_clip": 0.01715402, + "auxiliary_loss_mlp": 0.00291142, + "balance_loss_clip": 1.33064282, + "balance_loss_mlp": 0.2517074, + "epoch": 0.30518563054261233, + "flos": 16360542915840.0, + "grad_norm": 3.0853863224596223, + "language_loss": 0.81179559, + "learning_rate": 3.2568289612143703e-06, + "loss": 0.83186102, + "num_input_tokens_seen": 109060665, + "router_z_loss_clip": 3.8515625, + "router_z_loss_mlp": 0.39428711, + "step": 5076, + "time_per_iteration": 2.6479859352111816 + }, + { + "auxiliary_loss_clip": 0.01719766, + "auxiliary_loss_mlp": 0.00321408, + "balance_loss_clip": 1.34309614, + "balance_loss_mlp": 0.28416663, + "epoch": 0.30524575379528035, + "flos": 21579584215680.0, + "grad_norm": 7.991304867853234, + "language_loss": 0.84550244, + "learning_rate": 3.25652598344811e-06, + "loss": 0.86591411, + "num_input_tokens_seen": 109080035, + "router_z_loss_clip": 3.765625, + "router_z_loss_mlp": 0.37280273, + "step": 5077, + "time_per_iteration": 2.6670756340026855 + }, + { + "auxiliary_loss_clip": 0.01672514, + "auxiliary_loss_mlp": 0.00261067, + "balance_loss_clip": 1.31552649, + "balance_loss_mlp": 0.22580467, + "epoch": 0.3053058770479483, + "flos": 16545231671040.0, + "grad_norm": 5.445529076744947, + "language_loss": 0.81052649, + "learning_rate": 3.256222958034259e-06, + "loss": 0.8298623, + "num_input_tokens_seen": 109097385, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 0.35253906, + "step": 5078, + "time_per_iteration": 2.636361837387085 + }, + { + "auxiliary_loss_clip": 0.01713392, + "auxiliary_loss_mlp": 0.00336122, + "balance_loss_clip": 1.3356818, + "balance_loss_mlp": 0.29752186, + "epoch": 0.3053660003006163, + "flos": 12312907954560.0, + "grad_norm": 55.95493812940133, + "language_loss": 0.72402191, + "learning_rate": 3.255919884984307e-06, + "loss": 0.74451709, + "num_input_tokens_seen": 109115495, + "router_z_loss_clip": 3.77539062, + "router_z_loss_mlp": 0.38623047, + "step": 5079, + "time_per_iteration": 2.6536014080047607 + }, + { + "auxiliary_loss_clip": 0.01701681, + "auxiliary_loss_mlp": 0.00306907, + "balance_loss_clip": 1.33269727, + "balance_loss_mlp": 0.27076262, + "epoch": 0.30542612355328425, + "flos": 23112287504640.0, + "grad_norm": 5.166235749431431, + "language_loss": 0.84887159, + "learning_rate": 3.2556167643097477e-06, + "loss": 0.86895746, + "num_input_tokens_seen": 109134235, + "router_z_loss_clip": 3.68945312, + "router_z_loss_mlp": 0.36132812, + "step": 5080, + "time_per_iteration": 2.6893739700317383 + }, + { + "auxiliary_loss_clip": 0.01695917, + "auxiliary_loss_mlp": 0.00307081, + "balance_loss_clip": 1.32928658, + "balance_loss_mlp": 0.27003068, + "epoch": 0.3054862468059522, + "flos": 24389450461440.0, + "grad_norm": 36.95896810730087, + "language_loss": 0.87477231, + "learning_rate": 3.255313596022074e-06, + "loss": 0.89480233, + "num_input_tokens_seen": 109152760, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 0.37036133, + "step": 5081, + "time_per_iteration": 2.722252368927002 + }, + { + "auxiliary_loss_clip": 0.01723213, + "auxiliary_loss_mlp": 0.00319343, + "balance_loss_clip": 1.34751892, + "balance_loss_mlp": 0.28198236, + "epoch": 0.3055463700586202, + "flos": 29386096704000.0, + "grad_norm": 3.1307730869605113, + "language_loss": 0.76509255, + "learning_rate": 3.255010380132783e-06, + "loss": 0.78551811, + "num_input_tokens_seen": 109173925, + "router_z_loss_clip": 3.75390625, + "router_z_loss_mlp": 0.37353516, + "step": 5082, + "time_per_iteration": 4.126389026641846 + }, + { + "auxiliary_loss_clip": 0.0172507, + "auxiliary_loss_mlp": 0.00329156, + "balance_loss_clip": 1.32928216, + "balance_loss_mlp": 0.28872061, + "epoch": 0.30560649331128814, + "flos": 25591775431680.0, + "grad_norm": 6.473991210979472, + "language_loss": 0.78777373, + "learning_rate": 3.2547071166533736e-06, + "loss": 0.80831605, + "num_input_tokens_seen": 109192510, + "router_z_loss_clip": 3.95703125, + "router_z_loss_mlp": 0.40429688, + "step": 5083, + "time_per_iteration": 2.6821818351745605 + }, + { + "auxiliary_loss_clip": 0.01694221, + "auxiliary_loss_mlp": 0.00299193, + "balance_loss_clip": 1.32723653, + "balance_loss_mlp": 0.26188082, + "epoch": 0.3056666165639561, + "flos": 19128321400320.0, + "grad_norm": 34.90890507258392, + "language_loss": 0.76504773, + "learning_rate": 3.254403805595344e-06, + "loss": 0.78498185, + "num_input_tokens_seen": 109210885, + "router_z_loss_clip": 3.66992188, + "router_z_loss_mlp": 0.37329102, + "step": 5084, + "time_per_iteration": 4.0594282150268555 + }, + { + "auxiliary_loss_clip": 0.01727525, + "auxiliary_loss_mlp": 0.00309902, + "balance_loss_clip": 1.34201479, + "balance_loss_mlp": 0.27311438, + "epoch": 0.30572673981662407, + "flos": 15523860441600.0, + "grad_norm": 2.7260888768982308, + "language_loss": 0.87684566, + "learning_rate": 3.2541004469701962e-06, + "loss": 0.89721996, + "num_input_tokens_seen": 109229180, + "router_z_loss_clip": 3.85742188, + "router_z_loss_mlp": 0.36767578, + "step": 5085, + "time_per_iteration": 2.636312484741211 + }, + { + "auxiliary_loss_clip": 0.01691782, + "auxiliary_loss_mlp": 0.0031813, + "balance_loss_clip": 1.32588232, + "balance_loss_mlp": 0.28267673, + "epoch": 0.30578686306929204, + "flos": 21506541909120.0, + "grad_norm": 6.500023640293035, + "language_loss": 0.82808828, + "learning_rate": 3.2537970407894342e-06, + "loss": 0.84818739, + "num_input_tokens_seen": 109249510, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 0.35449219, + "step": 5086, + "time_per_iteration": 2.6758785247802734 + }, + { + "auxiliary_loss_clip": 0.01682534, + "auxiliary_loss_mlp": 0.00306814, + "balance_loss_clip": 1.32837868, + "balance_loss_mlp": 0.27019301, + "epoch": 0.30584698632196, + "flos": 20954271323520.0, + "grad_norm": 2.5704224874904678, + "language_loss": 0.83394074, + "learning_rate": 3.253493587064563e-06, + "loss": 0.85383421, + "num_input_tokens_seen": 109268200, + "router_z_loss_clip": 3.54296875, + "router_z_loss_mlp": 0.36621094, + "step": 5087, + "time_per_iteration": 2.724257230758667 + }, + { + "auxiliary_loss_clip": 0.01699365, + "auxiliary_loss_mlp": 0.00323417, + "balance_loss_clip": 1.3261106, + "balance_loss_mlp": 0.28531796, + "epoch": 0.30590710957462797, + "flos": 24681116897280.0, + "grad_norm": 3.2426920509454904, + "language_loss": 0.7722404, + "learning_rate": 3.2531900858070885e-06, + "loss": 0.79246819, + "num_input_tokens_seen": 109288370, + "router_z_loss_clip": 3.734375, + "router_z_loss_mlp": 0.38134766, + "step": 5088, + "time_per_iteration": 4.275417804718018 + }, + { + "auxiliary_loss_clip": 0.01725886, + "auxiliary_loss_mlp": 0.00329005, + "balance_loss_clip": 1.33700526, + "balance_loss_mlp": 0.29054773, + "epoch": 0.30596723282729593, + "flos": 17086907744640.0, + "grad_norm": 10.361691111790876, + "language_loss": 0.88181674, + "learning_rate": 3.252886537028521e-06, + "loss": 0.90236568, + "num_input_tokens_seen": 109306730, + "router_z_loss_clip": 3.89453125, + "router_z_loss_mlp": 0.38476562, + "step": 5089, + "time_per_iteration": 2.641275644302368 + }, + { + "auxiliary_loss_clip": 0.01694911, + "auxiliary_loss_mlp": 0.00333918, + "balance_loss_clip": 1.33178568, + "balance_loss_mlp": 0.29322028, + "epoch": 0.30602735607996395, + "flos": 22857106308480.0, + "grad_norm": 3.163428747918257, + "language_loss": 0.84071654, + "learning_rate": 3.2525829407403703e-06, + "loss": 0.86100483, + "num_input_tokens_seen": 109327360, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 0.40698242, + "step": 5090, + "time_per_iteration": 2.645031213760376 + }, + { + "auxiliary_loss_clip": 0.01721682, + "auxiliary_loss_mlp": 0.00326472, + "balance_loss_clip": 1.33654332, + "balance_loss_mlp": 0.28880173, + "epoch": 0.3060874793326319, + "flos": 29861482227840.0, + "grad_norm": 25.83811927150604, + "language_loss": 0.83572197, + "learning_rate": 3.2522792969541488e-06, + "loss": 0.85620356, + "num_input_tokens_seen": 109348135, + "router_z_loss_clip": 3.8515625, + "router_z_loss_mlp": 0.37695312, + "step": 5091, + "time_per_iteration": 2.7386362552642822 + }, + { + "auxiliary_loss_clip": 0.01690512, + "auxiliary_loss_mlp": 0.00302911, + "balance_loss_clip": 1.32667708, + "balance_loss_mlp": 0.26607549, + "epoch": 0.3061476025852999, + "flos": 20448577699200.0, + "grad_norm": 4.897062763175659, + "language_loss": 0.77490127, + "learning_rate": 3.2519756056813705e-06, + "loss": 0.79483551, + "num_input_tokens_seen": 109366220, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 0.36816406, + "step": 5092, + "time_per_iteration": 2.6400818824768066 + }, + { + "auxiliary_loss_clip": 0.01716013, + "auxiliary_loss_mlp": 0.00296124, + "balance_loss_clip": 1.35041046, + "balance_loss_mlp": 0.2581917, + "epoch": 0.30620772583796785, + "flos": 19391475415680.0, + "grad_norm": 4.944158568787182, + "language_loss": 0.88317454, + "learning_rate": 3.2516718669335522e-06, + "loss": 0.90329599, + "num_input_tokens_seen": 109385260, + "router_z_loss_clip": 3.66015625, + "router_z_loss_mlp": 0.37939453, + "step": 5093, + "time_per_iteration": 2.719830274581909 + }, + { + "auxiliary_loss_clip": 0.01692347, + "auxiliary_loss_mlp": 0.00283086, + "balance_loss_clip": 1.3330152, + "balance_loss_mlp": 0.24589245, + "epoch": 0.3062678490906358, + "flos": 24024562151040.0, + "grad_norm": 4.162216387482296, + "language_loss": 0.80938834, + "learning_rate": 3.2513680807222114e-06, + "loss": 0.82914269, + "num_input_tokens_seen": 109405025, + "router_z_loss_clip": 3.59179688, + "router_z_loss_mlp": 0.37182617, + "step": 5094, + "time_per_iteration": 2.709756851196289 + }, + { + "auxiliary_loss_clip": 0.0168095, + "auxiliary_loss_mlp": 0.00266596, + "balance_loss_clip": 1.32101798, + "balance_loss_mlp": 0.23016523, + "epoch": 0.3063279723433038, + "flos": 19754639873280.0, + "grad_norm": 31.474036936437027, + "language_loss": 0.83043337, + "learning_rate": 3.251064247058868e-06, + "loss": 0.84990877, + "num_input_tokens_seen": 109422465, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 0.36450195, + "step": 5095, + "time_per_iteration": 2.7167506217956543 + }, + { + "auxiliary_loss_clip": 0.016783, + "auxiliary_loss_mlp": 0.00277495, + "balance_loss_clip": 1.33114707, + "balance_loss_mlp": 0.24273348, + "epoch": 0.30638809559597174, + "flos": 22450022496000.0, + "grad_norm": 4.928067460159174, + "language_loss": 0.84571767, + "learning_rate": 3.250760365955042e-06, + "loss": 0.86527556, + "num_input_tokens_seen": 109440575, + "router_z_loss_clip": 3.47265625, + "router_z_loss_mlp": 0.34741211, + "step": 5096, + "time_per_iteration": 4.038203001022339 + }, + { + "auxiliary_loss_clip": 0.01687233, + "auxiliary_loss_mlp": 0.00291821, + "balance_loss_clip": 1.32913005, + "balance_loss_mlp": 0.25531876, + "epoch": 0.3064482188486397, + "flos": 17165157523200.0, + "grad_norm": 5.901843330278196, + "language_loss": 0.90392905, + "learning_rate": 3.250456437422258e-06, + "loss": 0.92371964, + "num_input_tokens_seen": 109459050, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 0.36474609, + "step": 5097, + "time_per_iteration": 2.640777349472046 + }, + { + "auxiliary_loss_clip": 0.01677908, + "auxiliary_loss_mlp": 0.00287418, + "balance_loss_clip": 1.32882905, + "balance_loss_mlp": 0.24919964, + "epoch": 0.3065083421013077, + "flos": 23768483114880.0, + "grad_norm": 22.44866442075845, + "language_loss": 0.83746088, + "learning_rate": 3.250152461472041e-06, + "loss": 0.8571142, + "num_input_tokens_seen": 109475860, + "router_z_loss_clip": 3.49023438, + "router_z_loss_mlp": 0.38208008, + "step": 5098, + "time_per_iteration": 2.7022149562835693 + }, + { + "auxiliary_loss_clip": 0.01681234, + "auxiliary_loss_mlp": 0.00269237, + "balance_loss_clip": 1.33530664, + "balance_loss_mlp": 0.23628798, + "epoch": 0.30656846535397564, + "flos": 26431833784320.0, + "grad_norm": 36.98887344761161, + "language_loss": 0.90828222, + "learning_rate": 3.249848438115917e-06, + "loss": 0.92778695, + "num_input_tokens_seen": 109494760, + "router_z_loss_clip": 3.45898438, + "router_z_loss_mlp": 0.32958984, + "step": 5099, + "time_per_iteration": 2.712043285369873 + }, + { + "auxiliary_loss_clip": 0.01688284, + "auxiliary_loss_mlp": 0.00280329, + "balance_loss_clip": 1.33199334, + "balance_loss_mlp": 0.24413671, + "epoch": 0.3066285886066436, + "flos": 26651786716800.0, + "grad_norm": 9.716576941818325, + "language_loss": 0.90909195, + "learning_rate": 3.2495443673654148e-06, + "loss": 0.92877811, + "num_input_tokens_seen": 109516480, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 0.36181641, + "step": 5100, + "time_per_iteration": 2.7526309490203857 + }, + { + "auxiliary_loss_clip": 0.01692205, + "auxiliary_loss_mlp": 0.00286398, + "balance_loss_clip": 1.33869076, + "balance_loss_mlp": 0.24930041, + "epoch": 0.30668871185931157, + "flos": 15049947375360.0, + "grad_norm": 40.76826934534667, + "language_loss": 0.85410166, + "learning_rate": 3.249240249232065e-06, + "loss": 0.87388766, + "num_input_tokens_seen": 109534615, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 0.37060547, + "step": 5101, + "time_per_iteration": 2.646016836166382 + }, + { + "auxiliary_loss_clip": 0.01707477, + "auxiliary_loss_mlp": 0.00276771, + "balance_loss_clip": 1.35041666, + "balance_loss_mlp": 0.23776546, + "epoch": 0.30674883511197953, + "flos": 20082109190400.0, + "grad_norm": 4.7545075824236465, + "language_loss": 0.86594069, + "learning_rate": 3.2489360837273998e-06, + "loss": 0.8857832, + "num_input_tokens_seen": 109554040, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 0.39013672, + "step": 5102, + "time_per_iteration": 2.7391293048858643 + }, + { + "auxiliary_loss_clip": 0.01714825, + "auxiliary_loss_mlp": 0.00300446, + "balance_loss_clip": 1.35660493, + "balance_loss_mlp": 0.26208395, + "epoch": 0.30680895836464755, + "flos": 22893807029760.0, + "grad_norm": 43.40294235697135, + "language_loss": 0.94077504, + "learning_rate": 3.2486318708629532e-06, + "loss": 0.96092778, + "num_input_tokens_seen": 109574345, + "router_z_loss_clip": 3.58398438, + "router_z_loss_mlp": 0.38354492, + "step": 5103, + "time_per_iteration": 2.6569769382476807 + }, + { + "auxiliary_loss_clip": 0.01725857, + "auxiliary_loss_mlp": 0.00271356, + "balance_loss_clip": 1.36440659, + "balance_loss_mlp": 0.23432994, + "epoch": 0.3068690816173155, + "flos": 23696159080320.0, + "grad_norm": 10.667197228311705, + "language_loss": 0.80320418, + "learning_rate": 3.2483276106502607e-06, + "loss": 0.82317632, + "num_input_tokens_seen": 109593670, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 0.37036133, + "step": 5104, + "time_per_iteration": 2.6730387210845947 + }, + { + "auxiliary_loss_clip": 0.01710581, + "auxiliary_loss_mlp": 0.00304496, + "balance_loss_clip": 1.35323596, + "balance_loss_mlp": 0.26570508, + "epoch": 0.3069292048699835, + "flos": 23551044134400.0, + "grad_norm": 2.3597249638049496, + "language_loss": 0.7898289, + "learning_rate": 3.2480233031008605e-06, + "loss": 0.80997968, + "num_input_tokens_seen": 109613385, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 0.38793945, + "step": 5105, + "time_per_iteration": 2.6618165969848633 + }, + { + "auxiliary_loss_clip": 0.01726919, + "auxiliary_loss_mlp": 0.00315844, + "balance_loss_clip": 1.3666656, + "balance_loss_mlp": 0.27841267, + "epoch": 0.30698932812265145, + "flos": 24531656405760.0, + "grad_norm": 2.8067233311305397, + "language_loss": 0.91137588, + "learning_rate": 3.2477189482262916e-06, + "loss": 0.93180352, + "num_input_tokens_seen": 109632395, + "router_z_loss_clip": 3.60546875, + "router_z_loss_mlp": 0.37402344, + "step": 5106, + "time_per_iteration": 2.6973822116851807 + }, + { + "auxiliary_loss_clip": 0.01722154, + "auxiliary_loss_mlp": 0.00333417, + "balance_loss_clip": 1.3556633, + "balance_loss_mlp": 0.29331517, + "epoch": 0.3070494513753194, + "flos": 20996430912000.0, + "grad_norm": 23.17094655009606, + "language_loss": 0.80417478, + "learning_rate": 3.2474145460380945e-06, + "loss": 0.82473052, + "num_input_tokens_seen": 109651380, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 0.40112305, + "step": 5107, + "time_per_iteration": 2.6867480278015137 + }, + { + "auxiliary_loss_clip": 0.01723163, + "auxiliary_loss_mlp": 0.00289949, + "balance_loss_clip": 1.36108613, + "balance_loss_mlp": 0.25220746, + "epoch": 0.3071095746279874, + "flos": 19025940660480.0, + "grad_norm": 2.6588765349276544, + "language_loss": 0.78392768, + "learning_rate": 3.247110096547814e-06, + "loss": 0.80405879, + "num_input_tokens_seen": 109670240, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 0.37744141, + "step": 5108, + "time_per_iteration": 2.7106289863586426 + }, + { + "auxiliary_loss_clip": 0.01722373, + "auxiliary_loss_mlp": 0.003255, + "balance_loss_clip": 1.36247325, + "balance_loss_mlp": 0.28711426, + "epoch": 0.30716969788065535, + "flos": 21215521918080.0, + "grad_norm": 2.802297815703217, + "language_loss": 0.92712289, + "learning_rate": 3.2468055997669926e-06, + "loss": 0.94760162, + "num_input_tokens_seen": 109690810, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 0.3840332, + "step": 5109, + "time_per_iteration": 2.743403434753418 + }, + { + "auxiliary_loss_clip": 0.01718532, + "auxiliary_loss_mlp": 0.0030689, + "balance_loss_clip": 1.35807228, + "balance_loss_mlp": 0.27200904, + "epoch": 0.3072298211333233, + "flos": 25772765086080.0, + "grad_norm": 11.500764534145157, + "language_loss": 0.73212767, + "learning_rate": 3.2465010557071788e-06, + "loss": 0.7523818, + "num_input_tokens_seen": 109711145, + "router_z_loss_clip": 3.60351562, + "router_z_loss_mlp": 0.34912109, + "step": 5110, + "time_per_iteration": 2.6858537197113037 + }, + { + "auxiliary_loss_clip": 0.01714163, + "auxiliary_loss_mlp": 0.00278667, + "balance_loss_clip": 1.36410332, + "balance_loss_mlp": 0.24426308, + "epoch": 0.3072899443859913, + "flos": 25848931875840.0, + "grad_norm": 8.903527690591732, + "language_loss": 0.81461811, + "learning_rate": 3.246196464379919e-06, + "loss": 0.83454645, + "num_input_tokens_seen": 109731425, + "router_z_loss_clip": 3.50195312, + "router_z_loss_mlp": 0.34399414, + "step": 5111, + "time_per_iteration": 2.6853127479553223 + }, + { + "auxiliary_loss_clip": 0.01743217, + "auxiliary_loss_mlp": 0.00306142, + "balance_loss_clip": 1.37481213, + "balance_loss_mlp": 0.26961631, + "epoch": 0.30735006763865924, + "flos": 25922800195200.0, + "grad_norm": 3.9521624984920822, + "language_loss": 0.74848413, + "learning_rate": 3.245891825796765e-06, + "loss": 0.76897764, + "num_input_tokens_seen": 109752720, + "router_z_loss_clip": 3.68945312, + "router_z_loss_mlp": 0.36523438, + "step": 5112, + "time_per_iteration": 2.801832914352417 + }, + { + "auxiliary_loss_clip": 0.01748283, + "auxiliary_loss_mlp": 0.0033869, + "balance_loss_clip": 1.37925255, + "balance_loss_mlp": 0.30016136, + "epoch": 0.3074101908913272, + "flos": 30917004312960.0, + "grad_norm": 5.099620436919375, + "language_loss": 0.86822814, + "learning_rate": 3.2455871399692678e-06, + "loss": 0.88909787, + "num_input_tokens_seen": 109772840, + "router_z_loss_clip": 3.68945312, + "router_z_loss_mlp": 0.38549805, + "step": 5113, + "time_per_iteration": 2.833498001098633 + }, + { + "auxiliary_loss_clip": 0.01734361, + "auxiliary_loss_mlp": 0.00312205, + "balance_loss_clip": 1.37279224, + "balance_loss_mlp": 0.27355707, + "epoch": 0.30747031414399517, + "flos": 18401058731520.0, + "grad_norm": 11.802789216737423, + "language_loss": 0.85477757, + "learning_rate": 3.2452824069089815e-06, + "loss": 0.87524319, + "num_input_tokens_seen": 109790150, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 0.38623047, + "step": 5114, + "time_per_iteration": 2.6279284954071045 + }, + { + "auxiliary_loss_clip": 0.01717352, + "auxiliary_loss_mlp": 0.00294676, + "balance_loss_clip": 1.36131525, + "balance_loss_mlp": 0.25614792, + "epoch": 0.30753043739666314, + "flos": 22633166966400.0, + "grad_norm": 4.407737924458952, + "language_loss": 0.68319571, + "learning_rate": 3.2449776266274623e-06, + "loss": 0.70331597, + "num_input_tokens_seen": 109807985, + "router_z_loss_clip": 3.56054688, + "router_z_loss_mlp": 0.38500977, + "step": 5115, + "time_per_iteration": 2.6493804454803467 + }, + { + "auxiliary_loss_clip": 0.01724827, + "auxiliary_loss_mlp": 0.00326839, + "balance_loss_clip": 1.36300075, + "balance_loss_mlp": 0.29055128, + "epoch": 0.3075905606493311, + "flos": 27344072517120.0, + "grad_norm": 331.60580209235957, + "language_loss": 0.89157343, + "learning_rate": 3.2446727991362657e-06, + "loss": 0.91209006, + "num_input_tokens_seen": 109825920, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 0.36303711, + "step": 5116, + "time_per_iteration": 2.6803503036499023 + }, + { + "auxiliary_loss_clip": 0.01730502, + "auxiliary_loss_mlp": 0.00306304, + "balance_loss_clip": 1.36754358, + "balance_loss_mlp": 0.2652964, + "epoch": 0.3076506839019991, + "flos": 22090808534400.0, + "grad_norm": 6.374467127224982, + "language_loss": 0.81918931, + "learning_rate": 3.244367924446952e-06, + "loss": 0.83955741, + "num_input_tokens_seen": 109846220, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 0.41015625, + "step": 5117, + "time_per_iteration": 2.647113561630249 + }, + { + "auxiliary_loss_clip": 0.0175631, + "auxiliary_loss_mlp": 0.00317915, + "balance_loss_clip": 1.38645005, + "balance_loss_mlp": 0.27986386, + "epoch": 0.3077108071546671, + "flos": 21289533891840.0, + "grad_norm": 15.287581150150427, + "language_loss": 0.77623105, + "learning_rate": 3.2440630025710826e-06, + "loss": 0.79697323, + "num_input_tokens_seen": 109863870, + "router_z_loss_clip": 3.69921875, + "router_z_loss_mlp": 0.38061523, + "step": 5118, + "time_per_iteration": 2.633878231048584 + }, + { + "auxiliary_loss_clip": 0.01721429, + "auxiliary_loss_mlp": 0.00295961, + "balance_loss_clip": 1.3628056, + "balance_loss_mlp": 0.25790966, + "epoch": 0.30777093040733505, + "flos": 21430985650560.0, + "grad_norm": 20.600950440471088, + "language_loss": 0.80117202, + "learning_rate": 3.243758033520219e-06, + "loss": 0.82134593, + "num_input_tokens_seen": 109883500, + "router_z_loss_clip": 3.58398438, + "router_z_loss_mlp": 0.38061523, + "step": 5119, + "time_per_iteration": 2.6808576583862305 + }, + { + "auxiliary_loss_clip": 0.01723997, + "auxiliary_loss_mlp": 0.00291679, + "balance_loss_clip": 1.36920762, + "balance_loss_mlp": 0.25391322, + "epoch": 0.307831053660003, + "flos": 23149275534720.0, + "grad_norm": 17.77802738036293, + "language_loss": 0.85141927, + "learning_rate": 3.243453017305926e-06, + "loss": 0.87157607, + "num_input_tokens_seen": 109904620, + "router_z_loss_clip": 3.54492188, + "router_z_loss_mlp": 0.37792969, + "step": 5120, + "time_per_iteration": 2.6771743297576904 + }, + { + "auxiliary_loss_clip": 0.01721656, + "auxiliary_loss_mlp": 0.0029615, + "balance_loss_clip": 1.37153077, + "balance_loss_mlp": 0.26270011, + "epoch": 0.307891176912671, + "flos": 17019755268480.0, + "grad_norm": 870.4327912939941, + "language_loss": 0.85909259, + "learning_rate": 3.24314795393977e-06, + "loss": 0.87927067, + "num_input_tokens_seen": 109922275, + "router_z_loss_clip": 3.5, + "router_z_loss_mlp": 0.33447266, + "step": 5121, + "time_per_iteration": 2.662991523742676 + }, + { + "auxiliary_loss_clip": 0.01727957, + "auxiliary_loss_mlp": 0.00283856, + "balance_loss_clip": 1.37971151, + "balance_loss_mlp": 0.2463046, + "epoch": 0.30795130016533895, + "flos": 27705046245120.0, + "grad_norm": 14.935648000163184, + "language_loss": 0.87636626, + "learning_rate": 3.242842843433319e-06, + "loss": 0.89648438, + "num_input_tokens_seen": 109944265, + "router_z_loss_clip": 3.48242188, + "router_z_loss_mlp": 0.37548828, + "step": 5122, + "time_per_iteration": 2.769547462463379 + }, + { + "auxiliary_loss_clip": 0.01900421, + "auxiliary_loss_mlp": 0.00172307, + "balance_loss_clip": 1.63681865, + "balance_loss_mlp": 0.16367589, + "epoch": 0.3080114234180069, + "flos": 69058699591680.0, + "grad_norm": 0.7581088488885888, + "language_loss": 0.58735389, + "learning_rate": 3.242537685798143e-06, + "loss": 0.60808116, + "num_input_tokens_seen": 110014160, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.08642578, + "step": 5123, + "time_per_iteration": 3.2759780883789062 + }, + { + "auxiliary_loss_clip": 0.01750344, + "auxiliary_loss_mlp": 0.00325877, + "balance_loss_clip": 1.38352156, + "balance_loss_mlp": 0.28446397, + "epoch": 0.3080715466706749, + "flos": 24060221377920.0, + "grad_norm": 1.9334779220168143, + "language_loss": 0.90086889, + "learning_rate": 3.242232481045813e-06, + "loss": 0.92163116, + "num_input_tokens_seen": 110034865, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 0.41455078, + "step": 5124, + "time_per_iteration": 4.185925006866455 + }, + { + "auxiliary_loss_clip": 0.01744978, + "auxiliary_loss_mlp": 0.00268946, + "balance_loss_clip": 1.38044572, + "balance_loss_mlp": 0.2312282, + "epoch": 0.30813166992334284, + "flos": 25848680480640.0, + "grad_norm": 2.231218683316119, + "language_loss": 0.86506063, + "learning_rate": 3.2419272291879035e-06, + "loss": 0.8851999, + "num_input_tokens_seen": 110052930, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 0.37719727, + "step": 5125, + "time_per_iteration": 2.7389976978302 + }, + { + "auxiliary_loss_clip": 0.01750572, + "auxiliary_loss_mlp": 0.00306991, + "balance_loss_clip": 1.38209462, + "balance_loss_mlp": 0.26910591, + "epoch": 0.3081917931760108, + "flos": 20449619193600.0, + "grad_norm": 4.695553451887266, + "language_loss": 0.72740835, + "learning_rate": 3.241621930235989e-06, + "loss": 0.74798399, + "num_input_tokens_seen": 110071765, + "router_z_loss_clip": 3.6875, + "router_z_loss_mlp": 0.37890625, + "step": 5126, + "time_per_iteration": 4.077983617782593 + }, + { + "auxiliary_loss_clip": 0.01711025, + "auxiliary_loss_mlp": 0.00253475, + "balance_loss_clip": 1.36658001, + "balance_loss_mlp": 0.21749774, + "epoch": 0.3082519164286788, + "flos": 22166257052160.0, + "grad_norm": 7.92998448785644, + "language_loss": 0.91114366, + "learning_rate": 3.241316584201646e-06, + "loss": 0.93078864, + "num_input_tokens_seen": 110092660, + "router_z_loss_clip": 3.4453125, + "router_z_loss_mlp": 0.35961914, + "step": 5127, + "time_per_iteration": 2.713705539703369 + }, + { + "auxiliary_loss_clip": 0.01720969, + "auxiliary_loss_mlp": 0.00270282, + "balance_loss_clip": 1.37477243, + "balance_loss_mlp": 0.23418507, + "epoch": 0.30831203968134674, + "flos": 28913404700160.0, + "grad_norm": 1.791776631169313, + "language_loss": 0.73701656, + "learning_rate": 3.2410111910964538e-06, + "loss": 0.7569291, + "num_input_tokens_seen": 110114960, + "router_z_loss_clip": 3.46484375, + "router_z_loss_mlp": 0.36108398, + "step": 5128, + "time_per_iteration": 2.7165515422821045 + }, + { + "auxiliary_loss_clip": 0.01727063, + "auxiliary_loss_mlp": 0.00289679, + "balance_loss_clip": 1.36832058, + "balance_loss_mlp": 0.25024456, + "epoch": 0.3083721629340147, + "flos": 25667726739840.0, + "grad_norm": 3.089468405968999, + "language_loss": 0.77942169, + "learning_rate": 3.240705750931993e-06, + "loss": 0.7995891, + "num_input_tokens_seen": 110135750, + "router_z_loss_clip": 3.59179688, + "router_z_loss_mlp": 0.39477539, + "step": 5129, + "time_per_iteration": 2.7070820331573486 + }, + { + "auxiliary_loss_clip": 0.0174413, + "auxiliary_loss_mlp": 0.00048075, + "balance_loss_clip": 1.4826715, + "balance_loss_mlp": 0.04199563, + "epoch": 0.3084322861866827, + "flos": 68212679581440.0, + "grad_norm": 0.8616940327018665, + "language_loss": 0.58646894, + "learning_rate": 3.240400263719846e-06, + "loss": 0.60439098, + "num_input_tokens_seen": 110189480, + "router_z_loss_clip": 2.625, + "router_z_loss_mlp": 0.06079102, + "step": 5130, + "time_per_iteration": 4.513909816741943 + }, + { + "auxiliary_loss_clip": 0.01734287, + "auxiliary_loss_mlp": 0.00299217, + "balance_loss_clip": 1.3803854, + "balance_loss_mlp": 0.26123667, + "epoch": 0.3084924094393507, + "flos": 20296495514880.0, + "grad_norm": 17.528225827700425, + "language_loss": 0.82093966, + "learning_rate": 3.2400947294715957e-06, + "loss": 0.84127474, + "num_input_tokens_seen": 110206445, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 0.37988281, + "step": 5131, + "time_per_iteration": 2.6802375316619873 + }, + { + "auxiliary_loss_clip": 0.01722317, + "auxiliary_loss_mlp": 0.00267519, + "balance_loss_clip": 1.3807224, + "balance_loss_mlp": 0.23270977, + "epoch": 0.30855253269201866, + "flos": 23949831905280.0, + "grad_norm": 10.39632345769879, + "language_loss": 0.76543266, + "learning_rate": 3.2397891481988303e-06, + "loss": 0.78533101, + "num_input_tokens_seen": 110226845, + "router_z_loss_clip": 3.41796875, + "router_z_loss_mlp": 0.34814453, + "step": 5132, + "time_per_iteration": 2.6738364696502686 + }, + { + "auxiliary_loss_clip": 0.01691777, + "auxiliary_loss_mlp": 0.00253405, + "balance_loss_clip": 1.35606635, + "balance_loss_mlp": 0.21666446, + "epoch": 0.3086126559446866, + "flos": 19281876042240.0, + "grad_norm": 7.272241563070787, + "language_loss": 0.95952082, + "learning_rate": 3.239483519913136e-06, + "loss": 0.97897261, + "num_input_tokens_seen": 110244095, + "router_z_loss_clip": 3.35546875, + "router_z_loss_mlp": 0.3671875, + "step": 5133, + "time_per_iteration": 2.637441396713257 + }, + { + "auxiliary_loss_clip": 0.0172505, + "auxiliary_loss_mlp": 0.00306087, + "balance_loss_clip": 1.37955475, + "balance_loss_mlp": 0.26734376, + "epoch": 0.3086727791973546, + "flos": 33760770019200.0, + "grad_norm": 3.6752641222841596, + "language_loss": 0.74254131, + "learning_rate": 3.239177844626102e-06, + "loss": 0.76285267, + "num_input_tokens_seen": 110264240, + "router_z_loss_clip": 3.453125, + "router_z_loss_mlp": 0.38745117, + "step": 5134, + "time_per_iteration": 2.771911382675171 + }, + { + "auxiliary_loss_clip": 0.01712991, + "auxiliary_loss_mlp": 0.00294925, + "balance_loss_clip": 1.36710286, + "balance_loss_mlp": 0.25482309, + "epoch": 0.30873290245002255, + "flos": 16034151006720.0, + "grad_norm": 12.484188002938083, + "language_loss": 0.89189434, + "learning_rate": 3.2388721223493197e-06, + "loss": 0.91197354, + "num_input_tokens_seen": 110282450, + "router_z_loss_clip": 3.4609375, + "router_z_loss_mlp": 0.40063477, + "step": 5135, + "time_per_iteration": 2.6252338886260986 + }, + { + "auxiliary_loss_clip": 0.01679923, + "auxiliary_loss_mlp": 0.00036712, + "balance_loss_clip": 1.43073547, + "balance_loss_mlp": 0.03063202, + "epoch": 0.3087930257026905, + "flos": 65048304055680.0, + "grad_norm": 0.7113941476861222, + "language_loss": 0.55281466, + "learning_rate": 3.2385663530943824e-06, + "loss": 0.56998098, + "num_input_tokens_seen": 110343715, + "router_z_loss_clip": 2.5, + "router_z_loss_mlp": 0.06079102, + "step": 5136, + "time_per_iteration": 3.164055824279785 + }, + { + "auxiliary_loss_clip": 0.01698614, + "auxiliary_loss_mlp": 0.00264234, + "balance_loss_clip": 1.36618018, + "balance_loss_mlp": 0.22887659, + "epoch": 0.3088531489553585, + "flos": 74738829824640.0, + "grad_norm": 2.8413403338439247, + "language_loss": 0.83033901, + "learning_rate": 3.2382605368728852e-06, + "loss": 0.84996748, + "num_input_tokens_seen": 110368430, + "router_z_loss_clip": 3.32617188, + "router_z_loss_mlp": 0.35351562, + "step": 5137, + "time_per_iteration": 3.0946240425109863 + }, + { + "auxiliary_loss_clip": 0.01701646, + "auxiliary_loss_mlp": 0.00272031, + "balance_loss_clip": 1.36454618, + "balance_loss_mlp": 0.23531494, + "epoch": 0.30891327220802645, + "flos": 21142300043520.0, + "grad_norm": 3.673364483915258, + "language_loss": 0.85371965, + "learning_rate": 3.237954673696424e-06, + "loss": 0.87345642, + "num_input_tokens_seen": 110386735, + "router_z_loss_clip": 3.37109375, + "router_z_loss_mlp": 0.36694336, + "step": 5138, + "time_per_iteration": 4.117339849472046 + }, + { + "auxiliary_loss_clip": 0.01712896, + "auxiliary_loss_mlp": 0.00262564, + "balance_loss_clip": 1.37028193, + "balance_loss_mlp": 0.22486958, + "epoch": 0.3089733954606944, + "flos": 25664494515840.0, + "grad_norm": 1.9689633144243122, + "language_loss": 0.86087477, + "learning_rate": 3.2376487635765983e-06, + "loss": 0.88062936, + "num_input_tokens_seen": 110406820, + "router_z_loss_clip": 3.421875, + "router_z_loss_mlp": 0.37695312, + "step": 5139, + "time_per_iteration": 2.742020606994629 + }, + { + "auxiliary_loss_clip": 0.01708624, + "auxiliary_loss_mlp": 0.00315562, + "balance_loss_clip": 1.36384761, + "balance_loss_mlp": 0.27727178, + "epoch": 0.3090335187133624, + "flos": 19427350124160.0, + "grad_norm": 9.301213739218095, + "language_loss": 0.84695053, + "learning_rate": 3.2373428065250067e-06, + "loss": 0.86719239, + "num_input_tokens_seen": 110424225, + "router_z_loss_clip": 3.44726562, + "router_z_loss_mlp": 0.3828125, + "step": 5140, + "time_per_iteration": 2.629746198654175 + }, + { + "auxiliary_loss_clip": 0.01670424, + "auxiliary_loss_mlp": 0.00268781, + "balance_loss_clip": 1.34625053, + "balance_loss_mlp": 0.23483041, + "epoch": 0.30909364196603034, + "flos": 20011329440640.0, + "grad_norm": 6.467792144708172, + "language_loss": 0.84575069, + "learning_rate": 3.237036802553252e-06, + "loss": 0.8651427, + "num_input_tokens_seen": 110443310, + "router_z_loss_clip": 3.24023438, + "router_z_loss_mlp": 0.33959961, + "step": 5141, + "time_per_iteration": 2.6718130111694336 + }, + { + "auxiliary_loss_clip": 0.01681438, + "auxiliary_loss_mlp": 0.00281472, + "balance_loss_clip": 1.34319866, + "balance_loss_mlp": 0.24404068, + "epoch": 0.3091537652186983, + "flos": 19677575243520.0, + "grad_norm": 51.26661462855091, + "language_loss": 0.9709568, + "learning_rate": 3.2367307516729377e-06, + "loss": 0.99058592, + "num_input_tokens_seen": 110460215, + "router_z_loss_clip": 3.38085938, + "router_z_loss_mlp": 0.37426758, + "step": 5142, + "time_per_iteration": 2.6694140434265137 + }, + { + "auxiliary_loss_clip": 0.01701137, + "auxiliary_loss_mlp": 0.00308066, + "balance_loss_clip": 1.36219311, + "balance_loss_mlp": 0.27378103, + "epoch": 0.3092138884713663, + "flos": 17020042577280.0, + "grad_norm": 11.3142667866755, + "language_loss": 0.87914574, + "learning_rate": 3.23642465389567e-06, + "loss": 0.89923775, + "num_input_tokens_seen": 110479385, + "router_z_loss_clip": 3.38671875, + "router_z_loss_mlp": 0.34277344, + "step": 5143, + "time_per_iteration": 2.6344077587127686 + }, + { + "auxiliary_loss_clip": 0.01676726, + "auxiliary_loss_mlp": 0.00284008, + "balance_loss_clip": 1.34655321, + "balance_loss_mlp": 0.24717201, + "epoch": 0.3092740117240343, + "flos": 25009986844800.0, + "grad_norm": 28.41120477578561, + "language_loss": 0.80043495, + "learning_rate": 3.236118509233055e-06, + "loss": 0.82004225, + "num_input_tokens_seen": 110499885, + "router_z_loss_clip": 3.30273438, + "router_z_loss_mlp": 0.3684082, + "step": 5144, + "time_per_iteration": 2.734423875808716 + }, + { + "auxiliary_loss_clip": 0.01699619, + "auxiliary_loss_mlp": 0.00320813, + "balance_loss_clip": 1.35201406, + "balance_loss_mlp": 0.28466839, + "epoch": 0.30933413497670226, + "flos": 25590410714880.0, + "grad_norm": 1.9558723523603616, + "language_loss": 0.81329709, + "learning_rate": 3.235812317696702e-06, + "loss": 0.8335014, + "num_input_tokens_seen": 110519690, + "router_z_loss_clip": 3.47460938, + "router_z_loss_mlp": 0.36132812, + "step": 5145, + "time_per_iteration": 2.6887567043304443 + }, + { + "auxiliary_loss_clip": 0.01682319, + "auxiliary_loss_mlp": 0.00329417, + "balance_loss_clip": 1.34293497, + "balance_loss_mlp": 0.29231885, + "epoch": 0.3093942582293702, + "flos": 24389665943040.0, + "grad_norm": 6.887443098805595, + "language_loss": 0.82231766, + "learning_rate": 3.2355060792982224e-06, + "loss": 0.842435, + "num_input_tokens_seen": 110540520, + "router_z_loss_clip": 3.39648438, + "router_z_loss_mlp": 0.37133789, + "step": 5146, + "time_per_iteration": 2.6934635639190674 + }, + { + "auxiliary_loss_clip": 0.01664858, + "auxiliary_loss_mlp": 0.00276614, + "balance_loss_clip": 1.34359527, + "balance_loss_mlp": 0.24373591, + "epoch": 0.3094543814820382, + "flos": 19646441130240.0, + "grad_norm": 3.4363251750098702, + "language_loss": 0.73902059, + "learning_rate": 3.2351997940492286e-06, + "loss": 0.75843537, + "num_input_tokens_seen": 110557950, + "router_z_loss_clip": 3.21289062, + "router_z_loss_mlp": 0.32861328, + "step": 5147, + "time_per_iteration": 2.6320059299468994 + }, + { + "auxiliary_loss_clip": 0.01682623, + "auxiliary_loss_mlp": 0.00332583, + "balance_loss_clip": 1.35002899, + "balance_loss_mlp": 0.29901367, + "epoch": 0.30951450473470615, + "flos": 25663812157440.0, + "grad_norm": 5.079801987492071, + "language_loss": 0.82594693, + "learning_rate": 3.2348934619613346e-06, + "loss": 0.84609896, + "num_input_tokens_seen": 110578215, + "router_z_loss_clip": 3.32421875, + "router_z_loss_mlp": 0.33569336, + "step": 5148, + "time_per_iteration": 2.692263603210449 + }, + { + "auxiliary_loss_clip": 0.01699424, + "auxiliary_loss_mlp": 0.0032528, + "balance_loss_clip": 1.35282755, + "balance_loss_mlp": 0.2871803, + "epoch": 0.3095746279873741, + "flos": 12020415505920.0, + "grad_norm": 12.798213015925347, + "language_loss": 0.82503432, + "learning_rate": 3.2345870830461567e-06, + "loss": 0.84528136, + "num_input_tokens_seen": 110592990, + "router_z_loss_clip": 3.46484375, + "router_z_loss_mlp": 0.38085938, + "step": 5149, + "time_per_iteration": 2.5879576206207275 + }, + { + "auxiliary_loss_clip": 0.01692506, + "auxiliary_loss_mlp": 0.00359366, + "balance_loss_clip": 1.3533442, + "balance_loss_mlp": 0.32214871, + "epoch": 0.3096347512400421, + "flos": 23623044946560.0, + "grad_norm": 6.674509396184529, + "language_loss": 0.91234744, + "learning_rate": 3.2342806573153132e-06, + "loss": 0.9328661, + "num_input_tokens_seen": 110612130, + "router_z_loss_clip": 3.390625, + "router_z_loss_mlp": 0.37231445, + "step": 5150, + "time_per_iteration": 2.770207405090332 + }, + { + "auxiliary_loss_clip": 0.01689195, + "auxiliary_loss_mlp": 0.00315887, + "balance_loss_clip": 1.36401629, + "balance_loss_mlp": 0.28474927, + "epoch": 0.30969487449271005, + "flos": 22529313768960.0, + "grad_norm": 2.246831319575793, + "language_loss": 0.8434037, + "learning_rate": 3.233974184780424e-06, + "loss": 0.86345446, + "num_input_tokens_seen": 110632045, + "router_z_loss_clip": 3.25, + "router_z_loss_mlp": 0.31152344, + "step": 5151, + "time_per_iteration": 2.779845714569092 + }, + { + "auxiliary_loss_clip": 0.01671145, + "auxiliary_loss_mlp": 0.00306576, + "balance_loss_clip": 1.33753908, + "balance_loss_mlp": 0.2705276, + "epoch": 0.309754997745378, + "flos": 15267925059840.0, + "grad_norm": 26.119431109916697, + "language_loss": 0.77106792, + "learning_rate": 3.2336676654531084e-06, + "loss": 0.7908451, + "num_input_tokens_seen": 110649340, + "router_z_loss_clip": 3.3359375, + "router_z_loss_mlp": 0.36035156, + "step": 5152, + "time_per_iteration": 2.6572108268737793 + }, + { + "auxiliary_loss_clip": 0.01681082, + "auxiliary_loss_mlp": 0.00344223, + "balance_loss_clip": 1.34221697, + "balance_loss_mlp": 0.30805531, + "epoch": 0.309815120998046, + "flos": 26979291947520.0, + "grad_norm": 8.652298710963008, + "language_loss": 0.89523554, + "learning_rate": 3.2333610993449926e-06, + "loss": 0.9154886, + "num_input_tokens_seen": 110668450, + "router_z_loss_clip": 3.38867188, + "router_z_loss_mlp": 0.36108398, + "step": 5153, + "time_per_iteration": 2.709210157394409 + }, + { + "auxiliary_loss_clip": 0.01676355, + "auxiliary_loss_mlp": 0.00324756, + "balance_loss_clip": 1.34514868, + "balance_loss_mlp": 0.29073405, + "epoch": 0.30987524425071394, + "flos": 21143161969920.0, + "grad_norm": 2.5005432572890562, + "language_loss": 0.79992777, + "learning_rate": 3.2330544864676997e-06, + "loss": 0.8199389, + "num_input_tokens_seen": 110689410, + "router_z_loss_clip": 3.3125, + "router_z_loss_mlp": 0.34008789, + "step": 5154, + "time_per_iteration": 2.665780544281006 + }, + { + "auxiliary_loss_clip": 0.01657454, + "auxiliary_loss_mlp": 0.00319961, + "balance_loss_clip": 1.34164047, + "balance_loss_mlp": 0.28546226, + "epoch": 0.3099353675033819, + "flos": 15268284195840.0, + "grad_norm": 3.8003115844964, + "language_loss": 0.82454014, + "learning_rate": 3.232747826832858e-06, + "loss": 0.84431428, + "num_input_tokens_seen": 110707350, + "router_z_loss_clip": 3.15625, + "router_z_loss_mlp": 0.3449707, + "step": 5155, + "time_per_iteration": 2.5902154445648193 + }, + { + "auxiliary_loss_clip": 0.01653074, + "auxiliary_loss_mlp": 0.00308415, + "balance_loss_clip": 1.32345366, + "balance_loss_mlp": 0.27021989, + "epoch": 0.30999549075604993, + "flos": 15413794191360.0, + "grad_norm": 706.755243542043, + "language_loss": 0.87837803, + "learning_rate": 3.232441120452094e-06, + "loss": 0.89799285, + "num_input_tokens_seen": 110724910, + "router_z_loss_clip": 3.29882812, + "router_z_loss_mlp": 0.38183594, + "step": 5156, + "time_per_iteration": 2.6600122451782227 + }, + { + "auxiliary_loss_clip": 0.0163886, + "auxiliary_loss_mlp": 0.00336619, + "balance_loss_clip": 1.31697321, + "balance_loss_mlp": 0.30083275, + "epoch": 0.3100556140087179, + "flos": 23184539712000.0, + "grad_norm": 40.02308935830575, + "language_loss": 0.80953228, + "learning_rate": 3.23213436733704e-06, + "loss": 0.82928717, + "num_input_tokens_seen": 110744010, + "router_z_loss_clip": 3.22265625, + "router_z_loss_mlp": 0.35791016, + "step": 5157, + "time_per_iteration": 2.6854069232940674 + }, + { + "auxiliary_loss_clip": 0.01616162, + "auxiliary_loss_mlp": 0.00283838, + "balance_loss_clip": 1.29738736, + "balance_loss_mlp": 0.25203249, + "epoch": 0.31011573726138586, + "flos": 25742169676800.0, + "grad_norm": 315.7792588959449, + "language_loss": 0.74506241, + "learning_rate": 3.231827567499327e-06, + "loss": 0.7640624, + "num_input_tokens_seen": 110765835, + "router_z_loss_clip": 3.18554688, + "router_z_loss_mlp": 0.31835938, + "step": 5158, + "time_per_iteration": 2.8443126678466797 + }, + { + "auxiliary_loss_clip": 0.01618809, + "auxiliary_loss_mlp": 0.00250836, + "balance_loss_clip": 1.30112278, + "balance_loss_mlp": 0.21841113, + "epoch": 0.3101758605140538, + "flos": 20011329440640.0, + "grad_norm": 54.70326830024265, + "language_loss": 0.90621269, + "learning_rate": 3.2315207209505896e-06, + "loss": 0.92490911, + "num_input_tokens_seen": 110784655, + "router_z_loss_clip": 3.17382812, + "router_z_loss_mlp": 0.32421875, + "step": 5159, + "time_per_iteration": 2.6315832138061523 + }, + { + "auxiliary_loss_clip": 0.01625776, + "auxiliary_loss_mlp": 0.00309697, + "balance_loss_clip": 1.3055172, + "balance_loss_mlp": 0.27515015, + "epoch": 0.3102359837667218, + "flos": 19135683688320.0, + "grad_norm": 10.354334911713243, + "language_loss": 0.90822387, + "learning_rate": 3.231213827702462e-06, + "loss": 0.92757869, + "num_input_tokens_seen": 110802545, + "router_z_loss_clip": 3.20117188, + "router_z_loss_mlp": 0.34545898, + "step": 5160, + "time_per_iteration": 2.5971248149871826 + }, + { + "auxiliary_loss_clip": 0.01610218, + "auxiliary_loss_mlp": 0.00248509, + "balance_loss_clip": 1.30056322, + "balance_loss_mlp": 0.21699011, + "epoch": 0.31029610701938976, + "flos": 22265405568000.0, + "grad_norm": 5.864630786557637, + "language_loss": 0.83546948, + "learning_rate": 3.230906887766584e-06, + "loss": 0.85405678, + "num_input_tokens_seen": 110820265, + "router_z_loss_clip": 3.09375, + "router_z_loss_mlp": 0.31542969, + "step": 5161, + "time_per_iteration": 2.631803512573242 + }, + { + "auxiliary_loss_clip": 0.01622129, + "auxiliary_loss_mlp": 0.00274063, + "balance_loss_clip": 1.30049276, + "balance_loss_mlp": 0.23937286, + "epoch": 0.3103562302720577, + "flos": 20805349536000.0, + "grad_norm": 2.371181334548887, + "language_loss": 0.9048751, + "learning_rate": 3.2305999011545924e-06, + "loss": 0.92383707, + "num_input_tokens_seen": 110836195, + "router_z_loss_clip": 3.21875, + "router_z_loss_mlp": 0.34643555, + "step": 5162, + "time_per_iteration": 2.622304916381836 + }, + { + "auxiliary_loss_clip": 0.01593234, + "auxiliary_loss_mlp": 0.00251677, + "balance_loss_clip": 1.28286338, + "balance_loss_mlp": 0.2203486, + "epoch": 0.3104163535247257, + "flos": 22344158136960.0, + "grad_norm": 1.962247989747067, + "language_loss": 0.86771071, + "learning_rate": 3.2302928678781295e-06, + "loss": 0.88615984, + "num_input_tokens_seen": 110856420, + "router_z_loss_clip": 3.10351562, + "router_z_loss_mlp": 0.31298828, + "step": 5163, + "time_per_iteration": 2.640636682510376 + }, + { + "auxiliary_loss_clip": 0.01611927, + "auxiliary_loss_mlp": 0.00279225, + "balance_loss_clip": 1.29241776, + "balance_loss_mlp": 0.24479714, + "epoch": 0.31047647677739365, + "flos": 21689363157120.0, + "grad_norm": 4.240850566900648, + "language_loss": 0.82602823, + "learning_rate": 3.2299857879488376e-06, + "loss": 0.84493971, + "num_input_tokens_seen": 110876650, + "router_z_loss_clip": 3.19726562, + "router_z_loss_mlp": 0.34448242, + "step": 5164, + "time_per_iteration": 2.671548366546631 + }, + { + "auxiliary_loss_clip": 0.0160132, + "auxiliary_loss_mlp": 0.00256781, + "balance_loss_clip": 1.28922963, + "balance_loss_mlp": 0.22225833, + "epoch": 0.3105366000300616, + "flos": 18917275040640.0, + "grad_norm": 77.73382144861004, + "language_loss": 0.81160223, + "learning_rate": 3.2296786613783626e-06, + "loss": 0.83018327, + "num_input_tokens_seen": 110894445, + "router_z_loss_clip": 3.11914062, + "router_z_loss_mlp": 0.34521484, + "step": 5165, + "time_per_iteration": 2.6305410861968994 + }, + { + "auxiliary_loss_clip": 0.01607762, + "auxiliary_loss_mlp": 0.00262365, + "balance_loss_clip": 1.2965076, + "balance_loss_mlp": 0.23063183, + "epoch": 0.3105967232827296, + "flos": 18260397072000.0, + "grad_norm": 1.652636143067467, + "language_loss": 0.81607097, + "learning_rate": 3.229371488178348e-06, + "loss": 0.83477223, + "num_input_tokens_seen": 110912855, + "router_z_loss_clip": 3.11132812, + "router_z_loss_mlp": 0.31713867, + "step": 5166, + "time_per_iteration": 4.0905468463897705 + }, + { + "auxiliary_loss_clip": 0.0161492, + "auxiliary_loss_mlp": 0.00258089, + "balance_loss_clip": 1.30012631, + "balance_loss_mlp": 0.22368567, + "epoch": 0.31065684653539755, + "flos": 17672144037120.0, + "grad_norm": 2.6270812145756306, + "language_loss": 0.81540453, + "learning_rate": 3.229064268360444e-06, + "loss": 0.83413464, + "num_input_tokens_seen": 110928025, + "router_z_loss_clip": 3.14648438, + "router_z_loss_mlp": 0.34423828, + "step": 5167, + "time_per_iteration": 2.6460793018341064 + }, + { + "auxiliary_loss_clip": 0.01444654, + "auxiliary_loss_mlp": 0.00094289, + "balance_loss_clip": 1.2652992, + "balance_loss_mlp": 0.08518146, + "epoch": 0.3107169697880655, + "flos": 68531996511360.0, + "grad_norm": 0.7430316260024206, + "language_loss": 0.53002322, + "learning_rate": 3.2287570019362997e-06, + "loss": 0.54541266, + "num_input_tokens_seen": 110992215, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.09130859, + "step": 5168, + "time_per_iteration": 3.1841344833374023 + }, + { + "auxiliary_loss_clip": 0.01593857, + "auxiliary_loss_mlp": 0.00264806, + "balance_loss_clip": 1.28233552, + "balance_loss_mlp": 0.23059337, + "epoch": 0.3107770930407335, + "flos": 13188733274880.0, + "grad_norm": 2.3700911615921973, + "language_loss": 0.8606289, + "learning_rate": 3.2284496889175668e-06, + "loss": 0.87921554, + "num_input_tokens_seen": 111010400, + "router_z_loss_clip": 3.11328125, + "router_z_loss_mlp": 0.34204102, + "step": 5169, + "time_per_iteration": 4.010276556015015 + }, + { + "auxiliary_loss_clip": 0.01622144, + "auxiliary_loss_mlp": 0.00272231, + "balance_loss_clip": 1.30175829, + "balance_loss_mlp": 0.23813726, + "epoch": 0.3108372162934015, + "flos": 31580849520000.0, + "grad_norm": 7.69615503459207, + "language_loss": 0.70168376, + "learning_rate": 3.2281423293158986e-06, + "loss": 0.72062755, + "num_input_tokens_seen": 111033960, + "router_z_loss_clip": 3.20898438, + "router_z_loss_mlp": 0.34082031, + "step": 5170, + "time_per_iteration": 2.753401756286621 + }, + { + "auxiliary_loss_clip": 0.01598244, + "auxiliary_loss_mlp": 0.00255349, + "balance_loss_clip": 1.29638076, + "balance_loss_mlp": 0.22157723, + "epoch": 0.31089733954606946, + "flos": 28729829266560.0, + "grad_norm": 5.918053560078943, + "language_loss": 0.84496701, + "learning_rate": 3.22783492314295e-06, + "loss": 0.86350292, + "num_input_tokens_seen": 111053265, + "router_z_loss_clip": 3.02148438, + "router_z_loss_mlp": 0.33776855, + "step": 5171, + "time_per_iteration": 2.694481134414673 + }, + { + "auxiliary_loss_clip": 0.01596969, + "auxiliary_loss_mlp": 0.00261056, + "balance_loss_clip": 1.2848289, + "balance_loss_mlp": 0.22641338, + "epoch": 0.3109574627987374, + "flos": 19683249592320.0, + "grad_norm": 16.99084222793687, + "language_loss": 0.89268345, + "learning_rate": 3.2275274704103785e-06, + "loss": 0.9112637, + "num_input_tokens_seen": 111071130, + "router_z_loss_clip": 3.12304688, + "router_z_loss_mlp": 0.34643555, + "step": 5172, + "time_per_iteration": 2.6495842933654785 + }, + { + "auxiliary_loss_clip": 0.01595888, + "auxiliary_loss_mlp": 0.00274884, + "balance_loss_clip": 1.27819419, + "balance_loss_mlp": 0.24155267, + "epoch": 0.3110175860514054, + "flos": 14683981656960.0, + "grad_norm": 88.3177320268542, + "language_loss": 0.93367279, + "learning_rate": 3.227219971129842e-06, + "loss": 0.95238054, + "num_input_tokens_seen": 111089560, + "router_z_loss_clip": 3.18164062, + "router_z_loss_mlp": 0.33349609, + "step": 5173, + "time_per_iteration": 4.041758060455322 + }, + { + "auxiliary_loss_clip": 0.01582644, + "auxiliary_loss_mlp": 0.00251323, + "balance_loss_clip": 1.27724814, + "balance_loss_mlp": 0.2194462, + "epoch": 0.31107770930407336, + "flos": 25739655724800.0, + "grad_norm": 7.448617145915292, + "language_loss": 0.88836068, + "learning_rate": 3.226912425313001e-06, + "loss": 0.90670037, + "num_input_tokens_seen": 111109960, + "router_z_loss_clip": 3.0546875, + "router_z_loss_mlp": 0.31884766, + "step": 5174, + "time_per_iteration": 2.6568233966827393 + }, + { + "auxiliary_loss_clip": 0.01599766, + "auxiliary_loss_mlp": 0.00280486, + "balance_loss_clip": 1.28613877, + "balance_loss_mlp": 0.24503322, + "epoch": 0.3111378325567413, + "flos": 19208259118080.0, + "grad_norm": 165.09183700616617, + "language_loss": 0.9080025, + "learning_rate": 3.2266048329715183e-06, + "loss": 0.92680502, + "num_input_tokens_seen": 111127960, + "router_z_loss_clip": 3.13867188, + "router_z_loss_mlp": 0.35449219, + "step": 5175, + "time_per_iteration": 2.6088955402374268 + }, + { + "auxiliary_loss_clip": 0.0157846, + "auxiliary_loss_mlp": 0.00256432, + "balance_loss_clip": 1.2823689, + "balance_loss_mlp": 0.22760725, + "epoch": 0.3111979558094093, + "flos": 23696374561920.0, + "grad_norm": 16.43425317141788, + "language_loss": 0.90578938, + "learning_rate": 3.2262971941170575e-06, + "loss": 0.92413831, + "num_input_tokens_seen": 111146730, + "router_z_loss_clip": 2.95898438, + "router_z_loss_mlp": 0.28808594, + "step": 5176, + "time_per_iteration": 2.6939449310302734 + }, + { + "auxiliary_loss_clip": 0.01556554, + "auxiliary_loss_mlp": 0.00267915, + "balance_loss_clip": 1.25739765, + "balance_loss_mlp": 0.23317754, + "epoch": 0.31125807906207725, + "flos": 21033023892480.0, + "grad_norm": 5.142249104474445, + "language_loss": 0.87042367, + "learning_rate": 3.2259895087612837e-06, + "loss": 0.8886683, + "num_input_tokens_seen": 111166295, + "router_z_loss_clip": 2.99609375, + "router_z_loss_mlp": 0.34716797, + "step": 5177, + "time_per_iteration": 2.7073006629943848 + }, + { + "auxiliary_loss_clip": 0.0156029, + "auxiliary_loss_mlp": 0.00260576, + "balance_loss_clip": 1.25952101, + "balance_loss_mlp": 0.22624353, + "epoch": 0.3113182023147452, + "flos": 23076628277760.0, + "grad_norm": 4.190502584350524, + "language_loss": 0.88362288, + "learning_rate": 3.2256817769158657e-06, + "loss": 0.90183151, + "num_input_tokens_seen": 111185665, + "router_z_loss_clip": 3.00976562, + "router_z_loss_mlp": 0.34350586, + "step": 5178, + "time_per_iteration": 2.6452620029449463 + }, + { + "auxiliary_loss_clip": 0.01568268, + "auxiliary_loss_mlp": 0.00283455, + "balance_loss_clip": 1.2657876, + "balance_loss_mlp": 0.25060102, + "epoch": 0.3113783255674132, + "flos": 11838994888320.0, + "grad_norm": 9.737249328263367, + "language_loss": 0.89126682, + "learning_rate": 3.225373998592471e-06, + "loss": 0.90978408, + "num_input_tokens_seen": 111201615, + "router_z_loss_clip": 3.0234375, + "router_z_loss_mlp": 0.32861328, + "step": 5179, + "time_per_iteration": 2.68013596534729 + }, + { + "auxiliary_loss_clip": 0.01550263, + "auxiliary_loss_mlp": 0.00265424, + "balance_loss_clip": 1.25832403, + "balance_loss_mlp": 0.23376164, + "epoch": 0.31143844882008115, + "flos": 16289547684480.0, + "grad_norm": 6.491229250897338, + "language_loss": 0.85819787, + "learning_rate": 3.2250661738027715e-06, + "loss": 0.87635469, + "num_input_tokens_seen": 111220515, + "router_z_loss_clip": 2.91992188, + "router_z_loss_mlp": 0.31665039, + "step": 5180, + "time_per_iteration": 3.9969730377197266 + }, + { + "auxiliary_loss_clip": 0.01566438, + "auxiliary_loss_mlp": 0.00253022, + "balance_loss_clip": 1.26688254, + "balance_loss_mlp": 0.22090672, + "epoch": 0.3114985720727491, + "flos": 23217792727680.0, + "grad_norm": 4.8323856041392075, + "language_loss": 0.89711767, + "learning_rate": 3.22475830255844e-06, + "loss": 0.91531229, + "num_input_tokens_seen": 111240395, + "router_z_loss_clip": 2.9921875, + "router_z_loss_mlp": 0.32128906, + "step": 5181, + "time_per_iteration": 2.6742959022521973 + }, + { + "auxiliary_loss_clip": 0.015453, + "auxiliary_loss_mlp": 0.00235394, + "balance_loss_clip": 1.25412965, + "balance_loss_mlp": 0.2081185, + "epoch": 0.3115586953254171, + "flos": 30044626698240.0, + "grad_norm": 3.27502853085985, + "language_loss": 0.82114053, + "learning_rate": 3.2244503848711516e-06, + "loss": 0.83894742, + "num_input_tokens_seen": 111261100, + "router_z_loss_clip": 2.90429688, + "router_z_loss_mlp": 0.27270508, + "step": 5182, + "time_per_iteration": 2.704859733581543 + }, + { + "auxiliary_loss_clip": 0.01585708, + "auxiliary_loss_mlp": 0.00295127, + "balance_loss_clip": 1.28105307, + "balance_loss_mlp": 0.26184365, + "epoch": 0.3116188185780851, + "flos": 25666326109440.0, + "grad_norm": 5.1830296074157145, + "language_loss": 0.78651512, + "learning_rate": 3.2241424207525815e-06, + "loss": 0.80532348, + "num_input_tokens_seen": 111281320, + "router_z_loss_clip": 3.046875, + "router_z_loss_mlp": 0.33276367, + "step": 5183, + "time_per_iteration": 2.685634136199951 + }, + { + "auxiliary_loss_clip": 0.01535261, + "auxiliary_loss_mlp": 0.00063168, + "balance_loss_clip": 1.34594262, + "balance_loss_mlp": 0.05572958, + "epoch": 0.31167894183075306, + "flos": 69510058917120.0, + "grad_norm": 0.9303002617195502, + "language_loss": 0.58773911, + "learning_rate": 3.223834410214408e-06, + "loss": 0.60372341, + "num_input_tokens_seen": 111341405, + "router_z_loss_clip": 1.890625, + "router_z_loss_mlp": 0.07421875, + "step": 5184, + "time_per_iteration": 3.123053789138794 + }, + { + "auxiliary_loss_clip": 0.0156757, + "auxiliary_loss_mlp": 0.00304107, + "balance_loss_clip": 1.26794672, + "balance_loss_mlp": 0.2717773, + "epoch": 0.31173906508342103, + "flos": 14939845211520.0, + "grad_norm": 9.75270172430182, + "language_loss": 0.77644271, + "learning_rate": 3.223526353268311e-06, + "loss": 0.79515946, + "num_input_tokens_seen": 111358975, + "router_z_loss_clip": 2.99804688, + "router_z_loss_mlp": 0.32324219, + "step": 5185, + "time_per_iteration": 2.729487419128418 + }, + { + "auxiliary_loss_clip": 0.01583509, + "auxiliary_loss_mlp": 0.0028415, + "balance_loss_clip": 1.27464485, + "balance_loss_mlp": 0.25217772, + "epoch": 0.311799188336089, + "flos": 16176033728640.0, + "grad_norm": 11.919077637816148, + "language_loss": 0.72948694, + "learning_rate": 3.2232182499259725e-06, + "loss": 0.74816352, + "num_input_tokens_seen": 111375845, + "router_z_loss_clip": 3.09179688, + "router_z_loss_mlp": 0.31982422, + "step": 5186, + "time_per_iteration": 2.6438100337982178 + }, + { + "auxiliary_loss_clip": 0.01582874, + "auxiliary_loss_mlp": 0.00306536, + "balance_loss_clip": 1.27580512, + "balance_loss_mlp": 0.27203709, + "epoch": 0.31185931158875696, + "flos": 25009627708800.0, + "grad_norm": 32.8214145283845, + "language_loss": 0.94594038, + "learning_rate": 3.2229101001990747e-06, + "loss": 0.96483451, + "num_input_tokens_seen": 111394150, + "router_z_loss_clip": 3.07421875, + "router_z_loss_mlp": 0.34521484, + "step": 5187, + "time_per_iteration": 2.643606662750244 + }, + { + "auxiliary_loss_clip": 0.01567998, + "auxiliary_loss_mlp": 0.00287149, + "balance_loss_clip": 1.27199888, + "balance_loss_mlp": 0.25738221, + "epoch": 0.3119194348414249, + "flos": 37232901273600.0, + "grad_norm": 24.407413303054483, + "language_loss": 0.68841392, + "learning_rate": 3.2226019040993036e-06, + "loss": 0.70696539, + "num_input_tokens_seen": 111418355, + "router_z_loss_clip": 2.9609375, + "router_z_loss_mlp": 0.29748535, + "step": 5188, + "time_per_iteration": 2.824032783508301 + }, + { + "auxiliary_loss_clip": 0.0155469, + "auxiliary_loss_mlp": 0.00274282, + "balance_loss_clip": 1.26094031, + "balance_loss_mlp": 0.24314414, + "epoch": 0.3119795580940929, + "flos": 15012779777280.0, + "grad_norm": 5.081758967932617, + "language_loss": 0.90469623, + "learning_rate": 3.222293661638346e-06, + "loss": 0.92298603, + "num_input_tokens_seen": 111435445, + "router_z_loss_clip": 2.93554688, + "router_z_loss_mlp": 0.3112793, + "step": 5189, + "time_per_iteration": 2.6271965503692627 + }, + { + "auxiliary_loss_clip": 0.01538654, + "auxiliary_loss_mlp": 0.00280739, + "balance_loss_clip": 1.25239718, + "balance_loss_mlp": 0.25172368, + "epoch": 0.31203968134676086, + "flos": 15998168557440.0, + "grad_norm": 6.4056594205872734, + "language_loss": 0.85207421, + "learning_rate": 3.22198537282789e-06, + "loss": 0.87026817, + "num_input_tokens_seen": 111453430, + "router_z_loss_clip": 2.86328125, + "router_z_loss_mlp": 0.29003906, + "step": 5190, + "time_per_iteration": 2.6388747692108154 + }, + { + "auxiliary_loss_clip": 0.01551084, + "auxiliary_loss_mlp": 0.00280086, + "balance_loss_clip": 1.2560811, + "balance_loss_mlp": 0.24911526, + "epoch": 0.3120998045994288, + "flos": 23837359443840.0, + "grad_norm": 1.761036811608107, + "language_loss": 0.80473322, + "learning_rate": 3.2216770376796262e-06, + "loss": 0.8230449, + "num_input_tokens_seen": 111475325, + "router_z_loss_clip": 2.953125, + "router_z_loss_mlp": 0.30932617, + "step": 5191, + "time_per_iteration": 2.6815295219421387 + }, + { + "auxiliary_loss_clip": 0.01515095, + "auxiliary_loss_mlp": 0.00046679, + "balance_loss_clip": 1.33716488, + "balance_loss_mlp": 0.04088529, + "epoch": 0.3121599278520968, + "flos": 69184205712000.0, + "grad_norm": 0.871003740614933, + "language_loss": 0.64012301, + "learning_rate": 3.221368656205247e-06, + "loss": 0.65574074, + "num_input_tokens_seen": 111533960, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.05786133, + "step": 5192, + "time_per_iteration": 3.226191520690918 + }, + { + "auxiliary_loss_clip": 0.01570018, + "auxiliary_loss_mlp": 0.003037, + "balance_loss_clip": 1.26493716, + "balance_loss_mlp": 0.26989177, + "epoch": 0.31222005110476475, + "flos": 23806368984960.0, + "grad_norm": 18.05986348928334, + "language_loss": 0.86403871, + "learning_rate": 3.221060228416446e-06, + "loss": 0.8827759, + "num_input_tokens_seen": 111554055, + "router_z_loss_clip": 3.04882812, + "router_z_loss_mlp": 0.33813477, + "step": 5193, + "time_per_iteration": 2.6632587909698486 + }, + { + "auxiliary_loss_clip": 0.01566551, + "auxiliary_loss_mlp": 0.0029268, + "balance_loss_clip": 1.26340222, + "balance_loss_mlp": 0.25970715, + "epoch": 0.3122801743574327, + "flos": 25226132935680.0, + "grad_norm": 2.5964703345403053, + "language_loss": 0.80805731, + "learning_rate": 3.2207517543249183e-06, + "loss": 0.82664961, + "num_input_tokens_seen": 111574305, + "router_z_loss_clip": 3.03320312, + "router_z_loss_mlp": 0.32983398, + "step": 5194, + "time_per_iteration": 2.6951637268066406 + }, + { + "auxiliary_loss_clip": 0.01575031, + "auxiliary_loss_mlp": 0.00305042, + "balance_loss_clip": 1.27567017, + "balance_loss_mlp": 0.27400008, + "epoch": 0.3123402976101007, + "flos": 22966490200320.0, + "grad_norm": 11.33058440580533, + "language_loss": 0.81282032, + "learning_rate": 3.2204432339423616e-06, + "loss": 0.83162105, + "num_input_tokens_seen": 111595680, + "router_z_loss_clip": 2.99414062, + "router_z_loss_mlp": 0.3104248, + "step": 5195, + "time_per_iteration": 2.6969006061553955 + }, + { + "auxiliary_loss_clip": 0.01573573, + "auxiliary_loss_mlp": 0.00311501, + "balance_loss_clip": 1.27441549, + "balance_loss_mlp": 0.27688217, + "epoch": 0.3124004208627687, + "flos": 25192089820800.0, + "grad_norm": 1.8002084226838486, + "language_loss": 0.8360076, + "learning_rate": 3.220134667280476e-06, + "loss": 0.85485834, + "num_input_tokens_seen": 111618135, + "router_z_loss_clip": 2.9921875, + "router_z_loss_mlp": 0.34594727, + "step": 5196, + "time_per_iteration": 2.687711715698242 + }, + { + "auxiliary_loss_clip": 0.01489779, + "auxiliary_loss_mlp": 0.00045197, + "balance_loss_clip": 1.31168723, + "balance_loss_mlp": 0.03856891, + "epoch": 0.31246054411543667, + "flos": 67485165517440.0, + "grad_norm": 0.7982790604436729, + "language_loss": 0.54670632, + "learning_rate": 3.2198260543509613e-06, + "loss": 0.56205606, + "num_input_tokens_seen": 111682220, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.06640625, + "step": 5197, + "time_per_iteration": 3.174241065979004 + }, + { + "auxiliary_loss_clip": 0.01565524, + "auxiliary_loss_mlp": 0.00299693, + "balance_loss_clip": 1.2709527, + "balance_loss_mlp": 0.2678169, + "epoch": 0.31252066736810463, + "flos": 17858520731520.0, + "grad_norm": 98.707273646863, + "language_loss": 0.73054367, + "learning_rate": 3.21951739516552e-06, + "loss": 0.74919581, + "num_input_tokens_seen": 111700815, + "router_z_loss_clip": 2.94726562, + "router_z_loss_mlp": 0.31933594, + "step": 5198, + "time_per_iteration": 2.612720251083374 + }, + { + "auxiliary_loss_clip": 0.0159603, + "auxiliary_loss_mlp": 0.00361434, + "balance_loss_clip": 1.28628087, + "balance_loss_mlp": 0.32393116, + "epoch": 0.3125807906207726, + "flos": 18475034791680.0, + "grad_norm": 32.72477416310552, + "language_loss": 0.77300107, + "learning_rate": 3.219208689735857e-06, + "loss": 0.79257572, + "num_input_tokens_seen": 111718195, + "router_z_loss_clip": 3.09765625, + "router_z_loss_mlp": 0.375, + "step": 5199, + "time_per_iteration": 2.621884346008301 + }, + { + "auxiliary_loss_clip": 0.01573598, + "auxiliary_loss_mlp": 0.00330266, + "balance_loss_clip": 1.26907074, + "balance_loss_mlp": 0.2964586, + "epoch": 0.31264091387344056, + "flos": 18946541646720.0, + "grad_norm": 3.240837301615117, + "language_loss": 0.8448236, + "learning_rate": 3.2188999380736785e-06, + "loss": 0.86386216, + "num_input_tokens_seen": 111734440, + "router_z_loss_clip": 3.04882812, + "router_z_loss_mlp": 0.33813477, + "step": 5200, + "time_per_iteration": 2.6037113666534424 + }, + { + "auxiliary_loss_clip": 0.01545537, + "auxiliary_loss_mlp": 0.00282956, + "balance_loss_clip": 1.25757027, + "balance_loss_mlp": 0.25184235, + "epoch": 0.3127010371261085, + "flos": 21468512384640.0, + "grad_norm": 9.097104911077793, + "language_loss": 0.90731728, + "learning_rate": 3.2185911401906917e-06, + "loss": 0.92560214, + "num_input_tokens_seen": 111751960, + "router_z_loss_clip": 2.88085938, + "router_z_loss_mlp": 0.31103516, + "step": 5201, + "time_per_iteration": 2.650531768798828 + }, + { + "auxiliary_loss_clip": 0.01587552, + "auxiliary_loss_mlp": 0.00345337, + "balance_loss_clip": 1.27403831, + "balance_loss_mlp": 0.30940688, + "epoch": 0.3127611603787765, + "flos": 15336047203200.0, + "grad_norm": 57.814239026403655, + "language_loss": 0.74651515, + "learning_rate": 3.2182822960986072e-06, + "loss": 0.76584399, + "num_input_tokens_seen": 111769585, + "router_z_loss_clip": 3.13671875, + "router_z_loss_mlp": 0.35961914, + "step": 5202, + "time_per_iteration": 2.6267590522766113 + }, + { + "auxiliary_loss_clip": 0.01562566, + "auxiliary_loss_mlp": 0.0034026, + "balance_loss_clip": 1.25439453, + "balance_loss_mlp": 0.30734628, + "epoch": 0.31282128363144446, + "flos": 17602980399360.0, + "grad_norm": 40.98880632081559, + "language_loss": 0.90654284, + "learning_rate": 3.2179734058091358e-06, + "loss": 0.92557114, + "num_input_tokens_seen": 111787880, + "router_z_loss_clip": 3.08789062, + "router_z_loss_mlp": 0.32897949, + "step": 5203, + "time_per_iteration": 2.608332633972168 + }, + { + "auxiliary_loss_clip": 0.01585177, + "auxiliary_loss_mlp": 0.00347937, + "balance_loss_clip": 1.27731025, + "balance_loss_mlp": 0.31501126, + "epoch": 0.3128814068841124, + "flos": 26756753235840.0, + "grad_norm": 27.07678771875222, + "language_loss": 0.67452812, + "learning_rate": 3.2176644693339913e-06, + "loss": 0.69385922, + "num_input_tokens_seen": 111805950, + "router_z_loss_clip": 3.078125, + "router_z_loss_mlp": 0.32910156, + "step": 5204, + "time_per_iteration": 2.82592511177063 + }, + { + "auxiliary_loss_clip": 0.01560347, + "auxiliary_loss_mlp": 0.00336455, + "balance_loss_clip": 1.26175284, + "balance_loss_mlp": 0.30499542, + "epoch": 0.3129415301367804, + "flos": 22272372806400.0, + "grad_norm": 5.878566828184419, + "language_loss": 0.7148664, + "learning_rate": 3.217355486684887e-06, + "loss": 0.73383445, + "num_input_tokens_seen": 111826135, + "router_z_loss_clip": 2.984375, + "router_z_loss_mlp": 0.3145752, + "step": 5205, + "time_per_iteration": 2.696436882019043 + }, + { + "auxiliary_loss_clip": 0.0156741, + "auxiliary_loss_mlp": 0.00351729, + "balance_loss_clip": 1.25480759, + "balance_loss_mlp": 0.31663379, + "epoch": 0.31300165338944835, + "flos": 26464907232000.0, + "grad_norm": 236.6667865399722, + "language_loss": 0.80516785, + "learning_rate": 3.2170464578735414e-06, + "loss": 0.82435924, + "num_input_tokens_seen": 111844700, + "router_z_loss_clip": 3.12890625, + "router_z_loss_mlp": 0.35107422, + "step": 5206, + "time_per_iteration": 2.6967172622680664 + }, + { + "auxiliary_loss_clip": 0.01570888, + "auxiliary_loss_mlp": 0.00323054, + "balance_loss_clip": 1.27014577, + "balance_loss_mlp": 0.29146352, + "epoch": 0.3130617766421163, + "flos": 21944652094080.0, + "grad_norm": 66.44320519251114, + "language_loss": 0.9093442, + "learning_rate": 3.216737382911672e-06, + "loss": 0.92828369, + "num_input_tokens_seen": 111861585, + "router_z_loss_clip": 3.0078125, + "router_z_loss_mlp": 0.31591797, + "step": 5207, + "time_per_iteration": 2.633481979370117 + }, + { + "auxiliary_loss_clip": 0.01564815, + "auxiliary_loss_mlp": 0.00319178, + "balance_loss_clip": 1.25893354, + "balance_loss_mlp": 0.28665805, + "epoch": 0.3131218998947843, + "flos": 23292774368640.0, + "grad_norm": 2.0333511446016277, + "language_loss": 0.76793182, + "learning_rate": 3.216428261810999e-06, + "loss": 0.78677177, + "num_input_tokens_seen": 111882950, + "router_z_loss_clip": 3.05859375, + "router_z_loss_mlp": 0.32495117, + "step": 5208, + "time_per_iteration": 4.243360996246338 + }, + { + "auxiliary_loss_clip": 0.01576431, + "auxiliary_loss_mlp": 0.00353737, + "balance_loss_clip": 1.27273607, + "balance_loss_mlp": 0.31876129, + "epoch": 0.3131820231474523, + "flos": 21139642437120.0, + "grad_norm": 187.71046542974085, + "language_loss": 0.80423236, + "learning_rate": 3.2161190945832445e-06, + "loss": 0.82353407, + "num_input_tokens_seen": 111901640, + "router_z_loss_clip": 3.03710938, + "router_z_loss_mlp": 0.34960938, + "step": 5209, + "time_per_iteration": 2.6728968620300293 + }, + { + "auxiliary_loss_clip": 0.01552862, + "auxiliary_loss_mlp": 0.00331102, + "balance_loss_clip": 1.2512486, + "balance_loss_mlp": 0.30094165, + "epoch": 0.31324214640012027, + "flos": 23909863046400.0, + "grad_norm": 9.54085365953534, + "language_loss": 0.82424819, + "learning_rate": 3.2158098812401325e-06, + "loss": 0.84308779, + "num_input_tokens_seen": 111919615, + "router_z_loss_clip": 3.015625, + "router_z_loss_mlp": 0.30175781, + "step": 5210, + "time_per_iteration": 2.7409324645996094 + }, + { + "auxiliary_loss_clip": 0.01541549, + "auxiliary_loss_mlp": 0.00326715, + "balance_loss_clip": 1.24574471, + "balance_loss_mlp": 0.2947669, + "epoch": 0.31330226965278823, + "flos": 22236929061120.0, + "grad_norm": 72.44506270297991, + "language_loss": 0.85031897, + "learning_rate": 3.2155006217933874e-06, + "loss": 0.86900163, + "num_input_tokens_seen": 111938485, + "router_z_loss_clip": 2.96289062, + "router_z_loss_mlp": 0.31958008, + "step": 5211, + "time_per_iteration": 4.0619893074035645 + }, + { + "auxiliary_loss_clip": 0.01550021, + "auxiliary_loss_mlp": 0.0034268, + "balance_loss_clip": 1.24297941, + "balance_loss_mlp": 0.30958709, + "epoch": 0.3133623929054562, + "flos": 19753993428480.0, + "grad_norm": 17.983163519680392, + "language_loss": 0.83994037, + "learning_rate": 3.2151913162547367e-06, + "loss": 0.85886741, + "num_input_tokens_seen": 111956425, + "router_z_loss_clip": 3.07226562, + "router_z_loss_mlp": 0.33081055, + "step": 5212, + "time_per_iteration": 2.65610408782959 + }, + { + "auxiliary_loss_clip": 0.01588319, + "auxiliary_loss_mlp": 0.00364967, + "balance_loss_clip": 1.27769744, + "balance_loss_mlp": 0.32963297, + "epoch": 0.31342251615812416, + "flos": 27162256849920.0, + "grad_norm": 10.277286245749874, + "language_loss": 0.79345858, + "learning_rate": 3.2148819646359097e-06, + "loss": 0.8129915, + "num_input_tokens_seen": 111975915, + "router_z_loss_clip": 3.10742188, + "router_z_loss_mlp": 0.35327148, + "step": 5213, + "time_per_iteration": 2.7141189575195312 + }, + { + "auxiliary_loss_clip": 0.01583919, + "auxiliary_loss_mlp": 0.00353294, + "balance_loss_clip": 1.26442289, + "balance_loss_mlp": 0.31927124, + "epoch": 0.31348263941079213, + "flos": 20229809915520.0, + "grad_norm": 8.17063003571594, + "language_loss": 0.84714413, + "learning_rate": 3.2145725669486374e-06, + "loss": 0.86651623, + "num_input_tokens_seen": 111995055, + "router_z_loss_clip": 3.19140625, + "router_z_loss_mlp": 0.34033203, + "step": 5214, + "time_per_iteration": 2.6915481090545654 + }, + { + "auxiliary_loss_clip": 0.01561726, + "auxiliary_loss_mlp": 0.00375722, + "balance_loss_clip": 1.26056516, + "balance_loss_mlp": 0.34355953, + "epoch": 0.3135427626634601, + "flos": 24607643627520.0, + "grad_norm": 6.32314438572992, + "language_loss": 0.88344622, + "learning_rate": 3.2142631232046517e-06, + "loss": 0.90282071, + "num_input_tokens_seen": 112015830, + "router_z_loss_clip": 3.015625, + "router_z_loss_mlp": 0.3215332, + "step": 5215, + "time_per_iteration": 4.2121241092681885 + }, + { + "auxiliary_loss_clip": 0.01565273, + "auxiliary_loss_mlp": 0.00368261, + "balance_loss_clip": 1.25413227, + "balance_loss_mlp": 0.33164024, + "epoch": 0.31360288591612806, + "flos": 20959873845120.0, + "grad_norm": 8.893307872385545, + "language_loss": 0.85902059, + "learning_rate": 3.213953633415686e-06, + "loss": 0.87835598, + "num_input_tokens_seen": 112035065, + "router_z_loss_clip": 3.109375, + "router_z_loss_mlp": 0.36621094, + "step": 5216, + "time_per_iteration": 2.6484603881835938 + }, + { + "auxiliary_loss_clip": 0.0159409, + "auxiliary_loss_mlp": 0.00411217, + "balance_loss_clip": 1.27027833, + "balance_loss_mlp": 0.37330818, + "epoch": 0.313663009168796, + "flos": 26980513009920.0, + "grad_norm": 18.69557172570863, + "language_loss": 0.74082577, + "learning_rate": 3.213644097593477e-06, + "loss": 0.76087892, + "num_input_tokens_seen": 112058405, + "router_z_loss_clip": 3.24023438, + "router_z_loss_mlp": 0.37915039, + "step": 5217, + "time_per_iteration": 2.6815922260284424 + }, + { + "auxiliary_loss_clip": 0.01596499, + "auxiliary_loss_mlp": 0.00349597, + "balance_loss_clip": 1.27640843, + "balance_loss_mlp": 0.31745842, + "epoch": 0.313723132421464, + "flos": 18040911016320.0, + "grad_norm": 11.705595127272716, + "language_loss": 0.85644126, + "learning_rate": 3.2133345157497624e-06, + "loss": 0.87590218, + "num_input_tokens_seen": 112076420, + "router_z_loss_clip": 3.19921875, + "router_z_loss_mlp": 0.32128906, + "step": 5218, + "time_per_iteration": 2.6391115188598633 + }, + { + "auxiliary_loss_clip": 0.01593821, + "auxiliary_loss_mlp": 0.00395909, + "balance_loss_clip": 1.27681339, + "balance_loss_mlp": 0.35903746, + "epoch": 0.31378325567413196, + "flos": 22488913946880.0, + "grad_norm": 15.874475603497084, + "language_loss": 0.77620637, + "learning_rate": 3.2130248878962813e-06, + "loss": 0.79610372, + "num_input_tokens_seen": 112090775, + "router_z_loss_clip": 3.171875, + "router_z_loss_mlp": 0.36877441, + "step": 5219, + "time_per_iteration": 2.6229169368743896 + }, + { + "auxiliary_loss_clip": 0.016061, + "auxiliary_loss_mlp": 0.00378652, + "balance_loss_clip": 1.27729166, + "balance_loss_mlp": 0.34517795, + "epoch": 0.3138433789267999, + "flos": 22419247518720.0, + "grad_norm": 28.311932080484016, + "language_loss": 0.85818166, + "learning_rate": 3.2127152140447747e-06, + "loss": 0.87802917, + "num_input_tokens_seen": 112110980, + "router_z_loss_clip": 3.2890625, + "router_z_loss_mlp": 0.3347168, + "step": 5220, + "time_per_iteration": 2.711939811706543 + }, + { + "auxiliary_loss_clip": 0.01582905, + "auxiliary_loss_mlp": 0.003944, + "balance_loss_clip": 1.26456118, + "balance_loss_mlp": 0.35842302, + "epoch": 0.3139035021794679, + "flos": 13005912026880.0, + "grad_norm": 3.830634630104383, + "language_loss": 0.7972911, + "learning_rate": 3.212405494206986e-06, + "loss": 0.81706417, + "num_input_tokens_seen": 112129020, + "router_z_loss_clip": 3.18554688, + "router_z_loss_mlp": 0.35986328, + "step": 5221, + "time_per_iteration": 2.6668009757995605 + }, + { + "auxiliary_loss_clip": 0.0158065, + "auxiliary_loss_mlp": 0.00342856, + "balance_loss_clip": 1.26559639, + "balance_loss_mlp": 0.31150359, + "epoch": 0.31396362543213585, + "flos": 16945994689920.0, + "grad_norm": 15.914581925145113, + "language_loss": 0.88432026, + "learning_rate": 3.2120957283946588e-06, + "loss": 0.90355539, + "num_input_tokens_seen": 112147865, + "router_z_loss_clip": 3.1484375, + "router_z_loss_mlp": 0.31347656, + "step": 5222, + "time_per_iteration": 3.961674213409424 + }, + { + "auxiliary_loss_clip": 0.01605619, + "auxiliary_loss_mlp": 0.00418937, + "balance_loss_clip": 1.2729243, + "balance_loss_mlp": 0.37990832, + "epoch": 0.31402374868480387, + "flos": 20156731695360.0, + "grad_norm": 8.453588701086954, + "language_loss": 0.77383476, + "learning_rate": 3.2117859166195407e-06, + "loss": 0.79408026, + "num_input_tokens_seen": 112166745, + "router_z_loss_clip": 3.33007812, + "router_z_loss_mlp": 0.39013672, + "step": 5223, + "time_per_iteration": 2.6180508136749268 + }, + { + "auxiliary_loss_clip": 0.0157308, + "auxiliary_loss_mlp": 0.00377336, + "balance_loss_clip": 1.25338483, + "balance_loss_mlp": 0.3448635, + "epoch": 0.31408387193747184, + "flos": 21251073404160.0, + "grad_norm": 17.81485908202575, + "language_loss": 0.85841262, + "learning_rate": 3.211476058893379e-06, + "loss": 0.87791681, + "num_input_tokens_seen": 112185895, + "router_z_loss_clip": 3.19921875, + "router_z_loss_mlp": 0.32446289, + "step": 5224, + "time_per_iteration": 2.6553549766540527 + }, + { + "auxiliary_loss_clip": 0.01595538, + "auxiliary_loss_mlp": 0.00436582, + "balance_loss_clip": 1.26422858, + "balance_loss_mlp": 0.39993745, + "epoch": 0.3141439951901398, + "flos": 27484267299840.0, + "grad_norm": 13.449480919213634, + "language_loss": 0.63997406, + "learning_rate": 3.2111661552279243e-06, + "loss": 0.66029525, + "num_input_tokens_seen": 112204465, + "router_z_loss_clip": 3.31054688, + "router_z_loss_mlp": 0.36645508, + "step": 5225, + "time_per_iteration": 2.7360916137695312 + }, + { + "auxiliary_loss_clip": 0.01581466, + "auxiliary_loss_mlp": 0.00378835, + "balance_loss_clip": 1.26590276, + "balance_loss_mlp": 0.34519389, + "epoch": 0.31420411844280777, + "flos": 17852235851520.0, + "grad_norm": 4.747996672424668, + "language_loss": 0.88170809, + "learning_rate": 3.2108562056349273e-06, + "loss": 0.90131104, + "num_input_tokens_seen": 112221635, + "router_z_loss_clip": 3.15820312, + "router_z_loss_mlp": 0.33618164, + "step": 5226, + "time_per_iteration": 2.7337799072265625 + }, + { + "auxiliary_loss_clip": 0.0160751, + "auxiliary_loss_mlp": 0.00389138, + "balance_loss_clip": 1.27733278, + "balance_loss_mlp": 0.35280341, + "epoch": 0.31426424169547573, + "flos": 21616967295360.0, + "grad_norm": 24.80659790215287, + "language_loss": 0.81203043, + "learning_rate": 3.210546210126141e-06, + "loss": 0.83199692, + "num_input_tokens_seen": 112241240, + "router_z_loss_clip": 3.30273438, + "router_z_loss_mlp": 0.36328125, + "step": 5227, + "time_per_iteration": 2.700890302658081 + }, + { + "auxiliary_loss_clip": 0.01614096, + "auxiliary_loss_mlp": 0.00414152, + "balance_loss_clip": 1.28742194, + "balance_loss_mlp": 0.37822258, + "epoch": 0.3143243649481437, + "flos": 30920631586560.0, + "grad_norm": 2.9576866032803193, + "language_loss": 0.74516571, + "learning_rate": 3.2102361687133213e-06, + "loss": 0.76544815, + "num_input_tokens_seen": 112262350, + "router_z_loss_clip": 3.26953125, + "router_z_loss_mlp": 0.359375, + "step": 5228, + "time_per_iteration": 2.719413995742798 + }, + { + "auxiliary_loss_clip": 0.01597505, + "auxiliary_loss_mlp": 0.00366744, + "balance_loss_clip": 1.27437544, + "balance_loss_mlp": 0.33501083, + "epoch": 0.31438448820081166, + "flos": 22821411168000.0, + "grad_norm": 25.19554704668356, + "language_loss": 0.86171788, + "learning_rate": 3.2099260814082254e-06, + "loss": 0.88136041, + "num_input_tokens_seen": 112283710, + "router_z_loss_clip": 3.234375, + "router_z_loss_mlp": 0.31726074, + "step": 5229, + "time_per_iteration": 2.6470229625701904 + }, + { + "auxiliary_loss_clip": 0.01574637, + "auxiliary_loss_mlp": 0.00388222, + "balance_loss_clip": 1.26099336, + "balance_loss_mlp": 0.35570192, + "epoch": 0.3144446114534796, + "flos": 23292127923840.0, + "grad_norm": 12.743496154067719, + "language_loss": 0.75659299, + "learning_rate": 3.209615948222611e-06, + "loss": 0.77622151, + "num_input_tokens_seen": 112304285, + "router_z_loss_clip": 3.1328125, + "router_z_loss_mlp": 0.32519531, + "step": 5230, + "time_per_iteration": 2.6802783012390137 + }, + { + "auxiliary_loss_clip": 0.01570047, + "auxiliary_loss_mlp": 0.00407857, + "balance_loss_clip": 1.24995947, + "balance_loss_mlp": 0.37209386, + "epoch": 0.3145047347061476, + "flos": 31355976424320.0, + "grad_norm": 62.039945714098884, + "language_loss": 0.83981675, + "learning_rate": 3.209305769168239e-06, + "loss": 0.85959578, + "num_input_tokens_seen": 112325110, + "router_z_loss_clip": 3.19726562, + "router_z_loss_mlp": 0.35766602, + "step": 5231, + "time_per_iteration": 2.8813371658325195 + }, + { + "auxiliary_loss_clip": 0.01588985, + "auxiliary_loss_mlp": 0.00363853, + "balance_loss_clip": 1.26517534, + "balance_loss_mlp": 0.33123687, + "epoch": 0.31456485795881556, + "flos": 10889552643840.0, + "grad_norm": 14.596569994018306, + "language_loss": 0.9090485, + "learning_rate": 3.2089955442568704e-06, + "loss": 0.92857683, + "num_input_tokens_seen": 112339855, + "router_z_loss_clip": 3.23828125, + "router_z_loss_mlp": 0.32617188, + "step": 5232, + "time_per_iteration": 2.7140023708343506 + }, + { + "auxiliary_loss_clip": 0.01554126, + "auxiliary_loss_mlp": 0.00342255, + "balance_loss_clip": 1.23788214, + "balance_loss_mlp": 0.30972308, + "epoch": 0.3146249812114835, + "flos": 17092438439040.0, + "grad_norm": 5.590829153433804, + "language_loss": 0.85758239, + "learning_rate": 3.2086852735002692e-06, + "loss": 0.87654614, + "num_input_tokens_seen": 112358480, + "router_z_loss_clip": 3.16601562, + "router_z_loss_mlp": 0.32531738, + "step": 5233, + "time_per_iteration": 2.6448991298675537 + }, + { + "auxiliary_loss_clip": 0.01608428, + "auxiliary_loss_mlp": 0.00388972, + "balance_loss_clip": 1.26924682, + "balance_loss_mlp": 0.35392439, + "epoch": 0.3146851044641515, + "flos": 55291442889600.0, + "grad_norm": 33.10692676712518, + "language_loss": 0.77559513, + "learning_rate": 3.2083749569102024e-06, + "loss": 0.79556912, + "num_input_tokens_seen": 112382350, + "router_z_loss_clip": 3.390625, + "router_z_loss_mlp": 0.3503418, + "step": 5234, + "time_per_iteration": 2.9477930068969727 + }, + { + "auxiliary_loss_clip": 0.01582194, + "auxiliary_loss_mlp": 0.00386387, + "balance_loss_clip": 1.25375581, + "balance_loss_mlp": 0.3520309, + "epoch": 0.31474522771681945, + "flos": 27015884928000.0, + "grad_norm": 1.8445233677597146, + "language_loss": 0.78115696, + "learning_rate": 3.2080645944984356e-06, + "loss": 0.8008427, + "num_input_tokens_seen": 112400260, + "router_z_loss_clip": 3.28515625, + "router_z_loss_mlp": 0.34375, + "step": 5235, + "time_per_iteration": 2.7089805603027344 + }, + { + "auxiliary_loss_clip": 0.0155209, + "auxiliary_loss_mlp": 0.00356474, + "balance_loss_clip": 1.23748267, + "balance_loss_mlp": 0.32452568, + "epoch": 0.3148053509694875, + "flos": 21251935330560.0, + "grad_norm": 114.95642334395872, + "language_loss": 0.84551239, + "learning_rate": 3.2077541862767384e-06, + "loss": 0.86459804, + "num_input_tokens_seen": 112419400, + "router_z_loss_clip": 3.15039062, + "router_z_loss_mlp": 0.31933594, + "step": 5236, + "time_per_iteration": 2.6647696495056152 + }, + { + "auxiliary_loss_clip": 0.01583318, + "auxiliary_loss_mlp": 0.00402092, + "balance_loss_clip": 1.25083733, + "balance_loss_mlp": 0.36585268, + "epoch": 0.31486547422215544, + "flos": 31248675521280.0, + "grad_norm": 34.92864178311655, + "language_loss": 0.81239069, + "learning_rate": 3.207443732256881e-06, + "loss": 0.83224475, + "num_input_tokens_seen": 112440825, + "router_z_loss_clip": 3.32421875, + "router_z_loss_mlp": 0.36230469, + "step": 5237, + "time_per_iteration": 2.725630521774292 + }, + { + "auxiliary_loss_clip": 0.01540856, + "auxiliary_loss_mlp": 0.00340787, + "balance_loss_clip": 1.2398206, + "balance_loss_mlp": 0.31000701, + "epoch": 0.3149255974748234, + "flos": 19828615933440.0, + "grad_norm": 5.032117702576191, + "language_loss": 0.84132707, + "learning_rate": 3.2071332324506372e-06, + "loss": 0.86014354, + "num_input_tokens_seen": 112459180, + "router_z_loss_clip": 3.0078125, + "router_z_loss_mlp": 0.30773926, + "step": 5238, + "time_per_iteration": 2.6392412185668945 + }, + { + "auxiliary_loss_clip": 0.01566415, + "auxiliary_loss_mlp": 0.00076315, + "balance_loss_clip": 1.36894107, + "balance_loss_mlp": 0.06372621, + "epoch": 0.31498572072749137, + "flos": 67683965339520.0, + "grad_norm": 0.8304693781228705, + "language_loss": 0.68196237, + "learning_rate": 3.2068226868697795e-06, + "loss": 0.69838965, + "num_input_tokens_seen": 112516680, + "router_z_loss_clip": 1.9765625, + "router_z_loss_mlp": 0.12597656, + "step": 5239, + "time_per_iteration": 3.1590588092803955 + }, + { + "auxiliary_loss_clip": 0.01594141, + "auxiliary_loss_mlp": 0.0041532, + "balance_loss_clip": 1.26081502, + "balance_loss_mlp": 0.37822217, + "epoch": 0.31504584398015933, + "flos": 19793136274560.0, + "grad_norm": 10.161030349840265, + "language_loss": 0.88503206, + "learning_rate": 3.2065120955260846e-06, + "loss": 0.90512669, + "num_input_tokens_seen": 112535895, + "router_z_loss_clip": 3.3359375, + "router_z_loss_mlp": 0.37109375, + "step": 5240, + "time_per_iteration": 2.6952526569366455 + }, + { + "auxiliary_loss_clip": 0.01574235, + "auxiliary_loss_mlp": 0.00390882, + "balance_loss_clip": 1.25756848, + "balance_loss_mlp": 0.3583374, + "epoch": 0.3151059672328273, + "flos": 26615409217920.0, + "grad_norm": 22.17738384463229, + "language_loss": 0.86047637, + "learning_rate": 3.2062014584313302e-06, + "loss": 0.88012755, + "num_input_tokens_seen": 112557490, + "router_z_loss_clip": 3.16601562, + "router_z_loss_mlp": 0.32519531, + "step": 5241, + "time_per_iteration": 2.7122690677642822 + }, + { + "auxiliary_loss_clip": 0.01567036, + "auxiliary_loss_mlp": 0.00388371, + "balance_loss_clip": 1.25479567, + "balance_loss_mlp": 0.35441977, + "epoch": 0.31516609048549526, + "flos": 24204438483840.0, + "grad_norm": 9.88145355077178, + "language_loss": 0.80073357, + "learning_rate": 3.2058907755972956e-06, + "loss": 0.82028764, + "num_input_tokens_seen": 112577075, + "router_z_loss_clip": 3.12695312, + "router_z_loss_mlp": 0.33935547, + "step": 5242, + "time_per_iteration": 2.7351906299591064 + }, + { + "auxiliary_loss_clip": 0.0155379, + "auxiliary_loss_mlp": 0.00366329, + "balance_loss_clip": 1.25166118, + "balance_loss_mlp": 0.33423766, + "epoch": 0.31522621373816323, + "flos": 25958710817280.0, + "grad_norm": 602.069171735207, + "language_loss": 0.79684192, + "learning_rate": 3.2055800470357626e-06, + "loss": 0.81604314, + "num_input_tokens_seen": 112597620, + "router_z_loss_clip": 3.0234375, + "router_z_loss_mlp": 0.32104492, + "step": 5243, + "time_per_iteration": 2.682337522506714 + }, + { + "auxiliary_loss_clip": 0.01585972, + "auxiliary_loss_mlp": 0.0038423, + "balance_loss_clip": 1.26024437, + "balance_loss_mlp": 0.35161415, + "epoch": 0.3152863369908312, + "flos": 21908813299200.0, + "grad_norm": 4.672459064291928, + "language_loss": 0.71679115, + "learning_rate": 3.205269272758513e-06, + "loss": 0.73649323, + "num_input_tokens_seen": 112617150, + "router_z_loss_clip": 3.26171875, + "router_z_loss_mlp": 0.3260498, + "step": 5244, + "time_per_iteration": 2.634270668029785 + }, + { + "auxiliary_loss_clip": 0.01589141, + "auxiliary_loss_mlp": 0.00387039, + "balance_loss_clip": 1.258991, + "balance_loss_mlp": 0.35361266, + "epoch": 0.31534646024349916, + "flos": 16281072074880.0, + "grad_norm": 4.733767879627094, + "language_loss": 0.96493948, + "learning_rate": 3.2049584527773313e-06, + "loss": 0.98470116, + "num_input_tokens_seen": 112631090, + "router_z_loss_clip": 3.3046875, + "router_z_loss_mlp": 0.33398438, + "step": 5245, + "time_per_iteration": 2.5917203426361084 + }, + { + "auxiliary_loss_clip": 0.01606719, + "auxiliary_loss_mlp": 0.00410772, + "balance_loss_clip": 1.27477503, + "balance_loss_mlp": 0.37610599, + "epoch": 0.3154065834961671, + "flos": 24717243000960.0, + "grad_norm": 5.6498106886401676, + "language_loss": 0.80599135, + "learning_rate": 3.2046475871040048e-06, + "loss": 0.82616627, + "num_input_tokens_seen": 112651220, + "router_z_loss_clip": 3.31835938, + "router_z_loss_mlp": 0.34643555, + "step": 5246, + "time_per_iteration": 2.662494659423828 + }, + { + "auxiliary_loss_clip": 0.01592328, + "auxiliary_loss_mlp": 0.0040225, + "balance_loss_clip": 1.2627691, + "balance_loss_mlp": 0.36639142, + "epoch": 0.3154667067488351, + "flos": 35371148469120.0, + "grad_norm": 44.193181987117775, + "language_loss": 0.66926587, + "learning_rate": 3.204336675750321e-06, + "loss": 0.68921167, + "num_input_tokens_seen": 112671560, + "router_z_loss_clip": 3.296875, + "router_z_loss_mlp": 0.35864258, + "step": 5247, + "time_per_iteration": 2.7901716232299805 + }, + { + "auxiliary_loss_clip": 0.01591615, + "auxiliary_loss_mlp": 0.00397473, + "balance_loss_clip": 1.26216781, + "balance_loss_mlp": 0.36235428, + "epoch": 0.31552683000150306, + "flos": 17456464823040.0, + "grad_norm": 3.2074012149165725, + "language_loss": 0.89550936, + "learning_rate": 3.2040257187280693e-06, + "loss": 0.91540027, + "num_input_tokens_seen": 112689790, + "router_z_loss_clip": 3.296875, + "router_z_loss_mlp": 0.35107422, + "step": 5248, + "time_per_iteration": 2.6421475410461426 + }, + { + "auxiliary_loss_clip": 0.01589114, + "auxiliary_loss_mlp": 0.00415813, + "balance_loss_clip": 1.25962782, + "balance_loss_mlp": 0.37957358, + "epoch": 0.3155869532541711, + "flos": 18405763413120.0, + "grad_norm": 3.4197484465187937, + "language_loss": 0.92576432, + "learning_rate": 3.2037147160490423e-06, + "loss": 0.9458136, + "num_input_tokens_seen": 112708265, + "router_z_loss_clip": 3.29492188, + "router_z_loss_mlp": 0.36206055, + "step": 5249, + "time_per_iteration": 2.6396310329437256 + }, + { + "auxiliary_loss_clip": 0.01595065, + "auxiliary_loss_mlp": 0.00422429, + "balance_loss_clip": 1.26636076, + "balance_loss_mlp": 0.37989467, + "epoch": 0.31564707650683904, + "flos": 21579763783680.0, + "grad_norm": 13.680312781915164, + "language_loss": 0.91421288, + "learning_rate": 3.2034036677250322e-06, + "loss": 0.9343878, + "num_input_tokens_seen": 112727820, + "router_z_loss_clip": 3.28710938, + "router_z_loss_mlp": 0.42553711, + "step": 5250, + "time_per_iteration": 4.33744215965271 + }, + { + "auxiliary_loss_clip": 0.01601618, + "auxiliary_loss_mlp": 0.0038637, + "balance_loss_clip": 1.2738483, + "balance_loss_mlp": 0.35163224, + "epoch": 0.315707199759507, + "flos": 21030976817280.0, + "grad_norm": 46.81800593724332, + "language_loss": 0.78657603, + "learning_rate": 3.203092573767835e-06, + "loss": 0.80645591, + "num_input_tokens_seen": 112743140, + "router_z_loss_clip": 3.27734375, + "router_z_loss_mlp": 0.34741211, + "step": 5251, + "time_per_iteration": 2.701134443283081 + }, + { + "auxiliary_loss_clip": 0.01615235, + "auxiliary_loss_mlp": 0.00393191, + "balance_loss_clip": 1.28328562, + "balance_loss_mlp": 0.35928804, + "epoch": 0.31576732301217497, + "flos": 26828861788800.0, + "grad_norm": 10.109807932609, + "language_loss": 0.83079648, + "learning_rate": 3.202781434189246e-06, + "loss": 0.85088074, + "num_input_tokens_seen": 112764705, + "router_z_loss_clip": 3.31835938, + "router_z_loss_mlp": 0.33886719, + "step": 5252, + "time_per_iteration": 2.7337260246276855 + }, + { + "auxiliary_loss_clip": 0.01575154, + "auxiliary_loss_mlp": 0.00340079, + "balance_loss_clip": 1.26022959, + "balance_loss_mlp": 0.30565113, + "epoch": 0.31582744626484294, + "flos": 22711165349760.0, + "grad_norm": 678.54343437277, + "language_loss": 0.78879476, + "learning_rate": 3.202470249001066e-06, + "loss": 0.80794716, + "num_input_tokens_seen": 112785310, + "router_z_loss_clip": 3.15039062, + "router_z_loss_mlp": 0.34448242, + "step": 5253, + "time_per_iteration": 4.194349527359009 + }, + { + "auxiliary_loss_clip": 0.01580718, + "auxiliary_loss_mlp": 0.00386436, + "balance_loss_clip": 1.26355267, + "balance_loss_mlp": 0.35246187, + "epoch": 0.3158875695175109, + "flos": 23951914894080.0, + "grad_norm": 5.299734122347078, + "language_loss": 0.79645002, + "learning_rate": 3.2021590182150924e-06, + "loss": 0.81612158, + "num_input_tokens_seen": 112802905, + "router_z_loss_clip": 3.17578125, + "router_z_loss_mlp": 0.33984375, + "step": 5254, + "time_per_iteration": 2.703737735748291 + }, + { + "auxiliary_loss_clip": 0.01587681, + "auxiliary_loss_mlp": 0.00396512, + "balance_loss_clip": 1.26049352, + "balance_loss_mlp": 0.36065397, + "epoch": 0.31594769277017887, + "flos": 13261883322240.0, + "grad_norm": 91.39716424338937, + "language_loss": 0.83851147, + "learning_rate": 3.201847741843128e-06, + "loss": 0.85835338, + "num_input_tokens_seen": 112820305, + "router_z_loss_clip": 3.26757812, + "router_z_loss_mlp": 0.35839844, + "step": 5255, + "time_per_iteration": 2.706033229827881 + }, + { + "auxiliary_loss_clip": 0.01583006, + "auxiliary_loss_mlp": 0.00340587, + "balance_loss_clip": 1.26113045, + "balance_loss_mlp": 0.30637437, + "epoch": 0.31600781602284683, + "flos": 23368258800000.0, + "grad_norm": 8.716197177860044, + "language_loss": 0.8470614, + "learning_rate": 3.2015364198969772e-06, + "loss": 0.8662973, + "num_input_tokens_seen": 112841185, + "router_z_loss_clip": 3.22265625, + "router_z_loss_mlp": 0.34204102, + "step": 5256, + "time_per_iteration": 2.7022359371185303 + }, + { + "auxiliary_loss_clip": 0.01574246, + "auxiliary_loss_mlp": 0.00321235, + "balance_loss_clip": 1.25856614, + "balance_loss_mlp": 0.29007342, + "epoch": 0.3160679392755148, + "flos": 19828580019840.0, + "grad_norm": 39.32699674244255, + "language_loss": 0.75615382, + "learning_rate": 3.2012250523884453e-06, + "loss": 0.77510864, + "num_input_tokens_seen": 112860570, + "router_z_loss_clip": 3.15625, + "router_z_loss_mlp": 0.31176758, + "step": 5257, + "time_per_iteration": 2.7090325355529785 + }, + { + "auxiliary_loss_clip": 0.015866, + "auxiliary_loss_mlp": 0.00349927, + "balance_loss_clip": 1.26771903, + "balance_loss_mlp": 0.31495139, + "epoch": 0.31612806252818276, + "flos": 20193216935040.0, + "grad_norm": 92338.76991869432, + "language_loss": 0.84649801, + "learning_rate": 3.2009136393293393e-06, + "loss": 0.86586332, + "num_input_tokens_seen": 112877975, + "router_z_loss_clip": 3.18945312, + "router_z_loss_mlp": 0.34960938, + "step": 5258, + "time_per_iteration": 4.074587106704712 + }, + { + "auxiliary_loss_clip": 0.01595256, + "auxiliary_loss_mlp": 0.00346452, + "balance_loss_clip": 1.27387345, + "balance_loss_mlp": 0.31371751, + "epoch": 0.31618818578085073, + "flos": 24235967646720.0, + "grad_norm": 25.684970184120022, + "language_loss": 0.7941674, + "learning_rate": 3.200602180731467e-06, + "loss": 0.81358457, + "num_input_tokens_seen": 112896170, + "router_z_loss_clip": 3.21484375, + "router_z_loss_mlp": 0.32763672, + "step": 5259, + "time_per_iteration": 2.63191294670105 + }, + { + "auxiliary_loss_clip": 0.01598585, + "auxiliary_loss_mlp": 0.00362275, + "balance_loss_clip": 1.27042127, + "balance_loss_mlp": 0.33101803, + "epoch": 0.3162483090335187, + "flos": 25081844002560.0, + "grad_norm": 53.44371253774358, + "language_loss": 0.72100997, + "learning_rate": 3.20029067660664e-06, + "loss": 0.74061859, + "num_input_tokens_seen": 112916180, + "router_z_loss_clip": 3.28125, + "router_z_loss_mlp": 0.31225586, + "step": 5260, + "time_per_iteration": 2.635963201522827 + }, + { + "auxiliary_loss_clip": 0.01581998, + "auxiliary_loss_mlp": 0.0036435, + "balance_loss_clip": 1.26760292, + "balance_loss_mlp": 0.32977933, + "epoch": 0.31630843228618666, + "flos": 26323383646080.0, + "grad_norm": 2.552057576980257, + "language_loss": 0.78547484, + "learning_rate": 3.1999791269666706e-06, + "loss": 0.80493832, + "num_input_tokens_seen": 112936745, + "router_z_loss_clip": 3.14257812, + "router_z_loss_mlp": 0.34570312, + "step": 5261, + "time_per_iteration": 2.6803717613220215 + }, + { + "auxiliary_loss_clip": 0.01576608, + "auxiliary_loss_mlp": 0.00195705, + "balance_loss_clip": 1.34191966, + "balance_loss_mlp": 0.18592937, + "epoch": 0.3163685555388547, + "flos": 66758441552640.0, + "grad_norm": 0.7501868637089333, + "language_loss": 0.50677478, + "learning_rate": 3.1996675318233716e-06, + "loss": 0.52449793, + "num_input_tokens_seen": 112994845, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.09765625, + "step": 5262, + "time_per_iteration": 3.128899097442627 + }, + { + "auxiliary_loss_clip": 0.01648848, + "auxiliary_loss_mlp": 0.00366361, + "balance_loss_clip": 1.31014025, + "balance_loss_mlp": 0.33312559, + "epoch": 0.31642867879152264, + "flos": 25995662933760.0, + "grad_norm": 8.061371635901653, + "language_loss": 0.89479172, + "learning_rate": 3.19935589118856e-06, + "loss": 0.91494381, + "num_input_tokens_seen": 113015125, + "router_z_loss_clip": 3.38867188, + "router_z_loss_mlp": 0.33203125, + "step": 5263, + "time_per_iteration": 2.8143084049224854 + }, + { + "auxiliary_loss_clip": 0.01585175, + "auxiliary_loss_mlp": 0.00334935, + "balance_loss_clip": 1.27115452, + "balance_loss_mlp": 0.30229563, + "epoch": 0.3164888020441906, + "flos": 25774955815680.0, + "grad_norm": 14.246709033467031, + "language_loss": 0.85983151, + "learning_rate": 3.1990442050740535e-06, + "loss": 0.87903261, + "num_input_tokens_seen": 113035535, + "router_z_loss_clip": 3.140625, + "router_z_loss_mlp": 0.32641602, + "step": 5264, + "time_per_iteration": 4.075377464294434 + }, + { + "auxiliary_loss_clip": 0.01623444, + "auxiliary_loss_mlp": 0.00379761, + "balance_loss_clip": 1.29460204, + "balance_loss_mlp": 0.34170926, + "epoch": 0.3165489252968586, + "flos": 19756220071680.0, + "grad_norm": 5.525708039561475, + "language_loss": 0.86320871, + "learning_rate": 3.19873247349167e-06, + "loss": 0.8832407, + "num_input_tokens_seen": 113052720, + "router_z_loss_clip": 3.28710938, + "router_z_loss_mlp": 0.38037109, + "step": 5265, + "time_per_iteration": 2.6252501010894775 + }, + { + "auxiliary_loss_clip": 0.01622758, + "auxiliary_loss_mlp": 0.00384697, + "balance_loss_clip": 1.29086566, + "balance_loss_mlp": 0.34714568, + "epoch": 0.31660904854952654, + "flos": 23183929180800.0, + "grad_norm": 1.8705618949704237, + "language_loss": 0.82190269, + "learning_rate": 3.1984206964532307e-06, + "loss": 0.84197724, + "num_input_tokens_seen": 113071435, + "router_z_loss_clip": 3.31835938, + "router_z_loss_mlp": 0.37573242, + "step": 5266, + "time_per_iteration": 2.6299796104431152 + }, + { + "auxiliary_loss_clip": 0.01623832, + "auxiliary_loss_mlp": 0.00354409, + "balance_loss_clip": 1.29337978, + "balance_loss_mlp": 0.32081628, + "epoch": 0.3166691718021945, + "flos": 20408501099520.0, + "grad_norm": 690.1843622165735, + "language_loss": 0.87858093, + "learning_rate": 3.1981088739705585e-06, + "loss": 0.89836341, + "num_input_tokens_seen": 113088645, + "router_z_loss_clip": 3.3046875, + "router_z_loss_mlp": 0.33569336, + "step": 5267, + "time_per_iteration": 2.665837049484253 + }, + { + "auxiliary_loss_clip": 0.01577983, + "auxiliary_loss_mlp": 0.001976, + "balance_loss_clip": 1.34588742, + "balance_loss_mlp": 0.18758661, + "epoch": 0.31672929505486247, + "flos": 70144781172480.0, + "grad_norm": 0.7098952831152016, + "language_loss": 0.57423961, + "learning_rate": 3.197797006055478e-06, + "loss": 0.59199548, + "num_input_tokens_seen": 113152775, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.10009766, + "step": 5268, + "time_per_iteration": 3.1527059078216553 + }, + { + "auxiliary_loss_clip": 0.01625006, + "auxiliary_loss_mlp": 0.00359805, + "balance_loss_clip": 1.29306412, + "balance_loss_mlp": 0.32771403, + "epoch": 0.31678941830753043, + "flos": 14355758154240.0, + "grad_norm": 4.599633895155689, + "language_loss": 0.81322241, + "learning_rate": 3.197485092719815e-06, + "loss": 0.83307052, + "num_input_tokens_seen": 113171410, + "router_z_loss_clip": 3.32226562, + "router_z_loss_mlp": 0.32128906, + "step": 5269, + "time_per_iteration": 2.6955034732818604 + }, + { + "auxiliary_loss_clip": 0.01609805, + "auxiliary_loss_mlp": 0.00354469, + "balance_loss_clip": 1.28255999, + "balance_loss_mlp": 0.31906432, + "epoch": 0.3168495415601984, + "flos": 22747722416640.0, + "grad_norm": 38.39819485962703, + "language_loss": 0.85427475, + "learning_rate": 3.1971731339753973e-06, + "loss": 0.87391746, + "num_input_tokens_seen": 113189965, + "router_z_loss_clip": 3.2734375, + "router_z_loss_mlp": 0.35424805, + "step": 5270, + "time_per_iteration": 2.671257257461548 + }, + { + "auxiliary_loss_clip": 0.01665746, + "auxiliary_loss_mlp": 0.00363081, + "balance_loss_clip": 1.32581544, + "balance_loss_mlp": 0.32660329, + "epoch": 0.31690966481286637, + "flos": 20115254465280.0, + "grad_norm": 4.2860118000327105, + "language_loss": 0.85172188, + "learning_rate": 3.1968611298340545e-06, + "loss": 0.87201011, + "num_input_tokens_seen": 113206355, + "router_z_loss_clip": 3.40234375, + "router_z_loss_mlp": 0.36450195, + "step": 5271, + "time_per_iteration": 2.6367621421813965 + }, + { + "auxiliary_loss_clip": 0.01653668, + "auxiliary_loss_mlp": 0.00389391, + "balance_loss_clip": 1.31413519, + "balance_loss_mlp": 0.35329473, + "epoch": 0.31696978806553433, + "flos": 21178928937600.0, + "grad_norm": 3091.583119968142, + "language_loss": 0.79578626, + "learning_rate": 3.1965490803076173e-06, + "loss": 0.81621683, + "num_input_tokens_seen": 113225440, + "router_z_loss_clip": 3.39453125, + "router_z_loss_mlp": 0.36083984, + "step": 5272, + "time_per_iteration": 2.652951240539551 + }, + { + "auxiliary_loss_clip": 0.01640764, + "auxiliary_loss_mlp": 0.0038642, + "balance_loss_clip": 1.29505873, + "balance_loss_mlp": 0.34736753, + "epoch": 0.3170299113182023, + "flos": 42997030439040.0, + "grad_norm": 136.7137996689856, + "language_loss": 0.76313818, + "learning_rate": 3.1962369854079194e-06, + "loss": 0.78341001, + "num_input_tokens_seen": 113248840, + "router_z_loss_clip": 3.46484375, + "router_z_loss_mlp": 0.39038086, + "step": 5273, + "time_per_iteration": 2.861022710800171 + }, + { + "auxiliary_loss_clip": 0.0166874, + "auxiliary_loss_mlp": 0.00356119, + "balance_loss_clip": 1.32500875, + "balance_loss_mlp": 0.32300264, + "epoch": 0.31709003457087026, + "flos": 24460158384000.0, + "grad_norm": 44.34488931378684, + "language_loss": 0.75279641, + "learning_rate": 3.195924845146795e-06, + "loss": 0.77304506, + "num_input_tokens_seen": 113269630, + "router_z_loss_clip": 3.43554688, + "router_z_loss_mlp": 0.33105469, + "step": 5274, + "time_per_iteration": 2.7464914321899414 + }, + { + "auxiliary_loss_clip": 0.01644439, + "auxiliary_loss_mlp": 0.00351404, + "balance_loss_clip": 1.31374955, + "balance_loss_mlp": 0.31802493, + "epoch": 0.3171501578235382, + "flos": 24135310759680.0, + "grad_norm": 2.348824893401691, + "language_loss": 0.85444582, + "learning_rate": 3.195612659536081e-06, + "loss": 0.87440419, + "num_input_tokens_seen": 113291200, + "router_z_loss_clip": 3.31054688, + "router_z_loss_mlp": 0.33374023, + "step": 5275, + "time_per_iteration": 2.6825666427612305 + }, + { + "auxiliary_loss_clip": 0.01683979, + "auxiliary_loss_mlp": 0.00411433, + "balance_loss_clip": 1.33297157, + "balance_loss_mlp": 0.37385789, + "epoch": 0.31721028107620625, + "flos": 18879712392960.0, + "grad_norm": 16.79348259869333, + "language_loss": 0.79202342, + "learning_rate": 3.1953004285876147e-06, + "loss": 0.81297755, + "num_input_tokens_seen": 113310170, + "router_z_loss_clip": 3.50585938, + "router_z_loss_mlp": 0.37548828, + "step": 5276, + "time_per_iteration": 2.6985857486724854 + }, + { + "auxiliary_loss_clip": 0.01685967, + "auxiliary_loss_mlp": 0.00419788, + "balance_loss_clip": 1.34236729, + "balance_loss_mlp": 0.38548005, + "epoch": 0.3172704043288742, + "flos": 23147874904320.0, + "grad_norm": 2.806606664930923, + "language_loss": 0.83184206, + "learning_rate": 3.194988152313236e-06, + "loss": 0.85289967, + "num_input_tokens_seen": 113331140, + "router_z_loss_clip": 3.4375, + "router_z_loss_mlp": 0.34326172, + "step": 5277, + "time_per_iteration": 2.880139112472534 + }, + { + "auxiliary_loss_clip": 0.01689323, + "auxiliary_loss_mlp": 0.00466422, + "balance_loss_clip": 1.34633529, + "balance_loss_mlp": 0.42901462, + "epoch": 0.3173305275815422, + "flos": 17858520731520.0, + "grad_norm": 218.1388402175099, + "language_loss": 0.85128176, + "learning_rate": 3.1946758307247878e-06, + "loss": 0.87283915, + "num_input_tokens_seen": 113350030, + "router_z_loss_clip": 3.43164062, + "router_z_loss_mlp": 0.37402344, + "step": 5278, + "time_per_iteration": 2.724992275238037 + }, + { + "auxiliary_loss_clip": 0.01815472, + "auxiliary_loss_mlp": 0.00214941, + "balance_loss_clip": 1.59367609, + "balance_loss_mlp": 0.20111278, + "epoch": 0.31739065083421014, + "flos": 59973476883840.0, + "grad_norm": 0.8528982611080177, + "language_loss": 0.62787187, + "learning_rate": 3.1943634638341114e-06, + "loss": 0.64817601, + "num_input_tokens_seen": 113395820, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.13867188, + "step": 5279, + "time_per_iteration": 2.905529737472534 + }, + { + "auxiliary_loss_clip": 0.01711148, + "auxiliary_loss_mlp": 0.00455496, + "balance_loss_clip": 1.35605621, + "balance_loss_mlp": 0.41839767, + "epoch": 0.3174507740868781, + "flos": 23800981944960.0, + "grad_norm": 3.968999841652131, + "language_loss": 0.86500371, + "learning_rate": 3.194051051653053e-06, + "loss": 0.88667011, + "num_input_tokens_seen": 113416835, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 0.37109375, + "step": 5280, + "time_per_iteration": 2.729020357131958 + }, + { + "auxiliary_loss_clip": 0.01695016, + "auxiliary_loss_mlp": 0.00437171, + "balance_loss_clip": 1.35099912, + "balance_loss_mlp": 0.40367281, + "epoch": 0.31751089733954607, + "flos": 27638899349760.0, + "grad_norm": 2.2694503997751907, + "language_loss": 0.82888037, + "learning_rate": 3.19373859419346e-06, + "loss": 0.85020226, + "num_input_tokens_seen": 113440850, + "router_z_loss_clip": 3.43945312, + "router_z_loss_mlp": 0.33496094, + "step": 5281, + "time_per_iteration": 2.7531557083129883 + }, + { + "auxiliary_loss_clip": 0.01691487, + "auxiliary_loss_mlp": 0.0047499, + "balance_loss_clip": 1.34465456, + "balance_loss_mlp": 0.43817836, + "epoch": 0.31757102059221404, + "flos": 23769273214080.0, + "grad_norm": 2.876056723979226, + "language_loss": 0.82677937, + "learning_rate": 3.193426091467179e-06, + "loss": 0.84844416, + "num_input_tokens_seen": 113461000, + "router_z_loss_clip": 3.46875, + "router_z_loss_mlp": 0.36816406, + "step": 5282, + "time_per_iteration": 2.705524206161499 + }, + { + "auxiliary_loss_clip": 0.01714014, + "auxiliary_loss_mlp": 0.00479749, + "balance_loss_clip": 1.35738599, + "balance_loss_mlp": 0.44215065, + "epoch": 0.317631143844882, + "flos": 25264521596160.0, + "grad_norm": 14.273216511553482, + "language_loss": 0.7423743, + "learning_rate": 3.193113543486061e-06, + "loss": 0.76431191, + "num_input_tokens_seen": 113480820, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 0.3762207, + "step": 5283, + "time_per_iteration": 2.746896266937256 + }, + { + "auxiliary_loss_clip": 0.01828924, + "auxiliary_loss_mlp": 0.00204047, + "balance_loss_clip": 1.59749889, + "balance_loss_mlp": 0.19021861, + "epoch": 0.31769126709754997, + "flos": 55825939221120.0, + "grad_norm": 0.7150292259334943, + "language_loss": 0.52286828, + "learning_rate": 3.192800950261958e-06, + "loss": 0.54319799, + "num_input_tokens_seen": 113536910, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.13867188, + "step": 5284, + "time_per_iteration": 3.142355442047119 + }, + { + "auxiliary_loss_clip": 0.01729488, + "auxiliary_loss_mlp": 0.00467404, + "balance_loss_clip": 1.36336708, + "balance_loss_mlp": 0.42963889, + "epoch": 0.31775139035021793, + "flos": 16690562098560.0, + "grad_norm": 37.25394615742276, + "language_loss": 0.77742279, + "learning_rate": 3.1924883118067235e-06, + "loss": 0.79939169, + "num_input_tokens_seen": 113555480, + "router_z_loss_clip": 3.66015625, + "router_z_loss_mlp": 0.37768555, + "step": 5285, + "time_per_iteration": 2.730858564376831 + }, + { + "auxiliary_loss_clip": 0.0185287, + "auxiliary_loss_mlp": 0.00125159, + "balance_loss_clip": 1.61714339, + "balance_loss_mlp": 0.10656277, + "epoch": 0.3178115136028859, + "flos": 64227241019520.0, + "grad_norm": 0.8552242604456926, + "language_loss": 0.603935, + "learning_rate": 3.1921756281322123e-06, + "loss": 0.62371528, + "num_input_tokens_seen": 113616790, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.18554688, + "step": 5286, + "time_per_iteration": 3.1397857666015625 + }, + { + "auxiliary_loss_clip": 0.01715046, + "auxiliary_loss_mlp": 0.00419703, + "balance_loss_clip": 1.35504818, + "balance_loss_mlp": 0.38107902, + "epoch": 0.31787163685555386, + "flos": 18697465762560.0, + "grad_norm": 199.46601753811717, + "language_loss": 0.78561401, + "learning_rate": 3.1918628992502826e-06, + "loss": 0.80696154, + "num_input_tokens_seen": 113635320, + "router_z_loss_clip": 3.60351562, + "router_z_loss_mlp": 0.38647461, + "step": 5287, + "time_per_iteration": 2.6318671703338623 + }, + { + "auxiliary_loss_clip": 0.01694603, + "auxiliary_loss_mlp": 0.00452936, + "balance_loss_clip": 1.33532333, + "balance_loss_mlp": 0.41538453, + "epoch": 0.31793176010822183, + "flos": 21324762155520.0, + "grad_norm": 46.598257215988475, + "language_loss": 0.81964117, + "learning_rate": 3.191550125172792e-06, + "loss": 0.84111655, + "num_input_tokens_seen": 113654000, + "router_z_loss_clip": 3.58984375, + "router_z_loss_mlp": 0.37548828, + "step": 5288, + "time_per_iteration": 2.666808605194092 + }, + { + "auxiliary_loss_clip": 0.01698756, + "auxiliary_loss_mlp": 0.00414086, + "balance_loss_clip": 1.35089982, + "balance_loss_mlp": 0.38092154, + "epoch": 0.31799188336088985, + "flos": 20958688696320.0, + "grad_norm": 93.36377169583024, + "language_loss": 0.93863559, + "learning_rate": 3.1912373059116007e-06, + "loss": 0.959764, + "num_input_tokens_seen": 113672375, + "router_z_loss_clip": 3.48046875, + "router_z_loss_mlp": 0.33178711, + "step": 5289, + "time_per_iteration": 2.8286237716674805 + }, + { + "auxiliary_loss_clip": 0.01726885, + "auxiliary_loss_mlp": 0.00393542, + "balance_loss_clip": 1.36424232, + "balance_loss_mlp": 0.35863787, + "epoch": 0.3180520066135578, + "flos": 22491930689280.0, + "grad_norm": 3.0956754457319957, + "language_loss": 0.7274999, + "learning_rate": 3.190924441478572e-06, + "loss": 0.74870414, + "num_input_tokens_seen": 113692385, + "router_z_loss_clip": 3.62695312, + "router_z_loss_mlp": 0.34887695, + "step": 5290, + "time_per_iteration": 2.7378129959106445 + }, + { + "auxiliary_loss_clip": 0.01737961, + "auxiliary_loss_mlp": 0.00484122, + "balance_loss_clip": 1.36238325, + "balance_loss_mlp": 0.44235075, + "epoch": 0.3181121298662258, + "flos": 27235335070080.0, + "grad_norm": 13.925164534892899, + "language_loss": 0.8578434, + "learning_rate": 3.1906115318855687e-06, + "loss": 0.88006425, + "num_input_tokens_seen": 113712145, + "router_z_loss_clip": 3.7578125, + "router_z_loss_mlp": 0.41772461, + "step": 5291, + "time_per_iteration": 2.8088250160217285 + }, + { + "auxiliary_loss_clip": 0.01718282, + "auxiliary_loss_mlp": 0.00471605, + "balance_loss_clip": 1.35678351, + "balance_loss_mlp": 0.43240905, + "epoch": 0.31817225311889374, + "flos": 23180158252800.0, + "grad_norm": 5.940563772213058, + "language_loss": 0.86545694, + "learning_rate": 3.1902985771444577e-06, + "loss": 0.8873558, + "num_input_tokens_seen": 113731435, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 0.39233398, + "step": 5292, + "time_per_iteration": 2.7348055839538574 + }, + { + "auxiliary_loss_clip": 0.01686985, + "auxiliary_loss_mlp": 0.00388534, + "balance_loss_clip": 1.34214163, + "balance_loss_mlp": 0.35508353, + "epoch": 0.3182323763715617, + "flos": 23258803080960.0, + "grad_norm": 5.13819635451938, + "language_loss": 0.80813932, + "learning_rate": 3.1899855772671043e-06, + "loss": 0.8288945, + "num_input_tokens_seen": 113750825, + "router_z_loss_clip": 3.44921875, + "router_z_loss_mlp": 0.33447266, + "step": 5293, + "time_per_iteration": 4.134828090667725 + }, + { + "auxiliary_loss_clip": 0.01699339, + "auxiliary_loss_mlp": 0.00415983, + "balance_loss_clip": 1.34583592, + "balance_loss_mlp": 0.38021994, + "epoch": 0.3182924996242297, + "flos": 29016683280000.0, + "grad_norm": 6.668535299111072, + "language_loss": 0.80571091, + "learning_rate": 3.189672532265379e-06, + "loss": 0.82686412, + "num_input_tokens_seen": 113770010, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 0.35766602, + "step": 5294, + "time_per_iteration": 2.78254771232605 + }, + { + "auxiliary_loss_clip": 0.01721351, + "auxiliary_loss_mlp": 0.00459245, + "balance_loss_clip": 1.35933638, + "balance_loss_mlp": 0.42054972, + "epoch": 0.31835262287689764, + "flos": 20449188230400.0, + "grad_norm": 10.762652059869128, + "language_loss": 0.84346086, + "learning_rate": 3.189359442151152e-06, + "loss": 0.8652668, + "num_input_tokens_seen": 113788640, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 0.38696289, + "step": 5295, + "time_per_iteration": 4.361624240875244 + }, + { + "auxiliary_loss_clip": 0.01748937, + "auxiliary_loss_mlp": 0.0044543, + "balance_loss_clip": 1.37637639, + "balance_loss_mlp": 0.40918994, + "epoch": 0.3184127461295656, + "flos": 25119478477440.0, + "grad_norm": 3.4555869258319993, + "language_loss": 0.76394141, + "learning_rate": 3.189046306936296e-06, + "loss": 0.7858851, + "num_input_tokens_seen": 113809515, + "router_z_loss_clip": 3.72460938, + "router_z_loss_mlp": 0.36230469, + "step": 5296, + "time_per_iteration": 2.8007869720458984 + }, + { + "auxiliary_loss_clip": 0.017187, + "auxiliary_loss_mlp": 0.00395452, + "balance_loss_clip": 1.35629117, + "balance_loss_mlp": 0.36076164, + "epoch": 0.31847286938223357, + "flos": 25551231955200.0, + "grad_norm": 102.21779204433832, + "language_loss": 0.83361399, + "learning_rate": 3.1887331266326846e-06, + "loss": 0.85475552, + "num_input_tokens_seen": 113829770, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 0.34667969, + "step": 5297, + "time_per_iteration": 2.8877604007720947 + }, + { + "auxiliary_loss_clip": 0.01694658, + "auxiliary_loss_mlp": 0.00391142, + "balance_loss_clip": 1.34677148, + "balance_loss_mlp": 0.35452074, + "epoch": 0.31853299263490154, + "flos": 27782470010880.0, + "grad_norm": 209.81542942637273, + "language_loss": 0.8476584, + "learning_rate": 3.1884199012521942e-06, + "loss": 0.86851645, + "num_input_tokens_seen": 113849320, + "router_z_loss_clip": 3.48046875, + "router_z_loss_mlp": 0.3659668, + "step": 5298, + "time_per_iteration": 2.874307870864868 + }, + { + "auxiliary_loss_clip": 0.0172742, + "auxiliary_loss_mlp": 0.00418194, + "balance_loss_clip": 1.36358809, + "balance_loss_mlp": 0.38190711, + "epoch": 0.3185931158875695, + "flos": 22706747976960.0, + "grad_norm": 3.6936767994557136, + "language_loss": 0.79605472, + "learning_rate": 3.1881066308067016e-06, + "loss": 0.81751084, + "num_input_tokens_seen": 113867860, + "router_z_loss_clip": 3.63671875, + "router_z_loss_mlp": 0.36303711, + "step": 5299, + "time_per_iteration": 2.673214912414551 + }, + { + "auxiliary_loss_clip": 0.01725132, + "auxiliary_loss_mlp": 0.004235, + "balance_loss_clip": 1.35831928, + "balance_loss_mlp": 0.3866643, + "epoch": 0.31865323914023747, + "flos": 24571517523840.0, + "grad_norm": 2.659835608724678, + "language_loss": 0.84340334, + "learning_rate": 3.1877933153080873e-06, + "loss": 0.86488962, + "num_input_tokens_seen": 113886375, + "router_z_loss_clip": 3.66796875, + "router_z_loss_mlp": 0.36816406, + "step": 5300, + "time_per_iteration": 2.725107431411743 + }, + { + "auxiliary_loss_clip": 0.01709414, + "auxiliary_loss_mlp": 0.00396935, + "balance_loss_clip": 1.35558712, + "balance_loss_mlp": 0.36064738, + "epoch": 0.31871336239290543, + "flos": 18186564666240.0, + "grad_norm": 5.157814773361325, + "language_loss": 0.90324616, + "learning_rate": 3.1874799547682304e-06, + "loss": 0.92430961, + "num_input_tokens_seen": 113904065, + "router_z_loss_clip": 3.54101562, + "router_z_loss_mlp": 0.36303711, + "step": 5301, + "time_per_iteration": 4.032884836196899 + }, + { + "auxiliary_loss_clip": 0.01743598, + "auxiliary_loss_mlp": 0.00484469, + "balance_loss_clip": 1.3827188, + "balance_loss_mlp": 0.44067162, + "epoch": 0.31877348564557345, + "flos": 21826756679040.0, + "grad_norm": 7.246737242243414, + "language_loss": 0.83416533, + "learning_rate": 3.187166549199015e-06, + "loss": 0.85644603, + "num_input_tokens_seen": 113918415, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 0.43798828, + "step": 5302, + "time_per_iteration": 2.61442494392395 + }, + { + "auxiliary_loss_clip": 0.01713699, + "auxiliary_loss_mlp": 0.00398935, + "balance_loss_clip": 1.36358833, + "balance_loss_mlp": 0.36300546, + "epoch": 0.3188336088982414, + "flos": 22015252275840.0, + "grad_norm": 7.988477762781484, + "language_loss": 0.85394502, + "learning_rate": 3.1868530986123255e-06, + "loss": 0.87507129, + "num_input_tokens_seen": 113938135, + "router_z_loss_clip": 3.5, + "router_z_loss_mlp": 0.35913086, + "step": 5303, + "time_per_iteration": 2.636838912963867 + }, + { + "auxiliary_loss_clip": 0.01732017, + "auxiliary_loss_mlp": 0.00479411, + "balance_loss_clip": 1.35822463, + "balance_loss_mlp": 0.43690121, + "epoch": 0.3188937321509094, + "flos": 20047886507520.0, + "grad_norm": 52.36716422601023, + "language_loss": 0.80699813, + "learning_rate": 3.186539603020047e-06, + "loss": 0.82911241, + "num_input_tokens_seen": 113957125, + "router_z_loss_clip": 3.73632812, + "router_z_loss_mlp": 0.42553711, + "step": 5304, + "time_per_iteration": 2.652920722961426 + }, + { + "auxiliary_loss_clip": 0.01701611, + "auxiliary_loss_mlp": 0.00454162, + "balance_loss_clip": 1.35833609, + "balance_loss_mlp": 0.41575235, + "epoch": 0.31895385540357735, + "flos": 25848105863040.0, + "grad_norm": 2.481263795678358, + "language_loss": 0.78354096, + "learning_rate": 3.186226062434068e-06, + "loss": 0.80509865, + "num_input_tokens_seen": 113974875, + "router_z_loss_clip": 3.43359375, + "router_z_loss_mlp": 0.3840332, + "step": 5305, + "time_per_iteration": 2.693019390106201 + }, + { + "auxiliary_loss_clip": 0.01718931, + "auxiliary_loss_mlp": 0.00454895, + "balance_loss_clip": 1.3646903, + "balance_loss_mlp": 0.41641381, + "epoch": 0.3190139786562453, + "flos": 23477714519040.0, + "grad_norm": 2.614960997721816, + "language_loss": 0.69231534, + "learning_rate": 3.1859124768662778e-06, + "loss": 0.71405363, + "num_input_tokens_seen": 113994450, + "router_z_loss_clip": 3.546875, + "router_z_loss_mlp": 0.38500977, + "step": 5306, + "time_per_iteration": 4.1589226722717285 + }, + { + "auxiliary_loss_clip": 0.01739448, + "auxiliary_loss_mlp": 0.00493905, + "balance_loss_clip": 1.37364554, + "balance_loss_mlp": 0.45015484, + "epoch": 0.3190741019089133, + "flos": 29095543589760.0, + "grad_norm": 6.654300061910735, + "language_loss": 0.84727508, + "learning_rate": 3.1855988463285678e-06, + "loss": 0.86960858, + "num_input_tokens_seen": 114013945, + "router_z_loss_clip": 3.65820312, + "router_z_loss_mlp": 0.4375, + "step": 5307, + "time_per_iteration": 2.7024319171905518 + }, + { + "auxiliary_loss_clip": 0.01718994, + "auxiliary_loss_mlp": 0.00453631, + "balance_loss_clip": 1.36287189, + "balance_loss_mlp": 0.41016725, + "epoch": 0.31913422516158124, + "flos": 17129534209920.0, + "grad_norm": 6.619124019128554, + "language_loss": 0.84394681, + "learning_rate": 3.1852851708328308e-06, + "loss": 0.86567307, + "num_input_tokens_seen": 114031375, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 0.43457031, + "step": 5308, + "time_per_iteration": 2.5934371948242188 + }, + { + "auxiliary_loss_clip": 0.01772577, + "auxiliary_loss_mlp": 0.0047594, + "balance_loss_clip": 1.38983309, + "balance_loss_mlp": 0.43154624, + "epoch": 0.3191943484142492, + "flos": 16069846147200.0, + "grad_norm": 3.5987620811593817, + "language_loss": 0.82845581, + "learning_rate": 3.184971450390961e-06, + "loss": 0.850941, + "num_input_tokens_seen": 114048465, + "router_z_loss_clip": 3.828125, + "router_z_loss_mlp": 0.4440918, + "step": 5309, + "time_per_iteration": 2.6370866298675537 + }, + { + "auxiliary_loss_clip": 0.01710817, + "auxiliary_loss_mlp": 0.00490966, + "balance_loss_clip": 1.3531692, + "balance_loss_mlp": 0.45055419, + "epoch": 0.3192544716669172, + "flos": 22966166977920.0, + "grad_norm": 24.05355927448271, + "language_loss": 0.88434446, + "learning_rate": 3.184657685014856e-06, + "loss": 0.90636224, + "num_input_tokens_seen": 114068415, + "router_z_loss_clip": 3.57421875, + "router_z_loss_mlp": 0.40380859, + "step": 5310, + "time_per_iteration": 2.6801815032958984 + }, + { + "auxiliary_loss_clip": 0.01701893, + "auxiliary_loss_mlp": 0.00503033, + "balance_loss_clip": 1.34938145, + "balance_loss_mlp": 0.46350345, + "epoch": 0.31931459491958514, + "flos": 26870339018880.0, + "grad_norm": 2.4399319914904263, + "language_loss": 0.81751913, + "learning_rate": 3.184343874716412e-06, + "loss": 0.83956838, + "num_input_tokens_seen": 114088565, + "router_z_loss_clip": 3.52539062, + "router_z_loss_mlp": 0.39526367, + "step": 5311, + "time_per_iteration": 2.7080507278442383 + }, + { + "auxiliary_loss_clip": 0.01726892, + "auxiliary_loss_mlp": 0.00475049, + "balance_loss_clip": 1.37104058, + "balance_loss_mlp": 0.43663949, + "epoch": 0.3193747181722531, + "flos": 21836525178240.0, + "grad_norm": 26.139032177883394, + "language_loss": 0.90551031, + "learning_rate": 3.1840300195075295e-06, + "loss": 0.92752975, + "num_input_tokens_seen": 114107160, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 0.38427734, + "step": 5312, + "time_per_iteration": 2.6891252994537354 + }, + { + "auxiliary_loss_clip": 0.01765996, + "auxiliary_loss_mlp": 0.0055989, + "balance_loss_clip": 1.3896544, + "balance_loss_mlp": 0.51428097, + "epoch": 0.31943484142492107, + "flos": 18324999682560.0, + "grad_norm": 16500.420739845704, + "language_loss": 0.8763659, + "learning_rate": 3.1837161194001102e-06, + "loss": 0.89962476, + "num_input_tokens_seen": 114123420, + "router_z_loss_clip": 3.76757812, + "router_z_loss_mlp": 0.45629883, + "step": 5313, + "time_per_iteration": 2.637005090713501 + }, + { + "auxiliary_loss_clip": 0.0171895, + "auxiliary_loss_mlp": 0.00503515, + "balance_loss_clip": 1.35774231, + "balance_loss_mlp": 0.45986021, + "epoch": 0.31949496467758903, + "flos": 21615818060160.0, + "grad_norm": 27.505662907547688, + "language_loss": 0.93054336, + "learning_rate": 3.183402174406057e-06, + "loss": 0.95276797, + "num_input_tokens_seen": 114139230, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 0.43652344, + "step": 5314, + "time_per_iteration": 2.6137049198150635 + }, + { + "auxiliary_loss_clip": 0.01719111, + "auxiliary_loss_mlp": 0.00449448, + "balance_loss_clip": 1.35462117, + "balance_loss_mlp": 0.40846366, + "epoch": 0.31955508793025705, + "flos": 21760214734080.0, + "grad_norm": 7.091221261371386, + "language_loss": 0.85859489, + "learning_rate": 3.1830881845372747e-06, + "loss": 0.88028049, + "num_input_tokens_seen": 114159290, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 0.40966797, + "step": 5315, + "time_per_iteration": 2.661275863647461 + }, + { + "auxiliary_loss_clip": 0.01731629, + "auxiliary_loss_mlp": 0.00546566, + "balance_loss_clip": 1.36689258, + "balance_loss_mlp": 0.50200564, + "epoch": 0.319615211182925, + "flos": 17164331510400.0, + "grad_norm": 4.166853666189118, + "language_loss": 0.72203255, + "learning_rate": 3.18277414980567e-06, + "loss": 0.74481452, + "num_input_tokens_seen": 114177655, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 0.44555664, + "step": 5316, + "time_per_iteration": 2.6852006912231445 + }, + { + "auxiliary_loss_clip": 0.01708103, + "auxiliary_loss_mlp": 0.00501932, + "balance_loss_clip": 1.35315096, + "balance_loss_mlp": 0.46304637, + "epoch": 0.319675334435593, + "flos": 28112812416000.0, + "grad_norm": 3.105118985887463, + "language_loss": 0.73223877, + "learning_rate": 3.1824600702231515e-06, + "loss": 0.7543391, + "num_input_tokens_seen": 114200880, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 0.38891602, + "step": 5317, + "time_per_iteration": 2.870189905166626 + }, + { + "auxiliary_loss_clip": 0.0214604, + "auxiliary_loss_mlp": 0.00332953, + "balance_loss_clip": 1.88205564, + "balance_loss_mlp": 0.31406975, + "epoch": 0.31973545768826095, + "flos": 69501119408640.0, + "grad_norm": 0.723936095649714, + "language_loss": 0.52444285, + "learning_rate": 3.182145945801628e-06, + "loss": 0.54923278, + "num_input_tokens_seen": 114267145, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.18847656, + "step": 5318, + "time_per_iteration": 3.3257522583007812 + }, + { + "auxiliary_loss_clip": 0.01728356, + "auxiliary_loss_mlp": 0.00477608, + "balance_loss_clip": 1.36918175, + "balance_loss_mlp": 0.43598038, + "epoch": 0.3197955809409289, + "flos": 13699203408000.0, + "grad_norm": 2.3196256640344064, + "language_loss": 0.88436198, + "learning_rate": 3.181831776553012e-06, + "loss": 0.90642154, + "num_input_tokens_seen": 114284630, + "router_z_loss_clip": 3.59179688, + "router_z_loss_mlp": 0.41650391, + "step": 5319, + "time_per_iteration": 2.6685307025909424 + }, + { + "auxiliary_loss_clip": 0.01712763, + "auxiliary_loss_mlp": 0.00506486, + "balance_loss_clip": 1.35387349, + "balance_loss_mlp": 0.46297482, + "epoch": 0.3198557041935969, + "flos": 33218124278400.0, + "grad_norm": 4.666115024303236, + "language_loss": 0.6920771, + "learning_rate": 3.1815175624892165e-06, + "loss": 0.71426964, + "num_input_tokens_seen": 114305830, + "router_z_loss_clip": 3.58789062, + "router_z_loss_mlp": 0.43530273, + "step": 5320, + "time_per_iteration": 2.797147274017334 + }, + { + "auxiliary_loss_clip": 0.01741138, + "auxiliary_loss_mlp": 0.00513903, + "balance_loss_clip": 1.36418307, + "balance_loss_mlp": 0.46929485, + "epoch": 0.31991582744626484, + "flos": 23732033788800.0, + "grad_norm": 12.957188868701222, + "language_loss": 0.75577235, + "learning_rate": 3.1812033036221567e-06, + "loss": 0.77832282, + "num_input_tokens_seen": 114325165, + "router_z_loss_clip": 3.77148438, + "router_z_loss_mlp": 0.44555664, + "step": 5321, + "time_per_iteration": 2.6608736515045166 + }, + { + "auxiliary_loss_clip": 0.01768294, + "auxiliary_loss_mlp": 0.00546411, + "balance_loss_clip": 1.38516927, + "balance_loss_mlp": 0.50087309, + "epoch": 0.3199759506989328, + "flos": 18550842445440.0, + "grad_norm": 4.851887266712705, + "language_loss": 0.91747189, + "learning_rate": 3.180888999963749e-06, + "loss": 0.94061893, + "num_input_tokens_seen": 114341310, + "router_z_loss_clip": 3.83007812, + "router_z_loss_mlp": 0.45556641, + "step": 5322, + "time_per_iteration": 2.636070966720581 + }, + { + "auxiliary_loss_clip": 0.01691942, + "auxiliary_loss_mlp": 0.0046032, + "balance_loss_clip": 1.33862114, + "balance_loss_mlp": 0.42012292, + "epoch": 0.3200360739516008, + "flos": 22418888382720.0, + "grad_norm": 20.741659168915366, + "language_loss": 0.88024211, + "learning_rate": 3.1805746515259123e-06, + "loss": 0.90176475, + "num_input_tokens_seen": 114360355, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 0.40185547, + "step": 5323, + "time_per_iteration": 2.722917318344116 + }, + { + "auxiliary_loss_clip": 0.01699879, + "auxiliary_loss_mlp": 0.00488897, + "balance_loss_clip": 1.34733093, + "balance_loss_mlp": 0.44316855, + "epoch": 0.32009619720426874, + "flos": 20595236929920.0, + "grad_norm": 10.456138840422868, + "language_loss": 0.834512, + "learning_rate": 3.1802602583205663e-06, + "loss": 0.85639977, + "num_input_tokens_seen": 114379220, + "router_z_loss_clip": 3.52539062, + "router_z_loss_mlp": 0.45751953, + "step": 5324, + "time_per_iteration": 2.6193459033966064 + }, + { + "auxiliary_loss_clip": 0.01700404, + "auxiliary_loss_mlp": 0.00429234, + "balance_loss_clip": 1.34624791, + "balance_loss_mlp": 0.38968033, + "epoch": 0.3201563204569367, + "flos": 18147637301760.0, + "grad_norm": 59.342111246778316, + "language_loss": 0.85458195, + "learning_rate": 3.1799458203596333e-06, + "loss": 0.87587833, + "num_input_tokens_seen": 114396365, + "router_z_loss_clip": 3.5390625, + "router_z_loss_mlp": 0.39550781, + "step": 5325, + "time_per_iteration": 2.5910449028015137 + }, + { + "auxiliary_loss_clip": 0.01748158, + "auxiliary_loss_mlp": 0.00491394, + "balance_loss_clip": 1.37642813, + "balance_loss_mlp": 0.44740576, + "epoch": 0.32021644370960467, + "flos": 31684235840640.0, + "grad_norm": 2.531549607091387, + "language_loss": 0.79877114, + "learning_rate": 3.179631337655037e-06, + "loss": 0.82116663, + "num_input_tokens_seen": 114416780, + "router_z_loss_clip": 3.71875, + "router_z_loss_mlp": 0.43994141, + "step": 5326, + "time_per_iteration": 2.698843240737915 + }, + { + "auxiliary_loss_clip": 0.01721269, + "auxiliary_loss_mlp": 0.00479162, + "balance_loss_clip": 1.36466336, + "balance_loss_mlp": 0.43803519, + "epoch": 0.32027656696227264, + "flos": 26865921646080.0, + "grad_norm": 325.05977496945695, + "language_loss": 0.84618151, + "learning_rate": 3.179316810218701e-06, + "loss": 0.86818582, + "num_input_tokens_seen": 114437405, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 0.41113281, + "step": 5327, + "time_per_iteration": 2.6728932857513428 + }, + { + "auxiliary_loss_clip": 0.01725154, + "auxiliary_loss_mlp": 0.00468179, + "balance_loss_clip": 1.3526727, + "balance_loss_mlp": 0.42681384, + "epoch": 0.32033669021494066, + "flos": 24169928492160.0, + "grad_norm": 6.152228868253943, + "language_loss": 0.80736208, + "learning_rate": 3.179002238062554e-06, + "loss": 0.8292954, + "num_input_tokens_seen": 114458505, + "router_z_loss_clip": 3.72070312, + "router_z_loss_mlp": 0.41357422, + "step": 5328, + "time_per_iteration": 2.727324962615967 + }, + { + "auxiliary_loss_clip": 0.01723256, + "auxiliary_loss_mlp": 0.00466248, + "balance_loss_clip": 1.36527967, + "balance_loss_mlp": 0.42132989, + "epoch": 0.3203968134676086, + "flos": 24460768915200.0, + "grad_norm": 3.979778810289534, + "language_loss": 0.79304135, + "learning_rate": 3.178687621198524e-06, + "loss": 0.8149364, + "num_input_tokens_seen": 114479050, + "router_z_loss_clip": 3.58203125, + "router_z_loss_mlp": 0.44946289, + "step": 5329, + "time_per_iteration": 2.761971950531006 + }, + { + "auxiliary_loss_clip": 0.01709156, + "auxiliary_loss_mlp": 0.00450292, + "balance_loss_clip": 1.35900331, + "balance_loss_mlp": 0.41014215, + "epoch": 0.3204569367202766, + "flos": 18004713085440.0, + "grad_norm": 5.423415900636356, + "language_loss": 0.75235623, + "learning_rate": 3.1783729596385415e-06, + "loss": 0.7739507, + "num_input_tokens_seen": 114497415, + "router_z_loss_clip": 3.5, + "router_z_loss_mlp": 0.40136719, + "step": 5330, + "time_per_iteration": 2.6481165885925293 + }, + { + "auxiliary_loss_clip": 0.01713445, + "auxiliary_loss_mlp": 0.00468089, + "balance_loss_clip": 1.34898973, + "balance_loss_mlp": 0.4263179, + "epoch": 0.32051705997294455, + "flos": 30589678650240.0, + "grad_norm": 228.63663213000595, + "language_loss": 0.85417676, + "learning_rate": 3.1780582533945376e-06, + "loss": 0.87599218, + "num_input_tokens_seen": 114518785, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 0.41772461, + "step": 5331, + "time_per_iteration": 2.7270965576171875 + }, + { + "auxiliary_loss_clip": 0.0204186, + "auxiliary_loss_mlp": 0.00184144, + "balance_loss_clip": 1.78756797, + "balance_loss_mlp": 0.16879018, + "epoch": 0.3205771832256125, + "flos": 68417979765120.0, + "grad_norm": 0.8405055316079078, + "language_loss": 0.57824594, + "learning_rate": 3.177743502478447e-06, + "loss": 0.60050589, + "num_input_tokens_seen": 114577710, + "router_z_loss_clip": 2.546875, + "router_z_loss_mlp": 0.15332031, + "step": 5332, + "time_per_iteration": 3.150365114212036 + }, + { + "auxiliary_loss_clip": 0.01755669, + "auxiliary_loss_mlp": 0.00416776, + "balance_loss_clip": 1.38321352, + "balance_loss_mlp": 0.37562457, + "epoch": 0.3206373064782805, + "flos": 30443953173120.0, + "grad_norm": 7.191721494461687, + "language_loss": 0.7948072, + "learning_rate": 3.177428706902205e-06, + "loss": 0.81653172, + "num_input_tokens_seen": 114598640, + "router_z_loss_clip": 3.72460938, + "router_z_loss_mlp": 0.41186523, + "step": 5333, + "time_per_iteration": 2.8208911418914795 + }, + { + "auxiliary_loss_clip": 0.01703662, + "auxiliary_loss_mlp": 0.00479265, + "balance_loss_clip": 1.34517097, + "balance_loss_mlp": 0.43768477, + "epoch": 0.32069742973094845, + "flos": 22054502862720.0, + "grad_norm": 67.01280208774357, + "language_loss": 0.77010524, + "learning_rate": 3.1771138666777485e-06, + "loss": 0.79193455, + "num_input_tokens_seen": 114618780, + "router_z_loss_clip": 3.5859375, + "router_z_loss_mlp": 0.41601562, + "step": 5334, + "time_per_iteration": 2.657924175262451 + }, + { + "auxiliary_loss_clip": 0.01697242, + "auxiliary_loss_mlp": 0.00431344, + "balance_loss_clip": 1.33943069, + "balance_loss_mlp": 0.39310208, + "epoch": 0.3207575529836164, + "flos": 22054000072320.0, + "grad_norm": 45.705353479362955, + "language_loss": 0.82068557, + "learning_rate": 3.1767989818170156e-06, + "loss": 0.84197146, + "num_input_tokens_seen": 114637525, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 0.38256836, + "step": 5335, + "time_per_iteration": 4.110499143600464 + }, + { + "auxiliary_loss_clip": 0.01687736, + "auxiliary_loss_mlp": 0.00430527, + "balance_loss_clip": 1.33443999, + "balance_loss_mlp": 0.39025843, + "epoch": 0.3208176762362844, + "flos": 34057536186240.0, + "grad_norm": 82.03594537750911, + "language_loss": 0.73619497, + "learning_rate": 3.1764840523319477e-06, + "loss": 0.75737756, + "num_input_tokens_seen": 114659705, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 0.40283203, + "step": 5336, + "time_per_iteration": 2.8323209285736084 + }, + { + "auxiliary_loss_clip": 0.01714047, + "auxiliary_loss_mlp": 0.00453074, + "balance_loss_clip": 1.35130465, + "balance_loss_mlp": 0.41356763, + "epoch": 0.32087779948895234, + "flos": 21798711135360.0, + "grad_norm": 19.206202026266023, + "language_loss": 0.83321321, + "learning_rate": 3.176169078234487e-06, + "loss": 0.85488439, + "num_input_tokens_seen": 114678340, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 0.39526367, + "step": 5337, + "time_per_iteration": 4.083790063858032 + }, + { + "auxiliary_loss_clip": 0.0170653, + "auxiliary_loss_mlp": 0.00435685, + "balance_loss_clip": 1.35603833, + "balance_loss_mlp": 0.3976576, + "epoch": 0.3209379227416203, + "flos": 21434110133760.0, + "grad_norm": 36.2260311016666, + "language_loss": 0.79676151, + "learning_rate": 3.1758540595365766e-06, + "loss": 0.8181836, + "num_input_tokens_seen": 114696980, + "router_z_loss_clip": 3.50195312, + "router_z_loss_mlp": 0.38037109, + "step": 5338, + "time_per_iteration": 2.681302547454834 + }, + { + "auxiliary_loss_clip": 0.01716521, + "auxiliary_loss_mlp": 0.00457119, + "balance_loss_clip": 1.34539962, + "balance_loss_mlp": 0.41441774, + "epoch": 0.3209980459942883, + "flos": 25849075530240.0, + "grad_norm": 29.985344490420072, + "language_loss": 0.67065066, + "learning_rate": 3.1755389962501626e-06, + "loss": 0.6923871, + "num_input_tokens_seen": 114717330, + "router_z_loss_clip": 3.71289062, + "router_z_loss_mlp": 0.42700195, + "step": 5339, + "time_per_iteration": 2.7811601161956787 + }, + { + "auxiliary_loss_clip": 0.0173619, + "auxiliary_loss_mlp": 0.00438312, + "balance_loss_clip": 1.36323452, + "balance_loss_mlp": 0.3958976, + "epoch": 0.32105816924695624, + "flos": 19099162535040.0, + "grad_norm": 20.953203430334028, + "language_loss": 0.88539886, + "learning_rate": 3.175223888387192e-06, + "loss": 0.90714389, + "num_input_tokens_seen": 114736320, + "router_z_loss_clip": 3.734375, + "router_z_loss_mlp": 0.42407227, + "step": 5340, + "time_per_iteration": 2.7758517265319824 + }, + { + "auxiliary_loss_clip": 0.01748346, + "auxiliary_loss_mlp": 0.00430686, + "balance_loss_clip": 1.37357712, + "balance_loss_mlp": 0.39072728, + "epoch": 0.3211182924996242, + "flos": 16581860565120.0, + "grad_norm": 9.29176722591997, + "language_loss": 0.81435549, + "learning_rate": 3.1749087359596137e-06, + "loss": 0.83614582, + "num_input_tokens_seen": 114754575, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 0.3996582, + "step": 5341, + "time_per_iteration": 2.8800814151763916 + }, + { + "auxiliary_loss_clip": 0.01726169, + "auxiliary_loss_mlp": 0.00444642, + "balance_loss_clip": 1.364223, + "balance_loss_mlp": 0.40251321, + "epoch": 0.3211784157522922, + "flos": 22672202071680.0, + "grad_norm": 9.24975323944955, + "language_loss": 0.84109414, + "learning_rate": 3.1745935389793786e-06, + "loss": 0.86280221, + "num_input_tokens_seen": 114773590, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 0.42163086, + "step": 5342, + "time_per_iteration": 2.6589980125427246 + }, + { + "auxiliary_loss_clip": 0.01738929, + "auxiliary_loss_mlp": 0.00470275, + "balance_loss_clip": 1.36383533, + "balance_loss_mlp": 0.42781252, + "epoch": 0.3212385390049602, + "flos": 20558787603840.0, + "grad_norm": 4.428488895852741, + "language_loss": 0.8184011, + "learning_rate": 3.174278297458438e-06, + "loss": 0.8404932, + "num_input_tokens_seen": 114790775, + "router_z_loss_clip": 3.75195312, + "router_z_loss_mlp": 0.42431641, + "step": 5343, + "time_per_iteration": 4.057276964187622 + }, + { + "auxiliary_loss_clip": 0.01740952, + "auxiliary_loss_mlp": 0.00445387, + "balance_loss_clip": 1.36550665, + "balance_loss_mlp": 0.40371096, + "epoch": 0.32129866225762815, + "flos": 24791147233920.0, + "grad_norm": 16.521403325537324, + "language_loss": 0.87509346, + "learning_rate": 3.173963011408748e-06, + "loss": 0.8969568, + "num_input_tokens_seen": 114809835, + "router_z_loss_clip": 3.75390625, + "router_z_loss_mlp": 0.41699219, + "step": 5344, + "time_per_iteration": 2.7205429077148438 + }, + { + "auxiliary_loss_clip": 0.01732332, + "auxiliary_loss_mlp": 0.00428256, + "balance_loss_clip": 1.35939765, + "balance_loss_mlp": 0.3871524, + "epoch": 0.3213587855102961, + "flos": 18366871962240.0, + "grad_norm": 5.333940563530704, + "language_loss": 0.8709991, + "learning_rate": 3.173647680842262e-06, + "loss": 0.89260495, + "num_input_tokens_seen": 114826505, + "router_z_loss_clip": 3.734375, + "router_z_loss_mlp": 0.41088867, + "step": 5345, + "time_per_iteration": 2.6423041820526123 + }, + { + "auxiliary_loss_clip": 0.01741688, + "auxiliary_loss_mlp": 0.00444918, + "balance_loss_clip": 1.36232805, + "balance_loss_mlp": 0.40402907, + "epoch": 0.3214189087629641, + "flos": 27015992668800.0, + "grad_norm": 6.582095642625461, + "language_loss": 0.88990968, + "learning_rate": 3.1733323057709384e-06, + "loss": 0.91177571, + "num_input_tokens_seen": 114846140, + "router_z_loss_clip": 3.79492188, + "router_z_loss_mlp": 0.40893555, + "step": 5346, + "time_per_iteration": 2.6737165451049805 + }, + { + "auxiliary_loss_clip": 0.01734172, + "auxiliary_loss_mlp": 0.00461336, + "balance_loss_clip": 1.36105418, + "balance_loss_mlp": 0.41834933, + "epoch": 0.32147903201563205, + "flos": 23148269953920.0, + "grad_norm": 147.1817099614968, + "language_loss": 0.86252403, + "learning_rate": 3.1730168862067366e-06, + "loss": 0.88447917, + "num_input_tokens_seen": 114866660, + "router_z_loss_clip": 3.73242188, + "router_z_loss_mlp": 0.42993164, + "step": 5347, + "time_per_iteration": 2.6493818759918213 + }, + { + "auxiliary_loss_clip": 0.01758943, + "auxiliary_loss_mlp": 0.00423824, + "balance_loss_clip": 1.38428283, + "balance_loss_mlp": 0.38319784, + "epoch": 0.3215391552683, + "flos": 16580747243520.0, + "grad_norm": 997.8325512875963, + "language_loss": 0.84685218, + "learning_rate": 3.1727014221616164e-06, + "loss": 0.86867988, + "num_input_tokens_seen": 114882820, + "router_z_loss_clip": 3.74804688, + "router_z_loss_mlp": 0.40625, + "step": 5348, + "time_per_iteration": 3.971548318862915 + }, + { + "auxiliary_loss_clip": 0.01758128, + "auxiliary_loss_mlp": 0.00437387, + "balance_loss_clip": 1.37611198, + "balance_loss_mlp": 0.39723742, + "epoch": 0.321599278520968, + "flos": 17821820010240.0, + "grad_norm": 6.230409482743181, + "language_loss": 0.91577208, + "learning_rate": 3.172385913647542e-06, + "loss": 0.93772733, + "num_input_tokens_seen": 114900745, + "router_z_loss_clip": 3.82226562, + "router_z_loss_mlp": 0.40161133, + "step": 5349, + "time_per_iteration": 2.624138355255127 + }, + { + "auxiliary_loss_clip": 0.01766398, + "auxiliary_loss_mlp": 0.00441154, + "balance_loss_clip": 1.37653732, + "balance_loss_mlp": 0.39859584, + "epoch": 0.32165940177363594, + "flos": 16251769555200.0, + "grad_norm": 10.465101388754215, + "language_loss": 0.853001, + "learning_rate": 3.172070360676475e-06, + "loss": 0.87507641, + "num_input_tokens_seen": 114917940, + "router_z_loss_clip": 3.90234375, + "router_z_loss_mlp": 0.42553711, + "step": 5350, + "time_per_iteration": 2.6980488300323486 + }, + { + "auxiliary_loss_clip": 0.01723951, + "auxiliary_loss_mlp": 0.00402879, + "balance_loss_clip": 1.35880721, + "balance_loss_mlp": 0.36730716, + "epoch": 0.3217195250263039, + "flos": 27599900158080.0, + "grad_norm": 2.7611779289012484, + "language_loss": 0.83755827, + "learning_rate": 3.1717547632603828e-06, + "loss": 0.85882658, + "num_input_tokens_seen": 114937735, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 0.35595703, + "step": 5351, + "time_per_iteration": 2.707916736602783 + }, + { + "auxiliary_loss_clip": 0.01761579, + "auxiliary_loss_mlp": 0.00437328, + "balance_loss_clip": 1.38087535, + "balance_loss_mlp": 0.39336371, + "epoch": 0.3217796482789719, + "flos": 21470595373440.0, + "grad_norm": 2.4694399934877644, + "language_loss": 0.81983256, + "learning_rate": 3.1714391214112326e-06, + "loss": 0.84182167, + "num_input_tokens_seen": 114956630, + "router_z_loss_clip": 3.8046875, + "router_z_loss_mlp": 0.43994141, + "step": 5352, + "time_per_iteration": 2.6870014667510986 + }, + { + "auxiliary_loss_clip": 0.01736802, + "auxiliary_loss_mlp": 0.00459245, + "balance_loss_clip": 1.36315322, + "balance_loss_mlp": 0.41799891, + "epoch": 0.32183977153163984, + "flos": 21215593745280.0, + "grad_norm": 16.439411627145446, + "language_loss": 0.87876016, + "learning_rate": 3.1711234351409933e-06, + "loss": 0.9007206, + "num_input_tokens_seen": 114976470, + "router_z_loss_clip": 3.73828125, + "router_z_loss_mlp": 0.41235352, + "step": 5353, + "time_per_iteration": 2.6717095375061035 + }, + { + "auxiliary_loss_clip": 0.01741641, + "auxiliary_loss_mlp": 0.00442971, + "balance_loss_clip": 1.37649751, + "balance_loss_mlp": 0.40241641, + "epoch": 0.3218998947843078, + "flos": 24608182331520.0, + "grad_norm": 7.804477434550239, + "language_loss": 0.77912045, + "learning_rate": 3.1708077044616365e-06, + "loss": 0.80096662, + "num_input_tokens_seen": 114996710, + "router_z_loss_clip": 3.65234375, + "router_z_loss_mlp": 0.40576172, + "step": 5354, + "time_per_iteration": 2.7204020023345947 + }, + { + "auxiliary_loss_clip": 0.01765065, + "auxiliary_loss_mlp": 0.00458198, + "balance_loss_clip": 1.38737154, + "balance_loss_mlp": 0.41411406, + "epoch": 0.3219600180369758, + "flos": 22270577126400.0, + "grad_norm": 12.227088884031177, + "language_loss": 0.88231307, + "learning_rate": 3.1704919293851334e-06, + "loss": 0.90454566, + "num_input_tokens_seen": 115015775, + "router_z_loss_clip": 3.77734375, + "router_z_loss_mlp": 0.44042969, + "step": 5355, + "time_per_iteration": 2.736206531524658 + }, + { + "auxiliary_loss_clip": 0.01761099, + "auxiliary_loss_mlp": 0.00431673, + "balance_loss_clip": 1.38397467, + "balance_loss_mlp": 0.39130852, + "epoch": 0.3220201412896438, + "flos": 14939126939520.0, + "grad_norm": 2.2280261303343267, + "language_loss": 0.76718211, + "learning_rate": 3.1701761099234597e-06, + "loss": 0.78910983, + "num_input_tokens_seen": 115034265, + "router_z_loss_clip": 3.7734375, + "router_z_loss_mlp": 0.40356445, + "step": 5356, + "time_per_iteration": 2.684670925140381 + }, + { + "auxiliary_loss_clip": 0.01809588, + "auxiliary_loss_mlp": 0.0045967, + "balance_loss_clip": 1.39688182, + "balance_loss_mlp": 0.41420311, + "epoch": 0.32208026454231176, + "flos": 22667389649280.0, + "grad_norm": 5.717652274955718, + "language_loss": 0.76678914, + "learning_rate": 3.1698602460885903e-06, + "loss": 0.78948176, + "num_input_tokens_seen": 115051945, + "router_z_loss_clip": 4.125, + "router_z_loss_mlp": 0.45458984, + "step": 5357, + "time_per_iteration": 2.772672653198242 + }, + { + "auxiliary_loss_clip": 0.02034832, + "auxiliary_loss_mlp": 0.00110077, + "balance_loss_clip": 1.78668559, + "balance_loss_mlp": 0.09643945, + "epoch": 0.3221403877949797, + "flos": 64605130053120.0, + "grad_norm": 0.7449338119182936, + "language_loss": 0.58332777, + "learning_rate": 3.1695443378925035e-06, + "loss": 0.60477686, + "num_input_tokens_seen": 115119090, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.13671875, + "step": 5358, + "time_per_iteration": 3.2751173973083496 + }, + { + "auxiliary_loss_clip": 0.01753956, + "auxiliary_loss_mlp": 0.00458005, + "balance_loss_clip": 1.37029171, + "balance_loss_mlp": 0.41418332, + "epoch": 0.3222005110476477, + "flos": 20157019004160.0, + "grad_norm": 17.81984872717862, + "language_loss": 0.88275611, + "learning_rate": 3.1692283853471777e-06, + "loss": 0.90487576, + "num_input_tokens_seen": 115137755, + "router_z_loss_clip": 3.83398438, + "router_z_loss_mlp": 0.43847656, + "step": 5359, + "time_per_iteration": 2.649867057800293 + }, + { + "auxiliary_loss_clip": 0.01742643, + "auxiliary_loss_mlp": 0.00445392, + "balance_loss_clip": 1.36888528, + "balance_loss_mlp": 0.40438405, + "epoch": 0.32226063430031565, + "flos": 22674177319680.0, + "grad_norm": 60.58402497155927, + "language_loss": 0.84699863, + "learning_rate": 3.168912388464595e-06, + "loss": 0.86887896, + "num_input_tokens_seen": 115158150, + "router_z_loss_clip": 3.73828125, + "router_z_loss_mlp": 0.41015625, + "step": 5360, + "time_per_iteration": 2.6827168464660645 + }, + { + "auxiliary_loss_clip": 0.02021787, + "auxiliary_loss_mlp": 0.00116668, + "balance_loss_clip": 1.77722979, + "balance_loss_mlp": 0.10527184, + "epoch": 0.3223207575529836, + "flos": 63828525075840.0, + "grad_norm": 0.6424278133447734, + "language_loss": 0.56201422, + "learning_rate": 3.168596347256737e-06, + "loss": 0.58339882, + "num_input_tokens_seen": 115212755, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.11376953, + "step": 5361, + "time_per_iteration": 3.0779762268066406 + }, + { + "auxiliary_loss_clip": 0.0176141, + "auxiliary_loss_mlp": 0.00451613, + "balance_loss_clip": 1.37866449, + "balance_loss_mlp": 0.40648085, + "epoch": 0.3223808808056516, + "flos": 26870123537280.0, + "grad_norm": 8.077084231627092, + "language_loss": 0.77284688, + "learning_rate": 3.168280261735588e-06, + "loss": 0.79497707, + "num_input_tokens_seen": 115233090, + "router_z_loss_clip": 3.83007812, + "router_z_loss_mlp": 0.45141602, + "step": 5362, + "time_per_iteration": 2.7750682830810547 + }, + { + "auxiliary_loss_clip": 0.01760361, + "auxiliary_loss_mlp": 0.00469238, + "balance_loss_clip": 1.37322128, + "balance_loss_mlp": 0.42334259, + "epoch": 0.32244100405831955, + "flos": 26761350176640.0, + "grad_norm": 8.81325571355015, + "language_loss": 0.78271919, + "learning_rate": 3.167964131913135e-06, + "loss": 0.80501521, + "num_input_tokens_seen": 115252645, + "router_z_loss_clip": 3.86914062, + "router_z_loss_mlp": 0.45922852, + "step": 5363, + "time_per_iteration": 2.7448463439941406 + }, + { + "auxiliary_loss_clip": 0.01762042, + "auxiliary_loss_mlp": 0.00480878, + "balance_loss_clip": 1.36235106, + "balance_loss_mlp": 0.43524519, + "epoch": 0.3225011273109875, + "flos": 23803029020160.0, + "grad_norm": 297.1902528303222, + "language_loss": 0.80552548, + "learning_rate": 3.167647957801365e-06, + "loss": 0.82795471, + "num_input_tokens_seen": 115269085, + "router_z_loss_clip": 3.9921875, + "router_z_loss_mlp": 0.45629883, + "step": 5364, + "time_per_iteration": 2.6377387046813965 + }, + { + "auxiliary_loss_clip": 0.01768046, + "auxiliary_loss_mlp": 0.00469002, + "balance_loss_clip": 1.385427, + "balance_loss_mlp": 0.42258191, + "epoch": 0.3225612505636555, + "flos": 17274505501440.0, + "grad_norm": 3.9357790984981835, + "language_loss": 0.81346393, + "learning_rate": 3.1673317394122672e-06, + "loss": 0.83583438, + "num_input_tokens_seen": 115286470, + "router_z_loss_clip": 3.83007812, + "router_z_loss_mlp": 0.46411133, + "step": 5365, + "time_per_iteration": 2.5915067195892334 + }, + { + "auxiliary_loss_clip": 0.01766733, + "auxiliary_loss_mlp": 0.00487688, + "balance_loss_clip": 1.37798953, + "balance_loss_mlp": 0.442913, + "epoch": 0.32262137381632344, + "flos": 23366247638400.0, + "grad_norm": 4.477916868887371, + "language_loss": 0.81284535, + "learning_rate": 3.1670154767578333e-06, + "loss": 0.83538949, + "num_input_tokens_seen": 115307000, + "router_z_loss_clip": 3.890625, + "router_z_loss_mlp": 0.44775391, + "step": 5366, + "time_per_iteration": 2.6553051471710205 + }, + { + "auxiliary_loss_clip": 0.01756767, + "auxiliary_loss_mlp": 0.00451975, + "balance_loss_clip": 1.3754313, + "balance_loss_mlp": 0.4084878, + "epoch": 0.3226814970689914, + "flos": 23258803080960.0, + "grad_norm": 10.078397649601087, + "language_loss": 0.78411591, + "learning_rate": 3.166699169850055e-06, + "loss": 0.80620337, + "num_input_tokens_seen": 115325925, + "router_z_loss_clip": 3.80859375, + "router_z_loss_mlp": 0.43481445, + "step": 5367, + "time_per_iteration": 2.6642096042633057 + }, + { + "auxiliary_loss_clip": 0.0177465, + "auxiliary_loss_mlp": 0.00444261, + "balance_loss_clip": 1.38732529, + "balance_loss_mlp": 0.40225124, + "epoch": 0.32274162032165943, + "flos": 16395196561920.0, + "grad_norm": 10.265467161621235, + "language_loss": 0.7992003, + "learning_rate": 3.1663828187009274e-06, + "loss": 0.82138938, + "num_input_tokens_seen": 115343705, + "router_z_loss_clip": 3.87109375, + "router_z_loss_mlp": 0.42016602, + "step": 5368, + "time_per_iteration": 2.6223528385162354 + }, + { + "auxiliary_loss_clip": 0.01754651, + "auxiliary_loss_mlp": 0.00463158, + "balance_loss_clip": 1.3767519, + "balance_loss_mlp": 0.42021933, + "epoch": 0.3228017435743274, + "flos": 27855081354240.0, + "grad_norm": 5.634633015266428, + "language_loss": 0.84334195, + "learning_rate": 3.1660664233224467e-06, + "loss": 0.86552006, + "num_input_tokens_seen": 115364170, + "router_z_loss_clip": 3.77734375, + "router_z_loss_mlp": 0.42919922, + "step": 5369, + "time_per_iteration": 2.815669298171997 + }, + { + "auxiliary_loss_clip": 0.01742166, + "auxiliary_loss_mlp": 0.00451284, + "balance_loss_clip": 1.37082553, + "balance_loss_mlp": 0.40958509, + "epoch": 0.32286186682699536, + "flos": 19608770741760.0, + "grad_norm": 28.087854046742503, + "language_loss": 0.87376636, + "learning_rate": 3.16574998372661e-06, + "loss": 0.89570081, + "num_input_tokens_seen": 115382495, + "router_z_loss_clip": 3.71679688, + "router_z_loss_mlp": 0.41723633, + "step": 5370, + "time_per_iteration": 2.733877658843994 + }, + { + "auxiliary_loss_clip": 0.01756079, + "auxiliary_loss_mlp": 0.00468385, + "balance_loss_clip": 1.37749124, + "balance_loss_mlp": 0.42537397, + "epoch": 0.3229219900796633, + "flos": 24134017870080.0, + "grad_norm": 18.59988933227063, + "language_loss": 0.87593722, + "learning_rate": 3.1654334999254177e-06, + "loss": 0.8981818, + "num_input_tokens_seen": 115399450, + "router_z_loss_clip": 3.78515625, + "router_z_loss_mlp": 0.43041992, + "step": 5371, + "time_per_iteration": 2.714113473892212 + }, + { + "auxiliary_loss_clip": 0.01758871, + "auxiliary_loss_mlp": 0.00510389, + "balance_loss_clip": 1.36895227, + "balance_loss_mlp": 0.46311104, + "epoch": 0.3229821133323313, + "flos": 17748705876480.0, + "grad_norm": 11.121726575004729, + "language_loss": 0.94626832, + "learning_rate": 3.1651169719308695e-06, + "loss": 0.96896088, + "num_input_tokens_seen": 115417700, + "router_z_loss_clip": 3.89648438, + "router_z_loss_mlp": 0.47338867, + "step": 5372, + "time_per_iteration": 2.7691824436187744 + }, + { + "auxiliary_loss_clip": 0.01739104, + "auxiliary_loss_mlp": 0.00488996, + "balance_loss_clip": 1.36094296, + "balance_loss_mlp": 0.4442451, + "epoch": 0.32304223658499925, + "flos": 22346025644160.0, + "grad_norm": 3.397076080910925, + "language_loss": 0.76769137, + "learning_rate": 3.1648003997549694e-06, + "loss": 0.78997231, + "num_input_tokens_seen": 115435840, + "router_z_loss_clip": 3.78125, + "router_z_loss_mlp": 0.44775391, + "step": 5373, + "time_per_iteration": 2.664355993270874 + }, + { + "auxiliary_loss_clip": 0.01733184, + "auxiliary_loss_mlp": 0.0051101, + "balance_loss_clip": 1.36162257, + "balance_loss_mlp": 0.46640217, + "epoch": 0.3231023598376672, + "flos": 18478302929280.0, + "grad_norm": 4.570110681734641, + "language_loss": 0.84853977, + "learning_rate": 3.1644837834097214e-06, + "loss": 0.87098169, + "num_input_tokens_seen": 115454210, + "router_z_loss_clip": 3.71679688, + "router_z_loss_mlp": 0.44628906, + "step": 5374, + "time_per_iteration": 2.627004861831665 + }, + { + "auxiliary_loss_clip": 0.01734836, + "auxiliary_loss_mlp": 0.00508167, + "balance_loss_clip": 1.36226296, + "balance_loss_mlp": 0.4635348, + "epoch": 0.3231624830903352, + "flos": 27636313570560.0, + "grad_norm": 209.26717321525177, + "language_loss": 0.92479718, + "learning_rate": 3.1641671229071317e-06, + "loss": 0.94722712, + "num_input_tokens_seen": 115471785, + "router_z_loss_clip": 3.72851562, + "router_z_loss_mlp": 0.4465332, + "step": 5375, + "time_per_iteration": 2.7411937713623047 + }, + { + "auxiliary_loss_clip": 0.01749977, + "auxiliary_loss_mlp": 0.00497339, + "balance_loss_clip": 1.35978281, + "balance_loss_mlp": 0.44996512, + "epoch": 0.32322260634300315, + "flos": 21726423014400.0, + "grad_norm": 243.092844268399, + "language_loss": 0.80099702, + "learning_rate": 3.1638504182592076e-06, + "loss": 0.82347012, + "num_input_tokens_seen": 115491405, + "router_z_loss_clip": 3.90429688, + "router_z_loss_mlp": 0.47387695, + "step": 5376, + "time_per_iteration": 2.6381583213806152 + }, + { + "auxiliary_loss_clip": 0.01739849, + "auxiliary_loss_mlp": 0.00488411, + "balance_loss_clip": 1.35693693, + "balance_loss_mlp": 0.44563907, + "epoch": 0.3232827295956711, + "flos": 22637656166400.0, + "grad_norm": 62.53695825974241, + "language_loss": 0.7101441, + "learning_rate": 3.1635336694779594e-06, + "loss": 0.7324267, + "num_input_tokens_seen": 115511555, + "router_z_loss_clip": 3.83007812, + "router_z_loss_mlp": 0.42749023, + "step": 5377, + "time_per_iteration": 2.7161648273468018 + }, + { + "auxiliary_loss_clip": 0.01744746, + "auxiliary_loss_mlp": 0.00484623, + "balance_loss_clip": 1.35918999, + "balance_loss_mlp": 0.43655822, + "epoch": 0.3233428528483391, + "flos": 26322593546880.0, + "grad_norm": 38.38556851047563, + "language_loss": 0.77251601, + "learning_rate": 3.1632168765753982e-06, + "loss": 0.7948097, + "num_input_tokens_seen": 115532860, + "router_z_loss_clip": 3.85742188, + "router_z_loss_mlp": 0.48046875, + "step": 5378, + "time_per_iteration": 4.16313099861145 + }, + { + "auxiliary_loss_clip": 0.01767853, + "auxiliary_loss_mlp": 0.00469403, + "balance_loss_clip": 1.38380146, + "balance_loss_mlp": 0.42722651, + "epoch": 0.32340297610100704, + "flos": 28585217111040.0, + "grad_norm": 21.220625828874667, + "language_loss": 0.88555533, + "learning_rate": 3.1629000395635357e-06, + "loss": 0.90792787, + "num_input_tokens_seen": 115553850, + "router_z_loss_clip": 3.83984375, + "router_z_loss_mlp": 0.42211914, + "step": 5379, + "time_per_iteration": 2.8018434047698975 + }, + { + "auxiliary_loss_clip": 0.01764341, + "auxiliary_loss_mlp": 0.00509123, + "balance_loss_clip": 1.36760855, + "balance_loss_mlp": 0.46611181, + "epoch": 0.323463099353675, + "flos": 30773792787840.0, + "grad_norm": 29.115757878277655, + "language_loss": 0.82945299, + "learning_rate": 3.162583158454388e-06, + "loss": 0.85218763, + "num_input_tokens_seen": 115575530, + "router_z_loss_clip": 3.97265625, + "router_z_loss_mlp": 0.43017578, + "step": 5380, + "time_per_iteration": 4.112062454223633 + }, + { + "auxiliary_loss_clip": 0.01757828, + "auxiliary_loss_mlp": 0.00509655, + "balance_loss_clip": 1.36506498, + "balance_loss_mlp": 0.46161401, + "epoch": 0.32352322260634303, + "flos": 25228610974080.0, + "grad_norm": 190.96577913615303, + "language_loss": 0.8286109, + "learning_rate": 3.1622662332599697e-06, + "loss": 0.8512857, + "num_input_tokens_seen": 115594885, + "router_z_loss_clip": 3.9296875, + "router_z_loss_mlp": 0.48071289, + "step": 5381, + "time_per_iteration": 2.7150115966796875 + }, + { + "auxiliary_loss_clip": 0.0173355, + "auxiliary_loss_mlp": 0.00451775, + "balance_loss_clip": 1.36081195, + "balance_loss_mlp": 0.41036174, + "epoch": 0.323583345859011, + "flos": 23330480670720.0, + "grad_norm": 7.090879928974393, + "language_loss": 0.7803489, + "learning_rate": 3.1619492639922998e-06, + "loss": 0.80220217, + "num_input_tokens_seen": 115614080, + "router_z_loss_clip": 3.72460938, + "router_z_loss_mlp": 0.4140625, + "step": 5382, + "time_per_iteration": 2.828624725341797 + }, + { + "auxiliary_loss_clip": 0.01758019, + "auxiliary_loss_mlp": 0.00508798, + "balance_loss_clip": 1.36640906, + "balance_loss_mlp": 0.464834, + "epoch": 0.32364346911167896, + "flos": 26207499392640.0, + "grad_norm": 2.835303541111796, + "language_loss": 0.78023958, + "learning_rate": 3.1616322506633964e-06, + "loss": 0.80290776, + "num_input_tokens_seen": 115632820, + "router_z_loss_clip": 3.91796875, + "router_z_loss_mlp": 0.43969727, + "step": 5383, + "time_per_iteration": 2.8823087215423584 + }, + { + "auxiliary_loss_clip": 0.01768518, + "auxiliary_loss_mlp": 0.00455598, + "balance_loss_clip": 1.38711429, + "balance_loss_mlp": 0.41625881, + "epoch": 0.3237035923643469, + "flos": 23695764030720.0, + "grad_norm": 29.59502525042219, + "language_loss": 0.82999772, + "learning_rate": 3.161315193285283e-06, + "loss": 0.85223883, + "num_input_tokens_seen": 115652860, + "router_z_loss_clip": 3.8125, + "router_z_loss_mlp": 0.39331055, + "step": 5384, + "time_per_iteration": 2.7238235473632812 + }, + { + "auxiliary_loss_clip": 0.01762258, + "auxiliary_loss_mlp": 0.00488322, + "balance_loss_clip": 1.36154497, + "balance_loss_mlp": 0.44202131, + "epoch": 0.3237637156170149, + "flos": 14428728633600.0, + "grad_norm": 633.8875870087014, + "language_loss": 0.82864565, + "learning_rate": 3.16099809186998e-06, + "loss": 0.85115147, + "num_input_tokens_seen": 115670940, + "router_z_loss_clip": 4.0078125, + "router_z_loss_mlp": 0.46313477, + "step": 5385, + "time_per_iteration": 4.077510118484497 + }, + { + "auxiliary_loss_clip": 0.01791221, + "auxiliary_loss_mlp": 0.00473478, + "balance_loss_clip": 1.39400315, + "balance_loss_mlp": 0.42884657, + "epoch": 0.32382383886968286, + "flos": 31062981185280.0, + "grad_norm": 3.8052127767330015, + "language_loss": 0.78033972, + "learning_rate": 3.1606809464295145e-06, + "loss": 0.80298662, + "num_input_tokens_seen": 115691155, + "router_z_loss_clip": 3.97265625, + "router_z_loss_mlp": 0.4465332, + "step": 5386, + "time_per_iteration": 2.8729071617126465 + }, + { + "auxiliary_loss_clip": 0.01763044, + "auxiliary_loss_mlp": 0.0050499, + "balance_loss_clip": 1.36552119, + "balance_loss_mlp": 0.45821202, + "epoch": 0.3238839621223508, + "flos": 23256935573760.0, + "grad_norm": 21.79709856254866, + "language_loss": 1.00606298, + "learning_rate": 3.1603637569759095e-06, + "loss": 1.02874327, + "num_input_tokens_seen": 115710340, + "router_z_loss_clip": 3.97265625, + "router_z_loss_mlp": 0.4675293, + "step": 5387, + "time_per_iteration": 2.7540955543518066 + }, + { + "auxiliary_loss_clip": 0.01785306, + "auxiliary_loss_mlp": 0.0050496, + "balance_loss_clip": 1.38349724, + "balance_loss_mlp": 0.45446324, + "epoch": 0.3239440853750188, + "flos": 22964658606720.0, + "grad_norm": 46.549576029520495, + "language_loss": 0.84239417, + "learning_rate": 3.1600465235211956e-06, + "loss": 0.86529684, + "num_input_tokens_seen": 115726745, + "router_z_loss_clip": 4.01757812, + "router_z_loss_mlp": 0.50512695, + "step": 5388, + "time_per_iteration": 2.641906976699829 + }, + { + "auxiliary_loss_clip": 0.01753359, + "auxiliary_loss_mlp": 0.0050422, + "balance_loss_clip": 1.36329293, + "balance_loss_mlp": 0.45925438, + "epoch": 0.32400420862768675, + "flos": 36246614653440.0, + "grad_norm": 7.51814937407651, + "language_loss": 0.77521807, + "learning_rate": 3.1597292460774006e-06, + "loss": 0.79779387, + "num_input_tokens_seen": 115749385, + "router_z_loss_clip": 3.8984375, + "router_z_loss_mlp": 0.44970703, + "step": 5389, + "time_per_iteration": 2.7651724815368652 + }, + { + "auxiliary_loss_clip": 0.0173168, + "auxiliary_loss_mlp": 0.00483741, + "balance_loss_clip": 1.35699928, + "balance_loss_mlp": 0.44003853, + "epoch": 0.3240643318803547, + "flos": 21616500418560.0, + "grad_norm": 426.323453519577, + "language_loss": 0.86353457, + "learning_rate": 3.159411924656557e-06, + "loss": 0.88568884, + "num_input_tokens_seen": 115768105, + "router_z_loss_clip": 3.74414062, + "router_z_loss_mlp": 0.43701172, + "step": 5390, + "time_per_iteration": 2.6310060024261475 + }, + { + "auxiliary_loss_clip": 0.01782724, + "auxiliary_loss_mlp": 0.00492853, + "balance_loss_clip": 1.3941164, + "balance_loss_mlp": 0.44595596, + "epoch": 0.3241244551330227, + "flos": 23295611543040.0, + "grad_norm": 3.580452327747769, + "language_loss": 0.78380203, + "learning_rate": 3.1590945592706967e-06, + "loss": 0.80655777, + "num_input_tokens_seen": 115787340, + "router_z_loss_clip": 3.88476562, + "router_z_loss_mlp": 0.46899414, + "step": 5391, + "time_per_iteration": 4.105253219604492 + }, + { + "auxiliary_loss_clip": 0.01738834, + "auxiliary_loss_mlp": 0.00454613, + "balance_loss_clip": 1.36244833, + "balance_loss_mlp": 0.41277072, + "epoch": 0.32418457838569065, + "flos": 14097236993280.0, + "grad_norm": 2.788761463833981, + "language_loss": 0.82459867, + "learning_rate": 3.158777149931855e-06, + "loss": 0.84653312, + "num_input_tokens_seen": 115805565, + "router_z_loss_clip": 3.765625, + "router_z_loss_mlp": 0.41821289, + "step": 5392, + "time_per_iteration": 2.6442453861236572 + }, + { + "auxiliary_loss_clip": 0.01759943, + "auxiliary_loss_mlp": 0.00467576, + "balance_loss_clip": 1.37656784, + "balance_loss_mlp": 0.42055973, + "epoch": 0.3242447016383586, + "flos": 29752672953600.0, + "grad_norm": 5.006453255503665, + "language_loss": 0.69378924, + "learning_rate": 3.158459696652067e-06, + "loss": 0.71606439, + "num_input_tokens_seen": 115826725, + "router_z_loss_clip": 3.83203125, + "router_z_loss_mlp": 0.4699707, + "step": 5393, + "time_per_iteration": 2.7339701652526855 + }, + { + "auxiliary_loss_clip": 0.01729172, + "auxiliary_loss_mlp": 0.00437734, + "balance_loss_clip": 1.35934377, + "balance_loss_mlp": 0.39386538, + "epoch": 0.3243048248910266, + "flos": 24351205455360.0, + "grad_norm": 10.878870564428043, + "language_loss": 0.88646066, + "learning_rate": 3.158142199443371e-06, + "loss": 0.90812969, + "num_input_tokens_seen": 115846955, + "router_z_loss_clip": 3.70117188, + "router_z_loss_mlp": 0.43847656, + "step": 5394, + "time_per_iteration": 2.737839937210083 + }, + { + "auxiliary_loss_clip": 0.01730989, + "auxiliary_loss_mlp": 0.00450533, + "balance_loss_clip": 1.37263477, + "balance_loss_mlp": 0.41167068, + "epoch": 0.3243649481436946, + "flos": 24353037048960.0, + "grad_norm": 6.136108248963986, + "language_loss": 0.88194394, + "learning_rate": 3.1578246583178076e-06, + "loss": 0.90375918, + "num_input_tokens_seen": 115865975, + "router_z_loss_clip": 3.58398438, + "router_z_loss_mlp": 0.38842773, + "step": 5395, + "time_per_iteration": 2.801877975463867 + }, + { + "auxiliary_loss_clip": 0.0174041, + "auxiliary_loss_mlp": 0.00466956, + "balance_loss_clip": 1.37642741, + "balance_loss_mlp": 0.4230873, + "epoch": 0.32442507139636256, + "flos": 22925228451840.0, + "grad_norm": 3.7057434064723282, + "language_loss": 0.88345754, + "learning_rate": 3.157507073287417e-06, + "loss": 0.90553117, + "num_input_tokens_seen": 115884950, + "router_z_loss_clip": 3.63867188, + "router_z_loss_mlp": 0.43847656, + "step": 5396, + "time_per_iteration": 2.681025981903076 + }, + { + "auxiliary_loss_clip": 0.01763877, + "auxiliary_loss_mlp": 0.00479246, + "balance_loss_clip": 1.37925172, + "balance_loss_mlp": 0.43227726, + "epoch": 0.32448519464903053, + "flos": 22200192426240.0, + "grad_norm": 1.9847215204788546, + "language_loss": 0.83170199, + "learning_rate": 3.1571894443642414e-06, + "loss": 0.85413325, + "num_input_tokens_seen": 115904170, + "router_z_loss_clip": 3.84765625, + "router_z_loss_mlp": 0.46948242, + "step": 5397, + "time_per_iteration": 2.619535446166992 + }, + { + "auxiliary_loss_clip": 0.01715311, + "auxiliary_loss_mlp": 0.00461845, + "balance_loss_clip": 1.35306799, + "balance_loss_mlp": 0.41969228, + "epoch": 0.3245453179016985, + "flos": 18838450644480.0, + "grad_norm": 75.3545319744597, + "language_loss": 0.74081826, + "learning_rate": 3.1568717715603263e-06, + "loss": 0.76258987, + "num_input_tokens_seen": 115919255, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 0.42163086, + "step": 5398, + "time_per_iteration": 2.6087658405303955 + }, + { + "auxiliary_loss_clip": 0.01755023, + "auxiliary_loss_mlp": 0.00464051, + "balance_loss_clip": 1.38191199, + "balance_loss_mlp": 0.42096841, + "epoch": 0.32460544115436646, + "flos": 21178390233600.0, + "grad_norm": 8.53060412121265, + "language_loss": 0.78023934, + "learning_rate": 3.156554054887718e-06, + "loss": 0.80243003, + "num_input_tokens_seen": 115938535, + "router_z_loss_clip": 3.73242188, + "router_z_loss_mlp": 0.4309082, + "step": 5399, + "time_per_iteration": 2.67522931098938 + }, + { + "auxiliary_loss_clip": 0.01764904, + "auxiliary_loss_mlp": 0.00457798, + "balance_loss_clip": 1.38276327, + "balance_loss_mlp": 0.41225994, + "epoch": 0.3246655644070344, + "flos": 21981137333760.0, + "grad_norm": 4.419731305592811, + "language_loss": 0.7683745, + "learning_rate": 3.1562362943584645e-06, + "loss": 0.79060155, + "num_input_tokens_seen": 115955005, + "router_z_loss_clip": 3.8203125, + "router_z_loss_mlp": 0.45507812, + "step": 5400, + "time_per_iteration": 2.628851890563965 + }, + { + "auxiliary_loss_clip": 0.01717169, + "auxiliary_loss_mlp": 0.0047518, + "balance_loss_clip": 1.34822524, + "balance_loss_mlp": 0.43112031, + "epoch": 0.3247256876597024, + "flos": 32159729105280.0, + "grad_norm": 58.467045960401705, + "language_loss": 0.8635844, + "learning_rate": 3.155918489984614e-06, + "loss": 0.88550788, + "num_input_tokens_seen": 115975305, + "router_z_loss_clip": 3.68945312, + "router_z_loss_mlp": 0.44042969, + "step": 5401, + "time_per_iteration": 2.7424001693725586 + }, + { + "auxiliary_loss_clip": 0.01752992, + "auxiliary_loss_mlp": 0.00447132, + "balance_loss_clip": 1.37876058, + "balance_loss_mlp": 0.40028319, + "epoch": 0.32478581091237035, + "flos": 20997544233600.0, + "grad_norm": 4.95861621361043, + "language_loss": 0.91762924, + "learning_rate": 3.1556006417782196e-06, + "loss": 0.93963051, + "num_input_tokens_seen": 115994810, + "router_z_loss_clip": 3.7421875, + "router_z_loss_mlp": 0.46875, + "step": 5402, + "time_per_iteration": 2.653144359588623 + }, + { + "auxiliary_loss_clip": 0.0168881, + "auxiliary_loss_mlp": 0.00420563, + "balance_loss_clip": 1.34411681, + "balance_loss_mlp": 0.3817009, + "epoch": 0.3248459341650383, + "flos": 17924990849280.0, + "grad_norm": 22.679003451762107, + "language_loss": 0.89334285, + "learning_rate": 3.155282749751332e-06, + "loss": 0.91443658, + "num_input_tokens_seen": 116011095, + "router_z_loss_clip": 3.44726562, + "router_z_loss_mlp": 0.38867188, + "step": 5403, + "time_per_iteration": 2.6331961154937744 + }, + { + "auxiliary_loss_clip": 0.01678712, + "auxiliary_loss_mlp": 0.00415446, + "balance_loss_clip": 1.34061766, + "balance_loss_mlp": 0.3768456, + "epoch": 0.3249060574177063, + "flos": 24535606901760.0, + "grad_norm": 108.43416284042576, + "language_loss": 0.91988266, + "learning_rate": 3.154964813916007e-06, + "loss": 0.94082427, + "num_input_tokens_seen": 116028805, + "router_z_loss_clip": 3.3828125, + "router_z_loss_mlp": 0.38623047, + "step": 5404, + "time_per_iteration": 2.819408655166626 + }, + { + "auxiliary_loss_clip": 0.01681394, + "auxiliary_loss_mlp": 0.00415338, + "balance_loss_clip": 1.33690882, + "balance_loss_mlp": 0.3778345, + "epoch": 0.32496618067037425, + "flos": 25994765093760.0, + "grad_norm": 17.638890644193594, + "language_loss": 0.7810849, + "learning_rate": 3.1546468342843008e-06, + "loss": 0.80205226, + "num_input_tokens_seen": 116047765, + "router_z_loss_clip": 3.4453125, + "router_z_loss_mlp": 0.37524414, + "step": 5405, + "time_per_iteration": 2.7167999744415283 + }, + { + "auxiliary_loss_clip": 0.01718972, + "auxiliary_loss_mlp": 0.00413575, + "balance_loss_clip": 1.36414552, + "balance_loss_mlp": 0.37459368, + "epoch": 0.3250263039230422, + "flos": 19573757959680.0, + "grad_norm": 8.457375672989178, + "language_loss": 0.89439523, + "learning_rate": 3.1543288108682707e-06, + "loss": 0.9157207, + "num_input_tokens_seen": 116068385, + "router_z_loss_clip": 3.55273438, + "router_z_loss_mlp": 0.38989258, + "step": 5406, + "time_per_iteration": 2.756756544113159 + }, + { + "auxiliary_loss_clip": 0.01727502, + "auxiliary_loss_mlp": 0.00476203, + "balance_loss_clip": 1.37260473, + "balance_loss_mlp": 0.43300164, + "epoch": 0.3250864271757102, + "flos": 16763640318720.0, + "grad_norm": 3.4445528084286203, + "language_loss": 0.9188754, + "learning_rate": 3.1540107436799764e-06, + "loss": 0.94091243, + "num_input_tokens_seen": 116085350, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 0.43188477, + "step": 5407, + "time_per_iteration": 2.661437749862671 + }, + { + "auxiliary_loss_clip": 0.0170032, + "auxiliary_loss_mlp": 0.00496175, + "balance_loss_clip": 1.35150337, + "balance_loss_mlp": 0.45199615, + "epoch": 0.3251465504283782, + "flos": 27819458040960.0, + "grad_norm": 1.7576310552766574, + "language_loss": 0.72780395, + "learning_rate": 3.153692632731479e-06, + "loss": 0.74976885, + "num_input_tokens_seen": 116107560, + "router_z_loss_clip": 3.49023438, + "router_z_loss_mlp": 0.44189453, + "step": 5408, + "time_per_iteration": 2.735755443572998 + }, + { + "auxiliary_loss_clip": 0.01699982, + "auxiliary_loss_mlp": 0.00492609, + "balance_loss_clip": 1.33868575, + "balance_loss_mlp": 0.45026553, + "epoch": 0.32520667368104617, + "flos": 19063144172160.0, + "grad_norm": 39.5961323176261, + "language_loss": 0.83086157, + "learning_rate": 3.153374478034841e-06, + "loss": 0.85278749, + "num_input_tokens_seen": 116125980, + "router_z_loss_clip": 3.61328125, + "router_z_loss_mlp": 0.42333984, + "step": 5409, + "time_per_iteration": 2.6374149322509766 + }, + { + "auxiliary_loss_clip": 0.01682848, + "auxiliary_loss_mlp": 0.00520323, + "balance_loss_clip": 1.32887602, + "balance_loss_mlp": 0.47797978, + "epoch": 0.32526679693371413, + "flos": 29382146208000.0, + "grad_norm": 17.40190198297269, + "language_loss": 0.87303042, + "learning_rate": 3.1530562796021285e-06, + "loss": 0.89506215, + "num_input_tokens_seen": 116146530, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 0.42358398, + "step": 5410, + "time_per_iteration": 2.6688005924224854 + }, + { + "auxiliary_loss_clip": 0.01688417, + "auxiliary_loss_mlp": 0.00467247, + "balance_loss_clip": 1.35565615, + "balance_loss_mlp": 0.42886174, + "epoch": 0.3253269201863821, + "flos": 20704513080960.0, + "grad_norm": 1.8123045202325114, + "language_loss": 0.76095158, + "learning_rate": 3.152738037445405e-06, + "loss": 0.78250825, + "num_input_tokens_seen": 116165695, + "router_z_loss_clip": 3.328125, + "router_z_loss_mlp": 0.38378906, + "step": 5411, + "time_per_iteration": 2.6537535190582275 + }, + { + "auxiliary_loss_clip": 0.01665221, + "auxiliary_loss_mlp": 0.00483283, + "balance_loss_clip": 1.32434416, + "balance_loss_mlp": 0.4391993, + "epoch": 0.32538704343905006, + "flos": 29094142959360.0, + "grad_norm": 338.8647445684552, + "language_loss": 0.86297131, + "learning_rate": 3.1524197515767403e-06, + "loss": 0.88445634, + "num_input_tokens_seen": 116185375, + "router_z_loss_clip": 3.41210938, + "router_z_loss_mlp": 0.44091797, + "step": 5412, + "time_per_iteration": 2.735797643661499 + }, + { + "auxiliary_loss_clip": 0.01694534, + "auxiliary_loss_mlp": 0.0052623, + "balance_loss_clip": 1.34283745, + "balance_loss_mlp": 0.48183632, + "epoch": 0.325447166691718, + "flos": 24676124906880.0, + "grad_norm": 5.235351034014789, + "language_loss": 0.85710263, + "learning_rate": 3.152101422008203e-06, + "loss": 0.87931025, + "num_input_tokens_seen": 116204335, + "router_z_loss_clip": 3.51757812, + "router_z_loss_mlp": 0.44433594, + "step": 5413, + "time_per_iteration": 2.637565851211548 + }, + { + "auxiliary_loss_clip": 0.01681003, + "auxiliary_loss_mlp": 0.00526038, + "balance_loss_clip": 1.33821619, + "balance_loss_mlp": 0.48123962, + "epoch": 0.325507289944386, + "flos": 21543134889600.0, + "grad_norm": 3.3411484425489193, + "language_loss": 0.81992185, + "learning_rate": 3.151783048751864e-06, + "loss": 0.84199226, + "num_input_tokens_seen": 116222840, + "router_z_loss_clip": 3.42773438, + "router_z_loss_mlp": 0.44799805, + "step": 5414, + "time_per_iteration": 2.614631175994873 + }, + { + "auxiliary_loss_clip": 0.01972756, + "auxiliary_loss_mlp": 0.00125667, + "balance_loss_clip": 1.72370255, + "balance_loss_mlp": 0.11613069, + "epoch": 0.32556741319705396, + "flos": 71518722347520.0, + "grad_norm": 0.9168634537251126, + "language_loss": 0.6382606, + "learning_rate": 3.1514646318197965e-06, + "loss": 0.65924489, + "num_input_tokens_seen": 116274940, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.09521484, + "step": 5415, + "time_per_iteration": 3.1185145378112793 + }, + { + "auxiliary_loss_clip": 0.01646899, + "auxiliary_loss_mlp": 0.00549659, + "balance_loss_clip": 1.31337404, + "balance_loss_mlp": 0.50357282, + "epoch": 0.3256275364497219, + "flos": 23732428838400.0, + "grad_norm": 4.952079817775966, + "language_loss": 0.7929424, + "learning_rate": 3.151146171224075e-06, + "loss": 0.81490803, + "num_input_tokens_seen": 116297300, + "router_z_loss_clip": 3.3359375, + "router_z_loss_mlp": 0.46020508, + "step": 5416, + "time_per_iteration": 2.6879985332489014 + }, + { + "auxiliary_loss_clip": 0.01913783, + "auxiliary_loss_mlp": 0.00173828, + "balance_loss_clip": 1.68449891, + "balance_loss_mlp": 0.16300404, + "epoch": 0.3256876597023899, + "flos": 67289199891840.0, + "grad_norm": 0.770712207219402, + "language_loss": 0.5739345, + "learning_rate": 3.1508276669767757e-06, + "loss": 0.59481066, + "num_input_tokens_seen": 116362370, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.10839844, + "step": 5417, + "time_per_iteration": 3.203908681869507 + }, + { + "auxiliary_loss_clip": 0.01902927, + "auxiliary_loss_mlp": 0.00111675, + "balance_loss_clip": 1.66992319, + "balance_loss_mlp": 0.10094664, + "epoch": 0.32574778295505785, + "flos": 71282323964160.0, + "grad_norm": 0.8329424988425485, + "language_loss": 0.63572907, + "learning_rate": 3.150509119089975e-06, + "loss": 0.65587509, + "num_input_tokens_seen": 116430365, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.10742188, + "step": 5418, + "time_per_iteration": 3.252798318862915 + }, + { + "auxiliary_loss_clip": 0.01659694, + "auxiliary_loss_mlp": 0.00526432, + "balance_loss_clip": 1.32567668, + "balance_loss_mlp": 0.48296845, + "epoch": 0.3258079062077258, + "flos": 20776370238720.0, + "grad_norm": 5.686045105507039, + "language_loss": 0.75974709, + "learning_rate": 3.1501905275757537e-06, + "loss": 0.78160834, + "num_input_tokens_seen": 116447525, + "router_z_loss_clip": 3.33984375, + "router_z_loss_mlp": 0.43481445, + "step": 5419, + "time_per_iteration": 2.630409002304077 + }, + { + "auxiliary_loss_clip": 0.01674391, + "auxiliary_loss_mlp": 0.00565353, + "balance_loss_clip": 1.32911122, + "balance_loss_mlp": 0.51528478, + "epoch": 0.3258680294603938, + "flos": 22235456603520.0, + "grad_norm": 130.49762058746728, + "language_loss": 0.82562959, + "learning_rate": 3.1498718924461926e-06, + "loss": 0.84802711, + "num_input_tokens_seen": 116466310, + "router_z_loss_clip": 3.45117188, + "router_z_loss_mlp": 0.50073242, + "step": 5420, + "time_per_iteration": 4.106637477874756 + }, + { + "auxiliary_loss_clip": 0.01662306, + "auxiliary_loss_mlp": 0.00555609, + "balance_loss_clip": 1.3210659, + "balance_loss_mlp": 0.50628042, + "epoch": 0.3259281527130618, + "flos": 26979974305920.0, + "grad_norm": 9.557827752269842, + "language_loss": 0.84624422, + "learning_rate": 3.1495532137133736e-06, + "loss": 0.86842334, + "num_input_tokens_seen": 116487825, + "router_z_loss_clip": 3.41601562, + "router_z_loss_mlp": 0.49365234, + "step": 5421, + "time_per_iteration": 2.725907802581787 + }, + { + "auxiliary_loss_clip": 0.01660167, + "auxiliary_loss_mlp": 0.00611014, + "balance_loss_clip": 1.33262229, + "balance_loss_mlp": 0.55958766, + "epoch": 0.32598827596572977, + "flos": 26214251149440.0, + "grad_norm": 45.49535577367021, + "language_loss": 0.81002855, + "learning_rate": 3.149234491389381e-06, + "loss": 0.83274031, + "num_input_tokens_seen": 116509950, + "router_z_loss_clip": 3.27734375, + "router_z_loss_mlp": 0.51416016, + "step": 5422, + "time_per_iteration": 4.125760793685913 + }, + { + "auxiliary_loss_clip": 0.01672455, + "auxiliary_loss_mlp": 0.00596349, + "balance_loss_clip": 1.33226192, + "balance_loss_mlp": 0.54547042, + "epoch": 0.32604839921839773, + "flos": 17639752947840.0, + "grad_norm": 4.678913275859054, + "language_loss": 0.69772029, + "learning_rate": 3.1489157254863026e-06, + "loss": 0.72040832, + "num_input_tokens_seen": 116527695, + "router_z_loss_clip": 3.40039062, + "router_z_loss_mlp": 0.50878906, + "step": 5423, + "time_per_iteration": 2.604306936264038 + }, + { + "auxiliary_loss_clip": 0.01643395, + "auxiliary_loss_mlp": 0.00539062, + "balance_loss_clip": 1.31819844, + "balance_loss_mlp": 0.4918313, + "epoch": 0.3261085224710657, + "flos": 23622721724160.0, + "grad_norm": 80.00149126373606, + "language_loss": 0.80341262, + "learning_rate": 3.148596916016224e-06, + "loss": 0.82523715, + "num_input_tokens_seen": 116547800, + "router_z_loss_clip": 3.25195312, + "router_z_loss_mlp": 0.47265625, + "step": 5424, + "time_per_iteration": 2.699673891067505 + }, + { + "auxiliary_loss_clip": 0.01648276, + "auxiliary_loss_mlp": 0.00610886, + "balance_loss_clip": 1.31661916, + "balance_loss_mlp": 0.5598166, + "epoch": 0.32616864572373366, + "flos": 23260455106560.0, + "grad_norm": 5.896590109142663, + "language_loss": 0.81319028, + "learning_rate": 3.1482780629912355e-06, + "loss": 0.83578193, + "num_input_tokens_seen": 116568460, + "router_z_loss_clip": 3.3203125, + "router_z_loss_mlp": 0.51098633, + "step": 5425, + "time_per_iteration": 2.7810757160186768 + }, + { + "auxiliary_loss_clip": 0.01616663, + "auxiliary_loss_mlp": 0.006086, + "balance_loss_clip": 1.28477156, + "balance_loss_mlp": 0.55526614, + "epoch": 0.32622876897640163, + "flos": 25593427457280.0, + "grad_norm": 10.241206654472697, + "language_loss": 0.85522681, + "learning_rate": 3.147959166423428e-06, + "loss": 0.87747943, + "num_input_tokens_seen": 116588705, + "router_z_loss_clip": 3.3203125, + "router_z_loss_mlp": 0.53393555, + "step": 5426, + "time_per_iteration": 2.7226595878601074 + }, + { + "auxiliary_loss_clip": 0.01642244, + "auxiliary_loss_mlp": 0.00601613, + "balance_loss_clip": 1.3055824, + "balance_loss_mlp": 0.5525229, + "epoch": 0.3262888922290696, + "flos": 22418996123520.0, + "grad_norm": 11.950149187372197, + "language_loss": 0.79343951, + "learning_rate": 3.147640226324893e-06, + "loss": 0.81587803, + "num_input_tokens_seen": 116608845, + "router_z_loss_clip": 3.3671875, + "router_z_loss_mlp": 0.49121094, + "step": 5427, + "time_per_iteration": 4.085914134979248 + }, + { + "auxiliary_loss_clip": 0.0165128, + "auxiliary_loss_mlp": 0.00605383, + "balance_loss_clip": 1.31187224, + "balance_loss_mlp": 0.55483866, + "epoch": 0.32634901548173756, + "flos": 19718908819200.0, + "grad_norm": 20.960401779377484, + "language_loss": 0.84160268, + "learning_rate": 3.1473212427077266e-06, + "loss": 0.86416924, + "num_input_tokens_seen": 116628145, + "router_z_loss_clip": 3.390625, + "router_z_loss_mlp": 0.50512695, + "step": 5428, + "time_per_iteration": 2.630246162414551 + }, + { + "auxiliary_loss_clip": 0.01633431, + "auxiliary_loss_mlp": 0.00623736, + "balance_loss_clip": 1.30489945, + "balance_loss_mlp": 0.56990147, + "epoch": 0.3264091387344055, + "flos": 16142924367360.0, + "grad_norm": 11.152283468337448, + "language_loss": 0.76853275, + "learning_rate": 3.147002215584023e-06, + "loss": 0.79110444, + "num_input_tokens_seen": 116646920, + "router_z_loss_clip": 3.28710938, + "router_z_loss_mlp": 0.53833008, + "step": 5429, + "time_per_iteration": 2.6388838291168213 + }, + { + "auxiliary_loss_clip": 0.0165247, + "auxiliary_loss_mlp": 0.00590701, + "balance_loss_clip": 1.31648779, + "balance_loss_mlp": 0.54215902, + "epoch": 0.3264692619870735, + "flos": 16399075230720.0, + "grad_norm": 12.795348719742941, + "language_loss": 0.82734609, + "learning_rate": 3.146683144965881e-06, + "loss": 0.84977776, + "num_input_tokens_seen": 116665100, + "router_z_loss_clip": 3.35742188, + "router_z_loss_mlp": 0.4855957, + "step": 5430, + "time_per_iteration": 2.634023666381836 + }, + { + "auxiliary_loss_clip": 0.01685433, + "auxiliary_loss_mlp": 0.00595522, + "balance_loss_clip": 1.3449266, + "balance_loss_mlp": 0.54268873, + "epoch": 0.32652938523974145, + "flos": 22382331315840.0, + "grad_norm": 3.4534032476823793, + "language_loss": 0.88504297, + "learning_rate": 3.146364030865399e-06, + "loss": 0.90785253, + "num_input_tokens_seen": 116682205, + "router_z_loss_clip": 3.40625, + "router_z_loss_mlp": 0.52832031, + "step": 5431, + "time_per_iteration": 2.642017364501953 + }, + { + "auxiliary_loss_clip": 0.01607699, + "auxiliary_loss_mlp": 0.00549789, + "balance_loss_clip": 1.28813279, + "balance_loss_mlp": 0.50401282, + "epoch": 0.3265895084924094, + "flos": 21908059113600.0, + "grad_norm": 16.52947118815553, + "language_loss": 0.75243706, + "learning_rate": 3.146044873294678e-06, + "loss": 0.77401197, + "num_input_tokens_seen": 116702575, + "router_z_loss_clip": 3.1953125, + "router_z_loss_mlp": 0.45703125, + "step": 5432, + "time_per_iteration": 2.8431761264801025 + }, + { + "auxiliary_loss_clip": 0.01648722, + "auxiliary_loss_mlp": 0.00557027, + "balance_loss_clip": 1.31585526, + "balance_loss_mlp": 0.50955814, + "epoch": 0.3266496317450774, + "flos": 16067152627200.0, + "grad_norm": 57.016033323607175, + "language_loss": 0.89353561, + "learning_rate": 3.1457256722658203e-06, + "loss": 0.91559303, + "num_input_tokens_seen": 116720885, + "router_z_loss_clip": 3.33203125, + "router_z_loss_mlp": 0.47436523, + "step": 5433, + "time_per_iteration": 4.076584100723267 + }, + { + "auxiliary_loss_clip": 0.01619756, + "auxiliary_loss_mlp": 0.00562617, + "balance_loss_clip": 1.3001821, + "balance_loss_mlp": 0.51426589, + "epoch": 0.3267097549977454, + "flos": 22528236360960.0, + "grad_norm": 4.63628104358729, + "language_loss": 0.89520562, + "learning_rate": 3.145406427790931e-06, + "loss": 0.91702932, + "num_input_tokens_seen": 116740395, + "router_z_loss_clip": 3.19726562, + "router_z_loss_mlp": 0.48339844, + "step": 5434, + "time_per_iteration": 2.687565326690674 + }, + { + "auxiliary_loss_clip": 0.01647701, + "auxiliary_loss_mlp": 0.00562506, + "balance_loss_clip": 1.31622863, + "balance_loss_mlp": 0.51289129, + "epoch": 0.32676987825041337, + "flos": 27270419679360.0, + "grad_norm": 5.065249582473822, + "language_loss": 0.93420547, + "learning_rate": 3.1450871398821147e-06, + "loss": 0.95630753, + "num_input_tokens_seen": 116758870, + "router_z_loss_clip": 3.31640625, + "router_z_loss_mlp": 0.49633789, + "step": 5435, + "time_per_iteration": 2.6761550903320312 + }, + { + "auxiliary_loss_clip": 0.01651288, + "auxiliary_loss_mlp": 0.00568025, + "balance_loss_clip": 1.31861997, + "balance_loss_mlp": 0.51900643, + "epoch": 0.32683000150308134, + "flos": 11508257433600.0, + "grad_norm": 156.20832864833798, + "language_loss": 0.82516557, + "learning_rate": 3.144767808551479e-06, + "loss": 0.8473587, + "num_input_tokens_seen": 116773440, + "router_z_loss_clip": 3.32421875, + "router_z_loss_mlp": 0.4909668, + "step": 5436, + "time_per_iteration": 2.645685911178589 + }, + { + "auxiliary_loss_clip": 0.01633286, + "auxiliary_loss_mlp": 0.00530462, + "balance_loss_clip": 1.31273651, + "balance_loss_mlp": 0.48304003, + "epoch": 0.3268901247557493, + "flos": 25630200005760.0, + "grad_norm": 3.390763175484594, + "language_loss": 0.75472844, + "learning_rate": 3.144448433811134e-06, + "loss": 0.77636588, + "num_input_tokens_seen": 116794375, + "router_z_loss_clip": 3.203125, + "router_z_loss_mlp": 0.47436523, + "step": 5437, + "time_per_iteration": 2.7007668018341064 + }, + { + "auxiliary_loss_clip": 0.01639102, + "auxiliary_loss_mlp": 0.00582165, + "balance_loss_clip": 1.30209446, + "balance_loss_mlp": 0.52964121, + "epoch": 0.32695024800841727, + "flos": 24860849575680.0, + "grad_norm": 93.24123748770873, + "language_loss": 0.68825388, + "learning_rate": 3.144129015673189e-06, + "loss": 0.7104665, + "num_input_tokens_seen": 116815095, + "router_z_loss_clip": 3.375, + "router_z_loss_mlp": 0.52514648, + "step": 5438, + "time_per_iteration": 2.691960096359253 + }, + { + "auxiliary_loss_clip": 0.01662544, + "auxiliary_loss_mlp": 0.00496577, + "balance_loss_clip": 1.33509636, + "balance_loss_mlp": 0.45366174, + "epoch": 0.32701037126108523, + "flos": 28839249072000.0, + "grad_norm": 10.487936625501808, + "language_loss": 0.79824412, + "learning_rate": 3.1438095541497576e-06, + "loss": 0.81983542, + "num_input_tokens_seen": 116836630, + "router_z_loss_clip": 3.27734375, + "router_z_loss_mlp": 0.42944336, + "step": 5439, + "time_per_iteration": 2.73903489112854 + }, + { + "auxiliary_loss_clip": 0.01632093, + "auxiliary_loss_mlp": 0.00572777, + "balance_loss_clip": 1.30852914, + "balance_loss_mlp": 0.52175516, + "epoch": 0.3270704945137532, + "flos": 27965075777280.0, + "grad_norm": 6.235625892106406, + "language_loss": 0.78891122, + "learning_rate": 3.1434900492529527e-06, + "loss": 0.81095994, + "num_input_tokens_seen": 116856880, + "router_z_loss_clip": 3.23828125, + "router_z_loss_mlp": 0.51025391, + "step": 5440, + "time_per_iteration": 2.7226908206939697 + }, + { + "auxiliary_loss_clip": 0.01623689, + "auxiliary_loss_mlp": 0.00552092, + "balance_loss_clip": 1.30559421, + "balance_loss_mlp": 0.5063867, + "epoch": 0.32713061776642116, + "flos": 23690700213120.0, + "grad_norm": 16.533000287195666, + "language_loss": 0.88637275, + "learning_rate": 3.1431705009948914e-06, + "loss": 0.90813053, + "num_input_tokens_seen": 116873770, + "router_z_loss_clip": 3.18359375, + "router_z_loss_mlp": 0.45678711, + "step": 5441, + "time_per_iteration": 2.6751458644866943 + }, + { + "auxiliary_loss_clip": 0.01643225, + "auxiliary_loss_mlp": 0.00547514, + "balance_loss_clip": 1.30912137, + "balance_loss_mlp": 0.49680212, + "epoch": 0.3271907410190891, + "flos": 22455625017600.0, + "grad_norm": 35.81865654685802, + "language_loss": 0.90856844, + "learning_rate": 3.1428509093876897e-06, + "loss": 0.93047583, + "num_input_tokens_seen": 116891225, + "router_z_loss_clip": 3.34375, + "router_z_loss_mlp": 0.50708008, + "step": 5442, + "time_per_iteration": 2.701514959335327 + }, + { + "auxiliary_loss_clip": 0.01627213, + "auxiliary_loss_mlp": 0.0053528, + "balance_loss_clip": 1.29930472, + "balance_loss_mlp": 0.48816815, + "epoch": 0.3272508642717571, + "flos": 22820118278400.0, + "grad_norm": 2.16330705364133, + "language_loss": 0.81060863, + "learning_rate": 3.1425312744434668e-06, + "loss": 0.83223349, + "num_input_tokens_seen": 116912300, + "router_z_loss_clip": 3.27929688, + "router_z_loss_mlp": 0.47167969, + "step": 5443, + "time_per_iteration": 2.723113536834717 + }, + { + "auxiliary_loss_clip": 0.01642482, + "auxiliary_loss_mlp": 0.00560901, + "balance_loss_clip": 1.31188703, + "balance_loss_mlp": 0.5125016, + "epoch": 0.32731098752442506, + "flos": 11801360413440.0, + "grad_norm": 37.67681278188588, + "language_loss": 0.89205813, + "learning_rate": 3.142211596174343e-06, + "loss": 0.91409206, + "num_input_tokens_seen": 116929425, + "router_z_loss_clip": 3.30664062, + "router_z_loss_mlp": 0.48413086, + "step": 5444, + "time_per_iteration": 2.6586766242980957 + }, + { + "auxiliary_loss_clip": 0.01651855, + "auxiliary_loss_mlp": 0.0058903, + "balance_loss_clip": 1.31563413, + "balance_loss_mlp": 0.53564793, + "epoch": 0.327371110777093, + "flos": 21027780506880.0, + "grad_norm": 24.68992460405587, + "language_loss": 0.63863891, + "learning_rate": 3.1418918745924423e-06, + "loss": 0.66104776, + "num_input_tokens_seen": 116948255, + "router_z_loss_clip": 3.36328125, + "router_z_loss_mlp": 0.53442383, + "step": 5445, + "time_per_iteration": 2.7059619426727295 + }, + { + "auxiliary_loss_clip": 0.01620198, + "auxiliary_loss_mlp": 0.00576149, + "balance_loss_clip": 1.29869521, + "balance_loss_mlp": 0.52748734, + "epoch": 0.327431234029761, + "flos": 19062102677760.0, + "grad_norm": 2.3340214423567494, + "language_loss": 0.9258154, + "learning_rate": 3.1415721097098865e-06, + "loss": 0.94777894, + "num_input_tokens_seen": 116964905, + "router_z_loss_clip": 3.21484375, + "router_z_loss_mlp": 0.48632812, + "step": 5446, + "time_per_iteration": 2.6818618774414062 + }, + { + "auxiliary_loss_clip": 0.01655124, + "auxiliary_loss_mlp": 0.00601734, + "balance_loss_clip": 1.31691337, + "balance_loss_mlp": 0.54673052, + "epoch": 0.32749135728242895, + "flos": 25849219184640.0, + "grad_norm": 7.056093630370844, + "language_loss": 0.84576035, + "learning_rate": 3.141252301538802e-06, + "loss": 0.86832893, + "num_input_tokens_seen": 116983650, + "router_z_loss_clip": 3.3828125, + "router_z_loss_mlp": 0.55029297, + "step": 5447, + "time_per_iteration": 2.781341314315796 + }, + { + "auxiliary_loss_clip": 0.0162362, + "auxiliary_loss_mlp": 0.00552771, + "balance_loss_clip": 1.30433345, + "balance_loss_mlp": 0.50420493, + "epoch": 0.327551480535097, + "flos": 20120533764480.0, + "grad_norm": 3.559026544222916, + "language_loss": 0.78469062, + "learning_rate": 3.1409324500913157e-06, + "loss": 0.80645454, + "num_input_tokens_seen": 117003265, + "router_z_loss_clip": 3.1875, + "router_z_loss_mlp": 0.4855957, + "step": 5448, + "time_per_iteration": 2.6433005332946777 + }, + { + "auxiliary_loss_clip": 0.01620868, + "auxiliary_loss_mlp": 0.00583184, + "balance_loss_clip": 1.30444264, + "balance_loss_mlp": 0.52934897, + "epoch": 0.32761160378776494, + "flos": 28803553931520.0, + "grad_norm": 1.5046887710947887, + "language_loss": 0.71551991, + "learning_rate": 3.1406125553795567e-06, + "loss": 0.73756039, + "num_input_tokens_seen": 117025370, + "router_z_loss_clip": 3.16601562, + "router_z_loss_mlp": 0.53808594, + "step": 5449, + "time_per_iteration": 2.7022290229797363 + }, + { + "auxiliary_loss_clip": 0.01649799, + "auxiliary_loss_mlp": 0.00513288, + "balance_loss_clip": 1.3272239, + "balance_loss_mlp": 0.46691555, + "epoch": 0.3276717270404329, + "flos": 26937778803840.0, + "grad_norm": 76.71003442585912, + "language_loss": 0.70480764, + "learning_rate": 3.1402926174156556e-06, + "loss": 0.72643852, + "num_input_tokens_seen": 117044350, + "router_z_loss_clip": 3.22460938, + "router_z_loss_mlp": 0.46386719, + "step": 5450, + "time_per_iteration": 2.667043447494507 + }, + { + "auxiliary_loss_clip": 0.01635844, + "auxiliary_loss_mlp": 0.00516066, + "balance_loss_clip": 1.32088232, + "balance_loss_mlp": 0.47083831, + "epoch": 0.32773185029310087, + "flos": 25338425829120.0, + "grad_norm": 6.04832581832523, + "language_loss": 0.82720625, + "learning_rate": 3.1399726362117437e-06, + "loss": 0.84872538, + "num_input_tokens_seen": 117064450, + "router_z_loss_clip": 3.1484375, + "router_z_loss_mlp": 0.45288086, + "step": 5451, + "time_per_iteration": 2.655608654022217 + }, + { + "auxiliary_loss_clip": 0.01615307, + "auxiliary_loss_mlp": 0.00537843, + "balance_loss_clip": 1.29357922, + "balance_loss_mlp": 0.4862496, + "epoch": 0.32779197354576883, + "flos": 26391721271040.0, + "grad_norm": 4.527544759907656, + "language_loss": 0.7626878, + "learning_rate": 3.1396526117799555e-06, + "loss": 0.78421932, + "num_input_tokens_seen": 117083060, + "router_z_loss_clip": 3.21679688, + "router_z_loss_mlp": 0.51611328, + "step": 5452, + "time_per_iteration": 2.6630730628967285 + }, + { + "auxiliary_loss_clip": 0.01623486, + "auxiliary_loss_mlp": 0.00525537, + "balance_loss_clip": 1.31421888, + "balance_loss_mlp": 0.47830617, + "epoch": 0.3278520967984368, + "flos": 24899381890560.0, + "grad_norm": 8.802770316652174, + "language_loss": 0.83770895, + "learning_rate": 3.1393325441324256e-06, + "loss": 0.85919917, + "num_input_tokens_seen": 117101860, + "router_z_loss_clip": 3.09570312, + "router_z_loss_mlp": 0.47241211, + "step": 5453, + "time_per_iteration": 2.669976234436035 + }, + { + "auxiliary_loss_clip": 0.01606051, + "auxiliary_loss_mlp": 0.00501477, + "balance_loss_clip": 1.29318941, + "balance_loss_mlp": 0.45727453, + "epoch": 0.32791222005110476, + "flos": 29752996176000.0, + "grad_norm": 31.914984198998788, + "language_loss": 0.82173002, + "learning_rate": 3.1390124332812916e-06, + "loss": 0.84280533, + "num_input_tokens_seen": 117123100, + "router_z_loss_clip": 3.1328125, + "router_z_loss_mlp": 0.44189453, + "step": 5454, + "time_per_iteration": 2.6846940517425537 + }, + { + "auxiliary_loss_clip": 0.01581637, + "auxiliary_loss_mlp": 0.00482101, + "balance_loss_clip": 1.2870152, + "balance_loss_mlp": 0.43885195, + "epoch": 0.32797234330377273, + "flos": 16508064072960.0, + "grad_norm": 106.66614293658785, + "language_loss": 0.82518023, + "learning_rate": 3.1386922792386924e-06, + "loss": 0.84581763, + "num_input_tokens_seen": 117140515, + "router_z_loss_clip": 2.94921875, + "router_z_loss_mlp": 0.43237305, + "step": 5455, + "time_per_iteration": 2.6284332275390625 + }, + { + "auxiliary_loss_clip": 0.01631514, + "auxiliary_loss_mlp": 0.00542512, + "balance_loss_clip": 1.31495082, + "balance_loss_mlp": 0.49380302, + "epoch": 0.3280324665564407, + "flos": 26577918397440.0, + "grad_norm": 433.21396879012076, + "language_loss": 0.79043758, + "learning_rate": 3.138372082016768e-06, + "loss": 0.81217778, + "num_input_tokens_seen": 117161485, + "router_z_loss_clip": 3.16796875, + "router_z_loss_mlp": 0.48681641, + "step": 5456, + "time_per_iteration": 2.6728460788726807 + }, + { + "auxiliary_loss_clip": 0.01626539, + "auxiliary_loss_mlp": 0.00487053, + "balance_loss_clip": 1.31768775, + "balance_loss_mlp": 0.44387555, + "epoch": 0.32809258980910866, + "flos": 22929969047040.0, + "grad_norm": 10.506549283485755, + "language_loss": 0.81995082, + "learning_rate": 3.1380518416276596e-06, + "loss": 0.84108675, + "num_input_tokens_seen": 117181870, + "router_z_loss_clip": 3.09179688, + "router_z_loss_mlp": 0.43164062, + "step": 5457, + "time_per_iteration": 2.6595494747161865 + }, + { + "auxiliary_loss_clip": 0.0162196, + "auxiliary_loss_mlp": 0.00501698, + "balance_loss_clip": 1.30670571, + "balance_loss_mlp": 0.45554, + "epoch": 0.3281527130617766, + "flos": 22783848520320.0, + "grad_norm": 4.34387686736333, + "language_loss": 0.86209273, + "learning_rate": 3.1377315580835115e-06, + "loss": 0.88332927, + "num_input_tokens_seen": 117201380, + "router_z_loss_clip": 3.15234375, + "router_z_loss_mlp": 0.46166992, + "step": 5458, + "time_per_iteration": 2.776191473007202 + }, + { + "auxiliary_loss_clip": 0.01604777, + "auxiliary_loss_mlp": 0.00502655, + "balance_loss_clip": 1.30275309, + "balance_loss_mlp": 0.45892856, + "epoch": 0.3282128363144446, + "flos": 21250678354560.0, + "grad_norm": 54.033404419320306, + "language_loss": 0.77799577, + "learning_rate": 3.1374112313964686e-06, + "loss": 0.79907006, + "num_input_tokens_seen": 117221040, + "router_z_loss_clip": 3.01757812, + "router_z_loss_mlp": 0.43725586, + "step": 5459, + "time_per_iteration": 2.7742886543273926 + }, + { + "auxiliary_loss_clip": 0.01624414, + "auxiliary_loss_mlp": 0.00508892, + "balance_loss_clip": 1.31887293, + "balance_loss_mlp": 0.46189988, + "epoch": 0.32827295956711255, + "flos": 30843064166400.0, + "grad_norm": 2.719904375630614, + "language_loss": 0.89343786, + "learning_rate": 3.1370908615786783e-06, + "loss": 0.91477096, + "num_input_tokens_seen": 117241395, + "router_z_loss_clip": 3.05273438, + "router_z_loss_mlp": 0.47045898, + "step": 5460, + "time_per_iteration": 2.825568199157715 + }, + { + "auxiliary_loss_clip": 0.01628996, + "auxiliary_loss_mlp": 0.00511817, + "balance_loss_clip": 1.31663489, + "balance_loss_mlp": 0.4648723, + "epoch": 0.3283330828197806, + "flos": 25915006944000.0, + "grad_norm": 10.179875674757351, + "language_loss": 0.82017279, + "learning_rate": 3.136770448642288e-06, + "loss": 0.84158099, + "num_input_tokens_seen": 117259340, + "router_z_loss_clip": 3.125, + "router_z_loss_mlp": 0.46972656, + "step": 5461, + "time_per_iteration": 2.778759241104126 + }, + { + "auxiliary_loss_clip": 0.01615331, + "auxiliary_loss_mlp": 0.00526603, + "balance_loss_clip": 1.31076217, + "balance_loss_mlp": 0.4775604, + "epoch": 0.32839320607244854, + "flos": 38582065042560.0, + "grad_norm": 27.576833370170483, + "language_loss": 0.67845285, + "learning_rate": 3.1364499925994484e-06, + "loss": 0.69987226, + "num_input_tokens_seen": 117282375, + "router_z_loss_clip": 3.046875, + "router_z_loss_mlp": 0.49047852, + "step": 5462, + "time_per_iteration": 2.78997540473938 + }, + { + "auxiliary_loss_clip": 0.01618542, + "auxiliary_loss_mlp": 0.00510453, + "balance_loss_clip": 1.32115889, + "balance_loss_mlp": 0.46510607, + "epoch": 0.3284533293251165, + "flos": 26650888876800.0, + "grad_norm": 15.632919898660074, + "language_loss": 0.81877697, + "learning_rate": 3.1361294934623115e-06, + "loss": 0.84006691, + "num_input_tokens_seen": 117303830, + "router_z_loss_clip": 2.9765625, + "router_z_loss_mlp": 0.453125, + "step": 5463, + "time_per_iteration": 4.147435188293457 + }, + { + "auxiliary_loss_clip": 0.01623142, + "auxiliary_loss_mlp": 0.00520211, + "balance_loss_clip": 1.3189919, + "balance_loss_mlp": 0.47288504, + "epoch": 0.32851345257778447, + "flos": 15304158904320.0, + "grad_norm": 8.755544136688252, + "language_loss": 0.75484043, + "learning_rate": 3.1358089512430303e-06, + "loss": 0.77627385, + "num_input_tokens_seen": 117320665, + "router_z_loss_clip": 3.0390625, + "router_z_loss_mlp": 0.47338867, + "step": 5464, + "time_per_iteration": 4.049316167831421 + }, + { + "auxiliary_loss_clip": 0.01600657, + "auxiliary_loss_mlp": 0.00507256, + "balance_loss_clip": 1.30724907, + "balance_loss_mlp": 0.4637689, + "epoch": 0.32857357583045244, + "flos": 23513732881920.0, + "grad_norm": 1.972814613611793, + "language_loss": 0.75301504, + "learning_rate": 3.1354883659537594e-06, + "loss": 0.77409422, + "num_input_tokens_seen": 117339795, + "router_z_loss_clip": 2.93359375, + "router_z_loss_mlp": 0.43505859, + "step": 5465, + "time_per_iteration": 2.643740177154541 + }, + { + "auxiliary_loss_clip": 0.01593662, + "auxiliary_loss_mlp": 0.0049515, + "balance_loss_clip": 1.29574835, + "balance_loss_mlp": 0.44989815, + "epoch": 0.3286336990831204, + "flos": 20995209849600.0, + "grad_norm": 4.186885133424252, + "language_loss": 0.86699128, + "learning_rate": 3.1351677376066567e-06, + "loss": 0.88787943, + "num_input_tokens_seen": 117359525, + "router_z_loss_clip": 2.98046875, + "router_z_loss_mlp": 0.45263672, + "step": 5466, + "time_per_iteration": 2.627324104309082 + }, + { + "auxiliary_loss_clip": 0.01607737, + "auxiliary_loss_mlp": 0.00536943, + "balance_loss_clip": 1.30747724, + "balance_loss_mlp": 0.48804289, + "epoch": 0.32869382233578837, + "flos": 23658811914240.0, + "grad_norm": 7094.063653604693, + "language_loss": 0.83882749, + "learning_rate": 3.134847066213879e-06, + "loss": 0.86027431, + "num_input_tokens_seen": 117380320, + "router_z_loss_clip": 3.00195312, + "router_z_loss_mlp": 0.48876953, + "step": 5467, + "time_per_iteration": 2.654825210571289 + }, + { + "auxiliary_loss_clip": 0.01595777, + "auxiliary_loss_mlp": 0.00515924, + "balance_loss_clip": 1.29259253, + "balance_loss_mlp": 0.46769148, + "epoch": 0.32875394558845633, + "flos": 25336522408320.0, + "grad_norm": 3135.1630590826926, + "language_loss": 0.78245425, + "learning_rate": 3.134526351787587e-06, + "loss": 0.80357122, + "num_input_tokens_seen": 117400695, + "router_z_loss_clip": 3.03125, + "router_z_loss_mlp": 0.48193359, + "step": 5468, + "time_per_iteration": 2.700373888015747 + }, + { + "auxiliary_loss_clip": 0.01646242, + "auxiliary_loss_mlp": 0.00538675, + "balance_loss_clip": 1.32933569, + "balance_loss_mlp": 0.48920363, + "epoch": 0.3288140688411243, + "flos": 14903108576640.0, + "grad_norm": 296.3519248686756, + "language_loss": 0.8575893, + "learning_rate": 3.134205594339942e-06, + "loss": 0.87943852, + "num_input_tokens_seen": 117418800, + "router_z_loss_clip": 3.16796875, + "router_z_loss_mlp": 0.49414062, + "step": 5469, + "time_per_iteration": 4.12693977355957 + }, + { + "auxiliary_loss_clip": 0.01596988, + "auxiliary_loss_mlp": 0.00519534, + "balance_loss_clip": 1.29805183, + "balance_loss_mlp": 0.47211242, + "epoch": 0.32887419209379226, + "flos": 18551345235840.0, + "grad_norm": 3.6421076329059314, + "language_loss": 0.87355548, + "learning_rate": 3.133884793883107e-06, + "loss": 0.89472073, + "num_input_tokens_seen": 117438220, + "router_z_loss_clip": 2.98828125, + "router_z_loss_mlp": 0.47460938, + "step": 5470, + "time_per_iteration": 2.6616621017456055 + }, + { + "auxiliary_loss_clip": 0.01607341, + "auxiliary_loss_mlp": 0.00509271, + "balance_loss_clip": 1.30606508, + "balance_loss_mlp": 0.4622075, + "epoch": 0.3289343153464602, + "flos": 48105610439040.0, + "grad_norm": 135.5674597582698, + "language_loss": 0.74782592, + "learning_rate": 3.1335639504292478e-06, + "loss": 0.76899207, + "num_input_tokens_seen": 117462560, + "router_z_loss_clip": 3.01171875, + "router_z_loss_mlp": 0.47070312, + "step": 5471, + "time_per_iteration": 2.915369987487793 + }, + { + "auxiliary_loss_clip": 0.0162953, + "auxiliary_loss_mlp": 0.0053284, + "balance_loss_clip": 1.31556022, + "balance_loss_mlp": 0.48243839, + "epoch": 0.3289944385991282, + "flos": 27600295207680.0, + "grad_norm": 13.04913683583274, + "language_loss": 0.72062254, + "learning_rate": 3.1332430639905288e-06, + "loss": 0.74224627, + "num_input_tokens_seen": 117483665, + "router_z_loss_clip": 3.140625, + "router_z_loss_mlp": 0.50415039, + "step": 5472, + "time_per_iteration": 2.734987497329712 + }, + { + "auxiliary_loss_clip": 0.01631737, + "auxiliary_loss_mlp": 0.00531692, + "balance_loss_clip": 1.32386041, + "balance_loss_mlp": 0.48176715, + "epoch": 0.32905456185179616, + "flos": 20120318282880.0, + "grad_norm": 50.50330287303695, + "language_loss": 0.94246316, + "learning_rate": 3.13292213457912e-06, + "loss": 0.96409738, + "num_input_tokens_seen": 117503565, + "router_z_loss_clip": 3.08007812, + "router_z_loss_mlp": 0.4987793, + "step": 5473, + "time_per_iteration": 2.6354663372039795 + }, + { + "auxiliary_loss_clip": 0.01631888, + "auxiliary_loss_mlp": 0.00507023, + "balance_loss_clip": 1.32324898, + "balance_loss_mlp": 0.45967346, + "epoch": 0.3291146851044642, + "flos": 23180230080000.0, + "grad_norm": 3.1988385082058888, + "language_loss": 0.84395707, + "learning_rate": 3.1326011622071903e-06, + "loss": 0.86534619, + "num_input_tokens_seen": 117521460, + "router_z_loss_clip": 3.08398438, + "router_z_loss_mlp": 0.47314453, + "step": 5474, + "time_per_iteration": 2.63146710395813 + }, + { + "auxiliary_loss_clip": 0.01636391, + "auxiliary_loss_mlp": 0.00247626, + "balance_loss_clip": 1.46130013, + "balance_loss_mlp": 0.23580031, + "epoch": 0.32917480835713214, + "flos": 67621912594560.0, + "grad_norm": 0.7976136826564614, + "language_loss": 0.59858251, + "learning_rate": 3.132280146886911e-06, + "loss": 0.6174227, + "num_input_tokens_seen": 117580550, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.11816406, + "step": 5475, + "time_per_iteration": 4.548194885253906 + }, + { + "auxiliary_loss_clip": 0.01623055, + "auxiliary_loss_mlp": 0.00568209, + "balance_loss_clip": 1.30919743, + "balance_loss_mlp": 0.5195477, + "epoch": 0.3292349316098001, + "flos": 27964537073280.0, + "grad_norm": 10.289917811681038, + "language_loss": 0.82541609, + "learning_rate": 3.131959088630455e-06, + "loss": 0.84732878, + "num_input_tokens_seen": 117600645, + "router_z_loss_clip": 3.140625, + "router_z_loss_mlp": 0.48681641, + "step": 5476, + "time_per_iteration": 2.6423418521881104 + }, + { + "auxiliary_loss_clip": 0.01630151, + "auxiliary_loss_mlp": 0.00542515, + "balance_loss_clip": 1.3293407, + "balance_loss_mlp": 0.49576131, + "epoch": 0.3292950548624681, + "flos": 20263673462400.0, + "grad_norm": 13.3842024268705, + "language_loss": 0.80286467, + "learning_rate": 3.131637987449997e-06, + "loss": 0.82459128, + "num_input_tokens_seen": 117618880, + "router_z_loss_clip": 3.00976562, + "router_z_loss_mlp": 0.4675293, + "step": 5477, + "time_per_iteration": 2.664680242538452 + }, + { + "auxiliary_loss_clip": 0.01617347, + "auxiliary_loss_mlp": 0.00500041, + "balance_loss_clip": 1.32122719, + "balance_loss_mlp": 0.45500416, + "epoch": 0.32935517811513604, + "flos": 20812999132800.0, + "grad_norm": 162.99385028683753, + "language_loss": 0.84574187, + "learning_rate": 3.131316843357713e-06, + "loss": 0.86691582, + "num_input_tokens_seen": 117636445, + "router_z_loss_clip": 2.96484375, + "router_z_loss_mlp": 0.45043945, + "step": 5478, + "time_per_iteration": 2.8050224781036377 + }, + { + "auxiliary_loss_clip": 0.01590384, + "auxiliary_loss_mlp": 0.0047953, + "balance_loss_clip": 1.29756498, + "balance_loss_mlp": 0.43694851, + "epoch": 0.329415301367804, + "flos": 18441853603200.0, + "grad_norm": 8.126989221524642, + "language_loss": 0.85436523, + "learning_rate": 3.1309956563657807e-06, + "loss": 0.87506437, + "num_input_tokens_seen": 117653105, + "router_z_loss_clip": 2.92578125, + "router_z_loss_mlp": 0.42602539, + "step": 5479, + "time_per_iteration": 2.728492498397827 + }, + { + "auxiliary_loss_clip": 0.01612055, + "auxiliary_loss_mlp": 0.00134671, + "balance_loss_clip": 1.43937945, + "balance_loss_mlp": 0.12646905, + "epoch": 0.32947542462047197, + "flos": 66323024887680.0, + "grad_norm": 0.7406366265279102, + "language_loss": 0.55889201, + "learning_rate": 3.1306744264863804e-06, + "loss": 0.57635927, + "num_input_tokens_seen": 117719225, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.08203125, + "step": 5480, + "time_per_iteration": 3.214444875717163 + }, + { + "auxiliary_loss_clip": 0.01604065, + "auxiliary_loss_mlp": 0.0054641, + "balance_loss_clip": 1.30433571, + "balance_loss_mlp": 0.49920267, + "epoch": 0.32953554787313993, + "flos": 23221599569280.0, + "grad_norm": 3.764826295571361, + "language_loss": 0.8279866, + "learning_rate": 3.1303531537316915e-06, + "loss": 0.84949136, + "num_input_tokens_seen": 117738725, + "router_z_loss_clip": 3.0, + "router_z_loss_mlp": 0.47216797, + "step": 5481, + "time_per_iteration": 2.67130708694458 + }, + { + "auxiliary_loss_clip": 0.01599596, + "auxiliary_loss_mlp": 0.00541607, + "balance_loss_clip": 1.29340029, + "balance_loss_mlp": 0.49211109, + "epoch": 0.3295956711258079, + "flos": 27009492307200.0, + "grad_norm": 8.536843678225932, + "language_loss": 0.8287257, + "learning_rate": 3.130031838113899e-06, + "loss": 0.85013771, + "num_input_tokens_seen": 117757765, + "router_z_loss_clip": 3.0625, + "router_z_loss_mlp": 0.49536133, + "step": 5482, + "time_per_iteration": 2.7413885593414307 + }, + { + "auxiliary_loss_clip": 0.01619527, + "auxiliary_loss_mlp": 0.00521466, + "balance_loss_clip": 1.30755019, + "balance_loss_mlp": 0.47547522, + "epoch": 0.32965579437847586, + "flos": 19171702051200.0, + "grad_norm": 48.81309756320408, + "language_loss": 0.79440475, + "learning_rate": 3.129710479645185e-06, + "loss": 0.81581467, + "num_input_tokens_seen": 117776810, + "router_z_loss_clip": 3.12109375, + "router_z_loss_mlp": 0.4597168, + "step": 5483, + "time_per_iteration": 2.6674184799194336 + }, + { + "auxiliary_loss_clip": 0.01608972, + "auxiliary_loss_mlp": 0.00487906, + "balance_loss_clip": 1.30505037, + "balance_loss_mlp": 0.44582477, + "epoch": 0.32971591763114383, + "flos": 30482521401600.0, + "grad_norm": 10.711384805532505, + "language_loss": 0.80911171, + "learning_rate": 3.1293890783377366e-06, + "loss": 0.83008051, + "num_input_tokens_seen": 117797730, + "router_z_loss_clip": 3.03710938, + "router_z_loss_mlp": 0.4206543, + "step": 5484, + "time_per_iteration": 2.7445266246795654 + }, + { + "auxiliary_loss_clip": 0.01606248, + "auxiliary_loss_mlp": 0.00550867, + "balance_loss_clip": 1.30385447, + "balance_loss_mlp": 0.50065613, + "epoch": 0.3297760408838118, + "flos": 16289583598080.0, + "grad_norm": 11.253346283219042, + "language_loss": 0.7866739, + "learning_rate": 3.129067634203742e-06, + "loss": 0.80824494, + "num_input_tokens_seen": 117815365, + "router_z_loss_clip": 3.02734375, + "router_z_loss_mlp": 0.50170898, + "step": 5485, + "time_per_iteration": 2.6161720752716064 + }, + { + "auxiliary_loss_clip": 0.01571646, + "auxiliary_loss_mlp": 0.00485318, + "balance_loss_clip": 1.28344166, + "balance_loss_mlp": 0.44254529, + "epoch": 0.32983616413647976, + "flos": 29530924341120.0, + "grad_norm": 25.771878319631295, + "language_loss": 0.84877497, + "learning_rate": 3.128746147255388e-06, + "loss": 0.86934459, + "num_input_tokens_seen": 117836095, + "router_z_loss_clip": 2.8828125, + "router_z_loss_mlp": 0.42797852, + "step": 5486, + "time_per_iteration": 2.7425038814544678 + }, + { + "auxiliary_loss_clip": 0.01592957, + "auxiliary_loss_mlp": 0.00512824, + "balance_loss_clip": 1.29898262, + "balance_loss_mlp": 0.46661848, + "epoch": 0.3298962873891478, + "flos": 20631398947200.0, + "grad_norm": 18.344522045575467, + "language_loss": 0.89674795, + "learning_rate": 3.1284246175048683e-06, + "loss": 0.91780573, + "num_input_tokens_seen": 117854655, + "router_z_loss_clip": 2.94335938, + "router_z_loss_mlp": 0.46264648, + "step": 5487, + "time_per_iteration": 2.6852545738220215 + }, + { + "auxiliary_loss_clip": 0.01608036, + "auxiliary_loss_mlp": 0.00521288, + "balance_loss_clip": 1.30626392, + "balance_loss_mlp": 0.47305632, + "epoch": 0.32995641064181574, + "flos": 14976007228800.0, + "grad_norm": 36.49719195961084, + "language_loss": 0.81011891, + "learning_rate": 3.1281030449643735e-06, + "loss": 0.8314122, + "num_input_tokens_seen": 117873300, + "router_z_loss_clip": 3.015625, + "router_z_loss_mlp": 0.48217773, + "step": 5488, + "time_per_iteration": 2.6665873527526855 + }, + { + "auxiliary_loss_clip": 0.01616369, + "auxiliary_loss_mlp": 0.00520748, + "balance_loss_clip": 1.31478834, + "balance_loss_mlp": 0.47428018, + "epoch": 0.3300165338944837, + "flos": 18661447399680.0, + "grad_norm": 45.32937127985059, + "language_loss": 0.77830148, + "learning_rate": 3.127781429646098e-06, + "loss": 0.7996726, + "num_input_tokens_seen": 117891540, + "router_z_loss_clip": 3.01367188, + "router_z_loss_mlp": 0.46435547, + "step": 5489, + "time_per_iteration": 2.6557607650756836 + }, + { + "auxiliary_loss_clip": 0.01594196, + "auxiliary_loss_mlp": 0.00519297, + "balance_loss_clip": 1.29829001, + "balance_loss_mlp": 0.47397316, + "epoch": 0.3300766571471517, + "flos": 25583730785280.0, + "grad_norm": 48.54707219330723, + "language_loss": 0.95111138, + "learning_rate": 3.127459771562238e-06, + "loss": 0.97224629, + "num_input_tokens_seen": 117907690, + "router_z_loss_clip": 2.96289062, + "router_z_loss_mlp": 0.453125, + "step": 5490, + "time_per_iteration": 2.8021819591522217 + }, + { + "auxiliary_loss_clip": 0.01566888, + "auxiliary_loss_mlp": 0.00465372, + "balance_loss_clip": 1.2749846, + "balance_loss_mlp": 0.42386305, + "epoch": 0.33013678039981964, + "flos": 11363501623680.0, + "grad_norm": 10.082812198299813, + "language_loss": 0.88212633, + "learning_rate": 3.1271380707249907e-06, + "loss": 0.90244889, + "num_input_tokens_seen": 117925640, + "router_z_loss_clip": 2.91796875, + "router_z_loss_mlp": 0.4152832, + "step": 5491, + "time_per_iteration": 2.6597111225128174 + }, + { + "auxiliary_loss_clip": 0.01596926, + "auxiliary_loss_mlp": 0.0048842, + "balance_loss_clip": 1.30161297, + "balance_loss_mlp": 0.44319224, + "epoch": 0.3301969036524876, + "flos": 24821203939200.0, + "grad_norm": 9.628339165638607, + "language_loss": 0.81991971, + "learning_rate": 3.126816327146554e-06, + "loss": 0.84077322, + "num_input_tokens_seen": 117944525, + "router_z_loss_clip": 2.95117188, + "router_z_loss_mlp": 0.45263672, + "step": 5492, + "time_per_iteration": 2.6714067459106445 + }, + { + "auxiliary_loss_clip": 0.0160378, + "auxiliary_loss_mlp": 0.00522506, + "balance_loss_clip": 1.30173826, + "balance_loss_mlp": 0.4735114, + "epoch": 0.33025702690515557, + "flos": 15961144613760.0, + "grad_norm": 47.15062331655749, + "language_loss": 0.83253074, + "learning_rate": 3.12649454083913e-06, + "loss": 0.85379362, + "num_input_tokens_seen": 117962515, + "router_z_loss_clip": 3.015625, + "router_z_loss_mlp": 0.48999023, + "step": 5493, + "time_per_iteration": 2.633070230484009 + }, + { + "auxiliary_loss_clip": 0.0160077, + "auxiliary_loss_mlp": 0.00102162, + "balance_loss_clip": 1.41763222, + "balance_loss_mlp": 0.0947707, + "epoch": 0.33031715015782354, + "flos": 59416755989760.0, + "grad_norm": 0.7633667836140986, + "language_loss": 0.53691661, + "learning_rate": 3.12617271181492e-06, + "loss": 0.55394602, + "num_input_tokens_seen": 118018780, + "router_z_loss_clip": 1.828125, + "router_z_loss_mlp": 0.07373047, + "step": 5494, + "time_per_iteration": 3.084963083267212 + }, + { + "auxiliary_loss_clip": 0.01581346, + "auxiliary_loss_mlp": 0.00492441, + "balance_loss_clip": 1.28875721, + "balance_loss_mlp": 0.44950211, + "epoch": 0.3303772734104915, + "flos": 23184360144000.0, + "grad_norm": 18.116930325982445, + "language_loss": 0.91413337, + "learning_rate": 3.1258508400861276e-06, + "loss": 0.9348712, + "num_input_tokens_seen": 118038610, + "router_z_loss_clip": 2.92578125, + "router_z_loss_mlp": 0.42944336, + "step": 5495, + "time_per_iteration": 2.6988515853881836 + }, + { + "auxiliary_loss_clip": 0.01600808, + "auxiliary_loss_mlp": 0.00495981, + "balance_loss_clip": 1.29974318, + "balance_loss_mlp": 0.45087269, + "epoch": 0.33043739666315947, + "flos": 33071896010880.0, + "grad_norm": 30.02817716433816, + "language_loss": 0.78475273, + "learning_rate": 3.1255289256649587e-06, + "loss": 0.80572063, + "num_input_tokens_seen": 118055905, + "router_z_loss_clip": 3.01171875, + "router_z_loss_mlp": 0.45092773, + "step": 5496, + "time_per_iteration": 2.7362167835235596 + }, + { + "auxiliary_loss_clip": 0.0158997, + "auxiliary_loss_mlp": 0.00485261, + "balance_loss_clip": 1.29513121, + "balance_loss_mlp": 0.44158289, + "epoch": 0.33049751991582743, + "flos": 24895431394560.0, + "grad_norm": 2.492586907083643, + "language_loss": 0.7814883, + "learning_rate": 3.1252069685636196e-06, + "loss": 0.80224061, + "num_input_tokens_seen": 118073695, + "router_z_loss_clip": 2.9453125, + "router_z_loss_mlp": 0.43725586, + "step": 5497, + "time_per_iteration": 2.694695472717285 + }, + { + "auxiliary_loss_clip": 0.01564706, + "auxiliary_loss_mlp": 0.00492391, + "balance_loss_clip": 1.2771132, + "balance_loss_mlp": 0.44997615, + "epoch": 0.3305576431684954, + "flos": 29460575554560.0, + "grad_norm": 1469.4356700417886, + "language_loss": 0.86169368, + "learning_rate": 3.124884968794321e-06, + "loss": 0.88226467, + "num_input_tokens_seen": 118094030, + "router_z_loss_clip": 2.87304688, + "router_z_loss_mlp": 0.42407227, + "step": 5498, + "time_per_iteration": 2.7140886783599854 + }, + { + "auxiliary_loss_clip": 0.01583421, + "auxiliary_loss_mlp": 0.00494, + "balance_loss_clip": 1.28590345, + "balance_loss_mlp": 0.44662669, + "epoch": 0.33061776642116336, + "flos": 22632305040000.0, + "grad_norm": 7.507273359954162, + "language_loss": 0.82498419, + "learning_rate": 3.12456292636927e-06, + "loss": 0.84575838, + "num_input_tokens_seen": 118111665, + "router_z_loss_clip": 2.97460938, + "router_z_loss_mlp": 0.47412109, + "step": 5499, + "time_per_iteration": 2.6105291843414307 + }, + { + "auxiliary_loss_clip": 0.01564736, + "auxiliary_loss_mlp": 0.00481426, + "balance_loss_clip": 1.2739867, + "balance_loss_mlp": 0.43822449, + "epoch": 0.3306778896738313, + "flos": 25776320532480.0, + "grad_norm": 1.6427377415211089, + "language_loss": 0.82627648, + "learning_rate": 3.124240841300681e-06, + "loss": 0.8467381, + "num_input_tokens_seen": 118132435, + "router_z_loss_clip": 2.90625, + "router_z_loss_mlp": 0.43212891, + "step": 5500, + "time_per_iteration": 2.6986587047576904 + }, + { + "auxiliary_loss_clip": 0.01599075, + "auxiliary_loss_mlp": 0.00496313, + "balance_loss_clip": 1.30399132, + "balance_loss_mlp": 0.4507277, + "epoch": 0.33073801292649935, + "flos": 36940552479360.0, + "grad_norm": 7.588102521362138, + "language_loss": 0.71682125, + "learning_rate": 3.1239187136007665e-06, + "loss": 0.73777515, + "num_input_tokens_seen": 118155255, + "router_z_loss_clip": 2.953125, + "router_z_loss_mlp": 0.45556641, + "step": 5501, + "time_per_iteration": 2.8177385330200195 + }, + { + "auxiliary_loss_clip": 0.01587088, + "auxiliary_loss_mlp": 0.0049537, + "balance_loss_clip": 1.29485989, + "balance_loss_mlp": 0.44954628, + "epoch": 0.3307981361791673, + "flos": 12967738848000.0, + "grad_norm": 3.853689270145967, + "language_loss": 0.85058421, + "learning_rate": 3.1235965432817417e-06, + "loss": 0.87140876, + "num_input_tokens_seen": 118169865, + "router_z_loss_clip": 2.91992188, + "router_z_loss_mlp": 0.45800781, + "step": 5502, + "time_per_iteration": 2.6845107078552246 + }, + { + "auxiliary_loss_clip": 0.01604459, + "auxiliary_loss_mlp": 0.00527821, + "balance_loss_clip": 1.30696535, + "balance_loss_mlp": 0.47734821, + "epoch": 0.3308582594318353, + "flos": 25374372364800.0, + "grad_norm": 39.65166413517039, + "language_loss": 0.77644861, + "learning_rate": 3.123274330355824e-06, + "loss": 0.79777145, + "num_input_tokens_seen": 118190760, + "router_z_loss_clip": 2.97460938, + "router_z_loss_mlp": 0.50488281, + "step": 5503, + "time_per_iteration": 2.7161052227020264 + }, + { + "auxiliary_loss_clip": 0.0159011, + "auxiliary_loss_mlp": 0.00443941, + "balance_loss_clip": 1.29754448, + "balance_loss_mlp": 0.40484035, + "epoch": 0.33091838268450324, + "flos": 26468570419200.0, + "grad_norm": 6.72807496081418, + "language_loss": 0.79734194, + "learning_rate": 3.12295207483523e-06, + "loss": 0.8176825, + "num_input_tokens_seen": 118213620, + "router_z_loss_clip": 2.921875, + "router_z_loss_mlp": 0.390625, + "step": 5504, + "time_per_iteration": 2.7365355491638184 + }, + { + "auxiliary_loss_clip": 0.0156763, + "auxiliary_loss_mlp": 0.00466792, + "balance_loss_clip": 1.27611661, + "balance_loss_mlp": 0.42637983, + "epoch": 0.3309785059371712, + "flos": 24971167221120.0, + "grad_norm": 24.071980567442708, + "language_loss": 0.76233304, + "learning_rate": 3.1226297767321816e-06, + "loss": 0.78267729, + "num_input_tokens_seen": 118235010, + "router_z_loss_clip": 2.91601562, + "router_z_loss_mlp": 0.40405273, + "step": 5505, + "time_per_iteration": 4.213175058364868 + }, + { + "auxiliary_loss_clip": 0.01568654, + "auxiliary_loss_mlp": 0.00520462, + "balance_loss_clip": 1.27827704, + "balance_loss_mlp": 0.47568724, + "epoch": 0.3310386291898392, + "flos": 20446710192000.0, + "grad_norm": 7.513282307336765, + "language_loss": 0.86778152, + "learning_rate": 3.122307436058899e-06, + "loss": 0.88867265, + "num_input_tokens_seen": 118255820, + "router_z_loss_clip": 2.90625, + "router_z_loss_mlp": 0.44799805, + "step": 5506, + "time_per_iteration": 4.189249515533447 + }, + { + "auxiliary_loss_clip": 0.01602595, + "auxiliary_loss_mlp": 0.00466431, + "balance_loss_clip": 1.30557442, + "balance_loss_mlp": 0.42308614, + "epoch": 0.33109875244250714, + "flos": 23182672204800.0, + "grad_norm": 3.3544234422992614, + "language_loss": 0.8454318, + "learning_rate": 3.121985052827606e-06, + "loss": 0.86612207, + "num_input_tokens_seen": 118274160, + "router_z_loss_clip": 2.97070312, + "router_z_loss_mlp": 0.43383789, + "step": 5507, + "time_per_iteration": 2.665292978286743 + }, + { + "auxiliary_loss_clip": 0.01578458, + "auxiliary_loss_mlp": 0.00526244, + "balance_loss_clip": 1.28769743, + "balance_loss_mlp": 0.48158786, + "epoch": 0.3311588756951751, + "flos": 24168384207360.0, + "grad_norm": 3.737896871285951, + "language_loss": 0.77929831, + "learning_rate": 3.1216626270505274e-06, + "loss": 0.8003453, + "num_input_tokens_seen": 118294385, + "router_z_loss_clip": 2.91015625, + "router_z_loss_mlp": 0.4465332, + "step": 5508, + "time_per_iteration": 2.7712974548339844 + }, + { + "auxiliary_loss_clip": 0.01569202, + "auxiliary_loss_mlp": 0.0046803, + "balance_loss_clip": 1.28664875, + "balance_loss_mlp": 0.42802307, + "epoch": 0.33121899894784307, + "flos": 28145742209280.0, + "grad_norm": 5.375295266430183, + "language_loss": 0.76988304, + "learning_rate": 3.12134015873989e-06, + "loss": 0.79025543, + "num_input_tokens_seen": 118313105, + "router_z_loss_clip": 2.83007812, + "router_z_loss_mlp": 0.39990234, + "step": 5509, + "time_per_iteration": 2.7654919624328613 + }, + { + "auxiliary_loss_clip": 0.01595682, + "auxiliary_loss_mlp": 0.00494421, + "balance_loss_clip": 1.30281687, + "balance_loss_mlp": 0.45176768, + "epoch": 0.33127912220051103, + "flos": 29567660976000.0, + "grad_norm": 8.225858482513216, + "language_loss": 0.79477137, + "learning_rate": 3.121017647907921e-06, + "loss": 0.8156724, + "num_input_tokens_seen": 118335250, + "router_z_loss_clip": 2.9296875, + "router_z_loss_mlp": 0.42626953, + "step": 5510, + "time_per_iteration": 2.755457639694214 + }, + { + "auxiliary_loss_clip": 0.01570833, + "auxiliary_loss_mlp": 0.00481457, + "balance_loss_clip": 1.28485131, + "balance_loss_mlp": 0.43954289, + "epoch": 0.331339245453179, + "flos": 14428836374400.0, + "grad_norm": 974.2229342526064, + "language_loss": 0.94503003, + "learning_rate": 3.1206950945668508e-06, + "loss": 0.96555293, + "num_input_tokens_seen": 118351470, + "router_z_loss_clip": 2.859375, + "router_z_loss_mlp": 0.41894531, + "step": 5511, + "time_per_iteration": 4.104701995849609 + }, + { + "auxiliary_loss_clip": 0.01551707, + "auxiliary_loss_mlp": 0.00474564, + "balance_loss_clip": 1.27371931, + "balance_loss_mlp": 0.43436646, + "epoch": 0.33139936870584696, + "flos": 20887118847360.0, + "grad_norm": 11.219581751951605, + "language_loss": 0.78327572, + "learning_rate": 3.12037249872891e-06, + "loss": 0.80353844, + "num_input_tokens_seen": 118370970, + "router_z_loss_clip": 2.77929688, + "router_z_loss_mlp": 0.40161133, + "step": 5512, + "time_per_iteration": 2.650683879852295 + }, + { + "auxiliary_loss_clip": 0.0157338, + "auxiliary_loss_mlp": 0.0048861, + "balance_loss_clip": 1.28579867, + "balance_loss_mlp": 0.4497481, + "epoch": 0.33145949195851493, + "flos": 36284356869120.0, + "grad_norm": 48.40613141337527, + "language_loss": 0.77676034, + "learning_rate": 3.1200498604063317e-06, + "loss": 0.79738021, + "num_input_tokens_seen": 118393125, + "router_z_loss_clip": 2.87304688, + "router_z_loss_mlp": 0.38891602, + "step": 5513, + "time_per_iteration": 2.7655158042907715 + }, + { + "auxiliary_loss_clip": 0.01569343, + "auxiliary_loss_mlp": 0.00514148, + "balance_loss_clip": 1.27710509, + "balance_loss_mlp": 0.46853822, + "epoch": 0.33151961521118295, + "flos": 14279735018880.0, + "grad_norm": 15.249702380315565, + "language_loss": 0.76081777, + "learning_rate": 3.1197271796113507e-06, + "loss": 0.78165263, + "num_input_tokens_seen": 118410860, + "router_z_loss_clip": 2.92578125, + "router_z_loss_mlp": 0.45654297, + "step": 5514, + "time_per_iteration": 2.6223273277282715 + }, + { + "auxiliary_loss_clip": 0.01604628, + "auxiliary_loss_mlp": 0.0053409, + "balance_loss_clip": 1.30755317, + "balance_loss_mlp": 0.48895711, + "epoch": 0.3315797384638509, + "flos": 20774323163520.0, + "grad_norm": 18.75448424433654, + "language_loss": 0.73163176, + "learning_rate": 3.1194044563562026e-06, + "loss": 0.75301892, + "num_input_tokens_seen": 118429570, + "router_z_loss_clip": 2.96679688, + "router_z_loss_mlp": 0.45141602, + "step": 5515, + "time_per_iteration": 2.6470863819122314 + }, + { + "auxiliary_loss_clip": 0.01576537, + "auxiliary_loss_mlp": 0.00530322, + "balance_loss_clip": 1.28342676, + "balance_loss_mlp": 0.48774058, + "epoch": 0.3316398617165189, + "flos": 24679464871680.0, + "grad_norm": 16.28488196648176, + "language_loss": 0.73499846, + "learning_rate": 3.1190816906531257e-06, + "loss": 0.75606704, + "num_input_tokens_seen": 118450285, + "router_z_loss_clip": 2.93359375, + "router_z_loss_mlp": 0.42578125, + "step": 5516, + "time_per_iteration": 2.6702098846435547 + }, + { + "auxiliary_loss_clip": 0.01575217, + "auxiliary_loss_mlp": 0.00546743, + "balance_loss_clip": 1.27666104, + "balance_loss_mlp": 0.49941662, + "epoch": 0.33169998496918685, + "flos": 18587974129920.0, + "grad_norm": 22.676130120796145, + "language_loss": 0.87242699, + "learning_rate": 3.118758882514359e-06, + "loss": 0.8936466, + "num_input_tokens_seen": 118468270, + "router_z_loss_clip": 2.984375, + "router_z_loss_mlp": 0.47363281, + "step": 5517, + "time_per_iteration": 4.007158041000366 + }, + { + "auxiliary_loss_clip": 0.01570445, + "auxiliary_loss_mlp": 0.00514952, + "balance_loss_clip": 1.28502011, + "balance_loss_mlp": 0.47337186, + "epoch": 0.3317601082218548, + "flos": 20193647898240.0, + "grad_norm": 19.545757000585017, + "language_loss": 0.79362309, + "learning_rate": 3.118436031952143e-06, + "loss": 0.81447709, + "num_input_tokens_seen": 118486615, + "router_z_loss_clip": 2.85546875, + "router_z_loss_mlp": 0.41577148, + "step": 5518, + "time_per_iteration": 2.6238152980804443 + }, + { + "auxiliary_loss_clip": 0.01540833, + "auxiliary_loss_mlp": 0.00160397, + "balance_loss_clip": 1.34395957, + "balance_loss_mlp": 0.15162329, + "epoch": 0.3318202314745228, + "flos": 68974703637120.0, + "grad_norm": 0.6019499570280913, + "language_loss": 0.53956068, + "learning_rate": 3.1181131389787206e-06, + "loss": 0.55657303, + "num_input_tokens_seen": 118553580, + "router_z_loss_clip": 1.96875, + "router_z_loss_mlp": 0.08789062, + "step": 5519, + "time_per_iteration": 3.2296977043151855 + }, + { + "auxiliary_loss_clip": 0.01573207, + "auxiliary_loss_mlp": 0.00546561, + "balance_loss_clip": 1.28171039, + "balance_loss_mlp": 0.50169069, + "epoch": 0.33188035472719074, + "flos": 21500113374720.0, + "grad_norm": 31.750921576731162, + "language_loss": 0.84345376, + "learning_rate": 3.117790203606336e-06, + "loss": 0.86465144, + "num_input_tokens_seen": 118570280, + "router_z_loss_clip": 2.9140625, + "router_z_loss_mlp": 0.44873047, + "step": 5520, + "time_per_iteration": 2.667644500732422 + }, + { + "auxiliary_loss_clip": 0.01573939, + "auxiliary_loss_mlp": 0.00493025, + "balance_loss_clip": 1.28422642, + "balance_loss_mlp": 0.45065793, + "epoch": 0.3319404779798587, + "flos": 28870490926080.0, + "grad_norm": 6.498715236426103, + "language_loss": 0.80940819, + "learning_rate": 3.1174672258472344e-06, + "loss": 0.83007777, + "num_input_tokens_seen": 118590455, + "router_z_loss_clip": 2.90039062, + "router_z_loss_mlp": 0.42407227, + "step": 5521, + "time_per_iteration": 2.739652633666992 + }, + { + "auxiliary_loss_clip": 0.01578892, + "auxiliary_loss_mlp": 0.00537682, + "balance_loss_clip": 1.28021741, + "balance_loss_mlp": 0.49123853, + "epoch": 0.33200060123252667, + "flos": 23076915586560.0, + "grad_norm": 127.35680076721854, + "language_loss": 0.77643085, + "learning_rate": 3.117144205713664e-06, + "loss": 0.79759657, + "num_input_tokens_seen": 118609495, + "router_z_loss_clip": 2.984375, + "router_z_loss_mlp": 0.46386719, + "step": 5522, + "time_per_iteration": 2.7655816078186035 + }, + { + "auxiliary_loss_clip": 0.01572213, + "auxiliary_loss_mlp": 0.004873, + "balance_loss_clip": 1.28385282, + "balance_loss_mlp": 0.44664928, + "epoch": 0.33206072448519464, + "flos": 21142479611520.0, + "grad_norm": 12.143204836526795, + "language_loss": 0.80655718, + "learning_rate": 3.1168211432178735e-06, + "loss": 0.82715225, + "num_input_tokens_seen": 118628720, + "router_z_loss_clip": 2.88085938, + "router_z_loss_mlp": 0.40649414, + "step": 5523, + "time_per_iteration": 2.737348794937134 + }, + { + "auxiliary_loss_clip": 0.01580682, + "auxiliary_loss_mlp": 0.00490841, + "balance_loss_clip": 1.28779769, + "balance_loss_mlp": 0.44942731, + "epoch": 0.3321208477378626, + "flos": 13079097987840.0, + "grad_norm": 17.70192336304605, + "language_loss": 0.86335361, + "learning_rate": 3.116498038372114e-06, + "loss": 0.88406885, + "num_input_tokens_seen": 118645955, + "router_z_loss_clip": 2.93164062, + "router_z_loss_mlp": 0.41430664, + "step": 5524, + "time_per_iteration": 2.726077079772949 + }, + { + "auxiliary_loss_clip": 0.01578194, + "auxiliary_loss_mlp": 0.00497624, + "balance_loss_clip": 1.28742075, + "balance_loss_mlp": 0.45821327, + "epoch": 0.33218097099053057, + "flos": 21215414177280.0, + "grad_norm": 3.823614868994269, + "language_loss": 0.89823604, + "learning_rate": 3.116174891188636e-06, + "loss": 0.91899425, + "num_input_tokens_seen": 118665605, + "router_z_loss_clip": 2.90429688, + "router_z_loss_mlp": 0.39379883, + "step": 5525, + "time_per_iteration": 2.6567418575286865 + }, + { + "auxiliary_loss_clip": 0.01558659, + "auxiliary_loss_mlp": 0.00157156, + "balance_loss_clip": 1.35221314, + "balance_loss_mlp": 0.148764, + "epoch": 0.33224109424319853, + "flos": 64348979189760.0, + "grad_norm": 0.7521071899607129, + "language_loss": 0.51975727, + "learning_rate": 3.1158517016796945e-06, + "loss": 0.53691542, + "num_input_tokens_seen": 118728155, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.08398438, + "step": 5526, + "time_per_iteration": 3.1209027767181396 + }, + { + "auxiliary_loss_clip": 0.0159959, + "auxiliary_loss_mlp": 0.00544269, + "balance_loss_clip": 1.29486334, + "balance_loss_mlp": 0.49777743, + "epoch": 0.33230121749586655, + "flos": 17346003523200.0, + "grad_norm": 44.16976911131203, + "language_loss": 0.84170341, + "learning_rate": 3.1155284698575445e-06, + "loss": 0.86314201, + "num_input_tokens_seen": 118743955, + "router_z_loss_clip": 3.04882812, + "router_z_loss_mlp": 0.46484375, + "step": 5527, + "time_per_iteration": 2.651883602142334 + }, + { + "auxiliary_loss_clip": 0.01580472, + "auxiliary_loss_mlp": 0.00530901, + "balance_loss_clip": 1.29042125, + "balance_loss_mlp": 0.48901063, + "epoch": 0.3323613407485345, + "flos": 20997041443200.0, + "grad_norm": 5.064860664836912, + "language_loss": 0.78500509, + "learning_rate": 3.1152051957344434e-06, + "loss": 0.80611879, + "num_input_tokens_seen": 118763275, + "router_z_loss_clip": 2.90429688, + "router_z_loss_mlp": 0.41918945, + "step": 5528, + "time_per_iteration": 2.676032304763794 + }, + { + "auxiliary_loss_clip": 0.01575552, + "auxiliary_loss_mlp": 0.00511862, + "balance_loss_clip": 1.28438246, + "balance_loss_mlp": 0.46944773, + "epoch": 0.3324214640012025, + "flos": 13152535344000.0, + "grad_norm": 3.8044523033866198, + "language_loss": 0.88724518, + "learning_rate": 3.1148818793226497e-06, + "loss": 0.90811932, + "num_input_tokens_seen": 118781110, + "router_z_loss_clip": 2.9140625, + "router_z_loss_mlp": 0.42407227, + "step": 5529, + "time_per_iteration": 2.650164842605591 + }, + { + "auxiliary_loss_clip": 0.01604846, + "auxiliary_loss_mlp": 0.0053227, + "balance_loss_clip": 1.30283022, + "balance_loss_mlp": 0.48809063, + "epoch": 0.33248158725387045, + "flos": 22273522041600.0, + "grad_norm": 15.485608236127607, + "language_loss": 0.76649082, + "learning_rate": 3.114558520634423e-06, + "loss": 0.787862, + "num_input_tokens_seen": 118800620, + "router_z_loss_clip": 3.02148438, + "router_z_loss_mlp": 0.44213867, + "step": 5530, + "time_per_iteration": 2.737518072128296 + }, + { + "auxiliary_loss_clip": 0.01591692, + "auxiliary_loss_mlp": 0.00517289, + "balance_loss_clip": 1.28805137, + "balance_loss_mlp": 0.47644785, + "epoch": 0.3325417105065384, + "flos": 20740998320640.0, + "grad_norm": 6.311591444167064, + "language_loss": 0.82121098, + "learning_rate": 3.1142351196820256e-06, + "loss": 0.84230077, + "num_input_tokens_seen": 118818725, + "router_z_loss_clip": 3.03710938, + "router_z_loss_mlp": 0.40869141, + "step": 5531, + "time_per_iteration": 2.6790318489074707 + }, + { + "auxiliary_loss_clip": 0.01600055, + "auxiliary_loss_mlp": 0.00539687, + "balance_loss_clip": 1.30024648, + "balance_loss_mlp": 0.49467313, + "epoch": 0.3326018337592064, + "flos": 24790536702720.0, + "grad_norm": 69.33304113066231, + "language_loss": 0.78527749, + "learning_rate": 3.1139116764777206e-06, + "loss": 0.80667484, + "num_input_tokens_seen": 118839390, + "router_z_loss_clip": 3.0, + "router_z_loss_mlp": 0.45019531, + "step": 5532, + "time_per_iteration": 2.6913888454437256 + }, + { + "auxiliary_loss_clip": 0.01584453, + "auxiliary_loss_mlp": 0.00502952, + "balance_loss_clip": 1.28716898, + "balance_loss_mlp": 0.45758128, + "epoch": 0.33266195701187434, + "flos": 14501699112960.0, + "grad_norm": 14.834023333039529, + "language_loss": 0.73751616, + "learning_rate": 3.1135881910337735e-06, + "loss": 0.75839019, + "num_input_tokens_seen": 118856275, + "router_z_loss_clip": 2.97460938, + "router_z_loss_mlp": 0.45336914, + "step": 5533, + "time_per_iteration": 2.7271888256073 + }, + { + "auxiliary_loss_clip": 0.01566034, + "auxiliary_loss_mlp": 0.00500704, + "balance_loss_clip": 1.27180243, + "balance_loss_mlp": 0.45795509, + "epoch": 0.3327220802645423, + "flos": 15304410299520.0, + "grad_norm": 6.245691370170473, + "language_loss": 0.77020419, + "learning_rate": 3.113264663362451e-06, + "loss": 0.79087162, + "num_input_tokens_seen": 118873830, + "router_z_loss_clip": 2.94140625, + "router_z_loss_mlp": 0.42724609, + "step": 5534, + "time_per_iteration": 2.698463201522827 + }, + { + "auxiliary_loss_clip": 0.01583257, + "auxiliary_loss_mlp": 0.00528854, + "balance_loss_clip": 1.28748798, + "balance_loss_mlp": 0.48541397, + "epoch": 0.3327822035172103, + "flos": 23477534951040.0, + "grad_norm": 70.97305995391447, + "language_loss": 0.7228334, + "learning_rate": 3.1129410934760204e-06, + "loss": 0.74395448, + "num_input_tokens_seen": 118891560, + "router_z_loss_clip": 2.95703125, + "router_z_loss_mlp": 0.43408203, + "step": 5535, + "time_per_iteration": 2.692518711090088 + }, + { + "auxiliary_loss_clip": 0.01571523, + "auxiliary_loss_mlp": 0.00547771, + "balance_loss_clip": 1.27563143, + "balance_loss_mlp": 0.5010649, + "epoch": 0.33284232676987824, + "flos": 25374516019200.0, + "grad_norm": 28.395863270228844, + "language_loss": 0.779576, + "learning_rate": 3.1126174813867517e-06, + "loss": 0.80076897, + "num_input_tokens_seen": 118910260, + "router_z_loss_clip": 2.9609375, + "router_z_loss_mlp": 0.46679688, + "step": 5536, + "time_per_iteration": 2.660176992416382 + }, + { + "auxiliary_loss_clip": 0.01574601, + "auxiliary_loss_mlp": 0.005375, + "balance_loss_clip": 1.27547586, + "balance_loss_mlp": 0.4915086, + "epoch": 0.3329024500225462, + "flos": 23694363400320.0, + "grad_norm": 14.569887290546781, + "language_loss": 0.87195206, + "learning_rate": 3.112293827106917e-06, + "loss": 0.89307308, + "num_input_tokens_seen": 118929985, + "router_z_loss_clip": 2.9921875, + "router_z_loss_mlp": 0.46020508, + "step": 5537, + "time_per_iteration": 2.6532106399536133 + }, + { + "auxiliary_loss_clip": 0.01585102, + "auxiliary_loss_mlp": 0.00511739, + "balance_loss_clip": 1.28471637, + "balance_loss_mlp": 0.46708292, + "epoch": 0.33296257327521417, + "flos": 31723163205120.0, + "grad_norm": 5.754768013059788, + "language_loss": 0.77808118, + "learning_rate": 3.111970130648789e-06, + "loss": 0.7990495, + "num_input_tokens_seen": 118951355, + "router_z_loss_clip": 3.00585938, + "router_z_loss_mlp": 0.44677734, + "step": 5538, + "time_per_iteration": 2.7051334381103516 + }, + { + "auxiliary_loss_clip": 0.01540243, + "auxiliary_loss_mlp": 0.00476653, + "balance_loss_clip": 1.25436044, + "balance_loss_mlp": 0.43540639, + "epoch": 0.33302269652788213, + "flos": 22744705674240.0, + "grad_norm": 28.202062898663343, + "language_loss": 0.80048895, + "learning_rate": 3.1116463920246424e-06, + "loss": 0.82065797, + "num_input_tokens_seen": 118970910, + "router_z_loss_clip": 2.85742188, + "router_z_loss_mlp": 0.4128418, + "step": 5539, + "time_per_iteration": 2.6633849143981934 + }, + { + "auxiliary_loss_clip": 0.01577545, + "auxiliary_loss_mlp": 0.00515418, + "balance_loss_clip": 1.27594006, + "balance_loss_mlp": 0.4706912, + "epoch": 0.33308281978055015, + "flos": 11473747441920.0, + "grad_norm": 5.322936735697194, + "language_loss": 0.7732451, + "learning_rate": 3.1113226112467527e-06, + "loss": 0.79417473, + "num_input_tokens_seen": 118989200, + "router_z_loss_clip": 3.015625, + "router_z_loss_mlp": 0.44750977, + "step": 5540, + "time_per_iteration": 2.63276743888855 + }, + { + "auxiliary_loss_clip": 0.01554375, + "auxiliary_loss_mlp": 0.00527425, + "balance_loss_clip": 1.26103187, + "balance_loss_mlp": 0.48355561, + "epoch": 0.3331429430332181, + "flos": 38213693112960.0, + "grad_norm": 2.7294732945503584, + "language_loss": 0.65095437, + "learning_rate": 3.1109987883273983e-06, + "loss": 0.67177236, + "num_input_tokens_seen": 119011030, + "router_z_loss_clip": 2.93359375, + "router_z_loss_mlp": 0.4387207, + "step": 5541, + "time_per_iteration": 2.765655279159546 + }, + { + "auxiliary_loss_clip": 0.01534911, + "auxiliary_loss_mlp": 0.00498972, + "balance_loss_clip": 1.24259782, + "balance_loss_mlp": 0.45684344, + "epoch": 0.3332030662858861, + "flos": 22528667324160.0, + "grad_norm": 2.674965133707393, + "language_loss": 0.75876474, + "learning_rate": 3.1106749232788584e-06, + "loss": 0.77910358, + "num_input_tokens_seen": 119030620, + "router_z_loss_clip": 2.921875, + "router_z_loss_mlp": 0.42138672, + "step": 5542, + "time_per_iteration": 2.650214195251465 + }, + { + "auxiliary_loss_clip": 0.01544128, + "auxiliary_loss_mlp": 0.00535913, + "balance_loss_clip": 1.24934721, + "balance_loss_mlp": 0.49075603, + "epoch": 0.33326318953855405, + "flos": 15997773507840.0, + "grad_norm": 38.439029142757065, + "language_loss": 0.7938571, + "learning_rate": 3.110351016113414e-06, + "loss": 0.81465745, + "num_input_tokens_seen": 119048015, + "router_z_loss_clip": 2.94726562, + "router_z_loss_mlp": 0.45166016, + "step": 5543, + "time_per_iteration": 2.6148736476898193 + }, + { + "auxiliary_loss_clip": 0.01561131, + "auxiliary_loss_mlp": 0.00517572, + "balance_loss_clip": 1.26374698, + "balance_loss_mlp": 0.47139078, + "epoch": 0.333323312791222, + "flos": 25593535198080.0, + "grad_norm": 46.708099611856134, + "language_loss": 0.80759609, + "learning_rate": 3.110027066843348e-06, + "loss": 0.82838321, + "num_input_tokens_seen": 119066280, + "router_z_loss_clip": 2.97460938, + "router_z_loss_mlp": 0.46142578, + "step": 5544, + "time_per_iteration": 2.70951247215271 + }, + { + "auxiliary_loss_clip": 0.01536038, + "auxiliary_loss_mlp": 0.00568343, + "balance_loss_clip": 1.24336219, + "balance_loss_mlp": 0.52115953, + "epoch": 0.33338343604389, + "flos": 25119550304640.0, + "grad_norm": 52.51805925828019, + "language_loss": 0.76017356, + "learning_rate": 3.1097030754809456e-06, + "loss": 0.78121734, + "num_input_tokens_seen": 119087680, + "router_z_loss_clip": 2.9296875, + "router_z_loss_mlp": 0.47167969, + "step": 5545, + "time_per_iteration": 2.732720375061035 + }, + { + "auxiliary_loss_clip": 0.01545581, + "auxiliary_loss_mlp": 0.0051878, + "balance_loss_clip": 1.25494313, + "balance_loss_mlp": 0.47393349, + "epoch": 0.33344355929655795, + "flos": 16947287579520.0, + "grad_norm": 6.875822240436938, + "language_loss": 0.74610537, + "learning_rate": 3.1093790420384894e-06, + "loss": 0.76674896, + "num_input_tokens_seen": 119105820, + "router_z_loss_clip": 2.90625, + "router_z_loss_mlp": 0.44824219, + "step": 5546, + "time_per_iteration": 2.6604368686676025 + }, + { + "auxiliary_loss_clip": 0.01520167, + "auxiliary_loss_mlp": 0.00562627, + "balance_loss_clip": 1.22705519, + "balance_loss_mlp": 0.51568288, + "epoch": 0.3335036825492259, + "flos": 27889591345920.0, + "grad_norm": 133.25581305137445, + "language_loss": 0.69674468, + "learning_rate": 3.1090549665282702e-06, + "loss": 0.71757263, + "num_input_tokens_seen": 119126630, + "router_z_loss_clip": 2.93359375, + "router_z_loss_mlp": 0.46948242, + "step": 5547, + "time_per_iteration": 4.260615348815918 + }, + { + "auxiliary_loss_clip": 0.01550578, + "auxiliary_loss_mlp": 0.00526865, + "balance_loss_clip": 1.25759506, + "balance_loss_mlp": 0.48478419, + "epoch": 0.3335638058018939, + "flos": 16179553261440.0, + "grad_norm": 19.687291209613875, + "language_loss": 0.91841704, + "learning_rate": 3.1087308489625742e-06, + "loss": 0.93919146, + "num_input_tokens_seen": 119143375, + "router_z_loss_clip": 2.92773438, + "router_z_loss_mlp": 0.42089844, + "step": 5548, + "time_per_iteration": 4.02365255355835 + }, + { + "auxiliary_loss_clip": 0.01541779, + "auxiliary_loss_mlp": 0.00503082, + "balance_loss_clip": 1.24198031, + "balance_loss_mlp": 0.4568764, + "epoch": 0.33362392905456184, + "flos": 39896108288640.0, + "grad_norm": 7.318126467127114, + "language_loss": 0.7980876, + "learning_rate": 3.1084066893536945e-06, + "loss": 0.81853622, + "num_input_tokens_seen": 119166450, + "router_z_loss_clip": 2.99609375, + "router_z_loss_mlp": 0.46191406, + "step": 5549, + "time_per_iteration": 2.841622829437256 + }, + { + "auxiliary_loss_clip": 0.01532116, + "auxiliary_loss_mlp": 0.00497906, + "balance_loss_clip": 1.23636389, + "balance_loss_mlp": 0.45515773, + "epoch": 0.3336840523072298, + "flos": 44271212567040.0, + "grad_norm": 13.162228822589661, + "language_loss": 0.73314977, + "learning_rate": 3.108082487713921e-06, + "loss": 0.75345004, + "num_input_tokens_seen": 119189645, + "router_z_loss_clip": 2.95898438, + "router_z_loss_mlp": 0.42773438, + "step": 5550, + "time_per_iteration": 2.9085965156555176 + }, + { + "auxiliary_loss_clip": 0.01546138, + "auxiliary_loss_mlp": 0.00510713, + "balance_loss_clip": 1.24776793, + "balance_loss_mlp": 0.46841744, + "epoch": 0.33374417555989777, + "flos": 15085678429440.0, + "grad_norm": 6.905128605925467, + "language_loss": 0.6645838, + "learning_rate": 3.1077582440555495e-06, + "loss": 0.68515229, + "num_input_tokens_seen": 119208045, + "router_z_loss_clip": 2.98046875, + "router_z_loss_mlp": 0.42285156, + "step": 5551, + "time_per_iteration": 2.6311562061309814 + }, + { + "auxiliary_loss_clip": 0.01525882, + "auxiliary_loss_mlp": 0.00516762, + "balance_loss_clip": 1.23643744, + "balance_loss_mlp": 0.47201061, + "epoch": 0.33380429881256574, + "flos": 15849174942720.0, + "grad_norm": 5.167904644467356, + "language_loss": 0.75524777, + "learning_rate": 3.1074339583908746e-06, + "loss": 0.77567416, + "num_input_tokens_seen": 119224910, + "router_z_loss_clip": 2.89453125, + "router_z_loss_mlp": 0.44750977, + "step": 5552, + "time_per_iteration": 2.797231435775757 + }, + { + "auxiliary_loss_clip": 0.01513597, + "auxiliary_loss_mlp": 0.00493994, + "balance_loss_clip": 1.22580624, + "balance_loss_mlp": 0.45181811, + "epoch": 0.33386442206523376, + "flos": 13480327883520.0, + "grad_norm": 13.274758104406505, + "language_loss": 0.8980478, + "learning_rate": 3.107109630732192e-06, + "loss": 0.91812372, + "num_input_tokens_seen": 119243290, + "router_z_loss_clip": 2.87695312, + "router_z_loss_mlp": 0.42211914, + "step": 5553, + "time_per_iteration": 4.007889270782471 + }, + { + "auxiliary_loss_clip": 0.01523478, + "auxiliary_loss_mlp": 0.00471147, + "balance_loss_clip": 1.23198164, + "balance_loss_mlp": 0.43023473, + "epoch": 0.3339245453179017, + "flos": 16690669839360.0, + "grad_norm": 25.027656995856034, + "language_loss": 0.87479091, + "learning_rate": 3.1067852610918017e-06, + "loss": 0.89473712, + "num_input_tokens_seen": 119261195, + "router_z_loss_clip": 2.91210938, + "router_z_loss_mlp": 0.40942383, + "step": 5554, + "time_per_iteration": 2.7050328254699707 + }, + { + "auxiliary_loss_clip": 0.01530255, + "auxiliary_loss_mlp": 0.00474902, + "balance_loss_clip": 1.23294389, + "balance_loss_mlp": 0.43246388, + "epoch": 0.3339846685705697, + "flos": 24610624456320.0, + "grad_norm": 20.86485701924114, + "language_loss": 0.87275565, + "learning_rate": 3.1064608494820032e-06, + "loss": 0.89280719, + "num_input_tokens_seen": 119282845, + "router_z_loss_clip": 2.97460938, + "router_z_loss_mlp": 0.42456055, + "step": 5555, + "time_per_iteration": 2.7094626426696777 + }, + { + "auxiliary_loss_clip": 0.01536364, + "auxiliary_loss_mlp": 0.0049315, + "balance_loss_clip": 1.24181604, + "balance_loss_mlp": 0.44973415, + "epoch": 0.33404479182323765, + "flos": 30953812775040.0, + "grad_norm": 11.884020416932232, + "language_loss": 0.79386497, + "learning_rate": 3.106136395915099e-06, + "loss": 0.81416011, + "num_input_tokens_seen": 119304430, + "router_z_loss_clip": 2.9453125, + "router_z_loss_mlp": 0.43457031, + "step": 5556, + "time_per_iteration": 2.732884407043457 + }, + { + "auxiliary_loss_clip": 0.01527082, + "auxiliary_loss_mlp": 0.00466273, + "balance_loss_clip": 1.23810136, + "balance_loss_mlp": 0.42559928, + "epoch": 0.3341049150759056, + "flos": 23513301918720.0, + "grad_norm": 5.634430592156986, + "language_loss": 0.86651528, + "learning_rate": 3.105811900403391e-06, + "loss": 0.88644886, + "num_input_tokens_seen": 119323830, + "router_z_loss_clip": 2.88671875, + "router_z_loss_mlp": 0.40649414, + "step": 5557, + "time_per_iteration": 2.698448896408081 + }, + { + "auxiliary_loss_clip": 0.01531305, + "auxiliary_loss_mlp": 0.00426581, + "balance_loss_clip": 1.23852038, + "balance_loss_mlp": 0.38960272, + "epoch": 0.3341650383285736, + "flos": 24026824707840.0, + "grad_norm": 352.5943687907297, + "language_loss": 0.8342644, + "learning_rate": 3.1054873629591855e-06, + "loss": 0.85384321, + "num_input_tokens_seen": 119346340, + "router_z_loss_clip": 2.92773438, + "router_z_loss_mlp": 0.36962891, + "step": 5558, + "time_per_iteration": 2.825575828552246 + }, + { + "auxiliary_loss_clip": 0.01510948, + "auxiliary_loss_mlp": 0.00432179, + "balance_loss_clip": 1.21573627, + "balance_loss_mlp": 0.39341214, + "epoch": 0.33422516158124155, + "flos": 24901967669760.0, + "grad_norm": 2.1941763082298333, + "language_loss": 0.87150031, + "learning_rate": 3.105162783594788e-06, + "loss": 0.89093161, + "num_input_tokens_seen": 119367285, + "router_z_loss_clip": 2.95117188, + "router_z_loss_mlp": 0.38793945, + "step": 5559, + "time_per_iteration": 4.196875095367432 + }, + { + "auxiliary_loss_clip": 0.01496717, + "auxiliary_loss_mlp": 0.00392367, + "balance_loss_clip": 1.21420026, + "balance_loss_mlp": 0.35724795, + "epoch": 0.3342852848339095, + "flos": 18333403464960.0, + "grad_norm": 6.05266729596821, + "language_loss": 0.76116145, + "learning_rate": 3.1048381623225074e-06, + "loss": 0.7800523, + "num_input_tokens_seen": 119385370, + "router_z_loss_clip": 2.82617188, + "router_z_loss_mlp": 0.35107422, + "step": 5560, + "time_per_iteration": 2.646061420440674 + }, + { + "auxiliary_loss_clip": 0.01508851, + "auxiliary_loss_mlp": 0.00423967, + "balance_loss_clip": 1.21763921, + "balance_loss_mlp": 0.38534313, + "epoch": 0.3343454080865775, + "flos": 30046530119040.0, + "grad_norm": 4.716856440101298, + "language_loss": 0.80786216, + "learning_rate": 3.1045134991546526e-06, + "loss": 0.82719034, + "num_input_tokens_seen": 119409150, + "router_z_loss_clip": 2.91210938, + "router_z_loss_mlp": 0.38623047, + "step": 5561, + "time_per_iteration": 2.7490670680999756 + }, + { + "auxiliary_loss_clip": 0.0151983, + "auxiliary_loss_mlp": 0.00420586, + "balance_loss_clip": 1.2287029, + "balance_loss_mlp": 0.3825103, + "epoch": 0.33440553133924544, + "flos": 16398823835520.0, + "grad_norm": 216.9696923270397, + "language_loss": 0.75399613, + "learning_rate": 3.1041887941035355e-06, + "loss": 0.77340031, + "num_input_tokens_seen": 119426475, + "router_z_loss_clip": 2.91210938, + "router_z_loss_mlp": 0.38085938, + "step": 5562, + "time_per_iteration": 2.639038324356079 + }, + { + "auxiliary_loss_clip": 0.01506078, + "auxiliary_loss_mlp": 0.00364154, + "balance_loss_clip": 1.21729779, + "balance_loss_mlp": 0.3308233, + "epoch": 0.3344656545919134, + "flos": 24242072958720.0, + "grad_norm": 5.591433907291675, + "language_loss": 0.70583355, + "learning_rate": 3.1038640471814685e-06, + "loss": 0.72453582, + "num_input_tokens_seen": 119446900, + "router_z_loss_clip": 2.88867188, + "router_z_loss_mlp": 0.33349609, + "step": 5563, + "time_per_iteration": 2.690009832382202 + }, + { + "auxiliary_loss_clip": 0.01530579, + "auxiliary_loss_mlp": 0.00419479, + "balance_loss_clip": 1.23253655, + "balance_loss_mlp": 0.38002074, + "epoch": 0.3345257778445814, + "flos": 52118843149440.0, + "grad_norm": 19.87167114398471, + "language_loss": 0.78265846, + "learning_rate": 3.103539258400766e-06, + "loss": 0.80215895, + "num_input_tokens_seen": 119470945, + "router_z_loss_clip": 2.97851562, + "router_z_loss_mlp": 0.39453125, + "step": 5564, + "time_per_iteration": 2.940110445022583 + }, + { + "auxiliary_loss_clip": 0.01496691, + "auxiliary_loss_mlp": 0.00073659, + "balance_loss_clip": 1.29375601, + "balance_loss_mlp": 0.06369306, + "epoch": 0.33458590109724934, + "flos": 68048602254720.0, + "grad_norm": 0.8057517237810922, + "language_loss": 0.54941928, + "learning_rate": 3.103214427773745e-06, + "loss": 0.56512272, + "num_input_tokens_seen": 119529925, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.09960938, + "step": 5565, + "time_per_iteration": 3.1427042484283447 + }, + { + "auxiliary_loss_clip": 0.01521776, + "auxiliary_loss_mlp": 0.00394313, + "balance_loss_clip": 1.22981298, + "balance_loss_mlp": 0.35909843, + "epoch": 0.3346460243499173, + "flos": 37414788768000.0, + "grad_norm": 79.65497083136118, + "language_loss": 0.71349871, + "learning_rate": 3.102889555312721e-06, + "loss": 0.73265958, + "num_input_tokens_seen": 119550700, + "router_z_loss_clip": 2.91796875, + "router_z_loss_mlp": 0.35229492, + "step": 5566, + "time_per_iteration": 2.766519069671631 + }, + { + "auxiliary_loss_clip": 0.015084, + "auxiliary_loss_mlp": 0.00391326, + "balance_loss_clip": 1.21862936, + "balance_loss_mlp": 0.35587275, + "epoch": 0.3347061476025853, + "flos": 18697358021760.0, + "grad_norm": 2.652467543853517, + "language_loss": 0.82713145, + "learning_rate": 3.102564641030016e-06, + "loss": 0.84612876, + "num_input_tokens_seen": 119569295, + "router_z_loss_clip": 2.89648438, + "router_z_loss_mlp": 0.35473633, + "step": 5567, + "time_per_iteration": 2.665616989135742 + }, + { + "auxiliary_loss_clip": 0.01511983, + "auxiliary_loss_mlp": 0.00426049, + "balance_loss_clip": 1.21596718, + "balance_loss_mlp": 0.38678133, + "epoch": 0.3347662708552533, + "flos": 13917827537280.0, + "grad_norm": 2.7348646788553763, + "language_loss": 0.83195448, + "learning_rate": 3.102239684937949e-06, + "loss": 0.85133481, + "num_input_tokens_seen": 119587375, + "router_z_loss_clip": 2.95507812, + "router_z_loss_mlp": 0.39306641, + "step": 5568, + "time_per_iteration": 2.6135306358337402 + }, + { + "auxiliary_loss_clip": 0.01535037, + "auxiliary_loss_mlp": 0.0041354, + "balance_loss_clip": 1.23490989, + "balance_loss_mlp": 0.37625092, + "epoch": 0.33482639410792125, + "flos": 19750402068480.0, + "grad_norm": 76.17103427571024, + "language_loss": 0.76378047, + "learning_rate": 3.101914687048842e-06, + "loss": 0.78326631, + "num_input_tokens_seen": 119604530, + "router_z_loss_clip": 3.00195312, + "router_z_loss_mlp": 0.37280273, + "step": 5569, + "time_per_iteration": 2.718444585800171 + }, + { + "auxiliary_loss_clip": 0.01520853, + "auxiliary_loss_mlp": 0.0041368, + "balance_loss_clip": 1.2216177, + "balance_loss_mlp": 0.3769401, + "epoch": 0.3348865173605892, + "flos": 16102991422080.0, + "grad_norm": 293.08048769402546, + "language_loss": 0.95181799, + "learning_rate": 3.10158964737502e-06, + "loss": 0.97116333, + "num_input_tokens_seen": 119621025, + "router_z_loss_clip": 2.99414062, + "router_z_loss_mlp": 0.3671875, + "step": 5570, + "time_per_iteration": 2.862739086151123 + }, + { + "auxiliary_loss_clip": 0.01515847, + "auxiliary_loss_mlp": 0.00386704, + "balance_loss_clip": 1.22176993, + "balance_loss_mlp": 0.34920084, + "epoch": 0.3349466406132572, + "flos": 25008945350400.0, + "grad_norm": 30.53975102390588, + "language_loss": 0.85130179, + "learning_rate": 3.101264565928808e-06, + "loss": 0.87032729, + "num_input_tokens_seen": 119641725, + "router_z_loss_clip": 2.93945312, + "router_z_loss_mlp": 0.37524414, + "step": 5571, + "time_per_iteration": 2.783130645751953 + }, + { + "auxiliary_loss_clip": 0.01545861, + "auxiliary_loss_mlp": 0.00230658, + "balance_loss_clip": 1.33117783, + "balance_loss_mlp": 0.21888056, + "epoch": 0.33500676386592515, + "flos": 54319991564160.0, + "grad_norm": 1.0264118522767554, + "language_loss": 0.55268157, + "learning_rate": 3.1009394427225335e-06, + "loss": 0.57044673, + "num_input_tokens_seen": 119693560, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.11767578, + "step": 5572, + "time_per_iteration": 3.1498193740844727 + }, + { + "auxiliary_loss_clip": 0.01534155, + "auxiliary_loss_mlp": 0.00431359, + "balance_loss_clip": 1.23993659, + "balance_loss_mlp": 0.39411792, + "epoch": 0.3350668871185931, + "flos": 26797332625920.0, + "grad_norm": 208.2353012507294, + "language_loss": 0.85203719, + "learning_rate": 3.1006142777685257e-06, + "loss": 0.87169236, + "num_input_tokens_seen": 119712935, + "router_z_loss_clip": 2.94140625, + "router_z_loss_mlp": 0.37231445, + "step": 5573, + "time_per_iteration": 2.7048099040985107 + }, + { + "auxiliary_loss_clip": 0.01523163, + "auxiliary_loss_mlp": 0.00382255, + "balance_loss_clip": 1.22708035, + "balance_loss_mlp": 0.34746981, + "epoch": 0.3351270103712611, + "flos": 33510508986240.0, + "grad_norm": 8.606320182678555, + "language_loss": 0.80444169, + "learning_rate": 3.1002890710791133e-06, + "loss": 0.82349586, + "num_input_tokens_seen": 119731680, + "router_z_loss_clip": 2.96289062, + "router_z_loss_mlp": 0.34790039, + "step": 5574, + "time_per_iteration": 2.7470831871032715 + }, + { + "auxiliary_loss_clip": 0.01540146, + "auxiliary_loss_mlp": 0.00365377, + "balance_loss_clip": 1.24823332, + "balance_loss_mlp": 0.33156967, + "epoch": 0.33518713362392905, + "flos": 26506240807680.0, + "grad_norm": 6.444732463716021, + "language_loss": 0.92807651, + "learning_rate": 3.0999638226666287e-06, + "loss": 0.94713169, + "num_input_tokens_seen": 119752155, + "router_z_loss_clip": 2.91796875, + "router_z_loss_mlp": 0.33813477, + "step": 5575, + "time_per_iteration": 2.7024929523468018 + }, + { + "auxiliary_loss_clip": 0.01583254, + "auxiliary_loss_mlp": 0.00372733, + "balance_loss_clip": 1.26940513, + "balance_loss_mlp": 0.33606422, + "epoch": 0.335247256876597, + "flos": 17232345912960.0, + "grad_norm": 4.1518286083812805, + "language_loss": 0.87862605, + "learning_rate": 3.0996385325434063e-06, + "loss": 0.89818591, + "num_input_tokens_seen": 119769195, + "router_z_loss_clip": 3.13867188, + "router_z_loss_mlp": 0.36645508, + "step": 5576, + "time_per_iteration": 2.639709711074829 + }, + { + "auxiliary_loss_clip": 0.01534514, + "auxiliary_loss_mlp": 0.0035317, + "balance_loss_clip": 1.2368871, + "balance_loss_mlp": 0.31595263, + "epoch": 0.335307380129265, + "flos": 25629373992960.0, + "grad_norm": 1052.0087385056956, + "language_loss": 0.78825974, + "learning_rate": 3.0993132007217806e-06, + "loss": 0.8071366, + "num_input_tokens_seen": 119786810, + "router_z_loss_clip": 2.97460938, + "router_z_loss_mlp": 0.37231445, + "step": 5577, + "time_per_iteration": 2.6559345722198486 + }, + { + "auxiliary_loss_clip": 0.01561902, + "auxiliary_loss_mlp": 0.0036954, + "balance_loss_clip": 1.26151848, + "balance_loss_mlp": 0.33263314, + "epoch": 0.33536750338193294, + "flos": 19680089195520.0, + "grad_norm": 55.66494601609688, + "language_loss": 0.85998583, + "learning_rate": 3.0989878272140883e-06, + "loss": 0.87930024, + "num_input_tokens_seen": 119805395, + "router_z_loss_clip": 3.0078125, + "router_z_loss_mlp": 0.36914062, + "step": 5578, + "time_per_iteration": 2.6454966068267822 + }, + { + "auxiliary_loss_clip": 0.01533621, + "auxiliary_loss_mlp": 0.0037571, + "balance_loss_clip": 1.24074733, + "balance_loss_mlp": 0.33911306, + "epoch": 0.3354276266346009, + "flos": 18332613365760.0, + "grad_norm": 15.32926166381112, + "language_loss": 0.80194861, + "learning_rate": 3.0986624120326676e-06, + "loss": 0.82104194, + "num_input_tokens_seen": 119823135, + "router_z_loss_clip": 2.9296875, + "router_z_loss_mlp": 0.36547852, + "step": 5579, + "time_per_iteration": 2.612842559814453 + }, + { + "auxiliary_loss_clip": 0.01526289, + "auxiliary_loss_mlp": 0.00382907, + "balance_loss_clip": 1.23031151, + "balance_loss_mlp": 0.34802598, + "epoch": 0.3354877498872689, + "flos": 17858556645120.0, + "grad_norm": 21.226347619111102, + "language_loss": 0.88099837, + "learning_rate": 3.0983369551898573e-06, + "loss": 0.90009034, + "num_input_tokens_seen": 119842265, + "router_z_loss_clip": 2.95898438, + "router_z_loss_mlp": 0.34887695, + "step": 5580, + "time_per_iteration": 2.632863759994507 + }, + { + "auxiliary_loss_clip": 0.01543369, + "auxiliary_loss_mlp": 0.0040766, + "balance_loss_clip": 1.24537349, + "balance_loss_mlp": 0.37020475, + "epoch": 0.3355478731399369, + "flos": 24717745791360.0, + "grad_norm": 3.7471992009845363, + "language_loss": 0.82669032, + "learning_rate": 3.0980114566980003e-06, + "loss": 0.84620064, + "num_input_tokens_seen": 119862500, + "router_z_loss_clip": 2.98046875, + "router_z_loss_mlp": 0.37426758, + "step": 5581, + "time_per_iteration": 2.68477725982666 + }, + { + "auxiliary_loss_clip": 0.01569792, + "auxiliary_loss_mlp": 0.00413438, + "balance_loss_clip": 1.26225519, + "balance_loss_mlp": 0.3743847, + "epoch": 0.33560799639260486, + "flos": 16873886136960.0, + "grad_norm": 11.334767225374792, + "language_loss": 0.8073982, + "learning_rate": 3.0976859165694384e-06, + "loss": 0.82723045, + "num_input_tokens_seen": 119880160, + "router_z_loss_clip": 3.07421875, + "router_z_loss_mlp": 0.39038086, + "step": 5582, + "time_per_iteration": 2.6129021644592285 + }, + { + "auxiliary_loss_clip": 0.01551277, + "auxiliary_loss_mlp": 0.00396031, + "balance_loss_clip": 1.2513001, + "balance_loss_mlp": 0.35845616, + "epoch": 0.3356681196452728, + "flos": 18333511205760.0, + "grad_norm": 3299.54745304531, + "language_loss": 0.88300955, + "learning_rate": 3.0973603348165166e-06, + "loss": 0.90248257, + "num_input_tokens_seen": 119899040, + "router_z_loss_clip": 2.99804688, + "router_z_loss_mlp": 0.37597656, + "step": 5583, + "time_per_iteration": 2.676743745803833 + }, + { + "auxiliary_loss_clip": 0.01548624, + "auxiliary_loss_mlp": 0.0036086, + "balance_loss_clip": 1.2516669, + "balance_loss_mlp": 0.32264179, + "epoch": 0.3357282428979408, + "flos": 34750612085760.0, + "grad_norm": 35.37925294199629, + "language_loss": 0.8425647, + "learning_rate": 3.097034711451581e-06, + "loss": 0.86165953, + "num_input_tokens_seen": 119921120, + "router_z_loss_clip": 2.96875, + "router_z_loss_mlp": 0.38208008, + "step": 5584, + "time_per_iteration": 2.8251233100891113 + }, + { + "auxiliary_loss_clip": 0.01550153, + "auxiliary_loss_mlp": 0.00391207, + "balance_loss_clip": 1.2517885, + "balance_loss_mlp": 0.35425195, + "epoch": 0.33578836615060875, + "flos": 21580087006080.0, + "grad_norm": 5.388423713421417, + "language_loss": 0.82181233, + "learning_rate": 3.0967090464869795e-06, + "loss": 0.84122592, + "num_input_tokens_seen": 119940165, + "router_z_loss_clip": 2.984375, + "router_z_loss_mlp": 0.36987305, + "step": 5585, + "time_per_iteration": 2.6921873092651367 + }, + { + "auxiliary_loss_clip": 0.01576285, + "auxiliary_loss_mlp": 0.00374348, + "balance_loss_clip": 1.27838063, + "balance_loss_mlp": 0.33779863, + "epoch": 0.3358484894032767, + "flos": 24530291688960.0, + "grad_norm": 24.195852877666933, + "language_loss": 0.82955867, + "learning_rate": 3.0963833399350608e-06, + "loss": 0.84906495, + "num_input_tokens_seen": 119959730, + "router_z_loss_clip": 2.97460938, + "router_z_loss_mlp": 0.36572266, + "step": 5586, + "time_per_iteration": 2.7588794231414795 + }, + { + "auxiliary_loss_clip": 0.01588363, + "auxiliary_loss_mlp": 0.00353321, + "balance_loss_clip": 1.27562213, + "balance_loss_mlp": 0.31410095, + "epoch": 0.3359086126559447, + "flos": 22455589104000.0, + "grad_norm": 103.3704683249579, + "language_loss": 0.8943662, + "learning_rate": 3.0960575918081756e-06, + "loss": 0.91378307, + "num_input_tokens_seen": 119979315, + "router_z_loss_clip": 3.12890625, + "router_z_loss_mlp": 0.39233398, + "step": 5587, + "time_per_iteration": 2.7122697830200195 + }, + { + "auxiliary_loss_clip": 0.01589126, + "auxiliary_loss_mlp": 0.00349829, + "balance_loss_clip": 1.29490042, + "balance_loss_mlp": 0.31568766, + "epoch": 0.33596873590861265, + "flos": 16543687386240.0, + "grad_norm": 1194.232375840977, + "language_loss": 0.73894513, + "learning_rate": 3.095731802118677e-06, + "loss": 0.75833464, + "num_input_tokens_seen": 119996140, + "router_z_loss_clip": 2.94140625, + "router_z_loss_mlp": 0.34179688, + "step": 5588, + "time_per_iteration": 2.685500383377075 + }, + { + "auxiliary_loss_clip": 0.01591555, + "auxiliary_loss_mlp": 0.0036385, + "balance_loss_clip": 1.28324127, + "balance_loss_mlp": 0.32515472, + "epoch": 0.3360288591612806, + "flos": 31175812782720.0, + "grad_norm": 22.357301113988644, + "language_loss": 0.75951552, + "learning_rate": 3.095405970878919e-06, + "loss": 0.77906954, + "num_input_tokens_seen": 120017720, + "router_z_loss_clip": 3.08007812, + "router_z_loss_mlp": 0.38720703, + "step": 5589, + "time_per_iteration": 4.156806468963623 + }, + { + "auxiliary_loss_clip": 0.01587954, + "auxiliary_loss_mlp": 0.00371514, + "balance_loss_clip": 1.28618753, + "balance_loss_mlp": 0.33608454, + "epoch": 0.3360889824139486, + "flos": 23696913265920.0, + "grad_norm": 57.76248346447716, + "language_loss": 0.72684246, + "learning_rate": 3.0950800981012567e-06, + "loss": 0.74643713, + "num_input_tokens_seen": 120036335, + "router_z_loss_clip": 3.02148438, + "router_z_loss_mlp": 0.35424805, + "step": 5590, + "time_per_iteration": 4.171026706695557 + }, + { + "auxiliary_loss_clip": 0.01593126, + "auxiliary_loss_mlp": 0.00337489, + "balance_loss_clip": 1.29468179, + "balance_loss_mlp": 0.30203605, + "epoch": 0.33614910566661654, + "flos": 19318109886720.0, + "grad_norm": 3.0197056841907592, + "language_loss": 0.81321573, + "learning_rate": 3.094754183798047e-06, + "loss": 0.83252186, + "num_input_tokens_seen": 120056120, + "router_z_loss_clip": 2.98828125, + "router_z_loss_mlp": 0.35449219, + "step": 5591, + "time_per_iteration": 2.6838200092315674 + }, + { + "auxiliary_loss_clip": 0.01591979, + "auxiliary_loss_mlp": 0.00349122, + "balance_loss_clip": 1.29555023, + "balance_loss_mlp": 0.31207216, + "epoch": 0.3362092289192845, + "flos": 16472261191680.0, + "grad_norm": 33.06302673028555, + "language_loss": 0.76515383, + "learning_rate": 3.0944282279816493e-06, + "loss": 0.78456485, + "num_input_tokens_seen": 120073650, + "router_z_loss_clip": 2.96679688, + "router_z_loss_mlp": 0.37084961, + "step": 5592, + "time_per_iteration": 2.6967225074768066 + }, + { + "auxiliary_loss_clip": 0.01558369, + "auxiliary_loss_mlp": 0.00347929, + "balance_loss_clip": 1.26691604, + "balance_loss_mlp": 0.31340617, + "epoch": 0.33626935217195253, + "flos": 24243581329920.0, + "grad_norm": 27.44926009669974, + "language_loss": 0.82425714, + "learning_rate": 3.094102230664423e-06, + "loss": 0.84332013, + "num_input_tokens_seen": 120093260, + "router_z_loss_clip": 2.91601562, + "router_z_loss_mlp": 0.34545898, + "step": 5593, + "time_per_iteration": 2.6847188472747803 + }, + { + "auxiliary_loss_clip": 0.01581792, + "auxiliary_loss_mlp": 0.00362582, + "balance_loss_clip": 1.27785945, + "balance_loss_mlp": 0.32348162, + "epoch": 0.3363294754246205, + "flos": 19718765164800.0, + "grad_norm": 87.56725031354604, + "language_loss": 0.78874129, + "learning_rate": 3.093776191858731e-06, + "loss": 0.80818498, + "num_input_tokens_seen": 120111830, + "router_z_loss_clip": 3.03515625, + "router_z_loss_mlp": 0.39111328, + "step": 5594, + "time_per_iteration": 2.6315743923187256 + }, + { + "auxiliary_loss_clip": 0.01593111, + "auxiliary_loss_mlp": 0.00367594, + "balance_loss_clip": 1.29271281, + "balance_loss_mlp": 0.32861251, + "epoch": 0.33638959867728846, + "flos": 22596286677120.0, + "grad_norm": 25.581757089423586, + "language_loss": 0.84858316, + "learning_rate": 3.0934501115769363e-06, + "loss": 0.86819023, + "num_input_tokens_seen": 120130470, + "router_z_loss_clip": 3.00390625, + "router_z_loss_mlp": 0.38989258, + "step": 5595, + "time_per_iteration": 4.11200737953186 + }, + { + "auxiliary_loss_clip": 0.01597624, + "auxiliary_loss_mlp": 0.00324098, + "balance_loss_clip": 1.30071568, + "balance_loss_mlp": 0.2880972, + "epoch": 0.3364497219299564, + "flos": 20994742972800.0, + "grad_norm": 19.125038599518827, + "language_loss": 0.87747091, + "learning_rate": 3.0931239898314037e-06, + "loss": 0.8966881, + "num_input_tokens_seen": 120150735, + "router_z_loss_clip": 2.96875, + "router_z_loss_mlp": 0.36010742, + "step": 5596, + "time_per_iteration": 2.712236166000366 + }, + { + "auxiliary_loss_clip": 0.01591046, + "auxiliary_loss_mlp": 0.00363684, + "balance_loss_clip": 1.29055262, + "balance_loss_mlp": 0.3294946, + "epoch": 0.3365098451826244, + "flos": 25228610974080.0, + "grad_norm": 18.191507710314223, + "language_loss": 0.80562866, + "learning_rate": 3.0927978266344995e-06, + "loss": 0.82517594, + "num_input_tokens_seen": 120173230, + "router_z_loss_clip": 3.0078125, + "router_z_loss_mlp": 0.34204102, + "step": 5597, + "time_per_iteration": 2.836308002471924 + }, + { + "auxiliary_loss_clip": 0.01588202, + "auxiliary_loss_mlp": 0.00371647, + "balance_loss_clip": 1.29056537, + "balance_loss_mlp": 0.33504951, + "epoch": 0.33656996843529235, + "flos": 24571697091840.0, + "grad_norm": 3.6634150951222573, + "language_loss": 0.85010904, + "learning_rate": 3.0924716219985916e-06, + "loss": 0.86970752, + "num_input_tokens_seen": 120191860, + "router_z_loss_clip": 2.9765625, + "router_z_loss_mlp": 0.36572266, + "step": 5598, + "time_per_iteration": 2.6566855907440186 + }, + { + "auxiliary_loss_clip": 0.01621995, + "auxiliary_loss_mlp": 0.00402566, + "balance_loss_clip": 1.30762208, + "balance_loss_mlp": 0.36241624, + "epoch": 0.3366300916879603, + "flos": 44091120752640.0, + "grad_norm": 14.016018202304966, + "language_loss": 0.70498919, + "learning_rate": 3.0921453759360514e-06, + "loss": 0.72523481, + "num_input_tokens_seen": 120219195, + "router_z_loss_clip": 3.14648438, + "router_z_loss_mlp": 0.40136719, + "step": 5599, + "time_per_iteration": 2.821074962615967 + }, + { + "auxiliary_loss_clip": 0.01582871, + "auxiliary_loss_mlp": 0.00381205, + "balance_loss_clip": 1.27528596, + "balance_loss_mlp": 0.34022084, + "epoch": 0.3366902149406283, + "flos": 13879869840000.0, + "grad_norm": 9.64623936925707, + "language_loss": 0.90438521, + "learning_rate": 3.091819088459249e-06, + "loss": 0.92402595, + "num_input_tokens_seen": 120232950, + "router_z_loss_clip": 3.07617188, + "router_z_loss_mlp": 0.40966797, + "step": 5600, + "time_per_iteration": 2.590841293334961 + }, + { + "auxiliary_loss_clip": 0.01593971, + "auxiliary_loss_mlp": 0.00347623, + "balance_loss_clip": 1.28846407, + "balance_loss_mlp": 0.31200367, + "epoch": 0.33675033819329625, + "flos": 16253098358400.0, + "grad_norm": 123.89688377472987, + "language_loss": 0.88376749, + "learning_rate": 3.0914927595805573e-06, + "loss": 0.90318346, + "num_input_tokens_seen": 120248865, + "router_z_loss_clip": 3.05664062, + "router_z_loss_mlp": 0.35644531, + "step": 5601, + "time_per_iteration": 3.998593330383301 + }, + { + "auxiliary_loss_clip": 0.01611561, + "auxiliary_loss_mlp": 0.00336867, + "balance_loss_clip": 1.31644046, + "balance_loss_mlp": 0.30346438, + "epoch": 0.3368104614459642, + "flos": 17055809544960.0, + "grad_norm": 73.65743158595602, + "language_loss": 0.89894515, + "learning_rate": 3.0911663893123507e-06, + "loss": 0.91842943, + "num_input_tokens_seen": 120267820, + "router_z_loss_clip": 2.953125, + "router_z_loss_mlp": 0.33422852, + "step": 5602, + "time_per_iteration": 2.6501998901367188 + }, + { + "auxiliary_loss_clip": 0.01599793, + "auxiliary_loss_mlp": 0.00373667, + "balance_loss_clip": 1.30186224, + "balance_loss_mlp": 0.33709359, + "epoch": 0.3368705846986322, + "flos": 17858628472320.0, + "grad_norm": 9.367974326772686, + "language_loss": 0.75468409, + "learning_rate": 3.0908399776670048e-06, + "loss": 0.77441871, + "num_input_tokens_seen": 120286540, + "router_z_loss_clip": 2.98046875, + "router_z_loss_mlp": 0.3659668, + "step": 5603, + "time_per_iteration": 2.6472818851470947 + }, + { + "auxiliary_loss_clip": 0.01629645, + "auxiliary_loss_mlp": 0.00390888, + "balance_loss_clip": 1.32413876, + "balance_loss_mlp": 0.35266981, + "epoch": 0.33693070795130015, + "flos": 22929502170240.0, + "grad_norm": 32.85631066989053, + "language_loss": 0.87790245, + "learning_rate": 3.090513524656898e-06, + "loss": 0.89810777, + "num_input_tokens_seen": 120307305, + "router_z_loss_clip": 3.05273438, + "router_z_loss_mlp": 0.38208008, + "step": 5604, + "time_per_iteration": 2.6615848541259766 + }, + { + "auxiliary_loss_clip": 0.01621022, + "auxiliary_loss_mlp": 0.00379452, + "balance_loss_clip": 1.31628835, + "balance_loss_mlp": 0.34216344, + "epoch": 0.3369908312039681, + "flos": 22017443005440.0, + "grad_norm": 2.08017111106845, + "language_loss": 0.79698372, + "learning_rate": 3.090187030294409e-06, + "loss": 0.81698841, + "num_input_tokens_seen": 120327845, + "router_z_loss_clip": 3.046875, + "router_z_loss_mlp": 0.37280273, + "step": 5605, + "time_per_iteration": 2.6683003902435303 + }, + { + "auxiliary_loss_clip": 0.01643846, + "auxiliary_loss_mlp": 0.00413838, + "balance_loss_clip": 1.33239174, + "balance_loss_mlp": 0.37385544, + "epoch": 0.33705095445663613, + "flos": 11801970944640.0, + "grad_norm": 6.184367869605203, + "language_loss": 0.89504039, + "learning_rate": 3.089860494591919e-06, + "loss": 0.91561717, + "num_input_tokens_seen": 120343255, + "router_z_loss_clip": 3.11914062, + "router_z_loss_mlp": 0.40014648, + "step": 5606, + "time_per_iteration": 2.627930164337158 + }, + { + "auxiliary_loss_clip": 0.01630758, + "auxiliary_loss_mlp": 0.00363268, + "balance_loss_clip": 1.32452416, + "balance_loss_mlp": 0.32409561, + "epoch": 0.3371110777093041, + "flos": 25046400257280.0, + "grad_norm": 1.758199402811793, + "language_loss": 0.73085803, + "learning_rate": 3.089533917561809e-06, + "loss": 0.75079823, + "num_input_tokens_seen": 120361745, + "router_z_loss_clip": 3.05664062, + "router_z_loss_mlp": 0.39160156, + "step": 5607, + "time_per_iteration": 2.7224080562591553 + }, + { + "auxiliary_loss_clip": 0.01633906, + "auxiliary_loss_mlp": 0.00427294, + "balance_loss_clip": 1.32143724, + "balance_loss_mlp": 0.38719207, + "epoch": 0.33717120096197206, + "flos": 26579031719040.0, + "grad_norm": 5.635755917585983, + "language_loss": 0.7832191, + "learning_rate": 3.089207299216464e-06, + "loss": 0.80383104, + "num_input_tokens_seen": 120380565, + "router_z_loss_clip": 3.125, + "router_z_loss_mlp": 0.40087891, + "step": 5608, + "time_per_iteration": 2.837798833847046 + }, + { + "auxiliary_loss_clip": 0.01633129, + "auxiliary_loss_mlp": 0.00415322, + "balance_loss_clip": 1.32576799, + "balance_loss_mlp": 0.37674558, + "epoch": 0.33723132421464, + "flos": 15158541168000.0, + "grad_norm": 9.403551934723325, + "language_loss": 0.8491286, + "learning_rate": 3.088880639568269e-06, + "loss": 0.86961311, + "num_input_tokens_seen": 120399235, + "router_z_loss_clip": 3.07421875, + "router_z_loss_mlp": 0.38574219, + "step": 5609, + "time_per_iteration": 2.6769025325775146 + }, + { + "auxiliary_loss_clip": 0.0164015, + "auxiliary_loss_mlp": 0.00452785, + "balance_loss_clip": 1.32881725, + "balance_loss_mlp": 0.40998852, + "epoch": 0.337291447467308, + "flos": 23436093634560.0, + "grad_norm": 6.40304731037525, + "language_loss": 0.87100255, + "learning_rate": 3.0885539386296114e-06, + "loss": 0.89193189, + "num_input_tokens_seen": 120420095, + "router_z_loss_clip": 3.11523438, + "router_z_loss_mlp": 0.42773438, + "step": 5610, + "time_per_iteration": 2.6841881275177 + }, + { + "auxiliary_loss_clip": 0.01652004, + "auxiliary_loss_mlp": 0.0041201, + "balance_loss_clip": 1.34475589, + "balance_loss_mlp": 0.37286192, + "epoch": 0.33735157071997596, + "flos": 17238163916160.0, + "grad_norm": 2.6111512218583526, + "language_loss": 0.88976723, + "learning_rate": 3.088227196412879e-06, + "loss": 0.91040736, + "num_input_tokens_seen": 120437690, + "router_z_loss_clip": 3.07226562, + "router_z_loss_mlp": 0.3918457, + "step": 5611, + "time_per_iteration": 2.6425278186798096 + }, + { + "auxiliary_loss_clip": 0.01635836, + "auxiliary_loss_mlp": 0.00462604, + "balance_loss_clip": 1.32646894, + "balance_loss_mlp": 0.41878232, + "epoch": 0.3374116939726439, + "flos": 28257388657920.0, + "grad_norm": 97.05374320654698, + "language_loss": 0.84320545, + "learning_rate": 3.0879004129304626e-06, + "loss": 0.8641898, + "num_input_tokens_seen": 120459240, + "router_z_loss_clip": 3.08984375, + "router_z_loss_mlp": 0.43774414, + "step": 5612, + "time_per_iteration": 2.7237908840179443 + }, + { + "auxiliary_loss_clip": 0.0164081, + "auxiliary_loss_mlp": 0.00491291, + "balance_loss_clip": 1.33052969, + "balance_loss_mlp": 0.44789928, + "epoch": 0.3374718172253119, + "flos": 35919396731520.0, + "grad_norm": 28.617111203699505, + "language_loss": 0.75644296, + "learning_rate": 3.087573588194753e-06, + "loss": 0.77776396, + "num_input_tokens_seen": 120481090, + "router_z_loss_clip": 3.10742188, + "router_z_loss_mlp": 0.43432617, + "step": 5613, + "time_per_iteration": 2.7829782962799072 + }, + { + "auxiliary_loss_clip": 0.01659478, + "auxiliary_loss_mlp": 0.00461238, + "balance_loss_clip": 1.34491527, + "balance_loss_mlp": 0.41929978, + "epoch": 0.33753194047797985, + "flos": 18186672407040.0, + "grad_norm": 18.727949652256477, + "language_loss": 0.84484076, + "learning_rate": 3.087246722218144e-06, + "loss": 0.86604798, + "num_input_tokens_seen": 120500045, + "router_z_loss_clip": 3.14257812, + "router_z_loss_mlp": 0.41943359, + "step": 5614, + "time_per_iteration": 2.724947690963745 + }, + { + "auxiliary_loss_clip": 0.01659461, + "auxiliary_loss_mlp": 0.00508103, + "balance_loss_clip": 1.34440458, + "balance_loss_mlp": 0.46056187, + "epoch": 0.3375920637306478, + "flos": 23148916398720.0, + "grad_norm": 10.083504187588652, + "language_loss": 0.96133846, + "learning_rate": 3.086919815013031e-06, + "loss": 0.98301405, + "num_input_tokens_seen": 120521125, + "router_z_loss_clip": 3.1484375, + "router_z_loss_mlp": 0.47583008, + "step": 5615, + "time_per_iteration": 2.7331082820892334 + }, + { + "auxiliary_loss_clip": 0.01646436, + "auxiliary_loss_mlp": 0.0050217, + "balance_loss_clip": 1.34010863, + "balance_loss_mlp": 0.45958847, + "epoch": 0.3376521869833158, + "flos": 23112215677440.0, + "grad_norm": 8.56628761434008, + "language_loss": 0.85308164, + "learning_rate": 3.086592866591809e-06, + "loss": 0.87456769, + "num_input_tokens_seen": 120539180, + "router_z_loss_clip": 3.06445312, + "router_z_loss_mlp": 0.42553711, + "step": 5616, + "time_per_iteration": 2.737654447555542 + }, + { + "auxiliary_loss_clip": 0.01666528, + "auxiliary_loss_mlp": 0.00455815, + "balance_loss_clip": 1.34549296, + "balance_loss_mlp": 0.4128992, + "epoch": 0.33771231023598375, + "flos": 19274585581440.0, + "grad_norm": 38.77886928773457, + "language_loss": 0.89252031, + "learning_rate": 3.0862658769668774e-06, + "loss": 0.91374373, + "num_input_tokens_seen": 120556280, + "router_z_loss_clip": 3.20703125, + "router_z_loss_mlp": 0.42895508, + "step": 5617, + "time_per_iteration": 2.6170666217803955 + }, + { + "auxiliary_loss_clip": 0.01674944, + "auxiliary_loss_mlp": 0.0045434, + "balance_loss_clip": 1.3621515, + "balance_loss_mlp": 0.41101977, + "epoch": 0.3377724334886517, + "flos": 18150187167360.0, + "grad_norm": 24.142870080227333, + "language_loss": 0.84607404, + "learning_rate": 3.0859388461506343e-06, + "loss": 0.86736691, + "num_input_tokens_seen": 120575395, + "router_z_loss_clip": 3.12695312, + "router_z_loss_mlp": 0.43310547, + "step": 5618, + "time_per_iteration": 2.62235689163208 + }, + { + "auxiliary_loss_clip": 0.01673312, + "auxiliary_loss_mlp": 0.0050885, + "balance_loss_clip": 1.35544503, + "balance_loss_mlp": 0.46190548, + "epoch": 0.3378325567413197, + "flos": 25775997310080.0, + "grad_norm": 24.18243213084034, + "language_loss": 0.76862633, + "learning_rate": 3.085611774155481e-06, + "loss": 0.79044801, + "num_input_tokens_seen": 120596075, + "router_z_loss_clip": 3.18359375, + "router_z_loss_mlp": 0.46972656, + "step": 5619, + "time_per_iteration": 2.7528467178344727 + }, + { + "auxiliary_loss_clip": 0.01691774, + "auxiliary_loss_mlp": 0.00481935, + "balance_loss_clip": 1.37267637, + "balance_loss_mlp": 0.43623054, + "epoch": 0.3378926799939877, + "flos": 21317112558720.0, + "grad_norm": 423.6003237403087, + "language_loss": 0.76496571, + "learning_rate": 3.085284660993821e-06, + "loss": 0.78670275, + "num_input_tokens_seen": 120614195, + "router_z_loss_clip": 3.19140625, + "router_z_loss_mlp": 0.45727539, + "step": 5620, + "time_per_iteration": 2.697732448577881 + }, + { + "auxiliary_loss_clip": 0.01656281, + "auxiliary_loss_mlp": 0.00510883, + "balance_loss_clip": 1.34566855, + "balance_loss_mlp": 0.46520182, + "epoch": 0.33795280324665566, + "flos": 24900028335360.0, + "grad_norm": 34.09951818416055, + "language_loss": 0.75049305, + "learning_rate": 3.084957506678058e-06, + "loss": 0.77216464, + "num_input_tokens_seen": 120634475, + "router_z_loss_clip": 3.10742188, + "router_z_loss_mlp": 0.45703125, + "step": 5621, + "time_per_iteration": 2.7057900428771973 + }, + { + "auxiliary_loss_clip": 0.01646339, + "auxiliary_loss_mlp": 0.00493199, + "balance_loss_clip": 1.34071016, + "balance_loss_mlp": 0.45009333, + "epoch": 0.33801292649932363, + "flos": 24753943722240.0, + "grad_norm": 5.487472678520415, + "language_loss": 0.87219661, + "learning_rate": 3.0846303112205975e-06, + "loss": 0.893592, + "num_input_tokens_seen": 120654980, + "router_z_loss_clip": 3.05664062, + "router_z_loss_mlp": 0.43066406, + "step": 5622, + "time_per_iteration": 2.675394058227539 + }, + { + "auxiliary_loss_clip": 0.01674453, + "auxiliary_loss_mlp": 0.00491399, + "balance_loss_clip": 1.36461687, + "balance_loss_mlp": 0.44958037, + "epoch": 0.3380730497519916, + "flos": 26723967096960.0, + "grad_norm": 9.238216684294697, + "language_loss": 0.7780472, + "learning_rate": 3.0843030746338464e-06, + "loss": 0.79970574, + "num_input_tokens_seen": 120676245, + "router_z_loss_clip": 3.09765625, + "router_z_loss_mlp": 0.41796875, + "step": 5623, + "time_per_iteration": 2.6746809482574463 + }, + { + "auxiliary_loss_clip": 0.02048074, + "auxiliary_loss_mlp": 0.00264322, + "balance_loss_clip": 1.84051514, + "balance_loss_mlp": 0.24610667, + "epoch": 0.33813317300465956, + "flos": 70035756416640.0, + "grad_norm": 0.7573395481446963, + "language_loss": 0.54530442, + "learning_rate": 3.083975796930215e-06, + "loss": 0.5684284, + "num_input_tokens_seen": 120741965, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.18261719, + "step": 5624, + "time_per_iteration": 3.243288040161133 + }, + { + "auxiliary_loss_clip": 0.01677985, + "auxiliary_loss_mlp": 0.00495723, + "balance_loss_clip": 1.36337876, + "balance_loss_mlp": 0.45094782, + "epoch": 0.3381932962573275, + "flos": 24097317148800.0, + "grad_norm": 185.56917330617796, + "language_loss": 0.79099125, + "learning_rate": 3.083648478122111e-06, + "loss": 0.81272829, + "num_input_tokens_seen": 120760410, + "router_z_loss_clip": 3.14648438, + "router_z_loss_mlp": 0.44775391, + "step": 5625, + "time_per_iteration": 2.6991119384765625 + }, + { + "auxiliary_loss_clip": 0.01690921, + "auxiliary_loss_mlp": 0.00487842, + "balance_loss_clip": 1.37075865, + "balance_loss_mlp": 0.44137439, + "epoch": 0.3382534195099955, + "flos": 19278248768640.0, + "grad_norm": 6.169780062407367, + "language_loss": 0.76683366, + "learning_rate": 3.0833211182219497e-06, + "loss": 0.78862131, + "num_input_tokens_seen": 120777705, + "router_z_loss_clip": 3.20117188, + "router_z_loss_mlp": 0.46508789, + "step": 5626, + "time_per_iteration": 2.63055157661438 + }, + { + "auxiliary_loss_clip": 0.0168893, + "auxiliary_loss_mlp": 0.00467423, + "balance_loss_clip": 1.38139033, + "balance_loss_mlp": 0.42453191, + "epoch": 0.33831354276266346, + "flos": 25226240676480.0, + "grad_norm": 36.821120407016245, + "language_loss": 0.84339124, + "learning_rate": 3.0829937172421425e-06, + "loss": 0.86495483, + "num_input_tokens_seen": 120798660, + "router_z_loss_clip": 3.07617188, + "router_z_loss_mlp": 0.42871094, + "step": 5627, + "time_per_iteration": 2.68215274810791 + }, + { + "auxiliary_loss_clip": 0.01670614, + "auxiliary_loss_mlp": 0.00490446, + "balance_loss_clip": 1.35621345, + "balance_loss_mlp": 0.4462429, + "epoch": 0.3383736660153314, + "flos": 23112000195840.0, + "grad_norm": 16.352295523515277, + "language_loss": 0.86174428, + "learning_rate": 3.0826662751951055e-06, + "loss": 0.8833549, + "num_input_tokens_seen": 120816705, + "router_z_loss_clip": 3.140625, + "router_z_loss_mlp": 0.44189453, + "step": 5628, + "time_per_iteration": 2.6627743244171143 + }, + { + "auxiliary_loss_clip": 0.01690991, + "auxiliary_loss_mlp": 0.0051162, + "balance_loss_clip": 1.36639047, + "balance_loss_mlp": 0.46338803, + "epoch": 0.3384337892679994, + "flos": 23477139901440.0, + "grad_norm": 2.9168502097130826, + "language_loss": 0.84676969, + "learning_rate": 3.082338792093254e-06, + "loss": 0.86879587, + "num_input_tokens_seen": 120835375, + "router_z_loss_clip": 3.24804688, + "router_z_loss_mlp": 0.48193359, + "step": 5629, + "time_per_iteration": 2.7029573917388916 + }, + { + "auxiliary_loss_clip": 0.01675221, + "auxiliary_loss_mlp": 0.00544875, + "balance_loss_clip": 1.35501289, + "balance_loss_mlp": 0.49380583, + "epoch": 0.33849391252066735, + "flos": 19425805839360.0, + "grad_norm": 6.2141078884506165, + "language_loss": 0.90525162, + "learning_rate": 3.0820112679490074e-06, + "loss": 0.92745256, + "num_input_tokens_seen": 120854260, + "router_z_loss_clip": 3.203125, + "router_z_loss_mlp": 0.51098633, + "step": 5630, + "time_per_iteration": 2.6588196754455566 + }, + { + "auxiliary_loss_clip": 0.01677287, + "auxiliary_loss_mlp": 0.00472389, + "balance_loss_clip": 1.36633229, + "balance_loss_mlp": 0.4299503, + "epoch": 0.3385540357733353, + "flos": 21064840364160.0, + "grad_norm": 57.40580308839691, + "language_loss": 0.78036541, + "learning_rate": 3.0816837027747857e-06, + "loss": 0.80186212, + "num_input_tokens_seen": 120871590, + "router_z_loss_clip": 3.11132812, + "router_z_loss_mlp": 0.42431641, + "step": 5631, + "time_per_iteration": 4.1633875370025635 + }, + { + "auxiliary_loss_clip": 0.01990598, + "auxiliary_loss_mlp": 0.00153214, + "balance_loss_clip": 1.79332829, + "balance_loss_mlp": 0.14024439, + "epoch": 0.3386141590260033, + "flos": 69208013450880.0, + "grad_norm": 0.9528878836206, + "language_loss": 0.55906165, + "learning_rate": 3.0813560965830084e-06, + "loss": 0.58049977, + "num_input_tokens_seen": 120925550, + "router_z_loss_clip": 1.96875, + "router_z_loss_mlp": 0.12988281, + "step": 5632, + "time_per_iteration": 3.205193042755127 + }, + { + "auxiliary_loss_clip": 0.01670882, + "auxiliary_loss_mlp": 0.00479629, + "balance_loss_clip": 1.35524642, + "balance_loss_mlp": 0.4360939, + "epoch": 0.3386742822786713, + "flos": 25519487310720.0, + "grad_norm": 21.95724780594213, + "language_loss": 0.84613764, + "learning_rate": 3.0810284493861005e-06, + "loss": 0.86764276, + "num_input_tokens_seen": 120947620, + "router_z_loss_clip": 3.15625, + "router_z_loss_mlp": 0.43554688, + "step": 5633, + "time_per_iteration": 4.262273788452148 + }, + { + "auxiliary_loss_clip": 0.0163763, + "auxiliary_loss_mlp": 0.00441638, + "balance_loss_clip": 1.32633138, + "balance_loss_mlp": 0.40175086, + "epoch": 0.33873440553133927, + "flos": 23623116773760.0, + "grad_norm": 4.003270799141251, + "language_loss": 0.66261518, + "learning_rate": 3.0807007611964855e-06, + "loss": 0.6834079, + "num_input_tokens_seen": 120965205, + "router_z_loss_clip": 3.11328125, + "router_z_loss_mlp": 0.39892578, + "step": 5634, + "time_per_iteration": 2.723479986190796 + }, + { + "auxiliary_loss_clip": 0.01648715, + "auxiliary_loss_mlp": 0.00481733, + "balance_loss_clip": 1.34060037, + "balance_loss_mlp": 0.43881768, + "epoch": 0.33879452878400723, + "flos": 17088882992640.0, + "grad_norm": 18.660766430528557, + "language_loss": 0.9697445, + "learning_rate": 3.080373032026589e-06, + "loss": 0.99104899, + "num_input_tokens_seen": 120983560, + "router_z_loss_clip": 3.08007812, + "router_z_loss_mlp": 0.42895508, + "step": 5635, + "time_per_iteration": 2.683528184890747 + }, + { + "auxiliary_loss_clip": 0.01650593, + "auxiliary_loss_mlp": 0.00505514, + "balance_loss_clip": 1.35081482, + "balance_loss_mlp": 0.4600476, + "epoch": 0.3388546520366752, + "flos": 15742053607680.0, + "grad_norm": 2.355476753450642, + "language_loss": 0.80813694, + "learning_rate": 3.0800452618888386e-06, + "loss": 0.82969803, + "num_input_tokens_seen": 121001400, + "router_z_loss_clip": 2.99414062, + "router_z_loss_mlp": 0.4543457, + "step": 5636, + "time_per_iteration": 2.590348958969116 + }, + { + "auxiliary_loss_clip": 0.0163409, + "auxiliary_loss_mlp": 0.0050307, + "balance_loss_clip": 1.33239055, + "balance_loss_mlp": 0.45908174, + "epoch": 0.33891477528934316, + "flos": 22418744728320.0, + "grad_norm": 17.96299111891339, + "language_loss": 0.89739394, + "learning_rate": 3.0797174507956637e-06, + "loss": 0.91876554, + "num_input_tokens_seen": 121021760, + "router_z_loss_clip": 3.01367188, + "router_z_loss_mlp": 0.43969727, + "step": 5637, + "time_per_iteration": 2.652534008026123 + }, + { + "auxiliary_loss_clip": 0.01643711, + "auxiliary_loss_mlp": 0.00559935, + "balance_loss_clip": 1.33469391, + "balance_loss_mlp": 0.51239389, + "epoch": 0.3389748985420111, + "flos": 17274828723840.0, + "grad_norm": 6.916622723291121, + "language_loss": 0.76354319, + "learning_rate": 3.079389598759495e-06, + "loss": 0.78557962, + "num_input_tokens_seen": 121041070, + "router_z_loss_clip": 3.09179688, + "router_z_loss_mlp": 0.4753418, + "step": 5638, + "time_per_iteration": 4.059498310089111 + }, + { + "auxiliary_loss_clip": 0.01611362, + "auxiliary_loss_mlp": 0.00459081, + "balance_loss_clip": 1.31303573, + "balance_loss_mlp": 0.41635692, + "epoch": 0.3390350217946791, + "flos": 27744979190400.0, + "grad_norm": 12.07334466484072, + "language_loss": 0.86321962, + "learning_rate": 3.079061705792765e-06, + "loss": 0.88392407, + "num_input_tokens_seen": 121060890, + "router_z_loss_clip": 2.98046875, + "router_z_loss_mlp": 0.42700195, + "step": 5639, + "time_per_iteration": 2.7078371047973633 + }, + { + "auxiliary_loss_clip": 0.016025, + "auxiliary_loss_mlp": 0.00494724, + "balance_loss_clip": 1.29636431, + "balance_loss_mlp": 0.45064029, + "epoch": 0.33909514504734706, + "flos": 20339804338560.0, + "grad_norm": 2.9447441467685485, + "language_loss": 0.75569904, + "learning_rate": 3.078733771907907e-06, + "loss": 0.77667123, + "num_input_tokens_seen": 121079135, + "router_z_loss_clip": 3.06445312, + "router_z_loss_mlp": 0.44067383, + "step": 5640, + "time_per_iteration": 2.6862857341766357 + }, + { + "auxiliary_loss_clip": 0.01605412, + "auxiliary_loss_mlp": 0.00516295, + "balance_loss_clip": 1.30424452, + "balance_loss_mlp": 0.46985066, + "epoch": 0.339155268300015, + "flos": 14830030356480.0, + "grad_norm": 30.987080591917014, + "language_loss": 0.7501936, + "learning_rate": 3.0784057971173554e-06, + "loss": 0.77141064, + "num_input_tokens_seen": 121097685, + "router_z_loss_clip": 3.00976562, + "router_z_loss_mlp": 0.46484375, + "step": 5641, + "time_per_iteration": 2.608158588409424 + }, + { + "auxiliary_loss_clip": 0.01606893, + "auxiliary_loss_mlp": 0.00468371, + "balance_loss_clip": 1.30313706, + "balance_loss_mlp": 0.42710093, + "epoch": 0.339215391552683, + "flos": 26067951054720.0, + "grad_norm": 263121.91431245895, + "language_loss": 0.92944169, + "learning_rate": 3.0780777814335483e-06, + "loss": 0.95019436, + "num_input_tokens_seen": 121115640, + "router_z_loss_clip": 3.04101562, + "router_z_loss_mlp": 0.4128418, + "step": 5642, + "time_per_iteration": 2.6640493869781494 + }, + { + "auxiliary_loss_clip": 0.01587595, + "auxiliary_loss_mlp": 0.00454491, + "balance_loss_clip": 1.30117512, + "balance_loss_mlp": 0.41620094, + "epoch": 0.33927551480535095, + "flos": 14574705505920.0, + "grad_norm": 20.852150725974038, + "language_loss": 0.88211054, + "learning_rate": 3.077749724868924e-06, + "loss": 0.90253139, + "num_input_tokens_seen": 121132485, + "router_z_loss_clip": 2.86523438, + "router_z_loss_mlp": 0.38305664, + "step": 5643, + "time_per_iteration": 4.0114524364471436 + }, + { + "auxiliary_loss_clip": 0.01608968, + "auxiliary_loss_mlp": 0.00445923, + "balance_loss_clip": 1.31616831, + "balance_loss_mlp": 0.40641725, + "epoch": 0.3393356380580189, + "flos": 23805578885760.0, + "grad_norm": 15.191693125753208, + "language_loss": 0.81951714, + "learning_rate": 3.077421627435922e-06, + "loss": 0.84006602, + "num_input_tokens_seen": 121152935, + "router_z_loss_clip": 2.92773438, + "router_z_loss_mlp": 0.39501953, + "step": 5644, + "time_per_iteration": 2.672614097595215 + }, + { + "auxiliary_loss_clip": 0.01584404, + "auxiliary_loss_mlp": 0.00472852, + "balance_loss_clip": 1.2901907, + "balance_loss_mlp": 0.43201119, + "epoch": 0.3393957613106869, + "flos": 17347871030400.0, + "grad_norm": 4.357574613666112, + "language_loss": 0.73070824, + "learning_rate": 3.0770934891469832e-06, + "loss": 0.75128084, + "num_input_tokens_seen": 121169835, + "router_z_loss_clip": 2.94335938, + "router_z_loss_mlp": 0.40844727, + "step": 5645, + "time_per_iteration": 2.778305768966675 + }, + { + "auxiliary_loss_clip": 0.01597758, + "auxiliary_loss_mlp": 0.00434345, + "balance_loss_clip": 1.3074007, + "balance_loss_mlp": 0.39455256, + "epoch": 0.3394558845633549, + "flos": 28433960939520.0, + "grad_norm": 79.87131542036131, + "language_loss": 0.82893062, + "learning_rate": 3.076765310014552e-06, + "loss": 0.84925163, + "num_input_tokens_seen": 121190290, + "router_z_loss_clip": 2.90625, + "router_z_loss_mlp": 0.39819336, + "step": 5646, + "time_per_iteration": 2.765425443649292 + }, + { + "auxiliary_loss_clip": 0.01576168, + "auxiliary_loss_mlp": 0.00466689, + "balance_loss_clip": 1.27969992, + "balance_loss_mlp": 0.42646772, + "epoch": 0.33951600781602287, + "flos": 22086929865600.0, + "grad_norm": 7.433345002187825, + "language_loss": 0.85192478, + "learning_rate": 3.0764370900510727e-06, + "loss": 0.87235332, + "num_input_tokens_seen": 121209060, + "router_z_loss_clip": 2.9609375, + "router_z_loss_mlp": 0.40209961, + "step": 5647, + "time_per_iteration": 2.706400156021118 + }, + { + "auxiliary_loss_clip": 0.0159833, + "auxiliary_loss_mlp": 0.0043732, + "balance_loss_clip": 1.30498028, + "balance_loss_mlp": 0.3976706, + "epoch": 0.33957613106869083, + "flos": 23878262056320.0, + "grad_norm": 94.7845844910692, + "language_loss": 0.81488878, + "learning_rate": 3.0761088292689904e-06, + "loss": 0.83524525, + "num_input_tokens_seen": 121227480, + "router_z_loss_clip": 2.93164062, + "router_z_loss_mlp": 0.39624023, + "step": 5648, + "time_per_iteration": 2.689861536026001 + }, + { + "auxiliary_loss_clip": 0.01839869, + "auxiliary_loss_mlp": 0.00141911, + "balance_loss_clip": 1.64635777, + "balance_loss_mlp": 0.13075307, + "epoch": 0.3396362543213588, + "flos": 71242642414080.0, + "grad_norm": 0.7996376082361883, + "language_loss": 0.55675316, + "learning_rate": 3.075780527680754e-06, + "loss": 0.57657099, + "num_input_tokens_seen": 121291305, + "router_z_loss_clip": 1.9375, + "router_z_loss_mlp": 0.11181641, + "step": 5649, + "time_per_iteration": 3.1723403930664062 + }, + { + "auxiliary_loss_clip": 0.01581095, + "auxiliary_loss_mlp": 0.00433917, + "balance_loss_clip": 1.28648162, + "balance_loss_mlp": 0.39743865, + "epoch": 0.33969637757402676, + "flos": 25921615046400.0, + "grad_norm": 22.440707290410945, + "language_loss": 0.89513218, + "learning_rate": 3.0754521852988117e-06, + "loss": 0.91528237, + "num_input_tokens_seen": 121312740, + "router_z_loss_clip": 2.9453125, + "router_z_loss_mlp": 0.36499023, + "step": 5650, + "time_per_iteration": 2.725966453552246 + }, + { + "auxiliary_loss_clip": 0.01584408, + "auxiliary_loss_mlp": 0.00440437, + "balance_loss_clip": 1.29048729, + "balance_loss_mlp": 0.40257573, + "epoch": 0.33975650082669473, + "flos": 35261728663680.0, + "grad_norm": 16.713692650544395, + "language_loss": 0.75786531, + "learning_rate": 3.0751238021356152e-06, + "loss": 0.77811372, + "num_input_tokens_seen": 121334220, + "router_z_loss_clip": 2.9375, + "router_z_loss_mlp": 0.37866211, + "step": 5651, + "time_per_iteration": 2.798872470855713 + }, + { + "auxiliary_loss_clip": 0.0158869, + "auxiliary_loss_mlp": 0.0044851, + "balance_loss_clip": 1.29467571, + "balance_loss_mlp": 0.4106968, + "epoch": 0.3398166240793627, + "flos": 16647001879680.0, + "grad_norm": 5.373562700958699, + "language_loss": 0.86477512, + "learning_rate": 3.074795378203616e-06, + "loss": 0.88514709, + "num_input_tokens_seen": 121351870, + "router_z_loss_clip": 2.93945312, + "router_z_loss_mlp": 0.37792969, + "step": 5652, + "time_per_iteration": 2.6688833236694336 + }, + { + "auxiliary_loss_clip": 0.01582948, + "auxiliary_loss_mlp": 0.00450603, + "balance_loss_clip": 1.28889322, + "balance_loss_mlp": 0.41004759, + "epoch": 0.33987674733203066, + "flos": 24062196625920.0, + "grad_norm": 3.193395553694335, + "language_loss": 0.82840371, + "learning_rate": 3.0744669135152685e-06, + "loss": 0.84873915, + "num_input_tokens_seen": 121373400, + "router_z_loss_clip": 2.94140625, + "router_z_loss_mlp": 0.40576172, + "step": 5653, + "time_per_iteration": 2.725322961807251 + }, + { + "auxiliary_loss_clip": 0.01582495, + "auxiliary_loss_mlp": 0.00440518, + "balance_loss_clip": 1.29144013, + "balance_loss_mlp": 0.40258539, + "epoch": 0.3399368705846986, + "flos": 13250678279040.0, + "grad_norm": 9.318497629239587, + "language_loss": 0.94020462, + "learning_rate": 3.0741384080830278e-06, + "loss": 0.96043479, + "num_input_tokens_seen": 121385225, + "router_z_loss_clip": 2.91210938, + "router_z_loss_mlp": 0.37939453, + "step": 5654, + "time_per_iteration": 2.610999584197998 + }, + { + "auxiliary_loss_clip": 0.01583397, + "auxiliary_loss_mlp": 0.00443553, + "balance_loss_clip": 1.29710507, + "balance_loss_mlp": 0.40430921, + "epoch": 0.3399969938373666, + "flos": 27012832272000.0, + "grad_norm": 5.722588244033212, + "language_loss": 0.72302699, + "learning_rate": 3.073809861919351e-06, + "loss": 0.7432965, + "num_input_tokens_seen": 121404735, + "router_z_loss_clip": 2.86328125, + "router_z_loss_mlp": 0.39233398, + "step": 5655, + "time_per_iteration": 2.7244269847869873 + }, + { + "auxiliary_loss_clip": 0.01585103, + "auxiliary_loss_mlp": 0.00428154, + "balance_loss_clip": 1.29533494, + "balance_loss_mlp": 0.39076969, + "epoch": 0.34005711709003456, + "flos": 28550096588160.0, + "grad_norm": 24.31206323692729, + "language_loss": 0.81125832, + "learning_rate": 3.073481275036697e-06, + "loss": 0.83139086, + "num_input_tokens_seen": 121426780, + "router_z_loss_clip": 2.89648438, + "router_z_loss_mlp": 0.37402344, + "step": 5656, + "time_per_iteration": 2.739196538925171 + }, + { + "auxiliary_loss_clip": 0.01606974, + "auxiliary_loss_mlp": 0.00418602, + "balance_loss_clip": 1.30718315, + "balance_loss_mlp": 0.3815276, + "epoch": 0.3401172403427025, + "flos": 21617003208960.0, + "grad_norm": 6.4120001019126684, + "language_loss": 0.89067924, + "learning_rate": 3.073152647447525e-06, + "loss": 0.91093498, + "num_input_tokens_seen": 121447245, + "router_z_loss_clip": 2.99609375, + "router_z_loss_mlp": 0.37060547, + "step": 5657, + "time_per_iteration": 2.6502573490142822 + }, + { + "auxiliary_loss_clip": 0.01587622, + "auxiliary_loss_mlp": 0.00445154, + "balance_loss_clip": 1.30127001, + "balance_loss_mlp": 0.40750772, + "epoch": 0.3401773635953705, + "flos": 25885776251520.0, + "grad_norm": 2.7351105515731002, + "language_loss": 0.90856087, + "learning_rate": 3.0728239791642976e-06, + "loss": 0.92888862, + "num_input_tokens_seen": 121468165, + "router_z_loss_clip": 2.859375, + "router_z_loss_mlp": 0.37646484, + "step": 5658, + "time_per_iteration": 2.7455952167510986 + }, + { + "auxiliary_loss_clip": 0.01817873, + "auxiliary_loss_mlp": 0.00118, + "balance_loss_clip": 1.61939216, + "balance_loss_mlp": 0.1073669, + "epoch": 0.3402374868480385, + "flos": 65507995336320.0, + "grad_norm": 0.8167290856380668, + "language_loss": 0.5966351, + "learning_rate": 3.072495270199477e-06, + "loss": 0.61599386, + "num_input_tokens_seen": 121523795, + "router_z_loss_clip": 1.984375, + "router_z_loss_mlp": 0.10644531, + "step": 5659, + "time_per_iteration": 3.109438896179199 + }, + { + "auxiliary_loss_clip": 0.01591896, + "auxiliary_loss_mlp": 0.00426343, + "balance_loss_clip": 1.3081789, + "balance_loss_mlp": 0.39012721, + "epoch": 0.34029761010070647, + "flos": 24060580513920.0, + "grad_norm": 13.646799482092204, + "language_loss": 0.74164832, + "learning_rate": 3.0721665205655284e-06, + "loss": 0.76183069, + "num_input_tokens_seen": 121542950, + "router_z_loss_clip": 2.83398438, + "router_z_loss_mlp": 0.36230469, + "step": 5660, + "time_per_iteration": 2.632992744445801 + }, + { + "auxiliary_loss_clip": 0.01595012, + "auxiliary_loss_mlp": 0.00423427, + "balance_loss_clip": 1.30687273, + "balance_loss_mlp": 0.38609052, + "epoch": 0.34035773335337444, + "flos": 27599720590080.0, + "grad_norm": 2.376384653023225, + "language_loss": 0.73044056, + "learning_rate": 3.071837730274918e-06, + "loss": 0.75062495, + "num_input_tokens_seen": 121562765, + "router_z_loss_clip": 2.87890625, + "router_z_loss_mlp": 0.37304688, + "step": 5661, + "time_per_iteration": 2.748883008956909 + }, + { + "auxiliary_loss_clip": 0.01589037, + "auxiliary_loss_mlp": 0.00454272, + "balance_loss_clip": 1.30540848, + "balance_loss_mlp": 0.4161008, + "epoch": 0.3404178566060424, + "flos": 20812783651200.0, + "grad_norm": 4.390980860594311, + "language_loss": 0.85364997, + "learning_rate": 3.071508899340113e-06, + "loss": 0.87408304, + "num_input_tokens_seen": 121581610, + "router_z_loss_clip": 2.83789062, + "router_z_loss_mlp": 0.38183594, + "step": 5662, + "time_per_iteration": 2.85486102104187 + }, + { + "auxiliary_loss_clip": 0.01592386, + "auxiliary_loss_mlp": 0.00396281, + "balance_loss_clip": 1.30384839, + "balance_loss_mlp": 0.35968333, + "epoch": 0.34047797985871037, + "flos": 26833566470400.0, + "grad_norm": 5.305578717321817, + "language_loss": 0.79917634, + "learning_rate": 3.0711800277735833e-06, + "loss": 0.81906295, + "num_input_tokens_seen": 121601885, + "router_z_loss_clip": 2.8828125, + "router_z_loss_mlp": 0.36572266, + "step": 5663, + "time_per_iteration": 2.6761600971221924 + }, + { + "auxiliary_loss_clip": 0.01615654, + "auxiliary_loss_mlp": 0.00430076, + "balance_loss_clip": 1.32961893, + "balance_loss_mlp": 0.39555264, + "epoch": 0.34053810311137833, + "flos": 19682639061120.0, + "grad_norm": 7.814468648217868, + "language_loss": 0.91602862, + "learning_rate": 3.0708511155877997e-06, + "loss": 0.93648589, + "num_input_tokens_seen": 121621335, + "router_z_loss_clip": 2.86328125, + "router_z_loss_mlp": 0.34521484, + "step": 5664, + "time_per_iteration": 2.6260929107666016 + }, + { + "auxiliary_loss_clip": 0.01584264, + "auxiliary_loss_mlp": 0.00426024, + "balance_loss_clip": 1.29689229, + "balance_loss_mlp": 0.39169195, + "epoch": 0.3405982263640463, + "flos": 21725740656000.0, + "grad_norm": 15.068385101642036, + "language_loss": 0.74817109, + "learning_rate": 3.070522162795235e-06, + "loss": 0.76827395, + "num_input_tokens_seen": 121641310, + "router_z_loss_clip": 2.875, + "router_z_loss_mlp": 0.34326172, + "step": 5665, + "time_per_iteration": 2.614964723587036 + }, + { + "auxiliary_loss_clip": 0.01572904, + "auxiliary_loss_mlp": 0.00406549, + "balance_loss_clip": 1.28473353, + "balance_loss_mlp": 0.36782986, + "epoch": 0.34065834961671426, + "flos": 18041629288320.0, + "grad_norm": 6.576760242988177, + "language_loss": 0.80109334, + "learning_rate": 3.0701931694083626e-06, + "loss": 0.82088792, + "num_input_tokens_seen": 121659625, + "router_z_loss_clip": 2.88085938, + "router_z_loss_mlp": 0.38720703, + "step": 5666, + "time_per_iteration": 2.6227569580078125 + }, + { + "auxiliary_loss_clip": 0.01577809, + "auxiliary_loss_mlp": 0.00427108, + "balance_loss_clip": 1.28695941, + "balance_loss_mlp": 0.38900888, + "epoch": 0.3407184728693822, + "flos": 21397337585280.0, + "grad_norm": 5.769210336590465, + "language_loss": 0.78098953, + "learning_rate": 3.0698641354396576e-06, + "loss": 0.80103874, + "num_input_tokens_seen": 121679205, + "router_z_loss_clip": 2.90820312, + "router_z_loss_mlp": 0.38085938, + "step": 5667, + "time_per_iteration": 2.6269400119781494 + }, + { + "auxiliary_loss_clip": 0.01705422, + "auxiliary_loss_mlp": 0.00099449, + "balance_loss_clip": 1.5060159, + "balance_loss_mlp": 0.08900621, + "epoch": 0.3407785961220502, + "flos": 68688101018880.0, + "grad_norm": 0.838233226978156, + "language_loss": 0.63385999, + "learning_rate": 3.069535060901597e-06, + "loss": 0.65190876, + "num_input_tokens_seen": 121751085, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.10449219, + "step": 5668, + "time_per_iteration": 3.3097074031829834 + }, + { + "auxiliary_loss_clip": 0.01588457, + "auxiliary_loss_mlp": 0.00428777, + "balance_loss_clip": 1.29510403, + "balance_loss_mlp": 0.39187014, + "epoch": 0.34083871937471816, + "flos": 14064379027200.0, + "grad_norm": 12.122833531598696, + "language_loss": 0.80025601, + "learning_rate": 3.0692059458066596e-06, + "loss": 0.82042837, + "num_input_tokens_seen": 121768565, + "router_z_loss_clip": 2.93164062, + "router_z_loss_mlp": 0.36889648, + "step": 5669, + "time_per_iteration": 2.612865686416626 + }, + { + "auxiliary_loss_clip": 0.01564618, + "auxiliary_loss_mlp": 0.00408196, + "balance_loss_clip": 1.27129197, + "balance_loss_mlp": 0.3715511, + "epoch": 0.3408988426273861, + "flos": 17085435287040.0, + "grad_norm": 37.47167530738846, + "language_loss": 0.85501486, + "learning_rate": 3.0688767901673265e-06, + "loss": 0.87474298, + "num_input_tokens_seen": 121784925, + "router_z_loss_clip": 2.93359375, + "router_z_loss_mlp": 0.36645508, + "step": 5670, + "time_per_iteration": 2.6444623470306396 + }, + { + "auxiliary_loss_clip": 0.01577781, + "auxiliary_loss_mlp": 0.00462251, + "balance_loss_clip": 1.28130841, + "balance_loss_mlp": 0.42295954, + "epoch": 0.3409589658800541, + "flos": 24024562151040.0, + "grad_norm": 37.351444174063765, + "language_loss": 0.81965142, + "learning_rate": 3.068547593996078e-06, + "loss": 0.84005171, + "num_input_tokens_seen": 121804425, + "router_z_loss_clip": 2.96679688, + "router_z_loss_mlp": 0.39306641, + "step": 5671, + "time_per_iteration": 2.6785223484039307 + }, + { + "auxiliary_loss_clip": 0.01566692, + "auxiliary_loss_mlp": 0.00412247, + "balance_loss_clip": 1.27676153, + "balance_loss_mlp": 0.37231144, + "epoch": 0.34101908913272205, + "flos": 21142012734720.0, + "grad_norm": 50.70823660410165, + "language_loss": 0.79444927, + "learning_rate": 3.0682183573053974e-06, + "loss": 0.81423861, + "num_input_tokens_seen": 121825145, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.3996582, + "step": 5672, + "time_per_iteration": 2.7613494396209717 + }, + { + "auxiliary_loss_clip": 0.01563639, + "auxiliary_loss_mlp": 0.00382596, + "balance_loss_clip": 1.27102828, + "balance_loss_mlp": 0.34649983, + "epoch": 0.3410792123853901, + "flos": 15702012921600.0, + "grad_norm": 6.003312795106123, + "language_loss": 0.79127163, + "learning_rate": 3.06788908010777e-06, + "loss": 0.81073397, + "num_input_tokens_seen": 121842185, + "router_z_loss_clip": 2.92773438, + "router_z_loss_mlp": 0.36083984, + "step": 5673, + "time_per_iteration": 4.149132251739502 + }, + { + "auxiliary_loss_clip": 0.01565389, + "auxiliary_loss_mlp": 0.00399115, + "balance_loss_clip": 1.27599072, + "balance_loss_mlp": 0.36437771, + "epoch": 0.34113933563805804, + "flos": 23036012974080.0, + "grad_norm": 295.4353289904276, + "language_loss": 0.85384077, + "learning_rate": 3.067559762415682e-06, + "loss": 0.8734858, + "num_input_tokens_seen": 121862260, + "router_z_loss_clip": 2.89453125, + "router_z_loss_mlp": 0.34741211, + "step": 5674, + "time_per_iteration": 2.688446044921875 + }, + { + "auxiliary_loss_clip": 0.01628128, + "auxiliary_loss_mlp": 0.00167691, + "balance_loss_clip": 1.42623901, + "balance_loss_mlp": 0.15805879, + "epoch": 0.341199458890726, + "flos": 69614235336960.0, + "grad_norm": 0.8012025881704852, + "language_loss": 0.56134033, + "learning_rate": 3.0672304042416198e-06, + "loss": 0.57929862, + "num_input_tokens_seen": 121923560, + "router_z_loss_clip": 2.015625, + "router_z_loss_mlp": 0.09619141, + "step": 5675, + "time_per_iteration": 4.696690797805786 + }, + { + "auxiliary_loss_clip": 0.01574558, + "auxiliary_loss_mlp": 0.004101, + "balance_loss_clip": 1.28344929, + "balance_loss_mlp": 0.37545824, + "epoch": 0.34125958214339397, + "flos": 22346348866560.0, + "grad_norm": 215.67085276824042, + "language_loss": 0.84685135, + "learning_rate": 3.0669010055980734e-06, + "loss": 0.86669797, + "num_input_tokens_seen": 121943515, + "router_z_loss_clip": 2.91015625, + "router_z_loss_mlp": 0.34643555, + "step": 5676, + "time_per_iteration": 2.7509567737579346 + }, + { + "auxiliary_loss_clip": 0.01568796, + "auxiliary_loss_mlp": 0.00405564, + "balance_loss_clip": 1.27323258, + "balance_loss_mlp": 0.36844194, + "epoch": 0.34131970539606193, + "flos": 21871933009920.0, + "grad_norm": 145.46440675917248, + "language_loss": 0.90497494, + "learning_rate": 3.0665715664975357e-06, + "loss": 0.9247185, + "num_input_tokens_seen": 121962540, + "router_z_loss_clip": 2.953125, + "router_z_loss_mlp": 0.37133789, + "step": 5677, + "time_per_iteration": 2.6524882316589355 + }, + { + "auxiliary_loss_clip": 0.01563023, + "auxiliary_loss_mlp": 0.00425114, + "balance_loss_clip": 1.26987815, + "balance_loss_mlp": 0.38582283, + "epoch": 0.3413798286487299, + "flos": 24935723475840.0, + "grad_norm": 93.48290785521299, + "language_loss": 0.85356069, + "learning_rate": 3.0662420869524966e-06, + "loss": 0.87344205, + "num_input_tokens_seen": 121979830, + "router_z_loss_clip": 2.93359375, + "router_z_loss_mlp": 0.39331055, + "step": 5678, + "time_per_iteration": 2.686438798904419 + }, + { + "auxiliary_loss_clip": 0.01557729, + "auxiliary_loss_mlp": 0.00428829, + "balance_loss_clip": 1.26340222, + "balance_loss_mlp": 0.39211208, + "epoch": 0.34143995190139786, + "flos": 25374372364800.0, + "grad_norm": 20.115937004868083, + "language_loss": 0.80775338, + "learning_rate": 3.0659125669754506e-06, + "loss": 0.82761896, + "num_input_tokens_seen": 121999055, + "router_z_loss_clip": 2.94335938, + "router_z_loss_mlp": 0.3671875, + "step": 5679, + "time_per_iteration": 2.737489700317383 + }, + { + "auxiliary_loss_clip": 0.01583695, + "auxiliary_loss_mlp": 0.00193207, + "balance_loss_clip": 1.39548206, + "balance_loss_mlp": 0.18271689, + "epoch": 0.34150007515406583, + "flos": 67782578129280.0, + "grad_norm": 1.4644571484273643, + "language_loss": 0.59035873, + "learning_rate": 3.0655830065788923e-06, + "loss": 0.60812771, + "num_input_tokens_seen": 122067015, + "router_z_loss_clip": 1.8828125, + "router_z_loss_mlp": 0.10498047, + "step": 5680, + "time_per_iteration": 4.592510938644409 + }, + { + "auxiliary_loss_clip": 0.01551543, + "auxiliary_loss_mlp": 0.00406052, + "balance_loss_clip": 1.26681256, + "balance_loss_mlp": 0.36964551, + "epoch": 0.3415601984067338, + "flos": 20302421258880.0, + "grad_norm": 20.676773352818547, + "language_loss": 0.78444523, + "learning_rate": 3.0652534057753206e-06, + "loss": 0.80402118, + "num_input_tokens_seen": 122085295, + "router_z_loss_clip": 2.84960938, + "router_z_loss_mlp": 0.36401367, + "step": 5681, + "time_per_iteration": 2.6316099166870117 + }, + { + "auxiliary_loss_clip": 0.01547595, + "auxiliary_loss_mlp": 0.0043015, + "balance_loss_clip": 1.25914609, + "balance_loss_mlp": 0.39164537, + "epoch": 0.34162032165940176, + "flos": 26031178506240.0, + "grad_norm": 264.19066824515966, + "language_loss": 0.77694106, + "learning_rate": 3.064923764577233e-06, + "loss": 0.79671848, + "num_input_tokens_seen": 122104020, + "router_z_loss_clip": 2.8828125, + "router_z_loss_mlp": 0.38476562, + "step": 5682, + "time_per_iteration": 2.704002618789673 + }, + { + "auxiliary_loss_clip": 0.01554642, + "auxiliary_loss_mlp": 0.00454688, + "balance_loss_clip": 1.26387334, + "balance_loss_mlp": 0.41670835, + "epoch": 0.3416804449120697, + "flos": 28803338449920.0, + "grad_norm": 52.999884836344485, + "language_loss": 0.88795102, + "learning_rate": 3.0645940829971295e-06, + "loss": 0.9080444, + "num_input_tokens_seen": 122125080, + "router_z_loss_clip": 2.91015625, + "router_z_loss_mlp": 0.37963867, + "step": 5683, + "time_per_iteration": 2.7831549644470215 + }, + { + "auxiliary_loss_clip": 0.01574861, + "auxiliary_loss_mlp": 0.00435873, + "balance_loss_clip": 1.28095341, + "balance_loss_mlp": 0.39970517, + "epoch": 0.3417405681647377, + "flos": 22601601889920.0, + "grad_norm": 16.609237824179207, + "language_loss": 0.76859319, + "learning_rate": 3.0642643610475116e-06, + "loss": 0.78870058, + "num_input_tokens_seen": 122146350, + "router_z_loss_clip": 2.93945312, + "router_z_loss_mlp": 0.36181641, + "step": 5684, + "time_per_iteration": 2.6840338706970215 + }, + { + "auxiliary_loss_clip": 0.01551699, + "auxiliary_loss_mlp": 0.00439061, + "balance_loss_clip": 1.26758051, + "balance_loss_mlp": 0.40458584, + "epoch": 0.34180069141740566, + "flos": 24716237420160.0, + "grad_norm": 7.533671066186242, + "language_loss": 0.80096793, + "learning_rate": 3.0639345987408823e-06, + "loss": 0.82087553, + "num_input_tokens_seen": 122168085, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 0.3449707, + "step": 5685, + "time_per_iteration": 2.6756787300109863 + }, + { + "auxiliary_loss_clip": 0.01572446, + "auxiliary_loss_mlp": 0.00404363, + "balance_loss_clip": 1.28472972, + "balance_loss_mlp": 0.36964914, + "epoch": 0.3418608146700737, + "flos": 30518755246080.0, + "grad_norm": 5.544803339208702, + "language_loss": 0.76664484, + "learning_rate": 3.0636047960897468e-06, + "loss": 0.78641289, + "num_input_tokens_seen": 122191040, + "router_z_loss_clip": 2.87695312, + "router_z_loss_mlp": 0.34692383, + "step": 5686, + "time_per_iteration": 4.115495920181274 + }, + { + "auxiliary_loss_clip": 0.0157048, + "auxiliary_loss_mlp": 0.00431996, + "balance_loss_clip": 1.27871513, + "balance_loss_mlp": 0.39444527, + "epoch": 0.34192093792274164, + "flos": 15122343237120.0, + "grad_norm": 2.200688185207488, + "language_loss": 0.84181452, + "learning_rate": 3.06327495310661e-06, + "loss": 0.86183929, + "num_input_tokens_seen": 122209225, + "router_z_loss_clip": 2.91992188, + "router_z_loss_mlp": 0.37548828, + "step": 5687, + "time_per_iteration": 2.632516860961914 + }, + { + "auxiliary_loss_clip": 0.01566522, + "auxiliary_loss_mlp": 0.00444264, + "balance_loss_clip": 1.27815259, + "balance_loss_mlp": 0.40704656, + "epoch": 0.3419810611754096, + "flos": 13187799521280.0, + "grad_norm": 300.5222692491193, + "language_loss": 0.93339968, + "learning_rate": 3.062945069803981e-06, + "loss": 0.95350748, + "num_input_tokens_seen": 122226160, + "router_z_loss_clip": 2.87890625, + "router_z_loss_mlp": 0.37207031, + "step": 5688, + "time_per_iteration": 2.6575570106506348 + }, + { + "auxiliary_loss_clip": 0.01587754, + "auxiliary_loss_mlp": 0.00424028, + "balance_loss_clip": 1.28607833, + "balance_loss_mlp": 0.38528526, + "epoch": 0.34204118442807757, + "flos": 19536267139200.0, + "grad_norm": 10.832553455046009, + "language_loss": 0.85442686, + "learning_rate": 3.0626151461943684e-06, + "loss": 0.87454474, + "num_input_tokens_seen": 122243115, + "router_z_loss_clip": 3.015625, + "router_z_loss_mlp": 0.38745117, + "step": 5689, + "time_per_iteration": 2.6565048694610596 + }, + { + "auxiliary_loss_clip": 0.01578265, + "auxiliary_loss_mlp": 0.00429211, + "balance_loss_clip": 1.28456426, + "balance_loss_mlp": 0.38877556, + "epoch": 0.34210130768074554, + "flos": 15194846839680.0, + "grad_norm": 3.9989992956146456, + "language_loss": 0.79860556, + "learning_rate": 3.0622851822902834e-06, + "loss": 0.81868041, + "num_input_tokens_seen": 122261105, + "router_z_loss_clip": 2.93359375, + "router_z_loss_mlp": 0.40429688, + "step": 5690, + "time_per_iteration": 2.6812186241149902 + }, + { + "auxiliary_loss_clip": 0.01556899, + "auxiliary_loss_mlp": 0.00422181, + "balance_loss_clip": 1.26918542, + "balance_loss_mlp": 0.38722903, + "epoch": 0.3421614309334135, + "flos": 24936226266240.0, + "grad_norm": 6.699531921127357, + "language_loss": 0.81605667, + "learning_rate": 3.061955178104237e-06, + "loss": 0.8358475, + "num_input_tokens_seen": 122279995, + "router_z_loss_clip": 2.87695312, + "router_z_loss_mlp": 0.34960938, + "step": 5691, + "time_per_iteration": 2.7449257373809814 + }, + { + "auxiliary_loss_clip": 0.01567016, + "auxiliary_loss_mlp": 0.00416806, + "balance_loss_clip": 1.28132772, + "balance_loss_mlp": 0.3800897, + "epoch": 0.34222155418608147, + "flos": 21908633731200.0, + "grad_norm": 2.6334149039010524, + "language_loss": 0.76185554, + "learning_rate": 3.0616251336487447e-06, + "loss": 0.78169382, + "num_input_tokens_seen": 122299070, + "router_z_loss_clip": 2.85546875, + "router_z_loss_mlp": 0.3671875, + "step": 5692, + "time_per_iteration": 2.6840031147003174 + }, + { + "auxiliary_loss_clip": 0.01580508, + "auxiliary_loss_mlp": 0.00389414, + "balance_loss_clip": 1.28679729, + "balance_loss_mlp": 0.35212508, + "epoch": 0.34228167743874943, + "flos": 18114061063680.0, + "grad_norm": 16.69465923614269, + "language_loss": 0.805215, + "learning_rate": 3.06129504893632e-06, + "loss": 0.82491416, + "num_input_tokens_seen": 122316800, + "router_z_loss_clip": 2.93554688, + "router_z_loss_mlp": 0.37304688, + "step": 5693, + "time_per_iteration": 2.696392297744751 + }, + { + "auxiliary_loss_clip": 0.01570045, + "auxiliary_loss_mlp": 0.00401975, + "balance_loss_clip": 1.28324866, + "balance_loss_mlp": 0.36647415, + "epoch": 0.3423418006914174, + "flos": 21288600138240.0, + "grad_norm": 4.102767930373659, + "language_loss": 0.81528914, + "learning_rate": 3.0609649239794813e-06, + "loss": 0.83500934, + "num_input_tokens_seen": 122335275, + "router_z_loss_clip": 2.86132812, + "router_z_loss_mlp": 0.35498047, + "step": 5694, + "time_per_iteration": 2.80635142326355 + }, + { + "auxiliary_loss_clip": 0.01561946, + "auxiliary_loss_mlp": 0.00379832, + "balance_loss_clip": 1.2713114, + "balance_loss_mlp": 0.34533259, + "epoch": 0.34240192394408536, + "flos": 19823480288640.0, + "grad_norm": 359.6438447262561, + "language_loss": 0.87385631, + "learning_rate": 3.060634758790747e-06, + "loss": 0.89327407, + "num_input_tokens_seen": 122353215, + "router_z_loss_clip": 2.90625, + "router_z_loss_mlp": 0.3449707, + "step": 5695, + "time_per_iteration": 2.703368663787842 + }, + { + "auxiliary_loss_clip": 0.01591864, + "auxiliary_loss_mlp": 0.00375214, + "balance_loss_clip": 1.29602659, + "balance_loss_mlp": 0.3393802, + "epoch": 0.3424620471967533, + "flos": 24535535074560.0, + "grad_norm": 7.480668955777455, + "language_loss": 0.7887795, + "learning_rate": 3.060304553382635e-06, + "loss": 0.80845028, + "num_input_tokens_seen": 122372495, + "router_z_loss_clip": 2.95703125, + "router_z_loss_mlp": 0.35839844, + "step": 5696, + "time_per_iteration": 2.783504009246826 + }, + { + "auxiliary_loss_clip": 0.01581576, + "auxiliary_loss_mlp": 0.00383439, + "balance_loss_clip": 1.29212046, + "balance_loss_mlp": 0.34698457, + "epoch": 0.3425221704494213, + "flos": 25848895962240.0, + "grad_norm": 57.70019261654548, + "language_loss": 0.76703048, + "learning_rate": 3.0599743077676685e-06, + "loss": 0.78668058, + "num_input_tokens_seen": 122394600, + "router_z_loss_clip": 2.89453125, + "router_z_loss_mlp": 0.36450195, + "step": 5697, + "time_per_iteration": 2.7351551055908203 + }, + { + "auxiliary_loss_clip": 0.01587068, + "auxiliary_loss_mlp": 0.00375952, + "balance_loss_clip": 1.29546118, + "balance_loss_mlp": 0.34059423, + "epoch": 0.34258229370208926, + "flos": 21540513196800.0, + "grad_norm": 36.06780918230688, + "language_loss": 0.87722385, + "learning_rate": 3.05964402195837e-06, + "loss": 0.89685398, + "num_input_tokens_seen": 122414700, + "router_z_loss_clip": 2.9140625, + "router_z_loss_mlp": 0.35375977, + "step": 5698, + "time_per_iteration": 2.7074692249298096 + }, + { + "auxiliary_loss_clip": 0.01569598, + "auxiliary_loss_mlp": 0.0039138, + "balance_loss_clip": 1.27978015, + "balance_loss_mlp": 0.35447246, + "epoch": 0.3426424169547573, + "flos": 23652778429440.0, + "grad_norm": 40.51167638912526, + "language_loss": 0.7616595, + "learning_rate": 3.0593136959672645e-06, + "loss": 0.78126931, + "num_input_tokens_seen": 122432760, + "router_z_loss_clip": 2.89648438, + "router_z_loss_mlp": 0.36889648, + "step": 5699, + "time_per_iteration": 2.6597366333007812 + }, + { + "auxiliary_loss_clip": 0.0156737, + "auxiliary_loss_mlp": 0.00366281, + "balance_loss_clip": 1.2772671, + "balance_loss_mlp": 0.33237818, + "epoch": 0.34270254020742524, + "flos": 24644883052800.0, + "grad_norm": 5.8526132044947135, + "language_loss": 0.7803362, + "learning_rate": 3.058983329806877e-06, + "loss": 0.79967272, + "num_input_tokens_seen": 122449105, + "router_z_loss_clip": 2.90039062, + "router_z_loss_mlp": 0.33886719, + "step": 5700, + "time_per_iteration": 2.673095226287842 + }, + { + "auxiliary_loss_clip": 0.01574258, + "auxiliary_loss_mlp": 0.00362797, + "balance_loss_clip": 1.28505766, + "balance_loss_mlp": 0.32886988, + "epoch": 0.3427626634600932, + "flos": 20996754134400.0, + "grad_norm": 6.094389706459197, + "language_loss": 0.89286065, + "learning_rate": 3.0586529234897354e-06, + "loss": 0.91223121, + "num_input_tokens_seen": 122468700, + "router_z_loss_clip": 2.89257812, + "router_z_loss_mlp": 0.33911133, + "step": 5701, + "time_per_iteration": 2.636474370956421 + }, + { + "auxiliary_loss_clip": 0.01561999, + "auxiliary_loss_mlp": 0.00322589, + "balance_loss_clip": 1.27096295, + "balance_loss_mlp": 0.28994972, + "epoch": 0.3428227867127612, + "flos": 21433786911360.0, + "grad_norm": 35.8365731115393, + "language_loss": 0.78259301, + "learning_rate": 3.0583224770283694e-06, + "loss": 0.80143887, + "num_input_tokens_seen": 122488160, + "router_z_loss_clip": 2.91015625, + "router_z_loss_mlp": 0.32641602, + "step": 5702, + "time_per_iteration": 2.6632611751556396 + }, + { + "auxiliary_loss_clip": 0.01570299, + "auxiliary_loss_mlp": 0.00174527, + "balance_loss_clip": 1.34997857, + "balance_loss_mlp": 0.16146214, + "epoch": 0.34288290996542914, + "flos": 55731782695680.0, + "grad_norm": 0.861277875320857, + "language_loss": 0.56101012, + "learning_rate": 3.057991990435309e-06, + "loss": 0.57845831, + "num_input_tokens_seen": 122542890, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.13085938, + "step": 5703, + "time_per_iteration": 3.042109489440918 + }, + { + "auxiliary_loss_clip": 0.01574025, + "auxiliary_loss_mlp": 0.00409733, + "balance_loss_clip": 1.28392315, + "balance_loss_mlp": 0.36920193, + "epoch": 0.3429430332180971, + "flos": 20156803522560.0, + "grad_norm": 73.1080590958927, + "language_loss": 0.81009531, + "learning_rate": 3.057661463723086e-06, + "loss": 0.82993281, + "num_input_tokens_seen": 122561770, + "router_z_loss_clip": 2.90429688, + "router_z_loss_mlp": 0.40527344, + "step": 5704, + "time_per_iteration": 2.6462290287017822 + }, + { + "auxiliary_loss_clip": 0.01582379, + "auxiliary_loss_mlp": 0.00368482, + "balance_loss_clip": 1.29340792, + "balance_loss_mlp": 0.33472165, + "epoch": 0.34300315647076507, + "flos": 17965857548160.0, + "grad_norm": 34.712633822719454, + "language_loss": 0.76986551, + "learning_rate": 3.0573308969042346e-06, + "loss": 0.78937411, + "num_input_tokens_seen": 122580580, + "router_z_loss_clip": 2.88671875, + "router_z_loss_mlp": 0.33764648, + "step": 5705, + "time_per_iteration": 2.787940502166748 + }, + { + "auxiliary_loss_clip": 0.0153817, + "auxiliary_loss_mlp": 0.00336864, + "balance_loss_clip": 1.25333333, + "balance_loss_mlp": 0.30435535, + "epoch": 0.34306327972343303, + "flos": 22086822124800.0, + "grad_norm": 9.846079843187336, + "language_loss": 0.84611058, + "learning_rate": 3.057000289991289e-06, + "loss": 0.86486089, + "num_input_tokens_seen": 122599810, + "router_z_loss_clip": 2.84765625, + "router_z_loss_mlp": 0.32507324, + "step": 5706, + "time_per_iteration": 2.664628028869629 + }, + { + "auxiliary_loss_clip": 0.0159436, + "auxiliary_loss_mlp": 0.00343067, + "balance_loss_clip": 1.29539454, + "balance_loss_mlp": 0.30663651, + "epoch": 0.343123402976101, + "flos": 18442679616000.0, + "grad_norm": 3.75549088660496, + "language_loss": 0.90218323, + "learning_rate": 3.056669642996787e-06, + "loss": 0.92155755, + "num_input_tokens_seen": 122616035, + "router_z_loss_clip": 2.99414062, + "router_z_loss_mlp": 0.36425781, + "step": 5707, + "time_per_iteration": 2.7008376121520996 + }, + { + "auxiliary_loss_clip": 0.01554917, + "auxiliary_loss_mlp": 0.00359986, + "balance_loss_clip": 1.2714889, + "balance_loss_mlp": 0.32784748, + "epoch": 0.34318352622876896, + "flos": 17163685065600.0, + "grad_norm": 2.3075675913491445, + "language_loss": 0.81046379, + "learning_rate": 3.056338955933266e-06, + "loss": 0.82961285, + "num_input_tokens_seen": 122633785, + "router_z_loss_clip": 2.8359375, + "router_z_loss_mlp": 0.32128906, + "step": 5708, + "time_per_iteration": 2.834834575653076 + }, + { + "auxiliary_loss_clip": 0.01531711, + "auxiliary_loss_mlp": 0.00300609, + "balance_loss_clip": 1.25019491, + "balance_loss_mlp": 0.26846987, + "epoch": 0.34324364948143693, + "flos": 26688164215680.0, + "grad_norm": 53.288824445915346, + "language_loss": 0.86918616, + "learning_rate": 3.0560082288132662e-06, + "loss": 0.88750935, + "num_input_tokens_seen": 122652100, + "router_z_loss_clip": 2.81445312, + "router_z_loss_mlp": 0.32128906, + "step": 5709, + "time_per_iteration": 2.801910638809204 + }, + { + "auxiliary_loss_clip": 0.01543266, + "auxiliary_loss_mlp": 0.00348725, + "balance_loss_clip": 1.25203133, + "balance_loss_mlp": 0.31186581, + "epoch": 0.3433037727341049, + "flos": 21251576194560.0, + "grad_norm": 13.567898002049748, + "language_loss": 0.85988516, + "learning_rate": 3.055677461649329e-06, + "loss": 0.8788051, + "num_input_tokens_seen": 122669720, + "router_z_loss_clip": 2.9140625, + "router_z_loss_mlp": 0.36865234, + "step": 5710, + "time_per_iteration": 2.682270050048828 + }, + { + "auxiliary_loss_clip": 0.01534662, + "auxiliary_loss_mlp": 0.00321637, + "balance_loss_clip": 1.24711812, + "balance_loss_mlp": 0.28747198, + "epoch": 0.34336389598677286, + "flos": 20629423699200.0, + "grad_norm": 5.022444693645233, + "language_loss": 0.77214837, + "learning_rate": 3.055346654453996e-06, + "loss": 0.79071134, + "num_input_tokens_seen": 122688715, + "router_z_loss_clip": 2.875, + "router_z_loss_mlp": 0.34179688, + "step": 5711, + "time_per_iteration": 2.728151321411133 + }, + { + "auxiliary_loss_clip": 0.01517625, + "auxiliary_loss_mlp": 0.00358195, + "balance_loss_clip": 1.23619246, + "balance_loss_mlp": 0.32488748, + "epoch": 0.3434240192394409, + "flos": 14538579402240.0, + "grad_norm": 4.219602789569208, + "language_loss": 0.74137044, + "learning_rate": 3.055015807239812e-06, + "loss": 0.76012868, + "num_input_tokens_seen": 122706970, + "router_z_loss_clip": 2.81445312, + "router_z_loss_mlp": 0.33325195, + "step": 5712, + "time_per_iteration": 2.658104419708252 + }, + { + "auxiliary_loss_clip": 0.01564025, + "auxiliary_loss_mlp": 0.00162828, + "balance_loss_clip": 1.3556931, + "balance_loss_mlp": 0.15114501, + "epoch": 0.34348414249210885, + "flos": 58051538841600.0, + "grad_norm": 0.8518648594124081, + "language_loss": 0.57910168, + "learning_rate": 3.0546849200193226e-06, + "loss": 0.59637022, + "num_input_tokens_seen": 122758095, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.11669922, + "step": 5713, + "time_per_iteration": 3.12113356590271 + }, + { + "auxiliary_loss_clip": 0.01502989, + "auxiliary_loss_mlp": 0.00320144, + "balance_loss_clip": 1.21905029, + "balance_loss_mlp": 0.28674155, + "epoch": 0.3435442657447768, + "flos": 20704441253760.0, + "grad_norm": 20.23782584353987, + "language_loss": 0.87569547, + "learning_rate": 3.054353992805076e-06, + "loss": 0.8939268, + "num_input_tokens_seen": 122777815, + "router_z_loss_clip": 2.84179688, + "router_z_loss_mlp": 0.33398438, + "step": 5714, + "time_per_iteration": 2.692373037338257 + }, + { + "auxiliary_loss_clip": 0.0150676, + "auxiliary_loss_mlp": 0.00359094, + "balance_loss_clip": 1.22317731, + "balance_loss_mlp": 0.32361761, + "epoch": 0.3436043889974448, + "flos": 22930256355840.0, + "grad_norm": 66.72656747330252, + "language_loss": 0.77312183, + "learning_rate": 3.05402302560962e-06, + "loss": 0.79178035, + "num_input_tokens_seen": 122797555, + "router_z_loss_clip": 2.8359375, + "router_z_loss_mlp": 0.35498047, + "step": 5715, + "time_per_iteration": 2.669142723083496 + }, + { + "auxiliary_loss_clip": 0.01510789, + "auxiliary_loss_mlp": 0.00138312, + "balance_loss_clip": 1.30575252, + "balance_loss_mlp": 0.12629533, + "epoch": 0.34366451225011274, + "flos": 58403285752320.0, + "grad_norm": 0.9168685104943242, + "language_loss": 0.65305114, + "learning_rate": 3.053692018445505e-06, + "loss": 0.66954213, + "num_input_tokens_seen": 122863955, + "router_z_loss_clip": 2.046875, + "router_z_loss_mlp": 0.12011719, + "step": 5716, + "time_per_iteration": 4.6164093017578125 + }, + { + "auxiliary_loss_clip": 0.01497051, + "auxiliary_loss_mlp": 0.00336527, + "balance_loss_clip": 1.21951306, + "balance_loss_mlp": 0.30391109, + "epoch": 0.3437246355027807, + "flos": 15596292216960.0, + "grad_norm": 54.48080523912891, + "language_loss": 0.81735402, + "learning_rate": 3.0533609713252838e-06, + "loss": 0.83568978, + "num_input_tokens_seen": 122883000, + "router_z_loss_clip": 2.77539062, + "router_z_loss_mlp": 0.32592773, + "step": 5717, + "time_per_iteration": 4.09070086479187 + }, + { + "auxiliary_loss_clip": 0.0148687, + "auxiliary_loss_mlp": 0.00357891, + "balance_loss_clip": 1.20976281, + "balance_loss_mlp": 0.32498938, + "epoch": 0.34378475875544867, + "flos": 27672260106240.0, + "grad_norm": 2.784537190086487, + "language_loss": 0.82768649, + "learning_rate": 3.0530298842615077e-06, + "loss": 0.84613413, + "num_input_tokens_seen": 122903265, + "router_z_loss_clip": 2.77148438, + "router_z_loss_mlp": 0.32885742, + "step": 5718, + "time_per_iteration": 2.783261299133301 + }, + { + "auxiliary_loss_clip": 0.01502528, + "auxiliary_loss_mlp": 0.00347925, + "balance_loss_clip": 1.21751547, + "balance_loss_mlp": 0.31046954, + "epoch": 0.34384488200811664, + "flos": 31431496769280.0, + "grad_norm": 6.334651021516015, + "language_loss": 0.72502893, + "learning_rate": 3.052698757266734e-06, + "loss": 0.74353349, + "num_input_tokens_seen": 122923860, + "router_z_loss_clip": 2.84960938, + "router_z_loss_mlp": 0.37426758, + "step": 5719, + "time_per_iteration": 2.8150084018707275 + }, + { + "auxiliary_loss_clip": 0.0150933, + "auxiliary_loss_mlp": 0.00321305, + "balance_loss_clip": 1.2222389, + "balance_loss_mlp": 0.28809369, + "epoch": 0.3439050052607846, + "flos": 24899920594560.0, + "grad_norm": 2.5367988712795824, + "language_loss": 0.81472707, + "learning_rate": 3.0523675903535183e-06, + "loss": 0.83303344, + "num_input_tokens_seen": 122945305, + "router_z_loss_clip": 2.87109375, + "router_z_loss_mlp": 0.33203125, + "step": 5720, + "time_per_iteration": 2.760312080383301 + }, + { + "auxiliary_loss_clip": 0.01510465, + "auxiliary_loss_mlp": 0.00323401, + "balance_loss_clip": 1.22564292, + "balance_loss_mlp": 0.29126251, + "epoch": 0.34396512851345257, + "flos": 18150079426560.0, + "grad_norm": 7.483712972715063, + "language_loss": 0.80822957, + "learning_rate": 3.0520363835344173e-06, + "loss": 0.82656825, + "num_input_tokens_seen": 122962535, + "router_z_loss_clip": 2.84960938, + "router_z_loss_mlp": 0.32141113, + "step": 5721, + "time_per_iteration": 2.6796512603759766 + }, + { + "auxiliary_loss_clip": 0.01495017, + "auxiliary_loss_mlp": 0.00346112, + "balance_loss_clip": 1.20986307, + "balance_loss_mlp": 0.31220847, + "epoch": 0.34402525176612053, + "flos": 16034438315520.0, + "grad_norm": 85.07560403086474, + "language_loss": 0.86785245, + "learning_rate": 3.051705136821992e-06, + "loss": 0.88626379, + "num_input_tokens_seen": 122979750, + "router_z_loss_clip": 2.84960938, + "router_z_loss_mlp": 0.33898926, + "step": 5722, + "time_per_iteration": 4.0104169845581055 + }, + { + "auxiliary_loss_clip": 0.01495529, + "auxiliary_loss_mlp": 0.00335356, + "balance_loss_clip": 1.21648848, + "balance_loss_mlp": 0.30340827, + "epoch": 0.3440853750187885, + "flos": 21178641628800.0, + "grad_norm": 34.23610427145381, + "language_loss": 0.86571622, + "learning_rate": 3.051373850228801e-06, + "loss": 0.88402498, + "num_input_tokens_seen": 122998955, + "router_z_loss_clip": 2.79296875, + "router_z_loss_mlp": 0.3190918, + "step": 5723, + "time_per_iteration": 2.6350176334381104 + }, + { + "auxiliary_loss_clip": 0.01492861, + "auxiliary_loss_mlp": 0.00329978, + "balance_loss_clip": 1.20958507, + "balance_loss_mlp": 0.29693329, + "epoch": 0.34414549827145646, + "flos": 12677868092160.0, + "grad_norm": 5.144927802474237, + "language_loss": 0.9158352, + "learning_rate": 3.0510425237674096e-06, + "loss": 0.93406361, + "num_input_tokens_seen": 123016165, + "router_z_loss_clip": 2.83007812, + "router_z_loss_mlp": 0.33007812, + "step": 5724, + "time_per_iteration": 2.6324737071990967 + }, + { + "auxiliary_loss_clip": 0.0149856, + "auxiliary_loss_mlp": 0.0034294, + "balance_loss_clip": 1.21911883, + "balance_loss_mlp": 0.3091327, + "epoch": 0.3442056215241244, + "flos": 31284514316160.0, + "grad_norm": 47.45832382797063, + "language_loss": 0.75688672, + "learning_rate": 3.05071115745038e-06, + "loss": 0.77530175, + "num_input_tokens_seen": 123036900, + "router_z_loss_clip": 2.79492188, + "router_z_loss_mlp": 0.33789062, + "step": 5725, + "time_per_iteration": 2.725569248199463 + }, + { + "auxiliary_loss_clip": 0.01480928, + "auxiliary_loss_mlp": 0.00344616, + "balance_loss_clip": 1.20019186, + "balance_loss_mlp": 0.30978256, + "epoch": 0.34426574477679245, + "flos": 23367289132800.0, + "grad_norm": 31.800106306525294, + "language_loss": 0.7700808, + "learning_rate": 3.0503797512902773e-06, + "loss": 0.78833628, + "num_input_tokens_seen": 123057480, + "router_z_loss_clip": 2.80664062, + "router_z_loss_mlp": 0.34814453, + "step": 5726, + "time_per_iteration": 2.7133121490478516 + }, + { + "auxiliary_loss_clip": 0.01473965, + "auxiliary_loss_mlp": 0.00346251, + "balance_loss_clip": 1.19575739, + "balance_loss_mlp": 0.31337279, + "epoch": 0.3443258680294604, + "flos": 24535427333760.0, + "grad_norm": 14.77160130146188, + "language_loss": 0.80160868, + "learning_rate": 3.0500483052996703e-06, + "loss": 0.81981087, + "num_input_tokens_seen": 123076890, + "router_z_loss_clip": 2.78320312, + "router_z_loss_mlp": 0.32910156, + "step": 5727, + "time_per_iteration": 2.6744582653045654 + }, + { + "auxiliary_loss_clip": 0.01482508, + "auxiliary_loss_mlp": 0.00324609, + "balance_loss_clip": 1.20267367, + "balance_loss_mlp": 0.29384089, + "epoch": 0.3443859912821284, + "flos": 20230133137920.0, + "grad_norm": 4.111812841656273, + "language_loss": 0.95251906, + "learning_rate": 3.0497168194911257e-06, + "loss": 0.97059023, + "num_input_tokens_seen": 123092530, + "router_z_loss_clip": 2.80273438, + "router_z_loss_mlp": 0.30773926, + "step": 5728, + "time_per_iteration": 3.9685022830963135 + }, + { + "auxiliary_loss_clip": 0.01478046, + "auxiliary_loss_mlp": 0.00307246, + "balance_loss_clip": 1.19748163, + "balance_loss_mlp": 0.27587023, + "epoch": 0.34444611453479634, + "flos": 24316515895680.0, + "grad_norm": 82.65261771544166, + "language_loss": 0.77305734, + "learning_rate": 3.0493852938772143e-06, + "loss": 0.79091024, + "num_input_tokens_seen": 123110560, + "router_z_loss_clip": 2.80078125, + "router_z_loss_mlp": 0.3137207, + "step": 5729, + "time_per_iteration": 2.714808225631714 + }, + { + "auxiliary_loss_clip": 0.01474056, + "auxiliary_loss_mlp": 0.00341891, + "balance_loss_clip": 1.1994195, + "balance_loss_mlp": 0.30994296, + "epoch": 0.3445062377874643, + "flos": 16983413683200.0, + "grad_norm": 30.858303907379426, + "language_loss": 0.81893325, + "learning_rate": 3.0490537284705078e-06, + "loss": 0.83709276, + "num_input_tokens_seen": 123128655, + "router_z_loss_clip": 2.74804688, + "router_z_loss_mlp": 0.31958008, + "step": 5730, + "time_per_iteration": 2.640746831893921 + }, + { + "auxiliary_loss_clip": 0.01462637, + "auxiliary_loss_mlp": 0.00285989, + "balance_loss_clip": 1.18847108, + "balance_loss_mlp": 0.25721151, + "epoch": 0.3445663610401323, + "flos": 20302708567680.0, + "grad_norm": 4.172468225623624, + "language_loss": 0.8534981, + "learning_rate": 3.048722123283578e-06, + "loss": 0.87098438, + "num_input_tokens_seen": 123145130, + "router_z_loss_clip": 2.7421875, + "router_z_loss_mlp": 0.28759766, + "step": 5731, + "time_per_iteration": 2.6816492080688477 + }, + { + "auxiliary_loss_clip": 0.01483135, + "auxiliary_loss_mlp": 0.00305319, + "balance_loss_clip": 1.20170212, + "balance_loss_mlp": 0.27306104, + "epoch": 0.34462648429280024, + "flos": 15888102307200.0, + "grad_norm": 7.230164810219902, + "language_loss": 0.86092865, + "learning_rate": 3.0483904783290006e-06, + "loss": 0.87881321, + "num_input_tokens_seen": 123162265, + "router_z_loss_clip": 2.81640625, + "router_z_loss_mlp": 0.32250977, + "step": 5732, + "time_per_iteration": 2.8207433223724365 + }, + { + "auxiliary_loss_clip": 0.01412623, + "auxiliary_loss_mlp": 0.00044003, + "balance_loss_clip": 1.21572137, + "balance_loss_mlp": 0.03532484, + "epoch": 0.3446866075454682, + "flos": 59311035285120.0, + "grad_norm": 0.811757024503234, + "language_loss": 0.53482616, + "learning_rate": 3.0480587936193505e-06, + "loss": 0.54939246, + "num_input_tokens_seen": 123218620, + "router_z_loss_clip": 1.96875, + "router_z_loss_mlp": 0.08691406, + "step": 5733, + "time_per_iteration": 3.2104363441467285 + }, + { + "auxiliary_loss_clip": 0.0148772, + "auxiliary_loss_mlp": 0.00302331, + "balance_loss_clip": 1.20548391, + "balance_loss_mlp": 0.27152756, + "epoch": 0.34474673079813617, + "flos": 22343799000960.0, + "grad_norm": 1.9937726138617244, + "language_loss": 0.89376205, + "learning_rate": 3.047727069167207e-06, + "loss": 0.91166258, + "num_input_tokens_seen": 123237325, + "router_z_loss_clip": 2.82226562, + "router_z_loss_mlp": 0.30822754, + "step": 5734, + "time_per_iteration": 2.741135835647583 + }, + { + "auxiliary_loss_clip": 0.01448141, + "auxiliary_loss_mlp": 0.0029349, + "balance_loss_clip": 1.17453349, + "balance_loss_mlp": 0.26237667, + "epoch": 0.34480685405080413, + "flos": 27670141203840.0, + "grad_norm": 21.92828647146351, + "language_loss": 0.98964858, + "learning_rate": 3.0473953049851478e-06, + "loss": 1.00706494, + "num_input_tokens_seen": 123258650, + "router_z_loss_clip": 2.73632812, + "router_z_loss_mlp": 0.31079102, + "step": 5735, + "time_per_iteration": 2.7265405654907227 + }, + { + "auxiliary_loss_clip": 0.01484354, + "auxiliary_loss_mlp": 0.0030914, + "balance_loss_clip": 1.20451069, + "balance_loss_mlp": 0.27701306, + "epoch": 0.3448669773034721, + "flos": 22456020067200.0, + "grad_norm": 9.022410963146363, + "language_loss": 0.82859653, + "learning_rate": 3.0470635010857533e-06, + "loss": 0.84653145, + "num_input_tokens_seen": 123277155, + "router_z_loss_clip": 2.79882812, + "router_z_loss_mlp": 0.32128906, + "step": 5736, + "time_per_iteration": 2.7656447887420654 + }, + { + "auxiliary_loss_clip": 0.01470008, + "auxiliary_loss_mlp": 0.00312366, + "balance_loss_clip": 1.19352162, + "balance_loss_mlp": 0.28137201, + "epoch": 0.34492710055614006, + "flos": 24936190352640.0, + "grad_norm": 12.832306219283502, + "language_loss": 0.83581579, + "learning_rate": 3.0467316574816064e-06, + "loss": 0.85363948, + "num_input_tokens_seen": 123297640, + "router_z_loss_clip": 2.765625, + "router_z_loss_mlp": 0.31005859, + "step": 5737, + "time_per_iteration": 2.7277369499206543 + }, + { + "auxiliary_loss_clip": 0.01490773, + "auxiliary_loss_mlp": 0.0033163, + "balance_loss_clip": 1.20534468, + "balance_loss_mlp": 0.29756019, + "epoch": 0.34498722380880803, + "flos": 20120821073280.0, + "grad_norm": 7.592716308160046, + "language_loss": 0.78516495, + "learning_rate": 3.0463997741852893e-06, + "loss": 0.80338907, + "num_input_tokens_seen": 123314370, + "router_z_loss_clip": 2.85742188, + "router_z_loss_mlp": 0.34082031, + "step": 5738, + "time_per_iteration": 2.698808193206787 + }, + { + "auxiliary_loss_clip": 0.01476561, + "auxiliary_loss_mlp": 0.00287715, + "balance_loss_clip": 1.19699097, + "balance_loss_mlp": 0.25569549, + "epoch": 0.34504734706147605, + "flos": 28438126917120.0, + "grad_norm": 5.119614151673911, + "language_loss": 0.88862932, + "learning_rate": 3.046067851209389e-06, + "loss": 0.90627205, + "num_input_tokens_seen": 123336085, + "router_z_loss_clip": 2.79882812, + "router_z_loss_mlp": 0.3203125, + "step": 5739, + "time_per_iteration": 2.7606801986694336 + }, + { + "auxiliary_loss_clip": 0.01479735, + "auxiliary_loss_mlp": 0.00280434, + "balance_loss_clip": 1.19854069, + "balance_loss_mlp": 0.25103691, + "epoch": 0.345107470314144, + "flos": 22674464628480.0, + "grad_norm": 2.792326305748787, + "language_loss": 0.90410084, + "learning_rate": 3.0457358885664898e-06, + "loss": 0.9217025, + "num_input_tokens_seen": 123354460, + "router_z_loss_clip": 2.80859375, + "router_z_loss_mlp": 0.29394531, + "step": 5740, + "time_per_iteration": 2.6929519176483154 + }, + { + "auxiliary_loss_clip": 0.01468263, + "auxiliary_loss_mlp": 0.00292265, + "balance_loss_clip": 1.19530141, + "balance_loss_mlp": 0.26401246, + "epoch": 0.345167593566812, + "flos": 20630716588800.0, + "grad_norm": 7.216468605083782, + "language_loss": 0.84466958, + "learning_rate": 3.045403886269181e-06, + "loss": 0.86227483, + "num_input_tokens_seen": 123373420, + "router_z_loss_clip": 2.73046875, + "router_z_loss_mlp": 0.28271484, + "step": 5741, + "time_per_iteration": 2.714667797088623 + }, + { + "auxiliary_loss_clip": 0.01476076, + "auxiliary_loss_mlp": 0.00281152, + "balance_loss_clip": 1.19462144, + "balance_loss_mlp": 0.25057489, + "epoch": 0.34522771681947995, + "flos": 26214358890240.0, + "grad_norm": 20.616411120897467, + "language_loss": 0.83737683, + "learning_rate": 3.045071844330053e-06, + "loss": 0.85494912, + "num_input_tokens_seen": 123394730, + "router_z_loss_clip": 2.8125, + "router_z_loss_mlp": 0.3059082, + "step": 5742, + "time_per_iteration": 2.686887264251709 + }, + { + "auxiliary_loss_clip": 0.01444545, + "auxiliary_loss_mlp": 0.00282844, + "balance_loss_clip": 1.17438817, + "balance_loss_mlp": 0.25449604, + "epoch": 0.3452878400721479, + "flos": 19062354072960.0, + "grad_norm": 5.369101887745796, + "language_loss": 0.85610509, + "learning_rate": 3.0447397627616955e-06, + "loss": 0.87337899, + "num_input_tokens_seen": 123412895, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.28356934, + "step": 5743, + "time_per_iteration": 2.627516984939575 + }, + { + "auxiliary_loss_clip": 0.014633, + "auxiliary_loss_mlp": 0.00283147, + "balance_loss_clip": 1.18889976, + "balance_loss_mlp": 0.25317782, + "epoch": 0.3453479633248159, + "flos": 27929739772800.0, + "grad_norm": 8.282143787549998, + "language_loss": 0.76090682, + "learning_rate": 3.0444076415767016e-06, + "loss": 0.77837133, + "num_input_tokens_seen": 123432320, + "router_z_loss_clip": 2.7421875, + "router_z_loss_mlp": 0.29980469, + "step": 5744, + "time_per_iteration": 2.692866802215576 + }, + { + "auxiliary_loss_clip": 0.01453356, + "auxiliary_loss_mlp": 0.002612, + "balance_loss_clip": 1.18397784, + "balance_loss_mlp": 0.23258999, + "epoch": 0.34540808657748384, + "flos": 19606113135360.0, + "grad_norm": 6.034156227042831, + "language_loss": 0.87267113, + "learning_rate": 3.044075480787665e-06, + "loss": 0.88981676, + "num_input_tokens_seen": 123450980, + "router_z_loss_clip": 2.6953125, + "router_z_loss_mlp": 0.28601074, + "step": 5745, + "time_per_iteration": 2.623452663421631 + }, + { + "auxiliary_loss_clip": 0.01470004, + "auxiliary_loss_mlp": 0.00263793, + "balance_loss_clip": 1.19267261, + "balance_loss_mlp": 0.23544472, + "epoch": 0.3454682098301518, + "flos": 20411661496320.0, + "grad_norm": 88.7722177677256, + "language_loss": 0.96527982, + "learning_rate": 3.043743280407182e-06, + "loss": 0.9826178, + "num_input_tokens_seen": 123469365, + "router_z_loss_clip": 2.77734375, + "router_z_loss_mlp": 0.28369141, + "step": 5746, + "time_per_iteration": 2.7449681758880615 + }, + { + "auxiliary_loss_clip": 0.0147041, + "auxiliary_loss_mlp": 0.00283393, + "balance_loss_clip": 1.19039989, + "balance_loss_mlp": 0.25235093, + "epoch": 0.34552833308281977, + "flos": 21325121291520.0, + "grad_norm": 3.8004040649364703, + "language_loss": 0.74602044, + "learning_rate": 3.043411040447849e-06, + "loss": 0.76355839, + "num_input_tokens_seen": 123489425, + "router_z_loss_clip": 2.79882812, + "router_z_loss_mlp": 0.31054688, + "step": 5747, + "time_per_iteration": 2.754140853881836 + }, + { + "auxiliary_loss_clip": 0.01466816, + "auxiliary_loss_mlp": 0.00242507, + "balance_loss_clip": 1.19182158, + "balance_loss_mlp": 0.2146001, + "epoch": 0.34558845633548774, + "flos": 36243633824640.0, + "grad_norm": 9.135358857490374, + "language_loss": 0.78507245, + "learning_rate": 3.043078760922264e-06, + "loss": 0.80216563, + "num_input_tokens_seen": 123509970, + "router_z_loss_clip": 2.75, + "router_z_loss_mlp": 0.27893066, + "step": 5748, + "time_per_iteration": 2.8253400325775146 + }, + { + "auxiliary_loss_clip": 0.0145459, + "auxiliary_loss_mlp": 0.00305742, + "balance_loss_clip": 1.18009329, + "balance_loss_mlp": 0.27803731, + "epoch": 0.3456485795881557, + "flos": 22450561200000.0, + "grad_norm": 8.795812113704747, + "language_loss": 0.82074356, + "learning_rate": 3.042746441843029e-06, + "loss": 0.83834684, + "num_input_tokens_seen": 123531055, + "router_z_loss_clip": 2.7421875, + "router_z_loss_mlp": 0.27709961, + "step": 5749, + "time_per_iteration": 2.6852948665618896 + }, + { + "auxiliary_loss_clip": 0.01438559, + "auxiliary_loss_mlp": 0.00084476, + "balance_loss_clip": 1.25078773, + "balance_loss_mlp": 0.07384302, + "epoch": 0.34570870284082367, + "flos": 62004299005440.0, + "grad_norm": 1.2823536824308277, + "language_loss": 0.6277225, + "learning_rate": 3.0424140832227437e-06, + "loss": 0.6429528, + "num_input_tokens_seen": 123584720, + "router_z_loss_clip": 1.875, + "router_z_loss_mlp": 0.10644531, + "step": 5750, + "time_per_iteration": 3.0461132526397705 + }, + { + "auxiliary_loss_clip": 0.01456432, + "auxiliary_loss_mlp": 0.00248062, + "balance_loss_clip": 1.18508816, + "balance_loss_mlp": 0.21995258, + "epoch": 0.34576882609349163, + "flos": 22782196494720.0, + "grad_norm": 20.520041659815462, + "language_loss": 0.85950589, + "learning_rate": 3.042081685074012e-06, + "loss": 0.87655079, + "num_input_tokens_seen": 123604465, + "router_z_loss_clip": 2.71289062, + "router_z_loss_mlp": 0.28088379, + "step": 5751, + "time_per_iteration": 2.6738550662994385 + }, + { + "auxiliary_loss_clip": 0.01469247, + "auxiliary_loss_mlp": 0.0026591, + "balance_loss_clip": 1.19457614, + "balance_loss_mlp": 0.23715663, + "epoch": 0.34582894934615965, + "flos": 12348818576640.0, + "grad_norm": 5.6742831812274455, + "language_loss": 0.91813672, + "learning_rate": 3.041749247409439e-06, + "loss": 0.93548822, + "num_input_tokens_seen": 123622320, + "router_z_loss_clip": 2.75, + "router_z_loss_mlp": 0.2878418, + "step": 5752, + "time_per_iteration": 2.684209108352661 + }, + { + "auxiliary_loss_clip": 0.01414592, + "auxiliary_loss_mlp": 0.00086809, + "balance_loss_clip": 1.23035455, + "balance_loss_mlp": 0.07827319, + "epoch": 0.3458890725988276, + "flos": 70167691071360.0, + "grad_norm": 0.7311763753685301, + "language_loss": 0.62770212, + "learning_rate": 3.0414167702416296e-06, + "loss": 0.64271617, + "num_input_tokens_seen": 123678010, + "router_z_loss_clip": 1.84375, + "router_z_loss_mlp": 0.08544922, + "step": 5753, + "time_per_iteration": 3.0959017276763916 + }, + { + "auxiliary_loss_clip": 0.01479278, + "auxiliary_loss_mlp": 0.00290814, + "balance_loss_clip": 1.20185769, + "balance_loss_mlp": 0.25974816, + "epoch": 0.3459491958514956, + "flos": 17092582093440.0, + "grad_norm": 65.28241818629962, + "language_loss": 0.78571439, + "learning_rate": 3.0410842535831914e-06, + "loss": 0.8034153, + "num_input_tokens_seen": 123696830, + "router_z_loss_clip": 2.77148438, + "router_z_loss_mlp": 0.31091309, + "step": 5754, + "time_per_iteration": 2.7241389751434326 + }, + { + "auxiliary_loss_clip": 0.01466154, + "auxiliary_loss_mlp": 0.00265608, + "balance_loss_clip": 1.19003689, + "balance_loss_mlp": 0.23833291, + "epoch": 0.34600931910416355, + "flos": 16650952375680.0, + "grad_norm": 18.65530035461238, + "language_loss": 0.80181044, + "learning_rate": 3.0407516974467343e-06, + "loss": 0.8191281, + "num_input_tokens_seen": 123714360, + "router_z_loss_clip": 2.7578125, + "router_z_loss_mlp": 0.27319336, + "step": 5755, + "time_per_iteration": 2.6638662815093994 + }, + { + "auxiliary_loss_clip": 0.01466723, + "auxiliary_loss_mlp": 0.00262467, + "balance_loss_clip": 1.19378376, + "balance_loss_mlp": 0.23345178, + "epoch": 0.3460694423568315, + "flos": 38546190334080.0, + "grad_norm": 2.447677879106579, + "language_loss": 0.78710055, + "learning_rate": 3.040419101844869e-06, + "loss": 0.80439246, + "num_input_tokens_seen": 123739250, + "router_z_loss_clip": 2.72851562, + "router_z_loss_mlp": 0.29016113, + "step": 5756, + "time_per_iteration": 2.8843610286712646 + }, + { + "auxiliary_loss_clip": 0.01401326, + "auxiliary_loss_mlp": 0.00105339, + "balance_loss_clip": 1.21756053, + "balance_loss_mlp": 0.09704244, + "epoch": 0.3461295656094995, + "flos": 72081479704320.0, + "grad_norm": 0.8640733316014463, + "language_loss": 0.61767936, + "learning_rate": 3.040086466790207e-06, + "loss": 0.63274604, + "num_input_tokens_seen": 123802845, + "router_z_loss_clip": 1.8359375, + "router_z_loss_mlp": 0.08300781, + "step": 5757, + "time_per_iteration": 3.1898863315582275 + }, + { + "auxiliary_loss_clip": 0.01392471, + "auxiliary_loss_mlp": 0.00112283, + "balance_loss_clip": 1.21551335, + "balance_loss_mlp": 0.10327029, + "epoch": 0.34618968886216744, + "flos": 65460089571840.0, + "grad_norm": 0.8093034737188053, + "language_loss": 0.58715409, + "learning_rate": 3.039753792295362e-06, + "loss": 0.60220164, + "num_input_tokens_seen": 123861805, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.09033203, + "step": 5758, + "time_per_iteration": 4.599436044692993 + }, + { + "auxiliary_loss_clip": 0.01462018, + "auxiliary_loss_mlp": 0.00293925, + "balance_loss_clip": 1.19192624, + "balance_loss_mlp": 0.26648262, + "epoch": 0.3462498121148354, + "flos": 23472542960640.0, + "grad_norm": 113.26262265731994, + "language_loss": 0.77860415, + "learning_rate": 3.0394210783729487e-06, + "loss": 0.79616362, + "num_input_tokens_seen": 123881820, + "router_z_loss_clip": 2.69921875, + "router_z_loss_mlp": 0.2746582, + "step": 5759, + "time_per_iteration": 4.1634767055511475 + }, + { + "auxiliary_loss_clip": 0.01435428, + "auxiliary_loss_mlp": 0.00269524, + "balance_loss_clip": 1.16921639, + "balance_loss_mlp": 0.24022238, + "epoch": 0.3463099353675034, + "flos": 24170790418560.0, + "grad_norm": 4.953173030745065, + "language_loss": 0.88824356, + "learning_rate": 3.0390883250355836e-06, + "loss": 0.90529311, + "num_input_tokens_seen": 123903700, + "router_z_loss_clip": 2.66210938, + "router_z_loss_mlp": 0.29296875, + "step": 5760, + "time_per_iteration": 2.676464319229126 + }, + { + "auxiliary_loss_clip": 0.01396916, + "auxiliary_loss_mlp": 0.00073291, + "balance_loss_clip": 1.21560526, + "balance_loss_mlp": 0.0652801, + "epoch": 0.34637005862017134, + "flos": 63700609766400.0, + "grad_norm": 0.7821886416323237, + "language_loss": 0.56404281, + "learning_rate": 3.0387555322958865e-06, + "loss": 0.57874489, + "num_input_tokens_seen": 123960075, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 0.08007812, + "step": 5761, + "time_per_iteration": 3.1754019260406494 + }, + { + "auxiliary_loss_clip": 0.01443206, + "auxiliary_loss_mlp": 0.00239477, + "balance_loss_clip": 1.17942691, + "balance_loss_mlp": 0.21245226, + "epoch": 0.3464301818728393, + "flos": 13145532192000.0, + "grad_norm": 5.105558622624866, + "language_loss": 1.0246675, + "learning_rate": 3.038422700166474e-06, + "loss": 1.04149437, + "num_input_tokens_seen": 123975805, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.2701416, + "step": 5762, + "time_per_iteration": 2.633002281188965 + }, + { + "auxiliary_loss_clip": 0.01454811, + "auxiliary_loss_mlp": 0.00294878, + "balance_loss_clip": 1.1854496, + "balance_loss_mlp": 0.26295424, + "epoch": 0.34649030512550727, + "flos": 29315173299840.0, + "grad_norm": 27.388763208134897, + "language_loss": 0.75957525, + "learning_rate": 3.0380898286599692e-06, + "loss": 0.77707207, + "num_input_tokens_seen": 123997530, + "router_z_loss_clip": 2.6953125, + "router_z_loss_mlp": 0.31933594, + "step": 5763, + "time_per_iteration": 2.709540605545044 + }, + { + "auxiliary_loss_clip": 0.01447573, + "auxiliary_loss_mlp": 0.00294117, + "balance_loss_clip": 1.17755699, + "balance_loss_mlp": 0.26400489, + "epoch": 0.34655042837817523, + "flos": 23730884553600.0, + "grad_norm": 28.521733272912325, + "language_loss": 0.90312946, + "learning_rate": 3.0377569177889945e-06, + "loss": 0.92054629, + "num_input_tokens_seen": 124016375, + "router_z_loss_clip": 2.69921875, + "router_z_loss_mlp": 0.30126953, + "step": 5764, + "time_per_iteration": 4.0606207847595215 + }, + { + "auxiliary_loss_clip": 0.01446801, + "auxiliary_loss_mlp": 0.00280922, + "balance_loss_clip": 1.18428946, + "balance_loss_mlp": 0.25263375, + "epoch": 0.34661055163084326, + "flos": 22054215553920.0, + "grad_norm": 2.8025743421765825, + "language_loss": 0.75958824, + "learning_rate": 3.0374239675661722e-06, + "loss": 0.77686554, + "num_input_tokens_seen": 124033975, + "router_z_loss_clip": 2.62890625, + "router_z_loss_mlp": 0.28308105, + "step": 5765, + "time_per_iteration": 2.6045303344726562 + }, + { + "auxiliary_loss_clip": 0.01463132, + "auxiliary_loss_mlp": 0.00242365, + "balance_loss_clip": 1.19905698, + "balance_loss_mlp": 0.21505415, + "epoch": 0.3466706748835112, + "flos": 21799213925760.0, + "grad_norm": 10.632592324246923, + "language_loss": 0.83198357, + "learning_rate": 3.03709097800413e-06, + "loss": 0.84903854, + "num_input_tokens_seen": 124051930, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.27319336, + "step": 5766, + "time_per_iteration": 2.6247313022613525 + }, + { + "auxiliary_loss_clip": 0.01445257, + "auxiliary_loss_mlp": 0.00280441, + "balance_loss_clip": 1.181633, + "balance_loss_mlp": 0.25196177, + "epoch": 0.3467307981361792, + "flos": 19461680547840.0, + "grad_norm": 5.863973596172745, + "language_loss": 0.7841149, + "learning_rate": 3.0367579491154943e-06, + "loss": 0.80137187, + "num_input_tokens_seen": 124071220, + "router_z_loss_clip": 2.63476562, + "router_z_loss_mlp": 0.28479004, + "step": 5767, + "time_per_iteration": 2.6007909774780273 + }, + { + "auxiliary_loss_clip": 0.01435317, + "auxiliary_loss_mlp": 0.00272416, + "balance_loss_clip": 1.17419803, + "balance_loss_mlp": 0.24380605, + "epoch": 0.34679092138884715, + "flos": 24827452905600.0, + "grad_norm": 103.80233222253996, + "language_loss": 0.86324215, + "learning_rate": 3.036424880912893e-06, + "loss": 0.88031948, + "num_input_tokens_seen": 124090140, + "router_z_loss_clip": 2.609375, + "router_z_loss_mlp": 0.28637695, + "step": 5768, + "time_per_iteration": 2.695274591445923 + }, + { + "auxiliary_loss_clip": 0.01384951, + "auxiliary_loss_mlp": 0.00081343, + "balance_loss_clip": 1.21646476, + "balance_loss_mlp": 0.07247426, + "epoch": 0.3468510446415151, + "flos": 63236070149760.0, + "grad_norm": 0.8643138272967363, + "language_loss": 0.57181668, + "learning_rate": 3.036091773408956e-06, + "loss": 0.58647954, + "num_input_tokens_seen": 124152025, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.08886719, + "step": 5769, + "time_per_iteration": 3.17423939704895 + }, + { + "auxiliary_loss_clip": 0.01469068, + "auxiliary_loss_mlp": 0.00285971, + "balance_loss_clip": 1.19308281, + "balance_loss_mlp": 0.2543326, + "epoch": 0.3469111678941831, + "flos": 12120713256960.0, + "grad_norm": 25.311508584086187, + "language_loss": 0.95229203, + "learning_rate": 3.0357586266163154e-06, + "loss": 0.96984243, + "num_input_tokens_seen": 124165795, + "router_z_loss_clip": 2.7578125, + "router_z_loss_mlp": 0.31640625, + "step": 5770, + "time_per_iteration": 4.026670694351196 + }, + { + "auxiliary_loss_clip": 0.01380134, + "auxiliary_loss_mlp": 0.00079637, + "balance_loss_clip": 1.21078992, + "balance_loss_mlp": 0.07095837, + "epoch": 0.34697129114685105, + "flos": 65934110378880.0, + "grad_norm": 0.779600433788468, + "language_loss": 0.59489465, + "learning_rate": 3.0354254405476036e-06, + "loss": 0.60949236, + "num_input_tokens_seen": 124222925, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.08691406, + "step": 5771, + "time_per_iteration": 2.9256889820098877 + }, + { + "auxiliary_loss_clip": 0.01467532, + "auxiliary_loss_mlp": 0.00301813, + "balance_loss_clip": 1.19912231, + "balance_loss_mlp": 0.27204615, + "epoch": 0.347031414399519, + "flos": 34454205054720.0, + "grad_norm": 21.870951673600278, + "language_loss": 0.78543895, + "learning_rate": 3.0350922152154557e-06, + "loss": 0.80313247, + "num_input_tokens_seen": 124240915, + "router_z_loss_clip": 2.68554688, + "router_z_loss_mlp": 0.29748535, + "step": 5772, + "time_per_iteration": 2.7882466316223145 + }, + { + "auxiliary_loss_clip": 0.01450608, + "auxiliary_loss_mlp": 0.00330017, + "balance_loss_clip": 1.18280315, + "balance_loss_mlp": 0.29642382, + "epoch": 0.347091537652187, + "flos": 26944135511040.0, + "grad_norm": 15.899152559726218, + "language_loss": 0.82194114, + "learning_rate": 3.034758950632507e-06, + "loss": 0.83974737, + "num_input_tokens_seen": 124262770, + "router_z_loss_clip": 2.67773438, + "router_z_loss_mlp": 0.33569336, + "step": 5773, + "time_per_iteration": 2.71340012550354 + }, + { + "auxiliary_loss_clip": 0.01455008, + "auxiliary_loss_mlp": 0.00309165, + "balance_loss_clip": 1.18349767, + "balance_loss_mlp": 0.27721652, + "epoch": 0.34715166090485494, + "flos": 21142228216320.0, + "grad_norm": 14.773437514371732, + "language_loss": 0.7896964, + "learning_rate": 3.034425646811396e-06, + "loss": 0.80733812, + "num_input_tokens_seen": 124280950, + "router_z_loss_clip": 2.71289062, + "router_z_loss_mlp": 0.31970215, + "step": 5774, + "time_per_iteration": 2.6807193756103516 + }, + { + "auxiliary_loss_clip": 0.01457083, + "auxiliary_loss_mlp": 0.00323087, + "balance_loss_clip": 1.19344509, + "balance_loss_mlp": 0.29161549, + "epoch": 0.3472117841575229, + "flos": 23478001827840.0, + "grad_norm": 88.39551330393498, + "language_loss": 0.8151989, + "learning_rate": 3.0340923037647602e-06, + "loss": 0.83300054, + "num_input_tokens_seen": 124299540, + "router_z_loss_clip": 2.63671875, + "router_z_loss_mlp": 0.31494141, + "step": 5775, + "time_per_iteration": 2.7095096111297607 + }, + { + "auxiliary_loss_clip": 0.01479329, + "auxiliary_loss_mlp": 0.00332796, + "balance_loss_clip": 1.20361257, + "balance_loss_mlp": 0.30070442, + "epoch": 0.34727190741019087, + "flos": 17492806408320.0, + "grad_norm": 3.528692534846297, + "language_loss": 0.89002311, + "learning_rate": 3.0337589215052404e-06, + "loss": 0.90814435, + "num_input_tokens_seen": 124316285, + "router_z_loss_clip": 2.7578125, + "router_z_loss_mlp": 0.32080078, + "step": 5776, + "time_per_iteration": 2.645292282104492 + }, + { + "auxiliary_loss_clip": 0.01404982, + "auxiliary_loss_mlp": 0.00095069, + "balance_loss_clip": 1.24075174, + "balance_loss_mlp": 0.08462617, + "epoch": 0.34733203066285884, + "flos": 65265491640960.0, + "grad_norm": 0.8547911464793351, + "language_loss": 0.63281393, + "learning_rate": 3.033425500045478e-06, + "loss": 0.64781445, + "num_input_tokens_seen": 124376650, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.10449219, + "step": 5777, + "time_per_iteration": 3.1997790336608887 + }, + { + "auxiliary_loss_clip": 0.01456148, + "auxiliary_loss_mlp": 0.00302339, + "balance_loss_clip": 1.19011045, + "balance_loss_mlp": 0.27082044, + "epoch": 0.3473921539155268, + "flos": 28658726294400.0, + "grad_norm": 9.752263434950216, + "language_loss": 0.72273779, + "learning_rate": 3.033092039398119e-06, + "loss": 0.74032265, + "num_input_tokens_seen": 124396475, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.31518555, + "step": 5778, + "time_per_iteration": 2.7698099613189697 + }, + { + "auxiliary_loss_clip": 0.01493507, + "auxiliary_loss_mlp": 0.00290564, + "balance_loss_clip": 1.21767998, + "balance_loss_mlp": 0.2593548, + "epoch": 0.3474522771681948, + "flos": 40836895355520.0, + "grad_norm": 11.001242041406563, + "language_loss": 0.80609274, + "learning_rate": 3.0327585395758046e-06, + "loss": 0.82393348, + "num_input_tokens_seen": 124416480, + "router_z_loss_clip": 2.76171875, + "router_z_loss_mlp": 0.31225586, + "step": 5779, + "time_per_iteration": 2.839808464050293 + }, + { + "auxiliary_loss_clip": 0.01499976, + "auxiliary_loss_mlp": 0.00318879, + "balance_loss_clip": 1.21871924, + "balance_loss_mlp": 0.28728819, + "epoch": 0.3475124004208628, + "flos": 24608577381120.0, + "grad_norm": 30.78492690117864, + "language_loss": 0.71843857, + "learning_rate": 3.0324250005911837e-06, + "loss": 0.7366271, + "num_input_tokens_seen": 124435950, + "router_z_loss_clip": 2.8125, + "router_z_loss_mlp": 0.31591797, + "step": 5780, + "time_per_iteration": 2.762274980545044 + }, + { + "auxiliary_loss_clip": 0.01481142, + "auxiliary_loss_mlp": 0.00310512, + "balance_loss_clip": 1.20963943, + "balance_loss_mlp": 0.27971995, + "epoch": 0.34757252367353075, + "flos": 22711309004160.0, + "grad_norm": 10.041901054169669, + "language_loss": 0.78291565, + "learning_rate": 3.0320914224569033e-06, + "loss": 0.80083215, + "num_input_tokens_seen": 124455410, + "router_z_loss_clip": 2.7109375, + "router_z_loss_mlp": 0.30773926, + "step": 5781, + "time_per_iteration": 2.6990506649017334 + }, + { + "auxiliary_loss_clip": 0.01481783, + "auxiliary_loss_mlp": 0.00329622, + "balance_loss_clip": 1.20937657, + "balance_loss_mlp": 0.29714966, + "epoch": 0.3476326469261987, + "flos": 19828184970240.0, + "grad_norm": 179.57107119191863, + "language_loss": 0.87637818, + "learning_rate": 3.031757805185612e-06, + "loss": 0.89449215, + "num_input_tokens_seen": 124474870, + "router_z_loss_clip": 2.72460938, + "router_z_loss_mlp": 0.32470703, + "step": 5782, + "time_per_iteration": 2.739673376083374 + }, + { + "auxiliary_loss_clip": 0.01500035, + "auxiliary_loss_mlp": 0.00371364, + "balance_loss_clip": 1.22606385, + "balance_loss_mlp": 0.33755642, + "epoch": 0.3476927701788667, + "flos": 19938107566080.0, + "grad_norm": 183.9967570929765, + "language_loss": 0.71102989, + "learning_rate": 3.0314241487899622e-06, + "loss": 0.7297439, + "num_input_tokens_seen": 124494105, + "router_z_loss_clip": 2.73632812, + "router_z_loss_mlp": 0.33789062, + "step": 5783, + "time_per_iteration": 2.8160147666931152 + }, + { + "auxiliary_loss_clip": 0.01481514, + "auxiliary_loss_mlp": 0.00374101, + "balance_loss_clip": 1.21817207, + "balance_loss_mlp": 0.33955395, + "epoch": 0.34775289343153465, + "flos": 20735108490240.0, + "grad_norm": 669.4162450256633, + "language_loss": 0.92523789, + "learning_rate": 3.031090453282605e-06, + "loss": 0.94379401, + "num_input_tokens_seen": 124512030, + "router_z_loss_clip": 2.63476562, + "router_z_loss_mlp": 0.34570312, + "step": 5784, + "time_per_iteration": 2.6665399074554443 + }, + { + "auxiliary_loss_clip": 0.01489836, + "auxiliary_loss_mlp": 0.00336213, + "balance_loss_clip": 1.22203481, + "balance_loss_mlp": 0.30233353, + "epoch": 0.3478130166842026, + "flos": 19354846521600.0, + "grad_norm": 50.90848396056713, + "language_loss": 0.87908626, + "learning_rate": 3.0307567186761946e-06, + "loss": 0.89734674, + "num_input_tokens_seen": 124530980, + "router_z_loss_clip": 2.67578125, + "router_z_loss_mlp": 0.33862305, + "step": 5785, + "time_per_iteration": 2.6187832355499268 + }, + { + "auxiliary_loss_clip": 0.0151043, + "auxiliary_loss_mlp": 0.00316126, + "balance_loss_clip": 1.237988, + "balance_loss_mlp": 0.28653827, + "epoch": 0.3478731399368706, + "flos": 22051198811520.0, + "grad_norm": 4.765072393733737, + "language_loss": 0.85768551, + "learning_rate": 3.0304229449833862e-06, + "loss": 0.87595105, + "num_input_tokens_seen": 124549330, + "router_z_loss_clip": 2.72460938, + "router_z_loss_mlp": 0.29589844, + "step": 5786, + "time_per_iteration": 2.672747850418091 + }, + { + "auxiliary_loss_clip": 0.0150415, + "auxiliary_loss_mlp": 0.00287256, + "balance_loss_clip": 1.23568368, + "balance_loss_mlp": 0.25645196, + "epoch": 0.34793326318953854, + "flos": 18041449720320.0, + "grad_norm": 3.3337188605499346, + "language_loss": 0.80718493, + "learning_rate": 3.030089132216836e-06, + "loss": 0.82509899, + "num_input_tokens_seen": 124567200, + "router_z_loss_clip": 2.68164062, + "router_z_loss_mlp": 0.30810547, + "step": 5787, + "time_per_iteration": 2.651751756668091 + }, + { + "auxiliary_loss_clip": 0.01522117, + "auxiliary_loss_mlp": 0.00338003, + "balance_loss_clip": 1.24406695, + "balance_loss_mlp": 0.30545866, + "epoch": 0.3479933864422065, + "flos": 29314670509440.0, + "grad_norm": 3.2158077699744694, + "language_loss": 0.87289566, + "learning_rate": 3.029755280389203e-06, + "loss": 0.8914969, + "num_input_tokens_seen": 124587025, + "router_z_loss_clip": 2.78125, + "router_z_loss_mlp": 0.32519531, + "step": 5788, + "time_per_iteration": 2.703883647918701 + }, + { + "auxiliary_loss_clip": 0.01559011, + "auxiliary_loss_mlp": 0.00321905, + "balance_loss_clip": 1.27043462, + "balance_loss_mlp": 0.28645253, + "epoch": 0.3480535096948745, + "flos": 20120713332480.0, + "grad_norm": 6.589139409848463, + "language_loss": 0.93800181, + "learning_rate": 3.029421389513147e-06, + "loss": 0.95681095, + "num_input_tokens_seen": 124605860, + "router_z_loss_clip": 2.88867188, + "router_z_loss_mlp": 0.35424805, + "step": 5789, + "time_per_iteration": 2.6454269886016846 + }, + { + "auxiliary_loss_clip": 0.01532953, + "auxiliary_loss_mlp": 0.0034973, + "balance_loss_clip": 1.2520138, + "balance_loss_mlp": 0.31659013, + "epoch": 0.34811363294754244, + "flos": 18548974938240.0, + "grad_norm": 3.6190364738353202, + "language_loss": 0.90084207, + "learning_rate": 3.029087459601328e-06, + "loss": 0.91966891, + "num_input_tokens_seen": 124624270, + "router_z_loss_clip": 2.80859375, + "router_z_loss_mlp": 0.33129883, + "step": 5790, + "time_per_iteration": 2.7314038276672363 + }, + { + "auxiliary_loss_clip": 0.01535951, + "auxiliary_loss_mlp": 0.00343922, + "balance_loss_clip": 1.25908113, + "balance_loss_mlp": 0.30977997, + "epoch": 0.3481737562002104, + "flos": 26870303105280.0, + "grad_norm": 3.9284966007552398, + "language_loss": 0.9047823, + "learning_rate": 3.0287534906664097e-06, + "loss": 0.923581, + "num_input_tokens_seen": 124644005, + "router_z_loss_clip": 2.76953125, + "router_z_loss_mlp": 0.34130859, + "step": 5791, + "time_per_iteration": 2.6898274421691895 + }, + { + "auxiliary_loss_clip": 0.01564545, + "auxiliary_loss_mlp": 0.00324988, + "balance_loss_clip": 1.27674079, + "balance_loss_mlp": 0.2902976, + "epoch": 0.3482338794528784, + "flos": 28908664104960.0, + "grad_norm": 32.78251315853265, + "language_loss": 0.83618534, + "learning_rate": 3.028419482721056e-06, + "loss": 0.8550806, + "num_input_tokens_seen": 124663020, + "router_z_loss_clip": 2.8828125, + "router_z_loss_mlp": 0.34716797, + "step": 5792, + "time_per_iteration": 2.7686691284179688 + }, + { + "auxiliary_loss_clip": 0.01543542, + "auxiliary_loss_mlp": 0.00290432, + "balance_loss_clip": 1.26131034, + "balance_loss_mlp": 0.25748298, + "epoch": 0.3482940027055464, + "flos": 22200767043840.0, + "grad_norm": 18.141416934945124, + "language_loss": 0.87461501, + "learning_rate": 3.0280854357779325e-06, + "loss": 0.89295477, + "num_input_tokens_seen": 124682975, + "router_z_loss_clip": 2.82226562, + "router_z_loss_mlp": 0.32958984, + "step": 5793, + "time_per_iteration": 2.681962251663208 + }, + { + "auxiliary_loss_clip": 0.01579516, + "auxiliary_loss_mlp": 0.0034467, + "balance_loss_clip": 1.28601408, + "balance_loss_mlp": 0.30807275, + "epoch": 0.34835412595821436, + "flos": 20302708567680.0, + "grad_norm": 4.496433465467231, + "language_loss": 0.84319329, + "learning_rate": 3.027751349849706e-06, + "loss": 0.86243522, + "num_input_tokens_seen": 124701340, + "router_z_loss_clip": 2.93359375, + "router_z_loss_mlp": 0.3659668, + "step": 5794, + "time_per_iteration": 2.739572525024414 + }, + { + "auxiliary_loss_clip": 0.01554853, + "auxiliary_loss_mlp": 0.00296552, + "balance_loss_clip": 1.27188873, + "balance_loss_mlp": 0.26295891, + "epoch": 0.3484142492108823, + "flos": 20449691020800.0, + "grad_norm": 18.55495528512756, + "language_loss": 0.63496387, + "learning_rate": 3.0274172249490456e-06, + "loss": 0.65347803, + "num_input_tokens_seen": 124719165, + "router_z_loss_clip": 2.828125, + "router_z_loss_mlp": 0.33581543, + "step": 5795, + "time_per_iteration": 2.7755885124206543 + }, + { + "auxiliary_loss_clip": 0.01547898, + "auxiliary_loss_mlp": 0.0028252, + "balance_loss_clip": 1.26247358, + "balance_loss_mlp": 0.24985689, + "epoch": 0.3484743724635503, + "flos": 24352929308160.0, + "grad_norm": 18.757703204072016, + "language_loss": 0.88422692, + "learning_rate": 3.0270830610886213e-06, + "loss": 0.90253115, + "num_input_tokens_seen": 124738670, + "router_z_loss_clip": 2.85546875, + "router_z_loss_mlp": 0.32641602, + "step": 5796, + "time_per_iteration": 2.7252728939056396 + }, + { + "auxiliary_loss_clip": 0.01545913, + "auxiliary_loss_mlp": 0.00238657, + "balance_loss_clip": 1.26746607, + "balance_loss_mlp": 0.20883131, + "epoch": 0.34853449571621825, + "flos": 24353001135360.0, + "grad_norm": 10.824327737783955, + "language_loss": 0.90600526, + "learning_rate": 3.0267488582811033e-06, + "loss": 0.92385095, + "num_input_tokens_seen": 124758760, + "router_z_loss_clip": 2.78515625, + "router_z_loss_mlp": 0.29821777, + "step": 5797, + "time_per_iteration": 2.7263407707214355 + }, + { + "auxiliary_loss_clip": 0.01571532, + "auxiliary_loss_mlp": 0.00258269, + "balance_loss_clip": 1.28195977, + "balance_loss_mlp": 0.2246995, + "epoch": 0.3485946189688862, + "flos": 27267690245760.0, + "grad_norm": 66.07733483051561, + "language_loss": 0.78040153, + "learning_rate": 3.026414616539167e-06, + "loss": 0.7986995, + "num_input_tokens_seen": 124777765, + "router_z_loss_clip": 2.8984375, + "router_z_loss_mlp": 0.33569336, + "step": 5798, + "time_per_iteration": 2.8640477657318115 + }, + { + "auxiliary_loss_clip": 0.01570104, + "auxiliary_loss_mlp": 0.00265474, + "balance_loss_clip": 1.27898872, + "balance_loss_mlp": 0.23259631, + "epoch": 0.3486547422215542, + "flos": 20156695781760.0, + "grad_norm": 120.60373065257113, + "language_loss": 0.84767115, + "learning_rate": 3.026080335875485e-06, + "loss": 0.866027, + "num_input_tokens_seen": 124796775, + "router_z_loss_clip": 2.91015625, + "router_z_loss_mlp": 0.32885742, + "step": 5799, + "time_per_iteration": 2.6920862197875977 + }, + { + "auxiliary_loss_clip": 0.01555081, + "auxiliary_loss_mlp": 0.00242522, + "balance_loss_clip": 1.26748073, + "balance_loss_mlp": 0.21326782, + "epoch": 0.34871486547422215, + "flos": 20230348619520.0, + "grad_norm": 977.9232727681581, + "language_loss": 0.84634674, + "learning_rate": 3.025746016302734e-06, + "loss": 0.86432278, + "num_input_tokens_seen": 124815825, + "router_z_loss_clip": 2.87890625, + "router_z_loss_mlp": 0.29248047, + "step": 5800, + "time_per_iteration": 2.6642909049987793 + }, + { + "auxiliary_loss_clip": 0.01582317, + "auxiliary_loss_mlp": 0.00275409, + "balance_loss_clip": 1.28611696, + "balance_loss_mlp": 0.24036191, + "epoch": 0.3487749887268901, + "flos": 44053234882560.0, + "grad_norm": 41.17505015642335, + "language_loss": 0.7422772, + "learning_rate": 3.025411657833591e-06, + "loss": 0.76085448, + "num_input_tokens_seen": 124838420, + "router_z_loss_clip": 2.96484375, + "router_z_loss_mlp": 0.35058594, + "step": 5801, + "time_per_iteration": 5.75596284866333 + }, + { + "auxiliary_loss_clip": 0.01593735, + "auxiliary_loss_mlp": 0.00256901, + "balance_loss_clip": 1.29995561, + "balance_loss_mlp": 0.22247359, + "epoch": 0.3488351119795581, + "flos": 23295144666240.0, + "grad_norm": 5.259347864654754, + "language_loss": 0.83467793, + "learning_rate": 3.025077260480735e-06, + "loss": 0.85318434, + "num_input_tokens_seen": 124857320, + "router_z_loss_clip": 2.9375, + "router_z_loss_mlp": 0.34423828, + "step": 5802, + "time_per_iteration": 2.6848833560943604 + }, + { + "auxiliary_loss_clip": 0.01573586, + "auxiliary_loss_mlp": 0.00226288, + "balance_loss_clip": 1.28922021, + "balance_loss_mlp": 0.19717684, + "epoch": 0.34889523523222604, + "flos": 19934839428480.0, + "grad_norm": 23.57779255107615, + "language_loss": 0.8548311, + "learning_rate": 3.0247428242568474e-06, + "loss": 0.87282985, + "num_input_tokens_seen": 124875685, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 0.29125977, + "step": 5803, + "time_per_iteration": 2.656851053237915 + }, + { + "auxiliary_loss_clip": 0.01551959, + "auxiliary_loss_mlp": 0.00251914, + "balance_loss_clip": 1.2583735, + "balance_loss_mlp": 0.21767694, + "epoch": 0.348955358484894, + "flos": 30446179816320.0, + "grad_norm": 29.324733621419128, + "language_loss": 0.75280428, + "learning_rate": 3.0244083491746085e-06, + "loss": 0.77084303, + "num_input_tokens_seen": 124895960, + "router_z_loss_clip": 2.9375, + "router_z_loss_mlp": 0.34179688, + "step": 5804, + "time_per_iteration": 2.7073211669921875 + }, + { + "auxiliary_loss_clip": 0.01575134, + "auxiliary_loss_mlp": 0.0024435, + "balance_loss_clip": 1.28935063, + "balance_loss_mlp": 0.21247301, + "epoch": 0.349015481737562, + "flos": 17999972490240.0, + "grad_norm": 8.797189168446717, + "language_loss": 0.84269536, + "learning_rate": 3.024073835246702e-06, + "loss": 0.86089021, + "num_input_tokens_seen": 124914140, + "router_z_loss_clip": 2.86132812, + "router_z_loss_mlp": 0.31884766, + "step": 5805, + "time_per_iteration": 2.6225640773773193 + }, + { + "auxiliary_loss_clip": 0.01559894, + "auxiliary_loss_mlp": 0.00230483, + "balance_loss_clip": 1.26963663, + "balance_loss_mlp": 0.1970565, + "epoch": 0.34907560499023, + "flos": 27198490694400.0, + "grad_norm": 146.9946221316487, + "language_loss": 0.7654227, + "learning_rate": 3.023739282485814e-06, + "loss": 0.78332645, + "num_input_tokens_seen": 124934180, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.33447266, + "step": 5806, + "time_per_iteration": 4.117645978927612 + }, + { + "auxiliary_loss_clip": 0.01590583, + "auxiliary_loss_mlp": 0.00260473, + "balance_loss_clip": 1.29432178, + "balance_loss_mlp": 0.22594967, + "epoch": 0.34913572824289796, + "flos": 30226873328640.0, + "grad_norm": 1.9751375308200427, + "language_loss": 0.77537239, + "learning_rate": 3.023404690904629e-06, + "loss": 0.79388297, + "num_input_tokens_seen": 124956060, + "router_z_loss_clip": 2.96484375, + "router_z_loss_mlp": 0.34521484, + "step": 5807, + "time_per_iteration": 2.7855167388916016 + }, + { + "auxiliary_loss_clip": 0.01561004, + "auxiliary_loss_mlp": 0.00257091, + "balance_loss_clip": 1.26917839, + "balance_loss_mlp": 0.2213755, + "epoch": 0.3491958514955659, + "flos": 29971907614080.0, + "grad_norm": 73.4048830562558, + "language_loss": 0.82050192, + "learning_rate": 3.0230700605158364e-06, + "loss": 0.83868289, + "num_input_tokens_seen": 124976070, + "router_z_loss_clip": 2.91796875, + "router_z_loss_mlp": 0.35717773, + "step": 5808, + "time_per_iteration": 2.7306065559387207 + }, + { + "auxiliary_loss_clip": 0.01566895, + "auxiliary_loss_mlp": 0.00238818, + "balance_loss_clip": 1.28053749, + "balance_loss_mlp": 0.2056303, + "epoch": 0.3492559747482339, + "flos": 22783273902720.0, + "grad_norm": 19.558915230599123, + "language_loss": 0.89057976, + "learning_rate": 3.0227353913321238e-06, + "loss": 0.90863693, + "num_input_tokens_seen": 124996995, + "router_z_loss_clip": 2.86523438, + "router_z_loss_mlp": 0.33227539, + "step": 5809, + "time_per_iteration": 2.654219627380371 + }, + { + "auxiliary_loss_clip": 0.01560349, + "auxiliary_loss_mlp": 0.00247631, + "balance_loss_clip": 1.27748251, + "balance_loss_mlp": 0.21296453, + "epoch": 0.34931609800090185, + "flos": 26068022881920.0, + "grad_norm": 37.28032634310127, + "language_loss": 0.87392956, + "learning_rate": 3.0224006833661835e-06, + "loss": 0.89200932, + "num_input_tokens_seen": 125015600, + "router_z_loss_clip": 2.82617188, + "router_z_loss_mlp": 0.34692383, + "step": 5810, + "time_per_iteration": 2.655759572982788 + }, + { + "auxiliary_loss_clip": 0.01569706, + "auxiliary_loss_mlp": 0.00239939, + "balance_loss_clip": 1.27663732, + "balance_loss_mlp": 0.20486797, + "epoch": 0.3493762212535698, + "flos": 29242023252480.0, + "grad_norm": 39.19042257785873, + "language_loss": 0.82668775, + "learning_rate": 3.0220659366307057e-06, + "loss": 0.8447842, + "num_input_tokens_seen": 125035290, + "router_z_loss_clip": 2.9296875, + "router_z_loss_mlp": 0.35058594, + "step": 5811, + "time_per_iteration": 2.7703630924224854 + }, + { + "auxiliary_loss_clip": 0.01564741, + "auxiliary_loss_mlp": 0.0023128, + "balance_loss_clip": 1.27751386, + "balance_loss_mlp": 0.19592279, + "epoch": 0.3494363445062378, + "flos": 27126058919040.0, + "grad_norm": 6.36716662143538, + "language_loss": 0.85958666, + "learning_rate": 3.021731151138386e-06, + "loss": 0.87754685, + "num_input_tokens_seen": 125057130, + "router_z_loss_clip": 2.875, + "router_z_loss_mlp": 0.35351562, + "step": 5812, + "time_per_iteration": 2.70796537399292 + }, + { + "auxiliary_loss_clip": 0.01566501, + "auxiliary_loss_mlp": 0.00247519, + "balance_loss_clip": 1.27272129, + "balance_loss_mlp": 0.21106461, + "epoch": 0.34949646775890575, + "flos": 12276207233280.0, + "grad_norm": 23.846968409879793, + "language_loss": 0.7837404, + "learning_rate": 3.021396326901918e-06, + "loss": 0.8018806, + "num_input_tokens_seen": 125073720, + "router_z_loss_clip": 2.94335938, + "router_z_loss_mlp": 0.36450195, + "step": 5813, + "time_per_iteration": 4.004144906997681 + }, + { + "auxiliary_loss_clip": 0.01576528, + "auxiliary_loss_mlp": 0.00227537, + "balance_loss_clip": 1.2868855, + "balance_loss_mlp": 0.19258484, + "epoch": 0.3495565910115737, + "flos": 17165516659200.0, + "grad_norm": 9.027985365000973, + "language_loss": 0.83727551, + "learning_rate": 3.0210614639339998e-06, + "loss": 0.85531616, + "num_input_tokens_seen": 125090635, + "router_z_loss_clip": 2.89453125, + "router_z_loss_mlp": 0.34960938, + "step": 5814, + "time_per_iteration": 2.67761492729187 + }, + { + "auxiliary_loss_clip": 0.01572333, + "auxiliary_loss_mlp": 0.00242412, + "balance_loss_clip": 1.2755301, + "balance_loss_mlp": 0.20874743, + "epoch": 0.3496167142642417, + "flos": 26465661417600.0, + "grad_norm": 230.01527896364266, + "language_loss": 0.90555334, + "learning_rate": 3.020726562247328e-06, + "loss": 0.92370075, + "num_input_tokens_seen": 125110070, + "router_z_loss_clip": 2.97265625, + "router_z_loss_mlp": 0.33691406, + "step": 5815, + "time_per_iteration": 2.744168758392334 + }, + { + "auxiliary_loss_clip": 0.01576726, + "auxiliary_loss_mlp": 0.00239533, + "balance_loss_clip": 1.28114867, + "balance_loss_mlp": 0.20603502, + "epoch": 0.34967683751690964, + "flos": 17414843938560.0, + "grad_norm": 8.759685359859752, + "language_loss": 0.83783603, + "learning_rate": 3.0203916218546024e-06, + "loss": 0.85599864, + "num_input_tokens_seen": 125125730, + "router_z_loss_clip": 2.95703125, + "router_z_loss_mlp": 0.33496094, + "step": 5816, + "time_per_iteration": 2.632697820663452 + }, + { + "auxiliary_loss_clip": 0.01594198, + "auxiliary_loss_mlp": 0.00242727, + "balance_loss_clip": 1.29634237, + "balance_loss_mlp": 0.2026249, + "epoch": 0.3497369607695776, + "flos": 22600021691520.0, + "grad_norm": 12.255862808411512, + "language_loss": 0.6873104, + "learning_rate": 3.0200566427685246e-06, + "loss": 0.7056796, + "num_input_tokens_seen": 125146195, + "router_z_loss_clip": 2.98046875, + "router_z_loss_mlp": 0.40112305, + "step": 5817, + "time_per_iteration": 2.7047250270843506 + }, + { + "auxiliary_loss_clip": 0.01460708, + "auxiliary_loss_mlp": 0.0007659, + "balance_loss_clip": 1.28176117, + "balance_loss_mlp": 0.06409672, + "epoch": 0.34979708402224563, + "flos": 68529374818560.0, + "grad_norm": 0.9047523949974388, + "language_loss": 0.60090649, + "learning_rate": 3.0197216250017975e-06, + "loss": 0.61627948, + "num_input_tokens_seen": 125207790, + "router_z_loss_clip": 1.7890625, + "router_z_loss_mlp": 0.125, + "step": 5818, + "time_per_iteration": 3.2595417499542236 + }, + { + "auxiliary_loss_clip": 0.01575064, + "auxiliary_loss_mlp": 0.00225528, + "balance_loss_clip": 1.28214061, + "balance_loss_mlp": 0.18778639, + "epoch": 0.3498572072749136, + "flos": 18989634988800.0, + "grad_norm": 26.728886683583188, + "language_loss": 0.89554662, + "learning_rate": 3.019386568567123e-06, + "loss": 0.91355252, + "num_input_tokens_seen": 125226220, + "router_z_loss_clip": 2.92773438, + "router_z_loss_mlp": 0.37768555, + "step": 5819, + "time_per_iteration": 2.6883039474487305 + }, + { + "auxiliary_loss_clip": 0.01580991, + "auxiliary_loss_mlp": 0.00212275, + "balance_loss_clip": 1.28462315, + "balance_loss_mlp": 0.17851481, + "epoch": 0.34991733052758156, + "flos": 27818883423360.0, + "grad_norm": 3.1635689709523374, + "language_loss": 0.75444341, + "learning_rate": 3.0190514734772083e-06, + "loss": 0.77237606, + "num_input_tokens_seen": 125247485, + "router_z_loss_clip": 2.96875, + "router_z_loss_mlp": 0.33764648, + "step": 5820, + "time_per_iteration": 2.7443721294403076 + }, + { + "auxiliary_loss_clip": 0.01577249, + "auxiliary_loss_mlp": 0.00230196, + "balance_loss_clip": 1.2770927, + "balance_loss_mlp": 0.19498135, + "epoch": 0.3499774537802495, + "flos": 33584197737600.0, + "grad_norm": 69.96375477888168, + "language_loss": 0.73808169, + "learning_rate": 3.018716339744759e-06, + "loss": 0.75615609, + "num_input_tokens_seen": 125268625, + "router_z_loss_clip": 3.00195312, + "router_z_loss_mlp": 0.35229492, + "step": 5821, + "time_per_iteration": 2.790905237197876 + }, + { + "auxiliary_loss_clip": 0.015748, + "auxiliary_loss_mlp": 0.00247696, + "balance_loss_clip": 1.27156758, + "balance_loss_mlp": 0.21128994, + "epoch": 0.3500375770329175, + "flos": 23476744851840.0, + "grad_norm": 9.35806671139538, + "language_loss": 0.81627595, + "learning_rate": 3.0183811673824842e-06, + "loss": 0.83450091, + "num_input_tokens_seen": 125287530, + "router_z_loss_clip": 3.03320312, + "router_z_loss_mlp": 0.36401367, + "step": 5822, + "time_per_iteration": 2.700282096862793 + }, + { + "auxiliary_loss_clip": 0.01567257, + "auxiliary_loss_mlp": 0.00247892, + "balance_loss_clip": 1.26854515, + "balance_loss_mlp": 0.21155696, + "epoch": 0.35009770028558546, + "flos": 19026048401280.0, + "grad_norm": 16.64389630615954, + "language_loss": 0.8540414, + "learning_rate": 3.018045956403094e-06, + "loss": 0.87219286, + "num_input_tokens_seen": 125307020, + "router_z_loss_clip": 2.984375, + "router_z_loss_mlp": 0.36328125, + "step": 5823, + "time_per_iteration": 2.661284923553467 + }, + { + "auxiliary_loss_clip": 0.01412952, + "auxiliary_loss_mlp": 0.00093582, + "balance_loss_clip": 1.23731685, + "balance_loss_mlp": 0.08175655, + "epoch": 0.3501578235382534, + "flos": 68351868783360.0, + "grad_norm": 0.7643539865106257, + "language_loss": 0.58663636, + "learning_rate": 3.017710706819298e-06, + "loss": 0.60170168, + "num_input_tokens_seen": 125370445, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.11816406, + "step": 5824, + "time_per_iteration": 3.1621673107147217 + }, + { + "auxiliary_loss_clip": 0.01551744, + "auxiliary_loss_mlp": 0.00217395, + "balance_loss_clip": 1.259848, + "balance_loss_mlp": 0.18125057, + "epoch": 0.3502179467909214, + "flos": 21250893836160.0, + "grad_norm": 2.402137245768028, + "language_loss": 0.91707826, + "learning_rate": 3.017375418643811e-06, + "loss": 0.93476963, + "num_input_tokens_seen": 125388900, + "router_z_loss_clip": 2.921875, + "router_z_loss_mlp": 0.36132812, + "step": 5825, + "time_per_iteration": 2.635638952255249 + }, + { + "auxiliary_loss_clip": 0.01577927, + "auxiliary_loss_mlp": 0.00232612, + "balance_loss_clip": 1.28528404, + "balance_loss_mlp": 0.19646822, + "epoch": 0.35027807004358935, + "flos": 11942955826560.0, + "grad_norm": 8.540964697798524, + "language_loss": 0.92130703, + "learning_rate": 3.0170400918893464e-06, + "loss": 0.93941247, + "num_input_tokens_seen": 125402675, + "router_z_loss_clip": 2.92382812, + "router_z_loss_mlp": 0.36181641, + "step": 5826, + "time_per_iteration": 2.5988311767578125 + }, + { + "auxiliary_loss_clip": 0.0155829, + "auxiliary_loss_mlp": 0.00230896, + "balance_loss_clip": 1.26758587, + "balance_loss_mlp": 0.19525287, + "epoch": 0.3503381932962573, + "flos": 21470918595840.0, + "grad_norm": 25.662655212803255, + "language_loss": 0.86769378, + "learning_rate": 3.0167047265686186e-06, + "loss": 0.88558567, + "num_input_tokens_seen": 125421360, + "router_z_loss_clip": 2.90625, + "router_z_loss_mlp": 0.35644531, + "step": 5827, + "time_per_iteration": 2.6705381870269775 + }, + { + "auxiliary_loss_clip": 0.0153217, + "auxiliary_loss_mlp": 0.00204343, + "balance_loss_clip": 1.24856234, + "balance_loss_mlp": 0.16967644, + "epoch": 0.3503983165489253, + "flos": 21251109317760.0, + "grad_norm": 334.3822124394979, + "language_loss": 0.80842608, + "learning_rate": 3.0163693226943467e-06, + "loss": 0.82579124, + "num_input_tokens_seen": 125440000, + "router_z_loss_clip": 2.83789062, + "router_z_loss_mlp": 0.34667969, + "step": 5828, + "time_per_iteration": 2.638786554336548 + }, + { + "auxiliary_loss_clip": 0.0152519, + "auxiliary_loss_mlp": 0.00235543, + "balance_loss_clip": 1.23996758, + "balance_loss_mlp": 0.19846895, + "epoch": 0.35045843980159325, + "flos": 27815723026560.0, + "grad_norm": 22.696204966671583, + "language_loss": 0.85539079, + "learning_rate": 3.016033880279248e-06, + "loss": 0.87299812, + "num_input_tokens_seen": 125460390, + "router_z_loss_clip": 2.8515625, + "router_z_loss_mlp": 0.37060547, + "step": 5829, + "time_per_iteration": 2.6921236515045166 + }, + { + "auxiliary_loss_clip": 0.01538918, + "auxiliary_loss_mlp": 0.00239069, + "balance_loss_clip": 1.24559188, + "balance_loss_mlp": 0.20251977, + "epoch": 0.3505185630542612, + "flos": 25921148169600.0, + "grad_norm": 10.008271779218335, + "language_loss": 0.81343091, + "learning_rate": 3.0156983993360417e-06, + "loss": 0.83121079, + "num_input_tokens_seen": 125478410, + "router_z_loss_clip": 2.93359375, + "router_z_loss_mlp": 0.36547852, + "step": 5830, + "time_per_iteration": 2.6528921127319336 + }, + { + "auxiliary_loss_clip": 0.01541855, + "auxiliary_loss_mlp": 0.00204128, + "balance_loss_clip": 1.2530725, + "balance_loss_mlp": 0.16922313, + "epoch": 0.35057868630692923, + "flos": 20521763660160.0, + "grad_norm": 25.480547908884514, + "language_loss": 0.97221619, + "learning_rate": 3.0153628798774513e-06, + "loss": 0.98967606, + "num_input_tokens_seen": 125495975, + "router_z_loss_clip": 2.88476562, + "router_z_loss_mlp": 0.34887695, + "step": 5831, + "time_per_iteration": 2.6487741470336914 + }, + { + "auxiliary_loss_clip": 0.01526114, + "auxiliary_loss_mlp": 0.00201496, + "balance_loss_clip": 1.24182379, + "balance_loss_mlp": 0.16311058, + "epoch": 0.3506388095595972, + "flos": 20448649526400.0, + "grad_norm": 8.421436539511818, + "language_loss": 0.87504363, + "learning_rate": 3.0150273219161985e-06, + "loss": 0.89231968, + "num_input_tokens_seen": 125515035, + "router_z_loss_clip": 2.84179688, + "router_z_loss_mlp": 0.38378906, + "step": 5832, + "time_per_iteration": 2.6634533405303955 + }, + { + "auxiliary_loss_clip": 0.01540337, + "auxiliary_loss_mlp": 0.00240114, + "balance_loss_clip": 1.24602151, + "balance_loss_mlp": 0.2017765, + "epoch": 0.35069893281226516, + "flos": 23109665811840.0, + "grad_norm": 93.01636380491672, + "language_loss": 0.78382868, + "learning_rate": 3.014691725465008e-06, + "loss": 0.80163324, + "num_input_tokens_seen": 125535555, + "router_z_loss_clip": 2.9375, + "router_z_loss_mlp": 0.38330078, + "step": 5833, + "time_per_iteration": 2.7585129737854004 + }, + { + "auxiliary_loss_clip": 0.01517197, + "auxiliary_loss_mlp": 0.00183191, + "balance_loss_clip": 1.23564792, + "balance_loss_mlp": 0.15069476, + "epoch": 0.35075905606493313, + "flos": 27271999877760.0, + "grad_norm": 19.68969705857964, + "language_loss": 0.85490882, + "learning_rate": 3.014356090536606e-06, + "loss": 0.87191272, + "num_input_tokens_seen": 125558195, + "router_z_loss_clip": 2.81640625, + "router_z_loss_mlp": 0.32519531, + "step": 5834, + "time_per_iteration": 2.78979754447937 + }, + { + "auxiliary_loss_clip": 0.01523248, + "auxiliary_loss_mlp": 0.00211907, + "balance_loss_clip": 1.23934579, + "balance_loss_mlp": 0.17697826, + "epoch": 0.3508191793176011, + "flos": 19128608709120.0, + "grad_norm": 11.685140299256252, + "language_loss": 0.92696583, + "learning_rate": 3.0140204171437183e-06, + "loss": 0.94431734, + "num_input_tokens_seen": 125575375, + "router_z_loss_clip": 2.83984375, + "router_z_loss_mlp": 0.34960938, + "step": 5835, + "time_per_iteration": 2.6622815132141113 + }, + { + "auxiliary_loss_clip": 0.01522393, + "auxiliary_loss_mlp": 0.00220481, + "balance_loss_clip": 1.24239779, + "balance_loss_mlp": 0.18977252, + "epoch": 0.35087930257026906, + "flos": 25557588662400.0, + "grad_norm": 2.2939299686992864, + "language_loss": 0.81347227, + "learning_rate": 3.0136847052990754e-06, + "loss": 0.83090103, + "num_input_tokens_seen": 125596745, + "router_z_loss_clip": 2.796875, + "router_z_loss_mlp": 0.30688477, + "step": 5836, + "time_per_iteration": 2.7081809043884277 + }, + { + "auxiliary_loss_clip": 0.01508775, + "auxiliary_loss_mlp": 0.00199258, + "balance_loss_clip": 1.23258793, + "balance_loss_mlp": 0.16683304, + "epoch": 0.350939425822937, + "flos": 18004246208640.0, + "grad_norm": 94.30165913685752, + "language_loss": 0.85731959, + "learning_rate": 3.0133489550154074e-06, + "loss": 0.8743999, + "num_input_tokens_seen": 125613980, + "router_z_loss_clip": 2.76171875, + "router_z_loss_mlp": 0.32397461, + "step": 5837, + "time_per_iteration": 2.6494076251983643 + }, + { + "auxiliary_loss_clip": 0.01510179, + "auxiliary_loss_mlp": 0.00242485, + "balance_loss_clip": 1.22746921, + "balance_loss_mlp": 0.20922586, + "epoch": 0.350999549075605, + "flos": 22273198819200.0, + "grad_norm": 47.5923821009948, + "language_loss": 0.76175797, + "learning_rate": 3.0130131663054442e-06, + "loss": 0.7792846, + "num_input_tokens_seen": 125632100, + "router_z_loss_clip": 2.828125, + "router_z_loss_mlp": 0.33239746, + "step": 5838, + "time_per_iteration": 2.677680730819702 + }, + { + "auxiliary_loss_clip": 0.01489503, + "auxiliary_loss_mlp": 0.00184986, + "balance_loss_clip": 1.21836066, + "balance_loss_mlp": 0.14950943, + "epoch": 0.35105967232827295, + "flos": 14392279307520.0, + "grad_norm": 3.7660874424612287, + "language_loss": 0.91749126, + "learning_rate": 3.0126773391819215e-06, + "loss": 0.93423617, + "num_input_tokens_seen": 125649190, + "router_z_loss_clip": 2.71289062, + "router_z_loss_mlp": 0.35473633, + "step": 5839, + "time_per_iteration": 2.6309454441070557 + }, + { + "auxiliary_loss_clip": 0.01495712, + "auxiliary_loss_mlp": 0.00233021, + "balance_loss_clip": 1.21534646, + "balance_loss_mlp": 0.19864115, + "epoch": 0.3511197955809409, + "flos": 25082346792960.0, + "grad_norm": 7.830237553035223, + "language_loss": 0.68513894, + "learning_rate": 3.012341473657572e-06, + "loss": 0.70242625, + "num_input_tokens_seen": 125668680, + "router_z_loss_clip": 2.80078125, + "router_z_loss_mlp": 0.34350586, + "step": 5840, + "time_per_iteration": 2.6954710483551025 + }, + { + "auxiliary_loss_clip": 0.01495855, + "auxiliary_loss_mlp": 0.00213708, + "balance_loss_clip": 1.22218299, + "balance_loss_mlp": 0.18004367, + "epoch": 0.3511799188336089, + "flos": 25884160139520.0, + "grad_norm": 194.77949142691438, + "language_loss": 0.95750022, + "learning_rate": 3.0120055697451322e-06, + "loss": 0.97459579, + "num_input_tokens_seen": 125686935, + "router_z_loss_clip": 2.734375, + "router_z_loss_mlp": 0.33618164, + "step": 5841, + "time_per_iteration": 2.711456775665283 + }, + { + "auxiliary_loss_clip": 0.01495523, + "auxiliary_loss_mlp": 0.00248739, + "balance_loss_clip": 1.21783924, + "balance_loss_mlp": 0.21283305, + "epoch": 0.35124004208627685, + "flos": 20083725302400.0, + "grad_norm": 48.88818409016856, + "language_loss": 0.8396796, + "learning_rate": 3.0116696274573406e-06, + "loss": 0.85712218, + "num_input_tokens_seen": 125707180, + "router_z_loss_clip": 2.77734375, + "router_z_loss_mlp": 0.35888672, + "step": 5842, + "time_per_iteration": 2.7118382453918457 + }, + { + "auxiliary_loss_clip": 0.0150143, + "auxiliary_loss_mlp": 0.00219394, + "balance_loss_clip": 1.22603846, + "balance_loss_mlp": 0.18270169, + "epoch": 0.3513001653389448, + "flos": 17783431349760.0, + "grad_norm": 6.156129794892344, + "language_loss": 0.81603575, + "learning_rate": 3.0113336468069346e-06, + "loss": 0.83324403, + "num_input_tokens_seen": 125722780, + "router_z_loss_clip": 2.75585938, + "router_z_loss_mlp": 0.36694336, + "step": 5843, + "time_per_iteration": 4.196788311004639 + }, + { + "auxiliary_loss_clip": 0.01488592, + "auxiliary_loss_mlp": 0.00217674, + "balance_loss_clip": 1.21699536, + "balance_loss_mlp": 0.18186307, + "epoch": 0.3513602885916128, + "flos": 29387138198400.0, + "grad_norm": 11.96732825421378, + "language_loss": 0.74234068, + "learning_rate": 3.010997627806655e-06, + "loss": 0.75940329, + "num_input_tokens_seen": 125742110, + "router_z_loss_clip": 2.71289062, + "router_z_loss_mlp": 0.3581543, + "step": 5844, + "time_per_iteration": 2.9514596462249756 + }, + { + "auxiliary_loss_clip": 0.01490819, + "auxiliary_loss_mlp": 0.0019977, + "balance_loss_clip": 1.22015929, + "balance_loss_mlp": 0.16539007, + "epoch": 0.3514204118442808, + "flos": 16179876483840.0, + "grad_norm": 23.08384677847394, + "language_loss": 0.84317678, + "learning_rate": 3.010661570469245e-06, + "loss": 0.86008269, + "num_input_tokens_seen": 125759980, + "router_z_loss_clip": 2.70703125, + "router_z_loss_mlp": 0.34399414, + "step": 5845, + "time_per_iteration": 2.74468994140625 + }, + { + "auxiliary_loss_clip": 0.0148671, + "auxiliary_loss_mlp": 0.00205763, + "balance_loss_clip": 1.2197783, + "balance_loss_mlp": 0.17324279, + "epoch": 0.35148053509694877, + "flos": 23834665923840.0, + "grad_norm": 12.007083258298238, + "language_loss": 0.82987881, + "learning_rate": 3.0103254748074465e-06, + "loss": 0.84680355, + "num_input_tokens_seen": 125772660, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.32495117, + "step": 5846, + "time_per_iteration": 2.701140880584717 + }, + { + "auxiliary_loss_clip": 0.01492805, + "auxiliary_loss_mlp": 0.00220642, + "balance_loss_clip": 1.21779823, + "balance_loss_mlp": 0.18917096, + "epoch": 0.35154065834961673, + "flos": 20991295267200.0, + "grad_norm": 6.347185183749801, + "language_loss": 0.80332637, + "learning_rate": 3.0099893408340046e-06, + "loss": 0.82046092, + "num_input_tokens_seen": 125791935, + "router_z_loss_clip": 2.75195312, + "router_z_loss_mlp": 0.31469727, + "step": 5847, + "time_per_iteration": 2.7477598190307617 + }, + { + "auxiliary_loss_clip": 0.01506446, + "auxiliary_loss_mlp": 0.00198281, + "balance_loss_clip": 1.23107219, + "balance_loss_mlp": 0.16685733, + "epoch": 0.3516007816022847, + "flos": 33255471444480.0, + "grad_norm": 1346.510300176404, + "language_loss": 0.80288112, + "learning_rate": 3.009653168561666e-06, + "loss": 0.81992835, + "num_input_tokens_seen": 125813455, + "router_z_loss_clip": 2.75976562, + "router_z_loss_mlp": 0.31420898, + "step": 5848, + "time_per_iteration": 2.8120625019073486 + }, + { + "auxiliary_loss_clip": 0.01501003, + "auxiliary_loss_mlp": 0.00216329, + "balance_loss_clip": 1.22664976, + "balance_loss_mlp": 0.18237816, + "epoch": 0.35166090485495266, + "flos": 11726953390080.0, + "grad_norm": 6.806242205544046, + "language_loss": 0.99939144, + "learning_rate": 3.009316958003178e-06, + "loss": 1.01656473, + "num_input_tokens_seen": 125827660, + "router_z_loss_clip": 2.74414062, + "router_z_loss_mlp": 0.33959961, + "step": 5849, + "time_per_iteration": 4.193632364273071 + }, + { + "auxiliary_loss_clip": 0.01510966, + "auxiliary_loss_mlp": 0.00205963, + "balance_loss_clip": 1.23978472, + "balance_loss_mlp": 0.17506403, + "epoch": 0.3517210281076206, + "flos": 22638446265600.0, + "grad_norm": 118.00643526874818, + "language_loss": 0.85088789, + "learning_rate": 3.0089807091712897e-06, + "loss": 0.86805713, + "num_input_tokens_seen": 125846655, + "router_z_loss_clip": 2.71484375, + "router_z_loss_mlp": 0.30908203, + "step": 5850, + "time_per_iteration": 2.6801109313964844 + }, + { + "auxiliary_loss_clip": 0.01506362, + "auxiliary_loss_mlp": 0.0020993, + "balance_loss_clip": 1.23525476, + "balance_loss_mlp": 0.17690951, + "epoch": 0.3517811513602886, + "flos": 21322750993920.0, + "grad_norm": 8.745930273480166, + "language_loss": 0.82474875, + "learning_rate": 3.0086444220787515e-06, + "loss": 0.84191167, + "num_input_tokens_seen": 125866290, + "router_z_loss_clip": 2.70898438, + "router_z_loss_mlp": 0.33007812, + "step": 5851, + "time_per_iteration": 2.638716459274292 + }, + { + "auxiliary_loss_clip": 0.01523238, + "auxiliary_loss_mlp": 0.00197316, + "balance_loss_clip": 1.2503643, + "balance_loss_mlp": 0.16548759, + "epoch": 0.35184127461295656, + "flos": 21032880238080.0, + "grad_norm": 15.513811154796475, + "language_loss": 0.95824856, + "learning_rate": 3.0083080967383165e-06, + "loss": 0.97545409, + "num_input_tokens_seen": 125884620, + "router_z_loss_clip": 2.7265625, + "router_z_loss_mlp": 0.31811523, + "step": 5852, + "time_per_iteration": 2.6389896869659424 + }, + { + "auxiliary_loss_clip": 0.01527077, + "auxiliary_loss_mlp": 0.00214051, + "balance_loss_clip": 1.24984908, + "balance_loss_mlp": 0.18095855, + "epoch": 0.3519013978656245, + "flos": 22455265881600.0, + "grad_norm": 33.05630572846214, + "language_loss": 0.76525581, + "learning_rate": 3.007971733162737e-06, + "loss": 0.7826671, + "num_input_tokens_seen": 125902430, + "router_z_loss_clip": 2.77148438, + "router_z_loss_mlp": 0.33105469, + "step": 5853, + "time_per_iteration": 2.713188648223877 + }, + { + "auxiliary_loss_clip": 0.01522301, + "auxiliary_loss_mlp": 0.00213125, + "balance_loss_clip": 1.24175906, + "balance_loss_mlp": 0.17967477, + "epoch": 0.3519615211182925, + "flos": 13115295918720.0, + "grad_norm": 4.434978413776267, + "language_loss": 0.89082646, + "learning_rate": 3.0076353313647686e-06, + "loss": 0.90818077, + "num_input_tokens_seen": 125920570, + "router_z_loss_clip": 2.8046875, + "router_z_loss_mlp": 0.3347168, + "step": 5854, + "time_per_iteration": 2.6654839515686035 + }, + { + "auxiliary_loss_clip": 0.01528573, + "auxiliary_loss_mlp": 0.00188993, + "balance_loss_clip": 1.25624585, + "balance_loss_mlp": 0.15811799, + "epoch": 0.35202164437096045, + "flos": 19135144984320.0, + "grad_norm": 6.088950865676143, + "language_loss": 0.8012383, + "learning_rate": 3.0072988913571666e-06, + "loss": 0.81841397, + "num_input_tokens_seen": 125939800, + "router_z_loss_clip": 2.72265625, + "router_z_loss_mlp": 0.30859375, + "step": 5855, + "time_per_iteration": 4.1006903648376465 + }, + { + "auxiliary_loss_clip": 0.01509289, + "auxiliary_loss_mlp": 0.00197747, + "balance_loss_clip": 1.23767233, + "balance_loss_mlp": 0.16587012, + "epoch": 0.3520817676236284, + "flos": 26542187343360.0, + "grad_norm": 8.206888669266723, + "language_loss": 0.79963672, + "learning_rate": 3.006962413152691e-06, + "loss": 0.81670702, + "num_input_tokens_seen": 125958720, + "router_z_loss_clip": 2.72265625, + "router_z_loss_mlp": 0.31884766, + "step": 5856, + "time_per_iteration": 2.7310144901275635 + }, + { + "auxiliary_loss_clip": 0.01537528, + "auxiliary_loss_mlp": 0.00218346, + "balance_loss_clip": 1.25418568, + "balance_loss_mlp": 0.1853247, + "epoch": 0.3521418908762964, + "flos": 44893472803200.0, + "grad_norm": 24.942118903064955, + "language_loss": 0.69330537, + "learning_rate": 3.0066258967640987e-06, + "loss": 0.71086413, + "num_input_tokens_seen": 125984310, + "router_z_loss_clip": 2.83203125, + "router_z_loss_mlp": 0.33032227, + "step": 5857, + "time_per_iteration": 2.960798740386963 + }, + { + "auxiliary_loss_clip": 0.01528123, + "auxiliary_loss_mlp": 0.0022685, + "balance_loss_clip": 1.24635923, + "balance_loss_mlp": 0.19539064, + "epoch": 0.3522020141289644, + "flos": 20187398931840.0, + "grad_norm": 17.226276319749353, + "language_loss": 0.80056202, + "learning_rate": 3.006289342204152e-06, + "loss": 0.81811178, + "num_input_tokens_seen": 126002410, + "router_z_loss_clip": 2.81835938, + "router_z_loss_mlp": 0.31481934, + "step": 5858, + "time_per_iteration": 2.7337229251861572 + }, + { + "auxiliary_loss_clip": 0.01531838, + "auxiliary_loss_mlp": 0.00207318, + "balance_loss_clip": 1.25277114, + "balance_loss_mlp": 0.17646638, + "epoch": 0.35226213738163237, + "flos": 27563917708800.0, + "grad_norm": 32.10512889216305, + "language_loss": 0.83192825, + "learning_rate": 3.0059527494856126e-06, + "loss": 0.84931982, + "num_input_tokens_seen": 126022490, + "router_z_loss_clip": 2.78710938, + "router_z_loss_mlp": 0.30834961, + "step": 5859, + "time_per_iteration": 2.7709572315216064 + }, + { + "auxiliary_loss_clip": 0.01538089, + "auxiliary_loss_mlp": 0.00243789, + "balance_loss_clip": 1.25578547, + "balance_loss_mlp": 0.21200824, + "epoch": 0.35232226063430033, + "flos": 22966310632320.0, + "grad_norm": 27.652407880978856, + "language_loss": 0.80359197, + "learning_rate": 3.0056161186212435e-06, + "loss": 0.82141072, + "num_input_tokens_seen": 126042895, + "router_z_loss_clip": 2.82226562, + "router_z_loss_mlp": 0.31762695, + "step": 5860, + "time_per_iteration": 2.7094807624816895 + }, + { + "auxiliary_loss_clip": 0.01528858, + "auxiliary_loss_mlp": 0.00225652, + "balance_loss_clip": 1.24485064, + "balance_loss_mlp": 0.19315507, + "epoch": 0.3523823838869683, + "flos": 19168290259200.0, + "grad_norm": 2.5149409271444694, + "language_loss": 0.8035028, + "learning_rate": 3.005279449623811e-06, + "loss": 0.8210479, + "num_input_tokens_seen": 126060130, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 0.32495117, + "step": 5861, + "time_per_iteration": 2.7008743286132812 + }, + { + "auxiliary_loss_clip": 0.01551766, + "auxiliary_loss_mlp": 0.00215468, + "balance_loss_clip": 1.27121079, + "balance_loss_mlp": 0.18216069, + "epoch": 0.35244250713963626, + "flos": 17930988420480.0, + "grad_norm": 5.041665823191844, + "language_loss": 0.74959791, + "learning_rate": 3.0049427425060815e-06, + "loss": 0.76727021, + "num_input_tokens_seen": 126077850, + "router_z_loss_clip": 2.80078125, + "router_z_loss_mlp": 0.33325195, + "step": 5862, + "time_per_iteration": 2.6826417446136475 + }, + { + "auxiliary_loss_clip": 0.01545658, + "auxiliary_loss_mlp": 0.00250366, + "balance_loss_clip": 1.26412499, + "balance_loss_mlp": 0.2173216, + "epoch": 0.35250263039230423, + "flos": 21432529935360.0, + "grad_norm": 271.92291673649606, + "language_loss": 0.84381956, + "learning_rate": 3.0046059972808215e-06, + "loss": 0.86177981, + "num_input_tokens_seen": 126095985, + "router_z_loss_clip": 2.8125, + "router_z_loss_mlp": 0.33081055, + "step": 5863, + "time_per_iteration": 2.7712810039520264 + }, + { + "auxiliary_loss_clip": 0.0153974, + "auxiliary_loss_mlp": 0.00229304, + "balance_loss_clip": 1.25994825, + "balance_loss_mlp": 0.19747508, + "epoch": 0.3525627536449722, + "flos": 27416863428480.0, + "grad_norm": 24.67685669205483, + "language_loss": 0.81271005, + "learning_rate": 3.0042692139608024e-06, + "loss": 0.83040047, + "num_input_tokens_seen": 126116070, + "router_z_loss_clip": 2.796875, + "router_z_loss_mlp": 0.31835938, + "step": 5864, + "time_per_iteration": 2.7455577850341797 + }, + { + "auxiliary_loss_clip": 0.01547252, + "auxiliary_loss_mlp": 0.00216485, + "balance_loss_clip": 1.2666415, + "balance_loss_mlp": 0.18646784, + "epoch": 0.35262287689764016, + "flos": 24789818430720.0, + "grad_norm": 3.5033826666673367, + "language_loss": 0.89233088, + "learning_rate": 3.003932392558793e-06, + "loss": 0.90996826, + "num_input_tokens_seen": 126135205, + "router_z_loss_clip": 2.80078125, + "router_z_loss_mlp": 0.30029297, + "step": 5865, + "time_per_iteration": 2.900442600250244 + }, + { + "auxiliary_loss_clip": 0.01557253, + "auxiliary_loss_mlp": 0.00265463, + "balance_loss_clip": 1.26746023, + "balance_loss_mlp": 0.23163143, + "epoch": 0.3526830001503081, + "flos": 17821604528640.0, + "grad_norm": 3.0238925018260696, + "language_loss": 0.89306295, + "learning_rate": 3.0035955330875677e-06, + "loss": 0.91129017, + "num_input_tokens_seen": 126151895, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.33813477, + "step": 5866, + "time_per_iteration": 2.6215198040008545 + }, + { + "auxiliary_loss_clip": 0.01540923, + "auxiliary_loss_mlp": 0.00258244, + "balance_loss_clip": 1.25042629, + "balance_loss_mlp": 0.22283903, + "epoch": 0.3527431234029761, + "flos": 18078114528000.0, + "grad_norm": 14.40635147565815, + "language_loss": 0.93465424, + "learning_rate": 3.0032586355598986e-06, + "loss": 0.9526459, + "num_input_tokens_seen": 126168515, + "router_z_loss_clip": 2.90820312, + "router_z_loss_mlp": 0.35424805, + "step": 5867, + "time_per_iteration": 2.6193461418151855 + }, + { + "auxiliary_loss_clip": 0.01551169, + "auxiliary_loss_mlp": 0.00240178, + "balance_loss_clip": 1.26684093, + "balance_loss_mlp": 0.21088813, + "epoch": 0.35280324665564405, + "flos": 19427350124160.0, + "grad_norm": 1348.2564886775306, + "language_loss": 0.81130272, + "learning_rate": 3.0029216999885613e-06, + "loss": 0.82921618, + "num_input_tokens_seen": 126186460, + "router_z_loss_clip": 2.84179688, + "router_z_loss_mlp": 0.29309082, + "step": 5868, + "time_per_iteration": 2.6162021160125732 + }, + { + "auxiliary_loss_clip": 0.01562742, + "auxiliary_loss_mlp": 0.00229255, + "balance_loss_clip": 1.27585173, + "balance_loss_mlp": 0.19752173, + "epoch": 0.352863369908312, + "flos": 21504027957120.0, + "grad_norm": 18.565821139703353, + "language_loss": 0.71407843, + "learning_rate": 3.0025847263863327e-06, + "loss": 0.73199832, + "num_input_tokens_seen": 126206170, + "router_z_loss_clip": 2.86914062, + "router_z_loss_mlp": 0.31738281, + "step": 5869, + "time_per_iteration": 2.636399984359741 + }, + { + "auxiliary_loss_clip": 0.01540381, + "auxiliary_loss_mlp": 0.00234544, + "balance_loss_clip": 1.25932312, + "balance_loss_mlp": 0.20385987, + "epoch": 0.35292349316098, + "flos": 22309504490880.0, + "grad_norm": 179.72877583884124, + "language_loss": 0.83734149, + "learning_rate": 3.0022477147659917e-06, + "loss": 0.85509074, + "num_input_tokens_seen": 126225605, + "router_z_loss_clip": 2.8125, + "router_z_loss_mlp": 0.3067627, + "step": 5870, + "time_per_iteration": 2.659522294998169 + }, + { + "auxiliary_loss_clip": 0.01537413, + "auxiliary_loss_mlp": 0.00243821, + "balance_loss_clip": 1.26205564, + "balance_loss_mlp": 0.21263549, + "epoch": 0.352983616413648, + "flos": 33109745967360.0, + "grad_norm": 4.104322903160396, + "language_loss": 0.77605247, + "learning_rate": 3.001910665140316e-06, + "loss": 0.79386485, + "num_input_tokens_seen": 126250230, + "router_z_loss_clip": 2.75390625, + "router_z_loss_mlp": 0.31201172, + "step": 5871, + "time_per_iteration": 2.868439197540283 + }, + { + "auxiliary_loss_clip": 0.01540613, + "auxiliary_loss_mlp": 0.00206025, + "balance_loss_clip": 1.26738369, + "balance_loss_mlp": 0.17853522, + "epoch": 0.35304373966631597, + "flos": 18696603836160.0, + "grad_norm": 261.4402412486613, + "language_loss": 0.81121475, + "learning_rate": 3.0015735775220873e-06, + "loss": 0.82868111, + "num_input_tokens_seen": 126268315, + "router_z_loss_clip": 2.734375, + "router_z_loss_mlp": 0.27514648, + "step": 5872, + "time_per_iteration": 2.627357244491577 + }, + { + "auxiliary_loss_clip": 0.01548381, + "auxiliary_loss_mlp": 0.00212955, + "balance_loss_clip": 1.27127004, + "balance_loss_mlp": 0.18265221, + "epoch": 0.35310386291898394, + "flos": 23364954748800.0, + "grad_norm": 41.98583573571079, + "language_loss": 0.87685698, + "learning_rate": 3.001236451924089e-06, + "loss": 0.89447033, + "num_input_tokens_seen": 126288390, + "router_z_loss_clip": 2.77539062, + "router_z_loss_mlp": 0.30322266, + "step": 5873, + "time_per_iteration": 2.6473228931427 + }, + { + "auxiliary_loss_clip": 0.01555399, + "auxiliary_loss_mlp": 0.00265296, + "balance_loss_clip": 1.27073741, + "balance_loss_mlp": 0.23501715, + "epoch": 0.3531639861716519, + "flos": 24461954064000.0, + "grad_norm": 111.65463896716041, + "language_loss": 0.75061727, + "learning_rate": 3.000899288359104e-06, + "loss": 0.76882422, + "num_input_tokens_seen": 126305750, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 0.30249023, + "step": 5874, + "time_per_iteration": 2.6582789421081543 + }, + { + "auxiliary_loss_clip": 0.01548153, + "auxiliary_loss_mlp": 0.00093008, + "balance_loss_clip": 1.36006117, + "balance_loss_mlp": 0.08418636, + "epoch": 0.35322410942431987, + "flos": 70312446881280.0, + "grad_norm": 0.8025998236146415, + "language_loss": 0.61204904, + "learning_rate": 3.000562086839917e-06, + "loss": 0.62846065, + "num_input_tokens_seen": 126362495, + "router_z_loss_clip": 1.875, + "router_z_loss_mlp": 0.08837891, + "step": 5875, + "time_per_iteration": 3.0798707008361816 + }, + { + "auxiliary_loss_clip": 0.01557035, + "auxiliary_loss_mlp": 0.00253245, + "balance_loss_clip": 1.27313447, + "balance_loss_mlp": 0.22351415, + "epoch": 0.35328423267698783, + "flos": 19820894509440.0, + "grad_norm": 23.428945231621206, + "language_loss": 0.83997917, + "learning_rate": 3.0002248473793163e-06, + "loss": 0.85808194, + "num_input_tokens_seen": 126378320, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 0.29699707, + "step": 5876, + "time_per_iteration": 2.634537696838379 + }, + { + "auxiliary_loss_clip": 0.01502852, + "auxiliary_loss_mlp": 0.00073255, + "balance_loss_clip": 1.32432067, + "balance_loss_mlp": 0.06424253, + "epoch": 0.3533443559296558, + "flos": 60826356391680.0, + "grad_norm": 0.6788376982672064, + "language_loss": 0.56679016, + "learning_rate": 2.999887569990088e-06, + "loss": 0.58255124, + "num_input_tokens_seen": 126442735, + "router_z_loss_clip": 1.7890625, + "router_z_loss_mlp": 0.09033203, + "step": 5877, + "time_per_iteration": 3.2287144660949707 + }, + { + "auxiliary_loss_clip": 0.01544909, + "auxiliary_loss_mlp": 0.0022195, + "balance_loss_clip": 1.2634449, + "balance_loss_mlp": 0.19157571, + "epoch": 0.35340447918232376, + "flos": 24755775315840.0, + "grad_norm": 373.40445332385326, + "language_loss": 0.81697005, + "learning_rate": 2.999550254685024e-06, + "loss": 0.8346386, + "num_input_tokens_seen": 126463090, + "router_z_loss_clip": 2.81640625, + "router_z_loss_mlp": 0.30371094, + "step": 5878, + "time_per_iteration": 2.6868772506713867 + }, + { + "auxiliary_loss_clip": 0.01541398, + "auxiliary_loss_mlp": 0.00246637, + "balance_loss_clip": 1.259866, + "balance_loss_mlp": 0.21807402, + "epoch": 0.3534646024349917, + "flos": 21796304924160.0, + "grad_norm": 6.831848092345733, + "language_loss": 0.85729015, + "learning_rate": 2.9992129014769136e-06, + "loss": 0.87517047, + "num_input_tokens_seen": 126482105, + "router_z_loss_clip": 2.81640625, + "router_z_loss_mlp": 0.28588867, + "step": 5879, + "time_per_iteration": 2.673164129257202 + }, + { + "auxiliary_loss_clip": 0.01544228, + "auxiliary_loss_mlp": 0.00271621, + "balance_loss_clip": 1.25687146, + "balance_loss_mlp": 0.24007834, + "epoch": 0.3535247256876597, + "flos": 20012119539840.0, + "grad_norm": 5.895197069495591, + "language_loss": 0.76413691, + "learning_rate": 2.9988755103785493e-06, + "loss": 0.78229541, + "num_input_tokens_seen": 126502125, + "router_z_loss_clip": 2.87304688, + "router_z_loss_mlp": 0.31542969, + "step": 5880, + "time_per_iteration": 2.6836390495300293 + }, + { + "auxiliary_loss_clip": 0.0153904, + "auxiliary_loss_mlp": 0.00272894, + "balance_loss_clip": 1.25292325, + "balance_loss_mlp": 0.2423286, + "epoch": 0.35358484894032766, + "flos": 18187929383040.0, + "grad_norm": 19.702092018180075, + "language_loss": 0.75804782, + "learning_rate": 2.998538081402727e-06, + "loss": 0.77616715, + "num_input_tokens_seen": 126521950, + "router_z_loss_clip": 2.859375, + "router_z_loss_mlp": 0.30566406, + "step": 5881, + "time_per_iteration": 2.6667232513427734 + }, + { + "auxiliary_loss_clip": 0.01539635, + "auxiliary_loss_mlp": 0.00241575, + "balance_loss_clip": 1.26371336, + "balance_loss_mlp": 0.21524119, + "epoch": 0.3536449721929956, + "flos": 22820369673600.0, + "grad_norm": 26.630480695242866, + "language_loss": 0.81761354, + "learning_rate": 2.998200614562239e-06, + "loss": 0.83542562, + "num_input_tokens_seen": 126542445, + "router_z_loss_clip": 2.75976562, + "router_z_loss_mlp": 0.26330566, + "step": 5882, + "time_per_iteration": 2.6696343421936035 + }, + { + "auxiliary_loss_clip": 0.01545333, + "auxiliary_loss_mlp": 0.00269124, + "balance_loss_clip": 1.26142883, + "balance_loss_mlp": 0.23803386, + "epoch": 0.3537050954456636, + "flos": 26432336574720.0, + "grad_norm": 44.5138297520715, + "language_loss": 0.76595086, + "learning_rate": 2.9978631098698847e-06, + "loss": 0.78409541, + "num_input_tokens_seen": 126560690, + "router_z_loss_clip": 2.83789062, + "router_z_loss_mlp": 0.31079102, + "step": 5883, + "time_per_iteration": 2.7396907806396484 + }, + { + "auxiliary_loss_clip": 0.01526528, + "auxiliary_loss_mlp": 0.00271174, + "balance_loss_clip": 1.24145472, + "balance_loss_mlp": 0.23936914, + "epoch": 0.3537652186983316, + "flos": 17197153562880.0, + "grad_norm": 193.92812762771175, + "language_loss": 0.86477208, + "learning_rate": 2.9975255673384614e-06, + "loss": 0.88274914, + "num_input_tokens_seen": 126577620, + "router_z_loss_clip": 2.85351562, + "router_z_loss_mlp": 0.31787109, + "step": 5884, + "time_per_iteration": 2.644519090652466 + }, + { + "auxiliary_loss_clip": 0.0152835, + "auxiliary_loss_mlp": 0.00236569, + "balance_loss_clip": 1.24823546, + "balance_loss_mlp": 0.20934188, + "epoch": 0.3538253419509996, + "flos": 19536769929600.0, + "grad_norm": 75.71696170026736, + "language_loss": 0.81557679, + "learning_rate": 2.9971879869807673e-06, + "loss": 0.83322597, + "num_input_tokens_seen": 126596235, + "router_z_loss_clip": 2.80664062, + "router_z_loss_mlp": 0.27233887, + "step": 5885, + "time_per_iteration": 4.2806055545806885 + }, + { + "auxiliary_loss_clip": 0.01525607, + "auxiliary_loss_mlp": 0.0025277, + "balance_loss_clip": 1.24441087, + "balance_loss_mlp": 0.22396913, + "epoch": 0.35388546520366754, + "flos": 12128578335360.0, + "grad_norm": 26.906172481899688, + "language_loss": 0.94384611, + "learning_rate": 2.996850368809606e-06, + "loss": 0.96162993, + "num_input_tokens_seen": 126612830, + "router_z_loss_clip": 2.8125, + "router_z_loss_mlp": 0.2878418, + "step": 5886, + "time_per_iteration": 2.648134231567383 + }, + { + "auxiliary_loss_clip": 0.01517451, + "auxiliary_loss_mlp": 0.00252816, + "balance_loss_clip": 1.24711728, + "balance_loss_mlp": 0.22287108, + "epoch": 0.3539455884563355, + "flos": 19678149861120.0, + "grad_norm": 3.1122315115114896, + "language_loss": 0.86311507, + "learning_rate": 2.9965127128377787e-06, + "loss": 0.88081777, + "num_input_tokens_seen": 126630910, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.29907227, + "step": 5887, + "time_per_iteration": 2.7064616680145264 + }, + { + "auxiliary_loss_clip": 0.01514738, + "auxiliary_loss_mlp": 0.00247199, + "balance_loss_clip": 1.24121308, + "balance_loss_mlp": 0.21761124, + "epoch": 0.35400571170900347, + "flos": 18072045129600.0, + "grad_norm": 8.399843582822472, + "language_loss": 0.72722477, + "learning_rate": 2.996175019078089e-06, + "loss": 0.74484414, + "num_input_tokens_seen": 126648365, + "router_z_loss_clip": 2.73242188, + "router_z_loss_mlp": 0.2956543, + "step": 5888, + "time_per_iteration": 2.6676909923553467 + }, + { + "auxiliary_loss_clip": 0.01538843, + "auxiliary_loss_mlp": 0.00284451, + "balance_loss_clip": 1.26168489, + "balance_loss_mlp": 0.25479209, + "epoch": 0.35406583496167143, + "flos": 26068058795520.0, + "grad_norm": 7.2004686391379655, + "language_loss": 0.84418434, + "learning_rate": 2.9958372875433437e-06, + "loss": 0.86241734, + "num_input_tokens_seen": 126667500, + "router_z_loss_clip": 2.7734375, + "router_z_loss_mlp": 0.296875, + "step": 5889, + "time_per_iteration": 2.6549150943756104 + }, + { + "auxiliary_loss_clip": 0.0155145, + "auxiliary_loss_mlp": 0.0027363, + "balance_loss_clip": 1.27395797, + "balance_loss_mlp": 0.24380338, + "epoch": 0.3541259582143394, + "flos": 19792453916160.0, + "grad_norm": 13.660839148494652, + "language_loss": 0.90698481, + "learning_rate": 2.9954995182463478e-06, + "loss": 0.92523557, + "num_input_tokens_seen": 126686820, + "router_z_loss_clip": 2.77734375, + "router_z_loss_mlp": 0.29821777, + "step": 5890, + "time_per_iteration": 2.715224027633667 + }, + { + "auxiliary_loss_clip": 0.01533069, + "auxiliary_loss_mlp": 0.00242878, + "balance_loss_clip": 1.26127028, + "balance_loss_mlp": 0.21643695, + "epoch": 0.35418608146700736, + "flos": 24022084112640.0, + "grad_norm": 12.527501025519555, + "language_loss": 0.86834407, + "learning_rate": 2.99516171119991e-06, + "loss": 0.88610357, + "num_input_tokens_seen": 126706965, + "router_z_loss_clip": 2.71875, + "router_z_loss_mlp": 0.26391602, + "step": 5891, + "time_per_iteration": 4.073111534118652 + }, + { + "auxiliary_loss_clip": 0.01524487, + "auxiliary_loss_mlp": 0.00282846, + "balance_loss_clip": 1.25589299, + "balance_loss_mlp": 0.25393769, + "epoch": 0.35424620471967533, + "flos": 12385770693120.0, + "grad_norm": 5.573846170037341, + "language_loss": 0.79510474, + "learning_rate": 2.9948238664168415e-06, + "loss": 0.81317806, + "num_input_tokens_seen": 126724015, + "router_z_loss_clip": 2.6875, + "router_z_loss_mlp": 0.2890625, + "step": 5892, + "time_per_iteration": 2.628737449645996 + }, + { + "auxiliary_loss_clip": 0.01523981, + "auxiliary_loss_mlp": 0.00265495, + "balance_loss_clip": 1.2545495, + "balance_loss_mlp": 0.23678914, + "epoch": 0.3543063279723433, + "flos": 19673624747520.0, + "grad_norm": 11.305287881590573, + "language_loss": 0.75058526, + "learning_rate": 2.9944859839099518e-06, + "loss": 0.76848, + "num_input_tokens_seen": 126737565, + "router_z_loss_clip": 2.6953125, + "router_z_loss_mlp": 0.28674316, + "step": 5893, + "time_per_iteration": 2.665471076965332 + }, + { + "auxiliary_loss_clip": 0.0153122, + "auxiliary_loss_mlp": 0.0031037, + "balance_loss_clip": 1.26058686, + "balance_loss_mlp": 0.27928066, + "epoch": 0.35436645122501126, + "flos": 21909208348800.0, + "grad_norm": 47.87487034370348, + "language_loss": 0.75794637, + "learning_rate": 2.9941480636920533e-06, + "loss": 0.7763623, + "num_input_tokens_seen": 126756095, + "router_z_loss_clip": 2.70703125, + "router_z_loss_mlp": 0.31103516, + "step": 5894, + "time_per_iteration": 2.666548252105713 + }, + { + "auxiliary_loss_clip": 0.01540474, + "auxiliary_loss_mlp": 0.00235802, + "balance_loss_clip": 1.27321768, + "balance_loss_mlp": 0.20915824, + "epoch": 0.3544265744776792, + "flos": 21719527603200.0, + "grad_norm": 7.714899172362084, + "language_loss": 0.80217361, + "learning_rate": 2.9938101057759615e-06, + "loss": 0.81993639, + "num_input_tokens_seen": 126775455, + "router_z_loss_clip": 2.67382812, + "router_z_loss_mlp": 0.26611328, + "step": 5895, + "time_per_iteration": 2.64473032951355 + }, + { + "auxiliary_loss_clip": 0.01530376, + "auxiliary_loss_mlp": 0.00287995, + "balance_loss_clip": 1.26146483, + "balance_loss_mlp": 0.26050571, + "epoch": 0.3544866977303472, + "flos": 21213223447680.0, + "grad_norm": 49.095140557841226, + "language_loss": 0.90311062, + "learning_rate": 2.993472110174491e-06, + "loss": 0.92129433, + "num_input_tokens_seen": 126792320, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.27478027, + "step": 5896, + "time_per_iteration": 2.6386702060699463 + }, + { + "auxiliary_loss_clip": 0.01522569, + "auxiliary_loss_mlp": 0.00275233, + "balance_loss_clip": 1.25777912, + "balance_loss_mlp": 0.24968643, + "epoch": 0.35454682098301515, + "flos": 29311402371840.0, + "grad_norm": 2.5200787110755978, + "language_loss": 0.77765691, + "learning_rate": 2.9931340769004576e-06, + "loss": 0.79563498, + "num_input_tokens_seen": 126813680, + "router_z_loss_clip": 2.65039062, + "router_z_loss_mlp": 0.25561523, + "step": 5897, + "time_per_iteration": 4.147968530654907 + }, + { + "auxiliary_loss_clip": 0.01505761, + "auxiliary_loss_mlp": 0.00272337, + "balance_loss_clip": 1.24113894, + "balance_loss_mlp": 0.24522921, + "epoch": 0.3546069442356832, + "flos": 24316587722880.0, + "grad_norm": 8.963501027497745, + "language_loss": 0.87302935, + "learning_rate": 2.9927960059666816e-06, + "loss": 0.89081031, + "num_input_tokens_seen": 126834395, + "router_z_loss_clip": 2.64648438, + "router_z_loss_mlp": 0.27087402, + "step": 5898, + "time_per_iteration": 2.7752861976623535 + }, + { + "auxiliary_loss_clip": 0.01488334, + "auxiliary_loss_mlp": 0.0027671, + "balance_loss_clip": 1.2290833, + "balance_loss_mlp": 0.24869597, + "epoch": 0.35466706748835114, + "flos": 22857285876480.0, + "grad_norm": 20.492287434901915, + "language_loss": 0.81563127, + "learning_rate": 2.9924578973859804e-06, + "loss": 0.8332817, + "num_input_tokens_seen": 126855145, + "router_z_loss_clip": 2.58984375, + "router_z_loss_mlp": 0.28015137, + "step": 5899, + "time_per_iteration": 2.6861534118652344 + }, + { + "auxiliary_loss_clip": 0.01492157, + "auxiliary_loss_mlp": 0.00252968, + "balance_loss_clip": 1.22877693, + "balance_loss_mlp": 0.22532293, + "epoch": 0.3547271907410191, + "flos": 28330107742080.0, + "grad_norm": 3.5163163291122976, + "language_loss": 0.87161309, + "learning_rate": 2.9921197511711763e-06, + "loss": 0.88906437, + "num_input_tokens_seen": 126873790, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.27636719, + "step": 5900, + "time_per_iteration": 2.7171592712402344 + }, + { + "auxiliary_loss_clip": 0.01497941, + "auxiliary_loss_mlp": 0.00296316, + "balance_loss_clip": 1.23404193, + "balance_loss_mlp": 0.26689562, + "epoch": 0.35478731399368707, + "flos": 23514092017920.0, + "grad_norm": 26.47977309138114, + "language_loss": 0.87827921, + "learning_rate": 2.991781567335093e-06, + "loss": 0.89622176, + "num_input_tokens_seen": 126892865, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.29443359, + "step": 5901, + "time_per_iteration": 2.653489351272583 + }, + { + "auxiliary_loss_clip": 0.014964, + "auxiliary_loss_mlp": 0.00278671, + "balance_loss_clip": 1.22946513, + "balance_loss_mlp": 0.25141972, + "epoch": 0.35484743724635504, + "flos": 18624315715200.0, + "grad_norm": 12.298874308908644, + "language_loss": 0.82703853, + "learning_rate": 2.9914433458905525e-06, + "loss": 0.84478927, + "num_input_tokens_seen": 126911935, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.27258301, + "step": 5902, + "time_per_iteration": 2.7079484462738037 + }, + { + "auxiliary_loss_clip": 0.01481276, + "auxiliary_loss_mlp": 0.00267821, + "balance_loss_clip": 1.22052467, + "balance_loss_mlp": 0.242167, + "epoch": 0.354907560499023, + "flos": 17384499924480.0, + "grad_norm": 1.9705534189789042, + "language_loss": 0.79117811, + "learning_rate": 2.991105086850381e-06, + "loss": 0.80866909, + "num_input_tokens_seen": 126930040, + "router_z_loss_clip": 2.60742188, + "router_z_loss_mlp": 0.25610352, + "step": 5903, + "time_per_iteration": 2.698075771331787 + }, + { + "auxiliary_loss_clip": 0.01478117, + "auxiliary_loss_mlp": 0.00312131, + "balance_loss_clip": 1.21451163, + "balance_loss_mlp": 0.2839019, + "epoch": 0.35496768375169097, + "flos": 19208546426880.0, + "grad_norm": 8.04838683859432, + "language_loss": 0.84454334, + "learning_rate": 2.9907667902274053e-06, + "loss": 0.86244583, + "num_input_tokens_seen": 126948390, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.28222656, + "step": 5904, + "time_per_iteration": 2.650801420211792 + }, + { + "auxiliary_loss_clip": 0.01483079, + "auxiliary_loss_mlp": 0.00277839, + "balance_loss_clip": 1.22035289, + "balance_loss_mlp": 0.24998006, + "epoch": 0.35502780700435893, + "flos": 18332792933760.0, + "grad_norm": 7.349125307122319, + "language_loss": 0.85888022, + "learning_rate": 2.9904284560344536e-06, + "loss": 0.8764894, + "num_input_tokens_seen": 126964905, + "router_z_loss_clip": 2.63085938, + "router_z_loss_mlp": 0.27893066, + "step": 5905, + "time_per_iteration": 2.675139904022217 + }, + { + "auxiliary_loss_clip": 0.01488305, + "auxiliary_loss_mlp": 0.00282415, + "balance_loss_clip": 1.23508906, + "balance_loss_mlp": 0.25539032, + "epoch": 0.3550879302570269, + "flos": 15448555578240.0, + "grad_norm": 34.00735744856272, + "language_loss": 0.77426964, + "learning_rate": 2.990090084284356e-06, + "loss": 0.79197681, + "num_input_tokens_seen": 126982000, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.2701416, + "step": 5906, + "time_per_iteration": 2.637699604034424 + }, + { + "auxiliary_loss_clip": 0.01481577, + "auxiliary_loss_mlp": 0.003166, + "balance_loss_clip": 1.21661854, + "balance_loss_mlp": 0.28683341, + "epoch": 0.35514805350969486, + "flos": 21979197999360.0, + "grad_norm": 6413.6022757304345, + "language_loss": 0.82373464, + "learning_rate": 2.9897516749899426e-06, + "loss": 0.84171635, + "num_input_tokens_seen": 126998390, + "router_z_loss_clip": 2.6484375, + "router_z_loss_mlp": 0.29736328, + "step": 5907, + "time_per_iteration": 2.667628288269043 + }, + { + "auxiliary_loss_clip": 0.01495828, + "auxiliary_loss_mlp": 0.00292535, + "balance_loss_clip": 1.23082876, + "balance_loss_mlp": 0.26578468, + "epoch": 0.3552081767623628, + "flos": 29861949104640.0, + "grad_norm": 10.775576147939987, + "language_loss": 0.81064343, + "learning_rate": 2.989413228164047e-06, + "loss": 0.82852709, + "num_input_tokens_seen": 127020220, + "router_z_loss_clip": 2.65039062, + "router_z_loss_mlp": 0.26733398, + "step": 5908, + "time_per_iteration": 2.8001832962036133 + }, + { + "auxiliary_loss_clip": 0.01490273, + "auxiliary_loss_mlp": 0.00319109, + "balance_loss_clip": 1.22918749, + "balance_loss_mlp": 0.29089257, + "epoch": 0.3552683000150308, + "flos": 26432264747520.0, + "grad_norm": 3.821765736810609, + "language_loss": 0.76175368, + "learning_rate": 2.989074743819502e-06, + "loss": 0.7798475, + "num_input_tokens_seen": 127038585, + "router_z_loss_clip": 2.61132812, + "router_z_loss_mlp": 0.28210449, + "step": 5909, + "time_per_iteration": 2.8730227947235107 + }, + { + "auxiliary_loss_clip": 0.01492003, + "auxiliary_loss_mlp": 0.00299544, + "balance_loss_clip": 1.23322582, + "balance_loss_mlp": 0.27371129, + "epoch": 0.35532842326769876, + "flos": 19785989468160.0, + "grad_norm": 7.041222316521886, + "language_loss": 0.8666569, + "learning_rate": 2.988736221969144e-06, + "loss": 0.88457239, + "num_input_tokens_seen": 127056215, + "router_z_loss_clip": 2.59179688, + "router_z_loss_mlp": 0.25866699, + "step": 5910, + "time_per_iteration": 2.6895179748535156 + }, + { + "auxiliary_loss_clip": 0.01502458, + "auxiliary_loss_mlp": 0.00352541, + "balance_loss_clip": 1.23262894, + "balance_loss_mlp": 0.32080752, + "epoch": 0.3553885465203668, + "flos": 17239277237760.0, + "grad_norm": 2.049628425571731, + "language_loss": 0.80072427, + "learning_rate": 2.98839766262581e-06, + "loss": 0.81927431, + "num_input_tokens_seen": 127075825, + "router_z_loss_clip": 2.69921875, + "router_z_loss_mlp": 0.31738281, + "step": 5911, + "time_per_iteration": 2.6407647132873535 + }, + { + "auxiliary_loss_clip": 0.01492864, + "auxiliary_loss_mlp": 0.00318901, + "balance_loss_clip": 1.23284876, + "balance_loss_mlp": 0.29172164, + "epoch": 0.35544866977303474, + "flos": 14934350430720.0, + "grad_norm": 46.99077634321067, + "language_loss": 0.94998634, + "learning_rate": 2.9880590658023366e-06, + "loss": 0.96810395, + "num_input_tokens_seen": 127091205, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.27185059, + "step": 5912, + "time_per_iteration": 2.5882568359375 + }, + { + "auxiliary_loss_clip": 0.01499189, + "auxiliary_loss_mlp": 0.003145, + "balance_loss_clip": 1.23575735, + "balance_loss_mlp": 0.28596175, + "epoch": 0.3555087930257027, + "flos": 19756040503680.0, + "grad_norm": 7.2213600463361125, + "language_loss": 0.84271538, + "learning_rate": 2.9877204315115646e-06, + "loss": 0.86085224, + "num_input_tokens_seen": 127109210, + "router_z_loss_clip": 2.63476562, + "router_z_loss_mlp": 0.28552246, + "step": 5913, + "time_per_iteration": 2.637784481048584 + }, + { + "auxiliary_loss_clip": 0.01506802, + "auxiliary_loss_mlp": 0.00315498, + "balance_loss_clip": 1.24323249, + "balance_loss_mlp": 0.28698277, + "epoch": 0.3555689162783707, + "flos": 21068252156160.0, + "grad_norm": 2.2518926297441815, + "language_loss": 0.86649996, + "learning_rate": 2.9873817597663353e-06, + "loss": 0.88472301, + "num_input_tokens_seen": 127128400, + "router_z_loss_clip": 2.6328125, + "router_z_loss_mlp": 0.28515625, + "step": 5914, + "time_per_iteration": 2.625899314880371 + }, + { + "auxiliary_loss_clip": 0.01509382, + "auxiliary_loss_mlp": 0.00303621, + "balance_loss_clip": 1.24217045, + "balance_loss_mlp": 0.27638179, + "epoch": 0.35562903953103864, + "flos": 33069633454080.0, + "grad_norm": 142.43572518494807, + "language_loss": 0.79166162, + "learning_rate": 2.98704305057949e-06, + "loss": 0.80979168, + "num_input_tokens_seen": 127149965, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.27258301, + "step": 5915, + "time_per_iteration": 2.737635612487793 + }, + { + "auxiliary_loss_clip": 0.015052, + "auxiliary_loss_mlp": 0.00310532, + "balance_loss_clip": 1.24310589, + "balance_loss_mlp": 0.28099185, + "epoch": 0.3556891627837066, + "flos": 20557853850240.0, + "grad_norm": 7.124030078357069, + "language_loss": 0.8246547, + "learning_rate": 2.9867043039638737e-06, + "loss": 0.84281206, + "num_input_tokens_seen": 127169865, + "router_z_loss_clip": 2.61914062, + "router_z_loss_mlp": 0.2956543, + "step": 5916, + "time_per_iteration": 2.6383414268493652 + }, + { + "auxiliary_loss_clip": 0.01522837, + "auxiliary_loss_mlp": 0.0032505, + "balance_loss_clip": 1.25394535, + "balance_loss_mlp": 0.29751289, + "epoch": 0.35574928603637457, + "flos": 20703327932160.0, + "grad_norm": 116.98635109976172, + "language_loss": 0.95379001, + "learning_rate": 2.986365519932332e-06, + "loss": 0.97226888, + "num_input_tokens_seen": 127188075, + "router_z_loss_clip": 2.68554688, + "router_z_loss_mlp": 0.27539062, + "step": 5917, + "time_per_iteration": 2.6645665168762207 + }, + { + "auxiliary_loss_clip": 0.0151482, + "auxiliary_loss_mlp": 0.00305985, + "balance_loss_clip": 1.24826586, + "balance_loss_mlp": 0.27689826, + "epoch": 0.35580940928904253, + "flos": 15194595444480.0, + "grad_norm": 7.318126587713542, + "language_loss": 0.85146189, + "learning_rate": 2.98602669849771e-06, + "loss": 0.86966997, + "num_input_tokens_seen": 127206065, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.29089355, + "step": 5918, + "time_per_iteration": 2.672731637954712 + }, + { + "auxiliary_loss_clip": 0.01463757, + "auxiliary_loss_mlp": 0.00105356, + "balance_loss_clip": 1.26793814, + "balance_loss_mlp": 0.09615277, + "epoch": 0.3558695325417105, + "flos": 58639145431680.0, + "grad_norm": 30.011511032784316, + "language_loss": 0.63664269, + "learning_rate": 2.985687839672857e-06, + "loss": 0.65233386, + "num_input_tokens_seen": 127257885, + "router_z_loss_clip": 1.953125, + "router_z_loss_mlp": 0.09179688, + "step": 5919, + "time_per_iteration": 2.904599189758301 + }, + { + "auxiliary_loss_clip": 0.01518291, + "auxiliary_loss_mlp": 0.00327551, + "balance_loss_clip": 1.24954569, + "balance_loss_mlp": 0.29870194, + "epoch": 0.35592965579437846, + "flos": 22018233104640.0, + "grad_norm": 13.752898540590467, + "language_loss": 0.82794708, + "learning_rate": 2.9853489434706223e-06, + "loss": 0.84640545, + "num_input_tokens_seen": 127275550, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.28881836, + "step": 5920, + "time_per_iteration": 2.6655349731445312 + }, + { + "auxiliary_loss_clip": 0.0152425, + "auxiliary_loss_mlp": 0.00325258, + "balance_loss_clip": 1.25616241, + "balance_loss_mlp": 0.29640907, + "epoch": 0.35598977904704643, + "flos": 23367684182400.0, + "grad_norm": 15145.141574004392, + "language_loss": 0.84300882, + "learning_rate": 2.985010009903857e-06, + "loss": 0.8615039, + "num_input_tokens_seen": 127295110, + "router_z_loss_clip": 2.68164062, + "router_z_loss_mlp": 0.28857422, + "step": 5921, + "time_per_iteration": 2.755725383758545 + }, + { + "auxiliary_loss_clip": 0.01525137, + "auxiliary_loss_mlp": 0.00286204, + "balance_loss_clip": 1.25587773, + "balance_loss_mlp": 0.25587749, + "epoch": 0.3560499022997144, + "flos": 17785334770560.0, + "grad_norm": 26.24091261076865, + "language_loss": 0.77522993, + "learning_rate": 2.9846710389854133e-06, + "loss": 0.79334342, + "num_input_tokens_seen": 127312865, + "router_z_loss_clip": 2.69140625, + "router_z_loss_mlp": 0.30297852, + "step": 5922, + "time_per_iteration": 2.7211203575134277 + }, + { + "auxiliary_loss_clip": 0.01518421, + "auxiliary_loss_mlp": 0.00292639, + "balance_loss_clip": 1.25347948, + "balance_loss_mlp": 0.26443449, + "epoch": 0.35611002555238236, + "flos": 20740459616640.0, + "grad_norm": 41.33323798568489, + "language_loss": 0.85017145, + "learning_rate": 2.9843320307281454e-06, + "loss": 0.86828208, + "num_input_tokens_seen": 127331710, + "router_z_loss_clip": 2.6484375, + "router_z_loss_mlp": 0.28186035, + "step": 5923, + "time_per_iteration": 2.7301626205444336 + }, + { + "auxiliary_loss_clip": 0.01515931, + "auxiliary_loss_mlp": 0.002989, + "balance_loss_clip": 1.25306797, + "balance_loss_mlp": 0.27038473, + "epoch": 0.3561701488050504, + "flos": 19462219251840.0, + "grad_norm": 25.991058511060764, + "language_loss": 0.90702826, + "learning_rate": 2.983992985144908e-06, + "loss": 0.92517662, + "num_input_tokens_seen": 127350950, + "router_z_loss_clip": 2.62890625, + "router_z_loss_mlp": 0.28515625, + "step": 5924, + "time_per_iteration": 2.684992551803589 + }, + { + "auxiliary_loss_clip": 0.01528924, + "auxiliary_loss_mlp": 0.00290892, + "balance_loss_clip": 1.25990152, + "balance_loss_mlp": 0.26273513, + "epoch": 0.35623027205771834, + "flos": 30774942023040.0, + "grad_norm": 2.2945950548981013, + "language_loss": 0.85980755, + "learning_rate": 2.9836539022485578e-06, + "loss": 0.87800574, + "num_input_tokens_seen": 127369385, + "router_z_loss_clip": 2.69140625, + "router_z_loss_mlp": 0.28112793, + "step": 5925, + "time_per_iteration": 2.750714063644409 + }, + { + "auxiliary_loss_clip": 0.01510387, + "auxiliary_loss_mlp": 0.00312726, + "balance_loss_clip": 1.24365389, + "balance_loss_mlp": 0.28254217, + "epoch": 0.3562903953103863, + "flos": 16981079299200.0, + "grad_norm": 23.117762382951124, + "language_loss": 0.83298767, + "learning_rate": 2.9833147820519535e-06, + "loss": 0.85121882, + "num_input_tokens_seen": 127386965, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.30187988, + "step": 5926, + "time_per_iteration": 2.774214744567871 + }, + { + "auxiliary_loss_clip": 0.01515393, + "auxiliary_loss_mlp": 0.00285606, + "balance_loss_clip": 1.24815214, + "balance_loss_mlp": 0.25738871, + "epoch": 0.3563505185630543, + "flos": 23839837482240.0, + "grad_norm": 16.390731639523615, + "language_loss": 0.77688658, + "learning_rate": 2.9829756245679544e-06, + "loss": 0.7948966, + "num_input_tokens_seen": 127406075, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.28210449, + "step": 5927, + "time_per_iteration": 4.455687522888184 + }, + { + "auxiliary_loss_clip": 0.01512291, + "auxiliary_loss_mlp": 0.00292783, + "balance_loss_clip": 1.2481842, + "balance_loss_mlp": 0.26590082, + "epoch": 0.35641064181572224, + "flos": 22273450214400.0, + "grad_norm": 35.281872906752476, + "language_loss": 0.85092533, + "learning_rate": 2.9826364298094212e-06, + "loss": 0.86897612, + "num_input_tokens_seen": 127425350, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.26867676, + "step": 5928, + "time_per_iteration": 4.368957281112671 + }, + { + "auxiliary_loss_clip": 0.01508821, + "auxiliary_loss_mlp": 0.002834, + "balance_loss_clip": 1.24678826, + "balance_loss_mlp": 0.25416991, + "epoch": 0.3564707650683902, + "flos": 23001251587200.0, + "grad_norm": 4.077640733454962, + "language_loss": 0.88193625, + "learning_rate": 2.982297197789215e-06, + "loss": 0.89985847, + "num_input_tokens_seen": 127446335, + "router_z_loss_clip": 2.625, + "router_z_loss_mlp": 0.29223633, + "step": 5929, + "time_per_iteration": 2.848938226699829 + }, + { + "auxiliary_loss_clip": 0.01509039, + "auxiliary_loss_mlp": 0.0025333, + "balance_loss_clip": 1.24648726, + "balance_loss_mlp": 0.22618556, + "epoch": 0.35653088832105817, + "flos": 14684268965760.0, + "grad_norm": 24.713996412558508, + "language_loss": 0.78719914, + "learning_rate": 2.981957928520201e-06, + "loss": 0.8048228, + "num_input_tokens_seen": 127462795, + "router_z_loss_clip": 2.62695312, + "router_z_loss_mlp": 0.27124023, + "step": 5930, + "time_per_iteration": 2.703510046005249 + }, + { + "auxiliary_loss_clip": 0.01508001, + "auxiliary_loss_mlp": 0.00286483, + "balance_loss_clip": 1.2392621, + "balance_loss_mlp": 0.25555986, + "epoch": 0.35659101157372614, + "flos": 23477068074240.0, + "grad_norm": 36.41178767717122, + "language_loss": 0.75772732, + "learning_rate": 2.981618622015244e-06, + "loss": 0.7756722, + "num_input_tokens_seen": 127482675, + "router_z_loss_clip": 2.68554688, + "router_z_loss_mlp": 0.3092041, + "step": 5931, + "time_per_iteration": 2.8686869144439697 + }, + { + "auxiliary_loss_clip": 0.01488728, + "auxiliary_loss_mlp": 0.00247636, + "balance_loss_clip": 1.23041344, + "balance_loss_mlp": 0.22191074, + "epoch": 0.3566511348263941, + "flos": 26578672583040.0, + "grad_norm": 3.629516985500712, + "language_loss": 0.74720758, + "learning_rate": 2.981279278287211e-06, + "loss": 0.76457125, + "num_input_tokens_seen": 127502275, + "router_z_loss_clip": 2.58398438, + "router_z_loss_mlp": 0.25708008, + "step": 5932, + "time_per_iteration": 2.7950844764709473 + }, + { + "auxiliary_loss_clip": 0.01511044, + "auxiliary_loss_mlp": 0.00274373, + "balance_loss_clip": 1.24824548, + "balance_loss_mlp": 0.24738401, + "epoch": 0.35671125807906207, + "flos": 13115008609920.0, + "grad_norm": 515.1582088038905, + "language_loss": 0.89982677, + "learning_rate": 2.980939897348969e-06, + "loss": 0.91768092, + "num_input_tokens_seen": 127520195, + "router_z_loss_clip": 2.63085938, + "router_z_loss_mlp": 0.27001953, + "step": 5933, + "time_per_iteration": 4.237018823623657 + }, + { + "auxiliary_loss_clip": 0.01497943, + "auxiliary_loss_mlp": 0.00295381, + "balance_loss_clip": 1.23597634, + "balance_loss_mlp": 0.26600745, + "epoch": 0.35677138133173003, + "flos": 33000577557120.0, + "grad_norm": 10.426071311582003, + "language_loss": 0.75285125, + "learning_rate": 2.980600479213388e-06, + "loss": 0.7707845, + "num_input_tokens_seen": 127544495, + "router_z_loss_clip": 2.62109375, + "router_z_loss_mlp": 0.29370117, + "step": 5934, + "time_per_iteration": 2.848538637161255 + }, + { + "auxiliary_loss_clip": 0.01504644, + "auxiliary_loss_mlp": 0.00277204, + "balance_loss_clip": 1.23815691, + "balance_loss_mlp": 0.24721113, + "epoch": 0.356831504584398, + "flos": 20777842696320.0, + "grad_norm": 16.83274762527767, + "language_loss": 0.80124712, + "learning_rate": 2.9802610238933384e-06, + "loss": 0.81906563, + "num_input_tokens_seen": 127563810, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.29992676, + "step": 5935, + "time_per_iteration": 2.8459441661834717 + }, + { + "auxiliary_loss_clip": 0.01493347, + "auxiliary_loss_mlp": 0.00275082, + "balance_loss_clip": 1.23364365, + "balance_loss_mlp": 0.24649569, + "epoch": 0.35689162783706596, + "flos": 12165566365440.0, + "grad_norm": 11.033408738761537, + "language_loss": 0.86630678, + "learning_rate": 2.979921531401692e-06, + "loss": 0.88399112, + "num_input_tokens_seen": 127579065, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.28564453, + "step": 5936, + "time_per_iteration": 2.85355544090271 + }, + { + "auxiliary_loss_clip": 0.01491076, + "auxiliary_loss_mlp": 0.00272285, + "balance_loss_clip": 1.22679257, + "balance_loss_mlp": 0.24514139, + "epoch": 0.356951751089734, + "flos": 23841489507840.0, + "grad_norm": 8.822662366218662, + "language_loss": 0.72147751, + "learning_rate": 2.9795820017513242e-06, + "loss": 0.73911113, + "num_input_tokens_seen": 127599105, + "router_z_loss_clip": 2.64648438, + "router_z_loss_mlp": 0.27160645, + "step": 5937, + "time_per_iteration": 2.6971676349639893 + }, + { + "auxiliary_loss_clip": 0.01491748, + "auxiliary_loss_mlp": 0.00254721, + "balance_loss_clip": 1.22994924, + "balance_loss_mlp": 0.2283276, + "epoch": 0.35701187434240195, + "flos": 11722176881280.0, + "grad_norm": 299.6733573780944, + "language_loss": 0.89531302, + "learning_rate": 2.9792424349551073e-06, + "loss": 0.91277772, + "num_input_tokens_seen": 127614940, + "router_z_loss_clip": 2.61914062, + "router_z_loss_mlp": 0.26379395, + "step": 5938, + "time_per_iteration": 2.6800901889801025 + }, + { + "auxiliary_loss_clip": 0.01499365, + "auxiliary_loss_mlp": 0.00247942, + "balance_loss_clip": 1.23788393, + "balance_loss_mlp": 0.22226456, + "epoch": 0.3570719975950699, + "flos": 24898879100160.0, + "grad_norm": 35.35227962056127, + "language_loss": 0.864519, + "learning_rate": 2.9789028310259202e-06, + "loss": 0.88199198, + "num_input_tokens_seen": 127634960, + "router_z_loss_clip": 2.61523438, + "router_z_loss_mlp": 0.25671387, + "step": 5939, + "time_per_iteration": 4.156993865966797 + }, + { + "auxiliary_loss_clip": 0.0148571, + "auxiliary_loss_mlp": 0.00295593, + "balance_loss_clip": 1.21911299, + "balance_loss_mlp": 0.26582688, + "epoch": 0.3571321208477379, + "flos": 25994836920960.0, + "grad_norm": 8.680842744578293, + "language_loss": 0.87455285, + "learning_rate": 2.9785631899766395e-06, + "loss": 0.89236593, + "num_input_tokens_seen": 127654545, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.29772949, + "step": 5940, + "time_per_iteration": 2.6442272663116455 + }, + { + "auxiliary_loss_clip": 0.01493829, + "auxiliary_loss_mlp": 0.00326047, + "balance_loss_clip": 1.22956014, + "balance_loss_mlp": 0.29254931, + "epoch": 0.35719224410040584, + "flos": 14501663199360.0, + "grad_norm": 18.897314772315582, + "language_loss": 0.80665338, + "learning_rate": 2.9782235118201443e-06, + "loss": 0.82485211, + "num_input_tokens_seen": 127672320, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.33496094, + "step": 5941, + "time_per_iteration": 2.6456472873687744 + }, + { + "auxiliary_loss_clip": 0.01500958, + "auxiliary_loss_mlp": 0.00253751, + "balance_loss_clip": 1.23690462, + "balance_loss_mlp": 0.22751309, + "epoch": 0.3572523673530738, + "flos": 31175453646720.0, + "grad_norm": 29.30628975145058, + "language_loss": 0.76966882, + "learning_rate": 2.9778837965693154e-06, + "loss": 0.78721589, + "num_input_tokens_seen": 127693315, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.2623291, + "step": 5942, + "time_per_iteration": 2.6903724670410156 + }, + { + "auxiliary_loss_clip": 0.01492028, + "auxiliary_loss_mlp": 0.0027162, + "balance_loss_clip": 1.22857308, + "balance_loss_mlp": 0.24370095, + "epoch": 0.3573124906057418, + "flos": 15851976203520.0, + "grad_norm": 32.0714690750336, + "language_loss": 0.82299042, + "learning_rate": 2.9775440442370354e-06, + "loss": 0.8406269, + "num_input_tokens_seen": 127711570, + "router_z_loss_clip": 2.6328125, + "router_z_loss_mlp": 0.27880859, + "step": 5943, + "time_per_iteration": 2.604304313659668 + }, + { + "auxiliary_loss_clip": 0.01496204, + "auxiliary_loss_mlp": 0.00088257, + "balance_loss_clip": 1.30519748, + "balance_loss_mlp": 0.07748039, + "epoch": 0.35737261385840974, + "flos": 60822729118080.0, + "grad_norm": 0.7988660221409314, + "language_loss": 0.60460901, + "learning_rate": 2.9772042548361867e-06, + "loss": 0.6204536, + "num_input_tokens_seen": 127772475, + "router_z_loss_clip": 1.90625, + "router_z_loss_mlp": 0.10791016, + "step": 5944, + "time_per_iteration": 3.2205090522766113 + }, + { + "auxiliary_loss_clip": 0.01493226, + "auxiliary_loss_mlp": 0.00271535, + "balance_loss_clip": 1.23329306, + "balance_loss_mlp": 0.24228139, + "epoch": 0.3574327371110777, + "flos": 18843765857280.0, + "grad_norm": 31.55290877374383, + "language_loss": 0.78670406, + "learning_rate": 2.976864428379655e-06, + "loss": 0.80435169, + "num_input_tokens_seen": 127790940, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.29260254, + "step": 5945, + "time_per_iteration": 2.6056289672851562 + }, + { + "auxiliary_loss_clip": 0.01501848, + "auxiliary_loss_mlp": 0.00269551, + "balance_loss_clip": 1.24379051, + "balance_loss_mlp": 0.24033329, + "epoch": 0.35749286036374567, + "flos": 23549679417600.0, + "grad_norm": 2.982838781620009, + "language_loss": 0.85974407, + "learning_rate": 2.976524564880326e-06, + "loss": 0.87745804, + "num_input_tokens_seen": 127808275, + "router_z_loss_clip": 2.578125, + "router_z_loss_mlp": 0.29187012, + "step": 5946, + "time_per_iteration": 2.6423346996307373 + }, + { + "auxiliary_loss_clip": 0.01502911, + "auxiliary_loss_mlp": 0.00255238, + "balance_loss_clip": 1.23779571, + "balance_loss_mlp": 0.22781941, + "epoch": 0.35755298361641363, + "flos": 21105491581440.0, + "grad_norm": 4.986826558578387, + "language_loss": 0.7515285, + "learning_rate": 2.9761846643510882e-06, + "loss": 0.76911002, + "num_input_tokens_seen": 127828840, + "router_z_loss_clip": 2.65234375, + "router_z_loss_mlp": 0.27429199, + "step": 5947, + "time_per_iteration": 2.65560245513916 + }, + { + "auxiliary_loss_clip": 0.01502512, + "auxiliary_loss_mlp": 0.00249347, + "balance_loss_clip": 1.24339724, + "balance_loss_mlp": 0.22227435, + "epoch": 0.3576131068690816, + "flos": 19245031666560.0, + "grad_norm": 1757.1775610805526, + "language_loss": 0.81522727, + "learning_rate": 2.9758447268048297e-06, + "loss": 0.83274585, + "num_input_tokens_seen": 127846240, + "router_z_loss_clip": 2.58789062, + "router_z_loss_mlp": 0.27087402, + "step": 5948, + "time_per_iteration": 2.669339656829834 + }, + { + "auxiliary_loss_clip": 0.01500312, + "auxiliary_loss_mlp": 0.00276958, + "balance_loss_clip": 1.24172616, + "balance_loss_mlp": 0.2490395, + "epoch": 0.35767323012174956, + "flos": 28654703971200.0, + "grad_norm": 2.3119903943017728, + "language_loss": 0.80437452, + "learning_rate": 2.9755047522544415e-06, + "loss": 0.82214725, + "num_input_tokens_seen": 127866880, + "router_z_loss_clip": 2.58984375, + "router_z_loss_mlp": 0.27929688, + "step": 5949, + "time_per_iteration": 2.7328648567199707 + }, + { + "auxiliary_loss_clip": 0.0150381, + "auxiliary_loss_mlp": 0.00271933, + "balance_loss_clip": 1.24194205, + "balance_loss_mlp": 0.24407417, + "epoch": 0.35773335337441753, + "flos": 17085363459840.0, + "grad_norm": 23.091905183980128, + "language_loss": 0.84887576, + "learning_rate": 2.9751647407128154e-06, + "loss": 0.86663318, + "num_input_tokens_seen": 127883560, + "router_z_loss_clip": 2.6171875, + "router_z_loss_mlp": 0.27832031, + "step": 5950, + "time_per_iteration": 2.6642158031463623 + }, + { + "auxiliary_loss_clip": 0.01499182, + "auxiliary_loss_mlp": 0.00276289, + "balance_loss_clip": 1.23775434, + "balance_loss_mlp": 0.24770282, + "epoch": 0.35779347662708555, + "flos": 15888605097600.0, + "grad_norm": 213.36166011452516, + "language_loss": 0.81946564, + "learning_rate": 2.9748246921928445e-06, + "loss": 0.83722031, + "num_input_tokens_seen": 127902330, + "router_z_loss_clip": 2.6171875, + "router_z_loss_mlp": 0.28588867, + "step": 5951, + "time_per_iteration": 2.6741888523101807 + }, + { + "auxiliary_loss_clip": 0.01505622, + "auxiliary_loss_mlp": 0.00275568, + "balance_loss_clip": 1.23940623, + "balance_loss_mlp": 0.24515802, + "epoch": 0.3578535998797535, + "flos": 28658834035200.0, + "grad_norm": 299.2720347104936, + "language_loss": 0.79243314, + "learning_rate": 2.9744846067074236e-06, + "loss": 0.81024504, + "num_input_tokens_seen": 127922325, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.30407715, + "step": 5952, + "time_per_iteration": 2.8111579418182373 + }, + { + "auxiliary_loss_clip": 0.0150193, + "auxiliary_loss_mlp": 0.00269252, + "balance_loss_clip": 1.24150717, + "balance_loss_mlp": 0.24126138, + "epoch": 0.3579137231324215, + "flos": 37852432076160.0, + "grad_norm": 765.0293157213974, + "language_loss": 0.75548947, + "learning_rate": 2.974144484269449e-06, + "loss": 0.77320129, + "num_input_tokens_seen": 127942635, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.28015137, + "step": 5953, + "time_per_iteration": 2.8341546058654785 + }, + { + "auxiliary_loss_clip": 0.01498873, + "auxiliary_loss_mlp": 0.00273169, + "balance_loss_clip": 1.23718941, + "balance_loss_mlp": 0.24581094, + "epoch": 0.35797384638508944, + "flos": 22346851656960.0, + "grad_norm": 2.5389744417506694, + "language_loss": 0.72711557, + "learning_rate": 2.9738043248918175e-06, + "loss": 0.74483603, + "num_input_tokens_seen": 127962520, + "router_z_loss_clip": 2.6171875, + "router_z_loss_mlp": 0.2734375, + "step": 5954, + "time_per_iteration": 2.7336812019348145 + }, + { + "auxiliary_loss_clip": 0.01500985, + "auxiliary_loss_mlp": 0.00258639, + "balance_loss_clip": 1.24401653, + "balance_loss_mlp": 0.2316974, + "epoch": 0.3580339696377574, + "flos": 13589711775360.0, + "grad_norm": 9.547787503987697, + "language_loss": 0.83188701, + "learning_rate": 2.9734641285874282e-06, + "loss": 0.84948313, + "num_input_tokens_seen": 127981180, + "router_z_loss_clip": 2.56640625, + "router_z_loss_mlp": 0.26940918, + "step": 5955, + "time_per_iteration": 2.6379752159118652 + }, + { + "auxiliary_loss_clip": 0.01494611, + "auxiliary_loss_mlp": 0.00250133, + "balance_loss_clip": 1.23729324, + "balance_loss_mlp": 0.22298923, + "epoch": 0.3580940928904254, + "flos": 23768231719680.0, + "grad_norm": 63.083960422439176, + "language_loss": 0.83031559, + "learning_rate": 2.973123895369182e-06, + "loss": 0.84776306, + "num_input_tokens_seen": 127999725, + "router_z_loss_clip": 2.57226562, + "router_z_loss_mlp": 0.27124023, + "step": 5956, + "time_per_iteration": 2.688478469848633 + }, + { + "auxiliary_loss_clip": 0.01490946, + "auxiliary_loss_mlp": 0.00221116, + "balance_loss_clip": 1.23601735, + "balance_loss_mlp": 0.19626084, + "epoch": 0.35815421614309334, + "flos": 19463871277440.0, + "grad_norm": 6.554604328044441, + "language_loss": 0.79953784, + "learning_rate": 2.9727836252499805e-06, + "loss": 0.81665844, + "num_input_tokens_seen": 128018885, + "router_z_loss_clip": 2.54492188, + "router_z_loss_mlp": 0.24841309, + "step": 5957, + "time_per_iteration": 2.6196229457855225 + }, + { + "auxiliary_loss_clip": 0.0150473, + "auxiliary_loss_mlp": 0.00272438, + "balance_loss_clip": 1.24173033, + "balance_loss_mlp": 0.24566346, + "epoch": 0.3582143393957613, + "flos": 23368186972800.0, + "grad_norm": 23.446359030780407, + "language_loss": 0.82114965, + "learning_rate": 2.972443318242726e-06, + "loss": 0.83892131, + "num_input_tokens_seen": 128037875, + "router_z_loss_clip": 2.62890625, + "router_z_loss_mlp": 0.2677002, + "step": 5958, + "time_per_iteration": 2.6661384105682373 + }, + { + "auxiliary_loss_clip": 0.0148663, + "auxiliary_loss_mlp": 0.00242682, + "balance_loss_clip": 1.2306273, + "balance_loss_mlp": 0.21924575, + "epoch": 0.35827446264842927, + "flos": 26323275905280.0, + "grad_norm": 1.7229376919654875, + "language_loss": 0.9365077, + "learning_rate": 2.972102974360324e-06, + "loss": 0.95380086, + "num_input_tokens_seen": 128056045, + "router_z_loss_clip": 2.56445312, + "router_z_loss_mlp": 0.23425293, + "step": 5959, + "time_per_iteration": 2.7513198852539062 + }, + { + "auxiliary_loss_clip": 0.01482245, + "auxiliary_loss_mlp": 0.00215151, + "balance_loss_clip": 1.22549295, + "balance_loss_mlp": 0.18981871, + "epoch": 0.35833458590109724, + "flos": 30446610779520.0, + "grad_norm": 6.090088655960469, + "language_loss": 0.65056479, + "learning_rate": 2.971762593615679e-06, + "loss": 0.66753876, + "num_input_tokens_seen": 128077815, + "router_z_loss_clip": 2.56835938, + "router_z_loss_mlp": 0.25354004, + "step": 5960, + "time_per_iteration": 2.7185463905334473 + }, + { + "auxiliary_loss_clip": 0.01473336, + "auxiliary_loss_mlp": 0.00237406, + "balance_loss_clip": 1.2193476, + "balance_loss_mlp": 0.20982131, + "epoch": 0.3583947091537652, + "flos": 14829886702080.0, + "grad_norm": 11.16089893860679, + "language_loss": 0.85702682, + "learning_rate": 2.9714221760216993e-06, + "loss": 0.8741343, + "num_input_tokens_seen": 128095460, + "router_z_loss_clip": 2.54101562, + "router_z_loss_mlp": 0.27587891, + "step": 5961, + "time_per_iteration": 2.621490001678467 + }, + { + "auxiliary_loss_clip": 0.01469958, + "auxiliary_loss_mlp": 0.00256717, + "balance_loss_clip": 1.21490967, + "balance_loss_mlp": 0.22981113, + "epoch": 0.35845483240643317, + "flos": 34240644743040.0, + "grad_norm": 19.03233312902429, + "language_loss": 0.78469002, + "learning_rate": 2.971081721591294e-06, + "loss": 0.80195677, + "num_input_tokens_seen": 128118605, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.26904297, + "step": 5962, + "time_per_iteration": 2.768618106842041 + }, + { + "auxiliary_loss_clip": 0.01465514, + "auxiliary_loss_mlp": 0.00233435, + "balance_loss_clip": 1.21321177, + "balance_loss_mlp": 0.20667244, + "epoch": 0.35851495565910113, + "flos": 20960089326720.0, + "grad_norm": 8.948067264379377, + "language_loss": 0.81682509, + "learning_rate": 2.9707412303373716e-06, + "loss": 0.83381462, + "num_input_tokens_seen": 128139205, + "router_z_loss_clip": 2.5234375, + "router_z_loss_mlp": 0.26757812, + "step": 5963, + "time_per_iteration": 2.66129469871521 + }, + { + "auxiliary_loss_clip": 0.01471584, + "auxiliary_loss_mlp": 0.00251421, + "balance_loss_clip": 1.21500874, + "balance_loss_mlp": 0.22377646, + "epoch": 0.35857507891176915, + "flos": 22309863626880.0, + "grad_norm": 17.870409051344918, + "language_loss": 0.86086494, + "learning_rate": 2.9704007022728447e-06, + "loss": 0.87809503, + "num_input_tokens_seen": 128158765, + "router_z_loss_clip": 2.56640625, + "router_z_loss_mlp": 0.27636719, + "step": 5964, + "time_per_iteration": 2.7011122703552246 + }, + { + "auxiliary_loss_clip": 0.01470987, + "auxiliary_loss_mlp": 0.00269969, + "balance_loss_clip": 1.21473098, + "balance_loss_mlp": 0.24408835, + "epoch": 0.3586352021644371, + "flos": 23367863750400.0, + "grad_norm": 211.42828195324296, + "language_loss": 0.74672979, + "learning_rate": 2.970060137410626e-06, + "loss": 0.76413935, + "num_input_tokens_seen": 128177850, + "router_z_loss_clip": 2.5625, + "router_z_loss_mlp": 0.25878906, + "step": 5965, + "time_per_iteration": 2.649482250213623 + }, + { + "auxiliary_loss_clip": 0.01459605, + "auxiliary_loss_mlp": 0.00253339, + "balance_loss_clip": 1.20793462, + "balance_loss_mlp": 0.22718433, + "epoch": 0.3586953254171051, + "flos": 27849227437440.0, + "grad_norm": 18.671993238716933, + "language_loss": 0.85685104, + "learning_rate": 2.9697195357636294e-06, + "loss": 0.87398046, + "num_input_tokens_seen": 128196925, + "router_z_loss_clip": 2.515625, + "router_z_loss_mlp": 0.26147461, + "step": 5966, + "time_per_iteration": 2.680037498474121 + }, + { + "auxiliary_loss_clip": 0.01458894, + "auxiliary_loss_mlp": 0.00275665, + "balance_loss_clip": 1.20482135, + "balance_loss_mlp": 0.24650636, + "epoch": 0.35875544866977305, + "flos": 19500500171520.0, + "grad_norm": 420.1194610653278, + "language_loss": 0.9859603, + "learning_rate": 2.9693788973447715e-06, + "loss": 1.00330591, + "num_input_tokens_seen": 128213955, + "router_z_loss_clip": 2.53710938, + "router_z_loss_mlp": 0.29174805, + "step": 5967, + "time_per_iteration": 2.653076648712158 + }, + { + "auxiliary_loss_clip": 0.01466207, + "auxiliary_loss_mlp": 0.0025569, + "balance_loss_clip": 1.21094632, + "balance_loss_mlp": 0.22734228, + "epoch": 0.358815571922441, + "flos": 21471134077440.0, + "grad_norm": 7.498684555916693, + "language_loss": 0.88701344, + "learning_rate": 2.9690382221669682e-06, + "loss": 0.9042325, + "num_input_tokens_seen": 128232980, + "router_z_loss_clip": 2.54882812, + "router_z_loss_mlp": 0.28369141, + "step": 5968, + "time_per_iteration": 2.6468262672424316 + }, + { + "auxiliary_loss_clip": 0.01456793, + "auxiliary_loss_mlp": 0.00254175, + "balance_loss_clip": 1.20683146, + "balance_loss_mlp": 0.22577976, + "epoch": 0.358875695175109, + "flos": 21835411856640.0, + "grad_norm": 87.28617008129291, + "language_loss": 0.91435999, + "learning_rate": 2.9686975102431384e-06, + "loss": 0.93146968, + "num_input_tokens_seen": 128252795, + "router_z_loss_clip": 2.5, + "router_z_loss_mlp": 0.28417969, + "step": 5969, + "time_per_iteration": 4.13720178604126 + }, + { + "auxiliary_loss_clip": 0.01458056, + "auxiliary_loss_mlp": 0.00247614, + "balance_loss_clip": 1.20897341, + "balance_loss_mlp": 0.22153111, + "epoch": 0.35893581842777694, + "flos": 32011633330560.0, + "grad_norm": 271.8719445895624, + "language_loss": 0.79646909, + "learning_rate": 2.968356761586202e-06, + "loss": 0.8135258, + "num_input_tokens_seen": 128273115, + "router_z_loss_clip": 2.49414062, + "router_z_loss_mlp": 0.26086426, + "step": 5970, + "time_per_iteration": 4.14414381980896 + }, + { + "auxiliary_loss_clip": 0.01454417, + "auxiliary_loss_mlp": 0.00244193, + "balance_loss_clip": 1.20511758, + "balance_loss_mlp": 0.21538004, + "epoch": 0.3589959416804449, + "flos": 20485817124480.0, + "grad_norm": 4.321338247277519, + "language_loss": 0.87330472, + "learning_rate": 2.9680159762090805e-06, + "loss": 0.8902908, + "num_input_tokens_seen": 128292220, + "router_z_loss_clip": 2.49609375, + "router_z_loss_mlp": 0.2878418, + "step": 5971, + "time_per_iteration": 2.7066855430603027 + }, + { + "auxiliary_loss_clip": 0.01445554, + "auxiliary_loss_mlp": 0.00223705, + "balance_loss_clip": 1.19537544, + "balance_loss_mlp": 0.19534475, + "epoch": 0.3590560649331129, + "flos": 16180666583040.0, + "grad_norm": 3.0395966567516086, + "language_loss": 0.88180196, + "learning_rate": 2.967675154124696e-06, + "loss": 0.89849454, + "num_input_tokens_seen": 128310305, + "router_z_loss_clip": 2.50585938, + "router_z_loss_mlp": 0.28356934, + "step": 5972, + "time_per_iteration": 2.683870315551758 + }, + { + "auxiliary_loss_clip": 0.01441351, + "auxiliary_loss_mlp": 0.00230422, + "balance_loss_clip": 1.19439769, + "balance_loss_mlp": 0.20286115, + "epoch": 0.35911618818578084, + "flos": 20375391738240.0, + "grad_norm": 19.878774657511613, + "language_loss": 0.87281334, + "learning_rate": 2.9673342953459722e-06, + "loss": 0.88953114, + "num_input_tokens_seen": 128328305, + "router_z_loss_clip": 2.47070312, + "router_z_loss_mlp": 0.27587891, + "step": 5973, + "time_per_iteration": 2.7409756183624268 + }, + { + "auxiliary_loss_clip": 0.01477931, + "auxiliary_loss_mlp": 0.00140551, + "balance_loss_clip": 1.30160785, + "balance_loss_mlp": 0.13206363, + "epoch": 0.3591763114384488, + "flos": 41236691685120.0, + "grad_norm": 0.8982706403941807, + "language_loss": 0.562617, + "learning_rate": 2.9669933998858355e-06, + "loss": 0.57880181, + "num_input_tokens_seen": 128378380, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.08496094, + "step": 5974, + "time_per_iteration": 3.059684991836548 + }, + { + "auxiliary_loss_clip": 0.0144937, + "auxiliary_loss_mlp": 0.00235933, + "balance_loss_clip": 1.20262969, + "balance_loss_mlp": 0.2057137, + "epoch": 0.35923643469111677, + "flos": 18695454600960.0, + "grad_norm": 2.8182231748901643, + "language_loss": 0.75663054, + "learning_rate": 2.9666524677572114e-06, + "loss": 0.77348363, + "num_input_tokens_seen": 128394315, + "router_z_loss_clip": 2.46679688, + "router_z_loss_mlp": 0.30212402, + "step": 5975, + "time_per_iteration": 4.127224922180176 + }, + { + "auxiliary_loss_clip": 0.01437343, + "auxiliary_loss_mlp": 0.00268085, + "balance_loss_clip": 1.1963861, + "balance_loss_mlp": 0.23769836, + "epoch": 0.35929655794378473, + "flos": 25009950931200.0, + "grad_norm": 750.8530245636255, + "language_loss": 0.85507011, + "learning_rate": 2.96631149897303e-06, + "loss": 0.87212443, + "num_input_tokens_seen": 128414515, + "router_z_loss_clip": 2.41796875, + "router_z_loss_mlp": 0.30383301, + "step": 5976, + "time_per_iteration": 2.6835696697235107 + }, + { + "auxiliary_loss_clip": 0.01442306, + "auxiliary_loss_mlp": 0.00272572, + "balance_loss_clip": 1.19795275, + "balance_loss_mlp": 0.24360384, + "epoch": 0.35935668119645275, + "flos": 14975576265600.0, + "grad_norm": 2.2526362747170663, + "language_loss": 0.86361778, + "learning_rate": 2.9659704935462194e-06, + "loss": 0.88076663, + "num_input_tokens_seen": 128430615, + "router_z_loss_clip": 2.44140625, + "router_z_loss_mlp": 0.28967285, + "step": 5977, + "time_per_iteration": 2.616544246673584 + }, + { + "auxiliary_loss_clip": 0.01437029, + "auxiliary_loss_mlp": 0.0032582, + "balance_loss_clip": 1.19657159, + "balance_loss_mlp": 0.29681674, + "epoch": 0.3594168044491207, + "flos": 21178138838400.0, + "grad_norm": 2.537868397864208, + "language_loss": 0.87699997, + "learning_rate": 2.9656294514897102e-06, + "loss": 0.89462841, + "num_input_tokens_seen": 128449480, + "router_z_loss_clip": 2.41015625, + "router_z_loss_mlp": 0.28991699, + "step": 5978, + "time_per_iteration": 2.6337997913360596 + }, + { + "auxiliary_loss_clip": 0.01456825, + "auxiliary_loss_mlp": 0.00443473, + "balance_loss_clip": 1.20464134, + "balance_loss_mlp": 0.40911663, + "epoch": 0.3594769277017887, + "flos": 27672152365440.0, + "grad_norm": 30.32994243477531, + "language_loss": 0.73066759, + "learning_rate": 2.965288372816436e-06, + "loss": 0.74967062, + "num_input_tokens_seen": 128471465, + "router_z_loss_clip": 2.52148438, + "router_z_loss_mlp": 0.34350586, + "step": 5979, + "time_per_iteration": 2.7411305904388428 + }, + { + "auxiliary_loss_clip": 0.01447921, + "auxiliary_loss_mlp": 0.00409836, + "balance_loss_clip": 1.20448589, + "balance_loss_mlp": 0.3793422, + "epoch": 0.35953705095445665, + "flos": 23002328995200.0, + "grad_norm": 14.607376722504364, + "language_loss": 0.76661503, + "learning_rate": 2.9649472575393296e-06, + "loss": 0.78519261, + "num_input_tokens_seen": 128490645, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.30541992, + "step": 5980, + "time_per_iteration": 2.782780408859253 + }, + { + "auxiliary_loss_clip": 0.01447622, + "auxiliary_loss_mlp": 0.00446961, + "balance_loss_clip": 1.19738662, + "balance_loss_mlp": 0.41374975, + "epoch": 0.3595971742071246, + "flos": 25513992529920.0, + "grad_norm": 3.5933004206174735, + "language_loss": 0.77797341, + "learning_rate": 2.964606105671327e-06, + "loss": 0.79691929, + "num_input_tokens_seen": 128510225, + "router_z_loss_clip": 2.50195312, + "router_z_loss_mlp": 0.33203125, + "step": 5981, + "time_per_iteration": 4.209684610366821 + }, + { + "auxiliary_loss_clip": 0.01457606, + "auxiliary_loss_mlp": 0.00478424, + "balance_loss_clip": 1.21102262, + "balance_loss_mlp": 0.44189823, + "epoch": 0.3596572974597926, + "flos": 29862559635840.0, + "grad_norm": 49.57559604493387, + "language_loss": 0.78031814, + "learning_rate": 2.9642649172253635e-06, + "loss": 0.79967839, + "num_input_tokens_seen": 128530195, + "router_z_loss_clip": 2.46289062, + "router_z_loss_mlp": 0.36499023, + "step": 5982, + "time_per_iteration": 2.779730796813965 + }, + { + "auxiliary_loss_clip": 0.01467411, + "auxiliary_loss_mlp": 0.004891, + "balance_loss_clip": 1.22065353, + "balance_loss_mlp": 0.45612708, + "epoch": 0.35971742071246054, + "flos": 23112538899840.0, + "grad_norm": 24.50571184040137, + "language_loss": 0.83675921, + "learning_rate": 2.9639236922143786e-06, + "loss": 0.85632432, + "num_input_tokens_seen": 128549990, + "router_z_loss_clip": 2.46679688, + "router_z_loss_mlp": 0.32983398, + "step": 5983, + "time_per_iteration": 2.6947379112243652 + }, + { + "auxiliary_loss_clip": 0.01466421, + "auxiliary_loss_mlp": 0.00580815, + "balance_loss_clip": 1.21533537, + "balance_loss_mlp": 0.54421794, + "epoch": 0.3597775439651285, + "flos": 16725359399040.0, + "grad_norm": 6.85544339058106, + "language_loss": 0.83839709, + "learning_rate": 2.96358243065131e-06, + "loss": 0.85886943, + "num_input_tokens_seen": 128567925, + "router_z_loss_clip": 2.51367188, + "router_z_loss_mlp": 0.3659668, + "step": 5984, + "time_per_iteration": 2.7049572467803955 + }, + { + "auxiliary_loss_clip": 0.01475383, + "auxiliary_loss_mlp": 0.00504354, + "balance_loss_clip": 1.22670627, + "balance_loss_mlp": 0.46842414, + "epoch": 0.3598376672177965, + "flos": 19719483436800.0, + "grad_norm": 10.35794576928377, + "language_loss": 0.93930376, + "learning_rate": 2.9632411325490993e-06, + "loss": 0.95910108, + "num_input_tokens_seen": 128585655, + "router_z_loss_clip": 2.49023438, + "router_z_loss_mlp": 0.359375, + "step": 5985, + "time_per_iteration": 2.756159782409668 + }, + { + "auxiliary_loss_clip": 0.01466476, + "auxiliary_loss_mlp": 0.00606955, + "balance_loss_clip": 1.22168982, + "balance_loss_mlp": 0.56914186, + "epoch": 0.35989779047046444, + "flos": 17311529445120.0, + "grad_norm": 10.913314829950103, + "language_loss": 0.77984178, + "learning_rate": 2.9628997979206884e-06, + "loss": 0.80057609, + "num_input_tokens_seen": 128604820, + "router_z_loss_clip": 2.44726562, + "router_z_loss_mlp": 0.37817383, + "step": 5986, + "time_per_iteration": 2.6435656547546387 + }, + { + "auxiliary_loss_clip": 0.01478391, + "auxiliary_loss_mlp": 0.00681699, + "balance_loss_clip": 1.22277117, + "balance_loss_mlp": 0.6417399, + "epoch": 0.3599579137231324, + "flos": 22711237176960.0, + "grad_norm": 616.0371758615881, + "language_loss": 0.8111676, + "learning_rate": 2.9625584267790204e-06, + "loss": 0.8327685, + "num_input_tokens_seen": 128623070, + "router_z_loss_clip": 2.5546875, + "router_z_loss_mlp": 0.39941406, + "step": 5987, + "time_per_iteration": 2.627520799636841 + }, + { + "auxiliary_loss_clip": 0.01475145, + "auxiliary_loss_mlp": 0.00569503, + "balance_loss_clip": 1.22474849, + "balance_loss_mlp": 0.53133243, + "epoch": 0.36001803697580037, + "flos": 20959873845120.0, + "grad_norm": 7.74713646479869, + "language_loss": 0.79452705, + "learning_rate": 2.9622170191370404e-06, + "loss": 0.81497353, + "num_input_tokens_seen": 128642430, + "router_z_loss_clip": 2.50390625, + "router_z_loss_mlp": 0.38183594, + "step": 5988, + "time_per_iteration": 2.669175386428833 + }, + { + "auxiliary_loss_clip": 0.01484149, + "auxiliary_loss_mlp": 0.00576155, + "balance_loss_clip": 1.22833347, + "balance_loss_mlp": 0.53920054, + "epoch": 0.36007816022846834, + "flos": 20485565729280.0, + "grad_norm": 2.9823452109621846, + "language_loss": 0.7886796, + "learning_rate": 2.9618755750076953e-06, + "loss": 0.80928266, + "num_input_tokens_seen": 128661285, + "router_z_loss_clip": 2.55859375, + "router_z_loss_mlp": 0.36938477, + "step": 5989, + "time_per_iteration": 2.651118040084839 + }, + { + "auxiliary_loss_clip": 0.01485887, + "auxiliary_loss_mlp": 0.00608664, + "balance_loss_clip": 1.23681366, + "balance_loss_mlp": 0.56977773, + "epoch": 0.36013828348113636, + "flos": 28001237794560.0, + "grad_norm": 44.114346120020464, + "language_loss": 0.85283935, + "learning_rate": 2.961534094403931e-06, + "loss": 0.87378484, + "num_input_tokens_seen": 128682210, + "router_z_loss_clip": 2.4921875, + "router_z_loss_mlp": 0.38891602, + "step": 5990, + "time_per_iteration": 2.6938037872314453 + }, + { + "auxiliary_loss_clip": 0.01506956, + "auxiliary_loss_mlp": 0.00622558, + "balance_loss_clip": 1.24646914, + "balance_loss_mlp": 0.58510262, + "epoch": 0.3601984067338043, + "flos": 20082181017600.0, + "grad_norm": 4.432396963158662, + "language_loss": 0.9016664, + "learning_rate": 2.961192577338698e-06, + "loss": 0.92296159, + "num_input_tokens_seen": 128700445, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.37451172, + "step": 5991, + "time_per_iteration": 2.623351573944092 + }, + { + "auxiliary_loss_clip": 0.01501121, + "auxiliary_loss_mlp": 0.00651128, + "balance_loss_clip": 1.24173498, + "balance_loss_mlp": 0.61071575, + "epoch": 0.3602585299864723, + "flos": 18617599872000.0, + "grad_norm": 130.53086995873204, + "language_loss": 0.83211821, + "learning_rate": 2.9608510238249463e-06, + "loss": 0.85364068, + "num_input_tokens_seen": 128716855, + "router_z_loss_clip": 2.59570312, + "router_z_loss_mlp": 0.40405273, + "step": 5992, + "time_per_iteration": 2.66521954536438 + }, + { + "auxiliary_loss_clip": 0.01509263, + "auxiliary_loss_mlp": 0.00588935, + "balance_loss_clip": 1.25550175, + "balance_loss_mlp": 0.55348194, + "epoch": 0.36031865323914025, + "flos": 19573003774080.0, + "grad_norm": 10.267843307743744, + "language_loss": 0.83855551, + "learning_rate": 2.960509433875627e-06, + "loss": 0.85953748, + "num_input_tokens_seen": 128735835, + "router_z_loss_clip": 2.53710938, + "router_z_loss_mlp": 0.35473633, + "step": 5993, + "time_per_iteration": 2.697218179702759 + }, + { + "auxiliary_loss_clip": 0.01525208, + "auxiliary_loss_mlp": 0.00612614, + "balance_loss_clip": 1.26287818, + "balance_loss_mlp": 0.57501519, + "epoch": 0.3603787764918082, + "flos": 17490615678720.0, + "grad_norm": 2.9915786741589985, + "language_loss": 0.82189655, + "learning_rate": 2.9601678075036943e-06, + "loss": 0.84327483, + "num_input_tokens_seen": 128752465, + "router_z_loss_clip": 2.62109375, + "router_z_loss_mlp": 0.37597656, + "step": 5994, + "time_per_iteration": 2.63128924369812 + }, + { + "auxiliary_loss_clip": 0.01525728, + "auxiliary_loss_mlp": 0.00522424, + "balance_loss_clip": 1.26028979, + "balance_loss_mlp": 0.48670846, + "epoch": 0.3604388997444762, + "flos": 15523393564800.0, + "grad_norm": 7.152655347051845, + "language_loss": 0.77152443, + "learning_rate": 2.9598261447221024e-06, + "loss": 0.7920059, + "num_input_tokens_seen": 128770865, + "router_z_loss_clip": 2.65039062, + "router_z_loss_mlp": 0.35717773, + "step": 5995, + "time_per_iteration": 2.759296178817749 + }, + { + "auxiliary_loss_clip": 0.01523572, + "auxiliary_loss_mlp": 0.00566933, + "balance_loss_clip": 1.25840259, + "balance_loss_mlp": 0.52663982, + "epoch": 0.36049902299714415, + "flos": 17310883000320.0, + "grad_norm": 17.25714603952194, + "language_loss": 0.89827234, + "learning_rate": 2.9594844455438057e-06, + "loss": 0.91917741, + "num_input_tokens_seen": 128789730, + "router_z_loss_clip": 2.65234375, + "router_z_loss_mlp": 0.40258789, + "step": 5996, + "time_per_iteration": 2.673665761947632 + }, + { + "auxiliary_loss_clip": 0.01548599, + "auxiliary_loss_mlp": 0.0048644, + "balance_loss_clip": 1.28160429, + "balance_loss_mlp": 0.45012897, + "epoch": 0.3605591462498121, + "flos": 17056025026560.0, + "grad_norm": 5.234161895306228, + "language_loss": 0.79485351, + "learning_rate": 2.959142709981763e-06, + "loss": 0.81520391, + "num_input_tokens_seen": 128806610, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.36303711, + "step": 5997, + "time_per_iteration": 2.7516837120056152 + }, + { + "auxiliary_loss_clip": 0.01571245, + "auxiliary_loss_mlp": 0.00477125, + "balance_loss_clip": 1.30147147, + "balance_loss_mlp": 0.44317403, + "epoch": 0.3606192695024801, + "flos": 16836862193280.0, + "grad_norm": 12.324358754249818, + "language_loss": 0.79117143, + "learning_rate": 2.9588009380489337e-06, + "loss": 0.8116551, + "num_input_tokens_seen": 128824830, + "router_z_loss_clip": 2.69921875, + "router_z_loss_mlp": 0.33959961, + "step": 5998, + "time_per_iteration": 2.7042171955108643 + }, + { + "auxiliary_loss_clip": 0.0157524, + "auxiliary_loss_mlp": 0.00513208, + "balance_loss_clip": 1.30546796, + "balance_loss_mlp": 0.47849444, + "epoch": 0.36067939275514804, + "flos": 12129655743360.0, + "grad_norm": 23.314273873431844, + "language_loss": 0.86643004, + "learning_rate": 2.9584591297582758e-06, + "loss": 0.88731456, + "num_input_tokens_seen": 128838170, + "router_z_loss_clip": 2.69726562, + "router_z_loss_mlp": 0.34692383, + "step": 5999, + "time_per_iteration": 2.7084171772003174 + }, + { + "auxiliary_loss_clip": 0.01571568, + "auxiliary_loss_mlp": 0.00486167, + "balance_loss_clip": 1.29719996, + "balance_loss_mlp": 0.44809139, + "epoch": 0.360739516007816, + "flos": 18041449720320.0, + "grad_norm": 13.791425027694288, + "language_loss": 0.85908961, + "learning_rate": 2.9581172851227516e-06, + "loss": 0.87966692, + "num_input_tokens_seen": 128855625, + "router_z_loss_clip": 2.7421875, + "router_z_loss_mlp": 0.38061523, + "step": 6000, + "time_per_iteration": 2.616788148880005 + }, + { + "auxiliary_loss_clip": 0.01571421, + "auxiliary_loss_mlp": 0.00457672, + "balance_loss_clip": 1.30142689, + "balance_loss_mlp": 0.41971558, + "epoch": 0.360799639260484, + "flos": 18549800951040.0, + "grad_norm": 7.272441247229841, + "language_loss": 0.84922516, + "learning_rate": 2.9577754041553243e-06, + "loss": 0.86951607, + "num_input_tokens_seen": 128873540, + "router_z_loss_clip": 2.70117188, + "router_z_loss_mlp": 0.37963867, + "step": 6001, + "time_per_iteration": 2.6695752143859863 + }, + { + "auxiliary_loss_clip": 0.01587504, + "auxiliary_loss_mlp": 0.0045106, + "balance_loss_clip": 1.31589615, + "balance_loss_mlp": 0.41234055, + "epoch": 0.36085976251315194, + "flos": 19682028529920.0, + "grad_norm": 9.033496741404514, + "language_loss": 0.90491855, + "learning_rate": 2.9574334868689575e-06, + "loss": 0.92530417, + "num_input_tokens_seen": 128889925, + "router_z_loss_clip": 2.72070312, + "router_z_loss_mlp": 0.38720703, + "step": 6002, + "time_per_iteration": 2.718067169189453 + }, + { + "auxiliary_loss_clip": 0.01600011, + "auxiliary_loss_mlp": 0.00405641, + "balance_loss_clip": 1.32606959, + "balance_loss_mlp": 0.37321591, + "epoch": 0.3609198857658199, + "flos": 24198943703040.0, + "grad_norm": 121.66778833329406, + "language_loss": 0.96084952, + "learning_rate": 2.9570915332766165e-06, + "loss": 0.98090613, + "num_input_tokens_seen": 128906890, + "router_z_loss_clip": 2.74023438, + "router_z_loss_mlp": 0.32421875, + "step": 6003, + "time_per_iteration": 2.6218903064727783 + }, + { + "auxiliary_loss_clip": 0.01623032, + "auxiliary_loss_mlp": 0.00101253, + "balance_loss_clip": 1.41944599, + "balance_loss_mlp": 0.08995207, + "epoch": 0.3609800090184879, + "flos": 57115995160320.0, + "grad_norm": 0.8881867025641624, + "language_loss": 0.53644717, + "learning_rate": 2.9567495433912693e-06, + "loss": 0.55369008, + "num_input_tokens_seen": 128965940, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.11279297, + "step": 6004, + "time_per_iteration": 3.062074899673462 + }, + { + "auxiliary_loss_clip": 0.01571227, + "auxiliary_loss_mlp": 0.00461396, + "balance_loss_clip": 1.29589438, + "balance_loss_mlp": 0.4240123, + "epoch": 0.3610401322711559, + "flos": 20811239366400.0, + "grad_norm": 19.25488589458657, + "language_loss": 0.85258913, + "learning_rate": 2.956407517225883e-06, + "loss": 0.87291533, + "num_input_tokens_seen": 128985835, + "router_z_loss_clip": 2.75976562, + "router_z_loss_mlp": 0.3737793, + "step": 6005, + "time_per_iteration": 2.6486454010009766 + }, + { + "auxiliary_loss_clip": 0.01561748, + "auxiliary_loss_mlp": 0.00411312, + "balance_loss_clip": 1.2900908, + "balance_loss_mlp": 0.37912515, + "epoch": 0.36110025552382385, + "flos": 13699167494400.0, + "grad_norm": 2.392218970734875, + "language_loss": 0.85937792, + "learning_rate": 2.956065454793429e-06, + "loss": 0.87910855, + "num_input_tokens_seen": 129003120, + "router_z_loss_clip": 2.71289062, + "router_z_loss_mlp": 0.32202148, + "step": 6006, + "time_per_iteration": 2.5957164764404297 + }, + { + "auxiliary_loss_clip": 0.01582305, + "auxiliary_loss_mlp": 0.00393469, + "balance_loss_clip": 1.30115807, + "balance_loss_mlp": 0.35932714, + "epoch": 0.3611603787764918, + "flos": 22455014486400.0, + "grad_norm": 11.576546223568087, + "language_loss": 0.91087222, + "learning_rate": 2.955723356106876e-06, + "loss": 0.93062997, + "num_input_tokens_seen": 129021645, + "router_z_loss_clip": 2.81445312, + "router_z_loss_mlp": 0.34130859, + "step": 6007, + "time_per_iteration": 2.6875240802764893 + }, + { + "auxiliary_loss_clip": 0.01575331, + "auxiliary_loss_mlp": 0.00409532, + "balance_loss_clip": 1.29301703, + "balance_loss_mlp": 0.37302965, + "epoch": 0.3612205020291598, + "flos": 20886651970560.0, + "grad_norm": 30.80999999615213, + "language_loss": 0.82335556, + "learning_rate": 2.955381221179198e-06, + "loss": 0.8432042, + "num_input_tokens_seen": 129038375, + "router_z_loss_clip": 2.82226562, + "router_z_loss_mlp": 0.36523438, + "step": 6008, + "time_per_iteration": 2.6471405029296875 + }, + { + "auxiliary_loss_clip": 0.01550504, + "auxiliary_loss_mlp": 0.00401012, + "balance_loss_clip": 1.27660906, + "balance_loss_mlp": 0.36682263, + "epoch": 0.36128062528182775, + "flos": 15741981780480.0, + "grad_norm": 16.76778529119698, + "language_loss": 0.90638536, + "learning_rate": 2.955039050023368e-06, + "loss": 0.92590046, + "num_input_tokens_seen": 129056235, + "router_z_loss_clip": 2.73632812, + "router_z_loss_mlp": 0.34204102, + "step": 6009, + "time_per_iteration": 2.627875566482544 + }, + { + "auxiliary_loss_clip": 0.01566299, + "auxiliary_loss_mlp": 0.00418733, + "balance_loss_clip": 1.28883493, + "balance_loss_mlp": 0.38380447, + "epoch": 0.3613407485344957, + "flos": 16764502245120.0, + "grad_norm": 41.46415358412408, + "language_loss": 0.83563399, + "learning_rate": 2.954696842652362e-06, + "loss": 0.85548437, + "num_input_tokens_seen": 129072405, + "router_z_loss_clip": 2.77734375, + "router_z_loss_mlp": 0.34887695, + "step": 6010, + "time_per_iteration": 2.632220983505249 + }, + { + "auxiliary_loss_clip": 0.01560358, + "auxiliary_loss_mlp": 0.00385214, + "balance_loss_clip": 1.28236091, + "balance_loss_mlp": 0.35076249, + "epoch": 0.3614008717871637, + "flos": 20371189847040.0, + "grad_norm": 182.62770585650384, + "language_loss": 0.88974273, + "learning_rate": 2.9543545990791554e-06, + "loss": 0.9091984, + "num_input_tokens_seen": 129090225, + "router_z_loss_clip": 2.78320312, + "router_z_loss_mlp": 0.34448242, + "step": 6011, + "time_per_iteration": 2.7324817180633545 + }, + { + "auxiliary_loss_clip": 0.01557919, + "auxiliary_loss_mlp": 0.003927, + "balance_loss_clip": 1.27881765, + "balance_loss_mlp": 0.3560794, + "epoch": 0.36146099503983165, + "flos": 22776665800320.0, + "grad_norm": 10.895251530840905, + "language_loss": 0.69909334, + "learning_rate": 2.954012319316727e-06, + "loss": 0.71859956, + "num_input_tokens_seen": 129107685, + "router_z_loss_clip": 2.7890625, + "router_z_loss_mlp": 0.36621094, + "step": 6012, + "time_per_iteration": 5.611877679824829 + }, + { + "auxiliary_loss_clip": 0.01565606, + "auxiliary_loss_mlp": 0.00422277, + "balance_loss_clip": 1.28958988, + "balance_loss_mlp": 0.38708678, + "epoch": 0.3615211182924996, + "flos": 22996654646400.0, + "grad_norm": 10.04041095155924, + "language_loss": 0.88257194, + "learning_rate": 2.9536700033780565e-06, + "loss": 0.90245074, + "num_input_tokens_seen": 129125315, + "router_z_loss_clip": 2.75390625, + "router_z_loss_mlp": 0.35180664, + "step": 6013, + "time_per_iteration": 2.6535143852233887 + }, + { + "auxiliary_loss_clip": 0.01574581, + "auxiliary_loss_mlp": 0.0037804, + "balance_loss_clip": 1.29443622, + "balance_loss_mlp": 0.3451618, + "epoch": 0.3615812415451676, + "flos": 16648079287680.0, + "grad_norm": 10.629891709509527, + "language_loss": 0.96783549, + "learning_rate": 2.9533276512761228e-06, + "loss": 0.98736161, + "num_input_tokens_seen": 129141600, + "router_z_loss_clip": 2.80078125, + "router_z_loss_mlp": 0.32861328, + "step": 6014, + "time_per_iteration": 2.6206252574920654 + }, + { + "auxiliary_loss_clip": 0.01566284, + "auxiliary_loss_mlp": 0.00345777, + "balance_loss_clip": 1.29080331, + "balance_loss_mlp": 0.31430614, + "epoch": 0.36164136479783554, + "flos": 21320093387520.0, + "grad_norm": 2.25173743572339, + "language_loss": 0.82212436, + "learning_rate": 2.95298526302391e-06, + "loss": 0.84124494, + "num_input_tokens_seen": 129160665, + "router_z_loss_clip": 2.75976562, + "router_z_loss_mlp": 0.31420898, + "step": 6015, + "time_per_iteration": 2.7504706382751465 + }, + { + "auxiliary_loss_clip": 0.0156255, + "auxiliary_loss_mlp": 0.0038474, + "balance_loss_clip": 1.28922129, + "balance_loss_mlp": 0.3503367, + "epoch": 0.3617014880505035, + "flos": 24169569356160.0, + "grad_norm": 9.505567128948794, + "language_loss": 0.73483276, + "learning_rate": 2.9526428386344e-06, + "loss": 0.75430572, + "num_input_tokens_seen": 129179220, + "router_z_loss_clip": 2.73828125, + "router_z_loss_mlp": 0.34423828, + "step": 6016, + "time_per_iteration": 2.703514814376831 + }, + { + "auxiliary_loss_clip": 0.01561267, + "auxiliary_loss_mlp": 0.0040924, + "balance_loss_clip": 1.28399336, + "balance_loss_mlp": 0.37333381, + "epoch": 0.3617616113031715, + "flos": 39014824101120.0, + "grad_norm": 17.817182573751168, + "language_loss": 0.79627407, + "learning_rate": 2.9523003781205785e-06, + "loss": 0.81597912, + "num_input_tokens_seen": 129200385, + "router_z_loss_clip": 2.7734375, + "router_z_loss_mlp": 0.35913086, + "step": 6017, + "time_per_iteration": 4.234957695007324 + }, + { + "auxiliary_loss_clip": 0.01552538, + "auxiliary_loss_mlp": 0.00396329, + "balance_loss_clip": 1.2788676, + "balance_loss_mlp": 0.36218786, + "epoch": 0.3618217345558395, + "flos": 12130840892160.0, + "grad_norm": 7.473210046626886, + "language_loss": 0.82488948, + "learning_rate": 2.9519578814954307e-06, + "loss": 0.84437811, + "num_input_tokens_seen": 129217395, + "router_z_loss_clip": 2.734375, + "router_z_loss_mlp": 0.34130859, + "step": 6018, + "time_per_iteration": 2.6358540058135986 + }, + { + "auxiliary_loss_clip": 0.01566652, + "auxiliary_loss_mlp": 0.00409365, + "balance_loss_clip": 1.29718375, + "balance_loss_mlp": 0.37474674, + "epoch": 0.36188185780850746, + "flos": 24935005203840.0, + "grad_norm": 96.84059335966258, + "language_loss": 0.74407041, + "learning_rate": 2.9516153487719448e-06, + "loss": 0.7638306, + "num_input_tokens_seen": 129238940, + "router_z_loss_clip": 2.6953125, + "router_z_loss_mlp": 0.34619141, + "step": 6019, + "time_per_iteration": 2.743962287902832 + }, + { + "auxiliary_loss_clip": 0.01556169, + "auxiliary_loss_mlp": 0.00352976, + "balance_loss_clip": 1.27954578, + "balance_loss_mlp": 0.31833419, + "epoch": 0.3619419810611754, + "flos": 20958832350720.0, + "grad_norm": 64.92320878496048, + "language_loss": 0.83585328, + "learning_rate": 2.95127277996311e-06, + "loss": 0.85494471, + "num_input_tokens_seen": 129258240, + "router_z_loss_clip": 2.76367188, + "router_z_loss_mlp": 0.34643555, + "step": 6020, + "time_per_iteration": 2.7030398845672607 + }, + { + "auxiliary_loss_clip": 0.0157056, + "auxiliary_loss_mlp": 0.00388557, + "balance_loss_clip": 1.29170156, + "balance_loss_mlp": 0.34919381, + "epoch": 0.3620021043138434, + "flos": 22528882805760.0, + "grad_norm": 2.5606583574109494, + "language_loss": 0.7928865, + "learning_rate": 2.9509301750819156e-06, + "loss": 0.81247771, + "num_input_tokens_seen": 129279040, + "router_z_loss_clip": 2.78710938, + "router_z_loss_mlp": 0.39331055, + "step": 6021, + "time_per_iteration": 2.6987974643707275 + }, + { + "auxiliary_loss_clip": 0.01564868, + "auxiliary_loss_mlp": 0.0042541, + "balance_loss_clip": 1.29002798, + "balance_loss_mlp": 0.38816914, + "epoch": 0.36206222756651135, + "flos": 15596687266560.0, + "grad_norm": 4.126709381123415, + "language_loss": 0.88092053, + "learning_rate": 2.9505875341413533e-06, + "loss": 0.90082335, + "num_input_tokens_seen": 129295415, + "router_z_loss_clip": 2.74804688, + "router_z_loss_mlp": 0.37255859, + "step": 6022, + "time_per_iteration": 2.6822407245635986 + }, + { + "auxiliary_loss_clip": 0.01570873, + "auxiliary_loss_mlp": 0.00350529, + "balance_loss_clip": 1.30009174, + "balance_loss_mlp": 0.31810379, + "epoch": 0.3621223508191793, + "flos": 23587170238080.0, + "grad_norm": 5.872172287596771, + "language_loss": 0.87495214, + "learning_rate": 2.950244857154417e-06, + "loss": 0.89416611, + "num_input_tokens_seen": 129312620, + "router_z_loss_clip": 2.7109375, + "router_z_loss_mlp": 0.32421875, + "step": 6023, + "time_per_iteration": 2.678739547729492 + }, + { + "auxiliary_loss_clip": 0.01566326, + "auxiliary_loss_mlp": 0.00369541, + "balance_loss_clip": 1.29046392, + "balance_loss_mlp": 0.33494651, + "epoch": 0.3621824740718473, + "flos": 22309899540480.0, + "grad_norm": 15.701285692911613, + "language_loss": 0.86442024, + "learning_rate": 2.9499021441341e-06, + "loss": 0.88377893, + "num_input_tokens_seen": 129331825, + "router_z_loss_clip": 2.7578125, + "router_z_loss_mlp": 0.34619141, + "step": 6024, + "time_per_iteration": 4.062086343765259 + }, + { + "auxiliary_loss_clip": 0.01559448, + "auxiliary_loss_mlp": 0.00325427, + "balance_loss_clip": 1.29257035, + "balance_loss_mlp": 0.29445601, + "epoch": 0.36224259732451525, + "flos": 16763640318720.0, + "grad_norm": 2.4829029832255936, + "language_loss": 0.79927242, + "learning_rate": 2.9495593950933997e-06, + "loss": 0.81812114, + "num_input_tokens_seen": 129350400, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.30957031, + "step": 6025, + "time_per_iteration": 2.672111749649048 + }, + { + "auxiliary_loss_clip": 0.01552127, + "auxiliary_loss_mlp": 0.00359088, + "balance_loss_clip": 1.28442824, + "balance_loss_mlp": 0.32499412, + "epoch": 0.3623027205771832, + "flos": 23149742411520.0, + "grad_norm": 108.1789619862585, + "language_loss": 0.80559886, + "learning_rate": 2.9492166100453107e-06, + "loss": 0.82471102, + "num_input_tokens_seen": 129371155, + "router_z_loss_clip": 2.67578125, + "router_z_loss_mlp": 0.34106445, + "step": 6026, + "time_per_iteration": 2.6549315452575684 + }, + { + "auxiliary_loss_clip": 0.0155792, + "auxiliary_loss_mlp": 0.00355704, + "balance_loss_clip": 1.2794255, + "balance_loss_mlp": 0.31934479, + "epoch": 0.3623628438298512, + "flos": 28549162834560.0, + "grad_norm": 24.737237337347604, + "language_loss": 0.84967756, + "learning_rate": 2.948873789002833e-06, + "loss": 0.86881381, + "num_input_tokens_seen": 129391230, + "router_z_loss_clip": 2.78320312, + "router_z_loss_mlp": 0.36376953, + "step": 6027, + "time_per_iteration": 2.9171323776245117 + }, + { + "auxiliary_loss_clip": 0.01562717, + "auxiliary_loss_mlp": 0.00369456, + "balance_loss_clip": 1.28490686, + "balance_loss_mlp": 0.33645871, + "epoch": 0.36242296708251914, + "flos": 25484941405440.0, + "grad_norm": 49.96178791332562, + "language_loss": 0.75763428, + "learning_rate": 2.9485309319789667e-06, + "loss": 0.77695596, + "num_input_tokens_seen": 129410065, + "router_z_loss_clip": 2.77929688, + "router_z_loss_mlp": 0.32983398, + "step": 6028, + "time_per_iteration": 2.770338296890259 + }, + { + "auxiliary_loss_clip": 0.01549793, + "auxiliary_loss_mlp": 0.00362525, + "balance_loss_clip": 1.28507447, + "balance_loss_mlp": 0.32995656, + "epoch": 0.3624830903351871, + "flos": 16290373697280.0, + "grad_norm": 5.6372337311069, + "language_loss": 0.92120337, + "learning_rate": 2.9481880389867117e-06, + "loss": 0.94032657, + "num_input_tokens_seen": 129428655, + "router_z_loss_clip": 2.64648438, + "router_z_loss_mlp": 0.32568359, + "step": 6029, + "time_per_iteration": 2.602210283279419 + }, + { + "auxiliary_loss_clip": 0.01565588, + "auxiliary_loss_mlp": 0.00355964, + "balance_loss_clip": 1.29303741, + "balance_loss_mlp": 0.32167923, + "epoch": 0.36254321358785513, + "flos": 18296307694080.0, + "grad_norm": 24.86369056098275, + "language_loss": 0.80560058, + "learning_rate": 2.9478451100390714e-06, + "loss": 0.82481611, + "num_input_tokens_seen": 129447845, + "router_z_loss_clip": 2.72460938, + "router_z_loss_mlp": 0.34277344, + "step": 6030, + "time_per_iteration": 2.6217291355133057 + }, + { + "auxiliary_loss_clip": 0.01557945, + "auxiliary_loss_mlp": 0.00383341, + "balance_loss_clip": 1.2782011, + "balance_loss_mlp": 0.34879434, + "epoch": 0.3626033368405231, + "flos": 14865294533760.0, + "grad_norm": 10.012168292311188, + "language_loss": 0.84469539, + "learning_rate": 2.94750214514905e-06, + "loss": 0.86410826, + "num_input_tokens_seen": 129463275, + "router_z_loss_clip": 2.79492188, + "router_z_loss_mlp": 0.34545898, + "step": 6031, + "time_per_iteration": 2.675849437713623 + }, + { + "auxiliary_loss_clip": 0.0155207, + "auxiliary_loss_mlp": 0.00313494, + "balance_loss_clip": 1.28141904, + "balance_loss_mlp": 0.28102177, + "epoch": 0.36266346009319106, + "flos": 22306595489280.0, + "grad_norm": 8.126727217100383, + "language_loss": 0.79599476, + "learning_rate": 2.9471591443296516e-06, + "loss": 0.81465036, + "num_input_tokens_seen": 129483205, + "router_z_loss_clip": 2.70507812, + "router_z_loss_mlp": 0.32470703, + "step": 6032, + "time_per_iteration": 2.666625499725342 + }, + { + "auxiliary_loss_clip": 0.01549457, + "auxiliary_loss_mlp": 0.00355325, + "balance_loss_clip": 1.27783847, + "balance_loss_mlp": 0.32144541, + "epoch": 0.362723583345859, + "flos": 18222331633920.0, + "grad_norm": 2948.9069252747677, + "language_loss": 0.84910756, + "learning_rate": 2.946816107593884e-06, + "loss": 0.86815536, + "num_input_tokens_seen": 129499885, + "router_z_loss_clip": 2.71484375, + "router_z_loss_mlp": 0.33886719, + "step": 6033, + "time_per_iteration": 2.6463539600372314 + }, + { + "auxiliary_loss_clip": 0.01498357, + "auxiliary_loss_mlp": 0.00068281, + "balance_loss_clip": 1.29960823, + "balance_loss_mlp": 0.06079457, + "epoch": 0.362783706598527, + "flos": 68499174458880.0, + "grad_norm": 0.8647830140674183, + "language_loss": 0.64785033, + "learning_rate": 2.9464730349547547e-06, + "loss": 0.66351676, + "num_input_tokens_seen": 129561885, + "router_z_loss_clip": 1.984375, + "router_z_loss_mlp": 0.07470703, + "step": 6034, + "time_per_iteration": 3.226513624191284 + }, + { + "auxiliary_loss_clip": 0.01552477, + "auxiliary_loss_mlp": 0.00342545, + "balance_loss_clip": 1.28550816, + "balance_loss_mlp": 0.31002429, + "epoch": 0.36284382985119495, + "flos": 26576589594240.0, + "grad_norm": 5.031916114779958, + "language_loss": 0.94570994, + "learning_rate": 2.946129926425273e-06, + "loss": 0.96466017, + "num_input_tokens_seen": 129582325, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.32519531, + "step": 6035, + "time_per_iteration": 2.7005274295806885 + }, + { + "auxiliary_loss_clip": 0.01544031, + "auxiliary_loss_mlp": 0.00374262, + "balance_loss_clip": 1.27188206, + "balance_loss_mlp": 0.33766437, + "epoch": 0.3629039531038629, + "flos": 20156767608960.0, + "grad_norm": 2.331189902669245, + "language_loss": 0.81335664, + "learning_rate": 2.9457867820184496e-06, + "loss": 0.8325395, + "num_input_tokens_seen": 129600350, + "router_z_loss_clip": 2.72265625, + "router_z_loss_mlp": 0.36572266, + "step": 6036, + "time_per_iteration": 2.7455461025238037 + }, + { + "auxiliary_loss_clip": 0.01545841, + "auxiliary_loss_mlp": 0.00323678, + "balance_loss_clip": 1.26803374, + "balance_loss_mlp": 0.28834435, + "epoch": 0.3629640763565309, + "flos": 18625716345600.0, + "grad_norm": 3.418602757420063, + "language_loss": 0.84120011, + "learning_rate": 2.945443601747297e-06, + "loss": 0.85989535, + "num_input_tokens_seen": 129618425, + "router_z_loss_clip": 2.77734375, + "router_z_loss_mlp": 0.35302734, + "step": 6037, + "time_per_iteration": 2.6517624855041504 + }, + { + "auxiliary_loss_clip": 0.01559843, + "auxiliary_loss_mlp": 0.00339353, + "balance_loss_clip": 1.29196334, + "balance_loss_mlp": 0.30750021, + "epoch": 0.36302419960919885, + "flos": 19571459489280.0, + "grad_norm": 57.127477765106825, + "language_loss": 0.84305787, + "learning_rate": 2.945100385624828e-06, + "loss": 0.86204982, + "num_input_tokens_seen": 129636750, + "router_z_loss_clip": 2.67382812, + "router_z_loss_mlp": 0.31811523, + "step": 6038, + "time_per_iteration": 2.660201072692871 + }, + { + "auxiliary_loss_clip": 0.01486884, + "auxiliary_loss_mlp": 0.0008896, + "balance_loss_clip": 1.29702294, + "balance_loss_mlp": 0.08080637, + "epoch": 0.3630843228618668, + "flos": 63797606444160.0, + "grad_norm": 9.722116845312845, + "language_loss": 0.63052177, + "learning_rate": 2.9447571336640573e-06, + "loss": 0.64628023, + "num_input_tokens_seen": 129699030, + "router_z_loss_clip": 1.8984375, + "router_z_loss_mlp": 0.08154297, + "step": 6039, + "time_per_iteration": 3.281809091567993 + }, + { + "auxiliary_loss_clip": 0.01545419, + "auxiliary_loss_mlp": 0.00326267, + "balance_loss_clip": 1.27742648, + "balance_loss_mlp": 0.29205436, + "epoch": 0.3631444461145348, + "flos": 21835160461440.0, + "grad_norm": 21.431710342890515, + "language_loss": 0.80669737, + "learning_rate": 2.944413845878002e-06, + "loss": 0.82541424, + "num_input_tokens_seen": 129717135, + "router_z_loss_clip": 2.6796875, + "router_z_loss_mlp": 0.34179688, + "step": 6040, + "time_per_iteration": 2.719456195831299 + }, + { + "auxiliary_loss_clip": 0.01553489, + "auxiliary_loss_mlp": 0.00318265, + "balance_loss_clip": 1.27782297, + "balance_loss_mlp": 0.28429055, + "epoch": 0.36320456936720275, + "flos": 21722041555200.0, + "grad_norm": 8.639956117444767, + "language_loss": 0.87951946, + "learning_rate": 2.9440705222796783e-06, + "loss": 0.89823711, + "num_input_tokens_seen": 129735940, + "router_z_loss_clip": 2.7578125, + "router_z_loss_mlp": 0.33935547, + "step": 6041, + "time_per_iteration": 2.6764333248138428 + }, + { + "auxiliary_loss_clip": 0.01547202, + "auxiliary_loss_mlp": 0.00327366, + "balance_loss_clip": 1.27180123, + "balance_loss_mlp": 0.29475045, + "epoch": 0.3632646926198707, + "flos": 17019072910080.0, + "grad_norm": 16.759068417091296, + "language_loss": 0.92318761, + "learning_rate": 2.943727162882107e-06, + "loss": 0.94193327, + "num_input_tokens_seen": 129752790, + "router_z_loss_clip": 2.75390625, + "router_z_loss_mlp": 0.32617188, + "step": 6042, + "time_per_iteration": 2.6406478881835938 + }, + { + "auxiliary_loss_clip": 0.01543629, + "auxiliary_loss_mlp": 0.00334175, + "balance_loss_clip": 1.27222824, + "balance_loss_mlp": 0.30113071, + "epoch": 0.36332481587253873, + "flos": 23331163029120.0, + "grad_norm": 3.366033856670862, + "language_loss": 0.83911991, + "learning_rate": 2.9433837676983064e-06, + "loss": 0.85789806, + "num_input_tokens_seen": 129773655, + "router_z_loss_clip": 2.71289062, + "router_z_loss_mlp": 0.33032227, + "step": 6043, + "time_per_iteration": 2.6817820072174072 + }, + { + "auxiliary_loss_clip": 0.01556206, + "auxiliary_loss_mlp": 0.00331807, + "balance_loss_clip": 1.28384447, + "balance_loss_mlp": 0.29625866, + "epoch": 0.3633849391252067, + "flos": 10743539857920.0, + "grad_norm": 17.396910573955882, + "language_loss": 0.73740447, + "learning_rate": 2.943040336741298e-06, + "loss": 0.75628459, + "num_input_tokens_seen": 129791605, + "router_z_loss_clip": 2.72265625, + "router_z_loss_mlp": 0.35546875, + "step": 6044, + "time_per_iteration": 2.6208152770996094 + }, + { + "auxiliary_loss_clip": 0.01535808, + "auxiliary_loss_mlp": 0.00282862, + "balance_loss_clip": 1.26920629, + "balance_loss_mlp": 0.25026995, + "epoch": 0.36344506237787466, + "flos": 25849147357440.0, + "grad_norm": 62.4315337409331, + "language_loss": 0.87255275, + "learning_rate": 2.9426968700241066e-06, + "loss": 0.89073944, + "num_input_tokens_seen": 129811075, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.32592773, + "step": 6045, + "time_per_iteration": 2.662590980529785 + }, + { + "auxiliary_loss_clip": 0.0155335, + "auxiliary_loss_mlp": 0.00334483, + "balance_loss_clip": 1.27927375, + "balance_loss_mlp": 0.2994352, + "epoch": 0.3635051856305426, + "flos": 30154046503680.0, + "grad_norm": 1282.7793389069172, + "language_loss": 0.72630942, + "learning_rate": 2.942353367559755e-06, + "loss": 0.7451877, + "num_input_tokens_seen": 129833755, + "router_z_loss_clip": 2.73828125, + "router_z_loss_mlp": 0.35058594, + "step": 6046, + "time_per_iteration": 2.7174036502838135 + }, + { + "auxiliary_loss_clip": 0.01559059, + "auxiliary_loss_mlp": 0.0033658, + "balance_loss_clip": 1.28992498, + "balance_loss_mlp": 0.30315381, + "epoch": 0.3635653088832106, + "flos": 22198396746240.0, + "grad_norm": 43.358894418547685, + "language_loss": 0.84388423, + "learning_rate": 2.9420098293612692e-06, + "loss": 0.86284065, + "num_input_tokens_seen": 129854475, + "router_z_loss_clip": 2.69140625, + "router_z_loss_mlp": 0.33447266, + "step": 6047, + "time_per_iteration": 2.673017740249634 + }, + { + "auxiliary_loss_clip": 0.0154374, + "auxiliary_loss_mlp": 0.00305775, + "balance_loss_clip": 1.26702213, + "balance_loss_mlp": 0.271467, + "epoch": 0.36362543213587856, + "flos": 24787053083520.0, + "grad_norm": 2.2171641734891914, + "language_loss": 0.85775459, + "learning_rate": 2.9416662554416767e-06, + "loss": 0.87624979, + "num_input_tokens_seen": 129873530, + "router_z_loss_clip": 2.765625, + "router_z_loss_mlp": 0.34301758, + "step": 6048, + "time_per_iteration": 2.6680619716644287 + }, + { + "auxiliary_loss_clip": 0.01480543, + "auxiliary_loss_mlp": 0.00087433, + "balance_loss_clip": 1.28737175, + "balance_loss_mlp": 0.07846881, + "epoch": 0.3636855553885465, + "flos": 62526369231360.0, + "grad_norm": 1.2850555927511922, + "language_loss": 0.52147937, + "learning_rate": 2.9413226458140054e-06, + "loss": 0.53715914, + "num_input_tokens_seen": 129940400, + "router_z_loss_clip": 1.9296875, + "router_z_loss_mlp": 0.08984375, + "step": 6049, + "time_per_iteration": 3.2505764961242676 + }, + { + "auxiliary_loss_clip": 0.01543851, + "auxiliary_loss_mlp": 0.00284502, + "balance_loss_clip": 1.27441239, + "balance_loss_mlp": 0.25081366, + "epoch": 0.3637456786412145, + "flos": 24060652341120.0, + "grad_norm": 73.65272320622037, + "language_loss": 0.93735886, + "learning_rate": 2.9409790004912845e-06, + "loss": 0.9556424, + "num_input_tokens_seen": 129958635, + "router_z_loss_clip": 2.69726562, + "router_z_loss_mlp": 0.33691406, + "step": 6050, + "time_per_iteration": 2.699235439300537 + }, + { + "auxiliary_loss_clip": 0.01546982, + "auxiliary_loss_mlp": 0.00281301, + "balance_loss_clip": 1.27558184, + "balance_loss_mlp": 0.24889965, + "epoch": 0.36380580189388245, + "flos": 16691495852160.0, + "grad_norm": 122.5069751169658, + "language_loss": 0.87201411, + "learning_rate": 2.940635319486546e-06, + "loss": 0.89029694, + "num_input_tokens_seen": 129977685, + "router_z_loss_clip": 2.71484375, + "router_z_loss_mlp": 0.32397461, + "step": 6051, + "time_per_iteration": 2.663992166519165 + }, + { + "auxiliary_loss_clip": 0.01537657, + "auxiliary_loss_mlp": 0.00293684, + "balance_loss_clip": 1.27002966, + "balance_loss_mlp": 0.25961429, + "epoch": 0.3638659251465504, + "flos": 25114091437440.0, + "grad_norm": 7.482870452908569, + "language_loss": 0.8813442, + "learning_rate": 2.940291602812822e-06, + "loss": 0.89965761, + "num_input_tokens_seen": 129997530, + "router_z_loss_clip": 2.67773438, + "router_z_loss_mlp": 0.34057617, + "step": 6052, + "time_per_iteration": 2.670405626296997 + }, + { + "auxiliary_loss_clip": 0.01530144, + "auxiliary_loss_mlp": 0.00284544, + "balance_loss_clip": 1.26943386, + "balance_loss_mlp": 0.2526671, + "epoch": 0.3639260483992184, + "flos": 23003011353600.0, + "grad_norm": 18.230122114809408, + "language_loss": 0.78178877, + "learning_rate": 2.939947850483145e-06, + "loss": 0.7999357, + "num_input_tokens_seen": 130017955, + "router_z_loss_clip": 2.609375, + "router_z_loss_mlp": 0.31835938, + "step": 6053, + "time_per_iteration": 2.6551151275634766 + }, + { + "auxiliary_loss_clip": 0.0145458, + "auxiliary_loss_mlp": 0.00075108, + "balance_loss_clip": 1.26205635, + "balance_loss_mlp": 0.06638142, + "epoch": 0.36398617165188635, + "flos": 70716011160960.0, + "grad_norm": 0.7675708904970995, + "language_loss": 0.60843146, + "learning_rate": 2.9396040625105532e-06, + "loss": 0.62372839, + "num_input_tokens_seen": 130074275, + "router_z_loss_clip": 1.921875, + "router_z_loss_mlp": 0.08740234, + "step": 6054, + "time_per_iteration": 6.004218101501465 + }, + { + "auxiliary_loss_clip": 0.01539285, + "auxiliary_loss_mlp": 0.00258252, + "balance_loss_clip": 1.26764131, + "balance_loss_mlp": 0.2241343, + "epoch": 0.3640462949045543, + "flos": 22235456603520.0, + "grad_norm": 9.175135137288539, + "language_loss": 0.83680958, + "learning_rate": 2.9392602389080802e-06, + "loss": 0.85478497, + "num_input_tokens_seen": 130091375, + "router_z_loss_clip": 2.71679688, + "router_z_loss_mlp": 0.34130859, + "step": 6055, + "time_per_iteration": 2.672990083694458 + }, + { + "auxiliary_loss_clip": 0.01549911, + "auxiliary_loss_mlp": 0.00266756, + "balance_loss_clip": 1.27609515, + "balance_loss_mlp": 0.23175579, + "epoch": 0.3641064181572223, + "flos": 21543529939200.0, + "grad_norm": 7.3657432637445845, + "language_loss": 0.81512678, + "learning_rate": 2.938916379688765e-06, + "loss": 0.8332935, + "num_input_tokens_seen": 130111595, + "router_z_loss_clip": 2.73828125, + "router_z_loss_mlp": 0.35009766, + "step": 6056, + "time_per_iteration": 2.639315366744995 + }, + { + "auxiliary_loss_clip": 0.01535121, + "auxiliary_loss_mlp": 0.00281203, + "balance_loss_clip": 1.26775837, + "balance_loss_mlp": 0.2480152, + "epoch": 0.3641665414098903, + "flos": 22273306560000.0, + "grad_norm": 5.450955794797858, + "language_loss": 0.8985287, + "learning_rate": 2.9385724848656468e-06, + "loss": 0.9166919, + "num_input_tokens_seen": 130131440, + "router_z_loss_clip": 2.67773438, + "router_z_loss_mlp": 0.33203125, + "step": 6057, + "time_per_iteration": 2.635474920272827 + }, + { + "auxiliary_loss_clip": 0.01526313, + "auxiliary_loss_mlp": 0.00290605, + "balance_loss_clip": 1.26011467, + "balance_loss_mlp": 0.25810874, + "epoch": 0.36422666466255826, + "flos": 28329676778880.0, + "grad_norm": 10.497085435589678, + "language_loss": 0.88319182, + "learning_rate": 2.9382285544517647e-06, + "loss": 0.90136099, + "num_input_tokens_seen": 130151375, + "router_z_loss_clip": 2.66210938, + "router_z_loss_mlp": 0.32495117, + "step": 6058, + "time_per_iteration": 2.6934762001037598 + }, + { + "auxiliary_loss_clip": 0.0151503, + "auxiliary_loss_mlp": 0.00281663, + "balance_loss_clip": 1.24927914, + "balance_loss_mlp": 0.24854672, + "epoch": 0.36428678791522623, + "flos": 24170503109760.0, + "grad_norm": 104.44578032286712, + "language_loss": 0.93046069, + "learning_rate": 2.9378845884601636e-06, + "loss": 0.94842762, + "num_input_tokens_seen": 130169960, + "router_z_loss_clip": 2.66015625, + "router_z_loss_mlp": 0.33105469, + "step": 6059, + "time_per_iteration": 2.685561180114746 + }, + { + "auxiliary_loss_clip": 0.01516576, + "auxiliary_loss_mlp": 0.00306564, + "balance_loss_clip": 1.25279284, + "balance_loss_mlp": 0.27289894, + "epoch": 0.3643469111678942, + "flos": 22528451842560.0, + "grad_norm": 24.546411651395186, + "language_loss": 0.94290125, + "learning_rate": 2.937540586903884e-06, + "loss": 0.96113271, + "num_input_tokens_seen": 130189800, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.33642578, + "step": 6060, + "time_per_iteration": 4.129201889038086 + }, + { + "auxiliary_loss_clip": 0.01526675, + "auxiliary_loss_mlp": 0.00284838, + "balance_loss_clip": 1.25496173, + "balance_loss_mlp": 0.249385, + "epoch": 0.36440703442056216, + "flos": 19426595938560.0, + "grad_norm": 4.374903796558651, + "language_loss": 0.74731898, + "learning_rate": 2.937196549795971e-06, + "loss": 0.76543409, + "num_input_tokens_seen": 130206370, + "router_z_loss_clip": 2.71679688, + "router_z_loss_mlp": 0.35449219, + "step": 6061, + "time_per_iteration": 2.698676586151123 + }, + { + "auxiliary_loss_clip": 0.01522825, + "auxiliary_loss_mlp": 0.00297349, + "balance_loss_clip": 1.25429523, + "balance_loss_mlp": 0.26492378, + "epoch": 0.3644671576732301, + "flos": 18040515966720.0, + "grad_norm": 50.76919287406095, + "language_loss": 0.86151415, + "learning_rate": 2.9368524771494718e-06, + "loss": 0.87971586, + "num_input_tokens_seen": 130224445, + "router_z_loss_clip": 2.68554688, + "router_z_loss_mlp": 0.32421875, + "step": 6062, + "time_per_iteration": 2.66644287109375 + }, + { + "auxiliary_loss_clip": 0.01523473, + "auxiliary_loss_mlp": 0.00268574, + "balance_loss_clip": 1.26201522, + "balance_loss_mlp": 0.23574382, + "epoch": 0.3645272809258981, + "flos": 21542811667200.0, + "grad_norm": 406.2891628326871, + "language_loss": 0.79788399, + "learning_rate": 2.936508368977432e-06, + "loss": 0.81580448, + "num_input_tokens_seen": 130245380, + "router_z_loss_clip": 2.61328125, + "router_z_loss_mlp": 0.328125, + "step": 6063, + "time_per_iteration": 2.69675350189209 + }, + { + "auxiliary_loss_clip": 0.01525665, + "auxiliary_loss_mlp": 0.00285572, + "balance_loss_clip": 1.25983405, + "balance_loss_mlp": 0.25145429, + "epoch": 0.36458740417856605, + "flos": 22746860490240.0, + "grad_norm": 48.359611196208725, + "language_loss": 0.74985898, + "learning_rate": 2.936164225292901e-06, + "loss": 0.7679714, + "num_input_tokens_seen": 130265575, + "router_z_loss_clip": 2.65625, + "router_z_loss_mlp": 0.34094238, + "step": 6064, + "time_per_iteration": 2.6221470832824707 + }, + { + "auxiliary_loss_clip": 0.0152708, + "auxiliary_loss_mlp": 0.00261971, + "balance_loss_clip": 1.26068163, + "balance_loss_mlp": 0.23131019, + "epoch": 0.364647527431234, + "flos": 26140670138880.0, + "grad_norm": 62.59034005758984, + "language_loss": 0.83088374, + "learning_rate": 2.9358200461089297e-06, + "loss": 0.84877419, + "num_input_tokens_seen": 130286195, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.30639648, + "step": 6065, + "time_per_iteration": 2.687039613723755 + }, + { + "auxiliary_loss_clip": 0.01533501, + "auxiliary_loss_mlp": 0.00291792, + "balance_loss_clip": 1.26465452, + "balance_loss_mlp": 0.25707871, + "epoch": 0.364707650683902, + "flos": 31029907737600.0, + "grad_norm": 15.817982801443712, + "language_loss": 0.83524036, + "learning_rate": 2.9354758314385676e-06, + "loss": 0.85349321, + "num_input_tokens_seen": 130306095, + "router_z_loss_clip": 2.6875, + "router_z_loss_mlp": 0.34692383, + "step": 6066, + "time_per_iteration": 4.131505966186523 + }, + { + "auxiliary_loss_clip": 0.0153238, + "auxiliary_loss_mlp": 0.00279198, + "balance_loss_clip": 1.26846087, + "balance_loss_mlp": 0.24720195, + "epoch": 0.36476777393656995, + "flos": 19572896033280.0, + "grad_norm": 2546.902450036903, + "language_loss": 0.85629672, + "learning_rate": 2.9351315812948684e-06, + "loss": 0.87441248, + "num_input_tokens_seen": 130324685, + "router_z_loss_clip": 2.63476562, + "router_z_loss_mlp": 0.31982422, + "step": 6067, + "time_per_iteration": 2.642425537109375 + }, + { + "auxiliary_loss_clip": 0.01538912, + "auxiliary_loss_mlp": 0.00244993, + "balance_loss_clip": 1.27447522, + "balance_loss_mlp": 0.21499977, + "epoch": 0.3648278971892379, + "flos": 17748849530880.0, + "grad_norm": 10.654989292663304, + "language_loss": 0.81576514, + "learning_rate": 2.934787295690886e-06, + "loss": 0.83360416, + "num_input_tokens_seen": 130343855, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.29980469, + "step": 6068, + "time_per_iteration": 2.6299734115600586 + }, + { + "auxiliary_loss_clip": 0.01544501, + "auxiliary_loss_mlp": 0.00293013, + "balance_loss_clip": 1.27190506, + "balance_loss_mlp": 0.26075488, + "epoch": 0.3648880204419059, + "flos": 17931167988480.0, + "grad_norm": 35.92315833391162, + "language_loss": 0.82798523, + "learning_rate": 2.9344429746396755e-06, + "loss": 0.84636033, + "num_input_tokens_seen": 130362320, + "router_z_loss_clip": 2.7265625, + "router_z_loss_mlp": 0.32275391, + "step": 6069, + "time_per_iteration": 2.6106553077697754 + }, + { + "auxiliary_loss_clip": 0.01544859, + "auxiliary_loss_mlp": 0.00307898, + "balance_loss_clip": 1.27584243, + "balance_loss_mlp": 0.27554488, + "epoch": 0.3649481436945739, + "flos": 22638266697600.0, + "grad_norm": 4.011972269425396, + "language_loss": 0.73193288, + "learning_rate": 2.9340986181542945e-06, + "loss": 0.75046039, + "num_input_tokens_seen": 130383165, + "router_z_loss_clip": 2.69140625, + "router_z_loss_mlp": 0.32324219, + "step": 6070, + "time_per_iteration": 2.6845672130584717 + }, + { + "auxiliary_loss_clip": 0.01557875, + "auxiliary_loss_mlp": 0.00281759, + "balance_loss_clip": 1.28910065, + "balance_loss_mlp": 0.25119343, + "epoch": 0.36500826694724187, + "flos": 21579656042880.0, + "grad_norm": 2.8208110705131384, + "language_loss": 0.82751405, + "learning_rate": 2.9337542262477994e-06, + "loss": 0.84591043, + "num_input_tokens_seen": 130402425, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.30566406, + "step": 6071, + "time_per_iteration": 2.656904458999634 + }, + { + "auxiliary_loss_clip": 0.01558068, + "auxiliary_loss_mlp": 0.00280647, + "balance_loss_clip": 1.28663445, + "balance_loss_mlp": 0.24691066, + "epoch": 0.36506839019990983, + "flos": 13772533023360.0, + "grad_norm": 89.08184755943006, + "language_loss": 0.96525317, + "learning_rate": 2.9334097989332506e-06, + "loss": 0.98364031, + "num_input_tokens_seen": 130419440, + "router_z_loss_clip": 2.71679688, + "router_z_loss_mlp": 0.33740234, + "step": 6072, + "time_per_iteration": 2.6069490909576416 + }, + { + "auxiliary_loss_clip": 0.01562151, + "auxiliary_loss_mlp": 0.00256279, + "balance_loss_clip": 1.29117668, + "balance_loss_mlp": 0.22421132, + "epoch": 0.3651285134525778, + "flos": 17274972378240.0, + "grad_norm": 37.133775943062844, + "language_loss": 0.81402969, + "learning_rate": 2.9330653362237094e-06, + "loss": 0.832214, + "num_input_tokens_seen": 130438495, + "router_z_loss_clip": 2.70898438, + "router_z_loss_mlp": 0.32080078, + "step": 6073, + "time_per_iteration": 2.753474235534668 + }, + { + "auxiliary_loss_clip": 0.01561402, + "auxiliary_loss_mlp": 0.00266173, + "balance_loss_clip": 1.28750324, + "balance_loss_mlp": 0.23248425, + "epoch": 0.36518863670524576, + "flos": 21907987286400.0, + "grad_norm": 4.221161573272919, + "language_loss": 0.76495266, + "learning_rate": 2.932720838132236e-06, + "loss": 0.78322846, + "num_input_tokens_seen": 130455575, + "router_z_loss_clip": 2.74023438, + "router_z_loss_mlp": 0.33691406, + "step": 6074, + "time_per_iteration": 2.8941547870635986 + }, + { + "auxiliary_loss_clip": 0.01555319, + "auxiliary_loss_mlp": 0.00245827, + "balance_loss_clip": 1.28450274, + "balance_loss_mlp": 0.21397406, + "epoch": 0.3652487599579137, + "flos": 27122180250240.0, + "grad_norm": 5.64875876279917, + "language_loss": 0.78984416, + "learning_rate": 2.9323763046718954e-06, + "loss": 0.80785561, + "num_input_tokens_seen": 130476385, + "router_z_loss_clip": 2.70898438, + "router_z_loss_mlp": 0.31860352, + "step": 6075, + "time_per_iteration": 2.7034752368927 + }, + { + "auxiliary_loss_clip": 0.01557123, + "auxiliary_loss_mlp": 0.00242868, + "balance_loss_clip": 1.28375387, + "balance_loss_mlp": 0.21063364, + "epoch": 0.3653088832105817, + "flos": 19755573626880.0, + "grad_norm": 3.546096196920162, + "language_loss": 0.95922494, + "learning_rate": 2.9320317358557524e-06, + "loss": 0.97722483, + "num_input_tokens_seen": 130493630, + "router_z_loss_clip": 2.734375, + "router_z_loss_mlp": 0.32226562, + "step": 6076, + "time_per_iteration": 2.6391875743865967 + }, + { + "auxiliary_loss_clip": 0.01561317, + "auxiliary_loss_mlp": 0.00279743, + "balance_loss_clip": 1.29081345, + "balance_loss_mlp": 0.24476665, + "epoch": 0.36536900646324966, + "flos": 13115008609920.0, + "grad_norm": 46.31162874402638, + "language_loss": 0.75861573, + "learning_rate": 2.931687131696872e-06, + "loss": 0.77702636, + "num_input_tokens_seen": 130510735, + "router_z_loss_clip": 2.70507812, + "router_z_loss_mlp": 0.34985352, + "step": 6077, + "time_per_iteration": 2.7347724437713623 + }, + { + "auxiliary_loss_clip": 0.01462601, + "auxiliary_loss_mlp": 0.00114287, + "balance_loss_clip": 1.26961398, + "balance_loss_mlp": 0.10556089, + "epoch": 0.3654291297159176, + "flos": 71100472383360.0, + "grad_norm": 3.0550123453188296, + "language_loss": 0.6141417, + "learning_rate": 2.9313424922083224e-06, + "loss": 0.62991059, + "num_input_tokens_seen": 130577050, + "router_z_loss_clip": 1.9296875, + "router_z_loss_mlp": 0.08740234, + "step": 6078, + "time_per_iteration": 3.2844815254211426 + }, + { + "auxiliary_loss_clip": 0.01581453, + "auxiliary_loss_mlp": 0.00282046, + "balance_loss_clip": 1.30668855, + "balance_loss_mlp": 0.24852422, + "epoch": 0.3654892529685856, + "flos": 23617478338560.0, + "grad_norm": 24.101224160338298, + "language_loss": 0.84300232, + "learning_rate": 2.930997817403173e-06, + "loss": 0.86163735, + "num_input_tokens_seen": 130593780, + "router_z_loss_clip": 2.74804688, + "router_z_loss_mlp": 0.33520508, + "step": 6079, + "time_per_iteration": 2.6630890369415283 + }, + { + "auxiliary_loss_clip": 0.01580291, + "auxiliary_loss_mlp": 0.00301506, + "balance_loss_clip": 1.30358315, + "balance_loss_mlp": 0.2691288, + "epoch": 0.36554937622125355, + "flos": 43470799850880.0, + "grad_norm": 4.212144363598613, + "language_loss": 0.70282459, + "learning_rate": 2.9306531072944913e-06, + "loss": 0.72164255, + "num_input_tokens_seen": 130615510, + "router_z_loss_clip": 2.76757812, + "router_z_loss_mlp": 0.32373047, + "step": 6080, + "time_per_iteration": 2.837479829788208 + }, + { + "auxiliary_loss_clip": 0.01587271, + "auxiliary_loss_mlp": 0.00317569, + "balance_loss_clip": 1.30594134, + "balance_loss_mlp": 0.28154406, + "epoch": 0.3656094994739215, + "flos": 23294641875840.0, + "grad_norm": 589.4720887562161, + "language_loss": 0.78429329, + "learning_rate": 2.930308361895352e-06, + "loss": 0.80334169, + "num_input_tokens_seen": 130635410, + "router_z_loss_clip": 2.8125, + "router_z_loss_mlp": 0.3605957, + "step": 6081, + "time_per_iteration": 2.6781325340270996 + }, + { + "auxiliary_loss_clip": 0.01593103, + "auxiliary_loss_mlp": 0.00303687, + "balance_loss_clip": 1.30926728, + "balance_loss_mlp": 0.2705231, + "epoch": 0.3656696227265895, + "flos": 24571984400640.0, + "grad_norm": 5.626836758181694, + "language_loss": 0.7937969, + "learning_rate": 2.9299635812188257e-06, + "loss": 0.81276482, + "num_input_tokens_seen": 130657725, + "router_z_loss_clip": 2.8359375, + "router_z_loss_mlp": 0.33178711, + "step": 6082, + "time_per_iteration": 2.8027541637420654 + }, + { + "auxiliary_loss_clip": 0.01610725, + "auxiliary_loss_mlp": 0.00281395, + "balance_loss_clip": 1.32741046, + "balance_loss_mlp": 0.24801643, + "epoch": 0.3657297459792575, + "flos": 27928375056000.0, + "grad_norm": 41.489731275449884, + "language_loss": 0.89300424, + "learning_rate": 2.929618765277987e-06, + "loss": 0.91192544, + "num_input_tokens_seen": 130678360, + "router_z_loss_clip": 2.83398438, + "router_z_loss_mlp": 0.33398438, + "step": 6083, + "time_per_iteration": 2.710726022720337 + }, + { + "auxiliary_loss_clip": 0.01618784, + "auxiliary_loss_mlp": 0.00130371, + "balance_loss_clip": 1.40661621, + "balance_loss_mlp": 0.11907019, + "epoch": 0.36578986923192547, + "flos": 67392622126080.0, + "grad_norm": 0.8110750155270271, + "language_loss": 0.58786154, + "learning_rate": 2.9292739140859125e-06, + "loss": 0.60535306, + "num_input_tokens_seen": 130742110, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.11279297, + "step": 6084, + "time_per_iteration": 3.326092481613159 + }, + { + "auxiliary_loss_clip": 0.01615214, + "auxiliary_loss_mlp": 0.00286156, + "balance_loss_clip": 1.33249462, + "balance_loss_mlp": 0.25430301, + "epoch": 0.36584999248459343, + "flos": 20227511445120.0, + "grad_norm": 2.588025660733764, + "language_loss": 0.79208302, + "learning_rate": 2.9289290276556767e-06, + "loss": 0.81109673, + "num_input_tokens_seen": 130759870, + "router_z_loss_clip": 2.82617188, + "router_z_loss_mlp": 0.31835938, + "step": 6085, + "time_per_iteration": 2.709677219390869 + }, + { + "auxiliary_loss_clip": 0.01634338, + "auxiliary_loss_mlp": 0.00315471, + "balance_loss_clip": 1.34321284, + "balance_loss_mlp": 0.28183046, + "epoch": 0.3659101157372614, + "flos": 19062461813760.0, + "grad_norm": 53.541159927667394, + "language_loss": 0.84978461, + "learning_rate": 2.9285841060003604e-06, + "loss": 0.86928266, + "num_input_tokens_seen": 130778510, + "router_z_loss_clip": 2.91015625, + "router_z_loss_mlp": 0.33642578, + "step": 6086, + "time_per_iteration": 2.6481964588165283 + }, + { + "auxiliary_loss_clip": 0.01631416, + "auxiliary_loss_mlp": 0.00302168, + "balance_loss_clip": 1.34824133, + "balance_loss_mlp": 0.26979101, + "epoch": 0.36597023898992936, + "flos": 30810708990720.0, + "grad_norm": 57.84596972249884, + "language_loss": 0.85064363, + "learning_rate": 2.9282391491330416e-06, + "loss": 0.8699795, + "num_input_tokens_seen": 130798535, + "router_z_loss_clip": 2.828125, + "router_z_loss_mlp": 0.32373047, + "step": 6087, + "time_per_iteration": 2.753568649291992 + }, + { + "auxiliary_loss_clip": 0.01634526, + "auxiliary_loss_mlp": 0.00350714, + "balance_loss_clip": 1.33934951, + "balance_loss_mlp": 0.31418782, + "epoch": 0.36603036224259733, + "flos": 20521799573760.0, + "grad_norm": 28.568795364350212, + "language_loss": 0.80449772, + "learning_rate": 2.9278941570668002e-06, + "loss": 0.82435012, + "num_input_tokens_seen": 130816655, + "router_z_loss_clip": 2.95507812, + "router_z_loss_mlp": 0.36523438, + "step": 6088, + "time_per_iteration": 2.6144182682037354 + }, + { + "auxiliary_loss_clip": 0.01634974, + "auxiliary_loss_mlp": 0.00318195, + "balance_loss_clip": 1.33167541, + "balance_loss_mlp": 0.2818836, + "epoch": 0.3660904854952653, + "flos": 38329397798400.0, + "grad_norm": 4.919271960365891, + "language_loss": 0.87104511, + "learning_rate": 2.92754912981472e-06, + "loss": 0.89057684, + "num_input_tokens_seen": 130841225, + "router_z_loss_clip": 3.03320312, + "router_z_loss_mlp": 0.36303711, + "step": 6089, + "time_per_iteration": 2.7797272205352783 + }, + { + "auxiliary_loss_clip": 0.01654192, + "auxiliary_loss_mlp": 0.00338518, + "balance_loss_clip": 1.36207843, + "balance_loss_mlp": 0.30141979, + "epoch": 0.36615060874793326, + "flos": 21835555511040.0, + "grad_norm": 14.005567487824953, + "language_loss": 0.78157586, + "learning_rate": 2.927204067389884e-06, + "loss": 0.80150294, + "num_input_tokens_seen": 130861050, + "router_z_loss_clip": 2.921875, + "router_z_loss_mlp": 0.37060547, + "step": 6090, + "time_per_iteration": 2.621188163757324 + }, + { + "auxiliary_loss_clip": 0.01661474, + "auxiliary_loss_mlp": 0.00333503, + "balance_loss_clip": 1.36517477, + "balance_loss_mlp": 0.30002916, + "epoch": 0.3662107320006012, + "flos": 16581537342720.0, + "grad_norm": 163.62107897288536, + "language_loss": 0.80666095, + "learning_rate": 2.9268589698053763e-06, + "loss": 0.82661068, + "num_input_tokens_seen": 130879775, + "router_z_loss_clip": 2.9609375, + "router_z_loss_mlp": 0.33496094, + "step": 6091, + "time_per_iteration": 2.611182928085327 + }, + { + "auxiliary_loss_clip": 0.01654078, + "auxiliary_loss_mlp": 0.00291181, + "balance_loss_clip": 1.36181128, + "balance_loss_mlp": 0.25780213, + "epoch": 0.3662708552532692, + "flos": 20958365473920.0, + "grad_norm": 21.849799775193116, + "language_loss": 0.81032979, + "learning_rate": 2.926513837074284e-06, + "loss": 0.82978231, + "num_input_tokens_seen": 130898070, + "router_z_loss_clip": 2.92382812, + "router_z_loss_mlp": 0.33398438, + "step": 6092, + "time_per_iteration": 2.6020522117614746 + }, + { + "auxiliary_loss_clip": 0.01652464, + "auxiliary_loss_mlp": 0.00306308, + "balance_loss_clip": 1.35506821, + "balance_loss_mlp": 0.2684229, + "epoch": 0.36633097850593715, + "flos": 21902707987200.0, + "grad_norm": 9.122981889426043, + "language_loss": 0.8719418, + "learning_rate": 2.9261686692096942e-06, + "loss": 0.89152956, + "num_input_tokens_seen": 130915250, + "router_z_loss_clip": 2.97460938, + "router_z_loss_mlp": 0.37866211, + "step": 6093, + "time_per_iteration": 2.650583028793335 + }, + { + "auxiliary_loss_clip": 0.01659137, + "auxiliary_loss_mlp": 0.00322975, + "balance_loss_clip": 1.36031866, + "balance_loss_mlp": 0.2865451, + "epoch": 0.3663911017586051, + "flos": 32854133808000.0, + "grad_norm": 3.269023043334289, + "language_loss": 0.80578983, + "learning_rate": 2.925823466224696e-06, + "loss": 0.82561094, + "num_input_tokens_seen": 130936995, + "router_z_loss_clip": 2.98828125, + "router_z_loss_mlp": 0.36450195, + "step": 6094, + "time_per_iteration": 2.7127485275268555 + }, + { + "auxiliary_loss_clip": 0.01649706, + "auxiliary_loss_mlp": 0.00297215, + "balance_loss_clip": 1.35539865, + "balance_loss_mlp": 0.2616908, + "epoch": 0.3664512250112731, + "flos": 27271748482560.0, + "grad_norm": 3.672337963270092, + "language_loss": 0.85190296, + "learning_rate": 2.9254782281323785e-06, + "loss": 0.87137216, + "num_input_tokens_seen": 130957970, + "router_z_loss_clip": 2.9375, + "router_z_loss_mlp": 0.35534668, + "step": 6095, + "time_per_iteration": 2.68685245513916 + }, + { + "auxiliary_loss_clip": 0.01641562, + "auxiliary_loss_mlp": 0.0030452, + "balance_loss_clip": 1.34191084, + "balance_loss_mlp": 0.26866212, + "epoch": 0.3665113482639411, + "flos": 17784436930560.0, + "grad_norm": 19.080317955024817, + "language_loss": 0.8532604, + "learning_rate": 2.925132954945834e-06, + "loss": 0.87272125, + "num_input_tokens_seen": 130974915, + "router_z_loss_clip": 2.99609375, + "router_z_loss_mlp": 0.35839844, + "step": 6096, + "time_per_iteration": 4.085319757461548 + }, + { + "auxiliary_loss_clip": 0.01655309, + "auxiliary_loss_mlp": 0.00311297, + "balance_loss_clip": 1.35524929, + "balance_loss_mlp": 0.27529615, + "epoch": 0.36657147151660907, + "flos": 27854614477440.0, + "grad_norm": 62.52904187131912, + "language_loss": 0.74879795, + "learning_rate": 2.924787646678155e-06, + "loss": 0.76846403, + "num_input_tokens_seen": 130995745, + "router_z_loss_clip": 3.00390625, + "router_z_loss_mlp": 0.35986328, + "step": 6097, + "time_per_iteration": 4.157190799713135 + }, + { + "auxiliary_loss_clip": 0.01653087, + "auxiliary_loss_mlp": 0.00338079, + "balance_loss_clip": 1.35605359, + "balance_loss_mlp": 0.30322173, + "epoch": 0.36663159476927704, + "flos": 25374013228800.0, + "grad_norm": 7.791021071818948, + "language_loss": 0.8341344, + "learning_rate": 2.9244423033424365e-06, + "loss": 0.85404605, + "num_input_tokens_seen": 131015545, + "router_z_loss_clip": 2.96875, + "router_z_loss_mlp": 0.34863281, + "step": 6098, + "time_per_iteration": 2.7568814754486084 + }, + { + "auxiliary_loss_clip": 0.01658424, + "auxiliary_loss_mlp": 0.0029034, + "balance_loss_clip": 1.36070538, + "balance_loss_mlp": 0.25724721, + "epoch": 0.366691718021945, + "flos": 21357225072000.0, + "grad_norm": 4.2181137947259355, + "language_loss": 0.80546516, + "learning_rate": 2.9240969249517723e-06, + "loss": 0.82495284, + "num_input_tokens_seen": 131033990, + "router_z_loss_clip": 2.9765625, + "router_z_loss_mlp": 0.33105469, + "step": 6099, + "time_per_iteration": 2.681466579437256 + }, + { + "auxiliary_loss_clip": 0.01654652, + "auxiliary_loss_mlp": 0.00281056, + "balance_loss_clip": 1.35878944, + "balance_loss_mlp": 0.24538897, + "epoch": 0.36675184127461297, + "flos": 16800376953600.0, + "grad_norm": 15.058799386051357, + "language_loss": 0.91622061, + "learning_rate": 2.9237515115192602e-06, + "loss": 0.93557769, + "num_input_tokens_seen": 131050710, + "router_z_loss_clip": 2.95507812, + "router_z_loss_mlp": 0.35668945, + "step": 6100, + "time_per_iteration": 2.692492723464966 + }, + { + "auxiliary_loss_clip": 0.0164273, + "auxiliary_loss_mlp": 0.00330367, + "balance_loss_clip": 1.3429991, + "balance_loss_mlp": 0.29257739, + "epoch": 0.36681196452728093, + "flos": 21906514828800.0, + "grad_norm": 20.172228681170047, + "language_loss": 0.79219627, + "learning_rate": 2.9234060630579992e-06, + "loss": 0.8119272, + "num_input_tokens_seen": 131071435, + "router_z_loss_clip": 2.99609375, + "router_z_loss_mlp": 0.37792969, + "step": 6101, + "time_per_iteration": 2.671046018600464 + }, + { + "auxiliary_loss_clip": 0.01666463, + "auxiliary_loss_mlp": 0.00301017, + "balance_loss_clip": 1.36272252, + "balance_loss_mlp": 0.26775753, + "epoch": 0.3668720877799489, + "flos": 17712436118400.0, + "grad_norm": 4.060481140914331, + "language_loss": 0.85496509, + "learning_rate": 2.9230605795810865e-06, + "loss": 0.87463987, + "num_input_tokens_seen": 131088775, + "router_z_loss_clip": 3.04101562, + "router_z_loss_mlp": 0.33251953, + "step": 6102, + "time_per_iteration": 4.033511638641357 + }, + { + "auxiliary_loss_clip": 0.01657969, + "auxiliary_loss_mlp": 0.00301341, + "balance_loss_clip": 1.35543835, + "balance_loss_mlp": 0.26340821, + "epoch": 0.36693221103261686, + "flos": 47045455499520.0, + "grad_norm": 211.77160381442087, + "language_loss": 0.78094637, + "learning_rate": 2.922715061101625e-06, + "loss": 0.80053937, + "num_input_tokens_seen": 131112800, + "router_z_loss_clip": 3.0234375, + "router_z_loss_mlp": 0.37915039, + "step": 6103, + "time_per_iteration": 2.909019708633423 + }, + { + "auxiliary_loss_clip": 0.01650752, + "auxiliary_loss_mlp": 0.00318334, + "balance_loss_clip": 1.35172701, + "balance_loss_mlp": 0.28481275, + "epoch": 0.3669923342852848, + "flos": 15960929132160.0, + "grad_norm": 32.804376977435695, + "language_loss": 0.80321783, + "learning_rate": 2.922369507632716e-06, + "loss": 0.82290876, + "num_input_tokens_seen": 131131150, + "router_z_loss_clip": 2.99023438, + "router_z_loss_mlp": 0.33544922, + "step": 6104, + "time_per_iteration": 2.623243570327759 + }, + { + "auxiliary_loss_clip": 0.01660307, + "auxiliary_loss_mlp": 0.0029579, + "balance_loss_clip": 1.35749435, + "balance_loss_mlp": 0.2601698, + "epoch": 0.3670524575379528, + "flos": 19974485064960.0, + "grad_norm": 113.0315206940846, + "language_loss": 0.89020491, + "learning_rate": 2.9220239191874617e-06, + "loss": 0.90976584, + "num_input_tokens_seen": 131150365, + "router_z_loss_clip": 3.03320312, + "router_z_loss_mlp": 0.35644531, + "step": 6105, + "time_per_iteration": 2.6984660625457764 + }, + { + "auxiliary_loss_clip": 0.01646013, + "auxiliary_loss_mlp": 0.00322827, + "balance_loss_clip": 1.34136462, + "balance_loss_mlp": 0.28653991, + "epoch": 0.36711258079062076, + "flos": 25702955003520.0, + "grad_norm": 3.257065789417559, + "language_loss": 0.87883806, + "learning_rate": 2.9216782957789692e-06, + "loss": 0.89852643, + "num_input_tokens_seen": 131169310, + "router_z_loss_clip": 3.04882812, + "router_z_loss_mlp": 0.36279297, + "step": 6106, + "time_per_iteration": 2.6917905807495117 + }, + { + "auxiliary_loss_clip": 0.0169502, + "auxiliary_loss_mlp": 0.00128248, + "balance_loss_clip": 1.44733262, + "balance_loss_mlp": 0.11852089, + "epoch": 0.3671727040432887, + "flos": 60772743342720.0, + "grad_norm": 5.766417196467903, + "language_loss": 0.58992648, + "learning_rate": 2.9213326374203426e-06, + "loss": 0.60815918, + "num_input_tokens_seen": 131232900, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.09716797, + "step": 6107, + "time_per_iteration": 3.219575881958008 + }, + { + "auxiliary_loss_clip": 0.01659753, + "auxiliary_loss_mlp": 0.00291199, + "balance_loss_clip": 1.3665055, + "balance_loss_mlp": 0.25638992, + "epoch": 0.3672328272959567, + "flos": 18661303745280.0, + "grad_norm": 2.383125876257265, + "language_loss": 0.80668378, + "learning_rate": 2.92098694412469e-06, + "loss": 0.82619327, + "num_input_tokens_seen": 131250920, + "router_z_loss_clip": 2.93554688, + "router_z_loss_mlp": 0.34790039, + "step": 6108, + "time_per_iteration": 4.030089378356934 + }, + { + "auxiliary_loss_clip": 0.0166168, + "auxiliary_loss_mlp": 0.0031021, + "balance_loss_clip": 1.35587168, + "balance_loss_mlp": 0.27594921, + "epoch": 0.3672929505486247, + "flos": 15049049535360.0, + "grad_norm": 29.02903292230155, + "language_loss": 0.83583999, + "learning_rate": 2.9206412159051213e-06, + "loss": 0.85555887, + "num_input_tokens_seen": 131267910, + "router_z_loss_clip": 3.0546875, + "router_z_loss_mlp": 0.34228516, + "step": 6109, + "time_per_iteration": 2.579540491104126 + }, + { + "auxiliary_loss_clip": 0.01648261, + "auxiliary_loss_mlp": 0.00314217, + "balance_loss_clip": 1.35537887, + "balance_loss_mlp": 0.28031406, + "epoch": 0.3673530738012927, + "flos": 20589347099520.0, + "grad_norm": 107.21459818008167, + "language_loss": 0.60468501, + "learning_rate": 2.920295452774744e-06, + "loss": 0.62430984, + "num_input_tokens_seen": 131287150, + "router_z_loss_clip": 2.9296875, + "router_z_loss_mlp": 0.33886719, + "step": 6110, + "time_per_iteration": 2.6516590118408203 + }, + { + "auxiliary_loss_clip": 0.01671875, + "auxiliary_loss_mlp": 0.00317994, + "balance_loss_clip": 1.37172866, + "balance_loss_mlp": 0.28125349, + "epoch": 0.36741319705396064, + "flos": 21689830033920.0, + "grad_norm": 5.047102067877273, + "language_loss": 0.85373867, + "learning_rate": 2.919949654746672e-06, + "loss": 0.87363732, + "num_input_tokens_seen": 131308225, + "router_z_loss_clip": 3.00390625, + "router_z_loss_mlp": 0.36767578, + "step": 6111, + "time_per_iteration": 2.6954715251922607 + }, + { + "auxiliary_loss_clip": 0.01659555, + "auxiliary_loss_mlp": 0.00311551, + "balance_loss_clip": 1.36380005, + "balance_loss_mlp": 0.276456, + "epoch": 0.3674733203066286, + "flos": 29862200499840.0, + "grad_norm": 8.45879770281599, + "language_loss": 0.78262222, + "learning_rate": 2.9196038218340163e-06, + "loss": 0.80233324, + "num_input_tokens_seen": 131332115, + "router_z_loss_clip": 2.95703125, + "router_z_loss_mlp": 0.35083008, + "step": 6112, + "time_per_iteration": 2.7311718463897705 + }, + { + "auxiliary_loss_clip": 0.01658533, + "auxiliary_loss_mlp": 0.00317767, + "balance_loss_clip": 1.36111975, + "balance_loss_mlp": 0.2858668, + "epoch": 0.36753344355929657, + "flos": 18257021193600.0, + "grad_norm": 25.358928071103804, + "language_loss": 0.90774941, + "learning_rate": 2.919257954049892e-06, + "loss": 0.92751241, + "num_input_tokens_seen": 131351885, + "router_z_loss_clip": 2.97265625, + "router_z_loss_mlp": 0.3190918, + "step": 6113, + "time_per_iteration": 2.717071294784546 + }, + { + "auxiliary_loss_clip": 0.01671641, + "auxiliary_loss_mlp": 0.00343802, + "balance_loss_clip": 1.36965334, + "balance_loss_mlp": 0.30932629, + "epoch": 0.36759356681196453, + "flos": 25301150490240.0, + "grad_norm": 2.2173369351011214, + "language_loss": 0.88578683, + "learning_rate": 2.918912051407413e-06, + "loss": 0.90594125, + "num_input_tokens_seen": 131370245, + "router_z_loss_clip": 3.015625, + "router_z_loss_mlp": 0.34472656, + "step": 6114, + "time_per_iteration": 2.6451780796051025 + }, + { + "auxiliary_loss_clip": 0.01657791, + "auxiliary_loss_mlp": 0.00314294, + "balance_loss_clip": 1.35778821, + "balance_loss_mlp": 0.27898377, + "epoch": 0.3676536900646325, + "flos": 21032952065280.0, + "grad_norm": 2.1155994279674406, + "language_loss": 0.76314777, + "learning_rate": 2.918566113919698e-06, + "loss": 0.78286862, + "num_input_tokens_seen": 131388115, + "router_z_loss_clip": 2.99804688, + "router_z_loss_mlp": 0.35327148, + "step": 6115, + "time_per_iteration": 2.627621650695801 + }, + { + "auxiliary_loss_clip": 0.01683332, + "auxiliary_loss_mlp": 0.00322721, + "balance_loss_clip": 1.38622904, + "balance_loss_mlp": 0.28927055, + "epoch": 0.36771381331730046, + "flos": 16288506190080.0, + "grad_norm": 61.14663255603954, + "language_loss": 0.85632932, + "learning_rate": 2.9182201415998636e-06, + "loss": 0.87638986, + "num_input_tokens_seen": 131404595, + "router_z_loss_clip": 2.97265625, + "router_z_loss_mlp": 0.33447266, + "step": 6116, + "time_per_iteration": 2.5789105892181396 + }, + { + "auxiliary_loss_clip": 0.01685298, + "auxiliary_loss_mlp": 0.00300148, + "balance_loss_clip": 1.38188279, + "balance_loss_mlp": 0.26553011, + "epoch": 0.36777393656996843, + "flos": 22309971367680.0, + "grad_norm": 2.2397869133562254, + "language_loss": 0.70879138, + "learning_rate": 2.9178741344610286e-06, + "loss": 0.72864586, + "num_input_tokens_seen": 131423760, + "router_z_loss_clip": 3.03125, + "router_z_loss_mlp": 0.34643555, + "step": 6117, + "time_per_iteration": 2.728520154953003 + }, + { + "auxiliary_loss_clip": 0.01675387, + "auxiliary_loss_mlp": 0.00312139, + "balance_loss_clip": 1.37553155, + "balance_loss_mlp": 0.2806673, + "epoch": 0.3678340598226364, + "flos": 26834069260800.0, + "grad_norm": 6.193472498482061, + "language_loss": 0.81954849, + "learning_rate": 2.9175280925163156e-06, + "loss": 0.83942378, + "num_input_tokens_seen": 131444955, + "router_z_loss_clip": 2.99609375, + "router_z_loss_mlp": 0.31469727, + "step": 6118, + "time_per_iteration": 2.745534896850586 + }, + { + "auxiliary_loss_clip": 0.01689399, + "auxiliary_loss_mlp": 0.00322835, + "balance_loss_clip": 1.38776326, + "balance_loss_mlp": 0.2882883, + "epoch": 0.36789418307530436, + "flos": 21761723105280.0, + "grad_norm": 9.901698974419121, + "language_loss": 0.79580975, + "learning_rate": 2.9171820157788445e-06, + "loss": 0.8159321, + "num_input_tokens_seen": 131465720, + "router_z_loss_clip": 3.01953125, + "router_z_loss_mlp": 0.34570312, + "step": 6119, + "time_per_iteration": 2.7610905170440674 + }, + { + "auxiliary_loss_clip": 0.0170244, + "auxiliary_loss_mlp": 0.00299706, + "balance_loss_clip": 1.40154111, + "balance_loss_mlp": 0.26608884, + "epoch": 0.3679543063279723, + "flos": 15924192497280.0, + "grad_norm": 47.909885142599734, + "language_loss": 0.87013829, + "learning_rate": 2.9168359042617404e-06, + "loss": 0.89015973, + "num_input_tokens_seen": 131483080, + "router_z_loss_clip": 3.00976562, + "router_z_loss_mlp": 0.3359375, + "step": 6120, + "time_per_iteration": 2.7355473041534424 + }, + { + "auxiliary_loss_clip": 0.01706873, + "auxiliary_loss_mlp": 0.00301846, + "balance_loss_clip": 1.40236592, + "balance_loss_mlp": 0.26985043, + "epoch": 0.3680144295806403, + "flos": 24275541456000.0, + "grad_norm": 10.971688026495075, + "language_loss": 0.74156022, + "learning_rate": 2.916489757978126e-06, + "loss": 0.76164746, + "num_input_tokens_seen": 131502545, + "router_z_loss_clip": 3.046875, + "router_z_loss_mlp": 0.31982422, + "step": 6121, + "time_per_iteration": 2.672490358352661 + }, + { + "auxiliary_loss_clip": 0.01714428, + "auxiliary_loss_mlp": 0.00324305, + "balance_loss_clip": 1.40910709, + "balance_loss_mlp": 0.29288173, + "epoch": 0.36807455283330826, + "flos": 26104148985600.0, + "grad_norm": 2.2297940306770565, + "language_loss": 0.80173624, + "learning_rate": 2.9161435769411286e-06, + "loss": 0.82212359, + "num_input_tokens_seen": 131522155, + "router_z_loss_clip": 3.05664062, + "router_z_loss_mlp": 0.31420898, + "step": 6122, + "time_per_iteration": 2.6889796257019043 + }, + { + "auxiliary_loss_clip": 0.01703796, + "auxiliary_loss_mlp": 0.00282232, + "balance_loss_clip": 1.40663457, + "balance_loss_mlp": 0.24768497, + "epoch": 0.3681346760859763, + "flos": 24644990793600.0, + "grad_norm": 117.88835064706055, + "language_loss": 0.74764729, + "learning_rate": 2.915797361163875e-06, + "loss": 0.76750755, + "num_input_tokens_seen": 131543865, + "router_z_loss_clip": 2.97265625, + "router_z_loss_mlp": 0.34545898, + "step": 6123, + "time_per_iteration": 2.770402669906616 + }, + { + "auxiliary_loss_clip": 0.01707917, + "auxiliary_loss_mlp": 0.00317882, + "balance_loss_clip": 1.39919376, + "balance_loss_mlp": 0.28307313, + "epoch": 0.36819479933864424, + "flos": 23878369797120.0, + "grad_norm": 16.707481766684513, + "language_loss": 0.83276492, + "learning_rate": 2.9154511106594933e-06, + "loss": 0.85302293, + "num_input_tokens_seen": 131562155, + "router_z_loss_clip": 3.09179688, + "router_z_loss_mlp": 0.34814453, + "step": 6124, + "time_per_iteration": 2.7541017532348633 + }, + { + "auxiliary_loss_clip": 0.01718555, + "auxiliary_loss_mlp": 0.00325106, + "balance_loss_clip": 1.40814495, + "balance_loss_mlp": 0.28984356, + "epoch": 0.3682549225913122, + "flos": 25553997302400.0, + "grad_norm": 35.97742056864481, + "language_loss": 0.82276833, + "learning_rate": 2.915104825441114e-06, + "loss": 0.84320498, + "num_input_tokens_seen": 131581695, + "router_z_loss_clip": 3.10351562, + "router_z_loss_mlp": 0.35253906, + "step": 6125, + "time_per_iteration": 2.7553911209106445 + }, + { + "auxiliary_loss_clip": 0.01735141, + "auxiliary_loss_mlp": 0.00346759, + "balance_loss_clip": 1.41891909, + "balance_loss_mlp": 0.31085366, + "epoch": 0.36831504584398017, + "flos": 16946605221120.0, + "grad_norm": 8.875758677434353, + "language_loss": 0.86264956, + "learning_rate": 2.9147585055218686e-06, + "loss": 0.88346851, + "num_input_tokens_seen": 131599465, + "router_z_loss_clip": 3.1640625, + "router_z_loss_mlp": 0.35913086, + "step": 6126, + "time_per_iteration": 2.6563873291015625 + }, + { + "auxiliary_loss_clip": 0.01738052, + "auxiliary_loss_mlp": 0.00371324, + "balance_loss_clip": 1.41075718, + "balance_loss_mlp": 0.32929081, + "epoch": 0.36837516909664814, + "flos": 19865065259520.0, + "grad_norm": 12.522907600333244, + "language_loss": 0.73676234, + "learning_rate": 2.914412150914888e-06, + "loss": 0.75785613, + "num_input_tokens_seen": 131618330, + "router_z_loss_clip": 3.27539062, + "router_z_loss_mlp": 0.42016602, + "step": 6127, + "time_per_iteration": 2.6750481128692627 + }, + { + "auxiliary_loss_clip": 0.0172939, + "auxiliary_loss_mlp": 0.00278603, + "balance_loss_clip": 1.41553569, + "balance_loss_mlp": 0.24586849, + "epoch": 0.3684352923493161, + "flos": 37626984362880.0, + "grad_norm": 5.175152485821013, + "language_loss": 0.7703169, + "learning_rate": 2.9140657616333074e-06, + "loss": 0.79039681, + "num_input_tokens_seen": 131638960, + "router_z_loss_clip": 3.13867188, + "router_z_loss_mlp": 0.32739258, + "step": 6128, + "time_per_iteration": 2.8093948364257812 + }, + { + "auxiliary_loss_clip": 0.01761005, + "auxiliary_loss_mlp": 0.00301462, + "balance_loss_clip": 1.44244432, + "balance_loss_mlp": 0.26801193, + "epoch": 0.36849541560198407, + "flos": 14465501182080.0, + "grad_norm": 3.8763915596427903, + "language_loss": 0.837071, + "learning_rate": 2.9137193376902614e-06, + "loss": 0.85769564, + "num_input_tokens_seen": 131657440, + "router_z_loss_clip": 3.1875, + "router_z_loss_mlp": 0.33447266, + "step": 6129, + "time_per_iteration": 2.590507984161377 + }, + { + "auxiliary_loss_clip": 0.0174311, + "auxiliary_loss_mlp": 0.00304209, + "balance_loss_clip": 1.41975641, + "balance_loss_mlp": 0.26987666, + "epoch": 0.36855553885465203, + "flos": 25770753924480.0, + "grad_norm": 2.3938025259093307, + "language_loss": 0.90971732, + "learning_rate": 2.9133728790988868e-06, + "loss": 0.93019044, + "num_input_tokens_seen": 131678035, + "router_z_loss_clip": 3.23242188, + "router_z_loss_mlp": 0.34301758, + "step": 6130, + "time_per_iteration": 2.6983652114868164 + }, + { + "auxiliary_loss_clip": 0.01897998, + "auxiliary_loss_mlp": 0.001831, + "balance_loss_clip": 1.67588043, + "balance_loss_mlp": 0.17012998, + "epoch": 0.36861566210732, + "flos": 65049417377280.0, + "grad_norm": 0.8215704052041939, + "language_loss": 0.60114229, + "learning_rate": 2.913026385872321e-06, + "loss": 0.62195325, + "num_input_tokens_seen": 131742470, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.12988281, + "step": 6131, + "time_per_iteration": 3.2208354473114014 + }, + { + "auxiliary_loss_clip": 0.0175852, + "auxiliary_loss_mlp": 0.00310124, + "balance_loss_clip": 1.43795156, + "balance_loss_mlp": 0.27793741, + "epoch": 0.36867578535998796, + "flos": 30954495133440.0, + "grad_norm": 1.7503215313541118, + "language_loss": 0.80153954, + "learning_rate": 2.9126798580237034e-06, + "loss": 0.82222593, + "num_input_tokens_seen": 131764570, + "router_z_loss_clip": 3.20898438, + "router_z_loss_mlp": 0.32128906, + "step": 6132, + "time_per_iteration": 2.6898601055145264 + }, + { + "auxiliary_loss_clip": 0.0174936, + "auxiliary_loss_mlp": 0.00308445, + "balance_loss_clip": 1.41876554, + "balance_loss_mlp": 0.27373156, + "epoch": 0.3687359086126559, + "flos": 28837956182400.0, + "grad_norm": 2.5584278386217303, + "language_loss": 0.81444955, + "learning_rate": 2.9123332955661736e-06, + "loss": 0.83502758, + "num_input_tokens_seen": 131785720, + "router_z_loss_clip": 3.30273438, + "router_z_loss_mlp": 0.34692383, + "step": 6133, + "time_per_iteration": 2.6741058826446533 + }, + { + "auxiliary_loss_clip": 0.01762654, + "auxiliary_loss_mlp": 0.00351775, + "balance_loss_clip": 1.44205284, + "balance_loss_mlp": 0.31386608, + "epoch": 0.3687960318653239, + "flos": 21396798881280.0, + "grad_norm": 4.773291754773896, + "language_loss": 0.77258563, + "learning_rate": 2.911986698512874e-06, + "loss": 0.7937299, + "num_input_tokens_seen": 131804430, + "router_z_loss_clip": 3.20703125, + "router_z_loss_mlp": 0.37939453, + "step": 6134, + "time_per_iteration": 2.618229866027832 + }, + { + "auxiliary_loss_clip": 0.01756156, + "auxiliary_loss_mlp": 0.0033886, + "balance_loss_clip": 1.43598557, + "balance_loss_mlp": 0.30414668, + "epoch": 0.36885615511799186, + "flos": 20266043760000.0, + "grad_norm": 5.488995254832189, + "language_loss": 0.81492507, + "learning_rate": 2.9116400668769477e-06, + "loss": 0.83587521, + "num_input_tokens_seen": 131822060, + "router_z_loss_clip": 3.20117188, + "router_z_loss_mlp": 0.34667969, + "step": 6135, + "time_per_iteration": 2.638878583908081 + }, + { + "auxiliary_loss_clip": 0.01918003, + "auxiliary_loss_mlp": 0.00113538, + "balance_loss_clip": 1.68269551, + "balance_loss_mlp": 0.0999006, + "epoch": 0.3689162783706599, + "flos": 63088836301440.0, + "grad_norm": 0.8152603977945808, + "language_loss": 0.58329588, + "learning_rate": 2.9112934006715376e-06, + "loss": 0.60361129, + "num_input_tokens_seen": 131880715, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.13671875, + "step": 6136, + "time_per_iteration": 3.1016945838928223 + }, + { + "auxiliary_loss_clip": 0.0174602, + "auxiliary_loss_mlp": 0.00332967, + "balance_loss_clip": 1.42749131, + "balance_loss_mlp": 0.29770535, + "epoch": 0.36897640162332784, + "flos": 10961984419200.0, + "grad_norm": 3.5509601470041514, + "language_loss": 0.86275268, + "learning_rate": 2.9109466999097918e-06, + "loss": 0.88354254, + "num_input_tokens_seen": 131895850, + "router_z_loss_clip": 3.18554688, + "router_z_loss_mlp": 0.35253906, + "step": 6137, + "time_per_iteration": 2.6219449043273926 + }, + { + "auxiliary_loss_clip": 0.01774063, + "auxiliary_loss_mlp": 0.00298483, + "balance_loss_clip": 1.4493165, + "balance_loss_mlp": 0.26376879, + "epoch": 0.3690365248759958, + "flos": 20704297599360.0, + "grad_norm": 13.251590657794914, + "language_loss": 0.81466615, + "learning_rate": 2.9105999646048552e-06, + "loss": 0.83539158, + "num_input_tokens_seen": 131915775, + "router_z_loss_clip": 3.24609375, + "router_z_loss_mlp": 0.34741211, + "step": 6138, + "time_per_iteration": 4.08752179145813 + }, + { + "auxiliary_loss_clip": 0.01752022, + "auxiliary_loss_mlp": 0.00320311, + "balance_loss_clip": 1.42813492, + "balance_loss_mlp": 0.28605026, + "epoch": 0.3690966481286638, + "flos": 31826369957760.0, + "grad_norm": 27.626187106028116, + "language_loss": 0.74382615, + "learning_rate": 2.9102531947698764e-06, + "loss": 0.76454949, + "num_input_tokens_seen": 131935715, + "router_z_loss_clip": 3.2421875, + "router_z_loss_mlp": 0.34228516, + "step": 6139, + "time_per_iteration": 4.250002145767212 + }, + { + "auxiliary_loss_clip": 0.01772901, + "auxiliary_loss_mlp": 0.00304036, + "balance_loss_clip": 1.45500207, + "balance_loss_mlp": 0.27225491, + "epoch": 0.36915677138133174, + "flos": 13114936782720.0, + "grad_norm": 2.911845193258864, + "language_loss": 0.78942263, + "learning_rate": 2.909906390418006e-06, + "loss": 0.81019199, + "num_input_tokens_seen": 131954120, + "router_z_loss_clip": 3.1796875, + "router_z_loss_mlp": 0.31787109, + "step": 6140, + "time_per_iteration": 2.660627603530884 + }, + { + "auxiliary_loss_clip": 0.01933827, + "auxiliary_loss_mlp": 0.00130663, + "balance_loss_clip": 1.68699896, + "balance_loss_mlp": 0.11550003, + "epoch": 0.3692168946339997, + "flos": 68686879956480.0, + "grad_norm": 0.8013563662870526, + "language_loss": 0.58813548, + "learning_rate": 2.9095595515623934e-06, + "loss": 0.60878038, + "num_input_tokens_seen": 132017485, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.15136719, + "step": 6141, + "time_per_iteration": 3.247819423675537 + }, + { + "auxiliary_loss_clip": 0.01764191, + "auxiliary_loss_mlp": 0.0032418, + "balance_loss_clip": 1.44484687, + "balance_loss_mlp": 0.28867972, + "epoch": 0.36927701788666767, + "flos": 22017873968640.0, + "grad_norm": 6.339878994636131, + "language_loss": 0.81235325, + "learning_rate": 2.909212678216192e-06, + "loss": 0.83323693, + "num_input_tokens_seen": 132036760, + "router_z_loss_clip": 3.19140625, + "router_z_loss_mlp": 0.35498047, + "step": 6142, + "time_per_iteration": 2.6446585655212402 + }, + { + "auxiliary_loss_clip": 0.01751588, + "auxiliary_loss_mlp": 0.00301659, + "balance_loss_clip": 1.43795538, + "balance_loss_mlp": 0.26878121, + "epoch": 0.36933714113933563, + "flos": 21835591424640.0, + "grad_norm": 17.251072900497764, + "language_loss": 0.82707155, + "learning_rate": 2.908865770392555e-06, + "loss": 0.84760404, + "num_input_tokens_seen": 132056935, + "router_z_loss_clip": 3.13476562, + "router_z_loss_mlp": 0.32885742, + "step": 6143, + "time_per_iteration": 2.6618659496307373 + }, + { + "auxiliary_loss_clip": 0.01749336, + "auxiliary_loss_mlp": 0.00290607, + "balance_loss_clip": 1.43761051, + "balance_loss_mlp": 0.25672776, + "epoch": 0.3693972643920036, + "flos": 23691705793920.0, + "grad_norm": 4.385277260438461, + "language_loss": 0.88941097, + "learning_rate": 2.9085188281046364e-06, + "loss": 0.90981036, + "num_input_tokens_seen": 132077285, + "router_z_loss_clip": 3.11914062, + "router_z_loss_mlp": 0.33837891, + "step": 6144, + "time_per_iteration": 4.066816329956055 + }, + { + "auxiliary_loss_clip": 0.01752516, + "auxiliary_loss_mlp": 0.00304028, + "balance_loss_clip": 1.43957043, + "balance_loss_mlp": 0.27241322, + "epoch": 0.36945738764467156, + "flos": 22856747172480.0, + "grad_norm": 8.011343558712973, + "language_loss": 0.87254429, + "learning_rate": 2.908171851365593e-06, + "loss": 0.89310968, + "num_input_tokens_seen": 132095520, + "router_z_loss_clip": 3.1328125, + "router_z_loss_mlp": 0.31665039, + "step": 6145, + "time_per_iteration": 2.689098834991455 + }, + { + "auxiliary_loss_clip": 0.01747047, + "auxiliary_loss_mlp": 0.00308123, + "balance_loss_clip": 1.43371511, + "balance_loss_mlp": 0.27431512, + "epoch": 0.36951751089733953, + "flos": 16615939593600.0, + "grad_norm": 3.2463909344333253, + "language_loss": 0.85730052, + "learning_rate": 2.9078248401885815e-06, + "loss": 0.8778522, + "num_input_tokens_seen": 132112810, + "router_z_loss_clip": 3.13671875, + "router_z_loss_mlp": 0.33764648, + "step": 6146, + "time_per_iteration": 2.625507354736328 + }, + { + "auxiliary_loss_clip": 0.01751913, + "auxiliary_loss_mlp": 0.00308492, + "balance_loss_clip": 1.43687212, + "balance_loss_mlp": 0.27554274, + "epoch": 0.3695776341500075, + "flos": 18914545607040.0, + "grad_norm": 46.67677066868991, + "language_loss": 0.90241617, + "learning_rate": 2.907477794586761e-06, + "loss": 0.92302024, + "num_input_tokens_seen": 132131615, + "router_z_loss_clip": 3.15039062, + "router_z_loss_mlp": 0.3293457, + "step": 6147, + "time_per_iteration": 2.6742210388183594 + }, + { + "auxiliary_loss_clip": 0.01757006, + "auxiliary_loss_mlp": 0.00334457, + "balance_loss_clip": 1.43992543, + "balance_loss_mlp": 0.30129313, + "epoch": 0.36963775740267546, + "flos": 20808474019200.0, + "grad_norm": 23.12108597144141, + "language_loss": 0.90302539, + "learning_rate": 2.9071307145732926e-06, + "loss": 0.92394, + "num_input_tokens_seen": 132149585, + "router_z_loss_clip": 3.16992188, + "router_z_loss_mlp": 0.33154297, + "step": 6148, + "time_per_iteration": 2.756828784942627 + }, + { + "auxiliary_loss_clip": 0.01751927, + "auxiliary_loss_mlp": 0.00289513, + "balance_loss_clip": 1.44641066, + "balance_loss_mlp": 0.25651574, + "epoch": 0.3696978806553435, + "flos": 26061881656320.0, + "grad_norm": 27.66448168235438, + "language_loss": 0.81981838, + "learning_rate": 2.9067836001613357e-06, + "loss": 0.84023273, + "num_input_tokens_seen": 132165555, + "router_z_loss_clip": 3.05664062, + "router_z_loss_mlp": 0.32983398, + "step": 6149, + "time_per_iteration": 2.7588748931884766 + }, + { + "auxiliary_loss_clip": 0.01744027, + "auxiliary_loss_mlp": 0.00315852, + "balance_loss_clip": 1.43001151, + "balance_loss_mlp": 0.27997014, + "epoch": 0.36975800390801145, + "flos": 26833925606400.0, + "grad_norm": 55.72440744971489, + "language_loss": 0.78007501, + "learning_rate": 2.906436451364054e-06, + "loss": 0.80067384, + "num_input_tokens_seen": 132185100, + "router_z_loss_clip": 3.14257812, + "router_z_loss_mlp": 0.35888672, + "step": 6150, + "time_per_iteration": 4.051138639450073 + }, + { + "auxiliary_loss_clip": 0.01764409, + "auxiliary_loss_mlp": 0.003234, + "balance_loss_clip": 1.44772387, + "balance_loss_mlp": 0.29276353, + "epoch": 0.3698181271606794, + "flos": 21142623265920.0, + "grad_norm": 18.588103554450182, + "language_loss": 0.88866532, + "learning_rate": 2.906089268194611e-06, + "loss": 0.9095434, + "num_input_tokens_seen": 132203930, + "router_z_loss_clip": 3.16796875, + "router_z_loss_mlp": 0.30639648, + "step": 6151, + "time_per_iteration": 2.639847993850708 + }, + { + "auxiliary_loss_clip": 0.01879684, + "auxiliary_loss_mlp": 0.00258936, + "balance_loss_clip": 1.63921881, + "balance_loss_mlp": 0.24758756, + "epoch": 0.3698782504133474, + "flos": 66742639568640.0, + "grad_norm": 0.8913916021970856, + "language_loss": 0.63081217, + "learning_rate": 2.9057420506661726e-06, + "loss": 0.65219837, + "num_input_tokens_seen": 132263845, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.11328125, + "step": 6152, + "time_per_iteration": 3.2172608375549316 + }, + { + "auxiliary_loss_clip": 0.01789433, + "auxiliary_loss_mlp": 0.00310788, + "balance_loss_clip": 1.48045397, + "balance_loss_mlp": 0.28122383, + "epoch": 0.36993837366601534, + "flos": 24311523905280.0, + "grad_norm": 13.27810593455708, + "language_loss": 0.76209956, + "learning_rate": 2.9053947987919044e-06, + "loss": 0.7831018, + "num_input_tokens_seen": 132282350, + "router_z_loss_clip": 3.08984375, + "router_z_loss_mlp": 0.29589844, + "step": 6153, + "time_per_iteration": 2.6226654052734375 + }, + { + "auxiliary_loss_clip": 0.01752663, + "auxiliary_loss_mlp": 0.00328189, + "balance_loss_clip": 1.43782234, + "balance_loss_mlp": 0.29454845, + "epoch": 0.3699984969186833, + "flos": 24349194293760.0, + "grad_norm": 19.958438161126388, + "language_loss": 0.79401559, + "learning_rate": 2.9050475125849755e-06, + "loss": 0.8148241, + "num_input_tokens_seen": 132301930, + "router_z_loss_clip": 3.14648438, + "router_z_loss_mlp": 0.33666992, + "step": 6154, + "time_per_iteration": 2.6286425590515137 + }, + { + "auxiliary_loss_clip": 0.01769702, + "auxiliary_loss_mlp": 0.00325652, + "balance_loss_clip": 1.45036948, + "balance_loss_mlp": 0.29274982, + "epoch": 0.37005862017135127, + "flos": 19829154637440.0, + "grad_norm": 2.3716957928688833, + "language_loss": 0.75364166, + "learning_rate": 2.9047001920585534e-06, + "loss": 0.7745952, + "num_input_tokens_seen": 132320915, + "router_z_loss_clip": 3.19140625, + "router_z_loss_mlp": 0.32885742, + "step": 6155, + "time_per_iteration": 2.6308815479278564 + }, + { + "auxiliary_loss_clip": 0.01754817, + "auxiliary_loss_mlp": 0.00328395, + "balance_loss_clip": 1.44308472, + "balance_loss_mlp": 0.29749557, + "epoch": 0.37011874342401924, + "flos": 19573793873280.0, + "grad_norm": 60.18763246504998, + "language_loss": 0.75456631, + "learning_rate": 2.9043528372258097e-06, + "loss": 0.77539843, + "num_input_tokens_seen": 132340415, + "router_z_loss_clip": 3.11328125, + "router_z_loss_mlp": 0.30908203, + "step": 6156, + "time_per_iteration": 2.6738739013671875 + }, + { + "auxiliary_loss_clip": 0.0175877, + "auxiliary_loss_mlp": 0.00318529, + "balance_loss_clip": 1.45406413, + "balance_loss_mlp": 0.2875106, + "epoch": 0.3701788666766872, + "flos": 20374350243840.0, + "grad_norm": 4.303218289626294, + "language_loss": 0.8657515, + "learning_rate": 2.904005448099916e-06, + "loss": 0.88652444, + "num_input_tokens_seen": 132358600, + "router_z_loss_clip": 3.04492188, + "router_z_loss_mlp": 0.31018066, + "step": 6157, + "time_per_iteration": 2.6034724712371826 + }, + { + "auxiliary_loss_clip": 0.01763096, + "auxiliary_loss_mlp": 0.00380435, + "balance_loss_clip": 1.43685758, + "balance_loss_mlp": 0.34452897, + "epoch": 0.37023898992935517, + "flos": 15340931452800.0, + "grad_norm": 14.637838314534807, + "language_loss": 0.87401319, + "learning_rate": 2.9036580246940444e-06, + "loss": 0.89544845, + "num_input_tokens_seen": 132373160, + "router_z_loss_clip": 3.26367188, + "router_z_loss_mlp": 0.35913086, + "step": 6158, + "time_per_iteration": 2.6095242500305176 + }, + { + "auxiliary_loss_clip": 0.01740722, + "auxiliary_loss_mlp": 0.00357847, + "balance_loss_clip": 1.42744637, + "balance_loss_mlp": 0.3245998, + "epoch": 0.37029911318202313, + "flos": 19573937527680.0, + "grad_norm": 22.97231569900213, + "language_loss": 0.78225696, + "learning_rate": 2.9033105670213708e-06, + "loss": 0.80324268, + "num_input_tokens_seen": 132392345, + "router_z_loss_clip": 3.1328125, + "router_z_loss_mlp": 0.33239746, + "step": 6159, + "time_per_iteration": 2.640918254852295 + }, + { + "auxiliary_loss_clip": 0.01746209, + "auxiliary_loss_mlp": 0.00341356, + "balance_loss_clip": 1.43796206, + "balance_loss_mlp": 0.30988428, + "epoch": 0.3703592364346911, + "flos": 26213353309440.0, + "grad_norm": 162.18991399203838, + "language_loss": 0.78623044, + "learning_rate": 2.9029630750950697e-06, + "loss": 0.80710614, + "num_input_tokens_seen": 132412620, + "router_z_loss_clip": 3.07617188, + "router_z_loss_mlp": 0.31469727, + "step": 6160, + "time_per_iteration": 2.7621724605560303 + }, + { + "auxiliary_loss_clip": 0.01745502, + "auxiliary_loss_mlp": 0.00350425, + "balance_loss_clip": 1.43910503, + "balance_loss_mlp": 0.32031286, + "epoch": 0.37041935968735906, + "flos": 20048317470720.0, + "grad_norm": 16.32692762629393, + "language_loss": 0.84679055, + "learning_rate": 2.9026155489283176e-06, + "loss": 0.86774981, + "num_input_tokens_seen": 132431570, + "router_z_loss_clip": 3.06640625, + "router_z_loss_mlp": 0.30126953, + "step": 6161, + "time_per_iteration": 2.6703407764434814 + }, + { + "auxiliary_loss_clip": 0.01762708, + "auxiliary_loss_mlp": 0.00367626, + "balance_loss_clip": 1.44721127, + "balance_loss_mlp": 0.33365148, + "epoch": 0.3704794829400271, + "flos": 24133802388480.0, + "grad_norm": 12.98149532345473, + "language_loss": 0.84822619, + "learning_rate": 2.902267988534295e-06, + "loss": 0.8695296, + "num_input_tokens_seen": 132451525, + "router_z_loss_clip": 3.15429688, + "router_z_loss_mlp": 0.33984375, + "step": 6162, + "time_per_iteration": 2.693572759628296 + }, + { + "auxiliary_loss_clip": 0.01747583, + "auxiliary_loss_mlp": 0.00379241, + "balance_loss_clip": 1.43327248, + "balance_loss_mlp": 0.34662554, + "epoch": 0.37053960619269505, + "flos": 14866874732160.0, + "grad_norm": 9.534158556911878, + "language_loss": 0.87700605, + "learning_rate": 2.9019203939261783e-06, + "loss": 0.8982743, + "num_input_tokens_seen": 132469875, + "router_z_loss_clip": 3.14453125, + "router_z_loss_mlp": 0.32629395, + "step": 6163, + "time_per_iteration": 2.6321518421173096 + }, + { + "auxiliary_loss_clip": 0.01772637, + "auxiliary_loss_mlp": 0.00388557, + "balance_loss_clip": 1.45221591, + "balance_loss_mlp": 0.35594136, + "epoch": 0.370599729445363, + "flos": 21361498790400.0, + "grad_norm": 7.75312584962961, + "language_loss": 0.75406301, + "learning_rate": 2.9015727651171507e-06, + "loss": 0.77567494, + "num_input_tokens_seen": 132488360, + "router_z_loss_clip": 3.20117188, + "router_z_loss_mlp": 0.32666016, + "step": 6164, + "time_per_iteration": 2.65812611579895 + }, + { + "auxiliary_loss_clip": 0.01750714, + "auxiliary_loss_mlp": 0.00389768, + "balance_loss_clip": 1.435709, + "balance_loss_mlp": 0.35464942, + "epoch": 0.370659852698031, + "flos": 26829041356800.0, + "grad_norm": 90.83827626853744, + "language_loss": 0.90762895, + "learning_rate": 2.9012251021203935e-06, + "loss": 0.92903382, + "num_input_tokens_seen": 132508630, + "router_z_loss_clip": 3.14453125, + "router_z_loss_mlp": 0.35107422, + "step": 6165, + "time_per_iteration": 2.818296432495117 + }, + { + "auxiliary_loss_clip": 0.01761991, + "auxiliary_loss_mlp": 0.00393814, + "balance_loss_clip": 1.43625593, + "balance_loss_mlp": 0.35776511, + "epoch": 0.37071997595069894, + "flos": 19099018880640.0, + "grad_norm": 76.33734557143282, + "language_loss": 0.77548897, + "learning_rate": 2.9008774049490896e-06, + "loss": 0.79704702, + "num_input_tokens_seen": 132527465, + "router_z_loss_clip": 3.25585938, + "router_z_loss_mlp": 0.3605957, + "step": 6166, + "time_per_iteration": 2.840646982192993 + }, + { + "auxiliary_loss_clip": 0.01790764, + "auxiliary_loss_mlp": 0.00182561, + "balance_loss_clip": 1.558743, + "balance_loss_mlp": 0.17397824, + "epoch": 0.3707800992033669, + "flos": 52178384920320.0, + "grad_norm": 0.832023837952891, + "language_loss": 0.55362785, + "learning_rate": 2.9005296736164244e-06, + "loss": 0.5733611, + "num_input_tokens_seen": 132579940, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.0859375, + "step": 6167, + "time_per_iteration": 3.0273876190185547 + }, + { + "auxiliary_loss_clip": 0.01767227, + "auxiliary_loss_mlp": 0.0045365, + "balance_loss_clip": 1.44164014, + "balance_loss_mlp": 0.41524038, + "epoch": 0.3708402224560349, + "flos": 19901837808000.0, + "grad_norm": 9.85019736137254, + "language_loss": 0.8303836, + "learning_rate": 2.900181908135584e-06, + "loss": 0.85259235, + "num_input_tokens_seen": 132598390, + "router_z_loss_clip": 3.25585938, + "router_z_loss_mlp": 0.38452148, + "step": 6168, + "time_per_iteration": 2.821683883666992 + }, + { + "auxiliary_loss_clip": 0.01775421, + "auxiliary_loss_mlp": 0.00361162, + "balance_loss_clip": 1.44361448, + "balance_loss_mlp": 0.32764056, + "epoch": 0.37090034570870284, + "flos": 20007630339840.0, + "grad_norm": 328.82768878188125, + "language_loss": 0.80381024, + "learning_rate": 2.899834108519755e-06, + "loss": 0.82517606, + "num_input_tokens_seen": 132616920, + "router_z_loss_clip": 3.31835938, + "router_z_loss_mlp": 0.33496094, + "step": 6169, + "time_per_iteration": 2.8078439235687256 + }, + { + "auxiliary_loss_clip": 0.0178242, + "auxiliary_loss_mlp": 0.00374225, + "balance_loss_clip": 1.44995713, + "balance_loss_mlp": 0.33870089, + "epoch": 0.3709604689613708, + "flos": 24134700228480.0, + "grad_norm": 37.54496928817693, + "language_loss": 0.85182667, + "learning_rate": 2.899486274782127e-06, + "loss": 0.87339312, + "num_input_tokens_seen": 132637660, + "router_z_loss_clip": 3.3203125, + "router_z_loss_mlp": 0.35522461, + "step": 6170, + "time_per_iteration": 2.681673288345337 + }, + { + "auxiliary_loss_clip": 0.01772013, + "auxiliary_loss_mlp": 0.00383421, + "balance_loss_clip": 1.44255781, + "balance_loss_mlp": 0.34799248, + "epoch": 0.37102059221403877, + "flos": 23876071326720.0, + "grad_norm": 10.036151389541871, + "language_loss": 0.82856262, + "learning_rate": 2.8991384069358885e-06, + "loss": 0.85011697, + "num_input_tokens_seen": 132657635, + "router_z_loss_clip": 3.29296875, + "router_z_loss_mlp": 0.35424805, + "step": 6171, + "time_per_iteration": 2.684309959411621 + }, + { + "auxiliary_loss_clip": 0.017816, + "auxiliary_loss_mlp": 0.00374201, + "balance_loss_clip": 1.44709361, + "balance_loss_mlp": 0.33653051, + "epoch": 0.37108071546670673, + "flos": 14501268149760.0, + "grad_norm": 2.831526408845845, + "language_loss": 0.87951833, + "learning_rate": 2.898790504994232e-06, + "loss": 0.90107632, + "num_input_tokens_seen": 132674455, + "router_z_loss_clip": 3.34765625, + "router_z_loss_mlp": 0.37695312, + "step": 6172, + "time_per_iteration": 2.61558198928833 + }, + { + "auxiliary_loss_clip": 0.01763208, + "auxiliary_loss_mlp": 0.00385546, + "balance_loss_clip": 1.42823839, + "balance_loss_mlp": 0.34673142, + "epoch": 0.3711408387193747, + "flos": 34562619279360.0, + "grad_norm": 11.103513257936944, + "language_loss": 0.66689932, + "learning_rate": 2.89844256897035e-06, + "loss": 0.68838686, + "num_input_tokens_seen": 132695140, + "router_z_loss_clip": 3.34960938, + "router_z_loss_mlp": 0.38842773, + "step": 6173, + "time_per_iteration": 2.7604682445526123 + }, + { + "auxiliary_loss_clip": 0.01761967, + "auxiliary_loss_mlp": 0.00352768, + "balance_loss_clip": 1.42835116, + "balance_loss_mlp": 0.31652817, + "epoch": 0.37120096197204266, + "flos": 17310703432320.0, + "grad_norm": 4.262962883782642, + "language_loss": 0.87597746, + "learning_rate": 2.898094598877435e-06, + "loss": 0.89712477, + "num_input_tokens_seen": 132712470, + "router_z_loss_clip": 3.33984375, + "router_z_loss_mlp": 0.36254883, + "step": 6174, + "time_per_iteration": 2.6347172260284424 + }, + { + "auxiliary_loss_clip": 0.01784005, + "auxiliary_loss_mlp": 0.00373251, + "balance_loss_clip": 1.44893241, + "balance_loss_mlp": 0.33779779, + "epoch": 0.37126108522471063, + "flos": 30664049760000.0, + "grad_norm": 5.770911542686874, + "language_loss": 0.87380517, + "learning_rate": 2.8977465947286826e-06, + "loss": 0.89537764, + "num_input_tokens_seen": 132732945, + "router_z_loss_clip": 3.34960938, + "router_z_loss_mlp": 0.35449219, + "step": 6175, + "time_per_iteration": 2.7404541969299316 + }, + { + "auxiliary_loss_clip": 0.017674, + "auxiliary_loss_mlp": 0.003715, + "balance_loss_clip": 1.43736863, + "balance_loss_mlp": 0.33418775, + "epoch": 0.37132120847737865, + "flos": 25155640494720.0, + "grad_norm": 57.764205605973174, + "language_loss": 0.94989121, + "learning_rate": 2.89739855653729e-06, + "loss": 0.97128022, + "num_input_tokens_seen": 132752470, + "router_z_loss_clip": 3.296875, + "router_z_loss_mlp": 0.37304688, + "step": 6176, + "time_per_iteration": 2.6623072624206543 + }, + { + "auxiliary_loss_clip": 0.01758686, + "auxiliary_loss_mlp": 0.00370177, + "balance_loss_clip": 1.42801249, + "balance_loss_mlp": 0.3329834, + "epoch": 0.3713813317300466, + "flos": 21213474842880.0, + "grad_norm": 7.956622477997687, + "language_loss": 0.79159445, + "learning_rate": 2.8970504843164546e-06, + "loss": 0.81288302, + "num_input_tokens_seen": 132771485, + "router_z_loss_clip": 3.30273438, + "router_z_loss_mlp": 0.37158203, + "step": 6177, + "time_per_iteration": 2.640137195587158 + }, + { + "auxiliary_loss_clip": 0.01752415, + "auxiliary_loss_mlp": 0.00345109, + "balance_loss_clip": 1.41648507, + "balance_loss_mlp": 0.30937022, + "epoch": 0.3714414549827146, + "flos": 21616644072960.0, + "grad_norm": 3.8615859378096955, + "language_loss": 0.83767933, + "learning_rate": 2.896702378079374e-06, + "loss": 0.85865462, + "num_input_tokens_seen": 132791465, + "router_z_loss_clip": 3.359375, + "router_z_loss_mlp": 0.35717773, + "step": 6178, + "time_per_iteration": 2.614762306213379 + }, + { + "auxiliary_loss_clip": 0.01765294, + "auxiliary_loss_mlp": 0.00342274, + "balance_loss_clip": 1.42872238, + "balance_loss_mlp": 0.30653492, + "epoch": 0.37150157823538255, + "flos": 19972294335360.0, + "grad_norm": 6.028753697064996, + "language_loss": 0.78684485, + "learning_rate": 2.8963542378392502e-06, + "loss": 0.80792046, + "num_input_tokens_seen": 132810160, + "router_z_loss_clip": 3.36523438, + "router_z_loss_mlp": 0.35742188, + "step": 6179, + "time_per_iteration": 2.655958414077759 + }, + { + "auxiliary_loss_clip": 0.01738338, + "auxiliary_loss_mlp": 0.00344814, + "balance_loss_clip": 1.40612221, + "balance_loss_mlp": 0.30843115, + "epoch": 0.3715617014880505, + "flos": 24860562266880.0, + "grad_norm": 11.325527399744889, + "language_loss": 0.76684862, + "learning_rate": 2.896006063609283e-06, + "loss": 0.78768015, + "num_input_tokens_seen": 132831265, + "router_z_loss_clip": 3.32617188, + "router_z_loss_mlp": 0.36376953, + "step": 6180, + "time_per_iteration": 2.706688404083252 + }, + { + "auxiliary_loss_clip": 0.01771479, + "auxiliary_loss_mlp": 0.0033329, + "balance_loss_clip": 1.43146563, + "balance_loss_mlp": 0.29795685, + "epoch": 0.3716218247407185, + "flos": 20449080489600.0, + "grad_norm": 8.397526022519594, + "language_loss": 0.82797611, + "learning_rate": 2.8956578554026767e-06, + "loss": 0.84902376, + "num_input_tokens_seen": 132850005, + "router_z_loss_clip": 3.39648438, + "router_z_loss_mlp": 0.35351562, + "step": 6181, + "time_per_iteration": 4.1642632484436035 + }, + { + "auxiliary_loss_clip": 0.01750873, + "auxiliary_loss_mlp": 0.00340412, + "balance_loss_clip": 1.41550696, + "balance_loss_mlp": 0.3030276, + "epoch": 0.37168194799338644, + "flos": 24133479166080.0, + "grad_norm": 3.895181375171825, + "language_loss": 0.85404336, + "learning_rate": 2.8953096132326343e-06, + "loss": 0.87495625, + "num_input_tokens_seen": 132865790, + "router_z_loss_clip": 3.3515625, + "router_z_loss_mlp": 0.3737793, + "step": 6182, + "time_per_iteration": 4.10425329208374 + }, + { + "auxiliary_loss_clip": 0.01631747, + "auxiliary_loss_mlp": 0.00101038, + "balance_loss_clip": 1.43830562, + "balance_loss_mlp": 0.09307515, + "epoch": 0.3717420712460544, + "flos": 67408926900480.0, + "grad_norm": 0.7949177884181257, + "language_loss": 0.57155257, + "learning_rate": 2.894961337112362e-06, + "loss": 0.58888042, + "num_input_tokens_seen": 132921775, + "router_z_loss_clip": 1.9375, + "router_z_loss_mlp": 0.07958984, + "step": 6183, + "time_per_iteration": 3.138582468032837 + }, + { + "auxiliary_loss_clip": 0.01751125, + "auxiliary_loss_mlp": 0.00373311, + "balance_loss_clip": 1.40011287, + "balance_loss_mlp": 0.33306611, + "epoch": 0.37180219449872237, + "flos": 22376908362240.0, + "grad_norm": 37.025864110619146, + "language_loss": 0.83227932, + "learning_rate": 2.894613027055066e-06, + "loss": 0.85352373, + "num_input_tokens_seen": 132941060, + "router_z_loss_clip": 3.5078125, + "router_z_loss_mlp": 0.40234375, + "step": 6184, + "time_per_iteration": 2.6824100017547607 + }, + { + "auxiliary_loss_clip": 0.01756492, + "auxiliary_loss_mlp": 0.00332097, + "balance_loss_clip": 1.41195631, + "balance_loss_mlp": 0.2960957, + "epoch": 0.37186231775139034, + "flos": 21869885934720.0, + "grad_norm": 165.15424316820702, + "language_loss": 0.7998184, + "learning_rate": 2.894264683073954e-06, + "loss": 0.82070428, + "num_input_tokens_seen": 132961850, + "router_z_loss_clip": 3.4453125, + "router_z_loss_mlp": 0.35986328, + "step": 6185, + "time_per_iteration": 2.731783866882324 + }, + { + "auxiliary_loss_clip": 0.01772659, + "auxiliary_loss_mlp": 0.00336292, + "balance_loss_clip": 1.42674398, + "balance_loss_mlp": 0.30040956, + "epoch": 0.3719224410040583, + "flos": 22415225195520.0, + "grad_norm": 6.4054323583099215, + "language_loss": 0.8334865, + "learning_rate": 2.8939163051822363e-06, + "loss": 0.85457599, + "num_input_tokens_seen": 132981625, + "router_z_loss_clip": 3.45898438, + "router_z_loss_mlp": 0.35888672, + "step": 6186, + "time_per_iteration": 4.179441928863525 + }, + { + "auxiliary_loss_clip": 0.01756799, + "auxiliary_loss_mlp": 0.0036431, + "balance_loss_clip": 1.40731311, + "balance_loss_mlp": 0.3264249, + "epoch": 0.37198256425672627, + "flos": 25151223121920.0, + "grad_norm": 103.57731824628544, + "language_loss": 0.91585267, + "learning_rate": 2.8935678933931224e-06, + "loss": 0.93706375, + "num_input_tokens_seen": 133001225, + "router_z_loss_clip": 3.49023438, + "router_z_loss_mlp": 0.37841797, + "step": 6187, + "time_per_iteration": 2.707345962524414 + }, + { + "auxiliary_loss_clip": 0.01740387, + "auxiliary_loss_mlp": 0.00319625, + "balance_loss_clip": 1.40747344, + "balance_loss_mlp": 0.28333762, + "epoch": 0.37204268750939423, + "flos": 21138313633920.0, + "grad_norm": 3.5221536598295806, + "language_loss": 0.89998674, + "learning_rate": 2.893219447719824e-06, + "loss": 0.92058682, + "num_input_tokens_seen": 133018820, + "router_z_loss_clip": 3.33007812, + "router_z_loss_mlp": 0.36279297, + "step": 6188, + "time_per_iteration": 2.812798500061035 + }, + { + "auxiliary_loss_clip": 0.01728977, + "auxiliary_loss_mlp": 0.0034052, + "balance_loss_clip": 1.39351559, + "balance_loss_mlp": 0.30339789, + "epoch": 0.37210281076206225, + "flos": 21506829217920.0, + "grad_norm": 6.470280606006512, + "language_loss": 0.72186929, + "learning_rate": 2.8928709681755548e-06, + "loss": 0.7425642, + "num_input_tokens_seen": 133040205, + "router_z_loss_clip": 3.35742188, + "router_z_loss_mlp": 0.37158203, + "step": 6189, + "time_per_iteration": 2.701035976409912 + }, + { + "auxiliary_loss_clip": 0.01751875, + "auxiliary_loss_mlp": 0.00343385, + "balance_loss_clip": 1.41285872, + "balance_loss_mlp": 0.30781281, + "epoch": 0.3721629340147302, + "flos": 17347835116800.0, + "grad_norm": 5.918363596605893, + "language_loss": 0.91386658, + "learning_rate": 2.8925224547735293e-06, + "loss": 0.93481922, + "num_input_tokens_seen": 133058095, + "router_z_loss_clip": 3.38867188, + "router_z_loss_mlp": 0.35595703, + "step": 6190, + "time_per_iteration": 2.628676652908325 + }, + { + "auxiliary_loss_clip": 0.0175073, + "auxiliary_loss_mlp": 0.00363262, + "balance_loss_clip": 1.40010583, + "balance_loss_mlp": 0.32344574, + "epoch": 0.3722230572673982, + "flos": 16432400073600.0, + "grad_norm": 14.869174185778954, + "language_loss": 0.99507302, + "learning_rate": 2.8921739075269633e-06, + "loss": 1.01621282, + "num_input_tokens_seen": 133071530, + "router_z_loss_clip": 3.50585938, + "router_z_loss_mlp": 0.39819336, + "step": 6191, + "time_per_iteration": 2.6085667610168457 + }, + { + "auxiliary_loss_clip": 0.01732405, + "auxiliary_loss_mlp": 0.00360975, + "balance_loss_clip": 1.38886809, + "balance_loss_mlp": 0.32027724, + "epoch": 0.37228318052006615, + "flos": 22674716023680.0, + "grad_norm": 28.059415817560293, + "language_loss": 0.8035543, + "learning_rate": 2.891825326449073e-06, + "loss": 0.82448804, + "num_input_tokens_seen": 133091410, + "router_z_loss_clip": 3.4296875, + "router_z_loss_mlp": 0.40698242, + "step": 6192, + "time_per_iteration": 2.8310558795928955 + }, + { + "auxiliary_loss_clip": 0.01735686, + "auxiliary_loss_mlp": 0.00386705, + "balance_loss_clip": 1.39667344, + "balance_loss_mlp": 0.34798574, + "epoch": 0.3723433037727341, + "flos": 25265491263360.0, + "grad_norm": 60.21299109620226, + "language_loss": 0.89144921, + "learning_rate": 2.8914767115530766e-06, + "loss": 0.91267312, + "num_input_tokens_seen": 133110365, + "router_z_loss_clip": 3.390625, + "router_z_loss_mlp": 0.38720703, + "step": 6193, + "time_per_iteration": 4.088541269302368 + }, + { + "auxiliary_loss_clip": 0.01726472, + "auxiliary_loss_mlp": 0.00394257, + "balance_loss_clip": 1.38630021, + "balance_loss_mlp": 0.35436979, + "epoch": 0.3724034270254021, + "flos": 10524664333440.0, + "grad_norm": 26.782036437731833, + "language_loss": 0.93332297, + "learning_rate": 2.891128062852194e-06, + "loss": 0.9545303, + "num_input_tokens_seen": 133128255, + "router_z_loss_clip": 3.40429688, + "router_z_loss_mlp": 0.39868164, + "step": 6194, + "time_per_iteration": 2.624647378921509 + }, + { + "auxiliary_loss_clip": 0.01748852, + "auxiliary_loss_mlp": 0.00348914, + "balance_loss_clip": 1.40744138, + "balance_loss_mlp": 0.31310368, + "epoch": 0.37246355027807004, + "flos": 20266223328000.0, + "grad_norm": 23.296579631366935, + "language_loss": 0.86816168, + "learning_rate": 2.890779380359646e-06, + "loss": 0.88913941, + "num_input_tokens_seen": 133143975, + "router_z_loss_clip": 3.41210938, + "router_z_loss_mlp": 0.35766602, + "step": 6195, + "time_per_iteration": 2.640249013900757 + }, + { + "auxiliary_loss_clip": 0.01726316, + "auxiliary_loss_mlp": 0.00361546, + "balance_loss_clip": 1.39563477, + "balance_loss_mlp": 0.32490095, + "epoch": 0.372523673530738, + "flos": 19500571998720.0, + "grad_norm": 4.443975525287679, + "language_loss": 0.84632063, + "learning_rate": 2.890430664088655e-06, + "loss": 0.8671993, + "num_input_tokens_seen": 133162935, + "router_z_loss_clip": 3.30859375, + "router_z_loss_mlp": 0.36645508, + "step": 6196, + "time_per_iteration": 2.638850688934326 + }, + { + "auxiliary_loss_clip": 0.01741795, + "auxiliary_loss_mlp": 0.0034721, + "balance_loss_clip": 1.40622759, + "balance_loss_mlp": 0.31163746, + "epoch": 0.372583796783406, + "flos": 16764250849920.0, + "grad_norm": 169.95378587711426, + "language_loss": 0.92950213, + "learning_rate": 2.890081914052443e-06, + "loss": 0.95039213, + "num_input_tokens_seen": 133181180, + "router_z_loss_clip": 3.35546875, + "router_z_loss_mlp": 0.35546875, + "step": 6197, + "time_per_iteration": 2.701787233352661 + }, + { + "auxiliary_loss_clip": 0.01727405, + "auxiliary_loss_mlp": 0.00357686, + "balance_loss_clip": 1.39933681, + "balance_loss_mlp": 0.32077876, + "epoch": 0.37264392003607394, + "flos": 22637979388800.0, + "grad_norm": 7.148352252324941, + "language_loss": 0.72934955, + "learning_rate": 2.889733130264237e-06, + "loss": 0.75020045, + "num_input_tokens_seen": 133199615, + "router_z_loss_clip": 3.27734375, + "router_z_loss_mlp": 0.36889648, + "step": 6198, + "time_per_iteration": 2.7098886966705322 + }, + { + "auxiliary_loss_clip": 0.01726346, + "auxiliary_loss_mlp": 0.00310373, + "balance_loss_clip": 1.39754152, + "balance_loss_mlp": 0.27444309, + "epoch": 0.3727040432887419, + "flos": 19973120348160.0, + "grad_norm": 844.2562329934984, + "language_loss": 0.80675435, + "learning_rate": 2.889384312737261e-06, + "loss": 0.8271215, + "num_input_tokens_seen": 133219650, + "router_z_loss_clip": 3.28710938, + "router_z_loss_mlp": 0.35913086, + "step": 6199, + "time_per_iteration": 2.7864387035369873 + }, + { + "auxiliary_loss_clip": 0.01751193, + "auxiliary_loss_mlp": 0.00341318, + "balance_loss_clip": 1.41590834, + "balance_loss_mlp": 0.30557936, + "epoch": 0.37276416654140987, + "flos": 63899122279680.0, + "grad_norm": 11.68775424684546, + "language_loss": 0.87785619, + "learning_rate": 2.889035461484742e-06, + "loss": 0.8987813, + "num_input_tokens_seen": 133245675, + "router_z_loss_clip": 3.35351562, + "router_z_loss_mlp": 0.35742188, + "step": 6200, + "time_per_iteration": 3.062654495239258 + }, + { + "auxiliary_loss_clip": 0.01730776, + "auxiliary_loss_mlp": 0.00307638, + "balance_loss_clip": 1.39593911, + "balance_loss_mlp": 0.27006292, + "epoch": 0.37282428979407783, + "flos": 39785970211200.0, + "grad_norm": 5.216280328096078, + "language_loss": 0.66756642, + "learning_rate": 2.88868657651991e-06, + "loss": 0.68795061, + "num_input_tokens_seen": 133266905, + "router_z_loss_clip": 3.34765625, + "router_z_loss_mlp": 0.37573242, + "step": 6201, + "time_per_iteration": 2.8229570388793945 + }, + { + "auxiliary_loss_clip": 0.01720933, + "auxiliary_loss_mlp": 0.00349912, + "balance_loss_clip": 1.38732576, + "balance_loss_mlp": 0.3128618, + "epoch": 0.37288441304674586, + "flos": 22709046447360.0, + "grad_norm": 6.938724169525751, + "language_loss": 0.80283368, + "learning_rate": 2.8883376578559934e-06, + "loss": 0.82354218, + "num_input_tokens_seen": 133286865, + "router_z_loss_clip": 3.3359375, + "router_z_loss_mlp": 0.37036133, + "step": 6202, + "time_per_iteration": 2.6978414058685303 + }, + { + "auxiliary_loss_clip": 0.01727644, + "auxiliary_loss_mlp": 0.00326534, + "balance_loss_clip": 1.39674366, + "balance_loss_mlp": 0.28946, + "epoch": 0.3729445362994138, + "flos": 18770292587520.0, + "grad_norm": 1163.5513950143343, + "language_loss": 0.80674016, + "learning_rate": 2.8879887055062243e-06, + "loss": 0.82728195, + "num_input_tokens_seen": 133305295, + "router_z_loss_clip": 3.31054688, + "router_z_loss_mlp": 0.37084961, + "step": 6203, + "time_per_iteration": 2.678176164627075 + }, + { + "auxiliary_loss_clip": 0.0171213, + "auxiliary_loss_mlp": 0.00322028, + "balance_loss_clip": 1.38334107, + "balance_loss_mlp": 0.28638479, + "epoch": 0.3730046595520818, + "flos": 22456199635200.0, + "grad_norm": 159.8452322186409, + "language_loss": 0.860587, + "learning_rate": 2.8876397194838353e-06, + "loss": 0.88092858, + "num_input_tokens_seen": 133324625, + "router_z_loss_clip": 3.29101562, + "router_z_loss_mlp": 0.35620117, + "step": 6204, + "time_per_iteration": 2.695551633834839 + }, + { + "auxiliary_loss_clip": 0.01704644, + "auxiliary_loss_mlp": 0.00295229, + "balance_loss_clip": 1.37738121, + "balance_loss_mlp": 0.26034814, + "epoch": 0.37306478280474975, + "flos": 24316372241280.0, + "grad_norm": 1.9237567676402594, + "language_loss": 0.81729567, + "learning_rate": 2.8872906998020577e-06, + "loss": 0.83729434, + "num_input_tokens_seen": 133344625, + "router_z_loss_clip": 3.2734375, + "router_z_loss_mlp": 0.34838867, + "step": 6205, + "time_per_iteration": 2.6962597370147705 + }, + { + "auxiliary_loss_clip": 0.01708088, + "auxiliary_loss_mlp": 0.00332416, + "balance_loss_clip": 1.38637316, + "balance_loss_mlp": 0.29415029, + "epoch": 0.3731249060574177, + "flos": 15815167741440.0, + "grad_norm": 124.67974357147908, + "language_loss": 0.87893176, + "learning_rate": 2.886941646474128e-06, + "loss": 0.89933681, + "num_input_tokens_seen": 133363605, + "router_z_loss_clip": 3.22070312, + "router_z_loss_mlp": 0.38256836, + "step": 6206, + "time_per_iteration": 2.623389959335327 + }, + { + "auxiliary_loss_clip": 0.01700995, + "auxiliary_loss_mlp": 0.00294829, + "balance_loss_clip": 1.38045382, + "balance_loss_mlp": 0.25718307, + "epoch": 0.3731850293100857, + "flos": 19828077229440.0, + "grad_norm": 21.299436800490437, + "language_loss": 0.99193847, + "learning_rate": 2.886592559513283e-06, + "loss": 1.01189685, + "num_input_tokens_seen": 133379405, + "router_z_loss_clip": 3.20703125, + "router_z_loss_mlp": 0.3762207, + "step": 6207, + "time_per_iteration": 2.643613338470459 + }, + { + "auxiliary_loss_clip": 0.0165991, + "auxiliary_loss_mlp": 0.00277675, + "balance_loss_clip": 1.34810495, + "balance_loss_mlp": 0.24141175, + "epoch": 0.37324515256275365, + "flos": 19062354072960.0, + "grad_norm": 83.53901267553387, + "language_loss": 0.92375898, + "learning_rate": 2.886243438932759e-06, + "loss": 0.94313478, + "num_input_tokens_seen": 133397585, + "router_z_loss_clip": 3.1171875, + "router_z_loss_mlp": 0.36254883, + "step": 6208, + "time_per_iteration": 2.6359732151031494 + }, + { + "auxiliary_loss_clip": 0.01673946, + "auxiliary_loss_mlp": 0.00315116, + "balance_loss_clip": 1.36272693, + "balance_loss_mlp": 0.2784715, + "epoch": 0.3733052758154216, + "flos": 20704333512960.0, + "grad_norm": 19.93076828162548, + "language_loss": 0.80820835, + "learning_rate": 2.8858942847457953e-06, + "loss": 0.82809901, + "num_input_tokens_seen": 133415365, + "router_z_loss_clip": 3.11132812, + "router_z_loss_mlp": 0.36645508, + "step": 6209, + "time_per_iteration": 2.6833019256591797 + }, + { + "auxiliary_loss_clip": 0.01696813, + "auxiliary_loss_mlp": 0.00301319, + "balance_loss_clip": 1.37629461, + "balance_loss_mlp": 0.26522291, + "epoch": 0.3733653990680896, + "flos": 20193504243840.0, + "grad_norm": 1469.0156638428725, + "language_loss": 0.79311258, + "learning_rate": 2.8855450969656305e-06, + "loss": 0.8130939, + "num_input_tokens_seen": 133435700, + "router_z_loss_clip": 3.203125, + "router_z_loss_mlp": 0.36132812, + "step": 6210, + "time_per_iteration": 2.698287010192871 + }, + { + "auxiliary_loss_clip": 0.01704777, + "auxiliary_loss_mlp": 0.00312599, + "balance_loss_clip": 1.38486719, + "balance_loss_mlp": 0.27438077, + "epoch": 0.37342552232075754, + "flos": 20339660684160.0, + "grad_norm": 75.2770940526914, + "language_loss": 0.84696579, + "learning_rate": 2.8851958756055073e-06, + "loss": 0.86713958, + "num_input_tokens_seen": 133455180, + "router_z_loss_clip": 3.1953125, + "router_z_loss_mlp": 0.38183594, + "step": 6211, + "time_per_iteration": 2.790957450866699 + }, + { + "auxiliary_loss_clip": 0.01707597, + "auxiliary_loss_mlp": 0.00317616, + "balance_loss_clip": 1.38144886, + "balance_loss_mlp": 0.28142402, + "epoch": 0.3734856455734255, + "flos": 35517879527040.0, + "grad_norm": 46.80030827966551, + "language_loss": 0.81229854, + "learning_rate": 2.884846620678668e-06, + "loss": 0.83255064, + "num_input_tokens_seen": 133476715, + "router_z_loss_clip": 3.26171875, + "router_z_loss_mlp": 0.36181641, + "step": 6212, + "time_per_iteration": 2.947744607925415 + }, + { + "auxiliary_loss_clip": 0.01702778, + "auxiliary_loss_mlp": 0.00350404, + "balance_loss_clip": 1.37576818, + "balance_loss_mlp": 0.31175661, + "epoch": 0.37354576882609347, + "flos": 21142300043520.0, + "grad_norm": 102.29957469427833, + "language_loss": 0.88364637, + "learning_rate": 2.884497332198356e-06, + "loss": 0.9041782, + "num_input_tokens_seen": 133494550, + "router_z_loss_clip": 3.27148438, + "router_z_loss_mlp": 0.38623047, + "step": 6213, + "time_per_iteration": 2.6485490798950195 + }, + { + "auxiliary_loss_clip": 0.01744939, + "auxiliary_loss_mlp": 0.00321103, + "balance_loss_clip": 1.41530049, + "balance_loss_mlp": 0.28307498, + "epoch": 0.37360589207876144, + "flos": 21506793304320.0, + "grad_norm": 5.397452728992794, + "language_loss": 0.86660707, + "learning_rate": 2.8841480101778167e-06, + "loss": 0.88726747, + "num_input_tokens_seen": 133512640, + "router_z_loss_clip": 3.30078125, + "router_z_loss_mlp": 0.38012695, + "step": 6214, + "time_per_iteration": 2.6463186740875244 + }, + { + "auxiliary_loss_clip": 0.01770692, + "auxiliary_loss_mlp": 0.00292075, + "balance_loss_clip": 1.43599665, + "balance_loss_mlp": 0.25609717, + "epoch": 0.37366601533142946, + "flos": 38435800861440.0, + "grad_norm": 12.82699136741613, + "language_loss": 0.9174602, + "learning_rate": 2.883798654630296e-06, + "loss": 0.93808794, + "num_input_tokens_seen": 133535540, + "router_z_loss_clip": 3.34960938, + "router_z_loss_mlp": 0.36010742, + "step": 6215, + "time_per_iteration": 2.8581056594848633 + }, + { + "auxiliary_loss_clip": 0.01767993, + "auxiliary_loss_mlp": 0.00315197, + "balance_loss_clip": 1.4239552, + "balance_loss_mlp": 0.27733612, + "epoch": 0.3737261385840974, + "flos": 18441171244800.0, + "grad_norm": 329.43872232355596, + "language_loss": 0.75696039, + "learning_rate": 2.8834492655690423e-06, + "loss": 0.77779227, + "num_input_tokens_seen": 133555795, + "router_z_loss_clip": 3.44140625, + "router_z_loss_mlp": 0.37890625, + "step": 6216, + "time_per_iteration": 2.673511266708374 + }, + { + "auxiliary_loss_clip": 0.01775131, + "auxiliary_loss_mlp": 0.00287296, + "balance_loss_clip": 1.436131, + "balance_loss_mlp": 0.25129485, + "epoch": 0.3737862618367654, + "flos": 22929861306240.0, + "grad_norm": 535.9028297087336, + "language_loss": 0.76544935, + "learning_rate": 2.883099843007303e-06, + "loss": 0.78607357, + "num_input_tokens_seen": 133575905, + "router_z_loss_clip": 3.38867188, + "router_z_loss_mlp": 0.35986328, + "step": 6217, + "time_per_iteration": 2.6698174476623535 + }, + { + "auxiliary_loss_clip": 0.0179696, + "auxiliary_loss_mlp": 0.00301545, + "balance_loss_clip": 1.44641304, + "balance_loss_mlp": 0.26246798, + "epoch": 0.37384638508943335, + "flos": 15409664127360.0, + "grad_norm": 100.86209809413465, + "language_loss": 0.8634932, + "learning_rate": 2.88275038695833e-06, + "loss": 0.88447827, + "num_input_tokens_seen": 133592585, + "router_z_loss_clip": 3.50585938, + "router_z_loss_mlp": 0.39086914, + "step": 6218, + "time_per_iteration": 2.6003971099853516 + }, + { + "auxiliary_loss_clip": 0.01809767, + "auxiliary_loss_mlp": 0.00314998, + "balance_loss_clip": 1.45789826, + "balance_loss_mlp": 0.27759027, + "epoch": 0.3739065083421013, + "flos": 24280820755200.0, + "grad_norm": 12.700535646905603, + "language_loss": 0.84185207, + "learning_rate": 2.8824008974353736e-06, + "loss": 0.86309969, + "num_input_tokens_seen": 133615070, + "router_z_loss_clip": 3.5234375, + "router_z_loss_mlp": 0.37402344, + "step": 6219, + "time_per_iteration": 2.7263286113739014 + }, + { + "auxiliary_loss_clip": 0.01852811, + "auxiliary_loss_mlp": 0.00321785, + "balance_loss_clip": 1.48415279, + "balance_loss_mlp": 0.28430516, + "epoch": 0.3739666315947693, + "flos": 23002831785600.0, + "grad_norm": 77.72955449991898, + "language_loss": 0.83332592, + "learning_rate": 2.8820513744516866e-06, + "loss": 0.8550719, + "num_input_tokens_seen": 133633490, + "router_z_loss_clip": 3.6875, + "router_z_loss_mlp": 0.375, + "step": 6220, + "time_per_iteration": 2.629988193511963 + }, + { + "auxiliary_loss_clip": 0.01864375, + "auxiliary_loss_mlp": 0.00346924, + "balance_loss_clip": 1.48142207, + "balance_loss_mlp": 0.30458063, + "epoch": 0.37402675484743725, + "flos": 19391116279680.0, + "grad_norm": 2.7785801449467717, + "language_loss": 0.89500743, + "learning_rate": 2.8817018180205235e-06, + "loss": 0.91712046, + "num_input_tokens_seen": 133653425, + "router_z_loss_clip": 3.828125, + "router_z_loss_mlp": 0.4230957, + "step": 6221, + "time_per_iteration": 2.6810359954833984 + }, + { + "auxiliary_loss_clip": 0.01874178, + "auxiliary_loss_mlp": 0.00330632, + "balance_loss_clip": 1.49085593, + "balance_loss_mlp": 0.29458317, + "epoch": 0.3740868781001052, + "flos": 17126158331520.0, + "grad_norm": 3.156957643583962, + "language_loss": 0.82266426, + "learning_rate": 2.8813522281551387e-06, + "loss": 0.84471238, + "num_input_tokens_seen": 133670220, + "router_z_loss_clip": 3.83203125, + "router_z_loss_mlp": 0.36035156, + "step": 6222, + "time_per_iteration": 2.697625160217285 + }, + { + "auxiliary_loss_clip": 0.01889468, + "auxiliary_loss_mlp": 0.00333942, + "balance_loss_clip": 1.50639224, + "balance_loss_mlp": 0.29553324, + "epoch": 0.3741470013527732, + "flos": 20043505048320.0, + "grad_norm": 6.583884574919026, + "language_loss": 0.77513599, + "learning_rate": 2.881002604868789e-06, + "loss": 0.79737008, + "num_input_tokens_seen": 133688910, + "router_z_loss_clip": 3.83203125, + "router_z_loss_mlp": 0.3840332, + "step": 6223, + "time_per_iteration": 4.179046154022217 + }, + { + "auxiliary_loss_clip": 0.01876546, + "auxiliary_loss_mlp": 0.00312015, + "balance_loss_clip": 1.49775434, + "balance_loss_mlp": 0.2749173, + "epoch": 0.37420712460544114, + "flos": 36897279569280.0, + "grad_norm": 24.42932142784083, + "language_loss": 0.75529999, + "learning_rate": 2.8806529481747325e-06, + "loss": 0.77718556, + "num_input_tokens_seen": 133708690, + "router_z_loss_clip": 3.78710938, + "router_z_loss_mlp": 0.37133789, + "step": 6224, + "time_per_iteration": 4.267859697341919 + }, + { + "auxiliary_loss_clip": 0.01903215, + "auxiliary_loss_mlp": 0.00321637, + "balance_loss_clip": 1.50710392, + "balance_loss_mlp": 0.2822265, + "epoch": 0.3742672478581091, + "flos": 22201198007040.0, + "grad_norm": 169.8052249715832, + "language_loss": 0.75917518, + "learning_rate": 2.880303258086228e-06, + "loss": 0.78142369, + "num_input_tokens_seen": 133728095, + "router_z_loss_clip": 3.96289062, + "router_z_loss_mlp": 0.39379883, + "step": 6225, + "time_per_iteration": 2.6787261962890625 + }, + { + "auxiliary_loss_clip": 0.01903505, + "auxiliary_loss_mlp": 0.00374613, + "balance_loss_clip": 1.50762856, + "balance_loss_mlp": 0.33160207, + "epoch": 0.3743273711107771, + "flos": 24681547860480.0, + "grad_norm": 5.578502963536227, + "language_loss": 0.86453331, + "learning_rate": 2.879953534616536e-06, + "loss": 0.8873145, + "num_input_tokens_seen": 133745590, + "router_z_loss_clip": 3.95117188, + "router_z_loss_mlp": 0.42993164, + "step": 6226, + "time_per_iteration": 2.6953277587890625 + }, + { + "auxiliary_loss_clip": 0.01879939, + "auxiliary_loss_mlp": 0.00344279, + "balance_loss_clip": 1.49191642, + "balance_loss_mlp": 0.30720508, + "epoch": 0.37438749436344504, + "flos": 24459619680000.0, + "grad_norm": 9.848099541722927, + "language_loss": 0.7537905, + "learning_rate": 2.879603777778917e-06, + "loss": 0.77603269, + "num_input_tokens_seen": 133766155, + "router_z_loss_clip": 3.87695312, + "router_z_loss_mlp": 0.37060547, + "step": 6227, + "time_per_iteration": 2.7418878078460693 + }, + { + "auxiliary_loss_clip": 0.01872247, + "auxiliary_loss_mlp": 0.00300065, + "balance_loss_clip": 1.50206828, + "balance_loss_mlp": 0.26310998, + "epoch": 0.374447617616113, + "flos": 21798747048960.0, + "grad_norm": 205.3105839906934, + "language_loss": 0.88903528, + "learning_rate": 2.879253987586635e-06, + "loss": 0.91075844, + "num_input_tokens_seen": 133783185, + "router_z_loss_clip": 3.69921875, + "router_z_loss_mlp": 0.36938477, + "step": 6228, + "time_per_iteration": 4.1091930866241455 + }, + { + "auxiliary_loss_clip": 0.01898255, + "auxiliary_loss_mlp": 0.00338625, + "balance_loss_clip": 1.50758064, + "balance_loss_mlp": 0.30128872, + "epoch": 0.374507740868781, + "flos": 17968191932160.0, + "grad_norm": 3.024899839641628, + "language_loss": 0.80797374, + "learning_rate": 2.8789041640529535e-06, + "loss": 0.83034253, + "num_input_tokens_seen": 133800975, + "router_z_loss_clip": 3.90625, + "router_z_loss_mlp": 0.37304688, + "step": 6229, + "time_per_iteration": 2.7096736431121826 + }, + { + "auxiliary_loss_clip": 0.018801, + "auxiliary_loss_mlp": 0.00336902, + "balance_loss_clip": 1.49933648, + "balance_loss_mlp": 0.2955364, + "epoch": 0.374567864121449, + "flos": 16105828596480.0, + "grad_norm": 6.648066918118238, + "language_loss": 0.90789127, + "learning_rate": 2.8785543071911383e-06, + "loss": 0.93006122, + "num_input_tokens_seen": 133818020, + "router_z_loss_clip": 3.80859375, + "router_z_loss_mlp": 0.41381836, + "step": 6230, + "time_per_iteration": 2.6571669578552246 + }, + { + "auxiliary_loss_clip": 0.01877623, + "auxiliary_loss_mlp": 0.00319483, + "balance_loss_clip": 1.49165916, + "balance_loss_mlp": 0.27938092, + "epoch": 0.37462798737411696, + "flos": 25773160135680.0, + "grad_norm": 2.3191735673029195, + "language_loss": 0.81501675, + "learning_rate": 2.878204417014456e-06, + "loss": 0.83698779, + "num_input_tokens_seen": 133840690, + "router_z_loss_clip": 3.85546875, + "router_z_loss_mlp": 0.40112305, + "step": 6231, + "time_per_iteration": 2.6997509002685547 + }, + { + "auxiliary_loss_clip": 0.0184278, + "auxiliary_loss_mlp": 0.00336784, + "balance_loss_clip": 1.4769578, + "balance_loss_mlp": 0.29658675, + "epoch": 0.3746881106267849, + "flos": 16654507822080.0, + "grad_norm": 30.09528976023357, + "language_loss": 0.81781572, + "learning_rate": 2.8778544935361735e-06, + "loss": 0.83961141, + "num_input_tokens_seen": 133858350, + "router_z_loss_clip": 3.65429688, + "router_z_loss_mlp": 0.40185547, + "step": 6232, + "time_per_iteration": 2.6533267498016357 + }, + { + "auxiliary_loss_clip": 0.01842189, + "auxiliary_loss_mlp": 0.0033891, + "balance_loss_clip": 1.4758178, + "balance_loss_mlp": 0.29852194, + "epoch": 0.3747482338794529, + "flos": 26177981391360.0, + "grad_norm": 13.09794378764582, + "language_loss": 0.82242632, + "learning_rate": 2.877504536769561e-06, + "loss": 0.84423733, + "num_input_tokens_seen": 133879775, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 0.40380859, + "step": 6233, + "time_per_iteration": 2.6974451541900635 + }, + { + "auxiliary_loss_clip": 0.01883734, + "auxiliary_loss_mlp": 0.00335454, + "balance_loss_clip": 1.50510311, + "balance_loss_mlp": 0.2958048, + "epoch": 0.37480835713212085, + "flos": 12021061950720.0, + "grad_norm": 6.411901264153556, + "language_loss": 0.77728009, + "learning_rate": 2.8771545467278883e-06, + "loss": 0.79947197, + "num_input_tokens_seen": 133898295, + "router_z_loss_clip": 3.78710938, + "router_z_loss_mlp": 0.39672852, + "step": 6234, + "time_per_iteration": 2.6605517864227295 + }, + { + "auxiliary_loss_clip": 0.01866197, + "auxiliary_loss_mlp": 0.0030326, + "balance_loss_clip": 1.49676061, + "balance_loss_mlp": 0.26504135, + "epoch": 0.3748684803847888, + "flos": 19679263182720.0, + "grad_norm": 33.99818610903916, + "language_loss": 0.89069664, + "learning_rate": 2.8768045234244276e-06, + "loss": 0.91239119, + "num_input_tokens_seen": 133915230, + "router_z_loss_clip": 3.69140625, + "router_z_loss_mlp": 0.38232422, + "step": 6235, + "time_per_iteration": 4.040600776672363 + }, + { + "auxiliary_loss_clip": 0.01895997, + "auxiliary_loss_mlp": 0.00343683, + "balance_loss_clip": 1.51793718, + "balance_loss_mlp": 0.30520254, + "epoch": 0.3749286036374568, + "flos": 20521189042560.0, + "grad_norm": 21.376432884972886, + "language_loss": 0.85431904, + "learning_rate": 2.8764544668724517e-06, + "loss": 0.87671584, + "num_input_tokens_seen": 133934110, + "router_z_loss_clip": 3.78125, + "router_z_loss_mlp": 0.38500977, + "step": 6236, + "time_per_iteration": 2.6919105052948 + }, + { + "auxiliary_loss_clip": 0.01864658, + "auxiliary_loss_mlp": 0.00327963, + "balance_loss_clip": 1.48695481, + "balance_loss_mlp": 0.28681225, + "epoch": 0.37498872689012475, + "flos": 20704620821760.0, + "grad_norm": 5.5749971956454765, + "language_loss": 0.82549107, + "learning_rate": 2.876104377085234e-06, + "loss": 0.84741735, + "num_input_tokens_seen": 133952395, + "router_z_loss_clip": 3.77734375, + "router_z_loss_mlp": 0.41137695, + "step": 6237, + "time_per_iteration": 2.688678026199341 + }, + { + "auxiliary_loss_clip": 0.01881091, + "auxiliary_loss_mlp": 0.00368646, + "balance_loss_clip": 1.49876022, + "balance_loss_mlp": 0.3283532, + "epoch": 0.3750488501427927, + "flos": 21574843620480.0, + "grad_norm": 5.0796658929377445, + "language_loss": 0.98796141, + "learning_rate": 2.8757542540760508e-06, + "loss": 1.01045883, + "num_input_tokens_seen": 133969635, + "router_z_loss_clip": 3.83007812, + "router_z_loss_mlp": 0.40283203, + "step": 6238, + "time_per_iteration": 2.6438536643981934 + }, + { + "auxiliary_loss_clip": 0.01917394, + "auxiliary_loss_mlp": 0.00358655, + "balance_loss_clip": 1.52026224, + "balance_loss_mlp": 0.31819487, + "epoch": 0.3751089733954607, + "flos": 15923869274880.0, + "grad_norm": 3.651896369504434, + "language_loss": 0.77606487, + "learning_rate": 2.8754040978581777e-06, + "loss": 0.79882538, + "num_input_tokens_seen": 133987215, + "router_z_loss_clip": 3.97851562, + "router_z_loss_mlp": 0.40454102, + "step": 6239, + "time_per_iteration": 2.634272575378418 + }, + { + "auxiliary_loss_clip": 0.01909892, + "auxiliary_loss_mlp": 0.00332075, + "balance_loss_clip": 1.52239239, + "balance_loss_mlp": 0.29326081, + "epoch": 0.37516909664812864, + "flos": 36284644177920.0, + "grad_norm": 60.086590523922126, + "language_loss": 0.73454273, + "learning_rate": 2.875053908444895e-06, + "loss": 0.75696242, + "num_input_tokens_seen": 134009250, + "router_z_loss_clip": 3.87890625, + "router_z_loss_mlp": 0.38818359, + "step": 6240, + "time_per_iteration": 2.7657978534698486 + }, + { + "auxiliary_loss_clip": 0.01922686, + "auxiliary_loss_mlp": 0.00310842, + "balance_loss_clip": 1.52555978, + "balance_loss_mlp": 0.27355331, + "epoch": 0.3752292199007966, + "flos": 13515915283200.0, + "grad_norm": 199.7752184576111, + "language_loss": 0.85524571, + "learning_rate": 2.8747036858494795e-06, + "loss": 0.877581, + "num_input_tokens_seen": 134026875, + "router_z_loss_clip": 3.97460938, + "router_z_loss_mlp": 0.37280273, + "step": 6241, + "time_per_iteration": 2.6335947513580322 + }, + { + "auxiliary_loss_clip": 0.01922911, + "auxiliary_loss_mlp": 0.00363786, + "balance_loss_clip": 1.52964377, + "balance_loss_mlp": 0.32439965, + "epoch": 0.3752893431534646, + "flos": 27198095644800.0, + "grad_norm": 3.309810333109818, + "language_loss": 0.90710175, + "learning_rate": 2.874353430085213e-06, + "loss": 0.92996871, + "num_input_tokens_seen": 134047185, + "router_z_loss_clip": 3.93359375, + "router_z_loss_mlp": 0.39379883, + "step": 6242, + "time_per_iteration": 2.699143886566162 + }, + { + "auxiliary_loss_clip": 0.0192101, + "auxiliary_loss_mlp": 0.00386548, + "balance_loss_clip": 1.52584398, + "balance_loss_mlp": 0.34821069, + "epoch": 0.3753494664061326, + "flos": 30007674581760.0, + "grad_norm": 44.40093327694809, + "language_loss": 0.76479769, + "learning_rate": 2.8740031411653766e-06, + "loss": 0.78787321, + "num_input_tokens_seen": 134067330, + "router_z_loss_clip": 3.9453125, + "router_z_loss_mlp": 0.38354492, + "step": 6243, + "time_per_iteration": 2.7594821453094482 + }, + { + "auxiliary_loss_clip": 0.01919027, + "auxiliary_loss_mlp": 0.00344427, + "balance_loss_clip": 1.5275892, + "balance_loss_mlp": 0.30525458, + "epoch": 0.37540958965880056, + "flos": 24461954064000.0, + "grad_norm": 11.099275734425364, + "language_loss": 0.9187935, + "learning_rate": 2.8736528191032535e-06, + "loss": 0.94142801, + "num_input_tokens_seen": 134085525, + "router_z_loss_clip": 3.91210938, + "router_z_loss_mlp": 0.3918457, + "step": 6244, + "time_per_iteration": 2.706972122192383 + }, + { + "auxiliary_loss_clip": 0.01919294, + "auxiliary_loss_mlp": 0.0037486, + "balance_loss_clip": 1.53721941, + "balance_loss_mlp": 0.33587897, + "epoch": 0.3754697129114685, + "flos": 16508387295360.0, + "grad_norm": 8.00230730849742, + "language_loss": 0.92057246, + "learning_rate": 2.8733024639121277e-06, + "loss": 0.94351399, + "num_input_tokens_seen": 134101855, + "router_z_loss_clip": 3.81640625, + "router_z_loss_mlp": 0.39013672, + "step": 6245, + "time_per_iteration": 2.678317070007324 + }, + { + "auxiliary_loss_clip": 0.01907928, + "auxiliary_loss_mlp": 0.00360349, + "balance_loss_clip": 1.52120686, + "balance_loss_mlp": 0.32122433, + "epoch": 0.3755298361641365, + "flos": 19390900798080.0, + "grad_norm": 34.42144106574157, + "language_loss": 0.73644412, + "learning_rate": 2.8729520756052853e-06, + "loss": 0.75912684, + "num_input_tokens_seen": 134119360, + "router_z_loss_clip": 3.87109375, + "router_z_loss_mlp": 0.39135742, + "step": 6246, + "time_per_iteration": 2.6878113746643066 + }, + { + "auxiliary_loss_clip": 0.01922199, + "auxiliary_loss_mlp": 0.00434356, + "balance_loss_clip": 1.52981496, + "balance_loss_mlp": 0.39217934, + "epoch": 0.37558995941680445, + "flos": 14720395069440.0, + "grad_norm": 105.79631757964029, + "language_loss": 0.82674861, + "learning_rate": 2.8726016541960124e-06, + "loss": 0.8503142, + "num_input_tokens_seen": 134137475, + "router_z_loss_clip": 3.91796875, + "router_z_loss_mlp": 0.42163086, + "step": 6247, + "time_per_iteration": 2.6661903858184814 + }, + { + "auxiliary_loss_clip": 0.01899771, + "auxiliary_loss_mlp": 0.00427946, + "balance_loss_clip": 1.51067817, + "balance_loss_mlp": 0.38584161, + "epoch": 0.3756500826694724, + "flos": 21689901861120.0, + "grad_norm": 42.35756937260307, + "language_loss": 0.61449897, + "learning_rate": 2.872251199697598e-06, + "loss": 0.63777614, + "num_input_tokens_seen": 134154580, + "router_z_loss_clip": 3.88867188, + "router_z_loss_mlp": 0.4206543, + "step": 6248, + "time_per_iteration": 2.696249008178711 + }, + { + "auxiliary_loss_clip": 0.01926245, + "auxiliary_loss_mlp": 0.00411943, + "balance_loss_clip": 1.53763962, + "balance_loss_mlp": 0.37084013, + "epoch": 0.3757102059221404, + "flos": 26505666190080.0, + "grad_norm": 12.697953628671101, + "language_loss": 0.90617585, + "learning_rate": 2.8719007121233297e-06, + "loss": 0.92955774, + "num_input_tokens_seen": 134174285, + "router_z_loss_clip": 3.88867188, + "router_z_loss_mlp": 0.41113281, + "step": 6249, + "time_per_iteration": 2.7899115085601807 + }, + { + "auxiliary_loss_clip": 0.01911216, + "auxiliary_loss_mlp": 0.00397112, + "balance_loss_clip": 1.52416396, + "balance_loss_mlp": 0.35624677, + "epoch": 0.37577032917480835, + "flos": 37338083274240.0, + "grad_norm": 14.403286407543535, + "language_loss": 0.76266086, + "learning_rate": 2.8715501914864993e-06, + "loss": 0.78574419, + "num_input_tokens_seen": 134195940, + "router_z_loss_clip": 3.87109375, + "router_z_loss_mlp": 0.40844727, + "step": 6250, + "time_per_iteration": 2.920860767364502 + }, + { + "auxiliary_loss_clip": 0.0191818, + "auxiliary_loss_mlp": 0.00417517, + "balance_loss_clip": 1.53638887, + "balance_loss_mlp": 0.38001367, + "epoch": 0.3758304524274763, + "flos": 21908597817600.0, + "grad_norm": 82.07551195271114, + "language_loss": 0.84240448, + "learning_rate": 2.8711996378003987e-06, + "loss": 0.86576152, + "num_input_tokens_seen": 134212235, + "router_z_loss_clip": 3.81835938, + "router_z_loss_mlp": 0.37524414, + "step": 6251, + "time_per_iteration": 2.771466016769409 + }, + { + "auxiliary_loss_clip": 0.01917488, + "auxiliary_loss_mlp": 0.00377633, + "balance_loss_clip": 1.54096746, + "balance_loss_mlp": 0.34246644, + "epoch": 0.3758905756801443, + "flos": 36569343375360.0, + "grad_norm": 57.597289568598285, + "language_loss": 0.65622747, + "learning_rate": 2.8708490510783203e-06, + "loss": 0.67917871, + "num_input_tokens_seen": 134233810, + "router_z_loss_clip": 3.765625, + "router_z_loss_mlp": 0.35180664, + "step": 6252, + "time_per_iteration": 2.845865488052368 + }, + { + "auxiliary_loss_clip": 0.01918672, + "auxiliary_loss_mlp": 0.00427225, + "balance_loss_clip": 1.53209269, + "balance_loss_mlp": 0.38407123, + "epoch": 0.37595069893281224, + "flos": 24528783317760.0, + "grad_norm": 66.0750705566635, + "language_loss": 0.9460175, + "learning_rate": 2.8704984313335584e-06, + "loss": 0.96947646, + "num_input_tokens_seen": 134252020, + "router_z_loss_clip": 3.86132812, + "router_z_loss_mlp": 0.43164062, + "step": 6253, + "time_per_iteration": 2.701983690261841 + }, + { + "auxiliary_loss_clip": 0.01925117, + "auxiliary_loss_mlp": 0.00379594, + "balance_loss_clip": 1.54306209, + "balance_loss_mlp": 0.33984971, + "epoch": 0.3760108221854802, + "flos": 16435021766400.0, + "grad_norm": 6.166453486783868, + "language_loss": 0.86335933, + "learning_rate": 2.8701477785794097e-06, + "loss": 0.88640642, + "num_input_tokens_seen": 134269495, + "router_z_loss_clip": 3.81835938, + "router_z_loss_mlp": 0.39746094, + "step": 6254, + "time_per_iteration": 2.8000218868255615 + }, + { + "auxiliary_loss_clip": 0.0191129, + "auxiliary_loss_mlp": 0.00438602, + "balance_loss_clip": 1.52666545, + "balance_loss_mlp": 0.39442289, + "epoch": 0.37607094543814823, + "flos": 13771742924160.0, + "grad_norm": 3.9619038077928534, + "language_loss": 0.711824, + "learning_rate": 2.869797092829169e-06, + "loss": 0.73532289, + "num_input_tokens_seen": 134287035, + "router_z_loss_clip": 3.84960938, + "router_z_loss_mlp": 0.44165039, + "step": 6255, + "time_per_iteration": 2.634838819503784 + }, + { + "auxiliary_loss_clip": 0.01900484, + "auxiliary_loss_mlp": 0.00393473, + "balance_loss_clip": 1.52691901, + "balance_loss_mlp": 0.35487255, + "epoch": 0.3761310686908162, + "flos": 19857918453120.0, + "grad_norm": 112.35511098297715, + "language_loss": 0.81516039, + "learning_rate": 2.869446374096135e-06, + "loss": 0.83810002, + "num_input_tokens_seen": 134304840, + "router_z_loss_clip": 3.734375, + "router_z_loss_mlp": 0.38598633, + "step": 6256, + "time_per_iteration": 2.6313912868499756 + }, + { + "auxiliary_loss_clip": 0.01897294, + "auxiliary_loss_mlp": 0.00432489, + "balance_loss_clip": 1.52077293, + "balance_loss_mlp": 0.39155272, + "epoch": 0.37619119194348416, + "flos": 12750802657920.0, + "grad_norm": 131.21033826536137, + "language_loss": 0.78961587, + "learning_rate": 2.8690956223936088e-06, + "loss": 0.81291372, + "num_input_tokens_seen": 134323180, + "router_z_loss_clip": 3.76757812, + "router_z_loss_mlp": 0.40942383, + "step": 6257, + "time_per_iteration": 2.6628663539886475 + }, + { + "auxiliary_loss_clip": 0.01901021, + "auxiliary_loss_mlp": 0.00386117, + "balance_loss_clip": 1.52443123, + "balance_loss_mlp": 0.34663504, + "epoch": 0.3762513151961521, + "flos": 17530548624000.0, + "grad_norm": 24.969304995305503, + "language_loss": 0.90571475, + "learning_rate": 2.868744837734889e-06, + "loss": 0.92858613, + "num_input_tokens_seen": 134341390, + "router_z_loss_clip": 3.76757812, + "router_z_loss_mlp": 0.39477539, + "step": 6258, + "time_per_iteration": 2.724742889404297 + }, + { + "auxiliary_loss_clip": 0.01924873, + "auxiliary_loss_mlp": 0.00422984, + "balance_loss_clip": 1.54692984, + "balance_loss_mlp": 0.38326323, + "epoch": 0.3763114384488201, + "flos": 23617406511360.0, + "grad_norm": 9.53932641636149, + "language_loss": 0.86272085, + "learning_rate": 2.868394020133277e-06, + "loss": 0.88619936, + "num_input_tokens_seen": 134360425, + "router_z_loss_clip": 3.78125, + "router_z_loss_mlp": 0.39697266, + "step": 6259, + "time_per_iteration": 2.6500914096832275 + }, + { + "auxiliary_loss_clip": 0.0194715, + "auxiliary_loss_mlp": 0.0043783, + "balance_loss_clip": 1.55209351, + "balance_loss_mlp": 0.39086145, + "epoch": 0.37637156170148806, + "flos": 25406978935680.0, + "grad_norm": 17.876668162632317, + "language_loss": 0.78715622, + "learning_rate": 2.8680431696020783e-06, + "loss": 0.81100601, + "num_input_tokens_seen": 134379775, + "router_z_loss_clip": 3.94921875, + "router_z_loss_mlp": 0.46972656, + "step": 6260, + "time_per_iteration": 2.6986477375030518 + }, + { + "auxiliary_loss_clip": 0.01940847, + "auxiliary_loss_mlp": 0.00389913, + "balance_loss_clip": 1.55044723, + "balance_loss_mlp": 0.34749818, + "epoch": 0.376431684954156, + "flos": 23440906056960.0, + "grad_norm": 27.977444205653715, + "language_loss": 0.85491079, + "learning_rate": 2.867692286154594e-06, + "loss": 0.87821841, + "num_input_tokens_seen": 134400315, + "router_z_loss_clip": 3.90820312, + "router_z_loss_mlp": 0.42407227, + "step": 6261, + "time_per_iteration": 2.737029552459717 + }, + { + "auxiliary_loss_clip": 0.01936204, + "auxiliary_loss_mlp": 0.00422871, + "balance_loss_clip": 1.55122674, + "balance_loss_mlp": 0.38298365, + "epoch": 0.376491808206824, + "flos": 34204482725760.0, + "grad_norm": 2.2088847308571857, + "language_loss": 0.86401594, + "learning_rate": 2.867341369804132e-06, + "loss": 0.88760668, + "num_input_tokens_seen": 134422875, + "router_z_loss_clip": 3.84960938, + "router_z_loss_mlp": 0.39892578, + "step": 6262, + "time_per_iteration": 2.758775234222412 + }, + { + "auxiliary_loss_clip": 0.01944814, + "auxiliary_loss_mlp": 0.00448658, + "balance_loss_clip": 1.56329679, + "balance_loss_mlp": 0.40767425, + "epoch": 0.37655193145949195, + "flos": 35185669614720.0, + "grad_norm": 3.5780132671943448, + "language_loss": 0.85952234, + "learning_rate": 2.866990420563998e-06, + "loss": 0.88345706, + "num_input_tokens_seen": 134443025, + "router_z_loss_clip": 3.81640625, + "router_z_loss_mlp": 0.40966797, + "step": 6263, + "time_per_iteration": 2.7555785179138184 + }, + { + "auxiliary_loss_clip": 0.019325, + "auxiliary_loss_mlp": 0.00405261, + "balance_loss_clip": 1.55567241, + "balance_loss_mlp": 0.3624168, + "epoch": 0.3766120547121599, + "flos": 16761844638720.0, + "grad_norm": 102.60994173008052, + "language_loss": 0.88269889, + "learning_rate": 2.866639438447501e-06, + "loss": 0.90607655, + "num_input_tokens_seen": 134460945, + "router_z_loss_clip": 3.76757812, + "router_z_loss_mlp": 0.42871094, + "step": 6264, + "time_per_iteration": 2.7224411964416504 + }, + { + "auxiliary_loss_clip": 0.0192823, + "auxiliary_loss_mlp": 0.00382629, + "balance_loss_clip": 1.55286622, + "balance_loss_mlp": 0.34226483, + "epoch": 0.3766721779648279, + "flos": 23550361776000.0, + "grad_norm": 13.401158443692575, + "language_loss": 0.81099808, + "learning_rate": 2.8662884234679497e-06, + "loss": 0.83410668, + "num_input_tokens_seen": 134480440, + "router_z_loss_clip": 3.75, + "router_z_loss_mlp": 0.40356445, + "step": 6265, + "time_per_iteration": 4.1410229206085205 + }, + { + "auxiliary_loss_clip": 0.01954951, + "auxiliary_loss_mlp": 0.00368251, + "balance_loss_clip": 1.58013511, + "balance_loss_mlp": 0.33058068, + "epoch": 0.37673230121749585, + "flos": 29129191655040.0, + "grad_norm": 79.97532432961876, + "language_loss": 0.71765381, + "learning_rate": 2.865937375638654e-06, + "loss": 0.74088585, + "num_input_tokens_seen": 134501110, + "router_z_loss_clip": 3.74609375, + "router_z_loss_mlp": 0.37719727, + "step": 6266, + "time_per_iteration": 2.7063207626342773 + }, + { + "auxiliary_loss_clip": 0.01924775, + "auxiliary_loss_mlp": 0.00410118, + "balance_loss_clip": 1.54506183, + "balance_loss_mlp": 0.36751226, + "epoch": 0.3767924244701638, + "flos": 28146783703680.0, + "grad_norm": 4.526592905307364, + "language_loss": 0.70813394, + "learning_rate": 2.8655862949729264e-06, + "loss": 0.73148286, + "num_input_tokens_seen": 134522460, + "router_z_loss_clip": 3.79882812, + "router_z_loss_mlp": 0.42626953, + "step": 6267, + "time_per_iteration": 4.098716974258423 + }, + { + "auxiliary_loss_clip": 0.02011626, + "auxiliary_loss_mlp": 0.00405127, + "balance_loss_clip": 1.763116, + "balance_loss_mlp": 0.39034477, + "epoch": 0.37685254772283183, + "flos": 60797197526400.0, + "grad_norm": 0.7475781923874779, + "language_loss": 0.58947766, + "learning_rate": 2.8652351814840795e-06, + "loss": 0.6136452, + "num_input_tokens_seen": 134589545, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.14746094, + "step": 6268, + "time_per_iteration": 3.2631325721740723 + }, + { + "auxiliary_loss_clip": 0.01921774, + "auxiliary_loss_mlp": 0.00392342, + "balance_loss_clip": 1.55667663, + "balance_loss_mlp": 0.3545289, + "epoch": 0.3769126709754998, + "flos": 26032543223040.0, + "grad_norm": 6.189676524820148, + "language_loss": 0.70643169, + "learning_rate": 2.8648840351854283e-06, + "loss": 0.72957283, + "num_input_tokens_seen": 134610550, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 0.37817383, + "step": 6269, + "time_per_iteration": 2.698833703994751 + }, + { + "auxiliary_loss_clip": 0.01914414, + "auxiliary_loss_mlp": 0.00364811, + "balance_loss_clip": 1.55721176, + "balance_loss_mlp": 0.32783186, + "epoch": 0.37697279422816776, + "flos": 23579879777280.0, + "grad_norm": 30.232906005912522, + "language_loss": 0.77553439, + "learning_rate": 2.8645328560902874e-06, + "loss": 0.79832667, + "num_input_tokens_seen": 134630485, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 0.37011719, + "step": 6270, + "time_per_iteration": 2.657090663909912 + }, + { + "auxiliary_loss_clip": 0.01941315, + "auxiliary_loss_mlp": 0.00267897, + "balance_loss_clip": 1.71488154, + "balance_loss_mlp": 0.25445032, + "epoch": 0.3770329174808357, + "flos": 64745935367040.0, + "grad_norm": 0.717428500404105, + "language_loss": 0.56058997, + "learning_rate": 2.8641816442119746e-06, + "loss": 0.58268213, + "num_input_tokens_seen": 134693510, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.13476562, + "step": 6271, + "time_per_iteration": 4.5436851978302 + }, + { + "auxiliary_loss_clip": 0.01896872, + "auxiliary_loss_mlp": 0.00365614, + "balance_loss_clip": 1.54343414, + "balance_loss_mlp": 0.32281822, + "epoch": 0.3770930407335037, + "flos": 21835304115840.0, + "grad_norm": 4.402440366136075, + "language_loss": 0.85404819, + "learning_rate": 2.8638303995638066e-06, + "loss": 0.8766731, + "num_input_tokens_seen": 134713115, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 0.42797852, + "step": 6272, + "time_per_iteration": 2.6630859375 + }, + { + "auxiliary_loss_clip": 0.01858111, + "auxiliary_loss_mlp": 0.00365946, + "balance_loss_clip": 1.50870681, + "balance_loss_mlp": 0.3298493, + "epoch": 0.37715316398617166, + "flos": 22747901984640.0, + "grad_norm": 331.6858930783031, + "language_loss": 0.79920292, + "learning_rate": 2.863479122159103e-06, + "loss": 0.82144356, + "num_input_tokens_seen": 134732635, + "router_z_loss_clip": 3.49023438, + "router_z_loss_mlp": 0.36083984, + "step": 6273, + "time_per_iteration": 2.6434078216552734 + }, + { + "auxiliary_loss_clip": 0.01881123, + "auxiliary_loss_mlp": 0.00367709, + "balance_loss_clip": 1.53484547, + "balance_loss_mlp": 0.32922816, + "epoch": 0.3772132872388396, + "flos": 18914581520640.0, + "grad_norm": 6.894333195275504, + "language_loss": 0.77053726, + "learning_rate": 2.8631278120111858e-06, + "loss": 0.79302561, + "num_input_tokens_seen": 134750695, + "router_z_loss_clip": 3.46484375, + "router_z_loss_mlp": 0.38452148, + "step": 6274, + "time_per_iteration": 2.6399075984954834 + }, + { + "auxiliary_loss_clip": 0.01855099, + "auxiliary_loss_mlp": 0.00351244, + "balance_loss_clip": 1.50401592, + "balance_loss_mlp": 0.31405061, + "epoch": 0.3772734104915076, + "flos": 17346219004800.0, + "grad_norm": 28.69803074862732, + "language_loss": 0.91633308, + "learning_rate": 2.8627764691333742e-06, + "loss": 0.93839657, + "num_input_tokens_seen": 134768935, + "router_z_loss_clip": 3.50976562, + "router_z_loss_mlp": 0.37158203, + "step": 6275, + "time_per_iteration": 2.752255439758301 + }, + { + "auxiliary_loss_clip": 0.01856299, + "auxiliary_loss_mlp": 0.00327014, + "balance_loss_clip": 1.51735699, + "balance_loss_mlp": 0.29048866, + "epoch": 0.37733353374417555, + "flos": 32342370785280.0, + "grad_norm": 5.350188949454512, + "language_loss": 0.79056835, + "learning_rate": 2.8624250935389935e-06, + "loss": 0.81240153, + "num_input_tokens_seen": 134791260, + "router_z_loss_clip": 3.38867188, + "router_z_loss_mlp": 0.36523438, + "step": 6276, + "time_per_iteration": 2.782949924468994 + }, + { + "auxiliary_loss_clip": 0.0185671, + "auxiliary_loss_mlp": 0.00352035, + "balance_loss_clip": 1.5153687, + "balance_loss_mlp": 0.31245792, + "epoch": 0.3773936569968435, + "flos": 23360681030400.0, + "grad_norm": 16.256338316858308, + "language_loss": 0.91296548, + "learning_rate": 2.862073685241366e-06, + "loss": 0.93505299, + "num_input_tokens_seen": 134808350, + "router_z_loss_clip": 3.41992188, + "router_z_loss_mlp": 0.39599609, + "step": 6277, + "time_per_iteration": 4.02819561958313 + }, + { + "auxiliary_loss_clip": 0.01824626, + "auxiliary_loss_mlp": 0.00387982, + "balance_loss_clip": 1.49706626, + "balance_loss_mlp": 0.34752226, + "epoch": 0.3774537802495115, + "flos": 21466788531840.0, + "grad_norm": 16.729840689338406, + "language_loss": 0.83216047, + "learning_rate": 2.861722244253818e-06, + "loss": 0.85428655, + "num_input_tokens_seen": 134826005, + "router_z_loss_clip": 3.27734375, + "router_z_loss_mlp": 0.4050293, + "step": 6278, + "time_per_iteration": 2.6059980392456055 + }, + { + "auxiliary_loss_clip": 0.01831186, + "auxiliary_loss_mlp": 0.00424008, + "balance_loss_clip": 1.48937809, + "balance_loss_mlp": 0.38218969, + "epoch": 0.37751390350217945, + "flos": 24973717086720.0, + "grad_norm": 41.48152221048386, + "language_loss": 0.90985274, + "learning_rate": 2.8613707705896767e-06, + "loss": 0.93240464, + "num_input_tokens_seen": 134844995, + "router_z_loss_clip": 3.421875, + "router_z_loss_mlp": 0.41821289, + "step": 6279, + "time_per_iteration": 2.7499871253967285 + }, + { + "auxiliary_loss_clip": 0.01808258, + "auxiliary_loss_mlp": 0.00338236, + "balance_loss_clip": 1.47714269, + "balance_loss_mlp": 0.30094689, + "epoch": 0.3775740267548474, + "flos": 27819098904960.0, + "grad_norm": 79.84066725775578, + "language_loss": 0.81239927, + "learning_rate": 2.861019264262269e-06, + "loss": 0.83386421, + "num_input_tokens_seen": 134865285, + "router_z_loss_clip": 3.30859375, + "router_z_loss_mlp": 0.37304688, + "step": 6280, + "time_per_iteration": 2.6716761589050293 + }, + { + "auxiliary_loss_clip": 0.01844457, + "auxiliary_loss_mlp": 0.00392318, + "balance_loss_clip": 1.50813663, + "balance_loss_mlp": 0.34883031, + "epoch": 0.3776341500075154, + "flos": 22565224391040.0, + "grad_norm": 2.4562968593805463, + "language_loss": 0.79573143, + "learning_rate": 2.8606677252849242e-06, + "loss": 0.81809914, + "num_input_tokens_seen": 134886535, + "router_z_loss_clip": 3.36328125, + "router_z_loss_mlp": 0.43505859, + "step": 6281, + "time_per_iteration": 2.646883964538574 + }, + { + "auxiliary_loss_clip": 0.01807167, + "auxiliary_loss_mlp": 0.00378813, + "balance_loss_clip": 1.4801383, + "balance_loss_mlp": 0.3399511, + "epoch": 0.3776942732601834, + "flos": 23077238808960.0, + "grad_norm": 13.516866568311812, + "language_loss": 0.87936985, + "learning_rate": 2.860316153670974e-06, + "loss": 0.90122962, + "num_input_tokens_seen": 134907435, + "router_z_loss_clip": 3.27539062, + "router_z_loss_mlp": 0.38842773, + "step": 6282, + "time_per_iteration": 2.6472721099853516 + }, + { + "auxiliary_loss_clip": 0.01804149, + "auxiliary_loss_mlp": 0.00411838, + "balance_loss_clip": 1.47764087, + "balance_loss_mlp": 0.37040126, + "epoch": 0.37775439651285136, + "flos": 21724411852800.0, + "grad_norm": 11.901492485475334, + "language_loss": 0.75809646, + "learning_rate": 2.8599645494337484e-06, + "loss": 0.78025639, + "num_input_tokens_seen": 134925360, + "router_z_loss_clip": 3.265625, + "router_z_loss_mlp": 0.41455078, + "step": 6283, + "time_per_iteration": 2.6179730892181396 + }, + { + "auxiliary_loss_clip": 0.01833217, + "auxiliary_loss_mlp": 0.00408684, + "balance_loss_clip": 1.50414264, + "balance_loss_mlp": 0.36712807, + "epoch": 0.37781451976551933, + "flos": 23987753688960.0, + "grad_norm": 35.10569096213132, + "language_loss": 0.84272927, + "learning_rate": 2.859612912586581e-06, + "loss": 0.86514831, + "num_input_tokens_seen": 134944205, + "router_z_loss_clip": 3.29101562, + "router_z_loss_mlp": 0.41577148, + "step": 6284, + "time_per_iteration": 2.673311710357666 + }, + { + "auxiliary_loss_clip": 0.01811814, + "auxiliary_loss_mlp": 0.00416936, + "balance_loss_clip": 1.47777176, + "balance_loss_mlp": 0.37227991, + "epoch": 0.3778746430181873, + "flos": 13727967223680.0, + "grad_norm": 7.687931518994306, + "language_loss": 0.95356166, + "learning_rate": 2.8592612431428055e-06, + "loss": 0.97584915, + "num_input_tokens_seen": 134960255, + "router_z_loss_clip": 3.33984375, + "router_z_loss_mlp": 0.44628906, + "step": 6285, + "time_per_iteration": 2.633985757827759 + }, + { + "auxiliary_loss_clip": 0.01821531, + "auxiliary_loss_mlp": 0.00381419, + "balance_loss_clip": 1.4865669, + "balance_loss_mlp": 0.33733505, + "epoch": 0.37793476627085526, + "flos": 19460495399040.0, + "grad_norm": 45.57120108941556, + "language_loss": 0.90695649, + "learning_rate": 2.858909541115758e-06, + "loss": 0.92898601, + "num_input_tokens_seen": 134978605, + "router_z_loss_clip": 3.34570312, + "router_z_loss_mlp": 0.44091797, + "step": 6286, + "time_per_iteration": 2.6958253383636475 + }, + { + "auxiliary_loss_clip": 0.01807288, + "auxiliary_loss_mlp": 0.0038074, + "balance_loss_clip": 1.47609806, + "balance_loss_mlp": 0.34104371, + "epoch": 0.3779948895235232, + "flos": 10707018704640.0, + "grad_norm": 46.246363363631126, + "language_loss": 0.89423126, + "learning_rate": 2.858557806518775e-06, + "loss": 0.91611159, + "num_input_tokens_seen": 134995020, + "router_z_loss_clip": 3.3125, + "router_z_loss_mlp": 0.39697266, + "step": 6287, + "time_per_iteration": 2.778470277786255 + }, + { + "auxiliary_loss_clip": 0.01829541, + "auxiliary_loss_mlp": 0.00419007, + "balance_loss_clip": 1.49035168, + "balance_loss_mlp": 0.37504238, + "epoch": 0.3780550127761912, + "flos": 22310007281280.0, + "grad_norm": 118.00212758282379, + "language_loss": 0.8003307, + "learning_rate": 2.8582060393651927e-06, + "loss": 0.82281625, + "num_input_tokens_seen": 135012620, + "router_z_loss_clip": 3.390625, + "router_z_loss_mlp": 0.43945312, + "step": 6288, + "time_per_iteration": 2.6491541862487793 + }, + { + "auxiliary_loss_clip": 0.01813873, + "auxiliary_loss_mlp": 0.00411331, + "balance_loss_clip": 1.48393202, + "balance_loss_mlp": 0.36691338, + "epoch": 0.37811513602885916, + "flos": 28950644125440.0, + "grad_norm": 4.284310419794487, + "language_loss": 0.8124699, + "learning_rate": 2.857854239668352e-06, + "loss": 0.83472192, + "num_input_tokens_seen": 135033365, + "router_z_loss_clip": 3.30078125, + "router_z_loss_mlp": 0.44433594, + "step": 6289, + "time_per_iteration": 2.760009765625 + }, + { + "auxiliary_loss_clip": 0.01824548, + "auxiliary_loss_mlp": 0.00343275, + "balance_loss_clip": 1.49053729, + "balance_loss_mlp": 0.30465129, + "epoch": 0.3781752592815271, + "flos": 23112933949440.0, + "grad_norm": 2.9193772959218527, + "language_loss": 0.81848073, + "learning_rate": 2.857502407441593e-06, + "loss": 0.84015888, + "num_input_tokens_seen": 135052185, + "router_z_loss_clip": 3.34375, + "router_z_loss_mlp": 0.38623047, + "step": 6290, + "time_per_iteration": 2.666259288787842 + }, + { + "auxiliary_loss_clip": 0.01818464, + "auxiliary_loss_mlp": 0.00401302, + "balance_loss_clip": 1.47980738, + "balance_loss_mlp": 0.35631213, + "epoch": 0.3782353825341951, + "flos": 19755932762880.0, + "grad_norm": 6.286185403262449, + "language_loss": 0.87763238, + "learning_rate": 2.8571505426982566e-06, + "loss": 0.89983004, + "num_input_tokens_seen": 135070425, + "router_z_loss_clip": 3.38867188, + "router_z_loss_mlp": 0.44995117, + "step": 6291, + "time_per_iteration": 2.6776504516601562 + }, + { + "auxiliary_loss_clip": 0.01797993, + "auxiliary_loss_mlp": 0.00351438, + "balance_loss_clip": 1.46805871, + "balance_loss_mlp": 0.31298056, + "epoch": 0.37829550578686305, + "flos": 22050839675520.0, + "grad_norm": 6.864759132992324, + "language_loss": 0.84684652, + "learning_rate": 2.8567986454516854e-06, + "loss": 0.86834079, + "num_input_tokens_seen": 135090525, + "router_z_loss_clip": 3.30078125, + "router_z_loss_mlp": 0.38476562, + "step": 6292, + "time_per_iteration": 2.6722164154052734 + }, + { + "auxiliary_loss_clip": 0.01767498, + "auxiliary_loss_mlp": 0.003859, + "balance_loss_clip": 1.44269049, + "balance_loss_mlp": 0.34329432, + "epoch": 0.378355629039531, + "flos": 16470357770880.0, + "grad_norm": 5.2067709036577465, + "language_loss": 0.77076286, + "learning_rate": 2.856446715715224e-06, + "loss": 0.79229683, + "num_input_tokens_seen": 135109575, + "router_z_loss_clip": 3.24609375, + "router_z_loss_mlp": 0.42626953, + "step": 6293, + "time_per_iteration": 2.6429967880249023 + }, + { + "auxiliary_loss_clip": 0.01797719, + "auxiliary_loss_mlp": 0.00363774, + "balance_loss_clip": 1.47208595, + "balance_loss_mlp": 0.32448274, + "epoch": 0.378415752292199, + "flos": 19974844200960.0, + "grad_norm": 12.746481480265993, + "language_loss": 0.78763044, + "learning_rate": 2.8560947535022173e-06, + "loss": 0.80924535, + "num_input_tokens_seen": 135127000, + "router_z_loss_clip": 3.25585938, + "router_z_loss_mlp": 0.39282227, + "step": 6294, + "time_per_iteration": 2.6479086875915527 + }, + { + "auxiliary_loss_clip": 0.01788647, + "auxiliary_loss_mlp": 0.00367829, + "balance_loss_clip": 1.44802964, + "balance_loss_mlp": 0.32820341, + "epoch": 0.378475875544867, + "flos": 14647388676480.0, + "grad_norm": 351.038399729336, + "language_loss": 0.90710562, + "learning_rate": 2.855742758826011e-06, + "loss": 0.92867035, + "num_input_tokens_seen": 135145285, + "router_z_loss_clip": 3.40820312, + "router_z_loss_mlp": 0.39648438, + "step": 6295, + "time_per_iteration": 2.633073568344116 + }, + { + "auxiliary_loss_clip": 0.01787173, + "auxiliary_loss_mlp": 0.00368951, + "balance_loss_clip": 1.45999885, + "balance_loss_mlp": 0.3293969, + "epoch": 0.37853599879753497, + "flos": 26650996617600.0, + "grad_norm": 5.80580948617567, + "language_loss": 0.78779566, + "learning_rate": 2.8553907316999547e-06, + "loss": 0.80935693, + "num_input_tokens_seen": 135165240, + "router_z_loss_clip": 3.2734375, + "router_z_loss_mlp": 0.39550781, + "step": 6296, + "time_per_iteration": 2.693225860595703 + }, + { + "auxiliary_loss_clip": 0.01781476, + "auxiliary_loss_mlp": 0.00355107, + "balance_loss_clip": 1.45962369, + "balance_loss_mlp": 0.31915379, + "epoch": 0.37859612205020293, + "flos": 17311960408320.0, + "grad_norm": 7.619778088351546, + "language_loss": 0.82903957, + "learning_rate": 2.855038672137396e-06, + "loss": 0.85040545, + "num_input_tokens_seen": 135184045, + "router_z_loss_clip": 3.21875, + "router_z_loss_mlp": 0.35961914, + "step": 6297, + "time_per_iteration": 2.769942045211792 + }, + { + "auxiliary_loss_clip": 0.01791135, + "auxiliary_loss_mlp": 0.00391366, + "balance_loss_clip": 1.45889497, + "balance_loss_mlp": 0.35274225, + "epoch": 0.3786562453028709, + "flos": 18220392299520.0, + "grad_norm": 5.802611979452548, + "language_loss": 0.85055107, + "learning_rate": 2.854686580151684e-06, + "loss": 0.87237602, + "num_input_tokens_seen": 135202365, + "router_z_loss_clip": 3.31835938, + "router_z_loss_mlp": 0.38647461, + "step": 6298, + "time_per_iteration": 2.6130568981170654 + }, + { + "auxiliary_loss_clip": 0.01780885, + "auxiliary_loss_mlp": 0.00343004, + "balance_loss_clip": 1.4518652, + "balance_loss_mlp": 0.30511913, + "epoch": 0.37871636855553886, + "flos": 21214875473280.0, + "grad_norm": 11.416922275552885, + "language_loss": 0.90747482, + "learning_rate": 2.8543344557561722e-06, + "loss": 0.92871368, + "num_input_tokens_seen": 135220955, + "router_z_loss_clip": 3.2890625, + "router_z_loss_mlp": 0.37890625, + "step": 6299, + "time_per_iteration": 2.670567035675049 + }, + { + "auxiliary_loss_clip": 0.01760086, + "auxiliary_loss_mlp": 0.00352653, + "balance_loss_clip": 1.43906975, + "balance_loss_mlp": 0.31576994, + "epoch": 0.3787764918082068, + "flos": 20952727038720.0, + "grad_norm": 55.102193475118, + "language_loss": 0.86510414, + "learning_rate": 2.8539822989642116e-06, + "loss": 0.88623154, + "num_input_tokens_seen": 135239715, + "router_z_loss_clip": 3.2109375, + "router_z_loss_mlp": 0.36889648, + "step": 6300, + "time_per_iteration": 2.6362504959106445 + }, + { + "auxiliary_loss_clip": 0.01778714, + "auxiliary_loss_mlp": 0.00372247, + "balance_loss_clip": 1.44841385, + "balance_loss_mlp": 0.33164459, + "epoch": 0.3788366150608748, + "flos": 17308009912320.0, + "grad_norm": 8.882171563133577, + "language_loss": 0.90434849, + "learning_rate": 2.8536301097891577e-06, + "loss": 0.92585808, + "num_input_tokens_seen": 135257035, + "router_z_loss_clip": 3.3046875, + "router_z_loss_mlp": 0.40649414, + "step": 6301, + "time_per_iteration": 2.6532063484191895 + }, + { + "auxiliary_loss_clip": 0.01779786, + "auxiliary_loss_mlp": 0.00353343, + "balance_loss_clip": 1.45490885, + "balance_loss_mlp": 0.31557792, + "epoch": 0.37889673831354276, + "flos": 24311092942080.0, + "grad_norm": 4.851674300595305, + "language_loss": 0.75261676, + "learning_rate": 2.8532778882443636e-06, + "loss": 0.77394801, + "num_input_tokens_seen": 135275720, + "router_z_loss_clip": 3.25195312, + "router_z_loss_mlp": 0.37768555, + "step": 6302, + "time_per_iteration": 2.7103302478790283 + }, + { + "auxiliary_loss_clip": 0.0176499, + "auxiliary_loss_mlp": 0.00353385, + "balance_loss_clip": 1.45115542, + "balance_loss_mlp": 0.3161442, + "epoch": 0.3789568615662107, + "flos": 26683603188480.0, + "grad_norm": 5.139995373532578, + "language_loss": 0.74396276, + "learning_rate": 2.8529256343431867e-06, + "loss": 0.76514649, + "num_input_tokens_seen": 135294140, + "router_z_loss_clip": 3.13476562, + "router_z_loss_mlp": 0.37231445, + "step": 6303, + "time_per_iteration": 2.746009111404419 + }, + { + "auxiliary_loss_clip": 0.01753601, + "auxiliary_loss_mlp": 0.00359827, + "balance_loss_clip": 1.43499422, + "balance_loss_mlp": 0.32172805, + "epoch": 0.3790169848188787, + "flos": 23585194990080.0, + "grad_norm": 17.457892913544267, + "language_loss": 0.83951557, + "learning_rate": 2.8525733480989846e-06, + "loss": 0.86064982, + "num_input_tokens_seen": 135314845, + "router_z_loss_clip": 3.18554688, + "router_z_loss_mlp": 0.38085938, + "step": 6304, + "time_per_iteration": 2.8070883750915527 + }, + { + "auxiliary_loss_clip": 0.01779289, + "auxiliary_loss_mlp": 0.00347768, + "balance_loss_clip": 1.45452499, + "balance_loss_mlp": 0.30857199, + "epoch": 0.37907710807154665, + "flos": 18437436230400.0, + "grad_norm": 39.90711869085266, + "language_loss": 0.88943326, + "learning_rate": 2.8522210295251146e-06, + "loss": 0.9107039, + "num_input_tokens_seen": 135333055, + "router_z_loss_clip": 3.2421875, + "router_z_loss_mlp": 0.39208984, + "step": 6305, + "time_per_iteration": 2.6936378479003906 + }, + { + "auxiliary_loss_clip": 0.01572347, + "auxiliary_loss_mlp": 0.00197845, + "balance_loss_clip": 1.3916533, + "balance_loss_mlp": 0.18878534, + "epoch": 0.3791372313242146, + "flos": 50107165954560.0, + "grad_norm": 0.9569122233755102, + "language_loss": 0.64270067, + "learning_rate": 2.8518686786349387e-06, + "loss": 0.66040266, + "num_input_tokens_seen": 135387865, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 0.09082031, + "step": 6306, + "time_per_iteration": 3.010441303253174 + }, + { + "auxiliary_loss_clip": 0.01764967, + "auxiliary_loss_mlp": 0.00325326, + "balance_loss_clip": 1.45511866, + "balance_loss_mlp": 0.28770322, + "epoch": 0.3791973545768826, + "flos": 24316551809280.0, + "grad_norm": 13.402067579599324, + "language_loss": 0.7917679, + "learning_rate": 2.851516295441817e-06, + "loss": 0.81267077, + "num_input_tokens_seen": 135409095, + "router_z_loss_clip": 3.09765625, + "router_z_loss_mlp": 0.37597656, + "step": 6307, + "time_per_iteration": 4.103034734725952 + }, + { + "auxiliary_loss_clip": 0.01752623, + "auxiliary_loss_mlp": 0.0037607, + "balance_loss_clip": 1.43864441, + "balance_loss_mlp": 0.33856678, + "epoch": 0.3792574778295506, + "flos": 21579907438080.0, + "grad_norm": 7.06689290896619, + "language_loss": 0.84348178, + "learning_rate": 2.851163879959112e-06, + "loss": 0.86476868, + "num_input_tokens_seen": 135429585, + "router_z_loss_clip": 3.140625, + "router_z_loss_mlp": 0.37475586, + "step": 6308, + "time_per_iteration": 2.7256312370300293 + }, + { + "auxiliary_loss_clip": 0.0173956, + "auxiliary_loss_mlp": 0.00323675, + "balance_loss_clip": 1.42890167, + "balance_loss_mlp": 0.28550416, + "epoch": 0.37931760108221857, + "flos": 22272731942400.0, + "grad_norm": 29.9546703251187, + "language_loss": 0.81198275, + "learning_rate": 2.8508114322001876e-06, + "loss": 0.83261514, + "num_input_tokens_seen": 135446320, + "router_z_loss_clip": 3.10742188, + "router_z_loss_mlp": 0.38183594, + "step": 6309, + "time_per_iteration": 4.099500894546509 + }, + { + "auxiliary_loss_clip": 0.01747561, + "auxiliary_loss_mlp": 0.00350681, + "balance_loss_clip": 1.44051909, + "balance_loss_mlp": 0.31317803, + "epoch": 0.37937772433488653, + "flos": 19682998197120.0, + "grad_norm": 17.039936598968172, + "language_loss": 0.85809851, + "learning_rate": 2.8504589521784083e-06, + "loss": 0.87908089, + "num_input_tokens_seen": 135465720, + "router_z_loss_clip": 3.07421875, + "router_z_loss_mlp": 0.37475586, + "step": 6310, + "time_per_iteration": 2.6909570693969727 + }, + { + "auxiliary_loss_clip": 0.01748546, + "auxiliary_loss_mlp": 0.00329769, + "balance_loss_clip": 1.44409502, + "balance_loss_mlp": 0.29379156, + "epoch": 0.3794378475875545, + "flos": 19099378016640.0, + "grad_norm": 7.0298053179181546, + "language_loss": 0.83692122, + "learning_rate": 2.8501064399071403e-06, + "loss": 0.8577044, + "num_input_tokens_seen": 135485155, + "router_z_loss_clip": 3.04101562, + "router_z_loss_mlp": 0.35986328, + "step": 6311, + "time_per_iteration": 2.659982681274414 + }, + { + "auxiliary_loss_clip": 0.01726732, + "auxiliary_loss_mlp": 0.00331949, + "balance_loss_clip": 1.4218725, + "balance_loss_mlp": 0.29392144, + "epoch": 0.37949797084022246, + "flos": 20339660684160.0, + "grad_norm": 86.74826596780385, + "language_loss": 0.76141596, + "learning_rate": 2.8497538953997504e-06, + "loss": 0.78200269, + "num_input_tokens_seen": 135502675, + "router_z_loss_clip": 3.05273438, + "router_z_loss_mlp": 0.38037109, + "step": 6312, + "time_per_iteration": 2.6650490760803223 + }, + { + "auxiliary_loss_clip": 0.01594735, + "auxiliary_loss_mlp": 0.00211282, + "balance_loss_clip": 1.40998733, + "balance_loss_mlp": 0.19735818, + "epoch": 0.37955809409289043, + "flos": 63972203477760.0, + "grad_norm": 0.7484418696017325, + "language_loss": 0.55355513, + "learning_rate": 2.849401318669608e-06, + "loss": 0.57161528, + "num_input_tokens_seen": 135562005, + "router_z_loss_clip": 1.84375, + "router_z_loss_mlp": 0.13964844, + "step": 6313, + "time_per_iteration": 4.594098806381226 + }, + { + "auxiliary_loss_clip": 0.01745494, + "auxiliary_loss_mlp": 0.00345152, + "balance_loss_clip": 1.4380163, + "balance_loss_mlp": 0.30733895, + "epoch": 0.3796182173455584, + "flos": 31540665179520.0, + "grad_norm": 3.293529400187782, + "language_loss": 0.77518559, + "learning_rate": 2.849048709730083e-06, + "loss": 0.79609203, + "num_input_tokens_seen": 135582600, + "router_z_loss_clip": 3.07617188, + "router_z_loss_mlp": 0.37817383, + "step": 6314, + "time_per_iteration": 2.7508485317230225 + }, + { + "auxiliary_loss_clip": 0.01724268, + "auxiliary_loss_mlp": 0.00364108, + "balance_loss_clip": 1.41995263, + "balance_loss_mlp": 0.32534137, + "epoch": 0.37967834059822636, + "flos": 12130804978560.0, + "grad_norm": 6.0027224160187815, + "language_loss": 0.80808234, + "learning_rate": 2.848696068594545e-06, + "loss": 0.82896608, + "num_input_tokens_seen": 135600280, + "router_z_loss_clip": 3.04492188, + "router_z_loss_mlp": 0.38769531, + "step": 6315, + "time_per_iteration": 2.732398748397827 + }, + { + "auxiliary_loss_clip": 0.01710741, + "auxiliary_loss_mlp": 0.00300886, + "balance_loss_clip": 1.41599309, + "balance_loss_mlp": 0.26633912, + "epoch": 0.3797384638508943, + "flos": 39348578298240.0, + "grad_norm": 23.67779095354767, + "language_loss": 0.78999102, + "learning_rate": 2.8483433952763677e-06, + "loss": 0.81010729, + "num_input_tokens_seen": 135621560, + "router_z_loss_clip": 2.95117188, + "router_z_loss_mlp": 0.34545898, + "step": 6316, + "time_per_iteration": 2.861340045928955 + }, + { + "auxiliary_loss_clip": 0.01722798, + "auxiliary_loss_mlp": 0.00332929, + "balance_loss_clip": 1.42141974, + "balance_loss_mlp": 0.29642695, + "epoch": 0.3797985871035623, + "flos": 34054016653440.0, + "grad_norm": 2.3623849488220716, + "language_loss": 0.72991347, + "learning_rate": 2.847990689788923e-06, + "loss": 0.75047076, + "num_input_tokens_seen": 135641745, + "router_z_loss_clip": 3.01367188, + "router_z_loss_mlp": 0.36499023, + "step": 6317, + "time_per_iteration": 2.7673027515411377 + }, + { + "auxiliary_loss_clip": 0.01702372, + "auxiliary_loss_mlp": 0.0033525, + "balance_loss_clip": 1.40615058, + "balance_loss_mlp": 0.29800946, + "epoch": 0.37985871035623026, + "flos": 23222174186880.0, + "grad_norm": 9.862311346420181, + "language_loss": 0.93124503, + "learning_rate": 2.8476379521455877e-06, + "loss": 0.95162123, + "num_input_tokens_seen": 135660650, + "router_z_loss_clip": 2.96289062, + "router_z_loss_mlp": 0.37231445, + "step": 6318, + "time_per_iteration": 2.7520792484283447 + }, + { + "auxiliary_loss_clip": 0.01701736, + "auxiliary_loss_mlp": 0.00357897, + "balance_loss_clip": 1.40478742, + "balance_loss_mlp": 0.31798545, + "epoch": 0.3799188336088982, + "flos": 18114958903680.0, + "grad_norm": 55.36256524096329, + "language_loss": 0.86600429, + "learning_rate": 2.8472851823597354e-06, + "loss": 0.88660061, + "num_input_tokens_seen": 135679980, + "router_z_loss_clip": 2.96875, + "router_z_loss_mlp": 0.39892578, + "step": 6319, + "time_per_iteration": 4.062528848648071 + }, + { + "auxiliary_loss_clip": 0.01716187, + "auxiliary_loss_mlp": 0.00333954, + "balance_loss_clip": 1.41962206, + "balance_loss_mlp": 0.29823905, + "epoch": 0.3799789568615662, + "flos": 21871897096320.0, + "grad_norm": 4.079888687741842, + "language_loss": 0.71026945, + "learning_rate": 2.846932380444744e-06, + "loss": 0.73077095, + "num_input_tokens_seen": 135699400, + "router_z_loss_clip": 2.96679688, + "router_z_loss_mlp": 0.35693359, + "step": 6320, + "time_per_iteration": 2.6409599781036377 + }, + { + "auxiliary_loss_clip": 0.01675601, + "auxiliary_loss_mlp": 0.0036625, + "balance_loss_clip": 1.38168609, + "balance_loss_mlp": 0.32657754, + "epoch": 0.3800390801142342, + "flos": 32962943082240.0, + "grad_norm": 9.118692507694329, + "language_loss": 0.76827043, + "learning_rate": 2.846579546413992e-06, + "loss": 0.78868896, + "num_input_tokens_seen": 135723455, + "router_z_loss_clip": 2.9375, + "router_z_loss_mlp": 0.39648438, + "step": 6321, + "time_per_iteration": 2.7763257026672363 + }, + { + "auxiliary_loss_clip": 0.01677069, + "auxiliary_loss_mlp": 0.00361072, + "balance_loss_clip": 1.38552272, + "balance_loss_mlp": 0.3219949, + "epoch": 0.38009920336690217, + "flos": 26907075653760.0, + "grad_norm": 25.980472287639238, + "language_loss": 0.82597303, + "learning_rate": 2.846226680280859e-06, + "loss": 0.84635454, + "num_input_tokens_seen": 135744335, + "router_z_loss_clip": 2.91601562, + "router_z_loss_mlp": 0.39086914, + "step": 6322, + "time_per_iteration": 2.667598247528076 + }, + { + "auxiliary_loss_clip": 0.01677753, + "auxiliary_loss_mlp": 0.00341984, + "balance_loss_clip": 1.38567162, + "balance_loss_mlp": 0.30374187, + "epoch": 0.38015932661957014, + "flos": 22488913946880.0, + "grad_norm": 10.995790896409527, + "language_loss": 0.90483999, + "learning_rate": 2.845873782058725e-06, + "loss": 0.92503732, + "num_input_tokens_seen": 135761440, + "router_z_loss_clip": 2.921875, + "router_z_loss_mlp": 0.38232422, + "step": 6323, + "time_per_iteration": 2.6958930492401123 + }, + { + "auxiliary_loss_clip": 0.01657554, + "auxiliary_loss_mlp": 0.00351482, + "balance_loss_clip": 1.36703849, + "balance_loss_mlp": 0.31393123, + "epoch": 0.3802194498722381, + "flos": 21980993679360.0, + "grad_norm": 58096.21168933838, + "language_loss": 0.815786, + "learning_rate": 2.845520851760973e-06, + "loss": 0.83587635, + "num_input_tokens_seen": 135779955, + "router_z_loss_clip": 2.90820312, + "router_z_loss_mlp": 0.37548828, + "step": 6324, + "time_per_iteration": 2.615281105041504 + }, + { + "auxiliary_loss_clip": 0.01657504, + "auxiliary_loss_mlp": 0.00345755, + "balance_loss_clip": 1.3691324, + "balance_loss_mlp": 0.3092289, + "epoch": 0.38027957312490607, + "flos": 21324869896320.0, + "grad_norm": 4.8748114815731975, + "language_loss": 0.92800522, + "learning_rate": 2.8451678894009847e-06, + "loss": 0.94803774, + "num_input_tokens_seen": 135799840, + "router_z_loss_clip": 2.88671875, + "router_z_loss_mlp": 0.36523438, + "step": 6325, + "time_per_iteration": 2.6678097248077393 + }, + { + "auxiliary_loss_clip": 0.01679979, + "auxiliary_loss_mlp": 0.0037471, + "balance_loss_clip": 1.39034772, + "balance_loss_mlp": 0.3356812, + "epoch": 0.38033969637757403, + "flos": 16691244456960.0, + "grad_norm": 106.75011144654844, + "language_loss": 0.86331761, + "learning_rate": 2.8448148949921465e-06, + "loss": 0.88386446, + "num_input_tokens_seen": 135817880, + "router_z_loss_clip": 2.89257812, + "router_z_loss_mlp": 0.39013672, + "step": 6326, + "time_per_iteration": 2.6468002796173096 + }, + { + "auxiliary_loss_clip": 0.01670417, + "auxiliary_loss_mlp": 0.0031065, + "balance_loss_clip": 1.38464427, + "balance_loss_mlp": 0.27488685, + "epoch": 0.380399819630242, + "flos": 36210847685760.0, + "grad_norm": 250.08127878542467, + "language_loss": 0.78477168, + "learning_rate": 2.844461868547842e-06, + "loss": 0.80458236, + "num_input_tokens_seen": 135838940, + "router_z_loss_clip": 2.85742188, + "router_z_loss_mlp": 0.35766602, + "step": 6327, + "time_per_iteration": 2.8125650882720947 + }, + { + "auxiliary_loss_clip": 0.01668131, + "auxiliary_loss_mlp": 0.003371, + "balance_loss_clip": 1.38008904, + "balance_loss_mlp": 0.299263, + "epoch": 0.38045994288290996, + "flos": 21288851533440.0, + "grad_norm": 34.43407094339074, + "language_loss": 0.90350306, + "learning_rate": 2.844108810081459e-06, + "loss": 0.92355537, + "num_input_tokens_seen": 135858325, + "router_z_loss_clip": 2.875, + "router_z_loss_mlp": 0.37817383, + "step": 6328, + "time_per_iteration": 2.6929678916931152 + }, + { + "auxiliary_loss_clip": 0.01642996, + "auxiliary_loss_mlp": 0.00391242, + "balance_loss_clip": 1.35751915, + "balance_loss_mlp": 0.35330975, + "epoch": 0.38052006613557793, + "flos": 20922885815040.0, + "grad_norm": 12.032188848619578, + "language_loss": 0.67222941, + "learning_rate": 2.843755719606385e-06, + "loss": 0.69257176, + "num_input_tokens_seen": 135878430, + "router_z_loss_clip": 2.85742188, + "router_z_loss_mlp": 0.37939453, + "step": 6329, + "time_per_iteration": 2.86405611038208 + }, + { + "auxiliary_loss_clip": 0.01653699, + "auxiliary_loss_mlp": 0.00347674, + "balance_loss_clip": 1.36908555, + "balance_loss_mlp": 0.31291246, + "epoch": 0.3805801893882459, + "flos": 20990720649600.0, + "grad_norm": 3.0827356038337665, + "language_loss": 0.63162988, + "learning_rate": 2.8434025971360104e-06, + "loss": 0.65164363, + "num_input_tokens_seen": 135894755, + "router_z_loss_clip": 2.84570312, + "router_z_loss_mlp": 0.34765625, + "step": 6330, + "time_per_iteration": 2.6362199783325195 + }, + { + "auxiliary_loss_clip": 0.01660003, + "auxiliary_loss_mlp": 0.00319157, + "balance_loss_clip": 1.38163209, + "balance_loss_mlp": 0.28358537, + "epoch": 0.38064031264091386, + "flos": 25558594243200.0, + "grad_norm": 77.1129716540159, + "language_loss": 0.71788412, + "learning_rate": 2.8430494426837243e-06, + "loss": 0.73767573, + "num_input_tokens_seen": 135918275, + "router_z_loss_clip": 2.78515625, + "router_z_loss_mlp": 0.35571289, + "step": 6331, + "time_per_iteration": 2.767843723297119 + }, + { + "auxiliary_loss_clip": 0.01662141, + "auxiliary_loss_mlp": 0.00345219, + "balance_loss_clip": 1.37312746, + "balance_loss_mlp": 0.3075732, + "epoch": 0.3807004358935818, + "flos": 15085857997440.0, + "grad_norm": 191.80767360205004, + "language_loss": 0.82617444, + "learning_rate": 2.842696256262919e-06, + "loss": 0.84624803, + "num_input_tokens_seen": 135937430, + "router_z_loss_clip": 2.89257812, + "router_z_loss_mlp": 0.37646484, + "step": 6332, + "time_per_iteration": 2.6633622646331787 + }, + { + "auxiliary_loss_clip": 0.01646805, + "auxiliary_loss_mlp": 0.00357192, + "balance_loss_clip": 1.3578546, + "balance_loss_mlp": 0.31568378, + "epoch": 0.3807605591462498, + "flos": 16399398453120.0, + "grad_norm": 149.71814529522095, + "language_loss": 0.89260828, + "learning_rate": 2.842343037886987e-06, + "loss": 0.9126482, + "num_input_tokens_seen": 135954210, + "router_z_loss_clip": 2.88867188, + "router_z_loss_mlp": 0.41479492, + "step": 6333, + "time_per_iteration": 2.6423985958099365 + }, + { + "auxiliary_loss_clip": 0.01645006, + "auxiliary_loss_mlp": 0.00327805, + "balance_loss_clip": 1.35713148, + "balance_loss_mlp": 0.29142195, + "epoch": 0.3808206823989178, + "flos": 29057083102080.0, + "grad_norm": 23.273082800797216, + "language_loss": 0.90227681, + "learning_rate": 2.8419897875693226e-06, + "loss": 0.92200488, + "num_input_tokens_seen": 135974425, + "router_z_loss_clip": 2.87890625, + "router_z_loss_mlp": 0.36376953, + "step": 6334, + "time_per_iteration": 2.744326591491699 + }, + { + "auxiliary_loss_clip": 0.01643671, + "auxiliary_loss_mlp": 0.00377337, + "balance_loss_clip": 1.35406148, + "balance_loss_mlp": 0.33904654, + "epoch": 0.3808808056515858, + "flos": 15705855676800.0, + "grad_norm": 2.7673171741302083, + "language_loss": 0.86890757, + "learning_rate": 2.841636505323321e-06, + "loss": 0.88911772, + "num_input_tokens_seen": 135991985, + "router_z_loss_clip": 2.890625, + "router_z_loss_mlp": 0.38305664, + "step": 6335, + "time_per_iteration": 2.6367666721343994 + }, + { + "auxiliary_loss_clip": 0.01642752, + "auxiliary_loss_mlp": 0.00386126, + "balance_loss_clip": 1.3521632, + "balance_loss_mlp": 0.34459355, + "epoch": 0.38094092890425374, + "flos": 20704584908160.0, + "grad_norm": 4.887053105690929, + "language_loss": 0.8132481, + "learning_rate": 2.8412831911623795e-06, + "loss": 0.83353686, + "num_input_tokens_seen": 136010015, + "router_z_loss_clip": 2.90820312, + "router_z_loss_mlp": 0.4152832, + "step": 6336, + "time_per_iteration": 2.639522075653076 + }, + { + "auxiliary_loss_clip": 0.01616877, + "auxiliary_loss_mlp": 0.00346382, + "balance_loss_clip": 1.33122849, + "balance_loss_mlp": 0.3098799, + "epoch": 0.3810010521569217, + "flos": 20667956014080.0, + "grad_norm": 19.731351704334546, + "language_loss": 0.76143926, + "learning_rate": 2.840929845099894e-06, + "loss": 0.7810719, + "num_input_tokens_seen": 136028440, + "router_z_loss_clip": 2.859375, + "router_z_loss_mlp": 0.36523438, + "step": 6337, + "time_per_iteration": 2.6626675128936768 + }, + { + "auxiliary_loss_clip": 0.01618131, + "auxiliary_loss_mlp": 0.00378607, + "balance_loss_clip": 1.32861519, + "balance_loss_mlp": 0.33991128, + "epoch": 0.38106117540958967, + "flos": 31827626933760.0, + "grad_norm": 22.740821330893887, + "language_loss": 0.69900227, + "learning_rate": 2.8405764671492652e-06, + "loss": 0.71896964, + "num_input_tokens_seen": 136048360, + "router_z_loss_clip": 2.8984375, + "router_z_loss_mlp": 0.38696289, + "step": 6338, + "time_per_iteration": 2.738293409347534 + }, + { + "auxiliary_loss_clip": 0.0162191, + "auxiliary_loss_mlp": 0.00307624, + "balance_loss_clip": 1.33269012, + "balance_loss_mlp": 0.27102634, + "epoch": 0.38112129866225763, + "flos": 16902757693440.0, + "grad_norm": 51.366522824283635, + "language_loss": 0.78202629, + "learning_rate": 2.8402230573238923e-06, + "loss": 0.80132163, + "num_input_tokens_seen": 136065500, + "router_z_loss_clip": 2.88867188, + "router_z_loss_mlp": 0.36572266, + "step": 6339, + "time_per_iteration": 2.7502620220184326 + }, + { + "auxiliary_loss_clip": 0.01617708, + "auxiliary_loss_mlp": 0.00340545, + "balance_loss_clip": 1.3351903, + "balance_loss_mlp": 0.30406725, + "epoch": 0.3811814219149256, + "flos": 20887226588160.0, + "grad_norm": 10.22970083205903, + "language_loss": 0.77072072, + "learning_rate": 2.839869615637177e-06, + "loss": 0.79030323, + "num_input_tokens_seen": 136084060, + "router_z_loss_clip": 2.82421875, + "router_z_loss_mlp": 0.36474609, + "step": 6340, + "time_per_iteration": 2.7073283195495605 + }, + { + "auxiliary_loss_clip": 0.01611596, + "auxiliary_loss_mlp": 0.0032431, + "balance_loss_clip": 1.32688892, + "balance_loss_mlp": 0.28749776, + "epoch": 0.38124154516759357, + "flos": 16690813493760.0, + "grad_norm": 2.962139574331783, + "language_loss": 0.99508452, + "learning_rate": 2.839516142102522e-06, + "loss": 1.01444364, + "num_input_tokens_seen": 136102310, + "router_z_loss_clip": 2.84765625, + "router_z_loss_mlp": 0.36816406, + "step": 6341, + "time_per_iteration": 2.702725887298584 + }, + { + "auxiliary_loss_clip": 0.01604112, + "auxiliary_loss_mlp": 0.00319418, + "balance_loss_clip": 1.3188436, + "balance_loss_mlp": 0.28050843, + "epoch": 0.38130166842026153, + "flos": 19681956702720.0, + "grad_norm": 15.44616999737883, + "language_loss": 0.82259417, + "learning_rate": 2.83916263673333e-06, + "loss": 0.84182942, + "num_input_tokens_seen": 136120725, + "router_z_loss_clip": 2.84960938, + "router_z_loss_mlp": 0.38891602, + "step": 6342, + "time_per_iteration": 2.6628644466400146 + }, + { + "auxiliary_loss_clip": 0.01610023, + "auxiliary_loss_mlp": 0.00339371, + "balance_loss_clip": 1.32598853, + "balance_loss_mlp": 0.30267861, + "epoch": 0.3813617916729295, + "flos": 22198432659840.0, + "grad_norm": 5.86366953193857, + "language_loss": 0.88495713, + "learning_rate": 2.838809099543007e-06, + "loss": 0.90445113, + "num_input_tokens_seen": 136139105, + "router_z_loss_clip": 2.83984375, + "router_z_loss_mlp": 0.36669922, + "step": 6343, + "time_per_iteration": 2.6732633113861084 + }, + { + "auxiliary_loss_clip": 0.01596852, + "auxiliary_loss_mlp": 0.0032061, + "balance_loss_clip": 1.31398034, + "balance_loss_mlp": 0.28541934, + "epoch": 0.38142191492559746, + "flos": 19096899978240.0, + "grad_norm": 8.386416740801145, + "language_loss": 0.82893658, + "learning_rate": 2.838455530544959e-06, + "loss": 0.84811127, + "num_input_tokens_seen": 136158265, + "router_z_loss_clip": 2.82617188, + "router_z_loss_mlp": 0.35205078, + "step": 6344, + "time_per_iteration": 2.64619517326355 + }, + { + "auxiliary_loss_clip": 0.01599836, + "auxiliary_loss_mlp": 0.00335868, + "balance_loss_clip": 1.31566691, + "balance_loss_mlp": 0.29867435, + "epoch": 0.3814820381782654, + "flos": 24097748112000.0, + "grad_norm": 28.982781865800654, + "language_loss": 0.8146891, + "learning_rate": 2.838101929752593e-06, + "loss": 0.83404613, + "num_input_tokens_seen": 136176100, + "router_z_loss_clip": 2.84179688, + "router_z_loss_mlp": 0.37182617, + "step": 6345, + "time_per_iteration": 2.6838085651397705 + }, + { + "auxiliary_loss_clip": 0.01606472, + "auxiliary_loss_mlp": 0.00325936, + "balance_loss_clip": 1.32523167, + "balance_loss_mlp": 0.28938615, + "epoch": 0.3815421614309334, + "flos": 15778502933760.0, + "grad_norm": 7.594039798954289, + "language_loss": 0.7811954, + "learning_rate": 2.8377482971793187e-06, + "loss": 0.80051947, + "num_input_tokens_seen": 136195125, + "router_z_loss_clip": 2.81054688, + "router_z_loss_mlp": 0.36523438, + "step": 6346, + "time_per_iteration": 2.7067387104034424 + }, + { + "auxiliary_loss_clip": 0.01602548, + "auxiliary_loss_mlp": 0.00326646, + "balance_loss_clip": 1.3160274, + "balance_loss_mlp": 0.28890437, + "epoch": 0.38160228468360136, + "flos": 19899754819200.0, + "grad_norm": 27.33825746462555, + "language_loss": 0.82789379, + "learning_rate": 2.8373946328385437e-06, + "loss": 0.84718573, + "num_input_tokens_seen": 136213885, + "router_z_loss_clip": 2.86328125, + "router_z_loss_mlp": 0.37719727, + "step": 6347, + "time_per_iteration": 2.662125825881958 + }, + { + "auxiliary_loss_clip": 0.01594731, + "auxiliary_loss_mlp": 0.00306102, + "balance_loss_clip": 1.31177163, + "balance_loss_mlp": 0.27274776, + "epoch": 0.3816624079362694, + "flos": 19281050029440.0, + "grad_norm": 3.3280869493578953, + "language_loss": 0.81904149, + "learning_rate": 2.8370409367436813e-06, + "loss": 0.83804977, + "num_input_tokens_seen": 136232700, + "router_z_loss_clip": 2.83398438, + "router_z_loss_mlp": 0.33349609, + "step": 6348, + "time_per_iteration": 2.6169116497039795 + }, + { + "auxiliary_loss_clip": 0.01599514, + "auxiliary_loss_mlp": 0.00315178, + "balance_loss_clip": 1.31415939, + "balance_loss_mlp": 0.27517158, + "epoch": 0.38172253118893734, + "flos": 21177564220800.0, + "grad_norm": 6.599754956347613, + "language_loss": 0.9526394, + "learning_rate": 2.836687208908142e-06, + "loss": 0.97178632, + "num_input_tokens_seen": 136248975, + "router_z_loss_clip": 2.8515625, + "router_z_loss_mlp": 0.40014648, + "step": 6349, + "time_per_iteration": 4.099695920944214 + }, + { + "auxiliary_loss_clip": 0.01617193, + "auxiliary_loss_mlp": 0.00284771, + "balance_loss_clip": 1.3280381, + "balance_loss_mlp": 0.24893647, + "epoch": 0.3817826544416053, + "flos": 17529219820800.0, + "grad_norm": 12.100676716755896, + "language_loss": 0.8647033, + "learning_rate": 2.836333449345341e-06, + "loss": 0.8837229, + "num_input_tokens_seen": 136266710, + "router_z_loss_clip": 2.890625, + "router_z_loss_mlp": 0.35839844, + "step": 6350, + "time_per_iteration": 2.6415741443634033 + }, + { + "auxiliary_loss_clip": 0.01594355, + "auxiliary_loss_mlp": 0.00315247, + "balance_loss_clip": 1.31365752, + "balance_loss_mlp": 0.2742388, + "epoch": 0.38184277769427327, + "flos": 16326535714560.0, + "grad_norm": 4.066549234458273, + "language_loss": 0.83770609, + "learning_rate": 2.8359796580686907e-06, + "loss": 0.85680211, + "num_input_tokens_seen": 136284445, + "router_z_loss_clip": 2.80273438, + "router_z_loss_mlp": 0.40966797, + "step": 6351, + "time_per_iteration": 4.143386602401733 + }, + { + "auxiliary_loss_clip": 0.01620492, + "auxiliary_loss_mlp": 0.00318301, + "balance_loss_clip": 1.33287787, + "balance_loss_mlp": 0.27798429, + "epoch": 0.38190290094694124, + "flos": 30443450382720.0, + "grad_norm": 8.89597882618044, + "language_loss": 0.84861881, + "learning_rate": 2.8356258350916085e-06, + "loss": 0.86800665, + "num_input_tokens_seen": 136305730, + "router_z_loss_clip": 2.875, + "router_z_loss_mlp": 0.40332031, + "step": 6352, + "time_per_iteration": 2.819288969039917 + }, + { + "auxiliary_loss_clip": 0.01619417, + "auxiliary_loss_mlp": 0.00331966, + "balance_loss_clip": 1.33261323, + "balance_loss_mlp": 0.29431999, + "epoch": 0.3819630241996092, + "flos": 14209924936320.0, + "grad_norm": 26.5925386651671, + "language_loss": 0.70450819, + "learning_rate": 2.8352719804275104e-06, + "loss": 0.72402203, + "num_input_tokens_seen": 136323850, + "router_z_loss_clip": 2.87109375, + "router_z_loss_mlp": 0.3762207, + "step": 6353, + "time_per_iteration": 2.697115182876587 + }, + { + "auxiliary_loss_clip": 0.01603932, + "auxiliary_loss_mlp": 0.00284783, + "balance_loss_clip": 1.31660986, + "balance_loss_mlp": 0.24873419, + "epoch": 0.38202314745227717, + "flos": 25009699536000.0, + "grad_norm": 4.642029143390403, + "language_loss": 0.89097846, + "learning_rate": 2.834918094089816e-06, + "loss": 0.90986562, + "num_input_tokens_seen": 136344880, + "router_z_loss_clip": 2.87304688, + "router_z_loss_mlp": 0.3605957, + "step": 6354, + "time_per_iteration": 2.674713134765625 + }, + { + "auxiliary_loss_clip": 0.01626034, + "auxiliary_loss_mlp": 0.00277559, + "balance_loss_clip": 1.34139276, + "balance_loss_mlp": 0.24384655, + "epoch": 0.38208327070494513, + "flos": 20814507504000.0, + "grad_norm": 1.8706555521131765, + "language_loss": 0.88374752, + "learning_rate": 2.834564176091943e-06, + "loss": 0.90278345, + "num_input_tokens_seen": 136366060, + "router_z_loss_clip": 2.84570312, + "router_z_loss_mlp": 0.33691406, + "step": 6355, + "time_per_iteration": 4.093441724777222 + }, + { + "auxiliary_loss_clip": 0.01611108, + "auxiliary_loss_mlp": 0.00302688, + "balance_loss_clip": 1.32657981, + "balance_loss_mlp": 0.2684277, + "epoch": 0.3821433939576131, + "flos": 22637727993600.0, + "grad_norm": 1065.2635182942765, + "language_loss": 0.82591498, + "learning_rate": 2.8342102264473125e-06, + "loss": 0.84505296, + "num_input_tokens_seen": 136385625, + "router_z_loss_clip": 2.84765625, + "router_z_loss_mlp": 0.34228516, + "step": 6356, + "time_per_iteration": 2.6782469749450684 + }, + { + "auxiliary_loss_clip": 0.01627385, + "auxiliary_loss_mlp": 0.00319317, + "balance_loss_clip": 1.33355713, + "balance_loss_mlp": 0.2811704, + "epoch": 0.38220351721028106, + "flos": 26869872142080.0, + "grad_norm": 17.92427179665703, + "language_loss": 0.8797105, + "learning_rate": 2.833856245169348e-06, + "loss": 0.89917755, + "num_input_tokens_seen": 136405750, + "router_z_loss_clip": 2.93945312, + "router_z_loss_mlp": 0.3815918, + "step": 6357, + "time_per_iteration": 2.717782735824585 + }, + { + "auxiliary_loss_clip": 0.01638929, + "auxiliary_loss_mlp": 0.00311389, + "balance_loss_clip": 1.34712601, + "balance_loss_mlp": 0.27624598, + "epoch": 0.38226364046294903, + "flos": 23367468700800.0, + "grad_norm": 21.557444268876438, + "language_loss": 0.87017787, + "learning_rate": 2.8335022322714695e-06, + "loss": 0.88968104, + "num_input_tokens_seen": 136426085, + "router_z_loss_clip": 2.91992188, + "router_z_loss_mlp": 0.35131836, + "step": 6358, + "time_per_iteration": 2.6921043395996094 + }, + { + "auxiliary_loss_clip": 0.01635267, + "auxiliary_loss_mlp": 0.00287146, + "balance_loss_clip": 1.33668435, + "balance_loss_mlp": 0.25119299, + "epoch": 0.382323763715617, + "flos": 19646225648640.0, + "grad_norm": 36.700007592076034, + "language_loss": 0.85840839, + "learning_rate": 2.8331481877671036e-06, + "loss": 0.8776325, + "num_input_tokens_seen": 136442670, + "router_z_loss_clip": 2.99023438, + "router_z_loss_mlp": 0.35961914, + "step": 6359, + "time_per_iteration": 2.6756958961486816 + }, + { + "auxiliary_loss_clip": 0.0163617, + "auxiliary_loss_mlp": 0.00310689, + "balance_loss_clip": 1.34665358, + "balance_loss_mlp": 0.27371031, + "epoch": 0.38238388696828496, + "flos": 54124741232640.0, + "grad_norm": 6.757895651590891, + "language_loss": 0.76579475, + "learning_rate": 2.8327941116696754e-06, + "loss": 0.78526336, + "num_input_tokens_seen": 136465730, + "router_z_loss_clip": 2.89257812, + "router_z_loss_mlp": 0.36962891, + "step": 6360, + "time_per_iteration": 2.935215711593628 + }, + { + "auxiliary_loss_clip": 0.01644196, + "auxiliary_loss_mlp": 0.00342146, + "balance_loss_clip": 1.35449314, + "balance_loss_mlp": 0.3038322, + "epoch": 0.382444010220953, + "flos": 24936190352640.0, + "grad_norm": 183.3862611858086, + "language_loss": 0.84157073, + "learning_rate": 2.83244000399261e-06, + "loss": 0.8614341, + "num_input_tokens_seen": 136487215, + "router_z_loss_clip": 2.89648438, + "router_z_loss_mlp": 0.3828125, + "step": 6361, + "time_per_iteration": 4.188424587249756 + }, + { + "auxiliary_loss_clip": 0.01631754, + "auxiliary_loss_mlp": 0.00310187, + "balance_loss_clip": 1.33984244, + "balance_loss_mlp": 0.27163455, + "epoch": 0.38250413347362094, + "flos": 42337351209600.0, + "grad_norm": 4.75359859340562, + "language_loss": 0.71291703, + "learning_rate": 2.832085864749337e-06, + "loss": 0.73233646, + "num_input_tokens_seen": 136510365, + "router_z_loss_clip": 2.921875, + "router_z_loss_mlp": 0.38574219, + "step": 6362, + "time_per_iteration": 2.8097174167633057 + }, + { + "auxiliary_loss_clip": 0.01633669, + "auxiliary_loss_mlp": 0.00337431, + "balance_loss_clip": 1.33474171, + "balance_loss_mlp": 0.29818696, + "epoch": 0.3825642567262889, + "flos": 16289224462080.0, + "grad_norm": 230.2022076347913, + "language_loss": 0.89253169, + "learning_rate": 2.8317316939532848e-06, + "loss": 0.91224265, + "num_input_tokens_seen": 136527100, + "router_z_loss_clip": 2.98828125, + "router_z_loss_mlp": 0.39282227, + "step": 6363, + "time_per_iteration": 2.654904365539551 + }, + { + "auxiliary_loss_clip": 0.01642755, + "auxiliary_loss_mlp": 0.00299767, + "balance_loss_clip": 1.35172915, + "balance_loss_mlp": 0.26560172, + "epoch": 0.3826243799789569, + "flos": 45654778586880.0, + "grad_norm": 3.1390254233277077, + "language_loss": 0.67328572, + "learning_rate": 2.8313774916178825e-06, + "loss": 0.692711, + "num_input_tokens_seen": 136550870, + "router_z_loss_clip": 2.9140625, + "router_z_loss_mlp": 0.34204102, + "step": 6364, + "time_per_iteration": 2.921494722366333 + }, + { + "auxiliary_loss_clip": 0.01633247, + "auxiliary_loss_mlp": 0.00323678, + "balance_loss_clip": 1.33395076, + "balance_loss_mlp": 0.28481627, + "epoch": 0.38268450323162484, + "flos": 25301581453440.0, + "grad_norm": 9.333599662354064, + "language_loss": 0.76151007, + "learning_rate": 2.8310232577565635e-06, + "loss": 0.78107935, + "num_input_tokens_seen": 136569895, + "router_z_loss_clip": 2.99609375, + "router_z_loss_mlp": 0.38842773, + "step": 6365, + "time_per_iteration": 2.712049722671509 + }, + { + "auxiliary_loss_clip": 0.01642703, + "auxiliary_loss_mlp": 0.00331412, + "balance_loss_clip": 1.34180737, + "balance_loss_mlp": 0.2928119, + "epoch": 0.3827446264842928, + "flos": 21836022387840.0, + "grad_norm": 2.6915263514773535, + "language_loss": 0.79895067, + "learning_rate": 2.830668992382758e-06, + "loss": 0.81869185, + "num_input_tokens_seen": 136588585, + "router_z_loss_clip": 3.00976562, + "router_z_loss_mlp": 0.38598633, + "step": 6366, + "time_per_iteration": 2.6547906398773193 + }, + { + "auxiliary_loss_clip": 0.01652466, + "auxiliary_loss_mlp": 0.0030324, + "balance_loss_clip": 1.35244131, + "balance_loss_mlp": 0.26731074, + "epoch": 0.38280474973696077, + "flos": 25734591907200.0, + "grad_norm": 3.2746922353969063, + "language_loss": 0.78030682, + "learning_rate": 2.830314695509902e-06, + "loss": 0.79986382, + "num_input_tokens_seen": 136606640, + "router_z_loss_clip": 3.00195312, + "router_z_loss_mlp": 0.359375, + "step": 6367, + "time_per_iteration": 2.7022924423217773 + }, + { + "auxiliary_loss_clip": 0.01645579, + "auxiliary_loss_mlp": 0.00308976, + "balance_loss_clip": 1.35439682, + "balance_loss_mlp": 0.27225989, + "epoch": 0.38286487298962874, + "flos": 24895934184960.0, + "grad_norm": 29.008406633294673, + "language_loss": 0.71184838, + "learning_rate": 2.82996036715143e-06, + "loss": 0.73139393, + "num_input_tokens_seen": 136624940, + "router_z_loss_clip": 2.91015625, + "router_z_loss_mlp": 0.36743164, + "step": 6368, + "time_per_iteration": 2.6671314239501953 + }, + { + "auxiliary_loss_clip": 0.0164398, + "auxiliary_loss_mlp": 0.00249546, + "balance_loss_clip": 1.35303164, + "balance_loss_mlp": 0.21366408, + "epoch": 0.3829249962422967, + "flos": 28543703967360.0, + "grad_norm": 6.804965912593796, + "language_loss": 0.75027627, + "learning_rate": 2.8296060073207763e-06, + "loss": 0.76921153, + "num_input_tokens_seen": 136645540, + "router_z_loss_clip": 2.90820312, + "router_z_loss_mlp": 0.35864258, + "step": 6369, + "time_per_iteration": 2.7387173175811768 + }, + { + "auxiliary_loss_clip": 0.01644063, + "auxiliary_loss_mlp": 0.00280656, + "balance_loss_clip": 1.34890759, + "balance_loss_mlp": 0.24632323, + "epoch": 0.38298511949496467, + "flos": 21471205904640.0, + "grad_norm": 4.112067191658734, + "language_loss": 0.84173089, + "learning_rate": 2.8292516160313804e-06, + "loss": 0.86097807, + "num_input_tokens_seen": 136664530, + "router_z_loss_clip": 2.95117188, + "router_z_loss_mlp": 0.34350586, + "step": 6370, + "time_per_iteration": 2.671417236328125 + }, + { + "auxiliary_loss_clip": 0.01640708, + "auxiliary_loss_mlp": 0.00295478, + "balance_loss_clip": 1.34462953, + "balance_loss_mlp": 0.26031083, + "epoch": 0.38304524274763263, + "flos": 31679998035840.0, + "grad_norm": 13.542197684908267, + "language_loss": 0.7150318, + "learning_rate": 2.8288971932966805e-06, + "loss": 0.73439366, + "num_input_tokens_seen": 136682315, + "router_z_loss_clip": 2.9609375, + "router_z_loss_mlp": 0.35131836, + "step": 6371, + "time_per_iteration": 2.8249351978302 + }, + { + "auxiliary_loss_clip": 0.01629305, + "auxiliary_loss_mlp": 0.0031426, + "balance_loss_clip": 1.32985497, + "balance_loss_mlp": 0.27825874, + "epoch": 0.3831053660003006, + "flos": 25076816098560.0, + "grad_norm": 2.100183189851737, + "language_loss": 0.79272449, + "learning_rate": 2.8285427391301155e-06, + "loss": 0.81216019, + "num_input_tokens_seen": 136701185, + "router_z_loss_clip": 2.99414062, + "router_z_loss_mlp": 0.35986328, + "step": 6372, + "time_per_iteration": 2.7687911987304688 + }, + { + "auxiliary_loss_clip": 0.01634512, + "auxiliary_loss_mlp": 0.00311852, + "balance_loss_clip": 1.33286881, + "balance_loss_mlp": 0.27606487, + "epoch": 0.38316548925296856, + "flos": 23259018562560.0, + "grad_norm": 11.00943733144634, + "language_loss": 0.91037482, + "learning_rate": 2.8281882535451266e-06, + "loss": 0.92983842, + "num_input_tokens_seen": 136721265, + "router_z_loss_clip": 3.015625, + "router_z_loss_mlp": 0.35791016, + "step": 6373, + "time_per_iteration": 2.733586072921753 + }, + { + "auxiliary_loss_clip": 0.01633817, + "auxiliary_loss_mlp": 0.00322382, + "balance_loss_clip": 1.33856297, + "balance_loss_mlp": 0.28618985, + "epoch": 0.3832256125056366, + "flos": 34423465991040.0, + "grad_norm": 5.080349468038354, + "language_loss": 0.81788421, + "learning_rate": 2.8278337365551567e-06, + "loss": 0.83744615, + "num_input_tokens_seen": 136741885, + "router_z_loss_clip": 2.953125, + "router_z_loss_mlp": 0.36181641, + "step": 6374, + "time_per_iteration": 2.7729270458221436 + }, + { + "auxiliary_loss_clip": 0.01641953, + "auxiliary_loss_mlp": 0.00315158, + "balance_loss_clip": 1.34198666, + "balance_loss_mlp": 0.27934742, + "epoch": 0.38328573575830455, + "flos": 21762764599680.0, + "grad_norm": 18.077195179627996, + "language_loss": 0.83782071, + "learning_rate": 2.8274791881736485e-06, + "loss": 0.85739183, + "num_input_tokens_seen": 136760905, + "router_z_loss_clip": 3.0, + "router_z_loss_mlp": 0.3581543, + "step": 6375, + "time_per_iteration": 2.6978530883789062 + }, + { + "auxiliary_loss_clip": 0.0161977, + "auxiliary_loss_mlp": 0.00298048, + "balance_loss_clip": 1.32249069, + "balance_loss_mlp": 0.26209489, + "epoch": 0.3833458590109725, + "flos": 17380010724480.0, + "grad_norm": 54.25424411282349, + "language_loss": 0.80408758, + "learning_rate": 2.8271246084140457e-06, + "loss": 0.82326579, + "num_input_tokens_seen": 136777240, + "router_z_loss_clip": 2.97265625, + "router_z_loss_mlp": 0.35961914, + "step": 6376, + "time_per_iteration": 2.6696083545684814 + }, + { + "auxiliary_loss_clip": 0.01624487, + "auxiliary_loss_mlp": 0.00303517, + "balance_loss_clip": 1.33263206, + "balance_loss_mlp": 0.26765925, + "epoch": 0.3834059822636405, + "flos": 29424557191680.0, + "grad_norm": 43.3541374108664, + "language_loss": 0.72407168, + "learning_rate": 2.826769997289796e-06, + "loss": 0.7433517, + "num_input_tokens_seen": 136801040, + "router_z_loss_clip": 2.91992188, + "router_z_loss_mlp": 0.35839844, + "step": 6377, + "time_per_iteration": 2.8000590801239014 + }, + { + "auxiliary_loss_clip": 0.01619972, + "auxiliary_loss_mlp": 0.00319812, + "balance_loss_clip": 1.32066298, + "balance_loss_mlp": 0.28288147, + "epoch": 0.38346610551630844, + "flos": 21470739027840.0, + "grad_norm": 1419.7694736385138, + "language_loss": 0.82079792, + "learning_rate": 2.826415354814344e-06, + "loss": 0.84019578, + "num_input_tokens_seen": 136819495, + "router_z_loss_clip": 2.9921875, + "router_z_loss_mlp": 0.36889648, + "step": 6378, + "time_per_iteration": 2.6820626258850098 + }, + { + "auxiliary_loss_clip": 0.01636608, + "auxiliary_loss_mlp": 0.00327475, + "balance_loss_clip": 1.33576155, + "balance_loss_mlp": 0.29140192, + "epoch": 0.3835262287689764, + "flos": 27561224188800.0, + "grad_norm": 43.77223103004055, + "language_loss": 0.74262512, + "learning_rate": 2.8260606810011396e-06, + "loss": 0.76226592, + "num_input_tokens_seen": 136838840, + "router_z_loss_clip": 3.0078125, + "router_z_loss_mlp": 0.36083984, + "step": 6379, + "time_per_iteration": 2.7266478538513184 + }, + { + "auxiliary_loss_clip": 0.01631655, + "auxiliary_loss_mlp": 0.00295413, + "balance_loss_clip": 1.33536875, + "balance_loss_mlp": 0.26122427, + "epoch": 0.3835863520216444, + "flos": 15523716787200.0, + "grad_norm": 3.53089952463034, + "language_loss": 0.88467366, + "learning_rate": 2.8257059758636315e-06, + "loss": 0.90394425, + "num_input_tokens_seen": 136854425, + "router_z_loss_clip": 2.96289062, + "router_z_loss_mlp": 0.34204102, + "step": 6380, + "time_per_iteration": 2.7315142154693604 + }, + { + "auxiliary_loss_clip": 0.01624299, + "auxiliary_loss_mlp": 0.00324617, + "balance_loss_clip": 1.32844687, + "balance_loss_mlp": 0.28782916, + "epoch": 0.38364647527431234, + "flos": 21904934630400.0, + "grad_norm": 12.648454522273104, + "language_loss": 0.85923922, + "learning_rate": 2.8253512394152697e-06, + "loss": 0.87872839, + "num_input_tokens_seen": 136874355, + "router_z_loss_clip": 2.95898438, + "router_z_loss_mlp": 0.36816406, + "step": 6381, + "time_per_iteration": 2.6952576637268066 + }, + { + "auxiliary_loss_clip": 0.01586094, + "auxiliary_loss_mlp": 0.00228604, + "balance_loss_clip": 1.40374136, + "balance_loss_mlp": 0.21830411, + "epoch": 0.3837065985269803, + "flos": 65534927558400.0, + "grad_norm": 0.8122089837339881, + "language_loss": 0.59734744, + "learning_rate": 2.8249964716695068e-06, + "loss": 0.61549443, + "num_input_tokens_seen": 136937475, + "router_z_loss_clip": 1.828125, + "router_z_loss_mlp": 0.10302734, + "step": 6382, + "time_per_iteration": 3.1304779052734375 + }, + { + "auxiliary_loss_clip": 0.01618217, + "auxiliary_loss_mlp": 0.0029267, + "balance_loss_clip": 1.32110715, + "balance_loss_mlp": 0.25614429, + "epoch": 0.38376672177964827, + "flos": 28256598558720.0, + "grad_norm": 31.01702469523618, + "language_loss": 0.75027978, + "learning_rate": 2.824641672639794e-06, + "loss": 0.76938868, + "num_input_tokens_seen": 136955805, + "router_z_loss_clip": 2.96875, + "router_z_loss_mlp": 0.36523438, + "step": 6383, + "time_per_iteration": 2.7845327854156494 + }, + { + "auxiliary_loss_clip": 0.01634254, + "auxiliary_loss_mlp": 0.00296492, + "balance_loss_clip": 1.33144569, + "balance_loss_mlp": 0.26173091, + "epoch": 0.38382684503231623, + "flos": 20631363033600.0, + "grad_norm": 12.106975601486083, + "language_loss": 0.82265091, + "learning_rate": 2.824286842339587e-06, + "loss": 0.8419584, + "num_input_tokens_seen": 136975240, + "router_z_loss_clip": 3.02539062, + "router_z_loss_mlp": 0.34741211, + "step": 6384, + "time_per_iteration": 2.7124667167663574 + }, + { + "auxiliary_loss_clip": 0.01651498, + "auxiliary_loss_mlp": 0.00279031, + "balance_loss_clip": 1.34730005, + "balance_loss_mlp": 0.24651103, + "epoch": 0.3838869682849842, + "flos": 19605825826560.0, + "grad_norm": 40.14352114451263, + "language_loss": 0.80459905, + "learning_rate": 2.823931980782341e-06, + "loss": 0.82390434, + "num_input_tokens_seen": 136994985, + "router_z_loss_clip": 3.0390625, + "router_z_loss_mlp": 0.32495117, + "step": 6385, + "time_per_iteration": 2.764244794845581 + }, + { + "auxiliary_loss_clip": 0.01555741, + "auxiliary_loss_mlp": 0.00213322, + "balance_loss_clip": 1.37314105, + "balance_loss_mlp": 0.20159169, + "epoch": 0.38394709153765216, + "flos": 56556110891520.0, + "grad_norm": 1.1764234130688362, + "language_loss": 0.678231, + "learning_rate": 2.82357708798151e-06, + "loss": 0.69592154, + "num_input_tokens_seen": 137046290, + "router_z_loss_clip": 1.828125, + "router_z_loss_mlp": 0.1171875, + "step": 6386, + "time_per_iteration": 3.00028657913208 + }, + { + "auxiliary_loss_clip": 0.0164764, + "auxiliary_loss_mlp": 0.00314766, + "balance_loss_clip": 1.3471427, + "balance_loss_mlp": 0.27998027, + "epoch": 0.3840072147903202, + "flos": 15888748752000.0, + "grad_norm": 9.539811422810631, + "language_loss": 0.79170305, + "learning_rate": 2.8232221639505547e-06, + "loss": 0.8113271, + "num_input_tokens_seen": 137064725, + "router_z_loss_clip": 3.00976562, + "router_z_loss_mlp": 0.34765625, + "step": 6387, + "time_per_iteration": 2.683128595352173 + }, + { + "auxiliary_loss_clip": 0.01650924, + "auxiliary_loss_mlp": 0.00302356, + "balance_loss_clip": 1.35045171, + "balance_loss_mlp": 0.26969314, + "epoch": 0.38406733804298815, + "flos": 28218030330240.0, + "grad_norm": 2.926624253721753, + "language_loss": 0.87572628, + "learning_rate": 2.822867208702932e-06, + "loss": 0.89525914, + "num_input_tokens_seen": 137086030, + "router_z_loss_clip": 3.0078125, + "router_z_loss_mlp": 0.32666016, + "step": 6388, + "time_per_iteration": 2.7151131629943848 + }, + { + "auxiliary_loss_clip": 0.0165121, + "auxiliary_loss_mlp": 0.00288919, + "balance_loss_clip": 1.34033775, + "balance_loss_mlp": 0.25356135, + "epoch": 0.3841274612956561, + "flos": 18223588609920.0, + "grad_norm": 13.983126721764119, + "language_loss": 0.82510334, + "learning_rate": 2.8225122222521026e-06, + "loss": 0.84450459, + "num_input_tokens_seen": 137105400, + "router_z_loss_clip": 3.10546875, + "router_z_loss_mlp": 0.35375977, + "step": 6389, + "time_per_iteration": 2.742051839828491 + }, + { + "auxiliary_loss_clip": 0.0166683, + "auxiliary_loss_mlp": 0.00320901, + "balance_loss_clip": 1.34844732, + "balance_loss_mlp": 0.28289717, + "epoch": 0.3841875845483241, + "flos": 19792884879360.0, + "grad_norm": 4.778524176391641, + "language_loss": 0.8391583, + "learning_rate": 2.8221572046115273e-06, + "loss": 0.85903561, + "num_input_tokens_seen": 137124985, + "router_z_loss_clip": 3.1875, + "router_z_loss_mlp": 0.37988281, + "step": 6390, + "time_per_iteration": 2.6780333518981934 + }, + { + "auxiliary_loss_clip": 0.01658868, + "auxiliary_loss_mlp": 0.00288224, + "balance_loss_clip": 1.34294629, + "balance_loss_mlp": 0.25367731, + "epoch": 0.38424770780099204, + "flos": 29898829393920.0, + "grad_norm": 262.4528700326799, + "language_loss": 0.76295197, + "learning_rate": 2.821802155794668e-06, + "loss": 0.78242284, + "num_input_tokens_seen": 137146745, + "router_z_loss_clip": 3.16015625, + "router_z_loss_mlp": 0.34545898, + "step": 6391, + "time_per_iteration": 2.7532753944396973 + }, + { + "auxiliary_loss_clip": 0.01649161, + "auxiliary_loss_mlp": 0.00331622, + "balance_loss_clip": 1.33381319, + "balance_loss_mlp": 0.29354697, + "epoch": 0.38430783105366, + "flos": 20813717404800.0, + "grad_norm": 8.951828855598835, + "language_loss": 0.91793633, + "learning_rate": 2.8214470758149884e-06, + "loss": 0.93774408, + "num_input_tokens_seen": 137163195, + "router_z_loss_clip": 3.15429688, + "router_z_loss_mlp": 0.38061523, + "step": 6392, + "time_per_iteration": 4.065686225891113 + }, + { + "auxiliary_loss_clip": 0.01655674, + "auxiliary_loss_mlp": 0.00317393, + "balance_loss_clip": 1.33980393, + "balance_loss_mlp": 0.28258431, + "epoch": 0.384367954306328, + "flos": 10998577399680.0, + "grad_norm": 44.87928977372603, + "language_loss": 0.69636261, + "learning_rate": 2.8210919646859536e-06, + "loss": 0.7160933, + "num_input_tokens_seen": 137179330, + "router_z_loss_clip": 3.15820312, + "router_z_loss_mlp": 0.34790039, + "step": 6393, + "time_per_iteration": 2.67919921875 + }, + { + "auxiliary_loss_clip": 0.0165279, + "auxiliary_loss_mlp": 0.00304554, + "balance_loss_clip": 1.33456767, + "balance_loss_mlp": 0.26778981, + "epoch": 0.38442807755899594, + "flos": 25338030779520.0, + "grad_norm": 6.9178031105268785, + "language_loss": 0.80512524, + "learning_rate": 2.820736822421029e-06, + "loss": 0.82469863, + "num_input_tokens_seen": 137198655, + "router_z_loss_clip": 3.18164062, + "router_z_loss_mlp": 0.3671875, + "step": 6394, + "time_per_iteration": 4.246353387832642 + }, + { + "auxiliary_loss_clip": 0.01657222, + "auxiliary_loss_mlp": 0.00298556, + "balance_loss_clip": 1.33776689, + "balance_loss_mlp": 0.26358047, + "epoch": 0.3844882008116639, + "flos": 21069760527360.0, + "grad_norm": 38.01514094757584, + "language_loss": 0.89542091, + "learning_rate": 2.8203816490336822e-06, + "loss": 0.91497874, + "num_input_tokens_seen": 137217120, + "router_z_loss_clip": 3.19140625, + "router_z_loss_mlp": 0.34985352, + "step": 6395, + "time_per_iteration": 2.8561062812805176 + }, + { + "auxiliary_loss_clip": 0.01671022, + "auxiliary_loss_mlp": 0.00296386, + "balance_loss_clip": 1.34647012, + "balance_loss_mlp": 0.26346076, + "epoch": 0.38454832406433187, + "flos": 17963235855360.0, + "grad_norm": 29.2097131755875, + "language_loss": 0.80332494, + "learning_rate": 2.8200264445373813e-06, + "loss": 0.82299906, + "num_input_tokens_seen": 137234410, + "router_z_loss_clip": 3.24609375, + "router_z_loss_mlp": 0.3293457, + "step": 6396, + "time_per_iteration": 2.7471776008605957 + }, + { + "auxiliary_loss_clip": 0.01474214, + "auxiliary_loss_mlp": 0.00106547, + "balance_loss_clip": 1.30212903, + "balance_loss_mlp": 0.09681967, + "epoch": 0.38460844731699984, + "flos": 67924999555200.0, + "grad_norm": 0.9973468574081054, + "language_loss": 0.59828007, + "learning_rate": 2.8196712089455954e-06, + "loss": 0.6140877, + "num_input_tokens_seen": 137294940, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.09716797, + "step": 6397, + "time_per_iteration": 4.663430213928223 + }, + { + "auxiliary_loss_clip": 0.01665425, + "auxiliary_loss_mlp": 0.002565, + "balance_loss_clip": 1.35379732, + "balance_loss_mlp": 0.2251482, + "epoch": 0.3846685705696678, + "flos": 25849075530240.0, + "grad_norm": 683068.9376245758, + "language_loss": 0.92615199, + "learning_rate": 2.819315942271794e-06, + "loss": 0.94537127, + "num_input_tokens_seen": 137315035, + "router_z_loss_clip": 3.11523438, + "router_z_loss_mlp": 0.31323242, + "step": 6398, + "time_per_iteration": 2.8191897869110107 + }, + { + "auxiliary_loss_clip": 0.01647086, + "auxiliary_loss_mlp": 0.00293524, + "balance_loss_clip": 1.33587265, + "balance_loss_mlp": 0.26198125, + "epoch": 0.38472869382233577, + "flos": 16290194129280.0, + "grad_norm": 4.797798387559952, + "language_loss": 0.87718755, + "learning_rate": 2.8189606445294515e-06, + "loss": 0.89659369, + "num_input_tokens_seen": 137333155, + "router_z_loss_clip": 3.11328125, + "router_z_loss_mlp": 0.31542969, + "step": 6399, + "time_per_iteration": 2.652078151702881 + }, + { + "auxiliary_loss_clip": 0.01629405, + "auxiliary_loss_mlp": 0.0027606, + "balance_loss_clip": 1.32162011, + "balance_loss_mlp": 0.24010614, + "epoch": 0.38478881707500373, + "flos": 19353122668800.0, + "grad_norm": 105.21784984775711, + "language_loss": 0.75932962, + "learning_rate": 2.818605315732038e-06, + "loss": 0.77838427, + "num_input_tokens_seen": 137351515, + "router_z_loss_clip": 3.08007812, + "router_z_loss_mlp": 0.35961914, + "step": 6400, + "time_per_iteration": 2.826495409011841 + }, + { + "auxiliary_loss_clip": 0.01643348, + "auxiliary_loss_mlp": 0.00290036, + "balance_loss_clip": 1.33457339, + "balance_loss_mlp": 0.25780219, + "epoch": 0.38484894032767175, + "flos": 24860849575680.0, + "grad_norm": 6.583121368690919, + "language_loss": 0.7875967, + "learning_rate": 2.81824995589303e-06, + "loss": 0.80693054, + "num_input_tokens_seen": 137371255, + "router_z_loss_clip": 3.08789062, + "router_z_loss_mlp": 0.32202148, + "step": 6401, + "time_per_iteration": 2.7704579830169678 + }, + { + "auxiliary_loss_clip": 0.0165261, + "auxiliary_loss_mlp": 0.00277772, + "balance_loss_clip": 1.33622897, + "balance_loss_mlp": 0.24115115, + "epoch": 0.3849090635803397, + "flos": 14501806853760.0, + "grad_norm": 4.698122172961729, + "language_loss": 0.80882818, + "learning_rate": 2.8178945650259012e-06, + "loss": 0.82813191, + "num_input_tokens_seen": 137388980, + "router_z_loss_clip": 3.1640625, + "router_z_loss_mlp": 0.3659668, + "step": 6402, + "time_per_iteration": 2.711061477661133 + }, + { + "auxiliary_loss_clip": 0.01636417, + "auxiliary_loss_mlp": 0.00268352, + "balance_loss_clip": 1.32975483, + "balance_loss_mlp": 0.23318522, + "epoch": 0.3849691868330077, + "flos": 18515865576960.0, + "grad_norm": 12.770514564255796, + "language_loss": 0.90521371, + "learning_rate": 2.817539143144128e-06, + "loss": 0.92426139, + "num_input_tokens_seen": 137406885, + "router_z_loss_clip": 3.0703125, + "router_z_loss_mlp": 0.3515625, + "step": 6403, + "time_per_iteration": 2.6125285625457764 + }, + { + "auxiliary_loss_clip": 0.01618506, + "auxiliary_loss_mlp": 0.0029043, + "balance_loss_clip": 1.32121253, + "balance_loss_mlp": 0.25881618, + "epoch": 0.38502931008567565, + "flos": 21616392677760.0, + "grad_norm": 522.9589302879966, + "language_loss": 0.90498936, + "learning_rate": 2.817183690261189e-06, + "loss": 0.9240787, + "num_input_tokens_seen": 137425535, + "router_z_loss_clip": 2.97460938, + "router_z_loss_mlp": 0.31616211, + "step": 6404, + "time_per_iteration": 4.086002826690674 + }, + { + "auxiliary_loss_clip": 0.01614214, + "auxiliary_loss_mlp": 0.00283081, + "balance_loss_clip": 1.31210184, + "balance_loss_mlp": 0.24932085, + "epoch": 0.3850894333383436, + "flos": 25415346804480.0, + "grad_norm": 2.4525566263436973, + "language_loss": 0.75457811, + "learning_rate": 2.816828206390563e-06, + "loss": 0.77355111, + "num_input_tokens_seen": 137447700, + "router_z_loss_clip": 3.02148438, + "router_z_loss_mlp": 0.33740234, + "step": 6405, + "time_per_iteration": 2.7340383529663086 + }, + { + "auxiliary_loss_clip": 0.01618282, + "auxiliary_loss_mlp": 0.00269863, + "balance_loss_clip": 1.31931412, + "balance_loss_mlp": 0.23805785, + "epoch": 0.3851495565910116, + "flos": 20227870581120.0, + "grad_norm": 109.48243481597194, + "language_loss": 0.86628896, + "learning_rate": 2.816472691545729e-06, + "loss": 0.8851704, + "num_input_tokens_seen": 137462245, + "router_z_loss_clip": 2.98632812, + "router_z_loss_mlp": 0.31811523, + "step": 6406, + "time_per_iteration": 2.6327297687530518 + }, + { + "auxiliary_loss_clip": 0.01622763, + "auxiliary_loss_mlp": 0.00282586, + "balance_loss_clip": 1.31411695, + "balance_loss_mlp": 0.24746734, + "epoch": 0.38520967984367954, + "flos": 16508459122560.0, + "grad_norm": 9.009872721962736, + "language_loss": 0.92503709, + "learning_rate": 2.8161171457401694e-06, + "loss": 0.9440906, + "num_input_tokens_seen": 137476455, + "router_z_loss_clip": 3.08984375, + "router_z_loss_mlp": 0.35107422, + "step": 6407, + "time_per_iteration": 2.60701060295105 + }, + { + "auxiliary_loss_clip": 0.01407908, + "auxiliary_loss_mlp": 0.00097652, + "balance_loss_clip": 1.23090756, + "balance_loss_mlp": 0.09006993, + "epoch": 0.3852698030963475, + "flos": 61313772971520.0, + "grad_norm": 0.7701428187108796, + "language_loss": 0.64861035, + "learning_rate": 2.815761568987365e-06, + "loss": 0.66366595, + "num_input_tokens_seen": 137539845, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.07568359, + "step": 6408, + "time_per_iteration": 3.21704363822937 + }, + { + "auxiliary_loss_clip": 0.01623172, + "auxiliary_loss_mlp": 0.00307297, + "balance_loss_clip": 1.32007957, + "balance_loss_mlp": 0.27339351, + "epoch": 0.3853299263490155, + "flos": 22893016930560.0, + "grad_norm": 22.150362727447924, + "language_loss": 0.78805512, + "learning_rate": 2.8154059613008e-06, + "loss": 0.80735981, + "num_input_tokens_seen": 137559880, + "router_z_loss_clip": 3.03125, + "router_z_loss_mlp": 0.33911133, + "step": 6409, + "time_per_iteration": 2.6574177742004395 + }, + { + "auxiliary_loss_clip": 0.01623192, + "auxiliary_loss_mlp": 0.00288355, + "balance_loss_clip": 1.3140552, + "balance_loss_mlp": 0.25359327, + "epoch": 0.38539004960168344, + "flos": 20047491457920.0, + "grad_norm": 120.7775109532317, + "language_loss": 0.8092851, + "learning_rate": 2.81505032269396e-06, + "loss": 0.82840061, + "num_input_tokens_seen": 137578225, + "router_z_loss_clip": 3.09179688, + "router_z_loss_mlp": 0.34741211, + "step": 6410, + "time_per_iteration": 2.67203688621521 + }, + { + "auxiliary_loss_clip": 0.01413846, + "auxiliary_loss_mlp": 0.00091077, + "balance_loss_clip": 1.23758411, + "balance_loss_mlp": 0.08220824, + "epoch": 0.3854501728543514, + "flos": 68730691570560.0, + "grad_norm": 2.0941226392176775, + "language_loss": 0.59496307, + "learning_rate": 2.81469465318033e-06, + "loss": 0.61001229, + "num_input_tokens_seen": 137645770, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.08886719, + "step": 6411, + "time_per_iteration": 3.242995023727417 + }, + { + "auxiliary_loss_clip": 0.01616793, + "auxiliary_loss_mlp": 0.00246114, + "balance_loss_clip": 1.30892897, + "balance_loss_mlp": 0.21333119, + "epoch": 0.38551029610701937, + "flos": 20485027025280.0, + "grad_norm": 6.35446635716123, + "language_loss": 0.85746861, + "learning_rate": 2.814338952773397e-06, + "loss": 0.87609762, + "num_input_tokens_seen": 137664090, + "router_z_loss_clip": 3.078125, + "router_z_loss_mlp": 0.32739258, + "step": 6412, + "time_per_iteration": 2.7130844593048096 + }, + { + "auxiliary_loss_clip": 0.01618982, + "auxiliary_loss_mlp": 0.00275395, + "balance_loss_clip": 1.30901361, + "balance_loss_mlp": 0.2427558, + "epoch": 0.38557041935968733, + "flos": 23471788775040.0, + "grad_norm": 2.549919299423259, + "language_loss": 0.87172139, + "learning_rate": 2.8139832214866493e-06, + "loss": 0.89066517, + "num_input_tokens_seen": 137683190, + "router_z_loss_clip": 3.1015625, + "router_z_loss_mlp": 0.32666016, + "step": 6413, + "time_per_iteration": 2.6942784786224365 + }, + { + "auxiliary_loss_clip": 0.01417563, + "auxiliary_loss_mlp": 0.0007404, + "balance_loss_clip": 1.24129415, + "balance_loss_mlp": 0.06540932, + "epoch": 0.38563054261235535, + "flos": 63966636869760.0, + "grad_norm": 0.8104589740427719, + "language_loss": 0.61041033, + "learning_rate": 2.813627459333576e-06, + "loss": 0.6253264, + "num_input_tokens_seen": 137737315, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.08642578, + "step": 6414, + "time_per_iteration": 3.077376365661621 + }, + { + "auxiliary_loss_clip": 0.01621711, + "auxiliary_loss_mlp": 0.00251389, + "balance_loss_clip": 1.30936754, + "balance_loss_mlp": 0.21581739, + "epoch": 0.3856906658650233, + "flos": 23987789602560.0, + "grad_norm": 5.457801374787279, + "language_loss": 0.86835825, + "learning_rate": 2.8132716663276685e-06, + "loss": 0.88708919, + "num_input_tokens_seen": 137753535, + "router_z_loss_clip": 3.12695312, + "router_z_loss_mlp": 0.35546875, + "step": 6415, + "time_per_iteration": 2.7062904834747314 + }, + { + "auxiliary_loss_clip": 0.01636942, + "auxiliary_loss_mlp": 0.00247308, + "balance_loss_clip": 1.32462645, + "balance_loss_mlp": 0.21342859, + "epoch": 0.3857507891176913, + "flos": 25007436979200.0, + "grad_norm": 8.493384525836746, + "language_loss": 0.84398359, + "learning_rate": 2.8129158424824173e-06, + "loss": 0.86282605, + "num_input_tokens_seen": 137773405, + "router_z_loss_clip": 3.12695312, + "router_z_loss_mlp": 0.33862305, + "step": 6416, + "time_per_iteration": 2.7539241313934326 + }, + { + "auxiliary_loss_clip": 0.01621486, + "auxiliary_loss_mlp": 0.00275288, + "balance_loss_clip": 1.31088769, + "balance_loss_mlp": 0.24431768, + "epoch": 0.38581091237035925, + "flos": 21536778182400.0, + "grad_norm": 7.877570523578659, + "language_loss": 0.85020435, + "learning_rate": 2.8125599878113155e-06, + "loss": 0.8691721, + "num_input_tokens_seen": 137790810, + "router_z_loss_clip": 3.10351562, + "router_z_loss_mlp": 0.30981445, + "step": 6417, + "time_per_iteration": 2.681366205215454 + }, + { + "auxiliary_loss_clip": 0.01622608, + "auxiliary_loss_mlp": 0.00254775, + "balance_loss_clip": 1.3100996, + "balance_loss_mlp": 0.2227788, + "epoch": 0.3858710356230272, + "flos": 17383889393280.0, + "grad_norm": 13.549240707663827, + "language_loss": 0.8904106, + "learning_rate": 2.8122041023278583e-06, + "loss": 0.9091844, + "num_input_tokens_seen": 137810265, + "router_z_loss_clip": 3.12890625, + "router_z_loss_mlp": 0.32006836, + "step": 6418, + "time_per_iteration": 2.726663589477539 + }, + { + "auxiliary_loss_clip": 0.01619538, + "auxiliary_loss_mlp": 0.00248069, + "balance_loss_clip": 1.31333435, + "balance_loss_mlp": 0.21705124, + "epoch": 0.3859311588756952, + "flos": 20339588856960.0, + "grad_norm": 7.2652202216629105, + "language_loss": 0.87159479, + "learning_rate": 2.8118481860455407e-06, + "loss": 0.89027083, + "num_input_tokens_seen": 137828580, + "router_z_loss_clip": 3.06054688, + "router_z_loss_mlp": 0.31030273, + "step": 6419, + "time_per_iteration": 2.705132484436035 + }, + { + "auxiliary_loss_clip": 0.01635078, + "auxiliary_loss_mlp": 0.00248218, + "balance_loss_clip": 1.31754112, + "balance_loss_mlp": 0.21302727, + "epoch": 0.38599128212836314, + "flos": 26321157002880.0, + "grad_norm": 37.02581083666262, + "language_loss": 0.75878537, + "learning_rate": 2.8114922389778573e-06, + "loss": 0.77761835, + "num_input_tokens_seen": 137846145, + "router_z_loss_clip": 3.17773438, + "router_z_loss_mlp": 0.35180664, + "step": 6420, + "time_per_iteration": 2.750032424926758 + }, + { + "auxiliary_loss_clip": 0.01637871, + "auxiliary_loss_mlp": 0.00264549, + "balance_loss_clip": 1.32441509, + "balance_loss_mlp": 0.22952518, + "epoch": 0.3860514053810311, + "flos": 13553837066880.0, + "grad_norm": 5.876358523652723, + "language_loss": 0.88083565, + "learning_rate": 2.8111362611383076e-06, + "loss": 0.89985991, + "num_input_tokens_seen": 137863705, + "router_z_loss_clip": 3.13867188, + "router_z_loss_mlp": 0.3503418, + "step": 6421, + "time_per_iteration": 2.6665453910827637 + }, + { + "auxiliary_loss_clip": 0.01638165, + "auxiliary_loss_mlp": 0.0026978, + "balance_loss_clip": 1.3198241, + "balance_loss_mlp": 0.23516153, + "epoch": 0.3861115286336991, + "flos": 20954271323520.0, + "grad_norm": 4.590073067070618, + "language_loss": 0.80771947, + "learning_rate": 2.8107802525403886e-06, + "loss": 0.82679886, + "num_input_tokens_seen": 137880285, + "router_z_loss_clip": 3.18359375, + "router_z_loss_mlp": 0.34643555, + "step": 6422, + "time_per_iteration": 2.6321959495544434 + }, + { + "auxiliary_loss_clip": 0.01651381, + "auxiliary_loss_mlp": 0.00246392, + "balance_loss_clip": 1.33118415, + "balance_loss_mlp": 0.21370503, + "epoch": 0.38617165188636704, + "flos": 16362697731840.0, + "grad_norm": 2.3463799525801123, + "language_loss": 0.74307883, + "learning_rate": 2.8104242131976025e-06, + "loss": 0.76205659, + "num_input_tokens_seen": 137898335, + "router_z_loss_clip": 3.203125, + "router_z_loss_mlp": 0.3269043, + "step": 6423, + "time_per_iteration": 2.6652989387512207 + }, + { + "auxiliary_loss_clip": 0.01624951, + "auxiliary_loss_mlp": 0.00259142, + "balance_loss_clip": 1.30840778, + "balance_loss_mlp": 0.22352204, + "epoch": 0.386231775139035, + "flos": 34787276893440.0, + "grad_norm": 3.774284105856269, + "language_loss": 0.77174854, + "learning_rate": 2.810068143123449e-06, + "loss": 0.79058945, + "num_input_tokens_seen": 137918605, + "router_z_loss_clip": 3.16210938, + "router_z_loss_mlp": 0.35620117, + "step": 6424, + "time_per_iteration": 2.8043863773345947 + }, + { + "auxiliary_loss_clip": 0.01628187, + "auxiliary_loss_mlp": 0.00260154, + "balance_loss_clip": 1.31270862, + "balance_loss_mlp": 0.22846854, + "epoch": 0.38629189839170297, + "flos": 21726171619200.0, + "grad_norm": 18.498667601603845, + "language_loss": 0.77568841, + "learning_rate": 2.809712042331429e-06, + "loss": 0.79457182, + "num_input_tokens_seen": 137938245, + "router_z_loss_clip": 3.15429688, + "router_z_loss_mlp": 0.31689453, + "step": 6425, + "time_per_iteration": 2.7228190898895264 + }, + { + "auxiliary_loss_clip": 0.01623246, + "auxiliary_loss_mlp": 0.00262252, + "balance_loss_clip": 1.30741155, + "balance_loss_mlp": 0.22873083, + "epoch": 0.38635202164437094, + "flos": 27923634460800.0, + "grad_norm": 581.4834736584602, + "language_loss": 0.86199278, + "learning_rate": 2.8093559108350484e-06, + "loss": 0.88084781, + "num_input_tokens_seen": 137956770, + "router_z_loss_clip": 3.16015625, + "router_z_loss_mlp": 0.33520508, + "step": 6426, + "time_per_iteration": 2.825049638748169 + }, + { + "auxiliary_loss_clip": 0.01635088, + "auxiliary_loss_mlp": 0.00291914, + "balance_loss_clip": 1.31853437, + "balance_loss_mlp": 0.25875026, + "epoch": 0.38641214489703896, + "flos": 23586631534080.0, + "grad_norm": 25.322865339239797, + "language_loss": 0.82649434, + "learning_rate": 2.80899974864781e-06, + "loss": 0.84576434, + "num_input_tokens_seen": 137977040, + "router_z_loss_clip": 3.16601562, + "router_z_loss_mlp": 0.33178711, + "step": 6427, + "time_per_iteration": 2.73966121673584 + }, + { + "auxiliary_loss_clip": 0.01627626, + "auxiliary_loss_mlp": 0.00264892, + "balance_loss_clip": 1.31166708, + "balance_loss_mlp": 0.23122761, + "epoch": 0.3864722681497069, + "flos": 12641239198080.0, + "grad_norm": 5.178065698152898, + "language_loss": 0.76156753, + "learning_rate": 2.8086435557832203e-06, + "loss": 0.78049272, + "num_input_tokens_seen": 137993545, + "router_z_loss_clip": 3.16015625, + "router_z_loss_mlp": 0.33642578, + "step": 6428, + "time_per_iteration": 2.670247793197632 + }, + { + "auxiliary_loss_clip": 0.01620404, + "auxiliary_loss_mlp": 0.00279659, + "balance_loss_clip": 1.30646241, + "balance_loss_mlp": 0.24525538, + "epoch": 0.3865323914023749, + "flos": 17598922162560.0, + "grad_norm": 5.168169471143829, + "language_loss": 0.89987695, + "learning_rate": 2.8082873322547863e-06, + "loss": 0.9188776, + "num_input_tokens_seen": 138010140, + "router_z_loss_clip": 3.140625, + "router_z_loss_mlp": 0.34423828, + "step": 6429, + "time_per_iteration": 2.665508270263672 + }, + { + "auxiliary_loss_clip": 0.01611354, + "auxiliary_loss_mlp": 0.0030184, + "balance_loss_clip": 1.30057049, + "balance_loss_mlp": 0.26922432, + "epoch": 0.38659251465504285, + "flos": 18478949374080.0, + "grad_norm": 143.53212572378067, + "language_loss": 0.90953016, + "learning_rate": 2.807931078076015e-06, + "loss": 0.92866206, + "num_input_tokens_seen": 138028880, + "router_z_loss_clip": 3.11132812, + "router_z_loss_mlp": 0.32592773, + "step": 6430, + "time_per_iteration": 2.7065279483795166 + }, + { + "auxiliary_loss_clip": 0.01426963, + "auxiliary_loss_mlp": 0.00162404, + "balance_loss_clip": 1.25524187, + "balance_loss_mlp": 0.15186636, + "epoch": 0.3866526379077108, + "flos": 64165726978560.0, + "grad_norm": 1.3511975586301592, + "language_loss": 0.58661735, + "learning_rate": 2.807574793260416e-06, + "loss": 0.60251099, + "num_input_tokens_seen": 138098090, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.10546875, + "step": 6431, + "time_per_iteration": 3.2014658451080322 + }, + { + "auxiliary_loss_clip": 0.0160859, + "auxiliary_loss_mlp": 0.00291007, + "balance_loss_clip": 1.29696488, + "balance_loss_mlp": 0.25719944, + "epoch": 0.3867127611603788, + "flos": 14388292897920.0, + "grad_norm": 67.67262516554253, + "language_loss": 0.8722856, + "learning_rate": 2.8072184778215004e-06, + "loss": 0.89128155, + "num_input_tokens_seen": 138114735, + "router_z_loss_clip": 3.12109375, + "router_z_loss_mlp": 0.33789062, + "step": 6432, + "time_per_iteration": 2.6436076164245605 + }, + { + "auxiliary_loss_clip": 0.01609774, + "auxiliary_loss_mlp": 0.00306545, + "balance_loss_clip": 1.29203081, + "balance_loss_mlp": 0.27195033, + "epoch": 0.38677288441304675, + "flos": 20010754823040.0, + "grad_norm": 15.029161225752137, + "language_loss": 0.90068674, + "learning_rate": 2.806862131772779e-06, + "loss": 0.91984999, + "num_input_tokens_seen": 138130480, + "router_z_loss_clip": 3.17578125, + "router_z_loss_mlp": 0.34594727, + "step": 6433, + "time_per_iteration": 2.7007317543029785 + }, + { + "auxiliary_loss_clip": 0.01597732, + "auxiliary_loss_mlp": 0.00252346, + "balance_loss_clip": 1.28952813, + "balance_loss_mlp": 0.2170601, + "epoch": 0.3868330076657147, + "flos": 22236893147520.0, + "grad_norm": 226.3857086724911, + "language_loss": 0.78739607, + "learning_rate": 2.806505755127765e-06, + "loss": 0.80589688, + "num_input_tokens_seen": 138150640, + "router_z_loss_clip": 3.08007812, + "router_z_loss_mlp": 0.3527832, + "step": 6434, + "time_per_iteration": 4.0895140171051025 + }, + { + "auxiliary_loss_clip": 0.01596896, + "auxiliary_loss_mlp": 0.00292639, + "balance_loss_clip": 1.27986789, + "balance_loss_mlp": 0.25916541, + "epoch": 0.3868931309183827, + "flos": 16727442387840.0, + "grad_norm": 6.82635709548058, + "language_loss": 0.87253559, + "learning_rate": 2.806149347899972e-06, + "loss": 0.89143097, + "num_input_tokens_seen": 138169700, + "router_z_loss_clip": 3.17382812, + "router_z_loss_mlp": 0.33496094, + "step": 6435, + "time_per_iteration": 2.6448957920074463 + }, + { + "auxiliary_loss_clip": 0.01584944, + "auxiliary_loss_mlp": 0.00290784, + "balance_loss_clip": 1.28063023, + "balance_loss_mlp": 0.25816792, + "epoch": 0.38695325417105064, + "flos": 22674716023680.0, + "grad_norm": 123.73818269155582, + "language_loss": 0.85604489, + "learning_rate": 2.805792910102915e-06, + "loss": 0.87480217, + "num_input_tokens_seen": 138185835, + "router_z_loss_clip": 3.0390625, + "router_z_loss_mlp": 0.32592773, + "step": 6436, + "time_per_iteration": 4.135608434677124 + }, + { + "auxiliary_loss_clip": 0.01573145, + "auxiliary_loss_mlp": 0.00284238, + "balance_loss_clip": 1.27835107, + "balance_loss_mlp": 0.25240889, + "epoch": 0.3870133774237186, + "flos": 23112036109440.0, + "grad_norm": 2.5520577152469612, + "language_loss": 0.82380152, + "learning_rate": 2.8054364417501093e-06, + "loss": 0.8423754, + "num_input_tokens_seen": 138204080, + "router_z_loss_clip": 2.94726562, + "router_z_loss_mlp": 0.31835938, + "step": 6437, + "time_per_iteration": 2.682001829147339 + }, + { + "auxiliary_loss_clip": 0.01579868, + "auxiliary_loss_mlp": 0.0028608, + "balance_loss_clip": 1.28154874, + "balance_loss_mlp": 0.25396538, + "epoch": 0.3870735006763866, + "flos": 17675699483520.0, + "grad_norm": 18.879839620250902, + "language_loss": 0.89580423, + "learning_rate": 2.805079942855074e-06, + "loss": 0.9144637, + "num_input_tokens_seen": 138220710, + "router_z_loss_clip": 2.984375, + "router_z_loss_mlp": 0.32128906, + "step": 6438, + "time_per_iteration": 2.666818857192993 + }, + { + "auxiliary_loss_clip": 0.01588133, + "auxiliary_loss_mlp": 0.00296544, + "balance_loss_clip": 1.28598094, + "balance_loss_mlp": 0.26376134, + "epoch": 0.38713362392905454, + "flos": 23295791111040.0, + "grad_norm": 11.106264734864858, + "language_loss": 0.81345201, + "learning_rate": 2.804723413431326e-06, + "loss": 0.83229882, + "num_input_tokens_seen": 138241720, + "router_z_loss_clip": 3.0234375, + "router_z_loss_mlp": 0.32788086, + "step": 6439, + "time_per_iteration": 4.177370309829712 + }, + { + "auxiliary_loss_clip": 0.0158238, + "auxiliary_loss_mlp": 0.00265458, + "balance_loss_clip": 1.28490436, + "balance_loss_mlp": 0.23606052, + "epoch": 0.38719374718172256, + "flos": 21031192298880.0, + "grad_norm": 2698.8115049238877, + "language_loss": 0.79856002, + "learning_rate": 2.8043668534923855e-06, + "loss": 0.81703842, + "num_input_tokens_seen": 138261885, + "router_z_loss_clip": 2.97265625, + "router_z_loss_mlp": 0.29394531, + "step": 6440, + "time_per_iteration": 2.8905861377716064 + }, + { + "auxiliary_loss_clip": 0.01582491, + "auxiliary_loss_mlp": 0.00295182, + "balance_loss_clip": 1.27567124, + "balance_loss_mlp": 0.26037312, + "epoch": 0.3872538704343905, + "flos": 19609776322560.0, + "grad_norm": 19.98204487847169, + "language_loss": 0.89891291, + "learning_rate": 2.804010263051774e-06, + "loss": 0.91768968, + "num_input_tokens_seen": 138280255, + "router_z_loss_clip": 3.06445312, + "router_z_loss_mlp": 0.34814453, + "step": 6441, + "time_per_iteration": 2.7068021297454834 + }, + { + "auxiliary_loss_clip": 0.01579005, + "auxiliary_loss_mlp": 0.00284305, + "balance_loss_clip": 1.28051805, + "balance_loss_mlp": 0.25183189, + "epoch": 0.3873139936870585, + "flos": 17530045833600.0, + "grad_norm": 30.846464127102777, + "language_loss": 0.89172351, + "learning_rate": 2.8036536421230118e-06, + "loss": 0.91035664, + "num_input_tokens_seen": 138296675, + "router_z_loss_clip": 2.99023438, + "router_z_loss_mlp": 0.32446289, + "step": 6442, + "time_per_iteration": 2.63509464263916 + }, + { + "auxiliary_loss_clip": 0.01578801, + "auxiliary_loss_mlp": 0.00279398, + "balance_loss_clip": 1.27762032, + "balance_loss_mlp": 0.2464962, + "epoch": 0.38737411693972645, + "flos": 17786555832960.0, + "grad_norm": 6.033118273186808, + "language_loss": 0.91615719, + "learning_rate": 2.803296990719624e-06, + "loss": 0.93473911, + "num_input_tokens_seen": 138314985, + "router_z_loss_clip": 3.01367188, + "router_z_loss_mlp": 0.32910156, + "step": 6443, + "time_per_iteration": 2.7195403575897217 + }, + { + "auxiliary_loss_clip": 0.01370931, + "auxiliary_loss_mlp": 0.00102426, + "balance_loss_clip": 1.20647013, + "balance_loss_mlp": 0.09451038, + "epoch": 0.3874342401923944, + "flos": 58304637048960.0, + "grad_norm": 0.8260066586418422, + "language_loss": 0.5030992, + "learning_rate": 2.8029403088551327e-06, + "loss": 0.51783276, + "num_input_tokens_seen": 138373275, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.07910156, + "step": 6444, + "time_per_iteration": 3.1782565116882324 + }, + { + "auxiliary_loss_clip": 0.01564572, + "auxiliary_loss_mlp": 0.00276169, + "balance_loss_clip": 1.27235365, + "balance_loss_mlp": 0.24453072, + "epoch": 0.3874943634450624, + "flos": 17711933328000.0, + "grad_norm": 5.132340380019124, + "language_loss": 0.84929276, + "learning_rate": 2.802583596543065e-06, + "loss": 0.86770022, + "num_input_tokens_seen": 138391145, + "router_z_loss_clip": 2.921875, + "router_z_loss_mlp": 0.31640625, + "step": 6445, + "time_per_iteration": 2.6092607975006104 + }, + { + "auxiliary_loss_clip": 0.01549872, + "auxiliary_loss_mlp": 0.00289635, + "balance_loss_clip": 1.25933111, + "balance_loss_mlp": 0.2563042, + "epoch": 0.38755448669773035, + "flos": 19244852098560.0, + "grad_norm": 30.034406786312978, + "language_loss": 0.88520896, + "learning_rate": 2.8022268537969474e-06, + "loss": 0.90360403, + "num_input_tokens_seen": 138409875, + "router_z_loss_clip": 2.91015625, + "router_z_loss_mlp": 0.33349609, + "step": 6446, + "time_per_iteration": 4.07696795463562 + }, + { + "auxiliary_loss_clip": 0.01546041, + "auxiliary_loss_mlp": 0.00280965, + "balance_loss_clip": 1.25525045, + "balance_loss_mlp": 0.24730068, + "epoch": 0.3876146099503983, + "flos": 20594267262720.0, + "grad_norm": 6.204593487446583, + "language_loss": 0.84951806, + "learning_rate": 2.801870080630306e-06, + "loss": 0.86778808, + "num_input_tokens_seen": 138428965, + "router_z_loss_clip": 2.90820312, + "router_z_loss_mlp": 0.33691406, + "step": 6447, + "time_per_iteration": 2.6207103729248047 + }, + { + "auxiliary_loss_clip": 0.01544966, + "auxiliary_loss_mlp": 0.00301008, + "balance_loss_clip": 1.2588743, + "balance_loss_mlp": 0.26934642, + "epoch": 0.3876747332030663, + "flos": 19281121856640.0, + "grad_norm": 10.832703160301248, + "language_loss": 0.83466738, + "learning_rate": 2.801513277056671e-06, + "loss": 0.85312712, + "num_input_tokens_seen": 138448090, + "router_z_loss_clip": 2.86132812, + "router_z_loss_mlp": 0.31640625, + "step": 6448, + "time_per_iteration": 2.6414225101470947 + }, + { + "auxiliary_loss_clip": 0.01534666, + "auxiliary_loss_mlp": 0.00250908, + "balance_loss_clip": 1.24921179, + "balance_loss_mlp": 0.22184475, + "epoch": 0.38773485645573424, + "flos": 18945895201920.0, + "grad_norm": 107.80117643773491, + "language_loss": 0.80577034, + "learning_rate": 2.8011564430895725e-06, + "loss": 0.82362616, + "num_input_tokens_seen": 138466105, + "router_z_loss_clip": 2.85546875, + "router_z_loss_mlp": 0.29077148, + "step": 6449, + "time_per_iteration": 2.6069836616516113 + }, + { + "auxiliary_loss_clip": 0.01502054, + "auxiliary_loss_mlp": 0.00277053, + "balance_loss_clip": 1.21885681, + "balance_loss_mlp": 0.24341182, + "epoch": 0.3877949797084022, + "flos": 23071348978560.0, + "grad_norm": 150128.5785214675, + "language_loss": 0.86373401, + "learning_rate": 2.800799578742542e-06, + "loss": 0.88152504, + "num_input_tokens_seen": 138485160, + "router_z_loss_clip": 2.83007812, + "router_z_loss_mlp": 0.33618164, + "step": 6450, + "time_per_iteration": 2.703974962234497 + }, + { + "auxiliary_loss_clip": 0.01509763, + "auxiliary_loss_mlp": 0.00276708, + "balance_loss_clip": 1.22497845, + "balance_loss_mlp": 0.24434228, + "epoch": 0.3878551029610702, + "flos": 29095543589760.0, + "grad_norm": 23.490848056333736, + "language_loss": 0.83431888, + "learning_rate": 2.8004426840291106e-06, + "loss": 0.8521837, + "num_input_tokens_seen": 138504135, + "router_z_loss_clip": 2.84765625, + "router_z_loss_mlp": 0.3236084, + "step": 6451, + "time_per_iteration": 2.7084262371063232 + }, + { + "auxiliary_loss_clip": 0.01475519, + "auxiliary_loss_mlp": 0.00260851, + "balance_loss_clip": 1.2064693, + "balance_loss_mlp": 0.22940361, + "epoch": 0.38791522621373814, + "flos": 20996394998400.0, + "grad_norm": 275.6890235506067, + "language_loss": 0.84946412, + "learning_rate": 2.800085758962812e-06, + "loss": 0.86682785, + "num_input_tokens_seen": 138523955, + "router_z_loss_clip": 2.68554688, + "router_z_loss_mlp": 0.31420898, + "step": 6452, + "time_per_iteration": 2.7136781215667725 + }, + { + "auxiliary_loss_clip": 0.01483229, + "auxiliary_loss_mlp": 0.0028688, + "balance_loss_clip": 1.21269083, + "balance_loss_mlp": 0.25488454, + "epoch": 0.3879753494664061, + "flos": 15486836497920.0, + "grad_norm": 2.2685780771495714, + "language_loss": 0.85421801, + "learning_rate": 2.799728803557182e-06, + "loss": 0.87191916, + "num_input_tokens_seen": 138541655, + "router_z_loss_clip": 2.70703125, + "router_z_loss_mlp": 0.31958008, + "step": 6453, + "time_per_iteration": 2.6290056705474854 + }, + { + "auxiliary_loss_clip": 0.01483115, + "auxiliary_loss_mlp": 0.00289799, + "balance_loss_clip": 1.20601773, + "balance_loss_mlp": 0.25904316, + "epoch": 0.3880354727190741, + "flos": 22053964158720.0, + "grad_norm": 9.443094203125678, + "language_loss": 0.78644931, + "learning_rate": 2.7993718178257555e-06, + "loss": 0.80417842, + "num_input_tokens_seen": 138560860, + "router_z_loss_clip": 2.77539062, + "router_z_loss_mlp": 0.30761719, + "step": 6454, + "time_per_iteration": 2.714240550994873 + }, + { + "auxiliary_loss_clip": 0.01478986, + "auxiliary_loss_mlp": 0.00321217, + "balance_loss_clip": 1.20665097, + "balance_loss_mlp": 0.28745663, + "epoch": 0.3880955959717421, + "flos": 20340307128960.0, + "grad_norm": 3.0111833756054516, + "language_loss": 0.86041421, + "learning_rate": 2.7990148017820694e-06, + "loss": 0.87841618, + "num_input_tokens_seen": 138580200, + "router_z_loss_clip": 2.72070312, + "router_z_loss_mlp": 0.33764648, + "step": 6455, + "time_per_iteration": 2.6649327278137207 + }, + { + "auxiliary_loss_clip": 0.01469797, + "auxiliary_loss_mlp": 0.00300665, + "balance_loss_clip": 1.20012069, + "balance_loss_mlp": 0.26561701, + "epoch": 0.38815571922441006, + "flos": 23075407215360.0, + "grad_norm": 2.473031468023308, + "language_loss": 0.82208121, + "learning_rate": 2.798657755439662e-06, + "loss": 0.83978587, + "num_input_tokens_seen": 138598315, + "router_z_loss_clip": 2.69335938, + "router_z_loss_mlp": 0.35058594, + "step": 6456, + "time_per_iteration": 2.6733102798461914 + }, + { + "auxiliary_loss_clip": 0.01460819, + "auxiliary_loss_mlp": 0.00289582, + "balance_loss_clip": 1.19465506, + "balance_loss_mlp": 0.25708562, + "epoch": 0.388215842477078, + "flos": 20776944856320.0, + "grad_norm": 5.718243065887521, + "language_loss": 0.71112895, + "learning_rate": 2.7983006788120726e-06, + "loss": 0.72863293, + "num_input_tokens_seen": 138615695, + "router_z_loss_clip": 2.66015625, + "router_z_loss_mlp": 0.32495117, + "step": 6457, + "time_per_iteration": 2.691025495529175 + }, + { + "auxiliary_loss_clip": 0.01460693, + "auxiliary_loss_mlp": 0.00295313, + "balance_loss_clip": 1.1918987, + "balance_loss_mlp": 0.26148093, + "epoch": 0.388275965729746, + "flos": 20448182649600.0, + "grad_norm": 5.693040832832106, + "language_loss": 0.90197134, + "learning_rate": 2.797943571912841e-06, + "loss": 0.91953135, + "num_input_tokens_seen": 138633180, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.33837891, + "step": 6458, + "time_per_iteration": 2.6768531799316406 + }, + { + "auxiliary_loss_clip": 0.01456689, + "auxiliary_loss_mlp": 0.00282294, + "balance_loss_clip": 1.19195271, + "balance_loss_mlp": 0.25184757, + "epoch": 0.38833608898241395, + "flos": 27892392606720.0, + "grad_norm": 19.50981090791896, + "language_loss": 0.87809575, + "learning_rate": 2.797586434755509e-06, + "loss": 0.89548552, + "num_input_tokens_seen": 138654785, + "router_z_loss_clip": 2.6484375, + "router_z_loss_mlp": 0.30444336, + "step": 6459, + "time_per_iteration": 2.7084097862243652 + }, + { + "auxiliary_loss_clip": 0.01447782, + "auxiliary_loss_mlp": 0.00291323, + "balance_loss_clip": 1.18515897, + "balance_loss_mlp": 0.26135367, + "epoch": 0.3883962122350819, + "flos": 18076390675200.0, + "grad_norm": 13.490096370367871, + "language_loss": 0.69605803, + "learning_rate": 2.7972292673536202e-06, + "loss": 0.71344912, + "num_input_tokens_seen": 138673330, + "router_z_loss_clip": 2.62695312, + "router_z_loss_mlp": 0.29956055, + "step": 6460, + "time_per_iteration": 2.657027244567871 + }, + { + "auxiliary_loss_clip": 0.01444232, + "auxiliary_loss_mlp": 0.00272149, + "balance_loss_clip": 1.18625951, + "balance_loss_mlp": 0.24163094, + "epoch": 0.3884563354877499, + "flos": 23622254847360.0, + "grad_norm": 5.654545742733878, + "language_loss": 0.91951966, + "learning_rate": 2.796872069720717e-06, + "loss": 0.93668342, + "num_input_tokens_seen": 138694185, + "router_z_loss_clip": 2.58007812, + "router_z_loss_mlp": 0.30517578, + "step": 6461, + "time_per_iteration": 2.704819440841675 + }, + { + "auxiliary_loss_clip": 0.01432322, + "auxiliary_loss_mlp": 0.00290806, + "balance_loss_clip": 1.17508864, + "balance_loss_mlp": 0.25902432, + "epoch": 0.38851645874041785, + "flos": 27453528236160.0, + "grad_norm": 19.76220759751131, + "language_loss": 0.8195641, + "learning_rate": 2.7965148418703456e-06, + "loss": 0.83679533, + "num_input_tokens_seen": 138714625, + "router_z_loss_clip": 2.56640625, + "router_z_loss_mlp": 0.31738281, + "step": 6462, + "time_per_iteration": 2.7301876544952393 + }, + { + "auxiliary_loss_clip": 0.01437326, + "auxiliary_loss_mlp": 0.00274832, + "balance_loss_clip": 1.17747688, + "balance_loss_mlp": 0.24343227, + "epoch": 0.3885765819930858, + "flos": 25228072270080.0, + "grad_norm": 38.02205846940823, + "language_loss": 0.84303319, + "learning_rate": 2.796157583816052e-06, + "loss": 0.86015475, + "num_input_tokens_seen": 138733585, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.31396484, + "step": 6463, + "time_per_iteration": 2.655244827270508 + }, + { + "auxiliary_loss_clip": 0.01458901, + "auxiliary_loss_mlp": 0.00303213, + "balance_loss_clip": 1.18895221, + "balance_loss_mlp": 0.26876202, + "epoch": 0.3886367052457538, + "flos": 16946605221120.0, + "grad_norm": 15.80373461753734, + "language_loss": 0.80689025, + "learning_rate": 2.795800295571382e-06, + "loss": 0.82451129, + "num_input_tokens_seen": 138752335, + "router_z_loss_clip": 2.69921875, + "router_z_loss_mlp": 0.34448242, + "step": 6464, + "time_per_iteration": 2.646345853805542 + }, + { + "auxiliary_loss_clip": 0.01434175, + "auxiliary_loss_mlp": 0.00286145, + "balance_loss_clip": 1.17891002, + "balance_loss_mlp": 0.25455415, + "epoch": 0.38869682849842174, + "flos": 27154140376320.0, + "grad_norm": 3.6812497429866804, + "language_loss": 0.75412452, + "learning_rate": 2.7954429771498858e-06, + "loss": 0.77132773, + "num_input_tokens_seen": 138768450, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.31567383, + "step": 6465, + "time_per_iteration": 2.660212516784668 + }, + { + "auxiliary_loss_clip": 0.01434381, + "auxiliary_loss_mlp": 0.00303956, + "balance_loss_clip": 1.17466354, + "balance_loss_mlp": 0.26974303, + "epoch": 0.3887569517510897, + "flos": 21063619301760.0, + "grad_norm": 24.065312226753637, + "language_loss": 0.84232205, + "learning_rate": 2.7950856285651117e-06, + "loss": 0.85970545, + "num_input_tokens_seen": 138786775, + "router_z_loss_clip": 2.59960938, + "router_z_loss_mlp": 0.34204102, + "step": 6466, + "time_per_iteration": 2.62174129486084 + }, + { + "auxiliary_loss_clip": 0.01426984, + "auxiliary_loss_mlp": 0.00282846, + "balance_loss_clip": 1.16610289, + "balance_loss_mlp": 0.25182778, + "epoch": 0.38881707500375773, + "flos": 29497384016640.0, + "grad_norm": 390.7304449220014, + "language_loss": 0.75837481, + "learning_rate": 2.794728249830611e-06, + "loss": 0.77547312, + "num_input_tokens_seen": 138810100, + "router_z_loss_clip": 2.60546875, + "router_z_loss_mlp": 0.31005859, + "step": 6467, + "time_per_iteration": 2.6900274753570557 + }, + { + "auxiliary_loss_clip": 0.01421467, + "auxiliary_loss_mlp": 0.00274472, + "balance_loss_clip": 1.16308141, + "balance_loss_mlp": 0.2435368, + "epoch": 0.3888771982564257, + "flos": 17488281294720.0, + "grad_norm": 7.253526309738377, + "language_loss": 0.92756289, + "learning_rate": 2.794370840959936e-06, + "loss": 0.94452232, + "num_input_tokens_seen": 138825140, + "router_z_loss_clip": 2.58203125, + "router_z_loss_mlp": 0.3092041, + "step": 6468, + "time_per_iteration": 2.614870071411133 + }, + { + "auxiliary_loss_clip": 0.01406229, + "auxiliary_loss_mlp": 0.00270576, + "balance_loss_clip": 1.15689349, + "balance_loss_mlp": 0.24078512, + "epoch": 0.38893732150909366, + "flos": 21942425450880.0, + "grad_norm": 84.70971845098208, + "language_loss": 0.91145504, + "learning_rate": 2.7940134019666383e-06, + "loss": 0.92822313, + "num_input_tokens_seen": 138844115, + "router_z_loss_clip": 2.49023438, + "router_z_loss_mlp": 0.29772949, + "step": 6469, + "time_per_iteration": 2.659018039703369 + }, + { + "auxiliary_loss_clip": 0.01428588, + "auxiliary_loss_mlp": 0.00254136, + "balance_loss_clip": 1.17310143, + "balance_loss_mlp": 0.22399974, + "epoch": 0.3889974447617616, + "flos": 24276367468800.0, + "grad_norm": 316.4226848799966, + "language_loss": 0.80768013, + "learning_rate": 2.793655932864273e-06, + "loss": 0.82450736, + "num_input_tokens_seen": 138860860, + "router_z_loss_clip": 2.55664062, + "router_z_loss_mlp": 0.30151367, + "step": 6470, + "time_per_iteration": 2.6421797275543213 + }, + { + "auxiliary_loss_clip": 0.01411961, + "auxiliary_loss_mlp": 0.0026701, + "balance_loss_clip": 1.1607244, + "balance_loss_mlp": 0.23601538, + "epoch": 0.3890575680144296, + "flos": 25667116208640.0, + "grad_norm": 122.90887690607924, + "language_loss": 0.8293916, + "learning_rate": 2.7932984336663953e-06, + "loss": 0.84618127, + "num_input_tokens_seen": 138881910, + "router_z_loss_clip": 2.515625, + "router_z_loss_mlp": 0.30969238, + "step": 6471, + "time_per_iteration": 2.6981358528137207 + }, + { + "auxiliary_loss_clip": 0.0141125, + "auxiliary_loss_mlp": 0.00267756, + "balance_loss_clip": 1.1581924, + "balance_loss_mlp": 0.23583147, + "epoch": 0.38911769126709755, + "flos": 22855274714880.0, + "grad_norm": 3.6447668409396865, + "language_loss": 0.72261649, + "learning_rate": 2.792940904386562e-06, + "loss": 0.73940659, + "num_input_tokens_seen": 138900975, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.3190918, + "step": 6472, + "time_per_iteration": 2.6294267177581787 + }, + { + "auxiliary_loss_clip": 0.01411366, + "auxiliary_loss_mlp": 0.00288541, + "balance_loss_clip": 1.15668678, + "balance_loss_mlp": 0.25582981, + "epoch": 0.3891778145197655, + "flos": 25447522412160.0, + "grad_norm": 12.411221239543886, + "language_loss": 0.82560021, + "learning_rate": 2.7925833450383293e-06, + "loss": 0.84259927, + "num_input_tokens_seen": 138920795, + "router_z_loss_clip": 2.546875, + "router_z_loss_mlp": 0.32763672, + "step": 6473, + "time_per_iteration": 2.6768908500671387 + }, + { + "auxiliary_loss_clip": 0.01427805, + "auxiliary_loss_mlp": 0.00273685, + "balance_loss_clip": 1.1692313, + "balance_loss_mlp": 0.24235702, + "epoch": 0.3892379377724335, + "flos": 14027965614720.0, + "grad_norm": 100.8966622166671, + "language_loss": 0.81948251, + "learning_rate": 2.792225755635257e-06, + "loss": 0.83649743, + "num_input_tokens_seen": 138938770, + "router_z_loss_clip": 2.58789062, + "router_z_loss_mlp": 0.31323242, + "step": 6474, + "time_per_iteration": 2.635230541229248 + }, + { + "auxiliary_loss_clip": 0.01410296, + "auxiliary_loss_mlp": 0.00233525, + "balance_loss_clip": 1.15518653, + "balance_loss_mlp": 0.20520091, + "epoch": 0.38929806102510145, + "flos": 20157449967360.0, + "grad_norm": 6.052489268162658, + "language_loss": 0.74852842, + "learning_rate": 2.7918681361909046e-06, + "loss": 0.76496661, + "num_input_tokens_seen": 138958880, + "router_z_loss_clip": 2.55664062, + "router_z_loss_mlp": 0.28369141, + "step": 6475, + "time_per_iteration": 2.7633635997772217 + }, + { + "auxiliary_loss_clip": 0.01436909, + "auxiliary_loss_mlp": 0.00295619, + "balance_loss_clip": 1.17230105, + "balance_loss_mlp": 0.2630986, + "epoch": 0.3893581842777694, + "flos": 22163958581760.0, + "grad_norm": 6.947620539027206, + "language_loss": 0.81704456, + "learning_rate": 2.7915104867188332e-06, + "loss": 0.83436984, + "num_input_tokens_seen": 138977240, + "router_z_loss_clip": 2.65039062, + "router_z_loss_mlp": 0.32568359, + "step": 6476, + "time_per_iteration": 4.141119480133057 + }, + { + "auxiliary_loss_clip": 0.01335984, + "auxiliary_loss_mlp": 0.00234733, + "balance_loss_clip": 1.17702293, + "balance_loss_mlp": 0.22348003, + "epoch": 0.3894183075304374, + "flos": 67301877392640.0, + "grad_norm": 0.8503068609658352, + "language_loss": 0.57724059, + "learning_rate": 2.7911528072326055e-06, + "loss": 0.59294784, + "num_input_tokens_seen": 139039035, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.11230469, + "step": 6477, + "time_per_iteration": 3.147035837173462 + }, + { + "auxiliary_loss_clip": 0.01438575, + "auxiliary_loss_mlp": 0.0025765, + "balance_loss_clip": 1.17559969, + "balance_loss_mlp": 0.22503397, + "epoch": 0.38947843078310534, + "flos": 18547502480640.0, + "grad_norm": 87.85736850978014, + "language_loss": 0.85254198, + "learning_rate": 2.7907950977457832e-06, + "loss": 0.86950421, + "num_input_tokens_seen": 139055560, + "router_z_loss_clip": 2.6328125, + "router_z_loss_mlp": 0.32617188, + "step": 6478, + "time_per_iteration": 2.653440475463867 + }, + { + "auxiliary_loss_clip": 0.01424014, + "auxiliary_loss_mlp": 0.00269686, + "balance_loss_clip": 1.16432643, + "balance_loss_mlp": 0.23761836, + "epoch": 0.3895385540357733, + "flos": 14605875532800.0, + "grad_norm": 112.00906850600344, + "language_loss": 0.9098047, + "learning_rate": 2.7904373582719317e-06, + "loss": 0.92674172, + "num_input_tokens_seen": 139071865, + "router_z_loss_clip": 2.59375, + "router_z_loss_mlp": 0.32055664, + "step": 6479, + "time_per_iteration": 4.125814914703369 + }, + { + "auxiliary_loss_clip": 0.01429806, + "auxiliary_loss_mlp": 0.00278053, + "balance_loss_clip": 1.16953826, + "balance_loss_mlp": 0.24885869, + "epoch": 0.38959867728844133, + "flos": 19975203336960.0, + "grad_norm": 3.990272310302621, + "language_loss": 0.87720668, + "learning_rate": 2.790079588824617e-06, + "loss": 0.89428532, + "num_input_tokens_seen": 139089640, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.29199219, + "step": 6480, + "time_per_iteration": 2.62803053855896 + }, + { + "auxiliary_loss_clip": 0.0142196, + "auxiliary_loss_mlp": 0.00276557, + "balance_loss_clip": 1.16519094, + "balance_loss_mlp": 0.24729146, + "epoch": 0.3896588005411093, + "flos": 22672130244480.0, + "grad_norm": 167.76552286998233, + "language_loss": 0.89203322, + "learning_rate": 2.7897217894174038e-06, + "loss": 0.9090184, + "num_input_tokens_seen": 139109365, + "router_z_loss_clip": 2.56445312, + "router_z_loss_mlp": 0.2923584, + "step": 6481, + "time_per_iteration": 4.086307048797607 + }, + { + "auxiliary_loss_clip": 0.01427086, + "auxiliary_loss_mlp": 0.002531, + "balance_loss_clip": 1.16730785, + "balance_loss_mlp": 0.22384559, + "epoch": 0.38971892379377726, + "flos": 20996035862400.0, + "grad_norm": 64.53872556247816, + "language_loss": 0.81756747, + "learning_rate": 2.789363960063863e-06, + "loss": 0.8343693, + "num_input_tokens_seen": 139128260, + "router_z_loss_clip": 2.59960938, + "router_z_loss_mlp": 0.29248047, + "step": 6482, + "time_per_iteration": 2.6340091228485107 + }, + { + "auxiliary_loss_clip": 0.01424119, + "auxiliary_loss_mlp": 0.00282219, + "balance_loss_clip": 1.16346622, + "balance_loss_mlp": 0.25196365, + "epoch": 0.3897790470464452, + "flos": 22528487756160.0, + "grad_norm": 9.087926678013908, + "language_loss": 0.86136615, + "learning_rate": 2.78900610077756e-06, + "loss": 0.87842953, + "num_input_tokens_seen": 139147315, + "router_z_loss_clip": 2.61132812, + "router_z_loss_mlp": 0.30285645, + "step": 6483, + "time_per_iteration": 2.6710658073425293 + }, + { + "auxiliary_loss_clip": 0.01429467, + "auxiliary_loss_mlp": 0.00272453, + "balance_loss_clip": 1.16353536, + "balance_loss_mlp": 0.2389079, + "epoch": 0.3898391702991132, + "flos": 26209905603840.0, + "grad_norm": 10.662940204439261, + "language_loss": 0.83993655, + "learning_rate": 2.788648211572067e-06, + "loss": 0.85695577, + "num_input_tokens_seen": 139167270, + "router_z_loss_clip": 2.66015625, + "router_z_loss_mlp": 0.33544922, + "step": 6484, + "time_per_iteration": 2.6716723442077637 + }, + { + "auxiliary_loss_clip": 0.01438534, + "auxiliary_loss_mlp": 0.00266349, + "balance_loss_clip": 1.17478621, + "balance_loss_mlp": 0.23366177, + "epoch": 0.38989929355178116, + "flos": 21065558636160.0, + "grad_norm": 5.089714065552633, + "language_loss": 0.88113916, + "learning_rate": 2.7882902924609557e-06, + "loss": 0.89818799, + "num_input_tokens_seen": 139185970, + "router_z_loss_clip": 2.63476562, + "router_z_loss_mlp": 0.3269043, + "step": 6485, + "time_per_iteration": 2.6687333583831787 + }, + { + "auxiliary_loss_clip": 0.01425281, + "auxiliary_loss_mlp": 0.00289624, + "balance_loss_clip": 1.16135454, + "balance_loss_mlp": 0.25731796, + "epoch": 0.3899594168044491, + "flos": 25484115392640.0, + "grad_norm": 23.169141006593062, + "language_loss": 0.93261385, + "learning_rate": 2.7879323434577965e-06, + "loss": 0.94976294, + "num_input_tokens_seen": 139203730, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.32299805, + "step": 6486, + "time_per_iteration": 2.8332126140594482 + }, + { + "auxiliary_loss_clip": 0.0143308, + "auxiliary_loss_mlp": 0.00280427, + "balance_loss_clip": 1.16514921, + "balance_loss_mlp": 0.24876457, + "epoch": 0.3900195400571171, + "flos": 31139363456640.0, + "grad_norm": 3.837766818699953, + "language_loss": 0.92742109, + "learning_rate": 2.7875743645761645e-06, + "loss": 0.94455612, + "num_input_tokens_seen": 139222560, + "router_z_loss_clip": 2.6796875, + "router_z_loss_mlp": 0.31665039, + "step": 6487, + "time_per_iteration": 2.7519614696502686 + }, + { + "auxiliary_loss_clip": 0.01416208, + "auxiliary_loss_mlp": 0.00267963, + "balance_loss_clip": 1.15517938, + "balance_loss_mlp": 0.23656285, + "epoch": 0.39007966330978505, + "flos": 20229917656320.0, + "grad_norm": 2.2524855853662618, + "language_loss": 0.8090263, + "learning_rate": 2.787216355829633e-06, + "loss": 0.82586801, + "num_input_tokens_seen": 139242165, + "router_z_loss_clip": 2.609375, + "router_z_loss_mlp": 0.31396484, + "step": 6488, + "time_per_iteration": 4.089753150939941 + }, + { + "auxiliary_loss_clip": 0.01449672, + "auxiliary_loss_mlp": 0.00326326, + "balance_loss_clip": 1.17767644, + "balance_loss_mlp": 0.29208916, + "epoch": 0.390139786562453, + "flos": 22528739151360.0, + "grad_norm": 3.0131082360640455, + "language_loss": 0.7688266, + "learning_rate": 2.786858317231779e-06, + "loss": 0.78658664, + "num_input_tokens_seen": 139262525, + "router_z_loss_clip": 2.71875, + "router_z_loss_mlp": 0.34277344, + "step": 6489, + "time_per_iteration": 2.8891658782958984 + }, + { + "auxiliary_loss_clip": 0.01427137, + "auxiliary_loss_mlp": 0.00272821, + "balance_loss_clip": 1.16010761, + "balance_loss_mlp": 0.24366215, + "epoch": 0.390199909815121, + "flos": 26432911192320.0, + "grad_norm": 5.212522280218004, + "language_loss": 0.87238598, + "learning_rate": 2.7865002487961788e-06, + "loss": 0.88938558, + "num_input_tokens_seen": 139282835, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.29150391, + "step": 6490, + "time_per_iteration": 2.7837557792663574 + }, + { + "auxiliary_loss_clip": 0.0142147, + "auxiliary_loss_mlp": 0.00303798, + "balance_loss_clip": 1.1573689, + "balance_loss_mlp": 0.27232683, + "epoch": 0.39026003306778895, + "flos": 17274577328640.0, + "grad_norm": 2.848701300704279, + "language_loss": 0.98184299, + "learning_rate": 2.7861421505364104e-06, + "loss": 0.99909568, + "num_input_tokens_seen": 139299490, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.31494141, + "step": 6491, + "time_per_iteration": 2.6340012550354004 + }, + { + "auxiliary_loss_clip": 0.01434014, + "auxiliary_loss_mlp": 0.00267586, + "balance_loss_clip": 1.16238439, + "balance_loss_mlp": 0.23597193, + "epoch": 0.3903201563204569, + "flos": 24532841554560.0, + "grad_norm": 30.924248528339277, + "language_loss": 0.84281451, + "learning_rate": 2.7857840224660523e-06, + "loss": 0.8598305, + "num_input_tokens_seen": 139317865, + "router_z_loss_clip": 2.71484375, + "router_z_loss_mlp": 0.31591797, + "step": 6492, + "time_per_iteration": 2.6429078578948975 + }, + { + "auxiliary_loss_clip": 0.01420595, + "auxiliary_loss_mlp": 0.00265425, + "balance_loss_clip": 1.15781558, + "balance_loss_mlp": 0.2339298, + "epoch": 0.39038027957312493, + "flos": 23767944410880.0, + "grad_norm": 10.59488395269669, + "language_loss": 0.7906003, + "learning_rate": 2.7854258645986857e-06, + "loss": 0.80746055, + "num_input_tokens_seen": 139339840, + "router_z_loss_clip": 2.62695312, + "router_z_loss_mlp": 0.31506348, + "step": 6493, + "time_per_iteration": 2.7049787044525146 + }, + { + "auxiliary_loss_clip": 0.01454511, + "auxiliary_loss_mlp": 0.00302614, + "balance_loss_clip": 1.17862797, + "balance_loss_mlp": 0.26914036, + "epoch": 0.3904404028257929, + "flos": 14100612871680.0, + "grad_norm": 18.362015275945453, + "language_loss": 0.8483628, + "learning_rate": 2.7850676769478916e-06, + "loss": 0.86593407, + "num_input_tokens_seen": 139357555, + "router_z_loss_clip": 2.75585938, + "router_z_loss_mlp": 0.3347168, + "step": 6494, + "time_per_iteration": 2.613114595413208 + }, + { + "auxiliary_loss_clip": 0.01446611, + "auxiliary_loss_mlp": 0.00334195, + "balance_loss_clip": 1.1702013, + "balance_loss_mlp": 0.29733506, + "epoch": 0.39050052607846086, + "flos": 16910048154240.0, + "grad_norm": 72.04328146162683, + "language_loss": 0.84588099, + "learning_rate": 2.7847094595272525e-06, + "loss": 0.86368906, + "num_input_tokens_seen": 139374455, + "router_z_loss_clip": 2.76367188, + "router_z_loss_mlp": 0.36865234, + "step": 6495, + "time_per_iteration": 2.660688877105713 + }, + { + "auxiliary_loss_clip": 0.01441295, + "auxiliary_loss_mlp": 0.00284918, + "balance_loss_clip": 1.17385423, + "balance_loss_mlp": 0.25156337, + "epoch": 0.39056064933112883, + "flos": 25915761129600.0, + "grad_norm": 89.95493582881869, + "language_loss": 0.75954127, + "learning_rate": 2.784351212350352e-06, + "loss": 0.77680331, + "num_input_tokens_seen": 139394770, + "router_z_loss_clip": 2.67578125, + "router_z_loss_mlp": 0.33374023, + "step": 6496, + "time_per_iteration": 2.740514039993286 + }, + { + "auxiliary_loss_clip": 0.01369322, + "auxiliary_loss_mlp": 0.00056358, + "balance_loss_clip": 1.15555072, + "balance_loss_mlp": 0.04748853, + "epoch": 0.3906207725837968, + "flos": 60028421713920.0, + "grad_norm": 0.6654237805044204, + "language_loss": 0.53697115, + "learning_rate": 2.783992935430775e-06, + "loss": 0.55122793, + "num_input_tokens_seen": 139454760, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.08886719, + "step": 6497, + "time_per_iteration": 3.2274329662323 + }, + { + "auxiliary_loss_clip": 0.01431043, + "auxiliary_loss_mlp": 0.0027213, + "balance_loss_clip": 1.16201425, + "balance_loss_mlp": 0.24037269, + "epoch": 0.39068089583646476, + "flos": 21068683119360.0, + "grad_norm": 15.18587398265637, + "language_loss": 0.76631069, + "learning_rate": 2.7836346287821068e-06, + "loss": 0.78334248, + "num_input_tokens_seen": 139472645, + "router_z_loss_clip": 2.6875, + "router_z_loss_mlp": 0.31787109, + "step": 6498, + "time_per_iteration": 2.680330514907837 + }, + { + "auxiliary_loss_clip": 0.01382766, + "auxiliary_loss_mlp": 0.00055918, + "balance_loss_clip": 1.17798579, + "balance_loss_mlp": 0.04571389, + "epoch": 0.3907410190891327, + "flos": 70445677403520.0, + "grad_norm": 0.7232880396868521, + "language_loss": 0.51553869, + "learning_rate": 2.783276292417936e-06, + "loss": 0.52992558, + "num_input_tokens_seen": 139536730, + "router_z_loss_clip": 2.046875, + "router_z_loss_mlp": 0.10205078, + "step": 6499, + "time_per_iteration": 3.1817221641540527 + }, + { + "auxiliary_loss_clip": 0.01429801, + "auxiliary_loss_mlp": 0.00313089, + "balance_loss_clip": 1.15923917, + "balance_loss_mlp": 0.27632487, + "epoch": 0.3908011423418007, + "flos": 27962454084480.0, + "grad_norm": 6.102377875234599, + "language_loss": 0.79938775, + "learning_rate": 2.7829179263518487e-06, + "loss": 0.81681669, + "num_input_tokens_seen": 139557540, + "router_z_loss_clip": 2.70703125, + "router_z_loss_mlp": 0.36791992, + "step": 6500, + "time_per_iteration": 2.7219269275665283 + }, + { + "auxiliary_loss_clip": 0.01457985, + "auxiliary_loss_mlp": 0.00295205, + "balance_loss_clip": 1.18387222, + "balance_loss_mlp": 0.26313788, + "epoch": 0.39086126559446865, + "flos": 24462097718400.0, + "grad_norm": 58.00850906669905, + "language_loss": 0.78002572, + "learning_rate": 2.7825595305974354e-06, + "loss": 0.79755765, + "num_input_tokens_seen": 139576875, + "router_z_loss_clip": 2.74609375, + "router_z_loss_mlp": 0.32006836, + "step": 6501, + "time_per_iteration": 2.7087833881378174 + }, + { + "auxiliary_loss_clip": 0.01441248, + "auxiliary_loss_mlp": 0.0027396, + "balance_loss_clip": 1.17107558, + "balance_loss_mlp": 0.24046238, + "epoch": 0.3909213888471366, + "flos": 16941541403520.0, + "grad_norm": 36.803316140432315, + "language_loss": 0.84421343, + "learning_rate": 2.782201105168287e-06, + "loss": 0.86136544, + "num_input_tokens_seen": 139594295, + "router_z_loss_clip": 2.69726562, + "router_z_loss_mlp": 0.3347168, + "step": 6502, + "time_per_iteration": 2.711869716644287 + }, + { + "auxiliary_loss_clip": 0.01421627, + "auxiliary_loss_mlp": 0.0025604, + "balance_loss_clip": 1.15802431, + "balance_loss_mlp": 0.22633335, + "epoch": 0.3909815120998046, + "flos": 29278400751360.0, + "grad_norm": 6.2762859649316605, + "language_loss": 0.8820107, + "learning_rate": 2.7818426500779932e-06, + "loss": 0.89878738, + "num_input_tokens_seen": 139614080, + "router_z_loss_clip": 2.63476562, + "router_z_loss_mlp": 0.29736328, + "step": 6503, + "time_per_iteration": 2.786430835723877 + }, + { + "auxiliary_loss_clip": 0.01424345, + "auxiliary_loss_mlp": 0.00273259, + "balance_loss_clip": 1.16169, + "balance_loss_mlp": 0.24233589, + "epoch": 0.39104163535247255, + "flos": 18951246328320.0, + "grad_norm": 4.301953823803494, + "language_loss": 0.80164802, + "learning_rate": 2.7814841653401485e-06, + "loss": 0.81862414, + "num_input_tokens_seen": 139632755, + "router_z_loss_clip": 2.62695312, + "router_z_loss_mlp": 0.30908203, + "step": 6504, + "time_per_iteration": 2.692943811416626 + }, + { + "auxiliary_loss_clip": 0.01415772, + "auxiliary_loss_mlp": 0.00261054, + "balance_loss_clip": 1.15145242, + "balance_loss_mlp": 0.22741327, + "epoch": 0.3911017586051405, + "flos": 26323347732480.0, + "grad_norm": 258.71196887508677, + "language_loss": 0.88998842, + "learning_rate": 2.7811256509683454e-06, + "loss": 0.90675676, + "num_input_tokens_seen": 139654205, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.33642578, + "step": 6505, + "time_per_iteration": 2.7082085609436035 + }, + { + "auxiliary_loss_clip": 0.01426775, + "auxiliary_loss_mlp": 0.00259905, + "balance_loss_clip": 1.16017079, + "balance_loss_mlp": 0.22724116, + "epoch": 0.3911618818578085, + "flos": 21835770992640.0, + "grad_norm": 1981.0411405048003, + "language_loss": 0.81235278, + "learning_rate": 2.7807671069761797e-06, + "loss": 0.82921958, + "num_input_tokens_seen": 139673595, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.3269043, + "step": 6506, + "time_per_iteration": 2.630178928375244 + }, + { + "auxiliary_loss_clip": 0.01426376, + "auxiliary_loss_mlp": 0.00259324, + "balance_loss_clip": 1.16509426, + "balance_loss_mlp": 0.22832938, + "epoch": 0.3912220051104765, + "flos": 16359680989440.0, + "grad_norm": 9.768795334173719, + "language_loss": 0.82468629, + "learning_rate": 2.7804085333772477e-06, + "loss": 0.84154326, + "num_input_tokens_seen": 139690565, + "router_z_loss_clip": 2.61132812, + "router_z_loss_mlp": 0.30981445, + "step": 6507, + "time_per_iteration": 2.6447837352752686 + }, + { + "auxiliary_loss_clip": 0.01346502, + "auxiliary_loss_mlp": 0.00062402, + "balance_loss_clip": 1.16921723, + "balance_loss_mlp": 0.05667965, + "epoch": 0.39128212836314447, + "flos": 71050986420480.0, + "grad_norm": 0.7460888361265731, + "language_loss": 0.56519979, + "learning_rate": 2.7800499301851446e-06, + "loss": 0.57928878, + "num_input_tokens_seen": 139749420, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.05712891, + "step": 6508, + "time_per_iteration": 3.286494016647339 + }, + { + "auxiliary_loss_clip": 0.01417888, + "auxiliary_loss_mlp": 0.00265005, + "balance_loss_clip": 1.15842569, + "balance_loss_mlp": 0.23212677, + "epoch": 0.39134225161581243, + "flos": 20331975173760.0, + "grad_norm": 87.34585527786857, + "language_loss": 0.83616745, + "learning_rate": 2.779691297413471e-06, + "loss": 0.85299641, + "num_input_tokens_seen": 139766265, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.32885742, + "step": 6509, + "time_per_iteration": 2.6110756397247314 + }, + { + "auxiliary_loss_clip": 0.01450047, + "auxiliary_loss_mlp": 0.00276972, + "balance_loss_clip": 1.18093109, + "balance_loss_mlp": 0.24221063, + "epoch": 0.3914023748684804, + "flos": 17018390551680.0, + "grad_norm": 151.77149016247262, + "language_loss": 0.92613661, + "learning_rate": 2.779332635075825e-06, + "loss": 0.94340682, + "num_input_tokens_seen": 139782400, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.34741211, + "step": 6510, + "time_per_iteration": 2.586942195892334 + }, + { + "auxiliary_loss_clip": 0.01430193, + "auxiliary_loss_mlp": 0.00268615, + "balance_loss_clip": 1.16456056, + "balance_loss_mlp": 0.23604709, + "epoch": 0.39146249812114836, + "flos": 18405224709120.0, + "grad_norm": 10.31553965610553, + "language_loss": 0.86050063, + "learning_rate": 2.7789739431858073e-06, + "loss": 0.87748873, + "num_input_tokens_seen": 139801435, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.32568359, + "step": 6511, + "time_per_iteration": 2.6577961444854736 + }, + { + "auxiliary_loss_clip": 0.01379936, + "auxiliary_loss_mlp": 0.00074487, + "balance_loss_clip": 1.18845034, + "balance_loss_mlp": 0.06828818, + "epoch": 0.3915226213738163, + "flos": 67637355442560.0, + "grad_norm": 0.7146889134694348, + "language_loss": 0.57758832, + "learning_rate": 2.7786152217570196e-06, + "loss": 0.59213257, + "num_input_tokens_seen": 139869700, + "router_z_loss_clip": 1.9140625, + "router_z_loss_mlp": 0.06176758, + "step": 6512, + "time_per_iteration": 3.220501184463501 + }, + { + "auxiliary_loss_clip": 0.01445659, + "auxiliary_loss_mlp": 0.00297532, + "balance_loss_clip": 1.17698479, + "balance_loss_mlp": 0.26498795, + "epoch": 0.3915827446264843, + "flos": 26359330181760.0, + "grad_norm": 34.293633224994416, + "language_loss": 0.76649588, + "learning_rate": 2.7782564708030647e-06, + "loss": 0.7839278, + "num_input_tokens_seen": 139890140, + "router_z_loss_clip": 2.69140625, + "router_z_loss_mlp": 0.32519531, + "step": 6513, + "time_per_iteration": 2.6691160202026367 + }, + { + "auxiliary_loss_clip": 0.01452168, + "auxiliary_loss_mlp": 0.00291835, + "balance_loss_clip": 1.17996407, + "balance_loss_mlp": 0.25828913, + "epoch": 0.39164286787915226, + "flos": 21943897908480.0, + "grad_norm": 25.710143733997185, + "language_loss": 0.87688476, + "learning_rate": 2.7778976903375464e-06, + "loss": 0.89432478, + "num_input_tokens_seen": 139908020, + "router_z_loss_clip": 2.72070312, + "router_z_loss_mlp": 0.33544922, + "step": 6514, + "time_per_iteration": 2.620985984802246 + }, + { + "auxiliary_loss_clip": 0.01431598, + "auxiliary_loss_mlp": 0.00297411, + "balance_loss_clip": 1.16681159, + "balance_loss_mlp": 0.26548663, + "epoch": 0.3917029911318202, + "flos": 16399829416320.0, + "grad_norm": 144.8244040139509, + "language_loss": 0.85529077, + "learning_rate": 2.7775388803740693e-06, + "loss": 0.87258089, + "num_input_tokens_seen": 139926180, + "router_z_loss_clip": 2.65039062, + "router_z_loss_mlp": 0.31933594, + "step": 6515, + "time_per_iteration": 2.632024049758911 + }, + { + "auxiliary_loss_clip": 0.01428844, + "auxiliary_loss_mlp": 0.0027775, + "balance_loss_clip": 1.1639452, + "balance_loss_mlp": 0.24661213, + "epoch": 0.3917631143844882, + "flos": 26211701283840.0, + "grad_norm": 91.25896661887427, + "language_loss": 0.84391081, + "learning_rate": 2.7771800409262406e-06, + "loss": 0.86097682, + "num_input_tokens_seen": 139947420, + "router_z_loss_clip": 2.65039062, + "router_z_loss_mlp": 0.31152344, + "step": 6516, + "time_per_iteration": 2.7095232009887695 + }, + { + "auxiliary_loss_clip": 0.01434446, + "auxiliary_loss_mlp": 0.0028292, + "balance_loss_clip": 1.16695213, + "balance_loss_mlp": 0.24823022, + "epoch": 0.39182323763715615, + "flos": 18548364407040.0, + "grad_norm": 3.233505049969333, + "language_loss": 0.78115308, + "learning_rate": 2.7768211720076665e-06, + "loss": 0.79832673, + "num_input_tokens_seen": 139965800, + "router_z_loss_clip": 2.67578125, + "router_z_loss_mlp": 0.34692383, + "step": 6517, + "time_per_iteration": 2.677799701690674 + }, + { + "auxiliary_loss_clip": 0.0144167, + "auxiliary_loss_mlp": 0.00262708, + "balance_loss_clip": 1.16999602, + "balance_loss_mlp": 0.23128425, + "epoch": 0.3918833608898241, + "flos": 34313543395200.0, + "grad_norm": 4.530292872335274, + "language_loss": 0.78265655, + "learning_rate": 2.776462273631956e-06, + "loss": 0.79970032, + "num_input_tokens_seen": 139988140, + "router_z_loss_clip": 2.71484375, + "router_z_loss_mlp": 0.31420898, + "step": 6518, + "time_per_iteration": 2.7896158695220947 + }, + { + "auxiliary_loss_clip": 0.01436714, + "auxiliary_loss_mlp": 0.00322228, + "balance_loss_clip": 1.16398478, + "balance_loss_mlp": 0.28927898, + "epoch": 0.3919434841424921, + "flos": 36939582812160.0, + "grad_norm": 37.623656679511306, + "language_loss": 0.69133204, + "learning_rate": 2.7761033458127177e-06, + "loss": 0.70892143, + "num_input_tokens_seen": 140010060, + "router_z_loss_clip": 2.7265625, + "router_z_loss_mlp": 0.32958984, + "step": 6519, + "time_per_iteration": 4.216265439987183 + }, + { + "auxiliary_loss_clip": 0.01461738, + "auxiliary_loss_mlp": 0.00305299, + "balance_loss_clip": 1.18351269, + "balance_loss_mlp": 0.27153906, + "epoch": 0.3920036073951601, + "flos": 23508956373120.0, + "grad_norm": 2.1985370864055387, + "language_loss": 0.76498532, + "learning_rate": 2.775744388563563e-06, + "loss": 0.78265572, + "num_input_tokens_seen": 140029400, + "router_z_loss_clip": 2.78515625, + "router_z_loss_mlp": 0.33740234, + "step": 6520, + "time_per_iteration": 2.652175188064575 + }, + { + "auxiliary_loss_clip": 0.01428722, + "auxiliary_loss_mlp": 0.00296707, + "balance_loss_clip": 1.15534854, + "balance_loss_mlp": 0.26340038, + "epoch": 0.39206373064782807, + "flos": 18406086635520.0, + "grad_norm": 30.59593200708812, + "language_loss": 0.86679435, + "learning_rate": 2.775385401898104e-06, + "loss": 0.8840487, + "num_input_tokens_seen": 140048940, + "router_z_loss_clip": 2.734375, + "router_z_loss_mlp": 0.33325195, + "step": 6521, + "time_per_iteration": 4.0971901416778564 + }, + { + "auxiliary_loss_clip": 0.01450627, + "auxiliary_loss_mlp": 0.0031023, + "balance_loss_clip": 1.17324686, + "balance_loss_mlp": 0.2769464, + "epoch": 0.39212385390049603, + "flos": 12313051608960.0, + "grad_norm": 6689.509652100534, + "language_loss": 0.81988013, + "learning_rate": 2.775026385829952e-06, + "loss": 0.83748871, + "num_input_tokens_seen": 140066380, + "router_z_loss_clip": 2.77539062, + "router_z_loss_mlp": 0.33276367, + "step": 6522, + "time_per_iteration": 2.7357699871063232 + }, + { + "auxiliary_loss_clip": 0.01436551, + "auxiliary_loss_mlp": 0.00264785, + "balance_loss_clip": 1.16215634, + "balance_loss_mlp": 0.23317116, + "epoch": 0.392183977153164, + "flos": 19719160214400.0, + "grad_norm": 38.426515997916674, + "language_loss": 0.8577925, + "learning_rate": 2.774667340372722e-06, + "loss": 0.87480581, + "num_input_tokens_seen": 140085275, + "router_z_loss_clip": 2.74414062, + "router_z_loss_mlp": 0.31591797, + "step": 6523, + "time_per_iteration": 2.676544427871704 + }, + { + "auxiliary_loss_clip": 0.01432615, + "auxiliary_loss_mlp": 0.00292457, + "balance_loss_clip": 1.1558938, + "balance_loss_mlp": 0.26050854, + "epoch": 0.39224410040583196, + "flos": 33144902403840.0, + "grad_norm": 123.4313016308746, + "language_loss": 0.6863445, + "learning_rate": 2.7743082655400293e-06, + "loss": 0.70359522, + "num_input_tokens_seen": 140105105, + "router_z_loss_clip": 2.76757812, + "router_z_loss_mlp": 0.31958008, + "step": 6524, + "time_per_iteration": 4.184070110321045 + }, + { + "auxiliary_loss_clip": 0.01417853, + "auxiliary_loss_mlp": 0.00267724, + "balance_loss_clip": 1.14641094, + "balance_loss_mlp": 0.23608588, + "epoch": 0.39230422365849993, + "flos": 27782434097280.0, + "grad_norm": 23.94354305669703, + "language_loss": 0.81767762, + "learning_rate": 2.773949161345489e-06, + "loss": 0.83453333, + "num_input_tokens_seen": 140125645, + "router_z_loss_clip": 2.7109375, + "router_z_loss_mlp": 0.31640625, + "step": 6525, + "time_per_iteration": 2.761019229888916 + }, + { + "auxiliary_loss_clip": 0.01417274, + "auxiliary_loss_mlp": 0.00286827, + "balance_loss_clip": 1.14545739, + "balance_loss_mlp": 0.25344807, + "epoch": 0.3923643469111679, + "flos": 17931634865280.0, + "grad_norm": 37.40970133362971, + "language_loss": 0.91495073, + "learning_rate": 2.773590027802719e-06, + "loss": 0.93199176, + "num_input_tokens_seen": 140141925, + "router_z_loss_clip": 2.72070312, + "router_z_loss_mlp": 0.33349609, + "step": 6526, + "time_per_iteration": 2.6442313194274902 + }, + { + "auxiliary_loss_clip": 0.01427545, + "auxiliary_loss_mlp": 0.00283874, + "balance_loss_clip": 1.14865661, + "balance_loss_mlp": 0.24813552, + "epoch": 0.39242447016383586, + "flos": 24059539019520.0, + "grad_norm": 8.164651092983313, + "language_loss": 0.75390399, + "learning_rate": 2.7732308649253383e-06, + "loss": 0.77101827, + "num_input_tokens_seen": 140160965, + "router_z_loss_clip": 2.7890625, + "router_z_loss_mlp": 0.35742188, + "step": 6527, + "time_per_iteration": 2.7507286071777344 + }, + { + "auxiliary_loss_clip": 0.01420099, + "auxiliary_loss_mlp": 0.00265863, + "balance_loss_clip": 1.1503917, + "balance_loss_mlp": 0.23279426, + "epoch": 0.3924845934165038, + "flos": 10664069016960.0, + "grad_norm": 66.87487419637978, + "language_loss": 0.92293787, + "learning_rate": 2.772871672726965e-06, + "loss": 0.93979752, + "num_input_tokens_seen": 140177780, + "router_z_loss_clip": 2.6953125, + "router_z_loss_mlp": 0.33081055, + "step": 6528, + "time_per_iteration": 2.5909855365753174 + }, + { + "auxiliary_loss_clip": 0.01429192, + "auxiliary_loss_mlp": 0.00290365, + "balance_loss_clip": 1.15609157, + "balance_loss_mlp": 0.25813118, + "epoch": 0.3925447166691718, + "flos": 31245910174080.0, + "grad_norm": 23.54490098592932, + "language_loss": 0.75150943, + "learning_rate": 2.7725124512212205e-06, + "loss": 0.76870507, + "num_input_tokens_seen": 140201660, + "router_z_loss_clip": 2.73242188, + "router_z_loss_mlp": 0.32250977, + "step": 6529, + "time_per_iteration": 2.7485172748565674 + }, + { + "auxiliary_loss_clip": 0.01436062, + "auxiliary_loss_mlp": 0.00266717, + "balance_loss_clip": 1.15602481, + "balance_loss_mlp": 0.23450664, + "epoch": 0.39260483992183975, + "flos": 29415040087680.0, + "grad_norm": 5.791513910477593, + "language_loss": 0.90039241, + "learning_rate": 2.7721532004217267e-06, + "loss": 0.91742015, + "num_input_tokens_seen": 140218585, + "router_z_loss_clip": 2.80078125, + "router_z_loss_mlp": 0.32226562, + "step": 6530, + "time_per_iteration": 4.060192346572876 + }, + { + "auxiliary_loss_clip": 0.01425392, + "auxiliary_loss_mlp": 0.00270597, + "balance_loss_clip": 1.15267015, + "balance_loss_mlp": 0.23874465, + "epoch": 0.3926649631745077, + "flos": 22857788666880.0, + "grad_norm": 23.30479821152368, + "language_loss": 0.83972913, + "learning_rate": 2.7717939203421063e-06, + "loss": 0.85668898, + "num_input_tokens_seen": 140239905, + "router_z_loss_clip": 2.72851562, + "router_z_loss_mlp": 0.31835938, + "step": 6531, + "time_per_iteration": 2.7624094486236572 + }, + { + "auxiliary_loss_clip": 0.01370532, + "auxiliary_loss_mlp": 0.00132083, + "balance_loss_clip": 1.20368159, + "balance_loss_mlp": 0.12349989, + "epoch": 0.3927250864271757, + "flos": 63893881872000.0, + "grad_norm": 0.7982653442877826, + "language_loss": 0.60050786, + "learning_rate": 2.7714346109959822e-06, + "loss": 0.61553407, + "num_input_tokens_seen": 140293820, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.0859375, + "step": 6532, + "time_per_iteration": 3.081346035003662 + }, + { + "auxiliary_loss_clip": 0.01365831, + "auxiliary_loss_mlp": 0.00101024, + "balance_loss_clip": 1.19655418, + "balance_loss_mlp": 0.09296565, + "epoch": 0.3927852096798437, + "flos": 68909741890560.0, + "grad_norm": 0.7761626056907693, + "language_loss": 0.55118781, + "learning_rate": 2.771075272396981e-06, + "loss": 0.56585634, + "num_input_tokens_seen": 140360420, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.08056641, + "step": 6533, + "time_per_iteration": 3.2230589389801025 + }, + { + "auxiliary_loss_clip": 0.01441094, + "auxiliary_loss_mlp": 0.00295873, + "balance_loss_clip": 1.16206026, + "balance_loss_mlp": 0.26359132, + "epoch": 0.39284533293251167, + "flos": 29715972232320.0, + "grad_norm": 3.5640933702118356, + "language_loss": 0.84653342, + "learning_rate": 2.7707159045587284e-06, + "loss": 0.86390305, + "num_input_tokens_seen": 140381950, + "router_z_loss_clip": 2.79101562, + "router_z_loss_mlp": 0.32275391, + "step": 6534, + "time_per_iteration": 2.785506010055542 + }, + { + "auxiliary_loss_clip": 0.01439923, + "auxiliary_loss_mlp": 0.00288363, + "balance_loss_clip": 1.16409302, + "balance_loss_mlp": 0.25436485, + "epoch": 0.39290545618517964, + "flos": 18552027594240.0, + "grad_norm": 13.62896249023886, + "language_loss": 0.87015021, + "learning_rate": 2.770356507494851e-06, + "loss": 0.88743311, + "num_input_tokens_seen": 140399410, + "router_z_loss_clip": 2.76171875, + "router_z_loss_mlp": 0.34008789, + "step": 6535, + "time_per_iteration": 2.6079249382019043 + }, + { + "auxiliary_loss_clip": 0.01448496, + "auxiliary_loss_mlp": 0.00264818, + "balance_loss_clip": 1.16683626, + "balance_loss_mlp": 0.23131996, + "epoch": 0.3929655794378476, + "flos": 26249479413120.0, + "grad_norm": 16.60652397766833, + "language_loss": 0.75984657, + "learning_rate": 2.769997081218978e-06, + "loss": 0.77697968, + "num_input_tokens_seen": 140419055, + "router_z_loss_clip": 2.81835938, + "router_z_loss_mlp": 0.33496094, + "step": 6536, + "time_per_iteration": 2.6889560222625732 + }, + { + "auxiliary_loss_clip": 0.01428521, + "auxiliary_loss_mlp": 0.00262186, + "balance_loss_clip": 1.16369009, + "balance_loss_mlp": 0.22902176, + "epoch": 0.39302570269051557, + "flos": 29277933874560.0, + "grad_norm": 21.757021968956256, + "language_loss": 0.78238869, + "learning_rate": 2.769637625744738e-06, + "loss": 0.79929578, + "num_input_tokens_seen": 140438800, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.33178711, + "step": 6537, + "time_per_iteration": 2.7091174125671387 + }, + { + "auxiliary_loss_clip": 0.01453486, + "auxiliary_loss_mlp": 0.0029833, + "balance_loss_clip": 1.18069005, + "balance_loss_mlp": 0.26540443, + "epoch": 0.39308582594318353, + "flos": 17347440067200.0, + "grad_norm": 41.042522464405856, + "language_loss": 0.84697461, + "learning_rate": 2.769278141085763e-06, + "loss": 0.86449289, + "num_input_tokens_seen": 140456880, + "router_z_loss_clip": 2.73046875, + "router_z_loss_mlp": 0.3293457, + "step": 6538, + "time_per_iteration": 2.6474804878234863 + }, + { + "auxiliary_loss_clip": 0.01364784, + "auxiliary_loss_mlp": 0.00117196, + "balance_loss_clip": 1.19734502, + "balance_loss_mlp": 0.10956663, + "epoch": 0.3931459491958515, + "flos": 61007094650880.0, + "grad_norm": 0.7962885061833448, + "language_loss": 0.61770701, + "learning_rate": 2.768918627255683e-06, + "loss": 0.63252681, + "num_input_tokens_seen": 140507510, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.07617188, + "step": 6539, + "time_per_iteration": 2.9980828762054443 + }, + { + "auxiliary_loss_clip": 0.01447626, + "auxiliary_loss_mlp": 0.00295416, + "balance_loss_clip": 1.17612219, + "balance_loss_mlp": 0.26175079, + "epoch": 0.39320607244851946, + "flos": 39016009249920.0, + "grad_norm": 3.0693651550290584, + "language_loss": 0.77767396, + "learning_rate": 2.7685590842681315e-06, + "loss": 0.79510438, + "num_input_tokens_seen": 140528740, + "router_z_loss_clip": 2.71679688, + "router_z_loss_mlp": 0.33691406, + "step": 6540, + "time_per_iteration": 2.8119516372680664 + }, + { + "auxiliary_loss_clip": 0.01431625, + "auxiliary_loss_mlp": 0.00268295, + "balance_loss_clip": 1.16298294, + "balance_loss_mlp": 0.23572677, + "epoch": 0.3932661957011874, + "flos": 24679752180480.0, + "grad_norm": 301.4849158121515, + "language_loss": 0.77489507, + "learning_rate": 2.7681995121367433e-06, + "loss": 0.7918942, + "num_input_tokens_seen": 140547560, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.32568359, + "step": 6541, + "time_per_iteration": 2.675463914871216 + }, + { + "auxiliary_loss_clip": 0.01367901, + "auxiliary_loss_mlp": 0.00079105, + "balance_loss_clip": 1.20077634, + "balance_loss_mlp": 0.07257282, + "epoch": 0.3933263189538554, + "flos": 70096552185600.0, + "grad_norm": 0.8719265393698116, + "language_loss": 0.60150695, + "learning_rate": 2.7678399108751516e-06, + "loss": 0.61597705, + "num_input_tokens_seen": 140601175, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.06542969, + "step": 6542, + "time_per_iteration": 2.9672024250030518 + }, + { + "auxiliary_loss_clip": 0.01450789, + "auxiliary_loss_mlp": 0.00296263, + "balance_loss_clip": 1.17743373, + "balance_loss_mlp": 0.26274127, + "epoch": 0.39338644220652336, + "flos": 22929071207040.0, + "grad_norm": 282.23728344924524, + "language_loss": 0.8878926, + "learning_rate": 2.7674802804969947e-06, + "loss": 0.90536308, + "num_input_tokens_seen": 140622200, + "router_z_loss_clip": 2.73242188, + "router_z_loss_mlp": 0.33544922, + "step": 6543, + "time_per_iteration": 2.7156713008880615 + }, + { + "auxiliary_loss_clip": 0.01444345, + "auxiliary_loss_mlp": 0.00259673, + "balance_loss_clip": 1.17411041, + "balance_loss_mlp": 0.22662845, + "epoch": 0.3934465654591913, + "flos": 30848163897600.0, + "grad_norm": 8213.254453606947, + "language_loss": 0.75395089, + "learning_rate": 2.767120621015908e-06, + "loss": 0.77099109, + "num_input_tokens_seen": 140643125, + "router_z_loss_clip": 2.70507812, + "router_z_loss_mlp": 0.33032227, + "step": 6544, + "time_per_iteration": 2.755950689315796 + }, + { + "auxiliary_loss_clip": 0.01450987, + "auxiliary_loss_mlp": 0.00299661, + "balance_loss_clip": 1.17780244, + "balance_loss_mlp": 0.26492333, + "epoch": 0.3935066887118593, + "flos": 29236528471680.0, + "grad_norm": 1533.3521857635988, + "language_loss": 0.83851123, + "learning_rate": 2.76676093244553e-06, + "loss": 0.85601771, + "num_input_tokens_seen": 140662500, + "router_z_loss_clip": 2.73242188, + "router_z_loss_mlp": 0.34716797, + "step": 6545, + "time_per_iteration": 2.7264750003814697 + }, + { + "auxiliary_loss_clip": 0.01442905, + "auxiliary_loss_mlp": 0.00293099, + "balance_loss_clip": 1.17785144, + "balance_loss_mlp": 0.26363087, + "epoch": 0.3935668119645273, + "flos": 19135288638720.0, + "grad_norm": 26.63001862673322, + "language_loss": 0.80359697, + "learning_rate": 2.7664012147995015e-06, + "loss": 0.82095701, + "num_input_tokens_seen": 140681960, + "router_z_loss_clip": 2.65039062, + "router_z_loss_mlp": 0.29443359, + "step": 6546, + "time_per_iteration": 2.7560856342315674 + }, + { + "auxiliary_loss_clip": 0.01463693, + "auxiliary_loss_mlp": 0.00308226, + "balance_loss_clip": 1.18726349, + "balance_loss_mlp": 0.27484781, + "epoch": 0.3936269352171953, + "flos": 18516116972160.0, + "grad_norm": 20.268399174732618, + "language_loss": 0.88509858, + "learning_rate": 2.7660414680914617e-06, + "loss": 0.90281773, + "num_input_tokens_seen": 140699170, + "router_z_loss_clip": 2.76171875, + "router_z_loss_mlp": 0.33374023, + "step": 6547, + "time_per_iteration": 2.6412556171417236 + }, + { + "auxiliary_loss_clip": 0.01461365, + "auxiliary_loss_mlp": 0.00290301, + "balance_loss_clip": 1.1919843, + "balance_loss_mlp": 0.25971201, + "epoch": 0.39368705846986324, + "flos": 15632813370240.0, + "grad_norm": 6.656900410407221, + "language_loss": 0.91279399, + "learning_rate": 2.7656816923350525e-06, + "loss": 0.93031067, + "num_input_tokens_seen": 140714920, + "router_z_loss_clip": 2.69140625, + "router_z_loss_mlp": 0.30578613, + "step": 6548, + "time_per_iteration": 2.60237455368042 + }, + { + "auxiliary_loss_clip": 0.01443874, + "auxiliary_loss_mlp": 0.00284244, + "balance_loss_clip": 1.17235947, + "balance_loss_mlp": 0.25038883, + "epoch": 0.3937471817225312, + "flos": 21325839563520.0, + "grad_norm": 3.3976293513354343, + "language_loss": 0.79872191, + "learning_rate": 2.7653218875439174e-06, + "loss": 0.81600308, + "num_input_tokens_seen": 140734595, + "router_z_loss_clip": 2.71679688, + "router_z_loss_mlp": 0.33837891, + "step": 6549, + "time_per_iteration": 2.638240098953247 + }, + { + "auxiliary_loss_clip": 0.01462782, + "auxiliary_loss_mlp": 0.00307709, + "balance_loss_clip": 1.18395627, + "balance_loss_mlp": 0.27437758, + "epoch": 0.39380730497519917, + "flos": 20776693461120.0, + "grad_norm": 12.241719105453654, + "language_loss": 0.82534313, + "learning_rate": 2.764962053731699e-06, + "loss": 0.84304798, + "num_input_tokens_seen": 140754050, + "router_z_loss_clip": 2.78515625, + "router_z_loss_mlp": 0.33325195, + "step": 6550, + "time_per_iteration": 2.6687657833099365 + }, + { + "auxiliary_loss_clip": 0.01458101, + "auxiliary_loss_mlp": 0.00264144, + "balance_loss_clip": 1.18369114, + "balance_loss_mlp": 0.23186165, + "epoch": 0.39386742822786713, + "flos": 21609784575360.0, + "grad_norm": 102.76955842770397, + "language_loss": 0.8826319, + "learning_rate": 2.7646021909120434e-06, + "loss": 0.8998543, + "num_input_tokens_seen": 140771440, + "router_z_loss_clip": 2.74414062, + "router_z_loss_mlp": 0.32275391, + "step": 6551, + "time_per_iteration": 2.641793727874756 + }, + { + "auxiliary_loss_clip": 0.014429, + "auxiliary_loss_mlp": 0.003066, + "balance_loss_clip": 1.16955173, + "balance_loss_mlp": 0.27527168, + "epoch": 0.3939275514805351, + "flos": 12414642249600.0, + "grad_norm": 23.87603264415071, + "language_loss": 0.87840211, + "learning_rate": 2.764242299098596e-06, + "loss": 0.89589715, + "num_input_tokens_seen": 140786715, + "router_z_loss_clip": 2.734375, + "router_z_loss_mlp": 0.31323242, + "step": 6552, + "time_per_iteration": 2.6510274410247803 + }, + { + "auxiliary_loss_clip": 0.01471307, + "auxiliary_loss_mlp": 0.00284277, + "balance_loss_clip": 1.19182491, + "balance_loss_mlp": 0.25178012, + "epoch": 0.39398767473320306, + "flos": 18552027594240.0, + "grad_norm": 15.030878362827929, + "language_loss": 0.79224551, + "learning_rate": 2.763882378305003e-06, + "loss": 0.80980134, + "num_input_tokens_seen": 140804950, + "router_z_loss_clip": 2.796875, + "router_z_loss_mlp": 0.32446289, + "step": 6553, + "time_per_iteration": 2.599310874938965 + }, + { + "auxiliary_loss_clip": 0.01442168, + "auxiliary_loss_mlp": 0.00304383, + "balance_loss_clip": 1.17155182, + "balance_loss_mlp": 0.27047959, + "epoch": 0.39404779798587103, + "flos": 29308888419840.0, + "grad_norm": 13.057009463717566, + "language_loss": 0.69924867, + "learning_rate": 2.7635224285449144e-06, + "loss": 0.71671426, + "num_input_tokens_seen": 140822800, + "router_z_loss_clip": 2.70898438, + "router_z_loss_mlp": 0.33935547, + "step": 6554, + "time_per_iteration": 2.7854840755462646 + }, + { + "auxiliary_loss_clip": 0.01445225, + "auxiliary_loss_mlp": 0.00288174, + "balance_loss_clip": 1.17604494, + "balance_loss_mlp": 0.25810945, + "epoch": 0.394107921238539, + "flos": 34897055834880.0, + "grad_norm": 33.50418287192239, + "language_loss": 0.85169888, + "learning_rate": 2.7631624498319796e-06, + "loss": 0.8690328, + "num_input_tokens_seen": 140842940, + "router_z_loss_clip": 2.69140625, + "router_z_loss_mlp": 0.30053711, + "step": 6555, + "time_per_iteration": 2.744800567626953 + }, + { + "auxiliary_loss_clip": 0.01446735, + "auxiliary_loss_mlp": 0.0030625, + "balance_loss_clip": 1.17473757, + "balance_loss_mlp": 0.27134585, + "epoch": 0.39416804449120696, + "flos": 25081413039360.0, + "grad_norm": 188.00228802860536, + "language_loss": 0.81112993, + "learning_rate": 2.7628024421798473e-06, + "loss": 0.82865983, + "num_input_tokens_seen": 140863060, + "router_z_loss_clip": 2.72070312, + "router_z_loss_mlp": 0.34936523, + "step": 6556, + "time_per_iteration": 2.67191219329834 + }, + { + "auxiliary_loss_clip": 0.01431449, + "auxiliary_loss_mlp": 0.00277701, + "balance_loss_clip": 1.16224802, + "balance_loss_mlp": 0.2455858, + "epoch": 0.3942281677438749, + "flos": 32306639731200.0, + "grad_norm": 1965.3989433742058, + "language_loss": 0.91709203, + "learning_rate": 2.7624424056021705e-06, + "loss": 0.9341836, + "num_input_tokens_seen": 140883795, + "router_z_loss_clip": 2.6953125, + "router_z_loss_mlp": 0.32104492, + "step": 6557, + "time_per_iteration": 2.752310037612915 + }, + { + "auxiliary_loss_clip": 0.01439304, + "auxiliary_loss_mlp": 0.00293336, + "balance_loss_clip": 1.16969204, + "balance_loss_mlp": 0.26250881, + "epoch": 0.3942882909965429, + "flos": 24936621315840.0, + "grad_norm": 5.383269719480459, + "language_loss": 0.89588606, + "learning_rate": 2.7620823401126004e-06, + "loss": 0.91321254, + "num_input_tokens_seen": 140903055, + "router_z_loss_clip": 2.69726562, + "router_z_loss_mlp": 0.30859375, + "step": 6558, + "time_per_iteration": 2.6710662841796875 + }, + { + "auxiliary_loss_clip": 0.01438069, + "auxiliary_loss_mlp": 0.00285201, + "balance_loss_clip": 1.16857207, + "balance_loss_mlp": 0.25401634, + "epoch": 0.39434841424921085, + "flos": 11874797769600.0, + "grad_norm": 141.37443708330932, + "language_loss": 0.79682308, + "learning_rate": 2.761722245724792e-06, + "loss": 0.8140558, + "num_input_tokens_seen": 140920685, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.31176758, + "step": 6559, + "time_per_iteration": 2.630338191986084 + }, + { + "auxiliary_loss_clip": 0.01431556, + "auxiliary_loss_mlp": 0.00310766, + "balance_loss_clip": 1.15931058, + "balance_loss_mlp": 0.27641034, + "epoch": 0.3944085375018789, + "flos": 16361620323840.0, + "grad_norm": 77.29299085577408, + "language_loss": 0.90033048, + "learning_rate": 2.7613621224524003e-06, + "loss": 0.9177537, + "num_input_tokens_seen": 140937320, + "router_z_loss_clip": 2.7265625, + "router_z_loss_mlp": 0.34399414, + "step": 6560, + "time_per_iteration": 2.66190505027771 + }, + { + "auxiliary_loss_clip": 0.01429887, + "auxiliary_loss_mlp": 0.00270415, + "balance_loss_clip": 1.1616559, + "balance_loss_mlp": 0.23938489, + "epoch": 0.39446866075454684, + "flos": 10633365866880.0, + "grad_norm": 170.54426822832397, + "language_loss": 0.91034508, + "learning_rate": 2.7610019703090803e-06, + "loss": 0.92734808, + "num_input_tokens_seen": 140954855, + "router_z_loss_clip": 2.68164062, + "router_z_loss_mlp": 0.31018066, + "step": 6561, + "time_per_iteration": 4.005646467208862 + }, + { + "auxiliary_loss_clip": 0.01408277, + "auxiliary_loss_mlp": 0.00273251, + "balance_loss_clip": 1.14674497, + "balance_loss_mlp": 0.2429478, + "epoch": 0.3945287840072148, + "flos": 18187498419840.0, + "grad_norm": 19.211562299695338, + "language_loss": 0.90970433, + "learning_rate": 2.7606417893084887e-06, + "loss": 0.92651963, + "num_input_tokens_seen": 140973250, + "router_z_loss_clip": 2.6171875, + "router_z_loss_mlp": 0.30273438, + "step": 6562, + "time_per_iteration": 2.6686630249023438 + }, + { + "auxiliary_loss_clip": 0.01425627, + "auxiliary_loss_mlp": 0.00282197, + "balance_loss_clip": 1.15965474, + "balance_loss_mlp": 0.25176334, + "epoch": 0.39458890725988277, + "flos": 23039891642880.0, + "grad_norm": 145.07200769373895, + "language_loss": 0.88867176, + "learning_rate": 2.7602815794642853e-06, + "loss": 0.90574998, + "num_input_tokens_seen": 140993050, + "router_z_loss_clip": 2.65820312, + "router_z_loss_mlp": 0.30407715, + "step": 6563, + "time_per_iteration": 4.094133615493774 + }, + { + "auxiliary_loss_clip": 0.01416145, + "auxiliary_loss_mlp": 0.00263428, + "balance_loss_clip": 1.15165377, + "balance_loss_mlp": 0.23362595, + "epoch": 0.39464903051255074, + "flos": 17159052211200.0, + "grad_norm": 15.636818747717083, + "language_loss": 0.78939164, + "learning_rate": 2.759921340790127e-06, + "loss": 0.80618739, + "num_input_tokens_seen": 141010815, + "router_z_loss_clip": 2.64648438, + "router_z_loss_mlp": 0.2980957, + "step": 6564, + "time_per_iteration": 2.6878697872161865 + }, + { + "auxiliary_loss_clip": 0.01420173, + "auxiliary_loss_mlp": 0.00272662, + "balance_loss_clip": 1.15339255, + "balance_loss_mlp": 0.24536322, + "epoch": 0.3947091537652187, + "flos": 15889000147200.0, + "grad_norm": 15.713864791117048, + "language_loss": 0.92184889, + "learning_rate": 2.759561073299676e-06, + "loss": 0.93877721, + "num_input_tokens_seen": 141028720, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.27319336, + "step": 6565, + "time_per_iteration": 2.6649186611175537 + }, + { + "auxiliary_loss_clip": 0.01418355, + "auxiliary_loss_mlp": 0.00274513, + "balance_loss_clip": 1.15602338, + "balance_loss_mlp": 0.24347107, + "epoch": 0.39476927701788667, + "flos": 18545491319040.0, + "grad_norm": 145.5760049280142, + "language_loss": 0.89096653, + "learning_rate": 2.7592007770065937e-06, + "loss": 0.90789527, + "num_input_tokens_seen": 141046025, + "router_z_loss_clip": 2.625, + "router_z_loss_mlp": 0.31054688, + "step": 6566, + "time_per_iteration": 4.0582075119018555 + }, + { + "auxiliary_loss_clip": 0.01423048, + "auxiliary_loss_mlp": 0.00297433, + "balance_loss_clip": 1.15371466, + "balance_loss_mlp": 0.26476923, + "epoch": 0.39482940027055463, + "flos": 22275712771200.0, + "grad_norm": 36.21142995920963, + "language_loss": 0.86941469, + "learning_rate": 2.7588404519245403e-06, + "loss": 0.88661945, + "num_input_tokens_seen": 141066865, + "router_z_loss_clip": 2.69726562, + "router_z_loss_mlp": 0.32641602, + "step": 6567, + "time_per_iteration": 2.6643660068511963 + }, + { + "auxiliary_loss_clip": 0.01397572, + "auxiliary_loss_mlp": 0.00247242, + "balance_loss_clip": 1.14239275, + "balance_loss_mlp": 0.21681963, + "epoch": 0.3948895235232226, + "flos": 14757634494720.0, + "grad_norm": 6.002472420997199, + "language_loss": 0.86320817, + "learning_rate": 2.758480098067182e-06, + "loss": 0.87965631, + "num_input_tokens_seen": 141084210, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.30444336, + "step": 6568, + "time_per_iteration": 2.6569736003875732 + }, + { + "auxiliary_loss_clip": 0.01409584, + "auxiliary_loss_mlp": 0.0027427, + "balance_loss_clip": 1.15071428, + "balance_loss_mlp": 0.24399111, + "epoch": 0.39494964677589056, + "flos": 22565763095040.0, + "grad_norm": 2.01701162719074, + "language_loss": 0.91243976, + "learning_rate": 2.7581197154481816e-06, + "loss": 0.92927831, + "num_input_tokens_seen": 141103895, + "router_z_loss_clip": 2.58789062, + "router_z_loss_mlp": 0.30297852, + "step": 6569, + "time_per_iteration": 2.719325542449951 + }, + { + "auxiliary_loss_clip": 0.01417429, + "auxiliary_loss_mlp": 0.00264811, + "balance_loss_clip": 1.15939426, + "balance_loss_mlp": 0.23436448, + "epoch": 0.3950097700285585, + "flos": 22963186149120.0, + "grad_norm": 6.481766312967979, + "language_loss": 0.83505666, + "learning_rate": 2.7577593040812066e-06, + "loss": 0.85187912, + "num_input_tokens_seen": 141124000, + "router_z_loss_clip": 2.58007812, + "router_z_loss_mlp": 0.3046875, + "step": 6570, + "time_per_iteration": 2.7094762325286865 + }, + { + "auxiliary_loss_clip": 0.01398087, + "auxiliary_loss_mlp": 0.00276705, + "balance_loss_clip": 1.14084125, + "balance_loss_mlp": 0.24501893, + "epoch": 0.3950698932812265, + "flos": 20595236929920.0, + "grad_norm": 8.998410709131717, + "language_loss": 0.87205184, + "learning_rate": 2.757398863979922e-06, + "loss": 0.88879979, + "num_input_tokens_seen": 141142535, + "router_z_loss_clip": 2.57226562, + "router_z_loss_mlp": 0.31640625, + "step": 6571, + "time_per_iteration": 2.646961212158203 + }, + { + "auxiliary_loss_clip": 0.01409815, + "auxiliary_loss_mlp": 0.00286389, + "balance_loss_clip": 1.15004635, + "balance_loss_mlp": 0.25575206, + "epoch": 0.39513001653389446, + "flos": 20375786787840.0, + "grad_norm": 6.646359102661631, + "language_loss": 0.84016967, + "learning_rate": 2.757038395157997e-06, + "loss": 0.85713172, + "num_input_tokens_seen": 141161575, + "router_z_loss_clip": 2.59570312, + "router_z_loss_mlp": 0.30688477, + "step": 6572, + "time_per_iteration": 4.085606813430786 + }, + { + "auxiliary_loss_clip": 0.0140634, + "auxiliary_loss_mlp": 0.00258736, + "balance_loss_clip": 1.14728999, + "balance_loss_mlp": 0.22917181, + "epoch": 0.3951901397865625, + "flos": 26463650256000.0, + "grad_norm": 3.007711280469517, + "language_loss": 0.80815035, + "learning_rate": 2.7566778976291002e-06, + "loss": 0.82480109, + "num_input_tokens_seen": 141181150, + "router_z_loss_clip": 2.59179688, + "router_z_loss_mlp": 0.29614258, + "step": 6573, + "time_per_iteration": 2.7038486003875732 + }, + { + "auxiliary_loss_clip": 0.01404877, + "auxiliary_loss_mlp": 0.00279555, + "balance_loss_clip": 1.14648676, + "balance_loss_mlp": 0.24979997, + "epoch": 0.39525026303923044, + "flos": 43838345767680.0, + "grad_norm": 134.8651861437548, + "language_loss": 0.73037958, + "learning_rate": 2.7563173714069017e-06, + "loss": 0.74722397, + "num_input_tokens_seen": 141206310, + "router_z_loss_clip": 2.5859375, + "router_z_loss_mlp": 0.29748535, + "step": 6574, + "time_per_iteration": 2.8859164714813232 + }, + { + "auxiliary_loss_clip": 0.01399962, + "auxiliary_loss_mlp": 0.00243113, + "balance_loss_clip": 1.14101756, + "balance_loss_mlp": 0.21334614, + "epoch": 0.3953103862918984, + "flos": 18040803275520.0, + "grad_norm": 212.0853240633193, + "language_loss": 0.83929467, + "learning_rate": 2.755956816505072e-06, + "loss": 0.85572541, + "num_input_tokens_seen": 141223925, + "router_z_loss_clip": 2.58789062, + "router_z_loss_mlp": 0.29748535, + "step": 6575, + "time_per_iteration": 2.6955862045288086 + }, + { + "auxiliary_loss_clip": 0.01421858, + "auxiliary_loss_mlp": 0.00273402, + "balance_loss_clip": 1.15792036, + "balance_loss_mlp": 0.24325444, + "epoch": 0.3953705095445664, + "flos": 16976015481600.0, + "grad_norm": 13.040253677510428, + "language_loss": 0.81536126, + "learning_rate": 2.7555962329372845e-06, + "loss": 0.83231384, + "num_input_tokens_seen": 141239010, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.30114746, + "step": 6576, + "time_per_iteration": 2.6759207248687744 + }, + { + "auxiliary_loss_clip": 0.01410418, + "auxiliary_loss_mlp": 0.00263775, + "balance_loss_clip": 1.14720142, + "balance_loss_mlp": 0.23249468, + "epoch": 0.39543063279723434, + "flos": 17411144837760.0, + "grad_norm": 124.61536411379016, + "language_loss": 0.89663017, + "learning_rate": 2.7552356207172124e-06, + "loss": 0.9133721, + "num_input_tokens_seen": 141252255, + "router_z_loss_clip": 2.63085938, + "router_z_loss_mlp": 0.31286621, + "step": 6577, + "time_per_iteration": 2.8384056091308594 + }, + { + "auxiliary_loss_clip": 0.01420595, + "auxiliary_loss_mlp": 0.00268407, + "balance_loss_clip": 1.15957284, + "balance_loss_mlp": 0.23781793, + "epoch": 0.3954907560499023, + "flos": 22784207656320.0, + "grad_norm": 4.049372801190956, + "language_loss": 0.98261309, + "learning_rate": 2.75487497985853e-06, + "loss": 0.99950308, + "num_input_tokens_seen": 141269325, + "router_z_loss_clip": 2.61328125, + "router_z_loss_mlp": 0.3059082, + "step": 6578, + "time_per_iteration": 2.771533727645874 + }, + { + "auxiliary_loss_clip": 0.01409034, + "auxiliary_loss_mlp": 0.00278974, + "balance_loss_clip": 1.1449194, + "balance_loss_mlp": 0.24681097, + "epoch": 0.39555087930257027, + "flos": 21944400698880.0, + "grad_norm": 5.013963497486403, + "language_loss": 0.86058342, + "learning_rate": 2.7545143103749117e-06, + "loss": 0.87746352, + "num_input_tokens_seen": 141288505, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.3215332, + "step": 6579, + "time_per_iteration": 2.680816411972046 + }, + { + "auxiliary_loss_clip": 0.01430232, + "auxiliary_loss_mlp": 0.00327342, + "balance_loss_clip": 1.16132402, + "balance_loss_mlp": 0.29644245, + "epoch": 0.39561100255523823, + "flos": 20404622430720.0, + "grad_norm": 1348.5707027409603, + "language_loss": 0.77095306, + "learning_rate": 2.754153612280037e-06, + "loss": 0.7885288, + "num_input_tokens_seen": 141303680, + "router_z_loss_clip": 2.6875, + "router_z_loss_mlp": 0.30883789, + "step": 6580, + "time_per_iteration": 2.677456855773926 + }, + { + "auxiliary_loss_clip": 0.01415054, + "auxiliary_loss_mlp": 0.00291996, + "balance_loss_clip": 1.15438223, + "balance_loss_mlp": 0.26145491, + "epoch": 0.3956711258079062, + "flos": 27964572986880.0, + "grad_norm": 406.7111051026534, + "language_loss": 0.66092908, + "learning_rate": 2.7537928855875797e-06, + "loss": 0.67799962, + "num_input_tokens_seen": 141324090, + "router_z_loss_clip": 2.60546875, + "router_z_loss_mlp": 0.30541992, + "step": 6581, + "time_per_iteration": 2.757558584213257 + }, + { + "auxiliary_loss_clip": 0.01424815, + "auxiliary_loss_mlp": 0.0030109, + "balance_loss_clip": 1.15961647, + "balance_loss_mlp": 0.26985714, + "epoch": 0.39573124906057416, + "flos": 14428297670400.0, + "grad_norm": 8.61748472767752, + "language_loss": 0.79038829, + "learning_rate": 2.7534321303112224e-06, + "loss": 0.80764735, + "num_input_tokens_seen": 141342235, + "router_z_loss_clip": 2.65039062, + "router_z_loss_mlp": 0.31225586, + "step": 6582, + "time_per_iteration": 2.640465259552002 + }, + { + "auxiliary_loss_clip": 0.01421074, + "auxiliary_loss_mlp": 0.00297255, + "balance_loss_clip": 1.15716839, + "balance_loss_mlp": 0.26783454, + "epoch": 0.39579137231324213, + "flos": 18733699607040.0, + "grad_norm": 43.65190332982038, + "language_loss": 0.85502481, + "learning_rate": 2.753071346464642e-06, + "loss": 0.87220812, + "num_input_tokens_seen": 141361195, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.29418945, + "step": 6583, + "time_per_iteration": 2.6321802139282227 + }, + { + "auxiliary_loss_clip": 0.01428628, + "auxiliary_loss_mlp": 0.00296532, + "balance_loss_clip": 1.16214907, + "balance_loss_mlp": 0.2663008, + "epoch": 0.3958514955659101, + "flos": 17676417755520.0, + "grad_norm": 255.30788589818252, + "language_loss": 0.728769, + "learning_rate": 2.7527105340615207e-06, + "loss": 0.74602062, + "num_input_tokens_seen": 141378275, + "router_z_loss_clip": 2.66210938, + "router_z_loss_mlp": 0.30224609, + "step": 6584, + "time_per_iteration": 2.654461622238159 + }, + { + "auxiliary_loss_clip": 0.0143314, + "auxiliary_loss_mlp": 0.00295148, + "balance_loss_clip": 1.16761637, + "balance_loss_mlp": 0.26572716, + "epoch": 0.39591161881857806, + "flos": 29309103901440.0, + "grad_norm": 53.885190221646035, + "language_loss": 0.81635392, + "learning_rate": 2.7523496931155413e-06, + "loss": 0.83363682, + "num_input_tokens_seen": 141396960, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.29418945, + "step": 6585, + "time_per_iteration": 2.7365570068359375 + }, + { + "auxiliary_loss_clip": 0.01424185, + "auxiliary_loss_mlp": 0.00289858, + "balance_loss_clip": 1.15895009, + "balance_loss_mlp": 0.25717053, + "epoch": 0.3959717420712461, + "flos": 25771831332480.0, + "grad_norm": 29.03455149142605, + "language_loss": 0.80495203, + "learning_rate": 2.7519888236403856e-06, + "loss": 0.82209241, + "num_input_tokens_seen": 141417320, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.32641602, + "step": 6586, + "time_per_iteration": 2.7751080989837646 + }, + { + "auxiliary_loss_clip": 0.01431758, + "auxiliary_loss_mlp": 0.00324288, + "balance_loss_clip": 1.16909635, + "balance_loss_mlp": 0.2933417, + "epoch": 0.39603186532391405, + "flos": 20923783655040.0, + "grad_norm": 9.876764816217014, + "language_loss": 0.78014368, + "learning_rate": 2.7516279256497382e-06, + "loss": 0.79770416, + "num_input_tokens_seen": 141435985, + "router_z_loss_clip": 2.62695312, + "router_z_loss_mlp": 0.30957031, + "step": 6587, + "time_per_iteration": 2.6830945014953613 + }, + { + "auxiliary_loss_clip": 0.01386326, + "auxiliary_loss_mlp": 0.00102037, + "balance_loss_clip": 1.22113037, + "balance_loss_mlp": 0.09221432, + "epoch": 0.396091988576582, + "flos": 54880986176640.0, + "grad_norm": 1.0075464053311867, + "language_loss": 0.60485756, + "learning_rate": 2.751266999157285e-06, + "loss": 0.6197412, + "num_input_tokens_seen": 141486075, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.09814453, + "step": 6588, + "time_per_iteration": 2.932032585144043 + }, + { + "auxiliary_loss_clip": 0.01439902, + "auxiliary_loss_mlp": 0.00302335, + "balance_loss_clip": 1.17207146, + "balance_loss_mlp": 0.27007681, + "epoch": 0.39615211182925, + "flos": 20702896968960.0, + "grad_norm": 36.15881620293012, + "language_loss": 0.86410224, + "learning_rate": 2.7509060441767115e-06, + "loss": 0.88152468, + "num_input_tokens_seen": 141505280, + "router_z_loss_clip": 2.6796875, + "router_z_loss_mlp": 0.32275391, + "step": 6589, + "time_per_iteration": 2.801392078399658 + }, + { + "auxiliary_loss_clip": 0.01450213, + "auxiliary_loss_mlp": 0.00334964, + "balance_loss_clip": 1.18199205, + "balance_loss_mlp": 0.30141824, + "epoch": 0.39621223508191794, + "flos": 20994312009600.0, + "grad_norm": 10738.582826645055, + "language_loss": 0.80863452, + "learning_rate": 2.7505450607217057e-06, + "loss": 0.82648623, + "num_input_tokens_seen": 141523930, + "router_z_loss_clip": 2.6796875, + "router_z_loss_mlp": 0.33569336, + "step": 6590, + "time_per_iteration": 2.7860207557678223 + }, + { + "auxiliary_loss_clip": 0.01445303, + "auxiliary_loss_mlp": 0.00332556, + "balance_loss_clip": 1.1812135, + "balance_loss_mlp": 0.30447015, + "epoch": 0.3962723583345859, + "flos": 23368833417600.0, + "grad_norm": 122.96299840950465, + "language_loss": 0.82287681, + "learning_rate": 2.750184048805956e-06, + "loss": 0.84065539, + "num_input_tokens_seen": 141541320, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.28100586, + "step": 6591, + "time_per_iteration": 2.66439151763916 + }, + { + "auxiliary_loss_clip": 0.01451189, + "auxiliary_loss_mlp": 0.00321698, + "balance_loss_clip": 1.18539739, + "balance_loss_mlp": 0.28996408, + "epoch": 0.39633248158725387, + "flos": 25115599808640.0, + "grad_norm": 39.863807833461806, + "language_loss": 0.84396195, + "learning_rate": 2.749823008443152e-06, + "loss": 0.86169082, + "num_input_tokens_seen": 141561880, + "router_z_loss_clip": 2.65625, + "router_z_loss_mlp": 0.31750488, + "step": 6592, + "time_per_iteration": 2.7012786865234375 + }, + { + "auxiliary_loss_clip": 0.01442417, + "auxiliary_loss_mlp": 0.00307075, + "balance_loss_clip": 1.18493748, + "balance_loss_mlp": 0.27721289, + "epoch": 0.39639260483992184, + "flos": 39787622236800.0, + "grad_norm": 19.860812413244226, + "language_loss": 0.75221479, + "learning_rate": 2.7494619396469843e-06, + "loss": 0.76970971, + "num_input_tokens_seen": 141586460, + "router_z_loss_clip": 2.57617188, + "router_z_loss_mlp": 0.29870605, + "step": 6593, + "time_per_iteration": 2.8126957416534424 + }, + { + "auxiliary_loss_clip": 0.01444319, + "auxiliary_loss_mlp": 0.00335003, + "balance_loss_clip": 1.18146133, + "balance_loss_mlp": 0.30322173, + "epoch": 0.3964527280925898, + "flos": 17347045017600.0, + "grad_norm": 7.646989274719978, + "language_loss": 0.844661, + "learning_rate": 2.7491008424311452e-06, + "loss": 0.86245418, + "num_input_tokens_seen": 141605955, + "router_z_loss_clip": 2.62890625, + "router_z_loss_mlp": 0.31750488, + "step": 6594, + "time_per_iteration": 2.6207823753356934 + }, + { + "auxiliary_loss_clip": 0.01409041, + "auxiliary_loss_mlp": 0.00101313, + "balance_loss_clip": 1.23718381, + "balance_loss_mlp": 0.09034582, + "epoch": 0.39651285134525777, + "flos": 71717848369920.0, + "grad_norm": 0.9148173921094871, + "language_loss": 0.62724513, + "learning_rate": 2.7487397168093265e-06, + "loss": 0.64234865, + "num_input_tokens_seen": 141673140, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.10986328, + "step": 6595, + "time_per_iteration": 3.2069408893585205 + }, + { + "auxiliary_loss_clip": 0.01453579, + "auxiliary_loss_mlp": 0.00321813, + "balance_loss_clip": 1.18700457, + "balance_loss_mlp": 0.29034221, + "epoch": 0.39657297459792573, + "flos": 25775710001280.0, + "grad_norm": 51.57020044260727, + "language_loss": 0.70727694, + "learning_rate": 2.748378562795223e-06, + "loss": 0.72503084, + "num_input_tokens_seen": 141692955, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.31469727, + "step": 6596, + "time_per_iteration": 2.718285322189331 + }, + { + "auxiliary_loss_clip": 0.01444205, + "auxiliary_loss_mlp": 0.00313308, + "balance_loss_clip": 1.18505526, + "balance_loss_mlp": 0.28028703, + "epoch": 0.3966330978505937, + "flos": 20266115587200.0, + "grad_norm": 5.841454282379107, + "language_loss": 0.85343158, + "learning_rate": 2.7480173804025293e-06, + "loss": 0.87100673, + "num_input_tokens_seen": 141710680, + "router_z_loss_clip": 2.59179688, + "router_z_loss_mlp": 0.33007812, + "step": 6597, + "time_per_iteration": 2.7122414112091064 + }, + { + "auxiliary_loss_clip": 0.01463095, + "auxiliary_loss_mlp": 0.00353183, + "balance_loss_clip": 1.19359565, + "balance_loss_mlp": 0.32087752, + "epoch": 0.39669322110326166, + "flos": 20631183465600.0, + "grad_norm": 35.06936872258178, + "language_loss": 0.78126585, + "learning_rate": 2.747656169644941e-06, + "loss": 0.7994287, + "num_input_tokens_seen": 141729860, + "router_z_loss_clip": 2.6953125, + "router_z_loss_mlp": 0.32299805, + "step": 6598, + "time_per_iteration": 2.638516902923584 + }, + { + "auxiliary_loss_clip": 0.01466133, + "auxiliary_loss_mlp": 0.00333251, + "balance_loss_clip": 1.19583845, + "balance_loss_mlp": 0.30287653, + "epoch": 0.3967533443559297, + "flos": 21726063878400.0, + "grad_norm": 85.32559724749366, + "language_loss": 0.87254673, + "learning_rate": 2.747294930536157e-06, + "loss": 0.8905406, + "num_input_tokens_seen": 141749060, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.30395508, + "step": 6599, + "time_per_iteration": 2.7062056064605713 + }, + { + "auxiliary_loss_clip": 0.01464651, + "auxiliary_loss_mlp": 0.00328627, + "balance_loss_clip": 1.1979928, + "balance_loss_mlp": 0.2948668, + "epoch": 0.39681346760859765, + "flos": 25484151306240.0, + "grad_norm": 59.32231263797697, + "language_loss": 0.8086468, + "learning_rate": 2.7469336630898737e-06, + "loss": 0.82657957, + "num_input_tokens_seen": 141769860, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.33764648, + "step": 6600, + "time_per_iteration": 2.688436508178711 + }, + { + "auxiliary_loss_clip": 0.01450144, + "auxiliary_loss_mlp": 0.00347921, + "balance_loss_clip": 1.18456268, + "balance_loss_mlp": 0.31566256, + "epoch": 0.3968735908612656, + "flos": 20959586536320.0, + "grad_norm": 52.61490389638629, + "language_loss": 0.93874621, + "learning_rate": 2.746572367319791e-06, + "loss": 0.95672685, + "num_input_tokens_seen": 141788465, + "router_z_loss_clip": 2.65234375, + "router_z_loss_mlp": 0.32226562, + "step": 6601, + "time_per_iteration": 2.6468327045440674 + }, + { + "auxiliary_loss_clip": 0.01464525, + "auxiliary_loss_mlp": 0.00319675, + "balance_loss_clip": 1.19204259, + "balance_loss_mlp": 0.28438896, + "epoch": 0.3969337141139336, + "flos": 10707090531840.0, + "grad_norm": 36.357774473413095, + "language_loss": 0.79414678, + "learning_rate": 2.7462110432396095e-06, + "loss": 0.81198877, + "num_input_tokens_seen": 141804955, + "router_z_loss_clip": 2.72460938, + "router_z_loss_mlp": 0.35253906, + "step": 6602, + "time_per_iteration": 2.6900947093963623 + }, + { + "auxiliary_loss_clip": 0.01460651, + "auxiliary_loss_mlp": 0.00343899, + "balance_loss_clip": 1.1913588, + "balance_loss_mlp": 0.31168818, + "epoch": 0.39699383736660154, + "flos": 17593714690560.0, + "grad_norm": 6.758090078760853, + "language_loss": 0.9446522, + "learning_rate": 2.7458496908630305e-06, + "loss": 0.96269763, + "num_input_tokens_seen": 141820025, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.32177734, + "step": 6603, + "time_per_iteration": 4.041489124298096 + }, + { + "auxiliary_loss_clip": 0.01457492, + "auxiliary_loss_mlp": 0.00308739, + "balance_loss_clip": 1.18986773, + "balance_loss_mlp": 0.27755341, + "epoch": 0.3970539606192695, + "flos": 17785945301760.0, + "grad_norm": 20.327464045279093, + "language_loss": 0.78147912, + "learning_rate": 2.7454883102037563e-06, + "loss": 0.79914141, + "num_input_tokens_seen": 141838735, + "router_z_loss_clip": 2.67773438, + "router_z_loss_mlp": 0.31176758, + "step": 6604, + "time_per_iteration": 2.691746950149536 + }, + { + "auxiliary_loss_clip": 0.01441717, + "auxiliary_loss_mlp": 0.0031995, + "balance_loss_clip": 1.18542552, + "balance_loss_mlp": 0.29014802, + "epoch": 0.3971140838719375, + "flos": 24789495208320.0, + "grad_norm": 2.9958794009406757, + "language_loss": 0.87171471, + "learning_rate": 2.745126901275491e-06, + "loss": 0.88933134, + "num_input_tokens_seen": 141858090, + "router_z_loss_clip": 2.56445312, + "router_z_loss_mlp": 0.2980957, + "step": 6605, + "time_per_iteration": 4.17153263092041 + }, + { + "auxiliary_loss_clip": 0.01445959, + "auxiliary_loss_mlp": 0.00315674, + "balance_loss_clip": 1.18218863, + "balance_loss_mlp": 0.2844885, + "epoch": 0.39717420712460544, + "flos": 24243581329920.0, + "grad_norm": 5.633861172853512, + "language_loss": 0.81780934, + "learning_rate": 2.7447654640919383e-06, + "loss": 0.83542573, + "num_input_tokens_seen": 141877540, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.31176758, + "step": 6606, + "time_per_iteration": 2.7407524585723877 + }, + { + "auxiliary_loss_clip": 0.01447786, + "auxiliary_loss_mlp": 0.00306291, + "balance_loss_clip": 1.17906094, + "balance_loss_mlp": 0.27400905, + "epoch": 0.3972343303772734, + "flos": 25884698843520.0, + "grad_norm": 647.2578146304963, + "language_loss": 0.82660061, + "learning_rate": 2.744403998666805e-06, + "loss": 0.84414136, + "num_input_tokens_seen": 141897315, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.32299805, + "step": 6607, + "time_per_iteration": 2.712916374206543 + }, + { + "auxiliary_loss_clip": 0.01459948, + "auxiliary_loss_mlp": 0.00326274, + "balance_loss_clip": 1.18592715, + "balance_loss_mlp": 0.29287139, + "epoch": 0.39729445362994137, + "flos": 45623716300800.0, + "grad_norm": 41.18693102897316, + "language_loss": 0.74784374, + "learning_rate": 2.744042505013797e-06, + "loss": 0.76570594, + "num_input_tokens_seen": 141919580, + "router_z_loss_clip": 2.74023438, + "router_z_loss_mlp": 0.33422852, + "step": 6608, + "time_per_iteration": 4.2569053173065186 + }, + { + "auxiliary_loss_clip": 0.01454733, + "auxiliary_loss_mlp": 0.0032029, + "balance_loss_clip": 1.1767025, + "balance_loss_mlp": 0.28621998, + "epoch": 0.39735457688260933, + "flos": 20193971120640.0, + "grad_norm": 37.61526746075015, + "language_loss": 0.80947298, + "learning_rate": 2.7436809831466233e-06, + "loss": 0.82722318, + "num_input_tokens_seen": 141937045, + "router_z_loss_clip": 2.78125, + "router_z_loss_mlp": 0.34057617, + "step": 6609, + "time_per_iteration": 2.7021679878234863 + }, + { + "auxiliary_loss_clip": 0.01459376, + "auxiliary_loss_mlp": 0.00303747, + "balance_loss_clip": 1.18882799, + "balance_loss_mlp": 0.2751132, + "epoch": 0.3974147001352773, + "flos": 23331163029120.0, + "grad_norm": 22.68483436622926, + "language_loss": 0.78472281, + "learning_rate": 2.7433194330789927e-06, + "loss": 0.80235404, + "num_input_tokens_seen": 141956695, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.28637695, + "step": 6610, + "time_per_iteration": 2.724433422088623 + }, + { + "auxiliary_loss_clip": 0.01434319, + "auxiliary_loss_mlp": 0.00301356, + "balance_loss_clip": 1.17060912, + "balance_loss_mlp": 0.27126712, + "epoch": 0.39747482338794526, + "flos": 21688644885120.0, + "grad_norm": 11.217874178177599, + "language_loss": 0.85828096, + "learning_rate": 2.7429578548246133e-06, + "loss": 0.87563765, + "num_input_tokens_seen": 141975935, + "router_z_loss_clip": 2.63671875, + "router_z_loss_mlp": 0.30102539, + "step": 6611, + "time_per_iteration": 2.702512502670288 + }, + { + "auxiliary_loss_clip": 0.0144383, + "auxiliary_loss_mlp": 0.00319483, + "balance_loss_clip": 1.17484486, + "balance_loss_mlp": 0.2899186, + "epoch": 0.3975349466406133, + "flos": 30988717816320.0, + "grad_norm": 4.937033423925227, + "language_loss": 0.85068709, + "learning_rate": 2.7425962483971985e-06, + "loss": 0.86832023, + "num_input_tokens_seen": 141995750, + "router_z_loss_clip": 2.69140625, + "router_z_loss_mlp": 0.2956543, + "step": 6612, + "time_per_iteration": 2.740370273590088 + }, + { + "auxiliary_loss_clip": 0.01372928, + "auxiliary_loss_mlp": 0.00091849, + "balance_loss_clip": 1.19603014, + "balance_loss_mlp": 0.08283718, + "epoch": 0.39759506989328125, + "flos": 63683948833920.0, + "grad_norm": 1.6223500196745415, + "language_loss": 0.64426947, + "learning_rate": 2.742234613810459e-06, + "loss": 0.65891719, + "num_input_tokens_seen": 142057655, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.09033203, + "step": 6613, + "time_per_iteration": 3.0801045894622803 + }, + { + "auxiliary_loss_clip": 0.01444668, + "auxiliary_loss_mlp": 0.00326448, + "balance_loss_clip": 1.17209983, + "balance_loss_mlp": 0.29442862, + "epoch": 0.3976551931459492, + "flos": 23695835857920.0, + "grad_norm": 40.27073122333036, + "language_loss": 0.79188287, + "learning_rate": 2.741872951078109e-06, + "loss": 0.80959404, + "num_input_tokens_seen": 142076020, + "router_z_loss_clip": 2.72460938, + "router_z_loss_mlp": 0.3203125, + "step": 6614, + "time_per_iteration": 4.091978073120117 + }, + { + "auxiliary_loss_clip": 0.01448709, + "auxiliary_loss_mlp": 0.00330526, + "balance_loss_clip": 1.17526054, + "balance_loss_mlp": 0.29864931, + "epoch": 0.3977153163986172, + "flos": 15669657745920.0, + "grad_norm": 8.929224907298694, + "language_loss": 0.88506466, + "learning_rate": 2.741511260213862e-06, + "loss": 0.90285707, + "num_input_tokens_seen": 142093790, + "router_z_loss_clip": 2.73632812, + "router_z_loss_mlp": 0.31848145, + "step": 6615, + "time_per_iteration": 2.6483981609344482 + }, + { + "auxiliary_loss_clip": 0.01442522, + "auxiliary_loss_mlp": 0.00297081, + "balance_loss_clip": 1.16961598, + "balance_loss_mlp": 0.26706421, + "epoch": 0.39777543965128515, + "flos": 14064702249600.0, + "grad_norm": 18.5493636005327, + "language_loss": 0.75566256, + "learning_rate": 2.741149541231434e-06, + "loss": 0.77305853, + "num_input_tokens_seen": 142110545, + "router_z_loss_clip": 2.72460938, + "router_z_loss_mlp": 0.30004883, + "step": 6616, + "time_per_iteration": 2.6497044563293457 + }, + { + "auxiliary_loss_clip": 0.01434443, + "auxiliary_loss_mlp": 0.00306447, + "balance_loss_clip": 1.16162825, + "balance_loss_mlp": 0.2754285, + "epoch": 0.3978355629039531, + "flos": 23367468700800.0, + "grad_norm": 11.441666977512993, + "language_loss": 0.92286038, + "learning_rate": 2.740787794144541e-06, + "loss": 0.94026929, + "num_input_tokens_seen": 142128695, + "router_z_loss_clip": 2.72851562, + "router_z_loss_mlp": 0.31005859, + "step": 6617, + "time_per_iteration": 2.666437864303589 + }, + { + "auxiliary_loss_clip": 0.01428162, + "auxiliary_loss_mlp": 0.003224, + "balance_loss_clip": 1.16489398, + "balance_loss_mlp": 0.29240742, + "epoch": 0.3978956861566211, + "flos": 19062785036160.0, + "grad_norm": 13.248516196794126, + "language_loss": 0.78301913, + "learning_rate": 2.7404260189669e-06, + "loss": 0.80052477, + "num_input_tokens_seen": 142148375, + "router_z_loss_clip": 2.6328125, + "router_z_loss_mlp": 0.30004883, + "step": 6618, + "time_per_iteration": 2.774120807647705 + }, + { + "auxiliary_loss_clip": 0.01430982, + "auxiliary_loss_mlp": 0.00306926, + "balance_loss_clip": 1.16032994, + "balance_loss_mlp": 0.2740719, + "epoch": 0.39795580940928904, + "flos": 30227699341440.0, + "grad_norm": 13232.615980711544, + "language_loss": 0.7311151, + "learning_rate": 2.740064215712231e-06, + "loss": 0.74849427, + "num_input_tokens_seen": 142169735, + "router_z_loss_clip": 2.70703125, + "router_z_loss_mlp": 0.32861328, + "step": 6619, + "time_per_iteration": 2.723377227783203 + }, + { + "auxiliary_loss_clip": 0.01312799, + "auxiliary_loss_mlp": 0.00116189, + "balance_loss_clip": 1.1450057, + "balance_loss_mlp": 0.1056508, + "epoch": 0.398015932661957, + "flos": 69847224906240.0, + "grad_norm": 0.7797192127842372, + "language_loss": 0.58145726, + "learning_rate": 2.7397023843942527e-06, + "loss": 0.59574711, + "num_input_tokens_seen": 142229520, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.10546875, + "step": 6620, + "time_per_iteration": 3.128671646118164 + }, + { + "auxiliary_loss_clip": 0.01416184, + "auxiliary_loss_mlp": 0.00288683, + "balance_loss_clip": 1.15199637, + "balance_loss_mlp": 0.25950027, + "epoch": 0.39807605591462497, + "flos": 20157773189760.0, + "grad_norm": 4.3630488559990965, + "language_loss": 0.85580772, + "learning_rate": 2.739340525026686e-06, + "loss": 0.87285638, + "num_input_tokens_seen": 142247660, + "router_z_loss_clip": 2.64648438, + "router_z_loss_mlp": 0.29199219, + "step": 6621, + "time_per_iteration": 2.6746280193328857 + }, + { + "auxiliary_loss_clip": 0.01403275, + "auxiliary_loss_mlp": 0.00299201, + "balance_loss_clip": 1.14354146, + "balance_loss_mlp": 0.26908833, + "epoch": 0.39813617916729294, + "flos": 21141761339520.0, + "grad_norm": 88.32599439427958, + "language_loss": 0.84977299, + "learning_rate": 2.738978637623252e-06, + "loss": 0.86679775, + "num_input_tokens_seen": 142266990, + "router_z_loss_clip": 2.59570312, + "router_z_loss_mlp": 0.30114746, + "step": 6622, + "time_per_iteration": 2.7428510189056396 + }, + { + "auxiliary_loss_clip": 0.01420062, + "auxiliary_loss_mlp": 0.003027, + "balance_loss_clip": 1.1535821, + "balance_loss_mlp": 0.27051368, + "epoch": 0.3981963024199609, + "flos": 18988485753600.0, + "grad_norm": 37.03750149571474, + "language_loss": 0.81946898, + "learning_rate": 2.738616722197674e-06, + "loss": 0.83669662, + "num_input_tokens_seen": 142287170, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.32177734, + "step": 6623, + "time_per_iteration": 2.949141263961792 + }, + { + "auxiliary_loss_clip": 0.01405941, + "auxiliary_loss_mlp": 0.00303421, + "balance_loss_clip": 1.14476037, + "balance_loss_mlp": 0.27352381, + "epoch": 0.39825642567262887, + "flos": 16575108808320.0, + "grad_norm": 4.141039167794411, + "language_loss": 0.88257802, + "learning_rate": 2.7382547787636766e-06, + "loss": 0.89967167, + "num_input_tokens_seen": 142305405, + "router_z_loss_clip": 2.61132812, + "router_z_loss_mlp": 0.29931641, + "step": 6624, + "time_per_iteration": 2.7515249252319336 + }, + { + "auxiliary_loss_clip": 0.01408457, + "auxiliary_loss_mlp": 0.00313572, + "balance_loss_clip": 1.14137101, + "balance_loss_mlp": 0.28033623, + "epoch": 0.39831654892529683, + "flos": 22199833290240.0, + "grad_norm": 18.791346370814626, + "language_loss": 0.94176853, + "learning_rate": 2.7378928073349832e-06, + "loss": 0.95898879, + "num_input_tokens_seen": 142322710, + "router_z_loss_clip": 2.67382812, + "router_z_loss_mlp": 0.33227539, + "step": 6625, + "time_per_iteration": 2.7426493167877197 + }, + { + "auxiliary_loss_clip": 0.01404575, + "auxiliary_loss_mlp": 0.00313379, + "balance_loss_clip": 1.14202654, + "balance_loss_mlp": 0.28008386, + "epoch": 0.39837667217796485, + "flos": 10487963612160.0, + "grad_norm": 25.13003114444521, + "language_loss": 0.93801785, + "learning_rate": 2.737530807925321e-06, + "loss": 0.95519745, + "num_input_tokens_seen": 142338535, + "router_z_loss_clip": 2.62695312, + "router_z_loss_mlp": 0.33288574, + "step": 6626, + "time_per_iteration": 2.767620325088501 + }, + { + "auxiliary_loss_clip": 0.01401424, + "auxiliary_loss_mlp": 0.00321382, + "balance_loss_clip": 1.13880324, + "balance_loss_mlp": 0.29053044, + "epoch": 0.3984367954306328, + "flos": 17965282930560.0, + "grad_norm": 12.457675447490075, + "language_loss": 0.91886425, + "learning_rate": 2.737168780548417e-06, + "loss": 0.93609226, + "num_input_tokens_seen": 142354570, + "router_z_loss_clip": 2.62890625, + "router_z_loss_mlp": 0.30859375, + "step": 6627, + "time_per_iteration": 2.692538022994995 + }, + { + "auxiliary_loss_clip": 0.01390105, + "auxiliary_loss_mlp": 0.00285516, + "balance_loss_clip": 1.12744558, + "balance_loss_mlp": 0.25754941, + "epoch": 0.3984969186833008, + "flos": 22711057608960.0, + "grad_norm": 82.3527991664246, + "language_loss": 0.87918675, + "learning_rate": 2.736806725217998e-06, + "loss": 0.89594293, + "num_input_tokens_seen": 142374395, + "router_z_loss_clip": 2.62304688, + "router_z_loss_mlp": 0.27978516, + "step": 6628, + "time_per_iteration": 2.7396485805511475 + }, + { + "auxiliary_loss_clip": 0.01402356, + "auxiliary_loss_mlp": 0.00319773, + "balance_loss_clip": 1.13619447, + "balance_loss_mlp": 0.28879064, + "epoch": 0.39855704193596875, + "flos": 23405785534080.0, + "grad_norm": 25.495820308451297, + "language_loss": 0.79075933, + "learning_rate": 2.7364446419477945e-06, + "loss": 0.80798066, + "num_input_tokens_seen": 142396040, + "router_z_loss_clip": 2.66210938, + "router_z_loss_mlp": 0.30969238, + "step": 6629, + "time_per_iteration": 2.7774717807769775 + }, + { + "auxiliary_loss_clip": 0.01376663, + "auxiliary_loss_mlp": 0.00307652, + "balance_loss_clip": 1.12301397, + "balance_loss_mlp": 0.27734894, + "epoch": 0.3986171651886367, + "flos": 21251935330560.0, + "grad_norm": 88.71973461865431, + "language_loss": 0.866207, + "learning_rate": 2.7360825307515366e-06, + "loss": 0.8830502, + "num_input_tokens_seen": 142415495, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.30273438, + "step": 6630, + "time_per_iteration": 2.6875522136688232 + }, + { + "auxiliary_loss_clip": 0.01383175, + "auxiliary_loss_mlp": 0.0030915, + "balance_loss_clip": 1.12665105, + "balance_loss_mlp": 0.27906132, + "epoch": 0.3986772884413047, + "flos": 12458705258880.0, + "grad_norm": 110.56265094446617, + "language_loss": 0.82033134, + "learning_rate": 2.7357203916429555e-06, + "loss": 0.83725464, + "num_input_tokens_seen": 142431865, + "router_z_loss_clip": 2.56835938, + "router_z_loss_mlp": 0.30078125, + "step": 6631, + "time_per_iteration": 2.6572818756103516 + }, + { + "auxiliary_loss_clip": 0.01378731, + "auxiliary_loss_mlp": 0.00315112, + "balance_loss_clip": 1.11865807, + "balance_loss_mlp": 0.28285396, + "epoch": 0.39873741169397264, + "flos": 19646117907840.0, + "grad_norm": 7.279050144644968, + "language_loss": 0.80259985, + "learning_rate": 2.735358224635783e-06, + "loss": 0.8195383, + "num_input_tokens_seen": 142450595, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.32299805, + "step": 6632, + "time_per_iteration": 2.7034547328948975 + }, + { + "auxiliary_loss_clip": 0.01372968, + "auxiliary_loss_mlp": 0.0029664, + "balance_loss_clip": 1.12027812, + "balance_loss_mlp": 0.26900721, + "epoch": 0.3987975349466406, + "flos": 21684766216320.0, + "grad_norm": 137.2862486288265, + "language_loss": 0.81928241, + "learning_rate": 2.7349960297437533e-06, + "loss": 0.83597857, + "num_input_tokens_seen": 142466650, + "router_z_loss_clip": 2.52734375, + "router_z_loss_mlp": 0.27624512, + "step": 6633, + "time_per_iteration": 2.6177995204925537 + }, + { + "auxiliary_loss_clip": 0.01374964, + "auxiliary_loss_mlp": 0.0031687, + "balance_loss_clip": 1.11884892, + "balance_loss_mlp": 0.28694803, + "epoch": 0.3988576581993086, + "flos": 23914064937600.0, + "grad_norm": 17.620894463998365, + "language_loss": 0.87070286, + "learning_rate": 2.7346338069806e-06, + "loss": 0.88762128, + "num_input_tokens_seen": 142486165, + "router_z_loss_clip": 2.56054688, + "router_z_loss_mlp": 0.29882812, + "step": 6634, + "time_per_iteration": 2.6756954193115234 + }, + { + "auxiliary_loss_clip": 0.01383833, + "auxiliary_loss_mlp": 0.00311126, + "balance_loss_clip": 1.12639713, + "balance_loss_mlp": 0.28056037, + "epoch": 0.39891778145197654, + "flos": 18149899858560.0, + "grad_norm": 102.77095001274809, + "language_loss": 0.82448971, + "learning_rate": 2.7342715563600597e-06, + "loss": 0.84143925, + "num_input_tokens_seen": 142505035, + "router_z_loss_clip": 2.57421875, + "router_z_loss_mlp": 0.3059082, + "step": 6635, + "time_per_iteration": 2.6367125511169434 + }, + { + "auxiliary_loss_clip": 0.01384114, + "auxiliary_loss_mlp": 0.002839, + "balance_loss_clip": 1.12501073, + "balance_loss_mlp": 0.25575459, + "epoch": 0.3989779047046445, + "flos": 22595281096320.0, + "grad_norm": 15.110911951521603, + "language_loss": 0.75469685, + "learning_rate": 2.733909277895868e-06, + "loss": 0.77137709, + "num_input_tokens_seen": 142521870, + "router_z_loss_clip": 2.58984375, + "router_z_loss_mlp": 0.28137207, + "step": 6636, + "time_per_iteration": 2.70432448387146 + }, + { + "auxiliary_loss_clip": 0.01366071, + "auxiliary_loss_mlp": 0.00309814, + "balance_loss_clip": 1.11778951, + "balance_loss_mlp": 0.28048861, + "epoch": 0.39903802795731247, + "flos": 18077216688000.0, + "grad_norm": 7.517464780227158, + "language_loss": 0.88892484, + "learning_rate": 2.733546971601763e-06, + "loss": 0.90568376, + "num_input_tokens_seen": 142540455, + "router_z_loss_clip": 2.48046875, + "router_z_loss_mlp": 0.29321289, + "step": 6637, + "time_per_iteration": 2.600917100906372 + }, + { + "auxiliary_loss_clip": 0.01295164, + "auxiliary_loss_mlp": 0.00106355, + "balance_loss_clip": 1.11816454, + "balance_loss_mlp": 0.09739094, + "epoch": 0.39909815120998043, + "flos": 70441367771520.0, + "grad_norm": 0.7053648572337116, + "language_loss": 0.53017914, + "learning_rate": 2.733184637491484e-06, + "loss": 0.54419434, + "num_input_tokens_seen": 142599665, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.08984375, + "step": 6638, + "time_per_iteration": 3.20635986328125 + }, + { + "auxiliary_loss_clip": 0.01375218, + "auxiliary_loss_mlp": 0.00319186, + "balance_loss_clip": 1.12449133, + "balance_loss_mlp": 0.28771514, + "epoch": 0.39915827446264845, + "flos": 18549262247040.0, + "grad_norm": 47.8180856261843, + "language_loss": 0.81303912, + "learning_rate": 2.732822275578769e-06, + "loss": 0.82998317, + "num_input_tokens_seen": 142618845, + "router_z_loss_clip": 2.5078125, + "router_z_loss_mlp": 0.31469727, + "step": 6639, + "time_per_iteration": 2.665512800216675 + }, + { + "auxiliary_loss_clip": 0.01385418, + "auxiliary_loss_mlp": 0.00312752, + "balance_loss_clip": 1.13449609, + "balance_loss_mlp": 0.28151876, + "epoch": 0.3992183977153164, + "flos": 29897249195520.0, + "grad_norm": 5.190464538460886, + "language_loss": 0.82404244, + "learning_rate": 2.7324598858773603e-06, + "loss": 0.84102404, + "num_input_tokens_seen": 142640885, + "router_z_loss_clip": 2.51171875, + "router_z_loss_mlp": 0.31176758, + "step": 6640, + "time_per_iteration": 2.809905529022217 + }, + { + "auxiliary_loss_clip": 0.01376402, + "auxiliary_loss_mlp": 0.00312988, + "balance_loss_clip": 1.1237061, + "balance_loss_mlp": 0.28356695, + "epoch": 0.3992785209679844, + "flos": 22565080736640.0, + "grad_norm": 11.915435124975641, + "language_loss": 0.89323068, + "learning_rate": 2.7320974684009996e-06, + "loss": 0.9101246, + "num_input_tokens_seen": 142659340, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.29418945, + "step": 6641, + "time_per_iteration": 2.6601693630218506 + }, + { + "auxiliary_loss_clip": 0.01398189, + "auxiliary_loss_mlp": 0.00316749, + "balance_loss_clip": 1.14343357, + "balance_loss_mlp": 0.28664857, + "epoch": 0.39933864422065235, + "flos": 19682674974720.0, + "grad_norm": 6.191067955207903, + "language_loss": 0.91691715, + "learning_rate": 2.7317350231634288e-06, + "loss": 0.93406653, + "num_input_tokens_seen": 142677085, + "router_z_loss_clip": 2.546875, + "router_z_loss_mlp": 0.30078125, + "step": 6642, + "time_per_iteration": 2.6519598960876465 + }, + { + "auxiliary_loss_clip": 0.0140225, + "auxiliary_loss_mlp": 0.00331697, + "balance_loss_clip": 1.14379168, + "balance_loss_mlp": 0.30017799, + "epoch": 0.3993987674733203, + "flos": 23038491012480.0, + "grad_norm": 22.886459462787073, + "language_loss": 0.81036353, + "learning_rate": 2.731372550178393e-06, + "loss": 0.827703, + "num_input_tokens_seen": 142694595, + "router_z_loss_clip": 2.58203125, + "router_z_loss_mlp": 0.31518555, + "step": 6643, + "time_per_iteration": 2.650790214538574 + }, + { + "auxiliary_loss_clip": 0.01405133, + "auxiliary_loss_mlp": 0.00331413, + "balance_loss_clip": 1.148826, + "balance_loss_mlp": 0.29927409, + "epoch": 0.3994588907259883, + "flos": 19390828970880.0, + "grad_norm": 3.3518122566900606, + "language_loss": 0.72937787, + "learning_rate": 2.7310100494596375e-06, + "loss": 0.74674332, + "num_input_tokens_seen": 142714175, + "router_z_loss_clip": 2.56640625, + "router_z_loss_mlp": 0.3215332, + "step": 6644, + "time_per_iteration": 2.674323320388794 + }, + { + "auxiliary_loss_clip": 0.01395466, + "auxiliary_loss_mlp": 0.00325959, + "balance_loss_clip": 1.14032328, + "balance_loss_mlp": 0.29399937, + "epoch": 0.39951901397865625, + "flos": 13734395758080.0, + "grad_norm": 13.74551547148261, + "language_loss": 0.85918379, + "learning_rate": 2.730647521020907e-06, + "loss": 0.87639797, + "num_input_tokens_seen": 142730955, + "router_z_loss_clip": 2.55078125, + "router_z_loss_mlp": 0.31945801, + "step": 6645, + "time_per_iteration": 4.142471075057983 + }, + { + "auxiliary_loss_clip": 0.01403043, + "auxiliary_loss_mlp": 0.003398, + "balance_loss_clip": 1.14820182, + "balance_loss_mlp": 0.30849546, + "epoch": 0.3995791372313242, + "flos": 23586451966080.0, + "grad_norm": 3.62687885603037, + "language_loss": 0.76616895, + "learning_rate": 2.73028496487595e-06, + "loss": 0.78359735, + "num_input_tokens_seen": 142751200, + "router_z_loss_clip": 2.54882812, + "router_z_loss_mlp": 0.31323242, + "step": 6646, + "time_per_iteration": 2.6528072357177734 + }, + { + "auxiliary_loss_clip": 0.01410247, + "auxiliary_loss_mlp": 0.00320419, + "balance_loss_clip": 1.15130424, + "balance_loss_mlp": 0.28825676, + "epoch": 0.3996392604839922, + "flos": 21355896268800.0, + "grad_norm": 154.03353386673538, + "language_loss": 0.78757524, + "learning_rate": 2.729922381038513e-06, + "loss": 0.80488193, + "num_input_tokens_seen": 142770170, + "router_z_loss_clip": 2.58789062, + "router_z_loss_mlp": 0.32177734, + "step": 6647, + "time_per_iteration": 4.121215581893921 + }, + { + "auxiliary_loss_clip": 0.01403891, + "auxiliary_loss_mlp": 0.00327951, + "balance_loss_clip": 1.15548003, + "balance_loss_mlp": 0.29798174, + "epoch": 0.39969938373666014, + "flos": 26032255914240.0, + "grad_norm": 18.175798040939558, + "language_loss": 0.80103505, + "learning_rate": 2.7295597695223463e-06, + "loss": 0.81835347, + "num_input_tokens_seen": 142792680, + "router_z_loss_clip": 2.48242188, + "router_z_loss_mlp": 0.29980469, + "step": 6648, + "time_per_iteration": 2.722133159637451 + }, + { + "auxiliary_loss_clip": 0.01418745, + "auxiliary_loss_mlp": 0.00329834, + "balance_loss_clip": 1.16373694, + "balance_loss_mlp": 0.30094942, + "epoch": 0.3997595069893281, + "flos": 20116367786880.0, + "grad_norm": 34.835189834158186, + "language_loss": 0.71308076, + "learning_rate": 2.7291971303412006e-06, + "loss": 0.7305665, + "num_input_tokens_seen": 142810510, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.28894043, + "step": 6649, + "time_per_iteration": 2.637627124786377 + }, + { + "auxiliary_loss_clip": 0.01414264, + "auxiliary_loss_mlp": 0.00365837, + "balance_loss_clip": 1.16130447, + "balance_loss_mlp": 0.33365071, + "epoch": 0.39981963024199607, + "flos": 27783403764480.0, + "grad_norm": 49.179026632357356, + "language_loss": 0.83642936, + "learning_rate": 2.728834463508826e-06, + "loss": 0.85423034, + "num_input_tokens_seen": 142832455, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.32202148, + "step": 6650, + "time_per_iteration": 4.136972665786743 + }, + { + "auxiliary_loss_clip": 0.01414165, + "auxiliary_loss_mlp": 0.00333303, + "balance_loss_clip": 1.1599586, + "balance_loss_mlp": 0.30345285, + "epoch": 0.39987975349466404, + "flos": 21944436612480.0, + "grad_norm": 21.414485971490652, + "language_loss": 0.77655619, + "learning_rate": 2.728471769038975e-06, + "loss": 0.79403085, + "num_input_tokens_seen": 142852590, + "router_z_loss_clip": 2.54492188, + "router_z_loss_mlp": 0.29858398, + "step": 6651, + "time_per_iteration": 2.674011468887329 + }, + { + "auxiliary_loss_clip": 0.01427629, + "auxiliary_loss_mlp": 0.00393604, + "balance_loss_clip": 1.16704762, + "balance_loss_mlp": 0.35929531, + "epoch": 0.39993987674733206, + "flos": 20704405340160.0, + "grad_norm": 284.23651455035167, + "language_loss": 0.81215632, + "learning_rate": 2.728109046945403e-06, + "loss": 0.83036864, + "num_input_tokens_seen": 142870595, + "router_z_loss_clip": 2.609375, + "router_z_loss_mlp": 0.34301758, + "step": 6652, + "time_per_iteration": 2.696906566619873 + }, + { + "auxiliary_loss_clip": 0.01350404, + "auxiliary_loss_mlp": 0.00147955, + "balance_loss_clip": 1.17307448, + "balance_loss_mlp": 0.13746475, + "epoch": 0.4, + "flos": 61525429862400.0, + "grad_norm": 0.8600863865416475, + "language_loss": 0.59982121, + "learning_rate": 2.727746297241862e-06, + "loss": 0.61480474, + "num_input_tokens_seen": 142925805, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.10498047, + "step": 6653, + "time_per_iteration": 3.049694061279297 + }, + { + "auxiliary_loss_clip": 0.01419922, + "auxiliary_loss_mlp": 0.00300915, + "balance_loss_clip": 1.17410111, + "balance_loss_mlp": 0.27309182, + "epoch": 0.400060123252668, + "flos": 14502309644160.0, + "grad_norm": 114.96780173480907, + "language_loss": 0.74397117, + "learning_rate": 2.7273835199421085e-06, + "loss": 0.76117957, + "num_input_tokens_seen": 142943145, + "router_z_loss_clip": 2.45507812, + "router_z_loss_mlp": 0.27856445, + "step": 6654, + "time_per_iteration": 2.674602508544922 + }, + { + "auxiliary_loss_clip": 0.01411783, + "auxiliary_loss_mlp": 0.00370713, + "balance_loss_clip": 1.1609714, + "balance_loss_mlp": 0.3370955, + "epoch": 0.40012024650533595, + "flos": 19093308618240.0, + "grad_norm": 11.9482588240532, + "language_loss": 0.96239978, + "learning_rate": 2.7270207150599e-06, + "loss": 0.98022473, + "num_input_tokens_seen": 142956925, + "router_z_loss_clip": 2.5078125, + "router_z_loss_mlp": 0.33569336, + "step": 6655, + "time_per_iteration": 2.6430020332336426 + }, + { + "auxiliary_loss_clip": 0.0140492, + "auxiliary_loss_mlp": 0.00324711, + "balance_loss_clip": 1.15845597, + "balance_loss_mlp": 0.29526627, + "epoch": 0.4001803697580039, + "flos": 29351012094720.0, + "grad_norm": 28.430699216117382, + "language_loss": 0.7774235, + "learning_rate": 2.7266578826089917e-06, + "loss": 0.79471982, + "num_input_tokens_seen": 142978040, + "router_z_loss_clip": 2.46679688, + "router_z_loss_mlp": 0.29443359, + "step": 6656, + "time_per_iteration": 2.696978807449341 + }, + { + "auxiliary_loss_clip": 0.01424392, + "auxiliary_loss_mlp": 0.00334661, + "balance_loss_clip": 1.16948462, + "balance_loss_mlp": 0.30271333, + "epoch": 0.4002404930106719, + "flos": 20920048640640.0, + "grad_norm": 6.892117674552533, + "language_loss": 0.79036832, + "learning_rate": 2.726295022603144e-06, + "loss": 0.80795884, + "num_input_tokens_seen": 142998390, + "router_z_loss_clip": 2.55078125, + "router_z_loss_mlp": 0.31933594, + "step": 6657, + "time_per_iteration": 2.8217923641204834 + }, + { + "auxiliary_loss_clip": 0.01435378, + "auxiliary_loss_mlp": 0.00329581, + "balance_loss_clip": 1.17848647, + "balance_loss_mlp": 0.29560632, + "epoch": 0.40030061626333985, + "flos": 28405735827840.0, + "grad_norm": 2.2639142427127723, + "language_loss": 0.85588044, + "learning_rate": 2.725932135056117e-06, + "loss": 0.87353003, + "num_input_tokens_seen": 143021505, + "router_z_loss_clip": 2.57226562, + "router_z_loss_mlp": 0.33959961, + "step": 6658, + "time_per_iteration": 2.8139142990112305 + }, + { + "auxiliary_loss_clip": 0.0142651, + "auxiliary_loss_mlp": 0.00311404, + "balance_loss_clip": 1.17005885, + "balance_loss_mlp": 0.28126782, + "epoch": 0.4003607395160078, + "flos": 25921615046400.0, + "grad_norm": 149.36320769837982, + "language_loss": 0.83419865, + "learning_rate": 2.72556921998167e-06, + "loss": 0.85157776, + "num_input_tokens_seen": 143041375, + "router_z_loss_clip": 2.5625, + "router_z_loss_mlp": 0.30151367, + "step": 6659, + "time_per_iteration": 2.799391031265259 + }, + { + "auxiliary_loss_clip": 0.01395655, + "auxiliary_loss_mlp": 0.00321009, + "balance_loss_clip": 1.1496253, + "balance_loss_mlp": 0.29073024, + "epoch": 0.4004208627686758, + "flos": 20768648814720.0, + "grad_norm": 265.2062231912479, + "language_loss": 0.78212911, + "learning_rate": 2.7252062773935662e-06, + "loss": 0.79929578, + "num_input_tokens_seen": 143058725, + "router_z_loss_clip": 2.4609375, + "router_z_loss_mlp": 0.30297852, + "step": 6660, + "time_per_iteration": 2.662999391555786 + }, + { + "auxiliary_loss_clip": 0.01420219, + "auxiliary_loss_mlp": 0.00319355, + "balance_loss_clip": 1.16595411, + "balance_loss_mlp": 0.2887181, + "epoch": 0.40048098602134374, + "flos": 24681224638080.0, + "grad_norm": 45.82129070569097, + "language_loss": 0.76853389, + "learning_rate": 2.7248433073055674e-06, + "loss": 0.78592962, + "num_input_tokens_seen": 143076995, + "router_z_loss_clip": 2.54296875, + "router_z_loss_mlp": 0.30664062, + "step": 6661, + "time_per_iteration": 2.6707546710968018 + }, + { + "auxiliary_loss_clip": 0.01426213, + "auxiliary_loss_mlp": 0.00327603, + "balance_loss_clip": 1.16617596, + "balance_loss_mlp": 0.29665661, + "epoch": 0.4005411092740117, + "flos": 23185688947200.0, + "grad_norm": 7.733232329230433, + "language_loss": 0.81258756, + "learning_rate": 2.724480309731437e-06, + "loss": 0.83012569, + "num_input_tokens_seen": 143096780, + "router_z_loss_clip": 2.60546875, + "router_z_loss_mlp": 0.30957031, + "step": 6662, + "time_per_iteration": 2.6551225185394287 + }, + { + "auxiliary_loss_clip": 0.01423402, + "auxiliary_loss_mlp": 0.00296868, + "balance_loss_clip": 1.16408062, + "balance_loss_mlp": 0.26701784, + "epoch": 0.4006012325266797, + "flos": 17522324409600.0, + "grad_norm": 11.204425431374172, + "language_loss": 0.73853654, + "learning_rate": 2.7241172846849417e-06, + "loss": 0.75573921, + "num_input_tokens_seen": 143112590, + "router_z_loss_clip": 2.59179688, + "router_z_loss_mlp": 0.29858398, + "step": 6663, + "time_per_iteration": 2.806070566177368 + }, + { + "auxiliary_loss_clip": 0.01426618, + "auxiliary_loss_mlp": 0.00334622, + "balance_loss_clip": 1.17014968, + "balance_loss_mlp": 0.30233964, + "epoch": 0.40066135577934764, + "flos": 19857200181120.0, + "grad_norm": 33.4272767852, + "language_loss": 0.9389137, + "learning_rate": 2.7237542321798455e-06, + "loss": 0.95652616, + "num_input_tokens_seen": 143130220, + "router_z_loss_clip": 2.56054688, + "router_z_loss_mlp": 0.32299805, + "step": 6664, + "time_per_iteration": 2.746793746948242 + }, + { + "auxiliary_loss_clip": 0.01437669, + "auxiliary_loss_mlp": 0.00339765, + "balance_loss_clip": 1.17356253, + "balance_loss_mlp": 0.3081508, + "epoch": 0.40072147903201566, + "flos": 18150007599360.0, + "grad_norm": 6.2736858869562075, + "language_loss": 0.91643441, + "learning_rate": 2.723391152229917e-06, + "loss": 0.93420875, + "num_input_tokens_seen": 143147160, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.31640625, + "step": 6665, + "time_per_iteration": 2.7651543617248535 + }, + { + "auxiliary_loss_clip": 0.01445403, + "auxiliary_loss_mlp": 0.00327571, + "balance_loss_clip": 1.1842494, + "balance_loss_mlp": 0.29474038, + "epoch": 0.4007816022846836, + "flos": 18661267831680.0, + "grad_norm": 29.68208446402844, + "language_loss": 0.83507073, + "learning_rate": 2.7230280448489236e-06, + "loss": 0.85280049, + "num_input_tokens_seen": 143164605, + "router_z_loss_clip": 2.61132812, + "router_z_loss_mlp": 0.32836914, + "step": 6666, + "time_per_iteration": 2.689385414123535 + }, + { + "auxiliary_loss_clip": 0.01446935, + "auxiliary_loss_mlp": 0.0034218, + "balance_loss_clip": 1.18698883, + "balance_loss_mlp": 0.31047022, + "epoch": 0.4008417255373516, + "flos": 25703170485120.0, + "grad_norm": 4316.617313332409, + "language_loss": 0.82483011, + "learning_rate": 2.7226649100506333e-06, + "loss": 0.84272128, + "num_input_tokens_seen": 143183965, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.31689453, + "step": 6667, + "time_per_iteration": 2.6882448196411133 + }, + { + "auxiliary_loss_clip": 0.0143969, + "auxiliary_loss_mlp": 0.00344426, + "balance_loss_clip": 1.17990434, + "balance_loss_mlp": 0.31440958, + "epoch": 0.40090184879001955, + "flos": 22858614679680.0, + "grad_norm": 4.373300696033577, + "language_loss": 0.82265091, + "learning_rate": 2.7223017478488183e-06, + "loss": 0.84049207, + "num_input_tokens_seen": 143204965, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.30004883, + "step": 6668, + "time_per_iteration": 2.700854778289795 + }, + { + "auxiliary_loss_clip": 0.01442209, + "auxiliary_loss_mlp": 0.00303681, + "balance_loss_clip": 1.18458986, + "balance_loss_mlp": 0.27652559, + "epoch": 0.4009619720426875, + "flos": 29059848449280.0, + "grad_norm": 10.204492118611837, + "language_loss": 0.90986276, + "learning_rate": 2.721938558257248e-06, + "loss": 0.92732167, + "num_input_tokens_seen": 143225015, + "router_z_loss_clip": 2.58007812, + "router_z_loss_mlp": 0.27148438, + "step": 6669, + "time_per_iteration": 2.7382924556732178 + }, + { + "auxiliary_loss_clip": 0.01411253, + "auxiliary_loss_mlp": 0.00213896, + "balance_loss_clip": 1.22700298, + "balance_loss_mlp": 0.20092554, + "epoch": 0.4010220952953555, + "flos": 66059763131520.0, + "grad_norm": 0.7066692509830752, + "language_loss": 0.53076714, + "learning_rate": 2.721575341289695e-06, + "loss": 0.54701865, + "num_input_tokens_seen": 143294925, + "router_z_loss_clip": 1.84375, + "router_z_loss_mlp": 0.12988281, + "step": 6670, + "time_per_iteration": 3.362952709197998 + }, + { + "auxiliary_loss_clip": 0.01450848, + "auxiliary_loss_mlp": 0.00369222, + "balance_loss_clip": 1.19475329, + "balance_loss_mlp": 0.33667746, + "epoch": 0.40108221854802345, + "flos": 29642822184960.0, + "grad_norm": 3.7389537580788934, + "language_loss": 0.93595088, + "learning_rate": 2.7212120969599333e-06, + "loss": 0.95415151, + "num_input_tokens_seen": 143314170, + "router_z_loss_clip": 2.55859375, + "router_z_loss_mlp": 0.32543945, + "step": 6671, + "time_per_iteration": 2.738396644592285 + }, + { + "auxiliary_loss_clip": 0.01453968, + "auxiliary_loss_mlp": 0.00375575, + "balance_loss_clip": 1.18824339, + "balance_loss_mlp": 0.34174341, + "epoch": 0.4011423418006914, + "flos": 19929560129280.0, + "grad_norm": 4.275056493317617, + "language_loss": 0.84315956, + "learning_rate": 2.720848825281736e-06, + "loss": 0.86145496, + "num_input_tokens_seen": 143330050, + "router_z_loss_clip": 2.65625, + "router_z_loss_mlp": 0.33862305, + "step": 6672, + "time_per_iteration": 2.6326746940612793 + }, + { + "auxiliary_loss_clip": 0.01450859, + "auxiliary_loss_mlp": 0.00391226, + "balance_loss_clip": 1.18891537, + "balance_loss_mlp": 0.35720408, + "epoch": 0.4012024650533594, + "flos": 20084299920000.0, + "grad_norm": 10.646086222082266, + "language_loss": 0.73296821, + "learning_rate": 2.72048552626888e-06, + "loss": 0.75138903, + "num_input_tokens_seen": 143348650, + "router_z_loss_clip": 2.62109375, + "router_z_loss_mlp": 0.34033203, + "step": 6673, + "time_per_iteration": 2.6665260791778564 + }, + { + "auxiliary_loss_clip": 0.01454296, + "auxiliary_loss_mlp": 0.0035133, + "balance_loss_clip": 1.18906081, + "balance_loss_mlp": 0.32124203, + "epoch": 0.40126258830602735, + "flos": 21695719864320.0, + "grad_norm": 14.9682207660407, + "language_loss": 0.85422671, + "learning_rate": 2.7201221999351402e-06, + "loss": 0.87228292, + "num_input_tokens_seen": 143370275, + "router_z_loss_clip": 2.65234375, + "router_z_loss_mlp": 0.30041504, + "step": 6674, + "time_per_iteration": 2.7273263931274414 + }, + { + "auxiliary_loss_clip": 0.01461739, + "auxiliary_loss_mlp": 0.0036171, + "balance_loss_clip": 1.19502616, + "balance_loss_mlp": 0.32668591, + "epoch": 0.4013227115586953, + "flos": 12020379592320.0, + "grad_norm": 48.78687381316823, + "language_loss": 0.9236744, + "learning_rate": 2.719758846294294e-06, + "loss": 0.94190896, + "num_input_tokens_seen": 143385390, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.35009766, + "step": 6675, + "time_per_iteration": 2.6525962352752686 + }, + { + "auxiliary_loss_clip": 0.01460111, + "auxiliary_loss_mlp": 0.0037383, + "balance_loss_clip": 1.19473743, + "balance_loss_mlp": 0.33711314, + "epoch": 0.4013828348113633, + "flos": 25447522412160.0, + "grad_norm": 533.4624324695498, + "language_loss": 0.99072212, + "learning_rate": 2.71939546536012e-06, + "loss": 1.00906146, + "num_input_tokens_seen": 143404215, + "router_z_loss_clip": 2.65234375, + "router_z_loss_mlp": 0.36694336, + "step": 6676, + "time_per_iteration": 2.699455976486206 + }, + { + "auxiliary_loss_clip": 0.01455672, + "auxiliary_loss_mlp": 0.00375843, + "balance_loss_clip": 1.18951321, + "balance_loss_mlp": 0.34296465, + "epoch": 0.40144295806403124, + "flos": 18582946225920.0, + "grad_norm": 40.12522483833791, + "language_loss": 0.87387913, + "learning_rate": 2.719032057146399e-06, + "loss": 0.89219421, + "num_input_tokens_seen": 143422245, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.32861328, + "step": 6677, + "time_per_iteration": 2.649977684020996 + }, + { + "auxiliary_loss_clip": 0.01464133, + "auxiliary_loss_mlp": 0.00368141, + "balance_loss_clip": 1.20064533, + "balance_loss_mlp": 0.33552498, + "epoch": 0.4015030813166992, + "flos": 22930220442240.0, + "grad_norm": 12.139587983986777, + "language_loss": 0.91062999, + "learning_rate": 2.71866862166691e-06, + "loss": 0.92895269, + "num_input_tokens_seen": 143443130, + "router_z_loss_clip": 2.63671875, + "router_z_loss_mlp": 0.32617188, + "step": 6678, + "time_per_iteration": 2.6780178546905518 + }, + { + "auxiliary_loss_clip": 0.01459226, + "auxiliary_loss_mlp": 0.00379906, + "balance_loss_clip": 1.19568157, + "balance_loss_mlp": 0.3462415, + "epoch": 0.4015632045693672, + "flos": 20595057361920.0, + "grad_norm": 16.082019177064296, + "language_loss": 0.71167862, + "learning_rate": 2.718305158935434e-06, + "loss": 0.73006994, + "num_input_tokens_seen": 143461385, + "router_z_loss_clip": 2.63671875, + "router_z_loss_mlp": 0.33691406, + "step": 6679, + "time_per_iteration": 2.700611114501953 + }, + { + "auxiliary_loss_clip": 0.01448398, + "auxiliary_loss_mlp": 0.00386181, + "balance_loss_clip": 1.18775129, + "balance_loss_mlp": 0.3522298, + "epoch": 0.4016233278220352, + "flos": 23438930808960.0, + "grad_norm": 3.4443152844304032, + "language_loss": 0.83007312, + "learning_rate": 2.7179416689657554e-06, + "loss": 0.84841889, + "num_input_tokens_seen": 143481750, + "router_z_loss_clip": 2.60546875, + "router_z_loss_mlp": 0.33959961, + "step": 6680, + "time_per_iteration": 2.795255661010742 + }, + { + "auxiliary_loss_clip": 0.01468904, + "auxiliary_loss_mlp": 0.00445482, + "balance_loss_clip": 1.19747734, + "balance_loss_mlp": 0.40921855, + "epoch": 0.40168345107470316, + "flos": 21431057477760.0, + "grad_norm": 7086.790220004393, + "language_loss": 0.8086884, + "learning_rate": 2.7175781517716556e-06, + "loss": 0.82783222, + "num_input_tokens_seen": 143501540, + "router_z_loss_clip": 2.71484375, + "router_z_loss_mlp": 0.36230469, + "step": 6681, + "time_per_iteration": 2.7298166751861572 + }, + { + "auxiliary_loss_clip": 0.014768, + "auxiliary_loss_mlp": 0.00395029, + "balance_loss_clip": 1.20774913, + "balance_loss_mlp": 0.36176962, + "epoch": 0.4017435743273711, + "flos": 22857214049280.0, + "grad_norm": 3.2897154929116974, + "language_loss": 0.69885671, + "learning_rate": 2.7172146073669213e-06, + "loss": 0.71757495, + "num_input_tokens_seen": 143520530, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.33251953, + "step": 6682, + "time_per_iteration": 2.684739828109741 + }, + { + "auxiliary_loss_clip": 0.01457244, + "auxiliary_loss_mlp": 0.00398262, + "balance_loss_clip": 1.18862021, + "balance_loss_mlp": 0.36156917, + "epoch": 0.4018036975800391, + "flos": 28622312881920.0, + "grad_norm": 16.645088347547606, + "language_loss": 0.79805189, + "learning_rate": 2.716851035765337e-06, + "loss": 0.816607, + "num_input_tokens_seen": 143540210, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.36694336, + "step": 6683, + "time_per_iteration": 2.752300500869751 + }, + { + "auxiliary_loss_clip": 0.0146536, + "auxiliary_loss_mlp": 0.00369742, + "balance_loss_clip": 1.20181322, + "balance_loss_mlp": 0.33650681, + "epoch": 0.40186382083270705, + "flos": 26651212099200.0, + "grad_norm": 4.66712898342189, + "language_loss": 0.79784226, + "learning_rate": 2.7164874369806896e-06, + "loss": 0.81619328, + "num_input_tokens_seen": 143560940, + "router_z_loss_clip": 2.63476562, + "router_z_loss_mlp": 0.33251953, + "step": 6684, + "time_per_iteration": 2.7237842082977295 + }, + { + "auxiliary_loss_clip": 0.01429243, + "auxiliary_loss_mlp": 0.00167097, + "balance_loss_clip": 1.24041212, + "balance_loss_mlp": 0.15632053, + "epoch": 0.401923944085375, + "flos": 59259969123840.0, + "grad_norm": 0.821013106554311, + "language_loss": 0.6035012, + "learning_rate": 2.716123811026767e-06, + "loss": 0.61946464, + "num_input_tokens_seen": 143624015, + "router_z_loss_clip": 1.890625, + "router_z_loss_mlp": 0.10791016, + "step": 6685, + "time_per_iteration": 3.268524408340454 + }, + { + "auxiliary_loss_clip": 0.0147827, + "auxiliary_loss_mlp": 0.00414803, + "balance_loss_clip": 1.20789099, + "balance_loss_mlp": 0.37772942, + "epoch": 0.401984067338043, + "flos": 16982803152000.0, + "grad_norm": 5.200403744280799, + "language_loss": 0.76771611, + "learning_rate": 2.715760157917357e-06, + "loss": 0.78664684, + "num_input_tokens_seen": 143642750, + "router_z_loss_clip": 2.70507812, + "router_z_loss_mlp": 0.37109375, + "step": 6686, + "time_per_iteration": 2.6494410037994385 + }, + { + "auxiliary_loss_clip": 0.01470808, + "auxiliary_loss_mlp": 0.004187, + "balance_loss_clip": 1.20573604, + "balance_loss_mlp": 0.38091075, + "epoch": 0.40204419059071095, + "flos": 24972496024320.0, + "grad_norm": 6.422060180195437, + "language_loss": 0.78713012, + "learning_rate": 2.7153964776662504e-06, + "loss": 0.80602521, + "num_input_tokens_seen": 143664515, + "router_z_loss_clip": 2.65234375, + "router_z_loss_mlp": 0.37792969, + "step": 6687, + "time_per_iteration": 4.083427429199219 + }, + { + "auxiliary_loss_clip": 0.01491521, + "auxiliary_loss_mlp": 0.00370042, + "balance_loss_clip": 1.22193193, + "balance_loss_mlp": 0.33780771, + "epoch": 0.4021043138433789, + "flos": 23477463123840.0, + "grad_norm": 10.961693552894461, + "language_loss": 0.78154343, + "learning_rate": 2.7150327702872385e-06, + "loss": 0.8001591, + "num_input_tokens_seen": 143683135, + "router_z_loss_clip": 2.69921875, + "router_z_loss_mlp": 0.32250977, + "step": 6688, + "time_per_iteration": 2.6573517322540283 + }, + { + "auxiliary_loss_clip": 0.01489211, + "auxiliary_loss_mlp": 0.00404849, + "balance_loss_clip": 1.21692395, + "balance_loss_mlp": 0.37006402, + "epoch": 0.4021644370960469, + "flos": 25995806588160.0, + "grad_norm": 76.27810719981487, + "language_loss": 0.71521473, + "learning_rate": 2.7146690357941112e-06, + "loss": 0.7341553, + "num_input_tokens_seen": 143703985, + "router_z_loss_clip": 2.72265625, + "router_z_loss_mlp": 0.34814453, + "step": 6689, + "time_per_iteration": 4.141737937927246 + }, + { + "auxiliary_loss_clip": 0.01471368, + "auxiliary_loss_mlp": 0.00386521, + "balance_loss_clip": 1.20389831, + "balance_loss_mlp": 0.35276085, + "epoch": 0.40222456034871484, + "flos": 13587987922560.0, + "grad_norm": 2.4827325937360496, + "language_loss": 0.8363809, + "learning_rate": 2.7143052742006632e-06, + "loss": 0.85495973, + "num_input_tokens_seen": 143719245, + "router_z_loss_clip": 2.67382812, + "router_z_loss_mlp": 0.33764648, + "step": 6690, + "time_per_iteration": 2.619150400161743 + }, + { + "auxiliary_loss_clip": 0.0146848, + "auxiliary_loss_mlp": 0.00390267, + "balance_loss_clip": 1.20108652, + "balance_loss_mlp": 0.35786611, + "epoch": 0.4022846836013828, + "flos": 24278019494400.0, + "grad_norm": 5.828977487423062, + "language_loss": 0.79029441, + "learning_rate": 2.7139414855206872e-06, + "loss": 0.80888194, + "num_input_tokens_seen": 143739575, + "router_z_loss_clip": 2.67578125, + "router_z_loss_mlp": 0.32373047, + "step": 6691, + "time_per_iteration": 2.896336078643799 + }, + { + "auxiliary_loss_clip": 0.01493335, + "auxiliary_loss_mlp": 0.00425754, + "balance_loss_clip": 1.22038972, + "balance_loss_mlp": 0.38908547, + "epoch": 0.40234480685405083, + "flos": 20151596050560.0, + "grad_norm": 342.3760096461946, + "language_loss": 0.79363972, + "learning_rate": 2.7135776697679785e-06, + "loss": 0.81283063, + "num_input_tokens_seen": 143758515, + "router_z_loss_clip": 2.73046875, + "router_z_loss_mlp": 0.36694336, + "step": 6692, + "time_per_iteration": 4.122026205062866 + }, + { + "auxiliary_loss_clip": 0.01484326, + "auxiliary_loss_mlp": 0.00373456, + "balance_loss_clip": 1.21582806, + "balance_loss_mlp": 0.3408401, + "epoch": 0.4024049301067188, + "flos": 22930220442240.0, + "grad_norm": 9.009617884570192, + "language_loss": 0.89231551, + "learning_rate": 2.7132138269563333e-06, + "loss": 0.91089332, + "num_input_tokens_seen": 143776770, + "router_z_loss_clip": 2.6875, + "router_z_loss_mlp": 0.32617188, + "step": 6693, + "time_per_iteration": 2.7413792610168457 + }, + { + "auxiliary_loss_clip": 0.01463214, + "auxiliary_loss_mlp": 0.00394513, + "balance_loss_clip": 1.19876027, + "balance_loss_mlp": 0.36044317, + "epoch": 0.40246505335938676, + "flos": 36028421487360.0, + "grad_norm": 6.9012941077829, + "language_loss": 0.76762056, + "learning_rate": 2.7128499570995483e-06, + "loss": 0.7861979, + "num_input_tokens_seen": 143798450, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.34082031, + "step": 6694, + "time_per_iteration": 2.800140380859375 + }, + { + "auxiliary_loss_clip": 0.01466753, + "auxiliary_loss_mlp": 0.00422609, + "balance_loss_clip": 1.19825959, + "balance_loss_mlp": 0.38455772, + "epoch": 0.4025251766120547, + "flos": 20594303176320.0, + "grad_norm": 125.93784531855822, + "language_loss": 0.74607491, + "learning_rate": 2.7124860602114212e-06, + "loss": 0.76496851, + "num_input_tokens_seen": 143816995, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.38061523, + "step": 6695, + "time_per_iteration": 2.7324469089508057 + }, + { + "auxiliary_loss_clip": 0.0149442, + "auxiliary_loss_mlp": 0.00389331, + "balance_loss_clip": 1.22594237, + "balance_loss_mlp": 0.35521311, + "epoch": 0.4025852998647227, + "flos": 64523932381440.0, + "grad_norm": 21.377019976099415, + "language_loss": 0.84527493, + "learning_rate": 2.7121221363057515e-06, + "loss": 0.86411238, + "num_input_tokens_seen": 143842090, + "router_z_loss_clip": 2.68554688, + "router_z_loss_mlp": 0.34106445, + "step": 6696, + "time_per_iteration": 3.1059353351593018 + }, + { + "auxiliary_loss_clip": 0.01508314, + "auxiliary_loss_mlp": 0.00389015, + "balance_loss_clip": 1.23620629, + "balance_loss_mlp": 0.35454008, + "epoch": 0.40264542311739066, + "flos": 20886292834560.0, + "grad_norm": 9.678502759484006, + "language_loss": 0.78572112, + "learning_rate": 2.7117581853963393e-06, + "loss": 0.80469441, + "num_input_tokens_seen": 143860800, + "router_z_loss_clip": 2.72070312, + "router_z_loss_mlp": 0.34472656, + "step": 6697, + "time_per_iteration": 2.677260398864746 + }, + { + "auxiliary_loss_clip": 0.01491943, + "auxiliary_loss_mlp": 0.00380919, + "balance_loss_clip": 1.22416735, + "balance_loss_mlp": 0.34898257, + "epoch": 0.4027055463700586, + "flos": 26250197685120.0, + "grad_norm": 52.99144119650553, + "language_loss": 0.69154251, + "learning_rate": 2.711394207496984e-06, + "loss": 0.71027118, + "num_input_tokens_seen": 143878950, + "router_z_loss_clip": 2.67773438, + "router_z_loss_mlp": 0.31933594, + "step": 6698, + "time_per_iteration": 4.08188533782959 + }, + { + "auxiliary_loss_clip": 0.01473172, + "auxiliary_loss_mlp": 0.00372046, + "balance_loss_clip": 1.2021749, + "balance_loss_mlp": 0.33747542, + "epoch": 0.4027656696227266, + "flos": 20631398947200.0, + "grad_norm": 47.030044969251925, + "language_loss": 0.83571857, + "learning_rate": 2.711030202621491e-06, + "loss": 0.8541708, + "num_input_tokens_seen": 143898385, + "router_z_loss_clip": 2.7109375, + "router_z_loss_mlp": 0.34594727, + "step": 6699, + "time_per_iteration": 2.689856767654419 + }, + { + "auxiliary_loss_clip": 0.01470235, + "auxiliary_loss_mlp": 0.00375643, + "balance_loss_clip": 1.20642424, + "balance_loss_mlp": 0.34276491, + "epoch": 0.40282579287539455, + "flos": 22346277039360.0, + "grad_norm": 4.6228051963933865, + "language_loss": 0.86069524, + "learning_rate": 2.7106661707836605e-06, + "loss": 0.87915409, + "num_input_tokens_seen": 143918795, + "router_z_loss_clip": 2.63671875, + "router_z_loss_mlp": 0.32885742, + "step": 6700, + "time_per_iteration": 2.695662260055542 + }, + { + "auxiliary_loss_clip": 0.01494606, + "auxiliary_loss_mlp": 0.00397701, + "balance_loss_clip": 1.22229147, + "balance_loss_mlp": 0.36236775, + "epoch": 0.4028859161280625, + "flos": 29274988959360.0, + "grad_norm": 8.326410180345835, + "language_loss": 0.8067289, + "learning_rate": 2.7103021119972977e-06, + "loss": 0.825652, + "num_input_tokens_seen": 143938245, + "router_z_loss_clip": 2.7265625, + "router_z_loss_mlp": 0.35302734, + "step": 6701, + "time_per_iteration": 2.769742488861084 + }, + { + "auxiliary_loss_clip": 0.01480766, + "auxiliary_loss_mlp": 0.0039235, + "balance_loss_clip": 1.21656156, + "balance_loss_mlp": 0.35813719, + "epoch": 0.4029460393807305, + "flos": 28622312881920.0, + "grad_norm": 28.770585010787787, + "language_loss": 0.72181368, + "learning_rate": 2.709938026276208e-06, + "loss": 0.7405448, + "num_input_tokens_seen": 143960995, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.34179688, + "step": 6702, + "time_per_iteration": 2.719691038131714 + }, + { + "auxiliary_loss_clip": 0.01474534, + "auxiliary_loss_mlp": 0.00419183, + "balance_loss_clip": 1.20574117, + "balance_loss_mlp": 0.38187078, + "epoch": 0.40300616263339845, + "flos": 22601925112320.0, + "grad_norm": 5.35777577246246, + "language_loss": 0.73481929, + "learning_rate": 2.7095739136341964e-06, + "loss": 0.7537564, + "num_input_tokens_seen": 143979910, + "router_z_loss_clip": 2.6875, + "router_z_loss_mlp": 0.37329102, + "step": 6703, + "time_per_iteration": 2.6427388191223145 + }, + { + "auxiliary_loss_clip": 0.01481572, + "auxiliary_loss_mlp": 0.00380648, + "balance_loss_clip": 1.21657455, + "balance_loss_mlp": 0.34648287, + "epoch": 0.4030662858860664, + "flos": 25520313323520.0, + "grad_norm": 3.6364737316512596, + "language_loss": 0.89118028, + "learning_rate": 2.709209774085071e-06, + "loss": 0.90980244, + "num_input_tokens_seen": 144000095, + "router_z_loss_clip": 2.6484375, + "router_z_loss_mlp": 0.34179688, + "step": 6704, + "time_per_iteration": 2.6714224815368652 + }, + { + "auxiliary_loss_clip": 0.01484946, + "auxiliary_loss_mlp": 0.00393896, + "balance_loss_clip": 1.21625876, + "balance_loss_mlp": 0.35806212, + "epoch": 0.40312640913873443, + "flos": 23586703361280.0, + "grad_norm": 38.40954931359415, + "language_loss": 0.79901922, + "learning_rate": 2.7088456076426407e-06, + "loss": 0.81780767, + "num_input_tokens_seen": 144019695, + "router_z_loss_clip": 2.6875, + "router_z_loss_mlp": 0.3581543, + "step": 6705, + "time_per_iteration": 2.6568474769592285 + }, + { + "auxiliary_loss_clip": 0.01474063, + "auxiliary_loss_mlp": 0.00351898, + "balance_loss_clip": 1.21252418, + "balance_loss_mlp": 0.32296601, + "epoch": 0.4031865323914024, + "flos": 20011042131840.0, + "grad_norm": 8.282837421762546, + "language_loss": 0.74519861, + "learning_rate": 2.708481414320713e-06, + "loss": 0.76345813, + "num_input_tokens_seen": 144038525, + "router_z_loss_clip": 2.61523438, + "router_z_loss_mlp": 0.28942871, + "step": 6706, + "time_per_iteration": 2.6590330600738525 + }, + { + "auxiliary_loss_clip": 0.01472385, + "auxiliary_loss_mlp": 0.0034447, + "balance_loss_clip": 1.20882738, + "balance_loss_mlp": 0.3121163, + "epoch": 0.40324665564407036, + "flos": 21871430219520.0, + "grad_norm": 12.727149188840825, + "language_loss": 0.77210832, + "learning_rate": 2.7081171941330992e-06, + "loss": 0.79027689, + "num_input_tokens_seen": 144059485, + "router_z_loss_clip": 2.63476562, + "router_z_loss_mlp": 0.32324219, + "step": 6707, + "time_per_iteration": 2.693007230758667 + }, + { + "auxiliary_loss_clip": 0.01462576, + "auxiliary_loss_mlp": 0.00380152, + "balance_loss_clip": 1.20304823, + "balance_loss_mlp": 0.34567702, + "epoch": 0.4033067788967383, + "flos": 23878728933120.0, + "grad_norm": 5.838110902743076, + "language_loss": 0.84703636, + "learning_rate": 2.707752947093611e-06, + "loss": 0.86546361, + "num_input_tokens_seen": 144080265, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.34472656, + "step": 6708, + "time_per_iteration": 2.777224063873291 + }, + { + "auxiliary_loss_clip": 0.01472782, + "auxiliary_loss_mlp": 0.00377603, + "balance_loss_clip": 1.20186353, + "balance_loss_mlp": 0.34377131, + "epoch": 0.4033669021494063, + "flos": 17419907756160.0, + "grad_norm": 14.545672426968789, + "language_loss": 0.91280609, + "learning_rate": 2.70738867321606e-06, + "loss": 0.93130994, + "num_input_tokens_seen": 144098040, + "router_z_loss_clip": 2.70898438, + "router_z_loss_mlp": 0.33813477, + "step": 6709, + "time_per_iteration": 2.6831109523773193 + }, + { + "auxiliary_loss_clip": 0.01477374, + "auxiliary_loss_mlp": 0.00375543, + "balance_loss_clip": 1.20828676, + "balance_loss_mlp": 0.33808753, + "epoch": 0.40342702540207426, + "flos": 29600554855680.0, + "grad_norm": 7.400719998605195, + "language_loss": 0.76727676, + "learning_rate": 2.70702437251426e-06, + "loss": 0.78580594, + "num_input_tokens_seen": 144118265, + "router_z_loss_clip": 2.6875, + "router_z_loss_mlp": 0.37451172, + "step": 6710, + "time_per_iteration": 2.707368850708008 + }, + { + "auxiliary_loss_clip": 0.01458482, + "auxiliary_loss_mlp": 0.00369178, + "balance_loss_clip": 1.19946265, + "balance_loss_mlp": 0.3368485, + "epoch": 0.4034871486547422, + "flos": 11284605400320.0, + "grad_norm": 96.61545440226658, + "language_loss": 0.91979402, + "learning_rate": 2.7066600450020236e-06, + "loss": 0.9380706, + "num_input_tokens_seen": 144133865, + "router_z_loss_clip": 2.58984375, + "router_z_loss_mlp": 0.32324219, + "step": 6711, + "time_per_iteration": 2.731245517730713 + }, + { + "auxiliary_loss_clip": 0.01466584, + "auxiliary_loss_mlp": 0.00388444, + "balance_loss_clip": 1.20214367, + "balance_loss_mlp": 0.35520822, + "epoch": 0.4035472719074102, + "flos": 15552839738880.0, + "grad_norm": 8.704247306731228, + "language_loss": 0.8663286, + "learning_rate": 2.706295690693168e-06, + "loss": 0.88487893, + "num_input_tokens_seen": 144150125, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.33251953, + "step": 6712, + "time_per_iteration": 2.667625904083252 + }, + { + "auxiliary_loss_clip": 0.01464676, + "auxiliary_loss_mlp": 0.00344806, + "balance_loss_clip": 1.20195246, + "balance_loss_mlp": 0.3146221, + "epoch": 0.40360739516007815, + "flos": 24674365140480.0, + "grad_norm": 8.975785919379025, + "language_loss": 0.86529541, + "learning_rate": 2.7059313096015096e-06, + "loss": 0.88339019, + "num_input_tokens_seen": 144169295, + "router_z_loss_clip": 2.62890625, + "router_z_loss_mlp": 0.30151367, + "step": 6713, + "time_per_iteration": 2.7855679988861084 + }, + { + "auxiliary_loss_clip": 0.01456064, + "auxiliary_loss_mlp": 0.00360148, + "balance_loss_clip": 1.19131255, + "balance_loss_mlp": 0.32821208, + "epoch": 0.4036675184127461, + "flos": 17304095329920.0, + "grad_norm": 3.8201444208081656, + "language_loss": 0.93352389, + "learning_rate": 2.705566901740865e-06, + "loss": 0.95168591, + "num_input_tokens_seen": 144185790, + "router_z_loss_clip": 2.6484375, + "router_z_loss_mlp": 0.31970215, + "step": 6714, + "time_per_iteration": 2.6568803787231445 + }, + { + "auxiliary_loss_clip": 0.0146434, + "auxiliary_loss_mlp": 0.00343136, + "balance_loss_clip": 1.20074773, + "balance_loss_mlp": 0.31197473, + "epoch": 0.4037276416654141, + "flos": 19864023765120.0, + "grad_norm": 7.459855224006649, + "language_loss": 0.75255251, + "learning_rate": 2.7052024671250527e-06, + "loss": 0.77062726, + "num_input_tokens_seen": 144205190, + "router_z_loss_clip": 2.6328125, + "router_z_loss_mlp": 0.31152344, + "step": 6715, + "time_per_iteration": 2.6723573207855225 + }, + { + "auxiliary_loss_clip": 0.01460441, + "auxiliary_loss_mlp": 0.00376124, + "balance_loss_clip": 1.1933161, + "balance_loss_mlp": 0.34214956, + "epoch": 0.40378776491808205, + "flos": 18296271780480.0, + "grad_norm": 27.603476457479417, + "language_loss": 0.8615436, + "learning_rate": 2.704838005767892e-06, + "loss": 0.87990922, + "num_input_tokens_seen": 144222705, + "router_z_loss_clip": 2.67578125, + "router_z_loss_mlp": 0.33959961, + "step": 6716, + "time_per_iteration": 2.6693716049194336 + }, + { + "auxiliary_loss_clip": 0.01451289, + "auxiliary_loss_mlp": 0.00348216, + "balance_loss_clip": 1.19065261, + "balance_loss_mlp": 0.31835425, + "epoch": 0.40384788817075, + "flos": 15049372757760.0, + "grad_norm": 11.639201971165813, + "language_loss": 0.84078652, + "learning_rate": 2.7044735176832037e-06, + "loss": 0.85878158, + "num_input_tokens_seen": 144239545, + "router_z_loss_clip": 2.609375, + "router_z_loss_mlp": 0.29846191, + "step": 6717, + "time_per_iteration": 2.644970178604126 + }, + { + "auxiliary_loss_clip": 0.01419245, + "auxiliary_loss_mlp": 0.00171111, + "balance_loss_clip": 1.20799589, + "balance_loss_mlp": 0.16038249, + "epoch": 0.40390801142341803, + "flos": 61929927895680.0, + "grad_norm": 0.9297737048799892, + "language_loss": 0.60116249, + "learning_rate": 2.7041090028848084e-06, + "loss": 0.61706609, + "num_input_tokens_seen": 144288145, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.10742188, + "step": 6718, + "time_per_iteration": 3.066592216491699 + }, + { + "auxiliary_loss_clip": 0.01451009, + "auxiliary_loss_mlp": 0.00346466, + "balance_loss_clip": 1.18766308, + "balance_loss_mlp": 0.31385064, + "epoch": 0.403968134676086, + "flos": 22738779930240.0, + "grad_norm": 29.649458709233436, + "language_loss": 0.83880913, + "learning_rate": 2.7037444613865306e-06, + "loss": 0.85678387, + "num_input_tokens_seen": 144302315, + "router_z_loss_clip": 2.63476562, + "router_z_loss_mlp": 0.32592773, + "step": 6719, + "time_per_iteration": 2.710273027420044 + }, + { + "auxiliary_loss_clip": 0.0144983, + "auxiliary_loss_mlp": 0.00336038, + "balance_loss_clip": 1.18562055, + "balance_loss_mlp": 0.30673641, + "epoch": 0.40402825792875396, + "flos": 19784409269760.0, + "grad_norm": 41.16388494930088, + "language_loss": 0.88199651, + "learning_rate": 2.7033798932021906e-06, + "loss": 0.89985514, + "num_input_tokens_seen": 144318990, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.29296875, + "step": 6720, + "time_per_iteration": 2.693999767303467 + }, + { + "auxiliary_loss_clip": 0.01460185, + "auxiliary_loss_mlp": 0.00316102, + "balance_loss_clip": 1.19891953, + "balance_loss_mlp": 0.28625238, + "epoch": 0.40408838118142193, + "flos": 19609273532160.0, + "grad_norm": 7.234617141246806, + "language_loss": 0.83039612, + "learning_rate": 2.7030152983456153e-06, + "loss": 0.84815896, + "num_input_tokens_seen": 144335765, + "router_z_loss_clip": 2.61328125, + "router_z_loss_mlp": 0.29907227, + "step": 6721, + "time_per_iteration": 2.7311298847198486 + }, + { + "auxiliary_loss_clip": 0.01448539, + "auxiliary_loss_mlp": 0.00342457, + "balance_loss_clip": 1.19039941, + "balance_loss_mlp": 0.31291723, + "epoch": 0.4041485044340899, + "flos": 24426043441920.0, + "grad_norm": 4.580903513682932, + "language_loss": 0.79784369, + "learning_rate": 2.7026506768306304e-06, + "loss": 0.81575364, + "num_input_tokens_seen": 144355825, + "router_z_loss_clip": 2.58203125, + "router_z_loss_mlp": 0.29516602, + "step": 6722, + "time_per_iteration": 2.754469633102417 + }, + { + "auxiliary_loss_clip": 0.01460391, + "auxiliary_loss_mlp": 0.00351336, + "balance_loss_clip": 1.19945359, + "balance_loss_mlp": 0.32259476, + "epoch": 0.40420862768675786, + "flos": 16760192613120.0, + "grad_norm": 13.966833719856666, + "language_loss": 0.7400192, + "learning_rate": 2.7022860286710602e-06, + "loss": 0.75813651, + "num_input_tokens_seen": 144374320, + "router_z_loss_clip": 2.61132812, + "router_z_loss_mlp": 0.28723145, + "step": 6723, + "time_per_iteration": 2.6448826789855957 + }, + { + "auxiliary_loss_clip": 0.01477293, + "auxiliary_loss_mlp": 0.00375498, + "balance_loss_clip": 1.21032298, + "balance_loss_mlp": 0.3449443, + "epoch": 0.4042687509394258, + "flos": 22491571553280.0, + "grad_norm": 3.5127932657150884, + "language_loss": 0.79296339, + "learning_rate": 2.701921353880734e-06, + "loss": 0.81149137, + "num_input_tokens_seen": 144394325, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.30554199, + "step": 6724, + "time_per_iteration": 2.729971408843994 + }, + { + "auxiliary_loss_clip": 0.0146901, + "auxiliary_loss_mlp": 0.00296254, + "balance_loss_clip": 1.21154344, + "balance_loss_mlp": 0.27040988, + "epoch": 0.4043288741920938, + "flos": 30336149479680.0, + "grad_norm": 117.31355339402123, + "language_loss": 0.80633658, + "learning_rate": 2.7015566524734787e-06, + "loss": 0.82398927, + "num_input_tokens_seen": 144412765, + "router_z_loss_clip": 2.57421875, + "router_z_loss_mlp": 0.25854492, + "step": 6725, + "time_per_iteration": 2.693812370300293 + }, + { + "auxiliary_loss_clip": 0.01478348, + "auxiliary_loss_mlp": 0.0031719, + "balance_loss_clip": 1.21773958, + "balance_loss_mlp": 0.28619576, + "epoch": 0.40438899744476176, + "flos": 46348321363200.0, + "grad_norm": 56.58077520271252, + "language_loss": 0.84094298, + "learning_rate": 2.701191924463126e-06, + "loss": 0.8588984, + "num_input_tokens_seen": 144435400, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.31030273, + "step": 6726, + "time_per_iteration": 2.8658411502838135 + }, + { + "auxiliary_loss_clip": 0.01476604, + "auxiliary_loss_mlp": 0.003292, + "balance_loss_clip": 1.21178198, + "balance_loss_mlp": 0.29965967, + "epoch": 0.4044491206974297, + "flos": 13333524998400.0, + "grad_norm": 11.340058153241813, + "language_loss": 0.90064538, + "learning_rate": 2.7008271698635054e-06, + "loss": 0.91870338, + "num_input_tokens_seen": 144452925, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.29553223, + "step": 6727, + "time_per_iteration": 2.633463144302368 + }, + { + "auxiliary_loss_clip": 0.01458095, + "auxiliary_loss_mlp": 0.00390146, + "balance_loss_clip": 1.19538248, + "balance_loss_mlp": 0.3551459, + "epoch": 0.4045092439500977, + "flos": 12093745121280.0, + "grad_norm": 3.93880585749848, + "language_loss": 0.93249273, + "learning_rate": 2.700462388688447e-06, + "loss": 0.95097506, + "num_input_tokens_seen": 144470195, + "router_z_loss_clip": 2.62695312, + "router_z_loss_mlp": 0.34960938, + "step": 6728, + "time_per_iteration": 2.641984224319458 + }, + { + "auxiliary_loss_clip": 0.01499655, + "auxiliary_loss_mlp": 0.0029627, + "balance_loss_clip": 1.2341888, + "balance_loss_mlp": 0.2668018, + "epoch": 0.40456936720276565, + "flos": 21179683123200.0, + "grad_norm": 31.614348256228457, + "language_loss": 0.89779794, + "learning_rate": 2.700097580951786e-06, + "loss": 0.91575724, + "num_input_tokens_seen": 144490320, + "router_z_loss_clip": 2.65625, + "router_z_loss_mlp": 0.29455566, + "step": 6729, + "time_per_iteration": 4.080347061157227 + }, + { + "auxiliary_loss_clip": 0.01496838, + "auxiliary_loss_mlp": 0.00344103, + "balance_loss_clip": 1.22956908, + "balance_loss_mlp": 0.31412208, + "epoch": 0.4046294904554336, + "flos": 23915286000000.0, + "grad_norm": 12.146428529095132, + "language_loss": 0.80208987, + "learning_rate": 2.6997327466673533e-06, + "loss": 0.8204993, + "num_input_tokens_seen": 144508990, + "router_z_loss_clip": 2.67382812, + "router_z_loss_mlp": 0.29943848, + "step": 6730, + "time_per_iteration": 2.685918092727661 + }, + { + "auxiliary_loss_clip": 0.01495317, + "auxiliary_loss_mlp": 0.00363963, + "balance_loss_clip": 1.22912633, + "balance_loss_mlp": 0.33249205, + "epoch": 0.4046896137081016, + "flos": 38071235773440.0, + "grad_norm": 7.860589373445715, + "language_loss": 0.74498904, + "learning_rate": 2.699367885848985e-06, + "loss": 0.76358187, + "num_input_tokens_seen": 144529550, + "router_z_loss_clip": 2.66210938, + "router_z_loss_mlp": 0.31469727, + "step": 6731, + "time_per_iteration": 4.246072053909302 + }, + { + "auxiliary_loss_clip": 0.01514146, + "auxiliary_loss_mlp": 0.00319061, + "balance_loss_clip": 1.24335885, + "balance_loss_mlp": 0.29184508, + "epoch": 0.4047497369607696, + "flos": 23617262856960.0, + "grad_norm": 132.27790671849317, + "language_loss": 0.8154794, + "learning_rate": 2.699002998510517e-06, + "loss": 0.83381146, + "num_input_tokens_seen": 144549310, + "router_z_loss_clip": 2.7109375, + "router_z_loss_mlp": 0.27172852, + "step": 6732, + "time_per_iteration": 2.754749059677124 + }, + { + "auxiliary_loss_clip": 0.01493763, + "auxiliary_loss_mlp": 0.00302768, + "balance_loss_clip": 1.22658563, + "balance_loss_mlp": 0.27380013, + "epoch": 0.40480986021343757, + "flos": 12823593569280.0, + "grad_norm": 24.878703102153786, + "language_loss": 0.8319633, + "learning_rate": 2.6986380846657852e-06, + "loss": 0.84992862, + "num_input_tokens_seen": 144567430, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.28955078, + "step": 6733, + "time_per_iteration": 2.7015082836151123 + }, + { + "auxiliary_loss_clip": 0.01495914, + "auxiliary_loss_mlp": 0.00322802, + "balance_loss_clip": 1.2248292, + "balance_loss_mlp": 0.29166433, + "epoch": 0.40486998346610553, + "flos": 23768770423680.0, + "grad_norm": 9.205305763089699, + "language_loss": 0.8338747, + "learning_rate": 2.698273144328627e-06, + "loss": 0.85206187, + "num_input_tokens_seen": 144585975, + "router_z_loss_clip": 2.71289062, + "router_z_loss_mlp": 0.31176758, + "step": 6734, + "time_per_iteration": 4.189352035522461 + }, + { + "auxiliary_loss_clip": 0.01495226, + "auxiliary_loss_mlp": 0.00333896, + "balance_loss_clip": 1.22307789, + "balance_loss_mlp": 0.30392689, + "epoch": 0.4049301067187735, + "flos": 22856818999680.0, + "grad_norm": 99.5913024898432, + "language_loss": 0.72657657, + "learning_rate": 2.6979081775128805e-06, + "loss": 0.74486768, + "num_input_tokens_seen": 144605225, + "router_z_loss_clip": 2.71875, + "router_z_loss_mlp": 0.29980469, + "step": 6735, + "time_per_iteration": 2.6793932914733887 + }, + { + "auxiliary_loss_clip": 0.01509127, + "auxiliary_loss_mlp": 0.0033703, + "balance_loss_clip": 1.23716354, + "balance_loss_mlp": 0.30541611, + "epoch": 0.40499022997144146, + "flos": 22783992174720.0, + "grad_norm": 21.073024092642754, + "language_loss": 0.90719157, + "learning_rate": 2.697543184232387e-06, + "loss": 0.92565316, + "num_input_tokens_seen": 144624145, + "router_z_loss_clip": 2.71484375, + "router_z_loss_mlp": 0.31640625, + "step": 6736, + "time_per_iteration": 2.7064034938812256 + }, + { + "auxiliary_loss_clip": 0.01536711, + "auxiliary_loss_mlp": 0.00294531, + "balance_loss_clip": 1.25911009, + "balance_loss_mlp": 0.26544404, + "epoch": 0.4050503532241094, + "flos": 23039352938880.0, + "grad_norm": 5.531064693265709, + "language_loss": 0.82502425, + "learning_rate": 2.6971781645009863e-06, + "loss": 0.8433367, + "num_input_tokens_seen": 144644470, + "router_z_loss_clip": 2.78320312, + "router_z_loss_mlp": 0.29125977, + "step": 6737, + "time_per_iteration": 2.6819875240325928 + }, + { + "auxiliary_loss_clip": 0.01524797, + "auxiliary_loss_mlp": 0.00310127, + "balance_loss_clip": 1.24852502, + "balance_loss_mlp": 0.28012195, + "epoch": 0.4051104764767774, + "flos": 16647756065280.0, + "grad_norm": 4.13578699849256, + "language_loss": 0.79232538, + "learning_rate": 2.696813118332519e-06, + "loss": 0.81067455, + "num_input_tokens_seen": 144661055, + "router_z_loss_clip": 2.76171875, + "router_z_loss_mlp": 0.29992676, + "step": 6738, + "time_per_iteration": 2.6623728275299072 + }, + { + "auxiliary_loss_clip": 0.01530194, + "auxiliary_loss_mlp": 0.00306928, + "balance_loss_clip": 1.25512505, + "balance_loss_mlp": 0.27935538, + "epoch": 0.40517059972944536, + "flos": 16358962717440.0, + "grad_norm": 3.7303338138509403, + "language_loss": 0.80624533, + "learning_rate": 2.696448045740828e-06, + "loss": 0.82461655, + "num_input_tokens_seen": 144677935, + "router_z_loss_clip": 2.74804688, + "router_z_loss_mlp": 0.27539062, + "step": 6739, + "time_per_iteration": 2.6798486709594727 + }, + { + "auxiliary_loss_clip": 0.01538907, + "auxiliary_loss_mlp": 0.00313718, + "balance_loss_clip": 1.25755537, + "balance_loss_mlp": 0.28175816, + "epoch": 0.4052307229821133, + "flos": 28803374363520.0, + "grad_norm": 112.64144795195084, + "language_loss": 0.81721765, + "learning_rate": 2.6960829467397576e-06, + "loss": 0.8357439, + "num_input_tokens_seen": 144697725, + "router_z_loss_clip": 2.81054688, + "router_z_loss_mlp": 0.32006836, + "step": 6740, + "time_per_iteration": 4.149216413497925 + }, + { + "auxiliary_loss_clip": 0.01544728, + "auxiliary_loss_mlp": 0.00338092, + "balance_loss_clip": 1.26436162, + "balance_loss_mlp": 0.30690706, + "epoch": 0.4052908462347813, + "flos": 21397876289280.0, + "grad_norm": 2.7489668566366943, + "language_loss": 0.81692386, + "learning_rate": 2.695717821343153e-06, + "loss": 0.83575201, + "num_input_tokens_seen": 144718805, + "router_z_loss_clip": 2.80078125, + "router_z_loss_mlp": 0.31164551, + "step": 6741, + "time_per_iteration": 2.705186605453491 + }, + { + "auxiliary_loss_clip": 0.01554451, + "auxiliary_loss_mlp": 0.00320389, + "balance_loss_clip": 1.2710309, + "balance_loss_mlp": 0.28901282, + "epoch": 0.40535096948744925, + "flos": 22419067950720.0, + "grad_norm": 12.592968668982827, + "language_loss": 0.79064417, + "learning_rate": 2.6953526695648577e-06, + "loss": 0.80939257, + "num_input_tokens_seen": 144737105, + "router_z_loss_clip": 2.83789062, + "router_z_loss_mlp": 0.31396484, + "step": 6742, + "time_per_iteration": 2.675943374633789 + }, + { + "auxiliary_loss_clip": 0.015526, + "auxiliary_loss_mlp": 0.00337729, + "balance_loss_clip": 1.26510966, + "balance_loss_mlp": 0.30659151, + "epoch": 0.4054110927401172, + "flos": 17010776868480.0, + "grad_norm": 145.01146389896232, + "language_loss": 0.82547414, + "learning_rate": 2.6949874914187202e-06, + "loss": 0.84437752, + "num_input_tokens_seen": 144751350, + "router_z_loss_clip": 2.87695312, + "router_z_loss_mlp": 0.31152344, + "step": 6743, + "time_per_iteration": 2.6258978843688965 + }, + { + "auxiliary_loss_clip": 0.01561552, + "auxiliary_loss_mlp": 0.00327606, + "balance_loss_clip": 1.27303481, + "balance_loss_mlp": 0.29675463, + "epoch": 0.4054712159927852, + "flos": 21614848392960.0, + "grad_norm": 446.2569263109992, + "language_loss": 0.80140448, + "learning_rate": 2.694622286918588e-06, + "loss": 0.82029605, + "num_input_tokens_seen": 144770030, + "router_z_loss_clip": 2.88671875, + "router_z_loss_mlp": 0.30834961, + "step": 6744, + "time_per_iteration": 2.686690330505371 + }, + { + "auxiliary_loss_clip": 0.01558444, + "auxiliary_loss_mlp": 0.00320028, + "balance_loss_clip": 1.27364433, + "balance_loss_mlp": 0.2897487, + "epoch": 0.4055313392454532, + "flos": 25812554376960.0, + "grad_norm": 5.332799232174399, + "language_loss": 0.87102199, + "learning_rate": 2.6942570560783076e-06, + "loss": 0.88980675, + "num_input_tokens_seen": 144790965, + "router_z_loss_clip": 2.84765625, + "router_z_loss_mlp": 0.30273438, + "step": 6745, + "time_per_iteration": 2.703606605529785 + }, + { + "auxiliary_loss_clip": 0.01565119, + "auxiliary_loss_mlp": 0.00285489, + "balance_loss_clip": 1.2811029, + "balance_loss_mlp": 0.25513861, + "epoch": 0.40559146249812117, + "flos": 14137098111360.0, + "grad_norm": 7.976023589863802, + "language_loss": 0.73456419, + "learning_rate": 2.693891798911731e-06, + "loss": 0.75307029, + "num_input_tokens_seen": 144807755, + "router_z_loss_clip": 2.83984375, + "router_z_loss_mlp": 0.30371094, + "step": 6746, + "time_per_iteration": 2.633772373199463 + }, + { + "auxiliary_loss_clip": 0.01561539, + "auxiliary_loss_mlp": 0.00316552, + "balance_loss_clip": 1.27823925, + "balance_loss_mlp": 0.28571215, + "epoch": 0.40565158575078913, + "flos": 41355481962240.0, + "grad_norm": 1336.5692058061634, + "language_loss": 0.63964665, + "learning_rate": 2.6935265154327075e-06, + "loss": 0.6584276, + "num_input_tokens_seen": 144832405, + "router_z_loss_clip": 2.83203125, + "router_z_loss_mlp": 0.30871582, + "step": 6747, + "time_per_iteration": 2.827512502670288 + }, + { + "auxiliary_loss_clip": 0.01566432, + "auxiliary_loss_mlp": 0.00313086, + "balance_loss_clip": 1.28280294, + "balance_loss_mlp": 0.28364116, + "epoch": 0.4057117090034571, + "flos": 28544529980160.0, + "grad_norm": 4.3549221760317804, + "language_loss": 0.89947295, + "learning_rate": 2.693161205655089e-06, + "loss": 0.91826808, + "num_input_tokens_seen": 144853890, + "router_z_loss_clip": 2.83203125, + "router_z_loss_mlp": 0.29418945, + "step": 6748, + "time_per_iteration": 2.702265977859497 + }, + { + "auxiliary_loss_clip": 0.0157916, + "auxiliary_loss_mlp": 0.00286059, + "balance_loss_clip": 1.2920835, + "balance_loss_mlp": 0.25736529, + "epoch": 0.40577183225612506, + "flos": 18004066640640.0, + "grad_norm": 9.083302049614716, + "language_loss": 0.90592444, + "learning_rate": 2.6927958695927287e-06, + "loss": 0.92457664, + "num_input_tokens_seen": 144871395, + "router_z_loss_clip": 2.87304688, + "router_z_loss_mlp": 0.28674316, + "step": 6749, + "time_per_iteration": 2.622791051864624 + }, + { + "auxiliary_loss_clip": 0.0158749, + "auxiliary_loss_mlp": 0.00310343, + "balance_loss_clip": 1.30014312, + "balance_loss_mlp": 0.28248394, + "epoch": 0.40583195550879303, + "flos": 19536734016000.0, + "grad_norm": 5.374074943162313, + "language_loss": 0.83260369, + "learning_rate": 2.6924305072594784e-06, + "loss": 0.85158205, + "num_input_tokens_seen": 144890975, + "router_z_loss_clip": 2.86914062, + "router_z_loss_mlp": 0.27893066, + "step": 6750, + "time_per_iteration": 2.6600613594055176 + }, + { + "auxiliary_loss_clip": 0.01572796, + "auxiliary_loss_mlp": 0.00314119, + "balance_loss_clip": 1.28079939, + "balance_loss_mlp": 0.28355402, + "epoch": 0.405892078761461, + "flos": 22309468577280.0, + "grad_norm": 411.25369979242834, + "language_loss": 0.82539648, + "learning_rate": 2.692065118669195e-06, + "loss": 0.84426558, + "num_input_tokens_seen": 144908170, + "router_z_loss_clip": 2.91992188, + "router_z_loss_mlp": 0.30566406, + "step": 6751, + "time_per_iteration": 2.681941270828247 + }, + { + "auxiliary_loss_clip": 0.01575802, + "auxiliary_loss_mlp": 0.00318924, + "balance_loss_clip": 1.28785777, + "balance_loss_mlp": 0.28727439, + "epoch": 0.40595220201412896, + "flos": 25484402701440.0, + "grad_norm": 39.43246084256196, + "language_loss": 0.75842267, + "learning_rate": 2.6916997038357326e-06, + "loss": 0.77736992, + "num_input_tokens_seen": 144928020, + "router_z_loss_clip": 2.87890625, + "router_z_loss_mlp": 0.31640625, + "step": 6752, + "time_per_iteration": 2.7259552478790283 + }, + { + "auxiliary_loss_clip": 0.01594617, + "auxiliary_loss_mlp": 0.00300257, + "balance_loss_clip": 1.30441236, + "balance_loss_mlp": 0.27062112, + "epoch": 0.4060123252667969, + "flos": 49856004103680.0, + "grad_norm": 3.6476343793506274, + "language_loss": 0.77784359, + "learning_rate": 2.691334262772948e-06, + "loss": 0.79679239, + "num_input_tokens_seen": 144951240, + "router_z_loss_clip": 2.90429688, + "router_z_loss_mlp": 0.29638672, + "step": 6753, + "time_per_iteration": 2.9413504600524902 + }, + { + "auxiliary_loss_clip": 0.01578945, + "auxiliary_loss_mlp": 0.00305382, + "balance_loss_clip": 1.29578018, + "balance_loss_mlp": 0.27577028, + "epoch": 0.4060724485194649, + "flos": 21135476459520.0, + "grad_norm": 3.230957506678049, + "language_loss": 0.79457772, + "learning_rate": 2.690968795494699e-06, + "loss": 0.81342101, + "num_input_tokens_seen": 144969100, + "router_z_loss_clip": 2.83398438, + "router_z_loss_mlp": 0.29589844, + "step": 6754, + "time_per_iteration": 2.7523746490478516 + }, + { + "auxiliary_loss_clip": 0.01582788, + "auxiliary_loss_mlp": 0.00316547, + "balance_loss_clip": 1.29718697, + "balance_loss_mlp": 0.2857551, + "epoch": 0.40613257177213286, + "flos": 21758059918080.0, + "grad_norm": 38.10390500768634, + "language_loss": 0.88838184, + "learning_rate": 2.690603302014844e-06, + "loss": 0.90737522, + "num_input_tokens_seen": 144987065, + "router_z_loss_clip": 2.85546875, + "router_z_loss_mlp": 0.30786133, + "step": 6755, + "time_per_iteration": 2.9120874404907227 + }, + { + "auxiliary_loss_clip": 0.0155892, + "auxiliary_loss_mlp": 0.00303093, + "balance_loss_clip": 1.27711272, + "balance_loss_mlp": 0.27220559, + "epoch": 0.4061926950248008, + "flos": 25555074710400.0, + "grad_norm": 97.27585014819508, + "language_loss": 0.76871479, + "learning_rate": 2.6902377823472426e-06, + "loss": 0.78733492, + "num_input_tokens_seen": 145007310, + "router_z_loss_clip": 2.81445312, + "router_z_loss_mlp": 0.3092041, + "step": 6756, + "time_per_iteration": 2.7961766719818115 + }, + { + "auxiliary_loss_clip": 0.01582334, + "auxiliary_loss_mlp": 0.00294772, + "balance_loss_clip": 1.29831004, + "balance_loss_mlp": 0.26291886, + "epoch": 0.4062528182774688, + "flos": 23695799944320.0, + "grad_norm": 5.3198607251868735, + "language_loss": 0.85062915, + "learning_rate": 2.689872236505755e-06, + "loss": 0.8694002, + "num_input_tokens_seen": 145026210, + "router_z_loss_clip": 2.84179688, + "router_z_loss_mlp": 0.31860352, + "step": 6757, + "time_per_iteration": 2.7748465538024902 + }, + { + "auxiliary_loss_clip": 0.0158906, + "auxiliary_loss_mlp": 0.00289667, + "balance_loss_clip": 1.3052851, + "balance_loss_mlp": 0.26162881, + "epoch": 0.4063129415301368, + "flos": 21726027964800.0, + "grad_norm": 8.274125632832591, + "language_loss": 0.85944366, + "learning_rate": 2.6895066645042437e-06, + "loss": 0.87823099, + "num_input_tokens_seen": 145045475, + "router_z_loss_clip": 2.83398438, + "router_z_loss_mlp": 0.28039551, + "step": 6758, + "time_per_iteration": 2.723299980163574 + }, + { + "auxiliary_loss_clip": 0.01572013, + "auxiliary_loss_mlp": 0.00312442, + "balance_loss_clip": 1.29310298, + "balance_loss_mlp": 0.28265128, + "epoch": 0.40637306478280477, + "flos": 12787575206400.0, + "grad_norm": 9.214785681582102, + "language_loss": 0.97004449, + "learning_rate": 2.6891410663565703e-06, + "loss": 0.98888904, + "num_input_tokens_seen": 145062260, + "router_z_loss_clip": 2.7890625, + "router_z_loss_mlp": 0.2980957, + "step": 6759, + "time_per_iteration": 2.716917037963867 + }, + { + "auxiliary_loss_clip": 0.0154746, + "auxiliary_loss_mlp": 0.00313047, + "balance_loss_clip": 1.27094507, + "balance_loss_mlp": 0.28350663, + "epoch": 0.40643318803547274, + "flos": 24024490323840.0, + "grad_norm": 17.708391285908302, + "language_loss": 0.73640561, + "learning_rate": 2.688775442076598e-06, + "loss": 0.75501072, + "num_input_tokens_seen": 145082470, + "router_z_loss_clip": 2.76171875, + "router_z_loss_mlp": 0.29541016, + "step": 6760, + "time_per_iteration": 2.6811389923095703 + }, + { + "auxiliary_loss_clip": 0.01538625, + "auxiliary_loss_mlp": 0.00326836, + "balance_loss_clip": 1.26478958, + "balance_loss_mlp": 0.29544771, + "epoch": 0.4064933112881407, + "flos": 25592421876480.0, + "grad_norm": 2.241828305502943, + "language_loss": 0.80971187, + "learning_rate": 2.688409791678193e-06, + "loss": 0.82836652, + "num_input_tokens_seen": 145105685, + "router_z_loss_clip": 2.73828125, + "router_z_loss_mlp": 0.31384277, + "step": 6761, + "time_per_iteration": 2.755098342895508 + }, + { + "auxiliary_loss_clip": 0.01519084, + "auxiliary_loss_mlp": 0.00267972, + "balance_loss_clip": 1.25458157, + "balance_loss_mlp": 0.24129303, + "epoch": 0.40655343454080867, + "flos": 22054323294720.0, + "grad_norm": 37.12825306709907, + "language_loss": 0.75707704, + "learning_rate": 2.6880441151752185e-06, + "loss": 0.77494764, + "num_input_tokens_seen": 145125590, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.26660156, + "step": 6762, + "time_per_iteration": 2.669874429702759 + }, + { + "auxiliary_loss_clip": 0.01530642, + "auxiliary_loss_mlp": 0.00296014, + "balance_loss_clip": 1.26277995, + "balance_loss_mlp": 0.2670103, + "epoch": 0.40661355779347663, + "flos": 26468893641600.0, + "grad_norm": 1.9431259555165223, + "language_loss": 0.80180109, + "learning_rate": 2.6876784125815433e-06, + "loss": 0.82006764, + "num_input_tokens_seen": 145146810, + "router_z_loss_clip": 2.6796875, + "router_z_loss_mlp": 0.29016113, + "step": 6763, + "time_per_iteration": 2.7438178062438965 + }, + { + "auxiliary_loss_clip": 0.01520405, + "auxiliary_loss_mlp": 0.00331809, + "balance_loss_clip": 1.25567508, + "balance_loss_mlp": 0.30114844, + "epoch": 0.4066736810461446, + "flos": 13261129136640.0, + "grad_norm": 104.24700861893749, + "language_loss": 0.7419554, + "learning_rate": 2.687312683911033e-06, + "loss": 0.76047754, + "num_input_tokens_seen": 145163130, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.3067627, + "step": 6764, + "time_per_iteration": 2.705596685409546 + }, + { + "auxiliary_loss_clip": 0.01539347, + "auxiliary_loss_mlp": 0.00374118, + "balance_loss_clip": 1.2674458, + "balance_loss_mlp": 0.3381173, + "epoch": 0.40673380429881256, + "flos": 28803625758720.0, + "grad_norm": 40.884844692211296, + "language_loss": 0.97201705, + "learning_rate": 2.686946929177557e-06, + "loss": 0.99115169, + "num_input_tokens_seen": 145181420, + "router_z_loss_clip": 2.71875, + "router_z_loss_mlp": 0.36010742, + "step": 6765, + "time_per_iteration": 2.7028183937072754 + }, + { + "auxiliary_loss_clip": 0.01529678, + "auxiliary_loss_mlp": 0.00311909, + "balance_loss_clip": 1.26159656, + "balance_loss_mlp": 0.28120095, + "epoch": 0.4067939275514805, + "flos": 12495334152960.0, + "grad_norm": 7.563426840829584, + "language_loss": 0.9011035, + "learning_rate": 2.6865811483949855e-06, + "loss": 0.91951936, + "num_input_tokens_seen": 145198545, + "router_z_loss_clip": 2.68164062, + "router_z_loss_mlp": 0.3067627, + "step": 6766, + "time_per_iteration": 2.667046070098877 + }, + { + "auxiliary_loss_clip": 0.01507279, + "auxiliary_loss_mlp": 0.00297221, + "balance_loss_clip": 1.24745846, + "balance_loss_mlp": 0.26760927, + "epoch": 0.4068540508041485, + "flos": 18770508069120.0, + "grad_norm": 15.281632834157598, + "language_loss": 0.83554733, + "learning_rate": 2.6862153415771867e-06, + "loss": 0.8535924, + "num_input_tokens_seen": 145215835, + "router_z_loss_clip": 2.59570312, + "router_z_loss_mlp": 0.29614258, + "step": 6767, + "time_per_iteration": 2.6536810398101807 + }, + { + "auxiliary_loss_clip": 0.01517725, + "auxiliary_loss_mlp": 0.00314803, + "balance_loss_clip": 1.25830531, + "balance_loss_mlp": 0.28554934, + "epoch": 0.40691417405681646, + "flos": 28512821249280.0, + "grad_norm": 34.4928877755483, + "language_loss": 0.8401593, + "learning_rate": 2.685849508738034e-06, + "loss": 0.85848457, + "num_input_tokens_seen": 145236555, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.29284668, + "step": 6768, + "time_per_iteration": 2.7089619636535645 + }, + { + "auxiliary_loss_clip": 0.01499062, + "auxiliary_loss_mlp": 0.00306725, + "balance_loss_clip": 1.24115491, + "balance_loss_mlp": 0.27718529, + "epoch": 0.4069742973094844, + "flos": 20814040627200.0, + "grad_norm": 68.35475911304923, + "language_loss": 0.93186259, + "learning_rate": 2.6854836498913995e-06, + "loss": 0.94992054, + "num_input_tokens_seen": 145254595, + "router_z_loss_clip": 2.58007812, + "router_z_loss_mlp": 0.2956543, + "step": 6769, + "time_per_iteration": 2.6208348274230957 + }, + { + "auxiliary_loss_clip": 0.01487721, + "auxiliary_loss_mlp": 0.00299353, + "balance_loss_clip": 1.2376219, + "balance_loss_mlp": 0.27096966, + "epoch": 0.4070344205621524, + "flos": 21470272151040.0, + "grad_norm": 15.192417870740572, + "language_loss": 0.85938948, + "learning_rate": 2.685117765051156e-06, + "loss": 0.87726021, + "num_input_tokens_seen": 145274005, + "router_z_loss_clip": 2.50195312, + "router_z_loss_mlp": 0.28393555, + "step": 6770, + "time_per_iteration": 2.6757125854492188 + }, + { + "auxiliary_loss_clip": 0.01478831, + "auxiliary_loss_mlp": 0.00336904, + "balance_loss_clip": 1.22309852, + "balance_loss_mlp": 0.30544454, + "epoch": 0.4070945438148204, + "flos": 26830046937600.0, + "grad_norm": 108.34020628884224, + "language_loss": 0.84924603, + "learning_rate": 2.6847518542311783e-06, + "loss": 0.86740339, + "num_input_tokens_seen": 145294850, + "router_z_loss_clip": 2.55664062, + "router_z_loss_mlp": 0.3145752, + "step": 6771, + "time_per_iteration": 2.7132208347320557 + }, + { + "auxiliary_loss_clip": 0.01472326, + "auxiliary_loss_mlp": 0.00322287, + "balance_loss_clip": 1.22135472, + "balance_loss_mlp": 0.29329485, + "epoch": 0.4071546670674884, + "flos": 26354158623360.0, + "grad_norm": 3.3188086155513976, + "language_loss": 0.80647713, + "learning_rate": 2.6843859174453417e-06, + "loss": 0.82442325, + "num_input_tokens_seen": 145317050, + "router_z_loss_clip": 2.50976562, + "router_z_loss_mlp": 0.28979492, + "step": 6772, + "time_per_iteration": 4.101114273071289 + }, + { + "auxiliary_loss_clip": 0.01479502, + "auxiliary_loss_mlp": 0.00312281, + "balance_loss_clip": 1.22833037, + "balance_loss_mlp": 0.28468412, + "epoch": 0.40721479032015634, + "flos": 17895401020800.0, + "grad_norm": 2.2749283779985428, + "language_loss": 0.87895954, + "learning_rate": 2.6840199547075218e-06, + "loss": 0.89687735, + "num_input_tokens_seen": 145334480, + "router_z_loss_clip": 2.50976562, + "router_z_loss_mlp": 0.27612305, + "step": 6773, + "time_per_iteration": 4.040104150772095 + }, + { + "auxiliary_loss_clip": 0.01458882, + "auxiliary_loss_mlp": 0.00176156, + "balance_loss_clip": 1.25665689, + "balance_loss_mlp": 0.16447356, + "epoch": 0.4072749135728243, + "flos": 49854570537600.0, + "grad_norm": 0.9538996705580078, + "language_loss": 0.63950688, + "learning_rate": 2.683653966031597e-06, + "loss": 0.65585726, + "num_input_tokens_seen": 145388695, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.11669922, + "step": 6774, + "time_per_iteration": 3.062551498413086 + }, + { + "auxiliary_loss_clip": 0.0145883, + "auxiliary_loss_mlp": 0.00334277, + "balance_loss_clip": 1.20739186, + "balance_loss_mlp": 0.30423644, + "epoch": 0.40733503682549227, + "flos": 27563630400000.0, + "grad_norm": 670.2471096212143, + "language_loss": 0.79800463, + "learning_rate": 2.683287951431446e-06, + "loss": 0.81593573, + "num_input_tokens_seen": 145408240, + "router_z_loss_clip": 2.515625, + "router_z_loss_mlp": 0.30029297, + "step": 6775, + "time_per_iteration": 2.7410616874694824 + }, + { + "auxiliary_loss_clip": 0.01453087, + "auxiliary_loss_mlp": 0.0030407, + "balance_loss_clip": 1.20730567, + "balance_loss_mlp": 0.27896422, + "epoch": 0.40739516007816023, + "flos": 22126970551680.0, + "grad_norm": 3.8144484008095456, + "language_loss": 0.83623427, + "learning_rate": 2.6829219109209474e-06, + "loss": 0.8538059, + "num_input_tokens_seen": 145428395, + "router_z_loss_clip": 2.45898438, + "router_z_loss_mlp": 0.2512207, + "step": 6776, + "time_per_iteration": 4.1398024559021 + }, + { + "auxiliary_loss_clip": 0.01460667, + "auxiliary_loss_mlp": 0.00322831, + "balance_loss_clip": 1.20723629, + "balance_loss_mlp": 0.29467386, + "epoch": 0.4074552833308282, + "flos": 23842243693440.0, + "grad_norm": 141.33446925520673, + "language_loss": 0.86476725, + "learning_rate": 2.682555844513981e-06, + "loss": 0.88260221, + "num_input_tokens_seen": 145448290, + "router_z_loss_clip": 2.53515625, + "router_z_loss_mlp": 0.28173828, + "step": 6777, + "time_per_iteration": 2.7067837715148926 + }, + { + "auxiliary_loss_clip": 0.01441553, + "auxiliary_loss_mlp": 0.0011949, + "balance_loss_clip": 1.24342704, + "balance_loss_mlp": 0.11042972, + "epoch": 0.40751540658349616, + "flos": 58000008781440.0, + "grad_norm": 0.6906222850432487, + "language_loss": 0.52778316, + "learning_rate": 2.6821897522244286e-06, + "loss": 0.54339361, + "num_input_tokens_seen": 145509785, + "router_z_loss_clip": 1.984375, + "router_z_loss_mlp": 0.09082031, + "step": 6778, + "time_per_iteration": 3.1699235439300537 + }, + { + "auxiliary_loss_clip": 0.01449776, + "auxiliary_loss_mlp": 0.00315085, + "balance_loss_clip": 1.20340133, + "balance_loss_mlp": 0.28490168, + "epoch": 0.40757552983616413, + "flos": 21214659991680.0, + "grad_norm": 9.189414732807958, + "language_loss": 0.89898354, + "learning_rate": 2.6818236340661718e-06, + "loss": 0.91663218, + "num_input_tokens_seen": 145528620, + "router_z_loss_clip": 2.46289062, + "router_z_loss_mlp": 0.30175781, + "step": 6779, + "time_per_iteration": 2.704439640045166 + }, + { + "auxiliary_loss_clip": 0.01449063, + "auxiliary_loss_mlp": 0.00318083, + "balance_loss_clip": 1.20374405, + "balance_loss_mlp": 0.28990149, + "epoch": 0.4076356530888321, + "flos": 26833530556800.0, + "grad_norm": 2.7171844606275397, + "language_loss": 0.81922996, + "learning_rate": 2.6814574900530957e-06, + "loss": 0.83690143, + "num_input_tokens_seen": 145547775, + "router_z_loss_clip": 2.45117188, + "router_z_loss_mlp": 0.28186035, + "step": 6780, + "time_per_iteration": 2.7270734310150146 + }, + { + "auxiliary_loss_clip": 0.01419485, + "auxiliary_loss_mlp": 0.00302061, + "balance_loss_clip": 1.17994237, + "balance_loss_mlp": 0.27523845, + "epoch": 0.40769577634150006, + "flos": 12203021272320.0, + "grad_norm": 8.699434267212615, + "language_loss": 0.74082041, + "learning_rate": 2.6810913201990827e-06, + "loss": 0.75803584, + "num_input_tokens_seen": 145564465, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.26855469, + "step": 6781, + "time_per_iteration": 2.6750071048736572 + }, + { + "auxiliary_loss_clip": 0.01434185, + "auxiliary_loss_mlp": 0.00297185, + "balance_loss_clip": 1.19087541, + "balance_loss_mlp": 0.27031541, + "epoch": 0.407755899594168, + "flos": 33655264796160.0, + "grad_norm": 11.385036462673098, + "language_loss": 0.75818801, + "learning_rate": 2.6807251245180183e-06, + "loss": 0.77550173, + "num_input_tokens_seen": 145585965, + "router_z_loss_clip": 2.43164062, + "router_z_loss_mlp": 0.26904297, + "step": 6782, + "time_per_iteration": 4.233702898025513 + }, + { + "auxiliary_loss_clip": 0.01422815, + "auxiliary_loss_mlp": 0.00318948, + "balance_loss_clip": 1.17777419, + "balance_loss_mlp": 0.28988475, + "epoch": 0.407816022846836, + "flos": 20157342226560.0, + "grad_norm": 28.645444860725227, + "language_loss": 0.88865501, + "learning_rate": 2.6803589030237897e-06, + "loss": 0.90607262, + "num_input_tokens_seen": 145605000, + "router_z_loss_clip": 2.44921875, + "router_z_loss_mlp": 0.29052734, + "step": 6783, + "time_per_iteration": 2.675356388092041 + }, + { + "auxiliary_loss_clip": 0.01433986, + "auxiliary_loss_mlp": 0.00325751, + "balance_loss_clip": 1.19033313, + "balance_loss_mlp": 0.29571062, + "epoch": 0.40787614609950396, + "flos": 21178821196800.0, + "grad_norm": 135.06827649318106, + "language_loss": 0.85841531, + "learning_rate": 2.679992655730283e-06, + "loss": 0.87601268, + "num_input_tokens_seen": 145623740, + "router_z_loss_clip": 2.43554688, + "router_z_loss_mlp": 0.30078125, + "step": 6784, + "time_per_iteration": 2.7047250270843506 + }, + { + "auxiliary_loss_clip": 0.01432615, + "auxiliary_loss_mlp": 0.00346272, + "balance_loss_clip": 1.18189824, + "balance_loss_mlp": 0.31735188, + "epoch": 0.407936269352172, + "flos": 20520650338560.0, + "grad_norm": 178.39465157413716, + "language_loss": 0.74218822, + "learning_rate": 2.679626382651386e-06, + "loss": 0.7599771, + "num_input_tokens_seen": 145643515, + "router_z_loss_clip": 2.50585938, + "router_z_loss_mlp": 0.2890625, + "step": 6785, + "time_per_iteration": 2.6575913429260254 + }, + { + "auxiliary_loss_clip": 0.01422453, + "auxiliary_loss_mlp": 0.00317611, + "balance_loss_clip": 1.17666578, + "balance_loss_mlp": 0.28894094, + "epoch": 0.40799639260483994, + "flos": 20118809911680.0, + "grad_norm": 52.26521977229502, + "language_loss": 0.861678, + "learning_rate": 2.679260083800989e-06, + "loss": 0.87907863, + "num_input_tokens_seen": 145660890, + "router_z_loss_clip": 2.45703125, + "router_z_loss_mlp": 0.28662109, + "step": 6786, + "time_per_iteration": 2.63122296333313 + }, + { + "auxiliary_loss_clip": 0.01420612, + "auxiliary_loss_mlp": 0.0033702, + "balance_loss_clip": 1.17646134, + "balance_loss_mlp": 0.30774194, + "epoch": 0.4080565158575079, + "flos": 20997328752000.0, + "grad_norm": 25.042325898533264, + "language_loss": 0.8535701, + "learning_rate": 2.678893759192982e-06, + "loss": 0.87114644, + "num_input_tokens_seen": 145680070, + "router_z_loss_clip": 2.44140625, + "router_z_loss_mlp": 0.29272461, + "step": 6787, + "time_per_iteration": 2.6502041816711426 + }, + { + "auxiliary_loss_clip": 0.01431033, + "auxiliary_loss_mlp": 0.00309714, + "balance_loss_clip": 1.18302917, + "balance_loss_mlp": 0.27944639, + "epoch": 0.40811663911017587, + "flos": 19317714837120.0, + "grad_norm": 5.590155406613061, + "language_loss": 0.74787468, + "learning_rate": 2.678527408841255e-06, + "loss": 0.76528215, + "num_input_tokens_seen": 145698010, + "router_z_loss_clip": 2.48242188, + "router_z_loss_mlp": 0.30273438, + "step": 6788, + "time_per_iteration": 2.6242079734802246 + }, + { + "auxiliary_loss_clip": 0.01424582, + "auxiliary_loss_mlp": 0.00330959, + "balance_loss_clip": 1.17350447, + "balance_loss_mlp": 0.3012045, + "epoch": 0.40817676236284384, + "flos": 40625382119040.0, + "grad_norm": 3.998006126211897, + "language_loss": 0.73041987, + "learning_rate": 2.678161032759701e-06, + "loss": 0.74797529, + "num_input_tokens_seen": 145722215, + "router_z_loss_clip": 2.50976562, + "router_z_loss_mlp": 0.29772949, + "step": 6789, + "time_per_iteration": 2.8281960487365723 + }, + { + "auxiliary_loss_clip": 0.01420182, + "auxiliary_loss_mlp": 0.00316132, + "balance_loss_clip": 1.17461812, + "balance_loss_mlp": 0.28865415, + "epoch": 0.4082368856155118, + "flos": 20522086882560.0, + "grad_norm": 495.7123768406948, + "language_loss": 0.68257302, + "learning_rate": 2.6777946309622123e-06, + "loss": 0.69993615, + "num_input_tokens_seen": 145741090, + "router_z_loss_clip": 2.45703125, + "router_z_loss_mlp": 0.2746582, + "step": 6790, + "time_per_iteration": 2.68023419380188 + }, + { + "auxiliary_loss_clip": 0.01418619, + "auxiliary_loss_mlp": 0.0031173, + "balance_loss_clip": 1.17458785, + "balance_loss_mlp": 0.28297696, + "epoch": 0.40829700886817977, + "flos": 11427745098240.0, + "grad_norm": 6.786210880521379, + "language_loss": 0.77530897, + "learning_rate": 2.677428203462683e-06, + "loss": 0.79261243, + "num_input_tokens_seen": 145754985, + "router_z_loss_clip": 2.44140625, + "router_z_loss_mlp": 0.28771973, + "step": 6791, + "time_per_iteration": 2.6076364517211914 + }, + { + "auxiliary_loss_clip": 0.01398636, + "auxiliary_loss_mlp": 0.00118979, + "balance_loss_clip": 1.19688904, + "balance_loss_mlp": 0.1111115, + "epoch": 0.40835713212084773, + "flos": 67330677121920.0, + "grad_norm": 0.7511841296424061, + "language_loss": 0.5870502, + "learning_rate": 2.6770617502750093e-06, + "loss": 0.60222626, + "num_input_tokens_seen": 145815260, + "router_z_loss_clip": 2.015625, + "router_z_loss_mlp": 0.07861328, + "step": 6792, + "time_per_iteration": 3.1058013439178467 + }, + { + "auxiliary_loss_clip": 0.01425544, + "auxiliary_loss_mlp": 0.00303047, + "balance_loss_clip": 1.17533422, + "balance_loss_mlp": 0.27475911, + "epoch": 0.4084172553735157, + "flos": 21762010414080.0, + "grad_norm": 57.05819781834104, + "language_loss": 0.88781142, + "learning_rate": 2.6766952714130857e-06, + "loss": 0.90509737, + "num_input_tokens_seen": 145832665, + "router_z_loss_clip": 2.50585938, + "router_z_loss_mlp": 0.28320312, + "step": 6793, + "time_per_iteration": 2.648909568786621 + }, + { + "auxiliary_loss_clip": 0.01420688, + "auxiliary_loss_mlp": 0.00315491, + "balance_loss_clip": 1.17161632, + "balance_loss_mlp": 0.288037, + "epoch": 0.40847737862618366, + "flos": 27417258478080.0, + "grad_norm": 3.8952726669834905, + "language_loss": 0.8985123, + "learning_rate": 2.6763287668908094e-06, + "loss": 0.91587412, + "num_input_tokens_seen": 145850240, + "router_z_loss_clip": 2.4921875, + "router_z_loss_mlp": 0.27441406, + "step": 6794, + "time_per_iteration": 2.703150749206543 + }, + { + "auxiliary_loss_clip": 0.01425667, + "auxiliary_loss_mlp": 0.0032723, + "balance_loss_clip": 1.18079185, + "balance_loss_mlp": 0.29665291, + "epoch": 0.4085375018788516, + "flos": 18587255857920.0, + "grad_norm": 24.255096477170014, + "language_loss": 0.85074157, + "learning_rate": 2.6759622367220788e-06, + "loss": 0.86827058, + "num_input_tokens_seen": 145869545, + "router_z_loss_clip": 2.44921875, + "router_z_loss_mlp": 0.30603027, + "step": 6795, + "time_per_iteration": 2.751018762588501 + }, + { + "auxiliary_loss_clip": 0.01425513, + "auxiliary_loss_mlp": 0.00351638, + "balance_loss_clip": 1.17620921, + "balance_loss_mlp": 0.31984472, + "epoch": 0.4085976251315196, + "flos": 15411783029760.0, + "grad_norm": 14.832434759903762, + "language_loss": 0.77459288, + "learning_rate": 2.675595680920792e-06, + "loss": 0.79236436, + "num_input_tokens_seen": 145884025, + "router_z_loss_clip": 2.49414062, + "router_z_loss_mlp": 0.31787109, + "step": 6796, + "time_per_iteration": 2.727681875228882 + }, + { + "auxiliary_loss_clip": 0.01407151, + "auxiliary_loss_mlp": 0.0031401, + "balance_loss_clip": 1.16498554, + "balance_loss_mlp": 0.28578159, + "epoch": 0.40865774838418756, + "flos": 21252222639360.0, + "grad_norm": 4.085642572026513, + "language_loss": 0.84516752, + "learning_rate": 2.6752290995008498e-06, + "loss": 0.86237913, + "num_input_tokens_seen": 145903210, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.28186035, + "step": 6797, + "time_per_iteration": 2.719212532043457 + }, + { + "auxiliary_loss_clip": 0.01410036, + "auxiliary_loss_mlp": 0.0031248, + "balance_loss_clip": 1.1642741, + "balance_loss_mlp": 0.28556237, + "epoch": 0.4087178716368556, + "flos": 13772245714560.0, + "grad_norm": 4.819275065620166, + "language_loss": 0.93486345, + "learning_rate": 2.6748624924761523e-06, + "loss": 0.95208859, + "num_input_tokens_seen": 145920985, + "router_z_loss_clip": 2.4609375, + "router_z_loss_mlp": 0.26940918, + "step": 6798, + "time_per_iteration": 2.6685783863067627 + }, + { + "auxiliary_loss_clip": 0.01403765, + "auxiliary_loss_mlp": 0.00289684, + "balance_loss_clip": 1.1642592, + "balance_loss_mlp": 0.2636008, + "epoch": 0.40877799488952354, + "flos": 23621752056960.0, + "grad_norm": 6.112343132647309, + "language_loss": 0.88858753, + "learning_rate": 2.674495859860601e-06, + "loss": 0.90552199, + "num_input_tokens_seen": 145940350, + "router_z_loss_clip": 2.39453125, + "router_z_loss_mlp": 0.26074219, + "step": 6799, + "time_per_iteration": 2.7127511501312256 + }, + { + "auxiliary_loss_clip": 0.01414383, + "auxiliary_loss_mlp": 0.00301418, + "balance_loss_clip": 1.17079067, + "balance_loss_mlp": 0.27241391, + "epoch": 0.4088381181421915, + "flos": 20918791664640.0, + "grad_norm": 3.1807175602522517, + "language_loss": 0.91733694, + "learning_rate": 2.6741292016681e-06, + "loss": 0.93449497, + "num_input_tokens_seen": 145957460, + "router_z_loss_clip": 2.43554688, + "router_z_loss_mlp": 0.28979492, + "step": 6800, + "time_per_iteration": 2.7614519596099854 + }, + { + "auxiliary_loss_clip": 0.01408615, + "auxiliary_loss_mlp": 0.00317968, + "balance_loss_clip": 1.16809773, + "balance_loss_mlp": 0.28849977, + "epoch": 0.4088982413948595, + "flos": 13297578462720.0, + "grad_norm": 41.67330139276372, + "language_loss": 0.8301903, + "learning_rate": 2.6737625179125514e-06, + "loss": 0.8474561, + "num_input_tokens_seen": 145975285, + "router_z_loss_clip": 2.41015625, + "router_z_loss_mlp": 0.29492188, + "step": 6801, + "time_per_iteration": 2.840203046798706 + }, + { + "auxiliary_loss_clip": 0.01413092, + "auxiliary_loss_mlp": 0.00331093, + "balance_loss_clip": 1.17181158, + "balance_loss_mlp": 0.30092132, + "epoch": 0.40895836464752744, + "flos": 15267673664640.0, + "grad_norm": 9.522323583402022, + "language_loss": 0.85277259, + "learning_rate": 2.673395808607861e-06, + "loss": 0.87021446, + "num_input_tokens_seen": 145989150, + "router_z_loss_clip": 2.41210938, + "router_z_loss_mlp": 0.30187988, + "step": 6802, + "time_per_iteration": 2.7383153438568115 + }, + { + "auxiliary_loss_clip": 0.01420188, + "auxiliary_loss_mlp": 0.00313388, + "balance_loss_clip": 1.17681289, + "balance_loss_mlp": 0.28493294, + "epoch": 0.4090184879001954, + "flos": 14501411804160.0, + "grad_norm": 2.3148682118257735, + "language_loss": 0.86598706, + "learning_rate": 2.673029073767934e-06, + "loss": 0.88332283, + "num_input_tokens_seen": 146006980, + "router_z_loss_clip": 2.43554688, + "router_z_loss_mlp": 0.2845459, + "step": 6803, + "time_per_iteration": 2.724520683288574 + }, + { + "auxiliary_loss_clip": 0.01397995, + "auxiliary_loss_mlp": 0.00308527, + "balance_loss_clip": 1.16109467, + "balance_loss_mlp": 0.2796545, + "epoch": 0.40907861115286337, + "flos": 13881593692800.0, + "grad_norm": 8.815198984959101, + "language_loss": 0.87467027, + "learning_rate": 2.6726623134066764e-06, + "loss": 0.89173549, + "num_input_tokens_seen": 146025125, + "router_z_loss_clip": 2.3671875, + "router_z_loss_mlp": 0.28845215, + "step": 6804, + "time_per_iteration": 2.749619722366333 + }, + { + "auxiliary_loss_clip": 0.0141133, + "auxiliary_loss_mlp": 0.00319309, + "balance_loss_clip": 1.16601336, + "balance_loss_mlp": 0.29173556, + "epoch": 0.40913873440553133, + "flos": 28037615293440.0, + "grad_norm": 12.766313498358565, + "language_loss": 0.83376354, + "learning_rate": 2.672295527537998e-06, + "loss": 0.85106993, + "num_input_tokens_seen": 146044990, + "router_z_loss_clip": 2.453125, + "router_z_loss_mlp": 0.27600098, + "step": 6805, + "time_per_iteration": 2.714515209197998 + }, + { + "auxiliary_loss_clip": 0.01412461, + "auxiliary_loss_mlp": 0.0030439, + "balance_loss_clip": 1.16723299, + "balance_loss_mlp": 0.27861732, + "epoch": 0.4091988576581993, + "flos": 21618188357760.0, + "grad_norm": 100.53948754952127, + "language_loss": 0.83534288, + "learning_rate": 2.671928716175804e-06, + "loss": 0.85251141, + "num_input_tokens_seen": 146066045, + "router_z_loss_clip": 2.453125, + "router_z_loss_mlp": 0.2578125, + "step": 6806, + "time_per_iteration": 2.7058799266815186 + }, + { + "auxiliary_loss_clip": 0.0141715, + "auxiliary_loss_mlp": 0.00266109, + "balance_loss_clip": 1.17403281, + "balance_loss_mlp": 0.24047898, + "epoch": 0.40925898091086726, + "flos": 25224085860480.0, + "grad_norm": 10.092206488863786, + "language_loss": 0.78772718, + "learning_rate": 2.671561879334007e-06, + "loss": 0.80455971, + "num_input_tokens_seen": 146086280, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.25646973, + "step": 6807, + "time_per_iteration": 2.69642972946167 + }, + { + "auxiliary_loss_clip": 0.01376918, + "auxiliary_loss_mlp": 0.00070372, + "balance_loss_clip": 1.18582416, + "balance_loss_mlp": 0.06269466, + "epoch": 0.40931910416353523, + "flos": 68930568800640.0, + "grad_norm": 0.8083750418312338, + "language_loss": 0.58671606, + "learning_rate": 2.6711950170265155e-06, + "loss": 0.6011889, + "num_input_tokens_seen": 146148840, + "router_z_loss_clip": 1.90625, + "router_z_loss_mlp": 0.07666016, + "step": 6808, + "time_per_iteration": 3.271242380142212 + }, + { + "auxiliary_loss_clip": 0.0141503, + "auxiliary_loss_mlp": 0.00264727, + "balance_loss_clip": 1.17321908, + "balance_loss_mlp": 0.23777393, + "epoch": 0.4093792274162032, + "flos": 20189553747840.0, + "grad_norm": 4.811014592016072, + "language_loss": 0.61023426, + "learning_rate": 2.670828129267242e-06, + "loss": 0.6270318, + "num_input_tokens_seen": 146166195, + "router_z_loss_clip": 2.41601562, + "router_z_loss_mlp": 0.26953125, + "step": 6809, + "time_per_iteration": 2.736030340194702 + }, + { + "auxiliary_loss_clip": 0.01403656, + "auxiliary_loss_mlp": 0.0024995, + "balance_loss_clip": 1.16175425, + "balance_loss_mlp": 0.22341384, + "epoch": 0.40943935066887116, + "flos": 25228754628480.0, + "grad_norm": 4.1303496077500474, + "language_loss": 0.89504886, + "learning_rate": 2.6704612160700983e-06, + "loss": 0.91158485, + "num_input_tokens_seen": 146185045, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.26538086, + "step": 6810, + "time_per_iteration": 2.762766122817993 + }, + { + "auxiliary_loss_clip": 0.01421885, + "auxiliary_loss_mlp": 0.00297817, + "balance_loss_clip": 1.17324483, + "balance_loss_mlp": 0.26951653, + "epoch": 0.4094994739215392, + "flos": 23255319461760.0, + "grad_norm": 13.53074461078268, + "language_loss": 0.86132354, + "learning_rate": 2.670094277448999e-06, + "loss": 0.87852055, + "num_input_tokens_seen": 146204655, + "router_z_loss_clip": 2.48828125, + "router_z_loss_mlp": 0.28283691, + "step": 6811, + "time_per_iteration": 2.6620559692382812 + }, + { + "auxiliary_loss_clip": 0.01408114, + "auxiliary_loss_mlp": 0.00259819, + "balance_loss_clip": 1.1623522, + "balance_loss_mlp": 0.23162647, + "epoch": 0.40955959717420715, + "flos": 17382165540480.0, + "grad_norm": 13.969822797946932, + "language_loss": 0.78892291, + "learning_rate": 2.669727313417857e-06, + "loss": 0.80560225, + "num_input_tokens_seen": 146222000, + "router_z_loss_clip": 2.45898438, + "router_z_loss_mlp": 0.28186035, + "step": 6812, + "time_per_iteration": 2.6516098976135254 + }, + { + "auxiliary_loss_clip": 0.0140659, + "auxiliary_loss_mlp": 0.00292961, + "balance_loss_clip": 1.16393733, + "balance_loss_mlp": 0.26380223, + "epoch": 0.4096197204268751, + "flos": 25082418620160.0, + "grad_norm": 10.121506947891838, + "language_loss": 0.72998559, + "learning_rate": 2.6693603239905872e-06, + "loss": 0.74698114, + "num_input_tokens_seen": 146242630, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.29174805, + "step": 6813, + "time_per_iteration": 2.6722145080566406 + }, + { + "auxiliary_loss_clip": 0.01392996, + "auxiliary_loss_mlp": 0.00265, + "balance_loss_clip": 1.15061617, + "balance_loss_mlp": 0.23784381, + "epoch": 0.4096798436795431, + "flos": 30586769648640.0, + "grad_norm": 58.23938987657569, + "language_loss": 0.79769772, + "learning_rate": 2.6689933091811087e-06, + "loss": 0.81427765, + "num_input_tokens_seen": 146263070, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.27160645, + "step": 6814, + "time_per_iteration": 4.083324909210205 + }, + { + "auxiliary_loss_clip": 0.01422885, + "auxiliary_loss_mlp": 0.00285642, + "balance_loss_clip": 1.17134345, + "balance_loss_mlp": 0.25791392, + "epoch": 0.40973996693221104, + "flos": 24133622820480.0, + "grad_norm": 11.844929396767144, + "language_loss": 0.75474429, + "learning_rate": 2.6686262690033357e-06, + "loss": 0.77182961, + "num_input_tokens_seen": 146282890, + "router_z_loss_clip": 2.51757812, + "router_z_loss_mlp": 0.27734375, + "step": 6815, + "time_per_iteration": 4.076141595840454 + }, + { + "auxiliary_loss_clip": 0.01409669, + "auxiliary_loss_mlp": 0.00300109, + "balance_loss_clip": 1.16366625, + "balance_loss_mlp": 0.27208331, + "epoch": 0.409800090184879, + "flos": 23988974751360.0, + "grad_norm": 9.965380070753442, + "language_loss": 0.82390344, + "learning_rate": 2.668259203471188e-06, + "loss": 0.84100127, + "num_input_tokens_seen": 146301755, + "router_z_loss_clip": 2.46289062, + "router_z_loss_mlp": 0.28027344, + "step": 6816, + "time_per_iteration": 2.6798224449157715 + }, + { + "auxiliary_loss_clip": 0.01410227, + "auxiliary_loss_mlp": 0.00291995, + "balance_loss_clip": 1.16391468, + "balance_loss_mlp": 0.26390889, + "epoch": 0.40986021343754697, + "flos": 16143678552960.0, + "grad_norm": 10.471479753651506, + "language_loss": 0.87743217, + "learning_rate": 2.6678921125985843e-06, + "loss": 0.89445436, + "num_input_tokens_seen": 146316835, + "router_z_loss_clip": 2.4609375, + "router_z_loss_mlp": 0.28088379, + "step": 6817, + "time_per_iteration": 2.611795425415039 + }, + { + "auxiliary_loss_clip": 0.01420968, + "auxiliary_loss_mlp": 0.0030629, + "balance_loss_clip": 1.16272807, + "balance_loss_mlp": 0.2763682, + "epoch": 0.40992033669021494, + "flos": 24790824011520.0, + "grad_norm": 144.60074343327943, + "language_loss": 0.88582212, + "learning_rate": 2.667524996399444e-06, + "loss": 0.90309465, + "num_input_tokens_seen": 146336650, + "router_z_loss_clip": 2.58398438, + "router_z_loss_mlp": 0.2989502, + "step": 6818, + "time_per_iteration": 4.118330240249634 + }, + { + "auxiliary_loss_clip": 0.01409618, + "auxiliary_loss_mlp": 0.00262611, + "balance_loss_clip": 1.16307425, + "balance_loss_mlp": 0.2365759, + "epoch": 0.4099804599428829, + "flos": 29641888431360.0, + "grad_norm": 36.24206301136326, + "language_loss": 0.71398568, + "learning_rate": 2.66715785488769e-06, + "loss": 0.73070794, + "num_input_tokens_seen": 146357640, + "router_z_loss_clip": 2.46484375, + "router_z_loss_mlp": 0.26037598, + "step": 6819, + "time_per_iteration": 2.8902831077575684 + }, + { + "auxiliary_loss_clip": 0.0142173, + "auxiliary_loss_mlp": 0.00303273, + "balance_loss_clip": 1.16474926, + "balance_loss_mlp": 0.27324465, + "epoch": 0.41004058319555087, + "flos": 24826590979200.0, + "grad_norm": 4.778699194793992, + "language_loss": 0.90895122, + "learning_rate": 2.6667906880772428e-06, + "loss": 0.92620128, + "num_input_tokens_seen": 146379325, + "router_z_loss_clip": 2.57226562, + "router_z_loss_mlp": 0.29992676, + "step": 6820, + "time_per_iteration": 2.7267448902130127 + }, + { + "auxiliary_loss_clip": 0.0139687, + "auxiliary_loss_mlp": 0.00296921, + "balance_loss_clip": 1.15297258, + "balance_loss_mlp": 0.26794127, + "epoch": 0.41010070644821883, + "flos": 25737464995200.0, + "grad_norm": 115.08346938310278, + "language_loss": 0.78367758, + "learning_rate": 2.6664234959820256e-06, + "loss": 0.80061549, + "num_input_tokens_seen": 146398635, + "router_z_loss_clip": 2.43945312, + "router_z_loss_mlp": 0.29016113, + "step": 6821, + "time_per_iteration": 2.6953489780426025 + }, + { + "auxiliary_loss_clip": 0.01394558, + "auxiliary_loss_mlp": 0.00264358, + "balance_loss_clip": 1.14665759, + "balance_loss_mlp": 0.23703498, + "epoch": 0.4101608297008868, + "flos": 22346061557760.0, + "grad_norm": 33.47381231574748, + "language_loss": 0.7986939, + "learning_rate": 2.6660562786159634e-06, + "loss": 0.81528306, + "num_input_tokens_seen": 146417585, + "router_z_loss_clip": 2.47851562, + "router_z_loss_mlp": 0.2734375, + "step": 6822, + "time_per_iteration": 2.626279830932617 + }, + { + "auxiliary_loss_clip": 0.01400868, + "auxiliary_loss_mlp": 0.00296292, + "balance_loss_clip": 1.14896214, + "balance_loss_mlp": 0.26713338, + "epoch": 0.41022095295355476, + "flos": 21945083057280.0, + "grad_norm": 7.559985696240839, + "language_loss": 0.83649391, + "learning_rate": 2.6656890359929796e-06, + "loss": 0.8534655, + "num_input_tokens_seen": 146437035, + "router_z_loss_clip": 2.52148438, + "router_z_loss_mlp": 0.29125977, + "step": 6823, + "time_per_iteration": 2.6830451488494873 + }, + { + "auxiliary_loss_clip": 0.01405575, + "auxiliary_loss_mlp": 0.00269099, + "balance_loss_clip": 1.15286946, + "balance_loss_mlp": 0.23943949, + "epoch": 0.4102810762062228, + "flos": 27450511493760.0, + "grad_norm": 23.84103159984935, + "language_loss": 0.80702579, + "learning_rate": 2.665321768127001e-06, + "loss": 0.82377255, + "num_input_tokens_seen": 146457370, + "router_z_loss_clip": 2.52734375, + "router_z_loss_mlp": 0.29663086, + "step": 6824, + "time_per_iteration": 4.104139089584351 + }, + { + "auxiliary_loss_clip": 0.01396113, + "auxiliary_loss_mlp": 0.0027708, + "balance_loss_clip": 1.14309335, + "balance_loss_mlp": 0.24754041, + "epoch": 0.41034119945889075, + "flos": 24499265316480.0, + "grad_norm": 11.853762083682092, + "language_loss": 0.79436564, + "learning_rate": 2.6649544750319548e-06, + "loss": 0.81109756, + "num_input_tokens_seen": 146478105, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.29528809, + "step": 6825, + "time_per_iteration": 2.795856475830078 + }, + { + "auxiliary_loss_clip": 0.01389948, + "auxiliary_loss_mlp": 0.00244421, + "balance_loss_clip": 1.13980389, + "balance_loss_mlp": 0.21795669, + "epoch": 0.4104013227115587, + "flos": 24352641999360.0, + "grad_norm": 30.18353569223568, + "language_loss": 0.92310631, + "learning_rate": 2.664587156721768e-06, + "loss": 0.93944997, + "num_input_tokens_seen": 146497835, + "router_z_loss_clip": 2.50390625, + "router_z_loss_mlp": 0.26477051, + "step": 6826, + "time_per_iteration": 2.7050585746765137 + }, + { + "auxiliary_loss_clip": 0.01396126, + "auxiliary_loss_mlp": 0.00256865, + "balance_loss_clip": 1.14929187, + "balance_loss_mlp": 0.22918504, + "epoch": 0.4104614459642267, + "flos": 23729340268800.0, + "grad_norm": 7.085910192077398, + "language_loss": 0.75289237, + "learning_rate": 2.6642198132103696e-06, + "loss": 0.76942229, + "num_input_tokens_seen": 146517735, + "router_z_loss_clip": 2.46679688, + "router_z_loss_mlp": 0.2767334, + "step": 6827, + "time_per_iteration": 2.642277479171753 + }, + { + "auxiliary_loss_clip": 0.01379609, + "auxiliary_loss_mlp": 0.00246598, + "balance_loss_clip": 1.13375759, + "balance_loss_mlp": 0.22040814, + "epoch": 0.41052156921689464, + "flos": 22127976132480.0, + "grad_norm": 20.346235489964542, + "language_loss": 0.78887111, + "learning_rate": 2.663852444511689e-06, + "loss": 0.80513316, + "num_input_tokens_seen": 146537640, + "router_z_loss_clip": 2.45898438, + "router_z_loss_mlp": 0.26171875, + "step": 6828, + "time_per_iteration": 2.6456539630889893 + }, + { + "auxiliary_loss_clip": 0.01393769, + "auxiliary_loss_mlp": 0.00257275, + "balance_loss_clip": 1.13956225, + "balance_loss_mlp": 0.22725785, + "epoch": 0.4105816924695626, + "flos": 20084371747200.0, + "grad_norm": 6.903336998367901, + "language_loss": 0.90364802, + "learning_rate": 2.6634850506396574e-06, + "loss": 0.92015839, + "num_input_tokens_seen": 146554695, + "router_z_loss_clip": 2.54296875, + "router_z_loss_mlp": 0.3001709, + "step": 6829, + "time_per_iteration": 2.6480817794799805 + }, + { + "auxiliary_loss_clip": 0.01377395, + "auxiliary_loss_mlp": 0.00257214, + "balance_loss_clip": 1.12631166, + "balance_loss_mlp": 0.22880657, + "epoch": 0.4106418157222306, + "flos": 18076785724800.0, + "grad_norm": 9.192076690317803, + "language_loss": 0.94873154, + "learning_rate": 2.663117631608206e-06, + "loss": 0.96507764, + "num_input_tokens_seen": 146573740, + "router_z_loss_clip": 2.51171875, + "router_z_loss_mlp": 0.28393555, + "step": 6830, + "time_per_iteration": 2.7050697803497314 + }, + { + "auxiliary_loss_clip": 0.01384301, + "auxiliary_loss_mlp": 0.00236132, + "balance_loss_clip": 1.13148761, + "balance_loss_mlp": 0.20658, + "epoch": 0.41070193897489854, + "flos": 21647850013440.0, + "grad_norm": 8.386839145773173, + "language_loss": 0.73360085, + "learning_rate": 2.662750187431268e-06, + "loss": 0.74980521, + "num_input_tokens_seen": 146592885, + "router_z_loss_clip": 2.52734375, + "router_z_loss_mlp": 0.2956543, + "step": 6831, + "time_per_iteration": 2.664151668548584 + }, + { + "auxiliary_loss_clip": 0.01386073, + "auxiliary_loss_mlp": 0.00245289, + "balance_loss_clip": 1.1339618, + "balance_loss_mlp": 0.2164408, + "epoch": 0.4107620622275665, + "flos": 26648195356800.0, + "grad_norm": 2.837288790692594, + "language_loss": 0.77262795, + "learning_rate": 2.662382718122776e-06, + "loss": 0.78894162, + "num_input_tokens_seen": 146611995, + "router_z_loss_clip": 2.51953125, + "router_z_loss_mlp": 0.28820801, + "step": 6832, + "time_per_iteration": 2.7238142490386963 + }, + { + "auxiliary_loss_clip": 0.01382918, + "auxiliary_loss_mlp": 0.00264687, + "balance_loss_clip": 1.13129473, + "balance_loss_mlp": 0.23742396, + "epoch": 0.41082218548023447, + "flos": 18734310138240.0, + "grad_norm": 5.493127954347516, + "language_loss": 0.84241086, + "learning_rate": 2.662015223696666e-06, + "loss": 0.85888696, + "num_input_tokens_seen": 146628045, + "router_z_loss_clip": 2.51757812, + "router_z_loss_mlp": 0.27246094, + "step": 6833, + "time_per_iteration": 2.640706777572632 + }, + { + "auxiliary_loss_clip": 0.01394455, + "auxiliary_loss_mlp": 0.00262507, + "balance_loss_clip": 1.1387434, + "balance_loss_mlp": 0.23302685, + "epoch": 0.41088230873290243, + "flos": 22893771116160.0, + "grad_norm": 5.3259668843236, + "language_loss": 0.81012678, + "learning_rate": 2.6616477041668713e-06, + "loss": 0.8266964, + "num_input_tokens_seen": 146648355, + "router_z_loss_clip": 2.55859375, + "router_z_loss_mlp": 0.29443359, + "step": 6834, + "time_per_iteration": 2.6702494621276855 + }, + { + "auxiliary_loss_clip": 0.01383621, + "auxiliary_loss_mlp": 0.00240392, + "balance_loss_clip": 1.13051116, + "balance_loss_mlp": 0.21274699, + "epoch": 0.4109424319855704, + "flos": 24276978000000.0, + "grad_norm": 42.1509701045469, + "language_loss": 0.77931553, + "learning_rate": 2.661280159547329e-06, + "loss": 0.79555571, + "num_input_tokens_seen": 146668370, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.27636719, + "step": 6835, + "time_per_iteration": 2.6571192741394043 + }, + { + "auxiliary_loss_clip": 0.01384163, + "auxiliary_loss_mlp": 0.00222979, + "balance_loss_clip": 1.13197458, + "balance_loss_mlp": 0.1974439, + "epoch": 0.41100255523823837, + "flos": 12969139478400.0, + "grad_norm": 12.698165255873297, + "language_loss": 0.98045063, + "learning_rate": 2.660912589851978e-06, + "loss": 0.99652201, + "num_input_tokens_seen": 146686665, + "router_z_loss_clip": 2.5234375, + "router_z_loss_mlp": 0.25561523, + "step": 6836, + "time_per_iteration": 2.653738498687744 + }, + { + "auxiliary_loss_clip": 0.01369643, + "auxiliary_loss_mlp": 0.00233054, + "balance_loss_clip": 1.1243217, + "balance_loss_mlp": 0.20624372, + "epoch": 0.4110626784909064, + "flos": 23145648261120.0, + "grad_norm": 16.944712182581654, + "language_loss": 0.7451061, + "learning_rate": 2.6605449950947547e-06, + "loss": 0.76113307, + "num_input_tokens_seen": 146706570, + "router_z_loss_clip": 2.453125, + "router_z_loss_mlp": 0.26831055, + "step": 6837, + "time_per_iteration": 2.695446491241455 + }, + { + "auxiliary_loss_clip": 0.01388967, + "auxiliary_loss_mlp": 0.00242718, + "balance_loss_clip": 1.1342566, + "balance_loss_mlp": 0.21541885, + "epoch": 0.41112280174357435, + "flos": 22747399194240.0, + "grad_norm": 10.887020802883303, + "language_loss": 0.84264171, + "learning_rate": 2.660177375289599e-06, + "loss": 0.8589586, + "num_input_tokens_seen": 146723425, + "router_z_loss_clip": 2.54882812, + "router_z_loss_mlp": 0.27307129, + "step": 6838, + "time_per_iteration": 2.7097127437591553 + }, + { + "auxiliary_loss_clip": 0.01390225, + "auxiliary_loss_mlp": 0.00227454, + "balance_loss_clip": 1.13860083, + "balance_loss_mlp": 0.19964266, + "epoch": 0.4111829249962423, + "flos": 21102403011840.0, + "grad_norm": 9.464108747440203, + "language_loss": 0.91437048, + "learning_rate": 2.659809730450451e-06, + "loss": 0.9305473, + "num_input_tokens_seen": 146741640, + "router_z_loss_clip": 2.51757812, + "router_z_loss_mlp": 0.27844238, + "step": 6839, + "time_per_iteration": 2.7113709449768066 + }, + { + "auxiliary_loss_clip": 0.01378316, + "auxiliary_loss_mlp": 0.00218634, + "balance_loss_clip": 1.13079798, + "balance_loss_mlp": 0.19153813, + "epoch": 0.4112430482489103, + "flos": 21505787723520.0, + "grad_norm": 175.44311889823615, + "language_loss": 0.88768125, + "learning_rate": 2.6594420605912523e-06, + "loss": 0.90365076, + "num_input_tokens_seen": 146759195, + "router_z_loss_clip": 2.47460938, + "router_z_loss_mlp": 0.27062988, + "step": 6840, + "time_per_iteration": 2.674197196960449 + }, + { + "auxiliary_loss_clip": 0.01379069, + "auxiliary_loss_mlp": 0.00241447, + "balance_loss_clip": 1.12616038, + "balance_loss_mlp": 0.21394551, + "epoch": 0.41130317150157825, + "flos": 19570022945280.0, + "grad_norm": 143.2282223427411, + "language_loss": 0.76193899, + "learning_rate": 2.6590743657259442e-06, + "loss": 0.77814412, + "num_input_tokens_seen": 146774990, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.27490234, + "step": 6841, + "time_per_iteration": 2.7424862384796143 + }, + { + "auxiliary_loss_clip": 0.01319223, + "auxiliary_loss_mlp": 0.0007177, + "balance_loss_clip": 1.1459825, + "balance_loss_mlp": 0.06566673, + "epoch": 0.4113632947542462, + "flos": 62383157706240.0, + "grad_norm": 3.9857311708039527, + "language_loss": 0.59311676, + "learning_rate": 2.65870664586847e-06, + "loss": 0.6070267, + "num_input_tokens_seen": 146839610, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.06103516, + "step": 6842, + "time_per_iteration": 3.2852349281311035 + }, + { + "auxiliary_loss_clip": 0.01380711, + "auxiliary_loss_mlp": 0.00227698, + "balance_loss_clip": 1.1345154, + "balance_loss_mlp": 0.20327184, + "epoch": 0.4114234180069142, + "flos": 13918617636480.0, + "grad_norm": 174.89712729263888, + "language_loss": 0.78022474, + "learning_rate": 2.6583389010327742e-06, + "loss": 0.79630882, + "num_input_tokens_seen": 146857360, + "router_z_loss_clip": 2.46289062, + "router_z_loss_mlp": 0.2442627, + "step": 6843, + "time_per_iteration": 2.6498069763183594 + }, + { + "auxiliary_loss_clip": 0.01375027, + "auxiliary_loss_mlp": 0.00051615, + "balance_loss_clip": 1.19147015, + "balance_loss_mlp": 0.04651245, + "epoch": 0.41148354125958214, + "flos": 64928505219840.0, + "grad_norm": 0.7035343193442615, + "language_loss": 0.5339672, + "learning_rate": 2.6579711312328013e-06, + "loss": 0.54823357, + "num_input_tokens_seen": 146917055, + "router_z_loss_clip": 1.8359375, + "router_z_loss_mlp": 0.05102539, + "step": 6844, + "time_per_iteration": 3.1321163177490234 + }, + { + "auxiliary_loss_clip": 0.01391776, + "auxiliary_loss_mlp": 0.00235322, + "balance_loss_clip": 1.1423595, + "balance_loss_mlp": 0.20972812, + "epoch": 0.4115436645122501, + "flos": 18728779443840.0, + "grad_norm": 3.343607194329203, + "language_loss": 0.73096937, + "learning_rate": 2.6576033364824967e-06, + "loss": 0.74724036, + "num_input_tokens_seen": 146935215, + "router_z_loss_clip": 2.49609375, + "router_z_loss_mlp": 0.25610352, + "step": 6845, + "time_per_iteration": 2.688157081604004 + }, + { + "auxiliary_loss_clip": 0.01394422, + "auxiliary_loss_mlp": 0.00227827, + "balance_loss_clip": 1.14090765, + "balance_loss_mlp": 0.20166066, + "epoch": 0.41160378776491807, + "flos": 16252918790400.0, + "grad_norm": 14.113990501804645, + "language_loss": 0.77456033, + "learning_rate": 2.657235516795808e-06, + "loss": 0.79078287, + "num_input_tokens_seen": 146951970, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.26171875, + "step": 6846, + "time_per_iteration": 2.786271810531616 + }, + { + "auxiliary_loss_clip": 0.0139122, + "auxiliary_loss_mlp": 0.00231816, + "balance_loss_clip": 1.14440072, + "balance_loss_mlp": 0.20721135, + "epoch": 0.41166391101758604, + "flos": 27970031854080.0, + "grad_norm": 2.010351342107659, + "language_loss": 0.71490949, + "learning_rate": 2.6568676721866826e-06, + "loss": 0.73113984, + "num_input_tokens_seen": 146975615, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.24584961, + "step": 6847, + "time_per_iteration": 2.8369603157043457 + }, + { + "auxiliary_loss_clip": 0.01382517, + "auxiliary_loss_mlp": 0.00235988, + "balance_loss_clip": 1.13674903, + "balance_loss_mlp": 0.21014363, + "epoch": 0.411724034270254, + "flos": 34131296764800.0, + "grad_norm": 8.071896012635584, + "language_loss": 0.77612972, + "learning_rate": 2.656499802669069e-06, + "loss": 0.79231477, + "num_input_tokens_seen": 146998855, + "router_z_loss_clip": 2.45898438, + "router_z_loss_mlp": 0.25842285, + "step": 6848, + "time_per_iteration": 2.906960964202881 + }, + { + "auxiliary_loss_clip": 0.01400022, + "auxiliary_loss_mlp": 0.00088995, + "balance_loss_clip": 1.22302866, + "balance_loss_mlp": 0.08122301, + "epoch": 0.41178415752292197, + "flos": 67923670752000.0, + "grad_norm": 0.8583809841367851, + "language_loss": 0.5612607, + "learning_rate": 2.6561319082569174e-06, + "loss": 0.57615089, + "num_input_tokens_seen": 147062710, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.07763672, + "step": 6849, + "time_per_iteration": 3.2435426712036133 + }, + { + "auxiliary_loss_clip": 0.01387665, + "auxiliary_loss_mlp": 0.00237723, + "balance_loss_clip": 1.14508986, + "balance_loss_mlp": 0.2122006, + "epoch": 0.41184428077558993, + "flos": 34313938444800.0, + "grad_norm": 34.15032277529469, + "language_loss": 0.82197285, + "learning_rate": 2.6557639889641783e-06, + "loss": 0.83822668, + "num_input_tokens_seen": 147086075, + "router_z_loss_clip": 2.42578125, + "router_z_loss_mlp": 0.25524902, + "step": 6850, + "time_per_iteration": 2.771185874938965 + }, + { + "auxiliary_loss_clip": 0.01387592, + "auxiliary_loss_mlp": 0.00230334, + "balance_loss_clip": 1.14226627, + "balance_loss_mlp": 0.20535985, + "epoch": 0.41190440402825795, + "flos": 35444118948480.0, + "grad_norm": 172.4770081162339, + "language_loss": 0.73212814, + "learning_rate": 2.6553960448048025e-06, + "loss": 0.74830735, + "num_input_tokens_seen": 147107590, + "router_z_loss_clip": 2.453125, + "router_z_loss_mlp": 0.24975586, + "step": 6851, + "time_per_iteration": 2.7946598529815674 + }, + { + "auxiliary_loss_clip": 0.0139376, + "auxiliary_loss_mlp": 0.00247927, + "balance_loss_clip": 1.14550495, + "balance_loss_mlp": 0.22214177, + "epoch": 0.4119645272809259, + "flos": 20849879422080.0, + "grad_norm": 22.0826243554903, + "language_loss": 0.91643381, + "learning_rate": 2.655028075792743e-06, + "loss": 0.93285072, + "num_input_tokens_seen": 147123715, + "router_z_loss_clip": 2.48632812, + "router_z_loss_mlp": 0.2578125, + "step": 6852, + "time_per_iteration": 2.6828012466430664 + }, + { + "auxiliary_loss_clip": 0.01393613, + "auxiliary_loss_mlp": 0.00261798, + "balance_loss_clip": 1.14317441, + "balance_loss_mlp": 0.23360489, + "epoch": 0.4120246505335939, + "flos": 27562050201600.0, + "grad_norm": 83.78229117855604, + "language_loss": 0.86788905, + "learning_rate": 2.6546600819419537e-06, + "loss": 0.88444316, + "num_input_tokens_seen": 147144290, + "router_z_loss_clip": 2.50390625, + "router_z_loss_mlp": 0.28222656, + "step": 6853, + "time_per_iteration": 2.7457242012023926 + }, + { + "auxiliary_loss_clip": 0.01398907, + "auxiliary_loss_mlp": 0.00248002, + "balance_loss_clip": 1.14604843, + "balance_loss_mlp": 0.22021399, + "epoch": 0.41208477378626185, + "flos": 37815444046080.0, + "grad_norm": 194.8128496835626, + "language_loss": 0.74310017, + "learning_rate": 2.6542920632663883e-06, + "loss": 0.75956929, + "num_input_tokens_seen": 147166340, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.27807617, + "step": 6854, + "time_per_iteration": 2.7854151725769043 + }, + { + "auxiliary_loss_clip": 0.01376686, + "auxiliary_loss_mlp": 0.00235256, + "balance_loss_clip": 1.13032532, + "balance_loss_mlp": 0.2068364, + "epoch": 0.4121448970389298, + "flos": 23440762402560.0, + "grad_norm": 19.853306788143822, + "language_loss": 0.89654177, + "learning_rate": 2.6539240197800023e-06, + "loss": 0.91266119, + "num_input_tokens_seen": 147184025, + "router_z_loss_clip": 2.46679688, + "router_z_loss_mlp": 0.28417969, + "step": 6855, + "time_per_iteration": 2.697131872177124 + }, + { + "auxiliary_loss_clip": 0.01375473, + "auxiliary_loss_mlp": 0.00265195, + "balance_loss_clip": 1.12829232, + "balance_loss_mlp": 0.23786062, + "epoch": 0.4122050202915978, + "flos": 21325300859520.0, + "grad_norm": 2.502196563589322, + "language_loss": 0.84900296, + "learning_rate": 2.6535559514967517e-06, + "loss": 0.86540961, + "num_input_tokens_seen": 147202730, + "router_z_loss_clip": 2.47460938, + "router_z_loss_mlp": 0.27282715, + "step": 6856, + "time_per_iteration": 4.06578803062439 + }, + { + "auxiliary_loss_clip": 0.0138383, + "auxiliary_loss_mlp": 0.00244204, + "balance_loss_clip": 1.13554108, + "balance_loss_mlp": 0.21958686, + "epoch": 0.41226514354426574, + "flos": 17306286059520.0, + "grad_norm": 65.01585731279155, + "language_loss": 0.88163078, + "learning_rate": 2.6531878584305935e-06, + "loss": 0.89791107, + "num_input_tokens_seen": 147215315, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.24645996, + "step": 6857, + "time_per_iteration": 4.119785785675049 + }, + { + "auxiliary_loss_clip": 0.01375555, + "auxiliary_loss_mlp": 0.00252225, + "balance_loss_clip": 1.13053417, + "balance_loss_mlp": 0.2241874, + "epoch": 0.4123252667969337, + "flos": 17638855107840.0, + "grad_norm": 270.6058027295092, + "language_loss": 0.78699946, + "learning_rate": 2.6528197405954873e-06, + "loss": 0.80327725, + "num_input_tokens_seen": 147233330, + "router_z_loss_clip": 2.45117188, + "router_z_loss_mlp": 0.28027344, + "step": 6858, + "time_per_iteration": 2.6855404376983643 + }, + { + "auxiliary_loss_clip": 0.0137771, + "auxiliary_loss_mlp": 0.00242702, + "balance_loss_clip": 1.12928975, + "balance_loss_mlp": 0.2150453, + "epoch": 0.4123853900496017, + "flos": 46424811375360.0, + "grad_norm": 1400.1943083889316, + "language_loss": 0.6777904, + "learning_rate": 2.652451598005391e-06, + "loss": 0.69399446, + "num_input_tokens_seen": 147257780, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.27624512, + "step": 6859, + "time_per_iteration": 2.9023327827453613 + }, + { + "auxiliary_loss_clip": 0.01377419, + "auxiliary_loss_mlp": 0.00257916, + "balance_loss_clip": 1.12513161, + "balance_loss_mlp": 0.22940102, + "epoch": 0.41244551330226964, + "flos": 17675160779520.0, + "grad_norm": 33.10806642056471, + "language_loss": 0.83793586, + "learning_rate": 2.652083430674264e-06, + "loss": 0.85428917, + "num_input_tokens_seen": 147276055, + "router_z_loss_clip": 2.5234375, + "router_z_loss_mlp": 0.28540039, + "step": 6860, + "time_per_iteration": 4.051973581314087 + }, + { + "auxiliary_loss_clip": 0.01365001, + "auxiliary_loss_mlp": 0.00254451, + "balance_loss_clip": 1.11949801, + "balance_loss_mlp": 0.2269018, + "epoch": 0.4125056365549376, + "flos": 18693730748160.0, + "grad_norm": 70.49784343601358, + "language_loss": 0.79660231, + "learning_rate": 2.651715238616068e-06, + "loss": 0.81279683, + "num_input_tokens_seen": 147293200, + "router_z_loss_clip": 2.45703125, + "router_z_loss_mlp": 0.27539062, + "step": 6861, + "time_per_iteration": 2.6070330142974854 + }, + { + "auxiliary_loss_clip": 0.01369028, + "auxiliary_loss_mlp": 0.00236809, + "balance_loss_clip": 1.12726009, + "balance_loss_mlp": 0.21172784, + "epoch": 0.41256575980760557, + "flos": 17895293280000.0, + "grad_norm": 16.91097659552207, + "language_loss": 0.87084889, + "learning_rate": 2.651347021844765e-06, + "loss": 0.88690728, + "num_input_tokens_seen": 147310640, + "router_z_loss_clip": 2.41796875, + "router_z_loss_mlp": 0.25109863, + "step": 6862, + "time_per_iteration": 2.766947031021118 + }, + { + "auxiliary_loss_clip": 0.01376949, + "auxiliary_loss_mlp": 0.00256692, + "balance_loss_clip": 1.1300869, + "balance_loss_mlp": 0.23116903, + "epoch": 0.41262588306027354, + "flos": 21981316901760.0, + "grad_norm": 8.294575461556992, + "language_loss": 0.84003568, + "learning_rate": 2.650978780374318e-06, + "loss": 0.85637212, + "num_input_tokens_seen": 147329435, + "router_z_loss_clip": 2.47070312, + "router_z_loss_mlp": 0.25561523, + "step": 6863, + "time_per_iteration": 2.7042076587677 + }, + { + "auxiliary_loss_clip": 0.01330087, + "auxiliary_loss_mlp": 0.00133784, + "balance_loss_clip": 1.14601922, + "balance_loss_mlp": 0.1255822, + "epoch": 0.41268600631294156, + "flos": 53350006740480.0, + "grad_norm": 0.7140864231224561, + "language_loss": 0.52524936, + "learning_rate": 2.650610514218691e-06, + "loss": 0.53988808, + "num_input_tokens_seen": 147385805, + "router_z_loss_clip": 1.84375, + "router_z_loss_mlp": 0.08203125, + "step": 6864, + "time_per_iteration": 3.1213865280151367 + }, + { + "auxiliary_loss_clip": 0.01374198, + "auxiliary_loss_mlp": 0.00267969, + "balance_loss_clip": 1.1244235, + "balance_loss_mlp": 0.23921563, + "epoch": 0.4127461295656095, + "flos": 24385356311040.0, + "grad_norm": 61.405046451703825, + "language_loss": 0.79265809, + "learning_rate": 2.6502422233918468e-06, + "loss": 0.80907977, + "num_input_tokens_seen": 147405160, + "router_z_loss_clip": 2.49414062, + "router_z_loss_mlp": 0.28759766, + "step": 6865, + "time_per_iteration": 2.7147769927978516 + }, + { + "auxiliary_loss_clip": 0.01325251, + "auxiliary_loss_mlp": 0.00164871, + "balance_loss_clip": 1.13663197, + "balance_loss_mlp": 0.15619248, + "epoch": 0.4128062528182775, + "flos": 71705242696320.0, + "grad_norm": 0.9063399141356253, + "language_loss": 0.65737087, + "learning_rate": 2.649873907907753e-06, + "loss": 0.67227209, + "num_input_tokens_seen": 147460245, + "router_z_loss_clip": 1.890625, + "router_z_loss_mlp": 0.08691406, + "step": 6866, + "time_per_iteration": 4.487993478775024 + }, + { + "auxiliary_loss_clip": 0.01365727, + "auxiliary_loss_mlp": 0.00262338, + "balance_loss_clip": 1.11700201, + "balance_loss_mlp": 0.23439582, + "epoch": 0.41286637607094545, + "flos": 17849111368320.0, + "grad_norm": 21.742137038483026, + "language_loss": 0.90596306, + "learning_rate": 2.649505567780375e-06, + "loss": 0.92224371, + "num_input_tokens_seen": 147476200, + "router_z_loss_clip": 2.48828125, + "router_z_loss_mlp": 0.27941895, + "step": 6867, + "time_per_iteration": 2.7160019874572754 + }, + { + "auxiliary_loss_clip": 0.01369456, + "auxiliary_loss_mlp": 0.00255284, + "balance_loss_clip": 1.11826217, + "balance_loss_mlp": 0.22858125, + "epoch": 0.4129264993236134, + "flos": 25549544016000.0, + "grad_norm": 6.9580949431459915, + "language_loss": 0.86797035, + "learning_rate": 2.6491372030236815e-06, + "loss": 0.88421774, + "num_input_tokens_seen": 147494315, + "router_z_loss_clip": 2.51171875, + "router_z_loss_mlp": 0.26696777, + "step": 6868, + "time_per_iteration": 2.743624210357666 + }, + { + "auxiliary_loss_clip": 0.01316827, + "auxiliary_loss_mlp": 0.00079728, + "balance_loss_clip": 1.14182758, + "balance_loss_mlp": 0.07381497, + "epoch": 0.4129866225762814, + "flos": 65414446364160.0, + "grad_norm": 0.8663249964161274, + "language_loss": 0.57348049, + "learning_rate": 2.64876881365164e-06, + "loss": 0.58744597, + "num_input_tokens_seen": 147543665, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.05908203, + "step": 6869, + "time_per_iteration": 2.9005041122436523 + }, + { + "auxiliary_loss_clip": 0.01364384, + "auxiliary_loss_mlp": 0.0027761, + "balance_loss_clip": 1.11902881, + "balance_loss_mlp": 0.25009698, + "epoch": 0.41304674582894935, + "flos": 28876991287680.0, + "grad_norm": 11.427010929111855, + "language_loss": 0.81383312, + "learning_rate": 2.64840039967822e-06, + "loss": 0.83025312, + "num_input_tokens_seen": 147564870, + "router_z_loss_clip": 2.44921875, + "router_z_loss_mlp": 0.27526855, + "step": 6870, + "time_per_iteration": 2.6914758682250977 + }, + { + "auxiliary_loss_clip": 0.01363545, + "auxiliary_loss_mlp": 0.00263336, + "balance_loss_clip": 1.11613846, + "balance_loss_mlp": 0.23434404, + "epoch": 0.4131068690816173, + "flos": 22891975436160.0, + "grad_norm": 121.48837658952212, + "language_loss": 0.88649124, + "learning_rate": 2.6480319611173912e-06, + "loss": 0.90276003, + "num_input_tokens_seen": 147584840, + "router_z_loss_clip": 2.47070312, + "router_z_loss_mlp": 0.29003906, + "step": 6871, + "time_per_iteration": 2.67029070854187 + }, + { + "auxiliary_loss_clip": 0.01375895, + "auxiliary_loss_mlp": 0.00246737, + "balance_loss_clip": 1.12967157, + "balance_loss_mlp": 0.22052297, + "epoch": 0.4131669923342853, + "flos": 26065185707520.0, + "grad_norm": 4991.675422938932, + "language_loss": 0.76976281, + "learning_rate": 2.6476634979831263e-06, + "loss": 0.78598917, + "num_input_tokens_seen": 147604635, + "router_z_loss_clip": 2.4609375, + "router_z_loss_mlp": 0.26220703, + "step": 6872, + "time_per_iteration": 2.6754255294799805 + }, + { + "auxiliary_loss_clip": 0.01360708, + "auxiliary_loss_mlp": 0.002428, + "balance_loss_clip": 1.11754704, + "balance_loss_mlp": 0.21399873, + "epoch": 0.41322711558695324, + "flos": 19244564789760.0, + "grad_norm": 10.066074504951425, + "language_loss": 0.85081756, + "learning_rate": 2.6472950102893964e-06, + "loss": 0.86685264, + "num_input_tokens_seen": 147620700, + "router_z_loss_clip": 2.4296875, + "router_z_loss_mlp": 0.28820801, + "step": 6873, + "time_per_iteration": 2.681732416152954 + }, + { + "auxiliary_loss_clip": 0.01375686, + "auxiliary_loss_mlp": 0.00274958, + "balance_loss_clip": 1.12345064, + "balance_loss_mlp": 0.24341479, + "epoch": 0.4132872388396212, + "flos": 22674464628480.0, + "grad_norm": 249.3106096018696, + "language_loss": 0.90470809, + "learning_rate": 2.6469264980501746e-06, + "loss": 0.92121452, + "num_input_tokens_seen": 147639490, + "router_z_loss_clip": 2.51953125, + "router_z_loss_mlp": 0.31567383, + "step": 6874, + "time_per_iteration": 2.6835896968841553 + }, + { + "auxiliary_loss_clip": 0.01372468, + "auxiliary_loss_mlp": 0.00276642, + "balance_loss_clip": 1.1265223, + "balance_loss_mlp": 0.24813868, + "epoch": 0.4133473620922892, + "flos": 20150195420160.0, + "grad_norm": 16.55573828341059, + "language_loss": 0.79203951, + "learning_rate": 2.646557961279436e-06, + "loss": 0.80853057, + "num_input_tokens_seen": 147657205, + "router_z_loss_clip": 2.45703125, + "router_z_loss_mlp": 0.28503418, + "step": 6875, + "time_per_iteration": 2.6380765438079834 + }, + { + "auxiliary_loss_clip": 0.01365801, + "auxiliary_loss_mlp": 0.00255205, + "balance_loss_clip": 1.12616348, + "balance_loss_mlp": 0.22975381, + "epoch": 0.41340748534495714, + "flos": 24242755317120.0, + "grad_norm": 3.895502565426921, + "language_loss": 0.86526716, + "learning_rate": 2.646189399991154e-06, + "loss": 0.88147724, + "num_input_tokens_seen": 147677005, + "router_z_loss_clip": 2.39453125, + "router_z_loss_mlp": 0.2545166, + "step": 6876, + "time_per_iteration": 2.6835334300994873 + }, + { + "auxiliary_loss_clip": 0.01377222, + "auxiliary_loss_mlp": 0.00299211, + "balance_loss_clip": 1.12901592, + "balance_loss_mlp": 0.26931328, + "epoch": 0.41346760859762516, + "flos": 14392171566720.0, + "grad_norm": 7.756798175932521, + "language_loss": 0.7719788, + "learning_rate": 2.6458208141993048e-06, + "loss": 0.78874314, + "num_input_tokens_seen": 147693435, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.29870605, + "step": 6877, + "time_per_iteration": 2.6251327991485596 + }, + { + "auxiliary_loss_clip": 0.0137954, + "auxiliary_loss_mlp": 0.00262634, + "balance_loss_clip": 1.13552129, + "balance_loss_mlp": 0.23620489, + "epoch": 0.4135277318502931, + "flos": 22492002516480.0, + "grad_norm": 8.781838527347082, + "language_loss": 0.83666706, + "learning_rate": 2.6454522039178668e-06, + "loss": 0.85308874, + "num_input_tokens_seen": 147714000, + "router_z_loss_clip": 2.44140625, + "router_z_loss_mlp": 0.26452637, + "step": 6878, + "time_per_iteration": 2.6512234210968018 + }, + { + "auxiliary_loss_clip": 0.01373161, + "auxiliary_loss_mlp": 0.00265212, + "balance_loss_clip": 1.12665224, + "balance_loss_mlp": 0.23787682, + "epoch": 0.4135878551029611, + "flos": 22418744728320.0, + "grad_norm": 10.180778754940595, + "language_loss": 0.88431752, + "learning_rate": 2.6450835691608154e-06, + "loss": 0.90070128, + "num_input_tokens_seen": 147731010, + "router_z_loss_clip": 2.46289062, + "router_z_loss_mlp": 0.27294922, + "step": 6879, + "time_per_iteration": 2.738079071044922 + }, + { + "auxiliary_loss_clip": 0.01382991, + "auxiliary_loss_mlp": 0.00270991, + "balance_loss_clip": 1.1371423, + "balance_loss_mlp": 0.24295273, + "epoch": 0.41364797835562905, + "flos": 27053232094080.0, + "grad_norm": 322.00457011732294, + "language_loss": 0.91000116, + "learning_rate": 2.6447149099421315e-06, + "loss": 0.92654097, + "num_input_tokens_seen": 147750880, + "router_z_loss_clip": 2.45898438, + "router_z_loss_mlp": 0.28063965, + "step": 6880, + "time_per_iteration": 2.73940110206604 + }, + { + "auxiliary_loss_clip": 0.01389934, + "auxiliary_loss_mlp": 0.00292547, + "balance_loss_clip": 1.14084363, + "balance_loss_mlp": 0.26307893, + "epoch": 0.413708101608297, + "flos": 22967603521920.0, + "grad_norm": 3.2722765019582885, + "language_loss": 0.77203977, + "learning_rate": 2.6443462262757927e-06, + "loss": 0.78886461, + "num_input_tokens_seen": 147771360, + "router_z_loss_clip": 2.49414062, + "router_z_loss_mlp": 0.29467773, + "step": 6881, + "time_per_iteration": 2.689289093017578 + }, + { + "auxiliary_loss_clip": 0.01384324, + "auxiliary_loss_mlp": 0.0029237, + "balance_loss_clip": 1.1444329, + "balance_loss_mlp": 0.26472518, + "epoch": 0.413768224860965, + "flos": 13333991875200.0, + "grad_norm": 6.447575893323677, + "language_loss": 0.87007403, + "learning_rate": 2.6439775181757805e-06, + "loss": 0.88684094, + "num_input_tokens_seen": 147787440, + "router_z_loss_clip": 2.3984375, + "router_z_loss_mlp": 0.27661133, + "step": 6882, + "time_per_iteration": 2.7379395961761475 + }, + { + "auxiliary_loss_clip": 0.01396516, + "auxiliary_loss_mlp": 0.00299916, + "balance_loss_clip": 1.14301074, + "balance_loss_mlp": 0.26746699, + "epoch": 0.41382834811363295, + "flos": 20813968800000.0, + "grad_norm": 57.01543474718828, + "language_loss": 0.81385452, + "learning_rate": 2.643608785656077e-06, + "loss": 0.83081883, + "num_input_tokens_seen": 147805720, + "router_z_loss_clip": 2.53320312, + "router_z_loss_mlp": 0.32446289, + "step": 6883, + "time_per_iteration": 2.705965757369995 + }, + { + "auxiliary_loss_clip": 0.01386805, + "auxiliary_loss_mlp": 0.00303944, + "balance_loss_clip": 1.13799429, + "balance_loss_mlp": 0.27337891, + "epoch": 0.4138884713663009, + "flos": 20667130001280.0, + "grad_norm": 16.640973123443906, + "language_loss": 0.80755293, + "learning_rate": 2.643240028730663e-06, + "loss": 0.82446039, + "num_input_tokens_seen": 147824605, + "router_z_loss_clip": 2.48632812, + "router_z_loss_mlp": 0.3059082, + "step": 6884, + "time_per_iteration": 2.763960123062134 + }, + { + "auxiliary_loss_clip": 0.01387997, + "auxiliary_loss_mlp": 0.0030527, + "balance_loss_clip": 1.13740361, + "balance_loss_mlp": 0.27694577, + "epoch": 0.4139485946189689, + "flos": 29056616225280.0, + "grad_norm": 10.374167131057625, + "language_loss": 0.80864847, + "learning_rate": 2.642871247413523e-06, + "loss": 0.82558113, + "num_input_tokens_seen": 147845445, + "router_z_loss_clip": 2.50585938, + "router_z_loss_mlp": 0.28295898, + "step": 6885, + "time_per_iteration": 2.720151662826538 + }, + { + "auxiliary_loss_clip": 0.01391806, + "auxiliary_loss_mlp": 0.00293357, + "balance_loss_clip": 1.14549804, + "balance_loss_mlp": 0.26434177, + "epoch": 0.41400871787163684, + "flos": 24425720219520.0, + "grad_norm": 123.72668436118754, + "language_loss": 0.76831591, + "learning_rate": 2.6425024417186414e-06, + "loss": 0.78516752, + "num_input_tokens_seen": 147865580, + "router_z_loss_clip": 2.46484375, + "router_z_loss_mlp": 0.2902832, + "step": 6886, + "time_per_iteration": 2.7026567459106445 + }, + { + "auxiliary_loss_clip": 0.01400741, + "auxiliary_loss_mlp": 0.00328908, + "balance_loss_clip": 1.15302086, + "balance_loss_mlp": 0.29933217, + "epoch": 0.4140688411243048, + "flos": 19464050845440.0, + "grad_norm": 16.52039714209627, + "language_loss": 0.82146859, + "learning_rate": 2.642133611660002e-06, + "loss": 0.83876514, + "num_input_tokens_seen": 147885230, + "router_z_loss_clip": 2.47460938, + "router_z_loss_mlp": 0.29553223, + "step": 6887, + "time_per_iteration": 2.67100191116333 + }, + { + "auxiliary_loss_clip": 0.01387094, + "auxiliary_loss_mlp": 0.00281285, + "balance_loss_clip": 1.14418244, + "balance_loss_mlp": 0.25400996, + "epoch": 0.4141289643769728, + "flos": 19313656600320.0, + "grad_norm": 4.175942878105828, + "language_loss": 0.78027201, + "learning_rate": 2.641764757251592e-06, + "loss": 0.7969557, + "num_input_tokens_seen": 147903035, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.27307129, + "step": 6888, + "time_per_iteration": 2.6471431255340576 + }, + { + "auxiliary_loss_clip": 0.01395771, + "auxiliary_loss_mlp": 0.00307472, + "balance_loss_clip": 1.14742494, + "balance_loss_mlp": 0.27421248, + "epoch": 0.41418908762964074, + "flos": 16726903683840.0, + "grad_norm": 15.773086233838354, + "language_loss": 0.81180644, + "learning_rate": 2.6413958785073976e-06, + "loss": 0.82883894, + "num_input_tokens_seen": 147918745, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.33276367, + "step": 6889, + "time_per_iteration": 2.6389358043670654 + }, + { + "auxiliary_loss_clip": 0.0140254, + "auxiliary_loss_mlp": 0.00295556, + "balance_loss_clip": 1.15922737, + "balance_loss_mlp": 0.26736349, + "epoch": 0.41424921088230876, + "flos": 25296840858240.0, + "grad_norm": 90.5282382617966, + "language_loss": 0.84150988, + "learning_rate": 2.6410269754414074e-06, + "loss": 0.85849082, + "num_input_tokens_seen": 147938265, + "router_z_loss_clip": 2.43554688, + "router_z_loss_mlp": 0.28186035, + "step": 6890, + "time_per_iteration": 2.730959415435791 + }, + { + "auxiliary_loss_clip": 0.01409133, + "auxiliary_loss_mlp": 0.00315238, + "balance_loss_clip": 1.16443098, + "balance_loss_mlp": 0.28476763, + "epoch": 0.4143093341349767, + "flos": 20960520289920.0, + "grad_norm": 7.231242666209297, + "language_loss": 0.82899022, + "learning_rate": 2.6406580480676113e-06, + "loss": 0.8462339, + "num_input_tokens_seen": 147957320, + "router_z_loss_clip": 2.44726562, + "router_z_loss_mlp": 0.30480957, + "step": 6891, + "time_per_iteration": 2.6533048152923584 + }, + { + "auxiliary_loss_clip": 0.01417454, + "auxiliary_loss_mlp": 0.0033727, + "balance_loss_clip": 1.16275692, + "balance_loss_mlp": 0.30186486, + "epoch": 0.4143694573876447, + "flos": 22017694400640.0, + "grad_norm": 57.03982148594572, + "language_loss": 0.91406876, + "learning_rate": 2.6402890963999963e-06, + "loss": 0.93161595, + "num_input_tokens_seen": 147977045, + "router_z_loss_clip": 2.546875, + "router_z_loss_mlp": 0.35424805, + "step": 6892, + "time_per_iteration": 2.830307722091675 + }, + { + "auxiliary_loss_clip": 0.01412734, + "auxiliary_loss_mlp": 0.0031509, + "balance_loss_clip": 1.16872597, + "balance_loss_mlp": 0.28555, + "epoch": 0.41442958064031266, + "flos": 35697396723840.0, + "grad_norm": 4.3836858450997385, + "language_loss": 0.75051278, + "learning_rate": 2.6399201204525554e-06, + "loss": 0.76779097, + "num_input_tokens_seen": 147996905, + "router_z_loss_clip": 2.44140625, + "router_z_loss_mlp": 0.29528809, + "step": 6893, + "time_per_iteration": 2.8227481842041016 + }, + { + "auxiliary_loss_clip": 0.01410084, + "auxiliary_loss_mlp": 0.00271688, + "balance_loss_clip": 1.16535878, + "balance_loss_mlp": 0.24455574, + "epoch": 0.4144897038929806, + "flos": 28293766156800.0, + "grad_norm": 7.945131252390588, + "language_loss": 0.79575306, + "learning_rate": 2.639551120239279e-06, + "loss": 0.81257081, + "num_input_tokens_seen": 148017875, + "router_z_loss_clip": 2.44335938, + "router_z_loss_mlp": 0.27111816, + "step": 6894, + "time_per_iteration": 2.765350818634033 + }, + { + "auxiliary_loss_clip": 0.01409427, + "auxiliary_loss_mlp": 0.00298748, + "balance_loss_clip": 1.16187751, + "balance_loss_mlp": 0.27004203, + "epoch": 0.4145498271456486, + "flos": 11648093080320.0, + "grad_norm": 47.98093130562892, + "language_loss": 0.73765868, + "learning_rate": 2.63918209577416e-06, + "loss": 0.75474042, + "num_input_tokens_seen": 148032300, + "router_z_loss_clip": 2.47265625, + "router_z_loss_mlp": 0.2869873, + "step": 6895, + "time_per_iteration": 2.6956286430358887 + }, + { + "auxiliary_loss_clip": 0.01425207, + "auxiliary_loss_mlp": 0.00284777, + "balance_loss_clip": 1.1766603, + "balance_loss_mlp": 0.25601187, + "epoch": 0.41460995039831655, + "flos": 27235622378880.0, + "grad_norm": 35.05907851796279, + "language_loss": 0.77858269, + "learning_rate": 2.638813047071192e-06, + "loss": 0.79568255, + "num_input_tokens_seen": 148053260, + "router_z_loss_clip": 2.48632812, + "router_z_loss_mlp": 0.28759766, + "step": 6896, + "time_per_iteration": 2.706022262573242 + }, + { + "auxiliary_loss_clip": 0.0141703, + "auxiliary_loss_mlp": 0.00288178, + "balance_loss_clip": 1.16847444, + "balance_loss_mlp": 0.25844732, + "epoch": 0.4146700736509845, + "flos": 25922369232000.0, + "grad_norm": 89.9941491827171, + "language_loss": 0.8050226, + "learning_rate": 2.6384439741443696e-06, + "loss": 0.82207477, + "num_input_tokens_seen": 148072965, + "router_z_loss_clip": 2.48828125, + "router_z_loss_mlp": 0.29736328, + "step": 6897, + "time_per_iteration": 2.7457377910614014 + }, + { + "auxiliary_loss_clip": 0.01418136, + "auxiliary_loss_mlp": 0.002664, + "balance_loss_clip": 1.16742969, + "balance_loss_mlp": 0.23788533, + "epoch": 0.4147301969036525, + "flos": 26833243248000.0, + "grad_norm": 16.399679196959884, + "language_loss": 0.89235574, + "learning_rate": 2.6380748770076873e-06, + "loss": 0.90920109, + "num_input_tokens_seen": 148093240, + "router_z_loss_clip": 2.50976562, + "router_z_loss_mlp": 0.28515625, + "step": 6898, + "time_per_iteration": 4.137167930603027 + }, + { + "auxiliary_loss_clip": 0.01419264, + "auxiliary_loss_mlp": 0.00301022, + "balance_loss_clip": 1.17082787, + "balance_loss_mlp": 0.27222151, + "epoch": 0.41479032015632045, + "flos": 20298291194880.0, + "grad_norm": 4.111884843117439, + "language_loss": 0.82652336, + "learning_rate": 2.6377057556751416e-06, + "loss": 0.84372622, + "num_input_tokens_seen": 148110925, + "router_z_loss_clip": 2.48046875, + "router_z_loss_mlp": 0.28808594, + "step": 6899, + "time_per_iteration": 2.627286911010742 + }, + { + "auxiliary_loss_clip": 0.01430901, + "auxiliary_loss_mlp": 0.00302707, + "balance_loss_clip": 1.17804313, + "balance_loss_mlp": 0.27290505, + "epoch": 0.4148504434089884, + "flos": 25264988472960.0, + "grad_norm": 7.717498037437445, + "language_loss": 0.84085858, + "learning_rate": 2.6373366101607306e-06, + "loss": 0.85819465, + "num_input_tokens_seen": 148130670, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.29833984, + "step": 6900, + "time_per_iteration": 4.130227565765381 + }, + { + "auxiliary_loss_clip": 0.0142158, + "auxiliary_loss_mlp": 0.00290893, + "balance_loss_clip": 1.17300439, + "balance_loss_mlp": 0.26070929, + "epoch": 0.4149105666616564, + "flos": 12822300679680.0, + "grad_norm": 9.539217033895834, + "language_loss": 0.89910436, + "learning_rate": 2.6369674404784503e-06, + "loss": 0.91622913, + "num_input_tokens_seen": 148148350, + "router_z_loss_clip": 2.48828125, + "router_z_loss_mlp": 0.30200195, + "step": 6901, + "time_per_iteration": 2.6954026222229004 + }, + { + "auxiliary_loss_clip": 0.01417709, + "auxiliary_loss_mlp": 0.00296057, + "balance_loss_clip": 1.17033899, + "balance_loss_mlp": 0.26898485, + "epoch": 0.41497068991432434, + "flos": 16763891713920.0, + "grad_norm": 86.0779626034437, + "language_loss": 0.76113397, + "learning_rate": 2.6365982466423014e-06, + "loss": 0.77827168, + "num_input_tokens_seen": 148167550, + "router_z_loss_clip": 2.47265625, + "router_z_loss_mlp": 0.27050781, + "step": 6902, + "time_per_iteration": 2.6913087368011475 + }, + { + "auxiliary_loss_clip": 0.01421343, + "auxiliary_loss_mlp": 0.00319199, + "balance_loss_clip": 1.17376876, + "balance_loss_mlp": 0.2895399, + "epoch": 0.4150308131669923, + "flos": 18000906243840.0, + "grad_norm": 9.974795896836959, + "language_loss": 0.89724463, + "learning_rate": 2.6362290286662834e-06, + "loss": 0.91465002, + "num_input_tokens_seen": 148184740, + "router_z_loss_clip": 2.47265625, + "router_z_loss_mlp": 0.29663086, + "step": 6903, + "time_per_iteration": 4.060417175292969 + }, + { + "auxiliary_loss_clip": 0.01430007, + "auxiliary_loss_mlp": 0.00334914, + "balance_loss_clip": 1.18031871, + "balance_loss_mlp": 0.30325222, + "epoch": 0.41509093641966033, + "flos": 30044770352640.0, + "grad_norm": 29.867990266370757, + "language_loss": 0.76617265, + "learning_rate": 2.6358597865643968e-06, + "loss": 0.78382194, + "num_input_tokens_seen": 148204605, + "router_z_loss_clip": 2.49414062, + "router_z_loss_mlp": 0.31665039, + "step": 6904, + "time_per_iteration": 2.7394914627075195 + }, + { + "auxiliary_loss_clip": 0.01419672, + "auxiliary_loss_mlp": 0.00308027, + "balance_loss_clip": 1.16605687, + "balance_loss_mlp": 0.2774142, + "epoch": 0.4151510596723283, + "flos": 24279994742400.0, + "grad_norm": 56.50463090246851, + "language_loss": 0.8424046, + "learning_rate": 2.635490520350643e-06, + "loss": 0.85968155, + "num_input_tokens_seen": 148224675, + "router_z_loss_clip": 2.53710938, + "router_z_loss_mlp": 0.30615234, + "step": 6905, + "time_per_iteration": 2.7467727661132812 + }, + { + "auxiliary_loss_clip": 0.01424329, + "auxiliary_loss_mlp": 0.00306072, + "balance_loss_clip": 1.17144489, + "balance_loss_mlp": 0.27588773, + "epoch": 0.41521118292499626, + "flos": 23476206147840.0, + "grad_norm": 6.4955057855308915, + "language_loss": 0.75759661, + "learning_rate": 2.635121230039025e-06, + "loss": 0.77490056, + "num_input_tokens_seen": 148243375, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.30175781, + "step": 6906, + "time_per_iteration": 2.7187771797180176 + }, + { + "auxiliary_loss_clip": 0.01413975, + "auxiliary_loss_mlp": 0.00309773, + "balance_loss_clip": 1.16665006, + "balance_loss_mlp": 0.2817831, + "epoch": 0.4152713061776642, + "flos": 22125498094080.0, + "grad_norm": 19.465344134684504, + "language_loss": 0.76674199, + "learning_rate": 2.6347519156435467e-06, + "loss": 0.78397954, + "num_input_tokens_seen": 148261140, + "router_z_loss_clip": 2.47070312, + "router_z_loss_mlp": 0.27966309, + "step": 6907, + "time_per_iteration": 2.6816256046295166 + }, + { + "auxiliary_loss_clip": 0.01413873, + "auxiliary_loss_mlp": 0.00294103, + "balance_loss_clip": 1.16547084, + "balance_loss_mlp": 0.26483703, + "epoch": 0.4153314294303322, + "flos": 21251396626560.0, + "grad_norm": 5.835388374345328, + "language_loss": 0.84120166, + "learning_rate": 2.6343825771782123e-06, + "loss": 0.85828149, + "num_input_tokens_seen": 148279655, + "router_z_loss_clip": 2.48242188, + "router_z_loss_mlp": 0.29260254, + "step": 6908, + "time_per_iteration": 2.680659294128418 + }, + { + "auxiliary_loss_clip": 0.01480656, + "auxiliary_loss_mlp": 0.00078762, + "balance_loss_clip": 1.27692783, + "balance_loss_mlp": 0.07284941, + "epoch": 0.41539155268300015, + "flos": 57920681594880.0, + "grad_norm": 0.8208912612936905, + "language_loss": 0.64511979, + "learning_rate": 2.634013214657026e-06, + "loss": 0.66071403, + "num_input_tokens_seen": 148339005, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.05908203, + "step": 6909, + "time_per_iteration": 4.5079309940338135 + }, + { + "auxiliary_loss_clip": 0.01404949, + "auxiliary_loss_mlp": 0.002628, + "balance_loss_clip": 1.15990829, + "balance_loss_mlp": 0.23650268, + "epoch": 0.4154516759356681, + "flos": 21903677654400.0, + "grad_norm": 407.25151211267945, + "language_loss": 0.92874128, + "learning_rate": 2.633643828093996e-06, + "loss": 0.94541872, + "num_input_tokens_seen": 148358715, + "router_z_loss_clip": 2.45117188, + "router_z_loss_mlp": 0.26269531, + "step": 6910, + "time_per_iteration": 2.665701389312744 + }, + { + "auxiliary_loss_clip": 0.01475224, + "auxiliary_loss_mlp": 0.00094293, + "balance_loss_clip": 1.27256799, + "balance_loss_mlp": 0.08776008, + "epoch": 0.4155117991883361, + "flos": 67833677226240.0, + "grad_norm": 0.8206379664202867, + "language_loss": 0.61926419, + "learning_rate": 2.633274417503128e-06, + "loss": 0.6349594, + "num_input_tokens_seen": 148417280, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.06542969, + "step": 6911, + "time_per_iteration": 3.1090869903564453 + }, + { + "auxiliary_loss_clip": 0.01427063, + "auxiliary_loss_mlp": 0.00304637, + "balance_loss_clip": 1.17120159, + "balance_loss_mlp": 0.27423894, + "epoch": 0.41557192244100405, + "flos": 14282679934080.0, + "grad_norm": 160.56101352808557, + "language_loss": 0.95819759, + "learning_rate": 2.6329049828984312e-06, + "loss": 0.97551465, + "num_input_tokens_seen": 148432610, + "router_z_loss_clip": 2.56054688, + "router_z_loss_mlp": 0.30419922, + "step": 6912, + "time_per_iteration": 2.5963642597198486 + }, + { + "auxiliary_loss_clip": 0.01396684, + "auxiliary_loss_mlp": 0.00279699, + "balance_loss_clip": 1.14974916, + "balance_loss_mlp": 0.25312755, + "epoch": 0.415632045693672, + "flos": 24461954064000.0, + "grad_norm": 16.481301082363263, + "language_loss": 0.72058785, + "learning_rate": 2.632535524293914e-06, + "loss": 0.73735166, + "num_input_tokens_seen": 148451510, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.26574707, + "step": 6913, + "time_per_iteration": 2.6894800662994385 + }, + { + "auxiliary_loss_clip": 0.01406694, + "auxiliary_loss_mlp": 0.00280085, + "balance_loss_clip": 1.16217899, + "balance_loss_mlp": 0.25342953, + "epoch": 0.41569216894634, + "flos": 20115290378880.0, + "grad_norm": 57.609545133359646, + "language_loss": 0.82184291, + "learning_rate": 2.632166041703586e-06, + "loss": 0.83871067, + "num_input_tokens_seen": 148469945, + "router_z_loss_clip": 2.44140625, + "router_z_loss_mlp": 0.26660156, + "step": 6914, + "time_per_iteration": 2.6344432830810547 + }, + { + "auxiliary_loss_clip": 0.01399325, + "auxiliary_loss_mlp": 0.00310993, + "balance_loss_clip": 1.15152073, + "balance_loss_mlp": 0.28104812, + "epoch": 0.41575229219900794, + "flos": 23798827128960.0, + "grad_norm": 5.203095873600688, + "language_loss": 0.91901076, + "learning_rate": 2.631796535141458e-06, + "loss": 0.93611395, + "num_input_tokens_seen": 148486655, + "router_z_loss_clip": 2.4765625, + "router_z_loss_mlp": 0.29968262, + "step": 6915, + "time_per_iteration": 2.6857364177703857 + }, + { + "auxiliary_loss_clip": 0.01401701, + "auxiliary_loss_mlp": 0.00283399, + "balance_loss_clip": 1.15303254, + "balance_loss_mlp": 0.25638592, + "epoch": 0.4158124154516759, + "flos": 23108229267840.0, + "grad_norm": 4.199973616042434, + "language_loss": 0.79899567, + "learning_rate": 2.6314270046215426e-06, + "loss": 0.81584668, + "num_input_tokens_seen": 148505035, + "router_z_loss_clip": 2.48828125, + "router_z_loss_mlp": 0.27038574, + "step": 6916, + "time_per_iteration": 2.6822328567504883 + }, + { + "auxiliary_loss_clip": 0.01414249, + "auxiliary_loss_mlp": 0.00265951, + "balance_loss_clip": 1.1622622, + "balance_loss_mlp": 0.23971298, + "epoch": 0.41587253870434393, + "flos": 24242970798720.0, + "grad_norm": 48.329791837155696, + "language_loss": 0.7876575, + "learning_rate": 2.631057450157852e-06, + "loss": 0.80445951, + "num_input_tokens_seen": 148525575, + "router_z_loss_clip": 2.51953125, + "router_z_loss_mlp": 0.26269531, + "step": 6917, + "time_per_iteration": 2.6766164302825928 + }, + { + "auxiliary_loss_clip": 0.01400876, + "auxiliary_loss_mlp": 0.00273781, + "balance_loss_clip": 1.15534019, + "balance_loss_mlp": 0.24608861, + "epoch": 0.4159326619570119, + "flos": 23881602021120.0, + "grad_norm": 21.195004444646862, + "language_loss": 0.85482609, + "learning_rate": 2.6306878717643988e-06, + "loss": 0.87157261, + "num_input_tokens_seen": 148547270, + "router_z_loss_clip": 2.453125, + "router_z_loss_mlp": 0.27661133, + "step": 6918, + "time_per_iteration": 2.7356514930725098 + }, + { + "auxiliary_loss_clip": 0.01391594, + "auxiliary_loss_mlp": 0.00264522, + "balance_loss_clip": 1.14303803, + "balance_loss_mlp": 0.23643655, + "epoch": 0.41599278520967986, + "flos": 40626531354240.0, + "grad_norm": 4756.340074586925, + "language_loss": 0.75695086, + "learning_rate": 2.6303182694551995e-06, + "loss": 0.77351201, + "num_input_tokens_seen": 148572100, + "router_z_loss_clip": 2.48632812, + "router_z_loss_mlp": 0.28112793, + "step": 6919, + "time_per_iteration": 2.8362205028533936 + }, + { + "auxiliary_loss_clip": 0.01394121, + "auxiliary_loss_mlp": 0.0029232, + "balance_loss_clip": 1.14736223, + "balance_loss_mlp": 0.26385325, + "epoch": 0.4160529084623478, + "flos": 18222942165120.0, + "grad_norm": 12.61788047787155, + "language_loss": 0.90149683, + "learning_rate": 2.6299486432442677e-06, + "loss": 0.91836131, + "num_input_tokens_seen": 148591245, + "router_z_loss_clip": 2.46679688, + "router_z_loss_mlp": 0.28442383, + "step": 6920, + "time_per_iteration": 2.624249219894409 + }, + { + "auxiliary_loss_clip": 0.01392386, + "auxiliary_loss_mlp": 0.00288514, + "balance_loss_clip": 1.14725995, + "balance_loss_mlp": 0.25968921, + "epoch": 0.4161130317150158, + "flos": 13661963982720.0, + "grad_norm": 298.5545293473413, + "language_loss": 0.75963706, + "learning_rate": 2.6295789931456195e-06, + "loss": 0.77644604, + "num_input_tokens_seen": 148607980, + "router_z_loss_clip": 2.45117188, + "router_z_loss_mlp": 0.28808594, + "step": 6921, + "time_per_iteration": 2.638244390487671 + }, + { + "auxiliary_loss_clip": 0.01396201, + "auxiliary_loss_mlp": 0.002745, + "balance_loss_clip": 1.15310454, + "balance_loss_mlp": 0.24639031, + "epoch": 0.41617315496768376, + "flos": 16178511767040.0, + "grad_norm": 32.51561619184545, + "language_loss": 0.87114376, + "learning_rate": 2.629209319173274e-06, + "loss": 0.88785076, + "num_input_tokens_seen": 148624490, + "router_z_loss_clip": 2.43359375, + "router_z_loss_mlp": 0.28112793, + "step": 6922, + "time_per_iteration": 2.668532371520996 + }, + { + "auxiliary_loss_clip": 0.01400967, + "auxiliary_loss_mlp": 0.00284341, + "balance_loss_clip": 1.15360737, + "balance_loss_mlp": 0.25706571, + "epoch": 0.4162332782203517, + "flos": 26213317395840.0, + "grad_norm": 20.136738606444762, + "language_loss": 0.72727156, + "learning_rate": 2.628839621341247e-06, + "loss": 0.74412465, + "num_input_tokens_seen": 148646490, + "router_z_loss_clip": 2.4765625, + "router_z_loss_mlp": 0.27270508, + "step": 6923, + "time_per_iteration": 2.710132360458374 + }, + { + "auxiliary_loss_clip": 0.0140425, + "auxiliary_loss_mlp": 0.00282596, + "balance_loss_clip": 1.15486956, + "balance_loss_mlp": 0.25369972, + "epoch": 0.4162934014730197, + "flos": 28183987215360.0, + "grad_norm": 15.36921349991564, + "language_loss": 0.83700281, + "learning_rate": 2.6284698996635593e-06, + "loss": 0.85387135, + "num_input_tokens_seen": 148668580, + "router_z_loss_clip": 2.4921875, + "router_z_loss_mlp": 0.28881836, + "step": 6924, + "time_per_iteration": 2.735969066619873 + }, + { + "auxiliary_loss_clip": 0.01382983, + "auxiliary_loss_mlp": 0.00286095, + "balance_loss_clip": 1.13617921, + "balance_loss_mlp": 0.25844994, + "epoch": 0.41635352472568765, + "flos": 19865316654720.0, + "grad_norm": 6.09792317195744, + "language_loss": 0.81456631, + "learning_rate": 2.62810015415423e-06, + "loss": 0.8312571, + "num_input_tokens_seen": 148688410, + "router_z_loss_clip": 2.46484375, + "router_z_loss_mlp": 0.27661133, + "step": 6925, + "time_per_iteration": 2.6510465145111084 + }, + { + "auxiliary_loss_clip": 0.01381665, + "auxiliary_loss_mlp": 0.00276589, + "balance_loss_clip": 1.13158774, + "balance_loss_mlp": 0.24838394, + "epoch": 0.4164136479783556, + "flos": 14935356011520.0, + "grad_norm": 25.457186437008172, + "language_loss": 0.89619493, + "learning_rate": 2.6277303848272792e-06, + "loss": 0.91277748, + "num_input_tokens_seen": 148704855, + "router_z_loss_clip": 2.49804688, + "router_z_loss_mlp": 0.28234863, + "step": 6926, + "time_per_iteration": 2.634174108505249 + }, + { + "auxiliary_loss_clip": 0.01381164, + "auxiliary_loss_mlp": 0.00289522, + "balance_loss_clip": 1.13645625, + "balance_loss_mlp": 0.2618421, + "epoch": 0.4164737712310236, + "flos": 21757593041280.0, + "grad_norm": 11.798890301882237, + "language_loss": 0.91444969, + "learning_rate": 2.6273605916967302e-06, + "loss": 0.93115658, + "num_input_tokens_seen": 148723065, + "router_z_loss_clip": 2.44335938, + "router_z_loss_mlp": 0.27697754, + "step": 6927, + "time_per_iteration": 2.678067207336426 + }, + { + "auxiliary_loss_clip": 0.01388844, + "auxiliary_loss_mlp": 0.00282082, + "balance_loss_clip": 1.14089894, + "balance_loss_mlp": 0.254807, + "epoch": 0.41653389448369155, + "flos": 20740136394240.0, + "grad_norm": 63.6932402658284, + "language_loss": 0.84077799, + "learning_rate": 2.626990774776604e-06, + "loss": 0.8574872, + "num_input_tokens_seen": 148741780, + "router_z_loss_clip": 2.48046875, + "router_z_loss_mlp": 0.27282715, + "step": 6928, + "time_per_iteration": 2.675041437149048 + }, + { + "auxiliary_loss_clip": 0.01395655, + "auxiliary_loss_mlp": 0.00277244, + "balance_loss_clip": 1.14684463, + "balance_loss_mlp": 0.24876487, + "epoch": 0.4165940177363595, + "flos": 24972891073920.0, + "grad_norm": 2.6898988410614484, + "language_loss": 0.85603917, + "learning_rate": 2.6266209340809254e-06, + "loss": 0.8727681, + "num_input_tokens_seen": 148759795, + "router_z_loss_clip": 2.49023438, + "router_z_loss_mlp": 0.28466797, + "step": 6929, + "time_per_iteration": 2.6775903701782227 + }, + { + "auxiliary_loss_clip": 0.0139315, + "auxiliary_loss_mlp": 0.00271433, + "balance_loss_clip": 1.14514947, + "balance_loss_mlp": 0.24439679, + "epoch": 0.41665414098902753, + "flos": 20521727746560.0, + "grad_norm": 26.891299676050043, + "language_loss": 0.7855249, + "learning_rate": 2.6262510696237182e-06, + "loss": 0.80217069, + "num_input_tokens_seen": 148778680, + "router_z_loss_clip": 2.48242188, + "router_z_loss_mlp": 0.27050781, + "step": 6930, + "time_per_iteration": 2.768950939178467 + }, + { + "auxiliary_loss_clip": 0.01394668, + "auxiliary_loss_mlp": 0.00258956, + "balance_loss_clip": 1.14597368, + "balance_loss_mlp": 0.23128766, + "epoch": 0.4167142642416955, + "flos": 19682926369920.0, + "grad_norm": 121.69783760742101, + "language_loss": 0.89496481, + "learning_rate": 2.625881181419007e-06, + "loss": 0.91150105, + "num_input_tokens_seen": 148796470, + "router_z_loss_clip": 2.48828125, + "router_z_loss_mlp": 0.27661133, + "step": 6931, + "time_per_iteration": 2.6845648288726807 + }, + { + "auxiliary_loss_clip": 0.01380372, + "auxiliary_loss_mlp": 0.00293841, + "balance_loss_clip": 1.13205576, + "balance_loss_mlp": 0.26461077, + "epoch": 0.41677438749436346, + "flos": 23763742519680.0, + "grad_norm": 11.896540857713473, + "language_loss": 0.83901042, + "learning_rate": 2.6255112694808193e-06, + "loss": 0.85575259, + "num_input_tokens_seen": 148815300, + "router_z_loss_clip": 2.48242188, + "router_z_loss_mlp": 0.29248047, + "step": 6932, + "time_per_iteration": 2.6822924613952637 + }, + { + "auxiliary_loss_clip": 0.01390834, + "auxiliary_loss_mlp": 0.00261153, + "balance_loss_clip": 1.14037979, + "balance_loss_mlp": 0.23336571, + "epoch": 0.41683451074703143, + "flos": 30410053712640.0, + "grad_norm": 3.3979900530566614, + "language_loss": 0.8957845, + "learning_rate": 2.6251413338231813e-06, + "loss": 0.91230434, + "num_input_tokens_seen": 148834315, + "router_z_loss_clip": 2.50585938, + "router_z_loss_mlp": 0.27807617, + "step": 6933, + "time_per_iteration": 2.7461166381835938 + }, + { + "auxiliary_loss_clip": 0.0140444, + "auxiliary_loss_mlp": 0.00301029, + "balance_loss_clip": 1.1514914, + "balance_loss_mlp": 0.27070186, + "epoch": 0.4168946339996994, + "flos": 21506757390720.0, + "grad_norm": 7.8485192326911575, + "language_loss": 0.85273981, + "learning_rate": 2.624771374460121e-06, + "loss": 0.86979449, + "num_input_tokens_seen": 148852420, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.3034668, + "step": 6934, + "time_per_iteration": 2.716526746749878 + }, + { + "auxiliary_loss_clip": 0.01402213, + "auxiliary_loss_mlp": 0.00276557, + "balance_loss_clip": 1.15552425, + "balance_loss_mlp": 0.24751809, + "epoch": 0.41695475725236736, + "flos": 17638675539840.0, + "grad_norm": 3.6413087545155087, + "language_loss": 0.71973777, + "learning_rate": 2.624401391405668e-06, + "loss": 0.73652542, + "num_input_tokens_seen": 148869305, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.29077148, + "step": 6935, + "time_per_iteration": 2.622572898864746 + }, + { + "auxiliary_loss_clip": 0.01398195, + "auxiliary_loss_mlp": 0.0027434, + "balance_loss_clip": 1.14978862, + "balance_loss_mlp": 0.2485548, + "epoch": 0.4170148805050353, + "flos": 15668903560320.0, + "grad_norm": 21.588169750366593, + "language_loss": 0.83757108, + "learning_rate": 2.6240313846738513e-06, + "loss": 0.85429645, + "num_input_tokens_seen": 148886395, + "router_z_loss_clip": 2.48242188, + "router_z_loss_mlp": 0.25793457, + "step": 6936, + "time_per_iteration": 2.6314873695373535 + }, + { + "auxiliary_loss_clip": 0.01404527, + "auxiliary_loss_mlp": 0.00269567, + "balance_loss_clip": 1.15774989, + "balance_loss_mlp": 0.24411595, + "epoch": 0.4170750037577033, + "flos": 15159151699200.0, + "grad_norm": 26.677717110261423, + "language_loss": 0.85108984, + "learning_rate": 2.6236613542787024e-06, + "loss": 0.86783075, + "num_input_tokens_seen": 148905235, + "router_z_loss_clip": 2.46679688, + "router_z_loss_mlp": 0.25476074, + "step": 6937, + "time_per_iteration": 2.630932092666626 + }, + { + "auxiliary_loss_clip": 0.01399354, + "auxiliary_loss_mlp": 0.00258644, + "balance_loss_clip": 1.15074348, + "balance_loss_mlp": 0.23219176, + "epoch": 0.41713512701037125, + "flos": 28768289754240.0, + "grad_norm": 3.027721137814871, + "language_loss": 0.87493813, + "learning_rate": 2.6232913002342518e-06, + "loss": 0.89151818, + "num_input_tokens_seen": 148928130, + "router_z_loss_clip": 2.48632812, + "router_z_loss_mlp": 0.26452637, + "step": 6938, + "time_per_iteration": 2.8802576065063477 + }, + { + "auxiliary_loss_clip": 0.01407287, + "auxiliary_loss_mlp": 0.00295685, + "balance_loss_clip": 1.15513361, + "balance_loss_mlp": 0.26515502, + "epoch": 0.4171952502630392, + "flos": 28256993608320.0, + "grad_norm": 156.82569621425893, + "language_loss": 0.82252491, + "learning_rate": 2.6229212225545334e-06, + "loss": 0.83955461, + "num_input_tokens_seen": 148948790, + "router_z_loss_clip": 2.51757812, + "router_z_loss_mlp": 0.30505371, + "step": 6939, + "time_per_iteration": 3.0412657260894775 + }, + { + "auxiliary_loss_clip": 0.01410029, + "auxiliary_loss_mlp": 0.002755, + "balance_loss_clip": 1.1600529, + "balance_loss_mlp": 0.24864177, + "epoch": 0.4172553735157072, + "flos": 24571697091840.0, + "grad_norm": 3.6744773569727722, + "language_loss": 0.8215735, + "learning_rate": 2.622551121253579e-06, + "loss": 0.83842885, + "num_input_tokens_seen": 148967690, + "router_z_loss_clip": 2.5, + "router_z_loss_mlp": 0.26843262, + "step": 6940, + "time_per_iteration": 4.510461091995239 + }, + { + "auxiliary_loss_clip": 0.01401623, + "auxiliary_loss_mlp": 0.00289598, + "balance_loss_clip": 1.15080333, + "balance_loss_mlp": 0.26022524, + "epoch": 0.41731549676837515, + "flos": 27045797978880.0, + "grad_norm": 47.41873454611108, + "language_loss": 0.77581072, + "learning_rate": 2.622180996345424e-06, + "loss": 0.79272294, + "num_input_tokens_seen": 148987150, + "router_z_loss_clip": 2.50976562, + "router_z_loss_mlp": 0.29382324, + "step": 6941, + "time_per_iteration": 2.9065158367156982 + }, + { + "auxiliary_loss_clip": 0.01415969, + "auxiliary_loss_mlp": 0.00293216, + "balance_loss_clip": 1.165254, + "balance_loss_mlp": 0.26517817, + "epoch": 0.4173756200210431, + "flos": 28394063907840.0, + "grad_norm": 4.477494542680823, + "language_loss": 0.81601989, + "learning_rate": 2.621810847844104e-06, + "loss": 0.83311176, + "num_input_tokens_seen": 149004895, + "router_z_loss_clip": 2.5078125, + "router_z_loss_mlp": 0.28076172, + "step": 6942, + "time_per_iteration": 4.153656244277954 + }, + { + "auxiliary_loss_clip": 0.01417453, + "auxiliary_loss_mlp": 0.00296634, + "balance_loss_clip": 1.16063595, + "balance_loss_mlp": 0.26606908, + "epoch": 0.41743574327371114, + "flos": 22521556431360.0, + "grad_norm": 1374.6016460061212, + "language_loss": 0.82325387, + "learning_rate": 2.6214406757636534e-06, + "loss": 0.84039474, + "num_input_tokens_seen": 149020970, + "router_z_loss_clip": 2.56640625, + "router_z_loss_mlp": 0.30554199, + "step": 6943, + "time_per_iteration": 2.6974587440490723 + }, + { + "auxiliary_loss_clip": 0.01396745, + "auxiliary_loss_mlp": 0.00299755, + "balance_loss_clip": 1.15162075, + "balance_loss_mlp": 0.27106109, + "epoch": 0.4174958665263791, + "flos": 30113431200000.0, + "grad_norm": 38.536577322544105, + "language_loss": 0.69601953, + "learning_rate": 2.621070480118111e-06, + "loss": 0.71298456, + "num_input_tokens_seen": 149041795, + "router_z_loss_clip": 2.45117188, + "router_z_loss_mlp": 0.28723145, + "step": 6944, + "time_per_iteration": 2.706611394882202 + }, + { + "auxiliary_loss_clip": 0.01408176, + "auxiliary_loss_mlp": 0.00297886, + "balance_loss_clip": 1.15834737, + "balance_loss_mlp": 0.26814297, + "epoch": 0.41755598977904707, + "flos": 25263444188160.0, + "grad_norm": 1.947677275148861, + "language_loss": 0.76900673, + "learning_rate": 2.620700260921513e-06, + "loss": 0.78606737, + "num_input_tokens_seen": 149063700, + "router_z_loss_clip": 2.50390625, + "router_z_loss_mlp": 0.29736328, + "step": 6945, + "time_per_iteration": 4.092973709106445 + }, + { + "auxiliary_loss_clip": 0.01404733, + "auxiliary_loss_mlp": 0.00278167, + "balance_loss_clip": 1.15957081, + "balance_loss_mlp": 0.25138062, + "epoch": 0.41761611303171503, + "flos": 19828580019840.0, + "grad_norm": 5.2044171368214265, + "language_loss": 0.87406111, + "learning_rate": 2.620330018187899e-06, + "loss": 0.89089012, + "num_input_tokens_seen": 149082410, + "router_z_loss_clip": 2.453125, + "router_z_loss_mlp": 0.2677002, + "step": 6946, + "time_per_iteration": 2.6575326919555664 + }, + { + "auxiliary_loss_clip": 0.01391528, + "auxiliary_loss_mlp": 0.00268514, + "balance_loss_clip": 1.14465284, + "balance_loss_mlp": 0.24293125, + "epoch": 0.417676236284383, + "flos": 15523249910400.0, + "grad_norm": 15.780180007200576, + "language_loss": 0.86150169, + "learning_rate": 2.6199597519313086e-06, + "loss": 0.87810212, + "num_input_tokens_seen": 149098745, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.25610352, + "step": 6947, + "time_per_iteration": 2.6387410163879395 + }, + { + "auxiliary_loss_clip": 0.01408894, + "auxiliary_loss_mlp": 0.00274426, + "balance_loss_clip": 1.16335344, + "balance_loss_mlp": 0.24836637, + "epoch": 0.41773635953705096, + "flos": 32524473761280.0, + "grad_norm": 6.621913335776044, + "language_loss": 0.77546638, + "learning_rate": 2.6195894621657825e-06, + "loss": 0.79229957, + "num_input_tokens_seen": 149122255, + "router_z_loss_clip": 2.45703125, + "router_z_loss_mlp": 0.26037598, + "step": 6948, + "time_per_iteration": 2.8217642307281494 + }, + { + "auxiliary_loss_clip": 0.01397553, + "auxiliary_loss_mlp": 0.00308357, + "balance_loss_clip": 1.15095651, + "balance_loss_mlp": 0.28028369, + "epoch": 0.4177964827897189, + "flos": 23440941970560.0, + "grad_norm": 120.6682190865351, + "language_loss": 0.82701254, + "learning_rate": 2.619219148905362e-06, + "loss": 0.84407163, + "num_input_tokens_seen": 149142845, + "router_z_loss_clip": 2.46484375, + "router_z_loss_mlp": 0.28088379, + "step": 6949, + "time_per_iteration": 2.6534557342529297 + }, + { + "auxiliary_loss_clip": 0.01411456, + "auxiliary_loss_mlp": 0.00278491, + "balance_loss_clip": 1.15941644, + "balance_loss_mlp": 0.25054818, + "epoch": 0.4178566060423869, + "flos": 22748907565440.0, + "grad_norm": 62.212861935808725, + "language_loss": 0.89109039, + "learning_rate": 2.6188488121640888e-06, + "loss": 0.90798986, + "num_input_tokens_seen": 149163375, + "router_z_loss_clip": 2.52539062, + "router_z_loss_mlp": 0.27905273, + "step": 6950, + "time_per_iteration": 2.679577350616455 + }, + { + "auxiliary_loss_clip": 0.01402087, + "auxiliary_loss_mlp": 0.00294109, + "balance_loss_clip": 1.15576982, + "balance_loss_mlp": 0.26601154, + "epoch": 0.41791672929505486, + "flos": 26032794618240.0, + "grad_norm": 378.8632240265386, + "language_loss": 0.8092593, + "learning_rate": 2.618478451956007e-06, + "loss": 0.82622123, + "num_input_tokens_seen": 149185610, + "router_z_loss_clip": 2.46289062, + "router_z_loss_mlp": 0.28125, + "step": 6951, + "time_per_iteration": 4.145330429077148 + }, + { + "auxiliary_loss_clip": 0.01406921, + "auxiliary_loss_mlp": 0.00281624, + "balance_loss_clip": 1.15475965, + "balance_loss_mlp": 0.25225046, + "epoch": 0.4179768525477228, + "flos": 19568694142080.0, + "grad_norm": 3.640682426703686, + "language_loss": 0.81593341, + "learning_rate": 2.61810806829516e-06, + "loss": 0.83281887, + "num_input_tokens_seen": 149203990, + "router_z_loss_clip": 2.52148438, + "router_z_loss_mlp": 0.2935791, + "step": 6952, + "time_per_iteration": 2.6274261474609375 + }, + { + "auxiliary_loss_clip": 0.01397021, + "auxiliary_loss_mlp": 0.00284076, + "balance_loss_clip": 1.14987445, + "balance_loss_mlp": 0.25458384, + "epoch": 0.4180369758003908, + "flos": 17783826399360.0, + "grad_norm": 54.84474017678979, + "language_loss": 0.81936049, + "learning_rate": 2.617737661195593e-06, + "loss": 0.83617145, + "num_input_tokens_seen": 149221385, + "router_z_loss_clip": 2.46679688, + "router_z_loss_mlp": 0.29492188, + "step": 6953, + "time_per_iteration": 2.6211225986480713 + }, + { + "auxiliary_loss_clip": 0.01396797, + "auxiliary_loss_mlp": 0.00255549, + "balance_loss_clip": 1.15254974, + "balance_loss_mlp": 0.2293587, + "epoch": 0.41809709905305875, + "flos": 20960663944320.0, + "grad_norm": 14.649428435264896, + "language_loss": 0.83543956, + "learning_rate": 2.617367230671353e-06, + "loss": 0.85196298, + "num_input_tokens_seen": 149241175, + "router_z_loss_clip": 2.44726562, + "router_z_loss_mlp": 0.26184082, + "step": 6954, + "time_per_iteration": 2.644941806793213 + }, + { + "auxiliary_loss_clip": 0.01394663, + "auxiliary_loss_mlp": 0.00297997, + "balance_loss_clip": 1.14635873, + "balance_loss_mlp": 0.27012634, + "epoch": 0.4181572223057267, + "flos": 22017622573440.0, + "grad_norm": 8.495035976497672, + "language_loss": 0.92649448, + "learning_rate": 2.616996776736485e-06, + "loss": 0.94342113, + "num_input_tokens_seen": 149259115, + "router_z_loss_clip": 2.48242188, + "router_z_loss_mlp": 0.27868652, + "step": 6955, + "time_per_iteration": 2.6728146076202393 + }, + { + "auxiliary_loss_clip": 0.01387492, + "auxiliary_loss_mlp": 0.00266201, + "balance_loss_clip": 1.14456415, + "balance_loss_mlp": 0.23911692, + "epoch": 0.4182173455583947, + "flos": 26245528917120.0, + "grad_norm": 86.17167004225261, + "language_loss": 0.88313472, + "learning_rate": 2.616626299405037e-06, + "loss": 0.89967167, + "num_input_tokens_seen": 149278705, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.27087402, + "step": 6956, + "time_per_iteration": 2.7048206329345703 + }, + { + "auxiliary_loss_clip": 0.01393895, + "auxiliary_loss_mlp": 0.00302301, + "balance_loss_clip": 1.14287901, + "balance_loss_mlp": 0.27035287, + "epoch": 0.4182774688110627, + "flos": 14791605782400.0, + "grad_norm": 8.820644894132002, + "language_loss": 0.77697027, + "learning_rate": 2.616255798691059e-06, + "loss": 0.7939322, + "num_input_tokens_seen": 149294040, + "router_z_loss_clip": 2.50976562, + "router_z_loss_mlp": 0.3190918, + "step": 6957, + "time_per_iteration": 2.607893943786621 + }, + { + "auxiliary_loss_clip": 0.01387133, + "auxiliary_loss_mlp": 0.00292062, + "balance_loss_clip": 1.139781, + "balance_loss_mlp": 0.26513273, + "epoch": 0.41833759206373067, + "flos": 20412020632320.0, + "grad_norm": 1.9968317935379543, + "language_loss": 0.83461332, + "learning_rate": 2.6158852746085982e-06, + "loss": 0.85140532, + "num_input_tokens_seen": 149310385, + "router_z_loss_clip": 2.47460938, + "router_z_loss_mlp": 0.26928711, + "step": 6958, + "time_per_iteration": 2.600341320037842 + }, + { + "auxiliary_loss_clip": 0.01395535, + "auxiliary_loss_mlp": 0.00302396, + "balance_loss_clip": 1.14875436, + "balance_loss_mlp": 0.27363086, + "epoch": 0.41839771531639863, + "flos": 23656333875840.0, + "grad_norm": 54.98658370676171, + "language_loss": 0.84276313, + "learning_rate": 2.6155147271717066e-06, + "loss": 0.85974252, + "num_input_tokens_seen": 149328235, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.28735352, + "step": 6959, + "time_per_iteration": 2.634221076965332 + }, + { + "auxiliary_loss_clip": 0.01388075, + "auxiliary_loss_mlp": 0.00274319, + "balance_loss_clip": 1.14374375, + "balance_loss_mlp": 0.24899891, + "epoch": 0.4184578385690666, + "flos": 19754137082880.0, + "grad_norm": 155.7319269232263, + "language_loss": 0.84119254, + "learning_rate": 2.6151441563944347e-06, + "loss": 0.85781652, + "num_input_tokens_seen": 149347465, + "router_z_loss_clip": 2.4453125, + "router_z_loss_mlp": 0.25305176, + "step": 6960, + "time_per_iteration": 2.6327178478240967 + }, + { + "auxiliary_loss_clip": 0.01378348, + "auxiliary_loss_mlp": 0.00280514, + "balance_loss_clip": 1.13627434, + "balance_loss_mlp": 0.25366804, + "epoch": 0.41851796182173456, + "flos": 20193396503040.0, + "grad_norm": 14.777030057016153, + "language_loss": 0.83524811, + "learning_rate": 2.614773562290835e-06, + "loss": 0.85183668, + "num_input_tokens_seen": 149366685, + "router_z_loss_clip": 2.42382812, + "router_z_loss_mlp": 0.26867676, + "step": 6961, + "time_per_iteration": 2.637571334838867 + }, + { + "auxiliary_loss_clip": 0.01428276, + "auxiliary_loss_mlp": 0.00064516, + "balance_loss_clip": 1.21457481, + "balance_loss_mlp": 0.05922337, + "epoch": 0.41857808507440253, + "flos": 59018794231680.0, + "grad_norm": 0.819582694514353, + "language_loss": 0.54965931, + "learning_rate": 2.61440294487496e-06, + "loss": 0.56458724, + "num_input_tokens_seen": 149422925, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.05297852, + "step": 6962, + "time_per_iteration": 3.038231611251831 + }, + { + "auxiliary_loss_clip": 0.01399998, + "auxiliary_loss_mlp": 0.00285574, + "balance_loss_clip": 1.1537807, + "balance_loss_mlp": 0.25623703, + "epoch": 0.4186382083270705, + "flos": 18478805719680.0, + "grad_norm": 14.313941057372123, + "language_loss": 0.92687929, + "learning_rate": 2.614032304160864e-06, + "loss": 0.943735, + "num_input_tokens_seen": 149440820, + "router_z_loss_clip": 2.4609375, + "router_z_loss_mlp": 0.2935791, + "step": 6963, + "time_per_iteration": 2.63814640045166 + }, + { + "auxiliary_loss_clip": 0.01379497, + "auxiliary_loss_mlp": 0.00303447, + "balance_loss_clip": 1.13993716, + "balance_loss_mlp": 0.27571923, + "epoch": 0.41869833157973846, + "flos": 21578758202880.0, + "grad_norm": 61.728169343546185, + "language_loss": 0.75134158, + "learning_rate": 2.6136616401626014e-06, + "loss": 0.76817101, + "num_input_tokens_seen": 149461060, + "router_z_loss_clip": 2.39453125, + "router_z_loss_mlp": 0.27722168, + "step": 6964, + "time_per_iteration": 2.6312990188598633 + }, + { + "auxiliary_loss_clip": 0.01379373, + "auxiliary_loss_mlp": 0.0027356, + "balance_loss_clip": 1.13597286, + "balance_loss_mlp": 0.24526006, + "epoch": 0.4187584548324064, + "flos": 35517412650240.0, + "grad_norm": 55.56342786075924, + "language_loss": 0.76988226, + "learning_rate": 2.6132909528942273e-06, + "loss": 0.78641158, + "num_input_tokens_seen": 149483115, + "router_z_loss_clip": 2.43359375, + "router_z_loss_mlp": 0.28308105, + "step": 6965, + "time_per_iteration": 2.7698938846588135 + }, + { + "auxiliary_loss_clip": 0.01407066, + "auxiliary_loss_mlp": 0.00253873, + "balance_loss_clip": 1.16369641, + "balance_loss_mlp": 0.2287195, + "epoch": 0.4188185780850744, + "flos": 18655880791680.0, + "grad_norm": 4.741886584453144, + "language_loss": 0.77632904, + "learning_rate": 2.6129202423697997e-06, + "loss": 0.79293847, + "num_input_tokens_seen": 149501495, + "router_z_loss_clip": 2.43164062, + "router_z_loss_mlp": 0.25158691, + "step": 6966, + "time_per_iteration": 2.6276042461395264 + }, + { + "auxiliary_loss_clip": 0.01411362, + "auxiliary_loss_mlp": 0.00321692, + "balance_loss_clip": 1.16454315, + "balance_loss_mlp": 0.29272392, + "epoch": 0.41887870133774235, + "flos": 40333428374400.0, + "grad_norm": 98.81761987729494, + "language_loss": 0.79783463, + "learning_rate": 2.612549508603375e-06, + "loss": 0.81516516, + "num_input_tokens_seen": 149523170, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.28967285, + "step": 6967, + "time_per_iteration": 2.786262035369873 + }, + { + "auxiliary_loss_clip": 0.01495916, + "auxiliary_loss_mlp": 0.00116291, + "balance_loss_clip": 1.24758351, + "balance_loss_mlp": 0.10904338, + "epoch": 0.4189388245904103, + "flos": 61371336516480.0, + "grad_norm": 0.6731396751671859, + "language_loss": 0.45763588, + "learning_rate": 2.612178751609011e-06, + "loss": 0.47375798, + "num_input_tokens_seen": 149583955, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.07226562, + "step": 6968, + "time_per_iteration": 3.1143362522125244 + }, + { + "auxiliary_loss_clip": 0.01421123, + "auxiliary_loss_mlp": 0.00331717, + "balance_loss_clip": 1.17354941, + "balance_loss_mlp": 0.30098501, + "epoch": 0.4189989478430783, + "flos": 28215624119040.0, + "grad_norm": 141.38760378086602, + "language_loss": 0.81777465, + "learning_rate": 2.6118079714007685e-06, + "loss": 0.83530307, + "num_input_tokens_seen": 149604440, + "router_z_loss_clip": 2.4765625, + "router_z_loss_mlp": 0.30712891, + "step": 6969, + "time_per_iteration": 2.7423510551452637 + }, + { + "auxiliary_loss_clip": 0.01399777, + "auxiliary_loss_mlp": 0.00294558, + "balance_loss_clip": 1.15526903, + "balance_loss_mlp": 0.26814103, + "epoch": 0.4190590710957463, + "flos": 24565879088640.0, + "grad_norm": 3.7279118361254073, + "language_loss": 0.87247849, + "learning_rate": 2.611437167992705e-06, + "loss": 0.88942182, + "num_input_tokens_seen": 149623745, + "router_z_loss_clip": 2.44726562, + "router_z_loss_mlp": 0.26416016, + "step": 6970, + "time_per_iteration": 2.751967668533325 + }, + { + "auxiliary_loss_clip": 0.01417904, + "auxiliary_loss_mlp": 0.00279834, + "balance_loss_clip": 1.17062306, + "balance_loss_mlp": 0.25263017, + "epoch": 0.41911919434841427, + "flos": 21726027964800.0, + "grad_norm": 18.003447499216506, + "language_loss": 0.90226209, + "learning_rate": 2.6110663413988835e-06, + "loss": 0.91923946, + "num_input_tokens_seen": 149643025, + "router_z_loss_clip": 2.47460938, + "router_z_loss_mlp": 0.27233887, + "step": 6971, + "time_per_iteration": 2.645559072494507 + }, + { + "auxiliary_loss_clip": 0.01412203, + "auxiliary_loss_mlp": 0.00286609, + "balance_loss_clip": 1.16924381, + "balance_loss_mlp": 0.25954878, + "epoch": 0.41917931760108224, + "flos": 17601543855360.0, + "grad_norm": 15.091608473164822, + "language_loss": 0.82293242, + "learning_rate": 2.6106954916333648e-06, + "loss": 0.83992052, + "num_input_tokens_seen": 149660695, + "router_z_loss_clip": 2.43164062, + "router_z_loss_mlp": 0.27026367, + "step": 6972, + "time_per_iteration": 2.6032323837280273 + }, + { + "auxiliary_loss_clip": 0.01397484, + "auxiliary_loss_mlp": 0.00263684, + "balance_loss_clip": 1.15323758, + "balance_loss_mlp": 0.23769622, + "epoch": 0.4192394408537502, + "flos": 37816701022080.0, + "grad_norm": 12.299131419066777, + "language_loss": 0.79472899, + "learning_rate": 2.610324618710212e-06, + "loss": 0.81134063, + "num_input_tokens_seen": 149682040, + "router_z_loss_clip": 2.43945312, + "router_z_loss_mlp": 0.26000977, + "step": 6973, + "time_per_iteration": 2.793625593185425 + }, + { + "auxiliary_loss_clip": 0.01418839, + "auxiliary_loss_mlp": 0.00315711, + "balance_loss_clip": 1.16711807, + "balance_loss_mlp": 0.28776792, + "epoch": 0.41929956410641817, + "flos": 23107726477440.0, + "grad_norm": 9.838583653093744, + "language_loss": 0.82372606, + "learning_rate": 2.609953722643489e-06, + "loss": 0.84107161, + "num_input_tokens_seen": 149700855, + "router_z_loss_clip": 2.51757812, + "router_z_loss_mlp": 0.27966309, + "step": 6974, + "time_per_iteration": 2.6352128982543945 + }, + { + "auxiliary_loss_clip": 0.01397893, + "auxiliary_loss_mlp": 0.00282221, + "balance_loss_clip": 1.15298915, + "balance_loss_mlp": 0.25337261, + "epoch": 0.41935968735908613, + "flos": 22524537260160.0, + "grad_norm": 11.84107087449541, + "language_loss": 0.79654515, + "learning_rate": 2.609582803447259e-06, + "loss": 0.81334633, + "num_input_tokens_seen": 149717360, + "router_z_loss_clip": 2.453125, + "router_z_loss_mlp": 0.28894043, + "step": 6975, + "time_per_iteration": 2.649394989013672 + }, + { + "auxiliary_loss_clip": 0.01418982, + "auxiliary_loss_mlp": 0.00294534, + "balance_loss_clip": 1.1723423, + "balance_loss_mlp": 0.26406416, + "epoch": 0.4194198106117541, + "flos": 26870446759680.0, + "grad_norm": 3.216604026392508, + "language_loss": 0.87081254, + "learning_rate": 2.6092118611355885e-06, + "loss": 0.88794774, + "num_input_tokens_seen": 149738975, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.30480957, + "step": 6976, + "time_per_iteration": 2.6880359649658203 + }, + { + "auxiliary_loss_clip": 0.01403301, + "auxiliary_loss_mlp": 0.00301561, + "balance_loss_clip": 1.15480399, + "balance_loss_mlp": 0.27148438, + "epoch": 0.41947993386442206, + "flos": 19902412425600.0, + "grad_norm": 12.560392564886028, + "language_loss": 0.76659954, + "learning_rate": 2.6088408957225425e-06, + "loss": 0.78364813, + "num_input_tokens_seen": 149757055, + "router_z_loss_clip": 2.48242188, + "router_z_loss_mlp": 0.30090332, + "step": 6977, + "time_per_iteration": 2.625437021255493 + }, + { + "auxiliary_loss_clip": 0.01397847, + "auxiliary_loss_mlp": 0.00294266, + "balance_loss_clip": 1.15225506, + "balance_loss_mlp": 0.26648986, + "epoch": 0.41954005711709, + "flos": 17383889393280.0, + "grad_norm": 66.8612716523086, + "language_loss": 0.89866853, + "learning_rate": 2.6084699072221898e-06, + "loss": 0.91558969, + "num_input_tokens_seen": 149772885, + "router_z_loss_clip": 2.45507812, + "router_z_loss_mlp": 0.27807617, + "step": 6978, + "time_per_iteration": 2.601032018661499 + }, + { + "auxiliary_loss_clip": 0.01393963, + "auxiliary_loss_mlp": 0.00307281, + "balance_loss_clip": 1.14541471, + "balance_loss_mlp": 0.27685863, + "epoch": 0.419600180369758, + "flos": 25003306915200.0, + "grad_norm": 29.997030808879643, + "language_loss": 0.89235413, + "learning_rate": 2.6080988956485964e-06, + "loss": 0.90936661, + "num_input_tokens_seen": 149791515, + "router_z_loss_clip": 2.48828125, + "router_z_loss_mlp": 0.30444336, + "step": 6979, + "time_per_iteration": 2.763108491897583 + }, + { + "auxiliary_loss_clip": 0.01393358, + "auxiliary_loss_mlp": 0.00300284, + "balance_loss_clip": 1.14529562, + "balance_loss_mlp": 0.27193597, + "epoch": 0.41966030362242596, + "flos": 17383781652480.0, + "grad_norm": 95.501535833024, + "language_loss": 0.89732766, + "learning_rate": 2.6077278610158325e-06, + "loss": 0.91426408, + "num_input_tokens_seen": 149807250, + "router_z_loss_clip": 2.48046875, + "router_z_loss_mlp": 0.28344727, + "step": 6980, + "time_per_iteration": 2.582070827484131 + }, + { + "auxiliary_loss_clip": 0.01396271, + "auxiliary_loss_mlp": 0.00294612, + "balance_loss_clip": 1.14698827, + "balance_loss_mlp": 0.26740807, + "epoch": 0.4197204268750939, + "flos": 22156165330560.0, + "grad_norm": 15.274055354555063, + "language_loss": 0.87431908, + "learning_rate": 2.6073568033379665e-06, + "loss": 0.8912279, + "num_input_tokens_seen": 149821640, + "router_z_loss_clip": 2.49804688, + "router_z_loss_mlp": 0.2722168, + "step": 6981, + "time_per_iteration": 2.674773693084717 + }, + { + "auxiliary_loss_clip": 0.0138888, + "auxiliary_loss_mlp": 0.00304004, + "balance_loss_clip": 1.14364552, + "balance_loss_mlp": 0.27336678, + "epoch": 0.4197805501277619, + "flos": 22084128604800.0, + "grad_norm": 2.9678213696537776, + "language_loss": 0.88784075, + "learning_rate": 2.6069857226290696e-06, + "loss": 0.90476954, + "num_input_tokens_seen": 149840545, + "router_z_loss_clip": 2.453125, + "router_z_loss_mlp": 0.30664062, + "step": 6982, + "time_per_iteration": 4.1124348640441895 + }, + { + "auxiliary_loss_clip": 0.01413061, + "auxiliary_loss_mlp": 0.00293805, + "balance_loss_clip": 1.16005158, + "balance_loss_mlp": 0.263955, + "epoch": 0.4198406733804299, + "flos": 26432192920320.0, + "grad_norm": 55.68742550031249, + "language_loss": 0.63410515, + "learning_rate": 2.606614618903214e-06, + "loss": 0.65117377, + "num_input_tokens_seen": 149860375, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.29858398, + "step": 6983, + "time_per_iteration": 2.742246627807617 + }, + { + "auxiliary_loss_clip": 0.01398007, + "auxiliary_loss_mlp": 0.00288559, + "balance_loss_clip": 1.15369773, + "balance_loss_mlp": 0.26086712, + "epoch": 0.4199007966330979, + "flos": 12531029293440.0, + "grad_norm": 23.6938109511939, + "language_loss": 0.91331506, + "learning_rate": 2.606243492174471e-06, + "loss": 0.93018073, + "num_input_tokens_seen": 149877850, + "router_z_loss_clip": 2.44335938, + "router_z_loss_mlp": 0.27697754, + "step": 6984, + "time_per_iteration": 4.2667236328125 + }, + { + "auxiliary_loss_clip": 0.0139857, + "auxiliary_loss_mlp": 0.00311179, + "balance_loss_clip": 1.14990699, + "balance_loss_mlp": 0.28131711, + "epoch": 0.41996091988576584, + "flos": 21762944167680.0, + "grad_norm": 16.431129393106286, + "language_loss": 0.85130715, + "learning_rate": 2.605872342456914e-06, + "loss": 0.86840463, + "num_input_tokens_seen": 149896110, + "router_z_loss_clip": 2.48828125, + "router_z_loss_mlp": 0.29858398, + "step": 6985, + "time_per_iteration": 2.735740900039673 + }, + { + "auxiliary_loss_clip": 0.0140579, + "auxiliary_loss_mlp": 0.00314293, + "balance_loss_clip": 1.15562189, + "balance_loss_mlp": 0.28351364, + "epoch": 0.4200210431384338, + "flos": 26541935948160.0, + "grad_norm": 22.643717018381516, + "language_loss": 0.86312437, + "learning_rate": 2.6055011697646173e-06, + "loss": 0.88032526, + "num_input_tokens_seen": 149916495, + "router_z_loss_clip": 2.50195312, + "router_z_loss_mlp": 0.30786133, + "step": 6986, + "time_per_iteration": 2.725522994995117 + }, + { + "auxiliary_loss_clip": 0.01388664, + "auxiliary_loss_mlp": 0.00279093, + "balance_loss_clip": 1.14565897, + "balance_loss_mlp": 0.252736, + "epoch": 0.42008116639110177, + "flos": 26795824254720.0, + "grad_norm": 19.847595422236644, + "language_loss": 0.77194262, + "learning_rate": 2.605129974111655e-06, + "loss": 0.78862023, + "num_input_tokens_seen": 149936445, + "router_z_loss_clip": 2.4296875, + "router_z_loss_mlp": 0.26342773, + "step": 6987, + "time_per_iteration": 4.0937418937683105 + }, + { + "auxiliary_loss_clip": 0.01404148, + "auxiliary_loss_mlp": 0.00320791, + "balance_loss_clip": 1.15540552, + "balance_loss_mlp": 0.29131073, + "epoch": 0.42014128964376973, + "flos": 32087333243520.0, + "grad_norm": 120.11840018951423, + "language_loss": 0.80181563, + "learning_rate": 2.604758755512104e-06, + "loss": 0.81906503, + "num_input_tokens_seen": 149959430, + "router_z_loss_clip": 2.48632812, + "router_z_loss_mlp": 0.29504395, + "step": 6988, + "time_per_iteration": 2.7679593563079834 + }, + { + "auxiliary_loss_clip": 0.01413027, + "auxiliary_loss_mlp": 0.00321972, + "balance_loss_clip": 1.1592623, + "balance_loss_mlp": 0.29159701, + "epoch": 0.4202014128964377, + "flos": 26467133875200.0, + "grad_norm": 24.353150288886187, + "language_loss": 0.79759848, + "learning_rate": 2.60438751398004e-06, + "loss": 0.81494844, + "num_input_tokens_seen": 149980365, + "router_z_loss_clip": 2.54101562, + "router_z_loss_mlp": 0.30395508, + "step": 6989, + "time_per_iteration": 2.6546518802642822 + }, + { + "auxiliary_loss_clip": 0.01404608, + "auxiliary_loss_mlp": 0.00322412, + "balance_loss_clip": 1.15213799, + "balance_loss_mlp": 0.29193032, + "epoch": 0.42026153614910566, + "flos": 13401216178560.0, + "grad_norm": 11.939689604556325, + "language_loss": 0.78708148, + "learning_rate": 2.6040162495295404e-06, + "loss": 0.80435169, + "num_input_tokens_seen": 149997375, + "router_z_loss_clip": 2.5234375, + "router_z_loss_mlp": 0.30505371, + "step": 6990, + "time_per_iteration": 2.623218297958374 + }, + { + "auxiliary_loss_clip": 0.01409418, + "auxiliary_loss_mlp": 0.00073407, + "balance_loss_clip": 1.18596649, + "balance_loss_mlp": 0.06799515, + "epoch": 0.42032165940177363, + "flos": 60250457635200.0, + "grad_norm": 0.7989934828384163, + "language_loss": 0.60169262, + "learning_rate": 2.603644962174685e-06, + "loss": 0.61652088, + "num_input_tokens_seen": 150051230, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.05419922, + "step": 6991, + "time_per_iteration": 3.0360424518585205 + }, + { + "auxiliary_loss_clip": 0.0140426, + "auxiliary_loss_mlp": 0.00330981, + "balance_loss_clip": 1.15332222, + "balance_loss_mlp": 0.30110675, + "epoch": 0.4203817826544416, + "flos": 24535211852160.0, + "grad_norm": 1.9771824882115414, + "language_loss": 0.88525808, + "learning_rate": 2.6032736519295517e-06, + "loss": 0.90261054, + "num_input_tokens_seen": 150071135, + "router_z_loss_clip": 2.50976562, + "router_z_loss_mlp": 0.29858398, + "step": 6992, + "time_per_iteration": 2.716872215270996 + }, + { + "auxiliary_loss_clip": 0.01433838, + "auxiliary_loss_mlp": 0.0016312, + "balance_loss_clip": 1.20808411, + "balance_loss_mlp": 0.15467969, + "epoch": 0.42044190590710956, + "flos": 58820781530880.0, + "grad_norm": 0.7825280547208252, + "language_loss": 0.64943033, + "learning_rate": 2.6029023188082217e-06, + "loss": 0.66539991, + "num_input_tokens_seen": 150125220, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.08447266, + "step": 6993, + "time_per_iteration": 4.616387844085693 + }, + { + "auxiliary_loss_clip": 0.01401909, + "auxiliary_loss_mlp": 0.00372182, + "balance_loss_clip": 1.14893675, + "balance_loss_mlp": 0.33703953, + "epoch": 0.4205020291597775, + "flos": 16436063260800.0, + "grad_norm": 44.846025833106076, + "language_loss": 0.92247772, + "learning_rate": 2.6025309628247746e-06, + "loss": 0.94021857, + "num_input_tokens_seen": 150142300, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.3515625, + "step": 6994, + "time_per_iteration": 2.6520767211914062 + }, + { + "auxiliary_loss_clip": 0.01395343, + "auxiliary_loss_mlp": 0.00335965, + "balance_loss_clip": 1.1454711, + "balance_loss_mlp": 0.30513734, + "epoch": 0.4205621524124455, + "flos": 18405655672320.0, + "grad_norm": 1246.7931771403446, + "language_loss": 0.85594851, + "learning_rate": 2.6021595839932934e-06, + "loss": 0.87326157, + "num_input_tokens_seen": 150161345, + "router_z_loss_clip": 2.5, + "router_z_loss_mlp": 0.30834961, + "step": 6995, + "time_per_iteration": 2.627835512161255 + }, + { + "auxiliary_loss_clip": 0.01406332, + "auxiliary_loss_mlp": 0.00314731, + "balance_loss_clip": 1.1608423, + "balance_loss_mlp": 0.28610915, + "epoch": 0.4206222756651135, + "flos": 25520097841920.0, + "grad_norm": 96.85502334264243, + "language_loss": 0.8543756, + "learning_rate": 2.60178818232786e-06, + "loss": 0.87158626, + "num_input_tokens_seen": 150182420, + "router_z_loss_clip": 2.45898438, + "router_z_loss_mlp": 0.28601074, + "step": 6996, + "time_per_iteration": 2.675568103790283 + }, + { + "auxiliary_loss_clip": 0.01401937, + "auxiliary_loss_mlp": 0.00323409, + "balance_loss_clip": 1.15435767, + "balance_loss_mlp": 0.29441774, + "epoch": 0.4206823989177815, + "flos": 15304338472320.0, + "grad_norm": 79.78981743468857, + "language_loss": 0.84555036, + "learning_rate": 2.601416757842559e-06, + "loss": 0.86280382, + "num_input_tokens_seen": 150200175, + "router_z_loss_clip": 2.47851562, + "router_z_loss_mlp": 0.29003906, + "step": 6997, + "time_per_iteration": 2.6223039627075195 + }, + { + "auxiliary_loss_clip": 0.0140097, + "auxiliary_loss_mlp": 0.00332794, + "balance_loss_clip": 1.15358591, + "balance_loss_mlp": 0.30010653, + "epoch": 0.42074252217044944, + "flos": 15554096714880.0, + "grad_norm": 27.056681687905463, + "language_loss": 0.81924701, + "learning_rate": 2.6010453105514743e-06, + "loss": 0.83658469, + "num_input_tokens_seen": 150217100, + "router_z_loss_clip": 2.47265625, + "router_z_loss_mlp": 0.32666016, + "step": 6998, + "time_per_iteration": 2.6041550636291504 + }, + { + "auxiliary_loss_clip": 0.01406523, + "auxiliary_loss_mlp": 0.00360963, + "balance_loss_clip": 1.15590358, + "balance_loss_mlp": 0.32952759, + "epoch": 0.4208026454231174, + "flos": 26145877610880.0, + "grad_norm": 37.84941859269665, + "language_loss": 0.81414574, + "learning_rate": 2.60067384046869e-06, + "loss": 0.83182061, + "num_input_tokens_seen": 150239830, + "router_z_loss_clip": 2.50976562, + "router_z_loss_mlp": 0.3145752, + "step": 6999, + "time_per_iteration": 2.741089344024658 + }, + { + "auxiliary_loss_clip": 0.01407861, + "auxiliary_loss_mlp": 0.00325112, + "balance_loss_clip": 1.15836167, + "balance_loss_mlp": 0.29736009, + "epoch": 0.42086276867578537, + "flos": 23550110380800.0, + "grad_norm": 3.944722112470355, + "language_loss": 0.70837021, + "learning_rate": 2.600302347608295e-06, + "loss": 0.72569996, + "num_input_tokens_seen": 150260690, + "router_z_loss_clip": 2.49414062, + "router_z_loss_mlp": 0.27734375, + "step": 7000, + "time_per_iteration": 2.65169095993042 + }, + { + "auxiliary_loss_clip": 0.01413955, + "auxiliary_loss_mlp": 0.00340853, + "balance_loss_clip": 1.16400337, + "balance_loss_mlp": 0.30852318, + "epoch": 0.42092289192845334, + "flos": 18113414618880.0, + "grad_norm": 53.470024713024245, + "language_loss": 0.81361246, + "learning_rate": 2.5999308319843743e-06, + "loss": 0.83116055, + "num_input_tokens_seen": 150279885, + "router_z_loss_clip": 2.49804688, + "router_z_loss_mlp": 0.32324219, + "step": 7001, + "time_per_iteration": 2.6489367485046387 + }, + { + "auxiliary_loss_clip": 0.01400481, + "auxiliary_loss_mlp": 0.00335471, + "balance_loss_clip": 1.15604448, + "balance_loss_mlp": 0.30644402, + "epoch": 0.4209830151811213, + "flos": 20006588845440.0, + "grad_norm": 8.853984429643754, + "language_loss": 0.91605806, + "learning_rate": 2.5995592936110154e-06, + "loss": 0.93341762, + "num_input_tokens_seen": 150297390, + "router_z_loss_clip": 2.44726562, + "router_z_loss_mlp": 0.29003906, + "step": 7002, + "time_per_iteration": 2.6673898696899414 + }, + { + "auxiliary_loss_clip": 0.01407873, + "auxiliary_loss_mlp": 0.00331568, + "balance_loss_clip": 1.16457713, + "balance_loss_mlp": 0.30358934, + "epoch": 0.42104313843378927, + "flos": 21978946604160.0, + "grad_norm": 10.596916093339283, + "language_loss": 0.76364112, + "learning_rate": 2.5991877325023096e-06, + "loss": 0.78103554, + "num_input_tokens_seen": 150317390, + "router_z_loss_clip": 2.43359375, + "router_z_loss_mlp": 0.27990723, + "step": 7003, + "time_per_iteration": 2.639894962310791 + }, + { + "auxiliary_loss_clip": 0.01419205, + "auxiliary_loss_mlp": 0.00378027, + "balance_loss_clip": 1.16633105, + "balance_loss_mlp": 0.34402823, + "epoch": 0.42110326168645723, + "flos": 25443966965760.0, + "grad_norm": 3279.401439038277, + "language_loss": 0.84465528, + "learning_rate": 2.598816148672344e-06, + "loss": 0.86262763, + "num_input_tokens_seen": 150337455, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.33984375, + "step": 7004, + "time_per_iteration": 2.6789510250091553 + }, + { + "auxiliary_loss_clip": 0.01407496, + "auxiliary_loss_mlp": 0.00309773, + "balance_loss_clip": 1.16792572, + "balance_loss_mlp": 0.28161579, + "epoch": 0.4211633849391252, + "flos": 17822574195840.0, + "grad_norm": 13.109695933785492, + "language_loss": 0.76336384, + "learning_rate": 2.59844454213521e-06, + "loss": 0.78053653, + "num_input_tokens_seen": 150355385, + "router_z_loss_clip": 2.3984375, + "router_z_loss_mlp": 0.28137207, + "step": 7005, + "time_per_iteration": 2.6840157508850098 + }, + { + "auxiliary_loss_clip": 0.01418304, + "auxiliary_loss_mlp": 0.00340822, + "balance_loss_clip": 1.17183852, + "balance_loss_mlp": 0.31128147, + "epoch": 0.42122350819179316, + "flos": 16282436791680.0, + "grad_norm": 12.38088264340069, + "language_loss": 0.80179429, + "learning_rate": 2.5980729129049994e-06, + "loss": 0.81938553, + "num_input_tokens_seen": 150371750, + "router_z_loss_clip": 2.46679688, + "router_z_loss_mlp": 0.2956543, + "step": 7006, + "time_per_iteration": 2.6740567684173584 + }, + { + "auxiliary_loss_clip": 0.0140488, + "auxiliary_loss_mlp": 0.00324401, + "balance_loss_clip": 1.15990591, + "balance_loss_mlp": 0.29651845, + "epoch": 0.4212836314444611, + "flos": 19645866512640.0, + "grad_norm": 119.55455430097157, + "language_loss": 0.77977371, + "learning_rate": 2.5977012609958033e-06, + "loss": 0.79706657, + "num_input_tokens_seen": 150389955, + "router_z_loss_clip": 2.44921875, + "router_z_loss_mlp": 0.27893066, + "step": 7007, + "time_per_iteration": 2.7012012004852295 + }, + { + "auxiliary_loss_clip": 0.014071, + "auxiliary_loss_mlp": 0.00379316, + "balance_loss_clip": 1.15825057, + "balance_loss_mlp": 0.34662923, + "epoch": 0.4213437546971291, + "flos": 18369026778240.0, + "grad_norm": 3.5442971695055427, + "language_loss": 0.88490415, + "learning_rate": 2.5973295864217166e-06, + "loss": 0.90276831, + "num_input_tokens_seen": 150405780, + "router_z_loss_clip": 2.4921875, + "router_z_loss_mlp": 0.32714844, + "step": 7008, + "time_per_iteration": 2.6798882484436035 + }, + { + "auxiliary_loss_clip": 0.01412628, + "auxiliary_loss_mlp": 0.00333099, + "balance_loss_clip": 1.16426778, + "balance_loss_mlp": 0.30369031, + "epoch": 0.42140387794979706, + "flos": 27704507541120.0, + "grad_norm": 16.49818934891253, + "language_loss": 0.78136957, + "learning_rate": 2.596957889196831e-06, + "loss": 0.79882687, + "num_input_tokens_seen": 150425615, + "router_z_loss_clip": 2.48828125, + "router_z_loss_mlp": 0.29382324, + "step": 7009, + "time_per_iteration": 2.697206497192383 + }, + { + "auxiliary_loss_clip": 0.01412818, + "auxiliary_loss_mlp": 0.00330822, + "balance_loss_clip": 1.16879487, + "balance_loss_mlp": 0.3012816, + "epoch": 0.4214640012024651, + "flos": 28147071012480.0, + "grad_norm": 66.52399836981196, + "language_loss": 0.7296893, + "learning_rate": 2.596586169335243e-06, + "loss": 0.74712574, + "num_input_tokens_seen": 150445765, + "router_z_loss_clip": 2.44140625, + "router_z_loss_mlp": 0.29553223, + "step": 7010, + "time_per_iteration": 2.7349796295166016 + }, + { + "auxiliary_loss_clip": 0.01395101, + "auxiliary_loss_mlp": 0.0033696, + "balance_loss_clip": 1.14777899, + "balance_loss_mlp": 0.30637109, + "epoch": 0.42152412445513304, + "flos": 22997265177600.0, + "grad_norm": 6.092665050321106, + "language_loss": 0.77918839, + "learning_rate": 2.5962144268510477e-06, + "loss": 0.79650903, + "num_input_tokens_seen": 150464405, + "router_z_loss_clip": 2.47265625, + "router_z_loss_mlp": 0.3059082, + "step": 7011, + "time_per_iteration": 2.6376020908355713 + }, + { + "auxiliary_loss_clip": 0.01486388, + "auxiliary_loss_mlp": 0.00100417, + "balance_loss_clip": 1.2777915, + "balance_loss_mlp": 0.0944089, + "epoch": 0.421584247707801, + "flos": 63749592938880.0, + "grad_norm": 0.8154257226532128, + "language_loss": 0.54356313, + "learning_rate": 2.5958426617583417e-06, + "loss": 0.5594312, + "num_input_tokens_seen": 150520430, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.06005859, + "step": 7012, + "time_per_iteration": 3.028329372406006 + }, + { + "auxiliary_loss_clip": 0.0141893, + "auxiliary_loss_mlp": 0.00350458, + "balance_loss_clip": 1.17246556, + "balance_loss_mlp": 0.31960636, + "epoch": 0.421644370960469, + "flos": 24314612474880.0, + "grad_norm": 2.8926606846689067, + "language_loss": 0.83471751, + "learning_rate": 2.5954708740712215e-06, + "loss": 0.85241139, + "num_input_tokens_seen": 150542610, + "router_z_loss_clip": 2.46289062, + "router_z_loss_mlp": 0.30822754, + "step": 7013, + "time_per_iteration": 2.750278949737549 + }, + { + "auxiliary_loss_clip": 0.01407704, + "auxiliary_loss_mlp": 0.00342244, + "balance_loss_clip": 1.16163409, + "balance_loss_mlp": 0.31141639, + "epoch": 0.42170449421313694, + "flos": 23440690575360.0, + "grad_norm": 181.06737857378798, + "language_loss": 0.86528707, + "learning_rate": 2.595099063803787e-06, + "loss": 0.88278657, + "num_input_tokens_seen": 150560970, + "router_z_loss_clip": 2.4609375, + "router_z_loss_mlp": 0.30810547, + "step": 7014, + "time_per_iteration": 2.6806061267852783 + }, + { + "auxiliary_loss_clip": 0.01410196, + "auxiliary_loss_mlp": 0.00327491, + "balance_loss_clip": 1.16590893, + "balance_loss_mlp": 0.29702136, + "epoch": 0.4217646174658049, + "flos": 23695476721920.0, + "grad_norm": 17.44443335873867, + "language_loss": 0.83317375, + "learning_rate": 2.5947272309701354e-06, + "loss": 0.85055059, + "num_input_tokens_seen": 150582615, + "router_z_loss_clip": 2.44335938, + "router_z_loss_mlp": 0.30444336, + "step": 7015, + "time_per_iteration": 2.71368408203125 + }, + { + "auxiliary_loss_clip": 0.01418428, + "auxiliary_loss_mlp": 0.00342149, + "balance_loss_clip": 1.17098033, + "balance_loss_mlp": 0.31070173, + "epoch": 0.42182474071847287, + "flos": 24971562270720.0, + "grad_norm": 2.5126521159015387, + "language_loss": 0.86660939, + "learning_rate": 2.594355375584368e-06, + "loss": 0.88421518, + "num_input_tokens_seen": 150603640, + "router_z_loss_clip": 2.47851562, + "router_z_loss_mlp": 0.31469727, + "step": 7016, + "time_per_iteration": 2.7161731719970703 + }, + { + "auxiliary_loss_clip": 0.01410581, + "auxiliary_loss_mlp": 0.00344016, + "balance_loss_clip": 1.1653856, + "balance_loss_mlp": 0.31455976, + "epoch": 0.42188486397114083, + "flos": 22856639431680.0, + "grad_norm": 2.4228969618505847, + "language_loss": 0.7519784, + "learning_rate": 2.593983497660586e-06, + "loss": 0.76952434, + "num_input_tokens_seen": 150622490, + "router_z_loss_clip": 2.45507812, + "router_z_loss_mlp": 0.29431152, + "step": 7017, + "time_per_iteration": 2.615173101425171 + }, + { + "auxiliary_loss_clip": 0.01504532, + "auxiliary_loss_mlp": 0.00110709, + "balance_loss_clip": 1.30962205, + "balance_loss_mlp": 0.1042244, + "epoch": 0.4219449872238088, + "flos": 66975700965120.0, + "grad_norm": 0.687034993337925, + "language_loss": 0.5888046, + "learning_rate": 2.5936115972128895e-06, + "loss": 0.60495698, + "num_input_tokens_seen": 150689545, + "router_z_loss_clip": 1.953125, + "router_z_loss_mlp": 0.06494141, + "step": 7018, + "time_per_iteration": 3.2052643299102783 + }, + { + "auxiliary_loss_clip": 0.01408601, + "auxiliary_loss_mlp": 0.00368319, + "balance_loss_clip": 1.16158724, + "balance_loss_mlp": 0.33547676, + "epoch": 0.42200511047647676, + "flos": 13115367745920.0, + "grad_norm": 12.338018356073793, + "language_loss": 0.8332637, + "learning_rate": 2.593239674255382e-06, + "loss": 0.85103285, + "num_input_tokens_seen": 150707610, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.32861328, + "step": 7019, + "time_per_iteration": 2.666208505630493 + }, + { + "auxiliary_loss_clip": 0.01418058, + "auxiliary_loss_mlp": 0.00348452, + "balance_loss_clip": 1.17117691, + "balance_loss_mlp": 0.31626546, + "epoch": 0.42206523372914473, + "flos": 13991193066240.0, + "grad_norm": 9.7976148644771, + "language_loss": 0.75132555, + "learning_rate": 2.592867728802166e-06, + "loss": 0.76899064, + "num_input_tokens_seen": 150724530, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.32177734, + "step": 7020, + "time_per_iteration": 2.66951847076416 + }, + { + "auxiliary_loss_clip": 0.01400855, + "auxiliary_loss_mlp": 0.00325253, + "balance_loss_clip": 1.16091657, + "balance_loss_mlp": 0.29807335, + "epoch": 0.4221253569818127, + "flos": 21942317710080.0, + "grad_norm": 11.68084707779413, + "language_loss": 0.86072445, + "learning_rate": 2.592495760867347e-06, + "loss": 0.87798554, + "num_input_tokens_seen": 150742870, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.27197266, + "step": 7021, + "time_per_iteration": 2.6172702312469482 + }, + { + "auxiliary_loss_clip": 0.01413094, + "auxiliary_loss_mlp": 0.00347244, + "balance_loss_clip": 1.16788566, + "balance_loss_mlp": 0.31729853, + "epoch": 0.42218548023448066, + "flos": 32192587071360.0, + "grad_norm": 56.97159980192243, + "language_loss": 0.7547183, + "learning_rate": 2.5921237704650293e-06, + "loss": 0.77232164, + "num_input_tokens_seen": 150765500, + "router_z_loss_clip": 2.45703125, + "router_z_loss_mlp": 0.29956055, + "step": 7022, + "time_per_iteration": 2.711921215057373 + }, + { + "auxiliary_loss_clip": 0.01388677, + "auxiliary_loss_mlp": 0.00324446, + "balance_loss_clip": 1.15381241, + "balance_loss_mlp": 0.29662222, + "epoch": 0.4222456034871487, + "flos": 30118961894400.0, + "grad_norm": 9.030402505194479, + "language_loss": 0.72143304, + "learning_rate": 2.5917517576093188e-06, + "loss": 0.73856431, + "num_input_tokens_seen": 150784945, + "router_z_loss_clip": 2.34960938, + "router_z_loss_mlp": 0.27807617, + "step": 7023, + "time_per_iteration": 2.711137294769287 + }, + { + "auxiliary_loss_clip": 0.01392477, + "auxiliary_loss_mlp": 0.00333597, + "balance_loss_clip": 1.15494895, + "balance_loss_mlp": 0.30534407, + "epoch": 0.42230572673981664, + "flos": 22127904305280.0, + "grad_norm": 13.741545198409115, + "language_loss": 0.75034094, + "learning_rate": 2.591379722314322e-06, + "loss": 0.76760161, + "num_input_tokens_seen": 150803120, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.28271484, + "step": 7024, + "time_per_iteration": 4.072700500488281 + }, + { + "auxiliary_loss_clip": 0.01414983, + "auxiliary_loss_mlp": 0.00362583, + "balance_loss_clip": 1.17116153, + "balance_loss_mlp": 0.33139741, + "epoch": 0.4223658499924846, + "flos": 22055077480320.0, + "grad_norm": 16.47478471187675, + "language_loss": 0.82766598, + "learning_rate": 2.591007664594147e-06, + "loss": 0.84544158, + "num_input_tokens_seen": 150823135, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.31201172, + "step": 7025, + "time_per_iteration": 2.6456146240234375 + }, + { + "auxiliary_loss_clip": 0.01400754, + "auxiliary_loss_mlp": 0.00345455, + "balance_loss_clip": 1.1571908, + "balance_loss_mlp": 0.31543773, + "epoch": 0.4224259732451526, + "flos": 20410727742720.0, + "grad_norm": 3.9350329020392825, + "language_loss": 0.84068704, + "learning_rate": 2.5906355844629024e-06, + "loss": 0.85814917, + "num_input_tokens_seen": 150842070, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.29992676, + "step": 7026, + "time_per_iteration": 4.188737154006958 + }, + { + "auxiliary_loss_clip": 0.01453036, + "auxiliary_loss_mlp": 0.00142252, + "balance_loss_clip": 1.2599715, + "balance_loss_mlp": 0.13452683, + "epoch": 0.42248609649782054, + "flos": 62846655828480.0, + "grad_norm": 0.738475364992954, + "language_loss": 0.61370313, + "learning_rate": 2.5902634819346966e-06, + "loss": 0.62965608, + "num_input_tokens_seen": 150907450, + "router_z_loss_clip": 1.9296875, + "router_z_loss_mlp": 0.07714844, + "step": 7027, + "time_per_iteration": 3.226504325866699 + }, + { + "auxiliary_loss_clip": 0.01398887, + "auxiliary_loss_mlp": 0.00341603, + "balance_loss_clip": 1.15707183, + "balance_loss_mlp": 0.31163374, + "epoch": 0.4225462197504885, + "flos": 26249946289920.0, + "grad_norm": 8.598887719107832, + "language_loss": 0.8029151, + "learning_rate": 2.5898913570236414e-06, + "loss": 0.82032001, + "num_input_tokens_seen": 150928040, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.29943848, + "step": 7028, + "time_per_iteration": 2.6800248622894287 + }, + { + "auxiliary_loss_clip": 0.01407471, + "auxiliary_loss_mlp": 0.00366171, + "balance_loss_clip": 1.16347444, + "balance_loss_mlp": 0.33524835, + "epoch": 0.42260634300315647, + "flos": 20521943228160.0, + "grad_norm": 4.243404318521858, + "language_loss": 0.87732816, + "learning_rate": 2.589519209743846e-06, + "loss": 0.89506459, + "num_input_tokens_seen": 150945760, + "router_z_loss_clip": 2.44140625, + "router_z_loss_mlp": 0.30932617, + "step": 7029, + "time_per_iteration": 4.263251066207886 + }, + { + "auxiliary_loss_clip": 0.01404407, + "auxiliary_loss_mlp": 0.00387176, + "balance_loss_clip": 1.15627789, + "balance_loss_mlp": 0.35498905, + "epoch": 0.42266646625582444, + "flos": 24316731377280.0, + "grad_norm": 92.06952006916518, + "language_loss": 0.8384577, + "learning_rate": 2.589147040109424e-06, + "loss": 0.85637355, + "num_input_tokens_seen": 150965665, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.32177734, + "step": 7030, + "time_per_iteration": 2.7506067752838135 + }, + { + "auxiliary_loss_clip": 0.01393657, + "auxiliary_loss_mlp": 0.00346533, + "balance_loss_clip": 1.14805627, + "balance_loss_mlp": 0.31620657, + "epoch": 0.4227265895084924, + "flos": 24204151175040.0, + "grad_norm": 26.728414379514415, + "language_loss": 0.92521214, + "learning_rate": 2.588774848134486e-06, + "loss": 0.94261408, + "num_input_tokens_seen": 150982260, + "router_z_loss_clip": 2.453125, + "router_z_loss_mlp": 0.3034668, + "step": 7031, + "time_per_iteration": 2.7920291423797607 + }, + { + "auxiliary_loss_clip": 0.01393853, + "auxiliary_loss_mlp": 0.00355551, + "balance_loss_clip": 1.14904141, + "balance_loss_mlp": 0.32508063, + "epoch": 0.42278671276116037, + "flos": 16909760845440.0, + "grad_norm": 81.79040567171914, + "language_loss": 0.8240158, + "learning_rate": 2.5884026338331473e-06, + "loss": 0.84150982, + "num_input_tokens_seen": 150999990, + "router_z_loss_clip": 2.44726562, + "router_z_loss_mlp": 0.3046875, + "step": 7032, + "time_per_iteration": 2.6725149154663086 + }, + { + "auxiliary_loss_clip": 0.0140085, + "auxiliary_loss_mlp": 0.0036816, + "balance_loss_clip": 1.15164971, + "balance_loss_mlp": 0.33735663, + "epoch": 0.42284683601382833, + "flos": 25411073086080.0, + "grad_norm": 16.288765770409206, + "language_loss": 0.76311171, + "learning_rate": 2.5880303972195222e-06, + "loss": 0.78080183, + "num_input_tokens_seen": 151021105, + "router_z_loss_clip": 2.48828125, + "router_z_loss_mlp": 0.30810547, + "step": 7033, + "time_per_iteration": 2.7467665672302246 + }, + { + "auxiliary_loss_clip": 0.01394904, + "auxiliary_loss_mlp": 0.00338284, + "balance_loss_clip": 1.14784598, + "balance_loss_mlp": 0.30845815, + "epoch": 0.4229069592664963, + "flos": 23040322606080.0, + "grad_norm": 43.39029057791129, + "language_loss": 0.96225417, + "learning_rate": 2.5876581383077256e-06, + "loss": 0.97958606, + "num_input_tokens_seen": 151040665, + "router_z_loss_clip": 2.47265625, + "router_z_loss_mlp": 0.29797363, + "step": 7034, + "time_per_iteration": 2.7108068466186523 + }, + { + "auxiliary_loss_clip": 0.01391188, + "auxiliary_loss_mlp": 0.00349771, + "balance_loss_clip": 1.14847279, + "balance_loss_mlp": 0.32126793, + "epoch": 0.42296708251916426, + "flos": 26067448264320.0, + "grad_norm": 64.07688715213025, + "language_loss": 0.82515216, + "learning_rate": 2.5872858571118723e-06, + "loss": 0.84256178, + "num_input_tokens_seen": 151061240, + "router_z_loss_clip": 2.42382812, + "router_z_loss_mlp": 0.28503418, + "step": 7035, + "time_per_iteration": 4.138630390167236 + }, + { + "auxiliary_loss_clip": 0.01395273, + "auxiliary_loss_mlp": 0.00345899, + "balance_loss_clip": 1.14803219, + "balance_loss_mlp": 0.31635937, + "epoch": 0.4230272057718323, + "flos": 19458376496640.0, + "grad_norm": 3.025555389520627, + "language_loss": 0.88232583, + "learning_rate": 2.5869135536460817e-06, + "loss": 0.89973754, + "num_input_tokens_seen": 151076870, + "router_z_loss_clip": 2.47851562, + "router_z_loss_mlp": 0.29541016, + "step": 7036, + "time_per_iteration": 2.5825741291046143 + }, + { + "auxiliary_loss_clip": 0.01403289, + "auxiliary_loss_mlp": 0.00333412, + "balance_loss_clip": 1.15601242, + "balance_loss_mlp": 0.30417013, + "epoch": 0.42308732902450025, + "flos": 22383300983040.0, + "grad_norm": 10.90265579531404, + "language_loss": 0.76484537, + "learning_rate": 2.58654122792447e-06, + "loss": 0.78221238, + "num_input_tokens_seen": 151095110, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.29211426, + "step": 7037, + "time_per_iteration": 2.6833889484405518 + }, + { + "auxiliary_loss_clip": 0.01391184, + "auxiliary_loss_mlp": 0.00347056, + "balance_loss_clip": 1.14338875, + "balance_loss_mlp": 0.31726563, + "epoch": 0.4231474522771682, + "flos": 20995425331200.0, + "grad_norm": 3.9712554378754983, + "language_loss": 0.82589096, + "learning_rate": 2.586168879961155e-06, + "loss": 0.84327328, + "num_input_tokens_seen": 151114355, + "router_z_loss_clip": 2.48046875, + "router_z_loss_mlp": 0.29797363, + "step": 7038, + "time_per_iteration": 2.628981351852417 + }, + { + "auxiliary_loss_clip": 0.01407536, + "auxiliary_loss_mlp": 0.00369684, + "balance_loss_clip": 1.15485716, + "balance_loss_mlp": 0.33771199, + "epoch": 0.4232075755298362, + "flos": 14975863574400.0, + "grad_norm": 16.43576097153784, + "language_loss": 0.78408831, + "learning_rate": 2.585796509770259e-06, + "loss": 0.80186045, + "num_input_tokens_seen": 151131505, + "router_z_loss_clip": 2.52539062, + "router_z_loss_mlp": 0.31958008, + "step": 7039, + "time_per_iteration": 2.6214776039123535 + }, + { + "auxiliary_loss_clip": 0.01407483, + "auxiliary_loss_mlp": 0.00367706, + "balance_loss_clip": 1.15396738, + "balance_loss_mlp": 0.33556765, + "epoch": 0.42326769878250414, + "flos": 24532661986560.0, + "grad_norm": 6.000809359018324, + "language_loss": 0.81865311, + "learning_rate": 2.5854241173658996e-06, + "loss": 0.83640504, + "num_input_tokens_seen": 151151555, + "router_z_loss_clip": 2.53710938, + "router_z_loss_mlp": 0.3215332, + "step": 7040, + "time_per_iteration": 2.679781913757324 + }, + { + "auxiliary_loss_clip": 0.01406899, + "auxiliary_loss_mlp": 0.00343879, + "balance_loss_clip": 1.15571618, + "balance_loss_mlp": 0.31312326, + "epoch": 0.4233278220351721, + "flos": 26870303105280.0, + "grad_norm": 2.7112307930834123, + "language_loss": 0.73493063, + "learning_rate": 2.5850517027621996e-06, + "loss": 0.75243843, + "num_input_tokens_seen": 151172385, + "router_z_loss_clip": 2.515625, + "router_z_loss_mlp": 0.30761719, + "step": 7041, + "time_per_iteration": 2.739536762237549 + }, + { + "auxiliary_loss_clip": 0.01403426, + "auxiliary_loss_mlp": 0.00369218, + "balance_loss_clip": 1.15200031, + "balance_loss_mlp": 0.33731806, + "epoch": 0.4233879452878401, + "flos": 42814927463040.0, + "grad_norm": 109.37951003377444, + "language_loss": 0.81714797, + "learning_rate": 2.5846792659732803e-06, + "loss": 0.83487439, + "num_input_tokens_seen": 151194930, + "router_z_loss_clip": 2.51171875, + "router_z_loss_mlp": 0.3190918, + "step": 7042, + "time_per_iteration": 2.854063034057617 + }, + { + "auxiliary_loss_clip": 0.01397742, + "auxiliary_loss_mlp": 0.00357605, + "balance_loss_clip": 1.15091598, + "balance_loss_mlp": 0.32634804, + "epoch": 0.42344806854050804, + "flos": 25229006023680.0, + "grad_norm": 6287.29371473536, + "language_loss": 0.86194026, + "learning_rate": 2.5843068070132643e-06, + "loss": 0.87949371, + "num_input_tokens_seen": 151217905, + "router_z_loss_clip": 2.46289062, + "router_z_loss_mlp": 0.31274414, + "step": 7043, + "time_per_iteration": 2.839526414871216 + }, + { + "auxiliary_loss_clip": 0.01414349, + "auxiliary_loss_mlp": 0.00335228, + "balance_loss_clip": 1.16031849, + "balance_loss_mlp": 0.30629581, + "epoch": 0.423508191793176, + "flos": 22778820616320.0, + "grad_norm": 115.21024899474041, + "language_loss": 0.72332454, + "learning_rate": 2.5839343258962763e-06, + "loss": 0.74082029, + "num_input_tokens_seen": 151234580, + "router_z_loss_clip": 2.54101562, + "router_z_loss_mlp": 0.28930664, + "step": 7044, + "time_per_iteration": 2.7001419067382812 + }, + { + "auxiliary_loss_clip": 0.01424623, + "auxiliary_loss_mlp": 0.00353406, + "balance_loss_clip": 1.16677213, + "balance_loss_mlp": 0.32245949, + "epoch": 0.42356831504584397, + "flos": 34637493179520.0, + "grad_norm": 2.251152208885742, + "language_loss": 0.81987572, + "learning_rate": 2.5835618226364393e-06, + "loss": 0.83765596, + "num_input_tokens_seen": 151254765, + "router_z_loss_clip": 2.57617188, + "router_z_loss_mlp": 0.30957031, + "step": 7045, + "time_per_iteration": 2.7705023288726807 + }, + { + "auxiliary_loss_clip": 0.01404598, + "auxiliary_loss_mlp": 0.00361308, + "balance_loss_clip": 1.15517735, + "balance_loss_mlp": 0.33056363, + "epoch": 0.42362843829851193, + "flos": 17596767346560.0, + "grad_norm": 5.048372195889732, + "language_loss": 0.8750062, + "learning_rate": 2.5831892972478797e-06, + "loss": 0.89266527, + "num_input_tokens_seen": 151269045, + "router_z_loss_clip": 2.4921875, + "router_z_loss_mlp": 0.30725098, + "step": 7046, + "time_per_iteration": 2.5764427185058594 + }, + { + "auxiliary_loss_clip": 0.01414133, + "auxiliary_loss_mlp": 0.00367441, + "balance_loss_clip": 1.16200256, + "balance_loss_mlp": 0.33654237, + "epoch": 0.4236885615511799, + "flos": 22565691267840.0, + "grad_norm": 9.329389801788729, + "language_loss": 0.84010392, + "learning_rate": 2.5828167497447242e-06, + "loss": 0.85791969, + "num_input_tokens_seen": 151287530, + "router_z_loss_clip": 2.52148438, + "router_z_loss_mlp": 0.30908203, + "step": 7047, + "time_per_iteration": 2.726040840148926 + }, + { + "auxiliary_loss_clip": 0.01409731, + "auxiliary_loss_mlp": 0.0033811, + "balance_loss_clip": 1.15710926, + "balance_loss_mlp": 0.31069148, + "epoch": 0.42374868480384786, + "flos": 26469216864000.0, + "grad_norm": 6.322836622323462, + "language_loss": 0.73876965, + "learning_rate": 2.582444180141098e-06, + "loss": 0.75624806, + "num_input_tokens_seen": 151308905, + "router_z_loss_clip": 2.52539062, + "router_z_loss_mlp": 0.27404785, + "step": 7048, + "time_per_iteration": 2.702104091644287 + }, + { + "auxiliary_loss_clip": 0.01405348, + "auxiliary_loss_mlp": 0.00364712, + "balance_loss_clip": 1.15336978, + "balance_loss_mlp": 0.33476698, + "epoch": 0.4238088080565159, + "flos": 20370220179840.0, + "grad_norm": 88.16953413774394, + "language_loss": 0.84270364, + "learning_rate": 2.5820715884511307e-06, + "loss": 0.86040425, + "num_input_tokens_seen": 151326525, + "router_z_loss_clip": 2.51757812, + "router_z_loss_mlp": 0.29980469, + "step": 7049, + "time_per_iteration": 2.661215305328369 + }, + { + "auxiliary_loss_clip": 0.01414399, + "auxiliary_loss_mlp": 0.00374935, + "balance_loss_clip": 1.16111994, + "balance_loss_mlp": 0.34334409, + "epoch": 0.42386893130918385, + "flos": 21172105353600.0, + "grad_norm": 59.239729154205186, + "language_loss": 0.89955914, + "learning_rate": 2.5816989746889504e-06, + "loss": 0.91745245, + "num_input_tokens_seen": 151344675, + "router_z_loss_clip": 2.53515625, + "router_z_loss_mlp": 0.31591797, + "step": 7050, + "time_per_iteration": 2.640963077545166 + }, + { + "auxiliary_loss_clip": 0.01419401, + "auxiliary_loss_mlp": 0.00349305, + "balance_loss_clip": 1.16496599, + "balance_loss_mlp": 0.32069468, + "epoch": 0.4239290545618518, + "flos": 17675627656320.0, + "grad_norm": 8.175843985748632, + "language_loss": 0.79979885, + "learning_rate": 2.581326338868687e-06, + "loss": 0.81748593, + "num_input_tokens_seen": 151360730, + "router_z_loss_clip": 2.54296875, + "router_z_loss_mlp": 0.28637695, + "step": 7051, + "time_per_iteration": 2.657200336456299 + }, + { + "auxiliary_loss_clip": 0.01421312, + "auxiliary_loss_mlp": 0.00373423, + "balance_loss_clip": 1.16572762, + "balance_loss_mlp": 0.34195143, + "epoch": 0.4239891778145198, + "flos": 24314504734080.0, + "grad_norm": 1.969717553487127, + "language_loss": 0.89712262, + "learning_rate": 2.5809536810044706e-06, + "loss": 0.91507006, + "num_input_tokens_seen": 151380445, + "router_z_loss_clip": 2.5546875, + "router_z_loss_mlp": 0.31469727, + "step": 7052, + "time_per_iteration": 2.684546709060669 + }, + { + "auxiliary_loss_clip": 0.01420311, + "auxiliary_loss_mlp": 0.00362365, + "balance_loss_clip": 1.16628337, + "balance_loss_mlp": 0.33240771, + "epoch": 0.42404930106718774, + "flos": 20558428467840.0, + "grad_norm": 24.418148188926832, + "language_loss": 0.79200667, + "learning_rate": 2.5805810011104323e-06, + "loss": 0.80983341, + "num_input_tokens_seen": 151399325, + "router_z_loss_clip": 2.54492188, + "router_z_loss_mlp": 0.29968262, + "step": 7053, + "time_per_iteration": 2.6592884063720703 + }, + { + "auxiliary_loss_clip": 0.0141822, + "auxiliary_loss_mlp": 0.00408179, + "balance_loss_clip": 1.16363239, + "balance_loss_mlp": 0.37518165, + "epoch": 0.4241094243198557, + "flos": 22308067946880.0, + "grad_norm": 11.179326950373449, + "language_loss": 0.88196695, + "learning_rate": 2.580208299200704e-06, + "loss": 0.90023094, + "num_input_tokens_seen": 151417240, + "router_z_loss_clip": 2.54492188, + "router_z_loss_mlp": 0.32983398, + "step": 7054, + "time_per_iteration": 2.6578078269958496 + }, + { + "auxiliary_loss_clip": 0.01439298, + "auxiliary_loss_mlp": 0.00103853, + "balance_loss_clip": 1.21063828, + "balance_loss_mlp": 0.09631884, + "epoch": 0.4241695475725237, + "flos": 70612445272320.0, + "grad_norm": 0.783878877975846, + "language_loss": 0.59930241, + "learning_rate": 2.5798355752894183e-06, + "loss": 0.61473393, + "num_input_tokens_seen": 151476015, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.07519531, + "step": 7055, + "time_per_iteration": 3.1099839210510254 + }, + { + "auxiliary_loss_clip": 0.01420617, + "auxiliary_loss_mlp": 0.00439572, + "balance_loss_clip": 1.16465616, + "balance_loss_mlp": 0.40333229, + "epoch": 0.42422967082519164, + "flos": 14027462824320.0, + "grad_norm": 40.77952822752872, + "language_loss": 0.83783579, + "learning_rate": 2.5794628293907107e-06, + "loss": 0.85643774, + "num_input_tokens_seen": 151492035, + "router_z_loss_clip": 2.56054688, + "router_z_loss_mlp": 0.36230469, + "step": 7056, + "time_per_iteration": 2.8975815773010254 + }, + { + "auxiliary_loss_clip": 0.01414406, + "auxiliary_loss_mlp": 0.00368358, + "balance_loss_clip": 1.16002798, + "balance_loss_mlp": 0.33776915, + "epoch": 0.4242897940778596, + "flos": 22345522853760.0, + "grad_norm": 20.5604942440064, + "language_loss": 0.91936177, + "learning_rate": 2.579090061518714e-06, + "loss": 0.93718946, + "num_input_tokens_seen": 151508970, + "router_z_loss_clip": 2.54101562, + "router_z_loss_mlp": 0.30615234, + "step": 7057, + "time_per_iteration": 2.7375526428222656 + }, + { + "auxiliary_loss_clip": 0.01431775, + "auxiliary_loss_mlp": 0.00401485, + "balance_loss_clip": 1.17230201, + "balance_loss_mlp": 0.36789209, + "epoch": 0.42434991733052757, + "flos": 22595855713920.0, + "grad_norm": 430.7323277254274, + "language_loss": 0.90532506, + "learning_rate": 2.5787172716875642e-06, + "loss": 0.9236576, + "num_input_tokens_seen": 151525295, + "router_z_loss_clip": 2.59570312, + "router_z_loss_mlp": 0.33618164, + "step": 7058, + "time_per_iteration": 2.657898426055908 + }, + { + "auxiliary_loss_clip": 0.01409677, + "auxiliary_loss_mlp": 0.00403368, + "balance_loss_clip": 1.15714848, + "balance_loss_mlp": 0.3718487, + "epoch": 0.42441004058319554, + "flos": 20011437181440.0, + "grad_norm": 4.5054161234514645, + "language_loss": 0.86342907, + "learning_rate": 2.5783444599113973e-06, + "loss": 0.88155949, + "num_input_tokens_seen": 151544435, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.31530762, + "step": 7059, + "time_per_iteration": 2.6707375049591064 + }, + { + "auxiliary_loss_clip": 0.01425318, + "auxiliary_loss_mlp": 0.00408675, + "balance_loss_clip": 1.16495407, + "balance_loss_mlp": 0.37539208, + "epoch": 0.4244701638358635, + "flos": 11144985235200.0, + "grad_norm": 67.7120471329731, + "language_loss": 0.77166039, + "learning_rate": 2.57797162620435e-06, + "loss": 0.79000032, + "num_input_tokens_seen": 151559520, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.33300781, + "step": 7060, + "time_per_iteration": 2.5967650413513184 + }, + { + "auxiliary_loss_clip": 0.01421616, + "auxiliary_loss_mlp": 0.00411327, + "balance_loss_clip": 1.16482353, + "balance_loss_mlp": 0.3781155, + "epoch": 0.42453028708853147, + "flos": 23987753688960.0, + "grad_norm": 1.9557006456251689, + "language_loss": 0.80884159, + "learning_rate": 2.577598770580562e-06, + "loss": 0.82717097, + "num_input_tokens_seen": 151579790, + "router_z_loss_clip": 2.5703125, + "router_z_loss_mlp": 0.33251953, + "step": 7061, + "time_per_iteration": 2.660928249359131 + }, + { + "auxiliary_loss_clip": 0.01416548, + "auxiliary_loss_mlp": 0.00409846, + "balance_loss_clip": 1.16170168, + "balance_loss_mlp": 0.37677786, + "epoch": 0.42459041034119943, + "flos": 18406338030720.0, + "grad_norm": 5.4638284568682405, + "language_loss": 0.79978579, + "learning_rate": 2.5772258930541693e-06, + "loss": 0.81804973, + "num_input_tokens_seen": 151598285, + "router_z_loss_clip": 2.546875, + "router_z_loss_mlp": 0.33056641, + "step": 7062, + "time_per_iteration": 2.6273653507232666 + }, + { + "auxiliary_loss_clip": 0.01411537, + "auxiliary_loss_mlp": 0.00432956, + "balance_loss_clip": 1.15521276, + "balance_loss_mlp": 0.39943418, + "epoch": 0.42465053359386745, + "flos": 20958006337920.0, + "grad_norm": 18.690729353610113, + "language_loss": 0.71740055, + "learning_rate": 2.5768529936393137e-06, + "loss": 0.73584545, + "num_input_tokens_seen": 151615430, + "router_z_loss_clip": 2.5625, + "router_z_loss_mlp": 0.33520508, + "step": 7063, + "time_per_iteration": 2.6274514198303223 + }, + { + "auxiliary_loss_clip": 0.01406699, + "auxiliary_loss_mlp": 0.00439205, + "balance_loss_clip": 1.15617597, + "balance_loss_mlp": 0.40532535, + "epoch": 0.4247106568465354, + "flos": 33106190520960.0, + "grad_norm": 33.674217186002245, + "language_loss": 0.83170831, + "learning_rate": 2.5764800723501354e-06, + "loss": 0.85016739, + "num_input_tokens_seen": 151637030, + "router_z_loss_clip": 2.50585938, + "router_z_loss_mlp": 0.33837891, + "step": 7064, + "time_per_iteration": 2.722407817840576 + }, + { + "auxiliary_loss_clip": 0.0143168, + "auxiliary_loss_mlp": 0.00422377, + "balance_loss_clip": 1.17526412, + "balance_loss_mlp": 0.38840234, + "epoch": 0.4247707800992034, + "flos": 20046916840320.0, + "grad_norm": 6.299896229815424, + "language_loss": 0.8216387, + "learning_rate": 2.5761071292007736e-06, + "loss": 0.84017932, + "num_input_tokens_seen": 151655745, + "router_z_loss_clip": 2.56445312, + "router_z_loss_mlp": 0.33959961, + "step": 7065, + "time_per_iteration": 2.6464180946350098 + }, + { + "auxiliary_loss_clip": 0.01409129, + "auxiliary_loss_mlp": 0.00381299, + "balance_loss_clip": 1.155936, + "balance_loss_mlp": 0.34920788, + "epoch": 0.42483090335187135, + "flos": 22385132576640.0, + "grad_norm": 5.770808670881057, + "language_loss": 0.77688229, + "learning_rate": 2.5757341642053725e-06, + "loss": 0.79478657, + "num_input_tokens_seen": 151678040, + "router_z_loss_clip": 2.53320312, + "router_z_loss_mlp": 0.32104492, + "step": 7066, + "time_per_iteration": 2.7271697521209717 + }, + { + "auxiliary_loss_clip": 0.01421567, + "auxiliary_loss_mlp": 0.00391722, + "balance_loss_clip": 1.16055167, + "balance_loss_mlp": 0.35727021, + "epoch": 0.4248910266045393, + "flos": 21356830022400.0, + "grad_norm": 4.524254433099811, + "language_loss": 0.84674609, + "learning_rate": 2.5753611773780745e-06, + "loss": 0.86487901, + "num_input_tokens_seen": 151696410, + "router_z_loss_clip": 2.60742188, + "router_z_loss_mlp": 0.34472656, + "step": 7067, + "time_per_iteration": 4.044067859649658 + }, + { + "auxiliary_loss_clip": 0.01434102, + "auxiliary_loss_mlp": 0.00187049, + "balance_loss_clip": 1.21414089, + "balance_loss_mlp": 0.17636745, + "epoch": 0.4249511498572073, + "flos": 64008114099840.0, + "grad_norm": 0.9228856361388713, + "language_loss": 0.63214529, + "learning_rate": 2.574988168733022e-06, + "loss": 0.6483568, + "num_input_tokens_seen": 151756365, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.10693359, + "step": 7068, + "time_per_iteration": 3.062466859817505 + }, + { + "auxiliary_loss_clip": 0.01417567, + "auxiliary_loss_mlp": 0.00420271, + "balance_loss_clip": 1.15930939, + "balance_loss_mlp": 0.38517612, + "epoch": 0.42501127310987524, + "flos": 19607046888960.0, + "grad_norm": 4.04570142203525, + "language_loss": 0.79802954, + "learning_rate": 2.574615138284361e-06, + "loss": 0.81640798, + "num_input_tokens_seen": 151775165, + "router_z_loss_clip": 2.58398438, + "router_z_loss_mlp": 0.35083008, + "step": 7069, + "time_per_iteration": 4.198902606964111 + }, + { + "auxiliary_loss_clip": 0.01420369, + "auxiliary_loss_mlp": 0.00402971, + "balance_loss_clip": 1.16115117, + "balance_loss_mlp": 0.36964005, + "epoch": 0.4250713963625432, + "flos": 19462326992640.0, + "grad_norm": 3.3956677893796128, + "language_loss": 0.87870383, + "learning_rate": 2.5742420860462364e-06, + "loss": 0.89693725, + "num_input_tokens_seen": 151792620, + "router_z_loss_clip": 2.58984375, + "router_z_loss_mlp": 0.33325195, + "step": 7070, + "time_per_iteration": 2.714399576187134 + }, + { + "auxiliary_loss_clip": 0.01413121, + "auxiliary_loss_mlp": 0.00402039, + "balance_loss_clip": 1.15589249, + "balance_loss_mlp": 0.36880374, + "epoch": 0.4251315196152112, + "flos": 25337707557120.0, + "grad_norm": 7.083134151918011, + "language_loss": 0.78046149, + "learning_rate": 2.573869012032795e-06, + "loss": 0.79861307, + "num_input_tokens_seen": 151812850, + "router_z_loss_clip": 2.57226562, + "router_z_loss_mlp": 0.33227539, + "step": 7071, + "time_per_iteration": 4.239865779876709 + }, + { + "auxiliary_loss_clip": 0.0140975, + "auxiliary_loss_mlp": 0.0040947, + "balance_loss_clip": 1.15378642, + "balance_loss_mlp": 0.37625861, + "epoch": 0.42519164286787914, + "flos": 26359186527360.0, + "grad_norm": 18.165206309767985, + "language_loss": 0.78889763, + "learning_rate": 2.5734959162581824e-06, + "loss": 0.80708981, + "num_input_tokens_seen": 151831785, + "router_z_loss_clip": 2.56054688, + "router_z_loss_mlp": 0.33203125, + "step": 7072, + "time_per_iteration": 2.7525768280029297 + }, + { + "auxiliary_loss_clip": 0.01428042, + "auxiliary_loss_mlp": 0.00458569, + "balance_loss_clip": 1.16729999, + "balance_loss_mlp": 0.42526174, + "epoch": 0.4252517661205471, + "flos": 26031070765440.0, + "grad_norm": 25.156022539665997, + "language_loss": 0.87491769, + "learning_rate": 2.5731227987365475e-06, + "loss": 0.89378381, + "num_input_tokens_seen": 151853885, + "router_z_loss_clip": 2.60742188, + "router_z_loss_mlp": 0.33300781, + "step": 7073, + "time_per_iteration": 2.727867603302002 + }, + { + "auxiliary_loss_clip": 0.01422898, + "auxiliary_loss_mlp": 0.00395975, + "balance_loss_clip": 1.165887, + "balance_loss_mlp": 0.36297768, + "epoch": 0.42531188937321507, + "flos": 12713635059840.0, + "grad_norm": 5.359643168855741, + "language_loss": 0.97983694, + "learning_rate": 2.5727496594820386e-06, + "loss": 0.99802566, + "num_input_tokens_seen": 151871780, + "router_z_loss_clip": 2.56835938, + "router_z_loss_mlp": 0.33007812, + "step": 7074, + "time_per_iteration": 2.7776386737823486 + }, + { + "auxiliary_loss_clip": 0.01423661, + "auxiliary_loss_mlp": 0.00434757, + "balance_loss_clip": 1.16456485, + "balance_loss_mlp": 0.39754039, + "epoch": 0.42537201262588303, + "flos": 22091670460800.0, + "grad_norm": 99.29199151043449, + "language_loss": 0.71463478, + "learning_rate": 2.572376498508805e-06, + "loss": 0.73321903, + "num_input_tokens_seen": 151891600, + "router_z_loss_clip": 2.58789062, + "router_z_loss_mlp": 0.37207031, + "step": 7075, + "time_per_iteration": 2.820573568344116 + }, + { + "auxiliary_loss_clip": 0.01411675, + "auxiliary_loss_mlp": 0.0041971, + "balance_loss_clip": 1.15840483, + "balance_loss_mlp": 0.38361356, + "epoch": 0.42543213587855105, + "flos": 23003119094400.0, + "grad_norm": 2.939130345623063, + "language_loss": 0.80642271, + "learning_rate": 2.5720033158309973e-06, + "loss": 0.8247366, + "num_input_tokens_seen": 151911330, + "router_z_loss_clip": 2.53710938, + "router_z_loss_mlp": 0.36132812, + "step": 7076, + "time_per_iteration": 2.7364649772644043 + }, + { + "auxiliary_loss_clip": 0.01416465, + "auxiliary_loss_mlp": 0.00407401, + "balance_loss_clip": 1.16061056, + "balance_loss_mlp": 0.37330744, + "epoch": 0.425492259131219, + "flos": 25082454533760.0, + "grad_norm": 9.4542672574176, + "language_loss": 0.86138558, + "learning_rate": 2.571630111462766e-06, + "loss": 0.87962419, + "num_input_tokens_seen": 151930355, + "router_z_loss_clip": 2.55664062, + "router_z_loss_mlp": 0.34130859, + "step": 7077, + "time_per_iteration": 4.164709806442261 + }, + { + "auxiliary_loss_clip": 0.01391275, + "auxiliary_loss_mlp": 0.00387149, + "balance_loss_clip": 1.14192951, + "balance_loss_mlp": 0.35546267, + "epoch": 0.425552382383887, + "flos": 22816850140800.0, + "grad_norm": 3.1004698977765646, + "language_loss": 0.78188288, + "learning_rate": 2.571256885418265e-06, + "loss": 0.79966712, + "num_input_tokens_seen": 151949695, + "router_z_loss_clip": 2.49414062, + "router_z_loss_mlp": 0.31689453, + "step": 7078, + "time_per_iteration": 2.6744744777679443 + }, + { + "auxiliary_loss_clip": 0.01394464, + "auxiliary_loss_mlp": 0.00385122, + "balance_loss_clip": 1.14503825, + "balance_loss_mlp": 0.3553319, + "epoch": 0.42561250563655495, + "flos": 13553585671680.0, + "grad_norm": 38.62576693636197, + "language_loss": 0.88035893, + "learning_rate": 2.5708836377116445e-06, + "loss": 0.89815474, + "num_input_tokens_seen": 151967640, + "router_z_loss_clip": 2.49609375, + "router_z_loss_mlp": 0.29797363, + "step": 7079, + "time_per_iteration": 2.707059383392334 + }, + { + "auxiliary_loss_clip": 0.01405688, + "auxiliary_loss_mlp": 0.00395251, + "balance_loss_clip": 1.15015721, + "balance_loss_mlp": 0.36338606, + "epoch": 0.4256726288892229, + "flos": 46978303023360.0, + "grad_norm": 20.26361127047626, + "language_loss": 0.76820254, + "learning_rate": 2.5705103683570592e-06, + "loss": 0.78621185, + "num_input_tokens_seen": 151994020, + "router_z_loss_clip": 2.55859375, + "router_z_loss_mlp": 0.31848145, + "step": 7080, + "time_per_iteration": 2.9031472206115723 + }, + { + "auxiliary_loss_clip": 0.0139931, + "auxiliary_loss_mlp": 0.0036475, + "balance_loss_clip": 1.14541459, + "balance_loss_mlp": 0.33370799, + "epoch": 0.4257327521418909, + "flos": 23586451966080.0, + "grad_norm": 209.502591449781, + "language_loss": 0.86926007, + "learning_rate": 2.5701370773686646e-06, + "loss": 0.88690066, + "num_input_tokens_seen": 152013415, + "router_z_loss_clip": 2.5390625, + "router_z_loss_mlp": 0.31030273, + "step": 7081, + "time_per_iteration": 2.6974878311157227 + }, + { + "auxiliary_loss_clip": 0.01401786, + "auxiliary_loss_mlp": 0.0035438, + "balance_loss_clip": 1.14948106, + "balance_loss_mlp": 0.32565013, + "epoch": 0.42579287539455885, + "flos": 18989994124800.0, + "grad_norm": 2.1770485920419342, + "language_loss": 0.86429495, + "learning_rate": 2.5697637647606138e-06, + "loss": 0.88185662, + "num_input_tokens_seen": 152030860, + "router_z_loss_clip": 2.52148438, + "router_z_loss_mlp": 0.28710938, + "step": 7082, + "time_per_iteration": 2.656346321105957 + }, + { + "auxiliary_loss_clip": 0.01416107, + "auxiliary_loss_mlp": 0.00395987, + "balance_loss_clip": 1.15988708, + "balance_loss_mlp": 0.36377689, + "epoch": 0.4258529986472268, + "flos": 25191910252800.0, + "grad_norm": 4.887749288680173, + "language_loss": 0.74958283, + "learning_rate": 2.569390430547065e-06, + "loss": 0.76770377, + "num_input_tokens_seen": 152050395, + "router_z_loss_clip": 2.55859375, + "router_z_loss_mlp": 0.32202148, + "step": 7083, + "time_per_iteration": 2.7276597023010254 + }, + { + "auxiliary_loss_clip": 0.01497326, + "auxiliary_loss_mlp": 0.00058948, + "balance_loss_clip": 1.26920176, + "balance_loss_mlp": 0.053441, + "epoch": 0.4259131218998948, + "flos": 69968280718080.0, + "grad_norm": 0.8635307169596059, + "language_loss": 0.66719294, + "learning_rate": 2.569017074742173e-06, + "loss": 0.68275571, + "num_input_tokens_seen": 152113555, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.05517578, + "step": 7084, + "time_per_iteration": 3.19527006149292 + }, + { + "auxiliary_loss_clip": 0.01400857, + "auxiliary_loss_mlp": 0.00393158, + "balance_loss_clip": 1.14882898, + "balance_loss_mlp": 0.35970774, + "epoch": 0.42597324515256274, + "flos": 18004964480640.0, + "grad_norm": 3.8833637590354577, + "language_loss": 0.85745114, + "learning_rate": 2.5686436973600964e-06, + "loss": 0.8753913, + "num_input_tokens_seen": 152131575, + "router_z_loss_clip": 2.52148438, + "router_z_loss_mlp": 0.33447266, + "step": 7085, + "time_per_iteration": 2.7272191047668457 + }, + { + "auxiliary_loss_clip": 0.01418357, + "auxiliary_loss_mlp": 0.00421861, + "balance_loss_clip": 1.1598227, + "balance_loss_mlp": 0.38590765, + "epoch": 0.4260333684052307, + "flos": 15158792563200.0, + "grad_norm": 30.19460991369448, + "language_loss": 0.83874309, + "learning_rate": 2.568270298414995e-06, + "loss": 0.85714531, + "num_input_tokens_seen": 152149435, + "router_z_loss_clip": 2.58398438, + "router_z_loss_mlp": 0.35961914, + "step": 7086, + "time_per_iteration": 2.6548244953155518 + }, + { + "auxiliary_loss_clip": 0.01410948, + "auxiliary_loss_mlp": 0.00396453, + "balance_loss_clip": 1.15664172, + "balance_loss_mlp": 0.36240673, + "epoch": 0.42609349165789867, + "flos": 14939342421120.0, + "grad_norm": 22.8630901443389, + "language_loss": 0.86680496, + "learning_rate": 2.5678968779210255e-06, + "loss": 0.88487899, + "num_input_tokens_seen": 152166860, + "router_z_loss_clip": 2.54492188, + "router_z_loss_mlp": 0.34033203, + "step": 7087, + "time_per_iteration": 2.6134378910064697 + }, + { + "auxiliary_loss_clip": 0.01398251, + "auxiliary_loss_mlp": 0.00368361, + "balance_loss_clip": 1.14412689, + "balance_loss_mlp": 0.3354829, + "epoch": 0.42615361491056664, + "flos": 23731961961600.0, + "grad_norm": 3.9391706068798484, + "language_loss": 0.72434986, + "learning_rate": 2.5675234358923505e-06, + "loss": 0.74201596, + "num_input_tokens_seen": 152187475, + "router_z_loss_clip": 2.53515625, + "router_z_loss_mlp": 0.32910156, + "step": 7088, + "time_per_iteration": 2.7009191513061523 + }, + { + "auxiliary_loss_clip": 0.01402199, + "auxiliary_loss_mlp": 0.00364435, + "balance_loss_clip": 1.14968824, + "balance_loss_mlp": 0.33485952, + "epoch": 0.42621373816323466, + "flos": 24936441747840.0, + "grad_norm": 40.466226989375436, + "language_loss": 0.75387019, + "learning_rate": 2.56714997234313e-06, + "loss": 0.77153653, + "num_input_tokens_seen": 152207235, + "router_z_loss_clip": 2.52734375, + "router_z_loss_mlp": 0.29614258, + "step": 7089, + "time_per_iteration": 2.6758017539978027 + }, + { + "auxiliary_loss_clip": 0.01412013, + "auxiliary_loss_mlp": 0.003632, + "balance_loss_clip": 1.15267622, + "balance_loss_mlp": 0.33278951, + "epoch": 0.4262738614159026, + "flos": 13552975140480.0, + "grad_norm": 8.244903373502643, + "language_loss": 0.85426104, + "learning_rate": 2.566776487287525e-06, + "loss": 0.87201321, + "num_input_tokens_seen": 152224240, + "router_z_loss_clip": 2.59179688, + "router_z_loss_mlp": 0.30407715, + "step": 7090, + "time_per_iteration": 2.607390880584717 + }, + { + "auxiliary_loss_clip": 0.01416142, + "auxiliary_loss_mlp": 0.00386983, + "balance_loss_clip": 1.1610049, + "balance_loss_mlp": 0.35512996, + "epoch": 0.4263339846685706, + "flos": 29748794284800.0, + "grad_norm": 13.5519638916595, + "language_loss": 0.8239764, + "learning_rate": 2.5664029807396994e-06, + "loss": 0.84200764, + "num_input_tokens_seen": 152242595, + "router_z_loss_clip": 2.55078125, + "router_z_loss_mlp": 0.31835938, + "step": 7091, + "time_per_iteration": 2.6710665225982666 + }, + { + "auxiliary_loss_clip": 0.01397077, + "auxiliary_loss_mlp": 0.00326761, + "balance_loss_clip": 1.14741564, + "balance_loss_mlp": 0.3003318, + "epoch": 0.42639410792123855, + "flos": 16834204586880.0, + "grad_norm": 2.878859623396974, + "language_loss": 0.87425387, + "learning_rate": 2.5660294527138156e-06, + "loss": 0.89149219, + "num_input_tokens_seen": 152260840, + "router_z_loss_clip": 2.50195312, + "router_z_loss_mlp": 0.26452637, + "step": 7092, + "time_per_iteration": 2.6310858726501465 + }, + { + "auxiliary_loss_clip": 0.01431116, + "auxiliary_loss_mlp": 0.00368486, + "balance_loss_clip": 1.17149508, + "balance_loss_mlp": 0.33708656, + "epoch": 0.4264542311739065, + "flos": 28763118195840.0, + "grad_norm": 6.19154391013008, + "language_loss": 0.80158645, + "learning_rate": 2.565655903224038e-06, + "loss": 0.81958246, + "num_input_tokens_seen": 152280580, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.31396484, + "step": 7093, + "time_per_iteration": 2.7532901763916016 + }, + { + "auxiliary_loss_clip": 0.01416471, + "auxiliary_loss_mlp": 0.00364359, + "balance_loss_clip": 1.16148305, + "balance_loss_mlp": 0.33360285, + "epoch": 0.4265143544265745, + "flos": 24713615727360.0, + "grad_norm": 3.0292948186109414, + "language_loss": 0.78922015, + "learning_rate": 2.565282332284532e-06, + "loss": 0.80702847, + "num_input_tokens_seen": 152298455, + "router_z_loss_clip": 2.5546875, + "router_z_loss_mlp": 0.30786133, + "step": 7094, + "time_per_iteration": 2.726991653442383 + }, + { + "auxiliary_loss_clip": 0.01415365, + "auxiliary_loss_mlp": 0.00375771, + "balance_loss_clip": 1.16125464, + "balance_loss_mlp": 0.34494352, + "epoch": 0.42657447767924245, + "flos": 21865971352320.0, + "grad_norm": 3.1622595541733114, + "language_loss": 0.87619758, + "learning_rate": 2.564908739909464e-06, + "loss": 0.89410895, + "num_input_tokens_seen": 152316995, + "router_z_loss_clip": 2.54101562, + "router_z_loss_mlp": 0.30822754, + "step": 7095, + "time_per_iteration": 2.6824605464935303 + }, + { + "auxiliary_loss_clip": 0.01423262, + "auxiliary_loss_mlp": 0.00379352, + "balance_loss_clip": 1.16196132, + "balance_loss_mlp": 0.34492433, + "epoch": 0.4266346009319104, + "flos": 21470236237440.0, + "grad_norm": 265.81176341117725, + "language_loss": 0.86560452, + "learning_rate": 2.5645351261129996e-06, + "loss": 0.88363063, + "num_input_tokens_seen": 152334800, + "router_z_loss_clip": 2.61328125, + "router_z_loss_mlp": 0.34448242, + "step": 7096, + "time_per_iteration": 2.614403486251831 + }, + { + "auxiliary_loss_clip": 0.01423522, + "auxiliary_loss_mlp": 0.00398116, + "balance_loss_clip": 1.16249204, + "balance_loss_mlp": 0.36371207, + "epoch": 0.4266947241845784, + "flos": 25519379569920.0, + "grad_norm": 29.8312919627977, + "language_loss": 0.72514117, + "learning_rate": 2.5641614909093066e-06, + "loss": 0.74335748, + "num_input_tokens_seen": 152355175, + "router_z_loss_clip": 2.609375, + "router_z_loss_mlp": 0.34399414, + "step": 7097, + "time_per_iteration": 2.7040884494781494 + }, + { + "auxiliary_loss_clip": 0.01419776, + "auxiliary_loss_mlp": 0.00406808, + "balance_loss_clip": 1.16535163, + "balance_loss_mlp": 0.37114084, + "epoch": 0.42675484743724634, + "flos": 26541217676160.0, + "grad_norm": 142.66206135988708, + "language_loss": 0.77949226, + "learning_rate": 2.5637878343125535e-06, + "loss": 0.7977581, + "num_input_tokens_seen": 152377245, + "router_z_loss_clip": 2.54296875, + "router_z_loss_mlp": 0.35668945, + "step": 7098, + "time_per_iteration": 2.722876787185669 + }, + { + "auxiliary_loss_clip": 0.01408194, + "auxiliary_loss_mlp": 0.00329873, + "balance_loss_clip": 1.15585971, + "balance_loss_mlp": 0.30104855, + "epoch": 0.4268149706899143, + "flos": 23112718467840.0, + "grad_norm": 9.794667177124698, + "language_loss": 0.83039337, + "learning_rate": 2.5634141563369086e-06, + "loss": 0.84777409, + "num_input_tokens_seen": 152396985, + "router_z_loss_clip": 2.5234375, + "router_z_loss_mlp": 0.28808594, + "step": 7099, + "time_per_iteration": 2.786961555480957 + }, + { + "auxiliary_loss_clip": 0.0143666, + "auxiliary_loss_mlp": 0.00402118, + "balance_loss_clip": 1.17615056, + "balance_loss_mlp": 0.36752313, + "epoch": 0.4268750939425823, + "flos": 22706532495360.0, + "grad_norm": 5.6723561035206895, + "language_loss": 0.89025789, + "learning_rate": 2.5630404569965432e-06, + "loss": 0.90864563, + "num_input_tokens_seen": 152415590, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.34570312, + "step": 7100, + "time_per_iteration": 2.6879327297210693 + }, + { + "auxiliary_loss_clip": 0.01415028, + "auxiliary_loss_mlp": 0.00373507, + "balance_loss_clip": 1.15719438, + "balance_loss_mlp": 0.34117746, + "epoch": 0.42693521719525024, + "flos": 25374875155200.0, + "grad_norm": 155.51916507715336, + "language_loss": 0.85453486, + "learning_rate": 2.562666736305627e-06, + "loss": 0.87242019, + "num_input_tokens_seen": 152436735, + "router_z_loss_clip": 2.578125, + "router_z_loss_mlp": 0.32348633, + "step": 7101, + "time_per_iteration": 2.6918084621429443 + }, + { + "auxiliary_loss_clip": 0.01428274, + "auxiliary_loss_mlp": 0.00334089, + "balance_loss_clip": 1.16772509, + "balance_loss_mlp": 0.30273682, + "epoch": 0.42699534044791826, + "flos": 18150689957760.0, + "grad_norm": 54.30121474816975, + "language_loss": 0.817581, + "learning_rate": 2.5622929942783314e-06, + "loss": 0.8352046, + "num_input_tokens_seen": 152455685, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.31347656, + "step": 7102, + "time_per_iteration": 2.6055197715759277 + }, + { + "auxiliary_loss_clip": 0.01427344, + "auxiliary_loss_mlp": 0.00370857, + "balance_loss_clip": 1.16895199, + "balance_loss_mlp": 0.3371926, + "epoch": 0.4270554637005862, + "flos": 13698413308800.0, + "grad_norm": 2.923394909874939, + "language_loss": 0.90458316, + "learning_rate": 2.5619192309288297e-06, + "loss": 0.92256516, + "num_input_tokens_seen": 152473500, + "router_z_loss_clip": 2.5859375, + "router_z_loss_mlp": 0.33691406, + "step": 7103, + "time_per_iteration": 2.630558490753174 + }, + { + "auxiliary_loss_clip": 0.0144554, + "auxiliary_loss_mlp": 0.00387063, + "balance_loss_clip": 1.18290222, + "balance_loss_mlp": 0.35494819, + "epoch": 0.4271155869532542, + "flos": 17493596507520.0, + "grad_norm": 98.16483274470569, + "language_loss": 0.81559449, + "learning_rate": 2.561545446271294e-06, + "loss": 0.8339206, + "num_input_tokens_seen": 152491320, + "router_z_loss_clip": 2.62890625, + "router_z_loss_mlp": 0.32128906, + "step": 7104, + "time_per_iteration": 2.612170457839966 + }, + { + "auxiliary_loss_clip": 0.01416768, + "auxiliary_loss_mlp": 0.00351537, + "balance_loss_clip": 1.15799034, + "balance_loss_mlp": 0.31977963, + "epoch": 0.42717571020592215, + "flos": 32452293381120.0, + "grad_norm": 33.02472438677648, + "language_loss": 0.81852025, + "learning_rate": 2.5611716403198987e-06, + "loss": 0.83620328, + "num_input_tokens_seen": 152511970, + "router_z_loss_clip": 2.59375, + "router_z_loss_mlp": 0.31787109, + "step": 7105, + "time_per_iteration": 2.733327865600586 + }, + { + "auxiliary_loss_clip": 0.01425931, + "auxiliary_loss_mlp": 0.00394648, + "balance_loss_clip": 1.16689456, + "balance_loss_mlp": 0.36286643, + "epoch": 0.4272358334585901, + "flos": 16253062444800.0, + "grad_norm": 3.08512592859961, + "language_loss": 0.83767956, + "learning_rate": 2.560797813088819e-06, + "loss": 0.85588533, + "num_input_tokens_seen": 152530515, + "router_z_loss_clip": 2.58984375, + "router_z_loss_mlp": 0.31811523, + "step": 7106, + "time_per_iteration": 2.616262912750244 + }, + { + "auxiliary_loss_clip": 0.01420513, + "auxiliary_loss_mlp": 0.00378164, + "balance_loss_clip": 1.16562033, + "balance_loss_mlp": 0.34554875, + "epoch": 0.4272959567112581, + "flos": 24200092938240.0, + "grad_norm": 5.685595190721689, + "language_loss": 0.86794019, + "learning_rate": 2.560423964592229e-06, + "loss": 0.88592696, + "num_input_tokens_seen": 152549295, + "router_z_loss_clip": 2.54882812, + "router_z_loss_mlp": 0.32641602, + "step": 7107, + "time_per_iteration": 2.7270443439483643 + }, + { + "auxiliary_loss_clip": 0.0141869, + "auxiliary_loss_mlp": 0.00355921, + "balance_loss_clip": 1.16153562, + "balance_loss_mlp": 0.32332903, + "epoch": 0.42735607996392605, + "flos": 27963495578880.0, + "grad_norm": 10.343565547137722, + "language_loss": 0.73328978, + "learning_rate": 2.5600500948443075e-06, + "loss": 0.75103593, + "num_input_tokens_seen": 152570725, + "router_z_loss_clip": 2.57226562, + "router_z_loss_mlp": 0.32592773, + "step": 7108, + "time_per_iteration": 2.732841968536377 + }, + { + "auxiliary_loss_clip": 0.01431841, + "auxiliary_loss_mlp": 0.00344748, + "balance_loss_clip": 1.16935658, + "balance_loss_mlp": 0.31475472, + "epoch": 0.427416203216594, + "flos": 20295597674880.0, + "grad_norm": 4.296132567966364, + "language_loss": 0.77926135, + "learning_rate": 2.5596762038592294e-06, + "loss": 0.79702729, + "num_input_tokens_seen": 152588950, + "router_z_loss_clip": 2.62695312, + "router_z_loss_mlp": 0.30004883, + "step": 7109, + "time_per_iteration": 4.053720474243164 + }, + { + "auxiliary_loss_clip": 0.01426937, + "auxiliary_loss_mlp": 0.00364327, + "balance_loss_clip": 1.16631806, + "balance_loss_mlp": 0.33111569, + "epoch": 0.427476326469262, + "flos": 26943955943040.0, + "grad_norm": 110.89258479463155, + "language_loss": 0.72124416, + "learning_rate": 2.559302291651174e-06, + "loss": 0.73915684, + "num_input_tokens_seen": 152608965, + "router_z_loss_clip": 2.60742188, + "router_z_loss_mlp": 0.33203125, + "step": 7110, + "time_per_iteration": 2.7289819717407227 + }, + { + "auxiliary_loss_clip": 0.01424068, + "auxiliary_loss_mlp": 0.00384755, + "balance_loss_clip": 1.16573644, + "balance_loss_mlp": 0.35099465, + "epoch": 0.42753644972192995, + "flos": 25702847262720.0, + "grad_norm": 2.5529291880468077, + "language_loss": 0.80387592, + "learning_rate": 2.5589283582343197e-06, + "loss": 0.82196409, + "num_input_tokens_seen": 152630220, + "router_z_loss_clip": 2.58007812, + "router_z_loss_mlp": 0.33789062, + "step": 7111, + "time_per_iteration": 4.133579254150391 + }, + { + "auxiliary_loss_clip": 0.01416812, + "auxiliary_loss_mlp": 0.0036484, + "balance_loss_clip": 1.15908217, + "balance_loss_mlp": 0.33160397, + "epoch": 0.4275965729745979, + "flos": 18767419499520.0, + "grad_norm": 17.287950289421392, + "language_loss": 0.79847765, + "learning_rate": 2.558554403622845e-06, + "loss": 0.81629419, + "num_input_tokens_seen": 152648835, + "router_z_loss_clip": 2.57421875, + "router_z_loss_mlp": 0.33203125, + "step": 7112, + "time_per_iteration": 2.638350248336792 + }, + { + "auxiliary_loss_clip": 0.01410891, + "auxiliary_loss_mlp": 0.00349501, + "balance_loss_clip": 1.15359735, + "balance_loss_mlp": 0.32039055, + "epoch": 0.4276566962272659, + "flos": 23764424878080.0, + "grad_norm": 6.050212207839433, + "language_loss": 0.77391958, + "learning_rate": 2.5581804278309323e-06, + "loss": 0.79152346, + "num_input_tokens_seen": 152668375, + "router_z_loss_clip": 2.57421875, + "router_z_loss_mlp": 0.29089355, + "step": 7113, + "time_per_iteration": 4.084723234176636 + }, + { + "auxiliary_loss_clip": 0.01426134, + "auxiliary_loss_mlp": 0.00368544, + "balance_loss_clip": 1.16219068, + "balance_loss_mlp": 0.33224446, + "epoch": 0.42771681947993384, + "flos": 22492505306880.0, + "grad_norm": 6.102963321493748, + "language_loss": 0.67436475, + "learning_rate": 2.5578064308727617e-06, + "loss": 0.69231153, + "num_input_tokens_seen": 152689725, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.36315918, + "step": 7114, + "time_per_iteration": 2.7031309604644775 + }, + { + "auxiliary_loss_clip": 0.01440624, + "auxiliary_loss_mlp": 0.0038289, + "balance_loss_clip": 1.17052007, + "balance_loss_mlp": 0.34719905, + "epoch": 0.42777694273260186, + "flos": 25044712318080.0, + "grad_norm": 3.626198137318398, + "language_loss": 0.70537508, + "learning_rate": 2.5574324127625153e-06, + "loss": 0.72361028, + "num_input_tokens_seen": 152709375, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.35717773, + "step": 7115, + "time_per_iteration": 2.7031381130218506 + }, + { + "auxiliary_loss_clip": 0.01420908, + "auxiliary_loss_mlp": 0.00382235, + "balance_loss_clip": 1.15865076, + "balance_loss_mlp": 0.34640023, + "epoch": 0.4278370659852698, + "flos": 18661519226880.0, + "grad_norm": 188.68052512244083, + "language_loss": 0.79595053, + "learning_rate": 2.5570583735143753e-06, + "loss": 0.81398201, + "num_input_tokens_seen": 152727510, + "router_z_loss_clip": 2.62304688, + "router_z_loss_mlp": 0.3581543, + "step": 7116, + "time_per_iteration": 2.618859052658081 + }, + { + "auxiliary_loss_clip": 0.01402048, + "auxiliary_loss_mlp": 0.00338143, + "balance_loss_clip": 1.14755774, + "balance_loss_mlp": 0.30807853, + "epoch": 0.4278971892379378, + "flos": 27308269635840.0, + "grad_norm": 94.34338349206381, + "language_loss": 0.74346513, + "learning_rate": 2.5566843131425275e-06, + "loss": 0.760867, + "num_input_tokens_seen": 152746670, + "router_z_loss_clip": 2.546875, + "router_z_loss_mlp": 0.30053711, + "step": 7117, + "time_per_iteration": 2.757567882537842 + }, + { + "auxiliary_loss_clip": 0.01416203, + "auxiliary_loss_mlp": 0.00332413, + "balance_loss_clip": 1.15549803, + "balance_loss_mlp": 0.30087003, + "epoch": 0.42795731249060576, + "flos": 12888698970240.0, + "grad_norm": 3.41413914386177, + "language_loss": 0.78429741, + "learning_rate": 2.5563102316611536e-06, + "loss": 0.80178356, + "num_input_tokens_seen": 152760545, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.31542969, + "step": 7118, + "time_per_iteration": 2.6834030151367188 + }, + { + "auxiliary_loss_clip": 0.01419761, + "auxiliary_loss_mlp": 0.00329067, + "balance_loss_clip": 1.15931344, + "balance_loss_mlp": 0.297095, + "epoch": 0.4280174357432737, + "flos": 33401448316800.0, + "grad_norm": 40.547738016411365, + "language_loss": 0.81698263, + "learning_rate": 2.55593612908444e-06, + "loss": 0.83447087, + "num_input_tokens_seen": 152780970, + "router_z_loss_clip": 2.60546875, + "router_z_loss_mlp": 0.31982422, + "step": 7119, + "time_per_iteration": 4.2000038623809814 + }, + { + "auxiliary_loss_clip": 0.01429617, + "auxiliary_loss_mlp": 0.00360164, + "balance_loss_clip": 1.16612267, + "balance_loss_mlp": 0.32586703, + "epoch": 0.4280775589959417, + "flos": 18259104182400.0, + "grad_norm": 2.9240873254572697, + "language_loss": 0.81675106, + "learning_rate": 2.555562005426573e-06, + "loss": 0.83464885, + "num_input_tokens_seen": 152798475, + "router_z_loss_clip": 2.63476562, + "router_z_loss_mlp": 0.34289551, + "step": 7120, + "time_per_iteration": 2.818265914916992 + }, + { + "auxiliary_loss_clip": 0.01418028, + "auxiliary_loss_mlp": 0.00356131, + "balance_loss_clip": 1.1580807, + "balance_loss_mlp": 0.32508919, + "epoch": 0.42813768224860965, + "flos": 21471277731840.0, + "grad_norm": 21.54018365248761, + "language_loss": 0.8280611, + "learning_rate": 2.5551878607017385e-06, + "loss": 0.84580266, + "num_input_tokens_seen": 152817555, + "router_z_loss_clip": 2.59960938, + "router_z_loss_mlp": 0.31030273, + "step": 7121, + "time_per_iteration": 2.7444581985473633 + }, + { + "auxiliary_loss_clip": 0.01398916, + "auxiliary_loss_mlp": 0.00346533, + "balance_loss_clip": 1.14227247, + "balance_loss_mlp": 0.31522882, + "epoch": 0.4281978055012776, + "flos": 15669262696320.0, + "grad_norm": 10.17972335458286, + "language_loss": 0.92185867, + "learning_rate": 2.554813694924126e-06, + "loss": 0.93931317, + "num_input_tokens_seen": 152836295, + "router_z_loss_clip": 2.56835938, + "router_z_loss_mlp": 0.31298828, + "step": 7122, + "time_per_iteration": 2.702604055404663 + }, + { + "auxiliary_loss_clip": 0.0141432, + "auxiliary_loss_mlp": 0.00359047, + "balance_loss_clip": 1.15234661, + "balance_loss_mlp": 0.32643163, + "epoch": 0.4282579287539456, + "flos": 17712005155200.0, + "grad_norm": 3.4714436003293856, + "language_loss": 0.86373281, + "learning_rate": 2.554439508107921e-06, + "loss": 0.88146651, + "num_input_tokens_seen": 152854950, + "router_z_loss_clip": 2.61914062, + "router_z_loss_mlp": 0.32617188, + "step": 7123, + "time_per_iteration": 2.638716459274292 + }, + { + "auxiliary_loss_clip": 0.01420004, + "auxiliary_loss_mlp": 0.00336133, + "balance_loss_clip": 1.1573925, + "balance_loss_mlp": 0.30265915, + "epoch": 0.42831805200661355, + "flos": 19281157770240.0, + "grad_norm": 14.631478987922854, + "language_loss": 0.86919457, + "learning_rate": 2.5540653002673153e-06, + "loss": 0.88675594, + "num_input_tokens_seen": 152873995, + "router_z_loss_clip": 2.62695312, + "router_z_loss_mlp": 0.33496094, + "step": 7124, + "time_per_iteration": 2.65563702583313 + }, + { + "auxiliary_loss_clip": 0.01413484, + "auxiliary_loss_mlp": 0.00373909, + "balance_loss_clip": 1.15175796, + "balance_loss_mlp": 0.34326008, + "epoch": 0.4283781752592815, + "flos": 19792633484160.0, + "grad_norm": 12.657679224617253, + "language_loss": 0.86787498, + "learning_rate": 2.553691071416498e-06, + "loss": 0.88574892, + "num_input_tokens_seen": 152892925, + "router_z_loss_clip": 2.6171875, + "router_z_loss_mlp": 0.30639648, + "step": 7125, + "time_per_iteration": 2.6675937175750732 + }, + { + "auxiliary_loss_clip": 0.01409652, + "auxiliary_loss_mlp": 0.00321433, + "balance_loss_clip": 1.15110636, + "balance_loss_mlp": 0.29076016, + "epoch": 0.4284382985119495, + "flos": 16508064072960.0, + "grad_norm": 13.603846000274487, + "language_loss": 0.80978644, + "learning_rate": 2.553316821569659e-06, + "loss": 0.8270973, + "num_input_tokens_seen": 152910935, + "router_z_loss_clip": 2.5859375, + "router_z_loss_mlp": 0.3067627, + "step": 7126, + "time_per_iteration": 2.624483585357666 + }, + { + "auxiliary_loss_clip": 0.0141142, + "auxiliary_loss_mlp": 0.00330731, + "balance_loss_clip": 1.15143859, + "balance_loss_mlp": 0.2999984, + "epoch": 0.42849842176461744, + "flos": 23330767979520.0, + "grad_norm": 5.12288237762217, + "language_loss": 0.88156629, + "learning_rate": 2.5529425507409913e-06, + "loss": 0.89898777, + "num_input_tokens_seen": 152931030, + "router_z_loss_clip": 2.59960938, + "router_z_loss_mlp": 0.30737305, + "step": 7127, + "time_per_iteration": 2.661252975463867 + }, + { + "auxiliary_loss_clip": 0.01419943, + "auxiliary_loss_mlp": 0.00369014, + "balance_loss_clip": 1.15551567, + "balance_loss_mlp": 0.33391884, + "epoch": 0.4285585450172854, + "flos": 17274433674240.0, + "grad_norm": 4.865906856089036, + "language_loss": 0.82652295, + "learning_rate": 2.5525682589446867e-06, + "loss": 0.84441245, + "num_input_tokens_seen": 152948085, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.35107422, + "step": 7128, + "time_per_iteration": 2.637255907058716 + }, + { + "auxiliary_loss_clip": 0.01415149, + "auxiliary_loss_mlp": 0.00365995, + "balance_loss_clip": 1.15215516, + "balance_loss_mlp": 0.33345109, + "epoch": 0.42861866826995343, + "flos": 24279599692800.0, + "grad_norm": 6.234008216743909, + "language_loss": 0.81713194, + "learning_rate": 2.552193946194937e-06, + "loss": 0.83494341, + "num_input_tokens_seen": 152966265, + "router_z_loss_clip": 2.62890625, + "router_z_loss_mlp": 0.32519531, + "step": 7129, + "time_per_iteration": 2.722487688064575 + }, + { + "auxiliary_loss_clip": 0.0142647, + "auxiliary_loss_mlp": 0.00400576, + "balance_loss_clip": 1.16672802, + "balance_loss_mlp": 0.36629128, + "epoch": 0.4286787915226214, + "flos": 24353108876160.0, + "grad_norm": 199.6344674330613, + "language_loss": 0.83966064, + "learning_rate": 2.5518196125059394e-06, + "loss": 0.85793108, + "num_input_tokens_seen": 152986775, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.34277344, + "step": 7130, + "time_per_iteration": 2.7026095390319824 + }, + { + "auxiliary_loss_clip": 0.0143577, + "auxiliary_loss_mlp": 0.00363254, + "balance_loss_clip": 1.17070735, + "balance_loss_mlp": 0.33142531, + "epoch": 0.42873891477528936, + "flos": 15449992122240.0, + "grad_norm": 6.947103758152384, + "language_loss": 0.80768561, + "learning_rate": 2.551445257891886e-06, + "loss": 0.82567585, + "num_input_tokens_seen": 153003595, + "router_z_loss_clip": 2.64648438, + "router_z_loss_mlp": 0.31860352, + "step": 7131, + "time_per_iteration": 2.71919322013855 + }, + { + "auxiliary_loss_clip": 0.01419645, + "auxiliary_loss_mlp": 0.0036422, + "balance_loss_clip": 1.15137815, + "balance_loss_mlp": 0.32891029, + "epoch": 0.4287990380279573, + "flos": 17639573379840.0, + "grad_norm": 3.394510288494608, + "language_loss": 0.85560411, + "learning_rate": 2.551070882366973e-06, + "loss": 0.87344277, + "num_input_tokens_seen": 153021960, + "router_z_loss_clip": 2.68554688, + "router_z_loss_mlp": 0.3527832, + "step": 7132, + "time_per_iteration": 2.687697649002075 + }, + { + "auxiliary_loss_clip": 0.01421183, + "auxiliary_loss_mlp": 0.00379937, + "balance_loss_clip": 1.15865028, + "balance_loss_mlp": 0.34560466, + "epoch": 0.4288591612806253, + "flos": 27162328677120.0, + "grad_norm": 7.313473470550568, + "language_loss": 0.83171403, + "learning_rate": 2.550696485945397e-06, + "loss": 0.84972525, + "num_input_tokens_seen": 153042110, + "router_z_loss_clip": 2.62304688, + "router_z_loss_mlp": 0.34338379, + "step": 7133, + "time_per_iteration": 2.754601240158081 + }, + { + "auxiliary_loss_clip": 0.01423565, + "auxiliary_loss_mlp": 0.00344598, + "balance_loss_clip": 1.15988207, + "balance_loss_mlp": 0.31623814, + "epoch": 0.42891928453329325, + "flos": 17163182275200.0, + "grad_norm": 16.464505855743376, + "language_loss": 0.81177384, + "learning_rate": 2.550322068641355e-06, + "loss": 0.82945549, + "num_input_tokens_seen": 153058925, + "router_z_loss_clip": 2.63476562, + "router_z_loss_mlp": 0.28381348, + "step": 7134, + "time_per_iteration": 2.665741205215454 + }, + { + "auxiliary_loss_clip": 0.01415487, + "auxiliary_loss_mlp": 0.00356868, + "balance_loss_clip": 1.15252614, + "balance_loss_mlp": 0.32360855, + "epoch": 0.4289794077859612, + "flos": 18187031543040.0, + "grad_norm": 3.440554725946858, + "language_loss": 0.91357017, + "learning_rate": 2.5499476304690455e-06, + "loss": 0.93129373, + "num_input_tokens_seen": 153078070, + "router_z_loss_clip": 2.62890625, + "router_z_loss_mlp": 0.33276367, + "step": 7135, + "time_per_iteration": 2.699946641921997 + }, + { + "auxiliary_loss_clip": 0.01402866, + "auxiliary_loss_mlp": 0.00332513, + "balance_loss_clip": 1.1439743, + "balance_loss_mlp": 0.30166167, + "epoch": 0.4290395310386292, + "flos": 28256885867520.0, + "grad_norm": 47.46720110848756, + "language_loss": 0.83409321, + "learning_rate": 2.549573171442666e-06, + "loss": 0.85144699, + "num_input_tokens_seen": 153096680, + "router_z_loss_clip": 2.58984375, + "router_z_loss_mlp": 0.30834961, + "step": 7136, + "time_per_iteration": 2.721072196960449 + }, + { + "auxiliary_loss_clip": 0.0142529, + "auxiliary_loss_mlp": 0.00368166, + "balance_loss_clip": 1.15947664, + "balance_loss_mlp": 0.33688551, + "epoch": 0.42909965429129715, + "flos": 16216074414720.0, + "grad_norm": 91.00852505679585, + "language_loss": 0.85933012, + "learning_rate": 2.5491986915764175e-06, + "loss": 0.87726462, + "num_input_tokens_seen": 153113305, + "router_z_loss_clip": 2.66015625, + "router_z_loss_mlp": 0.31274414, + "step": 7137, + "time_per_iteration": 2.589449405670166 + }, + { + "auxiliary_loss_clip": 0.01424556, + "auxiliary_loss_mlp": 0.00378908, + "balance_loss_clip": 1.16131032, + "balance_loss_mlp": 0.34705564, + "epoch": 0.4291597775439651, + "flos": 23112862122240.0, + "grad_norm": 5.203940834573685, + "language_loss": 0.83732575, + "learning_rate": 2.548824190884499e-06, + "loss": 0.85536039, + "num_input_tokens_seen": 153132735, + "router_z_loss_clip": 2.63476562, + "router_z_loss_mlp": 0.3182373, + "step": 7138, + "time_per_iteration": 2.665797233581543 + }, + { + "auxiliary_loss_clip": 0.01518022, + "auxiliary_loss_mlp": 0.0007882, + "balance_loss_clip": 1.27543378, + "balance_loss_mlp": 0.06985509, + "epoch": 0.4292199007966331, + "flos": 67546212681600.0, + "grad_norm": 0.9156107643159812, + "language_loss": 0.55721092, + "learning_rate": 2.548449669381113e-06, + "loss": 0.57317936, + "num_input_tokens_seen": 153187925, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.08984375, + "step": 7139, + "time_per_iteration": 2.967646598815918 + }, + { + "auxiliary_loss_clip": 0.01404396, + "auxiliary_loss_mlp": 0.00349085, + "balance_loss_clip": 1.14280927, + "balance_loss_mlp": 0.31921107, + "epoch": 0.42928002404930105, + "flos": 22999850956800.0, + "grad_norm": 19.875443109773613, + "language_loss": 0.87468398, + "learning_rate": 2.5480751270804595e-06, + "loss": 0.89221883, + "num_input_tokens_seen": 153206990, + "router_z_loss_clip": 2.61132812, + "router_z_loss_mlp": 0.29858398, + "step": 7140, + "time_per_iteration": 2.707534074783325 + }, + { + "auxiliary_loss_clip": 0.01427916, + "auxiliary_loss_mlp": 0.00371825, + "balance_loss_clip": 1.15912676, + "balance_loss_mlp": 0.33978167, + "epoch": 0.429340147301969, + "flos": 11544922241280.0, + "grad_norm": 89.82264770161768, + "language_loss": 0.88910127, + "learning_rate": 2.5477005639967424e-06, + "loss": 0.90709865, + "num_input_tokens_seen": 153222345, + "router_z_loss_clip": 2.68554688, + "router_z_loss_mlp": 0.3203125, + "step": 7141, + "time_per_iteration": 2.6473805904388428 + }, + { + "auxiliary_loss_clip": 0.01424592, + "auxiliary_loss_mlp": 0.00390321, + "balance_loss_clip": 1.15483022, + "balance_loss_mlp": 0.35834938, + "epoch": 0.42940027055463703, + "flos": 25264988472960.0, + "grad_norm": 34.15608585997064, + "language_loss": 0.91637003, + "learning_rate": 2.547325980144166e-06, + "loss": 0.93451917, + "num_input_tokens_seen": 153240570, + "router_z_loss_clip": 2.6953125, + "router_z_loss_mlp": 0.31958008, + "step": 7142, + "time_per_iteration": 2.703907012939453 + }, + { + "auxiliary_loss_clip": 0.01409552, + "auxiliary_loss_mlp": 0.00346976, + "balance_loss_clip": 1.14609432, + "balance_loss_mlp": 0.31569499, + "epoch": 0.429460393807305, + "flos": 23805004268160.0, + "grad_norm": 2.4168408941088697, + "language_loss": 0.86364543, + "learning_rate": 2.5469513755369323e-06, + "loss": 0.88121068, + "num_input_tokens_seen": 153259575, + "router_z_loss_clip": 2.63671875, + "router_z_loss_mlp": 0.3125, + "step": 7143, + "time_per_iteration": 2.7025365829467773 + }, + { + "auxiliary_loss_clip": 0.01436212, + "auxiliary_loss_mlp": 0.00340501, + "balance_loss_clip": 1.1688385, + "balance_loss_mlp": 0.31150907, + "epoch": 0.42952051705997296, + "flos": 13918294414080.0, + "grad_norm": 16.223084978859546, + "language_loss": 0.83573204, + "learning_rate": 2.5465767501892484e-06, + "loss": 0.85349917, + "num_input_tokens_seen": 153276650, + "router_z_loss_clip": 2.67382812, + "router_z_loss_mlp": 0.28979492, + "step": 7144, + "time_per_iteration": 2.616328716278076 + }, + { + "auxiliary_loss_clip": 0.01421029, + "auxiliary_loss_mlp": 0.00389635, + "balance_loss_clip": 1.15618563, + "balance_loss_mlp": 0.35794911, + "epoch": 0.4295806403126409, + "flos": 26760380509440.0, + "grad_norm": 2.0096398810518457, + "language_loss": 0.80170739, + "learning_rate": 2.54620210411532e-06, + "loss": 0.81981409, + "num_input_tokens_seen": 153298025, + "router_z_loss_clip": 2.65039062, + "router_z_loss_mlp": 0.31665039, + "step": 7145, + "time_per_iteration": 2.7041823863983154 + }, + { + "auxiliary_loss_clip": 0.01426079, + "auxiliary_loss_mlp": 0.00388062, + "balance_loss_clip": 1.15897238, + "balance_loss_mlp": 0.35394403, + "epoch": 0.4296407635653089, + "flos": 20952619297920.0, + "grad_norm": 5.8101328660861356, + "language_loss": 0.86735916, + "learning_rate": 2.545827437329352e-06, + "loss": 0.88550055, + "num_input_tokens_seen": 153315775, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.34106445, + "step": 7146, + "time_per_iteration": 2.6698684692382812 + }, + { + "auxiliary_loss_clip": 0.014135, + "auxiliary_loss_mlp": 0.0037979, + "balance_loss_clip": 1.15168607, + "balance_loss_mlp": 0.34855732, + "epoch": 0.42970088681797686, + "flos": 15852335339520.0, + "grad_norm": 6.281410613121259, + "language_loss": 0.9041661, + "learning_rate": 2.5454527498455532e-06, + "loss": 0.92209899, + "num_input_tokens_seen": 153332765, + "router_z_loss_clip": 2.61914062, + "router_z_loss_mlp": 0.31225586, + "step": 7147, + "time_per_iteration": 2.6322779655456543 + }, + { + "auxiliary_loss_clip": 0.01425265, + "auxiliary_loss_mlp": 0.00357451, + "balance_loss_clip": 1.16081333, + "balance_loss_mlp": 0.32650429, + "epoch": 0.4297610100706448, + "flos": 22382618624640.0, + "grad_norm": 9.653840365409316, + "language_loss": 0.92852342, + "learning_rate": 2.545078041678131e-06, + "loss": 0.94635057, + "num_input_tokens_seen": 153350760, + "router_z_loss_clip": 2.64648438, + "router_z_loss_mlp": 0.30957031, + "step": 7148, + "time_per_iteration": 2.647468328475952 + }, + { + "auxiliary_loss_clip": 0.01426505, + "auxiliary_loss_mlp": 0.00356843, + "balance_loss_clip": 1.16213918, + "balance_loss_mlp": 0.32797006, + "epoch": 0.4298211333233128, + "flos": 27925681536000.0, + "grad_norm": 36.50740341213853, + "language_loss": 0.83949488, + "learning_rate": 2.5447033128412957e-06, + "loss": 0.8573283, + "num_input_tokens_seen": 153370765, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.28857422, + "step": 7149, + "time_per_iteration": 2.7169911861419678 + }, + { + "auxiliary_loss_clip": 0.01405787, + "auxiliary_loss_mlp": 0.00332195, + "balance_loss_clip": 1.14414692, + "balance_loss_mlp": 0.30131957, + "epoch": 0.42988125657598075, + "flos": 24425612478720.0, + "grad_norm": 55.9217417785159, + "language_loss": 0.84991521, + "learning_rate": 2.544328563349256e-06, + "loss": 0.86729503, + "num_input_tokens_seen": 153390725, + "router_z_loss_clip": 2.62109375, + "router_z_loss_mlp": 0.30883789, + "step": 7150, + "time_per_iteration": 2.690232992172241 + }, + { + "auxiliary_loss_clip": 0.01437225, + "auxiliary_loss_mlp": 0.00378892, + "balance_loss_clip": 1.16565275, + "balance_loss_mlp": 0.34565622, + "epoch": 0.4299413798286487, + "flos": 15850180523520.0, + "grad_norm": 377.10358430178076, + "language_loss": 0.83169556, + "learning_rate": 2.5439537932162222e-06, + "loss": 0.84985673, + "num_input_tokens_seen": 153408010, + "router_z_loss_clip": 2.71679688, + "router_z_loss_mlp": 0.33251953, + "step": 7151, + "time_per_iteration": 4.069535970687866 + }, + { + "auxiliary_loss_clip": 0.01439547, + "auxiliary_loss_mlp": 0.00400937, + "balance_loss_clip": 1.16874015, + "balance_loss_mlp": 0.36784431, + "epoch": 0.4300015030813167, + "flos": 22309504490880.0, + "grad_norm": 6.827970250233639, + "language_loss": 0.7777983, + "learning_rate": 2.543579002456406e-06, + "loss": 0.79620314, + "num_input_tokens_seen": 153426865, + "router_z_loss_clip": 2.71289062, + "router_z_loss_mlp": 0.33093262, + "step": 7152, + "time_per_iteration": 2.628608226776123 + }, + { + "auxiliary_loss_clip": 0.01414489, + "auxiliary_loss_mlp": 0.00373576, + "balance_loss_clip": 1.14765453, + "balance_loss_mlp": 0.33982787, + "epoch": 0.43006162633398465, + "flos": 34897666366080.0, + "grad_norm": 2.725355718024374, + "language_loss": 0.77548903, + "learning_rate": 2.54320419108402e-06, + "loss": 0.79336965, + "num_input_tokens_seen": 153449410, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.33752441, + "step": 7153, + "time_per_iteration": 4.2659454345703125 + }, + { + "auxiliary_loss_clip": 0.01424192, + "auxiliary_loss_mlp": 0.00379727, + "balance_loss_clip": 1.16157126, + "balance_loss_mlp": 0.34937626, + "epoch": 0.4301217495866526, + "flos": 15961575576960.0, + "grad_norm": 19.66966547461353, + "language_loss": 0.86775249, + "learning_rate": 2.542829359113276e-06, + "loss": 0.88579166, + "num_input_tokens_seen": 153467910, + "router_z_loss_clip": 2.62890625, + "router_z_loss_mlp": 0.3034668, + "step": 7154, + "time_per_iteration": 2.708641767501831 + }, + { + "auxiliary_loss_clip": 0.01424284, + "auxiliary_loss_mlp": 0.00372995, + "balance_loss_clip": 1.1605978, + "balance_loss_mlp": 0.34238189, + "epoch": 0.43018187283932063, + "flos": 18770364414720.0, + "grad_norm": 111.73189539329825, + "language_loss": 0.84537005, + "learning_rate": 2.542454506558389e-06, + "loss": 0.86334288, + "num_input_tokens_seen": 153487100, + "router_z_loss_clip": 2.63671875, + "router_z_loss_mlp": 0.3059082, + "step": 7155, + "time_per_iteration": 2.623826026916504 + }, + { + "auxiliary_loss_clip": 0.01420391, + "auxiliary_loss_mlp": 0.00376074, + "balance_loss_clip": 1.15669847, + "balance_loss_mlp": 0.34517458, + "epoch": 0.4302419960919886, + "flos": 20151703791360.0, + "grad_norm": 39.48521671961109, + "language_loss": 0.96495032, + "learning_rate": 2.5420796334335723e-06, + "loss": 0.98291498, + "num_input_tokens_seen": 153505565, + "router_z_loss_clip": 2.63085938, + "router_z_loss_mlp": 0.30859375, + "step": 7156, + "time_per_iteration": 4.05840277671814 + }, + { + "auxiliary_loss_clip": 0.01435762, + "auxiliary_loss_mlp": 0.00390469, + "balance_loss_clip": 1.16839933, + "balance_loss_mlp": 0.35594553, + "epoch": 0.43030211934465656, + "flos": 26432731624320.0, + "grad_norm": 2.107980655766426, + "language_loss": 0.90770817, + "learning_rate": 2.541704739753042e-06, + "loss": 0.92597044, + "num_input_tokens_seen": 153526130, + "router_z_loss_clip": 2.67578125, + "router_z_loss_mlp": 0.34533691, + "step": 7157, + "time_per_iteration": 2.701692581176758 + }, + { + "auxiliary_loss_clip": 0.01461755, + "auxiliary_loss_mlp": 0.0037226, + "balance_loss_clip": 1.18858123, + "balance_loss_mlp": 0.34105131, + "epoch": 0.43036224259732453, + "flos": 24389234979840.0, + "grad_norm": 12.710639356319104, + "language_loss": 0.80839097, + "learning_rate": 2.5413298255310132e-06, + "loss": 0.82673115, + "num_input_tokens_seen": 153546370, + "router_z_loss_clip": 2.73046875, + "router_z_loss_mlp": 0.31225586, + "step": 7158, + "time_per_iteration": 2.6990506649017334 + }, + { + "auxiliary_loss_clip": 0.01438446, + "auxiliary_loss_mlp": 0.00382121, + "balance_loss_clip": 1.17134213, + "balance_loss_mlp": 0.35198486, + "epoch": 0.4304223658499925, + "flos": 17201714590080.0, + "grad_norm": 26.253594870742596, + "language_loss": 0.89170933, + "learning_rate": 2.5409548907817034e-06, + "loss": 0.90991503, + "num_input_tokens_seen": 153562800, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.30151367, + "step": 7159, + "time_per_iteration": 2.622833490371704 + }, + { + "auxiliary_loss_clip": 0.01442834, + "auxiliary_loss_mlp": 0.00365905, + "balance_loss_clip": 1.170542, + "balance_loss_mlp": 0.33486319, + "epoch": 0.43048248910266046, + "flos": 14903000835840.0, + "grad_norm": 12.647159439448382, + "language_loss": 0.90836459, + "learning_rate": 2.54057993551933e-06, + "loss": 0.92645192, + "num_input_tokens_seen": 153578395, + "router_z_loss_clip": 2.7265625, + "router_z_loss_mlp": 0.31079102, + "step": 7160, + "time_per_iteration": 2.6387205123901367 + }, + { + "auxiliary_loss_clip": 0.01443624, + "auxiliary_loss_mlp": 0.00366723, + "balance_loss_clip": 1.17059445, + "balance_loss_mlp": 0.3309606, + "epoch": 0.4305426123553284, + "flos": 21579835610880.0, + "grad_norm": 82.39527751287787, + "language_loss": 0.85798943, + "learning_rate": 2.5402049597581116e-06, + "loss": 0.87609291, + "num_input_tokens_seen": 153596880, + "router_z_loss_clip": 2.73242188, + "router_z_loss_mlp": 0.35742188, + "step": 7161, + "time_per_iteration": 4.123345375061035 + }, + { + "auxiliary_loss_clip": 0.01442817, + "auxiliary_loss_mlp": 0.00351967, + "balance_loss_clip": 1.17238212, + "balance_loss_mlp": 0.31935117, + "epoch": 0.4306027356079964, + "flos": 22601278667520.0, + "grad_norm": 20.116261959792606, + "language_loss": 0.79212642, + "learning_rate": 2.5398299635122662e-06, + "loss": 0.81007433, + "num_input_tokens_seen": 153616570, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.32592773, + "step": 7162, + "time_per_iteration": 2.715836524963379 + }, + { + "auxiliary_loss_clip": 0.01490764, + "auxiliary_loss_mlp": 0.00090424, + "balance_loss_clip": 1.22327089, + "balance_loss_mlp": 0.08207938, + "epoch": 0.43066285886066435, + "flos": 70672091806080.0, + "grad_norm": 0.7837524237061969, + "language_loss": 0.58379459, + "learning_rate": 2.5394549467960147e-06, + "loss": 0.59960651, + "num_input_tokens_seen": 153671450, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.08349609, + "step": 7163, + "time_per_iteration": 3.0688247680664062 + }, + { + "auxiliary_loss_clip": 0.01437003, + "auxiliary_loss_mlp": 0.00349624, + "balance_loss_clip": 1.16851091, + "balance_loss_mlp": 0.31791425, + "epoch": 0.4307229821133323, + "flos": 26720591218560.0, + "grad_norm": 3.44819501423517, + "language_loss": 0.85399044, + "learning_rate": 2.5390799096235783e-06, + "loss": 0.87185669, + "num_input_tokens_seen": 153691405, + "router_z_loss_clip": 2.68554688, + "router_z_loss_mlp": 0.31713867, + "step": 7164, + "time_per_iteration": 2.7044460773468018 + }, + { + "auxiliary_loss_clip": 0.0144059, + "auxiliary_loss_mlp": 0.00366558, + "balance_loss_clip": 1.16990185, + "balance_loss_mlp": 0.33346543, + "epoch": 0.4307831053660003, + "flos": 26177119464960.0, + "grad_norm": 11.63726781041606, + "language_loss": 0.77364558, + "learning_rate": 2.538704852009177e-06, + "loss": 0.79171705, + "num_input_tokens_seen": 153711555, + "router_z_loss_clip": 2.70507812, + "router_z_loss_mlp": 0.33081055, + "step": 7165, + "time_per_iteration": 2.706228256225586 + }, + { + "auxiliary_loss_clip": 0.01440678, + "auxiliary_loss_mlp": 0.0035404, + "balance_loss_clip": 1.16952014, + "balance_loss_mlp": 0.32149562, + "epoch": 0.43084322861866825, + "flos": 18910343715840.0, + "grad_norm": 6.747080834053999, + "language_loss": 0.82079709, + "learning_rate": 2.538329773967034e-06, + "loss": 0.83874422, + "num_input_tokens_seen": 153730095, + "router_z_loss_clip": 2.70898438, + "router_z_loss_mlp": 0.32568359, + "step": 7166, + "time_per_iteration": 2.634289026260376 + }, + { + "auxiliary_loss_clip": 0.01429059, + "auxiliary_loss_mlp": 0.00267218, + "balance_loss_clip": 1.16503322, + "balance_loss_mlp": 0.23579484, + "epoch": 0.4309033518713362, + "flos": 26432911192320.0, + "grad_norm": 10.921742262901743, + "language_loss": 0.79380149, + "learning_rate": 2.537954675511372e-06, + "loss": 0.81076431, + "num_input_tokens_seen": 153749320, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.31445312, + "step": 7167, + "time_per_iteration": 2.822505474090576 + }, + { + "auxiliary_loss_clip": 0.01430486, + "auxiliary_loss_mlp": 0.00304959, + "balance_loss_clip": 1.16670132, + "balance_loss_mlp": 0.27210492, + "epoch": 0.43096347512400424, + "flos": 21213295274880.0, + "grad_norm": 3.8994790281789085, + "language_loss": 0.84082699, + "learning_rate": 2.537579556656414e-06, + "loss": 0.85818148, + "num_input_tokens_seen": 153767825, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.32849121, + "step": 7168, + "time_per_iteration": 2.736447811126709 + }, + { + "auxiliary_loss_clip": 0.01435901, + "auxiliary_loss_mlp": 0.00285231, + "balance_loss_clip": 1.17114377, + "balance_loss_mlp": 0.2539984, + "epoch": 0.4310235983766722, + "flos": 16540131939840.0, + "grad_norm": 113.98137751020259, + "language_loss": 0.91951072, + "learning_rate": 2.537204417416387e-06, + "loss": 0.93672198, + "num_input_tokens_seen": 153785350, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.3125, + "step": 7169, + "time_per_iteration": 2.7616074085235596 + }, + { + "auxiliary_loss_clip": 0.01449768, + "auxiliary_loss_mlp": 0.00071009, + "balance_loss_clip": 1.19874334, + "balance_loss_mlp": 0.06118573, + "epoch": 0.43108372162934017, + "flos": 64775704763520.0, + "grad_norm": 0.6702405346718439, + "language_loss": 0.60768765, + "learning_rate": 2.5368292578055132e-06, + "loss": 0.62289542, + "num_input_tokens_seen": 153856400, + "router_z_loss_clip": 2.515625, + "router_z_loss_mlp": 0.09814453, + "step": 7170, + "time_per_iteration": 3.2945749759674072 + }, + { + "auxiliary_loss_clip": 0.01432567, + "auxiliary_loss_mlp": 0.00296215, + "balance_loss_clip": 1.16662598, + "balance_loss_mlp": 0.26500595, + "epoch": 0.43114384488200813, + "flos": 13444094039040.0, + "grad_norm": 4.348649547854899, + "language_loss": 0.85631067, + "learning_rate": 2.536454077838021e-06, + "loss": 0.87359846, + "num_input_tokens_seen": 153875230, + "router_z_loss_clip": 2.66210938, + "router_z_loss_mlp": 0.31201172, + "step": 7171, + "time_per_iteration": 2.665846347808838 + }, + { + "auxiliary_loss_clip": 0.01452716, + "auxiliary_loss_mlp": 0.00269101, + "balance_loss_clip": 1.18826067, + "balance_loss_mlp": 0.23762953, + "epoch": 0.4312039681346761, + "flos": 26286682924800.0, + "grad_norm": 10.019786500727852, + "language_loss": 0.82775021, + "learning_rate": 2.5360788775281357e-06, + "loss": 0.84496838, + "num_input_tokens_seen": 153894740, + "router_z_loss_clip": 2.64648438, + "router_z_loss_mlp": 0.31445312, + "step": 7172, + "time_per_iteration": 2.6803464889526367 + }, + { + "auxiliary_loss_clip": 0.01457939, + "auxiliary_loss_mlp": 0.00297945, + "balance_loss_clip": 1.18952763, + "balance_loss_mlp": 0.26294494, + "epoch": 0.43126409138734406, + "flos": 20376684627840.0, + "grad_norm": 52.46277923140172, + "language_loss": 0.85039186, + "learning_rate": 2.535703656890086e-06, + "loss": 0.86795068, + "num_input_tokens_seen": 153913230, + "router_z_loss_clip": 2.68554688, + "router_z_loss_mlp": 0.35009766, + "step": 7173, + "time_per_iteration": 2.6240451335906982 + }, + { + "auxiliary_loss_clip": 0.01438341, + "auxiliary_loss_mlp": 0.00266428, + "balance_loss_clip": 1.17634463, + "balance_loss_mlp": 0.23388404, + "epoch": 0.431324214640012, + "flos": 22123091882880.0, + "grad_norm": 9.050033533898832, + "language_loss": 0.82667494, + "learning_rate": 2.5353284159381e-06, + "loss": 0.84372264, + "num_input_tokens_seen": 153933250, + "router_z_loss_clip": 2.62109375, + "router_z_loss_mlp": 0.32568359, + "step": 7174, + "time_per_iteration": 2.72456955909729 + }, + { + "auxiliary_loss_clip": 0.01449324, + "auxiliary_loss_mlp": 0.00294285, + "balance_loss_clip": 1.18470168, + "balance_loss_mlp": 0.26216999, + "epoch": 0.43138433789268, + "flos": 15231008856960.0, + "grad_norm": 4.979708424583863, + "language_loss": 0.88370699, + "learning_rate": 2.534953154686407e-06, + "loss": 0.90114313, + "num_input_tokens_seen": 153951325, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.32104492, + "step": 7175, + "time_per_iteration": 2.6341567039489746 + }, + { + "auxiliary_loss_clip": 0.01455703, + "auxiliary_loss_mlp": 0.0027519, + "balance_loss_clip": 1.18239677, + "balance_loss_mlp": 0.23959459, + "epoch": 0.43144446114534796, + "flos": 18150294908160.0, + "grad_norm": 14.545115892762864, + "language_loss": 0.83099043, + "learning_rate": 2.5345778731492366e-06, + "loss": 0.84829938, + "num_input_tokens_seen": 153966975, + "router_z_loss_clip": 2.734375, + "router_z_loss_mlp": 0.35620117, + "step": 7176, + "time_per_iteration": 2.6232852935791016 + }, + { + "auxiliary_loss_clip": 0.0146042, + "auxiliary_loss_mlp": 0.00277729, + "balance_loss_clip": 1.19216311, + "balance_loss_mlp": 0.24334925, + "epoch": 0.4315045843980159, + "flos": 22929861306240.0, + "grad_norm": 3.001195058673707, + "language_loss": 0.80342984, + "learning_rate": 2.534202571340819e-06, + "loss": 0.82081133, + "num_input_tokens_seen": 153986695, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.34375, + "step": 7177, + "time_per_iteration": 2.671656608581543 + }, + { + "auxiliary_loss_clip": 0.01469312, + "auxiliary_loss_mlp": 0.00306334, + "balance_loss_clip": 1.19654763, + "balance_loss_mlp": 0.27026165, + "epoch": 0.4315647076506839, + "flos": 22126862810880.0, + "grad_norm": 15.483687929846662, + "language_loss": 0.87780255, + "learning_rate": 2.533827249275387e-06, + "loss": 0.89555901, + "num_input_tokens_seen": 154004710, + "router_z_loss_clip": 2.73242188, + "router_z_loss_mlp": 0.3605957, + "step": 7178, + "time_per_iteration": 2.665947914123535 + }, + { + "auxiliary_loss_clip": 0.01457767, + "auxiliary_loss_mlp": 0.0022861, + "balance_loss_clip": 1.20025349, + "balance_loss_mlp": 0.19449246, + "epoch": 0.43162483090335185, + "flos": 26871129118080.0, + "grad_norm": 2.7518927682637573, + "language_loss": 0.88790548, + "learning_rate": 2.5334519069671725e-06, + "loss": 0.90476918, + "num_input_tokens_seen": 154024320, + "router_z_loss_clip": 2.57421875, + "router_z_loss_mlp": 0.34106445, + "step": 7179, + "time_per_iteration": 2.7014245986938477 + }, + { + "auxiliary_loss_clip": 0.01452272, + "auxiliary_loss_mlp": 0.00273042, + "balance_loss_clip": 1.1895169, + "balance_loss_mlp": 0.23975897, + "epoch": 0.4316849541560198, + "flos": 13913122855680.0, + "grad_norm": 7.463791004743366, + "language_loss": 0.82816553, + "learning_rate": 2.5330765444304075e-06, + "loss": 0.84541869, + "num_input_tokens_seen": 154041755, + "router_z_loss_clip": 2.62695312, + "router_z_loss_mlp": 0.33276367, + "step": 7180, + "time_per_iteration": 2.666935920715332 + }, + { + "auxiliary_loss_clip": 0.01466563, + "auxiliary_loss_mlp": 0.00280125, + "balance_loss_clip": 1.19995236, + "balance_loss_mlp": 0.24603131, + "epoch": 0.4317450774086878, + "flos": 16435165420800.0, + "grad_norm": 2.584692010453137, + "language_loss": 0.86806178, + "learning_rate": 2.5327011616793274e-06, + "loss": 0.88552868, + "num_input_tokens_seen": 154056775, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.34130859, + "step": 7181, + "time_per_iteration": 2.6034936904907227 + }, + { + "auxiliary_loss_clip": 0.01471696, + "auxiliary_loss_mlp": 0.00286972, + "balance_loss_clip": 1.2046386, + "balance_loss_mlp": 0.25018382, + "epoch": 0.4318052006613558, + "flos": 20554980762240.0, + "grad_norm": 6.050514979341648, + "language_loss": 0.93978953, + "learning_rate": 2.532325758728165e-06, + "loss": 0.95737618, + "num_input_tokens_seen": 154075015, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.36816406, + "step": 7182, + "time_per_iteration": 2.6644339561462402 + }, + { + "auxiliary_loss_clip": 0.01462542, + "auxiliary_loss_mlp": 0.00253502, + "balance_loss_clip": 1.19770265, + "balance_loss_mlp": 0.21971811, + "epoch": 0.43186532391402377, + "flos": 22820046451200.0, + "grad_norm": 4.754953774570806, + "language_loss": 0.83497411, + "learning_rate": 2.5319503355911566e-06, + "loss": 0.85213453, + "num_input_tokens_seen": 154095170, + "router_z_loss_clip": 2.65039062, + "router_z_loss_mlp": 0.33789062, + "step": 7183, + "time_per_iteration": 2.6813955307006836 + }, + { + "auxiliary_loss_clip": 0.0148252, + "auxiliary_loss_mlp": 0.00282145, + "balance_loss_clip": 1.21071219, + "balance_loss_mlp": 0.24786082, + "epoch": 0.43192544716669173, + "flos": 25556583081600.0, + "grad_norm": 4.120138537035195, + "language_loss": 0.85567272, + "learning_rate": 2.5315748922825393e-06, + "loss": 0.87331939, + "num_input_tokens_seen": 154116895, + "router_z_loss_clip": 2.71875, + "router_z_loss_mlp": 0.3425293, + "step": 7184, + "time_per_iteration": 2.7311789989471436 + }, + { + "auxiliary_loss_clip": 0.01446807, + "auxiliary_loss_mlp": 0.00293422, + "balance_loss_clip": 1.18532193, + "balance_loss_mlp": 0.25959027, + "epoch": 0.4319855704193597, + "flos": 30954674701440.0, + "grad_norm": 4.60213892927688, + "language_loss": 0.78348213, + "learning_rate": 2.5311994288165474e-06, + "loss": 0.80088449, + "num_input_tokens_seen": 154138395, + "router_z_loss_clip": 2.6171875, + "router_z_loss_mlp": 0.33837891, + "step": 7185, + "time_per_iteration": 2.731214761734009 + }, + { + "auxiliary_loss_clip": 0.01456488, + "auxiliary_loss_mlp": 0.00283047, + "balance_loss_clip": 1.19026899, + "balance_loss_mlp": 0.24518618, + "epoch": 0.43204569367202766, + "flos": 24238732993920.0, + "grad_norm": 79.95896151293607, + "language_loss": 0.84374297, + "learning_rate": 2.530823945207421e-06, + "loss": 0.86113834, + "num_input_tokens_seen": 154156775, + "router_z_loss_clip": 2.65820312, + "router_z_loss_mlp": 0.37841797, + "step": 7186, + "time_per_iteration": 2.7694082260131836 + }, + { + "auxiliary_loss_clip": 0.01454381, + "auxiliary_loss_mlp": 0.00266866, + "balance_loss_clip": 1.18760681, + "balance_loss_mlp": 0.23541833, + "epoch": 0.43210581692469563, + "flos": 18406948561920.0, + "grad_norm": 5.447344360697795, + "language_loss": 0.82219481, + "learning_rate": 2.5304484414693962e-06, + "loss": 0.83940727, + "num_input_tokens_seen": 154177500, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.31445312, + "step": 7187, + "time_per_iteration": 2.73030948638916 + }, + { + "auxiliary_loss_clip": 0.01395332, + "auxiliary_loss_mlp": 0.00073325, + "balance_loss_clip": 1.17715478, + "balance_loss_mlp": 0.05749412, + "epoch": 0.4321659401773636, + "flos": 49832378910720.0, + "grad_norm": 0.842075485169368, + "language_loss": 0.67770946, + "learning_rate": 2.530072917616714e-06, + "loss": 0.69239604, + "num_input_tokens_seen": 154237110, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.15820312, + "step": 7188, + "time_per_iteration": 3.1595420837402344 + }, + { + "auxiliary_loss_clip": 0.01435027, + "auxiliary_loss_mlp": 0.00253924, + "balance_loss_clip": 1.17219186, + "balance_loss_mlp": 0.22171409, + "epoch": 0.43222606343003156, + "flos": 17128564542720.0, + "grad_norm": 441.2249633359303, + "language_loss": 0.84926099, + "learning_rate": 2.529697373663614e-06, + "loss": 0.86615056, + "num_input_tokens_seen": 154253910, + "router_z_loss_clip": 2.62695312, + "router_z_loss_mlp": 0.32226562, + "step": 7189, + "time_per_iteration": 2.6554195880889893 + }, + { + "auxiliary_loss_clip": 0.01461649, + "auxiliary_loss_mlp": 0.00287315, + "balance_loss_clip": 1.19228899, + "balance_loss_mlp": 0.25081325, + "epoch": 0.4322861866826995, + "flos": 22749949059840.0, + "grad_norm": 9.447335287226995, + "language_loss": 0.79178882, + "learning_rate": 2.5293218096243364e-06, + "loss": 0.80927849, + "num_input_tokens_seen": 154274770, + "router_z_loss_clip": 2.69335938, + "router_z_loss_mlp": 0.36523438, + "step": 7190, + "time_per_iteration": 2.679560422897339 + }, + { + "auxiliary_loss_clip": 0.01426201, + "auxiliary_loss_mlp": 0.00248833, + "balance_loss_clip": 1.16558385, + "balance_loss_mlp": 0.21724211, + "epoch": 0.4323463099353675, + "flos": 27891925729920.0, + "grad_norm": 11.168723045770053, + "language_loss": 0.86012751, + "learning_rate": 2.5289462255131223e-06, + "loss": 0.87687784, + "num_input_tokens_seen": 154295035, + "router_z_loss_clip": 2.609375, + "router_z_loss_mlp": 0.31567383, + "step": 7191, + "time_per_iteration": 2.690704584121704 + }, + { + "auxiliary_loss_clip": 0.0143051, + "auxiliary_loss_mlp": 0.00253697, + "balance_loss_clip": 1.16996622, + "balance_loss_mlp": 0.22041389, + "epoch": 0.43240643318803546, + "flos": 21614740652160.0, + "grad_norm": 40.65194698389577, + "language_loss": 0.82995749, + "learning_rate": 2.5285706213442146e-06, + "loss": 0.84679955, + "num_input_tokens_seen": 154314905, + "router_z_loss_clip": 2.60546875, + "router_z_loss_mlp": 0.33251953, + "step": 7192, + "time_per_iteration": 2.687068462371826 + }, + { + "auxiliary_loss_clip": 0.01456987, + "auxiliary_loss_mlp": 0.00279588, + "balance_loss_clip": 1.1932745, + "balance_loss_mlp": 0.24396834, + "epoch": 0.4324665564407034, + "flos": 17558378686080.0, + "grad_norm": 600.5018024235204, + "language_loss": 0.86684823, + "learning_rate": 2.5281949971318557e-06, + "loss": 0.88421398, + "num_input_tokens_seen": 154331740, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.35620117, + "step": 7193, + "time_per_iteration": 4.009444952011108 + }, + { + "auxiliary_loss_clip": 0.01425806, + "auxiliary_loss_mlp": 0.00268568, + "balance_loss_clip": 1.16502035, + "balance_loss_mlp": 0.2329479, + "epoch": 0.4325266796933714, + "flos": 18402423448320.0, + "grad_norm": 26.484595985183503, + "language_loss": 0.84366411, + "learning_rate": 2.5278193528902897e-06, + "loss": 0.86060786, + "num_input_tokens_seen": 154348740, + "router_z_loss_clip": 2.60742188, + "router_z_loss_mlp": 0.35668945, + "step": 7194, + "time_per_iteration": 2.6324357986450195 + }, + { + "auxiliary_loss_clip": 0.01462997, + "auxiliary_loss_mlp": 0.00275779, + "balance_loss_clip": 1.19741321, + "balance_loss_mlp": 0.2437351, + "epoch": 0.4325868029460394, + "flos": 22564793427840.0, + "grad_norm": 20.19416897130306, + "language_loss": 0.68506193, + "learning_rate": 2.5274436886337613e-06, + "loss": 0.70244968, + "num_input_tokens_seen": 154368835, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.32055664, + "step": 7195, + "time_per_iteration": 4.13252067565918 + }, + { + "auxiliary_loss_clip": 0.01455698, + "auxiliary_loss_mlp": 0.00272196, + "balance_loss_clip": 1.18673968, + "balance_loss_mlp": 0.23986597, + "epoch": 0.43264692619870737, + "flos": 14605516396800.0, + "grad_norm": 13.828701609799227, + "language_loss": 0.76482379, + "learning_rate": 2.527068004376515e-06, + "loss": 0.7821027, + "num_input_tokens_seen": 154384620, + "router_z_loss_clip": 2.69140625, + "router_z_loss_mlp": 0.32324219, + "step": 7196, + "time_per_iteration": 2.6613595485687256 + }, + { + "auxiliary_loss_clip": 0.01453102, + "auxiliary_loss_mlp": 0.00288667, + "balance_loss_clip": 1.18631709, + "balance_loss_mlp": 0.25419188, + "epoch": 0.43270704945137534, + "flos": 21501657659520.0, + "grad_norm": 5.8836820624291315, + "language_loss": 0.78690773, + "learning_rate": 2.526692300132797e-06, + "loss": 0.80432546, + "num_input_tokens_seen": 154402865, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.34472656, + "step": 7197, + "time_per_iteration": 2.673870325088501 + }, + { + "auxiliary_loss_clip": 0.01441989, + "auxiliary_loss_mlp": 0.00272137, + "balance_loss_clip": 1.18103766, + "balance_loss_mlp": 0.23809057, + "epoch": 0.4327671727040433, + "flos": 25155891889920.0, + "grad_norm": 7.060082757211363, + "language_loss": 0.78107178, + "learning_rate": 2.5263165759168547e-06, + "loss": 0.79821301, + "num_input_tokens_seen": 154423625, + "router_z_loss_clip": 2.61328125, + "router_z_loss_mlp": 0.34057617, + "step": 7198, + "time_per_iteration": 4.0738301277160645 + }, + { + "auxiliary_loss_clip": 0.01436076, + "auxiliary_loss_mlp": 0.00259264, + "balance_loss_clip": 1.17446291, + "balance_loss_mlp": 0.22698174, + "epoch": 0.43282729595671127, + "flos": 25447163276160.0, + "grad_norm": 9.56450311146034, + "language_loss": 0.86200112, + "learning_rate": 2.525940831742934e-06, + "loss": 0.87895453, + "num_input_tokens_seen": 154444775, + "router_z_loss_clip": 2.61328125, + "router_z_loss_mlp": 0.32250977, + "step": 7199, + "time_per_iteration": 2.704907178878784 + }, + { + "auxiliary_loss_clip": 0.01449737, + "auxiliary_loss_mlp": 0.00240935, + "balance_loss_clip": 1.18876767, + "balance_loss_mlp": 0.21034542, + "epoch": 0.43288741920937923, + "flos": 24126116878080.0, + "grad_norm": 27.181625789653122, + "language_loss": 0.75269544, + "learning_rate": 2.525565067625286e-06, + "loss": 0.76960212, + "num_input_tokens_seen": 154460815, + "router_z_loss_clip": 2.61328125, + "router_z_loss_mlp": 0.3059082, + "step": 7200, + "time_per_iteration": 2.6408040523529053 + }, + { + "auxiliary_loss_clip": 0.01460342, + "auxiliary_loss_mlp": 0.00260752, + "balance_loss_clip": 1.19230592, + "balance_loss_mlp": 0.22415429, + "epoch": 0.4329475424620472, + "flos": 19204955066880.0, + "grad_norm": 4.29949919949412, + "language_loss": 0.9463104, + "learning_rate": 2.525189283578157e-06, + "loss": 0.9635213, + "num_input_tokens_seen": 154479145, + "router_z_loss_clip": 2.67773438, + "router_z_loss_mlp": 0.36621094, + "step": 7201, + "time_per_iteration": 2.6874680519104004 + }, + { + "auxiliary_loss_clip": 0.01467047, + "auxiliary_loss_mlp": 0.00294465, + "balance_loss_clip": 1.19434047, + "balance_loss_mlp": 0.25619882, + "epoch": 0.43300766571471516, + "flos": 22638374438400.0, + "grad_norm": 3.6340710272290484, + "language_loss": 0.72850275, + "learning_rate": 2.5248134796157974e-06, + "loss": 0.74611795, + "num_input_tokens_seen": 154498905, + "router_z_loss_clip": 2.72460938, + "router_z_loss_mlp": 0.38305664, + "step": 7202, + "time_per_iteration": 2.677262783050537 + }, + { + "auxiliary_loss_clip": 0.01466043, + "auxiliary_loss_mlp": 0.00280255, + "balance_loss_clip": 1.20251703, + "balance_loss_mlp": 0.24735323, + "epoch": 0.4330677889673831, + "flos": 22121080721280.0, + "grad_norm": 8.422063697389142, + "language_loss": 0.87548876, + "learning_rate": 2.5244376557524586e-06, + "loss": 0.89295173, + "num_input_tokens_seen": 154517270, + "router_z_loss_clip": 2.63476562, + "router_z_loss_mlp": 0.32861328, + "step": 7203, + "time_per_iteration": 4.102859735488892 + }, + { + "auxiliary_loss_clip": 0.0145518, + "auxiliary_loss_mlp": 0.00268621, + "balance_loss_clip": 1.18508315, + "balance_loss_mlp": 0.23254836, + "epoch": 0.4331279122200511, + "flos": 23221527742080.0, + "grad_norm": 4.536724385730094, + "language_loss": 0.89160109, + "learning_rate": 2.5240618120023912e-06, + "loss": 0.90883911, + "num_input_tokens_seen": 154535945, + "router_z_loss_clip": 2.70117188, + "router_z_loss_mlp": 0.36083984, + "step": 7204, + "time_per_iteration": 2.6761093139648438 + }, + { + "auxiliary_loss_clip": 0.01451892, + "auxiliary_loss_mlp": 0.00276898, + "balance_loss_clip": 1.18587911, + "balance_loss_mlp": 0.24306688, + "epoch": 0.43318803547271906, + "flos": 18259750627200.0, + "grad_norm": 6.755103397887986, + "language_loss": 0.82254851, + "learning_rate": 2.5236859483798468e-06, + "loss": 0.83983648, + "num_input_tokens_seen": 154554935, + "router_z_loss_clip": 2.66015625, + "router_z_loss_mlp": 0.33813477, + "step": 7205, + "time_per_iteration": 2.598172426223755 + }, + { + "auxiliary_loss_clip": 0.01455564, + "auxiliary_loss_mlp": 0.0025219, + "balance_loss_clip": 1.19678903, + "balance_loss_mlp": 0.22021803, + "epoch": 0.433248158725387, + "flos": 27418407713280.0, + "grad_norm": 2.4844686035839265, + "language_loss": 0.8093369, + "learning_rate": 2.5233100648990803e-06, + "loss": 0.82641447, + "num_input_tokens_seen": 154576065, + "router_z_loss_clip": 2.5859375, + "router_z_loss_mlp": 0.31933594, + "step": 7206, + "time_per_iteration": 2.769469976425171 + }, + { + "auxiliary_loss_clip": 0.01448681, + "auxiliary_loss_mlp": 0.00250931, + "balance_loss_clip": 1.1844753, + "balance_loss_mlp": 0.21874449, + "epoch": 0.433308281978055, + "flos": 23218008209280.0, + "grad_norm": 14.900468760717507, + "language_loss": 0.85908628, + "learning_rate": 2.522934161574342e-06, + "loss": 0.87608242, + "num_input_tokens_seen": 154595110, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.32226562, + "step": 7207, + "time_per_iteration": 2.730645179748535 + }, + { + "auxiliary_loss_clip": 0.01475425, + "auxiliary_loss_mlp": 0.00276438, + "balance_loss_clip": 1.19880009, + "balance_loss_mlp": 0.24086568, + "epoch": 0.433368405230723, + "flos": 15852407166720.0, + "grad_norm": 4.286138229054606, + "language_loss": 0.87131894, + "learning_rate": 2.5225582384198888e-06, + "loss": 0.88883758, + "num_input_tokens_seen": 154612255, + "router_z_loss_clip": 2.76757812, + "router_z_loss_mlp": 0.35571289, + "step": 7208, + "time_per_iteration": 2.6528282165527344 + }, + { + "auxiliary_loss_clip": 0.01461605, + "auxiliary_loss_mlp": 0.00249399, + "balance_loss_clip": 1.19468713, + "balance_loss_mlp": 0.21709329, + "epoch": 0.433428528483391, + "flos": 19026084314880.0, + "grad_norm": 8.182674416468378, + "language_loss": 0.80217075, + "learning_rate": 2.5221822954499744e-06, + "loss": 0.8192808, + "num_input_tokens_seen": 154630440, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.32299805, + "step": 7209, + "time_per_iteration": 2.619515895843506 + }, + { + "auxiliary_loss_clip": 0.01463935, + "auxiliary_loss_mlp": 0.00261633, + "balance_loss_clip": 1.19801235, + "balance_loss_mlp": 0.2267528, + "epoch": 0.43348865173605894, + "flos": 24718248581760.0, + "grad_norm": 2.957464644205263, + "language_loss": 0.87027657, + "learning_rate": 2.5218063326788557e-06, + "loss": 0.88753223, + "num_input_tokens_seen": 154652515, + "router_z_loss_clip": 2.66015625, + "router_z_loss_mlp": 0.34912109, + "step": 7210, + "time_per_iteration": 2.7165539264678955 + }, + { + "auxiliary_loss_clip": 0.01465441, + "auxiliary_loss_mlp": 0.0024686, + "balance_loss_clip": 1.20093036, + "balance_loss_mlp": 0.2131478, + "epoch": 0.4335487749887269, + "flos": 22090664880000.0, + "grad_norm": 15.842570537534623, + "language_loss": 0.87691689, + "learning_rate": 2.5214303501207885e-06, + "loss": 0.89403993, + "num_input_tokens_seen": 154670965, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.33666992, + "step": 7211, + "time_per_iteration": 2.632728338241577 + }, + { + "auxiliary_loss_clip": 0.01466509, + "auxiliary_loss_mlp": 0.00260437, + "balance_loss_clip": 1.19606304, + "balance_loss_mlp": 0.22619966, + "epoch": 0.43360889824139487, + "flos": 22382941847040.0, + "grad_norm": 6.346530041723591, + "language_loss": 0.82880175, + "learning_rate": 2.521054347790029e-06, + "loss": 0.84607118, + "num_input_tokens_seen": 154689980, + "router_z_loss_clip": 2.70507812, + "router_z_loss_mlp": 0.34228516, + "step": 7212, + "time_per_iteration": 2.693483352661133 + }, + { + "auxiliary_loss_clip": 0.01470597, + "auxiliary_loss_mlp": 0.00279345, + "balance_loss_clip": 1.20031476, + "balance_loss_mlp": 0.24506073, + "epoch": 0.43366902149406283, + "flos": 17528286067200.0, + "grad_norm": 55.27549230758034, + "language_loss": 0.81984901, + "learning_rate": 2.5206783257008375e-06, + "loss": 0.83734846, + "num_input_tokens_seen": 154706570, + "router_z_loss_clip": 2.69921875, + "router_z_loss_mlp": 0.34301758, + "step": 7213, + "time_per_iteration": 2.7501580715179443 + }, + { + "auxiliary_loss_clip": 0.01473034, + "auxiliary_loss_mlp": 0.00271124, + "balance_loss_clip": 1.20248342, + "balance_loss_mlp": 0.23641074, + "epoch": 0.4337291447467308, + "flos": 19022672522880.0, + "grad_norm": 7.19928023727967, + "language_loss": 0.69863546, + "learning_rate": 2.520302283867471e-06, + "loss": 0.71607709, + "num_input_tokens_seen": 154725210, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.34741211, + "step": 7214, + "time_per_iteration": 2.7224783897399902 + }, + { + "auxiliary_loss_clip": 0.01468273, + "auxiliary_loss_mlp": 0.00261182, + "balance_loss_clip": 1.20382583, + "balance_loss_mlp": 0.22789858, + "epoch": 0.43378926799939876, + "flos": 27234042180480.0, + "grad_norm": 28.011533508938165, + "language_loss": 0.76946074, + "learning_rate": 2.519926222304191e-06, + "loss": 0.78675526, + "num_input_tokens_seen": 154745945, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.33276367, + "step": 7215, + "time_per_iteration": 2.8568344116210938 + }, + { + "auxiliary_loss_clip": 0.01472735, + "auxiliary_loss_mlp": 0.00234974, + "balance_loss_clip": 1.20533586, + "balance_loss_mlp": 0.2014288, + "epoch": 0.43384939125206673, + "flos": 15961108700160.0, + "grad_norm": 248.30100575089963, + "language_loss": 0.82158458, + "learning_rate": 2.519550141025255e-06, + "loss": 0.83866167, + "num_input_tokens_seen": 154763580, + "router_z_loss_clip": 2.67382812, + "router_z_loss_mlp": 0.33544922, + "step": 7216, + "time_per_iteration": 2.6499781608581543 + }, + { + "auxiliary_loss_clip": 0.01499883, + "auxiliary_loss_mlp": 0.00285019, + "balance_loss_clip": 1.2198478, + "balance_loss_mlp": 0.24837378, + "epoch": 0.4339095145047347, + "flos": 21793216354560.0, + "grad_norm": 20.793992988877946, + "language_loss": 0.86248839, + "learning_rate": 2.519174040044927e-06, + "loss": 0.88033742, + "num_input_tokens_seen": 154776825, + "router_z_loss_clip": 2.8046875, + "router_z_loss_mlp": 0.36645508, + "step": 7217, + "time_per_iteration": 2.63392972946167 + }, + { + "auxiliary_loss_clip": 0.01472926, + "auxiliary_loss_mlp": 0.00249057, + "balance_loss_clip": 1.19776368, + "balance_loss_mlp": 0.21517786, + "epoch": 0.43396963775740266, + "flos": 14209853109120.0, + "grad_norm": 14.574209304261995, + "language_loss": 0.81362033, + "learning_rate": 2.5187979193774664e-06, + "loss": 0.83084011, + "num_input_tokens_seen": 154794025, + "router_z_loss_clip": 2.75195312, + "router_z_loss_mlp": 0.33862305, + "step": 7218, + "time_per_iteration": 2.6557343006134033 + }, + { + "auxiliary_loss_clip": 0.01485908, + "auxiliary_loss_mlp": 0.00281997, + "balance_loss_clip": 1.20737696, + "balance_loss_mlp": 0.24647222, + "epoch": 0.4340297610100706, + "flos": 19719052473600.0, + "grad_norm": 9.556305237400233, + "language_loss": 0.78871155, + "learning_rate": 2.5184217790371367e-06, + "loss": 0.80639064, + "num_input_tokens_seen": 154813105, + "router_z_loss_clip": 2.78710938, + "router_z_loss_mlp": 0.35498047, + "step": 7219, + "time_per_iteration": 2.668442487716675 + }, + { + "auxiliary_loss_clip": 0.01464874, + "auxiliary_loss_mlp": 0.0026, + "balance_loss_clip": 1.19609332, + "balance_loss_mlp": 0.22380808, + "epoch": 0.4340898842627386, + "flos": 18953508885120.0, + "grad_norm": 4.257626854209452, + "language_loss": 0.8202225, + "learning_rate": 2.518045619038202e-06, + "loss": 0.83747119, + "num_input_tokens_seen": 154833525, + "router_z_loss_clip": 2.69140625, + "router_z_loss_mlp": 0.36230469, + "step": 7220, + "time_per_iteration": 2.6939616203308105 + }, + { + "auxiliary_loss_clip": 0.0147538, + "auxiliary_loss_mlp": 0.0025398, + "balance_loss_clip": 1.20238256, + "balance_loss_mlp": 0.22219884, + "epoch": 0.4341500075154066, + "flos": 22018304931840.0, + "grad_norm": 50.7191474547803, + "language_loss": 0.78486532, + "learning_rate": 2.5176694393949243e-06, + "loss": 0.80215895, + "num_input_tokens_seen": 154853090, + "router_z_loss_clip": 2.73242188, + "router_z_loss_mlp": 0.31762695, + "step": 7221, + "time_per_iteration": 2.690606117248535 + }, + { + "auxiliary_loss_clip": 0.01496177, + "auxiliary_loss_mlp": 0.00232631, + "balance_loss_clip": 1.2193737, + "balance_loss_mlp": 0.196081, + "epoch": 0.4342101307680746, + "flos": 23582465556480.0, + "grad_norm": 34.841689521888284, + "language_loss": 0.72317195, + "learning_rate": 2.51729324012157e-06, + "loss": 0.74046004, + "num_input_tokens_seen": 154872055, + "router_z_loss_clip": 2.76757812, + "router_z_loss_mlp": 0.36547852, + "step": 7222, + "time_per_iteration": 2.6492228507995605 + }, + { + "auxiliary_loss_clip": 0.01476984, + "auxiliary_loss_mlp": 0.00265703, + "balance_loss_clip": 1.20488727, + "balance_loss_mlp": 0.23079875, + "epoch": 0.43427025402074254, + "flos": 17967976450560.0, + "grad_norm": 69.84269914708423, + "language_loss": 0.81120008, + "learning_rate": 2.5169170212324053e-06, + "loss": 0.82862687, + "num_input_tokens_seen": 154886645, + "router_z_loss_clip": 2.72070312, + "router_z_loss_mlp": 0.34887695, + "step": 7223, + "time_per_iteration": 2.6126503944396973 + }, + { + "auxiliary_loss_clip": 0.01488453, + "auxiliary_loss_mlp": 0.00281887, + "balance_loss_clip": 1.21061158, + "balance_loss_mlp": 0.24583842, + "epoch": 0.4343303772734105, + "flos": 26286395616000.0, + "grad_norm": 24.675850933108137, + "language_loss": 1.0111773, + "learning_rate": 2.516540782741694e-06, + "loss": 1.02888072, + "num_input_tokens_seen": 154906775, + "router_z_loss_clip": 2.77929688, + "router_z_loss_mlp": 0.36083984, + "step": 7224, + "time_per_iteration": 2.6850273609161377 + }, + { + "auxiliary_loss_clip": 0.01472357, + "auxiliary_loss_mlp": 0.00248247, + "balance_loss_clip": 1.20032048, + "balance_loss_mlp": 0.21522591, + "epoch": 0.43439050052607847, + "flos": 26833961520000.0, + "grad_norm": 3.0448093039055943, + "language_loss": 0.65397775, + "learning_rate": 2.5161645246637056e-06, + "loss": 0.67118382, + "num_input_tokens_seen": 154926990, + "router_z_loss_clip": 2.71875, + "router_z_loss_mlp": 0.33032227, + "step": 7225, + "time_per_iteration": 2.6925857067108154 + }, + { + "auxiliary_loss_clip": 0.01488514, + "auxiliary_loss_mlp": 0.00293365, + "balance_loss_clip": 1.21083188, + "balance_loss_mlp": 0.25831753, + "epoch": 0.43445062377874644, + "flos": 21397660807680.0, + "grad_norm": 36.17282102409196, + "language_loss": 0.86051691, + "learning_rate": 2.5157882470127054e-06, + "loss": 0.87833571, + "num_input_tokens_seen": 154946210, + "router_z_loss_clip": 2.7734375, + "router_z_loss_mlp": 0.35009766, + "step": 7226, + "time_per_iteration": 2.6582775115966797 + }, + { + "auxiliary_loss_clip": 0.01469929, + "auxiliary_loss_mlp": 0.00236799, + "balance_loss_clip": 1.20092559, + "balance_loss_mlp": 0.20468396, + "epoch": 0.4345107470314144, + "flos": 19901945548800.0, + "grad_norm": 2.606903186525605, + "language_loss": 0.9113239, + "learning_rate": 2.515411949802964e-06, + "loss": 0.92839116, + "num_input_tokens_seen": 154964995, + "router_z_loss_clip": 2.69140625, + "router_z_loss_mlp": 0.32104492, + "step": 7227, + "time_per_iteration": 2.671858072280884 + }, + { + "auxiliary_loss_clip": 0.01484427, + "auxiliary_loss_mlp": 0.00275489, + "balance_loss_clip": 1.21354377, + "balance_loss_mlp": 0.24200308, + "epoch": 0.43457087028408237, + "flos": 26432623883520.0, + "grad_norm": 1297.8232711124074, + "language_loss": 0.86026657, + "learning_rate": 2.5150356330487498e-06, + "loss": 0.87786573, + "num_input_tokens_seen": 154984775, + "router_z_loss_clip": 2.7109375, + "router_z_loss_mlp": 0.33508301, + "step": 7228, + "time_per_iteration": 2.696002244949341 + }, + { + "auxiliary_loss_clip": 0.01481781, + "auxiliary_loss_mlp": 0.00260911, + "balance_loss_clip": 1.21212244, + "balance_loss_mlp": 0.22748516, + "epoch": 0.43463099353675033, + "flos": 31868816855040.0, + "grad_norm": 156.97652655914985, + "language_loss": 0.87416697, + "learning_rate": 2.5146592967643324e-06, + "loss": 0.89159387, + "num_input_tokens_seen": 155008125, + "router_z_loss_clip": 2.69726562, + "router_z_loss_mlp": 0.33422852, + "step": 7229, + "time_per_iteration": 2.7644529342651367 + }, + { + "auxiliary_loss_clip": 0.01506573, + "auxiliary_loss_mlp": 0.00274352, + "balance_loss_clip": 1.22629845, + "balance_loss_mlp": 0.23866047, + "epoch": 0.4346911167894183, + "flos": 24571266128640.0, + "grad_norm": 3.2121100152395723, + "language_loss": 0.89152098, + "learning_rate": 2.5142829409639834e-06, + "loss": 0.90933025, + "num_input_tokens_seen": 155027885, + "router_z_loss_clip": 2.80273438, + "router_z_loss_mlp": 0.35693359, + "step": 7230, + "time_per_iteration": 2.691286087036133 + }, + { + "auxiliary_loss_clip": 0.01503795, + "auxiliary_loss_mlp": 0.00288568, + "balance_loss_clip": 1.2214179, + "balance_loss_mlp": 0.25349662, + "epoch": 0.43475124004208626, + "flos": 17090678672640.0, + "grad_norm": 25.874421708878987, + "language_loss": 0.84650004, + "learning_rate": 2.513906565661973e-06, + "loss": 0.86442363, + "num_input_tokens_seen": 155043375, + "router_z_loss_clip": 2.82617188, + "router_z_loss_mlp": 0.35058594, + "step": 7231, + "time_per_iteration": 2.5903193950653076 + }, + { + "auxiliary_loss_clip": 0.01479272, + "auxiliary_loss_mlp": 0.00274155, + "balance_loss_clip": 1.20797491, + "balance_loss_mlp": 0.2431131, + "epoch": 0.4348113632947542, + "flos": 26104615862400.0, + "grad_norm": 136.3340014625959, + "language_loss": 0.744156, + "learning_rate": 2.513530170872575e-06, + "loss": 0.76169026, + "num_input_tokens_seen": 155062930, + "router_z_loss_clip": 2.71679688, + "router_z_loss_mlp": 0.31005859, + "step": 7232, + "time_per_iteration": 2.726602792739868 + }, + { + "auxiliary_loss_clip": 0.01503802, + "auxiliary_loss_mlp": 0.00263931, + "balance_loss_clip": 1.22659302, + "balance_loss_mlp": 0.22764358, + "epoch": 0.4348714865474222, + "flos": 34200496316160.0, + "grad_norm": 40.60027612009066, + "language_loss": 0.77433813, + "learning_rate": 2.5131537566100605e-06, + "loss": 0.79201543, + "num_input_tokens_seen": 155084980, + "router_z_loss_clip": 2.76953125, + "router_z_loss_mlp": 0.36303711, + "step": 7233, + "time_per_iteration": 2.7750394344329834 + }, + { + "auxiliary_loss_clip": 0.0147136, + "auxiliary_loss_mlp": 0.00272621, + "balance_loss_clip": 1.19568217, + "balance_loss_mlp": 0.23776408, + "epoch": 0.43493160980009016, + "flos": 31537468869120.0, + "grad_norm": 4.181142762232191, + "language_loss": 0.80146086, + "learning_rate": 2.5127773228887053e-06, + "loss": 0.8189007, + "num_input_tokens_seen": 155107260, + "router_z_loss_clip": 2.75585938, + "router_z_loss_mlp": 0.34814453, + "step": 7234, + "time_per_iteration": 2.740543842315674 + }, + { + "auxiliary_loss_clip": 0.01503171, + "auxiliary_loss_mlp": 0.00308792, + "balance_loss_clip": 1.21785498, + "balance_loss_mlp": 0.27403039, + "epoch": 0.4349917330527582, + "flos": 24061334699520.0, + "grad_norm": 2.558584721870834, + "language_loss": 0.68066752, + "learning_rate": 2.512400869722782e-06, + "loss": 0.69878715, + "num_input_tokens_seen": 155126720, + "router_z_loss_clip": 2.8515625, + "router_z_loss_mlp": 0.34765625, + "step": 7235, + "time_per_iteration": 4.086630582809448 + }, + { + "auxiliary_loss_clip": 0.01498897, + "auxiliary_loss_mlp": 0.0027903, + "balance_loss_clip": 1.22350717, + "balance_loss_mlp": 0.24660493, + "epoch": 0.43505185630542614, + "flos": 30519329863680.0, + "grad_norm": 5.518399378105568, + "language_loss": 0.8128767, + "learning_rate": 2.512024397126566e-06, + "loss": 0.83065605, + "num_input_tokens_seen": 155148640, + "router_z_loss_clip": 2.75390625, + "router_z_loss_mlp": 0.32421875, + "step": 7236, + "time_per_iteration": 2.708411455154419 + }, + { + "auxiliary_loss_clip": 0.01508223, + "auxiliary_loss_mlp": 0.00271105, + "balance_loss_clip": 1.22948146, + "balance_loss_mlp": 0.2359619, + "epoch": 0.4351119795580941, + "flos": 15735158196480.0, + "grad_norm": 6.932034468039975, + "language_loss": 0.87028337, + "learning_rate": 2.5116479051143345e-06, + "loss": 0.88807666, + "num_input_tokens_seen": 155165870, + "router_z_loss_clip": 2.78515625, + "router_z_loss_mlp": 0.35180664, + "step": 7237, + "time_per_iteration": 4.101562738418579 + }, + { + "auxiliary_loss_clip": 0.01505258, + "auxiliary_loss_mlp": 0.00271948, + "balance_loss_clip": 1.22842813, + "balance_loss_mlp": 0.23926088, + "epoch": 0.4351721028107621, + "flos": 18731760272640.0, + "grad_norm": 23.14813408420282, + "language_loss": 0.70289534, + "learning_rate": 2.5112713937003623e-06, + "loss": 0.72066742, + "num_input_tokens_seen": 155185315, + "router_z_loss_clip": 2.76953125, + "router_z_loss_mlp": 0.3269043, + "step": 7238, + "time_per_iteration": 2.6839544773101807 + }, + { + "auxiliary_loss_clip": 0.01518874, + "auxiliary_loss_mlp": 0.00271367, + "balance_loss_clip": 1.23987007, + "balance_loss_mlp": 0.23720147, + "epoch": 0.43523222606343004, + "flos": 25226887121280.0, + "grad_norm": 5.957520849855805, + "language_loss": 0.89955866, + "learning_rate": 2.510894862898928e-06, + "loss": 0.91746104, + "num_input_tokens_seen": 155205790, + "router_z_loss_clip": 2.79101562, + "router_z_loss_mlp": 0.34179688, + "step": 7239, + "time_per_iteration": 2.6861603260040283 + }, + { + "auxiliary_loss_clip": 0.01520621, + "auxiliary_loss_mlp": 0.00277219, + "balance_loss_clip": 1.23740447, + "balance_loss_mlp": 0.23964468, + "epoch": 0.435292349316098, + "flos": 22709190101760.0, + "grad_norm": 7.837466372449419, + "language_loss": 0.79394931, + "learning_rate": 2.510518312724309e-06, + "loss": 0.81192774, + "num_input_tokens_seen": 155226475, + "router_z_loss_clip": 2.83398438, + "router_z_loss_mlp": 0.37573242, + "step": 7240, + "time_per_iteration": 4.116159677505493 + }, + { + "auxiliary_loss_clip": 0.01514598, + "auxiliary_loss_mlp": 0.00294589, + "balance_loss_clip": 1.22996032, + "balance_loss_mlp": 0.25715679, + "epoch": 0.43535247256876597, + "flos": 25775889569280.0, + "grad_norm": 2.6922513441178477, + "language_loss": 0.88963515, + "learning_rate": 2.5101417431907842e-06, + "loss": 0.907727, + "num_input_tokens_seen": 155247110, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 0.37426758, + "step": 7241, + "time_per_iteration": 2.682924747467041 + }, + { + "auxiliary_loss_clip": 0.01513721, + "auxiliary_loss_mlp": 0.00293031, + "balance_loss_clip": 1.22815514, + "balance_loss_mlp": 0.2580786, + "epoch": 0.43541259582143393, + "flos": 17528142412800.0, + "grad_norm": 8.282638733423934, + "language_loss": 0.88402253, + "learning_rate": 2.5097651543126345e-06, + "loss": 0.90209007, + "num_input_tokens_seen": 155261335, + "router_z_loss_clip": 2.85742188, + "router_z_loss_mlp": 0.34912109, + "step": 7242, + "time_per_iteration": 2.6380200386047363 + }, + { + "auxiliary_loss_clip": 0.01513703, + "auxiliary_loss_mlp": 0.00274172, + "balance_loss_clip": 1.22941363, + "balance_loss_mlp": 0.23986372, + "epoch": 0.4354727190741019, + "flos": 15195205975680.0, + "grad_norm": 70.54150896605418, + "language_loss": 0.81026554, + "learning_rate": 2.509388546104138e-06, + "loss": 0.82814431, + "num_input_tokens_seen": 155278510, + "router_z_loss_clip": 2.84179688, + "router_z_loss_mlp": 0.34301758, + "step": 7243, + "time_per_iteration": 2.6175479888916016 + }, + { + "auxiliary_loss_clip": 0.01505607, + "auxiliary_loss_mlp": 0.00282084, + "balance_loss_clip": 1.22935665, + "balance_loss_mlp": 0.24718, + "epoch": 0.43553284232676986, + "flos": 16649264436480.0, + "grad_norm": 32.31693551133028, + "language_loss": 0.88335305, + "learning_rate": 2.5090119185795766e-06, + "loss": 0.90122998, + "num_input_tokens_seen": 155296450, + "router_z_loss_clip": 2.75976562, + "router_z_loss_mlp": 0.34936523, + "step": 7244, + "time_per_iteration": 2.609377145767212 + }, + { + "auxiliary_loss_clip": 0.01504406, + "auxiliary_loss_mlp": 0.00254358, + "balance_loss_clip": 1.22746611, + "balance_loss_mlp": 0.21838032, + "epoch": 0.43559296557943783, + "flos": 23400865370880.0, + "grad_norm": 8.250903309492404, + "language_loss": 0.79400063, + "learning_rate": 2.508635271753234e-06, + "loss": 0.81158823, + "num_input_tokens_seen": 155316080, + "router_z_loss_clip": 2.77148438, + "router_z_loss_mlp": 0.36010742, + "step": 7245, + "time_per_iteration": 4.110187292098999 + }, + { + "auxiliary_loss_clip": 0.01531994, + "auxiliary_loss_mlp": 0.00260223, + "balance_loss_clip": 1.24657714, + "balance_loss_mlp": 0.22693995, + "epoch": 0.4356530888321058, + "flos": 22419067950720.0, + "grad_norm": 57.313039612845586, + "language_loss": 0.84871042, + "learning_rate": 2.508258605639389e-06, + "loss": 0.86663258, + "num_input_tokens_seen": 155336765, + "router_z_loss_clip": 2.8515625, + "router_z_loss_mlp": 0.33251953, + "step": 7246, + "time_per_iteration": 2.6899514198303223 + }, + { + "auxiliary_loss_clip": 0.01533918, + "auxiliary_loss_mlp": 0.00299713, + "balance_loss_clip": 1.24891615, + "balance_loss_mlp": 0.26554808, + "epoch": 0.43571321208477376, + "flos": 21616141282560.0, + "grad_norm": 5.100915358455112, + "language_loss": 0.91922832, + "learning_rate": 2.5078819202523275e-06, + "loss": 0.93756473, + "num_input_tokens_seen": 155356440, + "router_z_loss_clip": 2.8515625, + "router_z_loss_mlp": 0.34155273, + "step": 7247, + "time_per_iteration": 2.650034189224243 + }, + { + "auxiliary_loss_clip": 0.01519627, + "auxiliary_loss_mlp": 0.00302286, + "balance_loss_clip": 1.23838544, + "balance_loss_mlp": 0.26845443, + "epoch": 0.4357733353374418, + "flos": 23987358639360.0, + "grad_norm": 6.435234475624503, + "language_loss": 0.77858168, + "learning_rate": 2.507505215606333e-06, + "loss": 0.79680085, + "num_input_tokens_seen": 155377070, + "router_z_loss_clip": 2.81445312, + "router_z_loss_mlp": 0.33813477, + "step": 7248, + "time_per_iteration": 2.677650213241577 + }, + { + "auxiliary_loss_clip": 0.01520872, + "auxiliary_loss_mlp": 0.00263984, + "balance_loss_clip": 1.24211109, + "balance_loss_mlp": 0.23096293, + "epoch": 0.43583345859010975, + "flos": 25264737077760.0, + "grad_norm": 13.010605532088857, + "language_loss": 0.92192972, + "learning_rate": 2.5071284917156893e-06, + "loss": 0.93977833, + "num_input_tokens_seen": 155398415, + "router_z_loss_clip": 2.78710938, + "router_z_loss_mlp": 0.33007812, + "step": 7249, + "time_per_iteration": 2.6895663738250732 + }, + { + "auxiliary_loss_clip": 0.01512996, + "auxiliary_loss_mlp": 0.00254698, + "balance_loss_clip": 1.22958016, + "balance_loss_mlp": 0.22224891, + "epoch": 0.4358935818427777, + "flos": 23696302734720.0, + "grad_norm": 6.216413917901755, + "language_loss": 0.89062822, + "learning_rate": 2.506751748594683e-06, + "loss": 0.90830511, + "num_input_tokens_seen": 155415625, + "router_z_loss_clip": 2.83398438, + "router_z_loss_mlp": 0.32446289, + "step": 7250, + "time_per_iteration": 2.677631378173828 + }, + { + "auxiliary_loss_clip": 0.01546394, + "auxiliary_loss_mlp": 0.00286702, + "balance_loss_clip": 1.26123714, + "balance_loss_mlp": 0.25325203, + "epoch": 0.4359537050954457, + "flos": 29532827761920.0, + "grad_norm": 28.486354221003324, + "language_loss": 0.90837705, + "learning_rate": 2.5063749862575988e-06, + "loss": 0.92670798, + "num_input_tokens_seen": 155435505, + "router_z_loss_clip": 2.8515625, + "router_z_loss_mlp": 0.33422852, + "step": 7251, + "time_per_iteration": 2.692474603652954 + }, + { + "auxiliary_loss_clip": 0.0151345, + "auxiliary_loss_mlp": 0.00279011, + "balance_loss_clip": 1.23255098, + "balance_loss_mlp": 0.24355796, + "epoch": 0.43601382834811364, + "flos": 22711273090560.0, + "grad_norm": 3.241834562229884, + "language_loss": 0.77468431, + "learning_rate": 2.5059982047187245e-06, + "loss": 0.79260886, + "num_input_tokens_seen": 155455425, + "router_z_loss_clip": 2.80859375, + "router_z_loss_mlp": 0.35449219, + "step": 7252, + "time_per_iteration": 2.664320230484009 + }, + { + "auxiliary_loss_clip": 0.01512727, + "auxiliary_loss_mlp": 0.00270705, + "balance_loss_clip": 1.23766303, + "balance_loss_mlp": 0.23949555, + "epoch": 0.4360739516007816, + "flos": 19098731571840.0, + "grad_norm": 7.254449351736594, + "language_loss": 0.89561605, + "learning_rate": 2.505621403992348e-06, + "loss": 0.91345042, + "num_input_tokens_seen": 155474250, + "router_z_loss_clip": 2.75195312, + "router_z_loss_mlp": 0.31225586, + "step": 7253, + "time_per_iteration": 2.610624313354492 + }, + { + "auxiliary_loss_clip": 0.01535851, + "auxiliary_loss_mlp": 0.00312509, + "balance_loss_clip": 1.25598645, + "balance_loss_mlp": 0.27812928, + "epoch": 0.43613407485344957, + "flos": 23404420817280.0, + "grad_norm": 5.115141591930763, + "language_loss": 0.76777911, + "learning_rate": 2.505244584092757e-06, + "loss": 0.78626263, + "num_input_tokens_seen": 155494685, + "router_z_loss_clip": 2.796875, + "router_z_loss_mlp": 0.34375, + "step": 7254, + "time_per_iteration": 2.6885359287261963 + }, + { + "auxiliary_loss_clip": 0.0152871, + "auxiliary_loss_mlp": 0.00260231, + "balance_loss_clip": 1.25181103, + "balance_loss_mlp": 0.22785407, + "epoch": 0.43619419810611754, + "flos": 22637799820800.0, + "grad_norm": 2.9097816177130134, + "language_loss": 0.88666588, + "learning_rate": 2.5048677450342406e-06, + "loss": 0.90455532, + "num_input_tokens_seen": 155513040, + "router_z_loss_clip": 2.765625, + "router_z_loss_mlp": 0.32348633, + "step": 7255, + "time_per_iteration": 2.6551365852355957 + }, + { + "auxiliary_loss_clip": 0.01517011, + "auxiliary_loss_mlp": 0.0026512, + "balance_loss_clip": 1.23852384, + "balance_loss_mlp": 0.23207489, + "epoch": 0.4362543213587855, + "flos": 20047958334720.0, + "grad_norm": 18.587280079041605, + "language_loss": 0.83430344, + "learning_rate": 2.504490886831089e-06, + "loss": 0.85212475, + "num_input_tokens_seen": 155530100, + "router_z_loss_clip": 2.78710938, + "router_z_loss_mlp": 0.33056641, + "step": 7256, + "time_per_iteration": 2.652520179748535 + }, + { + "auxiliary_loss_clip": 0.01520719, + "auxiliary_loss_mlp": 0.0024724, + "balance_loss_clip": 1.24568009, + "balance_loss_mlp": 0.21619831, + "epoch": 0.43631444461145347, + "flos": 21361319222400.0, + "grad_norm": 1.9664094373166843, + "language_loss": 0.80770373, + "learning_rate": 2.5041140094975922e-06, + "loss": 0.82538337, + "num_input_tokens_seen": 155549375, + "router_z_loss_clip": 2.75390625, + "router_z_loss_mlp": 0.31054688, + "step": 7257, + "time_per_iteration": 2.6535627841949463 + }, + { + "auxiliary_loss_clip": 0.01504386, + "auxiliary_loss_mlp": 0.00271387, + "balance_loss_clip": 1.22478795, + "balance_loss_mlp": 0.2393198, + "epoch": 0.43637456786412143, + "flos": 22418529246720.0, + "grad_norm": 4.489947651059835, + "language_loss": 0.7957086, + "learning_rate": 2.5037371130480417e-06, + "loss": 0.81346631, + "num_input_tokens_seen": 155569395, + "router_z_loss_clip": 2.79492188, + "router_z_loss_mlp": 0.32067871, + "step": 7258, + "time_per_iteration": 2.7075915336608887 + }, + { + "auxiliary_loss_clip": 0.01525623, + "auxiliary_loss_mlp": 0.00282058, + "balance_loss_clip": 1.24497831, + "balance_loss_mlp": 0.24822673, + "epoch": 0.4364346911167894, + "flos": 28548839612160.0, + "grad_norm": 10.902797817972038, + "language_loss": 0.84451306, + "learning_rate": 2.5033601974967297e-06, + "loss": 0.86258984, + "num_input_tokens_seen": 155589090, + "router_z_loss_clip": 2.81054688, + "router_z_loss_mlp": 0.33837891, + "step": 7259, + "time_per_iteration": 2.7714943885803223 + }, + { + "auxiliary_loss_clip": 0.01345186, + "auxiliary_loss_mlp": 0.0004557, + "balance_loss_clip": 1.18528497, + "balance_loss_mlp": 0.03298127, + "epoch": 0.43649481436945736, + "flos": 62659345380480.0, + "grad_norm": 0.7454392780255781, + "language_loss": 0.56662267, + "learning_rate": 2.5029832628579483e-06, + "loss": 0.58053023, + "num_input_tokens_seen": 155648660, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.12597656, + "step": 7260, + "time_per_iteration": 3.188816547393799 + }, + { + "auxiliary_loss_clip": 0.01511669, + "auxiliary_loss_mlp": 0.00286916, + "balance_loss_clip": 1.23499584, + "balance_loss_mlp": 0.25556371, + "epoch": 0.4365549376221254, + "flos": 30592120775040.0, + "grad_norm": 14.114565907798262, + "language_loss": 0.78855294, + "learning_rate": 2.5026063091459907e-06, + "loss": 0.80653882, + "num_input_tokens_seen": 155669945, + "router_z_loss_clip": 2.76757812, + "router_z_loss_mlp": 0.31323242, + "step": 7261, + "time_per_iteration": 2.810563325881958 + }, + { + "auxiliary_loss_clip": 0.01508482, + "auxiliary_loss_mlp": 0.00283391, + "balance_loss_clip": 1.23472762, + "balance_loss_mlp": 0.25161031, + "epoch": 0.43661506087479335, + "flos": 17165875795200.0, + "grad_norm": 2.8906681741860307, + "language_loss": 0.77294886, + "learning_rate": 2.5022293363751522e-06, + "loss": 0.79086757, + "num_input_tokens_seen": 155688555, + "router_z_loss_clip": 2.73828125, + "router_z_loss_mlp": 0.31762695, + "step": 7262, + "time_per_iteration": 2.7119932174682617 + }, + { + "auxiliary_loss_clip": 0.01492337, + "auxiliary_loss_mlp": 0.00249654, + "balance_loss_clip": 1.22523427, + "balance_loss_mlp": 0.21870753, + "epoch": 0.4366751841274613, + "flos": 22047499710720.0, + "grad_norm": 16.76718528442695, + "language_loss": 0.83609217, + "learning_rate": 2.501852344559726e-06, + "loss": 0.85351205, + "num_input_tokens_seen": 155705370, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.30957031, + "step": 7263, + "time_per_iteration": 2.6913533210754395 + }, + { + "auxiliary_loss_clip": 0.01488983, + "auxiliary_loss_mlp": 0.00284352, + "balance_loss_clip": 1.2174983, + "balance_loss_mlp": 0.24928066, + "epoch": 0.4367353073801293, + "flos": 15997306631040.0, + "grad_norm": 10.782451335371709, + "language_loss": 0.82616782, + "learning_rate": 2.50147533371401e-06, + "loss": 0.84390116, + "num_input_tokens_seen": 155721890, + "router_z_loss_clip": 2.71679688, + "router_z_loss_mlp": 0.35107422, + "step": 7264, + "time_per_iteration": 2.6102354526519775 + }, + { + "auxiliary_loss_clip": 0.01485007, + "auxiliary_loss_mlp": 0.00274405, + "balance_loss_clip": 1.21253645, + "balance_loss_mlp": 0.2432676, + "epoch": 0.43679543063279724, + "flos": 38217535868160.0, + "grad_norm": 32.6996138664104, + "language_loss": 0.68359804, + "learning_rate": 2.501098303852298e-06, + "loss": 0.70119214, + "num_input_tokens_seen": 155743970, + "router_z_loss_clip": 2.72460938, + "router_z_loss_mlp": 0.31115723, + "step": 7265, + "time_per_iteration": 2.8027150630950928 + }, + { + "auxiliary_loss_clip": 0.0149115, + "auxiliary_loss_mlp": 0.00286431, + "balance_loss_clip": 1.21961212, + "balance_loss_mlp": 0.25388664, + "epoch": 0.4368555538854652, + "flos": 15193230727680.0, + "grad_norm": 5.316876045689238, + "language_loss": 0.80501473, + "learning_rate": 2.5007212549888884e-06, + "loss": 0.8227905, + "num_input_tokens_seen": 155761830, + "router_z_loss_clip": 2.71289062, + "router_z_loss_mlp": 0.32519531, + "step": 7266, + "time_per_iteration": 2.611496925354004 + }, + { + "auxiliary_loss_clip": 0.01490231, + "auxiliary_loss_mlp": 0.00260645, + "balance_loss_clip": 1.21868157, + "balance_loss_mlp": 0.2295074, + "epoch": 0.4369156771381332, + "flos": 23069086421760.0, + "grad_norm": 2.7154888398745154, + "language_loss": 0.8993457, + "learning_rate": 2.5003441871380794e-06, + "loss": 0.91685444, + "num_input_tokens_seen": 155779610, + "router_z_loss_clip": 2.71679688, + "router_z_loss_mlp": 0.31176758, + "step": 7267, + "time_per_iteration": 2.6958420276641846 + }, + { + "auxiliary_loss_clip": 0.0149317, + "auxiliary_loss_mlp": 0.00261818, + "balance_loss_clip": 1.22146869, + "balance_loss_mlp": 0.2296792, + "epoch": 0.43697580039080114, + "flos": 23441085624960.0, + "grad_norm": 21.148714826681662, + "language_loss": 0.80731624, + "learning_rate": 2.4999671003141674e-06, + "loss": 0.82486618, + "num_input_tokens_seen": 155798765, + "router_z_loss_clip": 2.71875, + "router_z_loss_mlp": 0.32104492, + "step": 7268, + "time_per_iteration": 2.6992156505584717 + }, + { + "auxiliary_loss_clip": 0.01497152, + "auxiliary_loss_mlp": 0.00290048, + "balance_loss_clip": 1.22430682, + "balance_loss_mlp": 0.25731331, + "epoch": 0.4370359236434691, + "flos": 18514680428160.0, + "grad_norm": 6.663624550309342, + "language_loss": 0.86558902, + "learning_rate": 2.499589994531454e-06, + "loss": 0.883461, + "num_input_tokens_seen": 155817750, + "router_z_loss_clip": 2.73046875, + "router_z_loss_mlp": 0.32714844, + "step": 7269, + "time_per_iteration": 2.618189811706543 + }, + { + "auxiliary_loss_clip": 0.01504256, + "auxiliary_loss_mlp": 0.00260528, + "balance_loss_clip": 1.23202801, + "balance_loss_mlp": 0.22929515, + "epoch": 0.43709604689613707, + "flos": 23222497409280.0, + "grad_norm": 1.6587667698471402, + "language_loss": 0.81733519, + "learning_rate": 2.499212869804237e-06, + "loss": 0.83498299, + "num_input_tokens_seen": 155836490, + "router_z_loss_clip": 2.72265625, + "router_z_loss_mlp": 0.31201172, + "step": 7270, + "time_per_iteration": 2.666328191757202 + }, + { + "auxiliary_loss_clip": 0.014988, + "auxiliary_loss_mlp": 0.00284071, + "balance_loss_clip": 1.2235806, + "balance_loss_mlp": 0.25212246, + "epoch": 0.43715617014880503, + "flos": 23803711378560.0, + "grad_norm": 4.546199207249762, + "language_loss": 0.85591465, + "learning_rate": 2.4988357261468182e-06, + "loss": 0.87374341, + "num_input_tokens_seen": 155856225, + "router_z_loss_clip": 2.75195312, + "router_z_loss_mlp": 0.31933594, + "step": 7271, + "time_per_iteration": 2.6562395095825195 + }, + { + "auxiliary_loss_clip": 0.0133265, + "auxiliary_loss_mlp": 0.00031349, + "balance_loss_clip": 1.17139101, + "balance_loss_mlp": 0.02166962, + "epoch": 0.437216293401473, + "flos": 61941204766080.0, + "grad_norm": 0.6784669955250194, + "language_loss": 0.54440761, + "learning_rate": 2.4984585635734993e-06, + "loss": 0.55804753, + "num_input_tokens_seen": 155916770, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.09667969, + "step": 7272, + "time_per_iteration": 3.214798927307129 + }, + { + "auxiliary_loss_clip": 0.0152108, + "auxiliary_loss_mlp": 0.00318094, + "balance_loss_clip": 1.23633742, + "balance_loss_mlp": 0.2828314, + "epoch": 0.43727641665414096, + "flos": 21982250655360.0, + "grad_norm": 5.762135047405825, + "language_loss": 0.7625035, + "learning_rate": 2.498081382098581e-06, + "loss": 0.78089529, + "num_input_tokens_seen": 155936490, + "router_z_loss_clip": 2.84765625, + "router_z_loss_mlp": 0.35253906, + "step": 7273, + "time_per_iteration": 2.6737070083618164 + }, + { + "auxiliary_loss_clip": 0.01511272, + "auxiliary_loss_mlp": 0.00298177, + "balance_loss_clip": 1.22979748, + "balance_loss_mlp": 0.2654188, + "epoch": 0.437336539906809, + "flos": 39530860842240.0, + "grad_norm": 7.518891985093172, + "language_loss": 0.8462472, + "learning_rate": 2.497704181736367e-06, + "loss": 0.86434174, + "num_input_tokens_seen": 155957595, + "router_z_loss_clip": 2.81835938, + "router_z_loss_mlp": 0.32763672, + "step": 7274, + "time_per_iteration": 2.8244316577911377 + }, + { + "auxiliary_loss_clip": 0.01505229, + "auxiliary_loss_mlp": 0.00293676, + "balance_loss_clip": 1.22582042, + "balance_loss_mlp": 0.26301566, + "epoch": 0.43739666315947695, + "flos": 17457147181440.0, + "grad_norm": 10.474375631164808, + "language_loss": 0.85969657, + "learning_rate": 2.49732696250116e-06, + "loss": 0.87768561, + "num_input_tokens_seen": 155975710, + "router_z_loss_clip": 2.79492188, + "router_z_loss_mlp": 0.30664062, + "step": 7275, + "time_per_iteration": 2.6503450870513916 + }, + { + "auxiliary_loss_clip": 0.01517457, + "auxiliary_loss_mlp": 0.00292008, + "balance_loss_clip": 1.2395997, + "balance_loss_mlp": 0.25877213, + "epoch": 0.4374567864121449, + "flos": 16358747235840.0, + "grad_norm": 4.463723825872533, + "language_loss": 0.88430536, + "learning_rate": 2.496949724407266e-06, + "loss": 0.90239996, + "num_input_tokens_seen": 155993090, + "router_z_loss_clip": 2.78125, + "router_z_loss_mlp": 0.33227539, + "step": 7276, + "time_per_iteration": 2.6709163188934326 + }, + { + "auxiliary_loss_clip": 0.01535981, + "auxiliary_loss_mlp": 0.00327924, + "balance_loss_clip": 1.2443378, + "balance_loss_mlp": 0.29259008, + "epoch": 0.4375169096648129, + "flos": 30587523834240.0, + "grad_norm": 278.13720161080033, + "language_loss": 0.79978293, + "learning_rate": 2.496572467468988e-06, + "loss": 0.81842196, + "num_input_tokens_seen": 156013685, + "router_z_loss_clip": 2.91601562, + "router_z_loss_mlp": 0.35351562, + "step": 7277, + "time_per_iteration": 2.748324155807495 + }, + { + "auxiliary_loss_clip": 0.0152106, + "auxiliary_loss_mlp": 0.00318091, + "balance_loss_clip": 1.23927402, + "balance_loss_mlp": 0.28378224, + "epoch": 0.43757703291748085, + "flos": 30555599621760.0, + "grad_norm": 5.683042869170781, + "language_loss": 0.80325711, + "learning_rate": 2.4961951917006317e-06, + "loss": 0.8216486, + "num_input_tokens_seen": 156034300, + "router_z_loss_clip": 2.81640625, + "router_z_loss_mlp": 0.34301758, + "step": 7278, + "time_per_iteration": 4.117890119552612 + }, + { + "auxiliary_loss_clip": 0.01523763, + "auxiliary_loss_mlp": 0.00303091, + "balance_loss_clip": 1.2429378, + "balance_loss_mlp": 0.27052286, + "epoch": 0.4376371561701488, + "flos": 21397373498880.0, + "grad_norm": 4.5497035946452575, + "language_loss": 0.73232996, + "learning_rate": 2.4958178971165046e-06, + "loss": 0.75059849, + "num_input_tokens_seen": 156053805, + "router_z_loss_clip": 2.81054688, + "router_z_loss_mlp": 0.32592773, + "step": 7279, + "time_per_iteration": 2.691434144973755 + }, + { + "auxiliary_loss_clip": 0.01539127, + "auxiliary_loss_mlp": 0.00311321, + "balance_loss_clip": 1.24872923, + "balance_loss_mlp": 0.27856246, + "epoch": 0.4376972794228168, + "flos": 23404384903680.0, + "grad_norm": 2.845805592339726, + "language_loss": 0.91148233, + "learning_rate": 2.4954405837309126e-06, + "loss": 0.92998683, + "num_input_tokens_seen": 156073295, + "router_z_loss_clip": 2.90625, + "router_z_loss_mlp": 0.32763672, + "step": 7280, + "time_per_iteration": 4.09071159362793 + }, + { + "auxiliary_loss_clip": 0.01534922, + "auxiliary_loss_mlp": 0.00294054, + "balance_loss_clip": 1.24984848, + "balance_loss_mlp": 0.26073527, + "epoch": 0.43775740267548474, + "flos": 22892945103360.0, + "grad_norm": 10.660237855630115, + "language_loss": 0.82912064, + "learning_rate": 2.4950632515581653e-06, + "loss": 0.84741044, + "num_input_tokens_seen": 156094540, + "router_z_loss_clip": 2.8515625, + "router_z_loss_mlp": 0.33337402, + "step": 7281, + "time_per_iteration": 2.6758289337158203 + }, + { + "auxiliary_loss_clip": 0.01543715, + "auxiliary_loss_mlp": 0.00317866, + "balance_loss_clip": 1.25458765, + "balance_loss_mlp": 0.28570288, + "epoch": 0.4378175259281527, + "flos": 23294390480640.0, + "grad_norm": 241457.6099359091, + "language_loss": 0.82250702, + "learning_rate": 2.494685900612569e-06, + "loss": 0.84112275, + "num_input_tokens_seen": 156114070, + "router_z_loss_clip": 2.89257812, + "router_z_loss_mlp": 0.32177734, + "step": 7282, + "time_per_iteration": 4.1027069091796875 + }, + { + "auxiliary_loss_clip": 0.01546891, + "auxiliary_loss_mlp": 0.00303404, + "balance_loss_clip": 1.25378633, + "balance_loss_mlp": 0.26888067, + "epoch": 0.43787764918082067, + "flos": 23876897339520.0, + "grad_norm": 30.441478872590594, + "language_loss": 0.90600628, + "learning_rate": 2.4943085309084333e-06, + "loss": 0.92450929, + "num_input_tokens_seen": 156132130, + "router_z_loss_clip": 2.93164062, + "router_z_loss_mlp": 0.34545898, + "step": 7283, + "time_per_iteration": 2.723203659057617 + }, + { + "auxiliary_loss_clip": 0.0155018, + "auxiliary_loss_mlp": 0.00302765, + "balance_loss_clip": 1.25396073, + "balance_loss_mlp": 0.26621589, + "epoch": 0.43793777243348864, + "flos": 23988148738560.0, + "grad_norm": 19.147280233016474, + "language_loss": 0.86293912, + "learning_rate": 2.49393114246007e-06, + "loss": 0.88146853, + "num_input_tokens_seen": 156150820, + "router_z_loss_clip": 2.96289062, + "router_z_loss_mlp": 0.36547852, + "step": 7284, + "time_per_iteration": 2.679199695587158 + }, + { + "auxiliary_loss_clip": 0.01550097, + "auxiliary_loss_mlp": 0.00298849, + "balance_loss_clip": 1.25915706, + "balance_loss_mlp": 0.26392022, + "epoch": 0.4379978956861566, + "flos": 18624064320000.0, + "grad_norm": 21.375284320250195, + "language_loss": 0.88187939, + "learning_rate": 2.493553735281787e-06, + "loss": 0.90036881, + "num_input_tokens_seen": 156170125, + "router_z_loss_clip": 2.90625, + "router_z_loss_mlp": 0.34936523, + "step": 7285, + "time_per_iteration": 2.648146152496338 + }, + { + "auxiliary_loss_clip": 0.01558836, + "auxiliary_loss_mlp": 0.00295229, + "balance_loss_clip": 1.26257873, + "balance_loss_mlp": 0.2614688, + "epoch": 0.43805801893882457, + "flos": 21981388728960.0, + "grad_norm": 65.45806803913442, + "language_loss": 0.81937474, + "learning_rate": 2.493176309387897e-06, + "loss": 0.8379153, + "num_input_tokens_seen": 156187320, + "router_z_loss_clip": 2.9609375, + "router_z_loss_mlp": 0.33740234, + "step": 7286, + "time_per_iteration": 2.6661813259124756 + }, + { + "auxiliary_loss_clip": 0.01549852, + "auxiliary_loss_mlp": 0.00322984, + "balance_loss_clip": 1.25651002, + "balance_loss_mlp": 0.28769809, + "epoch": 0.43811814219149253, + "flos": 26393337383040.0, + "grad_norm": 7.853077936822833, + "language_loss": 0.7923547, + "learning_rate": 2.492798864792712e-06, + "loss": 0.81108314, + "num_input_tokens_seen": 156207455, + "router_z_loss_clip": 2.93164062, + "router_z_loss_mlp": 0.35253906, + "step": 7287, + "time_per_iteration": 2.70443058013916 + }, + { + "auxiliary_loss_clip": 0.01553847, + "auxiliary_loss_mlp": 0.0032759, + "balance_loss_clip": 1.26114535, + "balance_loss_mlp": 0.29154089, + "epoch": 0.43817826544416055, + "flos": 17493309198720.0, + "grad_norm": 26.653766958800315, + "language_loss": 0.88644314, + "learning_rate": 2.492421401510545e-06, + "loss": 0.90525746, + "num_input_tokens_seen": 156226560, + "router_z_loss_clip": 2.9296875, + "router_z_loss_mlp": 0.3605957, + "step": 7288, + "time_per_iteration": 4.021993160247803 + }, + { + "auxiliary_loss_clip": 0.01552554, + "auxiliary_loss_mlp": 0.00317365, + "balance_loss_clip": 1.25394797, + "balance_loss_mlp": 0.28098226, + "epoch": 0.4382383886968285, + "flos": 21581020759680.0, + "grad_norm": 32.24217088701585, + "language_loss": 0.88468814, + "learning_rate": 2.4920439195557093e-06, + "loss": 0.90338731, + "num_input_tokens_seen": 156246740, + "router_z_loss_clip": 2.98828125, + "router_z_loss_mlp": 0.36376953, + "step": 7289, + "time_per_iteration": 2.6590895652770996 + }, + { + "auxiliary_loss_clip": 0.01566113, + "auxiliary_loss_mlp": 0.00309269, + "balance_loss_clip": 1.26968074, + "balance_loss_mlp": 0.27543738, + "epoch": 0.4382985119494965, + "flos": 27923742201600.0, + "grad_norm": 4.805620157790787, + "language_loss": 0.83923745, + "learning_rate": 2.4916664189425183e-06, + "loss": 0.85799128, + "num_input_tokens_seen": 156266440, + "router_z_loss_clip": 2.96484375, + "router_z_loss_mlp": 0.33837891, + "step": 7290, + "time_per_iteration": 2.753173351287842 + }, + { + "auxiliary_loss_clip": 0.01581593, + "auxiliary_loss_mlp": 0.00301694, + "balance_loss_clip": 1.28450489, + "balance_loss_mlp": 0.26793444, + "epoch": 0.43835863520216445, + "flos": 24936836797440.0, + "grad_norm": 5.75773157593334, + "language_loss": 0.84362745, + "learning_rate": 2.491288899685288e-06, + "loss": 0.86246032, + "num_input_tokens_seen": 156286900, + "router_z_loss_clip": 2.97070312, + "router_z_loss_mlp": 0.33764648, + "step": 7291, + "time_per_iteration": 2.6935582160949707 + }, + { + "auxiliary_loss_clip": 0.01562464, + "auxiliary_loss_mlp": 0.00304626, + "balance_loss_clip": 1.2670449, + "balance_loss_mlp": 0.26955485, + "epoch": 0.4384187584548324, + "flos": 33510293504640.0, + "grad_norm": 4.4250744065213405, + "language_loss": 0.72744036, + "learning_rate": 2.4909113617983325e-06, + "loss": 0.74611115, + "num_input_tokens_seen": 156307690, + "router_z_loss_clip": 2.95117188, + "router_z_loss_mlp": 0.35107422, + "step": 7292, + "time_per_iteration": 2.7406883239746094 + }, + { + "auxiliary_loss_clip": 0.01563919, + "auxiliary_loss_mlp": 0.00291227, + "balance_loss_clip": 1.2686286, + "balance_loss_mlp": 0.25558299, + "epoch": 0.4384788817075004, + "flos": 23951052967680.0, + "grad_norm": 4.101440134559566, + "language_loss": 0.80379295, + "learning_rate": 2.49053380529597e-06, + "loss": 0.82234442, + "num_input_tokens_seen": 156326620, + "router_z_loss_clip": 2.953125, + "router_z_loss_mlp": 0.35668945, + "step": 7293, + "time_per_iteration": 2.6975927352905273 + }, + { + "auxiliary_loss_clip": 0.01562134, + "auxiliary_loss_mlp": 0.00347378, + "balance_loss_clip": 1.26395857, + "balance_loss_mlp": 0.3070378, + "epoch": 0.43853900496016834, + "flos": 19098516090240.0, + "grad_norm": 28.558922192532496, + "language_loss": 0.86101842, + "learning_rate": 2.490156230192516e-06, + "loss": 0.8801136, + "num_input_tokens_seen": 156345495, + "router_z_loss_clip": 2.98242188, + "router_z_loss_mlp": 0.40332031, + "step": 7294, + "time_per_iteration": 2.671438694000244 + }, + { + "auxiliary_loss_clip": 0.01577819, + "auxiliary_loss_mlp": 0.00338692, + "balance_loss_clip": 1.28073621, + "balance_loss_mlp": 0.30271441, + "epoch": 0.4385991282128363, + "flos": 13225362168960.0, + "grad_norm": 15.588606604803557, + "language_loss": 0.79558486, + "learning_rate": 2.4897786365022883e-06, + "loss": 0.81474996, + "num_input_tokens_seen": 156363155, + "router_z_loss_clip": 2.97265625, + "router_z_loss_mlp": 0.35986328, + "step": 7295, + "time_per_iteration": 2.618680715560913 + }, + { + "auxiliary_loss_clip": 0.01577631, + "auxiliary_loss_mlp": 0.00349534, + "balance_loss_clip": 1.27516842, + "balance_loss_mlp": 0.31129181, + "epoch": 0.4386592514655043, + "flos": 14319883445760.0, + "grad_norm": 28.530577035163915, + "language_loss": 0.82249624, + "learning_rate": 2.4894010242396063e-06, + "loss": 0.84176785, + "num_input_tokens_seen": 156380940, + "router_z_loss_clip": 3.0234375, + "router_z_loss_mlp": 0.3828125, + "step": 7296, + "time_per_iteration": 2.6362862586975098 + }, + { + "auxiliary_loss_clip": 0.01564214, + "auxiliary_loss_mlp": 0.0030522, + "balance_loss_clip": 1.26978123, + "balance_loss_mlp": 0.27057767, + "epoch": 0.43871937471817224, + "flos": 22784423137920.0, + "grad_norm": 5.112930681967207, + "language_loss": 0.75932246, + "learning_rate": 2.4890233934187873e-06, + "loss": 0.77801681, + "num_input_tokens_seen": 156400415, + "router_z_loss_clip": 2.9453125, + "router_z_loss_mlp": 0.34643555, + "step": 7297, + "time_per_iteration": 2.645451784133911 + }, + { + "auxiliary_loss_clip": 0.01566597, + "auxiliary_loss_mlp": 0.0032753, + "balance_loss_clip": 1.26916885, + "balance_loss_mlp": 0.29133767, + "epoch": 0.4387794979708402, + "flos": 28072304853120.0, + "grad_norm": 28.644485211810167, + "language_loss": 0.75280404, + "learning_rate": 2.4886457440541535e-06, + "loss": 0.77174532, + "num_input_tokens_seen": 156421120, + "router_z_loss_clip": 2.97460938, + "router_z_loss_mlp": 0.36206055, + "step": 7298, + "time_per_iteration": 2.6820783615112305 + }, + { + "auxiliary_loss_clip": 0.01556614, + "auxiliary_loss_mlp": 0.00268093, + "balance_loss_clip": 1.26379299, + "balance_loss_mlp": 0.23409463, + "epoch": 0.43883962122350817, + "flos": 26249551240320.0, + "grad_norm": 5.136832876619168, + "language_loss": 0.7844345, + "learning_rate": 2.4882680761600238e-06, + "loss": 0.80268157, + "num_input_tokens_seen": 156441535, + "router_z_loss_clip": 2.93164062, + "router_z_loss_mlp": 0.34008789, + "step": 7299, + "time_per_iteration": 2.7084038257598877 + }, + { + "auxiliary_loss_clip": 0.01551189, + "auxiliary_loss_mlp": 0.00313202, + "balance_loss_clip": 1.2517997, + "balance_loss_mlp": 0.27512604, + "epoch": 0.43889974447617613, + "flos": 25883765089920.0, + "grad_norm": 2.3872987763151783, + "language_loss": 0.85005844, + "learning_rate": 2.487890389750719e-06, + "loss": 0.86870235, + "num_input_tokens_seen": 156462015, + "router_z_loss_clip": 2.9921875, + "router_z_loss_mlp": 0.38085938, + "step": 7300, + "time_per_iteration": 2.6784064769744873 + }, + { + "auxiliary_loss_clip": 0.01549006, + "auxiliary_loss_mlp": 0.00309412, + "balance_loss_clip": 1.25429797, + "balance_loss_mlp": 0.2763437, + "epoch": 0.43895986772884416, + "flos": 25046615738880.0, + "grad_norm": 59.73336525927075, + "language_loss": 0.78624815, + "learning_rate": 2.4875126848405626e-06, + "loss": 0.80483234, + "num_input_tokens_seen": 156482165, + "router_z_loss_clip": 2.9453125, + "router_z_loss_mlp": 0.33032227, + "step": 7301, + "time_per_iteration": 2.696646213531494 + }, + { + "auxiliary_loss_clip": 0.01578821, + "auxiliary_loss_mlp": 0.00311924, + "balance_loss_clip": 1.27942002, + "balance_loss_mlp": 0.27492177, + "epoch": 0.4390199909815121, + "flos": 25994585525760.0, + "grad_norm": 14.460226045007607, + "language_loss": 0.77757812, + "learning_rate": 2.4871349614438757e-06, + "loss": 0.79648554, + "num_input_tokens_seen": 156503170, + "router_z_loss_clip": 2.9921875, + "router_z_loss_mlp": 0.37011719, + "step": 7302, + "time_per_iteration": 2.7109427452087402 + }, + { + "auxiliary_loss_clip": 0.01558, + "auxiliary_loss_mlp": 0.00337389, + "balance_loss_clip": 1.26590192, + "balance_loss_mlp": 0.30036274, + "epoch": 0.4390801142341801, + "flos": 29022249888000.0, + "grad_norm": 9.024055898858883, + "language_loss": 0.87334687, + "learning_rate": 2.486757219574983e-06, + "loss": 0.89230084, + "num_input_tokens_seen": 156523005, + "router_z_loss_clip": 2.92382812, + "router_z_loss_mlp": 0.37060547, + "step": 7303, + "time_per_iteration": 2.7572505474090576 + }, + { + "auxiliary_loss_clip": 0.01553499, + "auxiliary_loss_mlp": 0.00349994, + "balance_loss_clip": 1.24893665, + "balance_loss_mlp": 0.31411192, + "epoch": 0.43914023748684805, + "flos": 33438544087680.0, + "grad_norm": 192.69158567096125, + "language_loss": 0.77474165, + "learning_rate": 2.4863794592482067e-06, + "loss": 0.79377663, + "num_input_tokens_seen": 156544440, + "router_z_loss_clip": 3.04882812, + "router_z_loss_mlp": 0.35888672, + "step": 7304, + "time_per_iteration": 2.760329008102417 + }, + { + "auxiliary_loss_clip": 0.01536786, + "auxiliary_loss_mlp": 0.00296921, + "balance_loss_clip": 1.25187135, + "balance_loss_mlp": 0.26466262, + "epoch": 0.439200360739516, + "flos": 34531844302080.0, + "grad_norm": 3.473020966646635, + "language_loss": 0.83989298, + "learning_rate": 2.486001680477873e-06, + "loss": 0.85822999, + "num_input_tokens_seen": 156565410, + "router_z_loss_clip": 2.8515625, + "router_z_loss_mlp": 0.32250977, + "step": 7305, + "time_per_iteration": 2.8378231525421143 + }, + { + "auxiliary_loss_clip": 0.01546679, + "auxiliary_loss_mlp": 0.00320428, + "balance_loss_clip": 1.25580978, + "balance_loss_mlp": 0.28609604, + "epoch": 0.439260483992184, + "flos": 21907843632000.0, + "grad_norm": 14.687439679925065, + "language_loss": 0.76640475, + "learning_rate": 2.485623883278308e-06, + "loss": 0.78507584, + "num_input_tokens_seen": 156584210, + "router_z_loss_clip": 2.91015625, + "router_z_loss_mlp": 0.34350586, + "step": 7306, + "time_per_iteration": 2.786677837371826 + }, + { + "auxiliary_loss_clip": 0.01543953, + "auxiliary_loss_mlp": 0.00311433, + "balance_loss_clip": 1.2528801, + "balance_loss_mlp": 0.27521703, + "epoch": 0.43932060724485195, + "flos": 20996430912000.0, + "grad_norm": 18.330930905870922, + "language_loss": 0.67159784, + "learning_rate": 2.4852460676638344e-06, + "loss": 0.69015169, + "num_input_tokens_seen": 156602730, + "router_z_loss_clip": 2.90820312, + "router_z_loss_mlp": 0.36230469, + "step": 7307, + "time_per_iteration": 2.701305389404297 + }, + { + "auxiliary_loss_clip": 0.01551944, + "auxiliary_loss_mlp": 0.00331665, + "balance_loss_clip": 1.25563598, + "balance_loss_mlp": 0.29759496, + "epoch": 0.4393807304975199, + "flos": 17747053850880.0, + "grad_norm": 65.304704960893, + "language_loss": 0.80823249, + "learning_rate": 2.4848682336487828e-06, + "loss": 0.82706869, + "num_input_tokens_seen": 156619405, + "router_z_loss_clip": 2.96484375, + "router_z_loss_mlp": 0.34057617, + "step": 7308, + "time_per_iteration": 2.709374189376831 + }, + { + "auxiliary_loss_clip": 0.01539029, + "auxiliary_loss_mlp": 0.00321215, + "balance_loss_clip": 1.24081016, + "balance_loss_mlp": 0.28478497, + "epoch": 0.4394408537501879, + "flos": 22528523669760.0, + "grad_norm": 28.75117434182302, + "language_loss": 0.83405286, + "learning_rate": 2.4844903812474787e-06, + "loss": 0.85265529, + "num_input_tokens_seen": 156638165, + "router_z_loss_clip": 2.98046875, + "router_z_loss_mlp": 0.36425781, + "step": 7309, + "time_per_iteration": 2.7496745586395264 + }, + { + "auxiliary_loss_clip": 0.01522786, + "auxiliary_loss_mlp": 0.00281543, + "balance_loss_clip": 1.24145532, + "balance_loss_mlp": 0.24654339, + "epoch": 0.43950097700285584, + "flos": 23440654661760.0, + "grad_norm": 49.44882984726687, + "language_loss": 0.78576541, + "learning_rate": 2.484112510474251e-06, + "loss": 0.80380869, + "num_input_tokens_seen": 156658845, + "router_z_loss_clip": 2.81445312, + "router_z_loss_mlp": 0.34985352, + "step": 7310, + "time_per_iteration": 2.6623010635375977 + }, + { + "auxiliary_loss_clip": 0.01543378, + "auxiliary_loss_mlp": 0.00320718, + "balance_loss_clip": 1.24369621, + "balance_loss_mlp": 0.28433508, + "epoch": 0.4395611002555238, + "flos": 23180696956800.0, + "grad_norm": 5.18026141527971, + "language_loss": 0.83372056, + "learning_rate": 2.483734621343429e-06, + "loss": 0.8523615, + "num_input_tokens_seen": 156677275, + "router_z_loss_clip": 2.99414062, + "router_z_loss_mlp": 0.36376953, + "step": 7311, + "time_per_iteration": 2.6556038856506348 + }, + { + "auxiliary_loss_clip": 0.01556185, + "auxiliary_loss_mlp": 0.00333907, + "balance_loss_clip": 1.25737572, + "balance_loss_mlp": 0.29728639, + "epoch": 0.43962122350819177, + "flos": 22127365601280.0, + "grad_norm": 2.516936596764019, + "language_loss": 0.8843258, + "learning_rate": 2.483356713869341e-06, + "loss": 0.90322667, + "num_input_tokens_seen": 156695815, + "router_z_loss_clip": 2.9921875, + "router_z_loss_mlp": 0.36621094, + "step": 7312, + "time_per_iteration": 2.655214309692383 + }, + { + "auxiliary_loss_clip": 0.01538168, + "auxiliary_loss_mlp": 0.00348241, + "balance_loss_clip": 1.24677253, + "balance_loss_mlp": 0.31262159, + "epoch": 0.43968134676085974, + "flos": 17420554200960.0, + "grad_norm": 3.7601726452914948, + "language_loss": 0.95431167, + "learning_rate": 2.482978788066318e-06, + "loss": 0.97317576, + "num_input_tokens_seen": 156714385, + "router_z_loss_clip": 2.91015625, + "router_z_loss_mlp": 0.35595703, + "step": 7313, + "time_per_iteration": 2.6299421787261963 + }, + { + "auxiliary_loss_clip": 0.01534808, + "auxiliary_loss_mlp": 0.00293514, + "balance_loss_clip": 1.2412889, + "balance_loss_mlp": 0.25765589, + "epoch": 0.43974147001352776, + "flos": 18952646958720.0, + "grad_norm": 3.776489252415717, + "language_loss": 0.73890805, + "learning_rate": 2.4826008439486904e-06, + "loss": 0.75719124, + "num_input_tokens_seen": 156732615, + "router_z_loss_clip": 2.94140625, + "router_z_loss_mlp": 0.35864258, + "step": 7314, + "time_per_iteration": 2.6484973430633545 + }, + { + "auxiliary_loss_clip": 0.01544181, + "auxiliary_loss_mlp": 0.00345807, + "balance_loss_clip": 1.24486959, + "balance_loss_mlp": 0.30730239, + "epoch": 0.4398015932661957, + "flos": 18953508885120.0, + "grad_norm": 10.997981563041034, + "language_loss": 0.81936389, + "learning_rate": 2.4822228815307915e-06, + "loss": 0.83826375, + "num_input_tokens_seen": 156750920, + "router_z_loss_clip": 2.99414062, + "router_z_loss_mlp": 0.38525391, + "step": 7315, + "time_per_iteration": 2.6383233070373535 + }, + { + "auxiliary_loss_clip": 0.01522672, + "auxiliary_loss_mlp": 0.0034849, + "balance_loss_clip": 1.2355212, + "balance_loss_mlp": 0.31382418, + "epoch": 0.4398617165188637, + "flos": 24199913370240.0, + "grad_norm": 34.9719195240347, + "language_loss": 0.80322981, + "learning_rate": 2.4818449008269523e-06, + "loss": 0.82194144, + "num_input_tokens_seen": 156768520, + "router_z_loss_clip": 2.87695312, + "router_z_loss_mlp": 0.34643555, + "step": 7316, + "time_per_iteration": 2.695347309112549 + }, + { + "auxiliary_loss_clip": 0.01549352, + "auxiliary_loss_mlp": 0.00338582, + "balance_loss_clip": 1.25641739, + "balance_loss_mlp": 0.301126, + "epoch": 0.43992183977153165, + "flos": 22236677665920.0, + "grad_norm": 134.60177124067337, + "language_loss": 0.72891074, + "learning_rate": 2.481466901851506e-06, + "loss": 0.7477901, + "num_input_tokens_seen": 156788700, + "router_z_loss_clip": 2.9296875, + "router_z_loss_mlp": 0.37475586, + "step": 7317, + "time_per_iteration": 2.6342244148254395 + }, + { + "auxiliary_loss_clip": 0.01542764, + "auxiliary_loss_mlp": 0.00348068, + "balance_loss_clip": 1.24902129, + "balance_loss_mlp": 0.31108928, + "epoch": 0.4399819630241996, + "flos": 18697465762560.0, + "grad_norm": 16.49235410700712, + "language_loss": 0.86800778, + "learning_rate": 2.4810888846187865e-06, + "loss": 0.8869161, + "num_input_tokens_seen": 156806470, + "router_z_loss_clip": 2.93554688, + "router_z_loss_mlp": 0.36987305, + "step": 7318, + "time_per_iteration": 2.609905481338501 + }, + { + "auxiliary_loss_clip": 0.01546172, + "auxiliary_loss_mlp": 0.0032894, + "balance_loss_clip": 1.24929321, + "balance_loss_mlp": 0.29312921, + "epoch": 0.4400420862768676, + "flos": 23879375377920.0, + "grad_norm": 2.5147463973278827, + "language_loss": 0.85490572, + "learning_rate": 2.4807108491431283e-06, + "loss": 0.87365675, + "num_input_tokens_seen": 156825895, + "router_z_loss_clip": 2.96484375, + "router_z_loss_mlp": 0.3581543, + "step": 7319, + "time_per_iteration": 2.7004446983337402 + }, + { + "auxiliary_loss_clip": 0.01540377, + "auxiliary_loss_mlp": 0.0034473, + "balance_loss_clip": 1.25013888, + "balance_loss_mlp": 0.30818063, + "epoch": 0.44010220952953555, + "flos": 28037615293440.0, + "grad_norm": 2.0879463915305436, + "language_loss": 0.85607237, + "learning_rate": 2.4803327954388667e-06, + "loss": 0.87492347, + "num_input_tokens_seen": 156845990, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.36547852, + "step": 7320, + "time_per_iteration": 4.1041319370269775 + }, + { + "auxiliary_loss_clip": 0.0154807, + "auxiliary_loss_mlp": 0.0033763, + "balance_loss_clip": 1.25382936, + "balance_loss_mlp": 0.30172423, + "epoch": 0.4401623327822035, + "flos": 23768985905280.0, + "grad_norm": 2.291638169327599, + "language_loss": 0.77274644, + "learning_rate": 2.4799547235203376e-06, + "loss": 0.79160345, + "num_input_tokens_seen": 156866685, + "router_z_loss_clip": 2.93945312, + "router_z_loss_mlp": 0.35888672, + "step": 7321, + "time_per_iteration": 2.6922073364257812 + }, + { + "auxiliary_loss_clip": 0.01437022, + "auxiliary_loss_mlp": 0.00080958, + "balance_loss_clip": 1.27714455, + "balance_loss_mlp": 0.07270916, + "epoch": 0.4402224560348715, + "flos": 70774583264640.0, + "grad_norm": 0.8699970720707796, + "language_loss": 0.56529969, + "learning_rate": 2.4795766334018763e-06, + "loss": 0.5804795, + "num_input_tokens_seen": 156923450, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.08251953, + "step": 7322, + "time_per_iteration": 4.636983633041382 + }, + { + "auxiliary_loss_clip": 0.0154145, + "auxiliary_loss_mlp": 0.00334003, + "balance_loss_clip": 1.25238466, + "balance_loss_mlp": 0.29950401, + "epoch": 0.44028257928753944, + "flos": 22891795868160.0, + "grad_norm": 138.98245912683544, + "language_loss": 0.81155896, + "learning_rate": 2.479198525097822e-06, + "loss": 0.83031344, + "num_input_tokens_seen": 156944795, + "router_z_loss_clip": 2.890625, + "router_z_loss_mlp": 0.34545898, + "step": 7323, + "time_per_iteration": 2.6831393241882324 + }, + { + "auxiliary_loss_clip": 0.01538763, + "auxiliary_loss_mlp": 0.00357122, + "balance_loss_clip": 1.24758422, + "balance_loss_mlp": 0.32081044, + "epoch": 0.4403427025402074, + "flos": 17895760156800.0, + "grad_norm": 6.437398626839328, + "language_loss": 0.86548162, + "learning_rate": 2.478820398622511e-06, + "loss": 0.88444042, + "num_input_tokens_seen": 156962755, + "router_z_loss_clip": 2.91015625, + "router_z_loss_mlp": 0.36328125, + "step": 7324, + "time_per_iteration": 4.074963569641113 + }, + { + "auxiliary_loss_clip": 0.01419153, + "auxiliary_loss_mlp": 0.00087511, + "balance_loss_clip": 1.26271987, + "balance_loss_mlp": 0.07964273, + "epoch": 0.4404028257928754, + "flos": 69562525708800.0, + "grad_norm": 0.6565235410273872, + "language_loss": 0.54317701, + "learning_rate": 2.478442253990283e-06, + "loss": 0.55824363, + "num_input_tokens_seen": 157028095, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.07861328, + "step": 7325, + "time_per_iteration": 3.1279892921447754 + }, + { + "auxiliary_loss_clip": 0.01544707, + "auxiliary_loss_mlp": 0.00353763, + "balance_loss_clip": 1.25531101, + "balance_loss_mlp": 0.31907317, + "epoch": 0.44046294904554334, + "flos": 20923675914240.0, + "grad_norm": 55.035535785201496, + "language_loss": 0.75939983, + "learning_rate": 2.4780640912154766e-06, + "loss": 0.77838457, + "num_input_tokens_seen": 157048365, + "router_z_loss_clip": 2.89257812, + "router_z_loss_mlp": 0.34692383, + "step": 7326, + "time_per_iteration": 2.7075376510620117 + }, + { + "auxiliary_loss_clip": 0.01538203, + "auxiliary_loss_mlp": 0.00343779, + "balance_loss_clip": 1.25200713, + "balance_loss_mlp": 0.3090651, + "epoch": 0.44052307229821136, + "flos": 23623475909760.0, + "grad_norm": 64.46216369620049, + "language_loss": 0.81886971, + "learning_rate": 2.477685910312432e-06, + "loss": 0.83768952, + "num_input_tokens_seen": 157069130, + "router_z_loss_clip": 2.86328125, + "router_z_loss_mlp": 0.34741211, + "step": 7327, + "time_per_iteration": 2.7055118083953857 + }, + { + "auxiliary_loss_clip": 0.0152341, + "auxiliary_loss_mlp": 0.00393152, + "balance_loss_clip": 1.2396512, + "balance_loss_mlp": 0.35593477, + "epoch": 0.4405831955508793, + "flos": 17597665186560.0, + "grad_norm": 17.09153034629179, + "language_loss": 0.88521338, + "learning_rate": 2.4773077112954897e-06, + "loss": 0.90437895, + "num_input_tokens_seen": 157084940, + "router_z_loss_clip": 2.83789062, + "router_z_loss_mlp": 0.37182617, + "step": 7328, + "time_per_iteration": 2.5965161323547363 + }, + { + "auxiliary_loss_clip": 0.01519469, + "auxiliary_loss_mlp": 0.00341063, + "balance_loss_clip": 1.2391398, + "balance_loss_mlp": 0.30806598, + "epoch": 0.4406433188035473, + "flos": 21463376739840.0, + "grad_norm": 9.698399129101826, + "language_loss": 0.8562246, + "learning_rate": 2.4769294941789908e-06, + "loss": 0.87482995, + "num_input_tokens_seen": 157102770, + "router_z_loss_clip": 2.80859375, + "router_z_loss_mlp": 0.32983398, + "step": 7329, + "time_per_iteration": 2.6797726154327393 + }, + { + "auxiliary_loss_clip": 0.01530292, + "auxiliary_loss_mlp": 0.00338837, + "balance_loss_clip": 1.23951864, + "balance_loss_mlp": 0.30510056, + "epoch": 0.44070344205621526, + "flos": 22673566788480.0, + "grad_norm": 62.40137517806673, + "language_loss": 0.79472202, + "learning_rate": 2.476551258977278e-06, + "loss": 0.81341332, + "num_input_tokens_seen": 157122035, + "router_z_loss_clip": 2.91015625, + "router_z_loss_mlp": 0.33740234, + "step": 7330, + "time_per_iteration": 4.043255567550659 + }, + { + "auxiliary_loss_clip": 0.01553263, + "auxiliary_loss_mlp": 0.00386712, + "balance_loss_clip": 1.26395559, + "balance_loss_mlp": 0.35121137, + "epoch": 0.4407635653088832, + "flos": 23441193365760.0, + "grad_norm": 1.7445270537585265, + "language_loss": 0.80211246, + "learning_rate": 2.4761730057046936e-06, + "loss": 0.82151222, + "num_input_tokens_seen": 157142800, + "router_z_loss_clip": 2.88867188, + "router_z_loss_mlp": 0.35473633, + "step": 7331, + "time_per_iteration": 2.6802334785461426 + }, + { + "auxiliary_loss_clip": 0.01536332, + "auxiliary_loss_mlp": 0.00401415, + "balance_loss_clip": 1.24977326, + "balance_loss_mlp": 0.36670136, + "epoch": 0.4408236885615512, + "flos": 24021294013440.0, + "grad_norm": 516.245605651051, + "language_loss": 0.79580885, + "learning_rate": 2.475794734375581e-06, + "loss": 0.81518626, + "num_input_tokens_seen": 157163295, + "router_z_loss_clip": 2.8671875, + "router_z_loss_mlp": 0.34692383, + "step": 7332, + "time_per_iteration": 2.695683002471924 + }, + { + "auxiliary_loss_clip": 0.01526361, + "auxiliary_loss_mlp": 0.00365931, + "balance_loss_clip": 1.23728776, + "balance_loss_mlp": 0.33066869, + "epoch": 0.44088381181421915, + "flos": 12676826597760.0, + "grad_norm": 2.4686148674785464, + "language_loss": 0.80965769, + "learning_rate": 2.475416445004285e-06, + "loss": 0.82858062, + "num_input_tokens_seen": 157180890, + "router_z_loss_clip": 2.88867188, + "router_z_loss_mlp": 0.3527832, + "step": 7333, + "time_per_iteration": 2.6496803760528564 + }, + { + "auxiliary_loss_clip": 0.01530287, + "auxiliary_loss_mlp": 0.00365599, + "balance_loss_clip": 1.25465608, + "balance_loss_mlp": 0.33233982, + "epoch": 0.4409439350668871, + "flos": 24569865498240.0, + "grad_norm": 10.929291481449408, + "language_loss": 0.84965599, + "learning_rate": 2.4750381376051493e-06, + "loss": 0.86861479, + "num_input_tokens_seen": 157200580, + "router_z_loss_clip": 2.7578125, + "router_z_loss_mlp": 0.33251953, + "step": 7334, + "time_per_iteration": 2.71000337600708 + }, + { + "auxiliary_loss_clip": 0.01551054, + "auxiliary_loss_mlp": 0.00402663, + "balance_loss_clip": 1.25259805, + "balance_loss_mlp": 0.3646825, + "epoch": 0.4410040583195551, + "flos": 22668574798080.0, + "grad_norm": 4.2696264953422265, + "language_loss": 0.83417726, + "learning_rate": 2.47465981219252e-06, + "loss": 0.85371447, + "num_input_tokens_seen": 157218345, + "router_z_loss_clip": 2.98242188, + "router_z_loss_mlp": 0.37963867, + "step": 7335, + "time_per_iteration": 2.6681854724884033 + }, + { + "auxiliary_loss_clip": 0.01526602, + "auxiliary_loss_mlp": 0.00359668, + "balance_loss_clip": 1.2367146, + "balance_loss_mlp": 0.32280898, + "epoch": 0.44106418157222305, + "flos": 10852528700160.0, + "grad_norm": 48.90201854758476, + "language_loss": 0.79071516, + "learning_rate": 2.4742814687807423e-06, + "loss": 0.80957788, + "num_input_tokens_seen": 157234395, + "router_z_loss_clip": 2.8984375, + "router_z_loss_mlp": 0.3684082, + "step": 7336, + "time_per_iteration": 2.5969319343566895 + }, + { + "auxiliary_loss_clip": 0.01533157, + "auxiliary_loss_mlp": 0.00407182, + "balance_loss_clip": 1.23979831, + "balance_loss_mlp": 0.36619779, + "epoch": 0.441124304824891, + "flos": 21726710323200.0, + "grad_norm": 16.112672596976136, + "language_loss": 0.71734911, + "learning_rate": 2.473903107384165e-06, + "loss": 0.73675251, + "num_input_tokens_seen": 157254805, + "router_z_loss_clip": 2.93554688, + "router_z_loss_mlp": 0.40991211, + "step": 7337, + "time_per_iteration": 2.6690475940704346 + }, + { + "auxiliary_loss_clip": 0.01411648, + "auxiliary_loss_mlp": 0.00068199, + "balance_loss_clip": 1.23981726, + "balance_loss_mlp": 0.06047434, + "epoch": 0.441184428077559, + "flos": 63220486625280.0, + "grad_norm": 0.7346676939699418, + "language_loss": 0.52580285, + "learning_rate": 2.473524728017134e-06, + "loss": 0.54060125, + "num_input_tokens_seen": 157317870, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.07714844, + "step": 7338, + "time_per_iteration": 3.2402782440185547 + }, + { + "auxiliary_loss_clip": 0.01527824, + "auxiliary_loss_mlp": 0.00439321, + "balance_loss_clip": 1.23418951, + "balance_loss_mlp": 0.39774066, + "epoch": 0.44124455133022694, + "flos": 21177959270400.0, + "grad_norm": 7.972563300668592, + "language_loss": 0.78128791, + "learning_rate": 2.473146330693997e-06, + "loss": 0.80095935, + "num_input_tokens_seen": 157336505, + "router_z_loss_clip": 2.93554688, + "router_z_loss_mlp": 0.41552734, + "step": 7339, + "time_per_iteration": 2.629051685333252 + }, + { + "auxiliary_loss_clip": 0.01528178, + "auxiliary_loss_mlp": 0.00359734, + "balance_loss_clip": 1.25029492, + "balance_loss_mlp": 0.32728487, + "epoch": 0.4413046745828949, + "flos": 17457865453440.0, + "grad_norm": 4.3224456975866135, + "language_loss": 0.75865269, + "learning_rate": 2.472767915429105e-06, + "loss": 0.77753174, + "num_input_tokens_seen": 157354995, + "router_z_loss_clip": 2.77734375, + "router_z_loss_mlp": 0.32470703, + "step": 7340, + "time_per_iteration": 2.6086714267730713 + }, + { + "auxiliary_loss_clip": 0.0142935, + "auxiliary_loss_mlp": 0.00071986, + "balance_loss_clip": 1.24921811, + "balance_loss_mlp": 0.06469015, + "epoch": 0.4413647978355629, + "flos": 61586153804160.0, + "grad_norm": 0.8994122847059501, + "language_loss": 0.64050949, + "learning_rate": 2.4723894822368054e-06, + "loss": 0.65552282, + "num_input_tokens_seen": 157404260, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.07275391, + "step": 7341, + "time_per_iteration": 2.9027256965637207 + }, + { + "auxiliary_loss_clip": 0.01533075, + "auxiliary_loss_mlp": 0.00402685, + "balance_loss_clip": 1.24456191, + "balance_loss_mlp": 0.3653723, + "epoch": 0.4414249210882309, + "flos": 27527001505920.0, + "grad_norm": 11.92352291447366, + "language_loss": 0.80343819, + "learning_rate": 2.47201103113145e-06, + "loss": 0.82279581, + "num_input_tokens_seen": 157423045, + "router_z_loss_clip": 2.88476562, + "router_z_loss_mlp": 0.37329102, + "step": 7342, + "time_per_iteration": 2.6988160610198975 + }, + { + "auxiliary_loss_clip": 0.01512971, + "auxiliary_loss_mlp": 0.00380435, + "balance_loss_clip": 1.23045039, + "balance_loss_mlp": 0.34431446, + "epoch": 0.44148504434089886, + "flos": 23513984277120.0, + "grad_norm": 161.36027056968615, + "language_loss": 0.86088336, + "learning_rate": 2.4716325621273886e-06, + "loss": 0.87981743, + "num_input_tokens_seen": 157441815, + "router_z_loss_clip": 2.828125, + "router_z_loss_mlp": 0.36108398, + "step": 7343, + "time_per_iteration": 2.683457136154175 + }, + { + "auxiliary_loss_clip": 0.01505003, + "auxiliary_loss_mlp": 0.00375361, + "balance_loss_clip": 1.22456598, + "balance_loss_mlp": 0.34071916, + "epoch": 0.4415451675935668, + "flos": 21580589796480.0, + "grad_norm": 40.03970946119173, + "language_loss": 0.82178903, + "learning_rate": 2.4712540752389725e-06, + "loss": 0.84059268, + "num_input_tokens_seen": 157460470, + "router_z_loss_clip": 2.80664062, + "router_z_loss_mlp": 0.34594727, + "step": 7344, + "time_per_iteration": 2.652653217315674 + }, + { + "auxiliary_loss_clip": 0.01375238, + "auxiliary_loss_mlp": 0.00067613, + "balance_loss_clip": 1.21316767, + "balance_loss_mlp": 0.05998376, + "epoch": 0.4416052908462348, + "flos": 59006368126080.0, + "grad_norm": 0.8153351678240132, + "language_loss": 0.63365436, + "learning_rate": 2.470875570480556e-06, + "loss": 0.64808297, + "num_input_tokens_seen": 157512655, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.07617188, + "step": 7345, + "time_per_iteration": 2.88912296295166 + }, + { + "auxiliary_loss_clip": 0.01509689, + "auxiliary_loss_mlp": 0.00360786, + "balance_loss_clip": 1.22632587, + "balance_loss_mlp": 0.32831299, + "epoch": 0.44166541409890275, + "flos": 26357642242560.0, + "grad_norm": 9.923220876445336, + "language_loss": 0.90665513, + "learning_rate": 2.470497047866489e-06, + "loss": 0.92535985, + "num_input_tokens_seen": 157533700, + "router_z_loss_clip": 2.83398438, + "router_z_loss_mlp": 0.32446289, + "step": 7346, + "time_per_iteration": 2.6994500160217285 + }, + { + "auxiliary_loss_clip": 0.01505441, + "auxiliary_loss_mlp": 0.00343893, + "balance_loss_clip": 1.22021806, + "balance_loss_mlp": 0.30886891, + "epoch": 0.4417255373515707, + "flos": 20192678231040.0, + "grad_norm": 442.18172346590893, + "language_loss": 0.86228466, + "learning_rate": 2.470118507411128e-06, + "loss": 0.88077796, + "num_input_tokens_seen": 157551105, + "router_z_loss_clip": 2.84960938, + "router_z_loss_mlp": 0.35009766, + "step": 7347, + "time_per_iteration": 2.6699962615966797 + }, + { + "auxiliary_loss_clip": 0.01502968, + "auxiliary_loss_mlp": 0.00375922, + "balance_loss_clip": 1.2237674, + "balance_loss_mlp": 0.34130365, + "epoch": 0.4417856606042387, + "flos": 17887895078400.0, + "grad_norm": 15.243434358332195, + "language_loss": 0.89793408, + "learning_rate": 2.4697399491288263e-06, + "loss": 0.91672301, + "num_input_tokens_seen": 157568285, + "router_z_loss_clip": 2.7890625, + "router_z_loss_mlp": 0.34619141, + "step": 7348, + "time_per_iteration": 2.7058019638061523 + }, + { + "auxiliary_loss_clip": 0.01514252, + "auxiliary_loss_mlp": 0.0036732, + "balance_loss_clip": 1.23036826, + "balance_loss_mlp": 0.33079404, + "epoch": 0.44184578385690665, + "flos": 27964034282880.0, + "grad_norm": 6.458077647624142, + "language_loss": 0.7716949, + "learning_rate": 2.469361373033938e-06, + "loss": 0.79051059, + "num_input_tokens_seen": 157590405, + "router_z_loss_clip": 2.84179688, + "router_z_loss_mlp": 0.36547852, + "step": 7349, + "time_per_iteration": 2.733731508255005 + }, + { + "auxiliary_loss_clip": 0.01502865, + "auxiliary_loss_mlp": 0.00367298, + "balance_loss_clip": 1.2182219, + "balance_loss_mlp": 0.33113015, + "epoch": 0.4419059071095746, + "flos": 23367899664000.0, + "grad_norm": 111.23803766849683, + "language_loss": 0.8183924, + "learning_rate": 2.468982779140819e-06, + "loss": 0.83709395, + "num_input_tokens_seen": 157607420, + "router_z_loss_clip": 2.84570312, + "router_z_loss_mlp": 0.36206055, + "step": 7350, + "time_per_iteration": 2.658177137374878 + }, + { + "auxiliary_loss_clip": 0.01502889, + "auxiliary_loss_mlp": 0.00373219, + "balance_loss_clip": 1.21902871, + "balance_loss_mlp": 0.33807606, + "epoch": 0.4419660303622426, + "flos": 15012169246080.0, + "grad_norm": 8.672458713026966, + "language_loss": 0.8970421, + "learning_rate": 2.468604167463827e-06, + "loss": 0.91580319, + "num_input_tokens_seen": 157624990, + "router_z_loss_clip": 2.83984375, + "router_z_loss_mlp": 0.3515625, + "step": 7351, + "time_per_iteration": 2.659651517868042 + }, + { + "auxiliary_loss_clip": 0.01490363, + "auxiliary_loss_mlp": 0.00340327, + "balance_loss_clip": 1.21915388, + "balance_loss_mlp": 0.30823612, + "epoch": 0.44202615361491054, + "flos": 25371750672000.0, + "grad_norm": 9.620445212722936, + "language_loss": 0.78070331, + "learning_rate": 2.4682255380173176e-06, + "loss": 0.79901028, + "num_input_tokens_seen": 157645300, + "router_z_loss_clip": 2.71289062, + "router_z_loss_mlp": 0.32080078, + "step": 7352, + "time_per_iteration": 2.681013584136963 + }, + { + "auxiliary_loss_clip": 0.01524818, + "auxiliary_loss_mlp": 0.00370489, + "balance_loss_clip": 1.24181533, + "balance_loss_mlp": 0.33520317, + "epoch": 0.4420862768675785, + "flos": 24681116897280.0, + "grad_norm": 150.26796168536202, + "language_loss": 0.92828298, + "learning_rate": 2.467846890815649e-06, + "loss": 0.94723606, + "num_input_tokens_seen": 157664060, + "router_z_loss_clip": 2.83007812, + "router_z_loss_mlp": 0.35253906, + "step": 7353, + "time_per_iteration": 2.7944436073303223 + }, + { + "auxiliary_loss_clip": 0.01493554, + "auxiliary_loss_mlp": 0.00352704, + "balance_loss_clip": 1.21494699, + "balance_loss_mlp": 0.3179189, + "epoch": 0.44214640012024653, + "flos": 19528437974400.0, + "grad_norm": 3.9125179937377697, + "language_loss": 0.82754374, + "learning_rate": 2.4674682258731795e-06, + "loss": 0.84600627, + "num_input_tokens_seen": 157680905, + "router_z_loss_clip": 2.78710938, + "router_z_loss_mlp": 0.34765625, + "step": 7354, + "time_per_iteration": 2.7497243881225586 + }, + { + "auxiliary_loss_clip": 0.01501181, + "auxiliary_loss_mlp": 0.00328136, + "balance_loss_clip": 1.22831643, + "balance_loss_mlp": 0.29594964, + "epoch": 0.4422065233729145, + "flos": 47557434003840.0, + "grad_norm": 4.804159145189643, + "language_loss": 0.71200371, + "learning_rate": 2.467089543204268e-06, + "loss": 0.73029685, + "num_input_tokens_seen": 157701980, + "router_z_loss_clip": 2.73046875, + "router_z_loss_mlp": 0.32202148, + "step": 7355, + "time_per_iteration": 2.9092941284179688 + }, + { + "auxiliary_loss_clip": 0.01499564, + "auxiliary_loss_mlp": 0.00348828, + "balance_loss_clip": 1.21140432, + "balance_loss_mlp": 0.31177801, + "epoch": 0.44226664662558246, + "flos": 19281050029440.0, + "grad_norm": 34.1465890863076, + "language_loss": 0.86182636, + "learning_rate": 2.466710842823274e-06, + "loss": 0.8803103, + "num_input_tokens_seen": 157720555, + "router_z_loss_clip": 2.8828125, + "router_z_loss_mlp": 0.37084961, + "step": 7356, + "time_per_iteration": 2.6166698932647705 + }, + { + "auxiliary_loss_clip": 0.01508047, + "auxiliary_loss_mlp": 0.00379722, + "balance_loss_clip": 1.22579157, + "balance_loss_mlp": 0.34207559, + "epoch": 0.4423267698782504, + "flos": 17821820010240.0, + "grad_norm": 3.172619049800396, + "language_loss": 0.85102379, + "learning_rate": 2.4663321247445577e-06, + "loss": 0.86990148, + "num_input_tokens_seen": 157739160, + "router_z_loss_clip": 2.82226562, + "router_z_loss_mlp": 0.37670898, + "step": 7357, + "time_per_iteration": 2.607311248779297 + }, + { + "auxiliary_loss_clip": 0.01509354, + "auxiliary_loss_mlp": 0.00333683, + "balance_loss_clip": 1.23131323, + "balance_loss_mlp": 0.29854006, + "epoch": 0.4423868931309184, + "flos": 29204424691200.0, + "grad_norm": 25.891093085903112, + "language_loss": 0.79032338, + "learning_rate": 2.465953388982481e-06, + "loss": 0.80875373, + "num_input_tokens_seen": 157760020, + "router_z_loss_clip": 2.77929688, + "router_z_loss_mlp": 0.35107422, + "step": 7358, + "time_per_iteration": 2.708587884902954 + }, + { + "auxiliary_loss_clip": 0.0150873, + "auxiliary_loss_mlp": 0.00342297, + "balance_loss_clip": 1.22904205, + "balance_loss_mlp": 0.3077026, + "epoch": 0.44244701638358636, + "flos": 29713135057920.0, + "grad_norm": 2.5364229223633092, + "language_loss": 0.82089138, + "learning_rate": 2.465574635551405e-06, + "loss": 0.83940166, + "num_input_tokens_seen": 157780435, + "router_z_loss_clip": 2.79492188, + "router_z_loss_mlp": 0.34619141, + "step": 7359, + "time_per_iteration": 2.7162482738494873 + }, + { + "auxiliary_loss_clip": 0.01500945, + "auxiliary_loss_mlp": 0.00318356, + "balance_loss_clip": 1.22462523, + "balance_loss_mlp": 0.28485852, + "epoch": 0.4425071396362543, + "flos": 22930040874240.0, + "grad_norm": 24.69154530114858, + "language_loss": 0.76055861, + "learning_rate": 2.4651958644656923e-06, + "loss": 0.77875167, + "num_input_tokens_seen": 157799420, + "router_z_loss_clip": 2.75976562, + "router_z_loss_mlp": 0.33496094, + "step": 7360, + "time_per_iteration": 2.662841558456421 + }, + { + "auxiliary_loss_clip": 0.01492665, + "auxiliary_loss_mlp": 0.00335542, + "balance_loss_clip": 1.2135874, + "balance_loss_mlp": 0.30130535, + "epoch": 0.4425672628889223, + "flos": 19792346175360.0, + "grad_norm": 89.01171471225564, + "language_loss": 0.76572299, + "learning_rate": 2.4648170757397053e-06, + "loss": 0.78400499, + "num_input_tokens_seen": 157817025, + "router_z_loss_clip": 2.79101562, + "router_z_loss_mlp": 0.3425293, + "step": 7361, + "time_per_iteration": 2.6488630771636963 + }, + { + "auxiliary_loss_clip": 0.01510994, + "auxiliary_loss_mlp": 0.0034053, + "balance_loss_clip": 1.22738039, + "balance_loss_mlp": 0.30438614, + "epoch": 0.44262738614159025, + "flos": 13662215377920.0, + "grad_norm": 45.68039025799055, + "language_loss": 0.9239856, + "learning_rate": 2.464438269387809e-06, + "loss": 0.94250083, + "num_input_tokens_seen": 157834345, + "router_z_loss_clip": 2.83398438, + "router_z_loss_mlp": 0.36157227, + "step": 7362, + "time_per_iteration": 4.074936628341675 + }, + { + "auxiliary_loss_clip": 0.01514538, + "auxiliary_loss_mlp": 0.0036916, + "balance_loss_clip": 1.22385216, + "balance_loss_mlp": 0.33072704, + "epoch": 0.4426875093942582, + "flos": 14210212245120.0, + "grad_norm": 65.72490082506302, + "language_loss": 0.827806, + "learning_rate": 2.464059445424366e-06, + "loss": 0.84664303, + "num_input_tokens_seen": 157852290, + "router_z_loss_clip": 2.91015625, + "router_z_loss_mlp": 0.38476562, + "step": 7363, + "time_per_iteration": 2.6360108852386475 + }, + { + "auxiliary_loss_clip": 0.01318671, + "auxiliary_loss_mlp": 0.000587, + "balance_loss_clip": 1.15640724, + "balance_loss_mlp": 0.05178603, + "epoch": 0.4427476326469262, + "flos": 70117525728000.0, + "grad_norm": 0.7090862645362096, + "language_loss": 0.55368328, + "learning_rate": 2.463680603863743e-06, + "loss": 0.56745696, + "num_input_tokens_seen": 157923060, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.06933594, + "step": 7364, + "time_per_iteration": 3.2610929012298584 + }, + { + "auxiliary_loss_clip": 0.01494995, + "auxiliary_loss_mlp": 0.00303709, + "balance_loss_clip": 1.21985149, + "balance_loss_mlp": 0.27171323, + "epoch": 0.44280775589959415, + "flos": 25445080287360.0, + "grad_norm": 9.595097590887256, + "language_loss": 0.8025229, + "learning_rate": 2.463301744720305e-06, + "loss": 0.82050991, + "num_input_tokens_seen": 157944110, + "router_z_loss_clip": 2.75390625, + "router_z_loss_mlp": 0.31982422, + "step": 7365, + "time_per_iteration": 4.156264066696167 + }, + { + "auxiliary_loss_clip": 0.0149048, + "auxiliary_loss_mlp": 0.00316386, + "balance_loss_clip": 1.21844745, + "balance_loss_mlp": 0.28281641, + "epoch": 0.4428678791522621, + "flos": 22857214049280.0, + "grad_norm": 18.96025807532063, + "language_loss": 0.8075214, + "learning_rate": 2.4629228680084184e-06, + "loss": 0.82559007, + "num_input_tokens_seen": 157964295, + "router_z_loss_clip": 2.72070312, + "router_z_loss_mlp": 0.33569336, + "step": 7366, + "time_per_iteration": 2.671660900115967 + }, + { + "auxiliary_loss_clip": 0.01491695, + "auxiliary_loss_mlp": 0.00322703, + "balance_loss_clip": 1.21698737, + "balance_loss_mlp": 0.28865695, + "epoch": 0.44292800240493013, + "flos": 25812446636160.0, + "grad_norm": 38.48023088836553, + "language_loss": 0.81476223, + "learning_rate": 2.46254397374245e-06, + "loss": 0.83290625, + "num_input_tokens_seen": 157983970, + "router_z_loss_clip": 2.74609375, + "router_z_loss_mlp": 0.34057617, + "step": 7367, + "time_per_iteration": 4.116817474365234 + }, + { + "auxiliary_loss_clip": 0.01489024, + "auxiliary_loss_mlp": 0.00330533, + "balance_loss_clip": 1.21298957, + "balance_loss_mlp": 0.29565281, + "epoch": 0.4429881256575981, + "flos": 32416885549440.0, + "grad_norm": 12.751806792996721, + "language_loss": 0.79056692, + "learning_rate": 2.4621650619367677e-06, + "loss": 0.80876243, + "num_input_tokens_seen": 158006515, + "router_z_loss_clip": 2.76171875, + "router_z_loss_mlp": 0.34863281, + "step": 7368, + "time_per_iteration": 2.771739959716797 + }, + { + "auxiliary_loss_clip": 0.01492146, + "auxiliary_loss_mlp": 0.00337218, + "balance_loss_clip": 1.21599019, + "balance_loss_mlp": 0.30300462, + "epoch": 0.44304824891026606, + "flos": 22163707186560.0, + "grad_norm": 3.650854378386452, + "language_loss": 0.85836583, + "learning_rate": 2.4617861326057403e-06, + "loss": 0.87665945, + "num_input_tokens_seen": 158025565, + "router_z_loss_clip": 2.76171875, + "router_z_loss_mlp": 0.34228516, + "step": 7369, + "time_per_iteration": 2.7243847846984863 + }, + { + "auxiliary_loss_clip": 0.01479017, + "auxiliary_loss_mlp": 0.00317219, + "balance_loss_clip": 1.20832276, + "balance_loss_mlp": 0.28355405, + "epoch": 0.443108372162934, + "flos": 25338569483520.0, + "grad_norm": 3.8420618089989302, + "language_loss": 0.80180967, + "learning_rate": 2.461407185763737e-06, + "loss": 0.81977201, + "num_input_tokens_seen": 158045620, + "router_z_loss_clip": 2.70898438, + "router_z_loss_mlp": 0.33666992, + "step": 7370, + "time_per_iteration": 2.6775269508361816 + }, + { + "auxiliary_loss_clip": 0.01476482, + "auxiliary_loss_mlp": 0.00296891, + "balance_loss_clip": 1.20603943, + "balance_loss_mlp": 0.26322651, + "epoch": 0.443168495415602, + "flos": 23330947547520.0, + "grad_norm": 119.09855689354859, + "language_loss": 0.77017558, + "learning_rate": 2.461028221425126e-06, + "loss": 0.78790927, + "num_input_tokens_seen": 158063505, + "router_z_loss_clip": 2.70703125, + "router_z_loss_mlp": 0.33691406, + "step": 7371, + "time_per_iteration": 2.948523998260498 + }, + { + "auxiliary_loss_clip": 0.01486214, + "auxiliary_loss_mlp": 0.00305856, + "balance_loss_clip": 1.21381164, + "balance_loss_mlp": 0.27245343, + "epoch": 0.44322861866826996, + "flos": 21871502046720.0, + "grad_norm": 92.36824571186473, + "language_loss": 0.76118982, + "learning_rate": 2.4606492396042786e-06, + "loss": 0.77911055, + "num_input_tokens_seen": 158080335, + "router_z_loss_clip": 2.72460938, + "router_z_loss_mlp": 0.33398438, + "step": 7372, + "time_per_iteration": 4.055301189422607 + }, + { + "auxiliary_loss_clip": 0.01497385, + "auxiliary_loss_mlp": 0.00347631, + "balance_loss_clip": 1.21368384, + "balance_loss_mlp": 0.30943596, + "epoch": 0.4432887419209379, + "flos": 20084407660800.0, + "grad_norm": 69.1974292053158, + "language_loss": 0.89591253, + "learning_rate": 2.4602702403155664e-06, + "loss": 0.91436267, + "num_input_tokens_seen": 158098955, + "router_z_loss_clip": 2.83398438, + "router_z_loss_mlp": 0.38232422, + "step": 7373, + "time_per_iteration": 2.640072822570801 + }, + { + "auxiliary_loss_clip": 0.01326175, + "auxiliary_loss_mlp": 0.00040732, + "balance_loss_clip": 1.15817428, + "balance_loss_mlp": 0.03224437, + "epoch": 0.4433488651736059, + "flos": 70035540935040.0, + "grad_norm": 0.7562299881503296, + "language_loss": 0.5503521, + "learning_rate": 2.4598912235733604e-06, + "loss": 0.56402111, + "num_input_tokens_seen": 158164110, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.08496094, + "step": 7374, + "time_per_iteration": 3.1987414360046387 + }, + { + "auxiliary_loss_clip": 0.01470297, + "auxiliary_loss_mlp": 0.00322289, + "balance_loss_clip": 1.19920063, + "balance_loss_mlp": 0.28788489, + "epoch": 0.44340898842627385, + "flos": 16282472705280.0, + "grad_norm": 15.352091416458023, + "language_loss": 0.90782213, + "learning_rate": 2.4595121893920327e-06, + "loss": 0.92574799, + "num_input_tokens_seen": 158179850, + "router_z_loss_clip": 2.7109375, + "router_z_loss_mlp": 0.34399414, + "step": 7375, + "time_per_iteration": 2.691990613937378 + }, + { + "auxiliary_loss_clip": 0.01499474, + "auxiliary_loss_mlp": 0.00330804, + "balance_loss_clip": 1.21986115, + "balance_loss_mlp": 0.29492235, + "epoch": 0.4434691116789418, + "flos": 16611989097600.0, + "grad_norm": 8.286118871483028, + "language_loss": 0.88943923, + "learning_rate": 2.4591331377859578e-06, + "loss": 0.90774202, + "num_input_tokens_seen": 158196590, + "router_z_loss_clip": 2.796875, + "router_z_loss_mlp": 0.35888672, + "step": 7376, + "time_per_iteration": 2.666522741317749 + }, + { + "auxiliary_loss_clip": 0.0148802, + "auxiliary_loss_mlp": 0.00321001, + "balance_loss_clip": 1.21283746, + "balance_loss_mlp": 0.28640643, + "epoch": 0.4435292349316098, + "flos": 19063251912960.0, + "grad_norm": 13.80232097569648, + "language_loss": 0.84935772, + "learning_rate": 2.4587540687695077e-06, + "loss": 0.86744797, + "num_input_tokens_seen": 158216355, + "router_z_loss_clip": 2.75195312, + "router_z_loss_mlp": 0.34570312, + "step": 7377, + "time_per_iteration": 2.651468515396118 + }, + { + "auxiliary_loss_clip": 0.0148892, + "auxiliary_loss_mlp": 0.00268492, + "balance_loss_clip": 1.22175419, + "balance_loss_mlp": 0.23554239, + "epoch": 0.44358935818427775, + "flos": 21251324799360.0, + "grad_norm": 20.54575649723188, + "language_loss": 0.83723557, + "learning_rate": 2.458374982357057e-06, + "loss": 0.85480964, + "num_input_tokens_seen": 158235825, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.32958984, + "step": 7378, + "time_per_iteration": 2.6757638454437256 + }, + { + "auxiliary_loss_clip": 0.01488314, + "auxiliary_loss_mlp": 0.00317525, + "balance_loss_clip": 1.21110821, + "balance_loss_mlp": 0.28254941, + "epoch": 0.4436494814369457, + "flos": 12495298239360.0, + "grad_norm": 59.64743616829395, + "language_loss": 0.78745347, + "learning_rate": 2.457995878562982e-06, + "loss": 0.80551189, + "num_input_tokens_seen": 158254230, + "router_z_loss_clip": 2.77539062, + "router_z_loss_mlp": 0.34960938, + "step": 7379, + "time_per_iteration": 2.6910157203674316 + }, + { + "auxiliary_loss_clip": 0.01507539, + "auxiliary_loss_mlp": 0.00298936, + "balance_loss_clip": 1.23374844, + "balance_loss_mlp": 0.26338822, + "epoch": 0.44370960468961373, + "flos": 23659853408640.0, + "grad_norm": 7.368121162923069, + "language_loss": 0.80060875, + "learning_rate": 2.457616757401656e-06, + "loss": 0.81867349, + "num_input_tokens_seen": 158273400, + "router_z_loss_clip": 2.73828125, + "router_z_loss_mlp": 0.35546875, + "step": 7380, + "time_per_iteration": 2.68861722946167 + }, + { + "auxiliary_loss_clip": 0.01500587, + "auxiliary_loss_mlp": 0.00302631, + "balance_loss_clip": 1.22417188, + "balance_loss_mlp": 0.26779824, + "epoch": 0.4437697279422817, + "flos": 32416849635840.0, + "grad_norm": 39.158366468956, + "language_loss": 0.70083582, + "learning_rate": 2.457237618887458e-06, + "loss": 0.71886808, + "num_input_tokens_seen": 158296840, + "router_z_loss_clip": 2.76171875, + "router_z_loss_mlp": 0.34838867, + "step": 7381, + "time_per_iteration": 2.7239747047424316 + }, + { + "auxiliary_loss_clip": 0.01528094, + "auxiliary_loss_mlp": 0.00332867, + "balance_loss_clip": 1.25050306, + "balance_loss_mlp": 0.29796267, + "epoch": 0.44382985119494966, + "flos": 18112875914880.0, + "grad_norm": 4.939166385335575, + "language_loss": 0.87970662, + "learning_rate": 2.456858463034763e-06, + "loss": 0.8983162, + "num_input_tokens_seen": 158314935, + "router_z_loss_clip": 2.77734375, + "router_z_loss_mlp": 0.34887695, + "step": 7382, + "time_per_iteration": 2.638709545135498 + }, + { + "auxiliary_loss_clip": 0.01514627, + "auxiliary_loss_mlp": 0.00326213, + "balance_loss_clip": 1.2362349, + "balance_loss_mlp": 0.2898308, + "epoch": 0.44388997444761763, + "flos": 30774151923840.0, + "grad_norm": 145.57163664713198, + "language_loss": 0.72082424, + "learning_rate": 2.456479289857949e-06, + "loss": 0.73923266, + "num_input_tokens_seen": 158334620, + "router_z_loss_clip": 2.78320312, + "router_z_loss_mlp": 0.36401367, + "step": 7383, + "time_per_iteration": 2.747565746307373 + }, + { + "auxiliary_loss_clip": 0.01507398, + "auxiliary_loss_mlp": 0.00341351, + "balance_loss_clip": 1.22645998, + "balance_loss_mlp": 0.30689931, + "epoch": 0.4439500977002856, + "flos": 20339157893760.0, + "grad_norm": 8.413238925963759, + "language_loss": 0.85080254, + "learning_rate": 2.4561000993713953e-06, + "loss": 0.86929005, + "num_input_tokens_seen": 158350550, + "router_z_loss_clip": 2.8125, + "router_z_loss_mlp": 0.34448242, + "step": 7384, + "time_per_iteration": 2.657088041305542 + }, + { + "auxiliary_loss_clip": 0.01506824, + "auxiliary_loss_mlp": 0.00350098, + "balance_loss_clip": 1.22325039, + "balance_loss_mlp": 0.31273752, + "epoch": 0.44401022095295356, + "flos": 20371225760640.0, + "grad_norm": 4.069574597396487, + "language_loss": 0.86839998, + "learning_rate": 2.4557208915894796e-06, + "loss": 0.88696921, + "num_input_tokens_seen": 158369555, + "router_z_loss_clip": 2.8359375, + "router_z_loss_mlp": 0.37353516, + "step": 7385, + "time_per_iteration": 2.7047667503356934 + }, + { + "auxiliary_loss_clip": 0.01504405, + "auxiliary_loss_mlp": 0.00312655, + "balance_loss_clip": 1.22924984, + "balance_loss_mlp": 0.27410299, + "epoch": 0.4440703442056215, + "flos": 20230635928320.0, + "grad_norm": 7.131566844388839, + "language_loss": 0.8773967, + "learning_rate": 2.455341666526582e-06, + "loss": 0.8955673, + "num_input_tokens_seen": 158388045, + "router_z_loss_clip": 2.75195312, + "router_z_loss_mlp": 0.38574219, + "step": 7386, + "time_per_iteration": 2.647714614868164 + }, + { + "auxiliary_loss_clip": 0.01528697, + "auxiliary_loss_mlp": 0.00361292, + "balance_loss_clip": 1.23668694, + "balance_loss_mlp": 0.32357419, + "epoch": 0.4441304674582895, + "flos": 39494698824960.0, + "grad_norm": 198.9616305560583, + "language_loss": 0.77455866, + "learning_rate": 2.4549624241970832e-06, + "loss": 0.79345864, + "num_input_tokens_seen": 158410115, + "router_z_loss_clip": 2.921875, + "router_z_loss_mlp": 0.37719727, + "step": 7387, + "time_per_iteration": 2.8029348850250244 + }, + { + "auxiliary_loss_clip": 0.0150889, + "auxiliary_loss_mlp": 0.00334017, + "balance_loss_clip": 1.23124552, + "balance_loss_mlp": 0.29801536, + "epoch": 0.44419059071095746, + "flos": 14829671220480.0, + "grad_norm": 4.966003801294005, + "language_loss": 0.77591938, + "learning_rate": 2.4545831646153628e-06, + "loss": 0.79434848, + "num_input_tokens_seen": 158427765, + "router_z_loss_clip": 2.77539062, + "router_z_loss_mlp": 0.35961914, + "step": 7388, + "time_per_iteration": 2.6688618659973145 + }, + { + "auxiliary_loss_clip": 0.01501353, + "auxiliary_loss_mlp": 0.00327151, + "balance_loss_clip": 1.22067046, + "balance_loss_mlp": 0.29031533, + "epoch": 0.4442507139636254, + "flos": 22637835734400.0, + "grad_norm": 3.565729918194898, + "language_loss": 0.76574779, + "learning_rate": 2.4542038877958044e-06, + "loss": 0.78403282, + "num_input_tokens_seen": 158446375, + "router_z_loss_clip": 2.80664062, + "router_z_loss_mlp": 0.3684082, + "step": 7389, + "time_per_iteration": 2.673527717590332 + }, + { + "auxiliary_loss_clip": 0.01513838, + "auxiliary_loss_mlp": 0.00311363, + "balance_loss_clip": 1.23733866, + "balance_loss_mlp": 0.2766971, + "epoch": 0.4443108372162934, + "flos": 38290721829120.0, + "grad_norm": 4.35627762966236, + "language_loss": 0.82130247, + "learning_rate": 2.453824593752788e-06, + "loss": 0.83955443, + "num_input_tokens_seen": 158467260, + "router_z_loss_clip": 2.76757812, + "router_z_loss_mlp": 0.34667969, + "step": 7390, + "time_per_iteration": 2.811337471008301 + }, + { + "auxiliary_loss_clip": 0.01510639, + "auxiliary_loss_mlp": 0.00292695, + "balance_loss_clip": 1.23486364, + "balance_loss_mlp": 0.2584815, + "epoch": 0.44437096046896135, + "flos": 17748993185280.0, + "grad_norm": 26.370569215225114, + "language_loss": 0.89795029, + "learning_rate": 2.4534452825006988e-06, + "loss": 0.91598356, + "num_input_tokens_seen": 158486720, + "router_z_loss_clip": 2.7578125, + "router_z_loss_mlp": 0.34228516, + "step": 7391, + "time_per_iteration": 2.7378737926483154 + }, + { + "auxiliary_loss_clip": 0.01510541, + "auxiliary_loss_mlp": 0.00298344, + "balance_loss_clip": 1.2380724, + "balance_loss_mlp": 0.26472709, + "epoch": 0.4444310837216293, + "flos": 13732348682880.0, + "grad_norm": 12.316481854077338, + "language_loss": 0.80677849, + "learning_rate": 2.4530659540539185e-06, + "loss": 0.82486737, + "num_input_tokens_seen": 158502530, + "router_z_loss_clip": 2.72460938, + "router_z_loss_mlp": 0.3359375, + "step": 7392, + "time_per_iteration": 2.628227472305298 + }, + { + "auxiliary_loss_clip": 0.01498352, + "auxiliary_loss_mlp": 0.00277137, + "balance_loss_clip": 1.22894454, + "balance_loss_mlp": 0.245094, + "epoch": 0.44449120697429734, + "flos": 25010238240000.0, + "grad_norm": 17.470729747963926, + "language_loss": 0.84760112, + "learning_rate": 2.4526866084268313e-06, + "loss": 0.86535603, + "num_input_tokens_seen": 158522715, + "router_z_loss_clip": 2.69726562, + "router_z_loss_mlp": 0.3203125, + "step": 7393, + "time_per_iteration": 2.752321481704712 + }, + { + "auxiliary_loss_clip": 0.01520651, + "auxiliary_loss_mlp": 0.00342303, + "balance_loss_clip": 1.24248588, + "balance_loss_mlp": 0.30446592, + "epoch": 0.4445513302269653, + "flos": 32671707609600.0, + "grad_norm": 15.426973792955925, + "language_loss": 0.8767969, + "learning_rate": 2.4523072456338226e-06, + "loss": 0.89542645, + "num_input_tokens_seen": 158543615, + "router_z_loss_clip": 2.78515625, + "router_z_loss_mlp": 0.37817383, + "step": 7394, + "time_per_iteration": 2.7762956619262695 + }, + { + "auxiliary_loss_clip": 0.01503101, + "auxiliary_loss_mlp": 0.0028533, + "balance_loss_clip": 1.22847271, + "balance_loss_mlp": 0.25109342, + "epoch": 0.44461145347963327, + "flos": 11655814504320.0, + "grad_norm": 2.3983766870555203, + "language_loss": 0.88372791, + "learning_rate": 2.4519278656892785e-06, + "loss": 0.90161222, + "num_input_tokens_seen": 158560330, + "router_z_loss_clip": 2.74414062, + "router_z_loss_mlp": 0.34228516, + "step": 7395, + "time_per_iteration": 2.6172335147857666 + }, + { + "auxiliary_loss_clip": 0.01508623, + "auxiliary_loss_mlp": 0.00305501, + "balance_loss_clip": 1.2343421, + "balance_loss_mlp": 0.26976186, + "epoch": 0.44467157673230123, + "flos": 20886759711360.0, + "grad_norm": 4.643988596067947, + "language_loss": 0.78440225, + "learning_rate": 2.451548468607584e-06, + "loss": 0.80254352, + "num_input_tokens_seen": 158579735, + "router_z_loss_clip": 2.74023438, + "router_z_loss_mlp": 0.35742188, + "step": 7396, + "time_per_iteration": 2.6516571044921875 + }, + { + "auxiliary_loss_clip": 0.01511281, + "auxiliary_loss_mlp": 0.00330208, + "balance_loss_clip": 1.23558497, + "balance_loss_mlp": 0.29291981, + "epoch": 0.4447316999849692, + "flos": 18546137763840.0, + "grad_norm": 5.157042301305465, + "language_loss": 0.87367821, + "learning_rate": 2.451169054403126e-06, + "loss": 0.89209312, + "num_input_tokens_seen": 158597075, + "router_z_loss_clip": 2.75585938, + "router_z_loss_mlp": 0.37304688, + "step": 7397, + "time_per_iteration": 2.6834349632263184 + }, + { + "auxiliary_loss_clip": 0.01520881, + "auxiliary_loss_mlp": 0.00310523, + "balance_loss_clip": 1.248703, + "balance_loss_mlp": 0.27538007, + "epoch": 0.44479182323763716, + "flos": 23769057732480.0, + "grad_norm": 15.378007665757304, + "language_loss": 0.70271599, + "learning_rate": 2.450789623090293e-06, + "loss": 0.72103, + "num_input_tokens_seen": 158616650, + "router_z_loss_clip": 2.72460938, + "router_z_loss_mlp": 0.3515625, + "step": 7398, + "time_per_iteration": 2.7403244972229004 + }, + { + "auxiliary_loss_clip": 0.01510861, + "auxiliary_loss_mlp": 0.0031469, + "balance_loss_clip": 1.23742545, + "balance_loss_mlp": 0.28011954, + "epoch": 0.44485194649030513, + "flos": 16543831040640.0, + "grad_norm": 2.383289388184021, + "language_loss": 0.77566087, + "learning_rate": 2.450410174683472e-06, + "loss": 0.79391646, + "num_input_tokens_seen": 158634515, + "router_z_loss_clip": 2.73632812, + "router_z_loss_mlp": 0.34594727, + "step": 7399, + "time_per_iteration": 2.8116514682769775 + }, + { + "auxiliary_loss_clip": 0.01488286, + "auxiliary_loss_mlp": 0.00290113, + "balance_loss_clip": 1.2194314, + "balance_loss_mlp": 0.25599501, + "epoch": 0.4449120697429731, + "flos": 22600955445120.0, + "grad_norm": 39.10449124341703, + "language_loss": 0.78327364, + "learning_rate": 2.4500307091970514e-06, + "loss": 0.80105758, + "num_input_tokens_seen": 158653760, + "router_z_loss_clip": 2.6875, + "router_z_loss_mlp": 0.34106445, + "step": 7400, + "time_per_iteration": 2.726386547088623 + }, + { + "auxiliary_loss_clip": 0.01497152, + "auxiliary_loss_mlp": 0.00289243, + "balance_loss_clip": 1.22755849, + "balance_loss_mlp": 0.25488681, + "epoch": 0.44497219299564106, + "flos": 20004864992640.0, + "grad_norm": 2.5466734175810553, + "language_loss": 0.91669905, + "learning_rate": 2.449651226645422e-06, + "loss": 0.93456298, + "num_input_tokens_seen": 158672190, + "router_z_loss_clip": 2.69335938, + "router_z_loss_mlp": 0.34350586, + "step": 7401, + "time_per_iteration": 2.677379608154297 + }, + { + "auxiliary_loss_clip": 0.01499935, + "auxiliary_loss_mlp": 0.00289602, + "balance_loss_clip": 1.22988105, + "balance_loss_mlp": 0.25474498, + "epoch": 0.445032316248309, + "flos": 25594253470080.0, + "grad_norm": 25.57362821901347, + "language_loss": 0.87796652, + "learning_rate": 2.449271727042973e-06, + "loss": 0.89586186, + "num_input_tokens_seen": 158694115, + "router_z_loss_clip": 2.70117188, + "router_z_loss_mlp": 0.34863281, + "step": 7402, + "time_per_iteration": 2.865997076034546 + }, + { + "auxiliary_loss_clip": 0.01514107, + "auxiliary_loss_mlp": 0.00290931, + "balance_loss_clip": 1.23669362, + "balance_loss_mlp": 0.25817251, + "epoch": 0.445092439500977, + "flos": 21250426959360.0, + "grad_norm": 10.174356754244922, + "language_loss": 0.83075035, + "learning_rate": 2.4488922104040947e-06, + "loss": 0.84880066, + "num_input_tokens_seen": 158711000, + "router_z_loss_clip": 2.77148438, + "router_z_loss_mlp": 0.32788086, + "step": 7403, + "time_per_iteration": 2.6538796424865723 + }, + { + "auxiliary_loss_clip": 0.01350537, + "auxiliary_loss_mlp": 0.00029565, + "balance_loss_clip": 1.19559813, + "balance_loss_mlp": 0.02203143, + "epoch": 0.44515256275364495, + "flos": 57764900309760.0, + "grad_norm": 0.7711229171873077, + "language_loss": 0.59594095, + "learning_rate": 2.4485126767431793e-06, + "loss": 0.60974193, + "num_input_tokens_seen": 158769675, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.07519531, + "step": 7404, + "time_per_iteration": 4.52139139175415 + }, + { + "auxiliary_loss_clip": 0.01522804, + "auxiliary_loss_mlp": 0.00336375, + "balance_loss_clip": 1.23962986, + "balance_loss_mlp": 0.29796591, + "epoch": 0.4452126860063129, + "flos": 15596004908160.0, + "grad_norm": 20.017628006201114, + "language_loss": 0.88069642, + "learning_rate": 2.4481331260746177e-06, + "loss": 0.89928824, + "num_input_tokens_seen": 158788215, + "router_z_loss_clip": 2.8359375, + "router_z_loss_mlp": 0.38427734, + "step": 7405, + "time_per_iteration": 2.713597297668457 + }, + { + "auxiliary_loss_clip": 0.01500241, + "auxiliary_loss_mlp": 0.0031142, + "balance_loss_clip": 1.2289257, + "balance_loss_mlp": 0.27625346, + "epoch": 0.4452728092589809, + "flos": 21617398258560.0, + "grad_norm": 1.8867227193685747, + "language_loss": 0.81147265, + "learning_rate": 2.4477535584128036e-06, + "loss": 0.82958925, + "num_input_tokens_seen": 158809090, + "router_z_loss_clip": 2.71289062, + "router_z_loss_mlp": 0.3515625, + "step": 7406, + "time_per_iteration": 2.7076852321624756 + }, + { + "auxiliary_loss_clip": 0.01494115, + "auxiliary_loss_mlp": 0.00279263, + "balance_loss_clip": 1.22892833, + "balance_loss_mlp": 0.2476006, + "epoch": 0.4453329325116489, + "flos": 29497491757440.0, + "grad_norm": 46.767230273914, + "language_loss": 0.7193138, + "learning_rate": 2.447373973772129e-06, + "loss": 0.73704761, + "num_input_tokens_seen": 158828320, + "router_z_loss_clip": 2.65039062, + "router_z_loss_mlp": 0.31665039, + "step": 7407, + "time_per_iteration": 4.203291893005371 + }, + { + "auxiliary_loss_clip": 0.01509004, + "auxiliary_loss_mlp": 0.00308427, + "balance_loss_clip": 1.23525667, + "balance_loss_mlp": 0.27333191, + "epoch": 0.44539305576431687, + "flos": 21361139654400.0, + "grad_norm": 2.272648608768517, + "language_loss": 0.7383877, + "learning_rate": 2.4469943721669887e-06, + "loss": 0.75656199, + "num_input_tokens_seen": 158847040, + "router_z_loss_clip": 2.74023438, + "router_z_loss_mlp": 0.35107422, + "step": 7408, + "time_per_iteration": 2.666595935821533 + }, + { + "auxiliary_loss_clip": 0.01487257, + "auxiliary_loss_mlp": 0.00307466, + "balance_loss_clip": 1.21389294, + "balance_loss_mlp": 0.27389702, + "epoch": 0.44545317901698483, + "flos": 41427626428800.0, + "grad_norm": 151.53298610785535, + "language_loss": 0.76463622, + "learning_rate": 2.4466147536117776e-06, + "loss": 0.78258348, + "num_input_tokens_seen": 158870490, + "router_z_loss_clip": 2.73242188, + "router_z_loss_mlp": 0.33569336, + "step": 7409, + "time_per_iteration": 4.267614364624023 + }, + { + "auxiliary_loss_clip": 0.01488998, + "auxiliary_loss_mlp": 0.00324289, + "balance_loss_clip": 1.21500993, + "balance_loss_mlp": 0.2879782, + "epoch": 0.4455133022696528, + "flos": 22055005653120.0, + "grad_norm": 6.256165030796798, + "language_loss": 0.72950041, + "learning_rate": 2.4462351181208895e-06, + "loss": 0.74763334, + "num_input_tokens_seen": 158889920, + "router_z_loss_clip": 2.7421875, + "router_z_loss_mlp": 0.36303711, + "step": 7410, + "time_per_iteration": 2.7130978107452393 + }, + { + "auxiliary_loss_clip": 0.01502566, + "auxiliary_loss_mlp": 0.00308787, + "balance_loss_clip": 1.22576189, + "balance_loss_mlp": 0.27345306, + "epoch": 0.44557342552232077, + "flos": 23476960333440.0, + "grad_norm": 10.020503099638624, + "language_loss": 0.82790095, + "learning_rate": 2.4458554657087217e-06, + "loss": 0.8460145, + "num_input_tokens_seen": 158909580, + "router_z_loss_clip": 2.76757812, + "router_z_loss_mlp": 0.35327148, + "step": 7411, + "time_per_iteration": 2.6906332969665527 + }, + { + "auxiliary_loss_clip": 0.01494753, + "auxiliary_loss_mlp": 0.00271125, + "balance_loss_clip": 1.23452771, + "balance_loss_mlp": 0.24008319, + "epoch": 0.44563354877498873, + "flos": 19134678107520.0, + "grad_norm": 7.9365424996563805, + "language_loss": 0.83879495, + "learning_rate": 2.4454757963896695e-06, + "loss": 0.85645372, + "num_input_tokens_seen": 158924600, + "router_z_loss_clip": 2.59960938, + "router_z_loss_mlp": 0.31054688, + "step": 7412, + "time_per_iteration": 2.774928092956543 + }, + { + "auxiliary_loss_clip": 0.01493047, + "auxiliary_loss_mlp": 0.00309967, + "balance_loss_clip": 1.21845067, + "balance_loss_mlp": 0.2764934, + "epoch": 0.4456936720276567, + "flos": 13621420506240.0, + "grad_norm": 8.954484164143432, + "language_loss": 0.87865651, + "learning_rate": 2.4450961101781304e-06, + "loss": 0.89668667, + "num_input_tokens_seen": 158939345, + "router_z_loss_clip": 2.75, + "router_z_loss_mlp": 0.33496094, + "step": 7413, + "time_per_iteration": 2.6444928646087646 + }, + { + "auxiliary_loss_clip": 0.01482935, + "auxiliary_loss_mlp": 0.00294409, + "balance_loss_clip": 1.21605968, + "balance_loss_mlp": 0.26207942, + "epoch": 0.44575379528032466, + "flos": 14713715139840.0, + "grad_norm": 3.4000939208585796, + "language_loss": 0.82686448, + "learning_rate": 2.4447164070885026e-06, + "loss": 0.84463787, + "num_input_tokens_seen": 158955855, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.32299805, + "step": 7414, + "time_per_iteration": 2.7273757457733154 + }, + { + "auxiliary_loss_clip": 0.01486806, + "auxiliary_loss_mlp": 0.00309577, + "balance_loss_clip": 1.21533298, + "balance_loss_mlp": 0.27410069, + "epoch": 0.4458139185329926, + "flos": 24170682677760.0, + "grad_norm": 24.388500680742577, + "language_loss": 0.90118062, + "learning_rate": 2.4443366871351837e-06, + "loss": 0.91914445, + "num_input_tokens_seen": 158976315, + "router_z_loss_clip": 2.71289062, + "router_z_loss_mlp": 0.35473633, + "step": 7415, + "time_per_iteration": 4.043541669845581 + }, + { + "auxiliary_loss_clip": 0.01471152, + "auxiliary_loss_mlp": 0.00322041, + "balance_loss_clip": 1.20430565, + "balance_loss_mlp": 0.28789955, + "epoch": 0.4458740417856606, + "flos": 21762225895680.0, + "grad_norm": 1424.2173673358109, + "language_loss": 0.90007806, + "learning_rate": 2.4439569503325732e-06, + "loss": 0.91801, + "num_input_tokens_seen": 158996725, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.34155273, + "step": 7416, + "time_per_iteration": 2.6401963233947754 + }, + { + "auxiliary_loss_clip": 0.01486639, + "auxiliary_loss_mlp": 0.00297241, + "balance_loss_clip": 1.21649837, + "balance_loss_mlp": 0.2631467, + "epoch": 0.44593416503832856, + "flos": 21068790860160.0, + "grad_norm": 9.741006924283475, + "language_loss": 0.86838007, + "learning_rate": 2.4435771966950706e-06, + "loss": 0.88621891, + "num_input_tokens_seen": 159017255, + "router_z_loss_clip": 2.70117188, + "router_z_loss_mlp": 0.34106445, + "step": 7417, + "time_per_iteration": 2.67580509185791 + }, + { + "auxiliary_loss_clip": 0.01478979, + "auxiliary_loss_mlp": 0.00300563, + "balance_loss_clip": 1.21126294, + "balance_loss_mlp": 0.26570579, + "epoch": 0.4459942882909965, + "flos": 22600488568320.0, + "grad_norm": 489.7816659726153, + "language_loss": 0.88014054, + "learning_rate": 2.443197426237077e-06, + "loss": 0.89793599, + "num_input_tokens_seen": 159035010, + "router_z_loss_clip": 2.67578125, + "router_z_loss_mlp": 0.34863281, + "step": 7418, + "time_per_iteration": 2.6474437713623047 + }, + { + "auxiliary_loss_clip": 0.01463523, + "auxiliary_loss_mlp": 0.00281815, + "balance_loss_clip": 1.19676268, + "balance_loss_mlp": 0.24877053, + "epoch": 0.4460544115436645, + "flos": 26505486622080.0, + "grad_norm": 24.57223788297306, + "language_loss": 0.83319336, + "learning_rate": 2.442817638972991e-06, + "loss": 0.85064679, + "num_input_tokens_seen": 159055345, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.33056641, + "step": 7419, + "time_per_iteration": 2.6933679580688477 + }, + { + "auxiliary_loss_clip": 0.01476646, + "auxiliary_loss_mlp": 0.00291642, + "balance_loss_clip": 1.21332252, + "balance_loss_mlp": 0.25890678, + "epoch": 0.4461145347963325, + "flos": 17604021893760.0, + "grad_norm": 1016.6495534940369, + "language_loss": 0.79712129, + "learning_rate": 2.4424378349172176e-06, + "loss": 0.8148042, + "num_input_tokens_seen": 159074225, + "router_z_loss_clip": 2.63671875, + "router_z_loss_mlp": 0.32714844, + "step": 7420, + "time_per_iteration": 2.656338691711426 + }, + { + "auxiliary_loss_clip": 0.01464258, + "auxiliary_loss_mlp": 0.00292172, + "balance_loss_clip": 1.20570576, + "balance_loss_mlp": 0.25993797, + "epoch": 0.44617465804900047, + "flos": 27268193036160.0, + "grad_norm": 97.22186273074672, + "language_loss": 0.80467856, + "learning_rate": 2.442058014084156e-06, + "loss": 0.82224286, + "num_input_tokens_seen": 159095415, + "router_z_loss_clip": 2.5859375, + "router_z_loss_mlp": 0.32250977, + "step": 7421, + "time_per_iteration": 2.7221546173095703 + }, + { + "auxiliary_loss_clip": 0.0146213, + "auxiliary_loss_mlp": 0.00289005, + "balance_loss_clip": 1.20213056, + "balance_loss_mlp": 0.25514966, + "epoch": 0.44623478130166844, + "flos": 17786412178560.0, + "grad_norm": 7.559844451373558, + "language_loss": 0.82265633, + "learning_rate": 2.44167817648821e-06, + "loss": 0.8401677, + "num_input_tokens_seen": 159114615, + "router_z_loss_clip": 2.59960938, + "router_z_loss_mlp": 0.33862305, + "step": 7422, + "time_per_iteration": 2.740950107574463 + }, + { + "auxiliary_loss_clip": 0.01473291, + "auxiliary_loss_mlp": 0.00336373, + "balance_loss_clip": 1.21033967, + "balance_loss_mlp": 0.30192155, + "epoch": 0.4462949045543364, + "flos": 23003011353600.0, + "grad_norm": 13.341818276430095, + "language_loss": 0.71352625, + "learning_rate": 2.441298322143784e-06, + "loss": 0.73162293, + "num_input_tokens_seen": 159134370, + "router_z_loss_clip": 2.63085938, + "router_z_loss_mlp": 0.34423828, + "step": 7423, + "time_per_iteration": 2.679807424545288 + }, + { + "auxiliary_loss_clip": 0.01477897, + "auxiliary_loss_mlp": 0.00283502, + "balance_loss_clip": 1.21842194, + "balance_loss_mlp": 0.251196, + "epoch": 0.44635502780700437, + "flos": 17820096157440.0, + "grad_norm": 10.45270643034853, + "language_loss": 0.86103964, + "learning_rate": 2.4409184510652807e-06, + "loss": 0.87865365, + "num_input_tokens_seen": 159152540, + "router_z_loss_clip": 2.59570312, + "router_z_loss_mlp": 0.32324219, + "step": 7424, + "time_per_iteration": 2.6748740673065186 + }, + { + "auxiliary_loss_clip": 0.01469351, + "auxiliary_loss_mlp": 0.00264821, + "balance_loss_clip": 1.21004069, + "balance_loss_mlp": 0.23225351, + "epoch": 0.44641515105967233, + "flos": 26688020561280.0, + "grad_norm": 44.161158694753276, + "language_loss": 0.8496142, + "learning_rate": 2.4405385632671063e-06, + "loss": 0.86695588, + "num_input_tokens_seen": 159173425, + "router_z_loss_clip": 2.59570312, + "router_z_loss_mlp": 0.32543945, + "step": 7425, + "time_per_iteration": 2.6980209350585938 + }, + { + "auxiliary_loss_clip": 0.01482623, + "auxiliary_loss_mlp": 0.00300455, + "balance_loss_clip": 1.22046971, + "balance_loss_mlp": 0.26762491, + "epoch": 0.4464752743123403, + "flos": 18913324544640.0, + "grad_norm": 6.124757451645669, + "language_loss": 0.83841813, + "learning_rate": 2.4401586587636655e-06, + "loss": 0.85624892, + "num_input_tokens_seen": 159191210, + "router_z_loss_clip": 2.62109375, + "router_z_loss_mlp": 0.328125, + "step": 7426, + "time_per_iteration": 2.6671652793884277 + }, + { + "auxiliary_loss_clip": 0.01467181, + "auxiliary_loss_mlp": 0.00301377, + "balance_loss_clip": 1.20143282, + "balance_loss_mlp": 0.26718742, + "epoch": 0.44653539756500826, + "flos": 29570318582400.0, + "grad_norm": 4.986109324702479, + "language_loss": 0.72466481, + "learning_rate": 2.4397787375693634e-06, + "loss": 0.7423504, + "num_input_tokens_seen": 159211755, + "router_z_loss_clip": 2.65625, + "router_z_loss_mlp": 0.34228516, + "step": 7427, + "time_per_iteration": 2.7839462757110596 + }, + { + "auxiliary_loss_clip": 0.01485677, + "auxiliary_loss_mlp": 0.00280261, + "balance_loss_clip": 1.21926713, + "balance_loss_mlp": 0.2485995, + "epoch": 0.44659552081767623, + "flos": 21468979261440.0, + "grad_norm": 15.569917997483254, + "language_loss": 0.82604766, + "learning_rate": 2.439398799698608e-06, + "loss": 0.84370697, + "num_input_tokens_seen": 159230315, + "router_z_loss_clip": 2.66210938, + "router_z_loss_mlp": 0.31689453, + "step": 7428, + "time_per_iteration": 2.6447269916534424 + }, + { + "auxiliary_loss_clip": 0.01470636, + "auxiliary_loss_mlp": 0.00275336, + "balance_loss_clip": 1.20658839, + "balance_loss_mlp": 0.2432927, + "epoch": 0.4466556440703442, + "flos": 17931886260480.0, + "grad_norm": 6.122190114463814, + "language_loss": 0.84697551, + "learning_rate": 2.439018845165806e-06, + "loss": 0.8644352, + "num_input_tokens_seen": 159249810, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.32055664, + "step": 7429, + "time_per_iteration": 2.647575855255127 + }, + { + "auxiliary_loss_clip": 0.01482255, + "auxiliary_loss_mlp": 0.00298919, + "balance_loss_clip": 1.21587789, + "balance_loss_mlp": 0.26656556, + "epoch": 0.44671576732301216, + "flos": 21107430915840.0, + "grad_norm": 88.82640156945236, + "language_loss": 0.96352601, + "learning_rate": 2.438638873985366e-06, + "loss": 0.98133773, + "num_input_tokens_seen": 159271715, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.32348633, + "step": 7430, + "time_per_iteration": 2.6819207668304443 + }, + { + "auxiliary_loss_clip": 0.01480554, + "auxiliary_loss_mlp": 0.00304873, + "balance_loss_clip": 1.2056936, + "balance_loss_mlp": 0.27003992, + "epoch": 0.4467758905756801, + "flos": 23508920459520.0, + "grad_norm": 16.725276519696877, + "language_loss": 0.86794865, + "learning_rate": 2.4382588861716954e-06, + "loss": 0.88580292, + "num_input_tokens_seen": 159290690, + "router_z_loss_clip": 2.75195312, + "router_z_loss_mlp": 0.34814453, + "step": 7431, + "time_per_iteration": 2.7003397941589355 + }, + { + "auxiliary_loss_clip": 0.01470638, + "auxiliary_loss_mlp": 0.00320046, + "balance_loss_clip": 1.20309615, + "balance_loss_mlp": 0.28671461, + "epoch": 0.4468360138283481, + "flos": 18734022829440.0, + "grad_norm": 38.08134803682482, + "language_loss": 0.8815254, + "learning_rate": 2.437878881739204e-06, + "loss": 0.89943218, + "num_input_tokens_seen": 159309400, + "router_z_loss_clip": 2.67773438, + "router_z_loss_mlp": 0.33325195, + "step": 7432, + "time_per_iteration": 2.6457695960998535 + }, + { + "auxiliary_loss_clip": 0.01478511, + "auxiliary_loss_mlp": 0.00338146, + "balance_loss_clip": 1.20631719, + "balance_loss_mlp": 0.30402794, + "epoch": 0.4468961370810161, + "flos": 23477139901440.0, + "grad_norm": 72.9796394552312, + "language_loss": 0.83264697, + "learning_rate": 2.437498860702301e-06, + "loss": 0.85081351, + "num_input_tokens_seen": 159327425, + "router_z_loss_clip": 2.72265625, + "router_z_loss_mlp": 0.34106445, + "step": 7433, + "time_per_iteration": 2.685288429260254 + }, + { + "auxiliary_loss_clip": 0.01454655, + "auxiliary_loss_mlp": 0.00305147, + "balance_loss_clip": 1.19464946, + "balance_loss_mlp": 0.27361631, + "epoch": 0.4469562603336841, + "flos": 30075042539520.0, + "grad_norm": 19.36407299560044, + "language_loss": 0.81925935, + "learning_rate": 2.437118823075398e-06, + "loss": 0.83685738, + "num_input_tokens_seen": 159345805, + "router_z_loss_clip": 2.59960938, + "router_z_loss_mlp": 0.31530762, + "step": 7434, + "time_per_iteration": 2.7955877780914307 + }, + { + "auxiliary_loss_clip": 0.01486899, + "auxiliary_loss_mlp": 0.00307303, + "balance_loss_clip": 1.2166779, + "balance_loss_mlp": 0.27535483, + "epoch": 0.44701638358635204, + "flos": 22456415116800.0, + "grad_norm": 19.164467893325792, + "language_loss": 0.70754939, + "learning_rate": 2.436738768872905e-06, + "loss": 0.7254914, + "num_input_tokens_seen": 159364595, + "router_z_loss_clip": 2.70117188, + "router_z_loss_mlp": 0.31933594, + "step": 7435, + "time_per_iteration": 2.760272741317749 + }, + { + "auxiliary_loss_clip": 0.01481342, + "auxiliary_loss_mlp": 0.00329247, + "balance_loss_clip": 1.2162807, + "balance_loss_mlp": 0.29369858, + "epoch": 0.44707650683902, + "flos": 24057851080320.0, + "grad_norm": 7.724560927777531, + "language_loss": 0.88970578, + "learning_rate": 2.4363586981092346e-06, + "loss": 0.90781164, + "num_input_tokens_seen": 159385265, + "router_z_loss_clip": 2.65039062, + "router_z_loss_mlp": 0.35510254, + "step": 7436, + "time_per_iteration": 2.7431564331054688 + }, + { + "auxiliary_loss_clip": 0.01478467, + "auxiliary_loss_mlp": 0.00330418, + "balance_loss_clip": 1.21021736, + "balance_loss_mlp": 0.29639524, + "epoch": 0.44713663009168797, + "flos": 23766938830080.0, + "grad_norm": 3.394299363880444, + "language_loss": 0.85253584, + "learning_rate": 2.435978610798798e-06, + "loss": 0.87062466, + "num_input_tokens_seen": 159405080, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.34057617, + "step": 7437, + "time_per_iteration": 2.6747028827667236 + }, + { + "auxiliary_loss_clip": 0.01448289, + "auxiliary_loss_mlp": 0.00332243, + "balance_loss_clip": 1.18335271, + "balance_loss_mlp": 0.29783887, + "epoch": 0.44719675334435594, + "flos": 24499265316480.0, + "grad_norm": 3.5628081820448956, + "language_loss": 0.79214978, + "learning_rate": 2.435598506956009e-06, + "loss": 0.80995506, + "num_input_tokens_seen": 159424595, + "router_z_loss_clip": 2.65039062, + "router_z_loss_mlp": 0.34399414, + "step": 7438, + "time_per_iteration": 2.703549385070801 + }, + { + "auxiliary_loss_clip": 0.01473098, + "auxiliary_loss_mlp": 0.00350737, + "balance_loss_clip": 1.20341074, + "balance_loss_mlp": 0.31340015, + "epoch": 0.4472568765970239, + "flos": 29781759991680.0, + "grad_norm": 59.52281381311678, + "language_loss": 0.7316069, + "learning_rate": 2.4352183865952808e-06, + "loss": 0.74984521, + "num_input_tokens_seen": 159443865, + "router_z_loss_clip": 2.6953125, + "router_z_loss_mlp": 0.37402344, + "step": 7439, + "time_per_iteration": 2.7008261680603027 + }, + { + "auxiliary_loss_clip": 0.01470714, + "auxiliary_loss_mlp": 0.00333041, + "balance_loss_clip": 1.20484662, + "balance_loss_mlp": 0.29758775, + "epoch": 0.44731699984969187, + "flos": 24643123286400.0, + "grad_norm": 30.238184673007975, + "language_loss": 0.80663764, + "learning_rate": 2.4348382497310285e-06, + "loss": 0.8246752, + "num_input_tokens_seen": 159464525, + "router_z_loss_clip": 2.65820312, + "router_z_loss_mlp": 0.35473633, + "step": 7440, + "time_per_iteration": 2.731043577194214 + }, + { + "auxiliary_loss_clip": 0.01459904, + "auxiliary_loss_mlp": 0.00344911, + "balance_loss_clip": 1.19654751, + "balance_loss_mlp": 0.31234285, + "epoch": 0.44737712310235983, + "flos": 29455691304960.0, + "grad_norm": 58.29658356100729, + "language_loss": 0.78632855, + "learning_rate": 2.4344580963776655e-06, + "loss": 0.80437672, + "num_input_tokens_seen": 159486385, + "router_z_loss_clip": 2.63476562, + "router_z_loss_mlp": 0.32592773, + "step": 7441, + "time_per_iteration": 2.7037012577056885 + }, + { + "auxiliary_loss_clip": 0.01480856, + "auxiliary_loss_mlp": 0.00341674, + "balance_loss_clip": 1.21195424, + "balance_loss_mlp": 0.30684125, + "epoch": 0.4474372463550278, + "flos": 24896832024960.0, + "grad_norm": 16.087457795837025, + "language_loss": 0.81776762, + "learning_rate": 2.4340779265496082e-06, + "loss": 0.83599293, + "num_input_tokens_seen": 159503880, + "router_z_loss_clip": 2.69140625, + "router_z_loss_mlp": 0.34863281, + "step": 7442, + "time_per_iteration": 2.6757938861846924 + }, + { + "auxiliary_loss_clip": 0.01471521, + "auxiliary_loss_mlp": 0.00362269, + "balance_loss_clip": 1.19427586, + "balance_loss_mlp": 0.32300138, + "epoch": 0.44749736960769576, + "flos": 33181603125120.0, + "grad_norm": 10.077058138133498, + "language_loss": 0.82652885, + "learning_rate": 2.433697740261273e-06, + "loss": 0.84486675, + "num_input_tokens_seen": 159522980, + "router_z_loss_clip": 2.76953125, + "router_z_loss_mlp": 0.39233398, + "step": 7443, + "time_per_iteration": 2.7371933460235596 + }, + { + "auxiliary_loss_clip": 0.01460673, + "auxiliary_loss_mlp": 0.00366833, + "balance_loss_clip": 1.1953696, + "balance_loss_mlp": 0.32851934, + "epoch": 0.4475574928603637, + "flos": 21071807602560.0, + "grad_norm": 6.280644856268834, + "language_loss": 0.83550978, + "learning_rate": 2.4333175375270748e-06, + "loss": 0.8537848, + "num_input_tokens_seen": 159543340, + "router_z_loss_clip": 2.65234375, + "router_z_loss_mlp": 0.38256836, + "step": 7444, + "time_per_iteration": 2.7022743225097656 + }, + { + "auxiliary_loss_clip": 0.01461365, + "auxiliary_loss_mlp": 0.00373384, + "balance_loss_clip": 1.19513321, + "balance_loss_mlp": 0.33839637, + "epoch": 0.4476176161130317, + "flos": 21862523646720.0, + "grad_norm": 5.264781221726737, + "language_loss": 0.93751848, + "learning_rate": 2.4329373183614333e-06, + "loss": 0.95586598, + "num_input_tokens_seen": 159558210, + "router_z_loss_clip": 2.66015625, + "router_z_loss_mlp": 0.34997559, + "step": 7445, + "time_per_iteration": 2.9154083728790283 + }, + { + "auxiliary_loss_clip": 0.01477246, + "auxiliary_loss_mlp": 0.00358149, + "balance_loss_clip": 1.19934797, + "balance_loss_mlp": 0.32124197, + "epoch": 0.4476777393656997, + "flos": 22528667324160.0, + "grad_norm": 96.31092930432146, + "language_loss": 0.70201838, + "learning_rate": 2.432557082778765e-06, + "loss": 0.72037232, + "num_input_tokens_seen": 159577920, + "router_z_loss_clip": 2.77539062, + "router_z_loss_mlp": 0.36889648, + "step": 7446, + "time_per_iteration": 2.807722330093384 + }, + { + "auxiliary_loss_clip": 0.01312838, + "auxiliary_loss_mlp": 0.00103539, + "balance_loss_clip": 1.16209769, + "balance_loss_mlp": 0.09600464, + "epoch": 0.4477378626183677, + "flos": 49017133877760.0, + "grad_norm": 0.7381774216104406, + "language_loss": 0.49614519, + "learning_rate": 2.4321768307934884e-06, + "loss": 0.51030898, + "num_input_tokens_seen": 159632295, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.07519531, + "step": 7447, + "time_per_iteration": 4.464433670043945 + }, + { + "auxiliary_loss_clip": 0.01311884, + "auxiliary_loss_mlp": 0.00100424, + "balance_loss_clip": 1.16093194, + "balance_loss_mlp": 0.09293745, + "epoch": 0.44779798587103564, + "flos": 56542179392640.0, + "grad_norm": 0.7548482771092092, + "language_loss": 0.59052163, + "learning_rate": 2.4317965624200235e-06, + "loss": 0.60464472, + "num_input_tokens_seen": 159698435, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.07470703, + "step": 7448, + "time_per_iteration": 3.182490587234497 + }, + { + "auxiliary_loss_clip": 0.01461229, + "auxiliary_loss_mlp": 0.00381195, + "balance_loss_clip": 1.19305682, + "balance_loss_mlp": 0.34419209, + "epoch": 0.4478581091237036, + "flos": 46498536040320.0, + "grad_norm": 20.55464656896347, + "language_loss": 0.63888967, + "learning_rate": 2.431416277672789e-06, + "loss": 0.65731388, + "num_input_tokens_seen": 159722150, + "router_z_loss_clip": 2.68164062, + "router_z_loss_mlp": 0.37011719, + "step": 7449, + "time_per_iteration": 2.9274513721466064 + }, + { + "auxiliary_loss_clip": 0.01460766, + "auxiliary_loss_mlp": 0.00358691, + "balance_loss_clip": 1.19399107, + "balance_loss_mlp": 0.32464465, + "epoch": 0.4479182323763716, + "flos": 20814363849600.0, + "grad_norm": 97.99762856236228, + "language_loss": 0.8615579, + "learning_rate": 2.4310359765662065e-06, + "loss": 0.8797524, + "num_input_tokens_seen": 159740550, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.33984375, + "step": 7450, + "time_per_iteration": 4.125848770141602 + }, + { + "auxiliary_loss_clip": 0.01460012, + "auxiliary_loss_mlp": 0.00361934, + "balance_loss_clip": 1.1946311, + "balance_loss_mlp": 0.32834107, + "epoch": 0.44797835562903954, + "flos": 14245979212800.0, + "grad_norm": 50.41735296176141, + "language_loss": 0.86416191, + "learning_rate": 2.430655659114697e-06, + "loss": 0.88238132, + "num_input_tokens_seen": 159758245, + "router_z_loss_clip": 2.65039062, + "router_z_loss_mlp": 0.3359375, + "step": 7451, + "time_per_iteration": 4.1474199295043945 + }, + { + "auxiliary_loss_clip": 0.01296531, + "auxiliary_loss_mlp": 0.00085469, + "balance_loss_clip": 1.14349639, + "balance_loss_mlp": 0.07741013, + "epoch": 0.4480384788817075, + "flos": 63534560169600.0, + "grad_norm": 0.7874529869222724, + "language_loss": 0.62678277, + "learning_rate": 2.430275325332681e-06, + "loss": 0.64060277, + "num_input_tokens_seen": 159826790, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.08056641, + "step": 7452, + "time_per_iteration": 3.1976380348205566 + }, + { + "auxiliary_loss_clip": 0.01461244, + "auxiliary_loss_mlp": 0.0035897, + "balance_loss_clip": 1.19445682, + "balance_loss_mlp": 0.32122791, + "epoch": 0.44809860213437547, + "flos": 21652626522240.0, + "grad_norm": 49.82353570708272, + "language_loss": 0.69614166, + "learning_rate": 2.429894975234582e-06, + "loss": 0.71434385, + "num_input_tokens_seen": 159845805, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.37744141, + "step": 7453, + "time_per_iteration": 2.6838250160217285 + }, + { + "auxiliary_loss_clip": 0.01285948, + "auxiliary_loss_mlp": 0.00083153, + "balance_loss_clip": 1.13753915, + "balance_loss_mlp": 0.07557151, + "epoch": 0.44815872538704343, + "flos": 69190634246400.0, + "grad_norm": 0.7455263833094983, + "language_loss": 0.56545293, + "learning_rate": 2.4295146088348224e-06, + "loss": 0.57914388, + "num_input_tokens_seen": 159898860, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.07568359, + "step": 7454, + "time_per_iteration": 3.038912773132324 + }, + { + "auxiliary_loss_clip": 0.0147079, + "auxiliary_loss_mlp": 0.00391457, + "balance_loss_clip": 1.20340157, + "balance_loss_mlp": 0.35629028, + "epoch": 0.4482188486397114, + "flos": 12598289510400.0, + "grad_norm": 17.569077880092745, + "language_loss": 0.83320236, + "learning_rate": 2.4291342261478255e-06, + "loss": 0.85182488, + "num_input_tokens_seen": 159911555, + "router_z_loss_clip": 2.6796875, + "router_z_loss_mlp": 0.3515625, + "step": 7455, + "time_per_iteration": 2.6444125175476074 + }, + { + "auxiliary_loss_clip": 0.01471203, + "auxiliary_loss_mlp": 0.00413167, + "balance_loss_clip": 1.20556402, + "balance_loss_mlp": 0.37754756, + "epoch": 0.44827897189237936, + "flos": 34058182631040.0, + "grad_norm": 6.073098752184318, + "language_loss": 0.81782562, + "learning_rate": 2.428753827188016e-06, + "loss": 0.83666933, + "num_input_tokens_seen": 159931470, + "router_z_loss_clip": 2.65625, + "router_z_loss_mlp": 0.35620117, + "step": 7456, + "time_per_iteration": 2.82081937789917 + }, + { + "auxiliary_loss_clip": 0.01470682, + "auxiliary_loss_mlp": 0.00371629, + "balance_loss_clip": 1.20887876, + "balance_loss_mlp": 0.33643857, + "epoch": 0.44833909514504733, + "flos": 25147416280320.0, + "grad_norm": 203.7911659090859, + "language_loss": 0.8230325, + "learning_rate": 2.428373411969818e-06, + "loss": 0.84145558, + "num_input_tokens_seen": 159946115, + "router_z_loss_clip": 2.62109375, + "router_z_loss_mlp": 0.35180664, + "step": 7457, + "time_per_iteration": 4.0529937744140625 + }, + { + "auxiliary_loss_clip": 0.01483048, + "auxiliary_loss_mlp": 0.00378061, + "balance_loss_clip": 1.21153426, + "balance_loss_mlp": 0.34065294, + "epoch": 0.4483992183977153, + "flos": 16179984224640.0, + "grad_norm": 29.502583909489754, + "language_loss": 0.74985576, + "learning_rate": 2.4279929805076576e-06, + "loss": 0.76846689, + "num_input_tokens_seen": 159963915, + "router_z_loss_clip": 2.71679688, + "router_z_loss_mlp": 0.37426758, + "step": 7458, + "time_per_iteration": 2.598811388015747 + }, + { + "auxiliary_loss_clip": 0.01466666, + "auxiliary_loss_mlp": 0.00371207, + "balance_loss_clip": 1.19592643, + "balance_loss_mlp": 0.3332265, + "epoch": 0.44845934165038326, + "flos": 17746048270080.0, + "grad_norm": 60.16075183345928, + "language_loss": 0.77445912, + "learning_rate": 2.427612532815961e-06, + "loss": 0.7928378, + "num_input_tokens_seen": 159982140, + "router_z_loss_clip": 2.70507812, + "router_z_loss_mlp": 0.37939453, + "step": 7459, + "time_per_iteration": 2.6401870250701904 + }, + { + "auxiliary_loss_clip": 0.01465556, + "auxiliary_loss_mlp": 0.0038724, + "balance_loss_clip": 1.20160508, + "balance_loss_mlp": 0.3524074, + "epoch": 0.4485194649030513, + "flos": 21835914647040.0, + "grad_norm": 4.7532873951715855, + "language_loss": 0.76829302, + "learning_rate": 2.427232068909154e-06, + "loss": 0.78682101, + "num_input_tokens_seen": 160002280, + "router_z_loss_clip": 2.63671875, + "router_z_loss_mlp": 0.34814453, + "step": 7460, + "time_per_iteration": 2.67160964012146 + }, + { + "auxiliary_loss_clip": 0.01476462, + "auxiliary_loss_mlp": 0.00378163, + "balance_loss_clip": 1.21043491, + "balance_loss_mlp": 0.34132779, + "epoch": 0.44857958815571924, + "flos": 20084515401600.0, + "grad_norm": 5.025969382765271, + "language_loss": 0.84554195, + "learning_rate": 2.4268515888016635e-06, + "loss": 0.86408824, + "num_input_tokens_seen": 160020260, + "router_z_loss_clip": 2.66015625, + "router_z_loss_mlp": 0.3684082, + "step": 7461, + "time_per_iteration": 2.6317391395568848 + }, + { + "auxiliary_loss_clip": 0.01488663, + "auxiliary_loss_mlp": 0.00380186, + "balance_loss_clip": 1.22012997, + "balance_loss_mlp": 0.34563917, + "epoch": 0.4486397114083872, + "flos": 27053519402880.0, + "grad_norm": 13.831304449131821, + "language_loss": 0.760234, + "learning_rate": 2.4264710925079184e-06, + "loss": 0.77892244, + "num_input_tokens_seen": 160040240, + "router_z_loss_clip": 2.68164062, + "router_z_loss_mlp": 0.34545898, + "step": 7462, + "time_per_iteration": 2.6861188411712646 + }, + { + "auxiliary_loss_clip": 0.01287849, + "auxiliary_loss_mlp": 0.00072211, + "balance_loss_clip": 1.14508581, + "balance_loss_mlp": 0.06613083, + "epoch": 0.4486998346610552, + "flos": 67321195931520.0, + "grad_norm": 130.90990620902886, + "language_loss": 0.54425406, + "learning_rate": 2.4260905800423462e-06, + "loss": 0.55785465, + "num_input_tokens_seen": 160093865, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.06079102, + "step": 7463, + "time_per_iteration": 3.173318386077881 + }, + { + "auxiliary_loss_clip": 0.01462789, + "auxiliary_loss_mlp": 0.0036211, + "balance_loss_clip": 1.2021699, + "balance_loss_mlp": 0.32734907, + "epoch": 0.44875995791372314, + "flos": 27636816360960.0, + "grad_norm": 5.157265468428393, + "language_loss": 0.83557308, + "learning_rate": 2.4257100514193775e-06, + "loss": 0.85382199, + "num_input_tokens_seen": 160113590, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.34741211, + "step": 7464, + "time_per_iteration": 2.724362850189209 + }, + { + "auxiliary_loss_clip": 0.01482427, + "auxiliary_loss_mlp": 0.00404388, + "balance_loss_clip": 1.2191478, + "balance_loss_mlp": 0.36869708, + "epoch": 0.4488200811663911, + "flos": 13005947940480.0, + "grad_norm": 247.59052934626763, + "language_loss": 0.82255793, + "learning_rate": 2.425329506653441e-06, + "loss": 0.84142613, + "num_input_tokens_seen": 160131795, + "router_z_loss_clip": 2.6328125, + "router_z_loss_mlp": 0.35693359, + "step": 7465, + "time_per_iteration": 2.675835132598877 + }, + { + "auxiliary_loss_clip": 0.01487134, + "auxiliary_loss_mlp": 0.00372984, + "balance_loss_clip": 1.21570563, + "balance_loss_mlp": 0.33619612, + "epoch": 0.44888020441905907, + "flos": 27489977562240.0, + "grad_norm": 84.13552375343625, + "language_loss": 0.87205803, + "learning_rate": 2.424948945758966e-06, + "loss": 0.89065921, + "num_input_tokens_seen": 160150635, + "router_z_loss_clip": 2.71289062, + "router_z_loss_mlp": 0.36791992, + "step": 7466, + "time_per_iteration": 2.6737465858459473 + }, + { + "auxiliary_loss_clip": 0.0149113, + "auxiliary_loss_mlp": 0.00411267, + "balance_loss_clip": 1.22329164, + "balance_loss_mlp": 0.37321538, + "epoch": 0.44894032767172704, + "flos": 18259678800000.0, + "grad_norm": 3.1547837413849975, + "language_loss": 0.88896871, + "learning_rate": 2.4245683687503844e-06, + "loss": 0.90799272, + "num_input_tokens_seen": 160168615, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.38037109, + "step": 7467, + "time_per_iteration": 2.6458473205566406 + }, + { + "auxiliary_loss_clip": 0.0147808, + "auxiliary_loss_mlp": 0.00362272, + "balance_loss_clip": 1.21832037, + "balance_loss_mlp": 0.32791549, + "epoch": 0.449000450924395, + "flos": 21579835610880.0, + "grad_norm": 2.396717090190205, + "language_loss": 0.80675042, + "learning_rate": 2.424187775642129e-06, + "loss": 0.82515395, + "num_input_tokens_seen": 160187295, + "router_z_loss_clip": 2.59570312, + "router_z_loss_mlp": 0.34350586, + "step": 7468, + "time_per_iteration": 2.653432846069336 + }, + { + "auxiliary_loss_clip": 0.01473165, + "auxiliary_loss_mlp": 0.00388037, + "balance_loss_clip": 1.2128818, + "balance_loss_mlp": 0.35246503, + "epoch": 0.44906057417706297, + "flos": 17967904623360.0, + "grad_norm": 2.046762795399905, + "language_loss": 0.75616097, + "learning_rate": 2.4238071664486297e-06, + "loss": 0.774773, + "num_input_tokens_seen": 160205115, + "router_z_loss_clip": 2.60546875, + "router_z_loss_mlp": 0.35571289, + "step": 7469, + "time_per_iteration": 2.6947221755981445 + }, + { + "auxiliary_loss_clip": 0.01490235, + "auxiliary_loss_mlp": 0.00434977, + "balance_loss_clip": 1.22095859, + "balance_loss_mlp": 0.39702114, + "epoch": 0.44912069742973093, + "flos": 20047347803520.0, + "grad_norm": 94.97121555276874, + "language_loss": 0.79709053, + "learning_rate": 2.4234265411843203e-06, + "loss": 0.81634265, + "num_input_tokens_seen": 160222580, + "router_z_loss_clip": 2.69140625, + "router_z_loss_mlp": 0.37963867, + "step": 7470, + "time_per_iteration": 2.662874460220337 + }, + { + "auxiliary_loss_clip": 0.01497089, + "auxiliary_loss_mlp": 0.00428762, + "balance_loss_clip": 1.22967303, + "balance_loss_mlp": 0.39021, + "epoch": 0.4491808206823989, + "flos": 21033526682880.0, + "grad_norm": 19.25646427737763, + "language_loss": 0.82551146, + "learning_rate": 2.423045899863634e-06, + "loss": 0.84476995, + "num_input_tokens_seen": 160241520, + "router_z_loss_clip": 2.67578125, + "router_z_loss_mlp": 0.38549805, + "step": 7471, + "time_per_iteration": 2.755434513092041 + }, + { + "auxiliary_loss_clip": 0.01481205, + "auxiliary_loss_mlp": 0.00387431, + "balance_loss_clip": 1.21649706, + "balance_loss_mlp": 0.35293192, + "epoch": 0.44924094393506686, + "flos": 22967136645120.0, + "grad_norm": 24.428145653159667, + "language_loss": 0.79806006, + "learning_rate": 2.4226652425010048e-06, + "loss": 0.81674635, + "num_input_tokens_seen": 160261815, + "router_z_loss_clip": 2.6484375, + "router_z_loss_mlp": 0.34472656, + "step": 7472, + "time_per_iteration": 2.72841477394104 + }, + { + "auxiliary_loss_clip": 0.01312539, + "auxiliary_loss_mlp": 0.00064419, + "balance_loss_clip": 1.15586293, + "balance_loss_mlp": 0.05817212, + "epoch": 0.4493010671877349, + "flos": 59233467864960.0, + "grad_norm": 0.7462234388992085, + "language_loss": 0.61240864, + "learning_rate": 2.4222845691108676e-06, + "loss": 0.62617826, + "num_input_tokens_seen": 160317070, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.06225586, + "step": 7473, + "time_per_iteration": 3.1195058822631836 + }, + { + "auxiliary_loss_clip": 0.01467029, + "auxiliary_loss_mlp": 0.00374926, + "balance_loss_clip": 1.2031579, + "balance_loss_mlp": 0.33894837, + "epoch": 0.44936119044040285, + "flos": 18004892653440.0, + "grad_norm": 11.474318667643908, + "language_loss": 0.83699203, + "learning_rate": 2.421903879707657e-06, + "loss": 0.85541153, + "num_input_tokens_seen": 160334980, + "router_z_loss_clip": 2.63476562, + "router_z_loss_mlp": 0.35986328, + "step": 7474, + "time_per_iteration": 2.694025993347168 + }, + { + "auxiliary_loss_clip": 0.01457879, + "auxiliary_loss_mlp": 0.00400177, + "balance_loss_clip": 1.2038492, + "balance_loss_mlp": 0.36236334, + "epoch": 0.4494213136930708, + "flos": 21251827589760.0, + "grad_norm": 16.81968240610641, + "language_loss": 0.7809236, + "learning_rate": 2.4215231743058086e-06, + "loss": 0.7995041, + "num_input_tokens_seen": 160354500, + "router_z_loss_clip": 2.54296875, + "router_z_loss_mlp": 0.37817383, + "step": 7475, + "time_per_iteration": 2.6869421005249023 + }, + { + "auxiliary_loss_clip": 0.01478831, + "auxiliary_loss_mlp": 0.00409957, + "balance_loss_clip": 1.21833873, + "balance_loss_mlp": 0.37243032, + "epoch": 0.4494814369457388, + "flos": 27418695022080.0, + "grad_norm": 3.4462419034633984, + "language_loss": 0.81627828, + "learning_rate": 2.4211424529197594e-06, + "loss": 0.83516622, + "num_input_tokens_seen": 160373650, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.37548828, + "step": 7476, + "time_per_iteration": 2.7476422786712646 + }, + { + "auxiliary_loss_clip": 0.01489992, + "auxiliary_loss_mlp": 0.00411073, + "balance_loss_clip": 1.2232002, + "balance_loss_mlp": 0.37204385, + "epoch": 0.44954156019840674, + "flos": 22854053652480.0, + "grad_norm": 4.067284329814961, + "language_loss": 0.78345358, + "learning_rate": 2.4207617155639464e-06, + "loss": 0.80246425, + "num_input_tokens_seen": 160393430, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.390625, + "step": 7477, + "time_per_iteration": 2.7331507205963135 + }, + { + "auxiliary_loss_clip": 0.01485658, + "auxiliary_loss_mlp": 0.00412539, + "balance_loss_clip": 1.21920848, + "balance_loss_mlp": 0.37606141, + "epoch": 0.4496016834510747, + "flos": 17201570935680.0, + "grad_norm": 7.328608796810543, + "language_loss": 0.75639987, + "learning_rate": 2.4203809622528062e-06, + "loss": 0.7753818, + "num_input_tokens_seen": 160410545, + "router_z_loss_clip": 2.66210938, + "router_z_loss_mlp": 0.36474609, + "step": 7478, + "time_per_iteration": 2.66679048538208 + }, + { + "auxiliary_loss_clip": 0.01478615, + "auxiliary_loss_mlp": 0.00411763, + "balance_loss_clip": 1.21849847, + "balance_loss_mlp": 0.37485555, + "epoch": 0.4496618067037427, + "flos": 18916628595840.0, + "grad_norm": 27.498393372806614, + "language_loss": 0.95482105, + "learning_rate": 2.420000193000779e-06, + "loss": 0.97372484, + "num_input_tokens_seen": 160428105, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.36914062, + "step": 7479, + "time_per_iteration": 2.645629644393921 + }, + { + "auxiliary_loss_clip": 0.01490819, + "auxiliary_loss_mlp": 0.00366118, + "balance_loss_clip": 1.22890806, + "balance_loss_mlp": 0.3317377, + "epoch": 0.44972192995641064, + "flos": 21031659175680.0, + "grad_norm": 2.7517930100223245, + "language_loss": 0.81954038, + "learning_rate": 2.419619407822302e-06, + "loss": 0.83810973, + "num_input_tokens_seen": 160448815, + "router_z_loss_clip": 2.6171875, + "router_z_loss_mlp": 0.34399414, + "step": 7480, + "time_per_iteration": 2.6951487064361572 + }, + { + "auxiliary_loss_clip": 0.01494175, + "auxiliary_loss_mlp": 0.00415687, + "balance_loss_clip": 1.22193503, + "balance_loss_mlp": 0.37582308, + "epoch": 0.4497820532090786, + "flos": 20777088510720.0, + "grad_norm": 38.890330236248786, + "language_loss": 0.87008381, + "learning_rate": 2.419238606731815e-06, + "loss": 0.88918245, + "num_input_tokens_seen": 160465940, + "router_z_loss_clip": 2.72070312, + "router_z_loss_mlp": 0.3984375, + "step": 7481, + "time_per_iteration": 2.6476986408233643 + }, + { + "auxiliary_loss_clip": 0.01464775, + "auxiliary_loss_mlp": 0.00408111, + "balance_loss_clip": 1.20961607, + "balance_loss_mlp": 0.37239635, + "epoch": 0.44984217646174657, + "flos": 33802606385280.0, + "grad_norm": 4.389572901827345, + "language_loss": 0.74960154, + "learning_rate": 2.418857789743758e-06, + "loss": 0.76833034, + "num_input_tokens_seen": 160486710, + "router_z_loss_clip": 2.55078125, + "router_z_loss_mlp": 0.35693359, + "step": 7482, + "time_per_iteration": 2.7309529781341553 + }, + { + "auxiliary_loss_clip": 0.01493414, + "auxiliary_loss_mlp": 0.00412186, + "balance_loss_clip": 1.2308495, + "balance_loss_mlp": 0.37511152, + "epoch": 0.44990229971441453, + "flos": 15518365660800.0, + "grad_norm": 4.3712954188511075, + "language_loss": 0.92708015, + "learning_rate": 2.418476956872571e-06, + "loss": 0.94613612, + "num_input_tokens_seen": 160503405, + "router_z_loss_clip": 2.62695312, + "router_z_loss_mlp": 0.37060547, + "step": 7483, + "time_per_iteration": 2.6495511531829834 + }, + { + "auxiliary_loss_clip": 0.01494244, + "auxiliary_loss_mlp": 0.00434111, + "balance_loss_clip": 1.22623491, + "balance_loss_mlp": 0.39582103, + "epoch": 0.4499624229670825, + "flos": 29861913191040.0, + "grad_norm": 11.454507948617767, + "language_loss": 0.87045693, + "learning_rate": 2.4180961081326967e-06, + "loss": 0.88974047, + "num_input_tokens_seen": 160525080, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.3828125, + "step": 7484, + "time_per_iteration": 2.702760934829712 + }, + { + "auxiliary_loss_clip": 0.01496906, + "auxiliary_loss_mlp": 0.00415222, + "balance_loss_clip": 1.22484136, + "balance_loss_mlp": 0.37697947, + "epoch": 0.45002254621975046, + "flos": 18513674847360.0, + "grad_norm": 20.522675581111905, + "language_loss": 0.83614498, + "learning_rate": 2.4177152435385754e-06, + "loss": 0.85526621, + "num_input_tokens_seen": 160540895, + "router_z_loss_clip": 2.72265625, + "router_z_loss_mlp": 0.38256836, + "step": 7485, + "time_per_iteration": 2.65505051612854 + }, + { + "auxiliary_loss_clip": 0.01289927, + "auxiliary_loss_mlp": 0.00103313, + "balance_loss_clip": 1.13416386, + "balance_loss_mlp": 0.095779, + "epoch": 0.4500826694724185, + "flos": 70420394229120.0, + "grad_norm": 0.7759483698122989, + "language_loss": 0.57781494, + "learning_rate": 2.4173343631046504e-06, + "loss": 0.59174734, + "num_input_tokens_seen": 160598270, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.07519531, + "step": 7486, + "time_per_iteration": 3.181135892868042 + }, + { + "auxiliary_loss_clip": 0.01499241, + "auxiliary_loss_mlp": 0.00451262, + "balance_loss_clip": 1.23894858, + "balance_loss_mlp": 0.41306704, + "epoch": 0.45014279272508645, + "flos": 15778897983360.0, + "grad_norm": 4.881530364962694, + "language_loss": 0.89302063, + "learning_rate": 2.4169534668453654e-06, + "loss": 0.91252565, + "num_input_tokens_seen": 160614720, + "router_z_loss_clip": 2.59960938, + "router_z_loss_mlp": 0.38232422, + "step": 7487, + "time_per_iteration": 2.677926540374756 + }, + { + "auxiliary_loss_clip": 0.01467769, + "auxiliary_loss_mlp": 0.00455451, + "balance_loss_clip": 1.2092036, + "balance_loss_mlp": 0.41582593, + "epoch": 0.4502029159777544, + "flos": 21799573061760.0, + "grad_norm": 53.3339332803007, + "language_loss": 0.81919253, + "learning_rate": 2.4165725547751622e-06, + "loss": 0.83842468, + "num_input_tokens_seen": 160635170, + "router_z_loss_clip": 2.5859375, + "router_z_loss_mlp": 0.39575195, + "step": 7488, + "time_per_iteration": 2.6708250045776367 + }, + { + "auxiliary_loss_clip": 0.01497966, + "auxiliary_loss_mlp": 0.00516076, + "balance_loss_clip": 1.22172642, + "balance_loss_mlp": 0.46987098, + "epoch": 0.4502630392304224, + "flos": 28767966531840.0, + "grad_norm": 8.776764810422007, + "language_loss": 0.80074447, + "learning_rate": 2.4161916269084858e-06, + "loss": 0.82088488, + "num_input_tokens_seen": 160654490, + "router_z_loss_clip": 2.765625, + "router_z_loss_mlp": 0.46191406, + "step": 7489, + "time_per_iteration": 4.215796709060669 + }, + { + "auxiliary_loss_clip": 0.0147708, + "auxiliary_loss_mlp": 0.00476846, + "balance_loss_clip": 1.21194506, + "balance_loss_mlp": 0.4347418, + "epoch": 0.45032316248309034, + "flos": 15844182952320.0, + "grad_norm": 116.77583091429432, + "language_loss": 0.76504862, + "learning_rate": 2.4158106832597817e-06, + "loss": 0.78458786, + "num_input_tokens_seen": 160669400, + "router_z_loss_clip": 2.65039062, + "router_z_loss_mlp": 0.42114258, + "step": 7490, + "time_per_iteration": 2.7430613040924072 + }, + { + "auxiliary_loss_clip": 0.01277455, + "auxiliary_loss_mlp": 0.00122553, + "balance_loss_clip": 1.11866963, + "balance_loss_mlp": 0.11497115, + "epoch": 0.4503832857357583, + "flos": 57853600945920.0, + "grad_norm": 0.7095109165091306, + "language_loss": 0.56278205, + "learning_rate": 2.415429723843495e-06, + "loss": 0.57678211, + "num_input_tokens_seen": 160733820, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.07568359, + "step": 7491, + "time_per_iteration": 3.1857144832611084 + }, + { + "auxiliary_loss_clip": 0.01470814, + "auxiliary_loss_mlp": 0.00467535, + "balance_loss_clip": 1.21131408, + "balance_loss_mlp": 0.4261691, + "epoch": 0.4504434089884263, + "flos": 23878082488320.0, + "grad_norm": 5.538800604064262, + "language_loss": 0.84404755, + "learning_rate": 2.4150487486740713e-06, + "loss": 0.86343098, + "num_input_tokens_seen": 160753175, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.4140625, + "step": 7492, + "time_per_iteration": 4.207965850830078 + }, + { + "auxiliary_loss_clip": 0.01483391, + "auxiliary_loss_mlp": 0.00529762, + "balance_loss_clip": 1.20953846, + "balance_loss_mlp": 0.48181558, + "epoch": 0.45050353224109424, + "flos": 17785083375360.0, + "grad_norm": 37.008021802070274, + "language_loss": 0.98745, + "learning_rate": 2.4146677577659573e-06, + "loss": 1.00758147, + "num_input_tokens_seen": 160768310, + "router_z_loss_clip": 2.73828125, + "router_z_loss_mlp": 0.47973633, + "step": 7493, + "time_per_iteration": 4.109467029571533 + }, + { + "auxiliary_loss_clip": 0.01277748, + "auxiliary_loss_mlp": 0.00224122, + "balance_loss_clip": 1.11543012, + "balance_loss_mlp": 0.21253526, + "epoch": 0.4505636554937622, + "flos": 65063420703360.0, + "grad_norm": 0.8023044342828962, + "language_loss": 0.62642491, + "learning_rate": 2.4142867511336e-06, + "loss": 0.64144361, + "num_input_tokens_seen": 160827370, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.11572266, + "step": 7494, + "time_per_iteration": 3.176154136657715 + }, + { + "auxiliary_loss_clip": 0.01471404, + "auxiliary_loss_mlp": 0.00427204, + "balance_loss_clip": 1.21364045, + "balance_loss_mlp": 0.38974851, + "epoch": 0.45062377874643017, + "flos": 22200084685440.0, + "grad_norm": 5.004121165894214, + "language_loss": 0.85451603, + "learning_rate": 2.4139057287914484e-06, + "loss": 0.87350214, + "num_input_tokens_seen": 160849140, + "router_z_loss_clip": 2.57617188, + "router_z_loss_mlp": 0.37475586, + "step": 7495, + "time_per_iteration": 2.666616916656494 + }, + { + "auxiliary_loss_clip": 0.01478305, + "auxiliary_loss_mlp": 0.00456105, + "balance_loss_clip": 1.2186625, + "balance_loss_mlp": 0.41755289, + "epoch": 0.45068390199909814, + "flos": 37670293186560.0, + "grad_norm": 3.5629823577327673, + "language_loss": 0.90688062, + "learning_rate": 2.41352469075395e-06, + "loss": 0.92622471, + "num_input_tokens_seen": 160871280, + "router_z_loss_clip": 2.59960938, + "router_z_loss_mlp": 0.38574219, + "step": 7496, + "time_per_iteration": 2.8481857776641846 + }, + { + "auxiliary_loss_clip": 0.01455916, + "auxiliary_loss_mlp": 0.00496519, + "balance_loss_clip": 1.19397879, + "balance_loss_mlp": 0.45381844, + "epoch": 0.4507440252517661, + "flos": 22302501338880.0, + "grad_norm": 71.16624384024739, + "language_loss": 0.81495905, + "learning_rate": 2.4131436370355534e-06, + "loss": 0.83448339, + "num_input_tokens_seen": 160888625, + "router_z_loss_clip": 2.61914062, + "router_z_loss_mlp": 0.42724609, + "step": 7497, + "time_per_iteration": 2.710631847381592 + }, + { + "auxiliary_loss_clip": 0.014907, + "auxiliary_loss_mlp": 0.00488897, + "balance_loss_clip": 1.22252119, + "balance_loss_mlp": 0.44676861, + "epoch": 0.45080414850443407, + "flos": 13188374138880.0, + "grad_norm": 795.6308100234744, + "language_loss": 0.80969131, + "learning_rate": 2.4127625676507088e-06, + "loss": 0.82948726, + "num_input_tokens_seen": 160907040, + "router_z_loss_clip": 2.68164062, + "router_z_loss_mlp": 0.42114258, + "step": 7498, + "time_per_iteration": 2.659393787384033 + }, + { + "auxiliary_loss_clip": 0.01458455, + "auxiliary_loss_mlp": 0.00500552, + "balance_loss_clip": 1.19818544, + "balance_loss_mlp": 0.45623037, + "epoch": 0.4508642717571021, + "flos": 21944939402880.0, + "grad_norm": 30.265372858302353, + "language_loss": 0.78685528, + "learning_rate": 2.4123814826138663e-06, + "loss": 0.80644536, + "num_input_tokens_seen": 160927115, + "router_z_loss_clip": 2.59960938, + "router_z_loss_mlp": 0.44287109, + "step": 7499, + "time_per_iteration": 4.107067108154297 + }, + { + "auxiliary_loss_clip": 0.01471278, + "auxiliary_loss_mlp": 0.00487467, + "balance_loss_clip": 1.20841885, + "balance_loss_mlp": 0.44486207, + "epoch": 0.45092439500977005, + "flos": 23367468700800.0, + "grad_norm": 2.573831718910498, + "language_loss": 0.83176351, + "learning_rate": 2.412000381939477e-06, + "loss": 0.85135096, + "num_input_tokens_seen": 160944405, + "router_z_loss_clip": 2.62890625, + "router_z_loss_mlp": 0.42602539, + "step": 7500, + "time_per_iteration": 2.6663076877593994 + }, + { + "auxiliary_loss_clip": 0.01467985, + "auxiliary_loss_mlp": 0.0049867, + "balance_loss_clip": 1.21129942, + "balance_loss_mlp": 0.45673257, + "epoch": 0.450984518262438, + "flos": 20772958446720.0, + "grad_norm": 15.983119202980113, + "language_loss": 0.68105257, + "learning_rate": 2.411619265641992e-06, + "loss": 0.70071912, + "num_input_tokens_seen": 160961345, + "router_z_loss_clip": 2.56835938, + "router_z_loss_mlp": 0.41918945, + "step": 7501, + "time_per_iteration": 2.6267151832580566 + }, + { + "auxiliary_loss_clip": 0.01466661, + "auxiliary_loss_mlp": 0.00538413, + "balance_loss_clip": 1.202981, + "balance_loss_mlp": 0.49232626, + "epoch": 0.451044641515106, + "flos": 17707372300800.0, + "grad_norm": 6004.64005155851, + "language_loss": 0.91013551, + "learning_rate": 2.411238133735863e-06, + "loss": 0.93018615, + "num_input_tokens_seen": 160977330, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.46044922, + "step": 7502, + "time_per_iteration": 2.6102423667907715 + }, + { + "auxiliary_loss_clip": 0.01455353, + "auxiliary_loss_mlp": 0.0048575, + "balance_loss_clip": 1.19530535, + "balance_loss_mlp": 0.44226211, + "epoch": 0.45110476476777395, + "flos": 20594698225920.0, + "grad_norm": 99.71261367464086, + "language_loss": 0.8405953, + "learning_rate": 2.4108569862355418e-06, + "loss": 0.86000633, + "num_input_tokens_seen": 160997280, + "router_z_loss_clip": 2.59570312, + "router_z_loss_mlp": 0.43481445, + "step": 7503, + "time_per_iteration": 2.6892707347869873 + }, + { + "auxiliary_loss_clip": 0.01461933, + "auxiliary_loss_mlp": 0.00480859, + "balance_loss_clip": 1.20709729, + "balance_loss_mlp": 0.43803942, + "epoch": 0.4511648880204419, + "flos": 16034043265920.0, + "grad_norm": 21.505093549496127, + "language_loss": 0.86038566, + "learning_rate": 2.410475823155484e-06, + "loss": 0.87981355, + "num_input_tokens_seen": 161014235, + "router_z_loss_clip": 2.546875, + "router_z_loss_mlp": 0.42822266, + "step": 7504, + "time_per_iteration": 2.6092941761016846 + }, + { + "auxiliary_loss_clip": 0.01462738, + "auxiliary_loss_mlp": 0.00455315, + "balance_loss_clip": 1.2061007, + "balance_loss_mlp": 0.41566646, + "epoch": 0.4512250112731099, + "flos": 23978811202560.0, + "grad_norm": 1142.7614735418501, + "language_loss": 0.68527067, + "learning_rate": 2.4100946445101405e-06, + "loss": 0.7044512, + "num_input_tokens_seen": 161032360, + "router_z_loss_clip": 2.56445312, + "router_z_loss_mlp": 0.39624023, + "step": 7505, + "time_per_iteration": 2.7746388912200928 + }, + { + "auxiliary_loss_clip": 0.01296747, + "auxiliary_loss_mlp": 0.00207381, + "balance_loss_clip": 1.14290166, + "balance_loss_mlp": 0.19441062, + "epoch": 0.45128513452577784, + "flos": 71462308037760.0, + "grad_norm": 0.8286048787889418, + "language_loss": 0.58171731, + "learning_rate": 2.409713450313968e-06, + "loss": 0.5967586, + "num_input_tokens_seen": 161091360, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.12988281, + "step": 7506, + "time_per_iteration": 3.2007458209991455 + }, + { + "auxiliary_loss_clip": 0.0144665, + "auxiliary_loss_mlp": 0.00466567, + "balance_loss_clip": 1.19158792, + "balance_loss_mlp": 0.42543989, + "epoch": 0.4513452577784458, + "flos": 22090844448000.0, + "grad_norm": 8974.037925662034, + "language_loss": 0.85388678, + "learning_rate": 2.40933224058142e-06, + "loss": 0.87301898, + "num_input_tokens_seen": 161110825, + "router_z_loss_clip": 2.5546875, + "router_z_loss_mlp": 0.41137695, + "step": 7507, + "time_per_iteration": 2.769765853881836 + }, + { + "auxiliary_loss_clip": 0.01458525, + "auxiliary_loss_mlp": 0.00524318, + "balance_loss_clip": 1.20041013, + "balance_loss_mlp": 0.48142684, + "epoch": 0.4514053810311138, + "flos": 24276403382400.0, + "grad_norm": 4.3062512411657785, + "language_loss": 0.77664435, + "learning_rate": 2.4089510153269526e-06, + "loss": 0.79647279, + "num_input_tokens_seen": 161130685, + "router_z_loss_clip": 2.58203125, + "router_z_loss_mlp": 0.42871094, + "step": 7508, + "time_per_iteration": 2.7502026557922363 + }, + { + "auxiliary_loss_clip": 0.01451241, + "auxiliary_loss_mlp": 0.00484165, + "balance_loss_clip": 1.19875956, + "balance_loss_mlp": 0.4434666, + "epoch": 0.45146550428378174, + "flos": 17886781756800.0, + "grad_norm": 389.93109793939385, + "language_loss": 0.84837031, + "learning_rate": 2.4085697745650217e-06, + "loss": 0.86772436, + "num_input_tokens_seen": 161147555, + "router_z_loss_clip": 2.52539062, + "router_z_loss_mlp": 0.40722656, + "step": 7509, + "time_per_iteration": 2.6461970806121826 + }, + { + "auxiliary_loss_clip": 0.01455257, + "auxiliary_loss_mlp": 0.00527927, + "balance_loss_clip": 1.20416057, + "balance_loss_mlp": 0.48660907, + "epoch": 0.4515256275364497, + "flos": 24243437675520.0, + "grad_norm": 136.32840443492984, + "language_loss": 0.81406367, + "learning_rate": 2.4081885183100837e-06, + "loss": 0.83389544, + "num_input_tokens_seen": 161166255, + "router_z_loss_clip": 2.5078125, + "router_z_loss_mlp": 0.4128418, + "step": 7510, + "time_per_iteration": 2.7284348011016846 + }, + { + "auxiliary_loss_clip": 0.01458881, + "auxiliary_loss_mlp": 0.0049615, + "balance_loss_clip": 1.19507456, + "balance_loss_mlp": 0.45483187, + "epoch": 0.45158575078911767, + "flos": 20631039811200.0, + "grad_norm": 25.39697404901466, + "language_loss": 0.84070843, + "learning_rate": 2.4078072465765964e-06, + "loss": 0.8602587, + "num_input_tokens_seen": 161184720, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.41308594, + "step": 7511, + "time_per_iteration": 2.691373825073242 + }, + { + "auxiliary_loss_clip": 0.01460029, + "auxiliary_loss_mlp": 0.00534845, + "balance_loss_clip": 1.19848752, + "balance_loss_mlp": 0.49083275, + "epoch": 0.45164587404178563, + "flos": 23327751237120.0, + "grad_norm": 86.86396456971815, + "language_loss": 0.84718108, + "learning_rate": 2.4074259593790174e-06, + "loss": 0.8671298, + "num_input_tokens_seen": 161204360, + "router_z_loss_clip": 2.6171875, + "router_z_loss_mlp": 0.44018555, + "step": 7512, + "time_per_iteration": 2.6669094562530518 + }, + { + "auxiliary_loss_clip": 0.01471007, + "auxiliary_loss_mlp": 0.00559451, + "balance_loss_clip": 1.20063484, + "balance_loss_mlp": 0.51474756, + "epoch": 0.45170599729445365, + "flos": 23805973935360.0, + "grad_norm": 4.02544483168645, + "language_loss": 0.94234109, + "learning_rate": 2.4070446567318053e-06, + "loss": 0.96264565, + "num_input_tokens_seen": 161223575, + "router_z_loss_clip": 2.70703125, + "router_z_loss_mlp": 0.44677734, + "step": 7513, + "time_per_iteration": 2.656541347503662 + }, + { + "auxiliary_loss_clip": 0.01455672, + "auxiliary_loss_mlp": 0.00499236, + "balance_loss_clip": 1.20204341, + "balance_loss_mlp": 0.46058795, + "epoch": 0.4517661205471216, + "flos": 23512942782720.0, + "grad_norm": 4.100147097341956, + "language_loss": 0.72470766, + "learning_rate": 2.406663338649419e-06, + "loss": 0.74425673, + "num_input_tokens_seen": 161243805, + "router_z_loss_clip": 2.53515625, + "router_z_loss_mlp": 0.38647461, + "step": 7514, + "time_per_iteration": 2.73429274559021 + }, + { + "auxiliary_loss_clip": 0.01463987, + "auxiliary_loss_mlp": 0.00514819, + "balance_loss_clip": 1.20116043, + "balance_loss_mlp": 0.46911398, + "epoch": 0.4518262437997896, + "flos": 23513948363520.0, + "grad_norm": 12.163853724226005, + "language_loss": 0.7611106, + "learning_rate": 2.406282005146318e-06, + "loss": 0.78089869, + "num_input_tokens_seen": 161261450, + "router_z_loss_clip": 2.625, + "router_z_loss_mlp": 0.45678711, + "step": 7515, + "time_per_iteration": 2.6980865001678467 + }, + { + "auxiliary_loss_clip": 0.0145394, + "auxiliary_loss_mlp": 0.00533801, + "balance_loss_clip": 1.18881714, + "balance_loss_mlp": 0.48699921, + "epoch": 0.45188636705245755, + "flos": 14568061489920.0, + "grad_norm": 86.48580090087346, + "language_loss": 0.87829012, + "learning_rate": 2.405900656236963e-06, + "loss": 0.89816761, + "num_input_tokens_seen": 161276965, + "router_z_loss_clip": 2.65625, + "router_z_loss_mlp": 0.4675293, + "step": 7516, + "time_per_iteration": 2.6605420112609863 + }, + { + "auxiliary_loss_clip": 0.01452676, + "auxiliary_loss_mlp": 0.00564029, + "balance_loss_clip": 1.19445252, + "balance_loss_mlp": 0.52104235, + "epoch": 0.4519464903051255, + "flos": 19901550499200.0, + "grad_norm": 281.8224133383948, + "language_loss": 0.7133621, + "learning_rate": 2.4055192919358137e-06, + "loss": 0.73352909, + "num_input_tokens_seen": 161295375, + "router_z_loss_clip": 2.5859375, + "router_z_loss_mlp": 0.4296875, + "step": 7517, + "time_per_iteration": 2.643141746520996 + }, + { + "auxiliary_loss_clip": 0.01444932, + "auxiliary_loss_mlp": 0.00537137, + "balance_loss_clip": 1.19107556, + "balance_loss_mlp": 0.49667746, + "epoch": 0.4520066135577935, + "flos": 18844376388480.0, + "grad_norm": 20.876764787029927, + "language_loss": 0.69063777, + "learning_rate": 2.405137912257333e-06, + "loss": 0.71045852, + "num_input_tokens_seen": 161313010, + "router_z_loss_clip": 2.5390625, + "router_z_loss_mlp": 0.40454102, + "step": 7518, + "time_per_iteration": 2.657953977584839 + }, + { + "auxiliary_loss_clip": 0.01445525, + "auxiliary_loss_mlp": 0.00524502, + "balance_loss_clip": 1.18937874, + "balance_loss_mlp": 0.48194411, + "epoch": 0.45206673681046144, + "flos": 48214419713280.0, + "grad_norm": 18.982833534625225, + "language_loss": 0.65105522, + "learning_rate": 2.404756517215982e-06, + "loss": 0.67075551, + "num_input_tokens_seen": 161336690, + "router_z_loss_clip": 2.56054688, + "router_z_loss_mlp": 0.42553711, + "step": 7519, + "time_per_iteration": 2.929690361022949 + }, + { + "auxiliary_loss_clip": 0.01439528, + "auxiliary_loss_mlp": 0.00574457, + "balance_loss_clip": 1.18589807, + "balance_loss_mlp": 0.52860928, + "epoch": 0.4521268600631294, + "flos": 23842171866240.0, + "grad_norm": 8.416398524811147, + "language_loss": 0.78193098, + "learning_rate": 2.404375106826223e-06, + "loss": 0.80207086, + "num_input_tokens_seen": 161357845, + "router_z_loss_clip": 2.53710938, + "router_z_loss_mlp": 0.45825195, + "step": 7520, + "time_per_iteration": 2.6856133937835693 + }, + { + "auxiliary_loss_clip": 0.01442677, + "auxiliary_loss_mlp": 0.00556601, + "balance_loss_clip": 1.18295217, + "balance_loss_mlp": 0.51041889, + "epoch": 0.4521869833157974, + "flos": 18843622202880.0, + "grad_norm": 98.090375144064, + "language_loss": 0.81437778, + "learning_rate": 2.4039936811025194e-06, + "loss": 0.83437061, + "num_input_tokens_seen": 161375160, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.46166992, + "step": 7521, + "time_per_iteration": 2.6849207878112793 + }, + { + "auxiliary_loss_clip": 0.01443208, + "auxiliary_loss_mlp": 0.00547526, + "balance_loss_clip": 1.18076861, + "balance_loss_mlp": 0.50163031, + "epoch": 0.45224710656846534, + "flos": 19788072456960.0, + "grad_norm": 3.0805928684025505, + "language_loss": 0.74709237, + "learning_rate": 2.4036122400593343e-06, + "loss": 0.76699972, + "num_input_tokens_seen": 161393690, + "router_z_loss_clip": 2.62304688, + "router_z_loss_mlp": 0.45922852, + "step": 7522, + "time_per_iteration": 2.6507527828216553 + }, + { + "auxiliary_loss_clip": 0.01433387, + "auxiliary_loss_mlp": 0.00544078, + "balance_loss_clip": 1.17788196, + "balance_loss_mlp": 0.5000658, + "epoch": 0.4523072298211333, + "flos": 28256131681920.0, + "grad_norm": 7.825423562791712, + "language_loss": 0.65770197, + "learning_rate": 2.403230783711134e-06, + "loss": 0.6774767, + "num_input_tokens_seen": 161415015, + "router_z_loss_clip": 2.55664062, + "router_z_loss_mlp": 0.43994141, + "step": 7523, + "time_per_iteration": 2.729522466659546 + }, + { + "auxiliary_loss_clip": 0.01449169, + "auxiliary_loss_mlp": 0.00529769, + "balance_loss_clip": 1.18739295, + "balance_loss_mlp": 0.48585242, + "epoch": 0.45236735307380127, + "flos": 11181039511680.0, + "grad_norm": 146.22176085431983, + "language_loss": 0.84126812, + "learning_rate": 2.4028493120723813e-06, + "loss": 0.86105752, + "num_input_tokens_seen": 161432940, + "router_z_loss_clip": 2.61523438, + "router_z_loss_mlp": 0.43920898, + "step": 7524, + "time_per_iteration": 2.6658365726470947 + }, + { + "auxiliary_loss_clip": 0.01438266, + "auxiliary_loss_mlp": 0.00508763, + "balance_loss_clip": 1.18234932, + "balance_loss_mlp": 0.46637216, + "epoch": 0.45242747632646924, + "flos": 22601386408320.0, + "grad_norm": 7.211583913064756, + "language_loss": 0.69649941, + "learning_rate": 2.4024678251575417e-06, + "loss": 0.71596974, + "num_input_tokens_seen": 161452215, + "router_z_loss_clip": 2.5625, + "router_z_loss_mlp": 0.42358398, + "step": 7525, + "time_per_iteration": 2.665422201156616 + }, + { + "auxiliary_loss_clip": 0.01432074, + "auxiliary_loss_mlp": 0.00524651, + "balance_loss_clip": 1.18326569, + "balance_loss_mlp": 0.48428667, + "epoch": 0.45248759957913726, + "flos": 18256267008000.0, + "grad_norm": 9.559409504466883, + "language_loss": 0.83540034, + "learning_rate": 2.402086322981083e-06, + "loss": 0.85496759, + "num_input_tokens_seen": 161469520, + "router_z_loss_clip": 2.48828125, + "router_z_loss_mlp": 0.40332031, + "step": 7526, + "time_per_iteration": 2.6207313537597656 + }, + { + "auxiliary_loss_clip": 0.01438249, + "auxiliary_loss_mlp": 0.00533536, + "balance_loss_clip": 1.18473566, + "balance_loss_mlp": 0.49305257, + "epoch": 0.4525477228318052, + "flos": 22450094323200.0, + "grad_norm": 5.8178366277881794, + "language_loss": 0.8651908, + "learning_rate": 2.40170480555747e-06, + "loss": 0.88490868, + "num_input_tokens_seen": 161487335, + "router_z_loss_clip": 2.5390625, + "router_z_loss_mlp": 0.40454102, + "step": 7527, + "time_per_iteration": 2.660675287246704 + }, + { + "auxiliary_loss_clip": 0.01430624, + "auxiliary_loss_mlp": 0.00510972, + "balance_loss_clip": 1.17722619, + "balance_loss_mlp": 0.47003552, + "epoch": 0.4526078460844732, + "flos": 29644869260160.0, + "grad_norm": 7.661119786866352, + "language_loss": 0.70648575, + "learning_rate": 2.4013232729011706e-06, + "loss": 0.72590172, + "num_input_tokens_seen": 161510095, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.40966797, + "step": 7528, + "time_per_iteration": 2.7197182178497314 + }, + { + "auxiliary_loss_clip": 0.0142655, + "auxiliary_loss_mlp": 0.00545898, + "balance_loss_clip": 1.17402637, + "balance_loss_mlp": 0.50281578, + "epoch": 0.45266796933714115, + "flos": 23039747988480.0, + "grad_norm": 18.280232616044394, + "language_loss": 0.80112791, + "learning_rate": 2.4009417250266525e-06, + "loss": 0.8208524, + "num_input_tokens_seen": 161528725, + "router_z_loss_clip": 2.52734375, + "router_z_loss_mlp": 0.43066406, + "step": 7529, + "time_per_iteration": 2.658329963684082 + }, + { + "auxiliary_loss_clip": 0.01430568, + "auxiliary_loss_mlp": 0.00538483, + "balance_loss_clip": 1.17482674, + "balance_loss_mlp": 0.4952817, + "epoch": 0.4527280925898091, + "flos": 14428405411200.0, + "grad_norm": 11.51695800436085, + "language_loss": 0.80425882, + "learning_rate": 2.400560161948384e-06, + "loss": 0.82394934, + "num_input_tokens_seen": 161547195, + "router_z_loss_clip": 2.55664062, + "router_z_loss_mlp": 0.43212891, + "step": 7530, + "time_per_iteration": 2.6455655097961426 + }, + { + "auxiliary_loss_clip": 0.01429415, + "auxiliary_loss_mlp": 0.00549924, + "balance_loss_clip": 1.17740345, + "balance_loss_mlp": 0.50760448, + "epoch": 0.4527882158424771, + "flos": 22925515760640.0, + "grad_norm": 23.694580062295593, + "language_loss": 0.81326473, + "learning_rate": 2.400178583680834e-06, + "loss": 0.83305812, + "num_input_tokens_seen": 161565565, + "router_z_loss_clip": 2.51757812, + "router_z_loss_mlp": 0.4230957, + "step": 7531, + "time_per_iteration": 4.113640785217285 + }, + { + "auxiliary_loss_clip": 0.01419408, + "auxiliary_loss_mlp": 0.0053095, + "balance_loss_clip": 1.17212045, + "balance_loss_mlp": 0.49058539, + "epoch": 0.45284833909514505, + "flos": 25555326105600.0, + "grad_norm": 754.6669815258074, + "language_loss": 0.72552276, + "learning_rate": 2.3997969902384717e-06, + "loss": 0.74502629, + "num_input_tokens_seen": 161586630, + "router_z_loss_clip": 2.47265625, + "router_z_loss_mlp": 0.40380859, + "step": 7532, + "time_per_iteration": 2.736219644546509 + }, + { + "auxiliary_loss_clip": 0.01418382, + "auxiliary_loss_mlp": 0.00522368, + "balance_loss_clip": 1.16963577, + "balance_loss_mlp": 0.47976264, + "epoch": 0.452908462347813, + "flos": 18150007599360.0, + "grad_norm": 10.251837705333461, + "language_loss": 0.83094585, + "learning_rate": 2.399415381635768e-06, + "loss": 0.8503533, + "num_input_tokens_seen": 161603815, + "router_z_loss_clip": 2.48828125, + "router_z_loss_mlp": 0.42578125, + "step": 7533, + "time_per_iteration": 2.639467477798462 + }, + { + "auxiliary_loss_clip": 0.01421638, + "auxiliary_loss_mlp": 0.00563247, + "balance_loss_clip": 1.16276085, + "balance_loss_mlp": 0.51682675, + "epoch": 0.452968585600481, + "flos": 19062749122560.0, + "grad_norm": 15.323347894748178, + "language_loss": 0.90231979, + "learning_rate": 2.3990337578871927e-06, + "loss": 0.92216861, + "num_input_tokens_seen": 161622900, + "router_z_loss_clip": 2.58984375, + "router_z_loss_mlp": 0.46459961, + "step": 7534, + "time_per_iteration": 2.6816413402557373 + }, + { + "auxiliary_loss_clip": 0.01405087, + "auxiliary_loss_mlp": 0.00535945, + "balance_loss_clip": 1.15587425, + "balance_loss_mlp": 0.49095502, + "epoch": 0.45302870885314894, + "flos": 22051737515520.0, + "grad_norm": 787.74803932213, + "language_loss": 0.81196213, + "learning_rate": 2.3986521190072176e-06, + "loss": 0.83137238, + "num_input_tokens_seen": 161641700, + "router_z_loss_clip": 2.4921875, + "router_z_loss_mlp": 0.44995117, + "step": 7535, + "time_per_iteration": 5.7490270137786865 + }, + { + "auxiliary_loss_clip": 0.0140971, + "auxiliary_loss_mlp": 0.00492333, + "balance_loss_clip": 1.16248465, + "balance_loss_mlp": 0.45301783, + "epoch": 0.4530888321058169, + "flos": 20376217751040.0, + "grad_norm": 134.97468164615978, + "language_loss": 0.86859691, + "learning_rate": 2.3982704650103138e-06, + "loss": 0.88761735, + "num_input_tokens_seen": 161661955, + "router_z_loss_clip": 2.47460938, + "router_z_loss_mlp": 0.39306641, + "step": 7536, + "time_per_iteration": 2.7371819019317627 + }, + { + "auxiliary_loss_clip": 0.01399777, + "auxiliary_loss_mlp": 0.00489656, + "balance_loss_clip": 1.14674997, + "balance_loss_mlp": 0.44964981, + "epoch": 0.4531489553584849, + "flos": 14830425406080.0, + "grad_norm": 20.55438820063087, + "language_loss": 0.81005776, + "learning_rate": 2.3978887959109544e-06, + "loss": 0.82895207, + "num_input_tokens_seen": 161679245, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.40014648, + "step": 7537, + "time_per_iteration": 2.639756441116333 + }, + { + "auxiliary_loss_clip": 0.01396801, + "auxiliary_loss_mlp": 0.00501212, + "balance_loss_clip": 1.14890122, + "balance_loss_mlp": 0.4621594, + "epoch": 0.45320907861115284, + "flos": 21944975316480.0, + "grad_norm": 21.591274845369494, + "language_loss": 0.8207773, + "learning_rate": 2.3975071117236118e-06, + "loss": 0.83975738, + "num_input_tokens_seen": 161698795, + "router_z_loss_clip": 2.4765625, + "router_z_loss_mlp": 0.390625, + "step": 7538, + "time_per_iteration": 2.652264356613159 + }, + { + "auxiliary_loss_clip": 0.01294905, + "auxiliary_loss_mlp": 0.00266763, + "balance_loss_clip": 1.1333648, + "balance_loss_mlp": 0.25360271, + "epoch": 0.45326920186382086, + "flos": 66251455038720.0, + "grad_norm": 0.7957771001938359, + "language_loss": 0.62082815, + "learning_rate": 2.3971254124627593e-06, + "loss": 0.63644481, + "num_input_tokens_seen": 161761980, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.13183594, + "step": 7539, + "time_per_iteration": 3.1907784938812256 + }, + { + "auxiliary_loss_clip": 0.01401923, + "auxiliary_loss_mlp": 0.00464857, + "balance_loss_clip": 1.1578629, + "balance_loss_mlp": 0.42589915, + "epoch": 0.4533293251164888, + "flos": 14684233052160.0, + "grad_norm": 18416.959974929934, + "language_loss": 0.70765531, + "learning_rate": 2.396743698142872e-06, + "loss": 0.72632313, + "num_input_tokens_seen": 161779455, + "router_z_loss_clip": 2.44335938, + "router_z_loss_mlp": 0.38964844, + "step": 7540, + "time_per_iteration": 2.6831774711608887 + }, + { + "auxiliary_loss_clip": 0.01409725, + "auxiliary_loss_mlp": 0.0048348, + "balance_loss_clip": 1.15302467, + "balance_loss_mlp": 0.44142273, + "epoch": 0.4533894483691568, + "flos": 22601206840320.0, + "grad_norm": 15.968056610546975, + "language_loss": 0.92417252, + "learning_rate": 2.396361968778424e-06, + "loss": 0.94310462, + "num_input_tokens_seen": 161798980, + "router_z_loss_clip": 2.56640625, + "router_z_loss_mlp": 0.42041016, + "step": 7541, + "time_per_iteration": 4.136256694793701 + }, + { + "auxiliary_loss_clip": 0.01392485, + "auxiliary_loss_mlp": 0.00500176, + "balance_loss_clip": 1.14500976, + "balance_loss_mlp": 0.45852441, + "epoch": 0.45344957162182475, + "flos": 34751617666560.0, + "grad_norm": 12.880920318805352, + "language_loss": 0.81849569, + "learning_rate": 2.395980224383889e-06, + "loss": 0.83742225, + "num_input_tokens_seen": 161819745, + "router_z_loss_clip": 2.47460938, + "router_z_loss_mlp": 0.41625977, + "step": 7542, + "time_per_iteration": 2.732893943786621 + }, + { + "auxiliary_loss_clip": 0.01390305, + "auxiliary_loss_mlp": 0.00451728, + "balance_loss_clip": 1.14602089, + "balance_loss_mlp": 0.41224596, + "epoch": 0.4535096948744927, + "flos": 23550218121600.0, + "grad_norm": 37.4049304772582, + "language_loss": 0.8511759, + "learning_rate": 2.395598464973746e-06, + "loss": 0.86959624, + "num_input_tokens_seen": 161838575, + "router_z_loss_clip": 2.4453125, + "router_z_loss_mlp": 0.39526367, + "step": 7543, + "time_per_iteration": 2.6855037212371826 + }, + { + "auxiliary_loss_clip": 0.01391011, + "auxiliary_loss_mlp": 0.00477236, + "balance_loss_clip": 1.14344168, + "balance_loss_mlp": 0.43799269, + "epoch": 0.4535698181271607, + "flos": 25557552748800.0, + "grad_norm": 53.16560567948009, + "language_loss": 0.81230628, + "learning_rate": 2.395216690562469e-06, + "loss": 0.83098876, + "num_input_tokens_seen": 161858590, + "router_z_loss_clip": 2.47851562, + "router_z_loss_mlp": 0.39233398, + "step": 7544, + "time_per_iteration": 2.6775143146514893 + }, + { + "auxiliary_loss_clip": 0.01398373, + "auxiliary_loss_mlp": 0.00504708, + "balance_loss_clip": 1.15055275, + "balance_loss_mlp": 0.46286604, + "epoch": 0.45362994137982865, + "flos": 24864117713280.0, + "grad_norm": 72.50017988571514, + "language_loss": 0.839185, + "learning_rate": 2.3948349011645355e-06, + "loss": 0.85821581, + "num_input_tokens_seen": 161878390, + "router_z_loss_clip": 2.48242188, + "router_z_loss_mlp": 0.41870117, + "step": 7545, + "time_per_iteration": 2.6845386028289795 + }, + { + "auxiliary_loss_clip": 0.01390863, + "auxiliary_loss_mlp": 0.00434031, + "balance_loss_clip": 1.14682555, + "balance_loss_mlp": 0.39724261, + "epoch": 0.4536900646324966, + "flos": 30806794408320.0, + "grad_norm": 17.6388814798412, + "language_loss": 0.78258109, + "learning_rate": 2.394453096794423e-06, + "loss": 0.80083001, + "num_input_tokens_seen": 161898610, + "router_z_loss_clip": 2.43945312, + "router_z_loss_mlp": 0.36791992, + "step": 7546, + "time_per_iteration": 2.7356560230255127 + }, + { + "auxiliary_loss_clip": 0.01413974, + "auxiliary_loss_mlp": 0.0048157, + "balance_loss_clip": 1.15938473, + "balance_loss_mlp": 0.44261217, + "epoch": 0.4537501878851646, + "flos": 23404313076480.0, + "grad_norm": 9.253442624818158, + "language_loss": 0.82599688, + "learning_rate": 2.394071277466609e-06, + "loss": 0.84495229, + "num_input_tokens_seen": 161918210, + "router_z_loss_clip": 2.54492188, + "router_z_loss_mlp": 0.3894043, + "step": 7547, + "time_per_iteration": 2.712801694869995 + }, + { + "auxiliary_loss_clip": 0.01396621, + "auxiliary_loss_mlp": 0.00478534, + "balance_loss_clip": 1.14558351, + "balance_loss_mlp": 0.43843168, + "epoch": 0.45381031113783254, + "flos": 18149289327360.0, + "grad_norm": 24.151522752542984, + "language_loss": 0.76080489, + "learning_rate": 2.393689443195573e-06, + "loss": 0.77955645, + "num_input_tokens_seen": 161936950, + "router_z_loss_clip": 2.51171875, + "router_z_loss_mlp": 0.40063477, + "step": 7548, + "time_per_iteration": 2.6751809120178223 + }, + { + "auxiliary_loss_clip": 0.01377627, + "auxiliary_loss_mlp": 0.00445521, + "balance_loss_clip": 1.13646996, + "balance_loss_mlp": 0.4095439, + "epoch": 0.4538704343905005, + "flos": 25336666062720.0, + "grad_norm": 80.48224068872496, + "language_loss": 0.79872072, + "learning_rate": 2.393307593995794e-06, + "loss": 0.81695217, + "num_input_tokens_seen": 161955550, + "router_z_loss_clip": 2.40820312, + "router_z_loss_mlp": 0.35961914, + "step": 7549, + "time_per_iteration": 2.7075366973876953 + }, + { + "auxiliary_loss_clip": 0.01394545, + "auxiliary_loss_mlp": 0.00450403, + "balance_loss_clip": 1.14384425, + "balance_loss_mlp": 0.41468805, + "epoch": 0.4539305576431685, + "flos": 28731445378560.0, + "grad_norm": 20.49449267955831, + "language_loss": 0.71582103, + "learning_rate": 2.392925729881751e-06, + "loss": 0.73427045, + "num_input_tokens_seen": 161976760, + "router_z_loss_clip": 2.50585938, + "router_z_loss_mlp": 0.35717773, + "step": 7550, + "time_per_iteration": 2.7168548107147217 + }, + { + "auxiliary_loss_clip": 0.01379851, + "auxiliary_loss_mlp": 0.0046496, + "balance_loss_clip": 1.13488519, + "balance_loss_mlp": 0.42671788, + "epoch": 0.45399068089583644, + "flos": 22492397566080.0, + "grad_norm": 4.25066675935797, + "language_loss": 0.74873912, + "learning_rate": 2.3925438508679263e-06, + "loss": 0.76718724, + "num_input_tokens_seen": 161996120, + "router_z_loss_clip": 2.44921875, + "router_z_loss_mlp": 0.38232422, + "step": 7551, + "time_per_iteration": 2.7430081367492676 + }, + { + "auxiliary_loss_clip": 0.01398153, + "auxiliary_loss_mlp": 0.0047589, + "balance_loss_clip": 1.14182544, + "balance_loss_mlp": 0.4340471, + "epoch": 0.45405080414850446, + "flos": 12893403651840.0, + "grad_norm": 7.398487771690533, + "language_loss": 0.86284363, + "learning_rate": 2.392161956968798e-06, + "loss": 0.88158405, + "num_input_tokens_seen": 162011125, + "router_z_loss_clip": 2.5625, + "router_z_loss_mlp": 0.41821289, + "step": 7552, + "time_per_iteration": 2.784442186355591 + }, + { + "auxiliary_loss_clip": 0.01315759, + "auxiliary_loss_mlp": 0.00107542, + "balance_loss_clip": 1.14286697, + "balance_loss_mlp": 0.0985302, + "epoch": 0.4541109274011724, + "flos": 59766919724160.0, + "grad_norm": 0.810311917073569, + "language_loss": 0.57880372, + "learning_rate": 2.39178004819885e-06, + "loss": 0.59303671, + "num_input_tokens_seen": 162068705, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.09033203, + "step": 7553, + "time_per_iteration": 3.109130859375 + }, + { + "auxiliary_loss_clip": 0.01387491, + "auxiliary_loss_mlp": 0.00459154, + "balance_loss_clip": 1.14137888, + "balance_loss_mlp": 0.4219842, + "epoch": 0.4541710506538404, + "flos": 28511743841280.0, + "grad_norm": 7.914112514850036, + "language_loss": 0.81288075, + "learning_rate": 2.3913981245725626e-06, + "loss": 0.83134723, + "num_input_tokens_seen": 162089655, + "router_z_loss_clip": 2.45898438, + "router_z_loss_mlp": 0.37182617, + "step": 7554, + "time_per_iteration": 2.7354743480682373 + }, + { + "auxiliary_loss_clip": 0.01403639, + "auxiliary_loss_mlp": 0.00434468, + "balance_loss_clip": 1.14491749, + "balance_loss_mlp": 0.39663115, + "epoch": 0.45423117390650836, + "flos": 17675591742720.0, + "grad_norm": 41.15643693443292, + "language_loss": 0.84826684, + "learning_rate": 2.3910161861044194e-06, + "loss": 0.86664796, + "num_input_tokens_seen": 162108465, + "router_z_loss_clip": 2.58789062, + "router_z_loss_mlp": 0.37817383, + "step": 7555, + "time_per_iteration": 2.6685259342193604 + }, + { + "auxiliary_loss_clip": 0.01386654, + "auxiliary_loss_mlp": 0.00449438, + "balance_loss_clip": 1.13840151, + "balance_loss_mlp": 0.41052771, + "epoch": 0.4542912971591763, + "flos": 28072556248320.0, + "grad_norm": 104.69087040068001, + "language_loss": 0.77133596, + "learning_rate": 2.390634232808903e-06, + "loss": 0.78969681, + "num_input_tokens_seen": 162129910, + "router_z_loss_clip": 2.48632812, + "router_z_loss_mlp": 0.38916016, + "step": 7556, + "time_per_iteration": 2.761812210083008 + }, + { + "auxiliary_loss_clip": 0.01409186, + "auxiliary_loss_mlp": 0.00497142, + "balance_loss_clip": 1.1503942, + "balance_loss_mlp": 0.45451325, + "epoch": 0.4543514204118443, + "flos": 22671771108480.0, + "grad_norm": 5.315675269143128, + "language_loss": 0.6902771, + "learning_rate": 2.3902522647004982e-06, + "loss": 0.70934039, + "num_input_tokens_seen": 162148840, + "router_z_loss_clip": 2.59179688, + "router_z_loss_mlp": 0.42675781, + "step": 7557, + "time_per_iteration": 2.7455177307128906 + }, + { + "auxiliary_loss_clip": 0.0130167, + "auxiliary_loss_mlp": 0.0008219, + "balance_loss_clip": 1.13097239, + "balance_loss_mlp": 0.07413148, + "epoch": 0.45441154366451225, + "flos": 58216549921920.0, + "grad_norm": 0.6656941516779946, + "language_loss": 0.57159871, + "learning_rate": 2.3898702817936875e-06, + "loss": 0.5854373, + "num_input_tokens_seen": 162208500, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.08056641, + "step": 7558, + "time_per_iteration": 3.132261037826538 + }, + { + "auxiliary_loss_clip": 0.01411365, + "auxiliary_loss_mlp": 0.00466145, + "balance_loss_clip": 1.14999199, + "balance_loss_mlp": 0.42549437, + "epoch": 0.4544716669171802, + "flos": 16764286763520.0, + "grad_norm": 16.99316737911206, + "language_loss": 0.64894962, + "learning_rate": 2.3894882841029573e-06, + "loss": 0.66772467, + "num_input_tokens_seen": 162224650, + "router_z_loss_clip": 2.61328125, + "router_z_loss_mlp": 0.40625, + "step": 7559, + "time_per_iteration": 2.757943868637085 + }, + { + "auxiliary_loss_clip": 0.01405027, + "auxiliary_loss_mlp": 0.0048778, + "balance_loss_clip": 1.15123892, + "balance_loss_mlp": 0.44472212, + "epoch": 0.4545317901698482, + "flos": 15925233991680.0, + "grad_norm": 57.144020300106995, + "language_loss": 0.78336072, + "learning_rate": 2.389106271642792e-06, + "loss": 0.80228877, + "num_input_tokens_seen": 162242930, + "router_z_loss_clip": 2.53710938, + "router_z_loss_mlp": 0.4309082, + "step": 7560, + "time_per_iteration": 2.6788315773010254 + }, + { + "auxiliary_loss_clip": 0.01418926, + "auxiliary_loss_mlp": 0.00439861, + "balance_loss_clip": 1.15712512, + "balance_loss_mlp": 0.39985472, + "epoch": 0.45459191342251615, + "flos": 17639752947840.0, + "grad_norm": 8.45195234352107, + "language_loss": 0.78098845, + "learning_rate": 2.3887242444276775e-06, + "loss": 0.7995764, + "num_input_tokens_seen": 162261455, + "router_z_loss_clip": 2.62109375, + "router_z_loss_mlp": 0.40014648, + "step": 7561, + "time_per_iteration": 2.671323537826538 + }, + { + "auxiliary_loss_clip": 0.01402502, + "auxiliary_loss_mlp": 0.00428371, + "balance_loss_clip": 1.15416074, + "balance_loss_mlp": 0.3908917, + "epoch": 0.4546520366751841, + "flos": 16176608346240.0, + "grad_norm": 30.13400509643176, + "language_loss": 0.90198398, + "learning_rate": 2.3883422024721015e-06, + "loss": 0.92029274, + "num_input_tokens_seen": 162279725, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.375, + "step": 7562, + "time_per_iteration": 2.653895378112793 + }, + { + "auxiliary_loss_clip": 0.01406511, + "auxiliary_loss_mlp": 0.00395332, + "balance_loss_clip": 1.157758, + "balance_loss_mlp": 0.36064172, + "epoch": 0.4547121599278521, + "flos": 19751443562880.0, + "grad_norm": 4.088856829581359, + "language_loss": 0.94425738, + "learning_rate": 2.38796014579055e-06, + "loss": 0.96227586, + "num_input_tokens_seen": 162297865, + "router_z_loss_clip": 2.48632812, + "router_z_loss_mlp": 0.34692383, + "step": 7563, + "time_per_iteration": 2.7290005683898926 + }, + { + "auxiliary_loss_clip": 0.01413271, + "auxiliary_loss_mlp": 0.0043893, + "balance_loss_clip": 1.15521264, + "balance_loss_mlp": 0.39842236, + "epoch": 0.45477228318052004, + "flos": 19937461121280.0, + "grad_norm": 12.988193882778504, + "language_loss": 0.78602695, + "learning_rate": 2.3875780743975097e-06, + "loss": 0.80454898, + "num_input_tokens_seen": 162316010, + "router_z_loss_clip": 2.58203125, + "router_z_loss_mlp": 0.40478516, + "step": 7564, + "time_per_iteration": 2.6878821849823 + }, + { + "auxiliary_loss_clip": 0.01403159, + "auxiliary_loss_mlp": 0.00447452, + "balance_loss_clip": 1.1421392, + "balance_loss_mlp": 0.40866095, + "epoch": 0.454832406433188, + "flos": 21288312829440.0, + "grad_norm": 2.8353115838090113, + "language_loss": 0.73216462, + "learning_rate": 2.3871959883074713e-06, + "loss": 0.75067073, + "num_input_tokens_seen": 162336115, + "router_z_loss_clip": 2.61132812, + "router_z_loss_mlp": 0.38818359, + "step": 7565, + "time_per_iteration": 2.7408156394958496 + }, + { + "auxiliary_loss_clip": 0.01397724, + "auxiliary_loss_mlp": 0.00434289, + "balance_loss_clip": 1.14721608, + "balance_loss_mlp": 0.39704764, + "epoch": 0.45489252968585603, + "flos": 24498726612480.0, + "grad_norm": 55.9976426084516, + "language_loss": 0.85288984, + "learning_rate": 2.386813887534922e-06, + "loss": 0.87120998, + "num_input_tokens_seen": 162355705, + "router_z_loss_clip": 2.50390625, + "router_z_loss_mlp": 0.37231445, + "step": 7566, + "time_per_iteration": 2.7086780071258545 + }, + { + "auxiliary_loss_clip": 0.01407017, + "auxiliary_loss_mlp": 0.00417981, + "balance_loss_clip": 1.14889073, + "balance_loss_mlp": 0.37909546, + "epoch": 0.454952652938524, + "flos": 17092474352640.0, + "grad_norm": 35.557495629215744, + "language_loss": 0.79703265, + "learning_rate": 2.3864317720943508e-06, + "loss": 0.8152827, + "num_input_tokens_seen": 162374055, + "router_z_loss_clip": 2.58007812, + "router_z_loss_mlp": 0.38891602, + "step": 7567, + "time_per_iteration": 2.6516432762145996 + }, + { + "auxiliary_loss_clip": 0.01412723, + "auxiliary_loss_mlp": 0.00421675, + "balance_loss_clip": 1.15768218, + "balance_loss_mlp": 0.3847678, + "epoch": 0.45501277619119196, + "flos": 27630387826560.0, + "grad_norm": 244.17996734995018, + "language_loss": 0.85779965, + "learning_rate": 2.386049642000249e-06, + "loss": 0.87614357, + "num_input_tokens_seen": 162393560, + "router_z_loss_clip": 2.54882812, + "router_z_loss_mlp": 0.36914062, + "step": 7568, + "time_per_iteration": 2.7208304405212402 + }, + { + "auxiliary_loss_clip": 0.0143528, + "auxiliary_loss_mlp": 0.00395743, + "balance_loss_clip": 1.16716266, + "balance_loss_mlp": 0.35874087, + "epoch": 0.4550728994438599, + "flos": 19974664632960.0, + "grad_norm": 4.044342095200329, + "language_loss": 0.88582605, + "learning_rate": 2.3856674972671055e-06, + "loss": 0.90413624, + "num_input_tokens_seen": 162413170, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.37036133, + "step": 7569, + "time_per_iteration": 2.779404640197754 + }, + { + "auxiliary_loss_clip": 0.01422708, + "auxiliary_loss_mlp": 0.00433889, + "balance_loss_clip": 1.1581099, + "balance_loss_mlp": 0.3954556, + "epoch": 0.4551330226965279, + "flos": 26066873646720.0, + "grad_norm": 46.18699037321896, + "language_loss": 0.80460036, + "learning_rate": 2.385285337909412e-06, + "loss": 0.82316625, + "num_input_tokens_seen": 162434080, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.3840332, + "step": 7570, + "time_per_iteration": 2.7331037521362305 + }, + { + "auxiliary_loss_clip": 0.01418785, + "auxiliary_loss_mlp": 0.00411083, + "balance_loss_clip": 1.1626277, + "balance_loss_mlp": 0.37601143, + "epoch": 0.45519314594919585, + "flos": 32781091501440.0, + "grad_norm": 6.2685219474004645, + "language_loss": 0.80413347, + "learning_rate": 2.3849031639416596e-06, + "loss": 0.82243216, + "num_input_tokens_seen": 162455445, + "router_z_loss_clip": 2.56054688, + "router_z_loss_mlp": 0.35058594, + "step": 7571, + "time_per_iteration": 2.7777576446533203 + }, + { + "auxiliary_loss_clip": 0.01411661, + "auxiliary_loss_mlp": 0.00418647, + "balance_loss_clip": 1.1612978, + "balance_loss_mlp": 0.38257474, + "epoch": 0.4552532692018638, + "flos": 19172671718400.0, + "grad_norm": 5.956624558436569, + "language_loss": 0.84781361, + "learning_rate": 2.3845209753783414e-06, + "loss": 0.86611676, + "num_input_tokens_seen": 162474940, + "router_z_loss_clip": 2.50195312, + "router_z_loss_mlp": 0.36108398, + "step": 7572, + "time_per_iteration": 2.658423662185669 + }, + { + "auxiliary_loss_clip": 0.01430694, + "auxiliary_loss_mlp": 0.00469057, + "balance_loss_clip": 1.16699076, + "balance_loss_mlp": 0.42995673, + "epoch": 0.4553133924545318, + "flos": 26027156183040.0, + "grad_norm": 13.60266532844963, + "language_loss": 0.79508913, + "learning_rate": 2.3841387722339486e-06, + "loss": 0.81408668, + "num_input_tokens_seen": 162493340, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.39086914, + "step": 7573, + "time_per_iteration": 4.17199182510376 + }, + { + "auxiliary_loss_clip": 0.01442062, + "auxiliary_loss_mlp": 0.00479091, + "balance_loss_clip": 1.17490005, + "balance_loss_mlp": 0.43436348, + "epoch": 0.45537351570719975, + "flos": 30661535808000.0, + "grad_norm": 26.218731854409505, + "language_loss": 0.80182374, + "learning_rate": 2.3837565545229748e-06, + "loss": 0.82103521, + "num_input_tokens_seen": 162514360, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.44677734, + "step": 7574, + "time_per_iteration": 2.7613720893859863 + }, + { + "auxiliary_loss_clip": 0.01424319, + "auxiliary_loss_mlp": 0.00421441, + "balance_loss_clip": 1.16540742, + "balance_loss_mlp": 0.38503399, + "epoch": 0.4554336389598677, + "flos": 24353396184960.0, + "grad_norm": 41.071127179054244, + "language_loss": 0.77046835, + "learning_rate": 2.383374322259915e-06, + "loss": 0.78892601, + "num_input_tokens_seen": 162535240, + "router_z_loss_clip": 2.59179688, + "router_z_loss_mlp": 0.36401367, + "step": 7575, + "time_per_iteration": 2.7275354862213135 + }, + { + "auxiliary_loss_clip": 0.01410835, + "auxiliary_loss_mlp": 0.00434531, + "balance_loss_clip": 1.15661216, + "balance_loss_mlp": 0.3966462, + "epoch": 0.4554937622125357, + "flos": 20557925677440.0, + "grad_norm": 6.813289329247483, + "language_loss": 0.79669189, + "learning_rate": 2.3829920754592617e-06, + "loss": 0.81514555, + "num_input_tokens_seen": 162553880, + "router_z_loss_clip": 2.54101562, + "router_z_loss_mlp": 0.37890625, + "step": 7576, + "time_per_iteration": 2.666214942932129 + }, + { + "auxiliary_loss_clip": 0.01418329, + "auxiliary_loss_mlp": 0.00385098, + "balance_loss_clip": 1.16381645, + "balance_loss_mlp": 0.35069418, + "epoch": 0.45555388546520365, + "flos": 22820764723200.0, + "grad_norm": 10.984060172419102, + "language_loss": 0.72564638, + "learning_rate": 2.382609814135511e-06, + "loss": 0.74368066, + "num_input_tokens_seen": 162574485, + "router_z_loss_clip": 2.5390625, + "router_z_loss_mlp": 0.34399414, + "step": 7577, + "time_per_iteration": 5.586750745773315 + }, + { + "auxiliary_loss_clip": 0.01435723, + "auxiliary_loss_mlp": 0.00417617, + "balance_loss_clip": 1.17260623, + "balance_loss_mlp": 0.37832606, + "epoch": 0.4556140087178716, + "flos": 21725992051200.0, + "grad_norm": 7.574604678329882, + "language_loss": 0.80285048, + "learning_rate": 2.382227538303157e-06, + "loss": 0.82138395, + "num_input_tokens_seen": 162595130, + "router_z_loss_clip": 2.6328125, + "router_z_loss_mlp": 0.39306641, + "step": 7578, + "time_per_iteration": 2.7509186267852783 + }, + { + "auxiliary_loss_clip": 0.01416855, + "auxiliary_loss_mlp": 0.00403712, + "balance_loss_clip": 1.16311026, + "balance_loss_mlp": 0.37097734, + "epoch": 0.45567413197053963, + "flos": 25994513698560.0, + "grad_norm": 157.1544504619399, + "language_loss": 0.76598823, + "learning_rate": 2.381845247976697e-06, + "loss": 0.78419387, + "num_input_tokens_seen": 162615720, + "router_z_loss_clip": 2.53710938, + "router_z_loss_mlp": 0.32714844, + "step": 7579, + "time_per_iteration": 2.7198898792266846 + }, + { + "auxiliary_loss_clip": 0.01426924, + "auxiliary_loss_mlp": 0.00394864, + "balance_loss_clip": 1.16614199, + "balance_loss_mlp": 0.36093736, + "epoch": 0.4557342552232076, + "flos": 21537604195200.0, + "grad_norm": 24.99644659561302, + "language_loss": 0.8406052, + "learning_rate": 2.381462943170627e-06, + "loss": 0.85882306, + "num_input_tokens_seen": 162635825, + "router_z_loss_clip": 2.60742188, + "router_z_loss_mlp": 0.33959961, + "step": 7580, + "time_per_iteration": 2.898420572280884 + }, + { + "auxiliary_loss_clip": 0.01429634, + "auxiliary_loss_mlp": 0.00385785, + "balance_loss_clip": 1.17552328, + "balance_loss_mlp": 0.35145292, + "epoch": 0.45579437847587556, + "flos": 40001972647680.0, + "grad_norm": 912.9577948483096, + "language_loss": 0.74337888, + "learning_rate": 2.381080623899444e-06, + "loss": 0.76153314, + "num_input_tokens_seen": 162659130, + "router_z_loss_clip": 2.54101562, + "router_z_loss_mlp": 0.34326172, + "step": 7581, + "time_per_iteration": 2.8408920764923096 + }, + { + "auxiliary_loss_clip": 0.0141966, + "auxiliary_loss_mlp": 0.00384075, + "balance_loss_clip": 1.16976428, + "balance_loss_mlp": 0.34990925, + "epoch": 0.4558545017285435, + "flos": 31138501530240.0, + "grad_norm": 15.408074537333219, + "language_loss": 0.77553368, + "learning_rate": 2.3806982901776455e-06, + "loss": 0.793571, + "num_input_tokens_seen": 162681665, + "router_z_loss_clip": 2.49414062, + "router_z_loss_mlp": 0.34179688, + "step": 7582, + "time_per_iteration": 2.8271267414093018 + }, + { + "auxiliary_loss_clip": 0.0143103, + "auxiliary_loss_mlp": 0.00418981, + "balance_loss_clip": 1.16626561, + "balance_loss_mlp": 0.37859261, + "epoch": 0.4559146249812115, + "flos": 21725776569600.0, + "grad_norm": 68.31996912781864, + "language_loss": 0.7857877, + "learning_rate": 2.380315942019729e-06, + "loss": 0.80428779, + "num_input_tokens_seen": 162702040, + "router_z_loss_clip": 2.65039062, + "router_z_loss_mlp": 0.40356445, + "step": 7583, + "time_per_iteration": 2.7957241535186768 + }, + { + "auxiliary_loss_clip": 0.01435093, + "auxiliary_loss_mlp": 0.00417258, + "balance_loss_clip": 1.17078447, + "balance_loss_mlp": 0.38128081, + "epoch": 0.45597474823387946, + "flos": 23805973935360.0, + "grad_norm": 19.080165849807504, + "language_loss": 0.79956806, + "learning_rate": 2.379933579440195e-06, + "loss": 0.81809163, + "num_input_tokens_seen": 162722375, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.35986328, + "step": 7584, + "time_per_iteration": 4.11084771156311 + }, + { + "auxiliary_loss_clip": 0.01438141, + "auxiliary_loss_mlp": 0.00432536, + "balance_loss_clip": 1.18032241, + "balance_loss_mlp": 0.39357847, + "epoch": 0.4560348714865474, + "flos": 31905661230720.0, + "grad_norm": 4.470858374547008, + "language_loss": 0.72675538, + "learning_rate": 2.379551202453541e-06, + "loss": 0.74546212, + "num_input_tokens_seen": 162746095, + "router_z_loss_clip": 2.578125, + "router_z_loss_mlp": 0.38964844, + "step": 7585, + "time_per_iteration": 2.774897336959839 + }, + { + "auxiliary_loss_clip": 0.01433223, + "auxiliary_loss_mlp": 0.00421434, + "balance_loss_clip": 1.17362821, + "balance_loss_mlp": 0.38507539, + "epoch": 0.4560949947392154, + "flos": 22048828513920.0, + "grad_norm": 7.378395418962431, + "language_loss": 0.81955588, + "learning_rate": 2.379168811074267e-06, + "loss": 0.8381024, + "num_input_tokens_seen": 162766330, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.36352539, + "step": 7586, + "time_per_iteration": 2.6550371646881104 + }, + { + "auxiliary_loss_clip": 0.01418912, + "auxiliary_loss_mlp": 0.00381223, + "balance_loss_clip": 1.16655779, + "balance_loss_mlp": 0.34770164, + "epoch": 0.45615511799188335, + "flos": 24571804832640.0, + "grad_norm": 1112.6829661335764, + "language_loss": 0.83850008, + "learning_rate": 2.3787864053168747e-06, + "loss": 0.85650134, + "num_input_tokens_seen": 162784755, + "router_z_loss_clip": 2.52148438, + "router_z_loss_mlp": 0.33569336, + "step": 7587, + "time_per_iteration": 2.7632453441619873 + }, + { + "auxiliary_loss_clip": 0.01429045, + "auxiliary_loss_mlp": 0.00466046, + "balance_loss_clip": 1.16668189, + "balance_loss_mlp": 0.42699328, + "epoch": 0.4562152412445513, + "flos": 18330709944960.0, + "grad_norm": 74.23460514733344, + "language_loss": 0.77569294, + "learning_rate": 2.378403985195863e-06, + "loss": 0.79464388, + "num_input_tokens_seen": 162803850, + "router_z_loss_clip": 2.62304688, + "router_z_loss_mlp": 0.39038086, + "step": 7588, + "time_per_iteration": 2.6846091747283936 + }, + { + "auxiliary_loss_clip": 0.01416529, + "auxiliary_loss_mlp": 0.00413839, + "balance_loss_clip": 1.16688204, + "balance_loss_mlp": 0.37855312, + "epoch": 0.4562753644972193, + "flos": 13516525814400.0, + "grad_norm": 18.099436050125565, + "language_loss": 0.8491661, + "learning_rate": 2.378021550725735e-06, + "loss": 0.86746979, + "num_input_tokens_seen": 162820775, + "router_z_loss_clip": 2.49804688, + "router_z_loss_mlp": 0.35253906, + "step": 7589, + "time_per_iteration": 2.6110925674438477 + }, + { + "auxiliary_loss_clip": 0.01407464, + "auxiliary_loss_mlp": 0.00386201, + "balance_loss_clip": 1.15674376, + "balance_loss_mlp": 0.35277507, + "epoch": 0.45633548774988725, + "flos": 29639697701760.0, + "grad_norm": 53.42490418097034, + "language_loss": 0.70740712, + "learning_rate": 2.377639101920992e-06, + "loss": 0.72534376, + "num_input_tokens_seen": 162839695, + "router_z_loss_clip": 2.5078125, + "router_z_loss_mlp": 0.33422852, + "step": 7590, + "time_per_iteration": 2.7345635890960693 + }, + { + "auxiliary_loss_clip": 0.01420415, + "auxiliary_loss_mlp": 0.00402423, + "balance_loss_clip": 1.16492796, + "balance_loss_mlp": 0.369748, + "epoch": 0.4563956110025552, + "flos": 22233409528320.0, + "grad_norm": 10.686067809921528, + "language_loss": 0.78167713, + "learning_rate": 2.377256638796135e-06, + "loss": 0.79990554, + "num_input_tokens_seen": 162856095, + "router_z_loss_clip": 2.55664062, + "router_z_loss_mlp": 0.32678223, + "step": 7591, + "time_per_iteration": 2.654724359512329 + }, + { + "auxiliary_loss_clip": 0.01421847, + "auxiliary_loss_mlp": 0.00435046, + "balance_loss_clip": 1.16855955, + "balance_loss_mlp": 0.39518288, + "epoch": 0.45645573425522323, + "flos": 17092043389440.0, + "grad_norm": 42.46724754714922, + "language_loss": 0.84599721, + "learning_rate": 2.3768741613656695e-06, + "loss": 0.86456609, + "num_input_tokens_seen": 162874070, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.39892578, + "step": 7592, + "time_per_iteration": 2.6879401206970215 + }, + { + "auxiliary_loss_clip": 0.01417616, + "auxiliary_loss_mlp": 0.00388466, + "balance_loss_clip": 1.1633141, + "balance_loss_mlp": 0.35248888, + "epoch": 0.4565158575078912, + "flos": 20332334309760.0, + "grad_norm": 45.9361923495634, + "language_loss": 0.75669312, + "learning_rate": 2.376491669644098e-06, + "loss": 0.77475399, + "num_input_tokens_seen": 162891000, + "router_z_loss_clip": 2.54492188, + "router_z_loss_mlp": 0.35986328, + "step": 7593, + "time_per_iteration": 2.7698769569396973 + }, + { + "auxiliary_loss_clip": 0.01409487, + "auxiliary_loss_mlp": 0.00416676, + "balance_loss_clip": 1.15869141, + "balance_loss_mlp": 0.38098449, + "epoch": 0.45657598076055916, + "flos": 23983013093760.0, + "grad_norm": 2.487200039177754, + "language_loss": 0.89371818, + "learning_rate": 2.3761091636459248e-06, + "loss": 0.91197979, + "num_input_tokens_seen": 162910120, + "router_z_loss_clip": 2.50976562, + "router_z_loss_mlp": 0.35717773, + "step": 7594, + "time_per_iteration": 2.754897356033325 + }, + { + "auxiliary_loss_clip": 0.0130142, + "auxiliary_loss_mlp": 0.00094047, + "balance_loss_clip": 1.1364665, + "balance_loss_mlp": 0.08555911, + "epoch": 0.45663610401322713, + "flos": 69364297526400.0, + "grad_norm": 0.8048262517647888, + "language_loss": 0.52502918, + "learning_rate": 2.375726643385654e-06, + "loss": 0.53898382, + "num_input_tokens_seen": 162963720, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.08496094, + "step": 7595, + "time_per_iteration": 3.160919666290283 + }, + { + "auxiliary_loss_clip": 0.01420011, + "auxiliary_loss_mlp": 0.00425359, + "balance_loss_clip": 1.16171622, + "balance_loss_mlp": 0.38771248, + "epoch": 0.4566962272658951, + "flos": 15149095891200.0, + "grad_norm": 6.062413196030684, + "language_loss": 0.93539107, + "learning_rate": 2.3753441088777915e-06, + "loss": 0.95384479, + "num_input_tokens_seen": 162975760, + "router_z_loss_clip": 2.58398438, + "router_z_loss_mlp": 0.37670898, + "step": 7596, + "time_per_iteration": 2.647336721420288 + }, + { + "auxiliary_loss_clip": 0.01416795, + "auxiliary_loss_mlp": 0.00408983, + "balance_loss_clip": 1.16559267, + "balance_loss_mlp": 0.3734107, + "epoch": 0.45675635051856306, + "flos": 18697465762560.0, + "grad_norm": 2.2004467664045926, + "language_loss": 0.83527803, + "learning_rate": 2.374961560136843e-06, + "loss": 0.85353577, + "num_input_tokens_seen": 162994865, + "router_z_loss_clip": 2.51171875, + "router_z_loss_mlp": 0.35546875, + "step": 7597, + "time_per_iteration": 2.6661739349365234 + }, + { + "auxiliary_loss_clip": 0.01423718, + "auxiliary_loss_mlp": 0.00412637, + "balance_loss_clip": 1.16514909, + "balance_loss_mlp": 0.37527674, + "epoch": 0.456816473771231, + "flos": 19098300608640.0, + "grad_norm": 52.87763053245632, + "language_loss": 0.84305143, + "learning_rate": 2.374578997177314e-06, + "loss": 0.86141497, + "num_input_tokens_seen": 163014730, + "router_z_loss_clip": 2.58984375, + "router_z_loss_mlp": 0.37353516, + "step": 7598, + "time_per_iteration": 2.6853725910186768 + }, + { + "auxiliary_loss_clip": 0.01406618, + "auxiliary_loss_mlp": 0.00394877, + "balance_loss_clip": 1.15545285, + "balance_loss_mlp": 0.36002019, + "epoch": 0.456876597023899, + "flos": 28950069507840.0, + "grad_norm": 6.110755640848057, + "language_loss": 0.76814032, + "learning_rate": 2.374196420013712e-06, + "loss": 0.78615534, + "num_input_tokens_seen": 163033405, + "router_z_loss_clip": 2.51367188, + "router_z_loss_mlp": 0.34863281, + "step": 7599, + "time_per_iteration": 2.7635538578033447 + }, + { + "auxiliary_loss_clip": 0.01400631, + "auxiliary_loss_mlp": 0.00406503, + "balance_loss_clip": 1.15278018, + "balance_loss_mlp": 0.36947656, + "epoch": 0.45693672027656695, + "flos": 23289470317440.0, + "grad_norm": 9.423401603051845, + "language_loss": 0.75909317, + "learning_rate": 2.373813828660544e-06, + "loss": 0.77716452, + "num_input_tokens_seen": 163051400, + "router_z_loss_clip": 2.4765625, + "router_z_loss_mlp": 0.37036133, + "step": 7600, + "time_per_iteration": 2.690176010131836 + }, + { + "auxiliary_loss_clip": 0.01425173, + "auxiliary_loss_mlp": 0.00428418, + "balance_loss_clip": 1.1676867, + "balance_loss_mlp": 0.39146292, + "epoch": 0.4569968435292349, + "flos": 20558212986240.0, + "grad_norm": 87.01000443008517, + "language_loss": 0.84548783, + "learning_rate": 2.373431223132319e-06, + "loss": 0.86402375, + "num_input_tokens_seen": 163069250, + "router_z_loss_clip": 2.57421875, + "router_z_loss_mlp": 0.36962891, + "step": 7601, + "time_per_iteration": 2.665968656539917 + }, + { + "auxiliary_loss_clip": 0.01416148, + "auxiliary_loss_mlp": 0.00382778, + "balance_loss_clip": 1.16175556, + "balance_loss_mlp": 0.35021019, + "epoch": 0.4570569667819029, + "flos": 41282619223680.0, + "grad_norm": 15.46872564753309, + "language_loss": 0.78987145, + "learning_rate": 2.3730486034435448e-06, + "loss": 0.80786073, + "num_input_tokens_seen": 163091755, + "router_z_loss_clip": 2.546875, + "router_z_loss_mlp": 0.32568359, + "step": 7602, + "time_per_iteration": 2.847581624984741 + }, + { + "auxiliary_loss_clip": 0.01424181, + "auxiliary_loss_mlp": 0.00411097, + "balance_loss_clip": 1.16571736, + "balance_loss_mlp": 0.37304568, + "epoch": 0.45711709003457085, + "flos": 26031573555840.0, + "grad_norm": 2.389255051961173, + "language_loss": 0.81465137, + "learning_rate": 2.372665969608729e-06, + "loss": 0.83300424, + "num_input_tokens_seen": 163111600, + "router_z_loss_clip": 2.58789062, + "router_z_loss_mlp": 0.38012695, + "step": 7603, + "time_per_iteration": 2.8188705444335938 + }, + { + "auxiliary_loss_clip": 0.01395767, + "auxiliary_loss_mlp": 0.00391232, + "balance_loss_clip": 1.14890003, + "balance_loss_mlp": 0.35716161, + "epoch": 0.4571772132872388, + "flos": 22158068751360.0, + "grad_norm": 5.924937198503358, + "language_loss": 0.86805904, + "learning_rate": 2.372283321642383e-06, + "loss": 0.88592899, + "num_input_tokens_seen": 163127350, + "router_z_loss_clip": 2.47070312, + "router_z_loss_mlp": 0.34082031, + "step": 7604, + "time_per_iteration": 2.7042856216430664 + }, + { + "auxiliary_loss_clip": 0.01414793, + "auxiliary_loss_mlp": 0.00390175, + "balance_loss_clip": 1.15830827, + "balance_loss_mlp": 0.35527027, + "epoch": 0.45723733653990684, + "flos": 23878872587520.0, + "grad_norm": 18.50935827562512, + "language_loss": 0.94087529, + "learning_rate": 2.371900659559016e-06, + "loss": 0.95892489, + "num_input_tokens_seen": 163145855, + "router_z_loss_clip": 2.56835938, + "router_z_loss_mlp": 0.34887695, + "step": 7605, + "time_per_iteration": 2.7235891819000244 + }, + { + "auxiliary_loss_clip": 0.01404949, + "auxiliary_loss_mlp": 0.00419005, + "balance_loss_clip": 1.15479064, + "balance_loss_mlp": 0.38290796, + "epoch": 0.4572974597925748, + "flos": 16871803148160.0, + "grad_norm": 564.1555297609431, + "language_loss": 0.79339451, + "learning_rate": 2.371517983373138e-06, + "loss": 0.81163406, + "num_input_tokens_seen": 163163830, + "router_z_loss_clip": 2.50195312, + "router_z_loss_mlp": 0.36083984, + "step": 7606, + "time_per_iteration": 2.6096878051757812 + }, + { + "auxiliary_loss_clip": 0.01407618, + "auxiliary_loss_mlp": 0.00417513, + "balance_loss_clip": 1.15461814, + "balance_loss_mlp": 0.38077304, + "epoch": 0.45735758304524277, + "flos": 13771491528960.0, + "grad_norm": 80.8582275914916, + "language_loss": 0.86439431, + "learning_rate": 2.371135293099262e-06, + "loss": 0.88264561, + "num_input_tokens_seen": 163180700, + "router_z_loss_clip": 2.53515625, + "router_z_loss_mlp": 0.36743164, + "step": 7607, + "time_per_iteration": 2.6782703399658203 + }, + { + "auxiliary_loss_clip": 0.0140279, + "auxiliary_loss_mlp": 0.00374666, + "balance_loss_clip": 1.15300965, + "balance_loss_mlp": 0.34123927, + "epoch": 0.45741770629791073, + "flos": 21100750986240.0, + "grad_norm": 3.093280331836067, + "language_loss": 0.86123782, + "learning_rate": 2.3707525887518982e-06, + "loss": 0.87901235, + "num_input_tokens_seen": 163199450, + "router_z_loss_clip": 2.49414062, + "router_z_loss_mlp": 0.33398438, + "step": 7608, + "time_per_iteration": 2.68945574760437 + }, + { + "auxiliary_loss_clip": 0.01404451, + "auxiliary_loss_mlp": 0.00376128, + "balance_loss_clip": 1.15754879, + "balance_loss_mlp": 0.34147358, + "epoch": 0.4574778295505787, + "flos": 23112898035840.0, + "grad_norm": 17.531496126162345, + "language_loss": 0.75335377, + "learning_rate": 2.370369870345559e-06, + "loss": 0.77115953, + "num_input_tokens_seen": 163217875, + "router_z_loss_clip": 2.46484375, + "router_z_loss_mlp": 0.34655762, + "step": 7609, + "time_per_iteration": 2.6796412467956543 + }, + { + "auxiliary_loss_clip": 0.01404628, + "auxiliary_loss_mlp": 0.00397746, + "balance_loss_clip": 1.15371585, + "balance_loss_mlp": 0.36405766, + "epoch": 0.45753795280324666, + "flos": 24352929308160.0, + "grad_norm": 2.888809037105175, + "language_loss": 0.86616755, + "learning_rate": 2.369987137894757e-06, + "loss": 0.88419127, + "num_input_tokens_seen": 163237430, + "router_z_loss_clip": 2.5078125, + "router_z_loss_mlp": 0.33666992, + "step": 7610, + "time_per_iteration": 2.7877793312072754 + }, + { + "auxiliary_loss_clip": 0.01402017, + "auxiliary_loss_mlp": 0.00399276, + "balance_loss_clip": 1.15378487, + "balance_loss_mlp": 0.36358476, + "epoch": 0.4575980760559146, + "flos": 16653789550080.0, + "grad_norm": 38.262123412470764, + "language_loss": 0.89153421, + "learning_rate": 2.3696043914140057e-06, + "loss": 0.90954721, + "num_input_tokens_seen": 163253905, + "router_z_loss_clip": 2.48632812, + "router_z_loss_mlp": 0.35693359, + "step": 7611, + "time_per_iteration": 2.7945940494537354 + }, + { + "auxiliary_loss_clip": 0.01413853, + "auxiliary_loss_mlp": 0.00390762, + "balance_loss_clip": 1.16269588, + "balance_loss_mlp": 0.35809833, + "epoch": 0.4576581993085826, + "flos": 35911423912320.0, + "grad_norm": 424.54078627136573, + "language_loss": 0.80471337, + "learning_rate": 2.369221630917819e-06, + "loss": 0.82275951, + "num_input_tokens_seen": 163274285, + "router_z_loss_clip": 2.51171875, + "router_z_loss_mlp": 0.3269043, + "step": 7612, + "time_per_iteration": 2.772831916809082 + }, + { + "auxiliary_loss_clip": 0.01379826, + "auxiliary_loss_mlp": 0.00391869, + "balance_loss_clip": 1.13786685, + "balance_loss_mlp": 0.35667843, + "epoch": 0.45771832256125056, + "flos": 20080421251200.0, + "grad_norm": 8.354411700478733, + "language_loss": 0.90118754, + "learning_rate": 2.368838856420711e-06, + "loss": 0.91890448, + "num_input_tokens_seen": 163293150, + "router_z_loss_clip": 2.42382812, + "router_z_loss_mlp": 0.35180664, + "step": 7613, + "time_per_iteration": 2.6236801147460938 + }, + { + "auxiliary_loss_clip": 0.0139502, + "auxiliary_loss_mlp": 0.00401874, + "balance_loss_clip": 1.15037251, + "balance_loss_mlp": 0.36496711, + "epoch": 0.4577784458139185, + "flos": 10744329957120.0, + "grad_norm": 117.69026218656552, + "language_loss": 0.83040088, + "learning_rate": 2.3684560679371965e-06, + "loss": 0.84836984, + "num_input_tokens_seen": 163310065, + "router_z_loss_clip": 2.44921875, + "router_z_loss_mlp": 0.36938477, + "step": 7614, + "time_per_iteration": 2.6219818592071533 + }, + { + "auxiliary_loss_clip": 0.01386079, + "auxiliary_loss_mlp": 0.0038923, + "balance_loss_clip": 1.14610839, + "balance_loss_mlp": 0.35427749, + "epoch": 0.4578385690665865, + "flos": 21907269014400.0, + "grad_norm": 3.8957738357923772, + "language_loss": 0.78743303, + "learning_rate": 2.368073265481791e-06, + "loss": 0.80518609, + "num_input_tokens_seen": 163329415, + "router_z_loss_clip": 2.40039062, + "router_z_loss_mlp": 0.34960938, + "step": 7615, + "time_per_iteration": 4.1506030559539795 + }, + { + "auxiliary_loss_clip": 0.01298201, + "auxiliary_loss_mlp": 0.00112547, + "balance_loss_clip": 1.13094378, + "balance_loss_mlp": 0.10463192, + "epoch": 0.45789869231925445, + "flos": 64758286667520.0, + "grad_norm": 0.7508989200259062, + "language_loss": 0.57603586, + "learning_rate": 2.3676904490690105e-06, + "loss": 0.59014332, + "num_input_tokens_seen": 163385875, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.07910156, + "step": 7616, + "time_per_iteration": 3.069803237915039 + }, + { + "auxiliary_loss_clip": 0.01398456, + "auxiliary_loss_mlp": 0.00425503, + "balance_loss_clip": 1.15115666, + "balance_loss_mlp": 0.38780886, + "epoch": 0.4579588155719224, + "flos": 16144001775360.0, + "grad_norm": 5.597852264832207, + "language_loss": 0.78961629, + "learning_rate": 2.3673076187133704e-06, + "loss": 0.80785584, + "num_input_tokens_seen": 163405170, + "router_z_loss_clip": 2.47265625, + "router_z_loss_mlp": 0.37695312, + "step": 7617, + "time_per_iteration": 2.6174890995025635 + }, + { + "auxiliary_loss_clip": 0.01406461, + "auxiliary_loss_mlp": 0.00388861, + "balance_loss_clip": 1.15941072, + "balance_loss_mlp": 0.35305023, + "epoch": 0.45801893882459044, + "flos": 21395541905280.0, + "grad_norm": 5.657423201595948, + "language_loss": 0.84205055, + "learning_rate": 2.36692477442939e-06, + "loss": 0.86000371, + "num_input_tokens_seen": 163423155, + "router_z_loss_clip": 2.47070312, + "router_z_loss_mlp": 0.35839844, + "step": 7618, + "time_per_iteration": 2.636505603790283 + }, + { + "auxiliary_loss_clip": 0.01400332, + "auxiliary_loss_mlp": 0.00359676, + "balance_loss_clip": 1.15319943, + "balance_loss_mlp": 0.3277514, + "epoch": 0.4580790620772584, + "flos": 19536554448000.0, + "grad_norm": 45.57102968009295, + "language_loss": 0.84998304, + "learning_rate": 2.366541916231585e-06, + "loss": 0.86758316, + "num_input_tokens_seen": 163442450, + "router_z_loss_clip": 2.47460938, + "router_z_loss_mlp": 0.3190918, + "step": 7619, + "time_per_iteration": 4.100680828094482 + }, + { + "auxiliary_loss_clip": 0.01399194, + "auxiliary_loss_mlp": 0.00359632, + "balance_loss_clip": 1.15649581, + "balance_loss_mlp": 0.32796967, + "epoch": 0.45813918532992637, + "flos": 16581070465920.0, + "grad_norm": 42.31836650330557, + "language_loss": 0.7755028, + "learning_rate": 2.366159044134473e-06, + "loss": 0.793091, + "num_input_tokens_seen": 163459810, + "router_z_loss_clip": 2.42578125, + "router_z_loss_mlp": 0.31640625, + "step": 7620, + "time_per_iteration": 4.15244722366333 + }, + { + "auxiliary_loss_clip": 0.01384733, + "auxiliary_loss_mlp": 0.00364139, + "balance_loss_clip": 1.14474607, + "balance_loss_mlp": 0.3330735, + "epoch": 0.45819930858259433, + "flos": 42230301701760.0, + "grad_norm": 2.6368067834573488, + "language_loss": 0.81949276, + "learning_rate": 2.3657761581525748e-06, + "loss": 0.83698148, + "num_input_tokens_seen": 163482970, + "router_z_loss_clip": 2.40039062, + "router_z_loss_mlp": 0.31054688, + "step": 7621, + "time_per_iteration": 2.8252310752868652 + }, + { + "auxiliary_loss_clip": 0.01304226, + "auxiliary_loss_mlp": 0.00137324, + "balance_loss_clip": 1.13917351, + "balance_loss_mlp": 0.126261, + "epoch": 0.4582594318352623, + "flos": 63714795638400.0, + "grad_norm": 0.7581395888896513, + "language_loss": 0.64390123, + "learning_rate": 2.3653932583004063e-06, + "loss": 0.65831673, + "num_input_tokens_seen": 163545330, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.11083984, + "step": 7622, + "time_per_iteration": 3.174578905105591 + }, + { + "auxiliary_loss_clip": 0.01392196, + "auxiliary_loss_mlp": 0.00382, + "balance_loss_clip": 1.1489861, + "balance_loss_mlp": 0.34745342, + "epoch": 0.45831955508793026, + "flos": 26869979882880.0, + "grad_norm": 8.797827543306832, + "language_loss": 0.8566317, + "learning_rate": 2.3650103445924903e-06, + "loss": 0.87437367, + "num_input_tokens_seen": 163564620, + "router_z_loss_clip": 2.43359375, + "router_z_loss_mlp": 0.3449707, + "step": 7623, + "time_per_iteration": 2.7331736087799072 + }, + { + "auxiliary_loss_clip": 0.01398788, + "auxiliary_loss_mlp": 0.00422184, + "balance_loss_clip": 1.15296674, + "balance_loss_mlp": 0.38398927, + "epoch": 0.45837967834059823, + "flos": 18733951002240.0, + "grad_norm": 60.033737322821736, + "language_loss": 0.77292979, + "learning_rate": 2.3646274170433452e-06, + "loss": 0.79113948, + "num_input_tokens_seen": 163581010, + "router_z_loss_clip": 2.45703125, + "router_z_loss_mlp": 0.38183594, + "step": 7624, + "time_per_iteration": 2.626006603240967 + }, + { + "auxiliary_loss_clip": 0.01392563, + "auxiliary_loss_mlp": 0.0038674, + "balance_loss_clip": 1.14533865, + "balance_loss_mlp": 0.34933221, + "epoch": 0.4584398015932662, + "flos": 21178102924800.0, + "grad_norm": 64.40303583925996, + "language_loss": 0.80717987, + "learning_rate": 2.364244475667491e-06, + "loss": 0.82497287, + "num_input_tokens_seen": 163599955, + "router_z_loss_clip": 2.47460938, + "router_z_loss_mlp": 0.37402344, + "step": 7625, + "time_per_iteration": 2.8010692596435547 + }, + { + "auxiliary_loss_clip": 0.01412987, + "auxiliary_loss_mlp": 0.00390852, + "balance_loss_clip": 1.16021883, + "balance_loss_mlp": 0.35661548, + "epoch": 0.45849992484593416, + "flos": 19790047704960.0, + "grad_norm": 65.43783362323063, + "language_loss": 0.84088516, + "learning_rate": 2.363861520479451e-06, + "loss": 0.85892355, + "num_input_tokens_seen": 163618545, + "router_z_loss_clip": 2.52734375, + "router_z_loss_mlp": 0.3425293, + "step": 7626, + "time_per_iteration": 4.094861745834351 + }, + { + "auxiliary_loss_clip": 0.01403375, + "auxiliary_loss_mlp": 0.00379905, + "balance_loss_clip": 1.1563406, + "balance_loss_mlp": 0.34652635, + "epoch": 0.4585600480986021, + "flos": 18223265387520.0, + "grad_norm": 64.42590879749145, + "language_loss": 0.9018538, + "learning_rate": 2.3634785514937445e-06, + "loss": 0.91968662, + "num_input_tokens_seen": 163636055, + "router_z_loss_clip": 2.47265625, + "router_z_loss_mlp": 0.33361816, + "step": 7627, + "time_per_iteration": 2.605846881866455 + }, + { + "auxiliary_loss_clip": 0.01413129, + "auxiliary_loss_mlp": 0.00412138, + "balance_loss_clip": 1.15885901, + "balance_loss_mlp": 0.37589836, + "epoch": 0.4586201713512701, + "flos": 29022213974400.0, + "grad_norm": 15.435810044534561, + "language_loss": 0.75807273, + "learning_rate": 2.3630955687248953e-06, + "loss": 0.77632535, + "num_input_tokens_seen": 163657485, + "router_z_loss_clip": 2.54101562, + "router_z_loss_mlp": 0.36230469, + "step": 7628, + "time_per_iteration": 2.7360129356384277 + }, + { + "auxiliary_loss_clip": 0.0139258, + "auxiliary_loss_mlp": 0.00401369, + "balance_loss_clip": 1.14685345, + "balance_loss_mlp": 0.36431855, + "epoch": 0.45868029460393805, + "flos": 23404600385280.0, + "grad_norm": 267.8529474441672, + "language_loss": 0.83394051, + "learning_rate": 2.3627125721874265e-06, + "loss": 0.85187995, + "num_input_tokens_seen": 163676030, + "router_z_loss_clip": 2.45898438, + "router_z_loss_mlp": 0.37036133, + "step": 7629, + "time_per_iteration": 2.661344528198242 + }, + { + "auxiliary_loss_clip": 0.01412301, + "auxiliary_loss_mlp": 0.00407876, + "balance_loss_clip": 1.15810561, + "balance_loss_mlp": 0.3715167, + "epoch": 0.458740417856606, + "flos": 18221972497920.0, + "grad_norm": 6.649513197919966, + "language_loss": 0.87807107, + "learning_rate": 2.3623295618958595e-06, + "loss": 0.89627278, + "num_input_tokens_seen": 163694490, + "router_z_loss_clip": 2.54492188, + "router_z_loss_mlp": 0.36328125, + "step": 7630, + "time_per_iteration": 2.6195521354675293 + }, + { + "auxiliary_loss_clip": 0.01398115, + "auxiliary_loss_mlp": 0.00400203, + "balance_loss_clip": 1.1494664, + "balance_loss_mlp": 0.36308098, + "epoch": 0.458800541109274, + "flos": 34568760504960.0, + "grad_norm": 119.78780803390171, + "language_loss": 0.78440535, + "learning_rate": 2.3619465378647198e-06, + "loss": 0.80238855, + "num_input_tokens_seen": 163717035, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.37133789, + "step": 7631, + "time_per_iteration": 2.7584073543548584 + }, + { + "auxiliary_loss_clip": 0.01410489, + "auxiliary_loss_mlp": 0.00392315, + "balance_loss_clip": 1.15532565, + "balance_loss_mlp": 0.35602784, + "epoch": 0.458860664361942, + "flos": 17712112896000.0, + "grad_norm": 16.083993480445585, + "language_loss": 0.7876749, + "learning_rate": 2.361563500108531e-06, + "loss": 0.80570292, + "num_input_tokens_seen": 163734525, + "router_z_loss_clip": 2.5546875, + "router_z_loss_mlp": 0.36303711, + "step": 7632, + "time_per_iteration": 2.662785530090332 + }, + { + "auxiliary_loss_clip": 0.01409401, + "auxiliary_loss_mlp": 0.00378864, + "balance_loss_clip": 1.15235853, + "balance_loss_mlp": 0.34360141, + "epoch": 0.45892078761460997, + "flos": 18441889516800.0, + "grad_norm": 61.38493079348406, + "language_loss": 0.7987175, + "learning_rate": 2.3611804486418178e-06, + "loss": 0.8166002, + "num_input_tokens_seen": 163752860, + "router_z_loss_clip": 2.57226562, + "router_z_loss_mlp": 0.3527832, + "step": 7633, + "time_per_iteration": 2.619844436645508 + }, + { + "auxiliary_loss_clip": 0.01415245, + "auxiliary_loss_mlp": 0.00394482, + "balance_loss_clip": 1.15716362, + "balance_loss_mlp": 0.35736066, + "epoch": 0.45898091086727794, + "flos": 22672956257280.0, + "grad_norm": 22.867255500106467, + "language_loss": 0.86587274, + "learning_rate": 2.3607973834791062e-06, + "loss": 0.88397002, + "num_input_tokens_seen": 163772495, + "router_z_loss_clip": 2.58203125, + "router_z_loss_mlp": 0.37109375, + "step": 7634, + "time_per_iteration": 2.7244346141815186 + }, + { + "auxiliary_loss_clip": 0.01424334, + "auxiliary_loss_mlp": 0.00394173, + "balance_loss_clip": 1.16675675, + "balance_loss_mlp": 0.35864913, + "epoch": 0.4590410341199459, + "flos": 21652949744640.0, + "grad_norm": 55.873485187732385, + "language_loss": 0.87893659, + "learning_rate": 2.3604143046349216e-06, + "loss": 0.89712167, + "num_input_tokens_seen": 163791475, + "router_z_loss_clip": 2.57226562, + "router_z_loss_mlp": 0.35546875, + "step": 7635, + "time_per_iteration": 2.7165777683258057 + }, + { + "auxiliary_loss_clip": 0.01405686, + "auxiliary_loss_mlp": 0.00394315, + "balance_loss_clip": 1.15523458, + "balance_loss_mlp": 0.36053133, + "epoch": 0.45910115737261387, + "flos": 36535372087680.0, + "grad_norm": 31.07156326904667, + "language_loss": 0.69742262, + "learning_rate": 2.3600312121237905e-06, + "loss": 0.71542263, + "num_input_tokens_seen": 163812995, + "router_z_loss_clip": 2.50390625, + "router_z_loss_mlp": 0.33813477, + "step": 7636, + "time_per_iteration": 2.90412974357605 + }, + { + "auxiliary_loss_clip": 0.01406306, + "auxiliary_loss_mlp": 0.00351961, + "balance_loss_clip": 1.15810466, + "balance_loss_mlp": 0.31967962, + "epoch": 0.45916128062528183, + "flos": 24419866302720.0, + "grad_norm": 4.7072646142261565, + "language_loss": 0.85498923, + "learning_rate": 2.3596481059602395e-06, + "loss": 0.87257189, + "num_input_tokens_seen": 163833945, + "router_z_loss_clip": 2.48046875, + "router_z_loss_mlp": 0.32275391, + "step": 7637, + "time_per_iteration": 2.7241384983062744 + }, + { + "auxiliary_loss_clip": 0.01412305, + "auxiliary_loss_mlp": 0.00420664, + "balance_loss_clip": 1.15700269, + "balance_loss_mlp": 0.38225436, + "epoch": 0.4592214038779498, + "flos": 23221958705280.0, + "grad_norm": 35.14661808670471, + "language_loss": 0.80389798, + "learning_rate": 2.3592649861587965e-06, + "loss": 0.8222276, + "num_input_tokens_seen": 163853885, + "router_z_loss_clip": 2.55078125, + "router_z_loss_mlp": 0.3840332, + "step": 7638, + "time_per_iteration": 2.7056353092193604 + }, + { + "auxiliary_loss_clip": 0.01408054, + "auxiliary_loss_mlp": 0.00369637, + "balance_loss_clip": 1.15575051, + "balance_loss_mlp": 0.33506668, + "epoch": 0.45928152713061776, + "flos": 19172133014400.0, + "grad_norm": 42.215785917712054, + "language_loss": 0.80214727, + "learning_rate": 2.358881852733989e-06, + "loss": 0.81992418, + "num_input_tokens_seen": 163871855, + "router_z_loss_clip": 2.52539062, + "router_z_loss_mlp": 0.34594727, + "step": 7639, + "time_per_iteration": 2.658640146255493 + }, + { + "auxiliary_loss_clip": 0.01416394, + "auxiliary_loss_mlp": 0.00391644, + "balance_loss_clip": 1.16204286, + "balance_loss_mlp": 0.35607231, + "epoch": 0.4593416503832857, + "flos": 22414686491520.0, + "grad_norm": 5.425272501572054, + "language_loss": 0.74108684, + "learning_rate": 2.358498705700346e-06, + "loss": 0.75916731, + "num_input_tokens_seen": 163891450, + "router_z_loss_clip": 2.54296875, + "router_z_loss_mlp": 0.35571289, + "step": 7640, + "time_per_iteration": 2.712205410003662 + }, + { + "auxiliary_loss_clip": 0.0141406, + "auxiliary_loss_mlp": 0.0038902, + "balance_loss_clip": 1.15803695, + "balance_loss_mlp": 0.35399598, + "epoch": 0.4594017736359537, + "flos": 18880215183360.0, + "grad_norm": 3.112982251641233, + "language_loss": 0.80893743, + "learning_rate": 2.3581155450723958e-06, + "loss": 0.82696819, + "num_input_tokens_seen": 163909345, + "router_z_loss_clip": 2.5625, + "router_z_loss_mlp": 0.3503418, + "step": 7641, + "time_per_iteration": 2.6795477867126465 + }, + { + "auxiliary_loss_clip": 0.01417265, + "auxiliary_loss_mlp": 0.00374303, + "balance_loss_clip": 1.16257465, + "balance_loss_mlp": 0.33947054, + "epoch": 0.45946189688862166, + "flos": 20518567349760.0, + "grad_norm": 6.518634142432164, + "language_loss": 0.80651093, + "learning_rate": 2.357732370864668e-06, + "loss": 0.82442665, + "num_input_tokens_seen": 163926940, + "router_z_loss_clip": 2.54882812, + "router_z_loss_mlp": 0.34814453, + "step": 7642, + "time_per_iteration": 2.7242443561553955 + }, + { + "auxiliary_loss_clip": 0.01284609, + "auxiliary_loss_mlp": 0.00109907, + "balance_loss_clip": 1.11587667, + "balance_loss_mlp": 0.09960775, + "epoch": 0.4595220201412896, + "flos": 61405990162560.0, + "grad_norm": 0.8219408616513796, + "language_loss": 0.58071291, + "learning_rate": 2.357349183091694e-06, + "loss": 0.59465814, + "num_input_tokens_seen": 163977785, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.10302734, + "step": 7643, + "time_per_iteration": 2.9433772563934326 + }, + { + "auxiliary_loss_clip": 0.01418196, + "auxiliary_loss_mlp": 0.00437828, + "balance_loss_clip": 1.1587882, + "balance_loss_mlp": 0.39894193, + "epoch": 0.4595821433939576, + "flos": 23330947547520.0, + "grad_norm": 29.061702155960898, + "language_loss": 0.97294396, + "learning_rate": 2.3569659817680016e-06, + "loss": 0.99150419, + "num_input_tokens_seen": 163996630, + "router_z_loss_clip": 2.58984375, + "router_z_loss_mlp": 0.38891602, + "step": 7644, + "time_per_iteration": 2.7192814350128174 + }, + { + "auxiliary_loss_clip": 0.01420085, + "auxiliary_loss_mlp": 0.00420515, + "balance_loss_clip": 1.15949941, + "balance_loss_mlp": 0.38270217, + "epoch": 0.4596422666466256, + "flos": 14282356711680.0, + "grad_norm": 90.52420204394812, + "language_loss": 0.90003538, + "learning_rate": 2.3565827669081243e-06, + "loss": 0.91844136, + "num_input_tokens_seen": 164013190, + "router_z_loss_clip": 2.60742188, + "router_z_loss_mlp": 0.37817383, + "step": 7645, + "time_per_iteration": 2.679596424102783 + }, + { + "auxiliary_loss_clip": 0.01295455, + "auxiliary_loss_mlp": 0.00102974, + "balance_loss_clip": 1.12287021, + "balance_loss_mlp": 0.09338997, + "epoch": 0.4597023898992936, + "flos": 65727337737600.0, + "grad_norm": 0.756015204819982, + "language_loss": 0.59569842, + "learning_rate": 2.356199538526593e-06, + "loss": 0.60968274, + "num_input_tokens_seen": 164074030, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.09570312, + "step": 7646, + "time_per_iteration": 3.0933046340942383 + }, + { + "auxiliary_loss_clip": 0.01411092, + "auxiliary_loss_mlp": 0.00436004, + "balance_loss_clip": 1.15578568, + "balance_loss_mlp": 0.39909667, + "epoch": 0.45976251315196154, + "flos": 26907075653760.0, + "grad_norm": 8.797714011015263, + "language_loss": 0.78574979, + "learning_rate": 2.355816296637939e-06, + "loss": 0.80422074, + "num_input_tokens_seen": 164095515, + "router_z_loss_clip": 2.55078125, + "router_z_loss_mlp": 0.36914062, + "step": 7647, + "time_per_iteration": 2.736733913421631 + }, + { + "auxiliary_loss_clip": 0.01426281, + "auxiliary_loss_mlp": 0.00437826, + "balance_loss_clip": 1.16865742, + "balance_loss_mlp": 0.39889181, + "epoch": 0.4598226364046295, + "flos": 26618066824320.0, + "grad_norm": 163.12288546755235, + "language_loss": 0.7147249, + "learning_rate": 2.3554330412566957e-06, + "loss": 0.73336595, + "num_input_tokens_seen": 164117270, + "router_z_loss_clip": 2.58203125, + "router_z_loss_mlp": 0.3894043, + "step": 7648, + "time_per_iteration": 2.7669765949249268 + }, + { + "auxiliary_loss_clip": 0.01417901, + "auxiliary_loss_mlp": 0.00400482, + "balance_loss_clip": 1.15684223, + "balance_loss_mlp": 0.36481482, + "epoch": 0.45988275965729747, + "flos": 24387762522240.0, + "grad_norm": 23.94743072795577, + "language_loss": 0.8291266, + "learning_rate": 2.3550497723973953e-06, + "loss": 0.84731042, + "num_input_tokens_seen": 164137850, + "router_z_loss_clip": 2.61132812, + "router_z_loss_mlp": 0.35668945, + "step": 7649, + "time_per_iteration": 2.792159080505371 + }, + { + "auxiliary_loss_clip": 0.01402675, + "auxiliary_loss_mlp": 0.00433253, + "balance_loss_clip": 1.15311289, + "balance_loss_mlp": 0.39505798, + "epoch": 0.45994288290996543, + "flos": 24535822383360.0, + "grad_norm": 179.88935262819012, + "language_loss": 0.75491273, + "learning_rate": 2.3546664900745726e-06, + "loss": 0.77327204, + "num_input_tokens_seen": 164157960, + "router_z_loss_clip": 2.49414062, + "router_z_loss_mlp": 0.38183594, + "step": 7650, + "time_per_iteration": 2.7706310749053955 + }, + { + "auxiliary_loss_clip": 0.01425332, + "auxiliary_loss_mlp": 0.00431637, + "balance_loss_clip": 1.16198564, + "balance_loss_mlp": 0.39272755, + "epoch": 0.4600030061626334, + "flos": 14830245838080.0, + "grad_norm": 33.906157063075156, + "language_loss": 0.91948307, + "learning_rate": 2.354283194302761e-06, + "loss": 0.93805277, + "num_input_tokens_seen": 164174590, + "router_z_loss_clip": 2.63476562, + "router_z_loss_mlp": 0.38916016, + "step": 7651, + "time_per_iteration": 2.646437883377075 + }, + { + "auxiliary_loss_clip": 0.01419995, + "auxiliary_loss_mlp": 0.00455002, + "balance_loss_clip": 1.16496539, + "balance_loss_mlp": 0.41773677, + "epoch": 0.46006312941530136, + "flos": 18113845582080.0, + "grad_norm": 416.76154526912745, + "language_loss": 0.81158423, + "learning_rate": 2.3538998850964948e-06, + "loss": 0.83033419, + "num_input_tokens_seen": 164192935, + "router_z_loss_clip": 2.54882812, + "router_z_loss_mlp": 0.37280273, + "step": 7652, + "time_per_iteration": 2.659301996231079 + }, + { + "auxiliary_loss_clip": 0.01411728, + "auxiliary_loss_mlp": 0.00462651, + "balance_loss_clip": 1.15732861, + "balance_loss_mlp": 0.4234314, + "epoch": 0.46012325266796933, + "flos": 21976468565760.0, + "grad_norm": 6.91571312408346, + "language_loss": 0.80849749, + "learning_rate": 2.3535165624703097e-06, + "loss": 0.8272413, + "num_input_tokens_seen": 164213160, + "router_z_loss_clip": 2.54492188, + "router_z_loss_mlp": 0.39257812, + "step": 7653, + "time_per_iteration": 2.7195560932159424 + }, + { + "auxiliary_loss_clip": 0.01457607, + "auxiliary_loss_mlp": 0.00451385, + "balance_loss_clip": 1.18125272, + "balance_loss_mlp": 0.41063946, + "epoch": 0.4601833759206373, + "flos": 15268068714240.0, + "grad_norm": 6.922089269269168, + "language_loss": 0.76031899, + "learning_rate": 2.353133226438741e-06, + "loss": 0.77940893, + "num_input_tokens_seen": 164229330, + "router_z_loss_clip": 2.76367188, + "router_z_loss_mlp": 0.40771484, + "step": 7654, + "time_per_iteration": 2.6520228385925293 + }, + { + "auxiliary_loss_clip": 0.01418536, + "auxiliary_loss_mlp": 0.00427229, + "balance_loss_clip": 1.16306973, + "balance_loss_mlp": 0.39091796, + "epoch": 0.46024349917330526, + "flos": 27088999061760.0, + "grad_norm": 184.31580219056374, + "language_loss": 0.85111797, + "learning_rate": 2.3527498770163248e-06, + "loss": 0.86957562, + "num_input_tokens_seen": 164248240, + "router_z_loss_clip": 2.5546875, + "router_z_loss_mlp": 0.36328125, + "step": 7655, + "time_per_iteration": 2.6922903060913086 + }, + { + "auxiliary_loss_clip": 0.01423646, + "auxiliary_loss_mlp": 0.00429717, + "balance_loss_clip": 1.16976261, + "balance_loss_mlp": 0.39545619, + "epoch": 0.4603036224259732, + "flos": 24462923731200.0, + "grad_norm": 39.01183208546183, + "language_loss": 0.74080592, + "learning_rate": 2.3523665142175985e-06, + "loss": 0.75933957, + "num_input_tokens_seen": 164268020, + "router_z_loss_clip": 2.53710938, + "router_z_loss_mlp": 0.34277344, + "step": 7656, + "time_per_iteration": 2.7271382808685303 + }, + { + "auxiliary_loss_clip": 0.01424101, + "auxiliary_loss_mlp": 0.00432882, + "balance_loss_clip": 1.16436577, + "balance_loss_mlp": 0.39535519, + "epoch": 0.4603637456786412, + "flos": 28109292883200.0, + "grad_norm": 47.836194236884126, + "language_loss": 0.84598786, + "learning_rate": 2.351983138057098e-06, + "loss": 0.86455774, + "num_input_tokens_seen": 164287305, + "router_z_loss_clip": 2.59960938, + "router_z_loss_mlp": 0.37548828, + "step": 7657, + "time_per_iteration": 4.133780002593994 + }, + { + "auxiliary_loss_clip": 0.01434766, + "auxiliary_loss_mlp": 0.00451718, + "balance_loss_clip": 1.17121363, + "balance_loss_mlp": 0.41094875, + "epoch": 0.4604238689313092, + "flos": 24348942898560.0, + "grad_norm": 22.640477771635723, + "language_loss": 0.75804561, + "learning_rate": 2.3515997485493623e-06, + "loss": 0.77691042, + "num_input_tokens_seen": 164306835, + "router_z_loss_clip": 2.6328125, + "router_z_loss_mlp": 0.40795898, + "step": 7658, + "time_per_iteration": 2.7236971855163574 + }, + { + "auxiliary_loss_clip": 0.01326072, + "auxiliary_loss_mlp": 0.0016458, + "balance_loss_clip": 1.15932584, + "balance_loss_mlp": 0.15447119, + "epoch": 0.4604839921839772, + "flos": 53606229431040.0, + "grad_norm": 0.9149183973938942, + "language_loss": 0.61645919, + "learning_rate": 2.351216345708928e-06, + "loss": 0.63136572, + "num_input_tokens_seen": 164367095, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.10107422, + "step": 7659, + "time_per_iteration": 3.252645254135132 + }, + { + "auxiliary_loss_clip": 0.01420867, + "auxiliary_loss_mlp": 0.00409523, + "balance_loss_clip": 1.16725111, + "balance_loss_mlp": 0.37495205, + "epoch": 0.46054411543664514, + "flos": 31248424126080.0, + "grad_norm": 761.9066906063895, + "language_loss": 0.73894989, + "learning_rate": 2.350832929550336e-06, + "loss": 0.75725383, + "num_input_tokens_seen": 164388895, + "router_z_loss_clip": 2.53710938, + "router_z_loss_mlp": 0.34570312, + "step": 7660, + "time_per_iteration": 2.9009156227111816 + }, + { + "auxiliary_loss_clip": 0.01442155, + "auxiliary_loss_mlp": 0.00431743, + "balance_loss_clip": 1.17654169, + "balance_loss_mlp": 0.39278561, + "epoch": 0.4606042386893131, + "flos": 24092863862400.0, + "grad_norm": 19.132640112170566, + "language_loss": 0.82577896, + "learning_rate": 2.3504495000881227e-06, + "loss": 0.84451795, + "num_input_tokens_seen": 164409080, + "router_z_loss_clip": 2.66015625, + "router_z_loss_mlp": 0.38989258, + "step": 7661, + "time_per_iteration": 4.181703805923462 + }, + { + "auxiliary_loss_clip": 0.01423002, + "auxiliary_loss_mlp": 0.0044356, + "balance_loss_clip": 1.17159665, + "balance_loss_mlp": 0.40758234, + "epoch": 0.46066436194198107, + "flos": 26578457101440.0, + "grad_norm": 50.83738502684465, + "language_loss": 0.81191289, + "learning_rate": 2.3500660573368305e-06, + "loss": 0.83057845, + "num_input_tokens_seen": 164427585, + "router_z_loss_clip": 2.515625, + "router_z_loss_mlp": 0.35961914, + "step": 7662, + "time_per_iteration": 4.255288124084473 + }, + { + "auxiliary_loss_clip": 0.0144692, + "auxiliary_loss_mlp": 0.00475552, + "balance_loss_clip": 1.17697227, + "balance_loss_mlp": 0.43406728, + "epoch": 0.46072448519464904, + "flos": 17775602184960.0, + "grad_norm": 56.336187658634515, + "language_loss": 0.91839188, + "learning_rate": 2.349682601310998e-06, + "loss": 0.93761659, + "num_input_tokens_seen": 164438455, + "router_z_loss_clip": 2.6953125, + "router_z_loss_mlp": 0.41503906, + "step": 7663, + "time_per_iteration": 2.67997407913208 + }, + { + "auxiliary_loss_clip": 0.01418007, + "auxiliary_loss_mlp": 0.00387754, + "balance_loss_clip": 1.16582024, + "balance_loss_mlp": 0.35423261, + "epoch": 0.460784608447317, + "flos": 15086109392640.0, + "grad_norm": 308.46594112592453, + "language_loss": 0.81574726, + "learning_rate": 2.3492991320251653e-06, + "loss": 0.83380485, + "num_input_tokens_seen": 164456830, + "router_z_loss_clip": 2.52539062, + "router_z_loss_mlp": 0.33520508, + "step": 7664, + "time_per_iteration": 2.6950490474700928 + }, + { + "auxiliary_loss_clip": 0.01427978, + "auxiliary_loss_mlp": 0.00430453, + "balance_loss_clip": 1.16886473, + "balance_loss_mlp": 0.39256808, + "epoch": 0.46084473169998497, + "flos": 18588261438720.0, + "grad_norm": 437.11682837334945, + "language_loss": 0.78801358, + "learning_rate": 2.3489156494938753e-06, + "loss": 0.80659789, + "num_input_tokens_seen": 164475375, + "router_z_loss_clip": 2.59179688, + "router_z_loss_mlp": 0.37890625, + "step": 7665, + "time_per_iteration": 2.736499071121216 + }, + { + "auxiliary_loss_clip": 0.01429483, + "auxiliary_loss_mlp": 0.00403446, + "balance_loss_clip": 1.17017698, + "balance_loss_mlp": 0.36880365, + "epoch": 0.46090485495265293, + "flos": 19494789909120.0, + "grad_norm": 15.43448059294349, + "language_loss": 0.82580984, + "learning_rate": 2.348532153731669e-06, + "loss": 0.8441391, + "num_input_tokens_seen": 164492040, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.34643555, + "step": 7666, + "time_per_iteration": 2.6448476314544678 + }, + { + "auxiliary_loss_clip": 0.01414419, + "auxiliary_loss_mlp": 0.00401251, + "balance_loss_clip": 1.16194987, + "balance_loss_mlp": 0.36622697, + "epoch": 0.4609649782053209, + "flos": 33364927163520.0, + "grad_norm": 5.85884153169361, + "language_loss": 0.78765929, + "learning_rate": 2.348148644753088e-06, + "loss": 0.80581594, + "num_input_tokens_seen": 164513665, + "router_z_loss_clip": 2.5234375, + "router_z_loss_mlp": 0.35009766, + "step": 7667, + "time_per_iteration": 2.761565685272217 + }, + { + "auxiliary_loss_clip": 0.01421575, + "auxiliary_loss_mlp": 0.00437222, + "balance_loss_clip": 1.16374457, + "balance_loss_mlp": 0.40198374, + "epoch": 0.46102510145798886, + "flos": 23769165473280.0, + "grad_norm": 52.01780472547862, + "language_loss": 0.81259978, + "learning_rate": 2.347765122572676e-06, + "loss": 0.83118773, + "num_input_tokens_seen": 164533890, + "router_z_loss_clip": 2.57617188, + "router_z_loss_mlp": 0.35229492, + "step": 7668, + "time_per_iteration": 4.123117446899414 + }, + { + "auxiliary_loss_clip": 0.0142333, + "auxiliary_loss_mlp": 0.00395543, + "balance_loss_clip": 1.17324185, + "balance_loss_mlp": 0.36180654, + "epoch": 0.4610852247106568, + "flos": 23294821443840.0, + "grad_norm": 3.8022482270983513, + "language_loss": 0.82566965, + "learning_rate": 2.347381587204975e-06, + "loss": 0.84385842, + "num_input_tokens_seen": 164553815, + "router_z_loss_clip": 2.50195312, + "router_z_loss_mlp": 0.33740234, + "step": 7669, + "time_per_iteration": 2.6602416038513184 + }, + { + "auxiliary_loss_clip": 0.01423384, + "auxiliary_loss_mlp": 0.00430031, + "balance_loss_clip": 1.1680789, + "balance_loss_mlp": 0.39250407, + "epoch": 0.4611453479633248, + "flos": 25447450584960.0, + "grad_norm": 14.470089380725394, + "language_loss": 0.8784622, + "learning_rate": 2.34699803866453e-06, + "loss": 0.89699638, + "num_input_tokens_seen": 164573125, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.37548828, + "step": 7670, + "time_per_iteration": 2.834012746810913 + }, + { + "auxiliary_loss_clip": 0.01402173, + "auxiliary_loss_mlp": 0.00416277, + "balance_loss_clip": 1.15754151, + "balance_loss_mlp": 0.38101453, + "epoch": 0.4612054712159928, + "flos": 21139606523520.0, + "grad_norm": 1.9358541032659373, + "language_loss": 0.71394932, + "learning_rate": 2.3466144769658845e-06, + "loss": 0.73213375, + "num_input_tokens_seen": 164592575, + "router_z_loss_clip": 2.44921875, + "router_z_loss_mlp": 0.35253906, + "step": 7671, + "time_per_iteration": 2.832479476928711 + }, + { + "auxiliary_loss_clip": 0.01311013, + "auxiliary_loss_mlp": 0.00095025, + "balance_loss_clip": 1.14859378, + "balance_loss_mlp": 0.08515434, + "epoch": 0.4612655944686608, + "flos": 69959266404480.0, + "grad_norm": 0.6758314884533831, + "language_loss": 0.55472791, + "learning_rate": 2.346230902123583e-06, + "loss": 0.56878829, + "num_input_tokens_seen": 164659795, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.09863281, + "step": 7672, + "time_per_iteration": 3.3173041343688965 + }, + { + "auxiliary_loss_clip": 0.01426732, + "auxiliary_loss_mlp": 0.00411956, + "balance_loss_clip": 1.16505456, + "balance_loss_mlp": 0.37550229, + "epoch": 0.46132571772132874, + "flos": 16837149502080.0, + "grad_norm": 2.90609670688035, + "language_loss": 0.79696512, + "learning_rate": 2.3458473141521715e-06, + "loss": 0.81535196, + "num_input_tokens_seen": 164678735, + "router_z_loss_clip": 2.61914062, + "router_z_loss_mlp": 0.36474609, + "step": 7673, + "time_per_iteration": 2.657597780227661 + }, + { + "auxiliary_loss_clip": 0.01421134, + "auxiliary_loss_mlp": 0.00395306, + "balance_loss_clip": 1.16561759, + "balance_loss_mlp": 0.36004403, + "epoch": 0.4613858409739967, + "flos": 35808935431680.0, + "grad_norm": 4.98640644964533, + "language_loss": 0.76121044, + "learning_rate": 2.345463713066195e-06, + "loss": 0.7793749, + "num_input_tokens_seen": 164700885, + "router_z_loss_clip": 2.55078125, + "router_z_loss_mlp": 0.35253906, + "step": 7674, + "time_per_iteration": 2.7828781604766846 + }, + { + "auxiliary_loss_clip": 0.01422221, + "auxiliary_loss_mlp": 0.00415904, + "balance_loss_clip": 1.16629994, + "balance_loss_mlp": 0.37737596, + "epoch": 0.4614459642266647, + "flos": 35266756567680.0, + "grad_norm": 4.459583461006877, + "language_loss": 0.71424055, + "learning_rate": 2.3450800988801996e-06, + "loss": 0.73262179, + "num_input_tokens_seen": 164726960, + "router_z_loss_clip": 2.55859375, + "router_z_loss_mlp": 0.38500977, + "step": 7675, + "time_per_iteration": 2.837834596633911 + }, + { + "auxiliary_loss_clip": 0.01308792, + "auxiliary_loss_mlp": 0.00144436, + "balance_loss_clip": 1.14136374, + "balance_loss_mlp": 0.13375497, + "epoch": 0.46150608747933264, + "flos": 66704610044160.0, + "grad_norm": 0.7312986489482346, + "language_loss": 0.58248788, + "learning_rate": 2.3446964716087327e-06, + "loss": 0.59702015, + "num_input_tokens_seen": 164788525, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.10693359, + "step": 7676, + "time_per_iteration": 3.2074267864227295 + }, + { + "auxiliary_loss_clip": 0.01296945, + "auxiliary_loss_mlp": 0.00118374, + "balance_loss_clip": 1.1280849, + "balance_loss_mlp": 0.10769272, + "epoch": 0.4615662107320006, + "flos": 55830177025920.0, + "grad_norm": 0.8021137254901085, + "language_loss": 0.6281029, + "learning_rate": 2.344312831266341e-06, + "loss": 0.64225614, + "num_input_tokens_seen": 164843525, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.10693359, + "step": 7677, + "time_per_iteration": 3.004438877105713 + }, + { + "auxiliary_loss_clip": 0.01407697, + "auxiliary_loss_mlp": 0.00379021, + "balance_loss_clip": 1.15809536, + "balance_loss_mlp": 0.34440216, + "epoch": 0.46162633398466857, + "flos": 15483245137920.0, + "grad_norm": 52.70083872990648, + "language_loss": 0.81212914, + "learning_rate": 2.3439291778675718e-06, + "loss": 0.82999623, + "num_input_tokens_seen": 164859895, + "router_z_loss_clip": 2.49609375, + "router_z_loss_mlp": 0.34594727, + "step": 7678, + "time_per_iteration": 2.7829742431640625 + }, + { + "auxiliary_loss_clip": 0.01426812, + "auxiliary_loss_mlp": 0.00406154, + "balance_loss_clip": 1.16896057, + "balance_loss_mlp": 0.3676967, + "epoch": 0.46168645723733653, + "flos": 20011437181440.0, + "grad_norm": 71.78220200429496, + "language_loss": 0.72637177, + "learning_rate": 2.343545511426974e-06, + "loss": 0.7447015, + "num_input_tokens_seen": 164878030, + "router_z_loss_clip": 2.578125, + "router_z_loss_mlp": 0.38427734, + "step": 7679, + "time_per_iteration": 2.6852951049804688 + }, + { + "auxiliary_loss_clip": 0.01422346, + "auxiliary_loss_mlp": 0.00384651, + "balance_loss_clip": 1.16848528, + "balance_loss_mlp": 0.35008049, + "epoch": 0.4617465804900045, + "flos": 20298542590080.0, + "grad_norm": 14.241988461934266, + "language_loss": 0.78332591, + "learning_rate": 2.3431618319590963e-06, + "loss": 0.80139589, + "num_input_tokens_seen": 164895710, + "router_z_loss_clip": 2.5390625, + "router_z_loss_mlp": 0.34521484, + "step": 7680, + "time_per_iteration": 2.6947596073150635 + }, + { + "auxiliary_loss_clip": 0.01439022, + "auxiliary_loss_mlp": 0.00397251, + "balance_loss_clip": 1.17978716, + "balance_loss_mlp": 0.36008161, + "epoch": 0.46180670374267246, + "flos": 22346312952960.0, + "grad_norm": 14.047477096323238, + "language_loss": 0.71064508, + "learning_rate": 2.342778139478487e-06, + "loss": 0.72900772, + "num_input_tokens_seen": 164913365, + "router_z_loss_clip": 2.59179688, + "router_z_loss_mlp": 0.37182617, + "step": 7681, + "time_per_iteration": 2.706242561340332 + }, + { + "auxiliary_loss_clip": 0.01404311, + "auxiliary_loss_mlp": 0.00370345, + "balance_loss_clip": 1.15788853, + "balance_loss_mlp": 0.33663306, + "epoch": 0.46186682699534043, + "flos": 19895696582400.0, + "grad_norm": 55.2816947646057, + "language_loss": 0.73146677, + "learning_rate": 2.342394433999697e-06, + "loss": 0.74921334, + "num_input_tokens_seen": 164931620, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.3371582, + "step": 7682, + "time_per_iteration": 2.6594200134277344 + }, + { + "auxiliary_loss_clip": 0.01418385, + "auxiliary_loss_mlp": 0.00391755, + "balance_loss_clip": 1.16709781, + "balance_loss_mlp": 0.35708934, + "epoch": 0.4619269502480084, + "flos": 31503569408640.0, + "grad_norm": 5.080691165220897, + "language_loss": 0.8136245, + "learning_rate": 2.342010715537275e-06, + "loss": 0.8317259, + "num_input_tokens_seen": 164950905, + "router_z_loss_clip": 2.50976562, + "router_z_loss_mlp": 0.34692383, + "step": 7683, + "time_per_iteration": 2.7127525806427 + }, + { + "auxiliary_loss_clip": 0.01417629, + "auxiliary_loss_mlp": 0.00363357, + "balance_loss_clip": 1.16740489, + "balance_loss_mlp": 0.32773688, + "epoch": 0.46198707350067636, + "flos": 25009484054400.0, + "grad_norm": 4.523250672332182, + "language_loss": 0.83609378, + "learning_rate": 2.3416269841057726e-06, + "loss": 0.85390359, + "num_input_tokens_seen": 164970950, + "router_z_loss_clip": 2.5, + "router_z_loss_mlp": 0.35620117, + "step": 7684, + "time_per_iteration": 2.845942497253418 + }, + { + "auxiliary_loss_clip": 0.01430359, + "auxiliary_loss_mlp": 0.00392732, + "balance_loss_clip": 1.1719346, + "balance_loss_mlp": 0.35558608, + "epoch": 0.4620471967533444, + "flos": 18292357198080.0, + "grad_norm": 8.314822474666215, + "language_loss": 0.85683107, + "learning_rate": 2.3412432397197412e-06, + "loss": 0.87506199, + "num_input_tokens_seen": 164989855, + "router_z_loss_clip": 2.58398438, + "router_z_loss_mlp": 0.37158203, + "step": 7685, + "time_per_iteration": 2.6929025650024414 + }, + { + "auxiliary_loss_clip": 0.01407419, + "auxiliary_loss_mlp": 0.00364497, + "balance_loss_clip": 1.16402149, + "balance_loss_mlp": 0.33035558, + "epoch": 0.46210732000601235, + "flos": 33985104410880.0, + "grad_norm": 67.2408405759557, + "language_loss": 0.72826684, + "learning_rate": 2.340859482393731e-06, + "loss": 0.74598593, + "num_input_tokens_seen": 165012290, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.34130859, + "step": 7686, + "time_per_iteration": 2.813401460647583 + }, + { + "auxiliary_loss_clip": 0.01419455, + "auxiliary_loss_mlp": 0.00414218, + "balance_loss_clip": 1.16694427, + "balance_loss_mlp": 0.37754953, + "epoch": 0.4621674432586803, + "flos": 25009412227200.0, + "grad_norm": 38.09339677306485, + "language_loss": 0.80695885, + "learning_rate": 2.340475712142296e-06, + "loss": 0.82529563, + "num_input_tokens_seen": 165030810, + "router_z_loss_clip": 2.5234375, + "router_z_loss_mlp": 0.36694336, + "step": 7687, + "time_per_iteration": 2.6821866035461426 + }, + { + "auxiliary_loss_clip": 0.01411766, + "auxiliary_loss_mlp": 0.00364796, + "balance_loss_clip": 1.16621745, + "balance_loss_mlp": 0.33156008, + "epoch": 0.4622275665113483, + "flos": 22014031213440.0, + "grad_norm": 49.44629060259665, + "language_loss": 0.81193942, + "learning_rate": 2.3400919289799873e-06, + "loss": 0.829705, + "num_input_tokens_seen": 165050205, + "router_z_loss_clip": 2.45898438, + "router_z_loss_mlp": 0.33251953, + "step": 7688, + "time_per_iteration": 2.6846508979797363 + }, + { + "auxiliary_loss_clip": 0.01419707, + "auxiliary_loss_mlp": 0.0038793, + "balance_loss_clip": 1.16896439, + "balance_loss_mlp": 0.35147557, + "epoch": 0.46228768976401624, + "flos": 24058820747520.0, + "grad_norm": 2.893861544559389, + "language_loss": 0.843499, + "learning_rate": 2.3397081329213585e-06, + "loss": 0.86157537, + "num_input_tokens_seen": 165069370, + "router_z_loss_clip": 2.5078125, + "router_z_loss_mlp": 0.36474609, + "step": 7689, + "time_per_iteration": 2.7051355838775635 + }, + { + "auxiliary_loss_clip": 0.01427044, + "auxiliary_loss_mlp": 0.00425036, + "balance_loss_clip": 1.17378724, + "balance_loss_mlp": 0.38746166, + "epoch": 0.4623478130166842, + "flos": 26651391667200.0, + "grad_norm": 3.733555840509, + "language_loss": 0.65876138, + "learning_rate": 2.339324323980964e-06, + "loss": 0.67728221, + "num_input_tokens_seen": 165089610, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.37597656, + "step": 7690, + "time_per_iteration": 2.721898078918457 + }, + { + "auxiliary_loss_clip": 0.01427349, + "auxiliary_loss_mlp": 0.00402933, + "balance_loss_clip": 1.16813874, + "balance_loss_mlp": 0.36428511, + "epoch": 0.46240793626935217, + "flos": 20558428467840.0, + "grad_norm": 175.4081222326952, + "language_loss": 0.90848291, + "learning_rate": 2.3389405021733562e-06, + "loss": 0.92678571, + "num_input_tokens_seen": 165109050, + "router_z_loss_clip": 2.59375, + "router_z_loss_mlp": 0.38647461, + "step": 7691, + "time_per_iteration": 2.6418731212615967 + }, + { + "auxiliary_loss_clip": 0.01414891, + "auxiliary_loss_mlp": 0.00384225, + "balance_loss_clip": 1.16492701, + "balance_loss_mlp": 0.34982151, + "epoch": 0.46246805952202014, + "flos": 22456055980800.0, + "grad_norm": 1.7552019386906892, + "language_loss": 0.80244654, + "learning_rate": 2.338556667513091e-06, + "loss": 0.82043767, + "num_input_tokens_seen": 165130130, + "router_z_loss_clip": 2.5, + "router_z_loss_mlp": 0.34423828, + "step": 7692, + "time_per_iteration": 2.6776273250579834 + }, + { + "auxiliary_loss_clip": 0.01422059, + "auxiliary_loss_mlp": 0.00392158, + "balance_loss_clip": 1.17138708, + "balance_loss_mlp": 0.35527515, + "epoch": 0.4625281827746881, + "flos": 35041308854400.0, + "grad_norm": 39.39882064929038, + "language_loss": 0.79742938, + "learning_rate": 2.338172820014723e-06, + "loss": 0.81557155, + "num_input_tokens_seen": 165152685, + "router_z_loss_clip": 2.50585938, + "router_z_loss_mlp": 0.36889648, + "step": 7693, + "time_per_iteration": 2.7708561420440674 + }, + { + "auxiliary_loss_clip": 0.01412131, + "auxiliary_loss_mlp": 0.00387506, + "balance_loss_clip": 1.16916966, + "balance_loss_mlp": 0.3515763, + "epoch": 0.46258830602735607, + "flos": 21068647205760.0, + "grad_norm": 2.030859267439608, + "language_loss": 0.90798646, + "learning_rate": 2.337788959692808e-06, + "loss": 0.92598283, + "num_input_tokens_seen": 165173315, + "router_z_loss_clip": 2.4296875, + "router_z_loss_mlp": 0.359375, + "step": 7694, + "time_per_iteration": 2.805649518966675 + }, + { + "auxiliary_loss_clip": 0.0142617, + "auxiliary_loss_mlp": 0.0039149, + "balance_loss_clip": 1.17681599, + "balance_loss_mlp": 0.35584652, + "epoch": 0.46264842928002403, + "flos": 26177227205760.0, + "grad_norm": 10.970809656456423, + "language_loss": 0.8682664, + "learning_rate": 2.337405086561902e-06, + "loss": 0.88644302, + "num_input_tokens_seen": 165192395, + "router_z_loss_clip": 2.49414062, + "router_z_loss_mlp": 0.35644531, + "step": 7695, + "time_per_iteration": 2.772348642349243 + }, + { + "auxiliary_loss_clip": 0.0141947, + "auxiliary_loss_mlp": 0.00383656, + "balance_loss_clip": 1.17324042, + "balance_loss_mlp": 0.34901336, + "epoch": 0.462708552532692, + "flos": 16764214936320.0, + "grad_norm": 2.4249716555138754, + "language_loss": 0.78983706, + "learning_rate": 2.3370212006365606e-06, + "loss": 0.8078683, + "num_input_tokens_seen": 165211355, + "router_z_loss_clip": 2.45898438, + "router_z_loss_mlp": 0.34643555, + "step": 7696, + "time_per_iteration": 2.7246146202087402 + }, + { + "auxiliary_loss_clip": 0.01431018, + "auxiliary_loss_mlp": 0.00418886, + "balance_loss_clip": 1.17669034, + "balance_loss_mlp": 0.38064414, + "epoch": 0.46276867578535996, + "flos": 15560453422080.0, + "grad_norm": 11.29127185762115, + "language_loss": 0.77574241, + "learning_rate": 2.3366373019313423e-06, + "loss": 0.79424143, + "num_input_tokens_seen": 165229380, + "router_z_loss_clip": 2.54882812, + "router_z_loss_mlp": 0.38269043, + "step": 7697, + "time_per_iteration": 2.724553108215332 + }, + { + "auxiliary_loss_clip": 0.01416306, + "auxiliary_loss_mlp": 0.00446172, + "balance_loss_clip": 1.16825223, + "balance_loss_mlp": 0.40609419, + "epoch": 0.462828799038028, + "flos": 22415404763520.0, + "grad_norm": 13.720406181583867, + "language_loss": 0.9059673, + "learning_rate": 2.3362533904608025e-06, + "loss": 0.92459208, + "num_input_tokens_seen": 165247200, + "router_z_loss_clip": 2.47851562, + "router_z_loss_mlp": 0.40063477, + "step": 7698, + "time_per_iteration": 2.70389986038208 + }, + { + "auxiliary_loss_clip": 0.01410183, + "auxiliary_loss_mlp": 0.00398666, + "balance_loss_clip": 1.16769505, + "balance_loss_mlp": 0.36128211, + "epoch": 0.46288892229069595, + "flos": 21069580959360.0, + "grad_norm": 8.643926178970009, + "language_loss": 0.77132785, + "learning_rate": 2.335869466239502e-06, + "loss": 0.78941637, + "num_input_tokens_seen": 165265825, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.37353516, + "step": 7699, + "time_per_iteration": 4.115377902984619 + }, + { + "auxiliary_loss_clip": 0.01409839, + "auxiliary_loss_mlp": 0.00425983, + "balance_loss_clip": 1.16239214, + "balance_loss_mlp": 0.38731202, + "epoch": 0.4629490455433639, + "flos": 23185688947200.0, + "grad_norm": 16.13636484296391, + "language_loss": 0.77440262, + "learning_rate": 2.335485529281996e-06, + "loss": 0.79276091, + "num_input_tokens_seen": 165284380, + "router_z_loss_clip": 2.47460938, + "router_z_loss_mlp": 0.38671875, + "step": 7700, + "time_per_iteration": 2.8820698261260986 + }, + { + "auxiliary_loss_clip": 0.01412222, + "auxiliary_loss_mlp": 0.00417017, + "balance_loss_clip": 1.16992509, + "balance_loss_mlp": 0.37934706, + "epoch": 0.4630091687960319, + "flos": 18835541642880.0, + "grad_norm": 6.3462644051765125, + "language_loss": 0.79740834, + "learning_rate": 2.3351015796028467e-06, + "loss": 0.81570065, + "num_input_tokens_seen": 165300320, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.37695312, + "step": 7701, + "time_per_iteration": 2.6560544967651367 + }, + { + "auxiliary_loss_clip": 0.01427231, + "auxiliary_loss_mlp": 0.00445456, + "balance_loss_clip": 1.17462957, + "balance_loss_mlp": 0.40547276, + "epoch": 0.46306929204869984, + "flos": 38907020407680.0, + "grad_norm": 142.07563594554284, + "language_loss": 0.7183789, + "learning_rate": 2.3347176172166114e-06, + "loss": 0.73710585, + "num_input_tokens_seen": 165318130, + "router_z_loss_clip": 2.52734375, + "router_z_loss_mlp": 0.3996582, + "step": 7702, + "time_per_iteration": 2.8315579891204834 + }, + { + "auxiliary_loss_clip": 0.01424548, + "auxiliary_loss_mlp": 0.00410468, + "balance_loss_clip": 1.17913675, + "balance_loss_mlp": 0.37451464, + "epoch": 0.4631294153013678, + "flos": 19644178573440.0, + "grad_norm": 27.28107573906326, + "language_loss": 0.79330218, + "learning_rate": 2.33433364213785e-06, + "loss": 0.8116523, + "num_input_tokens_seen": 165336225, + "router_z_loss_clip": 2.453125, + "router_z_loss_mlp": 0.35961914, + "step": 7703, + "time_per_iteration": 4.10429048538208 + }, + { + "auxiliary_loss_clip": 0.01441719, + "auxiliary_loss_mlp": 0.00437498, + "balance_loss_clip": 1.18662417, + "balance_loss_mlp": 0.39823061, + "epoch": 0.4631895385540358, + "flos": 24608254158720.0, + "grad_norm": 72.836611170974, + "language_loss": 0.75916052, + "learning_rate": 2.3339496543811243e-06, + "loss": 0.77795273, + "num_input_tokens_seen": 165355005, + "router_z_loss_clip": 2.55078125, + "router_z_loss_mlp": 0.39233398, + "step": 7704, + "time_per_iteration": 2.751185417175293 + }, + { + "auxiliary_loss_clip": 0.01426986, + "auxiliary_loss_mlp": 0.00445394, + "balance_loss_clip": 1.17973053, + "balance_loss_mlp": 0.40464777, + "epoch": 0.46324966180670374, + "flos": 26320115508480.0, + "grad_norm": 12.663077242579849, + "language_loss": 0.8613109, + "learning_rate": 2.3335656539609934e-06, + "loss": 0.88003469, + "num_input_tokens_seen": 165374910, + "router_z_loss_clip": 2.47070312, + "router_z_loss_mlp": 0.40722656, + "step": 7705, + "time_per_iteration": 4.2530741691589355 + }, + { + "auxiliary_loss_clip": 0.01432912, + "auxiliary_loss_mlp": 0.00434299, + "balance_loss_clip": 1.17744827, + "balance_loss_mlp": 0.39586622, + "epoch": 0.4633097850593717, + "flos": 19240506552960.0, + "grad_norm": 109.54109621358397, + "language_loss": 0.83732355, + "learning_rate": 2.3331816408920196e-06, + "loss": 0.85599566, + "num_input_tokens_seen": 165392590, + "router_z_loss_clip": 2.55664062, + "router_z_loss_mlp": 0.38452148, + "step": 7706, + "time_per_iteration": 2.7141013145446777 + }, + { + "auxiliary_loss_clip": 0.01415746, + "auxiliary_loss_mlp": 0.00426407, + "balance_loss_clip": 1.17342556, + "balance_loss_mlp": 0.38947642, + "epoch": 0.46336990831203967, + "flos": 22783166161920.0, + "grad_norm": 1736.7654943398804, + "language_loss": 0.76201153, + "learning_rate": 2.3327976151887654e-06, + "loss": 0.78043306, + "num_input_tokens_seen": 165411195, + "router_z_loss_clip": 2.42382812, + "router_z_loss_mlp": 0.36962891, + "step": 7707, + "time_per_iteration": 2.712829113006592 + }, + { + "auxiliary_loss_clip": 0.01424756, + "auxiliary_loss_mlp": 0.00424524, + "balance_loss_clip": 1.17866993, + "balance_loss_mlp": 0.38706833, + "epoch": 0.46343003156470763, + "flos": 38210604543360.0, + "grad_norm": 9.160193218583473, + "language_loss": 0.67489898, + "learning_rate": 2.332413576865791e-06, + "loss": 0.6933918, + "num_input_tokens_seen": 165430150, + "router_z_loss_clip": 2.45898438, + "router_z_loss_mlp": 0.37402344, + "step": 7708, + "time_per_iteration": 2.934591293334961 + }, + { + "auxiliary_loss_clip": 0.01428587, + "auxiliary_loss_mlp": 0.00394022, + "balance_loss_clip": 1.18012238, + "balance_loss_mlp": 0.35718679, + "epoch": 0.4634901548173756, + "flos": 31938555110400.0, + "grad_norm": 40.81718011669565, + "language_loss": 0.83678699, + "learning_rate": 2.3320295259376614e-06, + "loss": 0.85501313, + "num_input_tokens_seen": 165450595, + "router_z_loss_clip": 2.48632812, + "router_z_loss_mlp": 0.3684082, + "step": 7709, + "time_per_iteration": 2.795398712158203 + }, + { + "auxiliary_loss_clip": 0.01438393, + "auxiliary_loss_mlp": 0.00413965, + "balance_loss_clip": 1.18676066, + "balance_loss_mlp": 0.37641412, + "epoch": 0.46355027807004356, + "flos": 20082540153600.0, + "grad_norm": 200.41424176208054, + "language_loss": 0.83756196, + "learning_rate": 2.3316454624189385e-06, + "loss": 0.8560856, + "num_input_tokens_seen": 165469515, + "router_z_loss_clip": 2.515625, + "router_z_loss_mlp": 0.37548828, + "step": 7710, + "time_per_iteration": 2.77746319770813 + }, + { + "auxiliary_loss_clip": 0.01458503, + "auxiliary_loss_mlp": 0.00417426, + "balance_loss_clip": 1.19831574, + "balance_loss_mlp": 0.37734795, + "epoch": 0.4636104013227116, + "flos": 24061370613120.0, + "grad_norm": 14.157243750468755, + "language_loss": 0.79383904, + "learning_rate": 2.3312613863241865e-06, + "loss": 0.81259835, + "num_input_tokens_seen": 165488125, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.40063477, + "step": 7711, + "time_per_iteration": 4.154359340667725 + }, + { + "auxiliary_loss_clip": 0.01446629, + "auxiliary_loss_mlp": 0.00410976, + "balance_loss_clip": 1.19320631, + "balance_loss_mlp": 0.37054002, + "epoch": 0.46367052457537955, + "flos": 23914639555200.0, + "grad_norm": 25.02551509704164, + "language_loss": 0.76868498, + "learning_rate": 2.33087729766797e-06, + "loss": 0.78726101, + "num_input_tokens_seen": 165509225, + "router_z_loss_clip": 2.53710938, + "router_z_loss_mlp": 0.40429688, + "step": 7712, + "time_per_iteration": 2.7194020748138428 + }, + { + "auxiliary_loss_clip": 0.01444411, + "auxiliary_loss_mlp": 0.00408447, + "balance_loss_clip": 1.18732429, + "balance_loss_mlp": 0.36791536, + "epoch": 0.4637306478280475, + "flos": 26396533693440.0, + "grad_norm": 864.402012880036, + "language_loss": 0.78041697, + "learning_rate": 2.3304931964648524e-06, + "loss": 0.79894555, + "num_input_tokens_seen": 165529945, + "router_z_loss_clip": 2.56835938, + "router_z_loss_mlp": 0.40527344, + "step": 7713, + "time_per_iteration": 2.6945700645446777 + }, + { + "auxiliary_loss_clip": 0.01453187, + "auxiliary_loss_mlp": 0.00459598, + "balance_loss_clip": 1.19219196, + "balance_loss_mlp": 0.41687325, + "epoch": 0.4637907710807155, + "flos": 21980706370560.0, + "grad_norm": 3007.079594154705, + "language_loss": 0.66409081, + "learning_rate": 2.3301090827294e-06, + "loss": 0.68321866, + "num_input_tokens_seen": 165550690, + "router_z_loss_clip": 2.61132812, + "router_z_loss_mlp": 0.42724609, + "step": 7714, + "time_per_iteration": 2.673614263534546 + }, + { + "auxiliary_loss_clip": 0.01447075, + "auxiliary_loss_mlp": 0.00446801, + "balance_loss_clip": 1.1921221, + "balance_loss_mlp": 0.40674675, + "epoch": 0.46385089433338345, + "flos": 12422291846400.0, + "grad_norm": 19.513449427493267, + "language_loss": 0.77381527, + "learning_rate": 2.3297249564761784e-06, + "loss": 0.79275405, + "num_input_tokens_seen": 165567775, + "router_z_loss_clip": 2.55078125, + "router_z_loss_mlp": 0.40039062, + "step": 7715, + "time_per_iteration": 2.646658420562744 + }, + { + "auxiliary_loss_clip": 0.01466588, + "auxiliary_loss_mlp": 0.00459826, + "balance_loss_clip": 1.19881225, + "balance_loss_mlp": 0.41881847, + "epoch": 0.4639110175860514, + "flos": 23915752876800.0, + "grad_norm": 5.2668776853680805, + "language_loss": 0.75331742, + "learning_rate": 2.3293408177197527e-06, + "loss": 0.77258158, + "num_input_tokens_seen": 165587010, + "router_z_loss_clip": 2.67773438, + "router_z_loss_mlp": 0.40991211, + "step": 7716, + "time_per_iteration": 2.804318428039551 + }, + { + "auxiliary_loss_clip": 0.01454314, + "auxiliary_loss_mlp": 0.00438331, + "balance_loss_clip": 1.1985743, + "balance_loss_mlp": 0.39710861, + "epoch": 0.4639711408387194, + "flos": 25300396304640.0, + "grad_norm": 28.790624914316087, + "language_loss": 0.86066508, + "learning_rate": 2.328956666474691e-06, + "loss": 0.87959158, + "num_input_tokens_seen": 165607850, + "router_z_loss_clip": 2.55664062, + "router_z_loss_mlp": 0.41235352, + "step": 7717, + "time_per_iteration": 2.79526424407959 + }, + { + "auxiliary_loss_clip": 0.01448311, + "auxiliary_loss_mlp": 0.00420342, + "balance_loss_clip": 1.18811929, + "balance_loss_mlp": 0.38117009, + "epoch": 0.46403126409138734, + "flos": 21211822817280.0, + "grad_norm": 26829.22328203678, + "language_loss": 0.77801311, + "learning_rate": 2.3285725027555593e-06, + "loss": 0.79669964, + "num_input_tokens_seen": 165627175, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.39208984, + "step": 7718, + "time_per_iteration": 2.684689521789551 + }, + { + "auxiliary_loss_clip": 0.01455389, + "auxiliary_loss_mlp": 0.00414502, + "balance_loss_clip": 1.19213688, + "balance_loss_mlp": 0.37501994, + "epoch": 0.4640913873440553, + "flos": 35845564325760.0, + "grad_norm": 18.876480461252797, + "language_loss": 0.77176058, + "learning_rate": 2.3281883265769254e-06, + "loss": 0.79045951, + "num_input_tokens_seen": 165648340, + "router_z_loss_clip": 2.63085938, + "router_z_loss_mlp": 0.39453125, + "step": 7719, + "time_per_iteration": 2.8018598556518555 + }, + { + "auxiliary_loss_clip": 0.01456879, + "auxiliary_loss_mlp": 0.00428717, + "balance_loss_clip": 1.19633961, + "balance_loss_mlp": 0.38699359, + "epoch": 0.46415151059672327, + "flos": 19166207270400.0, + "grad_norm": 10.275416954369046, + "language_loss": 0.91317952, + "learning_rate": 2.327804137953357e-06, + "loss": 0.93203545, + "num_input_tokens_seen": 165667195, + "router_z_loss_clip": 2.60546875, + "router_z_loss_mlp": 0.41723633, + "step": 7720, + "time_per_iteration": 2.6592020988464355 + }, + { + "auxiliary_loss_clip": 0.01315831, + "auxiliary_loss_mlp": 0.00080575, + "balance_loss_clip": 1.16608787, + "balance_loss_mlp": 0.07289782, + "epoch": 0.46421163384939124, + "flos": 58912750304640.0, + "grad_norm": 0.7352614559530523, + "language_loss": 0.549025, + "learning_rate": 2.3274199368994226e-06, + "loss": 0.56298906, + "num_input_tokens_seen": 165726760, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.07666016, + "step": 7721, + "time_per_iteration": 3.251382350921631 + }, + { + "auxiliary_loss_clip": 0.01457637, + "auxiliary_loss_mlp": 0.0044183, + "balance_loss_clip": 1.20142722, + "balance_loss_mlp": 0.40079796, + "epoch": 0.4642717571020592, + "flos": 20157342226560.0, + "grad_norm": 6.920931130417418, + "language_loss": 0.86377609, + "learning_rate": 2.3270357234296918e-06, + "loss": 0.88277078, + "num_input_tokens_seen": 165745005, + "router_z_loss_clip": 2.56054688, + "router_z_loss_mlp": 0.41040039, + "step": 7722, + "time_per_iteration": 2.678438425064087 + }, + { + "auxiliary_loss_clip": 0.01462661, + "auxiliary_loss_mlp": 0.00437757, + "balance_loss_clip": 1.20068121, + "balance_loss_mlp": 0.39889449, + "epoch": 0.46433188035472717, + "flos": 25046184775680.0, + "grad_norm": 3954.402422050313, + "language_loss": 0.84261507, + "learning_rate": 2.3266514975587332e-06, + "loss": 0.86161923, + "num_input_tokens_seen": 165765750, + "router_z_loss_clip": 2.61914062, + "router_z_loss_mlp": 0.38867188, + "step": 7723, + "time_per_iteration": 2.7246551513671875 + }, + { + "auxiliary_loss_clip": 0.01444573, + "auxiliary_loss_mlp": 0.00375307, + "balance_loss_clip": 1.19104326, + "balance_loss_mlp": 0.33804268, + "epoch": 0.4643920036073952, + "flos": 28075644817920.0, + "grad_norm": 17.525975957085645, + "language_loss": 0.74997866, + "learning_rate": 2.326267259301118e-06, + "loss": 0.76817745, + "num_input_tokens_seen": 165787515, + "router_z_loss_clip": 2.54101562, + "router_z_loss_mlp": 0.37231445, + "step": 7724, + "time_per_iteration": 2.7139244079589844 + }, + { + "auxiliary_loss_clip": 0.01446711, + "auxiliary_loss_mlp": 0.00401582, + "balance_loss_clip": 1.19222164, + "balance_loss_mlp": 0.36546111, + "epoch": 0.46445212686006315, + "flos": 18369350000640.0, + "grad_norm": 106.840680668386, + "language_loss": 0.7598269, + "learning_rate": 2.325883008671415e-06, + "loss": 0.77830982, + "num_input_tokens_seen": 165806675, + "router_z_loss_clip": 2.54296875, + "router_z_loss_mlp": 0.36108398, + "step": 7725, + "time_per_iteration": 2.6213905811309814 + }, + { + "auxiliary_loss_clip": 0.01426797, + "auxiliary_loss_mlp": 0.00376125, + "balance_loss_clip": 1.18078482, + "balance_loss_mlp": 0.3407203, + "epoch": 0.4645122501127311, + "flos": 31721618920320.0, + "grad_norm": 47.71684316502801, + "language_loss": 0.71391416, + "learning_rate": 2.3254987456841955e-06, + "loss": 0.73194337, + "num_input_tokens_seen": 165829835, + "router_z_loss_clip": 2.4609375, + "router_z_loss_mlp": 0.35400391, + "step": 7726, + "time_per_iteration": 2.7235848903656006 + }, + { + "auxiliary_loss_clip": 0.01452021, + "auxiliary_loss_mlp": 0.00396746, + "balance_loss_clip": 1.19684517, + "balance_loss_mlp": 0.35971934, + "epoch": 0.4645723733653991, + "flos": 23768806337280.0, + "grad_norm": 12.331576323403942, + "language_loss": 0.82643354, + "learning_rate": 2.3251144703540307e-06, + "loss": 0.84492123, + "num_input_tokens_seen": 165849380, + "router_z_loss_clip": 2.54882812, + "router_z_loss_mlp": 0.37036133, + "step": 7727, + "time_per_iteration": 2.691620111465454 + }, + { + "auxiliary_loss_clip": 0.01447815, + "auxiliary_loss_mlp": 0.00421724, + "balance_loss_clip": 1.19208694, + "balance_loss_mlp": 0.38326687, + "epoch": 0.46463249661806705, + "flos": 33145512935040.0, + "grad_norm": 15.991635231022526, + "language_loss": 0.84432292, + "learning_rate": 2.3247301826954936e-06, + "loss": 0.86301839, + "num_input_tokens_seen": 165868620, + "router_z_loss_clip": 2.55664062, + "router_z_loss_mlp": 0.38452148, + "step": 7728, + "time_per_iteration": 2.734421730041504 + }, + { + "auxiliary_loss_clip": 0.01457099, + "auxiliary_loss_mlp": 0.00413643, + "balance_loss_clip": 1.19759822, + "balance_loss_mlp": 0.37354115, + "epoch": 0.464692619870735, + "flos": 18296020385280.0, + "grad_norm": 451.08135187199275, + "language_loss": 0.83221209, + "learning_rate": 2.324345882723155e-06, + "loss": 0.85091949, + "num_input_tokens_seen": 165885915, + "router_z_loss_clip": 2.59179688, + "router_z_loss_mlp": 0.40136719, + "step": 7729, + "time_per_iteration": 2.659691095352173 + }, + { + "auxiliary_loss_clip": 0.01452124, + "auxiliary_loss_mlp": 0.00395968, + "balance_loss_clip": 1.19600999, + "balance_loss_mlp": 0.3563906, + "epoch": 0.464752743123403, + "flos": 22638051216000.0, + "grad_norm": 18.007248942280597, + "language_loss": 0.85960639, + "learning_rate": 2.323961570451588e-06, + "loss": 0.87808728, + "num_input_tokens_seen": 165905465, + "router_z_loss_clip": 2.55859375, + "router_z_loss_mlp": 0.39575195, + "step": 7730, + "time_per_iteration": 2.624544620513916 + }, + { + "auxiliary_loss_clip": 0.01445795, + "auxiliary_loss_mlp": 0.00433579, + "balance_loss_clip": 1.19232225, + "balance_loss_mlp": 0.39416814, + "epoch": 0.46481286637607094, + "flos": 20412128373120.0, + "grad_norm": 9.464239707983618, + "language_loss": 0.82211578, + "learning_rate": 2.3235772458953655e-06, + "loss": 0.84090954, + "num_input_tokens_seen": 165924640, + "router_z_loss_clip": 2.53320312, + "router_z_loss_mlp": 0.39379883, + "step": 7731, + "time_per_iteration": 2.673732280731201 + }, + { + "auxiliary_loss_clip": 0.01436468, + "auxiliary_loss_mlp": 0.00390088, + "balance_loss_clip": 1.18792927, + "balance_loss_mlp": 0.35408688, + "epoch": 0.4648729896287389, + "flos": 34275406129920.0, + "grad_norm": 22.98631780535822, + "language_loss": 0.72140205, + "learning_rate": 2.323192909069061e-06, + "loss": 0.73966759, + "num_input_tokens_seen": 165945765, + "router_z_loss_clip": 2.48828125, + "router_z_loss_mlp": 0.36010742, + "step": 7732, + "time_per_iteration": 2.7896337509155273 + }, + { + "auxiliary_loss_clip": 0.01449157, + "auxiliary_loss_mlp": 0.00439776, + "balance_loss_clip": 1.18832564, + "balance_loss_mlp": 0.39824307, + "epoch": 0.4649331128814069, + "flos": 21321781326720.0, + "grad_norm": 257.32269837062074, + "language_loss": 0.83509976, + "learning_rate": 2.32280855998725e-06, + "loss": 0.85398906, + "num_input_tokens_seen": 165964025, + "router_z_loss_clip": 2.60546875, + "router_z_loss_mlp": 0.41552734, + "step": 7733, + "time_per_iteration": 2.6461408138275146 + }, + { + "auxiliary_loss_clip": 0.01283088, + "auxiliary_loss_mlp": 0.00062311, + "balance_loss_clip": 1.13068926, + "balance_loss_mlp": 0.0530126, + "epoch": 0.46499323613407484, + "flos": 58308515717760.0, + "grad_norm": 1.3314151244074708, + "language_loss": 0.51812351, + "learning_rate": 2.3224241986645057e-06, + "loss": 0.53157747, + "num_input_tokens_seen": 166021950, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.09277344, + "step": 7734, + "time_per_iteration": 3.0830087661743164 + }, + { + "auxiliary_loss_clip": 0.01443613, + "auxiliary_loss_mlp": 0.00389514, + "balance_loss_clip": 1.18647301, + "balance_loss_mlp": 0.35315472, + "epoch": 0.4650533593867428, + "flos": 10889660384640.0, + "grad_norm": 65.08021082848143, + "language_loss": 0.80919129, + "learning_rate": 2.3220398251154035e-06, + "loss": 0.82752252, + "num_input_tokens_seen": 166039675, + "router_z_loss_clip": 2.56835938, + "router_z_loss_mlp": 0.36376953, + "step": 7735, + "time_per_iteration": 2.613586664199829 + }, + { + "auxiliary_loss_clip": 0.01423596, + "auxiliary_loss_mlp": 0.00383243, + "balance_loss_clip": 1.1765945, + "balance_loss_mlp": 0.34831458, + "epoch": 0.46511348263941077, + "flos": 19974592805760.0, + "grad_norm": 10.506670091082793, + "language_loss": 0.76958919, + "learning_rate": 2.321655439354519e-06, + "loss": 0.78765756, + "num_input_tokens_seen": 166057745, + "router_z_loss_clip": 2.47265625, + "router_z_loss_mlp": 0.34960938, + "step": 7736, + "time_per_iteration": 2.667954444885254 + }, + { + "auxiliary_loss_clip": 0.01415825, + "auxiliary_loss_mlp": 0.00411051, + "balance_loss_clip": 1.17529726, + "balance_loss_mlp": 0.37316588, + "epoch": 0.46517360589207873, + "flos": 19678401256320.0, + "grad_norm": 26.984914452782068, + "language_loss": 0.76858497, + "learning_rate": 2.321271041396427e-06, + "loss": 0.78685373, + "num_input_tokens_seen": 166076440, + "router_z_loss_clip": 2.40429688, + "router_z_loss_mlp": 0.37890625, + "step": 7737, + "time_per_iteration": 2.689790964126587 + }, + { + "auxiliary_loss_clip": 0.01445264, + "auxiliary_loss_mlp": 0.00417275, + "balance_loss_clip": 1.18663716, + "balance_loss_mlp": 0.3798196, + "epoch": 0.46523372914474675, + "flos": 16872665074560.0, + "grad_norm": 105.83708657193591, + "language_loss": 0.89857507, + "learning_rate": 2.3208866312557065e-06, + "loss": 0.91720045, + "num_input_tokens_seen": 166092520, + "router_z_loss_clip": 2.58398438, + "router_z_loss_mlp": 0.37451172, + "step": 7738, + "time_per_iteration": 2.6479854583740234 + }, + { + "auxiliary_loss_clip": 0.01273875, + "auxiliary_loss_mlp": 0.00105539, + "balance_loss_clip": 1.11770606, + "balance_loss_mlp": 0.09657423, + "epoch": 0.4652938523974147, + "flos": 53439138339840.0, + "grad_norm": 0.7318792301090837, + "language_loss": 0.57337832, + "learning_rate": 2.320502208946932e-06, + "loss": 0.58717245, + "num_input_tokens_seen": 166156285, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.08984375, + "step": 7739, + "time_per_iteration": 3.2470743656158447 + }, + { + "auxiliary_loss_clip": 0.01435765, + "auxiliary_loss_mlp": 0.00439372, + "balance_loss_clip": 1.18104994, + "balance_loss_mlp": 0.40024751, + "epoch": 0.4653539756500827, + "flos": 15231296165760.0, + "grad_norm": 24.434183408119207, + "language_loss": 0.91764903, + "learning_rate": 2.3201177744846815e-06, + "loss": 0.93640041, + "num_input_tokens_seen": 166173455, + "router_z_loss_clip": 2.54882812, + "router_z_loss_mlp": 0.39135742, + "step": 7740, + "time_per_iteration": 2.7231385707855225 + }, + { + "auxiliary_loss_clip": 0.01424093, + "auxiliary_loss_mlp": 0.00394025, + "balance_loss_clip": 1.17633057, + "balance_loss_mlp": 0.35838109, + "epoch": 0.46541409890275065, + "flos": 23732249270400.0, + "grad_norm": 5.065779651035473, + "language_loss": 0.80118906, + "learning_rate": 2.3197333278835327e-06, + "loss": 0.81937027, + "num_input_tokens_seen": 166194370, + "router_z_loss_clip": 2.48242188, + "router_z_loss_mlp": 0.35668945, + "step": 7741, + "time_per_iteration": 2.6854987144470215 + }, + { + "auxiliary_loss_clip": 0.01445028, + "auxiliary_loss_mlp": 0.0040582, + "balance_loss_clip": 1.18408227, + "balance_loss_mlp": 0.36748213, + "epoch": 0.4654742221554186, + "flos": 20847329556480.0, + "grad_norm": 165.89102817568545, + "language_loss": 0.86041266, + "learning_rate": 2.319348869158064e-06, + "loss": 0.87892115, + "num_input_tokens_seen": 166213195, + "router_z_loss_clip": 2.61132812, + "router_z_loss_mlp": 0.38330078, + "step": 7742, + "time_per_iteration": 4.091065168380737 + }, + { + "auxiliary_loss_clip": 0.01436438, + "auxiliary_loss_mlp": 0.00469377, + "balance_loss_clip": 1.17718148, + "balance_loss_mlp": 0.42450637, + "epoch": 0.4655343454080866, + "flos": 20704836303360.0, + "grad_norm": 82.27316257214919, + "language_loss": 0.7820918, + "learning_rate": 2.3189643983228555e-06, + "loss": 0.80114996, + "num_input_tokens_seen": 166231350, + "router_z_loss_clip": 2.59375, + "router_z_loss_mlp": 0.44897461, + "step": 7743, + "time_per_iteration": 2.732997417449951 + }, + { + "auxiliary_loss_clip": 0.01441552, + "auxiliary_loss_mlp": 0.00392561, + "balance_loss_clip": 1.18948913, + "balance_loss_mlp": 0.35512969, + "epoch": 0.46559446866075455, + "flos": 18989850470400.0, + "grad_norm": 39.174471569314306, + "language_loss": 0.78417504, + "learning_rate": 2.318579915392483e-06, + "loss": 0.80251616, + "num_input_tokens_seen": 166250530, + "router_z_loss_clip": 2.5234375, + "router_z_loss_mlp": 0.37451172, + "step": 7744, + "time_per_iteration": 2.656733989715576 + }, + { + "auxiliary_loss_clip": 0.01421037, + "auxiliary_loss_mlp": 0.00402795, + "balance_loss_clip": 1.17566752, + "balance_loss_mlp": 0.36619794, + "epoch": 0.4656545919134225, + "flos": 34496364643200.0, + "grad_norm": 3.9794070834128505, + "language_loss": 0.8904649, + "learning_rate": 2.31819542038153e-06, + "loss": 0.90870321, + "num_input_tokens_seen": 166272545, + "router_z_loss_clip": 2.45117188, + "router_z_loss_mlp": 0.36572266, + "step": 7745, + "time_per_iteration": 4.2389068603515625 + }, + { + "auxiliary_loss_clip": 0.01426851, + "auxiliary_loss_mlp": 0.00415717, + "balance_loss_clip": 1.17638707, + "balance_loss_mlp": 0.37675947, + "epoch": 0.4657147151660905, + "flos": 24310554238080.0, + "grad_norm": 6.73518516282038, + "language_loss": 0.78965718, + "learning_rate": 2.317810913304574e-06, + "loss": 0.80808282, + "num_input_tokens_seen": 166292135, + "router_z_loss_clip": 2.50195312, + "router_z_loss_mlp": 0.38964844, + "step": 7746, + "time_per_iteration": 2.7008020877838135 + }, + { + "auxiliary_loss_clip": 0.01413413, + "auxiliary_loss_mlp": 0.00401003, + "balance_loss_clip": 1.17025185, + "balance_loss_mlp": 0.36554989, + "epoch": 0.46577483841875844, + "flos": 58795139220480.0, + "grad_norm": 71.92145163746531, + "language_loss": 0.74335587, + "learning_rate": 2.3174263941761963e-06, + "loss": 0.7615, + "num_input_tokens_seen": 166316710, + "router_z_loss_clip": 2.43164062, + "router_z_loss_mlp": 0.35473633, + "step": 7747, + "time_per_iteration": 4.534693479537964 + }, + { + "auxiliary_loss_clip": 0.0144524, + "auxiliary_loss_mlp": 0.00462153, + "balance_loss_clip": 1.18941426, + "balance_loss_mlp": 0.42000058, + "epoch": 0.4658349616714264, + "flos": 31321969223040.0, + "grad_norm": 27.825016166031137, + "language_loss": 0.72859341, + "learning_rate": 2.317041863010978e-06, + "loss": 0.74766731, + "num_input_tokens_seen": 166338535, + "router_z_loss_clip": 2.55664062, + "router_z_loss_mlp": 0.421875, + "step": 7748, + "time_per_iteration": 2.7369587421417236 + }, + { + "auxiliary_loss_clip": 0.01435746, + "auxiliary_loss_mlp": 0.0042408, + "balance_loss_clip": 1.17749655, + "balance_loss_mlp": 0.38419229, + "epoch": 0.46589508492409437, + "flos": 14860338456960.0, + "grad_norm": 29.82758853779135, + "language_loss": 0.72469628, + "learning_rate": 2.3166573198235007e-06, + "loss": 0.7432946, + "num_input_tokens_seen": 166355540, + "router_z_loss_clip": 2.58203125, + "router_z_loss_mlp": 0.39892578, + "step": 7749, + "time_per_iteration": 2.654207944869995 + }, + { + "auxiliary_loss_clip": 0.01460232, + "auxiliary_loss_mlp": 0.00404807, + "balance_loss_clip": 1.19717073, + "balance_loss_mlp": 0.36618268, + "epoch": 0.46595520817676234, + "flos": 12895989431040.0, + "grad_norm": 21.794750896822336, + "language_loss": 0.82635009, + "learning_rate": 2.3162727646283456e-06, + "loss": 0.84500051, + "num_input_tokens_seen": 166372635, + "router_z_loss_clip": 2.63085938, + "router_z_loss_mlp": 0.38623047, + "step": 7750, + "time_per_iteration": 2.630199670791626 + }, + { + "auxiliary_loss_clip": 0.0144434, + "auxiliary_loss_mlp": 0.00431543, + "balance_loss_clip": 1.1866082, + "balance_loss_mlp": 0.3932054, + "epoch": 0.46601533142943036, + "flos": 32854169721600.0, + "grad_norm": 12.140458804098222, + "language_loss": 0.7981205, + "learning_rate": 2.3158881974400963e-06, + "loss": 0.81687927, + "num_input_tokens_seen": 166393175, + "router_z_loss_clip": 2.578125, + "router_z_loss_mlp": 0.38330078, + "step": 7751, + "time_per_iteration": 2.7569050788879395 + }, + { + "auxiliary_loss_clip": 0.01458077, + "auxiliary_loss_mlp": 0.0043385, + "balance_loss_clip": 1.19356179, + "balance_loss_mlp": 0.39369982, + "epoch": 0.4660754546820983, + "flos": 19967517826560.0, + "grad_norm": 14.928418572502714, + "language_loss": 0.81111604, + "learning_rate": 2.3155036182733345e-06, + "loss": 0.83003533, + "num_input_tokens_seen": 166408630, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.40185547, + "step": 7752, + "time_per_iteration": 2.631706714630127 + }, + { + "auxiliary_loss_clip": 0.01452043, + "auxiliary_loss_mlp": 0.00456673, + "balance_loss_clip": 1.1870743, + "balance_loss_mlp": 0.41263741, + "epoch": 0.4661355779347663, + "flos": 26688164215680.0, + "grad_norm": 42.264344633679116, + "language_loss": 0.76987249, + "learning_rate": 2.315119027142644e-06, + "loss": 0.78895962, + "num_input_tokens_seen": 166428170, + "router_z_loss_clip": 2.65039062, + "router_z_loss_mlp": 0.44067383, + "step": 7753, + "time_per_iteration": 4.139386177062988 + }, + { + "auxiliary_loss_clip": 0.0144396, + "auxiliary_loss_mlp": 0.00396938, + "balance_loss_clip": 1.18931925, + "balance_loss_mlp": 0.35843322, + "epoch": 0.46619570118743425, + "flos": 20959442881920.0, + "grad_norm": 82.74953852607187, + "language_loss": 0.79190958, + "learning_rate": 2.3147344240626076e-06, + "loss": 0.81031859, + "num_input_tokens_seen": 166446705, + "router_z_loss_clip": 2.54492188, + "router_z_loss_mlp": 0.38500977, + "step": 7754, + "time_per_iteration": 2.659687042236328 + }, + { + "auxiliary_loss_clip": 0.01438029, + "auxiliary_loss_mlp": 0.0044216, + "balance_loss_clip": 1.18249798, + "balance_loss_mlp": 0.40141422, + "epoch": 0.4662558244401022, + "flos": 24426079355520.0, + "grad_norm": 25.26114273646324, + "language_loss": 0.84603399, + "learning_rate": 2.3143498090478114e-06, + "loss": 0.86483592, + "num_input_tokens_seen": 166466750, + "router_z_loss_clip": 2.5546875, + "router_z_loss_mlp": 0.40722656, + "step": 7755, + "time_per_iteration": 2.7454473972320557 + }, + { + "auxiliary_loss_clip": 0.01423442, + "auxiliary_loss_mlp": 0.0039272, + "balance_loss_clip": 1.1746217, + "balance_loss_mlp": 0.35605121, + "epoch": 0.4663159476927702, + "flos": 20595452411520.0, + "grad_norm": 18.080392585663816, + "language_loss": 0.78954303, + "learning_rate": 2.3139651821128382e-06, + "loss": 0.80770463, + "num_input_tokens_seen": 166485400, + "router_z_loss_clip": 2.48632812, + "router_z_loss_mlp": 0.36694336, + "step": 7756, + "time_per_iteration": 2.6340878009796143 + }, + { + "auxiliary_loss_clip": 0.01427502, + "auxiliary_loss_mlp": 0.00386744, + "balance_loss_clip": 1.17763376, + "balance_loss_mlp": 0.35093343, + "epoch": 0.46637607094543815, + "flos": 25661872823040.0, + "grad_norm": 168.03210623792987, + "language_loss": 0.83092386, + "learning_rate": 2.313580543272274e-06, + "loss": 0.84906638, + "num_input_tokens_seen": 166505730, + "router_z_loss_clip": 2.50195312, + "router_z_loss_mlp": 0.3581543, + "step": 7757, + "time_per_iteration": 2.6764800548553467 + }, + { + "auxiliary_loss_clip": 0.01439701, + "auxiliary_loss_mlp": 0.00406752, + "balance_loss_clip": 1.18584228, + "balance_loss_mlp": 0.36874834, + "epoch": 0.4664361941981061, + "flos": 24273853516800.0, + "grad_norm": 20.720160713774675, + "language_loss": 0.73450971, + "learning_rate": 2.313195892540705e-06, + "loss": 0.75297421, + "num_input_tokens_seen": 166523770, + "router_z_loss_clip": 2.53515625, + "router_z_loss_mlp": 0.37988281, + "step": 7758, + "time_per_iteration": 2.6738758087158203 + }, + { + "auxiliary_loss_clip": 0.01426761, + "auxiliary_loss_mlp": 0.00391066, + "balance_loss_clip": 1.17910528, + "balance_loss_mlp": 0.35575616, + "epoch": 0.4664963174507741, + "flos": 18405871153920.0, + "grad_norm": 41.16133747119332, + "language_loss": 0.82425374, + "learning_rate": 2.3128112299327147e-06, + "loss": 0.84243202, + "num_input_tokens_seen": 166542935, + "router_z_loss_clip": 2.4765625, + "router_z_loss_mlp": 0.3527832, + "step": 7759, + "time_per_iteration": 2.675135612487793 + }, + { + "auxiliary_loss_clip": 0.0143591, + "auxiliary_loss_mlp": 0.0039434, + "balance_loss_clip": 1.18659902, + "balance_loss_mlp": 0.35888726, + "epoch": 0.46655644070344204, + "flos": 22455122227200.0, + "grad_norm": 95.18789073871861, + "language_loss": 0.83296889, + "learning_rate": 2.312426555462893e-06, + "loss": 0.85127139, + "num_input_tokens_seen": 166563935, + "router_z_loss_clip": 2.49804688, + "router_z_loss_mlp": 0.35473633, + "step": 7760, + "time_per_iteration": 2.684521198272705 + }, + { + "auxiliary_loss_clip": 0.01420899, + "auxiliary_loss_mlp": 0.00395211, + "balance_loss_clip": 1.17249632, + "balance_loss_mlp": 0.35632476, + "epoch": 0.46661656395611, + "flos": 13808407731840.0, + "grad_norm": 55.21596677400066, + "language_loss": 0.80808401, + "learning_rate": 2.3120418691458237e-06, + "loss": 0.82624507, + "num_input_tokens_seen": 166582175, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.38891602, + "step": 7761, + "time_per_iteration": 2.6639411449432373 + }, + { + "auxiliary_loss_clip": 0.01457817, + "auxiliary_loss_mlp": 0.00422589, + "balance_loss_clip": 1.19637799, + "balance_loss_mlp": 0.38165271, + "epoch": 0.466676687208778, + "flos": 21652159645440.0, + "grad_norm": 72.8816268532683, + "language_loss": 0.84562618, + "learning_rate": 2.3116571709960956e-06, + "loss": 0.86443019, + "num_input_tokens_seen": 166601870, + "router_z_loss_clip": 2.61328125, + "router_z_loss_mlp": 0.40966797, + "step": 7762, + "time_per_iteration": 2.7596004009246826 + }, + { + "auxiliary_loss_clip": 0.01283176, + "auxiliary_loss_mlp": 0.00031994, + "balance_loss_clip": 1.12419784, + "balance_loss_mlp": 0.02550852, + "epoch": 0.46673681046144594, + "flos": 68534259068160.0, + "grad_norm": 0.7793218265170596, + "language_loss": 0.59404838, + "learning_rate": 2.311272461028297e-06, + "loss": 0.60720003, + "num_input_tokens_seen": 166668960, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.06494141, + "step": 7763, + "time_per_iteration": 3.2454962730407715 + }, + { + "auxiliary_loss_clip": 0.01445977, + "auxiliary_loss_mlp": 0.00422986, + "balance_loss_clip": 1.1834147, + "balance_loss_mlp": 0.38364673, + "epoch": 0.46679693371411396, + "flos": 15814449469440.0, + "grad_norm": 5.520071961283715, + "language_loss": 0.85127318, + "learning_rate": 2.3108877392570146e-06, + "loss": 0.86996287, + "num_input_tokens_seen": 166686110, + "router_z_loss_clip": 2.62890625, + "router_z_loss_mlp": 0.39331055, + "step": 7764, + "time_per_iteration": 2.6861584186553955 + }, + { + "auxiliary_loss_clip": 0.0142958, + "auxiliary_loss_mlp": 0.00386154, + "balance_loss_clip": 1.18193245, + "balance_loss_mlp": 0.35122615, + "epoch": 0.4668570569667819, + "flos": 18514572687360.0, + "grad_norm": 43.410744560735786, + "language_loss": 0.77407646, + "learning_rate": 2.310503005696839e-06, + "loss": 0.79223382, + "num_input_tokens_seen": 166703930, + "router_z_loss_clip": 2.47460938, + "router_z_loss_mlp": 0.34912109, + "step": 7765, + "time_per_iteration": 2.656803607940674 + }, + { + "auxiliary_loss_clip": 0.01447154, + "auxiliary_loss_mlp": 0.00413662, + "balance_loss_clip": 1.18945527, + "balance_loss_mlp": 0.37606293, + "epoch": 0.4669171802194499, + "flos": 19206643006080.0, + "grad_norm": 3.034104927725337, + "language_loss": 0.84553504, + "learning_rate": 2.3101182603623576e-06, + "loss": 0.86414313, + "num_input_tokens_seen": 166719940, + "router_z_loss_clip": 2.578125, + "router_z_loss_mlp": 0.37597656, + "step": 7766, + "time_per_iteration": 2.67101788520813 + }, + { + "auxiliary_loss_clip": 0.01437279, + "auxiliary_loss_mlp": 0.0041344, + "balance_loss_clip": 1.18542719, + "balance_loss_mlp": 0.37586561, + "epoch": 0.46697730347211786, + "flos": 12276135406080.0, + "grad_norm": 195.1313756225157, + "language_loss": 0.71576643, + "learning_rate": 2.3097335032681607e-06, + "loss": 0.73427367, + "num_input_tokens_seen": 166738285, + "router_z_loss_clip": 2.51757812, + "router_z_loss_mlp": 0.37573242, + "step": 7767, + "time_per_iteration": 2.701338768005371 + }, + { + "auxiliary_loss_clip": 0.01441795, + "auxiliary_loss_mlp": 0.0040081, + "balance_loss_clip": 1.19129992, + "balance_loss_mlp": 0.3645463, + "epoch": 0.4670374267247858, + "flos": 23586739274880.0, + "grad_norm": 3.6127496931601932, + "language_loss": 0.81263363, + "learning_rate": 2.3093487344288393e-06, + "loss": 0.83105969, + "num_input_tokens_seen": 166758170, + "router_z_loss_clip": 2.50195312, + "router_z_loss_mlp": 0.36254883, + "step": 7768, + "time_per_iteration": 2.686739921569824 + }, + { + "auxiliary_loss_clip": 0.01438902, + "auxiliary_loss_mlp": 0.00368144, + "balance_loss_clip": 1.18219829, + "balance_loss_mlp": 0.33364481, + "epoch": 0.4670975499774538, + "flos": 15991093578240.0, + "grad_norm": 10.13788530213485, + "language_loss": 0.76795882, + "learning_rate": 2.308963953858982e-06, + "loss": 0.78602928, + "num_input_tokens_seen": 166775750, + "router_z_loss_clip": 2.56835938, + "router_z_loss_mlp": 0.3449707, + "step": 7769, + "time_per_iteration": 2.6551871299743652 + }, + { + "auxiliary_loss_clip": 0.01435051, + "auxiliary_loss_mlp": 0.00395389, + "balance_loss_clip": 1.18213272, + "balance_loss_mlp": 0.3588151, + "epoch": 0.46715767323012175, + "flos": 15377596260480.0, + "grad_norm": 7.485193737442467, + "language_loss": 0.87885875, + "learning_rate": 2.3085791615731803e-06, + "loss": 0.89716315, + "num_input_tokens_seen": 166791720, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.36547852, + "step": 7770, + "time_per_iteration": 2.6290132999420166 + }, + { + "auxiliary_loss_clip": 0.01295873, + "auxiliary_loss_mlp": 0.00044487, + "balance_loss_clip": 1.1279068, + "balance_loss_mlp": 0.03638045, + "epoch": 0.4672177964827897, + "flos": 60252217401600.0, + "grad_norm": 0.7820811156193783, + "language_loss": 0.55486512, + "learning_rate": 2.3081943575860265e-06, + "loss": 0.56826878, + "num_input_tokens_seen": 166856360, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.08105469, + "step": 7771, + "time_per_iteration": 3.2188403606414795 + }, + { + "auxiliary_loss_clip": 0.01441436, + "auxiliary_loss_mlp": 0.00398389, + "balance_loss_clip": 1.19142199, + "balance_loss_mlp": 0.36005154, + "epoch": 0.4672779197354577, + "flos": 27636134002560.0, + "grad_norm": 62.87245783590284, + "language_loss": 0.70592052, + "learning_rate": 2.3078095419121117e-06, + "loss": 0.7243188, + "num_input_tokens_seen": 166875925, + "router_z_loss_clip": 2.5, + "router_z_loss_mlp": 0.38330078, + "step": 7772, + "time_per_iteration": 2.7143852710723877 + }, + { + "auxiliary_loss_clip": 0.01438465, + "auxiliary_loss_mlp": 0.00385611, + "balance_loss_clip": 1.18908381, + "balance_loss_mlp": 0.34922856, + "epoch": 0.46733804298812565, + "flos": 31394257344000.0, + "grad_norm": 5.014852761136276, + "language_loss": 0.69705582, + "learning_rate": 2.3074247145660283e-06, + "loss": 0.71529663, + "num_input_tokens_seen": 166896520, + "router_z_loss_clip": 2.49414062, + "router_z_loss_mlp": 0.36401367, + "step": 7773, + "time_per_iteration": 2.7429630756378174 + }, + { + "auxiliary_loss_clip": 0.01437215, + "auxiliary_loss_mlp": 0.00401216, + "balance_loss_clip": 1.18265319, + "balance_loss_mlp": 0.36516708, + "epoch": 0.4673981662407936, + "flos": 19500607912320.0, + "grad_norm": 15.722193378764258, + "language_loss": 0.86925161, + "learning_rate": 2.3070398755623685e-06, + "loss": 0.88763589, + "num_input_tokens_seen": 166915370, + "router_z_loss_clip": 2.546875, + "router_z_loss_mlp": 0.36035156, + "step": 7774, + "time_per_iteration": 2.6383397579193115 + }, + { + "auxiliary_loss_clip": 0.01434598, + "auxiliary_loss_mlp": 0.00379592, + "balance_loss_clip": 1.1813693, + "balance_loss_mlp": 0.34394801, + "epoch": 0.4674582894934616, + "flos": 20521835487360.0, + "grad_norm": 37.70167498126349, + "language_loss": 0.85320711, + "learning_rate": 2.306655024915726e-06, + "loss": 0.8713491, + "num_input_tokens_seen": 166934875, + "router_z_loss_clip": 2.53320312, + "router_z_loss_mlp": 0.35644531, + "step": 7775, + "time_per_iteration": 2.793421983718872 + }, + { + "auxiliary_loss_clip": 0.01442875, + "auxiliary_loss_mlp": 0.00398003, + "balance_loss_clip": 1.18668818, + "balance_loss_mlp": 0.36064285, + "epoch": 0.46751841274612954, + "flos": 22090952188800.0, + "grad_norm": 45.3408244050632, + "language_loss": 0.76118636, + "learning_rate": 2.306270162640694e-06, + "loss": 0.77959514, + "num_input_tokens_seen": 166954285, + "router_z_loss_clip": 2.5625, + "router_z_loss_mlp": 0.3737793, + "step": 7776, + "time_per_iteration": 2.697019100189209 + }, + { + "auxiliary_loss_clip": 0.01444888, + "auxiliary_loss_mlp": 0.00427284, + "balance_loss_clip": 1.19362473, + "balance_loss_mlp": 0.39009118, + "epoch": 0.46757853599879756, + "flos": 26980082046720.0, + "grad_norm": 31.159758343867043, + "language_loss": 0.77547777, + "learning_rate": 2.3058852887518678e-06, + "loss": 0.79419947, + "num_input_tokens_seen": 166975975, + "router_z_loss_clip": 2.51171875, + "router_z_loss_mlp": 0.37182617, + "step": 7777, + "time_per_iteration": 2.697828531265259 + }, + { + "auxiliary_loss_clip": 0.01449977, + "auxiliary_loss_mlp": 0.00409575, + "balance_loss_clip": 1.19199443, + "balance_loss_mlp": 0.37283489, + "epoch": 0.4676386592514655, + "flos": 24134053783680.0, + "grad_norm": 5.687839213548536, + "language_loss": 0.77466881, + "learning_rate": 2.3055004032638394e-06, + "loss": 0.79326433, + "num_input_tokens_seen": 166996140, + "router_z_loss_clip": 2.578125, + "router_z_loss_mlp": 0.3671875, + "step": 7778, + "time_per_iteration": 2.680717945098877 + }, + { + "auxiliary_loss_clip": 0.014587, + "auxiliary_loss_mlp": 0.00419169, + "balance_loss_clip": 1.19759059, + "balance_loss_mlp": 0.38073617, + "epoch": 0.4676987825041335, + "flos": 25483720343040.0, + "grad_norm": 8.852565293322966, + "language_loss": 0.79187357, + "learning_rate": 2.305115506191206e-06, + "loss": 0.81065226, + "num_input_tokens_seen": 167016105, + "router_z_loss_clip": 2.61132812, + "router_z_loss_mlp": 0.38427734, + "step": 7779, + "time_per_iteration": 2.7090423107147217 + }, + { + "auxiliary_loss_clip": 0.01423804, + "auxiliary_loss_mlp": 0.00389681, + "balance_loss_clip": 1.17500973, + "balance_loss_mlp": 0.35551548, + "epoch": 0.46775890575680146, + "flos": 21945298538880.0, + "grad_norm": 10.61429086423236, + "language_loss": 0.78287548, + "learning_rate": 2.304730597548562e-06, + "loss": 0.80101031, + "num_input_tokens_seen": 167036185, + "router_z_loss_clip": 2.49023438, + "router_z_loss_mlp": 0.34155273, + "step": 7780, + "time_per_iteration": 2.683276414871216 + }, + { + "auxiliary_loss_clip": 0.01467227, + "auxiliary_loss_mlp": 0.00413456, + "balance_loss_clip": 1.20257473, + "balance_loss_mlp": 0.37504712, + "epoch": 0.4678190290094694, + "flos": 25228395492480.0, + "grad_norm": 2.0212249296481795, + "language_loss": 0.80101168, + "learning_rate": 2.3043456773505023e-06, + "loss": 0.81981856, + "num_input_tokens_seen": 167054515, + "router_z_loss_clip": 2.64648438, + "router_z_loss_mlp": 0.38427734, + "step": 7781, + "time_per_iteration": 2.789621591567993 + }, + { + "auxiliary_loss_clip": 0.01448525, + "auxiliary_loss_mlp": 0.00392187, + "balance_loss_clip": 1.19193625, + "balance_loss_mlp": 0.35432598, + "epoch": 0.4678791522621374, + "flos": 32268358811520.0, + "grad_norm": 10.961494141560085, + "language_loss": 0.69609505, + "learning_rate": 2.3039607456116252e-06, + "loss": 0.71450222, + "num_input_tokens_seen": 167077245, + "router_z_loss_clip": 2.56835938, + "router_z_loss_mlp": 0.37841797, + "step": 7782, + "time_per_iteration": 2.797297954559326 + }, + { + "auxiliary_loss_clip": 0.01461955, + "auxiliary_loss_mlp": 0.00456747, + "balance_loss_clip": 1.19847202, + "balance_loss_mlp": 0.41731259, + "epoch": 0.46793927551480535, + "flos": 27046480337280.0, + "grad_norm": 29.361322041357795, + "language_loss": 0.69158518, + "learning_rate": 2.3035758023465254e-06, + "loss": 0.71077216, + "num_input_tokens_seen": 167097235, + "router_z_loss_clip": 2.63671875, + "router_z_loss_mlp": 0.39428711, + "step": 7783, + "time_per_iteration": 2.7048096656799316 + }, + { + "auxiliary_loss_clip": 0.01466374, + "auxiliary_loss_mlp": 0.00435004, + "balance_loss_clip": 1.20336843, + "balance_loss_mlp": 0.39189839, + "epoch": 0.4679993987674733, + "flos": 17457398576640.0, + "grad_norm": 23.163898353038107, + "language_loss": 0.76820886, + "learning_rate": 2.303190847569801e-06, + "loss": 0.78722262, + "num_input_tokens_seen": 167113155, + "router_z_loss_clip": 2.63085938, + "router_z_loss_mlp": 0.43066406, + "step": 7784, + "time_per_iteration": 4.176246643066406 + }, + { + "auxiliary_loss_clip": 0.01447183, + "auxiliary_loss_mlp": 0.00408092, + "balance_loss_clip": 1.19166303, + "balance_loss_mlp": 0.37097073, + "epoch": 0.4680595220201413, + "flos": 17165121609600.0, + "grad_norm": 380.9269314595852, + "language_loss": 0.91167063, + "learning_rate": 2.3028058812960497e-06, + "loss": 0.93022335, + "num_input_tokens_seen": 167131765, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.37109375, + "step": 7785, + "time_per_iteration": 2.812243938446045 + }, + { + "auxiliary_loss_clip": 0.01456938, + "auxiliary_loss_mlp": 0.00411181, + "balance_loss_clip": 1.20009625, + "balance_loss_mlp": 0.37150824, + "epoch": 0.46811964527280925, + "flos": 11327591001600.0, + "grad_norm": 127.35502072328538, + "language_loss": 0.85427713, + "learning_rate": 2.3024209035398678e-06, + "loss": 0.8729583, + "num_input_tokens_seen": 167149030, + "router_z_loss_clip": 2.56640625, + "router_z_loss_mlp": 0.39697266, + "step": 7786, + "time_per_iteration": 2.6494979858398438 + }, + { + "auxiliary_loss_clip": 0.01440418, + "auxiliary_loss_mlp": 0.00405622, + "balance_loss_clip": 1.19006538, + "balance_loss_mlp": 0.36981148, + "epoch": 0.4681797685254772, + "flos": 24278809593600.0, + "grad_norm": 49.93888078885759, + "language_loss": 0.78925085, + "learning_rate": 2.302035914315856e-06, + "loss": 0.80771124, + "num_input_tokens_seen": 167167375, + "router_z_loss_clip": 2.50390625, + "router_z_loss_mlp": 0.35791016, + "step": 7787, + "time_per_iteration": 2.7078051567077637 + }, + { + "auxiliary_loss_clip": 0.01461955, + "auxiliary_loss_mlp": 0.00423475, + "balance_loss_clip": 1.2053175, + "balance_loss_mlp": 0.38532811, + "epoch": 0.4682398917781452, + "flos": 31650372293760.0, + "grad_norm": 4.805151459007983, + "language_loss": 0.70479012, + "learning_rate": 2.3016509136386116e-06, + "loss": 0.7236445, + "num_input_tokens_seen": 167188065, + "router_z_loss_clip": 2.56640625, + "router_z_loss_mlp": 0.3815918, + "step": 7788, + "time_per_iteration": 4.26589298248291 + }, + { + "auxiliary_loss_clip": 0.01457247, + "auxiliary_loss_mlp": 0.00429288, + "balance_loss_clip": 1.20086157, + "balance_loss_mlp": 0.391332, + "epoch": 0.46830001503081314, + "flos": 28110765340800.0, + "grad_norm": 6.217519944010808, + "language_loss": 0.70294929, + "learning_rate": 2.3012659015227343e-06, + "loss": 0.72181469, + "num_input_tokens_seen": 167209675, + "router_z_loss_clip": 2.56445312, + "router_z_loss_mlp": 0.37988281, + "step": 7789, + "time_per_iteration": 2.716214895248413 + }, + { + "auxiliary_loss_clip": 0.01288613, + "auxiliary_loss_mlp": 0.00147316, + "balance_loss_clip": 1.12986171, + "balance_loss_mlp": 0.13630083, + "epoch": 0.4683601382834811, + "flos": 57881718316800.0, + "grad_norm": 0.6970365995358444, + "language_loss": 0.61444992, + "learning_rate": 2.300880877982825e-06, + "loss": 0.62880915, + "num_input_tokens_seen": 167273940, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.11035156, + "step": 7790, + "time_per_iteration": 4.664637088775635 + }, + { + "auxiliary_loss_clip": 0.01458709, + "auxiliary_loss_mlp": 0.00388147, + "balance_loss_clip": 1.20294881, + "balance_loss_mlp": 0.35190761, + "epoch": 0.46842026153614913, + "flos": 21871933009920.0, + "grad_norm": 4.415124988310101, + "language_loss": 0.83883727, + "learning_rate": 2.3004958430334808e-06, + "loss": 0.85730588, + "num_input_tokens_seen": 167292730, + "router_z_loss_clip": 2.55664062, + "router_z_loss_mlp": 0.36206055, + "step": 7791, + "time_per_iteration": 2.6695494651794434 + }, + { + "auxiliary_loss_clip": 0.01452593, + "auxiliary_loss_mlp": 0.00400854, + "balance_loss_clip": 1.19764388, + "balance_loss_mlp": 0.36544865, + "epoch": 0.4684803847888171, + "flos": 24900818434560.0, + "grad_norm": 38.048629735113124, + "language_loss": 0.81327927, + "learning_rate": 2.3001107966893052e-06, + "loss": 0.83181369, + "num_input_tokens_seen": 167313460, + "router_z_loss_clip": 2.546875, + "router_z_loss_mlp": 0.35400391, + "step": 7792, + "time_per_iteration": 2.6945934295654297 + }, + { + "auxiliary_loss_clip": 0.01446917, + "auxiliary_loss_mlp": 0.00401942, + "balance_loss_clip": 1.19537163, + "balance_loss_mlp": 0.36434329, + "epoch": 0.46854050804148506, + "flos": 26251670142720.0, + "grad_norm": 29.73433768821562, + "language_loss": 0.71724916, + "learning_rate": 2.299725738964898e-06, + "loss": 0.7357378, + "num_input_tokens_seen": 167335385, + "router_z_loss_clip": 2.51757812, + "router_z_loss_mlp": 0.3762207, + "step": 7793, + "time_per_iteration": 2.7234768867492676 + }, + { + "auxiliary_loss_clip": 0.0145597, + "auxiliary_loss_mlp": 0.00432984, + "balance_loss_clip": 1.20012641, + "balance_loss_mlp": 0.39393115, + "epoch": 0.468600631294153, + "flos": 21579799697280.0, + "grad_norm": 4.0867378310414315, + "language_loss": 0.7821275, + "learning_rate": 2.2993406698748607e-06, + "loss": 0.80101705, + "num_input_tokens_seen": 167353625, + "router_z_loss_clip": 2.55664062, + "router_z_loss_mlp": 0.39013672, + "step": 7794, + "time_per_iteration": 2.6995718479156494 + }, + { + "auxiliary_loss_clip": 0.01474568, + "auxiliary_loss_mlp": 0.00466951, + "balance_loss_clip": 1.21319938, + "balance_loss_mlp": 0.42732573, + "epoch": 0.468660754546821, + "flos": 25885632597120.0, + "grad_norm": 31.028921765185693, + "language_loss": 0.7056154, + "learning_rate": 2.2989555894337953e-06, + "loss": 0.72503066, + "num_input_tokens_seen": 167374565, + "router_z_loss_clip": 2.61328125, + "router_z_loss_mlp": 0.39624023, + "step": 7795, + "time_per_iteration": 4.149586200714111 + }, + { + "auxiliary_loss_clip": 0.01466171, + "auxiliary_loss_mlp": 0.0042924, + "balance_loss_clip": 1.20547867, + "balance_loss_mlp": 0.39059252, + "epoch": 0.46872087779948896, + "flos": 35475001666560.0, + "grad_norm": 29.374960783805825, + "language_loss": 0.7326628, + "learning_rate": 2.298570497656304e-06, + "loss": 0.75161695, + "num_input_tokens_seen": 167395010, + "router_z_loss_clip": 2.609375, + "router_z_loss_mlp": 0.38671875, + "step": 7796, + "time_per_iteration": 2.8017351627349854 + }, + { + "auxiliary_loss_clip": 0.01467745, + "auxiliary_loss_mlp": 0.00467562, + "balance_loss_clip": 1.20361006, + "balance_loss_mlp": 0.42710286, + "epoch": 0.4687810010521569, + "flos": 26396425952640.0, + "grad_norm": 10.2453104258466, + "language_loss": 0.75886524, + "learning_rate": 2.2981853945569894e-06, + "loss": 0.77821839, + "num_input_tokens_seen": 167415285, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.40454102, + "step": 7797, + "time_per_iteration": 2.7213380336761475 + }, + { + "auxiliary_loss_clip": 0.01490686, + "auxiliary_loss_mlp": 0.00462925, + "balance_loss_clip": 1.21982265, + "balance_loss_mlp": 0.4194133, + "epoch": 0.4688411243048249, + "flos": 19972761212160.0, + "grad_norm": 14.799809232309157, + "language_loss": 0.73591065, + "learning_rate": 2.297800280150454e-06, + "loss": 0.75544667, + "num_input_tokens_seen": 167432405, + "router_z_loss_clip": 2.70703125, + "router_z_loss_mlp": 0.43481445, + "step": 7798, + "time_per_iteration": 2.6945462226867676 + }, + { + "auxiliary_loss_clip": 0.01278313, + "auxiliary_loss_mlp": 0.00113203, + "balance_loss_clip": 1.11821961, + "balance_loss_mlp": 0.10204468, + "epoch": 0.46890124755749285, + "flos": 63977015900160.0, + "grad_norm": 0.9289118476299877, + "language_loss": 0.64162815, + "learning_rate": 2.2974151544513033e-06, + "loss": 0.65554321, + "num_input_tokens_seen": 167499365, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.11181641, + "step": 7799, + "time_per_iteration": 3.3190603256225586 + }, + { + "auxiliary_loss_clip": 0.01463257, + "auxiliary_loss_mlp": 0.00425281, + "balance_loss_clip": 1.20286858, + "balance_loss_mlp": 0.38541722, + "epoch": 0.4689613708101608, + "flos": 23768985905280.0, + "grad_norm": 2.529461011174724, + "language_loss": 0.77506757, + "learning_rate": 2.2970300174741395e-06, + "loss": 0.79395294, + "num_input_tokens_seen": 167520390, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.3984375, + "step": 7800, + "time_per_iteration": 2.7100372314453125 + }, + { + "auxiliary_loss_clip": 0.01455854, + "auxiliary_loss_mlp": 0.00424466, + "balance_loss_clip": 1.20162296, + "balance_loss_mlp": 0.38789308, + "epoch": 0.4690214940628288, + "flos": 24788705109120.0, + "grad_norm": 25.56981776709766, + "language_loss": 0.78713387, + "learning_rate": 2.296644869233568e-06, + "loss": 0.80593705, + "num_input_tokens_seen": 167539865, + "router_z_loss_clip": 2.54492188, + "router_z_loss_mlp": 0.36547852, + "step": 7801, + "time_per_iteration": 2.7636430263519287 + }, + { + "auxiliary_loss_clip": 0.01479692, + "auxiliary_loss_mlp": 0.00427, + "balance_loss_clip": 1.20791423, + "balance_loss_mlp": 0.38725555, + "epoch": 0.46908161731549675, + "flos": 18077324428800.0, + "grad_norm": 3.9076857218018906, + "language_loss": 0.73116112, + "learning_rate": 2.2962597097441936e-06, + "loss": 0.75022805, + "num_input_tokens_seen": 167558190, + "router_z_loss_clip": 2.71875, + "router_z_loss_mlp": 0.39770508, + "step": 7802, + "time_per_iteration": 2.6927106380462646 + }, + { + "auxiliary_loss_clip": 0.01467789, + "auxiliary_loss_mlp": 0.00414002, + "balance_loss_clip": 1.20811868, + "balance_loss_mlp": 0.37571204, + "epoch": 0.4691417405681647, + "flos": 25703350053120.0, + "grad_norm": 12.859648919675832, + "language_loss": 0.79194784, + "learning_rate": 2.2958745390206206e-06, + "loss": 0.81076574, + "num_input_tokens_seen": 167577685, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.38330078, + "step": 7803, + "time_per_iteration": 2.7394142150878906 + }, + { + "auxiliary_loss_clip": 0.0146441, + "auxiliary_loss_mlp": 0.00436413, + "balance_loss_clip": 1.20313883, + "balance_loss_mlp": 0.39857548, + "epoch": 0.46920186382083273, + "flos": 17457039440640.0, + "grad_norm": 4.9768416944098055, + "language_loss": 0.83656478, + "learning_rate": 2.2954893570774558e-06, + "loss": 0.855573, + "num_input_tokens_seen": 167596390, + "router_z_loss_clip": 2.61328125, + "router_z_loss_mlp": 0.37817383, + "step": 7804, + "time_per_iteration": 2.717755079269409 + }, + { + "auxiliary_loss_clip": 0.01467098, + "auxiliary_loss_mlp": 0.00448371, + "balance_loss_clip": 1.21070242, + "balance_loss_mlp": 0.40700534, + "epoch": 0.4692619870735007, + "flos": 20339445202560.0, + "grad_norm": 140.2737372182612, + "language_loss": 0.83204317, + "learning_rate": 2.295104163929305e-06, + "loss": 0.85119784, + "num_input_tokens_seen": 167614980, + "router_z_loss_clip": 2.5625, + "router_z_loss_mlp": 0.41357422, + "step": 7805, + "time_per_iteration": 2.708989381790161 + }, + { + "auxiliary_loss_clip": 0.01479283, + "auxiliary_loss_mlp": 0.00452569, + "balance_loss_clip": 1.20636785, + "balance_loss_mlp": 0.40922433, + "epoch": 0.46932211032616866, + "flos": 29496558003840.0, + "grad_norm": 41.69797523629519, + "language_loss": 0.88733947, + "learning_rate": 2.2947189595907742e-06, + "loss": 0.90665793, + "num_input_tokens_seen": 167635895, + "router_z_loss_clip": 2.734375, + "router_z_loss_mlp": 0.43334961, + "step": 7806, + "time_per_iteration": 2.8007290363311768 + }, + { + "auxiliary_loss_clip": 0.01471387, + "auxiliary_loss_mlp": 0.00428697, + "balance_loss_clip": 1.20839, + "balance_loss_mlp": 0.39000127, + "epoch": 0.4693822335788366, + "flos": 36211242735360.0, + "grad_norm": 84.22850946247445, + "language_loss": 0.84398204, + "learning_rate": 2.294333744076472e-06, + "loss": 0.86298287, + "num_input_tokens_seen": 167657440, + "router_z_loss_clip": 2.62695312, + "router_z_loss_mlp": 0.38720703, + "step": 7807, + "time_per_iteration": 3.0040881633758545 + }, + { + "auxiliary_loss_clip": 0.01489142, + "auxiliary_loss_mlp": 0.00454694, + "balance_loss_clip": 1.22140002, + "balance_loss_mlp": 0.41463992, + "epoch": 0.4694423568315046, + "flos": 20338978325760.0, + "grad_norm": 59.451023457042595, + "language_loss": 0.56449503, + "learning_rate": 2.2939485174010035e-06, + "loss": 0.58393335, + "num_input_tokens_seen": 167675025, + "router_z_loss_clip": 2.6796875, + "router_z_loss_mlp": 0.40063477, + "step": 7808, + "time_per_iteration": 2.9206793308258057 + }, + { + "auxiliary_loss_clip": 0.01239328, + "auxiliary_loss_mlp": 0.0015407, + "balance_loss_clip": 1.08474624, + "balance_loss_mlp": 0.14438999, + "epoch": 0.46950248008417256, + "flos": 64326353621760.0, + "grad_norm": 0.8740502522665015, + "language_loss": 0.57437456, + "learning_rate": 2.293563279578978e-06, + "loss": 0.58830857, + "num_input_tokens_seen": 167729635, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.09667969, + "step": 7809, + "time_per_iteration": 3.0720913410186768 + }, + { + "auxiliary_loss_clip": 0.01489524, + "auxiliary_loss_mlp": 0.00435648, + "balance_loss_clip": 1.2147572, + "balance_loss_mlp": 0.39585608, + "epoch": 0.4695626033368405, + "flos": 19200106730880.0, + "grad_norm": 41.20605232510803, + "language_loss": 0.78639609, + "learning_rate": 2.2931780306250045e-06, + "loss": 0.80564779, + "num_input_tokens_seen": 167745135, + "router_z_loss_clip": 2.74804688, + "router_z_loss_mlp": 0.39770508, + "step": 7810, + "time_per_iteration": 2.717860698699951 + }, + { + "auxiliary_loss_clip": 0.01476983, + "auxiliary_loss_mlp": 0.00409289, + "balance_loss_clip": 1.21154857, + "balance_loss_mlp": 0.37273934, + "epoch": 0.4696227265895085, + "flos": 23002436736000.0, + "grad_norm": 6.884402657050895, + "language_loss": 0.87954104, + "learning_rate": 2.29279277055369e-06, + "loss": 0.89840376, + "num_input_tokens_seen": 167763875, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.3659668, + "step": 7811, + "time_per_iteration": 2.684976577758789 + }, + { + "auxiliary_loss_clip": 0.01475807, + "auxiliary_loss_mlp": 0.00423594, + "balance_loss_clip": 1.21359849, + "balance_loss_mlp": 0.38692558, + "epoch": 0.46968284984217645, + "flos": 21870855601920.0, + "grad_norm": 73.86186748629362, + "language_loss": 0.85178673, + "learning_rate": 2.292407499379644e-06, + "loss": 0.87078083, + "num_input_tokens_seen": 167784895, + "router_z_loss_clip": 2.62109375, + "router_z_loss_mlp": 0.36669922, + "step": 7812, + "time_per_iteration": 2.7008771896362305 + }, + { + "auxiliary_loss_clip": 0.01455417, + "auxiliary_loss_mlp": 0.00432569, + "balance_loss_clip": 1.20173693, + "balance_loss_mlp": 0.39265794, + "epoch": 0.4697429730948444, + "flos": 19974987855360.0, + "grad_norm": 96.69028869836833, + "language_loss": 0.80835104, + "learning_rate": 2.292022217117477e-06, + "loss": 0.82723081, + "num_input_tokens_seen": 167803185, + "router_z_loss_clip": 2.53710938, + "router_z_loss_mlp": 0.39916992, + "step": 7813, + "time_per_iteration": 2.647787570953369 + }, + { + "auxiliary_loss_clip": 0.01464689, + "auxiliary_loss_mlp": 0.00399455, + "balance_loss_clip": 1.20472288, + "balance_loss_mlp": 0.36421651, + "epoch": 0.4698030963475124, + "flos": 15156206784000.0, + "grad_norm": 66.36554249266958, + "language_loss": 0.89311385, + "learning_rate": 2.291636923781798e-06, + "loss": 0.91175532, + "num_input_tokens_seen": 167816550, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.35253906, + "step": 7814, + "time_per_iteration": 2.644508123397827 + }, + { + "auxiliary_loss_clip": 0.01446771, + "auxiliary_loss_mlp": 0.0040536, + "balance_loss_clip": 1.19332361, + "balance_loss_mlp": 0.36819023, + "epoch": 0.46986321960018035, + "flos": 15151178880000.0, + "grad_norm": 25.39385611547622, + "language_loss": 0.86834002, + "learning_rate": 2.291251619387217e-06, + "loss": 0.88686126, + "num_input_tokens_seen": 167831845, + "router_z_loss_clip": 2.53320312, + "router_z_loss_mlp": 0.37158203, + "step": 7815, + "time_per_iteration": 2.606029510498047 + }, + { + "auxiliary_loss_clip": 0.014432, + "auxiliary_loss_mlp": 0.00413023, + "balance_loss_clip": 1.18846178, + "balance_loss_mlp": 0.37489983, + "epoch": 0.4699233428528483, + "flos": 23108911626240.0, + "grad_norm": 171.4972963373993, + "language_loss": 0.8484109, + "learning_rate": 2.2908663039483468e-06, + "loss": 0.86697316, + "num_input_tokens_seen": 167850360, + "router_z_loss_clip": 2.546875, + "router_z_loss_mlp": 0.38110352, + "step": 7816, + "time_per_iteration": 2.6986279487609863 + }, + { + "auxiliary_loss_clip": 0.01230474, + "auxiliary_loss_mlp": 0.00110387, + "balance_loss_clip": 1.07830179, + "balance_loss_mlp": 0.10175604, + "epoch": 0.46998346610551633, + "flos": 68105558246400.0, + "grad_norm": 1.0338634183287236, + "language_loss": 0.58002102, + "learning_rate": 2.290480977479796e-06, + "loss": 0.59342957, + "num_input_tokens_seen": 167908660, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.08642578, + "step": 7817, + "time_per_iteration": 3.224261522293091 + }, + { + "auxiliary_loss_clip": 0.01451094, + "auxiliary_loss_mlp": 0.00411507, + "balance_loss_clip": 1.20063722, + "balance_loss_mlp": 0.3750295, + "epoch": 0.4700435893581843, + "flos": 24129456842880.0, + "grad_norm": 18.55474279317209, + "language_loss": 0.85427386, + "learning_rate": 2.2900956399961775e-06, + "loss": 0.87289989, + "num_input_tokens_seen": 167927905, + "router_z_loss_clip": 2.50195312, + "router_z_loss_mlp": 0.36499023, + "step": 7818, + "time_per_iteration": 2.7036659717559814 + }, + { + "auxiliary_loss_clip": 0.01457641, + "auxiliary_loss_mlp": 0.00388552, + "balance_loss_clip": 1.20213056, + "balance_loss_mlp": 0.35112035, + "epoch": 0.47010371261085226, + "flos": 20150518642560.0, + "grad_norm": 11.470100991415269, + "language_loss": 0.89259744, + "learning_rate": 2.289710291512104e-06, + "loss": 0.91105938, + "num_input_tokens_seen": 167945995, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.37402344, + "step": 7819, + "time_per_iteration": 2.664466381072998 + }, + { + "auxiliary_loss_clip": 0.01470309, + "auxiliary_loss_mlp": 0.00435723, + "balance_loss_clip": 1.20547223, + "balance_loss_mlp": 0.39371318, + "epoch": 0.47016383586352023, + "flos": 15122199582720.0, + "grad_norm": 38.385219327831905, + "language_loss": 0.8493849, + "learning_rate": 2.289324932042186e-06, + "loss": 0.86844522, + "num_input_tokens_seen": 167963380, + "router_z_loss_clip": 2.64648438, + "router_z_loss_mlp": 0.42041016, + "step": 7820, + "time_per_iteration": 2.618699789047241 + }, + { + "auxiliary_loss_clip": 0.01463591, + "auxiliary_loss_mlp": 0.00405899, + "balance_loss_clip": 1.207775, + "balance_loss_mlp": 0.36858615, + "epoch": 0.4702239591161882, + "flos": 13552975140480.0, + "grad_norm": 12.493897816419464, + "language_loss": 0.80470532, + "learning_rate": 2.288939561601039e-06, + "loss": 0.8234002, + "num_input_tokens_seen": 167981740, + "router_z_loss_clip": 2.56054688, + "router_z_loss_mlp": 0.37304688, + "step": 7821, + "time_per_iteration": 2.643613338470459 + }, + { + "auxiliary_loss_clip": 0.01443891, + "auxiliary_loss_mlp": 0.00413872, + "balance_loss_clip": 1.19509792, + "balance_loss_mlp": 0.37601137, + "epoch": 0.47028408236885616, + "flos": 24276511123200.0, + "grad_norm": 4.180727848891327, + "language_loss": 0.93171835, + "learning_rate": 2.2885541802032746e-06, + "loss": 0.95029598, + "num_input_tokens_seen": 167999380, + "router_z_loss_clip": 2.48632812, + "router_z_loss_mlp": 0.37866211, + "step": 7822, + "time_per_iteration": 2.7382562160491943 + }, + { + "auxiliary_loss_clip": 0.01465961, + "auxiliary_loss_mlp": 0.00371088, + "balance_loss_clip": 1.21037519, + "balance_loss_mlp": 0.3367084, + "epoch": 0.4703442056215241, + "flos": 22856926740480.0, + "grad_norm": 3.7481412917293846, + "language_loss": 0.8566274, + "learning_rate": 2.2881687878635055e-06, + "loss": 0.87499791, + "num_input_tokens_seen": 168018395, + "router_z_loss_clip": 2.55859375, + "router_z_loss_mlp": 0.34375, + "step": 7823, + "time_per_iteration": 2.7065749168395996 + }, + { + "auxiliary_loss_clip": 0.01250706, + "auxiliary_loss_mlp": 0.00061239, + "balance_loss_clip": 1.10268962, + "balance_loss_mlp": 0.05236965, + "epoch": 0.4704043288741921, + "flos": 69240227950080.0, + "grad_norm": 0.6825453663908594, + "language_loss": 0.56216407, + "learning_rate": 2.2877833845963487e-06, + "loss": 0.57528353, + "num_input_tokens_seen": 168084080, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.08886719, + "step": 7824, + "time_per_iteration": 3.268352746963501 + }, + { + "auxiliary_loss_clip": 0.01477774, + "auxiliary_loss_mlp": 0.00391941, + "balance_loss_clip": 1.21764517, + "balance_loss_mlp": 0.35589242, + "epoch": 0.47046445212686006, + "flos": 18041090584320.0, + "grad_norm": 15.058240774090676, + "language_loss": 0.87436771, + "learning_rate": 2.2873979704164157e-06, + "loss": 0.89306486, + "num_input_tokens_seen": 168101555, + "router_z_loss_clip": 2.59960938, + "router_z_loss_mlp": 0.36035156, + "step": 7825, + "time_per_iteration": 2.649292230606079 + }, + { + "auxiliary_loss_clip": 0.01475621, + "auxiliary_loss_mlp": 0.00413752, + "balance_loss_clip": 1.2114749, + "balance_loss_mlp": 0.37503269, + "epoch": 0.470524575379528, + "flos": 23951448017280.0, + "grad_norm": 6.581030112453944, + "language_loss": 0.74239069, + "learning_rate": 2.287012545338324e-06, + "loss": 0.76128447, + "num_input_tokens_seen": 168121530, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.38720703, + "step": 7826, + "time_per_iteration": 2.6737594604492188 + }, + { + "auxiliary_loss_clip": 0.01471336, + "auxiliary_loss_mlp": 0.00399499, + "balance_loss_clip": 1.20991099, + "balance_loss_mlp": 0.35961205, + "epoch": 0.470584698632196, + "flos": 18113558273280.0, + "grad_norm": 37.18756326212194, + "language_loss": 0.89406043, + "learning_rate": 2.2866271093766877e-06, + "loss": 0.91276878, + "num_input_tokens_seen": 168140335, + "router_z_loss_clip": 2.6171875, + "router_z_loss_mlp": 0.39892578, + "step": 7827, + "time_per_iteration": 4.03376579284668 + }, + { + "auxiliary_loss_clip": 0.01253844, + "auxiliary_loss_mlp": 0.00087277, + "balance_loss_clip": 1.10993648, + "balance_loss_mlp": 0.07831293, + "epoch": 0.47064482188486395, + "flos": 57251916224640.0, + "grad_norm": 0.8062469004772751, + "language_loss": 0.55372816, + "learning_rate": 2.286241662546122e-06, + "loss": 0.56713939, + "num_input_tokens_seen": 168200535, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.08984375, + "step": 7828, + "time_per_iteration": 3.159149408340454 + }, + { + "auxiliary_loss_clip": 0.01459787, + "auxiliary_loss_mlp": 0.00411742, + "balance_loss_clip": 1.20705557, + "balance_loss_mlp": 0.37264138, + "epoch": 0.4707049451375319, + "flos": 17895077798400.0, + "grad_norm": 6.950037041245772, + "language_loss": 0.86779493, + "learning_rate": 2.285856204861245e-06, + "loss": 0.88651025, + "num_input_tokens_seen": 168219610, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.39086914, + "step": 7829, + "time_per_iteration": 2.627861499786377 + }, + { + "auxiliary_loss_clip": 0.01466039, + "auxiliary_loss_mlp": 0.00409418, + "balance_loss_clip": 1.2092725, + "balance_loss_mlp": 0.3726064, + "epoch": 0.47076506839019994, + "flos": 25232669210880.0, + "grad_norm": 5.50269296828369, + "language_loss": 0.79020333, + "learning_rate": 2.2854707363366703e-06, + "loss": 0.80895787, + "num_input_tokens_seen": 168242505, + "router_z_loss_clip": 2.56640625, + "router_z_loss_mlp": 0.36791992, + "step": 7830, + "time_per_iteration": 4.214278936386108 + }, + { + "auxiliary_loss_clip": 0.01472202, + "auxiliary_loss_mlp": 0.00404024, + "balance_loss_clip": 1.22101021, + "balance_loss_mlp": 0.36821377, + "epoch": 0.4708251916428679, + "flos": 13479681438720.0, + "grad_norm": 8.719441195136728, + "language_loss": 0.84859645, + "learning_rate": 2.2850852569870177e-06, + "loss": 0.86735868, + "num_input_tokens_seen": 168260220, + "router_z_loss_clip": 2.515625, + "router_z_loss_mlp": 0.35839844, + "step": 7831, + "time_per_iteration": 2.667121171951294 + }, + { + "auxiliary_loss_clip": 0.01493044, + "auxiliary_loss_mlp": 0.00483011, + "balance_loss_clip": 1.21996951, + "balance_loss_mlp": 0.43887937, + "epoch": 0.47088531489553587, + "flos": 30147833450880.0, + "grad_norm": 7.631637862719774, + "language_loss": 0.80693394, + "learning_rate": 2.2846997668269033e-06, + "loss": 0.82669449, + "num_input_tokens_seen": 168277360, + "router_z_loss_clip": 2.72851562, + "router_z_loss_mlp": 0.44140625, + "step": 7832, + "time_per_iteration": 2.728379487991333 + }, + { + "auxiliary_loss_clip": 0.01453342, + "auxiliary_loss_mlp": 0.0037502, + "balance_loss_clip": 1.20774412, + "balance_loss_mlp": 0.34218991, + "epoch": 0.47094543814820383, + "flos": 21798280172160.0, + "grad_norm": 12.210899881691363, + "language_loss": 0.79110014, + "learning_rate": 2.2843142658709454e-06, + "loss": 0.80938375, + "num_input_tokens_seen": 168296605, + "router_z_loss_clip": 2.45507812, + "router_z_loss_mlp": 0.328125, + "step": 7833, + "time_per_iteration": 4.078873157501221 + }, + { + "auxiliary_loss_clip": 0.01459894, + "auxiliary_loss_mlp": 0.00396677, + "balance_loss_clip": 1.21137357, + "balance_loss_mlp": 0.35931644, + "epoch": 0.4710055614008718, + "flos": 23003011353600.0, + "grad_norm": 43.94572495304055, + "language_loss": 0.82248807, + "learning_rate": 2.283928754133762e-06, + "loss": 0.84105378, + "num_input_tokens_seen": 168316205, + "router_z_loss_clip": 2.48632812, + "router_z_loss_mlp": 0.37353516, + "step": 7834, + "time_per_iteration": 2.6856770515441895 + }, + { + "auxiliary_loss_clip": 0.01477678, + "auxiliary_loss_mlp": 0.00401722, + "balance_loss_clip": 1.22396827, + "balance_loss_mlp": 0.36436194, + "epoch": 0.47106568465353976, + "flos": 42741346452480.0, + "grad_norm": 53.76987002827452, + "language_loss": 0.71289814, + "learning_rate": 2.283543231629972e-06, + "loss": 0.73169219, + "num_input_tokens_seen": 168338935, + "router_z_loss_clip": 2.5390625, + "router_z_loss_mlp": 0.37329102, + "step": 7835, + "time_per_iteration": 2.837968111038208 + }, + { + "auxiliary_loss_clip": 0.01274619, + "auxiliary_loss_mlp": 0.00108789, + "balance_loss_clip": 1.12391341, + "balance_loss_mlp": 0.09853655, + "epoch": 0.4711258079062077, + "flos": 68554008570240.0, + "grad_norm": 0.8561659686466754, + "language_loss": 0.62017202, + "learning_rate": 2.283157698374194e-06, + "loss": 0.63400614, + "num_input_tokens_seen": 168392800, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.10253906, + "step": 7836, + "time_per_iteration": 3.168658494949341 + }, + { + "auxiliary_loss_clip": 0.01488744, + "auxiliary_loss_mlp": 0.00421977, + "balance_loss_clip": 1.22162557, + "balance_loss_mlp": 0.38237557, + "epoch": 0.4711859311588757, + "flos": 25446588658560.0, + "grad_norm": 4.933600474170642, + "language_loss": 0.73476994, + "learning_rate": 2.2827721543810475e-06, + "loss": 0.75387716, + "num_input_tokens_seen": 168412940, + "router_z_loss_clip": 2.67382812, + "router_z_loss_mlp": 0.39599609, + "step": 7837, + "time_per_iteration": 4.160662889480591 + }, + { + "auxiliary_loss_clip": 0.01475035, + "auxiliary_loss_mlp": 0.00409985, + "balance_loss_clip": 1.21566963, + "balance_loss_mlp": 0.37100333, + "epoch": 0.47124605441154366, + "flos": 21981891519360.0, + "grad_norm": 4.028700044527988, + "language_loss": 0.72308433, + "learning_rate": 2.282386599665153e-06, + "loss": 0.74193448, + "num_input_tokens_seen": 168431995, + "router_z_loss_clip": 2.59570312, + "router_z_loss_mlp": 0.38964844, + "step": 7838, + "time_per_iteration": 2.6805319786071777 + }, + { + "auxiliary_loss_clip": 0.01492023, + "auxiliary_loss_mlp": 0.00433186, + "balance_loss_clip": 1.22733545, + "balance_loss_mlp": 0.39282158, + "epoch": 0.4713061776642116, + "flos": 25412689198080.0, + "grad_norm": 69.69954370379695, + "language_loss": 0.84886611, + "learning_rate": 2.2820010342411304e-06, + "loss": 0.86811817, + "num_input_tokens_seen": 168454585, + "router_z_loss_clip": 2.6484375, + "router_z_loss_mlp": 0.40332031, + "step": 7839, + "time_per_iteration": 2.734173536300659 + }, + { + "auxiliary_loss_clip": 0.01478798, + "auxiliary_loss_mlp": 0.00405964, + "balance_loss_clip": 1.22485983, + "balance_loss_mlp": 0.36917588, + "epoch": 0.4713663009168796, + "flos": 26542259170560.0, + "grad_norm": 175.4179782364303, + "language_loss": 0.79092836, + "learning_rate": 2.2816154581235993e-06, + "loss": 0.80977601, + "num_input_tokens_seen": 168471265, + "router_z_loss_clip": 2.54101562, + "router_z_loss_mlp": 0.36791992, + "step": 7840, + "time_per_iteration": 2.669721841812134 + }, + { + "auxiliary_loss_clip": 0.01469973, + "auxiliary_loss_mlp": 0.00425297, + "balance_loss_clip": 1.21136975, + "balance_loss_mlp": 0.3862682, + "epoch": 0.47142642416954755, + "flos": 23623583650560.0, + "grad_norm": 4.327510380941089, + "language_loss": 0.80823547, + "learning_rate": 2.2812298713271833e-06, + "loss": 0.82718813, + "num_input_tokens_seen": 168491360, + "router_z_loss_clip": 2.5859375, + "router_z_loss_mlp": 0.39038086, + "step": 7841, + "time_per_iteration": 2.686037302017212 + }, + { + "auxiliary_loss_clip": 0.01481939, + "auxiliary_loss_mlp": 0.00435994, + "balance_loss_clip": 1.22248888, + "balance_loss_mlp": 0.3955577, + "epoch": 0.4714865474222155, + "flos": 22310150935680.0, + "grad_norm": 16.181053244316942, + "language_loss": 0.76285231, + "learning_rate": 2.280844273866501e-06, + "loss": 0.7820316, + "num_input_tokens_seen": 168511335, + "router_z_loss_clip": 2.59570312, + "router_z_loss_mlp": 0.40429688, + "step": 7842, + "time_per_iteration": 2.6826794147491455 + }, + { + "auxiliary_loss_clip": 0.01486469, + "auxiliary_loss_mlp": 0.00417579, + "balance_loss_clip": 1.22961342, + "balance_loss_mlp": 0.37874073, + "epoch": 0.4715466706748835, + "flos": 17822430541440.0, + "grad_norm": 5.636944181580286, + "language_loss": 0.85315591, + "learning_rate": 2.280458665756177e-06, + "loss": 0.87219638, + "num_input_tokens_seen": 168529920, + "router_z_loss_clip": 2.56835938, + "router_z_loss_mlp": 0.38818359, + "step": 7843, + "time_per_iteration": 2.667193651199341 + }, + { + "auxiliary_loss_clip": 0.01486405, + "auxiliary_loss_mlp": 0.00399791, + "balance_loss_clip": 1.22614312, + "balance_loss_mlp": 0.36064303, + "epoch": 0.4716067939275515, + "flos": 23659530186240.0, + "grad_norm": 54.57608820045155, + "language_loss": 0.79478639, + "learning_rate": 2.280073047010832e-06, + "loss": 0.81364834, + "num_input_tokens_seen": 168550595, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.3918457, + "step": 7844, + "time_per_iteration": 2.7039074897766113 + }, + { + "auxiliary_loss_clip": 0.0146688, + "auxiliary_loss_mlp": 0.00433042, + "balance_loss_clip": 1.21186125, + "balance_loss_mlp": 0.39267731, + "epoch": 0.47166691718021947, + "flos": 17930162407680.0, + "grad_norm": 6.529173680573744, + "language_loss": 0.84033608, + "learning_rate": 2.279687417645088e-06, + "loss": 0.8593353, + "num_input_tokens_seen": 168569765, + "router_z_loss_clip": 2.55078125, + "router_z_loss_mlp": 0.40332031, + "step": 7845, + "time_per_iteration": 2.6972994804382324 + }, + { + "auxiliary_loss_clip": 0.01476425, + "auxiliary_loss_mlp": 0.00418516, + "balance_loss_clip": 1.21884727, + "balance_loss_mlp": 0.3802734, + "epoch": 0.47172704043288743, + "flos": 26614583205120.0, + "grad_norm": 39.63184368108871, + "language_loss": 0.76899159, + "learning_rate": 2.2793017776735703e-06, + "loss": 0.78794092, + "num_input_tokens_seen": 168591525, + "router_z_loss_clip": 2.57226562, + "router_z_loss_mlp": 0.38256836, + "step": 7846, + "time_per_iteration": 2.7766265869140625 + }, + { + "auxiliary_loss_clip": 0.01471575, + "auxiliary_loss_mlp": 0.00390783, + "balance_loss_clip": 1.22111511, + "balance_loss_mlp": 0.35156298, + "epoch": 0.4717871636855554, + "flos": 27922700707200.0, + "grad_norm": 1.9007371296621496, + "language_loss": 0.78903252, + "learning_rate": 2.2789161271109e-06, + "loss": 0.80765611, + "num_input_tokens_seen": 168611235, + "router_z_loss_clip": 2.50390625, + "router_z_loss_mlp": 0.3918457, + "step": 7847, + "time_per_iteration": 2.7280147075653076 + }, + { + "auxiliary_loss_clip": 0.01471769, + "auxiliary_loss_mlp": 0.00418062, + "balance_loss_clip": 1.21799815, + "balance_loss_mlp": 0.37962919, + "epoch": 0.47184728693822336, + "flos": 14502237816960.0, + "grad_norm": 4.112456381486064, + "language_loss": 0.86687797, + "learning_rate": 2.278530465971703e-06, + "loss": 0.88577628, + "num_input_tokens_seen": 168628710, + "router_z_loss_clip": 2.5390625, + "router_z_loss_mlp": 0.38427734, + "step": 7848, + "time_per_iteration": 2.6743545532226562 + }, + { + "auxiliary_loss_clip": 0.01493329, + "auxiliary_loss_mlp": 0.00430927, + "balance_loss_clip": 1.23699069, + "balance_loss_mlp": 0.39256537, + "epoch": 0.47190741019089133, + "flos": 17856545483520.0, + "grad_norm": 8.832091236981126, + "language_loss": 0.77561486, + "learning_rate": 2.2781447942706032e-06, + "loss": 0.79485738, + "num_input_tokens_seen": 168645645, + "router_z_loss_clip": 2.5625, + "router_z_loss_mlp": 0.38378906, + "step": 7849, + "time_per_iteration": 2.6810929775238037 + }, + { + "auxiliary_loss_clip": 0.01495149, + "auxiliary_loss_mlp": 0.00446623, + "balance_loss_clip": 1.23112261, + "balance_loss_mlp": 0.4056623, + "epoch": 0.4719675334435593, + "flos": 17895472848000.0, + "grad_norm": 176.22499556089, + "language_loss": 0.76948887, + "learning_rate": 2.277759112022224e-06, + "loss": 0.78890657, + "num_input_tokens_seen": 168664165, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.40942383, + "step": 7850, + "time_per_iteration": 2.738771438598633 + }, + { + "auxiliary_loss_clip": 0.014916, + "auxiliary_loss_mlp": 0.00420819, + "balance_loss_clip": 1.23039484, + "balance_loss_mlp": 0.38188541, + "epoch": 0.47202765669622726, + "flos": 20704369426560.0, + "grad_norm": 10.58953269120672, + "language_loss": 0.79338235, + "learning_rate": 2.2773734192411916e-06, + "loss": 0.8125065, + "num_input_tokens_seen": 168681940, + "router_z_loss_clip": 2.61328125, + "router_z_loss_mlp": 0.38916016, + "step": 7851, + "time_per_iteration": 2.69732666015625 + }, + { + "auxiliary_loss_clip": 0.0147482, + "auxiliary_loss_mlp": 0.00426612, + "balance_loss_clip": 1.21645451, + "balance_loss_mlp": 0.38672429, + "epoch": 0.4720877799488952, + "flos": 16360255607040.0, + "grad_norm": 5.3629664942877255, + "language_loss": 0.82682025, + "learning_rate": 2.276987715942132e-06, + "loss": 0.84583461, + "num_input_tokens_seen": 168698830, + "router_z_loss_clip": 2.58007812, + "router_z_loss_mlp": 0.39868164, + "step": 7852, + "time_per_iteration": 2.7819268703460693 + }, + { + "auxiliary_loss_clip": 0.01465914, + "auxiliary_loss_mlp": 0.00420155, + "balance_loss_clip": 1.21467555, + "balance_loss_mlp": 0.38339102, + "epoch": 0.4721479032015632, + "flos": 20668171495680.0, + "grad_norm": 57.898719709826224, + "language_loss": 0.74537647, + "learning_rate": 2.2766020021396696e-06, + "loss": 0.76423717, + "num_input_tokens_seen": 168718305, + "router_z_loss_clip": 2.51367188, + "router_z_loss_mlp": 0.36767578, + "step": 7853, + "time_per_iteration": 2.705368995666504 + }, + { + "auxiliary_loss_clip": 0.01257695, + "auxiliary_loss_mlp": 0.00103313, + "balance_loss_clip": 1.11344647, + "balance_loss_mlp": 0.09449109, + "epoch": 0.47220802645423116, + "flos": 67750438435200.0, + "grad_norm": 0.6795151251061062, + "language_loss": 0.50047308, + "learning_rate": 2.276216277848432e-06, + "loss": 0.51408315, + "num_input_tokens_seen": 168782365, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.08837891, + "step": 7854, + "time_per_iteration": 3.308317184448242 + }, + { + "auxiliary_loss_clip": 0.01484936, + "auxiliary_loss_mlp": 0.00402665, + "balance_loss_clip": 1.22325873, + "balance_loss_mlp": 0.3640888, + "epoch": 0.4722681497068991, + "flos": 20921449271040.0, + "grad_norm": 29.389327931366466, + "language_loss": 0.70024347, + "learning_rate": 2.2758305430830455e-06, + "loss": 0.71911943, + "num_input_tokens_seen": 168800485, + "router_z_loss_clip": 2.61914062, + "router_z_loss_mlp": 0.38598633, + "step": 7855, + "time_per_iteration": 2.675990343093872 + }, + { + "auxiliary_loss_clip": 0.01477858, + "auxiliary_loss_mlp": 0.0039172, + "balance_loss_clip": 1.22028363, + "balance_loss_mlp": 0.35374039, + "epoch": 0.4723282729595671, + "flos": 28293083798400.0, + "grad_norm": 40.889235721719835, + "language_loss": 0.82916343, + "learning_rate": 2.2754447978581376e-06, + "loss": 0.8478592, + "num_input_tokens_seen": 168818965, + "router_z_loss_clip": 2.57421875, + "router_z_loss_mlp": 0.37988281, + "step": 7856, + "time_per_iteration": 2.7275588512420654 + }, + { + "auxiliary_loss_clip": 0.01480281, + "auxiliary_loss_mlp": 0.0037046, + "balance_loss_clip": 1.22765267, + "balance_loss_mlp": 0.33495915, + "epoch": 0.4723883962122351, + "flos": 27125053338240.0, + "grad_norm": 5.7225511525833666, + "language_loss": 0.80273378, + "learning_rate": 2.2750590421883347e-06, + "loss": 0.82124114, + "num_input_tokens_seen": 168840355, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.35522461, + "step": 7857, + "time_per_iteration": 2.6991918087005615 + }, + { + "auxiliary_loss_clip": 0.0147373, + "auxiliary_loss_mlp": 0.00387299, + "balance_loss_clip": 1.21912432, + "balance_loss_mlp": 0.35344389, + "epoch": 0.47244851946490307, + "flos": 31537253387520.0, + "grad_norm": 22.685355767284324, + "language_loss": 0.69565558, + "learning_rate": 2.2746732760882655e-06, + "loss": 0.71426588, + "num_input_tokens_seen": 168861765, + "router_z_loss_clip": 2.54492188, + "router_z_loss_mlp": 0.33837891, + "step": 7858, + "time_per_iteration": 2.7661001682281494 + }, + { + "auxiliary_loss_clip": 0.0146903, + "auxiliary_loss_mlp": 0.00366781, + "balance_loss_clip": 1.21758628, + "balance_loss_mlp": 0.330971, + "epoch": 0.47250864271757104, + "flos": 20886544229760.0, + "grad_norm": 49.63542552966917, + "language_loss": 0.769741, + "learning_rate": 2.2742874995725575e-06, + "loss": 0.78809911, + "num_input_tokens_seen": 168881310, + "router_z_loss_clip": 2.51757812, + "router_z_loss_mlp": 0.35791016, + "step": 7859, + "time_per_iteration": 2.7397143840789795 + }, + { + "auxiliary_loss_clip": 0.01482457, + "auxiliary_loss_mlp": 0.00422365, + "balance_loss_clip": 1.22166216, + "balance_loss_mlp": 0.38457549, + "epoch": 0.472568765970239, + "flos": 20522086882560.0, + "grad_norm": 4.712192687214576, + "language_loss": 0.67898858, + "learning_rate": 2.2739017126558413e-06, + "loss": 0.69803685, + "num_input_tokens_seen": 168899470, + "router_z_loss_clip": 2.61132812, + "router_z_loss_mlp": 0.37768555, + "step": 7860, + "time_per_iteration": 2.711091995239258 + }, + { + "auxiliary_loss_clip": 0.01469431, + "auxiliary_loss_mlp": 0.00377309, + "balance_loss_clip": 1.21424878, + "balance_loss_mlp": 0.33994871, + "epoch": 0.47262888922290697, + "flos": 35805200417280.0, + "grad_norm": 8.008743436764433, + "language_loss": 0.7905817, + "learning_rate": 2.2735159153527445e-06, + "loss": 0.80904913, + "num_input_tokens_seen": 168921495, + "router_z_loss_clip": 2.55078125, + "router_z_loss_mlp": 0.37353516, + "step": 7861, + "time_per_iteration": 2.7667899131774902 + }, + { + "auxiliary_loss_clip": 0.01471795, + "auxiliary_loss_mlp": 0.00409377, + "balance_loss_clip": 1.21785593, + "balance_loss_mlp": 0.37225485, + "epoch": 0.47268901247557493, + "flos": 20667740532480.0, + "grad_norm": 4.462786821928621, + "language_loss": 0.90868723, + "learning_rate": 2.273130107677896e-06, + "loss": 0.92749894, + "num_input_tokens_seen": 168940515, + "router_z_loss_clip": 2.54101562, + "router_z_loss_mlp": 0.37109375, + "step": 7862, + "time_per_iteration": 2.6191375255584717 + }, + { + "auxiliary_loss_clip": 0.0146643, + "auxiliary_loss_mlp": 0.00398042, + "balance_loss_clip": 1.21056795, + "balance_loss_mlp": 0.36082506, + "epoch": 0.4727491357282429, + "flos": 19573291082880.0, + "grad_norm": 43.922871871147095, + "language_loss": 0.91488743, + "learning_rate": 2.272744289645927e-06, + "loss": 0.93353212, + "num_input_tokens_seen": 168958340, + "router_z_loss_clip": 2.55859375, + "router_z_loss_mlp": 0.37231445, + "step": 7863, + "time_per_iteration": 2.620615243911743 + }, + { + "auxiliary_loss_clip": 0.01484446, + "auxiliary_loss_mlp": 0.00399739, + "balance_loss_clip": 1.22743154, + "balance_loss_mlp": 0.36192632, + "epoch": 0.47280925898091086, + "flos": 18217231902720.0, + "grad_norm": 4.060608326105938, + "language_loss": 0.70900464, + "learning_rate": 2.272358461271467e-06, + "loss": 0.7278465, + "num_input_tokens_seen": 168974850, + "router_z_loss_clip": 2.5703125, + "router_z_loss_mlp": 0.37817383, + "step": 7864, + "time_per_iteration": 2.6454052925109863 + }, + { + "auxiliary_loss_clip": 0.01476538, + "auxiliary_loss_mlp": 0.00414471, + "balance_loss_clip": 1.22146332, + "balance_loss_mlp": 0.37656289, + "epoch": 0.4728693822335788, + "flos": 17821820010240.0, + "grad_norm": 5.307374737624215, + "language_loss": 0.74009812, + "learning_rate": 2.271972622569147e-06, + "loss": 0.75900817, + "num_input_tokens_seen": 168992860, + "router_z_loss_clip": 2.54882812, + "router_z_loss_mlp": 0.37915039, + "step": 7865, + "time_per_iteration": 2.6863861083984375 + }, + { + "auxiliary_loss_clip": 0.01458723, + "auxiliary_loss_mlp": 0.00381993, + "balance_loss_clip": 1.21153212, + "balance_loss_mlp": 0.34844732, + "epoch": 0.4729295054862468, + "flos": 20595057361920.0, + "grad_norm": 165.0637526046751, + "language_loss": 0.79977763, + "learning_rate": 2.2715867735535976e-06, + "loss": 0.81818485, + "num_input_tokens_seen": 169010325, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.33520508, + "step": 7866, + "time_per_iteration": 2.7064027786254883 + }, + { + "auxiliary_loss_clip": 0.01474948, + "auxiliary_loss_mlp": 0.00387891, + "balance_loss_clip": 1.21473658, + "balance_loss_mlp": 0.35295045, + "epoch": 0.47298962873891476, + "flos": 23368079232000.0, + "grad_norm": 16.76205737918974, + "language_loss": 0.88943124, + "learning_rate": 2.271200914239451e-06, + "loss": 0.9080596, + "num_input_tokens_seen": 169029840, + "router_z_loss_clip": 2.59960938, + "router_z_loss_mlp": 0.34899902, + "step": 7867, + "time_per_iteration": 2.6942427158355713 + }, + { + "auxiliary_loss_clip": 0.01468558, + "auxiliary_loss_mlp": 0.00370495, + "balance_loss_clip": 1.21683419, + "balance_loss_mlp": 0.33623463, + "epoch": 0.4730497519915827, + "flos": 22052240305920.0, + "grad_norm": 4.853846567655911, + "language_loss": 0.83692634, + "learning_rate": 2.2708150446413385e-06, + "loss": 0.85531688, + "num_input_tokens_seen": 169049975, + "router_z_loss_clip": 2.51757812, + "router_z_loss_mlp": 0.34301758, + "step": 7868, + "time_per_iteration": 2.720332622528076 + }, + { + "auxiliary_loss_clip": 0.01482792, + "auxiliary_loss_mlp": 0.00409824, + "balance_loss_clip": 1.21924555, + "balance_loss_mlp": 0.37193894, + "epoch": 0.4731098752442507, + "flos": 21069724613760.0, + "grad_norm": 28.089907517430625, + "language_loss": 0.8447206, + "learning_rate": 2.2704291647738915e-06, + "loss": 0.86364675, + "num_input_tokens_seen": 169069540, + "router_z_loss_clip": 2.6328125, + "router_z_loss_mlp": 0.37890625, + "step": 7869, + "time_per_iteration": 4.063143014907837 + }, + { + "auxiliary_loss_clip": 0.01487535, + "auxiliary_loss_mlp": 0.0039011, + "balance_loss_clip": 1.22392035, + "balance_loss_mlp": 0.35465705, + "epoch": 0.4731699984969187, + "flos": 22528775064960.0, + "grad_norm": 2.947993621596573, + "language_loss": 0.78524083, + "learning_rate": 2.2700432746517443e-06, + "loss": 0.80401731, + "num_input_tokens_seen": 169089940, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.35449219, + "step": 7870, + "time_per_iteration": 2.7365682125091553 + }, + { + "auxiliary_loss_clip": 0.0149173, + "auxiliary_loss_mlp": 0.00411584, + "balance_loss_clip": 1.22610903, + "balance_loss_mlp": 0.37017095, + "epoch": 0.4732301217495867, + "flos": 24898124914560.0, + "grad_norm": 76.98143057527443, + "language_loss": 0.86450285, + "learning_rate": 2.2696573742895292e-06, + "loss": 0.88353598, + "num_input_tokens_seen": 169109650, + "router_z_loss_clip": 2.65820312, + "router_z_loss_mlp": 0.4140625, + "step": 7871, + "time_per_iteration": 2.708550453186035 + }, + { + "auxiliary_loss_clip": 0.01477391, + "auxiliary_loss_mlp": 0.00362355, + "balance_loss_clip": 1.21727765, + "balance_loss_mlp": 0.32678336, + "epoch": 0.47329024500225464, + "flos": 22784423137920.0, + "grad_norm": 7.103895322042403, + "language_loss": 0.81741178, + "learning_rate": 2.269271463701879e-06, + "loss": 0.83580923, + "num_input_tokens_seen": 169128990, + "router_z_loss_clip": 2.59960938, + "router_z_loss_mlp": 0.35595703, + "step": 7872, + "time_per_iteration": 4.114868402481079 + }, + { + "auxiliary_loss_clip": 0.01464803, + "auxiliary_loss_mlp": 0.0036278, + "balance_loss_clip": 1.20935345, + "balance_loss_mlp": 0.3279469, + "epoch": 0.4733503682549226, + "flos": 38695902220800.0, + "grad_norm": 101.55609736435034, + "language_loss": 0.72602028, + "learning_rate": 2.268885542903428e-06, + "loss": 0.74429607, + "num_input_tokens_seen": 169154645, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.34851074, + "step": 7873, + "time_per_iteration": 2.8550407886505127 + }, + { + "auxiliary_loss_clip": 0.0147688, + "auxiliary_loss_mlp": 0.00397162, + "balance_loss_clip": 1.21590912, + "balance_loss_mlp": 0.35706013, + "epoch": 0.47341049150759057, + "flos": 22966849336320.0, + "grad_norm": 4.701281574790249, + "language_loss": 0.78564143, + "learning_rate": 2.26849961190881e-06, + "loss": 0.80438185, + "num_input_tokens_seen": 169174995, + "router_z_loss_clip": 2.61328125, + "router_z_loss_mlp": 0.40112305, + "step": 7874, + "time_per_iteration": 2.6905524730682373 + }, + { + "auxiliary_loss_clip": 0.01475596, + "auxiliary_loss_mlp": 0.00364708, + "balance_loss_clip": 1.21522915, + "balance_loss_mlp": 0.32975596, + "epoch": 0.47347061476025853, + "flos": 14538471661440.0, + "grad_norm": 11.00623129337569, + "language_loss": 0.72401243, + "learning_rate": 2.26811367073266e-06, + "loss": 0.74241543, + "num_input_tokens_seen": 169191815, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.34985352, + "step": 7875, + "time_per_iteration": 4.153144836425781 + }, + { + "auxiliary_loss_clip": 0.01481276, + "auxiliary_loss_mlp": 0.00371527, + "balance_loss_clip": 1.22219157, + "balance_loss_mlp": 0.33602655, + "epoch": 0.4735307380129265, + "flos": 30263250827520.0, + "grad_norm": 17.563394379482737, + "language_loss": 0.86922103, + "learning_rate": 2.2677277193896125e-06, + "loss": 0.88774908, + "num_input_tokens_seen": 169210430, + "router_z_loss_clip": 2.58984375, + "router_z_loss_mlp": 0.35498047, + "step": 7876, + "time_per_iteration": 2.71294903755188 + }, + { + "auxiliary_loss_clip": 0.01470812, + "auxiliary_loss_mlp": 0.00424956, + "balance_loss_clip": 1.20834184, + "balance_loss_mlp": 0.38640368, + "epoch": 0.47359086126559446, + "flos": 19391044452480.0, + "grad_norm": 28.499461959028928, + "language_loss": 0.8360014, + "learning_rate": 2.267341757894304e-06, + "loss": 0.85495913, + "num_input_tokens_seen": 169229295, + "router_z_loss_clip": 2.62304688, + "router_z_loss_mlp": 0.38525391, + "step": 7877, + "time_per_iteration": 2.6243770122528076 + }, + { + "auxiliary_loss_clip": 0.01456611, + "auxiliary_loss_mlp": 0.00369598, + "balance_loss_clip": 1.20545125, + "balance_loss_mlp": 0.33488435, + "epoch": 0.47365098451826243, + "flos": 21939408708480.0, + "grad_norm": 6.351716821320333, + "language_loss": 0.75870878, + "learning_rate": 2.2669557862613685e-06, + "loss": 0.77697086, + "num_input_tokens_seen": 169247855, + "router_z_loss_clip": 2.51367188, + "router_z_loss_mlp": 0.34716797, + "step": 7878, + "time_per_iteration": 2.647332191467285 + }, + { + "auxiliary_loss_clip": 0.01472477, + "auxiliary_loss_mlp": 0.00346859, + "balance_loss_clip": 1.21629918, + "balance_loss_mlp": 0.31040478, + "epoch": 0.4737111077709304, + "flos": 25845053207040.0, + "grad_norm": 28.807219387070965, + "language_loss": 0.80309826, + "learning_rate": 2.2665698045054425e-06, + "loss": 0.82129169, + "num_input_tokens_seen": 169268860, + "router_z_loss_clip": 2.56445312, + "router_z_loss_mlp": 0.36450195, + "step": 7879, + "time_per_iteration": 4.094977617263794 + }, + { + "auxiliary_loss_clip": 0.01242894, + "auxiliary_loss_mlp": 0.00105164, + "balance_loss_clip": 1.09529841, + "balance_loss_mlp": 0.09467403, + "epoch": 0.47377123102359836, + "flos": 67760886314880.0, + "grad_norm": 0.7253507713251136, + "language_loss": 0.61110008, + "learning_rate": 2.266183812641164e-06, + "loss": 0.62458068, + "num_input_tokens_seen": 169331855, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.10498047, + "step": 7880, + "time_per_iteration": 3.258176803588867 + }, + { + "auxiliary_loss_clip": 0.01467797, + "auxiliary_loss_mlp": 0.00422195, + "balance_loss_clip": 1.21101356, + "balance_loss_mlp": 0.38132966, + "epoch": 0.4738313542762663, + "flos": 24315977191680.0, + "grad_norm": 353.84450565371293, + "language_loss": 0.73660707, + "learning_rate": 2.2657978106831675e-06, + "loss": 0.75550705, + "num_input_tokens_seen": 169352175, + "router_z_loss_clip": 2.56835938, + "router_z_loss_mlp": 0.40820312, + "step": 7881, + "time_per_iteration": 2.7044827938079834 + }, + { + "auxiliary_loss_clip": 0.01466611, + "auxiliary_loss_mlp": 0.00368344, + "balance_loss_clip": 1.21204484, + "balance_loss_mlp": 0.33444053, + "epoch": 0.4738914775289343, + "flos": 20705339093760.0, + "grad_norm": 86.93845546359643, + "language_loss": 0.82333249, + "learning_rate": 2.265411798646092e-06, + "loss": 0.84168202, + "num_input_tokens_seen": 169371215, + "router_z_loss_clip": 2.546875, + "router_z_loss_mlp": 0.33911133, + "step": 7882, + "time_per_iteration": 2.716411590576172 + }, + { + "auxiliary_loss_clip": 0.01476462, + "auxiliary_loss_mlp": 0.00387565, + "balance_loss_clip": 1.21592712, + "balance_loss_mlp": 0.35058624, + "epoch": 0.4739516007816023, + "flos": 25446337263360.0, + "grad_norm": 3.483226769158877, + "language_loss": 0.81283271, + "learning_rate": 2.2650257765445747e-06, + "loss": 0.83147299, + "num_input_tokens_seen": 169391745, + "router_z_loss_clip": 2.60742188, + "router_z_loss_mlp": 0.36938477, + "step": 7883, + "time_per_iteration": 2.7514121532440186 + }, + { + "auxiliary_loss_clip": 0.01477343, + "auxiliary_loss_mlp": 0.00386993, + "balance_loss_clip": 1.21694231, + "balance_loss_mlp": 0.34846413, + "epoch": 0.4740117240342703, + "flos": 19974341410560.0, + "grad_norm": 2.8998737707929676, + "language_loss": 0.79195082, + "learning_rate": 2.2646397443932525e-06, + "loss": 0.81059414, + "num_input_tokens_seen": 169409845, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.38500977, + "step": 7884, + "time_per_iteration": 2.7267420291900635 + }, + { + "auxiliary_loss_clip": 0.01482365, + "auxiliary_loss_mlp": 0.00413029, + "balance_loss_clip": 1.21843553, + "balance_loss_mlp": 0.37445247, + "epoch": 0.47407184728693824, + "flos": 15661146222720.0, + "grad_norm": 70.68909952578645, + "language_loss": 0.88199836, + "learning_rate": 2.2642537022067655e-06, + "loss": 0.90095228, + "num_input_tokens_seen": 169426085, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.38598633, + "step": 7885, + "time_per_iteration": 2.644940137863159 + }, + { + "auxiliary_loss_clip": 0.01484634, + "auxiliary_loss_mlp": 0.00386858, + "balance_loss_clip": 1.22509408, + "balance_loss_mlp": 0.34973666, + "epoch": 0.4741319705396062, + "flos": 18588800142720.0, + "grad_norm": 27.34758926470861, + "language_loss": 0.80249524, + "learning_rate": 2.263867649999751e-06, + "loss": 0.82121021, + "num_input_tokens_seen": 169444705, + "router_z_loss_clip": 2.59570312, + "router_z_loss_mlp": 0.37109375, + "step": 7886, + "time_per_iteration": 2.656519889831543 + }, + { + "auxiliary_loss_clip": 0.01493647, + "auxiliary_loss_mlp": 0.00427739, + "balance_loss_clip": 1.22570968, + "balance_loss_mlp": 0.38611135, + "epoch": 0.47419209379227417, + "flos": 13261093223040.0, + "grad_norm": 122.16121340477392, + "language_loss": 0.81817299, + "learning_rate": 2.263481587786849e-06, + "loss": 0.83738685, + "num_input_tokens_seen": 169460850, + "router_z_loss_clip": 2.67578125, + "router_z_loss_mlp": 0.41625977, + "step": 7887, + "time_per_iteration": 2.652818441390991 + }, + { + "auxiliary_loss_clip": 0.01475867, + "auxiliary_loss_mlp": 0.00382213, + "balance_loss_clip": 1.21978498, + "balance_loss_mlp": 0.34466159, + "epoch": 0.47425221704494214, + "flos": 20044043752320.0, + "grad_norm": 43.95953607912842, + "language_loss": 0.84145594, + "learning_rate": 2.2630955155826993e-06, + "loss": 0.86003673, + "num_input_tokens_seen": 169478890, + "router_z_loss_clip": 2.56445312, + "router_z_loss_mlp": 0.37597656, + "step": 7888, + "time_per_iteration": 2.637044668197632 + }, + { + "auxiliary_loss_clip": 0.01492029, + "auxiliary_loss_mlp": 0.00366113, + "balance_loss_clip": 1.23010421, + "balance_loss_mlp": 0.33096975, + "epoch": 0.4743123402976101, + "flos": 27271892136960.0, + "grad_norm": 58.99003992234917, + "language_loss": 0.78299844, + "learning_rate": 2.2627094334019406e-06, + "loss": 0.80157983, + "num_input_tokens_seen": 169499690, + "router_z_loss_clip": 2.61523438, + "router_z_loss_mlp": 0.35131836, + "step": 7889, + "time_per_iteration": 2.7386932373046875 + }, + { + "auxiliary_loss_clip": 0.01244438, + "auxiliary_loss_mlp": 0.00101283, + "balance_loss_clip": 1.09834599, + "balance_loss_mlp": 0.09107919, + "epoch": 0.47437246355027807, + "flos": 55393970261760.0, + "grad_norm": 0.690094214832616, + "language_loss": 0.55429256, + "learning_rate": 2.262323341259214e-06, + "loss": 0.56774974, + "num_input_tokens_seen": 169560475, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.10205078, + "step": 7890, + "time_per_iteration": 3.2382590770721436 + }, + { + "auxiliary_loss_clip": 0.01489811, + "auxiliary_loss_mlp": 0.00388655, + "balance_loss_clip": 1.22920299, + "balance_loss_mlp": 0.35167634, + "epoch": 0.47443258680294603, + "flos": 23878477537920.0, + "grad_norm": 12.263584876649997, + "language_loss": 0.72206664, + "learning_rate": 2.2619372391691605e-06, + "loss": 0.74085128, + "num_input_tokens_seen": 169580110, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.36987305, + "step": 7891, + "time_per_iteration": 2.708894968032837 + }, + { + "auxiliary_loss_clip": 0.01493626, + "auxiliary_loss_mlp": 0.00478729, + "balance_loss_clip": 1.22594833, + "balance_loss_mlp": 0.43378681, + "epoch": 0.474492710055614, + "flos": 21977761455360.0, + "grad_norm": 10.99899324654528, + "language_loss": 0.7645576, + "learning_rate": 2.26155112714642e-06, + "loss": 0.78428113, + "num_input_tokens_seen": 169597510, + "router_z_loss_clip": 2.67578125, + "router_z_loss_mlp": 0.44995117, + "step": 7892, + "time_per_iteration": 2.7353100776672363 + }, + { + "auxiliary_loss_clip": 0.01272626, + "auxiliary_loss_mlp": 0.00059128, + "balance_loss_clip": 1.12193632, + "balance_loss_mlp": 0.05035396, + "epoch": 0.47455283330828196, + "flos": 62557180122240.0, + "grad_norm": 0.800498952198806, + "language_loss": 0.58274984, + "learning_rate": 2.2611650052056355e-06, + "loss": 0.59606737, + "num_input_tokens_seen": 169660010, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.08789062, + "step": 7893, + "time_per_iteration": 3.2548446655273438 + }, + { + "auxiliary_loss_clip": 0.01472416, + "auxiliary_loss_mlp": 0.00396363, + "balance_loss_clip": 1.22087777, + "balance_loss_mlp": 0.35747707, + "epoch": 0.47461295656094993, + "flos": 12093637380480.0, + "grad_norm": 118.86307385584706, + "language_loss": 0.85751647, + "learning_rate": 2.2607788733614463e-06, + "loss": 0.87620425, + "num_input_tokens_seen": 169678485, + "router_z_loss_clip": 2.515625, + "router_z_loss_mlp": 0.38916016, + "step": 7894, + "time_per_iteration": 2.664578914642334 + }, + { + "auxiliary_loss_clip": 0.01472147, + "auxiliary_loss_mlp": 0.00378887, + "balance_loss_clip": 1.21942925, + "balance_loss_mlp": 0.34305245, + "epoch": 0.4746730798136179, + "flos": 20884568981760.0, + "grad_norm": 113.38159480046279, + "language_loss": 0.80353492, + "learning_rate": 2.260392731628497e-06, + "loss": 0.82204527, + "num_input_tokens_seen": 169697335, + "router_z_loss_clip": 2.52539062, + "router_z_loss_mlp": 0.3581543, + "step": 7895, + "time_per_iteration": 2.6829771995544434 + }, + { + "auxiliary_loss_clip": 0.01474082, + "auxiliary_loss_mlp": 0.0040124, + "balance_loss_clip": 1.21713948, + "balance_loss_mlp": 0.36402297, + "epoch": 0.4747332030662859, + "flos": 19974808287360.0, + "grad_norm": 40.794977006583785, + "language_loss": 0.88401151, + "learning_rate": 2.260006580021429e-06, + "loss": 0.9027648, + "num_input_tokens_seen": 169715395, + "router_z_loss_clip": 2.5703125, + "router_z_loss_mlp": 0.37231445, + "step": 7896, + "time_per_iteration": 2.6753251552581787 + }, + { + "auxiliary_loss_clip": 0.01484214, + "auxiliary_loss_mlp": 0.00410251, + "balance_loss_clip": 1.23200417, + "balance_loss_mlp": 0.37095967, + "epoch": 0.4747933263189539, + "flos": 16034186920320.0, + "grad_norm": 1504.4129373934372, + "language_loss": 0.83919775, + "learning_rate": 2.259620418554886e-06, + "loss": 0.85814238, + "num_input_tokens_seen": 169733755, + "router_z_loss_clip": 2.5234375, + "router_z_loss_mlp": 0.39306641, + "step": 7897, + "time_per_iteration": 2.6341514587402344 + }, + { + "auxiliary_loss_clip": 0.01500749, + "auxiliary_loss_mlp": 0.0040536, + "balance_loss_clip": 1.23942757, + "balance_loss_mlp": 0.36778519, + "epoch": 0.47485344957162184, + "flos": 13955102876160.0, + "grad_norm": 72.06686125419053, + "language_loss": 0.73253351, + "learning_rate": 2.25923424724351e-06, + "loss": 0.7515946, + "num_input_tokens_seen": 169751390, + "router_z_loss_clip": 2.609375, + "router_z_loss_mlp": 0.37597656, + "step": 7898, + "time_per_iteration": 2.831624984741211 + }, + { + "auxiliary_loss_clip": 0.01489171, + "auxiliary_loss_mlp": 0.00400627, + "balance_loss_clip": 1.22708058, + "balance_loss_mlp": 0.3604297, + "epoch": 0.4749135728242898, + "flos": 20449080489600.0, + "grad_norm": 5.34180848302132, + "language_loss": 0.78304559, + "learning_rate": 2.258848066101946e-06, + "loss": 0.80194354, + "num_input_tokens_seen": 169769500, + "router_z_loss_clip": 2.62109375, + "router_z_loss_mlp": 0.40185547, + "step": 7899, + "time_per_iteration": 2.718524694442749 + }, + { + "auxiliary_loss_clip": 0.01472721, + "auxiliary_loss_mlp": 0.00381523, + "balance_loss_clip": 1.22012532, + "balance_loss_mlp": 0.34523612, + "epoch": 0.4749736960769578, + "flos": 28949961767040.0, + "grad_norm": 14.052509037541208, + "language_loss": 0.75778055, + "learning_rate": 2.258461875144837e-06, + "loss": 0.77632308, + "num_input_tokens_seen": 169789215, + "router_z_loss_clip": 2.52734375, + "router_z_loss_mlp": 0.36254883, + "step": 7900, + "time_per_iteration": 2.693476438522339 + }, + { + "auxiliary_loss_clip": 0.01486713, + "auxiliary_loss_mlp": 0.00379623, + "balance_loss_clip": 1.23535955, + "balance_loss_mlp": 0.34359813, + "epoch": 0.47503381932962574, + "flos": 31938770592000.0, + "grad_norm": 96.54344656173453, + "language_loss": 0.76282167, + "learning_rate": 2.2580756743868273e-06, + "loss": 0.78148502, + "num_input_tokens_seen": 169808825, + "router_z_loss_clip": 2.515625, + "router_z_loss_mlp": 0.36035156, + "step": 7901, + "time_per_iteration": 2.712613344192505 + }, + { + "auxiliary_loss_clip": 0.01468243, + "auxiliary_loss_mlp": 0.00386314, + "balance_loss_clip": 1.22053552, + "balance_loss_mlp": 0.35031313, + "epoch": 0.4750939425822937, + "flos": 22127257860480.0, + "grad_norm": 9.748858156597224, + "language_loss": 0.79609293, + "learning_rate": 2.2576894638425636e-06, + "loss": 0.8146385, + "num_input_tokens_seen": 169827590, + "router_z_loss_clip": 2.4765625, + "router_z_loss_mlp": 0.36010742, + "step": 7902, + "time_per_iteration": 2.674734354019165 + }, + { + "auxiliary_loss_clip": 0.01466043, + "auxiliary_loss_mlp": 0.0038476, + "balance_loss_clip": 1.2194283, + "balance_loss_mlp": 0.34856835, + "epoch": 0.47515406583496167, + "flos": 20850094903680.0, + "grad_norm": 258.17901454233356, + "language_loss": 0.74600583, + "learning_rate": 2.257303243526688e-06, + "loss": 0.76451385, + "num_input_tokens_seen": 169844925, + "router_z_loss_clip": 2.46484375, + "router_z_loss_mlp": 0.36181641, + "step": 7903, + "time_per_iteration": 2.6800525188446045 + }, + { + "auxiliary_loss_clip": 0.0145941, + "auxiliary_loss_mlp": 0.00364207, + "balance_loss_clip": 1.21545815, + "balance_loss_mlp": 0.33104253, + "epoch": 0.47521418908762963, + "flos": 17524802448000.0, + "grad_norm": 50.27372788755333, + "language_loss": 0.77490342, + "learning_rate": 2.256917013453848e-06, + "loss": 0.79313958, + "num_input_tokens_seen": 169862705, + "router_z_loss_clip": 2.44140625, + "router_z_loss_mlp": 0.33154297, + "step": 7904, + "time_per_iteration": 2.678912878036499 + }, + { + "auxiliary_loss_clip": 0.01471696, + "auxiliary_loss_mlp": 0.00361809, + "balance_loss_clip": 1.22706723, + "balance_loss_mlp": 0.32943213, + "epoch": 0.4752743123402976, + "flos": 20559434048640.0, + "grad_norm": 657.9215529698724, + "language_loss": 0.90963525, + "learning_rate": 2.25653077363869e-06, + "loss": 0.92797029, + "num_input_tokens_seen": 169880155, + "router_z_loss_clip": 2.4453125, + "router_z_loss_mlp": 0.32373047, + "step": 7905, + "time_per_iteration": 2.6729140281677246 + }, + { + "auxiliary_loss_clip": 0.01443034, + "auxiliary_loss_mlp": 0.00356994, + "balance_loss_clip": 1.20702004, + "balance_loss_mlp": 0.32404417, + "epoch": 0.47533443559296557, + "flos": 26360623071360.0, + "grad_norm": 42.23322761214558, + "language_loss": 0.87223613, + "learning_rate": 2.2561445240958583e-06, + "loss": 0.89023638, + "num_input_tokens_seen": 169901525, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.32958984, + "step": 7906, + "time_per_iteration": 2.742083787918091 + }, + { + "auxiliary_loss_clip": 0.01267838, + "auxiliary_loss_mlp": 0.00086478, + "balance_loss_clip": 1.11801922, + "balance_loss_mlp": 0.07808595, + "epoch": 0.47539455884563353, + "flos": 65949660967680.0, + "grad_norm": 0.6540075695856886, + "language_loss": 0.58837897, + "learning_rate": 2.255758264840002e-06, + "loss": 0.60192209, + "num_input_tokens_seen": 169970345, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.08398438, + "step": 7907, + "time_per_iteration": 3.280851364135742 + }, + { + "auxiliary_loss_clip": 0.01465906, + "auxiliary_loss_mlp": 0.00377137, + "balance_loss_clip": 1.21743774, + "balance_loss_mlp": 0.34175602, + "epoch": 0.4754546820983015, + "flos": 17238128002560.0, + "grad_norm": 25.93361595916039, + "language_loss": 0.84792078, + "learning_rate": 2.255371995885765e-06, + "loss": 0.86635119, + "num_input_tokens_seen": 169986440, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.35375977, + "step": 7908, + "time_per_iteration": 2.6822829246520996 + }, + { + "auxiliary_loss_clip": 0.01480703, + "auxiliary_loss_mlp": 0.00369246, + "balance_loss_clip": 1.22524738, + "balance_loss_mlp": 0.33393627, + "epoch": 0.47551480535096946, + "flos": 19825886499840.0, + "grad_norm": 17.59412698468543, + "language_loss": 0.79364657, + "learning_rate": 2.254985717247797e-06, + "loss": 0.81214607, + "num_input_tokens_seen": 170005705, + "router_z_loss_clip": 2.5546875, + "router_z_loss_mlp": 0.35302734, + "step": 7909, + "time_per_iteration": 2.733083486557007 + }, + { + "auxiliary_loss_clip": 0.01470725, + "auxiliary_loss_mlp": 0.00390542, + "balance_loss_clip": 1.21751535, + "balance_loss_mlp": 0.35444519, + "epoch": 0.4755749286036375, + "flos": 22163958581760.0, + "grad_norm": 7074.607589873091, + "language_loss": 0.81190681, + "learning_rate": 2.2545994289407457e-06, + "loss": 0.83051944, + "num_input_tokens_seen": 170023415, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.36108398, + "step": 7910, + "time_per_iteration": 2.693406105041504 + }, + { + "auxiliary_loss_clip": 0.01463949, + "auxiliary_loss_mlp": 0.00345074, + "balance_loss_clip": 1.2159642, + "balance_loss_mlp": 0.31005013, + "epoch": 0.47563505185630545, + "flos": 21648280976640.0, + "grad_norm": 6.990605563407297, + "language_loss": 0.84091151, + "learning_rate": 2.2542131309792577e-06, + "loss": 0.85900176, + "num_input_tokens_seen": 170042395, + "router_z_loss_clip": 2.47851562, + "router_z_loss_mlp": 0.34985352, + "step": 7911, + "time_per_iteration": 4.1321141719818115 + }, + { + "auxiliary_loss_clip": 0.0148042, + "auxiliary_loss_mlp": 0.00373032, + "balance_loss_clip": 1.22529781, + "balance_loss_mlp": 0.33655423, + "epoch": 0.4756951751089734, + "flos": 20628777254400.0, + "grad_norm": 146.46833913942828, + "language_loss": 0.81752974, + "learning_rate": 2.253826823377983e-06, + "loss": 0.83606422, + "num_input_tokens_seen": 170061610, + "router_z_loss_clip": 2.55078125, + "router_z_loss_mlp": 0.36523438, + "step": 7912, + "time_per_iteration": 2.6414685249328613 + }, + { + "auxiliary_loss_clip": 0.01491111, + "auxiliary_loss_mlp": 0.0037147, + "balance_loss_clip": 1.23660326, + "balance_loss_mlp": 0.3346819, + "epoch": 0.4757552983616414, + "flos": 25848788221440.0, + "grad_norm": 5.178928264074438, + "language_loss": 0.80240464, + "learning_rate": 2.253440506151569e-06, + "loss": 0.8210305, + "num_input_tokens_seen": 170083505, + "router_z_loss_clip": 2.546875, + "router_z_loss_mlp": 0.36767578, + "step": 7913, + "time_per_iteration": 2.7679078578948975 + }, + { + "auxiliary_loss_clip": 0.01478411, + "auxiliary_loss_mlp": 0.00358001, + "balance_loss_clip": 1.22664988, + "balance_loss_mlp": 0.32202399, + "epoch": 0.47581542161430934, + "flos": 18223013992320.0, + "grad_norm": 11.049759794618758, + "language_loss": 0.77667868, + "learning_rate": 2.253054179314666e-06, + "loss": 0.79504281, + "num_input_tokens_seen": 170100690, + "router_z_loss_clip": 2.51757812, + "router_z_loss_mlp": 0.35961914, + "step": 7914, + "time_per_iteration": 4.0894153118133545 + }, + { + "auxiliary_loss_clip": 0.01479824, + "auxiliary_loss_mlp": 0.00400683, + "balance_loss_clip": 1.22530723, + "balance_loss_mlp": 0.36134365, + "epoch": 0.4758755448669773, + "flos": 21579763783680.0, + "grad_norm": 553.1658735214427, + "language_loss": 0.69909495, + "learning_rate": 2.2526678428819227e-06, + "loss": 0.71790004, + "num_input_tokens_seen": 170119240, + "router_z_loss_clip": 2.54296875, + "router_z_loss_mlp": 0.39355469, + "step": 7915, + "time_per_iteration": 2.7089927196502686 + }, + { + "auxiliary_loss_clip": 0.01455827, + "auxiliary_loss_mlp": 0.00337098, + "balance_loss_clip": 1.21302354, + "balance_loss_mlp": 0.30083406, + "epoch": 0.47593566811964527, + "flos": 15231152511360.0, + "grad_norm": 135.59031539480767, + "language_loss": 0.82472551, + "learning_rate": 2.2522814968679896e-06, + "loss": 0.84265471, + "num_input_tokens_seen": 170136450, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.36254883, + "step": 7916, + "time_per_iteration": 2.6232852935791016 + }, + { + "auxiliary_loss_clip": 0.01461868, + "auxiliary_loss_mlp": 0.00383478, + "balance_loss_clip": 1.21604502, + "balance_loss_mlp": 0.34704816, + "epoch": 0.47599579137231324, + "flos": 21543242630400.0, + "grad_norm": 9.129893210784518, + "language_loss": 0.70102048, + "learning_rate": 2.2518951412875173e-06, + "loss": 0.7194739, + "num_input_tokens_seen": 170155295, + "router_z_loss_clip": 2.4609375, + "router_z_loss_mlp": 0.36401367, + "step": 7917, + "time_per_iteration": 4.030153751373291 + }, + { + "auxiliary_loss_clip": 0.01261172, + "auxiliary_loss_mlp": 0.00104846, + "balance_loss_clip": 1.11885035, + "balance_loss_mlp": 0.0960724, + "epoch": 0.4760559146249812, + "flos": 64554602595840.0, + "grad_norm": 0.8197203996311756, + "language_loss": 0.64950013, + "learning_rate": 2.2515087761551557e-06, + "loss": 0.66316032, + "num_input_tokens_seen": 170222325, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.08789062, + "step": 7918, + "time_per_iteration": 3.18410325050354 + }, + { + "auxiliary_loss_clip": 0.01474058, + "auxiliary_loss_mlp": 0.0035625, + "balance_loss_clip": 1.22990561, + "balance_loss_mlp": 0.3208214, + "epoch": 0.47611603787764917, + "flos": 22233876405120.0, + "grad_norm": 106.93343411245026, + "language_loss": 0.74576205, + "learning_rate": 2.2511224014855563e-06, + "loss": 0.76406515, + "num_input_tokens_seen": 170241625, + "router_z_loss_clip": 2.44140625, + "router_z_loss_mlp": 0.35449219, + "step": 7919, + "time_per_iteration": 2.6736016273498535 + }, + { + "auxiliary_loss_clip": 0.01476998, + "auxiliary_loss_mlp": 0.00374717, + "balance_loss_clip": 1.22918916, + "balance_loss_mlp": 0.34071854, + "epoch": 0.47617616113031713, + "flos": 22780005765120.0, + "grad_norm": 15.085404893571033, + "language_loss": 0.79300988, + "learning_rate": 2.2507360172933694e-06, + "loss": 0.81152707, + "num_input_tokens_seen": 170262470, + "router_z_loss_clip": 2.47460938, + "router_z_loss_mlp": 0.34008789, + "step": 7920, + "time_per_iteration": 2.80709171295166 + }, + { + "auxiliary_loss_clip": 0.01499418, + "auxiliary_loss_mlp": 0.00368197, + "balance_loss_clip": 1.23724198, + "balance_loss_mlp": 0.33193356, + "epoch": 0.4762362843829851, + "flos": 24133802388480.0, + "grad_norm": 36.72210018245196, + "language_loss": 0.83454841, + "learning_rate": 2.2503496235932487e-06, + "loss": 0.85322458, + "num_input_tokens_seen": 170283460, + "router_z_loss_clip": 2.625, + "router_z_loss_mlp": 0.36279297, + "step": 7921, + "time_per_iteration": 4.1018311977386475 + }, + { + "auxiliary_loss_clip": 0.0149282, + "auxiliary_loss_mlp": 0.0040209, + "balance_loss_clip": 1.23331892, + "balance_loss_mlp": 0.36177331, + "epoch": 0.47629640763565306, + "flos": 22452069571200.0, + "grad_norm": 30.339427834161786, + "language_loss": 0.8405937, + "learning_rate": 2.249963220399845e-06, + "loss": 0.85954285, + "num_input_tokens_seen": 170304225, + "router_z_loss_clip": 2.59570312, + "router_z_loss_mlp": 0.40283203, + "step": 7922, + "time_per_iteration": 2.7084150314331055 + }, + { + "auxiliary_loss_clip": 0.01492085, + "auxiliary_loss_mlp": 0.00415778, + "balance_loss_clip": 1.23042154, + "balance_loss_mlp": 0.37560427, + "epoch": 0.4763565308883211, + "flos": 11181398647680.0, + "grad_norm": 31.207788877377848, + "language_loss": 0.78718686, + "learning_rate": 2.2495768077278104e-06, + "loss": 0.80626547, + "num_input_tokens_seen": 170322110, + "router_z_loss_clip": 2.61914062, + "router_z_loss_mlp": 0.40161133, + "step": 7923, + "time_per_iteration": 2.6297779083251953 + }, + { + "auxiliary_loss_clip": 0.01501033, + "auxiliary_loss_mlp": 0.00374023, + "balance_loss_clip": 1.23973012, + "balance_loss_mlp": 0.34019113, + "epoch": 0.47641665414098905, + "flos": 22382151747840.0, + "grad_norm": 75.71767587047846, + "language_loss": 0.89364773, + "learning_rate": 2.2491903855917992e-06, + "loss": 0.91239834, + "num_input_tokens_seen": 170340700, + "router_z_loss_clip": 2.61523438, + "router_z_loss_mlp": 0.33837891, + "step": 7924, + "time_per_iteration": 2.670773506164551 + }, + { + "auxiliary_loss_clip": 0.01503854, + "auxiliary_loss_mlp": 0.00381872, + "balance_loss_clip": 1.24276686, + "balance_loss_mlp": 0.34250897, + "epoch": 0.476476777393657, + "flos": 25046148862080.0, + "grad_norm": 5.236339153159506, + "language_loss": 0.87515265, + "learning_rate": 2.2488039540064626e-06, + "loss": 0.89400989, + "num_input_tokens_seen": 170359780, + "router_z_loss_clip": 2.60742188, + "router_z_loss_mlp": 0.39355469, + "step": 7925, + "time_per_iteration": 2.7091190814971924 + }, + { + "auxiliary_loss_clip": 0.01490043, + "auxiliary_loss_mlp": 0.0037128, + "balance_loss_clip": 1.22908115, + "balance_loss_mlp": 0.33513576, + "epoch": 0.476536900646325, + "flos": 27269916888960.0, + "grad_norm": 42.702892460510704, + "language_loss": 0.77174425, + "learning_rate": 2.2484175129864558e-06, + "loss": 0.79035747, + "num_input_tokens_seen": 170381260, + "router_z_loss_clip": 2.609375, + "router_z_loss_mlp": 0.36132812, + "step": 7926, + "time_per_iteration": 2.7371368408203125 + }, + { + "auxiliary_loss_clip": 0.01513637, + "auxiliary_loss_mlp": 0.00368293, + "balance_loss_clip": 1.249681, + "balance_loss_mlp": 0.33191052, + "epoch": 0.47659702389899294, + "flos": 25301401885440.0, + "grad_norm": 672.1553513316936, + "language_loss": 0.75502086, + "learning_rate": 2.248031062546432e-06, + "loss": 0.77384019, + "num_input_tokens_seen": 170400595, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.36376953, + "step": 7927, + "time_per_iteration": 2.6901485919952393 + }, + { + "auxiliary_loss_clip": 0.01498566, + "auxiliary_loss_mlp": 0.00331157, + "balance_loss_clip": 1.2448318, + "balance_loss_mlp": 0.29878008, + "epoch": 0.4766571471516609, + "flos": 25992861672960.0, + "grad_norm": 410.84781813548375, + "language_loss": 0.74572927, + "learning_rate": 2.247644602701045e-06, + "loss": 0.76402646, + "num_input_tokens_seen": 170421110, + "router_z_loss_clip": 2.53515625, + "router_z_loss_mlp": 0.32373047, + "step": 7928, + "time_per_iteration": 2.7124111652374268 + }, + { + "auxiliary_loss_clip": 0.01497448, + "auxiliary_loss_mlp": 0.00386726, + "balance_loss_clip": 1.23563075, + "balance_loss_mlp": 0.34748268, + "epoch": 0.4767172704043289, + "flos": 16032211672320.0, + "grad_norm": 124.59590960505045, + "language_loss": 0.84820604, + "learning_rate": 2.2472581334649496e-06, + "loss": 0.86704779, + "num_input_tokens_seen": 170436700, + "router_z_loss_clip": 2.61328125, + "router_z_loss_mlp": 0.39257812, + "step": 7929, + "time_per_iteration": 2.6007707118988037 + }, + { + "auxiliary_loss_clip": 0.01490446, + "auxiliary_loss_mlp": 0.00353954, + "balance_loss_clip": 1.23745513, + "balance_loss_mlp": 0.32050419, + "epoch": 0.47677739365699684, + "flos": 39235351651200.0, + "grad_norm": 7.0184957486387605, + "language_loss": 0.71237284, + "learning_rate": 2.2468716548528016e-06, + "loss": 0.73081684, + "num_input_tokens_seen": 170459555, + "router_z_loss_clip": 2.52734375, + "router_z_loss_mlp": 0.3347168, + "step": 7930, + "time_per_iteration": 2.854783296585083 + }, + { + "auxiliary_loss_clip": 0.01488165, + "auxiliary_loss_mlp": 0.00350084, + "balance_loss_clip": 1.23336697, + "balance_loss_mlp": 0.31545329, + "epoch": 0.4768375169096648, + "flos": 24717781704960.0, + "grad_norm": 10.687306088033697, + "language_loss": 0.84141386, + "learning_rate": 2.2464851668792555e-06, + "loss": 0.85979629, + "num_input_tokens_seen": 170479175, + "router_z_loss_clip": 2.546875, + "router_z_loss_mlp": 0.34655762, + "step": 7931, + "time_per_iteration": 2.672152042388916 + }, + { + "auxiliary_loss_clip": 0.01498129, + "auxiliary_loss_mlp": 0.00366364, + "balance_loss_clip": 1.23591363, + "balance_loss_mlp": 0.32981449, + "epoch": 0.47689764016233277, + "flos": 22528667324160.0, + "grad_norm": 91.15972405894814, + "language_loss": 0.84215975, + "learning_rate": 2.2460986695589678e-06, + "loss": 0.86080468, + "num_input_tokens_seen": 170498450, + "router_z_loss_clip": 2.62304688, + "router_z_loss_mlp": 0.36572266, + "step": 7932, + "time_per_iteration": 2.694103479385376 + }, + { + "auxiliary_loss_clip": 0.01483787, + "auxiliary_loss_mlp": 0.00366095, + "balance_loss_clip": 1.22651112, + "balance_loss_mlp": 0.33064198, + "epoch": 0.47695776341500074, + "flos": 15120619384320.0, + "grad_norm": 17.58606081139129, + "language_loss": 0.86514056, + "learning_rate": 2.245712162906593e-06, + "loss": 0.8836394, + "num_input_tokens_seen": 170516255, + "router_z_loss_clip": 2.57617188, + "router_z_loss_mlp": 0.35473633, + "step": 7933, + "time_per_iteration": 2.668574810028076 + }, + { + "auxiliary_loss_clip": 0.01500405, + "auxiliary_loss_mlp": 0.00392038, + "balance_loss_clip": 1.23420811, + "balance_loss_mlp": 0.35219878, + "epoch": 0.4770178866676687, + "flos": 14678917839360.0, + "grad_norm": 981.2993484434162, + "language_loss": 0.81326938, + "learning_rate": 2.2453256469367888e-06, + "loss": 0.83219379, + "num_input_tokens_seen": 170532705, + "router_z_loss_clip": 2.66210938, + "router_z_loss_mlp": 0.39819336, + "step": 7934, + "time_per_iteration": 2.7863800525665283 + }, + { + "auxiliary_loss_clip": 0.01502284, + "auxiliary_loss_mlp": 0.00393471, + "balance_loss_clip": 1.23707008, + "balance_loss_mlp": 0.35582522, + "epoch": 0.47707800992033667, + "flos": 22565583527040.0, + "grad_norm": 23.92220183536478, + "language_loss": 0.85460246, + "learning_rate": 2.244939121664211e-06, + "loss": 0.87356007, + "num_input_tokens_seen": 170551925, + "router_z_loss_clip": 2.65039062, + "router_z_loss_mlp": 0.37670898, + "step": 7935, + "time_per_iteration": 2.702028751373291 + }, + { + "auxiliary_loss_clip": 0.01515232, + "auxiliary_loss_mlp": 0.00375834, + "balance_loss_clip": 1.24704742, + "balance_loss_mlp": 0.33666167, + "epoch": 0.4771381331730047, + "flos": 30918225375360.0, + "grad_norm": 7.554014315246816, + "language_loss": 0.77676988, + "learning_rate": 2.2445525871035177e-06, + "loss": 0.79568052, + "num_input_tokens_seen": 170572320, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.39160156, + "step": 7936, + "time_per_iteration": 2.719099760055542 + }, + { + "auxiliary_loss_clip": 0.01491859, + "auxiliary_loss_mlp": 0.0040647, + "balance_loss_clip": 1.22910559, + "balance_loss_mlp": 0.36782259, + "epoch": 0.47719825642567265, + "flos": 25738901539200.0, + "grad_norm": 9.704111808801265, + "language_loss": 0.75002253, + "learning_rate": 2.2441660432693656e-06, + "loss": 0.76900584, + "num_input_tokens_seen": 170589470, + "router_z_loss_clip": 2.62304688, + "router_z_loss_mlp": 0.38647461, + "step": 7937, + "time_per_iteration": 2.7146353721618652 + }, + { + "auxiliary_loss_clip": 0.01236888, + "auxiliary_loss_mlp": 0.00073228, + "balance_loss_clip": 1.0968889, + "balance_loss_mlp": 0.06383431, + "epoch": 0.4772583796783406, + "flos": 66355128668160.0, + "grad_norm": 0.7018780505163268, + "language_loss": 0.5606485, + "learning_rate": 2.2437794901764128e-06, + "loss": 0.57374966, + "num_input_tokens_seen": 170662265, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.09375, + "step": 7938, + "time_per_iteration": 3.3182828426361084 + }, + { + "auxiliary_loss_clip": 0.01490248, + "auxiliary_loss_mlp": 0.0035985, + "balance_loss_clip": 1.23306453, + "balance_loss_mlp": 0.32377684, + "epoch": 0.4773185029310086, + "flos": 22051091070720.0, + "grad_norm": 18.527484706398152, + "language_loss": 0.95756525, + "learning_rate": 2.243392927839317e-06, + "loss": 0.97606623, + "num_input_tokens_seen": 170679680, + "router_z_loss_clip": 2.5703125, + "router_z_loss_mlp": 0.36083984, + "step": 7939, + "time_per_iteration": 2.664494037628174 + }, + { + "auxiliary_loss_clip": 0.01487499, + "auxiliary_loss_mlp": 0.00340931, + "balance_loss_clip": 1.22902441, + "balance_loss_mlp": 0.30786252, + "epoch": 0.47737862618367655, + "flos": 16727801523840.0, + "grad_norm": 3.9361390160149217, + "language_loss": 0.84882951, + "learning_rate": 2.2430063562727367e-06, + "loss": 0.86711377, + "num_input_tokens_seen": 170697340, + "router_z_loss_clip": 2.5859375, + "router_z_loss_mlp": 0.33056641, + "step": 7940, + "time_per_iteration": 2.607713460922241 + }, + { + "auxiliary_loss_clip": 0.01484236, + "auxiliary_loss_mlp": 0.00371675, + "balance_loss_clip": 1.2251482, + "balance_loss_mlp": 0.33531615, + "epoch": 0.4774387494363445, + "flos": 19609453100160.0, + "grad_norm": 2144.2318750676545, + "language_loss": 0.8986693, + "learning_rate": 2.2426197754913322e-06, + "loss": 0.91722846, + "num_input_tokens_seen": 170714905, + "router_z_loss_clip": 2.59179688, + "router_z_loss_mlp": 0.36328125, + "step": 7941, + "time_per_iteration": 2.6414597034454346 + }, + { + "auxiliary_loss_clip": 0.01496489, + "auxiliary_loss_mlp": 0.00353551, + "balance_loss_clip": 1.23057866, + "balance_loss_mlp": 0.31752548, + "epoch": 0.4774988726890125, + "flos": 16653969118080.0, + "grad_norm": 38.70923880914926, + "language_loss": 0.83684224, + "learning_rate": 2.24223318550976e-06, + "loss": 0.85534263, + "num_input_tokens_seen": 170731810, + "router_z_loss_clip": 2.65625, + "router_z_loss_mlp": 0.36035156, + "step": 7942, + "time_per_iteration": 2.619664192199707 + }, + { + "auxiliary_loss_clip": 0.01500239, + "auxiliary_loss_mlp": 0.00371806, + "balance_loss_clip": 1.23530698, + "balance_loss_mlp": 0.33582875, + "epoch": 0.47755899594168044, + "flos": 20485565729280.0, + "grad_norm": 19.641876686318152, + "language_loss": 0.6990037, + "learning_rate": 2.241846586342682e-06, + "loss": 0.7177242, + "num_input_tokens_seen": 170750270, + "router_z_loss_clip": 2.6484375, + "router_z_loss_mlp": 0.35986328, + "step": 7943, + "time_per_iteration": 2.630659580230713 + }, + { + "auxiliary_loss_clip": 0.01508793, + "auxiliary_loss_mlp": 0.0035668, + "balance_loss_clip": 1.24239111, + "balance_loss_mlp": 0.31979671, + "epoch": 0.4776191191943484, + "flos": 21652806090240.0, + "grad_norm": 59.4367468103365, + "language_loss": 0.81804383, + "learning_rate": 2.2414599780047577e-06, + "loss": 0.83669853, + "num_input_tokens_seen": 170769015, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.36914062, + "step": 7944, + "time_per_iteration": 2.7299654483795166 + }, + { + "auxiliary_loss_clip": 0.01500237, + "auxiliary_loss_mlp": 0.0040927, + "balance_loss_clip": 1.23628187, + "balance_loss_mlp": 0.36902514, + "epoch": 0.4776792424470164, + "flos": 18770220760320.0, + "grad_norm": 7.545380646945023, + "language_loss": 0.75181776, + "learning_rate": 2.2410733605106456e-06, + "loss": 0.77091283, + "num_input_tokens_seen": 170785725, + "router_z_loss_clip": 2.63671875, + "router_z_loss_mlp": 0.40234375, + "step": 7945, + "time_per_iteration": 2.754721164703369 + }, + { + "auxiliary_loss_clip": 0.0150314, + "auxiliary_loss_mlp": 0.00363074, + "balance_loss_clip": 1.24054384, + "balance_loss_mlp": 0.32630998, + "epoch": 0.47773936569968434, + "flos": 29715828577920.0, + "grad_norm": 7.611451676541132, + "language_loss": 0.80280876, + "learning_rate": 2.240686733875009e-06, + "loss": 0.82147086, + "num_input_tokens_seen": 170804600, + "router_z_loss_clip": 2.62695312, + "router_z_loss_mlp": 0.36791992, + "step": 7946, + "time_per_iteration": 2.781416177749634 + }, + { + "auxiliary_loss_clip": 0.01511077, + "auxiliary_loss_mlp": 0.00401379, + "balance_loss_clip": 1.24266219, + "balance_loss_mlp": 0.362755, + "epoch": 0.4777994889523523, + "flos": 24791542283520.0, + "grad_norm": 23.57957234281433, + "language_loss": 0.85689759, + "learning_rate": 2.240300098112506e-06, + "loss": 0.8760221, + "num_input_tokens_seen": 170824230, + "router_z_loss_clip": 2.6875, + "router_z_loss_mlp": 0.38647461, + "step": 7947, + "time_per_iteration": 2.817075729370117 + }, + { + "auxiliary_loss_clip": 0.01514774, + "auxiliary_loss_mlp": 0.00340227, + "balance_loss_clip": 1.25310779, + "balance_loss_mlp": 0.30670568, + "epoch": 0.47785961220502027, + "flos": 17858161595520.0, + "grad_norm": 3.7599289472954873, + "language_loss": 0.81924242, + "learning_rate": 2.2399134532377998e-06, + "loss": 0.83779246, + "num_input_tokens_seen": 170843365, + "router_z_loss_clip": 2.6171875, + "router_z_loss_mlp": 0.33520508, + "step": 7948, + "time_per_iteration": 2.659876823425293 + }, + { + "auxiliary_loss_clip": 0.01508879, + "auxiliary_loss_mlp": 0.00394177, + "balance_loss_clip": 1.24285066, + "balance_loss_mlp": 0.35758024, + "epoch": 0.4779197354576883, + "flos": 20266546550400.0, + "grad_norm": 25.641523977154474, + "language_loss": 0.83621192, + "learning_rate": 2.2395267992655514e-06, + "loss": 0.85524249, + "num_input_tokens_seen": 170863515, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.36572266, + "step": 7949, + "time_per_iteration": 2.657289743423462 + }, + { + "auxiliary_loss_clip": 0.01506469, + "auxiliary_loss_mlp": 0.00370476, + "balance_loss_clip": 1.24033356, + "balance_loss_mlp": 0.33526191, + "epoch": 0.47797985871035625, + "flos": 17056599644160.0, + "grad_norm": 28.023961181066625, + "language_loss": 0.81898898, + "learning_rate": 2.2391401362104227e-06, + "loss": 0.83775848, + "num_input_tokens_seen": 170881245, + "router_z_loss_clip": 2.65820312, + "router_z_loss_mlp": 0.35205078, + "step": 7950, + "time_per_iteration": 2.657841205596924 + }, + { + "auxiliary_loss_clip": 0.01510983, + "auxiliary_loss_mlp": 0.00381252, + "balance_loss_clip": 1.24523211, + "balance_loss_mlp": 0.3433907, + "epoch": 0.4780399819630242, + "flos": 31358418549120.0, + "grad_norm": 5.426149717606869, + "language_loss": 0.80721754, + "learning_rate": 2.2387534640870756e-06, + "loss": 0.82613993, + "num_input_tokens_seen": 170901285, + "router_z_loss_clip": 2.66015625, + "router_z_loss_mlp": 0.37841797, + "step": 7951, + "time_per_iteration": 2.718863010406494 + }, + { + "auxiliary_loss_clip": 0.01519389, + "auxiliary_loss_mlp": 0.00372622, + "balance_loss_clip": 1.2497741, + "balance_loss_mlp": 0.33697852, + "epoch": 0.4781001052156922, + "flos": 24899597372160.0, + "grad_norm": 2.4757283474738374, + "language_loss": 0.87644666, + "learning_rate": 2.238366782910174e-06, + "loss": 0.89536679, + "num_input_tokens_seen": 170919740, + "router_z_loss_clip": 2.6953125, + "router_z_loss_mlp": 0.35620117, + "step": 7952, + "time_per_iteration": 2.713710069656372 + }, + { + "auxiliary_loss_clip": 0.0152569, + "auxiliary_loss_mlp": 0.00379062, + "balance_loss_clip": 1.25298381, + "balance_loss_mlp": 0.34449112, + "epoch": 0.47816022846836015, + "flos": 18697717157760.0, + "grad_norm": 17.250299772750214, + "language_loss": 0.85293925, + "learning_rate": 2.23798009269438e-06, + "loss": 0.87198675, + "num_input_tokens_seen": 170938510, + "router_z_loss_clip": 2.72460938, + "router_z_loss_mlp": 0.34570312, + "step": 7953, + "time_per_iteration": 4.084122896194458 + }, + { + "auxiliary_loss_clip": 0.01538449, + "auxiliary_loss_mlp": 0.00360758, + "balance_loss_clip": 1.25518012, + "balance_loss_mlp": 0.32430345, + "epoch": 0.4782203517210281, + "flos": 11977573559040.0, + "grad_norm": 72.59771617782684, + "language_loss": 0.89958978, + "learning_rate": 2.2375933934543566e-06, + "loss": 0.91858184, + "num_input_tokens_seen": 170951170, + "router_z_loss_clip": 2.83203125, + "router_z_loss_mlp": 0.36425781, + "step": 7954, + "time_per_iteration": 2.6468727588653564 + }, + { + "auxiliary_loss_clip": 0.01527637, + "auxiliary_loss_mlp": 0.00339656, + "balance_loss_clip": 1.25730443, + "balance_loss_mlp": 0.30471608, + "epoch": 0.4782804749736961, + "flos": 20813501923200.0, + "grad_norm": 55.596035773699356, + "language_loss": 0.76294166, + "learning_rate": 2.237206685204768e-06, + "loss": 0.7816146, + "num_input_tokens_seen": 170970990, + "router_z_loss_clip": 2.69921875, + "router_z_loss_mlp": 0.3494873, + "step": 7955, + "time_per_iteration": 2.6679842472076416 + }, + { + "auxiliary_loss_clip": 0.01539787, + "auxiliary_loss_mlp": 0.00375674, + "balance_loss_clip": 1.26122236, + "balance_loss_mlp": 0.33788434, + "epoch": 0.47834059822636404, + "flos": 23840304359040.0, + "grad_norm": 2.737056614874643, + "language_loss": 0.87609172, + "learning_rate": 2.2368199679602787e-06, + "loss": 0.89524639, + "num_input_tokens_seen": 170991215, + "router_z_loss_clip": 2.78515625, + "router_z_loss_mlp": 0.37792969, + "step": 7956, + "time_per_iteration": 4.2982707023620605 + }, + { + "auxiliary_loss_clip": 0.0154273, + "auxiliary_loss_mlp": 0.0038008, + "balance_loss_clip": 1.2724061, + "balance_loss_mlp": 0.34052598, + "epoch": 0.478400721479032, + "flos": 22633777497600.0, + "grad_norm": 2.760342069819623, + "language_loss": 0.89345896, + "learning_rate": 2.2364332417355516e-06, + "loss": 0.91268706, + "num_input_tokens_seen": 171007325, + "router_z_loss_clip": 2.69921875, + "router_z_loss_mlp": 0.39550781, + "step": 7957, + "time_per_iteration": 2.7042229175567627 + }, + { + "auxiliary_loss_clip": 0.01534006, + "auxiliary_loss_mlp": 0.00392449, + "balance_loss_clip": 1.26313591, + "balance_loss_mlp": 0.35525569, + "epoch": 0.4784608447317, + "flos": 19354954262400.0, + "grad_norm": 245.16198727413382, + "language_loss": 0.84551239, + "learning_rate": 2.2360465065452527e-06, + "loss": 0.86477691, + "num_input_tokens_seen": 171025650, + "router_z_loss_clip": 2.71289062, + "router_z_loss_mlp": 0.37207031, + "step": 7958, + "time_per_iteration": 2.6951968669891357 + }, + { + "auxiliary_loss_clip": 0.01537505, + "auxiliary_loss_mlp": 0.00383858, + "balance_loss_clip": 1.26217866, + "balance_loss_mlp": 0.34537745, + "epoch": 0.47852096798436794, + "flos": 24021114445440.0, + "grad_norm": 5.215268833142549, + "language_loss": 0.90414834, + "learning_rate": 2.235659762404047e-06, + "loss": 0.92336202, + "num_input_tokens_seen": 171045045, + "router_z_loss_clip": 2.75, + "router_z_loss_mlp": 0.38476562, + "step": 7959, + "time_per_iteration": 4.15187668800354 + }, + { + "auxiliary_loss_clip": 0.01542545, + "auxiliary_loss_mlp": 0.00381862, + "balance_loss_clip": 1.27229881, + "balance_loss_mlp": 0.34869757, + "epoch": 0.4785810912370359, + "flos": 25666433850240.0, + "grad_norm": 6.566352879252062, + "language_loss": 0.79528469, + "learning_rate": 2.235273009326599e-06, + "loss": 0.81452876, + "num_input_tokens_seen": 171062910, + "router_z_loss_clip": 2.70117188, + "router_z_loss_mlp": 0.33154297, + "step": 7960, + "time_per_iteration": 2.6815638542175293 + }, + { + "auxiliary_loss_clip": 0.01543143, + "auxiliary_loss_mlp": 0.00350808, + "balance_loss_clip": 1.27076817, + "balance_loss_mlp": 0.31628463, + "epoch": 0.47864121448970387, + "flos": 21432134885760.0, + "grad_norm": 15.071282384913612, + "language_loss": 0.8237772, + "learning_rate": 2.2348862473275745e-06, + "loss": 0.84271669, + "num_input_tokens_seen": 171080875, + "router_z_loss_clip": 2.7265625, + "router_z_loss_mlp": 0.3449707, + "step": 7961, + "time_per_iteration": 2.668024778366089 + }, + { + "auxiliary_loss_clip": 0.01532743, + "auxiliary_loss_mlp": 0.00350119, + "balance_loss_clip": 1.261114, + "balance_loss_mlp": 0.31395093, + "epoch": 0.47870133774237184, + "flos": 16143894034560.0, + "grad_norm": 1.6334733512152482, + "language_loss": 0.83620894, + "learning_rate": 2.2344994764216405e-06, + "loss": 0.85503751, + "num_input_tokens_seen": 171099190, + "router_z_loss_clip": 2.71289062, + "router_z_loss_mlp": 0.36181641, + "step": 7962, + "time_per_iteration": 2.6461713314056396 + }, + { + "auxiliary_loss_clip": 0.01561349, + "auxiliary_loss_mlp": 0.00407509, + "balance_loss_clip": 1.27907205, + "balance_loss_mlp": 0.37000579, + "epoch": 0.47876146099503986, + "flos": 26906788344960.0, + "grad_norm": 7.256099939379373, + "language_loss": 0.71949768, + "learning_rate": 2.2341126966234635e-06, + "loss": 0.73918629, + "num_input_tokens_seen": 171119060, + "router_z_loss_clip": 2.82421875, + "router_z_loss_mlp": 0.37463379, + "step": 7963, + "time_per_iteration": 2.7030234336853027 + }, + { + "auxiliary_loss_clip": 0.01543461, + "auxiliary_loss_mlp": 0.00362643, + "balance_loss_clip": 1.26842093, + "balance_loss_mlp": 0.32609302, + "epoch": 0.4788215842477078, + "flos": 45332085778560.0, + "grad_norm": 602.3743080422138, + "language_loss": 0.838525, + "learning_rate": 2.2337259079477083e-06, + "loss": 0.85758603, + "num_input_tokens_seen": 171141900, + "router_z_loss_clip": 2.75195312, + "router_z_loss_mlp": 0.36547852, + "step": 7964, + "time_per_iteration": 4.309806823730469 + }, + { + "auxiliary_loss_clip": 0.015627, + "auxiliary_loss_mlp": 0.00445501, + "balance_loss_clip": 1.27562118, + "balance_loss_mlp": 0.4012025, + "epoch": 0.4788817075003758, + "flos": 22237180456320.0, + "grad_norm": 96.39978851310218, + "language_loss": 0.82312608, + "learning_rate": 2.233339110409044e-06, + "loss": 0.84320807, + "num_input_tokens_seen": 171161045, + "router_z_loss_clip": 2.87109375, + "router_z_loss_mlp": 0.44335938, + "step": 7965, + "time_per_iteration": 2.660370111465454 + }, + { + "auxiliary_loss_clip": 0.01544187, + "auxiliary_loss_mlp": 0.00359496, + "balance_loss_clip": 1.26913548, + "balance_loss_mlp": 0.32499719, + "epoch": 0.47894183075304375, + "flos": 16471183783680.0, + "grad_norm": 2.8111059868059103, + "language_loss": 0.82275474, + "learning_rate": 2.232952304022137e-06, + "loss": 0.84179151, + "num_input_tokens_seen": 171179675, + "router_z_loss_clip": 2.75390625, + "router_z_loss_mlp": 0.3449707, + "step": 7966, + "time_per_iteration": 2.774526596069336 + }, + { + "auxiliary_loss_clip": 0.01533141, + "auxiliary_loss_mlp": 0.00396732, + "balance_loss_clip": 1.25837374, + "balance_loss_mlp": 0.36077857, + "epoch": 0.4790019540057117, + "flos": 24282688262400.0, + "grad_norm": 3.7460585811202134, + "language_loss": 0.79114193, + "learning_rate": 2.232565488801655e-06, + "loss": 0.81044066, + "num_input_tokens_seen": 171201175, + "router_z_loss_clip": 2.74804688, + "router_z_loss_mlp": 0.35913086, + "step": 7967, + "time_per_iteration": 2.7440385818481445 + }, + { + "auxiliary_loss_clip": 0.01545487, + "auxiliary_loss_mlp": 0.0037981, + "balance_loss_clip": 1.2763679, + "balance_loss_mlp": 0.3452397, + "epoch": 0.4790620772583797, + "flos": 25666469763840.0, + "grad_norm": 31.637231555061174, + "language_loss": 0.84861517, + "learning_rate": 2.232178664762267e-06, + "loss": 0.86786819, + "num_input_tokens_seen": 171221750, + "router_z_loss_clip": 2.69140625, + "router_z_loss_mlp": 0.34545898, + "step": 7968, + "time_per_iteration": 2.7573018074035645 + }, + { + "auxiliary_loss_clip": 0.01393752, + "auxiliary_loss_mlp": 0.00075148, + "balance_loss_clip": 1.23661232, + "balance_loss_mlp": 0.06594521, + "epoch": 0.47912220051104765, + "flos": 69428077102080.0, + "grad_norm": 0.7417620644990021, + "language_loss": 0.61611998, + "learning_rate": 2.2317918319186408e-06, + "loss": 0.63080895, + "num_input_tokens_seen": 171292235, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.09179688, + "step": 7969, + "time_per_iteration": 3.290376663208008 + }, + { + "auxiliary_loss_clip": 0.0155784, + "auxiliary_loss_mlp": 0.00366604, + "balance_loss_clip": 1.28010845, + "balance_loss_mlp": 0.33019769, + "epoch": 0.4791823237637156, + "flos": 24168922911360.0, + "grad_norm": 18.52981395364934, + "language_loss": 0.81572437, + "learning_rate": 2.2314049902854446e-06, + "loss": 0.83496881, + "num_input_tokens_seen": 171312215, + "router_z_loss_clip": 2.77734375, + "router_z_loss_mlp": 0.36425781, + "step": 7970, + "time_per_iteration": 2.7636051177978516 + }, + { + "auxiliary_loss_clip": 0.01546742, + "auxiliary_loss_mlp": 0.00399042, + "balance_loss_clip": 1.26683939, + "balance_loss_mlp": 0.36232582, + "epoch": 0.4792424470163836, + "flos": 24751465683840.0, + "grad_norm": 5.597738164226211, + "language_loss": 0.75937128, + "learning_rate": 2.231018139877349e-06, + "loss": 0.77882922, + "num_input_tokens_seen": 171332975, + "router_z_loss_clip": 2.79882812, + "router_z_loss_mlp": 0.36743164, + "step": 7971, + "time_per_iteration": 2.7697737216949463 + }, + { + "auxiliary_loss_clip": 0.01533993, + "auxiliary_loss_mlp": 0.0041152, + "balance_loss_clip": 1.26043987, + "balance_loss_mlp": 0.37256235, + "epoch": 0.47930257026905154, + "flos": 23257905240960.0, + "grad_norm": 2.9541636755991587, + "language_loss": 0.84265685, + "learning_rate": 2.230631280709021e-06, + "loss": 0.86211205, + "num_input_tokens_seen": 171353880, + "router_z_loss_clip": 2.74023438, + "router_z_loss_mlp": 0.3894043, + "step": 7972, + "time_per_iteration": 2.7200512886047363 + }, + { + "auxiliary_loss_clip": 0.015468, + "auxiliary_loss_mlp": 0.00419131, + "balance_loss_clip": 1.26637852, + "balance_loss_mlp": 0.37800363, + "epoch": 0.4793626935217195, + "flos": 14064091718400.0, + "grad_norm": 17.181006537723253, + "language_loss": 0.77395737, + "learning_rate": 2.2302444127951327e-06, + "loss": 0.79361665, + "num_input_tokens_seen": 171370930, + "router_z_loss_clip": 2.80273438, + "router_z_loss_mlp": 0.41113281, + "step": 7973, + "time_per_iteration": 2.64968204498291 + }, + { + "auxiliary_loss_clip": 0.01557056, + "auxiliary_loss_mlp": 0.00394317, + "balance_loss_clip": 1.28194273, + "balance_loss_mlp": 0.35814887, + "epoch": 0.4794228167743875, + "flos": 21798854789760.0, + "grad_norm": 6.359660294550561, + "language_loss": 0.8449862, + "learning_rate": 2.2298575361503523e-06, + "loss": 0.86449987, + "num_input_tokens_seen": 171387575, + "router_z_loss_clip": 2.74804688, + "router_z_loss_mlp": 0.36157227, + "step": 7974, + "time_per_iteration": 2.6696622371673584 + }, + { + "auxiliary_loss_clip": 0.01410806, + "auxiliary_loss_mlp": 0.00114837, + "balance_loss_clip": 1.25074744, + "balance_loss_mlp": 0.10520476, + "epoch": 0.47948294002705544, + "flos": 66968805553920.0, + "grad_norm": 0.7527598118688152, + "language_loss": 0.53713906, + "learning_rate": 2.2294706507893517e-06, + "loss": 0.55239546, + "num_input_tokens_seen": 171449980, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.09619141, + "step": 7975, + "time_per_iteration": 3.2051451206207275 + }, + { + "auxiliary_loss_clip": 0.0155547, + "auxiliary_loss_mlp": 0.00445196, + "balance_loss_clip": 1.27001858, + "balance_loss_mlp": 0.40423554, + "epoch": 0.47954306327972346, + "flos": 12422471414400.0, + "grad_norm": 19.13541482651737, + "language_loss": 0.9685809, + "learning_rate": 2.2290837567268008e-06, + "loss": 0.98858762, + "num_input_tokens_seen": 171465290, + "router_z_loss_clip": 2.85742188, + "router_z_loss_mlp": 0.40942383, + "step": 7976, + "time_per_iteration": 2.6276350021362305 + }, + { + "auxiliary_loss_clip": 0.01565359, + "auxiliary_loss_mlp": 0.00436133, + "balance_loss_clip": 1.27544034, + "balance_loss_mlp": 0.39605486, + "epoch": 0.4796031865323914, + "flos": 18361951799040.0, + "grad_norm": 59.10197385788787, + "language_loss": 0.81575656, + "learning_rate": 2.2286968539773713e-06, + "loss": 0.83577144, + "num_input_tokens_seen": 171481130, + "router_z_loss_clip": 2.89648438, + "router_z_loss_mlp": 0.40087891, + "step": 7977, + "time_per_iteration": 2.7268686294555664 + }, + { + "auxiliary_loss_clip": 0.01548562, + "auxiliary_loss_mlp": 0.00358728, + "balance_loss_clip": 1.27250814, + "balance_loss_mlp": 0.32534969, + "epoch": 0.4796633097850594, + "flos": 21835088634240.0, + "grad_norm": 10.252436768332622, + "language_loss": 0.83358824, + "learning_rate": 2.228309942555734e-06, + "loss": 0.85266119, + "num_input_tokens_seen": 171501140, + "router_z_loss_clip": 2.76367188, + "router_z_loss_mlp": 0.33398438, + "step": 7978, + "time_per_iteration": 2.6384074687957764 + }, + { + "auxiliary_loss_clip": 0.01562664, + "auxiliary_loss_mlp": 0.00417919, + "balance_loss_clip": 1.27654421, + "balance_loss_mlp": 0.37829396, + "epoch": 0.47972343303772735, + "flos": 23437350610560.0, + "grad_norm": 65.48422638290694, + "language_loss": 0.95372599, + "learning_rate": 2.22792302247656e-06, + "loss": 0.97353184, + "num_input_tokens_seen": 171519835, + "router_z_loss_clip": 2.85742188, + "router_z_loss_mlp": 0.39624023, + "step": 7979, + "time_per_iteration": 2.6746113300323486 + }, + { + "auxiliary_loss_clip": 0.01549043, + "auxiliary_loss_mlp": 0.00417765, + "balance_loss_clip": 1.2672379, + "balance_loss_mlp": 0.37823552, + "epoch": 0.4797835562903953, + "flos": 24899776940160.0, + "grad_norm": 3.6526745687868294, + "language_loss": 0.82175314, + "learning_rate": 2.227536093754523e-06, + "loss": 0.84142125, + "num_input_tokens_seen": 171540980, + "router_z_loss_clip": 2.82226562, + "router_z_loss_mlp": 0.39526367, + "step": 7980, + "time_per_iteration": 2.6809744834899902 + }, + { + "auxiliary_loss_clip": 0.01557616, + "auxiliary_loss_mlp": 0.00455292, + "balance_loss_clip": 1.2675581, + "balance_loss_mlp": 0.41082698, + "epoch": 0.4798436795430633, + "flos": 35042996793600.0, + "grad_norm": 3.168400618760364, + "language_loss": 0.79593766, + "learning_rate": 2.227149156404295e-06, + "loss": 0.81606674, + "num_input_tokens_seen": 171563600, + "router_z_loss_clip": 2.90039062, + "router_z_loss_mlp": 0.44482422, + "step": 7981, + "time_per_iteration": 2.7928946018218994 + }, + { + "auxiliary_loss_clip": 0.01561788, + "auxiliary_loss_mlp": 0.00413882, + "balance_loss_clip": 1.28160191, + "balance_loss_mlp": 0.37625918, + "epoch": 0.47990380279573125, + "flos": 20590209025920.0, + "grad_norm": 2.178419688022749, + "language_loss": 0.76847363, + "learning_rate": 2.2267622104405473e-06, + "loss": 0.7882303, + "num_input_tokens_seen": 171580700, + "router_z_loss_clip": 2.80078125, + "router_z_loss_mlp": 0.3762207, + "step": 7982, + "time_per_iteration": 2.693531036376953 + }, + { + "auxiliary_loss_clip": 0.01537705, + "auxiliary_loss_mlp": 0.00365349, + "balance_loss_clip": 1.26782084, + "balance_loss_mlp": 0.3324706, + "epoch": 0.4799639260483992, + "flos": 26359402008960.0, + "grad_norm": 6.055681142721246, + "language_loss": 0.77079988, + "learning_rate": 2.2263752558779544e-06, + "loss": 0.78983039, + "num_input_tokens_seen": 171602035, + "router_z_loss_clip": 2.69726562, + "router_z_loss_mlp": 0.32885742, + "step": 7983, + "time_per_iteration": 2.7311758995056152 + }, + { + "auxiliary_loss_clip": 0.01428551, + "auxiliary_loss_mlp": 0.00144512, + "balance_loss_clip": 1.26992857, + "balance_loss_mlp": 0.13344947, + "epoch": 0.4800240493010672, + "flos": 70979021521920.0, + "grad_norm": 0.7627743079400525, + "language_loss": 0.58813667, + "learning_rate": 2.2259882927311883e-06, + "loss": 0.60386729, + "num_input_tokens_seen": 171659215, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.11083984, + "step": 7984, + "time_per_iteration": 3.12073016166687 + }, + { + "auxiliary_loss_clip": 0.0153526, + "auxiliary_loss_mlp": 0.00378639, + "balance_loss_clip": 1.26532125, + "balance_loss_mlp": 0.34506959, + "epoch": 0.48008417255373514, + "flos": 17086656349440.0, + "grad_norm": 8.49216969045243, + "language_loss": 0.73404092, + "learning_rate": 2.2256013210149247e-06, + "loss": 0.75317991, + "num_input_tokens_seen": 171675710, + "router_z_loss_clip": 2.69921875, + "router_z_loss_mlp": 0.33569336, + "step": 7985, + "time_per_iteration": 2.6925485134124756 + }, + { + "auxiliary_loss_clip": 0.01558594, + "auxiliary_loss_mlp": 0.00400358, + "balance_loss_clip": 1.27424431, + "balance_loss_mlp": 0.36006564, + "epoch": 0.4801442958064031, + "flos": 15413435055360.0, + "grad_norm": 2.8617051399475084, + "language_loss": 0.76634693, + "learning_rate": 2.225214340743835e-06, + "loss": 0.78593647, + "num_input_tokens_seen": 171692510, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 0.40307617, + "step": 7986, + "time_per_iteration": 2.663752794265747 + }, + { + "auxiliary_loss_clip": 0.01547565, + "auxiliary_loss_mlp": 0.00388749, + "balance_loss_clip": 1.26455307, + "balance_loss_mlp": 0.35148454, + "epoch": 0.4802044190590711, + "flos": 11473747441920.0, + "grad_norm": 39.57537522313339, + "language_loss": 0.85005522, + "learning_rate": 2.2248273519325956e-06, + "loss": 0.86941838, + "num_input_tokens_seen": 171710235, + "router_z_loss_clip": 2.8359375, + "router_z_loss_mlp": 0.37280273, + "step": 7987, + "time_per_iteration": 2.616159677505493 + }, + { + "auxiliary_loss_clip": 0.01552561, + "auxiliary_loss_mlp": 0.00437076, + "balance_loss_clip": 1.2731775, + "balance_loss_mlp": 0.39582986, + "epoch": 0.48026454231173904, + "flos": 20951003185920.0, + "grad_norm": 235.40859018688536, + "language_loss": 0.81073004, + "learning_rate": 2.2244403545958812e-06, + "loss": 0.83062643, + "num_input_tokens_seen": 171726715, + "router_z_loss_clip": 2.796875, + "router_z_loss_mlp": 0.41259766, + "step": 7988, + "time_per_iteration": 2.7625763416290283 + }, + { + "auxiliary_loss_clip": 0.01573955, + "auxiliary_loss_mlp": 0.0040388, + "balance_loss_clip": 1.29030848, + "balance_loss_mlp": 0.36589983, + "epoch": 0.48032466556440706, + "flos": 20448110822400.0, + "grad_norm": 4.173847814566595, + "language_loss": 0.86640596, + "learning_rate": 2.224053348748365e-06, + "loss": 0.88618433, + "num_input_tokens_seen": 171743605, + "router_z_loss_clip": 2.83984375, + "router_z_loss_mlp": 0.37939453, + "step": 7989, + "time_per_iteration": 2.7317442893981934 + }, + { + "auxiliary_loss_clip": 0.0155817, + "auxiliary_loss_mlp": 0.00398125, + "balance_loss_clip": 1.27528143, + "balance_loss_mlp": 0.36088407, + "epoch": 0.480384788817075, + "flos": 37120823861760.0, + "grad_norm": 36.55565632149364, + "language_loss": 0.79115129, + "learning_rate": 2.223666334404724e-06, + "loss": 0.81071424, + "num_input_tokens_seen": 171765445, + "router_z_loss_clip": 2.82617188, + "router_z_loss_mlp": 0.37255859, + "step": 7990, + "time_per_iteration": 2.9646973609924316 + }, + { + "auxiliary_loss_clip": 0.01447753, + "auxiliary_loss_mlp": 0.00170574, + "balance_loss_clip": 1.29104555, + "balance_loss_mlp": 0.16051297, + "epoch": 0.480444912069743, + "flos": 69552577641600.0, + "grad_norm": 0.7541637374572476, + "language_loss": 0.58710504, + "learning_rate": 2.223279311579633e-06, + "loss": 0.60328835, + "num_input_tokens_seen": 171830115, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.10058594, + "step": 7991, + "time_per_iteration": 3.2527034282684326 + }, + { + "auxiliary_loss_clip": 0.01566413, + "auxiliary_loss_mlp": 0.0044548, + "balance_loss_clip": 1.28557003, + "balance_loss_mlp": 0.40478203, + "epoch": 0.48050503532241096, + "flos": 29822231640960.0, + "grad_norm": 14.651102734714405, + "language_loss": 0.74065506, + "learning_rate": 2.222892280287768e-06, + "loss": 0.76077396, + "num_input_tokens_seen": 171849135, + "router_z_loss_clip": 2.80859375, + "router_z_loss_mlp": 0.40673828, + "step": 7992, + "time_per_iteration": 2.760547637939453 + }, + { + "auxiliary_loss_clip": 0.01564905, + "auxiliary_loss_mlp": 0.00408045, + "balance_loss_clip": 1.28490806, + "balance_loss_mlp": 0.3699699, + "epoch": 0.4805651585750789, + "flos": 23948539015680.0, + "grad_norm": 253.93372394330098, + "language_loss": 0.80410773, + "learning_rate": 2.2225052405438056e-06, + "loss": 0.82383728, + "num_input_tokens_seen": 171868880, + "router_z_loss_clip": 2.796875, + "router_z_loss_mlp": 0.38085938, + "step": 7993, + "time_per_iteration": 2.714796304702759 + }, + { + "auxiliary_loss_clip": 0.01565615, + "auxiliary_loss_mlp": 0.00382092, + "balance_loss_clip": 1.2914418, + "balance_loss_mlp": 0.34663954, + "epoch": 0.4806252818277469, + "flos": 25665428269440.0, + "grad_norm": 42.97359733364314, + "language_loss": 0.85433102, + "learning_rate": 2.222118192362422e-06, + "loss": 0.87380815, + "num_input_tokens_seen": 171889455, + "router_z_loss_clip": 2.7421875, + "router_z_loss_mlp": 0.35473633, + "step": 7994, + "time_per_iteration": 2.7213151454925537 + }, + { + "auxiliary_loss_clip": 0.0158089, + "auxiliary_loss_mlp": 0.00401809, + "balance_loss_clip": 1.29721785, + "balance_loss_mlp": 0.36416304, + "epoch": 0.48068540508041485, + "flos": 13151996640000.0, + "grad_norm": 10.02373248049652, + "language_loss": 0.87023056, + "learning_rate": 2.2217311357582946e-06, + "loss": 0.89005756, + "num_input_tokens_seen": 171906070, + "router_z_loss_clip": 2.8359375, + "router_z_loss_mlp": 0.37670898, + "step": 7995, + "time_per_iteration": 2.7214276790618896 + }, + { + "auxiliary_loss_clip": 0.0157707, + "auxiliary_loss_mlp": 0.00395424, + "balance_loss_clip": 1.30019903, + "balance_loss_mlp": 0.35861251, + "epoch": 0.4807455283330828, + "flos": 21176738208000.0, + "grad_norm": 2.5031342867998236, + "language_loss": 0.87605703, + "learning_rate": 2.2213440707461e-06, + "loss": 0.89578193, + "num_input_tokens_seen": 171926515, + "router_z_loss_clip": 2.76953125, + "router_z_loss_mlp": 0.36791992, + "step": 7996, + "time_per_iteration": 4.208987474441528 + }, + { + "auxiliary_loss_clip": 0.01586747, + "auxiliary_loss_mlp": 0.00433732, + "balance_loss_clip": 1.30886412, + "balance_loss_mlp": 0.39684826, + "epoch": 0.4808056515857508, + "flos": 12275991751680.0, + "grad_norm": 9.282874260692273, + "language_loss": 0.86432165, + "learning_rate": 2.220956997340516e-06, + "loss": 0.88452649, + "num_input_tokens_seen": 171943845, + "router_z_loss_clip": 2.77929688, + "router_z_loss_mlp": 0.36889648, + "step": 7997, + "time_per_iteration": 2.71414852142334 + }, + { + "auxiliary_loss_clip": 0.01587366, + "auxiliary_loss_mlp": 0.0044111, + "balance_loss_clip": 1.30391622, + "balance_loss_mlp": 0.40205681, + "epoch": 0.48086577483841875, + "flos": 24826052275200.0, + "grad_norm": 7.450401424387744, + "language_loss": 0.76997709, + "learning_rate": 2.220569915556221e-06, + "loss": 0.79026186, + "num_input_tokens_seen": 171964970, + "router_z_loss_clip": 2.8359375, + "router_z_loss_mlp": 0.39086914, + "step": 7998, + "time_per_iteration": 4.256881475448608 + }, + { + "auxiliary_loss_clip": 0.01579055, + "auxiliary_loss_mlp": 0.00396022, + "balance_loss_clip": 1.29975438, + "balance_loss_mlp": 0.35987824, + "epoch": 0.4809258980910867, + "flos": 24465365856000.0, + "grad_norm": 21.407954382931447, + "language_loss": 0.76053536, + "learning_rate": 2.220182825407892e-06, + "loss": 0.78028613, + "num_input_tokens_seen": 171986340, + "router_z_loss_clip": 2.796875, + "router_z_loss_mlp": 0.36157227, + "step": 7999, + "time_per_iteration": 2.702115535736084 + }, + { + "auxiliary_loss_clip": 0.01588058, + "auxiliary_loss_mlp": 0.00466635, + "balance_loss_clip": 1.30554748, + "balance_loss_mlp": 0.42712909, + "epoch": 0.4809860213437547, + "flos": 21215952881280.0, + "grad_norm": 38.153336091706954, + "language_loss": 0.76447111, + "learning_rate": 2.2197957269102083e-06, + "loss": 0.78501809, + "num_input_tokens_seen": 172007300, + "router_z_loss_clip": 2.82421875, + "router_z_loss_mlp": 0.39526367, + "step": 8000, + "time_per_iteration": 2.7140302658081055 + }, + { + "auxiliary_loss_clip": 0.01587548, + "auxiliary_loss_mlp": 0.00427944, + "balance_loss_clip": 1.30657268, + "balance_loss_mlp": 0.38862836, + "epoch": 0.48104614459642264, + "flos": 37632084094080.0, + "grad_norm": 7.673158857136519, + "language_loss": 0.78541839, + "learning_rate": 2.2194086200778485e-06, + "loss": 0.80557334, + "num_input_tokens_seen": 172029585, + "router_z_loss_clip": 2.80664062, + "router_z_loss_mlp": 0.39306641, + "step": 8001, + "time_per_iteration": 4.112316131591797 + }, + { + "auxiliary_loss_clip": 0.01599108, + "auxiliary_loss_mlp": 0.00433878, + "balance_loss_clip": 1.31554818, + "balance_loss_mlp": 0.39377564, + "epoch": 0.48110626784909066, + "flos": 18406122549120.0, + "grad_norm": 2.774281698056675, + "language_loss": 0.85347486, + "learning_rate": 2.219021504925493e-06, + "loss": 0.87380469, + "num_input_tokens_seen": 172047495, + "router_z_loss_clip": 2.8359375, + "router_z_loss_mlp": 0.40087891, + "step": 8002, + "time_per_iteration": 2.68402099609375 + }, + { + "auxiliary_loss_clip": 0.01593414, + "auxiliary_loss_mlp": 0.00443816, + "balance_loss_clip": 1.30670583, + "balance_loss_mlp": 0.40285566, + "epoch": 0.48116639110175863, + "flos": 28439814856320.0, + "grad_norm": 4.5040155007402625, + "language_loss": 0.77137631, + "learning_rate": 2.218634381467819e-06, + "loss": 0.79174864, + "num_input_tokens_seen": 172067625, + "router_z_loss_clip": 2.8671875, + "router_z_loss_mlp": 0.40991211, + "step": 8003, + "time_per_iteration": 2.7268877029418945 + }, + { + "auxiliary_loss_clip": 0.01589248, + "auxiliary_loss_mlp": 0.00392897, + "balance_loss_clip": 1.31498337, + "balance_loss_mlp": 0.35644257, + "epoch": 0.4812265143544266, + "flos": 21725237865600.0, + "grad_norm": 174.42427255693875, + "language_loss": 0.87687123, + "learning_rate": 2.218247249719507e-06, + "loss": 0.89669269, + "num_input_tokens_seen": 172087885, + "router_z_loss_clip": 2.7421875, + "router_z_loss_mlp": 0.36474609, + "step": 8004, + "time_per_iteration": 2.6799049377441406 + }, + { + "auxiliary_loss_clip": 0.01610195, + "auxiliary_loss_mlp": 0.00448389, + "balance_loss_clip": 1.3119936, + "balance_loss_mlp": 0.40544939, + "epoch": 0.48128663760709456, + "flos": 13224679810560.0, + "grad_norm": 5.087716273443497, + "language_loss": 0.85910463, + "learning_rate": 2.217860109695239e-06, + "loss": 0.87969041, + "num_input_tokens_seen": 172105815, + "router_z_loss_clip": 2.98242188, + "router_z_loss_mlp": 0.42944336, + "step": 8005, + "time_per_iteration": 2.623615264892578 + }, + { + "auxiliary_loss_clip": 0.01616592, + "auxiliary_loss_mlp": 0.00441215, + "balance_loss_clip": 1.32701766, + "balance_loss_mlp": 0.40187606, + "epoch": 0.4813467608597625, + "flos": 24243437675520.0, + "grad_norm": 2.3213638543367603, + "language_loss": 0.77460217, + "learning_rate": 2.217472961409692e-06, + "loss": 0.79518026, + "num_input_tokens_seen": 172126125, + "router_z_loss_clip": 2.8984375, + "router_z_loss_mlp": 0.39331055, + "step": 8006, + "time_per_iteration": 4.161262035369873 + }, + { + "auxiliary_loss_clip": 0.01624744, + "auxiliary_loss_mlp": 0.00457896, + "balance_loss_clip": 1.33766651, + "balance_loss_mlp": 0.41686457, + "epoch": 0.4814068841124305, + "flos": 27480424544640.0, + "grad_norm": 23.99154501449194, + "language_loss": 0.76320207, + "learning_rate": 2.2170858048775495e-06, + "loss": 0.78402853, + "num_input_tokens_seen": 172141945, + "router_z_loss_clip": 2.86914062, + "router_z_loss_mlp": 0.40991211, + "step": 8007, + "time_per_iteration": 2.663813829421997 + }, + { + "auxiliary_loss_clip": 0.01619738, + "auxiliary_loss_mlp": 0.00436388, + "balance_loss_clip": 1.33410561, + "balance_loss_mlp": 0.3962864, + "epoch": 0.48146700736509845, + "flos": 19572896033280.0, + "grad_norm": 3.1065804109554196, + "language_loss": 0.7616896, + "learning_rate": 2.2166986401134914e-06, + "loss": 0.78225088, + "num_input_tokens_seen": 172161095, + "router_z_loss_clip": 2.85742188, + "router_z_loss_mlp": 0.40087891, + "step": 8008, + "time_per_iteration": 2.837277412414551 + }, + { + "auxiliary_loss_clip": 0.01617405, + "auxiliary_loss_mlp": 0.00458063, + "balance_loss_clip": 1.32860184, + "balance_loss_mlp": 0.41486144, + "epoch": 0.4815271306177664, + "flos": 20627771673600.0, + "grad_norm": 67.89108454909244, + "language_loss": 0.68515629, + "learning_rate": 2.216311467132199e-06, + "loss": 0.70591092, + "num_input_tokens_seen": 172178750, + "router_z_loss_clip": 2.88671875, + "router_z_loss_mlp": 0.43164062, + "step": 8009, + "time_per_iteration": 2.7527575492858887 + }, + { + "auxiliary_loss_clip": 0.01388841, + "auxiliary_loss_mlp": 0.00102666, + "balance_loss_clip": 1.22904408, + "balance_loss_mlp": 0.09393951, + "epoch": 0.4815872538704344, + "flos": 67691076232320.0, + "grad_norm": 0.8753545349380699, + "language_loss": 0.60408676, + "learning_rate": 2.2159242859483547e-06, + "loss": 0.61900187, + "num_input_tokens_seen": 172240235, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.08740234, + "step": 8010, + "time_per_iteration": 3.2240664958953857 + }, + { + "auxiliary_loss_clip": 0.01616766, + "auxiliary_loss_mlp": 0.00413438, + "balance_loss_clip": 1.33793294, + "balance_loss_mlp": 0.37431353, + "epoch": 0.48164737712310235, + "flos": 22820764723200.0, + "grad_norm": 7.104737492643966, + "language_loss": 0.79388815, + "learning_rate": 2.215537096576639e-06, + "loss": 0.81419015, + "num_input_tokens_seen": 172259875, + "router_z_loss_clip": 2.78710938, + "router_z_loss_mlp": 0.39135742, + "step": 8011, + "time_per_iteration": 2.810405731201172 + }, + { + "auxiliary_loss_clip": 0.0161573, + "auxiliary_loss_mlp": 0.00404289, + "balance_loss_clip": 1.34166348, + "balance_loss_mlp": 0.36995703, + "epoch": 0.4817075003757703, + "flos": 23733865382400.0, + "grad_norm": 91.61494647967987, + "language_loss": 0.85455751, + "learning_rate": 2.2151498990317354e-06, + "loss": 0.87475765, + "num_input_tokens_seen": 172280150, + "router_z_loss_clip": 2.73828125, + "router_z_loss_mlp": 0.34326172, + "step": 8012, + "time_per_iteration": 2.6857852935791016 + }, + { + "auxiliary_loss_clip": 0.01616984, + "auxiliary_loss_mlp": 0.0043134, + "balance_loss_clip": 1.33584166, + "balance_loss_mlp": 0.39228749, + "epoch": 0.4817676236284383, + "flos": 28182909807360.0, + "grad_norm": 39.086304055731524, + "language_loss": 0.79932106, + "learning_rate": 2.214762693328326e-06, + "loss": 0.81980431, + "num_input_tokens_seen": 172300810, + "router_z_loss_clip": 2.80859375, + "router_z_loss_mlp": 0.390625, + "step": 8013, + "time_per_iteration": 2.7622461318969727 + }, + { + "auxiliary_loss_clip": 0.01632288, + "auxiliary_loss_mlp": 0.00385552, + "balance_loss_clip": 1.35179472, + "balance_loss_mlp": 0.34809595, + "epoch": 0.48182774688110624, + "flos": 17091756080640.0, + "grad_norm": 15.108661024935254, + "language_loss": 0.97349489, + "learning_rate": 2.214375479481094e-06, + "loss": 0.99367326, + "num_input_tokens_seen": 172317930, + "router_z_loss_clip": 2.8046875, + "router_z_loss_mlp": 0.37451172, + "step": 8014, + "time_per_iteration": 2.631533622741699 + }, + { + "auxiliary_loss_clip": 0.01612791, + "auxiliary_loss_mlp": 0.00465184, + "balance_loss_clip": 1.32771325, + "balance_loss_mlp": 0.42312673, + "epoch": 0.4818878701337742, + "flos": 12567873669120.0, + "grad_norm": 8.494339955660225, + "language_loss": 0.81653327, + "learning_rate": 2.213988257504722e-06, + "loss": 0.837313, + "num_input_tokens_seen": 172336340, + "router_z_loss_clip": 2.84765625, + "router_z_loss_mlp": 0.4206543, + "step": 8015, + "time_per_iteration": 2.6446306705474854 + }, + { + "auxiliary_loss_clip": 0.01608787, + "auxiliary_loss_mlp": 0.00417281, + "balance_loss_clip": 1.32838702, + "balance_loss_mlp": 0.37822801, + "epoch": 0.48194799338644223, + "flos": 24608505553920.0, + "grad_norm": 326.9077435249571, + "language_loss": 0.86441863, + "learning_rate": 2.213601027413894e-06, + "loss": 0.88467932, + "num_input_tokens_seen": 172354315, + "router_z_loss_clip": 2.80664062, + "router_z_loss_mlp": 0.39038086, + "step": 8016, + "time_per_iteration": 2.732821226119995 + }, + { + "auxiliary_loss_clip": 0.01600647, + "auxiliary_loss_mlp": 0.00393532, + "balance_loss_clip": 1.33106303, + "balance_loss_mlp": 0.35624337, + "epoch": 0.4820081166391102, + "flos": 21105204272640.0, + "grad_norm": 27.7324377245132, + "language_loss": 0.84027976, + "learning_rate": 2.2132137892232933e-06, + "loss": 0.86022162, + "num_input_tokens_seen": 172372695, + "router_z_loss_clip": 2.69921875, + "router_z_loss_mlp": 0.37280273, + "step": 8017, + "time_per_iteration": 2.690098524093628 + }, + { + "auxiliary_loss_clip": 0.01607905, + "auxiliary_loss_mlp": 0.00407813, + "balance_loss_clip": 1.33715463, + "balance_loss_mlp": 0.37138277, + "epoch": 0.48206823989177816, + "flos": 25264593423360.0, + "grad_norm": 13.783640466689613, + "language_loss": 0.85042894, + "learning_rate": 2.2128265429476043e-06, + "loss": 0.8705861, + "num_input_tokens_seen": 172390905, + "router_z_loss_clip": 2.70703125, + "router_z_loss_mlp": 0.36450195, + "step": 8018, + "time_per_iteration": 2.7563774585723877 + }, + { + "auxiliary_loss_clip": 0.01641498, + "auxiliary_loss_mlp": 0.00434357, + "balance_loss_clip": 1.35003936, + "balance_loss_mlp": 0.39511353, + "epoch": 0.4821283631444461, + "flos": 24645062620800.0, + "grad_norm": 12.416365851088491, + "language_loss": 0.82492602, + "learning_rate": 2.2124392886015124e-06, + "loss": 0.84568459, + "num_input_tokens_seen": 172412295, + "router_z_loss_clip": 2.91796875, + "router_z_loss_mlp": 0.39257812, + "step": 8019, + "time_per_iteration": 2.7875239849090576 + }, + { + "auxiliary_loss_clip": 0.01624269, + "auxiliary_loss_mlp": 0.00412181, + "balance_loss_clip": 1.34347045, + "balance_loss_mlp": 0.37439138, + "epoch": 0.4821884863971141, + "flos": 23952094462080.0, + "grad_norm": 19.201091865066687, + "language_loss": 0.85554081, + "learning_rate": 2.212052026199701e-06, + "loss": 0.87590528, + "num_input_tokens_seen": 172432625, + "router_z_loss_clip": 2.80664062, + "router_z_loss_mlp": 0.37792969, + "step": 8020, + "time_per_iteration": 2.6882903575897217 + }, + { + "auxiliary_loss_clip": 0.01613569, + "auxiliary_loss_mlp": 0.0038361, + "balance_loss_clip": 1.33572769, + "balance_loss_mlp": 0.34713209, + "epoch": 0.48224860964978206, + "flos": 17160668323200.0, + "grad_norm": 141.3634961139285, + "language_loss": 0.76649827, + "learning_rate": 2.211664755756855e-06, + "loss": 0.78647006, + "num_input_tokens_seen": 172450010, + "router_z_loss_clip": 2.77734375, + "router_z_loss_mlp": 0.36450195, + "step": 8021, + "time_per_iteration": 2.6767866611480713 + }, + { + "auxiliary_loss_clip": 0.01625495, + "auxiliary_loss_mlp": 0.00404082, + "balance_loss_clip": 1.33691049, + "balance_loss_mlp": 0.36519587, + "epoch": 0.48230873290245, + "flos": 23075838178560.0, + "grad_norm": 33.75528918231387, + "language_loss": 0.70192868, + "learning_rate": 2.2112774772876603e-06, + "loss": 0.72222447, + "num_input_tokens_seen": 172469080, + "router_z_loss_clip": 2.88671875, + "router_z_loss_mlp": 0.38867188, + "step": 8022, + "time_per_iteration": 2.7344892024993896 + }, + { + "auxiliary_loss_clip": 0.01613008, + "auxiliary_loss_mlp": 0.0038706, + "balance_loss_clip": 1.34094679, + "balance_loss_mlp": 0.3517974, + "epoch": 0.482368856155118, + "flos": 19353517718400.0, + "grad_norm": 9.312061379591139, + "language_loss": 0.72722602, + "learning_rate": 2.2108901908068028e-06, + "loss": 0.7472266, + "num_input_tokens_seen": 172484850, + "router_z_loss_clip": 2.72070312, + "router_z_loss_mlp": 0.35253906, + "step": 8023, + "time_per_iteration": 2.658505916595459 + }, + { + "auxiliary_loss_clip": 0.01632512, + "auxiliary_loss_mlp": 0.00410752, + "balance_loss_clip": 1.35387254, + "balance_loss_mlp": 0.37215248, + "epoch": 0.48242897940778595, + "flos": 20078984707200.0, + "grad_norm": 310.43031380331155, + "language_loss": 0.82675183, + "learning_rate": 2.2105028963289683e-06, + "loss": 0.84718442, + "num_input_tokens_seen": 172503525, + "router_z_loss_clip": 2.7890625, + "router_z_loss_mlp": 0.38549805, + "step": 8024, + "time_per_iteration": 2.666679620742798 + }, + { + "auxiliary_loss_clip": 0.01630212, + "auxiliary_loss_mlp": 0.00389464, + "balance_loss_clip": 1.35131013, + "balance_loss_mlp": 0.3494336, + "epoch": 0.4824891026604539, + "flos": 23403989854080.0, + "grad_norm": 38.39967750524729, + "language_loss": 0.812545, + "learning_rate": 2.2101155938688423e-06, + "loss": 0.83274174, + "num_input_tokens_seen": 172524360, + "router_z_loss_clip": 2.7890625, + "router_z_loss_mlp": 0.40014648, + "step": 8025, + "time_per_iteration": 2.716071605682373 + }, + { + "auxiliary_loss_clip": 0.01631468, + "auxiliary_loss_mlp": 0.003878, + "balance_loss_clip": 1.34959435, + "balance_loss_mlp": 0.35136938, + "epoch": 0.4825492259131219, + "flos": 20368675895040.0, + "grad_norm": 3.112945319240416, + "language_loss": 0.80075961, + "learning_rate": 2.209728283441112e-06, + "loss": 0.8209523, + "num_input_tokens_seen": 172541480, + "router_z_loss_clip": 2.81835938, + "router_z_loss_mlp": 0.36450195, + "step": 8026, + "time_per_iteration": 2.699601411819458 + }, + { + "auxiliary_loss_clip": 0.01632629, + "auxiliary_loss_mlp": 0.00461312, + "balance_loss_clip": 1.35070395, + "balance_loss_mlp": 0.42206839, + "epoch": 0.48260934916578985, + "flos": 14319021519360.0, + "grad_norm": 58.13443584783437, + "language_loss": 0.83391035, + "learning_rate": 2.209340965060465e-06, + "loss": 0.8548497, + "num_input_tokens_seen": 172559005, + "router_z_loss_clip": 2.81835938, + "router_z_loss_mlp": 0.39306641, + "step": 8027, + "time_per_iteration": 2.6185193061828613 + }, + { + "auxiliary_loss_clip": 0.01638633, + "auxiliary_loss_mlp": 0.00389618, + "balance_loss_clip": 1.35380578, + "balance_loss_mlp": 0.35249627, + "epoch": 0.4826694724184578, + "flos": 22121152548480.0, + "grad_norm": 11.213807844376648, + "language_loss": 0.74286121, + "learning_rate": 2.2089536387415868e-06, + "loss": 0.76314378, + "num_input_tokens_seen": 172578435, + "router_z_loss_clip": 2.84765625, + "router_z_loss_mlp": 0.37109375, + "step": 8028, + "time_per_iteration": 2.6815125942230225 + }, + { + "auxiliary_loss_clip": 0.01636361, + "auxiliary_loss_mlp": 0.00441485, + "balance_loss_clip": 1.35239255, + "balance_loss_mlp": 0.40095422, + "epoch": 0.48272959567112583, + "flos": 16181169373440.0, + "grad_norm": 50.8814512929007, + "language_loss": 0.7855882, + "learning_rate": 2.2085663044991655e-06, + "loss": 0.80636668, + "num_input_tokens_seen": 172596095, + "router_z_loss_clip": 2.83984375, + "router_z_loss_mlp": 0.40551758, + "step": 8029, + "time_per_iteration": 2.6577014923095703 + }, + { + "auxiliary_loss_clip": 0.01658368, + "auxiliary_loss_mlp": 0.00450845, + "balance_loss_clip": 1.36826146, + "balance_loss_mlp": 0.41195869, + "epoch": 0.4827897189237938, + "flos": 23180445561600.0, + "grad_norm": 143.225308416906, + "language_loss": 0.89874113, + "learning_rate": 2.2081789623478896e-06, + "loss": 0.9198333, + "num_input_tokens_seen": 172615255, + "router_z_loss_clip": 2.8984375, + "router_z_loss_mlp": 0.38916016, + "step": 8030, + "time_per_iteration": 2.697082042694092 + }, + { + "auxiliary_loss_clip": 0.01650957, + "auxiliary_loss_mlp": 0.00453811, + "balance_loss_clip": 1.37329948, + "balance_loss_mlp": 0.4167372, + "epoch": 0.48284984217646176, + "flos": 21652626522240.0, + "grad_norm": 8.13886854475101, + "language_loss": 0.78239828, + "learning_rate": 2.2077916123024466e-06, + "loss": 0.80344594, + "num_input_tokens_seen": 172633185, + "router_z_loss_clip": 2.77929688, + "router_z_loss_mlp": 0.37060547, + "step": 8031, + "time_per_iteration": 2.77010440826416 + }, + { + "auxiliary_loss_clip": 0.01666218, + "auxiliary_loss_mlp": 0.00444017, + "balance_loss_clip": 1.36912966, + "balance_loss_mlp": 0.40238935, + "epoch": 0.48290996542912973, + "flos": 31467443304960.0, + "grad_norm": 20.88136524397335, + "language_loss": 0.77754676, + "learning_rate": 2.2074042543775245e-06, + "loss": 0.79864907, + "num_input_tokens_seen": 172654280, + "router_z_loss_clip": 2.97460938, + "router_z_loss_mlp": 0.41625977, + "step": 8032, + "time_per_iteration": 2.840233564376831 + }, + { + "auxiliary_loss_clip": 0.01644156, + "auxiliary_loss_mlp": 0.00438216, + "balance_loss_clip": 1.36176658, + "balance_loss_mlp": 0.39873433, + "epoch": 0.4829700886817977, + "flos": 24461954064000.0, + "grad_norm": 5.820128943944328, + "language_loss": 0.80736208, + "learning_rate": 2.2070168885878126e-06, + "loss": 0.8281858, + "num_input_tokens_seen": 172675545, + "router_z_loss_clip": 2.82226562, + "router_z_loss_mlp": 0.39453125, + "step": 8033, + "time_per_iteration": 2.693984031677246 + }, + { + "auxiliary_loss_clip": 0.01680535, + "auxiliary_loss_mlp": 0.00464778, + "balance_loss_clip": 1.38621223, + "balance_loss_mlp": 0.42443722, + "epoch": 0.48303021193446566, + "flos": 25702164904320.0, + "grad_norm": 4.077800504976528, + "language_loss": 0.88431275, + "learning_rate": 2.2066295149479996e-06, + "loss": 0.90576583, + "num_input_tokens_seen": 172696455, + "router_z_loss_clip": 2.94335938, + "router_z_loss_mlp": 0.40332031, + "step": 8034, + "time_per_iteration": 2.7396440505981445 + }, + { + "auxiliary_loss_clip": 0.01642876, + "auxiliary_loss_mlp": 0.00419231, + "balance_loss_clip": 1.36426604, + "balance_loss_mlp": 0.38308704, + "epoch": 0.4830903351871336, + "flos": 20085233673600.0, + "grad_norm": 3.34942255179597, + "language_loss": 0.84718019, + "learning_rate": 2.2062421334727744e-06, + "loss": 0.86780125, + "num_input_tokens_seen": 172716720, + "router_z_loss_clip": 2.78710938, + "router_z_loss_mlp": 0.36157227, + "step": 8035, + "time_per_iteration": 2.919151544570923 + }, + { + "auxiliary_loss_clip": 0.01646712, + "auxiliary_loss_mlp": 0.00397359, + "balance_loss_clip": 1.36297011, + "balance_loss_mlp": 0.36085743, + "epoch": 0.4831504584398016, + "flos": 39452216014080.0, + "grad_norm": 227.05581359270641, + "language_loss": 0.74366939, + "learning_rate": 2.2058547441768267e-06, + "loss": 0.76411009, + "num_input_tokens_seen": 172737435, + "router_z_loss_clip": 2.83984375, + "router_z_loss_mlp": 0.36499023, + "step": 8036, + "time_per_iteration": 2.8102447986602783 + }, + { + "auxiliary_loss_clip": 0.01648639, + "auxiliary_loss_mlp": 0.00423274, + "balance_loss_clip": 1.35919523, + "balance_loss_mlp": 0.38505584, + "epoch": 0.48321058169246955, + "flos": 20006588845440.0, + "grad_norm": 16.204581947993134, + "language_loss": 0.77828407, + "learning_rate": 2.205467347074847e-06, + "loss": 0.79900324, + "num_input_tokens_seen": 172755700, + "router_z_loss_clip": 2.89648438, + "router_z_loss_mlp": 0.38183594, + "step": 8037, + "time_per_iteration": 2.6765575408935547 + }, + { + "auxiliary_loss_clip": 0.01647232, + "auxiliary_loss_mlp": 0.00436109, + "balance_loss_clip": 1.35406184, + "balance_loss_mlp": 0.39581633, + "epoch": 0.4832707049451375, + "flos": 20741465197440.0, + "grad_norm": 149.96363074041284, + "language_loss": 0.78878796, + "learning_rate": 2.205079942181525e-06, + "loss": 0.80962133, + "num_input_tokens_seen": 172775185, + "router_z_loss_clip": 2.92773438, + "router_z_loss_mlp": 0.40258789, + "step": 8038, + "time_per_iteration": 4.124361991882324 + }, + { + "auxiliary_loss_clip": 0.01650092, + "auxiliary_loss_mlp": 0.0040824, + "balance_loss_clip": 1.36160779, + "balance_loss_mlp": 0.36861485, + "epoch": 0.4833308281978055, + "flos": 33145584762240.0, + "grad_norm": 1.529705727347306, + "language_loss": 0.83179653, + "learning_rate": 2.20469252951155e-06, + "loss": 0.85237992, + "num_input_tokens_seen": 172796990, + "router_z_loss_clip": 2.88671875, + "router_z_loss_mlp": 0.39624023, + "step": 8039, + "time_per_iteration": 2.767123222351074 + }, + { + "auxiliary_loss_clip": 0.01635349, + "auxiliary_loss_mlp": 0.00406576, + "balance_loss_clip": 1.34865642, + "balance_loss_mlp": 0.36687949, + "epoch": 0.48339095145047345, + "flos": 19099234362240.0, + "grad_norm": 7.090389362487774, + "language_loss": 0.83325565, + "learning_rate": 2.2043051090796143e-06, + "loss": 0.85367489, + "num_input_tokens_seen": 172814915, + "router_z_loss_clip": 2.8671875, + "router_z_loss_mlp": 0.39697266, + "step": 8040, + "time_per_iteration": 2.7224395275115967 + }, + { + "auxiliary_loss_clip": 0.01655685, + "auxiliary_loss_mlp": 0.00373618, + "balance_loss_clip": 1.36605072, + "balance_loss_mlp": 0.33790338, + "epoch": 0.4834510747031414, + "flos": 34459448440320.0, + "grad_norm": 34.02868983011996, + "language_loss": 0.82117456, + "learning_rate": 2.203917680900409e-06, + "loss": 0.84146762, + "num_input_tokens_seen": 172837060, + "router_z_loss_clip": 2.89648438, + "router_z_loss_mlp": 0.35717773, + "step": 8041, + "time_per_iteration": 4.295841932296753 + }, + { + "auxiliary_loss_clip": 0.0163166, + "auxiliary_loss_mlp": 0.00394641, + "balance_loss_clip": 1.35186911, + "balance_loss_mlp": 0.35899752, + "epoch": 0.48351119795580944, + "flos": 27380845065600.0, + "grad_norm": 39.03373033073294, + "language_loss": 0.73471773, + "learning_rate": 2.203530244988624e-06, + "loss": 0.7549808, + "num_input_tokens_seen": 172856545, + "router_z_loss_clip": 2.796875, + "router_z_loss_mlp": 0.35644531, + "step": 8042, + "time_per_iteration": 2.714569568634033 + }, + { + "auxiliary_loss_clip": 0.01349196, + "auxiliary_loss_mlp": 0.00158116, + "balance_loss_clip": 1.18357313, + "balance_loss_mlp": 0.1505339, + "epoch": 0.4835713212084774, + "flos": 67143941291520.0, + "grad_norm": 0.6942954345837632, + "language_loss": 0.57501698, + "learning_rate": 2.2031428013589517e-06, + "loss": 0.5900901, + "num_input_tokens_seen": 172923055, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.07568359, + "step": 8043, + "time_per_iteration": 4.5979697704315186 + }, + { + "auxiliary_loss_clip": 0.01640005, + "auxiliary_loss_mlp": 0.00404388, + "balance_loss_clip": 1.351511, + "balance_loss_mlp": 0.36588296, + "epoch": 0.48363144446114537, + "flos": 17967473660160.0, + "grad_norm": 7.130207629157485, + "language_loss": 0.79224575, + "learning_rate": 2.2027553500260847e-06, + "loss": 0.8126896, + "num_input_tokens_seen": 172940700, + "router_z_loss_clip": 2.8828125, + "router_z_loss_mlp": 0.38525391, + "step": 8044, + "time_per_iteration": 2.6346702575683594 + }, + { + "auxiliary_loss_clip": 0.01637964, + "auxiliary_loss_mlp": 0.00395432, + "balance_loss_clip": 1.35311937, + "balance_loss_mlp": 0.35811931, + "epoch": 0.48369156771381333, + "flos": 20593513077120.0, + "grad_norm": 3.6156248274444582, + "language_loss": 0.80452126, + "learning_rate": 2.202367891004714e-06, + "loss": 0.82485521, + "num_input_tokens_seen": 172961125, + "router_z_loss_clip": 2.84765625, + "router_z_loss_mlp": 0.37329102, + "step": 8045, + "time_per_iteration": 2.6600143909454346 + }, + { + "auxiliary_loss_clip": 0.01620676, + "auxiliary_loss_mlp": 0.00420378, + "balance_loss_clip": 1.33708811, + "balance_loss_mlp": 0.38211203, + "epoch": 0.4837516909664813, + "flos": 22675075159680.0, + "grad_norm": 51.21102904414122, + "language_loss": 0.74493039, + "learning_rate": 2.201980424309533e-06, + "loss": 0.76534092, + "num_input_tokens_seen": 172980405, + "router_z_loss_clip": 2.83203125, + "router_z_loss_mlp": 0.3828125, + "step": 8046, + "time_per_iteration": 2.745100736618042 + }, + { + "auxiliary_loss_clip": 0.01600388, + "auxiliary_loss_mlp": 0.0036796, + "balance_loss_clip": 1.32340872, + "balance_loss_mlp": 0.33408031, + "epoch": 0.48381181421914926, + "flos": 25518625384320.0, + "grad_norm": 5.044059022474448, + "language_loss": 0.88495994, + "learning_rate": 2.2015929499552337e-06, + "loss": 0.90464342, + "num_input_tokens_seen": 172999105, + "router_z_loss_clip": 2.76953125, + "router_z_loss_mlp": 0.33886719, + "step": 8047, + "time_per_iteration": 2.786492109298706 + }, + { + "auxiliary_loss_clip": 0.0161086, + "auxiliary_loss_mlp": 0.00406962, + "balance_loss_clip": 1.32917762, + "balance_loss_mlp": 0.36826676, + "epoch": 0.4838719374718172, + "flos": 24207491139840.0, + "grad_norm": 19.414888700485335, + "language_loss": 0.85925323, + "learning_rate": 2.2012054679565092e-06, + "loss": 0.87943149, + "num_input_tokens_seen": 173019935, + "router_z_loss_clip": 2.81640625, + "router_z_loss_mlp": 0.38720703, + "step": 8048, + "time_per_iteration": 4.070250511169434 + }, + { + "auxiliary_loss_clip": 0.01614677, + "auxiliary_loss_mlp": 0.00406342, + "balance_loss_clip": 1.32585335, + "balance_loss_mlp": 0.36666894, + "epoch": 0.4839320607244852, + "flos": 26724577628160.0, + "grad_norm": 182.55883552506276, + "language_loss": 0.86986542, + "learning_rate": 2.200817978328054e-06, + "loss": 0.89007556, + "num_input_tokens_seen": 173039700, + "router_z_loss_clip": 2.88867188, + "router_z_loss_mlp": 0.39672852, + "step": 8049, + "time_per_iteration": 2.694589614868164 + }, + { + "auxiliary_loss_clip": 0.0162737, + "auxiliary_loss_mlp": 0.0036476, + "balance_loss_clip": 1.34776723, + "balance_loss_mlp": 0.32818624, + "epoch": 0.48399218397715316, + "flos": 20448900921600.0, + "grad_norm": 7.4359126218212275, + "language_loss": 0.79277396, + "learning_rate": 2.2004304810845602e-06, + "loss": 0.81269526, + "num_input_tokens_seen": 173059170, + "router_z_loss_clip": 2.79492188, + "router_z_loss_mlp": 0.36572266, + "step": 8050, + "time_per_iteration": 2.6913018226623535 + }, + { + "auxiliary_loss_clip": 0.01335702, + "auxiliary_loss_mlp": 0.00079931, + "balance_loss_clip": 1.17098522, + "balance_loss_mlp": 0.07316026, + "epoch": 0.4840523072298211, + "flos": 67180570185600.0, + "grad_norm": 0.6968691840828601, + "language_loss": 0.55754405, + "learning_rate": 2.200042976240723e-06, + "loss": 0.57170039, + "num_input_tokens_seen": 173119000, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.06787109, + "step": 8051, + "time_per_iteration": 3.239795207977295 + }, + { + "auxiliary_loss_clip": 0.01599706, + "auxiliary_loss_mlp": 0.00366621, + "balance_loss_clip": 1.31843209, + "balance_loss_mlp": 0.32887915, + "epoch": 0.4841124304824891, + "flos": 22411490181120.0, + "grad_norm": 54.83090039164296, + "language_loss": 0.81973219, + "learning_rate": 2.199655463811236e-06, + "loss": 0.83939552, + "num_input_tokens_seen": 173137570, + "router_z_loss_clip": 2.8125, + "router_z_loss_mlp": 0.37744141, + "step": 8052, + "time_per_iteration": 2.709867000579834 + }, + { + "auxiliary_loss_clip": 0.01576744, + "auxiliary_loss_mlp": 0.00341687, + "balance_loss_clip": 1.30719268, + "balance_loss_mlp": 0.30571008, + "epoch": 0.48417255373515705, + "flos": 13843959217920.0, + "grad_norm": 23.171816902290583, + "language_loss": 0.74825776, + "learning_rate": 2.1992679438107936e-06, + "loss": 0.76744199, + "num_input_tokens_seen": 173154355, + "router_z_loss_clip": 2.69921875, + "router_z_loss_mlp": 0.35986328, + "step": 8053, + "time_per_iteration": 2.6731953620910645 + }, + { + "auxiliary_loss_clip": 0.0161088, + "auxiliary_loss_mlp": 0.00359388, + "balance_loss_clip": 1.33498406, + "balance_loss_mlp": 0.32302901, + "epoch": 0.484232676987825, + "flos": 31649689935360.0, + "grad_norm": 307.3499439822623, + "language_loss": 0.77673233, + "learning_rate": 2.198880416254091e-06, + "loss": 0.796435, + "num_input_tokens_seen": 173174845, + "router_z_loss_clip": 2.76367188, + "router_z_loss_mlp": 0.36376953, + "step": 8054, + "time_per_iteration": 2.7893402576446533 + }, + { + "auxiliary_loss_clip": 0.01574215, + "auxiliary_loss_mlp": 0.00352435, + "balance_loss_clip": 1.30341315, + "balance_loss_mlp": 0.31557518, + "epoch": 0.48429280024049304, + "flos": 24095377814400.0, + "grad_norm": 295.6348804874185, + "language_loss": 0.74983203, + "learning_rate": 2.1984928811558233e-06, + "loss": 0.76909852, + "num_input_tokens_seen": 173195025, + "router_z_loss_clip": 2.70898438, + "router_z_loss_mlp": 0.3684082, + "step": 8055, + "time_per_iteration": 2.7101423740386963 + }, + { + "auxiliary_loss_clip": 0.01602127, + "auxiliary_loss_mlp": 0.00354623, + "balance_loss_clip": 1.32135785, + "balance_loss_mlp": 0.31819266, + "epoch": 0.484352923493161, + "flos": 17530081747200.0, + "grad_norm": 3.8150180837274457, + "language_loss": 0.70393991, + "learning_rate": 2.198105338530685e-06, + "loss": 0.7235074, + "num_input_tokens_seen": 173213065, + "router_z_loss_clip": 2.8046875, + "router_z_loss_mlp": 0.36425781, + "step": 8056, + "time_per_iteration": 2.7163209915161133 + }, + { + "auxiliary_loss_clip": 0.01602118, + "auxiliary_loss_mlp": 0.00366491, + "balance_loss_clip": 1.32293713, + "balance_loss_mlp": 0.32729453, + "epoch": 0.48441304674582897, + "flos": 29166862043520.0, + "grad_norm": 297.44639207340396, + "language_loss": 0.73550767, + "learning_rate": 2.1977177883933726e-06, + "loss": 0.75519371, + "num_input_tokens_seen": 173234545, + "router_z_loss_clip": 2.796875, + "router_z_loss_mlp": 0.39208984, + "step": 8057, + "time_per_iteration": 2.7987265586853027 + }, + { + "auxiliary_loss_clip": 0.01580383, + "auxiliary_loss_mlp": 0.00331636, + "balance_loss_clip": 1.31070065, + "balance_loss_mlp": 0.29708868, + "epoch": 0.48447316999849693, + "flos": 15886701676800.0, + "grad_norm": 24.644515223384282, + "language_loss": 0.86836064, + "learning_rate": 2.1973302307585827e-06, + "loss": 0.88748085, + "num_input_tokens_seen": 173252175, + "router_z_loss_clip": 2.6953125, + "router_z_loss_mlp": 0.34521484, + "step": 8058, + "time_per_iteration": 2.6905975341796875 + }, + { + "auxiliary_loss_clip": 0.01607959, + "auxiliary_loss_mlp": 0.00390217, + "balance_loss_clip": 1.31840968, + "balance_loss_mlp": 0.35042477, + "epoch": 0.4845332932511649, + "flos": 24381405815040.0, + "grad_norm": 7.0646778270042745, + "language_loss": 0.84857774, + "learning_rate": 2.1969426656410097e-06, + "loss": 0.86855948, + "num_input_tokens_seen": 173268790, + "router_z_loss_clip": 2.89257812, + "router_z_loss_mlp": 0.39794922, + "step": 8059, + "time_per_iteration": 2.767595052719116 + }, + { + "auxiliary_loss_clip": 0.01607639, + "auxiliary_loss_mlp": 0.00389169, + "balance_loss_clip": 1.32087135, + "balance_loss_mlp": 0.34751743, + "epoch": 0.48459341650383286, + "flos": 37116478316160.0, + "grad_norm": 10.429116664510918, + "language_loss": 0.71500939, + "learning_rate": 2.196555093055352e-06, + "loss": 0.73497742, + "num_input_tokens_seen": 173288030, + "router_z_loss_clip": 2.86914062, + "router_z_loss_mlp": 0.41674805, + "step": 8060, + "time_per_iteration": 2.8406293392181396 + }, + { + "auxiliary_loss_clip": 0.01597456, + "auxiliary_loss_mlp": 0.00350816, + "balance_loss_clip": 1.32010961, + "balance_loss_mlp": 0.31061852, + "epoch": 0.48465353975650083, + "flos": 22966777509120.0, + "grad_norm": 49.725159672504496, + "language_loss": 0.73783135, + "learning_rate": 2.1961675130163046e-06, + "loss": 0.75731409, + "num_input_tokens_seen": 173305965, + "router_z_loss_clip": 2.7734375, + "router_z_loss_mlp": 0.40161133, + "step": 8061, + "time_per_iteration": 2.656487464904785 + }, + { + "auxiliary_loss_clip": 0.01617791, + "auxiliary_loss_mlp": 0.00377933, + "balance_loss_clip": 1.33384597, + "balance_loss_mlp": 0.3368293, + "epoch": 0.4847136630091688, + "flos": 17707695523200.0, + "grad_norm": 13.117662710212088, + "language_loss": 0.87907183, + "learning_rate": 2.1957799255385653e-06, + "loss": 0.89902902, + "num_input_tokens_seen": 173321985, + "router_z_loss_clip": 2.83398438, + "router_z_loss_mlp": 0.41113281, + "step": 8062, + "time_per_iteration": 2.659801721572876 + }, + { + "auxiliary_loss_clip": 0.01607049, + "auxiliary_loss_mlp": 0.00351316, + "balance_loss_clip": 1.33014536, + "balance_loss_mlp": 0.31438535, + "epoch": 0.48477378626183676, + "flos": 22018269018240.0, + "grad_norm": 4.587622713066759, + "language_loss": 0.79621637, + "learning_rate": 2.1953923306368325e-06, + "loss": 0.81580007, + "num_input_tokens_seen": 173341315, + "router_z_loss_clip": 2.76757812, + "router_z_loss_mlp": 0.36987305, + "step": 8063, + "time_per_iteration": 2.654578447341919 + }, + { + "auxiliary_loss_clip": 0.01612519, + "auxiliary_loss_mlp": 0.00352217, + "balance_loss_clip": 1.33252811, + "balance_loss_mlp": 0.31685942, + "epoch": 0.4848339095145047, + "flos": 27962956874880.0, + "grad_norm": 7.5182956605555376, + "language_loss": 0.84559798, + "learning_rate": 2.1950047283258023e-06, + "loss": 0.86524534, + "num_input_tokens_seen": 173361055, + "router_z_loss_clip": 2.80078125, + "router_z_loss_mlp": 0.35302734, + "step": 8064, + "time_per_iteration": 2.7007720470428467 + }, + { + "auxiliary_loss_clip": 0.01626389, + "auxiliary_loss_mlp": 0.00337159, + "balance_loss_clip": 1.34738362, + "balance_loss_mlp": 0.30025184, + "epoch": 0.4848940327671727, + "flos": 21688752625920.0, + "grad_norm": 3.05760826158405, + "language_loss": 0.87221503, + "learning_rate": 2.194617118620173e-06, + "loss": 0.89185053, + "num_input_tokens_seen": 173379255, + "router_z_loss_clip": 2.78710938, + "router_z_loss_mlp": 0.36914062, + "step": 8065, + "time_per_iteration": 2.697171211242676 + }, + { + "auxiliary_loss_clip": 0.01618269, + "auxiliary_loss_mlp": 0.00342818, + "balance_loss_clip": 1.34535658, + "balance_loss_mlp": 0.30576786, + "epoch": 0.48495415601984065, + "flos": 20631578515200.0, + "grad_norm": 2.6418060572772935, + "language_loss": 0.81311536, + "learning_rate": 2.194229501534644e-06, + "loss": 0.83272624, + "num_input_tokens_seen": 173398370, + "router_z_loss_clip": 2.72851562, + "router_z_loss_mlp": 0.37060547, + "step": 8066, + "time_per_iteration": 2.6704559326171875 + }, + { + "auxiliary_loss_clip": 0.01620957, + "auxiliary_loss_mlp": 0.00344594, + "balance_loss_clip": 1.34708309, + "balance_loss_mlp": 0.30864066, + "epoch": 0.4850142792725086, + "flos": 25628152930560.0, + "grad_norm": 5.665192075704937, + "language_loss": 0.77253228, + "learning_rate": 2.193841877083912e-06, + "loss": 0.79218781, + "num_input_tokens_seen": 173419595, + "router_z_loss_clip": 2.74023438, + "router_z_loss_mlp": 0.36010742, + "step": 8067, + "time_per_iteration": 2.723296880722046 + }, + { + "auxiliary_loss_clip": 0.0162654, + "auxiliary_loss_mlp": 0.00404091, + "balance_loss_clip": 1.34421885, + "balance_loss_mlp": 0.36234367, + "epoch": 0.4850744025251766, + "flos": 13771958405760.0, + "grad_norm": 9.895086953769486, + "language_loss": 0.86184782, + "learning_rate": 2.1934542452826767e-06, + "loss": 0.88215411, + "num_input_tokens_seen": 173435390, + "router_z_loss_clip": 2.82421875, + "router_z_loss_mlp": 0.41772461, + "step": 8068, + "time_per_iteration": 2.6966614723205566 + }, + { + "auxiliary_loss_clip": 0.01604589, + "auxiliary_loss_mlp": 0.00372478, + "balance_loss_clip": 1.32911968, + "balance_loss_mlp": 0.33228064, + "epoch": 0.4851345257778446, + "flos": 20261339078400.0, + "grad_norm": 242.2158797683225, + "language_loss": 0.89042604, + "learning_rate": 2.193066606145638e-06, + "loss": 0.91019666, + "num_input_tokens_seen": 173454095, + "router_z_loss_clip": 2.75585938, + "router_z_loss_mlp": 0.40185547, + "step": 8069, + "time_per_iteration": 2.674145460128784 + }, + { + "auxiliary_loss_clip": 0.01620711, + "auxiliary_loss_mlp": 0.0037659, + "balance_loss_clip": 1.34298587, + "balance_loss_mlp": 0.33603477, + "epoch": 0.48519464903051257, + "flos": 27089681420160.0, + "grad_norm": 18.95450700553503, + "language_loss": 0.83854616, + "learning_rate": 2.192678959687493e-06, + "loss": 0.85851914, + "num_input_tokens_seen": 173475300, + "router_z_loss_clip": 2.77539062, + "router_z_loss_mlp": 0.40527344, + "step": 8070, + "time_per_iteration": 2.7325594425201416 + }, + { + "auxiliary_loss_clip": 0.01610824, + "auxiliary_loss_mlp": 0.00397798, + "balance_loss_clip": 1.33033609, + "balance_loss_mlp": 0.35693276, + "epoch": 0.48525477228318054, + "flos": 17127235739520.0, + "grad_norm": 4.090360297454009, + "language_loss": 0.86672121, + "learning_rate": 2.192291305922943e-06, + "loss": 0.88680744, + "num_input_tokens_seen": 173492005, + "router_z_loss_clip": 2.80273438, + "router_z_loss_mlp": 0.40869141, + "step": 8071, + "time_per_iteration": 2.6274349689483643 + }, + { + "auxiliary_loss_clip": 0.01623659, + "auxiliary_loss_mlp": 0.00361828, + "balance_loss_clip": 1.34348571, + "balance_loss_mlp": 0.32174999, + "epoch": 0.4853148955358485, + "flos": 28180324028160.0, + "grad_norm": 21.982631484410952, + "language_loss": 0.77302265, + "learning_rate": 2.1919036448666873e-06, + "loss": 0.79287744, + "num_input_tokens_seen": 173511995, + "router_z_loss_clip": 2.796875, + "router_z_loss_mlp": 0.40063477, + "step": 8072, + "time_per_iteration": 2.7313485145568848 + }, + { + "auxiliary_loss_clip": 0.01631757, + "auxiliary_loss_mlp": 0.0036942, + "balance_loss_clip": 1.34405851, + "balance_loss_mlp": 0.32812619, + "epoch": 0.48537501878851647, + "flos": 17493309198720.0, + "grad_norm": 14.785133891502923, + "language_loss": 0.94227272, + "learning_rate": 2.1915159765334262e-06, + "loss": 0.96228456, + "num_input_tokens_seen": 173530215, + "router_z_loss_clip": 2.875, + "router_z_loss_mlp": 0.41333008, + "step": 8073, + "time_per_iteration": 2.7510604858398438 + }, + { + "auxiliary_loss_clip": 0.01620784, + "auxiliary_loss_mlp": 0.00352981, + "balance_loss_clip": 1.34868431, + "balance_loss_mlp": 0.31585971, + "epoch": 0.48543514204118443, + "flos": 28584857975040.0, + "grad_norm": 13.14988526651096, + "language_loss": 0.67439985, + "learning_rate": 2.19112830093786e-06, + "loss": 0.69413757, + "num_input_tokens_seen": 173550920, + "router_z_loss_clip": 2.71875, + "router_z_loss_mlp": 0.37060547, + "step": 8074, + "time_per_iteration": 2.699949264526367 + }, + { + "auxiliary_loss_clip": 0.01621872, + "auxiliary_loss_mlp": 0.00343696, + "balance_loss_clip": 1.33963692, + "balance_loss_mlp": 0.30485764, + "epoch": 0.4854952652938524, + "flos": 20959981585920.0, + "grad_norm": 3.5980100315598516, + "language_loss": 0.78353971, + "learning_rate": 2.19074061809469e-06, + "loss": 0.80319536, + "num_input_tokens_seen": 173569065, + "router_z_loss_clip": 2.81835938, + "router_z_loss_mlp": 0.38818359, + "step": 8075, + "time_per_iteration": 2.6677908897399902 + }, + { + "auxiliary_loss_clip": 0.01632995, + "auxiliary_loss_mlp": 0.00341954, + "balance_loss_clip": 1.35546899, + "balance_loss_mlp": 0.30523768, + "epoch": 0.48555538854652036, + "flos": 66529543155840.0, + "grad_norm": 90.06876939117188, + "language_loss": 0.86069101, + "learning_rate": 2.1903529280186163e-06, + "loss": 0.88044047, + "num_input_tokens_seen": 173596085, + "router_z_loss_clip": 2.77929688, + "router_z_loss_mlp": 0.3671875, + "step": 8076, + "time_per_iteration": 3.081855297088623 + }, + { + "auxiliary_loss_clip": 0.01634727, + "auxiliary_loss_mlp": 0.00373842, + "balance_loss_clip": 1.34814346, + "balance_loss_mlp": 0.33083174, + "epoch": 0.4856155117991883, + "flos": 15924982596480.0, + "grad_norm": 30.17381864254862, + "language_loss": 0.91764152, + "learning_rate": 2.1899652307243407e-06, + "loss": 0.93772721, + "num_input_tokens_seen": 173613900, + "router_z_loss_clip": 2.86328125, + "router_z_loss_mlp": 0.4296875, + "step": 8077, + "time_per_iteration": 2.654985189437866 + }, + { + "auxiliary_loss_clip": 0.01476443, + "auxiliary_loss_mlp": 0.00098375, + "balance_loss_clip": 1.30143511, + "balance_loss_mlp": 0.08721658, + "epoch": 0.4856756350518563, + "flos": 71047395060480.0, + "grad_norm": 0.9116318905275735, + "language_loss": 0.57815093, + "learning_rate": 2.189577526226564e-06, + "loss": 0.59389913, + "num_input_tokens_seen": 173671305, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.11181641, + "step": 8078, + "time_per_iteration": 3.146068811416626 + }, + { + "auxiliary_loss_clip": 0.01639496, + "auxiliary_loss_mlp": 0.00357316, + "balance_loss_clip": 1.3459444, + "balance_loss_mlp": 0.31661838, + "epoch": 0.48573575830452426, + "flos": 29825679346560.0, + "grad_norm": 77.67071489469244, + "language_loss": 0.78686309, + "learning_rate": 2.1891898145399884e-06, + "loss": 0.80683124, + "num_input_tokens_seen": 173692070, + "router_z_loss_clip": 2.93164062, + "router_z_loss_mlp": 0.40698242, + "step": 8079, + "time_per_iteration": 2.833815336227417 + }, + { + "auxiliary_loss_clip": 0.01638341, + "auxiliary_loss_mlp": 0.00333707, + "balance_loss_clip": 1.35219979, + "balance_loss_mlp": 0.29532209, + "epoch": 0.4857958815571922, + "flos": 17639501552640.0, + "grad_norm": 2.7981869531608923, + "language_loss": 0.89322937, + "learning_rate": 2.1888020956793172e-06, + "loss": 0.91294986, + "num_input_tokens_seen": 173709785, + "router_z_loss_clip": 2.86132812, + "router_z_loss_mlp": 0.3840332, + "step": 8080, + "time_per_iteration": 4.195082902908325 + }, + { + "auxiliary_loss_clip": 0.01625293, + "auxiliary_loss_mlp": 0.00354863, + "balance_loss_clip": 1.34392667, + "balance_loss_mlp": 0.3150472, + "epoch": 0.4858560048098602, + "flos": 21105491581440.0, + "grad_norm": 3.203433580186773, + "language_loss": 0.8918196, + "learning_rate": 2.188414369659251e-06, + "loss": 0.91162115, + "num_input_tokens_seen": 173728770, + "router_z_loss_clip": 2.81445312, + "router_z_loss_mlp": 0.3984375, + "step": 8081, + "time_per_iteration": 2.6881327629089355 + }, + { + "auxiliary_loss_clip": 0.01640838, + "auxiliary_loss_mlp": 0.00371929, + "balance_loss_clip": 1.35425079, + "balance_loss_mlp": 0.32751143, + "epoch": 0.4859161280625282, + "flos": 22090844448000.0, + "grad_norm": 13.294278913391782, + "language_loss": 0.87963402, + "learning_rate": 2.1880266364944924e-06, + "loss": 0.89976168, + "num_input_tokens_seen": 173747355, + "router_z_loss_clip": 2.8671875, + "router_z_loss_mlp": 0.44458008, + "step": 8082, + "time_per_iteration": 2.680919647216797 + }, + { + "auxiliary_loss_clip": 0.01644175, + "auxiliary_loss_mlp": 0.00333789, + "balance_loss_clip": 1.3628726, + "balance_loss_mlp": 0.29490328, + "epoch": 0.4859762513151962, + "flos": 17493452853120.0, + "grad_norm": 2.4747759416207105, + "language_loss": 0.93192559, + "learning_rate": 2.187638896199746e-06, + "loss": 0.95170522, + "num_input_tokens_seen": 173764825, + "router_z_loss_clip": 2.81445312, + "router_z_loss_mlp": 0.38891602, + "step": 8083, + "time_per_iteration": 4.231025218963623 + }, + { + "auxiliary_loss_clip": 0.01622208, + "auxiliary_loss_mlp": 0.00335786, + "balance_loss_clip": 1.35078812, + "balance_loss_mlp": 0.29737711, + "epoch": 0.48603637456786414, + "flos": 18004246208640.0, + "grad_norm": 17.037036747524226, + "language_loss": 0.87416995, + "learning_rate": 2.1872511487897126e-06, + "loss": 0.89374995, + "num_input_tokens_seen": 173783215, + "router_z_loss_clip": 2.71484375, + "router_z_loss_mlp": 0.3840332, + "step": 8084, + "time_per_iteration": 2.6831517219543457 + }, + { + "auxiliary_loss_clip": 0.01647, + "auxiliary_loss_mlp": 0.00360718, + "balance_loss_clip": 1.3602469, + "balance_loss_mlp": 0.32142621, + "epoch": 0.4860964978205321, + "flos": 22492038430080.0, + "grad_norm": 14.054080385794629, + "language_loss": 0.75225341, + "learning_rate": 2.186863394279098e-06, + "loss": 0.77233058, + "num_input_tokens_seen": 173801905, + "router_z_loss_clip": 2.87109375, + "router_z_loss_mlp": 0.39282227, + "step": 8085, + "time_per_iteration": 4.055827856063843 + }, + { + "auxiliary_loss_clip": 0.01638242, + "auxiliary_loss_mlp": 0.00337969, + "balance_loss_clip": 1.35516536, + "balance_loss_mlp": 0.29824811, + "epoch": 0.48615662107320007, + "flos": 23372532518400.0, + "grad_norm": 18.829276320622952, + "language_loss": 0.82582414, + "learning_rate": 2.1864756326826046e-06, + "loss": 0.84558624, + "num_input_tokens_seen": 173824690, + "router_z_loss_clip": 2.8359375, + "router_z_loss_mlp": 0.3972168, + "step": 8086, + "time_per_iteration": 2.7111735343933105 + }, + { + "auxiliary_loss_clip": 0.01645409, + "auxiliary_loss_mlp": 0.00336221, + "balance_loss_clip": 1.36005187, + "balance_loss_mlp": 0.29716766, + "epoch": 0.48621674432586803, + "flos": 34418833136640.0, + "grad_norm": 8.752229687969285, + "language_loss": 0.76303113, + "learning_rate": 2.1860878640149355e-06, + "loss": 0.7828474, + "num_input_tokens_seen": 173844450, + "router_z_loss_clip": 2.85546875, + "router_z_loss_mlp": 0.39038086, + "step": 8087, + "time_per_iteration": 2.7602598667144775 + }, + { + "auxiliary_loss_clip": 0.01638934, + "auxiliary_loss_mlp": 0.00362596, + "balance_loss_clip": 1.34748006, + "balance_loss_mlp": 0.31996632, + "epoch": 0.486276867578536, + "flos": 33107555237760.0, + "grad_norm": 73.27228151794307, + "language_loss": 0.79956758, + "learning_rate": 2.1857000882907974e-06, + "loss": 0.81958294, + "num_input_tokens_seen": 173864975, + "router_z_loss_clip": 2.91601562, + "router_z_loss_mlp": 0.42602539, + "step": 8088, + "time_per_iteration": 2.7506985664367676 + }, + { + "auxiliary_loss_clip": 0.01649789, + "auxiliary_loss_mlp": 0.00339568, + "balance_loss_clip": 1.36271501, + "balance_loss_mlp": 0.29882246, + "epoch": 0.48633699083120396, + "flos": 21470703114240.0, + "grad_norm": 180.24922632484748, + "language_loss": 0.81512845, + "learning_rate": 2.185312305524892e-06, + "loss": 0.83502197, + "num_input_tokens_seen": 173883805, + "router_z_loss_clip": 2.87304688, + "router_z_loss_mlp": 0.4074707, + "step": 8089, + "time_per_iteration": 2.657520055770874 + }, + { + "auxiliary_loss_clip": 0.0165425, + "auxiliary_loss_mlp": 0.00361852, + "balance_loss_clip": 1.36297202, + "balance_loss_mlp": 0.32041508, + "epoch": 0.48639711408387193, + "flos": 20084335833600.0, + "grad_norm": 24.30865346380841, + "language_loss": 0.89217901, + "learning_rate": 2.184924515731926e-06, + "loss": 0.91234004, + "num_input_tokens_seen": 173903520, + "router_z_loss_clip": 2.91015625, + "router_z_loss_mlp": 0.41455078, + "step": 8090, + "time_per_iteration": 4.0763022899627686 + }, + { + "auxiliary_loss_clip": 0.01664976, + "auxiliary_loss_mlp": 0.0032987, + "balance_loss_clip": 1.37680507, + "balance_loss_mlp": 0.28898153, + "epoch": 0.4864572373365399, + "flos": 20778884190720.0, + "grad_norm": 3.3744775035250645, + "language_loss": 0.81860423, + "learning_rate": 2.1845367189266045e-06, + "loss": 0.83855265, + "num_input_tokens_seen": 173924255, + "router_z_loss_clip": 2.8828125, + "router_z_loss_mlp": 0.40869141, + "step": 8091, + "time_per_iteration": 2.6842238903045654 + }, + { + "auxiliary_loss_clip": 0.01672138, + "auxiliary_loss_mlp": 0.00349057, + "balance_loss_clip": 1.37603235, + "balance_loss_mlp": 0.30633262, + "epoch": 0.48651736058920786, + "flos": 26025360503040.0, + "grad_norm": 6.199191207063659, + "language_loss": 0.84985626, + "learning_rate": 2.184148915123631e-06, + "loss": 0.87006819, + "num_input_tokens_seen": 173943285, + "router_z_loss_clip": 2.9609375, + "router_z_loss_mlp": 0.42700195, + "step": 8092, + "time_per_iteration": 2.7277233600616455 + }, + { + "auxiliary_loss_clip": 0.01690217, + "auxiliary_loss_mlp": 0.00331393, + "balance_loss_clip": 1.39094532, + "balance_loss_mlp": 0.28974128, + "epoch": 0.4865774838418758, + "flos": 20485601642880.0, + "grad_norm": 35.23364157118287, + "language_loss": 0.7852459, + "learning_rate": 2.1837611043377126e-06, + "loss": 0.805462, + "num_input_tokens_seen": 173962205, + "router_z_loss_clip": 2.98828125, + "router_z_loss_mlp": 0.41601562, + "step": 8093, + "time_per_iteration": 2.7288691997528076 + }, + { + "auxiliary_loss_clip": 0.01685893, + "auxiliary_loss_mlp": 0.00341587, + "balance_loss_clip": 1.3894428, + "balance_loss_mlp": 0.30048323, + "epoch": 0.4866376070945438, + "flos": 23547704169600.0, + "grad_norm": 7.65772948616519, + "language_loss": 0.7518189, + "learning_rate": 2.1833732865835545e-06, + "loss": 0.77209365, + "num_input_tokens_seen": 173980945, + "router_z_loss_clip": 2.96289062, + "router_z_loss_mlp": 0.41064453, + "step": 8094, + "time_per_iteration": 2.788161277770996 + }, + { + "auxiliary_loss_clip": 0.01718649, + "auxiliary_loss_mlp": 0.00356281, + "balance_loss_clip": 1.40147436, + "balance_loss_mlp": 0.3149865, + "epoch": 0.4866977303472118, + "flos": 16690598012160.0, + "grad_norm": 4.372325643533697, + "language_loss": 0.76133132, + "learning_rate": 2.1829854618758636e-06, + "loss": 0.78208059, + "num_input_tokens_seen": 173998860, + "router_z_loss_clip": 3.171875, + "router_z_loss_mlp": 0.41308594, + "step": 8095, + "time_per_iteration": 2.6834423542022705 + }, + { + "auxiliary_loss_clip": 0.0168445, + "auxiliary_loss_mlp": 0.00319904, + "balance_loss_clip": 1.38492668, + "balance_loss_mlp": 0.27772814, + "epoch": 0.4867578535998798, + "flos": 17896011552000.0, + "grad_norm": 25.656953361858804, + "language_loss": 0.85298669, + "learning_rate": 2.182597630229345e-06, + "loss": 0.87303019, + "num_input_tokens_seen": 174016665, + "router_z_loss_clip": 2.99804688, + "router_z_loss_mlp": 0.42163086, + "step": 8096, + "time_per_iteration": 2.646864414215088 + }, + { + "auxiliary_loss_clip": 0.01693263, + "auxiliary_loss_mlp": 0.00346209, + "balance_loss_clip": 1.39482415, + "balance_loss_mlp": 0.30422375, + "epoch": 0.48681797685254774, + "flos": 22637799820800.0, + "grad_norm": 178.73950704180254, + "language_loss": 0.74456024, + "learning_rate": 2.1822097916587067e-06, + "loss": 0.76495498, + "num_input_tokens_seen": 174034800, + "router_z_loss_clip": 2.98828125, + "router_z_loss_mlp": 0.41967773, + "step": 8097, + "time_per_iteration": 2.6971967220306396 + }, + { + "auxiliary_loss_clip": 0.01715413, + "auxiliary_loss_mlp": 0.00313035, + "balance_loss_clip": 1.40263915, + "balance_loss_mlp": 0.27295658, + "epoch": 0.4868781001052157, + "flos": 20886077352960.0, + "grad_norm": 5.370627185553025, + "language_loss": 0.77215022, + "learning_rate": 2.1818219461786543e-06, + "loss": 0.79243469, + "num_input_tokens_seen": 174054445, + "router_z_loss_clip": 3.1328125, + "router_z_loss_mlp": 0.40087891, + "step": 8098, + "time_per_iteration": 2.7149953842163086 + }, + { + "auxiliary_loss_clip": 0.01728095, + "auxiliary_loss_mlp": 0.00345171, + "balance_loss_clip": 1.40674758, + "balance_loss_mlp": 0.29970431, + "epoch": 0.48693822335788367, + "flos": 41974940937600.0, + "grad_norm": 87.68975541122627, + "language_loss": 0.72760153, + "learning_rate": 2.1814340938038956e-06, + "loss": 0.74833429, + "num_input_tokens_seen": 174077890, + "router_z_loss_clip": 3.21484375, + "router_z_loss_mlp": 0.45483398, + "step": 8099, + "time_per_iteration": 2.8706655502319336 + }, + { + "auxiliary_loss_clip": 0.01720509, + "auxiliary_loss_mlp": 0.00332039, + "balance_loss_clip": 1.40633368, + "balance_loss_mlp": 0.28948149, + "epoch": 0.48699834661055164, + "flos": 24243294021120.0, + "grad_norm": 2.479773421331886, + "language_loss": 0.76795328, + "learning_rate": 2.181046234549138e-06, + "loss": 0.78847873, + "num_input_tokens_seen": 174097460, + "router_z_loss_clip": 3.140625, + "router_z_loss_mlp": 0.42553711, + "step": 8100, + "time_per_iteration": 2.711744546890259 + }, + { + "auxiliary_loss_clip": 0.01721035, + "auxiliary_loss_mlp": 0.00319459, + "balance_loss_clip": 1.41171217, + "balance_loss_mlp": 0.27237177, + "epoch": 0.4870584698632196, + "flos": 25923877603200.0, + "grad_norm": 1.544977812262047, + "language_loss": 0.81092978, + "learning_rate": 2.180658368429088e-06, + "loss": 0.83133471, + "num_input_tokens_seen": 174120775, + "router_z_loss_clip": 3.09375, + "router_z_loss_mlp": 0.47070312, + "step": 8101, + "time_per_iteration": 2.7197091579437256 + }, + { + "auxiliary_loss_clip": 0.01477493, + "auxiliary_loss_mlp": 0.00097685, + "balance_loss_clip": 1.30724525, + "balance_loss_mlp": 0.08433346, + "epoch": 0.48711859311588757, + "flos": 70211933648640.0, + "grad_norm": 0.6873299800821779, + "language_loss": 0.51776296, + "learning_rate": 2.1802704954584565e-06, + "loss": 0.53351474, + "num_input_tokens_seen": 174189135, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.13378906, + "step": 8102, + "time_per_iteration": 3.329775333404541 + }, + { + "auxiliary_loss_clip": 0.0170646, + "auxiliary_loss_mlp": 0.0032804, + "balance_loss_clip": 1.39552414, + "balance_loss_mlp": 0.2844573, + "epoch": 0.48717871636855553, + "flos": 12342964659840.0, + "grad_norm": 10.59475819163748, + "language_loss": 0.78078359, + "learning_rate": 2.1798826156519484e-06, + "loss": 0.80112857, + "num_input_tokens_seen": 174203250, + "router_z_loss_clip": 3.11132812, + "router_z_loss_mlp": 0.4362793, + "step": 8103, + "time_per_iteration": 2.668189764022827 + }, + { + "auxiliary_loss_clip": 0.01713769, + "auxiliary_loss_mlp": 0.00321357, + "balance_loss_clip": 1.39921236, + "balance_loss_mlp": 0.27734452, + "epoch": 0.4872388396212235, + "flos": 23477139901440.0, + "grad_norm": 24.23414201245346, + "language_loss": 0.68545073, + "learning_rate": 2.1794947290242737e-06, + "loss": 0.70580202, + "num_input_tokens_seen": 174224145, + "router_z_loss_clip": 3.14648438, + "router_z_loss_mlp": 0.44042969, + "step": 8104, + "time_per_iteration": 2.671679735183716 + }, + { + "auxiliary_loss_clip": 0.01754374, + "auxiliary_loss_mlp": 0.0033009, + "balance_loss_clip": 1.43229437, + "balance_loss_mlp": 0.28414658, + "epoch": 0.48729896287389146, + "flos": 31427582186880.0, + "grad_norm": 2.898134847514157, + "language_loss": 0.74281311, + "learning_rate": 2.1791068355901413e-06, + "loss": 0.76365769, + "num_input_tokens_seen": 174244435, + "router_z_loss_clip": 3.22265625, + "router_z_loss_mlp": 0.45996094, + "step": 8105, + "time_per_iteration": 2.7569847106933594 + }, + { + "auxiliary_loss_clip": 0.01742212, + "auxiliary_loss_mlp": 0.0035034, + "balance_loss_clip": 1.42522383, + "balance_loss_mlp": 0.30463549, + "epoch": 0.4873590861265594, + "flos": 19057936700160.0, + "grad_norm": 5.973654210809919, + "language_loss": 0.79907596, + "learning_rate": 2.178718935364259e-06, + "loss": 0.82000154, + "num_input_tokens_seen": 174262710, + "router_z_loss_clip": 3.16796875, + "router_z_loss_mlp": 0.45751953, + "step": 8106, + "time_per_iteration": 2.7501235008239746 + }, + { + "auxiliary_loss_clip": 0.01755013, + "auxiliary_loss_mlp": 0.00374576, + "balance_loss_clip": 1.42530644, + "balance_loss_mlp": 0.33011097, + "epoch": 0.4874192093792274, + "flos": 24348296453760.0, + "grad_norm": 18.970209485590637, + "language_loss": 0.82396734, + "learning_rate": 2.1783310283613373e-06, + "loss": 0.84526324, + "num_input_tokens_seen": 174281545, + "router_z_loss_clip": 3.29492188, + "router_z_loss_mlp": 0.44482422, + "step": 8107, + "time_per_iteration": 2.713927745819092 + }, + { + "auxiliary_loss_clip": 0.0175081, + "auxiliary_loss_mlp": 0.0033117, + "balance_loss_clip": 1.43584454, + "balance_loss_mlp": 0.28653759, + "epoch": 0.4874793326318954, + "flos": 23112610727040.0, + "grad_norm": 7.099705318149349, + "language_loss": 0.81994307, + "learning_rate": 2.1779431145960853e-06, + "loss": 0.84076285, + "num_input_tokens_seen": 174300290, + "router_z_loss_clip": 3.1484375, + "router_z_loss_mlp": 0.4465332, + "step": 8108, + "time_per_iteration": 2.663421392440796 + }, + { + "auxiliary_loss_clip": 0.01724449, + "auxiliary_loss_mlp": 0.00286848, + "balance_loss_clip": 1.41692328, + "balance_loss_mlp": 0.24469538, + "epoch": 0.4875394558845634, + "flos": 19026156142080.0, + "grad_norm": 12.2217517781924, + "language_loss": 0.80125707, + "learning_rate": 2.177555194083212e-06, + "loss": 0.82137007, + "num_input_tokens_seen": 174318490, + "router_z_loss_clip": 3.078125, + "router_z_loss_mlp": 0.42163086, + "step": 8109, + "time_per_iteration": 2.667231559753418 + }, + { + "auxiliary_loss_clip": 0.01740998, + "auxiliary_loss_mlp": 0.00302214, + "balance_loss_clip": 1.42984343, + "balance_loss_mlp": 0.25808299, + "epoch": 0.48759957913723134, + "flos": 21433607343360.0, + "grad_norm": 191.27250697614045, + "language_loss": 0.83102477, + "learning_rate": 2.177167266837428e-06, + "loss": 0.85145688, + "num_input_tokens_seen": 174335505, + "router_z_loss_clip": 3.11328125, + "router_z_loss_mlp": 0.44116211, + "step": 8110, + "time_per_iteration": 2.673060417175293 + }, + { + "auxiliary_loss_clip": 0.01754045, + "auxiliary_loss_mlp": 0.00325551, + "balance_loss_clip": 1.43109584, + "balance_loss_mlp": 0.28156233, + "epoch": 0.4876597023898993, + "flos": 17748669962880.0, + "grad_norm": 10.36811642690939, + "language_loss": 0.80667114, + "learning_rate": 2.176779332873444e-06, + "loss": 0.82746702, + "num_input_tokens_seen": 174353990, + "router_z_loss_clip": 3.23242188, + "router_z_loss_mlp": 0.44042969, + "step": 8111, + "time_per_iteration": 2.6539814472198486 + }, + { + "auxiliary_loss_clip": 0.01742384, + "auxiliary_loss_mlp": 0.00348997, + "balance_loss_clip": 1.42965686, + "balance_loss_mlp": 0.30658215, + "epoch": 0.4877198256425673, + "flos": 17019647527680.0, + "grad_norm": 22.979846604815688, + "language_loss": 0.8066116, + "learning_rate": 2.17639139220597e-06, + "loss": 0.82752538, + "num_input_tokens_seen": 174373425, + "router_z_loss_clip": 3.12695312, + "router_z_loss_mlp": 0.42431641, + "step": 8112, + "time_per_iteration": 2.665844202041626 + }, + { + "auxiliary_loss_clip": 0.01738379, + "auxiliary_loss_mlp": 0.00351009, + "balance_loss_clip": 1.41281617, + "balance_loss_mlp": 0.30399281, + "epoch": 0.48777994889523524, + "flos": 22384091082240.0, + "grad_norm": 2.6774532638290296, + "language_loss": 0.80368114, + "learning_rate": 2.1760034448497166e-06, + "loss": 0.82457507, + "num_input_tokens_seen": 174393070, + "router_z_loss_clip": 3.25585938, + "router_z_loss_mlp": 0.47021484, + "step": 8113, + "time_per_iteration": 2.678218126296997 + }, + { + "auxiliary_loss_clip": 0.01515102, + "auxiliary_loss_mlp": 0.0008526, + "balance_loss_clip": 1.33900964, + "balance_loss_mlp": 0.0684756, + "epoch": 0.4878400721479032, + "flos": 61241772159360.0, + "grad_norm": 0.7851369840249823, + "language_loss": 0.48556584, + "learning_rate": 2.1756154908193943e-06, + "loss": 0.50156951, + "num_input_tokens_seen": 174446880, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.16796875, + "step": 8114, + "time_per_iteration": 3.057821750640869 + }, + { + "auxiliary_loss_clip": 0.01745019, + "auxiliary_loss_mlp": 0.00299962, + "balance_loss_clip": 1.42544198, + "balance_loss_mlp": 0.25428143, + "epoch": 0.48790019540057117, + "flos": 24536612482560.0, + "grad_norm": 41.7703855707457, + "language_loss": 0.83909106, + "learning_rate": 2.1752275301297155e-06, + "loss": 0.85954088, + "num_input_tokens_seen": 174468485, + "router_z_loss_clip": 3.19335938, + "router_z_loss_mlp": 0.45678711, + "step": 8115, + "time_per_iteration": 2.71944522857666 + }, + { + "auxiliary_loss_clip": 0.01756057, + "auxiliary_loss_mlp": 0.00334245, + "balance_loss_clip": 1.42878377, + "balance_loss_mlp": 0.28789636, + "epoch": 0.48796031865323913, + "flos": 21833939399040.0, + "grad_norm": 24.482790539784304, + "language_loss": 0.81607407, + "learning_rate": 2.1748395627953915e-06, + "loss": 0.83697712, + "num_input_tokens_seen": 174486360, + "router_z_loss_clip": 3.2734375, + "router_z_loss_mlp": 0.46362305, + "step": 8116, + "time_per_iteration": 2.7781991958618164 + }, + { + "auxiliary_loss_clip": 0.01747575, + "auxiliary_loss_mlp": 0.00334422, + "balance_loss_clip": 1.43288529, + "balance_loss_mlp": 0.29148284, + "epoch": 0.4880204419059071, + "flos": 18588907883520.0, + "grad_norm": 70.15984799823327, + "language_loss": 0.71645069, + "learning_rate": 2.1744515888311335e-06, + "loss": 0.73727059, + "num_input_tokens_seen": 174505075, + "router_z_loss_clip": 3.14453125, + "router_z_loss_mlp": 0.4296875, + "step": 8117, + "time_per_iteration": 2.704153299331665 + }, + { + "auxiliary_loss_clip": 0.01736201, + "auxiliary_loss_mlp": 0.003165, + "balance_loss_clip": 1.42395377, + "balance_loss_mlp": 0.27303615, + "epoch": 0.48808056515857506, + "flos": 19172168928000.0, + "grad_norm": 35.6602153884737, + "language_loss": 0.84631431, + "learning_rate": 2.1740636082516533e-06, + "loss": 0.86684132, + "num_input_tokens_seen": 174523385, + "router_z_loss_clip": 3.12304688, + "router_z_loss_mlp": 0.43505859, + "step": 8118, + "time_per_iteration": 2.684366226196289 + }, + { + "auxiliary_loss_clip": 0.01740233, + "auxiliary_loss_mlp": 0.00356133, + "balance_loss_clip": 1.42337108, + "balance_loss_mlp": 0.3117398, + "epoch": 0.48814068841124303, + "flos": 20120497850880.0, + "grad_norm": 3.1274364090770232, + "language_loss": 0.7169987, + "learning_rate": 2.1736756210716645e-06, + "loss": 0.73796231, + "num_input_tokens_seen": 174542200, + "router_z_loss_clip": 3.16796875, + "router_z_loss_mlp": 0.4440918, + "step": 8119, + "time_per_iteration": 2.650541305541992 + }, + { + "auxiliary_loss_clip": 0.01760609, + "auxiliary_loss_mlp": 0.00338372, + "balance_loss_clip": 1.42999709, + "balance_loss_mlp": 0.29757825, + "epoch": 0.488200811663911, + "flos": 22965592360320.0, + "grad_norm": 6.854083480534947, + "language_loss": 0.79876482, + "learning_rate": 2.173287627305878e-06, + "loss": 0.81975472, + "num_input_tokens_seen": 174563620, + "router_z_loss_clip": 3.3046875, + "router_z_loss_mlp": 0.40795898, + "step": 8120, + "time_per_iteration": 2.7444674968719482 + }, + { + "auxiliary_loss_clip": 0.01772364, + "auxiliary_loss_mlp": 0.00324439, + "balance_loss_clip": 1.44346762, + "balance_loss_mlp": 0.27828091, + "epoch": 0.48826093491657896, + "flos": 33910697387520.0, + "grad_norm": 490.53534758602365, + "language_loss": 0.68733376, + "learning_rate": 2.1728996269690075e-06, + "loss": 0.70830178, + "num_input_tokens_seen": 174586465, + "router_z_loss_clip": 3.28515625, + "router_z_loss_mlp": 0.46142578, + "step": 8121, + "time_per_iteration": 2.7783873081207275 + }, + { + "auxiliary_loss_clip": 0.01756699, + "auxiliary_loss_mlp": 0.00348409, + "balance_loss_clip": 1.42487788, + "balance_loss_mlp": 0.30117831, + "epoch": 0.488321058169247, + "flos": 23070307484160.0, + "grad_norm": 20.38815350774075, + "language_loss": 0.91054296, + "learning_rate": 2.1725116200757664e-06, + "loss": 0.93159401, + "num_input_tokens_seen": 174604035, + "router_z_loss_clip": 3.3203125, + "router_z_loss_mlp": 0.47192383, + "step": 8122, + "time_per_iteration": 4.124528169631958 + }, + { + "auxiliary_loss_clip": 0.01782212, + "auxiliary_loss_mlp": 0.0032142, + "balance_loss_clip": 1.44350564, + "balance_loss_mlp": 0.27683526, + "epoch": 0.48838118142191494, + "flos": 19317714837120.0, + "grad_norm": 22.882908528142977, + "language_loss": 0.90971816, + "learning_rate": 2.172123606640866e-06, + "loss": 0.93075454, + "num_input_tokens_seen": 174621715, + "router_z_loss_clip": 3.38671875, + "router_z_loss_mlp": 0.44555664, + "step": 8123, + "time_per_iteration": 2.68357515335083 + }, + { + "auxiliary_loss_clip": 0.0179649, + "auxiliary_loss_mlp": 0.00346757, + "balance_loss_clip": 1.44986558, + "balance_loss_mlp": 0.30105257, + "epoch": 0.4884413046745829, + "flos": 25410678036480.0, + "grad_norm": 106.72505042696287, + "language_loss": 0.90742302, + "learning_rate": 2.1717355866790227e-06, + "loss": 0.92885542, + "num_input_tokens_seen": 174643835, + "router_z_loss_clip": 3.46875, + "router_z_loss_mlp": 0.45654297, + "step": 8124, + "time_per_iteration": 2.740358829498291 + }, + { + "auxiliary_loss_clip": 0.01784726, + "auxiliary_loss_mlp": 0.00338305, + "balance_loss_clip": 1.45191467, + "balance_loss_mlp": 0.29610503, + "epoch": 0.4885014279272509, + "flos": 20991546662400.0, + "grad_norm": 7.468246560942172, + "language_loss": 0.85525346, + "learning_rate": 2.171347560204948e-06, + "loss": 0.8764838, + "num_input_tokens_seen": 174660955, + "router_z_loss_clip": 3.328125, + "router_z_loss_mlp": 0.421875, + "step": 8125, + "time_per_iteration": 2.6699728965759277 + }, + { + "auxiliary_loss_clip": 0.0178087, + "auxiliary_loss_mlp": 0.00346268, + "balance_loss_clip": 1.44539785, + "balance_loss_mlp": 0.30402029, + "epoch": 0.48856155117991884, + "flos": 13771599269760.0, + "grad_norm": 4.209018266711976, + "language_loss": 0.79007399, + "learning_rate": 2.170959527233356e-06, + "loss": 0.8113454, + "num_input_tokens_seen": 174678270, + "router_z_loss_clip": 3.35742188, + "router_z_loss_mlp": 0.42236328, + "step": 8126, + "time_per_iteration": 4.234640836715698 + }, + { + "auxiliary_loss_clip": 0.01828998, + "auxiliary_loss_mlp": 0.00356005, + "balance_loss_clip": 1.47230649, + "balance_loss_mlp": 0.3108007, + "epoch": 0.4886216744325868, + "flos": 32087764206720.0, + "grad_norm": 7.246604081212289, + "language_loss": 0.7408585, + "learning_rate": 2.1705714877789633e-06, + "loss": 0.76270854, + "num_input_tokens_seen": 174698360, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 0.45263672, + "step": 8127, + "time_per_iteration": 4.258450269699097 + }, + { + "auxiliary_loss_clip": 0.01821392, + "auxiliary_loss_mlp": 0.00368406, + "balance_loss_clip": 1.46480703, + "balance_loss_mlp": 0.32346442, + "epoch": 0.48868179768525477, + "flos": 19610063631360.0, + "grad_norm": 4.9987762601759815, + "language_loss": 0.82200122, + "learning_rate": 2.170183441856481e-06, + "loss": 0.84389913, + "num_input_tokens_seen": 174716755, + "router_z_loss_clip": 3.56835938, + "router_z_loss_mlp": 0.44921875, + "step": 8128, + "time_per_iteration": 2.6581361293792725 + }, + { + "auxiliary_loss_clip": 0.0181949, + "auxiliary_loss_mlp": 0.00376843, + "balance_loss_clip": 1.46620631, + "balance_loss_mlp": 0.33051825, + "epoch": 0.48874192093792274, + "flos": 21286912199040.0, + "grad_norm": 6.194027512099844, + "language_loss": 0.82224995, + "learning_rate": 2.1697953894806265e-06, + "loss": 0.84421325, + "num_input_tokens_seen": 174735560, + "router_z_loss_clip": 3.53320312, + "router_z_loss_mlp": 0.46264648, + "step": 8129, + "time_per_iteration": 2.7145674228668213 + }, + { + "auxiliary_loss_clip": 0.01838582, + "auxiliary_loss_mlp": 0.00356999, + "balance_loss_clip": 1.47723258, + "balance_loss_mlp": 0.31205705, + "epoch": 0.4888020441905907, + "flos": 14173439696640.0, + "grad_norm": 9.559321449739732, + "language_loss": 0.73243392, + "learning_rate": 2.169407330666114e-06, + "loss": 0.7543897, + "num_input_tokens_seen": 174752730, + "router_z_loss_clip": 3.60742188, + "router_z_loss_mlp": 0.44921875, + "step": 8130, + "time_per_iteration": 2.6408941745758057 + }, + { + "auxiliary_loss_clip": 0.01816071, + "auxiliary_loss_mlp": 0.00319283, + "balance_loss_clip": 1.47172213, + "balance_loss_mlp": 0.27453142, + "epoch": 0.48886216744325867, + "flos": 24097891766400.0, + "grad_norm": 337.5244275886292, + "language_loss": 0.78217417, + "learning_rate": 2.169019265427658e-06, + "loss": 0.80352765, + "num_input_tokens_seen": 174772520, + "router_z_loss_clip": 3.44726562, + "router_z_loss_mlp": 0.44775391, + "step": 8131, + "time_per_iteration": 2.682281255722046 + }, + { + "auxiliary_loss_clip": 0.01844503, + "auxiliary_loss_mlp": 0.00347151, + "balance_loss_clip": 1.48202336, + "balance_loss_mlp": 0.29996771, + "epoch": 0.48892229069592663, + "flos": 38431419402240.0, + "grad_norm": 29.41030508571792, + "language_loss": 0.73999125, + "learning_rate": 2.1686311937799745e-06, + "loss": 0.76190782, + "num_input_tokens_seen": 174796540, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 0.47192383, + "step": 8132, + "time_per_iteration": 4.2306506633758545 + }, + { + "auxiliary_loss_clip": 0.01836816, + "auxiliary_loss_mlp": 0.00352915, + "balance_loss_clip": 1.48221517, + "balance_loss_mlp": 0.30709052, + "epoch": 0.4889824139485946, + "flos": 23843321101440.0, + "grad_norm": 3.165970023733327, + "language_loss": 0.74855155, + "learning_rate": 2.1682431157377797e-06, + "loss": 0.77044886, + "num_input_tokens_seen": 174817840, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 0.45776367, + "step": 8133, + "time_per_iteration": 2.7084128856658936 + }, + { + "auxiliary_loss_clip": 0.01816814, + "auxiliary_loss_mlp": 0.00341144, + "balance_loss_clip": 1.46959209, + "balance_loss_mlp": 0.29946834, + "epoch": 0.48904253720126256, + "flos": 24425827960320.0, + "grad_norm": 10.00437214852962, + "language_loss": 0.76449388, + "learning_rate": 2.1678550313157883e-06, + "loss": 0.78607345, + "num_input_tokens_seen": 174837885, + "router_z_loss_clip": 3.4765625, + "router_z_loss_mlp": 0.41674805, + "step": 8134, + "time_per_iteration": 2.707709789276123 + }, + { + "auxiliary_loss_clip": 0.01825642, + "auxiliary_loss_mlp": 0.00382489, + "balance_loss_clip": 1.46902454, + "balance_loss_mlp": 0.33580691, + "epoch": 0.4891026604539306, + "flos": 24170682677760.0, + "grad_norm": 26.62676173601318, + "language_loss": 0.85413384, + "learning_rate": 2.167466940528718e-06, + "loss": 0.87621522, + "num_input_tokens_seen": 174855240, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 0.46679688, + "step": 8135, + "time_per_iteration": 2.688605785369873 + }, + { + "auxiliary_loss_clip": 0.01826881, + "auxiliary_loss_mlp": 0.00351677, + "balance_loss_clip": 1.47327435, + "balance_loss_mlp": 0.306997, + "epoch": 0.48916278370659855, + "flos": 21470954509440.0, + "grad_norm": 32.226858775408196, + "language_loss": 0.79981577, + "learning_rate": 2.1670788433912843e-06, + "loss": 0.82160139, + "num_input_tokens_seen": 174875145, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 0.44677734, + "step": 8136, + "time_per_iteration": 2.7471845149993896 + }, + { + "auxiliary_loss_clip": 0.01823395, + "auxiliary_loss_mlp": 0.0035396, + "balance_loss_clip": 1.47310972, + "balance_loss_mlp": 0.31121191, + "epoch": 0.4892229069592665, + "flos": 22309755886080.0, + "grad_norm": 44.479420098249456, + "language_loss": 0.78006268, + "learning_rate": 2.166690739918204e-06, + "loss": 0.80183619, + "num_input_tokens_seen": 174894770, + "router_z_loss_clip": 3.50585938, + "router_z_loss_mlp": 0.42749023, + "step": 8137, + "time_per_iteration": 2.6840076446533203 + }, + { + "auxiliary_loss_clip": 0.01823047, + "auxiliary_loss_mlp": 0.00356698, + "balance_loss_clip": 1.46664429, + "balance_loss_mlp": 0.30932379, + "epoch": 0.4892830302119345, + "flos": 12786856934400.0, + "grad_norm": 14.258433454983797, + "language_loss": 0.82517081, + "learning_rate": 2.1663026301241944e-06, + "loss": 0.84696829, + "num_input_tokens_seen": 174912780, + "router_z_loss_clip": 3.5625, + "router_z_loss_mlp": 0.47387695, + "step": 8138, + "time_per_iteration": 2.63661527633667 + }, + { + "auxiliary_loss_clip": 0.01820711, + "auxiliary_loss_mlp": 0.00369334, + "balance_loss_clip": 1.47374606, + "balance_loss_mlp": 0.32348657, + "epoch": 0.48934315346460244, + "flos": 20813896972800.0, + "grad_norm": 30.43833279119974, + "language_loss": 0.7846272, + "learning_rate": 2.165914514023972e-06, + "loss": 0.80652761, + "num_input_tokens_seen": 174931250, + "router_z_loss_clip": 3.47070312, + "router_z_loss_mlp": 0.45849609, + "step": 8139, + "time_per_iteration": 2.6750714778900146 + }, + { + "auxiliary_loss_clip": 0.01820627, + "auxiliary_loss_mlp": 0.00347842, + "balance_loss_clip": 1.47371793, + "balance_loss_mlp": 0.30273294, + "epoch": 0.4894032767172704, + "flos": 19755537713280.0, + "grad_norm": 4.07099940628131, + "language_loss": 0.69976431, + "learning_rate": 2.165526391632255e-06, + "loss": 0.72144902, + "num_input_tokens_seen": 174951105, + "router_z_loss_clip": 3.46679688, + "router_z_loss_mlp": 0.45117188, + "step": 8140, + "time_per_iteration": 2.7008414268493652 + }, + { + "auxiliary_loss_clip": 0.01834496, + "auxiliary_loss_mlp": 0.00372195, + "balance_loss_clip": 1.47345865, + "balance_loss_mlp": 0.32308114, + "epoch": 0.4894633999699384, + "flos": 17818982835840.0, + "grad_norm": 14.662687485566627, + "language_loss": 0.87047398, + "learning_rate": 2.1651382629637608e-06, + "loss": 0.89254081, + "num_input_tokens_seen": 174969120, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 0.49121094, + "step": 8141, + "time_per_iteration": 2.6833789348602295 + }, + { + "auxiliary_loss_clip": 0.01838028, + "auxiliary_loss_mlp": 0.0037719, + "balance_loss_clip": 1.48054862, + "balance_loss_mlp": 0.32931536, + "epoch": 0.48952352322260634, + "flos": 25523222325120.0, + "grad_norm": 3.7815679225138275, + "language_loss": 0.77932465, + "learning_rate": 2.1647501280332066e-06, + "loss": 0.8014769, + "num_input_tokens_seen": 174991295, + "router_z_loss_clip": 3.57617188, + "router_z_loss_mlp": 0.47875977, + "step": 8142, + "time_per_iteration": 2.714078903198242 + }, + { + "auxiliary_loss_clip": 0.01803445, + "auxiliary_loss_mlp": 0.00354664, + "balance_loss_clip": 1.45869267, + "balance_loss_mlp": 0.31010309, + "epoch": 0.4895836464752743, + "flos": 29055502903680.0, + "grad_norm": 2.274984913803755, + "language_loss": 0.74524987, + "learning_rate": 2.1643619868553105e-06, + "loss": 0.76683092, + "num_input_tokens_seen": 175012830, + "router_z_loss_clip": 3.44726562, + "router_z_loss_mlp": 0.44580078, + "step": 8143, + "time_per_iteration": 2.7421395778656006 + }, + { + "auxiliary_loss_clip": 0.01807244, + "auxiliary_loss_mlp": 0.00373665, + "balance_loss_clip": 1.46080947, + "balance_loss_mlp": 0.33217973, + "epoch": 0.48964376972794227, + "flos": 33546958312320.0, + "grad_norm": 2.640696213241605, + "language_loss": 0.79824388, + "learning_rate": 2.163973839444793e-06, + "loss": 0.82005298, + "num_input_tokens_seen": 175035695, + "router_z_loss_clip": 3.46289062, + "router_z_loss_mlp": 0.41503906, + "step": 8144, + "time_per_iteration": 2.7693493366241455 + }, + { + "auxiliary_loss_clip": 0.01806064, + "auxiliary_loss_mlp": 0.00372186, + "balance_loss_clip": 1.4537276, + "balance_loss_mlp": 0.3245973, + "epoch": 0.48970389298061023, + "flos": 22054035985920.0, + "grad_norm": 11.120419923606454, + "language_loss": 0.81201357, + "learning_rate": 2.1635856858163695e-06, + "loss": 0.83379602, + "num_input_tokens_seen": 175056425, + "router_z_loss_clip": 3.5234375, + "router_z_loss_mlp": 0.47583008, + "step": 8145, + "time_per_iteration": 2.698435068130493 + }, + { + "auxiliary_loss_clip": 0.01792755, + "auxiliary_loss_mlp": 0.00337764, + "balance_loss_clip": 1.44782519, + "balance_loss_mlp": 0.29379976, + "epoch": 0.4897640162332782, + "flos": 20084299920000.0, + "grad_norm": 18.545418152033122, + "language_loss": 0.880979, + "learning_rate": 2.163197525984761e-06, + "loss": 0.90228415, + "num_input_tokens_seen": 175074800, + "router_z_loss_clip": 3.45117188, + "router_z_loss_mlp": 0.43969727, + "step": 8146, + "time_per_iteration": 2.6204919815063477 + }, + { + "auxiliary_loss_clip": 0.01790052, + "auxiliary_loss_mlp": 0.00312159, + "balance_loss_clip": 1.45241106, + "balance_loss_mlp": 0.26912487, + "epoch": 0.48982413948594616, + "flos": 23806225330560.0, + "grad_norm": 4.554239901391087, + "language_loss": 0.80376518, + "learning_rate": 2.162809359964687e-06, + "loss": 0.82478726, + "num_input_tokens_seen": 175094500, + "router_z_loss_clip": 3.37695312, + "router_z_loss_mlp": 0.43041992, + "step": 8147, + "time_per_iteration": 2.666036605834961 + }, + { + "auxiliary_loss_clip": 0.01802895, + "auxiliary_loss_mlp": 0.00312537, + "balance_loss_clip": 1.45683289, + "balance_loss_mlp": 0.26893073, + "epoch": 0.4898842627386142, + "flos": 17639645207040.0, + "grad_norm": 103.51392996462536, + "language_loss": 0.91283518, + "learning_rate": 2.162421187770864e-06, + "loss": 0.93398952, + "num_input_tokens_seen": 175112920, + "router_z_loss_clip": 3.4609375, + "router_z_loss_mlp": 0.43603516, + "step": 8148, + "time_per_iteration": 2.7094063758850098 + }, + { + "auxiliary_loss_clip": 0.01801896, + "auxiliary_loss_mlp": 0.00328303, + "balance_loss_clip": 1.45730305, + "balance_loss_mlp": 0.28715178, + "epoch": 0.48994438599128215, + "flos": 16617914841600.0, + "grad_norm": 21.17914714619926, + "language_loss": 0.81310713, + "learning_rate": 2.162033009418015e-06, + "loss": 0.83440912, + "num_input_tokens_seen": 175129910, + "router_z_loss_clip": 3.44921875, + "router_z_loss_mlp": 0.41137695, + "step": 8149, + "time_per_iteration": 2.675619602203369 + }, + { + "auxiliary_loss_clip": 0.01792182, + "auxiliary_loss_mlp": 0.00326407, + "balance_loss_clip": 1.44177747, + "balance_loss_mlp": 0.2822758, + "epoch": 0.4900045092439501, + "flos": 26614834600320.0, + "grad_norm": 8.32278009465709, + "language_loss": 0.84195799, + "learning_rate": 2.1616448249208567e-06, + "loss": 0.8631438, + "num_input_tokens_seen": 175148705, + "router_z_loss_clip": 3.50195312, + "router_z_loss_mlp": 0.44140625, + "step": 8150, + "time_per_iteration": 2.685760259628296 + }, + { + "auxiliary_loss_clip": 0.01795533, + "auxiliary_loss_mlp": 0.00316677, + "balance_loss_clip": 1.44265437, + "balance_loss_mlp": 0.27285615, + "epoch": 0.4900646324966181, + "flos": 19902125116800.0, + "grad_norm": 2.3937584808329664, + "language_loss": 0.78717589, + "learning_rate": 2.1612566342941106e-06, + "loss": 0.80829799, + "num_input_tokens_seen": 175167425, + "router_z_loss_clip": 3.53125, + "router_z_loss_mlp": 0.43798828, + "step": 8151, + "time_per_iteration": 2.6519052982330322 + }, + { + "auxiliary_loss_clip": 0.0150187, + "auxiliary_loss_mlp": 0.00131471, + "balance_loss_clip": 1.32989585, + "balance_loss_mlp": 0.11239771, + "epoch": 0.49012475574928605, + "flos": 59189620337280.0, + "grad_norm": 0.8252572683060831, + "language_loss": 0.53923625, + "learning_rate": 2.1608684375524977e-06, + "loss": 0.55556965, + "num_input_tokens_seen": 175227985, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.19042969, + "step": 8152, + "time_per_iteration": 3.1370296478271484 + }, + { + "auxiliary_loss_clip": 0.01797565, + "auxiliary_loss_mlp": 0.00302208, + "balance_loss_clip": 1.44581282, + "balance_loss_mlp": 0.25733811, + "epoch": 0.490184879001954, + "flos": 45259797657600.0, + "grad_norm": 3.306486271761101, + "language_loss": 0.6749025, + "learning_rate": 2.1604802347107364e-06, + "loss": 0.6959002, + "num_input_tokens_seen": 175251895, + "router_z_loss_clip": 3.51757812, + "router_z_loss_mlp": 0.44897461, + "step": 8153, + "time_per_iteration": 2.843877077102661 + }, + { + "auxiliary_loss_clip": 0.01779818, + "auxiliary_loss_mlp": 0.00298138, + "balance_loss_clip": 1.44050574, + "balance_loss_mlp": 0.25136054, + "epoch": 0.490245002254622, + "flos": 28002135634560.0, + "grad_norm": 4.542728576773728, + "language_loss": 0.81258655, + "learning_rate": 2.160092025783549e-06, + "loss": 0.83336616, + "num_input_tokens_seen": 175272770, + "router_z_loss_clip": 3.39453125, + "router_z_loss_mlp": 0.46801758, + "step": 8154, + "time_per_iteration": 2.729923963546753 + }, + { + "auxiliary_loss_clip": 0.01491834, + "auxiliary_loss_mlp": 0.00093607, + "balance_loss_clip": 1.31643558, + "balance_loss_mlp": 0.07491536, + "epoch": 0.49030512550728994, + "flos": 58951318533120.0, + "grad_norm": 0.9807515061144112, + "language_loss": 0.66727412, + "learning_rate": 2.1597038107856564e-06, + "loss": 0.6831286, + "num_input_tokens_seen": 175336320, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.18652344, + "step": 8155, + "time_per_iteration": 3.196727991104126 + }, + { + "auxiliary_loss_clip": 0.01797993, + "auxiliary_loss_mlp": 0.00297101, + "balance_loss_clip": 1.44252038, + "balance_loss_mlp": 0.25313699, + "epoch": 0.4903652487599579, + "flos": 19791843384960.0, + "grad_norm": 1.862063119993553, + "language_loss": 0.82331288, + "learning_rate": 2.1593155897317784e-06, + "loss": 0.84426379, + "num_input_tokens_seen": 175353540, + "router_z_loss_clip": 3.55859375, + "router_z_loss_mlp": 0.43969727, + "step": 8156, + "time_per_iteration": 2.6672098636627197 + }, + { + "auxiliary_loss_clip": 0.01796849, + "auxiliary_loss_mlp": 0.00303976, + "balance_loss_clip": 1.44358599, + "balance_loss_mlp": 0.26146597, + "epoch": 0.49042537201262587, + "flos": 21762082241280.0, + "grad_norm": 96.91405446506568, + "language_loss": 0.90024054, + "learning_rate": 2.1589273626366377e-06, + "loss": 0.92124873, + "num_input_tokens_seen": 175370445, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 0.42504883, + "step": 8157, + "time_per_iteration": 2.665046215057373 + }, + { + "auxiliary_loss_clip": 0.01780244, + "auxiliary_loss_mlp": 0.00278277, + "balance_loss_clip": 1.43080735, + "balance_loss_mlp": 0.2325964, + "epoch": 0.49048549526529384, + "flos": 18953042008320.0, + "grad_norm": 1.6335323801635881, + "language_loss": 0.84587795, + "learning_rate": 2.158539129514956e-06, + "loss": 0.86646318, + "num_input_tokens_seen": 175389020, + "router_z_loss_clip": 3.49804688, + "router_z_loss_mlp": 0.45703125, + "step": 8158, + "time_per_iteration": 2.6755099296569824 + }, + { + "auxiliary_loss_clip": 0.0181113, + "auxiliary_loss_mlp": 0.00306689, + "balance_loss_clip": 1.4483856, + "balance_loss_mlp": 0.26248628, + "epoch": 0.4905456185179618, + "flos": 26906393295360.0, + "grad_norm": 14.299401229640958, + "language_loss": 0.75823075, + "learning_rate": 2.158150890381454e-06, + "loss": 0.77940893, + "num_input_tokens_seen": 175409545, + "router_z_loss_clip": 3.63085938, + "router_z_loss_mlp": 0.44165039, + "step": 8159, + "time_per_iteration": 2.733875274658203 + }, + { + "auxiliary_loss_clip": 0.01804027, + "auxiliary_loss_mlp": 0.00298534, + "balance_loss_clip": 1.45202875, + "balance_loss_mlp": 0.250779, + "epoch": 0.49060574177062977, + "flos": 20412343854720.0, + "grad_norm": 28.372779300680552, + "language_loss": 0.78867817, + "learning_rate": 2.157762645250854e-06, + "loss": 0.80970377, + "num_input_tokens_seen": 175429335, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 0.47680664, + "step": 8160, + "time_per_iteration": 2.6749675273895264 + }, + { + "auxiliary_loss_clip": 0.01775731, + "auxiliary_loss_mlp": 0.00310706, + "balance_loss_clip": 1.42918396, + "balance_loss_mlp": 0.26657474, + "epoch": 0.4906658650232978, + "flos": 17493704248320.0, + "grad_norm": 5.931650878823519, + "language_loss": 0.77497935, + "learning_rate": 2.1573743941378796e-06, + "loss": 0.79584372, + "num_input_tokens_seen": 175446955, + "router_z_loss_clip": 3.46875, + "router_z_loss_mlp": 0.44116211, + "step": 8161, + "time_per_iteration": 2.6128015518188477 + }, + { + "auxiliary_loss_clip": 0.0177266, + "auxiliary_loss_mlp": 0.00296635, + "balance_loss_clip": 1.4298389, + "balance_loss_mlp": 0.25255114, + "epoch": 0.49072598827596575, + "flos": 26614439550720.0, + "grad_norm": 3.5045578682384644, + "language_loss": 0.74322021, + "learning_rate": 2.1569861370572517e-06, + "loss": 0.76391321, + "num_input_tokens_seen": 175468195, + "router_z_loss_clip": 3.42578125, + "router_z_loss_mlp": 0.44091797, + "step": 8162, + "time_per_iteration": 2.6854124069213867 + }, + { + "auxiliary_loss_clip": 0.01759937, + "auxiliary_loss_mlp": 0.00291162, + "balance_loss_clip": 1.41588926, + "balance_loss_mlp": 0.24614835, + "epoch": 0.4907861115286337, + "flos": 20412595249920.0, + "grad_norm": 10.900156755087226, + "language_loss": 0.70838761, + "learning_rate": 2.1565978740236944e-06, + "loss": 0.72889864, + "num_input_tokens_seen": 175487455, + "router_z_loss_clip": 3.4375, + "router_z_loss_mlp": 0.44970703, + "step": 8163, + "time_per_iteration": 2.6331493854522705 + }, + { + "auxiliary_loss_clip": 0.01735809, + "auxiliary_loss_mlp": 0.00296996, + "balance_loss_clip": 1.40829515, + "balance_loss_mlp": 0.2552017, + "epoch": 0.4908462347813017, + "flos": 14064271286400.0, + "grad_norm": 4.038048219205349, + "language_loss": 0.84071869, + "learning_rate": 2.1562096050519293e-06, + "loss": 0.86104667, + "num_input_tokens_seen": 175504450, + "router_z_loss_clip": 3.27539062, + "router_z_loss_mlp": 0.41796875, + "step": 8164, + "time_per_iteration": 4.019381284713745 + }, + { + "auxiliary_loss_clip": 0.01764472, + "auxiliary_loss_mlp": 0.00313805, + "balance_loss_clip": 1.4179213, + "balance_loss_mlp": 0.26783812, + "epoch": 0.49090635803396965, + "flos": 18735100237440.0, + "grad_norm": 481.3562346610187, + "language_loss": 0.82938194, + "learning_rate": 2.1558213301566806e-06, + "loss": 0.85016471, + "num_input_tokens_seen": 175523600, + "router_z_loss_clip": 3.46679688, + "router_z_loss_mlp": 0.45996094, + "step": 8165, + "time_per_iteration": 2.6758644580841064 + }, + { + "auxiliary_loss_clip": 0.01763932, + "auxiliary_loss_mlp": 0.00303384, + "balance_loss_clip": 1.42490911, + "balance_loss_mlp": 0.26073083, + "epoch": 0.4909664812866376, + "flos": 20558500295040.0, + "grad_norm": 7.162009279599586, + "language_loss": 0.8536309, + "learning_rate": 2.1554330493526716e-06, + "loss": 0.87430406, + "num_input_tokens_seen": 175542720, + "router_z_loss_clip": 3.39453125, + "router_z_loss_mlp": 0.42651367, + "step": 8166, + "time_per_iteration": 2.6887905597686768 + }, + { + "auxiliary_loss_clip": 0.01400278, + "auxiliary_loss_mlp": 0.00142925, + "balance_loss_clip": 1.2457087, + "balance_loss_mlp": 0.12909694, + "epoch": 0.4910266045393056, + "flos": 54684017948160.0, + "grad_norm": 0.7783668828624731, + "language_loss": 0.54161751, + "learning_rate": 2.1550447626546253e-06, + "loss": 0.55704951, + "num_input_tokens_seen": 175598640, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.13867188, + "step": 8167, + "time_per_iteration": 3.1710660457611084 + }, + { + "auxiliary_loss_clip": 0.0176046, + "auxiliary_loss_mlp": 0.00300534, + "balance_loss_clip": 1.42819786, + "balance_loss_mlp": 0.25556871, + "epoch": 0.49108672779197354, + "flos": 16246454342400.0, + "grad_norm": 2.5347312513498648, + "language_loss": 0.92788619, + "learning_rate": 2.1546564700772665e-06, + "loss": 0.9484961, + "num_input_tokens_seen": 175615675, + "router_z_loss_clip": 3.32617188, + "router_z_loss_mlp": 0.44995117, + "step": 8168, + "time_per_iteration": 4.135586738586426 + }, + { + "auxiliary_loss_clip": 0.01740493, + "auxiliary_loss_mlp": 0.00286844, + "balance_loss_clip": 1.4103092, + "balance_loss_mlp": 0.24147293, + "epoch": 0.4911468510446415, + "flos": 19825419623040.0, + "grad_norm": 2.214623309945843, + "language_loss": 0.78168356, + "learning_rate": 2.1542681716353193e-06, + "loss": 0.80195689, + "num_input_tokens_seen": 175632255, + "router_z_loss_clip": 3.30078125, + "router_z_loss_mlp": 0.45361328, + "step": 8169, + "time_per_iteration": 4.1668150424957275 + }, + { + "auxiliary_loss_clip": 0.01721501, + "auxiliary_loss_mlp": 0.00288166, + "balance_loss_clip": 1.39812899, + "balance_loss_mlp": 0.24527447, + "epoch": 0.4912069742973095, + "flos": 21212684743680.0, + "grad_norm": 7.619494214548272, + "language_loss": 0.82617104, + "learning_rate": 2.1538798673435068e-06, + "loss": 0.84626776, + "num_input_tokens_seen": 175651625, + "router_z_loss_clip": 3.234375, + "router_z_loss_mlp": 0.42871094, + "step": 8170, + "time_per_iteration": 2.676424026489258 + }, + { + "auxiliary_loss_clip": 0.01762796, + "auxiliary_loss_mlp": 0.00323198, + "balance_loss_clip": 1.42203331, + "balance_loss_mlp": 0.27961531, + "epoch": 0.49126709754997744, + "flos": 19537129065600.0, + "grad_norm": 12.916693576931934, + "language_loss": 0.85481226, + "learning_rate": 2.1534915572165545e-06, + "loss": 0.87567228, + "num_input_tokens_seen": 175669265, + "router_z_loss_clip": 3.40820312, + "router_z_loss_mlp": 0.43603516, + "step": 8171, + "time_per_iteration": 2.7030532360076904 + }, + { + "auxiliary_loss_clip": 0.01752072, + "auxiliary_loss_mlp": 0.00289538, + "balance_loss_clip": 1.41719723, + "balance_loss_mlp": 0.24194978, + "epoch": 0.4913272208026454, + "flos": 12239686080000.0, + "grad_norm": 15.501023343054678, + "language_loss": 0.90044481, + "learning_rate": 2.1531032412691875e-06, + "loss": 0.92086095, + "num_input_tokens_seen": 175686065, + "router_z_loss_clip": 3.34960938, + "router_z_loss_mlp": 0.47583008, + "step": 8172, + "time_per_iteration": 2.615058183670044 + }, + { + "auxiliary_loss_clip": 0.01379284, + "auxiliary_loss_mlp": 0.00090546, + "balance_loss_clip": 1.22725129, + "balance_loss_mlp": 0.07767108, + "epoch": 0.49138734405531337, + "flos": 65465871661440.0, + "grad_norm": 0.683938290014935, + "language_loss": 0.52811992, + "learning_rate": 2.1527149195161295e-06, + "loss": 0.54281819, + "num_input_tokens_seen": 175748595, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.12890625, + "step": 8173, + "time_per_iteration": 3.1528351306915283 + }, + { + "auxiliary_loss_clip": 0.01748055, + "auxiliary_loss_mlp": 0.00298344, + "balance_loss_clip": 1.41328645, + "balance_loss_mlp": 0.253712, + "epoch": 0.4914474673079814, + "flos": 18439052342400.0, + "grad_norm": 67.05011429277769, + "language_loss": 0.68686283, + "learning_rate": 2.152326591972107e-06, + "loss": 0.70732677, + "num_input_tokens_seen": 175766770, + "router_z_loss_clip": 3.34960938, + "router_z_loss_mlp": 0.44628906, + "step": 8174, + "time_per_iteration": 4.0173656940460205 + }, + { + "auxiliary_loss_clip": 0.01720251, + "auxiliary_loss_mlp": 0.00296833, + "balance_loss_clip": 1.39622748, + "balance_loss_mlp": 0.25243953, + "epoch": 0.49150759056064935, + "flos": 21685053525120.0, + "grad_norm": 12.214327716364425, + "language_loss": 0.75345576, + "learning_rate": 2.1519382586518445e-06, + "loss": 0.77362657, + "num_input_tokens_seen": 175783605, + "router_z_loss_clip": 3.2421875, + "router_z_loss_mlp": 0.4440918, + "step": 8175, + "time_per_iteration": 2.6508235931396484 + }, + { + "auxiliary_loss_clip": 0.01743162, + "auxiliary_loss_mlp": 0.00299663, + "balance_loss_clip": 1.41465545, + "balance_loss_mlp": 0.25715277, + "epoch": 0.4915677138133173, + "flos": 22382439056640.0, + "grad_norm": 37.21413366187826, + "language_loss": 0.80160451, + "learning_rate": 2.151549919570068e-06, + "loss": 0.82203281, + "num_input_tokens_seen": 175801390, + "router_z_loss_clip": 3.28320312, + "router_z_loss_mlp": 0.42480469, + "step": 8176, + "time_per_iteration": 2.6696791648864746 + }, + { + "auxiliary_loss_clip": 0.01744392, + "auxiliary_loss_mlp": 0.0030426, + "balance_loss_clip": 1.41299617, + "balance_loss_mlp": 0.26153505, + "epoch": 0.4916278370659853, + "flos": 18402890325120.0, + "grad_norm": 10.173400178640419, + "language_loss": 0.7545839, + "learning_rate": 2.1511615747415036e-06, + "loss": 0.77507043, + "num_input_tokens_seen": 175819830, + "router_z_loss_clip": 3.31445312, + "router_z_loss_mlp": 0.42749023, + "step": 8177, + "time_per_iteration": 2.652636766433716 + }, + { + "auxiliary_loss_clip": 0.01366078, + "auxiliary_loss_mlp": 0.00151244, + "balance_loss_clip": 1.21711445, + "balance_loss_mlp": 0.13932317, + "epoch": 0.49168796031865325, + "flos": 66609124715520.0, + "grad_norm": 0.6974561784120124, + "language_loss": 0.45750731, + "learning_rate": 2.150773224180877e-06, + "loss": 0.47268054, + "num_input_tokens_seen": 175881765, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.11914062, + "step": 8178, + "time_per_iteration": 3.2028563022613525 + }, + { + "auxiliary_loss_clip": 0.01715839, + "auxiliary_loss_mlp": 0.00322637, + "balance_loss_clip": 1.39172745, + "balance_loss_mlp": 0.28112847, + "epoch": 0.4917480835713212, + "flos": 20959335141120.0, + "grad_norm": 103.33180577840932, + "language_loss": 0.71693373, + "learning_rate": 2.1503848679029147e-06, + "loss": 0.73731846, + "num_input_tokens_seen": 175901795, + "router_z_loss_clip": 3.2421875, + "router_z_loss_mlp": 0.41503906, + "step": 8179, + "time_per_iteration": 2.816530466079712 + }, + { + "auxiliary_loss_clip": 0.01719989, + "auxiliary_loss_mlp": 0.00322897, + "balance_loss_clip": 1.3880868, + "balance_loss_mlp": 0.2801249, + "epoch": 0.4918082068239892, + "flos": 15772900412160.0, + "grad_norm": 5.672802862473398, + "language_loss": 0.76275253, + "learning_rate": 2.149996505922343e-06, + "loss": 0.78318143, + "num_input_tokens_seen": 175917770, + "router_z_loss_clip": 3.31640625, + "router_z_loss_mlp": 0.42749023, + "step": 8180, + "time_per_iteration": 2.634249448776245 + }, + { + "auxiliary_loss_clip": 0.01706334, + "auxiliary_loss_mlp": 0.0030882, + "balance_loss_clip": 1.38633871, + "balance_loss_mlp": 0.26552346, + "epoch": 0.49186833007665715, + "flos": 24604806453120.0, + "grad_norm": 20.559002547392563, + "language_loss": 0.8928895, + "learning_rate": 2.1496081382538895e-06, + "loss": 0.913041, + "num_input_tokens_seen": 175937000, + "router_z_loss_clip": 3.19726562, + "router_z_loss_mlp": 0.43310547, + "step": 8181, + "time_per_iteration": 2.735861301422119 + }, + { + "auxiliary_loss_clip": 0.01702233, + "auxiliary_loss_mlp": 0.00294537, + "balance_loss_clip": 1.3908124, + "balance_loss_mlp": 0.25379106, + "epoch": 0.4919284533293251, + "flos": 22090557139200.0, + "grad_norm": 16.799404553229635, + "language_loss": 0.79494345, + "learning_rate": 2.1492197649122793e-06, + "loss": 0.81491125, + "num_input_tokens_seen": 175955170, + "router_z_loss_clip": 3.1171875, + "router_z_loss_mlp": 0.40722656, + "step": 8182, + "time_per_iteration": 2.6644675731658936 + }, + { + "auxiliary_loss_clip": 0.01703456, + "auxiliary_loss_mlp": 0.00297725, + "balance_loss_clip": 1.38493991, + "balance_loss_mlp": 0.25476187, + "epoch": 0.4919885765819931, + "flos": 23368043318400.0, + "grad_norm": 869.6302064633481, + "language_loss": 0.81541336, + "learning_rate": 2.1488313859122412e-06, + "loss": 0.83542514, + "num_input_tokens_seen": 175973725, + "router_z_loss_clip": 3.18554688, + "router_z_loss_mlp": 0.42993164, + "step": 8183, + "time_per_iteration": 2.750823497772217 + }, + { + "auxiliary_loss_clip": 0.01736452, + "auxiliary_loss_mlp": 0.00318675, + "balance_loss_clip": 1.40260816, + "balance_loss_mlp": 0.27587891, + "epoch": 0.49204869983466104, + "flos": 21360493209600.0, + "grad_norm": 5.082062150863461, + "language_loss": 0.83428538, + "learning_rate": 2.1484430012685015e-06, + "loss": 0.85483664, + "num_input_tokens_seen": 175993885, + "router_z_loss_clip": 3.33789062, + "router_z_loss_mlp": 0.42797852, + "step": 8184, + "time_per_iteration": 2.7768585681915283 + }, + { + "auxiliary_loss_clip": 0.0170272, + "auxiliary_loss_mlp": 0.00291694, + "balance_loss_clip": 1.38689232, + "balance_loss_mlp": 0.25171113, + "epoch": 0.492108823087329, + "flos": 21142695093120.0, + "grad_norm": 3.893195202774834, + "language_loss": 0.78033519, + "learning_rate": 2.148054610995789e-06, + "loss": 0.80027938, + "num_input_tokens_seen": 176014210, + "router_z_loss_clip": 3.16210938, + "router_z_loss_mlp": 0.39990234, + "step": 8185, + "time_per_iteration": 2.66658616065979 + }, + { + "auxiliary_loss_clip": 0.01733393, + "auxiliary_loss_mlp": 0.00317197, + "balance_loss_clip": 1.39878154, + "balance_loss_mlp": 0.27089632, + "epoch": 0.49216894633999697, + "flos": 25116605389440.0, + "grad_norm": 32.273739781766565, + "language_loss": 0.81096387, + "learning_rate": 2.147666215108831e-06, + "loss": 0.83146977, + "num_input_tokens_seen": 176033890, + "router_z_loss_clip": 3.34570312, + "router_z_loss_mlp": 0.46289062, + "step": 8186, + "time_per_iteration": 2.707064151763916 + }, + { + "auxiliary_loss_clip": 0.01718392, + "auxiliary_loss_mlp": 0.00324998, + "balance_loss_clip": 1.39345467, + "balance_loss_mlp": 0.28131998, + "epoch": 0.49222906959266494, + "flos": 22637943475200.0, + "grad_norm": 4.029286656943426, + "language_loss": 0.75090158, + "learning_rate": 2.1472778136223545e-06, + "loss": 0.77133554, + "num_input_tokens_seen": 176052720, + "router_z_loss_clip": 3.24804688, + "router_z_loss_mlp": 0.43676758, + "step": 8187, + "time_per_iteration": 2.6857657432556152 + }, + { + "auxiliary_loss_clip": 0.01708421, + "auxiliary_loss_mlp": 0.00323628, + "balance_loss_clip": 1.38692117, + "balance_loss_mlp": 0.27868617, + "epoch": 0.49228919284533296, + "flos": 20410548174720.0, + "grad_norm": 222.58073225764946, + "language_loss": 0.71682167, + "learning_rate": 2.1468894065510894e-06, + "loss": 0.73714209, + "num_input_tokens_seen": 176072545, + "router_z_loss_clip": 3.21289062, + "router_z_loss_mlp": 0.44897461, + "step": 8188, + "time_per_iteration": 2.6485345363616943 + }, + { + "auxiliary_loss_clip": 0.01721291, + "auxiliary_loss_mlp": 0.00320874, + "balance_loss_clip": 1.39890695, + "balance_loss_mlp": 0.28234541, + "epoch": 0.4923493160980009, + "flos": 27122359818240.0, + "grad_norm": 2.0743721855641573, + "language_loss": 0.80908346, + "learning_rate": 2.1465009939097623e-06, + "loss": 0.82950509, + "num_input_tokens_seen": 176091490, + "router_z_loss_clip": 3.22070312, + "router_z_loss_mlp": 0.38549805, + "step": 8189, + "time_per_iteration": 2.6965601444244385 + }, + { + "auxiliary_loss_clip": 0.01709927, + "auxiliary_loss_mlp": 0.00298326, + "balance_loss_clip": 1.38990164, + "balance_loss_mlp": 0.25824815, + "epoch": 0.4924094393506689, + "flos": 35736683224320.0, + "grad_norm": 45.89313695780101, + "language_loss": 0.69794786, + "learning_rate": 2.146112575713104e-06, + "loss": 0.71803039, + "num_input_tokens_seen": 176113200, + "router_z_loss_clip": 3.19921875, + "router_z_loss_mlp": 0.40087891, + "step": 8190, + "time_per_iteration": 2.755208730697632 + }, + { + "auxiliary_loss_clip": 0.01718429, + "auxiliary_loss_mlp": 0.00294966, + "balance_loss_clip": 1.39326262, + "balance_loss_mlp": 0.25565043, + "epoch": 0.49246956260333685, + "flos": 20412487509120.0, + "grad_norm": 4.237079667955729, + "language_loss": 0.78953838, + "learning_rate": 2.1457241519758413e-06, + "loss": 0.80967236, + "num_input_tokens_seen": 176132485, + "router_z_loss_clip": 3.25390625, + "router_z_loss_mlp": 0.39331055, + "step": 8191, + "time_per_iteration": 2.6675689220428467 + }, + { + "auxiliary_loss_clip": 0.01728027, + "auxiliary_loss_mlp": 0.00321048, + "balance_loss_clip": 1.40061617, + "balance_loss_mlp": 0.27584359, + "epoch": 0.4925296858560048, + "flos": 38976938231040.0, + "grad_norm": 4.4301849371560635, + "language_loss": 0.77700567, + "learning_rate": 2.1453357227127043e-06, + "loss": 0.79749644, + "num_input_tokens_seen": 176155755, + "router_z_loss_clip": 3.2734375, + "router_z_loss_mlp": 0.45214844, + "step": 8192, + "time_per_iteration": 2.854296922683716 + }, + { + "auxiliary_loss_clip": 0.01343245, + "auxiliary_loss_mlp": 0.00082113, + "balance_loss_clip": 1.19968104, + "balance_loss_mlp": 0.06971523, + "epoch": 0.4925898091086728, + "flos": 64278917712000.0, + "grad_norm": 0.6986153400912771, + "language_loss": 0.51809096, + "learning_rate": 2.1449472879384224e-06, + "loss": 0.53234446, + "num_input_tokens_seen": 176216295, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.12353516, + "step": 8193, + "time_per_iteration": 3.201075792312622 + }, + { + "auxiliary_loss_clip": 0.01731824, + "auxiliary_loss_mlp": 0.002703, + "balance_loss_clip": 1.40882087, + "balance_loss_mlp": 0.22998382, + "epoch": 0.49264993236134075, + "flos": 23036372110080.0, + "grad_norm": 1.8194520858667815, + "language_loss": 0.82394844, + "learning_rate": 2.1445588476677246e-06, + "loss": 0.8439697, + "num_input_tokens_seen": 176235925, + "router_z_loss_clip": 3.23242188, + "router_z_loss_mlp": 0.40332031, + "step": 8194, + "time_per_iteration": 2.7732198238372803 + }, + { + "auxiliary_loss_clip": 0.01736216, + "auxiliary_loss_mlp": 0.00331789, + "balance_loss_clip": 1.40528488, + "balance_loss_mlp": 0.29101938, + "epoch": 0.4927100556140087, + "flos": 24718212668160.0, + "grad_norm": 2.67366488377231, + "language_loss": 0.7800861, + "learning_rate": 2.144170401915341e-06, + "loss": 0.80076617, + "num_input_tokens_seen": 176253865, + "router_z_loss_clip": 3.31054688, + "router_z_loss_mlp": 0.40795898, + "step": 8195, + "time_per_iteration": 2.6731414794921875 + }, + { + "auxiliary_loss_clip": 0.01714471, + "auxiliary_loss_mlp": 0.00299084, + "balance_loss_clip": 1.39204812, + "balance_loss_mlp": 0.25719365, + "epoch": 0.4927701788666767, + "flos": 23505544581120.0, + "grad_norm": 216.90566588284858, + "language_loss": 0.86029243, + "learning_rate": 2.143781950696001e-06, + "loss": 0.88042796, + "num_input_tokens_seen": 176271525, + "router_z_loss_clip": 3.2265625, + "router_z_loss_mlp": 0.41845703, + "step": 8196, + "time_per_iteration": 2.724663496017456 + }, + { + "auxiliary_loss_clip": 0.01731008, + "auxiliary_loss_mlp": 0.00327399, + "balance_loss_clip": 1.39890623, + "balance_loss_mlp": 0.28422105, + "epoch": 0.49283030211934464, + "flos": 22928891639040.0, + "grad_norm": 4.769995356371952, + "language_loss": 0.7704916, + "learning_rate": 2.1433934940244356e-06, + "loss": 0.79107571, + "num_input_tokens_seen": 176290810, + "router_z_loss_clip": 3.3203125, + "router_z_loss_mlp": 0.43188477, + "step": 8197, + "time_per_iteration": 2.6962990760803223 + }, + { + "auxiliary_loss_clip": 0.01741869, + "auxiliary_loss_mlp": 0.00296657, + "balance_loss_clip": 1.41439784, + "balance_loss_mlp": 0.25576773, + "epoch": 0.4928904253720126, + "flos": 16873024210560.0, + "grad_norm": 3.001666305641691, + "language_loss": 0.91787815, + "learning_rate": 2.143005031915374e-06, + "loss": 0.93826342, + "num_input_tokens_seen": 176309165, + "router_z_loss_clip": 3.2734375, + "router_z_loss_mlp": 0.40869141, + "step": 8198, + "time_per_iteration": 2.6900618076324463 + }, + { + "auxiliary_loss_clip": 0.01750788, + "auxiliary_loss_mlp": 0.00327178, + "balance_loss_clip": 1.4128567, + "balance_loss_mlp": 0.28576499, + "epoch": 0.4929505486246806, + "flos": 14866551509760.0, + "grad_norm": 3.442085956896212, + "language_loss": 0.81621176, + "learning_rate": 2.1426165643835467e-06, + "loss": 0.83699143, + "num_input_tokens_seen": 176324960, + "router_z_loss_clip": 3.3828125, + "router_z_loss_mlp": 0.41430664, + "step": 8199, + "time_per_iteration": 2.6400697231292725 + }, + { + "auxiliary_loss_clip": 0.0174806, + "auxiliary_loss_mlp": 0.00343703, + "balance_loss_clip": 1.40986443, + "balance_loss_mlp": 0.29811722, + "epoch": 0.49301067187734854, + "flos": 23842351434240.0, + "grad_norm": 5.485531773138667, + "language_loss": 0.66909266, + "learning_rate": 2.1422280914436864e-06, + "loss": 0.69001031, + "num_input_tokens_seen": 176346195, + "router_z_loss_clip": 3.37695312, + "router_z_loss_mlp": 0.45581055, + "step": 8200, + "time_per_iteration": 2.6963675022125244 + }, + { + "auxiliary_loss_clip": 0.01735313, + "auxiliary_loss_mlp": 0.00304981, + "balance_loss_clip": 1.41396654, + "balance_loss_mlp": 0.26409268, + "epoch": 0.49307079513001656, + "flos": 22491284244480.0, + "grad_norm": 1412.3271083727207, + "language_loss": 0.84372336, + "learning_rate": 2.1418396131105213e-06, + "loss": 0.86412632, + "num_input_tokens_seen": 176366735, + "router_z_loss_clip": 3.21289062, + "router_z_loss_mlp": 0.40869141, + "step": 8201, + "time_per_iteration": 2.6772193908691406 + }, + { + "auxiliary_loss_clip": 0.01742981, + "auxiliary_loss_mlp": 0.00323126, + "balance_loss_clip": 1.4039067, + "balance_loss_mlp": 0.27858937, + "epoch": 0.4931309183826845, + "flos": 15924587546880.0, + "grad_norm": 62.72833031794366, + "language_loss": 0.77613997, + "learning_rate": 2.141451129398785e-06, + "loss": 0.79680109, + "num_input_tokens_seen": 176384475, + "router_z_loss_clip": 3.390625, + "router_z_loss_mlp": 0.4453125, + "step": 8202, + "time_per_iteration": 2.6978633403778076 + }, + { + "auxiliary_loss_clip": 0.01726384, + "auxiliary_loss_mlp": 0.00323914, + "balance_loss_clip": 1.40167093, + "balance_loss_mlp": 0.28383541, + "epoch": 0.4931910416353525, + "flos": 27309059735040.0, + "grad_norm": 2.2090514157759054, + "language_loss": 0.83435726, + "learning_rate": 2.1410626403232076e-06, + "loss": 0.85486019, + "num_input_tokens_seen": 176402645, + "router_z_loss_clip": 3.24609375, + "router_z_loss_mlp": 0.40087891, + "step": 8203, + "time_per_iteration": 2.7838833332061768 + }, + { + "auxiliary_loss_clip": 0.01748404, + "auxiliary_loss_mlp": 0.00293719, + "balance_loss_clip": 1.4131062, + "balance_loss_mlp": 0.25008804, + "epoch": 0.49325116488802045, + "flos": 20806139635200.0, + "grad_norm": 28.983820006045754, + "language_loss": 0.8754065, + "learning_rate": 2.1406741458985197e-06, + "loss": 0.89582771, + "num_input_tokens_seen": 176416715, + "router_z_loss_clip": 3.34960938, + "router_z_loss_mlp": 0.43676758, + "step": 8204, + "time_per_iteration": 2.6240766048431396 + }, + { + "auxiliary_loss_clip": 0.0172325, + "auxiliary_loss_mlp": 0.00300754, + "balance_loss_clip": 1.40278983, + "balance_loss_mlp": 0.25791031, + "epoch": 0.4933112881406884, + "flos": 19865963099520.0, + "grad_norm": 34.54654881717216, + "language_loss": 0.74782383, + "learning_rate": 2.140285646139455e-06, + "loss": 0.7680639, + "num_input_tokens_seen": 176435755, + "router_z_loss_clip": 3.20507812, + "router_z_loss_mlp": 0.4284668, + "step": 8205, + "time_per_iteration": 2.668473958969116 + }, + { + "auxiliary_loss_clip": 0.01720458, + "auxiliary_loss_mlp": 0.00354706, + "balance_loss_clip": 1.39216888, + "balance_loss_mlp": 0.30876297, + "epoch": 0.4933714113933564, + "flos": 21827977741440.0, + "grad_norm": 12.503163075918478, + "language_loss": 0.73369324, + "learning_rate": 2.139897141060744e-06, + "loss": 0.75444484, + "num_input_tokens_seen": 176453915, + "router_z_loss_clip": 3.28320312, + "router_z_loss_mlp": 0.45947266, + "step": 8206, + "time_per_iteration": 2.692049026489258 + }, + { + "auxiliary_loss_clip": 0.01694619, + "auxiliary_loss_mlp": 0.00307524, + "balance_loss_clip": 1.38005948, + "balance_loss_mlp": 0.26723105, + "epoch": 0.49343153464602435, + "flos": 27890130049920.0, + "grad_norm": 10.725608039623944, + "language_loss": 0.84321767, + "learning_rate": 2.1395086306771196e-06, + "loss": 0.86323905, + "num_input_tokens_seen": 176475175, + "router_z_loss_clip": 3.14648438, + "router_z_loss_mlp": 0.40307617, + "step": 8207, + "time_per_iteration": 4.155500650405884 + }, + { + "auxiliary_loss_clip": 0.01710508, + "auxiliary_loss_mlp": 0.00312152, + "balance_loss_clip": 1.39428222, + "balance_loss_mlp": 0.26926064, + "epoch": 0.4934916578986923, + "flos": 24681080983680.0, + "grad_norm": 10.218398974185554, + "language_loss": 0.69287086, + "learning_rate": 2.1391201150033147e-06, + "loss": 0.71309745, + "num_input_tokens_seen": 176494250, + "router_z_loss_clip": 3.1640625, + "router_z_loss_mlp": 0.42919922, + "step": 8208, + "time_per_iteration": 2.703428268432617 + }, + { + "auxiliary_loss_clip": 0.01702251, + "auxiliary_loss_mlp": 0.00295089, + "balance_loss_clip": 1.3843205, + "balance_loss_mlp": 0.2549392, + "epoch": 0.4935517811513603, + "flos": 23405139089280.0, + "grad_norm": 9.130205847769034, + "language_loss": 0.87116003, + "learning_rate": 2.1387315940540598e-06, + "loss": 0.89113343, + "num_input_tokens_seen": 176513325, + "router_z_loss_clip": 3.17773438, + "router_z_loss_mlp": 0.40136719, + "step": 8209, + "time_per_iteration": 2.6835927963256836 + }, + { + "auxiliary_loss_clip": 0.01701118, + "auxiliary_loss_mlp": 0.00310647, + "balance_loss_clip": 1.39418161, + "balance_loss_mlp": 0.26701623, + "epoch": 0.49361190440402825, + "flos": 21944508439680.0, + "grad_norm": 3.771504733832411, + "language_loss": 0.84856468, + "learning_rate": 2.138343067844089e-06, + "loss": 0.86868227, + "num_input_tokens_seen": 176532915, + "router_z_loss_clip": 3.0703125, + "router_z_loss_mlp": 0.43652344, + "step": 8210, + "time_per_iteration": 4.106863737106323 + }, + { + "auxiliary_loss_clip": 0.01694364, + "auxiliary_loss_mlp": 0.00323107, + "balance_loss_clip": 1.37488151, + "balance_loss_mlp": 0.28188413, + "epoch": 0.4936720276566962, + "flos": 25115671635840.0, + "grad_norm": 2.7947614379302324, + "language_loss": 0.86350667, + "learning_rate": 2.1379545363881363e-06, + "loss": 0.88368142, + "num_input_tokens_seen": 176552775, + "router_z_loss_clip": 3.19140625, + "router_z_loss_mlp": 0.41235352, + "step": 8211, + "time_per_iteration": 4.233346223831177 + }, + { + "auxiliary_loss_clip": 0.01692386, + "auxiliary_loss_mlp": 0.0030642, + "balance_loss_clip": 1.37640631, + "balance_loss_mlp": 0.26324272, + "epoch": 0.4937321509093642, + "flos": 26358935132160.0, + "grad_norm": 10.704721391912084, + "language_loss": 1.00709331, + "learning_rate": 2.137565999700933e-06, + "loss": 1.02708149, + "num_input_tokens_seen": 176572185, + "router_z_loss_clip": 3.16210938, + "router_z_loss_mlp": 0.43188477, + "step": 8212, + "time_per_iteration": 2.769343376159668 + }, + { + "auxiliary_loss_clip": 0.01681811, + "auxiliary_loss_mlp": 0.00293095, + "balance_loss_clip": 1.37641108, + "balance_loss_mlp": 0.24960756, + "epoch": 0.49379227416203214, + "flos": 22961390469120.0, + "grad_norm": 17.8886542469904, + "language_loss": 0.71861029, + "learning_rate": 2.1371774577972138e-06, + "loss": 0.73835933, + "num_input_tokens_seen": 176591490, + "router_z_loss_clip": 3.05664062, + "router_z_loss_mlp": 0.43481445, + "step": 8213, + "time_per_iteration": 2.7249794006347656 + }, + { + "auxiliary_loss_clip": 0.01684213, + "auxiliary_loss_mlp": 0.00304953, + "balance_loss_clip": 1.37417912, + "balance_loss_mlp": 0.26077357, + "epoch": 0.49385239741470016, + "flos": 32489101843200.0, + "grad_norm": 27.537595744211142, + "language_loss": 0.83471429, + "learning_rate": 2.136788910691711e-06, + "loss": 0.85460591, + "num_input_tokens_seen": 176612715, + "router_z_loss_clip": 3.09765625, + "router_z_loss_mlp": 0.44140625, + "step": 8214, + "time_per_iteration": 2.7611501216888428 + }, + { + "auxiliary_loss_clip": 0.01674957, + "auxiliary_loss_mlp": 0.00300537, + "balance_loss_clip": 1.3692838, + "balance_loss_mlp": 0.25960097, + "epoch": 0.4939125206673681, + "flos": 22492864442880.0, + "grad_norm": 63.30402690573189, + "language_loss": 0.91447043, + "learning_rate": 2.1364003583991594e-06, + "loss": 0.93422538, + "num_input_tokens_seen": 176631950, + "router_z_loss_clip": 3.05664062, + "router_z_loss_mlp": 0.40966797, + "step": 8215, + "time_per_iteration": 2.6575124263763428 + }, + { + "auxiliary_loss_clip": 0.01653341, + "auxiliary_loss_mlp": 0.00259794, + "balance_loss_clip": 1.36345863, + "balance_loss_mlp": 0.2200259, + "epoch": 0.4939726439200361, + "flos": 31176351486720.0, + "grad_norm": 86.92918198262056, + "language_loss": 0.88760793, + "learning_rate": 2.136011800934292e-06, + "loss": 0.90673923, + "num_input_tokens_seen": 176653060, + "router_z_loss_clip": 2.90039062, + "router_z_loss_mlp": 0.39770508, + "step": 8216, + "time_per_iteration": 2.760728597640991 + }, + { + "auxiliary_loss_clip": 0.01655854, + "auxiliary_loss_mlp": 0.00321845, + "balance_loss_clip": 1.36002457, + "balance_loss_mlp": 0.27914461, + "epoch": 0.49403276717270406, + "flos": 22674213233280.0, + "grad_norm": 9.170620042209707, + "language_loss": 0.80342436, + "learning_rate": 2.1356232383118442e-06, + "loss": 0.8232013, + "num_input_tokens_seen": 176673895, + "router_z_loss_clip": 2.95507812, + "router_z_loss_mlp": 0.42700195, + "step": 8217, + "time_per_iteration": 4.190083026885986 + }, + { + "auxiliary_loss_clip": 0.01675724, + "auxiliary_loss_mlp": 0.0028438, + "balance_loss_clip": 1.37427711, + "balance_loss_mlp": 0.24625713, + "epoch": 0.494092890425372, + "flos": 20741070147840.0, + "grad_norm": 33.60781995111995, + "language_loss": 0.84249783, + "learning_rate": 2.1352346705465494e-06, + "loss": 0.86209881, + "num_input_tokens_seen": 176692550, + "router_z_loss_clip": 3.01757812, + "router_z_loss_mlp": 0.3815918, + "step": 8218, + "time_per_iteration": 2.727193593978882 + }, + { + "auxiliary_loss_clip": 0.01642737, + "auxiliary_loss_mlp": 0.00270602, + "balance_loss_clip": 1.35023916, + "balance_loss_mlp": 0.22737651, + "epoch": 0.49415301367804, + "flos": 18369026778240.0, + "grad_norm": 47.6293177950911, + "language_loss": 0.84344417, + "learning_rate": 2.134846097653142e-06, + "loss": 0.86257762, + "num_input_tokens_seen": 176709335, + "router_z_loss_clip": 2.921875, + "router_z_loss_mlp": 0.43237305, + "step": 8219, + "time_per_iteration": 2.673443078994751 + }, + { + "auxiliary_loss_clip": 0.01635892, + "auxiliary_loss_mlp": 0.00283628, + "balance_loss_clip": 1.34649956, + "balance_loss_mlp": 0.24591023, + "epoch": 0.49421313693070795, + "flos": 17530620451200.0, + "grad_norm": 112.71575367911119, + "language_loss": 0.68434584, + "learning_rate": 2.134457519646357e-06, + "loss": 0.70354104, + "num_input_tokens_seen": 176727715, + "router_z_loss_clip": 2.89453125, + "router_z_loss_mlp": 0.37719727, + "step": 8220, + "time_per_iteration": 2.680807113647461 + }, + { + "auxiliary_loss_clip": 0.01634946, + "auxiliary_loss_mlp": 0.00302336, + "balance_loss_clip": 1.3455137, + "balance_loss_mlp": 0.26094687, + "epoch": 0.4942732601833759, + "flos": 20812173120000.0, + "grad_norm": 14.420172130642793, + "language_loss": 0.7950995, + "learning_rate": 2.1340689365409296e-06, + "loss": 0.81447232, + "num_input_tokens_seen": 176747530, + "router_z_loss_clip": 2.89648438, + "router_z_loss_mlp": 0.4140625, + "step": 8221, + "time_per_iteration": 2.673842430114746 + }, + { + "auxiliary_loss_clip": 0.01647687, + "auxiliary_loss_mlp": 0.0027187, + "balance_loss_clip": 1.36194539, + "balance_loss_mlp": 0.23169665, + "epoch": 0.4943333834360439, + "flos": 15048941794560.0, + "grad_norm": 114.22466884595354, + "language_loss": 0.85039586, + "learning_rate": 2.133680348351595e-06, + "loss": 0.86959147, + "num_input_tokens_seen": 176765260, + "router_z_loss_clip": 2.859375, + "router_z_loss_mlp": 0.40185547, + "step": 8222, + "time_per_iteration": 2.754786968231201 + }, + { + "auxiliary_loss_clip": 0.01650414, + "auxiliary_loss_mlp": 0.00323958, + "balance_loss_clip": 1.3557229, + "balance_loss_mlp": 0.28211555, + "epoch": 0.49439350668871185, + "flos": 16070420764800.0, + "grad_norm": 2.8880024963737236, + "language_loss": 0.79967988, + "learning_rate": 2.133291755093088e-06, + "loss": 0.81942362, + "num_input_tokens_seen": 176781770, + "router_z_loss_clip": 2.94726562, + "router_z_loss_mlp": 0.41845703, + "step": 8223, + "time_per_iteration": 2.7002005577087402 + }, + { + "auxiliary_loss_clip": 0.01644655, + "auxiliary_loss_mlp": 0.00345866, + "balance_loss_clip": 1.34670568, + "balance_loss_mlp": 0.30206847, + "epoch": 0.4944536299413798, + "flos": 20880079781760.0, + "grad_norm": 38.03861986953734, + "language_loss": 0.80473047, + "learning_rate": 2.132903156780144e-06, + "loss": 0.82463568, + "num_input_tokens_seen": 176800655, + "router_z_loss_clip": 2.97851562, + "router_z_loss_mlp": 0.43798828, + "step": 8224, + "time_per_iteration": 2.664337158203125 + }, + { + "auxiliary_loss_clip": 0.01639089, + "auxiliary_loss_mlp": 0.00307359, + "balance_loss_clip": 1.34521019, + "balance_loss_mlp": 0.26670903, + "epoch": 0.4945137531940478, + "flos": 26608908856320.0, + "grad_norm": 13.152464286330419, + "language_loss": 0.73920339, + "learning_rate": 2.1325145534274997e-06, + "loss": 0.75866789, + "num_input_tokens_seen": 176820610, + "router_z_loss_clip": 2.9375, + "router_z_loss_mlp": 0.40649414, + "step": 8225, + "time_per_iteration": 2.7064077854156494 + }, + { + "auxiliary_loss_clip": 0.01631537, + "auxiliary_loss_mlp": 0.00291758, + "balance_loss_clip": 1.34058213, + "balance_loss_mlp": 0.25346753, + "epoch": 0.49457387644671574, + "flos": 23988148738560.0, + "grad_norm": 4.425474108537735, + "language_loss": 0.83511364, + "learning_rate": 2.1321259450498893e-06, + "loss": 0.85434663, + "num_input_tokens_seen": 176840520, + "router_z_loss_clip": 2.91210938, + "router_z_loss_mlp": 0.38305664, + "step": 8226, + "time_per_iteration": 2.6815624237060547 + }, + { + "auxiliary_loss_clip": 0.01607043, + "auxiliary_loss_mlp": 0.00297816, + "balance_loss_clip": 1.32022548, + "balance_loss_mlp": 0.25811923, + "epoch": 0.49463399969938376, + "flos": 26976598427520.0, + "grad_norm": 5.978997941982717, + "language_loss": 0.77570087, + "learning_rate": 2.131737331662051e-06, + "loss": 0.7947495, + "num_input_tokens_seen": 176860265, + "router_z_loss_clip": 2.86328125, + "router_z_loss_mlp": 0.39697266, + "step": 8227, + "time_per_iteration": 2.7171592712402344 + }, + { + "auxiliary_loss_clip": 0.01632844, + "auxiliary_loss_mlp": 0.00300946, + "balance_loss_clip": 1.33649635, + "balance_loss_mlp": 0.26027179, + "epoch": 0.49469412295205173, + "flos": 29681534067840.0, + "grad_norm": 6.62010401268089, + "language_loss": 0.78349948, + "learning_rate": 2.131348713278718e-06, + "loss": 0.80283737, + "num_input_tokens_seen": 176882910, + "router_z_loss_clip": 2.96289062, + "router_z_loss_mlp": 0.40673828, + "step": 8228, + "time_per_iteration": 2.7360432147979736 + }, + { + "auxiliary_loss_clip": 0.0161306, + "auxiliary_loss_mlp": 0.00285063, + "balance_loss_clip": 1.33199322, + "balance_loss_mlp": 0.24801265, + "epoch": 0.4947542462047197, + "flos": 24131791226880.0, + "grad_norm": 2.459432574822201, + "language_loss": 0.89888883, + "learning_rate": 2.1309600899146304e-06, + "loss": 0.9178701, + "num_input_tokens_seen": 176903030, + "router_z_loss_clip": 2.80859375, + "router_z_loss_mlp": 0.37036133, + "step": 8229, + "time_per_iteration": 2.76126766204834 + }, + { + "auxiliary_loss_clip": 0.01623344, + "auxiliary_loss_mlp": 0.00301062, + "balance_loss_clip": 1.33360159, + "balance_loss_mlp": 0.25924361, + "epoch": 0.49481436945738766, + "flos": 20045049333120.0, + "grad_norm": 1154.1990905621963, + "language_loss": 0.82664502, + "learning_rate": 2.1305714615845227e-06, + "loss": 0.84588909, + "num_input_tokens_seen": 176919025, + "router_z_loss_clip": 2.8984375, + "router_z_loss_mlp": 0.41796875, + "step": 8230, + "time_per_iteration": 2.624035120010376 + }, + { + "auxiliary_loss_clip": 0.01631506, + "auxiliary_loss_mlp": 0.00316929, + "balance_loss_clip": 1.33812284, + "balance_loss_mlp": 0.27804321, + "epoch": 0.4948744927100556, + "flos": 15669550005120.0, + "grad_norm": 8.369919294029394, + "language_loss": 0.88631213, + "learning_rate": 2.1301828283031314e-06, + "loss": 0.90579647, + "num_input_tokens_seen": 176937945, + "router_z_loss_clip": 2.93359375, + "router_z_loss_mlp": 0.38891602, + "step": 8231, + "time_per_iteration": 2.650423049926758 + }, + { + "auxiliary_loss_clip": 0.01236613, + "auxiliary_loss_mlp": 0.00045651, + "balance_loss_clip": 1.09761882, + "balance_loss_mlp": 0.03516044, + "epoch": 0.4949346159627236, + "flos": 68872071502080.0, + "grad_norm": 0.7507950587995613, + "language_loss": 0.59796774, + "learning_rate": 2.1297941900851944e-06, + "loss": 0.61079037, + "num_input_tokens_seen": 177004575, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.10498047, + "step": 8232, + "time_per_iteration": 3.2845494747161865 + }, + { + "auxiliary_loss_clip": 0.01625808, + "auxiliary_loss_mlp": 0.00324059, + "balance_loss_clip": 1.33074069, + "balance_loss_mlp": 0.28567374, + "epoch": 0.49499473921539155, + "flos": 24790285307520.0, + "grad_norm": 6.61884199573343, + "language_loss": 0.74884057, + "learning_rate": 2.1294055469454496e-06, + "loss": 0.76833928, + "num_input_tokens_seen": 177024155, + "router_z_loss_clip": 2.94921875, + "router_z_loss_mlp": 0.38354492, + "step": 8233, + "time_per_iteration": 2.707324981689453 + }, + { + "auxiliary_loss_clip": 0.0160182, + "auxiliary_loss_mlp": 0.00320246, + "balance_loss_clip": 1.31872559, + "balance_loss_mlp": 0.28143167, + "epoch": 0.4950548624680595, + "flos": 32707905540480.0, + "grad_norm": 21.083551596626002, + "language_loss": 0.73265839, + "learning_rate": 2.129016898898633e-06, + "loss": 0.75187898, + "num_input_tokens_seen": 177046185, + "router_z_loss_clip": 2.828125, + "router_z_loss_mlp": 0.38793945, + "step": 8234, + "time_per_iteration": 2.7646031379699707 + }, + { + "auxiliary_loss_clip": 0.01235725, + "auxiliary_loss_mlp": 0.00048521, + "balance_loss_clip": 1.09670496, + "balance_loss_mlp": 0.03669561, + "epoch": 0.4951149857207275, + "flos": 50082173066880.0, + "grad_norm": 0.7964200110412154, + "language_loss": 0.57729524, + "learning_rate": 2.128628245959482e-06, + "loss": 0.59013778, + "num_input_tokens_seen": 177099025, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.11816406, + "step": 8235, + "time_per_iteration": 3.0552632808685303 + }, + { + "auxiliary_loss_clip": 0.01618145, + "auxiliary_loss_mlp": 0.00357937, + "balance_loss_clip": 1.32885289, + "balance_loss_mlp": 0.32024354, + "epoch": 0.49517510897339545, + "flos": 22236785406720.0, + "grad_norm": 36.923597885855756, + "language_loss": 0.84518141, + "learning_rate": 2.1282395881427355e-06, + "loss": 0.86494219, + "num_input_tokens_seen": 177118365, + "router_z_loss_clip": 2.89257812, + "router_z_loss_mlp": 0.37695312, + "step": 8236, + "time_per_iteration": 2.6941006183624268 + }, + { + "auxiliary_loss_clip": 0.01615529, + "auxiliary_loss_mlp": 0.00303546, + "balance_loss_clip": 1.33098555, + "balance_loss_mlp": 0.26537526, + "epoch": 0.4952352322260634, + "flos": 25374120969600.0, + "grad_norm": 25.133047652426903, + "language_loss": 0.79185176, + "learning_rate": 2.1278509254631315e-06, + "loss": 0.81104249, + "num_input_tokens_seen": 177136415, + "router_z_loss_clip": 2.84765625, + "router_z_loss_mlp": 0.38183594, + "step": 8237, + "time_per_iteration": 2.6642818450927734 + }, + { + "auxiliary_loss_clip": 0.01611377, + "auxiliary_loss_mlp": 0.00288757, + "balance_loss_clip": 1.32657838, + "balance_loss_mlp": 0.25099111, + "epoch": 0.4952953554787314, + "flos": 24608721035520.0, + "grad_norm": 470.89911875110727, + "language_loss": 0.82382274, + "learning_rate": 2.127462257935406e-06, + "loss": 0.8428241, + "num_input_tokens_seen": 177155690, + "router_z_loss_clip": 2.84765625, + "router_z_loss_mlp": 0.37744141, + "step": 8238, + "time_per_iteration": 2.701849937438965 + }, + { + "auxiliary_loss_clip": 0.01606699, + "auxiliary_loss_mlp": 0.00322256, + "balance_loss_clip": 1.31770205, + "balance_loss_mlp": 0.28096172, + "epoch": 0.49535547873139935, + "flos": 17311278049920.0, + "grad_norm": 16.034726078011776, + "language_loss": 0.838202, + "learning_rate": 2.1270735855743008e-06, + "loss": 0.85749149, + "num_input_tokens_seen": 177173350, + "router_z_loss_clip": 2.890625, + "router_z_loss_mlp": 0.41308594, + "step": 8239, + "time_per_iteration": 2.6110477447509766 + }, + { + "auxiliary_loss_clip": 0.01583828, + "auxiliary_loss_mlp": 0.00342398, + "balance_loss_clip": 1.2948842, + "balance_loss_mlp": 0.29795641, + "epoch": 0.4954156019840673, + "flos": 20740315962240.0, + "grad_norm": 6.268421261733063, + "language_loss": 0.86294502, + "learning_rate": 2.126684908394552e-06, + "loss": 0.88220727, + "num_input_tokens_seen": 177191115, + "router_z_loss_clip": 2.88867188, + "router_z_loss_mlp": 0.44433594, + "step": 8240, + "time_per_iteration": 2.813711166381836 + }, + { + "auxiliary_loss_clip": 0.01583987, + "auxiliary_loss_mlp": 0.00298149, + "balance_loss_clip": 1.30709445, + "balance_loss_mlp": 0.25850028, + "epoch": 0.49547572523673533, + "flos": 12820684567680.0, + "grad_norm": 5.047517542609759, + "language_loss": 0.90827709, + "learning_rate": 2.126296226410898e-06, + "loss": 0.92709851, + "num_input_tokens_seen": 177206155, + "router_z_loss_clip": 2.76953125, + "router_z_loss_mlp": 0.39648438, + "step": 8241, + "time_per_iteration": 2.613617420196533 + }, + { + "auxiliary_loss_clip": 0.01584103, + "auxiliary_loss_mlp": 0.00290475, + "balance_loss_clip": 1.30734169, + "balance_loss_mlp": 0.252042, + "epoch": 0.4955358484894033, + "flos": 15597046402560.0, + "grad_norm": 18.168196487099916, + "language_loss": 0.83997911, + "learning_rate": 2.1259075396380794e-06, + "loss": 0.85872483, + "num_input_tokens_seen": 177224815, + "router_z_loss_clip": 2.76953125, + "router_z_loss_mlp": 0.38427734, + "step": 8242, + "time_per_iteration": 2.6464316844940186 + }, + { + "auxiliary_loss_clip": 0.0160097, + "auxiliary_loss_mlp": 0.00287191, + "balance_loss_clip": 1.3122412, + "balance_loss_mlp": 0.24701738, + "epoch": 0.49559597174207126, + "flos": 26464368528000.0, + "grad_norm": 8.165754230408591, + "language_loss": 0.72537696, + "learning_rate": 2.125518848090833e-06, + "loss": 0.74425852, + "num_input_tokens_seen": 177244490, + "router_z_loss_clip": 2.890625, + "router_z_loss_mlp": 0.40185547, + "step": 8243, + "time_per_iteration": 2.702315330505371 + }, + { + "auxiliary_loss_clip": 0.01598741, + "auxiliary_loss_mlp": 0.00305265, + "balance_loss_clip": 1.30776143, + "balance_loss_mlp": 0.26752332, + "epoch": 0.4956560949947392, + "flos": 23148234040320.0, + "grad_norm": 15.557176977922582, + "language_loss": 0.75748777, + "learning_rate": 2.125130151783901e-06, + "loss": 0.77652788, + "num_input_tokens_seen": 177264340, + "router_z_loss_clip": 2.91015625, + "router_z_loss_mlp": 0.37719727, + "step": 8244, + "time_per_iteration": 2.7242512702941895 + }, + { + "auxiliary_loss_clip": 0.01591109, + "auxiliary_loss_mlp": 0.00303897, + "balance_loss_clip": 1.30346847, + "balance_loss_mlp": 0.26117247, + "epoch": 0.4957162182474072, + "flos": 20773461237120.0, + "grad_norm": 14.086405795281935, + "language_loss": 0.83542538, + "learning_rate": 2.12474145073202e-06, + "loss": 0.85437536, + "num_input_tokens_seen": 177283055, + "router_z_loss_clip": 2.875, + "router_z_loss_mlp": 0.42724609, + "step": 8245, + "time_per_iteration": 2.710317611694336 + }, + { + "auxiliary_loss_clip": 0.0158791, + "auxiliary_loss_mlp": 0.00279428, + "balance_loss_clip": 1.30999732, + "balance_loss_mlp": 0.23918253, + "epoch": 0.49577634150007516, + "flos": 18734202397440.0, + "grad_norm": 2.6401202526439542, + "language_loss": 0.90159428, + "learning_rate": 2.1243527449499306e-06, + "loss": 0.9202677, + "num_input_tokens_seen": 177301140, + "router_z_loss_clip": 2.78125, + "router_z_loss_mlp": 0.40209961, + "step": 8246, + "time_per_iteration": 2.627795457839966 + }, + { + "auxiliary_loss_clip": 0.0159659, + "auxiliary_loss_mlp": 0.00290436, + "balance_loss_clip": 1.3044399, + "balance_loss_mlp": 0.24861708, + "epoch": 0.4958364647527431, + "flos": 25554176870400.0, + "grad_norm": 4.026796675948535, + "language_loss": 0.92804229, + "learning_rate": 2.1239640344523733e-06, + "loss": 0.94691253, + "num_input_tokens_seen": 177323095, + "router_z_loss_clip": 2.92382812, + "router_z_loss_mlp": 0.41845703, + "step": 8247, + "time_per_iteration": 2.7092082500457764 + }, + { + "auxiliary_loss_clip": 0.015716, + "auxiliary_loss_mlp": 0.00308567, + "balance_loss_clip": 1.289011, + "balance_loss_mlp": 0.26891786, + "epoch": 0.4958965880054111, + "flos": 24425325169920.0, + "grad_norm": 20.54126091507977, + "language_loss": 0.90341002, + "learning_rate": 2.123575319254087e-06, + "loss": 0.92221165, + "num_input_tokens_seen": 177339845, + "router_z_loss_clip": 2.828125, + "router_z_loss_mlp": 0.39624023, + "step": 8248, + "time_per_iteration": 2.660733461380005 + }, + { + "auxiliary_loss_clip": 0.01586116, + "auxiliary_loss_mlp": 0.0032391, + "balance_loss_clip": 1.29224014, + "balance_loss_mlp": 0.28137574, + "epoch": 0.49595671125807905, + "flos": 25083460114560.0, + "grad_norm": 149.45489290667024, + "language_loss": 0.80741489, + "learning_rate": 2.123186599369812e-06, + "loss": 0.82651508, + "num_input_tokens_seen": 177359980, + "router_z_loss_clip": 2.9375, + "router_z_loss_mlp": 0.42553711, + "step": 8249, + "time_per_iteration": 4.068791151046753 + }, + { + "auxiliary_loss_clip": 0.01581823, + "auxiliary_loss_mlp": 0.00301439, + "balance_loss_clip": 1.29800105, + "balance_loss_mlp": 0.26217115, + "epoch": 0.496016834510747, + "flos": 16435883692800.0, + "grad_norm": 1109.4912470114955, + "language_loss": 0.81902313, + "learning_rate": 2.122797874814289e-06, + "loss": 0.83785582, + "num_input_tokens_seen": 177378580, + "router_z_loss_clip": 2.8359375, + "router_z_loss_mlp": 0.39282227, + "step": 8250, + "time_per_iteration": 2.6741857528686523 + }, + { + "auxiliary_loss_clip": 0.01593108, + "auxiliary_loss_mlp": 0.00304985, + "balance_loss_clip": 1.30845785, + "balance_loss_mlp": 0.26278502, + "epoch": 0.496076957763415, + "flos": 23437925228160.0, + "grad_norm": 2.326440952845298, + "language_loss": 0.75564086, + "learning_rate": 2.1224091456022585e-06, + "loss": 0.77462184, + "num_input_tokens_seen": 177398790, + "router_z_loss_clip": 2.84765625, + "router_z_loss_mlp": 0.421875, + "step": 8251, + "time_per_iteration": 2.6751270294189453 + }, + { + "auxiliary_loss_clip": 0.01580615, + "auxiliary_loss_mlp": 0.0030981, + "balance_loss_clip": 1.29588938, + "balance_loss_mlp": 0.26844427, + "epoch": 0.49613708101608295, + "flos": 16909509450240.0, + "grad_norm": 2.6088325624834745, + "language_loss": 0.87028372, + "learning_rate": 2.122020411748461e-06, + "loss": 0.88918799, + "num_input_tokens_seen": 177416515, + "router_z_loss_clip": 2.84570312, + "router_z_loss_mlp": 0.4140625, + "step": 8252, + "time_per_iteration": 4.0679144859313965 + }, + { + "auxiliary_loss_clip": 0.01616903, + "auxiliary_loss_mlp": 0.00321022, + "balance_loss_clip": 1.32708478, + "balance_loss_mlp": 0.27839291, + "epoch": 0.4961972042687509, + "flos": 16618094409600.0, + "grad_norm": 2.2628162397748297, + "language_loss": 0.88472468, + "learning_rate": 2.1216316732676363e-06, + "loss": 0.90410393, + "num_input_tokens_seen": 177434425, + "router_z_loss_clip": 2.90039062, + "router_z_loss_mlp": 0.42651367, + "step": 8253, + "time_per_iteration": 2.6116294860839844 + }, + { + "auxiliary_loss_clip": 0.01598904, + "auxiliary_loss_mlp": 0.00265647, + "balance_loss_clip": 1.3103472, + "balance_loss_mlp": 0.22721443, + "epoch": 0.49625732752141893, + "flos": 28956749437440.0, + "grad_norm": 30.680308996179328, + "language_loss": 0.71807361, + "learning_rate": 2.1212429301745275e-06, + "loss": 0.73671913, + "num_input_tokens_seen": 177459675, + "router_z_loss_clip": 2.88867188, + "router_z_loss_mlp": 0.38427734, + "step": 8254, + "time_per_iteration": 4.210940599441528 + }, + { + "auxiliary_loss_clip": 0.01591509, + "auxiliary_loss_mlp": 0.00306566, + "balance_loss_clip": 1.29993951, + "balance_loss_mlp": 0.26438969, + "epoch": 0.4963174507740869, + "flos": 23112359331840.0, + "grad_norm": 9.039631308669888, + "language_loss": 0.8204686, + "learning_rate": 2.1208541824838743e-06, + "loss": 0.83944929, + "num_input_tokens_seen": 177478895, + "router_z_loss_clip": 2.9140625, + "router_z_loss_mlp": 0.42163086, + "step": 8255, + "time_per_iteration": 2.707580804824829 + }, + { + "auxiliary_loss_clip": 0.01592714, + "auxiliary_loss_mlp": 0.00289005, + "balance_loss_clip": 1.3032763, + "balance_loss_mlp": 0.25019011, + "epoch": 0.49637757402675486, + "flos": 13917863450880.0, + "grad_norm": 307.4588068398472, + "language_loss": 0.88399476, + "learning_rate": 2.1204654302104183e-06, + "loss": 0.90281194, + "num_input_tokens_seen": 177494920, + "router_z_loss_clip": 2.89453125, + "router_z_loss_mlp": 0.38818359, + "step": 8256, + "time_per_iteration": 2.6517257690429688 + }, + { + "auxiliary_loss_clip": 0.01597921, + "auxiliary_loss_mlp": 0.0028365, + "balance_loss_clip": 1.30930746, + "balance_loss_mlp": 0.24116403, + "epoch": 0.49643769727942283, + "flos": 22309001700480.0, + "grad_norm": 17.577293207412183, + "language_loss": 0.86210573, + "learning_rate": 2.120076673368901e-06, + "loss": 0.88092142, + "num_input_tokens_seen": 177515455, + "router_z_loss_clip": 2.88671875, + "router_z_loss_mlp": 0.42480469, + "step": 8257, + "time_per_iteration": 2.6988518238067627 + }, + { + "auxiliary_loss_clip": 0.01600637, + "auxiliary_loss_mlp": 0.00283711, + "balance_loss_clip": 1.30967891, + "balance_loss_mlp": 0.24263108, + "epoch": 0.4964978205320908, + "flos": 19500248776320.0, + "grad_norm": 11.617048483962511, + "language_loss": 0.74720919, + "learning_rate": 2.1196879119740647e-06, + "loss": 0.76605266, + "num_input_tokens_seen": 177534040, + "router_z_loss_clip": 2.90429688, + "router_z_loss_mlp": 0.41064453, + "step": 8258, + "time_per_iteration": 2.61411452293396 + }, + { + "auxiliary_loss_clip": 0.01582167, + "auxiliary_loss_mlp": 0.00264896, + "balance_loss_clip": 1.30546117, + "balance_loss_mlp": 0.22779775, + "epoch": 0.49655794378475876, + "flos": 23436524597760.0, + "grad_norm": 5.912249874854427, + "language_loss": 0.8142606, + "learning_rate": 2.1192991460406502e-06, + "loss": 0.83273125, + "num_input_tokens_seen": 177554510, + "router_z_loss_clip": 2.76367188, + "router_z_loss_mlp": 0.37084961, + "step": 8259, + "time_per_iteration": 4.059530973434448 + }, + { + "auxiliary_loss_clip": 0.01619957, + "auxiliary_loss_mlp": 0.00283752, + "balance_loss_clip": 1.3294996, + "balance_loss_mlp": 0.2417188, + "epoch": 0.4966180670374267, + "flos": 26831124345600.0, + "grad_norm": 47.9843026953094, + "language_loss": 0.84113562, + "learning_rate": 2.1189103755834e-06, + "loss": 0.86017269, + "num_input_tokens_seen": 177575780, + "router_z_loss_clip": 2.90820312, + "router_z_loss_mlp": 0.42016602, + "step": 8260, + "time_per_iteration": 2.726888656616211 + }, + { + "auxiliary_loss_clip": 0.01603288, + "auxiliary_loss_mlp": 0.00301415, + "balance_loss_clip": 1.30553341, + "balance_loss_mlp": 0.25985849, + "epoch": 0.4966781902900947, + "flos": 22009326531840.0, + "grad_norm": 11.679280702841124, + "language_loss": 0.84046453, + "learning_rate": 2.1185216006170573e-06, + "loss": 0.85951149, + "num_input_tokens_seen": 177588965, + "router_z_loss_clip": 2.97460938, + "router_z_loss_mlp": 0.4152832, + "step": 8261, + "time_per_iteration": 2.61301851272583 + }, + { + "auxiliary_loss_clip": 0.01563533, + "auxiliary_loss_mlp": 0.00276296, + "balance_loss_clip": 1.28611612, + "balance_loss_mlp": 0.24029475, + "epoch": 0.49673831354276266, + "flos": 26213353309440.0, + "grad_norm": 2.493756276218485, + "language_loss": 0.95630819, + "learning_rate": 2.1181328211563627e-06, + "loss": 0.97470653, + "num_input_tokens_seen": 177608425, + "router_z_loss_clip": 2.7734375, + "router_z_loss_mlp": 0.35986328, + "step": 8262, + "time_per_iteration": 2.7837069034576416 + }, + { + "auxiliary_loss_clip": 0.01593129, + "auxiliary_loss_mlp": 0.00288732, + "balance_loss_clip": 1.3093679, + "balance_loss_mlp": 0.24874961, + "epoch": 0.4967984367954306, + "flos": 23182277155200.0, + "grad_norm": 14.199479709157625, + "language_loss": 0.77843237, + "learning_rate": 2.11774403721606e-06, + "loss": 0.79725099, + "num_input_tokens_seen": 177628240, + "router_z_loss_clip": 2.83789062, + "router_z_loss_mlp": 0.39990234, + "step": 8263, + "time_per_iteration": 2.6469202041625977 + }, + { + "auxiliary_loss_clip": 0.01599217, + "auxiliary_loss_mlp": 0.00316982, + "balance_loss_clip": 1.30892777, + "balance_loss_mlp": 0.27502003, + "epoch": 0.4968585600480986, + "flos": 19281445079040.0, + "grad_norm": 10.321053238858216, + "language_loss": 0.77354181, + "learning_rate": 2.1173552488108923e-06, + "loss": 0.79270375, + "num_input_tokens_seen": 177645920, + "router_z_loss_clip": 2.90429688, + "router_z_loss_mlp": 0.41992188, + "step": 8264, + "time_per_iteration": 2.6227495670318604 + }, + { + "auxiliary_loss_clip": 0.01593713, + "auxiliary_loss_mlp": 0.00267589, + "balance_loss_clip": 1.29922044, + "balance_loss_mlp": 0.22722441, + "epoch": 0.49691868330076655, + "flos": 22528703237760.0, + "grad_norm": 41.574723361894776, + "language_loss": 0.72181505, + "learning_rate": 2.1169664559556007e-06, + "loss": 0.74042809, + "num_input_tokens_seen": 177667185, + "router_z_loss_clip": 2.94335938, + "router_z_loss_mlp": 0.40356445, + "step": 8265, + "time_per_iteration": 2.670355796813965 + }, + { + "auxiliary_loss_clip": 0.01159415, + "auxiliary_loss_mlp": 0.00084217, + "balance_loss_clip": 1.02558517, + "balance_loss_mlp": 0.07286798, + "epoch": 0.4969788065534345, + "flos": 66577128675840.0, + "grad_norm": 0.9004455656262217, + "language_loss": 0.53084362, + "learning_rate": 2.1165776586649304e-06, + "loss": 0.54328001, + "num_input_tokens_seen": 177733020, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.11328125, + "step": 8266, + "time_per_iteration": 3.18708872795105 + }, + { + "auxiliary_loss_clip": 0.01563941, + "auxiliary_loss_mlp": 0.00260601, + "balance_loss_clip": 1.29279387, + "balance_loss_mlp": 0.22419478, + "epoch": 0.49703892980610254, + "flos": 24059503105920.0, + "grad_norm": 27.653337532524365, + "language_loss": 0.83629662, + "learning_rate": 2.1161888569536223e-06, + "loss": 0.85454202, + "num_input_tokens_seen": 177753370, + "router_z_loss_clip": 2.71484375, + "router_z_loss_mlp": 0.36401367, + "step": 8267, + "time_per_iteration": 2.7422688007354736 + }, + { + "auxiliary_loss_clip": 0.01589773, + "auxiliary_loss_mlp": 0.0026855, + "balance_loss_clip": 1.30355525, + "balance_loss_mlp": 0.22713624, + "epoch": 0.4970990530587705, + "flos": 29126174912640.0, + "grad_norm": 2.8132490407701063, + "language_loss": 0.82731068, + "learning_rate": 2.1158000508364223e-06, + "loss": 0.84589392, + "num_input_tokens_seen": 177771530, + "router_z_loss_clip": 2.86523438, + "router_z_loss_mlp": 0.41430664, + "step": 8268, + "time_per_iteration": 2.7157413959503174 + }, + { + "auxiliary_loss_clip": 0.0159878, + "auxiliary_loss_mlp": 0.00303783, + "balance_loss_clip": 1.30790925, + "balance_loss_mlp": 0.2611776, + "epoch": 0.49715917631143847, + "flos": 46026167258880.0, + "grad_norm": 35.11957646949294, + "language_loss": 0.7335977, + "learning_rate": 2.115411240328073e-06, + "loss": 0.75262332, + "num_input_tokens_seen": 177796355, + "router_z_loss_clip": 2.91210938, + "router_z_loss_mlp": 0.42602539, + "step": 8269, + "time_per_iteration": 2.9109463691711426 + }, + { + "auxiliary_loss_clip": 0.01596746, + "auxiliary_loss_mlp": 0.00284607, + "balance_loss_clip": 1.31310987, + "balance_loss_mlp": 0.24405198, + "epoch": 0.49721929956410643, + "flos": 20191277600640.0, + "grad_norm": 48.95411155886271, + "language_loss": 0.89997101, + "learning_rate": 2.1150224254433167e-06, + "loss": 0.91878456, + "num_input_tokens_seen": 177814300, + "router_z_loss_clip": 2.83398438, + "router_z_loss_mlp": 0.40551758, + "step": 8270, + "time_per_iteration": 2.6716599464416504 + }, + { + "auxiliary_loss_clip": 0.01594805, + "auxiliary_loss_mlp": 0.00284019, + "balance_loss_clip": 1.3040036, + "balance_loss_mlp": 0.24436952, + "epoch": 0.4972794228167744, + "flos": 21653560275840.0, + "grad_norm": 13.567166219690304, + "language_loss": 0.75924313, + "learning_rate": 2.114633606196899e-06, + "loss": 0.77803135, + "num_input_tokens_seen": 177833615, + "router_z_loss_clip": 2.90625, + "router_z_loss_mlp": 0.39599609, + "step": 8271, + "time_per_iteration": 2.6862680912017822 + }, + { + "auxiliary_loss_clip": 0.01604976, + "auxiliary_loss_mlp": 0.00276021, + "balance_loss_clip": 1.31473589, + "balance_loss_mlp": 0.23606217, + "epoch": 0.49733954606944236, + "flos": 24279743347200.0, + "grad_norm": 1.8153646682645064, + "language_loss": 0.83700097, + "learning_rate": 2.1142447826035635e-06, + "loss": 0.855811, + "num_input_tokens_seen": 177855315, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.39941406, + "step": 8272, + "time_per_iteration": 2.671494722366333 + }, + { + "auxiliary_loss_clip": 0.01575636, + "auxiliary_loss_mlp": 0.00287487, + "balance_loss_clip": 1.29508054, + "balance_loss_mlp": 0.25108021, + "epoch": 0.4973996693221103, + "flos": 37852575730560.0, + "grad_norm": 8.016527427350354, + "language_loss": 0.73006046, + "learning_rate": 2.1138559546780544e-06, + "loss": 0.74869162, + "num_input_tokens_seen": 177875590, + "router_z_loss_clip": 2.8046875, + "router_z_loss_mlp": 0.36401367, + "step": 8273, + "time_per_iteration": 2.7836966514587402 + }, + { + "auxiliary_loss_clip": 0.0158045, + "auxiliary_loss_mlp": 0.00283634, + "balance_loss_clip": 1.29868126, + "balance_loss_mlp": 0.24286492, + "epoch": 0.4974597925747783, + "flos": 21361426963200.0, + "grad_norm": 2.1532204163402446, + "language_loss": 0.83492839, + "learning_rate": 2.1134671224351163e-06, + "loss": 0.85356915, + "num_input_tokens_seen": 177894175, + "router_z_loss_clip": 2.82226562, + "router_z_loss_mlp": 0.40771484, + "step": 8274, + "time_per_iteration": 2.6122419834136963 + }, + { + "auxiliary_loss_clip": 0.01584741, + "auxiliary_loss_mlp": 0.00278876, + "balance_loss_clip": 1.29280639, + "balance_loss_mlp": 0.24056223, + "epoch": 0.49751991582744626, + "flos": 30738133560960.0, + "grad_norm": 16.182635020954972, + "language_loss": 0.82240188, + "learning_rate": 2.113078285889493e-06, + "loss": 0.84103805, + "num_input_tokens_seen": 177913920, + "router_z_loss_clip": 2.92382812, + "router_z_loss_mlp": 0.38330078, + "step": 8275, + "time_per_iteration": 2.7147271633148193 + }, + { + "auxiliary_loss_clip": 0.01575819, + "auxiliary_loss_mlp": 0.00302362, + "balance_loss_clip": 1.28650033, + "balance_loss_mlp": 0.26044762, + "epoch": 0.4975800390801142, + "flos": 14100541044480.0, + "grad_norm": 57.78569877295585, + "language_loss": 0.92537934, + "learning_rate": 2.1126894450559303e-06, + "loss": 0.94416118, + "num_input_tokens_seen": 177930425, + "router_z_loss_clip": 2.890625, + "router_z_loss_mlp": 0.41894531, + "step": 8276, + "time_per_iteration": 2.641982316970825 + }, + { + "auxiliary_loss_clip": 0.01559508, + "auxiliary_loss_mlp": 0.00278191, + "balance_loss_clip": 1.28659129, + "balance_loss_mlp": 0.24235669, + "epoch": 0.4976401623327822, + "flos": 24207275658240.0, + "grad_norm": 6.674944196356156, + "language_loss": 0.75626236, + "learning_rate": 2.112300599949172e-06, + "loss": 0.77463937, + "num_input_tokens_seen": 177949885, + "router_z_loss_clip": 2.7265625, + "router_z_loss_mlp": 0.35864258, + "step": 8277, + "time_per_iteration": 2.6854665279388428 + }, + { + "auxiliary_loss_clip": 0.01563931, + "auxiliary_loss_mlp": 0.0025419, + "balance_loss_clip": 1.28657484, + "balance_loss_mlp": 0.21437439, + "epoch": 0.49770028558545015, + "flos": 21136769349120.0, + "grad_norm": 3.8601375213513713, + "language_loss": 0.88034403, + "learning_rate": 2.111911750583964e-06, + "loss": 0.89852524, + "num_input_tokens_seen": 177965720, + "router_z_loss_clip": 2.77148438, + "router_z_loss_mlp": 0.39819336, + "step": 8278, + "time_per_iteration": 2.6437125205993652 + }, + { + "auxiliary_loss_clip": 0.0157207, + "auxiliary_loss_mlp": 0.00286192, + "balance_loss_clip": 1.28927994, + "balance_loss_mlp": 0.24485055, + "epoch": 0.4977604088381181, + "flos": 16763927627520.0, + "grad_norm": 6.128711646906723, + "language_loss": 0.74527138, + "learning_rate": 2.111522896975052e-06, + "loss": 0.76385403, + "num_input_tokens_seen": 177983190, + "router_z_loss_clip": 2.828125, + "router_z_loss_mlp": 0.41333008, + "step": 8279, + "time_per_iteration": 2.60491943359375 + }, + { + "auxiliary_loss_clip": 0.01560682, + "auxiliary_loss_mlp": 0.00316142, + "balance_loss_clip": 1.28091908, + "balance_loss_mlp": 0.2755636, + "epoch": 0.49782053209078614, + "flos": 15703521292800.0, + "grad_norm": 15.8415173669579, + "language_loss": 0.77671325, + "learning_rate": 2.1111340391371794e-06, + "loss": 0.79548144, + "num_input_tokens_seen": 178000155, + "router_z_loss_clip": 2.79492188, + "router_z_loss_mlp": 0.40551758, + "step": 8280, + "time_per_iteration": 2.6097826957702637 + }, + { + "auxiliary_loss_clip": 0.01562574, + "auxiliary_loss_mlp": 0.00259332, + "balance_loss_clip": 1.28501129, + "balance_loss_mlp": 0.22314033, + "epoch": 0.4978806553434541, + "flos": 24753692327040.0, + "grad_norm": 8.93595460356632, + "language_loss": 0.70333397, + "learning_rate": 2.1107451770850936e-06, + "loss": 0.72155303, + "num_input_tokens_seen": 178021060, + "router_z_loss_clip": 2.78125, + "router_z_loss_mlp": 0.36206055, + "step": 8281, + "time_per_iteration": 2.6819510459899902 + }, + { + "auxiliary_loss_clip": 0.01558587, + "auxiliary_loss_mlp": 0.00280696, + "balance_loss_clip": 1.27950072, + "balance_loss_mlp": 0.24233469, + "epoch": 0.49794077859612207, + "flos": 13115726881920.0, + "grad_norm": 9.366564028822461, + "language_loss": 0.81392652, + "learning_rate": 2.1103563108335387e-06, + "loss": 0.83231932, + "num_input_tokens_seen": 178038180, + "router_z_loss_clip": 2.79296875, + "router_z_loss_mlp": 0.38330078, + "step": 8282, + "time_per_iteration": 2.656797170639038 + }, + { + "auxiliary_loss_clip": 0.01567042, + "auxiliary_loss_mlp": 0.00263183, + "balance_loss_clip": 1.28648138, + "balance_loss_mlp": 0.2251793, + "epoch": 0.49800090184879003, + "flos": 27525133998720.0, + "grad_norm": 8.751547627361585, + "language_loss": 0.78827572, + "learning_rate": 2.109967440397263e-06, + "loss": 0.80657804, + "num_input_tokens_seen": 178057565, + "router_z_loss_clip": 2.80664062, + "router_z_loss_mlp": 0.38012695, + "step": 8283, + "time_per_iteration": 2.7004942893981934 + }, + { + "auxiliary_loss_clip": 0.01556029, + "auxiliary_loss_mlp": 0.00269434, + "balance_loss_clip": 1.28172779, + "balance_loss_mlp": 0.231502, + "epoch": 0.498061025101458, + "flos": 19792489829760.0, + "grad_norm": 1.7736128719756854, + "language_loss": 0.86350262, + "learning_rate": 2.1095785657910095e-06, + "loss": 0.88175726, + "num_input_tokens_seen": 178076965, + "router_z_loss_clip": 2.74609375, + "router_z_loss_mlp": 0.37939453, + "step": 8284, + "time_per_iteration": 2.6614372730255127 + }, + { + "auxiliary_loss_clip": 0.01565432, + "auxiliary_loss_mlp": 0.00315812, + "balance_loss_clip": 1.28147805, + "balance_loss_mlp": 0.27325472, + "epoch": 0.49812114835412596, + "flos": 29893909230720.0, + "grad_norm": 28.10545964587103, + "language_loss": 0.79484862, + "learning_rate": 2.109189687029526e-06, + "loss": 0.81366104, + "num_input_tokens_seen": 178095105, + "router_z_loss_clip": 2.83984375, + "router_z_loss_mlp": 0.42578125, + "step": 8285, + "time_per_iteration": 2.72127628326416 + }, + { + "auxiliary_loss_clip": 0.01558471, + "auxiliary_loss_mlp": 0.00275153, + "balance_loss_clip": 1.28181982, + "balance_loss_mlp": 0.23548052, + "epoch": 0.49818127160679393, + "flos": 23147048891520.0, + "grad_norm": 558.1317371763907, + "language_loss": 0.81014919, + "learning_rate": 2.1088008041275598e-06, + "loss": 0.82848537, + "num_input_tokens_seen": 178114505, + "router_z_loss_clip": 2.765625, + "router_z_loss_mlp": 0.39672852, + "step": 8286, + "time_per_iteration": 2.8055546283721924 + }, + { + "auxiliary_loss_clip": 0.01576504, + "auxiliary_loss_mlp": 0.00281975, + "balance_loss_clip": 1.29355907, + "balance_loss_mlp": 0.24277888, + "epoch": 0.4982413948594619, + "flos": 21652806090240.0, + "grad_norm": 7.0380952892820465, + "language_loss": 0.91973245, + "learning_rate": 2.1084119170998545e-06, + "loss": 0.93831718, + "num_input_tokens_seen": 178131595, + "router_z_loss_clip": 2.83203125, + "router_z_loss_mlp": 0.39208984, + "step": 8287, + "time_per_iteration": 2.6749281883239746 + }, + { + "auxiliary_loss_clip": 0.01573651, + "auxiliary_loss_mlp": 0.0028776, + "balance_loss_clip": 1.28710878, + "balance_loss_mlp": 0.2466327, + "epoch": 0.49830151811212986, + "flos": 32486982940800.0, + "grad_norm": 23.68307006351826, + "language_loss": 0.79543626, + "learning_rate": 2.108023025961159e-06, + "loss": 0.81405032, + "num_input_tokens_seen": 178152055, + "router_z_loss_clip": 2.86523438, + "router_z_loss_mlp": 0.41137695, + "step": 8288, + "time_per_iteration": 2.7617757320404053 + }, + { + "auxiliary_loss_clip": 0.01570451, + "auxiliary_loss_mlp": 0.00290425, + "balance_loss_clip": 1.28697121, + "balance_loss_mlp": 0.24755731, + "epoch": 0.4983616413647978, + "flos": 18142358002560.0, + "grad_norm": 20.653876379602544, + "language_loss": 0.91759348, + "learning_rate": 2.10763413072622e-06, + "loss": 0.93620217, + "num_input_tokens_seen": 178168150, + "router_z_loss_clip": 2.83398438, + "router_z_loss_mlp": 0.4284668, + "step": 8289, + "time_per_iteration": 2.618659496307373 + }, + { + "auxiliary_loss_clip": 0.01563481, + "auxiliary_loss_mlp": 0.00251868, + "balance_loss_clip": 1.28634548, + "balance_loss_mlp": 0.21224239, + "epoch": 0.4984217646174658, + "flos": 19718836992000.0, + "grad_norm": 39.82132615552353, + "language_loss": 0.81791532, + "learning_rate": 2.107245231409784e-06, + "loss": 0.83606875, + "num_input_tokens_seen": 178186150, + "router_z_loss_clip": 2.7734375, + "router_z_loss_mlp": 0.39624023, + "step": 8290, + "time_per_iteration": 2.62422251701355 + }, + { + "auxiliary_loss_clip": 0.01580719, + "auxiliary_loss_mlp": 0.00277108, + "balance_loss_clip": 1.29723394, + "balance_loss_mlp": 0.23462164, + "epoch": 0.49848188787013376, + "flos": 24936549488640.0, + "grad_norm": 20.995234191672953, + "language_loss": 0.89798307, + "learning_rate": 2.106856328026598e-06, + "loss": 0.91656131, + "num_input_tokens_seen": 178207665, + "router_z_loss_clip": 2.8359375, + "router_z_loss_mlp": 0.42480469, + "step": 8291, + "time_per_iteration": 4.117675304412842 + }, + { + "auxiliary_loss_clip": 0.01550114, + "auxiliary_loss_mlp": 0.00284851, + "balance_loss_clip": 1.26992404, + "balance_loss_mlp": 0.24634685, + "epoch": 0.4985420111228017, + "flos": 22382439056640.0, + "grad_norm": 3.644418875251962, + "language_loss": 0.7504428, + "learning_rate": 2.106467420591409e-06, + "loss": 0.76879251, + "num_input_tokens_seen": 178226325, + "router_z_loss_clip": 2.80078125, + "router_z_loss_mlp": 0.38500977, + "step": 8292, + "time_per_iteration": 2.6323609352111816 + }, + { + "auxiliary_loss_clip": 0.01558631, + "auxiliary_loss_mlp": 0.00296159, + "balance_loss_clip": 1.28238583, + "balance_loss_mlp": 0.25515068, + "epoch": 0.4986021343754697, + "flos": 16216469464320.0, + "grad_norm": 6.896034325643645, + "language_loss": 0.74606562, + "learning_rate": 2.106078509118965e-06, + "loss": 0.76461345, + "num_input_tokens_seen": 178244960, + "router_z_loss_clip": 2.76171875, + "router_z_loss_mlp": 0.41015625, + "step": 8293, + "time_per_iteration": 2.6407389640808105 + }, + { + "auxiliary_loss_clip": 0.01544311, + "auxiliary_loss_mlp": 0.00290387, + "balance_loss_clip": 1.26656079, + "balance_loss_mlp": 0.24835409, + "epoch": 0.4986622576281377, + "flos": 23403594804480.0, + "grad_norm": 1133.9241831978209, + "language_loss": 0.90293145, + "learning_rate": 2.1056895936240133e-06, + "loss": 0.92127848, + "num_input_tokens_seen": 178265400, + "router_z_loss_clip": 2.77734375, + "router_z_loss_mlp": 0.42016602, + "step": 8294, + "time_per_iteration": 4.121809005737305 + }, + { + "auxiliary_loss_clip": 0.01549266, + "auxiliary_loss_mlp": 0.00260062, + "balance_loss_clip": 1.27572775, + "balance_loss_mlp": 0.22050798, + "epoch": 0.49872238088080567, + "flos": 19974556892160.0, + "grad_norm": 11.067786763372983, + "language_loss": 0.80127347, + "learning_rate": 2.1053006741213016e-06, + "loss": 0.81936669, + "num_input_tokens_seen": 178284535, + "router_z_loss_clip": 2.73632812, + "router_z_loss_mlp": 0.39526367, + "step": 8295, + "time_per_iteration": 2.628880262374878 + }, + { + "auxiliary_loss_clip": 0.01535646, + "auxiliary_loss_mlp": 0.00255477, + "balance_loss_clip": 1.26713192, + "balance_loss_mlp": 0.21794973, + "epoch": 0.49878250413347364, + "flos": 22893016930560.0, + "grad_norm": 5.324015311635045, + "language_loss": 0.75591099, + "learning_rate": 2.1049117506255775e-06, + "loss": 0.77382219, + "num_input_tokens_seen": 178302425, + "router_z_loss_clip": 2.68554688, + "router_z_loss_mlp": 0.37548828, + "step": 8296, + "time_per_iteration": 3.9921317100524902 + }, + { + "auxiliary_loss_clip": 0.01550573, + "auxiliary_loss_mlp": 0.00265894, + "balance_loss_clip": 1.26718688, + "balance_loss_mlp": 0.22574462, + "epoch": 0.4988426273861416, + "flos": 32598449821440.0, + "grad_norm": 3.143093059184644, + "language_loss": 0.72897172, + "learning_rate": 2.1045228231515895e-06, + "loss": 0.74713641, + "num_input_tokens_seen": 178323065, + "router_z_loss_clip": 2.83398438, + "router_z_loss_mlp": 0.40136719, + "step": 8297, + "time_per_iteration": 2.7317750453948975 + }, + { + "auxiliary_loss_clip": 0.0154182, + "auxiliary_loss_mlp": 0.00260841, + "balance_loss_clip": 1.27842045, + "balance_loss_mlp": 0.22412413, + "epoch": 0.49890275063880957, + "flos": 20923604087040.0, + "grad_norm": 5.111270963256113, + "language_loss": 0.76128674, + "learning_rate": 2.1041338917140857e-06, + "loss": 0.77931333, + "num_input_tokens_seen": 178343985, + "router_z_loss_clip": 2.63476562, + "router_z_loss_mlp": 0.36743164, + "step": 8298, + "time_per_iteration": 2.6905357837677 + }, + { + "auxiliary_loss_clip": 0.01551956, + "auxiliary_loss_mlp": 0.00277347, + "balance_loss_clip": 1.28239906, + "balance_loss_mlp": 0.23869902, + "epoch": 0.49896287389147753, + "flos": 18624459369600.0, + "grad_norm": 12.85411277683915, + "language_loss": 0.9264462, + "learning_rate": 2.103744956327814e-06, + "loss": 0.94473922, + "num_input_tokens_seen": 178362345, + "router_z_loss_clip": 2.6953125, + "router_z_loss_mlp": 0.38647461, + "step": 8299, + "time_per_iteration": 2.720627784729004 + }, + { + "auxiliary_loss_clip": 0.0156199, + "auxiliary_loss_mlp": 0.00261309, + "balance_loss_clip": 1.28095436, + "balance_loss_mlp": 0.22321013, + "epoch": 0.4990229971441455, + "flos": 24826555065600.0, + "grad_norm": 3.318187111837297, + "language_loss": 0.76271665, + "learning_rate": 2.1033560170075234e-06, + "loss": 0.78094971, + "num_input_tokens_seen": 178383190, + "router_z_loss_clip": 2.80664062, + "router_z_loss_mlp": 0.38110352, + "step": 8300, + "time_per_iteration": 2.672309637069702 + }, + { + "auxiliary_loss_clip": 0.013135, + "auxiliary_loss_mlp": 0.00050078, + "balance_loss_clip": 1.16644883, + "balance_loss_mlp": 0.03839538, + "epoch": 0.49908312039681346, + "flos": 71384525136000.0, + "grad_norm": 0.7534611225038539, + "language_loss": 0.50898254, + "learning_rate": 2.1029670737679623e-06, + "loss": 0.52261829, + "num_input_tokens_seen": 178444250, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.11669922, + "step": 8301, + "time_per_iteration": 4.652975559234619 + }, + { + "auxiliary_loss_clip": 0.01544301, + "auxiliary_loss_mlp": 0.00253207, + "balance_loss_clip": 1.27823305, + "balance_loss_mlp": 0.21663322, + "epoch": 0.4991432436494814, + "flos": 19828651847040.0, + "grad_norm": 13.661283215860301, + "language_loss": 0.91914904, + "learning_rate": 2.102578126623879e-06, + "loss": 0.93712413, + "num_input_tokens_seen": 178463250, + "router_z_loss_clip": 2.66015625, + "router_z_loss_mlp": 0.36572266, + "step": 8302, + "time_per_iteration": 2.633612632751465 + }, + { + "auxiliary_loss_clip": 0.01554698, + "auxiliary_loss_mlp": 0.00255894, + "balance_loss_clip": 1.28522396, + "balance_loss_mlp": 0.22022676, + "epoch": 0.4992033669021494, + "flos": 15121912273920.0, + "grad_norm": 9.310815278704329, + "language_loss": 0.7700659, + "learning_rate": 2.102189175590024e-06, + "loss": 0.78817177, + "num_input_tokens_seen": 178481340, + "router_z_loss_clip": 2.69335938, + "router_z_loss_mlp": 0.35668945, + "step": 8303, + "time_per_iteration": 2.6400959491729736 + }, + { + "auxiliary_loss_clip": 0.01554025, + "auxiliary_loss_mlp": 0.00304335, + "balance_loss_clip": 1.27870989, + "balance_loss_mlp": 0.26635444, + "epoch": 0.49926349015481736, + "flos": 31207952476800.0, + "grad_norm": 49.48104176862393, + "language_loss": 0.77327991, + "learning_rate": 2.101800220681144e-06, + "loss": 0.79186356, + "num_input_tokens_seen": 178501545, + "router_z_loss_clip": 2.75390625, + "router_z_loss_mlp": 0.38012695, + "step": 8304, + "time_per_iteration": 2.755831003189087 + }, + { + "auxiliary_loss_clip": 0.01552649, + "auxiliary_loss_mlp": 0.00273684, + "balance_loss_clip": 1.28285098, + "balance_loss_mlp": 0.23572783, + "epoch": 0.4993236134074853, + "flos": 24900207903360.0, + "grad_norm": 5.701512692202346, + "language_loss": 0.89880586, + "learning_rate": 2.10141126191199e-06, + "loss": 0.9170692, + "num_input_tokens_seen": 178519700, + "router_z_loss_clip": 2.69726562, + "router_z_loss_mlp": 0.37963867, + "step": 8305, + "time_per_iteration": 2.669471502304077 + }, + { + "auxiliary_loss_clip": 0.0131389, + "auxiliary_loss_mlp": 0.00031649, + "balance_loss_clip": 1.16659403, + "balance_loss_mlp": 0.02297036, + "epoch": 0.4993837366601533, + "flos": 70420573797120.0, + "grad_norm": 0.7132946082690106, + "language_loss": 0.56840104, + "learning_rate": 2.1010222992973107e-06, + "loss": 0.58185643, + "num_input_tokens_seen": 178576740, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.08691406, + "step": 8306, + "time_per_iteration": 3.2620975971221924 + }, + { + "auxiliary_loss_clip": 0.015464, + "auxiliary_loss_mlp": 0.00271105, + "balance_loss_clip": 1.27593565, + "balance_loss_mlp": 0.23262404, + "epoch": 0.4994438599128213, + "flos": 15961216440960.0, + "grad_norm": 2.9861251596144074, + "language_loss": 0.88417953, + "learning_rate": 2.1006333328518556e-06, + "loss": 0.90235454, + "num_input_tokens_seen": 178594745, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.38476562, + "step": 8307, + "time_per_iteration": 2.653660297393799 + }, + { + "auxiliary_loss_clip": 0.01536658, + "auxiliary_loss_mlp": 0.00258016, + "balance_loss_clip": 1.27240562, + "balance_loss_mlp": 0.22237217, + "epoch": 0.4995039831654893, + "flos": 27928303228800.0, + "grad_norm": 83.26886751674232, + "language_loss": 0.68718207, + "learning_rate": 2.1002443625903748e-06, + "loss": 0.70512879, + "num_input_tokens_seen": 178614110, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.35620117, + "step": 8308, + "time_per_iteration": 2.7902040481567383 + }, + { + "auxiliary_loss_clip": 0.01522353, + "auxiliary_loss_mlp": 0.00270639, + "balance_loss_clip": 1.26183522, + "balance_loss_mlp": 0.23568663, + "epoch": 0.49956410641815724, + "flos": 24204797619840.0, + "grad_norm": 5.286821443666597, + "language_loss": 0.80296683, + "learning_rate": 2.0998553885276168e-06, + "loss": 0.82089674, + "num_input_tokens_seen": 178634170, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.34960938, + "step": 8309, + "time_per_iteration": 2.738373041152954 + }, + { + "auxiliary_loss_clip": 0.01536473, + "auxiliary_loss_mlp": 0.00271014, + "balance_loss_clip": 1.26427412, + "balance_loss_mlp": 0.23517923, + "epoch": 0.4996242296708252, + "flos": 16180127879040.0, + "grad_norm": 636.0163792387274, + "language_loss": 0.86780024, + "learning_rate": 2.0994664106783335e-06, + "loss": 0.88587511, + "num_input_tokens_seen": 178651775, + "router_z_loss_clip": 2.71875, + "router_z_loss_mlp": 0.3581543, + "step": 8310, + "time_per_iteration": 2.6469452381134033 + }, + { + "auxiliary_loss_clip": 0.01537471, + "auxiliary_loss_mlp": 0.00297814, + "balance_loss_clip": 1.26424956, + "balance_loss_mlp": 0.26100242, + "epoch": 0.49968435292349317, + "flos": 16873527000960.0, + "grad_norm": 376.02021595260595, + "language_loss": 0.77926946, + "learning_rate": 2.0990774290572735e-06, + "loss": 0.79762232, + "num_input_tokens_seen": 178669720, + "router_z_loss_clip": 2.73046875, + "router_z_loss_mlp": 0.36816406, + "step": 8311, + "time_per_iteration": 2.6617627143859863 + }, + { + "auxiliary_loss_clip": 0.01525653, + "auxiliary_loss_mlp": 0.00274138, + "balance_loss_clip": 1.26270652, + "balance_loss_mlp": 0.23856601, + "epoch": 0.49974447617616113, + "flos": 14939521989120.0, + "grad_norm": 476.76245085127596, + "language_loss": 0.83990484, + "learning_rate": 2.098688443679187e-06, + "loss": 0.85790277, + "num_input_tokens_seen": 178686765, + "router_z_loss_clip": 2.62695312, + "router_z_loss_mlp": 0.35546875, + "step": 8312, + "time_per_iteration": 2.6910252571105957 + }, + { + "auxiliary_loss_clip": 0.01536785, + "auxiliary_loss_mlp": 0.00307042, + "balance_loss_clip": 1.26656592, + "balance_loss_mlp": 0.2713981, + "epoch": 0.4998045994288291, + "flos": 26651535321600.0, + "grad_norm": 2.6480471378595847, + "language_loss": 0.91475868, + "learning_rate": 2.0982994545588256e-06, + "loss": 0.93319696, + "num_input_tokens_seen": 178705845, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.35644531, + "step": 8313, + "time_per_iteration": 2.7459142208099365 + }, + { + "auxiliary_loss_clip": 0.01522987, + "auxiliary_loss_mlp": 0.00298754, + "balance_loss_clip": 1.25658691, + "balance_loss_mlp": 0.2627764, + "epoch": 0.49986472268149706, + "flos": 20953768533120.0, + "grad_norm": 4.712286776072296, + "language_loss": 0.8662343, + "learning_rate": 2.097910461710939e-06, + "loss": 0.88445169, + "num_input_tokens_seen": 178723410, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.35961914, + "step": 8314, + "time_per_iteration": 2.654977321624756 + }, + { + "auxiliary_loss_clip": 0.01505959, + "auxiliary_loss_mlp": 0.00275816, + "balance_loss_clip": 1.23907399, + "balance_loss_mlp": 0.23940918, + "epoch": 0.49992484593416503, + "flos": 22783884433920.0, + "grad_norm": 87.0736697185339, + "language_loss": 0.8599745, + "learning_rate": 2.0975214651502773e-06, + "loss": 0.87779224, + "num_input_tokens_seen": 178743560, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.36425781, + "step": 8315, + "time_per_iteration": 2.6770336627960205 + }, + { + "auxiliary_loss_clip": 0.01509395, + "auxiliary_loss_mlp": 0.00268156, + "balance_loss_clip": 1.25151098, + "balance_loss_mlp": 0.23442024, + "epoch": 0.499984969186833, + "flos": 46786970252160.0, + "grad_norm": 44.045049164919554, + "language_loss": 0.79938495, + "learning_rate": 2.0971324648915926e-06, + "loss": 0.81716049, + "num_input_tokens_seen": 178767225, + "router_z_loss_clip": 2.578125, + "router_z_loss_mlp": 0.33740234, + "step": 8316, + "time_per_iteration": 2.881390333175659 + }, + { + "auxiliary_loss_clip": 0.01515469, + "auxiliary_loss_mlp": 0.0028299, + "balance_loss_clip": 1.25614893, + "balance_loss_mlp": 0.25051743, + "epoch": 0.500045092439501, + "flos": 25556978131200.0, + "grad_norm": 12.585700615187667, + "language_loss": 0.8680687, + "learning_rate": 2.0967434609496343e-06, + "loss": 0.8860532, + "num_input_tokens_seen": 178786810, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.32470703, + "step": 8317, + "time_per_iteration": 2.666767120361328 + }, + { + "auxiliary_loss_clip": 0.01495067, + "auxiliary_loss_mlp": 0.00277528, + "balance_loss_clip": 1.23169398, + "balance_loss_mlp": 0.24331445, + "epoch": 0.5001052156921689, + "flos": 20704764476160.0, + "grad_norm": 20.74793044744049, + "language_loss": 0.89145911, + "learning_rate": 2.0963544533391548e-06, + "loss": 0.90918505, + "num_input_tokens_seen": 178805660, + "router_z_loss_clip": 2.6328125, + "router_z_loss_mlp": 0.34204102, + "step": 8318, + "time_per_iteration": 2.653527021408081 + }, + { + "auxiliary_loss_clip": 0.01501112, + "auxiliary_loss_mlp": 0.00258561, + "balance_loss_clip": 1.23977971, + "balance_loss_mlp": 0.22441971, + "epoch": 0.500165338944837, + "flos": 21251109317760.0, + "grad_norm": 181.2066553924834, + "language_loss": 0.88603467, + "learning_rate": 2.0959654420749045e-06, + "loss": 0.90363145, + "num_input_tokens_seen": 178824780, + "router_z_loss_clip": 2.61523438, + "router_z_loss_mlp": 0.34106445, + "step": 8319, + "time_per_iteration": 2.6545042991638184 + }, + { + "auxiliary_loss_clip": 0.01475184, + "auxiliary_loss_mlp": 0.00291432, + "balance_loss_clip": 1.22724199, + "balance_loss_mlp": 0.25874481, + "epoch": 0.5002254621975049, + "flos": 27854398995840.0, + "grad_norm": 7.173321456236692, + "language_loss": 0.78335166, + "learning_rate": 2.095576427171635e-06, + "loss": 0.80101782, + "num_input_tokens_seen": 178845640, + "router_z_loss_clip": 2.4765625, + "router_z_loss_mlp": 0.3269043, + "step": 8320, + "time_per_iteration": 2.7205400466918945 + }, + { + "auxiliary_loss_clip": 0.01506738, + "auxiliary_loss_mlp": 0.00316103, + "balance_loss_clip": 1.23859668, + "balance_loss_mlp": 0.2780987, + "epoch": 0.5002855854501729, + "flos": 15551941898880.0, + "grad_norm": 25.38293760669924, + "language_loss": 0.85190326, + "learning_rate": 2.0951874086440978e-06, + "loss": 0.87013173, + "num_input_tokens_seen": 178862290, + "router_z_loss_clip": 2.67773438, + "router_z_loss_mlp": 0.38037109, + "step": 8321, + "time_per_iteration": 2.5990335941314697 + }, + { + "auxiliary_loss_clip": 0.01484319, + "auxiliary_loss_mlp": 0.00271935, + "balance_loss_clip": 1.22425401, + "balance_loss_mlp": 0.23841313, + "epoch": 0.5003457087028408, + "flos": 16107408794880.0, + "grad_norm": 7.896609248323216, + "language_loss": 0.89831048, + "learning_rate": 2.0947983865070455e-06, + "loss": 0.91587305, + "num_input_tokens_seen": 178879805, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.33544922, + "step": 8322, + "time_per_iteration": 2.623309850692749 + }, + { + "auxiliary_loss_clip": 0.01499309, + "auxiliary_loss_mlp": 0.00265344, + "balance_loss_clip": 1.24115407, + "balance_loss_mlp": 0.23318155, + "epoch": 0.5004058319555088, + "flos": 22710518904960.0, + "grad_norm": 23.73440893705101, + "language_loss": 0.83153778, + "learning_rate": 2.094409360775228e-06, + "loss": 0.84918433, + "num_input_tokens_seen": 178896985, + "router_z_loss_clip": 2.5859375, + "router_z_loss_mlp": 0.3215332, + "step": 8323, + "time_per_iteration": 2.7042691707611084 + }, + { + "auxiliary_loss_clip": 0.01476556, + "auxiliary_loss_mlp": 0.00284722, + "balance_loss_clip": 1.22084987, + "balance_loss_mlp": 0.25165376, + "epoch": 0.5004659552081767, + "flos": 30117956313600.0, + "grad_norm": 8.830156648644511, + "language_loss": 0.7493493, + "learning_rate": 2.0940203314633977e-06, + "loss": 0.76696205, + "num_input_tokens_seen": 178920605, + "router_z_loss_clip": 2.55664062, + "router_z_loss_mlp": 0.33081055, + "step": 8324, + "time_per_iteration": 2.748060703277588 + }, + { + "auxiliary_loss_clip": 0.01481999, + "auxiliary_loss_mlp": 0.00266509, + "balance_loss_clip": 1.22466075, + "balance_loss_mlp": 0.23396532, + "epoch": 0.5005260784608447, + "flos": 18624710764800.0, + "grad_norm": 58.33306761231406, + "language_loss": 0.79470181, + "learning_rate": 2.0936312985863077e-06, + "loss": 0.81218684, + "num_input_tokens_seen": 178937760, + "router_z_loss_clip": 2.57421875, + "router_z_loss_mlp": 0.32568359, + "step": 8325, + "time_per_iteration": 2.6344377994537354 + }, + { + "auxiliary_loss_clip": 0.01482858, + "auxiliary_loss_mlp": 0.00286532, + "balance_loss_clip": 1.22329187, + "balance_loss_mlp": 0.2491717, + "epoch": 0.5005862017135126, + "flos": 24859987649280.0, + "grad_norm": 12.739242541349977, + "language_loss": 0.79311162, + "learning_rate": 2.093242262158709e-06, + "loss": 0.8108055, + "num_input_tokens_seen": 178957985, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.37353516, + "step": 8326, + "time_per_iteration": 2.694550037384033 + }, + { + "auxiliary_loss_clip": 0.01454003, + "auxiliary_loss_mlp": 0.00259559, + "balance_loss_clip": 1.20804393, + "balance_loss_mlp": 0.22622773, + "epoch": 0.5006463249661807, + "flos": 18734381965440.0, + "grad_norm": 42.668106256404506, + "language_loss": 0.83476615, + "learning_rate": 2.0928532221953544e-06, + "loss": 0.85190177, + "num_input_tokens_seen": 178977070, + "router_z_loss_clip": 2.4609375, + "router_z_loss_mlp": 0.33325195, + "step": 8327, + "time_per_iteration": 2.63102126121521 + }, + { + "auxiliary_loss_clip": 0.01479456, + "auxiliary_loss_mlp": 0.00261916, + "balance_loss_clip": 1.22080278, + "balance_loss_mlp": 0.22853786, + "epoch": 0.5007064482188487, + "flos": 13042145871360.0, + "grad_norm": 20.449983269550177, + "language_loss": 0.94838834, + "learning_rate": 2.092464178710997e-06, + "loss": 0.96580207, + "num_input_tokens_seen": 178994175, + "router_z_loss_clip": 2.58789062, + "router_z_loss_mlp": 0.33398438, + "step": 8328, + "time_per_iteration": 2.647000551223755 + }, + { + "auxiliary_loss_clip": 0.01479471, + "auxiliary_loss_mlp": 0.00286864, + "balance_loss_clip": 1.21741557, + "balance_loss_mlp": 0.25348586, + "epoch": 0.5007665714715166, + "flos": 21288671965440.0, + "grad_norm": 12.713745288024715, + "language_loss": 0.81151652, + "learning_rate": 2.092075131720388e-06, + "loss": 0.82917988, + "num_input_tokens_seen": 179013710, + "router_z_loss_clip": 2.62304688, + "router_z_loss_mlp": 0.33374023, + "step": 8329, + "time_per_iteration": 2.6821954250335693 + }, + { + "auxiliary_loss_clip": 0.0147274, + "auxiliary_loss_mlp": 0.00263914, + "balance_loss_clip": 1.2178129, + "balance_loss_mlp": 0.23046367, + "epoch": 0.5008266947241846, + "flos": 29754576374400.0, + "grad_norm": 13.934878913026669, + "language_loss": 0.85048699, + "learning_rate": 2.091686081238281e-06, + "loss": 0.86785352, + "num_input_tokens_seen": 179035255, + "router_z_loss_clip": 2.55078125, + "router_z_loss_mlp": 0.33447266, + "step": 8330, + "time_per_iteration": 2.772576332092285 + }, + { + "auxiliary_loss_clip": 0.01226107, + "auxiliary_loss_mlp": 0.00055488, + "balance_loss_clip": 1.08714938, + "balance_loss_mlp": 0.04862158, + "epoch": 0.5008868179768525, + "flos": 63557829204480.0, + "grad_norm": 0.712579330952562, + "language_loss": 0.55603564, + "learning_rate": 2.0912970272794282e-06, + "loss": 0.56885159, + "num_input_tokens_seen": 179090915, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.06884766, + "step": 8331, + "time_per_iteration": 2.9457015991210938 + }, + { + "auxiliary_loss_clip": 0.0145444, + "auxiliary_loss_mlp": 0.0027823, + "balance_loss_clip": 1.20095646, + "balance_loss_mlp": 0.24508986, + "epoch": 0.5009469412295205, + "flos": 27375637593600.0, + "grad_norm": 155.7590845590233, + "language_loss": 0.70471287, + "learning_rate": 2.0909079698585833e-06, + "loss": 0.72203952, + "num_input_tokens_seen": 179109160, + "router_z_loss_clip": 2.53515625, + "router_z_loss_mlp": 0.33154297, + "step": 8332, + "time_per_iteration": 2.7539167404174805 + }, + { + "auxiliary_loss_clip": 0.01457157, + "auxiliary_loss_mlp": 0.0027981, + "balance_loss_clip": 1.20517111, + "balance_loss_mlp": 0.24889892, + "epoch": 0.5010070644821885, + "flos": 27378833904000.0, + "grad_norm": 10.080759065047047, + "language_loss": 0.80818623, + "learning_rate": 2.0905189089904993e-06, + "loss": 0.82555592, + "num_input_tokens_seen": 179130610, + "router_z_loss_clip": 2.51953125, + "router_z_loss_mlp": 0.30895996, + "step": 8333, + "time_per_iteration": 4.14194917678833 + }, + { + "auxiliary_loss_clip": 0.01471103, + "auxiliary_loss_mlp": 0.00303911, + "balance_loss_clip": 1.2133584, + "balance_loss_mlp": 0.27224934, + "epoch": 0.5010671877348565, + "flos": 20662748542080.0, + "grad_norm": 11.10512071954758, + "language_loss": 0.85264301, + "learning_rate": 2.090129844689929e-06, + "loss": 0.87039316, + "num_input_tokens_seen": 179147860, + "router_z_loss_clip": 2.58007812, + "router_z_loss_mlp": 0.31665039, + "step": 8334, + "time_per_iteration": 2.6392858028411865 + }, + { + "auxiliary_loss_clip": 0.01206012, + "auxiliary_loss_mlp": 0.00049304, + "balance_loss_clip": 1.06585586, + "balance_loss_mlp": 0.04162645, + "epoch": 0.5011273109875244, + "flos": 59128645000320.0, + "grad_norm": 0.8870809437483181, + "language_loss": 0.6232866, + "learning_rate": 2.089740776971626e-06, + "loss": 0.63583976, + "num_input_tokens_seen": 179210490, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.07666016, + "step": 8335, + "time_per_iteration": 3.0867512226104736 + }, + { + "auxiliary_loss_clip": 0.01443238, + "auxiliary_loss_mlp": 0.00298873, + "balance_loss_clip": 1.19701862, + "balance_loss_mlp": 0.26685357, + "epoch": 0.5011874342401924, + "flos": 25336342840320.0, + "grad_norm": 5.14581752502151, + "language_loss": 0.83946002, + "learning_rate": 2.0893517058503435e-06, + "loss": 0.85688114, + "num_input_tokens_seen": 179231360, + "router_z_loss_clip": 2.46289062, + "router_z_loss_mlp": 0.32006836, + "step": 8336, + "time_per_iteration": 4.1414759159088135 + }, + { + "auxiliary_loss_clip": 0.01444426, + "auxiliary_loss_mlp": 0.00269448, + "balance_loss_clip": 1.19411969, + "balance_loss_mlp": 0.23821557, + "epoch": 0.5012475574928603, + "flos": 20229953569920.0, + "grad_norm": 11.530746077821638, + "language_loss": 0.87506306, + "learning_rate": 2.088962631340836e-06, + "loss": 0.89220178, + "num_input_tokens_seen": 179250625, + "router_z_loss_clip": 2.5078125, + "router_z_loss_mlp": 0.3125, + "step": 8337, + "time_per_iteration": 2.6344826221466064 + }, + { + "auxiliary_loss_clip": 0.01460715, + "auxiliary_loss_mlp": 0.00277988, + "balance_loss_clip": 1.19999552, + "balance_loss_mlp": 0.24608777, + "epoch": 0.5013076807455283, + "flos": 22710123855360.0, + "grad_norm": 4.393125940812073, + "language_loss": 0.89364803, + "learning_rate": 2.0885735534578555e-06, + "loss": 0.91103506, + "num_input_tokens_seen": 179267360, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.3190918, + "step": 8338, + "time_per_iteration": 4.031171560287476 + }, + { + "auxiliary_loss_clip": 0.01439608, + "auxiliary_loss_mlp": 0.00290389, + "balance_loss_clip": 1.18724751, + "balance_loss_mlp": 0.25627089, + "epoch": 0.5013678039981962, + "flos": 24245161528320.0, + "grad_norm": 4.067965246588167, + "language_loss": 0.8942591, + "learning_rate": 2.0881844722161583e-06, + "loss": 0.9115591, + "num_input_tokens_seen": 179289810, + "router_z_loss_clip": 2.52148438, + "router_z_loss_mlp": 0.34130859, + "step": 8339, + "time_per_iteration": 2.6974287033081055 + }, + { + "auxiliary_loss_clip": 0.01454003, + "auxiliary_loss_mlp": 0.00276301, + "balance_loss_clip": 1.20240784, + "balance_loss_mlp": 0.24287513, + "epoch": 0.5014279272508643, + "flos": 26176688501760.0, + "grad_norm": 35.51987330234699, + "language_loss": 0.7763015, + "learning_rate": 2.0877953876304962e-06, + "loss": 0.79360455, + "num_input_tokens_seen": 179310620, + "router_z_loss_clip": 2.51953125, + "router_z_loss_mlp": 0.33422852, + "step": 8340, + "time_per_iteration": 2.6866447925567627 + }, + { + "auxiliary_loss_clip": 0.01457896, + "auxiliary_loss_mlp": 0.00297357, + "balance_loss_clip": 1.20183468, + "balance_loss_mlp": 0.26340666, + "epoch": 0.5014880505035323, + "flos": 21430446946560.0, + "grad_norm": 6.4527188049597015, + "language_loss": 0.85543227, + "learning_rate": 2.0874062997156245e-06, + "loss": 0.87298477, + "num_input_tokens_seen": 179329005, + "router_z_loss_clip": 2.55859375, + "router_z_loss_mlp": 0.33959961, + "step": 8341, + "time_per_iteration": 2.6983139514923096 + }, + { + "auxiliary_loss_clip": 0.0145816, + "auxiliary_loss_mlp": 0.00297727, + "balance_loss_clip": 1.19849324, + "balance_loss_mlp": 0.26492095, + "epoch": 0.5015481737562002, + "flos": 15770745596160.0, + "grad_norm": 7.01104753151254, + "language_loss": 0.97599179, + "learning_rate": 2.0870172084862975e-06, + "loss": 0.99355072, + "num_input_tokens_seen": 179343785, + "router_z_loss_clip": 2.59960938, + "router_z_loss_mlp": 0.32763672, + "step": 8342, + "time_per_iteration": 2.5808353424072266 + }, + { + "auxiliary_loss_clip": 0.0144374, + "auxiliary_loss_mlp": 0.00283185, + "balance_loss_clip": 1.1908114, + "balance_loss_mlp": 0.24994969, + "epoch": 0.5016082970088682, + "flos": 26830801123200.0, + "grad_norm": 11.606428044103685, + "language_loss": 0.82482362, + "learning_rate": 2.0866281139572682e-06, + "loss": 0.84209287, + "num_input_tokens_seen": 179364070, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.33190918, + "step": 8343, + "time_per_iteration": 4.110390901565552 + }, + { + "auxiliary_loss_clip": 0.01444448, + "auxiliary_loss_mlp": 0.0027097, + "balance_loss_clip": 1.19871497, + "balance_loss_mlp": 0.23954621, + "epoch": 0.5016684202615361, + "flos": 21470595373440.0, + "grad_norm": 3.092284262478727, + "language_loss": 0.74420977, + "learning_rate": 2.086239016143293e-06, + "loss": 0.76136398, + "num_input_tokens_seen": 179384225, + "router_z_loss_clip": 2.45898438, + "router_z_loss_mlp": 0.31445312, + "step": 8344, + "time_per_iteration": 2.6959409713745117 + }, + { + "auxiliary_loss_clip": 0.01455787, + "auxiliary_loss_mlp": 0.00275591, + "balance_loss_clip": 1.20144057, + "balance_loss_mlp": 0.24268943, + "epoch": 0.5017285435142042, + "flos": 26246821806720.0, + "grad_norm": 11.107452437986751, + "language_loss": 0.80935889, + "learning_rate": 2.0858499150591258e-06, + "loss": 0.82667267, + "num_input_tokens_seen": 179402595, + "router_z_loss_clip": 2.54296875, + "router_z_loss_mlp": 0.32885742, + "step": 8345, + "time_per_iteration": 2.694019079208374 + }, + { + "auxiliary_loss_clip": 0.01441796, + "auxiliary_loss_mlp": 0.00300582, + "balance_loss_clip": 1.19274426, + "balance_loss_mlp": 0.26751363, + "epoch": 0.5017886667668721, + "flos": 20777555387520.0, + "grad_norm": 431.0842443627243, + "language_loss": 0.84663343, + "learning_rate": 2.0854608107195203e-06, + "loss": 0.86405724, + "num_input_tokens_seen": 179419635, + "router_z_loss_clip": 2.4921875, + "router_z_loss_mlp": 0.33105469, + "step": 8346, + "time_per_iteration": 2.681772470474243 + }, + { + "auxiliary_loss_clip": 0.01439133, + "auxiliary_loss_mlp": 0.00266835, + "balance_loss_clip": 1.18647361, + "balance_loss_mlp": 0.23479134, + "epoch": 0.5018487900195401, + "flos": 20156408472960.0, + "grad_norm": 5.065189931772412, + "language_loss": 0.75553739, + "learning_rate": 2.0850717031392333e-06, + "loss": 0.77259707, + "num_input_tokens_seen": 179438770, + "router_z_loss_clip": 2.52734375, + "router_z_loss_mlp": 0.32055664, + "step": 8347, + "time_per_iteration": 2.641700506210327 + }, + { + "auxiliary_loss_clip": 0.01452719, + "auxiliary_loss_mlp": 0.0028615, + "balance_loss_clip": 1.1977216, + "balance_loss_mlp": 0.25306955, + "epoch": 0.501908913272208, + "flos": 18150689957760.0, + "grad_norm": 22.84304935680298, + "language_loss": 0.79019868, + "learning_rate": 2.0846825923330174e-06, + "loss": 0.80758739, + "num_input_tokens_seen": 179457475, + "router_z_loss_clip": 2.54492188, + "router_z_loss_mlp": 0.33093262, + "step": 8348, + "time_per_iteration": 2.711578845977783 + }, + { + "auxiliary_loss_clip": 0.01438472, + "auxiliary_loss_mlp": 0.00258697, + "balance_loss_clip": 1.19182217, + "balance_loss_mlp": 0.2267492, + "epoch": 0.501969036524876, + "flos": 23112287504640.0, + "grad_norm": 819.0956907181119, + "language_loss": 0.79558796, + "learning_rate": 2.0842934783156303e-06, + "loss": 0.8125596, + "num_input_tokens_seen": 179478140, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.3190918, + "step": 8349, + "time_per_iteration": 2.667661190032959 + }, + { + "auxiliary_loss_clip": 0.01458528, + "auxiliary_loss_mlp": 0.00289963, + "balance_loss_clip": 1.19803953, + "balance_loss_mlp": 0.25627461, + "epoch": 0.5020291597775439, + "flos": 11363214314880.0, + "grad_norm": 8.94591119113137, + "language_loss": 0.73775196, + "learning_rate": 2.0839043611018266e-06, + "loss": 0.75523686, + "num_input_tokens_seen": 179494325, + "router_z_loss_clip": 2.60742188, + "router_z_loss_mlp": 0.3371582, + "step": 8350, + "time_per_iteration": 2.6572978496551514 + }, + { + "auxiliary_loss_clip": 0.01162616, + "auxiliary_loss_mlp": 0.00111571, + "balance_loss_clip": 1.02634883, + "balance_loss_mlp": 0.10484753, + "epoch": 0.5020892830302119, + "flos": 64011094928640.0, + "grad_norm": 0.81392388749973, + "language_loss": 0.59552813, + "learning_rate": 2.0835152407063597e-06, + "loss": 0.60826993, + "num_input_tokens_seen": 179553545, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.06738281, + "step": 8351, + "time_per_iteration": 3.2867138385772705 + }, + { + "auxiliary_loss_clip": 0.01424509, + "auxiliary_loss_mlp": 0.0028604, + "balance_loss_clip": 1.1771431, + "balance_loss_mlp": 0.2535198, + "epoch": 0.5021494062828799, + "flos": 23732859801600.0, + "grad_norm": 9.02671095886992, + "language_loss": 0.8137697, + "learning_rate": 2.0831261171439873e-06, + "loss": 0.83087516, + "num_input_tokens_seen": 179573645, + "router_z_loss_clip": 2.47460938, + "router_z_loss_mlp": 0.32507324, + "step": 8352, + "time_per_iteration": 2.71761417388916 + }, + { + "auxiliary_loss_clip": 0.0144143, + "auxiliary_loss_mlp": 0.00264548, + "balance_loss_clip": 1.19001138, + "balance_loss_mlp": 0.23047844, + "epoch": 0.5022095295355479, + "flos": 21576747041280.0, + "grad_norm": 3.323835666915715, + "language_loss": 0.78832614, + "learning_rate": 2.082736990429464e-06, + "loss": 0.80538595, + "num_input_tokens_seen": 179591435, + "router_z_loss_clip": 2.51171875, + "router_z_loss_mlp": 0.34057617, + "step": 8353, + "time_per_iteration": 2.8355326652526855 + }, + { + "auxiliary_loss_clip": 0.0144268, + "auxiliary_loss_mlp": 0.00287691, + "balance_loss_clip": 1.19221389, + "balance_loss_mlp": 0.25321591, + "epoch": 0.5022696527882159, + "flos": 21397229844480.0, + "grad_norm": 16.139732130577777, + "language_loss": 0.83307296, + "learning_rate": 2.0823478605775455e-06, + "loss": 0.85037667, + "num_input_tokens_seen": 179609955, + "router_z_loss_clip": 2.5078125, + "router_z_loss_mlp": 0.34472656, + "step": 8354, + "time_per_iteration": 2.7903733253479004 + }, + { + "auxiliary_loss_clip": 0.01424443, + "auxiliary_loss_mlp": 0.00259481, + "balance_loss_clip": 1.1764878, + "balance_loss_mlp": 0.2266508, + "epoch": 0.5023297760408838, + "flos": 27160712565120.0, + "grad_norm": 3.63691636857119, + "language_loss": 0.7869693, + "learning_rate": 2.0819587276029884e-06, + "loss": 0.80380851, + "num_input_tokens_seen": 179630875, + "router_z_loss_clip": 2.48046875, + "router_z_loss_mlp": 0.328125, + "step": 8355, + "time_per_iteration": 2.8440442085266113 + }, + { + "auxiliary_loss_clip": 0.01437408, + "auxiliary_loss_mlp": 0.00287946, + "balance_loss_clip": 1.18131661, + "balance_loss_mlp": 0.2518968, + "epoch": 0.5023898992935518, + "flos": 26213820186240.0, + "grad_norm": 4.155823791507584, + "language_loss": 0.87978709, + "learning_rate": 2.081569591520548e-06, + "loss": 0.89704061, + "num_input_tokens_seen": 179649835, + "router_z_loss_clip": 2.55859375, + "router_z_loss_mlp": 0.3605957, + "step": 8356, + "time_per_iteration": 2.775402307510376 + }, + { + "auxiliary_loss_clip": 0.01434452, + "auxiliary_loss_mlp": 0.00316065, + "balance_loss_clip": 1.17546904, + "balance_loss_mlp": 0.28077903, + "epoch": 0.5024500225462197, + "flos": 13440323111040.0, + "grad_norm": 6.997154238222545, + "language_loss": 0.86071908, + "learning_rate": 2.0811804523449803e-06, + "loss": 0.87822425, + "num_input_tokens_seen": 179667605, + "router_z_loss_clip": 2.59179688, + "router_z_loss_mlp": 0.3527832, + "step": 8357, + "time_per_iteration": 2.7920563220977783 + }, + { + "auxiliary_loss_clip": 0.01405952, + "auxiliary_loss_mlp": 0.00252699, + "balance_loss_clip": 1.16314209, + "balance_loss_mlp": 0.22108433, + "epoch": 0.5025101457988878, + "flos": 21579584215680.0, + "grad_norm": 2.4225416295754396, + "language_loss": 0.83975923, + "learning_rate": 2.0807913100910417e-06, + "loss": 0.85634571, + "num_input_tokens_seen": 179686910, + "router_z_loss_clip": 2.42578125, + "router_z_loss_mlp": 0.31567383, + "step": 8358, + "time_per_iteration": 2.656294584274292 + }, + { + "auxiliary_loss_clip": 0.01407948, + "auxiliary_loss_mlp": 0.00255068, + "balance_loss_clip": 1.1632781, + "balance_loss_mlp": 0.22269084, + "epoch": 0.5025702690515557, + "flos": 24645134448000.0, + "grad_norm": 22.596751682514483, + "language_loss": 0.81915838, + "learning_rate": 2.0804021647734887e-06, + "loss": 0.83578849, + "num_input_tokens_seen": 179706395, + "router_z_loss_clip": 2.44726562, + "router_z_loss_mlp": 0.32348633, + "step": 8359, + "time_per_iteration": 2.6656970977783203 + }, + { + "auxiliary_loss_clip": 0.01382212, + "auxiliary_loss_mlp": 0.00268878, + "balance_loss_clip": 1.14819217, + "balance_loss_mlp": 0.2355466, + "epoch": 0.5026303923042237, + "flos": 22090162089600.0, + "grad_norm": 12.798572337318557, + "language_loss": 0.84104347, + "learning_rate": 2.080013016407077e-06, + "loss": 0.85755438, + "num_input_tokens_seen": 179725735, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.33349609, + "step": 8360, + "time_per_iteration": 2.651395797729492 + }, + { + "auxiliary_loss_clip": 0.01382706, + "auxiliary_loss_mlp": 0.00258184, + "balance_loss_clip": 1.14870882, + "balance_loss_mlp": 0.22540191, + "epoch": 0.5026905155568916, + "flos": 23697200574720.0, + "grad_norm": 3.484702836986478, + "language_loss": 0.8360765, + "learning_rate": 2.0796238650065645e-06, + "loss": 0.85248542, + "num_input_tokens_seen": 179746150, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.328125, + "step": 8361, + "time_per_iteration": 2.6885247230529785 + }, + { + "auxiliary_loss_clip": 0.01406641, + "auxiliary_loss_mlp": 0.002884, + "balance_loss_clip": 1.15981483, + "balance_loss_mlp": 0.25585574, + "epoch": 0.5027506388095596, + "flos": 25812410722560.0, + "grad_norm": 5.628477656703242, + "language_loss": 0.92275918, + "learning_rate": 2.0792347105867065e-06, + "loss": 0.93970954, + "num_input_tokens_seen": 179767550, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.32543945, + "step": 8362, + "time_per_iteration": 2.7093560695648193 + }, + { + "auxiliary_loss_clip": 0.01407159, + "auxiliary_loss_mlp": 0.00285556, + "balance_loss_clip": 1.1584506, + "balance_loss_mlp": 0.25122365, + "epoch": 0.5028107620622275, + "flos": 27526606456320.0, + "grad_norm": 3.4500016642329285, + "language_loss": 0.84347016, + "learning_rate": 2.0788455531622605e-06, + "loss": 0.86039728, + "num_input_tokens_seen": 179790075, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.34326172, + "step": 8363, + "time_per_iteration": 2.85449481010437 + }, + { + "auxiliary_loss_clip": 0.01392124, + "auxiliary_loss_mlp": 0.00279204, + "balance_loss_clip": 1.15855956, + "balance_loss_mlp": 0.24487191, + "epoch": 0.5028708853148955, + "flos": 24534278098560.0, + "grad_norm": 5.23196617032345, + "language_loss": 0.82314289, + "learning_rate": 2.0784563927479838e-06, + "loss": 0.83985615, + "num_input_tokens_seen": 179806515, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.34350586, + "step": 8364, + "time_per_iteration": 2.6654088497161865 + }, + { + "auxiliary_loss_clip": 0.01376716, + "auxiliary_loss_mlp": 0.0024937, + "balance_loss_clip": 1.14213514, + "balance_loss_mlp": 0.21931693, + "epoch": 0.5029310085675635, + "flos": 20813609664000.0, + "grad_norm": 115.82632450621476, + "language_loss": 0.75418043, + "learning_rate": 2.0780672293586317e-06, + "loss": 0.77044129, + "num_input_tokens_seen": 179826450, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.30078125, + "step": 8365, + "time_per_iteration": 2.640221357345581 + }, + { + "auxiliary_loss_clip": 0.01390775, + "auxiliary_loss_mlp": 0.00305159, + "balance_loss_clip": 1.14539349, + "balance_loss_mlp": 0.27180415, + "epoch": 0.5029911318202315, + "flos": 22342470197760.0, + "grad_norm": 12.794447933538077, + "language_loss": 0.79883558, + "learning_rate": 2.0776780630089635e-06, + "loss": 0.81579489, + "num_input_tokens_seen": 179846770, + "router_z_loss_clip": 2.453125, + "router_z_loss_mlp": 0.33374023, + "step": 8366, + "time_per_iteration": 2.681597948074341 + }, + { + "auxiliary_loss_clip": 0.01379691, + "auxiliary_loss_mlp": 0.0030785, + "balance_loss_clip": 1.13918209, + "balance_loss_mlp": 0.2751869, + "epoch": 0.5030512550728995, + "flos": 24352713826560.0, + "grad_norm": 112.45764053623076, + "language_loss": 0.84721607, + "learning_rate": 2.077288893713735e-06, + "loss": 0.86409152, + "num_input_tokens_seen": 179866585, + "router_z_loss_clip": 2.40234375, + "router_z_loss_mlp": 0.32653809, + "step": 8367, + "time_per_iteration": 2.70650315284729 + }, + { + "auxiliary_loss_clip": 0.01364695, + "auxiliary_loss_mlp": 0.00242232, + "balance_loss_clip": 1.13126302, + "balance_loss_mlp": 0.21182188, + "epoch": 0.5031113783255674, + "flos": 18259930195200.0, + "grad_norm": 14.693830472751841, + "language_loss": 0.77362263, + "learning_rate": 2.0768997214877035e-06, + "loss": 0.78969187, + "num_input_tokens_seen": 179885575, + "router_z_loss_clip": 2.3359375, + "router_z_loss_mlp": 0.30407715, + "step": 8368, + "time_per_iteration": 2.6716814041137695 + }, + { + "auxiliary_loss_clip": 0.01119178, + "auxiliary_loss_mlp": 0.00074172, + "balance_loss_clip": 0.99295616, + "balance_loss_mlp": 0.06697126, + "epoch": 0.5031715015782354, + "flos": 57253173200640.0, + "grad_norm": 0.8480609048502558, + "language_loss": 0.6292659, + "learning_rate": 2.0765105463456274e-06, + "loss": 0.64119947, + "num_input_tokens_seen": 179939650, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.07177734, + "step": 8369, + "time_per_iteration": 3.086191415786743 + }, + { + "auxiliary_loss_clip": 0.01370715, + "auxiliary_loss_mlp": 0.00288309, + "balance_loss_clip": 1.13054323, + "balance_loss_mlp": 0.25652814, + "epoch": 0.5032316248309033, + "flos": 27527360641920.0, + "grad_norm": 14.984126726126169, + "language_loss": 0.68240643, + "learning_rate": 2.076121368302263e-06, + "loss": 0.69899666, + "num_input_tokens_seen": 179961765, + "router_z_loss_clip": 2.40234375, + "router_z_loss_mlp": 0.31787109, + "step": 8370, + "time_per_iteration": 2.748309373855591 + }, + { + "auxiliary_loss_clip": 0.01364336, + "auxiliary_loss_mlp": 0.00276103, + "balance_loss_clip": 1.12714505, + "balance_loss_mlp": 0.2452274, + "epoch": 0.5032917480835714, + "flos": 34495825939200.0, + "grad_norm": 72.97394101645965, + "language_loss": 0.74241924, + "learning_rate": 2.0757321873723695e-06, + "loss": 0.75882357, + "num_input_tokens_seen": 179983015, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.30908203, + "step": 8371, + "time_per_iteration": 2.835634708404541 + }, + { + "auxiliary_loss_clip": 0.01377714, + "auxiliary_loss_mlp": 0.00306002, + "balance_loss_clip": 1.13796294, + "balance_loss_mlp": 0.27135941, + "epoch": 0.5033518713362393, + "flos": 33656773167360.0, + "grad_norm": 6.120533729314986, + "language_loss": 0.74239898, + "learning_rate": 2.0753430035707042e-06, + "loss": 0.75923622, + "num_input_tokens_seen": 180003210, + "router_z_loss_clip": 2.3984375, + "router_z_loss_mlp": 0.34667969, + "step": 8372, + "time_per_iteration": 2.7883994579315186 + }, + { + "auxiliary_loss_clip": 0.01364903, + "auxiliary_loss_mlp": 0.00272482, + "balance_loss_clip": 1.12534308, + "balance_loss_mlp": 0.23855546, + "epoch": 0.5034119945889073, + "flos": 28185495586560.0, + "grad_norm": 3.5459312923437203, + "language_loss": 0.73071885, + "learning_rate": 2.0749538169120235e-06, + "loss": 0.74709266, + "num_input_tokens_seen": 180025530, + "router_z_loss_clip": 2.39257812, + "router_z_loss_mlp": 0.33911133, + "step": 8373, + "time_per_iteration": 2.7610809803009033 + }, + { + "auxiliary_loss_clip": 0.01357153, + "auxiliary_loss_mlp": 0.00280124, + "balance_loss_clip": 1.12199903, + "balance_loss_mlp": 0.24705526, + "epoch": 0.5034721178415752, + "flos": 21358697529600.0, + "grad_norm": 3.636721584583758, + "language_loss": 0.80362999, + "learning_rate": 2.0745646274110872e-06, + "loss": 0.82000268, + "num_input_tokens_seen": 180043180, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.33093262, + "step": 8374, + "time_per_iteration": 2.767807960510254 + }, + { + "auxiliary_loss_clip": 0.0136588, + "auxiliary_loss_mlp": 0.00290248, + "balance_loss_clip": 1.12537336, + "balance_loss_mlp": 0.2564404, + "epoch": 0.5035322410942432, + "flos": 22674823764480.0, + "grad_norm": 12.29258553993945, + "language_loss": 0.74644792, + "learning_rate": 2.0741754350826525e-06, + "loss": 0.76300919, + "num_input_tokens_seen": 180062905, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.33789062, + "step": 8375, + "time_per_iteration": 4.127256631851196 + }, + { + "auxiliary_loss_clip": 0.01374613, + "auxiliary_loss_mlp": 0.00275439, + "balance_loss_clip": 1.13195527, + "balance_loss_mlp": 0.24158344, + "epoch": 0.5035923643469111, + "flos": 19828723674240.0, + "grad_norm": 15.891379848684567, + "language_loss": 0.87726533, + "learning_rate": 2.0737862399414777e-06, + "loss": 0.89376581, + "num_input_tokens_seen": 180082000, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.33837891, + "step": 8376, + "time_per_iteration": 2.7629427909851074 + }, + { + "auxiliary_loss_clip": 0.01367027, + "auxiliary_loss_mlp": 0.00311993, + "balance_loss_clip": 1.12624776, + "balance_loss_mlp": 0.27928203, + "epoch": 0.5036524875995791, + "flos": 30514625182080.0, + "grad_norm": 3.1061646355548596, + "language_loss": 0.68027282, + "learning_rate": 2.0733970420023213e-06, + "loss": 0.69706303, + "num_input_tokens_seen": 180101340, + "router_z_loss_clip": 2.40820312, + "router_z_loss_mlp": 0.32739258, + "step": 8377, + "time_per_iteration": 2.7158868312835693 + }, + { + "auxiliary_loss_clip": 0.01367758, + "auxiliary_loss_mlp": 0.0026476, + "balance_loss_clip": 1.13031924, + "balance_loss_mlp": 0.23314574, + "epoch": 0.5037126108522471, + "flos": 14720574637440.0, + "grad_norm": 11.679831293764575, + "language_loss": 0.86198485, + "learning_rate": 2.0730078412799425e-06, + "loss": 0.87831008, + "num_input_tokens_seen": 180119160, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.31616211, + "step": 8378, + "time_per_iteration": 4.136762857437134 + }, + { + "auxiliary_loss_clip": 0.01361199, + "auxiliary_loss_mlp": 0.00279756, + "balance_loss_clip": 1.12648749, + "balance_loss_mlp": 0.24754606, + "epoch": 0.5037727341049151, + "flos": 25297702784640.0, + "grad_norm": 28.28919708900393, + "language_loss": 0.81032073, + "learning_rate": 2.0726186377890985e-06, + "loss": 0.82673037, + "num_input_tokens_seen": 180138730, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.32226562, + "step": 8379, + "time_per_iteration": 2.7212252616882324 + }, + { + "auxiliary_loss_clip": 0.01351006, + "auxiliary_loss_mlp": 0.00272672, + "balance_loss_clip": 1.11803222, + "balance_loss_mlp": 0.24122481, + "epoch": 0.5038328573575831, + "flos": 28541764632960.0, + "grad_norm": 2.6844512212374085, + "language_loss": 0.75016129, + "learning_rate": 2.072229431544548e-06, + "loss": 0.76639807, + "num_input_tokens_seen": 180158810, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.31469727, + "step": 8380, + "time_per_iteration": 4.211390972137451 + }, + { + "auxiliary_loss_clip": 0.01344291, + "auxiliary_loss_mlp": 0.00259961, + "balance_loss_clip": 1.11606276, + "balance_loss_mlp": 0.22901402, + "epoch": 0.503892980610251, + "flos": 31649869503360.0, + "grad_norm": 1918.9575955827543, + "language_loss": 0.69356614, + "learning_rate": 2.071840222561051e-06, + "loss": 0.70960867, + "num_input_tokens_seen": 180179700, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.30944824, + "step": 8381, + "time_per_iteration": 2.7733652591705322 + }, + { + "auxiliary_loss_clip": 0.01347557, + "auxiliary_loss_mlp": 0.00248709, + "balance_loss_clip": 1.11762142, + "balance_loss_mlp": 0.21733285, + "epoch": 0.503953103862919, + "flos": 27089358197760.0, + "grad_norm": 3.9252661618166607, + "language_loss": 0.73879194, + "learning_rate": 2.071451010853365e-06, + "loss": 0.75475466, + "num_input_tokens_seen": 180199890, + "router_z_loss_clip": 2.30078125, + "router_z_loss_mlp": 0.31396484, + "step": 8382, + "time_per_iteration": 2.7511401176452637 + }, + { + "auxiliary_loss_clip": 0.01361728, + "auxiliary_loss_mlp": 0.00304748, + "balance_loss_clip": 1.11908364, + "balance_loss_mlp": 0.26924735, + "epoch": 0.5040132271155869, + "flos": 15632957024640.0, + "grad_norm": 14.514916264859897, + "language_loss": 0.70208502, + "learning_rate": 2.0710617964362506e-06, + "loss": 0.71874982, + "num_input_tokens_seen": 180217840, + "router_z_loss_clip": 2.42578125, + "router_z_loss_mlp": 0.35498047, + "step": 8383, + "time_per_iteration": 2.6751596927642822 + }, + { + "auxiliary_loss_clip": 0.01340413, + "auxiliary_loss_mlp": 0.00246392, + "balance_loss_clip": 1.11077595, + "balance_loss_mlp": 0.21639854, + "epoch": 0.504073350368255, + "flos": 13590106824960.0, + "grad_norm": 11.228611075296522, + "language_loss": 0.75186872, + "learning_rate": 2.070672579324465e-06, + "loss": 0.76773679, + "num_input_tokens_seen": 180236465, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.30029297, + "step": 8384, + "time_per_iteration": 2.674572229385376 + }, + { + "auxiliary_loss_clip": 0.01329956, + "auxiliary_loss_mlp": 0.00250181, + "balance_loss_clip": 1.1006273, + "balance_loss_mlp": 0.21824472, + "epoch": 0.5041334736209229, + "flos": 29058160510080.0, + "grad_norm": 1152.4828387266602, + "language_loss": 0.77345878, + "learning_rate": 2.0702833595327674e-06, + "loss": 0.78926015, + "num_input_tokens_seen": 180258025, + "router_z_loss_clip": 2.29101562, + "router_z_loss_mlp": 0.31970215, + "step": 8385, + "time_per_iteration": 4.138592481613159 + }, + { + "auxiliary_loss_clip": 0.01339258, + "auxiliary_loss_mlp": 0.00256685, + "balance_loss_clip": 1.1112566, + "balance_loss_mlp": 0.22653654, + "epoch": 0.5041935968735909, + "flos": 24608361899520.0, + "grad_norm": 2.701822788894995, + "language_loss": 0.88682055, + "learning_rate": 2.069894137075919e-06, + "loss": 0.90277994, + "num_input_tokens_seen": 180277825, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.3013916, + "step": 8386, + "time_per_iteration": 2.6785714626312256 + }, + { + "auxiliary_loss_clip": 0.01324556, + "auxiliary_loss_mlp": 0.00266999, + "balance_loss_clip": 1.09620452, + "balance_loss_mlp": 0.23443097, + "epoch": 0.5042537201262588, + "flos": 26286934320000.0, + "grad_norm": 14.559947453353955, + "language_loss": 0.72368765, + "learning_rate": 2.0695049119686766e-06, + "loss": 0.73960316, + "num_input_tokens_seen": 180300465, + "router_z_loss_clip": 2.28320312, + "router_z_loss_mlp": 0.32568359, + "step": 8387, + "time_per_iteration": 2.713618040084839 + }, + { + "auxiliary_loss_clip": 0.01320616, + "auxiliary_loss_mlp": 0.00248018, + "balance_loss_clip": 1.09652853, + "balance_loss_mlp": 0.2180251, + "epoch": 0.5043138433789268, + "flos": 22017371178240.0, + "grad_norm": 146.56702360454182, + "language_loss": 0.86119992, + "learning_rate": 2.0691156842258016e-06, + "loss": 0.87688625, + "num_input_tokens_seen": 180321050, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.29980469, + "step": 8388, + "time_per_iteration": 2.6889700889587402 + }, + { + "auxiliary_loss_clip": 0.01319442, + "auxiliary_loss_mlp": 0.00264279, + "balance_loss_clip": 1.09268904, + "balance_loss_mlp": 0.23154445, + "epoch": 0.5043739666315947, + "flos": 28767104605440.0, + "grad_norm": 11.892249356228541, + "language_loss": 0.77263486, + "learning_rate": 2.0687264538620537e-06, + "loss": 0.78847206, + "num_input_tokens_seen": 180338870, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.32739258, + "step": 8389, + "time_per_iteration": 2.7304751873016357 + }, + { + "auxiliary_loss_clip": 0.01327682, + "auxiliary_loss_mlp": 0.00229477, + "balance_loss_clip": 1.0976367, + "balance_loss_mlp": 0.19671869, + "epoch": 0.5044340898842627, + "flos": 27599253713280.0, + "grad_norm": 10.343852951834906, + "language_loss": 0.75240803, + "learning_rate": 2.068337220892191e-06, + "loss": 0.76797962, + "num_input_tokens_seen": 180361285, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.32763672, + "step": 8390, + "time_per_iteration": 2.715522289276123 + }, + { + "auxiliary_loss_clip": 0.01237025, + "auxiliary_loss_mlp": 0.00033314, + "balance_loss_clip": 1.08259892, + "balance_loss_mlp": 0.02640026, + "epoch": 0.5044942131369307, + "flos": 67458050749440.0, + "grad_norm": 0.8390600312595843, + "language_loss": 0.52454656, + "learning_rate": 2.067947985330974e-06, + "loss": 0.53724998, + "num_input_tokens_seen": 180415170, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.06933594, + "step": 8391, + "time_per_iteration": 2.9326982498168945 + }, + { + "auxiliary_loss_clip": 0.01241243, + "auxiliary_loss_mlp": 0.00101871, + "balance_loss_clip": 1.08761764, + "balance_loss_mlp": 0.09262076, + "epoch": 0.5045543363895987, + "flos": 58630849390080.0, + "grad_norm": 0.8064278320968578, + "language_loss": 0.60452712, + "learning_rate": 2.0675587471931628e-06, + "loss": 0.61795831, + "num_input_tokens_seen": 180468060, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.09228516, + "step": 8392, + "time_per_iteration": 2.9434127807617188 + }, + { + "auxiliary_loss_clip": 0.01298103, + "auxiliary_loss_mlp": 0.00234382, + "balance_loss_clip": 1.07763362, + "balance_loss_mlp": 0.20245816, + "epoch": 0.5046144596422667, + "flos": 22526620248960.0, + "grad_norm": 34.64253621057198, + "language_loss": 0.91606176, + "learning_rate": 2.067169506493517e-06, + "loss": 0.93138665, + "num_input_tokens_seen": 180486610, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.31860352, + "step": 8393, + "time_per_iteration": 2.6683945655822754 + }, + { + "auxiliary_loss_clip": 0.01296211, + "auxiliary_loss_mlp": 0.00246869, + "balance_loss_clip": 1.07896781, + "balance_loss_mlp": 0.21577933, + "epoch": 0.5046745828949346, + "flos": 27454246508160.0, + "grad_norm": 57.950754431525986, + "language_loss": 0.60318476, + "learning_rate": 2.0667802632467974e-06, + "loss": 0.61861551, + "num_input_tokens_seen": 180508135, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.31103516, + "step": 8394, + "time_per_iteration": 2.8524320125579834 + }, + { + "auxiliary_loss_clip": 0.01308386, + "auxiliary_loss_mlp": 0.00232549, + "balance_loss_clip": 1.08704376, + "balance_loss_mlp": 0.19950414, + "epoch": 0.5047347061476026, + "flos": 17274541415040.0, + "grad_norm": 3.6634201161507702, + "language_loss": 0.82114863, + "learning_rate": 2.0663910174677627e-06, + "loss": 0.83655798, + "num_input_tokens_seen": 180527000, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.33032227, + "step": 8395, + "time_per_iteration": 2.774489164352417 + }, + { + "auxiliary_loss_clip": 0.01287607, + "auxiliary_loss_mlp": 0.00238085, + "balance_loss_clip": 1.06937075, + "balance_loss_mlp": 0.20637567, + "epoch": 0.5047948294002705, + "flos": 16649515831680.0, + "grad_norm": 174.44317691999652, + "language_loss": 0.76069146, + "learning_rate": 2.0660017691711737e-06, + "loss": 0.77594841, + "num_input_tokens_seen": 180544715, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.31713867, + "step": 8396, + "time_per_iteration": 2.666508436203003 + }, + { + "auxiliary_loss_clip": 0.01295268, + "auxiliary_loss_mlp": 0.00224792, + "balance_loss_clip": 1.07954264, + "balance_loss_mlp": 0.19453719, + "epoch": 0.5048549526529386, + "flos": 26865706164480.0, + "grad_norm": 8.88449262559083, + "language_loss": 0.85148978, + "learning_rate": 2.065612518371792e-06, + "loss": 0.8666904, + "num_input_tokens_seen": 180565365, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.30273438, + "step": 8397, + "time_per_iteration": 2.691591739654541 + }, + { + "auxiliary_loss_clip": 0.0127433, + "auxiliary_loss_mlp": 0.00226321, + "balance_loss_clip": 1.06346059, + "balance_loss_mlp": 0.19721037, + "epoch": 0.5049150759056065, + "flos": 21833939399040.0, + "grad_norm": 15.824481504938332, + "language_loss": 0.71582836, + "learning_rate": 2.065223265084376e-06, + "loss": 0.73083484, + "num_input_tokens_seen": 180586670, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.29101562, + "step": 8398, + "time_per_iteration": 2.64906644821167 + }, + { + "auxiliary_loss_clip": 0.0128325, + "auxiliary_loss_mlp": 0.0025105, + "balance_loss_clip": 1.06384206, + "balance_loss_mlp": 0.21908997, + "epoch": 0.5049751991582745, + "flos": 21685807710720.0, + "grad_norm": 9.215987286745595, + "language_loss": 0.78676176, + "learning_rate": 2.064834009323688e-06, + "loss": 0.80210483, + "num_input_tokens_seen": 180605085, + "router_z_loss_clip": 2.19238281, + "router_z_loss_mlp": 0.31982422, + "step": 8399, + "time_per_iteration": 2.6782491207122803 + }, + { + "auxiliary_loss_clip": 0.01292257, + "auxiliary_loss_mlp": 0.00277289, + "balance_loss_clip": 1.06679845, + "balance_loss_mlp": 0.24481633, + "epoch": 0.5050353224109424, + "flos": 21359379888000.0, + "grad_norm": 680.5594893330084, + "language_loss": 0.88688958, + "learning_rate": 2.0644447511044878e-06, + "loss": 0.90258509, + "num_input_tokens_seen": 180624370, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.32470703, + "step": 8400, + "time_per_iteration": 2.7447712421417236 + }, + { + "auxiliary_loss_clip": 0.01281752, + "auxiliary_loss_mlp": 0.00221532, + "balance_loss_clip": 1.069098, + "balance_loss_mlp": 0.189798, + "epoch": 0.5050954456636104, + "flos": 22820082364800.0, + "grad_norm": 6.898512323047323, + "language_loss": 0.8476572, + "learning_rate": 2.0640554904415362e-06, + "loss": 0.86268997, + "num_input_tokens_seen": 180642450, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.31713867, + "step": 8401, + "time_per_iteration": 2.6519651412963867 + }, + { + "auxiliary_loss_clip": 0.01291593, + "auxiliary_loss_mlp": 0.00233191, + "balance_loss_clip": 1.06993628, + "balance_loss_mlp": 0.20212492, + "epoch": 0.5051555689162783, + "flos": 30448226891520.0, + "grad_norm": 25.324071544169996, + "language_loss": 0.78536773, + "learning_rate": 2.063666227349593e-06, + "loss": 0.80061555, + "num_input_tokens_seen": 180665250, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.31103516, + "step": 8402, + "time_per_iteration": 2.7691829204559326 + }, + { + "auxiliary_loss_clip": 0.0128928, + "auxiliary_loss_mlp": 0.00271506, + "balance_loss_clip": 1.07080913, + "balance_loss_mlp": 0.2388902, + "epoch": 0.5052156921689464, + "flos": 21287953693440.0, + "grad_norm": 84.75088042004353, + "language_loss": 0.76057684, + "learning_rate": 2.063276961843422e-06, + "loss": 0.77618468, + "num_input_tokens_seen": 180687425, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.32617188, + "step": 8403, + "time_per_iteration": 2.7064619064331055 + }, + { + "auxiliary_loss_clip": 0.01277558, + "auxiliary_loss_mlp": 0.0025, + "balance_loss_clip": 1.063375, + "balance_loss_mlp": 0.21801579, + "epoch": 0.5052758154216143, + "flos": 25081305298560.0, + "grad_norm": 13.505889439336944, + "language_loss": 0.91634607, + "learning_rate": 2.062887693937781e-06, + "loss": 0.93162161, + "num_input_tokens_seen": 180708725, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.31945801, + "step": 8404, + "time_per_iteration": 2.74446702003479 + }, + { + "auxiliary_loss_clip": 0.01291649, + "auxiliary_loss_mlp": 0.00217843, + "balance_loss_clip": 1.07705975, + "balance_loss_mlp": 0.18728921, + "epoch": 0.5053359386742823, + "flos": 20885502735360.0, + "grad_norm": 13.674764815726505, + "language_loss": 0.81169021, + "learning_rate": 2.0624984236474322e-06, + "loss": 0.82678515, + "num_input_tokens_seen": 180727990, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.30566406, + "step": 8405, + "time_per_iteration": 2.685103178024292 + }, + { + "auxiliary_loss_clip": 0.01289964, + "auxiliary_loss_mlp": 0.00263733, + "balance_loss_clip": 1.06908095, + "balance_loss_mlp": 0.23059301, + "epoch": 0.5053960619269503, + "flos": 37743335493120.0, + "grad_norm": 7.734724563451844, + "language_loss": 0.81798649, + "learning_rate": 2.0621091509871378e-06, + "loss": 0.83352345, + "num_input_tokens_seen": 180749765, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.33154297, + "step": 8406, + "time_per_iteration": 2.8328564167022705 + }, + { + "auxiliary_loss_clip": 0.01271907, + "auxiliary_loss_mlp": 0.002196, + "balance_loss_clip": 1.05990112, + "balance_loss_mlp": 0.18831968, + "epoch": 0.5054561851796182, + "flos": 23513840622720.0, + "grad_norm": 6.343219989777322, + "language_loss": 0.8423081, + "learning_rate": 2.0617198759716568e-06, + "loss": 0.85722321, + "num_input_tokens_seen": 180769580, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.3125, + "step": 8407, + "time_per_iteration": 2.6477203369140625 + }, + { + "auxiliary_loss_clip": 0.01278034, + "auxiliary_loss_mlp": 0.00228032, + "balance_loss_clip": 1.06292152, + "balance_loss_mlp": 0.1969423, + "epoch": 0.5055163084322862, + "flos": 30410233280640.0, + "grad_norm": 3.134876633636941, + "language_loss": 0.73308778, + "learning_rate": 2.0613305986157535e-06, + "loss": 0.74814844, + "num_input_tokens_seen": 180790295, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.31103516, + "step": 8408, + "time_per_iteration": 2.716315984725952 + }, + { + "auxiliary_loss_clip": 0.01292725, + "auxiliary_loss_mlp": 0.00284335, + "balance_loss_clip": 1.07443738, + "balance_loss_mlp": 0.2509563, + "epoch": 0.5055764316849541, + "flos": 20259651139200.0, + "grad_norm": 13.949250419931529, + "language_loss": 0.7124629, + "learning_rate": 2.0609413189341865e-06, + "loss": 0.72823352, + "num_input_tokens_seen": 180807875, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.33374023, + "step": 8409, + "time_per_iteration": 2.653784990310669 + }, + { + "auxiliary_loss_clip": 0.01265055, + "auxiliary_loss_mlp": 0.00225527, + "balance_loss_clip": 1.05632222, + "balance_loss_mlp": 0.19372171, + "epoch": 0.5056365549376222, + "flos": 26070895969920.0, + "grad_norm": 9.970262880830685, + "language_loss": 0.7578913, + "learning_rate": 2.0605520369417193e-06, + "loss": 0.77279711, + "num_input_tokens_seen": 180831300, + "router_z_loss_clip": 2.0859375, + "router_z_loss_mlp": 0.31811523, + "step": 8410, + "time_per_iteration": 2.7240092754364014 + }, + { + "auxiliary_loss_clip": 0.01276815, + "auxiliary_loss_mlp": 0.00232273, + "balance_loss_clip": 1.06319821, + "balance_loss_mlp": 0.20274523, + "epoch": 0.5056966781902901, + "flos": 19279074781440.0, + "grad_norm": 35.85111967493781, + "language_loss": 0.84981918, + "learning_rate": 2.060162752653113e-06, + "loss": 0.86491007, + "num_input_tokens_seen": 180849055, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.29553223, + "step": 8411, + "time_per_iteration": 2.6339781284332275 + }, + { + "auxiliary_loss_clip": 0.01282708, + "auxiliary_loss_mlp": 0.00258387, + "balance_loss_clip": 1.06480885, + "balance_loss_mlp": 0.22527042, + "epoch": 0.5057568014429581, + "flos": 21323325611520.0, + "grad_norm": 18.935798075865094, + "language_loss": 0.89281088, + "learning_rate": 2.0597734660831285e-06, + "loss": 0.90822184, + "num_input_tokens_seen": 180867395, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.33105469, + "step": 8412, + "time_per_iteration": 2.654064178466797 + }, + { + "auxiliary_loss_clip": 0.01283012, + "auxiliary_loss_mlp": 0.0021283, + "balance_loss_clip": 1.06787515, + "balance_loss_mlp": 0.18200272, + "epoch": 0.505816924695626, + "flos": 17493596507520.0, + "grad_norm": 24.9287197487472, + "language_loss": 0.89707398, + "learning_rate": 2.0593841772465283e-06, + "loss": 0.91203249, + "num_input_tokens_seen": 180886670, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.30834961, + "step": 8413, + "time_per_iteration": 2.674069881439209 + }, + { + "auxiliary_loss_clip": 0.01286004, + "auxiliary_loss_mlp": 0.00245485, + "balance_loss_clip": 1.0653584, + "balance_loss_mlp": 0.21344152, + "epoch": 0.505877047948294, + "flos": 21142084561920.0, + "grad_norm": 73.25322111356229, + "language_loss": 0.89396906, + "learning_rate": 2.0589948861580737e-06, + "loss": 0.909284, + "num_input_tokens_seen": 180904645, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.32055664, + "step": 8414, + "time_per_iteration": 2.6650896072387695 + }, + { + "auxiliary_loss_clip": 0.01269854, + "auxiliary_loss_mlp": 0.00230914, + "balance_loss_clip": 1.05567789, + "balance_loss_mlp": 0.2001102, + "epoch": 0.5059371712009619, + "flos": 36350036887680.0, + "grad_norm": 153.13119586632106, + "language_loss": 0.71105802, + "learning_rate": 2.058605592832528e-06, + "loss": 0.7260657, + "num_input_tokens_seen": 180922340, + "router_z_loss_clip": 2.13964844, + "router_z_loss_mlp": 0.30822754, + "step": 8415, + "time_per_iteration": 2.8225138187408447 + }, + { + "auxiliary_loss_clip": 0.01282345, + "auxiliary_loss_mlp": 0.00230823, + "balance_loss_clip": 1.06353426, + "balance_loss_mlp": 0.19820763, + "epoch": 0.50599729445363, + "flos": 22673387220480.0, + "grad_norm": 176.68633009268746, + "language_loss": 0.8874706, + "learning_rate": 2.0582162972846515e-06, + "loss": 0.90260226, + "num_input_tokens_seen": 180941350, + "router_z_loss_clip": 2.18652344, + "router_z_loss_mlp": 0.32641602, + "step": 8416, + "time_per_iteration": 2.662322998046875 + }, + { + "auxiliary_loss_clip": 0.0127175, + "auxiliary_loss_mlp": 0.00208749, + "balance_loss_clip": 1.06474364, + "balance_loss_mlp": 0.18026978, + "epoch": 0.5060574177062979, + "flos": 22747866071040.0, + "grad_norm": 6.46068441718129, + "language_loss": 0.870646, + "learning_rate": 2.0578269995292078e-06, + "loss": 0.88545102, + "num_input_tokens_seen": 180960720, + "router_z_loss_clip": 2.07421875, + "router_z_loss_mlp": 0.28479004, + "step": 8417, + "time_per_iteration": 2.749983310699463 + }, + { + "auxiliary_loss_clip": 0.0126459, + "auxiliary_loss_mlp": 0.00220104, + "balance_loss_clip": 1.05543399, + "balance_loss_mlp": 0.18813223, + "epoch": 0.5061175409589659, + "flos": 21653201139840.0, + "grad_norm": 76.05522947182168, + "language_loss": 0.70037484, + "learning_rate": 2.0574376995809588e-06, + "loss": 0.71522182, + "num_input_tokens_seen": 180979725, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.31982422, + "step": 8418, + "time_per_iteration": 4.069144248962402 + }, + { + "auxiliary_loss_clip": 0.01284723, + "auxiliary_loss_mlp": 0.00223606, + "balance_loss_clip": 1.06500876, + "balance_loss_mlp": 0.19263516, + "epoch": 0.5061776642116339, + "flos": 21616249023360.0, + "grad_norm": 17.424369930228274, + "language_loss": 0.86612695, + "learning_rate": 2.0570483974546653e-06, + "loss": 0.88121027, + "num_input_tokens_seen": 180998980, + "router_z_loss_clip": 2.20019531, + "router_z_loss_mlp": 0.30981445, + "step": 8419, + "time_per_iteration": 2.6608774662017822 + }, + { + "auxiliary_loss_clip": 0.01290748, + "auxiliary_loss_mlp": 0.00251939, + "balance_loss_clip": 1.06811833, + "balance_loss_mlp": 0.21844092, + "epoch": 0.5062377874643018, + "flos": 24426294837120.0, + "grad_norm": 16.520824183916023, + "language_loss": 0.86132622, + "learning_rate": 2.0566590931650917e-06, + "loss": 0.87675309, + "num_input_tokens_seen": 181019165, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.33520508, + "step": 8420, + "time_per_iteration": 4.131247282028198 + }, + { + "auxiliary_loss_clip": 0.01284361, + "auxiliary_loss_mlp": 0.00237252, + "balance_loss_clip": 1.0646143, + "balance_loss_mlp": 0.20206141, + "epoch": 0.5062979107169698, + "flos": 22524429519360.0, + "grad_norm": 6.45836985693877, + "language_loss": 0.86403877, + "learning_rate": 2.056269786726999e-06, + "loss": 0.87925488, + "num_input_tokens_seen": 181037110, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.35180664, + "step": 8421, + "time_per_iteration": 2.6373038291931152 + }, + { + "auxiliary_loss_clip": 0.01271737, + "auxiliary_loss_mlp": 0.0021815, + "balance_loss_clip": 1.06056535, + "balance_loss_mlp": 0.18570112, + "epoch": 0.5063580339696377, + "flos": 24571984400640.0, + "grad_norm": 6.847833101204108, + "language_loss": 0.72525066, + "learning_rate": 2.0558804781551512e-06, + "loss": 0.74014956, + "num_input_tokens_seen": 181057775, + "router_z_loss_clip": 2.11230469, + "router_z_loss_mlp": 0.32446289, + "step": 8422, + "time_per_iteration": 4.045825004577637 + }, + { + "auxiliary_loss_clip": 0.01275099, + "auxiliary_loss_mlp": 0.00220256, + "balance_loss_clip": 1.06347668, + "balance_loss_mlp": 0.18799761, + "epoch": 0.5064181572223058, + "flos": 22596143022720.0, + "grad_norm": 9.515918847386917, + "language_loss": 0.89206892, + "learning_rate": 2.05549116746431e-06, + "loss": 0.90702248, + "num_input_tokens_seen": 181078260, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.32250977, + "step": 8423, + "time_per_iteration": 2.7575416564941406 + }, + { + "auxiliary_loss_clip": 0.01276031, + "auxiliary_loss_mlp": 0.00211053, + "balance_loss_clip": 1.06067944, + "balance_loss_mlp": 0.17567217, + "epoch": 0.5064782804749737, + "flos": 25994944661760.0, + "grad_norm": 19.400621431652336, + "language_loss": 0.851318, + "learning_rate": 2.055101854669237e-06, + "loss": 0.86618888, + "num_input_tokens_seen": 181098755, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.35375977, + "step": 8424, + "time_per_iteration": 2.6821236610412598 + }, + { + "auxiliary_loss_clip": 0.0126353, + "auxiliary_loss_mlp": 0.00200404, + "balance_loss_clip": 1.05608869, + "balance_loss_mlp": 0.16611984, + "epoch": 0.5065384037276417, + "flos": 28553041503360.0, + "grad_norm": 67.379192090613, + "language_loss": 0.75590849, + "learning_rate": 2.0547125397846975e-06, + "loss": 0.77054781, + "num_input_tokens_seen": 181121570, + "router_z_loss_clip": 2.07226562, + "router_z_loss_mlp": 0.34301758, + "step": 8425, + "time_per_iteration": 2.7815101146698 + }, + { + "auxiliary_loss_clip": 0.01271758, + "auxiliary_loss_mlp": 0.00197143, + "balance_loss_clip": 1.05687904, + "balance_loss_mlp": 0.16433649, + "epoch": 0.5065985269803096, + "flos": 22966023323520.0, + "grad_norm": 5.34010783961367, + "language_loss": 0.85942918, + "learning_rate": 2.0543232228254524e-06, + "loss": 0.87411821, + "num_input_tokens_seen": 181140240, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.328125, + "step": 8426, + "time_per_iteration": 2.6267430782318115 + }, + { + "auxiliary_loss_clip": 0.01267603, + "auxiliary_loss_mlp": 0.00201071, + "balance_loss_clip": 1.0580219, + "balance_loss_mlp": 0.16790724, + "epoch": 0.5066586502329776, + "flos": 21608563512960.0, + "grad_norm": 51.651297328867166, + "language_loss": 0.86609602, + "learning_rate": 2.053933903806265e-06, + "loss": 0.88078272, + "num_input_tokens_seen": 181158630, + "router_z_loss_clip": 2.09472656, + "router_z_loss_mlp": 0.33129883, + "step": 8427, + "time_per_iteration": 4.091872692108154 + }, + { + "auxiliary_loss_clip": 0.01277237, + "auxiliary_loss_mlp": 0.00202079, + "balance_loss_clip": 1.05956757, + "balance_loss_mlp": 0.16445683, + "epoch": 0.5067187734856455, + "flos": 20339912079360.0, + "grad_norm": 8.322531758877448, + "language_loss": 0.79537261, + "learning_rate": 2.0535445827418997e-06, + "loss": 0.81016582, + "num_input_tokens_seen": 181176405, + "router_z_loss_clip": 2.17675781, + "router_z_loss_mlp": 0.3762207, + "step": 8428, + "time_per_iteration": 2.661098003387451 + }, + { + "auxiliary_loss_clip": 0.01264784, + "auxiliary_loss_mlp": 0.00210792, + "balance_loss_clip": 1.05344355, + "balance_loss_mlp": 0.17791384, + "epoch": 0.5067788967383136, + "flos": 28841080665600.0, + "grad_norm": 4.657257563646165, + "language_loss": 0.8974666, + "learning_rate": 2.0531552596471168e-06, + "loss": 0.91222239, + "num_input_tokens_seen": 181197595, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.32885742, + "step": 8429, + "time_per_iteration": 2.6989338397979736 + }, + { + "auxiliary_loss_clip": 0.01286037, + "auxiliary_loss_mlp": 0.00228777, + "balance_loss_clip": 1.0639317, + "balance_loss_mlp": 0.18946135, + "epoch": 0.5068390199909815, + "flos": 32450174478720.0, + "grad_norm": 4.090888256715769, + "language_loss": 0.81045282, + "learning_rate": 2.052765934536682e-06, + "loss": 0.82560098, + "num_input_tokens_seen": 181218560, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.39331055, + "step": 8430, + "time_per_iteration": 2.7299647331237793 + }, + { + "auxiliary_loss_clip": 0.01263995, + "auxiliary_loss_mlp": 0.00201315, + "balance_loss_clip": 1.05597901, + "balance_loss_mlp": 0.16850869, + "epoch": 0.5068991432436495, + "flos": 23146582014720.0, + "grad_norm": 4.9730636051809665, + "language_loss": 0.8396371, + "learning_rate": 2.0523766074253575e-06, + "loss": 0.85429025, + "num_input_tokens_seen": 181237095, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.328125, + "step": 8431, + "time_per_iteration": 2.642548084259033 + }, + { + "auxiliary_loss_clip": 0.01271276, + "auxiliary_loss_mlp": 0.00215306, + "balance_loss_clip": 1.06029201, + "balance_loss_mlp": 0.17889968, + "epoch": 0.5069592664963174, + "flos": 19936096404480.0, + "grad_norm": 9.841118033886003, + "language_loss": 0.78152716, + "learning_rate": 2.0519872783279074e-06, + "loss": 0.79639304, + "num_input_tokens_seen": 181255940, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.36376953, + "step": 8432, + "time_per_iteration": 2.680083751678467 + }, + { + "auxiliary_loss_clip": 0.01219623, + "auxiliary_loss_mlp": 0.00040336, + "balance_loss_clip": 1.06920004, + "balance_loss_mlp": 0.02374209, + "epoch": 0.5070193897489854, + "flos": 65793771941760.0, + "grad_norm": 0.7437461861017748, + "language_loss": 0.63180155, + "learning_rate": 2.0515979472590945e-06, + "loss": 0.64440113, + "num_input_tokens_seen": 181316945, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.16601562, + "step": 8433, + "time_per_iteration": 3.1762917041778564 + }, + { + "auxiliary_loss_clip": 0.01279399, + "auxiliary_loss_mlp": 0.00186975, + "balance_loss_clip": 1.0647099, + "balance_loss_mlp": 0.15066382, + "epoch": 0.5070795130016534, + "flos": 17275331514240.0, + "grad_norm": 31.318065524628665, + "language_loss": 0.83150655, + "learning_rate": 2.051208614233681e-06, + "loss": 0.84617031, + "num_input_tokens_seen": 181335555, + "router_z_loss_clip": 2.14550781, + "router_z_loss_mlp": 0.36254883, + "step": 8434, + "time_per_iteration": 2.6155951023101807 + }, + { + "auxiliary_loss_clip": 0.01279225, + "auxiliary_loss_mlp": 0.00189792, + "balance_loss_clip": 1.06420648, + "balance_loss_mlp": 0.15147769, + "epoch": 0.5071396362543213, + "flos": 21069940095360.0, + "grad_norm": 7.170245874910778, + "language_loss": 0.79364133, + "learning_rate": 2.0508192792664326e-06, + "loss": 0.80833155, + "num_input_tokens_seen": 181354580, + "router_z_loss_clip": 2.14941406, + "router_z_loss_mlp": 0.38330078, + "step": 8435, + "time_per_iteration": 2.648918867111206 + }, + { + "auxiliary_loss_clip": 0.0128824, + "auxiliary_loss_mlp": 0.00221905, + "balance_loss_clip": 1.07195628, + "balance_loss_mlp": 0.18545055, + "epoch": 0.5071997595069894, + "flos": 23144822248320.0, + "grad_norm": 86.22890022610137, + "language_loss": 0.81292284, + "learning_rate": 2.050429942372112e-06, + "loss": 0.82802433, + "num_input_tokens_seen": 181374320, + "router_z_loss_clip": 2.16308594, + "router_z_loss_mlp": 0.36450195, + "step": 8436, + "time_per_iteration": 2.659109115600586 + }, + { + "auxiliary_loss_clip": 0.01286966, + "auxiliary_loss_mlp": 0.00219746, + "balance_loss_clip": 1.07051361, + "balance_loss_mlp": 0.18281505, + "epoch": 0.5072598827596573, + "flos": 22747183712640.0, + "grad_norm": 9.887858236092486, + "language_loss": 0.91403919, + "learning_rate": 2.050040603565483e-06, + "loss": 0.9291063, + "num_input_tokens_seen": 181392190, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.36889648, + "step": 8437, + "time_per_iteration": 2.6986544132232666 + }, + { + "auxiliary_loss_clip": 0.01260591, + "auxiliary_loss_mlp": 0.00196829, + "balance_loss_clip": 1.05568671, + "balance_loss_mlp": 0.1586577, + "epoch": 0.5073200060123253, + "flos": 22566301799040.0, + "grad_norm": 34.47839546704348, + "language_loss": 0.87715417, + "learning_rate": 2.049651262861309e-06, + "loss": 0.89172828, + "num_input_tokens_seen": 181413890, + "router_z_loss_clip": 2.04882812, + "router_z_loss_mlp": 0.38183594, + "step": 8438, + "time_per_iteration": 2.742281436920166 + }, + { + "auxiliary_loss_clip": 0.01277759, + "auxiliary_loss_mlp": 0.00221491, + "balance_loss_clip": 1.06389713, + "balance_loss_mlp": 0.17988648, + "epoch": 0.5073801292649932, + "flos": 25806341324160.0, + "grad_norm": 10.871772058131743, + "language_loss": 0.86905766, + "learning_rate": 2.0492619202743543e-06, + "loss": 0.88405013, + "num_input_tokens_seen": 181433240, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.41625977, + "step": 8439, + "time_per_iteration": 2.737874746322632 + }, + { + "auxiliary_loss_clip": 0.01266019, + "auxiliary_loss_mlp": 0.00186567, + "balance_loss_clip": 1.06048584, + "balance_loss_mlp": 0.1519721, + "epoch": 0.5074402525176612, + "flos": 25373941401600.0, + "grad_norm": 12.587574602759359, + "language_loss": 0.77313733, + "learning_rate": 2.048872575819383e-06, + "loss": 0.78766316, + "num_input_tokens_seen": 181453535, + "router_z_loss_clip": 2.05664062, + "router_z_loss_mlp": 0.34594727, + "step": 8440, + "time_per_iteration": 2.6900556087493896 + }, + { + "auxiliary_loss_clip": 0.01273244, + "auxiliary_loss_mlp": 0.00201638, + "balance_loss_clip": 1.06467962, + "balance_loss_mlp": 0.16856973, + "epoch": 0.5075003757703291, + "flos": 26064431521920.0, + "grad_norm": 4.655087423318338, + "language_loss": 0.77361882, + "learning_rate": 2.048483229511158e-06, + "loss": 0.78836769, + "num_input_tokens_seen": 181474195, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.33056641, + "step": 8441, + "time_per_iteration": 2.680727481842041 + }, + { + "auxiliary_loss_clip": 0.01274356, + "auxiliary_loss_mlp": 0.00215778, + "balance_loss_clip": 1.06111324, + "balance_loss_mlp": 0.17975271, + "epoch": 0.5075604990229972, + "flos": 21835447770240.0, + "grad_norm": 5.548521568205392, + "language_loss": 0.73036867, + "learning_rate": 2.0480938813644445e-06, + "loss": 0.74527001, + "num_input_tokens_seen": 181494000, + "router_z_loss_clip": 2.13183594, + "router_z_loss_mlp": 0.36010742, + "step": 8442, + "time_per_iteration": 2.613765239715576 + }, + { + "auxiliary_loss_clip": 0.01259492, + "auxiliary_loss_mlp": 0.00202313, + "balance_loss_clip": 1.0605036, + "balance_loss_mlp": 0.17049587, + "epoch": 0.5076206222756651, + "flos": 31978703537280.0, + "grad_norm": 129.5741433363358, + "language_loss": 0.76958895, + "learning_rate": 2.047704531394006e-06, + "loss": 0.78420705, + "num_input_tokens_seen": 181515955, + "router_z_loss_clip": 1.98925781, + "router_z_loss_mlp": 0.31799316, + "step": 8443, + "time_per_iteration": 2.7187209129333496 + }, + { + "auxiliary_loss_clip": 0.01283182, + "auxiliary_loss_mlp": 0.0025913, + "balance_loss_clip": 1.07144618, + "balance_loss_mlp": 0.22465512, + "epoch": 0.5076807455283331, + "flos": 36904031326080.0, + "grad_norm": 681.0511373393103, + "language_loss": 0.68542278, + "learning_rate": 2.047315179614607e-06, + "loss": 0.7008459, + "num_input_tokens_seen": 181540225, + "router_z_loss_clip": 2.11816406, + "router_z_loss_mlp": 0.3449707, + "step": 8444, + "time_per_iteration": 2.802870988845825 + }, + { + "auxiliary_loss_clip": 0.01272235, + "auxiliary_loss_mlp": 0.00235053, + "balance_loss_clip": 1.06488276, + "balance_loss_mlp": 0.20162673, + "epoch": 0.507740868781001, + "flos": 29862415981440.0, + "grad_norm": 16.16549625514251, + "language_loss": 0.71433908, + "learning_rate": 2.046925826041012e-06, + "loss": 0.72941196, + "num_input_tokens_seen": 181560125, + "router_z_loss_clip": 2.07519531, + "router_z_loss_mlp": 0.33422852, + "step": 8445, + "time_per_iteration": 2.755812406539917 + }, + { + "auxiliary_loss_clip": 0.01208517, + "auxiliary_loss_mlp": 0.00109759, + "balance_loss_clip": 1.05973899, + "balance_loss_mlp": 0.0964072, + "epoch": 0.507800992033669, + "flos": 61918974247680.0, + "grad_norm": 1.2574834824888388, + "language_loss": 0.61323088, + "learning_rate": 2.0465364706879845e-06, + "loss": 0.62641358, + "num_input_tokens_seen": 181618830, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.13378906, + "step": 8446, + "time_per_iteration": 3.1855201721191406 + }, + { + "auxiliary_loss_clip": 0.01271336, + "auxiliary_loss_mlp": 0.00238263, + "balance_loss_clip": 1.06618476, + "balance_loss_mlp": 0.2048849, + "epoch": 0.507861115286337, + "flos": 20700490757760.0, + "grad_norm": 1953.7159115245338, + "language_loss": 0.87477577, + "learning_rate": 2.04614711357029e-06, + "loss": 0.88987184, + "num_input_tokens_seen": 181637120, + "router_z_loss_clip": 2.05078125, + "router_z_loss_mlp": 0.33398438, + "step": 8447, + "time_per_iteration": 2.6632330417633057 + }, + { + "auxiliary_loss_clip": 0.01265664, + "auxiliary_loss_mlp": 0.00246971, + "balance_loss_clip": 1.05977273, + "balance_loss_mlp": 0.21418804, + "epoch": 0.507921238539005, + "flos": 30847050576000.0, + "grad_norm": 4.367107921108185, + "language_loss": 0.75394309, + "learning_rate": 2.0457577547026916e-06, + "loss": 0.76906949, + "num_input_tokens_seen": 181659965, + "router_z_loss_clip": 2.05957031, + "router_z_loss_mlp": 0.32788086, + "step": 8448, + "time_per_iteration": 2.7363381385803223 + }, + { + "auxiliary_loss_clip": 0.01273504, + "auxiliary_loss_mlp": 0.00243422, + "balance_loss_clip": 1.06824517, + "balance_loss_mlp": 0.21216494, + "epoch": 0.507981361791673, + "flos": 35700197984640.0, + "grad_norm": 44.019645796323005, + "language_loss": 0.7721526, + "learning_rate": 2.045368394099955e-06, + "loss": 0.78732193, + "num_input_tokens_seen": 181685290, + "router_z_loss_clip": 2.05273438, + "router_z_loss_mlp": 0.31225586, + "step": 8449, + "time_per_iteration": 2.8312604427337646 + }, + { + "auxiliary_loss_clip": 0.01267952, + "auxiliary_loss_mlp": 0.00271546, + "balance_loss_clip": 1.06526363, + "balance_loss_mlp": 0.24031331, + "epoch": 0.5080414850443409, + "flos": 27161466750720.0, + "grad_norm": 10.527701214994753, + "language_loss": 0.81042624, + "learning_rate": 2.044979031776844e-06, + "loss": 0.82582122, + "num_input_tokens_seen": 181706080, + "router_z_loss_clip": 2.02734375, + "router_z_loss_mlp": 0.31225586, + "step": 8450, + "time_per_iteration": 2.706831693649292 + }, + { + "auxiliary_loss_clip": 0.01284104, + "auxiliary_loss_mlp": 0.00267566, + "balance_loss_clip": 1.07314146, + "balance_loss_mlp": 0.23559368, + "epoch": 0.5081016082970089, + "flos": 27085192220160.0, + "grad_norm": 28.594115355912066, + "language_loss": 0.83704042, + "learning_rate": 2.0445896677481234e-06, + "loss": 0.85255718, + "num_input_tokens_seen": 181724805, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.31958008, + "step": 8451, + "time_per_iteration": 2.6697113513946533 + }, + { + "auxiliary_loss_clip": 0.01279698, + "auxiliary_loss_mlp": 0.00253869, + "balance_loss_clip": 1.07108796, + "balance_loss_mlp": 0.22034709, + "epoch": 0.5081617315496768, + "flos": 22856531690880.0, + "grad_norm": 32.766752098141566, + "language_loss": 0.92501819, + "learning_rate": 2.044200302028559e-06, + "loss": 0.94035387, + "num_input_tokens_seen": 181743725, + "router_z_loss_clip": 2.0859375, + "router_z_loss_mlp": 0.33496094, + "step": 8452, + "time_per_iteration": 2.6629557609558105 + }, + { + "auxiliary_loss_clip": 0.01289489, + "auxiliary_loss_mlp": 0.00255639, + "balance_loss_clip": 1.07426941, + "balance_loss_mlp": 0.22354811, + "epoch": 0.5082218548023448, + "flos": 16281898087680.0, + "grad_norm": 110.68424976105771, + "language_loss": 0.8922087, + "learning_rate": 2.0438109346329143e-06, + "loss": 0.90766001, + "num_input_tokens_seen": 181757720, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.32080078, + "step": 8453, + "time_per_iteration": 2.575678586959839 + }, + { + "auxiliary_loss_clip": 0.01276254, + "auxiliary_loss_mlp": 0.00283194, + "balance_loss_clip": 1.0726645, + "balance_loss_mlp": 0.25390404, + "epoch": 0.5082819780550127, + "flos": 24460768915200.0, + "grad_norm": 3.651066331755099, + "language_loss": 0.8629269, + "learning_rate": 2.0434215655759544e-06, + "loss": 0.87852138, + "num_input_tokens_seen": 181778545, + "router_z_loss_clip": 2.03417969, + "router_z_loss_mlp": 0.29260254, + "step": 8454, + "time_per_iteration": 2.692570447921753 + }, + { + "auxiliary_loss_clip": 0.01277075, + "auxiliary_loss_mlp": 0.00271438, + "balance_loss_clip": 1.0701499, + "balance_loss_mlp": 0.2381551, + "epoch": 0.5083421013076808, + "flos": 23403271582080.0, + "grad_norm": 417.4921037647146, + "language_loss": 0.95648921, + "learning_rate": 2.0430321948724446e-06, + "loss": 0.97197431, + "num_input_tokens_seen": 181799495, + "router_z_loss_clip": 2.07226562, + "router_z_loss_mlp": 0.33300781, + "step": 8455, + "time_per_iteration": 2.680104970932007 + }, + { + "auxiliary_loss_clip": 0.01279936, + "auxiliary_loss_mlp": 0.00255226, + "balance_loss_clip": 1.06291652, + "balance_loss_mlp": 0.22129858, + "epoch": 0.5084022245603487, + "flos": 23872695448320.0, + "grad_norm": 29.39719836644077, + "language_loss": 0.69289601, + "learning_rate": 2.042642822537149e-06, + "loss": 0.70824766, + "num_input_tokens_seen": 181818400, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.33886719, + "step": 8456, + "time_per_iteration": 2.652148962020874 + }, + { + "auxiliary_loss_clip": 0.01280299, + "auxiliary_loss_mlp": 0.00076297, + "balance_loss_clip": 1.09052968, + "balance_loss_mlp": 0.05684246, + "epoch": 0.5084623478130167, + "flos": 62873336655360.0, + "grad_norm": 0.7923429600422457, + "language_loss": 0.61850691, + "learning_rate": 2.0422534485848343e-06, + "loss": 0.63207293, + "num_input_tokens_seen": 181875975, + "router_z_loss_clip": 1.890625, + "router_z_loss_mlp": 0.19433594, + "step": 8457, + "time_per_iteration": 2.9996519088745117 + }, + { + "auxiliary_loss_clip": 0.01294995, + "auxiliary_loss_mlp": 0.00275396, + "balance_loss_clip": 1.07985592, + "balance_loss_mlp": 0.23989575, + "epoch": 0.5085224710656846, + "flos": 22346133384960.0, + "grad_norm": 3.6900447442513897, + "language_loss": 0.75521559, + "learning_rate": 2.0418640730302644e-06, + "loss": 0.77091956, + "num_input_tokens_seen": 181896450, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.35498047, + "step": 8458, + "time_per_iteration": 2.7170698642730713 + }, + { + "auxiliary_loss_clip": 0.01286353, + "auxiliary_loss_mlp": 0.00253474, + "balance_loss_clip": 1.07175815, + "balance_loss_mlp": 0.21949948, + "epoch": 0.5085825943183526, + "flos": 26066263115520.0, + "grad_norm": 3.5831845483761664, + "language_loss": 0.83821005, + "learning_rate": 2.0414746958882043e-06, + "loss": 0.85360837, + "num_input_tokens_seen": 181916770, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.33959961, + "step": 8459, + "time_per_iteration": 2.7478222846984863 + }, + { + "auxiliary_loss_clip": 0.01302362, + "auxiliary_loss_mlp": 0.00272226, + "balance_loss_clip": 1.08589482, + "balance_loss_mlp": 0.23715451, + "epoch": 0.5086427175710206, + "flos": 17420733768960.0, + "grad_norm": 12.85302653753024, + "language_loss": 0.88289273, + "learning_rate": 2.0410853171734196e-06, + "loss": 0.89863861, + "num_input_tokens_seen": 181932710, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.35083008, + "step": 8460, + "time_per_iteration": 4.184171915054321 + }, + { + "auxiliary_loss_clip": 0.01301122, + "auxiliary_loss_mlp": 0.00276113, + "balance_loss_clip": 1.08526433, + "balance_loss_mlp": 0.24068442, + "epoch": 0.5087028408236886, + "flos": 20631758083200.0, + "grad_norm": 52.2493006133263, + "language_loss": 0.78074396, + "learning_rate": 2.0406959369006754e-06, + "loss": 0.7965163, + "num_input_tokens_seen": 181950665, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.35449219, + "step": 8461, + "time_per_iteration": 2.6320743560791016 + }, + { + "auxiliary_loss_clip": 0.01278118, + "auxiliary_loss_mlp": 0.00240568, + "balance_loss_clip": 1.07236862, + "balance_loss_mlp": 0.21000226, + "epoch": 0.5087629640763566, + "flos": 25593822506880.0, + "grad_norm": 32.93595865379383, + "language_loss": 0.82978296, + "learning_rate": 2.0403065550847375e-06, + "loss": 0.84496987, + "num_input_tokens_seen": 181971270, + "router_z_loss_clip": 2.05859375, + "router_z_loss_mlp": 0.30541992, + "step": 8462, + "time_per_iteration": 2.6529345512390137 + }, + { + "auxiliary_loss_clip": 0.01301865, + "auxiliary_loss_mlp": 0.00287131, + "balance_loss_clip": 1.08808255, + "balance_loss_mlp": 0.25465804, + "epoch": 0.5088230873290245, + "flos": 13261631927040.0, + "grad_norm": 13.149036555126095, + "language_loss": 0.89704221, + "learning_rate": 2.0399171717403706e-06, + "loss": 0.91293216, + "num_input_tokens_seen": 181988410, + "router_z_loss_clip": 2.13574219, + "router_z_loss_mlp": 0.32446289, + "step": 8463, + "time_per_iteration": 4.028928756713867 + }, + { + "auxiliary_loss_clip": 0.01292216, + "auxiliary_loss_mlp": 0.00277408, + "balance_loss_clip": 1.0821594, + "balance_loss_mlp": 0.24302843, + "epoch": 0.5088832105816925, + "flos": 20043469134720.0, + "grad_norm": 446.6297308732057, + "language_loss": 0.82334453, + "learning_rate": 2.039527786882341e-06, + "loss": 0.83904076, + "num_input_tokens_seen": 182006530, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.34375, + "step": 8464, + "time_per_iteration": 4.010192632675171 + }, + { + "auxiliary_loss_clip": 0.01284945, + "auxiliary_loss_mlp": 0.00040715, + "balance_loss_clip": 1.12230432, + "balance_loss_mlp": 0.02593261, + "epoch": 0.5089433338343604, + "flos": 67422179018880.0, + "grad_norm": 0.6792131237838422, + "language_loss": 0.5871287, + "learning_rate": 2.0391384005254133e-06, + "loss": 0.60038531, + "num_input_tokens_seen": 182074240, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.14746094, + "step": 8465, + "time_per_iteration": 3.2345855236053467 + }, + { + "auxiliary_loss_clip": 0.01286908, + "auxiliary_loss_mlp": 0.00249595, + "balance_loss_clip": 1.07851255, + "balance_loss_mlp": 0.21640718, + "epoch": 0.5090034570870284, + "flos": 22710339336960.0, + "grad_norm": 12.533113653804534, + "language_loss": 0.88579273, + "learning_rate": 2.038749012684354e-06, + "loss": 0.90115786, + "num_input_tokens_seen": 182093360, + "router_z_loss_clip": 2.08496094, + "router_z_loss_mlp": 0.33178711, + "step": 8466, + "time_per_iteration": 2.639962673187256 + }, + { + "auxiliary_loss_clip": 0.01292944, + "auxiliary_loss_mlp": 0.00256303, + "balance_loss_clip": 1.08256459, + "balance_loss_mlp": 0.2242592, + "epoch": 0.5090635803396963, + "flos": 20445812352000.0, + "grad_norm": 11.630830447566353, + "language_loss": 0.84543556, + "learning_rate": 2.0383596233739286e-06, + "loss": 0.860928, + "num_input_tokens_seen": 182110170, + "router_z_loss_clip": 2.10449219, + "router_z_loss_mlp": 0.32006836, + "step": 8467, + "time_per_iteration": 2.6207423210144043 + }, + { + "auxiliary_loss_clip": 0.01294088, + "auxiliary_loss_mlp": 0.00279172, + "balance_loss_clip": 1.0875771, + "balance_loss_mlp": 0.24731922, + "epoch": 0.5091237035923644, + "flos": 23768878164480.0, + "grad_norm": 4.746928782429192, + "language_loss": 0.80885452, + "learning_rate": 2.0379702326089013e-06, + "loss": 0.82458711, + "num_input_tokens_seen": 182129570, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.31835938, + "step": 8468, + "time_per_iteration": 2.6897740364074707 + }, + { + "auxiliary_loss_clip": 0.01305068, + "auxiliary_loss_mlp": 0.00243002, + "balance_loss_clip": 1.09075534, + "balance_loss_mlp": 0.20914689, + "epoch": 0.5091838268450323, + "flos": 18327908684160.0, + "grad_norm": 32.499069481428066, + "language_loss": 0.84246469, + "learning_rate": 2.03758084040404e-06, + "loss": 0.85794532, + "num_input_tokens_seen": 182147565, + "router_z_loss_clip": 2.14160156, + "router_z_loss_mlp": 0.33837891, + "step": 8469, + "time_per_iteration": 2.69844913482666 + }, + { + "auxiliary_loss_clip": 0.01319003, + "auxiliary_loss_mlp": 0.00267409, + "balance_loss_clip": 1.10307264, + "balance_loss_mlp": 0.23550904, + "epoch": 0.5092439500977003, + "flos": 29057621806080.0, + "grad_norm": 10.626290995701492, + "language_loss": 0.74347407, + "learning_rate": 2.037191446774109e-06, + "loss": 0.7593382, + "num_input_tokens_seen": 182169695, + "router_z_loss_clip": 2.15722656, + "router_z_loss_mlp": 0.31884766, + "step": 8470, + "time_per_iteration": 4.25928807258606 + }, + { + "auxiliary_loss_clip": 0.01306146, + "auxiliary_loss_mlp": 0.00262862, + "balance_loss_clip": 1.09083986, + "balance_loss_mlp": 0.22988835, + "epoch": 0.5093040733503682, + "flos": 13553908894080.0, + "grad_norm": 32.93261253484889, + "language_loss": 0.81251764, + "learning_rate": 2.0368020517338745e-06, + "loss": 0.82820773, + "num_input_tokens_seen": 182186385, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.32958984, + "step": 8471, + "time_per_iteration": 2.7232437133789062 + }, + { + "auxiliary_loss_clip": 0.01301485, + "auxiliary_loss_mlp": 0.00045935, + "balance_loss_clip": 1.15952682, + "balance_loss_mlp": 0.03615949, + "epoch": 0.5093641966030362, + "flos": 68906617407360.0, + "grad_norm": 0.7480140293361, + "language_loss": 0.5787183, + "learning_rate": 2.036412655298103e-06, + "loss": 0.59219253, + "num_input_tokens_seen": 182247095, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.09765625, + "step": 8472, + "time_per_iteration": 3.1409759521484375 + }, + { + "auxiliary_loss_clip": 0.01305438, + "auxiliary_loss_mlp": 0.00248349, + "balance_loss_clip": 1.09115779, + "balance_loss_mlp": 0.21780801, + "epoch": 0.5094243198557042, + "flos": 21580948932480.0, + "grad_norm": 152.4589054260386, + "language_loss": 0.75193262, + "learning_rate": 2.03602325748156e-06, + "loss": 0.76747054, + "num_input_tokens_seen": 182266380, + "router_z_loss_clip": 2.14160156, + "router_z_loss_mlp": 0.30517578, + "step": 8473, + "time_per_iteration": 2.6787497997283936 + }, + { + "auxiliary_loss_clip": 0.01310457, + "auxiliary_loss_mlp": 0.00241222, + "balance_loss_clip": 1.09423709, + "balance_loss_mlp": 0.20772411, + "epoch": 0.5094844431083722, + "flos": 28840721529600.0, + "grad_norm": 11.837596903857097, + "language_loss": 0.92461675, + "learning_rate": 2.0356338582990105e-06, + "loss": 0.94013357, + "num_input_tokens_seen": 182284685, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.33520508, + "step": 8474, + "time_per_iteration": 2.7184979915618896 + }, + { + "auxiliary_loss_clip": 0.01317132, + "auxiliary_loss_mlp": 0.00248654, + "balance_loss_clip": 1.10054588, + "balance_loss_mlp": 0.21544194, + "epoch": 0.5095445663610402, + "flos": 14976114969600.0, + "grad_norm": 17.386978704838047, + "language_loss": 0.70518517, + "learning_rate": 2.035244457765222e-06, + "loss": 0.72084302, + "num_input_tokens_seen": 182301810, + "router_z_loss_clip": 2.16308594, + "router_z_loss_mlp": 0.33203125, + "step": 8475, + "time_per_iteration": 2.6275157928466797 + }, + { + "auxiliary_loss_clip": 0.01327171, + "auxiliary_loss_mlp": 0.00267896, + "balance_loss_clip": 1.10769415, + "balance_loss_mlp": 0.2356734, + "epoch": 0.5096046896137081, + "flos": 20777088510720.0, + "grad_norm": 7.641110800281134, + "language_loss": 0.89504814, + "learning_rate": 2.0348550558949605e-06, + "loss": 0.91099882, + "num_input_tokens_seen": 182320285, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.3223877, + "step": 8476, + "time_per_iteration": 2.667182207107544 + }, + { + "auxiliary_loss_clip": 0.01320261, + "auxiliary_loss_mlp": 0.00294748, + "balance_loss_clip": 1.10065746, + "balance_loss_mlp": 0.25981927, + "epoch": 0.5096648128663761, + "flos": 23185078416000.0, + "grad_norm": 40.08769832645642, + "language_loss": 0.91213125, + "learning_rate": 2.0344656527029917e-06, + "loss": 0.92828131, + "num_input_tokens_seen": 182339465, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.34936523, + "step": 8477, + "time_per_iteration": 2.6616976261138916 + }, + { + "auxiliary_loss_clip": 0.01316302, + "auxiliary_loss_mlp": 0.00250758, + "balance_loss_clip": 1.10213482, + "balance_loss_mlp": 0.2181896, + "epoch": 0.509724936119044, + "flos": 22309432663680.0, + "grad_norm": 72.4588935820032, + "language_loss": 0.70001853, + "learning_rate": 2.034076248204082e-06, + "loss": 0.71568906, + "num_input_tokens_seen": 182358375, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.32592773, + "step": 8478, + "time_per_iteration": 2.665623903274536 + }, + { + "auxiliary_loss_clip": 0.01305338, + "auxiliary_loss_mlp": 0.0025625, + "balance_loss_clip": 1.09520936, + "balance_loss_mlp": 0.22339615, + "epoch": 0.509785059371712, + "flos": 26287077974400.0, + "grad_norm": 19.684098157002953, + "language_loss": 0.74988538, + "learning_rate": 2.0336868424129968e-06, + "loss": 0.76550126, + "num_input_tokens_seen": 182377935, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.32836914, + "step": 8479, + "time_per_iteration": 2.691483736038208 + }, + { + "auxiliary_loss_clip": 0.01309311, + "auxiliary_loss_mlp": 0.0024118, + "balance_loss_clip": 1.09731936, + "balance_loss_mlp": 0.20947018, + "epoch": 0.50984518262438, + "flos": 22964586779520.0, + "grad_norm": 20.152500716694146, + "language_loss": 0.77425849, + "learning_rate": 2.0332974353445037e-06, + "loss": 0.78976333, + "num_input_tokens_seen": 182396440, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.31713867, + "step": 8480, + "time_per_iteration": 2.771641254425049 + }, + { + "auxiliary_loss_clip": 0.01315363, + "auxiliary_loss_mlp": 0.00237625, + "balance_loss_clip": 1.09620786, + "balance_loss_mlp": 0.2057727, + "epoch": 0.509905305877048, + "flos": 26213389223040.0, + "grad_norm": 13.064770799458795, + "language_loss": 0.84728092, + "learning_rate": 2.0329080270133688e-06, + "loss": 0.86281085, + "num_input_tokens_seen": 182415890, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.31860352, + "step": 8481, + "time_per_iteration": 2.701718330383301 + }, + { + "auxiliary_loss_clip": 0.01307089, + "auxiliary_loss_mlp": 0.0023332, + "balance_loss_clip": 1.0942353, + "balance_loss_mlp": 0.20199221, + "epoch": 0.5099654291297159, + "flos": 20340055733760.0, + "grad_norm": 118.11644890954022, + "language_loss": 0.88855183, + "learning_rate": 2.0325186174343578e-06, + "loss": 0.90395594, + "num_input_tokens_seen": 182434235, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.31347656, + "step": 8482, + "time_per_iteration": 2.61818265914917 + }, + { + "auxiliary_loss_clip": 0.01323387, + "auxiliary_loss_mlp": 0.00253911, + "balance_loss_clip": 1.10021901, + "balance_loss_mlp": 0.22115257, + "epoch": 0.5100255523823839, + "flos": 29054820545280.0, + "grad_norm": 47.631173307637034, + "language_loss": 0.9205929, + "learning_rate": 2.032129206622238e-06, + "loss": 0.93636584, + "num_input_tokens_seen": 182454360, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.32714844, + "step": 8483, + "time_per_iteration": 2.706202983856201 + }, + { + "auxiliary_loss_clip": 0.01312228, + "auxiliary_loss_mlp": 0.00240649, + "balance_loss_clip": 1.0969888, + "balance_loss_mlp": 0.21046518, + "epoch": 0.5100856756350518, + "flos": 22455912326400.0, + "grad_norm": 138.47691000063756, + "language_loss": 0.90027261, + "learning_rate": 2.031739794591775e-06, + "loss": 0.91580135, + "num_input_tokens_seen": 182471940, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.30175781, + "step": 8484, + "time_per_iteration": 2.665294885635376 + }, + { + "auxiliary_loss_clip": 0.01298221, + "auxiliary_loss_mlp": 0.00264153, + "balance_loss_clip": 1.08722949, + "balance_loss_mlp": 0.23239595, + "epoch": 0.5101457988877198, + "flos": 19171055606400.0, + "grad_norm": 13.656938368720379, + "language_loss": 0.88581443, + "learning_rate": 2.031350381357736e-06, + "loss": 0.90143824, + "num_input_tokens_seen": 182490685, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.31762695, + "step": 8485, + "time_per_iteration": 2.6119463443756104 + }, + { + "auxiliary_loss_clip": 0.01298503, + "auxiliary_loss_mlp": 0.0024403, + "balance_loss_clip": 1.09034216, + "balance_loss_mlp": 0.21375114, + "epoch": 0.5102059221403878, + "flos": 14866371941760.0, + "grad_norm": 126.14869748625088, + "language_loss": 0.79607844, + "learning_rate": 2.0309609669348874e-06, + "loss": 0.81150377, + "num_input_tokens_seen": 182508325, + "router_z_loss_clip": 2.0859375, + "router_z_loss_mlp": 0.30297852, + "step": 8486, + "time_per_iteration": 2.62530255317688 + }, + { + "auxiliary_loss_clip": 0.01326552, + "auxiliary_loss_mlp": 0.00255083, + "balance_loss_clip": 1.10584903, + "balance_loss_mlp": 0.22330174, + "epoch": 0.5102660453930558, + "flos": 22961103160320.0, + "grad_norm": 22.562402633572344, + "language_loss": 0.77014875, + "learning_rate": 2.0305715513379953e-06, + "loss": 0.78596509, + "num_input_tokens_seen": 182527020, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.31787109, + "step": 8487, + "time_per_iteration": 2.628732919692993 + }, + { + "auxiliary_loss_clip": 0.01319589, + "auxiliary_loss_mlp": 0.00245602, + "balance_loss_clip": 1.10466707, + "balance_loss_mlp": 0.21489391, + "epoch": 0.5103261686457238, + "flos": 23149311448320.0, + "grad_norm": 95.41460318245436, + "language_loss": 0.7885403, + "learning_rate": 2.030182134581827e-06, + "loss": 0.8041923, + "num_input_tokens_seen": 182543505, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.30712891, + "step": 8488, + "time_per_iteration": 2.671250820159912 + }, + { + "auxiliary_loss_clip": 0.01323837, + "auxiliary_loss_mlp": 0.00239227, + "balance_loss_clip": 1.10266006, + "balance_loss_mlp": 0.21084291, + "epoch": 0.5103862918983917, + "flos": 14319237000960.0, + "grad_norm": 62.33357847033048, + "language_loss": 0.79148042, + "learning_rate": 2.0297927166811503e-06, + "loss": 0.80711102, + "num_input_tokens_seen": 182562250, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.28381348, + "step": 8489, + "time_per_iteration": 2.6053555011749268 + }, + { + "auxiliary_loss_clip": 0.01301137, + "auxiliary_loss_mlp": 0.00248596, + "balance_loss_clip": 1.08838439, + "balance_loss_mlp": 0.21791139, + "epoch": 0.5104464151510597, + "flos": 25848536826240.0, + "grad_norm": 10.146232131850132, + "language_loss": 0.79599714, + "learning_rate": 2.0294032976507297e-06, + "loss": 0.81149447, + "num_input_tokens_seen": 182581910, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.30664062, + "step": 8490, + "time_per_iteration": 2.681851387023926 + }, + { + "auxiliary_loss_clip": 0.01288683, + "auxiliary_loss_mlp": 0.00243332, + "balance_loss_clip": 1.08039582, + "balance_loss_mlp": 0.21472159, + "epoch": 0.5105065384037276, + "flos": 21652913831040.0, + "grad_norm": 8.05218828763606, + "language_loss": 0.87145436, + "learning_rate": 2.0290138775053337e-06, + "loss": 0.88677454, + "num_input_tokens_seen": 182601350, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.28588867, + "step": 8491, + "time_per_iteration": 2.665956735610962 + }, + { + "auxiliary_loss_clip": 0.01284165, + "auxiliary_loss_mlp": 0.00216662, + "balance_loss_clip": 1.08198142, + "balance_loss_mlp": 0.18963754, + "epoch": 0.5105666616563956, + "flos": 22491571553280.0, + "grad_norm": 10.549271272895274, + "language_loss": 0.86861056, + "learning_rate": 2.028624456259728e-06, + "loss": 0.88361883, + "num_input_tokens_seen": 182619660, + "router_z_loss_clip": 2.01855469, + "router_z_loss_mlp": 0.27001953, + "step": 8492, + "time_per_iteration": 2.742173194885254 + }, + { + "auxiliary_loss_clip": 0.01326494, + "auxiliary_loss_mlp": 0.00242409, + "balance_loss_clip": 1.10094237, + "balance_loss_mlp": 0.21049632, + "epoch": 0.5106267849090635, + "flos": 22455768672000.0, + "grad_norm": 14.036177486165249, + "language_loss": 0.85189164, + "learning_rate": 2.0282350339286804e-06, + "loss": 0.86758065, + "num_input_tokens_seen": 182639815, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.31896973, + "step": 8493, + "time_per_iteration": 2.732360601425171 + }, + { + "auxiliary_loss_clip": 0.01315362, + "auxiliary_loss_mlp": 0.00262983, + "balance_loss_clip": 1.09861326, + "balance_loss_mlp": 0.23260829, + "epoch": 0.5106869081617316, + "flos": 23547093638400.0, + "grad_norm": 3.4696409386257905, + "language_loss": 0.89904654, + "learning_rate": 2.0278456105269574e-06, + "loss": 0.91482997, + "num_input_tokens_seen": 182659655, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.3034668, + "step": 8494, + "time_per_iteration": 2.652782678604126 + }, + { + "auxiliary_loss_clip": 0.01307463, + "auxiliary_loss_mlp": 0.00261805, + "balance_loss_clip": 1.09132278, + "balance_loss_mlp": 0.23092949, + "epoch": 0.5107470314143995, + "flos": 26792987080320.0, + "grad_norm": 2.998870048337578, + "language_loss": 0.84829956, + "learning_rate": 2.027456186069326e-06, + "loss": 0.86399221, + "num_input_tokens_seen": 182677075, + "router_z_loss_clip": 2.16113281, + "router_z_loss_mlp": 0.30908203, + "step": 8495, + "time_per_iteration": 2.6659493446350098 + }, + { + "auxiliary_loss_clip": 0.01311973, + "auxiliary_loss_mlp": 0.00270541, + "balance_loss_clip": 1.09502435, + "balance_loss_mlp": 0.23854569, + "epoch": 0.5108071546670675, + "flos": 25739691638400.0, + "grad_norm": 22.486743027263202, + "language_loss": 0.8512634, + "learning_rate": 2.0270667605705535e-06, + "loss": 0.86708856, + "num_input_tokens_seen": 182699625, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.31982422, + "step": 8496, + "time_per_iteration": 2.6910994052886963 + }, + { + "auxiliary_loss_clip": 0.01299275, + "auxiliary_loss_mlp": 0.00256343, + "balance_loss_clip": 1.08994067, + "balance_loss_mlp": 0.22642121, + "epoch": 0.5108672779197354, + "flos": 18697537589760.0, + "grad_norm": 27.734630306463366, + "language_loss": 0.85657007, + "learning_rate": 2.0266773340454066e-06, + "loss": 0.87212622, + "num_input_tokens_seen": 182717020, + "router_z_loss_clip": 2.09472656, + "router_z_loss_mlp": 0.29919434, + "step": 8497, + "time_per_iteration": 2.6414034366607666 + }, + { + "auxiliary_loss_clip": 0.01316332, + "auxiliary_loss_mlp": 0.00236334, + "balance_loss_clip": 1.09933352, + "balance_loss_mlp": 0.20668659, + "epoch": 0.5109274011724034, + "flos": 26688164215680.0, + "grad_norm": 10.323675546996526, + "language_loss": 0.87303859, + "learning_rate": 2.0262879065086525e-06, + "loss": 0.88856524, + "num_input_tokens_seen": 182736955, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.29675293, + "step": 8498, + "time_per_iteration": 2.7126123905181885 + }, + { + "auxiliary_loss_clip": 0.01306805, + "auxiliary_loss_mlp": 0.00246393, + "balance_loss_clip": 1.0968653, + "balance_loss_mlp": 0.21663833, + "epoch": 0.5109875244250714, + "flos": 22784028088320.0, + "grad_norm": 6.115657491781187, + "language_loss": 0.79758054, + "learning_rate": 2.0258984779750584e-06, + "loss": 0.8131125, + "num_input_tokens_seen": 182757620, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.29760742, + "step": 8499, + "time_per_iteration": 2.6821248531341553 + }, + { + "auxiliary_loss_clip": 0.01311965, + "auxiliary_loss_mlp": 0.00242558, + "balance_loss_clip": 1.09523642, + "balance_loss_mlp": 0.21337564, + "epoch": 0.5110476476777394, + "flos": 35588515622400.0, + "grad_norm": 25.352956061327028, + "language_loss": 0.78399205, + "learning_rate": 2.0255090484593914e-06, + "loss": 0.7995373, + "num_input_tokens_seen": 182780195, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.29150391, + "step": 8500, + "time_per_iteration": 2.7731637954711914 + }, + { + "auxiliary_loss_clip": 0.01321705, + "auxiliary_loss_mlp": 0.00281464, + "balance_loss_clip": 1.0948329, + "balance_loss_mlp": 0.24770394, + "epoch": 0.5111077709304074, + "flos": 19280798634240.0, + "grad_norm": 5.411947949582414, + "language_loss": 0.74715716, + "learning_rate": 2.0251196179764183e-06, + "loss": 0.76318884, + "num_input_tokens_seen": 182795765, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.33740234, + "step": 8501, + "time_per_iteration": 2.6389389038085938 + }, + { + "auxiliary_loss_clip": 0.01309816, + "auxiliary_loss_mlp": 0.00254911, + "balance_loss_clip": 1.0900631, + "balance_loss_mlp": 0.22384493, + "epoch": 0.5111678941830753, + "flos": 20668207409280.0, + "grad_norm": 28.801695260416466, + "language_loss": 0.95031214, + "learning_rate": 2.024730186540907e-06, + "loss": 0.96595937, + "num_input_tokens_seen": 182813120, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.31079102, + "step": 8502, + "time_per_iteration": 4.112308979034424 + }, + { + "auxiliary_loss_clip": 0.0129652, + "auxiliary_loss_mlp": 0.00265425, + "balance_loss_clip": 1.08558214, + "balance_loss_mlp": 0.23493141, + "epoch": 0.5112280174357433, + "flos": 26287903987200.0, + "grad_norm": 18.874767344309955, + "language_loss": 0.88513637, + "learning_rate": 2.0243407541676253e-06, + "loss": 0.90075582, + "num_input_tokens_seen": 182835745, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.3046875, + "step": 8503, + "time_per_iteration": 2.768415927886963 + }, + { + "auxiliary_loss_clip": 0.01282991, + "auxiliary_loss_mlp": 0.00132425, + "balance_loss_clip": 1.14478922, + "balance_loss_mlp": 0.12551077, + "epoch": 0.5112881406884112, + "flos": 59474247707520.0, + "grad_norm": 3.3641392976996745, + "language_loss": 0.63695085, + "learning_rate": 2.023951320871339e-06, + "loss": 0.65110493, + "num_input_tokens_seen": 182892540, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.06933594, + "step": 8504, + "time_per_iteration": 3.143454074859619 + }, + { + "auxiliary_loss_clip": 0.01312248, + "auxiliary_loss_mlp": 0.00249552, + "balance_loss_clip": 1.09652495, + "balance_loss_mlp": 0.21932048, + "epoch": 0.5113482639410792, + "flos": 26468857728000.0, + "grad_norm": 360.2809455277972, + "language_loss": 0.90587717, + "learning_rate": 2.023561886666816e-06, + "loss": 0.9214952, + "num_input_tokens_seen": 182911515, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.30212402, + "step": 8505, + "time_per_iteration": 4.072691202163696 + }, + { + "auxiliary_loss_clip": 0.01311972, + "auxiliary_loss_mlp": 0.00248951, + "balance_loss_clip": 1.09922338, + "balance_loss_mlp": 0.21979198, + "epoch": 0.5114083871937471, + "flos": 29895848565120.0, + "grad_norm": 8.739401977587278, + "language_loss": 0.81953675, + "learning_rate": 2.0231724515688246e-06, + "loss": 0.83514595, + "num_input_tokens_seen": 182930860, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.29150391, + "step": 8506, + "time_per_iteration": 2.6735408306121826 + }, + { + "auxiliary_loss_clip": 0.01300492, + "auxiliary_loss_mlp": 0.00271401, + "balance_loss_clip": 1.08506823, + "balance_loss_mlp": 0.23777199, + "epoch": 0.5114685104464152, + "flos": 24314576561280.0, + "grad_norm": 5.08256383253446, + "language_loss": 0.68369132, + "learning_rate": 2.022783015592131e-06, + "loss": 0.69941026, + "num_input_tokens_seen": 182949960, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.33630371, + "step": 8507, + "time_per_iteration": 4.1303184032440186 + }, + { + "auxiliary_loss_clip": 0.0130656, + "auxiliary_loss_mlp": 0.00228027, + "balance_loss_clip": 1.09470713, + "balance_loss_mlp": 0.1979503, + "epoch": 0.5115286336990831, + "flos": 17019288391680.0, + "grad_norm": 16.254968202248065, + "language_loss": 0.9122653, + "learning_rate": 2.022393578751503e-06, + "loss": 0.92761111, + "num_input_tokens_seen": 182968085, + "router_z_loss_clip": 2.11621094, + "router_z_loss_mlp": 0.30078125, + "step": 8508, + "time_per_iteration": 2.6117300987243652 + }, + { + "auxiliary_loss_clip": 0.01317888, + "auxiliary_loss_mlp": 0.00245029, + "balance_loss_clip": 1.09940171, + "balance_loss_mlp": 0.21520317, + "epoch": 0.5115887569517511, + "flos": 23659386531840.0, + "grad_norm": 6.065328738655889, + "language_loss": 0.78097463, + "learning_rate": 2.022004141061709e-06, + "loss": 0.7966038, + "num_input_tokens_seen": 182987275, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.29833984, + "step": 8509, + "time_per_iteration": 2.688170909881592 + }, + { + "auxiliary_loss_clip": 0.01284605, + "auxiliary_loss_mlp": 0.00242021, + "balance_loss_clip": 1.0816716, + "balance_loss_mlp": 0.21521118, + "epoch": 0.511648880204419, + "flos": 16107193313280.0, + "grad_norm": 13.316060235560677, + "language_loss": 0.81346655, + "learning_rate": 2.0216147025375153e-06, + "loss": 0.82873279, + "num_input_tokens_seen": 183004700, + "router_z_loss_clip": 2.02929688, + "router_z_loss_mlp": 0.26818848, + "step": 8510, + "time_per_iteration": 2.609095811843872 + }, + { + "auxiliary_loss_clip": 0.0130141, + "auxiliary_loss_mlp": 0.00219095, + "balance_loss_clip": 1.09338987, + "balance_loss_mlp": 0.19089019, + "epoch": 0.511709003457087, + "flos": 32634970974720.0, + "grad_norm": 4.633752670743582, + "language_loss": 0.78483927, + "learning_rate": 2.0212252631936907e-06, + "loss": 0.8000443, + "num_input_tokens_seen": 183025830, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.2824707, + "step": 8511, + "time_per_iteration": 2.7290170192718506 + }, + { + "auxiliary_loss_clip": 0.01308644, + "auxiliary_loss_mlp": 0.00245448, + "balance_loss_clip": 1.09849, + "balance_loss_mlp": 0.21650442, + "epoch": 0.511769126709755, + "flos": 21762082241280.0, + "grad_norm": 11.145708259270094, + "language_loss": 0.75608617, + "learning_rate": 2.020835823045001e-06, + "loss": 0.77162707, + "num_input_tokens_seen": 183045140, + "router_z_loss_clip": 2.10253906, + "router_z_loss_mlp": 0.28955078, + "step": 8512, + "time_per_iteration": 4.115967035293579 + }, + { + "auxiliary_loss_clip": 0.01306349, + "auxiliary_loss_mlp": 0.00247244, + "balance_loss_clip": 1.0918256, + "balance_loss_mlp": 0.21436571, + "epoch": 0.511829249962423, + "flos": 23915357827200.0, + "grad_norm": 30.27477105736768, + "language_loss": 0.74868524, + "learning_rate": 2.0204463821062146e-06, + "loss": 0.76422113, + "num_input_tokens_seen": 183063935, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.32861328, + "step": 8513, + "time_per_iteration": 2.710601568222046 + }, + { + "auxiliary_loss_clip": 0.01321709, + "auxiliary_loss_mlp": 0.00234453, + "balance_loss_clip": 1.10501623, + "balance_loss_mlp": 0.20534185, + "epoch": 0.511889373215091, + "flos": 23727005884800.0, + "grad_norm": 4.668750131548081, + "language_loss": 0.75076085, + "learning_rate": 2.0200569403921e-06, + "loss": 0.76632249, + "num_input_tokens_seen": 183084135, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.29125977, + "step": 8514, + "time_per_iteration": 2.7405760288238525 + }, + { + "auxiliary_loss_clip": 0.01303115, + "auxiliary_loss_mlp": 0.00241477, + "balance_loss_clip": 1.09142852, + "balance_loss_mlp": 0.21417816, + "epoch": 0.5119494964677589, + "flos": 28111519526400.0, + "grad_norm": 263.6517526992869, + "language_loss": 0.7140367, + "learning_rate": 2.019667497917424e-06, + "loss": 0.72948265, + "num_input_tokens_seen": 183104570, + "router_z_loss_clip": 2.11425781, + "router_z_loss_mlp": 0.27282715, + "step": 8515, + "time_per_iteration": 2.7509236335754395 + }, + { + "auxiliary_loss_clip": 0.01302007, + "auxiliary_loss_mlp": 0.00227464, + "balance_loss_clip": 1.09122503, + "balance_loss_mlp": 0.20033219, + "epoch": 0.5120096197204269, + "flos": 24973214296320.0, + "grad_norm": 29.375448356283805, + "language_loss": 0.82164121, + "learning_rate": 2.019278054696955e-06, + "loss": 0.83693588, + "num_input_tokens_seen": 183123850, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.27111816, + "step": 8516, + "time_per_iteration": 2.682950735092163 + }, + { + "auxiliary_loss_clip": 0.01331721, + "auxiliary_loss_mlp": 0.0026322, + "balance_loss_clip": 1.11561954, + "balance_loss_mlp": 0.23359665, + "epoch": 0.5120697429730948, + "flos": 17968012364160.0, + "grad_norm": 8.25869494932315, + "language_loss": 0.84183049, + "learning_rate": 2.0188886107454595e-06, + "loss": 0.85777986, + "num_input_tokens_seen": 183141725, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.29602051, + "step": 8517, + "time_per_iteration": 2.630096673965454 + }, + { + "auxiliary_loss_clip": 0.01330392, + "auxiliary_loss_mlp": 0.0025842, + "balance_loss_clip": 1.10625434, + "balance_loss_mlp": 0.22661456, + "epoch": 0.5121298662257628, + "flos": 23292343405440.0, + "grad_norm": 9.27130142498079, + "language_loss": 0.8107295, + "learning_rate": 2.0184991660777063e-06, + "loss": 0.8266176, + "num_input_tokens_seen": 183161300, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.31811523, + "step": 8518, + "time_per_iteration": 2.7049458026885986 + }, + { + "auxiliary_loss_clip": 0.01318213, + "auxiliary_loss_mlp": 0.00224619, + "balance_loss_clip": 1.1026535, + "balance_loss_mlp": 0.19410104, + "epoch": 0.5121899894784308, + "flos": 17311062568320.0, + "grad_norm": 45.75327703544161, + "language_loss": 0.88307118, + "learning_rate": 2.0181097207084625e-06, + "loss": 0.89849949, + "num_input_tokens_seen": 183180495, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.30541992, + "step": 8519, + "time_per_iteration": 2.652827262878418 + }, + { + "auxiliary_loss_clip": 0.01314247, + "auxiliary_loss_mlp": 0.00254392, + "balance_loss_clip": 1.10107255, + "balance_loss_mlp": 0.22536427, + "epoch": 0.5122501127310988, + "flos": 24930085040640.0, + "grad_norm": 571.1712046530563, + "language_loss": 0.87009263, + "learning_rate": 2.017720274652497e-06, + "loss": 0.88577902, + "num_input_tokens_seen": 183200330, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.28991699, + "step": 8520, + "time_per_iteration": 2.699948310852051 + }, + { + "auxiliary_loss_clip": 0.01327085, + "auxiliary_loss_mlp": 0.00276616, + "balance_loss_clip": 1.10744333, + "balance_loss_mlp": 0.24597907, + "epoch": 0.5123102359837667, + "flos": 18442859184000.0, + "grad_norm": 105.98418948983861, + "language_loss": 0.89361119, + "learning_rate": 2.0173308279245765e-06, + "loss": 0.90964818, + "num_input_tokens_seen": 183218230, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.3067627, + "step": 8521, + "time_per_iteration": 2.682382345199585 + }, + { + "auxiliary_loss_clip": 0.01315862, + "auxiliary_loss_mlp": 0.00241709, + "balance_loss_clip": 1.09973168, + "balance_loss_mlp": 0.21071422, + "epoch": 0.5123703592364347, + "flos": 26684860164480.0, + "grad_norm": 5.398366088232258, + "language_loss": 0.72024548, + "learning_rate": 2.0169413805394692e-06, + "loss": 0.73582125, + "num_input_tokens_seen": 183236735, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.31005859, + "step": 8522, + "time_per_iteration": 2.7388296127319336 + }, + { + "auxiliary_loss_clip": 0.01331191, + "auxiliary_loss_mlp": 0.00260834, + "balance_loss_clip": 1.10778666, + "balance_loss_mlp": 0.22759876, + "epoch": 0.5124304824891026, + "flos": 28803948981120.0, + "grad_norm": 23.29252831163486, + "language_loss": 0.70978272, + "learning_rate": 2.0165519325119433e-06, + "loss": 0.72570294, + "num_input_tokens_seen": 183257550, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.33203125, + "step": 8523, + "time_per_iteration": 2.696420669555664 + }, + { + "auxiliary_loss_clip": 0.0132494, + "auxiliary_loss_mlp": 0.00239373, + "balance_loss_clip": 1.11053395, + "balance_loss_mlp": 0.21164495, + "epoch": 0.5124906057417706, + "flos": 21761830846080.0, + "grad_norm": 6.345559632676771, + "language_loss": 0.83082169, + "learning_rate": 2.0161624838567656e-06, + "loss": 0.84646481, + "num_input_tokens_seen": 183275515, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.27709961, + "step": 8524, + "time_per_iteration": 2.6373908519744873 + }, + { + "auxiliary_loss_clip": 0.01301252, + "auxiliary_loss_mlp": 0.00232045, + "balance_loss_clip": 1.09406567, + "balance_loss_mlp": 0.20360178, + "epoch": 0.5125507289944387, + "flos": 18880538405760.0, + "grad_norm": 4.8536476067603225, + "language_loss": 0.81792569, + "learning_rate": 2.015773034588706e-06, + "loss": 0.83325875, + "num_input_tokens_seen": 183293880, + "router_z_loss_clip": 2.07421875, + "router_z_loss_mlp": 0.28442383, + "step": 8525, + "time_per_iteration": 2.664383888244629 + }, + { + "auxiliary_loss_clip": 0.01329718, + "auxiliary_loss_mlp": 0.00267424, + "balance_loss_clip": 1.11244845, + "balance_loss_mlp": 0.23592857, + "epoch": 0.5126108522471066, + "flos": 35627838036480.0, + "grad_norm": 10.008540859571776, + "language_loss": 0.80576456, + "learning_rate": 2.015383584722531e-06, + "loss": 0.82173598, + "num_input_tokens_seen": 183315860, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.31518555, + "step": 8526, + "time_per_iteration": 2.754840850830078 + }, + { + "auxiliary_loss_clip": 0.01321151, + "auxiliary_loss_mlp": 0.00258239, + "balance_loss_clip": 1.10550988, + "balance_loss_mlp": 0.22860309, + "epoch": 0.5126709754997746, + "flos": 20190918464640.0, + "grad_norm": 8.954339155284618, + "language_loss": 0.71682155, + "learning_rate": 2.0149941342730088e-06, + "loss": 0.73261547, + "num_input_tokens_seen": 183335480, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.29650879, + "step": 8527, + "time_per_iteration": 2.6380720138549805 + }, + { + "auxiliary_loss_clip": 0.01308387, + "auxiliary_loss_mlp": 0.00211215, + "balance_loss_clip": 1.10456371, + "balance_loss_mlp": 0.18178183, + "epoch": 0.5127310987524425, + "flos": 18588548747520.0, + "grad_norm": 62.52091897798757, + "language_loss": 0.80170536, + "learning_rate": 2.014604683254908e-06, + "loss": 0.81690139, + "num_input_tokens_seen": 183354395, + "router_z_loss_clip": 2.03808594, + "router_z_loss_mlp": 0.29431152, + "step": 8528, + "time_per_iteration": 2.6126840114593506 + }, + { + "auxiliary_loss_clip": 0.01313058, + "auxiliary_loss_mlp": 0.00250377, + "balance_loss_clip": 1.10071445, + "balance_loss_mlp": 0.2194784, + "epoch": 0.5127912220051105, + "flos": 22454691264000.0, + "grad_norm": 16.446021112361503, + "language_loss": 0.88676512, + "learning_rate": 2.014215231682995e-06, + "loss": 0.90239948, + "num_input_tokens_seen": 183372980, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.30908203, + "step": 8529, + "time_per_iteration": 2.6172664165496826 + }, + { + "auxiliary_loss_clip": 0.01310428, + "auxiliary_loss_mlp": 0.00246081, + "balance_loss_clip": 1.09994316, + "balance_loss_mlp": 0.2161116, + "epoch": 0.5128513452577784, + "flos": 19093703667840.0, + "grad_norm": 3.8738542864535743, + "language_loss": 0.79803091, + "learning_rate": 2.01382577957204e-06, + "loss": 0.81359595, + "num_input_tokens_seen": 183390160, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.29980469, + "step": 8530, + "time_per_iteration": 2.6011102199554443 + }, + { + "auxiliary_loss_clip": 0.01395475, + "auxiliary_loss_mlp": 0.00090571, + "balance_loss_clip": 1.25420904, + "balance_loss_mlp": 0.08217871, + "epoch": 0.5129114685104464, + "flos": 67892285243520.0, + "grad_norm": 0.7561724740301653, + "language_loss": 0.60380822, + "learning_rate": 2.0134363269368095e-06, + "loss": 0.61866868, + "num_input_tokens_seen": 183455280, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.08398438, + "step": 8531, + "time_per_iteration": 3.3527121543884277 + }, + { + "auxiliary_loss_clip": 0.01340876, + "auxiliary_loss_mlp": 0.00249544, + "balance_loss_clip": 1.11725748, + "balance_loss_mlp": 0.21843049, + "epoch": 0.5129715917631144, + "flos": 20449152316800.0, + "grad_norm": 72.64106910377896, + "language_loss": 0.85529923, + "learning_rate": 2.0130468737920725e-06, + "loss": 0.87120336, + "num_input_tokens_seen": 183473955, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.31103516, + "step": 8532, + "time_per_iteration": 2.831019163131714 + }, + { + "auxiliary_loss_clip": 0.01330257, + "auxiliary_loss_mlp": 0.00233015, + "balance_loss_clip": 1.11154175, + "balance_loss_mlp": 0.20392773, + "epoch": 0.5130317150157824, + "flos": 35116146840960.0, + "grad_norm": 107.32788476234585, + "language_loss": 0.73930424, + "learning_rate": 2.012657420152597e-06, + "loss": 0.75493705, + "num_input_tokens_seen": 183497195, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.29101562, + "step": 8533, + "time_per_iteration": 2.7663958072662354 + }, + { + "auxiliary_loss_clip": 0.01305959, + "auxiliary_loss_mlp": 0.00261388, + "balance_loss_clip": 1.0943476, + "balance_loss_mlp": 0.22979754, + "epoch": 0.5130918382684503, + "flos": 19791627903360.0, + "grad_norm": 10.615137030939177, + "language_loss": 0.87302768, + "learning_rate": 2.01226796603315e-06, + "loss": 0.8887012, + "num_input_tokens_seen": 183513675, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.31591797, + "step": 8534, + "time_per_iteration": 2.6639246940612793 + }, + { + "auxiliary_loss_clip": 0.01323886, + "auxiliary_loss_mlp": 0.00244601, + "balance_loss_clip": 1.10710144, + "balance_loss_mlp": 0.21448895, + "epoch": 0.5131519615211183, + "flos": 26323096337280.0, + "grad_norm": 1.8798263428697572, + "language_loss": 0.70081782, + "learning_rate": 2.0118785114485017e-06, + "loss": 0.71650267, + "num_input_tokens_seen": 183535165, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.30102539, + "step": 8535, + "time_per_iteration": 2.7494118213653564 + }, + { + "auxiliary_loss_clip": 0.01325972, + "auxiliary_loss_mlp": 0.00251163, + "balance_loss_clip": 1.11210322, + "balance_loss_mlp": 0.22361347, + "epoch": 0.5132120847737862, + "flos": 19171917532800.0, + "grad_norm": 56.31981144928579, + "language_loss": 0.75820208, + "learning_rate": 2.011489056413418e-06, + "loss": 0.77397335, + "num_input_tokens_seen": 183553780, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.27526855, + "step": 8536, + "time_per_iteration": 2.694150686264038 + }, + { + "auxiliary_loss_clip": 0.01331074, + "auxiliary_loss_mlp": 0.00258636, + "balance_loss_clip": 1.10406625, + "balance_loss_mlp": 0.22518633, + "epoch": 0.5132722080264542, + "flos": 20230420446720.0, + "grad_norm": 906.8943342572752, + "language_loss": 0.80616939, + "learning_rate": 2.011099600942669e-06, + "loss": 0.82206649, + "num_input_tokens_seen": 183572285, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.33447266, + "step": 8537, + "time_per_iteration": 2.633655548095703 + }, + { + "auxiliary_loss_clip": 0.01315918, + "auxiliary_loss_mlp": 0.00251759, + "balance_loss_clip": 1.09922528, + "balance_loss_mlp": 0.22157523, + "epoch": 0.5133323312791223, + "flos": 16469459930880.0, + "grad_norm": 107.45650582590834, + "language_loss": 0.87648046, + "learning_rate": 2.0107101450510214e-06, + "loss": 0.8921572, + "num_input_tokens_seen": 183589330, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.30151367, + "step": 8538, + "time_per_iteration": 2.616544723510742 + }, + { + "auxiliary_loss_clip": 0.01301845, + "auxiliary_loss_mlp": 0.0025063, + "balance_loss_clip": 1.08797443, + "balance_loss_mlp": 0.22240105, + "epoch": 0.5133924545317902, + "flos": 26068094709120.0, + "grad_norm": 188.41370126511399, + "language_loss": 0.85025084, + "learning_rate": 2.0103206887532437e-06, + "loss": 0.86577559, + "num_input_tokens_seen": 183609205, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.28222656, + "step": 8539, + "time_per_iteration": 2.752021551132202 + }, + { + "auxiliary_loss_clip": 0.0132142, + "auxiliary_loss_mlp": 0.00261287, + "balance_loss_clip": 1.10646749, + "balance_loss_mlp": 0.22993451, + "epoch": 0.5134525777844582, + "flos": 29131023248640.0, + "grad_norm": 46.226696416787576, + "language_loss": 0.81445551, + "learning_rate": 2.009931232064105e-06, + "loss": 0.83028269, + "num_input_tokens_seen": 183629985, + "router_z_loss_clip": 2.14746094, + "router_z_loss_mlp": 0.31347656, + "step": 8540, + "time_per_iteration": 2.7432947158813477 + }, + { + "auxiliary_loss_clip": 0.01326475, + "auxiliary_loss_mlp": 0.00290998, + "balance_loss_clip": 1.10340285, + "balance_loss_mlp": 0.25961018, + "epoch": 0.5135127010371261, + "flos": 17454776883840.0, + "grad_norm": 13.229944814442021, + "language_loss": 0.83198172, + "learning_rate": 2.0095417749983724e-06, + "loss": 0.84815639, + "num_input_tokens_seen": 183648220, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.31384277, + "step": 8541, + "time_per_iteration": 2.704939842224121 + }, + { + "auxiliary_loss_clip": 0.01295545, + "auxiliary_loss_mlp": 0.00250004, + "balance_loss_clip": 1.07975185, + "balance_loss_mlp": 0.21981996, + "epoch": 0.5135728242897941, + "flos": 21944975316480.0, + "grad_norm": 7.221556596690133, + "language_loss": 0.76648796, + "learning_rate": 2.0091523175708162e-06, + "loss": 0.78194344, + "num_input_tokens_seen": 183668230, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.30175781, + "step": 8542, + "time_per_iteration": 2.6243157386779785 + }, + { + "auxiliary_loss_clip": 0.01301907, + "auxiliary_loss_mlp": 0.00260607, + "balance_loss_clip": 1.08591413, + "balance_loss_mlp": 0.23092359, + "epoch": 0.513632947542462, + "flos": 22674859678080.0, + "grad_norm": 40.626844672477986, + "language_loss": 0.87501252, + "learning_rate": 2.0087628597962023e-06, + "loss": 0.89063764, + "num_input_tokens_seen": 183687800, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.296875, + "step": 8543, + "time_per_iteration": 2.6885883808135986 + }, + { + "auxiliary_loss_clip": 0.01296754, + "auxiliary_loss_mlp": 0.00230516, + "balance_loss_clip": 1.08696008, + "balance_loss_mlp": 0.20146433, + "epoch": 0.51369307079513, + "flos": 29457163762560.0, + "grad_norm": 14.696719186808535, + "language_loss": 0.75738263, + "learning_rate": 2.008373401689299e-06, + "loss": 0.77265537, + "num_input_tokens_seen": 183709025, + "router_z_loss_clip": 2.09667969, + "router_z_loss_mlp": 0.29052734, + "step": 8544, + "time_per_iteration": 4.090541362762451 + }, + { + "auxiliary_loss_clip": 0.0130241, + "auxiliary_loss_mlp": 0.00239473, + "balance_loss_clip": 1.08481574, + "balance_loss_mlp": 0.21216168, + "epoch": 0.513753194047798, + "flos": 18989347680000.0, + "grad_norm": 17.727606494495316, + "language_loss": 0.78630614, + "learning_rate": 2.0079839432648765e-06, + "loss": 0.80172491, + "num_input_tokens_seen": 183725740, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.27331543, + "step": 8545, + "time_per_iteration": 2.631136655807495 + }, + { + "auxiliary_loss_clip": 0.01293643, + "auxiliary_loss_mlp": 0.00269006, + "balance_loss_clip": 1.07575846, + "balance_loss_mlp": 0.23861967, + "epoch": 0.513813317300466, + "flos": 17821855923840.0, + "grad_norm": 3.8445616251395447, + "language_loss": 0.91920364, + "learning_rate": 2.0075944845377016e-06, + "loss": 0.93483013, + "num_input_tokens_seen": 183743995, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.30383301, + "step": 8546, + "time_per_iteration": 2.672301769256592 + }, + { + "auxiliary_loss_clip": 0.01297021, + "auxiliary_loss_mlp": 0.00261467, + "balance_loss_clip": 1.07361281, + "balance_loss_mlp": 0.23196244, + "epoch": 0.5138734405531339, + "flos": 24061191045120.0, + "grad_norm": 875.2027744885571, + "language_loss": 0.81051064, + "learning_rate": 2.007205025522544e-06, + "loss": 0.82609558, + "num_input_tokens_seen": 183764150, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.29528809, + "step": 8547, + "time_per_iteration": 4.110399007797241 + }, + { + "auxiliary_loss_clip": 0.01297232, + "auxiliary_loss_mlp": 0.00261492, + "balance_loss_clip": 1.0787214, + "balance_loss_mlp": 0.232131, + "epoch": 0.5139335638058019, + "flos": 26097253574400.0, + "grad_norm": 36.50938696856157, + "language_loss": 0.80683899, + "learning_rate": 2.0068155662341702e-06, + "loss": 0.82242632, + "num_input_tokens_seen": 183783280, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.29370117, + "step": 8548, + "time_per_iteration": 2.720400810241699 + }, + { + "auxiliary_loss_clip": 0.01289582, + "auxiliary_loss_mlp": 0.0025819, + "balance_loss_clip": 1.07923126, + "balance_loss_mlp": 0.22895969, + "epoch": 0.5139936870584698, + "flos": 18917095472640.0, + "grad_norm": 9.808784018770366, + "language_loss": 0.87828523, + "learning_rate": 2.0064261066873495e-06, + "loss": 0.89376295, + "num_input_tokens_seen": 183800725, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.29223633, + "step": 8549, + "time_per_iteration": 4.043980598449707 + }, + { + "auxiliary_loss_clip": 0.01286517, + "auxiliary_loss_mlp": 0.0024706, + "balance_loss_clip": 1.07233524, + "balance_loss_mlp": 0.21999986, + "epoch": 0.5140538103111378, + "flos": 16144001775360.0, + "grad_norm": 55.970292748203875, + "language_loss": 0.78338587, + "learning_rate": 2.0060366468968504e-06, + "loss": 0.79872167, + "num_input_tokens_seen": 183818735, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.27087402, + "step": 8550, + "time_per_iteration": 2.587674140930176 + }, + { + "auxiliary_loss_clip": 0.01297789, + "auxiliary_loss_mlp": 0.00267732, + "balance_loss_clip": 1.07515335, + "balance_loss_mlp": 0.23535466, + "epoch": 0.5141139335638057, + "flos": 22420145358720.0, + "grad_norm": 11.111596399133129, + "language_loss": 0.8185634, + "learning_rate": 2.0056471868774408e-06, + "loss": 0.83421862, + "num_input_tokens_seen": 183840015, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.32373047, + "step": 8551, + "time_per_iteration": 2.635011672973633 + }, + { + "auxiliary_loss_clip": 0.01297583, + "auxiliary_loss_mlp": 0.00235881, + "balance_loss_clip": 1.085289, + "balance_loss_mlp": 0.20759231, + "epoch": 0.5141740568164738, + "flos": 27089645506560.0, + "grad_norm": 11.984774902095037, + "language_loss": 0.77305079, + "learning_rate": 2.0052577266438897e-06, + "loss": 0.78838545, + "num_input_tokens_seen": 183860145, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.28295898, + "step": 8552, + "time_per_iteration": 2.6556344032287598 + }, + { + "auxiliary_loss_clip": 0.01290147, + "auxiliary_loss_mlp": 0.00264361, + "balance_loss_clip": 1.07222223, + "balance_loss_mlp": 0.23496385, + "epoch": 0.5142341800691418, + "flos": 24973250209920.0, + "grad_norm": 6.5990965493817635, + "language_loss": 0.82633114, + "learning_rate": 2.004868266210965e-06, + "loss": 0.84187615, + "num_input_tokens_seen": 183880540, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.29370117, + "step": 8553, + "time_per_iteration": 2.6707186698913574 + }, + { + "auxiliary_loss_clip": 0.01291496, + "auxiliary_loss_mlp": 0.0024763, + "balance_loss_clip": 1.07806587, + "balance_loss_mlp": 0.21947324, + "epoch": 0.5142943033218097, + "flos": 20704513080960.0, + "grad_norm": 378.63544239676037, + "language_loss": 0.75858963, + "learning_rate": 2.004478805593435e-06, + "loss": 0.77398086, + "num_input_tokens_seen": 183900895, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.28161621, + "step": 8554, + "time_per_iteration": 4.071545600891113 + }, + { + "auxiliary_loss_clip": 0.01307757, + "auxiliary_loss_mlp": 0.00263298, + "balance_loss_clip": 1.08481121, + "balance_loss_mlp": 0.23226783, + "epoch": 0.5143544265744777, + "flos": 22925479847040.0, + "grad_norm": 11.547098206660268, + "language_loss": 0.80634993, + "learning_rate": 2.004089344806068e-06, + "loss": 0.82206047, + "num_input_tokens_seen": 183920335, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.31030273, + "step": 8555, + "time_per_iteration": 2.666668653488159 + }, + { + "auxiliary_loss_clip": 0.01286898, + "auxiliary_loss_mlp": 0.00235379, + "balance_loss_clip": 1.0700618, + "balance_loss_mlp": 0.20650639, + "epoch": 0.5144145498271456, + "flos": 15921391236480.0, + "grad_norm": 5.352696268231434, + "language_loss": 0.89968944, + "learning_rate": 2.003699883863633e-06, + "loss": 0.91491222, + "num_input_tokens_seen": 183936220, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.28857422, + "step": 8556, + "time_per_iteration": 2.647824764251709 + }, + { + "auxiliary_loss_clip": 0.01280145, + "auxiliary_loss_mlp": 0.00228077, + "balance_loss_clip": 1.0678122, + "balance_loss_mlp": 0.20249483, + "epoch": 0.5144746730798136, + "flos": 19681238430720.0, + "grad_norm": 452.46213459842505, + "language_loss": 0.92241091, + "learning_rate": 2.003310422780898e-06, + "loss": 0.93749315, + "num_input_tokens_seen": 183953250, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.25622559, + "step": 8557, + "time_per_iteration": 2.638580799102783 + }, + { + "auxiliary_loss_clip": 0.0127578, + "auxiliary_loss_mlp": 0.00218879, + "balance_loss_clip": 1.0653764, + "balance_loss_mlp": 0.19358285, + "epoch": 0.5145347963324816, + "flos": 23914711382400.0, + "grad_norm": 51.013453990213875, + "language_loss": 0.94977373, + "learning_rate": 2.0029209615726307e-06, + "loss": 0.96472025, + "num_input_tokens_seen": 183973865, + "router_z_loss_clip": 2.10351562, + "router_z_loss_mlp": 0.25317383, + "step": 8558, + "time_per_iteration": 2.6791610717773438 + }, + { + "auxiliary_loss_clip": 0.0127514, + "auxiliary_loss_mlp": 0.00258982, + "balance_loss_clip": 1.06236219, + "balance_loss_mlp": 0.23156349, + "epoch": 0.5145949195851496, + "flos": 18260002022400.0, + "grad_norm": 231.87787367061324, + "language_loss": 0.74010587, + "learning_rate": 2.002531500253602e-06, + "loss": 0.75544709, + "num_input_tokens_seen": 183992555, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.27453613, + "step": 8559, + "time_per_iteration": 2.6561360359191895 + }, + { + "auxiliary_loss_clip": 0.01288188, + "auxiliary_loss_mlp": 0.00256042, + "balance_loss_clip": 1.07073343, + "balance_loss_mlp": 0.22743168, + "epoch": 0.5146550428378175, + "flos": 26213425136640.0, + "grad_norm": 5791.281651634087, + "language_loss": 0.70384026, + "learning_rate": 2.002142038838577e-06, + "loss": 0.71928251, + "num_input_tokens_seen": 184010825, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.28625488, + "step": 8560, + "time_per_iteration": 2.71158504486084 + }, + { + "auxiliary_loss_clip": 0.01287331, + "auxiliary_loss_mlp": 0.00247512, + "balance_loss_clip": 1.07280529, + "balance_loss_mlp": 0.21949756, + "epoch": 0.5147151660904855, + "flos": 22674177319680.0, + "grad_norm": 3.8628767851488894, + "language_loss": 0.7752738, + "learning_rate": 2.0017525773423265e-06, + "loss": 0.79062223, + "num_input_tokens_seen": 184030155, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.28027344, + "step": 8561, + "time_per_iteration": 2.7872166633605957 + }, + { + "auxiliary_loss_clip": 0.01307695, + "auxiliary_loss_mlp": 0.00243943, + "balance_loss_clip": 1.08933139, + "balance_loss_mlp": 0.21793202, + "epoch": 0.5147752893431534, + "flos": 24972388283520.0, + "grad_norm": 32.393471333260095, + "language_loss": 0.72911817, + "learning_rate": 2.0013631157796177e-06, + "loss": 0.74463451, + "num_input_tokens_seen": 184051440, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.26025391, + "step": 8562, + "time_per_iteration": 2.696347713470459 + }, + { + "auxiliary_loss_clip": 0.01308141, + "auxiliary_loss_mlp": 0.00220042, + "balance_loss_clip": 1.09145832, + "balance_loss_mlp": 0.19457905, + "epoch": 0.5148354125958214, + "flos": 22744669760640.0, + "grad_norm": 20.63804222226166, + "language_loss": 0.83104783, + "learning_rate": 2.0009736541652188e-06, + "loss": 0.84632969, + "num_input_tokens_seen": 184070205, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.2545166, + "step": 8563, + "time_per_iteration": 2.6580114364624023 + }, + { + "auxiliary_loss_clip": 0.0131986, + "auxiliary_loss_mlp": 0.00265817, + "balance_loss_clip": 1.09543014, + "balance_loss_mlp": 0.23581225, + "epoch": 0.5148955358484893, + "flos": 23068763199360.0, + "grad_norm": 2.289097278488612, + "language_loss": 0.91857684, + "learning_rate": 2.0005841925139e-06, + "loss": 0.9344337, + "num_input_tokens_seen": 184087345, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.29992676, + "step": 8564, + "time_per_iteration": 2.6317920684814453 + }, + { + "auxiliary_loss_clip": 0.01337094, + "auxiliary_loss_mlp": 0.00261522, + "balance_loss_clip": 1.10785747, + "balance_loss_mlp": 0.230039, + "epoch": 0.5149556591011574, + "flos": 20340127560960.0, + "grad_norm": 55.083333760847076, + "language_loss": 0.80285871, + "learning_rate": 2.0001947308404283e-06, + "loss": 0.81884491, + "num_input_tokens_seen": 184107110, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.31481934, + "step": 8565, + "time_per_iteration": 2.728675127029419 + }, + { + "auxiliary_loss_clip": 0.01321818, + "auxiliary_loss_mlp": 0.00247125, + "balance_loss_clip": 1.09542704, + "balance_loss_mlp": 0.217418, + "epoch": 0.5150157823538254, + "flos": 22638230784000.0, + "grad_norm": 6.080678467400373, + "language_loss": 0.77547866, + "learning_rate": 1.9998052691595715e-06, + "loss": 0.79116809, + "num_input_tokens_seen": 184127105, + "router_z_loss_clip": 2.265625, + "router_z_loss_mlp": 0.29736328, + "step": 8566, + "time_per_iteration": 2.6549859046936035 + }, + { + "auxiliary_loss_clip": 0.01335141, + "auxiliary_loss_mlp": 0.0024912, + "balance_loss_clip": 1.10126126, + "balance_loss_mlp": 0.21888858, + "epoch": 0.5150759056064933, + "flos": 26067627832320.0, + "grad_norm": 4.521346618051375, + "language_loss": 0.85315359, + "learning_rate": 1.9994158074861005e-06, + "loss": 0.86899614, + "num_input_tokens_seen": 184148060, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.30224609, + "step": 8567, + "time_per_iteration": 2.7024617195129395 + }, + { + "auxiliary_loss_clip": 0.01337695, + "auxiliary_loss_mlp": 0.00242906, + "balance_loss_clip": 1.11323881, + "balance_loss_mlp": 0.21517825, + "epoch": 0.5151360288591613, + "flos": 25952641418880.0, + "grad_norm": 16.484585625290137, + "language_loss": 0.86282456, + "learning_rate": 1.9990263458347806e-06, + "loss": 0.87863064, + "num_input_tokens_seen": 184166175, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.27734375, + "step": 8568, + "time_per_iteration": 2.642699718475342 + }, + { + "auxiliary_loss_clip": 0.01315119, + "auxiliary_loss_mlp": 0.00224464, + "balance_loss_clip": 1.09843147, + "balance_loss_mlp": 0.19737956, + "epoch": 0.5151961521118292, + "flos": 18507246312960.0, + "grad_norm": 11.16696037472944, + "language_loss": 0.97975516, + "learning_rate": 1.9986368842203825e-06, + "loss": 0.99515104, + "num_input_tokens_seen": 184182600, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.27087402, + "step": 8569, + "time_per_iteration": 2.651681423187256 + }, + { + "auxiliary_loss_clip": 0.01328343, + "auxiliary_loss_mlp": 0.0023875, + "balance_loss_clip": 1.09964669, + "balance_loss_mlp": 0.20835175, + "epoch": 0.5152562753644973, + "flos": 22233696837120.0, + "grad_norm": 71.67685700624712, + "language_loss": 0.84806287, + "learning_rate": 1.998247422657674e-06, + "loss": 0.86373377, + "num_input_tokens_seen": 184202020, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.30395508, + "step": 8570, + "time_per_iteration": 2.609334707260132 + }, + { + "auxiliary_loss_clip": 0.01331193, + "auxiliary_loss_mlp": 0.0022857, + "balance_loss_clip": 1.10132003, + "balance_loss_mlp": 0.20013812, + "epoch": 0.5153163986171652, + "flos": 38436555047040.0, + "grad_norm": 199.2491366618405, + "language_loss": 0.80903387, + "learning_rate": 1.9978579611614227e-06, + "loss": 0.82463145, + "num_input_tokens_seen": 184224850, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.28430176, + "step": 8571, + "time_per_iteration": 2.765758752822876 + }, + { + "auxiliary_loss_clip": 0.01384939, + "auxiliary_loss_mlp": 0.0005828, + "balance_loss_clip": 1.23951638, + "balance_loss_mlp": 0.04669261, + "epoch": 0.5153765218698332, + "flos": 66384503015040.0, + "grad_norm": 0.7674170452075944, + "language_loss": 0.52797455, + "learning_rate": 1.9974684997463984e-06, + "loss": 0.54240674, + "num_input_tokens_seen": 184288520, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.11572266, + "step": 8572, + "time_per_iteration": 3.1619808673858643 + }, + { + "auxiliary_loss_clip": 0.01333933, + "auxiliary_loss_mlp": 0.00221298, + "balance_loss_clip": 1.11731601, + "balance_loss_mlp": 0.19583479, + "epoch": 0.5154366451225011, + "flos": 24024669891840.0, + "grad_norm": 85.9455454690201, + "language_loss": 0.85021031, + "learning_rate": 1.9970790384273687e-06, + "loss": 0.86576271, + "num_input_tokens_seen": 184308565, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.2545166, + "step": 8573, + "time_per_iteration": 2.6907858848571777 + }, + { + "auxiliary_loss_clip": 0.01332647, + "auxiliary_loss_mlp": 0.00252731, + "balance_loss_clip": 1.11287367, + "balance_loss_mlp": 0.22410847, + "epoch": 0.5154967683751691, + "flos": 23468843859840.0, + "grad_norm": 5.74131350529177, + "language_loss": 0.85802698, + "learning_rate": 1.996689577219102e-06, + "loss": 0.8738808, + "num_input_tokens_seen": 184326795, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.28625488, + "step": 8574, + "time_per_iteration": 2.6767327785491943 + }, + { + "auxiliary_loss_clip": 0.01319174, + "auxiliary_loss_mlp": 0.00202717, + "balance_loss_clip": 1.10463369, + "balance_loss_mlp": 0.17750394, + "epoch": 0.515556891627837, + "flos": 23805650712960.0, + "grad_norm": 3.1313864504382325, + "language_loss": 0.92614877, + "learning_rate": 1.996300116136367e-06, + "loss": 0.94136769, + "num_input_tokens_seen": 184345990, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.25183105, + "step": 8575, + "time_per_iteration": 2.6547799110412598 + }, + { + "auxiliary_loss_clip": 0.0134158, + "auxiliary_loss_mlp": 0.00226592, + "balance_loss_clip": 1.11591101, + "balance_loss_mlp": 0.19868523, + "epoch": 0.515617014880505, + "flos": 19828544106240.0, + "grad_norm": 2.1295550859535206, + "language_loss": 0.83520442, + "learning_rate": 1.995910655193932e-06, + "loss": 0.85088617, + "num_input_tokens_seen": 184366300, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.27893066, + "step": 8576, + "time_per_iteration": 2.690713405609131 + }, + { + "auxiliary_loss_clip": 0.0134383, + "auxiliary_loss_mlp": 0.00246587, + "balance_loss_clip": 1.10865152, + "balance_loss_mlp": 0.21667689, + "epoch": 0.515677138133173, + "flos": 14245907385600.0, + "grad_norm": 21.459571615651747, + "language_loss": 0.83687729, + "learning_rate": 1.9955211944065654e-06, + "loss": 0.85278147, + "num_input_tokens_seen": 184383030, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.29882812, + "step": 8577, + "time_per_iteration": 2.6270253658294678 + }, + { + "auxiliary_loss_clip": 0.0134378, + "auxiliary_loss_mlp": 0.00252899, + "balance_loss_clip": 1.11752808, + "balance_loss_mlp": 0.22368127, + "epoch": 0.515737261385841, + "flos": 28289707920000.0, + "grad_norm": 154.19297646956448, + "language_loss": 0.87831336, + "learning_rate": 1.9951317337890353e-06, + "loss": 0.89428014, + "num_input_tokens_seen": 184403410, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.29223633, + "step": 8578, + "time_per_iteration": 2.689704418182373 + }, + { + "auxiliary_loss_clip": 0.01324013, + "auxiliary_loss_mlp": 0.00226677, + "balance_loss_clip": 1.10439241, + "balance_loss_mlp": 0.19960438, + "epoch": 0.515797384638509, + "flos": 27891925729920.0, + "grad_norm": 264.2871326908366, + "language_loss": 0.84674436, + "learning_rate": 1.9947422733561105e-06, + "loss": 0.86225128, + "num_input_tokens_seen": 184423830, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.27075195, + "step": 8579, + "time_per_iteration": 2.706963062286377 + }, + { + "auxiliary_loss_clip": 0.01318984, + "auxiliary_loss_mlp": 0.00220936, + "balance_loss_clip": 1.09877777, + "balance_loss_mlp": 0.19389959, + "epoch": 0.5158575078911769, + "flos": 23040071210880.0, + "grad_norm": 1077.631278717671, + "language_loss": 0.87862098, + "learning_rate": 1.994352813122559e-06, + "loss": 0.8940202, + "num_input_tokens_seen": 184445050, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.27038574, + "step": 8580, + "time_per_iteration": 2.6637191772460938 + }, + { + "auxiliary_loss_clip": 0.01340958, + "auxiliary_loss_mlp": 0.00247792, + "balance_loss_clip": 1.11320984, + "balance_loss_mlp": 0.22086224, + "epoch": 0.5159176311438449, + "flos": 12641346938880.0, + "grad_norm": 14.58522689442267, + "language_loss": 0.80746472, + "learning_rate": 1.99396335310315e-06, + "loss": 0.82335222, + "num_input_tokens_seen": 184460775, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.26904297, + "step": 8581, + "time_per_iteration": 2.578061103820801 + }, + { + "auxiliary_loss_clip": 0.01319238, + "auxiliary_loss_mlp": 0.00212755, + "balance_loss_clip": 1.10381675, + "balance_loss_mlp": 0.18837646, + "epoch": 0.5159777543965128, + "flos": 15558154951680.0, + "grad_norm": 15.651098357926333, + "language_loss": 0.82455951, + "learning_rate": 1.9935738933126508e-06, + "loss": 0.83987945, + "num_input_tokens_seen": 184477365, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.24389648, + "step": 8582, + "time_per_iteration": 2.6051039695739746 + }, + { + "auxiliary_loss_clip": 0.01324711, + "auxiliary_loss_mlp": 0.00201613, + "balance_loss_clip": 1.1060257, + "balance_loss_mlp": 0.17747357, + "epoch": 0.5160378776491809, + "flos": 23221671396480.0, + "grad_norm": 7.939311349582776, + "language_loss": 0.73655993, + "learning_rate": 1.99318443376583e-06, + "loss": 0.75182319, + "num_input_tokens_seen": 184497045, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.24145508, + "step": 8583, + "time_per_iteration": 2.61879301071167 + }, + { + "auxiliary_loss_clip": 0.01339389, + "auxiliary_loss_mlp": 0.00220566, + "balance_loss_clip": 1.11274397, + "balance_loss_mlp": 0.19383934, + "epoch": 0.5160980009018488, + "flos": 21944616180480.0, + "grad_norm": 13.460536451188158, + "language_loss": 0.81705421, + "learning_rate": 1.9927949744774568e-06, + "loss": 0.83265376, + "num_input_tokens_seen": 184517675, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.26733398, + "step": 8584, + "time_per_iteration": 2.6456823348999023 + }, + { + "auxiliary_loss_clip": 0.01320012, + "auxiliary_loss_mlp": 0.00237592, + "balance_loss_clip": 1.09805095, + "balance_loss_mlp": 0.21043663, + "epoch": 0.5161581241545168, + "flos": 22784064001920.0, + "grad_norm": 11.749811223651335, + "language_loss": 0.86240935, + "learning_rate": 1.9924055154622983e-06, + "loss": 0.87798536, + "num_input_tokens_seen": 184537745, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.27172852, + "step": 8585, + "time_per_iteration": 2.6957192420959473 + }, + { + "auxiliary_loss_clip": 0.01313895, + "auxiliary_loss_mlp": 0.00202031, + "balance_loss_clip": 1.09514451, + "balance_loss_mlp": 0.17699692, + "epoch": 0.5162182474071847, + "flos": 19675384513920.0, + "grad_norm": 25.309562775214182, + "language_loss": 0.88441128, + "learning_rate": 1.9920160567351238e-06, + "loss": 0.89957052, + "num_input_tokens_seen": 184553630, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.25012207, + "step": 8586, + "time_per_iteration": 4.149962663650513 + }, + { + "auxiliary_loss_clip": 0.0132521, + "auxiliary_loss_mlp": 0.00214316, + "balance_loss_clip": 1.10531294, + "balance_loss_mlp": 0.18923484, + "epoch": 0.5162783706598527, + "flos": 20046198568320.0, + "grad_norm": 15.206442345567124, + "language_loss": 0.78895462, + "learning_rate": 1.991626598310701e-06, + "loss": 0.80434984, + "num_input_tokens_seen": 184573530, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.25085449, + "step": 8587, + "time_per_iteration": 2.6840264797210693 + }, + { + "auxiliary_loss_clip": 0.01319361, + "auxiliary_loss_mlp": 0.00047534, + "balance_loss_clip": 1.1775583, + "balance_loss_mlp": 0.0399043, + "epoch": 0.5163384939125206, + "flos": 69959553713280.0, + "grad_norm": 2.3997974905760837, + "language_loss": 0.57302713, + "learning_rate": 1.9912371402037984e-06, + "loss": 0.58669615, + "num_input_tokens_seen": 184637875, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.07617188, + "step": 8588, + "time_per_iteration": 3.092831611633301 + }, + { + "auxiliary_loss_clip": 0.01335321, + "auxiliary_loss_mlp": 0.00249439, + "balance_loss_clip": 1.11079848, + "balance_loss_mlp": 0.21901673, + "epoch": 0.5163986171651886, + "flos": 17417034668160.0, + "grad_norm": 19.08288734515957, + "language_loss": 0.8247695, + "learning_rate": 1.990847682429185e-06, + "loss": 0.84061712, + "num_input_tokens_seen": 184656125, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.30407715, + "step": 8589, + "time_per_iteration": 4.031512498855591 + }, + { + "auxiliary_loss_clip": 0.01318473, + "auxiliary_loss_mlp": 0.00231487, + "balance_loss_clip": 1.0975312, + "balance_loss_mlp": 0.20516543, + "epoch": 0.5164587404178566, + "flos": 21322679166720.0, + "grad_norm": 1.701975919176331, + "language_loss": 0.74344289, + "learning_rate": 1.990458225001627e-06, + "loss": 0.75894248, + "num_input_tokens_seen": 184675920, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.26318359, + "step": 8590, + "time_per_iteration": 2.6488611698150635 + }, + { + "auxiliary_loss_clip": 0.01338771, + "auxiliary_loss_mlp": 0.00017809, + "balance_loss_clip": 1.19438815, + "balance_loss_mlp": 0.01013229, + "epoch": 0.5165188636705246, + "flos": 68057149691520.0, + "grad_norm": 0.813317025299719, + "language_loss": 0.55511916, + "learning_rate": 1.990068767935895e-06, + "loss": 0.56868494, + "num_input_tokens_seen": 184730520, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.07666016, + "step": 8591, + "time_per_iteration": 4.382440090179443 + }, + { + "auxiliary_loss_clip": 0.01306323, + "auxiliary_loss_mlp": 0.00189013, + "balance_loss_clip": 1.09462929, + "balance_loss_mlp": 0.16414635, + "epoch": 0.5165789869231926, + "flos": 19385657412480.0, + "grad_norm": 5.366405874738355, + "language_loss": 0.88311017, + "learning_rate": 1.9896793112467566e-06, + "loss": 0.89806354, + "num_input_tokens_seen": 184748340, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.2487793, + "step": 8592, + "time_per_iteration": 2.613340377807617 + }, + { + "auxiliary_loss_clip": 0.01317316, + "auxiliary_loss_mlp": 0.00197892, + "balance_loss_clip": 1.09665811, + "balance_loss_mlp": 0.17306118, + "epoch": 0.5166391101758605, + "flos": 20960197067520.0, + "grad_norm": 5.440980167407868, + "language_loss": 0.89525592, + "learning_rate": 1.989289854948979e-06, + "loss": 0.91040808, + "num_input_tokens_seen": 184766615, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.24841309, + "step": 8593, + "time_per_iteration": 2.632797956466675 + }, + { + "auxiliary_loss_clip": 0.01312381, + "auxiliary_loss_mlp": 0.00218855, + "balance_loss_clip": 1.09418201, + "balance_loss_mlp": 0.19224782, + "epoch": 0.5166992334285285, + "flos": 29462407148160.0, + "grad_norm": 6.549722289059256, + "language_loss": 0.7743811, + "learning_rate": 1.9889003990573314e-06, + "loss": 0.78969347, + "num_input_tokens_seen": 184788075, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.26635742, + "step": 8594, + "time_per_iteration": 2.6878552436828613 + }, + { + "auxiliary_loss_clip": 0.01315177, + "auxiliary_loss_mlp": 0.00223983, + "balance_loss_clip": 1.09368503, + "balance_loss_mlp": 0.19672005, + "epoch": 0.5167593566811964, + "flos": 20304360593280.0, + "grad_norm": 2.2589584522222106, + "language_loss": 0.84031677, + "learning_rate": 1.988510943586582e-06, + "loss": 0.85570836, + "num_input_tokens_seen": 184808710, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.27246094, + "step": 8595, + "time_per_iteration": 2.668076276779175 + }, + { + "auxiliary_loss_clip": 0.01311853, + "auxiliary_loss_mlp": 0.00201612, + "balance_loss_clip": 1.09468818, + "balance_loss_mlp": 0.17581493, + "epoch": 0.5168194799338645, + "flos": 14611370313600.0, + "grad_norm": 3.2040018096237026, + "language_loss": 0.72905242, + "learning_rate": 1.9881214885514986e-06, + "loss": 0.74418712, + "num_input_tokens_seen": 184826475, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.25817871, + "step": 8596, + "time_per_iteration": 4.061712980270386 + }, + { + "auxiliary_loss_clip": 0.01326554, + "auxiliary_loss_mlp": 0.00223976, + "balance_loss_clip": 1.10260963, + "balance_loss_mlp": 0.19602151, + "epoch": 0.5168796031865324, + "flos": 25007257411200.0, + "grad_norm": 6.533144725638429, + "language_loss": 0.82277769, + "learning_rate": 1.9877320339668492e-06, + "loss": 0.83828306, + "num_input_tokens_seen": 184845245, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.27929688, + "step": 8597, + "time_per_iteration": 2.6552045345306396 + }, + { + "auxiliary_loss_clip": 0.01314653, + "auxiliary_loss_mlp": 0.00238049, + "balance_loss_clip": 1.0939362, + "balance_loss_mlp": 0.21086955, + "epoch": 0.5169397264392004, + "flos": 26939969533440.0, + "grad_norm": 2.9797213594697194, + "language_loss": 0.87655175, + "learning_rate": 1.987342579847403e-06, + "loss": 0.89207876, + "num_input_tokens_seen": 184866605, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.27185059, + "step": 8598, + "time_per_iteration": 2.6864242553710938 + }, + { + "auxiliary_loss_clip": 0.01312766, + "auxiliary_loss_mlp": 0.00196374, + "balance_loss_clip": 1.09082985, + "balance_loss_mlp": 0.17066008, + "epoch": 0.5169998496918683, + "flos": 25407804948480.0, + "grad_norm": 2.0114478106457696, + "language_loss": 0.81687874, + "learning_rate": 1.9869531262079273e-06, + "loss": 0.83197016, + "num_input_tokens_seen": 184886945, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.25720215, + "step": 8599, + "time_per_iteration": 2.6735033988952637 + }, + { + "auxiliary_loss_clip": 0.0128592, + "auxiliary_loss_mlp": 0.00189435, + "balance_loss_clip": 1.07020307, + "balance_loss_mlp": 0.16452008, + "epoch": 0.5170599729445363, + "flos": 24680793674880.0, + "grad_norm": 14.10689567765216, + "language_loss": 0.80009121, + "learning_rate": 1.9865636730631904e-06, + "loss": 0.81484473, + "num_input_tokens_seen": 184905590, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.24926758, + "step": 8600, + "time_per_iteration": 2.672043561935425 + }, + { + "auxiliary_loss_clip": 0.01296991, + "auxiliary_loss_mlp": 0.00198478, + "balance_loss_clip": 1.07603955, + "balance_loss_mlp": 0.17230015, + "epoch": 0.5171200961972042, + "flos": 20994455664000.0, + "grad_norm": 14.414504337102118, + "language_loss": 0.8132264, + "learning_rate": 1.9861742204279602e-06, + "loss": 0.82818109, + "num_input_tokens_seen": 184925555, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.26196289, + "step": 8601, + "time_per_iteration": 2.628859043121338 + }, + { + "auxiliary_loss_clip": 0.0130074, + "auxiliary_loss_mlp": 0.00223649, + "balance_loss_clip": 1.07575691, + "balance_loss_mlp": 0.19660079, + "epoch": 0.5171802194498722, + "flos": 22745639427840.0, + "grad_norm": 13.632469094008558, + "language_loss": 0.9478339, + "learning_rate": 1.9857847683170045e-06, + "loss": 0.9630779, + "num_input_tokens_seen": 184944490, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.27075195, + "step": 8602, + "time_per_iteration": 2.634979486465454 + }, + { + "auxiliary_loss_clip": 0.01293552, + "auxiliary_loss_mlp": 0.00232861, + "balance_loss_clip": 1.07399321, + "balance_loss_mlp": 0.2063604, + "epoch": 0.5172403427025402, + "flos": 28176732668160.0, + "grad_norm": 54222.05134740459, + "language_loss": 0.82945317, + "learning_rate": 1.9853953167450926e-06, + "loss": 0.84471732, + "num_input_tokens_seen": 184963190, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.26501465, + "step": 8603, + "time_per_iteration": 2.704261541366577 + }, + { + "auxiliary_loss_clip": 0.01283399, + "auxiliary_loss_mlp": 0.00230765, + "balance_loss_clip": 1.06218183, + "balance_loss_mlp": 0.20468217, + "epoch": 0.5173004659552082, + "flos": 20337829090560.0, + "grad_norm": 167.44533959382747, + "language_loss": 0.82652575, + "learning_rate": 1.9850058657269915e-06, + "loss": 0.84166741, + "num_input_tokens_seen": 184981220, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.2611084, + "step": 8604, + "time_per_iteration": 2.679879903793335 + }, + { + "auxiliary_loss_clip": 0.01284267, + "auxiliary_loss_mlp": 0.00239042, + "balance_loss_clip": 1.05885112, + "balance_loss_mlp": 0.21062236, + "epoch": 0.5173605892078762, + "flos": 19063323740160.0, + "grad_norm": 80.16575033886134, + "language_loss": 0.92928863, + "learning_rate": 1.984616415277469e-06, + "loss": 0.94452173, + "num_input_tokens_seen": 184998810, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.28417969, + "step": 8605, + "time_per_iteration": 2.6371939182281494 + }, + { + "auxiliary_loss_clip": 0.01267505, + "auxiliary_loss_mlp": 0.00213505, + "balance_loss_clip": 1.04913759, + "balance_loss_mlp": 0.18689741, + "epoch": 0.5174207124605441, + "flos": 27995168396160.0, + "grad_norm": 159.85669288214112, + "language_loss": 0.71462601, + "learning_rate": 1.984226965411294e-06, + "loss": 0.7294361, + "num_input_tokens_seen": 185021185, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.26611328, + "step": 8606, + "time_per_iteration": 2.7054202556610107 + }, + { + "auxiliary_loss_clip": 0.01284186, + "auxiliary_loss_mlp": 0.00235089, + "balance_loss_clip": 1.06681526, + "balance_loss_mlp": 0.20846979, + "epoch": 0.5174808357132121, + "flos": 19496657416320.0, + "grad_norm": 930.4462231236995, + "language_loss": 0.85359865, + "learning_rate": 1.983837516143234e-06, + "loss": 0.8687914, + "num_input_tokens_seen": 185038465, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.26611328, + "step": 8607, + "time_per_iteration": 2.6510581970214844 + }, + { + "auxiliary_loss_clip": 0.01275007, + "auxiliary_loss_mlp": 0.00225475, + "balance_loss_clip": 1.05536973, + "balance_loss_mlp": 0.19974984, + "epoch": 0.51754095896588, + "flos": 22784171742720.0, + "grad_norm": 9.903491754278866, + "language_loss": 0.79113817, + "learning_rate": 1.983448067488057e-06, + "loss": 0.80614299, + "num_input_tokens_seen": 185057340, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.25708008, + "step": 8608, + "time_per_iteration": 2.721646308898926 + }, + { + "auxiliary_loss_clip": 0.01276597, + "auxiliary_loss_mlp": 0.00248454, + "balance_loss_clip": 1.05385506, + "balance_loss_mlp": 0.22109517, + "epoch": 0.5176010822185481, + "flos": 22669257156480.0, + "grad_norm": 141.52814354118655, + "language_loss": 0.94053972, + "learning_rate": 1.983058619460531e-06, + "loss": 0.95579022, + "num_input_tokens_seen": 185074935, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.27331543, + "step": 8609, + "time_per_iteration": 2.64128041267395 + }, + { + "auxiliary_loss_clip": 0.01283014, + "auxiliary_loss_mlp": 0.00233277, + "balance_loss_clip": 1.06425142, + "balance_loss_mlp": 0.2071943, + "epoch": 0.517661205471216, + "flos": 23951196622080.0, + "grad_norm": 9.900649696420677, + "language_loss": 0.80362856, + "learning_rate": 1.9826691720754237e-06, + "loss": 0.81879145, + "num_input_tokens_seen": 185095050, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.26074219, + "step": 8610, + "time_per_iteration": 2.6624159812927246 + }, + { + "auxiliary_loss_clip": 0.01300046, + "auxiliary_loss_mlp": 0.00253027, + "balance_loss_clip": 1.07097661, + "balance_loss_mlp": 0.22477484, + "epoch": 0.517721328723884, + "flos": 15596076735360.0, + "grad_norm": 2.256936561840729, + "language_loss": 0.77563447, + "learning_rate": 1.9822797253475034e-06, + "loss": 0.79116517, + "num_input_tokens_seen": 185112275, + "router_z_loss_clip": 2.29101562, + "router_z_loss_mlp": 0.2824707, + "step": 8611, + "time_per_iteration": 2.681755304336548 + }, + { + "auxiliary_loss_clip": 0.01293194, + "auxiliary_loss_mlp": 0.00229981, + "balance_loss_clip": 1.07303178, + "balance_loss_mlp": 0.20427951, + "epoch": 0.5177814519765519, + "flos": 20960197067520.0, + "grad_norm": 6.116958344400622, + "language_loss": 0.86210734, + "learning_rate": 1.9818902792915373e-06, + "loss": 0.87733912, + "num_input_tokens_seen": 185132165, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.25708008, + "step": 8612, + "time_per_iteration": 2.6607611179351807 + }, + { + "auxiliary_loss_clip": 0.01277901, + "auxiliary_loss_mlp": 0.00248759, + "balance_loss_clip": 1.05905247, + "balance_loss_mlp": 0.22136468, + "epoch": 0.5178415752292199, + "flos": 17967832796160.0, + "grad_norm": 18.110813058834033, + "language_loss": 0.90601897, + "learning_rate": 1.981500833922294e-06, + "loss": 0.92128551, + "num_input_tokens_seen": 185151025, + "router_z_loss_clip": 2.18652344, + "router_z_loss_mlp": 0.27392578, + "step": 8613, + "time_per_iteration": 2.6280791759490967 + }, + { + "auxiliary_loss_clip": 0.01305065, + "auxiliary_loss_mlp": 0.002828, + "balance_loss_clip": 1.07700193, + "balance_loss_mlp": 0.25373715, + "epoch": 0.5179016984818878, + "flos": 17821496787840.0, + "grad_norm": 6.0259859239729545, + "language_loss": 0.76304018, + "learning_rate": 1.981111389254541e-06, + "loss": 0.77891886, + "num_input_tokens_seen": 185168455, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.29077148, + "step": 8614, + "time_per_iteration": 2.599669933319092 + }, + { + "auxiliary_loss_clip": 0.01306168, + "auxiliary_loss_mlp": 0.002399, + "balance_loss_clip": 1.08273339, + "balance_loss_mlp": 0.21274462, + "epoch": 0.5179618217345558, + "flos": 17820455293440.0, + "grad_norm": 10.358248357639125, + "language_loss": 0.93970639, + "learning_rate": 1.9807219453030453e-06, + "loss": 0.95516706, + "num_input_tokens_seen": 185184415, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.27124023, + "step": 8615, + "time_per_iteration": 2.6214077472686768 + }, + { + "auxiliary_loss_clip": 0.0129652, + "auxiliary_loss_mlp": 0.002445, + "balance_loss_clip": 1.07841539, + "balance_loss_mlp": 0.21785641, + "epoch": 0.5180219449872238, + "flos": 22522131048960.0, + "grad_norm": 20.269198571240334, + "language_loss": 0.87421489, + "learning_rate": 1.9803325020825763e-06, + "loss": 0.88962513, + "num_input_tokens_seen": 185202910, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.26635742, + "step": 8616, + "time_per_iteration": 2.629202365875244 + }, + { + "auxiliary_loss_clip": 0.01325585, + "auxiliary_loss_mlp": 0.00248219, + "balance_loss_clip": 1.09192061, + "balance_loss_mlp": 0.22070578, + "epoch": 0.5180820682398918, + "flos": 23915465568000.0, + "grad_norm": 8.150878452278226, + "language_loss": 0.82821566, + "learning_rate": 1.9799430596079e-06, + "loss": 0.84395373, + "num_input_tokens_seen": 185223085, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.2755127, + "step": 8617, + "time_per_iteration": 2.676535129547119 + }, + { + "auxiliary_loss_clip": 0.01301004, + "auxiliary_loss_mlp": 0.00246704, + "balance_loss_clip": 1.07802773, + "balance_loss_mlp": 0.22095503, + "epoch": 0.5181421914925598, + "flos": 16979930064000.0, + "grad_norm": 13.244953608333724, + "language_loss": 0.76943672, + "learning_rate": 1.979553617893785e-06, + "loss": 0.78491378, + "num_input_tokens_seen": 185241295, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.25732422, + "step": 8618, + "time_per_iteration": 2.6128787994384766 + }, + { + "auxiliary_loss_clip": 0.01295324, + "auxiliary_loss_mlp": 0.00161475, + "balance_loss_clip": 1.15211248, + "balance_loss_mlp": 0.15417896, + "epoch": 0.5182023147452277, + "flos": 66059870872320.0, + "grad_norm": 0.9545370620787204, + "language_loss": 0.67244083, + "learning_rate": 1.979164176954999e-06, + "loss": 0.6870088, + "num_input_tokens_seen": 185298295, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.07275391, + "step": 8619, + "time_per_iteration": 3.0390384197235107 + }, + { + "auxiliary_loss_clip": 0.01274166, + "auxiliary_loss_mlp": 0.00219631, + "balance_loss_clip": 1.05982089, + "balance_loss_mlp": 0.19531232, + "epoch": 0.5182624379978957, + "flos": 18187749815040.0, + "grad_norm": 12.262761695703075, + "language_loss": 0.90321028, + "learning_rate": 1.97877473680631e-06, + "loss": 0.91814828, + "num_input_tokens_seen": 185317000, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.24304199, + "step": 8620, + "time_per_iteration": 2.611311197280884 + }, + { + "auxiliary_loss_clip": 0.01286982, + "auxiliary_loss_mlp": 0.00234249, + "balance_loss_clip": 1.06893444, + "balance_loss_mlp": 0.20931, + "epoch": 0.5183225612505636, + "flos": 14026708638720.0, + "grad_norm": 4.926298081453719, + "language_loss": 0.88607043, + "learning_rate": 1.9783852974624846e-06, + "loss": 0.90128267, + "num_input_tokens_seen": 185331185, + "router_z_loss_clip": 2.18066406, + "router_z_loss_mlp": 0.24951172, + "step": 8621, + "time_per_iteration": 2.59924578666687 + }, + { + "auxiliary_loss_clip": 0.01294204, + "auxiliary_loss_mlp": 0.00261849, + "balance_loss_clip": 1.07130635, + "balance_loss_mlp": 0.23533666, + "epoch": 0.5183826845032317, + "flos": 23659781581440.0, + "grad_norm": 6.7251986023099795, + "language_loss": 0.68911147, + "learning_rate": 1.9779958589382905e-06, + "loss": 0.70467198, + "num_input_tokens_seen": 185348955, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.26501465, + "step": 8622, + "time_per_iteration": 2.718533992767334 + }, + { + "auxiliary_loss_clip": 0.01292011, + "auxiliary_loss_mlp": 0.00221011, + "balance_loss_clip": 1.06258428, + "balance_loss_mlp": 0.1945222, + "epoch": 0.5184428077558996, + "flos": 15888605097600.0, + "grad_norm": 2073.3729197571492, + "language_loss": 0.73585641, + "learning_rate": 1.977606421248497e-06, + "loss": 0.75098658, + "num_input_tokens_seen": 185367330, + "router_z_loss_clip": 2.29101562, + "router_z_loss_mlp": 0.26464844, + "step": 8623, + "time_per_iteration": 2.665191411972046 + }, + { + "auxiliary_loss_clip": 0.01272937, + "auxiliary_loss_mlp": 0.00214632, + "balance_loss_clip": 1.05455029, + "balance_loss_mlp": 0.18831092, + "epoch": 0.5185029310085676, + "flos": 21030833162880.0, + "grad_norm": 19.3671805212736, + "language_loss": 0.83541644, + "learning_rate": 1.9772169844078685e-06, + "loss": 0.85029221, + "num_input_tokens_seen": 185385060, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.2635498, + "step": 8624, + "time_per_iteration": 2.6391501426696777 + }, + { + "auxiliary_loss_clip": 0.01279013, + "auxiliary_loss_mlp": 0.00238896, + "balance_loss_clip": 1.05955136, + "balance_loss_mlp": 0.21338472, + "epoch": 0.5185630542612355, + "flos": 26542690133760.0, + "grad_norm": 29.4960340020081, + "language_loss": 0.77757549, + "learning_rate": 1.9768275484311756e-06, + "loss": 0.79275453, + "num_input_tokens_seen": 185403745, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.25524902, + "step": 8625, + "time_per_iteration": 2.7565486431121826 + }, + { + "auxiliary_loss_clip": 0.01284874, + "auxiliary_loss_mlp": 0.00250887, + "balance_loss_clip": 1.0653913, + "balance_loss_mlp": 0.22632989, + "epoch": 0.5186231775139035, + "flos": 20668422890880.0, + "grad_norm": 53.95443963680843, + "language_loss": 0.77861977, + "learning_rate": 1.976438113333184e-06, + "loss": 0.79397738, + "num_input_tokens_seen": 185422620, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.24560547, + "step": 8626, + "time_per_iteration": 2.726149797439575 + }, + { + "auxiliary_loss_clip": 0.01297756, + "auxiliary_loss_mlp": 0.00267864, + "balance_loss_clip": 1.07151377, + "balance_loss_mlp": 0.24128026, + "epoch": 0.5186833007665714, + "flos": 20885502735360.0, + "grad_norm": 14.332441578143106, + "language_loss": 0.78466249, + "learning_rate": 1.9760486791286612e-06, + "loss": 0.80031866, + "num_input_tokens_seen": 185439380, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.26599121, + "step": 8627, + "time_per_iteration": 2.663118600845337 + }, + { + "auxiliary_loss_clip": 0.01292981, + "auxiliary_loss_mlp": 0.00254233, + "balance_loss_clip": 1.06732202, + "balance_loss_mlp": 0.22775689, + "epoch": 0.5187434240192395, + "flos": 20886903365760.0, + "grad_norm": 22.60923316639077, + "language_loss": 0.80407178, + "learning_rate": 1.9756592458323753e-06, + "loss": 0.8195439, + "num_input_tokens_seen": 185458830, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.26464844, + "step": 8628, + "time_per_iteration": 2.7386903762817383 + }, + { + "auxiliary_loss_clip": 0.01296671, + "auxiliary_loss_mlp": 0.00274133, + "balance_loss_clip": 1.07525206, + "balance_loss_mlp": 0.24629736, + "epoch": 0.5188035472719074, + "flos": 19859929614720.0, + "grad_norm": 38.11802610076561, + "language_loss": 0.83277678, + "learning_rate": 1.9752698134590927e-06, + "loss": 0.84848475, + "num_input_tokens_seen": 185477270, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.27819824, + "step": 8629, + "time_per_iteration": 4.103916168212891 + }, + { + "auxiliary_loss_clip": 0.01313243, + "auxiliary_loss_mlp": 0.00260473, + "balance_loss_clip": 1.08540797, + "balance_loss_mlp": 0.23363894, + "epoch": 0.5188636705245754, + "flos": 21138313633920.0, + "grad_norm": 53.51414726226495, + "language_loss": 0.81479722, + "learning_rate": 1.9748803820235815e-06, + "loss": 0.83053434, + "num_input_tokens_seen": 185495795, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.26818848, + "step": 8630, + "time_per_iteration": 2.7035200595855713 + }, + { + "auxiliary_loss_clip": 0.01310709, + "auxiliary_loss_mlp": 0.00290844, + "balance_loss_clip": 1.0806241, + "balance_loss_mlp": 0.26259172, + "epoch": 0.5189237937772434, + "flos": 22419786222720.0, + "grad_norm": 11.436073132060754, + "language_loss": 0.86933845, + "learning_rate": 1.9744909515406093e-06, + "loss": 0.88535398, + "num_input_tokens_seen": 185514885, + "router_z_loss_clip": 2.30273438, + "router_z_loss_mlp": 0.28271484, + "step": 8631, + "time_per_iteration": 4.179270029067993 + }, + { + "auxiliary_loss_clip": 0.01324533, + "auxiliary_loss_mlp": 0.00281049, + "balance_loss_clip": 1.09200728, + "balance_loss_mlp": 0.25274852, + "epoch": 0.5189839170299113, + "flos": 25446696399360.0, + "grad_norm": 30.650740462938018, + "language_loss": 0.81649858, + "learning_rate": 1.974101522024942e-06, + "loss": 0.8325544, + "num_input_tokens_seen": 185537155, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.28283691, + "step": 8632, + "time_per_iteration": 2.686420202255249 + }, + { + "auxiliary_loss_clip": 0.01311036, + "auxiliary_loss_mlp": 0.0026564, + "balance_loss_clip": 1.08331764, + "balance_loss_mlp": 0.23857985, + "epoch": 0.5190440402825793, + "flos": 18587722734720.0, + "grad_norm": 7.145416870206982, + "language_loss": 0.88277382, + "learning_rate": 1.9737120934913477e-06, + "loss": 0.89854056, + "num_input_tokens_seen": 185555520, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.27075195, + "step": 8633, + "time_per_iteration": 4.1167073249816895 + }, + { + "auxiliary_loss_clip": 0.01310849, + "auxiliary_loss_mlp": 0.0027519, + "balance_loss_clip": 1.08211291, + "balance_loss_mlp": 0.24860635, + "epoch": 0.5191041635352472, + "flos": 21908633731200.0, + "grad_norm": 5.930323528373013, + "language_loss": 0.87913632, + "learning_rate": 1.9733226659545936e-06, + "loss": 0.89499676, + "num_input_tokens_seen": 185573855, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.26538086, + "step": 8634, + "time_per_iteration": 2.66119122505188 + }, + { + "auxiliary_loss_clip": 0.01309518, + "auxiliary_loss_mlp": 0.00280362, + "balance_loss_clip": 1.08328152, + "balance_loss_mlp": 0.25320628, + "epoch": 0.5191642867879153, + "flos": 27527971173120.0, + "grad_norm": 4.570284894965837, + "language_loss": 0.75948286, + "learning_rate": 1.9729332394294467e-06, + "loss": 0.77538168, + "num_input_tokens_seen": 185595145, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.27148438, + "step": 8635, + "time_per_iteration": 2.731816291809082 + }, + { + "auxiliary_loss_clip": 0.01324332, + "auxiliary_loss_mlp": 0.00273576, + "balance_loss_clip": 1.09277868, + "balance_loss_mlp": 0.24609847, + "epoch": 0.5192244100405832, + "flos": 15705999331200.0, + "grad_norm": 41.09422686173065, + "language_loss": 0.84277689, + "learning_rate": 1.9725438139306742e-06, + "loss": 0.85875607, + "num_input_tokens_seen": 185613320, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.27490234, + "step": 8636, + "time_per_iteration": 2.6545984745025635 + }, + { + "auxiliary_loss_clip": 0.01341936, + "auxiliary_loss_mlp": 0.0031264, + "balance_loss_clip": 1.10430789, + "balance_loss_mlp": 0.28247967, + "epoch": 0.5192845332932512, + "flos": 12057080313600.0, + "grad_norm": 1014.9762881167925, + "language_loss": 0.83429968, + "learning_rate": 1.9721543894730425e-06, + "loss": 0.85084546, + "num_input_tokens_seen": 185630730, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.30200195, + "step": 8637, + "time_per_iteration": 2.67226243019104 + }, + { + "auxiliary_loss_clip": 0.01319676, + "auxiliary_loss_mlp": 0.00292478, + "balance_loss_clip": 1.09043777, + "balance_loss_mlp": 0.26429653, + "epoch": 0.5193446565459191, + "flos": 18953185662720.0, + "grad_norm": 10.689998886399612, + "language_loss": 0.81110716, + "learning_rate": 1.9717649660713194e-06, + "loss": 0.82722867, + "num_input_tokens_seen": 185648515, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.28186035, + "step": 8638, + "time_per_iteration": 4.045186996459961 + }, + { + "auxiliary_loss_clip": 0.01322262, + "auxiliary_loss_mlp": 0.00300838, + "balance_loss_clip": 1.0926125, + "balance_loss_mlp": 0.27417123, + "epoch": 0.5194047797985871, + "flos": 20374960775040.0, + "grad_norm": 10.383115079261245, + "language_loss": 0.82125068, + "learning_rate": 1.971375543740272e-06, + "loss": 0.83748162, + "num_input_tokens_seen": 185665220, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.2668457, + "step": 8639, + "time_per_iteration": 2.6569623947143555 + }, + { + "auxiliary_loss_clip": 0.01326287, + "auxiliary_loss_mlp": 0.00275685, + "balance_loss_clip": 1.09884977, + "balance_loss_mlp": 0.25123566, + "epoch": 0.519464903051255, + "flos": 24353001135360.0, + "grad_norm": 2.1845887131679733, + "language_loss": 0.85249388, + "learning_rate": 1.9709861224946665e-06, + "loss": 0.86851358, + "num_input_tokens_seen": 185683750, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.24462891, + "step": 8640, + "time_per_iteration": 2.6652920246124268 + }, + { + "auxiliary_loss_clip": 0.01329222, + "auxiliary_loss_mlp": 0.00284524, + "balance_loss_clip": 1.10180306, + "balance_loss_mlp": 0.25788069, + "epoch": 0.519525026303923, + "flos": 14061829161600.0, + "grad_norm": 3.695166871587914, + "language_loss": 0.74391454, + "learning_rate": 1.97059670234927e-06, + "loss": 0.76005203, + "num_input_tokens_seen": 185700625, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.26647949, + "step": 8641, + "time_per_iteration": 2.6432716846466064 + }, + { + "auxiliary_loss_clip": 0.0132721, + "auxiliary_loss_mlp": 0.00269582, + "balance_loss_clip": 1.0995307, + "balance_loss_mlp": 0.24540612, + "epoch": 0.519585149556591, + "flos": 28835873193600.0, + "grad_norm": 2.008280157714009, + "language_loss": 0.81664902, + "learning_rate": 1.97020728331885e-06, + "loss": 0.83261693, + "num_input_tokens_seen": 185721155, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.24157715, + "step": 8642, + "time_per_iteration": 2.703280448913574 + }, + { + "auxiliary_loss_clip": 0.0133184, + "auxiliary_loss_mlp": 0.00256099, + "balance_loss_clip": 1.09958887, + "balance_loss_mlp": 0.22872832, + "epoch": 0.519645272809259, + "flos": 25373007648000.0, + "grad_norm": 8.01486399021342, + "language_loss": 0.88801515, + "learning_rate": 1.9698178654181726e-06, + "loss": 0.90389454, + "num_input_tokens_seen": 185740990, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.27355957, + "step": 8643, + "time_per_iteration": 2.7444188594818115 + }, + { + "auxiliary_loss_clip": 0.01320405, + "auxiliary_loss_mlp": 0.0028674, + "balance_loss_clip": 1.08792222, + "balance_loss_mlp": 0.25896436, + "epoch": 0.519705396061927, + "flos": 25372863993600.0, + "grad_norm": 7.906198530643072, + "language_loss": 0.77309787, + "learning_rate": 1.969428448662004e-06, + "loss": 0.78916931, + "num_input_tokens_seen": 185762235, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.27783203, + "step": 8644, + "time_per_iteration": 2.6926310062408447 + }, + { + "auxiliary_loss_clip": 0.0131511, + "auxiliary_loss_mlp": 0.00275494, + "balance_loss_clip": 1.08808994, + "balance_loss_mlp": 0.24968541, + "epoch": 0.5197655193145949, + "flos": 28476228268800.0, + "grad_norm": 14.759422163629875, + "language_loss": 0.87389207, + "learning_rate": 1.9690390330651133e-06, + "loss": 0.88979816, + "num_input_tokens_seen": 185783415, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.2578125, + "step": 8645, + "time_per_iteration": 2.7297756671905518 + }, + { + "auxiliary_loss_clip": 0.01330962, + "auxiliary_loss_mlp": 0.00269505, + "balance_loss_clip": 1.09778416, + "balance_loss_mlp": 0.24263524, + "epoch": 0.5198256425672629, + "flos": 20009138711040.0, + "grad_norm": 124.36435583925469, + "language_loss": 0.85033131, + "learning_rate": 1.968649618642264e-06, + "loss": 0.86633599, + "num_input_tokens_seen": 185801345, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.2689209, + "step": 8646, + "time_per_iteration": 2.598688840866089 + }, + { + "auxiliary_loss_clip": 0.01320291, + "auxiliary_loss_mlp": 0.0028153, + "balance_loss_clip": 1.0899117, + "balance_loss_mlp": 0.25550711, + "epoch": 0.5198857658199308, + "flos": 19828867328640.0, + "grad_norm": 75.23612504362386, + "language_loss": 0.75689185, + "learning_rate": 1.9682602054082252e-06, + "loss": 0.77291006, + "num_input_tokens_seen": 185820815, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.26037598, + "step": 8647, + "time_per_iteration": 2.6993558406829834 + }, + { + "auxiliary_loss_clip": 0.01333319, + "auxiliary_loss_mlp": 0.00289904, + "balance_loss_clip": 1.09821343, + "balance_loss_mlp": 0.26072192, + "epoch": 0.5199458890725989, + "flos": 24461918150400.0, + "grad_norm": 6.868184580827574, + "language_loss": 0.80654681, + "learning_rate": 1.967870793377763e-06, + "loss": 0.82277906, + "num_input_tokens_seen": 185841450, + "router_z_loss_clip": 2.34960938, + "router_z_loss_mlp": 0.29199219, + "step": 8648, + "time_per_iteration": 2.750157117843628 + }, + { + "auxiliary_loss_clip": 0.0133195, + "auxiliary_loss_mlp": 0.00270392, + "balance_loss_clip": 1.09645188, + "balance_loss_mlp": 0.24306932, + "epoch": 0.5200060123252668, + "flos": 23404779953280.0, + "grad_norm": 9.227708734424787, + "language_loss": 0.72854418, + "learning_rate": 1.967481382565642e-06, + "loss": 0.74456751, + "num_input_tokens_seen": 185859935, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.27331543, + "step": 8649, + "time_per_iteration": 2.6890242099761963 + }, + { + "auxiliary_loss_clip": 0.01331341, + "auxiliary_loss_mlp": 0.00295073, + "balance_loss_clip": 1.0915643, + "balance_loss_mlp": 0.26750013, + "epoch": 0.5200661355779348, + "flos": 17201355454080.0, + "grad_norm": 28.20984074304388, + "language_loss": 0.77995646, + "learning_rate": 1.9670919729866315e-06, + "loss": 0.79622054, + "num_input_tokens_seen": 185876795, + "router_z_loss_clip": 2.40234375, + "router_z_loss_mlp": 0.27563477, + "step": 8650, + "time_per_iteration": 2.652677536010742 + }, + { + "auxiliary_loss_clip": 0.01307727, + "auxiliary_loss_mlp": 0.00237762, + "balance_loss_clip": 1.08314157, + "balance_loss_mlp": 0.21214405, + "epoch": 0.5201262588306027, + "flos": 18515075477760.0, + "grad_norm": 19.9176673921206, + "language_loss": 0.841824, + "learning_rate": 1.966702564655496e-06, + "loss": 0.85727882, + "num_input_tokens_seen": 185895570, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.25622559, + "step": 8651, + "time_per_iteration": 2.6604502201080322 + }, + { + "auxiliary_loss_clip": 0.01341847, + "auxiliary_loss_mlp": 0.00267777, + "balance_loss_clip": 1.10352433, + "balance_loss_mlp": 0.23826081, + "epoch": 0.5201863820832707, + "flos": 18619395552000.0, + "grad_norm": 18.031691800563078, + "language_loss": 0.87126696, + "learning_rate": 1.966313157587003e-06, + "loss": 0.8873632, + "num_input_tokens_seen": 185913700, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.29528809, + "step": 8652, + "time_per_iteration": 2.6407690048217773 + }, + { + "auxiliary_loss_clip": 0.01343781, + "auxiliary_loss_mlp": 0.00280148, + "balance_loss_clip": 1.10976744, + "balance_loss_mlp": 0.25340983, + "epoch": 0.5202465053359386, + "flos": 22857142222080.0, + "grad_norm": 6.7838242760156735, + "language_loss": 0.76326478, + "learning_rate": 1.9659237517959187e-06, + "loss": 0.77950406, + "num_input_tokens_seen": 185932460, + "router_z_loss_clip": 2.33789062, + "router_z_loss_mlp": 0.26745605, + "step": 8653, + "time_per_iteration": 2.62227463722229 + }, + { + "auxiliary_loss_clip": 0.01346852, + "auxiliary_loss_mlp": 0.00267944, + "balance_loss_clip": 1.10838294, + "balance_loss_mlp": 0.24010868, + "epoch": 0.5203066285886067, + "flos": 21981532383360.0, + "grad_norm": 29.01737080117579, + "language_loss": 0.8490212, + "learning_rate": 1.965534347297008e-06, + "loss": 0.86516911, + "num_input_tokens_seen": 185952030, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.27807617, + "step": 8654, + "time_per_iteration": 2.7160770893096924 + }, + { + "auxiliary_loss_clip": 0.01349459, + "auxiliary_loss_mlp": 0.00278441, + "balance_loss_clip": 1.11193109, + "balance_loss_mlp": 0.2502484, + "epoch": 0.5203667518412746, + "flos": 20233329448320.0, + "grad_norm": 41.97198194775774, + "language_loss": 0.88920355, + "learning_rate": 1.9651449441050393e-06, + "loss": 0.90548253, + "num_input_tokens_seen": 185973130, + "router_z_loss_clip": 2.37695312, + "router_z_loss_mlp": 0.28210449, + "step": 8655, + "time_per_iteration": 2.7061166763305664 + }, + { + "auxiliary_loss_clip": 0.01343719, + "auxiliary_loss_mlp": 0.002599, + "balance_loss_clip": 1.11316657, + "balance_loss_mlp": 0.23596242, + "epoch": 0.5204268750939426, + "flos": 15705460627200.0, + "grad_norm": 2.904460162457369, + "language_loss": 0.73942006, + "learning_rate": 1.9647555422347777e-06, + "loss": 0.75545621, + "num_input_tokens_seen": 185990200, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.23962402, + "step": 8656, + "time_per_iteration": 2.593651056289673 + }, + { + "auxiliary_loss_clip": 0.01334249, + "auxiliary_loss_mlp": 0.00263116, + "balance_loss_clip": 1.10126746, + "balance_loss_mlp": 0.23622289, + "epoch": 0.5204869983466105, + "flos": 27449469999360.0, + "grad_norm": 3.3974440960562724, + "language_loss": 0.79630548, + "learning_rate": 1.9643661417009893e-06, + "loss": 0.81227916, + "num_input_tokens_seen": 186009880, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.26879883, + "step": 8657, + "time_per_iteration": 2.6855216026306152 + }, + { + "auxiliary_loss_clip": 0.01328123, + "auxiliary_loss_mlp": 0.00248779, + "balance_loss_clip": 1.0977577, + "balance_loss_mlp": 0.21857163, + "epoch": 0.5205471215992785, + "flos": 20595452411520.0, + "grad_norm": 6.441722959114923, + "language_loss": 0.78776133, + "learning_rate": 1.9639767425184408e-06, + "loss": 0.8035304, + "num_input_tokens_seen": 186026680, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.30200195, + "step": 8658, + "time_per_iteration": 2.6385529041290283 + }, + { + "auxiliary_loss_clip": 0.01345539, + "auxiliary_loss_mlp": 0.00268381, + "balance_loss_clip": 1.11105633, + "balance_loss_mlp": 0.24102271, + "epoch": 0.5206072448519465, + "flos": 22127904305280.0, + "grad_norm": 722.5326566862364, + "language_loss": 0.91238868, + "learning_rate": 1.963587344701897e-06, + "loss": 0.92852795, + "num_input_tokens_seen": 186046920, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.27380371, + "step": 8659, + "time_per_iteration": 2.630228042602539 + }, + { + "auxiliary_loss_clip": 0.01360091, + "auxiliary_loss_mlp": 0.00247118, + "balance_loss_clip": 1.11424279, + "balance_loss_mlp": 0.21689895, + "epoch": 0.5206673681046144, + "flos": 18330422636160.0, + "grad_norm": 388.6066750737373, + "language_loss": 0.84363616, + "learning_rate": 1.9631979482661253e-06, + "loss": 0.85970831, + "num_input_tokens_seen": 186062090, + "router_z_loss_clip": 2.4609375, + "router_z_loss_mlp": 0.30212402, + "step": 8660, + "time_per_iteration": 2.616302013397217 + }, + { + "auxiliary_loss_clip": 0.01356633, + "auxiliary_loss_mlp": 0.00248476, + "balance_loss_clip": 1.11729383, + "balance_loss_mlp": 0.22297713, + "epoch": 0.5207274913572825, + "flos": 20230240878720.0, + "grad_norm": 2.6026656518952236, + "language_loss": 0.868016, + "learning_rate": 1.9628085532258906e-06, + "loss": 0.88406712, + "num_input_tokens_seen": 186081135, + "router_z_loss_clip": 2.39453125, + "router_z_loss_mlp": 0.25488281, + "step": 8661, + "time_per_iteration": 2.6394431591033936 + }, + { + "auxiliary_loss_clip": 0.0135162, + "auxiliary_loss_mlp": 0.00241473, + "balance_loss_clip": 1.11457705, + "balance_loss_mlp": 0.21610576, + "epoch": 0.5207876146099504, + "flos": 22127042378880.0, + "grad_norm": 3.305803864250956, + "language_loss": 0.7846961, + "learning_rate": 1.9624191595959603e-06, + "loss": 0.80062705, + "num_input_tokens_seen": 186099700, + "router_z_loss_clip": 2.37109375, + "router_z_loss_mlp": 0.25402832, + "step": 8662, + "time_per_iteration": 2.699007749557495 + }, + { + "auxiliary_loss_clip": 0.01339224, + "auxiliary_loss_mlp": 0.00237204, + "balance_loss_clip": 1.1102953, + "balance_loss_mlp": 0.21090662, + "epoch": 0.5208477378626184, + "flos": 23878908501120.0, + "grad_norm": 142.7526016661227, + "language_loss": 0.74581826, + "learning_rate": 1.962029767391098e-06, + "loss": 0.76158261, + "num_input_tokens_seen": 186119740, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.26318359, + "step": 8663, + "time_per_iteration": 2.723644971847534 + }, + { + "auxiliary_loss_clip": 0.01344352, + "auxiliary_loss_mlp": 0.00235474, + "balance_loss_clip": 1.11390424, + "balance_loss_mlp": 0.21051157, + "epoch": 0.5209078611152863, + "flos": 20961525870720.0, + "grad_norm": 150.70238817600153, + "language_loss": 0.83769596, + "learning_rate": 1.961640376626072e-06, + "loss": 0.85349417, + "num_input_tokens_seen": 186140645, + "router_z_loss_clip": 2.30273438, + "router_z_loss_mlp": 0.24963379, + "step": 8664, + "time_per_iteration": 2.6786653995513916 + }, + { + "auxiliary_loss_clip": 0.01345337, + "auxiliary_loss_mlp": 0.00230667, + "balance_loss_clip": 1.11205852, + "balance_loss_mlp": 0.20243812, + "epoch": 0.5209679843679543, + "flos": 20667740532480.0, + "grad_norm": 8.020126493434178, + "language_loss": 0.86995244, + "learning_rate": 1.961250987315646e-06, + "loss": 0.8857125, + "num_input_tokens_seen": 186160130, + "router_z_loss_clip": 2.3359375, + "router_z_loss_mlp": 0.2824707, + "step": 8665, + "time_per_iteration": 2.6777002811431885 + }, + { + "auxiliary_loss_clip": 0.0133407, + "auxiliary_loss_mlp": 0.00223089, + "balance_loss_clip": 1.10759115, + "balance_loss_mlp": 0.19610025, + "epoch": 0.5210281076206222, + "flos": 20227295963520.0, + "grad_norm": 3.7044953202039075, + "language_loss": 0.79898208, + "learning_rate": 1.960861599474586e-06, + "loss": 0.81455374, + "num_input_tokens_seen": 186179485, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.27001953, + "step": 8666, + "time_per_iteration": 2.696317434310913 + }, + { + "auxiliary_loss_clip": 0.01373494, + "auxiliary_loss_mlp": 0.00257765, + "balance_loss_clip": 1.12242496, + "balance_loss_mlp": 0.22746195, + "epoch": 0.5210882308732903, + "flos": 16069989801600.0, + "grad_norm": 43.99100846671773, + "language_loss": 0.82994658, + "learning_rate": 1.9604722131176592e-06, + "loss": 0.84625918, + "num_input_tokens_seen": 186197140, + "router_z_loss_clip": 2.50976562, + "router_z_loss_mlp": 0.30273438, + "step": 8667, + "time_per_iteration": 2.609222412109375 + }, + { + "auxiliary_loss_clip": 0.01337438, + "auxiliary_loss_mlp": 0.00209903, + "balance_loss_clip": 1.11275315, + "balance_loss_mlp": 0.18461896, + "epoch": 0.5211483541259582, + "flos": 24825298089600.0, + "grad_norm": 23.646137416574632, + "language_loss": 0.86215454, + "learning_rate": 1.960082828259629e-06, + "loss": 0.87762797, + "num_input_tokens_seen": 186216800, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.25317383, + "step": 8668, + "time_per_iteration": 2.6683850288391113 + }, + { + "auxiliary_loss_clip": 0.01332732, + "auxiliary_loss_mlp": 0.002163, + "balance_loss_clip": 1.10451841, + "balance_loss_mlp": 0.18925159, + "epoch": 0.5212084773786262, + "flos": 20370651143040.0, + "grad_norm": 203.97292417911447, + "language_loss": 0.69784021, + "learning_rate": 1.9596934449152623e-06, + "loss": 0.71333051, + "num_input_tokens_seen": 186235320, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.27050781, + "step": 8669, + "time_per_iteration": 2.6489346027374268 + }, + { + "auxiliary_loss_clip": 0.01327352, + "auxiliary_loss_mlp": 0.00198369, + "balance_loss_clip": 1.09965086, + "balance_loss_mlp": 0.17146379, + "epoch": 0.5212686006312941, + "flos": 23145468693120.0, + "grad_norm": 75.71801169339555, + "language_loss": 0.74306011, + "learning_rate": 1.959304063099325e-06, + "loss": 0.75831735, + "num_input_tokens_seen": 186254460, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.26904297, + "step": 8670, + "time_per_iteration": 2.6310510635375977 + }, + { + "auxiliary_loss_clip": 0.01334693, + "auxiliary_loss_mlp": 0.0019236, + "balance_loss_clip": 1.10298765, + "balance_loss_mlp": 0.16560954, + "epoch": 0.5213287238839621, + "flos": 27774030314880.0, + "grad_norm": 40.328554288468744, + "language_loss": 0.84172249, + "learning_rate": 1.9589146828265806e-06, + "loss": 0.85699302, + "num_input_tokens_seen": 186269465, + "router_z_loss_clip": 2.31835938, + "router_z_loss_mlp": 0.26757812, + "step": 8671, + "time_per_iteration": 4.065420389175415 + }, + { + "auxiliary_loss_clip": 0.01348031, + "auxiliary_loss_mlp": 0.00211138, + "balance_loss_clip": 1.11468267, + "balance_loss_mlp": 0.18360074, + "epoch": 0.5213888471366301, + "flos": 19937676602880.0, + "grad_norm": 3.119627668983307, + "language_loss": 0.86073267, + "learning_rate": 1.958525304111796e-06, + "loss": 0.87632442, + "num_input_tokens_seen": 186288660, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.27563477, + "step": 8672, + "time_per_iteration": 2.610459327697754 + }, + { + "auxiliary_loss_clip": 0.01328159, + "auxiliary_loss_mlp": 0.00188954, + "balance_loss_clip": 1.09999084, + "balance_loss_mlp": 0.16377747, + "epoch": 0.521448970389298, + "flos": 16982731324800.0, + "grad_norm": 11.159418348510965, + "language_loss": 0.79635811, + "learning_rate": 1.958135926969736e-06, + "loss": 0.81152928, + "num_input_tokens_seen": 186305760, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.25183105, + "step": 8673, + "time_per_iteration": 2.6128013134002686 + }, + { + "auxiliary_loss_clip": 0.01315958, + "auxiliary_loss_mlp": 0.0019576, + "balance_loss_clip": 1.09327936, + "balance_loss_mlp": 0.16973692, + "epoch": 0.5215090936419661, + "flos": 18989706816000.0, + "grad_norm": 48.453941433515695, + "language_loss": 0.83879066, + "learning_rate": 1.957746551415166e-06, + "loss": 0.85390788, + "num_input_tokens_seen": 186324135, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.26037598, + "step": 8674, + "time_per_iteration": 4.13818621635437 + }, + { + "auxiliary_loss_clip": 0.01325245, + "auxiliary_loss_mlp": 0.00209546, + "balance_loss_clip": 1.09556389, + "balance_loss_mlp": 0.18104318, + "epoch": 0.521569216894634, + "flos": 16143427157760.0, + "grad_norm": 43.44135468723662, + "language_loss": 0.95735836, + "learning_rate": 1.9573571774628506e-06, + "loss": 0.9727062, + "num_input_tokens_seen": 186340205, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.28503418, + "step": 8675, + "time_per_iteration": 4.026090860366821 + }, + { + "auxiliary_loss_clip": 0.01415097, + "auxiliary_loss_mlp": 0.00088299, + "balance_loss_clip": 1.26795864, + "balance_loss_mlp": 0.08028814, + "epoch": 0.521629340147302, + "flos": 57579493282560.0, + "grad_norm": 0.8375249121178142, + "language_loss": 0.62444764, + "learning_rate": 1.9569678051275556e-06, + "loss": 0.63948154, + "num_input_tokens_seen": 186396940, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.08007812, + "step": 8676, + "time_per_iteration": 3.1050665378570557 + }, + { + "auxiliary_loss_clip": 0.0131493, + "auxiliary_loss_mlp": 0.00193287, + "balance_loss_clip": 1.09168291, + "balance_loss_mlp": 0.16688192, + "epoch": 0.5216894633999699, + "flos": 26796901662720.0, + "grad_norm": 75.79667921473555, + "language_loss": 0.74274182, + "learning_rate": 1.956578434424046e-06, + "loss": 0.75782394, + "num_input_tokens_seen": 186418680, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.26428223, + "step": 8677, + "time_per_iteration": 2.8021066188812256 + }, + { + "auxiliary_loss_clip": 0.01325053, + "auxiliary_loss_mlp": 0.00220469, + "balance_loss_clip": 1.09944046, + "balance_loss_mlp": 0.19288382, + "epoch": 0.5217495866526379, + "flos": 26358719650560.0, + "grad_norm": 46.736778028873715, + "language_loss": 0.74167812, + "learning_rate": 1.956189065367086e-06, + "loss": 0.75713331, + "num_input_tokens_seen": 186438265, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.27587891, + "step": 8678, + "time_per_iteration": 2.7071902751922607 + }, + { + "auxiliary_loss_clip": 0.01333382, + "auxiliary_loss_mlp": 0.00209895, + "balance_loss_clip": 1.09776151, + "balance_loss_mlp": 0.18259603, + "epoch": 0.5218097099053058, + "flos": 23584009841280.0, + "grad_norm": 4.296233952765604, + "language_loss": 0.79171014, + "learning_rate": 1.9557996979714414e-06, + "loss": 0.80714285, + "num_input_tokens_seen": 186456870, + "router_z_loss_clip": 2.35546875, + "router_z_loss_mlp": 0.27282715, + "step": 8679, + "time_per_iteration": 2.683425188064575 + }, + { + "auxiliary_loss_clip": 0.01315988, + "auxiliary_loss_mlp": 0.00197296, + "balance_loss_clip": 1.09322333, + "balance_loss_mlp": 0.17178479, + "epoch": 0.5218698331579739, + "flos": 18077396256000.0, + "grad_norm": 2.477452910342753, + "language_loss": 0.73225158, + "learning_rate": 1.9554103322518764e-06, + "loss": 0.74738443, + "num_input_tokens_seen": 186476425, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.25524902, + "step": 8680, + "time_per_iteration": 4.14244532585144 + }, + { + "auxiliary_loss_clip": 0.01325991, + "auxiliary_loss_mlp": 0.0020823, + "balance_loss_clip": 1.09845281, + "balance_loss_mlp": 0.18256426, + "epoch": 0.5219299564106418, + "flos": 19281121856640.0, + "grad_norm": 29.43441058207596, + "language_loss": 0.89221746, + "learning_rate": 1.955020968223156e-06, + "loss": 0.90755963, + "num_input_tokens_seen": 186492555, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.25671387, + "step": 8681, + "time_per_iteration": 2.6387033462524414 + }, + { + "auxiliary_loss_clip": 0.0131727, + "auxiliary_loss_mlp": 0.00216905, + "balance_loss_clip": 1.09284484, + "balance_loss_mlp": 0.19151306, + "epoch": 0.5219900796633098, + "flos": 26651355753600.0, + "grad_norm": 2.8139530623940776, + "language_loss": 0.85018206, + "learning_rate": 1.9546316059000454e-06, + "loss": 0.86552387, + "num_input_tokens_seen": 186513190, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.25402832, + "step": 8682, + "time_per_iteration": 2.7209036350250244 + }, + { + "auxiliary_loss_clip": 0.01312689, + "auxiliary_loss_mlp": 0.00199382, + "balance_loss_clip": 1.09159899, + "balance_loss_mlp": 0.1727742, + "epoch": 0.5220502029159777, + "flos": 34312717382400.0, + "grad_norm": 92.37076792486904, + "language_loss": 0.76979762, + "learning_rate": 1.9542422452973082e-06, + "loss": 0.78491831, + "num_input_tokens_seen": 186534830, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.26599121, + "step": 8683, + "time_per_iteration": 2.7684600353240967 + }, + { + "auxiliary_loss_clip": 0.01327299, + "auxiliary_loss_mlp": 0.00232672, + "balance_loss_clip": 1.0982933, + "balance_loss_mlp": 0.20606408, + "epoch": 0.5221103261686457, + "flos": 22156488552960.0, + "grad_norm": 48.751284751979725, + "language_loss": 0.83300543, + "learning_rate": 1.9538528864297104e-06, + "loss": 0.84860516, + "num_input_tokens_seen": 186554390, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.26647949, + "step": 8684, + "time_per_iteration": 2.708794593811035 + }, + { + "auxiliary_loss_clip": 0.0132296, + "auxiliary_loss_mlp": 0.00187055, + "balance_loss_clip": 1.10009742, + "balance_loss_mlp": 0.16049474, + "epoch": 0.5221704494213137, + "flos": 19208402772480.0, + "grad_norm": 32.05160695449131, + "language_loss": 0.84027565, + "learning_rate": 1.9534635293120153e-06, + "loss": 0.85537583, + "num_input_tokens_seen": 186572360, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.26550293, + "step": 8685, + "time_per_iteration": 2.601736307144165 + }, + { + "auxiliary_loss_clip": 0.01325874, + "auxiliary_loss_mlp": 0.00195384, + "balance_loss_clip": 1.1003648, + "balance_loss_mlp": 0.16838267, + "epoch": 0.5222305726739817, + "flos": 19354056422400.0, + "grad_norm": 112.86295572581314, + "language_loss": 0.87520564, + "learning_rate": 1.9530741739589876e-06, + "loss": 0.89041823, + "num_input_tokens_seen": 186590655, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.27001953, + "step": 8686, + "time_per_iteration": 2.687774658203125 + }, + { + "auxiliary_loss_clip": 0.01305218, + "auxiliary_loss_mlp": 0.00185695, + "balance_loss_clip": 1.08853447, + "balance_loss_mlp": 0.16115028, + "epoch": 0.5222906959266497, + "flos": 27814789272960.0, + "grad_norm": 2.4373293314642743, + "language_loss": 0.76165116, + "learning_rate": 1.9526848203853927e-06, + "loss": 0.77656031, + "num_input_tokens_seen": 186610345, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.24523926, + "step": 8687, + "time_per_iteration": 2.779982566833496 + }, + { + "auxiliary_loss_clip": 0.01311463, + "auxiliary_loss_mlp": 0.00183228, + "balance_loss_clip": 1.0891757, + "balance_loss_mlp": 0.15981495, + "epoch": 0.5223508191793176, + "flos": 12712988615040.0, + "grad_norm": 19.46395448881557, + "language_loss": 0.88042718, + "learning_rate": 1.9522954686059936e-06, + "loss": 0.89537406, + "num_input_tokens_seen": 186624360, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.23388672, + "step": 8688, + "time_per_iteration": 2.6443660259246826 + }, + { + "auxiliary_loss_clip": 0.01319173, + "auxiliary_loss_mlp": 0.00207103, + "balance_loss_clip": 1.09562826, + "balance_loss_mlp": 0.18200937, + "epoch": 0.5224109424319856, + "flos": 15632238752640.0, + "grad_norm": 73.111432870144, + "language_loss": 0.83745539, + "learning_rate": 1.9519061186355558e-06, + "loss": 0.85271823, + "num_input_tokens_seen": 186638680, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.25109863, + "step": 8689, + "time_per_iteration": 2.6356804370880127 + }, + { + "auxiliary_loss_clip": 0.0131317, + "auxiliary_loss_mlp": 0.00192671, + "balance_loss_clip": 1.09162784, + "balance_loss_mlp": 0.16829288, + "epoch": 0.5224710656846535, + "flos": 15742233175680.0, + "grad_norm": 4.623411373628884, + "language_loss": 0.90186602, + "learning_rate": 1.9515167704888417e-06, + "loss": 0.91692442, + "num_input_tokens_seen": 186655840, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.24365234, + "step": 8690, + "time_per_iteration": 2.6110692024230957 + }, + { + "auxiliary_loss_clip": 0.01326122, + "auxiliary_loss_mlp": 0.00189947, + "balance_loss_clip": 1.09833944, + "balance_loss_mlp": 0.16338761, + "epoch": 0.5225311889373215, + "flos": 26030998938240.0, + "grad_norm": 35.93993873469718, + "language_loss": 0.86371195, + "learning_rate": 1.9511274241806173e-06, + "loss": 0.87887263, + "num_input_tokens_seen": 186674150, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.26574707, + "step": 8691, + "time_per_iteration": 2.6973085403442383 + }, + { + "auxiliary_loss_clip": 0.01325739, + "auxiliary_loss_mlp": 0.0021363, + "balance_loss_clip": 1.09592378, + "balance_loss_mlp": 0.18667667, + "epoch": 0.5225913121899894, + "flos": 18369278173440.0, + "grad_norm": 9.528622182217601, + "language_loss": 0.86540395, + "learning_rate": 1.950738079725646e-06, + "loss": 0.88079762, + "num_input_tokens_seen": 186690675, + "router_z_loss_clip": 2.30078125, + "router_z_loss_mlp": 0.26953125, + "step": 8692, + "time_per_iteration": 2.6510980129241943 + }, + { + "auxiliary_loss_clip": 0.01317751, + "auxiliary_loss_mlp": 0.00210222, + "balance_loss_clip": 1.09712553, + "balance_loss_mlp": 0.18651089, + "epoch": 0.5226514354426575, + "flos": 29273516501760.0, + "grad_norm": 12.96252574507916, + "language_loss": 0.78472757, + "learning_rate": 1.950348737138691e-06, + "loss": 0.80000722, + "num_input_tokens_seen": 186710380, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.23730469, + "step": 8693, + "time_per_iteration": 2.7111778259277344 + }, + { + "auxiliary_loss_clip": 0.01327536, + "auxiliary_loss_mlp": 0.00218939, + "balance_loss_clip": 1.09343052, + "balance_loss_mlp": 0.19121101, + "epoch": 0.5227115586953254, + "flos": 22853299466880.0, + "grad_norm": 21.800559627578078, + "language_loss": 0.92531723, + "learning_rate": 1.949959396434517e-06, + "loss": 0.94078195, + "num_input_tokens_seen": 186729135, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.27697754, + "step": 8694, + "time_per_iteration": 2.7333521842956543 + }, + { + "auxiliary_loss_clip": 0.01392555, + "auxiliary_loss_mlp": 0.00083908, + "balance_loss_clip": 1.25312936, + "balance_loss_mlp": 0.07699391, + "epoch": 0.5227716819479934, + "flos": 57474419022720.0, + "grad_norm": 0.780223714155609, + "language_loss": 0.55480015, + "learning_rate": 1.949570057627888e-06, + "loss": 0.56956482, + "num_input_tokens_seen": 186791115, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.06933594, + "step": 8695, + "time_per_iteration": 3.1823086738586426 + }, + { + "auxiliary_loss_clip": 0.01327947, + "auxiliary_loss_mlp": 0.00197594, + "balance_loss_clip": 1.09720671, + "balance_loss_mlp": 0.17108151, + "epoch": 0.5228318052006613, + "flos": 13808264077440.0, + "grad_norm": 9.16178479085916, + "language_loss": 0.83771962, + "learning_rate": 1.9491807207335672e-06, + "loss": 0.85297501, + "num_input_tokens_seen": 186808660, + "router_z_loss_clip": 2.31054688, + "router_z_loss_mlp": 0.26501465, + "step": 8696, + "time_per_iteration": 2.6099448204040527 + }, + { + "auxiliary_loss_clip": 0.01320964, + "auxiliary_loss_mlp": 0.00227356, + "balance_loss_clip": 1.09249473, + "balance_loss_mlp": 0.20122558, + "epoch": 0.5228919284533293, + "flos": 15596184476160.0, + "grad_norm": 13.31309519057925, + "language_loss": 0.78740549, + "learning_rate": 1.948791385766319e-06, + "loss": 0.80288863, + "num_input_tokens_seen": 186825900, + "router_z_loss_clip": 2.28320312, + "router_z_loss_mlp": 0.26159668, + "step": 8697, + "time_per_iteration": 2.5982067584991455 + }, + { + "auxiliary_loss_clip": 0.01313294, + "auxiliary_loss_mlp": 0.00186173, + "balance_loss_clip": 1.09048426, + "balance_loss_mlp": 0.16195004, + "epoch": 0.5229520517059973, + "flos": 22491499726080.0, + "grad_norm": 560.6025877591582, + "language_loss": 0.88417006, + "learning_rate": 1.948402052740906e-06, + "loss": 0.89916468, + "num_input_tokens_seen": 186843735, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.24230957, + "step": 8698, + "time_per_iteration": 2.7050907611846924 + }, + { + "auxiliary_loss_clip": 0.01327765, + "auxiliary_loss_mlp": 0.00206542, + "balance_loss_clip": 1.10221386, + "balance_loss_mlp": 0.18161505, + "epoch": 0.5230121749586653, + "flos": 22090880361600.0, + "grad_norm": 355.3555848806379, + "language_loss": 0.80202156, + "learning_rate": 1.948012721672093e-06, + "loss": 0.81736469, + "num_input_tokens_seen": 186862440, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.24938965, + "step": 8699, + "time_per_iteration": 2.612898588180542 + }, + { + "auxiliary_loss_clip": 0.01324357, + "auxiliary_loss_mlp": 0.00213278, + "balance_loss_clip": 1.09351933, + "balance_loss_mlp": 0.1872662, + "epoch": 0.5230722982113333, + "flos": 22127150119680.0, + "grad_norm": 4.323121778295936, + "language_loss": 0.82875663, + "learning_rate": 1.947623392574642e-06, + "loss": 0.84413296, + "num_input_tokens_seen": 186880940, + "router_z_loss_clip": 2.30761719, + "router_z_loss_mlp": 0.26037598, + "step": 8700, + "time_per_iteration": 2.6153993606567383 + }, + { + "auxiliary_loss_clip": 0.01335241, + "auxiliary_loss_mlp": 0.00249071, + "balance_loss_clip": 1.10485768, + "balance_loss_mlp": 0.21992481, + "epoch": 0.5231324214640012, + "flos": 25009268572800.0, + "grad_norm": 25.98822799556201, + "language_loss": 0.77944636, + "learning_rate": 1.947234065463318e-06, + "loss": 0.79528952, + "num_input_tokens_seen": 186900785, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.29174805, + "step": 8701, + "time_per_iteration": 2.7400989532470703 + }, + { + "auxiliary_loss_clip": 0.01323692, + "auxiliary_loss_mlp": 0.00211021, + "balance_loss_clip": 1.10195017, + "balance_loss_mlp": 0.18552178, + "epoch": 0.5231925447166692, + "flos": 25740517651200.0, + "grad_norm": 19.39618331621996, + "language_loss": 0.72969341, + "learning_rate": 1.9468447403528826e-06, + "loss": 0.74504054, + "num_input_tokens_seen": 186920895, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.25488281, + "step": 8702, + "time_per_iteration": 2.7058589458465576 + }, + { + "auxiliary_loss_clip": 0.01333397, + "auxiliary_loss_mlp": 0.00225247, + "balance_loss_clip": 1.10530186, + "balance_loss_mlp": 0.19903252, + "epoch": 0.5232526679693371, + "flos": 21433930565760.0, + "grad_norm": 117.28917546995004, + "language_loss": 0.83249772, + "learning_rate": 1.946455417258101e-06, + "loss": 0.84808421, + "num_input_tokens_seen": 186940605, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.26257324, + "step": 8703, + "time_per_iteration": 2.6143510341644287 + }, + { + "auxiliary_loss_clip": 0.01342704, + "auxiliary_loss_mlp": 0.00236348, + "balance_loss_clip": 1.10890865, + "balance_loss_mlp": 0.2067845, + "epoch": 0.5233127912220051, + "flos": 35298393471360.0, + "grad_norm": 4.143081550041936, + "language_loss": 0.85688293, + "learning_rate": 1.9460660961937348e-06, + "loss": 0.87267345, + "num_input_tokens_seen": 186960820, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.2956543, + "step": 8704, + "time_per_iteration": 2.732206344604492 + }, + { + "auxiliary_loss_clip": 0.01326963, + "auxiliary_loss_mlp": 0.00214684, + "balance_loss_clip": 1.10362458, + "balance_loss_mlp": 0.19007912, + "epoch": 0.523372914474673, + "flos": 17051320344960.0, + "grad_norm": 59.2422699942105, + "language_loss": 0.84164059, + "learning_rate": 1.9456767771745474e-06, + "loss": 0.85705703, + "num_input_tokens_seen": 186976240, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.24597168, + "step": 8705, + "time_per_iteration": 2.574928045272827 + }, + { + "auxiliary_loss_clip": 0.0134797, + "auxiliary_loss_mlp": 0.00246633, + "balance_loss_clip": 1.10979605, + "balance_loss_mlp": 0.21774882, + "epoch": 0.5234330377273411, + "flos": 18406302117120.0, + "grad_norm": 25.688637777263935, + "language_loss": 0.77503526, + "learning_rate": 1.9452874602153027e-06, + "loss": 0.79098129, + "num_input_tokens_seen": 186992855, + "router_z_loss_clip": 2.3828125, + "router_z_loss_mlp": 0.28857422, + "step": 8706, + "time_per_iteration": 2.6038146018981934 + }, + { + "auxiliary_loss_clip": 0.01340569, + "auxiliary_loss_mlp": 0.00079417, + "balance_loss_clip": 1.20297647, + "balance_loss_mlp": 0.07317084, + "epoch": 0.523493160980009, + "flos": 65850296970240.0, + "grad_norm": 0.6706772288122443, + "language_loss": 0.51796836, + "learning_rate": 1.9448981453307623e-06, + "loss": 0.53216821, + "num_input_tokens_seen": 187051205, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.06225586, + "step": 8707, + "time_per_iteration": 3.1681737899780273 + }, + { + "auxiliary_loss_clip": 0.01333349, + "auxiliary_loss_mlp": 0.00223269, + "balance_loss_clip": 1.10696495, + "balance_loss_mlp": 0.19750786, + "epoch": 0.523553284232677, + "flos": 21872076664320.0, + "grad_norm": 12.923242881491706, + "language_loss": 0.81784856, + "learning_rate": 1.9445088325356904e-06, + "loss": 0.83341473, + "num_input_tokens_seen": 187070540, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.25805664, + "step": 8708, + "time_per_iteration": 2.6391568183898926 + }, + { + "auxiliary_loss_clip": 0.01356509, + "auxiliary_loss_mlp": 0.00210467, + "balance_loss_clip": 1.12159896, + "balance_loss_mlp": 0.18340613, + "epoch": 0.5236134074853449, + "flos": 20848191482880.0, + "grad_norm": 45.1590500030247, + "language_loss": 0.85032684, + "learning_rate": 1.944119521844849e-06, + "loss": 0.8659966, + "num_input_tokens_seen": 187089975, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.27075195, + "step": 8709, + "time_per_iteration": 2.6328024864196777 + }, + { + "auxiliary_loss_clip": 0.01368692, + "auxiliary_loss_mlp": 0.00241814, + "balance_loss_clip": 1.12428069, + "balance_loss_mlp": 0.2153497, + "epoch": 0.5236735307380129, + "flos": 25520421064320.0, + "grad_norm": 32.4974513325414, + "language_loss": 0.92962658, + "learning_rate": 1.9437302132730003e-06, + "loss": 0.94573164, + "num_input_tokens_seen": 187108775, + "router_z_loss_clip": 2.44140625, + "router_z_loss_mlp": 0.26489258, + "step": 8710, + "time_per_iteration": 2.6800832748413086 + }, + { + "auxiliary_loss_clip": 0.01359814, + "auxiliary_loss_mlp": 0.00233004, + "balance_loss_clip": 1.12522388, + "balance_loss_mlp": 0.20733847, + "epoch": 0.523733653990681, + "flos": 23583112001280.0, + "grad_norm": 134.67168824228617, + "language_loss": 0.78061938, + "learning_rate": 1.943340906834908e-06, + "loss": 0.79654759, + "num_input_tokens_seen": 187128830, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.2565918, + "step": 8711, + "time_per_iteration": 2.663094997406006 + }, + { + "auxiliary_loss_clip": 0.01338979, + "auxiliary_loss_mlp": 0.00214004, + "balance_loss_clip": 1.1119076, + "balance_loss_mlp": 0.18787345, + "epoch": 0.5237937772433489, + "flos": 21106245767040.0, + "grad_norm": 60.66558033643959, + "language_loss": 0.89702618, + "learning_rate": 1.9429516025453345e-06, + "loss": 0.91255599, + "num_input_tokens_seen": 187149570, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.26171875, + "step": 8712, + "time_per_iteration": 2.6427013874053955 + }, + { + "auxiliary_loss_clip": 0.01351896, + "auxiliary_loss_mlp": 0.00231871, + "balance_loss_clip": 1.11604965, + "balance_loss_mlp": 0.20341563, + "epoch": 0.5238539004960169, + "flos": 19172887200000.0, + "grad_norm": 9.895452196084216, + "language_loss": 0.76579171, + "learning_rate": 1.9425623004190415e-06, + "loss": 0.78162938, + "num_input_tokens_seen": 187170575, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.2845459, + "step": 8713, + "time_per_iteration": 2.6725943088531494 + }, + { + "auxiliary_loss_clip": 0.01373944, + "auxiliary_loss_mlp": 0.00229668, + "balance_loss_clip": 1.12626588, + "balance_loss_mlp": 0.20029525, + "epoch": 0.5239140237486848, + "flos": 17888218300800.0, + "grad_norm": 3.9677858357195626, + "language_loss": 0.88528204, + "learning_rate": 1.9421730004707925e-06, + "loss": 0.90131819, + "num_input_tokens_seen": 187187190, + "router_z_loss_clip": 2.4765625, + "router_z_loss_mlp": 0.29394531, + "step": 8714, + "time_per_iteration": 4.006772518157959 + }, + { + "auxiliary_loss_clip": 0.01368418, + "auxiliary_loss_mlp": 0.00238311, + "balance_loss_clip": 1.12667847, + "balance_loss_mlp": 0.21032029, + "epoch": 0.5239741470013528, + "flos": 17930413802880.0, + "grad_norm": 15.081194458382718, + "language_loss": 0.85069621, + "learning_rate": 1.9417837027153483e-06, + "loss": 0.86676347, + "num_input_tokens_seen": 187204350, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.27990723, + "step": 8715, + "time_per_iteration": 2.658881664276123 + }, + { + "auxiliary_loss_clip": 0.01352989, + "auxiliary_loss_mlp": 0.00250069, + "balance_loss_clip": 1.12219787, + "balance_loss_mlp": 0.22246048, + "epoch": 0.5240342702540207, + "flos": 30993386584320.0, + "grad_norm": 2.8988929167895066, + "language_loss": 0.7812162, + "learning_rate": 1.9413944071674723e-06, + "loss": 0.79724681, + "num_input_tokens_seen": 187225605, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.27600098, + "step": 8716, + "time_per_iteration": 4.17400336265564 + }, + { + "auxiliary_loss_clip": 0.01348233, + "auxiliary_loss_mlp": 0.00233037, + "balance_loss_clip": 1.11989963, + "balance_loss_mlp": 0.20621453, + "epoch": 0.5240943935066887, + "flos": 25005066681600.0, + "grad_norm": 92.438617378468, + "language_loss": 0.90824556, + "learning_rate": 1.941005113841926e-06, + "loss": 0.92405826, + "num_input_tokens_seen": 187241335, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.26818848, + "step": 8717, + "time_per_iteration": 2.6524624824523926 + }, + { + "auxiliary_loss_clip": 0.01346661, + "auxiliary_loss_mlp": 0.00225074, + "balance_loss_clip": 1.11614966, + "balance_loss_mlp": 0.19973008, + "epoch": 0.5241545167593566, + "flos": 23659099223040.0, + "grad_norm": 21.273435720812692, + "language_loss": 0.72295237, + "learning_rate": 1.9406158227534723e-06, + "loss": 0.73866963, + "num_input_tokens_seen": 187259925, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.25390625, + "step": 8718, + "time_per_iteration": 4.080482006072998 + }, + { + "auxiliary_loss_clip": 0.01357678, + "auxiliary_loss_mlp": 0.00237658, + "balance_loss_clip": 1.12095129, + "balance_loss_mlp": 0.21213478, + "epoch": 0.5242146400120247, + "flos": 23400398494080.0, + "grad_norm": 3.672402949561507, + "language_loss": 0.78463554, + "learning_rate": 1.940226533916872e-06, + "loss": 0.80058897, + "num_input_tokens_seen": 187279035, + "router_z_loss_clip": 2.37109375, + "router_z_loss_mlp": 0.25537109, + "step": 8719, + "time_per_iteration": 2.6590681076049805 + }, + { + "auxiliary_loss_clip": 0.01355005, + "auxiliary_loss_mlp": 0.00217973, + "balance_loss_clip": 1.12461805, + "balance_loss_mlp": 0.18963677, + "epoch": 0.5242747632646926, + "flos": 17749065012480.0, + "grad_norm": 9.324763420720776, + "language_loss": 0.81881428, + "learning_rate": 1.9398372473468877e-06, + "loss": 0.83454406, + "num_input_tokens_seen": 187297555, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.28356934, + "step": 8720, + "time_per_iteration": 2.6398894786834717 + }, + { + "auxiliary_loss_clip": 0.01345593, + "auxiliary_loss_mlp": 0.00222084, + "balance_loss_clip": 1.11266398, + "balance_loss_mlp": 0.19653791, + "epoch": 0.5243348865173606, + "flos": 32597731549440.0, + "grad_norm": 3.817981156370407, + "language_loss": 0.77106357, + "learning_rate": 1.939447963058281e-06, + "loss": 0.7867403, + "num_input_tokens_seen": 187320265, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.25549316, + "step": 8721, + "time_per_iteration": 2.7294511795043945 + }, + { + "auxiliary_loss_clip": 0.01347002, + "auxiliary_loss_mlp": 0.00221, + "balance_loss_clip": 1.11702609, + "balance_loss_mlp": 0.19535807, + "epoch": 0.5243950097700285, + "flos": 25484115392640.0, + "grad_norm": 402.7813240863664, + "language_loss": 0.92875111, + "learning_rate": 1.939058681065813e-06, + "loss": 0.94443119, + "num_input_tokens_seen": 187338045, + "router_z_loss_clip": 2.30078125, + "router_z_loss_mlp": 0.25683594, + "step": 8722, + "time_per_iteration": 4.183077335357666 + }, + { + "auxiliary_loss_clip": 0.0133803, + "auxiliary_loss_mlp": 0.00224152, + "balance_loss_clip": 1.11241031, + "balance_loss_mlp": 0.20079887, + "epoch": 0.5244551330226965, + "flos": 15268391936640.0, + "grad_norm": 25.626829760871253, + "language_loss": 0.86227071, + "learning_rate": 1.938669401384247e-06, + "loss": 0.87789255, + "num_input_tokens_seen": 187356040, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.23364258, + "step": 8723, + "time_per_iteration": 2.8015623092651367 + }, + { + "auxiliary_loss_clip": 0.01369921, + "auxiliary_loss_mlp": 0.00244082, + "balance_loss_clip": 1.13357532, + "balance_loss_mlp": 0.21468449, + "epoch": 0.5245152562753645, + "flos": 22237108629120.0, + "grad_norm": 7.717778653255853, + "language_loss": 0.8219986, + "learning_rate": 1.9382801240283426e-06, + "loss": 0.8381387, + "num_input_tokens_seen": 187374185, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.29394531, + "step": 8724, + "time_per_iteration": 2.6780712604522705 + }, + { + "auxiliary_loss_clip": 0.01346667, + "auxiliary_loss_mlp": 0.00226525, + "balance_loss_clip": 1.11286128, + "balance_loss_mlp": 0.19908254, + "epoch": 0.5245753795280325, + "flos": 29426460612480.0, + "grad_norm": 6.696353230952606, + "language_loss": 0.78242493, + "learning_rate": 1.9378908490128625e-06, + "loss": 0.79815686, + "num_input_tokens_seen": 187396640, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.27441406, + "step": 8725, + "time_per_iteration": 2.7072596549987793 + }, + { + "auxiliary_loss_clip": 0.0131993, + "auxiliary_loss_mlp": 0.00065955, + "balance_loss_clip": 1.16058195, + "balance_loss_mlp": 0.05808684, + "epoch": 0.5246355027807005, + "flos": 58834392785280.0, + "grad_norm": 0.756203945213419, + "language_loss": 0.55376494, + "learning_rate": 1.937501576352568e-06, + "loss": 0.56762385, + "num_input_tokens_seen": 187455945, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.07861328, + "step": 8726, + "time_per_iteration": 3.139814615249634 + }, + { + "auxiliary_loss_clip": 0.01300644, + "auxiliary_loss_mlp": 0.00059355, + "balance_loss_clip": 1.15308619, + "balance_loss_mlp": 0.05267959, + "epoch": 0.5246956260333684, + "flos": 64526592965760.0, + "grad_norm": 0.7848901735414096, + "language_loss": 0.5810051, + "learning_rate": 1.937112306062219e-06, + "loss": 0.59460509, + "num_input_tokens_seen": 187519975, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.06689453, + "step": 8727, + "time_per_iteration": 3.102858543395996 + }, + { + "auxiliary_loss_clip": 0.01336385, + "auxiliary_loss_mlp": 0.00216214, + "balance_loss_clip": 1.10584021, + "balance_loss_mlp": 0.18860546, + "epoch": 0.5247557492860364, + "flos": 24533631653760.0, + "grad_norm": 1.5709914728329955, + "language_loss": 0.76392782, + "learning_rate": 1.9367230381565786e-06, + "loss": 0.77945381, + "num_input_tokens_seen": 187541775, + "router_z_loss_clip": 2.30273438, + "router_z_loss_mlp": 0.27612305, + "step": 8728, + "time_per_iteration": 2.6917593479156494 + }, + { + "auxiliary_loss_clip": 0.01334322, + "auxiliary_loss_mlp": 0.00243609, + "balance_loss_clip": 1.10919285, + "balance_loss_mlp": 0.21727526, + "epoch": 0.5248158725387043, + "flos": 18806131382400.0, + "grad_norm": 8.960818321664098, + "language_loss": 0.74466473, + "learning_rate": 1.9363337726504062e-06, + "loss": 0.76044405, + "num_input_tokens_seen": 187560425, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.26379395, + "step": 8729, + "time_per_iteration": 2.6244232654571533 + }, + { + "auxiliary_loss_clip": 0.01333588, + "auxiliary_loss_mlp": 0.00219694, + "balance_loss_clip": 1.10598993, + "balance_loss_mlp": 0.19396845, + "epoch": 0.5248759957913723, + "flos": 20955851521920.0, + "grad_norm": 14.95197429593985, + "language_loss": 0.91056979, + "learning_rate": 1.935944509558464e-06, + "loss": 0.92610258, + "num_input_tokens_seen": 187579930, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.25708008, + "step": 8730, + "time_per_iteration": 2.70786714553833 + }, + { + "auxiliary_loss_clip": 0.01327262, + "auxiliary_loss_mlp": 0.00203052, + "balance_loss_clip": 1.10518098, + "balance_loss_mlp": 0.17653942, + "epoch": 0.5249361190440403, + "flos": 18660980522880.0, + "grad_norm": 28.233331842286848, + "language_loss": 0.86788523, + "learning_rate": 1.9355552488955125e-06, + "loss": 0.88318837, + "num_input_tokens_seen": 187595365, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.26525879, + "step": 8731, + "time_per_iteration": 2.6540987491607666 + }, + { + "auxiliary_loss_clip": 0.01309958, + "auxiliary_loss_mlp": 0.00210626, + "balance_loss_clip": 1.09281468, + "balance_loss_mlp": 0.18246858, + "epoch": 0.5249962422967083, + "flos": 24863327614080.0, + "grad_norm": 2.7186488987372743, + "language_loss": 0.90421367, + "learning_rate": 1.935165990676312e-06, + "loss": 0.91941953, + "num_input_tokens_seen": 187614715, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.28161621, + "step": 8732, + "time_per_iteration": 2.696241617202759 + }, + { + "auxiliary_loss_clip": 0.01317036, + "auxiliary_loss_mlp": 0.00215415, + "balance_loss_clip": 1.09330678, + "balance_loss_mlp": 0.18909329, + "epoch": 0.5250563655493762, + "flos": 15262681674240.0, + "grad_norm": 5.037251344658125, + "language_loss": 0.85128438, + "learning_rate": 1.9347767349156237e-06, + "loss": 0.86660892, + "num_input_tokens_seen": 187630745, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.26342773, + "step": 8733, + "time_per_iteration": 2.6373541355133057 + }, + { + "auxiliary_loss_clip": 0.01337072, + "auxiliary_loss_mlp": 0.00213757, + "balance_loss_clip": 1.11336803, + "balance_loss_mlp": 0.18765047, + "epoch": 0.5251164888020442, + "flos": 18625177641600.0, + "grad_norm": 9.983878801712631, + "language_loss": 0.8680768, + "learning_rate": 1.934387481628208e-06, + "loss": 0.8835851, + "num_input_tokens_seen": 187648200, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.2611084, + "step": 8734, + "time_per_iteration": 2.6244609355926514 + }, + { + "auxiliary_loss_clip": 0.01319768, + "auxiliary_loss_mlp": 0.00210871, + "balance_loss_clip": 1.09789991, + "balance_loss_mlp": 0.18656461, + "epoch": 0.5251766120547121, + "flos": 29710764760320.0, + "grad_norm": 5.141296155669662, + "language_loss": 0.82166135, + "learning_rate": 1.933998230828826e-06, + "loss": 0.83696771, + "num_input_tokens_seen": 187669205, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.2434082, + "step": 8735, + "time_per_iteration": 2.702881097793579 + }, + { + "auxiliary_loss_clip": 0.01308405, + "auxiliary_loss_mlp": 0.00201426, + "balance_loss_clip": 1.08949709, + "balance_loss_mlp": 0.17541437, + "epoch": 0.5252367353073801, + "flos": 23440295525760.0, + "grad_norm": 4.274709842780115, + "language_loss": 0.87163401, + "learning_rate": 1.9336089825322376e-06, + "loss": 0.88673222, + "num_input_tokens_seen": 187690890, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.26013184, + "step": 8736, + "time_per_iteration": 2.7621912956237793 + }, + { + "auxiliary_loss_clip": 0.01326712, + "auxiliary_loss_mlp": 0.00214621, + "balance_loss_clip": 1.10067904, + "balance_loss_mlp": 0.18921718, + "epoch": 0.5252968585600482, + "flos": 30810708990720.0, + "grad_norm": 22.883176476513633, + "language_loss": 0.78813553, + "learning_rate": 1.9332197367532033e-06, + "loss": 0.80354881, + "num_input_tokens_seen": 187713045, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.25415039, + "step": 8737, + "time_per_iteration": 2.741071939468384 + }, + { + "auxiliary_loss_clip": 0.01300036, + "auxiliary_loss_mlp": 0.00219492, + "balance_loss_clip": 1.07875156, + "balance_loss_mlp": 0.19238427, + "epoch": 0.5253569818127161, + "flos": 20628274464000.0, + "grad_norm": 28.46474572215583, + "language_loss": 0.85611677, + "learning_rate": 1.9328304935064833e-06, + "loss": 0.87131208, + "num_input_tokens_seen": 187733640, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.27087402, + "step": 8738, + "time_per_iteration": 2.6930294036865234 + }, + { + "auxiliary_loss_clip": 0.01339395, + "auxiliary_loss_mlp": 0.00069745, + "balance_loss_clip": 1.19836426, + "balance_loss_mlp": 0.06254484, + "epoch": 0.5254171050653841, + "flos": 63428695810560.0, + "grad_norm": 0.7327871542160603, + "language_loss": 0.54072988, + "learning_rate": 1.932441252806837e-06, + "loss": 0.55482125, + "num_input_tokens_seen": 187792930, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.07177734, + "step": 8739, + "time_per_iteration": 3.0960376262664795 + }, + { + "auxiliary_loss_clip": 0.01313929, + "auxiliary_loss_mlp": 0.00205333, + "balance_loss_clip": 1.0923388, + "balance_loss_mlp": 0.18035881, + "epoch": 0.525477228318052, + "flos": 34670782108800.0, + "grad_norm": 21.342601743047616, + "language_loss": 0.90241957, + "learning_rate": 1.9320520146690263e-06, + "loss": 0.9176122, + "num_input_tokens_seen": 187812495, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.24987793, + "step": 8740, + "time_per_iteration": 2.720768928527832 + }, + { + "auxiliary_loss_clip": 0.01306988, + "auxiliary_loss_mlp": 0.00208599, + "balance_loss_clip": 1.08940303, + "balance_loss_mlp": 0.18381497, + "epoch": 0.52553735157072, + "flos": 17930844766080.0, + "grad_norm": 6.525669603670679, + "language_loss": 0.77784121, + "learning_rate": 1.9316627791078093e-06, + "loss": 0.79299706, + "num_input_tokens_seen": 187829685, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.2479248, + "step": 8741, + "time_per_iteration": 2.6303632259368896 + }, + { + "auxiliary_loss_clip": 0.01314661, + "auxiliary_loss_mlp": 0.00211449, + "balance_loss_clip": 1.09276688, + "balance_loss_mlp": 0.18717843, + "epoch": 0.5255974748233879, + "flos": 9940864584960.0, + "grad_norm": 3.4285897445473528, + "language_loss": 0.7699582, + "learning_rate": 1.931273546137947e-06, + "loss": 0.78521931, + "num_input_tokens_seen": 187846495, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.24279785, + "step": 8742, + "time_per_iteration": 2.5922300815582275 + }, + { + "auxiliary_loss_clip": 0.01325774, + "auxiliary_loss_mlp": 0.00229138, + "balance_loss_clip": 1.09527564, + "balance_loss_mlp": 0.20136176, + "epoch": 0.5256575980760559, + "flos": 16868427269760.0, + "grad_norm": 59.676126771846896, + "language_loss": 0.70970088, + "learning_rate": 1.9308843157741983e-06, + "loss": 0.72525001, + "num_input_tokens_seen": 187862010, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.27758789, + "step": 8743, + "time_per_iteration": 2.5665974617004395 + }, + { + "auxiliary_loss_clip": 0.01289776, + "auxiliary_loss_mlp": 0.00047819, + "balance_loss_clip": 1.14588594, + "balance_loss_mlp": 0.04188281, + "epoch": 0.5257177213287239, + "flos": 62386210362240.0, + "grad_norm": 0.7840215671527526, + "language_loss": 0.53719765, + "learning_rate": 1.930495088031323e-06, + "loss": 0.55057359, + "num_input_tokens_seen": 187922730, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.05932617, + "step": 8744, + "time_per_iteration": 3.2221949100494385 + }, + { + "auxiliary_loss_clip": 0.01324954, + "auxiliary_loss_mlp": 0.00228265, + "balance_loss_clip": 1.09803104, + "balance_loss_mlp": 0.20159821, + "epoch": 0.5257778445813919, + "flos": 20776908942720.0, + "grad_norm": 6.386966417305812, + "language_loss": 0.85954452, + "learning_rate": 1.9301058629240814e-06, + "loss": 0.87507677, + "num_input_tokens_seen": 187940160, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.26672363, + "step": 8745, + "time_per_iteration": 2.635817766189575 + }, + { + "auxiliary_loss_clip": 0.0130458, + "auxiliary_loss_mlp": 0.00198299, + "balance_loss_clip": 1.08985686, + "balance_loss_mlp": 0.17389637, + "epoch": 0.5258379678340598, + "flos": 17018606033280.0, + "grad_norm": 2.0103218463379555, + "language_loss": 0.89547968, + "learning_rate": 1.9297166404672324e-06, + "loss": 0.91050851, + "num_input_tokens_seen": 187958625, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.24401855, + "step": 8746, + "time_per_iteration": 2.622610330581665 + }, + { + "auxiliary_loss_clip": 0.01308708, + "auxiliary_loss_mlp": 0.002232, + "balance_loss_clip": 1.09442997, + "balance_loss_mlp": 0.19735497, + "epoch": 0.5258980910867278, + "flos": 21068754946560.0, + "grad_norm": 19.628025074882057, + "language_loss": 0.82617676, + "learning_rate": 1.9293274206755353e-06, + "loss": 0.84149575, + "num_input_tokens_seen": 187977575, + "router_z_loss_clip": 2.14355469, + "router_z_loss_mlp": 0.25854492, + "step": 8747, + "time_per_iteration": 2.884514093399048 + }, + { + "auxiliary_loss_clip": 0.01293754, + "auxiliary_loss_mlp": 0.0020459, + "balance_loss_clip": 1.08394909, + "balance_loss_mlp": 0.17880502, + "epoch": 0.5259582143393957, + "flos": 18004461690240.0, + "grad_norm": 6.308715782003099, + "language_loss": 0.89930069, + "learning_rate": 1.9289382035637505e-06, + "loss": 0.91428411, + "num_input_tokens_seen": 187996650, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.25805664, + "step": 8748, + "time_per_iteration": 2.874084234237671 + }, + { + "auxiliary_loss_clip": 0.01319613, + "auxiliary_loss_mlp": 0.00220982, + "balance_loss_clip": 1.09725881, + "balance_loss_mlp": 0.19375482, + "epoch": 0.5260183375920637, + "flos": 22783848520320.0, + "grad_norm": 9622.41238292116, + "language_loss": 0.90782845, + "learning_rate": 1.9285489891466345e-06, + "loss": 0.92323452, + "num_input_tokens_seen": 188013510, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.27258301, + "step": 8749, + "time_per_iteration": 2.6463942527770996 + }, + { + "auxiliary_loss_clip": 0.01307374, + "auxiliary_loss_mlp": 0.00199309, + "balance_loss_clip": 1.09408116, + "balance_loss_mlp": 0.17342886, + "epoch": 0.5260784608447318, + "flos": 27052406081280.0, + "grad_norm": 13.152606729273494, + "language_loss": 0.81330574, + "learning_rate": 1.9281597774389487e-06, + "loss": 0.8283726, + "num_input_tokens_seen": 188032085, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.25866699, + "step": 8750, + "time_per_iteration": 2.7072129249572754 + }, + { + "auxiliary_loss_clip": 0.0131179, + "auxiliary_loss_mlp": 0.00203344, + "balance_loss_clip": 1.09236836, + "balance_loss_mlp": 0.17748731, + "epoch": 0.5261385840973997, + "flos": 20662820369280.0, + "grad_norm": 11.01912012816676, + "language_loss": 0.82895911, + "learning_rate": 1.9277705684554517e-06, + "loss": 0.84411049, + "num_input_tokens_seen": 188050590, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.25866699, + "step": 8751, + "time_per_iteration": 2.6649913787841797 + }, + { + "auxiliary_loss_clip": 0.01314069, + "auxiliary_loss_mlp": 0.00203278, + "balance_loss_clip": 1.09868705, + "balance_loss_mlp": 0.17811325, + "epoch": 0.5261987073500677, + "flos": 23622649896960.0, + "grad_norm": 107.52906716512537, + "language_loss": 0.81325549, + "learning_rate": 1.927381362210902e-06, + "loss": 0.82842898, + "num_input_tokens_seen": 188071620, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.25170898, + "step": 8752, + "time_per_iteration": 2.7370588779449463 + }, + { + "auxiliary_loss_clip": 0.01321078, + "auxiliary_loss_mlp": 0.00194425, + "balance_loss_clip": 1.09640145, + "balance_loss_mlp": 0.16753156, + "epoch": 0.5262588306027356, + "flos": 27636241743360.0, + "grad_norm": 18.031455719705097, + "language_loss": 0.75468552, + "learning_rate": 1.926992158720058e-06, + "loss": 0.76984054, + "num_input_tokens_seen": 188091740, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.26904297, + "step": 8753, + "time_per_iteration": 2.7903401851654053 + }, + { + "auxiliary_loss_clip": 0.01314726, + "auxiliary_loss_mlp": 0.00196859, + "balance_loss_clip": 1.09936452, + "balance_loss_mlp": 0.17388692, + "epoch": 0.5263189538554036, + "flos": 21759711943680.0, + "grad_norm": 6.302425941081012, + "language_loss": 0.90206659, + "learning_rate": 1.9266029579976785e-06, + "loss": 0.91718245, + "num_input_tokens_seen": 188111165, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.22961426, + "step": 8754, + "time_per_iteration": 2.6750130653381348 + }, + { + "auxiliary_loss_clip": 0.01316509, + "auxiliary_loss_mlp": 0.00229513, + "balance_loss_clip": 1.09999478, + "balance_loss_mlp": 0.2027622, + "epoch": 0.5263790771080715, + "flos": 14276359140480.0, + "grad_norm": 5.739316509615888, + "language_loss": 0.95324349, + "learning_rate": 1.926213760058522e-06, + "loss": 0.96870363, + "num_input_tokens_seen": 188127825, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.26745605, + "step": 8755, + "time_per_iteration": 2.6240272521972656 + }, + { + "auxiliary_loss_clip": 0.0127263, + "auxiliary_loss_mlp": 0.00063407, + "balance_loss_clip": 1.12342298, + "balance_loss_mlp": 0.05751814, + "epoch": 0.5264392003607395, + "flos": 65806413528960.0, + "grad_norm": 0.7463553545142124, + "language_loss": 0.58253968, + "learning_rate": 1.9258245649173477e-06, + "loss": 0.59590006, + "num_input_tokens_seen": 188194050, + "router_z_loss_clip": 1.4921875, + "router_z_loss_mlp": 0.05883789, + "step": 8756, + "time_per_iteration": 4.598665714263916 + }, + { + "auxiliary_loss_clip": 0.01311866, + "auxiliary_loss_mlp": 0.00217664, + "balance_loss_clip": 1.0901134, + "balance_loss_mlp": 0.18888664, + "epoch": 0.5264993236134075, + "flos": 21032413361280.0, + "grad_norm": 9.475397611786136, + "language_loss": 0.78685123, + "learning_rate": 1.925435372588913e-06, + "loss": 0.80214655, + "num_input_tokens_seen": 188212565, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.2878418, + "step": 8757, + "time_per_iteration": 2.6966755390167236 + }, + { + "auxiliary_loss_clip": 0.01298627, + "auxiliary_loss_mlp": 0.00202254, + "balance_loss_clip": 1.0864588, + "balance_loss_mlp": 0.17872185, + "epoch": 0.5265594468660755, + "flos": 16618202150400.0, + "grad_norm": 358.94980271108506, + "language_loss": 0.95036775, + "learning_rate": 1.9250461830879768e-06, + "loss": 0.9653765, + "num_input_tokens_seen": 188229505, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.23535156, + "step": 8758, + "time_per_iteration": 4.054173469543457 + }, + { + "auxiliary_loss_clip": 0.01288395, + "auxiliary_loss_mlp": 0.00199501, + "balance_loss_clip": 1.07616758, + "balance_loss_mlp": 0.1743713, + "epoch": 0.5266195701187434, + "flos": 24134125610880.0, + "grad_norm": 6.842721422307652, + "language_loss": 0.83007544, + "learning_rate": 1.9246569964292965e-06, + "loss": 0.84495437, + "num_input_tokens_seen": 188250395, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.25097656, + "step": 8759, + "time_per_iteration": 2.6442711353302 + }, + { + "auxiliary_loss_clip": 0.0129048, + "auxiliary_loss_mlp": 0.00193912, + "balance_loss_clip": 1.08124232, + "balance_loss_mlp": 0.16805521, + "epoch": 0.5266796933714114, + "flos": 15844111125120.0, + "grad_norm": 18.954552335763864, + "language_loss": 0.81401372, + "learning_rate": 1.9242678126276307e-06, + "loss": 0.82885766, + "num_input_tokens_seen": 188266785, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.25854492, + "step": 8760, + "time_per_iteration": 4.010049104690552 + }, + { + "auxiliary_loss_clip": 0.01313524, + "auxiliary_loss_mlp": 0.00199229, + "balance_loss_clip": 1.09368014, + "balance_loss_mlp": 0.17225154, + "epoch": 0.5267398166240793, + "flos": 20951434149120.0, + "grad_norm": 9.829907473617705, + "language_loss": 0.87094736, + "learning_rate": 1.923878631697736e-06, + "loss": 0.8860749, + "num_input_tokens_seen": 188282525, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.26953125, + "step": 8761, + "time_per_iteration": 2.6444709300994873 + }, + { + "auxiliary_loss_clip": 0.01308094, + "auxiliary_loss_mlp": 0.00187871, + "balance_loss_clip": 1.08999968, + "balance_loss_mlp": 0.16184777, + "epoch": 0.5267999398767473, + "flos": 20996394998400.0, + "grad_norm": 4.059219752938652, + "language_loss": 0.81924474, + "learning_rate": 1.923489453654373e-06, + "loss": 0.83420438, + "num_input_tokens_seen": 188301395, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.26049805, + "step": 8762, + "time_per_iteration": 2.7862846851348877 + }, + { + "auxiliary_loss_clip": 0.01301423, + "auxiliary_loss_mlp": 0.00057536, + "balance_loss_clip": 1.15686917, + "balance_loss_mlp": 0.05186136, + "epoch": 0.5268600631294152, + "flos": 66849401767680.0, + "grad_norm": 0.9511442441184109, + "language_loss": 0.65046495, + "learning_rate": 1.9231002785122963e-06, + "loss": 0.66405457, + "num_input_tokens_seen": 188357665, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.05664062, + "step": 8763, + "time_per_iteration": 2.995384931564331 + }, + { + "auxiliary_loss_clip": 0.01335237, + "auxiliary_loss_mlp": 0.00207632, + "balance_loss_clip": 1.1154232, + "balance_loss_mlp": 0.18073803, + "epoch": 0.5269201863820833, + "flos": 17165552572800.0, + "grad_norm": 8.032090947629124, + "language_loss": 0.80323005, + "learning_rate": 1.922711106286265e-06, + "loss": 0.81865871, + "num_input_tokens_seen": 188376935, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.26904297, + "step": 8764, + "time_per_iteration": 4.100106716156006 + }, + { + "auxiliary_loss_clip": 0.01328008, + "auxiliary_loss_mlp": 0.0021971, + "balance_loss_clip": 1.10968733, + "balance_loss_mlp": 0.19340067, + "epoch": 0.5269803096347513, + "flos": 20522589672960.0, + "grad_norm": 11.386695265955117, + "language_loss": 0.82060581, + "learning_rate": 1.9223219369910368e-06, + "loss": 0.83608299, + "num_input_tokens_seen": 188394995, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.26318359, + "step": 8765, + "time_per_iteration": 2.684817314147949 + }, + { + "auxiliary_loss_clip": 0.01328873, + "auxiliary_loss_mlp": 0.00210501, + "balance_loss_clip": 1.1082685, + "balance_loss_mlp": 0.18190277, + "epoch": 0.5270404328874192, + "flos": 27230989524480.0, + "grad_norm": 225.0763880521249, + "language_loss": 0.92324448, + "learning_rate": 1.9219327706413677e-06, + "loss": 0.93863815, + "num_input_tokens_seen": 188415475, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.2857666, + "step": 8766, + "time_per_iteration": 2.7008094787597656 + }, + { + "auxiliary_loss_clip": 0.01340614, + "auxiliary_loss_mlp": 0.00203778, + "balance_loss_clip": 1.11510229, + "balance_loss_mlp": 0.1767533, + "epoch": 0.5271005561400872, + "flos": 23110491824640.0, + "grad_norm": 4941.280965514039, + "language_loss": 0.86625671, + "learning_rate": 1.921543607252017e-06, + "loss": 0.88170069, + "num_input_tokens_seen": 188435665, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.27038574, + "step": 8767, + "time_per_iteration": 2.63136887550354 + }, + { + "auxiliary_loss_clip": 0.01340772, + "auxiliary_loss_mlp": 0.00231817, + "balance_loss_clip": 1.11580622, + "balance_loss_mlp": 0.20377934, + "epoch": 0.5271606793927551, + "flos": 22564793427840.0, + "grad_norm": 43.77457821984019, + "language_loss": 0.80716127, + "learning_rate": 1.9211544468377394e-06, + "loss": 0.82288718, + "num_input_tokens_seen": 188455405, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.28027344, + "step": 8768, + "time_per_iteration": 2.658099412918091 + }, + { + "auxiliary_loss_clip": 0.01336717, + "auxiliary_loss_mlp": 0.0020913, + "balance_loss_clip": 1.1182797, + "balance_loss_mlp": 0.18282032, + "epoch": 0.5272208026454231, + "flos": 18764259102720.0, + "grad_norm": 2.9185506531911267, + "language_loss": 0.82755673, + "learning_rate": 1.9207652894132933e-06, + "loss": 0.84301519, + "num_input_tokens_seen": 188472940, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.26293945, + "step": 8769, + "time_per_iteration": 2.7952685356140137 + }, + { + "auxiliary_loss_clip": 0.01340699, + "auxiliary_loss_mlp": 0.00200271, + "balance_loss_clip": 1.12163734, + "balance_loss_mlp": 0.17305601, + "epoch": 0.5272809258980911, + "flos": 20412164286720.0, + "grad_norm": 20.976225596258786, + "language_loss": 0.82601523, + "learning_rate": 1.920376134993436e-06, + "loss": 0.84142494, + "num_input_tokens_seen": 188493035, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.27209473, + "step": 8770, + "time_per_iteration": 2.657714605331421 + }, + { + "auxiliary_loss_clip": 0.01361958, + "auxiliary_loss_mlp": 0.00223774, + "balance_loss_clip": 1.13910651, + "balance_loss_mlp": 0.19597465, + "epoch": 0.5273410491507591, + "flos": 28256742213120.0, + "grad_norm": 8.657165811505251, + "language_loss": 0.77478528, + "learning_rate": 1.9199869835929224e-06, + "loss": 0.79064256, + "num_input_tokens_seen": 188513860, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.2779541, + "step": 8771, + "time_per_iteration": 2.729231357574463 + }, + { + "auxiliary_loss_clip": 0.01339124, + "auxiliary_loss_mlp": 0.00222606, + "balance_loss_clip": 1.1236012, + "balance_loss_mlp": 0.19530748, + "epoch": 0.527401172403427, + "flos": 22455158140800.0, + "grad_norm": 5.096451887355266, + "language_loss": 0.84126627, + "learning_rate": 1.9195978352265115e-06, + "loss": 0.85688353, + "num_input_tokens_seen": 188533345, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.27294922, + "step": 8772, + "time_per_iteration": 2.643411874771118 + }, + { + "auxiliary_loss_clip": 0.01347123, + "auxiliary_loss_mlp": 0.00209327, + "balance_loss_clip": 1.125494, + "balance_loss_mlp": 0.18213573, + "epoch": 0.527461295656095, + "flos": 21031084558080.0, + "grad_norm": 8.652763085333394, + "language_loss": 0.76740527, + "learning_rate": 1.9192086899089585e-06, + "loss": 0.78296977, + "num_input_tokens_seen": 188551550, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.2722168, + "step": 8773, + "time_per_iteration": 2.754176139831543 + }, + { + "auxiliary_loss_clip": 0.01337681, + "auxiliary_loss_mlp": 0.00211368, + "balance_loss_clip": 1.12289023, + "balance_loss_mlp": 0.18493938, + "epoch": 0.5275214189087629, + "flos": 26322018929280.0, + "grad_norm": 3.2819183339953164, + "language_loss": 0.91477644, + "learning_rate": 1.91881954765502e-06, + "loss": 0.93026692, + "num_input_tokens_seen": 188571615, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.26403809, + "step": 8774, + "time_per_iteration": 2.6675946712493896 + }, + { + "auxiliary_loss_clip": 0.01337865, + "auxiliary_loss_mlp": 0.002257, + "balance_loss_clip": 1.12287617, + "balance_loss_mlp": 0.19881785, + "epoch": 0.5275815421614309, + "flos": 20047024581120.0, + "grad_norm": 12.287881996266727, + "language_loss": 0.86215061, + "learning_rate": 1.9184304084794523e-06, + "loss": 0.87778628, + "num_input_tokens_seen": 188591965, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.2689209, + "step": 8775, + "time_per_iteration": 2.647134780883789 + }, + { + "auxiliary_loss_clip": 0.01332577, + "auxiliary_loss_mlp": 0.00210388, + "balance_loss_clip": 1.12214351, + "balance_loss_mlp": 0.18403043, + "epoch": 0.5276416654140988, + "flos": 21432206712960.0, + "grad_norm": 173.75319241468904, + "language_loss": 0.90181148, + "learning_rate": 1.918041272397012e-06, + "loss": 0.91724116, + "num_input_tokens_seen": 188610675, + "router_z_loss_clip": 2.10839844, + "router_z_loss_mlp": 0.26379395, + "step": 8776, + "time_per_iteration": 2.645421266555786 + }, + { + "auxiliary_loss_clip": 0.01353783, + "auxiliary_loss_mlp": 0.00208903, + "balance_loss_clip": 1.13038683, + "balance_loss_mlp": 0.18398872, + "epoch": 0.5277017886667669, + "flos": 17165085696000.0, + "grad_norm": 4.856251409565469, + "language_loss": 0.74933195, + "learning_rate": 1.9176521394224547e-06, + "loss": 0.7649588, + "num_input_tokens_seen": 188628235, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.24926758, + "step": 8777, + "time_per_iteration": 2.6404359340667725 + }, + { + "auxiliary_loss_clip": 0.01331475, + "auxiliary_loss_mlp": 0.00213702, + "balance_loss_clip": 1.11819363, + "balance_loss_mlp": 0.18701127, + "epoch": 0.5277619119194349, + "flos": 20448146736000.0, + "grad_norm": 3051.1527252057713, + "language_loss": 0.88289505, + "learning_rate": 1.9172630095705358e-06, + "loss": 0.89834684, + "num_input_tokens_seen": 188648925, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.26660156, + "step": 8778, + "time_per_iteration": 2.7115731239318848 + }, + { + "auxiliary_loss_clip": 0.0136076, + "auxiliary_loss_mlp": 0.00201499, + "balance_loss_clip": 1.13607597, + "balance_loss_mlp": 0.17395003, + "epoch": 0.5278220351721028, + "flos": 24061083304320.0, + "grad_norm": 14.243541795252291, + "language_loss": 0.88129747, + "learning_rate": 1.916873882856013e-06, + "loss": 0.89692008, + "num_input_tokens_seen": 188668125, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.2755127, + "step": 8779, + "time_per_iteration": 2.7158584594726562 + }, + { + "auxiliary_loss_clip": 0.01348471, + "auxiliary_loss_mlp": 0.00191241, + "balance_loss_clip": 1.13143277, + "balance_loss_mlp": 0.16449061, + "epoch": 0.5278821584247708, + "flos": 24642907804800.0, + "grad_norm": 141.1988562768924, + "language_loss": 0.85694206, + "learning_rate": 1.9164847592936406e-06, + "loss": 0.87233913, + "num_input_tokens_seen": 188684410, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.26745605, + "step": 8780, + "time_per_iteration": 2.632591724395752 + }, + { + "auxiliary_loss_clip": 0.01373394, + "auxiliary_loss_mlp": 0.00220424, + "balance_loss_clip": 1.14797533, + "balance_loss_mlp": 0.19319689, + "epoch": 0.5279422816774387, + "flos": 35408244240000.0, + "grad_norm": 2.7024173427867897, + "language_loss": 0.76463783, + "learning_rate": 1.916095638898174e-06, + "loss": 0.78057599, + "num_input_tokens_seen": 188706130, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.2722168, + "step": 8781, + "time_per_iteration": 2.786389112472534 + }, + { + "auxiliary_loss_clip": 0.0132607, + "auxiliary_loss_mlp": 0.00195243, + "balance_loss_clip": 1.11104429, + "balance_loss_mlp": 0.17020926, + "epoch": 0.5280024049301068, + "flos": 22967028904320.0, + "grad_norm": 143.1367412560772, + "language_loss": 0.77952826, + "learning_rate": 1.9157065216843696e-06, + "loss": 0.79474139, + "num_input_tokens_seen": 188725030, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.25024414, + "step": 8782, + "time_per_iteration": 2.6549019813537598 + }, + { + "auxiliary_loss_clip": 0.01338006, + "auxiliary_loss_mlp": 0.00213208, + "balance_loss_clip": 1.12203932, + "balance_loss_mlp": 0.18745838, + "epoch": 0.5280625281827747, + "flos": 21507619317120.0, + "grad_norm": 10.193152351942766, + "language_loss": 0.76988947, + "learning_rate": 1.915317407666982e-06, + "loss": 0.78540158, + "num_input_tokens_seen": 188744325, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.2578125, + "step": 8783, + "time_per_iteration": 2.623230218887329 + }, + { + "auxiliary_loss_clip": 0.0138413, + "auxiliary_loss_mlp": 0.0024425, + "balance_loss_clip": 1.14946318, + "balance_loss_mlp": 0.21462689, + "epoch": 0.5281226514354427, + "flos": 31208167958400.0, + "grad_norm": 9.563992231684418, + "language_loss": 0.77443409, + "learning_rate": 1.9149282968607674e-06, + "loss": 0.79071796, + "num_input_tokens_seen": 188765100, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.29626465, + "step": 8784, + "time_per_iteration": 2.7164928913116455 + }, + { + "auxiliary_loss_clip": 0.01360876, + "auxiliary_loss_mlp": 0.0023211, + "balance_loss_clip": 1.13200045, + "balance_loss_mlp": 0.20344047, + "epoch": 0.5281827746881106, + "flos": 25077821679360.0, + "grad_norm": 31.214546731829504, + "language_loss": 0.84746575, + "learning_rate": 1.91453918928048e-06, + "loss": 0.86339557, + "num_input_tokens_seen": 188783995, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.28637695, + "step": 8785, + "time_per_iteration": 2.6566293239593506 + }, + { + "auxiliary_loss_clip": 0.01357906, + "auxiliary_loss_mlp": 0.00207266, + "balance_loss_clip": 1.13590491, + "balance_loss_mlp": 0.18005055, + "epoch": 0.5282428979407786, + "flos": 20631255292800.0, + "grad_norm": 4.100255038035106, + "language_loss": 0.90262705, + "learning_rate": 1.9141500849408745e-06, + "loss": 0.91827869, + "num_input_tokens_seen": 188803120, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.2722168, + "step": 8786, + "time_per_iteration": 2.6585400104522705 + }, + { + "auxiliary_loss_clip": 0.01353329, + "auxiliary_loss_mlp": 0.00191131, + "balance_loss_clip": 1.13639343, + "balance_loss_mlp": 0.16509528, + "epoch": 0.5283030211934465, + "flos": 22419391173120.0, + "grad_norm": 10.738251558330598, + "language_loss": 0.88979506, + "learning_rate": 1.9137609838567076e-06, + "loss": 0.9052397, + "num_input_tokens_seen": 188820960, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.26037598, + "step": 8787, + "time_per_iteration": 2.6691057682037354 + }, + { + "auxiliary_loss_clip": 0.01347654, + "auxiliary_loss_mlp": 0.00210959, + "balance_loss_clip": 1.12931085, + "balance_loss_mlp": 0.1848882, + "epoch": 0.5283631444461145, + "flos": 23615467176960.0, + "grad_norm": 78.77848929551989, + "language_loss": 0.89184928, + "learning_rate": 1.9133718860427316e-06, + "loss": 0.90743542, + "num_input_tokens_seen": 188837165, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.26049805, + "step": 8788, + "time_per_iteration": 2.624455213546753 + }, + { + "auxiliary_loss_clip": 0.01356695, + "auxiliary_loss_mlp": 0.00219927, + "balance_loss_clip": 1.13883209, + "balance_loss_mlp": 0.19287859, + "epoch": 0.5284232676987825, + "flos": 32671994918400.0, + "grad_norm": 2.876132248976745, + "language_loss": 0.83894062, + "learning_rate": 1.9129827915137027e-06, + "loss": 0.85470688, + "num_input_tokens_seen": 188858555, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.27050781, + "step": 8789, + "time_per_iteration": 2.720179319381714 + }, + { + "auxiliary_loss_clip": 0.01351338, + "auxiliary_loss_mlp": 0.00192773, + "balance_loss_clip": 1.13479006, + "balance_loss_mlp": 0.16680947, + "epoch": 0.5284833909514505, + "flos": 26760919213440.0, + "grad_norm": 373.7935282398046, + "language_loss": 0.79383242, + "learning_rate": 1.9125937002843754e-06, + "loss": 0.80927354, + "num_input_tokens_seen": 188879050, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.25964355, + "step": 8790, + "time_per_iteration": 2.6904942989349365 + }, + { + "auxiliary_loss_clip": 0.01369828, + "auxiliary_loss_mlp": 0.00199224, + "balance_loss_clip": 1.14658117, + "balance_loss_mlp": 0.17360634, + "epoch": 0.5285435142041185, + "flos": 22090700793600.0, + "grad_norm": 2.494653406557066, + "language_loss": 0.85364234, + "learning_rate": 1.9122046123695036e-06, + "loss": 0.86933285, + "num_input_tokens_seen": 188898885, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.25634766, + "step": 8791, + "time_per_iteration": 2.6830339431762695 + }, + { + "auxiliary_loss_clip": 0.01364315, + "auxiliary_loss_mlp": 0.00213659, + "balance_loss_clip": 1.14345133, + "balance_loss_mlp": 0.18681312, + "epoch": 0.5286036374567864, + "flos": 20375463565440.0, + "grad_norm": 4.846754304173797, + "language_loss": 0.75943935, + "learning_rate": 1.9118155277838423e-06, + "loss": 0.77521908, + "num_input_tokens_seen": 188917225, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.26818848, + "step": 8792, + "time_per_iteration": 2.7056219577789307 + }, + { + "auxiliary_loss_clip": 0.01357751, + "auxiliary_loss_mlp": 0.00213799, + "balance_loss_clip": 1.13940072, + "balance_loss_mlp": 0.18869352, + "epoch": 0.5286637607094544, + "flos": 24352175122560.0, + "grad_norm": 122.4877236744321, + "language_loss": 0.90120554, + "learning_rate": 1.9114264465421443e-06, + "loss": 0.91692114, + "num_input_tokens_seen": 188936120, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.25097656, + "step": 8793, + "time_per_iteration": 2.758869171142578 + }, + { + "auxiliary_loss_clip": 0.01371763, + "auxiliary_loss_mlp": 0.00225933, + "balance_loss_clip": 1.15052605, + "balance_loss_mlp": 0.20046987, + "epoch": 0.5287238839621223, + "flos": 17271165536640.0, + "grad_norm": 32.31575803405323, + "language_loss": 0.92098975, + "learning_rate": 1.9110373686591645e-06, + "loss": 0.93696678, + "num_input_tokens_seen": 188953405, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.2545166, + "step": 8794, + "time_per_iteration": 2.661921977996826 + }, + { + "auxiliary_loss_clip": 0.01379944, + "auxiliary_loss_mlp": 0.00238388, + "balance_loss_clip": 1.15008497, + "balance_loss_mlp": 0.20939678, + "epoch": 0.5287840072147904, + "flos": 17566890209280.0, + "grad_norm": 6.116919524505697, + "language_loss": 0.77946317, + "learning_rate": 1.9106482941496564e-06, + "loss": 0.79564643, + "num_input_tokens_seen": 188971150, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.28979492, + "step": 8795, + "time_per_iteration": 2.70296573638916 + }, + { + "auxiliary_loss_clip": 0.01386478, + "auxiliary_loss_mlp": 0.00235055, + "balance_loss_clip": 1.15720057, + "balance_loss_mlp": 0.20826825, + "epoch": 0.5288441304674583, + "flos": 18552099421440.0, + "grad_norm": 7.860429239029773, + "language_loss": 0.90936494, + "learning_rate": 1.910259223028374e-06, + "loss": 0.9255802, + "num_input_tokens_seen": 188989550, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.26757812, + "step": 8796, + "time_per_iteration": 2.6790072917938232 + }, + { + "auxiliary_loss_clip": 0.01386295, + "auxiliary_loss_mlp": 0.00244973, + "balance_loss_clip": 1.1593821, + "balance_loss_mlp": 0.2190212, + "epoch": 0.5289042537201263, + "flos": 20814507504000.0, + "grad_norm": 392.73718118810285, + "language_loss": 0.77594495, + "learning_rate": 1.909870155310071e-06, + "loss": 0.79225761, + "num_input_tokens_seen": 189008795, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.2598877, + "step": 8797, + "time_per_iteration": 2.6476998329162598 + }, + { + "auxiliary_loss_clip": 0.01392428, + "auxiliary_loss_mlp": 0.0021146, + "balance_loss_clip": 1.1667335, + "balance_loss_mlp": 0.1854604, + "epoch": 0.5289643769727942, + "flos": 15735265937280.0, + "grad_norm": 6.944182003126047, + "language_loss": 0.89601398, + "learning_rate": 1.9094810910095005e-06, + "loss": 0.91205287, + "num_input_tokens_seen": 189025540, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.2598877, + "step": 8798, + "time_per_iteration": 2.635138511657715 + }, + { + "auxiliary_loss_clip": 0.01400088, + "auxiliary_loss_mlp": 0.00235595, + "balance_loss_clip": 1.16378915, + "balance_loss_mlp": 0.20632918, + "epoch": 0.5290245002254622, + "flos": 19537308633600.0, + "grad_norm": 4.4708604464919635, + "language_loss": 0.7878812, + "learning_rate": 1.9090920301414166e-06, + "loss": 0.80423802, + "num_input_tokens_seen": 189044885, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.29284668, + "step": 8799, + "time_per_iteration": 4.047868490219116 + }, + { + "auxiliary_loss_clip": 0.01408166, + "auxiliary_loss_mlp": 0.00183932, + "balance_loss_clip": 1.17940342, + "balance_loss_mlp": 0.15850492, + "epoch": 0.5290846234781301, + "flos": 15815131827840.0, + "grad_norm": 140.50146424453175, + "language_loss": 0.7761932, + "learning_rate": 1.9087029727205716e-06, + "loss": 0.79211414, + "num_input_tokens_seen": 189061280, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.25402832, + "step": 8800, + "time_per_iteration": 4.087331295013428 + }, + { + "auxiliary_loss_clip": 0.01401952, + "auxiliary_loss_mlp": 0.00077983, + "balance_loss_clip": 1.25786543, + "balance_loss_mlp": 0.07044925, + "epoch": 0.5291447467307981, + "flos": 70057624821120.0, + "grad_norm": 1.0005875669802278, + "language_loss": 0.57119167, + "learning_rate": 1.9083139187617193e-06, + "loss": 0.58599102, + "num_input_tokens_seen": 189114775, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.07519531, + "step": 8801, + "time_per_iteration": 2.9887704849243164 + }, + { + "auxiliary_loss_clip": 0.01397064, + "auxiliary_loss_mlp": 0.00241164, + "balance_loss_clip": 1.16534042, + "balance_loss_mlp": 0.2129713, + "epoch": 0.529204869983466, + "flos": 28364186770560.0, + "grad_norm": 39.04311803007078, + "language_loss": 0.71494085, + "learning_rate": 1.9079248682796123e-06, + "loss": 0.73132312, + "num_input_tokens_seen": 189134700, + "router_z_loss_clip": 2.31835938, + "router_z_loss_mlp": 0.28198242, + "step": 8802, + "time_per_iteration": 4.111076831817627 + }, + { + "auxiliary_loss_clip": 0.01404078, + "auxiliary_loss_mlp": 0.00218076, + "balance_loss_clip": 1.17256665, + "balance_loss_mlp": 0.19275641, + "epoch": 0.5292649932361341, + "flos": 33758830684800.0, + "grad_norm": 7.9317238929451985, + "language_loss": 0.76087439, + "learning_rate": 1.907535821289003e-06, + "loss": 0.77709591, + "num_input_tokens_seen": 189155365, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.2532959, + "step": 8803, + "time_per_iteration": 2.7654898166656494 + }, + { + "auxiliary_loss_clip": 0.01394003, + "auxiliary_loss_mlp": 0.00219254, + "balance_loss_clip": 1.16258609, + "balance_loss_mlp": 0.1918118, + "epoch": 0.5293251164888021, + "flos": 20447679859200.0, + "grad_norm": 66.07972433164336, + "language_loss": 0.84798658, + "learning_rate": 1.9071467778046458e-06, + "loss": 0.86411917, + "num_input_tokens_seen": 189173885, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.27490234, + "step": 8804, + "time_per_iteration": 2.6279683113098145 + }, + { + "auxiliary_loss_clip": 0.01441931, + "auxiliary_loss_mlp": 0.00112033, + "balance_loss_clip": 1.2982235, + "balance_loss_mlp": 0.10340185, + "epoch": 0.52938523974147, + "flos": 66545312204160.0, + "grad_norm": 0.755845984043334, + "language_loss": 0.5256331, + "learning_rate": 1.906757737841291e-06, + "loss": 0.54117274, + "num_input_tokens_seen": 189236515, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.08642578, + "step": 8805, + "time_per_iteration": 3.187822103500366 + }, + { + "auxiliary_loss_clip": 0.01445483, + "auxiliary_loss_mlp": 0.00080676, + "balance_loss_clip": 1.29988706, + "balance_loss_mlp": 0.07276089, + "epoch": 0.529445362994138, + "flos": 67151734542720.0, + "grad_norm": 0.7412859649412711, + "language_loss": 0.63745016, + "learning_rate": 1.906368701413693e-06, + "loss": 0.65271169, + "num_input_tokens_seen": 189300500, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.07910156, + "step": 8806, + "time_per_iteration": 3.090163230895996 + }, + { + "auxiliary_loss_clip": 0.01414836, + "auxiliary_loss_mlp": 0.00233369, + "balance_loss_clip": 1.17406106, + "balance_loss_mlp": 0.20422257, + "epoch": 0.5295054862468059, + "flos": 17749316407680.0, + "grad_norm": 48.09907611455745, + "language_loss": 0.78694928, + "learning_rate": 1.9059796685366026e-06, + "loss": 0.80343133, + "num_input_tokens_seen": 189319745, + "router_z_loss_clip": 2.41015625, + "router_z_loss_mlp": 0.29125977, + "step": 8807, + "time_per_iteration": 4.012441635131836 + }, + { + "auxiliary_loss_clip": 0.01405278, + "auxiliary_loss_mlp": 0.00202846, + "balance_loss_clip": 1.173491, + "balance_loss_mlp": 0.17722759, + "epoch": 0.529565609499474, + "flos": 11397401084160.0, + "grad_norm": 16.150898058132146, + "language_loss": 0.78422147, + "learning_rate": 1.9055906392247723e-06, + "loss": 0.80030274, + "num_input_tokens_seen": 189334550, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.25610352, + "step": 8808, + "time_per_iteration": 2.5804152488708496 + }, + { + "auxiliary_loss_clip": 0.01400344, + "auxiliary_loss_mlp": 0.00194452, + "balance_loss_clip": 1.17147648, + "balance_loss_mlp": 0.16894153, + "epoch": 0.5296257327521419, + "flos": 17196363463680.0, + "grad_norm": 3.325717687548196, + "language_loss": 0.9411239, + "learning_rate": 1.9052016134929554e-06, + "loss": 0.95707184, + "num_input_tokens_seen": 189351735, + "router_z_loss_clip": 2.29101562, + "router_z_loss_mlp": 0.25537109, + "step": 8809, + "time_per_iteration": 2.5766797065734863 + }, + { + "auxiliary_loss_clip": 0.01432476, + "auxiliary_loss_mlp": 0.00245414, + "balance_loss_clip": 1.18837905, + "balance_loss_mlp": 0.2160172, + "epoch": 0.5296858560048099, + "flos": 39964086777600.0, + "grad_norm": 86.90373895790039, + "language_loss": 0.70980811, + "learning_rate": 1.9048125913559016e-06, + "loss": 0.72658706, + "num_input_tokens_seen": 189373105, + "router_z_loss_clip": 2.44140625, + "router_z_loss_mlp": 0.29394531, + "step": 8810, + "time_per_iteration": 2.8256092071533203 + }, + { + "auxiliary_loss_clip": 0.0140656, + "auxiliary_loss_mlp": 0.00216457, + "balance_loss_clip": 1.17503834, + "balance_loss_mlp": 0.18779886, + "epoch": 0.5297459792574778, + "flos": 20961418129920.0, + "grad_norm": 23.38185010130542, + "language_loss": 0.74032247, + "learning_rate": 1.9044235728283646e-06, + "loss": 0.75655264, + "num_input_tokens_seen": 189394615, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.28649902, + "step": 8811, + "time_per_iteration": 2.6961348056793213 + }, + { + "auxiliary_loss_clip": 0.0138955, + "auxiliary_loss_mlp": 0.00075435, + "balance_loss_clip": 1.25316262, + "balance_loss_mlp": 0.06813949, + "epoch": 0.5298061025101458, + "flos": 66523620389760.0, + "grad_norm": 0.6553655750857896, + "language_loss": 0.52993536, + "learning_rate": 1.9040345579250953e-06, + "loss": 0.54458523, + "num_input_tokens_seen": 189459750, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.07275391, + "step": 8812, + "time_per_iteration": 3.25697922706604 + }, + { + "auxiliary_loss_clip": 0.01385266, + "auxiliary_loss_mlp": 0.00078771, + "balance_loss_clip": 1.24743748, + "balance_loss_mlp": 0.07128425, + "epoch": 0.5298662257628137, + "flos": 67662994775040.0, + "grad_norm": 0.7192499334016521, + "language_loss": 0.56301618, + "learning_rate": 1.9036455466608453e-06, + "loss": 0.57765651, + "num_input_tokens_seen": 189527540, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.07470703, + "step": 8813, + "time_per_iteration": 3.225308656692505 + }, + { + "auxiliary_loss_clip": 0.01404014, + "auxiliary_loss_mlp": 0.00190685, + "balance_loss_clip": 1.17762566, + "balance_loss_mlp": 0.16592479, + "epoch": 0.5299263490154817, + "flos": 19646405216640.0, + "grad_norm": 15.440749390224251, + "language_loss": 0.86950344, + "learning_rate": 1.9032565390503657e-06, + "loss": 0.88545042, + "num_input_tokens_seen": 189546900, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.24768066, + "step": 8814, + "time_per_iteration": 2.633758306503296 + }, + { + "auxiliary_loss_clip": 0.0142403, + "auxiliary_loss_mlp": 0.00233561, + "balance_loss_clip": 1.18154716, + "balance_loss_mlp": 0.20515332, + "epoch": 0.5299864722681497, + "flos": 22055005653120.0, + "grad_norm": 11.035321479293287, + "language_loss": 0.90120775, + "learning_rate": 1.9028675351084076e-06, + "loss": 0.91778362, + "num_input_tokens_seen": 189566490, + "router_z_loss_clip": 2.42578125, + "router_z_loss_mlp": 0.28381348, + "step": 8815, + "time_per_iteration": 2.8344321250915527 + }, + { + "auxiliary_loss_clip": 0.01407328, + "auxiliary_loss_mlp": 0.00212024, + "balance_loss_clip": 1.17683887, + "balance_loss_mlp": 0.18368816, + "epoch": 0.5300465955208177, + "flos": 21763698353280.0, + "grad_norm": 16.78085962703768, + "language_loss": 0.72733676, + "learning_rate": 1.9024785348497225e-06, + "loss": 0.74353027, + "num_input_tokens_seen": 189585580, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.2833252, + "step": 8816, + "time_per_iteration": 2.6888022422790527 + }, + { + "auxiliary_loss_clip": 0.01389706, + "auxiliary_loss_mlp": 0.00220221, + "balance_loss_clip": 1.1602366, + "balance_loss_mlp": 0.19319603, + "epoch": 0.5301067187734857, + "flos": 42996491735040.0, + "grad_norm": 10.422712312636461, + "language_loss": 0.79774773, + "learning_rate": 1.9020895382890611e-06, + "loss": 0.81384695, + "num_input_tokens_seen": 189608485, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.26989746, + "step": 8817, + "time_per_iteration": 2.8365633487701416 + }, + { + "auxiliary_loss_clip": 0.01380404, + "auxiliary_loss_mlp": 0.00253294, + "balance_loss_clip": 1.1487385, + "balance_loss_mlp": 0.22554184, + "epoch": 0.5301668420261536, + "flos": 20554298403840.0, + "grad_norm": 8.730615305490105, + "language_loss": 0.71572036, + "learning_rate": 1.9017005454411743e-06, + "loss": 0.73205733, + "num_input_tokens_seen": 189627815, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.27770996, + "step": 8818, + "time_per_iteration": 2.679097890853882 + }, + { + "auxiliary_loss_clip": 0.01386768, + "auxiliary_loss_mlp": 0.0022931, + "balance_loss_clip": 1.15269065, + "balance_loss_mlp": 0.19938877, + "epoch": 0.5302269652788216, + "flos": 17486665182720.0, + "grad_norm": 16.987772065000524, + "language_loss": 0.82641101, + "learning_rate": 1.9013115563208126e-06, + "loss": 0.84257174, + "num_input_tokens_seen": 189644850, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.29931641, + "step": 8819, + "time_per_iteration": 2.6713345050811768 + }, + { + "auxiliary_loss_clip": 0.01388361, + "auxiliary_loss_mlp": 0.00246988, + "balance_loss_clip": 1.15173507, + "balance_loss_mlp": 0.22029723, + "epoch": 0.5302870885314895, + "flos": 14574202715520.0, + "grad_norm": 66.23858829634625, + "language_loss": 0.91628933, + "learning_rate": 1.9009225709427267e-06, + "loss": 0.93264288, + "num_input_tokens_seen": 189660945, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.2668457, + "step": 8820, + "time_per_iteration": 2.5928876399993896 + }, + { + "auxiliary_loss_clip": 0.01368091, + "auxiliary_loss_mlp": 0.00229519, + "balance_loss_clip": 1.14102948, + "balance_loss_mlp": 0.2031979, + "epoch": 0.5303472117841576, + "flos": 23438032968960.0, + "grad_norm": 575.2210811618712, + "language_loss": 0.7850045, + "learning_rate": 1.9005335893216667e-06, + "loss": 0.80098057, + "num_input_tokens_seen": 189680425, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.26306152, + "step": 8821, + "time_per_iteration": 2.6302483081817627 + }, + { + "auxiliary_loss_clip": 0.01363655, + "auxiliary_loss_mlp": 0.00216808, + "balance_loss_clip": 1.13729918, + "balance_loss_mlp": 0.18925855, + "epoch": 0.5304073350368255, + "flos": 22709010533760.0, + "grad_norm": 14.468224962031535, + "language_loss": 0.81105047, + "learning_rate": 1.9001446114723824e-06, + "loss": 0.82685512, + "num_input_tokens_seen": 189700375, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.27563477, + "step": 8822, + "time_per_iteration": 2.645082712173462 + }, + { + "auxiliary_loss_clip": 0.01377021, + "auxiliary_loss_mlp": 0.00213953, + "balance_loss_clip": 1.14396131, + "balance_loss_mlp": 0.18659422, + "epoch": 0.5304674582894935, + "flos": 27928554624000.0, + "grad_norm": 33.49209955528062, + "language_loss": 0.73441648, + "learning_rate": 1.8997556374096257e-06, + "loss": 0.75032622, + "num_input_tokens_seen": 189721225, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.27331543, + "step": 8823, + "time_per_iteration": 2.6888954639434814 + }, + { + "auxiliary_loss_clip": 0.01359644, + "auxiliary_loss_mlp": 0.00234203, + "balance_loss_clip": 1.13199496, + "balance_loss_mlp": 0.2068803, + "epoch": 0.5305275815421614, + "flos": 21250642440960.0, + "grad_norm": 9.09299471811976, + "language_loss": 0.7666384, + "learning_rate": 1.8993666671481444e-06, + "loss": 0.78257686, + "num_input_tokens_seen": 189740170, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.27355957, + "step": 8824, + "time_per_iteration": 2.694246768951416 + }, + { + "auxiliary_loss_clip": 0.01357296, + "auxiliary_loss_mlp": 0.0021522, + "balance_loss_clip": 1.12958944, + "balance_loss_mlp": 0.18943472, + "epoch": 0.5305877047948294, + "flos": 17603088140160.0, + "grad_norm": 5.312070021836843, + "language_loss": 0.84695888, + "learning_rate": 1.898977700702689e-06, + "loss": 0.86268401, + "num_input_tokens_seen": 189757890, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.25769043, + "step": 8825, + "time_per_iteration": 2.7001466751098633 + }, + { + "auxiliary_loss_clip": 0.01347124, + "auxiliary_loss_mlp": 0.00205691, + "balance_loss_clip": 1.12313437, + "balance_loss_mlp": 0.17890519, + "epoch": 0.5306478280474973, + "flos": 15195493284480.0, + "grad_norm": 49.48392362559418, + "language_loss": 0.92142254, + "learning_rate": 1.8985887380880103e-06, + "loss": 0.93695068, + "num_input_tokens_seen": 189775390, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.26806641, + "step": 8826, + "time_per_iteration": 2.620328903198242 + }, + { + "auxiliary_loss_clip": 0.01356589, + "auxiliary_loss_mlp": 0.00198842, + "balance_loss_clip": 1.12761998, + "balance_loss_mlp": 0.1734032, + "epoch": 0.5307079513001653, + "flos": 15341218761600.0, + "grad_norm": 2.7983889975298184, + "language_loss": 0.71236753, + "learning_rate": 1.8981997793188558e-06, + "loss": 0.72792184, + "num_input_tokens_seen": 189793975, + "router_z_loss_clip": 2.29101562, + "router_z_loss_mlp": 0.2545166, + "step": 8827, + "time_per_iteration": 2.6464192867279053 + }, + { + "auxiliary_loss_clip": 0.01352994, + "auxiliary_loss_mlp": 0.00207036, + "balance_loss_clip": 1.12634885, + "balance_loss_mlp": 0.17977326, + "epoch": 0.5307680745528333, + "flos": 43544452688640.0, + "grad_norm": 4.411015113791605, + "language_loss": 0.68317652, + "learning_rate": 1.8978108244099762e-06, + "loss": 0.69877684, + "num_input_tokens_seen": 189817870, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.27233887, + "step": 8828, + "time_per_iteration": 2.8367538452148438 + }, + { + "auxiliary_loss_clip": 0.01377189, + "auxiliary_loss_mlp": 0.00214935, + "balance_loss_clip": 1.13864231, + "balance_loss_mlp": 0.18638425, + "epoch": 0.5308281978055013, + "flos": 20048928001920.0, + "grad_norm": 232.51399822569337, + "language_loss": 0.88935149, + "learning_rate": 1.8974218733761208e-06, + "loss": 0.90527272, + "num_input_tokens_seen": 189837905, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.28503418, + "step": 8829, + "time_per_iteration": 2.647507429122925 + }, + { + "auxiliary_loss_clip": 0.01349945, + "auxiliary_loss_mlp": 0.00205237, + "balance_loss_clip": 1.12064171, + "balance_loss_mlp": 0.17902312, + "epoch": 0.5308883210581693, + "flos": 20703938463360.0, + "grad_norm": 60.53715645860559, + "language_loss": 0.84490311, + "learning_rate": 1.8970329262320375e-06, + "loss": 0.86045492, + "num_input_tokens_seen": 189856970, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.26196289, + "step": 8830, + "time_per_iteration": 2.631129741668701 + }, + { + "auxiliary_loss_clip": 0.01350024, + "auxiliary_loss_mlp": 0.00209028, + "balance_loss_clip": 1.1177609, + "balance_loss_mlp": 0.1803223, + "epoch": 0.5309484443108372, + "flos": 14355506759040.0, + "grad_norm": 9.51350635935875, + "language_loss": 0.88299304, + "learning_rate": 1.8966439829924768e-06, + "loss": 0.89858353, + "num_input_tokens_seen": 189872830, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.2869873, + "step": 8831, + "time_per_iteration": 2.640223741531372 + }, + { + "auxiliary_loss_clip": 0.01328885, + "auxiliary_loss_mlp": 0.00224479, + "balance_loss_clip": 1.1027714, + "balance_loss_mlp": 0.19828892, + "epoch": 0.5310085675635052, + "flos": 20010503427840.0, + "grad_norm": 10.255647969393165, + "language_loss": 0.80420673, + "learning_rate": 1.896255043672186e-06, + "loss": 0.81974041, + "num_input_tokens_seen": 189891635, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.26220703, + "step": 8832, + "time_per_iteration": 2.6848440170288086 + }, + { + "auxiliary_loss_clip": 0.01355997, + "auxiliary_loss_mlp": 0.00243234, + "balance_loss_clip": 1.1207794, + "balance_loss_mlp": 0.21378979, + "epoch": 0.5310686908161731, + "flos": 22127293774080.0, + "grad_norm": 8.787643586964142, + "language_loss": 0.84486139, + "learning_rate": 1.8958661082859143e-06, + "loss": 0.86085373, + "num_input_tokens_seen": 189909050, + "router_z_loss_clip": 2.34960938, + "router_z_loss_mlp": 0.29455566, + "step": 8833, + "time_per_iteration": 2.6634650230407715 + }, + { + "auxiliary_loss_clip": 0.01333057, + "auxiliary_loss_mlp": 0.00237352, + "balance_loss_clip": 1.10725641, + "balance_loss_mlp": 0.20797896, + "epoch": 0.5311288140688412, + "flos": 24717889445760.0, + "grad_norm": 13.790071571962171, + "language_loss": 0.79554892, + "learning_rate": 1.8954771768484103e-06, + "loss": 0.81125301, + "num_input_tokens_seen": 189927405, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.29406738, + "step": 8834, + "time_per_iteration": 2.6782963275909424 + }, + { + "auxiliary_loss_clip": 0.0134365, + "auxiliary_loss_mlp": 0.00228294, + "balance_loss_clip": 1.10550046, + "balance_loss_mlp": 0.19837235, + "epoch": 0.5311889373215091, + "flos": 24097712198400.0, + "grad_norm": 22.188036947049916, + "language_loss": 0.86247337, + "learning_rate": 1.8950882493744226e-06, + "loss": 0.87819278, + "num_input_tokens_seen": 189947740, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.29919434, + "step": 8835, + "time_per_iteration": 2.693085193634033 + }, + { + "auxiliary_loss_clip": 0.01316476, + "auxiliary_loss_mlp": 0.00222164, + "balance_loss_clip": 1.09480906, + "balance_loss_mlp": 0.19534159, + "epoch": 0.5312490605741771, + "flos": 22017012042240.0, + "grad_norm": 27.597922389265907, + "language_loss": 0.8042531, + "learning_rate": 1.8946993258786985e-06, + "loss": 0.81963944, + "num_input_tokens_seen": 189966495, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.26818848, + "step": 8836, + "time_per_iteration": 2.641768455505371 + }, + { + "auxiliary_loss_clip": 0.01330738, + "auxiliary_loss_mlp": 0.00229, + "balance_loss_clip": 1.10438693, + "balance_loss_mlp": 0.19934025, + "epoch": 0.531309183826845, + "flos": 19390541662080.0, + "grad_norm": 4.574279460768588, + "language_loss": 0.88490582, + "learning_rate": 1.894310406375987e-06, + "loss": 0.90050316, + "num_input_tokens_seen": 189985325, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.296875, + "step": 8837, + "time_per_iteration": 2.6276605129241943 + }, + { + "auxiliary_loss_clip": 0.01334654, + "auxiliary_loss_mlp": 0.00228742, + "balance_loss_clip": 1.10744703, + "balance_loss_mlp": 0.20133591, + "epoch": 0.531369307079513, + "flos": 20190056538240.0, + "grad_norm": 4.416263697111819, + "language_loss": 0.92115259, + "learning_rate": 1.893921490881035e-06, + "loss": 0.93678653, + "num_input_tokens_seen": 190003290, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.27404785, + "step": 8838, + "time_per_iteration": 2.656812906265259 + }, + { + "auxiliary_loss_clip": 0.01321757, + "auxiliary_loss_mlp": 0.00226262, + "balance_loss_clip": 1.10148644, + "balance_loss_mlp": 0.20081039, + "epoch": 0.5314294303321809, + "flos": 18880143356160.0, + "grad_norm": 36.19616666801055, + "language_loss": 0.78582323, + "learning_rate": 1.8935325794085906e-06, + "loss": 0.80130339, + "num_input_tokens_seen": 190023260, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.25427246, + "step": 8839, + "time_per_iteration": 2.83188533782959 + }, + { + "auxiliary_loss_clip": 0.01331403, + "auxiliary_loss_mlp": 0.00208644, + "balance_loss_clip": 1.10593116, + "balance_loss_mlp": 0.18232268, + "epoch": 0.531489553584849, + "flos": 23040035297280.0, + "grad_norm": 2.4897470004534044, + "language_loss": 0.82628262, + "learning_rate": 1.8931436719734023e-06, + "loss": 0.84168309, + "num_input_tokens_seen": 190042035, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.26367188, + "step": 8840, + "time_per_iteration": 2.7143337726593018 + }, + { + "auxiliary_loss_clip": 0.01324165, + "auxiliary_loss_mlp": 0.00230721, + "balance_loss_clip": 1.10074162, + "balance_loss_mlp": 0.20425692, + "epoch": 0.5315496768375169, + "flos": 19790478668160.0, + "grad_norm": 15.087597368523925, + "language_loss": 0.83620954, + "learning_rate": 1.892754768590216e-06, + "loss": 0.85175848, + "num_input_tokens_seen": 190057545, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.26452637, + "step": 8841, + "time_per_iteration": 4.158872127532959 + }, + { + "auxiliary_loss_clip": 0.01295858, + "auxiliary_loss_mlp": 0.0009674, + "balance_loss_clip": 1.16282797, + "balance_loss_mlp": 0.08920594, + "epoch": 0.5316098000901849, + "flos": 71023228185600.0, + "grad_norm": 0.6773362758827933, + "language_loss": 0.56506139, + "learning_rate": 1.8923658692737793e-06, + "loss": 0.57898736, + "num_input_tokens_seen": 190123800, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.07519531, + "step": 8842, + "time_per_iteration": 3.2526378631591797 + }, + { + "auxiliary_loss_clip": 0.01337018, + "auxiliary_loss_mlp": 0.00235055, + "balance_loss_clip": 1.1032331, + "balance_loss_mlp": 0.20588492, + "epoch": 0.5316699233428529, + "flos": 16435560470400.0, + "grad_norm": 10.904316354062432, + "language_loss": 0.82617456, + "learning_rate": 1.8919769740388407e-06, + "loss": 0.84189528, + "num_input_tokens_seen": 190141625, + "router_z_loss_clip": 2.33789062, + "router_z_loss_mlp": 0.29187012, + "step": 8843, + "time_per_iteration": 3.9723427295684814 + }, + { + "auxiliary_loss_clip": 0.01302161, + "auxiliary_loss_mlp": 0.00069313, + "balance_loss_clip": 1.16841578, + "balance_loss_mlp": 0.06225591, + "epoch": 0.5317300465955208, + "flos": 67420814302080.0, + "grad_norm": 1.059797079044665, + "language_loss": 0.60708344, + "learning_rate": 1.891588082900145e-06, + "loss": 0.62079811, + "num_input_tokens_seen": 190198110, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.07080078, + "step": 8844, + "time_per_iteration": 4.463963747024536 + }, + { + "auxiliary_loss_clip": 0.01314539, + "auxiliary_loss_mlp": 0.00090941, + "balance_loss_clip": 1.17590451, + "balance_loss_mlp": 0.08331159, + "epoch": 0.5317901698481888, + "flos": 59508075340800.0, + "grad_norm": 0.8223014431592927, + "language_loss": 0.61542094, + "learning_rate": 1.8911991958724411e-06, + "loss": 0.62947571, + "num_input_tokens_seen": 190259950, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.07617188, + "step": 8845, + "time_per_iteration": 3.100733518600464 + }, + { + "auxiliary_loss_clip": 0.01331617, + "auxiliary_loss_mlp": 0.00237584, + "balance_loss_clip": 1.10711813, + "balance_loss_mlp": 0.21070217, + "epoch": 0.5318502931008567, + "flos": 19129219240320.0, + "grad_norm": 294.50330362886973, + "language_loss": 0.85988998, + "learning_rate": 1.890810312970474e-06, + "loss": 0.87558198, + "num_input_tokens_seen": 190278265, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.26879883, + "step": 8846, + "time_per_iteration": 2.608633041381836 + }, + { + "auxiliary_loss_clip": 0.01318153, + "auxiliary_loss_mlp": 0.00220045, + "balance_loss_clip": 1.09625697, + "balance_loss_mlp": 0.19342583, + "epoch": 0.5319104163535248, + "flos": 24681045070080.0, + "grad_norm": 4.891145386346162, + "language_loss": 0.82126832, + "learning_rate": 1.8904214342089903e-06, + "loss": 0.83665025, + "num_input_tokens_seen": 190298400, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.26635742, + "step": 8847, + "time_per_iteration": 2.6302475929260254 + }, + { + "auxiliary_loss_clip": 0.01305915, + "auxiliary_loss_mlp": 0.0022219, + "balance_loss_clip": 1.08791065, + "balance_loss_mlp": 0.19753733, + "epoch": 0.5319705396061927, + "flos": 19385513758080.0, + "grad_norm": 88.7614540242931, + "language_loss": 0.94171488, + "learning_rate": 1.8900325596027378e-06, + "loss": 0.95699596, + "num_input_tokens_seen": 190316235, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.24645996, + "step": 8848, + "time_per_iteration": 2.6336886882781982 + }, + { + "auxiliary_loss_clip": 0.01321424, + "auxiliary_loss_mlp": 0.00252999, + "balance_loss_clip": 1.09794021, + "balance_loss_mlp": 0.22401963, + "epoch": 0.5320306628588607, + "flos": 18259319664000.0, + "grad_norm": 17.597573527670246, + "language_loss": 0.84148508, + "learning_rate": 1.8896436891664609e-06, + "loss": 0.85722935, + "num_input_tokens_seen": 190335060, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.28967285, + "step": 8849, + "time_per_iteration": 3.9967682361602783 + }, + { + "auxiliary_loss_clip": 0.01315605, + "auxiliary_loss_mlp": 0.00221927, + "balance_loss_clip": 1.09026122, + "balance_loss_mlp": 0.19410405, + "epoch": 0.5320907861115286, + "flos": 23732321097600.0, + "grad_norm": 84.88460102244512, + "language_loss": 0.86652172, + "learning_rate": 1.8892548229149066e-06, + "loss": 0.88189703, + "num_input_tokens_seen": 190353265, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.27819824, + "step": 8850, + "time_per_iteration": 2.6548116207122803 + }, + { + "auxiliary_loss_clip": 0.01284585, + "auxiliary_loss_mlp": 0.00238835, + "balance_loss_clip": 1.07454014, + "balance_loss_mlp": 0.21284755, + "epoch": 0.5321509093641966, + "flos": 34495251321600.0, + "grad_norm": 3.304838102593183, + "language_loss": 0.60671806, + "learning_rate": 1.888865960862821e-06, + "loss": 0.62195224, + "num_input_tokens_seen": 190376575, + "router_z_loss_clip": 2.09667969, + "router_z_loss_mlp": 0.25976562, + "step": 8851, + "time_per_iteration": 2.764244318008423 + }, + { + "auxiliary_loss_clip": 0.01311494, + "auxiliary_loss_mlp": 0.0023552, + "balance_loss_clip": 1.08968818, + "balance_loss_mlp": 0.2073632, + "epoch": 0.5322110326168645, + "flos": 20010934391040.0, + "grad_norm": 22.905813403211262, + "language_loss": 0.77382159, + "learning_rate": 1.8884771030249484e-06, + "loss": 0.78929174, + "num_input_tokens_seen": 190395185, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.28198242, + "step": 8852, + "time_per_iteration": 2.6741743087768555 + }, + { + "auxiliary_loss_clip": 0.01256105, + "auxiliary_loss_mlp": 0.00263991, + "balance_loss_clip": 1.13541722, + "balance_loss_mlp": 0.24844609, + "epoch": 0.5322711558695326, + "flos": 64631164435200.0, + "grad_norm": 0.7729125415477177, + "language_loss": 0.62148398, + "learning_rate": 1.8880882494160357e-06, + "loss": 0.63668489, + "num_input_tokens_seen": 190452595, + "router_z_loss_clip": 1.203125, + "router_z_loss_mlp": 0.15527344, + "step": 8853, + "time_per_iteration": 3.1103200912475586 + }, + { + "auxiliary_loss_clip": 0.01301216, + "auxiliary_loss_mlp": 0.0022769, + "balance_loss_clip": 1.07999444, + "balance_loss_mlp": 0.199223, + "epoch": 0.5323312791222005, + "flos": 14939342421120.0, + "grad_norm": 8.832547720753915, + "language_loss": 0.88873434, + "learning_rate": 1.8876994000508278e-06, + "loss": 0.90402341, + "num_input_tokens_seen": 190469140, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.28466797, + "step": 8854, + "time_per_iteration": 2.638186454772949 + }, + { + "auxiliary_loss_clip": 0.01300325, + "auxiliary_loss_mlp": 0.00194379, + "balance_loss_clip": 1.08509636, + "balance_loss_mlp": 0.16893977, + "epoch": 0.5323914023748685, + "flos": 23440834229760.0, + "grad_norm": 1229.6016526915168, + "language_loss": 0.82048434, + "learning_rate": 1.8873105549440698e-06, + "loss": 0.8354314, + "num_input_tokens_seen": 190489015, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.2545166, + "step": 8855, + "time_per_iteration": 2.736544370651245 + }, + { + "auxiliary_loss_clip": 0.0129957, + "auxiliary_loss_mlp": 0.00231281, + "balance_loss_clip": 1.0837909, + "balance_loss_mlp": 0.20658064, + "epoch": 0.5324515256275365, + "flos": 26286180134400.0, + "grad_norm": 3.9389162121399353, + "language_loss": 0.71859181, + "learning_rate": 1.886921714110507e-06, + "loss": 0.73390031, + "num_input_tokens_seen": 190508065, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.24694824, + "step": 8856, + "time_per_iteration": 2.638658046722412 + }, + { + "auxiliary_loss_clip": 0.01282524, + "auxiliary_loss_mlp": 0.00232595, + "balance_loss_clip": 1.06458652, + "balance_loss_mlp": 0.20514119, + "epoch": 0.5325116488802044, + "flos": 26870913636480.0, + "grad_norm": 29.41792904167496, + "language_loss": 0.84985232, + "learning_rate": 1.8865328775648842e-06, + "loss": 0.86500359, + "num_input_tokens_seen": 190527045, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.27478027, + "step": 8857, + "time_per_iteration": 2.688770055770874 + }, + { + "auxiliary_loss_clip": 0.01280748, + "auxiliary_loss_mlp": 0.00203013, + "balance_loss_clip": 1.06728601, + "balance_loss_mlp": 0.17741901, + "epoch": 0.5325717721328724, + "flos": 25884734757120.0, + "grad_norm": 3.6609156397968152, + "language_loss": 0.76774359, + "learning_rate": 1.8861440453219456e-06, + "loss": 0.78258115, + "num_input_tokens_seen": 190544075, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.2557373, + "step": 8858, + "time_per_iteration": 2.6629624366760254 + }, + { + "auxiliary_loss_clip": 0.01288437, + "auxiliary_loss_mlp": 0.00243677, + "balance_loss_clip": 1.07244444, + "balance_loss_mlp": 0.21568625, + "epoch": 0.5326318953855403, + "flos": 21799321666560.0, + "grad_norm": 41.6166877389548, + "language_loss": 0.75965673, + "learning_rate": 1.8857552173964367e-06, + "loss": 0.7749778, + "num_input_tokens_seen": 190566030, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.2800293, + "step": 8859, + "time_per_iteration": 2.741753339767456 + }, + { + "auxiliary_loss_clip": 0.01287099, + "auxiliary_loss_mlp": 0.00217038, + "balance_loss_clip": 1.07691705, + "balance_loss_mlp": 0.19296984, + "epoch": 0.5326920186382084, + "flos": 20922921728640.0, + "grad_norm": 11.947770417350432, + "language_loss": 0.74576199, + "learning_rate": 1.8853663938031013e-06, + "loss": 0.76080346, + "num_input_tokens_seen": 190585605, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.24060059, + "step": 8860, + "time_per_iteration": 2.6310105323791504 + }, + { + "auxiliary_loss_clip": 0.01255771, + "auxiliary_loss_mlp": 0.00225497, + "balance_loss_clip": 1.04780495, + "balance_loss_mlp": 0.20016477, + "epoch": 0.5327521418908763, + "flos": 21433427775360.0, + "grad_norm": 6.653158920884805, + "language_loss": 0.84974873, + "learning_rate": 1.884977574556683e-06, + "loss": 0.86456144, + "num_input_tokens_seen": 190604625, + "router_z_loss_clip": 2.08203125, + "router_z_loss_mlp": 0.25341797, + "step": 8861, + "time_per_iteration": 2.797293186187744 + }, + { + "auxiliary_loss_clip": 0.01277945, + "auxiliary_loss_mlp": 0.00212474, + "balance_loss_clip": 1.06450868, + "balance_loss_mlp": 0.1877141, + "epoch": 0.5328122651435443, + "flos": 21760250647680.0, + "grad_norm": 48.17642276655718, + "language_loss": 0.93506718, + "learning_rate": 1.8845887596719279e-06, + "loss": 0.94997132, + "num_input_tokens_seen": 190625060, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.2479248, + "step": 8862, + "time_per_iteration": 2.66556978225708 + }, + { + "auxiliary_loss_clip": 0.01267526, + "auxiliary_loss_mlp": 0.0024941, + "balance_loss_clip": 1.04937983, + "balance_loss_mlp": 0.22069217, + "epoch": 0.5328723883962122, + "flos": 18296487262080.0, + "grad_norm": 3.8529983561861556, + "language_loss": 0.74106026, + "learning_rate": 1.8841999491635778e-06, + "loss": 0.75622964, + "num_input_tokens_seen": 190643150, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.28710938, + "step": 8863, + "time_per_iteration": 2.602647542953491 + }, + { + "auxiliary_loss_clip": 0.01268808, + "auxiliary_loss_mlp": 0.00195996, + "balance_loss_clip": 1.06204152, + "balance_loss_mlp": 0.1736915, + "epoch": 0.5329325116488802, + "flos": 25374911068800.0, + "grad_norm": 2.4531460637536093, + "language_loss": 0.81563449, + "learning_rate": 1.883811143046377e-06, + "loss": 0.83028245, + "num_input_tokens_seen": 190662725, + "router_z_loss_clip": 2.06542969, + "router_z_loss_mlp": 0.22302246, + "step": 8864, + "time_per_iteration": 2.6671297550201416 + }, + { + "auxiliary_loss_clip": 0.01251954, + "auxiliary_loss_mlp": 0.00223238, + "balance_loss_clip": 1.04547524, + "balance_loss_mlp": 0.19739342, + "epoch": 0.5329926349015481, + "flos": 25592098654080.0, + "grad_norm": 5.620023521291987, + "language_loss": 0.72259665, + "learning_rate": 1.8834223413350702e-06, + "loss": 0.7373485, + "num_input_tokens_seen": 190683680, + "router_z_loss_clip": 2.06542969, + "router_z_loss_mlp": 0.25805664, + "step": 8865, + "time_per_iteration": 2.6593377590179443 + }, + { + "auxiliary_loss_clip": 0.01266497, + "auxiliary_loss_mlp": 0.00195008, + "balance_loss_clip": 1.05623996, + "balance_loss_mlp": 0.17099962, + "epoch": 0.5330527581542162, + "flos": 22889605138560.0, + "grad_norm": 6.664707170839729, + "language_loss": 0.86979735, + "learning_rate": 1.8830335440443989e-06, + "loss": 0.88441241, + "num_input_tokens_seen": 190703350, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.23986816, + "step": 8866, + "time_per_iteration": 2.6713709831237793 + }, + { + "auxiliary_loss_clip": 0.01271762, + "auxiliary_loss_mlp": 0.0022683, + "balance_loss_clip": 1.05704784, + "balance_loss_mlp": 0.1990664, + "epoch": 0.5331128814068841, + "flos": 16026752805120.0, + "grad_norm": 719.5317030516484, + "language_loss": 0.82582045, + "learning_rate": 1.882644751189108e-06, + "loss": 0.84080631, + "num_input_tokens_seen": 190721170, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.27758789, + "step": 8867, + "time_per_iteration": 2.622035026550293 + }, + { + "auxiliary_loss_clip": 0.01257287, + "auxiliary_loss_mlp": 0.00211565, + "balance_loss_clip": 1.04778695, + "balance_loss_mlp": 0.18588698, + "epoch": 0.5331730046595521, + "flos": 39344699629440.0, + "grad_norm": 13.518775138942594, + "language_loss": 0.79014653, + "learning_rate": 1.88225596278394e-06, + "loss": 0.80483508, + "num_input_tokens_seen": 190743795, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.25683594, + "step": 8868, + "time_per_iteration": 2.8673477172851562 + }, + { + "auxiliary_loss_clip": 0.01245028, + "auxiliary_loss_mlp": 0.00215958, + "balance_loss_clip": 1.04058933, + "balance_loss_mlp": 0.19086421, + "epoch": 0.5332331279122201, + "flos": 24024382583040.0, + "grad_norm": 1858.1293599996404, + "language_loss": 0.85191309, + "learning_rate": 1.881867178843637e-06, + "loss": 0.86652291, + "num_input_tokens_seen": 190761560, + "router_z_loss_clip": 2.04394531, + "router_z_loss_mlp": 0.25061035, + "step": 8869, + "time_per_iteration": 2.6527585983276367 + }, + { + "auxiliary_loss_clip": 0.01257082, + "auxiliary_loss_mlp": 0.00207704, + "balance_loss_clip": 1.05016744, + "balance_loss_mlp": 0.18245585, + "epoch": 0.533293251164888, + "flos": 17129318728320.0, + "grad_norm": 4.47995647040371, + "language_loss": 0.85181195, + "learning_rate": 1.8814783993829434e-06, + "loss": 0.86645985, + "num_input_tokens_seen": 190778875, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.25256348, + "step": 8870, + "time_per_iteration": 2.6158089637756348 + }, + { + "auxiliary_loss_clip": 0.01258937, + "auxiliary_loss_mlp": 0.00235417, + "balance_loss_clip": 1.04905641, + "balance_loss_mlp": 0.20728317, + "epoch": 0.533353374417556, + "flos": 22126360020480.0, + "grad_norm": 5.8664594407530615, + "language_loss": 0.82018769, + "learning_rate": 1.8810896244165997e-06, + "loss": 0.83513123, + "num_input_tokens_seen": 190799830, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.28125, + "step": 8871, + "time_per_iteration": 2.6351559162139893 + }, + { + "auxiliary_loss_clip": 0.01242485, + "auxiliary_loss_mlp": 0.00197175, + "balance_loss_clip": 1.03955758, + "balance_loss_mlp": 0.17381039, + "epoch": 0.533413497670224, + "flos": 15011091838080.0, + "grad_norm": 98.37284758353567, + "language_loss": 0.79346818, + "learning_rate": 1.8807008539593498e-06, + "loss": 0.80786479, + "num_input_tokens_seen": 190817155, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.23364258, + "step": 8872, + "time_per_iteration": 2.6107332706451416 + }, + { + "auxiliary_loss_clip": 0.01253214, + "auxiliary_loss_mlp": 0.00238158, + "balance_loss_clip": 1.04712987, + "balance_loss_mlp": 0.21212304, + "epoch": 0.533473620922892, + "flos": 19609955890560.0, + "grad_norm": 13.950931229906196, + "language_loss": 0.72189522, + "learning_rate": 1.880312088025936e-06, + "loss": 0.7368089, + "num_input_tokens_seen": 190835240, + "router_z_loss_clip": 2.06054688, + "router_z_loss_mlp": 0.26037598, + "step": 8873, + "time_per_iteration": 2.606431007385254 + }, + { + "auxiliary_loss_clip": 0.01239214, + "auxiliary_loss_mlp": 0.00211811, + "balance_loss_clip": 1.04063225, + "balance_loss_mlp": 0.18774214, + "epoch": 0.5335337441755599, + "flos": 14282644020480.0, + "grad_norm": 33.404682582222485, + "language_loss": 0.89046156, + "learning_rate": 1.879923326631099e-06, + "loss": 0.90497178, + "num_input_tokens_seen": 190851620, + "router_z_loss_clip": 1.98339844, + "router_z_loss_mlp": 0.24060059, + "step": 8874, + "time_per_iteration": 2.606678009033203 + }, + { + "auxiliary_loss_clip": 0.01240042, + "auxiliary_loss_mlp": 0.00206832, + "balance_loss_clip": 1.03489637, + "balance_loss_mlp": 0.18142813, + "epoch": 0.5335938674282279, + "flos": 20814830726400.0, + "grad_norm": 40.10226619420842, + "language_loss": 0.7814163, + "learning_rate": 1.879534569789582e-06, + "loss": 0.79588503, + "num_input_tokens_seen": 190870545, + "router_z_loss_clip": 2.04882812, + "router_z_loss_mlp": 0.25390625, + "step": 8875, + "time_per_iteration": 2.6564202308654785 + }, + { + "auxiliary_loss_clip": 0.01253457, + "auxiliary_loss_mlp": 0.00093748, + "balance_loss_clip": 1.13382363, + "balance_loss_mlp": 0.0854513, + "epoch": 0.5336539906808958, + "flos": 71396448451200.0, + "grad_norm": 0.7074284326733348, + "language_loss": 0.58820498, + "learning_rate": 1.879145817516126e-06, + "loss": 0.601677, + "num_input_tokens_seen": 190931995, + "router_z_loss_clip": 1.203125, + "router_z_loss_mlp": 0.08300781, + "step": 8876, + "time_per_iteration": 3.2505505084991455 + }, + { + "auxiliary_loss_clip": 0.01230365, + "auxiliary_loss_mlp": 0.00197605, + "balance_loss_clip": 1.02616906, + "balance_loss_mlp": 0.17446598, + "epoch": 0.5337141139335638, + "flos": 20152996680960.0, + "grad_norm": 5.6233165282297755, + "language_loss": 0.83433867, + "learning_rate": 1.8787570698254727e-06, + "loss": 0.84861839, + "num_input_tokens_seen": 190949890, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.23144531, + "step": 8877, + "time_per_iteration": 2.6191632747650146 + }, + { + "auxiliary_loss_clip": 0.01252927, + "auxiliary_loss_mlp": 0.00258554, + "balance_loss_clip": 1.13111925, + "balance_loss_mlp": 0.24720517, + "epoch": 0.5337742371862317, + "flos": 67728387484800.0, + "grad_norm": 0.763243412565211, + "language_loss": 0.56707144, + "learning_rate": 1.8783683267323629e-06, + "loss": 0.58218622, + "num_input_tokens_seen": 191008480, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 0.11328125, + "step": 8878, + "time_per_iteration": 3.02239990234375 + }, + { + "auxiliary_loss_clip": 0.01250563, + "auxiliary_loss_mlp": 0.00202621, + "balance_loss_clip": 1.04265177, + "balance_loss_mlp": 0.1766576, + "epoch": 0.5338343604388998, + "flos": 25008909436800.0, + "grad_norm": 5.1547410402627065, + "language_loss": 0.81597376, + "learning_rate": 1.8779795882515395e-06, + "loss": 0.83050561, + "num_input_tokens_seen": 191028995, + "router_z_loss_clip": 2.07910156, + "router_z_loss_mlp": 0.25952148, + "step": 8879, + "time_per_iteration": 2.710376024246216 + }, + { + "auxiliary_loss_clip": 0.01224254, + "auxiliary_loss_mlp": 0.00218781, + "balance_loss_clip": 1.02053916, + "balance_loss_mlp": 0.19284084, + "epoch": 0.5338944836915677, + "flos": 17601256546560.0, + "grad_norm": 26.567737356174575, + "language_loss": 0.93517911, + "learning_rate": 1.8775908543977416e-06, + "loss": 0.9496094, + "num_input_tokens_seen": 191045285, + "router_z_loss_clip": 2.03808594, + "router_z_loss_mlp": 0.25964355, + "step": 8880, + "time_per_iteration": 2.607015609741211 + }, + { + "auxiliary_loss_clip": 0.01221399, + "auxiliary_loss_mlp": 0.00203307, + "balance_loss_clip": 1.02357352, + "balance_loss_mlp": 0.17877354, + "epoch": 0.5339546069442357, + "flos": 21724124544000.0, + "grad_norm": 24.149540695525886, + "language_loss": 0.86234915, + "learning_rate": 1.8772021251857107e-06, + "loss": 0.87659621, + "num_input_tokens_seen": 191066105, + "router_z_loss_clip": 1.98046875, + "router_z_loss_mlp": 0.24511719, + "step": 8881, + "time_per_iteration": 2.701364278793335 + }, + { + "auxiliary_loss_clip": 0.01239862, + "auxiliary_loss_mlp": 0.0014224, + "balance_loss_clip": 1.11768198, + "balance_loss_mlp": 0.13246524, + "epoch": 0.5340147301969036, + "flos": 69723583315200.0, + "grad_norm": 0.7757066038150747, + "language_loss": 0.59247291, + "learning_rate": 1.8768134006301882e-06, + "loss": 0.60629392, + "num_input_tokens_seen": 191126315, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 0.09765625, + "step": 8882, + "time_per_iteration": 3.074885845184326 + }, + { + "auxiliary_loss_clip": 0.01238744, + "auxiliary_loss_mlp": 0.00125012, + "balance_loss_clip": 1.1196034, + "balance_loss_mlp": 0.11471229, + "epoch": 0.5340748534495716, + "flos": 63880701580800.0, + "grad_norm": 0.8577185168346692, + "language_loss": 0.63671088, + "learning_rate": 1.876424680745913e-06, + "loss": 0.65034842, + "num_input_tokens_seen": 191174240, + "router_z_loss_clip": 1.1953125, + "router_z_loss_mlp": 0.10302734, + "step": 8883, + "time_per_iteration": 2.9871091842651367 + }, + { + "auxiliary_loss_clip": 0.01220294, + "auxiliary_loss_mlp": 0.00210553, + "balance_loss_clip": 1.01695299, + "balance_loss_mlp": 0.18536375, + "epoch": 0.5341349767022396, + "flos": 28694313694080.0, + "grad_norm": 5.075538719651704, + "language_loss": 0.9302088, + "learning_rate": 1.8760359655476272e-06, + "loss": 0.94451725, + "num_input_tokens_seen": 191193335, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.25170898, + "step": 8884, + "time_per_iteration": 4.256259441375732 + }, + { + "auxiliary_loss_clip": 0.01225618, + "auxiliary_loss_mlp": 0.00231487, + "balance_loss_clip": 1.02553511, + "balance_loss_mlp": 0.20863496, + "epoch": 0.5341950999549075, + "flos": 16289691338880.0, + "grad_norm": 6.7302531585989325, + "language_loss": 0.78390789, + "learning_rate": 1.8756472550500695e-06, + "loss": 0.79847896, + "num_input_tokens_seen": 191210900, + "router_z_loss_clip": 1.99902344, + "router_z_loss_mlp": 0.22875977, + "step": 8885, + "time_per_iteration": 4.0613853931427 + }, + { + "auxiliary_loss_clip": 0.01214102, + "auxiliary_loss_mlp": 0.00249637, + "balance_loss_clip": 1.01325727, + "balance_loss_mlp": 0.22257617, + "epoch": 0.5342552232075756, + "flos": 14355650413440.0, + "grad_norm": 599.5630470781546, + "language_loss": 0.88424766, + "learning_rate": 1.87525854926798e-06, + "loss": 0.89888513, + "num_input_tokens_seen": 191226730, + "router_z_loss_clip": 2.00585938, + "router_z_loss_mlp": 0.27038574, + "step": 8886, + "time_per_iteration": 2.6026904582977295 + }, + { + "auxiliary_loss_clip": 0.01220439, + "auxiliary_loss_mlp": 0.00240784, + "balance_loss_clip": 1.0200814, + "balance_loss_mlp": 0.21478495, + "epoch": 0.5343153464602435, + "flos": 30297976300800.0, + "grad_norm": 18.795702230806196, + "language_loss": 0.81734073, + "learning_rate": 1.8748698482160996e-06, + "loss": 0.83195293, + "num_input_tokens_seen": 191250435, + "router_z_loss_clip": 2.00390625, + "router_z_loss_mlp": 0.25976562, + "step": 8887, + "time_per_iteration": 4.127835988998413 + }, + { + "auxiliary_loss_clip": 0.01222977, + "auxiliary_loss_mlp": 0.00239355, + "balance_loss_clip": 1.02327371, + "balance_loss_mlp": 0.21283078, + "epoch": 0.5343754697129115, + "flos": 15596292216960.0, + "grad_norm": 88.07753281698386, + "language_loss": 0.81665587, + "learning_rate": 1.8744811519091663e-06, + "loss": 0.83127928, + "num_input_tokens_seen": 191268315, + "router_z_loss_clip": 1.99609375, + "router_z_loss_mlp": 0.26538086, + "step": 8888, + "time_per_iteration": 2.66058349609375 + }, + { + "auxiliary_loss_clip": 0.01229671, + "auxiliary_loss_mlp": 0.00253793, + "balance_loss_clip": 1.0219456, + "balance_loss_mlp": 0.22582659, + "epoch": 0.5344355929655794, + "flos": 16909617191040.0, + "grad_norm": 6.7860216335414965, + "language_loss": 0.88776743, + "learning_rate": 1.8740924603619208e-06, + "loss": 0.90260208, + "num_input_tokens_seen": 191287000, + "router_z_loss_clip": 2.07617188, + "router_z_loss_mlp": 0.27978516, + "step": 8889, + "time_per_iteration": 2.6240501403808594 + }, + { + "auxiliary_loss_clip": 0.01212328, + "auxiliary_loss_mlp": 0.00240405, + "balance_loss_clip": 1.0123632, + "balance_loss_mlp": 0.21539497, + "epoch": 0.5344957162182474, + "flos": 16798186224000.0, + "grad_norm": 57.348491882443774, + "language_loss": 0.76784641, + "learning_rate": 1.873703773589102e-06, + "loss": 0.78237373, + "num_input_tokens_seen": 191304565, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.25036621, + "step": 8890, + "time_per_iteration": 2.613736391067505 + }, + { + "auxiliary_loss_clip": 0.01217368, + "auxiliary_loss_mlp": 0.0026321, + "balance_loss_clip": 1.01316738, + "balance_loss_mlp": 0.23752028, + "epoch": 0.5345558394709153, + "flos": 12705590413440.0, + "grad_norm": 9.202328631142448, + "language_loss": 0.89075148, + "learning_rate": 1.8733150916054483e-06, + "loss": 0.90555722, + "num_input_tokens_seen": 191318300, + "router_z_loss_clip": 2.04296875, + "router_z_loss_mlp": 0.25671387, + "step": 8891, + "time_per_iteration": 4.136599540710449 + }, + { + "auxiliary_loss_clip": 0.0118407, + "auxiliary_loss_mlp": 0.00230223, + "balance_loss_clip": 0.99147081, + "balance_loss_mlp": 0.20566589, + "epoch": 0.5346159627235834, + "flos": 22455050400000.0, + "grad_norm": 39.65254688451946, + "language_loss": 0.81689513, + "learning_rate": 1.872926414425699e-06, + "loss": 0.831038, + "num_input_tokens_seen": 191337925, + "router_z_loss_clip": 1.92578125, + "router_z_loss_mlp": 0.24572754, + "step": 8892, + "time_per_iteration": 2.6474435329437256 + }, + { + "auxiliary_loss_clip": 0.01198094, + "auxiliary_loss_mlp": 0.0021384, + "balance_loss_clip": 0.9999972, + "balance_loss_mlp": 0.18856725, + "epoch": 0.5346760859762513, + "flos": 22415763899520.0, + "grad_norm": 825.8254394766508, + "language_loss": 0.94102156, + "learning_rate": 1.8725377420645932e-06, + "loss": 0.95514089, + "num_input_tokens_seen": 191357120, + "router_z_loss_clip": 1.98144531, + "router_z_loss_mlp": 0.25292969, + "step": 8893, + "time_per_iteration": 2.6200153827667236 + }, + { + "auxiliary_loss_clip": 0.01193964, + "auxiliary_loss_mlp": 0.00215571, + "balance_loss_clip": 0.99707752, + "balance_loss_mlp": 0.19157444, + "epoch": 0.5347362092289193, + "flos": 22816131868800.0, + "grad_norm": 7.578053181565926, + "language_loss": 0.81227577, + "learning_rate": 1.872149074536869e-06, + "loss": 0.82637107, + "num_input_tokens_seen": 191375395, + "router_z_loss_clip": 1.97070312, + "router_z_loss_mlp": 0.2401123, + "step": 8894, + "time_per_iteration": 2.6408772468566895 + }, + { + "auxiliary_loss_clip": 0.01186166, + "auxiliary_loss_mlp": 0.0026147, + "balance_loss_clip": 0.99413121, + "balance_loss_mlp": 0.23780695, + "epoch": 0.5347963324815872, + "flos": 23219480666880.0, + "grad_norm": 6.812642533570317, + "language_loss": 0.82060778, + "learning_rate": 1.8717604118572648e-06, + "loss": 0.8350842, + "num_input_tokens_seen": 191395595, + "router_z_loss_clip": 1.92285156, + "router_z_loss_mlp": 0.23681641, + "step": 8895, + "time_per_iteration": 2.6268327236175537 + }, + { + "auxiliary_loss_clip": 0.01191724, + "auxiliary_loss_mlp": 0.00236078, + "balance_loss_clip": 0.99539161, + "balance_loss_mlp": 0.21245024, + "epoch": 0.5348564557342552, + "flos": 22601350494720.0, + "grad_norm": 9.024884742447265, + "language_loss": 0.8315171, + "learning_rate": 1.8713717540405178e-06, + "loss": 0.84579515, + "num_input_tokens_seen": 191413730, + "router_z_loss_clip": 1.96289062, + "router_z_loss_mlp": 0.23608398, + "step": 8896, + "time_per_iteration": 2.6638476848602295 + }, + { + "auxiliary_loss_clip": 0.01192183, + "auxiliary_loss_mlp": 0.00262191, + "balance_loss_clip": 0.99468231, + "balance_loss_mlp": 0.23673971, + "epoch": 0.5349165789869232, + "flos": 18002378701440.0, + "grad_norm": 37.20040389691521, + "language_loss": 0.86120582, + "learning_rate": 1.8709831011013676e-06, + "loss": 0.87574953, + "num_input_tokens_seen": 191432400, + "router_z_loss_clip": 1.97460938, + "router_z_loss_mlp": 0.25463867, + "step": 8897, + "time_per_iteration": 2.607800006866455 + }, + { + "auxiliary_loss_clip": 0.01193568, + "auxiliary_loss_mlp": 0.00221776, + "balance_loss_clip": 0.99636316, + "balance_loss_mlp": 0.19770774, + "epoch": 0.5349767022395912, + "flos": 17159770483200.0, + "grad_norm": 273.68908097704957, + "language_loss": 0.85429442, + "learning_rate": 1.8705944530545509e-06, + "loss": 0.86844778, + "num_input_tokens_seen": 191448855, + "router_z_loss_clip": 1.97167969, + "router_z_loss_mlp": 0.2409668, + "step": 8898, + "time_per_iteration": 2.5986950397491455 + }, + { + "auxiliary_loss_clip": 0.01211113, + "auxiliary_loss_mlp": 0.0032346, + "balance_loss_clip": 1.0871532, + "balance_loss_mlp": 0.31187332, + "epoch": 0.5350368254922592, + "flos": 70992058158720.0, + "grad_norm": 0.9128233983857472, + "language_loss": 0.57680213, + "learning_rate": 1.8702058099148052e-06, + "loss": 0.59214789, + "num_input_tokens_seen": 191519690, + "router_z_loss_clip": 1.234375, + "router_z_loss_mlp": 0.11572266, + "step": 8899, + "time_per_iteration": 3.3019955158233643 + }, + { + "auxiliary_loss_clip": 0.01185984, + "auxiliary_loss_mlp": 0.00251196, + "balance_loss_clip": 0.98920846, + "balance_loss_mlp": 0.22794999, + "epoch": 0.5350969487449271, + "flos": 27417833095680.0, + "grad_norm": 137.30723193951238, + "language_loss": 0.76555437, + "learning_rate": 1.869817171696868e-06, + "loss": 0.77992618, + "num_input_tokens_seen": 191539380, + "router_z_loss_clip": 1.96972656, + "router_z_loss_mlp": 0.23266602, + "step": 8900, + "time_per_iteration": 2.681436538696289 + }, + { + "auxiliary_loss_clip": 0.01196969, + "auxiliary_loss_mlp": 0.00237427, + "balance_loss_clip": 0.99639654, + "balance_loss_mlp": 0.21412188, + "epoch": 0.5351570719975951, + "flos": 19316134638720.0, + "grad_norm": 3.495578175722658, + "language_loss": 0.78697693, + "learning_rate": 1.8694285384154777e-06, + "loss": 0.80132091, + "num_input_tokens_seen": 191557400, + "router_z_loss_clip": 2.0078125, + "router_z_loss_mlp": 0.23291016, + "step": 8901, + "time_per_iteration": 2.640850305557251 + }, + { + "auxiliary_loss_clip": 0.0118046, + "auxiliary_loss_mlp": 0.00245539, + "balance_loss_clip": 0.98650849, + "balance_loss_mlp": 0.22049281, + "epoch": 0.535217195250263, + "flos": 19828580019840.0, + "grad_norm": 15.13914920355804, + "language_loss": 0.86219859, + "learning_rate": 1.8690399100853699e-06, + "loss": 0.87645859, + "num_input_tokens_seen": 191575860, + "router_z_loss_clip": 1.94238281, + "router_z_loss_mlp": 0.25048828, + "step": 8902, + "time_per_iteration": 2.667057514190674 + }, + { + "auxiliary_loss_clip": 0.01174292, + "auxiliary_loss_mlp": 0.00224045, + "balance_loss_clip": 0.98318851, + "balance_loss_mlp": 0.20138311, + "epoch": 0.535277318502931, + "flos": 22127868391680.0, + "grad_norm": 11.598828619997196, + "language_loss": 0.77425408, + "learning_rate": 1.868651286721281e-06, + "loss": 0.78823745, + "num_input_tokens_seen": 191595775, + "router_z_loss_clip": 1.91015625, + "router_z_loss_mlp": 0.2265625, + "step": 8903, + "time_per_iteration": 2.7000534534454346 + }, + { + "auxiliary_loss_clip": 0.01199881, + "auxiliary_loss_mlp": 0.00242123, + "balance_loss_clip": 0.99947679, + "balance_loss_mlp": 0.21871036, + "epoch": 0.5353374417555989, + "flos": 25045897466880.0, + "grad_norm": 408.8967133801997, + "language_loss": 0.80880791, + "learning_rate": 1.86826266833795e-06, + "loss": 0.82322794, + "num_input_tokens_seen": 191617785, + "router_z_loss_clip": 2.00488281, + "router_z_loss_mlp": 0.234375, + "step": 8904, + "time_per_iteration": 2.703160524368286 + }, + { + "auxiliary_loss_clip": 0.01191273, + "auxiliary_loss_mlp": 0.002274, + "balance_loss_clip": 0.99483049, + "balance_loss_mlp": 0.20461896, + "epoch": 0.535397565008267, + "flos": 19388710068480.0, + "grad_norm": 275.9712018598858, + "language_loss": 0.81878948, + "learning_rate": 1.8678740549501103e-06, + "loss": 0.83297616, + "num_input_tokens_seen": 191636900, + "router_z_loss_clip": 1.96386719, + "router_z_loss_mlp": 0.2277832, + "step": 8905, + "time_per_iteration": 2.6156468391418457 + }, + { + "auxiliary_loss_clip": 0.01180352, + "auxiliary_loss_mlp": 0.00221643, + "balance_loss_clip": 0.99007833, + "balance_loss_mlp": 0.19930293, + "epoch": 0.5354576882609349, + "flos": 21471205904640.0, + "grad_norm": 21.299333172410023, + "language_loss": 0.90211987, + "learning_rate": 1.8674854465725005e-06, + "loss": 0.91613984, + "num_input_tokens_seen": 191656720, + "router_z_loss_clip": 1.90527344, + "router_z_loss_mlp": 0.22338867, + "step": 8906, + "time_per_iteration": 2.63397216796875 + }, + { + "auxiliary_loss_clip": 0.01195373, + "auxiliary_loss_mlp": 0.0022534, + "balance_loss_clip": 0.99510968, + "balance_loss_mlp": 0.20078273, + "epoch": 0.5355178115136029, + "flos": 20777519473920.0, + "grad_norm": 10.872820808968797, + "language_loss": 0.8258599, + "learning_rate": 1.8670968432198563e-06, + "loss": 0.84006703, + "num_input_tokens_seen": 191674445, + "router_z_loss_clip": 2.00097656, + "router_z_loss_mlp": 0.24511719, + "step": 8907, + "time_per_iteration": 2.7108986377716064 + }, + { + "auxiliary_loss_clip": 0.01171219, + "auxiliary_loss_mlp": 0.00221682, + "balance_loss_clip": 0.97822964, + "balance_loss_mlp": 0.19671918, + "epoch": 0.5355779347662708, + "flos": 23514020190720.0, + "grad_norm": 7.473800042738709, + "language_loss": 0.85166693, + "learning_rate": 1.866708244906912e-06, + "loss": 0.86559594, + "num_input_tokens_seen": 191695000, + "router_z_loss_clip": 1.92871094, + "router_z_loss_mlp": 0.24987793, + "step": 8908, + "time_per_iteration": 2.658963441848755 + }, + { + "auxiliary_loss_clip": 0.01195852, + "auxiliary_loss_mlp": 0.00220461, + "balance_loss_clip": 0.99249029, + "balance_loss_mlp": 0.19645238, + "epoch": 0.5356380580189388, + "flos": 20303211358080.0, + "grad_norm": 374.51788972854257, + "language_loss": 0.83756006, + "learning_rate": 1.8663196516484055e-06, + "loss": 0.85172319, + "num_input_tokens_seen": 191713295, + "router_z_loss_clip": 2.03515625, + "router_z_loss_mlp": 0.24035645, + "step": 8909, + "time_per_iteration": 2.60960054397583 + }, + { + "auxiliary_loss_clip": 0.01187868, + "auxiliary_loss_mlp": 0.00203384, + "balance_loss_clip": 0.99236953, + "balance_loss_mlp": 0.1801382, + "epoch": 0.5356981812716068, + "flos": 21361642444800.0, + "grad_norm": 32.11247089544186, + "language_loss": 0.91226268, + "learning_rate": 1.8659310634590702e-06, + "loss": 0.92617512, + "num_input_tokens_seen": 191732725, + "router_z_loss_clip": 1.95703125, + "router_z_loss_mlp": 0.23242188, + "step": 8910, + "time_per_iteration": 2.6668524742126465 + }, + { + "auxiliary_loss_clip": 0.01183116, + "auxiliary_loss_mlp": 0.00221741, + "balance_loss_clip": 0.98795873, + "balance_loss_mlp": 0.19739801, + "epoch": 0.5357583045242748, + "flos": 23111246010240.0, + "grad_norm": 14.646340422444975, + "language_loss": 0.8842656, + "learning_rate": 1.8655424803536427e-06, + "loss": 0.89831412, + "num_input_tokens_seen": 191753765, + "router_z_loss_clip": 1.95214844, + "router_z_loss_mlp": 0.24365234, + "step": 8911, + "time_per_iteration": 2.680492401123047 + }, + { + "auxiliary_loss_clip": 0.01178749, + "auxiliary_loss_mlp": 0.00199486, + "balance_loss_clip": 0.98503828, + "balance_loss_mlp": 0.17806403, + "epoch": 0.5358184277769428, + "flos": 21141761339520.0, + "grad_norm": 3.8425172820041356, + "language_loss": 0.75189209, + "learning_rate": 1.8651539023468585e-06, + "loss": 0.76567441, + "num_input_tokens_seen": 191773560, + "router_z_loss_clip": 1.93652344, + "router_z_loss_mlp": 0.2142334, + "step": 8912, + "time_per_iteration": 2.6505861282348633 + }, + { + "auxiliary_loss_clip": 0.0120212, + "auxiliary_loss_mlp": 0.00253879, + "balance_loss_clip": 1.0049597, + "balance_loss_mlp": 0.22779605, + "epoch": 0.5358785510296107, + "flos": 16282400878080.0, + "grad_norm": 12.266506025594108, + "language_loss": 0.80418479, + "learning_rate": 1.8647653294534509e-06, + "loss": 0.81874478, + "num_input_tokens_seen": 191791255, + "router_z_loss_clip": 1.97070312, + "router_z_loss_mlp": 0.26049805, + "step": 8913, + "time_per_iteration": 2.6107358932495117 + }, + { + "auxiliary_loss_clip": 0.01201352, + "auxiliary_loss_mlp": 0.00227495, + "balance_loss_clip": 0.99704349, + "balance_loss_mlp": 0.20228189, + "epoch": 0.5359386742822787, + "flos": 16976877408000.0, + "grad_norm": 2760.2208625135813, + "language_loss": 0.79832685, + "learning_rate": 1.864376761688156e-06, + "loss": 0.81261533, + "num_input_tokens_seen": 191809325, + "router_z_loss_clip": 2.04492188, + "router_z_loss_mlp": 0.25219727, + "step": 8914, + "time_per_iteration": 2.653411388397217 + }, + { + "auxiliary_loss_clip": 0.01195055, + "auxiliary_loss_mlp": 0.00247355, + "balance_loss_clip": 0.99407655, + "balance_loss_mlp": 0.22165391, + "epoch": 0.5359987975349466, + "flos": 20812927305600.0, + "grad_norm": 3.744776655752441, + "language_loss": 0.78784996, + "learning_rate": 1.8639881990657079e-06, + "loss": 0.80227405, + "num_input_tokens_seen": 191829795, + "router_z_loss_clip": 2.00976562, + "router_z_loss_mlp": 0.25695801, + "step": 8915, + "time_per_iteration": 2.6386425495147705 + }, + { + "auxiliary_loss_clip": 0.01198956, + "auxiliary_loss_mlp": 0.00221824, + "balance_loss_clip": 0.99892378, + "balance_loss_mlp": 0.1978271, + "epoch": 0.5360589207876146, + "flos": 22199941031040.0, + "grad_norm": 10.682722117966357, + "language_loss": 0.85290104, + "learning_rate": 1.8635996416008408e-06, + "loss": 0.86710888, + "num_input_tokens_seen": 191850840, + "router_z_loss_clip": 1.99902344, + "router_z_loss_mlp": 0.23986816, + "step": 8916, + "time_per_iteration": 2.690218687057495 + }, + { + "auxiliary_loss_clip": 0.01191271, + "auxiliary_loss_mlp": 0.0022084, + "balance_loss_clip": 0.99612844, + "balance_loss_mlp": 0.19641447, + "epoch": 0.5361190440402825, + "flos": 31394365084800.0, + "grad_norm": 9.937623516628049, + "language_loss": 0.79203188, + "learning_rate": 1.863211089308289e-06, + "loss": 0.806153, + "num_input_tokens_seen": 191869520, + "router_z_loss_clip": 1.95214844, + "router_z_loss_mlp": 0.24438477, + "step": 8917, + "time_per_iteration": 2.718505382537842 + }, + { + "auxiliary_loss_clip": 0.01224372, + "auxiliary_loss_mlp": 0.00231795, + "balance_loss_clip": 1.01782691, + "balance_loss_mlp": 0.20543739, + "epoch": 0.5361791672929506, + "flos": 16069882060800.0, + "grad_norm": 62.12950401603561, + "language_loss": 0.82558256, + "learning_rate": 1.8628225422027865e-06, + "loss": 0.84014428, + "num_input_tokens_seen": 191887240, + "router_z_loss_clip": 2.06445312, + "router_z_loss_mlp": 0.2635498, + "step": 8918, + "time_per_iteration": 2.669370412826538 + }, + { + "auxiliary_loss_clip": 0.01185931, + "auxiliary_loss_mlp": 0.00227894, + "balance_loss_clip": 0.98926622, + "balance_loss_mlp": 0.20296766, + "epoch": 0.5362392905456185, + "flos": 20740926493440.0, + "grad_norm": 2.2977508142702483, + "language_loss": 0.8279835, + "learning_rate": 1.862434000299067e-06, + "loss": 0.84212172, + "num_input_tokens_seen": 191905690, + "router_z_loss_clip": 1.96777344, + "router_z_loss_mlp": 0.24926758, + "step": 8919, + "time_per_iteration": 2.630331516265869 + }, + { + "auxiliary_loss_clip": 0.01203582, + "auxiliary_loss_mlp": 0.00219488, + "balance_loss_clip": 1.00059664, + "balance_loss_mlp": 0.19266562, + "epoch": 0.5362994137982865, + "flos": 17340077779200.0, + "grad_norm": 183.00461399416508, + "language_loss": 0.80713278, + "learning_rate": 1.862045463611864e-06, + "loss": 0.82136351, + "num_input_tokens_seen": 191920725, + "router_z_loss_clip": 2.03027344, + "router_z_loss_mlp": 0.26831055, + "step": 8920, + "time_per_iteration": 2.6078219413757324 + }, + { + "auxiliary_loss_clip": 0.01176673, + "auxiliary_loss_mlp": 0.00223882, + "balance_loss_clip": 0.9834879, + "balance_loss_mlp": 0.19804926, + "epoch": 0.5363595370509544, + "flos": 42813957795840.0, + "grad_norm": 7.187150334972805, + "language_loss": 0.7441411, + "learning_rate": 1.8616569321559105e-06, + "loss": 0.75814664, + "num_input_tokens_seen": 191944645, + "router_z_loss_clip": 1.93261719, + "router_z_loss_mlp": 0.25842285, + "step": 8921, + "time_per_iteration": 2.885011672973633 + }, + { + "auxiliary_loss_clip": 0.01205663, + "auxiliary_loss_mlp": 0.00263504, + "balance_loss_clip": 1.00505352, + "balance_loss_mlp": 0.23740953, + "epoch": 0.5364196603036224, + "flos": 19171953446400.0, + "grad_norm": 115.53113359382232, + "language_loss": 0.88319898, + "learning_rate": 1.86126840594594e-06, + "loss": 0.89789069, + "num_input_tokens_seen": 191962265, + "router_z_loss_clip": 2.00878906, + "router_z_loss_mlp": 0.26074219, + "step": 8922, + "time_per_iteration": 2.626796245574951 + }, + { + "auxiliary_loss_clip": 0.01210915, + "auxiliary_loss_mlp": 0.00231551, + "balance_loss_clip": 1.00669956, + "balance_loss_mlp": 0.20502727, + "epoch": 0.5364797835562904, + "flos": 17931060247680.0, + "grad_norm": 4.839510568598882, + "language_loss": 0.85520422, + "learning_rate": 1.860879884996686e-06, + "loss": 0.86962891, + "num_input_tokens_seen": 191978850, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.26550293, + "step": 8923, + "time_per_iteration": 2.5880393981933594 + }, + { + "auxiliary_loss_clip": 0.01192279, + "auxiliary_loss_mlp": 0.00220016, + "balance_loss_clip": 0.99594784, + "balance_loss_mlp": 0.19488618, + "epoch": 0.5365399068089584, + "flos": 30228058477440.0, + "grad_norm": 16.72169007390184, + "language_loss": 0.77336776, + "learning_rate": 1.8604913693228804e-06, + "loss": 0.78749067, + "num_input_tokens_seen": 192002000, + "router_z_loss_clip": 1.96386719, + "router_z_loss_mlp": 0.2512207, + "step": 8924, + "time_per_iteration": 2.717101812362671 + }, + { + "auxiliary_loss_clip": 0.01223945, + "auxiliary_loss_mlp": 0.00222727, + "balance_loss_clip": 1.01330924, + "balance_loss_mlp": 0.19471329, + "epoch": 0.5366000300616264, + "flos": 24891696380160.0, + "grad_norm": 17.849524471044784, + "language_loss": 0.95844519, + "learning_rate": 1.8601028589392558e-06, + "loss": 0.97291189, + "num_input_tokens_seen": 192019100, + "router_z_loss_clip": 2.10449219, + "router_z_loss_mlp": 0.2800293, + "step": 8925, + "time_per_iteration": 2.8330416679382324 + }, + { + "auxiliary_loss_clip": 0.01194215, + "auxiliary_loss_mlp": 0.00217931, + "balance_loss_clip": 0.99502182, + "balance_loss_mlp": 0.19076297, + "epoch": 0.5366601533142943, + "flos": 29826649013760.0, + "grad_norm": 130.2814890620954, + "language_loss": 0.8538509, + "learning_rate": 1.8597143538605455e-06, + "loss": 0.86797237, + "num_input_tokens_seen": 192041660, + "router_z_loss_clip": 1.99316406, + "router_z_loss_mlp": 0.27160645, + "step": 8926, + "time_per_iteration": 4.139146327972412 + }, + { + "auxiliary_loss_clip": 0.01199698, + "auxiliary_loss_mlp": 0.00205025, + "balance_loss_clip": 1.00165927, + "balance_loss_mlp": 0.17963353, + "epoch": 0.5367202765669623, + "flos": 27199352620800.0, + "grad_norm": 135.80505769073244, + "language_loss": 0.74492002, + "learning_rate": 1.85932585410148e-06, + "loss": 0.75896722, + "num_input_tokens_seen": 192063540, + "router_z_loss_clip": 1.98046875, + "router_z_loss_mlp": 0.25415039, + "step": 8927, + "time_per_iteration": 4.115447044372559 + }, + { + "auxiliary_loss_clip": 0.01224536, + "auxiliary_loss_mlp": 0.00238096, + "balance_loss_clip": 1.01870275, + "balance_loss_mlp": 0.20900846, + "epoch": 0.5367803998196302, + "flos": 20229953569920.0, + "grad_norm": 4.292643080023921, + "language_loss": 0.84430486, + "learning_rate": 1.8589373596767929e-06, + "loss": 0.85893118, + "num_input_tokens_seen": 192081760, + "router_z_loss_clip": 2.06054688, + "router_z_loss_mlp": 0.29077148, + "step": 8928, + "time_per_iteration": 2.6568150520324707 + }, + { + "auxiliary_loss_clip": 0.01175258, + "auxiliary_loss_mlp": 0.00228437, + "balance_loss_clip": 0.98387539, + "balance_loss_mlp": 0.20100668, + "epoch": 0.5368405230722982, + "flos": 32154629374080.0, + "grad_norm": 14.452749556604573, + "language_loss": 0.71008158, + "learning_rate": 1.8585488706012154e-06, + "loss": 0.72411847, + "num_input_tokens_seen": 192101620, + "router_z_loss_clip": 1.91308594, + "router_z_loss_mlp": 0.27429199, + "step": 8929, + "time_per_iteration": 4.1480748653411865 + }, + { + "auxiliary_loss_clip": 0.01210117, + "auxiliary_loss_mlp": 0.00217077, + "balance_loss_clip": 1.01029491, + "balance_loss_mlp": 0.19089907, + "epoch": 0.5369006463249661, + "flos": 26247935128320.0, + "grad_norm": 5.381299114714289, + "language_loss": 0.75191104, + "learning_rate": 1.8581603868894781e-06, + "loss": 0.76618296, + "num_input_tokens_seen": 192121805, + "router_z_loss_clip": 1.99804688, + "router_z_loss_mlp": 0.26196289, + "step": 8930, + "time_per_iteration": 2.779059886932373 + }, + { + "auxiliary_loss_clip": 0.01176792, + "auxiliary_loss_mlp": 0.00249788, + "balance_loss_clip": 0.98512155, + "balance_loss_mlp": 0.22362138, + "epoch": 0.5369607695776342, + "flos": 26211306234240.0, + "grad_norm": 41.16752111691508, + "language_loss": 0.72854555, + "learning_rate": 1.8577719085563136e-06, + "loss": 0.74281132, + "num_input_tokens_seen": 192141765, + "router_z_loss_clip": 1.91699219, + "router_z_loss_mlp": 0.26147461, + "step": 8931, + "time_per_iteration": 2.6939103603363037 + }, + { + "auxiliary_loss_clip": 0.01196354, + "auxiliary_loss_mlp": 0.00223676, + "balance_loss_clip": 1.00077856, + "balance_loss_mlp": 0.19653234, + "epoch": 0.5370208928303021, + "flos": 25009017177600.0, + "grad_norm": 17.418433857568488, + "language_loss": 0.82908958, + "learning_rate": 1.8573834356164525e-06, + "loss": 0.84328985, + "num_input_tokens_seen": 192161560, + "router_z_loss_clip": 1.95507812, + "router_z_loss_mlp": 0.27124023, + "step": 8932, + "time_per_iteration": 2.655109167098999 + }, + { + "auxiliary_loss_clip": 0.01182928, + "auxiliary_loss_mlp": 0.00230375, + "balance_loss_clip": 0.98873544, + "balance_loss_mlp": 0.20399427, + "epoch": 0.5370810160829701, + "flos": 31792147274880.0, + "grad_norm": 7.881113772765568, + "language_loss": 0.72060931, + "learning_rate": 1.8569949680846261e-06, + "loss": 0.7347424, + "num_input_tokens_seen": 192180190, + "router_z_loss_clip": 1.94335938, + "router_z_loss_mlp": 0.26391602, + "step": 8933, + "time_per_iteration": 4.107812166213989 + }, + { + "auxiliary_loss_clip": 0.01184555, + "auxiliary_loss_mlp": 0.00224154, + "balance_loss_clip": 0.98861057, + "balance_loss_mlp": 0.19687892, + "epoch": 0.537141139335638, + "flos": 23842602829440.0, + "grad_norm": 122.20216784959115, + "language_loss": 0.90167695, + "learning_rate": 1.856606505975565e-06, + "loss": 0.91576409, + "num_input_tokens_seen": 192198855, + "router_z_loss_clip": 1.95703125, + "router_z_loss_mlp": 0.27258301, + "step": 8934, + "time_per_iteration": 2.7282941341400146 + }, + { + "auxiliary_loss_clip": 0.0119993, + "auxiliary_loss_mlp": 0.00229372, + "balance_loss_clip": 1.00106311, + "balance_loss_mlp": 0.20319328, + "epoch": 0.537201262588306, + "flos": 18508826511360.0, + "grad_norm": 3.8902069161396917, + "language_loss": 0.87145948, + "learning_rate": 1.856218049303999e-06, + "loss": 0.88575244, + "num_input_tokens_seen": 192216555, + "router_z_loss_clip": 1.98925781, + "router_z_loss_mlp": 0.26184082, + "step": 8935, + "time_per_iteration": 2.621873378753662 + }, + { + "auxiliary_loss_clip": 0.01193744, + "auxiliary_loss_mlp": 0.00247602, + "balance_loss_clip": 0.99159908, + "balance_loss_mlp": 0.21825245, + "epoch": 0.537261385840974, + "flos": 25662950231040.0, + "grad_norm": 14.320277872611955, + "language_loss": 0.91148579, + "learning_rate": 1.855829598084659e-06, + "loss": 0.92589927, + "num_input_tokens_seen": 192236910, + "router_z_loss_clip": 2.0234375, + "router_z_loss_mlp": 0.29321289, + "step": 8936, + "time_per_iteration": 2.6810572147369385 + }, + { + "auxiliary_loss_clip": 0.0121825, + "auxiliary_loss_mlp": 0.0021435, + "balance_loss_clip": 1.01408958, + "balance_loss_mlp": 0.18829037, + "epoch": 0.537321509093642, + "flos": 40735017406080.0, + "grad_norm": 2413.7596904559746, + "language_loss": 0.7741487, + "learning_rate": 1.8554411523322754e-06, + "loss": 0.78847468, + "num_input_tokens_seen": 192260790, + "router_z_loss_clip": 2.04394531, + "router_z_loss_mlp": 0.26037598, + "step": 8937, + "time_per_iteration": 2.829597234725952 + }, + { + "auxiliary_loss_clip": 0.01205257, + "auxiliary_loss_mlp": 0.00214279, + "balance_loss_clip": 0.99956375, + "balance_loss_mlp": 0.18681301, + "epoch": 0.53738163234631, + "flos": 17238487138560.0, + "grad_norm": 89.34425630173071, + "language_loss": 0.90141141, + "learning_rate": 1.8550527120615778e-06, + "loss": 0.91560674, + "num_input_tokens_seen": 192277230, + "router_z_loss_clip": 2.05859375, + "router_z_loss_mlp": 0.2746582, + "step": 8938, + "time_per_iteration": 2.6579015254974365 + }, + { + "auxiliary_loss_clip": 0.0122382, + "auxiliary_loss_mlp": 0.0022493, + "balance_loss_clip": 1.01391089, + "balance_loss_mlp": 0.19680873, + "epoch": 0.5374417555989779, + "flos": 12821977457280.0, + "grad_norm": 25.58896612128774, + "language_loss": 0.92104143, + "learning_rate": 1.8546642772872957e-06, + "loss": 0.93552887, + "num_input_tokens_seen": 192292840, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.28137207, + "step": 8939, + "time_per_iteration": 2.6239569187164307 + }, + { + "auxiliary_loss_clip": 0.01252387, + "auxiliary_loss_mlp": 0.00047253, + "balance_loss_clip": 1.09459543, + "balance_loss_mlp": 0.04157844, + "epoch": 0.5375018788516459, + "flos": 67256018703360.0, + "grad_norm": 0.6944223447241654, + "language_loss": 0.52199817, + "learning_rate": 1.8542758480241589e-06, + "loss": 0.5349946, + "num_input_tokens_seen": 192358240, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.05664062, + "step": 8940, + "time_per_iteration": 3.175300121307373 + }, + { + "auxiliary_loss_clip": 0.01193505, + "auxiliary_loss_mlp": 0.00225414, + "balance_loss_clip": 0.99622184, + "balance_loss_mlp": 0.20047534, + "epoch": 0.5375620021043138, + "flos": 18114168804480.0, + "grad_norm": 5.084700371918976, + "language_loss": 0.81436563, + "learning_rate": 1.8538874242868965e-06, + "loss": 0.82855475, + "num_input_tokens_seen": 192377370, + "router_z_loss_clip": 1.97363281, + "router_z_loss_mlp": 0.24963379, + "step": 8941, + "time_per_iteration": 2.7099053859710693 + }, + { + "auxiliary_loss_clip": 0.01189739, + "auxiliary_loss_mlp": 0.00217621, + "balance_loss_clip": 0.99325848, + "balance_loss_mlp": 0.1943993, + "epoch": 0.5376221253569818, + "flos": 23149383275520.0, + "grad_norm": 3.6599172911285827, + "language_loss": 0.87076116, + "learning_rate": 1.853499006090237e-06, + "loss": 0.88483477, + "num_input_tokens_seen": 192396450, + "router_z_loss_clip": 1.96679688, + "router_z_loss_mlp": 0.2322998, + "step": 8942, + "time_per_iteration": 2.6869091987609863 + }, + { + "auxiliary_loss_clip": 0.0122698, + "auxiliary_loss_mlp": 0.00272381, + "balance_loss_clip": 1.02018845, + "balance_loss_mlp": 0.24323446, + "epoch": 0.5376822486096497, + "flos": 29972302663680.0, + "grad_norm": 4.014320295563004, + "language_loss": 0.79672992, + "learning_rate": 1.853110593448911e-06, + "loss": 0.81172347, + "num_input_tokens_seen": 192417390, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.29174805, + "step": 8943, + "time_per_iteration": 2.7247962951660156 + }, + { + "auxiliary_loss_clip": 0.01240118, + "auxiliary_loss_mlp": 0.00055888, + "balance_loss_clip": 1.08870959, + "balance_loss_mlp": 0.05002324, + "epoch": 0.5377423718623178, + "flos": 54168950874240.0, + "grad_norm": 0.8300011927430635, + "language_loss": 0.5949437, + "learning_rate": 1.852722186377645e-06, + "loss": 0.60790372, + "num_input_tokens_seen": 192478060, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.05859375, + "step": 8944, + "time_per_iteration": 3.1343846321105957 + }, + { + "auxiliary_loss_clip": 0.01213487, + "auxiliary_loss_mlp": 0.00261368, + "balance_loss_clip": 1.00241756, + "balance_loss_mlp": 0.23296037, + "epoch": 0.5378024951149857, + "flos": 23257079228160.0, + "grad_norm": 16.16021064130627, + "language_loss": 0.8635335, + "learning_rate": 1.852333784891169e-06, + "loss": 0.87828207, + "num_input_tokens_seen": 192495985, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.28430176, + "step": 8945, + "time_per_iteration": 2.6860923767089844 + }, + { + "auxiliary_loss_clip": 0.01211232, + "auxiliary_loss_mlp": 0.00230684, + "balance_loss_clip": 1.00424218, + "balance_loss_mlp": 0.20420802, + "epoch": 0.5378626183676537, + "flos": 24024095274240.0, + "grad_norm": 15.609742518299052, + "language_loss": 0.76200897, + "learning_rate": 1.8519453890042112e-06, + "loss": 0.7764281, + "num_input_tokens_seen": 192515445, + "router_z_loss_clip": 2.06738281, + "router_z_loss_mlp": 0.26489258, + "step": 8946, + "time_per_iteration": 2.657294988632202 + }, + { + "auxiliary_loss_clip": 0.01216591, + "auxiliary_loss_mlp": 0.00213203, + "balance_loss_clip": 1.0124867, + "balance_loss_mlp": 0.18758501, + "epoch": 0.5379227416203216, + "flos": 27161789973120.0, + "grad_norm": 2.3521200134724194, + "language_loss": 0.82958841, + "learning_rate": 1.851556998731498e-06, + "loss": 0.84388638, + "num_input_tokens_seen": 192536530, + "router_z_loss_clip": 2.04492188, + "router_z_loss_mlp": 0.25646973, + "step": 8947, + "time_per_iteration": 2.6773128509521484 + }, + { + "auxiliary_loss_clip": 0.01188073, + "auxiliary_loss_mlp": 0.00251079, + "balance_loss_clip": 0.99292094, + "balance_loss_mlp": 0.22454301, + "epoch": 0.5379828648729896, + "flos": 24681619687680.0, + "grad_norm": 13.233527172978588, + "language_loss": 0.65650874, + "learning_rate": 1.8511686140877592e-06, + "loss": 0.67090034, + "num_input_tokens_seen": 192556075, + "router_z_loss_clip": 1.95019531, + "router_z_loss_mlp": 0.26550293, + "step": 8948, + "time_per_iteration": 2.6582489013671875 + }, + { + "auxiliary_loss_clip": 0.01245277, + "auxiliary_loss_mlp": 0.00230717, + "balance_loss_clip": 1.02749681, + "balance_loss_mlp": 0.20409763, + "epoch": 0.5380429881256577, + "flos": 22523280284160.0, + "grad_norm": 149.10677546755377, + "language_loss": 0.8632406, + "learning_rate": 1.8507802350877205e-06, + "loss": 0.87800056, + "num_input_tokens_seen": 192575535, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.26623535, + "step": 8949, + "time_per_iteration": 2.6427125930786133 + }, + { + "auxiliary_loss_clip": 0.01206465, + "auxiliary_loss_mlp": 0.00225836, + "balance_loss_clip": 1.0039258, + "balance_loss_mlp": 0.19989586, + "epoch": 0.5381031113783256, + "flos": 26979543342720.0, + "grad_norm": 305.8847264578508, + "language_loss": 0.84049946, + "learning_rate": 1.850391861746111e-06, + "loss": 0.85482246, + "num_input_tokens_seen": 192594490, + "router_z_loss_clip": 2.0234375, + "router_z_loss_mlp": 0.25964355, + "step": 8950, + "time_per_iteration": 2.6786553859710693 + }, + { + "auxiliary_loss_clip": 0.01216969, + "auxiliary_loss_mlp": 0.00214358, + "balance_loss_clip": 1.01124823, + "balance_loss_mlp": 0.19019458, + "epoch": 0.5381632346309936, + "flos": 24754087376640.0, + "grad_norm": 32.08594321549141, + "language_loss": 0.78595364, + "learning_rate": 1.8500034940776573e-06, + "loss": 0.80026686, + "num_input_tokens_seen": 192615650, + "router_z_loss_clip": 2.05664062, + "router_z_loss_mlp": 0.24157715, + "step": 8951, + "time_per_iteration": 2.6709060668945312 + }, + { + "auxiliary_loss_clip": 0.01215343, + "auxiliary_loss_mlp": 0.00240791, + "balance_loss_clip": 1.0060699, + "balance_loss_mlp": 0.2147679, + "epoch": 0.5382233578836615, + "flos": 15560058372480.0, + "grad_norm": 6.468566041076626, + "language_loss": 0.85067916, + "learning_rate": 1.849615132097085e-06, + "loss": 0.86524051, + "num_input_tokens_seen": 192633840, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.26000977, + "step": 8952, + "time_per_iteration": 2.6522443294525146 + }, + { + "auxiliary_loss_clip": 0.01216361, + "auxiliary_loss_mlp": 0.00257221, + "balance_loss_clip": 1.00816643, + "balance_loss_mlp": 0.22864619, + "epoch": 0.5382834811363295, + "flos": 25084501608960.0, + "grad_norm": 5.54996104663942, + "language_loss": 0.84209502, + "learning_rate": 1.8492267758191228e-06, + "loss": 0.85683084, + "num_input_tokens_seen": 192655890, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.28588867, + "step": 8953, + "time_per_iteration": 2.750161647796631 + }, + { + "auxiliary_loss_clip": 0.01202315, + "auxiliary_loss_mlp": 0.00217422, + "balance_loss_clip": 1.00051212, + "balance_loss_mlp": 0.19245917, + "epoch": 0.5383436043889974, + "flos": 13297901685120.0, + "grad_norm": 31.29334018668495, + "language_loss": 0.89061964, + "learning_rate": 1.8488384252584964e-06, + "loss": 0.90481699, + "num_input_tokens_seen": 192673025, + "router_z_loss_clip": 2.01660156, + "router_z_loss_mlp": 0.24975586, + "step": 8954, + "time_per_iteration": 2.727889060974121 + }, + { + "auxiliary_loss_clip": 0.01200634, + "auxiliary_loss_mlp": 0.00213808, + "balance_loss_clip": 1.00034058, + "balance_loss_mlp": 0.18894145, + "epoch": 0.5384037276416654, + "flos": 23039388852480.0, + "grad_norm": 4.443381574678608, + "language_loss": 0.83969897, + "learning_rate": 1.8484500804299318e-06, + "loss": 0.85384345, + "num_input_tokens_seen": 192692190, + "router_z_loss_clip": 2.00390625, + "router_z_loss_mlp": 0.24902344, + "step": 8955, + "time_per_iteration": 2.7250399589538574 + }, + { + "auxiliary_loss_clip": 0.01226074, + "auxiliary_loss_mlp": 0.00233547, + "balance_loss_clip": 1.01368213, + "balance_loss_mlp": 0.20716557, + "epoch": 0.5384638508943334, + "flos": 20631147552000.0, + "grad_norm": 8.01026972339475, + "language_loss": 0.84121448, + "learning_rate": 1.8480617413481557e-06, + "loss": 0.8558107, + "num_input_tokens_seen": 192710380, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.26391602, + "step": 8956, + "time_per_iteration": 2.6953909397125244 + }, + { + "auxiliary_loss_clip": 0.01244699, + "auxiliary_loss_mlp": 0.00131651, + "balance_loss_clip": 1.09070408, + "balance_loss_mlp": 0.12449831, + "epoch": 0.5385239741470014, + "flos": 66737683491840.0, + "grad_norm": 1.1386778661633585, + "language_loss": 0.62664497, + "learning_rate": 1.8476734080278932e-06, + "loss": 0.6404084, + "num_input_tokens_seen": 192768995, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.07128906, + "step": 8957, + "time_per_iteration": 3.0596301555633545 + }, + { + "auxiliary_loss_clip": 0.01246622, + "auxiliary_loss_mlp": 0.00227916, + "balance_loss_clip": 1.09404325, + "balance_loss_mlp": 0.21990524, + "epoch": 0.5385840973996693, + "flos": 64716058229760.0, + "grad_norm": 0.727813091780219, + "language_loss": 0.50955099, + "learning_rate": 1.8472850804838705e-06, + "loss": 0.5242964, + "num_input_tokens_seen": 192825585, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.08007812, + "step": 8958, + "time_per_iteration": 3.1520891189575195 + }, + { + "auxiliary_loss_clip": 0.01250421, + "auxiliary_loss_mlp": 0.00236649, + "balance_loss_clip": 1.03584087, + "balance_loss_mlp": 0.20927849, + "epoch": 0.5386442206523373, + "flos": 26141783460480.0, + "grad_norm": 202.34952494165069, + "language_loss": 0.84216475, + "learning_rate": 1.8468967587308128e-06, + "loss": 0.85703552, + "num_input_tokens_seen": 192847335, + "router_z_loss_clip": 2.14746094, + "router_z_loss_mlp": 0.27404785, + "step": 8959, + "time_per_iteration": 2.6918883323669434 + }, + { + "auxiliary_loss_clip": 0.01215166, + "auxiliary_loss_mlp": 0.00226159, + "balance_loss_clip": 1.00503969, + "balance_loss_mlp": 0.19899088, + "epoch": 0.5387043439050052, + "flos": 18251849635200.0, + "grad_norm": 421.7498377919202, + "language_loss": 0.94589627, + "learning_rate": 1.8465084427834455e-06, + "loss": 0.96030951, + "num_input_tokens_seen": 192862205, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.27172852, + "step": 8960, + "time_per_iteration": 2.622714042663574 + }, + { + "auxiliary_loss_clip": 0.01199934, + "auxiliary_loss_mlp": 0.00278242, + "balance_loss_clip": 0.99692518, + "balance_loss_mlp": 0.25130099, + "epoch": 0.5387644671576732, + "flos": 29788296266880.0, + "grad_norm": 86.28706065324721, + "language_loss": 0.83693743, + "learning_rate": 1.8461201326564933e-06, + "loss": 0.85171926, + "num_input_tokens_seen": 192883695, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.26965332, + "step": 8961, + "time_per_iteration": 2.6952714920043945 + }, + { + "auxiliary_loss_clip": 0.01212221, + "auxiliary_loss_mlp": 0.00220878, + "balance_loss_clip": 1.00253975, + "balance_loss_mlp": 0.19411595, + "epoch": 0.5388245904103413, + "flos": 22374466237440.0, + "grad_norm": 47.49131696331509, + "language_loss": 0.89356685, + "learning_rate": 1.845731828364681e-06, + "loss": 0.90789783, + "num_input_tokens_seen": 192900190, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.26757812, + "step": 8962, + "time_per_iteration": 2.6492762565612793 + }, + { + "auxiliary_loss_clip": 0.0125378, + "auxiliary_loss_mlp": 0.00046421, + "balance_loss_clip": 1.10203481, + "balance_loss_mlp": 0.03993564, + "epoch": 0.5388847136630092, + "flos": 69807794751360.0, + "grad_norm": 0.7316416282296551, + "language_loss": 0.5380435, + "learning_rate": 1.8453435299227333e-06, + "loss": 0.55104548, + "num_input_tokens_seen": 192958675, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.06494141, + "step": 8963, + "time_per_iteration": 3.0788145065307617 + }, + { + "auxiliary_loss_clip": 0.01271954, + "auxiliary_loss_mlp": 0.00046564, + "balance_loss_clip": 1.1140548, + "balance_loss_mlp": 0.04067529, + "epoch": 0.5389448369156772, + "flos": 69822303845760.0, + "grad_norm": 0.8045951480971929, + "language_loss": 0.62408161, + "learning_rate": 1.8449552373453744e-06, + "loss": 0.63726676, + "num_input_tokens_seen": 193033135, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.05883789, + "step": 8964, + "time_per_iteration": 3.180586099624634 + }, + { + "auxiliary_loss_clip": 0.0122452, + "auxiliary_loss_mlp": 0.00228917, + "balance_loss_clip": 1.00806546, + "balance_loss_mlp": 0.20308387, + "epoch": 0.5390049601683451, + "flos": 31722444933120.0, + "grad_norm": 11.721539513392262, + "language_loss": 0.76617134, + "learning_rate": 1.8445669506473287e-06, + "loss": 0.78070569, + "num_input_tokens_seen": 193055570, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.25866699, + "step": 8965, + "time_per_iteration": 2.7270731925964355 + }, + { + "auxiliary_loss_clip": 0.01239574, + "auxiliary_loss_mlp": 0.00235531, + "balance_loss_clip": 1.02508283, + "balance_loss_mlp": 0.20875695, + "epoch": 0.5390650834210131, + "flos": 18113486446080.0, + "grad_norm": 31.3684897476987, + "language_loss": 0.91358459, + "learning_rate": 1.8441786698433192e-06, + "loss": 0.92833561, + "num_input_tokens_seen": 193073120, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.26782227, + "step": 8966, + "time_per_iteration": 2.583099126815796 + }, + { + "auxiliary_loss_clip": 0.01247871, + "auxiliary_loss_mlp": 0.00226039, + "balance_loss_clip": 1.03156281, + "balance_loss_mlp": 0.20167288, + "epoch": 0.539125206673681, + "flos": 17416711445760.0, + "grad_norm": 3.2526487362252707, + "language_loss": 0.77796435, + "learning_rate": 1.8437903949480706e-06, + "loss": 0.79270345, + "num_input_tokens_seen": 193090105, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.24377441, + "step": 8967, + "time_per_iteration": 2.627350330352783 + }, + { + "auxiliary_loss_clip": 0.01188256, + "auxiliary_loss_mlp": 0.0023666, + "balance_loss_clip": 0.98999608, + "balance_loss_mlp": 0.21056473, + "epoch": 0.539185329926349, + "flos": 22198935450240.0, + "grad_norm": 13.00897374795693, + "language_loss": 0.88278127, + "learning_rate": 1.8434021259763065e-06, + "loss": 0.89703047, + "num_input_tokens_seen": 193109325, + "router_z_loss_clip": 1.98144531, + "router_z_loss_mlp": 0.2611084, + "step": 8968, + "time_per_iteration": 4.198465585708618 + }, + { + "auxiliary_loss_clip": 0.01233922, + "auxiliary_loss_mlp": 0.00230618, + "balance_loss_clip": 1.02012753, + "balance_loss_mlp": 0.20517848, + "epoch": 0.539245453179017, + "flos": 21434397442560.0, + "grad_norm": 4.383862923332199, + "language_loss": 0.8067494, + "learning_rate": 1.8430138629427484e-06, + "loss": 0.8213948, + "num_input_tokens_seen": 193130595, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.25439453, + "step": 8969, + "time_per_iteration": 4.048234224319458 + }, + { + "auxiliary_loss_clip": 0.01203627, + "auxiliary_loss_mlp": 0.00244539, + "balance_loss_clip": 0.99529111, + "balance_loss_mlp": 0.21768141, + "epoch": 0.539305576431685, + "flos": 20735000749440.0, + "grad_norm": 6.269445136715458, + "language_loss": 0.89194739, + "learning_rate": 1.8426256058621205e-06, + "loss": 0.90642899, + "num_input_tokens_seen": 193148930, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.26867676, + "step": 8970, + "time_per_iteration": 2.6081318855285645 + }, + { + "auxiliary_loss_clip": 0.01207875, + "auxiliary_loss_mlp": 0.00229153, + "balance_loss_clip": 1.00595117, + "balance_loss_mlp": 0.20402348, + "epoch": 0.5393656996843529, + "flos": 30920452018560.0, + "grad_norm": 33.96749633928989, + "language_loss": 0.80105305, + "learning_rate": 1.842237354749146e-06, + "loss": 0.81542337, + "num_input_tokens_seen": 193170140, + "router_z_loss_clip": 2.01660156, + "router_z_loss_mlp": 0.2512207, + "step": 8971, + "time_per_iteration": 4.1103246212005615 + }, + { + "auxiliary_loss_clip": 0.01301117, + "auxiliary_loss_mlp": 0.00088149, + "balance_loss_clip": 1.11992061, + "balance_loss_mlp": 0.08133061, + "epoch": 0.5394258229370209, + "flos": 50317781351040.0, + "grad_norm": 0.8611977426343315, + "language_loss": 0.59874415, + "learning_rate": 1.8418491096185465e-06, + "loss": 0.6126368, + "num_input_tokens_seen": 193227235, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 0.06835938, + "step": 8972, + "time_per_iteration": 3.0876595973968506 + }, + { + "auxiliary_loss_clip": 0.01207165, + "auxiliary_loss_mlp": 0.00230122, + "balance_loss_clip": 1.00246906, + "balance_loss_mlp": 0.20474212, + "epoch": 0.5394859461896888, + "flos": 25411935012480.0, + "grad_norm": 5.326626720177186, + "language_loss": 0.84514683, + "learning_rate": 1.841460870485045e-06, + "loss": 0.85951966, + "num_input_tokens_seen": 193248435, + "router_z_loss_clip": 2.05078125, + "router_z_loss_mlp": 0.25378418, + "step": 8973, + "time_per_iteration": 2.6404311656951904 + }, + { + "auxiliary_loss_clip": 0.01260085, + "auxiliary_loss_mlp": 0.0022301, + "balance_loss_clip": 1.03625858, + "balance_loss_mlp": 0.19703446, + "epoch": 0.5395460694423568, + "flos": 25478476957440.0, + "grad_norm": 73.98809471002781, + "language_loss": 0.82381499, + "learning_rate": 1.8410726373633623e-06, + "loss": 0.83864594, + "num_input_tokens_seen": 193267490, + "router_z_loss_clip": 2.23535156, + "router_z_loss_mlp": 0.25964355, + "step": 8974, + "time_per_iteration": 2.6521666049957275 + }, + { + "auxiliary_loss_clip": 0.01288421, + "auxiliary_loss_mlp": 0.00076404, + "balance_loss_clip": 1.11077809, + "balance_loss_mlp": 0.06953774, + "epoch": 0.5396061926950249, + "flos": 53249493507840.0, + "grad_norm": 0.7198825078424088, + "language_loss": 0.50820446, + "learning_rate": 1.8406844102682215e-06, + "loss": 0.52185261, + "num_input_tokens_seen": 193326050, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.06884766, + "step": 8975, + "time_per_iteration": 3.12680983543396 + }, + { + "auxiliary_loss_clip": 0.01217462, + "auxiliary_loss_mlp": 0.00222735, + "balance_loss_clip": 1.01334238, + "balance_loss_mlp": 0.19685432, + "epoch": 0.5396663159476928, + "flos": 26725080418560.0, + "grad_norm": 226.1361678915132, + "language_loss": 0.81354594, + "learning_rate": 1.840296189214344e-06, + "loss": 0.82794785, + "num_input_tokens_seen": 193348785, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.25866699, + "step": 8976, + "time_per_iteration": 4.121136903762817 + }, + { + "auxiliary_loss_clip": 0.01230065, + "auxiliary_loss_mlp": 0.0021347, + "balance_loss_clip": 1.02494884, + "balance_loss_mlp": 0.18640909, + "epoch": 0.5397264392003608, + "flos": 23253380127360.0, + "grad_norm": 14.876756283441377, + "language_loss": 0.77686143, + "learning_rate": 1.8399079742164509e-06, + "loss": 0.79129678, + "num_input_tokens_seen": 193367080, + "router_z_loss_clip": 2.05175781, + "router_z_loss_mlp": 0.27038574, + "step": 8977, + "time_per_iteration": 2.687260150909424 + }, + { + "auxiliary_loss_clip": 0.01241214, + "auxiliary_loss_mlp": 0.00238381, + "balance_loss_clip": 1.0312115, + "balance_loss_mlp": 0.21295336, + "epoch": 0.5397865624530287, + "flos": 18294188791680.0, + "grad_norm": 6.280867508486488, + "language_loss": 0.8027879, + "learning_rate": 1.8395197652892636e-06, + "loss": 0.81758392, + "num_input_tokens_seen": 193383715, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.25439453, + "step": 8978, + "time_per_iteration": 2.6679701805114746 + }, + { + "auxiliary_loss_clip": 0.01220205, + "auxiliary_loss_mlp": 0.0022926, + "balance_loss_clip": 1.01462615, + "balance_loss_mlp": 0.20181775, + "epoch": 0.5398466857056967, + "flos": 15297514888320.0, + "grad_norm": 162.0887075250353, + "language_loss": 0.83667004, + "learning_rate": 1.8391315624475028e-06, + "loss": 0.8511647, + "num_input_tokens_seen": 193400560, + "router_z_loss_clip": 2.05273438, + "router_z_loss_mlp": 0.27453613, + "step": 8979, + "time_per_iteration": 2.6376953125 + }, + { + "auxiliary_loss_clip": 0.01238986, + "auxiliary_loss_mlp": 0.00225461, + "balance_loss_clip": 1.02731502, + "balance_loss_mlp": 0.19869855, + "epoch": 0.5399068089583646, + "flos": 17821748183040.0, + "grad_norm": 43.345281675758336, + "language_loss": 0.86029768, + "learning_rate": 1.8387433657058892e-06, + "loss": 0.87494218, + "num_input_tokens_seen": 193418680, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.26794434, + "step": 8980, + "time_per_iteration": 2.61337947845459 + }, + { + "auxiliary_loss_clip": 0.01227589, + "auxiliary_loss_mlp": 0.00234223, + "balance_loss_clip": 1.02186108, + "balance_loss_mlp": 0.20716225, + "epoch": 0.5399669322110326, + "flos": 27381635164800.0, + "grad_norm": 2.0369457106945688, + "language_loss": 0.88805908, + "learning_rate": 1.8383551750791431e-06, + "loss": 0.90267718, + "num_input_tokens_seen": 193439310, + "router_z_loss_clip": 2.05957031, + "router_z_loss_mlp": 0.27038574, + "step": 8981, + "time_per_iteration": 2.710923194885254 + }, + { + "auxiliary_loss_clip": 0.01239719, + "auxiliary_loss_mlp": 0.00226294, + "balance_loss_clip": 1.02744555, + "balance_loss_mlp": 0.19857767, + "epoch": 0.5400270554637006, + "flos": 20449116403200.0, + "grad_norm": 3.347719629867188, + "language_loss": 0.7639007, + "learning_rate": 1.8379669905819857e-06, + "loss": 0.77856082, + "num_input_tokens_seen": 193458115, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.27709961, + "step": 8982, + "time_per_iteration": 2.6420998573303223 + }, + { + "auxiliary_loss_clip": 0.01242707, + "auxiliary_loss_mlp": 0.00216411, + "balance_loss_clip": 1.0342257, + "balance_loss_mlp": 0.19061424, + "epoch": 0.5400871787163686, + "flos": 21689578638720.0, + "grad_norm": 6.222465541662933, + "language_loss": 0.88900572, + "learning_rate": 1.8375788122291358e-06, + "loss": 0.90359688, + "num_input_tokens_seen": 193477365, + "router_z_loss_clip": 2.0859375, + "router_z_loss_mlp": 0.2578125, + "step": 8983, + "time_per_iteration": 2.6744887828826904 + }, + { + "auxiliary_loss_clip": 0.012581, + "auxiliary_loss_mlp": 0.00214543, + "balance_loss_clip": 1.04109883, + "balance_loss_mlp": 0.18595614, + "epoch": 0.5401473019690365, + "flos": 19204739585280.0, + "grad_norm": 14.4417826303989, + "language_loss": 0.79165024, + "learning_rate": 1.8371906400353138e-06, + "loss": 0.8063767, + "num_input_tokens_seen": 193495595, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.28588867, + "step": 8984, + "time_per_iteration": 2.7580180168151855 + }, + { + "auxiliary_loss_clip": 0.01275586, + "auxiliary_loss_mlp": 0.00207815, + "balance_loss_clip": 1.05009162, + "balance_loss_mlp": 0.18050429, + "epoch": 0.5402074252217045, + "flos": 20627376624000.0, + "grad_norm": 14.538621904216667, + "language_loss": 0.88869166, + "learning_rate": 1.8368024740152386e-06, + "loss": 0.90352559, + "num_input_tokens_seen": 193514035, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.27282715, + "step": 8985, + "time_per_iteration": 2.6615865230560303 + }, + { + "auxiliary_loss_clip": 0.01238311, + "auxiliary_loss_mlp": 0.00195582, + "balance_loss_clip": 1.03676152, + "balance_loss_mlp": 0.16912965, + "epoch": 0.5402675484743724, + "flos": 24973465691520.0, + "grad_norm": 797.9031512247736, + "language_loss": 0.85259891, + "learning_rate": 1.83641431418363e-06, + "loss": 0.86693782, + "num_input_tokens_seen": 193535445, + "router_z_loss_clip": 2.015625, + "router_z_loss_mlp": 0.26428223, + "step": 8986, + "time_per_iteration": 2.7091715335845947 + }, + { + "auxiliary_loss_clip": 0.01237748, + "auxiliary_loss_mlp": 0.00190206, + "balance_loss_clip": 1.03186035, + "balance_loss_mlp": 0.16355082, + "epoch": 0.5403276717270404, + "flos": 19459022941440.0, + "grad_norm": 4.716611335926856, + "language_loss": 0.84845573, + "learning_rate": 1.8360261605552075e-06, + "loss": 0.86273533, + "num_input_tokens_seen": 193554780, + "router_z_loss_clip": 2.06152344, + "router_z_loss_mlp": 0.26635742, + "step": 8987, + "time_per_iteration": 2.6641154289245605 + }, + { + "auxiliary_loss_clip": 0.01248257, + "auxiliary_loss_mlp": 0.00166271, + "balance_loss_clip": 1.04149842, + "balance_loss_mlp": 0.13916257, + "epoch": 0.5403877949797083, + "flos": 18442140912000.0, + "grad_norm": 1216.1360466241233, + "language_loss": 0.78963828, + "learning_rate": 1.8356380131446887e-06, + "loss": 0.8037836, + "num_input_tokens_seen": 193573580, + "router_z_loss_clip": 2.07226562, + "router_z_loss_mlp": 0.27124023, + "step": 8988, + "time_per_iteration": 2.6734046936035156 + }, + { + "auxiliary_loss_clip": 0.01248613, + "auxiliary_loss_mlp": 0.0018308, + "balance_loss_clip": 1.04084647, + "balance_loss_mlp": 0.15547085, + "epoch": 0.5404479182323764, + "flos": 28292868316800.0, + "grad_norm": 319.20831707369564, + "language_loss": 0.75727332, + "learning_rate": 1.8352498719667934e-06, + "loss": 0.77159023, + "num_input_tokens_seen": 193590490, + "router_z_loss_clip": 2.07714844, + "router_z_loss_mlp": 0.27624512, + "step": 8989, + "time_per_iteration": 2.6887199878692627 + }, + { + "auxiliary_loss_clip": 0.01236486, + "auxiliary_loss_mlp": 0.00189455, + "balance_loss_clip": 1.03306997, + "balance_loss_mlp": 0.1629429, + "epoch": 0.5405080414850444, + "flos": 23367325046400.0, + "grad_norm": 4.65728460474569, + "language_loss": 0.8411479, + "learning_rate": 1.8348617370362399e-06, + "loss": 0.8554073, + "num_input_tokens_seen": 193609900, + "router_z_loss_clip": 2.03417969, + "router_z_loss_mlp": 0.26538086, + "step": 8990, + "time_per_iteration": 2.6552820205688477 + }, + { + "auxiliary_loss_clip": 0.01244166, + "auxiliary_loss_mlp": 0.00174247, + "balance_loss_clip": 1.03580451, + "balance_loss_mlp": 0.14672175, + "epoch": 0.5405681647377123, + "flos": 21106425335040.0, + "grad_norm": 45.17739646932531, + "language_loss": 0.77779794, + "learning_rate": 1.834473608367745e-06, + "loss": 0.79198211, + "num_input_tokens_seen": 193629775, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.2755127, + "step": 8991, + "time_per_iteration": 2.6324219703674316 + }, + { + "auxiliary_loss_clip": 0.01264424, + "auxiliary_loss_mlp": 0.00171981, + "balance_loss_clip": 1.05803967, + "balance_loss_mlp": 0.14229769, + "epoch": 0.5406282879903803, + "flos": 20449188230400.0, + "grad_norm": 18.278924219017046, + "language_loss": 0.84487396, + "learning_rate": 1.8340854859760277e-06, + "loss": 0.85923797, + "num_input_tokens_seen": 193648070, + "router_z_loss_clip": 2.06445312, + "router_z_loss_mlp": 0.29675293, + "step": 8992, + "time_per_iteration": 2.6469972133636475 + }, + { + "auxiliary_loss_clip": 0.01255791, + "auxiliary_loss_mlp": 0.00167257, + "balance_loss_clip": 1.04573464, + "balance_loss_mlp": 0.13744229, + "epoch": 0.5406884112430482, + "flos": 14209493973120.0, + "grad_norm": 6.760223657495277, + "language_loss": 0.87062281, + "learning_rate": 1.8336973698758056e-06, + "loss": 0.88485324, + "num_input_tokens_seen": 193665060, + "router_z_loss_clip": 2.10449219, + "router_z_loss_mlp": 0.2980957, + "step": 8993, + "time_per_iteration": 2.6489744186401367 + }, + { + "auxiliary_loss_clip": 0.01236343, + "auxiliary_loss_mlp": 0.00154589, + "balance_loss_clip": 1.03367996, + "balance_loss_mlp": 0.12488233, + "epoch": 0.5407485344957162, + "flos": 23875568536320.0, + "grad_norm": 13.534352149841487, + "language_loss": 0.79190195, + "learning_rate": 1.8333092600817959e-06, + "loss": 0.80581129, + "num_input_tokens_seen": 193683620, + "router_z_loss_clip": 2.02734375, + "router_z_loss_mlp": 0.296875, + "step": 8994, + "time_per_iteration": 2.6587440967559814 + }, + { + "auxiliary_loss_clip": 0.01253371, + "auxiliary_loss_mlp": 0.00170902, + "balance_loss_clip": 1.04251885, + "balance_loss_mlp": 0.14266142, + "epoch": 0.5408086577483842, + "flos": 23148485435520.0, + "grad_norm": 74.03991637036381, + "language_loss": 0.83156538, + "learning_rate": 1.8329211566087157e-06, + "loss": 0.84580815, + "num_input_tokens_seen": 193702990, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.28234863, + "step": 8995, + "time_per_iteration": 2.625929832458496 + }, + { + "auxiliary_loss_clip": 0.01247278, + "auxiliary_loss_mlp": 0.00149134, + "balance_loss_clip": 1.04635203, + "balance_loss_mlp": 0.12053599, + "epoch": 0.5408687810010522, + "flos": 18771046773120.0, + "grad_norm": 2.4746435767653603, + "language_loss": 0.80305141, + "learning_rate": 1.832533059471282e-06, + "loss": 0.81701553, + "num_input_tokens_seen": 193721785, + "router_z_loss_clip": 2.00976562, + "router_z_loss_mlp": 0.28601074, + "step": 8996, + "time_per_iteration": 2.625659465789795 + }, + { + "auxiliary_loss_clip": 0.01246704, + "auxiliary_loss_mlp": 0.00153166, + "balance_loss_clip": 1.04544497, + "balance_loss_mlp": 0.12601049, + "epoch": 0.5409289042537201, + "flos": 13881557779200.0, + "grad_norm": 222.31253902571675, + "language_loss": 0.80831873, + "learning_rate": 1.8321449686842115e-06, + "loss": 0.82231748, + "num_input_tokens_seen": 193740315, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.27160645, + "step": 8997, + "time_per_iteration": 2.609140634536743 + }, + { + "auxiliary_loss_clip": 0.01259567, + "auxiliary_loss_mlp": 0.00143594, + "balance_loss_clip": 1.05451405, + "balance_loss_mlp": 0.11381592, + "epoch": 0.5409890275063881, + "flos": 14465357527680.0, + "grad_norm": 6.3117262745232185, + "language_loss": 0.84567332, + "learning_rate": 1.8317568842622207e-06, + "loss": 0.85970497, + "num_input_tokens_seen": 193757580, + "router_z_loss_clip": 2.05078125, + "router_z_loss_mlp": 0.29785156, + "step": 8998, + "time_per_iteration": 2.6332356929779053 + }, + { + "auxiliary_loss_clip": 0.01253637, + "auxiliary_loss_mlp": 0.00172464, + "balance_loss_clip": 1.05081391, + "balance_loss_mlp": 0.14058726, + "epoch": 0.541049150759056, + "flos": 48977449349760.0, + "grad_norm": 1.9243630579732653, + "language_loss": 0.77131289, + "learning_rate": 1.8313688062200256e-06, + "loss": 0.78557396, + "num_input_tokens_seen": 193780965, + "router_z_loss_clip": 2.02832031, + "router_z_loss_mlp": 0.31860352, + "step": 8999, + "time_per_iteration": 3.020439863204956 + }, + { + "auxiliary_loss_clip": 0.01270101, + "auxiliary_loss_mlp": 0.00157242, + "balance_loss_clip": 1.06262779, + "balance_loss_mlp": 0.12643833, + "epoch": 0.541109274011724, + "flos": 18147601388160.0, + "grad_norm": 261.53651024427944, + "language_loss": 0.90157455, + "learning_rate": 1.8309807345723422e-06, + "loss": 0.9158479, + "num_input_tokens_seen": 193797855, + "router_z_loss_clip": 2.07519531, + "router_z_loss_mlp": 0.30810547, + "step": 9000, + "time_per_iteration": 2.7224552631378174 + }, + { + "auxiliary_loss_clip": 0.01248429, + "auxiliary_loss_mlp": 0.00139075, + "balance_loss_clip": 1.04622161, + "balance_loss_mlp": 0.10841419, + "epoch": 0.541169397264392, + "flos": 20522553759360.0, + "grad_norm": 10.843095606730348, + "language_loss": 0.81718069, + "learning_rate": 1.8305926693338863e-06, + "loss": 0.83105576, + "num_input_tokens_seen": 193817375, + "router_z_loss_clip": 2.02050781, + "router_z_loss_mlp": 0.30664062, + "step": 9001, + "time_per_iteration": 2.7406649589538574 + }, + { + "auxiliary_loss_clip": 0.01265409, + "auxiliary_loss_mlp": 0.00176939, + "balance_loss_clip": 1.05585933, + "balance_loss_mlp": 0.14713626, + "epoch": 0.54122952051706, + "flos": 20044043752320.0, + "grad_norm": 9.103646038588721, + "language_loss": 0.94999206, + "learning_rate": 1.8302046105193734e-06, + "loss": 0.96441555, + "num_input_tokens_seen": 193832205, + "router_z_loss_clip": 2.09667969, + "router_z_loss_mlp": 0.29833984, + "step": 9002, + "time_per_iteration": 2.633077383041382 + }, + { + "auxiliary_loss_clip": 0.01270509, + "auxiliary_loss_mlp": 0.00171847, + "balance_loss_clip": 1.0662384, + "balance_loss_mlp": 0.14396413, + "epoch": 0.541289643769728, + "flos": 19062246332160.0, + "grad_norm": 3.244165588681721, + "language_loss": 0.87702447, + "learning_rate": 1.8298165581435183e-06, + "loss": 0.89144802, + "num_input_tokens_seen": 193849830, + "router_z_loss_clip": 2.04492188, + "router_z_loss_mlp": 0.27868652, + "step": 9003, + "time_per_iteration": 2.647686004638672 + }, + { + "auxiliary_loss_clip": 0.01247577, + "auxiliary_loss_mlp": 0.00178719, + "balance_loss_clip": 1.0478673, + "balance_loss_mlp": 0.15102656, + "epoch": 0.5413497670223959, + "flos": 22382295402240.0, + "grad_norm": 2.1230985281995767, + "language_loss": 0.76987201, + "learning_rate": 1.8294285122210372e-06, + "loss": 0.78413498, + "num_input_tokens_seen": 193869945, + "router_z_loss_clip": 1.99707031, + "router_z_loss_mlp": 0.27709961, + "step": 9004, + "time_per_iteration": 2.643705368041992 + }, + { + "auxiliary_loss_clip": 0.01248169, + "auxiliary_loss_mlp": 0.00030175, + "balance_loss_clip": 1.10144043, + "balance_loss_mlp": 0.01567901, + "epoch": 0.5414098902750639, + "flos": 70031734093440.0, + "grad_norm": 0.9385847089737126, + "language_loss": 0.58290136, + "learning_rate": 1.8290404727666434e-06, + "loss": 0.59568489, + "num_input_tokens_seen": 193930860, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.14453125, + "step": 9005, + "time_per_iteration": 3.2872657775878906 + }, + { + "auxiliary_loss_clip": 0.01250137, + "auxiliary_loss_mlp": 0.00162096, + "balance_loss_clip": 1.04883647, + "balance_loss_mlp": 0.13236535, + "epoch": 0.5414700135277318, + "flos": 21798962530560.0, + "grad_norm": 87.02574684233967, + "language_loss": 0.87230146, + "learning_rate": 1.8286524397950517e-06, + "loss": 0.88642377, + "num_input_tokens_seen": 193949075, + "router_z_loss_clip": 2.01269531, + "router_z_loss_mlp": 0.29724121, + "step": 9006, + "time_per_iteration": 2.6529476642608643 + }, + { + "auxiliary_loss_clip": 0.01268521, + "auxiliary_loss_mlp": 0.00160161, + "balance_loss_clip": 1.06705987, + "balance_loss_mlp": 0.13312417, + "epoch": 0.5415301367803999, + "flos": 16907929251840.0, + "grad_norm": 32.662353835547655, + "language_loss": 0.89096612, + "learning_rate": 1.8282644133209777e-06, + "loss": 0.90525293, + "num_input_tokens_seen": 193967630, + "router_z_loss_clip": 2.015625, + "router_z_loss_mlp": 0.27038574, + "step": 9007, + "time_per_iteration": 2.655665159225464 + }, + { + "auxiliary_loss_clip": 0.0124956, + "auxiliary_loss_mlp": 0.00176343, + "balance_loss_clip": 1.04733765, + "balance_loss_mlp": 0.14582568, + "epoch": 0.5415902600330678, + "flos": 25704176065920.0, + "grad_norm": 106.15232248598551, + "language_loss": 0.7556349, + "learning_rate": 1.8278763933591334e-06, + "loss": 0.76989388, + "num_input_tokens_seen": 193988730, + "router_z_loss_clip": 2.02246094, + "router_z_loss_mlp": 0.30541992, + "step": 9008, + "time_per_iteration": 2.6788227558135986 + }, + { + "auxiliary_loss_clip": 0.01270959, + "auxiliary_loss_mlp": 0.00170856, + "balance_loss_clip": 1.06251049, + "balance_loss_mlp": 0.13950378, + "epoch": 0.5416503832857358, + "flos": 19208151377280.0, + "grad_norm": 105.09534636160784, + "language_loss": 0.86295283, + "learning_rate": 1.827488379924234e-06, + "loss": 0.87737101, + "num_input_tokens_seen": 194005160, + "router_z_loss_clip": 2.08203125, + "router_z_loss_mlp": 0.31323242, + "step": 9009, + "time_per_iteration": 2.6834778785705566 + }, + { + "auxiliary_loss_clip": 0.01269228, + "auxiliary_loss_mlp": 0.00189823, + "balance_loss_clip": 1.06067634, + "balance_loss_mlp": 0.1583516, + "epoch": 0.5417105065384037, + "flos": 12713706887040.0, + "grad_norm": 4.41865246831462, + "language_loss": 1.01268959, + "learning_rate": 1.8271003730309923e-06, + "loss": 1.02728009, + "num_input_tokens_seen": 194021700, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.31469727, + "step": 9010, + "time_per_iteration": 4.0207359790802 + }, + { + "auxiliary_loss_clip": 0.01268282, + "auxiliary_loss_mlp": 0.00167012, + "balance_loss_clip": 1.06326008, + "balance_loss_mlp": 0.13639927, + "epoch": 0.5417706297910717, + "flos": 30335933998080.0, + "grad_norm": 45.43046830904062, + "language_loss": 0.75387979, + "learning_rate": 1.826712372694122e-06, + "loss": 0.7682327, + "num_input_tokens_seen": 194042620, + "router_z_loss_clip": 2.046875, + "router_z_loss_mlp": 0.3059082, + "step": 9011, + "time_per_iteration": 4.229523658752441 + }, + { + "auxiliary_loss_clip": 0.01274593, + "auxiliary_loss_mlp": 0.00181219, + "balance_loss_clip": 1.06545019, + "balance_loss_mlp": 0.15067783, + "epoch": 0.5418307530437396, + "flos": 29020992912000.0, + "grad_norm": 113.87803051359643, + "language_loss": 0.90695584, + "learning_rate": 1.8263243789283362e-06, + "loss": 0.92151403, + "num_input_tokens_seen": 194061800, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.30541992, + "step": 9012, + "time_per_iteration": 2.697009325027466 + }, + { + "auxiliary_loss_clip": 0.01256233, + "auxiliary_loss_mlp": 0.00195043, + "balance_loss_clip": 1.05306053, + "balance_loss_mlp": 0.16695765, + "epoch": 0.5418908762964076, + "flos": 16873455173760.0, + "grad_norm": 5.16925484863136, + "language_loss": 0.84704912, + "learning_rate": 1.8259363917483466e-06, + "loss": 0.86156178, + "num_input_tokens_seen": 194079890, + "router_z_loss_clip": 2.02929688, + "router_z_loss_mlp": 0.28063965, + "step": 9013, + "time_per_iteration": 2.7071123123168945 + }, + { + "auxiliary_loss_clip": 0.01273533, + "auxiliary_loss_mlp": 0.00166398, + "balance_loss_clip": 1.06498337, + "balance_loss_mlp": 0.13635761, + "epoch": 0.5419509995490756, + "flos": 18949702043520.0, + "grad_norm": 3.9590128315691193, + "language_loss": 0.79940039, + "learning_rate": 1.8255484111688667e-06, + "loss": 0.81379974, + "num_input_tokens_seen": 194097625, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.30029297, + "step": 9014, + "time_per_iteration": 4.028576374053955 + }, + { + "auxiliary_loss_clip": 0.01262946, + "auxiliary_loss_mlp": 0.00154942, + "balance_loss_clip": 1.06240487, + "balance_loss_mlp": 0.12368502, + "epoch": 0.5420111228017436, + "flos": 18077719478400.0, + "grad_norm": 3.3715547253379485, + "language_loss": 0.87847948, + "learning_rate": 1.8251604372046085e-06, + "loss": 0.89265835, + "num_input_tokens_seen": 194116055, + "router_z_loss_clip": 2.00292969, + "router_z_loss_mlp": 0.31298828, + "step": 9015, + "time_per_iteration": 2.643716812133789 + }, + { + "auxiliary_loss_clip": 0.01275654, + "auxiliary_loss_mlp": 0.00184162, + "balance_loss_clip": 1.06314683, + "balance_loss_mlp": 0.1555874, + "epoch": 0.5420712460544116, + "flos": 19061779455360.0, + "grad_norm": 145.2380231935182, + "language_loss": 0.91765839, + "learning_rate": 1.8247724698702843e-06, + "loss": 0.93225658, + "num_input_tokens_seen": 194130365, + "router_z_loss_clip": 2.12402344, + "router_z_loss_mlp": 0.28601074, + "step": 9016, + "time_per_iteration": 2.6276724338531494 + }, + { + "auxiliary_loss_clip": 0.01259856, + "auxiliary_loss_mlp": 0.00179803, + "balance_loss_clip": 1.05562842, + "balance_loss_mlp": 0.15423232, + "epoch": 0.5421313693070795, + "flos": 18187103370240.0, + "grad_norm": 27.41925540251094, + "language_loss": 0.89513105, + "learning_rate": 1.8243845091806053e-06, + "loss": 0.90952766, + "num_input_tokens_seen": 194148975, + "router_z_loss_clip": 2.04296875, + "router_z_loss_mlp": 0.2557373, + "step": 9017, + "time_per_iteration": 2.6806094646453857 + }, + { + "auxiliary_loss_clip": 0.01262619, + "auxiliary_loss_mlp": 0.00171372, + "balance_loss_clip": 1.06340432, + "balance_loss_mlp": 0.1439901, + "epoch": 0.5421914925597475, + "flos": 13005947940480.0, + "grad_norm": 17.06004218757237, + "language_loss": 0.86837554, + "learning_rate": 1.8239965551502837e-06, + "loss": 0.88271546, + "num_input_tokens_seen": 194167185, + "router_z_loss_clip": 1.9921875, + "router_z_loss_mlp": 0.27392578, + "step": 9018, + "time_per_iteration": 4.122587203979492 + }, + { + "auxiliary_loss_clip": 0.01275071, + "auxiliary_loss_mlp": 0.00169553, + "balance_loss_clip": 1.06379104, + "balance_loss_mlp": 0.14125308, + "epoch": 0.5422516158124154, + "flos": 46758457831680.0, + "grad_norm": 15.618300764552208, + "language_loss": 0.74637234, + "learning_rate": 1.8236086077940303e-06, + "loss": 0.76081854, + "num_input_tokens_seen": 194192840, + "router_z_loss_clip": 2.11425781, + "router_z_loss_mlp": 0.28295898, + "step": 9019, + "time_per_iteration": 2.8973214626312256 + }, + { + "auxiliary_loss_clip": 0.01264064, + "auxiliary_loss_mlp": 0.00176532, + "balance_loss_clip": 1.06430268, + "balance_loss_mlp": 0.15041275, + "epoch": 0.5423117390650835, + "flos": 31758642864000.0, + "grad_norm": 5.3829844684334045, + "language_loss": 0.78684419, + "learning_rate": 1.8232206671265555e-06, + "loss": 0.8012501, + "num_input_tokens_seen": 194213150, + "router_z_loss_clip": 1.99902344, + "router_z_loss_mlp": 0.26123047, + "step": 9020, + "time_per_iteration": 2.7548422813415527 + }, + { + "auxiliary_loss_clip": 0.01249582, + "auxiliary_loss_mlp": 0.00164403, + "balance_loss_clip": 1.05485201, + "balance_loss_mlp": 0.13592362, + "epoch": 0.5423718623177514, + "flos": 27201974313600.0, + "grad_norm": 35.83955312933443, + "language_loss": 0.85618454, + "learning_rate": 1.8228327331625717e-06, + "loss": 0.87032437, + "num_input_tokens_seen": 194234665, + "router_z_loss_clip": 1.94628906, + "router_z_loss_mlp": 0.28442383, + "step": 9021, + "time_per_iteration": 2.864750385284424 + }, + { + "auxiliary_loss_clip": 0.01288657, + "auxiliary_loss_mlp": 0.00183933, + "balance_loss_clip": 1.07847071, + "balance_loss_mlp": 0.15436953, + "epoch": 0.5424319855704194, + "flos": 23546447193600.0, + "grad_norm": 31.173271935817187, + "language_loss": 0.84660351, + "learning_rate": 1.822444805916788e-06, + "loss": 0.86132944, + "num_input_tokens_seen": 194253790, + "router_z_loss_clip": 2.10351562, + "router_z_loss_mlp": 0.29577637, + "step": 9022, + "time_per_iteration": 2.6545016765594482 + }, + { + "auxiliary_loss_clip": 0.01264074, + "auxiliary_loss_mlp": 0.00180581, + "balance_loss_clip": 1.06050241, + "balance_loss_mlp": 0.15273325, + "epoch": 0.5424921088230873, + "flos": 26615624699520.0, + "grad_norm": 231.30009404226678, + "language_loss": 0.91150999, + "learning_rate": 1.822056885403915e-06, + "loss": 0.92595655, + "num_input_tokens_seen": 194274950, + "router_z_loss_clip": 2.03710938, + "router_z_loss_mlp": 0.27880859, + "step": 9023, + "time_per_iteration": 2.65956449508667 + }, + { + "auxiliary_loss_clip": 0.0130142, + "auxiliary_loss_mlp": 0.00179601, + "balance_loss_clip": 1.0900743, + "balance_loss_mlp": 0.15010856, + "epoch": 0.5425522320757553, + "flos": 23586811102080.0, + "grad_norm": 8.23486380295083, + "language_loss": 0.76659811, + "learning_rate": 1.8216689716386627e-06, + "loss": 0.78140831, + "num_input_tokens_seen": 194296155, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.29516602, + "step": 9024, + "time_per_iteration": 2.71765398979187 + }, + { + "auxiliary_loss_clip": 0.01273984, + "auxiliary_loss_mlp": 0.00180889, + "balance_loss_clip": 1.06763101, + "balance_loss_mlp": 0.15276755, + "epoch": 0.5426123553284232, + "flos": 30592264429440.0, + "grad_norm": 6.767997791198535, + "language_loss": 0.72697926, + "learning_rate": 1.8212810646357405e-06, + "loss": 0.74152803, + "num_input_tokens_seen": 194318025, + "router_z_loss_clip": 2.06347656, + "router_z_loss_mlp": 0.28125, + "step": 9025, + "time_per_iteration": 2.7597060203552246 + }, + { + "auxiliary_loss_clip": 0.01288051, + "auxiliary_loss_mlp": 0.00182331, + "balance_loss_clip": 1.07860148, + "balance_loss_mlp": 0.15317282, + "epoch": 0.5426724785810912, + "flos": 12495118671360.0, + "grad_norm": 33.99769094383093, + "language_loss": 0.82808971, + "learning_rate": 1.8208931644098591e-06, + "loss": 0.84279352, + "num_input_tokens_seen": 194336150, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.29199219, + "step": 9026, + "time_per_iteration": 2.6400275230407715 + }, + { + "auxiliary_loss_clip": 0.01280422, + "auxiliary_loss_mlp": 0.00205241, + "balance_loss_clip": 1.07155502, + "balance_loss_mlp": 0.17454445, + "epoch": 0.5427326018337592, + "flos": 26064611089920.0, + "grad_norm": 23.166239240420627, + "language_loss": 0.85047024, + "learning_rate": 1.8205052709757265e-06, + "loss": 0.86532688, + "num_input_tokens_seen": 194355980, + "router_z_loss_clip": 2.08886719, + "router_z_loss_mlp": 0.30688477, + "step": 9027, + "time_per_iteration": 2.703296661376953 + }, + { + "auxiliary_loss_clip": 0.01289999, + "auxiliary_loss_mlp": 0.00088267, + "balance_loss_clip": 1.15437186, + "balance_loss_mlp": 0.07877778, + "epoch": 0.5427927250864272, + "flos": 65984745576960.0, + "grad_norm": 5.825467870730686, + "language_loss": 0.56399918, + "learning_rate": 1.8201173843480515e-06, + "loss": 0.5777818, + "num_input_tokens_seen": 194422660, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.09472656, + "step": 9028, + "time_per_iteration": 3.180621862411499 + }, + { + "auxiliary_loss_clip": 0.01264925, + "auxiliary_loss_mlp": 0.00184943, + "balance_loss_clip": 1.05937552, + "balance_loss_mlp": 0.15683302, + "epoch": 0.5428528483390952, + "flos": 19975382904960.0, + "grad_norm": 217.14638944844683, + "language_loss": 0.87943679, + "learning_rate": 1.8197295045415442e-06, + "loss": 0.8939355, + "num_input_tokens_seen": 194438545, + "router_z_loss_clip": 2.05761719, + "router_z_loss_mlp": 0.28125, + "step": 9029, + "time_per_iteration": 2.7416439056396484 + }, + { + "auxiliary_loss_clip": 0.0128159, + "auxiliary_loss_mlp": 0.00187203, + "balance_loss_clip": 1.07223892, + "balance_loss_mlp": 0.15871215, + "epoch": 0.5429129715917631, + "flos": 21832323287040.0, + "grad_norm": 3.0507593233738612, + "language_loss": 0.88488394, + "learning_rate": 1.8193416315709112e-06, + "loss": 0.8995719, + "num_input_tokens_seen": 194458060, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.28466797, + "step": 9030, + "time_per_iteration": 2.6822896003723145 + }, + { + "auxiliary_loss_clip": 0.01284279, + "auxiliary_loss_mlp": 0.0016608, + "balance_loss_clip": 1.07221282, + "balance_loss_mlp": 0.13589609, + "epoch": 0.5429730948444311, + "flos": 27782685492480.0, + "grad_norm": 12862.268036236532, + "language_loss": 0.84922028, + "learning_rate": 1.8189537654508623e-06, + "loss": 0.86372387, + "num_input_tokens_seen": 194477405, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.30200195, + "step": 9031, + "time_per_iteration": 2.756277322769165 + }, + { + "auxiliary_loss_clip": 0.01285397, + "auxiliary_loss_mlp": 0.00159967, + "balance_loss_clip": 1.08050942, + "balance_loss_mlp": 0.13412286, + "epoch": 0.543033218097099, + "flos": 26760452336640.0, + "grad_norm": 4.006438114586593, + "language_loss": 0.90928411, + "learning_rate": 1.8185659061961045e-06, + "loss": 0.92373776, + "num_input_tokens_seen": 194497085, + "router_z_loss_clip": 2.04882812, + "router_z_loss_mlp": 0.25866699, + "step": 9032, + "time_per_iteration": 2.656460762023926 + }, + { + "auxiliary_loss_clip": 0.01288385, + "auxiliary_loss_mlp": 0.00198938, + "balance_loss_clip": 1.07298839, + "balance_loss_mlp": 0.1692307, + "epoch": 0.5430933413497671, + "flos": 22675254727680.0, + "grad_norm": 2.05909256620772, + "language_loss": 0.81752974, + "learning_rate": 1.8181780538213457e-06, + "loss": 0.83240294, + "num_input_tokens_seen": 194516785, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.296875, + "step": 9033, + "time_per_iteration": 2.6282646656036377 + }, + { + "auxiliary_loss_clip": 0.01271413, + "auxiliary_loss_mlp": 0.00177777, + "balance_loss_clip": 1.06282473, + "balance_loss_mlp": 0.14989373, + "epoch": 0.543153464602435, + "flos": 24607499973120.0, + "grad_norm": 4626.358512746314, + "language_loss": 0.84910429, + "learning_rate": 1.8177902083412935e-06, + "loss": 0.8635962, + "num_input_tokens_seen": 194536475, + "router_z_loss_clip": 2.0859375, + "router_z_loss_mlp": 0.27893066, + "step": 9034, + "time_per_iteration": 2.639439582824707 + }, + { + "auxiliary_loss_clip": 0.01273173, + "auxiliary_loss_mlp": 0.00176072, + "balance_loss_clip": 1.06384468, + "balance_loss_mlp": 0.14764038, + "epoch": 0.543213587855103, + "flos": 19025725178880.0, + "grad_norm": 44.05716110781452, + "language_loss": 0.91806906, + "learning_rate": 1.817402369770655e-06, + "loss": 0.93256152, + "num_input_tokens_seen": 194554495, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.28405762, + "step": 9035, + "time_per_iteration": 2.6830666065216064 + }, + { + "auxiliary_loss_clip": 0.01279961, + "auxiliary_loss_mlp": 0.00102417, + "balance_loss_clip": 1.1374501, + "balance_loss_mlp": 0.09478774, + "epoch": 0.5432737111077709, + "flos": 65686435125120.0, + "grad_norm": 0.6720765487579117, + "language_loss": 0.55222464, + "learning_rate": 1.8170145381241364e-06, + "loss": 0.56604844, + "num_input_tokens_seen": 194617620, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.07617188, + "step": 9036, + "time_per_iteration": 3.1237282752990723 + }, + { + "auxiliary_loss_clip": 0.01291334, + "auxiliary_loss_mlp": 0.00185923, + "balance_loss_clip": 1.07650435, + "balance_loss_mlp": 0.15478534, + "epoch": 0.5433338343604389, + "flos": 22091670460800.0, + "grad_norm": 3.9552969955978994, + "language_loss": 0.83755803, + "learning_rate": 1.8166267134164451e-06, + "loss": 0.85233057, + "num_input_tokens_seen": 194637690, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.31152344, + "step": 9037, + "time_per_iteration": 2.6717493534088135 + }, + { + "auxiliary_loss_clip": 0.01257238, + "auxiliary_loss_mlp": 0.00175719, + "balance_loss_clip": 1.05418348, + "balance_loss_mlp": 0.14869455, + "epoch": 0.5433939576131068, + "flos": 34672649616000.0, + "grad_norm": 78.22172608513462, + "language_loss": 0.73713326, + "learning_rate": 1.8162388956622875e-06, + "loss": 0.75146282, + "num_input_tokens_seen": 194659520, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.2701416, + "step": 9038, + "time_per_iteration": 2.761455535888672 + }, + { + "auxiliary_loss_clip": 0.01277535, + "auxiliary_loss_mlp": 0.00196933, + "balance_loss_clip": 1.06524467, + "balance_loss_mlp": 0.16765499, + "epoch": 0.5434540808657748, + "flos": 20303355012480.0, + "grad_norm": 36.649960670252305, + "language_loss": 0.86968088, + "learning_rate": 1.8158510848763692e-06, + "loss": 0.88442552, + "num_input_tokens_seen": 194677645, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.29260254, + "step": 9039, + "time_per_iteration": 2.651200771331787 + }, + { + "auxiliary_loss_clip": 0.01278232, + "auxiliary_loss_mlp": 0.00177392, + "balance_loss_clip": 1.06709266, + "balance_loss_mlp": 0.14558692, + "epoch": 0.5435142041184428, + "flos": 23112790295040.0, + "grad_norm": 2.793763133610511, + "language_loss": 0.86116099, + "learning_rate": 1.8154632810733962e-06, + "loss": 0.87571728, + "num_input_tokens_seen": 194697400, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.31835938, + "step": 9040, + "time_per_iteration": 2.7033214569091797 + }, + { + "auxiliary_loss_clip": 0.01251842, + "auxiliary_loss_mlp": 0.00088783, + "balance_loss_clip": 1.11327267, + "balance_loss_mlp": 0.08124937, + "epoch": 0.5435743273711108, + "flos": 64012746954240.0, + "grad_norm": 0.6633407967088617, + "language_loss": 0.5187695, + "learning_rate": 1.815075484268074e-06, + "loss": 0.53217578, + "num_input_tokens_seen": 194761205, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.07519531, + "step": 9041, + "time_per_iteration": 3.1589760780334473 + }, + { + "auxiliary_loss_clip": 0.01287713, + "auxiliary_loss_mlp": 0.0017889, + "balance_loss_clip": 1.07656121, + "balance_loss_mlp": 0.14708519, + "epoch": 0.5436344506237788, + "flos": 25118903859840.0, + "grad_norm": 7.9164375089894055, + "language_loss": 0.84481168, + "learning_rate": 1.8146876944751078e-06, + "loss": 0.85947776, + "num_input_tokens_seen": 194782445, + "router_z_loss_clip": 2.11035156, + "router_z_loss_mlp": 0.31835938, + "step": 9042, + "time_per_iteration": 2.69921612739563 + }, + { + "auxiliary_loss_clip": 0.01260643, + "auxiliary_loss_mlp": 0.00181694, + "balance_loss_clip": 1.06048131, + "balance_loss_mlp": 0.15240443, + "epoch": 0.5436945738764467, + "flos": 19572967860480.0, + "grad_norm": 20.37869249342085, + "language_loss": 0.76114285, + "learning_rate": 1.8142999117092033e-06, + "loss": 0.77556622, + "num_input_tokens_seen": 194800325, + "router_z_loss_clip": 2.00488281, + "router_z_loss_mlp": 0.29309082, + "step": 9043, + "time_per_iteration": 2.67382550239563 + }, + { + "auxiliary_loss_clip": 0.01278984, + "auxiliary_loss_mlp": 0.0019084, + "balance_loss_clip": 1.07186866, + "balance_loss_mlp": 0.16034669, + "epoch": 0.5437546971291147, + "flos": 21142515525120.0, + "grad_norm": 9.42423836439278, + "language_loss": 0.91094398, + "learning_rate": 1.8139121359850644e-06, + "loss": 0.92564231, + "num_input_tokens_seen": 194818675, + "router_z_loss_clip": 2.07226562, + "router_z_loss_mlp": 0.30493164, + "step": 9044, + "time_per_iteration": 2.631122350692749 + }, + { + "auxiliary_loss_clip": 0.0130452, + "auxiliary_loss_mlp": 0.00193767, + "balance_loss_clip": 1.08511114, + "balance_loss_mlp": 0.16246219, + "epoch": 0.5438148203817826, + "flos": 25118688378240.0, + "grad_norm": 8.81745390794574, + "language_loss": 0.70303917, + "learning_rate": 1.8135243673173956e-06, + "loss": 0.71802205, + "num_input_tokens_seen": 194836595, + "router_z_loss_clip": 2.19628906, + "router_z_loss_mlp": 0.31323242, + "step": 9045, + "time_per_iteration": 2.7987048625946045 + }, + { + "auxiliary_loss_clip": 0.0127954, + "auxiliary_loss_mlp": 0.00168201, + "balance_loss_clip": 1.07211208, + "balance_loss_mlp": 0.14149809, + "epoch": 0.5438749436344507, + "flos": 23002939526400.0, + "grad_norm": 4.6475986929391695, + "language_loss": 0.77586454, + "learning_rate": 1.8131366057209023e-06, + "loss": 0.79034197, + "num_input_tokens_seen": 194857520, + "router_z_loss_clip": 2.07421875, + "router_z_loss_mlp": 0.26708984, + "step": 9046, + "time_per_iteration": 2.654956340789795 + }, + { + "auxiliary_loss_clip": 0.01268093, + "auxiliary_loss_mlp": 0.00188232, + "balance_loss_clip": 1.06343818, + "balance_loss_mlp": 0.16125542, + "epoch": 0.5439350668871186, + "flos": 15487016065920.0, + "grad_norm": 2.63470561576437, + "language_loss": 0.8409493, + "learning_rate": 1.8127488512102868e-06, + "loss": 0.8555125, + "num_input_tokens_seen": 194876020, + "router_z_loss_clip": 2.04882812, + "router_z_loss_mlp": 0.26953125, + "step": 9047, + "time_per_iteration": 2.6353092193603516 + }, + { + "auxiliary_loss_clip": 0.01292521, + "auxiliary_loss_mlp": 0.00206911, + "balance_loss_clip": 1.08287728, + "balance_loss_mlp": 0.17856339, + "epoch": 0.5439951901397866, + "flos": 17238415311360.0, + "grad_norm": 15.355358258704085, + "language_loss": 0.8071965, + "learning_rate": 1.8123611038002547e-06, + "loss": 0.82219082, + "num_input_tokens_seen": 194894650, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.28369141, + "step": 9048, + "time_per_iteration": 2.6165213584899902 + }, + { + "auxiliary_loss_clip": 0.01276073, + "auxiliary_loss_mlp": 0.00202263, + "balance_loss_clip": 1.06579494, + "balance_loss_mlp": 0.17243698, + "epoch": 0.5440553133924545, + "flos": 18661016436480.0, + "grad_norm": 90.2177537044943, + "language_loss": 1.01950085, + "learning_rate": 1.8119733635055076e-06, + "loss": 1.03428411, + "num_input_tokens_seen": 194911935, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.2980957, + "step": 9049, + "time_per_iteration": 2.661010503768921 + }, + { + "auxiliary_loss_clip": 0.012868, + "auxiliary_loss_mlp": 0.00184396, + "balance_loss_clip": 1.07835126, + "balance_loss_mlp": 0.15733582, + "epoch": 0.5441154366451225, + "flos": 27122934435840.0, + "grad_norm": 4.5590711749294925, + "language_loss": 0.82501036, + "learning_rate": 1.8115856303407492e-06, + "loss": 0.83972228, + "num_input_tokens_seen": 194931620, + "router_z_loss_clip": 2.08203125, + "router_z_loss_mlp": 0.27050781, + "step": 9050, + "time_per_iteration": 2.6919608116149902 + }, + { + "auxiliary_loss_clip": 0.01298547, + "auxiliary_loss_mlp": 0.00193608, + "balance_loss_clip": 1.08319366, + "balance_loss_mlp": 0.1667864, + "epoch": 0.5441755598977904, + "flos": 25993867253760.0, + "grad_norm": 5.876946802242328, + "language_loss": 0.77688241, + "learning_rate": 1.8111979043206832e-06, + "loss": 0.79180396, + "num_input_tokens_seen": 194952560, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.26818848, + "step": 9051, + "time_per_iteration": 2.715994119644165 + }, + { + "auxiliary_loss_clip": 0.01283959, + "auxiliary_loss_mlp": 0.00191851, + "balance_loss_clip": 1.07320487, + "balance_loss_mlp": 0.16527955, + "epoch": 0.5442356831504584, + "flos": 32380041173760.0, + "grad_norm": 24.59558638236739, + "language_loss": 0.75093585, + "learning_rate": 1.810810185460011e-06, + "loss": 0.76569402, + "num_input_tokens_seen": 194973915, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.26574707, + "step": 9052, + "time_per_iteration": 4.146685361862183 + }, + { + "auxiliary_loss_clip": 0.01298563, + "auxiliary_loss_mlp": 0.00191114, + "balance_loss_clip": 1.08263421, + "balance_loss_mlp": 0.16405359, + "epoch": 0.5442958064031264, + "flos": 24164290056960.0, + "grad_norm": 5.232871640428255, + "language_loss": 0.98468173, + "learning_rate": 1.810422473773436e-06, + "loss": 0.99957848, + "num_input_tokens_seen": 194990170, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.2701416, + "step": 9053, + "time_per_iteration": 4.1191489696502686 + }, + { + "auxiliary_loss_clip": 0.0131776, + "auxiliary_loss_mlp": 0.00204723, + "balance_loss_clip": 1.09787393, + "balance_loss_mlp": 0.17791273, + "epoch": 0.5443559296557944, + "flos": 18764690065920.0, + "grad_norm": 3.7696694274516584, + "language_loss": 0.91288519, + "learning_rate": 1.8100347692756595e-06, + "loss": 0.92811, + "num_input_tokens_seen": 195006395, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.26806641, + "step": 9054, + "time_per_iteration": 2.670396327972412 + }, + { + "auxiliary_loss_clip": 0.01313459, + "auxiliary_loss_mlp": 0.0021474, + "balance_loss_clip": 1.09553719, + "balance_loss_mlp": 0.1875128, + "epoch": 0.5444160529084624, + "flos": 22632556435200.0, + "grad_norm": 22.3977864800882, + "language_loss": 0.78102905, + "learning_rate": 1.8096470719813836e-06, + "loss": 0.79631108, + "num_input_tokens_seen": 195025080, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.2722168, + "step": 9055, + "time_per_iteration": 2.6875159740448 + }, + { + "auxiliary_loss_clip": 0.01315415, + "auxiliary_loss_mlp": 0.0005314, + "balance_loss_clip": 1.16788077, + "balance_loss_mlp": 0.04622554, + "epoch": 0.5444761761611303, + "flos": 69671909600640.0, + "grad_norm": 0.730601141915738, + "language_loss": 0.57123089, + "learning_rate": 1.80925938190531e-06, + "loss": 0.58491635, + "num_input_tokens_seen": 195085725, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.06933594, + "step": 9056, + "time_per_iteration": 4.559916973114014 + }, + { + "auxiliary_loss_clip": 0.01314091, + "auxiliary_loss_mlp": 0.00221484, + "balance_loss_clip": 1.09096193, + "balance_loss_mlp": 0.19290994, + "epoch": 0.5445362994137983, + "flos": 14278442129280.0, + "grad_norm": 6.200288403787547, + "language_loss": 0.77514243, + "learning_rate": 1.8088716990621395e-06, + "loss": 0.79049814, + "num_input_tokens_seen": 195102585, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.28601074, + "step": 9057, + "time_per_iteration": 2.613802433013916 + }, + { + "auxiliary_loss_clip": 0.0132712, + "auxiliary_loss_mlp": 0.00208709, + "balance_loss_clip": 1.10565007, + "balance_loss_mlp": 0.18071899, + "epoch": 0.5445964226664662, + "flos": 28986195611520.0, + "grad_norm": 12.90885426177813, + "language_loss": 0.81695211, + "learning_rate": 1.8084840234665738e-06, + "loss": 0.83231038, + "num_input_tokens_seen": 195120055, + "router_z_loss_clip": 2.21582031, + "router_z_loss_mlp": 0.27978516, + "step": 9058, + "time_per_iteration": 2.670454978942871 + }, + { + "auxiliary_loss_clip": 0.01314141, + "auxiliary_loss_mlp": 0.00074251, + "balance_loss_clip": 1.16535449, + "balance_loss_mlp": 0.06695568, + "epoch": 0.5446565459191343, + "flos": 68620230270720.0, + "grad_norm": 0.7670727536768611, + "language_loss": 0.61792272, + "learning_rate": 1.808096355133312e-06, + "loss": 0.63180661, + "num_input_tokens_seen": 195181045, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.07275391, + "step": 9059, + "time_per_iteration": 3.2225944995880127 + }, + { + "auxiliary_loss_clip": 0.01292852, + "auxiliary_loss_mlp": 0.00208866, + "balance_loss_clip": 1.07920527, + "balance_loss_mlp": 0.18311691, + "epoch": 0.5447166691718022, + "flos": 16216469464320.0, + "grad_norm": 36.363134355467416, + "language_loss": 0.88402176, + "learning_rate": 1.8077086940770572e-06, + "loss": 0.89903903, + "num_input_tokens_seen": 195198840, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.25744629, + "step": 9060, + "time_per_iteration": 4.023879766464233 + }, + { + "auxiliary_loss_clip": 0.01310548, + "auxiliary_loss_mlp": 0.00215019, + "balance_loss_clip": 1.09217107, + "balance_loss_mlp": 0.18892412, + "epoch": 0.5447767924244702, + "flos": 25849039616640.0, + "grad_norm": 6.523750944046212, + "language_loss": 0.87471896, + "learning_rate": 1.8073210403125072e-06, + "loss": 0.88997459, + "num_input_tokens_seen": 195218720, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.26123047, + "step": 9061, + "time_per_iteration": 2.692802906036377 + }, + { + "auxiliary_loss_clip": 0.01306662, + "auxiliary_loss_mlp": 0.00229567, + "balance_loss_clip": 1.09076011, + "balance_loss_mlp": 0.20309061, + "epoch": 0.5448369156771381, + "flos": 19677718897920.0, + "grad_norm": 19.12608170176957, + "language_loss": 0.93093812, + "learning_rate": 1.8069333938543627e-06, + "loss": 0.94630039, + "num_input_tokens_seen": 195235770, + "router_z_loss_clip": 2.16113281, + "router_z_loss_mlp": 0.26477051, + "step": 9062, + "time_per_iteration": 2.635654926300049 + }, + { + "auxiliary_loss_clip": 0.01325987, + "auxiliary_loss_mlp": 0.00254909, + "balance_loss_clip": 1.09783363, + "balance_loss_mlp": 0.2264418, + "epoch": 0.5448970389298061, + "flos": 19281804215040.0, + "grad_norm": 4.574975899997816, + "language_loss": 0.90727305, + "learning_rate": 1.8065457547173233e-06, + "loss": 0.92308199, + "num_input_tokens_seen": 195254870, + "router_z_loss_clip": 2.28320312, + "router_z_loss_mlp": 0.28417969, + "step": 9063, + "time_per_iteration": 2.761207103729248 + }, + { + "auxiliary_loss_clip": 0.01327887, + "auxiliary_loss_mlp": 0.00230969, + "balance_loss_clip": 1.10363102, + "balance_loss_mlp": 0.20477876, + "epoch": 0.544957162182474, + "flos": 20991690316800.0, + "grad_norm": 8.432295001302016, + "language_loss": 0.72192192, + "learning_rate": 1.8061581229160878e-06, + "loss": 0.7375105, + "num_input_tokens_seen": 195273390, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.26196289, + "step": 9064, + "time_per_iteration": 2.6254770755767822 + }, + { + "auxiliary_loss_clip": 0.01322708, + "auxiliary_loss_mlp": 0.00218209, + "balance_loss_clip": 1.09474087, + "balance_loss_mlp": 0.18826318, + "epoch": 0.545017285435142, + "flos": 25374587846400.0, + "grad_norm": 4.142378529147941, + "language_loss": 0.88017738, + "learning_rate": 1.8057704984653566e-06, + "loss": 0.89558655, + "num_input_tokens_seen": 195295635, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.29980469, + "step": 9065, + "time_per_iteration": 2.7270543575286865 + }, + { + "auxiliary_loss_clip": 0.0130855, + "auxiliary_loss_mlp": 0.00192815, + "balance_loss_clip": 1.09197819, + "balance_loss_mlp": 0.16754216, + "epoch": 0.54507740868781, + "flos": 19134749934720.0, + "grad_norm": 7.6284124744790365, + "language_loss": 0.86644495, + "learning_rate": 1.805382881379827e-06, + "loss": 0.88145864, + "num_input_tokens_seen": 195312545, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.25268555, + "step": 9066, + "time_per_iteration": 2.634230136871338 + }, + { + "auxiliary_loss_clip": 0.01306542, + "auxiliary_loss_mlp": 0.00238424, + "balance_loss_clip": 1.08320236, + "balance_loss_mlp": 0.21031405, + "epoch": 0.545137531940478, + "flos": 26249802635520.0, + "grad_norm": 5.71839927281471, + "language_loss": 0.85875106, + "learning_rate": 1.8049952716741975e-06, + "loss": 0.8742007, + "num_input_tokens_seen": 195332955, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.28112793, + "step": 9067, + "time_per_iteration": 2.8013784885406494 + }, + { + "auxiliary_loss_clip": 0.0132419, + "auxiliary_loss_mlp": 0.00250204, + "balance_loss_clip": 1.09514105, + "balance_loss_mlp": 0.2202352, + "epoch": 0.545197655193146, + "flos": 37555629995520.0, + "grad_norm": 19.745220947653916, + "language_loss": 0.71678621, + "learning_rate": 1.8046076693631682e-06, + "loss": 0.73253012, + "num_input_tokens_seen": 195355930, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.29980469, + "step": 9068, + "time_per_iteration": 2.811072587966919 + }, + { + "auxiliary_loss_clip": 0.01299864, + "auxiliary_loss_mlp": 0.00238549, + "balance_loss_clip": 1.08400679, + "balance_loss_mlp": 0.21361083, + "epoch": 0.5452577784458139, + "flos": 26031250333440.0, + "grad_norm": 105.98226092410016, + "language_loss": 0.77613515, + "learning_rate": 1.8042200744614343e-06, + "loss": 0.79151928, + "num_input_tokens_seen": 195376445, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.24963379, + "step": 9069, + "time_per_iteration": 2.8020803928375244 + }, + { + "auxiliary_loss_clip": 0.01308317, + "auxiliary_loss_mlp": 0.00217809, + "balance_loss_clip": 1.09062099, + "balance_loss_mlp": 0.19331148, + "epoch": 0.5453179016984819, + "flos": 17639034675840.0, + "grad_norm": 7.099018468138661, + "language_loss": 0.81976545, + "learning_rate": 1.8038324869836957e-06, + "loss": 0.83502674, + "num_input_tokens_seen": 195393725, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.24511719, + "step": 9070, + "time_per_iteration": 2.7316269874572754 + }, + { + "auxiliary_loss_clip": 0.01303877, + "auxiliary_loss_mlp": 0.00237645, + "balance_loss_clip": 1.08430028, + "balance_loss_mlp": 0.21128826, + "epoch": 0.5453780249511498, + "flos": 23216679406080.0, + "grad_norm": 3.776360502421378, + "language_loss": 0.68263209, + "learning_rate": 1.8034449069446489e-06, + "loss": 0.69804728, + "num_input_tokens_seen": 195411380, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.2635498, + "step": 9071, + "time_per_iteration": 2.7307615280151367 + }, + { + "auxiliary_loss_clip": 0.01336921, + "auxiliary_loss_mlp": 0.00117637, + "balance_loss_clip": 1.18194687, + "balance_loss_mlp": 0.11024585, + "epoch": 0.5454381482038179, + "flos": 68696504801280.0, + "grad_norm": 0.6936939107651525, + "language_loss": 0.56725907, + "learning_rate": 1.80305733435899e-06, + "loss": 0.58180463, + "num_input_tokens_seen": 195482015, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.07373047, + "step": 9072, + "time_per_iteration": 3.2647197246551514 + }, + { + "auxiliary_loss_clip": 0.01302247, + "auxiliary_loss_mlp": 0.00244259, + "balance_loss_clip": 1.08459961, + "balance_loss_mlp": 0.21794993, + "epoch": 0.5454982714564858, + "flos": 13260626346240.0, + "grad_norm": 1099.535143130576, + "language_loss": 0.77871084, + "learning_rate": 1.8026697692414174e-06, + "loss": 0.79417586, + "num_input_tokens_seen": 195500440, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.26318359, + "step": 9073, + "time_per_iteration": 2.649066686630249 + }, + { + "auxiliary_loss_clip": 0.01307792, + "auxiliary_loss_mlp": 0.00227844, + "balance_loss_clip": 1.09167051, + "balance_loss_mlp": 0.2035608, + "epoch": 0.5455583947091538, + "flos": 21835878733440.0, + "grad_norm": 8.81796971990186, + "language_loss": 0.78568876, + "learning_rate": 1.802282211606627e-06, + "loss": 0.80104512, + "num_input_tokens_seen": 195520860, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.24291992, + "step": 9074, + "time_per_iteration": 2.697845458984375 + }, + { + "auxiliary_loss_clip": 0.01312874, + "auxiliary_loss_mlp": 0.00244015, + "balance_loss_clip": 1.08944535, + "balance_loss_mlp": 0.21786046, + "epoch": 0.5456185179618217, + "flos": 17817438551040.0, + "grad_norm": 13.47702700067254, + "language_loss": 0.76197559, + "learning_rate": 1.8018946614693148e-06, + "loss": 0.7775445, + "num_input_tokens_seen": 195538615, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.26135254, + "step": 9075, + "time_per_iteration": 2.58799409866333 + }, + { + "auxiliary_loss_clip": 0.01320931, + "auxiliary_loss_mlp": 0.00240368, + "balance_loss_clip": 1.09520721, + "balance_loss_mlp": 0.21546504, + "epoch": 0.5456786412144897, + "flos": 21069401391360.0, + "grad_norm": 4.821176882022189, + "language_loss": 0.88341331, + "learning_rate": 1.8015071188441768e-06, + "loss": 0.89902627, + "num_input_tokens_seen": 195557460, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.2487793, + "step": 9076, + "time_per_iteration": 2.619471788406372 + }, + { + "auxiliary_loss_clip": 0.01312267, + "auxiliary_loss_mlp": 0.00232697, + "balance_loss_clip": 1.08960438, + "balance_loss_mlp": 0.20598176, + "epoch": 0.5457387644671576, + "flos": 23294965098240.0, + "grad_norm": 13.884819627097615, + "language_loss": 0.85993081, + "learning_rate": 1.8011195837459089e-06, + "loss": 0.87538046, + "num_input_tokens_seen": 195577985, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.26721191, + "step": 9077, + "time_per_iteration": 2.6702609062194824 + }, + { + "auxiliary_loss_clip": 0.01301122, + "auxiliary_loss_mlp": 0.0022541, + "balance_loss_clip": 1.07551289, + "balance_loss_mlp": 0.19936307, + "epoch": 0.5457988877198257, + "flos": 21617039122560.0, + "grad_norm": 7.480703242172585, + "language_loss": 0.78232682, + "learning_rate": 1.8007320561892064e-06, + "loss": 0.79759216, + "num_input_tokens_seen": 195597620, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.26025391, + "step": 9078, + "time_per_iteration": 2.6924681663513184 + }, + { + "auxiliary_loss_clip": 0.01295041, + "auxiliary_loss_mlp": 0.00247463, + "balance_loss_clip": 1.07174468, + "balance_loss_mlp": 0.22158276, + "epoch": 0.5458590109724936, + "flos": 23762485543680.0, + "grad_norm": 30.888869745938337, + "language_loss": 0.87974137, + "learning_rate": 1.800344536188764e-06, + "loss": 0.8951664, + "num_input_tokens_seen": 195615910, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.25878906, + "step": 9079, + "time_per_iteration": 2.642232894897461 + }, + { + "auxiliary_loss_clip": 0.01316457, + "auxiliary_loss_mlp": 0.00263179, + "balance_loss_clip": 1.081967, + "balance_loss_mlp": 0.23594013, + "epoch": 0.5459191342251616, + "flos": 24424283675520.0, + "grad_norm": 84.50921370584351, + "language_loss": 0.84449714, + "learning_rate": 1.799957023759277e-06, + "loss": 0.86029351, + "num_input_tokens_seen": 195635620, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.27233887, + "step": 9080, + "time_per_iteration": 2.711336612701416 + }, + { + "auxiliary_loss_clip": 0.01307712, + "auxiliary_loss_mlp": 0.00275014, + "balance_loss_clip": 1.08099246, + "balance_loss_mlp": 0.24851349, + "epoch": 0.5459792574778296, + "flos": 23623009032960.0, + "grad_norm": 250.68283453298082, + "language_loss": 0.95534909, + "learning_rate": 1.7995695189154392e-06, + "loss": 0.97117639, + "num_input_tokens_seen": 195652495, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.26501465, + "step": 9081, + "time_per_iteration": 2.6576597690582275 + }, + { + "auxiliary_loss_clip": 0.01301234, + "auxiliary_loss_mlp": 0.00269657, + "balance_loss_clip": 1.07463312, + "balance_loss_mlp": 0.2425009, + "epoch": 0.5460393807304975, + "flos": 19135540033920.0, + "grad_norm": 37.90435050385178, + "language_loss": 0.78042698, + "learning_rate": 1.7991820216719461e-06, + "loss": 0.79613584, + "num_input_tokens_seen": 195671965, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.27148438, + "step": 9082, + "time_per_iteration": 2.6460983753204346 + }, + { + "auxiliary_loss_clip": 0.01296951, + "auxiliary_loss_mlp": 0.00271407, + "balance_loss_clip": 1.07502198, + "balance_loss_mlp": 0.24603924, + "epoch": 0.5460995039831655, + "flos": 35918534805120.0, + "grad_norm": 906.5465964162578, + "language_loss": 0.73053241, + "learning_rate": 1.7987945320434906e-06, + "loss": 0.74621606, + "num_input_tokens_seen": 195694725, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.25378418, + "step": 9083, + "time_per_iteration": 2.7811779975891113 + }, + { + "auxiliary_loss_clip": 0.01285837, + "auxiliary_loss_mlp": 0.00231023, + "balance_loss_clip": 1.06647718, + "balance_loss_mlp": 0.20641845, + "epoch": 0.5461596272358334, + "flos": 26759231274240.0, + "grad_norm": 3.59059725653078, + "language_loss": 0.87030494, + "learning_rate": 1.798407050044766e-06, + "loss": 0.88547355, + "num_input_tokens_seen": 195714090, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.24572754, + "step": 9084, + "time_per_iteration": 2.6733345985412598 + }, + { + "auxiliary_loss_clip": 0.01313775, + "auxiliary_loss_mlp": 0.00254221, + "balance_loss_clip": 1.08668506, + "balance_loss_mlp": 0.22860281, + "epoch": 0.5462197504885015, + "flos": 20886580143360.0, + "grad_norm": 2.386000036205379, + "language_loss": 0.83181459, + "learning_rate": 1.7980195756904675e-06, + "loss": 0.8474946, + "num_input_tokens_seen": 195733585, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.25634766, + "step": 9085, + "time_per_iteration": 2.618704319000244 + }, + { + "auxiliary_loss_clip": 0.01301893, + "auxiliary_loss_mlp": 0.002707, + "balance_loss_clip": 1.07157063, + "balance_loss_mlp": 0.24477226, + "epoch": 0.5462798737411694, + "flos": 25804976607360.0, + "grad_norm": 12.53032969717092, + "language_loss": 0.82306778, + "learning_rate": 1.7976321089952857e-06, + "loss": 0.83879364, + "num_input_tokens_seen": 195752820, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.25939941, + "step": 9086, + "time_per_iteration": 2.71980881690979 + }, + { + "auxiliary_loss_clip": 0.01303958, + "auxiliary_loss_mlp": 0.00278612, + "balance_loss_clip": 1.08186209, + "balance_loss_mlp": 0.25376922, + "epoch": 0.5463399969938374, + "flos": 25775027642880.0, + "grad_norm": 7.037331332714847, + "language_loss": 0.82174385, + "learning_rate": 1.7972446499739155e-06, + "loss": 0.83756953, + "num_input_tokens_seen": 195773740, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.24841309, + "step": 9087, + "time_per_iteration": 2.6893765926361084 + }, + { + "auxiliary_loss_clip": 0.0133085, + "auxiliary_loss_mlp": 0.00253788, + "balance_loss_clip": 1.09667087, + "balance_loss_mlp": 0.22706087, + "epoch": 0.5464001202465053, + "flos": 18843298980480.0, + "grad_norm": 8.19151636195717, + "language_loss": 0.87658107, + "learning_rate": 1.7968571986410484e-06, + "loss": 0.89242744, + "num_input_tokens_seen": 195792125, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.2668457, + "step": 9088, + "time_per_iteration": 2.6144771575927734 + }, + { + "auxiliary_loss_clip": 0.01277347, + "auxiliary_loss_mlp": 0.0010224, + "balance_loss_clip": 1.11962652, + "balance_loss_mlp": 0.0955638, + "epoch": 0.5464602434991733, + "flos": 69049541623680.0, + "grad_norm": 0.713821781264253, + "language_loss": 0.57324678, + "learning_rate": 1.7964697550113758e-06, + "loss": 0.58704263, + "num_input_tokens_seen": 195854935, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.06689453, + "step": 9089, + "time_per_iteration": 3.1439671516418457 + }, + { + "auxiliary_loss_clip": 0.01305046, + "auxiliary_loss_mlp": 0.00270683, + "balance_loss_clip": 1.07761967, + "balance_loss_mlp": 0.24382551, + "epoch": 0.5465203667518412, + "flos": 27560039040000.0, + "grad_norm": 25.3806806326181, + "language_loss": 0.83877158, + "learning_rate": 1.7960823190995918e-06, + "loss": 0.8545289, + "num_input_tokens_seen": 195874715, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.26843262, + "step": 9090, + "time_per_iteration": 2.6574392318725586 + }, + { + "auxiliary_loss_clip": 0.0129391, + "auxiliary_loss_mlp": 0.00265304, + "balance_loss_clip": 1.06592917, + "balance_loss_mlp": 0.2368722, + "epoch": 0.5465804900045093, + "flos": 21210206705280.0, + "grad_norm": 9.124672554874346, + "language_loss": 0.82420367, + "learning_rate": 1.7956948909203855e-06, + "loss": 0.83979583, + "num_input_tokens_seen": 195892610, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.28479004, + "step": 9091, + "time_per_iteration": 2.7761759757995605 + }, + { + "auxiliary_loss_clip": 0.01289366, + "auxiliary_loss_mlp": 0.0026185, + "balance_loss_clip": 1.066872, + "balance_loss_mlp": 0.23629168, + "epoch": 0.5466406132571772, + "flos": 22488949860480.0, + "grad_norm": 19.559387878133347, + "language_loss": 0.83430362, + "learning_rate": 1.7953074704884498e-06, + "loss": 0.84981579, + "num_input_tokens_seen": 195911085, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.25549316, + "step": 9092, + "time_per_iteration": 2.675853967666626 + }, + { + "auxiliary_loss_clip": 0.01293569, + "auxiliary_loss_mlp": 0.00246749, + "balance_loss_clip": 1.0639379, + "balance_loss_mlp": 0.21953395, + "epoch": 0.5467007365098452, + "flos": 17675843137920.0, + "grad_norm": 41.29888553496963, + "language_loss": 0.87241769, + "learning_rate": 1.794920057818476e-06, + "loss": 0.88782084, + "num_input_tokens_seen": 195929845, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.27209473, + "step": 9093, + "time_per_iteration": 2.6139354705810547 + }, + { + "auxiliary_loss_clip": 0.0128294, + "auxiliary_loss_mlp": 0.00282104, + "balance_loss_clip": 1.0578196, + "balance_loss_mlp": 0.25292131, + "epoch": 0.5467608597625132, + "flos": 15698852524800.0, + "grad_norm": 4.3922282216809245, + "language_loss": 0.78755361, + "learning_rate": 1.7945326529251533e-06, + "loss": 0.80320406, + "num_input_tokens_seen": 195946350, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.29174805, + "step": 9094, + "time_per_iteration": 4.00624942779541 + }, + { + "auxiliary_loss_clip": 0.01307011, + "auxiliary_loss_mlp": 0.00242989, + "balance_loss_clip": 1.07775223, + "balance_loss_mlp": 0.21905203, + "epoch": 0.5468209830151811, + "flos": 24312816794880.0, + "grad_norm": 5.367033609888581, + "language_loss": 0.77119845, + "learning_rate": 1.7941452558231731e-06, + "loss": 0.78669846, + "num_input_tokens_seen": 195959840, + "router_z_loss_clip": 2.29101562, + "router_z_loss_mlp": 0.23950195, + "step": 9095, + "time_per_iteration": 2.629438877105713 + }, + { + "auxiliary_loss_clip": 0.01305322, + "auxiliary_loss_mlp": 0.00262174, + "balance_loss_clip": 1.07865906, + "balance_loss_mlp": 0.2357931, + "epoch": 0.5468811062678491, + "flos": 29166323339520.0, + "grad_norm": 1.9792367738159138, + "language_loss": 0.73694766, + "learning_rate": 1.7937578665272256e-06, + "loss": 0.7526226, + "num_input_tokens_seen": 195981125, + "router_z_loss_clip": 2.265625, + "router_z_loss_mlp": 0.2635498, + "step": 9096, + "time_per_iteration": 4.18281888961792 + }, + { + "auxiliary_loss_clip": 0.0127147, + "auxiliary_loss_mlp": 0.00078302, + "balance_loss_clip": 1.11457753, + "balance_loss_mlp": 0.07200817, + "epoch": 0.546941229520517, + "flos": 67867037982720.0, + "grad_norm": 0.7298959601909331, + "language_loss": 0.56957185, + "learning_rate": 1.7933704850520007e-06, + "loss": 0.58306956, + "num_input_tokens_seen": 196038880, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.06298828, + "step": 9097, + "time_per_iteration": 3.2156898975372314 + }, + { + "auxiliary_loss_clip": 0.0127126, + "auxiliary_loss_mlp": 0.00075776, + "balance_loss_clip": 1.11461449, + "balance_loss_mlp": 0.06895762, + "epoch": 0.5470013527731851, + "flos": 58270306625280.0, + "grad_norm": 0.9096712615060342, + "language_loss": 0.64433837, + "learning_rate": 1.7929831114121868e-06, + "loss": 0.65780872, + "num_input_tokens_seen": 196099215, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.06835938, + "step": 9098, + "time_per_iteration": 4.489742279052734 + }, + { + "auxiliary_loss_clip": 0.01296888, + "auxiliary_loss_mlp": 0.00253883, + "balance_loss_clip": 1.0700438, + "balance_loss_mlp": 0.22806212, + "epoch": 0.547061476025853, + "flos": 22965915582720.0, + "grad_norm": 22.84276107343891, + "language_loss": 0.82112807, + "learning_rate": 1.7925957456224753e-06, + "loss": 0.83663583, + "num_input_tokens_seen": 196120370, + "router_z_loss_clip": 2.265625, + "router_z_loss_mlp": 0.25830078, + "step": 9099, + "time_per_iteration": 2.6682024002075195 + }, + { + "auxiliary_loss_clip": 0.01277602, + "auxiliary_loss_mlp": 0.0025921, + "balance_loss_clip": 1.0558821, + "balance_loss_mlp": 0.2347725, + "epoch": 0.547121599278521, + "flos": 29968244426880.0, + "grad_norm": 190.53814388648922, + "language_loss": 0.7909081, + "learning_rate": 1.7922083876975537e-06, + "loss": 0.80627626, + "num_input_tokens_seen": 196139075, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.24438477, + "step": 9100, + "time_per_iteration": 2.6874172687530518 + }, + { + "auxiliary_loss_clip": 0.01279749, + "auxiliary_loss_mlp": 0.00293991, + "balance_loss_clip": 1.05787325, + "balance_loss_mlp": 0.26758665, + "epoch": 0.5471817225311889, + "flos": 36535443914880.0, + "grad_norm": 20.596391685621132, + "language_loss": 0.73135102, + "learning_rate": 1.7918210376521102e-06, + "loss": 0.74708843, + "num_input_tokens_seen": 196159990, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.26416016, + "step": 9101, + "time_per_iteration": 2.746894121170044 + }, + { + "auxiliary_loss_clip": 0.01297349, + "auxiliary_loss_mlp": 0.00273991, + "balance_loss_clip": 1.07048082, + "balance_loss_mlp": 0.24715725, + "epoch": 0.5472418457838569, + "flos": 25775243124480.0, + "grad_norm": 7.633484708692332, + "language_loss": 0.8409723, + "learning_rate": 1.7914336955008343e-06, + "loss": 0.85668564, + "num_input_tokens_seen": 196180570, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.26818848, + "step": 9102, + "time_per_iteration": 2.6717400550842285 + }, + { + "auxiliary_loss_clip": 0.01291293, + "auxiliary_loss_mlp": 0.00266823, + "balance_loss_clip": 1.06862056, + "balance_loss_mlp": 0.24168126, + "epoch": 0.5473019690365248, + "flos": 27887687925120.0, + "grad_norm": 30.224196486006765, + "language_loss": 0.78212315, + "learning_rate": 1.791046361258413e-06, + "loss": 0.79770422, + "num_input_tokens_seen": 196200300, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.25109863, + "step": 9103, + "time_per_iteration": 4.140236854553223 + }, + { + "auxiliary_loss_clip": 0.01287108, + "auxiliary_loss_mlp": 0.00284494, + "balance_loss_clip": 1.0664587, + "balance_loss_mlp": 0.25900719, + "epoch": 0.5473620922891929, + "flos": 57631490219520.0, + "grad_norm": 51.51173705020782, + "language_loss": 0.69597775, + "learning_rate": 1.7906590349395356e-06, + "loss": 0.71169376, + "num_input_tokens_seen": 196228525, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.25500488, + "step": 9104, + "time_per_iteration": 3.0095863342285156 + }, + { + "auxiliary_loss_clip": 0.01286746, + "auxiliary_loss_mlp": 0.00304946, + "balance_loss_clip": 1.06239808, + "balance_loss_mlp": 0.27762344, + "epoch": 0.5474222155418608, + "flos": 19354056422400.0, + "grad_norm": 50.90361726412666, + "language_loss": 0.90990937, + "learning_rate": 1.790271716558888e-06, + "loss": 0.92582631, + "num_input_tokens_seen": 196247690, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.27294922, + "step": 9105, + "time_per_iteration": 2.6556460857391357 + }, + { + "auxiliary_loss_clip": 0.01283507, + "auxiliary_loss_mlp": 0.00251079, + "balance_loss_clip": 1.05739117, + "balance_loss_mlp": 0.22689131, + "epoch": 0.5474823387945288, + "flos": 25120448144640.0, + "grad_norm": 35.392858621436474, + "language_loss": 0.86444449, + "learning_rate": 1.7898844061311575e-06, + "loss": 0.87979031, + "num_input_tokens_seen": 196268555, + "router_z_loss_clip": 2.265625, + "router_z_loss_mlp": 0.24182129, + "step": 9106, + "time_per_iteration": 2.687185764312744 + }, + { + "auxiliary_loss_clip": 0.01304496, + "auxiliary_loss_mlp": 0.00283624, + "balance_loss_clip": 1.0750159, + "balance_loss_mlp": 0.25917432, + "epoch": 0.5475424620471967, + "flos": 18004174381440.0, + "grad_norm": 7.104137335639339, + "language_loss": 0.77810621, + "learning_rate": 1.7894971036710322e-06, + "loss": 0.79398739, + "num_input_tokens_seen": 196285585, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.24450684, + "step": 9107, + "time_per_iteration": 2.646055221557617 + }, + { + "auxiliary_loss_clip": 0.01277716, + "auxiliary_loss_mlp": 0.00298603, + "balance_loss_clip": 1.05283415, + "balance_loss_mlp": 0.27184021, + "epoch": 0.5476025852998647, + "flos": 22309324922880.0, + "grad_norm": 5.432447753472254, + "language_loss": 0.74298769, + "learning_rate": 1.789109809193197e-06, + "loss": 0.75875086, + "num_input_tokens_seen": 196305085, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.26757812, + "step": 9108, + "time_per_iteration": 2.62410044670105 + }, + { + "auxiliary_loss_clip": 0.01274328, + "auxiliary_loss_mlp": 0.00287446, + "balance_loss_clip": 1.05183685, + "balance_loss_mlp": 0.2622692, + "epoch": 0.5476627085525327, + "flos": 20120497850880.0, + "grad_norm": 9.714372886837825, + "language_loss": 0.81373638, + "learning_rate": 1.7887225227123396e-06, + "loss": 0.82935405, + "num_input_tokens_seen": 196323945, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.25170898, + "step": 9109, + "time_per_iteration": 2.634201765060425 + }, + { + "auxiliary_loss_clip": 0.01283459, + "auxiliary_loss_mlp": 0.00294748, + "balance_loss_clip": 1.05967534, + "balance_loss_mlp": 0.27066767, + "epoch": 0.5477228318052006, + "flos": 17712579772800.0, + "grad_norm": 5.5680323667355704, + "language_loss": 0.85468125, + "learning_rate": 1.7883352442431457e-06, + "loss": 0.87046325, + "num_input_tokens_seen": 196342200, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.2409668, + "step": 9110, + "time_per_iteration": 2.7130260467529297 + }, + { + "auxiliary_loss_clip": 0.01270492, + "auxiliary_loss_mlp": 0.00271219, + "balance_loss_clip": 1.05132496, + "balance_loss_mlp": 0.24768718, + "epoch": 0.5477829550578687, + "flos": 25848895962240.0, + "grad_norm": 52.80439660224323, + "language_loss": 0.77784771, + "learning_rate": 1.7879479738002993e-06, + "loss": 0.79326481, + "num_input_tokens_seen": 196362940, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.23547363, + "step": 9111, + "time_per_iteration": 2.686607837677002 + }, + { + "auxiliary_loss_clip": 0.01285088, + "auxiliary_loss_mlp": 0.00266405, + "balance_loss_clip": 1.05926514, + "balance_loss_mlp": 0.24170466, + "epoch": 0.5478430783105366, + "flos": 23039676161280.0, + "grad_norm": 7.91922279566762, + "language_loss": 0.78749841, + "learning_rate": 1.7875607113984876e-06, + "loss": 0.80301338, + "num_input_tokens_seen": 196383070, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.24719238, + "step": 9112, + "time_per_iteration": 2.6667351722717285 + }, + { + "auxiliary_loss_clip": 0.01290002, + "auxiliary_loss_mlp": 0.00287425, + "balance_loss_clip": 1.06130409, + "balance_loss_mlp": 0.26202166, + "epoch": 0.5479032015632046, + "flos": 16071210864000.0, + "grad_norm": 23.936475040256123, + "language_loss": 0.972332, + "learning_rate": 1.7871734570523953e-06, + "loss": 0.98810625, + "num_input_tokens_seen": 196398485, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.25415039, + "step": 9113, + "time_per_iteration": 2.725208282470703 + }, + { + "auxiliary_loss_clip": 0.01280551, + "auxiliary_loss_mlp": 0.0028692, + "balance_loss_clip": 1.05597591, + "balance_loss_mlp": 0.26162419, + "epoch": 0.5479633248158725, + "flos": 24278701852800.0, + "grad_norm": 2.557880780959388, + "language_loss": 0.78159702, + "learning_rate": 1.7867862107767067e-06, + "loss": 0.79727179, + "num_input_tokens_seen": 196417725, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.25317383, + "step": 9114, + "time_per_iteration": 2.6955180168151855 + }, + { + "auxiliary_loss_clip": 0.01284236, + "auxiliary_loss_mlp": 0.00299719, + "balance_loss_clip": 1.05626631, + "balance_loss_mlp": 0.27622244, + "epoch": 0.5480234480685405, + "flos": 26358216860160.0, + "grad_norm": 4.821109478195481, + "language_loss": 0.77660894, + "learning_rate": 1.7863989725861066e-06, + "loss": 0.79244852, + "num_input_tokens_seen": 196437840, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.23474121, + "step": 9115, + "time_per_iteration": 2.669281005859375 + }, + { + "auxiliary_loss_clip": 0.01280736, + "auxiliary_loss_mlp": 0.00296184, + "balance_loss_clip": 1.05230951, + "balance_loss_mlp": 0.26876578, + "epoch": 0.5480835713212084, + "flos": 22055077480320.0, + "grad_norm": 13.988057091942085, + "language_loss": 0.80750829, + "learning_rate": 1.7860117424952781e-06, + "loss": 0.82327747, + "num_input_tokens_seen": 196457300, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.27416992, + "step": 9116, + "time_per_iteration": 2.716392755508423 + }, + { + "auxiliary_loss_clip": 0.01282077, + "auxiliary_loss_mlp": 0.00297805, + "balance_loss_clip": 1.05808234, + "balance_loss_mlp": 0.27385592, + "epoch": 0.5481436945738765, + "flos": 25301042749440.0, + "grad_norm": 3.087079955460182, + "language_loss": 0.8344872, + "learning_rate": 1.7856245205189063e-06, + "loss": 0.85028601, + "num_input_tokens_seen": 196476720, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.23974609, + "step": 9117, + "time_per_iteration": 2.6997416019439697 + }, + { + "auxiliary_loss_clip": 0.01266458, + "auxiliary_loss_mlp": 0.00288421, + "balance_loss_clip": 1.04847336, + "balance_loss_mlp": 0.2648648, + "epoch": 0.5482038178265444, + "flos": 33580857772800.0, + "grad_norm": 10.22660061945206, + "language_loss": 0.69425011, + "learning_rate": 1.785237306671674e-06, + "loss": 0.70979893, + "num_input_tokens_seen": 196496765, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.2355957, + "step": 9118, + "time_per_iteration": 2.7307722568511963 + }, + { + "auxiliary_loss_clip": 0.013143, + "auxiliary_loss_mlp": 0.0029331, + "balance_loss_clip": 1.07650435, + "balance_loss_mlp": 0.26737007, + "epoch": 0.5482639410792124, + "flos": 19026192055680.0, + "grad_norm": 4.1170681863119585, + "language_loss": 0.85673237, + "learning_rate": 1.7848501009682646e-06, + "loss": 0.87280846, + "num_input_tokens_seen": 196516220, + "router_z_loss_clip": 2.37695312, + "router_z_loss_mlp": 0.25939941, + "step": 9119, + "time_per_iteration": 2.6092536449432373 + }, + { + "auxiliary_loss_clip": 0.01262711, + "auxiliary_loss_mlp": 0.00303306, + "balance_loss_clip": 1.04587007, + "balance_loss_mlp": 0.27909419, + "epoch": 0.5483240643318803, + "flos": 25410318900480.0, + "grad_norm": 3.93295767342818, + "language_loss": 0.88129234, + "learning_rate": 1.7844629034233604e-06, + "loss": 0.89695251, + "num_input_tokens_seen": 196533860, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.24206543, + "step": 9120, + "time_per_iteration": 2.6448616981506348 + }, + { + "auxiliary_loss_clip": 0.01272282, + "auxiliary_loss_mlp": 0.0029982, + "balance_loss_clip": 1.05171394, + "balance_loss_mlp": 0.27501267, + "epoch": 0.5483841875845483, + "flos": 21466896272640.0, + "grad_norm": 144.3474259681268, + "language_loss": 0.88362586, + "learning_rate": 1.7840757140516455e-06, + "loss": 0.89934677, + "num_input_tokens_seen": 196551305, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.24816895, + "step": 9121, + "time_per_iteration": 2.6565299034118652 + }, + { + "auxiliary_loss_clip": 0.01273035, + "auxiliary_loss_mlp": 0.00328627, + "balance_loss_clip": 1.04824734, + "balance_loss_mlp": 0.30368811, + "epoch": 0.5484443108372163, + "flos": 24747263792640.0, + "grad_norm": 29.258307856595916, + "language_loss": 0.68601632, + "learning_rate": 1.7836885328678008e-06, + "loss": 0.70203292, + "num_input_tokens_seen": 196569420, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.24963379, + "step": 9122, + "time_per_iteration": 2.6595962047576904 + }, + { + "auxiliary_loss_clip": 0.01292348, + "auxiliary_loss_mlp": 0.00310593, + "balance_loss_clip": 1.06834745, + "balance_loss_mlp": 0.28623879, + "epoch": 0.5485044340898843, + "flos": 25375377945600.0, + "grad_norm": 205.1266470812938, + "language_loss": 0.78076065, + "learning_rate": 1.7833013598865084e-06, + "loss": 0.79679012, + "num_input_tokens_seen": 196590610, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.24353027, + "step": 9123, + "time_per_iteration": 2.6486737728118896 + }, + { + "auxiliary_loss_clip": 0.01314406, + "auxiliary_loss_mlp": 0.00306953, + "balance_loss_clip": 1.08778679, + "balance_loss_mlp": 0.2821219, + "epoch": 0.5485645573425523, + "flos": 12641167370880.0, + "grad_norm": 8.621156912917927, + "language_loss": 0.89834213, + "learning_rate": 1.7829141951224505e-06, + "loss": 0.91455579, + "num_input_tokens_seen": 196606495, + "router_z_loss_clip": 2.265625, + "router_z_loss_mlp": 0.24841309, + "step": 9124, + "time_per_iteration": 2.602525234222412 + }, + { + "auxiliary_loss_clip": 0.01301095, + "auxiliary_loss_mlp": 0.00291905, + "balance_loss_clip": 1.07207584, + "balance_loss_mlp": 0.2687546, + "epoch": 0.5486246805952202, + "flos": 28329425383680.0, + "grad_norm": 4.669355929212493, + "language_loss": 0.86606896, + "learning_rate": 1.7825270385903075e-06, + "loss": 0.88199902, + "num_input_tokens_seen": 196626365, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.23168945, + "step": 9125, + "time_per_iteration": 2.669339418411255 + }, + { + "auxiliary_loss_clip": 0.01301777, + "auxiliary_loss_mlp": 0.0030043, + "balance_loss_clip": 1.07182312, + "balance_loss_mlp": 0.27414441, + "epoch": 0.5486848038478882, + "flos": 16800017817600.0, + "grad_norm": 6.516661330397307, + "language_loss": 0.84544933, + "learning_rate": 1.7821398903047617e-06, + "loss": 0.86147135, + "num_input_tokens_seen": 196644465, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.26281738, + "step": 9126, + "time_per_iteration": 2.6191372871398926 + }, + { + "auxiliary_loss_clip": 0.01290388, + "auxiliary_loss_mlp": 0.00304524, + "balance_loss_clip": 1.06338656, + "balance_loss_mlp": 0.27832198, + "epoch": 0.5487449271005561, + "flos": 17236224581760.0, + "grad_norm": 29.94944045710538, + "language_loss": 0.78144944, + "learning_rate": 1.7817527502804928e-06, + "loss": 0.79739851, + "num_input_tokens_seen": 196659160, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.26196289, + "step": 9127, + "time_per_iteration": 2.593217372894287 + }, + { + "auxiliary_loss_clip": 0.01306705, + "auxiliary_loss_mlp": 0.00335745, + "balance_loss_clip": 1.07472849, + "balance_loss_mlp": 0.30789781, + "epoch": 0.5488050503532241, + "flos": 17340867878400.0, + "grad_norm": 2.672175437555318, + "language_loss": 0.89926326, + "learning_rate": 1.781365618532181e-06, + "loss": 0.9156878, + "num_input_tokens_seen": 196677410, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.27844238, + "step": 9128, + "time_per_iteration": 2.651442289352417 + }, + { + "auxiliary_loss_clip": 0.01313337, + "auxiliary_loss_mlp": 0.00322787, + "balance_loss_clip": 1.08180988, + "balance_loss_mlp": 0.29597634, + "epoch": 0.548865173605892, + "flos": 17239169496960.0, + "grad_norm": 62.027072442041074, + "language_loss": 0.80828667, + "learning_rate": 1.7809784950745078e-06, + "loss": 0.82464796, + "num_input_tokens_seen": 196696765, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.26831055, + "step": 9129, + "time_per_iteration": 2.6827189922332764 + }, + { + "auxiliary_loss_clip": 0.01329655, + "auxiliary_loss_mlp": 0.00314936, + "balance_loss_clip": 1.09533834, + "balance_loss_mlp": 0.2869457, + "epoch": 0.5489252968585601, + "flos": 17456716218240.0, + "grad_norm": 3.9277953150102674, + "language_loss": 0.76564896, + "learning_rate": 1.7805913799221511e-06, + "loss": 0.78209484, + "num_input_tokens_seen": 196714895, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.27990723, + "step": 9130, + "time_per_iteration": 2.656524658203125 + }, + { + "auxiliary_loss_clip": 0.01305543, + "auxiliary_loss_mlp": 0.00295275, + "balance_loss_clip": 1.07727206, + "balance_loss_mlp": 0.26914448, + "epoch": 0.548985420111228, + "flos": 26323383646080.0, + "grad_norm": 3.495801944024294, + "language_loss": 0.71826047, + "learning_rate": 1.7802042730897915e-06, + "loss": 0.73426861, + "num_input_tokens_seen": 196735510, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.26135254, + "step": 9131, + "time_per_iteration": 2.7065329551696777 + }, + { + "auxiliary_loss_clip": 0.01314302, + "auxiliary_loss_mlp": 0.00303627, + "balance_loss_clip": 1.0880512, + "balance_loss_mlp": 0.27521926, + "epoch": 0.549045543363896, + "flos": 18693730748160.0, + "grad_norm": 21.692299471016742, + "language_loss": 0.83122849, + "learning_rate": 1.7798171745921084e-06, + "loss": 0.84740782, + "num_input_tokens_seen": 196752855, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.28393555, + "step": 9132, + "time_per_iteration": 2.600705146789551 + }, + { + "auxiliary_loss_clip": 0.0130572, + "auxiliary_loss_mlp": 0.0030376, + "balance_loss_clip": 1.07445431, + "balance_loss_mlp": 0.27839184, + "epoch": 0.5491056666165639, + "flos": 24717386655360.0, + "grad_norm": 5.489185774581199, + "language_loss": 0.89452994, + "learning_rate": 1.7794300844437795e-06, + "loss": 0.91062468, + "num_input_tokens_seen": 196772230, + "router_z_loss_clip": 2.31054688, + "router_z_loss_mlp": 0.25354004, + "step": 9133, + "time_per_iteration": 2.699733257293701 + }, + { + "auxiliary_loss_clip": 0.01299117, + "auxiliary_loss_mlp": 0.00306885, + "balance_loss_clip": 1.07030737, + "balance_loss_mlp": 0.28080219, + "epoch": 0.5491657898692319, + "flos": 21576926609280.0, + "grad_norm": 4.231228282388851, + "language_loss": 0.78270668, + "learning_rate": 1.7790430026594841e-06, + "loss": 0.79876667, + "num_input_tokens_seen": 196790405, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.26074219, + "step": 9134, + "time_per_iteration": 2.6311042308807373 + }, + { + "auxiliary_loss_clip": 0.01335258, + "auxiliary_loss_mlp": 0.00321752, + "balance_loss_clip": 1.10035706, + "balance_loss_mlp": 0.29444104, + "epoch": 0.5492259131219, + "flos": 50476432746240.0, + "grad_norm": 4.349115837916499, + "language_loss": 0.66882795, + "learning_rate": 1.7786559292539004e-06, + "loss": 0.68539804, + "num_input_tokens_seen": 196813785, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.27319336, + "step": 9135, + "time_per_iteration": 2.890806198120117 + }, + { + "auxiliary_loss_clip": 0.01312901, + "auxiliary_loss_mlp": 0.00300633, + "balance_loss_clip": 1.07737947, + "balance_loss_mlp": 0.27265453, + "epoch": 0.5492860363745679, + "flos": 25119262995840.0, + "grad_norm": 4.737822579003588, + "language_loss": 0.83067787, + "learning_rate": 1.7782688642417058e-06, + "loss": 0.8468132, + "num_input_tokens_seen": 196834390, + "router_z_loss_clip": 2.35546875, + "router_z_loss_mlp": 0.27978516, + "step": 9136, + "time_per_iteration": 4.102610111236572 + }, + { + "auxiliary_loss_clip": 0.01329257, + "auxiliary_loss_mlp": 0.00313336, + "balance_loss_clip": 1.0939492, + "balance_loss_mlp": 0.28510767, + "epoch": 0.5493461596272359, + "flos": 22633777497600.0, + "grad_norm": 14.639623698518033, + "language_loss": 0.76795167, + "learning_rate": 1.7778818076375781e-06, + "loss": 0.78437763, + "num_input_tokens_seen": 196853290, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.28271484, + "step": 9137, + "time_per_iteration": 2.7470905780792236 + }, + { + "auxiliary_loss_clip": 0.01276138, + "auxiliary_loss_mlp": 0.0013099, + "balance_loss_clip": 1.12016904, + "balance_loss_mlp": 0.12302645, + "epoch": 0.5494062828799038, + "flos": 66151800754560.0, + "grad_norm": 0.7306641986291927, + "language_loss": 0.64501464, + "learning_rate": 1.7774947594561947e-06, + "loss": 0.65908593, + "num_input_tokens_seen": 196913120, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.07958984, + "step": 9138, + "time_per_iteration": 4.722402334213257 + }, + { + "auxiliary_loss_clip": 0.01339702, + "auxiliary_loss_mlp": 0.00301864, + "balance_loss_clip": 1.1005367, + "balance_loss_mlp": 0.27314654, + "epoch": 0.5494664061325718, + "flos": 21105958458240.0, + "grad_norm": 8.780098686792693, + "language_loss": 0.81385732, + "learning_rate": 1.7771077197122321e-06, + "loss": 0.83027291, + "num_input_tokens_seen": 196931530, + "router_z_loss_clip": 2.38867188, + "router_z_loss_mlp": 0.28735352, + "step": 9139, + "time_per_iteration": 2.6796655654907227 + }, + { + "auxiliary_loss_clip": 0.01313972, + "auxiliary_loss_mlp": 0.00267864, + "balance_loss_clip": 1.08418334, + "balance_loss_mlp": 0.24423666, + "epoch": 0.5495265293852397, + "flos": 14392566616320.0, + "grad_norm": 8.27195513624354, + "language_loss": 0.80774897, + "learning_rate": 1.7767206884203672e-06, + "loss": 0.82356733, + "num_input_tokens_seen": 196949430, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.23657227, + "step": 9140, + "time_per_iteration": 3.9989945888519287 + }, + { + "auxiliary_loss_clip": 0.01308288, + "auxiliary_loss_mlp": 0.00274622, + "balance_loss_clip": 1.07937431, + "balance_loss_mlp": 0.24918295, + "epoch": 0.5495866526379077, + "flos": 25549148966400.0, + "grad_norm": 12.440150855391586, + "language_loss": 0.85307562, + "learning_rate": 1.7763336655952762e-06, + "loss": 0.86890471, + "num_input_tokens_seen": 196968265, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.2545166, + "step": 9141, + "time_per_iteration": 2.658358573913574 + }, + { + "auxiliary_loss_clip": 0.01306237, + "auxiliary_loss_mlp": 0.0030661, + "balance_loss_clip": 1.08368325, + "balance_loss_mlp": 0.27978802, + "epoch": 0.5496467758905756, + "flos": 21317256213120.0, + "grad_norm": 154.09137623612423, + "language_loss": 0.81037712, + "learning_rate": 1.7759466512516346e-06, + "loss": 0.82650554, + "num_input_tokens_seen": 196984930, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.26794434, + "step": 9142, + "time_per_iteration": 2.6280899047851562 + }, + { + "auxiliary_loss_clip": 0.01334419, + "auxiliary_loss_mlp": 0.00272616, + "balance_loss_clip": 1.09971356, + "balance_loss_mlp": 0.24493527, + "epoch": 0.5497068991432437, + "flos": 22233086305920.0, + "grad_norm": 15.312549782806851, + "language_loss": 0.84078789, + "learning_rate": 1.7755596454041192e-06, + "loss": 0.85685825, + "num_input_tokens_seen": 197002320, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.27709961, + "step": 9143, + "time_per_iteration": 2.6464383602142334 + }, + { + "auxiliary_loss_clip": 0.01319451, + "auxiliary_loss_mlp": 0.00301205, + "balance_loss_clip": 1.09562302, + "balance_loss_mlp": 0.2732389, + "epoch": 0.5497670223959116, + "flos": 18479093028480.0, + "grad_norm": 5.816449106336075, + "language_loss": 0.89898193, + "learning_rate": 1.7751726480674044e-06, + "loss": 0.91518843, + "num_input_tokens_seen": 197020825, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.27966309, + "step": 9144, + "time_per_iteration": 2.606168270111084 + }, + { + "auxiliary_loss_clip": 0.01326159, + "auxiliary_loss_mlp": 0.00304636, + "balance_loss_clip": 1.0966785, + "balance_loss_mlp": 0.27651471, + "epoch": 0.5498271456485796, + "flos": 29205107049600.0, + "grad_norm": 2.8367914741648463, + "language_loss": 0.78123039, + "learning_rate": 1.7747856592561645e-06, + "loss": 0.7975384, + "num_input_tokens_seen": 197040450, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.28137207, + "step": 9145, + "time_per_iteration": 4.055703639984131 + }, + { + "auxiliary_loss_clip": 0.01315917, + "auxiliary_loss_mlp": 0.00292988, + "balance_loss_clip": 1.09321308, + "balance_loss_mlp": 0.26761991, + "epoch": 0.5498872689012475, + "flos": 34824372664320.0, + "grad_norm": 2.94192085734235, + "language_loss": 0.77478302, + "learning_rate": 1.774398678985076e-06, + "loss": 0.79087204, + "num_input_tokens_seen": 197063930, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.25390625, + "step": 9146, + "time_per_iteration": 2.746192693710327 + }, + { + "auxiliary_loss_clip": 0.01308516, + "auxiliary_loss_mlp": 0.00264499, + "balance_loss_clip": 1.08429635, + "balance_loss_mlp": 0.23942924, + "epoch": 0.5499473921539155, + "flos": 25921938268800.0, + "grad_norm": 20.23979727648267, + "language_loss": 0.70443743, + "learning_rate": 1.7740117072688113e-06, + "loss": 0.72016764, + "num_input_tokens_seen": 197082660, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.25048828, + "step": 9147, + "time_per_iteration": 2.6413230895996094 + }, + { + "auxiliary_loss_clip": 0.0133642, + "auxiliary_loss_mlp": 0.00279675, + "balance_loss_clip": 1.10477388, + "balance_loss_mlp": 0.25167239, + "epoch": 0.5500075154065835, + "flos": 22273701609600.0, + "grad_norm": 66.76773150809488, + "language_loss": 0.88238966, + "learning_rate": 1.7736247441220458e-06, + "loss": 0.89855063, + "num_input_tokens_seen": 197100675, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.28015137, + "step": 9148, + "time_per_iteration": 2.6270999908447266 + }, + { + "auxiliary_loss_clip": 0.01339512, + "auxiliary_loss_mlp": 0.00289915, + "balance_loss_clip": 1.10793793, + "balance_loss_mlp": 0.26079232, + "epoch": 0.5500676386592515, + "flos": 28037507552640.0, + "grad_norm": 4.927443629920093, + "language_loss": 0.86861253, + "learning_rate": 1.773237789559453e-06, + "loss": 0.88490689, + "num_input_tokens_seen": 197121320, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.2911377, + "step": 9149, + "time_per_iteration": 2.71547532081604 + }, + { + "auxiliary_loss_clip": 0.0131026, + "auxiliary_loss_mlp": 0.0028088, + "balance_loss_clip": 1.08838749, + "balance_loss_mlp": 0.25511879, + "epoch": 0.5501277619119195, + "flos": 23914819123200.0, + "grad_norm": 12.093015967114729, + "language_loss": 0.80577302, + "learning_rate": 1.7728508435957052e-06, + "loss": 0.82168442, + "num_input_tokens_seen": 197138965, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.25720215, + "step": 9150, + "time_per_iteration": 2.649186134338379 + }, + { + "auxiliary_loss_clip": 0.01324079, + "auxiliary_loss_mlp": 0.00285789, + "balance_loss_clip": 1.09371734, + "balance_loss_mlp": 0.25708386, + "epoch": 0.5501878851645874, + "flos": 20923783655040.0, + "grad_norm": 4.2247317099975445, + "language_loss": 0.82995677, + "learning_rate": 1.772463906245477e-06, + "loss": 0.84605545, + "num_input_tokens_seen": 197156460, + "router_z_loss_clip": 2.30273438, + "router_z_loss_mlp": 0.28723145, + "step": 9151, + "time_per_iteration": 2.6268341541290283 + }, + { + "auxiliary_loss_clip": 0.01310582, + "auxiliary_loss_mlp": 0.00312786, + "balance_loss_clip": 1.08797705, + "balance_loss_mlp": 0.28524864, + "epoch": 0.5502480084172554, + "flos": 20665298407680.0, + "grad_norm": 11.390696422408856, + "language_loss": 0.81993502, + "learning_rate": 1.7720769775234394e-06, + "loss": 0.83616877, + "num_input_tokens_seen": 197175140, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.27563477, + "step": 9152, + "time_per_iteration": 2.6459035873413086 + }, + { + "auxiliary_loss_clip": 0.01302966, + "auxiliary_loss_mlp": 0.00257982, + "balance_loss_clip": 1.0830009, + "balance_loss_mlp": 0.23334172, + "epoch": 0.5503081316699233, + "flos": 26432552056320.0, + "grad_norm": 11.16006621609473, + "language_loss": 0.89670479, + "learning_rate": 1.7716900574442662e-06, + "loss": 0.9123143, + "num_input_tokens_seen": 197194345, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.24633789, + "step": 9153, + "time_per_iteration": 2.7046430110931396 + }, + { + "auxiliary_loss_clip": 0.01306239, + "auxiliary_loss_mlp": 0.00240547, + "balance_loss_clip": 1.08520174, + "balance_loss_mlp": 0.21457113, + "epoch": 0.5503682549225913, + "flos": 30629144718720.0, + "grad_norm": 2.518057211344147, + "language_loss": 0.8193754, + "learning_rate": 1.7713031460226294e-06, + "loss": 0.83484328, + "num_input_tokens_seen": 197215535, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.25964355, + "step": 9154, + "time_per_iteration": 2.695378303527832 + }, + { + "auxiliary_loss_clip": 0.01312724, + "auxiliary_loss_mlp": 0.00295995, + "balance_loss_clip": 1.08536291, + "balance_loss_mlp": 0.26720622, + "epoch": 0.5504283781752592, + "flos": 22565439872640.0, + "grad_norm": 10.63711889857913, + "language_loss": 0.80629647, + "learning_rate": 1.770916243273199e-06, + "loss": 0.82238364, + "num_input_tokens_seen": 197234945, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.28771973, + "step": 9155, + "time_per_iteration": 2.7402281761169434 + }, + { + "auxiliary_loss_clip": 0.01291636, + "auxiliary_loss_mlp": 0.00102056, + "balance_loss_clip": 1.14076102, + "balance_loss_mlp": 0.09523746, + "epoch": 0.5504885014279273, + "flos": 67901009270400.0, + "grad_norm": 0.7231750848706211, + "language_loss": 0.55107206, + "learning_rate": 1.7705293492106483e-06, + "loss": 0.56500894, + "num_input_tokens_seen": 197302285, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.06835938, + "step": 9156, + "time_per_iteration": 3.3186023235321045 + }, + { + "auxiliary_loss_clip": 0.01303237, + "auxiliary_loss_mlp": 0.00253273, + "balance_loss_clip": 1.08293915, + "balance_loss_mlp": 0.2276428, + "epoch": 0.5505486246805952, + "flos": 22450058409600.0, + "grad_norm": 12.310861297827715, + "language_loss": 0.88834929, + "learning_rate": 1.7701424638496475e-06, + "loss": 0.90391445, + "num_input_tokens_seen": 197321575, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.25646973, + "step": 9157, + "time_per_iteration": 2.7821414470672607 + }, + { + "auxiliary_loss_clip": 0.01322059, + "auxiliary_loss_mlp": 0.00285425, + "balance_loss_clip": 1.08813512, + "balance_loss_mlp": 0.25779179, + "epoch": 0.5506087479332632, + "flos": 26906896085760.0, + "grad_norm": 78.16251524781127, + "language_loss": 0.86535513, + "learning_rate": 1.7697555872048677e-06, + "loss": 0.88142997, + "num_input_tokens_seen": 197340255, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.27661133, + "step": 9158, + "time_per_iteration": 2.6599061489105225 + }, + { + "auxiliary_loss_clip": 0.01323879, + "auxiliary_loss_mlp": 0.00235358, + "balance_loss_clip": 1.09881949, + "balance_loss_mlp": 0.21025217, + "epoch": 0.5506688711859311, + "flos": 22930256355840.0, + "grad_norm": 3.2497958930916906, + "language_loss": 0.77387947, + "learning_rate": 1.769368719290979e-06, + "loss": 0.78947186, + "num_input_tokens_seen": 197360360, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.25097656, + "step": 9159, + "time_per_iteration": 2.713042974472046 + }, + { + "auxiliary_loss_clip": 0.01327442, + "auxiliary_loss_mlp": 0.00277876, + "balance_loss_clip": 1.09161806, + "balance_loss_mlp": 0.24970675, + "epoch": 0.5507289944385991, + "flos": 29606408772480.0, + "grad_norm": 67.29417061694888, + "language_loss": 0.78271234, + "learning_rate": 1.7689818601226516e-06, + "loss": 0.79876554, + "num_input_tokens_seen": 197381905, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.28210449, + "step": 9160, + "time_per_iteration": 2.7726943492889404 + }, + { + "auxiliary_loss_clip": 0.01308601, + "auxiliary_loss_mlp": 0.00259991, + "balance_loss_clip": 1.0829308, + "balance_loss_mlp": 0.23365748, + "epoch": 0.5507891176912671, + "flos": 15334431091200.0, + "grad_norm": 79.45411140024238, + "language_loss": 0.79152691, + "learning_rate": 1.7685950097145552e-06, + "loss": 0.80721283, + "num_input_tokens_seen": 197398555, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.2635498, + "step": 9161, + "time_per_iteration": 2.640010356903076 + }, + { + "auxiliary_loss_clip": 0.01319031, + "auxiliary_loss_mlp": 0.0025429, + "balance_loss_clip": 1.09149086, + "balance_loss_mlp": 0.22709808, + "epoch": 0.5508492409439351, + "flos": 26578313447040.0, + "grad_norm": 4.695628299605547, + "language_loss": 0.76584613, + "learning_rate": 1.768208168081359e-06, + "loss": 0.78157938, + "num_input_tokens_seen": 197419630, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.27172852, + "step": 9162, + "time_per_iteration": 2.714024066925049 + }, + { + "auxiliary_loss_clip": 0.01306231, + "auxiliary_loss_mlp": 0.00275691, + "balance_loss_clip": 1.08569026, + "balance_loss_mlp": 0.24936962, + "epoch": 0.5509093641966031, + "flos": 25443428261760.0, + "grad_norm": 9.085908287400482, + "language_loss": 0.89989579, + "learning_rate": 1.767821335237733e-06, + "loss": 0.91571504, + "num_input_tokens_seen": 197438480, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.26293945, + "step": 9163, + "time_per_iteration": 2.640822649002075 + }, + { + "auxiliary_loss_clip": 0.01315036, + "auxiliary_loss_mlp": 0.00243383, + "balance_loss_clip": 1.09154272, + "balance_loss_mlp": 0.21900505, + "epoch": 0.550969487449271, + "flos": 18698543170560.0, + "grad_norm": 6.295426154411486, + "language_loss": 0.86220992, + "learning_rate": 1.7674345111983441e-06, + "loss": 0.87779415, + "num_input_tokens_seen": 197456755, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.24401855, + "step": 9164, + "time_per_iteration": 2.6175618171691895 + }, + { + "auxiliary_loss_clip": 0.01310204, + "auxiliary_loss_mlp": 0.00261869, + "balance_loss_clip": 1.08100617, + "balance_loss_mlp": 0.23383066, + "epoch": 0.551029610701939, + "flos": 22708723224960.0, + "grad_norm": 8.870982725408156, + "language_loss": 0.81305289, + "learning_rate": 1.767047695977863e-06, + "loss": 0.82877362, + "num_input_tokens_seen": 197475530, + "router_z_loss_clip": 2.29101562, + "router_z_loss_mlp": 0.28015137, + "step": 9165, + "time_per_iteration": 2.6366217136383057 + }, + { + "auxiliary_loss_clip": 0.01301587, + "auxiliary_loss_mlp": 0.00256727, + "balance_loss_clip": 1.07859814, + "balance_loss_mlp": 0.23342201, + "epoch": 0.5510897339546069, + "flos": 12420496166400.0, + "grad_norm": 3.2006937568715688, + "language_loss": 0.86190248, + "learning_rate": 1.7666608895909563e-06, + "loss": 0.87748563, + "num_input_tokens_seen": 197490835, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.2331543, + "step": 9166, + "time_per_iteration": 2.622159719467163 + }, + { + "auxiliary_loss_clip": 0.01308649, + "auxiliary_loss_mlp": 0.00270908, + "balance_loss_clip": 1.0822196, + "balance_loss_mlp": 0.24315622, + "epoch": 0.5511498572072749, + "flos": 18770579896320.0, + "grad_norm": 6.583040284110605, + "language_loss": 0.83517063, + "learning_rate": 1.7662740920522913e-06, + "loss": 0.85096622, + "num_input_tokens_seen": 197508770, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.27734375, + "step": 9167, + "time_per_iteration": 2.6075942516326904 + }, + { + "auxiliary_loss_clip": 0.01297139, + "auxiliary_loss_mlp": 0.00273689, + "balance_loss_clip": 1.07480359, + "balance_loss_mlp": 0.24741569, + "epoch": 0.5512099804599428, + "flos": 19573326996480.0, + "grad_norm": 6.550400884742833, + "language_loss": 0.89343631, + "learning_rate": 1.7658873033765374e-06, + "loss": 0.90914464, + "num_input_tokens_seen": 197527340, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.26318359, + "step": 9168, + "time_per_iteration": 2.6348280906677246 + }, + { + "auxiliary_loss_clip": 0.01316508, + "auxiliary_loss_mlp": 0.00295906, + "balance_loss_clip": 1.08917332, + "balance_loss_mlp": 0.26996624, + "epoch": 0.5512701037126109, + "flos": 26245600744320.0, + "grad_norm": 8.583166433478382, + "language_loss": 0.76868284, + "learning_rate": 1.7655005235783591e-06, + "loss": 0.78480697, + "num_input_tokens_seen": 197547280, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.25939941, + "step": 9169, + "time_per_iteration": 2.672537326812744 + }, + { + "auxiliary_loss_clip": 0.01286041, + "auxiliary_loss_mlp": 0.00275363, + "balance_loss_clip": 1.06722951, + "balance_loss_mlp": 0.25156915, + "epoch": 0.5513302269652788, + "flos": 21945406279680.0, + "grad_norm": 42.40284400924623, + "language_loss": 0.92545187, + "learning_rate": 1.7651137526724251e-06, + "loss": 0.94106597, + "num_input_tokens_seen": 197565045, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.23791504, + "step": 9170, + "time_per_iteration": 2.6986801624298096 + }, + { + "auxiliary_loss_clip": 0.01288774, + "auxiliary_loss_mlp": 0.00044896, + "balance_loss_clip": 1.13132024, + "balance_loss_mlp": 0.03960316, + "epoch": 0.5513903502179468, + "flos": 68235948616320.0, + "grad_norm": 0.7856616829754699, + "language_loss": 0.59614694, + "learning_rate": 1.7647269906734017e-06, + "loss": 0.6094836, + "num_input_tokens_seen": 197625005, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.05297852, + "step": 9171, + "time_per_iteration": 3.1536049842834473 + }, + { + "auxiliary_loss_clip": 0.01303453, + "auxiliary_loss_mlp": 0.00332118, + "balance_loss_clip": 1.07807517, + "balance_loss_mlp": 0.30481866, + "epoch": 0.5514504734706147, + "flos": 18734238311040.0, + "grad_norm": 311.2940947127057, + "language_loss": 0.78113705, + "learning_rate": 1.7643402375959533e-06, + "loss": 0.79749274, + "num_input_tokens_seen": 197645050, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.27270508, + "step": 9172, + "time_per_iteration": 2.7170350551605225 + }, + { + "auxiliary_loss_clip": 0.01298283, + "auxiliary_loss_mlp": 0.0028134, + "balance_loss_clip": 1.07160568, + "balance_loss_mlp": 0.25526857, + "epoch": 0.5515105967232827, + "flos": 22270972176000.0, + "grad_norm": 6.594829126546399, + "language_loss": 0.83727586, + "learning_rate": 1.7639534934547474e-06, + "loss": 0.85307205, + "num_input_tokens_seen": 197663910, + "router_z_loss_clip": 2.265625, + "router_z_loss_mlp": 0.26086426, + "step": 9173, + "time_per_iteration": 2.680867910385132 + }, + { + "auxiliary_loss_clip": 0.01302401, + "auxiliary_loss_mlp": 0.0026227, + "balance_loss_clip": 1.07288718, + "balance_loss_mlp": 0.23619844, + "epoch": 0.5515707199759508, + "flos": 22557682535040.0, + "grad_norm": 5.626932495858452, + "language_loss": 0.82390982, + "learning_rate": 1.7635667582644484e-06, + "loss": 0.83955652, + "num_input_tokens_seen": 197681580, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.26098633, + "step": 9174, + "time_per_iteration": 2.666444778442383 + }, + { + "auxiliary_loss_clip": 0.0129562, + "auxiliary_loss_mlp": 0.00274139, + "balance_loss_clip": 1.06927323, + "balance_loss_mlp": 0.24921221, + "epoch": 0.5516308432286187, + "flos": 28291072636800.0, + "grad_norm": 9.372416410796454, + "language_loss": 0.80398566, + "learning_rate": 1.7631800320397217e-06, + "loss": 0.81968331, + "num_input_tokens_seen": 197702095, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.24938965, + "step": 9175, + "time_per_iteration": 2.726780414581299 + }, + { + "auxiliary_loss_clip": 0.01288879, + "auxiliary_loss_mlp": 0.00333016, + "balance_loss_clip": 1.06489944, + "balance_loss_mlp": 0.3047514, + "epoch": 0.5516909664812867, + "flos": 18764474584320.0, + "grad_norm": 5.497002374256045, + "language_loss": 0.77134836, + "learning_rate": 1.7627933147952318e-06, + "loss": 0.78756732, + "num_input_tokens_seen": 197720720, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.28259277, + "step": 9176, + "time_per_iteration": 2.615628480911255 + }, + { + "auxiliary_loss_clip": 0.01285112, + "auxiliary_loss_mlp": 0.00290729, + "balance_loss_clip": 1.06417811, + "balance_loss_mlp": 0.2651701, + "epoch": 0.5517510897339546, + "flos": 27740346336000.0, + "grad_norm": 7076.325200969735, + "language_loss": 0.78793836, + "learning_rate": 1.7624066065456435e-06, + "loss": 0.80369675, + "num_input_tokens_seen": 197741820, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.2557373, + "step": 9177, + "time_per_iteration": 2.7300925254821777 + }, + { + "auxiliary_loss_clip": 0.01307531, + "auxiliary_loss_mlp": 0.00317453, + "balance_loss_clip": 1.08051252, + "balance_loss_mlp": 0.29058355, + "epoch": 0.5518112129866226, + "flos": 18404470523520.0, + "grad_norm": 14.344623183176186, + "language_loss": 0.8654713, + "learning_rate": 1.7620199073056204e-06, + "loss": 0.88172114, + "num_input_tokens_seen": 197759160, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.26879883, + "step": 9178, + "time_per_iteration": 4.1321656703948975 + }, + { + "auxiliary_loss_clip": 0.0130928, + "auxiliary_loss_mlp": 0.00314252, + "balance_loss_clip": 1.0788511, + "balance_loss_mlp": 0.28731048, + "epoch": 0.5518713362392905, + "flos": 25082670015360.0, + "grad_norm": 17.80660709662402, + "language_loss": 0.81235468, + "learning_rate": 1.761633217089826e-06, + "loss": 0.82859004, + "num_input_tokens_seen": 197779760, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.26940918, + "step": 9179, + "time_per_iteration": 2.663072347640991 + }, + { + "auxiliary_loss_clip": 0.01298028, + "auxiliary_loss_mlp": 0.00286091, + "balance_loss_clip": 1.07008374, + "balance_loss_mlp": 0.26102108, + "epoch": 0.5519314594919585, + "flos": 36538999361280.0, + "grad_norm": 4.8476316075375925, + "language_loss": 0.75881416, + "learning_rate": 1.761246535912924e-06, + "loss": 0.77465534, + "num_input_tokens_seen": 197801545, + "router_z_loss_clip": 2.28320312, + "router_z_loss_mlp": 0.25085449, + "step": 9180, + "time_per_iteration": 4.183887958526611 + }, + { + "auxiliary_loss_clip": 0.01292639, + "auxiliary_loss_mlp": 0.0029655, + "balance_loss_clip": 1.06576657, + "balance_loss_mlp": 0.27235037, + "epoch": 0.5519915827446265, + "flos": 20448613612800.0, + "grad_norm": 7.6142765212769135, + "language_loss": 0.76859635, + "learning_rate": 1.7608598637895776e-06, + "loss": 0.7844882, + "num_input_tokens_seen": 197820760, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.24194336, + "step": 9181, + "time_per_iteration": 2.641159772872925 + }, + { + "auxiliary_loss_clip": 0.01295173, + "auxiliary_loss_mlp": 0.0031506, + "balance_loss_clip": 1.06692433, + "balance_loss_mlp": 0.28540075, + "epoch": 0.5520517059972945, + "flos": 23768052151680.0, + "grad_norm": 4.13147972191409, + "language_loss": 0.85825861, + "learning_rate": 1.7604732007344486e-06, + "loss": 0.87436098, + "num_input_tokens_seen": 197840195, + "router_z_loss_clip": 2.28320312, + "router_z_loss_mlp": 0.29663086, + "step": 9182, + "time_per_iteration": 4.2955238819122314 + }, + { + "auxiliary_loss_clip": 0.01298275, + "auxiliary_loss_mlp": 0.00289883, + "balance_loss_clip": 1.06666315, + "balance_loss_mlp": 0.26427692, + "epoch": 0.5521118292499624, + "flos": 22196457411840.0, + "grad_norm": 7.602459589189655, + "language_loss": 0.8880716, + "learning_rate": 1.7600865467622003e-06, + "loss": 0.90395319, + "num_input_tokens_seen": 197859475, + "router_z_loss_clip": 2.31835938, + "router_z_loss_mlp": 0.25634766, + "step": 9183, + "time_per_iteration": 2.6833670139312744 + }, + { + "auxiliary_loss_clip": 0.01274168, + "auxiliary_loss_mlp": 0.00302811, + "balance_loss_clip": 1.0519805, + "balance_loss_mlp": 0.27683565, + "epoch": 0.5521719525026304, + "flos": 23583291569280.0, + "grad_norm": 6.858422757536044, + "language_loss": 0.7295841, + "learning_rate": 1.7596999018874936e-06, + "loss": 0.74535388, + "num_input_tokens_seen": 197879395, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.25952148, + "step": 9184, + "time_per_iteration": 2.6711316108703613 + }, + { + "auxiliary_loss_clip": 0.01279558, + "auxiliary_loss_mlp": 0.00319993, + "balance_loss_clip": 1.05669653, + "balance_loss_mlp": 0.29326668, + "epoch": 0.5522320757552983, + "flos": 26137617482880.0, + "grad_norm": 3.0799408093749046, + "language_loss": 0.8134203, + "learning_rate": 1.7593132661249917e-06, + "loss": 0.8294158, + "num_input_tokens_seen": 197900815, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.2668457, + "step": 9185, + "time_per_iteration": 2.652327537536621 + }, + { + "auxiliary_loss_clip": 0.01299563, + "auxiliary_loss_mlp": 0.00297628, + "balance_loss_clip": 1.072999, + "balance_loss_mlp": 0.27073437, + "epoch": 0.5522921990079663, + "flos": 24676160820480.0, + "grad_norm": 10.776513842133722, + "language_loss": 0.7986095, + "learning_rate": 1.7589266394893536e-06, + "loss": 0.81458145, + "num_input_tokens_seen": 197918985, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.26879883, + "step": 9186, + "time_per_iteration": 2.6703619956970215 + }, + { + "auxiliary_loss_clip": 0.01310638, + "auxiliary_loss_mlp": 0.00286202, + "balance_loss_clip": 1.07704663, + "balance_loss_mlp": 0.2595346, + "epoch": 0.5523523222606344, + "flos": 22748153379840.0, + "grad_norm": 15.938584429873675, + "language_loss": 0.73417497, + "learning_rate": 1.7585400219952421e-06, + "loss": 0.75014341, + "num_input_tokens_seen": 197937725, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.2668457, + "step": 9187, + "time_per_iteration": 2.6636083126068115 + }, + { + "auxiliary_loss_clip": 0.0130582, + "auxiliary_loss_mlp": 0.00277931, + "balance_loss_clip": 1.07700872, + "balance_loss_mlp": 0.2523244, + "epoch": 0.5524124455133023, + "flos": 19755825022080.0, + "grad_norm": 11.569137702379281, + "language_loss": 0.84710598, + "learning_rate": 1.758153413657318e-06, + "loss": 0.86294353, + "num_input_tokens_seen": 197955635, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.25598145, + "step": 9188, + "time_per_iteration": 4.030259370803833 + }, + { + "auxiliary_loss_clip": 0.01317512, + "auxiliary_loss_mlp": 0.00315332, + "balance_loss_clip": 1.08652353, + "balance_loss_mlp": 0.2871156, + "epoch": 0.5524725687659703, + "flos": 23294821443840.0, + "grad_norm": 17.7918669835789, + "language_loss": 0.89864779, + "learning_rate": 1.7577668144902394e-06, + "loss": 0.91497624, + "num_input_tokens_seen": 197974490, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.28222656, + "step": 9189, + "time_per_iteration": 2.658393144607544 + }, + { + "auxiliary_loss_clip": 0.01314498, + "auxiliary_loss_mlp": 0.00297572, + "balance_loss_clip": 1.08721638, + "balance_loss_mlp": 0.27139345, + "epoch": 0.5525326920186382, + "flos": 24862178378880.0, + "grad_norm": 5.13103634148809, + "language_loss": 0.82691884, + "learning_rate": 1.7573802245086684e-06, + "loss": 0.84303945, + "num_input_tokens_seen": 197995735, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.26184082, + "step": 9190, + "time_per_iteration": 2.703795909881592 + }, + { + "auxiliary_loss_clip": 0.01308368, + "auxiliary_loss_mlp": 0.00283732, + "balance_loss_clip": 1.07535601, + "balance_loss_mlp": 0.2555275, + "epoch": 0.5525928152713062, + "flos": 13735580906880.0, + "grad_norm": 20.133438063822055, + "language_loss": 0.89702153, + "learning_rate": 1.7569936437272627e-06, + "loss": 0.91294253, + "num_input_tokens_seen": 198009685, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.28198242, + "step": 9191, + "time_per_iteration": 2.586456298828125 + }, + { + "auxiliary_loss_clip": 0.01299897, + "auxiliary_loss_mlp": 0.00277749, + "balance_loss_clip": 1.07175922, + "balance_loss_mlp": 0.25183266, + "epoch": 0.5526529385239741, + "flos": 13071592045440.0, + "grad_norm": 1696.9812322580615, + "language_loss": 0.7799781, + "learning_rate": 1.7566070721606829e-06, + "loss": 0.79575455, + "num_input_tokens_seen": 198026845, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.25927734, + "step": 9192, + "time_per_iteration": 2.595831871032715 + }, + { + "auxiliary_loss_clip": 0.01295959, + "auxiliary_loss_mlp": 0.00263781, + "balance_loss_clip": 1.07334089, + "balance_loss_mlp": 0.23931964, + "epoch": 0.5527130617766421, + "flos": 23148377694720.0, + "grad_norm": 1.875432346121547, + "language_loss": 0.81507015, + "learning_rate": 1.756220509823588e-06, + "loss": 0.8306675, + "num_input_tokens_seen": 198045275, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.24438477, + "step": 9193, + "time_per_iteration": 2.65450119972229 + }, + { + "auxiliary_loss_clip": 0.01282837, + "auxiliary_loss_mlp": 0.00275001, + "balance_loss_clip": 1.06110895, + "balance_loss_mlp": 0.25076622, + "epoch": 0.55277318502931, + "flos": 21285547482240.0, + "grad_norm": 39.78071158914756, + "language_loss": 0.85314852, + "learning_rate": 1.7558339567306344e-06, + "loss": 0.86872691, + "num_input_tokens_seen": 198065760, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.24243164, + "step": 9194, + "time_per_iteration": 2.68658447265625 + }, + { + "auxiliary_loss_clip": 0.01302903, + "auxiliary_loss_mlp": 0.00251511, + "balance_loss_clip": 1.06771278, + "balance_loss_mlp": 0.22573763, + "epoch": 0.5528333082819781, + "flos": 38324549462400.0, + "grad_norm": 5.887565768140376, + "language_loss": 0.74963689, + "learning_rate": 1.7554474128964825e-06, + "loss": 0.76518101, + "num_input_tokens_seen": 198087595, + "router_z_loss_clip": 2.34960938, + "router_z_loss_mlp": 0.25769043, + "step": 9195, + "time_per_iteration": 2.7840492725372314 + }, + { + "auxiliary_loss_clip": 0.01299972, + "auxiliary_loss_mlp": 0.00293323, + "balance_loss_clip": 1.06963313, + "balance_loss_mlp": 0.26484391, + "epoch": 0.552893431534646, + "flos": 13553621585280.0, + "grad_norm": 24.154448103604256, + "language_loss": 0.8197732, + "learning_rate": 1.7550608783357887e-06, + "loss": 0.83570617, + "num_input_tokens_seen": 198104620, + "router_z_loss_clip": 2.30078125, + "router_z_loss_mlp": 0.28491211, + "step": 9196, + "time_per_iteration": 2.6103553771972656 + }, + { + "auxiliary_loss_clip": 0.01281928, + "auxiliary_loss_mlp": 0.00265965, + "balance_loss_clip": 1.06391847, + "balance_loss_mlp": 0.2416102, + "epoch": 0.552953554787314, + "flos": 21939408708480.0, + "grad_norm": 3.754804846064917, + "language_loss": 0.82757616, + "learning_rate": 1.7546743530632115e-06, + "loss": 0.84305507, + "num_input_tokens_seen": 198123565, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.24328613, + "step": 9197, + "time_per_iteration": 2.6507315635681152 + }, + { + "auxiliary_loss_clip": 0.01284126, + "auxiliary_loss_mlp": 0.00266478, + "balance_loss_clip": 1.06308651, + "balance_loss_mlp": 0.24076498, + "epoch": 0.5530136780399819, + "flos": 43658002558080.0, + "grad_norm": 2.3452844179867545, + "language_loss": 0.81950808, + "learning_rate": 1.754287837093407e-06, + "loss": 0.8350141, + "num_input_tokens_seen": 198148270, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.25720215, + "step": 9198, + "time_per_iteration": 2.851803779602051 + }, + { + "auxiliary_loss_clip": 0.01284614, + "auxiliary_loss_mlp": 0.00271029, + "balance_loss_clip": 1.06321645, + "balance_loss_mlp": 0.24584031, + "epoch": 0.5530738012926499, + "flos": 25045502417280.0, + "grad_norm": 7.357158403293689, + "language_loss": 0.8351658, + "learning_rate": 1.7539013304410327e-06, + "loss": 0.85072219, + "num_input_tokens_seen": 198168810, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.25183105, + "step": 9199, + "time_per_iteration": 2.6823856830596924 + }, + { + "auxiliary_loss_clip": 0.01280425, + "auxiliary_loss_mlp": 0.00263957, + "balance_loss_clip": 1.05979538, + "balance_loss_mlp": 0.23982856, + "epoch": 0.553133924545318, + "flos": 16472081623680.0, + "grad_norm": 2.4453530526082377, + "language_loss": 0.69657421, + "learning_rate": 1.7535148331207443e-06, + "loss": 0.71201801, + "num_input_tokens_seen": 198186200, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.24145508, + "step": 9200, + "time_per_iteration": 2.6146469116210938 + }, + { + "auxiliary_loss_clip": 0.01319395, + "auxiliary_loss_mlp": 0.00247713, + "balance_loss_clip": 1.07931364, + "balance_loss_mlp": 0.22103378, + "epoch": 0.5531940477979859, + "flos": 24606207083520.0, + "grad_norm": 409.89672010096854, + "language_loss": 0.72058856, + "learning_rate": 1.7531283451471978e-06, + "loss": 0.73625964, + "num_input_tokens_seen": 198207050, + "router_z_loss_clip": 2.3984375, + "router_z_loss_mlp": 0.26733398, + "step": 9201, + "time_per_iteration": 2.688994884490967 + }, + { + "auxiliary_loss_clip": 0.01287423, + "auxiliary_loss_mlp": 0.00257291, + "balance_loss_clip": 1.06141686, + "balance_loss_mlp": 0.23298393, + "epoch": 0.5532541710506539, + "flos": 22159577122560.0, + "grad_norm": 5.700559431946877, + "language_loss": 0.69297636, + "learning_rate": 1.7527418665350502e-06, + "loss": 0.7084235, + "num_input_tokens_seen": 198224565, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.2434082, + "step": 9202, + "time_per_iteration": 2.63075852394104 + }, + { + "auxiliary_loss_clip": 0.01293318, + "auxiliary_loss_mlp": 0.00266114, + "balance_loss_clip": 1.06588149, + "balance_loss_mlp": 0.24227206, + "epoch": 0.5533142943033218, + "flos": 21397265758080.0, + "grad_norm": 5.497165486518426, + "language_loss": 0.74258363, + "learning_rate": 1.7523553972989548e-06, + "loss": 0.75817788, + "num_input_tokens_seen": 198244790, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.23876953, + "step": 9203, + "time_per_iteration": 2.6561641693115234 + }, + { + "auxiliary_loss_clip": 0.01312614, + "auxiliary_loss_mlp": 0.00296984, + "balance_loss_clip": 1.08192325, + "balance_loss_mlp": 0.26770645, + "epoch": 0.5533744175559898, + "flos": 23550541344000.0, + "grad_norm": 41.55447942499581, + "language_loss": 0.70866883, + "learning_rate": 1.7519689374535683e-06, + "loss": 0.72476482, + "num_input_tokens_seen": 198264375, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.29272461, + "step": 9204, + "time_per_iteration": 2.779344081878662 + }, + { + "auxiliary_loss_clip": 0.01260999, + "auxiliary_loss_mlp": 0.00253924, + "balance_loss_clip": 1.04454088, + "balance_loss_mlp": 0.23017722, + "epoch": 0.5534345408086577, + "flos": 24061514267520.0, + "grad_norm": 2.9993236311591884, + "language_loss": 0.83937734, + "learning_rate": 1.7515824870135445e-06, + "loss": 0.85452658, + "num_input_tokens_seen": 198283895, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.23742676, + "step": 9205, + "time_per_iteration": 2.6571333408355713 + }, + { + "auxiliary_loss_clip": 0.01283398, + "auxiliary_loss_mlp": 0.00266463, + "balance_loss_clip": 1.06239486, + "balance_loss_mlp": 0.24166755, + "epoch": 0.5534946640613257, + "flos": 33771831408000.0, + "grad_norm": 2.0481990325381805, + "language_loss": 0.77680594, + "learning_rate": 1.751196045993537e-06, + "loss": 0.79230452, + "num_input_tokens_seen": 198310035, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.2479248, + "step": 9206, + "time_per_iteration": 2.858829975128174 + }, + { + "auxiliary_loss_clip": 0.01293321, + "auxiliary_loss_mlp": 0.00242714, + "balance_loss_clip": 1.06310844, + "balance_loss_mlp": 0.21857423, + "epoch": 0.5535547873139937, + "flos": 15159223526400.0, + "grad_norm": 135.5862808096717, + "language_loss": 0.83285856, + "learning_rate": 1.7508096144082012e-06, + "loss": 0.84821892, + "num_input_tokens_seen": 198327810, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.24157715, + "step": 9207, + "time_per_iteration": 2.6402580738067627 + }, + { + "auxiliary_loss_clip": 0.01301652, + "auxiliary_loss_mlp": 0.00288865, + "balance_loss_clip": 1.06539965, + "balance_loss_mlp": 0.26151827, + "epoch": 0.5536149105666617, + "flos": 16980863817600.0, + "grad_norm": 24.950233014735783, + "language_loss": 0.76854366, + "learning_rate": 1.750423192272189e-06, + "loss": 0.78444886, + "num_input_tokens_seen": 198343150, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.27331543, + "step": 9208, + "time_per_iteration": 2.6422386169433594 + }, + { + "auxiliary_loss_clip": 0.01291024, + "auxiliary_loss_mlp": 0.00280868, + "balance_loss_clip": 1.05791354, + "balance_loss_mlp": 0.25371251, + "epoch": 0.5536750338193296, + "flos": 18149935772160.0, + "grad_norm": 283.9500217049027, + "language_loss": 0.76866937, + "learning_rate": 1.7500367796001547e-06, + "loss": 0.7843883, + "num_input_tokens_seen": 198360925, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.27172852, + "step": 9209, + "time_per_iteration": 2.6315619945526123 + }, + { + "auxiliary_loss_clip": 0.01279825, + "auxiliary_loss_mlp": 0.00247521, + "balance_loss_clip": 1.05261326, + "balance_loss_mlp": 0.2236197, + "epoch": 0.5537351570719976, + "flos": 22747794243840.0, + "grad_norm": 9.506883509727857, + "language_loss": 0.90178561, + "learning_rate": 1.7496503764067513e-06, + "loss": 0.917059, + "num_input_tokens_seen": 198379265, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.2388916, + "step": 9210, + "time_per_iteration": 2.6867787837982178 + }, + { + "auxiliary_loss_clip": 0.01277639, + "auxiliary_loss_mlp": 0.0023562, + "balance_loss_clip": 1.05325425, + "balance_loss_mlp": 0.21251678, + "epoch": 0.5537952803246655, + "flos": 26356026130560.0, + "grad_norm": 4.210211972755019, + "language_loss": 0.80585515, + "learning_rate": 1.74926398270663e-06, + "loss": 0.8209877, + "num_input_tokens_seen": 198399490, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.23120117, + "step": 9211, + "time_per_iteration": 2.6875083446502686 + }, + { + "auxiliary_loss_clip": 0.01310045, + "auxiliary_loss_mlp": 0.00282415, + "balance_loss_clip": 1.07164836, + "balance_loss_mlp": 0.25491321, + "epoch": 0.5538554035773335, + "flos": 18037427397120.0, + "grad_norm": 21.04233208146956, + "language_loss": 0.76526725, + "learning_rate": 1.7488775985144437e-06, + "loss": 0.78119189, + "num_input_tokens_seen": 198419110, + "router_z_loss_clip": 2.3828125, + "router_z_loss_mlp": 0.27514648, + "step": 9212, + "time_per_iteration": 2.68575119972229 + }, + { + "auxiliary_loss_clip": 0.01285078, + "auxiliary_loss_mlp": 0.00268471, + "balance_loss_clip": 1.04949069, + "balance_loss_mlp": 0.24293678, + "epoch": 0.5539155268300014, + "flos": 31686247002240.0, + "grad_norm": 72.8970512271501, + "language_loss": 0.60455662, + "learning_rate": 1.7484912238448443e-06, + "loss": 0.62009209, + "num_input_tokens_seen": 198441360, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.25561523, + "step": 9213, + "time_per_iteration": 2.7617528438568115 + }, + { + "auxiliary_loss_clip": 0.01294729, + "auxiliary_loss_mlp": 0.00261664, + "balance_loss_clip": 1.06296802, + "balance_loss_mlp": 0.23560455, + "epoch": 0.5539756500826695, + "flos": 15193769431680.0, + "grad_norm": 35.49012003967723, + "language_loss": 0.93366063, + "learning_rate": 1.7481048587124827e-06, + "loss": 0.94922459, + "num_input_tokens_seen": 198459835, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.26049805, + "step": 9214, + "time_per_iteration": 2.6475625038146973 + }, + { + "auxiliary_loss_clip": 0.0126982, + "auxiliary_loss_mlp": 0.00256269, + "balance_loss_clip": 1.04361415, + "balance_loss_mlp": 0.23140207, + "epoch": 0.5540357733353375, + "flos": 26353117128960.0, + "grad_norm": 18.14883041958605, + "language_loss": 0.76768005, + "learning_rate": 1.7477185031320108e-06, + "loss": 0.78294098, + "num_input_tokens_seen": 198478955, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.24902344, + "step": 9215, + "time_per_iteration": 2.7160451412200928 + }, + { + "auxiliary_loss_clip": 0.01274178, + "auxiliary_loss_mlp": 0.00256019, + "balance_loss_clip": 1.04567266, + "balance_loss_mlp": 0.2314261, + "epoch": 0.5540958965880054, + "flos": 21323684747520.0, + "grad_norm": 3.558281739048892, + "language_loss": 0.80919373, + "learning_rate": 1.7473321571180773e-06, + "loss": 0.82449573, + "num_input_tokens_seen": 198499030, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.24609375, + "step": 9216, + "time_per_iteration": 2.7252049446105957 + }, + { + "auxiliary_loss_clip": 0.01271538, + "auxiliary_loss_mlp": 0.00231977, + "balance_loss_clip": 1.04536867, + "balance_loss_mlp": 0.20803991, + "epoch": 0.5541560198406734, + "flos": 25666828899840.0, + "grad_norm": 5.7950178087104876, + "language_loss": 0.78619456, + "learning_rate": 1.7469458206853345e-06, + "loss": 0.80122966, + "num_input_tokens_seen": 198520265, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.23950195, + "step": 9217, + "time_per_iteration": 2.7029402256011963 + }, + { + "auxiliary_loss_clip": 0.01265325, + "auxiliary_loss_mlp": 0.00252823, + "balance_loss_clip": 1.04013419, + "balance_loss_mlp": 0.22941038, + "epoch": 0.5542161430933413, + "flos": 21939624190080.0, + "grad_norm": 66.22637148297137, + "language_loss": 0.83239973, + "learning_rate": 1.7465594938484315e-06, + "loss": 0.84758127, + "num_input_tokens_seen": 198539645, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.23425293, + "step": 9218, + "time_per_iteration": 2.626368999481201 + }, + { + "auxiliary_loss_clip": 0.01279799, + "auxiliary_loss_mlp": 0.00232576, + "balance_loss_clip": 1.04962206, + "balance_loss_mlp": 0.20733932, + "epoch": 0.5542762663460093, + "flos": 19571459489280.0, + "grad_norm": 2.9384406746854985, + "language_loss": 0.79250127, + "learning_rate": 1.7461731766220176e-06, + "loss": 0.80762506, + "num_input_tokens_seen": 198558710, + "router_z_loss_clip": 2.30273438, + "router_z_loss_mlp": 0.2520752, + "step": 9219, + "time_per_iteration": 2.749870777130127 + }, + { + "auxiliary_loss_clip": 0.01283135, + "auxiliary_loss_mlp": 0.00256769, + "balance_loss_clip": 1.05510235, + "balance_loss_mlp": 0.23225985, + "epoch": 0.5543363895986773, + "flos": 19499063627520.0, + "grad_norm": 5.708720702743817, + "language_loss": 0.78888738, + "learning_rate": 1.7457868690207426e-06, + "loss": 0.80428636, + "num_input_tokens_seen": 198577050, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.24499512, + "step": 9220, + "time_per_iteration": 2.6990456581115723 + }, + { + "auxiliary_loss_clip": 0.01263499, + "auxiliary_loss_mlp": 0.00228581, + "balance_loss_clip": 1.03953779, + "balance_loss_mlp": 0.20640856, + "epoch": 0.5543965128513453, + "flos": 22635609091200.0, + "grad_norm": 9.41310251287739, + "language_loss": 0.84294522, + "learning_rate": 1.7454005710592547e-06, + "loss": 0.85786605, + "num_input_tokens_seen": 198595290, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.22180176, + "step": 9221, + "time_per_iteration": 4.02078652381897 + }, + { + "auxiliary_loss_clip": 0.01280433, + "auxiliary_loss_mlp": 0.00244505, + "balance_loss_clip": 1.05488694, + "balance_loss_mlp": 0.22143817, + "epoch": 0.5544566361040132, + "flos": 25989952671360.0, + "grad_norm": 2.545494935459845, + "language_loss": 0.90620995, + "learning_rate": 1.7450142827522027e-06, + "loss": 0.92145932, + "num_input_tokens_seen": 198614110, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.23071289, + "step": 9222, + "time_per_iteration": 4.19714879989624 + }, + { + "auxiliary_loss_clip": 0.01285416, + "auxiliary_loss_mlp": 0.0023476, + "balance_loss_clip": 1.05181324, + "balance_loss_mlp": 0.20867673, + "epoch": 0.5545167593566812, + "flos": 28257568225920.0, + "grad_norm": 5.459283806475052, + "language_loss": 0.85375285, + "learning_rate": 1.7446280041142344e-06, + "loss": 0.8689546, + "num_input_tokens_seen": 198633880, + "router_z_loss_clip": 2.33789062, + "router_z_loss_mlp": 0.2611084, + "step": 9223, + "time_per_iteration": 2.722905397415161 + }, + { + "auxiliary_loss_clip": 0.01280568, + "auxiliary_loss_mlp": 0.00232676, + "balance_loss_clip": 1.04729605, + "balance_loss_mlp": 0.20765391, + "epoch": 0.5545768826093491, + "flos": 28476551491200.0, + "grad_norm": 4.550986790121821, + "language_loss": 0.91364259, + "learning_rate": 1.7442417351599986e-06, + "loss": 0.92877495, + "num_input_tokens_seen": 198653505, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.25036621, + "step": 9224, + "time_per_iteration": 4.0948100090026855 + }, + { + "auxiliary_loss_clip": 0.0127812, + "auxiliary_loss_mlp": 0.00260058, + "balance_loss_clip": 1.04994762, + "balance_loss_mlp": 0.23500015, + "epoch": 0.5546370058620171, + "flos": 18478051534080.0, + "grad_norm": 76.6802932553396, + "language_loss": 0.65972596, + "learning_rate": 1.743855475904141e-06, + "loss": 0.67510772, + "num_input_tokens_seen": 198671890, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.25061035, + "step": 9225, + "time_per_iteration": 2.650106906890869 + }, + { + "auxiliary_loss_clip": 0.0127469, + "auxiliary_loss_mlp": 0.0024432, + "balance_loss_clip": 1.04224229, + "balance_loss_mlp": 0.21907103, + "epoch": 0.554697129114685, + "flos": 22930507751040.0, + "grad_norm": 23.174754402037742, + "language_loss": 0.74313515, + "learning_rate": 1.7434692263613098e-06, + "loss": 0.75832522, + "num_input_tokens_seen": 198691995, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.25244141, + "step": 9226, + "time_per_iteration": 2.72800874710083 + }, + { + "auxiliary_loss_clip": 0.01248446, + "auxiliary_loss_mlp": 0.00240307, + "balance_loss_clip": 1.0251472, + "balance_loss_mlp": 0.21826571, + "epoch": 0.5547572523673531, + "flos": 21797166850560.0, + "grad_norm": 36.653339989624826, + "language_loss": 0.80477083, + "learning_rate": 1.7430829865461518e-06, + "loss": 0.8196584, + "num_input_tokens_seen": 198712440, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.22045898, + "step": 9227, + "time_per_iteration": 2.645087242126465 + }, + { + "auxiliary_loss_clip": 0.01279353, + "auxiliary_loss_mlp": 0.00221815, + "balance_loss_clip": 1.04694676, + "balance_loss_mlp": 0.19850978, + "epoch": 0.5548173756200211, + "flos": 22342829333760.0, + "grad_norm": 87.03254570716952, + "language_loss": 0.79789698, + "learning_rate": 1.7426967564733118e-06, + "loss": 0.81290871, + "num_input_tokens_seen": 198731515, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.23278809, + "step": 9228, + "time_per_iteration": 2.694439172744751 + }, + { + "auxiliary_loss_clip": 0.01258375, + "auxiliary_loss_mlp": 0.00221158, + "balance_loss_clip": 1.03548026, + "balance_loss_mlp": 0.19871122, + "epoch": 0.554877498872689, + "flos": 17858736213120.0, + "grad_norm": 2.4031143898241343, + "language_loss": 0.8329643, + "learning_rate": 1.7423105361574373e-06, + "loss": 0.8477596, + "num_input_tokens_seen": 198749750, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.22460938, + "step": 9229, + "time_per_iteration": 2.768777847290039 + }, + { + "auxiliary_loss_clip": 0.01256081, + "auxiliary_loss_mlp": 0.00238558, + "balance_loss_clip": 1.0274744, + "balance_loss_mlp": 0.2137509, + "epoch": 0.554937622125357, + "flos": 17238343484160.0, + "grad_norm": 23.592011255432762, + "language_loss": 0.77526873, + "learning_rate": 1.741924325613172e-06, + "loss": 0.79021513, + "num_input_tokens_seen": 198768320, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.2479248, + "step": 9230, + "time_per_iteration": 4.0585081577301025 + }, + { + "auxiliary_loss_clip": 0.01275528, + "auxiliary_loss_mlp": 0.00238989, + "balance_loss_clip": 1.04269195, + "balance_loss_mlp": 0.21543312, + "epoch": 0.5549977453780249, + "flos": 25368087484800.0, + "grad_norm": 10.285513914585666, + "language_loss": 0.77531689, + "learning_rate": 1.741538124855163e-06, + "loss": 0.79046202, + "num_input_tokens_seen": 198787230, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.2355957, + "step": 9231, + "time_per_iteration": 2.670149803161621 + }, + { + "auxiliary_loss_clip": 0.0130962, + "auxiliary_loss_mlp": 0.00207972, + "balance_loss_clip": 1.07006764, + "balance_loss_mlp": 0.18297368, + "epoch": 0.555057868630693, + "flos": 25079114568960.0, + "grad_norm": 110.65626190972614, + "language_loss": 0.83714795, + "learning_rate": 1.7411519338980548e-06, + "loss": 0.85232389, + "num_input_tokens_seen": 198806720, + "router_z_loss_clip": 2.3984375, + "router_z_loss_mlp": 0.24975586, + "step": 9232, + "time_per_iteration": 2.6871633529663086 + }, + { + "auxiliary_loss_clip": 0.01251742, + "auxiliary_loss_mlp": 0.00230581, + "balance_loss_clip": 1.03297877, + "balance_loss_mlp": 0.20771676, + "epoch": 0.5551179918833609, + "flos": 26104220812800.0, + "grad_norm": 3.770693044429687, + "language_loss": 0.88847852, + "learning_rate": 1.7407657527564898e-06, + "loss": 0.90330172, + "num_input_tokens_seen": 198826235, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.2286377, + "step": 9233, + "time_per_iteration": 2.681368827819824 + }, + { + "auxiliary_loss_clip": 0.01290574, + "auxiliary_loss_mlp": 0.00223516, + "balance_loss_clip": 1.05400813, + "balance_loss_mlp": 0.19937646, + "epoch": 0.5551781151360289, + "flos": 19384759572480.0, + "grad_norm": 4.130277217992442, + "language_loss": 0.86433923, + "learning_rate": 1.7403795814451142e-06, + "loss": 0.87948012, + "num_input_tokens_seen": 198842655, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.24157715, + "step": 9234, + "time_per_iteration": 2.61818790435791 + }, + { + "auxiliary_loss_clip": 0.01252772, + "auxiliary_loss_mlp": 0.00225525, + "balance_loss_clip": 1.02772999, + "balance_loss_mlp": 0.20292285, + "epoch": 0.5552382383886968, + "flos": 21725956137600.0, + "grad_norm": 11.356326398298755, + "language_loss": 0.73942524, + "learning_rate": 1.7399934199785706e-06, + "loss": 0.75420821, + "num_input_tokens_seen": 198861210, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.22607422, + "step": 9235, + "time_per_iteration": 2.6087398529052734 + }, + { + "auxiliary_loss_clip": 0.01268927, + "auxiliary_loss_mlp": 0.00226014, + "balance_loss_clip": 1.04211235, + "balance_loss_mlp": 0.20230351, + "epoch": 0.5552983616413648, + "flos": 14356189117440.0, + "grad_norm": 17.627753276999723, + "language_loss": 0.76959288, + "learning_rate": 1.7396072683715029e-06, + "loss": 0.78454232, + "num_input_tokens_seen": 198880045, + "router_z_loss_clip": 2.265625, + "router_z_loss_mlp": 0.23742676, + "step": 9236, + "time_per_iteration": 2.6975584030151367 + }, + { + "auxiliary_loss_clip": 0.01258838, + "auxiliary_loss_mlp": 0.00216938, + "balance_loss_clip": 1.03569329, + "balance_loss_mlp": 0.19452706, + "epoch": 0.5553584848940327, + "flos": 25478548784640.0, + "grad_norm": 53.435653817117704, + "language_loss": 0.92936563, + "learning_rate": 1.7392211266385536e-06, + "loss": 0.94412345, + "num_input_tokens_seen": 198900210, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.22412109, + "step": 9237, + "time_per_iteration": 2.68105149269104 + }, + { + "auxiliary_loss_clip": 0.01249159, + "auxiliary_loss_mlp": 0.00207138, + "balance_loss_clip": 1.02541769, + "balance_loss_mlp": 0.18445221, + "epoch": 0.5554186081467007, + "flos": 22163850840960.0, + "grad_norm": 3.101721231493904, + "language_loss": 0.83523494, + "learning_rate": 1.7388349947943652e-06, + "loss": 0.8497979, + "num_input_tokens_seen": 198919055, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.22680664, + "step": 9238, + "time_per_iteration": 2.6770336627960205 + }, + { + "auxiliary_loss_clip": 0.01258571, + "auxiliary_loss_mlp": 0.00253765, + "balance_loss_clip": 1.02894068, + "balance_loss_mlp": 0.22890967, + "epoch": 0.5554787313993687, + "flos": 49746656125440.0, + "grad_norm": 36.89862941991988, + "language_loss": 0.8660605, + "learning_rate": 1.73844887285358e-06, + "loss": 0.88118386, + "num_input_tokens_seen": 198943505, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.24829102, + "step": 9239, + "time_per_iteration": 2.878704309463501 + }, + { + "auxiliary_loss_clip": 0.01257896, + "auxiliary_loss_mlp": 0.00222221, + "balance_loss_clip": 1.0308814, + "balance_loss_mlp": 0.20027408, + "epoch": 0.5555388546520367, + "flos": 22127365601280.0, + "grad_norm": 185.7099563300428, + "language_loss": 0.86227393, + "learning_rate": 1.7380627608308393e-06, + "loss": 0.87707508, + "num_input_tokens_seen": 198963590, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.21960449, + "step": 9240, + "time_per_iteration": 2.65285587310791 + }, + { + "auxiliary_loss_clip": 0.01232441, + "auxiliary_loss_mlp": 0.00217185, + "balance_loss_clip": 1.01421762, + "balance_loss_mlp": 0.19474961, + "epoch": 0.5555989779047047, + "flos": 24682122478080.0, + "grad_norm": 6.2581925734171095, + "language_loss": 0.71637797, + "learning_rate": 1.737676658740786e-06, + "loss": 0.73087424, + "num_input_tokens_seen": 198982680, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.2244873, + "step": 9241, + "time_per_iteration": 2.641261577606201 + }, + { + "auxiliary_loss_clip": 0.01271737, + "auxiliary_loss_mlp": 0.00199632, + "balance_loss_clip": 1.04074371, + "balance_loss_mlp": 0.17531367, + "epoch": 0.5556591011573726, + "flos": 16106510954880.0, + "grad_norm": 2.884599129599908, + "language_loss": 0.83853716, + "learning_rate": 1.7372905665980594e-06, + "loss": 0.85325086, + "num_input_tokens_seen": 199000185, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.24316406, + "step": 9242, + "time_per_iteration": 2.628225803375244 + }, + { + "auxiliary_loss_clip": 0.01270095, + "auxiliary_loss_mlp": 0.00211615, + "balance_loss_clip": 1.03783607, + "balance_loss_mlp": 0.18714099, + "epoch": 0.5557192244100406, + "flos": 12933695733120.0, + "grad_norm": 6.823280952117062, + "language_loss": 0.75189835, + "learning_rate": 1.7369044844173012e-06, + "loss": 0.76671541, + "num_input_tokens_seen": 199018380, + "router_z_loss_clip": 2.32226562, + "router_z_loss_mlp": 0.24499512, + "step": 9243, + "time_per_iteration": 2.6669983863830566 + }, + { + "auxiliary_loss_clip": 0.01262229, + "auxiliary_loss_mlp": 0.00222876, + "balance_loss_clip": 1.03568554, + "balance_loss_mlp": 0.19909342, + "epoch": 0.5557793476627085, + "flos": 23111712887040.0, + "grad_norm": 261.25664828508303, + "language_loss": 0.83282578, + "learning_rate": 1.7365184122131509e-06, + "loss": 0.84767687, + "num_input_tokens_seen": 199037115, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.23779297, + "step": 9244, + "time_per_iteration": 2.7297770977020264 + }, + { + "auxiliary_loss_clip": 0.01244171, + "auxiliary_loss_mlp": 0.00205387, + "balance_loss_clip": 1.02623641, + "balance_loss_mlp": 0.1847042, + "epoch": 0.5558394709153766, + "flos": 21428040735360.0, + "grad_norm": 2.4186555317238705, + "language_loss": 0.8102212, + "learning_rate": 1.7361323500002486e-06, + "loss": 0.82471681, + "num_input_tokens_seen": 199053375, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.20690918, + "step": 9245, + "time_per_iteration": 2.6762495040893555 + }, + { + "auxiliary_loss_clip": 0.01268605, + "auxiliary_loss_mlp": 0.00232875, + "balance_loss_clip": 1.03745246, + "balance_loss_mlp": 0.2079601, + "epoch": 0.5558995941680445, + "flos": 25078324469760.0, + "grad_norm": 20.751160905158958, + "language_loss": 0.87857181, + "learning_rate": 1.7357462977932348e-06, + "loss": 0.89358658, + "num_input_tokens_seen": 199070930, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.24951172, + "step": 9246, + "time_per_iteration": 2.6793863773345947 + }, + { + "auxiliary_loss_clip": 0.01261091, + "auxiliary_loss_mlp": 0.00217827, + "balance_loss_clip": 1.03484726, + "balance_loss_mlp": 0.19353242, + "epoch": 0.5559597174207125, + "flos": 20011149872640.0, + "grad_norm": 44.92362551642789, + "language_loss": 0.79932368, + "learning_rate": 1.7353602556067471e-06, + "loss": 0.81411284, + "num_input_tokens_seen": 199088675, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.24291992, + "step": 9247, + "time_per_iteration": 2.652797222137451 + }, + { + "auxiliary_loss_clip": 0.01257361, + "auxiliary_loss_mlp": 0.00220543, + "balance_loss_clip": 1.02980351, + "balance_loss_mlp": 0.19624856, + "epoch": 0.5560198406733804, + "flos": 16835677044480.0, + "grad_norm": 14.328898483488326, + "language_loss": 0.8612833, + "learning_rate": 1.7349742234554254e-06, + "loss": 0.87606239, + "num_input_tokens_seen": 199103075, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.24304199, + "step": 9248, + "time_per_iteration": 2.6463027000427246 + }, + { + "auxiliary_loss_clip": 0.01277085, + "auxiliary_loss_mlp": 0.00036032, + "balance_loss_clip": 1.12606239, + "balance_loss_mlp": 0.02959512, + "epoch": 0.5560799639260484, + "flos": 70697051758080.0, + "grad_norm": 0.8621616495606996, + "language_loss": 0.58607435, + "learning_rate": 1.7345882013539081e-06, + "loss": 0.59920549, + "num_input_tokens_seen": 199160325, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.06445312, + "step": 9249, + "time_per_iteration": 3.223930597305298 + }, + { + "auxiliary_loss_clip": 0.01267321, + "auxiliary_loss_mlp": 0.00223189, + "balance_loss_clip": 1.03536725, + "balance_loss_mlp": 0.19950244, + "epoch": 0.5561400871787163, + "flos": 23148593176320.0, + "grad_norm": 15.24530202008838, + "language_loss": 0.87639898, + "learning_rate": 1.734202189316832e-06, + "loss": 0.89130408, + "num_input_tokens_seen": 199179760, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.23706055, + "step": 9250, + "time_per_iteration": 2.649759531021118 + }, + { + "auxiliary_loss_clip": 0.01259425, + "auxiliary_loss_mlp": 0.00236, + "balance_loss_clip": 1.03097677, + "balance_loss_mlp": 0.21036997, + "epoch": 0.5562002104313843, + "flos": 17566423332480.0, + "grad_norm": 23.063512773945064, + "language_loss": 0.82126272, + "learning_rate": 1.733816187358836e-06, + "loss": 0.83621705, + "num_input_tokens_seen": 199196695, + "router_z_loss_clip": 2.28320312, + "router_z_loss_mlp": 0.25622559, + "step": 9251, + "time_per_iteration": 2.6057286262512207 + }, + { + "auxiliary_loss_clip": 0.01248783, + "auxiliary_loss_mlp": 0.00228256, + "balance_loss_clip": 1.02521122, + "balance_loss_mlp": 0.20609525, + "epoch": 0.5562603336840523, + "flos": 25045430590080.0, + "grad_norm": 23.892960766477756, + "language_loss": 0.81937504, + "learning_rate": 1.7334301954945569e-06, + "loss": 0.83414543, + "num_input_tokens_seen": 199217845, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.22167969, + "step": 9252, + "time_per_iteration": 2.757880449295044 + }, + { + "auxiliary_loss_clip": 0.01259032, + "auxiliary_loss_mlp": 0.00226814, + "balance_loss_clip": 1.03071404, + "balance_loss_mlp": 0.20122002, + "epoch": 0.5563204569367203, + "flos": 29059022436480.0, + "grad_norm": 8.4662475965508, + "language_loss": 0.80450588, + "learning_rate": 1.7330442137386313e-06, + "loss": 0.81936443, + "num_input_tokens_seen": 199239250, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.25598145, + "step": 9253, + "time_per_iteration": 2.7066712379455566 + }, + { + "auxiliary_loss_clip": 0.01302324, + "auxiliary_loss_mlp": 0.00217178, + "balance_loss_clip": 1.06609488, + "balance_loss_mlp": 0.19420633, + "epoch": 0.5563805801893883, + "flos": 22090449398400.0, + "grad_norm": 34.865224286180435, + "language_loss": 0.88474876, + "learning_rate": 1.7326582421056965e-06, + "loss": 0.89994377, + "num_input_tokens_seen": 199258320, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.22949219, + "step": 9254, + "time_per_iteration": 2.6208384037017822 + }, + { + "auxiliary_loss_clip": 0.01266097, + "auxiliary_loss_mlp": 0.00028141, + "balance_loss_clip": 1.11656106, + "balance_loss_mlp": 0.02237139, + "epoch": 0.5564407034420562, + "flos": 58636128689280.0, + "grad_norm": 0.8790005084751179, + "language_loss": 0.64771628, + "learning_rate": 1.732272280610387e-06, + "loss": 0.6606586, + "num_input_tokens_seen": 199314840, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.05761719, + "step": 9255, + "time_per_iteration": 3.0035715103149414 + }, + { + "auxiliary_loss_clip": 0.01265433, + "auxiliary_loss_mlp": 0.00205926, + "balance_loss_clip": 1.03685904, + "balance_loss_mlp": 0.1844327, + "epoch": 0.5565008266947242, + "flos": 23112323418240.0, + "grad_norm": 2.522843596717607, + "language_loss": 0.75690848, + "learning_rate": 1.7318863292673399e-06, + "loss": 0.77162206, + "num_input_tokens_seen": 199335405, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.21484375, + "step": 9256, + "time_per_iteration": 2.6445653438568115 + }, + { + "auxiliary_loss_clip": 0.01261002, + "auxiliary_loss_mlp": 0.0020185, + "balance_loss_clip": 1.03569567, + "balance_loss_mlp": 0.17979592, + "epoch": 0.5565609499473921, + "flos": 21578399066880.0, + "grad_norm": 4.804848674420017, + "language_loss": 0.8246454, + "learning_rate": 1.73150038809119e-06, + "loss": 0.83927393, + "num_input_tokens_seen": 199354345, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.22045898, + "step": 9257, + "time_per_iteration": 2.6306402683258057 + }, + { + "auxiliary_loss_clip": 0.01282586, + "auxiliary_loss_mlp": 0.00207846, + "balance_loss_clip": 1.05087173, + "balance_loss_mlp": 0.18476734, + "epoch": 0.5566210732000602, + "flos": 18369637309440.0, + "grad_norm": 649.8681584865938, + "language_loss": 0.71600378, + "learning_rate": 1.7311144570965724e-06, + "loss": 0.73090816, + "num_input_tokens_seen": 199372250, + "router_z_loss_clip": 2.31835938, + "router_z_loss_mlp": 0.23095703, + "step": 9258, + "time_per_iteration": 2.664853811264038 + }, + { + "auxiliary_loss_clip": 0.01277534, + "auxiliary_loss_mlp": 0.00212301, + "balance_loss_clip": 1.0458678, + "balance_loss_mlp": 0.18736291, + "epoch": 0.5566811964527281, + "flos": 25703350053120.0, + "grad_norm": 853.9569332166142, + "language_loss": 0.87404168, + "learning_rate": 1.7307285362981215e-06, + "loss": 0.88893998, + "num_input_tokens_seen": 199392815, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.24926758, + "step": 9259, + "time_per_iteration": 2.7335550785064697 + }, + { + "auxiliary_loss_clip": 0.01250608, + "auxiliary_loss_mlp": 0.00192615, + "balance_loss_clip": 1.02320695, + "balance_loss_mlp": 0.17139557, + "epoch": 0.5567413197053961, + "flos": 26943991856640.0, + "grad_norm": 15.323798765256306, + "language_loss": 0.88668275, + "learning_rate": 1.7303426257104712e-06, + "loss": 0.901115, + "num_input_tokens_seen": 199412375, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.21240234, + "step": 9260, + "time_per_iteration": 2.700984001159668 + }, + { + "auxiliary_loss_clip": 0.01277498, + "auxiliary_loss_mlp": 0.00222255, + "balance_loss_clip": 1.04427087, + "balance_loss_mlp": 0.1966608, + "epoch": 0.556801442958064, + "flos": 20850597694080.0, + "grad_norm": 18.509390554612427, + "language_loss": 0.75380576, + "learning_rate": 1.729956725348256e-06, + "loss": 0.7688033, + "num_input_tokens_seen": 199431490, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.25610352, + "step": 9261, + "time_per_iteration": 2.6386945247650146 + }, + { + "auxiliary_loss_clip": 0.0125936, + "auxiliary_loss_mlp": 0.00060531, + "balance_loss_clip": 1.10856676, + "balance_loss_mlp": 0.05399799, + "epoch": 0.556861566210732, + "flos": 70498213044480.0, + "grad_norm": 0.7255276699105632, + "language_loss": 0.60833156, + "learning_rate": 1.729570835226108e-06, + "loss": 0.62153047, + "num_input_tokens_seen": 199495855, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.06542969, + "step": 9262, + "time_per_iteration": 3.1873230934143066 + }, + { + "auxiliary_loss_clip": 0.01281486, + "auxiliary_loss_mlp": 0.00202575, + "balance_loss_clip": 1.05215669, + "balance_loss_mlp": 0.17863756, + "epoch": 0.5569216894633999, + "flos": 25337276593920.0, + "grad_norm": 3.4872910725068476, + "language_loss": 0.7142123, + "learning_rate": 1.7291849553586622e-06, + "loss": 0.7290529, + "num_input_tokens_seen": 199515870, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.23950195, + "step": 9263, + "time_per_iteration": 4.069705486297607 + }, + { + "auxiliary_loss_clip": 0.0128613, + "auxiliary_loss_mlp": 0.00204453, + "balance_loss_clip": 1.0531292, + "balance_loss_mlp": 0.18225628, + "epoch": 0.556981812716068, + "flos": 22638733574400.0, + "grad_norm": 73.03107491825486, + "language_loss": 0.80733931, + "learning_rate": 1.7287990857605497e-06, + "loss": 0.82224512, + "num_input_tokens_seen": 199535745, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.22180176, + "step": 9264, + "time_per_iteration": 4.066655874252319 + }, + { + "auxiliary_loss_clip": 0.0128513, + "auxiliary_loss_mlp": 0.00203364, + "balance_loss_clip": 1.04877484, + "balance_loss_mlp": 0.1801417, + "epoch": 0.5570419359687359, + "flos": 11035852738560.0, + "grad_norm": 19.888206374686803, + "language_loss": 0.85894465, + "learning_rate": 1.7284132264464022e-06, + "loss": 0.87382954, + "num_input_tokens_seen": 199554035, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.2322998, + "step": 9265, + "time_per_iteration": 2.6315948963165283 + }, + { + "auxiliary_loss_clip": 0.01288663, + "auxiliary_loss_mlp": 0.00201384, + "balance_loss_clip": 1.05562937, + "balance_loss_mlp": 0.18092801, + "epoch": 0.5571020592214039, + "flos": 22823135020800.0, + "grad_norm": 1057.7939186906867, + "language_loss": 0.7717706, + "learning_rate": 1.7280273774308536e-06, + "loss": 0.7866711, + "num_input_tokens_seen": 199576120, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.20446777, + "step": 9266, + "time_per_iteration": 2.6999247074127197 + }, + { + "auxiliary_loss_clip": 0.01278162, + "auxiliary_loss_mlp": 0.00190251, + "balance_loss_clip": 1.04636025, + "balance_loss_mlp": 0.16779205, + "epoch": 0.5571621824740719, + "flos": 22927778317440.0, + "grad_norm": 3.686496863347203, + "language_loss": 0.7633971, + "learning_rate": 1.727641538728533e-06, + "loss": 0.77808124, + "num_input_tokens_seen": 199593780, + "router_z_loss_clip": 2.31835938, + "router_z_loss_mlp": 0.22485352, + "step": 9267, + "time_per_iteration": 4.009453296661377 + }, + { + "auxiliary_loss_clip": 0.01280988, + "auxiliary_loss_mlp": 0.0021126, + "balance_loss_clip": 1.04740047, + "balance_loss_mlp": 0.18880142, + "epoch": 0.5572223057267398, + "flos": 22966705681920.0, + "grad_norm": 100.82011354978276, + "language_loss": 0.81194687, + "learning_rate": 1.7272557103540736e-06, + "loss": 0.82686937, + "num_input_tokens_seen": 199613220, + "router_z_loss_clip": 2.3359375, + "router_z_loss_mlp": 0.22485352, + "step": 9268, + "time_per_iteration": 2.6210644245147705 + }, + { + "auxiliary_loss_clip": 0.01265906, + "auxiliary_loss_mlp": 0.00202222, + "balance_loss_clip": 1.03511846, + "balance_loss_mlp": 0.18021587, + "epoch": 0.5572824289794078, + "flos": 20960053413120.0, + "grad_norm": 4.258674789730526, + "language_loss": 0.83484668, + "learning_rate": 1.726869892322104e-06, + "loss": 0.84952796, + "num_input_tokens_seen": 199632085, + "router_z_loss_clip": 2.31054688, + "router_z_loss_mlp": 0.22033691, + "step": 9269, + "time_per_iteration": 2.641766309738159 + }, + { + "auxiliary_loss_clip": 0.01261086, + "auxiliary_loss_mlp": 0.0019486, + "balance_loss_clip": 1.0298295, + "balance_loss_mlp": 0.1722697, + "epoch": 0.5573425522320757, + "flos": 25042413847680.0, + "grad_norm": 2.9206177314306547, + "language_loss": 0.89177555, + "learning_rate": 1.726484084647256e-06, + "loss": 0.906335, + "num_input_tokens_seen": 199649295, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.22607422, + "step": 9270, + "time_per_iteration": 2.7265725135803223 + }, + { + "auxiliary_loss_clip": 0.01280286, + "auxiliary_loss_mlp": 0.00215647, + "balance_loss_clip": 1.0436976, + "balance_loss_mlp": 0.19128023, + "epoch": 0.5574026754847438, + "flos": 23659637927040.0, + "grad_norm": 3.492643912695584, + "language_loss": 0.90202647, + "learning_rate": 1.7260982873441591e-06, + "loss": 0.91698581, + "num_input_tokens_seen": 199668870, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.24365234, + "step": 9271, + "time_per_iteration": 2.679202079772949 + }, + { + "auxiliary_loss_clip": 0.01282611, + "auxiliary_loss_mlp": 0.0020716, + "balance_loss_clip": 1.0489887, + "balance_loss_mlp": 0.18225744, + "epoch": 0.5574627987374117, + "flos": 24782240661120.0, + "grad_norm": 4.905246154396478, + "language_loss": 0.96309304, + "learning_rate": 1.725712500427442e-06, + "loss": 0.97799081, + "num_input_tokens_seen": 199684870, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.24914551, + "step": 9272, + "time_per_iteration": 2.64282488822937 + }, + { + "auxiliary_loss_clip": 0.01256937, + "auxiliary_loss_mlp": 0.00196455, + "balance_loss_clip": 1.02924538, + "balance_loss_mlp": 0.17442463, + "epoch": 0.5575229219900797, + "flos": 21834944979840.0, + "grad_norm": 23.43012647973238, + "language_loss": 0.92763978, + "learning_rate": 1.7253267239117347e-06, + "loss": 0.94217372, + "num_input_tokens_seen": 199701975, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.22033691, + "step": 9273, + "time_per_iteration": 4.052488327026367 + }, + { + "auxiliary_loss_clip": 0.0124894, + "auxiliary_loss_mlp": 0.00204104, + "balance_loss_clip": 1.01860654, + "balance_loss_mlp": 0.18095329, + "epoch": 0.5575830452427476, + "flos": 27815148408960.0, + "grad_norm": 11.523609270964146, + "language_loss": 0.81845689, + "learning_rate": 1.7249409578116655e-06, + "loss": 0.83298731, + "num_input_tokens_seen": 199721865, + "router_z_loss_clip": 2.30078125, + "router_z_loss_mlp": 0.23156738, + "step": 9274, + "time_per_iteration": 2.7314229011535645 + }, + { + "auxiliary_loss_clip": 0.0128202, + "auxiliary_loss_mlp": 0.0021091, + "balance_loss_clip": 1.04064393, + "balance_loss_mlp": 0.18655592, + "epoch": 0.5576431684954156, + "flos": 17812805696640.0, + "grad_norm": 7.679687511348679, + "language_loss": 0.9245699, + "learning_rate": 1.7245552021418629e-06, + "loss": 0.9394992, + "num_input_tokens_seen": 199736455, + "router_z_loss_clip": 2.41210938, + "router_z_loss_mlp": 0.24389648, + "step": 9275, + "time_per_iteration": 2.615837812423706 + }, + { + "auxiliary_loss_clip": 0.01279128, + "auxiliary_loss_mlp": 0.00211596, + "balance_loss_clip": 1.04539216, + "balance_loss_mlp": 0.18899432, + "epoch": 0.5577032917480835, + "flos": 15486872411520.0, + "grad_norm": 10.393678330077204, + "language_loss": 0.81848973, + "learning_rate": 1.7241694569169546e-06, + "loss": 0.83339697, + "num_input_tokens_seen": 199753125, + "router_z_loss_clip": 2.33789062, + "router_z_loss_mlp": 0.22619629, + "step": 9276, + "time_per_iteration": 2.59586238861084 + }, + { + "auxiliary_loss_clip": 0.01264178, + "auxiliary_loss_mlp": 0.00202907, + "balance_loss_clip": 1.03083158, + "balance_loss_mlp": 0.17826691, + "epoch": 0.5577634150007516, + "flos": 21579763783680.0, + "grad_norm": 4.487328419075129, + "language_loss": 0.82597148, + "learning_rate": 1.7237837221515678e-06, + "loss": 0.84064233, + "num_input_tokens_seen": 199771365, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.24645996, + "step": 9277, + "time_per_iteration": 2.6275746822357178 + }, + { + "auxiliary_loss_clip": 0.01269332, + "auxiliary_loss_mlp": 0.00182453, + "balance_loss_clip": 1.0365597, + "balance_loss_mlp": 0.15753824, + "epoch": 0.5578235382534195, + "flos": 21139750177920.0, + "grad_norm": 3.698636774088309, + "language_loss": 0.77320242, + "learning_rate": 1.7233979978603304e-06, + "loss": 0.7877202, + "num_input_tokens_seen": 199790035, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.24926758, + "step": 9278, + "time_per_iteration": 2.6340715885162354 + }, + { + "auxiliary_loss_clip": 0.01278136, + "auxiliary_loss_mlp": 0.00217559, + "balance_loss_clip": 1.03936267, + "balance_loss_mlp": 0.19173793, + "epoch": 0.5578836615060875, + "flos": 26505199313280.0, + "grad_norm": 36.36057072365084, + "language_loss": 0.81771463, + "learning_rate": 1.723012284057868e-06, + "loss": 0.83267152, + "num_input_tokens_seen": 199811125, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.25817871, + "step": 9279, + "time_per_iteration": 2.684398889541626 + }, + { + "auxiliary_loss_clip": 0.01292389, + "auxiliary_loss_mlp": 0.00192425, + "balance_loss_clip": 1.04707217, + "balance_loss_mlp": 0.16649678, + "epoch": 0.5579437847587555, + "flos": 20153786780160.0, + "grad_norm": 45.377753465418635, + "language_loss": 0.78709865, + "learning_rate": 1.7226265807588082e-06, + "loss": 0.80194676, + "num_input_tokens_seen": 199829915, + "router_z_loss_clip": 2.453125, + "router_z_loss_mlp": 0.25939941, + "step": 9280, + "time_per_iteration": 2.732231855392456 + }, + { + "auxiliary_loss_clip": 0.01269001, + "auxiliary_loss_mlp": 0.00186163, + "balance_loss_clip": 1.03134346, + "balance_loss_mlp": 0.16253608, + "epoch": 0.5580039080114234, + "flos": 26102281478400.0, + "grad_norm": 2.517288837810889, + "language_loss": 0.83635259, + "learning_rate": 1.7222408879777763e-06, + "loss": 0.85090417, + "num_input_tokens_seen": 199850670, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.23632812, + "step": 9281, + "time_per_iteration": 2.660083293914795 + }, + { + "auxiliary_loss_clip": 0.01276505, + "auxiliary_loss_mlp": 0.00189033, + "balance_loss_clip": 1.04283321, + "balance_loss_mlp": 0.16650245, + "epoch": 0.5580640312640914, + "flos": 13771671096960.0, + "grad_norm": 15.70827732843974, + "language_loss": 0.81235135, + "learning_rate": 1.7218552057293974e-06, + "loss": 0.8270067, + "num_input_tokens_seen": 199867645, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.22546387, + "step": 9282, + "time_per_iteration": 2.6553995609283447 + }, + { + "auxiliary_loss_clip": 0.01255045, + "auxiliary_loss_mlp": 0.00205498, + "balance_loss_clip": 1.02639925, + "balance_loss_mlp": 0.18326569, + "epoch": 0.5581241545167593, + "flos": 17675986792320.0, + "grad_norm": 45.921852808907936, + "language_loss": 0.72577417, + "learning_rate": 1.721469534028297e-06, + "loss": 0.74037957, + "num_input_tokens_seen": 199886320, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.22229004, + "step": 9283, + "time_per_iteration": 2.610285997390747 + }, + { + "auxiliary_loss_clip": 0.01272123, + "auxiliary_loss_mlp": 0.00176208, + "balance_loss_clip": 1.03645563, + "balance_loss_mlp": 0.15250903, + "epoch": 0.5581842777694274, + "flos": 19569161018880.0, + "grad_norm": 66.95675550808059, + "language_loss": 0.91165459, + "learning_rate": 1.7210838728890994e-06, + "loss": 0.92613792, + "num_input_tokens_seen": 199904895, + "router_z_loss_clip": 2.35546875, + "router_z_loss_mlp": 0.23681641, + "step": 9284, + "time_per_iteration": 2.6571788787841797 + }, + { + "auxiliary_loss_clip": 0.01267619, + "auxiliary_loss_mlp": 0.00197113, + "balance_loss_clip": 1.03461349, + "balance_loss_mlp": 0.17285359, + "epoch": 0.5582444010220953, + "flos": 20595165102720.0, + "grad_norm": 493.74854416125146, + "language_loss": 0.93112504, + "learning_rate": 1.7206982223264304e-06, + "loss": 0.94577241, + "num_input_tokens_seen": 199921090, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.24255371, + "step": 9285, + "time_per_iteration": 2.624689817428589 + }, + { + "auxiliary_loss_clip": 0.01263905, + "auxiliary_loss_mlp": 0.0019741, + "balance_loss_clip": 1.02948177, + "balance_loss_mlp": 0.17305604, + "epoch": 0.5583045242747633, + "flos": 19135504120320.0, + "grad_norm": 24.679197555520915, + "language_loss": 0.85119659, + "learning_rate": 1.720312582354912e-06, + "loss": 0.86580974, + "num_input_tokens_seen": 199939925, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.2434082, + "step": 9286, + "time_per_iteration": 2.635106086730957 + }, + { + "auxiliary_loss_clip": 0.01274701, + "auxiliary_loss_mlp": 0.00184997, + "balance_loss_clip": 1.03610981, + "balance_loss_mlp": 0.16175072, + "epoch": 0.5583646475274312, + "flos": 27454569730560.0, + "grad_norm": 2.419638640739014, + "language_loss": 0.81543607, + "learning_rate": 1.7199269529891684e-06, + "loss": 0.83003306, + "num_input_tokens_seen": 199960015, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.23266602, + "step": 9287, + "time_per_iteration": 2.679624319076538 + }, + { + "auxiliary_loss_clip": 0.01291478, + "auxiliary_loss_mlp": 0.0019603, + "balance_loss_clip": 1.048244, + "balance_loss_mlp": 0.1738447, + "epoch": 0.5584247707800992, + "flos": 23653784010240.0, + "grad_norm": 12.869653823729392, + "language_loss": 0.81570399, + "learning_rate": 1.7195413342438233e-06, + "loss": 0.83057904, + "num_input_tokens_seen": 199980505, + "router_z_loss_clip": 2.43164062, + "router_z_loss_mlp": 0.22192383, + "step": 9288, + "time_per_iteration": 2.731786012649536 + }, + { + "auxiliary_loss_clip": 0.01292629, + "auxiliary_loss_mlp": 0.00194382, + "balance_loss_clip": 1.05067873, + "balance_loss_mlp": 0.16956294, + "epoch": 0.5584848940327671, + "flos": 13698880185600.0, + "grad_norm": 4.202439797170646, + "language_loss": 0.90935266, + "learning_rate": 1.7191557261334984e-06, + "loss": 0.92422271, + "num_input_tokens_seen": 199999020, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.24816895, + "step": 9289, + "time_per_iteration": 2.616331100463867 + }, + { + "auxiliary_loss_clip": 0.01301847, + "auxiliary_loss_mlp": 0.00193599, + "balance_loss_clip": 1.05387998, + "balance_loss_mlp": 0.1682675, + "epoch": 0.5585450172854352, + "flos": 27016208150400.0, + "grad_norm": 10.138671960023803, + "language_loss": 0.72823375, + "learning_rate": 1.718770128672817e-06, + "loss": 0.7431882, + "num_input_tokens_seen": 200019020, + "router_z_loss_clip": 2.47851562, + "router_z_loss_mlp": 0.25341797, + "step": 9290, + "time_per_iteration": 2.697014331817627 + }, + { + "auxiliary_loss_clip": 0.01272167, + "auxiliary_loss_mlp": 0.00208404, + "balance_loss_clip": 1.03155291, + "balance_loss_mlp": 0.18328613, + "epoch": 0.5586051405381031, + "flos": 23185653033600.0, + "grad_norm": 30.350657241513083, + "language_loss": 0.8034569, + "learning_rate": 1.7183845418764e-06, + "loss": 0.81826264, + "num_input_tokens_seen": 200038110, + "router_z_loss_clip": 2.40820312, + "router_z_loss_mlp": 0.2512207, + "step": 9291, + "time_per_iteration": 2.674354076385498 + }, + { + "auxiliary_loss_clip": 0.01247743, + "auxiliary_loss_mlp": 0.00191553, + "balance_loss_clip": 1.01669216, + "balance_loss_mlp": 0.16761565, + "epoch": 0.5586652637907711, + "flos": 20775544225920.0, + "grad_norm": 3.5871040708926625, + "language_loss": 0.91129112, + "learning_rate": 1.7179989657588698e-06, + "loss": 0.92568409, + "num_input_tokens_seen": 200056210, + "router_z_loss_clip": 2.31054688, + "router_z_loss_mlp": 0.23937988, + "step": 9292, + "time_per_iteration": 2.6839680671691895 + }, + { + "auxiliary_loss_clip": 0.01267133, + "auxiliary_loss_mlp": 0.00191752, + "balance_loss_clip": 1.0321008, + "balance_loss_mlp": 0.16763589, + "epoch": 0.5587253870434391, + "flos": 28219897837440.0, + "grad_norm": 3.584155499490326, + "language_loss": 0.82460046, + "learning_rate": 1.7176134003348476e-06, + "loss": 0.83918929, + "num_input_tokens_seen": 200075620, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.24145508, + "step": 9293, + "time_per_iteration": 2.6983530521392822 + }, + { + "auxiliary_loss_clip": 0.01273469, + "auxiliary_loss_mlp": 0.00173253, + "balance_loss_clip": 1.03953648, + "balance_loss_mlp": 0.15134238, + "epoch": 0.558785510296107, + "flos": 26615732440320.0, + "grad_norm": 4.79388037820887, + "language_loss": 0.7962364, + "learning_rate": 1.7172278456189523e-06, + "loss": 0.81070364, + "num_input_tokens_seen": 200095945, + "router_z_loss_clip": 2.33789062, + "router_z_loss_mlp": 0.21923828, + "step": 9294, + "time_per_iteration": 2.6944425106048584 + }, + { + "auxiliary_loss_clip": 0.01260297, + "auxiliary_loss_mlp": 0.00194247, + "balance_loss_clip": 1.02639198, + "balance_loss_mlp": 0.17160943, + "epoch": 0.558845633548775, + "flos": 20156767608960.0, + "grad_norm": 7.501233021570425, + "language_loss": 0.78326362, + "learning_rate": 1.716842301625806e-06, + "loss": 0.797809, + "num_input_tokens_seen": 200114185, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.22631836, + "step": 9295, + "time_per_iteration": 2.6134698390960693 + }, + { + "auxiliary_loss_clip": 0.01268399, + "auxiliary_loss_mlp": 0.00180906, + "balance_loss_clip": 1.03306341, + "balance_loss_mlp": 0.15544298, + "epoch": 0.5589057568014429, + "flos": 24350774492160.0, + "grad_norm": 2.979354636627372, + "language_loss": 0.87664104, + "learning_rate": 1.7164567683700281e-06, + "loss": 0.89113408, + "num_input_tokens_seen": 200135030, + "router_z_loss_clip": 2.35546875, + "router_z_loss_mlp": 0.25500488, + "step": 9296, + "time_per_iteration": 2.6588778495788574 + }, + { + "auxiliary_loss_clip": 0.012735, + "auxiliary_loss_mlp": 0.00197393, + "balance_loss_clip": 1.03571403, + "balance_loss_mlp": 0.17401607, + "epoch": 0.558965880054111, + "flos": 21105168359040.0, + "grad_norm": 3.1871653740530532, + "language_loss": 0.70877635, + "learning_rate": 1.7160712458662379e-06, + "loss": 0.72348529, + "num_input_tokens_seen": 200154290, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.23376465, + "step": 9297, + "time_per_iteration": 2.6920580863952637 + }, + { + "auxiliary_loss_clip": 0.01294125, + "auxiliary_loss_mlp": 0.00204355, + "balance_loss_clip": 1.0470705, + "balance_loss_mlp": 0.178665, + "epoch": 0.5590260033067789, + "flos": 18436071513600.0, + "grad_norm": 10.419848436147566, + "language_loss": 0.83309805, + "learning_rate": 1.7156857341290544e-06, + "loss": 0.84808278, + "num_input_tokens_seen": 200171555, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.25708008, + "step": 9298, + "time_per_iteration": 2.5964279174804688 + }, + { + "auxiliary_loss_clip": 0.01220499, + "auxiliary_loss_mlp": 0.00066031, + "balance_loss_clip": 1.04942846, + "balance_loss_mlp": 0.0607381, + "epoch": 0.5590861265594469, + "flos": 70577432490240.0, + "grad_norm": 0.67124782287472, + "language_loss": 0.51922697, + "learning_rate": 1.7153002331730967e-06, + "loss": 0.53209227, + "num_input_tokens_seen": 200237010, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.05297852, + "step": 9299, + "time_per_iteration": 3.219676971435547 + }, + { + "auxiliary_loss_clip": 0.01244737, + "auxiliary_loss_mlp": 0.00173686, + "balance_loss_clip": 1.01729333, + "balance_loss_mlp": 0.15009476, + "epoch": 0.5591462498121148, + "flos": 30664408896000.0, + "grad_norm": 17.37567210140604, + "language_loss": 0.75878358, + "learning_rate": 1.7149147430129824e-06, + "loss": 0.77296782, + "num_input_tokens_seen": 200260820, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.23596191, + "step": 9300, + "time_per_iteration": 2.7128403186798096 + }, + { + "auxiliary_loss_clip": 0.01277608, + "auxiliary_loss_mlp": 0.00224277, + "balance_loss_clip": 1.03993583, + "balance_loss_mlp": 0.19867086, + "epoch": 0.5592063730647828, + "flos": 18150438562560.0, + "grad_norm": 7.133540852533901, + "language_loss": 0.88805753, + "learning_rate": 1.7145292636633293e-06, + "loss": 0.90307641, + "num_input_tokens_seen": 200278035, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.2557373, + "step": 9301, + "time_per_iteration": 2.6187121868133545 + }, + { + "auxiliary_loss_clip": 0.0127021, + "auxiliary_loss_mlp": 0.00210079, + "balance_loss_clip": 1.03489029, + "balance_loss_mlp": 0.18521203, + "epoch": 0.5592664963174507, + "flos": 24060400945920.0, + "grad_norm": 22.25944968283858, + "language_loss": 0.75667787, + "learning_rate": 1.714143795138756e-06, + "loss": 0.77148074, + "num_input_tokens_seen": 200297255, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.24841309, + "step": 9302, + "time_per_iteration": 2.643488645553589 + }, + { + "auxiliary_loss_clip": 0.01284483, + "auxiliary_loss_mlp": 0.00204606, + "balance_loss_clip": 1.04283178, + "balance_loss_mlp": 0.17954864, + "epoch": 0.5593266195701188, + "flos": 19827897661440.0, + "grad_norm": 16.209082775214185, + "language_loss": 0.79700518, + "learning_rate": 1.713758337453878e-06, + "loss": 0.81189609, + "num_input_tokens_seen": 200317505, + "router_z_loss_clip": 2.4140625, + "router_z_loss_mlp": 0.25036621, + "step": 9303, + "time_per_iteration": 2.690110206604004 + }, + { + "auxiliary_loss_clip": 0.01277109, + "auxiliary_loss_mlp": 0.00185709, + "balance_loss_clip": 1.0415889, + "balance_loss_mlp": 0.16214162, + "epoch": 0.5593867428227867, + "flos": 25300755440640.0, + "grad_norm": 2.329413591344766, + "language_loss": 0.7976408, + "learning_rate": 1.7133728906233124e-06, + "loss": 0.81226897, + "num_input_tokens_seen": 200338350, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.2355957, + "step": 9304, + "time_per_iteration": 2.6809065341949463 + }, + { + "auxiliary_loss_clip": 0.01277629, + "auxiliary_loss_mlp": 0.00185228, + "balance_loss_clip": 1.03985238, + "balance_loss_mlp": 0.1606704, + "epoch": 0.5594468660754547, + "flos": 12933013374720.0, + "grad_norm": 270.1594774903958, + "language_loss": 0.85658264, + "learning_rate": 1.7129874546616763e-06, + "loss": 0.87121117, + "num_input_tokens_seen": 200353965, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.24560547, + "step": 9305, + "time_per_iteration": 4.026232481002808 + }, + { + "auxiliary_loss_clip": 0.01254919, + "auxiliary_loss_mlp": 0.00198188, + "balance_loss_clip": 1.0225091, + "balance_loss_mlp": 0.17499012, + "epoch": 0.5595069893281227, + "flos": 19062713208960.0, + "grad_norm": 1.8432494692002628, + "language_loss": 0.76334774, + "learning_rate": 1.7126020295835836e-06, + "loss": 0.77787876, + "num_input_tokens_seen": 200373595, + "router_z_loss_clip": 2.32226562, + "router_z_loss_mlp": 0.23205566, + "step": 9306, + "time_per_iteration": 4.069831609725952 + }, + { + "auxiliary_loss_clip": 0.01215427, + "auxiliary_loss_mlp": 0.00040551, + "balance_loss_clip": 1.03950655, + "balance_loss_mlp": 0.03599764, + "epoch": 0.5595671125807906, + "flos": 70273375862400.0, + "grad_norm": 0.9102188026392314, + "language_loss": 0.60108191, + "learning_rate": 1.7122166154036518e-06, + "loss": 0.61364174, + "num_input_tokens_seen": 200429155, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.0456543, + "step": 9307, + "time_per_iteration": 3.21934175491333 + }, + { + "auxiliary_loss_clip": 0.01261678, + "auxiliary_loss_mlp": 0.00195213, + "balance_loss_clip": 1.03002501, + "balance_loss_mlp": 0.17366013, + "epoch": 0.5596272358334586, + "flos": 20665513889280.0, + "grad_norm": 201.46050085814318, + "language_loss": 0.79755789, + "learning_rate": 1.7118312121364943e-06, + "loss": 0.81212682, + "num_input_tokens_seen": 200448290, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.21533203, + "step": 9308, + "time_per_iteration": 2.633913040161133 + }, + { + "auxiliary_loss_clip": 0.01268041, + "auxiliary_loss_mlp": 0.00231266, + "balance_loss_clip": 1.03203321, + "balance_loss_mlp": 0.2065776, + "epoch": 0.5596873590861265, + "flos": 25041013217280.0, + "grad_norm": 3.9134489749035506, + "language_loss": 0.76166248, + "learning_rate": 1.7114458197967257e-06, + "loss": 0.77665555, + "num_input_tokens_seen": 200466555, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.24707031, + "step": 9309, + "time_per_iteration": 4.06767201423645 + }, + { + "auxiliary_loss_clip": 0.01272063, + "auxiliary_loss_mlp": 0.00214575, + "balance_loss_clip": 1.03764892, + "balance_loss_mlp": 0.18880221, + "epoch": 0.5597474823387946, + "flos": 25958387594880.0, + "grad_norm": 2.8519531364791812, + "language_loss": 0.83253509, + "learning_rate": 1.7110604383989613e-06, + "loss": 0.8474015, + "num_input_tokens_seen": 200485980, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.25769043, + "step": 9310, + "time_per_iteration": 2.7046737670898438 + }, + { + "auxiliary_loss_clip": 0.01319181, + "auxiliary_loss_mlp": 0.00214007, + "balance_loss_clip": 1.06909478, + "balance_loss_mlp": 0.1872566, + "epoch": 0.5598076055914625, + "flos": 26177442687360.0, + "grad_norm": 5.881101541505523, + "language_loss": 0.80758482, + "learning_rate": 1.7106750679578133e-06, + "loss": 0.82291675, + "num_input_tokens_seen": 200504555, + "router_z_loss_clip": 2.50390625, + "router_z_loss_mlp": 0.2677002, + "step": 9311, + "time_per_iteration": 2.66375470161438 + }, + { + "auxiliary_loss_clip": 0.0126829, + "auxiliary_loss_mlp": 0.00199082, + "balance_loss_clip": 1.03278661, + "balance_loss_mlp": 0.1753947, + "epoch": 0.5598677288441305, + "flos": 11655778590720.0, + "grad_norm": 2.503521481248088, + "language_loss": 0.82134271, + "learning_rate": 1.7102897084878962e-06, + "loss": 0.83601642, + "num_input_tokens_seen": 200522700, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.23657227, + "step": 9312, + "time_per_iteration": 2.6776680946350098 + }, + { + "auxiliary_loss_clip": 0.01265913, + "auxiliary_loss_mlp": 0.00210768, + "balance_loss_clip": 1.03273273, + "balance_loss_mlp": 0.18747422, + "epoch": 0.5599278520967984, + "flos": 22966597941120.0, + "grad_norm": 6.982296906362298, + "language_loss": 0.96833366, + "learning_rate": 1.709904360003822e-06, + "loss": 0.98310041, + "num_input_tokens_seen": 200541910, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.23291016, + "step": 9313, + "time_per_iteration": 2.6578779220581055 + }, + { + "auxiliary_loss_clip": 0.01264001, + "auxiliary_loss_mlp": 0.00203423, + "balance_loss_clip": 1.03435254, + "balance_loss_mlp": 0.18145278, + "epoch": 0.5599879753494664, + "flos": 21215557831680.0, + "grad_norm": 9.313189072048134, + "language_loss": 0.82011962, + "learning_rate": 1.709519022520204e-06, + "loss": 0.83479381, + "num_input_tokens_seen": 200562600, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.21972656, + "step": 9314, + "time_per_iteration": 2.658977508544922 + }, + { + "auxiliary_loss_clip": 0.01266906, + "auxiliary_loss_mlp": 0.00217214, + "balance_loss_clip": 1.03185463, + "balance_loss_mlp": 0.19370575, + "epoch": 0.5600480986021343, + "flos": 31903219105920.0, + "grad_norm": 48.25356062747558, + "language_loss": 0.78820252, + "learning_rate": 1.7091336960516537e-06, + "loss": 0.80304372, + "num_input_tokens_seen": 200584795, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.23498535, + "step": 9315, + "time_per_iteration": 4.190639019012451 + }, + { + "auxiliary_loss_clip": 0.01288717, + "auxiliary_loss_mlp": 0.00223362, + "balance_loss_clip": 1.04656935, + "balance_loss_mlp": 0.19916278, + "epoch": 0.5601082218548024, + "flos": 28476048700800.0, + "grad_norm": 5.486012016827079, + "language_loss": 0.7540524, + "learning_rate": 1.7087483806127824e-06, + "loss": 0.76917315, + "num_input_tokens_seen": 200606945, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.24194336, + "step": 9316, + "time_per_iteration": 2.699634313583374 + }, + { + "auxiliary_loss_clip": 0.01279543, + "auxiliary_loss_mlp": 0.00217723, + "balance_loss_clip": 1.0424732, + "balance_loss_mlp": 0.19475186, + "epoch": 0.5601683451074703, + "flos": 24097173494400.0, + "grad_norm": 13.452816834505304, + "language_loss": 0.93555605, + "learning_rate": 1.7083630762182022e-06, + "loss": 0.95052874, + "num_input_tokens_seen": 200626340, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.22961426, + "step": 9317, + "time_per_iteration": 2.6768345832824707 + }, + { + "auxiliary_loss_clip": 0.01291832, + "auxiliary_loss_mlp": 0.00221203, + "balance_loss_clip": 1.04712868, + "balance_loss_mlp": 0.19575158, + "epoch": 0.5602284683601383, + "flos": 26356205698560.0, + "grad_norm": 9.816898147825352, + "language_loss": 0.85460949, + "learning_rate": 1.7079777828825233e-06, + "loss": 0.86973977, + "num_input_tokens_seen": 200644520, + "router_z_loss_clip": 2.45117188, + "router_z_loss_mlp": 0.25463867, + "step": 9318, + "time_per_iteration": 2.672327995300293 + }, + { + "auxiliary_loss_clip": 0.01264285, + "auxiliary_loss_mlp": 0.00206751, + "balance_loss_clip": 1.02960336, + "balance_loss_mlp": 0.18371981, + "epoch": 0.5602885916128063, + "flos": 24496392228480.0, + "grad_norm": 19.462825331456806, + "language_loss": 0.81303, + "learning_rate": 1.7075925006203558e-06, + "loss": 0.82774031, + "num_input_tokens_seen": 200664845, + "router_z_loss_clip": 2.34960938, + "router_z_loss_mlp": 0.23022461, + "step": 9319, + "time_per_iteration": 2.709087610244751 + }, + { + "auxiliary_loss_clip": 0.01261015, + "auxiliary_loss_mlp": 0.00200761, + "balance_loss_clip": 1.02854502, + "balance_loss_mlp": 0.17836119, + "epoch": 0.5603487148654742, + "flos": 27345006270720.0, + "grad_norm": 440.26351998287424, + "language_loss": 0.90335131, + "learning_rate": 1.7072072294463101e-06, + "loss": 0.91796905, + "num_input_tokens_seen": 200686535, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.22412109, + "step": 9320, + "time_per_iteration": 2.6748485565185547 + }, + { + "auxiliary_loss_clip": 0.01205083, + "auxiliary_loss_mlp": 0.00032582, + "balance_loss_clip": 1.03414774, + "balance_loss_mlp": 0.02755091, + "epoch": 0.5604088381181422, + "flos": 54087756180480.0, + "grad_norm": 0.7820002610924723, + "language_loss": 0.51954865, + "learning_rate": 1.706821969374996e-06, + "loss": 0.53192526, + "num_input_tokens_seen": 200736965, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.05029297, + "step": 9321, + "time_per_iteration": 3.0568647384643555 + }, + { + "auxiliary_loss_clip": 0.01271307, + "auxiliary_loss_mlp": 0.00195853, + "balance_loss_clip": 1.0377686, + "balance_loss_mlp": 0.17382348, + "epoch": 0.5604689613708101, + "flos": 22236390357120.0, + "grad_norm": 6.880433868526368, + "language_loss": 0.80632502, + "learning_rate": 1.7064367204210216e-06, + "loss": 0.82099664, + "num_input_tokens_seen": 200757420, + "router_z_loss_clip": 2.3359375, + "router_z_loss_mlp": 0.22021484, + "step": 9322, + "time_per_iteration": 2.6643238067626953 + }, + { + "auxiliary_loss_clip": 0.01278004, + "auxiliary_loss_mlp": 0.00223428, + "balance_loss_clip": 1.03952479, + "balance_loss_mlp": 0.20107639, + "epoch": 0.5605290846234782, + "flos": 35297782940160.0, + "grad_norm": 3.2647704048115878, + "language_loss": 0.78822899, + "learning_rate": 1.7060514825989963e-06, + "loss": 0.80324328, + "num_input_tokens_seen": 200779520, + "router_z_loss_clip": 2.390625, + "router_z_loss_mlp": 0.22351074, + "step": 9323, + "time_per_iteration": 2.7793431282043457 + }, + { + "auxiliary_loss_clip": 0.01266785, + "auxiliary_loss_mlp": 0.00222846, + "balance_loss_clip": 1.03145266, + "balance_loss_mlp": 0.20006503, + "epoch": 0.5605892078761461, + "flos": 20263314326400.0, + "grad_norm": 7.073652383978384, + "language_loss": 0.69976985, + "learning_rate": 1.7056662559235286e-06, + "loss": 0.71466613, + "num_input_tokens_seen": 200799485, + "router_z_loss_clip": 2.35546875, + "router_z_loss_mlp": 0.22802734, + "step": 9324, + "time_per_iteration": 2.654230833053589 + }, + { + "auxiliary_loss_clip": 0.01260269, + "auxiliary_loss_mlp": 0.00204846, + "balance_loss_clip": 1.02373409, + "balance_loss_mlp": 0.18117082, + "epoch": 0.5606493311288141, + "flos": 17308333134720.0, + "grad_norm": 503.2058489099429, + "language_loss": 0.94122678, + "learning_rate": 1.705281040409226e-06, + "loss": 0.9558779, + "num_input_tokens_seen": 200817540, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.23681641, + "step": 9325, + "time_per_iteration": 2.6195929050445557 + }, + { + "auxiliary_loss_clip": 0.01297684, + "auxiliary_loss_mlp": 0.00219587, + "balance_loss_clip": 1.05107045, + "balance_loss_mlp": 0.19558984, + "epoch": 0.560709454381482, + "flos": 21652985658240.0, + "grad_norm": 3.9234738527864508, + "language_loss": 0.8246423, + "learning_rate": 1.7048958360706952e-06, + "loss": 0.83981502, + "num_input_tokens_seen": 200838380, + "router_z_loss_clip": 2.47070312, + "router_z_loss_mlp": 0.23986816, + "step": 9326, + "time_per_iteration": 2.6424591541290283 + }, + { + "auxiliary_loss_clip": 0.01284822, + "auxiliary_loss_mlp": 0.00232509, + "balance_loss_clip": 1.04180455, + "balance_loss_mlp": 0.20765345, + "epoch": 0.56076957763415, + "flos": 20303355012480.0, + "grad_norm": 23.325912823644604, + "language_loss": 0.88409024, + "learning_rate": 1.7045106429225447e-06, + "loss": 0.89926356, + "num_input_tokens_seen": 200855640, + "router_z_loss_clip": 2.43359375, + "router_z_loss_mlp": 0.24853516, + "step": 9327, + "time_per_iteration": 2.7489984035491943 + }, + { + "auxiliary_loss_clip": 0.01289789, + "auxiliary_loss_mlp": 0.00225921, + "balance_loss_clip": 1.04537809, + "balance_loss_mlp": 0.2019237, + "epoch": 0.5608297008868179, + "flos": 25045897466880.0, + "grad_norm": 5.231217301690262, + "language_loss": 0.86273003, + "learning_rate": 1.7041254609793795e-06, + "loss": 0.87788707, + "num_input_tokens_seen": 200876585, + "router_z_loss_clip": 2.44335938, + "router_z_loss_mlp": 0.2401123, + "step": 9328, + "time_per_iteration": 2.6975080966949463 + }, + { + "auxiliary_loss_clip": 0.01269587, + "auxiliary_loss_mlp": 0.00209141, + "balance_loss_clip": 1.03650331, + "balance_loss_mlp": 0.18544181, + "epoch": 0.560889824139486, + "flos": 19866825025920.0, + "grad_norm": 1.7184326544478867, + "language_loss": 0.79553539, + "learning_rate": 1.7037402902558066e-06, + "loss": 0.81032264, + "num_input_tokens_seen": 200898175, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.23706055, + "step": 9329, + "time_per_iteration": 2.6440136432647705 + }, + { + "auxiliary_loss_clip": 0.01289144, + "auxiliary_loss_mlp": 0.00215756, + "balance_loss_clip": 1.04697144, + "balance_loss_mlp": 0.19025758, + "epoch": 0.5609499473921539, + "flos": 22929394429440.0, + "grad_norm": 3.032018440609573, + "language_loss": 0.90330321, + "learning_rate": 1.7033551307664324e-06, + "loss": 0.91835219, + "num_input_tokens_seen": 200917515, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.25500488, + "step": 9330, + "time_per_iteration": 2.6716148853302 + }, + { + "auxiliary_loss_clip": 0.01227176, + "auxiliary_loss_mlp": 0.0005248, + "balance_loss_clip": 1.05092335, + "balance_loss_mlp": 0.04632924, + "epoch": 0.5610100706448219, + "flos": 53035825455360.0, + "grad_norm": 1.8803728036057583, + "language_loss": 0.57362807, + "learning_rate": 1.7029699825258603e-06, + "loss": 0.58642465, + "num_input_tokens_seen": 200978615, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.06152344, + "step": 9331, + "time_per_iteration": 3.2018020153045654 + }, + { + "auxiliary_loss_clip": 0.01267178, + "auxiliary_loss_mlp": 0.00218777, + "balance_loss_clip": 1.02774763, + "balance_loss_mlp": 0.19542412, + "epoch": 0.5610701938974898, + "flos": 21834944979840.0, + "grad_norm": 5.198470925641551, + "language_loss": 0.90453613, + "learning_rate": 1.7025848455486971e-06, + "loss": 0.91939574, + "num_input_tokens_seen": 200997745, + "router_z_loss_clip": 2.39453125, + "router_z_loss_mlp": 0.23352051, + "step": 9332, + "time_per_iteration": 2.6830239295959473 + }, + { + "auxiliary_loss_clip": 0.01306468, + "auxiliary_loss_mlp": 0.00280748, + "balance_loss_clip": 1.06327724, + "balance_loss_mlp": 0.25374702, + "epoch": 0.5611303171501578, + "flos": 17457183095040.0, + "grad_norm": 15.98296771846845, + "language_loss": 0.91693199, + "learning_rate": 1.7021997198495454e-06, + "loss": 0.93280411, + "num_input_tokens_seen": 201016370, + "router_z_loss_clip": 2.43164062, + "router_z_loss_mlp": 0.26977539, + "step": 9333, + "time_per_iteration": 2.6334753036499023 + }, + { + "auxiliary_loss_clip": 0.01263645, + "auxiliary_loss_mlp": 0.00244149, + "balance_loss_clip": 1.03057349, + "balance_loss_mlp": 0.22002107, + "epoch": 0.5611904404028258, + "flos": 22637799820800.0, + "grad_norm": 31.265213402654283, + "language_loss": 0.79284692, + "learning_rate": 1.7018146054430108e-06, + "loss": 0.80792487, + "num_input_tokens_seen": 201034310, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.24133301, + "step": 9334, + "time_per_iteration": 2.724316358566284 + }, + { + "auxiliary_loss_clip": 0.0127474, + "auxiliary_loss_mlp": 0.00228961, + "balance_loss_clip": 1.03971314, + "balance_loss_mlp": 0.20610824, + "epoch": 0.5612505636554938, + "flos": 14316327999360.0, + "grad_norm": 31.893523464867314, + "language_loss": 0.79720324, + "learning_rate": 1.7014295023436961e-06, + "loss": 0.81224024, + "num_input_tokens_seen": 201052030, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.2286377, + "step": 9335, + "time_per_iteration": 2.6022539138793945 + }, + { + "auxiliary_loss_clip": 0.01283711, + "auxiliary_loss_mlp": 0.00268299, + "balance_loss_clip": 1.0451324, + "balance_loss_mlp": 0.24266897, + "epoch": 0.5613106869081618, + "flos": 16508279554560.0, + "grad_norm": 64.76510055398181, + "language_loss": 0.84250659, + "learning_rate": 1.701044410566205e-06, + "loss": 0.85802668, + "num_input_tokens_seen": 201068445, + "router_z_loss_clip": 2.3828125, + "router_z_loss_mlp": 0.25634766, + "step": 9336, + "time_per_iteration": 2.6027638912200928 + }, + { + "auxiliary_loss_clip": 0.0126969, + "auxiliary_loss_mlp": 0.0022622, + "balance_loss_clip": 1.0347271, + "balance_loss_mlp": 0.20297411, + "epoch": 0.5613708101608297, + "flos": 24058569352320.0, + "grad_norm": 33.84863993286327, + "language_loss": 0.7566973, + "learning_rate": 1.7006593301251393e-06, + "loss": 0.77165639, + "num_input_tokens_seen": 201082140, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.23242188, + "step": 9337, + "time_per_iteration": 2.6524341106414795 + }, + { + "auxiliary_loss_clip": 0.01234624, + "auxiliary_loss_mlp": 0.00064033, + "balance_loss_clip": 1.05670238, + "balance_loss_mlp": 0.05864482, + "epoch": 0.5614309334134977, + "flos": 64905735997440.0, + "grad_norm": 0.9293553935520884, + "language_loss": 0.62208086, + "learning_rate": 1.700274261035102e-06, + "loss": 0.6350674, + "num_input_tokens_seen": 201137245, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.05395508, + "step": 9338, + "time_per_iteration": 3.1099443435668945 + }, + { + "auxiliary_loss_clip": 0.01274615, + "auxiliary_loss_mlp": 0.00226914, + "balance_loss_clip": 1.04057956, + "balance_loss_mlp": 0.20313212, + "epoch": 0.5614910566661656, + "flos": 32919849740160.0, + "grad_norm": 20.572297629865286, + "language_loss": 0.74142063, + "learning_rate": 1.6998892033106946e-06, + "loss": 0.75643593, + "num_input_tokens_seen": 201157270, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.23779297, + "step": 9339, + "time_per_iteration": 2.7918639183044434 + }, + { + "auxiliary_loss_clip": 0.01279791, + "auxiliary_loss_mlp": 0.0022539, + "balance_loss_clip": 1.03949523, + "balance_loss_mlp": 0.20251366, + "epoch": 0.5615511799188336, + "flos": 18588871969920.0, + "grad_norm": 2.8049764955642766, + "language_loss": 0.76047218, + "learning_rate": 1.6995041569665184e-06, + "loss": 0.77552396, + "num_input_tokens_seen": 201174530, + "router_z_loss_clip": 2.40234375, + "router_z_loss_mlp": 0.2286377, + "step": 9340, + "time_per_iteration": 2.5946993827819824 + }, + { + "auxiliary_loss_clip": 0.012791, + "auxiliary_loss_mlp": 0.00229185, + "balance_loss_clip": 1.04582727, + "balance_loss_mlp": 0.20728651, + "epoch": 0.5616113031715015, + "flos": 22820010537600.0, + "grad_norm": 8.912140954401659, + "language_loss": 0.85399431, + "learning_rate": 1.6991191220171756e-06, + "loss": 0.86907721, + "num_input_tokens_seen": 201194905, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.21887207, + "step": 9341, + "time_per_iteration": 2.6831789016723633 + }, + { + "auxiliary_loss_clip": 0.01298499, + "auxiliary_loss_mlp": 0.00215514, + "balance_loss_clip": 1.05056286, + "balance_loss_mlp": 0.19219626, + "epoch": 0.5616714264241696, + "flos": 22345702421760.0, + "grad_norm": 9.227247417284797, + "language_loss": 0.88283855, + "learning_rate": 1.6987340984772653e-06, + "loss": 0.89797872, + "num_input_tokens_seen": 201213715, + "router_z_loss_clip": 2.48242188, + "router_z_loss_mlp": 0.2331543, + "step": 9342, + "time_per_iteration": 2.6636784076690674 + }, + { + "auxiliary_loss_clip": 0.01288857, + "auxiliary_loss_mlp": 0.00237781, + "balance_loss_clip": 1.04450011, + "balance_loss_mlp": 0.21390328, + "epoch": 0.5617315496768375, + "flos": 18807783408000.0, + "grad_norm": 116.03168307505989, + "language_loss": 0.8338185, + "learning_rate": 1.6983490863613882e-06, + "loss": 0.84908485, + "num_input_tokens_seen": 201231415, + "router_z_loss_clip": 2.4453125, + "router_z_loss_mlp": 0.2388916, + "step": 9343, + "time_per_iteration": 2.6479525566101074 + }, + { + "auxiliary_loss_clip": 0.01263127, + "auxiliary_loss_mlp": 0.00228576, + "balance_loss_clip": 1.03089952, + "balance_loss_mlp": 0.20429258, + "epoch": 0.5617916729295055, + "flos": 18369314087040.0, + "grad_norm": 5.875772546650142, + "language_loss": 0.79840708, + "learning_rate": 1.6979640856841442e-06, + "loss": 0.81332409, + "num_input_tokens_seen": 201249625, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.24291992, + "step": 9344, + "time_per_iteration": 2.612426280975342 + }, + { + "auxiliary_loss_clip": 0.01292307, + "auxiliary_loss_mlp": 0.00232162, + "balance_loss_clip": 1.04518127, + "balance_loss_mlp": 0.20601973, + "epoch": 0.5618517961821734, + "flos": 28179964892160.0, + "grad_norm": 13.00954602315971, + "language_loss": 0.75532204, + "learning_rate": 1.6975790964601318e-06, + "loss": 0.7705667, + "num_input_tokens_seen": 201271205, + "router_z_loss_clip": 2.47070312, + "router_z_loss_mlp": 0.2611084, + "step": 9345, + "time_per_iteration": 2.7814781665802 + }, + { + "auxiliary_loss_clip": 0.01284344, + "auxiliary_loss_mlp": 0.0023239, + "balance_loss_clip": 1.04780817, + "balance_loss_mlp": 0.20982412, + "epoch": 0.5619119194348414, + "flos": 15486872411520.0, + "grad_norm": 5.42174462750624, + "language_loss": 0.95697564, + "learning_rate": 1.6971941187039512e-06, + "loss": 0.97214305, + "num_input_tokens_seen": 201287700, + "router_z_loss_clip": 2.3671875, + "router_z_loss_mlp": 0.22546387, + "step": 9346, + "time_per_iteration": 2.636953353881836 + }, + { + "auxiliary_loss_clip": 0.01250567, + "auxiliary_loss_mlp": 0.00233772, + "balance_loss_clip": 1.01846075, + "balance_loss_mlp": 0.20944095, + "epoch": 0.5619720426875094, + "flos": 29128652951040.0, + "grad_norm": 20.821699197393112, + "language_loss": 0.68796104, + "learning_rate": 1.6968091524301993e-06, + "loss": 0.70280445, + "num_input_tokens_seen": 201307530, + "router_z_loss_clip": 2.32226562, + "router_z_loss_mlp": 0.2434082, + "step": 9347, + "time_per_iteration": 4.273086071014404 + }, + { + "auxiliary_loss_clip": 0.01295837, + "auxiliary_loss_mlp": 0.00233165, + "balance_loss_clip": 1.04946828, + "balance_loss_mlp": 0.20774999, + "epoch": 0.5620321659401774, + "flos": 18003743418240.0, + "grad_norm": 8283.129222456137, + "language_loss": 0.80440664, + "learning_rate": 1.6964241976534745e-06, + "loss": 0.81969666, + "num_input_tokens_seen": 201326210, + "router_z_loss_clip": 2.46484375, + "router_z_loss_mlp": 0.25415039, + "step": 9348, + "time_per_iteration": 2.6602141857147217 + }, + { + "auxiliary_loss_clip": 0.01279075, + "auxiliary_loss_mlp": 0.00242797, + "balance_loss_clip": 1.04148865, + "balance_loss_mlp": 0.21832326, + "epoch": 0.5620922891928454, + "flos": 20594518657920.0, + "grad_norm": 16.274033205383855, + "language_loss": 0.88331479, + "learning_rate": 1.6960392543883754e-06, + "loss": 0.89853352, + "num_input_tokens_seen": 201346120, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.24450684, + "step": 9349, + "time_per_iteration": 4.119148015975952 + }, + { + "auxiliary_loss_clip": 0.01283785, + "auxiliary_loss_mlp": 0.00242178, + "balance_loss_clip": 1.04271579, + "balance_loss_mlp": 0.21826501, + "epoch": 0.5621524124455133, + "flos": 26287006147200.0, + "grad_norm": 15.103841265176285, + "language_loss": 0.75146943, + "learning_rate": 1.6956543226494975e-06, + "loss": 0.76672906, + "num_input_tokens_seen": 201365700, + "router_z_loss_clip": 2.41015625, + "router_z_loss_mlp": 0.23913574, + "step": 9350, + "time_per_iteration": 2.6831510066986084 + }, + { + "auxiliary_loss_clip": 0.01256968, + "auxiliary_loss_mlp": 0.00233797, + "balance_loss_clip": 1.01731777, + "balance_loss_mlp": 0.20825069, + "epoch": 0.5622125356981813, + "flos": 12750299867520.0, + "grad_norm": 6.812655920355498, + "language_loss": 0.89105225, + "learning_rate": 1.6952694024514381e-06, + "loss": 0.90595996, + "num_input_tokens_seen": 201382795, + "router_z_loss_clip": 2.39257812, + "router_z_loss_mlp": 0.25549316, + "step": 9351, + "time_per_iteration": 4.058567523956299 + }, + { + "auxiliary_loss_clip": 0.01286617, + "auxiliary_loss_mlp": 0.00227508, + "balance_loss_clip": 1.04391766, + "balance_loss_mlp": 0.20242637, + "epoch": 0.5622726589508492, + "flos": 23805327490560.0, + "grad_norm": 4.544561005761319, + "language_loss": 0.64916372, + "learning_rate": 1.6948844938087945e-06, + "loss": 0.66430509, + "num_input_tokens_seen": 201402780, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.25109863, + "step": 9352, + "time_per_iteration": 2.714797258377075 + }, + { + "auxiliary_loss_clip": 0.01236025, + "auxiliary_loss_mlp": 0.00188881, + "balance_loss_clip": 1.01145864, + "balance_loss_mlp": 0.16718528, + "epoch": 0.5623327822035172, + "flos": 24718212668160.0, + "grad_norm": 9.197428449222343, + "language_loss": 0.78704751, + "learning_rate": 1.6944995967361604e-06, + "loss": 0.80129659, + "num_input_tokens_seen": 201424140, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.21679688, + "step": 9353, + "time_per_iteration": 2.706303596496582 + }, + { + "auxiliary_loss_clip": 0.01268395, + "auxiliary_loss_mlp": 0.00201363, + "balance_loss_clip": 1.02866292, + "balance_loss_mlp": 0.17885666, + "epoch": 0.5623929054561851, + "flos": 14019274523520.0, + "grad_norm": 16.230433974024386, + "language_loss": 0.88239563, + "learning_rate": 1.6941147112481327e-06, + "loss": 0.89709324, + "num_input_tokens_seen": 201439645, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.22485352, + "step": 9354, + "time_per_iteration": 2.6452925205230713 + }, + { + "auxiliary_loss_clip": 0.01278912, + "auxiliary_loss_mlp": 0.00202119, + "balance_loss_clip": 1.03440189, + "balance_loss_mlp": 0.1785033, + "epoch": 0.5624530287088532, + "flos": 20704405340160.0, + "grad_norm": 143.69321353992137, + "language_loss": 0.82449448, + "learning_rate": 1.6937298373593056e-06, + "loss": 0.83930475, + "num_input_tokens_seen": 201459970, + "router_z_loss_clip": 2.44726562, + "router_z_loss_mlp": 0.23583984, + "step": 9355, + "time_per_iteration": 2.6713719367980957 + }, + { + "auxiliary_loss_clip": 0.01242731, + "auxiliary_loss_mlp": 0.00204069, + "balance_loss_clip": 1.01189482, + "balance_loss_mlp": 0.17974989, + "epoch": 0.5625131519615211, + "flos": 21470918595840.0, + "grad_norm": 44.399075738094915, + "language_loss": 0.79378754, + "learning_rate": 1.693344975084274e-06, + "loss": 0.80825555, + "num_input_tokens_seen": 201480055, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.24291992, + "step": 9356, + "time_per_iteration": 2.6768174171447754 + }, + { + "auxiliary_loss_clip": 0.01272571, + "auxiliary_loss_mlp": 0.00186217, + "balance_loss_clip": 1.03449416, + "balance_loss_mlp": 0.16273269, + "epoch": 0.5625732752141891, + "flos": 18698004466560.0, + "grad_norm": 18.03173965373499, + "language_loss": 0.92254114, + "learning_rate": 1.6929601244376318e-06, + "loss": 0.93712914, + "num_input_tokens_seen": 201497645, + "router_z_loss_clip": 2.37695312, + "router_z_loss_mlp": 0.23474121, + "step": 9357, + "time_per_iteration": 2.620073080062866 + }, + { + "auxiliary_loss_clip": 0.01263004, + "auxiliary_loss_mlp": 0.00196716, + "balance_loss_clip": 1.02906191, + "balance_loss_mlp": 0.17575943, + "epoch": 0.562633398466857, + "flos": 16216900427520.0, + "grad_norm": 3.6832055762126097, + "language_loss": 0.81104612, + "learning_rate": 1.6925752854339722e-06, + "loss": 0.82564336, + "num_input_tokens_seen": 201515455, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.2097168, + "step": 9358, + "time_per_iteration": 4.024397611618042 + }, + { + "auxiliary_loss_clip": 0.01263075, + "auxiliary_loss_mlp": 0.00190041, + "balance_loss_clip": 1.0255518, + "balance_loss_mlp": 0.16828549, + "epoch": 0.562693521719525, + "flos": 22491930689280.0, + "grad_norm": 2.106289477644705, + "language_loss": 0.85418761, + "learning_rate": 1.6921904580878885e-06, + "loss": 0.86871868, + "num_input_tokens_seen": 201534500, + "router_z_loss_clip": 2.37890625, + "router_z_loss_mlp": 0.2175293, + "step": 9359, + "time_per_iteration": 2.6565680503845215 + }, + { + "auxiliary_loss_clip": 0.01256646, + "auxiliary_loss_mlp": 0.00207524, + "balance_loss_clip": 1.01678276, + "balance_loss_mlp": 0.18397978, + "epoch": 0.562753644972193, + "flos": 25331171281920.0, + "grad_norm": 37.37790572889734, + "language_loss": 0.79110944, + "learning_rate": 1.6918056424139736e-06, + "loss": 0.80575109, + "num_input_tokens_seen": 201553280, + "router_z_loss_clip": 2.40234375, + "router_z_loss_mlp": 0.23535156, + "step": 9360, + "time_per_iteration": 2.68914794921875 + }, + { + "auxiliary_loss_clip": 0.01301249, + "auxiliary_loss_mlp": 0.00177423, + "balance_loss_clip": 1.0920558, + "balance_loss_mlp": 0.16860119, + "epoch": 0.562813768224861, + "flos": 67392622126080.0, + "grad_norm": 0.758247490978468, + "language_loss": 0.55086803, + "learning_rate": 1.6914208384268197e-06, + "loss": 0.56565475, + "num_input_tokens_seen": 201610030, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.08837891, + "step": 9361, + "time_per_iteration": 3.0636749267578125 + }, + { + "auxiliary_loss_clip": 0.01245527, + "auxiliary_loss_mlp": 0.00191857, + "balance_loss_clip": 1.01291656, + "balance_loss_mlp": 0.170459, + "epoch": 0.562873891477529, + "flos": 23331163029120.0, + "grad_norm": 6.593698565747468, + "language_loss": 0.86419284, + "learning_rate": 1.691036046141018e-06, + "loss": 0.87856674, + "num_input_tokens_seen": 201628370, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.21398926, + "step": 9362, + "time_per_iteration": 2.6486411094665527 + }, + { + "auxiliary_loss_clip": 0.01275911, + "auxiliary_loss_mlp": 0.00193253, + "balance_loss_clip": 1.035429, + "balance_loss_mlp": 0.16907749, + "epoch": 0.5629340147301969, + "flos": 38472824805120.0, + "grad_norm": 15.997256174520711, + "language_loss": 0.81826925, + "learning_rate": 1.6906512655711614e-06, + "loss": 0.8329609, + "num_input_tokens_seen": 201649790, + "router_z_loss_clip": 2.40234375, + "router_z_loss_mlp": 0.24157715, + "step": 9363, + "time_per_iteration": 2.802929639816284 + }, + { + "auxiliary_loss_clip": 0.01244688, + "auxiliary_loss_mlp": 0.00193527, + "balance_loss_clip": 1.01035309, + "balance_loss_mlp": 0.17125852, + "epoch": 0.5629941379828649, + "flos": 29242023252480.0, + "grad_norm": 191.67297933992847, + "language_loss": 0.90358019, + "learning_rate": 1.690266496731839e-06, + "loss": 0.91796231, + "num_input_tokens_seen": 201669175, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.22253418, + "step": 9364, + "time_per_iteration": 2.7277536392211914 + }, + { + "auxiliary_loss_clip": 0.01257558, + "auxiliary_loss_mlp": 0.0018891, + "balance_loss_clip": 1.02176785, + "balance_loss_mlp": 0.16727391, + "epoch": 0.5630542612355328, + "flos": 19420885676160.0, + "grad_norm": 16.55917899808996, + "language_loss": 0.75235415, + "learning_rate": 1.689881739637642e-06, + "loss": 0.76681882, + "num_input_tokens_seen": 201687000, + "router_z_loss_clip": 2.35546875, + "router_z_loss_mlp": 0.21655273, + "step": 9365, + "time_per_iteration": 2.6439905166625977 + }, + { + "auxiliary_loss_clip": 0.01267712, + "auxiliary_loss_mlp": 0.00225257, + "balance_loss_clip": 1.02364206, + "balance_loss_mlp": 0.19881615, + "epoch": 0.5631143844882008, + "flos": 22266303408000.0, + "grad_norm": 7.975158139653375, + "language_loss": 0.91960788, + "learning_rate": 1.6894969943031611e-06, + "loss": 0.93453753, + "num_input_tokens_seen": 201703335, + "router_z_loss_clip": 2.44140625, + "router_z_loss_mlp": 0.2644043, + "step": 9366, + "time_per_iteration": 2.6442556381225586 + }, + { + "auxiliary_loss_clip": 0.01260458, + "auxiliary_loss_mlp": 0.00181895, + "balance_loss_clip": 1.02638698, + "balance_loss_mlp": 0.15987712, + "epoch": 0.5631745077408687, + "flos": 22965305051520.0, + "grad_norm": 6.186785044332759, + "language_loss": 0.81477416, + "learning_rate": 1.6891122607429845e-06, + "loss": 0.8291977, + "num_input_tokens_seen": 201723495, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.2199707, + "step": 9367, + "time_per_iteration": 2.8191027641296387 + }, + { + "auxiliary_loss_clip": 0.01328546, + "auxiliary_loss_mlp": 0.00105313, + "balance_loss_clip": 1.11730814, + "balance_loss_mlp": 0.09839927, + "epoch": 0.5632346309935368, + "flos": 65080515576960.0, + "grad_norm": 0.8682981111096842, + "language_loss": 0.52903414, + "learning_rate": 1.6887275389717028e-06, + "loss": 0.54337275, + "num_input_tokens_seen": 201792615, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.06933594, + "step": 9368, + "time_per_iteration": 3.299403190612793 + }, + { + "auxiliary_loss_clip": 0.01241153, + "auxiliary_loss_mlp": 0.00212293, + "balance_loss_clip": 1.01105404, + "balance_loss_mlp": 0.18969089, + "epoch": 0.5632947542462047, + "flos": 23002903612800.0, + "grad_norm": 7.179947697408982, + "language_loss": 0.762016, + "learning_rate": 1.6883428290039046e-06, + "loss": 0.77655041, + "num_input_tokens_seen": 201812520, + "router_z_loss_clip": 2.30273438, + "router_z_loss_mlp": 0.22619629, + "step": 9369, + "time_per_iteration": 2.6865692138671875 + }, + { + "auxiliary_loss_clip": 0.01253688, + "auxiliary_loss_mlp": 0.00191394, + "balance_loss_clip": 1.01805139, + "balance_loss_mlp": 0.16582322, + "epoch": 0.5633548774988727, + "flos": 30482593228800.0, + "grad_norm": 40.18572690283628, + "language_loss": 0.83168149, + "learning_rate": 1.6879581308541763e-06, + "loss": 0.84613228, + "num_input_tokens_seen": 201834185, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.25585938, + "step": 9370, + "time_per_iteration": 2.8011395931243896 + }, + { + "auxiliary_loss_clip": 0.01257939, + "auxiliary_loss_mlp": 0.00215241, + "balance_loss_clip": 1.02186251, + "balance_loss_mlp": 0.18770361, + "epoch": 0.5634150007515406, + "flos": 18515039564160.0, + "grad_norm": 11.56863364341377, + "language_loss": 0.85781837, + "learning_rate": 1.687573444537108e-06, + "loss": 0.87255013, + "num_input_tokens_seen": 201851305, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.27502441, + "step": 9371, + "time_per_iteration": 2.615680694580078 + }, + { + "auxiliary_loss_clip": 0.01239026, + "auxiliary_loss_mlp": 0.00166695, + "balance_loss_clip": 1.00930715, + "balance_loss_mlp": 0.14316307, + "epoch": 0.5634751240042086, + "flos": 19244672530560.0, + "grad_norm": 12.809551048826341, + "language_loss": 0.84611315, + "learning_rate": 1.687188770067285e-06, + "loss": 0.86017036, + "num_input_tokens_seen": 201870350, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.23510742, + "step": 9372, + "time_per_iteration": 2.644272565841675 + }, + { + "auxiliary_loss_clip": 0.01273331, + "auxiliary_loss_mlp": 0.0020874, + "balance_loss_clip": 1.03705382, + "balance_loss_mlp": 0.1845044, + "epoch": 0.5635352472568766, + "flos": 12020630987520.0, + "grad_norm": 7.180350914273434, + "language_loss": 0.82371873, + "learning_rate": 1.6868041074592956e-06, + "loss": 0.83853948, + "num_input_tokens_seen": 201886800, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.24267578, + "step": 9373, + "time_per_iteration": 2.647299289703369 + }, + { + "auxiliary_loss_clip": 0.01281907, + "auxiliary_loss_mlp": 0.00207512, + "balance_loss_clip": 1.04092252, + "balance_loss_mlp": 0.18047497, + "epoch": 0.5635953705095446, + "flos": 21871645701120.0, + "grad_norm": 36.607510632854364, + "language_loss": 0.92688566, + "learning_rate": 1.6864194567277264e-06, + "loss": 0.94177985, + "num_input_tokens_seen": 201904730, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.27050781, + "step": 9374, + "time_per_iteration": 2.662726402282715 + }, + { + "auxiliary_loss_clip": 0.01238636, + "auxiliary_loss_mlp": 0.00175473, + "balance_loss_clip": 1.01040864, + "balance_loss_mlp": 0.15243012, + "epoch": 0.5636554937622126, + "flos": 27126166659840.0, + "grad_norm": 142.74874946569474, + "language_loss": 0.74423593, + "learning_rate": 1.6860348178871618e-06, + "loss": 0.75837702, + "num_input_tokens_seen": 201924850, + "router_z_loss_clip": 2.28320312, + "router_z_loss_mlp": 0.23010254, + "step": 9375, + "time_per_iteration": 2.692528009414673 + }, + { + "auxiliary_loss_clip": 0.01281244, + "auxiliary_loss_mlp": 0.00181366, + "balance_loss_clip": 1.03812218, + "balance_loss_mlp": 0.15680844, + "epoch": 0.5637156170148805, + "flos": 12926405272320.0, + "grad_norm": 18.45867363681506, + "language_loss": 0.9008683, + "learning_rate": 1.6856501909521889e-06, + "loss": 0.91549438, + "num_input_tokens_seen": 201939500, + "router_z_loss_clip": 2.43164062, + "router_z_loss_mlp": 0.2454834, + "step": 9376, + "time_per_iteration": 2.6400399208068848 + }, + { + "auxiliary_loss_clip": 0.01267666, + "auxiliary_loss_mlp": 0.00230465, + "balance_loss_clip": 1.0280937, + "balance_loss_mlp": 0.2057295, + "epoch": 0.5637757402675485, + "flos": 45551033130240.0, + "grad_norm": 3.9705086160514043, + "language_loss": 0.7535426, + "learning_rate": 1.6852655759373925e-06, + "loss": 0.76852393, + "num_input_tokens_seen": 201963000, + "router_z_loss_clip": 2.3984375, + "router_z_loss_mlp": 0.24743652, + "step": 9377, + "time_per_iteration": 2.882615804672241 + }, + { + "auxiliary_loss_clip": 0.01288528, + "auxiliary_loss_mlp": 0.00178049, + "balance_loss_clip": 1.04627633, + "balance_loss_mlp": 0.15412346, + "epoch": 0.5638358635202164, + "flos": 20886041439360.0, + "grad_norm": 6.555032631286302, + "language_loss": 0.80368447, + "learning_rate": 1.6848809728573565e-06, + "loss": 0.8183502, + "num_input_tokens_seen": 201983145, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.23937988, + "step": 9378, + "time_per_iteration": 2.7459676265716553 + }, + { + "auxiliary_loss_clip": 0.01281496, + "auxiliary_loss_mlp": 0.00231335, + "balance_loss_clip": 1.03587401, + "balance_loss_mlp": 0.20489481, + "epoch": 0.5638959867728844, + "flos": 18806562345600.0, + "grad_norm": 19.947050309716797, + "language_loss": 0.92582643, + "learning_rate": 1.6844963817266656e-06, + "loss": 0.94095469, + "num_input_tokens_seen": 202000335, + "router_z_loss_clip": 2.4609375, + "router_z_loss_mlp": 0.26464844, + "step": 9379, + "time_per_iteration": 2.60707688331604 + }, + { + "auxiliary_loss_clip": 0.01274982, + "auxiliary_loss_mlp": 0.00199804, + "balance_loss_clip": 1.03699112, + "balance_loss_mlp": 0.17531815, + "epoch": 0.5639561100255523, + "flos": 27490336698240.0, + "grad_norm": 7.993554774905916, + "language_loss": 0.81209373, + "learning_rate": 1.6841118025599042e-06, + "loss": 0.82684159, + "num_input_tokens_seen": 202018275, + "router_z_loss_clip": 2.3828125, + "router_z_loss_mlp": 0.24487305, + "step": 9380, + "time_per_iteration": 2.6758649349212646 + }, + { + "auxiliary_loss_clip": 0.01292239, + "auxiliary_loss_mlp": 0.00204396, + "balance_loss_clip": 1.04711771, + "balance_loss_mlp": 0.17956477, + "epoch": 0.5640162332782204, + "flos": 18076570243200.0, + "grad_norm": 24.970826691208035, + "language_loss": 0.85261309, + "learning_rate": 1.6837272353716542e-06, + "loss": 0.86757946, + "num_input_tokens_seen": 202034330, + "router_z_loss_clip": 2.45507812, + "router_z_loss_mlp": 0.24841309, + "step": 9381, + "time_per_iteration": 2.613896131515503 + }, + { + "auxiliary_loss_clip": 0.01294565, + "auxiliary_loss_mlp": 0.00218236, + "balance_loss_clip": 1.04807639, + "balance_loss_mlp": 0.19323818, + "epoch": 0.5640763565308883, + "flos": 20884856290560.0, + "grad_norm": 57.224791283181084, + "language_loss": 0.81130731, + "learning_rate": 1.683342680176499e-06, + "loss": 0.82643533, + "num_input_tokens_seen": 202053100, + "router_z_loss_clip": 2.46484375, + "router_z_loss_mlp": 0.24975586, + "step": 9382, + "time_per_iteration": 2.679006814956665 + }, + { + "auxiliary_loss_clip": 0.01357512, + "auxiliary_loss_mlp": 0.00079857, + "balance_loss_clip": 1.16015303, + "balance_loss_mlp": 0.0703201, + "epoch": 0.5641364797835563, + "flos": 64447912224000.0, + "grad_norm": 0.6919816516151311, + "language_loss": 0.54023123, + "learning_rate": 1.682958136989022e-06, + "loss": 0.55460495, + "num_input_tokens_seen": 202120125, + "router_z_loss_clip": 1.9765625, + "router_z_loss_mlp": 0.09521484, + "step": 9383, + "time_per_iteration": 3.297159433364868 + }, + { + "auxiliary_loss_clip": 0.01274791, + "auxiliary_loss_mlp": 0.00238942, + "balance_loss_clip": 1.03458238, + "balance_loss_mlp": 0.21350275, + "epoch": 0.5641966030362242, + "flos": 18660944609280.0, + "grad_norm": 4.200652438404635, + "language_loss": 0.77298415, + "learning_rate": 1.6825736058238033e-06, + "loss": 0.78812146, + "num_input_tokens_seen": 202138030, + "router_z_loss_clip": 2.40429688, + "router_z_loss_mlp": 0.25439453, + "step": 9384, + "time_per_iteration": 2.71553897857666 + }, + { + "auxiliary_loss_clip": 0.01288196, + "auxiliary_loss_mlp": 0.00219989, + "balance_loss_clip": 1.04399443, + "balance_loss_mlp": 0.19460912, + "epoch": 0.5642567262888922, + "flos": 22492325738880.0, + "grad_norm": 45.019264739489586, + "language_loss": 0.81685507, + "learning_rate": 1.6821890866954263e-06, + "loss": 0.8319369, + "num_input_tokens_seen": 202155580, + "router_z_loss_clip": 2.43945312, + "router_z_loss_mlp": 0.25390625, + "step": 9385, + "time_per_iteration": 2.672147750854492 + }, + { + "auxiliary_loss_clip": 0.01267535, + "auxiliary_loss_mlp": 0.00214877, + "balance_loss_clip": 1.03122473, + "balance_loss_mlp": 0.19101122, + "epoch": 0.5643168495415603, + "flos": 13003972692480.0, + "grad_norm": 44.0681305311584, + "language_loss": 0.91938007, + "learning_rate": 1.6818045796184703e-06, + "loss": 0.93420422, + "num_input_tokens_seen": 202170365, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.23876953, + "step": 9386, + "time_per_iteration": 2.6235334873199463 + }, + { + "auxiliary_loss_clip": 0.01286144, + "auxiliary_loss_mlp": 0.00238349, + "balance_loss_clip": 1.04428577, + "balance_loss_mlp": 0.21119297, + "epoch": 0.5643769727942282, + "flos": 18588297352320.0, + "grad_norm": 14.796624400810662, + "language_loss": 0.79035026, + "learning_rate": 1.681420084607516e-06, + "loss": 0.80559516, + "num_input_tokens_seen": 202189095, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.27197266, + "step": 9387, + "time_per_iteration": 2.651008367538452 + }, + { + "auxiliary_loss_clip": 0.01290708, + "auxiliary_loss_mlp": 0.00227602, + "balance_loss_clip": 1.04537916, + "balance_loss_mlp": 0.20147142, + "epoch": 0.5644370960468962, + "flos": 33806269572480.0, + "grad_norm": 8.845360013977434, + "language_loss": 0.80469322, + "learning_rate": 1.6810356016771452e-06, + "loss": 0.81987637, + "num_input_tokens_seen": 202213500, + "router_z_loss_clip": 2.453125, + "router_z_loss_mlp": 0.26147461, + "step": 9388, + "time_per_iteration": 2.8031108379364014 + }, + { + "auxiliary_loss_clip": 0.01244439, + "auxiliary_loss_mlp": 0.00239425, + "balance_loss_clip": 1.01494455, + "balance_loss_mlp": 0.21640596, + "epoch": 0.5644972192995641, + "flos": 21214911386880.0, + "grad_norm": 7.604097762177784, + "language_loss": 0.88825774, + "learning_rate": 1.6806511308419353e-06, + "loss": 0.90309638, + "num_input_tokens_seen": 202231920, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.23034668, + "step": 9389, + "time_per_iteration": 4.205033540725708 + }, + { + "auxiliary_loss_clip": 0.01299722, + "auxiliary_loss_mlp": 0.00222747, + "balance_loss_clip": 1.05090344, + "balance_loss_mlp": 0.19785586, + "epoch": 0.5645573425522321, + "flos": 18587722734720.0, + "grad_norm": 8.18842808408754, + "language_loss": 0.78395927, + "learning_rate": 1.680266672116467e-06, + "loss": 0.79918396, + "num_input_tokens_seen": 202247600, + "router_z_loss_clip": 2.48632812, + "router_z_loss_mlp": 0.24914551, + "step": 9390, + "time_per_iteration": 2.687814474105835 + }, + { + "auxiliary_loss_clip": 0.01274441, + "auxiliary_loss_mlp": 0.00221857, + "balance_loss_clip": 1.03667057, + "balance_loss_mlp": 0.19920695, + "epoch": 0.5646174658049, + "flos": 18113809668480.0, + "grad_norm": 16.086095162141465, + "language_loss": 0.99273401, + "learning_rate": 1.6798822255153192e-06, + "loss": 1.00769699, + "num_input_tokens_seen": 202265350, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.2265625, + "step": 9391, + "time_per_iteration": 4.058152437210083 + }, + { + "auxiliary_loss_clip": 0.01297491, + "auxiliary_loss_mlp": 0.00247263, + "balance_loss_clip": 1.04971886, + "balance_loss_mlp": 0.2211915, + "epoch": 0.564677589057568, + "flos": 28329964087680.0, + "grad_norm": 30.008088815856816, + "language_loss": 0.69050515, + "learning_rate": 1.6794977910530684e-06, + "loss": 0.7059527, + "num_input_tokens_seen": 202284285, + "router_z_loss_clip": 2.48242188, + "router_z_loss_mlp": 0.26049805, + "step": 9392, + "time_per_iteration": 2.708153486251831 + }, + { + "auxiliary_loss_clip": 0.01296938, + "auxiliary_loss_mlp": 0.0023433, + "balance_loss_clip": 1.04756761, + "balance_loss_mlp": 0.20900974, + "epoch": 0.564737712310236, + "flos": 22163743100160.0, + "grad_norm": 49.41380278594029, + "language_loss": 0.91470754, + "learning_rate": 1.6791133687442937e-06, + "loss": 0.93002021, + "num_input_tokens_seen": 202303450, + "router_z_loss_clip": 2.49804688, + "router_z_loss_mlp": 0.2532959, + "step": 9393, + "time_per_iteration": 4.1152026653289795 + }, + { + "auxiliary_loss_clip": 0.01292157, + "auxiliary_loss_mlp": 0.00230104, + "balance_loss_clip": 1.05157053, + "balance_loss_mlp": 0.20628645, + "epoch": 0.564797835562904, + "flos": 20959011918720.0, + "grad_norm": 8.328776759181045, + "language_loss": 0.94179976, + "learning_rate": 1.6787289586035725e-06, + "loss": 0.95702231, + "num_input_tokens_seen": 202322315, + "router_z_loss_clip": 2.40820312, + "router_z_loss_mlp": 0.23815918, + "step": 9394, + "time_per_iteration": 2.6988818645477295 + }, + { + "auxiliary_loss_clip": 0.01272907, + "auxiliary_loss_mlp": 0.00229427, + "balance_loss_clip": 1.03767538, + "balance_loss_mlp": 0.20546573, + "epoch": 0.5648579588155719, + "flos": 17420302805760.0, + "grad_norm": 3.1516566631690113, + "language_loss": 0.91135454, + "learning_rate": 1.6783445606454814e-06, + "loss": 0.92637789, + "num_input_tokens_seen": 202339905, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.23986816, + "step": 9395, + "time_per_iteration": 2.607619524002075 + }, + { + "auxiliary_loss_clip": 0.01395207, + "auxiliary_loss_mlp": 0.00091371, + "balance_loss_clip": 1.19881523, + "balance_loss_mlp": 0.08183399, + "epoch": 0.5649180820682399, + "flos": 69929568835200.0, + "grad_norm": 0.8716263030843595, + "language_loss": 0.57606399, + "learning_rate": 1.677960174884597e-06, + "loss": 0.59092975, + "num_input_tokens_seen": 202397320, + "router_z_loss_clip": 1.96875, + "router_z_loss_mlp": 0.09521484, + "step": 9396, + "time_per_iteration": 3.1884469985961914 + }, + { + "auxiliary_loss_clip": 0.01279039, + "auxiliary_loss_mlp": 0.00253007, + "balance_loss_clip": 1.04056144, + "balance_loss_mlp": 0.22592297, + "epoch": 0.5649782053209078, + "flos": 24973070641920.0, + "grad_norm": 3.24943818158317, + "language_loss": 0.78440988, + "learning_rate": 1.6775758013354943e-06, + "loss": 0.79973036, + "num_input_tokens_seen": 202416865, + "router_z_loss_clip": 2.38476562, + "router_z_loss_mlp": 0.27099609, + "step": 9397, + "time_per_iteration": 2.686253547668457 + }, + { + "auxiliary_loss_clip": 0.01258214, + "auxiliary_loss_mlp": 0.00263778, + "balance_loss_clip": 1.02453685, + "balance_loss_mlp": 0.23897065, + "epoch": 0.5650383285735758, + "flos": 21726602582400.0, + "grad_norm": 9.16493468533091, + "language_loss": 0.76811236, + "learning_rate": 1.67719144001275e-06, + "loss": 0.78333223, + "num_input_tokens_seen": 202436210, + "router_z_loss_clip": 2.33789062, + "router_z_loss_mlp": 0.24816895, + "step": 9398, + "time_per_iteration": 2.693168878555298 + }, + { + "auxiliary_loss_clip": 0.0141854, + "auxiliary_loss_mlp": 0.00134814, + "balance_loss_clip": 1.21753216, + "balance_loss_mlp": 0.12213035, + "epoch": 0.5650984518262439, + "flos": 65904484636800.0, + "grad_norm": 0.7580233559961177, + "language_loss": 0.57255769, + "learning_rate": 1.6768070909309386e-06, + "loss": 0.58809125, + "num_input_tokens_seen": 202492925, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.12695312, + "step": 9399, + "time_per_iteration": 3.0713977813720703 + }, + { + "auxiliary_loss_clip": 0.01251901, + "auxiliary_loss_mlp": 0.00257104, + "balance_loss_clip": 1.01900721, + "balance_loss_mlp": 0.23060372, + "epoch": 0.5651585750789118, + "flos": 21032592929280.0, + "grad_norm": 585.1365714280007, + "language_loss": 0.81883758, + "learning_rate": 1.6764227541046347e-06, + "loss": 0.83392763, + "num_input_tokens_seen": 202511905, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.26513672, + "step": 9400, + "time_per_iteration": 4.02965784072876 + }, + { + "auxiliary_loss_clip": 0.01301005, + "auxiliary_loss_mlp": 0.00251953, + "balance_loss_clip": 1.05402112, + "balance_loss_mlp": 0.22330683, + "epoch": 0.5652186983315798, + "flos": 18551919853440.0, + "grad_norm": 8.30727261742925, + "language_loss": 0.70364535, + "learning_rate": 1.676038429548412e-06, + "loss": 0.71917492, + "num_input_tokens_seen": 202529815, + "router_z_loss_clip": 2.46679688, + "router_z_loss_mlp": 0.28662109, + "step": 9401, + "time_per_iteration": 2.6375527381896973 + }, + { + "auxiliary_loss_clip": 0.01284553, + "auxiliary_loss_mlp": 0.00252391, + "balance_loss_clip": 1.04805279, + "balance_loss_mlp": 0.22705877, + "epoch": 0.5652788215842477, + "flos": 18478662065280.0, + "grad_norm": 17.687245936443407, + "language_loss": 0.88262653, + "learning_rate": 1.6756541172768453e-06, + "loss": 0.89799601, + "num_input_tokens_seen": 202547710, + "router_z_loss_clip": 2.3671875, + "router_z_loss_mlp": 0.2532959, + "step": 9402, + "time_per_iteration": 2.623420476913452 + }, + { + "auxiliary_loss_clip": 0.012717, + "auxiliary_loss_mlp": 0.00264391, + "balance_loss_clip": 1.03586173, + "balance_loss_mlp": 0.24034598, + "epoch": 0.5653389448369157, + "flos": 30044052080640.0, + "grad_norm": 3.821402997174246, + "language_loss": 0.83348989, + "learning_rate": 1.6752698173045068e-06, + "loss": 0.84885073, + "num_input_tokens_seen": 202568835, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.24047852, + "step": 9403, + "time_per_iteration": 2.702528715133667 + }, + { + "auxiliary_loss_clip": 0.01276494, + "auxiliary_loss_mlp": 0.00266521, + "balance_loss_clip": 1.03926992, + "balance_loss_mlp": 0.24086678, + "epoch": 0.5653990680895836, + "flos": 16727550128640.0, + "grad_norm": 148.855513159024, + "language_loss": 0.77008677, + "learning_rate": 1.6748855296459685e-06, + "loss": 0.78551686, + "num_input_tokens_seen": 202587385, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.25646973, + "step": 9404, + "time_per_iteration": 2.6391890048980713 + }, + { + "auxiliary_loss_clip": 0.01274255, + "auxiliary_loss_mlp": 0.00244104, + "balance_loss_clip": 1.0437609, + "balance_loss_mlp": 0.21984504, + "epoch": 0.5654591913422516, + "flos": 14538256179840.0, + "grad_norm": 11.777987163881482, + "language_loss": 0.74376756, + "learning_rate": 1.6745012543158045e-06, + "loss": 0.75895119, + "num_input_tokens_seen": 202604815, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.24255371, + "step": 9405, + "time_per_iteration": 2.6124746799468994 + }, + { + "auxiliary_loss_clip": 0.01265336, + "auxiliary_loss_mlp": 0.00226565, + "balance_loss_clip": 1.03707755, + "balance_loss_mlp": 0.20461817, + "epoch": 0.5655193145949196, + "flos": 26209905603840.0, + "grad_norm": 224.34321790555146, + "language_loss": 0.80255657, + "learning_rate": 1.6741169913285852e-06, + "loss": 0.81747568, + "num_input_tokens_seen": 202623775, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.21948242, + "step": 9406, + "time_per_iteration": 2.6732428073883057 + }, + { + "auxiliary_loss_clip": 0.01268701, + "auxiliary_loss_mlp": 0.00266206, + "balance_loss_clip": 1.03466296, + "balance_loss_mlp": 0.24082646, + "epoch": 0.5655794378475876, + "flos": 25046579825280.0, + "grad_norm": 8.767253534421368, + "language_loss": 0.87785304, + "learning_rate": 1.673732740698882e-06, + "loss": 0.89320213, + "num_input_tokens_seen": 202643375, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.25366211, + "step": 9407, + "time_per_iteration": 2.6691091060638428 + }, + { + "auxiliary_loss_clip": 0.01311196, + "auxiliary_loss_mlp": 0.00227607, + "balance_loss_clip": 1.06716645, + "balance_loss_mlp": 0.20248917, + "epoch": 0.5656395611002555, + "flos": 31032852652800.0, + "grad_norm": 76.9758127854083, + "language_loss": 0.78170252, + "learning_rate": 1.6733485024412666e-06, + "loss": 0.79709053, + "num_input_tokens_seen": 202668400, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.25109863, + "step": 9408, + "time_per_iteration": 2.7607624530792236 + }, + { + "auxiliary_loss_clip": 0.01300895, + "auxiliary_loss_mlp": 0.00238835, + "balance_loss_clip": 1.0645833, + "balance_loss_mlp": 0.2146951, + "epoch": 0.5656996843529235, + "flos": 20229522606720.0, + "grad_norm": 26.117029671547346, + "language_loss": 0.90007973, + "learning_rate": 1.672964276570308e-06, + "loss": 0.91547704, + "num_input_tokens_seen": 202685125, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.24145508, + "step": 9409, + "time_per_iteration": 2.6883575916290283 + }, + { + "auxiliary_loss_clip": 0.01277227, + "auxiliary_loss_mlp": 0.00273686, + "balance_loss_clip": 1.03670895, + "balance_loss_mlp": 0.24824689, + "epoch": 0.5657598076055914, + "flos": 20996251344000.0, + "grad_norm": 6.508517615239209, + "language_loss": 0.86218327, + "learning_rate": 1.6725800631005776e-06, + "loss": 0.87769246, + "num_input_tokens_seen": 202703830, + "router_z_loss_clip": 2.40429688, + "router_z_loss_mlp": 0.2545166, + "step": 9410, + "time_per_iteration": 2.6588029861450195 + }, + { + "auxiliary_loss_clip": 0.01293852, + "auxiliary_loss_mlp": 0.00241982, + "balance_loss_clip": 1.05189204, + "balance_loss_mlp": 0.21780616, + "epoch": 0.5658199308582594, + "flos": 11545999649280.0, + "grad_norm": 5.53411480481222, + "language_loss": 0.91902059, + "learning_rate": 1.6721958620466432e-06, + "loss": 0.93437898, + "num_input_tokens_seen": 202719835, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.24169922, + "step": 9411, + "time_per_iteration": 2.606736660003662 + }, + { + "auxiliary_loss_clip": 0.01294567, + "auxiliary_loss_mlp": 0.00269361, + "balance_loss_clip": 1.05186081, + "balance_loss_mlp": 0.2416213, + "epoch": 0.5658800541109275, + "flos": 14172146807040.0, + "grad_norm": 6.937355624727228, + "language_loss": 0.79223454, + "learning_rate": 1.6718116734230749e-06, + "loss": 0.80787379, + "num_input_tokens_seen": 202736795, + "router_z_loss_clip": 2.42578125, + "router_z_loss_mlp": 0.27722168, + "step": 9412, + "time_per_iteration": 2.695753812789917 + }, + { + "auxiliary_loss_clip": 0.01288793, + "auxiliary_loss_mlp": 0.00244771, + "balance_loss_clip": 1.05150306, + "balance_loss_mlp": 0.22059503, + "epoch": 0.5659401773635954, + "flos": 27305073325440.0, + "grad_norm": 38.94297371143048, + "language_loss": 0.6567654, + "learning_rate": 1.6714274972444413e-06, + "loss": 0.67210102, + "num_input_tokens_seen": 202756900, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.24194336, + "step": 9413, + "time_per_iteration": 2.8388662338256836 + }, + { + "auxiliary_loss_clip": 0.01287742, + "auxiliary_loss_mlp": 0.00234466, + "balance_loss_clip": 1.05135417, + "balance_loss_mlp": 0.20987295, + "epoch": 0.5660003006162634, + "flos": 16728196573440.0, + "grad_norm": 527.0083658371486, + "language_loss": 0.7717663, + "learning_rate": 1.6710433335253092e-06, + "loss": 0.78698838, + "num_input_tokens_seen": 202775145, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.24621582, + "step": 9414, + "time_per_iteration": 2.6344704627990723 + }, + { + "auxiliary_loss_clip": 0.01261083, + "auxiliary_loss_mlp": 0.00232838, + "balance_loss_clip": 1.02849483, + "balance_loss_mlp": 0.20943755, + "epoch": 0.5660604238689313, + "flos": 21653452535040.0, + "grad_norm": 83.71922549429587, + "language_loss": 0.84369254, + "learning_rate": 1.670659182280247e-06, + "loss": 0.85863173, + "num_input_tokens_seen": 202794505, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.23413086, + "step": 9415, + "time_per_iteration": 2.698840856552124 + }, + { + "auxiliary_loss_clip": 0.01420465, + "auxiliary_loss_mlp": 0.00147786, + "balance_loss_clip": 1.21895397, + "balance_loss_mlp": 0.13424401, + "epoch": 0.5661205471215993, + "flos": 68824022083200.0, + "grad_norm": 0.6871304226846888, + "language_loss": 0.48975101, + "learning_rate": 1.670275043523822e-06, + "loss": 0.50543356, + "num_input_tokens_seen": 202858580, + "router_z_loss_clip": 2.015625, + "router_z_loss_mlp": 0.13574219, + "step": 9416, + "time_per_iteration": 3.2879676818847656 + }, + { + "auxiliary_loss_clip": 0.01284683, + "auxiliary_loss_mlp": 0.00236507, + "balance_loss_clip": 1.04705656, + "balance_loss_mlp": 0.21206921, + "epoch": 0.5661806703742672, + "flos": 28621774177920.0, + "grad_norm": 5.646686034680271, + "language_loss": 0.72522998, + "learning_rate": 1.6698909172706e-06, + "loss": 0.74044192, + "num_input_tokens_seen": 202878565, + "router_z_loss_clip": 2.37890625, + "router_z_loss_mlp": 0.2442627, + "step": 9417, + "time_per_iteration": 2.7214865684509277 + }, + { + "auxiliary_loss_clip": 0.01264087, + "auxiliary_loss_mlp": 0.00255499, + "balance_loss_clip": 1.02831364, + "balance_loss_mlp": 0.22983363, + "epoch": 0.5662407936269352, + "flos": 21397948116480.0, + "grad_norm": 5.0086251830067665, + "language_loss": 0.77808738, + "learning_rate": 1.6695068035351479e-06, + "loss": 0.79328328, + "num_input_tokens_seen": 202897350, + "router_z_loss_clip": 2.35546875, + "router_z_loss_mlp": 0.25671387, + "step": 9418, + "time_per_iteration": 2.637676954269409 + }, + { + "auxiliary_loss_clip": 0.01286101, + "auxiliary_loss_mlp": 0.00233599, + "balance_loss_clip": 1.04357851, + "balance_loss_mlp": 0.20985293, + "epoch": 0.5663009168796032, + "flos": 25660005315840.0, + "grad_norm": 3.6942856181998094, + "language_loss": 0.73864353, + "learning_rate": 1.6691227023320304e-06, + "loss": 0.75384045, + "num_input_tokens_seen": 202916745, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.23730469, + "step": 9419, + "time_per_iteration": 2.6898815631866455 + }, + { + "auxiliary_loss_clip": 0.01434115, + "auxiliary_loss_mlp": 0.00101471, + "balance_loss_clip": 1.23852873, + "balance_loss_mlp": 0.09121904, + "epoch": 0.5663610401322712, + "flos": 67930458422400.0, + "grad_norm": 2.8179447138051716, + "language_loss": 0.59306335, + "learning_rate": 1.6687386136758135e-06, + "loss": 0.60841918, + "num_input_tokens_seen": 202982375, + "router_z_loss_clip": 1.953125, + "router_z_loss_mlp": 0.10253906, + "step": 9420, + "time_per_iteration": 3.199526071548462 + }, + { + "auxiliary_loss_clip": 0.01290937, + "auxiliary_loss_mlp": 0.00252874, + "balance_loss_clip": 1.05112958, + "balance_loss_mlp": 0.22923522, + "epoch": 0.5664211633849391, + "flos": 24609367480320.0, + "grad_norm": 25.170189238975418, + "language_loss": 0.81410265, + "learning_rate": 1.6683545375810618e-06, + "loss": 0.82954073, + "num_input_tokens_seen": 203002430, + "router_z_loss_clip": 2.40234375, + "router_z_loss_mlp": 0.23632812, + "step": 9421, + "time_per_iteration": 2.705155611038208 + }, + { + "auxiliary_loss_clip": 0.01285963, + "auxiliary_loss_mlp": 0.00275204, + "balance_loss_clip": 1.04286265, + "balance_loss_mlp": 0.25046849, + "epoch": 0.5664812866376071, + "flos": 11648811352320.0, + "grad_norm": 6.5968132317897235, + "language_loss": 0.83163774, + "learning_rate": 1.6679704740623389e-06, + "loss": 0.84724939, + "num_input_tokens_seen": 203019425, + "router_z_loss_clip": 2.4296875, + "router_z_loss_mlp": 0.24755859, + "step": 9422, + "time_per_iteration": 2.6179187297821045 + }, + { + "auxiliary_loss_clip": 0.01259858, + "auxiliary_loss_mlp": 0.00247173, + "balance_loss_clip": 1.0271976, + "balance_loss_mlp": 0.22339037, + "epoch": 0.566541409890275, + "flos": 24643985212800.0, + "grad_norm": 10.02476435827786, + "language_loss": 0.88846588, + "learning_rate": 1.6675864231342085e-06, + "loss": 0.9035362, + "num_input_tokens_seen": 203039035, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.23803711, + "step": 9423, + "time_per_iteration": 2.689324140548706 + }, + { + "auxiliary_loss_clip": 0.01267365, + "auxiliary_loss_mlp": 0.00254561, + "balance_loss_clip": 1.03078818, + "balance_loss_mlp": 0.2276911, + "epoch": 0.566601533142943, + "flos": 22270577126400.0, + "grad_norm": 5.747776705425229, + "language_loss": 0.8671931, + "learning_rate": 1.6672023848112353e-06, + "loss": 0.88241237, + "num_input_tokens_seen": 203059320, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.26843262, + "step": 9424, + "time_per_iteration": 2.6395626068115234 + }, + { + "auxiliary_loss_clip": 0.01296548, + "auxiliary_loss_mlp": 0.00270338, + "balance_loss_clip": 1.04884839, + "balance_loss_mlp": 0.24137016, + "epoch": 0.5666616563956111, + "flos": 29971656218880.0, + "grad_norm": 473.0918542253134, + "language_loss": 0.87241375, + "learning_rate": 1.6668183591079805e-06, + "loss": 0.88808262, + "num_input_tokens_seen": 203078490, + "router_z_loss_clip": 2.47851562, + "router_z_loss_mlp": 0.28955078, + "step": 9425, + "time_per_iteration": 2.7122762203216553 + }, + { + "auxiliary_loss_clip": 0.01285437, + "auxiliary_loss_mlp": 0.00255481, + "balance_loss_clip": 1.04550242, + "balance_loss_mlp": 0.23037598, + "epoch": 0.566721779648279, + "flos": 17781456101760.0, + "grad_norm": 12.898323813194322, + "language_loss": 0.69544578, + "learning_rate": 1.6664343460390064e-06, + "loss": 0.71085489, + "num_input_tokens_seen": 203096065, + "router_z_loss_clip": 2.40234375, + "router_z_loss_mlp": 0.25085449, + "step": 9426, + "time_per_iteration": 2.589808225631714 + }, + { + "auxiliary_loss_clip": 0.01306466, + "auxiliary_loss_mlp": 0.00295766, + "balance_loss_clip": 1.05995107, + "balance_loss_mlp": 0.26765656, + "epoch": 0.566781902900947, + "flos": 21033490769280.0, + "grad_norm": 38.035519278838855, + "language_loss": 0.87789345, + "learning_rate": 1.6660503456188764e-06, + "loss": 0.89391583, + "num_input_tokens_seen": 203115270, + "router_z_loss_clip": 2.46484375, + "router_z_loss_mlp": 0.28149414, + "step": 9427, + "time_per_iteration": 2.633382797241211 + }, + { + "auxiliary_loss_clip": 0.01284462, + "auxiliary_loss_mlp": 0.00257358, + "balance_loss_clip": 1.04864836, + "balance_loss_mlp": 0.23368318, + "epoch": 0.5668420261536149, + "flos": 23148593176320.0, + "grad_norm": 92.36659950356955, + "language_loss": 0.91213995, + "learning_rate": 1.6656663578621498e-06, + "loss": 0.92755818, + "num_input_tokens_seen": 203134290, + "router_z_loss_clip": 2.35546875, + "router_z_loss_mlp": 0.23681641, + "step": 9428, + "time_per_iteration": 2.644094705581665 + }, + { + "auxiliary_loss_clip": 0.0131405, + "auxiliary_loss_mlp": 0.00251509, + "balance_loss_clip": 1.06182432, + "balance_loss_mlp": 0.22497274, + "epoch": 0.5669021494062829, + "flos": 22601601889920.0, + "grad_norm": 17.02089494688732, + "language_loss": 0.80254054, + "learning_rate": 1.6652823827833886e-06, + "loss": 0.81819618, + "num_input_tokens_seen": 203152935, + "router_z_loss_clip": 2.52148438, + "router_z_loss_mlp": 0.265625, + "step": 9429, + "time_per_iteration": 2.6706180572509766 + }, + { + "auxiliary_loss_clip": 0.01308786, + "auxiliary_loss_mlp": 0.0025795, + "balance_loss_clip": 1.05915451, + "balance_loss_mlp": 0.23088998, + "epoch": 0.5669622726589508, + "flos": 17381231786880.0, + "grad_norm": 23.278686229077284, + "language_loss": 0.84170532, + "learning_rate": 1.6648984203971538e-06, + "loss": 0.85737276, + "num_input_tokens_seen": 203170110, + "router_z_loss_clip": 2.49609375, + "router_z_loss_mlp": 0.27099609, + "step": 9430, + "time_per_iteration": 2.625060558319092 + }, + { + "auxiliary_loss_clip": 0.01279955, + "auxiliary_loss_mlp": 0.00249671, + "balance_loss_clip": 1.04131937, + "balance_loss_mlp": 0.22317103, + "epoch": 0.5670223959116188, + "flos": 18763253521920.0, + "grad_norm": 43.988727916137314, + "language_loss": 0.81272662, + "learning_rate": 1.6645144707180032e-06, + "loss": 0.8280229, + "num_input_tokens_seen": 203188825, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.26513672, + "step": 9431, + "time_per_iteration": 4.124488830566406 + }, + { + "auxiliary_loss_clip": 0.01303208, + "auxiliary_loss_mlp": 0.00225027, + "balance_loss_clip": 1.06144714, + "balance_loss_mlp": 0.20253263, + "epoch": 0.5670825191642868, + "flos": 13553334276480.0, + "grad_norm": 5.167696060828471, + "language_loss": 0.78704053, + "learning_rate": 1.6641305337604984e-06, + "loss": 0.80232286, + "num_input_tokens_seen": 203206860, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.22485352, + "step": 9432, + "time_per_iteration": 2.6269383430480957 + }, + { + "auxiliary_loss_clip": 0.01267331, + "auxiliary_loss_mlp": 0.00264477, + "balance_loss_clip": 1.0319314, + "balance_loss_mlp": 0.23918059, + "epoch": 0.5671426424169548, + "flos": 22054035985920.0, + "grad_norm": 4.3311491395458654, + "language_loss": 0.84252489, + "learning_rate": 1.663746609539197e-06, + "loss": 0.85784298, + "num_input_tokens_seen": 203225625, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.25317383, + "step": 9433, + "time_per_iteration": 4.125555515289307 + }, + { + "auxiliary_loss_clip": 0.01296876, + "auxiliary_loss_mlp": 0.00279507, + "balance_loss_clip": 1.0467093, + "balance_loss_mlp": 0.25186267, + "epoch": 0.5672027656696227, + "flos": 21323972056320.0, + "grad_norm": 24.772381571624972, + "language_loss": 0.72664142, + "learning_rate": 1.6633626980686582e-06, + "loss": 0.74240524, + "num_input_tokens_seen": 203242920, + "router_z_loss_clip": 2.50195312, + "router_z_loss_mlp": 0.27648926, + "step": 9434, + "time_per_iteration": 2.661813974380493 + }, + { + "auxiliary_loss_clip": 0.01283502, + "auxiliary_loss_mlp": 0.00242695, + "balance_loss_clip": 1.04531229, + "balance_loss_mlp": 0.21928251, + "epoch": 0.5672628889222907, + "flos": 23514056104320.0, + "grad_norm": 2.8176499140753064, + "language_loss": 0.72435927, + "learning_rate": 1.6629787993634399e-06, + "loss": 0.73962122, + "num_input_tokens_seen": 203261995, + "router_z_loss_clip": 2.3828125, + "router_z_loss_mlp": 0.23413086, + "step": 9435, + "time_per_iteration": 4.23422384262085 + }, + { + "auxiliary_loss_clip": 0.01274549, + "auxiliary_loss_mlp": 0.0025386, + "balance_loss_clip": 1.03867245, + "balance_loss_mlp": 0.23045906, + "epoch": 0.5673230121749586, + "flos": 27121928855040.0, + "grad_norm": 11.439708909348584, + "language_loss": 0.76661015, + "learning_rate": 1.6625949134380984e-06, + "loss": 0.78189433, + "num_input_tokens_seen": 203280670, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.23425293, + "step": 9436, + "time_per_iteration": 2.7549822330474854 + }, + { + "auxiliary_loss_clip": 0.01300404, + "auxiliary_loss_mlp": 0.00274982, + "balance_loss_clip": 1.05279613, + "balance_loss_mlp": 0.24897079, + "epoch": 0.5673831354276266, + "flos": 31141985149440.0, + "grad_norm": 3.050647713736746, + "language_loss": 0.79652268, + "learning_rate": 1.6622110403071921e-06, + "loss": 0.81227654, + "num_input_tokens_seen": 203304800, + "router_z_loss_clip": 2.47851562, + "router_z_loss_mlp": 0.26013184, + "step": 9437, + "time_per_iteration": 2.7605960369110107 + }, + { + "auxiliary_loss_clip": 0.01305971, + "auxiliary_loss_mlp": 0.00275279, + "balance_loss_clip": 1.06174529, + "balance_loss_mlp": 0.24522606, + "epoch": 0.5674432586802945, + "flos": 27673193859840.0, + "grad_norm": 12.74565207869998, + "language_loss": 0.68932235, + "learning_rate": 1.661827179985277e-06, + "loss": 0.70513487, + "num_input_tokens_seen": 203324060, + "router_z_loss_clip": 2.44140625, + "router_z_loss_mlp": 0.30053711, + "step": 9438, + "time_per_iteration": 2.727487802505493 + }, + { + "auxiliary_loss_clip": 0.01306537, + "auxiliary_loss_mlp": 0.0026222, + "balance_loss_clip": 1.06224298, + "balance_loss_mlp": 0.23511185, + "epoch": 0.5675033819329626, + "flos": 26615157822720.0, + "grad_norm": 22.197134547497253, + "language_loss": 0.82553124, + "learning_rate": 1.661443332486909e-06, + "loss": 0.84121883, + "num_input_tokens_seen": 203344360, + "router_z_loss_clip": 2.44335938, + "router_z_loss_mlp": 0.27124023, + "step": 9439, + "time_per_iteration": 2.6804862022399902 + }, + { + "auxiliary_loss_clip": 0.01323936, + "auxiliary_loss_mlp": 0.00266322, + "balance_loss_clip": 1.07330894, + "balance_loss_mlp": 0.23862964, + "epoch": 0.5675635051856306, + "flos": 19098372435840.0, + "grad_norm": 4.364665980405295, + "language_loss": 0.91962183, + "learning_rate": 1.6610594978266438e-06, + "loss": 0.9355244, + "num_input_tokens_seen": 203362115, + "router_z_loss_clip": 2.50585938, + "router_z_loss_mlp": 0.27697754, + "step": 9440, + "time_per_iteration": 2.627060890197754 + }, + { + "auxiliary_loss_clip": 0.01306864, + "auxiliary_loss_mlp": 0.0028581, + "balance_loss_clip": 1.05807018, + "balance_loss_mlp": 0.25911903, + "epoch": 0.5676236284382985, + "flos": 17566315591680.0, + "grad_norm": 9.556778443669904, + "language_loss": 0.83990049, + "learning_rate": 1.6606756760190365e-06, + "loss": 0.85582721, + "num_input_tokens_seen": 203380550, + "router_z_loss_clip": 2.49023438, + "router_z_loss_mlp": 0.26672363, + "step": 9441, + "time_per_iteration": 2.6065282821655273 + }, + { + "auxiliary_loss_clip": 0.01302796, + "auxiliary_loss_mlp": 0.00302321, + "balance_loss_clip": 1.05736589, + "balance_loss_mlp": 0.27539188, + "epoch": 0.5676837516909665, + "flos": 15954069634560.0, + "grad_norm": 12.722921779656293, + "language_loss": 0.90904641, + "learning_rate": 1.6602918670786413e-06, + "loss": 0.92509753, + "num_input_tokens_seen": 203396590, + "router_z_loss_clip": 2.45703125, + "router_z_loss_mlp": 0.26928711, + "step": 9442, + "time_per_iteration": 4.01578426361084 + }, + { + "auxiliary_loss_clip": 0.01325176, + "auxiliary_loss_mlp": 0.00269651, + "balance_loss_clip": 1.07980442, + "balance_loss_mlp": 0.24459356, + "epoch": 0.5677438749436344, + "flos": 18295912644480.0, + "grad_norm": 16.772410941432707, + "language_loss": 0.83406079, + "learning_rate": 1.6599080710200126e-06, + "loss": 0.85000908, + "num_input_tokens_seen": 203414280, + "router_z_loss_clip": 2.45703125, + "router_z_loss_mlp": 0.25085449, + "step": 9443, + "time_per_iteration": 2.6322555541992188 + }, + { + "auxiliary_loss_clip": 0.01306634, + "auxiliary_loss_mlp": 0.00332318, + "balance_loss_clip": 1.06155348, + "balance_loss_mlp": 0.30590081, + "epoch": 0.5678039981963025, + "flos": 17931311642880.0, + "grad_norm": 5.033232119572463, + "language_loss": 0.86400276, + "learning_rate": 1.6595242878577046e-06, + "loss": 0.88039225, + "num_input_tokens_seen": 203433280, + "router_z_loss_clip": 2.44921875, + "router_z_loss_mlp": 0.26428223, + "step": 9444, + "time_per_iteration": 2.6096367835998535 + }, + { + "auxiliary_loss_clip": 0.0132288, + "auxiliary_loss_mlp": 0.00276496, + "balance_loss_clip": 1.07061458, + "balance_loss_mlp": 0.25150961, + "epoch": 0.5678641214489704, + "flos": 19316350120320.0, + "grad_norm": 41.09167978527035, + "language_loss": 0.88463151, + "learning_rate": 1.6591405176062687e-06, + "loss": 0.90062535, + "num_input_tokens_seen": 203449935, + "router_z_loss_clip": 2.51953125, + "router_z_loss_mlp": 0.25, + "step": 9445, + "time_per_iteration": 2.631793260574341 + }, + { + "auxiliary_loss_clip": 0.01328467, + "auxiliary_loss_mlp": 0.00282521, + "balance_loss_clip": 1.07623982, + "balance_loss_mlp": 0.25532985, + "epoch": 0.5679242447016384, + "flos": 27751084502400.0, + "grad_norm": 6.740058325569447, + "language_loss": 0.76941204, + "learning_rate": 1.658756760280259e-06, + "loss": 0.78552192, + "num_input_tokens_seen": 203473025, + "router_z_loss_clip": 2.5234375, + "router_z_loss_mlp": 0.27172852, + "step": 9446, + "time_per_iteration": 2.752338171005249 + }, + { + "auxiliary_loss_clip": 0.0132621, + "auxiliary_loss_mlp": 0.00288208, + "balance_loss_clip": 1.06813729, + "balance_loss_mlp": 0.26003844, + "epoch": 0.5679843679543063, + "flos": 23769093646080.0, + "grad_norm": 56.80437868370588, + "language_loss": 0.8375262, + "learning_rate": 1.6583730158942276e-06, + "loss": 0.85367036, + "num_input_tokens_seen": 203492895, + "router_z_loss_clip": 2.5859375, + "router_z_loss_mlp": 0.28161621, + "step": 9447, + "time_per_iteration": 2.7215120792388916 + }, + { + "auxiliary_loss_clip": 0.01310046, + "auxiliary_loss_mlp": 0.00263464, + "balance_loss_clip": 1.05808377, + "balance_loss_mlp": 0.23751193, + "epoch": 0.5680444912069743, + "flos": 25591883172480.0, + "grad_norm": 5.335723002563481, + "language_loss": 0.85134315, + "learning_rate": 1.657989284462725e-06, + "loss": 0.86707819, + "num_input_tokens_seen": 203513710, + "router_z_loss_clip": 2.51757812, + "router_z_loss_mlp": 0.25976562, + "step": 9448, + "time_per_iteration": 2.6723532676696777 + }, + { + "auxiliary_loss_clip": 0.01347236, + "auxiliary_loss_mlp": 0.0033767, + "balance_loss_clip": 1.08516836, + "balance_loss_mlp": 0.3059485, + "epoch": 0.5681046144596422, + "flos": 23695799944320.0, + "grad_norm": 14.972770433948968, + "language_loss": 0.84621376, + "learning_rate": 1.6576055660003038e-06, + "loss": 0.86306286, + "num_input_tokens_seen": 203531630, + "router_z_loss_clip": 2.61914062, + "router_z_loss_mlp": 0.31738281, + "step": 9449, + "time_per_iteration": 2.7167389392852783 + }, + { + "auxiliary_loss_clip": 0.0131387, + "auxiliary_loss_mlp": 0.00257162, + "balance_loss_clip": 1.06452107, + "balance_loss_mlp": 0.23140061, + "epoch": 0.5681647377123102, + "flos": 28000770917760.0, + "grad_norm": 7.158695909943339, + "language_loss": 0.82618487, + "learning_rate": 1.6572218605215128e-06, + "loss": 0.84189522, + "num_input_tokens_seen": 203551885, + "router_z_loss_clip": 2.49414062, + "router_z_loss_mlp": 0.25769043, + "step": 9450, + "time_per_iteration": 2.690284490585327 + }, + { + "auxiliary_loss_clip": 0.01277357, + "auxiliary_loss_mlp": 0.00286799, + "balance_loss_clip": 1.03609169, + "balance_loss_mlp": 0.26245645, + "epoch": 0.5682248609649782, + "flos": 22747758330240.0, + "grad_norm": 13.814686707503448, + "language_loss": 0.76046687, + "learning_rate": 1.6568381680409038e-06, + "loss": 0.77610844, + "num_input_tokens_seen": 203572250, + "router_z_loss_clip": 2.4140625, + "router_z_loss_mlp": 0.2434082, + "step": 9451, + "time_per_iteration": 2.6433491706848145 + }, + { + "auxiliary_loss_clip": 0.0131585, + "auxiliary_loss_mlp": 0.00315703, + "balance_loss_clip": 1.05810654, + "balance_loss_mlp": 0.28499466, + "epoch": 0.5682849842176462, + "flos": 21288600138240.0, + "grad_norm": 584.2010506434016, + "language_loss": 0.82592309, + "learning_rate": 1.656454488573026e-06, + "loss": 0.84223866, + "num_input_tokens_seen": 203590605, + "router_z_loss_clip": 2.57617188, + "router_z_loss_mlp": 0.30712891, + "step": 9452, + "time_per_iteration": 2.647092580795288 + }, + { + "auxiliary_loss_clip": 0.01317699, + "auxiliary_loss_mlp": 0.0026385, + "balance_loss_clip": 1.06562924, + "balance_loss_mlp": 0.2395428, + "epoch": 0.5683451074703142, + "flos": 21141689512320.0, + "grad_norm": 5.9806021926551916, + "language_loss": 0.77959436, + "learning_rate": 1.656070822132428e-06, + "loss": 0.7954098, + "num_input_tokens_seen": 203610080, + "router_z_loss_clip": 2.52148438, + "router_z_loss_mlp": 0.24316406, + "step": 9453, + "time_per_iteration": 2.682722806930542 + }, + { + "auxiliary_loss_clip": 0.01316107, + "auxiliary_loss_mlp": 0.00290299, + "balance_loss_clip": 1.06574893, + "balance_loss_mlp": 0.26502705, + "epoch": 0.5684052307229821, + "flos": 22344481359360.0, + "grad_norm": 6.536207055751518, + "language_loss": 0.76158798, + "learning_rate": 1.6556871687336592e-06, + "loss": 0.77765203, + "num_input_tokens_seen": 203630060, + "router_z_loss_clip": 2.50195312, + "router_z_loss_mlp": 0.25305176, + "step": 9454, + "time_per_iteration": 2.694864511489868 + }, + { + "auxiliary_loss_clip": 0.01310795, + "auxiliary_loss_mlp": 0.00265781, + "balance_loss_clip": 1.06218648, + "balance_loss_mlp": 0.24109288, + "epoch": 0.5684653539756501, + "flos": 21798639308160.0, + "grad_norm": 25.433528840832444, + "language_loss": 0.71283185, + "learning_rate": 1.6553035283912671e-06, + "loss": 0.72859764, + "num_input_tokens_seen": 203649065, + "router_z_loss_clip": 2.48632812, + "router_z_loss_mlp": 0.24694824, + "step": 9455, + "time_per_iteration": 2.659844398498535 + }, + { + "auxiliary_loss_clip": 0.01302446, + "auxiliary_loss_mlp": 0.00272691, + "balance_loss_clip": 1.04994452, + "balance_loss_mlp": 0.24683467, + "epoch": 0.568525477228318, + "flos": 22999635475200.0, + "grad_norm": 19.94370432969129, + "language_loss": 0.80843806, + "learning_rate": 1.6549199011198e-06, + "loss": 0.82418942, + "num_input_tokens_seen": 203667545, + "router_z_loss_clip": 2.52539062, + "router_z_loss_mlp": 0.25842285, + "step": 9456, + "time_per_iteration": 2.6706864833831787 + }, + { + "auxiliary_loss_clip": 0.01325053, + "auxiliary_loss_mlp": 0.00263925, + "balance_loss_clip": 1.07152903, + "balance_loss_mlp": 0.23722225, + "epoch": 0.568585600480986, + "flos": 21392489249280.0, + "grad_norm": 15.111640205889765, + "language_loss": 0.83967823, + "learning_rate": 1.6545362869338048e-06, + "loss": 0.85556805, + "num_input_tokens_seen": 203686025, + "router_z_loss_clip": 2.53515625, + "router_z_loss_mlp": 0.26696777, + "step": 9457, + "time_per_iteration": 2.6673789024353027 + }, + { + "auxiliary_loss_clip": 0.01304369, + "auxiliary_loss_mlp": 0.00281216, + "balance_loss_clip": 1.05604959, + "balance_loss_mlp": 0.2554903, + "epoch": 0.568645723733654, + "flos": 30007351359360.0, + "grad_norm": 25.115316143147936, + "language_loss": 0.73518568, + "learning_rate": 1.6541526858478285e-06, + "loss": 0.75104153, + "num_input_tokens_seen": 203705540, + "router_z_loss_clip": 2.48046875, + "router_z_loss_mlp": 0.25744629, + "step": 9458, + "time_per_iteration": 2.7337021827697754 + }, + { + "auxiliary_loss_clip": 0.01328941, + "auxiliary_loss_mlp": 0.00282073, + "balance_loss_clip": 1.07254362, + "balance_loss_mlp": 0.25430959, + "epoch": 0.568705846986322, + "flos": 20412667077120.0, + "grad_norm": 7.326732004179747, + "language_loss": 0.75705838, + "learning_rate": 1.6537690978764167e-06, + "loss": 0.7731685, + "num_input_tokens_seen": 203723670, + "router_z_loss_clip": 2.56835938, + "router_z_loss_mlp": 0.27770996, + "step": 9459, + "time_per_iteration": 2.72263765335083 + }, + { + "auxiliary_loss_clip": 0.01299973, + "auxiliary_loss_mlp": 0.00278369, + "balance_loss_clip": 1.04824793, + "balance_loss_mlp": 0.25027108, + "epoch": 0.5687659702389899, + "flos": 17456752131840.0, + "grad_norm": 121.47686369401714, + "language_loss": 0.8633337, + "learning_rate": 1.6533855230341155e-06, + "loss": 0.87911713, + "num_input_tokens_seen": 203739705, + "router_z_loss_clip": 2.51953125, + "router_z_loss_mlp": 0.28112793, + "step": 9460, + "time_per_iteration": 2.801523447036743 + }, + { + "auxiliary_loss_clip": 0.01290257, + "auxiliary_loss_mlp": 0.00310331, + "balance_loss_clip": 1.03771722, + "balance_loss_mlp": 0.28094602, + "epoch": 0.5688260934916579, + "flos": 25406081095680.0, + "grad_norm": 2.7884734577236596, + "language_loss": 0.8004809, + "learning_rate": 1.65300196133547e-06, + "loss": 0.81648684, + "num_input_tokens_seen": 203759000, + "router_z_loss_clip": 2.5234375, + "router_z_loss_mlp": 0.29394531, + "step": 9461, + "time_per_iteration": 2.7450075149536133 + }, + { + "auxiliary_loss_clip": 0.01316907, + "auxiliary_loss_mlp": 0.00288091, + "balance_loss_clip": 1.06316674, + "balance_loss_mlp": 0.26099476, + "epoch": 0.5688862167443258, + "flos": 21608024808960.0, + "grad_norm": 19.625152140584518, + "language_loss": 0.78734565, + "learning_rate": 1.6526184127950249e-06, + "loss": 0.80339563, + "num_input_tokens_seen": 203774295, + "router_z_loss_clip": 2.5390625, + "router_z_loss_mlp": 0.27111816, + "step": 9462, + "time_per_iteration": 2.6267518997192383 + }, + { + "auxiliary_loss_clip": 0.0130065, + "auxiliary_loss_mlp": 0.00249225, + "balance_loss_clip": 1.054003, + "balance_loss_mlp": 0.22633702, + "epoch": 0.5689463399969938, + "flos": 22418996123520.0, + "grad_norm": 94.1876034502321, + "language_loss": 0.79880142, + "learning_rate": 1.6522348774273246e-06, + "loss": 0.81430018, + "num_input_tokens_seen": 203792710, + "router_z_loss_clip": 2.46289062, + "router_z_loss_mlp": 0.22888184, + "step": 9463, + "time_per_iteration": 2.65032958984375 + }, + { + "auxiliary_loss_clip": 0.0129141, + "auxiliary_loss_mlp": 0.00290357, + "balance_loss_clip": 1.04111147, + "balance_loss_mlp": 0.26230741, + "epoch": 0.5690064632496618, + "flos": 18296810484480.0, + "grad_norm": 49.21828255166001, + "language_loss": 0.83404845, + "learning_rate": 1.6518513552469123e-06, + "loss": 0.84986609, + "num_input_tokens_seen": 203811645, + "router_z_loss_clip": 2.50390625, + "router_z_loss_mlp": 0.28063965, + "step": 9464, + "time_per_iteration": 2.6440815925598145 + }, + { + "auxiliary_loss_clip": 0.01296681, + "auxiliary_loss_mlp": 0.00237909, + "balance_loss_clip": 1.04321289, + "balance_loss_mlp": 0.21459207, + "epoch": 0.5690665865023298, + "flos": 21579260993280.0, + "grad_norm": 11.180444025351122, + "language_loss": 0.91059643, + "learning_rate": 1.6514678462683312e-06, + "loss": 0.92594236, + "num_input_tokens_seen": 203830040, + "router_z_loss_clip": 2.53710938, + "router_z_loss_mlp": 0.23303223, + "step": 9465, + "time_per_iteration": 2.660050630569458 + }, + { + "auxiliary_loss_clip": 0.0129427, + "auxiliary_loss_mlp": 0.00256649, + "balance_loss_clip": 1.04480171, + "balance_loss_mlp": 0.2328552, + "epoch": 0.5691267097549978, + "flos": 24421446501120.0, + "grad_norm": 57.90247350703802, + "language_loss": 0.80147874, + "learning_rate": 1.651084350506125e-06, + "loss": 0.81698787, + "num_input_tokens_seen": 203851245, + "router_z_loss_clip": 2.4921875, + "router_z_loss_mlp": 0.23803711, + "step": 9466, + "time_per_iteration": 2.67702317237854 + }, + { + "auxiliary_loss_clip": 0.01405969, + "auxiliary_loss_mlp": 0.00109686, + "balance_loss_clip": 1.19092417, + "balance_loss_mlp": 0.10134158, + "epoch": 0.5691868330076657, + "flos": 61657906199040.0, + "grad_norm": 0.7074040969490105, + "language_loss": 0.55030942, + "learning_rate": 1.6507008679748343e-06, + "loss": 0.56546593, + "num_input_tokens_seen": 203916400, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.08349609, + "step": 9467, + "time_per_iteration": 3.1986613273620605 + }, + { + "auxiliary_loss_clip": 0.01289586, + "auxiliary_loss_mlp": 0.00280702, + "balance_loss_clip": 1.04080665, + "balance_loss_mlp": 0.25373673, + "epoch": 0.5692469562603337, + "flos": 21325193118720.0, + "grad_norm": 31.565755790432682, + "language_loss": 0.71478802, + "learning_rate": 1.6503173986890023e-06, + "loss": 0.73049086, + "num_input_tokens_seen": 203935870, + "router_z_loss_clip": 2.49023438, + "router_z_loss_mlp": 0.26953125, + "step": 9468, + "time_per_iteration": 2.641456127166748 + }, + { + "auxiliary_loss_clip": 0.01261873, + "auxiliary_loss_mlp": 0.00305541, + "balance_loss_clip": 1.02219021, + "balance_loss_mlp": 0.28071022, + "epoch": 0.5693070795130016, + "flos": 23367899664000.0, + "grad_norm": 28.13912212400121, + "language_loss": 0.85583103, + "learning_rate": 1.64993394266317e-06, + "loss": 0.87150514, + "num_input_tokens_seen": 203954950, + "router_z_loss_clip": 2.3984375, + "router_z_loss_mlp": 0.24841309, + "step": 9469, + "time_per_iteration": 2.6305646896362305 + }, + { + "auxiliary_loss_clip": 0.01310494, + "auxiliary_loss_mlp": 0.00306173, + "balance_loss_clip": 1.05125451, + "balance_loss_mlp": 0.27668077, + "epoch": 0.5693672027656697, + "flos": 18697250280960.0, + "grad_norm": 77.49361269948493, + "language_loss": 0.79859614, + "learning_rate": 1.6495504999118769e-06, + "loss": 0.81476283, + "num_input_tokens_seen": 203972715, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.2947998, + "step": 9470, + "time_per_iteration": 2.6173484325408936 + }, + { + "auxiliary_loss_clip": 0.01305868, + "auxiliary_loss_mlp": 0.00301514, + "balance_loss_clip": 1.05031204, + "balance_loss_mlp": 0.27593142, + "epoch": 0.5694273260183376, + "flos": 20449188230400.0, + "grad_norm": 19.858114232308477, + "language_loss": 0.81123686, + "learning_rate": 1.6491670704496644e-06, + "loss": 0.82731068, + "num_input_tokens_seen": 203990775, + "router_z_loss_clip": 2.56054688, + "router_z_loss_mlp": 0.25585938, + "step": 9471, + "time_per_iteration": 2.634957790374756 + }, + { + "auxiliary_loss_clip": 0.01308141, + "auxiliary_loss_mlp": 0.00285926, + "balance_loss_clip": 1.0525775, + "balance_loss_mlp": 0.26196447, + "epoch": 0.5694874492710056, + "flos": 17603195880960.0, + "grad_norm": 110.14699795855836, + "language_loss": 0.66684163, + "learning_rate": 1.6487836542910716e-06, + "loss": 0.68278229, + "num_input_tokens_seen": 204008845, + "router_z_loss_clip": 2.5546875, + "router_z_loss_mlp": 0.23974609, + "step": 9472, + "time_per_iteration": 2.6313059329986572 + }, + { + "auxiliary_loss_clip": 0.01319459, + "auxiliary_loss_mlp": 0.00300304, + "balance_loss_clip": 1.06572008, + "balance_loss_mlp": 0.27499565, + "epoch": 0.5695475725236735, + "flos": 13370836250880.0, + "grad_norm": 15.70141945782699, + "language_loss": 0.81522512, + "learning_rate": 1.648400251450638e-06, + "loss": 0.83142275, + "num_input_tokens_seen": 204023755, + "router_z_loss_clip": 2.53515625, + "router_z_loss_mlp": 0.25317383, + "step": 9473, + "time_per_iteration": 2.660992383956909 + }, + { + "auxiliary_loss_clip": 0.01416715, + "auxiliary_loss_mlp": 0.00091686, + "balance_loss_clip": 1.21103048, + "balance_loss_mlp": 0.08119577, + "epoch": 0.5696076957763415, + "flos": 68174398661760.0, + "grad_norm": 0.6430597559514286, + "language_loss": 0.56895971, + "learning_rate": 1.6480168619429023e-06, + "loss": 0.58404368, + "num_input_tokens_seen": 204091255, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.10498047, + "step": 9474, + "time_per_iteration": 4.6294004917144775 + }, + { + "auxiliary_loss_clip": 0.01294932, + "auxiliary_loss_mlp": 0.00348154, + "balance_loss_clip": 1.04890513, + "balance_loss_mlp": 0.32166529, + "epoch": 0.5696678190290094, + "flos": 33838301525760.0, + "grad_norm": 6.543325823096282, + "language_loss": 0.59244287, + "learning_rate": 1.6476334857824017e-06, + "loss": 0.60887372, + "num_input_tokens_seen": 204113285, + "router_z_loss_clip": 2.46484375, + "router_z_loss_mlp": 0.26489258, + "step": 9475, + "time_per_iteration": 4.326059103012085 + }, + { + "auxiliary_loss_clip": 0.01324683, + "auxiliary_loss_mlp": 0.00352173, + "balance_loss_clip": 1.07081735, + "balance_loss_mlp": 0.326078, + "epoch": 0.5697279422816774, + "flos": 26356600748160.0, + "grad_norm": 2255.4122817241014, + "language_loss": 0.85806334, + "learning_rate": 1.647250122983675e-06, + "loss": 0.87483191, + "num_input_tokens_seen": 204133045, + "router_z_loss_clip": 2.53320312, + "router_z_loss_mlp": 0.26086426, + "step": 9476, + "time_per_iteration": 2.712709903717041 + }, + { + "auxiliary_loss_clip": 0.0135273, + "auxiliary_loss_mlp": 0.00342772, + "balance_loss_clip": 1.08844507, + "balance_loss_mlp": 0.3174516, + "epoch": 0.5697880655343454, + "flos": 22930507751040.0, + "grad_norm": 78.07367177482436, + "language_loss": 0.77452779, + "learning_rate": 1.6468667735612592e-06, + "loss": 0.79148287, + "num_input_tokens_seen": 204152590, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.25341797, + "step": 9477, + "time_per_iteration": 4.123754262924194 + }, + { + "auxiliary_loss_clip": 0.01341476, + "auxiliary_loss_mlp": 0.00317424, + "balance_loss_clip": 1.08028007, + "balance_loss_mlp": 0.29156792, + "epoch": 0.5698481887870134, + "flos": 26761314263040.0, + "grad_norm": 4.462141868883917, + "language_loss": 0.77207267, + "learning_rate": 1.6464834375296906e-06, + "loss": 0.78866166, + "num_input_tokens_seen": 204171815, + "router_z_loss_clip": 2.61132812, + "router_z_loss_mlp": 0.25866699, + "step": 9478, + "time_per_iteration": 2.722459554672241 + }, + { + "auxiliary_loss_clip": 0.01292244, + "auxiliary_loss_mlp": 0.0038331, + "balance_loss_clip": 1.05165339, + "balance_loss_mlp": 0.35799026, + "epoch": 0.5699083120396814, + "flos": 15742269089280.0, + "grad_norm": 15.120795227138883, + "language_loss": 0.74234486, + "learning_rate": 1.6461001149035055e-06, + "loss": 0.75910044, + "num_input_tokens_seen": 204188535, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.25341797, + "step": 9479, + "time_per_iteration": 2.6676619052886963 + }, + { + "auxiliary_loss_clip": 0.01326607, + "auxiliary_loss_mlp": 0.0039282, + "balance_loss_clip": 1.07239223, + "balance_loss_mlp": 0.36567605, + "epoch": 0.5699684352923493, + "flos": 19537272720000.0, + "grad_norm": 225.63175567572338, + "language_loss": 0.7866562, + "learning_rate": 1.6457168056972392e-06, + "loss": 0.80385041, + "num_input_tokens_seen": 204208365, + "router_z_loss_clip": 2.54296875, + "router_z_loss_mlp": 0.27160645, + "step": 9480, + "time_per_iteration": 2.6956722736358643 + }, + { + "auxiliary_loss_clip": 0.0134784, + "auxiliary_loss_mlp": 0.00355464, + "balance_loss_clip": 1.09159875, + "balance_loss_mlp": 0.32985812, + "epoch": 0.5700285585450173, + "flos": 16253349753600.0, + "grad_norm": 76.8150939361463, + "language_loss": 0.81486166, + "learning_rate": 1.6453335099254276e-06, + "loss": 0.8318947, + "num_input_tokens_seen": 204226560, + "router_z_loss_clip": 2.56054688, + "router_z_loss_mlp": 0.25622559, + "step": 9481, + "time_per_iteration": 2.625170946121216 + }, + { + "auxiliary_loss_clip": 0.01354189, + "auxiliary_loss_mlp": 0.00401452, + "balance_loss_clip": 1.09387207, + "balance_loss_mlp": 0.37247229, + "epoch": 0.5700886817976852, + "flos": 19864993432320.0, + "grad_norm": 3.5284540583652695, + "language_loss": 0.87036908, + "learning_rate": 1.6449502276026041e-06, + "loss": 0.88792551, + "num_input_tokens_seen": 204245410, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.28979492, + "step": 9482, + "time_per_iteration": 2.746548652648926 + }, + { + "auxiliary_loss_clip": 0.01329214, + "auxiliary_loss_mlp": 0.0036711, + "balance_loss_clip": 1.07598102, + "balance_loss_mlp": 0.34072858, + "epoch": 0.5701488050503533, + "flos": 23841704989440.0, + "grad_norm": 1.5597158092094803, + "language_loss": 0.82578397, + "learning_rate": 1.6445669587433043e-06, + "loss": 0.84274721, + "num_input_tokens_seen": 204264840, + "router_z_loss_clip": 2.53515625, + "router_z_loss_mlp": 0.26379395, + "step": 9483, + "time_per_iteration": 2.654883861541748 + }, + { + "auxiliary_loss_clip": 0.01330615, + "auxiliary_loss_mlp": 0.00408364, + "balance_loss_clip": 1.07795238, + "balance_loss_mlp": 0.37796605, + "epoch": 0.5702089283030212, + "flos": 23659673840640.0, + "grad_norm": 9.050104637697054, + "language_loss": 0.87493742, + "learning_rate": 1.6441837033620612e-06, + "loss": 0.89232719, + "num_input_tokens_seen": 204284335, + "router_z_loss_clip": 2.52734375, + "router_z_loss_mlp": 0.30371094, + "step": 9484, + "time_per_iteration": 4.050048589706421 + }, + { + "auxiliary_loss_clip": 0.01339198, + "auxiliary_loss_mlp": 0.00398812, + "balance_loss_clip": 1.08654952, + "balance_loss_mlp": 0.36846125, + "epoch": 0.5702690515556892, + "flos": 27891171544320.0, + "grad_norm": 2.990230646663289, + "language_loss": 0.68229276, + "learning_rate": 1.6438004614734073e-06, + "loss": 0.69967282, + "num_input_tokens_seen": 204302590, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.30334473, + "step": 9485, + "time_per_iteration": 2.701058864593506 + }, + { + "auxiliary_loss_clip": 0.01329686, + "auxiliary_loss_mlp": 0.00388487, + "balance_loss_clip": 1.07511306, + "balance_loss_mlp": 0.36040118, + "epoch": 0.5703291748083571, + "flos": 24023951619840.0, + "grad_norm": 39.82727547660677, + "language_loss": 0.71266782, + "learning_rate": 1.6434172330918757e-06, + "loss": 0.72984952, + "num_input_tokens_seen": 204323055, + "router_z_loss_clip": 2.546875, + "router_z_loss_mlp": 0.28076172, + "step": 9486, + "time_per_iteration": 2.7277863025665283 + }, + { + "auxiliary_loss_clip": 0.01433845, + "auxiliary_loss_mlp": 0.00202547, + "balance_loss_clip": 1.21974325, + "balance_loss_mlp": 0.19081697, + "epoch": 0.5703892980610251, + "flos": 57023382919680.0, + "grad_norm": 0.6583234978675099, + "language_loss": 0.47621384, + "learning_rate": 1.6430340182319978e-06, + "loss": 0.49257779, + "num_input_tokens_seen": 204386160, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.1171875, + "step": 9487, + "time_per_iteration": 3.2048280239105225 + }, + { + "auxiliary_loss_clip": 0.01340464, + "auxiliary_loss_mlp": 0.00382779, + "balance_loss_clip": 1.08796573, + "balance_loss_mlp": 0.35425258, + "epoch": 0.570449421313693, + "flos": 24351025887360.0, + "grad_norm": 224.7900376330075, + "language_loss": 0.93667763, + "learning_rate": 1.6426508169083067e-06, + "loss": 0.95391005, + "num_input_tokens_seen": 204406315, + "router_z_loss_clip": 2.52539062, + "router_z_loss_mlp": 0.28540039, + "step": 9488, + "time_per_iteration": 2.676804542541504 + }, + { + "auxiliary_loss_clip": 0.01351966, + "auxiliary_loss_mlp": 0.00383859, + "balance_loss_clip": 1.09311223, + "balance_loss_mlp": 0.35450926, + "epoch": 0.570509544566361, + "flos": 24828566227200.0, + "grad_norm": 5.744955700333227, + "language_loss": 0.85704881, + "learning_rate": 1.6422676291353314e-06, + "loss": 0.87440705, + "num_input_tokens_seen": 204427645, + "router_z_loss_clip": 2.58984375, + "router_z_loss_mlp": 0.29370117, + "step": 9489, + "time_per_iteration": 2.703979730606079 + }, + { + "auxiliary_loss_clip": 0.01345636, + "auxiliary_loss_mlp": 0.00307697, + "balance_loss_clip": 1.09204745, + "balance_loss_mlp": 0.2820431, + "epoch": 0.570569667819029, + "flos": 21397301671680.0, + "grad_norm": 11.259031357636708, + "language_loss": 0.7589041, + "learning_rate": 1.641884454927604e-06, + "loss": 0.77543741, + "num_input_tokens_seen": 204445910, + "router_z_loss_clip": 2.53710938, + "router_z_loss_mlp": 0.25622559, + "step": 9490, + "time_per_iteration": 2.62622332572937 + }, + { + "auxiliary_loss_clip": 0.01353393, + "auxiliary_loss_mlp": 0.00375354, + "balance_loss_clip": 1.09863091, + "balance_loss_mlp": 0.34648192, + "epoch": 0.570629791071697, + "flos": 23216751233280.0, + "grad_norm": 5.287637481706508, + "language_loss": 0.81493282, + "learning_rate": 1.6415012942996548e-06, + "loss": 0.83222032, + "num_input_tokens_seen": 204464680, + "router_z_loss_clip": 2.546875, + "router_z_loss_mlp": 0.28881836, + "step": 9491, + "time_per_iteration": 2.675053119659424 + }, + { + "auxiliary_loss_clip": 0.01439145, + "auxiliary_loss_mlp": 0.00203093, + "balance_loss_clip": 1.22674537, + "balance_loss_mlp": 0.19098179, + "epoch": 0.570689914324365, + "flos": 65284666525440.0, + "grad_norm": 0.7736547327408025, + "language_loss": 0.57072335, + "learning_rate": 1.641118147266011e-06, + "loss": 0.58714575, + "num_input_tokens_seen": 204525580, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.12109375, + "step": 9492, + "time_per_iteration": 3.159968376159668 + }, + { + "auxiliary_loss_clip": 0.01344783, + "auxiliary_loss_mlp": 0.00348161, + "balance_loss_clip": 1.09419155, + "balance_loss_mlp": 0.32081428, + "epoch": 0.5707500375770329, + "flos": 21141904993920.0, + "grad_norm": 159.0795834536752, + "language_loss": 0.80841631, + "learning_rate": 1.6407350138412035e-06, + "loss": 0.82534575, + "num_input_tokens_seen": 204541320, + "router_z_loss_clip": 2.5078125, + "router_z_loss_mlp": 0.27307129, + "step": 9493, + "time_per_iteration": 2.6449601650238037 + }, + { + "auxiliary_loss_clip": 0.01359985, + "auxiliary_loss_mlp": 0.0032896, + "balance_loss_clip": 1.10295331, + "balance_loss_mlp": 0.30057675, + "epoch": 0.5708101608297009, + "flos": 20812747737600.0, + "grad_norm": 37.94457608512977, + "language_loss": 0.85849661, + "learning_rate": 1.6403518940397606e-06, + "loss": 0.87538612, + "num_input_tokens_seen": 204560275, + "router_z_loss_clip": 2.57421875, + "router_z_loss_mlp": 0.28405762, + "step": 9494, + "time_per_iteration": 2.696981430053711 + }, + { + "auxiliary_loss_clip": 0.01351188, + "auxiliary_loss_mlp": 0.00335562, + "balance_loss_clip": 1.09187388, + "balance_loss_mlp": 0.30621314, + "epoch": 0.5708702840823688, + "flos": 25812338895360.0, + "grad_norm": 60.419998439698126, + "language_loss": 0.88183045, + "learning_rate": 1.6399687878762096e-06, + "loss": 0.89869791, + "num_input_tokens_seen": 204579430, + "router_z_loss_clip": 2.59179688, + "router_z_loss_mlp": 0.29345703, + "step": 9495, + "time_per_iteration": 2.745171546936035 + }, + { + "auxiliary_loss_clip": 0.01355562, + "auxiliary_loss_mlp": 0.00319951, + "balance_loss_clip": 1.09664178, + "balance_loss_mlp": 0.29100677, + "epoch": 0.5709304073350369, + "flos": 23651916503040.0, + "grad_norm": 14.473053025810746, + "language_loss": 0.74844027, + "learning_rate": 1.6395856953650784e-06, + "loss": 0.76519537, + "num_input_tokens_seen": 204597710, + "router_z_loss_clip": 2.5859375, + "router_z_loss_mlp": 0.28918457, + "step": 9496, + "time_per_iteration": 2.6850390434265137 + }, + { + "auxiliary_loss_clip": 0.01341217, + "auxiliary_loss_mlp": 0.00349954, + "balance_loss_clip": 1.08458924, + "balance_loss_mlp": 0.31979445, + "epoch": 0.5709905305877048, + "flos": 16107552449280.0, + "grad_norm": 12326.858336638337, + "language_loss": 0.76583546, + "learning_rate": 1.6392026165208938e-06, + "loss": 0.78274715, + "num_input_tokens_seen": 204616140, + "router_z_loss_clip": 2.56640625, + "router_z_loss_mlp": 0.30175781, + "step": 9497, + "time_per_iteration": 2.6623964309692383 + }, + { + "auxiliary_loss_clip": 0.01361824, + "auxiliary_loss_mlp": 0.00304502, + "balance_loss_clip": 1.10146129, + "balance_loss_mlp": 0.27560568, + "epoch": 0.5710506538403728, + "flos": 24750819239040.0, + "grad_norm": 30.071309928808784, + "language_loss": 0.88584507, + "learning_rate": 1.638819551358182e-06, + "loss": 0.90250826, + "num_input_tokens_seen": 204636470, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.28857422, + "step": 9498, + "time_per_iteration": 2.7057313919067383 + }, + { + "auxiliary_loss_clip": 0.01324074, + "auxiliary_loss_mlp": 0.00342722, + "balance_loss_clip": 1.07379317, + "balance_loss_mlp": 0.31319362, + "epoch": 0.5711107770930407, + "flos": 21982250655360.0, + "grad_norm": 102.81284430570484, + "language_loss": 0.73993468, + "learning_rate": 1.638436499891469e-06, + "loss": 0.75660264, + "num_input_tokens_seen": 204656640, + "router_z_loss_clip": 2.5, + "router_z_loss_mlp": 0.29553223, + "step": 9499, + "time_per_iteration": 2.6671741008758545 + }, + { + "auxiliary_loss_clip": 0.01363616, + "auxiliary_loss_mlp": 0.00312856, + "balance_loss_clip": 1.09932208, + "balance_loss_mlp": 0.28527075, + "epoch": 0.5711709003457087, + "flos": 19574009354880.0, + "grad_norm": 53.54428577052524, + "language_loss": 0.78909451, + "learning_rate": 1.6380534621352805e-06, + "loss": 0.80585927, + "num_input_tokens_seen": 204675475, + "router_z_loss_clip": 2.6484375, + "router_z_loss_mlp": 0.27587891, + "step": 9500, + "time_per_iteration": 2.654447317123413 + }, + { + "auxiliary_loss_clip": 0.01358698, + "auxiliary_loss_mlp": 0.00364738, + "balance_loss_clip": 1.09890401, + "balance_loss_mlp": 0.33467335, + "epoch": 0.5712310235983766, + "flos": 24242683489920.0, + "grad_norm": 8.367776075202885, + "language_loss": 0.84180695, + "learning_rate": 1.6376704381041407e-06, + "loss": 0.85904133, + "num_input_tokens_seen": 204695385, + "router_z_loss_clip": 2.59960938, + "router_z_loss_mlp": 0.30078125, + "step": 9501, + "time_per_iteration": 2.682466983795166 + }, + { + "auxiliary_loss_clip": 0.01358178, + "auxiliary_loss_mlp": 0.00309607, + "balance_loss_clip": 1.09803677, + "balance_loss_mlp": 0.28508598, + "epoch": 0.5712911468510447, + "flos": 20996143603200.0, + "grad_norm": 108.03066598388784, + "language_loss": 0.8156538, + "learning_rate": 1.6372874278125742e-06, + "loss": 0.83233166, + "num_input_tokens_seen": 204714730, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.24511719, + "step": 9502, + "time_per_iteration": 2.6480491161346436 + }, + { + "auxiliary_loss_clip": 0.01335764, + "auxiliary_loss_mlp": 0.00307576, + "balance_loss_clip": 1.08705187, + "balance_loss_mlp": 0.28055131, + "epoch": 0.5713512701037126, + "flos": 18916987731840.0, + "grad_norm": 155.40648795298839, + "language_loss": 0.88948667, + "learning_rate": 1.636904431275105e-06, + "loss": 0.90591997, + "num_input_tokens_seen": 204735025, + "router_z_loss_clip": 2.49023438, + "router_z_loss_mlp": 0.27062988, + "step": 9503, + "time_per_iteration": 2.642984628677368 + }, + { + "auxiliary_loss_clip": 0.01343, + "auxiliary_loss_mlp": 0.00293592, + "balance_loss_clip": 1.08624268, + "balance_loss_mlp": 0.26578006, + "epoch": 0.5714113933563806, + "flos": 17413443308160.0, + "grad_norm": 7.2866724882229645, + "language_loss": 0.96269667, + "learning_rate": 1.6365214485062553e-06, + "loss": 0.97906256, + "num_input_tokens_seen": 204751365, + "router_z_loss_clip": 2.56835938, + "router_z_loss_mlp": 0.27832031, + "step": 9504, + "time_per_iteration": 2.6293106079101562 + }, + { + "auxiliary_loss_clip": 0.01343722, + "auxiliary_loss_mlp": 0.00296836, + "balance_loss_clip": 1.08924603, + "balance_loss_mlp": 0.27035937, + "epoch": 0.5714715166090486, + "flos": 20193360589440.0, + "grad_norm": 22.596502201681204, + "language_loss": 0.82424343, + "learning_rate": 1.6361384795205496e-06, + "loss": 0.84064901, + "num_input_tokens_seen": 204768980, + "router_z_loss_clip": 2.54492188, + "router_z_loss_mlp": 0.26464844, + "step": 9505, + "time_per_iteration": 2.6737210750579834 + }, + { + "auxiliary_loss_clip": 0.01369708, + "auxiliary_loss_mlp": 0.00303882, + "balance_loss_clip": 1.10944605, + "balance_loss_mlp": 0.27770334, + "epoch": 0.5715316398617165, + "flos": 18551668458240.0, + "grad_norm": 98.29166717122054, + "language_loss": 0.88212526, + "learning_rate": 1.635755524332509e-06, + "loss": 0.89886117, + "num_input_tokens_seen": 204788110, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.26171875, + "step": 9506, + "time_per_iteration": 2.6810691356658936 + }, + { + "auxiliary_loss_clip": 0.01357727, + "auxiliary_loss_mlp": 0.00307707, + "balance_loss_clip": 1.09893024, + "balance_loss_mlp": 0.27944213, + "epoch": 0.5715917631143845, + "flos": 18478195188480.0, + "grad_norm": 14.532586995090767, + "language_loss": 0.86623013, + "learning_rate": 1.6353725829566552e-06, + "loss": 0.8828845, + "num_input_tokens_seen": 204807240, + "router_z_loss_clip": 2.58984375, + "router_z_loss_mlp": 0.28271484, + "step": 9507, + "time_per_iteration": 2.6681888103485107 + }, + { + "auxiliary_loss_clip": 0.01345845, + "auxiliary_loss_mlp": 0.002943, + "balance_loss_clip": 1.08564341, + "balance_loss_mlp": 0.26642883, + "epoch": 0.5716518863670524, + "flos": 24020037037440.0, + "grad_norm": 6.659474830415826, + "language_loss": 0.76015747, + "learning_rate": 1.63498965540751e-06, + "loss": 0.77655894, + "num_input_tokens_seen": 204826415, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.27893066, + "step": 9508, + "time_per_iteration": 2.6319730281829834 + }, + { + "auxiliary_loss_clip": 0.01361769, + "auxiliary_loss_mlp": 0.00276886, + "balance_loss_clip": 1.09930491, + "balance_loss_mlp": 0.24722669, + "epoch": 0.5717120096197205, + "flos": 17819485626240.0, + "grad_norm": 11.951729990130966, + "language_loss": 0.87263107, + "learning_rate": 1.634606741699593e-06, + "loss": 0.88901758, + "num_input_tokens_seen": 204844305, + "router_z_loss_clip": 2.62109375, + "router_z_loss_mlp": 0.29675293, + "step": 9509, + "time_per_iteration": 2.6040306091308594 + }, + { + "auxiliary_loss_clip": 0.01352355, + "auxiliary_loss_mlp": 0.00283183, + "balance_loss_clip": 1.09530318, + "balance_loss_mlp": 0.2560035, + "epoch": 0.5717721328723884, + "flos": 21866043179520.0, + "grad_norm": 3.8173765649439244, + "language_loss": 0.82629591, + "learning_rate": 1.6342238418474255e-06, + "loss": 0.84265131, + "num_input_tokens_seen": 204861765, + "router_z_loss_clip": 2.57226562, + "router_z_loss_mlp": 0.27185059, + "step": 9510, + "time_per_iteration": 2.6204874515533447 + }, + { + "auxiliary_loss_clip": 0.01334117, + "auxiliary_loss_mlp": 0.00273615, + "balance_loss_clip": 1.0825187, + "balance_loss_mlp": 0.24581516, + "epoch": 0.5718322561250564, + "flos": 28437624126720.0, + "grad_norm": 3.3909713837631066, + "language_loss": 0.75253379, + "learning_rate": 1.6338409558655264e-06, + "loss": 0.76861107, + "num_input_tokens_seen": 204882505, + "router_z_loss_clip": 2.515625, + "router_z_loss_mlp": 0.2779541, + "step": 9511, + "time_per_iteration": 2.7327511310577393 + }, + { + "auxiliary_loss_clip": 0.01333541, + "auxiliary_loss_mlp": 0.00284244, + "balance_loss_clip": 1.0781579, + "balance_loss_mlp": 0.25872117, + "epoch": 0.5718923793777243, + "flos": 13551825905280.0, + "grad_norm": 470.8500064530821, + "language_loss": 0.70566094, + "learning_rate": 1.6334580837684152e-06, + "loss": 0.72183877, + "num_input_tokens_seen": 204899830, + "router_z_loss_clip": 2.5546875, + "router_z_loss_mlp": 0.25537109, + "step": 9512, + "time_per_iteration": 2.593287706375122 + }, + { + "auxiliary_loss_clip": 0.01333836, + "auxiliary_loss_mlp": 0.00280529, + "balance_loss_clip": 1.07535648, + "balance_loss_mlp": 0.25468427, + "epoch": 0.5719525026303923, + "flos": 17822035491840.0, + "grad_norm": 11.296060750722116, + "language_loss": 0.84042168, + "learning_rate": 1.6330752255706104e-06, + "loss": 0.85656536, + "num_input_tokens_seen": 204918100, + "router_z_loss_clip": 2.58398438, + "router_z_loss_mlp": 0.25878906, + "step": 9513, + "time_per_iteration": 2.6220977306365967 + }, + { + "auxiliary_loss_clip": 0.01422899, + "auxiliary_loss_mlp": 0.0016482, + "balance_loss_clip": 1.19331074, + "balance_loss_mlp": 0.15699957, + "epoch": 0.5720126258830602, + "flos": 61298042814720.0, + "grad_norm": 0.8911890332407729, + "language_loss": 0.66573429, + "learning_rate": 1.6326923812866288e-06, + "loss": 0.68161148, + "num_input_tokens_seen": 204972925, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.078125, + "step": 9514, + "time_per_iteration": 3.1294803619384766 + }, + { + "auxiliary_loss_clip": 0.01349155, + "auxiliary_loss_mlp": 0.00299231, + "balance_loss_clip": 1.08592618, + "balance_loss_mlp": 0.26877257, + "epoch": 0.5720727491357283, + "flos": 23988040997760.0, + "grad_norm": 2.211867238452027, + "language_loss": 0.86997932, + "learning_rate": 1.63230955093099e-06, + "loss": 0.88646317, + "num_input_tokens_seen": 204990910, + "router_z_loss_clip": 2.63085938, + "router_z_loss_mlp": 0.30419922, + "step": 9515, + "time_per_iteration": 2.684434652328491 + }, + { + "auxiliary_loss_clip": 0.01330651, + "auxiliary_loss_mlp": 0.00253191, + "balance_loss_clip": 1.07384479, + "balance_loss_mlp": 0.22558217, + "epoch": 0.5721328723883962, + "flos": 23405426398080.0, + "grad_norm": 203.59346565579256, + "language_loss": 0.92466652, + "learning_rate": 1.6319267345182092e-06, + "loss": 0.94050497, + "num_input_tokens_seen": 205010500, + "router_z_loss_clip": 2.56640625, + "router_z_loss_mlp": 0.27587891, + "step": 9516, + "time_per_iteration": 4.076708555221558 + }, + { + "auxiliary_loss_clip": 0.01361589, + "auxiliary_loss_mlp": 0.00254155, + "balance_loss_clip": 1.09504128, + "balance_loss_mlp": 0.2270588, + "epoch": 0.5721929956410642, + "flos": 18804910320000.0, + "grad_norm": 5.084058432857291, + "language_loss": 0.94129419, + "learning_rate": 1.6315439320628038e-06, + "loss": 0.95745164, + "num_input_tokens_seen": 205028560, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.27087402, + "step": 9517, + "time_per_iteration": 4.194195985794067 + }, + { + "auxiliary_loss_clip": 0.0132638, + "auxiliary_loss_mlp": 0.00278718, + "balance_loss_clip": 1.07069898, + "balance_loss_mlp": 0.25068003, + "epoch": 0.5722531188937322, + "flos": 27196659100800.0, + "grad_norm": 5.284611898758786, + "language_loss": 0.91773283, + "learning_rate": 1.6311611435792893e-06, + "loss": 0.93378377, + "num_input_tokens_seen": 205048650, + "router_z_loss_clip": 2.55664062, + "router_z_loss_mlp": 0.28027344, + "step": 9518, + "time_per_iteration": 2.7196240425109863 + }, + { + "auxiliary_loss_clip": 0.01325761, + "auxiliary_loss_mlp": 0.00282893, + "balance_loss_clip": 1.07167315, + "balance_loss_mlp": 0.25597551, + "epoch": 0.5723132421464001, + "flos": 15195672852480.0, + "grad_norm": 9.13968363804584, + "language_loss": 0.85878414, + "learning_rate": 1.6307783690821812e-06, + "loss": 0.87487066, + "num_input_tokens_seen": 205066480, + "router_z_loss_clip": 2.53710938, + "router_z_loss_mlp": 0.26940918, + "step": 9519, + "time_per_iteration": 2.652838706970215 + }, + { + "auxiliary_loss_clip": 0.01334095, + "auxiliary_loss_mlp": 0.00237098, + "balance_loss_clip": 1.07602048, + "balance_loss_mlp": 0.21087204, + "epoch": 0.5723733653990681, + "flos": 27599433281280.0, + "grad_norm": 11.343094214285646, + "language_loss": 0.87226212, + "learning_rate": 1.6303956085859944e-06, + "loss": 0.88797402, + "num_input_tokens_seen": 205087475, + "router_z_loss_clip": 2.58007812, + "router_z_loss_mlp": 0.26245117, + "step": 9520, + "time_per_iteration": 4.167596340179443 + }, + { + "auxiliary_loss_clip": 0.01361056, + "auxiliary_loss_mlp": 0.00236332, + "balance_loss_clip": 1.09099627, + "balance_loss_mlp": 0.20873488, + "epoch": 0.572433488651736, + "flos": 18222870337920.0, + "grad_norm": 8.052457681472598, + "language_loss": 0.82560736, + "learning_rate": 1.630012862105243e-06, + "loss": 0.84158123, + "num_input_tokens_seen": 205106495, + "router_z_loss_clip": 2.69726562, + "router_z_loss_mlp": 0.27600098, + "step": 9521, + "time_per_iteration": 2.7077300548553467 + }, + { + "auxiliary_loss_clip": 0.01346006, + "auxiliary_loss_mlp": 0.00252856, + "balance_loss_clip": 1.08632255, + "balance_loss_mlp": 0.22529477, + "epoch": 0.5724936119044041, + "flos": 31249106484480.0, + "grad_norm": 4.095077696755133, + "language_loss": 0.86081707, + "learning_rate": 1.6296301296544415e-06, + "loss": 0.87680572, + "num_input_tokens_seen": 205128285, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.27563477, + "step": 9522, + "time_per_iteration": 2.8005332946777344 + }, + { + "auxiliary_loss_clip": 0.01347392, + "auxiliary_loss_mlp": 0.00215655, + "balance_loss_clip": 1.08900321, + "balance_loss_mlp": 0.19054967, + "epoch": 0.572553735157072, + "flos": 19202189719680.0, + "grad_norm": 15.915097068686556, + "language_loss": 0.76532429, + "learning_rate": 1.629247411248102e-06, + "loss": 0.78095484, + "num_input_tokens_seen": 205146595, + "router_z_loss_clip": 2.58398438, + "router_z_loss_mlp": 0.25109863, + "step": 9523, + "time_per_iteration": 2.6410491466522217 + }, + { + "auxiliary_loss_clip": 0.01322936, + "auxiliary_loss_mlp": 0.00225835, + "balance_loss_clip": 1.06866109, + "balance_loss_mlp": 0.20103897, + "epoch": 0.57261385840974, + "flos": 21214911386880.0, + "grad_norm": 15.255690801638206, + "language_loss": 0.77889961, + "learning_rate": 1.628864706900738e-06, + "loss": 0.79438734, + "num_input_tokens_seen": 205164295, + "router_z_loss_clip": 2.54101562, + "router_z_loss_mlp": 0.24804688, + "step": 9524, + "time_per_iteration": 2.6587541103363037 + }, + { + "auxiliary_loss_clip": 0.01345013, + "auxiliary_loss_mlp": 0.00254935, + "balance_loss_clip": 1.08650279, + "balance_loss_mlp": 0.22744536, + "epoch": 0.5726739816624079, + "flos": 33984529793280.0, + "grad_norm": 4.518755316924043, + "language_loss": 0.72946775, + "learning_rate": 1.6284820166268615e-06, + "loss": 0.74546725, + "num_input_tokens_seen": 205185380, + "router_z_loss_clip": 2.58789062, + "router_z_loss_mlp": 0.27490234, + "step": 9525, + "time_per_iteration": 2.7383334636688232 + }, + { + "auxiliary_loss_clip": 0.01360557, + "auxiliary_loss_mlp": 0.00233057, + "balance_loss_clip": 1.09593034, + "balance_loss_mlp": 0.2060678, + "epoch": 0.5727341049150759, + "flos": 24275972419200.0, + "grad_norm": 3.4369239593359953, + "language_loss": 0.81748772, + "learning_rate": 1.628099340440984e-06, + "loss": 0.83342385, + "num_input_tokens_seen": 205204895, + "router_z_loss_clip": 2.64648438, + "router_z_loss_mlp": 0.27001953, + "step": 9526, + "time_per_iteration": 4.064497470855713 + }, + { + "auxiliary_loss_clip": 0.01357016, + "auxiliary_loss_mlp": 0.00219269, + "balance_loss_clip": 1.09712124, + "balance_loss_mlp": 0.19173166, + "epoch": 0.5727942281677438, + "flos": 28400564269440.0, + "grad_norm": 247.6914980324869, + "language_loss": 0.89040506, + "learning_rate": 1.6277166783576176e-06, + "loss": 0.90616786, + "num_input_tokens_seen": 205223440, + "router_z_loss_clip": 2.59960938, + "router_z_loss_mlp": 0.2755127, + "step": 9527, + "time_per_iteration": 2.76456618309021 + }, + { + "auxiliary_loss_clip": 0.01364645, + "auxiliary_loss_mlp": 0.00241472, + "balance_loss_clip": 1.1015451, + "balance_loss_mlp": 0.21212234, + "epoch": 0.5728543514204119, + "flos": 19536769929600.0, + "grad_norm": 11.699298049230817, + "language_loss": 0.81152493, + "learning_rate": 1.6273340303912713e-06, + "loss": 0.82758611, + "num_input_tokens_seen": 205242800, + "router_z_loss_clip": 2.63085938, + "router_z_loss_mlp": 0.29345703, + "step": 9528, + "time_per_iteration": 2.6819679737091064 + }, + { + "auxiliary_loss_clip": 0.0137655, + "auxiliary_loss_mlp": 0.00249637, + "balance_loss_clip": 1.11085021, + "balance_loss_mlp": 0.22148009, + "epoch": 0.5729144746730798, + "flos": 21506757390720.0, + "grad_norm": 12.709092455020905, + "language_loss": 0.94039983, + "learning_rate": 1.6269513965564557e-06, + "loss": 0.9566617, + "num_input_tokens_seen": 205259465, + "router_z_loss_clip": 2.66015625, + "router_z_loss_mlp": 0.28137207, + "step": 9529, + "time_per_iteration": 2.653095006942749 + }, + { + "auxiliary_loss_clip": 0.01459209, + "auxiliary_loss_mlp": 0.00223592, + "balance_loss_clip": 1.22640824, + "balance_loss_mlp": 0.21424632, + "epoch": 0.5729745979257478, + "flos": 58681628242560.0, + "grad_norm": 0.7562193965533952, + "language_loss": 0.55679214, + "learning_rate": 1.6265687768676813e-06, + "loss": 0.5736202, + "num_input_tokens_seen": 205314100, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.09326172, + "step": 9530, + "time_per_iteration": 2.988734483718872 + }, + { + "auxiliary_loss_clip": 0.0138773, + "auxiliary_loss_mlp": 0.00264759, + "balance_loss_clip": 1.12250495, + "balance_loss_mlp": 0.23539755, + "epoch": 0.5730347211784158, + "flos": 18552099421440.0, + "grad_norm": 11.94689945847789, + "language_loss": 0.75279456, + "learning_rate": 1.6261861713394553e-06, + "loss": 0.76931942, + "num_input_tokens_seen": 205333420, + "router_z_loss_clip": 2.65039062, + "router_z_loss_mlp": 0.29394531, + "step": 9531, + "time_per_iteration": 2.671395778656006 + }, + { + "auxiliary_loss_clip": 0.01384701, + "auxiliary_loss_mlp": 0.00279363, + "balance_loss_clip": 1.116436, + "balance_loss_mlp": 0.24916689, + "epoch": 0.5730948444310837, + "flos": 38031482396160.0, + "grad_norm": 1258.5065325121684, + "language_loss": 0.82803243, + "learning_rate": 1.6258035799862876e-06, + "loss": 0.84467304, + "num_input_tokens_seen": 205350995, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.30187988, + "step": 9532, + "time_per_iteration": 2.750115394592285 + }, + { + "auxiliary_loss_clip": 0.01363342, + "auxiliary_loss_mlp": 0.00227822, + "balance_loss_clip": 1.09825599, + "balance_loss_mlp": 0.19929492, + "epoch": 0.5731549676837517, + "flos": 25227066689280.0, + "grad_norm": 20.43126552580118, + "language_loss": 0.84921956, + "learning_rate": 1.625421002822686e-06, + "loss": 0.86513114, + "num_input_tokens_seen": 205372675, + "router_z_loss_clip": 2.65039062, + "router_z_loss_mlp": 0.28503418, + "step": 9533, + "time_per_iteration": 2.712489128112793 + }, + { + "auxiliary_loss_clip": 0.01358513, + "auxiliary_loss_mlp": 0.00211721, + "balance_loss_clip": 1.09895682, + "balance_loss_mlp": 0.18163224, + "epoch": 0.5732150909364196, + "flos": 23368222886400.0, + "grad_norm": 2.5541247125560202, + "language_loss": 0.92493248, + "learning_rate": 1.6250384398631574e-06, + "loss": 0.94063479, + "num_input_tokens_seen": 205392590, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.30102539, + "step": 9534, + "time_per_iteration": 2.670846462249756 + }, + { + "auxiliary_loss_clip": 0.0138619, + "auxiliary_loss_mlp": 0.00230703, + "balance_loss_clip": 1.12091351, + "balance_loss_mlp": 0.20252213, + "epoch": 0.5732752141890877, + "flos": 23079357711360.0, + "grad_norm": 12.611559435704374, + "language_loss": 0.82256722, + "learning_rate": 1.6246558911222085e-06, + "loss": 0.83873618, + "num_input_tokens_seen": 205414885, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.28198242, + "step": 9535, + "time_per_iteration": 2.72548770904541 + }, + { + "auxiliary_loss_clip": 0.01381818, + "auxiliary_loss_mlp": 0.00236025, + "balance_loss_clip": 1.11702013, + "balance_loss_mlp": 0.2060795, + "epoch": 0.5733353374417556, + "flos": 24352282863360.0, + "grad_norm": 55.32465278951884, + "language_loss": 0.77335292, + "learning_rate": 1.624273356614346e-06, + "loss": 0.78953135, + "num_input_tokens_seen": 205434440, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.29956055, + "step": 9536, + "time_per_iteration": 2.7011988162994385 + }, + { + "auxiliary_loss_clip": 0.0137459, + "auxiliary_loss_mlp": 0.00233286, + "balance_loss_clip": 1.11618888, + "balance_loss_mlp": 0.20515308, + "epoch": 0.5733954606944236, + "flos": 27198849830400.0, + "grad_norm": 14.565433716013404, + "language_loss": 0.77680612, + "learning_rate": 1.6238908363540755e-06, + "loss": 0.79288483, + "num_input_tokens_seen": 205454225, + "router_z_loss_clip": 2.58789062, + "router_z_loss_mlp": 0.28125, + "step": 9537, + "time_per_iteration": 2.6793501377105713 + }, + { + "auxiliary_loss_clip": 0.01369539, + "auxiliary_loss_mlp": 0.00230834, + "balance_loss_clip": 1.10595846, + "balance_loss_mlp": 0.2006862, + "epoch": 0.5734555839470915, + "flos": 28765129357440.0, + "grad_norm": 1148.3882537543811, + "language_loss": 0.71784836, + "learning_rate": 1.623508330355902e-06, + "loss": 0.73385209, + "num_input_tokens_seen": 205474750, + "router_z_loss_clip": 2.63476562, + "router_z_loss_mlp": 0.30187988, + "step": 9538, + "time_per_iteration": 2.732377529144287 + }, + { + "auxiliary_loss_clip": 0.01384449, + "auxiliary_loss_mlp": 0.00255321, + "balance_loss_clip": 1.12226725, + "balance_loss_mlp": 0.22594817, + "epoch": 0.5735157071997595, + "flos": 22966813422720.0, + "grad_norm": 118.42874088718968, + "language_loss": 0.89693344, + "learning_rate": 1.6231258386343306e-06, + "loss": 0.91333115, + "num_input_tokens_seen": 205495495, + "router_z_loss_clip": 2.625, + "router_z_loss_mlp": 0.29345703, + "step": 9539, + "time_per_iteration": 2.7041244506835938 + }, + { + "auxiliary_loss_clip": 0.01383451, + "auxiliary_loss_mlp": 0.00240151, + "balance_loss_clip": 1.11889708, + "balance_loss_mlp": 0.21096882, + "epoch": 0.5735758304524274, + "flos": 18989455420800.0, + "grad_norm": 15.200319554989989, + "language_loss": 0.8299011, + "learning_rate": 1.6227433612038647e-06, + "loss": 0.84613705, + "num_input_tokens_seen": 205510070, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.29162598, + "step": 9540, + "time_per_iteration": 2.650118350982666 + }, + { + "auxiliary_loss_clip": 0.01370703, + "auxiliary_loss_mlp": 0.00250578, + "balance_loss_clip": 1.11461413, + "balance_loss_mlp": 0.22059718, + "epoch": 0.5736359537050955, + "flos": 28397942576640.0, + "grad_norm": 10.681080444664913, + "language_loss": 0.84450078, + "learning_rate": 1.6223608980790089e-06, + "loss": 0.8607136, + "num_input_tokens_seen": 205530190, + "router_z_loss_clip": 2.55859375, + "router_z_loss_mlp": 0.29992676, + "step": 9541, + "time_per_iteration": 2.7138330936431885 + }, + { + "auxiliary_loss_clip": 0.01394878, + "auxiliary_loss_mlp": 0.00238416, + "balance_loss_clip": 1.13065362, + "balance_loss_mlp": 0.2085655, + "epoch": 0.5736960769577634, + "flos": 15627210848640.0, + "grad_norm": 31.663419585975785, + "language_loss": 0.74856567, + "learning_rate": 1.6219784492742654e-06, + "loss": 0.76489854, + "num_input_tokens_seen": 205547380, + "router_z_loss_clip": 2.63671875, + "router_z_loss_mlp": 0.29833984, + "step": 9542, + "time_per_iteration": 2.68572735786438 + }, + { + "auxiliary_loss_clip": 0.01385606, + "auxiliary_loss_mlp": 0.00248353, + "balance_loss_clip": 1.12664723, + "balance_loss_mlp": 0.21899202, + "epoch": 0.5737562002104314, + "flos": 18003994813440.0, + "grad_norm": 108.7950036544467, + "language_loss": 0.92444474, + "learning_rate": 1.6215960148041365e-06, + "loss": 0.94078434, + "num_input_tokens_seen": 205566540, + "router_z_loss_clip": 2.59179688, + "router_z_loss_mlp": 0.2935791, + "step": 9543, + "time_per_iteration": 2.59015154838562 + }, + { + "auxiliary_loss_clip": 0.01393686, + "auxiliary_loss_mlp": 0.00240486, + "balance_loss_clip": 1.12789643, + "balance_loss_mlp": 0.20910968, + "epoch": 0.5738163234630994, + "flos": 20698192287360.0, + "grad_norm": 67.42952589647957, + "language_loss": 0.80983889, + "learning_rate": 1.6212135946831257e-06, + "loss": 0.82618058, + "num_input_tokens_seen": 205584200, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.3137207, + "step": 9544, + "time_per_iteration": 2.6574549674987793 + }, + { + "auxiliary_loss_clip": 0.01388467, + "auxiliary_loss_mlp": 0.00233094, + "balance_loss_clip": 1.1252346, + "balance_loss_mlp": 0.2046392, + "epoch": 0.5738764467157673, + "flos": 23149311448320.0, + "grad_norm": 1.6998482286827814, + "language_loss": 0.8388555, + "learning_rate": 1.620831188925733e-06, + "loss": 0.85507113, + "num_input_tokens_seen": 205604675, + "router_z_loss_clip": 2.6328125, + "router_z_loss_mlp": 0.2845459, + "step": 9545, + "time_per_iteration": 2.6500871181488037 + }, + { + "auxiliary_loss_clip": 0.01406756, + "auxiliary_loss_mlp": 0.00202041, + "balance_loss_clip": 1.13750863, + "balance_loss_mlp": 0.17190519, + "epoch": 0.5739365699684353, + "flos": 29492930730240.0, + "grad_norm": 135.88904052877007, + "language_loss": 0.64654195, + "learning_rate": 1.620448797546459e-06, + "loss": 0.6626299, + "num_input_tokens_seen": 205624680, + "router_z_loss_clip": 2.69140625, + "router_z_loss_mlp": 0.30126953, + "step": 9546, + "time_per_iteration": 2.711576461791992 + }, + { + "auxiliary_loss_clip": 0.01389377, + "auxiliary_loss_mlp": 0.00221025, + "balance_loss_clip": 1.12781167, + "balance_loss_mlp": 0.19162774, + "epoch": 0.5739966932211032, + "flos": 14027247342720.0, + "grad_norm": 62.84148774257563, + "language_loss": 0.87907207, + "learning_rate": 1.6200664205598055e-06, + "loss": 0.89517611, + "num_input_tokens_seen": 205641950, + "router_z_loss_clip": 2.6171875, + "router_z_loss_mlp": 0.29394531, + "step": 9547, + "time_per_iteration": 2.620549440383911 + }, + { + "auxiliary_loss_clip": 0.01396562, + "auxiliary_loss_mlp": 0.00237875, + "balance_loss_clip": 1.13270974, + "balance_loss_mlp": 0.20635562, + "epoch": 0.5740568164737713, + "flos": 19062030850560.0, + "grad_norm": 174.7704824548318, + "language_loss": 0.84241855, + "learning_rate": 1.6196840579802704e-06, + "loss": 0.85876292, + "num_input_tokens_seen": 205660130, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.31542969, + "step": 9548, + "time_per_iteration": 2.6797969341278076 + }, + { + "auxiliary_loss_clip": 0.01405027, + "auxiliary_loss_mlp": 0.0023569, + "balance_loss_clip": 1.14042485, + "balance_loss_mlp": 0.20460069, + "epoch": 0.5741169397264392, + "flos": 22127832478080.0, + "grad_norm": 20.424306857333015, + "language_loss": 0.78682894, + "learning_rate": 1.619301709822355e-06, + "loss": 0.80323613, + "num_input_tokens_seen": 205678895, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.31079102, + "step": 9549, + "time_per_iteration": 2.6539008617401123 + }, + { + "auxiliary_loss_clip": 0.01410693, + "auxiliary_loss_mlp": 0.00225376, + "balance_loss_clip": 1.14748526, + "balance_loss_mlp": 0.19333249, + "epoch": 0.5741770629791072, + "flos": 24936836797440.0, + "grad_norm": 40.080706764437174, + "language_loss": 0.83825648, + "learning_rate": 1.6189193761005564e-06, + "loss": 0.85461712, + "num_input_tokens_seen": 205698450, + "router_z_loss_clip": 2.63476562, + "router_z_loss_mlp": 0.3203125, + "step": 9550, + "time_per_iteration": 2.7480530738830566 + }, + { + "auxiliary_loss_clip": 0.01422961, + "auxiliary_loss_mlp": 0.00240256, + "balance_loss_clip": 1.15678501, + "balance_loss_mlp": 0.20745005, + "epoch": 0.5742371862317751, + "flos": 18801462614400.0, + "grad_norm": 48.580923978839984, + "language_loss": 0.76033056, + "learning_rate": 1.6185370568293727e-06, + "loss": 0.77696276, + "num_input_tokens_seen": 205714870, + "router_z_loss_clip": 2.66210938, + "router_z_loss_mlp": 0.32763672, + "step": 9551, + "time_per_iteration": 2.622014045715332 + }, + { + "auxiliary_loss_clip": 0.01414234, + "auxiliary_loss_mlp": 0.00231727, + "balance_loss_clip": 1.14498878, + "balance_loss_mlp": 0.20073237, + "epoch": 0.5742973094844431, + "flos": 24460661174400.0, + "grad_norm": 10.601000108139148, + "language_loss": 0.8158533, + "learning_rate": 1.6181547520233031e-06, + "loss": 0.83231294, + "num_input_tokens_seen": 205736045, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.30981445, + "step": 9552, + "time_per_iteration": 2.6785943508148193 + }, + { + "auxiliary_loss_clip": 0.01423927, + "auxiliary_loss_mlp": 0.00233932, + "balance_loss_clip": 1.15410352, + "balance_loss_mlp": 0.20114957, + "epoch": 0.574357432737111, + "flos": 21652770176640.0, + "grad_norm": 31.764520214113844, + "language_loss": 0.88463163, + "learning_rate": 1.617772461696843e-06, + "loss": 0.90121025, + "num_input_tokens_seen": 205754445, + "router_z_loss_clip": 2.69921875, + "router_z_loss_mlp": 0.32763672, + "step": 9553, + "time_per_iteration": 2.741403818130493 + }, + { + "auxiliary_loss_clip": 0.01409202, + "auxiliary_loss_mlp": 0.00243554, + "balance_loss_clip": 1.1429528, + "balance_loss_mlp": 0.21084332, + "epoch": 0.5744175559897791, + "flos": 16544728880640.0, + "grad_norm": 33.862036657685024, + "language_loss": 0.90517467, + "learning_rate": 1.6173901858644895e-06, + "loss": 0.92170227, + "num_input_tokens_seen": 205770595, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.3269043, + "step": 9554, + "time_per_iteration": 2.595684289932251 + }, + { + "auxiliary_loss_clip": 0.01424484, + "auxiliary_loss_mlp": 0.00248473, + "balance_loss_clip": 1.1518712, + "balance_loss_mlp": 0.21623918, + "epoch": 0.574477679242447, + "flos": 24207598880640.0, + "grad_norm": 49.925996427388775, + "language_loss": 0.79985118, + "learning_rate": 1.6170079245407385e-06, + "loss": 0.81658077, + "num_input_tokens_seen": 205791935, + "router_z_loss_clip": 2.72460938, + "router_z_loss_mlp": 0.32202148, + "step": 9555, + "time_per_iteration": 2.7023661136627197 + }, + { + "auxiliary_loss_clip": 0.01453071, + "auxiliary_loss_mlp": 0.00234593, + "balance_loss_clip": 1.17507172, + "balance_loss_mlp": 0.19897294, + "epoch": 0.574537802495115, + "flos": 14903000835840.0, + "grad_norm": 6.988620674912163, + "language_loss": 0.8248316, + "learning_rate": 1.6166256777400853e-06, + "loss": 0.84170824, + "num_input_tokens_seen": 205807260, + "router_z_loss_clip": 2.77929688, + "router_z_loss_mlp": 0.35620117, + "step": 9556, + "time_per_iteration": 2.596848487854004 + }, + { + "auxiliary_loss_clip": 0.01406997, + "auxiliary_loss_mlp": 0.00220461, + "balance_loss_clip": 1.13936031, + "balance_loss_mlp": 0.18786904, + "epoch": 0.5745979257477829, + "flos": 24934969290240.0, + "grad_norm": 15.752919735442012, + "language_loss": 0.80447233, + "learning_rate": 1.6162434454770248e-06, + "loss": 0.8207469, + "num_input_tokens_seen": 205826885, + "router_z_loss_clip": 2.6796875, + "router_z_loss_mlp": 0.32592773, + "step": 9557, + "time_per_iteration": 2.7069904804229736 + }, + { + "auxiliary_loss_clip": 0.01433943, + "auxiliary_loss_mlp": 0.00212691, + "balance_loss_clip": 1.16319907, + "balance_loss_mlp": 0.17943129, + "epoch": 0.5746580490004509, + "flos": 17235757704960.0, + "grad_norm": 3.421437187785662, + "language_loss": 0.77745157, + "learning_rate": 1.6158612277660514e-06, + "loss": 0.79391789, + "num_input_tokens_seen": 205844630, + "router_z_loss_clip": 2.70898438, + "router_z_loss_mlp": 0.33251953, + "step": 9558, + "time_per_iteration": 2.6226611137390137 + }, + { + "auxiliary_loss_clip": 0.01435607, + "auxiliary_loss_mlp": 0.00254228, + "balance_loss_clip": 1.1595459, + "balance_loss_mlp": 0.22099212, + "epoch": 0.5747181722531189, + "flos": 13187871348480.0, + "grad_norm": 23.07947277328422, + "language_loss": 0.80732465, + "learning_rate": 1.615479024621659e-06, + "loss": 0.82422304, + "num_input_tokens_seen": 205860960, + "router_z_loss_clip": 2.76171875, + "router_z_loss_mlp": 0.33215332, + "step": 9559, + "time_per_iteration": 4.012395143508911 + }, + { + "auxiliary_loss_clip": 0.01414703, + "auxiliary_loss_mlp": 0.00220761, + "balance_loss_clip": 1.15442944, + "balance_loss_mlp": 0.19234137, + "epoch": 0.5747782955057869, + "flos": 22963006581120.0, + "grad_norm": 6.630213964763968, + "language_loss": 0.85280389, + "learning_rate": 1.6150968360583398e-06, + "loss": 0.86915851, + "num_input_tokens_seen": 205880675, + "router_z_loss_clip": 2.60546875, + "router_z_loss_mlp": 0.28466797, + "step": 9560, + "time_per_iteration": 4.370074033737183 + }, + { + "auxiliary_loss_clip": 0.01392567, + "auxiliary_loss_mlp": 0.00238581, + "balance_loss_clip": 1.13303816, + "balance_loss_mlp": 0.20582268, + "epoch": 0.5748384187584549, + "flos": 23403235668480.0, + "grad_norm": 114.80671884901268, + "language_loss": 0.71099973, + "learning_rate": 1.614714662090588e-06, + "loss": 0.72731113, + "num_input_tokens_seen": 205900050, + "router_z_loss_clip": 2.59960938, + "router_z_loss_mlp": 0.32763672, + "step": 9561, + "time_per_iteration": 2.7172443866729736 + }, + { + "auxiliary_loss_clip": 0.01455117, + "auxiliary_loss_mlp": 0.00244467, + "balance_loss_clip": 1.17166066, + "balance_loss_mlp": 0.20848936, + "epoch": 0.5748985420111228, + "flos": 17785514338560.0, + "grad_norm": 74.94103740456544, + "language_loss": 0.79831445, + "learning_rate": 1.6143325027328945e-06, + "loss": 0.81531036, + "num_input_tokens_seen": 205918855, + "router_z_loss_clip": 2.8359375, + "router_z_loss_mlp": 0.36010742, + "step": 9562, + "time_per_iteration": 4.111522912979126 + }, + { + "auxiliary_loss_clip": 0.01412776, + "auxiliary_loss_mlp": 0.00224844, + "balance_loss_clip": 1.14873314, + "balance_loss_mlp": 0.19523221, + "epoch": 0.5749586652637908, + "flos": 19866250408320.0, + "grad_norm": 13.831812112049926, + "language_loss": 0.9030658, + "learning_rate": 1.613950357999751e-06, + "loss": 0.919442, + "num_input_tokens_seen": 205936970, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.29626465, + "step": 9563, + "time_per_iteration": 2.688826322555542 + }, + { + "auxiliary_loss_clip": 0.01436337, + "auxiliary_loss_mlp": 0.00240327, + "balance_loss_clip": 1.16065145, + "balance_loss_mlp": 0.20454001, + "epoch": 0.5750187885164587, + "flos": 21287235421440.0, + "grad_norm": 19.342251816856134, + "language_loss": 0.69610262, + "learning_rate": 1.6135682279056488e-06, + "loss": 0.71286929, + "num_input_tokens_seen": 205954630, + "router_z_loss_clip": 2.7578125, + "router_z_loss_mlp": 0.35766602, + "step": 9564, + "time_per_iteration": 2.6731879711151123 + }, + { + "auxiliary_loss_clip": 0.0140626, + "auxiliary_loss_mlp": 0.00230371, + "balance_loss_clip": 1.14949346, + "balance_loss_mlp": 0.19730261, + "epoch": 0.5750789117691267, + "flos": 18804658924800.0, + "grad_norm": 19.977203224100155, + "language_loss": 0.82786632, + "learning_rate": 1.613186112465078e-06, + "loss": 0.84423256, + "num_input_tokens_seen": 205971510, + "router_z_loss_clip": 2.56835938, + "router_z_loss_mlp": 0.33056641, + "step": 9565, + "time_per_iteration": 2.6063783168792725 + }, + { + "auxiliary_loss_clip": 0.01540477, + "auxiliary_loss_mlp": 0.00157221, + "balance_loss_clip": 1.30951536, + "balance_loss_mlp": 0.14491859, + "epoch": 0.5751390350217946, + "flos": 70663224124800.0, + "grad_norm": 0.7273468117199458, + "language_loss": 0.60254431, + "learning_rate": 1.6128040116925287e-06, + "loss": 0.61952126, + "num_input_tokens_seen": 206035125, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.12255859, + "step": 9566, + "time_per_iteration": 3.233987808227539 + }, + { + "auxiliary_loss_clip": 0.01424111, + "auxiliary_loss_mlp": 0.00214949, + "balance_loss_clip": 1.16094327, + "balance_loss_mlp": 0.18264353, + "epoch": 0.5751991582744627, + "flos": 14246338348800.0, + "grad_norm": 3.1427305739596894, + "language_loss": 0.84891582, + "learning_rate": 1.6124219256024901e-06, + "loss": 0.86530638, + "num_input_tokens_seen": 206052075, + "router_z_loss_clip": 2.6328125, + "router_z_loss_mlp": 0.32275391, + "step": 9567, + "time_per_iteration": 2.620811939239502 + }, + { + "auxiliary_loss_clip": 0.0140936, + "auxiliary_loss_mlp": 0.00236691, + "balance_loss_clip": 1.14966476, + "balance_loss_mlp": 0.20579219, + "epoch": 0.5752592815271306, + "flos": 18328160079360.0, + "grad_norm": 6.259362258969094, + "language_loss": 0.8054316, + "learning_rate": 1.6120398542094504e-06, + "loss": 0.82189214, + "num_input_tokens_seen": 206069970, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.30932617, + "step": 9568, + "time_per_iteration": 2.695604085922241 + }, + { + "auxiliary_loss_clip": 0.01432998, + "auxiliary_loss_mlp": 0.00237016, + "balance_loss_clip": 1.16591275, + "balance_loss_mlp": 0.2048291, + "epoch": 0.5753194047797986, + "flos": 20922742160640.0, + "grad_norm": 38.929850555862814, + "language_loss": 0.80474031, + "learning_rate": 1.6116577975278994e-06, + "loss": 0.82144046, + "num_input_tokens_seen": 206088950, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.32189941, + "step": 9569, + "time_per_iteration": 4.054736137390137 + }, + { + "auxiliary_loss_clip": 0.01428902, + "auxiliary_loss_mlp": 0.00244211, + "balance_loss_clip": 1.16068351, + "balance_loss_mlp": 0.21252568, + "epoch": 0.5753795280324665, + "flos": 19281804215040.0, + "grad_norm": 186.41210737320947, + "language_loss": 0.68330693, + "learning_rate": 1.6112757555723223e-06, + "loss": 0.70003808, + "num_input_tokens_seen": 206107780, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.31689453, + "step": 9570, + "time_per_iteration": 2.649371385574341 + }, + { + "auxiliary_loss_clip": 0.01401218, + "auxiliary_loss_mlp": 0.00241386, + "balance_loss_clip": 1.14614666, + "balance_loss_mlp": 0.21074894, + "epoch": 0.5754396512851345, + "flos": 21652877917440.0, + "grad_norm": 3.1798323129431174, + "language_loss": 0.71055371, + "learning_rate": 1.6108937283572082e-06, + "loss": 0.72697973, + "num_input_tokens_seen": 206127445, + "router_z_loss_clip": 2.54882812, + "router_z_loss_mlp": 0.30639648, + "step": 9571, + "time_per_iteration": 2.6918458938598633 + }, + { + "auxiliary_loss_clip": 0.01399613, + "auxiliary_loss_mlp": 0.00251451, + "balance_loss_clip": 1.14155197, + "balance_loss_mlp": 0.22040847, + "epoch": 0.5754997745378025, + "flos": 51021700179840.0, + "grad_norm": 2.712111575918764, + "language_loss": 0.75114214, + "learning_rate": 1.6105117158970434e-06, + "loss": 0.76765275, + "num_input_tokens_seen": 206152005, + "router_z_loss_clip": 2.5859375, + "router_z_loss_mlp": 0.31054688, + "step": 9572, + "time_per_iteration": 2.9071969985961914 + }, + { + "auxiliary_loss_clip": 0.01429743, + "auxiliary_loss_mlp": 0.0024894, + "balance_loss_clip": 1.16693497, + "balance_loss_mlp": 0.21837442, + "epoch": 0.5755598977904705, + "flos": 22856890826880.0, + "grad_norm": 55.37631738543515, + "language_loss": 0.80231082, + "learning_rate": 1.6101297182063123e-06, + "loss": 0.8190977, + "num_input_tokens_seen": 206169875, + "router_z_loss_clip": 2.62304688, + "router_z_loss_mlp": 0.30541992, + "step": 9573, + "time_per_iteration": 2.6757776737213135 + }, + { + "auxiliary_loss_clip": 0.01436485, + "auxiliary_loss_mlp": 0.00246154, + "balance_loss_clip": 1.17804015, + "balance_loss_mlp": 0.21625586, + "epoch": 0.5756200210431385, + "flos": 38472824805120.0, + "grad_norm": 45.2369197864536, + "language_loss": 0.82233268, + "learning_rate": 1.6097477352995022e-06, + "loss": 0.83915901, + "num_input_tokens_seen": 206192635, + "router_z_loss_clip": 2.5859375, + "router_z_loss_mlp": 0.29858398, + "step": 9574, + "time_per_iteration": 2.8460984230041504 + }, + { + "auxiliary_loss_clip": 0.01443701, + "auxiliary_loss_mlp": 0.00232039, + "balance_loss_clip": 1.16847467, + "balance_loss_mlp": 0.19944681, + "epoch": 0.5756801442958064, + "flos": 23910006700800.0, + "grad_norm": 107.8943582664179, + "language_loss": 0.76139343, + "learning_rate": 1.6093657671910968e-06, + "loss": 0.77815086, + "num_input_tokens_seen": 206211485, + "router_z_loss_clip": 2.75, + "router_z_loss_mlp": 0.32617188, + "step": 9575, + "time_per_iteration": 2.7229669094085693 + }, + { + "auxiliary_loss_clip": 0.01404148, + "auxiliary_loss_mlp": 0.00246228, + "balance_loss_clip": 1.14998937, + "balance_loss_mlp": 0.21361271, + "epoch": 0.5757402675484744, + "flos": 21105276099840.0, + "grad_norm": 1.8620458395217068, + "language_loss": 0.86637914, + "learning_rate": 1.6089838138955804e-06, + "loss": 0.88288289, + "num_input_tokens_seen": 206231740, + "router_z_loss_clip": 2.54101562, + "router_z_loss_mlp": 0.32592773, + "step": 9576, + "time_per_iteration": 2.6923670768737793 + }, + { + "auxiliary_loss_clip": 0.01451978, + "auxiliary_loss_mlp": 0.00229584, + "balance_loss_clip": 1.18067646, + "balance_loss_mlp": 0.19607422, + "epoch": 0.5758003908011423, + "flos": 20559110826240.0, + "grad_norm": 3.365319229088547, + "language_loss": 0.78269434, + "learning_rate": 1.6086018754274372e-06, + "loss": 0.79951, + "num_input_tokens_seen": 206250975, + "router_z_loss_clip": 2.71679688, + "router_z_loss_mlp": 0.33532715, + "step": 9577, + "time_per_iteration": 2.7431187629699707 + }, + { + "auxiliary_loss_clip": 0.01422063, + "auxiliary_loss_mlp": 0.00255583, + "balance_loss_clip": 1.15660405, + "balance_loss_mlp": 0.22196597, + "epoch": 0.5758605140538103, + "flos": 16473015377280.0, + "grad_norm": 90.43723559072868, + "language_loss": 0.76126683, + "learning_rate": 1.6082199518011504e-06, + "loss": 0.77804333, + "num_input_tokens_seen": 206268800, + "router_z_loss_clip": 2.65820312, + "router_z_loss_mlp": 0.3359375, + "step": 9578, + "time_per_iteration": 2.680736541748047 + }, + { + "auxiliary_loss_clip": 0.01408497, + "auxiliary_loss_mlp": 0.00227349, + "balance_loss_clip": 1.15366459, + "balance_loss_mlp": 0.19604465, + "epoch": 0.5759206373064782, + "flos": 21287558643840.0, + "grad_norm": 29.890796588854794, + "language_loss": 0.79565424, + "learning_rate": 1.6078380430312016e-06, + "loss": 0.81201267, + "num_input_tokens_seen": 206287190, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.31274414, + "step": 9579, + "time_per_iteration": 2.6789474487304688 + }, + { + "auxiliary_loss_clip": 0.01451555, + "auxiliary_loss_mlp": 0.00235604, + "balance_loss_clip": 1.17488396, + "balance_loss_mlp": 0.19943577, + "epoch": 0.5759807605591463, + "flos": 26067879227520.0, + "grad_norm": 41.67898755034949, + "language_loss": 0.77047598, + "learning_rate": 1.6074561491320742e-06, + "loss": 0.78734756, + "num_input_tokens_seen": 206307020, + "router_z_loss_clip": 2.76953125, + "router_z_loss_mlp": 0.36181641, + "step": 9580, + "time_per_iteration": 2.6817433834075928 + }, + { + "auxiliary_loss_clip": 0.01432464, + "auxiliary_loss_mlp": 0.00234293, + "balance_loss_clip": 1.1650629, + "balance_loss_mlp": 0.20284571, + "epoch": 0.5760408838118142, + "flos": 18873068376960.0, + "grad_norm": 33.743579721503735, + "language_loss": 0.9321388, + "learning_rate": 1.6070742701182486e-06, + "loss": 0.94880641, + "num_input_tokens_seen": 206324095, + "router_z_loss_clip": 2.67382812, + "router_z_loss_mlp": 0.31469727, + "step": 9581, + "time_per_iteration": 2.6775074005126953 + }, + { + "auxiliary_loss_clip": 0.01459201, + "auxiliary_loss_mlp": 0.0023246, + "balance_loss_clip": 1.19334459, + "balance_loss_mlp": 0.19967702, + "epoch": 0.5761010070644822, + "flos": 15378134964480.0, + "grad_norm": 40.29359656836719, + "language_loss": 0.77205396, + "learning_rate": 1.6066924060042057e-06, + "loss": 0.78897059, + "num_input_tokens_seen": 206343210, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.32788086, + "step": 9582, + "time_per_iteration": 2.6716599464416504 + }, + { + "auxiliary_loss_clip": 0.01553694, + "auxiliary_loss_mlp": 0.00174375, + "balance_loss_clip": 1.32204795, + "balance_loss_mlp": 0.16102336, + "epoch": 0.5761611303171501, + "flos": 71471932882560.0, + "grad_norm": 0.6392778892633493, + "language_loss": 0.56461108, + "learning_rate": 1.6063105568044271e-06, + "loss": 0.58189178, + "num_input_tokens_seen": 206415935, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.13378906, + "step": 9583, + "time_per_iteration": 3.3317198753356934 + }, + { + "auxiliary_loss_clip": 0.01434906, + "auxiliary_loss_mlp": 0.00242452, + "balance_loss_clip": 1.17229009, + "balance_loss_mlp": 0.20587857, + "epoch": 0.5762212535698181, + "flos": 16246167033600.0, + "grad_norm": 24.810533088909843, + "language_loss": 0.87876713, + "learning_rate": 1.6059287225333912e-06, + "loss": 0.89554071, + "num_input_tokens_seen": 206431900, + "router_z_loss_clip": 2.625, + "router_z_loss_mlp": 0.36572266, + "step": 9584, + "time_per_iteration": 2.6679494380950928 + }, + { + "auxiliary_loss_clip": 0.01550656, + "auxiliary_loss_mlp": 0.00178182, + "balance_loss_clip": 1.32072997, + "balance_loss_mlp": 0.16368611, + "epoch": 0.5762813768224861, + "flos": 70185504216960.0, + "grad_norm": 0.6169154316447756, + "language_loss": 0.49292958, + "learning_rate": 1.6055469032055773e-06, + "loss": 0.51021796, + "num_input_tokens_seen": 206501200, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.14453125, + "step": 9585, + "time_per_iteration": 3.1880929470062256 + }, + { + "auxiliary_loss_clip": 0.0143127, + "auxiliary_loss_mlp": 0.00229115, + "balance_loss_clip": 1.17240834, + "balance_loss_mlp": 0.19568847, + "epoch": 0.5763415000751541, + "flos": 20518028645760.0, + "grad_norm": 15.08002149084422, + "language_loss": 0.89063418, + "learning_rate": 1.605165098835465e-06, + "loss": 0.90723801, + "num_input_tokens_seen": 206520575, + "router_z_loss_clip": 2.58984375, + "router_z_loss_mlp": 0.33422852, + "step": 9586, + "time_per_iteration": 2.6386797428131104 + }, + { + "auxiliary_loss_clip": 0.01437279, + "auxiliary_loss_mlp": 0.00243688, + "balance_loss_clip": 1.1707418, + "balance_loss_mlp": 0.20918855, + "epoch": 0.5764016233278221, + "flos": 15815526877440.0, + "grad_norm": 2067.3716947730345, + "language_loss": 0.88098073, + "learning_rate": 1.6047833094375308e-06, + "loss": 0.89779037, + "num_input_tokens_seen": 206538060, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.3449707, + "step": 9587, + "time_per_iteration": 2.629610538482666 + }, + { + "auxiliary_loss_clip": 0.01456662, + "auxiliary_loss_mlp": 0.00221424, + "balance_loss_clip": 1.18955028, + "balance_loss_mlp": 0.18687704, + "epoch": 0.57646174658049, + "flos": 20772312001920.0, + "grad_norm": 6.153924131286978, + "language_loss": 0.73299277, + "learning_rate": 1.6044015350262542e-06, + "loss": 0.74977362, + "num_input_tokens_seen": 206557320, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.34545898, + "step": 9588, + "time_per_iteration": 2.6686694622039795 + }, + { + "auxiliary_loss_clip": 0.01462595, + "auxiliary_loss_mlp": 0.0024576, + "balance_loss_clip": 1.18951237, + "balance_loss_mlp": 0.20928216, + "epoch": 0.576521869833158, + "flos": 23549930812800.0, + "grad_norm": 28.107266995740073, + "language_loss": 0.89437449, + "learning_rate": 1.6040197756161104e-06, + "loss": 0.91145802, + "num_input_tokens_seen": 206575780, + "router_z_loss_clip": 2.73046875, + "router_z_loss_mlp": 0.36450195, + "step": 9589, + "time_per_iteration": 2.7115325927734375 + }, + { + "auxiliary_loss_clip": 0.01428717, + "auxiliary_loss_mlp": 0.0024391, + "balance_loss_clip": 1.17050838, + "balance_loss_mlp": 0.21288025, + "epoch": 0.5765819930858259, + "flos": 20266582464000.0, + "grad_norm": 18.211866273515128, + "language_loss": 0.89396584, + "learning_rate": 1.6036380312215762e-06, + "loss": 0.9106921, + "num_input_tokens_seen": 206594100, + "router_z_loss_clip": 2.58203125, + "router_z_loss_mlp": 0.31018066, + "step": 9590, + "time_per_iteration": 2.6506288051605225 + }, + { + "auxiliary_loss_clip": 0.01444577, + "auxiliary_loss_mlp": 0.00226646, + "balance_loss_clip": 1.18222451, + "balance_loss_mlp": 0.19200343, + "epoch": 0.5766421163384939, + "flos": 23148772744320.0, + "grad_norm": 30.49967056249911, + "language_loss": 0.70963371, + "learning_rate": 1.6032563018571283e-06, + "loss": 0.72634602, + "num_input_tokens_seen": 206613325, + "router_z_loss_clip": 2.62109375, + "router_z_loss_mlp": 0.34643555, + "step": 9591, + "time_per_iteration": 2.6733028888702393 + }, + { + "auxiliary_loss_clip": 0.01457502, + "auxiliary_loss_mlp": 0.00271349, + "balance_loss_clip": 1.18921852, + "balance_loss_mlp": 0.234036, + "epoch": 0.5767022395911618, + "flos": 25848895962240.0, + "grad_norm": 55.28713149832437, + "language_loss": 0.84892225, + "learning_rate": 1.6028745875372406e-06, + "loss": 0.86621082, + "num_input_tokens_seen": 206634265, + "router_z_loss_clip": 2.68164062, + "router_z_loss_mlp": 0.37280273, + "step": 9592, + "time_per_iteration": 2.6924428939819336 + }, + { + "auxiliary_loss_clip": 0.01539356, + "auxiliary_loss_mlp": 0.00153476, + "balance_loss_clip": 1.31743383, + "balance_loss_mlp": 0.13802604, + "epoch": 0.5767623628438299, + "flos": 68293299657600.0, + "grad_norm": 0.7188958521414582, + "language_loss": 0.58906102, + "learning_rate": 1.6024928882763885e-06, + "loss": 0.60598934, + "num_input_tokens_seen": 206696990, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.15429688, + "step": 9593, + "time_per_iteration": 3.3047561645507812 + }, + { + "auxiliary_loss_clip": 0.01465179, + "auxiliary_loss_mlp": 0.00276373, + "balance_loss_clip": 1.19472408, + "balance_loss_mlp": 0.2368193, + "epoch": 0.5768224860964978, + "flos": 30188448754560.0, + "grad_norm": 7.976482320726445, + "language_loss": 0.77412504, + "learning_rate": 1.6021112040890463e-06, + "loss": 0.7915405, + "num_input_tokens_seen": 206717815, + "router_z_loss_clip": 2.70703125, + "router_z_loss_mlp": 0.39575195, + "step": 9594, + "time_per_iteration": 2.725802183151245 + }, + { + "auxiliary_loss_clip": 0.01443911, + "auxiliary_loss_mlp": 0.00254616, + "balance_loss_clip": 1.18057871, + "balance_loss_mlp": 0.2203549, + "epoch": 0.5768826093491658, + "flos": 17895041884800.0, + "grad_norm": 1008.1362043662963, + "language_loss": 0.77702415, + "learning_rate": 1.6017295349896863e-06, + "loss": 0.79400945, + "num_input_tokens_seen": 206735985, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.34277344, + "step": 9595, + "time_per_iteration": 2.639941930770874 + }, + { + "auxiliary_loss_clip": 0.0143403, + "auxiliary_loss_mlp": 0.00246224, + "balance_loss_clip": 1.17612493, + "balance_loss_mlp": 0.21148629, + "epoch": 0.5769427326018337, + "flos": 17457183095040.0, + "grad_norm": 5.398729872333989, + "language_loss": 0.76901031, + "learning_rate": 1.6013478809927828e-06, + "loss": 0.78581285, + "num_input_tokens_seen": 206753370, + "router_z_loss_clip": 2.58007812, + "router_z_loss_mlp": 0.34716797, + "step": 9596, + "time_per_iteration": 2.641700267791748 + }, + { + "auxiliary_loss_clip": 0.01453055, + "auxiliary_loss_mlp": 0.00258641, + "balance_loss_clip": 1.18671834, + "balance_loss_mlp": 0.22209099, + "epoch": 0.5770028558545017, + "flos": 39421728345600.0, + "grad_norm": 9.845850994083678, + "language_loss": 0.74872923, + "learning_rate": 1.6009662421128074e-06, + "loss": 0.76584613, + "num_input_tokens_seen": 206777645, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.36523438, + "step": 9597, + "time_per_iteration": 2.859938859939575 + }, + { + "auxiliary_loss_clip": 0.0144697, + "auxiliary_loss_mlp": 0.00233225, + "balance_loss_clip": 1.18389964, + "balance_loss_mlp": 0.1999895, + "epoch": 0.5770629791071697, + "flos": 21536383132800.0, + "grad_norm": 9.455821831082929, + "language_loss": 0.87039113, + "learning_rate": 1.6005846183642323e-06, + "loss": 0.88719308, + "num_input_tokens_seen": 206794865, + "router_z_loss_clip": 2.6328125, + "router_z_loss_mlp": 0.33227539, + "step": 9598, + "time_per_iteration": 2.739943265914917 + }, + { + "auxiliary_loss_clip": 0.01447512, + "auxiliary_loss_mlp": 0.00243622, + "balance_loss_clip": 1.1825453, + "balance_loss_mlp": 0.20955247, + "epoch": 0.5771231023598377, + "flos": 20886795624960.0, + "grad_norm": 3.91352632019393, + "language_loss": 0.78871602, + "learning_rate": 1.6002030097615277e-06, + "loss": 0.80562735, + "num_input_tokens_seen": 206814095, + "router_z_loss_clip": 2.65039062, + "router_z_loss_mlp": 0.34057617, + "step": 9599, + "time_per_iteration": 2.7219135761260986 + }, + { + "auxiliary_loss_clip": 0.01439654, + "auxiliary_loss_mlp": 0.00254765, + "balance_loss_clip": 1.17770708, + "balance_loss_mlp": 0.21893084, + "epoch": 0.5771832256125057, + "flos": 18077216688000.0, + "grad_norm": 17.777584051302807, + "language_loss": 0.86816299, + "learning_rate": 1.5998214163191663e-06, + "loss": 0.88510716, + "num_input_tokens_seen": 206832245, + "router_z_loss_clip": 2.62304688, + "router_z_loss_mlp": 0.35839844, + "step": 9600, + "time_per_iteration": 2.6362249851226807 + }, + { + "auxiliary_loss_clip": 0.01450962, + "auxiliary_loss_mlp": 0.00248679, + "balance_loss_clip": 1.17875695, + "balance_loss_mlp": 0.21422724, + "epoch": 0.5772433488651736, + "flos": 26359078786560.0, + "grad_norm": 3.0720136824438753, + "language_loss": 0.81757694, + "learning_rate": 1.5994398380516163e-06, + "loss": 0.83457339, + "num_input_tokens_seen": 206851535, + "router_z_loss_clip": 2.72460938, + "router_z_loss_mlp": 0.34448242, + "step": 9601, + "time_per_iteration": 4.134538173675537 + }, + { + "auxiliary_loss_clip": 0.01474031, + "auxiliary_loss_mlp": 0.00242128, + "balance_loss_clip": 1.20284724, + "balance_loss_mlp": 0.20803379, + "epoch": 0.5773034721178416, + "flos": 19680987035520.0, + "grad_norm": 155.4209592535194, + "language_loss": 0.7434082, + "learning_rate": 1.599058274973348e-06, + "loss": 0.76056975, + "num_input_tokens_seen": 206870595, + "router_z_loss_clip": 2.71289062, + "router_z_loss_mlp": 0.34106445, + "step": 9602, + "time_per_iteration": 4.171952486038208 + }, + { + "auxiliary_loss_clip": 0.01436939, + "auxiliary_loss_mlp": 0.00228128, + "balance_loss_clip": 1.18033576, + "balance_loss_mlp": 0.19353396, + "epoch": 0.5773635953705095, + "flos": 25082885496960.0, + "grad_norm": 39.779562569371585, + "language_loss": 0.78704023, + "learning_rate": 1.5986767270988297e-06, + "loss": 0.80369091, + "num_input_tokens_seen": 206892320, + "router_z_loss_clip": 2.56445312, + "router_z_loss_mlp": 0.34594727, + "step": 9603, + "time_per_iteration": 2.747554302215576 + }, + { + "auxiliary_loss_clip": 0.01441512, + "auxiliary_loss_mlp": 0.00265294, + "balance_loss_clip": 1.17627943, + "balance_loss_mlp": 0.23239261, + "epoch": 0.5774237186231775, + "flos": 21032987978880.0, + "grad_norm": 9.592594022469617, + "language_loss": 0.83336353, + "learning_rate": 1.5982951944425298e-06, + "loss": 0.85043156, + "num_input_tokens_seen": 206912485, + "router_z_loss_clip": 2.6484375, + "router_z_loss_mlp": 0.32910156, + "step": 9604, + "time_per_iteration": 2.7098844051361084 + }, + { + "auxiliary_loss_clip": 0.01456609, + "auxiliary_loss_mlp": 0.00233257, + "balance_loss_clip": 1.1841414, + "balance_loss_mlp": 0.19804239, + "epoch": 0.5774838418758454, + "flos": 15231727128960.0, + "grad_norm": 41.24301976616702, + "language_loss": 0.92115927, + "learning_rate": 1.5979136770189174e-06, + "loss": 0.9380579, + "num_input_tokens_seen": 206929100, + "router_z_loss_clip": 2.7265625, + "router_z_loss_mlp": 0.35205078, + "step": 9605, + "time_per_iteration": 4.069299697875977 + }, + { + "auxiliary_loss_clip": 0.0148695, + "auxiliary_loss_mlp": 0.00269639, + "balance_loss_clip": 1.20255578, + "balance_loss_mlp": 0.23251759, + "epoch": 0.5775439651285135, + "flos": 23582609210880.0, + "grad_norm": 20.687228391807416, + "language_loss": 0.86693847, + "learning_rate": 1.5975321748424581e-06, + "loss": 0.88450438, + "num_input_tokens_seen": 206947020, + "router_z_loss_clip": 2.84570312, + "router_z_loss_mlp": 0.37109375, + "step": 9606, + "time_per_iteration": 2.673295497894287 + }, + { + "auxiliary_loss_clip": 0.01459694, + "auxiliary_loss_mlp": 0.00235052, + "balance_loss_clip": 1.19053721, + "balance_loss_mlp": 0.19869326, + "epoch": 0.5776040883811814, + "flos": 18040515966720.0, + "grad_norm": 2.857741229929285, + "language_loss": 0.79754966, + "learning_rate": 1.597150687927619e-06, + "loss": 0.81449711, + "num_input_tokens_seen": 206964065, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.36352539, + "step": 9607, + "time_per_iteration": 2.6456656455993652 + }, + { + "auxiliary_loss_clip": 0.01456953, + "auxiliary_loss_mlp": 0.00247827, + "balance_loss_clip": 1.18458724, + "balance_loss_mlp": 0.21504405, + "epoch": 0.5776642116338494, + "flos": 18624638937600.0, + "grad_norm": 3.667725777142241, + "language_loss": 0.7806868, + "learning_rate": 1.5967692162888664e-06, + "loss": 0.79773462, + "num_input_tokens_seen": 206981940, + "router_z_loss_clip": 2.72265625, + "router_z_loss_mlp": 0.32824707, + "step": 9608, + "time_per_iteration": 2.639314651489258 + }, + { + "auxiliary_loss_clip": 0.01460741, + "auxiliary_loss_mlp": 0.00268914, + "balance_loss_clip": 1.18921745, + "balance_loss_mlp": 0.23310345, + "epoch": 0.5777243348865173, + "flos": 28402539517440.0, + "grad_norm": 107.30129625730933, + "language_loss": 0.84291703, + "learning_rate": 1.596387759940665e-06, + "loss": 0.86021358, + "num_input_tokens_seen": 207002365, + "router_z_loss_clip": 2.71484375, + "router_z_loss_mlp": 0.3581543, + "step": 9609, + "time_per_iteration": 2.711634397506714 + }, + { + "auxiliary_loss_clip": 0.01433535, + "auxiliary_loss_mlp": 0.00262757, + "balance_loss_clip": 1.16726971, + "balance_loss_mlp": 0.23006965, + "epoch": 0.5777844581391853, + "flos": 24024705805440.0, + "grad_norm": 6.4008321242465005, + "language_loss": 0.82737333, + "learning_rate": 1.5960063188974808e-06, + "loss": 0.84433627, + "num_input_tokens_seen": 207021195, + "router_z_loss_clip": 2.66210938, + "router_z_loss_mlp": 0.32714844, + "step": 9610, + "time_per_iteration": 2.673670768737793 + }, + { + "auxiliary_loss_clip": 0.01449479, + "auxiliary_loss_mlp": 0.00253392, + "balance_loss_clip": 1.18254983, + "balance_loss_mlp": 0.21660417, + "epoch": 0.5778445813918534, + "flos": 17777361951360.0, + "grad_norm": 84.96135206811525, + "language_loss": 0.79110777, + "learning_rate": 1.5956248931737777e-06, + "loss": 0.80813646, + "num_input_tokens_seen": 207037465, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.36816406, + "step": 9611, + "time_per_iteration": 4.007632732391357 + }, + { + "auxiliary_loss_clip": 0.01436472, + "auxiliary_loss_mlp": 0.00238774, + "balance_loss_clip": 1.17428815, + "balance_loss_mlp": 0.20451364, + "epoch": 0.5779047046445213, + "flos": 22233194046720.0, + "grad_norm": 9.141056160280792, + "language_loss": 0.90793288, + "learning_rate": 1.5952434827840185e-06, + "loss": 0.92468536, + "num_input_tokens_seen": 207054230, + "router_z_loss_clip": 2.62304688, + "router_z_loss_mlp": 0.34277344, + "step": 9612, + "time_per_iteration": 2.636336326599121 + }, + { + "auxiliary_loss_clip": 0.01440449, + "auxiliary_loss_mlp": 0.00251429, + "balance_loss_clip": 1.17669797, + "balance_loss_mlp": 0.21700147, + "epoch": 0.5779648278971893, + "flos": 21434361528960.0, + "grad_norm": 263.70635302336103, + "language_loss": 0.85398054, + "learning_rate": 1.594862087742667e-06, + "loss": 0.87089926, + "num_input_tokens_seen": 207073150, + "router_z_loss_clip": 2.63671875, + "router_z_loss_mlp": 0.34423828, + "step": 9613, + "time_per_iteration": 2.6690611839294434 + }, + { + "auxiliary_loss_clip": 0.01409065, + "auxiliary_loss_mlp": 0.00238871, + "balance_loss_clip": 1.14917886, + "balance_loss_mlp": 0.20480123, + "epoch": 0.5780249511498572, + "flos": 19026120228480.0, + "grad_norm": 10.105656089672872, + "language_loss": 0.82239854, + "learning_rate": 1.5944807080641863e-06, + "loss": 0.83887792, + "num_input_tokens_seen": 207090375, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.34057617, + "step": 9614, + "time_per_iteration": 2.631988763809204 + }, + { + "auxiliary_loss_clip": 0.01448502, + "auxiliary_loss_mlp": 0.00225746, + "balance_loss_clip": 1.17867661, + "balance_loss_mlp": 0.19041273, + "epoch": 0.5780850744025252, + "flos": 12124663752960.0, + "grad_norm": 36.519197099310404, + "language_loss": 0.92087698, + "learning_rate": 1.5940993437630375e-06, + "loss": 0.93761945, + "num_input_tokens_seen": 207106030, + "router_z_loss_clip": 2.69921875, + "router_z_loss_mlp": 0.35375977, + "step": 9615, + "time_per_iteration": 2.6071360111236572 + }, + { + "auxiliary_loss_clip": 0.01448205, + "auxiliary_loss_mlp": 0.00240299, + "balance_loss_clip": 1.17806101, + "balance_loss_mlp": 0.20131785, + "epoch": 0.5781451976551931, + "flos": 25044425009280.0, + "grad_norm": 36.605750651577985, + "language_loss": 0.75464028, + "learning_rate": 1.5937179948536825e-06, + "loss": 0.77152538, + "num_input_tokens_seen": 207125435, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.38989258, + "step": 9616, + "time_per_iteration": 2.6779191493988037 + }, + { + "auxiliary_loss_clip": 0.01426729, + "auxiliary_loss_mlp": 0.00231506, + "balance_loss_clip": 1.16624582, + "balance_loss_mlp": 0.19843777, + "epoch": 0.5782053209078611, + "flos": 19245606284160.0, + "grad_norm": 28.708519043813727, + "language_loss": 0.85609877, + "learning_rate": 1.5933366613505812e-06, + "loss": 0.87268108, + "num_input_tokens_seen": 207145095, + "router_z_loss_clip": 2.60546875, + "router_z_loss_mlp": 0.33081055, + "step": 9617, + "time_per_iteration": 2.6694273948669434 + }, + { + "auxiliary_loss_clip": 0.014651, + "auxiliary_loss_mlp": 0.00234517, + "balance_loss_clip": 1.19449723, + "balance_loss_mlp": 0.19675133, + "epoch": 0.578265444160529, + "flos": 25993831340160.0, + "grad_norm": 369.0253298540447, + "language_loss": 0.8324995, + "learning_rate": 1.5929553432681947e-06, + "loss": 0.84949565, + "num_input_tokens_seen": 207166045, + "router_z_loss_clip": 2.70703125, + "router_z_loss_mlp": 0.37744141, + "step": 9618, + "time_per_iteration": 2.684602737426758 + }, + { + "auxiliary_loss_clip": 0.01444859, + "auxiliary_loss_mlp": 0.00227155, + "balance_loss_clip": 1.17833626, + "balance_loss_mlp": 0.19260812, + "epoch": 0.5783255674131971, + "flos": 21798603394560.0, + "grad_norm": 3.2015767854196677, + "language_loss": 0.90188551, + "learning_rate": 1.5925740406209826e-06, + "loss": 0.91860563, + "num_input_tokens_seen": 207185290, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.34545898, + "step": 9619, + "time_per_iteration": 2.6560845375061035 + }, + { + "auxiliary_loss_clip": 0.01436437, + "auxiliary_loss_mlp": 0.00225683, + "balance_loss_clip": 1.17079604, + "balance_loss_mlp": 0.19256672, + "epoch": 0.578385690665865, + "flos": 24789746603520.0, + "grad_norm": 4.5009149294394595, + "language_loss": 0.78860444, + "learning_rate": 1.5921927534234039e-06, + "loss": 0.80522567, + "num_input_tokens_seen": 207205505, + "router_z_loss_clip": 2.66015625, + "router_z_loss_mlp": 0.33117676, + "step": 9620, + "time_per_iteration": 2.7850563526153564 + }, + { + "auxiliary_loss_clip": 0.01429592, + "auxiliary_loss_mlp": 0.00217328, + "balance_loss_clip": 1.16856503, + "balance_loss_mlp": 0.18299559, + "epoch": 0.578445813918533, + "flos": 21212864311680.0, + "grad_norm": 42.14217996001199, + "language_loss": 0.84026837, + "learning_rate": 1.591811481689916e-06, + "loss": 0.85673761, + "num_input_tokens_seen": 207225315, + "router_z_loss_clip": 2.609375, + "router_z_loss_mlp": 0.34301758, + "step": 9621, + "time_per_iteration": 2.719852924346924 + }, + { + "auxiliary_loss_clip": 0.01442048, + "auxiliary_loss_mlp": 0.00251096, + "balance_loss_clip": 1.17373133, + "balance_loss_mlp": 0.21690726, + "epoch": 0.5785059371712009, + "flos": 25046795306880.0, + "grad_norm": 45.831243626173986, + "language_loss": 0.7719934, + "learning_rate": 1.5914302254349787e-06, + "loss": 0.78892487, + "num_input_tokens_seen": 207247690, + "router_z_loss_clip": 2.6796875, + "router_z_loss_mlp": 0.34179688, + "step": 9622, + "time_per_iteration": 2.745157480239868 + }, + { + "auxiliary_loss_clip": 0.01466501, + "auxiliary_loss_mlp": 0.00103415, + "balance_loss_clip": 1.24388826, + "balance_loss_mlp": 0.09173276, + "epoch": 0.5785660604238689, + "flos": 70843172284800.0, + "grad_norm": 0.8285340245317143, + "language_loss": 0.5535413, + "learning_rate": 1.5910489846730476e-06, + "loss": 0.56924045, + "num_input_tokens_seen": 207301735, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.11669922, + "step": 9623, + "time_per_iteration": 3.2049875259399414 + }, + { + "auxiliary_loss_clip": 0.01438836, + "auxiliary_loss_mlp": 0.0025123, + "balance_loss_clip": 1.16507673, + "balance_loss_mlp": 0.21515723, + "epoch": 0.578626183676537, + "flos": 31649977244160.0, + "grad_norm": 15.207848413639272, + "language_loss": 0.79888129, + "learning_rate": 1.5906677594185799e-06, + "loss": 0.81578195, + "num_input_tokens_seen": 207321240, + "router_z_loss_clip": 2.734375, + "router_z_loss_mlp": 0.36035156, + "step": 9624, + "time_per_iteration": 2.7583019733428955 + }, + { + "auxiliary_loss_clip": 0.01450562, + "auxiliary_loss_mlp": 0.00224622, + "balance_loss_clip": 1.17653394, + "balance_loss_mlp": 0.18904966, + "epoch": 0.5786863069292049, + "flos": 21865181253120.0, + "grad_norm": 97.9894167060353, + "language_loss": 0.89796585, + "learning_rate": 1.5902865496860322e-06, + "loss": 0.91471767, + "num_input_tokens_seen": 207339540, + "router_z_loss_clip": 2.74414062, + "router_z_loss_mlp": 0.35571289, + "step": 9625, + "time_per_iteration": 2.7461719512939453 + }, + { + "auxiliary_loss_clip": 0.01459889, + "auxiliary_loss_mlp": 0.00222062, + "balance_loss_clip": 1.18082142, + "balance_loss_mlp": 0.18706244, + "epoch": 0.5787464301818729, + "flos": 23364954748800.0, + "grad_norm": 3.186677901220657, + "language_loss": 0.76962787, + "learning_rate": 1.5899053554898591e-06, + "loss": 0.78644741, + "num_input_tokens_seen": 207360470, + "router_z_loss_clip": 2.79101562, + "router_z_loss_mlp": 0.34985352, + "step": 9626, + "time_per_iteration": 2.705853223800659 + }, + { + "auxiliary_loss_clip": 0.01451477, + "auxiliary_loss_mlp": 0.00216831, + "balance_loss_clip": 1.18004096, + "balance_loss_mlp": 0.18090123, + "epoch": 0.5788065534345408, + "flos": 30004011394560.0, + "grad_norm": 17.90766355068469, + "language_loss": 0.77449012, + "learning_rate": 1.5895241768445166e-06, + "loss": 0.79117322, + "num_input_tokens_seen": 207383080, + "router_z_loss_clip": 2.71484375, + "router_z_loss_mlp": 0.359375, + "step": 9627, + "time_per_iteration": 2.709117889404297 + }, + { + "auxiliary_loss_clip": 0.01437723, + "auxiliary_loss_mlp": 0.00230496, + "balance_loss_clip": 1.17052197, + "balance_loss_mlp": 0.19773751, + "epoch": 0.5788666766872088, + "flos": 24527849564160.0, + "grad_norm": 104.06457856400391, + "language_loss": 0.91642541, + "learning_rate": 1.589143013764458e-06, + "loss": 0.93310755, + "num_input_tokens_seen": 207401000, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.32714844, + "step": 9628, + "time_per_iteration": 2.6767141819000244 + }, + { + "auxiliary_loss_clip": 0.01423403, + "auxiliary_loss_mlp": 0.0021437, + "balance_loss_clip": 1.160164, + "balance_loss_mlp": 0.181564, + "epoch": 0.5789267999398767, + "flos": 23732823888000.0, + "grad_norm": 6.082001918768536, + "language_loss": 0.79570192, + "learning_rate": 1.5887618662641376e-06, + "loss": 0.81207967, + "num_input_tokens_seen": 207419230, + "router_z_loss_clip": 2.63476562, + "router_z_loss_mlp": 0.32788086, + "step": 9629, + "time_per_iteration": 2.6512436866760254 + }, + { + "auxiliary_loss_clip": 0.014435, + "auxiliary_loss_mlp": 0.00231508, + "balance_loss_clip": 1.1719892, + "balance_loss_mlp": 0.19719976, + "epoch": 0.5789869231925447, + "flos": 21135045496320.0, + "grad_norm": 38.994443655499715, + "language_loss": 0.83530331, + "learning_rate": 1.5883807343580087e-06, + "loss": 0.85205334, + "num_input_tokens_seen": 207437615, + "router_z_loss_clip": 2.71484375, + "router_z_loss_mlp": 0.34326172, + "step": 9630, + "time_per_iteration": 2.6597306728363037 + }, + { + "auxiliary_loss_clip": 0.01432972, + "auxiliary_loss_mlp": 0.00253083, + "balance_loss_clip": 1.16859496, + "balance_loss_mlp": 0.21579471, + "epoch": 0.5790470464452127, + "flos": 21209632087680.0, + "grad_norm": 2.147854018999418, + "language_loss": 0.84217983, + "learning_rate": 1.587999618060523e-06, + "loss": 0.85904038, + "num_input_tokens_seen": 207457270, + "router_z_loss_clip": 2.6484375, + "router_z_loss_mlp": 0.37329102, + "step": 9631, + "time_per_iteration": 2.6188278198242188 + }, + { + "auxiliary_loss_clip": 0.01410517, + "auxiliary_loss_mlp": 0.0022129, + "balance_loss_clip": 1.15132642, + "balance_loss_mlp": 0.18667126, + "epoch": 0.5791071696978807, + "flos": 23404384903680.0, + "grad_norm": 91.31092809183572, + "language_loss": 0.83621573, + "learning_rate": 1.5876185173861333e-06, + "loss": 0.85253382, + "num_input_tokens_seen": 207477890, + "router_z_loss_clip": 2.59570312, + "router_z_loss_mlp": 0.34594727, + "step": 9632, + "time_per_iteration": 2.681260347366333 + }, + { + "auxiliary_loss_clip": 0.01429609, + "auxiliary_loss_mlp": 0.00240364, + "balance_loss_clip": 1.16332185, + "balance_loss_mlp": 0.20693819, + "epoch": 0.5791672929505486, + "flos": 24206521472640.0, + "grad_norm": 5.672213010223808, + "language_loss": 0.86800122, + "learning_rate": 1.5872374323492915e-06, + "loss": 0.88470089, + "num_input_tokens_seen": 207497670, + "router_z_loss_clip": 2.66210938, + "router_z_loss_mlp": 0.33447266, + "step": 9633, + "time_per_iteration": 2.6497247219085693 + }, + { + "auxiliary_loss_clip": 0.01439259, + "auxiliary_loss_mlp": 0.00234516, + "balance_loss_clip": 1.16459334, + "balance_loss_mlp": 0.19870511, + "epoch": 0.5792274162032166, + "flos": 24348871071360.0, + "grad_norm": 17.97165733572477, + "language_loss": 0.87429094, + "learning_rate": 1.5868563629644464e-06, + "loss": 0.89102864, + "num_input_tokens_seen": 207516105, + "router_z_loss_clip": 2.74609375, + "router_z_loss_mlp": 0.3581543, + "step": 9634, + "time_per_iteration": 2.664384126663208 + }, + { + "auxiliary_loss_clip": 0.01415912, + "auxiliary_loss_mlp": 0.00232572, + "balance_loss_clip": 1.14962828, + "balance_loss_mlp": 0.19821578, + "epoch": 0.5792875394558845, + "flos": 20449403712000.0, + "grad_norm": 7.066425009413879, + "language_loss": 0.74153781, + "learning_rate": 1.5864753092460502e-06, + "loss": 0.75802267, + "num_input_tokens_seen": 207533685, + "router_z_loss_clip": 2.66210938, + "router_z_loss_mlp": 0.34338379, + "step": 9635, + "time_per_iteration": 2.666520833969116 + }, + { + "auxiliary_loss_clip": 0.01420193, + "auxiliary_loss_mlp": 0.00241885, + "balance_loss_clip": 1.16108763, + "balance_loss_mlp": 0.20831558, + "epoch": 0.5793476627085525, + "flos": 24060329118720.0, + "grad_norm": 2.2174441867012886, + "language_loss": 0.83170176, + "learning_rate": 1.5860942712085516e-06, + "loss": 0.84832251, + "num_input_tokens_seen": 207552840, + "router_z_loss_clip": 2.59375, + "router_z_loss_mlp": 0.33569336, + "step": 9636, + "time_per_iteration": 2.755195140838623 + }, + { + "auxiliary_loss_clip": 0.01436141, + "auxiliary_loss_mlp": 0.00196351, + "balance_loss_clip": 1.17231035, + "balance_loss_mlp": 0.16237631, + "epoch": 0.5794077859612206, + "flos": 22054287381120.0, + "grad_norm": 40.70171318751506, + "language_loss": 0.76266015, + "learning_rate": 1.5857132488663998e-06, + "loss": 0.77898502, + "num_input_tokens_seen": 207572095, + "router_z_loss_clip": 2.63671875, + "router_z_loss_mlp": 0.34008789, + "step": 9637, + "time_per_iteration": 2.6696088314056396 + }, + { + "auxiliary_loss_clip": 0.01464596, + "auxiliary_loss_mlp": 0.00230395, + "balance_loss_clip": 1.184062, + "balance_loss_mlp": 0.19327298, + "epoch": 0.5794679092138885, + "flos": 11434855991040.0, + "grad_norm": 8.67695167067145, + "language_loss": 0.8367188, + "learning_rate": 1.585332242234043e-06, + "loss": 0.85366875, + "num_input_tokens_seen": 207587495, + "router_z_loss_clip": 2.80859375, + "router_z_loss_mlp": 0.37109375, + "step": 9638, + "time_per_iteration": 2.624295234680176 + }, + { + "auxiliary_loss_clip": 0.01448612, + "auxiliary_loss_mlp": 0.00205052, + "balance_loss_clip": 1.17957354, + "balance_loss_mlp": 0.17083859, + "epoch": 0.5795280324665565, + "flos": 18880215183360.0, + "grad_norm": 7.408440215478484, + "language_loss": 0.79832411, + "learning_rate": 1.5849512513259291e-06, + "loss": 0.8148607, + "num_input_tokens_seen": 207606795, + "router_z_loss_clip": 2.6875, + "router_z_loss_mlp": 0.34179688, + "step": 9639, + "time_per_iteration": 2.647068977355957 + }, + { + "auxiliary_loss_clip": 0.01425154, + "auxiliary_loss_mlp": 0.00229599, + "balance_loss_clip": 1.16206157, + "balance_loss_mlp": 0.19519544, + "epoch": 0.5795881557192244, + "flos": 13005947940480.0, + "grad_norm": 39.91116198685101, + "language_loss": 0.77830231, + "learning_rate": 1.5845702761565054e-06, + "loss": 0.79484987, + "num_input_tokens_seen": 207623620, + "router_z_loss_clip": 2.62890625, + "router_z_loss_mlp": 0.34399414, + "step": 9640, + "time_per_iteration": 2.614427328109741 + }, + { + "auxiliary_loss_clip": 0.01455469, + "auxiliary_loss_mlp": 0.0023499, + "balance_loss_clip": 1.17377687, + "balance_loss_mlp": 0.19772509, + "epoch": 0.5796482789718924, + "flos": 19932397303680.0, + "grad_norm": 4.695058627405145, + "language_loss": 0.86997551, + "learning_rate": 1.5841893167402183e-06, + "loss": 0.8868801, + "num_input_tokens_seen": 207639380, + "router_z_loss_clip": 2.8203125, + "router_z_loss_mlp": 0.37255859, + "step": 9641, + "time_per_iteration": 2.6354899406433105 + }, + { + "auxiliary_loss_clip": 0.0144221, + "auxiliary_loss_mlp": 0.00219037, + "balance_loss_clip": 1.17330313, + "balance_loss_mlp": 0.18348902, + "epoch": 0.5797084022245603, + "flos": 21650794928640.0, + "grad_norm": 188.15818271380263, + "language_loss": 0.82008839, + "learning_rate": 1.5838083730915143e-06, + "loss": 0.83670092, + "num_input_tokens_seen": 207657915, + "router_z_loss_clip": 2.69335938, + "router_z_loss_mlp": 0.35546875, + "step": 9642, + "time_per_iteration": 2.636300563812256 + }, + { + "auxiliary_loss_clip": 0.01440127, + "auxiliary_loss_mlp": 0.00219295, + "balance_loss_clip": 1.171875, + "balance_loss_mlp": 0.18474844, + "epoch": 0.5797685254772283, + "flos": 26031573555840.0, + "grad_norm": 8.167228162589902, + "language_loss": 0.81705546, + "learning_rate": 1.5834274452248378e-06, + "loss": 0.83364969, + "num_input_tokens_seen": 207678620, + "router_z_loss_clip": 2.6796875, + "router_z_loss_mlp": 0.34545898, + "step": 9643, + "time_per_iteration": 4.1012115478515625 + }, + { + "auxiliary_loss_clip": 0.01450736, + "auxiliary_loss_mlp": 0.00232338, + "balance_loss_clip": 1.17920184, + "balance_loss_mlp": 0.19550283, + "epoch": 0.5798286487298963, + "flos": 22705167778560.0, + "grad_norm": 74.69520285561644, + "language_loss": 0.77412635, + "learning_rate": 1.5830465331546352e-06, + "loss": 0.79095709, + "num_input_tokens_seen": 207696980, + "router_z_loss_clip": 2.7109375, + "router_z_loss_mlp": 0.3684082, + "step": 9644, + "time_per_iteration": 4.187214612960815 + }, + { + "auxiliary_loss_clip": 0.01449266, + "auxiliary_loss_mlp": 0.00243394, + "balance_loss_clip": 1.1762985, + "balance_loss_mlp": 0.20798859, + "epoch": 0.5798887719825643, + "flos": 23148988225920.0, + "grad_norm": 13.44609471094669, + "language_loss": 0.93683976, + "learning_rate": 1.5826656368953496e-06, + "loss": 0.95376647, + "num_input_tokens_seen": 207714065, + "router_z_loss_clip": 2.72851562, + "router_z_loss_mlp": 0.35424805, + "step": 9645, + "time_per_iteration": 2.632127523422241 + }, + { + "auxiliary_loss_clip": 0.01465977, + "auxiliary_loss_mlp": 0.00226154, + "balance_loss_clip": 1.19033492, + "balance_loss_mlp": 0.18970001, + "epoch": 0.5799488952352322, + "flos": 24426043441920.0, + "grad_norm": 53.14552063980344, + "language_loss": 0.84528553, + "learning_rate": 1.5822847564614244e-06, + "loss": 0.86220682, + "num_input_tokens_seen": 207734720, + "router_z_loss_clip": 2.75390625, + "router_z_loss_mlp": 0.36450195, + "step": 9646, + "time_per_iteration": 2.6819324493408203 + }, + { + "auxiliary_loss_clip": 0.01456734, + "auxiliary_loss_mlp": 0.00268034, + "balance_loss_clip": 1.18353319, + "balance_loss_mlp": 0.23322457, + "epoch": 0.5800090184879002, + "flos": 38395903829760.0, + "grad_norm": 29.273130389197092, + "language_loss": 0.6758033, + "learning_rate": 1.5819038918673038e-06, + "loss": 0.69305098, + "num_input_tokens_seen": 207755435, + "router_z_loss_clip": 2.73046875, + "router_z_loss_mlp": 0.34790039, + "step": 9647, + "time_per_iteration": 4.271580219268799 + }, + { + "auxiliary_loss_clip": 0.0143285, + "auxiliary_loss_mlp": 0.00238698, + "balance_loss_clip": 1.16384375, + "balance_loss_mlp": 0.20303103, + "epoch": 0.5800691417405681, + "flos": 19784840232960.0, + "grad_norm": 127.88481271754934, + "language_loss": 0.92493314, + "learning_rate": 1.5815230431274288e-06, + "loss": 0.9416486, + "num_input_tokens_seen": 207773570, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.35668945, + "step": 9648, + "time_per_iteration": 2.630004405975342 + }, + { + "auxiliary_loss_clip": 0.01504839, + "auxiliary_loss_mlp": 0.00159448, + "balance_loss_clip": 1.26795375, + "balance_loss_mlp": 0.14981619, + "epoch": 0.5801292649932361, + "flos": 70314565783680.0, + "grad_norm": 0.8418800483132198, + "language_loss": 0.62708032, + "learning_rate": 1.581142210256242e-06, + "loss": 0.64372325, + "num_input_tokens_seen": 207830095, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.09619141, + "step": 9649, + "time_per_iteration": 3.220691680908203 + }, + { + "auxiliary_loss_clip": 0.0141049, + "auxiliary_loss_mlp": 0.002321, + "balance_loss_clip": 1.15390563, + "balance_loss_mlp": 0.19841176, + "epoch": 0.5801893882459042, + "flos": 18734812928640.0, + "grad_norm": 33.106368416261816, + "language_loss": 0.87515783, + "learning_rate": 1.5807613932681857e-06, + "loss": 0.8915838, + "num_input_tokens_seen": 207848555, + "router_z_loss_clip": 2.56640625, + "router_z_loss_mlp": 0.33691406, + "step": 9650, + "time_per_iteration": 2.62776255607605 + }, + { + "auxiliary_loss_clip": 0.01434666, + "auxiliary_loss_mlp": 0.00244664, + "balance_loss_clip": 1.16283655, + "balance_loss_mlp": 0.21021225, + "epoch": 0.5802495114985721, + "flos": 15596507698560.0, + "grad_norm": 41.67988874696643, + "language_loss": 0.87757224, + "learning_rate": 1.580380592177698e-06, + "loss": 0.89436555, + "num_input_tokens_seen": 207867060, + "router_z_loss_clip": 2.72070312, + "router_z_loss_mlp": 0.34472656, + "step": 9651, + "time_per_iteration": 2.6643552780151367 + }, + { + "auxiliary_loss_clip": 0.01465181, + "auxiliary_loss_mlp": 0.00262973, + "balance_loss_clip": 1.18738627, + "balance_loss_mlp": 0.22585082, + "epoch": 0.5803096347512401, + "flos": 18255405081600.0, + "grad_norm": 28.957677933807393, + "language_loss": 0.83056247, + "learning_rate": 1.5799998069992213e-06, + "loss": 0.847844, + "num_input_tokens_seen": 207884520, + "router_z_loss_clip": 2.77929688, + "router_z_loss_mlp": 0.37109375, + "step": 9652, + "time_per_iteration": 2.619349956512451 + }, + { + "auxiliary_loss_clip": 0.0143957, + "auxiliary_loss_mlp": 0.00263755, + "balance_loss_clip": 1.16705358, + "balance_loss_mlp": 0.22799179, + "epoch": 0.580369758003908, + "flos": 22893160584960.0, + "grad_norm": 4.331862951973959, + "language_loss": 0.90412879, + "learning_rate": 1.579619037747193e-06, + "loss": 0.92116201, + "num_input_tokens_seen": 207905370, + "router_z_loss_clip": 2.7265625, + "router_z_loss_mlp": 0.35766602, + "step": 9653, + "time_per_iteration": 4.051369667053223 + }, + { + "auxiliary_loss_clip": 0.01435394, + "auxiliary_loss_mlp": 0.00289862, + "balance_loss_clip": 1.16929138, + "balance_loss_mlp": 0.25361055, + "epoch": 0.580429881256576, + "flos": 18697681244160.0, + "grad_norm": 31.483214406602944, + "language_loss": 0.84447879, + "learning_rate": 1.5792382844360534e-06, + "loss": 0.86173129, + "num_input_tokens_seen": 207923790, + "router_z_loss_clip": 2.66015625, + "router_z_loss_mlp": 0.36218262, + "step": 9654, + "time_per_iteration": 2.6401987075805664 + }, + { + "auxiliary_loss_clip": 0.01425825, + "auxiliary_loss_mlp": 0.00224884, + "balance_loss_clip": 1.16430533, + "balance_loss_mlp": 0.19021837, + "epoch": 0.5804900045092439, + "flos": 24681978823680.0, + "grad_norm": 8.277652227205861, + "language_loss": 0.76789933, + "learning_rate": 1.5788575470802408e-06, + "loss": 0.78440642, + "num_input_tokens_seen": 207942335, + "router_z_loss_clip": 2.61523438, + "router_z_loss_mlp": 0.34667969, + "step": 9655, + "time_per_iteration": 2.6327457427978516 + }, + { + "auxiliary_loss_clip": 0.01452082, + "auxiliary_loss_mlp": 0.00258275, + "balance_loss_clip": 1.1701746, + "balance_loss_mlp": 0.22210717, + "epoch": 0.580550127761912, + "flos": 23112790295040.0, + "grad_norm": 23.701028042623665, + "language_loss": 0.79574883, + "learning_rate": 1.5784768256941915e-06, + "loss": 0.81285238, + "num_input_tokens_seen": 207961975, + "router_z_loss_clip": 2.82226562, + "router_z_loss_mlp": 0.36181641, + "step": 9656, + "time_per_iteration": 2.6536576747894287 + }, + { + "auxiliary_loss_clip": 0.01411864, + "auxiliary_loss_mlp": 0.00245404, + "balance_loss_clip": 1.15582013, + "balance_loss_mlp": 0.21202585, + "epoch": 0.5806102510145799, + "flos": 18475681236480.0, + "grad_norm": 9.877090035031314, + "language_loss": 0.78507423, + "learning_rate": 1.5780961202923433e-06, + "loss": 0.80164695, + "num_input_tokens_seen": 207979520, + "router_z_loss_clip": 2.56054688, + "router_z_loss_mlp": 0.33374023, + "step": 9657, + "time_per_iteration": 2.628791570663452 + }, + { + "auxiliary_loss_clip": 0.01461918, + "auxiliary_loss_mlp": 0.00260116, + "balance_loss_clip": 1.18400073, + "balance_loss_mlp": 0.22423354, + "epoch": 0.5806703742672479, + "flos": 23915645136000.0, + "grad_norm": 9.795823399552962, + "language_loss": 0.8014065, + "learning_rate": 1.5777154308891328e-06, + "loss": 0.81862682, + "num_input_tokens_seen": 207998375, + "router_z_loss_clip": 2.77929688, + "router_z_loss_mlp": 0.35913086, + "step": 9658, + "time_per_iteration": 2.675096035003662 + }, + { + "auxiliary_loss_clip": 0.01475242, + "auxiliary_loss_mlp": 0.00101098, + "balance_loss_clip": 1.24778867, + "balance_loss_mlp": 0.09022592, + "epoch": 0.5807304975199158, + "flos": 66311999412480.0, + "grad_norm": 0.6399457300481245, + "language_loss": 0.52739513, + "learning_rate": 1.5773347574989953e-06, + "loss": 0.54315853, + "num_input_tokens_seen": 208060605, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.10888672, + "step": 9659, + "time_per_iteration": 3.1999716758728027 + }, + { + "auxiliary_loss_clip": 0.01435074, + "auxiliary_loss_mlp": 0.00250914, + "balance_loss_clip": 1.16591012, + "balance_loss_mlp": 0.2148886, + "epoch": 0.5807906207725838, + "flos": 31722444933120.0, + "grad_norm": 8.596679560694218, + "language_loss": 0.68746245, + "learning_rate": 1.576954100136366e-06, + "loss": 0.70432234, + "num_input_tokens_seen": 208080320, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.36035156, + "step": 9660, + "time_per_iteration": 2.7469418048858643 + }, + { + "auxiliary_loss_clip": 0.01434586, + "auxiliary_loss_mlp": 0.00288171, + "balance_loss_clip": 1.16705132, + "balance_loss_mlp": 0.2513116, + "epoch": 0.5808507440252517, + "flos": 23801161512960.0, + "grad_norm": 17.84934835151538, + "language_loss": 0.73175955, + "learning_rate": 1.5765734588156797e-06, + "loss": 0.74898708, + "num_input_tokens_seen": 208099305, + "router_z_loss_clip": 2.67773438, + "router_z_loss_mlp": 0.36889648, + "step": 9661, + "time_per_iteration": 2.6879231929779053 + }, + { + "auxiliary_loss_clip": 0.01422389, + "auxiliary_loss_mlp": 0.00236338, + "balance_loss_clip": 1.16557574, + "balance_loss_mlp": 0.2033644, + "epoch": 0.5809108672779197, + "flos": 13698449222400.0, + "grad_norm": 26.514551410090075, + "language_loss": 0.80029958, + "learning_rate": 1.5761928335513704e-06, + "loss": 0.8168869, + "num_input_tokens_seen": 208116960, + "router_z_loss_clip": 2.5703125, + "router_z_loss_mlp": 0.3293457, + "step": 9662, + "time_per_iteration": 2.686223268508911 + }, + { + "auxiliary_loss_clip": 0.01455156, + "auxiliary_loss_mlp": 0.00078035, + "balance_loss_clip": 1.23167884, + "balance_loss_mlp": 0.06635216, + "epoch": 0.5809709905305876, + "flos": 69134866381440.0, + "grad_norm": 0.8442384189804264, + "language_loss": 0.58126581, + "learning_rate": 1.5758122243578709e-06, + "loss": 0.59659779, + "num_input_tokens_seen": 208182190, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.11669922, + "step": 9663, + "time_per_iteration": 3.2520999908447266 + }, + { + "auxiliary_loss_clip": 0.01418864, + "auxiliary_loss_mlp": 0.00251739, + "balance_loss_clip": 1.15786648, + "balance_loss_mlp": 0.21931401, + "epoch": 0.5810311137832557, + "flos": 19827538525440.0, + "grad_norm": 3.4640391172418803, + "language_loss": 0.88910526, + "learning_rate": 1.5754316312496152e-06, + "loss": 0.90581131, + "num_input_tokens_seen": 208197015, + "router_z_loss_clip": 2.61132812, + "router_z_loss_mlp": 0.32446289, + "step": 9664, + "time_per_iteration": 2.6721017360687256 + }, + { + "auxiliary_loss_clip": 0.0142109, + "auxiliary_loss_mlp": 0.0024137, + "balance_loss_clip": 1.15750408, + "balance_loss_mlp": 0.2073475, + "epoch": 0.5810912370359237, + "flos": 29238503719680.0, + "grad_norm": 21.582497458036602, + "language_loss": 0.86945748, + "learning_rate": 1.5750510542410337e-06, + "loss": 0.88608211, + "num_input_tokens_seen": 208215795, + "router_z_loss_clip": 2.63476562, + "router_z_loss_mlp": 0.34033203, + "step": 9665, + "time_per_iteration": 2.7920408248901367 + }, + { + "auxiliary_loss_clip": 0.01447703, + "auxiliary_loss_mlp": 0.00255664, + "balance_loss_clip": 1.17740965, + "balance_loss_mlp": 0.22354892, + "epoch": 0.5811513602885916, + "flos": 22785572373120.0, + "grad_norm": 3.8207792152145474, + "language_loss": 0.8697772, + "learning_rate": 1.5746704933465599e-06, + "loss": 0.8868109, + "num_input_tokens_seen": 208234655, + "router_z_loss_clip": 2.70117188, + "router_z_loss_mlp": 0.32104492, + "step": 9666, + "time_per_iteration": 2.8320844173431396 + }, + { + "auxiliary_loss_clip": 0.01442235, + "auxiliary_loss_mlp": 0.00248341, + "balance_loss_clip": 1.18016648, + "balance_loss_mlp": 0.21348381, + "epoch": 0.5812114835412596, + "flos": 18734346051840.0, + "grad_norm": 2.581206960146033, + "language_loss": 0.86910093, + "learning_rate": 1.5742899485806227e-06, + "loss": 0.88600671, + "num_input_tokens_seen": 208251300, + "router_z_loss_clip": 2.62304688, + "router_z_loss_mlp": 0.34838867, + "step": 9667, + "time_per_iteration": 2.737617254257202 + }, + { + "auxiliary_loss_clip": 0.01460307, + "auxiliary_loss_mlp": 0.00253014, + "balance_loss_clip": 1.1856457, + "balance_loss_mlp": 0.21861057, + "epoch": 0.5812716067939275, + "flos": 26431295080320.0, + "grad_norm": 5.685153963038283, + "language_loss": 0.83597124, + "learning_rate": 1.573909419957653e-06, + "loss": 0.85310441, + "num_input_tokens_seen": 208272685, + "router_z_loss_clip": 2.74414062, + "router_z_loss_mlp": 0.34399414, + "step": 9668, + "time_per_iteration": 2.8012733459472656 + }, + { + "auxiliary_loss_clip": 0.0143394, + "auxiliary_loss_mlp": 0.00238091, + "balance_loss_clip": 1.16953897, + "balance_loss_mlp": 0.20542765, + "epoch": 0.5813317300465956, + "flos": 43397865285120.0, + "grad_norm": 5.199380763942596, + "language_loss": 0.7471211, + "learning_rate": 1.5735289074920819e-06, + "loss": 0.76384139, + "num_input_tokens_seen": 208294315, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.32666016, + "step": 9669, + "time_per_iteration": 2.855381965637207 + }, + { + "auxiliary_loss_clip": 0.01460403, + "auxiliary_loss_mlp": 0.00285664, + "balance_loss_clip": 1.18558264, + "balance_loss_mlp": 0.24987715, + "epoch": 0.5813918532992635, + "flos": 24785472885120.0, + "grad_norm": 25.591232336079564, + "language_loss": 0.80061251, + "learning_rate": 1.5731484111983363e-06, + "loss": 0.81807315, + "num_input_tokens_seen": 208315610, + "router_z_loss_clip": 2.75, + "router_z_loss_mlp": 0.35791016, + "step": 9670, + "time_per_iteration": 2.7088444232940674 + }, + { + "auxiliary_loss_clip": 0.01434564, + "auxiliary_loss_mlp": 0.00281517, + "balance_loss_clip": 1.1692338, + "balance_loss_mlp": 0.24463402, + "epoch": 0.5814519765519315, + "flos": 22857357703680.0, + "grad_norm": 4.479030260854064, + "language_loss": 0.88092422, + "learning_rate": 1.5727679310908464e-06, + "loss": 0.898085, + "num_input_tokens_seen": 208334725, + "router_z_loss_clip": 2.65234375, + "router_z_loss_mlp": 0.36914062, + "step": 9671, + "time_per_iteration": 2.7010111808776855 + }, + { + "auxiliary_loss_clip": 0.0146583, + "auxiliary_loss_mlp": 0.00280869, + "balance_loss_clip": 1.18792522, + "balance_loss_mlp": 0.24312693, + "epoch": 0.5815120998045994, + "flos": 24060831909120.0, + "grad_norm": 39.601144432398165, + "language_loss": 0.72438329, + "learning_rate": 1.5723874671840399e-06, + "loss": 0.74185026, + "num_input_tokens_seen": 208353825, + "router_z_loss_clip": 2.77929688, + "router_z_loss_mlp": 0.37744141, + "step": 9672, + "time_per_iteration": 2.702176570892334 + }, + { + "auxiliary_loss_clip": 0.01424635, + "auxiliary_loss_mlp": 0.00263192, + "balance_loss_clip": 1.16630745, + "balance_loss_mlp": 0.22988454, + "epoch": 0.5815722230572674, + "flos": 24279491952000.0, + "grad_norm": 22.673731271433503, + "language_loss": 0.88389784, + "learning_rate": 1.572007019492342e-06, + "loss": 0.90077609, + "num_input_tokens_seen": 208374160, + "router_z_loss_clip": 2.58007812, + "router_z_loss_mlp": 0.33276367, + "step": 9673, + "time_per_iteration": 2.687452554702759 + }, + { + "auxiliary_loss_clip": 0.01442274, + "auxiliary_loss_mlp": 0.00274211, + "balance_loss_clip": 1.17376709, + "balance_loss_mlp": 0.23744704, + "epoch": 0.5816323463099353, + "flos": 22200371994240.0, + "grad_norm": 5.80618551580568, + "language_loss": 0.96549642, + "learning_rate": 1.5716265880301817e-06, + "loss": 0.98266131, + "num_input_tokens_seen": 208392105, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.36767578, + "step": 9674, + "time_per_iteration": 2.651585102081299 + }, + { + "auxiliary_loss_clip": 0.01484701, + "auxiliary_loss_mlp": 0.00249638, + "balance_loss_clip": 1.20471072, + "balance_loss_mlp": 0.21435234, + "epoch": 0.5816924695626033, + "flos": 24134448833280.0, + "grad_norm": 287.80074106744325, + "language_loss": 0.85440999, + "learning_rate": 1.571246172811984e-06, + "loss": 0.87175345, + "num_input_tokens_seen": 208411755, + "router_z_loss_clip": 2.79882812, + "router_z_loss_mlp": 0.3527832, + "step": 9675, + "time_per_iteration": 2.6822450160980225 + }, + { + "auxiliary_loss_clip": 0.0144211, + "auxiliary_loss_mlp": 0.00228608, + "balance_loss_clip": 1.1769191, + "balance_loss_mlp": 0.19515836, + "epoch": 0.5817525928152713, + "flos": 21324223451520.0, + "grad_norm": 3.1830659538130393, + "language_loss": 0.78252339, + "learning_rate": 1.5708657738521748e-06, + "loss": 0.79923058, + "num_input_tokens_seen": 208429995, + "router_z_loss_clip": 2.65625, + "router_z_loss_mlp": 0.33447266, + "step": 9676, + "time_per_iteration": 2.668804168701172 + }, + { + "auxiliary_loss_clip": 0.01439924, + "auxiliary_loss_mlp": 0.00279569, + "balance_loss_clip": 1.1753583, + "balance_loss_mlp": 0.24533188, + "epoch": 0.5818127160679393, + "flos": 26934510666240.0, + "grad_norm": 82.96040220695258, + "language_loss": 0.7218883, + "learning_rate": 1.5704853911651779e-06, + "loss": 0.73908317, + "num_input_tokens_seen": 208443655, + "router_z_loss_clip": 2.65039062, + "router_z_loss_mlp": 0.34277344, + "step": 9677, + "time_per_iteration": 2.698634147644043 + }, + { + "auxiliary_loss_clip": 0.01415625, + "auxiliary_loss_mlp": 0.00108149, + "balance_loss_clip": 1.19720244, + "balance_loss_mlp": 0.0985164, + "epoch": 0.5818728393206073, + "flos": 63918626342400.0, + "grad_norm": 0.8124567798299924, + "language_loss": 0.54034424, + "learning_rate": 1.5701050247654182e-06, + "loss": 0.55558205, + "num_input_tokens_seen": 208498405, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.09619141, + "step": 9678, + "time_per_iteration": 3.234797239303589 + }, + { + "auxiliary_loss_clip": 0.01403267, + "auxiliary_loss_mlp": 0.00068916, + "balance_loss_clip": 1.1902585, + "balance_loss_mlp": 0.05871172, + "epoch": 0.5819329625732752, + "flos": 64954108638720.0, + "grad_norm": 0.7310492744720045, + "language_loss": 0.55581659, + "learning_rate": 1.569724674667319e-06, + "loss": 0.57053846, + "num_input_tokens_seen": 208559075, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.10205078, + "step": 9679, + "time_per_iteration": 3.0527970790863037 + }, + { + "auxiliary_loss_clip": 0.0140956, + "auxiliary_loss_mlp": 0.00241682, + "balance_loss_clip": 1.15623283, + "balance_loss_mlp": 0.20713517, + "epoch": 0.5819930858259432, + "flos": 21215270522880.0, + "grad_norm": 3.996650612440571, + "language_loss": 0.72403675, + "learning_rate": 1.5693443408853032e-06, + "loss": 0.74054921, + "num_input_tokens_seen": 208577770, + "router_z_loss_clip": 2.53515625, + "router_z_loss_mlp": 0.34545898, + "step": 9680, + "time_per_iteration": 2.6927926540374756 + }, + { + "auxiliary_loss_clip": 0.01424467, + "auxiliary_loss_mlp": 0.00263878, + "balance_loss_clip": 1.16489613, + "balance_loss_mlp": 0.2307613, + "epoch": 0.5820532090786111, + "flos": 19458520151040.0, + "grad_norm": 53.78562312511489, + "language_loss": 0.89734864, + "learning_rate": 1.5689640234337933e-06, + "loss": 0.91423213, + "num_input_tokens_seen": 208595110, + "router_z_loss_clip": 2.59179688, + "router_z_loss_mlp": 0.33129883, + "step": 9681, + "time_per_iteration": 2.6897482872009277 + }, + { + "auxiliary_loss_clip": 0.01397214, + "auxiliary_loss_mlp": 0.00252289, + "balance_loss_clip": 1.14650238, + "balance_loss_mlp": 0.22131892, + "epoch": 0.5821133323312792, + "flos": 17712615686400.0, + "grad_norm": 7.422136703743692, + "language_loss": 0.8108592, + "learning_rate": 1.5685837223272109e-06, + "loss": 0.82735419, + "num_input_tokens_seen": 208612080, + "router_z_loss_clip": 2.5078125, + "router_z_loss_mlp": 0.30981445, + "step": 9682, + "time_per_iteration": 2.662064552307129 + }, + { + "auxiliary_loss_clip": 0.01426837, + "auxiliary_loss_mlp": 0.0029175, + "balance_loss_clip": 1.16312003, + "balance_loss_mlp": 0.25746524, + "epoch": 0.5821734555839471, + "flos": 24571804832640.0, + "grad_norm": 48.096744698587784, + "language_loss": 0.8439886, + "learning_rate": 1.568203437579977e-06, + "loss": 0.86117446, + "num_input_tokens_seen": 208630235, + "router_z_loss_clip": 2.63476562, + "router_z_loss_mlp": 0.34301758, + "step": 9683, + "time_per_iteration": 2.6958487033843994 + }, + { + "auxiliary_loss_clip": 0.01435007, + "auxiliary_loss_mlp": 0.00267785, + "balance_loss_clip": 1.16681135, + "balance_loss_mlp": 0.2355748, + "epoch": 0.5822335788366151, + "flos": 22382259488640.0, + "grad_norm": 5.191046252542036, + "language_loss": 0.80389458, + "learning_rate": 1.5678231692065116e-06, + "loss": 0.82092249, + "num_input_tokens_seen": 208647925, + "router_z_loss_clip": 2.6796875, + "router_z_loss_mlp": 0.32177734, + "step": 9684, + "time_per_iteration": 2.7245402336120605 + }, + { + "auxiliary_loss_clip": 0.0141136, + "auxiliary_loss_mlp": 0.00299464, + "balance_loss_clip": 1.15285933, + "balance_loss_mlp": 0.26742011, + "epoch": 0.582293702089283, + "flos": 26722494639360.0, + "grad_norm": 1883.6574769105978, + "language_loss": 0.84360737, + "learning_rate": 1.5674429172212348e-06, + "loss": 0.86071563, + "num_input_tokens_seen": 208666180, + "router_z_loss_clip": 2.5859375, + "router_z_loss_mlp": 0.32043457, + "step": 9685, + "time_per_iteration": 4.263071537017822 + }, + { + "auxiliary_loss_clip": 0.01408416, + "auxiliary_loss_mlp": 0.00261036, + "balance_loss_clip": 1.15548635, + "balance_loss_mlp": 0.22784799, + "epoch": 0.582353825341951, + "flos": 17348661129600.0, + "grad_norm": 34.98425650947022, + "language_loss": 0.8448981, + "learning_rate": 1.5670626816385667e-06, + "loss": 0.86159253, + "num_input_tokens_seen": 208684240, + "router_z_loss_clip": 2.52539062, + "router_z_loss_mlp": 0.33227539, + "step": 9686, + "time_per_iteration": 2.695342779159546 + }, + { + "auxiliary_loss_clip": 0.01406217, + "auxiliary_loss_mlp": 0.00107428, + "balance_loss_clip": 1.19555593, + "balance_loss_mlp": 0.09669939, + "epoch": 0.5824139485946189, + "flos": 55473261534720.0, + "grad_norm": 0.7966234519230274, + "language_loss": 0.56433403, + "learning_rate": 1.5666824624729244e-06, + "loss": 0.57947052, + "num_input_tokens_seen": 208736090, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.10742188, + "step": 9687, + "time_per_iteration": 4.429518461227417 + }, + { + "auxiliary_loss_clip": 0.01426518, + "auxiliary_loss_mlp": 0.00253128, + "balance_loss_clip": 1.16527152, + "balance_loss_mlp": 0.22156179, + "epoch": 0.582474071847287, + "flos": 20303031790080.0, + "grad_norm": 5.025003757303838, + "language_loss": 0.78661847, + "learning_rate": 1.566302259738727e-06, + "loss": 0.80341488, + "num_input_tokens_seen": 208754600, + "router_z_loss_clip": 2.61132812, + "router_z_loss_mlp": 0.31616211, + "step": 9688, + "time_per_iteration": 2.732639789581299 + }, + { + "auxiliary_loss_clip": 0.0143789, + "auxiliary_loss_mlp": 0.00230918, + "balance_loss_clip": 1.16866505, + "balance_loss_mlp": 0.20216522, + "epoch": 0.5825341950999549, + "flos": 23878010661120.0, + "grad_norm": 211.4461487080634, + "language_loss": 0.73504448, + "learning_rate": 1.5659220734503918e-06, + "loss": 0.75173247, + "num_input_tokens_seen": 208773140, + "router_z_loss_clip": 2.69335938, + "router_z_loss_mlp": 0.2878418, + "step": 9689, + "time_per_iteration": 2.8384385108947754 + }, + { + "auxiliary_loss_clip": 0.0143089, + "auxiliary_loss_mlp": 0.00269078, + "balance_loss_clip": 1.17263961, + "balance_loss_mlp": 0.2345787, + "epoch": 0.5825943183526229, + "flos": 23113041690240.0, + "grad_norm": 5.6604905675176775, + "language_loss": 0.81156147, + "learning_rate": 1.5655419036223341e-06, + "loss": 0.82856119, + "num_input_tokens_seen": 208793410, + "router_z_loss_clip": 2.58007812, + "router_z_loss_mlp": 0.34484863, + "step": 9690, + "time_per_iteration": 4.209598064422607 + }, + { + "auxiliary_loss_clip": 0.01413419, + "auxiliary_loss_mlp": 0.00269502, + "balance_loss_clip": 1.15490603, + "balance_loss_mlp": 0.2347403, + "epoch": 0.5826544416052909, + "flos": 22857429530880.0, + "grad_norm": 2.8693964380939154, + "language_loss": 0.83441389, + "learning_rate": 1.5651617502689717e-06, + "loss": 0.85124314, + "num_input_tokens_seen": 208811920, + "router_z_loss_clip": 2.5859375, + "router_z_loss_mlp": 0.34765625, + "step": 9691, + "time_per_iteration": 2.659247398376465 + }, + { + "auxiliary_loss_clip": 0.01406241, + "auxiliary_loss_mlp": 0.00259712, + "balance_loss_clip": 1.14478111, + "balance_loss_mlp": 0.22695279, + "epoch": 0.5827145648579588, + "flos": 31501845555840.0, + "grad_norm": 23.677123555344057, + "language_loss": 0.87723905, + "learning_rate": 1.5647816134047184e-06, + "loss": 0.89389861, + "num_input_tokens_seen": 208834720, + "router_z_loss_clip": 2.61523438, + "router_z_loss_mlp": 0.32739258, + "step": 9692, + "time_per_iteration": 2.7709574699401855 + }, + { + "auxiliary_loss_clip": 0.01397954, + "auxiliary_loss_mlp": 0.00148867, + "balance_loss_clip": 1.19543648, + "balance_loss_mlp": 0.13823333, + "epoch": 0.5827746881106268, + "flos": 69811817074560.0, + "grad_norm": 0.7704784840901414, + "language_loss": 0.56665868, + "learning_rate": 1.5644014930439907e-06, + "loss": 0.58212692, + "num_input_tokens_seen": 208898415, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.10644531, + "step": 9693, + "time_per_iteration": 3.1583547592163086 + }, + { + "auxiliary_loss_clip": 0.01406627, + "auxiliary_loss_mlp": 0.00247595, + "balance_loss_clip": 1.15042996, + "balance_loss_mlp": 0.21724401, + "epoch": 0.5828348113632947, + "flos": 23112395245440.0, + "grad_norm": 5.6781641455398635, + "language_loss": 0.86403227, + "learning_rate": 1.5640213892012025e-06, + "loss": 0.88057452, + "num_input_tokens_seen": 208919045, + "router_z_loss_clip": 2.56054688, + "router_z_loss_mlp": 0.30322266, + "step": 9694, + "time_per_iteration": 2.7098002433776855 + }, + { + "auxiliary_loss_clip": 0.01394388, + "auxiliary_loss_mlp": 0.00273282, + "balance_loss_clip": 1.14591384, + "balance_loss_mlp": 0.24302708, + "epoch": 0.5828949346159628, + "flos": 21873082245120.0, + "grad_norm": 149.06618324276025, + "language_loss": 0.81838644, + "learning_rate": 1.5636413018907656e-06, + "loss": 0.8350631, + "num_input_tokens_seen": 208939375, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.30249023, + "step": 9695, + "time_per_iteration": 4.156244516372681 + }, + { + "auxiliary_loss_clip": 0.01342892, + "auxiliary_loss_mlp": 0.00104365, + "balance_loss_clip": 1.14802337, + "balance_loss_mlp": 0.09463781, + "epoch": 0.5829550578686307, + "flos": 65962553950080.0, + "grad_norm": 0.7413795438955938, + "language_loss": 0.54374528, + "learning_rate": 1.563261231127095e-06, + "loss": 0.55821782, + "num_input_tokens_seen": 209004760, + "router_z_loss_clip": 1.953125, + "router_z_loss_mlp": 0.09716797, + "step": 9696, + "time_per_iteration": 3.247711658477783 + }, + { + "auxiliary_loss_clip": 0.0141333, + "auxiliary_loss_mlp": 0.00267658, + "balance_loss_clip": 1.15726376, + "balance_loss_mlp": 0.23566231, + "epoch": 0.5830151811212987, + "flos": 16289799079680.0, + "grad_norm": 160.11330911516362, + "language_loss": 0.85378724, + "learning_rate": 1.5628811769246021e-06, + "loss": 0.87059712, + "num_input_tokens_seen": 209022930, + "router_z_loss_clip": 2.5625, + "router_z_loss_mlp": 0.32006836, + "step": 9697, + "time_per_iteration": 2.6413979530334473 + }, + { + "auxiliary_loss_clip": 0.01411525, + "auxiliary_loss_mlp": 0.0028151, + "balance_loss_clip": 1.15295231, + "balance_loss_mlp": 0.24665324, + "epoch": 0.5830753043739666, + "flos": 24168851084160.0, + "grad_norm": 180.37702597331017, + "language_loss": 0.83706081, + "learning_rate": 1.5625011392976991e-06, + "loss": 0.85399115, + "num_input_tokens_seen": 209043740, + "router_z_loss_clip": 2.58203125, + "router_z_loss_mlp": 0.34863281, + "step": 9698, + "time_per_iteration": 2.6929476261138916 + }, + { + "auxiliary_loss_clip": 0.01425841, + "auxiliary_loss_mlp": 0.00269421, + "balance_loss_clip": 1.16470349, + "balance_loss_mlp": 0.23642346, + "epoch": 0.5831354276266346, + "flos": 27059050097280.0, + "grad_norm": 9.290964499614601, + "language_loss": 0.89576495, + "learning_rate": 1.5621211182607966e-06, + "loss": 0.91271746, + "num_input_tokens_seen": 209068885, + "router_z_loss_clip": 2.61132812, + "router_z_loss_mlp": 0.33007812, + "step": 9699, + "time_per_iteration": 2.864872694015503 + }, + { + "auxiliary_loss_clip": 0.01437463, + "auxiliary_loss_mlp": 0.00251357, + "balance_loss_clip": 1.16956961, + "balance_loss_mlp": 0.21862237, + "epoch": 0.5831955508793025, + "flos": 23623475909760.0, + "grad_norm": 142.33507769240035, + "language_loss": 0.74456525, + "learning_rate": 1.561741113828305e-06, + "loss": 0.76145351, + "num_input_tokens_seen": 209087340, + "router_z_loss_clip": 2.67773438, + "router_z_loss_mlp": 0.32714844, + "step": 9700, + "time_per_iteration": 2.716184377670288 + }, + { + "auxiliary_loss_clip": 0.01422462, + "auxiliary_loss_mlp": 0.00299775, + "balance_loss_clip": 1.15922928, + "balance_loss_mlp": 0.26770726, + "epoch": 0.5832556741319705, + "flos": 24973250209920.0, + "grad_norm": 20.773391688333017, + "language_loss": 0.77631998, + "learning_rate": 1.5613611260146344e-06, + "loss": 0.79354239, + "num_input_tokens_seen": 209108840, + "router_z_loss_clip": 2.6328125, + "router_z_loss_mlp": 0.32080078, + "step": 9701, + "time_per_iteration": 2.756169557571411 + }, + { + "auxiliary_loss_clip": 0.01404861, + "auxiliary_loss_mlp": 0.00265855, + "balance_loss_clip": 1.14934719, + "balance_loss_mlp": 0.23447911, + "epoch": 0.5833157973846385, + "flos": 23221563655680.0, + "grad_norm": 10.72596626390883, + "language_loss": 0.93243372, + "learning_rate": 1.5609811548341936e-06, + "loss": 0.94914079, + "num_input_tokens_seen": 209127985, + "router_z_loss_clip": 2.55078125, + "router_z_loss_mlp": 0.31347656, + "step": 9702, + "time_per_iteration": 2.7264559268951416 + }, + { + "auxiliary_loss_clip": 0.01424218, + "auxiliary_loss_mlp": 0.00242462, + "balance_loss_clip": 1.16523778, + "balance_loss_mlp": 0.21051362, + "epoch": 0.5833759206373065, + "flos": 21977941023360.0, + "grad_norm": 6.662741201290832, + "language_loss": 0.8293494, + "learning_rate": 1.560601200301392e-06, + "loss": 0.84601623, + "num_input_tokens_seen": 209146885, + "router_z_loss_clip": 2.58789062, + "router_z_loss_mlp": 0.31958008, + "step": 9703, + "time_per_iteration": 2.7970352172851562 + }, + { + "auxiliary_loss_clip": 0.0143473, + "auxiliary_loss_mlp": 0.00262563, + "balance_loss_clip": 1.16516519, + "balance_loss_mlp": 0.22832608, + "epoch": 0.5834360438899745, + "flos": 21762405463680.0, + "grad_norm": 23.9423745139248, + "language_loss": 0.78799343, + "learning_rate": 1.5602212624306366e-06, + "loss": 0.80496633, + "num_input_tokens_seen": 209166130, + "router_z_loss_clip": 2.69726562, + "router_z_loss_mlp": 0.34204102, + "step": 9704, + "time_per_iteration": 2.695429563522339 + }, + { + "auxiliary_loss_clip": 0.01395067, + "auxiliary_loss_mlp": 0.00277758, + "balance_loss_clip": 1.13982272, + "balance_loss_mlp": 0.24504668, + "epoch": 0.5834961671426424, + "flos": 15992566035840.0, + "grad_norm": 16.682774811757273, + "language_loss": 0.87770712, + "learning_rate": 1.559841341236335e-06, + "loss": 0.89443535, + "num_input_tokens_seen": 209183350, + "router_z_loss_clip": 2.55078125, + "router_z_loss_mlp": 0.32739258, + "step": 9705, + "time_per_iteration": 2.636247396469116 + }, + { + "auxiliary_loss_clip": 0.01427153, + "auxiliary_loss_mlp": 0.00285746, + "balance_loss_clip": 1.16194916, + "balance_loss_mlp": 0.25015044, + "epoch": 0.5835562903953104, + "flos": 22818322598400.0, + "grad_norm": 6.933015062768426, + "language_loss": 0.85994565, + "learning_rate": 1.5594614367328937e-06, + "loss": 0.8770746, + "num_input_tokens_seen": 209203945, + "router_z_loss_clip": 2.65039062, + "router_z_loss_mlp": 0.35620117, + "step": 9706, + "time_per_iteration": 2.7172117233276367 + }, + { + "auxiliary_loss_clip": 0.01432049, + "auxiliary_loss_mlp": 0.00261408, + "balance_loss_clip": 1.17264009, + "balance_loss_mlp": 0.22781461, + "epoch": 0.5836164136479783, + "flos": 48468056624640.0, + "grad_norm": 8.693902449237452, + "language_loss": 0.83756614, + "learning_rate": 1.5590815489347187e-06, + "loss": 0.85450071, + "num_input_tokens_seen": 209227080, + "router_z_loss_clip": 2.59570312, + "router_z_loss_mlp": 0.3359375, + "step": 9707, + "time_per_iteration": 3.0046069622039795 + }, + { + "auxiliary_loss_clip": 0.01405243, + "auxiliary_loss_mlp": 0.00277565, + "balance_loss_clip": 1.15778995, + "balance_loss_mlp": 0.24568847, + "epoch": 0.5836765369006464, + "flos": 26905998245760.0, + "grad_norm": 5.9673446611829934, + "language_loss": 0.86950868, + "learning_rate": 1.5587016778562163e-06, + "loss": 0.8863368, + "num_input_tokens_seen": 209248170, + "router_z_loss_clip": 2.47460938, + "router_z_loss_mlp": 0.31884766, + "step": 9708, + "time_per_iteration": 2.8034708499908447 + }, + { + "auxiliary_loss_clip": 0.01425922, + "auxiliary_loss_mlp": 0.00272782, + "balance_loss_clip": 1.16616702, + "balance_loss_mlp": 0.2422287, + "epoch": 0.5837366601533143, + "flos": 20084048524800.0, + "grad_norm": 4.533411672329313, + "language_loss": 0.84687173, + "learning_rate": 1.5583218235117896e-06, + "loss": 0.86385876, + "num_input_tokens_seen": 209267730, + "router_z_loss_clip": 2.59570312, + "router_z_loss_mlp": 0.30529785, + "step": 9709, + "time_per_iteration": 2.6899569034576416 + }, + { + "auxiliary_loss_clip": 0.01385813, + "auxiliary_loss_mlp": 0.00151723, + "balance_loss_clip": 1.1891315, + "balance_loss_mlp": 0.14147066, + "epoch": 0.5837967834059823, + "flos": 65363885971200.0, + "grad_norm": 0.7627342075640489, + "language_loss": 0.56327617, + "learning_rate": 1.557941985915844e-06, + "loss": 0.57865155, + "num_input_tokens_seen": 209332510, + "router_z_loss_clip": 1.96875, + "router_z_loss_mlp": 0.10253906, + "step": 9710, + "time_per_iteration": 3.2131545543670654 + }, + { + "auxiliary_loss_clip": 0.01419632, + "auxiliary_loss_mlp": 0.00246962, + "balance_loss_clip": 1.16520071, + "balance_loss_mlp": 0.2147751, + "epoch": 0.5838569066586502, + "flos": 25338641310720.0, + "grad_norm": 34.38429744977601, + "language_loss": 0.73220217, + "learning_rate": 1.5575621650827833e-06, + "loss": 0.74886811, + "num_input_tokens_seen": 209353355, + "router_z_loss_clip": 2.54296875, + "router_z_loss_mlp": 0.32202148, + "step": 9711, + "time_per_iteration": 2.756408929824829 + }, + { + "auxiliary_loss_clip": 0.01443891, + "auxiliary_loss_mlp": 0.00302402, + "balance_loss_clip": 1.17221189, + "balance_loss_mlp": 0.26566219, + "epoch": 0.5839170299113182, + "flos": 22229243550720.0, + "grad_norm": 9.294058052943358, + "language_loss": 0.86745489, + "learning_rate": 1.5571823610270085e-06, + "loss": 0.88491786, + "num_input_tokens_seen": 209370960, + "router_z_loss_clip": 2.71875, + "router_z_loss_mlp": 0.36694336, + "step": 9712, + "time_per_iteration": 2.632793426513672 + }, + { + "auxiliary_loss_clip": 0.01397062, + "auxiliary_loss_mlp": 0.00280924, + "balance_loss_clip": 1.14462507, + "balance_loss_mlp": 0.24783117, + "epoch": 0.5839771531639861, + "flos": 22200012858240.0, + "grad_norm": 48.2739825398545, + "language_loss": 0.79508102, + "learning_rate": 1.5568025737629234e-06, + "loss": 0.81186086, + "num_input_tokens_seen": 209390955, + "router_z_loss_clip": 2.52539062, + "router_z_loss_mlp": 0.33105469, + "step": 9713, + "time_per_iteration": 2.655731439590454 + }, + { + "auxiliary_loss_clip": 0.01450419, + "auxiliary_loss_mlp": 0.00286868, + "balance_loss_clip": 1.17641354, + "balance_loss_mlp": 0.25008041, + "epoch": 0.5840372764166541, + "flos": 22419355259520.0, + "grad_norm": 20.496828783630725, + "language_loss": 0.80820906, + "learning_rate": 1.5564228033049292e-06, + "loss": 0.82558197, + "num_input_tokens_seen": 209410260, + "router_z_loss_clip": 2.73828125, + "router_z_loss_mlp": 0.36791992, + "step": 9714, + "time_per_iteration": 2.661358594894409 + }, + { + "auxiliary_loss_clip": 0.01431267, + "auxiliary_loss_mlp": 0.00279843, + "balance_loss_clip": 1.16513681, + "balance_loss_mlp": 0.24541551, + "epoch": 0.5840973996693221, + "flos": 19828256797440.0, + "grad_norm": 34.24472509443729, + "language_loss": 0.85561264, + "learning_rate": 1.5560430496674268e-06, + "loss": 0.87272376, + "num_input_tokens_seen": 209429920, + "router_z_loss_clip": 2.66015625, + "router_z_loss_mlp": 0.34399414, + "step": 9715, + "time_per_iteration": 2.6563961505889893 + }, + { + "auxiliary_loss_clip": 0.01406895, + "auxiliary_loss_mlp": 0.00260477, + "balance_loss_clip": 1.15223992, + "balance_loss_mlp": 0.2282431, + "epoch": 0.5841575229219901, + "flos": 21142982401920.0, + "grad_norm": 39.68752471541606, + "language_loss": 0.806113, + "learning_rate": 1.5556633128648167e-06, + "loss": 0.82278669, + "num_input_tokens_seen": 209449470, + "router_z_loss_clip": 2.55078125, + "router_z_loss_mlp": 0.32226562, + "step": 9716, + "time_per_iteration": 2.651970863342285 + }, + { + "auxiliary_loss_clip": 0.01406309, + "auxiliary_loss_mlp": 0.00251414, + "balance_loss_clip": 1.15348685, + "balance_loss_mlp": 0.21782109, + "epoch": 0.5842176461746581, + "flos": 24640322025600.0, + "grad_norm": 30.753105783244543, + "language_loss": 0.80958885, + "learning_rate": 1.5552835929114976e-06, + "loss": 0.82616609, + "num_input_tokens_seen": 209467695, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.33618164, + "step": 9717, + "time_per_iteration": 2.704937696456909 + }, + { + "auxiliary_loss_clip": 0.01406436, + "auxiliary_loss_mlp": 0.00265279, + "balance_loss_clip": 1.15433168, + "balance_loss_mlp": 0.23035091, + "epoch": 0.584277769427326, + "flos": 19131158574720.0, + "grad_norm": 484.19668146905826, + "language_loss": 0.87799537, + "learning_rate": 1.5549038898218697e-06, + "loss": 0.89471251, + "num_input_tokens_seen": 209484250, + "router_z_loss_clip": 2.51953125, + "router_z_loss_mlp": 0.34936523, + "step": 9718, + "time_per_iteration": 2.6332077980041504 + }, + { + "auxiliary_loss_clip": 0.01420151, + "auxiliary_loss_mlp": 0.00272484, + "balance_loss_clip": 1.1637001, + "balance_loss_mlp": 0.23650675, + "epoch": 0.584337892679994, + "flos": 22675111073280.0, + "grad_norm": 63.99890798149002, + "language_loss": 0.75018799, + "learning_rate": 1.5545242036103306e-06, + "loss": 0.76711428, + "num_input_tokens_seen": 209502830, + "router_z_loss_clip": 2.56640625, + "router_z_loss_mlp": 0.36010742, + "step": 9719, + "time_per_iteration": 2.7371108531951904 + }, + { + "auxiliary_loss_clip": 0.01421813, + "auxiliary_loss_mlp": 0.00275942, + "balance_loss_clip": 1.16012275, + "balance_loss_mlp": 0.24058405, + "epoch": 0.5843980159326619, + "flos": 31284083352960.0, + "grad_norm": 7.664558167722996, + "language_loss": 0.84203333, + "learning_rate": 1.5541445342912786e-06, + "loss": 0.85901088, + "num_input_tokens_seen": 209525995, + "router_z_loss_clip": 2.62109375, + "router_z_loss_mlp": 0.35351562, + "step": 9720, + "time_per_iteration": 2.7677364349365234 + }, + { + "auxiliary_loss_clip": 0.0143034, + "auxiliary_loss_mlp": 0.00274596, + "balance_loss_clip": 1.16668427, + "balance_loss_mlp": 0.23759331, + "epoch": 0.58445813918533, + "flos": 22748117466240.0, + "grad_norm": 60.915656355718326, + "language_loss": 0.89671397, + "learning_rate": 1.5537648818791105e-06, + "loss": 0.91376334, + "num_input_tokens_seen": 209545895, + "router_z_loss_clip": 2.63671875, + "router_z_loss_mlp": 0.36987305, + "step": 9721, + "time_per_iteration": 2.727766513824463 + }, + { + "auxiliary_loss_clip": 0.0139477, + "auxiliary_loss_mlp": 0.00149993, + "balance_loss_clip": 1.1996814, + "balance_loss_mlp": 0.1381194, + "epoch": 0.5845182624379979, + "flos": 60686556658560.0, + "grad_norm": 1.2501869279179294, + "language_loss": 0.70787096, + "learning_rate": 1.5533852463882226e-06, + "loss": 0.72331864, + "num_input_tokens_seen": 209602315, + "router_z_loss_clip": 1.953125, + "router_z_loss_mlp": 0.11865234, + "step": 9722, + "time_per_iteration": 3.284536361694336 + }, + { + "auxiliary_loss_clip": 0.01408284, + "auxiliary_loss_mlp": 0.00246688, + "balance_loss_clip": 1.15242648, + "balance_loss_mlp": 0.21366751, + "epoch": 0.5845783856906659, + "flos": 16362446336640.0, + "grad_norm": 2.013606833062, + "language_loss": 0.96209502, + "learning_rate": 1.5530056278330113e-06, + "loss": 0.97864467, + "num_input_tokens_seen": 209617615, + "router_z_loss_clip": 2.55859375, + "router_z_loss_mlp": 0.33056641, + "step": 9723, + "time_per_iteration": 2.662440061569214 + }, + { + "auxiliary_loss_clip": 0.01418168, + "auxiliary_loss_mlp": 0.00246584, + "balance_loss_clip": 1.1625495, + "balance_loss_mlp": 0.21206099, + "epoch": 0.5846385089433338, + "flos": 20083402080000.0, + "grad_norm": 12.948145381204586, + "language_loss": 0.74832976, + "learning_rate": 1.5526260262278709e-06, + "loss": 0.76497728, + "num_input_tokens_seen": 209637005, + "router_z_loss_clip": 2.5546875, + "router_z_loss_mlp": 0.34521484, + "step": 9724, + "time_per_iteration": 2.688657522201538 + }, + { + "auxiliary_loss_clip": 0.01416864, + "auxiliary_loss_mlp": 0.00253066, + "balance_loss_clip": 1.15138268, + "balance_loss_mlp": 0.21954402, + "epoch": 0.5846986321960018, + "flos": 17311062568320.0, + "grad_norm": 238.4142574654594, + "language_loss": 0.94747716, + "learning_rate": 1.552246441587197e-06, + "loss": 0.96417642, + "num_input_tokens_seen": 209653170, + "router_z_loss_clip": 2.65234375, + "router_z_loss_mlp": 0.33544922, + "step": 9725, + "time_per_iteration": 2.6472129821777344 + }, + { + "auxiliary_loss_clip": 0.01428606, + "auxiliary_loss_mlp": 0.00274967, + "balance_loss_clip": 1.16552675, + "balance_loss_mlp": 0.24165946, + "epoch": 0.5847587554486697, + "flos": 17197907748480.0, + "grad_norm": 64.02242267999365, + "language_loss": 0.87494206, + "learning_rate": 1.5518668739253821e-06, + "loss": 0.89197779, + "num_input_tokens_seen": 209671275, + "router_z_loss_clip": 2.63085938, + "router_z_loss_mlp": 0.33276367, + "step": 9726, + "time_per_iteration": 2.6538071632385254 + }, + { + "auxiliary_loss_clip": 0.01412624, + "auxiliary_loss_mlp": 0.00287173, + "balance_loss_clip": 1.15958953, + "balance_loss_mlp": 0.25422311, + "epoch": 0.5848188787013378, + "flos": 24529106540160.0, + "grad_norm": 48.30326865668973, + "language_loss": 0.75050312, + "learning_rate": 1.5514873232568206e-06, + "loss": 0.76750112, + "num_input_tokens_seen": 209690380, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.3293457, + "step": 9727, + "time_per_iteration": 4.101094484329224 + }, + { + "auxiliary_loss_clip": 0.01430454, + "auxiliary_loss_mlp": 0.00257126, + "balance_loss_clip": 1.17099535, + "balance_loss_mlp": 0.22133912, + "epoch": 0.5848790019540057, + "flos": 20628382204800.0, + "grad_norm": 17.410848074698922, + "language_loss": 0.87476617, + "learning_rate": 1.5511077895959055e-06, + "loss": 0.89164197, + "num_input_tokens_seen": 209708845, + "router_z_loss_clip": 2.59570312, + "router_z_loss_mlp": 0.3581543, + "step": 9728, + "time_per_iteration": 2.6479456424713135 + }, + { + "auxiliary_loss_clip": 0.01403506, + "auxiliary_loss_mlp": 0.00235891, + "balance_loss_clip": 1.15106702, + "balance_loss_mlp": 0.20229813, + "epoch": 0.5849391252066737, + "flos": 22418852469120.0, + "grad_norm": 6.631638027114804, + "language_loss": 0.8400023, + "learning_rate": 1.550728272957027e-06, + "loss": 0.85639632, + "num_input_tokens_seen": 209729000, + "router_z_loss_clip": 2.52539062, + "router_z_loss_mlp": 0.3359375, + "step": 9729, + "time_per_iteration": 4.19434666633606 + }, + { + "auxiliary_loss_clip": 0.01417711, + "auxiliary_loss_mlp": 0.00232889, + "balance_loss_clip": 1.15601623, + "balance_loss_mlp": 0.19920063, + "epoch": 0.5849992484593417, + "flos": 25410929431680.0, + "grad_norm": 3.437930934970691, + "language_loss": 0.80134487, + "learning_rate": 1.5503487733545782e-06, + "loss": 0.81785089, + "num_input_tokens_seen": 209747435, + "router_z_loss_clip": 2.6171875, + "router_z_loss_mlp": 0.33642578, + "step": 9730, + "time_per_iteration": 2.743098497390747 + }, + { + "auxiliary_loss_clip": 0.01440211, + "auxiliary_loss_mlp": 0.00250902, + "balance_loss_clip": 1.16888499, + "balance_loss_mlp": 0.21611637, + "epoch": 0.5850593717120096, + "flos": 21065163586560.0, + "grad_norm": 4.684179589701503, + "language_loss": 0.85115719, + "learning_rate": 1.5499692908029482e-06, + "loss": 0.86806834, + "num_input_tokens_seen": 209764910, + "router_z_loss_clip": 2.71679688, + "router_z_loss_mlp": 0.34765625, + "step": 9731, + "time_per_iteration": 2.6558971405029297 + }, + { + "auxiliary_loss_clip": 0.01404522, + "auxiliary_loss_mlp": 0.00231289, + "balance_loss_clip": 1.14792347, + "balance_loss_mlp": 0.1991975, + "epoch": 0.5851194949646776, + "flos": 25301545539840.0, + "grad_norm": 11.506297204459385, + "language_loss": 0.78603309, + "learning_rate": 1.549589825316528e-06, + "loss": 0.80239117, + "num_input_tokens_seen": 209786115, + "router_z_loss_clip": 2.56054688, + "router_z_loss_mlp": 0.32104492, + "step": 9732, + "time_per_iteration": 4.127508163452148 + }, + { + "auxiliary_loss_clip": 0.01456654, + "auxiliary_loss_mlp": 0.00271281, + "balance_loss_clip": 1.17988169, + "balance_loss_mlp": 0.23642465, + "epoch": 0.5851796182173455, + "flos": 23587242065280.0, + "grad_norm": 16.10661296579124, + "language_loss": 0.60794353, + "learning_rate": 1.5492103769097075e-06, + "loss": 0.6252228, + "num_input_tokens_seen": 209806095, + "router_z_loss_clip": 2.76953125, + "router_z_loss_mlp": 0.34863281, + "step": 9733, + "time_per_iteration": 2.7366416454315186 + }, + { + "auxiliary_loss_clip": 0.01438223, + "auxiliary_loss_mlp": 0.00246059, + "balance_loss_clip": 1.17289209, + "balance_loss_mlp": 0.21170317, + "epoch": 0.5852397414700136, + "flos": 24822712310400.0, + "grad_norm": 4.256688825068405, + "language_loss": 0.97526205, + "learning_rate": 1.5488309455968739e-06, + "loss": 0.99210483, + "num_input_tokens_seen": 209823650, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.34326172, + "step": 9734, + "time_per_iteration": 2.7970311641693115 + }, + { + "auxiliary_loss_clip": 0.01398152, + "auxiliary_loss_mlp": 0.00228214, + "balance_loss_clip": 1.14515698, + "balance_loss_mlp": 0.19607508, + "epoch": 0.5852998647226815, + "flos": 19937784343680.0, + "grad_norm": 20.21326748910892, + "language_loss": 0.7736944, + "learning_rate": 1.5484515313924163e-06, + "loss": 0.78995812, + "num_input_tokens_seen": 209843220, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.32177734, + "step": 9735, + "time_per_iteration": 2.701411008834839 + }, + { + "auxiliary_loss_clip": 0.01403656, + "auxiliary_loss_mlp": 0.0024916, + "balance_loss_clip": 1.13879168, + "balance_loss_mlp": 0.21511409, + "epoch": 0.5853599879753495, + "flos": 16720367408640.0, + "grad_norm": 263.04645876018594, + "language_loss": 0.84873033, + "learning_rate": 1.5480721343107217e-06, + "loss": 0.86525857, + "num_input_tokens_seen": 209854880, + "router_z_loss_clip": 2.6484375, + "router_z_loss_mlp": 0.34057617, + "step": 9736, + "time_per_iteration": 2.6219470500946045 + }, + { + "auxiliary_loss_clip": 0.01396198, + "auxiliary_loss_mlp": 0.00272096, + "balance_loss_clip": 1.14207149, + "balance_loss_mlp": 0.23833615, + "epoch": 0.5854201112280174, + "flos": 44456583680640.0, + "grad_norm": 8.658951866056226, + "language_loss": 0.76728857, + "learning_rate": 1.5476927543661772e-06, + "loss": 0.78397155, + "num_input_tokens_seen": 209877870, + "router_z_loss_clip": 2.54296875, + "router_z_loss_mlp": 0.33764648, + "step": 9737, + "time_per_iteration": 4.343982458114624 + }, + { + "auxiliary_loss_clip": 0.01403464, + "auxiliary_loss_mlp": 0.00268338, + "balance_loss_clip": 1.14862061, + "balance_loss_mlp": 0.23851189, + "epoch": 0.5854802344806854, + "flos": 20339193807360.0, + "grad_norm": 11.480425629687103, + "language_loss": 0.88901794, + "learning_rate": 1.547313391573169e-06, + "loss": 0.90573591, + "num_input_tokens_seen": 209896690, + "router_z_loss_clip": 2.54882812, + "router_z_loss_mlp": 0.29833984, + "step": 9738, + "time_per_iteration": 2.667433738708496 + }, + { + "auxiliary_loss_clip": 0.01409477, + "auxiliary_loss_mlp": 0.00272712, + "balance_loss_clip": 1.1469202, + "balance_loss_mlp": 0.23892829, + "epoch": 0.5855403577333533, + "flos": 20921054221440.0, + "grad_norm": 353.676639939439, + "language_loss": 0.77037024, + "learning_rate": 1.546934045946082e-06, + "loss": 0.78719217, + "num_input_tokens_seen": 209914640, + "router_z_loss_clip": 2.62695312, + "router_z_loss_mlp": 0.33813477, + "step": 9739, + "time_per_iteration": 2.647698402404785 + }, + { + "auxiliary_loss_clip": 0.01405644, + "auxiliary_loss_mlp": 0.00272476, + "balance_loss_clip": 1.14459908, + "balance_loss_mlp": 0.23831083, + "epoch": 0.5856004809860214, + "flos": 20448649526400.0, + "grad_norm": 7.0115182218571475, + "language_loss": 0.69584978, + "learning_rate": 1.5465547174993017e-06, + "loss": 0.71263099, + "num_input_tokens_seen": 209933375, + "router_z_loss_clip": 2.61132812, + "router_z_loss_mlp": 0.34155273, + "step": 9740, + "time_per_iteration": 2.6579768657684326 + }, + { + "auxiliary_loss_clip": 0.01404789, + "auxiliary_loss_mlp": 0.00250649, + "balance_loss_clip": 1.14383245, + "balance_loss_mlp": 0.21693689, + "epoch": 0.5856606042386893, + "flos": 19640766781440.0, + "grad_norm": 4.894074308931436, + "language_loss": 0.83230978, + "learning_rate": 1.5461754062472113e-06, + "loss": 0.8488642, + "num_input_tokens_seen": 209952055, + "router_z_loss_clip": 2.61132812, + "router_z_loss_mlp": 0.3371582, + "step": 9741, + "time_per_iteration": 2.6509556770324707 + }, + { + "auxiliary_loss_clip": 0.01389649, + "auxiliary_loss_mlp": 0.00286042, + "balance_loss_clip": 1.13807428, + "balance_loss_mlp": 0.25280672, + "epoch": 0.5857207274913573, + "flos": 21686166846720.0, + "grad_norm": 1.9740762054580359, + "language_loss": 0.83016562, + "learning_rate": 1.5457961122041959e-06, + "loss": 0.84692252, + "num_input_tokens_seen": 209971190, + "router_z_loss_clip": 2.515625, + "router_z_loss_mlp": 0.33251953, + "step": 9742, + "time_per_iteration": 2.741161346435547 + }, + { + "auxiliary_loss_clip": 0.01374179, + "auxiliary_loss_mlp": 0.00236661, + "balance_loss_clip": 1.12408078, + "balance_loss_mlp": 0.20461723, + "epoch": 0.5857808507440253, + "flos": 23182708118400.0, + "grad_norm": 2.1491418736720416, + "language_loss": 0.81471747, + "learning_rate": 1.5454168353846369e-06, + "loss": 0.83082592, + "num_input_tokens_seen": 209990695, + "router_z_loss_clip": 2.5, + "router_z_loss_mlp": 0.32019043, + "step": 9743, + "time_per_iteration": 2.6612117290496826 + }, + { + "auxiliary_loss_clip": 0.01371068, + "auxiliary_loss_mlp": 0.00246096, + "balance_loss_clip": 1.12545824, + "balance_loss_mlp": 0.2168417, + "epoch": 0.5858409739966932, + "flos": 27235299156480.0, + "grad_norm": 6.927540881540055, + "language_loss": 0.87426388, + "learning_rate": 1.5450375758029172e-06, + "loss": 0.89043558, + "num_input_tokens_seen": 210010210, + "router_z_loss_clip": 2.453125, + "router_z_loss_mlp": 0.29272461, + "step": 9744, + "time_per_iteration": 2.694342851638794 + }, + { + "auxiliary_loss_clip": 0.01402835, + "auxiliary_loss_mlp": 0.00273193, + "balance_loss_clip": 1.14288211, + "balance_loss_mlp": 0.24272308, + "epoch": 0.5859010972493612, + "flos": 27855512317440.0, + "grad_norm": 213.52783245296405, + "language_loss": 0.7824595, + "learning_rate": 1.5446583334734183e-06, + "loss": 0.79921979, + "num_input_tokens_seen": 210030030, + "router_z_loss_clip": 2.59960938, + "router_z_loss_mlp": 0.30517578, + "step": 9745, + "time_per_iteration": 2.7748022079467773 + }, + { + "auxiliary_loss_clip": 0.0138212, + "auxiliary_loss_mlp": 0.00096948, + "balance_loss_clip": 1.18206143, + "balance_loss_mlp": 0.08664801, + "epoch": 0.5859612205020291, + "flos": 70007064428160.0, + "grad_norm": 0.7090240446342765, + "language_loss": 0.52542782, + "learning_rate": 1.5442791084105204e-06, + "loss": 0.54021847, + "num_input_tokens_seen": 210094840, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.10302734, + "step": 9746, + "time_per_iteration": 3.291210651397705 + }, + { + "auxiliary_loss_clip": 0.01402896, + "auxiliary_loss_mlp": 0.0025908, + "balance_loss_clip": 1.14089251, + "balance_loss_mlp": 0.22677404, + "epoch": 0.5860213437546972, + "flos": 24056019486720.0, + "grad_norm": 19.033436102104876, + "language_loss": 0.81006104, + "learning_rate": 1.5438999006286054e-06, + "loss": 0.82668078, + "num_input_tokens_seen": 210114660, + "router_z_loss_clip": 2.62304688, + "router_z_loss_mlp": 0.32287598, + "step": 9747, + "time_per_iteration": 2.712878465652466 + }, + { + "auxiliary_loss_clip": 0.01381367, + "auxiliary_loss_mlp": 0.00271002, + "balance_loss_clip": 1.12615943, + "balance_loss_mlp": 0.24038896, + "epoch": 0.5860814670073651, + "flos": 18947583141120.0, + "grad_norm": 6.110909032507372, + "language_loss": 0.8869549, + "learning_rate": 1.543520710142051e-06, + "loss": 0.90347856, + "num_input_tokens_seen": 210132770, + "router_z_loss_clip": 2.54882812, + "router_z_loss_mlp": 0.30615234, + "step": 9748, + "time_per_iteration": 2.657909393310547 + }, + { + "auxiliary_loss_clip": 0.01385287, + "auxiliary_loss_mlp": 0.00270736, + "balance_loss_clip": 1.12250757, + "balance_loss_mlp": 0.23916942, + "epoch": 0.5861415902600331, + "flos": 22561848512640.0, + "grad_norm": 167.6249827034542, + "language_loss": 0.7958765, + "learning_rate": 1.5431415369652375e-06, + "loss": 0.8124367, + "num_input_tokens_seen": 210151895, + "router_z_loss_clip": 2.63085938, + "router_z_loss_mlp": 0.31542969, + "step": 9749, + "time_per_iteration": 2.7086052894592285 + }, + { + "auxiliary_loss_clip": 0.01388977, + "auxiliary_loss_mlp": 0.00255629, + "balance_loss_clip": 1.13541222, + "balance_loss_mlp": 0.22418121, + "epoch": 0.586201713512701, + "flos": 14392027912320.0, + "grad_norm": 7.373261862963655, + "language_loss": 0.83425915, + "learning_rate": 1.5427623811125428e-06, + "loss": 0.85070515, + "num_input_tokens_seen": 210168040, + "router_z_loss_clip": 2.53515625, + "router_z_loss_mlp": 0.31433105, + "step": 9750, + "time_per_iteration": 2.681375026702881 + }, + { + "auxiliary_loss_clip": 0.01387019, + "auxiliary_loss_mlp": 0.00258892, + "balance_loss_clip": 1.13064337, + "balance_loss_mlp": 0.22732499, + "epoch": 0.586261836765369, + "flos": 19498560837120.0, + "grad_norm": 4.612484450684145, + "language_loss": 0.78279686, + "learning_rate": 1.542383242598344e-06, + "loss": 0.79925591, + "num_input_tokens_seen": 210187720, + "router_z_loss_clip": 2.56640625, + "router_z_loss_mlp": 0.31567383, + "step": 9751, + "time_per_iteration": 2.6950974464416504 + }, + { + "auxiliary_loss_clip": 0.01434978, + "auxiliary_loss_mlp": 0.00263789, + "balance_loss_clip": 1.15478396, + "balance_loss_mlp": 0.23253272, + "epoch": 0.5863219600180369, + "flos": 20701819560960.0, + "grad_norm": 39521.59381134093, + "language_loss": 0.82842034, + "learning_rate": 1.5420041214370184e-06, + "loss": 0.84540802, + "num_input_tokens_seen": 210206080, + "router_z_loss_clip": 2.80078125, + "router_z_loss_mlp": 0.31298828, + "step": 9752, + "time_per_iteration": 2.64135479927063 + }, + { + "auxiliary_loss_clip": 0.01397185, + "auxiliary_loss_mlp": 0.00275451, + "balance_loss_clip": 1.13960862, + "balance_loss_mlp": 0.24393211, + "epoch": 0.586382083270705, + "flos": 19792130693760.0, + "grad_norm": 3.0500934933679598, + "language_loss": 0.8412025, + "learning_rate": 1.541625017642943e-06, + "loss": 0.85792887, + "num_input_tokens_seen": 210225660, + "router_z_loss_clip": 2.57421875, + "router_z_loss_mlp": 0.31518555, + "step": 9753, + "time_per_iteration": 2.675851583480835 + }, + { + "auxiliary_loss_clip": 0.01377004, + "auxiliary_loss_mlp": 0.00262406, + "balance_loss_clip": 1.12910891, + "balance_loss_mlp": 0.23182911, + "epoch": 0.5864422065233729, + "flos": 16500558130560.0, + "grad_norm": 7.1193863385960565, + "language_loss": 0.77977335, + "learning_rate": 1.5412459312304927e-06, + "loss": 0.79616749, + "num_input_tokens_seen": 210242725, + "router_z_loss_clip": 2.47851562, + "router_z_loss_mlp": 0.30578613, + "step": 9754, + "time_per_iteration": 2.70888352394104 + }, + { + "auxiliary_loss_clip": 0.01423825, + "auxiliary_loss_mlp": 0.00277787, + "balance_loss_clip": 1.15616846, + "balance_loss_mlp": 0.24126135, + "epoch": 0.5865023297760409, + "flos": 20413277608320.0, + "grad_norm": 6.877722607870088, + "language_loss": 0.81309968, + "learning_rate": 1.540866862214043e-06, + "loss": 0.8301158, + "num_input_tokens_seen": 210263225, + "router_z_loss_clip": 2.67382812, + "router_z_loss_mlp": 0.36572266, + "step": 9755, + "time_per_iteration": 2.6520674228668213 + }, + { + "auxiliary_loss_clip": 0.01404003, + "auxiliary_loss_mlp": 0.00115846, + "balance_loss_clip": 1.20017838, + "balance_loss_mlp": 0.10382976, + "epoch": 0.5865624530287089, + "flos": 63350769254400.0, + "grad_norm": 0.7180718370044487, + "language_loss": 0.56504208, + "learning_rate": 1.540487810607967e-06, + "loss": 0.58024061, + "num_input_tokens_seen": 210322310, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.12011719, + "step": 9756, + "time_per_iteration": 3.180410146713257 + }, + { + "auxiliary_loss_clip": 0.01382912, + "auxiliary_loss_mlp": 0.00256846, + "balance_loss_clip": 1.12987494, + "balance_loss_mlp": 0.22598237, + "epoch": 0.5866225762813768, + "flos": 27016279977600.0, + "grad_norm": 8.211426860153994, + "language_loss": 0.81716919, + "learning_rate": 1.5401087764266396e-06, + "loss": 0.83356667, + "num_input_tokens_seen": 210340845, + "router_z_loss_clip": 2.53320312, + "router_z_loss_mlp": 0.30847168, + "step": 9757, + "time_per_iteration": 2.730315685272217 + }, + { + "auxiliary_loss_clip": 0.0140941, + "auxiliary_loss_mlp": 0.00100944, + "balance_loss_clip": 1.20291817, + "balance_loss_mlp": 0.08854628, + "epoch": 0.5866826995340448, + "flos": 72987038507520.0, + "grad_norm": 0.8437965040166641, + "language_loss": 0.59838903, + "learning_rate": 1.5397297596844337e-06, + "loss": 0.61349261, + "num_input_tokens_seen": 210397815, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.12402344, + "step": 9758, + "time_per_iteration": 3.16658878326416 + }, + { + "auxiliary_loss_clip": 0.01411879, + "auxiliary_loss_mlp": 0.00288087, + "balance_loss_clip": 1.14421976, + "balance_loss_mlp": 0.25551942, + "epoch": 0.5867428227867127, + "flos": 21285727050240.0, + "grad_norm": 8.396197363694402, + "language_loss": 0.82209337, + "learning_rate": 1.5393507603957212e-06, + "loss": 0.83909297, + "num_input_tokens_seen": 210413900, + "router_z_loss_clip": 2.67773438, + "router_z_loss_mlp": 0.32568359, + "step": 9759, + "time_per_iteration": 2.6666951179504395 + }, + { + "auxiliary_loss_clip": 0.01435044, + "auxiliary_loss_mlp": 0.0026111, + "balance_loss_clip": 1.16591239, + "balance_loss_mlp": 0.23023435, + "epoch": 0.5868029460393808, + "flos": 33468852188160.0, + "grad_norm": 21.177374520220287, + "language_loss": 0.79840982, + "learning_rate": 1.5389717785748742e-06, + "loss": 0.81537139, + "num_input_tokens_seen": 210434110, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.30883789, + "step": 9760, + "time_per_iteration": 2.792797327041626 + }, + { + "auxiliary_loss_clip": 0.01409736, + "auxiliary_loss_mlp": 0.0028046, + "balance_loss_clip": 1.14359736, + "balance_loss_mlp": 0.247797, + "epoch": 0.5868630692920487, + "flos": 17889475276800.0, + "grad_norm": 528.1152137543207, + "language_loss": 0.81315356, + "learning_rate": 1.5385928142362637e-06, + "loss": 0.83005553, + "num_input_tokens_seen": 210451685, + "router_z_loss_clip": 2.66015625, + "router_z_loss_mlp": 0.3269043, + "step": 9761, + "time_per_iteration": 2.639301300048828 + }, + { + "auxiliary_loss_clip": 0.01412976, + "auxiliary_loss_mlp": 0.00276266, + "balance_loss_clip": 1.14202034, + "balance_loss_mlp": 0.24097958, + "epoch": 0.5869231925447167, + "flos": 21035035054080.0, + "grad_norm": 6.872173161679099, + "language_loss": 0.83449221, + "learning_rate": 1.5382138673942597e-06, + "loss": 0.85138464, + "num_input_tokens_seen": 210470825, + "router_z_loss_clip": 2.7109375, + "router_z_loss_mlp": 0.35327148, + "step": 9762, + "time_per_iteration": 2.698421001434326 + }, + { + "auxiliary_loss_clip": 0.01409093, + "auxiliary_loss_mlp": 0.00289978, + "balance_loss_clip": 1.15069401, + "balance_loss_mlp": 0.25750518, + "epoch": 0.5869833157973846, + "flos": 74738219293440.0, + "grad_norm": 12.864468539220121, + "language_loss": 0.78442091, + "learning_rate": 1.5378349380632317e-06, + "loss": 0.80141157, + "num_input_tokens_seen": 210500075, + "router_z_loss_clip": 2.5859375, + "router_z_loss_mlp": 0.32446289, + "step": 9763, + "time_per_iteration": 3.0882415771484375 + }, + { + "auxiliary_loss_clip": 0.01382075, + "auxiliary_loss_mlp": 0.00270897, + "balance_loss_clip": 1.12883854, + "balance_loss_mlp": 0.24107063, + "epoch": 0.5870434390500526, + "flos": 17638998762240.0, + "grad_norm": 6.7168529106438, + "language_loss": 0.87265182, + "learning_rate": 1.53745602625755e-06, + "loss": 0.88918155, + "num_input_tokens_seen": 210518150, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.2980957, + "step": 9764, + "time_per_iteration": 2.722729444503784 + }, + { + "auxiliary_loss_clip": 0.01376821, + "auxiliary_loss_mlp": 0.00277097, + "balance_loss_clip": 1.12352586, + "balance_loss_mlp": 0.24607839, + "epoch": 0.5871035623027205, + "flos": 21506146859520.0, + "grad_norm": 4.4079564612935185, + "language_loss": 0.84810591, + "learning_rate": 1.5370771319915819e-06, + "loss": 0.864645, + "num_input_tokens_seen": 210537760, + "router_z_loss_clip": 2.53320312, + "router_z_loss_mlp": 0.31018066, + "step": 9765, + "time_per_iteration": 2.7128353118896484 + }, + { + "auxiliary_loss_clip": 0.01371248, + "auxiliary_loss_mlp": 0.00257546, + "balance_loss_clip": 1.12267208, + "balance_loss_mlp": 0.22590792, + "epoch": 0.5871636855553886, + "flos": 13551861818880.0, + "grad_norm": 44.09306991308684, + "language_loss": 0.90336722, + "learning_rate": 1.5366982552796947e-06, + "loss": 0.9196552, + "num_input_tokens_seen": 210555515, + "router_z_loss_clip": 2.49023438, + "router_z_loss_mlp": 0.31640625, + "step": 9766, + "time_per_iteration": 2.6595351696014404 + }, + { + "auxiliary_loss_clip": 0.01388311, + "auxiliary_loss_mlp": 0.00291336, + "balance_loss_clip": 1.12799716, + "balance_loss_mlp": 0.25814807, + "epoch": 0.5872238088080565, + "flos": 26212922346240.0, + "grad_norm": 16.6091255532613, + "language_loss": 0.75443053, + "learning_rate": 1.536319396136257e-06, + "loss": 0.771227, + "num_input_tokens_seen": 210575000, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.33178711, + "step": 9767, + "time_per_iteration": 2.739656686782837 + }, + { + "auxiliary_loss_clip": 0.01391968, + "auxiliary_loss_mlp": 0.00272534, + "balance_loss_clip": 1.13189149, + "balance_loss_mlp": 0.2410869, + "epoch": 0.5872839320607245, + "flos": 30665198995200.0, + "grad_norm": 15.326720532632905, + "language_loss": 0.72307867, + "learning_rate": 1.5359405545756336e-06, + "loss": 0.73972368, + "num_input_tokens_seen": 210595185, + "router_z_loss_clip": 2.59960938, + "router_z_loss_mlp": 0.31469727, + "step": 9768, + "time_per_iteration": 2.750864028930664 + }, + { + "auxiliary_loss_clip": 0.01419331, + "auxiliary_loss_mlp": 0.00140451, + "balance_loss_clip": 1.21246338, + "balance_loss_mlp": 0.13019854, + "epoch": 0.5873440553133924, + "flos": 60303570871680.0, + "grad_norm": 0.6958852415824365, + "language_loss": 0.53146732, + "learning_rate": 1.5355617306121914e-06, + "loss": 0.54706514, + "num_input_tokens_seen": 210653210, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.10253906, + "step": 9769, + "time_per_iteration": 4.611809968948364 + }, + { + "auxiliary_loss_clip": 0.01396917, + "auxiliary_loss_mlp": 0.00260354, + "balance_loss_clip": 1.13399017, + "balance_loss_mlp": 0.22845402, + "epoch": 0.5874041785660604, + "flos": 21539292134400.0, + "grad_norm": 3.4231113035819725, + "language_loss": 0.76509166, + "learning_rate": 1.5351829242602945e-06, + "loss": 0.78166437, + "num_input_tokens_seen": 210673750, + "router_z_loss_clip": 2.63085938, + "router_z_loss_mlp": 0.3190918, + "step": 9770, + "time_per_iteration": 2.709575653076172 + }, + { + "auxiliary_loss_clip": 0.01383174, + "auxiliary_loss_mlp": 0.00288948, + "balance_loss_clip": 1.12761188, + "balance_loss_mlp": 0.25806069, + "epoch": 0.5874643018187284, + "flos": 24388947671040.0, + "grad_norm": 60.34532631676938, + "language_loss": 0.78054363, + "learning_rate": 1.5348041355343077e-06, + "loss": 0.79726481, + "num_input_tokens_seen": 210692960, + "router_z_loss_clip": 2.55664062, + "router_z_loss_mlp": 0.30871582, + "step": 9771, + "time_per_iteration": 4.273981809616089 + }, + { + "auxiliary_loss_clip": 0.01421269, + "auxiliary_loss_mlp": 0.00294272, + "balance_loss_clip": 1.15204406, + "balance_loss_mlp": 0.25748366, + "epoch": 0.5875244250713964, + "flos": 28147717457280.0, + "grad_norm": 44.4117845814526, + "language_loss": 0.74955183, + "learning_rate": 1.5344253644485954e-06, + "loss": 0.76670718, + "num_input_tokens_seen": 210714040, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.36791992, + "step": 9772, + "time_per_iteration": 2.703845977783203 + }, + { + "auxiliary_loss_clip": 0.01446436, + "auxiliary_loss_mlp": 0.00284877, + "balance_loss_clip": 1.17081714, + "balance_loss_mlp": 0.24835157, + "epoch": 0.5875845483240644, + "flos": 25812410722560.0, + "grad_norm": 9.655934174215663, + "language_loss": 0.83491206, + "learning_rate": 1.534046611017519e-06, + "loss": 0.85222518, + "num_input_tokens_seen": 210733710, + "router_z_loss_clip": 2.7578125, + "router_z_loss_mlp": 0.36547852, + "step": 9773, + "time_per_iteration": 2.7180943489074707 + }, + { + "auxiliary_loss_clip": 0.01440627, + "auxiliary_loss_mlp": 0.00270122, + "balance_loss_clip": 1.16497207, + "balance_loss_mlp": 0.23671994, + "epoch": 0.5876446715767323, + "flos": 26906572863360.0, + "grad_norm": 25.0679007741106, + "language_loss": 0.64811862, + "learning_rate": 1.5336678752554421e-06, + "loss": 0.6652261, + "num_input_tokens_seen": 210753580, + "router_z_loss_clip": 2.75390625, + "router_z_loss_mlp": 0.33398438, + "step": 9774, + "time_per_iteration": 4.10964560508728 + }, + { + "auxiliary_loss_clip": 0.01427589, + "auxiliary_loss_mlp": 0.0028268, + "balance_loss_clip": 1.15960109, + "balance_loss_mlp": 0.25042203, + "epoch": 0.5877047948294003, + "flos": 36684832579200.0, + "grad_norm": 2.8932602155855176, + "language_loss": 0.75207663, + "learning_rate": 1.5332891571767264e-06, + "loss": 0.76917934, + "num_input_tokens_seen": 210773495, + "router_z_loss_clip": 2.68554688, + "router_z_loss_mlp": 0.32226562, + "step": 9775, + "time_per_iteration": 2.807006597518921 + }, + { + "auxiliary_loss_clip": 0.01384423, + "auxiliary_loss_mlp": 0.00280903, + "balance_loss_clip": 1.12716687, + "balance_loss_mlp": 0.25079089, + "epoch": 0.5877649180820682, + "flos": 26724721282560.0, + "grad_norm": 6.831876699157589, + "language_loss": 0.81679368, + "learning_rate": 1.5329104567957326e-06, + "loss": 0.83344692, + "num_input_tokens_seen": 210793645, + "router_z_loss_clip": 2.56835938, + "router_z_loss_mlp": 0.30102539, + "step": 9776, + "time_per_iteration": 2.714385986328125 + }, + { + "auxiliary_loss_clip": 0.01433439, + "auxiliary_loss_mlp": 0.0027281, + "balance_loss_clip": 1.16240144, + "balance_loss_mlp": 0.2406707, + "epoch": 0.5878250413347362, + "flos": 21032197879680.0, + "grad_norm": 5.6359733475854865, + "language_loss": 0.82073557, + "learning_rate": 1.532531774126821e-06, + "loss": 0.837798, + "num_input_tokens_seen": 210813415, + "router_z_loss_clip": 2.70898438, + "router_z_loss_mlp": 0.32128906, + "step": 9777, + "time_per_iteration": 2.6555569171905518 + }, + { + "auxiliary_loss_clip": 0.01408469, + "auxiliary_loss_mlp": 0.00251401, + "balance_loss_clip": 1.14739919, + "balance_loss_mlp": 0.22252849, + "epoch": 0.5878851645874041, + "flos": 25484259047040.0, + "grad_norm": 7.736189327975747, + "language_loss": 0.80513859, + "learning_rate": 1.5321531091843512e-06, + "loss": 0.82173723, + "num_input_tokens_seen": 210833850, + "router_z_loss_clip": 2.61132812, + "router_z_loss_mlp": 0.28857422, + "step": 9778, + "time_per_iteration": 2.695220947265625 + }, + { + "auxiliary_loss_clip": 0.01385315, + "auxiliary_loss_mlp": 0.00290675, + "balance_loss_clip": 1.12798846, + "balance_loss_mlp": 0.26113516, + "epoch": 0.5879452878400722, + "flos": 23769129559680.0, + "grad_norm": 38.82922060148065, + "language_loss": 0.76430774, + "learning_rate": 1.5317744619826824e-06, + "loss": 0.78106761, + "num_input_tokens_seen": 210853115, + "router_z_loss_clip": 2.57617188, + "router_z_loss_mlp": 0.2956543, + "step": 9779, + "time_per_iteration": 4.1602606773376465 + }, + { + "auxiliary_loss_clip": 0.01417331, + "auxiliary_loss_mlp": 0.00285424, + "balance_loss_clip": 1.14727807, + "balance_loss_mlp": 0.25261739, + "epoch": 0.5880054110927401, + "flos": 17824513530240.0, + "grad_norm": 107.39709557361877, + "language_loss": 0.73889643, + "learning_rate": 1.5313958325361727e-06, + "loss": 0.75592399, + "num_input_tokens_seen": 210872090, + "router_z_loss_clip": 2.70117188, + "router_z_loss_mlp": 0.32836914, + "step": 9780, + "time_per_iteration": 2.6782987117767334 + }, + { + "auxiliary_loss_clip": 0.01402668, + "auxiliary_loss_mlp": 0.00268353, + "balance_loss_clip": 1.14326882, + "balance_loss_mlp": 0.23535591, + "epoch": 0.5880655343454081, + "flos": 19463404400640.0, + "grad_norm": 3.284712928551496, + "language_loss": 0.81150973, + "learning_rate": 1.5310172208591807e-06, + "loss": 0.82821995, + "num_input_tokens_seen": 210888490, + "router_z_loss_clip": 2.59375, + "router_z_loss_mlp": 0.32995605, + "step": 9781, + "time_per_iteration": 2.652977228164673 + }, + { + "auxiliary_loss_clip": 0.01410003, + "auxiliary_loss_mlp": 0.00295143, + "balance_loss_clip": 1.15040028, + "balance_loss_mlp": 0.26441085, + "epoch": 0.588125657598076, + "flos": 21397588980480.0, + "grad_norm": 7.678013106685158, + "language_loss": 0.74757463, + "learning_rate": 1.5306386269660622e-06, + "loss": 0.76462609, + "num_input_tokens_seen": 210908220, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.30737305, + "step": 9782, + "time_per_iteration": 2.662749767303467 + }, + { + "auxiliary_loss_clip": 0.01422666, + "auxiliary_loss_mlp": 0.00269632, + "balance_loss_clip": 1.15011299, + "balance_loss_mlp": 0.23788667, + "epoch": 0.588185780850744, + "flos": 16034653797120.0, + "grad_norm": 25.22327350441506, + "language_loss": 0.79996306, + "learning_rate": 1.5302600508711741e-06, + "loss": 0.81688601, + "num_input_tokens_seen": 210923945, + "router_z_loss_clip": 2.72851562, + "router_z_loss_mlp": 0.31774902, + "step": 9783, + "time_per_iteration": 2.655618906021118 + }, + { + "auxiliary_loss_clip": 0.01424126, + "auxiliary_loss_mlp": 0.00261337, + "balance_loss_clip": 1.15142429, + "balance_loss_mlp": 0.22831635, + "epoch": 0.588245904103412, + "flos": 23728226947200.0, + "grad_norm": 14.169663181625024, + "language_loss": 0.77080458, + "learning_rate": 1.5298814925888719e-06, + "loss": 0.78765917, + "num_input_tokens_seen": 210941955, + "router_z_loss_clip": 2.7265625, + "router_z_loss_mlp": 0.33056641, + "step": 9784, + "time_per_iteration": 2.666461229324341 + }, + { + "auxiliary_loss_clip": 0.01422437, + "auxiliary_loss_mlp": 0.00271735, + "balance_loss_clip": 1.14948249, + "balance_loss_mlp": 0.23945293, + "epoch": 0.58830602735608, + "flos": 33802534558080.0, + "grad_norm": 53.04076713587355, + "language_loss": 0.77390784, + "learning_rate": 1.5295029521335102e-06, + "loss": 0.79084957, + "num_input_tokens_seen": 210963105, + "router_z_loss_clip": 2.72851562, + "router_z_loss_mlp": 0.32299805, + "step": 9785, + "time_per_iteration": 2.793402671813965 + }, + { + "auxiliary_loss_clip": 0.01405772, + "auxiliary_loss_mlp": 0.00284303, + "balance_loss_clip": 1.1417191, + "balance_loss_mlp": 0.25371432, + "epoch": 0.588366150608748, + "flos": 17090714586240.0, + "grad_norm": 5.545147692579656, + "language_loss": 0.85675335, + "learning_rate": 1.5291244295194448e-06, + "loss": 0.87365413, + "num_input_tokens_seen": 210978720, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.30566406, + "step": 9786, + "time_per_iteration": 2.6206092834472656 + }, + { + "auxiliary_loss_clip": 0.01406214, + "auxiliary_loss_mlp": 0.00276835, + "balance_loss_clip": 1.13977599, + "balance_loss_mlp": 0.24464846, + "epoch": 0.5884262738614159, + "flos": 22127186033280.0, + "grad_norm": 39.204909617521096, + "language_loss": 0.87498975, + "learning_rate": 1.5287459247610276e-06, + "loss": 0.89182031, + "num_input_tokens_seen": 210998750, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.32202148, + "step": 9787, + "time_per_iteration": 2.6749095916748047 + }, + { + "auxiliary_loss_clip": 0.01416978, + "auxiliary_loss_mlp": 0.00280346, + "balance_loss_clip": 1.14495254, + "balance_loss_mlp": 0.24935202, + "epoch": 0.5884863971140839, + "flos": 21031838743680.0, + "grad_norm": 21.770787969223154, + "language_loss": 0.73132122, + "learning_rate": 1.5283674378726116e-06, + "loss": 0.74829441, + "num_input_tokens_seen": 211017550, + "router_z_loss_clip": 2.71875, + "router_z_loss_mlp": 0.31005859, + "step": 9788, + "time_per_iteration": 2.661489963531494 + }, + { + "auxiliary_loss_clip": 0.01420435, + "auxiliary_loss_mlp": 0.00273089, + "balance_loss_clip": 1.15455675, + "balance_loss_mlp": 0.24252416, + "epoch": 0.5885465203667518, + "flos": 23805112008960.0, + "grad_norm": 2.750449351537852, + "language_loss": 0.86540866, + "learning_rate": 1.5279889688685506e-06, + "loss": 0.88234389, + "num_input_tokens_seen": 211034135, + "router_z_loss_clip": 2.65625, + "router_z_loss_mlp": 0.3059082, + "step": 9789, + "time_per_iteration": 2.713655710220337 + }, + { + "auxiliary_loss_clip": 0.0140241, + "auxiliary_loss_mlp": 0.00278492, + "balance_loss_clip": 1.14532578, + "balance_loss_mlp": 0.24823651, + "epoch": 0.5886066436194198, + "flos": 18880574319360.0, + "grad_norm": 53.51497680028886, + "language_loss": 0.76628613, + "learning_rate": 1.5276105177631944e-06, + "loss": 0.78309518, + "num_input_tokens_seen": 211053850, + "router_z_loss_clip": 2.5703125, + "router_z_loss_mlp": 0.30224609, + "step": 9790, + "time_per_iteration": 2.6472530364990234 + }, + { + "auxiliary_loss_clip": 0.01417458, + "auxiliary_loss_mlp": 0.00298147, + "balance_loss_clip": 1.15363562, + "balance_loss_mlp": 0.26686651, + "epoch": 0.5886667668720877, + "flos": 24790141653120.0, + "grad_norm": 101.77230041622946, + "language_loss": 0.89600122, + "learning_rate": 1.527232084570895e-06, + "loss": 0.91315734, + "num_input_tokens_seen": 211072165, + "router_z_loss_clip": 2.63671875, + "router_z_loss_mlp": 0.31298828, + "step": 9791, + "time_per_iteration": 2.7023463249206543 + }, + { + "auxiliary_loss_clip": 0.01433553, + "auxiliary_loss_mlp": 0.00295116, + "balance_loss_clip": 1.16339087, + "balance_loss_mlp": 0.26176125, + "epoch": 0.5887268901247558, + "flos": 21614381516160.0, + "grad_norm": 4.837009662505611, + "language_loss": 0.832021, + "learning_rate": 1.5268536693060026e-06, + "loss": 0.84930766, + "num_input_tokens_seen": 211089630, + "router_z_loss_clip": 2.70117188, + "router_z_loss_mlp": 0.33349609, + "step": 9792, + "time_per_iteration": 2.7167999744415283 + }, + { + "auxiliary_loss_clip": 0.01420617, + "auxiliary_loss_mlp": 0.00313429, + "balance_loss_clip": 1.14833128, + "balance_loss_mlp": 0.27752304, + "epoch": 0.5887870133774237, + "flos": 20481722974080.0, + "grad_norm": 8.189055528430718, + "language_loss": 0.76227617, + "learning_rate": 1.5264752719828662e-06, + "loss": 0.77961665, + "num_input_tokens_seen": 211106120, + "router_z_loss_clip": 2.72265625, + "router_z_loss_mlp": 0.35888672, + "step": 9793, + "time_per_iteration": 2.664824962615967 + }, + { + "auxiliary_loss_clip": 0.01425618, + "auxiliary_loss_mlp": 0.00288728, + "balance_loss_clip": 1.15852368, + "balance_loss_mlp": 0.25511089, + "epoch": 0.5888471366300917, + "flos": 19206283870080.0, + "grad_norm": 5.994595209458041, + "language_loss": 0.66458869, + "learning_rate": 1.5260968926158353e-06, + "loss": 0.68173218, + "num_input_tokens_seen": 211122450, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.33642578, + "step": 9794, + "time_per_iteration": 2.628608465194702 + }, + { + "auxiliary_loss_clip": 0.01417619, + "auxiliary_loss_mlp": 0.00274322, + "balance_loss_clip": 1.15152466, + "balance_loss_mlp": 0.24041876, + "epoch": 0.5889072598827596, + "flos": 19972904866560.0, + "grad_norm": 35.84415310977272, + "language_loss": 0.70662928, + "learning_rate": 1.525718531219257e-06, + "loss": 0.72354871, + "num_input_tokens_seen": 211141765, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.33935547, + "step": 9795, + "time_per_iteration": 2.688871145248413 + }, + { + "auxiliary_loss_clip": 0.01447281, + "auxiliary_loss_mlp": 0.00266647, + "balance_loss_clip": 1.18072653, + "balance_loss_mlp": 0.23450767, + "epoch": 0.5889673831354276, + "flos": 20741249715840.0, + "grad_norm": 5.758697575364143, + "language_loss": 0.79649484, + "learning_rate": 1.5253401878074801e-06, + "loss": 0.8136341, + "num_input_tokens_seen": 211160475, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.3215332, + "step": 9796, + "time_per_iteration": 2.7470226287841797 + }, + { + "auxiliary_loss_clip": 0.01441847, + "auxiliary_loss_mlp": 0.00271447, + "balance_loss_clip": 1.17082441, + "balance_loss_mlp": 0.23914167, + "epoch": 0.5890275063880956, + "flos": 25300935008640.0, + "grad_norm": 5.4002259341556975, + "language_loss": 0.88098389, + "learning_rate": 1.5249618623948507e-06, + "loss": 0.89811683, + "num_input_tokens_seen": 211180480, + "router_z_loss_clip": 2.7109375, + "router_z_loss_mlp": 0.32324219, + "step": 9797, + "time_per_iteration": 2.6847426891326904 + }, + { + "auxiliary_loss_clip": 0.01447951, + "auxiliary_loss_mlp": 0.00278393, + "balance_loss_clip": 1.17349219, + "balance_loss_mlp": 0.24446613, + "epoch": 0.5890876296407636, + "flos": 11765377964160.0, + "grad_norm": 13.75189377783656, + "language_loss": 0.8662802, + "learning_rate": 1.5245835549957152e-06, + "loss": 0.88354367, + "num_input_tokens_seen": 211198000, + "router_z_loss_clip": 2.74609375, + "router_z_loss_mlp": 0.33935547, + "step": 9798, + "time_per_iteration": 2.629483461380005 + }, + { + "auxiliary_loss_clip": 0.01435092, + "auxiliary_loss_mlp": 0.00279247, + "balance_loss_clip": 1.17433548, + "balance_loss_mlp": 0.24727558, + "epoch": 0.5891477528934316, + "flos": 13589460380160.0, + "grad_norm": 11.795779008892957, + "language_loss": 0.83138669, + "learning_rate": 1.5242052656244186e-06, + "loss": 0.84853011, + "num_input_tokens_seen": 211214765, + "router_z_loss_clip": 2.60742188, + "router_z_loss_mlp": 0.31982422, + "step": 9799, + "time_per_iteration": 2.609060049057007 + }, + { + "auxiliary_loss_clip": 0.0144122, + "auxiliary_loss_mlp": 0.00297854, + "balance_loss_clip": 1.17073905, + "balance_loss_mlp": 0.26211524, + "epoch": 0.5892078761460995, + "flos": 15049193189760.0, + "grad_norm": 19.96402258493636, + "language_loss": 0.85847509, + "learning_rate": 1.5238269942953064e-06, + "loss": 0.87586582, + "num_input_tokens_seen": 211232335, + "router_z_loss_clip": 2.70117188, + "router_z_loss_mlp": 0.35742188, + "step": 9800, + "time_per_iteration": 2.618119716644287 + }, + { + "auxiliary_loss_clip": 0.01447281, + "auxiliary_loss_mlp": 0.00290441, + "balance_loss_clip": 1.18082881, + "balance_loss_mlp": 0.25618017, + "epoch": 0.5892679993987675, + "flos": 15778215624960.0, + "grad_norm": 9.804624736491053, + "language_loss": 0.87965328, + "learning_rate": 1.523448741022722e-06, + "loss": 0.89703047, + "num_input_tokens_seen": 211249985, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.3425293, + "step": 9801, + "time_per_iteration": 2.607206106185913 + }, + { + "auxiliary_loss_clip": 0.01443605, + "auxiliary_loss_mlp": 0.00270126, + "balance_loss_clip": 1.1759851, + "balance_loss_mlp": 0.23729585, + "epoch": 0.5893281226514354, + "flos": 25265203954560.0, + "grad_norm": 16.152813378129984, + "language_loss": 0.73114479, + "learning_rate": 1.5230705058210088e-06, + "loss": 0.74828213, + "num_input_tokens_seen": 211268425, + "router_z_loss_clip": 2.67578125, + "router_z_loss_mlp": 0.32861328, + "step": 9802, + "time_per_iteration": 2.7237229347229004 + }, + { + "auxiliary_loss_clip": 0.01452632, + "auxiliary_loss_mlp": 0.00268273, + "balance_loss_clip": 1.18295455, + "balance_loss_mlp": 0.23451261, + "epoch": 0.5893882459041034, + "flos": 19458232842240.0, + "grad_norm": 9.922739668701103, + "language_loss": 0.82699907, + "learning_rate": 1.5226922887045108e-06, + "loss": 0.84420812, + "num_input_tokens_seen": 211286680, + "router_z_loss_clip": 2.6953125, + "router_z_loss_mlp": 0.33764648, + "step": 9803, + "time_per_iteration": 2.673612594604492 + }, + { + "auxiliary_loss_clip": 0.01450036, + "auxiliary_loss_mlp": 0.00281425, + "balance_loss_clip": 1.1746794, + "balance_loss_mlp": 0.24301575, + "epoch": 0.5894483691567713, + "flos": 20634056553600.0, + "grad_norm": 2.3734516151172893, + "language_loss": 0.77911937, + "learning_rate": 1.5223140896875686e-06, + "loss": 0.79643404, + "num_input_tokens_seen": 211307700, + "router_z_loss_clip": 2.75, + "router_z_loss_mlp": 0.3840332, + "step": 9804, + "time_per_iteration": 2.7216527462005615 + }, + { + "auxiliary_loss_clip": 0.01459676, + "auxiliary_loss_mlp": 0.0024905, + "balance_loss_clip": 1.18780565, + "balance_loss_mlp": 0.21188051, + "epoch": 0.5895084924094394, + "flos": 17778223877760.0, + "grad_norm": 63.24793270227469, + "language_loss": 0.82747436, + "learning_rate": 1.5219359087845234e-06, + "loss": 0.84456164, + "num_input_tokens_seen": 211324835, + "router_z_loss_clip": 2.71484375, + "router_z_loss_mlp": 0.37182617, + "step": 9805, + "time_per_iteration": 2.630599021911621 + }, + { + "auxiliary_loss_clip": 0.01460324, + "auxiliary_loss_mlp": 0.0027511, + "balance_loss_clip": 1.18509316, + "balance_loss_mlp": 0.23891842, + "epoch": 0.5895686156621073, + "flos": 20121072468480.0, + "grad_norm": 4.662170052742493, + "language_loss": 0.86941904, + "learning_rate": 1.5215577460097174e-06, + "loss": 0.88677335, + "num_input_tokens_seen": 211344130, + "router_z_loss_clip": 2.75585938, + "router_z_loss_mlp": 0.36206055, + "step": 9806, + "time_per_iteration": 2.656167984008789 + }, + { + "auxiliary_loss_clip": 0.01476659, + "auxiliary_loss_mlp": 0.0028244, + "balance_loss_clip": 1.19779742, + "balance_loss_mlp": 0.24710615, + "epoch": 0.5896287389147753, + "flos": 20850058990080.0, + "grad_norm": 19.910808251034002, + "language_loss": 0.87735647, + "learning_rate": 1.5211796013774887e-06, + "loss": 0.89494741, + "num_input_tokens_seen": 211362915, + "router_z_loss_clip": 2.7890625, + "router_z_loss_mlp": 0.35302734, + "step": 9807, + "time_per_iteration": 2.6964542865753174 + }, + { + "auxiliary_loss_clip": 0.01461865, + "auxiliary_loss_mlp": 0.00264049, + "balance_loss_clip": 1.19022298, + "balance_loss_mlp": 0.22912024, + "epoch": 0.5896888621674432, + "flos": 14537897043840.0, + "grad_norm": 6.199578155987513, + "language_loss": 0.8352977, + "learning_rate": 1.5208014749021786e-06, + "loss": 0.85255688, + "num_input_tokens_seen": 211380700, + "router_z_loss_clip": 2.7109375, + "router_z_loss_mlp": 0.34936523, + "step": 9808, + "time_per_iteration": 2.7053072452545166 + }, + { + "auxiliary_loss_clip": 0.01470446, + "auxiliary_loss_mlp": 0.00285582, + "balance_loss_clip": 1.19693053, + "balance_loss_mlp": 0.25029653, + "epoch": 0.5897489854201112, + "flos": 20886759711360.0, + "grad_norm": 24.593236949094756, + "language_loss": 0.81472635, + "learning_rate": 1.5204233665981236e-06, + "loss": 0.83228672, + "num_input_tokens_seen": 211400095, + "router_z_loss_clip": 2.73632812, + "router_z_loss_mlp": 0.35302734, + "step": 9809, + "time_per_iteration": 2.699645519256592 + }, + { + "auxiliary_loss_clip": 0.01460456, + "auxiliary_loss_mlp": 0.00272201, + "balance_loss_clip": 1.18939888, + "balance_loss_mlp": 0.23586641, + "epoch": 0.5898091086727792, + "flos": 20011149872640.0, + "grad_norm": 55.05087077654035, + "language_loss": 0.90899014, + "learning_rate": 1.5200452764796627e-06, + "loss": 0.9263168, + "num_input_tokens_seen": 211417810, + "router_z_loss_clip": 2.70898438, + "router_z_loss_mlp": 0.36303711, + "step": 9810, + "time_per_iteration": 2.6804287433624268 + }, + { + "auxiliary_loss_clip": 0.01455618, + "auxiliary_loss_mlp": 0.00260101, + "balance_loss_clip": 1.19197369, + "balance_loss_mlp": 0.22626904, + "epoch": 0.5898692319254472, + "flos": 16253242012800.0, + "grad_norm": 8.154298527227976, + "language_loss": 0.8886925, + "learning_rate": 1.5196672045611336e-06, + "loss": 0.90584964, + "num_input_tokens_seen": 211436020, + "router_z_loss_clip": 2.63476562, + "router_z_loss_mlp": 0.33837891, + "step": 9811, + "time_per_iteration": 2.69559645652771 + }, + { + "auxiliary_loss_clip": 0.01490878, + "auxiliary_loss_mlp": 0.00264192, + "balance_loss_clip": 1.2010963, + "balance_loss_mlp": 0.22635543, + "epoch": 0.5899293551781152, + "flos": 20448541785600.0, + "grad_norm": 4.893443018670184, + "language_loss": 0.84826154, + "learning_rate": 1.5192891508568715e-06, + "loss": 0.86581224, + "num_input_tokens_seen": 211454335, + "router_z_loss_clip": 2.8984375, + "router_z_loss_mlp": 0.37841797, + "step": 9812, + "time_per_iteration": 4.055715799331665 + }, + { + "auxiliary_loss_clip": 0.0146301, + "auxiliary_loss_mlp": 0.00280365, + "balance_loss_clip": 1.19781113, + "balance_loss_mlp": 0.2451265, + "epoch": 0.5899894784307831, + "flos": 13881701433600.0, + "grad_norm": 4.412660621467103, + "language_loss": 0.76213706, + "learning_rate": 1.5189111153812133e-06, + "loss": 0.77957076, + "num_input_tokens_seen": 211472775, + "router_z_loss_clip": 2.65039062, + "router_z_loss_mlp": 0.35253906, + "step": 9813, + "time_per_iteration": 4.165541410446167 + }, + { + "auxiliary_loss_clip": 0.01458701, + "auxiliary_loss_mlp": 0.00279029, + "balance_loss_clip": 1.18943381, + "balance_loss_mlp": 0.24348038, + "epoch": 0.5900496016834511, + "flos": 20083797129600.0, + "grad_norm": 6.664907559916376, + "language_loss": 0.77416551, + "learning_rate": 1.518533098148494e-06, + "loss": 0.79154277, + "num_input_tokens_seen": 211492195, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.35546875, + "step": 9814, + "time_per_iteration": 2.6638174057006836 + }, + { + "auxiliary_loss_clip": 0.01477467, + "auxiliary_loss_mlp": 0.00285508, + "balance_loss_clip": 1.2047962, + "balance_loss_mlp": 0.24869579, + "epoch": 0.590109724936119, + "flos": 20259148348800.0, + "grad_norm": 12.954842924840529, + "language_loss": 0.86188042, + "learning_rate": 1.5181550991730476e-06, + "loss": 0.87951016, + "num_input_tokens_seen": 211510220, + "router_z_loss_clip": 2.7265625, + "router_z_loss_mlp": 0.36816406, + "step": 9815, + "time_per_iteration": 2.6507041454315186 + }, + { + "auxiliary_loss_clip": 0.01473259, + "auxiliary_loss_mlp": 0.00314495, + "balance_loss_clip": 1.19567323, + "balance_loss_mlp": 0.27599066, + "epoch": 0.590169848188787, + "flos": 24235069806720.0, + "grad_norm": 23.221791259601495, + "language_loss": 0.84579408, + "learning_rate": 1.5177771184692083e-06, + "loss": 0.8636716, + "num_input_tokens_seen": 211526260, + "router_z_loss_clip": 2.77929688, + "router_z_loss_mlp": 0.38525391, + "step": 9816, + "time_per_iteration": 4.064629316329956 + }, + { + "auxiliary_loss_clip": 0.01455592, + "auxiliary_loss_mlp": 0.00287666, + "balance_loss_clip": 1.18881845, + "balance_loss_mlp": 0.25395346, + "epoch": 0.590229971441455, + "flos": 17784724239360.0, + "grad_norm": 67.48930658067611, + "language_loss": 0.88326997, + "learning_rate": 1.517399156051309e-06, + "loss": 0.90070248, + "num_input_tokens_seen": 211542890, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.33740234, + "step": 9817, + "time_per_iteration": 2.632958173751831 + }, + { + "auxiliary_loss_clip": 0.01469758, + "auxiliary_loss_mlp": 0.00279928, + "balance_loss_clip": 1.19950151, + "balance_loss_mlp": 0.24025482, + "epoch": 0.590290094694123, + "flos": 22236893147520.0, + "grad_norm": 125.06223203402995, + "language_loss": 0.84421831, + "learning_rate": 1.517021211933682e-06, + "loss": 0.8617152, + "num_input_tokens_seen": 211562685, + "router_z_loss_clip": 2.70703125, + "router_z_loss_mlp": 0.39648438, + "step": 9818, + "time_per_iteration": 2.6349480152130127 + }, + { + "auxiliary_loss_clip": 0.01466887, + "auxiliary_loss_mlp": 0.00284745, + "balance_loss_clip": 1.19916558, + "balance_loss_mlp": 0.2499356, + "epoch": 0.5903502179467909, + "flos": 19098623831040.0, + "grad_norm": 482.9918526664797, + "language_loss": 0.76123679, + "learning_rate": 1.5166432861306592e-06, + "loss": 0.77875304, + "num_input_tokens_seen": 211579960, + "router_z_loss_clip": 2.67773438, + "router_z_loss_mlp": 0.34863281, + "step": 9819, + "time_per_iteration": 2.6330816745758057 + }, + { + "auxiliary_loss_clip": 0.01472736, + "auxiliary_loss_mlp": 0.00317005, + "balance_loss_clip": 1.20528638, + "balance_loss_mlp": 0.2811228, + "epoch": 0.5904103411994589, + "flos": 24235500769920.0, + "grad_norm": 3.4526620377379706, + "language_loss": 0.85529912, + "learning_rate": 1.5162653786565714e-06, + "loss": 0.87319648, + "num_input_tokens_seen": 211599310, + "router_z_loss_clip": 2.67382812, + "router_z_loss_mlp": 0.35864258, + "step": 9820, + "time_per_iteration": 2.6815593242645264 + }, + { + "auxiliary_loss_clip": 0.01532907, + "auxiliary_loss_mlp": 0.00128096, + "balance_loss_clip": 1.30271101, + "balance_loss_mlp": 0.11698586, + "epoch": 0.5904704644521268, + "flos": 64876613045760.0, + "grad_norm": 1.1062929101518333, + "language_loss": 0.64664656, + "learning_rate": 1.5158874895257487e-06, + "loss": 0.66325659, + "num_input_tokens_seen": 211658790, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.11132812, + "step": 9821, + "time_per_iteration": 3.184199333190918 + }, + { + "auxiliary_loss_clip": 0.01465714, + "auxiliary_loss_mlp": 0.00299872, + "balance_loss_clip": 1.19776952, + "balance_loss_mlp": 0.26363283, + "epoch": 0.5905305877047948, + "flos": 19609991804160.0, + "grad_norm": 7.923933232052816, + "language_loss": 0.70191556, + "learning_rate": 1.515509618752521e-06, + "loss": 0.71957135, + "num_input_tokens_seen": 211677240, + "router_z_loss_clip": 2.6796875, + "router_z_loss_mlp": 0.36254883, + "step": 9822, + "time_per_iteration": 4.078229188919067 + }, + { + "auxiliary_loss_clip": 0.01470973, + "auxiliary_loss_mlp": 0.00306622, + "balance_loss_clip": 1.20088136, + "balance_loss_mlp": 0.27186024, + "epoch": 0.5905907109574628, + "flos": 18989634988800.0, + "grad_norm": 49.427773294066775, + "language_loss": 0.90897226, + "learning_rate": 1.5151317663512173e-06, + "loss": 0.92674822, + "num_input_tokens_seen": 211695485, + "router_z_loss_clip": 2.70117188, + "router_z_loss_mlp": 0.34765625, + "step": 9823, + "time_per_iteration": 2.6563172340393066 + }, + { + "auxiliary_loss_clip": 0.01480132, + "auxiliary_loss_mlp": 0.00301715, + "balance_loss_clip": 1.20924211, + "balance_loss_mlp": 0.26445055, + "epoch": 0.5906508342101308, + "flos": 22200407907840.0, + "grad_norm": 66.35162497364094, + "language_loss": 0.8126201, + "learning_rate": 1.514753932336165e-06, + "loss": 0.83043855, + "num_input_tokens_seen": 211713090, + "router_z_loss_clip": 2.71289062, + "router_z_loss_mlp": 0.37255859, + "step": 9824, + "time_per_iteration": 2.685814380645752 + }, + { + "auxiliary_loss_clip": 0.01475089, + "auxiliary_loss_mlp": 0.00351605, + "balance_loss_clip": 1.1895076, + "balance_loss_mlp": 0.31124067, + "epoch": 0.5907109574627988, + "flos": 20886687884160.0, + "grad_norm": 4.23930490315166, + "language_loss": 0.94340652, + "learning_rate": 1.514376116721693e-06, + "loss": 0.96167344, + "num_input_tokens_seen": 211732510, + "router_z_loss_clip": 2.85742188, + "router_z_loss_mlp": 0.40380859, + "step": 9825, + "time_per_iteration": 2.749094009399414 + }, + { + "auxiliary_loss_clip": 0.01450885, + "auxiliary_loss_mlp": 0.00315425, + "balance_loss_clip": 1.18868399, + "balance_loss_mlp": 0.28040123, + "epoch": 0.5907710807154667, + "flos": 21506649649920.0, + "grad_norm": 20.14379221029191, + "language_loss": 0.8200863, + "learning_rate": 1.5139983195221272e-06, + "loss": 0.83774942, + "num_input_tokens_seen": 211748695, + "router_z_loss_clip": 2.62109375, + "router_z_loss_mlp": 0.34985352, + "step": 9826, + "time_per_iteration": 2.7561163902282715 + }, + { + "auxiliary_loss_clip": 0.01489691, + "auxiliary_loss_mlp": 0.00299727, + "balance_loss_clip": 1.21031296, + "balance_loss_mlp": 0.2640354, + "epoch": 0.5908312039681347, + "flos": 22018376759040.0, + "grad_norm": 5.787714201645495, + "language_loss": 0.80117297, + "learning_rate": 1.513620540751793e-06, + "loss": 0.81906712, + "num_input_tokens_seen": 211768545, + "router_z_loss_clip": 2.796875, + "router_z_loss_mlp": 0.35668945, + "step": 9827, + "time_per_iteration": 2.673064708709717 + }, + { + "auxiliary_loss_clip": 0.01481745, + "auxiliary_loss_mlp": 0.00325626, + "balance_loss_clip": 1.2085743, + "balance_loss_mlp": 0.29041147, + "epoch": 0.5908913272208026, + "flos": 18479523991680.0, + "grad_norm": 1.9019059185243696, + "language_loss": 0.85712254, + "learning_rate": 1.5132427804250178e-06, + "loss": 0.87519622, + "num_input_tokens_seen": 211786665, + "router_z_loss_clip": 2.73046875, + "router_z_loss_mlp": 0.35229492, + "step": 9828, + "time_per_iteration": 2.6774580478668213 + }, + { + "auxiliary_loss_clip": 0.01494964, + "auxiliary_loss_mlp": 0.00316736, + "balance_loss_clip": 1.21175992, + "balance_loss_mlp": 0.27737352, + "epoch": 0.5909514504734706, + "flos": 12312189682560.0, + "grad_norm": 18.95803272588925, + "language_loss": 0.96565235, + "learning_rate": 1.5128650385561241e-06, + "loss": 0.98376936, + "num_input_tokens_seen": 211801215, + "router_z_loss_clip": 2.83007812, + "router_z_loss_mlp": 0.39379883, + "step": 9829, + "time_per_iteration": 2.645596742630005 + }, + { + "auxiliary_loss_clip": 0.01505436, + "auxiliary_loss_mlp": 0.00117243, + "balance_loss_clip": 1.28202176, + "balance_loss_mlp": 0.10484513, + "epoch": 0.5910115737261386, + "flos": 70213262451840.0, + "grad_norm": 0.7464648153642631, + "language_loss": 0.57608312, + "learning_rate": 1.5124873151594376e-06, + "loss": 0.59230995, + "num_input_tokens_seen": 211857005, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.12402344, + "step": 9830, + "time_per_iteration": 3.1185085773468018 + }, + { + "auxiliary_loss_clip": 0.01508483, + "auxiliary_loss_mlp": 0.00297097, + "balance_loss_clip": 1.21290922, + "balance_loss_mlp": 0.25914064, + "epoch": 0.5910716969788066, + "flos": 22017766227840.0, + "grad_norm": 52.86516958465032, + "language_loss": 0.91713881, + "learning_rate": 1.5121096102492812e-06, + "loss": 0.93519461, + "num_input_tokens_seen": 211876675, + "router_z_loss_clip": 2.95507812, + "router_z_loss_mlp": 0.37988281, + "step": 9831, + "time_per_iteration": 2.7070376873016357 + }, + { + "auxiliary_loss_clip": 0.01451208, + "auxiliary_loss_mlp": 0.00267996, + "balance_loss_clip": 1.19184291, + "balance_loss_mlp": 0.23230508, + "epoch": 0.5911318202314745, + "flos": 21251648021760.0, + "grad_norm": 15.231786178291578, + "language_loss": 0.84000003, + "learning_rate": 1.5117319238399767e-06, + "loss": 0.8571921, + "num_input_tokens_seen": 211895725, + "router_z_loss_clip": 2.59375, + "router_z_loss_mlp": 0.35693359, + "step": 9832, + "time_per_iteration": 2.6910898685455322 + }, + { + "auxiliary_loss_clip": 0.01430203, + "auxiliary_loss_mlp": 0.00318812, + "balance_loss_clip": 1.16868663, + "balance_loss_mlp": 0.28471842, + "epoch": 0.5911919434841425, + "flos": 17821604528640.0, + "grad_norm": 4.907792446618618, + "language_loss": 0.9116441, + "learning_rate": 1.511354255945847e-06, + "loss": 0.92913425, + "num_input_tokens_seen": 211913860, + "router_z_loss_clip": 2.6171875, + "router_z_loss_mlp": 0.34106445, + "step": 9833, + "time_per_iteration": 2.6622090339660645 + }, + { + "auxiliary_loss_clip": 0.01445877, + "auxiliary_loss_mlp": 0.00301013, + "balance_loss_clip": 1.18425393, + "balance_loss_mlp": 0.26422507, + "epoch": 0.5912520667368104, + "flos": 20374781207040.0, + "grad_norm": 40.19126541259085, + "language_loss": 0.816365, + "learning_rate": 1.5109766065812123e-06, + "loss": 0.83383393, + "num_input_tokens_seen": 211932880, + "router_z_loss_clip": 2.62109375, + "router_z_loss_mlp": 0.36767578, + "step": 9834, + "time_per_iteration": 2.734718084335327 + }, + { + "auxiliary_loss_clip": 0.01468623, + "auxiliary_loss_mlp": 0.00262866, + "balance_loss_clip": 1.19714546, + "balance_loss_mlp": 0.23067942, + "epoch": 0.5913121899894784, + "flos": 17930557457280.0, + "grad_norm": 10.74890940781382, + "language_loss": 0.86840856, + "learning_rate": 1.5105989757603942e-06, + "loss": 0.88572341, + "num_input_tokens_seen": 211948625, + "router_z_loss_clip": 2.71484375, + "router_z_loss_mlp": 0.32177734, + "step": 9835, + "time_per_iteration": 2.639735460281372 + }, + { + "auxiliary_loss_clip": 0.01465659, + "auxiliary_loss_mlp": 0.00285, + "balance_loss_clip": 1.1905489, + "balance_loss_mlp": 0.24821189, + "epoch": 0.5913723132421465, + "flos": 22126934638080.0, + "grad_norm": 7.008198970993985, + "language_loss": 0.83002919, + "learning_rate": 1.5102213634977117e-06, + "loss": 0.84753585, + "num_input_tokens_seen": 211965355, + "router_z_loss_clip": 2.75195312, + "router_z_loss_mlp": 0.36791992, + "step": 9836, + "time_per_iteration": 2.675624370574951 + }, + { + "auxiliary_loss_clip": 0.01472647, + "auxiliary_loss_mlp": 0.00287301, + "balance_loss_clip": 1.19862914, + "balance_loss_mlp": 0.2512756, + "epoch": 0.5914324364948144, + "flos": 15697918771200.0, + "grad_norm": 7.848924686274073, + "language_loss": 0.93524665, + "learning_rate": 1.5098437698074841e-06, + "loss": 0.95284605, + "num_input_tokens_seen": 211982245, + "router_z_loss_clip": 2.74023438, + "router_z_loss_mlp": 0.36035156, + "step": 9837, + "time_per_iteration": 2.6648731231689453 + }, + { + "auxiliary_loss_clip": 0.01466642, + "auxiliary_loss_mlp": 0.00314027, + "balance_loss_clip": 1.19505453, + "balance_loss_mlp": 0.27394876, + "epoch": 0.5914925597474824, + "flos": 22747327367040.0, + "grad_norm": 17.927501516814694, + "language_loss": 0.86555082, + "learning_rate": 1.5094661947040304e-06, + "loss": 0.88335752, + "num_input_tokens_seen": 212000250, + "router_z_loss_clip": 2.71484375, + "router_z_loss_mlp": 0.40087891, + "step": 9838, + "time_per_iteration": 2.7165932655334473 + }, + { + "auxiliary_loss_clip": 0.01460026, + "auxiliary_loss_mlp": 0.00280965, + "balance_loss_clip": 1.1871413, + "balance_loss_mlp": 0.24603659, + "epoch": 0.5915526830001503, + "flos": 18292788161280.0, + "grad_norm": 101.15500425465639, + "language_loss": 0.77888095, + "learning_rate": 1.5090886382016673e-06, + "loss": 0.79629087, + "num_input_tokens_seen": 212017505, + "router_z_loss_clip": 2.7265625, + "router_z_loss_mlp": 0.34912109, + "step": 9839, + "time_per_iteration": 2.672856330871582 + }, + { + "auxiliary_loss_clip": 0.01465333, + "auxiliary_loss_mlp": 0.00322869, + "balance_loss_clip": 1.19870758, + "balance_loss_mlp": 0.28670064, + "epoch": 0.5916128062528183, + "flos": 17019072910080.0, + "grad_norm": 20.55763808358542, + "language_loss": 0.73081607, + "learning_rate": 1.5087111003147124e-06, + "loss": 0.74869812, + "num_input_tokens_seen": 212034595, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.36181641, + "step": 9840, + "time_per_iteration": 2.6809816360473633 + }, + { + "auxiliary_loss_clip": 0.01468998, + "auxiliary_loss_mlp": 0.00301625, + "balance_loss_clip": 1.19385648, + "balance_loss_mlp": 0.26319221, + "epoch": 0.5916729295054862, + "flos": 24754231031040.0, + "grad_norm": 5.0660359190492334, + "language_loss": 0.88029432, + "learning_rate": 1.5083335810574813e-06, + "loss": 0.89800054, + "num_input_tokens_seen": 212055775, + "router_z_loss_clip": 2.75585938, + "router_z_loss_mlp": 0.38427734, + "step": 9841, + "time_per_iteration": 2.7070467472076416 + }, + { + "auxiliary_loss_clip": 0.01457733, + "auxiliary_loss_mlp": 0.00274547, + "balance_loss_clip": 1.19044077, + "balance_loss_mlp": 0.24109688, + "epoch": 0.5917330527581542, + "flos": 15958199698560.0, + "grad_norm": 2.993261078552376, + "language_loss": 0.76162952, + "learning_rate": 1.507956080444291e-06, + "loss": 0.77895236, + "num_input_tokens_seen": 212074000, + "router_z_loss_clip": 2.67578125, + "router_z_loss_mlp": 0.33447266, + "step": 9842, + "time_per_iteration": 2.6800053119659424 + }, + { + "auxiliary_loss_clip": 0.01442849, + "auxiliary_loss_mlp": 0.00319291, + "balance_loss_clip": 1.17226994, + "balance_loss_mlp": 0.28226411, + "epoch": 0.5917931760108222, + "flos": 23800730549760.0, + "grad_norm": 28.796897877350066, + "language_loss": 0.8847307, + "learning_rate": 1.5075785984894549e-06, + "loss": 0.90235209, + "num_input_tokens_seen": 212091415, + "router_z_loss_clip": 2.70898438, + "router_z_loss_mlp": 0.37036133, + "step": 9843, + "time_per_iteration": 2.6853530406951904 + }, + { + "auxiliary_loss_clip": 0.01464016, + "auxiliary_loss_mlp": 0.00293084, + "balance_loss_clip": 1.1908443, + "balance_loss_mlp": 0.25794142, + "epoch": 0.5918532992634902, + "flos": 23249609199360.0, + "grad_norm": 53.26492211869063, + "language_loss": 0.91761816, + "learning_rate": 1.5072011352072875e-06, + "loss": 0.93518913, + "num_input_tokens_seen": 212105255, + "router_z_loss_clip": 2.734375, + "router_z_loss_mlp": 0.35131836, + "step": 9844, + "time_per_iteration": 2.706874132156372 + }, + { + "auxiliary_loss_clip": 0.01441935, + "auxiliary_loss_mlp": 0.0027557, + "balance_loss_clip": 1.17514193, + "balance_loss_mlp": 0.24331211, + "epoch": 0.5919134225161581, + "flos": 19499853726720.0, + "grad_norm": 226.14954246795605, + "language_loss": 0.81536454, + "learning_rate": 1.5068236906121032e-06, + "loss": 0.83253962, + "num_input_tokens_seen": 212122765, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.32226562, + "step": 9845, + "time_per_iteration": 2.632582902908325 + }, + { + "auxiliary_loss_clip": 0.01449562, + "auxiliary_loss_mlp": 0.00283158, + "balance_loss_clip": 1.17169535, + "balance_loss_mlp": 0.24763392, + "epoch": 0.5919735457688261, + "flos": 38800940567040.0, + "grad_norm": 938.7277273818116, + "language_loss": 0.74268115, + "learning_rate": 1.506446264718213e-06, + "loss": 0.76000834, + "num_input_tokens_seen": 212143960, + "router_z_loss_clip": 2.77734375, + "router_z_loss_mlp": 0.35498047, + "step": 9846, + "time_per_iteration": 2.8471009731292725 + }, + { + "auxiliary_loss_clip": 0.01464888, + "auxiliary_loss_mlp": 0.00298333, + "balance_loss_clip": 1.19845295, + "balance_loss_mlp": 0.26440641, + "epoch": 0.592033669021494, + "flos": 22163994495360.0, + "grad_norm": 16.358482299053545, + "language_loss": 0.82697076, + "learning_rate": 1.506068857539931e-06, + "loss": 0.84460294, + "num_input_tokens_seen": 212162005, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.33911133, + "step": 9847, + "time_per_iteration": 2.6835548877716064 + }, + { + "auxiliary_loss_clip": 0.01478768, + "auxiliary_loss_mlp": 0.00275662, + "balance_loss_clip": 1.19633973, + "balance_loss_mlp": 0.24111506, + "epoch": 0.592093792274162, + "flos": 22710985781760.0, + "grad_norm": 14.175486984303348, + "language_loss": 0.71645164, + "learning_rate": 1.5056914690915667e-06, + "loss": 0.73399603, + "num_input_tokens_seen": 212181635, + "router_z_loss_clip": 2.82421875, + "router_z_loss_mlp": 0.34521484, + "step": 9848, + "time_per_iteration": 2.736417531967163 + }, + { + "auxiliary_loss_clip": 0.01445219, + "auxiliary_loss_mlp": 0.0034156, + "balance_loss_clip": 1.17667902, + "balance_loss_mlp": 0.30491549, + "epoch": 0.59215391552683, + "flos": 22528954632960.0, + "grad_norm": 4.298839828126828, + "language_loss": 0.8446424, + "learning_rate": 1.5053140993874312e-06, + "loss": 0.8625102, + "num_input_tokens_seen": 212201615, + "router_z_loss_clip": 2.68554688, + "router_z_loss_mlp": 0.36621094, + "step": 9849, + "time_per_iteration": 2.7862935066223145 + }, + { + "auxiliary_loss_clip": 0.01450549, + "auxiliary_loss_mlp": 0.00292668, + "balance_loss_clip": 1.17817092, + "balance_loss_mlp": 0.25602332, + "epoch": 0.592214038779498, + "flos": 24499013921280.0, + "grad_norm": 2962.871254075516, + "language_loss": 0.8283686, + "learning_rate": 1.5049367484418353e-06, + "loss": 0.84580076, + "num_input_tokens_seen": 212219355, + "router_z_loss_clip": 2.72070312, + "router_z_loss_mlp": 0.36669922, + "step": 9850, + "time_per_iteration": 2.7426486015319824 + }, + { + "auxiliary_loss_clip": 0.01438762, + "auxiliary_loss_mlp": 0.00312984, + "balance_loss_clip": 1.17687035, + "balance_loss_mlp": 0.2811076, + "epoch": 0.592274162032166, + "flos": 21831353619840.0, + "grad_norm": 9.118937782162256, + "language_loss": 0.8215934, + "learning_rate": 1.5045594162690868e-06, + "loss": 0.83911085, + "num_input_tokens_seen": 212236710, + "router_z_loss_clip": 2.61523438, + "router_z_loss_mlp": 0.3190918, + "step": 9851, + "time_per_iteration": 2.6561756134033203 + }, + { + "auxiliary_loss_clip": 0.01443587, + "auxiliary_loss_mlp": 0.00329978, + "balance_loss_clip": 1.17967081, + "balance_loss_mlp": 0.29497844, + "epoch": 0.5923342852848339, + "flos": 24608146417920.0, + "grad_norm": 83.99252907700021, + "language_loss": 0.78716826, + "learning_rate": 1.5041821028834954e-06, + "loss": 0.80490386, + "num_input_tokens_seen": 212256195, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.35009766, + "step": 9852, + "time_per_iteration": 2.7424511909484863 + }, + { + "auxiliary_loss_clip": 0.01437507, + "auxiliary_loss_mlp": 0.00305297, + "balance_loss_clip": 1.1671989, + "balance_loss_mlp": 0.27039286, + "epoch": 0.5923944085375019, + "flos": 19938143479680.0, + "grad_norm": 1.9598793737321079, + "language_loss": 0.87193763, + "learning_rate": 1.5038048082993685e-06, + "loss": 0.88936567, + "num_input_tokens_seen": 212274085, + "router_z_loss_clip": 2.70117188, + "router_z_loss_mlp": 0.34899902, + "step": 9853, + "time_per_iteration": 2.682861089706421 + }, + { + "auxiliary_loss_clip": 0.01470869, + "auxiliary_loss_mlp": 0.00282615, + "balance_loss_clip": 1.19921947, + "balance_loss_mlp": 0.24675706, + "epoch": 0.5924545317901698, + "flos": 28658510812800.0, + "grad_norm": 30.934778056911128, + "language_loss": 0.74047124, + "learning_rate": 1.5034275325310124e-06, + "loss": 0.7580061, + "num_input_tokens_seen": 212295530, + "router_z_loss_clip": 2.71484375, + "router_z_loss_mlp": 0.35888672, + "step": 9854, + "time_per_iteration": 4.245787858963013 + }, + { + "auxiliary_loss_clip": 0.01439293, + "auxiliary_loss_mlp": 0.00294375, + "balance_loss_clip": 1.17464995, + "balance_loss_mlp": 0.26156843, + "epoch": 0.5925146550428378, + "flos": 19864885691520.0, + "grad_norm": 23.62804026185717, + "language_loss": 0.96210563, + "learning_rate": 1.5030502755927344e-06, + "loss": 0.97944224, + "num_input_tokens_seen": 212313770, + "router_z_loss_clip": 2.6484375, + "router_z_loss_mlp": 0.328125, + "step": 9855, + "time_per_iteration": 4.219951868057251 + }, + { + "auxiliary_loss_clip": 0.01443998, + "auxiliary_loss_mlp": 0.00278741, + "balance_loss_clip": 1.17844605, + "balance_loss_mlp": 0.24753246, + "epoch": 0.5925747782955058, + "flos": 15122989681920.0, + "grad_norm": 30.161368309566647, + "language_loss": 0.94176614, + "learning_rate": 1.5026730374988397e-06, + "loss": 0.95899349, + "num_input_tokens_seen": 212331525, + "router_z_loss_clip": 2.65625, + "router_z_loss_mlp": 0.31225586, + "step": 9856, + "time_per_iteration": 2.64567494392395 + }, + { + "auxiliary_loss_clip": 0.01443545, + "auxiliary_loss_mlp": 0.00275445, + "balance_loss_clip": 1.17439985, + "balance_loss_mlp": 0.24182834, + "epoch": 0.5926349015481738, + "flos": 18405440190720.0, + "grad_norm": 15.530529186061308, + "language_loss": 0.84364498, + "learning_rate": 1.5022958182636332e-06, + "loss": 0.8608349, + "num_input_tokens_seen": 212347295, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.33618164, + "step": 9857, + "time_per_iteration": 2.626005172729492 + }, + { + "auxiliary_loss_clip": 0.01463858, + "auxiliary_loss_mlp": 0.00301267, + "balance_loss_clip": 1.1936816, + "balance_loss_mlp": 0.26629087, + "epoch": 0.5926950248008417, + "flos": 23111138269440.0, + "grad_norm": 39.66020012412898, + "language_loss": 0.72082698, + "learning_rate": 1.501918617901419e-06, + "loss": 0.73847818, + "num_input_tokens_seen": 212365750, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.34985352, + "step": 9858, + "time_per_iteration": 2.7187845706939697 + }, + { + "auxiliary_loss_clip": 0.01438943, + "auxiliary_loss_mlp": 0.00279842, + "balance_loss_clip": 1.17380714, + "balance_loss_mlp": 0.24541402, + "epoch": 0.5927551480535097, + "flos": 28033916192640.0, + "grad_norm": 212.2868180546516, + "language_loss": 0.83492434, + "learning_rate": 1.501541436426501e-06, + "loss": 0.85211217, + "num_input_tokens_seen": 212385300, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.34423828, + "step": 9859, + "time_per_iteration": 4.128331422805786 + }, + { + "auxiliary_loss_clip": 0.0146023, + "auxiliary_loss_mlp": 0.00276058, + "balance_loss_clip": 1.1844486, + "balance_loss_mlp": 0.24084404, + "epoch": 0.5928152713061776, + "flos": 21798675221760.0, + "grad_norm": 4.187935599303439, + "language_loss": 0.84330487, + "learning_rate": 1.5011642738531818e-06, + "loss": 0.86066771, + "num_input_tokens_seen": 212402140, + "router_z_loss_clip": 2.7578125, + "router_z_loss_mlp": 0.35229492, + "step": 9860, + "time_per_iteration": 2.6558737754821777 + }, + { + "auxiliary_loss_clip": 0.01456431, + "auxiliary_loss_mlp": 0.00267485, + "balance_loss_clip": 1.18576884, + "balance_loss_mlp": 0.23436883, + "epoch": 0.5928753945588456, + "flos": 24316839118080.0, + "grad_norm": 33.369115108060264, + "language_loss": 0.83452761, + "learning_rate": 1.500787130195763e-06, + "loss": 0.85176677, + "num_input_tokens_seen": 212421790, + "router_z_loss_clip": 2.70898438, + "router_z_loss_mlp": 0.33081055, + "step": 9861, + "time_per_iteration": 2.6949849128723145 + }, + { + "auxiliary_loss_clip": 0.01423269, + "auxiliary_loss_mlp": 0.00305644, + "balance_loss_clip": 1.16187406, + "balance_loss_mlp": 0.27302876, + "epoch": 0.5929355178115137, + "flos": 26464619923200.0, + "grad_norm": 77.33186359607825, + "language_loss": 0.75122076, + "learning_rate": 1.5004100054685465e-06, + "loss": 0.76850992, + "num_input_tokens_seen": 212442115, + "router_z_loss_clip": 2.61328125, + "router_z_loss_mlp": 0.32617188, + "step": 9862, + "time_per_iteration": 2.694572687149048 + }, + { + "auxiliary_loss_clip": 0.01423971, + "auxiliary_loss_mlp": 0.00273993, + "balance_loss_clip": 1.16096485, + "balance_loss_mlp": 0.24267627, + "epoch": 0.5929956410641816, + "flos": 24965995662720.0, + "grad_norm": 59.65783740673292, + "language_loss": 0.84409404, + "learning_rate": 1.500032899685832e-06, + "loss": 0.86107373, + "num_input_tokens_seen": 212459535, + "router_z_loss_clip": 2.6328125, + "router_z_loss_mlp": 0.31311035, + "step": 9863, + "time_per_iteration": 2.702258586883545 + }, + { + "auxiliary_loss_clip": 0.01472651, + "auxiliary_loss_mlp": 0.00294875, + "balance_loss_clip": 1.19935524, + "balance_loss_mlp": 0.26028088, + "epoch": 0.5930557643168496, + "flos": 26208325405440.0, + "grad_norm": 4.097955551040658, + "language_loss": 0.77815211, + "learning_rate": 1.499655812861921e-06, + "loss": 0.79582739, + "num_input_tokens_seen": 212479385, + "router_z_loss_clip": 2.734375, + "router_z_loss_mlp": 0.34619141, + "step": 9864, + "time_per_iteration": 4.1070473194122314 + }, + { + "auxiliary_loss_clip": 0.01434026, + "auxiliary_loss_mlp": 0.0028807, + "balance_loss_clip": 1.17057538, + "balance_loss_mlp": 0.25571692, + "epoch": 0.5931158875695175, + "flos": 27854937699840.0, + "grad_norm": 61.56773092020042, + "language_loss": 0.75440192, + "learning_rate": 1.4992787450111112e-06, + "loss": 0.7716229, + "num_input_tokens_seen": 212500060, + "router_z_loss_clip": 2.63476562, + "router_z_loss_mlp": 0.32348633, + "step": 9865, + "time_per_iteration": 2.7351908683776855 + }, + { + "auxiliary_loss_clip": 0.01460661, + "auxiliary_loss_mlp": 0.00282852, + "balance_loss_clip": 1.18790269, + "balance_loss_mlp": 0.24782883, + "epoch": 0.5931760108221855, + "flos": 15413650536960.0, + "grad_norm": 3.88678547848397, + "language_loss": 0.87669456, + "learning_rate": 1.4989016961477015e-06, + "loss": 0.89412963, + "num_input_tokens_seen": 212518590, + "router_z_loss_clip": 2.72851562, + "router_z_loss_mlp": 0.3503418, + "step": 9866, + "time_per_iteration": 2.728977680206299 + }, + { + "auxiliary_loss_clip": 0.01469724, + "auxiliary_loss_mlp": 0.00285482, + "balance_loss_clip": 1.20097351, + "balance_loss_mlp": 0.25315273, + "epoch": 0.5932361340748534, + "flos": 30188520581760.0, + "grad_norm": 31.61678718349712, + "language_loss": 0.78580081, + "learning_rate": 1.4985246662859903e-06, + "loss": 0.80335295, + "num_input_tokens_seen": 212538190, + "router_z_loss_clip": 2.6875, + "router_z_loss_mlp": 0.32324219, + "step": 9867, + "time_per_iteration": 2.7604734897613525 + }, + { + "auxiliary_loss_clip": 0.01460088, + "auxiliary_loss_mlp": 0.00290878, + "balance_loss_clip": 1.18901873, + "balance_loss_mlp": 0.25370854, + "epoch": 0.5932962573275214, + "flos": 20157557708160.0, + "grad_norm": 85.84715009285594, + "language_loss": 0.75173974, + "learning_rate": 1.4981476554402732e-06, + "loss": 0.76924932, + "num_input_tokens_seen": 212557820, + "router_z_loss_clip": 2.7109375, + "router_z_loss_mlp": 0.37158203, + "step": 9868, + "time_per_iteration": 2.7167835235595703 + }, + { + "auxiliary_loss_clip": 0.01452228, + "auxiliary_loss_mlp": 0.00315207, + "balance_loss_clip": 1.17687941, + "balance_loss_mlp": 0.27956307, + "epoch": 0.5933563805801894, + "flos": 25445906300160.0, + "grad_norm": 9.950357815816668, + "language_loss": 0.81664532, + "learning_rate": 1.4977706636248478e-06, + "loss": 0.83431965, + "num_input_tokens_seen": 212577645, + "router_z_loss_clip": 2.75, + "router_z_loss_mlp": 0.35668945, + "step": 9869, + "time_per_iteration": 2.7083303928375244 + }, + { + "auxiliary_loss_clip": 0.01490561, + "auxiliary_loss_mlp": 0.00288896, + "balance_loss_clip": 1.20737004, + "balance_loss_mlp": 0.24969959, + "epoch": 0.5934165038328574, + "flos": 59995740337920.0, + "grad_norm": 5.090103037790406, + "language_loss": 0.79936242, + "learning_rate": 1.4973936908540091e-06, + "loss": 0.81715703, + "num_input_tokens_seen": 212603430, + "router_z_loss_clip": 2.828125, + "router_z_loss_mlp": 0.3918457, + "step": 9870, + "time_per_iteration": 3.0205936431884766 + }, + { + "auxiliary_loss_clip": 0.01462296, + "auxiliary_loss_mlp": 0.00315965, + "balance_loss_clip": 1.18846273, + "balance_loss_mlp": 0.27939144, + "epoch": 0.5934766270855253, + "flos": 24420548661120.0, + "grad_norm": 16.045545491246536, + "language_loss": 0.81537628, + "learning_rate": 1.4970167371420517e-06, + "loss": 0.83315885, + "num_input_tokens_seen": 212620730, + "router_z_loss_clip": 2.74023438, + "router_z_loss_mlp": 0.3659668, + "step": 9871, + "time_per_iteration": 2.8234212398529053 + }, + { + "auxiliary_loss_clip": 0.01465198, + "auxiliary_loss_mlp": 0.00287943, + "balance_loss_clip": 1.1898669, + "balance_loss_mlp": 0.25115544, + "epoch": 0.5935367503381933, + "flos": 23513158264320.0, + "grad_norm": 82.78545572899289, + "language_loss": 0.83107162, + "learning_rate": 1.496639802503271e-06, + "loss": 0.84860301, + "num_input_tokens_seen": 212639745, + "router_z_loss_clip": 2.75390625, + "router_z_loss_mlp": 0.36791992, + "step": 9872, + "time_per_iteration": 2.7907941341400146 + }, + { + "auxiliary_loss_clip": 0.0146362, + "auxiliary_loss_mlp": 0.00314442, + "balance_loss_clip": 1.18371809, + "balance_loss_mlp": 0.27803516, + "epoch": 0.5935968735908612, + "flos": 18948337326720.0, + "grad_norm": 6691.857552820181, + "language_loss": 0.88066834, + "learning_rate": 1.4962628869519583e-06, + "loss": 0.89844894, + "num_input_tokens_seen": 212655915, + "router_z_loss_clip": 2.80078125, + "router_z_loss_mlp": 0.36401367, + "step": 9873, + "time_per_iteration": 2.663792848587036 + }, + { + "auxiliary_loss_clip": 0.01475834, + "auxiliary_loss_mlp": 0.00292449, + "balance_loss_clip": 1.19771707, + "balance_loss_mlp": 0.25494593, + "epoch": 0.5936569968435292, + "flos": 25483433034240.0, + "grad_norm": 19.84337259200671, + "language_loss": 0.9024213, + "learning_rate": 1.4958859905024078e-06, + "loss": 0.92010415, + "num_input_tokens_seen": 212676115, + "router_z_loss_clip": 2.78125, + "router_z_loss_mlp": 0.375, + "step": 9874, + "time_per_iteration": 2.7288613319396973 + }, + { + "auxiliary_loss_clip": 0.014908, + "auxiliary_loss_mlp": 0.00120416, + "balance_loss_clip": 1.26804423, + "balance_loss_mlp": 0.10854246, + "epoch": 0.5937171200961973, + "flos": 66378361789440.0, + "grad_norm": 0.693204142766552, + "language_loss": 0.59237176, + "learning_rate": 1.4955091131689115e-06, + "loss": 0.60848391, + "num_input_tokens_seen": 212737560, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.11865234, + "step": 9875, + "time_per_iteration": 3.2925188541412354 + }, + { + "auxiliary_loss_clip": 0.0146191, + "auxiliary_loss_mlp": 0.00280768, + "balance_loss_clip": 1.1799314, + "balance_loss_mlp": 0.24345513, + "epoch": 0.5937772433488652, + "flos": 14903467712640.0, + "grad_norm": 36.12365358482084, + "language_loss": 0.85037589, + "learning_rate": 1.4951322549657594e-06, + "loss": 0.86780262, + "num_input_tokens_seen": 212755365, + "router_z_loss_clip": 2.8203125, + "router_z_loss_mlp": 0.37329102, + "step": 9876, + "time_per_iteration": 2.6889657974243164 + }, + { + "auxiliary_loss_clip": 0.01449746, + "auxiliary_loss_mlp": 0.00301939, + "balance_loss_clip": 1.19027758, + "balance_loss_mlp": 0.26629579, + "epoch": 0.5938373666015332, + "flos": 22561489376640.0, + "grad_norm": 22.38973231787652, + "language_loss": 0.79803997, + "learning_rate": 1.494755415907243e-06, + "loss": 0.81555676, + "num_input_tokens_seen": 212773875, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.35668945, + "step": 9877, + "time_per_iteration": 2.6529200077056885 + }, + { + "auxiliary_loss_clip": 0.01478519, + "auxiliary_loss_mlp": 0.00299851, + "balance_loss_clip": 1.19903016, + "balance_loss_mlp": 0.26239544, + "epoch": 0.5938974898542011, + "flos": 18440883936000.0, + "grad_norm": 8.174596748437049, + "language_loss": 0.89258218, + "learning_rate": 1.4943785960076522e-06, + "loss": 0.91036594, + "num_input_tokens_seen": 212790590, + "router_z_loss_clip": 2.79492188, + "router_z_loss_mlp": 0.37475586, + "step": 9878, + "time_per_iteration": 2.6163697242736816 + }, + { + "auxiliary_loss_clip": 0.01471302, + "auxiliary_loss_mlp": 0.00331032, + "balance_loss_clip": 1.19612551, + "balance_loss_mlp": 0.29402909, + "epoch": 0.5939576131068691, + "flos": 45586728270720.0, + "grad_norm": 4.556496793336722, + "language_loss": 0.77193946, + "learning_rate": 1.4940017952812754e-06, + "loss": 0.78996277, + "num_input_tokens_seen": 212812265, + "router_z_loss_clip": 2.75390625, + "router_z_loss_mlp": 0.36987305, + "step": 9879, + "time_per_iteration": 2.9265189170837402 + }, + { + "auxiliary_loss_clip": 0.0147089, + "auxiliary_loss_mlp": 0.00289772, + "balance_loss_clip": 1.2013489, + "balance_loss_mlp": 0.25439042, + "epoch": 0.594017736359537, + "flos": 23587708942080.0, + "grad_norm": 3.375069495827247, + "language_loss": 0.65870386, + "learning_rate": 1.493625013742401e-06, + "loss": 0.67631048, + "num_input_tokens_seen": 212831915, + "router_z_loss_clip": 2.69140625, + "router_z_loss_mlp": 0.35375977, + "step": 9880, + "time_per_iteration": 2.722580671310425 + }, + { + "auxiliary_loss_clip": 0.01458356, + "auxiliary_loss_mlp": 0.00295207, + "balance_loss_clip": 1.18528271, + "balance_loss_mlp": 0.25989693, + "epoch": 0.594077859612205, + "flos": 29457235589760.0, + "grad_norm": 3.7279646874262236, + "language_loss": 0.86263251, + "learning_rate": 1.4932482514053177e-06, + "loss": 0.88016814, + "num_input_tokens_seen": 212851350, + "router_z_loss_clip": 2.73242188, + "router_z_loss_mlp": 0.35302734, + "step": 9881, + "time_per_iteration": 2.72393798828125 + }, + { + "auxiliary_loss_clip": 0.01480868, + "auxiliary_loss_mlp": 0.00306144, + "balance_loss_clip": 1.20126653, + "balance_loss_mlp": 0.2674244, + "epoch": 0.594137982864873, + "flos": 16800089644800.0, + "grad_norm": 20.847391549236384, + "language_loss": 0.92174178, + "learning_rate": 1.4928715082843112e-06, + "loss": 0.93961185, + "num_input_tokens_seen": 212867995, + "router_z_loss_clip": 2.79882812, + "router_z_loss_mlp": 0.38745117, + "step": 9882, + "time_per_iteration": 2.716397762298584 + }, + { + "auxiliary_loss_clip": 0.01484937, + "auxiliary_loss_mlp": 0.0029434, + "balance_loss_clip": 1.20329404, + "balance_loss_mlp": 0.26048455, + "epoch": 0.594198106117541, + "flos": 12750263953920.0, + "grad_norm": 6.381291255385358, + "language_loss": 0.89115751, + "learning_rate": 1.492494784393667e-06, + "loss": 0.90895033, + "num_input_tokens_seen": 212885220, + "router_z_loss_clip": 2.8203125, + "router_z_loss_mlp": 0.33862305, + "step": 9883, + "time_per_iteration": 2.6225528717041016 + }, + { + "auxiliary_loss_clip": 0.01474819, + "auxiliary_loss_mlp": 0.00273823, + "balance_loss_clip": 1.19872451, + "balance_loss_mlp": 0.23658216, + "epoch": 0.5942582293702089, + "flos": 20996538652800.0, + "grad_norm": 6.705618974231539, + "language_loss": 0.84062123, + "learning_rate": 1.4921180797476725e-06, + "loss": 0.85810763, + "num_input_tokens_seen": 212903195, + "router_z_loss_clip": 2.75976562, + "router_z_loss_mlp": 0.37255859, + "step": 9884, + "time_per_iteration": 2.697502374649048 + }, + { + "auxiliary_loss_clip": 0.01492972, + "auxiliary_loss_mlp": 0.00318569, + "balance_loss_clip": 1.20609856, + "balance_loss_mlp": 0.28197166, + "epoch": 0.5943183526228769, + "flos": 28291431772800.0, + "grad_norm": 10.982445436356532, + "language_loss": 0.77227426, + "learning_rate": 1.4917413943606106e-06, + "loss": 0.79038966, + "num_input_tokens_seen": 212923340, + "router_z_loss_clip": 2.86914062, + "router_z_loss_mlp": 0.36547852, + "step": 9885, + "time_per_iteration": 2.7172701358795166 + }, + { + "auxiliary_loss_clip": 0.01470808, + "auxiliary_loss_mlp": 0.00312502, + "balance_loss_clip": 1.19535732, + "balance_loss_mlp": 0.27340147, + "epoch": 0.5943784758755448, + "flos": 26614619118720.0, + "grad_norm": 31.730910303499858, + "language_loss": 0.85656375, + "learning_rate": 1.4913647282467667e-06, + "loss": 0.87439686, + "num_input_tokens_seen": 212942755, + "router_z_loss_clip": 2.75, + "router_z_loss_mlp": 0.39135742, + "step": 9886, + "time_per_iteration": 2.7271623611450195 + }, + { + "auxiliary_loss_clip": 0.01475223, + "auxiliary_loss_mlp": 0.00080213, + "balance_loss_clip": 1.25392604, + "balance_loss_mlp": 0.06819718, + "epoch": 0.5944385991282128, + "flos": 64190935347840.0, + "grad_norm": 0.8419632447241692, + "language_loss": 0.63697332, + "learning_rate": 1.490988081420423e-06, + "loss": 0.65252769, + "num_input_tokens_seen": 212999355, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.12011719, + "step": 9887, + "time_per_iteration": 3.0732152462005615 + }, + { + "auxiliary_loss_clip": 0.01469217, + "auxiliary_loss_mlp": 0.00320778, + "balance_loss_clip": 1.19714367, + "balance_loss_mlp": 0.28742331, + "epoch": 0.5944987223808808, + "flos": 19571998193280.0, + "grad_norm": 5.170479588840294, + "language_loss": 0.76923394, + "learning_rate": 1.4906114538958615e-06, + "loss": 0.78713393, + "num_input_tokens_seen": 213018570, + "router_z_loss_clip": 2.71875, + "router_z_loss_mlp": 0.33325195, + "step": 9888, + "time_per_iteration": 2.6562225818634033 + }, + { + "auxiliary_loss_clip": 0.01498973, + "auxiliary_loss_mlp": 0.00318698, + "balance_loss_clip": 1.21231246, + "balance_loss_mlp": 0.28043211, + "epoch": 0.5945588456335488, + "flos": 26177586341760.0, + "grad_norm": 26.056069759639005, + "language_loss": 0.8575995, + "learning_rate": 1.490234845687366e-06, + "loss": 0.87577623, + "num_input_tokens_seen": 213037735, + "router_z_loss_clip": 2.8671875, + "router_z_loss_mlp": 0.38305664, + "step": 9889, + "time_per_iteration": 2.7081849575042725 + }, + { + "auxiliary_loss_clip": 0.01452425, + "auxiliary_loss_mlp": 0.00326827, + "balance_loss_clip": 1.1824069, + "balance_loss_mlp": 0.29330555, + "epoch": 0.5946189688862168, + "flos": 20446494710400.0, + "grad_norm": 7.9870982348630415, + "language_loss": 0.77629948, + "learning_rate": 1.4898582568092154e-06, + "loss": 0.79409206, + "num_input_tokens_seen": 213057160, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.33520508, + "step": 9890, + "time_per_iteration": 2.6729488372802734 + }, + { + "auxiliary_loss_clip": 0.0147472, + "auxiliary_loss_mlp": 0.00323432, + "balance_loss_clip": 1.19880402, + "balance_loss_mlp": 0.28821808, + "epoch": 0.5946790921388847, + "flos": 13437521850240.0, + "grad_norm": 19.579375794245728, + "language_loss": 0.76960659, + "learning_rate": 1.489481687275691e-06, + "loss": 0.78758812, + "num_input_tokens_seen": 213073630, + "router_z_loss_clip": 2.76171875, + "router_z_loss_mlp": 0.35253906, + "step": 9891, + "time_per_iteration": 2.6554949283599854 + }, + { + "auxiliary_loss_clip": 0.01481012, + "auxiliary_loss_mlp": 0.00333118, + "balance_loss_clip": 1.20727229, + "balance_loss_mlp": 0.29911971, + "epoch": 0.5947392153915527, + "flos": 20412272027520.0, + "grad_norm": 56.04809253947543, + "language_loss": 0.62161183, + "learning_rate": 1.4891051371010726e-06, + "loss": 0.6397531, + "num_input_tokens_seen": 213092450, + "router_z_loss_clip": 2.7421875, + "router_z_loss_mlp": 0.33984375, + "step": 9892, + "time_per_iteration": 2.657518148422241 + }, + { + "auxiliary_loss_clip": 0.01482562, + "auxiliary_loss_mlp": 0.00118518, + "balance_loss_clip": 1.26126361, + "balance_loss_mlp": 0.10902866, + "epoch": 0.5947993386442206, + "flos": 65619138994560.0, + "grad_norm": 0.8265870626653891, + "language_loss": 0.53819823, + "learning_rate": 1.4887286062996375e-06, + "loss": 0.55420905, + "num_input_tokens_seen": 213155465, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.09472656, + "step": 9893, + "time_per_iteration": 3.250304937362671 + }, + { + "auxiliary_loss_clip": 0.01447285, + "auxiliary_loss_mlp": 0.00301192, + "balance_loss_clip": 1.18700457, + "balance_loss_mlp": 0.26790896, + "epoch": 0.5948594618968887, + "flos": 23183103168000.0, + "grad_norm": 7.123869544728069, + "language_loss": 0.79582244, + "learning_rate": 1.4883520948856658e-06, + "loss": 0.81330723, + "num_input_tokens_seen": 213174875, + "router_z_loss_clip": 2.60546875, + "router_z_loss_mlp": 0.33276367, + "step": 9894, + "time_per_iteration": 2.747856378555298 + }, + { + "auxiliary_loss_clip": 0.01482984, + "auxiliary_loss_mlp": 0.00296263, + "balance_loss_clip": 1.20525575, + "balance_loss_mlp": 0.26028556, + "epoch": 0.5949195851495566, + "flos": 13626771632640.0, + "grad_norm": 42.942595792101834, + "language_loss": 0.85340583, + "learning_rate": 1.487975602873434e-06, + "loss": 0.8711983, + "num_input_tokens_seen": 213192695, + "router_z_loss_clip": 2.77929688, + "router_z_loss_mlp": 0.35961914, + "step": 9895, + "time_per_iteration": 2.7213644981384277 + }, + { + "auxiliary_loss_clip": 0.01478039, + "auxiliary_loss_mlp": 0.00315565, + "balance_loss_clip": 1.19904673, + "balance_loss_mlp": 0.28247282, + "epoch": 0.5949797084022246, + "flos": 19751012599680.0, + "grad_norm": 51.08169300506529, + "language_loss": 0.84464556, + "learning_rate": 1.4875991302772182e-06, + "loss": 0.86258161, + "num_input_tokens_seen": 213211195, + "router_z_loss_clip": 2.7890625, + "router_z_loss_mlp": 0.33105469, + "step": 9896, + "time_per_iteration": 4.1640942096710205 + }, + { + "auxiliary_loss_clip": 0.01486806, + "auxiliary_loss_mlp": 0.00324128, + "balance_loss_clip": 1.20925093, + "balance_loss_mlp": 0.28757837, + "epoch": 0.5950398316548925, + "flos": 25773878407680.0, + "grad_norm": 24.91795900956461, + "language_loss": 0.91103399, + "learning_rate": 1.4872226771112954e-06, + "loss": 0.92914331, + "num_input_tokens_seen": 213231975, + "router_z_loss_clip": 2.77734375, + "router_z_loss_mlp": 0.36523438, + "step": 9897, + "time_per_iteration": 4.181141138076782 + }, + { + "auxiliary_loss_clip": 0.01483905, + "auxiliary_loss_mlp": 0.00311989, + "balance_loss_clip": 1.20670164, + "balance_loss_mlp": 0.27815759, + "epoch": 0.5950999549075605, + "flos": 23039029716480.0, + "grad_norm": 15.449107622286826, + "language_loss": 0.79886782, + "learning_rate": 1.486846243389939e-06, + "loss": 0.8168267, + "num_input_tokens_seen": 213249760, + "router_z_loss_clip": 2.7734375, + "router_z_loss_mlp": 0.33837891, + "step": 9898, + "time_per_iteration": 2.6774189472198486 + }, + { + "auxiliary_loss_clip": 0.01470975, + "auxiliary_loss_mlp": 0.00348705, + "balance_loss_clip": 1.1919241, + "balance_loss_mlp": 0.31115448, + "epoch": 0.5951600781602284, + "flos": 32446367637120.0, + "grad_norm": 307.6613662600995, + "language_loss": 0.72438848, + "learning_rate": 1.4864698291274251e-06, + "loss": 0.7425853, + "num_input_tokens_seen": 213269890, + "router_z_loss_clip": 2.79296875, + "router_z_loss_mlp": 0.37573242, + "step": 9899, + "time_per_iteration": 2.756307363510132 + }, + { + "auxiliary_loss_clip": 0.01495174, + "auxiliary_loss_mlp": 0.00354505, + "balance_loss_clip": 1.21591401, + "balance_loss_mlp": 0.31895661, + "epoch": 0.5952202014128964, + "flos": 23800874204160.0, + "grad_norm": 6.670110021337595, + "language_loss": 0.78000456, + "learning_rate": 1.4860934343380267e-06, + "loss": 0.79850131, + "num_input_tokens_seen": 213289400, + "router_z_loss_clip": 2.79492188, + "router_z_loss_mlp": 0.35571289, + "step": 9900, + "time_per_iteration": 2.695301055908203 + }, + { + "auxiliary_loss_clip": 0.01461183, + "auxiliary_loss_mlp": 0.00308865, + "balance_loss_clip": 1.19379473, + "balance_loss_mlp": 0.2756291, + "epoch": 0.5952803246655644, + "flos": 22492182084480.0, + "grad_norm": 56.51674674973559, + "language_loss": 0.90799189, + "learning_rate": 1.4857170590360169e-06, + "loss": 0.92569232, + "num_input_tokens_seen": 213308040, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.33227539, + "step": 9901, + "time_per_iteration": 4.123290777206421 + }, + { + "auxiliary_loss_clip": 0.01488879, + "auxiliary_loss_mlp": 0.00124454, + "balance_loss_clip": 1.27152586, + "balance_loss_mlp": 0.11458335, + "epoch": 0.5953404479182324, + "flos": 51234688851840.0, + "grad_norm": 0.8002927281896086, + "language_loss": 0.57274765, + "learning_rate": 1.4853407032356674e-06, + "loss": 0.58888096, + "num_input_tokens_seen": 213358585, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.09863281, + "step": 9902, + "time_per_iteration": 3.052156686782837 + }, + { + "auxiliary_loss_clip": 0.01471812, + "auxiliary_loss_mlp": 0.00290761, + "balance_loss_clip": 1.19754338, + "balance_loss_mlp": 0.25936151, + "epoch": 0.5954005711709004, + "flos": 23112682554240.0, + "grad_norm": 14.619705455726475, + "language_loss": 0.85779274, + "learning_rate": 1.4849643669512503e-06, + "loss": 0.87541848, + "num_input_tokens_seen": 213379585, + "router_z_loss_clip": 2.74023438, + "router_z_loss_mlp": 0.31420898, + "step": 9903, + "time_per_iteration": 2.6832470893859863 + }, + { + "auxiliary_loss_clip": 0.01453279, + "auxiliary_loss_mlp": 0.00315017, + "balance_loss_clip": 1.18439054, + "balance_loss_mlp": 0.28290188, + "epoch": 0.5954606944235683, + "flos": 35954732736000.0, + "grad_norm": 35.9819914518803, + "language_loss": 0.83732986, + "learning_rate": 1.4845880501970362e-06, + "loss": 0.85501277, + "num_input_tokens_seen": 213401465, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.32104492, + "step": 9904, + "time_per_iteration": 2.787508249282837 + }, + { + "auxiliary_loss_clip": 0.01443926, + "auxiliary_loss_mlp": 0.00310453, + "balance_loss_clip": 1.17707038, + "balance_loss_mlp": 0.27688342, + "epoch": 0.5955208176762363, + "flos": 30443665864320.0, + "grad_norm": 2.13467934450799, + "language_loss": 0.79422855, + "learning_rate": 1.4842117529872942e-06, + "loss": 0.81177229, + "num_input_tokens_seen": 213422720, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.33569336, + "step": 9905, + "time_per_iteration": 2.7255477905273438 + }, + { + "auxiliary_loss_clip": 0.01476949, + "auxiliary_loss_mlp": 0.00311149, + "balance_loss_clip": 1.19626307, + "balance_loss_mlp": 0.27736464, + "epoch": 0.5955809409289042, + "flos": 17640112083840.0, + "grad_norm": 327.88387189350925, + "language_loss": 0.77459538, + "learning_rate": 1.483835475336295e-06, + "loss": 0.79247636, + "num_input_tokens_seen": 213439480, + "router_z_loss_clip": 2.80859375, + "router_z_loss_mlp": 0.33789062, + "step": 9906, + "time_per_iteration": 4.038356065750122 + }, + { + "auxiliary_loss_clip": 0.01429177, + "auxiliary_loss_mlp": 0.0032698, + "balance_loss_clip": 1.16824639, + "balance_loss_mlp": 0.29565141, + "epoch": 0.5956410641815723, + "flos": 24279887001600.0, + "grad_norm": 25.02617395169087, + "language_loss": 0.82176322, + "learning_rate": 1.4834592172583057e-06, + "loss": 0.83932477, + "num_input_tokens_seen": 213458895, + "router_z_loss_clip": 2.61132812, + "router_z_loss_mlp": 0.31323242, + "step": 9907, + "time_per_iteration": 2.7398808002471924 + }, + { + "auxiliary_loss_clip": 0.01456778, + "auxiliary_loss_mlp": 0.00341845, + "balance_loss_clip": 1.18769634, + "balance_loss_mlp": 0.30753624, + "epoch": 0.5957011874342402, + "flos": 35734277013120.0, + "grad_norm": 13.094683824411073, + "language_loss": 0.73516726, + "learning_rate": 1.483082978767595e-06, + "loss": 0.7531535, + "num_input_tokens_seen": 213481730, + "router_z_loss_clip": 2.69335938, + "router_z_loss_mlp": 0.34277344, + "step": 9908, + "time_per_iteration": 2.8069729804992676 + }, + { + "auxiliary_loss_clip": 0.01440939, + "auxiliary_loss_mlp": 0.00312897, + "balance_loss_clip": 1.18013096, + "balance_loss_mlp": 0.28092489, + "epoch": 0.5957613106869082, + "flos": 21245004005760.0, + "grad_norm": 8.70924690554465, + "language_loss": 0.8347438, + "learning_rate": 1.4827067598784298e-06, + "loss": 0.85228217, + "num_input_tokens_seen": 213497225, + "router_z_loss_clip": 2.60742188, + "router_z_loss_mlp": 0.31933594, + "step": 9909, + "time_per_iteration": 2.6808347702026367 + }, + { + "auxiliary_loss_clip": 0.01474586, + "auxiliary_loss_mlp": 0.001227, + "balance_loss_clip": 1.26097393, + "balance_loss_mlp": 0.11344907, + "epoch": 0.5958214339395761, + "flos": 65940969876480.0, + "grad_norm": 0.8983636155675322, + "language_loss": 0.72977996, + "learning_rate": 1.4823305606050753e-06, + "loss": 0.74575281, + "num_input_tokens_seen": 213556890, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.09228516, + "step": 9910, + "time_per_iteration": 3.2588815689086914 + }, + { + "auxiliary_loss_clip": 0.01446701, + "auxiliary_loss_mlp": 0.00298026, + "balance_loss_clip": 1.17794657, + "balance_loss_mlp": 0.26371709, + "epoch": 0.5958815571922441, + "flos": 23218690567680.0, + "grad_norm": 233.59273087777117, + "language_loss": 0.76708531, + "learning_rate": 1.481954380961799e-06, + "loss": 0.78453255, + "num_input_tokens_seen": 213575800, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.34326172, + "step": 9911, + "time_per_iteration": 2.763695240020752 + }, + { + "auxiliary_loss_clip": 0.01474253, + "auxiliary_loss_mlp": 0.0031906, + "balance_loss_clip": 1.19577897, + "balance_loss_mlp": 0.28153306, + "epoch": 0.595941680444912, + "flos": 16538623568640.0, + "grad_norm": 95.13354104799065, + "language_loss": 0.76984066, + "learning_rate": 1.4815782209628631e-06, + "loss": 0.78777379, + "num_input_tokens_seen": 213592740, + "router_z_loss_clip": 2.78710938, + "router_z_loss_mlp": 0.37548828, + "step": 9912, + "time_per_iteration": 2.6175808906555176 + }, + { + "auxiliary_loss_clip": 0.01469708, + "auxiliary_loss_mlp": 0.00325577, + "balance_loss_clip": 1.19350398, + "balance_loss_mlp": 0.29176968, + "epoch": 0.59600180369758, + "flos": 27818883423360.0, + "grad_norm": 21.222269647692652, + "language_loss": 0.80280912, + "learning_rate": 1.4812020806225337e-06, + "loss": 0.82076192, + "num_input_tokens_seen": 213611970, + "router_z_loss_clip": 2.76367188, + "router_z_loss_mlp": 0.33813477, + "step": 9913, + "time_per_iteration": 2.7510123252868652 + }, + { + "auxiliary_loss_clip": 0.01457527, + "auxiliary_loss_mlp": 0.00306454, + "balance_loss_clip": 1.18296623, + "balance_loss_mlp": 0.26935607, + "epoch": 0.596061926950248, + "flos": 29491566013440.0, + "grad_norm": 595.4558384819093, + "language_loss": 0.87530154, + "learning_rate": 1.4808259599550738e-06, + "loss": 0.89294136, + "num_input_tokens_seen": 213632230, + "router_z_loss_clip": 2.74804688, + "router_z_loss_mlp": 0.37084961, + "step": 9914, + "time_per_iteration": 2.716421604156494 + }, + { + "auxiliary_loss_clip": 0.0145665, + "auxiliary_loss_mlp": 0.00283378, + "balance_loss_clip": 1.18516803, + "balance_loss_mlp": 0.24961753, + "epoch": 0.596122050202916, + "flos": 16836790366080.0, + "grad_norm": 14.853017466440692, + "language_loss": 0.75270629, + "learning_rate": 1.4804498589747448e-06, + "loss": 0.77010655, + "num_input_tokens_seen": 213649645, + "router_z_loss_clip": 2.71484375, + "router_z_loss_mlp": 0.33740234, + "step": 9915, + "time_per_iteration": 2.631695508956909 + }, + { + "auxiliary_loss_clip": 0.01460329, + "auxiliary_loss_mlp": 0.00302083, + "balance_loss_clip": 1.18636656, + "balance_loss_mlp": 0.26934832, + "epoch": 0.596182173455584, + "flos": 20996646393600.0, + "grad_norm": 13.77119772684494, + "language_loss": 0.86788517, + "learning_rate": 1.4800737776958095e-06, + "loss": 0.88550931, + "num_input_tokens_seen": 213668850, + "router_z_loss_clip": 2.73828125, + "router_z_loss_mlp": 0.32739258, + "step": 9916, + "time_per_iteration": 2.70361328125 + }, + { + "auxiliary_loss_clip": 0.0144133, + "auxiliary_loss_mlp": 0.00313288, + "balance_loss_clip": 1.17290998, + "balance_loss_mlp": 0.28071958, + "epoch": 0.5962422967082519, + "flos": 16065680169600.0, + "grad_norm": 39.081681970522034, + "language_loss": 0.91093624, + "learning_rate": 1.4796977161325286e-06, + "loss": 0.92848241, + "num_input_tokens_seen": 213685695, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.32568359, + "step": 9917, + "time_per_iteration": 2.824878692626953 + }, + { + "auxiliary_loss_clip": 0.01423408, + "auxiliary_loss_mlp": 0.00324617, + "balance_loss_clip": 1.16405261, + "balance_loss_mlp": 0.29393274, + "epoch": 0.5963024199609199, + "flos": 12166966995840.0, + "grad_norm": 23.479340317551916, + "language_loss": 0.85183465, + "learning_rate": 1.4793216742991625e-06, + "loss": 0.86931491, + "num_input_tokens_seen": 213703515, + "router_z_loss_clip": 2.59570312, + "router_z_loss_mlp": 0.30688477, + "step": 9918, + "time_per_iteration": 2.694160223007202 + }, + { + "auxiliary_loss_clip": 0.01459415, + "auxiliary_loss_mlp": 0.00293097, + "balance_loss_clip": 1.18749583, + "balance_loss_mlp": 0.26100564, + "epoch": 0.5963625432135878, + "flos": 28074280101120.0, + "grad_norm": 3.660428487562262, + "language_loss": 0.83240718, + "learning_rate": 1.4789456522099707e-06, + "loss": 0.84993231, + "num_input_tokens_seen": 213724170, + "router_z_loss_clip": 2.72070312, + "router_z_loss_mlp": 0.32104492, + "step": 9919, + "time_per_iteration": 2.71893310546875 + }, + { + "auxiliary_loss_clip": 0.01463309, + "auxiliary_loss_mlp": 0.00280284, + "balance_loss_clip": 1.1943146, + "balance_loss_mlp": 0.24564192, + "epoch": 0.5964226664662559, + "flos": 19860324664320.0, + "grad_norm": 9.206648454916273, + "language_loss": 0.85766345, + "learning_rate": 1.4785696498792122e-06, + "loss": 0.87509936, + "num_input_tokens_seen": 213740620, + "router_z_loss_clip": 2.69140625, + "router_z_loss_mlp": 0.34619141, + "step": 9920, + "time_per_iteration": 2.702322006225586 + }, + { + "auxiliary_loss_clip": 0.01455997, + "auxiliary_loss_mlp": 0.00296578, + "balance_loss_clip": 1.18783855, + "balance_loss_mlp": 0.26052916, + "epoch": 0.5964827897189238, + "flos": 12932618325120.0, + "grad_norm": 4.9602620581988, + "language_loss": 0.89488453, + "learning_rate": 1.4781936673211446e-06, + "loss": 0.91241026, + "num_input_tokens_seen": 213755390, + "router_z_loss_clip": 2.68164062, + "router_z_loss_mlp": 0.3605957, + "step": 9921, + "time_per_iteration": 2.638015031814575 + }, + { + "auxiliary_loss_clip": 0.01452564, + "auxiliary_loss_mlp": 0.00296444, + "balance_loss_clip": 1.18512297, + "balance_loss_mlp": 0.26437643, + "epoch": 0.5965429129715918, + "flos": 18150797698560.0, + "grad_norm": 40.96286559492331, + "language_loss": 0.90230548, + "learning_rate": 1.4778177045500252e-06, + "loss": 0.91979557, + "num_input_tokens_seen": 213773225, + "router_z_loss_clip": 2.67382812, + "router_z_loss_mlp": 0.32080078, + "step": 9922, + "time_per_iteration": 2.6307971477508545 + }, + { + "auxiliary_loss_clip": 0.01461636, + "auxiliary_loss_mlp": 0.00287726, + "balance_loss_clip": 1.19154572, + "balance_loss_mlp": 0.25325069, + "epoch": 0.5966030362242597, + "flos": 21763231476480.0, + "grad_norm": 3.0017816044522228, + "language_loss": 0.84551954, + "learning_rate": 1.477441761580111e-06, + "loss": 0.86301315, + "num_input_tokens_seen": 213791860, + "router_z_loss_clip": 2.69726562, + "router_z_loss_mlp": 0.34472656, + "step": 9923, + "time_per_iteration": 2.6804938316345215 + }, + { + "auxiliary_loss_clip": 0.01469778, + "auxiliary_loss_mlp": 0.00296416, + "balance_loss_clip": 1.18920064, + "balance_loss_mlp": 0.26027143, + "epoch": 0.5966631594769277, + "flos": 18807208790400.0, + "grad_norm": 2.484071779984118, + "language_loss": 0.8344422, + "learning_rate": 1.4770658384256573e-06, + "loss": 0.85210413, + "num_input_tokens_seen": 213809455, + "router_z_loss_clip": 2.80859375, + "router_z_loss_mlp": 0.36132812, + "step": 9924, + "time_per_iteration": 2.677790403366089 + }, + { + "auxiliary_loss_clip": 0.01462063, + "auxiliary_loss_mlp": 0.00300946, + "balance_loss_clip": 1.19376004, + "balance_loss_mlp": 0.26747221, + "epoch": 0.5967232827295956, + "flos": 14064163545600.0, + "grad_norm": 31.27193902091006, + "language_loss": 0.72972357, + "learning_rate": 1.4766899351009204e-06, + "loss": 0.74735373, + "num_input_tokens_seen": 213826615, + "router_z_loss_clip": 2.68164062, + "router_z_loss_mlp": 0.3347168, + "step": 9925, + "time_per_iteration": 2.660032033920288 + }, + { + "auxiliary_loss_clip": 0.01448111, + "auxiliary_loss_mlp": 0.00264419, + "balance_loss_clip": 1.1813519, + "balance_loss_mlp": 0.23378187, + "epoch": 0.5967834059822636, + "flos": 17238235743360.0, + "grad_norm": 2.702343106927284, + "language_loss": 0.79081291, + "learning_rate": 1.4763140516201528e-06, + "loss": 0.80793822, + "num_input_tokens_seen": 213844495, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.30639648, + "step": 9926, + "time_per_iteration": 2.684760808944702 + }, + { + "auxiliary_loss_clip": 0.0147669, + "auxiliary_loss_mlp": 0.00312011, + "balance_loss_clip": 1.19738019, + "balance_loss_mlp": 0.27708304, + "epoch": 0.5968435292349316, + "flos": 42520244284800.0, + "grad_norm": 39.28841890999236, + "language_loss": 0.78596556, + "learning_rate": 1.4759381879976088e-06, + "loss": 0.80385256, + "num_input_tokens_seen": 213869125, + "router_z_loss_clip": 2.79296875, + "router_z_loss_mlp": 0.34912109, + "step": 9927, + "time_per_iteration": 2.8469736576080322 + }, + { + "auxiliary_loss_clip": 0.01466686, + "auxiliary_loss_mlp": 0.00311002, + "balance_loss_clip": 1.18580115, + "balance_loss_mlp": 0.2729502, + "epoch": 0.5969036524875996, + "flos": 37630898945280.0, + "grad_norm": 18.239045443682112, + "language_loss": 0.72351992, + "learning_rate": 1.4755623442475415e-06, + "loss": 0.74129683, + "num_input_tokens_seen": 213891115, + "router_z_loss_clip": 2.8125, + "router_z_loss_mlp": 0.38012695, + "step": 9928, + "time_per_iteration": 2.787754535675049 + }, + { + "auxiliary_loss_clip": 0.01458785, + "auxiliary_loss_mlp": 0.00288509, + "balance_loss_clip": 1.1890806, + "balance_loss_mlp": 0.25591731, + "epoch": 0.5969637757402676, + "flos": 23148377694720.0, + "grad_norm": 2.221003982786883, + "language_loss": 0.75289595, + "learning_rate": 1.4751865203842022e-06, + "loss": 0.77036893, + "num_input_tokens_seen": 213911925, + "router_z_loss_clip": 2.69921875, + "router_z_loss_mlp": 0.32617188, + "step": 9929, + "time_per_iteration": 2.742732048034668 + }, + { + "auxiliary_loss_clip": 0.01459918, + "auxiliary_loss_mlp": 0.00280982, + "balance_loss_clip": 1.1908114, + "balance_loss_mlp": 0.24901041, + "epoch": 0.5970238989929355, + "flos": 24020934877440.0, + "grad_norm": 9.137605278035835, + "language_loss": 0.85337186, + "learning_rate": 1.4748107164218431e-06, + "loss": 0.87078089, + "num_input_tokens_seen": 213930715, + "router_z_loss_clip": 2.69335938, + "router_z_loss_mlp": 0.31982422, + "step": 9930, + "time_per_iteration": 2.6651439666748047 + }, + { + "auxiliary_loss_clip": 0.01484478, + "auxiliary_loss_mlp": 0.00287561, + "balance_loss_clip": 1.20217323, + "balance_loss_mlp": 0.25098789, + "epoch": 0.5970840222456035, + "flos": 19426883247360.0, + "grad_norm": 10.600629686384623, + "language_loss": 0.78108352, + "learning_rate": 1.4744349323747146e-06, + "loss": 0.79880393, + "num_input_tokens_seen": 213950015, + "router_z_loss_clip": 2.82226562, + "router_z_loss_mlp": 0.36572266, + "step": 9931, + "time_per_iteration": 2.676302909851074 + }, + { + "auxiliary_loss_clip": 0.0149596, + "auxiliary_loss_mlp": 0.00119435, + "balance_loss_clip": 1.27994561, + "balance_loss_mlp": 0.10942179, + "epoch": 0.5971441454982714, + "flos": 62976615235200.0, + "grad_norm": 0.8466451225774433, + "language_loss": 0.64000475, + "learning_rate": 1.474059168257065e-06, + "loss": 0.65615869, + "num_input_tokens_seen": 214003330, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.10009766, + "step": 9932, + "time_per_iteration": 3.1133086681365967 + }, + { + "auxiliary_loss_clip": 0.01441198, + "auxiliary_loss_mlp": 0.00319535, + "balance_loss_clip": 1.17533517, + "balance_loss_mlp": 0.2870152, + "epoch": 0.5972042687509395, + "flos": 20266223328000.0, + "grad_norm": 12.997371451895141, + "language_loss": 0.80515039, + "learning_rate": 1.4736834240831454e-06, + "loss": 0.82275772, + "num_input_tokens_seen": 214021680, + "router_z_loss_clip": 2.66015625, + "router_z_loss_mlp": 0.32519531, + "step": 9933, + "time_per_iteration": 2.686845302581787 + }, + { + "auxiliary_loss_clip": 0.01494077, + "auxiliary_loss_mlp": 0.00152225, + "balance_loss_clip": 1.27738965, + "balance_loss_mlp": 0.14197296, + "epoch": 0.5972643920036074, + "flos": 71652383832960.0, + "grad_norm": 0.6462812723430682, + "language_loss": 0.51303875, + "learning_rate": 1.473307699867203e-06, + "loss": 0.5295018, + "num_input_tokens_seen": 214090265, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.10253906, + "step": 9934, + "time_per_iteration": 3.2511589527130127 + }, + { + "auxiliary_loss_clip": 0.01492282, + "auxiliary_loss_mlp": 0.00194215, + "balance_loss_clip": 1.27973986, + "balance_loss_mlp": 0.18215099, + "epoch": 0.5973245152562754, + "flos": 56892702263040.0, + "grad_norm": 0.8183256918868284, + "language_loss": 0.54035115, + "learning_rate": 1.4729319956234849e-06, + "loss": 0.55721611, + "num_input_tokens_seen": 214146375, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.12060547, + "step": 9935, + "time_per_iteration": 3.091986894607544 + }, + { + "auxiliary_loss_clip": 0.01436304, + "auxiliary_loss_mlp": 0.00306821, + "balance_loss_clip": 1.16892934, + "balance_loss_mlp": 0.27296591, + "epoch": 0.5973846385089433, + "flos": 24164361884160.0, + "grad_norm": 31.703535566255717, + "language_loss": 0.7262153, + "learning_rate": 1.4725563113662394e-06, + "loss": 0.74364656, + "num_input_tokens_seen": 214165340, + "router_z_loss_clip": 2.67382812, + "router_z_loss_mlp": 0.33862305, + "step": 9936, + "time_per_iteration": 2.7064056396484375 + }, + { + "auxiliary_loss_clip": 0.01456928, + "auxiliary_loss_mlp": 0.00306356, + "balance_loss_clip": 1.18344986, + "balance_loss_mlp": 0.27230954, + "epoch": 0.5974447617616113, + "flos": 17670599752320.0, + "grad_norm": 12.159980165547088, + "language_loss": 0.74810386, + "learning_rate": 1.4721806471097103e-06, + "loss": 0.7657367, + "num_input_tokens_seen": 214181360, + "router_z_loss_clip": 2.73632812, + "router_z_loss_mlp": 0.34033203, + "step": 9937, + "time_per_iteration": 2.6277570724487305 + }, + { + "auxiliary_loss_clip": 0.01474976, + "auxiliary_loss_mlp": 0.00298282, + "balance_loss_clip": 1.19389868, + "balance_loss_mlp": 0.26421177, + "epoch": 0.5975048850142792, + "flos": 22892514140160.0, + "grad_norm": 6.048507761854739, + "language_loss": 0.87313974, + "learning_rate": 1.4718050028681442e-06, + "loss": 0.8908723, + "num_input_tokens_seen": 214198525, + "router_z_loss_clip": 2.8125, + "router_z_loss_mlp": 0.34057617, + "step": 9938, + "time_per_iteration": 4.109168291091919 + }, + { + "auxiliary_loss_clip": 0.01471874, + "auxiliary_loss_mlp": 0.00317522, + "balance_loss_clip": 1.20053542, + "balance_loss_mlp": 0.28233147, + "epoch": 0.5975650082669473, + "flos": 24353108876160.0, + "grad_norm": 23.20886952459121, + "language_loss": 0.82296008, + "learning_rate": 1.4714293786557855e-06, + "loss": 0.84085405, + "num_input_tokens_seen": 214218710, + "router_z_loss_clip": 2.71875, + "router_z_loss_mlp": 0.35205078, + "step": 9939, + "time_per_iteration": 2.8293802738189697 + }, + { + "auxiliary_loss_clip": 0.01459299, + "auxiliary_loss_mlp": 0.00286161, + "balance_loss_clip": 1.17672634, + "balance_loss_mlp": 0.25085166, + "epoch": 0.5976251315196152, + "flos": 20923352691840.0, + "grad_norm": 24.050588997409434, + "language_loss": 0.80364656, + "learning_rate": 1.471053774486878e-06, + "loss": 0.82110119, + "num_input_tokens_seen": 214237800, + "router_z_loss_clip": 2.82617188, + "router_z_loss_mlp": 0.35302734, + "step": 9940, + "time_per_iteration": 4.1443095207214355 + }, + { + "auxiliary_loss_clip": 0.01441628, + "auxiliary_loss_mlp": 0.00312594, + "balance_loss_clip": 1.18044686, + "balance_loss_mlp": 0.28019306, + "epoch": 0.5976852547722832, + "flos": 35844594658560.0, + "grad_norm": 22.760262060806372, + "language_loss": 0.75501961, + "learning_rate": 1.470678190375664e-06, + "loss": 0.77256185, + "num_input_tokens_seen": 214260355, + "router_z_loss_clip": 2.609375, + "router_z_loss_mlp": 0.32397461, + "step": 9941, + "time_per_iteration": 2.7739100456237793 + }, + { + "auxiliary_loss_clip": 0.01430977, + "auxiliary_loss_mlp": 0.00278607, + "balance_loss_clip": 1.16526663, + "balance_loss_mlp": 0.24737376, + "epoch": 0.5977453780249512, + "flos": 12855948744960.0, + "grad_norm": 17.71245679925724, + "language_loss": 0.850465, + "learning_rate": 1.470302626336386e-06, + "loss": 0.86756086, + "num_input_tokens_seen": 214277120, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.31201172, + "step": 9942, + "time_per_iteration": 2.6258857250213623 + }, + { + "auxiliary_loss_clip": 0.01451314, + "auxiliary_loss_mlp": 0.00303559, + "balance_loss_clip": 1.17525077, + "balance_loss_mlp": 0.27096742, + "epoch": 0.5978055012776191, + "flos": 20959155573120.0, + "grad_norm": 7.992167556418348, + "language_loss": 0.80958295, + "learning_rate": 1.4699270823832857e-06, + "loss": 0.82713175, + "num_input_tokens_seen": 214295300, + "router_z_loss_clip": 2.7578125, + "router_z_loss_mlp": 0.32592773, + "step": 9943, + "time_per_iteration": 4.0693888664245605 + }, + { + "auxiliary_loss_clip": 0.01440144, + "auxiliary_loss_mlp": 0.0029277, + "balance_loss_clip": 1.17322516, + "balance_loss_mlp": 0.25996354, + "epoch": 0.5978656245302871, + "flos": 34058003063040.0, + "grad_norm": 7.422623337657951, + "language_loss": 0.69532597, + "learning_rate": 1.4695515585306032e-06, + "loss": 0.71265513, + "num_input_tokens_seen": 214317050, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.328125, + "step": 9944, + "time_per_iteration": 2.7847647666931152 + }, + { + "auxiliary_loss_clip": 0.01451754, + "auxiliary_loss_mlp": 0.00261823, + "balance_loss_clip": 1.18276691, + "balance_loss_mlp": 0.2292784, + "epoch": 0.597925747782955, + "flos": 37373275624320.0, + "grad_norm": 22.0019708187025, + "language_loss": 0.79236156, + "learning_rate": 1.4691760547925795e-06, + "loss": 0.8094973, + "num_input_tokens_seen": 214337470, + "router_z_loss_clip": 2.69140625, + "router_z_loss_mlp": 0.32519531, + "step": 9945, + "time_per_iteration": 2.856948137283325 + }, + { + "auxiliary_loss_clip": 0.01439302, + "auxiliary_loss_mlp": 0.00283254, + "balance_loss_clip": 1.17165434, + "balance_loss_mlp": 0.24887398, + "epoch": 0.5979858710356231, + "flos": 25374803328000.0, + "grad_norm": 5.517424661009387, + "language_loss": 0.76749527, + "learning_rate": 1.4688005711834522e-06, + "loss": 0.78472078, + "num_input_tokens_seen": 214357975, + "router_z_loss_clip": 2.67773438, + "router_z_loss_mlp": 0.34375, + "step": 9946, + "time_per_iteration": 2.675006628036499 + }, + { + "auxiliary_loss_clip": 0.01462467, + "auxiliary_loss_mlp": 0.00271614, + "balance_loss_clip": 1.18352222, + "balance_loss_mlp": 0.23651902, + "epoch": 0.598045994288291, + "flos": 13698413308800.0, + "grad_norm": 8.167560815739536, + "language_loss": 0.96883655, + "learning_rate": 1.468425107717461e-06, + "loss": 0.98617738, + "num_input_tokens_seen": 214374125, + "router_z_loss_clip": 2.79101562, + "router_z_loss_mlp": 0.35107422, + "step": 9947, + "time_per_iteration": 2.607351064682007 + }, + { + "auxiliary_loss_clip": 0.01428167, + "auxiliary_loss_mlp": 0.00285363, + "balance_loss_clip": 1.17003906, + "balance_loss_mlp": 0.25507158, + "epoch": 0.598106117540959, + "flos": 21981352815360.0, + "grad_norm": 43.451970692821355, + "language_loss": 0.80415773, + "learning_rate": 1.4680496644088432e-06, + "loss": 0.82129306, + "num_input_tokens_seen": 214393395, + "router_z_loss_clip": 2.58203125, + "router_z_loss_mlp": 0.30310059, + "step": 9948, + "time_per_iteration": 4.051477670669556 + }, + { + "auxiliary_loss_clip": 0.01442361, + "auxiliary_loss_mlp": 0.00329912, + "balance_loss_clip": 1.16869009, + "balance_loss_mlp": 0.29526973, + "epoch": 0.5981662407936269, + "flos": 20559362221440.0, + "grad_norm": 156.0013169781479, + "language_loss": 0.96911025, + "learning_rate": 1.4676742412718347e-06, + "loss": 0.98683298, + "num_input_tokens_seen": 214411550, + "router_z_loss_clip": 2.73828125, + "router_z_loss_mlp": 0.34667969, + "step": 9949, + "time_per_iteration": 2.680999994277954 + }, + { + "auxiliary_loss_clip": 0.01399549, + "auxiliary_loss_mlp": 0.00266415, + "balance_loss_clip": 1.14603794, + "balance_loss_mlp": 0.23782897, + "epoch": 0.5982263640462949, + "flos": 14063840323200.0, + "grad_norm": 31.737759764207784, + "language_loss": 0.76224053, + "learning_rate": 1.467298838320673e-06, + "loss": 0.77890027, + "num_input_tokens_seen": 214429780, + "router_z_loss_clip": 2.53710938, + "router_z_loss_mlp": 0.28588867, + "step": 9950, + "time_per_iteration": 2.650062084197998 + }, + { + "auxiliary_loss_clip": 0.01411719, + "auxiliary_loss_mlp": 0.00287219, + "balance_loss_clip": 1.15177262, + "balance_loss_mlp": 0.25367391, + "epoch": 0.5982864872989628, + "flos": 17707228646400.0, + "grad_norm": 7.342001567037778, + "language_loss": 0.84373963, + "learning_rate": 1.4669234555695921e-06, + "loss": 0.86072898, + "num_input_tokens_seen": 214447775, + "router_z_loss_clip": 2.60546875, + "router_z_loss_mlp": 0.33544922, + "step": 9951, + "time_per_iteration": 2.673225164413452 + }, + { + "auxiliary_loss_clip": 0.01452765, + "auxiliary_loss_mlp": 0.00291251, + "balance_loss_clip": 1.17769158, + "balance_loss_mlp": 0.25408131, + "epoch": 0.5983466105516309, + "flos": 16764789553920.0, + "grad_norm": 5.413803811957908, + "language_loss": 0.80164111, + "learning_rate": 1.4665480930328275e-06, + "loss": 0.81908131, + "num_input_tokens_seen": 214467245, + "router_z_loss_clip": 2.75, + "router_z_loss_mlp": 0.37207031, + "step": 9952, + "time_per_iteration": 2.6738555431365967 + }, + { + "auxiliary_loss_clip": 0.01409831, + "auxiliary_loss_mlp": 0.00325798, + "balance_loss_clip": 1.14393866, + "balance_loss_mlp": 0.29032165, + "epoch": 0.5984067338042988, + "flos": 20042714949120.0, + "grad_norm": 24.20598760213838, + "language_loss": 0.88844258, + "learning_rate": 1.466172750724613e-06, + "loss": 0.90579885, + "num_input_tokens_seen": 214484385, + "router_z_loss_clip": 2.65820312, + "router_z_loss_mlp": 0.35449219, + "step": 9953, + "time_per_iteration": 2.6685738563537598 + }, + { + "auxiliary_loss_clip": 0.0141559, + "auxiliary_loss_mlp": 0.00310054, + "balance_loss_clip": 1.15498137, + "balance_loss_mlp": 0.27600789, + "epoch": 0.5984668570569668, + "flos": 26319900026880.0, + "grad_norm": 34.59490127669727, + "language_loss": 0.75923145, + "learning_rate": 1.4657974286591807e-06, + "loss": 0.77648795, + "num_input_tokens_seen": 214503465, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.34033203, + "step": 9954, + "time_per_iteration": 2.7091217041015625 + }, + { + "auxiliary_loss_clip": 0.01421775, + "auxiliary_loss_mlp": 0.00294897, + "balance_loss_clip": 1.15674758, + "balance_loss_mlp": 0.2621623, + "epoch": 0.5985269803096348, + "flos": 20593728558720.0, + "grad_norm": 15.342614143934, + "language_loss": 0.78620476, + "learning_rate": 1.4654221268507637e-06, + "loss": 0.80337143, + "num_input_tokens_seen": 214520725, + "router_z_loss_clip": 2.6484375, + "router_z_loss_mlp": 0.32739258, + "step": 9955, + "time_per_iteration": 2.656789541244507 + }, + { + "auxiliary_loss_clip": 0.01425061, + "auxiliary_loss_mlp": 0.00271302, + "balance_loss_clip": 1.16198909, + "balance_loss_mlp": 0.23830506, + "epoch": 0.5985871035623027, + "flos": 26865382942080.0, + "grad_norm": 8.58997471027159, + "language_loss": 0.72642851, + "learning_rate": 1.4650468453135934e-06, + "loss": 0.74339211, + "num_input_tokens_seen": 214540675, + "router_z_loss_clip": 2.6328125, + "router_z_loss_mlp": 0.32983398, + "step": 9956, + "time_per_iteration": 2.7821648120880127 + }, + { + "auxiliary_loss_clip": 0.01426443, + "auxiliary_loss_mlp": 0.002887, + "balance_loss_clip": 1.15773392, + "balance_loss_mlp": 0.25555944, + "epoch": 0.5986472268149707, + "flos": 19609704495360.0, + "grad_norm": 4.9674951190536225, + "language_loss": 0.8141045, + "learning_rate": 1.4646715840618999e-06, + "loss": 0.83125591, + "num_input_tokens_seen": 214559910, + "router_z_loss_clip": 2.68554688, + "router_z_loss_mlp": 0.33154297, + "step": 9957, + "time_per_iteration": 2.6411690711975098 + }, + { + "auxiliary_loss_clip": 0.01420599, + "auxiliary_loss_mlp": 0.00289234, + "balance_loss_clip": 1.1601181, + "balance_loss_mlp": 0.25747705, + "epoch": 0.5987073500676386, + "flos": 21794616984960.0, + "grad_norm": 127.06982704033746, + "language_loss": 0.9068898, + "learning_rate": 1.4642963431099138e-06, + "loss": 0.9239881, + "num_input_tokens_seen": 214575960, + "router_z_loss_clip": 2.609375, + "router_z_loss_mlp": 0.31726074, + "step": 9958, + "time_per_iteration": 2.6876919269561768 + }, + { + "auxiliary_loss_clip": 0.01412053, + "auxiliary_loss_mlp": 0.00314302, + "balance_loss_clip": 1.14579391, + "balance_loss_mlp": 0.2793501, + "epoch": 0.5987674733203067, + "flos": 24314361079680.0, + "grad_norm": 4.6597231288866725, + "language_loss": 0.7479493, + "learning_rate": 1.463921122471864e-06, + "loss": 0.76521283, + "num_input_tokens_seen": 214594230, + "router_z_loss_clip": 2.66210938, + "router_z_loss_mlp": 0.34985352, + "step": 9959, + "time_per_iteration": 2.703627109527588 + }, + { + "auxiliary_loss_clip": 0.01412324, + "auxiliary_loss_mlp": 0.00287018, + "balance_loss_clip": 1.14995635, + "balance_loss_mlp": 0.25657228, + "epoch": 0.5988275965729746, + "flos": 21320201128320.0, + "grad_norm": 23.130679042259803, + "language_loss": 0.89380878, + "learning_rate": 1.4635459221619796e-06, + "loss": 0.91080225, + "num_input_tokens_seen": 214613130, + "router_z_loss_clip": 2.625, + "router_z_loss_mlp": 0.30444336, + "step": 9960, + "time_per_iteration": 2.7357676029205322 + }, + { + "auxiliary_loss_clip": 0.01419354, + "auxiliary_loss_mlp": 0.0030443, + "balance_loss_clip": 1.1552521, + "balance_loss_mlp": 0.27338773, + "epoch": 0.5988877198256426, + "flos": 25118041933440.0, + "grad_norm": 47.95128779185925, + "language_loss": 0.85555208, + "learning_rate": 1.4631707421944868e-06, + "loss": 0.87278986, + "num_input_tokens_seen": 214634470, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.31030273, + "step": 9961, + "time_per_iteration": 2.697101593017578 + }, + { + "auxiliary_loss_clip": 0.0143416, + "auxiliary_loss_mlp": 0.00320286, + "balance_loss_clip": 1.16486359, + "balance_loss_mlp": 0.2882427, + "epoch": 0.5989478430783105, + "flos": 26429104350720.0, + "grad_norm": 117.63022810737114, + "language_loss": 0.72965074, + "learning_rate": 1.4627955825836136e-06, + "loss": 0.74719524, + "num_input_tokens_seen": 214654030, + "router_z_loss_clip": 2.69335938, + "router_z_loss_mlp": 0.32055664, + "step": 9962, + "time_per_iteration": 2.6937830448150635 + }, + { + "auxiliary_loss_clip": 0.01426123, + "auxiliary_loss_mlp": 0.00304516, + "balance_loss_clip": 1.16073608, + "balance_loss_mlp": 0.27163777, + "epoch": 0.5990079663309785, + "flos": 25778439434880.0, + "grad_norm": 30.52698421455057, + "language_loss": 0.79537928, + "learning_rate": 1.4624204433435857e-06, + "loss": 0.81268567, + "num_input_tokens_seen": 214676985, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.32861328, + "step": 9963, + "time_per_iteration": 2.8732857704162598 + }, + { + "auxiliary_loss_clip": 0.01425132, + "auxiliary_loss_mlp": 0.00298939, + "balance_loss_clip": 1.15832818, + "balance_loss_mlp": 0.26534623, + "epoch": 0.5990680895836464, + "flos": 36831779118720.0, + "grad_norm": 4.817555347968191, + "language_loss": 0.72408265, + "learning_rate": 1.4620453244886281e-06, + "loss": 0.74132335, + "num_input_tokens_seen": 214700105, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.3359375, + "step": 9964, + "time_per_iteration": 2.797788381576538 + }, + { + "auxiliary_loss_clip": 0.01417687, + "auxiliary_loss_mlp": 0.00292045, + "balance_loss_clip": 1.15923047, + "balance_loss_mlp": 0.26097909, + "epoch": 0.5991282128363145, + "flos": 24133550993280.0, + "grad_norm": 24.64264429824892, + "language_loss": 0.83266008, + "learning_rate": 1.4616702260329662e-06, + "loss": 0.84975743, + "num_input_tokens_seen": 214717885, + "router_z_loss_clip": 2.5859375, + "router_z_loss_mlp": 0.31054688, + "step": 9965, + "time_per_iteration": 2.7253520488739014 + }, + { + "auxiliary_loss_clip": 0.01425355, + "auxiliary_loss_mlp": 0.00324647, + "balance_loss_clip": 1.15585756, + "balance_loss_mlp": 0.2913157, + "epoch": 0.5991883360889824, + "flos": 10304064956160.0, + "grad_norm": 33.46685471446255, + "language_loss": 0.84343249, + "learning_rate": 1.4612951479908229e-06, + "loss": 0.86093247, + "num_input_tokens_seen": 214733680, + "router_z_loss_clip": 2.69335938, + "router_z_loss_mlp": 0.33325195, + "step": 9966, + "time_per_iteration": 2.613053321838379 + }, + { + "auxiliary_loss_clip": 0.01394039, + "auxiliary_loss_mlp": 0.00298493, + "balance_loss_clip": 1.13706374, + "balance_loss_mlp": 0.26841682, + "epoch": 0.5992484593416504, + "flos": 23951196622080.0, + "grad_norm": 37.94913086317696, + "language_loss": 0.80649388, + "learning_rate": 1.460920090376422e-06, + "loss": 0.82341921, + "num_input_tokens_seen": 214753285, + "router_z_loss_clip": 2.57226562, + "router_z_loss_mlp": 0.30065918, + "step": 9967, + "time_per_iteration": 2.7463812828063965 + }, + { + "auxiliary_loss_clip": 0.01432821, + "auxiliary_loss_mlp": 0.003284, + "balance_loss_clip": 1.15505719, + "balance_loss_mlp": 0.29523596, + "epoch": 0.5993085825943184, + "flos": 11944105061760.0, + "grad_norm": 4.399801749845032, + "language_loss": 0.768875, + "learning_rate": 1.4605450532039847e-06, + "loss": 0.78648728, + "num_input_tokens_seen": 214767810, + "router_z_loss_clip": 2.77539062, + "router_z_loss_mlp": 0.33154297, + "step": 9968, + "time_per_iteration": 2.6479532718658447 + }, + { + "auxiliary_loss_clip": 0.01417193, + "auxiliary_loss_mlp": 0.00369468, + "balance_loss_clip": 1.1518054, + "balance_loss_mlp": 0.33361012, + "epoch": 0.5993687058469863, + "flos": 19026838500480.0, + "grad_norm": 4.410219532635535, + "language_loss": 0.85895246, + "learning_rate": 1.4601700364877334e-06, + "loss": 0.87681901, + "num_input_tokens_seen": 214786040, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.35864258, + "step": 9969, + "time_per_iteration": 2.7295374870300293 + }, + { + "auxiliary_loss_clip": 0.01420789, + "auxiliary_loss_mlp": 0.0031866, + "balance_loss_clip": 1.15566957, + "balance_loss_mlp": 0.28394622, + "epoch": 0.5994288290996543, + "flos": 14282967242880.0, + "grad_norm": 8.359661781838778, + "language_loss": 0.86651123, + "learning_rate": 1.4597950402418889e-06, + "loss": 0.88390577, + "num_input_tokens_seen": 214803110, + "router_z_loss_clip": 2.65234375, + "router_z_loss_mlp": 0.34716797, + "step": 9970, + "time_per_iteration": 2.6438724994659424 + }, + { + "auxiliary_loss_clip": 0.01474053, + "auxiliary_loss_mlp": 0.00348452, + "balance_loss_clip": 1.19013047, + "balance_loss_mlp": 0.31090146, + "epoch": 0.5994889523523222, + "flos": 19206643006080.0, + "grad_norm": 9.018295407229688, + "language_loss": 0.71078378, + "learning_rate": 1.4594200644806697e-06, + "loss": 0.72900885, + "num_input_tokens_seen": 214819945, + "router_z_loss_clip": 2.83984375, + "router_z_loss_mlp": 0.37548828, + "step": 9971, + "time_per_iteration": 2.685835123062134 + }, + { + "auxiliary_loss_clip": 0.01414932, + "auxiliary_loss_mlp": 0.0032318, + "balance_loss_clip": 1.15447164, + "balance_loss_mlp": 0.29223365, + "epoch": 0.5995490756049903, + "flos": 28037040675840.0, + "grad_norm": 6759.724836545048, + "language_loss": 0.84792924, + "learning_rate": 1.4590451092182962e-06, + "loss": 0.86531031, + "num_input_tokens_seen": 214838810, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.30957031, + "step": 9972, + "time_per_iteration": 2.721536159515381 + }, + { + "auxiliary_loss_clip": 0.01434767, + "auxiliary_loss_mlp": 0.00337197, + "balance_loss_clip": 1.15508723, + "balance_loss_mlp": 0.30069494, + "epoch": 0.5996091988576582, + "flos": 29052953038080.0, + "grad_norm": 20.14085245547083, + "language_loss": 0.85426974, + "learning_rate": 1.4586701744689864e-06, + "loss": 0.87198937, + "num_input_tokens_seen": 214857040, + "router_z_loss_clip": 2.796875, + "router_z_loss_mlp": 0.36499023, + "step": 9973, + "time_per_iteration": 2.746941566467285 + }, + { + "auxiliary_loss_clip": 0.0141588, + "auxiliary_loss_mlp": 0.00338812, + "balance_loss_clip": 1.15154767, + "balance_loss_mlp": 0.3056246, + "epoch": 0.5996693221103262, + "flos": 20813968800000.0, + "grad_norm": 12.42926830487094, + "language_loss": 0.74373686, + "learning_rate": 1.4582952602469578e-06, + "loss": 0.76128376, + "num_input_tokens_seen": 214873375, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.33203125, + "step": 9974, + "time_per_iteration": 2.6495375633239746 + }, + { + "auxiliary_loss_clip": 0.01413327, + "auxiliary_loss_mlp": 0.00342228, + "balance_loss_clip": 1.14885831, + "balance_loss_mlp": 0.30954036, + "epoch": 0.5997294453629941, + "flos": 23768914078080.0, + "grad_norm": 1.892862428779597, + "language_loss": 0.8177439, + "learning_rate": 1.457920366566428e-06, + "loss": 0.83529943, + "num_input_tokens_seen": 214893900, + "router_z_loss_clip": 2.64648438, + "router_z_loss_mlp": 0.32702637, + "step": 9975, + "time_per_iteration": 2.732412815093994 + }, + { + "auxiliary_loss_clip": 0.01436103, + "auxiliary_loss_mlp": 0.00330124, + "balance_loss_clip": 1.16395473, + "balance_loss_mlp": 0.29715109, + "epoch": 0.5997895686156621, + "flos": 20960017499520.0, + "grad_norm": 6.598833590115363, + "language_loss": 0.86684233, + "learning_rate": 1.457545493441611e-06, + "loss": 0.88450456, + "num_input_tokens_seen": 214912110, + "router_z_loss_clip": 2.72070312, + "router_z_loss_mlp": 0.32983398, + "step": 9976, + "time_per_iteration": 2.6756153106689453 + }, + { + "auxiliary_loss_clip": 0.01422332, + "auxiliary_loss_mlp": 0.00320024, + "balance_loss_clip": 1.15816283, + "balance_loss_mlp": 0.28743201, + "epoch": 0.59984969186833, + "flos": 28365443746560.0, + "grad_norm": 26.868308199820838, + "language_loss": 0.84783989, + "learning_rate": 1.4571706408867237e-06, + "loss": 0.86526346, + "num_input_tokens_seen": 214930140, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.32568359, + "step": 9977, + "time_per_iteration": 2.7380430698394775 + }, + { + "auxiliary_loss_clip": 0.01415575, + "auxiliary_loss_mlp": 0.00332755, + "balance_loss_clip": 1.15214062, + "balance_loss_mlp": 0.30049741, + "epoch": 0.5999098151209981, + "flos": 22565906749440.0, + "grad_norm": 2.7700301138224472, + "language_loss": 0.75697339, + "learning_rate": 1.4567958089159802e-06, + "loss": 0.77445674, + "num_input_tokens_seen": 214949200, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.32275391, + "step": 9978, + "time_per_iteration": 2.693101644515991 + }, + { + "auxiliary_loss_clip": 0.01421437, + "auxiliary_loss_mlp": 0.00340515, + "balance_loss_clip": 1.15556455, + "balance_loss_mlp": 0.30735072, + "epoch": 0.599969938373666, + "flos": 18768712389120.0, + "grad_norm": 94.87843972793368, + "language_loss": 0.90451181, + "learning_rate": 1.456420997543594e-06, + "loss": 0.92213136, + "num_input_tokens_seen": 214965775, + "router_z_loss_clip": 2.66015625, + "router_z_loss_mlp": 0.33154297, + "step": 9979, + "time_per_iteration": 2.6999776363372803 + }, + { + "auxiliary_loss_clip": 0.01407356, + "auxiliary_loss_mlp": 0.00316183, + "balance_loss_clip": 1.15129149, + "balance_loss_mlp": 0.28572461, + "epoch": 0.600030061626334, + "flos": 11327231865600.0, + "grad_norm": 33.330531075699845, + "language_loss": 0.78634769, + "learning_rate": 1.4560462067837782e-06, + "loss": 0.80358303, + "num_input_tokens_seen": 214982480, + "router_z_loss_clip": 2.56054688, + "router_z_loss_mlp": 0.30480957, + "step": 9980, + "time_per_iteration": 2.6430375576019287 + }, + { + "auxiliary_loss_clip": 0.01456948, + "auxiliary_loss_mlp": 0.00367139, + "balance_loss_clip": 1.17284119, + "balance_loss_mlp": 0.32982603, + "epoch": 0.600090184879002, + "flos": 16578664254720.0, + "grad_norm": 74.7111821112081, + "language_loss": 0.81710738, + "learning_rate": 1.4556714366507445e-06, + "loss": 0.83534825, + "num_input_tokens_seen": 214998110, + "router_z_loss_clip": 2.84179688, + "router_z_loss_mlp": 0.37329102, + "step": 9981, + "time_per_iteration": 4.097184896469116 + }, + { + "auxiliary_loss_clip": 0.01415323, + "auxiliary_loss_mlp": 0.00331601, + "balance_loss_clip": 1.15387344, + "balance_loss_mlp": 0.29931867, + "epoch": 0.6001503081316699, + "flos": 23618627573760.0, + "grad_norm": 21.386118672943045, + "language_loss": 0.84527588, + "learning_rate": 1.4552966871587048e-06, + "loss": 0.86274511, + "num_input_tokens_seen": 215017995, + "router_z_loss_clip": 2.61523438, + "router_z_loss_mlp": 0.32275391, + "step": 9982, + "time_per_iteration": 4.168829441070557 + }, + { + "auxiliary_loss_clip": 0.01442417, + "auxiliary_loss_mlp": 0.00318097, + "balance_loss_clip": 1.16906667, + "balance_loss_mlp": 0.28421801, + "epoch": 0.6002104313843379, + "flos": 20667668705280.0, + "grad_norm": 6.584850856373448, + "language_loss": 0.79686552, + "learning_rate": 1.4549219583218686e-06, + "loss": 0.81447065, + "num_input_tokens_seen": 215038285, + "router_z_loss_clip": 2.73242188, + "router_z_loss_mlp": 0.33886719, + "step": 9983, + "time_per_iteration": 2.7342581748962402 + }, + { + "auxiliary_loss_clip": 0.01428253, + "auxiliary_loss_mlp": 0.00341015, + "balance_loss_clip": 1.15854931, + "balance_loss_mlp": 0.30727839, + "epoch": 0.6002705546370058, + "flos": 22455229968000.0, + "grad_norm": 8.7836698087294, + "language_loss": 0.87948149, + "learning_rate": 1.454547250154447e-06, + "loss": 0.89717418, + "num_input_tokens_seen": 215057825, + "router_z_loss_clip": 2.69726562, + "router_z_loss_mlp": 0.3371582, + "step": 9984, + "time_per_iteration": 2.6695799827575684 + }, + { + "auxiliary_loss_clip": 0.0141668, + "auxiliary_loss_mlp": 0.00334178, + "balance_loss_clip": 1.15147495, + "balance_loss_mlp": 0.30187231, + "epoch": 0.6003306778896739, + "flos": 25191982080000.0, + "grad_norm": 3.042314356800506, + "language_loss": 0.90574348, + "learning_rate": 1.4541725626706485e-06, + "loss": 0.92325205, + "num_input_tokens_seen": 215077790, + "router_z_loss_clip": 2.66015625, + "router_z_loss_mlp": 0.32275391, + "step": 9985, + "time_per_iteration": 4.211479902267456 + }, + { + "auxiliary_loss_clip": 0.01418196, + "auxiliary_loss_mlp": 0.00328035, + "balance_loss_clip": 1.15351558, + "balance_loss_mlp": 0.29664764, + "epoch": 0.6003908011423418, + "flos": 26687733252480.0, + "grad_norm": 16.329012150277222, + "language_loss": 0.77744532, + "learning_rate": 1.4537978958846809e-06, + "loss": 0.79490769, + "num_input_tokens_seen": 215097650, + "router_z_loss_clip": 2.6484375, + "router_z_loss_mlp": 0.31396484, + "step": 9986, + "time_per_iteration": 2.726940870285034 + }, + { + "auxiliary_loss_clip": 0.01406799, + "auxiliary_loss_mlp": 0.00329612, + "balance_loss_clip": 1.14479637, + "balance_loss_mlp": 0.2962814, + "epoch": 0.6004509243950098, + "flos": 22565080736640.0, + "grad_norm": 15.808759722515509, + "language_loss": 0.78561515, + "learning_rate": 1.4534232498107514e-06, + "loss": 0.80297923, + "num_input_tokens_seen": 215118235, + "router_z_loss_clip": 2.62109375, + "router_z_loss_mlp": 0.33374023, + "step": 9987, + "time_per_iteration": 2.6985161304473877 + }, + { + "auxiliary_loss_clip": 0.01398374, + "auxiliary_loss_mlp": 0.00314395, + "balance_loss_clip": 1.14199054, + "balance_loss_mlp": 0.28349549, + "epoch": 0.6005110476476777, + "flos": 19719303868800.0, + "grad_norm": 21.174343423968683, + "language_loss": 0.91119695, + "learning_rate": 1.4530486244630673e-06, + "loss": 0.92832458, + "num_input_tokens_seen": 215136755, + "router_z_loss_clip": 2.56640625, + "router_z_loss_mlp": 0.30883789, + "step": 9988, + "time_per_iteration": 2.689006805419922 + }, + { + "auxiliary_loss_clip": 0.01408264, + "auxiliary_loss_mlp": 0.00366229, + "balance_loss_clip": 1.14733624, + "balance_loss_mlp": 0.33471, + "epoch": 0.6005711709003457, + "flos": 17712543859200.0, + "grad_norm": 55.3597455517253, + "language_loss": 0.7397275, + "learning_rate": 1.4526740198558346e-06, + "loss": 0.75747252, + "num_input_tokens_seen": 215155225, + "router_z_loss_clip": 2.61328125, + "router_z_loss_mlp": 0.31494141, + "step": 9989, + "time_per_iteration": 4.088709831237793 + }, + { + "auxiliary_loss_clip": 0.01396081, + "auxiliary_loss_mlp": 0.00354184, + "balance_loss_clip": 1.1405251, + "balance_loss_mlp": 0.32214028, + "epoch": 0.6006312941530136, + "flos": 18514464946560.0, + "grad_norm": 3.061140142917169, + "language_loss": 0.86524117, + "learning_rate": 1.452299436003257e-06, + "loss": 0.88274384, + "num_input_tokens_seen": 215174815, + "router_z_loss_clip": 2.55664062, + "router_z_loss_mlp": 0.3203125, + "step": 9990, + "time_per_iteration": 2.7315266132354736 + }, + { + "auxiliary_loss_clip": 0.01421567, + "auxiliary_loss_mlp": 0.00334098, + "balance_loss_clip": 1.15310717, + "balance_loss_mlp": 0.29995665, + "epoch": 0.6006914174056817, + "flos": 21390837223680.0, + "grad_norm": 15.166533304580021, + "language_loss": 0.88687658, + "learning_rate": 1.4519248729195403e-06, + "loss": 0.90443325, + "num_input_tokens_seen": 215192045, + "router_z_loss_clip": 2.6875, + "router_z_loss_mlp": 0.34130859, + "step": 9991, + "time_per_iteration": 2.728395700454712 + }, + { + "auxiliary_loss_clip": 0.01408061, + "auxiliary_loss_mlp": 0.00323974, + "balance_loss_clip": 1.14694834, + "balance_loss_mlp": 0.29338509, + "epoch": 0.6007515406583496, + "flos": 12750515349120.0, + "grad_norm": 10.353042928028572, + "language_loss": 0.89440346, + "learning_rate": 1.4515503306188878e-06, + "loss": 0.91172385, + "num_input_tokens_seen": 215209885, + "router_z_loss_clip": 2.61523438, + "router_z_loss_mlp": 0.30566406, + "step": 9992, + "time_per_iteration": 2.6152234077453613 + }, + { + "auxiliary_loss_clip": 0.01385108, + "auxiliary_loss_mlp": 0.0036004, + "balance_loss_clip": 1.13179946, + "balance_loss_mlp": 0.32773402, + "epoch": 0.6008116639110176, + "flos": 19206894401280.0, + "grad_norm": 11.692409729580566, + "language_loss": 0.74481988, + "learning_rate": 1.4511758091155008e-06, + "loss": 0.7622714, + "num_input_tokens_seen": 215228150, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.32299805, + "step": 9993, + "time_per_iteration": 2.6675519943237305 + }, + { + "auxiliary_loss_clip": 0.01399492, + "auxiliary_loss_mlp": 0.00365354, + "balance_loss_clip": 1.13650084, + "balance_loss_mlp": 0.33395433, + "epoch": 0.6008717871636855, + "flos": 17055342668160.0, + "grad_norm": 165.05488672115393, + "language_loss": 0.89324188, + "learning_rate": 1.4508013084235826e-06, + "loss": 0.9108904, + "num_input_tokens_seen": 215243755, + "router_z_loss_clip": 2.62890625, + "router_z_loss_mlp": 0.31347656, + "step": 9994, + "time_per_iteration": 2.6259684562683105 + }, + { + "auxiliary_loss_clip": 0.01383571, + "auxiliary_loss_mlp": 0.00305486, + "balance_loss_clip": 1.13291883, + "balance_loss_mlp": 0.27716172, + "epoch": 0.6009319104163535, + "flos": 20298686244480.0, + "grad_norm": 61.14265997248346, + "language_loss": 0.76577556, + "learning_rate": 1.4504268285573337e-06, + "loss": 0.78266621, + "num_input_tokens_seen": 215262130, + "router_z_loss_clip": 2.5078125, + "router_z_loss_mlp": 0.28344727, + "step": 9995, + "time_per_iteration": 2.6715784072875977 + }, + { + "auxiliary_loss_clip": 0.01379922, + "auxiliary_loss_mlp": 0.0038359, + "balance_loss_clip": 1.12658429, + "balance_loss_mlp": 0.35130814, + "epoch": 0.6009920336690215, + "flos": 21836776573440.0, + "grad_norm": 116.335673296125, + "language_loss": 0.86408412, + "learning_rate": 1.4500523695309546e-06, + "loss": 0.88171923, + "num_input_tokens_seen": 215281785, + "router_z_loss_clip": 2.53515625, + "router_z_loss_mlp": 0.32275391, + "step": 9996, + "time_per_iteration": 2.6766254901885986 + }, + { + "auxiliary_loss_clip": 0.01394947, + "auxiliary_loss_mlp": 0.00346584, + "balance_loss_clip": 1.13672185, + "balance_loss_mlp": 0.31688845, + "epoch": 0.6010521569216895, + "flos": 22596107109120.0, + "grad_norm": 89.90947501264846, + "language_loss": 0.86918008, + "learning_rate": 1.4496779313586447e-06, + "loss": 0.88659537, + "num_input_tokens_seen": 215297550, + "router_z_loss_clip": 2.58789062, + "router_z_loss_mlp": 0.29675293, + "step": 9997, + "time_per_iteration": 2.783836603164673 + }, + { + "auxiliary_loss_clip": 0.01397064, + "auxiliary_loss_mlp": 0.00352174, + "balance_loss_clip": 1.13661647, + "balance_loss_mlp": 0.31815159, + "epoch": 0.6011122801743575, + "flos": 19171702051200.0, + "grad_norm": 127.2899225913262, + "language_loss": 0.810799, + "learning_rate": 1.4493035140546028e-06, + "loss": 0.82829148, + "num_input_tokens_seen": 215316360, + "router_z_loss_clip": 2.60546875, + "router_z_loss_mlp": 0.33984375, + "step": 9998, + "time_per_iteration": 2.6948649883270264 + }, + { + "auxiliary_loss_clip": 0.01392374, + "auxiliary_loss_mlp": 0.00355598, + "balance_loss_clip": 1.13603759, + "balance_loss_mlp": 0.32133746, + "epoch": 0.6011724034270254, + "flos": 25010022758400.0, + "grad_norm": 7.8213173865757835, + "language_loss": 0.78690922, + "learning_rate": 1.448929117633027e-06, + "loss": 0.80438888, + "num_input_tokens_seen": 215336405, + "router_z_loss_clip": 2.5625, + "router_z_loss_mlp": 0.34228516, + "step": 9999, + "time_per_iteration": 2.7241053581237793 + }, + { + "auxiliary_loss_clip": 0.0139637, + "auxiliary_loss_mlp": 0.0036153, + "balance_loss_clip": 1.13139212, + "balance_loss_mlp": 0.32886651, + "epoch": 0.6012325266796934, + "flos": 21797669640960.0, + "grad_norm": 212.4734671827191, + "language_loss": 0.84363669, + "learning_rate": 1.4485547421081142e-06, + "loss": 0.86121571, + "num_input_tokens_seen": 215356590, + "router_z_loss_clip": 2.65039062, + "router_z_loss_mlp": 0.32666016, + "step": 10000, + "time_per_iteration": 2.752117156982422 + }, + { + "auxiliary_loss_clip": 0.01399424, + "auxiliary_loss_mlp": 0.00371839, + "balance_loss_clip": 1.13352656, + "balance_loss_mlp": 0.33722112, + "epoch": 0.6012926499323613, + "flos": 19573003774080.0, + "grad_norm": 33.91547378385103, + "language_loss": 0.86740804, + "learning_rate": 1.4481803874940608e-06, + "loss": 0.88512075, + "num_input_tokens_seen": 215374295, + "router_z_loss_clip": 2.66210938, + "router_z_loss_mlp": 0.34594727, + "step": 10001, + "time_per_iteration": 2.630244493484497 + }, + { + "auxiliary_loss_clip": 0.01386873, + "auxiliary_loss_mlp": 0.00352205, + "balance_loss_clip": 1.12464726, + "balance_loss_mlp": 0.31858772, + "epoch": 0.6013527731850293, + "flos": 34860786076800.0, + "grad_norm": 120.95259470600065, + "language_loss": 0.65830135, + "learning_rate": 1.4478060538050624e-06, + "loss": 0.6756922, + "num_input_tokens_seen": 215394535, + "router_z_loss_clip": 2.62695312, + "router_z_loss_mlp": 0.33642578, + "step": 10002, + "time_per_iteration": 2.8082430362701416 + }, + { + "auxiliary_loss_clip": 0.01401788, + "auxiliary_loss_mlp": 0.00368575, + "balance_loss_clip": 1.13562226, + "balance_loss_mlp": 0.33398014, + "epoch": 0.6014128964376972, + "flos": 23291948355840.0, + "grad_norm": 18.84599855044039, + "language_loss": 0.84107, + "learning_rate": 1.447431741055314e-06, + "loss": 0.85877365, + "num_input_tokens_seen": 215414355, + "router_z_loss_clip": 2.66210938, + "router_z_loss_mlp": 0.34594727, + "step": 10003, + "time_per_iteration": 2.731093645095825 + }, + { + "auxiliary_loss_clip": 0.01363977, + "auxiliary_loss_mlp": 0.00348852, + "balance_loss_clip": 1.10672331, + "balance_loss_mlp": 0.3176426, + "epoch": 0.6014730196903653, + "flos": 24820916630400.0, + "grad_norm": 237.16173582452151, + "language_loss": 0.84011924, + "learning_rate": 1.4470574492590091e-06, + "loss": 0.85724747, + "num_input_tokens_seen": 215428280, + "router_z_loss_clip": 2.5703125, + "router_z_loss_mlp": 0.31201172, + "step": 10004, + "time_per_iteration": 2.691577196121216 + }, + { + "auxiliary_loss_clip": 0.0139457, + "auxiliary_loss_mlp": 0.003245, + "balance_loss_clip": 1.13333392, + "balance_loss_mlp": 0.29329115, + "epoch": 0.6015331429430332, + "flos": 23112359331840.0, + "grad_norm": 18.204634744951473, + "language_loss": 0.80178177, + "learning_rate": 1.4466831784303408e-06, + "loss": 0.81897247, + "num_input_tokens_seen": 215448970, + "router_z_loss_clip": 2.61328125, + "router_z_loss_mlp": 0.31201172, + "step": 10005, + "time_per_iteration": 2.662774085998535 + }, + { + "auxiliary_loss_clip": 0.0135618, + "auxiliary_loss_mlp": 0.00362015, + "balance_loss_clip": 1.10944676, + "balance_loss_mlp": 0.3304956, + "epoch": 0.6015932661957012, + "flos": 19201363706880.0, + "grad_norm": 10.446694389590224, + "language_loss": 0.81328583, + "learning_rate": 1.4463089285835026e-06, + "loss": 0.83046782, + "num_input_tokens_seen": 215465260, + "router_z_loss_clip": 2.46484375, + "router_z_loss_mlp": 0.31567383, + "step": 10006, + "time_per_iteration": 2.656230926513672 + }, + { + "auxiliary_loss_clip": 0.01371245, + "auxiliary_loss_mlp": 0.0036986, + "balance_loss_clip": 1.11591864, + "balance_loss_mlp": 0.33524194, + "epoch": 0.6016533894483691, + "flos": 18113630100480.0, + "grad_norm": 8.52947219937548, + "language_loss": 0.82551503, + "learning_rate": 1.445934699732685e-06, + "loss": 0.84292614, + "num_input_tokens_seen": 215482725, + "router_z_loss_clip": 2.54882812, + "router_z_loss_mlp": 0.34619141, + "step": 10007, + "time_per_iteration": 2.638892650604248 + }, + { + "auxiliary_loss_clip": 0.01365793, + "auxiliary_loss_mlp": 0.00363211, + "balance_loss_clip": 1.1106261, + "balance_loss_mlp": 0.33264586, + "epoch": 0.6017135127010371, + "flos": 16216900427520.0, + "grad_norm": 171.01913804272465, + "language_loss": 0.7793895, + "learning_rate": 1.4455604918920785e-06, + "loss": 0.79667956, + "num_input_tokens_seen": 215500420, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.30541992, + "step": 10008, + "time_per_iteration": 2.6441187858581543 + }, + { + "auxiliary_loss_clip": 0.01383902, + "auxiliary_loss_mlp": 0.00371711, + "balance_loss_clip": 1.1241219, + "balance_loss_mlp": 0.3383683, + "epoch": 0.6017736359537051, + "flos": 23444246021760.0, + "grad_norm": 5.3810085446696405, + "language_loss": 0.82220966, + "learning_rate": 1.4451863050758748e-06, + "loss": 0.83976573, + "num_input_tokens_seen": 215522260, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.33325195, + "step": 10009, + "time_per_iteration": 2.7488009929656982 + }, + { + "auxiliary_loss_clip": 0.01373333, + "auxiliary_loss_mlp": 0.0036897, + "balance_loss_clip": 1.11871123, + "balance_loss_mlp": 0.33401746, + "epoch": 0.601833759206373, + "flos": 23514056104320.0, + "grad_norm": 165.24135270335407, + "language_loss": 0.82287663, + "learning_rate": 1.4448121392982608e-06, + "loss": 0.84029967, + "num_input_tokens_seen": 215541715, + "router_z_loss_clip": 2.546875, + "router_z_loss_mlp": 0.34985352, + "step": 10010, + "time_per_iteration": 2.690584182739258 + }, + { + "auxiliary_loss_clip": 0.01419469, + "auxiliary_loss_mlp": 0.00175042, + "balance_loss_clip": 1.22174513, + "balance_loss_mlp": 0.1637409, + "epoch": 0.6018938824590411, + "flos": 63991668648960.0, + "grad_norm": 0.796262007626621, + "language_loss": 0.54903471, + "learning_rate": 1.4444379945734268e-06, + "loss": 0.56497979, + "num_input_tokens_seen": 215603020, + "router_z_loss_clip": 1.9765625, + "router_z_loss_mlp": 0.11279297, + "step": 10011, + "time_per_iteration": 3.352522850036621 + }, + { + "auxiliary_loss_clip": 0.01371451, + "auxiliary_loss_mlp": 0.00342503, + "balance_loss_clip": 1.11778474, + "balance_loss_mlp": 0.3104839, + "epoch": 0.601954005711709, + "flos": 34640007131520.0, + "grad_norm": 9.087404836282108, + "language_loss": 0.67406696, + "learning_rate": 1.44406387091556e-06, + "loss": 0.69120657, + "num_input_tokens_seen": 215625115, + "router_z_loss_clip": 2.53515625, + "router_z_loss_mlp": 0.32055664, + "step": 10012, + "time_per_iteration": 2.784060001373291 + }, + { + "auxiliary_loss_clip": 0.01385787, + "auxiliary_loss_mlp": 0.00342689, + "balance_loss_clip": 1.13004017, + "balance_loss_mlp": 0.31206399, + "epoch": 0.602014128964377, + "flos": 19427062815360.0, + "grad_norm": 16.96608319141926, + "language_loss": 0.8038578, + "learning_rate": 1.4436897683388462e-06, + "loss": 0.82114255, + "num_input_tokens_seen": 215643730, + "router_z_loss_clip": 2.56054688, + "router_z_loss_mlp": 0.30603027, + "step": 10013, + "time_per_iteration": 2.693093776702881 + }, + { + "auxiliary_loss_clip": 0.01378521, + "auxiliary_loss_mlp": 0.0033451, + "balance_loss_clip": 1.12764573, + "balance_loss_mlp": 0.30313414, + "epoch": 0.6020742522170449, + "flos": 28329389470080.0, + "grad_norm": 17.62575474386003, + "language_loss": 0.86706638, + "learning_rate": 1.4433156868574732e-06, + "loss": 0.8841967, + "num_input_tokens_seen": 215664425, + "router_z_loss_clip": 2.50976562, + "router_z_loss_mlp": 0.3137207, + "step": 10014, + "time_per_iteration": 2.7528038024902344 + }, + { + "auxiliary_loss_clip": 0.01382223, + "auxiliary_loss_mlp": 0.00330473, + "balance_loss_clip": 1.12719321, + "balance_loss_mlp": 0.30033669, + "epoch": 0.6021343754697129, + "flos": 22747040058240.0, + "grad_norm": 126.87806393996347, + "language_loss": 0.78545964, + "learning_rate": 1.442941626485624e-06, + "loss": 0.80258662, + "num_input_tokens_seen": 215684280, + "router_z_loss_clip": 2.54492188, + "router_z_loss_mlp": 0.30151367, + "step": 10015, + "time_per_iteration": 2.725839853286743 + }, + { + "auxiliary_loss_clip": 0.01447186, + "auxiliary_loss_mlp": 0.00193064, + "balance_loss_clip": 1.23957062, + "balance_loss_mlp": 0.18128571, + "epoch": 0.6021944987223808, + "flos": 65752007402880.0, + "grad_norm": 0.7952304105629564, + "language_loss": 0.54120386, + "learning_rate": 1.4425675872374848e-06, + "loss": 0.55760646, + "num_input_tokens_seen": 215739780, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.11767578, + "step": 10016, + "time_per_iteration": 3.111849784851074 + }, + { + "auxiliary_loss_clip": 0.01385223, + "auxiliary_loss_mlp": 0.00381718, + "balance_loss_clip": 1.12660623, + "balance_loss_mlp": 0.34760031, + "epoch": 0.6022546219750489, + "flos": 16105182151680.0, + "grad_norm": 182.24485996368898, + "language_loss": 0.88485789, + "learning_rate": 1.4421935691272381e-06, + "loss": 0.90252733, + "num_input_tokens_seen": 215757885, + "router_z_loss_clip": 2.5859375, + "router_z_loss_mlp": 0.34155273, + "step": 10017, + "time_per_iteration": 2.790952205657959 + }, + { + "auxiliary_loss_clip": 0.01384378, + "auxiliary_loss_mlp": 0.00357076, + "balance_loss_clip": 1.1296382, + "balance_loss_mlp": 0.32605767, + "epoch": 0.6023147452277168, + "flos": 25512555985920.0, + "grad_norm": 14.720220608036652, + "language_loss": 0.89246893, + "learning_rate": 1.4418195721690677e-06, + "loss": 0.9098835, + "num_input_tokens_seen": 215776415, + "router_z_loss_clip": 2.54492188, + "router_z_loss_mlp": 0.31005859, + "step": 10018, + "time_per_iteration": 2.6985058784484863 + }, + { + "auxiliary_loss_clip": 0.01395153, + "auxiliary_loss_mlp": 0.00390667, + "balance_loss_clip": 1.12641501, + "balance_loss_mlp": 0.35604897, + "epoch": 0.6023748684803848, + "flos": 22636075968000.0, + "grad_norm": 6.619338265337981, + "language_loss": 0.85383737, + "learning_rate": 1.4414455963771549e-06, + "loss": 0.87169552, + "num_input_tokens_seen": 215794865, + "router_z_loss_clip": 2.6875, + "router_z_loss_mlp": 0.34594727, + "step": 10019, + "time_per_iteration": 2.6992669105529785 + }, + { + "auxiliary_loss_clip": 0.0141248, + "auxiliary_loss_mlp": 0.00350844, + "balance_loss_clip": 1.14784706, + "balance_loss_mlp": 0.31777489, + "epoch": 0.6024349917330527, + "flos": 26210444307840.0, + "grad_norm": 1093.0814034223424, + "language_loss": 0.78414643, + "learning_rate": 1.441071641765681e-06, + "loss": 0.80177963, + "num_input_tokens_seen": 215816840, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.33081055, + "step": 10020, + "time_per_iteration": 2.7343335151672363 + }, + { + "auxiliary_loss_clip": 0.01407783, + "auxiliary_loss_mlp": 0.00365303, + "balance_loss_clip": 1.13937521, + "balance_loss_mlp": 0.33230591, + "epoch": 0.6024951149857207, + "flos": 21251755762560.0, + "grad_norm": 11.012329164623267, + "language_loss": 0.71682167, + "learning_rate": 1.4406977083488264e-06, + "loss": 0.7345525, + "num_input_tokens_seen": 215836100, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.32983398, + "step": 10021, + "time_per_iteration": 2.6939637660980225 + }, + { + "auxiliary_loss_clip": 0.01389736, + "auxiliary_loss_mlp": 0.00369549, + "balance_loss_clip": 1.13008201, + "balance_loss_mlp": 0.33612305, + "epoch": 0.6025552382383887, + "flos": 26943453152640.0, + "grad_norm": 28.28146704550219, + "language_loss": 0.86402833, + "learning_rate": 1.4403237961407704e-06, + "loss": 0.88162118, + "num_input_tokens_seen": 215858480, + "router_z_loss_clip": 2.59960938, + "router_z_loss_mlp": 0.33422852, + "step": 10022, + "time_per_iteration": 4.189125061035156 + }, + { + "auxiliary_loss_clip": 0.01416716, + "auxiliary_loss_mlp": 0.00393812, + "balance_loss_clip": 1.14870226, + "balance_loss_mlp": 0.35917026, + "epoch": 0.6026153614910567, + "flos": 31684379495040.0, + "grad_norm": 199.89469858768322, + "language_loss": 0.72753763, + "learning_rate": 1.439949905155693e-06, + "loss": 0.7456429, + "num_input_tokens_seen": 215879950, + "router_z_loss_clip": 2.67773438, + "router_z_loss_mlp": 0.34643555, + "step": 10023, + "time_per_iteration": 2.7947092056274414 + }, + { + "auxiliary_loss_clip": 0.01384301, + "auxiliary_loss_mlp": 0.00379743, + "balance_loss_clip": 1.12416601, + "balance_loss_mlp": 0.34846276, + "epoch": 0.6026754847437247, + "flos": 29312731175040.0, + "grad_norm": 4.446007709718987, + "language_loss": 0.82879972, + "learning_rate": 1.4395760354077707e-06, + "loss": 0.84644008, + "num_input_tokens_seen": 215899830, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.31298828, + "step": 10024, + "time_per_iteration": 4.198841094970703 + }, + { + "auxiliary_loss_clip": 0.01413043, + "auxiliary_loss_mlp": 0.00355383, + "balance_loss_clip": 1.14978266, + "balance_loss_mlp": 0.32090729, + "epoch": 0.6027356079963926, + "flos": 23586775188480.0, + "grad_norm": 32.73060047105878, + "language_loss": 0.80887246, + "learning_rate": 1.4392021869111815e-06, + "loss": 0.82655674, + "num_input_tokens_seen": 215920440, + "router_z_loss_clip": 2.6328125, + "router_z_loss_mlp": 0.3449707, + "step": 10025, + "time_per_iteration": 2.698819637298584 + }, + { + "auxiliary_loss_clip": 0.01400091, + "auxiliary_loss_mlp": 0.00409323, + "balance_loss_clip": 1.13130748, + "balance_loss_mlp": 0.37091392, + "epoch": 0.6027957312490606, + "flos": 20813753318400.0, + "grad_norm": 87.38505111028007, + "language_loss": 0.78043985, + "learning_rate": 1.4388283596801016e-06, + "loss": 0.79853398, + "num_input_tokens_seen": 215940535, + "router_z_loss_clip": 2.6875, + "router_z_loss_mlp": 0.38427734, + "step": 10026, + "time_per_iteration": 2.697399139404297 + }, + { + "auxiliary_loss_clip": 0.01376961, + "auxiliary_loss_mlp": 0.0038991, + "balance_loss_clip": 1.1268183, + "balance_loss_mlp": 0.35836738, + "epoch": 0.6028558545017285, + "flos": 19935773182080.0, + "grad_norm": 8.582245925781649, + "language_loss": 0.85602224, + "learning_rate": 1.4384545537287061e-06, + "loss": 0.87369096, + "num_input_tokens_seen": 215958045, + "router_z_loss_clip": 2.5, + "router_z_loss_mlp": 0.31542969, + "step": 10027, + "time_per_iteration": 4.059483051300049 + }, + { + "auxiliary_loss_clip": 0.01404831, + "auxiliary_loss_mlp": 0.00370006, + "balance_loss_clip": 1.13367069, + "balance_loss_mlp": 0.33636466, + "epoch": 0.6029159777543965, + "flos": 22820836550400.0, + "grad_norm": 32.01795149673977, + "language_loss": 0.80158675, + "learning_rate": 1.438080769071171e-06, + "loss": 0.8193351, + "num_input_tokens_seen": 215977330, + "router_z_loss_clip": 2.7109375, + "router_z_loss_mlp": 0.33642578, + "step": 10028, + "time_per_iteration": 2.69870662689209 + }, + { + "auxiliary_loss_clip": 0.01408731, + "auxiliary_loss_mlp": 0.00357698, + "balance_loss_clip": 1.14081025, + "balance_loss_mlp": 0.32400972, + "epoch": 0.6029761010070644, + "flos": 23587242065280.0, + "grad_norm": 5.091545404845053, + "language_loss": 0.89652336, + "learning_rate": 1.437707005721669e-06, + "loss": 0.91418767, + "num_input_tokens_seen": 215997865, + "router_z_loss_clip": 2.67578125, + "router_z_loss_mlp": 0.33691406, + "step": 10029, + "time_per_iteration": 2.734306812286377 + }, + { + "auxiliary_loss_clip": 0.01386816, + "auxiliary_loss_mlp": 0.00397401, + "balance_loss_clip": 1.13176537, + "balance_loss_mlp": 0.36354566, + "epoch": 0.6030362242597325, + "flos": 13662430859520.0, + "grad_norm": 22.679296217885597, + "language_loss": 0.87580943, + "learning_rate": 1.437333263694373e-06, + "loss": 0.8936516, + "num_input_tokens_seen": 216016230, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.33862305, + "step": 10030, + "time_per_iteration": 2.6607537269592285 + }, + { + "auxiliary_loss_clip": 0.01392784, + "auxiliary_loss_mlp": 0.00375209, + "balance_loss_clip": 1.13007843, + "balance_loss_mlp": 0.34278435, + "epoch": 0.6030963475124004, + "flos": 24422883045120.0, + "grad_norm": 7.676899543469771, + "language_loss": 0.76504254, + "learning_rate": 1.4369595430034572e-06, + "loss": 0.78272247, + "num_input_tokens_seen": 216035785, + "router_z_loss_clip": 2.62890625, + "router_z_loss_mlp": 0.32397461, + "step": 10031, + "time_per_iteration": 4.124647855758667 + }, + { + "auxiliary_loss_clip": 0.01419182, + "auxiliary_loss_mlp": 0.00370145, + "balance_loss_clip": 1.14310479, + "balance_loss_mlp": 0.33421564, + "epoch": 0.6031564707650684, + "flos": 29644043247360.0, + "grad_norm": 20.653430256619757, + "language_loss": 0.80190986, + "learning_rate": 1.4365858436630912e-06, + "loss": 0.81980312, + "num_input_tokens_seen": 216059555, + "router_z_loss_clip": 2.76171875, + "router_z_loss_mlp": 0.35961914, + "step": 10032, + "time_per_iteration": 2.739248037338257 + }, + { + "auxiliary_loss_clip": 0.0144057, + "auxiliary_loss_mlp": 0.00373092, + "balance_loss_clip": 1.16110349, + "balance_loss_mlp": 0.3397606, + "epoch": 0.6032165940177363, + "flos": 16618776768000.0, + "grad_norm": 13.752817789924428, + "language_loss": 0.76260555, + "learning_rate": 1.4362121656874465e-06, + "loss": 0.78074217, + "num_input_tokens_seen": 216077235, + "router_z_loss_clip": 2.79296875, + "router_z_loss_mlp": 0.33349609, + "step": 10033, + "time_per_iteration": 2.764418363571167 + }, + { + "auxiliary_loss_clip": 0.01404987, + "auxiliary_loss_mlp": 0.00358523, + "balance_loss_clip": 1.14420271, + "balance_loss_mlp": 0.32521638, + "epoch": 0.6032767172704043, + "flos": 17488173553920.0, + "grad_norm": 31.322335385330483, + "language_loss": 0.82789582, + "learning_rate": 1.4358385090906934e-06, + "loss": 0.84553087, + "num_input_tokens_seen": 216094985, + "router_z_loss_clip": 2.60742188, + "router_z_loss_mlp": 0.33300781, + "step": 10034, + "time_per_iteration": 2.7168924808502197 + }, + { + "auxiliary_loss_clip": 0.01412685, + "auxiliary_loss_mlp": 0.00352157, + "balance_loss_clip": 1.14363217, + "balance_loss_mlp": 0.31989875, + "epoch": 0.6033368405230723, + "flos": 26832955939200.0, + "grad_norm": 12.225238831012959, + "language_loss": 0.80988961, + "learning_rate": 1.4354648738870004e-06, + "loss": 0.82753801, + "num_input_tokens_seen": 216115905, + "router_z_loss_clip": 2.6875, + "router_z_loss_mlp": 0.32250977, + "step": 10035, + "time_per_iteration": 2.7675068378448486 + }, + { + "auxiliary_loss_clip": 0.01424459, + "auxiliary_loss_mlp": 0.00308614, + "balance_loss_clip": 1.15920365, + "balance_loss_mlp": 0.27826291, + "epoch": 0.6033969637757403, + "flos": 16909904499840.0, + "grad_norm": 65.93353515652042, + "language_loss": 0.92079592, + "learning_rate": 1.435091260090536e-06, + "loss": 0.93812662, + "num_input_tokens_seen": 216132420, + "router_z_loss_clip": 2.65234375, + "router_z_loss_mlp": 0.30371094, + "step": 10036, + "time_per_iteration": 2.701313018798828 + }, + { + "auxiliary_loss_clip": 0.01403286, + "auxiliary_loss_mlp": 0.00386095, + "balance_loss_clip": 1.137393, + "balance_loss_mlp": 0.35157198, + "epoch": 0.6034570870284083, + "flos": 22930076787840.0, + "grad_norm": 3.687153998503546, + "language_loss": 0.80040634, + "learning_rate": 1.4347176677154676e-06, + "loss": 0.81830013, + "num_input_tokens_seen": 216149800, + "router_z_loss_clip": 2.66015625, + "router_z_loss_mlp": 0.34521484, + "step": 10037, + "time_per_iteration": 2.7304418087005615 + }, + { + "auxiliary_loss_clip": 0.01394249, + "auxiliary_loss_mlp": 0.0034517, + "balance_loss_clip": 1.13791943, + "balance_loss_mlp": 0.31527254, + "epoch": 0.6035172102810762, + "flos": 23366319465600.0, + "grad_norm": 6.424099168543208, + "language_loss": 0.90217859, + "learning_rate": 1.4343440967759616e-06, + "loss": 0.91957271, + "num_input_tokens_seen": 216168200, + "router_z_loss_clip": 2.56445312, + "router_z_loss_mlp": 0.29907227, + "step": 10038, + "time_per_iteration": 2.7651429176330566 + }, + { + "auxiliary_loss_clip": 0.01418226, + "auxiliary_loss_mlp": 0.00349251, + "balance_loss_clip": 1.1490773, + "balance_loss_mlp": 0.31605172, + "epoch": 0.6035773335337442, + "flos": 20887082933760.0, + "grad_norm": 6.908621980332366, + "language_loss": 0.84893328, + "learning_rate": 1.4339705472861846e-06, + "loss": 0.86660802, + "num_input_tokens_seen": 216187105, + "router_z_loss_clip": 2.69335938, + "router_z_loss_mlp": 0.33215332, + "step": 10039, + "time_per_iteration": 2.6870992183685303 + }, + { + "auxiliary_loss_clip": 0.01393702, + "auxiliary_loss_mlp": 0.00362023, + "balance_loss_clip": 1.13654971, + "balance_loss_mlp": 0.32962161, + "epoch": 0.6036374567864121, + "flos": 24936298093440.0, + "grad_norm": 21.346097857028997, + "language_loss": 0.78317034, + "learning_rate": 1.433597019260301e-06, + "loss": 0.80072761, + "num_input_tokens_seen": 216205440, + "router_z_loss_clip": 2.56640625, + "router_z_loss_mlp": 0.32385254, + "step": 10040, + "time_per_iteration": 2.776392936706543 + }, + { + "auxiliary_loss_clip": 0.01429959, + "auxiliary_loss_mlp": 0.00360893, + "balance_loss_clip": 1.15673327, + "balance_loss_mlp": 0.32563055, + "epoch": 0.6036975800390801, + "flos": 23148269953920.0, + "grad_norm": 10.923943473666517, + "language_loss": 0.87104869, + "learning_rate": 1.433223512712475e-06, + "loss": 0.88895726, + "num_input_tokens_seen": 216223130, + "router_z_loss_clip": 2.73242188, + "router_z_loss_mlp": 0.35229492, + "step": 10041, + "time_per_iteration": 2.689458131790161 + }, + { + "auxiliary_loss_clip": 0.01398139, + "auxiliary_loss_mlp": 0.00368595, + "balance_loss_clip": 1.13630593, + "balance_loss_mlp": 0.33493006, + "epoch": 0.603757703291748, + "flos": 18660729127680.0, + "grad_norm": 31.76278218852651, + "language_loss": 0.83529419, + "learning_rate": 1.4328500276568704e-06, + "loss": 0.85296154, + "num_input_tokens_seen": 216240260, + "router_z_loss_clip": 2.61914062, + "router_z_loss_mlp": 0.33642578, + "step": 10042, + "time_per_iteration": 2.746649980545044 + }, + { + "auxiliary_loss_clip": 0.0141027, + "auxiliary_loss_mlp": 0.003485, + "balance_loss_clip": 1.14237142, + "balance_loss_mlp": 0.315741, + "epoch": 0.6038178265444161, + "flos": 19682603147520.0, + "grad_norm": 3.116310999879363, + "language_loss": 0.91292864, + "learning_rate": 1.4324765641076498e-06, + "loss": 0.9305163, + "num_input_tokens_seen": 216258510, + "router_z_loss_clip": 2.67773438, + "router_z_loss_mlp": 0.32763672, + "step": 10043, + "time_per_iteration": 2.6706154346466064 + }, + { + "auxiliary_loss_clip": 0.01400642, + "auxiliary_loss_mlp": 0.00364479, + "balance_loss_clip": 1.13600218, + "balance_loss_mlp": 0.32983637, + "epoch": 0.603877949797084, + "flos": 22638230784000.0, + "grad_norm": 66.82341253836053, + "language_loss": 0.75687504, + "learning_rate": 1.432103122078974e-06, + "loss": 0.77452624, + "num_input_tokens_seen": 216277550, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.34643555, + "step": 10044, + "time_per_iteration": 2.7031021118164062 + }, + { + "auxiliary_loss_clip": 0.01412751, + "auxiliary_loss_mlp": 0.00377578, + "balance_loss_clip": 1.14456022, + "balance_loss_mlp": 0.34121925, + "epoch": 0.603938073049752, + "flos": 25447881548160.0, + "grad_norm": 22061.505549041354, + "language_loss": 0.84637535, + "learning_rate": 1.4317297015850057e-06, + "loss": 0.86427855, + "num_input_tokens_seen": 216296690, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.36352539, + "step": 10045, + "time_per_iteration": 2.678995370864868 + }, + { + "auxiliary_loss_clip": 0.01391119, + "auxiliary_loss_mlp": 0.00362724, + "balance_loss_clip": 1.13315773, + "balance_loss_mlp": 0.33017978, + "epoch": 0.6039981963024199, + "flos": 22340135813760.0, + "grad_norm": 46.295235088850376, + "language_loss": 0.82562768, + "learning_rate": 1.4313563026399036e-06, + "loss": 0.84316611, + "num_input_tokens_seen": 216316110, + "router_z_loss_clip": 2.57617188, + "router_z_loss_mlp": 0.32543945, + "step": 10046, + "time_per_iteration": 2.7199976444244385 + }, + { + "auxiliary_loss_clip": 0.01382638, + "auxiliary_loss_mlp": 0.0036478, + "balance_loss_clip": 1.12455332, + "balance_loss_mlp": 0.33354694, + "epoch": 0.6040583195550879, + "flos": 20703148364160.0, + "grad_norm": 121.10851888363715, + "language_loss": 0.91447353, + "learning_rate": 1.430982925257827e-06, + "loss": 0.93194771, + "num_input_tokens_seen": 216333855, + "router_z_loss_clip": 2.58398438, + "router_z_loss_mlp": 0.31201172, + "step": 10047, + "time_per_iteration": 2.6805508136749268 + }, + { + "auxiliary_loss_clip": 0.01389062, + "auxiliary_loss_mlp": 0.00352415, + "balance_loss_clip": 1.13196588, + "balance_loss_mlp": 0.32117009, + "epoch": 0.604118442807756, + "flos": 27163118776320.0, + "grad_norm": 22.94984451865178, + "language_loss": 0.81849307, + "learning_rate": 1.4306095694529358e-06, + "loss": 0.83590794, + "num_input_tokens_seen": 216354890, + "router_z_loss_clip": 2.57226562, + "router_z_loss_mlp": 0.31237793, + "step": 10048, + "time_per_iteration": 2.7736077308654785 + }, + { + "auxiliary_loss_clip": 0.01421584, + "auxiliary_loss_mlp": 0.00386862, + "balance_loss_clip": 1.14375663, + "balance_loss_mlp": 0.34990752, + "epoch": 0.6041785660604239, + "flos": 30881524654080.0, + "grad_norm": 8.650273594110924, + "language_loss": 0.76201725, + "learning_rate": 1.430236235239386e-06, + "loss": 0.78010166, + "num_input_tokens_seen": 216376055, + "router_z_loss_clip": 2.77734375, + "router_z_loss_mlp": 0.36962891, + "step": 10049, + "time_per_iteration": 2.753739356994629 + }, + { + "auxiliary_loss_clip": 0.01378498, + "auxiliary_loss_mlp": 0.00366462, + "balance_loss_clip": 1.12516141, + "balance_loss_mlp": 0.3373273, + "epoch": 0.6042386893130919, + "flos": 19938215306880.0, + "grad_norm": 303.3669984298203, + "language_loss": 0.72586989, + "learning_rate": 1.429862922631336e-06, + "loss": 0.74331951, + "num_input_tokens_seen": 216396295, + "router_z_loss_clip": 2.53515625, + "router_z_loss_mlp": 0.29125977, + "step": 10050, + "time_per_iteration": 2.7078661918640137 + }, + { + "auxiliary_loss_clip": 0.01392814, + "auxiliary_loss_mlp": 0.00364624, + "balance_loss_clip": 1.13310981, + "balance_loss_mlp": 0.33291477, + "epoch": 0.6042988125657598, + "flos": 32415915882240.0, + "grad_norm": 48.51616758596713, + "language_loss": 0.76481068, + "learning_rate": 1.4294896316429408e-06, + "loss": 0.78238499, + "num_input_tokens_seen": 216416605, + "router_z_loss_clip": 2.59570312, + "router_z_loss_mlp": 0.31713867, + "step": 10051, + "time_per_iteration": 2.793001413345337 + }, + { + "auxiliary_loss_clip": 0.01378964, + "auxiliary_loss_mlp": 0.00382931, + "balance_loss_clip": 1.12012076, + "balance_loss_mlp": 0.34812158, + "epoch": 0.6043589358184278, + "flos": 17420805596160.0, + "grad_norm": 167.54155774548067, + "language_loss": 0.71879667, + "learning_rate": 1.4291163622883553e-06, + "loss": 0.73641562, + "num_input_tokens_seen": 216435130, + "router_z_loss_clip": 2.58984375, + "router_z_loss_mlp": 0.34790039, + "step": 10052, + "time_per_iteration": 2.666473388671875 + }, + { + "auxiliary_loss_clip": 0.0141374, + "auxiliary_loss_mlp": 0.00374922, + "balance_loss_clip": 1.14234591, + "balance_loss_mlp": 0.3405419, + "epoch": 0.6044190590710957, + "flos": 27672834723840.0, + "grad_norm": 7.665452784558671, + "language_loss": 0.76911253, + "learning_rate": 1.4287431145817358e-06, + "loss": 0.78699911, + "num_input_tokens_seen": 216455640, + "router_z_loss_clip": 2.71484375, + "router_z_loss_mlp": 0.34375, + "step": 10053, + "time_per_iteration": 2.8005988597869873 + }, + { + "auxiliary_loss_clip": 0.01451805, + "auxiliary_loss_mlp": 0.00145913, + "balance_loss_clip": 1.24142885, + "balance_loss_mlp": 0.13632822, + "epoch": 0.6044791823237637, + "flos": 65316267515520.0, + "grad_norm": 0.7101076357213952, + "language_loss": 0.59531099, + "learning_rate": 1.4283698885372336e-06, + "loss": 0.61128813, + "num_input_tokens_seen": 216518130, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.09570312, + "step": 10054, + "time_per_iteration": 3.2851803302764893 + }, + { + "auxiliary_loss_clip": 0.01372389, + "auxiliary_loss_mlp": 0.00359623, + "balance_loss_clip": 1.11830497, + "balance_loss_mlp": 0.32879591, + "epoch": 0.6045393055764317, + "flos": 24492369905280.0, + "grad_norm": 21.441109663922614, + "language_loss": 0.90379274, + "learning_rate": 1.4279966841690027e-06, + "loss": 0.9211129, + "num_input_tokens_seen": 216536845, + "router_z_loss_clip": 2.53710938, + "router_z_loss_mlp": 0.30773926, + "step": 10055, + "time_per_iteration": 2.72823429107666 + }, + { + "auxiliary_loss_clip": 0.01367196, + "auxiliary_loss_mlp": 0.00366263, + "balance_loss_clip": 1.11345339, + "balance_loss_mlp": 0.33257419, + "epoch": 0.6045994288290997, + "flos": 19054345340160.0, + "grad_norm": 16.164333565827146, + "language_loss": 0.79717952, + "learning_rate": 1.4276235014911952e-06, + "loss": 0.81451416, + "num_input_tokens_seen": 216551860, + "router_z_loss_clip": 2.54101562, + "router_z_loss_mlp": 0.3371582, + "step": 10056, + "time_per_iteration": 2.6509318351745605 + }, + { + "auxiliary_loss_clip": 0.01363168, + "auxiliary_loss_mlp": 0.00327146, + "balance_loss_clip": 1.11467159, + "balance_loss_mlp": 0.29783228, + "epoch": 0.6046595520817676, + "flos": 26576697335040.0, + "grad_norm": 21.329281014458694, + "language_loss": 0.85696441, + "learning_rate": 1.4272503405179616e-06, + "loss": 0.87386757, + "num_input_tokens_seen": 216574775, + "router_z_loss_clip": 2.48242188, + "router_z_loss_mlp": 0.29309082, + "step": 10057, + "time_per_iteration": 2.7698428630828857 + }, + { + "auxiliary_loss_clip": 0.01375607, + "auxiliary_loss_mlp": 0.00369317, + "balance_loss_clip": 1.12095106, + "balance_loss_mlp": 0.33665347, + "epoch": 0.6047196753344356, + "flos": 13582277660160.0, + "grad_norm": 4.366083848443535, + "language_loss": 0.83402324, + "learning_rate": 1.4268772012634527e-06, + "loss": 0.8514725, + "num_input_tokens_seen": 216590100, + "router_z_loss_clip": 2.54882812, + "router_z_loss_mlp": 0.3269043, + "step": 10058, + "time_per_iteration": 2.667850971221924 + }, + { + "auxiliary_loss_clip": 0.01362487, + "auxiliary_loss_mlp": 0.00363871, + "balance_loss_clip": 1.11136568, + "balance_loss_mlp": 0.33480775, + "epoch": 0.6047797985871035, + "flos": 25520456977920.0, + "grad_norm": 35.484951616276206, + "language_loss": 0.7847538, + "learning_rate": 1.4265040837418176e-06, + "loss": 0.80201739, + "num_input_tokens_seen": 216610145, + "router_z_loss_clip": 2.51171875, + "router_z_loss_mlp": 0.29077148, + "step": 10059, + "time_per_iteration": 2.7339673042297363 + }, + { + "auxiliary_loss_clip": 0.01385335, + "auxiliary_loss_mlp": 0.00351014, + "balance_loss_clip": 1.12584615, + "balance_loss_mlp": 0.31889877, + "epoch": 0.6048399218397715, + "flos": 20520147548160.0, + "grad_norm": 14.880429145083259, + "language_loss": 0.81422424, + "learning_rate": 1.4261309879672054e-06, + "loss": 0.83158767, + "num_input_tokens_seen": 216630625, + "router_z_loss_clip": 2.59570312, + "router_z_loss_mlp": 0.32104492, + "step": 10060, + "time_per_iteration": 2.733057737350464 + }, + { + "auxiliary_loss_clip": 0.01368341, + "auxiliary_loss_mlp": 0.00374249, + "balance_loss_clip": 1.11605501, + "balance_loss_mlp": 0.34203842, + "epoch": 0.6049000450924396, + "flos": 20408788408320.0, + "grad_norm": 81.25122532662198, + "language_loss": 0.80465806, + "learning_rate": 1.4257579139537628e-06, + "loss": 0.82208401, + "num_input_tokens_seen": 216649255, + "router_z_loss_clip": 2.5234375, + "router_z_loss_mlp": 0.32202148, + "step": 10061, + "time_per_iteration": 2.642153739929199 + }, + { + "auxiliary_loss_clip": 0.01380953, + "auxiliary_loss_mlp": 0.0035148, + "balance_loss_clip": 1.12329459, + "balance_loss_mlp": 0.31948438, + "epoch": 0.6049601683451075, + "flos": 20741357456640.0, + "grad_norm": 396.9208360417663, + "language_loss": 0.74054599, + "learning_rate": 1.425384861715639e-06, + "loss": 0.75787032, + "num_input_tokens_seen": 216668100, + "router_z_loss_clip": 2.57617188, + "router_z_loss_mlp": 0.32006836, + "step": 10062, + "time_per_iteration": 2.7986693382263184 + }, + { + "auxiliary_loss_clip": 0.01365828, + "auxiliary_loss_mlp": 0.0036562, + "balance_loss_clip": 1.11554813, + "balance_loss_mlp": 0.33264619, + "epoch": 0.6050202915977755, + "flos": 20083114771200.0, + "grad_norm": 20.05489731746714, + "language_loss": 0.79688358, + "learning_rate": 1.425011831266978e-06, + "loss": 0.81419814, + "num_input_tokens_seen": 216686125, + "router_z_loss_clip": 2.5, + "router_z_loss_mlp": 0.32958984, + "step": 10063, + "time_per_iteration": 2.70166277885437 + }, + { + "auxiliary_loss_clip": 0.01373403, + "auxiliary_loss_mlp": 0.00340815, + "balance_loss_clip": 1.11994696, + "balance_loss_mlp": 0.31063104, + "epoch": 0.6050804148504434, + "flos": 15960821391360.0, + "grad_norm": 20.15265765746853, + "language_loss": 0.90201575, + "learning_rate": 1.424638822621926e-06, + "loss": 0.91915792, + "num_input_tokens_seen": 216704265, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.30175781, + "step": 10064, + "time_per_iteration": 4.0784430503845215 + }, + { + "auxiliary_loss_clip": 0.01377701, + "auxiliary_loss_mlp": 0.00343466, + "balance_loss_clip": 1.12264001, + "balance_loss_mlp": 0.31352013, + "epoch": 0.6051405381031114, + "flos": 17456644391040.0, + "grad_norm": 809.0444523403607, + "language_loss": 0.89554393, + "learning_rate": 1.4242658357946278e-06, + "loss": 0.91275561, + "num_input_tokens_seen": 216721765, + "router_z_loss_clip": 2.54882812, + "router_z_loss_mlp": 0.29931641, + "step": 10065, + "time_per_iteration": 2.643646240234375 + }, + { + "auxiliary_loss_clip": 0.01396454, + "auxiliary_loss_mlp": 0.00375019, + "balance_loss_clip": 1.13279963, + "balance_loss_mlp": 0.34113991, + "epoch": 0.6052006613557793, + "flos": 11400130517760.0, + "grad_norm": 22.31329378901884, + "language_loss": 0.87311888, + "learning_rate": 1.423892870799226e-06, + "loss": 0.89083362, + "num_input_tokens_seen": 216738295, + "router_z_loss_clip": 2.63476562, + "router_z_loss_mlp": 0.33862305, + "step": 10066, + "time_per_iteration": 4.059038400650024 + }, + { + "auxiliary_loss_clip": 0.01360542, + "auxiliary_loss_mlp": 0.0032523, + "balance_loss_clip": 1.11197805, + "balance_loss_mlp": 0.29528505, + "epoch": 0.6052607846084473, + "flos": 24750998807040.0, + "grad_norm": 64.1785660122441, + "language_loss": 0.79090273, + "learning_rate": 1.4235199276498655e-06, + "loss": 0.80776048, + "num_input_tokens_seen": 216759875, + "router_z_loss_clip": 2.48632812, + "router_z_loss_mlp": 0.29956055, + "step": 10067, + "time_per_iteration": 2.690668821334839 + }, + { + "auxiliary_loss_clip": 0.01408838, + "auxiliary_loss_mlp": 0.00362781, + "balance_loss_clip": 1.14468336, + "balance_loss_mlp": 0.33212072, + "epoch": 0.6053209078611153, + "flos": 20741141975040.0, + "grad_norm": 59.78947552336681, + "language_loss": 0.73199022, + "learning_rate": 1.4231470063606863e-06, + "loss": 0.74970639, + "num_input_tokens_seen": 216780705, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.30664062, + "step": 10068, + "time_per_iteration": 2.6877613067626953 + }, + { + "auxiliary_loss_clip": 0.01381368, + "auxiliary_loss_mlp": 0.00352349, + "balance_loss_clip": 1.12344146, + "balance_loss_mlp": 0.31978148, + "epoch": 0.6053810311137833, + "flos": 18953149749120.0, + "grad_norm": 7.396543040406006, + "language_loss": 0.9466393, + "learning_rate": 1.4227741069458303e-06, + "loss": 0.9639765, + "num_input_tokens_seen": 216797625, + "router_z_loss_clip": 2.58398438, + "router_z_loss_mlp": 0.32568359, + "step": 10069, + "time_per_iteration": 4.17581582069397 + }, + { + "auxiliary_loss_clip": 0.01385969, + "auxiliary_loss_mlp": 0.0033679, + "balance_loss_clip": 1.13251066, + "balance_loss_mlp": 0.30448383, + "epoch": 0.6054411543664512, + "flos": 23951124794880.0, + "grad_norm": 21.21559379305466, + "language_loss": 0.90082568, + "learning_rate": 1.4224012294194387e-06, + "loss": 0.91805327, + "num_input_tokens_seen": 216817610, + "router_z_loss_clip": 2.53320312, + "router_z_loss_mlp": 0.32299805, + "step": 10070, + "time_per_iteration": 2.7493646144866943 + }, + { + "auxiliary_loss_clip": 0.01383593, + "auxiliary_loss_mlp": 0.00370144, + "balance_loss_clip": 1.12719846, + "balance_loss_mlp": 0.33749259, + "epoch": 0.6055012776191192, + "flos": 20593979953920.0, + "grad_norm": 32.79556114047225, + "language_loss": 0.9284023, + "learning_rate": 1.4220283737956496e-06, + "loss": 0.94593966, + "num_input_tokens_seen": 216836835, + "router_z_loss_clip": 2.56445312, + "router_z_loss_mlp": 0.32641602, + "step": 10071, + "time_per_iteration": 2.72204327583313 + }, + { + "auxiliary_loss_clip": 0.01381046, + "auxiliary_loss_mlp": 0.00355487, + "balance_loss_clip": 1.12445712, + "balance_loss_mlp": 0.32439759, + "epoch": 0.6055614008717871, + "flos": 30298191782400.0, + "grad_norm": 6.80160558168375, + "language_loss": 0.84189111, + "learning_rate": 1.421655540088603e-06, + "loss": 0.85925645, + "num_input_tokens_seen": 216856760, + "router_z_loss_clip": 2.56640625, + "router_z_loss_mlp": 0.31079102, + "step": 10072, + "time_per_iteration": 2.813603401184082 + }, + { + "auxiliary_loss_clip": 0.01388594, + "auxiliary_loss_mlp": 0.00350192, + "balance_loss_clip": 1.1248982, + "balance_loss_mlp": 0.31574088, + "epoch": 0.6056215241244551, + "flos": 27125017424640.0, + "grad_norm": 9.237194381581729, + "language_loss": 0.81141913, + "learning_rate": 1.4212827283124367e-06, + "loss": 0.828807, + "num_input_tokens_seen": 216878795, + "router_z_loss_clip": 2.63671875, + "router_z_loss_mlp": 0.34448242, + "step": 10073, + "time_per_iteration": 2.7730884552001953 + }, + { + "auxiliary_loss_clip": 0.01442877, + "auxiliary_loss_mlp": 0.00181589, + "balance_loss_clip": 1.23707008, + "balance_loss_mlp": 0.17100285, + "epoch": 0.6056816473771232, + "flos": 56007323925120.0, + "grad_norm": 0.7482639451386884, + "language_loss": 0.54875761, + "learning_rate": 1.4209099384812863e-06, + "loss": 0.56500232, + "num_input_tokens_seen": 216937800, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.10595703, + "step": 10074, + "time_per_iteration": 4.644582271575928 + }, + { + "auxiliary_loss_clip": 0.01362079, + "auxiliary_loss_mlp": 0.00370399, + "balance_loss_clip": 1.11302197, + "balance_loss_mlp": 0.33973873, + "epoch": 0.6057417706297911, + "flos": 23549499849600.0, + "grad_norm": 30.720600990048236, + "language_loss": 0.87769461, + "learning_rate": 1.4205371706092894e-06, + "loss": 0.89501941, + "num_input_tokens_seen": 216955280, + "router_z_loss_clip": 2.48828125, + "router_z_loss_mlp": 0.30639648, + "step": 10075, + "time_per_iteration": 2.7310824394226074 + }, + { + "auxiliary_loss_clip": 0.01369091, + "auxiliary_loss_mlp": 0.00345965, + "balance_loss_clip": 1.11677766, + "balance_loss_mlp": 0.31382599, + "epoch": 0.6058018938824591, + "flos": 27744296832000.0, + "grad_norm": 6.961684023990378, + "language_loss": 0.84749061, + "learning_rate": 1.4201644247105813e-06, + "loss": 0.86464119, + "num_input_tokens_seen": 216976950, + "router_z_loss_clip": 2.52148438, + "router_z_loss_mlp": 0.32128906, + "step": 10076, + "time_per_iteration": 2.767946720123291 + }, + { + "auxiliary_loss_clip": 0.01410538, + "auxiliary_loss_mlp": 0.00348037, + "balance_loss_clip": 1.14050126, + "balance_loss_mlp": 0.31236923, + "epoch": 0.605862017135127, + "flos": 22783381643520.0, + "grad_norm": 2.9841658670083167, + "language_loss": 0.81456, + "learning_rate": 1.4197917007992964e-06, + "loss": 0.83214575, + "num_input_tokens_seen": 216996945, + "router_z_loss_clip": 2.70117188, + "router_z_loss_mlp": 0.35668945, + "step": 10077, + "time_per_iteration": 2.724966287612915 + }, + { + "auxiliary_loss_clip": 0.01424774, + "auxiliary_loss_mlp": 0.00354337, + "balance_loss_clip": 1.15732479, + "balance_loss_mlp": 0.32014826, + "epoch": 0.605922140387795, + "flos": 21215019127680.0, + "grad_norm": 38.03664165584043, + "language_loss": 0.64428616, + "learning_rate": 1.4194189988895682e-06, + "loss": 0.66207731, + "num_input_tokens_seen": 217016580, + "router_z_loss_clip": 2.67382812, + "router_z_loss_mlp": 0.34179688, + "step": 10078, + "time_per_iteration": 2.7018587589263916 + }, + { + "auxiliary_loss_clip": 0.0137951, + "auxiliary_loss_mlp": 0.00361864, + "balance_loss_clip": 1.12142754, + "balance_loss_mlp": 0.32920098, + "epoch": 0.6059822636404629, + "flos": 27268372604160.0, + "grad_norm": 11.442252058735331, + "language_loss": 0.774984, + "learning_rate": 1.4190463189955297e-06, + "loss": 0.79239774, + "num_input_tokens_seen": 217037300, + "router_z_loss_clip": 2.58398438, + "router_z_loss_mlp": 0.32666016, + "step": 10079, + "time_per_iteration": 2.785423994064331 + }, + { + "auxiliary_loss_clip": 0.01366346, + "auxiliary_loss_mlp": 0.00347094, + "balance_loss_clip": 1.11416888, + "balance_loss_mlp": 0.31648073, + "epoch": 0.606042386893131, + "flos": 20631327120000.0, + "grad_norm": 430.215436466252, + "language_loss": 0.7100482, + "learning_rate": 1.4186736611313131e-06, + "loss": 0.72718263, + "num_input_tokens_seen": 217055805, + "router_z_loss_clip": 2.5234375, + "router_z_loss_mlp": 0.30639648, + "step": 10080, + "time_per_iteration": 2.7140510082244873 + }, + { + "auxiliary_loss_clip": 0.01372893, + "auxiliary_loss_mlp": 0.00350743, + "balance_loss_clip": 1.11933529, + "balance_loss_mlp": 0.31834161, + "epoch": 0.6061025101457989, + "flos": 23002293081600.0, + "grad_norm": 6.371870723174296, + "language_loss": 0.79043579, + "learning_rate": 1.4183010253110492e-06, + "loss": 0.80767214, + "num_input_tokens_seen": 217074175, + "router_z_loss_clip": 2.54101562, + "router_z_loss_mlp": 0.32397461, + "step": 10081, + "time_per_iteration": 2.681917667388916 + }, + { + "auxiliary_loss_clip": 0.01382206, + "auxiliary_loss_mlp": 0.0034257, + "balance_loss_clip": 1.12598431, + "balance_loss_mlp": 0.3091442, + "epoch": 0.6061626333984669, + "flos": 29898937134720.0, + "grad_norm": 5.5520270052032386, + "language_loss": 0.75048065, + "learning_rate": 1.4179284115488691e-06, + "loss": 0.76772845, + "num_input_tokens_seen": 217095695, + "router_z_loss_clip": 2.5625, + "router_z_loss_mlp": 0.33422852, + "step": 10082, + "time_per_iteration": 2.83628249168396 + }, + { + "auxiliary_loss_clip": 0.01372684, + "auxiliary_loss_mlp": 0.00351697, + "balance_loss_clip": 1.12413275, + "balance_loss_mlp": 0.32036883, + "epoch": 0.6062227566511348, + "flos": 25009196745600.0, + "grad_norm": 23.434233036760993, + "language_loss": 0.71492517, + "learning_rate": 1.4175558198589015e-06, + "loss": 0.73216897, + "num_input_tokens_seen": 217116260, + "router_z_loss_clip": 2.48828125, + "router_z_loss_mlp": 0.31323242, + "step": 10083, + "time_per_iteration": 2.73077392578125 + }, + { + "auxiliary_loss_clip": 0.01382275, + "auxiliary_loss_mlp": 0.00353852, + "balance_loss_clip": 1.12460113, + "balance_loss_mlp": 0.31951964, + "epoch": 0.6062828799038028, + "flos": 19463943104640.0, + "grad_norm": 92.63399668096643, + "language_loss": 0.81832999, + "learning_rate": 1.4171832502552764e-06, + "loss": 0.83569121, + "num_input_tokens_seen": 217134465, + "router_z_loss_clip": 2.57617188, + "router_z_loss_mlp": 0.34326172, + "step": 10084, + "time_per_iteration": 2.7303669452667236 + }, + { + "auxiliary_loss_clip": 0.01422759, + "auxiliary_loss_mlp": 0.0032559, + "balance_loss_clip": 1.14788365, + "balance_loss_mlp": 0.29199672, + "epoch": 0.6063430031564707, + "flos": 13589568120960.0, + "grad_norm": 18.75990410324582, + "language_loss": 0.81391239, + "learning_rate": 1.4168107027521204e-06, + "loss": 0.83139586, + "num_input_tokens_seen": 217149920, + "router_z_loss_clip": 2.74414062, + "router_z_loss_mlp": 0.33618164, + "step": 10085, + "time_per_iteration": 2.7210023403167725 + }, + { + "auxiliary_loss_clip": 0.01398075, + "auxiliary_loss_mlp": 0.00343789, + "balance_loss_clip": 1.14021707, + "balance_loss_mlp": 0.31024376, + "epoch": 0.6064031264091387, + "flos": 23255499029760.0, + "grad_norm": 20.343862263650355, + "language_loss": 0.83850443, + "learning_rate": 1.4164381773635605e-06, + "loss": 0.85592306, + "num_input_tokens_seen": 217168165, + "router_z_loss_clip": 2.578125, + "router_z_loss_mlp": 0.33544922, + "step": 10086, + "time_per_iteration": 2.697746992111206 + }, + { + "auxiliary_loss_clip": 0.01381928, + "auxiliary_loss_mlp": 0.00358033, + "balance_loss_clip": 1.12934804, + "balance_loss_mlp": 0.32424951, + "epoch": 0.6064632496618068, + "flos": 22458462192000.0, + "grad_norm": 4.743494513085421, + "language_loss": 0.78794312, + "learning_rate": 1.4160656741037246e-06, + "loss": 0.80534279, + "num_input_tokens_seen": 217190070, + "router_z_loss_clip": 2.52148438, + "router_z_loss_mlp": 0.33813477, + "step": 10087, + "time_per_iteration": 2.7177181243896484 + }, + { + "auxiliary_loss_clip": 0.01371487, + "auxiliary_loss_mlp": 0.00326705, + "balance_loss_clip": 1.12008905, + "balance_loss_mlp": 0.29737949, + "epoch": 0.6065233729144747, + "flos": 25118652464640.0, + "grad_norm": 13.745864745597467, + "language_loss": 0.88883924, + "learning_rate": 1.4156931929867355e-06, + "loss": 0.90582114, + "num_input_tokens_seen": 217209370, + "router_z_loss_clip": 2.515625, + "router_z_loss_mlp": 0.29370117, + "step": 10088, + "time_per_iteration": 2.6820290088653564 + }, + { + "auxiliary_loss_clip": 0.01377298, + "auxiliary_loss_mlp": 0.00339116, + "balance_loss_clip": 1.12404668, + "balance_loss_mlp": 0.30912304, + "epoch": 0.6065834961671427, + "flos": 23477355383040.0, + "grad_norm": 15.632830248645542, + "language_loss": 0.79043835, + "learning_rate": 1.4153207340267201e-06, + "loss": 0.80760252, + "num_input_tokens_seen": 217226990, + "router_z_loss_clip": 2.53320312, + "router_z_loss_mlp": 0.30004883, + "step": 10089, + "time_per_iteration": 2.7188403606414795 + }, + { + "auxiliary_loss_clip": 0.01382409, + "auxiliary_loss_mlp": 0.00370826, + "balance_loss_clip": 1.12645495, + "balance_loss_mlp": 0.33956891, + "epoch": 0.6066436194198106, + "flos": 17019396132480.0, + "grad_norm": 94.10094466305266, + "language_loss": 0.88282692, + "learning_rate": 1.4149482972378009e-06, + "loss": 0.90035927, + "num_input_tokens_seen": 217244585, + "router_z_loss_clip": 2.5625, + "router_z_loss_mlp": 0.31274414, + "step": 10090, + "time_per_iteration": 2.62555193901062 + }, + { + "auxiliary_loss_clip": 0.01403751, + "auxiliary_loss_mlp": 0.00353951, + "balance_loss_clip": 1.12948108, + "balance_loss_mlp": 0.31849816, + "epoch": 0.6067037426724786, + "flos": 18514752255360.0, + "grad_norm": 79.32742255965753, + "language_loss": 0.85637534, + "learning_rate": 1.4145758826341e-06, + "loss": 0.87395233, + "num_input_tokens_seen": 217263435, + "router_z_loss_clip": 2.74414062, + "router_z_loss_mlp": 0.35449219, + "step": 10091, + "time_per_iteration": 2.691298246383667 + }, + { + "auxiliary_loss_clip": 0.0140267, + "auxiliary_loss_mlp": 0.00359507, + "balance_loss_clip": 1.14079869, + "balance_loss_mlp": 0.32598579, + "epoch": 0.6067638659251465, + "flos": 22345989730560.0, + "grad_norm": 80.3433076029316, + "language_loss": 0.85912657, + "learning_rate": 1.4142034902297415e-06, + "loss": 0.87674832, + "num_input_tokens_seen": 217283725, + "router_z_loss_clip": 2.61914062, + "router_z_loss_mlp": 0.33520508, + "step": 10092, + "time_per_iteration": 2.6874682903289795 + }, + { + "auxiliary_loss_clip": 0.01382456, + "auxiliary_loss_mlp": 0.00346335, + "balance_loss_clip": 1.12161732, + "balance_loss_mlp": 0.31424397, + "epoch": 0.6068239891778145, + "flos": 12451019748480.0, + "grad_norm": 23.284199083678068, + "language_loss": 0.82325512, + "learning_rate": 1.4138311200388444e-06, + "loss": 0.84054303, + "num_input_tokens_seen": 217301120, + "router_z_loss_clip": 2.609375, + "router_z_loss_mlp": 0.32092285, + "step": 10093, + "time_per_iteration": 2.6994102001190186 + }, + { + "auxiliary_loss_clip": 0.01388, + "auxiliary_loss_mlp": 0.00327426, + "balance_loss_clip": 1.13207436, + "balance_loss_mlp": 0.29616922, + "epoch": 0.6068841124304825, + "flos": 23185868515200.0, + "grad_norm": 6.753543263943484, + "language_loss": 0.92117089, + "learning_rate": 1.4134587720755304e-06, + "loss": 0.93832517, + "num_input_tokens_seen": 217319585, + "router_z_loss_clip": 2.55664062, + "router_z_loss_mlp": 0.31225586, + "step": 10094, + "time_per_iteration": 2.700490713119507 + }, + { + "auxiliary_loss_clip": 0.01371258, + "auxiliary_loss_mlp": 0.0036393, + "balance_loss_clip": 1.11805618, + "balance_loss_mlp": 0.32875109, + "epoch": 0.6069442356831505, + "flos": 18587902302720.0, + "grad_norm": 21.742915418759353, + "language_loss": 0.81178498, + "learning_rate": 1.413086446353919e-06, + "loss": 0.82913685, + "num_input_tokens_seen": 217338880, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.35168457, + "step": 10095, + "time_per_iteration": 2.7087674140930176 + }, + { + "auxiliary_loss_clip": 0.01360843, + "auxiliary_loss_mlp": 0.00403723, + "balance_loss_clip": 1.10967743, + "balance_loss_mlp": 0.36970109, + "epoch": 0.6070043589358184, + "flos": 20960340721920.0, + "grad_norm": 9.660705163410816, + "language_loss": 0.82410479, + "learning_rate": 1.4127141428881273e-06, + "loss": 0.84175038, + "num_input_tokens_seen": 217357480, + "router_z_loss_clip": 2.51757812, + "router_z_loss_mlp": 0.34008789, + "step": 10096, + "time_per_iteration": 2.707672357559204 + }, + { + "auxiliary_loss_clip": 0.01376796, + "auxiliary_loss_mlp": 0.00353906, + "balance_loss_clip": 1.1217066, + "balance_loss_mlp": 0.3211236, + "epoch": 0.6070644821884864, + "flos": 11692443398400.0, + "grad_norm": 239.2376926420582, + "language_loss": 0.86988461, + "learning_rate": 1.4123418616922749e-06, + "loss": 0.88719171, + "num_input_tokens_seen": 217374575, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.32788086, + "step": 10097, + "time_per_iteration": 2.7126564979553223 + }, + { + "auxiliary_loss_clip": 0.01382553, + "auxiliary_loss_mlp": 0.00339575, + "balance_loss_clip": 1.12442851, + "balance_loss_mlp": 0.30893871, + "epoch": 0.6071246054411543, + "flos": 19310568030720.0, + "grad_norm": 91.98899817815985, + "language_loss": 0.74287921, + "learning_rate": 1.411969602780478e-06, + "loss": 0.76010048, + "num_input_tokens_seen": 217392950, + "router_z_loss_clip": 2.58007812, + "router_z_loss_mlp": 0.30664062, + "step": 10098, + "time_per_iteration": 2.674354076385498 + }, + { + "auxiliary_loss_clip": 0.01393277, + "auxiliary_loss_mlp": 0.00336529, + "balance_loss_clip": 1.13659894, + "balance_loss_mlp": 0.30381769, + "epoch": 0.6071847286938223, + "flos": 17749029098880.0, + "grad_norm": 444.29980506589413, + "language_loss": 0.86657536, + "learning_rate": 1.4115973661668523e-06, + "loss": 0.88387334, + "num_input_tokens_seen": 217412145, + "router_z_loss_clip": 2.56640625, + "router_z_loss_mlp": 0.32714844, + "step": 10099, + "time_per_iteration": 2.6824686527252197 + }, + { + "auxiliary_loss_clip": 0.01396294, + "auxiliary_loss_mlp": 0.00363111, + "balance_loss_clip": 1.13027906, + "balance_loss_mlp": 0.32505971, + "epoch": 0.6072448519464904, + "flos": 22637512512000.0, + "grad_norm": 30.71544842921217, + "language_loss": 0.7864114, + "learning_rate": 1.4112251518655133e-06, + "loss": 0.8040055, + "num_input_tokens_seen": 217432080, + "router_z_loss_clip": 2.65820312, + "router_z_loss_mlp": 0.38037109, + "step": 10100, + "time_per_iteration": 2.714069366455078 + }, + { + "auxiliary_loss_clip": 0.01395473, + "auxiliary_loss_mlp": 0.00357124, + "balance_loss_clip": 1.13699305, + "balance_loss_mlp": 0.32090837, + "epoch": 0.6073049751991583, + "flos": 19537308633600.0, + "grad_norm": 19.337434748723, + "language_loss": 0.76148027, + "learning_rate": 1.4108529598905764e-06, + "loss": 0.77900624, + "num_input_tokens_seen": 217450945, + "router_z_loss_clip": 2.58789062, + "router_z_loss_mlp": 0.36206055, + "step": 10101, + "time_per_iteration": 2.749934434890747 + }, + { + "auxiliary_loss_clip": 0.01377563, + "auxiliary_loss_mlp": 0.00341808, + "balance_loss_clip": 1.12305474, + "balance_loss_mlp": 0.31033698, + "epoch": 0.6073650984518263, + "flos": 28294233033600.0, + "grad_norm": 76.19143520797958, + "language_loss": 0.74929255, + "learning_rate": 1.410480790256154e-06, + "loss": 0.76648629, + "num_input_tokens_seen": 217473105, + "router_z_loss_clip": 2.54296875, + "router_z_loss_mlp": 0.31445312, + "step": 10102, + "time_per_iteration": 2.756674289703369 + }, + { + "auxiliary_loss_clip": 0.01390762, + "auxiliary_loss_mlp": 0.00355898, + "balance_loss_clip": 1.13431644, + "balance_loss_mlp": 0.32313913, + "epoch": 0.6074252217044942, + "flos": 25664422688640.0, + "grad_norm": 21.456476036803846, + "language_loss": 0.81733859, + "learning_rate": 1.4101086429763589e-06, + "loss": 0.83480513, + "num_input_tokens_seen": 217491780, + "router_z_loss_clip": 2.56640625, + "router_z_loss_mlp": 0.32751465, + "step": 10103, + "time_per_iteration": 2.735938787460327 + }, + { + "auxiliary_loss_clip": 0.01417834, + "auxiliary_loss_mlp": 0.00363579, + "balance_loss_clip": 1.14367366, + "balance_loss_mlp": 0.32781613, + "epoch": 0.6074853449571622, + "flos": 22857106308480.0, + "grad_norm": 17.796964218554898, + "language_loss": 0.83364522, + "learning_rate": 1.4097365180653032e-06, + "loss": 0.85145932, + "num_input_tokens_seen": 217510605, + "router_z_loss_clip": 2.7421875, + "router_z_loss_mlp": 0.35742188, + "step": 10104, + "time_per_iteration": 2.7056121826171875 + }, + { + "auxiliary_loss_clip": 0.01448338, + "auxiliary_loss_mlp": 0.00255981, + "balance_loss_clip": 1.24455547, + "balance_loss_mlp": 0.24439387, + "epoch": 0.6075454682098301, + "flos": 67111406547840.0, + "grad_norm": 0.7314854318948669, + "language_loss": 0.54972637, + "learning_rate": 1.4093644155370977e-06, + "loss": 0.56676954, + "num_input_tokens_seen": 217574815, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.11572266, + "step": 10105, + "time_per_iteration": 3.188776969909668 + }, + { + "auxiliary_loss_clip": 0.0146054, + "auxiliary_loss_mlp": 0.00171155, + "balance_loss_clip": 1.25075245, + "balance_loss_mlp": 0.15971135, + "epoch": 0.6076055914624982, + "flos": 70712024751360.0, + "grad_norm": 0.7695728644510808, + "language_loss": 0.56780499, + "learning_rate": 1.4089923354058533e-06, + "loss": 0.58412194, + "num_input_tokens_seen": 217632375, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.11425781, + "step": 10106, + "time_per_iteration": 4.461594343185425 + }, + { + "auxiliary_loss_clip": 0.01405454, + "auxiliary_loss_mlp": 0.00338685, + "balance_loss_clip": 1.14830875, + "balance_loss_mlp": 0.30715367, + "epoch": 0.6076657147151661, + "flos": 28364545906560.0, + "grad_norm": 3.5130612308512, + "language_loss": 0.73797512, + "learning_rate": 1.4086202776856784e-06, + "loss": 0.75541651, + "num_input_tokens_seen": 217653055, + "router_z_loss_clip": 2.56835938, + "router_z_loss_mlp": 0.31530762, + "step": 10107, + "time_per_iteration": 2.775022268295288 + }, + { + "auxiliary_loss_clip": 0.01407729, + "auxiliary_loss_mlp": 0.00360463, + "balance_loss_clip": 1.14367723, + "balance_loss_mlp": 0.32641721, + "epoch": 0.6077258379678341, + "flos": 15049767807360.0, + "grad_norm": 25.723340664811968, + "language_loss": 0.87567061, + "learning_rate": 1.4082482423906815e-06, + "loss": 0.89335257, + "num_input_tokens_seen": 217671520, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.34008789, + "step": 10108, + "time_per_iteration": 4.161105394363403 + }, + { + "auxiliary_loss_clip": 0.01417428, + "auxiliary_loss_mlp": 0.00360369, + "balance_loss_clip": 1.14803755, + "balance_loss_mlp": 0.32424834, + "epoch": 0.607785961220502, + "flos": 36167251553280.0, + "grad_norm": 75.46112504239606, + "language_loss": 0.78915799, + "learning_rate": 1.4078762295349714e-06, + "loss": 0.80693591, + "num_input_tokens_seen": 217691880, + "router_z_loss_clip": 2.69921875, + "router_z_loss_mlp": 0.36132812, + "step": 10109, + "time_per_iteration": 2.8070921897888184 + }, + { + "auxiliary_loss_clip": 0.01413395, + "auxiliary_loss_mlp": 0.00355095, + "balance_loss_clip": 1.15668607, + "balance_loss_mlp": 0.32243139, + "epoch": 0.60784608447317, + "flos": 22524249951360.0, + "grad_norm": 129.30306139544265, + "language_loss": 0.86111408, + "learning_rate": 1.407504239132653e-06, + "loss": 0.87879896, + "num_input_tokens_seen": 217710530, + "router_z_loss_clip": 2.5625, + "router_z_loss_mlp": 0.3269043, + "step": 10110, + "time_per_iteration": 2.7106804847717285 + }, + { + "auxiliary_loss_clip": 0.01420245, + "auxiliary_loss_mlp": 0.00339877, + "balance_loss_clip": 1.15241671, + "balance_loss_mlp": 0.30664146, + "epoch": 0.6079062077258379, + "flos": 23841166285440.0, + "grad_norm": 13.480916004276034, + "language_loss": 0.80485928, + "learning_rate": 1.4071322711978338e-06, + "loss": 0.82246053, + "num_input_tokens_seen": 217728650, + "router_z_loss_clip": 2.67382812, + "router_z_loss_mlp": 0.33227539, + "step": 10111, + "time_per_iteration": 4.2622339725494385 + }, + { + "auxiliary_loss_clip": 0.01429174, + "auxiliary_loss_mlp": 0.00368184, + "balance_loss_clip": 1.15861237, + "balance_loss_mlp": 0.3317779, + "epoch": 0.6079663309785059, + "flos": 23367037737600.0, + "grad_norm": 9.607394578279044, + "language_loss": 0.75023311, + "learning_rate": 1.4067603257446186e-06, + "loss": 0.76820672, + "num_input_tokens_seen": 217747135, + "router_z_loss_clip": 2.70507812, + "router_z_loss_mlp": 0.36401367, + "step": 10112, + "time_per_iteration": 2.691591501235962 + }, + { + "auxiliary_loss_clip": 0.01439266, + "auxiliary_loss_mlp": 0.00155696, + "balance_loss_clip": 1.23403096, + "balance_loss_mlp": 0.14396535, + "epoch": 0.6080264542311739, + "flos": 71382873110400.0, + "grad_norm": 0.6248874244874802, + "language_loss": 0.48978099, + "learning_rate": 1.4063884027871105e-06, + "loss": 0.50573063, + "num_input_tokens_seen": 217811860, + "router_z_loss_clip": 2.046875, + "router_z_loss_mlp": 0.1171875, + "step": 10113, + "time_per_iteration": 3.2471537590026855 + }, + { + "auxiliary_loss_clip": 0.01438742, + "auxiliary_loss_mlp": 0.00199673, + "balance_loss_clip": 1.23373866, + "balance_loss_mlp": 0.18822894, + "epoch": 0.6080865774838419, + "flos": 66529833442560.0, + "grad_norm": 0.8232318243870913, + "language_loss": 0.564821, + "learning_rate": 1.4060165023394147e-06, + "loss": 0.58120513, + "num_input_tokens_seen": 217866510, + "router_z_loss_clip": 2.046875, + "router_z_loss_mlp": 0.11425781, + "step": 10114, + "time_per_iteration": 3.0797135829925537 + }, + { + "auxiliary_loss_clip": 0.01427169, + "auxiliary_loss_mlp": 0.00352926, + "balance_loss_clip": 1.15827012, + "balance_loss_mlp": 0.31845117, + "epoch": 0.6081467007365099, + "flos": 19207935895680.0, + "grad_norm": 3.4101776899876257, + "language_loss": 0.80236083, + "learning_rate": 1.4056446244156317e-06, + "loss": 0.82016182, + "num_input_tokens_seen": 217885650, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.34472656, + "step": 10115, + "time_per_iteration": 2.686617136001587 + }, + { + "auxiliary_loss_clip": 0.01406714, + "auxiliary_loss_mlp": 0.00319946, + "balance_loss_clip": 1.14684415, + "balance_loss_mlp": 0.28651965, + "epoch": 0.6082068239891778, + "flos": 24167737762560.0, + "grad_norm": 11.753707122779716, + "language_loss": 0.78753686, + "learning_rate": 1.4052727690298642e-06, + "loss": 0.80480349, + "num_input_tokens_seen": 217905300, + "router_z_loss_clip": 2.59570312, + "router_z_loss_mlp": 0.33422852, + "step": 10116, + "time_per_iteration": 4.135500431060791 + }, + { + "auxiliary_loss_clip": 0.01423777, + "auxiliary_loss_mlp": 0.00348436, + "balance_loss_clip": 1.1504817, + "balance_loss_mlp": 0.31303057, + "epoch": 0.6082669472418458, + "flos": 37413316310400.0, + "grad_norm": 31.402995429809234, + "language_loss": 0.62902546, + "learning_rate": 1.4049009361962138e-06, + "loss": 0.64674753, + "num_input_tokens_seen": 217927845, + "router_z_loss_clip": 2.734375, + "router_z_loss_mlp": 0.35400391, + "step": 10117, + "time_per_iteration": 2.796548366546631 + }, + { + "auxiliary_loss_clip": 0.01419041, + "auxiliary_loss_mlp": 0.00352323, + "balance_loss_clip": 1.15244973, + "balance_loss_mlp": 0.31856337, + "epoch": 0.6083270704945137, + "flos": 15085534775040.0, + "grad_norm": 29.22168270138686, + "language_loss": 0.78485942, + "learning_rate": 1.4045291259287786e-06, + "loss": 0.80257308, + "num_input_tokens_seen": 217946145, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.33740234, + "step": 10118, + "time_per_iteration": 2.756643295288086 + }, + { + "auxiliary_loss_clip": 0.0141003, + "auxiliary_loss_mlp": 0.00347184, + "balance_loss_clip": 1.1464839, + "balance_loss_mlp": 0.31418732, + "epoch": 0.6083871937471818, + "flos": 20668458804480.0, + "grad_norm": 9.819555949579659, + "language_loss": 0.8108902, + "learning_rate": 1.4041573382416588e-06, + "loss": 0.82846236, + "num_input_tokens_seen": 217965190, + "router_z_loss_clip": 2.63476562, + "router_z_loss_mlp": 0.33007812, + "step": 10119, + "time_per_iteration": 2.6960272789001465 + }, + { + "auxiliary_loss_clip": 0.01393577, + "auxiliary_loss_mlp": 0.00371094, + "balance_loss_clip": 1.1386863, + "balance_loss_mlp": 0.33490214, + "epoch": 0.6084473169998497, + "flos": 21506901045120.0, + "grad_norm": 207.52861740714823, + "language_loss": 0.7528249, + "learning_rate": 1.4037855731489525e-06, + "loss": 0.77047157, + "num_input_tokens_seen": 217983625, + "router_z_loss_clip": 2.54492188, + "router_z_loss_mlp": 0.36181641, + "step": 10120, + "time_per_iteration": 2.7375195026397705 + }, + { + "auxiliary_loss_clip": 0.0143036, + "auxiliary_loss_mlp": 0.00370424, + "balance_loss_clip": 1.16187143, + "balance_loss_mlp": 0.33633021, + "epoch": 0.6085074402525177, + "flos": 26870051710080.0, + "grad_norm": 16.91133655401329, + "language_loss": 0.81980246, + "learning_rate": 1.4034138306647571e-06, + "loss": 0.83781034, + "num_input_tokens_seen": 218006005, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.34082031, + "step": 10121, + "time_per_iteration": 2.7258880138397217 + }, + { + "auxiliary_loss_clip": 0.01388383, + "auxiliary_loss_mlp": 0.00343341, + "balance_loss_clip": 1.13453937, + "balance_loss_mlp": 0.31091568, + "epoch": 0.6085675635051856, + "flos": 10889839952640.0, + "grad_norm": 18.95768367400698, + "language_loss": 0.87254941, + "learning_rate": 1.4030421108031685e-06, + "loss": 0.88986671, + "num_input_tokens_seen": 218024195, + "router_z_loss_clip": 2.53710938, + "router_z_loss_mlp": 0.32421875, + "step": 10122, + "time_per_iteration": 2.6450514793395996 + }, + { + "auxiliary_loss_clip": 0.01422955, + "auxiliary_loss_mlp": 0.00338767, + "balance_loss_clip": 1.15930903, + "balance_loss_mlp": 0.30655706, + "epoch": 0.6086276867578536, + "flos": 34862186707200.0, + "grad_norm": 154.21622201150421, + "language_loss": 0.62281632, + "learning_rate": 1.402670413578284e-06, + "loss": 0.64043361, + "num_input_tokens_seen": 218047190, + "router_z_loss_clip": 2.63671875, + "router_z_loss_mlp": 0.32275391, + "step": 10123, + "time_per_iteration": 2.781679153442383 + }, + { + "auxiliary_loss_clip": 0.01404998, + "auxiliary_loss_mlp": 0.00355869, + "balance_loss_clip": 1.14705181, + "balance_loss_mlp": 0.3226577, + "epoch": 0.6086878100105215, + "flos": 20047706939520.0, + "grad_norm": 12.298946292190369, + "language_loss": 0.8216821, + "learning_rate": 1.4022987390041965e-06, + "loss": 0.83929074, + "num_input_tokens_seen": 218065945, + "router_z_loss_clip": 2.578125, + "router_z_loss_mlp": 0.33251953, + "step": 10124, + "time_per_iteration": 2.6461029052734375 + }, + { + "auxiliary_loss_clip": 0.01406631, + "auxiliary_loss_mlp": 0.0033992, + "balance_loss_clip": 1.14227962, + "balance_loss_mlp": 0.30689925, + "epoch": 0.6087479332631895, + "flos": 18332469711360.0, + "grad_norm": 7.29010548935568, + "language_loss": 0.74558353, + "learning_rate": 1.4019270870950006e-06, + "loss": 0.76304913, + "num_input_tokens_seen": 218085285, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.32983398, + "step": 10125, + "time_per_iteration": 2.653268337249756 + }, + { + "auxiliary_loss_clip": 0.01406044, + "auxiliary_loss_mlp": 0.0032416, + "balance_loss_clip": 1.14585078, + "balance_loss_mlp": 0.29204458, + "epoch": 0.6088080565158575, + "flos": 24493411399680.0, + "grad_norm": 48.60187426945705, + "language_loss": 0.83360851, + "learning_rate": 1.40155545786479e-06, + "loss": 0.8509106, + "num_input_tokens_seen": 218104735, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.32104492, + "step": 10126, + "time_per_iteration": 2.6528241634368896 + }, + { + "auxiliary_loss_clip": 0.0142124, + "auxiliary_loss_mlp": 0.00339419, + "balance_loss_clip": 1.15060377, + "balance_loss_mlp": 0.30441886, + "epoch": 0.6088681797685255, + "flos": 10269016260480.0, + "grad_norm": 43.82410713811943, + "language_loss": 0.84347671, + "learning_rate": 1.4011838513276558e-06, + "loss": 0.86108333, + "num_input_tokens_seen": 218121855, + "router_z_loss_clip": 2.70703125, + "router_z_loss_mlp": 0.35009766, + "step": 10127, + "time_per_iteration": 2.7650816440582275 + }, + { + "auxiliary_loss_clip": 0.01448582, + "auxiliary_loss_mlp": 0.00371814, + "balance_loss_clip": 1.17093515, + "balance_loss_mlp": 0.33767253, + "epoch": 0.6089283030211935, + "flos": 21973703218560.0, + "grad_norm": 16.262045112949433, + "language_loss": 0.8162694, + "learning_rate": 1.400812267497691e-06, + "loss": 0.83447337, + "num_input_tokens_seen": 218137325, + "router_z_loss_clip": 2.7734375, + "router_z_loss_mlp": 0.34155273, + "step": 10128, + "time_per_iteration": 2.6754305362701416 + }, + { + "auxiliary_loss_clip": 0.01409805, + "auxiliary_loss_mlp": 0.00348877, + "balance_loss_clip": 1.14811122, + "balance_loss_mlp": 0.31583184, + "epoch": 0.6089884262738614, + "flos": 17785191116160.0, + "grad_norm": 123.35631374989457, + "language_loss": 0.81727278, + "learning_rate": 1.4004407063889842e-06, + "loss": 0.83485961, + "num_input_tokens_seen": 218155530, + "router_z_loss_clip": 2.61914062, + "router_z_loss_mlp": 0.33056641, + "step": 10129, + "time_per_iteration": 2.6481165885925293 + }, + { + "auxiliary_loss_clip": 0.014071, + "auxiliary_loss_mlp": 0.0033319, + "balance_loss_clip": 1.14911056, + "balance_loss_mlp": 0.29928726, + "epoch": 0.6090485495265294, + "flos": 36910423946880.0, + "grad_norm": 13.882699882564525, + "language_loss": 0.71640515, + "learning_rate": 1.400069168015626e-06, + "loss": 0.7338081, + "num_input_tokens_seen": 218182535, + "router_z_loss_clip": 2.58398438, + "router_z_loss_mlp": 0.33935547, + "step": 10130, + "time_per_iteration": 2.8195807933807373 + }, + { + "auxiliary_loss_clip": 0.01403635, + "auxiliary_loss_mlp": 0.0035653, + "balance_loss_clip": 1.1485424, + "balance_loss_mlp": 0.32558292, + "epoch": 0.6091086727791973, + "flos": 19899036547200.0, + "grad_norm": 157.49363812218576, + "language_loss": 0.82820857, + "learning_rate": 1.3996976523917054e-06, + "loss": 0.84581029, + "num_input_tokens_seen": 218201740, + "router_z_loss_clip": 2.55078125, + "router_z_loss_mlp": 0.30932617, + "step": 10131, + "time_per_iteration": 2.6640162467956543 + }, + { + "auxiliary_loss_clip": 0.01405156, + "auxiliary_loss_mlp": 0.00333853, + "balance_loss_clip": 1.14548469, + "balance_loss_mlp": 0.30096269, + "epoch": 0.6091687960318654, + "flos": 22163635359360.0, + "grad_norm": 8.881889416896676, + "language_loss": 0.82807595, + "learning_rate": 1.3993261595313093e-06, + "loss": 0.84546602, + "num_input_tokens_seen": 218219800, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.32897949, + "step": 10132, + "time_per_iteration": 2.663092613220215 + }, + { + "auxiliary_loss_clip": 0.01391381, + "auxiliary_loss_mlp": 0.00343853, + "balance_loss_clip": 1.14110363, + "balance_loss_mlp": 0.31560084, + "epoch": 0.6092289192845333, + "flos": 21465280160640.0, + "grad_norm": 24.509764475942717, + "language_loss": 0.79691362, + "learning_rate": 1.3989546894485261e-06, + "loss": 0.81426597, + "num_input_tokens_seen": 218237585, + "router_z_loss_clip": 2.50390625, + "router_z_loss_mlp": 0.2824707, + "step": 10133, + "time_per_iteration": 2.7175862789154053 + }, + { + "auxiliary_loss_clip": 0.0143269, + "auxiliary_loss_mlp": 0.00341103, + "balance_loss_clip": 1.16466022, + "balance_loss_mlp": 0.30519736, + "epoch": 0.6092890425372013, + "flos": 28694924225280.0, + "grad_norm": 5.8201236429387935, + "language_loss": 0.72110081, + "learning_rate": 1.3985832421574414e-06, + "loss": 0.73883879, + "num_input_tokens_seen": 218258700, + "router_z_loss_clip": 2.68164062, + "router_z_loss_mlp": 0.35913086, + "step": 10134, + "time_per_iteration": 2.7426187992095947 + }, + { + "auxiliary_loss_clip": 0.01393979, + "auxiliary_loss_mlp": 0.00336596, + "balance_loss_clip": 1.13930273, + "balance_loss_mlp": 0.3051959, + "epoch": 0.6093491657898692, + "flos": 20813178700800.0, + "grad_norm": 13.825067920335895, + "language_loss": 0.86201477, + "learning_rate": 1.3982118176721397e-06, + "loss": 0.8793205, + "num_input_tokens_seen": 218275655, + "router_z_loss_clip": 2.54492188, + "router_z_loss_mlp": 0.3137207, + "step": 10135, + "time_per_iteration": 2.6537997722625732 + }, + { + "auxiliary_loss_clip": 0.0142838, + "auxiliary_loss_mlp": 0.00352401, + "balance_loss_clip": 1.16305256, + "balance_loss_mlp": 0.31849772, + "epoch": 0.6094092890425372, + "flos": 25446983708160.0, + "grad_norm": 8.911015124169474, + "language_loss": 0.78631675, + "learning_rate": 1.3978404160067069e-06, + "loss": 0.80412447, + "num_input_tokens_seen": 218295720, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.33862305, + "step": 10136, + "time_per_iteration": 2.715176582336426 + }, + { + "auxiliary_loss_clip": 0.01413937, + "auxiliary_loss_mlp": 0.00336894, + "balance_loss_clip": 1.15614426, + "balance_loss_mlp": 0.30568486, + "epoch": 0.6094694122952051, + "flos": 35621265847680.0, + "grad_norm": 5.7657809988054884, + "language_loss": 0.80138963, + "learning_rate": 1.3974690371752253e-06, + "loss": 0.81889796, + "num_input_tokens_seen": 218316745, + "router_z_loss_clip": 2.58007812, + "router_z_loss_mlp": 0.31201172, + "step": 10137, + "time_per_iteration": 2.778026580810547 + }, + { + "auxiliary_loss_clip": 0.01395503, + "auxiliary_loss_mlp": 0.00354944, + "balance_loss_clip": 1.13356817, + "balance_loss_mlp": 0.32242334, + "epoch": 0.6095295355478731, + "flos": 24456962073600.0, + "grad_norm": 8.061528840206416, + "language_loss": 0.85268211, + "learning_rate": 1.3970976811917785e-06, + "loss": 0.87018663, + "num_input_tokens_seen": 218335385, + "router_z_loss_clip": 2.62109375, + "router_z_loss_mlp": 0.32519531, + "step": 10138, + "time_per_iteration": 2.717181921005249 + }, + { + "auxiliary_loss_clip": 0.01416003, + "auxiliary_loss_mlp": 0.0033503, + "balance_loss_clip": 1.15742946, + "balance_loss_mlp": 0.30355883, + "epoch": 0.6095896588005411, + "flos": 15633208419840.0, + "grad_norm": 825.5703708869182, + "language_loss": 0.86358529, + "learning_rate": 1.3967263480704481e-06, + "loss": 0.88109559, + "num_input_tokens_seen": 218353320, + "router_z_loss_clip": 2.58398438, + "router_z_loss_mlp": 0.31420898, + "step": 10139, + "time_per_iteration": 2.6477596759796143 + }, + { + "auxiliary_loss_clip": 0.01407606, + "auxiliary_loss_mlp": 0.00325972, + "balance_loss_clip": 1.14520741, + "balance_loss_mlp": 0.29163969, + "epoch": 0.6096497820532091, + "flos": 15550577182080.0, + "grad_norm": 1150.5951866058797, + "language_loss": 0.90187693, + "learning_rate": 1.396355037825315e-06, + "loss": 0.9192127, + "num_input_tokens_seen": 218365620, + "router_z_loss_clip": 2.62695312, + "router_z_loss_mlp": 0.34326172, + "step": 10140, + "time_per_iteration": 2.6439151763916016 + }, + { + "auxiliary_loss_clip": 0.01418463, + "auxiliary_loss_mlp": 0.00331638, + "balance_loss_clip": 1.15179873, + "balance_loss_mlp": 0.29952329, + "epoch": 0.6097099053058771, + "flos": 24204474397440.0, + "grad_norm": 181.3751431473662, + "language_loss": 0.82364297, + "learning_rate": 1.3959837504704592e-06, + "loss": 0.84114397, + "num_input_tokens_seen": 218383785, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.32080078, + "step": 10141, + "time_per_iteration": 2.702707529067993 + }, + { + "auxiliary_loss_clip": 0.01388784, + "auxiliary_loss_mlp": 0.00328151, + "balance_loss_clip": 1.13326621, + "balance_loss_mlp": 0.29667932, + "epoch": 0.609770028558545, + "flos": 19570238426880.0, + "grad_norm": 12.04571873041154, + "language_loss": 0.83394068, + "learning_rate": 1.3956124860199603e-06, + "loss": 0.85111004, + "num_input_tokens_seen": 218399055, + "router_z_loss_clip": 2.55859375, + "router_z_loss_mlp": 0.31445312, + "step": 10142, + "time_per_iteration": 2.7253119945526123 + }, + { + "auxiliary_loss_clip": 0.01403864, + "auxiliary_loss_mlp": 0.00321829, + "balance_loss_clip": 1.1455009, + "balance_loss_mlp": 0.29062033, + "epoch": 0.609830151811213, + "flos": 23949185460480.0, + "grad_norm": 992.1151287864252, + "language_loss": 0.84799099, + "learning_rate": 1.3952412444878964e-06, + "loss": 0.86524796, + "num_input_tokens_seen": 218419120, + "router_z_loss_clip": 2.58203125, + "router_z_loss_mlp": 0.31213379, + "step": 10143, + "time_per_iteration": 2.6774985790252686 + }, + { + "auxiliary_loss_clip": 0.0143022, + "auxiliary_loss_mlp": 0.00341272, + "balance_loss_clip": 1.16142213, + "balance_loss_mlp": 0.30772692, + "epoch": 0.6098902750638809, + "flos": 16179732829440.0, + "grad_norm": 28.03567529922613, + "language_loss": 0.82640839, + "learning_rate": 1.3948700258883448e-06, + "loss": 0.84412336, + "num_input_tokens_seen": 218435290, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.33532715, + "step": 10144, + "time_per_iteration": 2.655001640319824 + }, + { + "auxiliary_loss_clip": 0.01406624, + "auxiliary_loss_mlp": 0.00340013, + "balance_loss_clip": 1.14639914, + "balance_loss_mlp": 0.30789793, + "epoch": 0.609950398316549, + "flos": 44526393763200.0, + "grad_norm": 23.672989946781517, + "language_loss": 0.80638891, + "learning_rate": 1.394498830235383e-06, + "loss": 0.82385528, + "num_input_tokens_seen": 218457880, + "router_z_loss_clip": 2.59960938, + "router_z_loss_mlp": 0.32104492, + "step": 10145, + "time_per_iteration": 2.8796489238739014 + }, + { + "auxiliary_loss_clip": 0.01442488, + "auxiliary_loss_mlp": 0.00300655, + "balance_loss_clip": 1.17207432, + "balance_loss_mlp": 0.26885018, + "epoch": 0.6100105215692169, + "flos": 23221743223680.0, + "grad_norm": 7.7390733760303485, + "language_loss": 0.7657882, + "learning_rate": 1.3941276575430862e-06, + "loss": 0.78321964, + "num_input_tokens_seen": 218475930, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.31787109, + "step": 10146, + "time_per_iteration": 2.6904456615448 + }, + { + "auxiliary_loss_clip": 0.0138927, + "auxiliary_loss_mlp": 0.00341038, + "balance_loss_clip": 1.13676715, + "balance_loss_mlp": 0.31063959, + "epoch": 0.6100706448218849, + "flos": 15012564295680.0, + "grad_norm": 18.798041537233658, + "language_loss": 0.84101844, + "learning_rate": 1.3937565078255289e-06, + "loss": 0.85832155, + "num_input_tokens_seen": 218493675, + "router_z_loss_clip": 2.52734375, + "router_z_loss_mlp": 0.30419922, + "step": 10147, + "time_per_iteration": 2.6950643062591553 + }, + { + "auxiliary_loss_clip": 0.01385472, + "auxiliary_loss_mlp": 0.00326429, + "balance_loss_clip": 1.13217282, + "balance_loss_mlp": 0.29533923, + "epoch": 0.6101307680745528, + "flos": 19639976682240.0, + "grad_norm": 7.573165399977149, + "language_loss": 0.85740203, + "learning_rate": 1.393385381096786e-06, + "loss": 0.87452102, + "num_input_tokens_seen": 218511780, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.31091309, + "step": 10148, + "time_per_iteration": 2.652137517929077 + }, + { + "auxiliary_loss_clip": 0.01415261, + "auxiliary_loss_mlp": 0.00350341, + "balance_loss_clip": 1.14990628, + "balance_loss_mlp": 0.3173435, + "epoch": 0.6101908913272208, + "flos": 29935566028800.0, + "grad_norm": 7.343686786973513, + "language_loss": 0.63910842, + "learning_rate": 1.39301427737093e-06, + "loss": 0.65676445, + "num_input_tokens_seen": 218531850, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.33007812, + "step": 10149, + "time_per_iteration": 4.169723987579346 + }, + { + "auxiliary_loss_clip": 0.01437167, + "auxiliary_loss_mlp": 0.00317269, + "balance_loss_clip": 1.1712451, + "balance_loss_mlp": 0.28517812, + "epoch": 0.6102510145798887, + "flos": 21798639308160.0, + "grad_norm": 8.897404324538678, + "language_loss": 0.86660653, + "learning_rate": 1.3926431966620333e-06, + "loss": 0.88415092, + "num_input_tokens_seen": 218551245, + "router_z_loss_clip": 2.66015625, + "router_z_loss_mlp": 0.32104492, + "step": 10150, + "time_per_iteration": 4.109920263290405 + }, + { + "auxiliary_loss_clip": 0.01413246, + "auxiliary_loss_mlp": 0.00297736, + "balance_loss_clip": 1.15008068, + "balance_loss_mlp": 0.26872051, + "epoch": 0.6103111378325567, + "flos": 20706129192960.0, + "grad_norm": 6.092779973094965, + "language_loss": 0.75045216, + "learning_rate": 1.3922721389841684e-06, + "loss": 0.76756203, + "num_input_tokens_seen": 218571365, + "router_z_loss_clip": 2.62890625, + "router_z_loss_mlp": 0.2902832, + "step": 10151, + "time_per_iteration": 2.648402690887451 + }, + { + "auxiliary_loss_clip": 0.01421744, + "auxiliary_loss_mlp": 0.0033584, + "balance_loss_clip": 1.15826344, + "balance_loss_mlp": 0.30415377, + "epoch": 0.6103712610852247, + "flos": 29381643417600.0, + "grad_norm": 13.128318821377084, + "language_loss": 0.76986068, + "learning_rate": 1.3919011043514036e-06, + "loss": 0.78743649, + "num_input_tokens_seen": 218588315, + "router_z_loss_clip": 2.63476562, + "router_z_loss_mlp": 0.31689453, + "step": 10152, + "time_per_iteration": 2.698082447052002 + }, + { + "auxiliary_loss_clip": 0.01382514, + "auxiliary_loss_mlp": 0.00346353, + "balance_loss_clip": 1.13014603, + "balance_loss_mlp": 0.31557363, + "epoch": 0.6104313843378927, + "flos": 20813035046400.0, + "grad_norm": 394.00079057746973, + "language_loss": 0.83327156, + "learning_rate": 1.391530092777811e-06, + "loss": 0.85056025, + "num_input_tokens_seen": 218605940, + "router_z_loss_clip": 2.5234375, + "router_z_loss_mlp": 0.30749512, + "step": 10153, + "time_per_iteration": 2.776413679122925 + }, + { + "auxiliary_loss_clip": 0.01394079, + "auxiliary_loss_mlp": 0.00342017, + "balance_loss_clip": 1.1397922, + "balance_loss_mlp": 0.30956799, + "epoch": 0.6104915075905607, + "flos": 26578457101440.0, + "grad_norm": 16.08697004663068, + "language_loss": 0.86527348, + "learning_rate": 1.3911591042774573e-06, + "loss": 0.8826344, + "num_input_tokens_seen": 218626100, + "router_z_loss_clip": 2.54492188, + "router_z_loss_mlp": 0.32421875, + "step": 10154, + "time_per_iteration": 4.103141784667969 + }, + { + "auxiliary_loss_clip": 0.01383251, + "auxiliary_loss_mlp": 0.00318986, + "balance_loss_clip": 1.13071609, + "balance_loss_mlp": 0.28888577, + "epoch": 0.6105516308432286, + "flos": 23915788790400.0, + "grad_norm": 20.60544685304977, + "language_loss": 0.76634461, + "learning_rate": 1.3907881388644116e-06, + "loss": 0.78336698, + "num_input_tokens_seen": 218645060, + "router_z_loss_clip": 2.52539062, + "router_z_loss_mlp": 0.3013916, + "step": 10155, + "time_per_iteration": 2.7075273990631104 + }, + { + "auxiliary_loss_clip": 0.01401665, + "auxiliary_loss_mlp": 0.00322315, + "balance_loss_clip": 1.14329481, + "balance_loss_mlp": 0.29050988, + "epoch": 0.6106117540958966, + "flos": 31577365900800.0, + "grad_norm": 14.23682051616567, + "language_loss": 0.77256221, + "learning_rate": 1.3904171965527413e-06, + "loss": 0.78980201, + "num_input_tokens_seen": 218667690, + "router_z_loss_clip": 2.58203125, + "router_z_loss_mlp": 0.31811523, + "step": 10156, + "time_per_iteration": 2.7359888553619385 + }, + { + "auxiliary_loss_clip": 0.01388633, + "auxiliary_loss_mlp": 0.00325296, + "balance_loss_clip": 1.13702869, + "balance_loss_mlp": 0.29570806, + "epoch": 0.6106718773485645, + "flos": 19608160210560.0, + "grad_norm": 2057.5861620480755, + "language_loss": 0.73719037, + "learning_rate": 1.3900462773565114e-06, + "loss": 0.75432968, + "num_input_tokens_seen": 218687505, + "router_z_loss_clip": 2.51757812, + "router_z_loss_mlp": 0.29589844, + "step": 10157, + "time_per_iteration": 2.698759078979492 + }, + { + "auxiliary_loss_clip": 0.01406119, + "auxiliary_loss_mlp": 0.00334313, + "balance_loss_clip": 1.14585304, + "balance_loss_mlp": 0.30291301, + "epoch": 0.6107320006012326, + "flos": 17123895774720.0, + "grad_norm": 21.622986220576117, + "language_loss": 0.81859446, + "learning_rate": 1.3896753812897877e-06, + "loss": 0.83599877, + "num_input_tokens_seen": 218705315, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.31420898, + "step": 10158, + "time_per_iteration": 4.125421047210693 + }, + { + "auxiliary_loss_clip": 0.01387707, + "auxiliary_loss_mlp": 0.00311785, + "balance_loss_clip": 1.13374531, + "balance_loss_mlp": 0.28198272, + "epoch": 0.6107921238539005, + "flos": 30148228500480.0, + "grad_norm": 15.22769154299476, + "language_loss": 0.77387702, + "learning_rate": 1.389304508366635e-06, + "loss": 0.79087198, + "num_input_tokens_seen": 218725735, + "router_z_loss_clip": 2.53515625, + "router_z_loss_mlp": 0.29785156, + "step": 10159, + "time_per_iteration": 2.7974021434783936 + }, + { + "auxiliary_loss_clip": 0.01411477, + "auxiliary_loss_mlp": 0.00332029, + "balance_loss_clip": 1.14834106, + "balance_loss_mlp": 0.29996198, + "epoch": 0.6108522471065685, + "flos": 18440273404800.0, + "grad_norm": 2.5990064601297758, + "language_loss": 0.85611284, + "learning_rate": 1.3889336586011167e-06, + "loss": 0.87354791, + "num_input_tokens_seen": 218743215, + "router_z_loss_clip": 2.62890625, + "router_z_loss_mlp": 0.32080078, + "step": 10160, + "time_per_iteration": 2.6853291988372803 + }, + { + "auxiliary_loss_clip": 0.01504664, + "auxiliary_loss_mlp": 0.00146015, + "balance_loss_clip": 1.2857604, + "balance_loss_mlp": 0.13695467, + "epoch": 0.6109123703592364, + "flos": 64135454791680.0, + "grad_norm": 0.8058367495320924, + "language_loss": 0.61166143, + "learning_rate": 1.388562832007295e-06, + "loss": 0.62816823, + "num_input_tokens_seen": 218806440, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.09082031, + "step": 10161, + "time_per_iteration": 3.3114585876464844 + }, + { + "auxiliary_loss_clip": 0.01392835, + "auxiliary_loss_mlp": 0.00345508, + "balance_loss_clip": 1.13549519, + "balance_loss_mlp": 0.31141371, + "epoch": 0.6109724936119044, + "flos": 20667848273280.0, + "grad_norm": 21.983055209717882, + "language_loss": 0.8387239, + "learning_rate": 1.3881920285992324e-06, + "loss": 0.85610729, + "num_input_tokens_seen": 218825720, + "router_z_loss_clip": 2.56835938, + "router_z_loss_mlp": 0.34082031, + "step": 10162, + "time_per_iteration": 2.720425605773926 + }, + { + "auxiliary_loss_clip": 0.01402221, + "auxiliary_loss_mlp": 0.00337803, + "balance_loss_clip": 1.14472413, + "balance_loss_mlp": 0.30576, + "epoch": 0.6110326168645723, + "flos": 31351882273920.0, + "grad_norm": 16.10582637719134, + "language_loss": 0.77684319, + "learning_rate": 1.3878212483909888e-06, + "loss": 0.79424345, + "num_input_tokens_seen": 218847735, + "router_z_loss_clip": 2.57617188, + "router_z_loss_mlp": 0.32080078, + "step": 10163, + "time_per_iteration": 2.728424310684204 + }, + { + "auxiliary_loss_clip": 0.01392289, + "auxiliary_loss_mlp": 0.00319579, + "balance_loss_clip": 1.13942516, + "balance_loss_mlp": 0.28797624, + "epoch": 0.6110927401172404, + "flos": 25003378742400.0, + "grad_norm": 14.08357147298663, + "language_loss": 0.66350484, + "learning_rate": 1.387450491396625e-06, + "loss": 0.68062347, + "num_input_tokens_seen": 218866585, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.3157959, + "step": 10164, + "time_per_iteration": 2.673356533050537 + }, + { + "auxiliary_loss_clip": 0.01387773, + "auxiliary_loss_mlp": 0.00316662, + "balance_loss_clip": 1.13360381, + "balance_loss_mlp": 0.28712207, + "epoch": 0.6111528633699083, + "flos": 26248078782720.0, + "grad_norm": 6.007281713504845, + "language_loss": 0.80657876, + "learning_rate": 1.3870797576302003e-06, + "loss": 0.82362306, + "num_input_tokens_seen": 218885560, + "router_z_loss_clip": 2.54492188, + "router_z_loss_mlp": 0.29541016, + "step": 10165, + "time_per_iteration": 2.6646978855133057 + }, + { + "auxiliary_loss_clip": 0.01439314, + "auxiliary_loss_mlp": 0.00290513, + "balance_loss_clip": 1.17762589, + "balance_loss_mlp": 0.26130676, + "epoch": 0.6112129866225763, + "flos": 22382474970240.0, + "grad_norm": 5159.176398691883, + "language_loss": 0.85352325, + "learning_rate": 1.3867090471057719e-06, + "loss": 0.87082154, + "num_input_tokens_seen": 218905055, + "router_z_loss_clip": 2.62109375, + "router_z_loss_mlp": 0.29187012, + "step": 10166, + "time_per_iteration": 2.6702864170074463 + }, + { + "auxiliary_loss_clip": 0.01385, + "auxiliary_loss_mlp": 0.00332517, + "balance_loss_clip": 1.13142872, + "balance_loss_mlp": 0.30121264, + "epoch": 0.6112731098752443, + "flos": 25227892702080.0, + "grad_norm": 8.233390608617913, + "language_loss": 0.75700086, + "learning_rate": 1.3863383598373987e-06, + "loss": 0.774176, + "num_input_tokens_seen": 218924030, + "router_z_loss_clip": 2.53710938, + "router_z_loss_mlp": 0.31274414, + "step": 10167, + "time_per_iteration": 2.694436550140381 + }, + { + "auxiliary_loss_clip": 0.01400558, + "auxiliary_loss_mlp": 0.00308573, + "balance_loss_clip": 1.14807463, + "balance_loss_mlp": 0.27984339, + "epoch": 0.6113332331279122, + "flos": 22893160584960.0, + "grad_norm": 95.12774586983802, + "language_loss": 0.84838176, + "learning_rate": 1.3859676958391364e-06, + "loss": 0.86547309, + "num_input_tokens_seen": 218943750, + "router_z_loss_clip": 2.52539062, + "router_z_loss_mlp": 0.28759766, + "step": 10168, + "time_per_iteration": 2.6640512943267822 + }, + { + "auxiliary_loss_clip": 0.01433216, + "auxiliary_loss_mlp": 0.00342832, + "balance_loss_clip": 1.15721452, + "balance_loss_mlp": 0.3085711, + "epoch": 0.6113933563805802, + "flos": 18620329305600.0, + "grad_norm": 10.04455279760714, + "language_loss": 0.94177383, + "learning_rate": 1.3855970551250398e-06, + "loss": 0.95953429, + "num_input_tokens_seen": 218957585, + "router_z_loss_clip": 2.76171875, + "router_z_loss_mlp": 0.34228516, + "step": 10169, + "time_per_iteration": 2.6380956172943115 + }, + { + "auxiliary_loss_clip": 0.01371984, + "auxiliary_loss_mlp": 0.00305695, + "balance_loss_clip": 1.12437296, + "balance_loss_mlp": 0.2768226, + "epoch": 0.6114534796332481, + "flos": 41866275317760.0, + "grad_norm": 21.76900538953039, + "language_loss": 0.84999937, + "learning_rate": 1.3852264377091652e-06, + "loss": 0.86677611, + "num_input_tokens_seen": 218980025, + "router_z_loss_clip": 2.4765625, + "router_z_loss_mlp": 0.28857422, + "step": 10170, + "time_per_iteration": 2.851030111312866 + }, + { + "auxiliary_loss_clip": 0.01395845, + "auxiliary_loss_mlp": 0.00336631, + "balance_loss_clip": 1.13515723, + "balance_loss_mlp": 0.30277592, + "epoch": 0.6115136028859162, + "flos": 21908454163200.0, + "grad_norm": 178.02249484449067, + "language_loss": 0.75650293, + "learning_rate": 1.3848558436055651e-06, + "loss": 0.77382767, + "num_input_tokens_seen": 218998200, + "router_z_loss_clip": 2.60546875, + "router_z_loss_mlp": 0.33862305, + "step": 10171, + "time_per_iteration": 2.710383415222168 + }, + { + "auxiliary_loss_clip": 0.01417686, + "auxiliary_loss_mlp": 0.00327215, + "balance_loss_clip": 1.15005875, + "balance_loss_mlp": 0.29498059, + "epoch": 0.6115737261385841, + "flos": 28804846821120.0, + "grad_norm": 3.1090078868166375, + "language_loss": 0.85728896, + "learning_rate": 1.3844852728282934e-06, + "loss": 0.87473798, + "num_input_tokens_seen": 219017910, + "router_z_loss_clip": 2.6796875, + "router_z_loss_mlp": 0.32226562, + "step": 10172, + "time_per_iteration": 2.752166748046875 + }, + { + "auxiliary_loss_clip": 0.01402422, + "auxiliary_loss_mlp": 0.00342478, + "balance_loss_clip": 1.13884115, + "balance_loss_mlp": 0.3114596, + "epoch": 0.6116338493912521, + "flos": 21251468453760.0, + "grad_norm": 9.099355395337989, + "language_loss": 0.73096067, + "learning_rate": 1.3841147253914022e-06, + "loss": 0.74840969, + "num_input_tokens_seen": 219037730, + "router_z_loss_clip": 2.63476562, + "router_z_loss_mlp": 0.30981445, + "step": 10173, + "time_per_iteration": 2.793596029281616 + }, + { + "auxiliary_loss_clip": 0.01394196, + "auxiliary_loss_mlp": 0.00339462, + "balance_loss_clip": 1.13551712, + "balance_loss_mlp": 0.30753809, + "epoch": 0.61169397264392, + "flos": 17530189488000.0, + "grad_norm": 61.17207488840093, + "language_loss": 0.64096582, + "learning_rate": 1.3837442013089416e-06, + "loss": 0.65830243, + "num_input_tokens_seen": 219056755, + "router_z_loss_clip": 2.58789062, + "router_z_loss_mlp": 0.31958008, + "step": 10174, + "time_per_iteration": 2.7037415504455566 + }, + { + "auxiliary_loss_clip": 0.0141137, + "auxiliary_loss_mlp": 0.00326254, + "balance_loss_clip": 1.15074873, + "balance_loss_mlp": 0.29450858, + "epoch": 0.611754095896588, + "flos": 23951555758080.0, + "grad_norm": 305.5275609759678, + "language_loss": 0.71752936, + "learning_rate": 1.3833737005949628e-06, + "loss": 0.7349056, + "num_input_tokens_seen": 219076985, + "router_z_loss_clip": 2.60546875, + "router_z_loss_mlp": 0.31750488, + "step": 10175, + "time_per_iteration": 2.6987216472625732 + }, + { + "auxiliary_loss_clip": 0.01372154, + "auxiliary_loss_mlp": 0.00318909, + "balance_loss_clip": 1.12543678, + "balance_loss_mlp": 0.28910619, + "epoch": 0.6118142191492559, + "flos": 25994872834560.0, + "grad_norm": 13.208036683213807, + "language_loss": 0.89972198, + "learning_rate": 1.3830032232635154e-06, + "loss": 0.91663259, + "num_input_tokens_seen": 219096050, + "router_z_loss_clip": 2.46484375, + "router_z_loss_mlp": 0.29785156, + "step": 10176, + "time_per_iteration": 2.695727825164795 + }, + { + "auxiliary_loss_clip": 0.01399556, + "auxiliary_loss_mlp": 0.00319446, + "balance_loss_clip": 1.14340019, + "balance_loss_mlp": 0.28776032, + "epoch": 0.611874342401924, + "flos": 24603190341120.0, + "grad_norm": 120.91008113790944, + "language_loss": 0.83466119, + "learning_rate": 1.3826327693286474e-06, + "loss": 0.85185122, + "num_input_tokens_seen": 219112665, + "router_z_loss_clip": 2.5625, + "router_z_loss_mlp": 0.31689453, + "step": 10177, + "time_per_iteration": 2.6574885845184326 + }, + { + "auxiliary_loss_clip": 0.01412148, + "auxiliary_loss_mlp": 0.00322645, + "balance_loss_clip": 1.15259647, + "balance_loss_mlp": 0.29103082, + "epoch": 0.6119344656545919, + "flos": 15887132640000.0, + "grad_norm": 11.108774757286444, + "language_loss": 0.83411562, + "learning_rate": 1.3822623388044065e-06, + "loss": 0.85146356, + "num_input_tokens_seen": 219129120, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.31640625, + "step": 10178, + "time_per_iteration": 2.728905439376831 + }, + { + "auxiliary_loss_clip": 0.01402324, + "auxiliary_loss_mlp": 0.00326735, + "balance_loss_clip": 1.14457607, + "balance_loss_mlp": 0.29619318, + "epoch": 0.6119945889072599, + "flos": 21652877917440.0, + "grad_norm": 43.962402011027905, + "language_loss": 0.75148714, + "learning_rate": 1.3818919317048402e-06, + "loss": 0.76877779, + "num_input_tokens_seen": 219148950, + "router_z_loss_clip": 2.578125, + "router_z_loss_mlp": 0.30541992, + "step": 10179, + "time_per_iteration": 2.66497802734375 + }, + { + "auxiliary_loss_clip": 0.01374543, + "auxiliary_loss_mlp": 0.00328511, + "balance_loss_clip": 1.12268043, + "balance_loss_mlp": 0.29718244, + "epoch": 0.6120547121599279, + "flos": 13772533023360.0, + "grad_norm": 9.76193333277153, + "language_loss": 0.91107917, + "learning_rate": 1.3815215480439933e-06, + "loss": 0.92810971, + "num_input_tokens_seen": 219165585, + "router_z_loss_clip": 2.51953125, + "router_z_loss_mlp": 0.31323242, + "step": 10180, + "time_per_iteration": 2.636752128601074 + }, + { + "auxiliary_loss_clip": 0.01403594, + "auxiliary_loss_mlp": 0.00332982, + "balance_loss_clip": 1.1463356, + "balance_loss_mlp": 0.29943642, + "epoch": 0.6121148354125958, + "flos": 20079164275200.0, + "grad_norm": 100.37943296545427, + "language_loss": 0.83140588, + "learning_rate": 1.3811511878359113e-06, + "loss": 0.84877169, + "num_input_tokens_seen": 219183280, + "router_z_loss_clip": 2.57421875, + "router_z_loss_mlp": 0.33544922, + "step": 10181, + "time_per_iteration": 2.666895627975464 + }, + { + "auxiliary_loss_clip": 0.01406338, + "auxiliary_loss_mlp": 0.0031145, + "balance_loss_clip": 1.14437795, + "balance_loss_mlp": 0.28012139, + "epoch": 0.6121749586652638, + "flos": 13471313569920.0, + "grad_norm": 17.025039314181704, + "language_loss": 0.88203812, + "learning_rate": 1.3807808510946384e-06, + "loss": 0.899216, + "num_input_tokens_seen": 219197200, + "router_z_loss_clip": 2.61914062, + "router_z_loss_mlp": 0.31323242, + "step": 10182, + "time_per_iteration": 2.6499643325805664 + }, + { + "auxiliary_loss_clip": 0.01376639, + "auxiliary_loss_mlp": 0.00287353, + "balance_loss_clip": 1.12909746, + "balance_loss_mlp": 0.25889724, + "epoch": 0.6122350819179317, + "flos": 20120533764480.0, + "grad_norm": 4.3045141785300105, + "language_loss": 0.87879872, + "learning_rate": 1.3804105378342177e-06, + "loss": 0.89543861, + "num_input_tokens_seen": 219216825, + "router_z_loss_clip": 2.47265625, + "router_z_loss_mlp": 0.28430176, + "step": 10183, + "time_per_iteration": 2.6414413452148438 + }, + { + "auxiliary_loss_clip": 0.01472288, + "auxiliary_loss_mlp": 0.00068482, + "balance_loss_clip": 1.26163709, + "balance_loss_mlp": 0.05856404, + "epoch": 0.6122952051705998, + "flos": 65429242767360.0, + "grad_norm": 0.7060268937869889, + "language_loss": 0.61983848, + "learning_rate": 1.3800402480686914e-06, + "loss": 0.63524616, + "num_input_tokens_seen": 219283795, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.09912109, + "step": 10184, + "time_per_iteration": 3.3234522342681885 + }, + { + "auxiliary_loss_clip": 0.01391733, + "auxiliary_loss_mlp": 0.00311602, + "balance_loss_clip": 1.13920689, + "balance_loss_mlp": 0.28214517, + "epoch": 0.6123553284232677, + "flos": 20376253664640.0, + "grad_norm": 36.10985644557028, + "language_loss": 0.90229249, + "learning_rate": 1.379669981812101e-06, + "loss": 0.91932583, + "num_input_tokens_seen": 219302385, + "router_z_loss_clip": 2.52539062, + "router_z_loss_mlp": 0.29455566, + "step": 10185, + "time_per_iteration": 2.6838324069976807 + }, + { + "auxiliary_loss_clip": 0.01420831, + "auxiliary_loss_mlp": 0.00313528, + "balance_loss_clip": 1.15670311, + "balance_loss_mlp": 0.28134179, + "epoch": 0.6124154516759357, + "flos": 23987645948160.0, + "grad_norm": 7.497138232067033, + "language_loss": 0.82583523, + "learning_rate": 1.3792997390784868e-06, + "loss": 0.84317881, + "num_input_tokens_seen": 219319765, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.32202148, + "step": 10186, + "time_per_iteration": 2.6775832176208496 + }, + { + "auxiliary_loss_clip": 0.01366989, + "auxiliary_loss_mlp": 0.0031782, + "balance_loss_clip": 1.12242603, + "balance_loss_mlp": 0.28880483, + "epoch": 0.6124755749286036, + "flos": 21468799693440.0, + "grad_norm": 40.40421837782595, + "language_loss": 0.83104181, + "learning_rate": 1.3789295198818895e-06, + "loss": 0.8478899, + "num_input_tokens_seen": 219337440, + "router_z_loss_clip": 2.44921875, + "router_z_loss_mlp": 0.29016113, + "step": 10187, + "time_per_iteration": 2.6952874660491943 + }, + { + "auxiliary_loss_clip": 0.01407499, + "auxiliary_loss_mlp": 0.00317177, + "balance_loss_clip": 1.14778745, + "balance_loss_mlp": 0.28696954, + "epoch": 0.6125356981812716, + "flos": 23879195809920.0, + "grad_norm": 47.82696632771355, + "language_loss": 0.87675321, + "learning_rate": 1.3785593242363462e-06, + "loss": 0.89399993, + "num_input_tokens_seen": 219357525, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.30200195, + "step": 10188, + "time_per_iteration": 2.70984148979187 + }, + { + "auxiliary_loss_clip": 0.01389049, + "auxiliary_loss_mlp": 0.00325026, + "balance_loss_clip": 1.1384604, + "balance_loss_mlp": 0.29334044, + "epoch": 0.6125958214339395, + "flos": 14425604150400.0, + "grad_norm": 19.979920938196226, + "language_loss": 0.83503169, + "learning_rate": 1.378189152155896e-06, + "loss": 0.85217243, + "num_input_tokens_seen": 219374855, + "router_z_loss_clip": 2.50976562, + "router_z_loss_mlp": 0.31689453, + "step": 10189, + "time_per_iteration": 2.68572998046875 + }, + { + "auxiliary_loss_clip": 0.01402527, + "auxiliary_loss_mlp": 0.00324729, + "balance_loss_clip": 1.14764977, + "balance_loss_mlp": 0.29439047, + "epoch": 0.6126559446866076, + "flos": 23259090389760.0, + "grad_norm": 76.68403760737405, + "language_loss": 0.78213727, + "learning_rate": 1.3778190036545758e-06, + "loss": 0.79940987, + "num_input_tokens_seen": 219394740, + "router_z_loss_clip": 2.54492188, + "router_z_loss_mlp": 0.30334473, + "step": 10190, + "time_per_iteration": 2.6652724742889404 + }, + { + "auxiliary_loss_clip": 0.01404074, + "auxiliary_loss_mlp": 0.00307764, + "balance_loss_clip": 1.14471745, + "balance_loss_mlp": 0.27700794, + "epoch": 0.6127160679392755, + "flos": 26864808324480.0, + "grad_norm": 20.248031896861217, + "language_loss": 0.74499094, + "learning_rate": 1.3774488787464207e-06, + "loss": 0.76210928, + "num_input_tokens_seen": 219413755, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.30761719, + "step": 10191, + "time_per_iteration": 4.118507385253906 + }, + { + "auxiliary_loss_clip": 0.01415043, + "auxiliary_loss_mlp": 0.00304729, + "balance_loss_clip": 1.14872169, + "balance_loss_mlp": 0.27220884, + "epoch": 0.6127761911919435, + "flos": 26396425952640.0, + "grad_norm": 215.91755173146439, + "language_loss": 0.82921231, + "learning_rate": 1.377078777445467e-06, + "loss": 0.84641004, + "num_input_tokens_seen": 219433560, + "router_z_loss_clip": 2.66015625, + "router_z_loss_mlp": 0.32519531, + "step": 10192, + "time_per_iteration": 4.1279003620147705 + }, + { + "auxiliary_loss_clip": 0.01414912, + "auxiliary_loss_mlp": 0.00316177, + "balance_loss_clip": 1.1577605, + "balance_loss_mlp": 0.28556389, + "epoch": 0.6128363144446115, + "flos": 22634747164800.0, + "grad_norm": 10.987532618968952, + "language_loss": 0.91648179, + "learning_rate": 1.3767086997657478e-06, + "loss": 0.93379271, + "num_input_tokens_seen": 219452640, + "router_z_loss_clip": 2.56835938, + "router_z_loss_mlp": 0.3059082, + "step": 10193, + "time_per_iteration": 2.651946544647217 + }, + { + "auxiliary_loss_clip": 0.01408802, + "auxiliary_loss_mlp": 0.00309728, + "balance_loss_clip": 1.15031171, + "balance_loss_mlp": 0.2798067, + "epoch": 0.6128964376972794, + "flos": 26759051706240.0, + "grad_norm": 5.121694639487218, + "language_loss": 0.78058589, + "learning_rate": 1.3763386457212979e-06, + "loss": 0.79777122, + "num_input_tokens_seen": 219468585, + "router_z_loss_clip": 2.58398438, + "router_z_loss_mlp": 0.29931641, + "step": 10194, + "time_per_iteration": 2.689164400100708 + }, + { + "auxiliary_loss_clip": 0.01482105, + "auxiliary_loss_mlp": 0.00109229, + "balance_loss_clip": 1.27324152, + "balance_loss_mlp": 0.10002613, + "epoch": 0.6129565609499474, + "flos": 65567929178880.0, + "grad_norm": 0.8153659682324801, + "language_loss": 0.58600307, + "learning_rate": 1.375968615326149e-06, + "loss": 0.60191637, + "num_input_tokens_seen": 219523015, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.09179688, + "step": 10195, + "time_per_iteration": 2.9926364421844482 + }, + { + "auxiliary_loss_clip": 0.01407985, + "auxiliary_loss_mlp": 0.00305516, + "balance_loss_clip": 1.15236664, + "balance_loss_mlp": 0.27435449, + "epoch": 0.6130166842026153, + "flos": 16362087200640.0, + "grad_norm": 30.182693182359465, + "language_loss": 0.76531321, + "learning_rate": 1.3755986085943324e-06, + "loss": 0.78244817, + "num_input_tokens_seen": 219539980, + "router_z_loss_clip": 2.55664062, + "router_z_loss_mlp": 0.31152344, + "step": 10196, + "time_per_iteration": 4.09409236907959 + }, + { + "auxiliary_loss_clip": 0.01413547, + "auxiliary_loss_mlp": 0.0032627, + "balance_loss_clip": 1.15582013, + "balance_loss_mlp": 0.29581243, + "epoch": 0.6130768074552834, + "flos": 23652455207040.0, + "grad_norm": 39.7341049056273, + "language_loss": 0.77832603, + "learning_rate": 1.3752286255398788e-06, + "loss": 0.79572415, + "num_input_tokens_seen": 219556980, + "router_z_loss_clip": 2.578125, + "router_z_loss_mlp": 0.30456543, + "step": 10197, + "time_per_iteration": 2.70631742477417 + }, + { + "auxiliary_loss_clip": 0.01413531, + "auxiliary_loss_mlp": 0.00313255, + "balance_loss_clip": 1.15420318, + "balance_loss_mlp": 0.2823084, + "epoch": 0.6131369307079513, + "flos": 20047455544320.0, + "grad_norm": 32.023290801156, + "language_loss": 0.85849857, + "learning_rate": 1.3748586661768191e-06, + "loss": 0.8757664, + "num_input_tokens_seen": 219576410, + "router_z_loss_clip": 2.58984375, + "router_z_loss_mlp": 0.30957031, + "step": 10198, + "time_per_iteration": 2.7635929584503174 + }, + { + "auxiliary_loss_clip": 0.01401386, + "auxiliary_loss_mlp": 0.0033593, + "balance_loss_clip": 1.1452744, + "balance_loss_mlp": 0.30309987, + "epoch": 0.6131970539606193, + "flos": 22672166158080.0, + "grad_norm": 22.603215981633642, + "language_loss": 0.77934039, + "learning_rate": 1.374488730519181e-06, + "loss": 0.79671353, + "num_input_tokens_seen": 219597180, + "router_z_loss_clip": 2.5625, + "router_z_loss_mlp": 0.32836914, + "step": 10199, + "time_per_iteration": 2.741129159927368 + }, + { + "auxiliary_loss_clip": 0.01420889, + "auxiliary_loss_mlp": 0.00362574, + "balance_loss_clip": 1.15520036, + "balance_loss_mlp": 0.32857585, + "epoch": 0.6132571772132872, + "flos": 26870913636480.0, + "grad_norm": 5.279650747972043, + "language_loss": 0.69432974, + "learning_rate": 1.374118818580993e-06, + "loss": 0.7121644, + "num_input_tokens_seen": 219617630, + "router_z_loss_clip": 2.65820312, + "router_z_loss_mlp": 0.34008789, + "step": 10200, + "time_per_iteration": 4.195077657699585 + }, + { + "auxiliary_loss_clip": 0.01402688, + "auxiliary_loss_mlp": 0.00311545, + "balance_loss_clip": 1.14921415, + "balance_loss_mlp": 0.28126615, + "epoch": 0.6133173004659552, + "flos": 22892657794560.0, + "grad_norm": 31.65163013511999, + "language_loss": 0.7593872, + "learning_rate": 1.3737489303762822e-06, + "loss": 0.77652955, + "num_input_tokens_seen": 219637025, + "router_z_loss_clip": 2.53320312, + "router_z_loss_mlp": 0.30273438, + "step": 10201, + "time_per_iteration": 2.72731614112854 + }, + { + "auxiliary_loss_clip": 0.01406002, + "auxiliary_loss_mlp": 0.0032966, + "balance_loss_clip": 1.14888859, + "balance_loss_mlp": 0.29694897, + "epoch": 0.6133774237186231, + "flos": 20485098852480.0, + "grad_norm": 358.3052255003385, + "language_loss": 0.90342724, + "learning_rate": 1.3733790659190746e-06, + "loss": 0.92078388, + "num_input_tokens_seen": 219656625, + "router_z_loss_clip": 2.5703125, + "router_z_loss_mlp": 0.32714844, + "step": 10202, + "time_per_iteration": 2.7551000118255615 + }, + { + "auxiliary_loss_clip": 0.01488896, + "auxiliary_loss_mlp": 0.00085086, + "balance_loss_clip": 1.27357364, + "balance_loss_mlp": 0.07612149, + "epoch": 0.6134375469712912, + "flos": 69413065217280.0, + "grad_norm": 0.8813776463294625, + "language_loss": 0.66673565, + "learning_rate": 1.3730092252233953e-06, + "loss": 0.68247545, + "num_input_tokens_seen": 219718090, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.08984375, + "step": 10203, + "time_per_iteration": 3.2225589752197266 + }, + { + "auxiliary_loss_clip": 0.013856, + "auxiliary_loss_mlp": 0.00302976, + "balance_loss_clip": 1.13506126, + "balance_loss_mlp": 0.27400821, + "epoch": 0.6134976702239591, + "flos": 41281541815680.0, + "grad_norm": 3.5918123458233477, + "language_loss": 0.67286181, + "learning_rate": 1.37263940830327e-06, + "loss": 0.68974757, + "num_input_tokens_seen": 219740100, + "router_z_loss_clip": 2.50390625, + "router_z_loss_mlp": 0.28955078, + "step": 10204, + "time_per_iteration": 2.8573427200317383 + }, + { + "auxiliary_loss_clip": 0.01400168, + "auxiliary_loss_mlp": 0.00277333, + "balance_loss_clip": 1.14650261, + "balance_loss_mlp": 0.24908018, + "epoch": 0.6135577934766271, + "flos": 22346600261760.0, + "grad_norm": 6.97235840907787, + "language_loss": 0.78757811, + "learning_rate": 1.3722696151727204e-06, + "loss": 0.80435312, + "num_input_tokens_seen": 219761225, + "router_z_loss_clip": 2.53710938, + "router_z_loss_mlp": 0.28283691, + "step": 10205, + "time_per_iteration": 2.7259349822998047 + }, + { + "auxiliary_loss_clip": 0.01394998, + "auxiliary_loss_mlp": 0.00311779, + "balance_loss_clip": 1.1420083, + "balance_loss_mlp": 0.2827996, + "epoch": 0.6136179167292951, + "flos": 23728155120000.0, + "grad_norm": 32.98834682429746, + "language_loss": 0.82571387, + "learning_rate": 1.3718998458457701e-06, + "loss": 0.84278166, + "num_input_tokens_seen": 219780085, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.28979492, + "step": 10206, + "time_per_iteration": 2.680375814437866 + }, + { + "auxiliary_loss_clip": 0.0138903, + "auxiliary_loss_mlp": 0.00307091, + "balance_loss_clip": 1.13589489, + "balance_loss_mlp": 0.27578691, + "epoch": 0.613678039981963, + "flos": 26024678144640.0, + "grad_norm": 50.89342108462829, + "language_loss": 0.82410944, + "learning_rate": 1.3715301003364407e-06, + "loss": 0.84107059, + "num_input_tokens_seen": 219797895, + "router_z_loss_clip": 2.53515625, + "router_z_loss_mlp": 0.31298828, + "step": 10207, + "time_per_iteration": 2.7162060737609863 + }, + { + "auxiliary_loss_clip": 0.01397691, + "auxiliary_loss_mlp": 0.00333475, + "balance_loss_clip": 1.14297175, + "balance_loss_mlp": 0.30187249, + "epoch": 0.613738163234631, + "flos": 9859957200000.0, + "grad_norm": 8.690659942166308, + "language_loss": 0.89909399, + "learning_rate": 1.3711603786587525e-06, + "loss": 0.91640562, + "num_input_tokens_seen": 219811295, + "router_z_loss_clip": 2.54492188, + "router_z_loss_mlp": 0.3157959, + "step": 10208, + "time_per_iteration": 2.6198158264160156 + }, + { + "auxiliary_loss_clip": 0.01412777, + "auxiliary_loss_mlp": 0.00322341, + "balance_loss_clip": 1.14839792, + "balance_loss_mlp": 0.28979695, + "epoch": 0.613798286487299, + "flos": 33182070001920.0, + "grad_norm": 89.01791566024771, + "language_loss": 0.79643828, + "learning_rate": 1.3707906808267265e-06, + "loss": 0.81378949, + "num_input_tokens_seen": 219832735, + "router_z_loss_clip": 2.64648438, + "router_z_loss_mlp": 0.32543945, + "step": 10209, + "time_per_iteration": 2.827317476272583 + }, + { + "auxiliary_loss_clip": 0.01374173, + "auxiliary_loss_mlp": 0.00284529, + "balance_loss_clip": 1.12344694, + "balance_loss_mlp": 0.25614551, + "epoch": 0.613858409739967, + "flos": 25627901535360.0, + "grad_norm": 11.17945399522078, + "language_loss": 0.80704725, + "learning_rate": 1.37042100685438e-06, + "loss": 0.82363427, + "num_input_tokens_seen": 219852755, + "router_z_loss_clip": 2.50390625, + "router_z_loss_mlp": 0.28381348, + "step": 10210, + "time_per_iteration": 2.7028067111968994 + }, + { + "auxiliary_loss_clip": 0.01474995, + "auxiliary_loss_mlp": 0.0005603, + "balance_loss_clip": 1.26476407, + "balance_loss_mlp": 0.04854394, + "epoch": 0.6139185329926349, + "flos": 67192313932800.0, + "grad_norm": 0.8525612326084457, + "language_loss": 0.64536607, + "learning_rate": 1.3700513567557325e-06, + "loss": 0.66067624, + "num_input_tokens_seen": 219922785, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.07470703, + "step": 10211, + "time_per_iteration": 3.334641933441162 + }, + { + "auxiliary_loss_clip": 0.01395349, + "auxiliary_loss_mlp": 0.00306556, + "balance_loss_clip": 1.14203405, + "balance_loss_mlp": 0.27727863, + "epoch": 0.6139786562453029, + "flos": 21543637680000.0, + "grad_norm": 2.9019121054602626, + "language_loss": 0.81712282, + "learning_rate": 1.369681730544801e-06, + "loss": 0.83414185, + "num_input_tokens_seen": 219942215, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.29321289, + "step": 10212, + "time_per_iteration": 2.6846156120300293 + }, + { + "auxiliary_loss_clip": 0.01393795, + "auxiliary_loss_mlp": 0.00286489, + "balance_loss_clip": 1.14456344, + "balance_loss_mlp": 0.25733086, + "epoch": 0.6140387794979708, + "flos": 26068489758720.0, + "grad_norm": 11.928191321990656, + "language_loss": 0.7992453, + "learning_rate": 1.3693121282356009e-06, + "loss": 0.81604815, + "num_input_tokens_seen": 219963830, + "router_z_loss_clip": 2.49609375, + "router_z_loss_mlp": 0.29174805, + "step": 10213, + "time_per_iteration": 2.731318235397339 + }, + { + "auxiliary_loss_clip": 0.01424394, + "auxiliary_loss_mlp": 0.0032757, + "balance_loss_clip": 1.15732121, + "balance_loss_mlp": 0.29526412, + "epoch": 0.6140989027506388, + "flos": 23694614795520.0, + "grad_norm": 5.183432008459838, + "language_loss": 0.81250525, + "learning_rate": 1.3689425498421483e-06, + "loss": 0.83002484, + "num_input_tokens_seen": 219983815, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.32348633, + "step": 10214, + "time_per_iteration": 2.7029788494110107 + }, + { + "auxiliary_loss_clip": 0.01403749, + "auxiliary_loss_mlp": 0.00294793, + "balance_loss_clip": 1.14701986, + "balance_loss_mlp": 0.2654677, + "epoch": 0.6141590260033067, + "flos": 22231721589120.0, + "grad_norm": 2.0699245643812794, + "language_loss": 0.80723977, + "learning_rate": 1.3685729953784572e-06, + "loss": 0.82422513, + "num_input_tokens_seen": 220003165, + "router_z_loss_clip": 2.57226562, + "router_z_loss_mlp": 0.29309082, + "step": 10215, + "time_per_iteration": 2.7386810779571533 + }, + { + "auxiliary_loss_clip": 0.01395768, + "auxiliary_loss_mlp": 0.00312357, + "balance_loss_clip": 1.14141893, + "balance_loss_mlp": 0.28172037, + "epoch": 0.6142191492559748, + "flos": 23871653953920.0, + "grad_norm": 13.015180909257753, + "language_loss": 0.86385763, + "learning_rate": 1.368203464858542e-06, + "loss": 0.88093889, + "num_input_tokens_seen": 220021015, + "router_z_loss_clip": 2.54101562, + "router_z_loss_mlp": 0.30639648, + "step": 10216, + "time_per_iteration": 2.7087652683258057 + }, + { + "auxiliary_loss_clip": 0.01402498, + "auxiliary_loss_mlp": 0.00331425, + "balance_loss_clip": 1.14583206, + "balance_loss_mlp": 0.29730716, + "epoch": 0.6142792725086427, + "flos": 15042513260160.0, + "grad_norm": 14.836003221330513, + "language_loss": 0.8555876, + "learning_rate": 1.3678339582964147e-06, + "loss": 0.87292683, + "num_input_tokens_seen": 220035780, + "router_z_loss_clip": 2.56445312, + "router_z_loss_mlp": 0.34106445, + "step": 10217, + "time_per_iteration": 2.633204460144043 + }, + { + "auxiliary_loss_clip": 0.01406709, + "auxiliary_loss_mlp": 0.00333101, + "balance_loss_clip": 1.14774239, + "balance_loss_mlp": 0.29915029, + "epoch": 0.6143393957613107, + "flos": 23330947547520.0, + "grad_norm": 7.451417629783373, + "language_loss": 0.87003624, + "learning_rate": 1.3674644757060865e-06, + "loss": 0.88743436, + "num_input_tokens_seen": 220054280, + "router_z_loss_clip": 2.58984375, + "router_z_loss_mlp": 0.33984375, + "step": 10218, + "time_per_iteration": 2.667884588241577 + }, + { + "auxiliary_loss_clip": 0.01402811, + "auxiliary_loss_mlp": 0.00312001, + "balance_loss_clip": 1.14686739, + "balance_loss_mlp": 0.28255653, + "epoch": 0.6143995190139786, + "flos": 20117086058880.0, + "grad_norm": 21.823206040943585, + "language_loss": 0.87114942, + "learning_rate": 1.367095017101569e-06, + "loss": 0.88829756, + "num_input_tokens_seen": 220074120, + "router_z_loss_clip": 2.5625, + "router_z_loss_mlp": 0.29467773, + "step": 10219, + "time_per_iteration": 2.7967169284820557 + }, + { + "auxiliary_loss_clip": 0.01389444, + "auxiliary_loss_mlp": 0.00288302, + "balance_loss_clip": 1.13724744, + "balance_loss_mlp": 0.26028833, + "epoch": 0.6144596422666466, + "flos": 42303559489920.0, + "grad_norm": 1693.8833671428456, + "language_loss": 0.75078511, + "learning_rate": 1.3667255824968717e-06, + "loss": 0.76756251, + "num_input_tokens_seen": 220096320, + "router_z_loss_clip": 2.52148438, + "router_z_loss_mlp": 0.2800293, + "step": 10220, + "time_per_iteration": 2.8495051860809326 + }, + { + "auxiliary_loss_clip": 0.01397879, + "auxiliary_loss_mlp": 0.00302743, + "balance_loss_clip": 1.14342809, + "balance_loss_mlp": 0.27165362, + "epoch": 0.6145197655193146, + "flos": 21573622558080.0, + "grad_norm": 196.16123100509924, + "language_loss": 0.79424471, + "learning_rate": 1.3663561719060041e-06, + "loss": 0.81125093, + "num_input_tokens_seen": 220114850, + "router_z_loss_clip": 2.54882812, + "router_z_loss_mlp": 0.31103516, + "step": 10221, + "time_per_iteration": 2.702946186065674 + }, + { + "auxiliary_loss_clip": 0.01371008, + "auxiliary_loss_mlp": 0.00280897, + "balance_loss_clip": 1.12811971, + "balance_loss_mlp": 0.25269243, + "epoch": 0.6145798887719826, + "flos": 21471098163840.0, + "grad_norm": 28.36384682868476, + "language_loss": 0.85059416, + "learning_rate": 1.3659867853429735e-06, + "loss": 0.86711323, + "num_input_tokens_seen": 220133395, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.28222656, + "step": 10222, + "time_per_iteration": 2.669923782348633 + }, + { + "auxiliary_loss_clip": 0.0138599, + "auxiliary_loss_mlp": 0.00292334, + "balance_loss_clip": 1.12894678, + "balance_loss_mlp": 0.26195917, + "epoch": 0.6146400120246506, + "flos": 20777016683520.0, + "grad_norm": 7.297915762188477, + "language_loss": 0.85951918, + "learning_rate": 1.365617422821788e-06, + "loss": 0.87630248, + "num_input_tokens_seen": 220152790, + "router_z_loss_clip": 2.5703125, + "router_z_loss_mlp": 0.30395508, + "step": 10223, + "time_per_iteration": 2.6733436584472656 + }, + { + "auxiliary_loss_clip": 0.01394828, + "auxiliary_loss_mlp": 0.00281297, + "balance_loss_clip": 1.14525664, + "balance_loss_mlp": 0.25181693, + "epoch": 0.6147001352773185, + "flos": 13881306384000.0, + "grad_norm": 51.79091811223888, + "language_loss": 0.87256837, + "learning_rate": 1.3652480843564535e-06, + "loss": 0.88932955, + "num_input_tokens_seen": 220169535, + "router_z_loss_clip": 2.49804688, + "router_z_loss_mlp": 0.29467773, + "step": 10224, + "time_per_iteration": 2.6552085876464844 + }, + { + "auxiliary_loss_clip": 0.01387349, + "auxiliary_loss_mlp": 0.00327948, + "balance_loss_clip": 1.1399436, + "balance_loss_mlp": 0.29766864, + "epoch": 0.6147602585299865, + "flos": 56641791807360.0, + "grad_norm": 4.521746423717136, + "language_loss": 0.70374733, + "learning_rate": 1.3648787699609746e-06, + "loss": 0.7209003, + "num_input_tokens_seen": 220195305, + "router_z_loss_clip": 2.4765625, + "router_z_loss_mlp": 0.3026123, + "step": 10225, + "time_per_iteration": 3.0338363647460938 + }, + { + "auxiliary_loss_clip": 0.01407127, + "auxiliary_loss_mlp": 0.00318185, + "balance_loss_clip": 1.1461817, + "balance_loss_mlp": 0.28608176, + "epoch": 0.6148203817826544, + "flos": 32817217605120.0, + "grad_norm": 16.333622333045568, + "language_loss": 0.71716499, + "learning_rate": 1.364509479649357e-06, + "loss": 0.73441815, + "num_input_tokens_seen": 220215040, + "router_z_loss_clip": 2.60742188, + "router_z_loss_mlp": 0.32104492, + "step": 10226, + "time_per_iteration": 2.7693545818328857 + }, + { + "auxiliary_loss_clip": 0.01376049, + "auxiliary_loss_mlp": 0.00314174, + "balance_loss_clip": 1.12788332, + "balance_loss_mlp": 0.28260696, + "epoch": 0.6148805050353224, + "flos": 18332038748160.0, + "grad_norm": 5.296876212244825, + "language_loss": 0.8237431, + "learning_rate": 1.3641402134356037e-06, + "loss": 0.84064531, + "num_input_tokens_seen": 220234205, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.31542969, + "step": 10227, + "time_per_iteration": 2.6796963214874268 + }, + { + "auxiliary_loss_clip": 0.01403947, + "auxiliary_loss_mlp": 0.00330376, + "balance_loss_clip": 1.14189124, + "balance_loss_mlp": 0.29671121, + "epoch": 0.6149406282879903, + "flos": 14063983977600.0, + "grad_norm": 117.01235407722645, + "language_loss": 0.75855792, + "learning_rate": 1.3637709713337164e-06, + "loss": 0.77590108, + "num_input_tokens_seen": 220252730, + "router_z_loss_clip": 2.61914062, + "router_z_loss_mlp": 0.33666992, + "step": 10228, + "time_per_iteration": 2.62235689163208 + }, + { + "auxiliary_loss_clip": 0.01362434, + "auxiliary_loss_mlp": 0.00308052, + "balance_loss_clip": 1.12151074, + "balance_loss_mlp": 0.27817863, + "epoch": 0.6150007515406584, + "flos": 25190186400000.0, + "grad_norm": 8.321035014720488, + "language_loss": 0.79614115, + "learning_rate": 1.3634017533576985e-06, + "loss": 0.812846, + "num_input_tokens_seen": 220273345, + "router_z_loss_clip": 2.41601562, + "router_z_loss_mlp": 0.29882812, + "step": 10229, + "time_per_iteration": 2.731825113296509 + }, + { + "auxiliary_loss_clip": 0.0140363, + "auxiliary_loss_mlp": 0.00299313, + "balance_loss_clip": 1.1514101, + "balance_loss_mlp": 0.26839042, + "epoch": 0.6150608747933263, + "flos": 21945262625280.0, + "grad_norm": 70.30550121399833, + "language_loss": 0.84539586, + "learning_rate": 1.3630325595215493e-06, + "loss": 0.86242533, + "num_input_tokens_seen": 220293845, + "router_z_loss_clip": 2.515625, + "router_z_loss_mlp": 0.30932617, + "step": 10230, + "time_per_iteration": 2.7254226207733154 + }, + { + "auxiliary_loss_clip": 0.01387247, + "auxiliary_loss_mlp": 0.00318839, + "balance_loss_clip": 1.13332534, + "balance_loss_mlp": 0.28814244, + "epoch": 0.6151209980459943, + "flos": 30117453523200.0, + "grad_norm": 109.68028173697545, + "language_loss": 0.80185747, + "learning_rate": 1.36266338983927e-06, + "loss": 0.81891835, + "num_input_tokens_seen": 220316070, + "router_z_loss_clip": 2.53515625, + "router_z_loss_mlp": 0.30700684, + "step": 10231, + "time_per_iteration": 2.742785930633545 + }, + { + "auxiliary_loss_clip": 0.01379086, + "auxiliary_loss_mlp": 0.00282895, + "balance_loss_clip": 1.12908816, + "balance_loss_mlp": 0.25364131, + "epoch": 0.6151811212986622, + "flos": 30008356940160.0, + "grad_norm": 5.540551093458924, + "language_loss": 0.78180552, + "learning_rate": 1.362294244324858e-06, + "loss": 0.79842532, + "num_input_tokens_seen": 220335695, + "router_z_loss_clip": 2.50195312, + "router_z_loss_mlp": 0.29211426, + "step": 10232, + "time_per_iteration": 2.735069513320923 + }, + { + "auxiliary_loss_clip": 0.01372102, + "auxiliary_loss_mlp": 0.00302616, + "balance_loss_clip": 1.13018966, + "balance_loss_mlp": 0.27439919, + "epoch": 0.6152412445513302, + "flos": 18872888808960.0, + "grad_norm": 14.28632563718053, + "language_loss": 0.97926742, + "learning_rate": 1.3619251229923126e-06, + "loss": 0.9960146, + "num_input_tokens_seen": 220353720, + "router_z_loss_clip": 2.41601562, + "router_z_loss_mlp": 0.28259277, + "step": 10233, + "time_per_iteration": 4.036175012588501 + }, + { + "auxiliary_loss_clip": 0.01399026, + "auxiliary_loss_mlp": 0.00294072, + "balance_loss_clip": 1.14431763, + "balance_loss_mlp": 0.26479372, + "epoch": 0.6153013678039982, + "flos": 25703601448320.0, + "grad_norm": 148.91879731053226, + "language_loss": 0.78029811, + "learning_rate": 1.3615560258556306e-06, + "loss": 0.79722905, + "num_input_tokens_seen": 220372515, + "router_z_loss_clip": 2.54882812, + "router_z_loss_mlp": 0.29272461, + "step": 10234, + "time_per_iteration": 2.7076327800750732 + }, + { + "auxiliary_loss_clip": 0.01381902, + "auxiliary_loss_mlp": 0.00323823, + "balance_loss_clip": 1.12982035, + "balance_loss_mlp": 0.29304266, + "epoch": 0.6153614910566662, + "flos": 28510271383680.0, + "grad_norm": 26.137144006190223, + "language_loss": 0.7423653, + "learning_rate": 1.3611869529288077e-06, + "loss": 0.7594226, + "num_input_tokens_seen": 220393490, + "router_z_loss_clip": 2.51953125, + "router_z_loss_mlp": 0.30786133, + "step": 10235, + "time_per_iteration": 4.193690776824951 + }, + { + "auxiliary_loss_clip": 0.01377185, + "auxiliary_loss_mlp": 0.00318389, + "balance_loss_clip": 1.12294006, + "balance_loss_mlp": 0.28751379, + "epoch": 0.6154216143093342, + "flos": 23549787158400.0, + "grad_norm": 13.37427996547943, + "language_loss": 0.89442253, + "learning_rate": 1.3608179042258398e-06, + "loss": 0.91137826, + "num_input_tokens_seen": 220412855, + "router_z_loss_clip": 2.54101562, + "router_z_loss_mlp": 0.30859375, + "step": 10236, + "time_per_iteration": 2.6831564903259277 + }, + { + "auxiliary_loss_clip": 0.01392038, + "auxiliary_loss_mlp": 0.00328095, + "balance_loss_clip": 1.13540697, + "balance_loss_mlp": 0.29748181, + "epoch": 0.6154817375620021, + "flos": 22748081552640.0, + "grad_norm": 128.91756436094624, + "language_loss": 0.85189259, + "learning_rate": 1.360448879760721e-06, + "loss": 0.86909389, + "num_input_tokens_seen": 220433440, + "router_z_loss_clip": 2.56640625, + "router_z_loss_mlp": 0.30639648, + "step": 10237, + "time_per_iteration": 2.675614595413208 + }, + { + "auxiliary_loss_clip": 0.01370948, + "auxiliary_loss_mlp": 0.00314848, + "balance_loss_clip": 1.12729025, + "balance_loss_mlp": 0.2867859, + "epoch": 0.6155418608146701, + "flos": 27162975121920.0, + "grad_norm": 6.347301806843105, + "language_loss": 0.84089464, + "learning_rate": 1.3600798795474449e-06, + "loss": 0.85775256, + "num_input_tokens_seen": 220453445, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.28088379, + "step": 10238, + "time_per_iteration": 4.1746087074279785 + }, + { + "auxiliary_loss_clip": 0.01427398, + "auxiliary_loss_mlp": 0.00087487, + "balance_loss_clip": 1.22660685, + "balance_loss_mlp": 0.07799748, + "epoch": 0.615601984067338, + "flos": 68811165014400.0, + "grad_norm": 0.7521241109959493, + "language_loss": 0.56857097, + "learning_rate": 1.3597109036000036e-06, + "loss": 0.58371985, + "num_input_tokens_seen": 220509730, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.09472656, + "step": 10239, + "time_per_iteration": 3.1684136390686035 + }, + { + "auxiliary_loss_clip": 0.01370573, + "auxiliary_loss_mlp": 0.00325768, + "balance_loss_clip": 1.1222899, + "balance_loss_mlp": 0.29365337, + "epoch": 0.615662107320006, + "flos": 15517144598400.0, + "grad_norm": 10.617246017245686, + "language_loss": 0.86749786, + "learning_rate": 1.3593419519323892e-06, + "loss": 0.88446122, + "num_input_tokens_seen": 220527295, + "router_z_loss_clip": 2.48242188, + "router_z_loss_mlp": 0.32104492, + "step": 10240, + "time_per_iteration": 2.6389002799987793 + }, + { + "auxiliary_loss_clip": 0.01355866, + "auxiliary_loss_mlp": 0.00303922, + "balance_loss_clip": 1.10708141, + "balance_loss_mlp": 0.2725457, + "epoch": 0.615722230572674, + "flos": 21063691128960.0, + "grad_norm": 22.261158350816274, + "language_loss": 0.80748713, + "learning_rate": 1.3589730245585922e-06, + "loss": 0.824085, + "num_input_tokens_seen": 220542730, + "router_z_loss_clip": 2.48828125, + "router_z_loss_mlp": 0.3137207, + "step": 10241, + "time_per_iteration": 2.6556334495544434 + }, + { + "auxiliary_loss_clip": 0.01370013, + "auxiliary_loss_mlp": 0.00307948, + "balance_loss_clip": 1.12658489, + "balance_loss_mlp": 0.27890909, + "epoch": 0.615782353825342, + "flos": 23256791919360.0, + "grad_norm": 17.529558241851316, + "language_loss": 0.77576458, + "learning_rate": 1.3586041214926018e-06, + "loss": 0.79254425, + "num_input_tokens_seen": 220562995, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.2902832, + "step": 10242, + "time_per_iteration": 4.096301555633545 + }, + { + "auxiliary_loss_clip": 0.01355495, + "auxiliary_loss_mlp": 0.00300452, + "balance_loss_clip": 1.11260772, + "balance_loss_mlp": 0.27283159, + "epoch": 0.6158424770780099, + "flos": 21103911383040.0, + "grad_norm": 32.14584363943654, + "language_loss": 0.791291, + "learning_rate": 1.3582352427484086e-06, + "loss": 0.80785048, + "num_input_tokens_seen": 220581775, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.27600098, + "step": 10243, + "time_per_iteration": 2.9034903049468994 + }, + { + "auxiliary_loss_clip": 0.01406832, + "auxiliary_loss_mlp": 0.00110753, + "balance_loss_clip": 1.21354723, + "balance_loss_mlp": 0.10302867, + "epoch": 0.6159026003306779, + "flos": 70333276769280.0, + "grad_norm": 0.7426087223286683, + "language_loss": 0.56621015, + "learning_rate": 1.3578663883399984e-06, + "loss": 0.58138597, + "num_input_tokens_seen": 220646395, + "router_z_loss_clip": 1.9375, + "router_z_loss_mlp": 0.07714844, + "step": 10244, + "time_per_iteration": 3.2529876232147217 + }, + { + "auxiliary_loss_clip": 0.01400548, + "auxiliary_loss_mlp": 0.00332658, + "balance_loss_clip": 1.14897919, + "balance_loss_mlp": 0.29987502, + "epoch": 0.6159627235833458, + "flos": 33874355802240.0, + "grad_norm": 5.6137483231180605, + "language_loss": 0.70712399, + "learning_rate": 1.3574975582813593e-06, + "loss": 0.72445607, + "num_input_tokens_seen": 220668335, + "router_z_loss_clip": 2.51367188, + "router_z_loss_mlp": 0.32751465, + "step": 10245, + "time_per_iteration": 2.800830364227295 + }, + { + "auxiliary_loss_clip": 0.01384134, + "auxiliary_loss_mlp": 0.0029949, + "balance_loss_clip": 1.13193321, + "balance_loss_mlp": 0.27030793, + "epoch": 0.6160228468360138, + "flos": 26575440359040.0, + "grad_norm": 637.8307481429061, + "language_loss": 0.85531867, + "learning_rate": 1.3571287525864771e-06, + "loss": 0.87215489, + "num_input_tokens_seen": 220688915, + "router_z_loss_clip": 2.52539062, + "router_z_loss_mlp": 0.29187012, + "step": 10246, + "time_per_iteration": 2.7459628582000732 + }, + { + "auxiliary_loss_clip": 0.01377553, + "auxiliary_loss_mlp": 0.00345757, + "balance_loss_clip": 1.12837029, + "balance_loss_mlp": 0.31324863, + "epoch": 0.6160829700886818, + "flos": 17193274894080.0, + "grad_norm": 28.902117184959202, + "language_loss": 0.952402, + "learning_rate": 1.3567599712693368e-06, + "loss": 0.96963513, + "num_input_tokens_seen": 220703465, + "router_z_loss_clip": 2.48828125, + "router_z_loss_mlp": 0.32519531, + "step": 10247, + "time_per_iteration": 2.624130964279175 + }, + { + "auxiliary_loss_clip": 0.01393499, + "auxiliary_loss_mlp": 0.00327093, + "balance_loss_clip": 1.14122975, + "balance_loss_mlp": 0.29690921, + "epoch": 0.6161430933413498, + "flos": 23623547736960.0, + "grad_norm": 16.656727882253012, + "language_loss": 0.86366528, + "learning_rate": 1.3563912143439235e-06, + "loss": 0.88087124, + "num_input_tokens_seen": 220722090, + "router_z_loss_clip": 2.5234375, + "router_z_loss_mlp": 0.30200195, + "step": 10248, + "time_per_iteration": 2.658085823059082 + }, + { + "auxiliary_loss_clip": 0.01363348, + "auxiliary_loss_mlp": 0.00286359, + "balance_loss_clip": 1.12203252, + "balance_loss_mlp": 0.25880995, + "epoch": 0.6162032165940178, + "flos": 23002436736000.0, + "grad_norm": 11.384735941322692, + "language_loss": 0.93775725, + "learning_rate": 1.3560224818242191e-06, + "loss": 0.95425439, + "num_input_tokens_seen": 220741075, + "router_z_loss_clip": 2.41796875, + "router_z_loss_mlp": 0.27526855, + "step": 10249, + "time_per_iteration": 2.6998984813690186 + }, + { + "auxiliary_loss_clip": 0.01368045, + "auxiliary_loss_mlp": 0.0032382, + "balance_loss_clip": 1.11650324, + "balance_loss_mlp": 0.29203904, + "epoch": 0.6162633398466857, + "flos": 39421979740800.0, + "grad_norm": 19.43668053007184, + "language_loss": 0.79307961, + "learning_rate": 1.3556537737242072e-06, + "loss": 0.80999827, + "num_input_tokens_seen": 220763395, + "router_z_loss_clip": 2.515625, + "router_z_loss_mlp": 0.31774902, + "step": 10250, + "time_per_iteration": 2.829848051071167 + }, + { + "auxiliary_loss_clip": 0.01378828, + "auxiliary_loss_mlp": 0.00293526, + "balance_loss_clip": 1.1374284, + "balance_loss_mlp": 0.26529759, + "epoch": 0.6163234630993537, + "flos": 19244672530560.0, + "grad_norm": 13.395631514954118, + "language_loss": 0.79563594, + "learning_rate": 1.3552850900578692e-06, + "loss": 0.81235945, + "num_input_tokens_seen": 220780640, + "router_z_loss_clip": 2.41210938, + "router_z_loss_mlp": 0.2824707, + "step": 10251, + "time_per_iteration": 2.6603810787200928 + }, + { + "auxiliary_loss_clip": 0.01354105, + "auxiliary_loss_mlp": 0.00311331, + "balance_loss_clip": 1.10996377, + "balance_loss_mlp": 0.28218395, + "epoch": 0.6163835863520216, + "flos": 15961791058560.0, + "grad_norm": 50.91367438595711, + "language_loss": 0.75851047, + "learning_rate": 1.3549164308391844e-06, + "loss": 0.77516484, + "num_input_tokens_seen": 220797960, + "router_z_loss_clip": 2.4453125, + "router_z_loss_mlp": 0.29162598, + "step": 10252, + "time_per_iteration": 2.629621744155884 + }, + { + "auxiliary_loss_clip": 0.01366431, + "auxiliary_loss_mlp": 0.00149606, + "balance_loss_clip": 1.17238593, + "balance_loss_mlp": 0.14064187, + "epoch": 0.6164437096046896, + "flos": 68103834393600.0, + "grad_norm": 0.8580420957238833, + "language_loss": 0.5709179, + "learning_rate": 1.3545477960821333e-06, + "loss": 0.58607829, + "num_input_tokens_seen": 220856930, + "router_z_loss_clip": 1.9375, + "router_z_loss_mlp": 0.08984375, + "step": 10253, + "time_per_iteration": 3.2444961071014404 + }, + { + "auxiliary_loss_clip": 0.01371107, + "auxiliary_loss_mlp": 0.00336553, + "balance_loss_clip": 1.1207937, + "balance_loss_mlp": 0.3051528, + "epoch": 0.6165038328573575, + "flos": 21361211481600.0, + "grad_norm": 53.57994765528177, + "language_loss": 0.84480655, + "learning_rate": 1.3541791858006946e-06, + "loss": 0.86188316, + "num_input_tokens_seen": 220877595, + "router_z_loss_clip": 2.5078125, + "router_z_loss_mlp": 0.3137207, + "step": 10254, + "time_per_iteration": 2.6631457805633545 + }, + { + "auxiliary_loss_clip": 0.01364188, + "auxiliary_loss_mlp": 0.00313698, + "balance_loss_clip": 1.11770916, + "balance_loss_mlp": 0.28371692, + "epoch": 0.6165639561100256, + "flos": 21101972048640.0, + "grad_norm": 4.202670685329964, + "language_loss": 0.87594062, + "learning_rate": 1.353810600008846e-06, + "loss": 0.89271951, + "num_input_tokens_seen": 220896880, + "router_z_loss_clip": 2.47070312, + "router_z_loss_mlp": 0.29968262, + "step": 10255, + "time_per_iteration": 2.700911521911621 + }, + { + "auxiliary_loss_clip": 0.01363407, + "auxiliary_loss_mlp": 0.00332078, + "balance_loss_clip": 1.11639452, + "balance_loss_mlp": 0.30091661, + "epoch": 0.6166240793626935, + "flos": 25338533569920.0, + "grad_norm": 90.9569689757983, + "language_loss": 0.73819876, + "learning_rate": 1.3534420387205646e-06, + "loss": 0.75515366, + "num_input_tokens_seen": 220916425, + "router_z_loss_clip": 2.47070312, + "router_z_loss_mlp": 0.3112793, + "step": 10256, + "time_per_iteration": 2.724639654159546 + }, + { + "auxiliary_loss_clip": 0.01358737, + "auxiliary_loss_mlp": 0.00326751, + "balance_loss_clip": 1.12193894, + "balance_loss_mlp": 0.29846269, + "epoch": 0.6166842026153615, + "flos": 19682639061120.0, + "grad_norm": 267.3401052259844, + "language_loss": 0.80877471, + "learning_rate": 1.353073501949825e-06, + "loss": 0.82562959, + "num_input_tokens_seen": 220935050, + "router_z_loss_clip": 2.3671875, + "router_z_loss_mlp": 0.28283691, + "step": 10257, + "time_per_iteration": 2.721689224243164 + }, + { + "auxiliary_loss_clip": 0.01375659, + "auxiliary_loss_mlp": 0.00291524, + "balance_loss_clip": 1.12794709, + "balance_loss_mlp": 0.26210332, + "epoch": 0.6167443258680294, + "flos": 19318361281920.0, + "grad_norm": 5.955286700811938, + "language_loss": 0.79839259, + "learning_rate": 1.3527049897106034e-06, + "loss": 0.81506443, + "num_input_tokens_seen": 220953085, + "router_z_loss_clip": 2.4765625, + "router_z_loss_mlp": 0.29394531, + "step": 10258, + "time_per_iteration": 2.6874730587005615 + }, + { + "auxiliary_loss_clip": 0.01393111, + "auxiliary_loss_mlp": 0.00289577, + "balance_loss_clip": 1.13808441, + "balance_loss_mlp": 0.25789139, + "epoch": 0.6168044491206974, + "flos": 25265239868160.0, + "grad_norm": 86.53541082372406, + "language_loss": 0.77083635, + "learning_rate": 1.3523365020168735e-06, + "loss": 0.78766322, + "num_input_tokens_seen": 220969050, + "router_z_loss_clip": 2.546875, + "router_z_loss_mlp": 0.31713867, + "step": 10259, + "time_per_iteration": 2.7840352058410645 + }, + { + "auxiliary_loss_clip": 0.01353392, + "auxiliary_loss_mlp": 0.00291249, + "balance_loss_clip": 1.11261761, + "balance_loss_mlp": 0.26318687, + "epoch": 0.6168645723733654, + "flos": 13219903301760.0, + "grad_norm": 10627.191722239806, + "language_loss": 0.81016296, + "learning_rate": 1.3519680388826084e-06, + "loss": 0.82660931, + "num_input_tokens_seen": 220985825, + "router_z_loss_clip": 2.40820312, + "router_z_loss_mlp": 0.28051758, + "step": 10260, + "time_per_iteration": 2.621734857559204 + }, + { + "auxiliary_loss_clip": 0.01376981, + "auxiliary_loss_mlp": 0.0031863, + "balance_loss_clip": 1.12733889, + "balance_loss_mlp": 0.28529871, + "epoch": 0.6169246956260334, + "flos": 26652038112000.0, + "grad_norm": 43.10229572201512, + "language_loss": 0.76286292, + "learning_rate": 1.3515996003217803e-06, + "loss": 0.77981907, + "num_input_tokens_seen": 221004465, + "router_z_loss_clip": 2.49414062, + "router_z_loss_mlp": 0.33312988, + "step": 10261, + "time_per_iteration": 2.7354576587677 + }, + { + "auxiliary_loss_clip": 0.01345106, + "auxiliary_loss_mlp": 0.00296538, + "balance_loss_clip": 1.10794187, + "balance_loss_mlp": 0.26797497, + "epoch": 0.6169848188787014, + "flos": 23148413608320.0, + "grad_norm": 57.453005224503876, + "language_loss": 0.7844969, + "learning_rate": 1.3512311863483602e-06, + "loss": 0.80091339, + "num_input_tokens_seen": 221023260, + "router_z_loss_clip": 2.37109375, + "router_z_loss_mlp": 0.28552246, + "step": 10262, + "time_per_iteration": 2.660094976425171 + }, + { + "auxiliary_loss_clip": 0.0137033, + "auxiliary_loss_mlp": 0.00303822, + "balance_loss_clip": 1.12635183, + "balance_loss_mlp": 0.27556944, + "epoch": 0.6170449421313693, + "flos": 23331917214720.0, + "grad_norm": 36.535067065516515, + "language_loss": 0.77669787, + "learning_rate": 1.3508627969763188e-06, + "loss": 0.79343939, + "num_input_tokens_seen": 221043090, + "router_z_loss_clip": 2.44140625, + "router_z_loss_mlp": 0.28234863, + "step": 10263, + "time_per_iteration": 2.7001962661743164 + }, + { + "auxiliary_loss_clip": 0.0137559, + "auxiliary_loss_mlp": 0.00301764, + "balance_loss_clip": 1.12455773, + "balance_loss_mlp": 0.27324921, + "epoch": 0.6171050653840373, + "flos": 15851617067520.0, + "grad_norm": 3.4755049248303154, + "language_loss": 0.85906315, + "learning_rate": 1.3504944322196244e-06, + "loss": 0.87583667, + "num_input_tokens_seen": 221061435, + "router_z_loss_clip": 2.51367188, + "router_z_loss_mlp": 0.28515625, + "step": 10264, + "time_per_iteration": 2.704101085662842 + }, + { + "auxiliary_loss_clip": 0.01362878, + "auxiliary_loss_mlp": 0.00306715, + "balance_loss_clip": 1.12152898, + "balance_loss_mlp": 0.27655464, + "epoch": 0.6171651886367052, + "flos": 20045516209920.0, + "grad_norm": 74.30899437798362, + "language_loss": 0.91193867, + "learning_rate": 1.350126092092247e-06, + "loss": 0.92863464, + "num_input_tokens_seen": 221078705, + "router_z_loss_clip": 2.4140625, + "router_z_loss_mlp": 0.30175781, + "step": 10265, + "time_per_iteration": 2.779444694519043 + }, + { + "auxiliary_loss_clip": 0.01339441, + "auxiliary_loss_mlp": 0.00302565, + "balance_loss_clip": 1.10518861, + "balance_loss_mlp": 0.27364454, + "epoch": 0.6172253118893732, + "flos": 26432695710720.0, + "grad_norm": 46.39462733124493, + "language_loss": 0.74780476, + "learning_rate": 1.349757776608153e-06, + "loss": 0.76422483, + "num_input_tokens_seen": 221099245, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.28918457, + "step": 10266, + "time_per_iteration": 2.7015230655670166 + }, + { + "auxiliary_loss_clip": 0.01337234, + "auxiliary_loss_mlp": 0.00304033, + "balance_loss_clip": 1.10221577, + "balance_loss_mlp": 0.2752918, + "epoch": 0.6172854351420412, + "flos": 22632879657600.0, + "grad_norm": 26.10713879144964, + "language_loss": 0.81359679, + "learning_rate": 1.3493894857813094e-06, + "loss": 0.83000946, + "num_input_tokens_seen": 221116930, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.28747559, + "step": 10267, + "time_per_iteration": 2.6755475997924805 + }, + { + "auxiliary_loss_clip": 0.0138718, + "auxiliary_loss_mlp": 0.00314406, + "balance_loss_clip": 1.13086081, + "balance_loss_mlp": 0.28341141, + "epoch": 0.6173455583947092, + "flos": 21212936138880.0, + "grad_norm": 614.5453537193556, + "language_loss": 0.81657004, + "learning_rate": 1.3490212196256818e-06, + "loss": 0.83358592, + "num_input_tokens_seen": 221137660, + "router_z_loss_clip": 2.56640625, + "router_z_loss_mlp": 0.30957031, + "step": 10268, + "time_per_iteration": 2.878925323486328 + }, + { + "auxiliary_loss_clip": 0.01346474, + "auxiliary_loss_mlp": 0.00343059, + "balance_loss_clip": 1.10368586, + "balance_loss_mlp": 0.31339973, + "epoch": 0.6174056816473771, + "flos": 19500284689920.0, + "grad_norm": 11.688741318605787, + "language_loss": 0.83770859, + "learning_rate": 1.3486529781552342e-06, + "loss": 0.85460389, + "num_input_tokens_seen": 221156225, + "router_z_loss_clip": 2.42578125, + "router_z_loss_mlp": 0.296875, + "step": 10269, + "time_per_iteration": 2.705042839050293 + }, + { + "auxiliary_loss_clip": 0.01354819, + "auxiliary_loss_mlp": 0.00321531, + "balance_loss_clip": 1.11026239, + "balance_loss_mlp": 0.28971392, + "epoch": 0.6174658049000451, + "flos": 15997342544640.0, + "grad_norm": 20.412722592967107, + "language_loss": 0.8388263, + "learning_rate": 1.3482847613839318e-06, + "loss": 0.85558981, + "num_input_tokens_seen": 221173820, + "router_z_loss_clip": 2.44335938, + "router_z_loss_mlp": 0.31848145, + "step": 10270, + "time_per_iteration": 2.6347737312316895 + }, + { + "auxiliary_loss_clip": 0.01353062, + "auxiliary_loss_mlp": 0.00302779, + "balance_loss_clip": 1.11645126, + "balance_loss_mlp": 0.27395457, + "epoch": 0.617525928152713, + "flos": 21903893136000.0, + "grad_norm": 1582.3365300415737, + "language_loss": 0.899279, + "learning_rate": 1.347916569325736e-06, + "loss": 0.91583741, + "num_input_tokens_seen": 221191815, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.28833008, + "step": 10271, + "time_per_iteration": 2.6874799728393555 + }, + { + "auxiliary_loss_clip": 0.01363473, + "auxiliary_loss_mlp": 0.00291686, + "balance_loss_clip": 1.11978984, + "balance_loss_mlp": 0.26121596, + "epoch": 0.617586051405381, + "flos": 21105958458240.0, + "grad_norm": 9.114701664126727, + "language_loss": 0.8549006, + "learning_rate": 1.3475484019946093e-06, + "loss": 0.87145215, + "num_input_tokens_seen": 221211205, + "router_z_loss_clip": 2.43554688, + "router_z_loss_mlp": 0.3046875, + "step": 10272, + "time_per_iteration": 2.752615213394165 + }, + { + "auxiliary_loss_clip": 0.01346488, + "auxiliary_loss_mlp": 0.00138376, + "balance_loss_clip": 1.16200197, + "balance_loss_mlp": 0.1286485, + "epoch": 0.617646174658049, + "flos": 58610776665600.0, + "grad_norm": 0.8164147237824814, + "language_loss": 0.58093882, + "learning_rate": 1.347180259404513e-06, + "loss": 0.59578741, + "num_input_tokens_seen": 221268430, + "router_z_loss_clip": 1.84375, + "router_z_loss_mlp": 0.09716797, + "step": 10273, + "time_per_iteration": 3.052072286605835 + }, + { + "auxiliary_loss_clip": 0.01338217, + "auxiliary_loss_mlp": 0.00314393, + "balance_loss_clip": 1.10537589, + "balance_loss_mlp": 0.28605664, + "epoch": 0.617706297910717, + "flos": 13878684691200.0, + "grad_norm": 25.09440450125433, + "language_loss": 0.82122755, + "learning_rate": 1.3468121415694059e-06, + "loss": 0.83775365, + "num_input_tokens_seen": 221281930, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.28344727, + "step": 10274, + "time_per_iteration": 2.664402484893799 + }, + { + "auxiliary_loss_clip": 0.01337515, + "auxiliary_loss_mlp": 0.0030349, + "balance_loss_clip": 1.09908044, + "balance_loss_mlp": 0.2729128, + "epoch": 0.617766421163385, + "flos": 19208438686080.0, + "grad_norm": 9.752213902476496, + "language_loss": 0.86300278, + "learning_rate": 1.3464440485032484e-06, + "loss": 0.87941283, + "num_input_tokens_seen": 221301605, + "router_z_loss_clip": 2.38476562, + "router_z_loss_mlp": 0.3059082, + "step": 10275, + "time_per_iteration": 4.081395864486694 + }, + { + "auxiliary_loss_clip": 0.01341631, + "auxiliary_loss_mlp": 0.00292762, + "balance_loss_clip": 1.10428214, + "balance_loss_mlp": 0.26474744, + "epoch": 0.6178265444160529, + "flos": 22565978576640.0, + "grad_norm": 27.92713540644189, + "language_loss": 0.84484267, + "learning_rate": 1.346075980219998e-06, + "loss": 0.86118662, + "num_input_tokens_seen": 221320105, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.28051758, + "step": 10276, + "time_per_iteration": 2.665997266769409 + }, + { + "auxiliary_loss_clip": 0.01354482, + "auxiliary_loss_mlp": 0.00304517, + "balance_loss_clip": 1.11154985, + "balance_loss_mlp": 0.27423814, + "epoch": 0.6178866676687209, + "flos": 11984289402240.0, + "grad_norm": 73.91974072106456, + "language_loss": 0.88795459, + "learning_rate": 1.345707936733612e-06, + "loss": 0.90454453, + "num_input_tokens_seen": 221335915, + "router_z_loss_clip": 2.4296875, + "router_z_loss_mlp": 0.30273438, + "step": 10277, + "time_per_iteration": 4.104903697967529 + }, + { + "auxiliary_loss_clip": 0.01354277, + "auxiliary_loss_mlp": 0.00294336, + "balance_loss_clip": 1.10784173, + "balance_loss_mlp": 0.26541618, + "epoch": 0.6179467909213888, + "flos": 20991510748800.0, + "grad_norm": 4.013649701955217, + "language_loss": 0.89176893, + "learning_rate": 1.3453399180580466e-06, + "loss": 0.9082551, + "num_input_tokens_seen": 221353965, + "router_z_loss_clip": 2.4609375, + "router_z_loss_mlp": 0.28955078, + "step": 10278, + "time_per_iteration": 2.6586687564849854 + }, + { + "auxiliary_loss_clip": 0.01327633, + "auxiliary_loss_mlp": 0.00308268, + "balance_loss_clip": 1.09221995, + "balance_loss_mlp": 0.28080255, + "epoch": 0.6180069141740568, + "flos": 25338102606720.0, + "grad_norm": 2.1528391895138554, + "language_loss": 0.79312682, + "learning_rate": 1.3449719242072567e-06, + "loss": 0.80948585, + "num_input_tokens_seen": 221374080, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.27453613, + "step": 10279, + "time_per_iteration": 2.7586822509765625 + }, + { + "auxiliary_loss_clip": 0.01331671, + "auxiliary_loss_mlp": 0.00289967, + "balance_loss_clip": 1.09926867, + "balance_loss_mlp": 0.26331162, + "epoch": 0.6180670374267248, + "flos": 19645722858240.0, + "grad_norm": 43.62626522796563, + "language_loss": 0.76009506, + "learning_rate": 1.3446039551951975e-06, + "loss": 0.7763114, + "num_input_tokens_seen": 221392910, + "router_z_loss_clip": 2.32226562, + "router_z_loss_mlp": 0.26647949, + "step": 10280, + "time_per_iteration": 4.100741624832153 + }, + { + "auxiliary_loss_clip": 0.01357017, + "auxiliary_loss_mlp": 0.00305611, + "balance_loss_clip": 1.10967875, + "balance_loss_mlp": 0.27695328, + "epoch": 0.6181271606793928, + "flos": 19464876858240.0, + "grad_norm": 5.417593170297812, + "language_loss": 0.78298187, + "learning_rate": 1.3442360110358215e-06, + "loss": 0.79960823, + "num_input_tokens_seen": 221410990, + "router_z_loss_clip": 2.4765625, + "router_z_loss_mlp": 0.28662109, + "step": 10281, + "time_per_iteration": 2.6784181594848633 + }, + { + "auxiliary_loss_clip": 0.01348529, + "auxiliary_loss_mlp": 0.0029273, + "balance_loss_clip": 1.11138749, + "balance_loss_mlp": 0.26512086, + "epoch": 0.6181872839320607, + "flos": 25594289383680.0, + "grad_norm": 39.01796415989131, + "language_loss": 0.82523167, + "learning_rate": 1.3438680917430827e-06, + "loss": 0.84164423, + "num_input_tokens_seen": 221431020, + "router_z_loss_clip": 2.37109375, + "router_z_loss_mlp": 0.27624512, + "step": 10282, + "time_per_iteration": 2.7049720287323 + }, + { + "auxiliary_loss_clip": 0.01345026, + "auxiliary_loss_mlp": 0.0030228, + "balance_loss_clip": 1.09855986, + "balance_loss_mlp": 0.27116591, + "epoch": 0.6182474071847287, + "flos": 25551806572800.0, + "grad_norm": 91.10626086387813, + "language_loss": 0.75733966, + "learning_rate": 1.343500197330931e-06, + "loss": 0.77381271, + "num_input_tokens_seen": 221453235, + "router_z_loss_clip": 2.46679688, + "router_z_loss_mlp": 0.31066895, + "step": 10283, + "time_per_iteration": 2.710658073425293 + }, + { + "auxiliary_loss_clip": 0.01360879, + "auxiliary_loss_mlp": 0.00307302, + "balance_loss_clip": 1.10927725, + "balance_loss_mlp": 0.27621174, + "epoch": 0.6183075304373966, + "flos": 22123738327680.0, + "grad_norm": 12.416485689060497, + "language_loss": 0.8198539, + "learning_rate": 1.3431323278133176e-06, + "loss": 0.83653575, + "num_input_tokens_seen": 221472560, + "router_z_loss_clip": 2.51367188, + "router_z_loss_mlp": 0.31103516, + "step": 10284, + "time_per_iteration": 4.113955497741699 + }, + { + "auxiliary_loss_clip": 0.0133929, + "auxiliary_loss_mlp": 0.00283167, + "balance_loss_clip": 1.10690284, + "balance_loss_mlp": 0.25549853, + "epoch": 0.6183676536900646, + "flos": 22455589104000.0, + "grad_norm": 1.6985633687752506, + "language_loss": 0.81200504, + "learning_rate": 1.3427644832041922e-06, + "loss": 0.82822961, + "num_input_tokens_seen": 221492835, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.2767334, + "step": 10285, + "time_per_iteration": 2.703740119934082 + }, + { + "auxiliary_loss_clip": 0.01327715, + "auxiliary_loss_mlp": 0.0034032, + "balance_loss_clip": 1.09178376, + "balance_loss_mlp": 0.31070834, + "epoch": 0.6184277769427327, + "flos": 23364128736000.0, + "grad_norm": 5.80315752464746, + "language_loss": 0.78312767, + "learning_rate": 1.342396663517503e-06, + "loss": 0.79980803, + "num_input_tokens_seen": 221511870, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.29638672, + "step": 10286, + "time_per_iteration": 2.6791162490844727 + }, + { + "auxiliary_loss_clip": 0.01322338, + "auxiliary_loss_mlp": 0.00299232, + "balance_loss_clip": 1.09062994, + "balance_loss_mlp": 0.27093178, + "epoch": 0.6184879001954006, + "flos": 22711057608960.0, + "grad_norm": 116.71674161772613, + "language_loss": 0.81978011, + "learning_rate": 1.342028868767199e-06, + "loss": 0.83599579, + "num_input_tokens_seen": 221529915, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.2833252, + "step": 10287, + "time_per_iteration": 2.7467212677001953 + }, + { + "auxiliary_loss_clip": 0.01333417, + "auxiliary_loss_mlp": 0.00298757, + "balance_loss_clip": 1.09627295, + "balance_loss_mlp": 0.26897866, + "epoch": 0.6185480234480686, + "flos": 23841920471040.0, + "grad_norm": 16.8847798171533, + "language_loss": 0.79017127, + "learning_rate": 1.3416610989672262e-06, + "loss": 0.80649304, + "num_input_tokens_seen": 221549745, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.29785156, + "step": 10288, + "time_per_iteration": 2.7699875831604004 + }, + { + "auxiliary_loss_clip": 0.01312715, + "auxiliary_loss_mlp": 0.00291869, + "balance_loss_clip": 1.08390772, + "balance_loss_mlp": 0.2650229, + "epoch": 0.6186081467007365, + "flos": 45477595774080.0, + "grad_norm": 29.730126467011132, + "language_loss": 0.79036582, + "learning_rate": 1.3412933541315296e-06, + "loss": 0.80641162, + "num_input_tokens_seen": 221572455, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.26843262, + "step": 10289, + "time_per_iteration": 2.94004225730896 + }, + { + "auxiliary_loss_clip": 0.01320971, + "auxiliary_loss_mlp": 0.00336465, + "balance_loss_clip": 1.08413577, + "balance_loss_mlp": 0.30477884, + "epoch": 0.6186682699534045, + "flos": 23550864566400.0, + "grad_norm": 44.450132934390595, + "language_loss": 0.84731477, + "learning_rate": 1.340925634274056e-06, + "loss": 0.8638891, + "num_input_tokens_seen": 221591325, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.31689453, + "step": 10290, + "time_per_iteration": 2.6856114864349365 + }, + { + "auxiliary_loss_clip": 0.01337517, + "auxiliary_loss_mlp": 0.00287203, + "balance_loss_clip": 1.0964371, + "balance_loss_mlp": 0.25999987, + "epoch": 0.6187283932060724, + "flos": 25774201630080.0, + "grad_norm": 28.367403396745818, + "language_loss": 0.88374537, + "learning_rate": 1.3405579394087475e-06, + "loss": 0.89999259, + "num_input_tokens_seen": 221611640, + "router_z_loss_clip": 2.41015625, + "router_z_loss_mlp": 0.27197266, + "step": 10291, + "time_per_iteration": 2.6968815326690674 + }, + { + "auxiliary_loss_clip": 0.01319707, + "auxiliary_loss_mlp": 0.00290195, + "balance_loss_clip": 1.08679938, + "balance_loss_mlp": 0.26134658, + "epoch": 0.6187885164587404, + "flos": 25265203954560.0, + "grad_norm": 52.73498050822463, + "language_loss": 0.86184186, + "learning_rate": 1.3401902695495487e-06, + "loss": 0.87794089, + "num_input_tokens_seen": 221631225, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.28869629, + "step": 10292, + "time_per_iteration": 2.6798181533813477 + }, + { + "auxiliary_loss_clip": 0.01348709, + "auxiliary_loss_mlp": 0.0026399, + "balance_loss_clip": 1.09937668, + "balance_loss_mlp": 0.23351979, + "epoch": 0.6188486397114084, + "flos": 26250772302720.0, + "grad_norm": 69.69838648168762, + "language_loss": 0.82910991, + "learning_rate": 1.339822624710401e-06, + "loss": 0.8452369, + "num_input_tokens_seen": 221651035, + "router_z_loss_clip": 2.49414062, + "router_z_loss_mlp": 0.30493164, + "step": 10293, + "time_per_iteration": 2.692646026611328 + }, + { + "auxiliary_loss_clip": 0.01315389, + "auxiliary_loss_mlp": 0.0027418, + "balance_loss_clip": 1.08135569, + "balance_loss_mlp": 0.24770372, + "epoch": 0.6189087629640764, + "flos": 20923388605440.0, + "grad_norm": 6.3308887274663626, + "language_loss": 0.89162821, + "learning_rate": 1.3394550049052454e-06, + "loss": 0.90752393, + "num_input_tokens_seen": 221671300, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.26477051, + "step": 10294, + "time_per_iteration": 2.683659076690674 + }, + { + "auxiliary_loss_clip": 0.01329106, + "auxiliary_loss_mlp": 0.00260574, + "balance_loss_clip": 1.09229636, + "balance_loss_mlp": 0.23385873, + "epoch": 0.6189688862167443, + "flos": 14829814874880.0, + "grad_norm": 444.39393047994537, + "language_loss": 0.80359685, + "learning_rate": 1.3390874101480225e-06, + "loss": 0.81949365, + "num_input_tokens_seen": 221687320, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.26721191, + "step": 10295, + "time_per_iteration": 2.613093376159668 + }, + { + "auxiliary_loss_clip": 0.0133357, + "auxiliary_loss_mlp": 0.00251673, + "balance_loss_clip": 1.09719515, + "balance_loss_mlp": 0.2243022, + "epoch": 0.6190290094694123, + "flos": 24285058560000.0, + "grad_norm": 19.21635585239069, + "language_loss": 0.76998067, + "learning_rate": 1.3387198404526705e-06, + "loss": 0.78583312, + "num_input_tokens_seen": 221710175, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.27368164, + "step": 10296, + "time_per_iteration": 2.896052360534668 + }, + { + "auxiliary_loss_clip": 0.01314847, + "auxiliary_loss_mlp": 0.00260187, + "balance_loss_clip": 1.07885885, + "balance_loss_mlp": 0.23242362, + "epoch": 0.6190891327220802, + "flos": 22529457423360.0, + "grad_norm": 12.404155965891597, + "language_loss": 0.79470301, + "learning_rate": 1.3383522958331287e-06, + "loss": 0.81045341, + "num_input_tokens_seen": 221728145, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.27770996, + "step": 10297, + "time_per_iteration": 2.6926326751708984 + }, + { + "auxiliary_loss_clip": 0.01344733, + "auxiliary_loss_mlp": 0.0008149, + "balance_loss_clip": 1.16297114, + "balance_loss_mlp": 0.07128606, + "epoch": 0.6191492559747482, + "flos": 67729357152000.0, + "grad_norm": 0.8894633889288233, + "language_loss": 0.63918138, + "learning_rate": 1.3379847763033345e-06, + "loss": 0.65344363, + "num_input_tokens_seen": 221786100, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 0.10205078, + "step": 10298, + "time_per_iteration": 3.0680267810821533 + }, + { + "auxiliary_loss_clip": 0.01306697, + "auxiliary_loss_mlp": 0.00286818, + "balance_loss_clip": 1.07410157, + "balance_loss_mlp": 0.2592687, + "epoch": 0.6192093792274163, + "flos": 22346672088960.0, + "grad_norm": 8.480071554814659, + "language_loss": 0.80737364, + "learning_rate": 1.3376172818772236e-06, + "loss": 0.82330877, + "num_input_tokens_seen": 221806450, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.2755127, + "step": 10299, + "time_per_iteration": 2.7057738304138184 + }, + { + "auxiliary_loss_clip": 0.01333334, + "auxiliary_loss_mlp": 0.00299846, + "balance_loss_clip": 1.09058869, + "balance_loss_mlp": 0.27090204, + "epoch": 0.6192695024800842, + "flos": 13553944807680.0, + "grad_norm": 61.07979480623225, + "language_loss": 0.75831079, + "learning_rate": 1.337249812568732e-06, + "loss": 0.77464259, + "num_input_tokens_seen": 221823330, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.28955078, + "step": 10300, + "time_per_iteration": 2.6401803493499756 + }, + { + "auxiliary_loss_clip": 0.01319797, + "auxiliary_loss_mlp": 0.0028605, + "balance_loss_clip": 1.08303118, + "balance_loss_mlp": 0.25695091, + "epoch": 0.6193296257327522, + "flos": 17415310815360.0, + "grad_norm": 12.97222966674437, + "language_loss": 0.74148345, + "learning_rate": 1.3368823683917939e-06, + "loss": 0.75754189, + "num_input_tokens_seen": 221839360, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.29125977, + "step": 10301, + "time_per_iteration": 2.702589750289917 + }, + { + "auxiliary_loss_clip": 0.01309587, + "auxiliary_loss_mlp": 0.0025704, + "balance_loss_clip": 1.07835281, + "balance_loss_mlp": 0.23051588, + "epoch": 0.6193897489854201, + "flos": 31101118450560.0, + "grad_norm": 16.90812788536881, + "language_loss": 0.79458094, + "learning_rate": 1.3365149493603424e-06, + "loss": 0.8102473, + "num_input_tokens_seen": 221859465, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.26550293, + "step": 10302, + "time_per_iteration": 2.825822591781616 + }, + { + "auxiliary_loss_clip": 0.01326335, + "auxiliary_loss_mlp": 0.00298065, + "balance_loss_clip": 1.08827925, + "balance_loss_mlp": 0.26883516, + "epoch": 0.6194498722380881, + "flos": 19134031662720.0, + "grad_norm": 21.74750643775397, + "language_loss": 0.89180356, + "learning_rate": 1.3361475554883107e-06, + "loss": 0.90804756, + "num_input_tokens_seen": 221878555, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.29211426, + "step": 10303, + "time_per_iteration": 2.684495449066162 + }, + { + "auxiliary_loss_clip": 0.0132975, + "auxiliary_loss_mlp": 0.00276661, + "balance_loss_clip": 1.08833849, + "balance_loss_mlp": 0.2477643, + "epoch": 0.619509995490756, + "flos": 21835088634240.0, + "grad_norm": 7.122673019165301, + "language_loss": 0.84131449, + "learning_rate": 1.3357801867896307e-06, + "loss": 0.8573786, + "num_input_tokens_seen": 221898790, + "router_z_loss_clip": 2.41210938, + "router_z_loss_mlp": 0.28930664, + "step": 10304, + "time_per_iteration": 2.6788198947906494 + }, + { + "auxiliary_loss_clip": 0.01337511, + "auxiliary_loss_mlp": 0.00302038, + "balance_loss_clip": 1.09131718, + "balance_loss_mlp": 0.27030435, + "epoch": 0.619570118743424, + "flos": 23806548552960.0, + "grad_norm": 21.218733675918063, + "language_loss": 0.8615973, + "learning_rate": 1.3354128432782324e-06, + "loss": 0.87799287, + "num_input_tokens_seen": 221918875, + "router_z_loss_clip": 2.46289062, + "router_z_loss_mlp": 0.31726074, + "step": 10305, + "time_per_iteration": 2.734121561050415 + }, + { + "auxiliary_loss_clip": 0.01350067, + "auxiliary_loss_mlp": 0.00271062, + "balance_loss_clip": 1.10116887, + "balance_loss_mlp": 0.24080698, + "epoch": 0.619630241996092, + "flos": 21101612912640.0, + "grad_norm": 5.404759790101958, + "language_loss": 0.87966186, + "learning_rate": 1.335045524968045e-06, + "loss": 0.89587313, + "num_input_tokens_seen": 221937895, + "router_z_loss_clip": 2.4921875, + "router_z_loss_mlp": 0.30273438, + "step": 10306, + "time_per_iteration": 2.6942226886749268 + }, + { + "auxiliary_loss_clip": 0.01328511, + "auxiliary_loss_mlp": 0.0025202, + "balance_loss_clip": 1.09310675, + "balance_loss_mlp": 0.22549549, + "epoch": 0.61969036524876, + "flos": 27308269635840.0, + "grad_norm": 14.380783020694226, + "language_loss": 0.87048328, + "learning_rate": 1.3346782318729988e-06, + "loss": 0.88628858, + "num_input_tokens_seen": 221955920, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.265625, + "step": 10307, + "time_per_iteration": 2.7160332202911377 + }, + { + "auxiliary_loss_clip": 0.01333065, + "auxiliary_loss_mlp": 0.00082543, + "balance_loss_clip": 1.1554296, + "balance_loss_mlp": 0.07558092, + "epoch": 0.6197504885014279, + "flos": 51648955384320.0, + "grad_norm": 0.7912963125069601, + "language_loss": 0.59144467, + "learning_rate": 1.3343109640070203e-06, + "loss": 0.60560071, + "num_input_tokens_seen": 222011405, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.06982422, + "step": 10308, + "time_per_iteration": 3.195725202560425 + }, + { + "auxiliary_loss_clip": 0.01326088, + "auxiliary_loss_mlp": 0.0026348, + "balance_loss_clip": 1.09439206, + "balance_loss_mlp": 0.23910134, + "epoch": 0.6198106117540959, + "flos": 30557107992960.0, + "grad_norm": 5.5014056179783655, + "language_loss": 0.74738109, + "learning_rate": 1.333943721384037e-06, + "loss": 0.76327676, + "num_input_tokens_seen": 222034545, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.24365234, + "step": 10309, + "time_per_iteration": 2.779743194580078 + }, + { + "auxiliary_loss_clip": 0.01324105, + "auxiliary_loss_mlp": 0.00280597, + "balance_loss_clip": 1.08863282, + "balance_loss_mlp": 0.25159386, + "epoch": 0.6198707350067638, + "flos": 18909733184640.0, + "grad_norm": 20.253684711443317, + "language_loss": 0.78604639, + "learning_rate": 1.3335765040179746e-06, + "loss": 0.80209339, + "num_input_tokens_seen": 222052690, + "router_z_loss_clip": 2.35546875, + "router_z_loss_mlp": 0.28991699, + "step": 10310, + "time_per_iteration": 2.930680513381958 + }, + { + "auxiliary_loss_clip": 0.01310858, + "auxiliary_loss_mlp": 0.00261402, + "balance_loss_clip": 1.0755564, + "balance_loss_mlp": 0.23150463, + "epoch": 0.6199308582594318, + "flos": 21433858738560.0, + "grad_norm": 16.308277927828247, + "language_loss": 0.86123443, + "learning_rate": 1.3332093119227573e-06, + "loss": 0.87695694, + "num_input_tokens_seen": 222069095, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.29882812, + "step": 10311, + "time_per_iteration": 2.669586658477783 + }, + { + "auxiliary_loss_clip": 0.01313671, + "auxiliary_loss_mlp": 0.00280749, + "balance_loss_clip": 1.07409644, + "balance_loss_mlp": 0.25020745, + "epoch": 0.6199909815120999, + "flos": 18407379525120.0, + "grad_norm": 5.526139196498862, + "language_loss": 0.80482996, + "learning_rate": 1.3328421451123105e-06, + "loss": 0.8207742, + "num_input_tokens_seen": 222087360, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.30517578, + "step": 10312, + "time_per_iteration": 2.638352632522583 + }, + { + "auxiliary_loss_clip": 0.01363256, + "auxiliary_loss_mlp": 0.00277246, + "balance_loss_clip": 1.11205673, + "balance_loss_mlp": 0.24720523, + "epoch": 0.6200511047647678, + "flos": 21466860359040.0, + "grad_norm": 6.74330042052667, + "language_loss": 0.81297863, + "learning_rate": 1.3324750036005557e-06, + "loss": 0.82938361, + "num_input_tokens_seen": 222106130, + "router_z_loss_clip": 2.51171875, + "router_z_loss_mlp": 0.30065918, + "step": 10313, + "time_per_iteration": 2.6630566120147705 + }, + { + "auxiliary_loss_clip": 0.01328985, + "auxiliary_loss_mlp": 0.00304442, + "balance_loss_clip": 1.08747315, + "balance_loss_mlp": 0.2743775, + "epoch": 0.6201112280174358, + "flos": 18215903099520.0, + "grad_norm": 11.966818274433072, + "language_loss": 0.87325078, + "learning_rate": 1.332107887401416e-06, + "loss": 0.88958502, + "num_input_tokens_seen": 222123125, + "router_z_loss_clip": 2.41601562, + "router_z_loss_mlp": 0.30041504, + "step": 10314, + "time_per_iteration": 2.6283528804779053 + }, + { + "auxiliary_loss_clip": 0.01292074, + "auxiliary_loss_mlp": 0.00285189, + "balance_loss_clip": 1.0626018, + "balance_loss_mlp": 0.25910625, + "epoch": 0.6201713512701037, + "flos": 20011185786240.0, + "grad_norm": 34.86822390583382, + "language_loss": 0.86603028, + "learning_rate": 1.331740796528812e-06, + "loss": 0.88180292, + "num_input_tokens_seen": 222140655, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.26086426, + "step": 10315, + "time_per_iteration": 2.6606106758117676 + }, + { + "auxiliary_loss_clip": 0.01315731, + "auxiliary_loss_mlp": 0.00309613, + "balance_loss_clip": 1.07904232, + "balance_loss_mlp": 0.28255227, + "epoch": 0.6202314745227717, + "flos": 22487692884480.0, + "grad_norm": 15.054500392658369, + "language_loss": 0.82163858, + "learning_rate": 1.3313737309966641e-06, + "loss": 0.83789206, + "num_input_tokens_seen": 222160450, + "router_z_loss_clip": 2.3671875, + "router_z_loss_mlp": 0.27038574, + "step": 10316, + "time_per_iteration": 2.679185390472412 + }, + { + "auxiliary_loss_clip": 0.01301301, + "auxiliary_loss_mlp": 0.00289528, + "balance_loss_clip": 1.06601143, + "balance_loss_mlp": 0.26364762, + "epoch": 0.6202915977754396, + "flos": 26828682220800.0, + "grad_norm": 268.56319940345185, + "language_loss": 0.85409701, + "learning_rate": 1.3310066908188915e-06, + "loss": 0.87000525, + "num_input_tokens_seen": 222179170, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.25891113, + "step": 10317, + "time_per_iteration": 4.110605955123901 + }, + { + "auxiliary_loss_clip": 0.01327176, + "auxiliary_loss_mlp": 0.00036235, + "balance_loss_clip": 1.1505487, + "balance_loss_mlp": 0.02622138, + "epoch": 0.6203517210281076, + "flos": 62742694890240.0, + "grad_norm": 0.6854700409506946, + "language_loss": 0.58545113, + "learning_rate": 1.3306396760094122e-06, + "loss": 0.59908521, + "num_input_tokens_seen": 222242660, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.10009766, + "step": 10318, + "time_per_iteration": 3.218869924545288 + }, + { + "auxiliary_loss_clip": 0.01301446, + "auxiliary_loss_mlp": 0.00272652, + "balance_loss_clip": 1.06839919, + "balance_loss_mlp": 0.24487662, + "epoch": 0.6204118442807756, + "flos": 23404277162880.0, + "grad_norm": 86.90050356144388, + "language_loss": 0.86543071, + "learning_rate": 1.330272686582143e-06, + "loss": 0.8811717, + "num_input_tokens_seen": 222262170, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.27770996, + "step": 10319, + "time_per_iteration": 4.089164972305298 + }, + { + "auxiliary_loss_clip": 0.01298236, + "auxiliary_loss_mlp": 0.00243126, + "balance_loss_clip": 1.06812084, + "balance_loss_mlp": 0.21847314, + "epoch": 0.6204719675334436, + "flos": 20193647898240.0, + "grad_norm": 22.361338837415108, + "language_loss": 0.74299365, + "learning_rate": 1.3299057225510013e-06, + "loss": 0.75840735, + "num_input_tokens_seen": 222280375, + "router_z_loss_clip": 2.30078125, + "router_z_loss_mlp": 0.24682617, + "step": 10320, + "time_per_iteration": 2.6544108390808105 + }, + { + "auxiliary_loss_clip": 0.01267685, + "auxiliary_loss_mlp": 0.00261629, + "balance_loss_clip": 1.04825246, + "balance_loss_mlp": 0.23704842, + "epoch": 0.6205320907861115, + "flos": 13188050916480.0, + "grad_norm": 42.712745146082, + "language_loss": 0.82159984, + "learning_rate": 1.3295387839299013e-06, + "loss": 0.83689296, + "num_input_tokens_seen": 222297325, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.24609375, + "step": 10321, + "time_per_iteration": 2.6668434143066406 + }, + { + "auxiliary_loss_clip": 0.0128626, + "auxiliary_loss_mlp": 0.00256819, + "balance_loss_clip": 1.05706275, + "balance_loss_mlp": 0.23112971, + "epoch": 0.6205922140387795, + "flos": 20668386977280.0, + "grad_norm": 1027.398906896536, + "language_loss": 0.82300013, + "learning_rate": 1.329171870732758e-06, + "loss": 0.83843088, + "num_input_tokens_seen": 222317095, + "router_z_loss_clip": 2.29101562, + "router_z_loss_mlp": 0.25683594, + "step": 10322, + "time_per_iteration": 4.142731428146362 + }, + { + "auxiliary_loss_clip": 0.01302827, + "auxiliary_loss_mlp": 0.00253767, + "balance_loss_clip": 1.0723716, + "balance_loss_mlp": 0.22870967, + "epoch": 0.6206523372914474, + "flos": 23877831093120.0, + "grad_norm": 13.503279999236085, + "language_loss": 0.80342805, + "learning_rate": 1.3288049829734845e-06, + "loss": 0.81899399, + "num_input_tokens_seen": 222337055, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.25048828, + "step": 10323, + "time_per_iteration": 2.682368516921997 + }, + { + "auxiliary_loss_clip": 0.0131784, + "auxiliary_loss_mlp": 0.00290007, + "balance_loss_clip": 1.07716918, + "balance_loss_mlp": 0.26153958, + "epoch": 0.6207124605441154, + "flos": 13406603218560.0, + "grad_norm": 16.36340821888915, + "language_loss": 0.68777829, + "learning_rate": 1.3284381206659933e-06, + "loss": 0.70385677, + "num_input_tokens_seen": 222354515, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.28479004, + "step": 10324, + "time_per_iteration": 2.6142630577087402 + }, + { + "auxiliary_loss_clip": 0.0130858, + "auxiliary_loss_mlp": 0.00265909, + "balance_loss_clip": 1.0737046, + "balance_loss_mlp": 0.23866998, + "epoch": 0.6207725837967835, + "flos": 18916341287040.0, + "grad_norm": 117.90523726719724, + "language_loss": 0.85766852, + "learning_rate": 1.3280712838241956e-06, + "loss": 0.87341344, + "num_input_tokens_seen": 222372755, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.27209473, + "step": 10325, + "time_per_iteration": 2.653644561767578 + }, + { + "auxiliary_loss_clip": 0.01301715, + "auxiliary_loss_mlp": 0.00281493, + "balance_loss_clip": 1.0667268, + "balance_loss_mlp": 0.25365782, + "epoch": 0.6208327070494514, + "flos": 23980211832960.0, + "grad_norm": 7.341069893402933, + "language_loss": 0.8015582, + "learning_rate": 1.327704472462003e-06, + "loss": 0.81739026, + "num_input_tokens_seen": 222391380, + "router_z_loss_clip": 2.34960938, + "router_z_loss_mlp": 0.27832031, + "step": 10326, + "time_per_iteration": 2.6519041061401367 + }, + { + "auxiliary_loss_clip": 0.01299021, + "auxiliary_loss_mlp": 0.00270881, + "balance_loss_clip": 1.0672996, + "balance_loss_mlp": 0.2429141, + "epoch": 0.6208928303021194, + "flos": 22820405587200.0, + "grad_norm": 7.406109409997229, + "language_loss": 0.85666013, + "learning_rate": 1.3273376865933234e-06, + "loss": 0.87235916, + "num_input_tokens_seen": 222411165, + "router_z_loss_clip": 2.31835938, + "router_z_loss_mlp": 0.27954102, + "step": 10327, + "time_per_iteration": 4.056640386581421 + }, + { + "auxiliary_loss_clip": 0.01324192, + "auxiliary_loss_mlp": 0.00289167, + "balance_loss_clip": 1.08589053, + "balance_loss_mlp": 0.25998494, + "epoch": 0.6209529535547873, + "flos": 17564519911680.0, + "grad_norm": 47.33054883231845, + "language_loss": 0.89487028, + "learning_rate": 1.326970926232066e-06, + "loss": 0.91100395, + "num_input_tokens_seen": 222428110, + "router_z_loss_clip": 2.38476562, + "router_z_loss_mlp": 0.29174805, + "step": 10328, + "time_per_iteration": 2.629875898361206 + }, + { + "auxiliary_loss_clip": 0.01307258, + "auxiliary_loss_mlp": 0.00251887, + "balance_loss_clip": 1.07707441, + "balance_loss_mlp": 0.22469571, + "epoch": 0.6210130768074553, + "flos": 22011912311040.0, + "grad_norm": 39.36442437649523, + "language_loss": 0.86923897, + "learning_rate": 1.3266041913921396e-06, + "loss": 0.88483042, + "num_input_tokens_seen": 222446385, + "router_z_loss_clip": 2.30273438, + "router_z_loss_mlp": 0.27172852, + "step": 10329, + "time_per_iteration": 2.666555404663086 + }, + { + "auxiliary_loss_clip": 0.01300047, + "auxiliary_loss_mlp": 0.00114146, + "balance_loss_clip": 1.12392437, + "balance_loss_mlp": 0.10303535, + "epoch": 0.6210732000601232, + "flos": 63676873854720.0, + "grad_norm": 0.7646719881809241, + "language_loss": 0.61239672, + "learning_rate": 1.3262374820874484e-06, + "loss": 0.62653869, + "num_input_tokens_seen": 222502150, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.11132812, + "step": 10330, + "time_per_iteration": 3.164947032928467 + }, + { + "auxiliary_loss_clip": 0.0132177, + "auxiliary_loss_mlp": 0.00257894, + "balance_loss_clip": 1.08602214, + "balance_loss_mlp": 0.23073792, + "epoch": 0.6211333233127913, + "flos": 24243365848320.0, + "grad_norm": 4.225844054072377, + "language_loss": 0.86939335, + "learning_rate": 1.3258707983319002e-06, + "loss": 0.88519001, + "num_input_tokens_seen": 222519880, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.27160645, + "step": 10331, + "time_per_iteration": 2.6863324642181396 + }, + { + "auxiliary_loss_clip": 0.01297963, + "auxiliary_loss_mlp": 0.0024593, + "balance_loss_clip": 1.06531143, + "balance_loss_mlp": 0.21975169, + "epoch": 0.6211934465654592, + "flos": 16943803960320.0, + "grad_norm": 121.54085396143287, + "language_loss": 0.74299192, + "learning_rate": 1.3255041401393992e-06, + "loss": 0.75843084, + "num_input_tokens_seen": 222538545, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.26184082, + "step": 10332, + "time_per_iteration": 2.6324405670166016 + }, + { + "auxiliary_loss_clip": 0.01303904, + "auxiliary_loss_mlp": 0.00262297, + "balance_loss_clip": 1.073192, + "balance_loss_mlp": 0.23710853, + "epoch": 0.6212535698181272, + "flos": 15267386355840.0, + "grad_norm": 771.8662423434789, + "language_loss": 0.83086157, + "learning_rate": 1.3251375075238476e-06, + "loss": 0.84652358, + "num_input_tokens_seen": 222556935, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.25195312, + "step": 10333, + "time_per_iteration": 2.6692850589752197 + }, + { + "auxiliary_loss_clip": 0.01299102, + "auxiliary_loss_mlp": 0.00252802, + "balance_loss_clip": 1.07220256, + "balance_loss_mlp": 0.22782733, + "epoch": 0.6213136930707951, + "flos": 13443950384640.0, + "grad_norm": 227.68621501900185, + "language_loss": 0.79658079, + "learning_rate": 1.3247709004991507e-06, + "loss": 0.81209975, + "num_input_tokens_seen": 222574035, + "router_z_loss_clip": 2.265625, + "router_z_loss_mlp": 0.25012207, + "step": 10334, + "time_per_iteration": 2.7203922271728516 + }, + { + "auxiliary_loss_clip": 0.01288179, + "auxiliary_loss_mlp": 0.00250015, + "balance_loss_clip": 1.06226158, + "balance_loss_mlp": 0.22444472, + "epoch": 0.6213738163234631, + "flos": 18111223889280.0, + "grad_norm": 10.484090048423955, + "language_loss": 0.787974, + "learning_rate": 1.3244043190792078e-06, + "loss": 0.80335587, + "num_input_tokens_seen": 222592290, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.25561523, + "step": 10335, + "time_per_iteration": 2.7480292320251465 + }, + { + "auxiliary_loss_clip": 0.01299076, + "auxiliary_loss_mlp": 0.00237957, + "balance_loss_clip": 1.07427835, + "balance_loss_mlp": 0.21394783, + "epoch": 0.621433939576131, + "flos": 25337348421120.0, + "grad_norm": 2.960374593638807, + "language_loss": 0.8607294, + "learning_rate": 1.3240377632779213e-06, + "loss": 0.87609971, + "num_input_tokens_seen": 222612805, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.24023438, + "step": 10336, + "time_per_iteration": 2.7224369049072266 + }, + { + "auxiliary_loss_clip": 0.01281781, + "auxiliary_loss_mlp": 0.00277109, + "balance_loss_clip": 1.05884147, + "balance_loss_mlp": 0.25085929, + "epoch": 0.621494062828799, + "flos": 22565619440640.0, + "grad_norm": 47.51365612811129, + "language_loss": 0.82197535, + "learning_rate": 1.3236712331091907e-06, + "loss": 0.83756429, + "num_input_tokens_seen": 222632260, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.26220703, + "step": 10337, + "time_per_iteration": 2.6502163410186768 + }, + { + "auxiliary_loss_clip": 0.01323989, + "auxiliary_loss_mlp": 0.00280627, + "balance_loss_clip": 1.08490527, + "balance_loss_mlp": 0.25308961, + "epoch": 0.621554186081467, + "flos": 27417976750080.0, + "grad_norm": 45.973465797231476, + "language_loss": 0.72699428, + "learning_rate": 1.3233047285869145e-06, + "loss": 0.74304044, + "num_input_tokens_seen": 222653570, + "router_z_loss_clip": 2.390625, + "router_z_loss_mlp": 0.27539062, + "step": 10338, + "time_per_iteration": 2.6785459518432617 + }, + { + "auxiliary_loss_clip": 0.01284727, + "auxiliary_loss_mlp": 0.00256705, + "balance_loss_clip": 1.05719924, + "balance_loss_mlp": 0.23045543, + "epoch": 0.621614309334135, + "flos": 22346815743360.0, + "grad_norm": 138.92332659416812, + "language_loss": 0.78443766, + "learning_rate": 1.322938249724991e-06, + "loss": 0.79985201, + "num_input_tokens_seen": 222672480, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.26269531, + "step": 10339, + "time_per_iteration": 2.658094644546509 + }, + { + "auxiliary_loss_clip": 0.01287575, + "auxiliary_loss_mlp": 0.00223372, + "balance_loss_clip": 1.06112695, + "balance_loss_mlp": 0.19962594, + "epoch": 0.621674432586803, + "flos": 19281229597440.0, + "grad_norm": 12.414805152957209, + "language_loss": 0.78956461, + "learning_rate": 1.3225717965373166e-06, + "loss": 0.80467409, + "num_input_tokens_seen": 222691200, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.23742676, + "step": 10340, + "time_per_iteration": 2.6325876712799072 + }, + { + "auxiliary_loss_clip": 0.01302086, + "auxiliary_loss_mlp": 0.00232947, + "balance_loss_clip": 1.070508, + "balance_loss_mlp": 0.20732927, + "epoch": 0.6217345558394709, + "flos": 21609533180160.0, + "grad_norm": 185.39481677242802, + "language_loss": 0.78590894, + "learning_rate": 1.322205369037788e-06, + "loss": 0.80125928, + "num_input_tokens_seen": 222709975, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.25634766, + "step": 10341, + "time_per_iteration": 2.667786121368408 + }, + { + "auxiliary_loss_clip": 0.01317361, + "auxiliary_loss_mlp": 0.00242558, + "balance_loss_clip": 1.08301306, + "balance_loss_mlp": 0.21615291, + "epoch": 0.6217946790921389, + "flos": 18004102554240.0, + "grad_norm": 12.449379530199982, + "language_loss": 0.88974571, + "learning_rate": 1.321838967240299e-06, + "loss": 0.9053449, + "num_input_tokens_seen": 222729005, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.26416016, + "step": 10342, + "time_per_iteration": 2.6338534355163574 + }, + { + "auxiliary_loss_clip": 0.01317224, + "auxiliary_loss_mlp": 0.00099972, + "balance_loss_clip": 1.14166021, + "balance_loss_mlp": 0.09215173, + "epoch": 0.6218548023448068, + "flos": 61973631768960.0, + "grad_norm": 0.7788421358116109, + "language_loss": 0.56792647, + "learning_rate": 1.3214725911587452e-06, + "loss": 0.58209842, + "num_input_tokens_seen": 222786090, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.078125, + "step": 10343, + "time_per_iteration": 3.074636697769165 + }, + { + "auxiliary_loss_clip": 0.01287572, + "auxiliary_loss_mlp": 0.00245501, + "balance_loss_clip": 1.06415153, + "balance_loss_mlp": 0.2197998, + "epoch": 0.6219149255974749, + "flos": 25739152934400.0, + "grad_norm": 4.825099779455435, + "language_loss": 0.80245674, + "learning_rate": 1.3211062408070184e-06, + "loss": 0.81778741, + "num_input_tokens_seen": 222806100, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.25708008, + "step": 10344, + "time_per_iteration": 2.698265552520752 + }, + { + "auxiliary_loss_clip": 0.013225, + "auxiliary_loss_mlp": 0.00260659, + "balance_loss_clip": 1.08795476, + "balance_loss_mlp": 0.23382463, + "epoch": 0.6219750488501428, + "flos": 25411073086080.0, + "grad_norm": 8.06868354690829, + "language_loss": 0.69254136, + "learning_rate": 1.3207399161990105e-06, + "loss": 0.70837295, + "num_input_tokens_seen": 222826575, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.26867676, + "step": 10345, + "time_per_iteration": 2.711132526397705 + }, + { + "auxiliary_loss_clip": 0.01311377, + "auxiliary_loss_mlp": 0.00262418, + "balance_loss_clip": 1.07804346, + "balance_loss_mlp": 0.23399878, + "epoch": 0.6220351721028108, + "flos": 20047383717120.0, + "grad_norm": 11.544681077962773, + "language_loss": 0.84592593, + "learning_rate": 1.320373617348614e-06, + "loss": 0.86166394, + "num_input_tokens_seen": 222845285, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.28417969, + "step": 10346, + "time_per_iteration": 2.6951956748962402 + }, + { + "auxiliary_loss_clip": 0.01338247, + "auxiliary_loss_mlp": 0.00250807, + "balance_loss_clip": 1.09729278, + "balance_loss_mlp": 0.22251855, + "epoch": 0.6220952953554787, + "flos": 27488397363840.0, + "grad_norm": 3.5352178875141003, + "language_loss": 0.78411019, + "learning_rate": 1.3200073442697171e-06, + "loss": 0.80000073, + "num_input_tokens_seen": 222864575, + "router_z_loss_clip": 2.40820312, + "router_z_loss_mlp": 0.28283691, + "step": 10347, + "time_per_iteration": 2.740128755569458 + }, + { + "auxiliary_loss_clip": 0.01319571, + "auxiliary_loss_mlp": 0.00235082, + "balance_loss_clip": 1.09024906, + "balance_loss_mlp": 0.20988104, + "epoch": 0.6221554186081467, + "flos": 19207612673280.0, + "grad_norm": 94.38055846182816, + "language_loss": 0.79825634, + "learning_rate": 1.3196410969762108e-06, + "loss": 0.81380284, + "num_input_tokens_seen": 222884420, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.25219727, + "step": 10348, + "time_per_iteration": 2.6733741760253906 + }, + { + "auxiliary_loss_clip": 0.01330569, + "auxiliary_loss_mlp": 0.00109322, + "balance_loss_clip": 1.15283823, + "balance_loss_mlp": 0.09983259, + "epoch": 0.6222155418608146, + "flos": 62950939989120.0, + "grad_norm": 0.7858844834115778, + "language_loss": 0.53497344, + "learning_rate": 1.3192748754819815e-06, + "loss": 0.54937232, + "num_input_tokens_seen": 222944690, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.09472656, + "step": 10349, + "time_per_iteration": 3.195889472961426 + }, + { + "auxiliary_loss_clip": 0.0130958, + "auxiliary_loss_mlp": 0.00249782, + "balance_loss_clip": 1.07720351, + "balance_loss_mlp": 0.22386573, + "epoch": 0.6222756651134826, + "flos": 22601099099520.0, + "grad_norm": 169.74657484505445, + "language_loss": 0.78270519, + "learning_rate": 1.3189086798009173e-06, + "loss": 0.79829884, + "num_input_tokens_seen": 222962990, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.25927734, + "step": 10350, + "time_per_iteration": 2.816863775253296 + }, + { + "auxiliary_loss_clip": 0.01321268, + "auxiliary_loss_mlp": 0.00251098, + "balance_loss_clip": 1.08810639, + "balance_loss_mlp": 0.22404924, + "epoch": 0.6223357883661506, + "flos": 21142228216320.0, + "grad_norm": 3.74417644680404, + "language_loss": 0.67113638, + "learning_rate": 1.3185425099469046e-06, + "loss": 0.68686002, + "num_input_tokens_seen": 222980715, + "router_z_loss_clip": 2.33789062, + "router_z_loss_mlp": 0.27062988, + "step": 10351, + "time_per_iteration": 2.658879041671753 + }, + { + "auxiliary_loss_clip": 0.0132832, + "auxiliary_loss_mlp": 0.00093404, + "balance_loss_clip": 1.15198517, + "balance_loss_mlp": 0.0849162, + "epoch": 0.6223959116188186, + "flos": 63765071700480.0, + "grad_norm": 0.7963870113234681, + "language_loss": 0.60843891, + "learning_rate": 1.3181763659338276e-06, + "loss": 0.62265617, + "num_input_tokens_seen": 223040685, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.08496094, + "step": 10352, + "time_per_iteration": 3.1323366165161133 + }, + { + "auxiliary_loss_clip": 0.01321444, + "auxiliary_loss_mlp": 0.00236402, + "balance_loss_clip": 1.09064555, + "balance_loss_mlp": 0.20899597, + "epoch": 0.6224560348714866, + "flos": 22565727181440.0, + "grad_norm": 35.56450302318675, + "language_loss": 0.91001862, + "learning_rate": 1.3178102477755714e-06, + "loss": 0.92559707, + "num_input_tokens_seen": 223059000, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.27404785, + "step": 10353, + "time_per_iteration": 2.673476457595825 + }, + { + "auxiliary_loss_clip": 0.01308074, + "auxiliary_loss_mlp": 0.00247064, + "balance_loss_clip": 1.08094811, + "balance_loss_mlp": 0.2217558, + "epoch": 0.6225161581241545, + "flos": 24097748112000.0, + "grad_norm": 7.670524804699495, + "language_loss": 0.83554864, + "learning_rate": 1.3174441554860195e-06, + "loss": 0.85110003, + "num_input_tokens_seen": 223079345, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.25305176, + "step": 10354, + "time_per_iteration": 2.6669516563415527 + }, + { + "auxiliary_loss_clip": 0.01303437, + "auxiliary_loss_mlp": 0.00251317, + "balance_loss_clip": 1.07422304, + "balance_loss_mlp": 0.22569901, + "epoch": 0.6225762813768225, + "flos": 20443513881600.0, + "grad_norm": 22.053093716543074, + "language_loss": 0.8461594, + "learning_rate": 1.3170780890790528e-06, + "loss": 0.86170697, + "num_input_tokens_seen": 223097880, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.25610352, + "step": 10355, + "time_per_iteration": 2.6259427070617676 + }, + { + "auxiliary_loss_clip": 0.01309345, + "auxiliary_loss_mlp": 0.00242098, + "balance_loss_clip": 1.08026695, + "balance_loss_mlp": 0.21612188, + "epoch": 0.6226364046294904, + "flos": 27198131558400.0, + "grad_norm": 5.851559024277878, + "language_loss": 0.85372877, + "learning_rate": 1.3167120485685538e-06, + "loss": 0.86924326, + "num_input_tokens_seen": 223118185, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.2598877, + "step": 10356, + "time_per_iteration": 2.9869544506073 + }, + { + "auxiliary_loss_clip": 0.0132444, + "auxiliary_loss_mlp": 0.00274733, + "balance_loss_clip": 1.09021139, + "balance_loss_mlp": 0.24760097, + "epoch": 0.6226965278821585, + "flos": 20445776438400.0, + "grad_norm": 20.281764857799615, + "language_loss": 0.78111285, + "learning_rate": 1.3163460339684024e-06, + "loss": 0.7971046, + "num_input_tokens_seen": 223137600, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.27148438, + "step": 10357, + "time_per_iteration": 2.6503679752349854 + }, + { + "auxiliary_loss_clip": 0.01337755, + "auxiliary_loss_mlp": 0.00279745, + "balance_loss_clip": 1.09654653, + "balance_loss_mlp": 0.2502763, + "epoch": 0.6227566511348264, + "flos": 22162737519360.0, + "grad_norm": 12.79856802309104, + "language_loss": 0.86712182, + "learning_rate": 1.3159800452924778e-06, + "loss": 0.88329685, + "num_input_tokens_seen": 223154360, + "router_z_loss_clip": 2.41015625, + "router_z_loss_mlp": 0.29467773, + "step": 10358, + "time_per_iteration": 2.6606297492980957 + }, + { + "auxiliary_loss_clip": 0.01310164, + "auxiliary_loss_mlp": 0.00238386, + "balance_loss_clip": 1.07904267, + "balance_loss_mlp": 0.21186206, + "epoch": 0.6228167743874944, + "flos": 18040875102720.0, + "grad_norm": 759.1143070867084, + "language_loss": 0.91620111, + "learning_rate": 1.3156140825546588e-06, + "loss": 0.93168664, + "num_input_tokens_seen": 223172255, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.265625, + "step": 10359, + "time_per_iteration": 4.064505338668823 + }, + { + "auxiliary_loss_clip": 0.01326422, + "auxiliary_loss_mlp": 0.00231584, + "balance_loss_clip": 1.09085572, + "balance_loss_mlp": 0.20461929, + "epoch": 0.6228768976401623, + "flos": 17742851959680.0, + "grad_norm": 4.723569207089729, + "language_loss": 0.82445198, + "learning_rate": 1.315248145768822e-06, + "loss": 0.8400321, + "num_input_tokens_seen": 223186965, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.26977539, + "step": 10360, + "time_per_iteration": 2.6189181804656982 + }, + { + "auxiliary_loss_clip": 0.01338534, + "auxiliary_loss_mlp": 0.00270905, + "balance_loss_clip": 1.09944868, + "balance_loss_mlp": 0.24253309, + "epoch": 0.6229370208928303, + "flos": 17894934144000.0, + "grad_norm": 14.827282265720882, + "language_loss": 0.86011112, + "learning_rate": 1.3148822349488442e-06, + "loss": 0.87620544, + "num_input_tokens_seen": 223206045, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.28381348, + "step": 10361, + "time_per_iteration": 4.092345952987671 + }, + { + "auxiliary_loss_clip": 0.01316102, + "auxiliary_loss_mlp": 0.00230217, + "balance_loss_clip": 1.08622253, + "balance_loss_mlp": 0.20598215, + "epoch": 0.6229971441454982, + "flos": 17347763289600.0, + "grad_norm": 3.1165834374882126, + "language_loss": 0.75942373, + "learning_rate": 1.3145163501086005e-06, + "loss": 0.77488697, + "num_input_tokens_seen": 223224820, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.24243164, + "step": 10362, + "time_per_iteration": 2.631775379180908 + }, + { + "auxiliary_loss_clip": 0.01315844, + "auxiliary_loss_mlp": 0.00276331, + "balance_loss_clip": 1.08375299, + "balance_loss_mlp": 0.24899636, + "epoch": 0.6230572673981662, + "flos": 29241376807680.0, + "grad_norm": 7.406725308922693, + "language_loss": 0.76214391, + "learning_rate": 1.3141504912619658e-06, + "loss": 0.77806568, + "num_input_tokens_seen": 223243205, + "router_z_loss_clip": 2.32226562, + "router_z_loss_mlp": 0.27355957, + "step": 10363, + "time_per_iteration": 2.7273290157318115 + }, + { + "auxiliary_loss_clip": 0.01328076, + "auxiliary_loss_mlp": 0.00262576, + "balance_loss_clip": 1.08901477, + "balance_loss_mlp": 0.23630184, + "epoch": 0.6231173906508342, + "flos": 16325961096960.0, + "grad_norm": 2.294169353109777, + "language_loss": 0.94826496, + "learning_rate": 1.3137846584228127e-06, + "loss": 0.96417147, + "num_input_tokens_seen": 223261370, + "router_z_loss_clip": 2.390625, + "router_z_loss_mlp": 0.26269531, + "step": 10364, + "time_per_iteration": 4.01880407333374 + }, + { + "auxiliary_loss_clip": 0.01343185, + "auxiliary_loss_mlp": 0.00080751, + "balance_loss_clip": 1.16561604, + "balance_loss_mlp": 0.07331251, + "epoch": 0.6231775139035022, + "flos": 68702032517760.0, + "grad_norm": 0.9034384558497288, + "language_loss": 0.60344177, + "learning_rate": 1.313418851605015e-06, + "loss": 0.61768115, + "num_input_tokens_seen": 223315050, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.07421875, + "step": 10365, + "time_per_iteration": 3.1797680854797363 + }, + { + "auxiliary_loss_clip": 0.01332513, + "auxiliary_loss_mlp": 0.00254394, + "balance_loss_clip": 1.09341562, + "balance_loss_mlp": 0.22534317, + "epoch": 0.6232376371561702, + "flos": 19821038163840.0, + "grad_norm": 4.925275668886382, + "language_loss": 0.85040683, + "learning_rate": 1.3130530708224427e-06, + "loss": 0.86627591, + "num_input_tokens_seen": 223332130, + "router_z_loss_clip": 2.39257812, + "router_z_loss_mlp": 0.29064941, + "step": 10366, + "time_per_iteration": 2.6355018615722656 + }, + { + "auxiliary_loss_clip": 0.0133767, + "auxiliary_loss_mlp": 0.00238238, + "balance_loss_clip": 1.09752154, + "balance_loss_mlp": 0.21055728, + "epoch": 0.6232977604088381, + "flos": 23258264376960.0, + "grad_norm": 5.521972441108516, + "language_loss": 0.84866011, + "learning_rate": 1.3126873160889665e-06, + "loss": 0.86441916, + "num_input_tokens_seen": 223351605, + "router_z_loss_clip": 2.40234375, + "router_z_loss_mlp": 0.27697754, + "step": 10367, + "time_per_iteration": 2.7085306644439697 + }, + { + "auxiliary_loss_clip": 0.01329546, + "auxiliary_loss_mlp": 0.00240021, + "balance_loss_clip": 1.09611905, + "balance_loss_mlp": 0.21424839, + "epoch": 0.6233578836615061, + "flos": 21106425335040.0, + "grad_norm": 16.468926353292385, + "language_loss": 0.84709287, + "learning_rate": 1.312321587418457e-06, + "loss": 0.86278856, + "num_input_tokens_seen": 223372090, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.2578125, + "step": 10368, + "time_per_iteration": 2.661933660507202 + }, + { + "auxiliary_loss_clip": 0.01342355, + "auxiliary_loss_mlp": 0.00269108, + "balance_loss_clip": 1.10135508, + "balance_loss_mlp": 0.24035501, + "epoch": 0.623418006914174, + "flos": 23769416868480.0, + "grad_norm": 8.308856144875948, + "language_loss": 0.8006134, + "learning_rate": 1.3119558848247811e-06, + "loss": 0.816728, + "num_input_tokens_seen": 223390110, + "router_z_loss_clip": 2.41210938, + "router_z_loss_mlp": 0.28759766, + "step": 10369, + "time_per_iteration": 4.071518898010254 + }, + { + "auxiliary_loss_clip": 0.01316041, + "auxiliary_loss_mlp": 0.00259388, + "balance_loss_clip": 1.08648896, + "balance_loss_mlp": 0.23169559, + "epoch": 0.6234781301668421, + "flos": 17890480857600.0, + "grad_norm": 10.402359575816, + "language_loss": 0.94451338, + "learning_rate": 1.3115902083218072e-06, + "loss": 0.96026772, + "num_input_tokens_seen": 223404205, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.27661133, + "step": 10370, + "time_per_iteration": 2.647996664047241 + }, + { + "auxiliary_loss_clip": 0.01341479, + "auxiliary_loss_mlp": 0.00286832, + "balance_loss_clip": 1.10475254, + "balance_loss_mlp": 0.25716048, + "epoch": 0.62353825341951, + "flos": 26175503352960.0, + "grad_norm": 5.669240266890785, + "language_loss": 0.71796858, + "learning_rate": 1.311224557923402e-06, + "loss": 0.73425168, + "num_input_tokens_seen": 223424855, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.296875, + "step": 10371, + "time_per_iteration": 2.7879505157470703 + }, + { + "auxiliary_loss_clip": 0.01335062, + "auxiliary_loss_mlp": 0.00267255, + "balance_loss_clip": 1.10400271, + "balance_loss_mlp": 0.2411481, + "epoch": 0.623598376672178, + "flos": 31139902160640.0, + "grad_norm": 70.31724079767974, + "language_loss": 0.8212204, + "learning_rate": 1.3108589336434298e-06, + "loss": 0.83724362, + "num_input_tokens_seen": 223447225, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.26086426, + "step": 10372, + "time_per_iteration": 2.7664785385131836 + }, + { + "auxiliary_loss_clip": 0.013717, + "auxiliary_loss_mlp": 0.00275482, + "balance_loss_clip": 1.1210475, + "balance_loss_mlp": 0.24444015, + "epoch": 0.6236584999248459, + "flos": 23730202195200.0, + "grad_norm": 4.255487798977755, + "language_loss": 0.83825195, + "learning_rate": 1.3104933354957568e-06, + "loss": 0.85472381, + "num_input_tokens_seen": 223467520, + "router_z_loss_clip": 2.5078125, + "router_z_loss_mlp": 0.31030273, + "step": 10373, + "time_per_iteration": 2.754181146621704 + }, + { + "auxiliary_loss_clip": 0.01337034, + "auxiliary_loss_mlp": 0.00253566, + "balance_loss_clip": 1.10494709, + "balance_loss_mlp": 0.22755468, + "epoch": 0.6237186231775139, + "flos": 21762764599680.0, + "grad_norm": 13.294638636650623, + "language_loss": 0.76040423, + "learning_rate": 1.3101277634942448e-06, + "loss": 0.77631027, + "num_input_tokens_seen": 223488130, + "router_z_loss_clip": 2.32226562, + "router_z_loss_mlp": 0.26000977, + "step": 10374, + "time_per_iteration": 2.6562414169311523 + }, + { + "auxiliary_loss_clip": 0.01352857, + "auxiliary_loss_mlp": 0.00240788, + "balance_loss_clip": 1.11076021, + "balance_loss_mlp": 0.21437109, + "epoch": 0.6237787464301818, + "flos": 14939486075520.0, + "grad_norm": 2.674659180123151, + "language_loss": 0.84392256, + "learning_rate": 1.3097622176527577e-06, + "loss": 0.85985905, + "num_input_tokens_seen": 223505105, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.2644043, + "step": 10375, + "time_per_iteration": 2.650298833847046 + }, + { + "auxiliary_loss_clip": 0.01331773, + "auxiliary_loss_mlp": 0.00253114, + "balance_loss_clip": 1.09864628, + "balance_loss_mlp": 0.22730517, + "epoch": 0.6238388696828499, + "flos": 35590311302400.0, + "grad_norm": 19.584333926395477, + "language_loss": 0.76569808, + "learning_rate": 1.3093966979851566e-06, + "loss": 0.78154695, + "num_input_tokens_seen": 223528065, + "router_z_loss_clip": 2.3359375, + "router_z_loss_mlp": 0.25817871, + "step": 10376, + "time_per_iteration": 2.766901969909668 + }, + { + "auxiliary_loss_clip": 0.01354674, + "auxiliary_loss_mlp": 0.00279682, + "balance_loss_clip": 1.11204231, + "balance_loss_mlp": 0.24895018, + "epoch": 0.6238989929355178, + "flos": 23623511823360.0, + "grad_norm": 12.874054872415622, + "language_loss": 0.86961001, + "learning_rate": 1.309031204505301e-06, + "loss": 0.88595361, + "num_input_tokens_seen": 223547305, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.30712891, + "step": 10377, + "time_per_iteration": 2.6772079467773438 + }, + { + "auxiliary_loss_clip": 0.01344586, + "auxiliary_loss_mlp": 0.00255796, + "balance_loss_clip": 1.10530043, + "balance_loss_mlp": 0.22885454, + "epoch": 0.6239591161881858, + "flos": 22087468569600.0, + "grad_norm": 4.053190268110635, + "language_loss": 0.77822709, + "learning_rate": 1.308665737227052e-06, + "loss": 0.79423094, + "num_input_tokens_seen": 223567205, + "router_z_loss_clip": 2.390625, + "router_z_loss_mlp": 0.26928711, + "step": 10378, + "time_per_iteration": 2.6243035793304443 + }, + { + "auxiliary_loss_clip": 0.01334251, + "auxiliary_loss_mlp": 0.00250529, + "balance_loss_clip": 1.10041142, + "balance_loss_mlp": 0.2213106, + "epoch": 0.6240192394408538, + "flos": 24535930124160.0, + "grad_norm": 12.40190590246399, + "language_loss": 0.83606064, + "learning_rate": 1.3083002961642675e-06, + "loss": 0.85190845, + "num_input_tokens_seen": 223586560, + "router_z_loss_clip": 2.33789062, + "router_z_loss_mlp": 0.29223633, + "step": 10379, + "time_per_iteration": 2.7167038917541504 + }, + { + "auxiliary_loss_clip": 0.01346753, + "auxiliary_loss_mlp": 0.00262924, + "balance_loss_clip": 1.10820162, + "balance_loss_mlp": 0.23446855, + "epoch": 0.6240793626935217, + "flos": 27931930502400.0, + "grad_norm": 3.3374631801015546, + "language_loss": 0.84772301, + "learning_rate": 1.3079348813308051e-06, + "loss": 0.86381978, + "num_input_tokens_seen": 223610595, + "router_z_loss_clip": 2.390625, + "router_z_loss_mlp": 0.28442383, + "step": 10380, + "time_per_iteration": 2.744037389755249 + }, + { + "auxiliary_loss_clip": 0.01327406, + "auxiliary_loss_mlp": 0.00241794, + "balance_loss_clip": 1.09490943, + "balance_loss_mlp": 0.21527007, + "epoch": 0.6241394859461897, + "flos": 22892514140160.0, + "grad_norm": 7.008684197903679, + "language_loss": 0.87047923, + "learning_rate": 1.3075694927405207e-06, + "loss": 0.88617122, + "num_input_tokens_seen": 223630230, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.26550293, + "step": 10381, + "time_per_iteration": 2.7291626930236816 + }, + { + "auxiliary_loss_clip": 0.0134302, + "auxiliary_loss_mlp": 0.0028329, + "balance_loss_clip": 1.10799968, + "balance_loss_mlp": 0.25607419, + "epoch": 0.6241996091988576, + "flos": 12750766744320.0, + "grad_norm": 2.4094153241824112, + "language_loss": 0.83631754, + "learning_rate": 1.3072041304072718e-06, + "loss": 0.85258061, + "num_input_tokens_seen": 223648360, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.27209473, + "step": 10382, + "time_per_iteration": 2.608396530151367 + }, + { + "auxiliary_loss_clip": 0.01344036, + "auxiliary_loss_mlp": 0.00269763, + "balance_loss_clip": 1.10806036, + "balance_loss_mlp": 0.24309596, + "epoch": 0.6242597324515257, + "flos": 25851302173440.0, + "grad_norm": 361.17519112919894, + "language_loss": 0.83507735, + "learning_rate": 1.306838794344911e-06, + "loss": 0.85121536, + "num_input_tokens_seen": 223671255, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.26672363, + "step": 10383, + "time_per_iteration": 2.7286007404327393 + }, + { + "auxiliary_loss_clip": 0.01334922, + "auxiliary_loss_mlp": 0.00244739, + "balance_loss_clip": 1.09984112, + "balance_loss_mlp": 0.21893036, + "epoch": 0.6243198557041936, + "flos": 19937712516480.0, + "grad_norm": 9.016681350425893, + "language_loss": 0.82794553, + "learning_rate": 1.3064734845672925e-06, + "loss": 0.84374213, + "num_input_tokens_seen": 223689860, + "router_z_loss_clip": 2.34960938, + "router_z_loss_mlp": 0.25817871, + "step": 10384, + "time_per_iteration": 2.6625802516937256 + }, + { + "auxiliary_loss_clip": 0.01343935, + "auxiliary_loss_mlp": 0.00259852, + "balance_loss_clip": 1.10291934, + "balance_loss_mlp": 0.23300651, + "epoch": 0.6243799789568616, + "flos": 18406194376320.0, + "grad_norm": 8.755352052072526, + "language_loss": 0.75496769, + "learning_rate": 1.3061082010882694e-06, + "loss": 0.77100563, + "num_input_tokens_seen": 223707835, + "router_z_loss_clip": 2.41015625, + "router_z_loss_mlp": 0.26831055, + "step": 10385, + "time_per_iteration": 2.7052927017211914 + }, + { + "auxiliary_loss_clip": 0.01341627, + "auxiliary_loss_mlp": 0.0006174, + "balance_loss_clip": 1.16149354, + "balance_loss_mlp": 0.05115395, + "epoch": 0.6244401022095295, + "flos": 66027587523840.0, + "grad_norm": 0.762078135743884, + "language_loss": 0.61437762, + "learning_rate": 1.305742943921692e-06, + "loss": 0.62841129, + "num_input_tokens_seen": 223771875, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.10595703, + "step": 10386, + "time_per_iteration": 3.165860652923584 + }, + { + "auxiliary_loss_clip": 0.01350701, + "auxiliary_loss_mlp": 0.00263467, + "balance_loss_clip": 1.1084739, + "balance_loss_mlp": 0.23595381, + "epoch": 0.6245002254621975, + "flos": 24571266128640.0, + "grad_norm": 17.45672213259028, + "language_loss": 0.79576224, + "learning_rate": 1.3053777130814128e-06, + "loss": 0.81190395, + "num_input_tokens_seen": 223788895, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.27539062, + "step": 10387, + "time_per_iteration": 2.7145533561706543 + }, + { + "auxiliary_loss_clip": 0.01348709, + "auxiliary_loss_mlp": 0.00288051, + "balance_loss_clip": 1.10569286, + "balance_loss_mlp": 0.26037085, + "epoch": 0.6245603487148654, + "flos": 29168837291520.0, + "grad_norm": 6.400978725763331, + "language_loss": 0.71545857, + "learning_rate": 1.3050125085812798e-06, + "loss": 0.73182619, + "num_input_tokens_seen": 223810385, + "router_z_loss_clip": 2.43359375, + "router_z_loss_mlp": 0.27685547, + "step": 10388, + "time_per_iteration": 2.83872389793396 + }, + { + "auxiliary_loss_clip": 0.01338606, + "auxiliary_loss_mlp": 0.00277971, + "balance_loss_clip": 1.1001035, + "balance_loss_mlp": 0.25141111, + "epoch": 0.6246204719675335, + "flos": 14790097411200.0, + "grad_norm": 10.044758002148804, + "language_loss": 0.85740817, + "learning_rate": 1.3046473304351417e-06, + "loss": 0.8735739, + "num_input_tokens_seen": 223826040, + "router_z_loss_clip": 2.3828125, + "router_z_loss_mlp": 0.26574707, + "step": 10389, + "time_per_iteration": 2.682607889175415 + }, + { + "auxiliary_loss_clip": 0.01320263, + "auxiliary_loss_mlp": 0.0027873, + "balance_loss_clip": 1.08948433, + "balance_loss_mlp": 0.25287339, + "epoch": 0.6246805952202014, + "flos": 12493538472960.0, + "grad_norm": 4.490717287663689, + "language_loss": 0.70891106, + "learning_rate": 1.3042821786568475e-06, + "loss": 0.72490096, + "num_input_tokens_seen": 223842300, + "router_z_loss_clip": 2.31054688, + "router_z_loss_mlp": 0.25878906, + "step": 10390, + "time_per_iteration": 2.7706573009490967 + }, + { + "auxiliary_loss_clip": 0.01329603, + "auxiliary_loss_mlp": 0.00255591, + "balance_loss_clip": 1.09194398, + "balance_loss_mlp": 0.22748137, + "epoch": 0.6247407184728694, + "flos": 12786677366400.0, + "grad_norm": 885.0060605204962, + "language_loss": 0.86846793, + "learning_rate": 1.3039170532602416e-06, + "loss": 0.88431978, + "num_input_tokens_seen": 223858320, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.28125, + "step": 10391, + "time_per_iteration": 2.7689011096954346 + }, + { + "auxiliary_loss_clip": 0.0135915, + "auxiliary_loss_mlp": 0.00268444, + "balance_loss_clip": 1.11455822, + "balance_loss_mlp": 0.24076359, + "epoch": 0.6248008417255374, + "flos": 40629188960640.0, + "grad_norm": 4.687086035717943, + "language_loss": 0.71962172, + "learning_rate": 1.3035519542591718e-06, + "loss": 0.73589766, + "num_input_tokens_seen": 223883545, + "router_z_loss_clip": 2.44726562, + "router_z_loss_mlp": 0.27636719, + "step": 10392, + "time_per_iteration": 2.8763935565948486 + }, + { + "auxiliary_loss_clip": 0.01351295, + "auxiliary_loss_mlp": 0.0024818, + "balance_loss_clip": 1.10641575, + "balance_loss_mlp": 0.21968938, + "epoch": 0.6248609649782053, + "flos": 19902017376000.0, + "grad_norm": 31.60986690666227, + "language_loss": 0.82824206, + "learning_rate": 1.3031868816674819e-06, + "loss": 0.84423685, + "num_input_tokens_seen": 223901445, + "router_z_loss_clip": 2.45117188, + "router_z_loss_mlp": 0.28515625, + "step": 10393, + "time_per_iteration": 2.652094602584839 + }, + { + "auxiliary_loss_clip": 0.01337241, + "auxiliary_loss_mlp": 0.00257256, + "balance_loss_clip": 1.0950222, + "balance_loss_mlp": 0.23077917, + "epoch": 0.6249210882308733, + "flos": 19682746801920.0, + "grad_norm": 31.734883472654403, + "language_loss": 0.90238667, + "learning_rate": 1.3028218354990142e-06, + "loss": 0.91833162, + "num_input_tokens_seen": 223920170, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.26501465, + "step": 10394, + "time_per_iteration": 2.6590681076049805 + }, + { + "auxiliary_loss_clip": 0.01349266, + "auxiliary_loss_mlp": 0.00269364, + "balance_loss_clip": 1.10449719, + "balance_loss_mlp": 0.24235077, + "epoch": 0.6249812114835412, + "flos": 13990726189440.0, + "grad_norm": 65.90563672470965, + "language_loss": 0.84000355, + "learning_rate": 1.3024568157676128e-06, + "loss": 0.85618979, + "num_input_tokens_seen": 223936495, + "router_z_loss_clip": 2.4453125, + "router_z_loss_mlp": 0.2701416, + "step": 10395, + "time_per_iteration": 2.5890016555786133 + }, + { + "auxiliary_loss_clip": 0.01344918, + "auxiliary_loss_mlp": 0.00261116, + "balance_loss_clip": 1.10419345, + "balance_loss_mlp": 0.23295861, + "epoch": 0.6250413347362093, + "flos": 14530031965440.0, + "grad_norm": 4.074906178134511, + "language_loss": 0.82101476, + "learning_rate": 1.302091822487119e-06, + "loss": 0.83707505, + "num_input_tokens_seen": 223950070, + "router_z_loss_clip": 2.40820312, + "router_z_loss_mlp": 0.28137207, + "step": 10396, + "time_per_iteration": 2.6247849464416504 + }, + { + "auxiliary_loss_clip": 0.013487, + "auxiliary_loss_mlp": 0.00256699, + "balance_loss_clip": 1.10811937, + "balance_loss_mlp": 0.22978181, + "epoch": 0.6251014579888772, + "flos": 22963006581120.0, + "grad_norm": 10.991154107588617, + "language_loss": 0.82503033, + "learning_rate": 1.3017268556713732e-06, + "loss": 0.84108436, + "num_input_tokens_seen": 223970065, + "router_z_loss_clip": 2.40429688, + "router_z_loss_mlp": 0.26916504, + "step": 10397, + "time_per_iteration": 2.670114040374756 + }, + { + "auxiliary_loss_clip": 0.01332651, + "auxiliary_loss_mlp": 0.00267436, + "balance_loss_clip": 1.09537876, + "balance_loss_mlp": 0.24041152, + "epoch": 0.6251615812415452, + "flos": 28111232217600.0, + "grad_norm": 15.574226082831267, + "language_loss": 0.8395201, + "learning_rate": 1.3013619153342154e-06, + "loss": 0.85552096, + "num_input_tokens_seen": 223990315, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.26989746, + "step": 10398, + "time_per_iteration": 2.713029384613037 + }, + { + "auxiliary_loss_clip": 0.01341577, + "auxiliary_loss_mlp": 0.00284747, + "balance_loss_clip": 1.09926987, + "balance_loss_mlp": 0.25663757, + "epoch": 0.6252217044942131, + "flos": 26724469887360.0, + "grad_norm": 5.842797253686681, + "language_loss": 0.82888776, + "learning_rate": 1.300997001489483e-06, + "loss": 0.84515095, + "num_input_tokens_seen": 224009960, + "router_z_loss_clip": 2.42382812, + "router_z_loss_mlp": 0.28125, + "step": 10399, + "time_per_iteration": 2.6707003116607666 + }, + { + "auxiliary_loss_clip": 0.01355855, + "auxiliary_loss_mlp": 0.00246344, + "balance_loss_clip": 1.11021662, + "balance_loss_mlp": 0.21862751, + "epoch": 0.6252818277468811, + "flos": 20006768413440.0, + "grad_norm": 27.19580313097312, + "language_loss": 0.82074273, + "learning_rate": 1.3006321141510147e-06, + "loss": 0.83676469, + "num_input_tokens_seen": 224028870, + "router_z_loss_clip": 2.45703125, + "router_z_loss_mlp": 0.27722168, + "step": 10400, + "time_per_iteration": 2.6289303302764893 + }, + { + "auxiliary_loss_clip": 0.01311519, + "auxiliary_loss_mlp": 0.00087559, + "balance_loss_clip": 1.13840103, + "balance_loss_mlp": 0.07973859, + "epoch": 0.625341950999549, + "flos": 59278285059840.0, + "grad_norm": 0.8102866019197914, + "language_loss": 0.55975646, + "learning_rate": 1.3002672533326465e-06, + "loss": 0.57374722, + "num_input_tokens_seen": 224094140, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.078125, + "step": 10401, + "time_per_iteration": 3.22579026222229 + }, + { + "auxiliary_loss_clip": 0.01326139, + "auxiliary_loss_mlp": 0.00270131, + "balance_loss_clip": 1.09041953, + "balance_loss_mlp": 0.24328507, + "epoch": 0.625402074252217, + "flos": 20157090831360.0, + "grad_norm": 28.958252519829728, + "language_loss": 0.90508121, + "learning_rate": 1.2999024190482146e-06, + "loss": 0.92104393, + "num_input_tokens_seen": 224113235, + "router_z_loss_clip": 2.35546875, + "router_z_loss_mlp": 0.26818848, + "step": 10402, + "time_per_iteration": 4.123790502548218 + }, + { + "auxiliary_loss_clip": 0.01326017, + "auxiliary_loss_mlp": 0.00259324, + "balance_loss_clip": 1.09209812, + "balance_loss_mlp": 0.23382533, + "epoch": 0.625462197504885, + "flos": 29132531619840.0, + "grad_norm": 14.127950612954809, + "language_loss": 0.77789795, + "learning_rate": 1.2995376113115527e-06, + "loss": 0.79375142, + "num_input_tokens_seen": 224134530, + "router_z_loss_clip": 2.33789062, + "router_z_loss_mlp": 0.25500488, + "step": 10403, + "time_per_iteration": 4.132863283157349 + }, + { + "auxiliary_loss_clip": 0.01334924, + "auxiliary_loss_mlp": 0.00252713, + "balance_loss_clip": 1.09993267, + "balance_loss_mlp": 0.22603391, + "epoch": 0.625522320757553, + "flos": 26104436294400.0, + "grad_norm": 86.43826748192993, + "language_loss": 0.79481924, + "learning_rate": 1.2991728301364954e-06, + "loss": 0.81069559, + "num_input_tokens_seen": 224154170, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.26660156, + "step": 10404, + "time_per_iteration": 2.704120397567749 + }, + { + "auxiliary_loss_clip": 0.01329499, + "auxiliary_loss_mlp": 0.0027953, + "balance_loss_clip": 1.09390962, + "balance_loss_mlp": 0.25209987, + "epoch": 0.625582444010221, + "flos": 20630967984000.0, + "grad_norm": 2114.5465056534167, + "language_loss": 0.76070237, + "learning_rate": 1.2988080755368742e-06, + "loss": 0.77679271, + "num_input_tokens_seen": 224172730, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.27416992, + "step": 10405, + "time_per_iteration": 2.653061628341675 + }, + { + "auxiliary_loss_clip": 0.01322626, + "auxiliary_loss_mlp": 0.00250096, + "balance_loss_clip": 1.09114003, + "balance_loss_mlp": 0.22366706, + "epoch": 0.6256425672628889, + "flos": 20521512264960.0, + "grad_norm": 28.391013653367466, + "language_loss": 0.85545009, + "learning_rate": 1.2984433475265207e-06, + "loss": 0.87117732, + "num_input_tokens_seen": 224192620, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.26403809, + "step": 10406, + "time_per_iteration": 2.674349546432495 + }, + { + "auxiliary_loss_clip": 0.01320766, + "auxiliary_loss_mlp": 0.00258269, + "balance_loss_clip": 1.09020996, + "balance_loss_mlp": 0.23179284, + "epoch": 0.6257026905155569, + "flos": 29529200488320.0, + "grad_norm": 6.24189336385816, + "language_loss": 0.76582867, + "learning_rate": 1.2980786461192666e-06, + "loss": 0.78161901, + "num_input_tokens_seen": 224214660, + "router_z_loss_clip": 2.30273438, + "router_z_loss_mlp": 0.26464844, + "step": 10407, + "time_per_iteration": 4.117915630340576 + }, + { + "auxiliary_loss_clip": 0.0132234, + "auxiliary_loss_mlp": 0.00281979, + "balance_loss_clip": 1.09012055, + "balance_loss_mlp": 0.25768426, + "epoch": 0.6257628137682248, + "flos": 24024885373440.0, + "grad_norm": 2.2064013586626854, + "language_loss": 0.90433538, + "learning_rate": 1.2977139713289398e-06, + "loss": 0.92037857, + "num_input_tokens_seen": 224234170, + "router_z_loss_clip": 2.31835938, + "router_z_loss_mlp": 0.24291992, + "step": 10408, + "time_per_iteration": 2.68227481842041 + }, + { + "auxiliary_loss_clip": 0.01321803, + "auxiliary_loss_mlp": 0.00275116, + "balance_loss_clip": 1.08886981, + "balance_loss_mlp": 0.25055876, + "epoch": 0.6258229370208929, + "flos": 20850956830080.0, + "grad_norm": 12.63246138116142, + "language_loss": 0.86920726, + "learning_rate": 1.2973493231693699e-06, + "loss": 0.88517642, + "num_input_tokens_seen": 224253115, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.24560547, + "step": 10409, + "time_per_iteration": 2.6660523414611816 + }, + { + "auxiliary_loss_clip": 0.01316532, + "auxiliary_loss_mlp": 0.00281407, + "balance_loss_clip": 1.08667564, + "balance_loss_mlp": 0.2538341, + "epoch": 0.6258830602735608, + "flos": 22231542021120.0, + "grad_norm": 28.428126305879452, + "language_loss": 0.7715919, + "learning_rate": 1.2969847016543845e-06, + "loss": 0.78757131, + "num_input_tokens_seen": 224271375, + "router_z_loss_clip": 2.30078125, + "router_z_loss_mlp": 0.27587891, + "step": 10410, + "time_per_iteration": 2.6441843509674072 + }, + { + "auxiliary_loss_clip": 0.01302935, + "auxiliary_loss_mlp": 0.00264751, + "balance_loss_clip": 1.08108997, + "balance_loss_mlp": 0.23912065, + "epoch": 0.6259431835262288, + "flos": 25076887925760.0, + "grad_norm": 3.850156390872138, + "language_loss": 0.74762821, + "learning_rate": 1.2966201067978086e-06, + "loss": 0.76330507, + "num_input_tokens_seen": 224290315, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.25646973, + "step": 10411, + "time_per_iteration": 4.0202476978302 + }, + { + "auxiliary_loss_clip": 0.0133406, + "auxiliary_loss_mlp": 0.00256214, + "balance_loss_clip": 1.09374714, + "balance_loss_mlp": 0.22889078, + "epoch": 0.6260033067788967, + "flos": 28252288926720.0, + "grad_norm": 40.89638216731196, + "language_loss": 0.79599249, + "learning_rate": 1.2962555386134702e-06, + "loss": 0.81189525, + "num_input_tokens_seen": 224310545, + "router_z_loss_clip": 2.40039062, + "router_z_loss_mlp": 0.27331543, + "step": 10412, + "time_per_iteration": 2.7574076652526855 + }, + { + "auxiliary_loss_clip": 0.01324835, + "auxiliary_loss_mlp": 0.00269927, + "balance_loss_clip": 1.09307802, + "balance_loss_mlp": 0.24172217, + "epoch": 0.6260634300315647, + "flos": 23367432787200.0, + "grad_norm": 82.31161830053041, + "language_loss": 0.77125812, + "learning_rate": 1.2958909971151908e-06, + "loss": 0.78720576, + "num_input_tokens_seen": 224331115, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.28222656, + "step": 10413, + "time_per_iteration": 2.6715426445007324 + }, + { + "auxiliary_loss_clip": 0.01346565, + "auxiliary_loss_mlp": 0.00272228, + "balance_loss_clip": 1.1019907, + "balance_loss_mlp": 0.24430926, + "epoch": 0.6261235532842326, + "flos": 18035308494720.0, + "grad_norm": 40.66113303459306, + "language_loss": 0.90316451, + "learning_rate": 1.295526482316796e-06, + "loss": 0.91935241, + "num_input_tokens_seen": 224347525, + "router_z_loss_clip": 2.4453125, + "router_z_loss_mlp": 0.27905273, + "step": 10414, + "time_per_iteration": 2.6534852981567383 + }, + { + "auxiliary_loss_clip": 0.01316471, + "auxiliary_loss_mlp": 0.0026896, + "balance_loss_clip": 1.08513832, + "balance_loss_mlp": 0.24213783, + "epoch": 0.6261836765369007, + "flos": 22011265866240.0, + "grad_norm": 7.591153931373517, + "language_loss": 0.81124806, + "learning_rate": 1.2951619942321083e-06, + "loss": 0.82710236, + "num_input_tokens_seen": 224367045, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.26818848, + "step": 10415, + "time_per_iteration": 2.634305715560913 + }, + { + "auxiliary_loss_clip": 0.01319453, + "auxiliary_loss_mlp": 0.00265152, + "balance_loss_clip": 1.0896765, + "balance_loss_mlp": 0.23981974, + "epoch": 0.6262437997895686, + "flos": 24936010784640.0, + "grad_norm": 63.39845988689822, + "language_loss": 0.8183682, + "learning_rate": 1.2947975328749472e-06, + "loss": 0.83421421, + "num_input_tokens_seen": 224388860, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.25305176, + "step": 10416, + "time_per_iteration": 2.725013017654419 + }, + { + "auxiliary_loss_clip": 0.01304585, + "auxiliary_loss_mlp": 0.0024013, + "balance_loss_clip": 1.07669377, + "balance_loss_mlp": 0.21472698, + "epoch": 0.6263039230422366, + "flos": 31608428186880.0, + "grad_norm": 7.518597740985801, + "language_loss": 0.90905929, + "learning_rate": 1.2944330982591352e-06, + "loss": 0.92450643, + "num_input_tokens_seen": 224409645, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.25402832, + "step": 10417, + "time_per_iteration": 2.732072353363037 + }, + { + "auxiliary_loss_clip": 0.01344354, + "auxiliary_loss_mlp": 0.00244837, + "balance_loss_clip": 1.10315061, + "balance_loss_mlp": 0.21819374, + "epoch": 0.6263640462949046, + "flos": 17639465639040.0, + "grad_norm": 80.69032500854921, + "language_loss": 0.73287094, + "learning_rate": 1.2940686903984904e-06, + "loss": 0.74876285, + "num_input_tokens_seen": 224428530, + "router_z_loss_clip": 2.41210938, + "router_z_loss_mlp": 0.26672363, + "step": 10418, + "time_per_iteration": 2.6286497116088867 + }, + { + "auxiliary_loss_clip": 0.01331137, + "auxiliary_loss_mlp": 0.00278216, + "balance_loss_clip": 1.09382498, + "balance_loss_mlp": 0.25057095, + "epoch": 0.6264241695475725, + "flos": 19974951941760.0, + "grad_norm": 32.90128397452683, + "language_loss": 0.92345577, + "learning_rate": 1.2937043093068316e-06, + "loss": 0.93954927, + "num_input_tokens_seen": 224447175, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.2767334, + "step": 10419, + "time_per_iteration": 2.646334648132324 + }, + { + "auxiliary_loss_clip": 0.0134835, + "auxiliary_loss_mlp": 0.00252965, + "balance_loss_clip": 1.10705709, + "balance_loss_mlp": 0.22591668, + "epoch": 0.6264842928002405, + "flos": 27344323912320.0, + "grad_norm": 5.2950551501701035, + "language_loss": 0.7134397, + "learning_rate": 1.2933399549979762e-06, + "loss": 0.72945279, + "num_input_tokens_seen": 224469445, + "router_z_loss_clip": 2.41601562, + "router_z_loss_mlp": 0.27026367, + "step": 10420, + "time_per_iteration": 2.7286884784698486 + }, + { + "auxiliary_loss_clip": 0.01326697, + "auxiliary_loss_mlp": 0.00262887, + "balance_loss_clip": 1.09235549, + "balance_loss_mlp": 0.23657787, + "epoch": 0.6265444160529084, + "flos": 22997265177600.0, + "grad_norm": 110.27835241838423, + "language_loss": 0.93244493, + "learning_rate": 1.292975627485741e-06, + "loss": 0.94834077, + "num_input_tokens_seen": 224486590, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.26318359, + "step": 10421, + "time_per_iteration": 2.665055990219116 + }, + { + "auxiliary_loss_clip": 0.01313569, + "auxiliary_loss_mlp": 0.00247235, + "balance_loss_clip": 1.08054805, + "balance_loss_mlp": 0.22030574, + "epoch": 0.6266045393055765, + "flos": 19938323047680.0, + "grad_norm": 155.73058555067522, + "language_loss": 0.88227272, + "learning_rate": 1.2926113267839403e-06, + "loss": 0.89788079, + "num_input_tokens_seen": 224502795, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.26928711, + "step": 10422, + "time_per_iteration": 2.652860403060913 + }, + { + "auxiliary_loss_clip": 0.0131458, + "auxiliary_loss_mlp": 0.00268075, + "balance_loss_clip": 1.08182096, + "balance_loss_mlp": 0.24087167, + "epoch": 0.6266646625582444, + "flos": 24389091325440.0, + "grad_norm": 15.699479029572815, + "language_loss": 0.82155341, + "learning_rate": 1.292247052906389e-06, + "loss": 0.83737993, + "num_input_tokens_seen": 224522300, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.27233887, + "step": 10423, + "time_per_iteration": 2.6847589015960693 + }, + { + "auxiliary_loss_clip": 0.01288345, + "auxiliary_loss_mlp": 0.002365, + "balance_loss_clip": 1.06130791, + "balance_loss_mlp": 0.21278977, + "epoch": 0.6267247858109124, + "flos": 14683802088960.0, + "grad_norm": 52.665510222894035, + "language_loss": 0.86662817, + "learning_rate": 1.2918828058669004e-06, + "loss": 0.88187659, + "num_input_tokens_seen": 224538260, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.23706055, + "step": 10424, + "time_per_iteration": 2.619312047958374 + }, + { + "auxiliary_loss_clip": 0.01318309, + "auxiliary_loss_mlp": 0.00262144, + "balance_loss_clip": 1.08714747, + "balance_loss_mlp": 0.23581108, + "epoch": 0.6267849090635803, + "flos": 24929977299840.0, + "grad_norm": 107.59525905105433, + "language_loss": 0.77174336, + "learning_rate": 1.2915185856792868e-06, + "loss": 0.78754789, + "num_input_tokens_seen": 224559155, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.26330566, + "step": 10425, + "time_per_iteration": 2.691349983215332 + }, + { + "auxiliary_loss_clip": 0.01293004, + "auxiliary_loss_mlp": 0.00250553, + "balance_loss_clip": 1.07052159, + "balance_loss_mlp": 0.22538809, + "epoch": 0.6268450323162483, + "flos": 25337851211520.0, + "grad_norm": 75.29472115060186, + "language_loss": 0.82863641, + "learning_rate": 1.2911543923573598e-06, + "loss": 0.84407198, + "num_input_tokens_seen": 224578660, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.25183105, + "step": 10426, + "time_per_iteration": 2.689901828765869 + }, + { + "auxiliary_loss_clip": 0.01313556, + "auxiliary_loss_mlp": 0.00240309, + "balance_loss_clip": 1.08151197, + "balance_loss_mlp": 0.21590656, + "epoch": 0.6269051555689162, + "flos": 26177299032960.0, + "grad_norm": 119.2563067294524, + "language_loss": 0.85685945, + "learning_rate": 1.290790225914929e-06, + "loss": 0.87239808, + "num_input_tokens_seen": 224599080, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.24401855, + "step": 10427, + "time_per_iteration": 2.81270432472229 + }, + { + "auxiliary_loss_clip": 0.01319034, + "auxiliary_loss_mlp": 0.00235834, + "balance_loss_clip": 1.08539176, + "balance_loss_mlp": 0.20987034, + "epoch": 0.6269652788215843, + "flos": 18256877539200.0, + "grad_norm": 231.24490482829583, + "language_loss": 0.75583982, + "learning_rate": 1.2904260863658034e-06, + "loss": 0.77138853, + "num_input_tokens_seen": 224614225, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.25964355, + "step": 10428, + "time_per_iteration": 2.7009880542755127 + }, + { + "auxiliary_loss_clip": 0.01312031, + "auxiliary_loss_mlp": 0.00239316, + "balance_loss_clip": 1.08198619, + "balance_loss_mlp": 0.213805, + "epoch": 0.6270254020742522, + "flos": 11765413877760.0, + "grad_norm": 3.7551174064192936, + "language_loss": 0.80538917, + "learning_rate": 1.2900619737237928e-06, + "loss": 0.82090265, + "num_input_tokens_seen": 224632365, + "router_z_loss_clip": 2.30273438, + "router_z_loss_mlp": 0.25512695, + "step": 10429, + "time_per_iteration": 2.6409249305725098 + }, + { + "auxiliary_loss_clip": 0.01324609, + "auxiliary_loss_mlp": 0.00230519, + "balance_loss_clip": 1.08811343, + "balance_loss_mlp": 0.20255294, + "epoch": 0.6270855253269202, + "flos": 23475631530240.0, + "grad_norm": 29.500078038513994, + "language_loss": 0.86391199, + "learning_rate": 1.2896978880027023e-06, + "loss": 0.87946326, + "num_input_tokens_seen": 224651125, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.27941895, + "step": 10430, + "time_per_iteration": 2.6862316131591797 + }, + { + "auxiliary_loss_clip": 0.01251359, + "auxiliary_loss_mlp": 0.00083428, + "balance_loss_clip": 1.07324374, + "balance_loss_mlp": 0.0744155, + "epoch": 0.6271456485795882, + "flos": 70064520232320.0, + "grad_norm": 0.7522777165611687, + "language_loss": 0.58938313, + "learning_rate": 1.2893338292163393e-06, + "loss": 0.60273099, + "num_input_tokens_seen": 224716115, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.09033203, + "step": 10431, + "time_per_iteration": 3.273266553878784 + }, + { + "auxiliary_loss_clip": 0.01251259, + "auxiliary_loss_mlp": 0.00075898, + "balance_loss_clip": 1.07331789, + "balance_loss_mlp": 0.06750559, + "epoch": 0.6272057718322561, + "flos": 65156718280320.0, + "grad_norm": 0.8576167730837732, + "language_loss": 0.63294315, + "learning_rate": 1.2889697973785095e-06, + "loss": 0.64621472, + "num_input_tokens_seen": 224782930, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.08398438, + "step": 10432, + "time_per_iteration": 3.179032802581787 + }, + { + "auxiliary_loss_clip": 0.01293705, + "auxiliary_loss_mlp": 0.00233941, + "balance_loss_clip": 1.06968701, + "balance_loss_mlp": 0.21026629, + "epoch": 0.6272658950849241, + "flos": 24389342720640.0, + "grad_norm": 147.07556694217993, + "language_loss": 0.73676288, + "learning_rate": 1.2886057925030153e-06, + "loss": 0.75203931, + "num_input_tokens_seen": 224802010, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.2364502, + "step": 10433, + "time_per_iteration": 2.7192723751068115 + }, + { + "auxiliary_loss_clip": 0.01334424, + "auxiliary_loss_mlp": 0.00218409, + "balance_loss_clip": 1.09119797, + "balance_loss_mlp": 0.18956015, + "epoch": 0.627326018337592, + "flos": 17966001202560.0, + "grad_norm": 31.259050625114433, + "language_loss": 0.7667942, + "learning_rate": 1.2882418146036612e-06, + "loss": 0.78232253, + "num_input_tokens_seen": 224818875, + "router_z_loss_clip": 2.4296875, + "router_z_loss_mlp": 0.28845215, + "step": 10434, + "time_per_iteration": 2.6397247314453125 + }, + { + "auxiliary_loss_clip": 0.01310251, + "auxiliary_loss_mlp": 0.00229963, + "balance_loss_clip": 1.07723844, + "balance_loss_mlp": 0.20124534, + "epoch": 0.6273861415902601, + "flos": 20230097224320.0, + "grad_norm": 13.162355238446764, + "language_loss": 0.92797196, + "learning_rate": 1.2878778636942484e-06, + "loss": 0.94337416, + "num_input_tokens_seen": 224837790, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.28723145, + "step": 10435, + "time_per_iteration": 2.7174577713012695 + }, + { + "auxiliary_loss_clip": 0.01257471, + "auxiliary_loss_mlp": 0.00071901, + "balance_loss_clip": 1.08293295, + "balance_loss_mlp": 0.06179186, + "epoch": 0.627446264842928, + "flos": 64953210798720.0, + "grad_norm": 0.71809535520133, + "language_loss": 0.6110658, + "learning_rate": 1.2875139397885786e-06, + "loss": 0.62435955, + "num_input_tokens_seen": 224899685, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.10107422, + "step": 10436, + "time_per_iteration": 3.1275362968444824 + }, + { + "auxiliary_loss_clip": 0.01306439, + "auxiliary_loss_mlp": 0.00228892, + "balance_loss_clip": 1.07917476, + "balance_loss_mlp": 0.20338088, + "epoch": 0.627506388095596, + "flos": 23584261236480.0, + "grad_norm": 13.847731655750236, + "language_loss": 0.85308814, + "learning_rate": 1.2871500429004523e-06, + "loss": 0.86844146, + "num_input_tokens_seen": 224918650, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.25524902, + "step": 10437, + "time_per_iteration": 2.658050298690796 + }, + { + "auxiliary_loss_clip": 0.01268495, + "auxiliary_loss_mlp": 0.0011132, + "balance_loss_clip": 1.09378052, + "balance_loss_mlp": 0.10135391, + "epoch": 0.6275665113482639, + "flos": 67583631674880.0, + "grad_norm": 0.7368724395395051, + "language_loss": 0.53695112, + "learning_rate": 1.2867861730436667e-06, + "loss": 0.55074924, + "num_input_tokens_seen": 224981575, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.09960938, + "step": 10438, + "time_per_iteration": 3.0657083988189697 + }, + { + "auxiliary_loss_clip": 0.01320932, + "auxiliary_loss_mlp": 0.00234235, + "balance_loss_clip": 1.08675218, + "balance_loss_mlp": 0.20722215, + "epoch": 0.6276266346009319, + "flos": 27636924101760.0, + "grad_norm": 32.86300113927001, + "language_loss": 0.91692388, + "learning_rate": 1.2864223302320214e-06, + "loss": 0.93247557, + "num_input_tokens_seen": 225000820, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.27038574, + "step": 10439, + "time_per_iteration": 2.7613890171051025 + }, + { + "auxiliary_loss_clip": 0.01315649, + "auxiliary_loss_mlp": 0.00237212, + "balance_loss_clip": 1.08465505, + "balance_loss_mlp": 0.20719466, + "epoch": 0.6276867578535998, + "flos": 22746142218240.0, + "grad_norm": 18.81804963355678, + "language_loss": 0.8892622, + "learning_rate": 1.2860585144793128e-06, + "loss": 0.90479082, + "num_input_tokens_seen": 225017585, + "router_z_loss_clip": 2.31054688, + "router_z_loss_mlp": 0.30029297, + "step": 10440, + "time_per_iteration": 2.635059356689453 + }, + { + "auxiliary_loss_clip": 0.01306757, + "auxiliary_loss_mlp": 0.00202852, + "balance_loss_clip": 1.08481753, + "balance_loss_mlp": 0.17692348, + "epoch": 0.6277468811062679, + "flos": 24644200694400.0, + "grad_norm": 47.240359905268356, + "language_loss": 0.80488908, + "learning_rate": 1.285694725799337e-06, + "loss": 0.81998515, + "num_input_tokens_seen": 225039085, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.25915527, + "step": 10441, + "time_per_iteration": 2.6990294456481934 + }, + { + "auxiliary_loss_clip": 0.01299143, + "auxiliary_loss_mlp": 0.00221999, + "balance_loss_clip": 1.07389534, + "balance_loss_mlp": 0.19512957, + "epoch": 0.6278070043589358, + "flos": 19678975873920.0, + "grad_norm": 4.585640952974947, + "language_loss": 0.81251407, + "learning_rate": 1.2853309642058884e-06, + "loss": 0.82772547, + "num_input_tokens_seen": 225058105, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.26867676, + "step": 10442, + "time_per_iteration": 2.6201555728912354 + }, + { + "auxiliary_loss_clip": 0.01306357, + "auxiliary_loss_mlp": 0.00229691, + "balance_loss_clip": 1.07650208, + "balance_loss_mlp": 0.20119961, + "epoch": 0.6278671276116038, + "flos": 22121834906880.0, + "grad_norm": 124.1113859010021, + "language_loss": 0.78122354, + "learning_rate": 1.284967229712762e-06, + "loss": 0.79658401, + "num_input_tokens_seen": 225077605, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.28491211, + "step": 10443, + "time_per_iteration": 2.6732349395751953 + }, + { + "auxiliary_loss_clip": 0.01302043, + "auxiliary_loss_mlp": 0.00244872, + "balance_loss_clip": 1.06963265, + "balance_loss_mlp": 0.21733472, + "epoch": 0.6279272508642717, + "flos": 23038562839680.0, + "grad_norm": 45.62759597102971, + "language_loss": 0.80014038, + "learning_rate": 1.2846035223337492e-06, + "loss": 0.81560957, + "num_input_tokens_seen": 225097775, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.2755127, + "step": 10444, + "time_per_iteration": 4.132807493209839 + }, + { + "auxiliary_loss_clip": 0.01291482, + "auxiliary_loss_mlp": 0.00203938, + "balance_loss_clip": 1.06503439, + "balance_loss_mlp": 0.17599498, + "epoch": 0.6279873741169397, + "flos": 19824090819840.0, + "grad_norm": 323.9000556479844, + "language_loss": 0.79359996, + "learning_rate": 1.2842398420826423e-06, + "loss": 0.80855417, + "num_input_tokens_seen": 225115585, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.27941895, + "step": 10445, + "time_per_iteration": 4.087191104888916 + }, + { + "auxiliary_loss_clip": 0.01300094, + "auxiliary_loss_mlp": 0.00221388, + "balance_loss_clip": 1.07146406, + "balance_loss_mlp": 0.19605637, + "epoch": 0.6280474973696077, + "flos": 23915393740800.0, + "grad_norm": 4.051088917182642, + "language_loss": 0.76879764, + "learning_rate": 1.2838761889732331e-06, + "loss": 0.7840125, + "num_input_tokens_seen": 225135575, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.25317383, + "step": 10446, + "time_per_iteration": 2.7174670696258545 + }, + { + "auxiliary_loss_clip": 0.01309734, + "auxiliary_loss_mlp": 0.00229767, + "balance_loss_clip": 1.07861543, + "balance_loss_mlp": 0.20326632, + "epoch": 0.6281076206222757, + "flos": 17967976450560.0, + "grad_norm": 11.916150447348647, + "language_loss": 0.83875918, + "learning_rate": 1.2835125630193102e-06, + "loss": 0.85415423, + "num_input_tokens_seen": 225154230, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.26489258, + "step": 10447, + "time_per_iteration": 2.773871898651123 + }, + { + "auxiliary_loss_clip": 0.01251262, + "auxiliary_loss_mlp": 0.00057019, + "balance_loss_clip": 1.08673668, + "balance_loss_mlp": 0.04600453, + "epoch": 0.6281677438749437, + "flos": 66778370622720.0, + "grad_norm": 0.6685129180595251, + "language_loss": 0.51600134, + "learning_rate": 1.2831489642346626e-06, + "loss": 0.52908421, + "num_input_tokens_seen": 225213650, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.11035156, + "step": 10448, + "time_per_iteration": 3.0455448627471924 + }, + { + "auxiliary_loss_clip": 0.013235, + "auxiliary_loss_mlp": 0.00223829, + "balance_loss_clip": 1.09320772, + "balance_loss_mlp": 0.19694722, + "epoch": 0.6282278671276116, + "flos": 11656173640320.0, + "grad_norm": 399.3275957078817, + "language_loss": 1.0084759, + "learning_rate": 1.282785392633079e-06, + "loss": 1.02394915, + "num_input_tokens_seen": 225230135, + "router_z_loss_clip": 2.30078125, + "router_z_loss_mlp": 0.26904297, + "step": 10449, + "time_per_iteration": 4.049811601638794 + }, + { + "auxiliary_loss_clip": 0.01302753, + "auxiliary_loss_mlp": 0.00232359, + "balance_loss_clip": 1.07747746, + "balance_loss_mlp": 0.20776649, + "epoch": 0.6282879903802796, + "flos": 42741597847680.0, + "grad_norm": 9.937495098080724, + "language_loss": 0.68571544, + "learning_rate": 1.2824218482283438e-06, + "loss": 0.70106661, + "num_input_tokens_seen": 225253520, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.24572754, + "step": 10450, + "time_per_iteration": 2.834761142730713 + }, + { + "auxiliary_loss_clip": 0.01301831, + "auxiliary_loss_mlp": 0.00218139, + "balance_loss_clip": 1.07948458, + "balance_loss_mlp": 0.19378486, + "epoch": 0.6283481136329475, + "flos": 20009210538240.0, + "grad_norm": 5.377746820468152, + "language_loss": 0.83257997, + "learning_rate": 1.2820583310342452e-06, + "loss": 0.84777963, + "num_input_tokens_seen": 225272460, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.24353027, + "step": 10451, + "time_per_iteration": 2.6560468673706055 + }, + { + "auxiliary_loss_clip": 0.01317832, + "auxiliary_loss_mlp": 0.0021809, + "balance_loss_clip": 1.08811355, + "balance_loss_mlp": 0.18889546, + "epoch": 0.6284082368856155, + "flos": 21904431840000.0, + "grad_norm": 12.914296180069416, + "language_loss": 0.85119534, + "learning_rate": 1.281694841064566e-06, + "loss": 0.86655462, + "num_input_tokens_seen": 225291700, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.29187012, + "step": 10452, + "time_per_iteration": 2.666250467300415 + }, + { + "auxiliary_loss_clip": 0.01291397, + "auxiliary_loss_mlp": 0.00222872, + "balance_loss_clip": 1.07043362, + "balance_loss_mlp": 0.19602621, + "epoch": 0.6284683601382834, + "flos": 25484187219840.0, + "grad_norm": 14.62094236623885, + "language_loss": 0.80647463, + "learning_rate": 1.2813313783330904e-06, + "loss": 0.82161731, + "num_input_tokens_seen": 225311470, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.26831055, + "step": 10453, + "time_per_iteration": 4.0778467655181885 + }, + { + "auxiliary_loss_clip": 0.01296106, + "auxiliary_loss_mlp": 0.00208728, + "balance_loss_clip": 1.07050312, + "balance_loss_mlp": 0.18195391, + "epoch": 0.6285284833909515, + "flos": 16538695395840.0, + "grad_norm": 3.1622425976700104, + "language_loss": 0.88310778, + "learning_rate": 1.2809679428536013e-06, + "loss": 0.89815617, + "num_input_tokens_seen": 225328385, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.26794434, + "step": 10454, + "time_per_iteration": 2.6679470539093018 + }, + { + "auxiliary_loss_clip": 0.01297718, + "auxiliary_loss_mlp": 0.00233539, + "balance_loss_clip": 1.07891154, + "balance_loss_mlp": 0.20827839, + "epoch": 0.6285886066436194, + "flos": 22820692896000.0, + "grad_norm": 42.943975526142346, + "language_loss": 0.89965487, + "learning_rate": 1.2806045346398792e-06, + "loss": 0.91496742, + "num_input_tokens_seen": 225348415, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.25268555, + "step": 10455, + "time_per_iteration": 2.674139976501465 + }, + { + "auxiliary_loss_clip": 0.01298343, + "auxiliary_loss_mlp": 0.00215138, + "balance_loss_clip": 1.07538998, + "balance_loss_mlp": 0.1909145, + "epoch": 0.6286487298962874, + "flos": 24715734629760.0, + "grad_norm": 9.125608513769647, + "language_loss": 0.89761829, + "learning_rate": 1.280241153705706e-06, + "loss": 0.91275305, + "num_input_tokens_seen": 225367740, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.24230957, + "step": 10456, + "time_per_iteration": 2.66049861907959 + }, + { + "auxiliary_loss_clip": 0.01302072, + "auxiliary_loss_mlp": 0.00230222, + "balance_loss_clip": 1.07806313, + "balance_loss_mlp": 0.20294675, + "epoch": 0.6287088531489553, + "flos": 20740818752640.0, + "grad_norm": 15.115764829805041, + "language_loss": 0.81673115, + "learning_rate": 1.27987780006486e-06, + "loss": 0.83205414, + "num_input_tokens_seen": 225388405, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.27282715, + "step": 10457, + "time_per_iteration": 2.665334463119507 + }, + { + "auxiliary_loss_clip": 0.01312895, + "auxiliary_loss_mlp": 0.00227216, + "balance_loss_clip": 1.08229327, + "balance_loss_mlp": 0.20053747, + "epoch": 0.6287689764016233, + "flos": 23070630706560.0, + "grad_norm": 17.897092378241208, + "language_loss": 0.88829565, + "learning_rate": 1.2795144737311202e-06, + "loss": 0.90369678, + "num_input_tokens_seen": 225408360, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.26660156, + "step": 10458, + "time_per_iteration": 2.6691384315490723 + }, + { + "auxiliary_loss_clip": 0.01311068, + "auxiliary_loss_mlp": 0.00226431, + "balance_loss_clip": 1.08221674, + "balance_loss_mlp": 0.19927472, + "epoch": 0.6288290996542913, + "flos": 32233669251840.0, + "grad_norm": 5.753699275963532, + "language_loss": 0.6914717, + "learning_rate": 1.2791511747182635e-06, + "loss": 0.70684671, + "num_input_tokens_seen": 225431310, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.27111816, + "step": 10459, + "time_per_iteration": 2.7937045097351074 + }, + { + "auxiliary_loss_clip": 0.01305186, + "auxiliary_loss_mlp": 0.00202337, + "balance_loss_clip": 1.07785642, + "balance_loss_mlp": 0.17618228, + "epoch": 0.6288892229069593, + "flos": 24641327606400.0, + "grad_norm": 102.04876601120655, + "language_loss": 0.85218853, + "learning_rate": 1.2787879030400666e-06, + "loss": 0.86726373, + "num_input_tokens_seen": 225450385, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.26159668, + "step": 10460, + "time_per_iteration": 2.6624557971954346 + }, + { + "auxiliary_loss_clip": 0.01313559, + "auxiliary_loss_mlp": 0.00224403, + "balance_loss_clip": 1.08890986, + "balance_loss_mlp": 0.19705683, + "epoch": 0.6289493461596273, + "flos": 17858341163520.0, + "grad_norm": 49.89238880377945, + "language_loss": 0.8155368, + "learning_rate": 1.2784246587103047e-06, + "loss": 0.8309164, + "num_input_tokens_seen": 225467325, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.27355957, + "step": 10461, + "time_per_iteration": 2.6436572074890137 + }, + { + "auxiliary_loss_clip": 0.01309974, + "auxiliary_loss_mlp": 0.0022474, + "balance_loss_clip": 1.08981895, + "balance_loss_mlp": 0.20089795, + "epoch": 0.6290094694122952, + "flos": 22345379199360.0, + "grad_norm": 5.600496338467464, + "language_loss": 0.77838862, + "learning_rate": 1.2780614417427523e-06, + "loss": 0.79373586, + "num_input_tokens_seen": 225487370, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.23852539, + "step": 10462, + "time_per_iteration": 2.645785093307495 + }, + { + "auxiliary_loss_clip": 0.01290452, + "auxiliary_loss_mlp": 0.00214325, + "balance_loss_clip": 1.0699693, + "balance_loss_mlp": 0.18900478, + "epoch": 0.6290695926649632, + "flos": 28402431776640.0, + "grad_norm": 2.44829488318691, + "language_loss": 0.80815423, + "learning_rate": 1.2776982521511821e-06, + "loss": 0.82320201, + "num_input_tokens_seen": 225506915, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.25317383, + "step": 10463, + "time_per_iteration": 2.703110694885254 + }, + { + "auxiliary_loss_clip": 0.01316995, + "auxiliary_loss_mlp": 0.00227154, + "balance_loss_clip": 1.09152842, + "balance_loss_mlp": 0.20114288, + "epoch": 0.6291297159176311, + "flos": 21505464501120.0, + "grad_norm": 49.188995264057866, + "language_loss": 0.79169226, + "learning_rate": 1.2773350899493665e-06, + "loss": 0.80713379, + "num_input_tokens_seen": 225525670, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.26025391, + "step": 10464, + "time_per_iteration": 2.612696886062622 + }, + { + "auxiliary_loss_clip": 0.01295749, + "auxiliary_loss_mlp": 0.00209993, + "balance_loss_clip": 1.07721591, + "balance_loss_mlp": 0.18467319, + "epoch": 0.6291898391702991, + "flos": 12203308581120.0, + "grad_norm": 11.837603454522661, + "language_loss": 0.76647252, + "learning_rate": 1.2769719551510768e-06, + "loss": 0.7815299, + "num_input_tokens_seen": 225542235, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.2532959, + "step": 10465, + "time_per_iteration": 2.5982189178466797 + }, + { + "auxiliary_loss_clip": 0.0122853, + "auxiliary_loss_mlp": 0.00049187, + "balance_loss_clip": 1.07035804, + "balance_loss_mlp": 0.0406515, + "epoch": 0.629249962422967, + "flos": 69299479434240.0, + "grad_norm": 0.6613717268604755, + "language_loss": 0.59109247, + "learning_rate": 1.2766088477700832e-06, + "loss": 0.60386962, + "num_input_tokens_seen": 225607185, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.08544922, + "step": 10466, + "time_per_iteration": 3.2266221046447754 + }, + { + "auxiliary_loss_clip": 0.01299689, + "auxiliary_loss_mlp": 0.00230864, + "balance_loss_clip": 1.07644236, + "balance_loss_mlp": 0.20453019, + "epoch": 0.6293100856756351, + "flos": 40077888042240.0, + "grad_norm": 27.490037973019035, + "language_loss": 0.7271623, + "learning_rate": 1.276245767820154e-06, + "loss": 0.74246788, + "num_input_tokens_seen": 225628785, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.2635498, + "step": 10467, + "time_per_iteration": 2.795806646347046 + }, + { + "auxiliary_loss_clip": 0.01227062, + "auxiliary_loss_mlp": 0.00077115, + "balance_loss_clip": 1.06868148, + "balance_loss_mlp": 0.0687225, + "epoch": 0.629370208928303, + "flos": 67501108177920.0, + "grad_norm": 0.7829882300932713, + "language_loss": 0.55987191, + "learning_rate": 1.2758827153150586e-06, + "loss": 0.57291365, + "num_input_tokens_seen": 225678980, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.08398438, + "step": 10468, + "time_per_iteration": 2.910860300064087 + }, + { + "auxiliary_loss_clip": 0.0123457, + "auxiliary_loss_mlp": 0.00048448, + "balance_loss_clip": 1.07459927, + "balance_loss_mlp": 0.04019896, + "epoch": 0.629430332180971, + "flos": 60660450449280.0, + "grad_norm": 0.7474143396581256, + "language_loss": 0.57185102, + "learning_rate": 1.2755196902685626e-06, + "loss": 0.58468115, + "num_input_tokens_seen": 225740295, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.08251953, + "step": 10469, + "time_per_iteration": 3.0529420375823975 + }, + { + "auxiliary_loss_clip": 0.01232786, + "auxiliary_loss_mlp": 0.00044508, + "balance_loss_clip": 1.07292223, + "balance_loss_mlp": 0.03582941, + "epoch": 0.6294904554336389, + "flos": 66869764778880.0, + "grad_norm": 0.6615524631400618, + "language_loss": 0.51093459, + "learning_rate": 1.2751566926944329e-06, + "loss": 0.52370751, + "num_input_tokens_seen": 225805615, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.08691406, + "step": 10470, + "time_per_iteration": 3.2080726623535156 + }, + { + "auxiliary_loss_clip": 0.01286084, + "auxiliary_loss_mlp": 0.00266937, + "balance_loss_clip": 1.06659698, + "balance_loss_mlp": 0.24170008, + "epoch": 0.6295505786863069, + "flos": 42522794150400.0, + "grad_norm": 42.42711519263081, + "language_loss": 0.82002109, + "learning_rate": 1.2747937226064342e-06, + "loss": 0.83555126, + "num_input_tokens_seen": 225826585, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.25219727, + "step": 10471, + "time_per_iteration": 2.8393959999084473 + }, + { + "auxiliary_loss_clip": 0.01309797, + "auxiliary_loss_mlp": 0.00238911, + "balance_loss_clip": 1.0819006, + "balance_loss_mlp": 0.21332842, + "epoch": 0.629610701938975, + "flos": 17384140788480.0, + "grad_norm": 2.5338258236648254, + "language_loss": 0.70775318, + "learning_rate": 1.2744307800183297e-06, + "loss": 0.72324032, + "num_input_tokens_seen": 225844095, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.25622559, + "step": 10472, + "time_per_iteration": 2.813768148422241 + }, + { + "auxiliary_loss_clip": 0.01299197, + "auxiliary_loss_mlp": 0.00228856, + "balance_loss_clip": 1.07713366, + "balance_loss_mlp": 0.20274951, + "epoch": 0.6296708251916429, + "flos": 24242934885120.0, + "grad_norm": 4.185411259076127, + "language_loss": 0.78346848, + "learning_rate": 1.2740678649438828e-06, + "loss": 0.79874897, + "num_input_tokens_seen": 225864310, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.26098633, + "step": 10473, + "time_per_iteration": 2.665252685546875 + }, + { + "auxiliary_loss_clip": 0.01302129, + "auxiliary_loss_mlp": 0.00232216, + "balance_loss_clip": 1.07811022, + "balance_loss_mlp": 0.20474976, + "epoch": 0.6297309484443109, + "flos": 19278536077440.0, + "grad_norm": 30.00697463266715, + "language_loss": 0.82215714, + "learning_rate": 1.2737049773968554e-06, + "loss": 0.83750057, + "num_input_tokens_seen": 225883830, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.27453613, + "step": 10474, + "time_per_iteration": 2.6721673011779785 + }, + { + "auxiliary_loss_clip": 0.01289783, + "auxiliary_loss_mlp": 0.0025506, + "balance_loss_clip": 1.07057619, + "balance_loss_mlp": 0.22950159, + "epoch": 0.6297910716969788, + "flos": 30662685043200.0, + "grad_norm": 7.722634786897449, + "language_loss": 0.74187678, + "learning_rate": 1.2733421173910081e-06, + "loss": 0.75732517, + "num_input_tokens_seen": 225905755, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.25561523, + "step": 10475, + "time_per_iteration": 2.743040084838867 + }, + { + "auxiliary_loss_clip": 0.01300802, + "auxiliary_loss_mlp": 0.00241993, + "balance_loss_clip": 1.07871127, + "balance_loss_mlp": 0.21624345, + "epoch": 0.6298511949496468, + "flos": 14423018371200.0, + "grad_norm": 556.8246388500883, + "language_loss": 0.98293203, + "learning_rate": 1.272979284940101e-06, + "loss": 0.99835998, + "num_input_tokens_seen": 225922155, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.25732422, + "step": 10476, + "time_per_iteration": 2.657285451889038 + }, + { + "auxiliary_loss_clip": 0.01291537, + "auxiliary_loss_mlp": 0.00235121, + "balance_loss_clip": 1.07328606, + "balance_loss_mlp": 0.21102899, + "epoch": 0.6299113182023147, + "flos": 23514163845120.0, + "grad_norm": 47.35024011585484, + "language_loss": 0.83851445, + "learning_rate": 1.2726164800578913e-06, + "loss": 0.85378104, + "num_input_tokens_seen": 225941060, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.24084473, + "step": 10477, + "time_per_iteration": 2.6901097297668457 + }, + { + "auxiliary_loss_clip": 0.01297243, + "auxiliary_loss_mlp": 0.00245016, + "balance_loss_clip": 1.07562208, + "balance_loss_mlp": 0.21859935, + "epoch": 0.6299714414549827, + "flos": 22674500542080.0, + "grad_norm": 3.2684281014951724, + "language_loss": 0.76136935, + "learning_rate": 1.272253702758138e-06, + "loss": 0.77679193, + "num_input_tokens_seen": 225960870, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.26416016, + "step": 10478, + "time_per_iteration": 2.6311099529266357 + }, + { + "auxiliary_loss_clip": 0.01320348, + "auxiliary_loss_mlp": 0.00248858, + "balance_loss_clip": 1.08842671, + "balance_loss_mlp": 0.22095115, + "epoch": 0.6300315647076506, + "flos": 14501735026560.0, + "grad_norm": 34.81070648508225, + "language_loss": 0.80639517, + "learning_rate": 1.2718909530545974e-06, + "loss": 0.82208723, + "num_input_tokens_seen": 225977895, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.27941895, + "step": 10479, + "time_per_iteration": 2.6669647693634033 + }, + { + "auxiliary_loss_clip": 0.01318322, + "auxiliary_loss_mlp": 0.00237285, + "balance_loss_clip": 1.09409463, + "balance_loss_mlp": 0.21123759, + "epoch": 0.6300916879603187, + "flos": 21871681614720.0, + "grad_norm": 3.1517775783913478, + "language_loss": 0.81133556, + "learning_rate": 1.2715282309610245e-06, + "loss": 0.82689172, + "num_input_tokens_seen": 225997835, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.26037598, + "step": 10480, + "time_per_iteration": 2.6561458110809326 + }, + { + "auxiliary_loss_clip": 0.01295229, + "auxiliary_loss_mlp": 0.00253851, + "balance_loss_clip": 1.07219505, + "balance_loss_mlp": 0.22726689, + "epoch": 0.6301518112129866, + "flos": 21834047139840.0, + "grad_norm": 27.69267149676469, + "language_loss": 0.85056055, + "learning_rate": 1.2711655364911744e-06, + "loss": 0.86605144, + "num_input_tokens_seen": 226017620, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.26599121, + "step": 10481, + "time_per_iteration": 2.6612236499786377 + }, + { + "auxiliary_loss_clip": 0.01219402, + "auxiliary_loss_mlp": 0.00074645, + "balance_loss_clip": 1.06257498, + "balance_loss_mlp": 0.06715885, + "epoch": 0.6302119344656546, + "flos": 44334237957120.0, + "grad_norm": 0.8898791841472752, + "language_loss": 0.61671746, + "learning_rate": 1.2708028696588e-06, + "loss": 0.62965786, + "num_input_tokens_seen": 226068755, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.07470703, + "step": 10482, + "time_per_iteration": 2.863495111465454 + }, + { + "auxiliary_loss_clip": 0.01310829, + "auxiliary_loss_mlp": 0.00270011, + "balance_loss_clip": 1.08569515, + "balance_loss_mlp": 0.24354699, + "epoch": 0.6302720577183225, + "flos": 11217919800960.0, + "grad_norm": 18.607221788901246, + "language_loss": 0.90974414, + "learning_rate": 1.2704402304776541e-06, + "loss": 0.92555255, + "num_input_tokens_seen": 226084395, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.26477051, + "step": 10483, + "time_per_iteration": 2.6381165981292725 + }, + { + "auxiliary_loss_clip": 0.01309493, + "auxiliary_loss_mlp": 0.00275602, + "balance_loss_clip": 1.09049034, + "balance_loss_mlp": 0.24770705, + "epoch": 0.6303321809709905, + "flos": 27964932122880.0, + "grad_norm": 187.98326172989837, + "language_loss": 0.80532765, + "learning_rate": 1.270077618961487e-06, + "loss": 0.82117867, + "num_input_tokens_seen": 226105890, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.27905273, + "step": 10484, + "time_per_iteration": 2.696347236633301 + }, + { + "auxiliary_loss_clip": 0.013059, + "auxiliary_loss_mlp": 0.0026539, + "balance_loss_clip": 1.08035231, + "balance_loss_mlp": 0.2388902, + "epoch": 0.6303923042236586, + "flos": 28220759763840.0, + "grad_norm": 66.57353691499989, + "language_loss": 0.81342, + "learning_rate": 1.2697150351240506e-06, + "loss": 0.82913291, + "num_input_tokens_seen": 226126760, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.26489258, + "step": 10485, + "time_per_iteration": 2.7066617012023926 + }, + { + "auxiliary_loss_clip": 0.01317279, + "auxiliary_loss_mlp": 0.00288412, + "balance_loss_clip": 1.08779418, + "balance_loss_mlp": 0.26017177, + "epoch": 0.6304524274763265, + "flos": 27631034271360.0, + "grad_norm": 91.20207700720978, + "language_loss": 0.89396644, + "learning_rate": 1.269352478979093e-06, + "loss": 0.91002333, + "num_input_tokens_seen": 226147315, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.28234863, + "step": 10486, + "time_per_iteration": 2.7110085487365723 + }, + { + "auxiliary_loss_clip": 0.01287875, + "auxiliary_loss_mlp": 0.00267984, + "balance_loss_clip": 1.07089555, + "balance_loss_mlp": 0.24247277, + "epoch": 0.6305125507289945, + "flos": 17311313963520.0, + "grad_norm": 22.807515473657496, + "language_loss": 0.7302919, + "learning_rate": 1.2689899505403628e-06, + "loss": 0.74585056, + "num_input_tokens_seen": 226165935, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.25512695, + "step": 10487, + "time_per_iteration": 5.454029321670532 + }, + { + "auxiliary_loss_clip": 0.01303264, + "auxiliary_loss_mlp": 0.0027983, + "balance_loss_clip": 1.08051205, + "balance_loss_mlp": 0.25279301, + "epoch": 0.6305726739816624, + "flos": 25808280658560.0, + "grad_norm": 22.04759777228893, + "language_loss": 0.73413277, + "learning_rate": 1.2686274498216065e-06, + "loss": 0.74996364, + "num_input_tokens_seen": 226186890, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.27038574, + "step": 10488, + "time_per_iteration": 2.6791250705718994 + }, + { + "auxiliary_loss_clip": 0.01307135, + "auxiliary_loss_mlp": 0.0029332, + "balance_loss_clip": 1.08334506, + "balance_loss_mlp": 0.26530594, + "epoch": 0.6306327972343304, + "flos": 21797454159360.0, + "grad_norm": 133.24050336275735, + "language_loss": 0.73821825, + "learning_rate": 1.2682649768365706e-06, + "loss": 0.75422281, + "num_input_tokens_seen": 226206710, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.28027344, + "step": 10489, + "time_per_iteration": 2.728381872177124 + }, + { + "auxiliary_loss_clip": 0.01323996, + "auxiliary_loss_mlp": 0.00297226, + "balance_loss_clip": 1.0873512, + "balance_loss_mlp": 0.26866388, + "epoch": 0.6306929204869983, + "flos": 20777375819520.0, + "grad_norm": 368.1018256784661, + "language_loss": 0.8172462, + "learning_rate": 1.2679025315990007e-06, + "loss": 0.83345842, + "num_input_tokens_seen": 226225565, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.28527832, + "step": 10490, + "time_per_iteration": 2.6850428581237793 + }, + { + "auxiliary_loss_clip": 0.01310483, + "auxiliary_loss_mlp": 0.00277903, + "balance_loss_clip": 1.08596933, + "balance_loss_mlp": 0.25196293, + "epoch": 0.6307530437396663, + "flos": 23654214973440.0, + "grad_norm": 8.719511135553441, + "language_loss": 0.86087537, + "learning_rate": 1.2675401141226393e-06, + "loss": 0.87675923, + "num_input_tokens_seen": 226243680, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.25952148, + "step": 10491, + "time_per_iteration": 4.124901294708252 + }, + { + "auxiliary_loss_clip": 0.01309927, + "auxiliary_loss_mlp": 0.00274925, + "balance_loss_clip": 1.08708549, + "balance_loss_mlp": 0.24799588, + "epoch": 0.6308131669923343, + "flos": 24719002767360.0, + "grad_norm": 791.924397346603, + "language_loss": 0.65929377, + "learning_rate": 1.2671777244212308e-06, + "loss": 0.67514223, + "num_input_tokens_seen": 226264345, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.26916504, + "step": 10492, + "time_per_iteration": 2.816011428833008 + }, + { + "auxiliary_loss_clip": 0.01318105, + "auxiliary_loss_mlp": 0.00274449, + "balance_loss_clip": 1.08956194, + "balance_loss_mlp": 0.24730493, + "epoch": 0.6308732902450023, + "flos": 22565403959040.0, + "grad_norm": 14.305467315759804, + "language_loss": 0.73560584, + "learning_rate": 1.2668153625085168e-06, + "loss": 0.75153136, + "num_input_tokens_seen": 226283165, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.27160645, + "step": 10493, + "time_per_iteration": 2.679671287536621 + }, + { + "auxiliary_loss_clip": 0.01323307, + "auxiliary_loss_mlp": 0.00275118, + "balance_loss_clip": 1.09761822, + "balance_loss_mlp": 0.24859405, + "epoch": 0.6309334134976702, + "flos": 24644200694400.0, + "grad_norm": 4.65604658537384, + "language_loss": 0.87346387, + "learning_rate": 1.2664530283982367e-06, + "loss": 0.88944817, + "num_input_tokens_seen": 226304080, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.265625, + "step": 10494, + "time_per_iteration": 2.7208423614501953 + }, + { + "auxiliary_loss_clip": 0.01314796, + "auxiliary_loss_mlp": 0.00308316, + "balance_loss_clip": 1.09154201, + "balance_loss_mlp": 0.2798489, + "epoch": 0.6309935367503382, + "flos": 41427949651200.0, + "grad_norm": 6.144772537645815, + "language_loss": 0.87370008, + "learning_rate": 1.2660907221041317e-06, + "loss": 0.8899312, + "num_input_tokens_seen": 226325925, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.28479004, + "step": 10495, + "time_per_iteration": 4.2698540687561035 + }, + { + "auxiliary_loss_clip": 0.01326241, + "auxiliary_loss_mlp": 0.00288082, + "balance_loss_clip": 1.09592509, + "balance_loss_mlp": 0.26067603, + "epoch": 0.6310536600030061, + "flos": 15118931445120.0, + "grad_norm": 18.77949496924361, + "language_loss": 0.79489958, + "learning_rate": 1.2657284436399403e-06, + "loss": 0.81104285, + "num_input_tokens_seen": 226344190, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.27404785, + "step": 10496, + "time_per_iteration": 2.670269012451172 + }, + { + "auxiliary_loss_clip": 0.01333405, + "auxiliary_loss_mlp": 0.00269902, + "balance_loss_clip": 1.10212088, + "balance_loss_mlp": 0.24204287, + "epoch": 0.6311137832556741, + "flos": 15231619388160.0, + "grad_norm": 10.022518054428645, + "language_loss": 0.91682839, + "learning_rate": 1.2653661930193997e-06, + "loss": 0.93286151, + "num_input_tokens_seen": 226361520, + "router_z_loss_clip": 2.31054688, + "router_z_loss_mlp": 0.27856445, + "step": 10497, + "time_per_iteration": 2.7033205032348633 + }, + { + "auxiliary_loss_clip": 0.01323455, + "auxiliary_loss_mlp": 0.00298296, + "balance_loss_clip": 1.09835744, + "balance_loss_mlp": 0.27118734, + "epoch": 0.6311739065083422, + "flos": 22018664067840.0, + "grad_norm": 19.208189953743506, + "language_loss": 0.83645415, + "learning_rate": 1.265003970256247e-06, + "loss": 0.85267174, + "num_input_tokens_seen": 226381920, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.27148438, + "step": 10498, + "time_per_iteration": 2.655501365661621 + }, + { + "auxiliary_loss_clip": 0.01311073, + "auxiliary_loss_mlp": 0.00288406, + "balance_loss_clip": 1.08733416, + "balance_loss_mlp": 0.26008224, + "epoch": 0.6312340297610101, + "flos": 22710770300160.0, + "grad_norm": 215.63383821804354, + "language_loss": 0.78874195, + "learning_rate": 1.264641775364217e-06, + "loss": 0.80473673, + "num_input_tokens_seen": 226400035, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.28356934, + "step": 10499, + "time_per_iteration": 2.679158926010132 + }, + { + "auxiliary_loss_clip": 0.01344267, + "auxiliary_loss_mlp": 0.00258478, + "balance_loss_clip": 1.11507225, + "balance_loss_mlp": 0.23009461, + "epoch": 0.6312941530136781, + "flos": 24280102483200.0, + "grad_norm": 45.014474383215095, + "language_loss": 0.80946761, + "learning_rate": 1.2642796083570448e-06, + "loss": 0.825495, + "num_input_tokens_seen": 226418280, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.28393555, + "step": 10500, + "time_per_iteration": 2.675501585006714 + }, + { + "auxiliary_loss_clip": 0.01339468, + "auxiliary_loss_mlp": 0.00277484, + "balance_loss_clip": 1.11015546, + "balance_loss_mlp": 0.25037605, + "epoch": 0.631354276266346, + "flos": 21725956137600.0, + "grad_norm": 39.73185089281494, + "language_loss": 0.82313228, + "learning_rate": 1.2639174692484634e-06, + "loss": 0.83930182, + "num_input_tokens_seen": 226436650, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.27099609, + "step": 10501, + "time_per_iteration": 2.663654088973999 + }, + { + "auxiliary_loss_clip": 0.0131708, + "auxiliary_loss_mlp": 0.00296549, + "balance_loss_clip": 1.09309042, + "balance_loss_mlp": 0.26915431, + "epoch": 0.631414399519014, + "flos": 24025100855040.0, + "grad_norm": 70.96265931519326, + "language_loss": 0.82411981, + "learning_rate": 1.2635553580522053e-06, + "loss": 0.84025609, + "num_input_tokens_seen": 226456275, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.27404785, + "step": 10502, + "time_per_iteration": 2.671628713607788 + }, + { + "auxiliary_loss_clip": 0.01328655, + "auxiliary_loss_mlp": 0.00289506, + "balance_loss_clip": 1.09777331, + "balance_loss_mlp": 0.26046693, + "epoch": 0.6314745227716819, + "flos": 24315797623680.0, + "grad_norm": 79.15230845305945, + "language_loss": 0.94462478, + "learning_rate": 1.2631932747820022e-06, + "loss": 0.96080637, + "num_input_tokens_seen": 226473610, + "router_z_loss_clip": 2.31054688, + "router_z_loss_mlp": 0.29052734, + "step": 10503, + "time_per_iteration": 2.686699390411377 + }, + { + "auxiliary_loss_clip": 0.01330485, + "auxiliary_loss_mlp": 0.0028073, + "balance_loss_clip": 1.1029948, + "balance_loss_mlp": 0.25439653, + "epoch": 0.6315346460243499, + "flos": 23366391292800.0, + "grad_norm": 13.498137145671677, + "language_loss": 0.9288348, + "learning_rate": 1.2628312194515838e-06, + "loss": 0.94494694, + "num_input_tokens_seen": 226493665, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.26379395, + "step": 10504, + "time_per_iteration": 2.6633176803588867 + }, + { + "auxiliary_loss_clip": 0.01331111, + "auxiliary_loss_mlp": 0.00271241, + "balance_loss_clip": 1.09903598, + "balance_loss_mlp": 0.24403772, + "epoch": 0.6315947692770179, + "flos": 20260333497600.0, + "grad_norm": 60.61797786712131, + "language_loss": 0.86404645, + "learning_rate": 1.2624691920746793e-06, + "loss": 0.88006997, + "num_input_tokens_seen": 226511625, + "router_z_loss_clip": 2.31835938, + "router_z_loss_mlp": 0.27209473, + "step": 10505, + "time_per_iteration": 2.6564669609069824 + }, + { + "auxiliary_loss_clip": 0.01348583, + "auxiliary_loss_mlp": 0.00276843, + "balance_loss_clip": 1.11541641, + "balance_loss_mlp": 0.24744575, + "epoch": 0.6316548925296859, + "flos": 25265850399360.0, + "grad_norm": 3.1095890039918777, + "language_loss": 0.88854164, + "learning_rate": 1.2621071926650166e-06, + "loss": 0.90479594, + "num_input_tokens_seen": 226530085, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.29406738, + "step": 10506, + "time_per_iteration": 2.6361513137817383 + }, + { + "auxiliary_loss_clip": 0.01332652, + "auxiliary_loss_mlp": 0.00283154, + "balance_loss_clip": 1.10220051, + "balance_loss_mlp": 0.25599849, + "epoch": 0.6317150157823538, + "flos": 22930579578240.0, + "grad_norm": 8.644313832844142, + "language_loss": 0.8164649, + "learning_rate": 1.2617452212363238e-06, + "loss": 0.83262289, + "num_input_tokens_seen": 226548115, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.27160645, + "step": 10507, + "time_per_iteration": 2.6307029724121094 + }, + { + "auxiliary_loss_clip": 0.01354197, + "auxiliary_loss_mlp": 0.00287338, + "balance_loss_clip": 1.11983871, + "balance_loss_mlp": 0.26021743, + "epoch": 0.6317751390350218, + "flos": 22527051212160.0, + "grad_norm": 129.47837017597664, + "language_loss": 0.74742448, + "learning_rate": 1.2613832778023258e-06, + "loss": 0.76383978, + "num_input_tokens_seen": 226567955, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.27124023, + "step": 10508, + "time_per_iteration": 2.6589043140411377 + }, + { + "auxiliary_loss_clip": 0.01321448, + "auxiliary_loss_mlp": 0.00310975, + "balance_loss_clip": 1.09703457, + "balance_loss_mlp": 0.28176862, + "epoch": 0.6318352622876897, + "flos": 23294749616640.0, + "grad_norm": 62.596241956924565, + "language_loss": 0.76933801, + "learning_rate": 1.2610213623767478e-06, + "loss": 0.78566223, + "num_input_tokens_seen": 226588205, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.29174805, + "step": 10509, + "time_per_iteration": 2.6819825172424316 + }, + { + "auxiliary_loss_clip": 0.01320525, + "auxiliary_loss_mlp": 0.00278602, + "balance_loss_clip": 1.09719253, + "balance_loss_mlp": 0.25192267, + "epoch": 0.6318953855403577, + "flos": 20704082117760.0, + "grad_norm": 49.88383846675862, + "language_loss": 0.85031915, + "learning_rate": 1.2606594749733143e-06, + "loss": 0.86631042, + "num_input_tokens_seen": 226606965, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.26672363, + "step": 10510, + "time_per_iteration": 2.682595729827881 + }, + { + "auxiliary_loss_clip": 0.01323395, + "auxiliary_loss_mlp": 0.00300456, + "balance_loss_clip": 1.09809446, + "balance_loss_mlp": 0.27034363, + "epoch": 0.6319555087930258, + "flos": 22820046451200.0, + "grad_norm": 11.02926850767085, + "language_loss": 0.77811778, + "learning_rate": 1.2602976156057469e-06, + "loss": 0.79435623, + "num_input_tokens_seen": 226627845, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.30114746, + "step": 10511, + "time_per_iteration": 2.7533230781555176 + }, + { + "auxiliary_loss_clip": 0.01334263, + "auxiliary_loss_mlp": 0.00294632, + "balance_loss_clip": 1.11174393, + "balance_loss_mlp": 0.26856068, + "epoch": 0.6320156320456937, + "flos": 19970929618560.0, + "grad_norm": 2.3114343406731654, + "language_loss": 0.87071538, + "learning_rate": 1.2599357842877684e-06, + "loss": 0.88700426, + "num_input_tokens_seen": 226645855, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.26074219, + "step": 10512, + "time_per_iteration": 2.689199686050415 + }, + { + "auxiliary_loss_clip": 0.01326969, + "auxiliary_loss_mlp": 0.00290279, + "balance_loss_clip": 1.10498095, + "balance_loss_mlp": 0.26125163, + "epoch": 0.6320757552983617, + "flos": 27013406889600.0, + "grad_norm": 54.8947147873753, + "language_loss": 0.77372831, + "learning_rate": 1.2595739810330994e-06, + "loss": 0.78990078, + "num_input_tokens_seen": 226665375, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.29040527, + "step": 10513, + "time_per_iteration": 2.663728713989258 + }, + { + "auxiliary_loss_clip": 0.0133713, + "auxiliary_loss_mlp": 0.00289462, + "balance_loss_clip": 1.10562468, + "balance_loss_mlp": 0.26187667, + "epoch": 0.6321358785510296, + "flos": 23695943598720.0, + "grad_norm": 4.087661614772674, + "language_loss": 0.74548328, + "learning_rate": 1.259212205855459e-06, + "loss": 0.76174915, + "num_input_tokens_seen": 226685270, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.27587891, + "step": 10514, + "time_per_iteration": 2.6908814907073975 + }, + { + "auxiliary_loss_clip": 0.01327737, + "auxiliary_loss_mlp": 0.00256787, + "balance_loss_clip": 1.0994184, + "balance_loss_mlp": 0.22929721, + "epoch": 0.6321960018036976, + "flos": 25995231970560.0, + "grad_norm": 75.57308177373021, + "language_loss": 0.82892728, + "learning_rate": 1.2588504587685663e-06, + "loss": 0.84477258, + "num_input_tokens_seen": 226705325, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.27490234, + "step": 10515, + "time_per_iteration": 2.7107856273651123 + }, + { + "auxiliary_loss_clip": 0.01343457, + "auxiliary_loss_mlp": 0.00265093, + "balance_loss_clip": 1.11480808, + "balance_loss_mlp": 0.23746008, + "epoch": 0.6322561250563655, + "flos": 22821016118400.0, + "grad_norm": 139.29874030006522, + "language_loss": 0.94322914, + "learning_rate": 1.2584887397861379e-06, + "loss": 0.95931464, + "num_input_tokens_seen": 226723815, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.27661133, + "step": 10516, + "time_per_iteration": 2.7027688026428223 + }, + { + "auxiliary_loss_clip": 0.01336798, + "auxiliary_loss_mlp": 0.00307335, + "balance_loss_clip": 1.10597086, + "balance_loss_mlp": 0.27741349, + "epoch": 0.6323162483090335, + "flos": 18988413926400.0, + "grad_norm": 17.555480703658063, + "language_loss": 0.88267249, + "learning_rate": 1.2581270489218911e-06, + "loss": 0.89911383, + "num_input_tokens_seen": 226741550, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.29956055, + "step": 10517, + "time_per_iteration": 2.6591944694519043 + }, + { + "auxiliary_loss_clip": 0.01342472, + "auxiliary_loss_mlp": 0.00272236, + "balance_loss_clip": 1.11362755, + "balance_loss_mlp": 0.24456739, + "epoch": 0.6323763715617015, + "flos": 19865173000320.0, + "grad_norm": 37.61367839318699, + "language_loss": 0.8506034, + "learning_rate": 1.257765386189541e-06, + "loss": 0.86675048, + "num_input_tokens_seen": 226761115, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.27661133, + "step": 10518, + "time_per_iteration": 2.791691541671753 + }, + { + "auxiliary_loss_clip": 0.01343397, + "auxiliary_loss_mlp": 0.00269699, + "balance_loss_clip": 1.11945474, + "balance_loss_mlp": 0.24285318, + "epoch": 0.6324364948143695, + "flos": 22782699285120.0, + "grad_norm": 98.17468794949811, + "language_loss": 0.89840209, + "learning_rate": 1.2574037516028018e-06, + "loss": 0.91453302, + "num_input_tokens_seen": 226782225, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.26831055, + "step": 10519, + "time_per_iteration": 2.691584348678589 + }, + { + "auxiliary_loss_clip": 0.01343288, + "auxiliary_loss_mlp": 0.00282326, + "balance_loss_clip": 1.11886859, + "balance_loss_mlp": 0.25549152, + "epoch": 0.6324966180670374, + "flos": 22235923480320.0, + "grad_norm": 32.1289285037465, + "language_loss": 0.79021704, + "learning_rate": 1.2570421451753867e-06, + "loss": 0.80647314, + "num_input_tokens_seen": 226802375, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.26855469, + "step": 10520, + "time_per_iteration": 2.676027774810791 + }, + { + "auxiliary_loss_clip": 0.01346722, + "auxiliary_loss_mlp": 0.0027238, + "balance_loss_clip": 1.11556292, + "balance_loss_mlp": 0.24429393, + "epoch": 0.6325567413197054, + "flos": 21689183589120.0, + "grad_norm": 41.48705552374485, + "language_loss": 0.8169682, + "learning_rate": 1.2566805669210081e-06, + "loss": 0.83315915, + "num_input_tokens_seen": 226822165, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.28076172, + "step": 10521, + "time_per_iteration": 2.7076125144958496 + }, + { + "auxiliary_loss_clip": 0.01323583, + "auxiliary_loss_mlp": 0.00272355, + "balance_loss_clip": 1.10105503, + "balance_loss_mlp": 0.24472275, + "epoch": 0.6326168645723733, + "flos": 19937137898880.0, + "grad_norm": 24.29992747594278, + "language_loss": 0.79727411, + "learning_rate": 1.256319016853377e-06, + "loss": 0.81323349, + "num_input_tokens_seen": 226841645, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.27636719, + "step": 10522, + "time_per_iteration": 2.6867616176605225 + }, + { + "auxiliary_loss_clip": 0.01331023, + "auxiliary_loss_mlp": 0.00298386, + "balance_loss_clip": 1.1086812, + "balance_loss_mlp": 0.26984692, + "epoch": 0.6326769878250413, + "flos": 20230348619520.0, + "grad_norm": 58.70127845209635, + "language_loss": 0.89387542, + "learning_rate": 1.2559574949862023e-06, + "loss": 0.91016954, + "num_input_tokens_seen": 226860355, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.28588867, + "step": 10523, + "time_per_iteration": 2.678992748260498 + }, + { + "auxiliary_loss_clip": 0.01337561, + "auxiliary_loss_mlp": 0.00297592, + "balance_loss_clip": 1.11221588, + "balance_loss_mlp": 0.26898199, + "epoch": 0.6327371110777094, + "flos": 20775759707520.0, + "grad_norm": 161.22467828262245, + "language_loss": 0.83487695, + "learning_rate": 1.255596001333195e-06, + "loss": 0.85122848, + "num_input_tokens_seen": 226878390, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.28613281, + "step": 10524, + "time_per_iteration": 2.623450756072998 + }, + { + "auxiliary_loss_clip": 0.01360633, + "auxiliary_loss_mlp": 0.00301925, + "balance_loss_clip": 1.12141442, + "balance_loss_mlp": 0.27047753, + "epoch": 0.6327972343303773, + "flos": 30336544529280.0, + "grad_norm": 15.775232020846635, + "language_loss": 0.91151363, + "learning_rate": 1.2552345359080615e-06, + "loss": 0.92813921, + "num_input_tokens_seen": 226898420, + "router_z_loss_clip": 2.390625, + "router_z_loss_mlp": 0.31445312, + "step": 10525, + "time_per_iteration": 2.7243878841400146 + }, + { + "auxiliary_loss_clip": 0.01327665, + "auxiliary_loss_mlp": 0.00276279, + "balance_loss_clip": 1.10500479, + "balance_loss_mlp": 0.24902809, + "epoch": 0.6328573575830453, + "flos": 17092258871040.0, + "grad_norm": 52.37301500884562, + "language_loss": 0.73967981, + "learning_rate": 1.2548730987245093e-06, + "loss": 0.75571924, + "num_input_tokens_seen": 226916305, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.27258301, + "step": 10526, + "time_per_iteration": 2.6159050464630127 + }, + { + "auxiliary_loss_clip": 0.01346119, + "auxiliary_loss_mlp": 0.00258147, + "balance_loss_clip": 1.11765838, + "balance_loss_mlp": 0.23012093, + "epoch": 0.6329174808357132, + "flos": 25047154442880.0, + "grad_norm": 7.646450863832346, + "language_loss": 0.79812837, + "learning_rate": 1.254511689796244e-06, + "loss": 0.81417108, + "num_input_tokens_seen": 226937705, + "router_z_loss_clip": 2.28320312, + "router_z_loss_mlp": 0.28015137, + "step": 10527, + "time_per_iteration": 2.686269998550415 + }, + { + "auxiliary_loss_clip": 0.01325942, + "auxiliary_loss_mlp": 0.00272921, + "balance_loss_clip": 1.10585308, + "balance_loss_mlp": 0.24823233, + "epoch": 0.6329776040883812, + "flos": 16836826279680.0, + "grad_norm": 11.92973678827094, + "language_loss": 0.79051054, + "learning_rate": 1.2541503091369693e-06, + "loss": 0.80649918, + "num_input_tokens_seen": 226954880, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.24694824, + "step": 10528, + "time_per_iteration": 2.644465446472168 + }, + { + "auxiliary_loss_clip": 0.0135735, + "auxiliary_loss_mlp": 0.00268851, + "balance_loss_clip": 1.12125754, + "balance_loss_mlp": 0.24045566, + "epoch": 0.6330377273410491, + "flos": 13516705382400.0, + "grad_norm": 108.0096857467456, + "language_loss": 0.78227496, + "learning_rate": 1.2537889567603905e-06, + "loss": 0.79853702, + "num_input_tokens_seen": 226972595, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.28442383, + "step": 10529, + "time_per_iteration": 4.06558895111084 + }, + { + "auxiliary_loss_clip": 0.01380851, + "auxiliary_loss_mlp": 0.00314538, + "balance_loss_clip": 1.13495815, + "balance_loss_mlp": 0.28316164, + "epoch": 0.6330978505937171, + "flos": 21538825257600.0, + "grad_norm": 1078.6756324067644, + "language_loss": 0.84894574, + "learning_rate": 1.2534276326802092e-06, + "loss": 0.86589968, + "num_input_tokens_seen": 226991910, + "router_z_loss_clip": 2.45703125, + "router_z_loss_mlp": 0.3137207, + "step": 10530, + "time_per_iteration": 4.058784008026123 + }, + { + "auxiliary_loss_clip": 0.01374139, + "auxiliary_loss_mlp": 0.002916, + "balance_loss_clip": 1.13512123, + "balance_loss_mlp": 0.26347899, + "epoch": 0.6331579738463851, + "flos": 25009484054400.0, + "grad_norm": 13.302855306741977, + "language_loss": 0.798401, + "learning_rate": 1.2530663369101259e-06, + "loss": 0.81505841, + "num_input_tokens_seen": 227010175, + "router_z_loss_clip": 2.390625, + "router_z_loss_mlp": 0.28112793, + "step": 10531, + "time_per_iteration": 2.6525802612304688 + }, + { + "auxiliary_loss_clip": 0.01356099, + "auxiliary_loss_mlp": 0.00293567, + "balance_loss_clip": 1.12648046, + "balance_loss_mlp": 0.26583844, + "epoch": 0.6332180970990531, + "flos": 14976007228800.0, + "grad_norm": 25.847278370402414, + "language_loss": 0.86606884, + "learning_rate": 1.2527050694638432e-06, + "loss": 0.8825655, + "num_input_tokens_seen": 227025540, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.27709961, + "step": 10532, + "time_per_iteration": 2.645313262939453 + }, + { + "auxiliary_loss_clip": 0.01322851, + "auxiliary_loss_mlp": 0.00286897, + "balance_loss_clip": 1.10144877, + "balance_loss_mlp": 0.25978839, + "epoch": 0.633278220351721, + "flos": 22706963458560.0, + "grad_norm": 37.8406908746276, + "language_loss": 0.80476636, + "learning_rate": 1.2523438303550582e-06, + "loss": 0.82086384, + "num_input_tokens_seen": 227045520, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.27099609, + "step": 10533, + "time_per_iteration": 4.174433469772339 + }, + { + "auxiliary_loss_clip": 0.0135768, + "auxiliary_loss_mlp": 0.00295776, + "balance_loss_clip": 1.11843181, + "balance_loss_mlp": 0.26597396, + "epoch": 0.633338343604389, + "flos": 12602922364800.0, + "grad_norm": 24.190370037818564, + "language_loss": 0.88274497, + "learning_rate": 1.2519826195974706e-06, + "loss": 0.89927953, + "num_input_tokens_seen": 227059420, + "router_z_loss_clip": 2.390625, + "router_z_loss_mlp": 0.29797363, + "step": 10534, + "time_per_iteration": 2.6452763080596924 + }, + { + "auxiliary_loss_clip": 0.01346558, + "auxiliary_loss_mlp": 0.00256907, + "balance_loss_clip": 1.11720872, + "balance_loss_mlp": 0.23052615, + "epoch": 0.6333984668570569, + "flos": 25960111447680.0, + "grad_norm": 482.0229138995103, + "language_loss": 0.91808206, + "learning_rate": 1.251621437204777e-06, + "loss": 0.93411672, + "num_input_tokens_seen": 227081310, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.26391602, + "step": 10535, + "time_per_iteration": 2.745957136154175 + }, + { + "auxiliary_loss_clip": 0.01325087, + "auxiliary_loss_mlp": 0.00286144, + "balance_loss_clip": 1.09930897, + "balance_loss_mlp": 0.2588093, + "epoch": 0.6334585901097249, + "flos": 23659242877440.0, + "grad_norm": 5.38501606689798, + "language_loss": 0.85000926, + "learning_rate": 1.2512602831906733e-06, + "loss": 0.86612153, + "num_input_tokens_seen": 227100365, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.2734375, + "step": 10536, + "time_per_iteration": 2.6630067825317383 + }, + { + "auxiliary_loss_clip": 0.01351178, + "auxiliary_loss_mlp": 0.002882, + "balance_loss_clip": 1.1219331, + "balance_loss_mlp": 0.26039994, + "epoch": 0.633518713362393, + "flos": 28760496503040.0, + "grad_norm": 2.1745772193660047, + "language_loss": 0.6876117, + "learning_rate": 1.250899157568855e-06, + "loss": 0.70400554, + "num_input_tokens_seen": 227119680, + "router_z_loss_clip": 2.29101562, + "router_z_loss_mlp": 0.27832031, + "step": 10537, + "time_per_iteration": 4.087575912475586 + }, + { + "auxiliary_loss_clip": 0.01285261, + "auxiliary_loss_mlp": 0.00075888, + "balance_loss_clip": 1.13508105, + "balance_loss_mlp": 0.06825836, + "epoch": 0.6335788366150609, + "flos": 70420322401920.0, + "grad_norm": 0.7758460645531864, + "language_loss": 0.51978451, + "learning_rate": 1.2505380603530155e-06, + "loss": 0.53339601, + "num_input_tokens_seen": 227184465, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.07617188, + "step": 10538, + "time_per_iteration": 3.3048202991485596 + }, + { + "auxiliary_loss_clip": 0.01374052, + "auxiliary_loss_mlp": 0.00278435, + "balance_loss_clip": 1.12995338, + "balance_loss_mlp": 0.24784636, + "epoch": 0.6336389598677289, + "flos": 23732069702400.0, + "grad_norm": 6.928084117716129, + "language_loss": 0.92803842, + "learning_rate": 1.250176991556848e-06, + "loss": 0.94456339, + "num_input_tokens_seen": 227202185, + "router_z_loss_clip": 2.4453125, + "router_z_loss_mlp": 0.30541992, + "step": 10539, + "time_per_iteration": 2.6838860511779785 + }, + { + "auxiliary_loss_clip": 0.01363694, + "auxiliary_loss_mlp": 0.00281315, + "balance_loss_clip": 1.12340999, + "balance_loss_mlp": 0.25027335, + "epoch": 0.6336990831203968, + "flos": 29276676898560.0, + "grad_norm": 28.997992376609446, + "language_loss": 0.92767864, + "learning_rate": 1.2498159511940438e-06, + "loss": 0.94412875, + "num_input_tokens_seen": 227222020, + "router_z_loss_clip": 2.40039062, + "router_z_loss_mlp": 0.3104248, + "step": 10540, + "time_per_iteration": 2.7283031940460205 + }, + { + "auxiliary_loss_clip": 0.01325125, + "auxiliary_loss_mlp": 0.00270078, + "balance_loss_clip": 1.10351515, + "balance_loss_mlp": 0.24460261, + "epoch": 0.6337592063730648, + "flos": 29096836479360.0, + "grad_norm": 4.35250806662915, + "language_loss": 0.79992342, + "learning_rate": 1.2494549392782943e-06, + "loss": 0.81587553, + "num_input_tokens_seen": 227240885, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.25463867, + "step": 10541, + "time_per_iteration": 2.7414968013763428 + }, + { + "auxiliary_loss_clip": 0.01360766, + "auxiliary_loss_mlp": 0.00288023, + "balance_loss_clip": 1.12363267, + "balance_loss_mlp": 0.25912666, + "epoch": 0.6338193296257327, + "flos": 34706477249280.0, + "grad_norm": 31.403505779751132, + "language_loss": 0.92241836, + "learning_rate": 1.2490939558232887e-06, + "loss": 0.93890625, + "num_input_tokens_seen": 227257880, + "router_z_loss_clip": 2.37109375, + "router_z_loss_mlp": 0.28881836, + "step": 10542, + "time_per_iteration": 2.766915798187256 + }, + { + "auxiliary_loss_clip": 0.0132393, + "auxiliary_loss_mlp": 0.00286572, + "balance_loss_clip": 1.09685445, + "balance_loss_mlp": 0.25723416, + "epoch": 0.6338794528784008, + "flos": 16687581269760.0, + "grad_norm": 20.846128405104068, + "language_loss": 0.83930314, + "learning_rate": 1.2487330008427153e-06, + "loss": 0.85540819, + "num_input_tokens_seen": 227274840, + "router_z_loss_clip": 2.265625, + "router_z_loss_mlp": 0.29370117, + "step": 10543, + "time_per_iteration": 2.629380464553833 + }, + { + "auxiliary_loss_clip": 0.01313919, + "auxiliary_loss_mlp": 0.00269651, + "balance_loss_clip": 1.09413671, + "balance_loss_mlp": 0.24322221, + "epoch": 0.6339395761310687, + "flos": 22346600261760.0, + "grad_norm": 3.3172628751848645, + "language_loss": 0.8014912, + "learning_rate": 1.2483720743502618e-06, + "loss": 0.8173269, + "num_input_tokens_seen": 227294835, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.26428223, + "step": 10544, + "time_per_iteration": 2.63185453414917 + }, + { + "auxiliary_loss_clip": 0.01344067, + "auxiliary_loss_mlp": 0.00305171, + "balance_loss_clip": 1.10837948, + "balance_loss_mlp": 0.27412885, + "epoch": 0.6339996993837367, + "flos": 18551812112640.0, + "grad_norm": 26.716536863925903, + "language_loss": 0.76594895, + "learning_rate": 1.2480111763596144e-06, + "loss": 0.78244132, + "num_input_tokens_seen": 227314935, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.31054688, + "step": 10545, + "time_per_iteration": 2.6522269248962402 + }, + { + "auxiliary_loss_clip": 0.01341311, + "auxiliary_loss_mlp": 0.00244556, + "balance_loss_clip": 1.11327648, + "balance_loss_mlp": 0.21582688, + "epoch": 0.6340598226364046, + "flos": 12969498614400.0, + "grad_norm": 22.74412926869667, + "language_loss": 0.81185508, + "learning_rate": 1.2476503068844592e-06, + "loss": 0.82771379, + "num_input_tokens_seen": 227332905, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.2869873, + "step": 10546, + "time_per_iteration": 2.626230001449585 + }, + { + "auxiliary_loss_clip": 0.0132056, + "auxiliary_loss_mlp": 0.00276664, + "balance_loss_clip": 1.10158587, + "balance_loss_mlp": 0.24978226, + "epoch": 0.6341199458890726, + "flos": 26687984647680.0, + "grad_norm": 190.21026855553092, + "language_loss": 0.82361376, + "learning_rate": 1.2472894659384792e-06, + "loss": 0.83958602, + "num_input_tokens_seen": 227354915, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.26916504, + "step": 10547, + "time_per_iteration": 2.7480995655059814 + }, + { + "auxiliary_loss_clip": 0.01358349, + "auxiliary_loss_mlp": 0.00304082, + "balance_loss_clip": 1.12230945, + "balance_loss_mlp": 0.2731595, + "epoch": 0.6341800691417405, + "flos": 18734274224640.0, + "grad_norm": 10.809650008122539, + "language_loss": 0.74136889, + "learning_rate": 1.2469286535353578e-06, + "loss": 0.75799322, + "num_input_tokens_seen": 227372990, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.30932617, + "step": 10548, + "time_per_iteration": 2.66064453125 + }, + { + "auxiliary_loss_clip": 0.01341313, + "auxiliary_loss_mlp": 0.0027612, + "balance_loss_clip": 1.10986996, + "balance_loss_mlp": 0.24668752, + "epoch": 0.6342401923944085, + "flos": 26249443499520.0, + "grad_norm": 11.641476388040372, + "language_loss": 0.71027946, + "learning_rate": 1.2465678696887785e-06, + "loss": 0.7264539, + "num_input_tokens_seen": 227393270, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.29455566, + "step": 10549, + "time_per_iteration": 2.754582405090332 + }, + { + "auxiliary_loss_clip": 0.01315401, + "auxiliary_loss_mlp": 0.00277646, + "balance_loss_clip": 1.09372258, + "balance_loss_mlp": 0.25231433, + "epoch": 0.6343003156470765, + "flos": 24680937329280.0, + "grad_norm": 37.18195478220987, + "language_loss": 0.8052808, + "learning_rate": 1.2462071144124197e-06, + "loss": 0.82121134, + "num_input_tokens_seen": 227413630, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.25317383, + "step": 10550, + "time_per_iteration": 2.6711015701293945 + }, + { + "auxiliary_loss_clip": 0.01257966, + "auxiliary_loss_mlp": 0.00083199, + "balance_loss_clip": 1.11191511, + "balance_loss_mlp": 0.07566463, + "epoch": 0.6343604388997445, + "flos": 69805352626560.0, + "grad_norm": 0.6907094952451559, + "language_loss": 0.57379198, + "learning_rate": 1.2458463877199638e-06, + "loss": 0.58720362, + "num_input_tokens_seen": 227476630, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.07519531, + "step": 10551, + "time_per_iteration": 3.1610169410705566 + }, + { + "auxiliary_loss_clip": 0.01311695, + "auxiliary_loss_mlp": 0.00288281, + "balance_loss_clip": 1.08892286, + "balance_loss_mlp": 0.26010019, + "epoch": 0.6344205621524125, + "flos": 21982430223360.0, + "grad_norm": 44.73949610362728, + "language_loss": 0.73916811, + "learning_rate": 1.2454856896250881e-06, + "loss": 0.7551679, + "num_input_tokens_seen": 227496060, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.28198242, + "step": 10552, + "time_per_iteration": 2.6520326137542725 + }, + { + "auxiliary_loss_clip": 0.01325463, + "auxiliary_loss_mlp": 0.00287496, + "balance_loss_clip": 1.09622538, + "balance_loss_mlp": 0.25927877, + "epoch": 0.6344806854050804, + "flos": 20448865008000.0, + "grad_norm": 8.905538246087303, + "language_loss": 0.89343464, + "learning_rate": 1.24512502014147e-06, + "loss": 0.90956426, + "num_input_tokens_seen": 227513440, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.2824707, + "step": 10553, + "time_per_iteration": 2.650280237197876 + }, + { + "auxiliary_loss_clip": 0.01351206, + "auxiliary_loss_mlp": 0.00298466, + "balance_loss_clip": 1.1169312, + "balance_loss_mlp": 0.26954556, + "epoch": 0.6345408086577484, + "flos": 40510611187200.0, + "grad_norm": 63.647205690877044, + "language_loss": 0.64050281, + "learning_rate": 1.2447643792827879e-06, + "loss": 0.65699947, + "num_input_tokens_seen": 227535395, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.28918457, + "step": 10554, + "time_per_iteration": 2.8377532958984375 + }, + { + "auxiliary_loss_clip": 0.01318878, + "auxiliary_loss_mlp": 0.00289761, + "balance_loss_clip": 1.09408283, + "balance_loss_mlp": 0.26265314, + "epoch": 0.6346009319104163, + "flos": 21361319222400.0, + "grad_norm": 1206.206816046274, + "language_loss": 0.79167342, + "learning_rate": 1.2444037670627153e-06, + "loss": 0.80775976, + "num_input_tokens_seen": 227554545, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.27099609, + "step": 10555, + "time_per_iteration": 2.667522430419922 + }, + { + "auxiliary_loss_clip": 0.01250465, + "auxiliary_loss_mlp": 0.00111293, + "balance_loss_clip": 1.10930061, + "balance_loss_mlp": 0.10147049, + "epoch": 0.6346610551630844, + "flos": 71365419100800.0, + "grad_norm": 0.7777050815664506, + "language_loss": 0.54714429, + "learning_rate": 1.2440431834949276e-06, + "loss": 0.56076193, + "num_input_tokens_seen": 227608575, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.09814453, + "step": 10556, + "time_per_iteration": 3.0919930934906006 + }, + { + "auxiliary_loss_clip": 0.01336657, + "auxiliary_loss_mlp": 0.00292407, + "balance_loss_clip": 1.10174751, + "balance_loss_mlp": 0.26317739, + "epoch": 0.6347211784157523, + "flos": 25411504049280.0, + "grad_norm": 504.5100521692488, + "language_loss": 0.81261861, + "learning_rate": 1.2436826285930985e-06, + "loss": 0.82890922, + "num_input_tokens_seen": 227628175, + "router_z_loss_clip": 2.34960938, + "router_z_loss_mlp": 0.29223633, + "step": 10557, + "time_per_iteration": 2.7303178310394287 + }, + { + "auxiliary_loss_clip": 0.01333366, + "auxiliary_loss_mlp": 0.00280541, + "balance_loss_clip": 1.10345936, + "balance_loss_mlp": 0.25305146, + "epoch": 0.6347813016684203, + "flos": 15742735966080.0, + "grad_norm": 127.9082046193094, + "language_loss": 0.77378857, + "learning_rate": 1.2433221023709002e-06, + "loss": 0.7899276, + "num_input_tokens_seen": 227645330, + "router_z_loss_clip": 2.30078125, + "router_z_loss_mlp": 0.27490234, + "step": 10558, + "time_per_iteration": 2.681154251098633 + }, + { + "auxiliary_loss_clip": 0.01314243, + "auxiliary_loss_mlp": 0.00270705, + "balance_loss_clip": 1.09096694, + "balance_loss_mlp": 0.24202371, + "epoch": 0.6348414249210882, + "flos": 21464777370240.0, + "grad_norm": 59.874803885321356, + "language_loss": 0.8296538, + "learning_rate": 1.2429616048420031e-06, + "loss": 0.84550333, + "num_input_tokens_seen": 227665250, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.28710938, + "step": 10559, + "time_per_iteration": 2.7034597396850586 + }, + { + "auxiliary_loss_clip": 0.01332005, + "auxiliary_loss_mlp": 0.00291362, + "balance_loss_clip": 1.10459042, + "balance_loss_mlp": 0.25942561, + "epoch": 0.6349015481737562, + "flos": 21653057485440.0, + "grad_norm": 38.94226085146236, + "language_loss": 0.78078067, + "learning_rate": 1.242601136020078e-06, + "loss": 0.79701436, + "num_input_tokens_seen": 227685070, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.31896973, + "step": 10560, + "time_per_iteration": 2.81333065032959 + }, + { + "auxiliary_loss_clip": 0.0131792, + "auxiliary_loss_mlp": 0.00273001, + "balance_loss_clip": 1.0946579, + "balance_loss_mlp": 0.24748977, + "epoch": 0.6349616714264241, + "flos": 22194984954240.0, + "grad_norm": 5.326271664519876, + "language_loss": 0.84202111, + "learning_rate": 1.2422406959187939e-06, + "loss": 0.85793036, + "num_input_tokens_seen": 227704430, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.25549316, + "step": 10561, + "time_per_iteration": 2.714729070663452 + }, + { + "auxiliary_loss_clip": 0.01337224, + "auxiliary_loss_mlp": 0.00302365, + "balance_loss_clip": 1.10393858, + "balance_loss_mlp": 0.27363545, + "epoch": 0.6350217946790921, + "flos": 25410354814080.0, + "grad_norm": 9.747282608484559, + "language_loss": 0.79002392, + "learning_rate": 1.2418802845518178e-06, + "loss": 0.80641985, + "num_input_tokens_seen": 227724920, + "router_z_loss_clip": 2.3359375, + "router_z_loss_mlp": 0.2878418, + "step": 10562, + "time_per_iteration": 2.722564220428467 + }, + { + "auxiliary_loss_clip": 0.01327346, + "auxiliary_loss_mlp": 0.00267306, + "balance_loss_clip": 1.09910798, + "balance_loss_mlp": 0.23857649, + "epoch": 0.63508191793176, + "flos": 19718944732800.0, + "grad_norm": 105.97214236043743, + "language_loss": 0.88938457, + "learning_rate": 1.2415199019328185e-06, + "loss": 0.90533113, + "num_input_tokens_seen": 227743400, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.28735352, + "step": 10563, + "time_per_iteration": 2.6756014823913574 + }, + { + "auxiliary_loss_clip": 0.0135452, + "auxiliary_loss_mlp": 0.00299684, + "balance_loss_clip": 1.11891019, + "balance_loss_mlp": 0.27056164, + "epoch": 0.6351420411844281, + "flos": 18186923802240.0, + "grad_norm": 4.476276113717623, + "language_loss": 0.88366556, + "learning_rate": 1.2411595480754597e-06, + "loss": 0.90020758, + "num_input_tokens_seen": 227759990, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.29125977, + "step": 10564, + "time_per_iteration": 2.8209893703460693 + }, + { + "auxiliary_loss_clip": 0.01332871, + "auxiliary_loss_mlp": 0.00292022, + "balance_loss_clip": 1.10435045, + "balance_loss_mlp": 0.2624577, + "epoch": 0.6352021644370961, + "flos": 33726511422720.0, + "grad_norm": 6.512906752465071, + "language_loss": 0.79385221, + "learning_rate": 1.240799222993407e-06, + "loss": 0.81010115, + "num_input_tokens_seen": 227780835, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.2956543, + "step": 10565, + "time_per_iteration": 2.8383054733276367 + }, + { + "auxiliary_loss_clip": 0.0131599, + "auxiliary_loss_mlp": 0.00274645, + "balance_loss_clip": 1.09311402, + "balance_loss_mlp": 0.24643975, + "epoch": 0.635262287689764, + "flos": 20374781207040.0, + "grad_norm": 54.93111422183994, + "language_loss": 0.77848387, + "learning_rate": 1.240438926700324e-06, + "loss": 0.7943902, + "num_input_tokens_seen": 227798580, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.28173828, + "step": 10566, + "time_per_iteration": 2.6383228302001953 + }, + { + "auxiliary_loss_clip": 0.01329952, + "auxiliary_loss_mlp": 0.0027112, + "balance_loss_clip": 1.10359693, + "balance_loss_mlp": 0.24421448, + "epoch": 0.635322410942432, + "flos": 27525421307520.0, + "grad_norm": 68.87142713039943, + "language_loss": 0.7506966, + "learning_rate": 1.2400786592098725e-06, + "loss": 0.7667073, + "num_input_tokens_seen": 227819210, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.26904297, + "step": 10567, + "time_per_iteration": 2.7130463123321533 + }, + { + "auxiliary_loss_clip": 0.01349669, + "auxiliary_loss_mlp": 0.00291157, + "balance_loss_clip": 1.11893106, + "balance_loss_mlp": 0.26090193, + "epoch": 0.6353825341950999, + "flos": 21543601766400.0, + "grad_norm": 1541.3643730587303, + "language_loss": 0.92180783, + "learning_rate": 1.2397184205357154e-06, + "loss": 0.93821603, + "num_input_tokens_seen": 227838340, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.30236816, + "step": 10568, + "time_per_iteration": 2.6699957847595215 + }, + { + "auxiliary_loss_clip": 0.01313571, + "auxiliary_loss_mlp": 0.00280484, + "balance_loss_clip": 1.09068072, + "balance_loss_mlp": 0.25368541, + "epoch": 0.635442657447768, + "flos": 31759756185600.0, + "grad_norm": 256.21715968186214, + "language_loss": 0.91819584, + "learning_rate": 1.2393582106915113e-06, + "loss": 0.93413639, + "num_input_tokens_seen": 227859170, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.26794434, + "step": 10569, + "time_per_iteration": 2.7247705459594727 + }, + { + "auxiliary_loss_clip": 0.01308586, + "auxiliary_loss_mlp": 0.00271629, + "balance_loss_clip": 1.08682728, + "balance_loss_mlp": 0.24391288, + "epoch": 0.6355027807004359, + "flos": 19828831415040.0, + "grad_norm": 5.283491377691104, + "language_loss": 0.75400567, + "learning_rate": 1.2389980296909198e-06, + "loss": 0.76980776, + "num_input_tokens_seen": 227878545, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.27685547, + "step": 10570, + "time_per_iteration": 2.6391632556915283 + }, + { + "auxiliary_loss_clip": 0.01319254, + "auxiliary_loss_mlp": 0.00312411, + "balance_loss_clip": 1.09207618, + "balance_loss_mlp": 0.28284726, + "epoch": 0.6355629039531039, + "flos": 30372383324160.0, + "grad_norm": 20.663064496200164, + "language_loss": 0.76403725, + "learning_rate": 1.2386378775476e-06, + "loss": 0.78035384, + "num_input_tokens_seen": 227898875, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.29553223, + "step": 10571, + "time_per_iteration": 4.137019157409668 + }, + { + "auxiliary_loss_clip": 0.01336463, + "auxiliary_loss_mlp": 0.00272592, + "balance_loss_clip": 1.10576677, + "balance_loss_mlp": 0.24717677, + "epoch": 0.6356230272057718, + "flos": 17932065828480.0, + "grad_norm": 62.99466995143967, + "language_loss": 0.78540283, + "learning_rate": 1.2382777542752074e-06, + "loss": 0.80149341, + "num_input_tokens_seen": 227917130, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.25415039, + "step": 10572, + "time_per_iteration": 4.145476818084717 + }, + { + "auxiliary_loss_clip": 0.01313356, + "auxiliary_loss_mlp": 0.00284814, + "balance_loss_clip": 1.09317589, + "balance_loss_mlp": 0.25769374, + "epoch": 0.6356831504584398, + "flos": 25375844822400.0, + "grad_norm": 450.9350439627282, + "language_loss": 0.86173201, + "learning_rate": 1.2379176598873992e-06, + "loss": 0.87771368, + "num_input_tokens_seen": 227939550, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.27111816, + "step": 10573, + "time_per_iteration": 2.7504618167877197 + }, + { + "auxiliary_loss_clip": 0.01325075, + "auxiliary_loss_mlp": 0.00312687, + "balance_loss_clip": 1.10082138, + "balance_loss_mlp": 0.28411299, + "epoch": 0.6357432737111077, + "flos": 46500331720320.0, + "grad_norm": 18.16833877964789, + "language_loss": 0.75729454, + "learning_rate": 1.2375575943978303e-06, + "loss": 0.77367222, + "num_input_tokens_seen": 227962200, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.28564453, + "step": 10574, + "time_per_iteration": 2.8483505249023438 + }, + { + "auxiliary_loss_clip": 0.01328424, + "auxiliary_loss_mlp": 0.00312892, + "balance_loss_clip": 1.10196209, + "balance_loss_mlp": 0.28528342, + "epoch": 0.6358033969637757, + "flos": 17274361847040.0, + "grad_norm": 38.578103353933045, + "language_loss": 0.96183187, + "learning_rate": 1.2371975578201525e-06, + "loss": 0.97824502, + "num_input_tokens_seen": 227979270, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.27612305, + "step": 10575, + "time_per_iteration": 4.047002077102661 + }, + { + "auxiliary_loss_clip": 0.01318665, + "auxiliary_loss_mlp": 0.00287593, + "balance_loss_clip": 1.09834802, + "balance_loss_mlp": 0.26258239, + "epoch": 0.6358635202164437, + "flos": 27125520215040.0, + "grad_norm": 30.785877605115054, + "language_loss": 0.7782805, + "learning_rate": 1.2368375501680204e-06, + "loss": 0.79434311, + "num_input_tokens_seen": 228000550, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.25036621, + "step": 10576, + "time_per_iteration": 2.701847553253174 + }, + { + "auxiliary_loss_clip": 0.01320756, + "auxiliary_loss_mlp": 0.00282659, + "balance_loss_clip": 1.0984118, + "balance_loss_mlp": 0.25673103, + "epoch": 0.6359236434691117, + "flos": 27525205825920.0, + "grad_norm": 6.9852240189034065, + "language_loss": 0.74206769, + "learning_rate": 1.236477571455085e-06, + "loss": 0.75810182, + "num_input_tokens_seen": 228022005, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.25939941, + "step": 10577, + "time_per_iteration": 2.7426161766052246 + }, + { + "auxiliary_loss_clip": 0.01324331, + "auxiliary_loss_mlp": 0.00270133, + "balance_loss_clip": 1.10084534, + "balance_loss_mlp": 0.24430029, + "epoch": 0.6359837667217797, + "flos": 39348290989440.0, + "grad_norm": 30.03863631893891, + "language_loss": 0.80647767, + "learning_rate": 1.2361176216949964e-06, + "loss": 0.82242227, + "num_input_tokens_seen": 228043770, + "router_z_loss_clip": 2.23339844, + "router_z_loss_mlp": 0.25830078, + "step": 10578, + "time_per_iteration": 2.835326671600342 + }, + { + "auxiliary_loss_clip": 0.01229762, + "auxiliary_loss_mlp": 0.00033582, + "balance_loss_clip": 1.09323525, + "balance_loss_mlp": 0.02704948, + "epoch": 0.6360438899744476, + "flos": 56413797206400.0, + "grad_norm": 0.6842579444044682, + "language_loss": 0.53678942, + "learning_rate": 1.2357577009014044e-06, + "loss": 0.54942286, + "num_input_tokens_seen": 228104985, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.06542969, + "step": 10579, + "time_per_iteration": 4.5555689334869385 + }, + { + "auxiliary_loss_clip": 0.01315212, + "auxiliary_loss_mlp": 0.00280001, + "balance_loss_clip": 1.08977056, + "balance_loss_mlp": 0.25115204, + "epoch": 0.6361040132271156, + "flos": 24973106555520.0, + "grad_norm": 267.3025372176447, + "language_loss": 0.856583, + "learning_rate": 1.2353978090879568e-06, + "loss": 0.87253517, + "num_input_tokens_seen": 228125620, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.28833008, + "step": 10580, + "time_per_iteration": 2.781122922897339 + }, + { + "auxiliary_loss_clip": 0.01311756, + "auxiliary_loss_mlp": 0.00275417, + "balance_loss_clip": 1.09254336, + "balance_loss_mlp": 0.25006151, + "epoch": 0.6361641364797835, + "flos": 23259198130560.0, + "grad_norm": 21.476537014699993, + "language_loss": 0.73673242, + "learning_rate": 1.235037946268301e-06, + "loss": 0.75260419, + "num_input_tokens_seen": 228143495, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.25354004, + "step": 10581, + "time_per_iteration": 2.684227705001831 + }, + { + "auxiliary_loss_clip": 0.01302435, + "auxiliary_loss_mlp": 0.00292298, + "balance_loss_clip": 1.08379912, + "balance_loss_mlp": 0.26664382, + "epoch": 0.6362242597324516, + "flos": 25994513698560.0, + "grad_norm": 5.141423367266589, + "language_loss": 0.76509702, + "learning_rate": 1.2346781124560828e-06, + "loss": 0.78104436, + "num_input_tokens_seen": 228166500, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.25683594, + "step": 10582, + "time_per_iteration": 2.7454686164855957 + }, + { + "auxiliary_loss_clip": 0.01325981, + "auxiliary_loss_mlp": 0.00295575, + "balance_loss_clip": 1.1028285, + "balance_loss_mlp": 0.26894403, + "epoch": 0.6362843829851195, + "flos": 25703242312320.0, + "grad_norm": 884.9362521624552, + "language_loss": 0.92255235, + "learning_rate": 1.2343183076649473e-06, + "loss": 0.93876791, + "num_input_tokens_seen": 228185325, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.26623535, + "step": 10583, + "time_per_iteration": 2.7152318954467773 + }, + { + "auxiliary_loss_clip": 0.01332179, + "auxiliary_loss_mlp": 0.00281124, + "balance_loss_clip": 1.10757589, + "balance_loss_mlp": 0.25463554, + "epoch": 0.6363445062377875, + "flos": 20522912895360.0, + "grad_norm": 9.329491129970437, + "language_loss": 0.82221699, + "learning_rate": 1.233958531908538e-06, + "loss": 0.83835, + "num_input_tokens_seen": 228204050, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.26525879, + "step": 10584, + "time_per_iteration": 2.793417453765869 + }, + { + "auxiliary_loss_clip": 0.01309238, + "auxiliary_loss_mlp": 0.00304942, + "balance_loss_clip": 1.08440208, + "balance_loss_mlp": 0.27665412, + "epoch": 0.6364046294904554, + "flos": 19463799450240.0, + "grad_norm": 8.385514560613922, + "language_loss": 0.80287683, + "learning_rate": 1.2335987852004985e-06, + "loss": 0.8190186, + "num_input_tokens_seen": 228222430, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.28283691, + "step": 10585, + "time_per_iteration": 2.6669907569885254 + }, + { + "auxiliary_loss_clip": 0.01317462, + "auxiliary_loss_mlp": 0.00322815, + "balance_loss_clip": 1.09626436, + "balance_loss_mlp": 0.29511088, + "epoch": 0.6364647527431234, + "flos": 20995892208000.0, + "grad_norm": 39.665895191565184, + "language_loss": 0.89501727, + "learning_rate": 1.2332390675544697e-06, + "loss": 0.91142005, + "num_input_tokens_seen": 228241925, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.27709961, + "step": 10586, + "time_per_iteration": 2.6465060710906982 + }, + { + "auxiliary_loss_clip": 0.01299748, + "auxiliary_loss_mlp": 0.00307631, + "balance_loss_clip": 1.08604169, + "balance_loss_mlp": 0.28209648, + "epoch": 0.6365248759957913, + "flos": 25770789838080.0, + "grad_norm": 142.703178435658, + "language_loss": 0.78477156, + "learning_rate": 1.2328793789840918e-06, + "loss": 0.80084538, + "num_input_tokens_seen": 228262535, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.25561523, + "step": 10587, + "time_per_iteration": 2.691361904144287 + }, + { + "auxiliary_loss_clip": 0.01321178, + "auxiliary_loss_mlp": 0.00304628, + "balance_loss_clip": 1.09844053, + "balance_loss_mlp": 0.27681684, + "epoch": 0.6365849992484593, + "flos": 22455589104000.0, + "grad_norm": 87.46129068108255, + "language_loss": 0.83905983, + "learning_rate": 1.2325197195030058e-06, + "loss": 0.85531783, + "num_input_tokens_seen": 228281340, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.27819824, + "step": 10588, + "time_per_iteration": 2.675084114074707 + }, + { + "auxiliary_loss_clip": 0.01302388, + "auxiliary_loss_mlp": 0.00302321, + "balance_loss_clip": 1.08493948, + "balance_loss_mlp": 0.27445, + "epoch": 0.6366451225011273, + "flos": 19025689265280.0, + "grad_norm": 8.929159192450316, + "language_loss": 0.84385502, + "learning_rate": 1.2321600891248478e-06, + "loss": 0.85990214, + "num_input_tokens_seen": 228300865, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.27856445, + "step": 10589, + "time_per_iteration": 2.6993770599365234 + }, + { + "auxiliary_loss_clip": 0.01325631, + "auxiliary_loss_mlp": 0.00310442, + "balance_loss_clip": 1.10555232, + "balance_loss_mlp": 0.28302419, + "epoch": 0.6367052457537953, + "flos": 25228395492480.0, + "grad_norm": 34.248760932962085, + "language_loss": 0.77532065, + "learning_rate": 1.231800487863257e-06, + "loss": 0.79168141, + "num_input_tokens_seen": 228320815, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.27404785, + "step": 10590, + "time_per_iteration": 2.7116940021514893 + }, + { + "auxiliary_loss_clip": 0.01322416, + "auxiliary_loss_mlp": 0.00303556, + "balance_loss_clip": 1.09325576, + "balance_loss_mlp": 0.27537453, + "epoch": 0.6367653690064633, + "flos": 19208438686080.0, + "grad_norm": 9.310516476267823, + "language_loss": 0.86528552, + "learning_rate": 1.2314409157318685e-06, + "loss": 0.88154525, + "num_input_tokens_seen": 228339065, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.28173828, + "step": 10591, + "time_per_iteration": 2.64190411567688 + }, + { + "auxiliary_loss_clip": 0.01312331, + "auxiliary_loss_mlp": 0.00312402, + "balance_loss_clip": 1.09363484, + "balance_loss_mlp": 0.2859135, + "epoch": 0.6368254922591312, + "flos": 23546806329600.0, + "grad_norm": 24.679587247814577, + "language_loss": 0.95076942, + "learning_rate": 1.231081372744317e-06, + "loss": 0.9670167, + "num_input_tokens_seen": 228359210, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.26489258, + "step": 10592, + "time_per_iteration": 2.7151992321014404 + }, + { + "auxiliary_loss_clip": 0.01301811, + "auxiliary_loss_mlp": 0.00322995, + "balance_loss_clip": 1.08371139, + "balance_loss_mlp": 0.294909, + "epoch": 0.6368856155117992, + "flos": 26467313443200.0, + "grad_norm": 15.008840950403757, + "language_loss": 0.73125017, + "learning_rate": 1.2307218589142376e-06, + "loss": 0.74749821, + "num_input_tokens_seen": 228379630, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.28100586, + "step": 10593, + "time_per_iteration": 2.702620506286621 + }, + { + "auxiliary_loss_clip": 0.0130733, + "auxiliary_loss_mlp": 0.00297479, + "balance_loss_clip": 1.08614588, + "balance_loss_mlp": 0.27078795, + "epoch": 0.6369457387644671, + "flos": 33692432394240.0, + "grad_norm": 48.75733659355035, + "language_loss": 0.71147186, + "learning_rate": 1.2303623742552618e-06, + "loss": 0.72751993, + "num_input_tokens_seen": 228401410, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.26660156, + "step": 10594, + "time_per_iteration": 2.78024959564209 + }, + { + "auxiliary_loss_clip": 0.01201549, + "auxiliary_loss_mlp": 0.00097548, + "balance_loss_clip": 1.06060171, + "balance_loss_mlp": 0.09006175, + "epoch": 0.6370058620171352, + "flos": 70908600908160.0, + "grad_norm": 0.7380371548155644, + "language_loss": 0.54037547, + "learning_rate": 1.230002918781022e-06, + "loss": 0.55336642, + "num_input_tokens_seen": 228470335, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.07470703, + "step": 10595, + "time_per_iteration": 3.2539753913879395 + }, + { + "auxiliary_loss_clip": 0.01329838, + "auxiliary_loss_mlp": 0.00295213, + "balance_loss_clip": 1.10137868, + "balance_loss_mlp": 0.26677004, + "epoch": 0.6370659852698031, + "flos": 21141940907520.0, + "grad_norm": 4.495530354474944, + "language_loss": 0.74232197, + "learning_rate": 1.2296434925051493e-06, + "loss": 0.75857258, + "num_input_tokens_seen": 228490765, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.28442383, + "step": 10596, + "time_per_iteration": 2.6934781074523926 + }, + { + "auxiliary_loss_clip": 0.01316101, + "auxiliary_loss_mlp": 0.00294426, + "balance_loss_clip": 1.09574032, + "balance_loss_mlp": 0.26618522, + "epoch": 0.6371261085224711, + "flos": 20193288762240.0, + "grad_norm": 121.13850350845412, + "language_loss": 0.89264911, + "learning_rate": 1.2292840954412718e-06, + "loss": 0.90875441, + "num_input_tokens_seen": 228509700, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.28210449, + "step": 10597, + "time_per_iteration": 2.7051053047180176 + }, + { + "auxiliary_loss_clip": 0.01347598, + "auxiliary_loss_mlp": 0.00295189, + "balance_loss_clip": 1.11183405, + "balance_loss_mlp": 0.26926091, + "epoch": 0.637186231775139, + "flos": 19683536901120.0, + "grad_norm": 19.48611780179347, + "language_loss": 0.79286706, + "learning_rate": 1.2289247276030189e-06, + "loss": 0.80929494, + "num_input_tokens_seen": 228529050, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.25927734, + "step": 10598, + "time_per_iteration": 2.7356369495391846 + }, + { + "auxiliary_loss_clip": 0.01344298, + "auxiliary_loss_mlp": 0.00322295, + "balance_loss_clip": 1.1100204, + "balance_loss_mlp": 0.29289785, + "epoch": 0.637246355027807, + "flos": 13071196995840.0, + "grad_norm": 14.526710003121748, + "language_loss": 0.75372213, + "learning_rate": 1.2285653890040176e-06, + "loss": 0.77038813, + "num_input_tokens_seen": 228544665, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.29394531, + "step": 10599, + "time_per_iteration": 2.6622183322906494 + }, + { + "auxiliary_loss_clip": 0.01351309, + "auxiliary_loss_mlp": 0.00341526, + "balance_loss_clip": 1.11247814, + "balance_loss_mlp": 0.31005466, + "epoch": 0.6373064782804749, + "flos": 18222654856320.0, + "grad_norm": 51.44489662200496, + "language_loss": 0.89542979, + "learning_rate": 1.2282060796578942e-06, + "loss": 0.91235811, + "num_input_tokens_seen": 228562060, + "router_z_loss_clip": 2.390625, + "router_z_loss_mlp": 0.31494141, + "step": 10600, + "time_per_iteration": 2.6249659061431885 + }, + { + "auxiliary_loss_clip": 0.01326175, + "auxiliary_loss_mlp": 0.00320362, + "balance_loss_clip": 1.10197735, + "balance_loss_mlp": 0.29215717, + "epoch": 0.637366601533143, + "flos": 24498475217280.0, + "grad_norm": 101.74078541289214, + "language_loss": 0.84652841, + "learning_rate": 1.2278467995782732e-06, + "loss": 0.86299378, + "num_input_tokens_seen": 228582550, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.28186035, + "step": 10601, + "time_per_iteration": 2.7083680629730225 + }, + { + "auxiliary_loss_clip": 0.01337833, + "auxiliary_loss_mlp": 0.003199, + "balance_loss_clip": 1.10256243, + "balance_loss_mlp": 0.2898595, + "epoch": 0.6374267247858109, + "flos": 26359042872960.0, + "grad_norm": 23.299745438839214, + "language_loss": 0.76056683, + "learning_rate": 1.2274875487787797e-06, + "loss": 0.77714419, + "num_input_tokens_seen": 228604960, + "router_z_loss_clip": 2.34960938, + "router_z_loss_mlp": 0.30029297, + "step": 10602, + "time_per_iteration": 2.7213990688323975 + }, + { + "auxiliary_loss_clip": 0.01320383, + "auxiliary_loss_mlp": 0.0032622, + "balance_loss_clip": 1.09391093, + "balance_loss_mlp": 0.29794407, + "epoch": 0.6374868480384789, + "flos": 20371728551040.0, + "grad_norm": 5.622948204089622, + "language_loss": 0.84514511, + "learning_rate": 1.2271283272730354e-06, + "loss": 0.86161113, + "num_input_tokens_seen": 228622195, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.2833252, + "step": 10603, + "time_per_iteration": 2.7457683086395264 + }, + { + "auxiliary_loss_clip": 0.01322071, + "auxiliary_loss_mlp": 0.00309544, + "balance_loss_clip": 1.09786975, + "balance_loss_mlp": 0.28117183, + "epoch": 0.6375469712911469, + "flos": 20996251344000.0, + "grad_norm": 14.071101774520777, + "language_loss": 0.86968106, + "learning_rate": 1.2267691350746621e-06, + "loss": 0.88599718, + "num_input_tokens_seen": 228639735, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.28344727, + "step": 10604, + "time_per_iteration": 2.6339969635009766 + }, + { + "auxiliary_loss_clip": 0.01336357, + "auxiliary_loss_mlp": 0.00320454, + "balance_loss_clip": 1.10180521, + "balance_loss_mlp": 0.28912613, + "epoch": 0.6376070945438148, + "flos": 19715748422400.0, + "grad_norm": 181.5412169242318, + "language_loss": 0.84276414, + "learning_rate": 1.226409972197281e-06, + "loss": 0.85933226, + "num_input_tokens_seen": 228658195, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.31298828, + "step": 10605, + "time_per_iteration": 2.702362298965454 + }, + { + "auxiliary_loss_clip": 0.01350643, + "auxiliary_loss_mlp": 0.00321684, + "balance_loss_clip": 1.1133796, + "balance_loss_mlp": 0.29002243, + "epoch": 0.6376672177964828, + "flos": 21506757390720.0, + "grad_norm": 12.502249458799074, + "language_loss": 0.73801053, + "learning_rate": 1.2260508386545106e-06, + "loss": 0.7547338, + "num_input_tokens_seen": 228677415, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.31665039, + "step": 10606, + "time_per_iteration": 2.7296223640441895 + }, + { + "auxiliary_loss_clip": 0.01315504, + "auxiliary_loss_mlp": 0.00319205, + "balance_loss_clip": 1.0948875, + "balance_loss_mlp": 0.29181054, + "epoch": 0.6377273410491507, + "flos": 18843873598080.0, + "grad_norm": 6098.390928700006, + "language_loss": 0.8364442, + "learning_rate": 1.225691734459971e-06, + "loss": 0.85279125, + "num_input_tokens_seen": 228696450, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.27392578, + "step": 10607, + "time_per_iteration": 2.647829055786133 + }, + { + "auxiliary_loss_clip": 0.01342008, + "auxiliary_loss_mlp": 0.00322227, + "balance_loss_clip": 1.11412847, + "balance_loss_mlp": 0.29331884, + "epoch": 0.6377874643018188, + "flos": 53062970181120.0, + "grad_norm": 3.4758798882768303, + "language_loss": 0.72130346, + "learning_rate": 1.225332659627278e-06, + "loss": 0.7379458, + "num_input_tokens_seen": 228721600, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.2890625, + "step": 10608, + "time_per_iteration": 2.9333865642547607 + }, + { + "auxiliary_loss_clip": 0.01216183, + "auxiliary_loss_mlp": 0.00062506, + "balance_loss_clip": 1.07641888, + "balance_loss_mlp": 0.0566887, + "epoch": 0.6378475875544867, + "flos": 65135026465920.0, + "grad_norm": 0.7314460600515144, + "language_loss": 0.51699847, + "learning_rate": 1.2249736141700475e-06, + "loss": 0.52978534, + "num_input_tokens_seen": 228784535, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.05810547, + "step": 10609, + "time_per_iteration": 3.1176838874816895 + }, + { + "auxiliary_loss_clip": 0.0132074, + "auxiliary_loss_mlp": 0.00296537, + "balance_loss_clip": 1.09488153, + "balance_loss_mlp": 0.270549, + "epoch": 0.6379077108071547, + "flos": 23002759958400.0, + "grad_norm": 65.1116446195222, + "language_loss": 0.82907069, + "learning_rate": 1.2246145981018965e-06, + "loss": 0.84524345, + "num_input_tokens_seen": 228804110, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.25964355, + "step": 10610, + "time_per_iteration": 2.832923412322998 + }, + { + "auxiliary_loss_clip": 0.01216455, + "auxiliary_loss_mlp": 0.00092342, + "balance_loss_clip": 1.07738543, + "balance_loss_mlp": 0.08561824, + "epoch": 0.6379678340598226, + "flos": 67601947610880.0, + "grad_norm": 0.8208219875041748, + "language_loss": 0.6206125, + "learning_rate": 1.2242556114364364e-06, + "loss": 0.63370049, + "num_input_tokens_seen": 228867705, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.06738281, + "step": 10611, + "time_per_iteration": 3.158858299255371 + }, + { + "auxiliary_loss_clip": 0.01333591, + "auxiliary_loss_mlp": 0.00313696, + "balance_loss_clip": 1.10324073, + "balance_loss_mlp": 0.28522837, + "epoch": 0.6380279573124906, + "flos": 29680061610240.0, + "grad_norm": 10.045676310935978, + "language_loss": 0.79281938, + "learning_rate": 1.223896654187282e-06, + "loss": 0.8092922, + "num_input_tokens_seen": 228889215, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.28491211, + "step": 10612, + "time_per_iteration": 2.7353837490081787 + }, + { + "auxiliary_loss_clip": 0.01208946, + "auxiliary_loss_mlp": 0.00065819, + "balance_loss_clip": 1.06953239, + "balance_loss_mlp": 0.05795102, + "epoch": 0.6380880805651585, + "flos": 66484046580480.0, + "grad_norm": 0.7081144700903569, + "language_loss": 0.56981945, + "learning_rate": 1.2235377263680446e-06, + "loss": 0.58256716, + "num_input_tokens_seen": 228948465, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.07861328, + "step": 10613, + "time_per_iteration": 4.428763389587402 + }, + { + "auxiliary_loss_clip": 0.01343138, + "auxiliary_loss_mlp": 0.00294512, + "balance_loss_clip": 1.11144853, + "balance_loss_mlp": 0.26646221, + "epoch": 0.6381482038178266, + "flos": 23914998691200.0, + "grad_norm": 5.061211440882698, + "language_loss": 0.81358731, + "learning_rate": 1.2231788279923334e-06, + "loss": 0.8299638, + "num_input_tokens_seen": 228967955, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.28051758, + "step": 10614, + "time_per_iteration": 4.1110756397247314 + }, + { + "auxiliary_loss_clip": 0.01344319, + "auxiliary_loss_mlp": 0.00316979, + "balance_loss_clip": 1.10902727, + "balance_loss_mlp": 0.2870571, + "epoch": 0.6382083270704945, + "flos": 24243042625920.0, + "grad_norm": 7.349668164184247, + "language_loss": 0.86890954, + "learning_rate": 1.2228199590737599e-06, + "loss": 0.88552248, + "num_input_tokens_seen": 228985495, + "router_z_loss_clip": 2.34960938, + "router_z_loss_mlp": 0.29907227, + "step": 10615, + "time_per_iteration": 2.653588056564331 + }, + { + "auxiliary_loss_clip": 0.01219515, + "auxiliary_loss_mlp": 0.00059782, + "balance_loss_clip": 1.07779527, + "balance_loss_mlp": 0.05305844, + "epoch": 0.6382684503231625, + "flos": 70775552931840.0, + "grad_norm": 0.7543205484595451, + "language_loss": 0.54915917, + "learning_rate": 1.2224611196259305e-06, + "loss": 0.56195211, + "num_input_tokens_seen": 229052995, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.06738281, + "step": 10616, + "time_per_iteration": 3.2411060333251953 + }, + { + "auxiliary_loss_clip": 0.01347075, + "auxiliary_loss_mlp": 0.00345276, + "balance_loss_clip": 1.11577249, + "balance_loss_mlp": 0.31537852, + "epoch": 0.6383285735758305, + "flos": 16544836621440.0, + "grad_norm": 11.710828436496751, + "language_loss": 0.90828246, + "learning_rate": 1.2221023096624538e-06, + "loss": 0.92520595, + "num_input_tokens_seen": 229071030, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.29833984, + "step": 10617, + "time_per_iteration": 4.006694793701172 + }, + { + "auxiliary_loss_clip": 0.01341521, + "auxiliary_loss_mlp": 0.00336123, + "balance_loss_clip": 1.1041348, + "balance_loss_mlp": 0.30535498, + "epoch": 0.6383886968284984, + "flos": 14427651225600.0, + "grad_norm": 8.715667214244343, + "language_loss": 0.94235021, + "learning_rate": 1.221743529196936e-06, + "loss": 0.95912671, + "num_input_tokens_seen": 229088275, + "router_z_loss_clip": 2.37695312, + "router_z_loss_mlp": 0.30761719, + "step": 10618, + "time_per_iteration": 2.648388147354126 + }, + { + "auxiliary_loss_clip": 0.01324167, + "auxiliary_loss_mlp": 0.0033205, + "balance_loss_clip": 1.09509158, + "balance_loss_mlp": 0.30227152, + "epoch": 0.6384488200811664, + "flos": 17929659617280.0, + "grad_norm": 3.5579882786992916, + "language_loss": 0.82357633, + "learning_rate": 1.2213847782429806e-06, + "loss": 0.84013855, + "num_input_tokens_seen": 229105190, + "router_z_loss_clip": 2.29101562, + "router_z_loss_mlp": 0.29772949, + "step": 10619, + "time_per_iteration": 2.6519646644592285 + }, + { + "auxiliary_loss_clip": 0.01348496, + "auxiliary_loss_mlp": 0.00337648, + "balance_loss_clip": 1.10785949, + "balance_loss_mlp": 0.30699962, + "epoch": 0.6385089433338343, + "flos": 18515578268160.0, + "grad_norm": 10.870535672394851, + "language_loss": 0.83973491, + "learning_rate": 1.221026056814193e-06, + "loss": 0.85659635, + "num_input_tokens_seen": 229122290, + "router_z_loss_clip": 2.40820312, + "router_z_loss_mlp": 0.30664062, + "step": 10620, + "time_per_iteration": 2.656034469604492 + }, + { + "auxiliary_loss_clip": 0.01358901, + "auxiliary_loss_mlp": 0.00320406, + "balance_loss_clip": 1.1215564, + "balance_loss_mlp": 0.29236776, + "epoch": 0.6385690665865024, + "flos": 24753620499840.0, + "grad_norm": 45.499741641004306, + "language_loss": 0.80624843, + "learning_rate": 1.2206673649241752e-06, + "loss": 0.8230415, + "num_input_tokens_seen": 229141620, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.28027344, + "step": 10621, + "time_per_iteration": 4.113484621047974 + }, + { + "auxiliary_loss_clip": 0.01336078, + "auxiliary_loss_mlp": 0.00302662, + "balance_loss_clip": 1.10585892, + "balance_loss_mlp": 0.27414674, + "epoch": 0.6386291898391703, + "flos": 20120569678080.0, + "grad_norm": 98.95786076539129, + "language_loss": 0.83346635, + "learning_rate": 1.220308702586529e-06, + "loss": 0.84985375, + "num_input_tokens_seen": 229161570, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.28540039, + "step": 10622, + "time_per_iteration": 2.648211717605591 + }, + { + "auxiliary_loss_clip": 0.01330546, + "auxiliary_loss_mlp": 0.00278665, + "balance_loss_clip": 1.10214043, + "balance_loss_mlp": 0.25122309, + "epoch": 0.6386893130918383, + "flos": 16867278034560.0, + "grad_norm": 262.04378774587144, + "language_loss": 0.81171858, + "learning_rate": 1.2199500698148546e-06, + "loss": 0.8278107, + "num_input_tokens_seen": 229178465, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.2746582, + "step": 10623, + "time_per_iteration": 2.7018375396728516 + }, + { + "auxiliary_loss_clip": 0.01317438, + "auxiliary_loss_mlp": 0.00315886, + "balance_loss_clip": 1.09532583, + "balance_loss_mlp": 0.28890878, + "epoch": 0.6387494363445062, + "flos": 22966274718720.0, + "grad_norm": 2.5337377322700347, + "language_loss": 0.81829232, + "learning_rate": 1.2195914666227527e-06, + "loss": 0.8346256, + "num_input_tokens_seen": 229198975, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.27001953, + "step": 10624, + "time_per_iteration": 2.7024853229522705 + }, + { + "auxiliary_loss_clip": 0.01349868, + "auxiliary_loss_mlp": 0.00308223, + "balance_loss_clip": 1.11360657, + "balance_loss_mlp": 0.27991036, + "epoch": 0.6388095595971742, + "flos": 22857716839680.0, + "grad_norm": 9.832380484943643, + "language_loss": 0.88159788, + "learning_rate": 1.21923289302382e-06, + "loss": 0.89817882, + "num_input_tokens_seen": 229218825, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.28308105, + "step": 10625, + "time_per_iteration": 2.6761586666107178 + }, + { + "auxiliary_loss_clip": 0.01349081, + "auxiliary_loss_mlp": 0.00314837, + "balance_loss_clip": 1.11456025, + "balance_loss_mlp": 0.28309143, + "epoch": 0.6388696828498421, + "flos": 17311529445120.0, + "grad_norm": 13.019868406840951, + "language_loss": 0.80655426, + "learning_rate": 1.218874349031654e-06, + "loss": 0.82319343, + "num_input_tokens_seen": 229236060, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.31750488, + "step": 10626, + "time_per_iteration": 2.681581735610962 + }, + { + "auxiliary_loss_clip": 0.01350904, + "auxiliary_loss_mlp": 0.00308505, + "balance_loss_clip": 1.11415482, + "balance_loss_mlp": 0.27913171, + "epoch": 0.6389298061025102, + "flos": 17128636369920.0, + "grad_norm": 28.81007041173824, + "language_loss": 0.79676735, + "learning_rate": 1.2185158346598517e-06, + "loss": 0.81336153, + "num_input_tokens_seen": 229255160, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.29394531, + "step": 10627, + "time_per_iteration": 2.60990571975708 + }, + { + "auxiliary_loss_clip": 0.01347028, + "auxiliary_loss_mlp": 0.00310295, + "balance_loss_clip": 1.1088109, + "balance_loss_mlp": 0.27949154, + "epoch": 0.6389899293551781, + "flos": 27710971989120.0, + "grad_norm": 16.262848661465867, + "language_loss": 0.75712878, + "learning_rate": 1.2181573499220064e-06, + "loss": 0.77370203, + "num_input_tokens_seen": 229278705, + "router_z_loss_clip": 2.37695312, + "router_z_loss_mlp": 0.30786133, + "step": 10628, + "time_per_iteration": 2.6972618103027344 + }, + { + "auxiliary_loss_clip": 0.01317486, + "auxiliary_loss_mlp": 0.00319573, + "balance_loss_clip": 1.09477806, + "balance_loss_mlp": 0.29171357, + "epoch": 0.6390500526078461, + "flos": 21215701486080.0, + "grad_norm": 22.977909727304432, + "language_loss": 0.73906642, + "learning_rate": 1.2177988948317135e-06, + "loss": 0.75543702, + "num_input_tokens_seen": 229299990, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.27832031, + "step": 10629, + "time_per_iteration": 2.73294997215271 + }, + { + "auxiliary_loss_clip": 0.0136349, + "auxiliary_loss_mlp": 0.00358543, + "balance_loss_clip": 1.11489511, + "balance_loss_mlp": 0.32452106, + "epoch": 0.6391101758605141, + "flos": 21581056673280.0, + "grad_norm": 25.995776257919722, + "language_loss": 0.84266829, + "learning_rate": 1.2174404694025646e-06, + "loss": 0.85988867, + "num_input_tokens_seen": 229319230, + "router_z_loss_clip": 2.48242188, + "router_z_loss_mlp": 0.34033203, + "step": 10630, + "time_per_iteration": 2.695523738861084 + }, + { + "auxiliary_loss_clip": 0.01315178, + "auxiliary_loss_mlp": 0.00336247, + "balance_loss_clip": 1.09144425, + "balance_loss_mlp": 0.30738604, + "epoch": 0.639170299113182, + "flos": 19900473091200.0, + "grad_norm": 26.608633351860895, + "language_loss": 0.75109178, + "learning_rate": 1.2170820736481511e-06, + "loss": 0.76760602, + "num_input_tokens_seen": 229338600, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.28869629, + "step": 10631, + "time_per_iteration": 2.643413782119751 + }, + { + "auxiliary_loss_clip": 0.01213964, + "auxiliary_loss_mlp": 0.00116791, + "balance_loss_clip": 1.06785309, + "balance_loss_mlp": 0.10754063, + "epoch": 0.63923042236585, + "flos": 69877604833920.0, + "grad_norm": 0.7548304968797463, + "language_loss": 0.62241942, + "learning_rate": 1.2167237075820646e-06, + "loss": 0.63572693, + "num_input_tokens_seen": 229402420, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.09228516, + "step": 10632, + "time_per_iteration": 3.222689151763916 + }, + { + "auxiliary_loss_clip": 0.01328005, + "auxiliary_loss_mlp": 0.00318873, + "balance_loss_clip": 1.09855294, + "balance_loss_mlp": 0.29051358, + "epoch": 0.639290545618518, + "flos": 22674823764480.0, + "grad_norm": 11.163677810233947, + "language_loss": 0.74276173, + "learning_rate": 1.216365371217893e-06, + "loss": 0.75923049, + "num_input_tokens_seen": 229419185, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.28344727, + "step": 10633, + "time_per_iteration": 2.6719958782196045 + }, + { + "auxiliary_loss_clip": 0.01321772, + "auxiliary_loss_mlp": 0.00324685, + "balance_loss_clip": 1.09514141, + "balance_loss_mlp": 0.29698098, + "epoch": 0.639350668871186, + "flos": 19829190551040.0, + "grad_norm": 5.594006695077692, + "language_loss": 0.89060926, + "learning_rate": 1.216007064569225e-06, + "loss": 0.90707392, + "num_input_tokens_seen": 229436735, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.27722168, + "step": 10634, + "time_per_iteration": 2.715884208679199 + }, + { + "auxiliary_loss_clip": 0.01357959, + "auxiliary_loss_mlp": 0.00328902, + "balance_loss_clip": 1.12123692, + "balance_loss_mlp": 0.2998988, + "epoch": 0.6394107921238539, + "flos": 20553328736640.0, + "grad_norm": 76.91496261733667, + "language_loss": 0.82268906, + "learning_rate": 1.2156487876496483e-06, + "loss": 0.83955771, + "num_input_tokens_seen": 229455595, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.2902832, + "step": 10635, + "time_per_iteration": 2.680143117904663 + }, + { + "auxiliary_loss_clip": 0.01322751, + "auxiliary_loss_mlp": 0.00338067, + "balance_loss_clip": 1.09458733, + "balance_loss_mlp": 0.30919451, + "epoch": 0.6394709153765219, + "flos": 25774991729280.0, + "grad_norm": 23.107799391006544, + "language_loss": 0.77259934, + "learning_rate": 1.2152905404727475e-06, + "loss": 0.78920758, + "num_input_tokens_seen": 229476230, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.28857422, + "step": 10636, + "time_per_iteration": 2.693263530731201 + }, + { + "auxiliary_loss_clip": 0.0134059, + "auxiliary_loss_mlp": 0.00344051, + "balance_loss_clip": 1.10173059, + "balance_loss_mlp": 0.31305683, + "epoch": 0.6395310386291898, + "flos": 17530153574400.0, + "grad_norm": 16.717493452812853, + "language_loss": 0.81575036, + "learning_rate": 1.2149323230521085e-06, + "loss": 0.83259672, + "num_input_tokens_seen": 229494300, + "router_z_loss_clip": 2.38867188, + "router_z_loss_mlp": 0.31030273, + "step": 10637, + "time_per_iteration": 2.639697551727295 + }, + { + "auxiliary_loss_clip": 0.01328877, + "auxiliary_loss_mlp": 0.00371586, + "balance_loss_clip": 1.09828448, + "balance_loss_mlp": 0.33928025, + "epoch": 0.6395911618818578, + "flos": 18588225525120.0, + "grad_norm": 27.74804647802693, + "language_loss": 0.85659611, + "learning_rate": 1.2145741354013143e-06, + "loss": 0.87360078, + "num_input_tokens_seen": 229512985, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.32324219, + "step": 10638, + "time_per_iteration": 2.633134365081787 + }, + { + "auxiliary_loss_clip": 0.01349738, + "auxiliary_loss_mlp": 0.00307842, + "balance_loss_clip": 1.11145258, + "balance_loss_mlp": 0.27639496, + "epoch": 0.6396512851345257, + "flos": 28366557068160.0, + "grad_norm": 19.49472039380725, + "language_loss": 0.8831014, + "learning_rate": 1.2142159775339478e-06, + "loss": 0.89967722, + "num_input_tokens_seen": 229534270, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.31445312, + "step": 10639, + "time_per_iteration": 2.724376678466797 + }, + { + "auxiliary_loss_clip": 0.01224251, + "auxiliary_loss_mlp": 0.00158425, + "balance_loss_clip": 1.07620311, + "balance_loss_mlp": 0.14712372, + "epoch": 0.6397114083871938, + "flos": 70724307202560.0, + "grad_norm": 0.8234453156705953, + "language_loss": 0.5838567, + "learning_rate": 1.21385784946359e-06, + "loss": 0.59768343, + "num_input_tokens_seen": 229596455, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.11279297, + "step": 10640, + "time_per_iteration": 3.137103319168091 + }, + { + "auxiliary_loss_clip": 0.01316726, + "auxiliary_loss_mlp": 0.00334964, + "balance_loss_clip": 1.09166861, + "balance_loss_mlp": 0.30664015, + "epoch": 0.6397715316398617, + "flos": 18142537570560.0, + "grad_norm": 15.26380223263372, + "language_loss": 0.83760935, + "learning_rate": 1.2134997512038215e-06, + "loss": 0.85412621, + "num_input_tokens_seen": 229612860, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.28320312, + "step": 10641, + "time_per_iteration": 2.6568174362182617 + }, + { + "auxiliary_loss_clip": 0.01337911, + "auxiliary_loss_mlp": 0.00321641, + "balance_loss_clip": 1.09575891, + "balance_loss_mlp": 0.29249439, + "epoch": 0.6398316548925297, + "flos": 25739512070400.0, + "grad_norm": 114.2933621908355, + "language_loss": 0.7285403, + "learning_rate": 1.2131416827682209e-06, + "loss": 0.74513578, + "num_input_tokens_seen": 229633960, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.29162598, + "step": 10642, + "time_per_iteration": 2.7038278579711914 + }, + { + "auxiliary_loss_clip": 0.01222902, + "auxiliary_loss_mlp": 0.00116077, + "balance_loss_clip": 1.07788885, + "balance_loss_mlp": 0.10854261, + "epoch": 0.6398917781451977, + "flos": 71214234756480.0, + "grad_norm": 1.1274009292353036, + "language_loss": 0.55601174, + "learning_rate": 1.2127836441703667e-06, + "loss": 0.5694015, + "num_input_tokens_seen": 229686730, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.07519531, + "step": 10643, + "time_per_iteration": 3.1127641201019287 + }, + { + "auxiliary_loss_clip": 0.01351902, + "auxiliary_loss_mlp": 0.00342855, + "balance_loss_clip": 1.11119437, + "balance_loss_mlp": 0.31016749, + "epoch": 0.6399519013978656, + "flos": 20521835487360.0, + "grad_norm": 408255.3123770578, + "language_loss": 0.8417291, + "learning_rate": 1.2124256354238358e-06, + "loss": 0.85867667, + "num_input_tokens_seen": 229704800, + "router_z_loss_clip": 2.40234375, + "router_z_loss_mlp": 0.32666016, + "step": 10644, + "time_per_iteration": 2.6148338317871094 + }, + { + "auxiliary_loss_clip": 0.0135124, + "auxiliary_loss_mlp": 0.00329187, + "balance_loss_clip": 1.11523223, + "balance_loss_mlp": 0.29850292, + "epoch": 0.6400120246505336, + "flos": 24460840742400.0, + "grad_norm": 170.29372487351387, + "language_loss": 0.87225366, + "learning_rate": 1.212067656542203e-06, + "loss": 0.88905799, + "num_input_tokens_seen": 229725265, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.30688477, + "step": 10645, + "time_per_iteration": 2.7186522483825684 + }, + { + "auxiliary_loss_clip": 0.01337389, + "auxiliary_loss_mlp": 0.00333971, + "balance_loss_clip": 1.09449184, + "balance_loss_mlp": 0.30142692, + "epoch": 0.6400721479032015, + "flos": 28366090191360.0, + "grad_norm": 100.64132652135164, + "language_loss": 0.83293349, + "learning_rate": 1.2117097075390447e-06, + "loss": 0.8496471, + "num_input_tokens_seen": 229744840, + "router_z_loss_clip": 2.4296875, + "router_z_loss_mlp": 0.32543945, + "step": 10646, + "time_per_iteration": 2.72652530670166 + }, + { + "auxiliary_loss_clip": 0.01332968, + "auxiliary_loss_mlp": 0.00325344, + "balance_loss_clip": 1.10186863, + "balance_loss_mlp": 0.29618585, + "epoch": 0.6401322711558696, + "flos": 17816540711040.0, + "grad_norm": 122.39719522474606, + "language_loss": 0.89471602, + "learning_rate": 1.2113517884279327e-06, + "loss": 0.91129917, + "num_input_tokens_seen": 229759095, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.29174805, + "step": 10647, + "time_per_iteration": 2.613165855407715 + }, + { + "auxiliary_loss_clip": 0.01353473, + "auxiliary_loss_mlp": 0.00309361, + "balance_loss_clip": 1.12343788, + "balance_loss_mlp": 0.28264642, + "epoch": 0.6401923944085375, + "flos": 26030855283840.0, + "grad_norm": 7.063916518980274, + "language_loss": 0.80300188, + "learning_rate": 1.2109938992224399e-06, + "loss": 0.81963015, + "num_input_tokens_seen": 229777750, + "router_z_loss_clip": 2.30078125, + "router_z_loss_mlp": 0.26757812, + "step": 10648, + "time_per_iteration": 2.704153537750244 + }, + { + "auxiliary_loss_clip": 0.0129949, + "auxiliary_loss_mlp": 0.00317216, + "balance_loss_clip": 1.07668388, + "balance_loss_mlp": 0.28880799, + "epoch": 0.6402525176612055, + "flos": 23586451966080.0, + "grad_norm": 10.157343880643904, + "language_loss": 0.84352458, + "learning_rate": 1.210636039936138e-06, + "loss": 0.85969162, + "num_input_tokens_seen": 229796785, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.28430176, + "step": 10649, + "time_per_iteration": 2.6895945072174072 + }, + { + "auxiliary_loss_clip": 0.01339647, + "auxiliary_loss_mlp": 0.00291847, + "balance_loss_clip": 1.10395288, + "balance_loss_mlp": 0.26311785, + "epoch": 0.6403126409138734, + "flos": 18041413806720.0, + "grad_norm": 22.804704480556165, + "language_loss": 0.85011041, + "learning_rate": 1.2102782105825956e-06, + "loss": 0.86642528, + "num_input_tokens_seen": 229815425, + "router_z_loss_clip": 2.35546875, + "router_z_loss_mlp": 0.2869873, + "step": 10650, + "time_per_iteration": 2.665221929550171 + }, + { + "auxiliary_loss_clip": 0.01309893, + "auxiliary_loss_mlp": 0.00306538, + "balance_loss_clip": 1.08697009, + "balance_loss_mlp": 0.27855986, + "epoch": 0.6403727641665414, + "flos": 21979485308160.0, + "grad_norm": 139.90529594696036, + "language_loss": 0.76489186, + "learning_rate": 1.2099204111753833e-06, + "loss": 0.78105617, + "num_input_tokens_seen": 229834545, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.27966309, + "step": 10651, + "time_per_iteration": 2.6852502822875977 + }, + { + "auxiliary_loss_clip": 0.01327999, + "auxiliary_loss_mlp": 0.00321454, + "balance_loss_clip": 1.09663665, + "balance_loss_mlp": 0.29056704, + "epoch": 0.6404328874192093, + "flos": 24895539135360.0, + "grad_norm": 7.445664833555336, + "language_loss": 0.74666989, + "learning_rate": 1.2095626417280684e-06, + "loss": 0.76316446, + "num_input_tokens_seen": 229849175, + "router_z_loss_clip": 2.31054688, + "router_z_loss_mlp": 0.30871582, + "step": 10652, + "time_per_iteration": 2.686128616333008 + }, + { + "auxiliary_loss_clip": 0.01353393, + "auxiliary_loss_mlp": 0.00279338, + "balance_loss_clip": 1.11752033, + "balance_loss_mlp": 0.25174123, + "epoch": 0.6404930106718774, + "flos": 17597198309760.0, + "grad_norm": 24.19916343581083, + "language_loss": 0.87889749, + "learning_rate": 1.2092049022542168e-06, + "loss": 0.89522475, + "num_input_tokens_seen": 229865400, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.27587891, + "step": 10653, + "time_per_iteration": 2.592432975769043 + }, + { + "auxiliary_loss_clip": 0.01373041, + "auxiliary_loss_mlp": 0.00351488, + "balance_loss_clip": 1.12196898, + "balance_loss_mlp": 0.31946856, + "epoch": 0.6405531339245453, + "flos": 20157880930560.0, + "grad_norm": 28.96365066113406, + "language_loss": 0.83259159, + "learning_rate": 1.2088471927673952e-06, + "loss": 0.84983689, + "num_input_tokens_seen": 229882945, + "router_z_loss_clip": 2.51171875, + "router_z_loss_mlp": 0.3203125, + "step": 10654, + "time_per_iteration": 2.6467502117156982 + }, + { + "auxiliary_loss_clip": 0.01348849, + "auxiliary_loss_mlp": 0.00338311, + "balance_loss_clip": 1.10617971, + "balance_loss_mlp": 0.30780578, + "epoch": 0.6406132571772133, + "flos": 21942281796480.0, + "grad_norm": 31.06370620608267, + "language_loss": 0.80821943, + "learning_rate": 1.2084895132811666e-06, + "loss": 0.82509112, + "num_input_tokens_seen": 229901590, + "router_z_loss_clip": 2.4296875, + "router_z_loss_mlp": 0.30505371, + "step": 10655, + "time_per_iteration": 4.159826040267944 + }, + { + "auxiliary_loss_clip": 0.01333661, + "auxiliary_loss_mlp": 0.00319405, + "balance_loss_clip": 1.10070205, + "balance_loss_mlp": 0.28835148, + "epoch": 0.6406733804298813, + "flos": 28768002445440.0, + "grad_norm": 23.62355876494868, + "language_loss": 0.89710861, + "learning_rate": 1.2081318638090952e-06, + "loss": 0.91363931, + "num_input_tokens_seen": 229922535, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.31066895, + "step": 10656, + "time_per_iteration": 4.142528295516968 + }, + { + "auxiliary_loss_clip": 0.01333688, + "auxiliary_loss_mlp": 0.00327145, + "balance_loss_clip": 1.10241759, + "balance_loss_mlp": 0.2988686, + "epoch": 0.6407335036825492, + "flos": 17457183095040.0, + "grad_norm": 24.178742518927642, + "language_loss": 0.82640803, + "learning_rate": 1.2077742443647433e-06, + "loss": 0.84301639, + "num_input_tokens_seen": 229939575, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.28320312, + "step": 10657, + "time_per_iteration": 2.6341371536254883 + }, + { + "auxiliary_loss_clip": 0.01320295, + "auxiliary_loss_mlp": 0.00275876, + "balance_loss_clip": 1.09565341, + "balance_loss_mlp": 0.24893469, + "epoch": 0.6407936269352172, + "flos": 22125282612480.0, + "grad_norm": 40.59238665603964, + "language_loss": 0.85083246, + "learning_rate": 1.2074166549616707e-06, + "loss": 0.86679423, + "num_input_tokens_seen": 229958840, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.26977539, + "step": 10658, + "time_per_iteration": 2.667628049850464 + }, + { + "auxiliary_loss_clip": 0.01340838, + "auxiliary_loss_mlp": 0.00310861, + "balance_loss_clip": 1.10162902, + "balance_loss_mlp": 0.28126171, + "epoch": 0.6408537501878852, + "flos": 23110635479040.0, + "grad_norm": 18.353326279660113, + "language_loss": 0.81593251, + "learning_rate": 1.2070590956134386e-06, + "loss": 0.83244956, + "num_input_tokens_seen": 229979680, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.29614258, + "step": 10659, + "time_per_iteration": 4.090449094772339 + }, + { + "auxiliary_loss_clip": 0.0134869, + "auxiliary_loss_mlp": 0.00281506, + "balance_loss_clip": 1.10716021, + "balance_loss_mlp": 0.25225246, + "epoch": 0.6409138734405532, + "flos": 16472440759680.0, + "grad_norm": 512.5291181837384, + "language_loss": 0.85643709, + "learning_rate": 1.2067015663336046e-06, + "loss": 0.87273896, + "num_input_tokens_seen": 229996830, + "router_z_loss_clip": 2.41601562, + "router_z_loss_mlp": 0.2923584, + "step": 10660, + "time_per_iteration": 2.6665236949920654 + }, + { + "auxiliary_loss_clip": 0.01376292, + "auxiliary_loss_mlp": 0.00326408, + "balance_loss_clip": 1.12399507, + "balance_loss_mlp": 0.29200381, + "epoch": 0.6409739966932211, + "flos": 22777922776320.0, + "grad_norm": 3.3015202651241116, + "language_loss": 0.78246343, + "learning_rate": 1.206344067135727e-06, + "loss": 0.79949033, + "num_input_tokens_seen": 230015115, + "router_z_loss_clip": 2.52148438, + "router_z_loss_mlp": 0.34423828, + "step": 10661, + "time_per_iteration": 2.6736040115356445 + }, + { + "auxiliary_loss_clip": 0.01328664, + "auxiliary_loss_mlp": 0.00307794, + "balance_loss_clip": 1.09757793, + "balance_loss_mlp": 0.28106761, + "epoch": 0.6410341199458891, + "flos": 25152049134720.0, + "grad_norm": 7929.995398006301, + "language_loss": 0.82002431, + "learning_rate": 1.205986598033362e-06, + "loss": 0.83638889, + "num_input_tokens_seen": 230035515, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.26733398, + "step": 10662, + "time_per_iteration": 2.6910316944122314 + }, + { + "auxiliary_loss_clip": 0.01317689, + "auxiliary_loss_mlp": 0.00303787, + "balance_loss_clip": 1.09084582, + "balance_loss_mlp": 0.27708453, + "epoch": 0.641094243198557, + "flos": 27046193028480.0, + "grad_norm": 4.2612103985848, + "language_loss": 0.76433611, + "learning_rate": 1.2056291590400644e-06, + "loss": 0.78055084, + "num_input_tokens_seen": 230054355, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.26708984, + "step": 10663, + "time_per_iteration": 4.1028971672058105 + }, + { + "auxiliary_loss_clip": 0.01356595, + "auxiliary_loss_mlp": 0.00312872, + "balance_loss_clip": 1.11673951, + "balance_loss_mlp": 0.28149593, + "epoch": 0.641154366451225, + "flos": 25374551932800.0, + "grad_norm": 107.64854643305142, + "language_loss": 0.80389732, + "learning_rate": 1.205271750169389e-06, + "loss": 0.82059199, + "num_input_tokens_seen": 230074605, + "router_z_loss_clip": 2.3984375, + "router_z_loss_mlp": 0.3137207, + "step": 10664, + "time_per_iteration": 2.718365430831909 + }, + { + "auxiliary_loss_clip": 0.0133628, + "auxiliary_loss_mlp": 0.002924, + "balance_loss_clip": 1.1003015, + "balance_loss_mlp": 0.26251382, + "epoch": 0.6412144897038929, + "flos": 25153342024320.0, + "grad_norm": 19.679797835954265, + "language_loss": 0.72053957, + "learning_rate": 1.2049143714348881e-06, + "loss": 0.73682636, + "num_input_tokens_seen": 230093820, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.29870605, + "step": 10665, + "time_per_iteration": 2.710953712463379 + }, + { + "auxiliary_loss_clip": 0.01314872, + "auxiliary_loss_mlp": 0.00316425, + "balance_loss_clip": 1.08849788, + "balance_loss_mlp": 0.28891134, + "epoch": 0.641274612956561, + "flos": 23440762402560.0, + "grad_norm": 32.9218329751044, + "language_loss": 0.7088744, + "learning_rate": 1.2045570228501145e-06, + "loss": 0.72518742, + "num_input_tokens_seen": 230114285, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.27514648, + "step": 10666, + "time_per_iteration": 2.6887588500976562 + }, + { + "auxiliary_loss_clip": 0.01356908, + "auxiliary_loss_mlp": 0.00298794, + "balance_loss_clip": 1.10984373, + "balance_loss_mlp": 0.26923043, + "epoch": 0.6413347362092289, + "flos": 19427493778560.0, + "grad_norm": 772.6745325066196, + "language_loss": 0.790196, + "learning_rate": 1.2041997044286176e-06, + "loss": 0.80675298, + "num_input_tokens_seen": 230132760, + "router_z_loss_clip": 2.47070312, + "router_z_loss_mlp": 0.2956543, + "step": 10667, + "time_per_iteration": 2.683084487915039 + }, + { + "auxiliary_loss_clip": 0.01379603, + "auxiliary_loss_mlp": 0.00312841, + "balance_loss_clip": 1.12442875, + "balance_loss_mlp": 0.27996284, + "epoch": 0.6413948594618969, + "flos": 17196578945280.0, + "grad_norm": 15.20963880344858, + "language_loss": 0.90221524, + "learning_rate": 1.2038424161839484e-06, + "loss": 0.91913962, + "num_input_tokens_seen": 230149690, + "router_z_loss_clip": 2.54882812, + "router_z_loss_mlp": 0.32861328, + "step": 10668, + "time_per_iteration": 2.597987413406372 + }, + { + "auxiliary_loss_clip": 0.01337505, + "auxiliary_loss_mlp": 0.00327794, + "balance_loss_clip": 1.10466564, + "balance_loss_mlp": 0.29851598, + "epoch": 0.6414549827145648, + "flos": 22269787027200.0, + "grad_norm": 23.97061313322171, + "language_loss": 0.75147891, + "learning_rate": 1.2034851581296544e-06, + "loss": 0.76813185, + "num_input_tokens_seen": 230166950, + "router_z_loss_clip": 2.32714844, + "router_z_loss_mlp": 0.29272461, + "step": 10669, + "time_per_iteration": 2.6343696117401123 + }, + { + "auxiliary_loss_clip": 0.01359775, + "auxiliary_loss_mlp": 0.00290798, + "balance_loss_clip": 1.11276102, + "balance_loss_mlp": 0.25945795, + "epoch": 0.6415151059672328, + "flos": 19640192163840.0, + "grad_norm": 11.954597199552616, + "language_loss": 0.86773574, + "learning_rate": 1.2031279302792825e-06, + "loss": 0.88424146, + "num_input_tokens_seen": 230184785, + "router_z_loss_clip": 2.47070312, + "router_z_loss_mlp": 0.31347656, + "step": 10670, + "time_per_iteration": 2.6281027793884277 + }, + { + "auxiliary_loss_clip": 0.01361802, + "auxiliary_loss_mlp": 0.00315907, + "balance_loss_clip": 1.11736345, + "balance_loss_mlp": 0.28491294, + "epoch": 0.6415752292199008, + "flos": 14865833237760.0, + "grad_norm": 8.292064065908924, + "language_loss": 0.98550797, + "learning_rate": 1.20277073264638e-06, + "loss": 1.002285, + "num_input_tokens_seen": 230201385, + "router_z_loss_clip": 2.4453125, + "router_z_loss_mlp": 0.30981445, + "step": 10671, + "time_per_iteration": 2.6256699562072754 + }, + { + "auxiliary_loss_clip": 0.01341935, + "auxiliary_loss_mlp": 0.00288512, + "balance_loss_clip": 1.11278129, + "balance_loss_mlp": 0.26176143, + "epoch": 0.6416353524725688, + "flos": 13735580906880.0, + "grad_norm": 29.76189061889072, + "language_loss": 0.76253045, + "learning_rate": 1.2024135652444907e-06, + "loss": 0.77883488, + "num_input_tokens_seen": 230220380, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.26745605, + "step": 10672, + "time_per_iteration": 2.6550514698028564 + }, + { + "auxiliary_loss_clip": 0.01385912, + "auxiliary_loss_mlp": 0.00334315, + "balance_loss_clip": 1.12602019, + "balance_loss_mlp": 0.30253413, + "epoch": 0.6416954757252368, + "flos": 24534924543360.0, + "grad_norm": 502.35399592939575, + "language_loss": 0.84440792, + "learning_rate": 1.2020564280871593e-06, + "loss": 0.86161017, + "num_input_tokens_seen": 230239845, + "router_z_loss_clip": 2.59960938, + "router_z_loss_mlp": 0.31787109, + "step": 10673, + "time_per_iteration": 2.6766021251678467 + }, + { + "auxiliary_loss_clip": 0.01328113, + "auxiliary_loss_mlp": 0.0033996, + "balance_loss_clip": 1.09643078, + "balance_loss_mlp": 0.31011045, + "epoch": 0.6417555989779047, + "flos": 27710002321920.0, + "grad_norm": 20.479062424451495, + "language_loss": 0.76618505, + "learning_rate": 1.2016993211879283e-06, + "loss": 0.78286582, + "num_input_tokens_seen": 230262420, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.29833984, + "step": 10674, + "time_per_iteration": 2.7230372428894043 + }, + { + "auxiliary_loss_clip": 0.01358943, + "auxiliary_loss_mlp": 0.00316197, + "balance_loss_clip": 1.11307144, + "balance_loss_mlp": 0.28620407, + "epoch": 0.6418157222305727, + "flos": 20556632787840.0, + "grad_norm": 2344.9367301071816, + "language_loss": 0.76126474, + "learning_rate": 1.201342244560338e-06, + "loss": 0.77801615, + "num_input_tokens_seen": 230279950, + "router_z_loss_clip": 2.45703125, + "router_z_loss_mlp": 0.29968262, + "step": 10675, + "time_per_iteration": 2.6205568313598633 + }, + { + "auxiliary_loss_clip": 0.01333198, + "auxiliary_loss_mlp": 0.00301878, + "balance_loss_clip": 1.10226107, + "balance_loss_mlp": 0.27338701, + "epoch": 0.6418758454832406, + "flos": 22601530062720.0, + "grad_norm": 58.47165484355713, + "language_loss": 0.75011736, + "learning_rate": 1.2009851982179307e-06, + "loss": 0.76646817, + "num_input_tokens_seen": 230299705, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.28515625, + "step": 10676, + "time_per_iteration": 2.7537074089050293 + }, + { + "auxiliary_loss_clip": 0.01373542, + "auxiliary_loss_mlp": 0.00334536, + "balance_loss_clip": 1.12329364, + "balance_loss_mlp": 0.30366105, + "epoch": 0.6419359687359086, + "flos": 27375098889600.0, + "grad_norm": 163.52950400605167, + "language_loss": 0.86886257, + "learning_rate": 1.2006281821742446e-06, + "loss": 0.88594341, + "num_input_tokens_seen": 230320030, + "router_z_loss_clip": 2.50390625, + "router_z_loss_mlp": 0.30859375, + "step": 10677, + "time_per_iteration": 2.6823160648345947 + }, + { + "auxiliary_loss_clip": 0.01310617, + "auxiliary_loss_mlp": 0.00123141, + "balance_loss_clip": 1.15532196, + "balance_loss_mlp": 0.11484366, + "epoch": 0.6419960919885765, + "flos": 67251924552960.0, + "grad_norm": 0.7494537753268423, + "language_loss": 0.59592843, + "learning_rate": 1.200271196442818e-06, + "loss": 0.61026603, + "num_input_tokens_seen": 230381495, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.08300781, + "step": 10678, + "time_per_iteration": 3.2689905166625977 + }, + { + "auxiliary_loss_clip": 0.01345558, + "auxiliary_loss_mlp": 0.00324342, + "balance_loss_clip": 1.11296332, + "balance_loss_mlp": 0.29581547, + "epoch": 0.6420562152412446, + "flos": 19901873721600.0, + "grad_norm": 17.8297637875541, + "language_loss": 0.75319016, + "learning_rate": 1.1999142410371875e-06, + "loss": 0.76988918, + "num_input_tokens_seen": 230401385, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.28503418, + "step": 10679, + "time_per_iteration": 2.685029983520508 + }, + { + "auxiliary_loss_clip": 0.01331891, + "auxiliary_loss_mlp": 0.00327855, + "balance_loss_clip": 1.09549677, + "balance_loss_mlp": 0.29881543, + "epoch": 0.6421163384939125, + "flos": 24790177566720.0, + "grad_norm": 11.902538981217267, + "language_loss": 0.81772494, + "learning_rate": 1.1995573159708897e-06, + "loss": 0.83432245, + "num_input_tokens_seen": 230421340, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.29077148, + "step": 10680, + "time_per_iteration": 2.6739790439605713 + }, + { + "auxiliary_loss_clip": 0.01342985, + "auxiliary_loss_mlp": 0.00325421, + "balance_loss_clip": 1.10704935, + "balance_loss_mlp": 0.29642966, + "epoch": 0.6421764617465805, + "flos": 25592816926080.0, + "grad_norm": 19.722652260014982, + "language_loss": 0.76807952, + "learning_rate": 1.1992004212574582e-06, + "loss": 0.78476357, + "num_input_tokens_seen": 230441270, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.28955078, + "step": 10681, + "time_per_iteration": 2.6980271339416504 + }, + { + "auxiliary_loss_clip": 0.01327806, + "auxiliary_loss_mlp": 0.00298586, + "balance_loss_clip": 1.0939095, + "balance_loss_mlp": 0.27036902, + "epoch": 0.6422365849992484, + "flos": 14134727813760.0, + "grad_norm": 17.78720700960706, + "language_loss": 0.8271836, + "learning_rate": 1.198843556910427e-06, + "loss": 0.84344745, + "num_input_tokens_seen": 230457455, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.28234863, + "step": 10682, + "time_per_iteration": 2.6231837272644043 + }, + { + "auxiliary_loss_clip": 0.01342895, + "auxiliary_loss_mlp": 0.00327691, + "balance_loss_clip": 1.10734749, + "balance_loss_mlp": 0.29934281, + "epoch": 0.6422967082519164, + "flos": 22383911514240.0, + "grad_norm": 13.631304141263522, + "language_loss": 0.8443259, + "learning_rate": 1.1984867229433287e-06, + "loss": 0.86103171, + "num_input_tokens_seen": 230478955, + "router_z_loss_clip": 2.35546875, + "router_z_loss_mlp": 0.28369141, + "step": 10683, + "time_per_iteration": 2.7008073329925537 + }, + { + "auxiliary_loss_clip": 0.01336205, + "auxiliary_loss_mlp": 0.0031586, + "balance_loss_clip": 1.09681129, + "balance_loss_mlp": 0.28612953, + "epoch": 0.6423568315045844, + "flos": 14647927380480.0, + "grad_norm": 23.744220296617364, + "language_loss": 0.77394795, + "learning_rate": 1.1981299193696941e-06, + "loss": 0.79046863, + "num_input_tokens_seen": 230496425, + "router_z_loss_clip": 2.3984375, + "router_z_loss_mlp": 0.29736328, + "step": 10684, + "time_per_iteration": 2.6368422508239746 + }, + { + "auxiliary_loss_clip": 0.01346603, + "auxiliary_loss_mlp": 0.00307142, + "balance_loss_clip": 1.10610342, + "balance_loss_mlp": 0.27804339, + "epoch": 0.6424169547572524, + "flos": 26833925606400.0, + "grad_norm": 97337.47497438289, + "language_loss": 0.80968869, + "learning_rate": 1.1977731462030533e-06, + "loss": 0.82622617, + "num_input_tokens_seen": 230516245, + "router_z_loss_clip": 2.40429688, + "router_z_loss_mlp": 0.29101562, + "step": 10685, + "time_per_iteration": 2.7682058811187744 + }, + { + "auxiliary_loss_clip": 0.01325487, + "auxiliary_loss_mlp": 0.0030697, + "balance_loss_clip": 1.09484315, + "balance_loss_mlp": 0.27790701, + "epoch": 0.6424770780099204, + "flos": 22707430335360.0, + "grad_norm": 18.511401761047868, + "language_loss": 0.81843221, + "learning_rate": 1.197416403456935e-06, + "loss": 0.83475679, + "num_input_tokens_seen": 230534745, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.29052734, + "step": 10686, + "time_per_iteration": 2.661789655685425 + }, + { + "auxiliary_loss_clip": 0.01353426, + "auxiliary_loss_mlp": 0.00301887, + "balance_loss_clip": 1.11218619, + "balance_loss_mlp": 0.27079698, + "epoch": 0.6425372012625883, + "flos": 28469512425600.0, + "grad_norm": 12.579080959748042, + "language_loss": 0.79340529, + "learning_rate": 1.197059691144867e-06, + "loss": 0.8099584, + "num_input_tokens_seen": 230555895, + "router_z_loss_clip": 2.4140625, + "router_z_loss_mlp": 0.31054688, + "step": 10687, + "time_per_iteration": 2.746061325073242 + }, + { + "auxiliary_loss_clip": 0.01316961, + "auxiliary_loss_mlp": 0.00312147, + "balance_loss_clip": 1.08382881, + "balance_loss_mlp": 0.28327501, + "epoch": 0.6425973245152563, + "flos": 29351694453120.0, + "grad_norm": 27.32791772907604, + "language_loss": 0.74618113, + "learning_rate": 1.1967030092803767e-06, + "loss": 0.76247221, + "num_input_tokens_seen": 230577460, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.28857422, + "step": 10688, + "time_per_iteration": 2.7221336364746094 + }, + { + "auxiliary_loss_clip": 0.01321772, + "auxiliary_loss_mlp": 0.00312687, + "balance_loss_clip": 1.08904886, + "balance_loss_mlp": 0.28485137, + "epoch": 0.6426574477679242, + "flos": 16430388912000.0, + "grad_norm": 5.411239791948602, + "language_loss": 0.81101358, + "learning_rate": 1.1963463578769876e-06, + "loss": 0.82735813, + "num_input_tokens_seen": 230595030, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.27880859, + "step": 10689, + "time_per_iteration": 2.659327745437622 + }, + { + "auxiliary_loss_clip": 0.01344541, + "auxiliary_loss_mlp": 0.00330248, + "balance_loss_clip": 1.10644925, + "balance_loss_mlp": 0.30032659, + "epoch": 0.6427175710205922, + "flos": 21835914647040.0, + "grad_norm": 7.017430681310821, + "language_loss": 0.79712224, + "learning_rate": 1.195989736948226e-06, + "loss": 0.81387013, + "num_input_tokens_seen": 230615135, + "router_z_loss_clip": 2.37890625, + "router_z_loss_mlp": 0.29931641, + "step": 10690, + "time_per_iteration": 2.6815927028656006 + }, + { + "auxiliary_loss_clip": 0.01324978, + "auxiliary_loss_mlp": 0.00320252, + "balance_loss_clip": 1.09327841, + "balance_loss_mlp": 0.29173732, + "epoch": 0.6427776942732601, + "flos": 17786627660160.0, + "grad_norm": 13.7320593810427, + "language_loss": 0.82946026, + "learning_rate": 1.1956331465076143e-06, + "loss": 0.84591258, + "num_input_tokens_seen": 230631965, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.28515625, + "step": 10691, + "time_per_iteration": 2.6586573123931885 + }, + { + "auxiliary_loss_clip": 0.013463, + "auxiliary_loss_mlp": 0.00311266, + "balance_loss_clip": 1.1048671, + "balance_loss_mlp": 0.28173751, + "epoch": 0.6428378175259282, + "flos": 15085893911040.0, + "grad_norm": 5.307170341695963, + "language_loss": 0.83715361, + "learning_rate": 1.1952765865686738e-06, + "loss": 0.85372925, + "num_input_tokens_seen": 230649565, + "router_z_loss_clip": 2.41796875, + "router_z_loss_mlp": 0.29504395, + "step": 10692, + "time_per_iteration": 2.624025821685791 + }, + { + "auxiliary_loss_clip": 0.01331775, + "auxiliary_loss_mlp": 0.00314227, + "balance_loss_clip": 1.09623003, + "balance_loss_mlp": 0.28618857, + "epoch": 0.6428979407785961, + "flos": 23841776816640.0, + "grad_norm": 79.61309968870607, + "language_loss": 0.71691859, + "learning_rate": 1.1949200571449263e-06, + "loss": 0.73337859, + "num_input_tokens_seen": 230669265, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.27990723, + "step": 10693, + "time_per_iteration": 2.6710762977600098 + }, + { + "auxiliary_loss_clip": 0.0132488, + "auxiliary_loss_mlp": 0.00315287, + "balance_loss_clip": 1.08703244, + "balance_loss_mlp": 0.28538904, + "epoch": 0.6429580640312641, + "flos": 32926852892160.0, + "grad_norm": 46.853845247639526, + "language_loss": 0.71183372, + "learning_rate": 1.1945635582498903e-06, + "loss": 0.72823542, + "num_input_tokens_seen": 230690575, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.29858398, + "step": 10694, + "time_per_iteration": 2.765357255935669 + }, + { + "auxiliary_loss_clip": 0.0130019, + "auxiliary_loss_mlp": 0.00297019, + "balance_loss_clip": 1.07420921, + "balance_loss_mlp": 0.2698276, + "epoch": 0.643018187283932, + "flos": 21068359896960.0, + "grad_norm": 3.252992517307486, + "language_loss": 0.85806751, + "learning_rate": 1.1942070898970853e-06, + "loss": 0.87403965, + "num_input_tokens_seen": 230709420, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.27197266, + "step": 10695, + "time_per_iteration": 2.649407148361206 + }, + { + "auxiliary_loss_clip": 0.0131022, + "auxiliary_loss_mlp": 0.00357152, + "balance_loss_clip": 1.07827497, + "balance_loss_mlp": 0.3268255, + "epoch": 0.6430783105366, + "flos": 26724649455360.0, + "grad_norm": 27.73559592309693, + "language_loss": 0.79991156, + "learning_rate": 1.1938506521000285e-06, + "loss": 0.8165853, + "num_input_tokens_seen": 230729350, + "router_z_loss_clip": 2.31835938, + "router_z_loss_mlp": 0.30322266, + "step": 10696, + "time_per_iteration": 2.6948697566986084 + }, + { + "auxiliary_loss_clip": 0.01323345, + "auxiliary_loss_mlp": 0.00298418, + "balance_loss_clip": 1.09383464, + "balance_loss_mlp": 0.27144074, + "epoch": 0.643138433789268, + "flos": 23696841438720.0, + "grad_norm": 3.0206054369893245, + "language_loss": 0.81609833, + "learning_rate": 1.1934942448722347e-06, + "loss": 0.83231592, + "num_input_tokens_seen": 230749220, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.26989746, + "step": 10697, + "time_per_iteration": 4.084696054458618 + }, + { + "auxiliary_loss_clip": 0.01294932, + "auxiliary_loss_mlp": 0.00317092, + "balance_loss_clip": 1.07026029, + "balance_loss_mlp": 0.28944701, + "epoch": 0.643198557041936, + "flos": 34202184255360.0, + "grad_norm": 600.4021243228092, + "language_loss": 0.73032963, + "learning_rate": 1.1931378682272208e-06, + "loss": 0.74644983, + "num_input_tokens_seen": 230770245, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.27600098, + "step": 10698, + "time_per_iteration": 4.19050145149231 + }, + { + "auxiliary_loss_clip": 0.01315527, + "auxiliary_loss_mlp": 0.0010435, + "balance_loss_clip": 1.16339445, + "balance_loss_mlp": 0.09543266, + "epoch": 0.643258680294604, + "flos": 67626473621760.0, + "grad_norm": 0.812920265420946, + "language_loss": 0.63186079, + "learning_rate": 1.1927815221784996e-06, + "loss": 0.64605957, + "num_input_tokens_seen": 230837030, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.08935547, + "step": 10699, + "time_per_iteration": 3.1632678508758545 + }, + { + "auxiliary_loss_clip": 0.01302956, + "auxiliary_loss_mlp": 0.00324988, + "balance_loss_clip": 1.08118749, + "balance_loss_mlp": 0.29585317, + "epoch": 0.6433188035472719, + "flos": 25185984508800.0, + "grad_norm": 40.85415054257934, + "language_loss": 0.74683642, + "learning_rate": 1.1924252067395838e-06, + "loss": 0.76311582, + "num_input_tokens_seen": 230856845, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.29125977, + "step": 10700, + "time_per_iteration": 2.6777923107147217 + }, + { + "auxiliary_loss_clip": 0.01329664, + "auxiliary_loss_mlp": 0.00316409, + "balance_loss_clip": 1.09267402, + "balance_loss_mlp": 0.28810903, + "epoch": 0.6433789267999399, + "flos": 24973573432320.0, + "grad_norm": 50.56559278397293, + "language_loss": 0.80156291, + "learning_rate": 1.1920689219239855e-06, + "loss": 0.81802368, + "num_input_tokens_seen": 230878785, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.28295898, + "step": 10701, + "time_per_iteration": 4.231788635253906 + }, + { + "auxiliary_loss_clip": 0.01330371, + "auxiliary_loss_mlp": 0.00323912, + "balance_loss_clip": 1.09410453, + "balance_loss_mlp": 0.29399049, + "epoch": 0.6434390500526078, + "flos": 17566028282880.0, + "grad_norm": 4.97021459977323, + "language_loss": 0.92045039, + "learning_rate": 1.1917126677452144e-06, + "loss": 0.93699324, + "num_input_tokens_seen": 230895445, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.29907227, + "step": 10702, + "time_per_iteration": 2.6464922428131104 + }, + { + "auxiliary_loss_clip": 0.01313446, + "auxiliary_loss_mlp": 0.00302305, + "balance_loss_clip": 1.0835762, + "balance_loss_mlp": 0.27295554, + "epoch": 0.6434991733052758, + "flos": 20843594542080.0, + "grad_norm": 32.39803082886074, + "language_loss": 0.80850756, + "learning_rate": 1.1913564442167798e-06, + "loss": 0.82466507, + "num_input_tokens_seen": 230911375, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.29370117, + "step": 10703, + "time_per_iteration": 2.6282846927642822 + }, + { + "auxiliary_loss_clip": 0.01315776, + "auxiliary_loss_mlp": 0.00115334, + "balance_loss_clip": 1.16131842, + "balance_loss_mlp": 0.10708441, + "epoch": 0.6435592965579437, + "flos": 66094596345600.0, + "grad_norm": 0.680218486158399, + "language_loss": 0.53896624, + "learning_rate": 1.1910002513521898e-06, + "loss": 0.55327737, + "num_input_tokens_seen": 230975990, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.08251953, + "step": 10704, + "time_per_iteration": 3.201376438140869 + }, + { + "auxiliary_loss_clip": 0.01310034, + "auxiliary_loss_mlp": 0.00306732, + "balance_loss_clip": 1.07935798, + "balance_loss_mlp": 0.28006503, + "epoch": 0.6436194198106118, + "flos": 23768842250880.0, + "grad_norm": 17.30830626801004, + "language_loss": 0.84590888, + "learning_rate": 1.1906440891649519e-06, + "loss": 0.86207646, + "num_input_tokens_seen": 230997110, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.26672363, + "step": 10705, + "time_per_iteration": 4.0692055225372314 + }, + { + "auxiliary_loss_clip": 0.01311479, + "auxiliary_loss_mlp": 0.0029741, + "balance_loss_clip": 1.08359408, + "balance_loss_mlp": 0.2708621, + "epoch": 0.6436795430632797, + "flos": 20230312705920.0, + "grad_norm": 44.85573731157887, + "language_loss": 0.85418254, + "learning_rate": 1.1902879576685708e-06, + "loss": 0.87027144, + "num_input_tokens_seen": 231015590, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.26550293, + "step": 10706, + "time_per_iteration": 2.6756386756896973 + }, + { + "auxiliary_loss_clip": 0.01318136, + "auxiliary_loss_mlp": 0.0029887, + "balance_loss_clip": 1.08474374, + "balance_loss_mlp": 0.27076039, + "epoch": 0.6437396663159477, + "flos": 20301846641280.0, + "grad_norm": 6.7825924808703695, + "language_loss": 0.88451457, + "learning_rate": 1.1899318568765518e-06, + "loss": 0.90068471, + "num_input_tokens_seen": 231033800, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.28088379, + "step": 10707, + "time_per_iteration": 2.671919107437134 + }, + { + "auxiliary_loss_clip": 0.01312046, + "auxiliary_loss_mlp": 0.00315569, + "balance_loss_clip": 1.08333206, + "balance_loss_mlp": 0.28548014, + "epoch": 0.6437997895686156, + "flos": 23878585278720.0, + "grad_norm": 2.2188549014854835, + "language_loss": 0.9100771, + "learning_rate": 1.1895757868023978e-06, + "loss": 0.92635322, + "num_input_tokens_seen": 231053160, + "router_z_loss_clip": 2.28320312, + "router_z_loss_mlp": 0.30078125, + "step": 10708, + "time_per_iteration": 2.7166755199432373 + }, + { + "auxiliary_loss_clip": 0.01351728, + "auxiliary_loss_mlp": 0.00286186, + "balance_loss_clip": 1.10238707, + "balance_loss_mlp": 0.25535846, + "epoch": 0.6438599128212836, + "flos": 18989275852800.0, + "grad_norm": 20.387078870458232, + "language_loss": 0.76430678, + "learning_rate": 1.1892197474596106e-06, + "loss": 0.7806859, + "num_input_tokens_seen": 231069470, + "router_z_loss_clip": 2.4921875, + "router_z_loss_mlp": 0.30834961, + "step": 10709, + "time_per_iteration": 2.720517873764038 + }, + { + "auxiliary_loss_clip": 0.01309742, + "auxiliary_loss_mlp": 0.00306567, + "balance_loss_clip": 1.08307099, + "balance_loss_mlp": 0.2794705, + "epoch": 0.6439200360739517, + "flos": 24096347481600.0, + "grad_norm": 27.306571572081623, + "language_loss": 0.8660053, + "learning_rate": 1.1888637388616929e-06, + "loss": 0.88216841, + "num_input_tokens_seen": 231088205, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.27087402, + "step": 10710, + "time_per_iteration": 2.776304006576538 + }, + { + "auxiliary_loss_clip": 0.01304962, + "auxiliary_loss_mlp": 0.00293075, + "balance_loss_clip": 1.07817686, + "balance_loss_mlp": 0.26398745, + "epoch": 0.6439801593266196, + "flos": 31902141697920.0, + "grad_norm": 1406.4648131354552, + "language_loss": 0.73065841, + "learning_rate": 1.1885077610221425e-06, + "loss": 0.74663877, + "num_input_tokens_seen": 231107850, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.29077148, + "step": 10711, + "time_per_iteration": 2.7382729053497314 + }, + { + "auxiliary_loss_clip": 0.01318632, + "auxiliary_loss_mlp": 0.00305511, + "balance_loss_clip": 1.08457756, + "balance_loss_mlp": 0.27704394, + "epoch": 0.6440402825792876, + "flos": 27125879351040.0, + "grad_norm": 13.622845220683967, + "language_loss": 0.85337508, + "learning_rate": 1.1881518139544597e-06, + "loss": 0.86961645, + "num_input_tokens_seen": 231127200, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.28479004, + "step": 10712, + "time_per_iteration": 2.690814971923828 + }, + { + "auxiliary_loss_clip": 0.01312156, + "auxiliary_loss_mlp": 0.00336387, + "balance_loss_clip": 1.0783546, + "balance_loss_mlp": 0.30770504, + "epoch": 0.6441004058319555, + "flos": 20667704618880.0, + "grad_norm": 32.200878243272435, + "language_loss": 0.88778603, + "learning_rate": 1.1877958976721417e-06, + "loss": 0.90427148, + "num_input_tokens_seen": 231146360, + "router_z_loss_clip": 2.3359375, + "router_z_loss_mlp": 0.28686523, + "step": 10713, + "time_per_iteration": 2.6412696838378906 + }, + { + "auxiliary_loss_clip": 0.01326011, + "auxiliary_loss_mlp": 0.00276101, + "balance_loss_clip": 1.09427571, + "balance_loss_mlp": 0.24771738, + "epoch": 0.6441605290846235, + "flos": 26026006947840.0, + "grad_norm": 8.234217640409533, + "language_loss": 0.8437326, + "learning_rate": 1.187440012188684e-06, + "loss": 0.85975367, + "num_input_tokens_seen": 231168350, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.28356934, + "step": 10714, + "time_per_iteration": 2.67676043510437 + }, + { + "auxiliary_loss_clip": 0.01306295, + "auxiliary_loss_mlp": 0.00296083, + "balance_loss_clip": 1.07961869, + "balance_loss_mlp": 0.26767525, + "epoch": 0.6442206523372914, + "flos": 24899489631360.0, + "grad_norm": 22421.676018611077, + "language_loss": 0.86433238, + "learning_rate": 1.187084157517583e-06, + "loss": 0.88035619, + "num_input_tokens_seen": 231188385, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.28430176, + "step": 10715, + "time_per_iteration": 2.684894561767578 + }, + { + "auxiliary_loss_clip": 0.01319914, + "auxiliary_loss_mlp": 0.00275578, + "balance_loss_clip": 1.08339787, + "balance_loss_mlp": 0.24938796, + "epoch": 0.6442807755899594, + "flos": 25156322853120.0, + "grad_norm": 166.13733262170632, + "language_loss": 0.88165456, + "learning_rate": 1.186728333672332e-06, + "loss": 0.89760947, + "num_input_tokens_seen": 231209880, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.26159668, + "step": 10716, + "time_per_iteration": 2.729490280151367 + }, + { + "auxiliary_loss_clip": 0.01304743, + "auxiliary_loss_mlp": 0.00306356, + "balance_loss_clip": 1.07291889, + "balance_loss_mlp": 0.27520663, + "epoch": 0.6443408988426274, + "flos": 27344503480320.0, + "grad_norm": 8.449046098868852, + "language_loss": 0.84509248, + "learning_rate": 1.186372540666424e-06, + "loss": 0.86120343, + "num_input_tokens_seen": 231230765, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.31152344, + "step": 10717, + "time_per_iteration": 2.725562334060669 + }, + { + "auxiliary_loss_clip": 0.01307052, + "auxiliary_loss_mlp": 0.00285619, + "balance_loss_clip": 1.08064222, + "balance_loss_mlp": 0.25781959, + "epoch": 0.6444010220952954, + "flos": 27928339142400.0, + "grad_norm": 21.734757054577855, + "language_loss": 0.76128006, + "learning_rate": 1.1860167785133513e-06, + "loss": 0.77720678, + "num_input_tokens_seen": 231252350, + "router_z_loss_clip": 2.265625, + "router_z_loss_mlp": 0.2779541, + "step": 10718, + "time_per_iteration": 2.7151403427124023 + }, + { + "auxiliary_loss_clip": 0.01304842, + "auxiliary_loss_mlp": 0.0016248, + "balance_loss_clip": 1.15339351, + "balance_loss_mlp": 0.15327679, + "epoch": 0.6444611453479633, + "flos": 71215024855680.0, + "grad_norm": 0.7811786789954092, + "language_loss": 0.49165267, + "learning_rate": 1.185661047226603e-06, + "loss": 0.5063259, + "num_input_tokens_seen": 231313865, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.09179688, + "step": 10719, + "time_per_iteration": 3.282759189605713 + }, + { + "auxiliary_loss_clip": 0.01313111, + "auxiliary_loss_mlp": 0.00281955, + "balance_loss_clip": 1.07995832, + "balance_loss_mlp": 0.25466788, + "epoch": 0.6445212686006313, + "flos": 22705131864960.0, + "grad_norm": 14.680650291142943, + "language_loss": 0.85627401, + "learning_rate": 1.18530534681967e-06, + "loss": 0.87222469, + "num_input_tokens_seen": 231331710, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.27294922, + "step": 10720, + "time_per_iteration": 2.6423840522766113 + }, + { + "auxiliary_loss_clip": 0.01326735, + "auxiliary_loss_mlp": 0.00318832, + "balance_loss_clip": 1.09560108, + "balance_loss_mlp": 0.28910115, + "epoch": 0.6445813918532992, + "flos": 21178821196800.0, + "grad_norm": 84.02311705959364, + "language_loss": 0.84558862, + "learning_rate": 1.18494967730604e-06, + "loss": 0.86204427, + "num_input_tokens_seen": 231350705, + "router_z_loss_clip": 2.31054688, + "router_z_loss_mlp": 0.29711914, + "step": 10721, + "time_per_iteration": 2.800558090209961 + }, + { + "auxiliary_loss_clip": 0.01317548, + "auxiliary_loss_mlp": 0.00286394, + "balance_loss_clip": 1.08352542, + "balance_loss_mlp": 0.25675827, + "epoch": 0.6446415151059672, + "flos": 25191910252800.0, + "grad_norm": 40.4678420022837, + "language_loss": 0.79766321, + "learning_rate": 1.1845940386991995e-06, + "loss": 0.81370258, + "num_input_tokens_seen": 231369550, + "router_z_loss_clip": 2.33789062, + "router_z_loss_mlp": 0.29614258, + "step": 10722, + "time_per_iteration": 2.7468130588531494 + }, + { + "auxiliary_loss_clip": 0.01298919, + "auxiliary_loss_mlp": 0.00294978, + "balance_loss_clip": 1.07498002, + "balance_loss_mlp": 0.26868093, + "epoch": 0.6447016383586353, + "flos": 25302227898240.0, + "grad_norm": 39.612232102485095, + "language_loss": 0.85095108, + "learning_rate": 1.184238431012635e-06, + "loss": 0.86689001, + "num_input_tokens_seen": 231389285, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.26281738, + "step": 10723, + "time_per_iteration": 2.767415761947632 + }, + { + "auxiliary_loss_clip": 0.01326847, + "auxiliary_loss_mlp": 0.00297835, + "balance_loss_clip": 1.09121251, + "balance_loss_mlp": 0.2669833, + "epoch": 0.6447617616113032, + "flos": 27703142824320.0, + "grad_norm": 12.66167836315832, + "language_loss": 0.64569581, + "learning_rate": 1.1838828542598312e-06, + "loss": 0.6619426, + "num_input_tokens_seen": 231408820, + "router_z_loss_clip": 2.35546875, + "router_z_loss_mlp": 0.30834961, + "step": 10724, + "time_per_iteration": 2.731027603149414 + }, + { + "auxiliary_loss_clip": 0.01291514, + "auxiliary_loss_mlp": 0.00262359, + "balance_loss_clip": 1.07273698, + "balance_loss_mlp": 0.23750389, + "epoch": 0.6448218848639712, + "flos": 23039101543680.0, + "grad_norm": 9.541597324522032, + "language_loss": 0.9066236, + "learning_rate": 1.183527308454271e-06, + "loss": 0.92216229, + "num_input_tokens_seen": 231428100, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.2487793, + "step": 10725, + "time_per_iteration": 2.7188286781311035 + }, + { + "auxiliary_loss_clip": 0.01289461, + "auxiliary_loss_mlp": 0.00295542, + "balance_loss_clip": 1.06460261, + "balance_loss_mlp": 0.26669306, + "epoch": 0.6448820081166391, + "flos": 24496104919680.0, + "grad_norm": 5.967770022035746, + "language_loss": 0.87847137, + "learning_rate": 1.1831717936094368e-06, + "loss": 0.89432138, + "num_input_tokens_seen": 231445810, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.28857422, + "step": 10726, + "time_per_iteration": 2.675412178039551 + }, + { + "auxiliary_loss_clip": 0.01307411, + "auxiliary_loss_mlp": 0.00295177, + "balance_loss_clip": 1.0765748, + "balance_loss_mlp": 0.26697236, + "epoch": 0.6449421313693071, + "flos": 22419283432320.0, + "grad_norm": 3.493504970105384, + "language_loss": 0.90047497, + "learning_rate": 1.1828163097388108e-06, + "loss": 0.91650087, + "num_input_tokens_seen": 231463570, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.28210449, + "step": 10727, + "time_per_iteration": 2.671908140182495 + }, + { + "auxiliary_loss_clip": 0.01329109, + "auxiliary_loss_mlp": 0.00307715, + "balance_loss_clip": 1.09151781, + "balance_loss_mlp": 0.27779344, + "epoch": 0.645002254621975, + "flos": 20225715765120.0, + "grad_norm": 4.868185748158972, + "language_loss": 0.87236303, + "learning_rate": 1.1824608568558717e-06, + "loss": 0.88873124, + "num_input_tokens_seen": 231482155, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.29931641, + "step": 10728, + "time_per_iteration": 2.6317687034606934 + }, + { + "auxiliary_loss_clip": 0.0132198, + "auxiliary_loss_mlp": 0.00304138, + "balance_loss_clip": 1.08487439, + "balance_loss_mlp": 0.27488363, + "epoch": 0.645062377874643, + "flos": 27855440490240.0, + "grad_norm": 57.866097702824455, + "language_loss": 0.83071196, + "learning_rate": 1.1821054349740988e-06, + "loss": 0.84697318, + "num_input_tokens_seen": 231502465, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.29296875, + "step": 10729, + "time_per_iteration": 2.726339101791382 + }, + { + "auxiliary_loss_clip": 0.01306367, + "auxiliary_loss_mlp": 0.00306995, + "balance_loss_clip": 1.0784297, + "balance_loss_mlp": 0.2785995, + "epoch": 0.645122501127311, + "flos": 25301509626240.0, + "grad_norm": 12.052163546143952, + "language_loss": 0.73962629, + "learning_rate": 1.1817500441069706e-06, + "loss": 0.75575995, + "num_input_tokens_seen": 231522740, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.28393555, + "step": 10730, + "time_per_iteration": 2.667731761932373 + }, + { + "auxiliary_loss_clip": 0.01299718, + "auxiliary_loss_mlp": 0.00290916, + "balance_loss_clip": 1.07361937, + "balance_loss_mlp": 0.26271072, + "epoch": 0.645182624379979, + "flos": 18807352444800.0, + "grad_norm": 75.88892762581303, + "language_loss": 0.71826249, + "learning_rate": 1.1813946842679614e-06, + "loss": 0.73416883, + "num_input_tokens_seen": 231542050, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.28173828, + "step": 10731, + "time_per_iteration": 2.669257164001465 + }, + { + "auxiliary_loss_clip": 0.01303376, + "auxiliary_loss_mlp": 0.0030065, + "balance_loss_clip": 1.07697392, + "balance_loss_mlp": 0.27387595, + "epoch": 0.6452427476326469, + "flos": 18332182402560.0, + "grad_norm": 9.817721914389624, + "language_loss": 0.74866617, + "learning_rate": 1.1810393554705492e-06, + "loss": 0.76470637, + "num_input_tokens_seen": 231560380, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.26733398, + "step": 10732, + "time_per_iteration": 2.7008848190307617 + }, + { + "auxiliary_loss_clip": 0.01315408, + "auxiliary_loss_mlp": 0.00302713, + "balance_loss_clip": 1.08754039, + "balance_loss_mlp": 0.27481776, + "epoch": 0.6453028708853149, + "flos": 22784746360320.0, + "grad_norm": 23.12309009759477, + "language_loss": 0.83241469, + "learning_rate": 1.1806840577282055e-06, + "loss": 0.84859586, + "num_input_tokens_seen": 231580810, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.27905273, + "step": 10733, + "time_per_iteration": 2.6557767391204834 + }, + { + "auxiliary_loss_clip": 0.01303108, + "auxiliary_loss_mlp": 0.00320078, + "balance_loss_clip": 1.07788813, + "balance_loss_mlp": 0.29196835, + "epoch": 0.6453629941379828, + "flos": 23945989150080.0, + "grad_norm": 17.085864774036413, + "language_loss": 0.79946458, + "learning_rate": 1.1803287910544048e-06, + "loss": 0.81569648, + "num_input_tokens_seen": 231600585, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.28088379, + "step": 10734, + "time_per_iteration": 2.658217430114746 + }, + { + "auxiliary_loss_clip": 0.01305443, + "auxiliary_loss_mlp": 0.00286994, + "balance_loss_clip": 1.08627105, + "balance_loss_mlp": 0.26093459, + "epoch": 0.6454231173906508, + "flos": 17676381841920.0, + "grad_norm": 131.4420528248961, + "language_loss": 0.80526853, + "learning_rate": 1.1799735554626191e-06, + "loss": 0.82119286, + "num_input_tokens_seen": 231618765, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.26062012, + "step": 10735, + "time_per_iteration": 2.6392030715942383 + }, + { + "auxiliary_loss_clip": 0.01330969, + "auxiliary_loss_mlp": 0.00281154, + "balance_loss_clip": 1.09827232, + "balance_loss_mlp": 0.25317585, + "epoch": 0.6454832406433189, + "flos": 23292774368640.0, + "grad_norm": 20.825800911493026, + "language_loss": 0.81851739, + "learning_rate": 1.1796183509663176e-06, + "loss": 0.8346386, + "num_input_tokens_seen": 231638525, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.27966309, + "step": 10736, + "time_per_iteration": 2.7035329341888428 + }, + { + "auxiliary_loss_clip": 0.01341724, + "auxiliary_loss_mlp": 0.00340651, + "balance_loss_clip": 1.10309303, + "balance_loss_mlp": 0.31155241, + "epoch": 0.6455433638959868, + "flos": 20157198572160.0, + "grad_norm": 9.252236520536625, + "language_loss": 0.79894322, + "learning_rate": 1.1792631775789708e-06, + "loss": 0.81576693, + "num_input_tokens_seen": 231656785, + "router_z_loss_clip": 2.38867188, + "router_z_loss_mlp": 0.29077148, + "step": 10737, + "time_per_iteration": 2.662172794342041 + }, + { + "auxiliary_loss_clip": 0.0127804, + "auxiliary_loss_mlp": 0.00169682, + "balance_loss_clip": 1.12661386, + "balance_loss_mlp": 0.15971565, + "epoch": 0.6456034871486548, + "flos": 66532922012160.0, + "grad_norm": 0.7816289395522237, + "language_loss": 0.57614064, + "learning_rate": 1.1789080353140464e-06, + "loss": 0.59061778, + "num_input_tokens_seen": 231719075, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.09960938, + "step": 10738, + "time_per_iteration": 3.222099781036377 + }, + { + "auxiliary_loss_clip": 0.01326658, + "auxiliary_loss_mlp": 0.00319654, + "balance_loss_clip": 1.09726882, + "balance_loss_mlp": 0.29217663, + "epoch": 0.6456636104013227, + "flos": 24206090509440.0, + "grad_norm": 24.229133226154076, + "language_loss": 0.81520712, + "learning_rate": 1.1785529241850118e-06, + "loss": 0.83167028, + "num_input_tokens_seen": 231737810, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.27453613, + "step": 10739, + "time_per_iteration": 4.026391267776489 + }, + { + "auxiliary_loss_clip": 0.01357341, + "auxiliary_loss_mlp": 0.00279314, + "balance_loss_clip": 1.11291504, + "balance_loss_mlp": 0.25119245, + "epoch": 0.6457237336539907, + "flos": 23624086440960.0, + "grad_norm": 24.529417613839975, + "language_loss": 0.81073087, + "learning_rate": 1.1781978442053324e-06, + "loss": 0.82709742, + "num_input_tokens_seen": 231756140, + "router_z_loss_clip": 2.44726562, + "router_z_loss_mlp": 0.28112793, + "step": 10740, + "time_per_iteration": 4.098350524902344 + }, + { + "auxiliary_loss_clip": 0.0124572, + "auxiliary_loss_mlp": 0.00250186, + "balance_loss_clip": 1.09107995, + "balance_loss_mlp": 0.23616698, + "epoch": 0.6457838569066586, + "flos": 65846023251840.0, + "grad_norm": 0.6904326731980418, + "language_loss": 0.54570508, + "learning_rate": 1.1778427953884733e-06, + "loss": 0.56066406, + "num_input_tokens_seen": 231823665, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.140625, + "step": 10741, + "time_per_iteration": 3.142732858657837 + }, + { + "auxiliary_loss_clip": 0.01329505, + "auxiliary_loss_mlp": 0.00329344, + "balance_loss_clip": 1.10058069, + "balance_loss_mlp": 0.30074573, + "epoch": 0.6458439801593266, + "flos": 22381972179840.0, + "grad_norm": 2.403242501216037, + "language_loss": 0.85374171, + "learning_rate": 1.1774877777478977e-06, + "loss": 0.87033015, + "num_input_tokens_seen": 231844500, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.28625488, + "step": 10742, + "time_per_iteration": 2.6630959510803223 + }, + { + "auxiliary_loss_clip": 0.01331913, + "auxiliary_loss_mlp": 0.00302407, + "balance_loss_clip": 1.10325646, + "balance_loss_mlp": 0.27448821, + "epoch": 0.6459041034119946, + "flos": 24789243813120.0, + "grad_norm": 3.6667474233441446, + "language_loss": 0.870942, + "learning_rate": 1.1771327912970678e-06, + "loss": 0.88728517, + "num_input_tokens_seen": 231864510, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.27941895, + "step": 10743, + "time_per_iteration": 4.064514398574829 + }, + { + "auxiliary_loss_clip": 0.012994, + "auxiliary_loss_mlp": 0.00315404, + "balance_loss_clip": 1.07820904, + "balance_loss_mlp": 0.2876049, + "epoch": 0.6459642266646626, + "flos": 18325358818560.0, + "grad_norm": 60.936628852573364, + "language_loss": 0.78146189, + "learning_rate": 1.1767778360494453e-06, + "loss": 0.79760993, + "num_input_tokens_seen": 231881555, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.27770996, + "step": 10744, + "time_per_iteration": 2.698080539703369 + }, + { + "auxiliary_loss_clip": 0.01307335, + "auxiliary_loss_mlp": 0.00303401, + "balance_loss_clip": 1.08324337, + "balance_loss_mlp": 0.27487412, + "epoch": 0.6460243499173305, + "flos": 43581368891520.0, + "grad_norm": 244.25563984337583, + "language_loss": 0.74622309, + "learning_rate": 1.1764229120184896e-06, + "loss": 0.76233041, + "num_input_tokens_seen": 231905945, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.28515625, + "step": 10745, + "time_per_iteration": 2.8522021770477295 + }, + { + "auxiliary_loss_clip": 0.01328276, + "auxiliary_loss_mlp": 0.00341423, + "balance_loss_clip": 1.09755492, + "balance_loss_mlp": 0.31033334, + "epoch": 0.6460844731699985, + "flos": 19244026085760.0, + "grad_norm": 319.2883526141002, + "language_loss": 0.83232874, + "learning_rate": 1.1760680192176597e-06, + "loss": 0.84902573, + "num_input_tokens_seen": 231922535, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.31054688, + "step": 10746, + "time_per_iteration": 2.80941104888916 + }, + { + "auxiliary_loss_clip": 0.01347739, + "auxiliary_loss_mlp": 0.00308635, + "balance_loss_clip": 1.11012089, + "balance_loss_mlp": 0.28031093, + "epoch": 0.6461445964226664, + "flos": 27453348668160.0, + "grad_norm": 6.191671461099192, + "language_loss": 0.73541844, + "learning_rate": 1.175713157660413e-06, + "loss": 0.75198221, + "num_input_tokens_seen": 231944800, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.2833252, + "step": 10747, + "time_per_iteration": 2.7193171977996826 + }, + { + "auxiliary_loss_clip": 0.01341882, + "auxiliary_loss_mlp": 0.00280693, + "balance_loss_clip": 1.10703421, + "balance_loss_mlp": 0.25440785, + "epoch": 0.6462047196753344, + "flos": 20295489934080.0, + "grad_norm": 20.102376468234404, + "language_loss": 0.76496315, + "learning_rate": 1.1753583273602056e-06, + "loss": 0.78118885, + "num_input_tokens_seen": 231962970, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.26306152, + "step": 10748, + "time_per_iteration": 4.173394441604614 + }, + { + "auxiliary_loss_clip": 0.0132423, + "auxiliary_loss_mlp": 0.00321309, + "balance_loss_clip": 1.09281838, + "balance_loss_mlp": 0.29015988, + "epoch": 0.6462648429280025, + "flos": 22018340845440.0, + "grad_norm": 7.855046563370563, + "language_loss": 0.82895851, + "learning_rate": 1.1750035283304937e-06, + "loss": 0.84541392, + "num_input_tokens_seen": 231981195, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.31164551, + "step": 10749, + "time_per_iteration": 2.6884281635284424 + }, + { + "auxiliary_loss_clip": 0.01338155, + "auxiliary_loss_mlp": 0.00308455, + "balance_loss_clip": 1.10100746, + "balance_loss_mlp": 0.2792486, + "epoch": 0.6463249661806704, + "flos": 27781141207680.0, + "grad_norm": 15.7975072149449, + "language_loss": 0.8390249, + "learning_rate": 1.17464876058473e-06, + "loss": 0.85549098, + "num_input_tokens_seen": 232001735, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.29199219, + "step": 10750, + "time_per_iteration": 2.718506336212158 + }, + { + "auxiliary_loss_clip": 0.01339825, + "auxiliary_loss_mlp": 0.00338753, + "balance_loss_clip": 1.1039772, + "balance_loss_mlp": 0.30900997, + "epoch": 0.6463850894333384, + "flos": 22050588280320.0, + "grad_norm": 8.429314550379159, + "language_loss": 0.77104133, + "learning_rate": 1.1742940241363683e-06, + "loss": 0.78782713, + "num_input_tokens_seen": 232019830, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.29748535, + "step": 10751, + "time_per_iteration": 2.7429609298706055 + }, + { + "auxiliary_loss_clip": 0.01342395, + "auxiliary_loss_mlp": 0.00325281, + "balance_loss_clip": 1.10439014, + "balance_loss_mlp": 0.2948468, + "epoch": 0.6464452126860063, + "flos": 21106245767040.0, + "grad_norm": 17.47510365386952, + "language_loss": 0.81949103, + "learning_rate": 1.1739393189988604e-06, + "loss": 0.83616781, + "num_input_tokens_seen": 232039625, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.30432129, + "step": 10752, + "time_per_iteration": 2.738900661468506 + }, + { + "auxiliary_loss_clip": 0.01354249, + "auxiliary_loss_mlp": 0.00295482, + "balance_loss_clip": 1.10984087, + "balance_loss_mlp": 0.26789695, + "epoch": 0.6465053359386743, + "flos": 16028045694720.0, + "grad_norm": 3.2119133803899858, + "language_loss": 0.85521722, + "learning_rate": 1.1735846451856554e-06, + "loss": 0.87171447, + "num_input_tokens_seen": 232055855, + "router_z_loss_clip": 2.44335938, + "router_z_loss_mlp": 0.27587891, + "step": 10753, + "time_per_iteration": 2.5810434818267822 + }, + { + "auxiliary_loss_clip": 0.01356084, + "auxiliary_loss_mlp": 0.00275643, + "balance_loss_clip": 1.11618018, + "balance_loss_mlp": 0.24752162, + "epoch": 0.6465654591913422, + "flos": 23398674641280.0, + "grad_norm": 14.851778734079934, + "language_loss": 0.91777778, + "learning_rate": 1.1732300027102041e-06, + "loss": 0.93409508, + "num_input_tokens_seen": 232073475, + "router_z_loss_clip": 2.40039062, + "router_z_loss_mlp": 0.28137207, + "step": 10754, + "time_per_iteration": 2.6652534008026123 + }, + { + "auxiliary_loss_clip": 0.01348586, + "auxiliary_loss_mlp": 0.00306996, + "balance_loss_clip": 1.11220014, + "balance_loss_mlp": 0.27808744, + "epoch": 0.6466255824440102, + "flos": 15377273038080.0, + "grad_norm": 784.7520789983366, + "language_loss": 0.67307508, + "learning_rate": 1.1728753915859541e-06, + "loss": 0.68963087, + "num_input_tokens_seen": 232091090, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.28894043, + "step": 10755, + "time_per_iteration": 2.6186342239379883 + }, + { + "auxiliary_loss_clip": 0.01330873, + "auxiliary_loss_mlp": 0.00304834, + "balance_loss_clip": 1.09996843, + "balance_loss_mlp": 0.27653384, + "epoch": 0.6466857056966782, + "flos": 16252846963200.0, + "grad_norm": 91.06577232174001, + "language_loss": 0.74805844, + "learning_rate": 1.1725208118263518e-06, + "loss": 0.7644155, + "num_input_tokens_seen": 232107320, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.28295898, + "step": 10756, + "time_per_iteration": 2.6364352703094482 + }, + { + "auxiliary_loss_clip": 0.01347428, + "auxiliary_loss_mlp": 0.00291438, + "balance_loss_clip": 1.10717773, + "balance_loss_mlp": 0.26222014, + "epoch": 0.6467458289493462, + "flos": 21178246579200.0, + "grad_norm": 325.8720967911381, + "language_loss": 0.85229284, + "learning_rate": 1.172166263444844e-06, + "loss": 0.86868155, + "num_input_tokens_seen": 232123930, + "router_z_loss_clip": 2.40039062, + "router_z_loss_mlp": 0.29248047, + "step": 10757, + "time_per_iteration": 2.628664255142212 + }, + { + "auxiliary_loss_clip": 0.01329231, + "auxiliary_loss_mlp": 0.00282996, + "balance_loss_clip": 1.10330987, + "balance_loss_mlp": 0.25632867, + "epoch": 0.6468059522020141, + "flos": 17968299672960.0, + "grad_norm": 121.10742290079726, + "language_loss": 0.80957353, + "learning_rate": 1.1718117464548734e-06, + "loss": 0.82569587, + "num_input_tokens_seen": 232142905, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.26672363, + "step": 10758, + "time_per_iteration": 2.625394821166992 + }, + { + "auxiliary_loss_clip": 0.01338379, + "auxiliary_loss_mlp": 0.00288376, + "balance_loss_clip": 1.10378814, + "balance_loss_mlp": 0.26002765, + "epoch": 0.6468660754546821, + "flos": 17890157635200.0, + "grad_norm": 8.687228669089507, + "language_loss": 0.75754702, + "learning_rate": 1.1714572608698845e-06, + "loss": 0.77381456, + "num_input_tokens_seen": 232162230, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.28356934, + "step": 10759, + "time_per_iteration": 2.6569833755493164 + }, + { + "auxiliary_loss_clip": 0.01350751, + "auxiliary_loss_mlp": 0.00315239, + "balance_loss_clip": 1.11034489, + "balance_loss_mlp": 0.28548473, + "epoch": 0.64692619870735, + "flos": 22600991358720.0, + "grad_norm": 3.8941473003734868, + "language_loss": 0.84276462, + "learning_rate": 1.1711028067033197e-06, + "loss": 0.85942453, + "num_input_tokens_seen": 232182700, + "router_z_loss_clip": 2.40429688, + "router_z_loss_mlp": 0.29772949, + "step": 10760, + "time_per_iteration": 2.687274932861328 + }, + { + "auxiliary_loss_clip": 0.01312002, + "auxiliary_loss_mlp": 0.0029163, + "balance_loss_clip": 1.09101236, + "balance_loss_mlp": 0.26514208, + "epoch": 0.646986321960018, + "flos": 49600786993920.0, + "grad_norm": 2.2837612846846294, + "language_loss": 0.71805936, + "learning_rate": 1.1707483839686194e-06, + "loss": 0.73409569, + "num_input_tokens_seen": 232208235, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.26489258, + "step": 10761, + "time_per_iteration": 2.905994415283203 + }, + { + "auxiliary_loss_clip": 0.01324376, + "auxiliary_loss_mlp": 0.00289977, + "balance_loss_clip": 1.09917188, + "balance_loss_mlp": 0.26309496, + "epoch": 0.6470464452126861, + "flos": 21908454163200.0, + "grad_norm": 383.6544492556188, + "language_loss": 0.7960043, + "learning_rate": 1.1703939926792235e-06, + "loss": 0.81214786, + "num_input_tokens_seen": 232228720, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.2689209, + "step": 10762, + "time_per_iteration": 2.673093318939209 + }, + { + "auxiliary_loss_clip": 0.01336074, + "auxiliary_loss_mlp": 0.00269285, + "balance_loss_clip": 1.09851921, + "balance_loss_mlp": 0.24057984, + "epoch": 0.647106568465354, + "flos": 18106124158080.0, + "grad_norm": 18.260874418798167, + "language_loss": 0.89714921, + "learning_rate": 1.1700396328485705e-06, + "loss": 0.91320276, + "num_input_tokens_seen": 232244655, + "router_z_loss_clip": 2.37890625, + "router_z_loss_mlp": 0.28735352, + "step": 10763, + "time_per_iteration": 2.633495330810547 + }, + { + "auxiliary_loss_clip": 0.01259553, + "auxiliary_loss_mlp": 0.00142986, + "balance_loss_clip": 1.10569191, + "balance_loss_mlp": 0.13363951, + "epoch": 0.647166691718022, + "flos": 69480038125440.0, + "grad_norm": 0.7135658666553623, + "language_loss": 0.57263792, + "learning_rate": 1.1696853044900978e-06, + "loss": 0.58666331, + "num_input_tokens_seen": 232308685, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.09326172, + "step": 10764, + "time_per_iteration": 3.382359504699707 + }, + { + "auxiliary_loss_clip": 0.01344946, + "auxiliary_loss_mlp": 0.00270749, + "balance_loss_clip": 1.10814929, + "balance_loss_mlp": 0.24401061, + "epoch": 0.6472268149706899, + "flos": 34095170661120.0, + "grad_norm": 44.148347374971486, + "language_loss": 0.68867326, + "learning_rate": 1.1693310076172413e-06, + "loss": 0.70483029, + "num_input_tokens_seen": 232327520, + "router_z_loss_clip": 2.37109375, + "router_z_loss_mlp": 0.26745605, + "step": 10765, + "time_per_iteration": 2.844805955886841 + }, + { + "auxiliary_loss_clip": 0.01321875, + "auxiliary_loss_mlp": 0.00254692, + "balance_loss_clip": 1.09764957, + "balance_loss_mlp": 0.23127964, + "epoch": 0.6472869382233579, + "flos": 28111232217600.0, + "grad_norm": 32.363547897738556, + "language_loss": 0.70425141, + "learning_rate": 1.168976742243437e-06, + "loss": 0.72001714, + "num_input_tokens_seen": 232349025, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.23400879, + "step": 10766, + "time_per_iteration": 2.725289821624756 + }, + { + "auxiliary_loss_clip": 0.01344679, + "auxiliary_loss_mlp": 0.00286274, + "balance_loss_clip": 1.11068082, + "balance_loss_mlp": 0.25831988, + "epoch": 0.6473470614760258, + "flos": 22492146170880.0, + "grad_norm": 15.20918143205395, + "language_loss": 0.83187079, + "learning_rate": 1.1686225083821174e-06, + "loss": 0.84818035, + "num_input_tokens_seen": 232367835, + "router_z_loss_clip": 2.33789062, + "router_z_loss_mlp": 0.27941895, + "step": 10767, + "time_per_iteration": 2.6951723098754883 + }, + { + "auxiliary_loss_clip": 0.01345116, + "auxiliary_loss_mlp": 0.00291186, + "balance_loss_clip": 1.10492575, + "balance_loss_mlp": 0.26193228, + "epoch": 0.6474071847286939, + "flos": 14538938538240.0, + "grad_norm": 19.975224066405758, + "language_loss": 0.86490571, + "learning_rate": 1.1682683060467153e-06, + "loss": 0.88126868, + "num_input_tokens_seen": 232385840, + "router_z_loss_clip": 2.3984375, + "router_z_loss_mlp": 0.29211426, + "step": 10768, + "time_per_iteration": 2.611851453781128 + }, + { + "auxiliary_loss_clip": 0.01334769, + "auxiliary_loss_mlp": 0.00275231, + "balance_loss_clip": 1.10100198, + "balance_loss_mlp": 0.24831353, + "epoch": 0.6474673079813618, + "flos": 24098214988800.0, + "grad_norm": 28.26879493342924, + "language_loss": 0.77963531, + "learning_rate": 1.167914135250663e-06, + "loss": 0.7957353, + "num_input_tokens_seen": 232406205, + "router_z_loss_clip": 2.33789062, + "router_z_loss_mlp": 0.26904297, + "step": 10769, + "time_per_iteration": 2.733921766281128 + }, + { + "auxiliary_loss_clip": 0.01320033, + "auxiliary_loss_mlp": 0.00245359, + "balance_loss_clip": 1.09681988, + "balance_loss_mlp": 0.22185141, + "epoch": 0.6475274312340298, + "flos": 14976186796800.0, + "grad_norm": 4.639272128220012, + "language_loss": 0.81506139, + "learning_rate": 1.1675599960073895e-06, + "loss": 0.8307153, + "num_input_tokens_seen": 232424995, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.23510742, + "step": 10770, + "time_per_iteration": 2.6413686275482178 + }, + { + "auxiliary_loss_clip": 0.01345824, + "auxiliary_loss_mlp": 0.0030139, + "balance_loss_clip": 1.10437727, + "balance_loss_mlp": 0.27020502, + "epoch": 0.6475875544866977, + "flos": 25045322849280.0, + "grad_norm": 9.22970949790574, + "language_loss": 0.80166233, + "learning_rate": 1.167205888330325e-06, + "loss": 0.81813443, + "num_input_tokens_seen": 232445870, + "router_z_loss_clip": 2.41796875, + "router_z_loss_mlp": 0.31176758, + "step": 10771, + "time_per_iteration": 2.6896181106567383 + }, + { + "auxiliary_loss_clip": 0.01356822, + "auxiliary_loss_mlp": 0.00254782, + "balance_loss_clip": 1.11836958, + "balance_loss_mlp": 0.22866371, + "epoch": 0.6476476777393657, + "flos": 16472153450880.0, + "grad_norm": 17.109170471519175, + "language_loss": 0.82782757, + "learning_rate": 1.1668518122328958e-06, + "loss": 0.8439436, + "num_input_tokens_seen": 232464285, + "router_z_loss_clip": 2.3828125, + "router_z_loss_mlp": 0.2611084, + "step": 10772, + "time_per_iteration": 2.636303424835205 + }, + { + "auxiliary_loss_clip": 0.01313425, + "auxiliary_loss_mlp": 0.00253298, + "balance_loss_clip": 1.0933646, + "balance_loss_mlp": 0.22970621, + "epoch": 0.6477078009920336, + "flos": 25812267068160.0, + "grad_norm": 2.779353302228359, + "language_loss": 0.86815, + "learning_rate": 1.1664977677285305e-06, + "loss": 0.8838172, + "num_input_tokens_seen": 232485815, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.23608398, + "step": 10773, + "time_per_iteration": 2.738797187805176 + }, + { + "auxiliary_loss_clip": 0.01317412, + "auxiliary_loss_mlp": 0.00282746, + "balance_loss_clip": 1.09484887, + "balance_loss_mlp": 0.25622159, + "epoch": 0.6477679242447016, + "flos": 17676130446720.0, + "grad_norm": 25.056881632149924, + "language_loss": 0.85503864, + "learning_rate": 1.1661437548306524e-06, + "loss": 0.87104017, + "num_input_tokens_seen": 232504875, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.26525879, + "step": 10774, + "time_per_iteration": 2.7165367603302 + }, + { + "auxiliary_loss_clip": 0.01337698, + "auxiliary_loss_mlp": 0.00291456, + "balance_loss_clip": 1.09878194, + "balance_loss_mlp": 0.26315585, + "epoch": 0.6478280474973696, + "flos": 21032305620480.0, + "grad_norm": 31.106835653613732, + "language_loss": 0.82331723, + "learning_rate": 1.1657897735526867e-06, + "loss": 0.83960879, + "num_input_tokens_seen": 232521945, + "router_z_loss_clip": 2.38867188, + "router_z_loss_mlp": 0.28308105, + "step": 10775, + "time_per_iteration": 2.6899709701538086 + }, + { + "auxiliary_loss_clip": 0.01355915, + "auxiliary_loss_mlp": 0.00277137, + "balance_loss_clip": 1.11401057, + "balance_loss_mlp": 0.24784693, + "epoch": 0.6478881707500376, + "flos": 21616931381760.0, + "grad_norm": 2499.9818019981512, + "language_loss": 0.74004424, + "learning_rate": 1.1654358239080574e-06, + "loss": 0.75637472, + "num_input_tokens_seen": 232541500, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.29284668, + "step": 10776, + "time_per_iteration": 2.670818328857422 + }, + { + "auxiliary_loss_clip": 0.01360637, + "auxiliary_loss_mlp": 0.00254698, + "balance_loss_clip": 1.11568558, + "balance_loss_mlp": 0.22608779, + "epoch": 0.6479482940027056, + "flos": 18442571875200.0, + "grad_norm": 8.899358488006493, + "language_loss": 0.93113399, + "learning_rate": 1.1650819059101839e-06, + "loss": 0.94728732, + "num_input_tokens_seen": 232559720, + "router_z_loss_clip": 2.44921875, + "router_z_loss_mlp": 0.28601074, + "step": 10777, + "time_per_iteration": 2.6123886108398438 + }, + { + "auxiliary_loss_clip": 0.01336811, + "auxiliary_loss_mlp": 0.00257837, + "balance_loss_clip": 1.10277283, + "balance_loss_mlp": 0.23065743, + "epoch": 0.6480084172553735, + "flos": 22164066322560.0, + "grad_norm": 16.090908856207978, + "language_loss": 0.81631726, + "learning_rate": 1.1647280195724896e-06, + "loss": 0.83226371, + "num_input_tokens_seen": 232579370, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.27185059, + "step": 10778, + "time_per_iteration": 2.6568751335144043 + }, + { + "auxiliary_loss_clip": 0.01308834, + "auxiliary_loss_mlp": 0.0027704, + "balance_loss_clip": 1.08525181, + "balance_loss_mlp": 0.24902616, + "epoch": 0.6480685405080415, + "flos": 24316228586880.0, + "grad_norm": 79160.87210018265, + "language_loss": 0.83842218, + "learning_rate": 1.1643741649083923e-06, + "loss": 0.85428089, + "num_input_tokens_seen": 232600495, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.28015137, + "step": 10779, + "time_per_iteration": 2.6914546489715576 + }, + { + "auxiliary_loss_clip": 0.01289477, + "auxiliary_loss_mlp": 0.00055286, + "balance_loss_clip": 1.13717651, + "balance_loss_mlp": 0.04608352, + "epoch": 0.6481286637607094, + "flos": 59891207760000.0, + "grad_norm": 0.7173531710917146, + "language_loss": 0.5862698, + "learning_rate": 1.1640203419313095e-06, + "loss": 0.59971744, + "num_input_tokens_seen": 232663165, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.09179688, + "step": 10780, + "time_per_iteration": 3.208174467086792 + }, + { + "auxiliary_loss_clip": 0.01343452, + "auxiliary_loss_mlp": 0.00263436, + "balance_loss_clip": 1.10809398, + "balance_loss_mlp": 0.23525444, + "epoch": 0.6481887870133775, + "flos": 25484187219840.0, + "grad_norm": 6.8958670901715395, + "language_loss": 0.8721869, + "learning_rate": 1.1636665506546599e-06, + "loss": 0.88825583, + "num_input_tokens_seen": 232683385, + "router_z_loss_clip": 2.35546875, + "router_z_loss_mlp": 0.28186035, + "step": 10781, + "time_per_iteration": 4.106105089187622 + }, + { + "auxiliary_loss_clip": 0.01353539, + "auxiliary_loss_mlp": 0.0029613, + "balance_loss_clip": 1.10981393, + "balance_loss_mlp": 0.26477778, + "epoch": 0.6482489102660454, + "flos": 19930206574080.0, + "grad_norm": 63.73623491790509, + "language_loss": 0.88285816, + "learning_rate": 1.1633127910918578e-06, + "loss": 0.89935482, + "num_input_tokens_seen": 232699095, + "router_z_loss_clip": 2.43554688, + "router_z_loss_mlp": 0.31347656, + "step": 10782, + "time_per_iteration": 3.995553970336914 + }, + { + "auxiliary_loss_clip": 0.01342911, + "auxiliary_loss_mlp": 0.00269892, + "balance_loss_clip": 1.10329831, + "balance_loss_mlp": 0.24166296, + "epoch": 0.6483090335187134, + "flos": 26979471515520.0, + "grad_norm": 78.61265279091884, + "language_loss": 0.76340872, + "learning_rate": 1.1629590632563187e-06, + "loss": 0.77953672, + "num_input_tokens_seen": 232717920, + "router_z_loss_clip": 2.39453125, + "router_z_loss_mlp": 0.28259277, + "step": 10783, + "time_per_iteration": 2.66917085647583 + }, + { + "auxiliary_loss_clip": 0.01362366, + "auxiliary_loss_mlp": 0.00280101, + "balance_loss_clip": 1.11683488, + "balance_loss_mlp": 0.24734193, + "epoch": 0.6483691567713813, + "flos": 25077965333760.0, + "grad_norm": 11.459332036609275, + "language_loss": 0.96980155, + "learning_rate": 1.1626053671614561e-06, + "loss": 0.9862262, + "num_input_tokens_seen": 232737605, + "router_z_loss_clip": 2.45507812, + "router_z_loss_mlp": 0.32751465, + "step": 10784, + "time_per_iteration": 2.6881103515625 + }, + { + "auxiliary_loss_clip": 0.01327628, + "auxiliary_loss_mlp": 0.00264081, + "balance_loss_clip": 1.0981276, + "balance_loss_mlp": 0.23656771, + "epoch": 0.6484292800240493, + "flos": 16105972250880.0, + "grad_norm": 239.09806897478433, + "language_loss": 0.83756113, + "learning_rate": 1.1622517028206815e-06, + "loss": 0.85347819, + "num_input_tokens_seen": 232755110, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.27526855, + "step": 10785, + "time_per_iteration": 4.0510499477386475 + }, + { + "auxiliary_loss_clip": 0.01318504, + "auxiliary_loss_mlp": 0.00267074, + "balance_loss_clip": 1.09514427, + "balance_loss_mlp": 0.24152738, + "epoch": 0.6484894032767172, + "flos": 28840398307200.0, + "grad_norm": 4.2229506196830116, + "language_loss": 0.74777281, + "learning_rate": 1.1618980702474071e-06, + "loss": 0.76362854, + "num_input_tokens_seen": 232779040, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.2557373, + "step": 10786, + "time_per_iteration": 2.7529633045196533 + }, + { + "auxiliary_loss_clip": 0.01323391, + "auxiliary_loss_mlp": 0.00262492, + "balance_loss_clip": 1.09358859, + "balance_loss_mlp": 0.2353245, + "epoch": 0.6485495265293852, + "flos": 30227052896640.0, + "grad_norm": 11.731744451924255, + "language_loss": 0.79601479, + "learning_rate": 1.161544469455041e-06, + "loss": 0.81187367, + "num_input_tokens_seen": 232800515, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.27185059, + "step": 10787, + "time_per_iteration": 2.7386820316314697 + }, + { + "auxiliary_loss_clip": 0.01343393, + "auxiliary_loss_mlp": 0.00247028, + "balance_loss_clip": 1.10071754, + "balance_loss_mlp": 0.21851283, + "epoch": 0.6486096497820532, + "flos": 20082181017600.0, + "grad_norm": 4.5152024859208755, + "language_loss": 0.93218565, + "learning_rate": 1.1611909004569934e-06, + "loss": 0.94808978, + "num_input_tokens_seen": 232818450, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.28491211, + "step": 10788, + "time_per_iteration": 2.6683664321899414 + }, + { + "auxiliary_loss_clip": 0.01350534, + "auxiliary_loss_mlp": 0.00271396, + "balance_loss_clip": 1.11250591, + "balance_loss_mlp": 0.24333426, + "epoch": 0.6486697730347212, + "flos": 17129067333120.0, + "grad_norm": 4.70036056042283, + "language_loss": 0.86564225, + "learning_rate": 1.1608373632666708e-06, + "loss": 0.88186157, + "num_input_tokens_seen": 232834785, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.28076172, + "step": 10789, + "time_per_iteration": 2.631643533706665 + }, + { + "auxiliary_loss_clip": 0.01349984, + "auxiliary_loss_mlp": 0.0025723, + "balance_loss_clip": 1.11410141, + "balance_loss_mlp": 0.22888175, + "epoch": 0.6487298962873892, + "flos": 38911940570880.0, + "grad_norm": 1.867650976865241, + "language_loss": 0.82616329, + "learning_rate": 1.160483857897479e-06, + "loss": 0.84223545, + "num_input_tokens_seen": 232856050, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.2833252, + "step": 10790, + "time_per_iteration": 4.225125312805176 + }, + { + "auxiliary_loss_clip": 0.01338725, + "auxiliary_loss_mlp": 0.00264431, + "balance_loss_clip": 1.10341883, + "balance_loss_mlp": 0.23650065, + "epoch": 0.6487900195400571, + "flos": 11947840076160.0, + "grad_norm": 9.56806208971264, + "language_loss": 0.70118952, + "learning_rate": 1.160130384362823e-06, + "loss": 0.71722102, + "num_input_tokens_seen": 232873945, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.27966309, + "step": 10791, + "time_per_iteration": 2.6226983070373535 + }, + { + "auxiliary_loss_clip": 0.01315325, + "auxiliary_loss_mlp": 0.00287412, + "balance_loss_clip": 1.08614385, + "balance_loss_mlp": 0.2608521, + "epoch": 0.6488501427927251, + "flos": 22344445445760.0, + "grad_norm": 55.532448080571946, + "language_loss": 0.93290299, + "learning_rate": 1.1597769426761082e-06, + "loss": 0.94893038, + "num_input_tokens_seen": 232892160, + "router_z_loss_clip": 2.29101562, + "router_z_loss_mlp": 0.265625, + "step": 10792, + "time_per_iteration": 2.669800043106079 + }, + { + "auxiliary_loss_clip": 0.01334541, + "auxiliary_loss_mlp": 0.00274743, + "balance_loss_clip": 1.09680295, + "balance_loss_mlp": 0.24702723, + "epoch": 0.648910266045393, + "flos": 22236282616320.0, + "grad_norm": 39.689573973104004, + "language_loss": 0.87642914, + "learning_rate": 1.159423532850735e-06, + "loss": 0.89252198, + "num_input_tokens_seen": 232911725, + "router_z_loss_clip": 2.37792969, + "router_z_loss_mlp": 0.27734375, + "step": 10793, + "time_per_iteration": 2.7394931316375732 + }, + { + "auxiliary_loss_clip": 0.0133411, + "auxiliary_loss_mlp": 0.00255158, + "balance_loss_clip": 1.09483576, + "balance_loss_mlp": 0.22806132, + "epoch": 0.6489703892980611, + "flos": 25301258231040.0, + "grad_norm": 189.64928913580118, + "language_loss": 0.81923133, + "learning_rate": 1.1590701549001055e-06, + "loss": 0.83512396, + "num_input_tokens_seen": 232929085, + "router_z_loss_clip": 2.39453125, + "router_z_loss_mlp": 0.27099609, + "step": 10794, + "time_per_iteration": 2.659733533859253 + }, + { + "auxiliary_loss_clip": 0.01325891, + "auxiliary_loss_mlp": 0.00258847, + "balance_loss_clip": 1.09341717, + "balance_loss_mlp": 0.2319061, + "epoch": 0.649030512550729, + "flos": 24571912573440.0, + "grad_norm": 3.7494719571295607, + "language_loss": 0.79422694, + "learning_rate": 1.158716808837621e-06, + "loss": 0.81007439, + "num_input_tokens_seen": 232949455, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.26928711, + "step": 10795, + "time_per_iteration": 2.6960673332214355 + }, + { + "auxiliary_loss_clip": 0.0132579, + "auxiliary_loss_mlp": 0.00260981, + "balance_loss_clip": 1.09294939, + "balance_loss_mlp": 0.23345591, + "epoch": 0.649090635803397, + "flos": 26244702904320.0, + "grad_norm": 47.42106347021005, + "language_loss": 0.63449788, + "learning_rate": 1.158363494676679e-06, + "loss": 0.65036559, + "num_input_tokens_seen": 232969445, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.27539062, + "step": 10796, + "time_per_iteration": 2.7699289321899414 + }, + { + "auxiliary_loss_clip": 0.01309481, + "auxiliary_loss_mlp": 0.0025496, + "balance_loss_clip": 1.08294535, + "balance_loss_mlp": 0.22910401, + "epoch": 0.6491507590560649, + "flos": 24937375501440.0, + "grad_norm": 68.54088277262782, + "language_loss": 0.85224193, + "learning_rate": 1.1580102124306775e-06, + "loss": 0.8678863, + "num_input_tokens_seen": 232988900, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.25830078, + "step": 10797, + "time_per_iteration": 2.6600160598754883 + }, + { + "auxiliary_loss_clip": 0.01305197, + "auxiliary_loss_mlp": 0.00238306, + "balance_loss_clip": 1.08357942, + "balance_loss_mlp": 0.2145004, + "epoch": 0.6492108823087329, + "flos": 19499781899520.0, + "grad_norm": 40.57215913000819, + "language_loss": 0.77702451, + "learning_rate": 1.1576569621130134e-06, + "loss": 0.79245955, + "num_input_tokens_seen": 233005060, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.23803711, + "step": 10798, + "time_per_iteration": 2.6526808738708496 + }, + { + "auxiliary_loss_clip": 0.01302329, + "auxiliary_loss_mlp": 0.00254397, + "balance_loss_clip": 1.07667589, + "balance_loss_mlp": 0.2283856, + "epoch": 0.6492710055614008, + "flos": 19719303868800.0, + "grad_norm": 3.2033624557506575, + "language_loss": 0.82063174, + "learning_rate": 1.1573037437370811e-06, + "loss": 0.83619899, + "num_input_tokens_seen": 233023375, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.26013184, + "step": 10799, + "time_per_iteration": 2.6514625549316406 + }, + { + "auxiliary_loss_clip": 0.01317734, + "auxiliary_loss_mlp": 0.00261163, + "balance_loss_clip": 1.08127308, + "balance_loss_mlp": 0.23507985, + "epoch": 0.6493311288140688, + "flos": 24317018686080.0, + "grad_norm": 3.674873016486915, + "language_loss": 0.791924, + "learning_rate": 1.1569505573162755e-06, + "loss": 0.80771291, + "num_input_tokens_seen": 233043130, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.26098633, + "step": 10800, + "time_per_iteration": 2.6840381622314453 + }, + { + "auxiliary_loss_clip": 0.01246621, + "auxiliary_loss_mlp": 0.00037137, + "balance_loss_clip": 1.10311341, + "balance_loss_mlp": 0.02984124, + "epoch": 0.6493912520667368, + "flos": 70934635290240.0, + "grad_norm": 0.7604428421242306, + "language_loss": 0.59695607, + "learning_rate": 1.1565974028639897e-06, + "loss": 0.60979366, + "num_input_tokens_seen": 233110560, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.07275391, + "step": 10801, + "time_per_iteration": 3.238823175430298 + }, + { + "auxiliary_loss_clip": 0.01340394, + "auxiliary_loss_mlp": 0.00251214, + "balance_loss_clip": 1.10578763, + "balance_loss_mlp": 0.22368857, + "epoch": 0.6494513753194048, + "flos": 25337779384320.0, + "grad_norm": 6.0003088884595535, + "language_loss": 0.85357761, + "learning_rate": 1.156244280393614e-06, + "loss": 0.8694936, + "num_input_tokens_seen": 233130080, + "router_z_loss_clip": 2.34472656, + "router_z_loss_mlp": 0.27526855, + "step": 10802, + "time_per_iteration": 2.642927885055542 + }, + { + "auxiliary_loss_clip": 0.01299158, + "auxiliary_loss_mlp": 0.00244491, + "balance_loss_clip": 1.07487178, + "balance_loss_mlp": 0.21876523, + "epoch": 0.6495114985720728, + "flos": 24681978823680.0, + "grad_norm": 5.293251220230723, + "language_loss": 0.84095216, + "learning_rate": 1.155891189918541e-06, + "loss": 0.85638869, + "num_input_tokens_seen": 233150235, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.25744629, + "step": 10803, + "time_per_iteration": 2.637833595275879 + }, + { + "auxiliary_loss_clip": 0.01319215, + "auxiliary_loss_mlp": 0.00265643, + "balance_loss_clip": 1.08804369, + "balance_loss_mlp": 0.23878554, + "epoch": 0.6495716218247407, + "flos": 23651162317440.0, + "grad_norm": 5.546422514154618, + "language_loss": 0.81867421, + "learning_rate": 1.1555381314521578e-06, + "loss": 0.83452278, + "num_input_tokens_seen": 233166710, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.26879883, + "step": 10804, + "time_per_iteration": 2.643524408340454 + }, + { + "auxiliary_loss_clip": 0.01296793, + "auxiliary_loss_mlp": 0.0024966, + "balance_loss_clip": 1.07171845, + "balance_loss_mlp": 0.22491232, + "epoch": 0.6496317450774087, + "flos": 22346169298560.0, + "grad_norm": 16.335717509204155, + "language_loss": 0.80198616, + "learning_rate": 1.1551851050078537e-06, + "loss": 0.81745064, + "num_input_tokens_seen": 233185445, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.24719238, + "step": 10805, + "time_per_iteration": 2.661180019378662 + }, + { + "auxiliary_loss_clip": 0.01282467, + "auxiliary_loss_mlp": 0.00259115, + "balance_loss_clip": 1.06082177, + "balance_loss_mlp": 0.2336998, + "epoch": 0.6496918683300766, + "flos": 30518647505280.0, + "grad_norm": 4.483512410245368, + "language_loss": 0.76072735, + "learning_rate": 1.1548321105990155e-06, + "loss": 0.77614319, + "num_input_tokens_seen": 233205805, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.25415039, + "step": 10806, + "time_per_iteration": 2.720574378967285 + }, + { + "auxiliary_loss_clip": 0.01302445, + "auxiliary_loss_mlp": 0.00275668, + "balance_loss_clip": 1.07413697, + "balance_loss_mlp": 0.24894123, + "epoch": 0.6497519915827447, + "flos": 12458992567680.0, + "grad_norm": 70.05050627970263, + "language_loss": 0.90847087, + "learning_rate": 1.1544791482390275e-06, + "loss": 0.92425197, + "num_input_tokens_seen": 233224215, + "router_z_loss_clip": 2.28320312, + "router_z_loss_mlp": 0.26733398, + "step": 10807, + "time_per_iteration": 2.669102430343628 + }, + { + "auxiliary_loss_clip": 0.0125141, + "auxiliary_loss_mlp": 0.00077813, + "balance_loss_clip": 1.10896492, + "balance_loss_mlp": 0.07004017, + "epoch": 0.6498121148354126, + "flos": 69093748287360.0, + "grad_norm": 0.7798654109677798, + "language_loss": 0.5826239, + "learning_rate": 1.1541262179412745e-06, + "loss": 0.59591615, + "num_input_tokens_seen": 233294440, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.07763672, + "step": 10808, + "time_per_iteration": 3.3394484519958496 + }, + { + "auxiliary_loss_clip": 0.0132107, + "auxiliary_loss_mlp": 0.00271014, + "balance_loss_clip": 1.09410596, + "balance_loss_mlp": 0.2456466, + "epoch": 0.6498722380880806, + "flos": 36897135914880.0, + "grad_norm": 40.23593847558313, + "language_loss": 0.69967175, + "learning_rate": 1.1537733197191415e-06, + "loss": 0.71559262, + "num_input_tokens_seen": 233316125, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.25378418, + "step": 10809, + "time_per_iteration": 2.8150007724761963 + }, + { + "auxiliary_loss_clip": 0.01300553, + "auxiliary_loss_mlp": 0.00255318, + "balance_loss_clip": 1.07653749, + "balance_loss_mlp": 0.230892, + "epoch": 0.6499323613407485, + "flos": 29017760688000.0, + "grad_norm": 9.1422833826576, + "language_loss": 0.86161268, + "learning_rate": 1.153420453586008e-06, + "loss": 0.8771714, + "num_input_tokens_seen": 233336140, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.24438477, + "step": 10810, + "time_per_iteration": 2.7254974842071533 + }, + { + "auxiliary_loss_clip": 0.01274905, + "auxiliary_loss_mlp": 0.00258824, + "balance_loss_clip": 1.05977416, + "balance_loss_mlp": 0.23486295, + "epoch": 0.6499924845934165, + "flos": 20119240874880.0, + "grad_norm": 445.03035216624124, + "language_loss": 0.80536771, + "learning_rate": 1.1530676195552561e-06, + "loss": 0.820705, + "num_input_tokens_seen": 233356095, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.23986816, + "step": 10811, + "time_per_iteration": 2.7155773639678955 + }, + { + "auxiliary_loss_clip": 0.01292604, + "auxiliary_loss_mlp": 0.00240633, + "balance_loss_clip": 1.07647943, + "balance_loss_mlp": 0.21545586, + "epoch": 0.6500526078460844, + "flos": 24421338760320.0, + "grad_norm": 3.5987500807277457, + "language_loss": 0.83604407, + "learning_rate": 1.1527148176402649e-06, + "loss": 0.85137641, + "num_input_tokens_seen": 233376830, + "router_z_loss_clip": 2.16113281, + "router_z_loss_mlp": 0.25183105, + "step": 10812, + "time_per_iteration": 2.728624105453491 + }, + { + "auxiliary_loss_clip": 0.0130738, + "auxiliary_loss_mlp": 0.00244917, + "balance_loss_clip": 1.07861483, + "balance_loss_mlp": 0.22095615, + "epoch": 0.6501127310987524, + "flos": 23331019374720.0, + "grad_norm": 57.38540825969017, + "language_loss": 0.91851151, + "learning_rate": 1.152362047854413e-06, + "loss": 0.93403447, + "num_input_tokens_seen": 233395275, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.23937988, + "step": 10813, + "time_per_iteration": 2.6959879398345947 + }, + { + "auxiliary_loss_clip": 0.01300348, + "auxiliary_loss_mlp": 0.00254417, + "balance_loss_clip": 1.07556367, + "balance_loss_mlp": 0.22900152, + "epoch": 0.6501728543514204, + "flos": 18697824898560.0, + "grad_norm": 6.268670782662336, + "language_loss": 0.87700891, + "learning_rate": 1.1520093102110764e-06, + "loss": 0.89255655, + "num_input_tokens_seen": 233413345, + "router_z_loss_clip": 2.24707031, + "router_z_loss_mlp": 0.25427246, + "step": 10814, + "time_per_iteration": 2.624119281768799 + }, + { + "auxiliary_loss_clip": 0.01295442, + "auxiliary_loss_mlp": 0.00267859, + "balance_loss_clip": 1.07126594, + "balance_loss_mlp": 0.24131152, + "epoch": 0.6502329776040884, + "flos": 44199858199680.0, + "grad_norm": 10.703023218735192, + "language_loss": 0.74109316, + "learning_rate": 1.1516566047236328e-06, + "loss": 0.75672615, + "num_input_tokens_seen": 233436105, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.26538086, + "step": 10815, + "time_per_iteration": 2.852877140045166 + }, + { + "auxiliary_loss_clip": 0.01329269, + "auxiliary_loss_mlp": 0.00266482, + "balance_loss_clip": 1.09029078, + "balance_loss_mlp": 0.23788375, + "epoch": 0.6502931008567564, + "flos": 14574741419520.0, + "grad_norm": 2.0379314357948095, + "language_loss": 0.85311669, + "learning_rate": 1.1513039314054546e-06, + "loss": 0.86907423, + "num_input_tokens_seen": 233452320, + "router_z_loss_clip": 2.38476562, + "router_z_loss_mlp": 0.28613281, + "step": 10816, + "time_per_iteration": 2.6741676330566406 + }, + { + "auxiliary_loss_clip": 0.01306141, + "auxiliary_loss_mlp": 0.00240187, + "balance_loss_clip": 1.07985997, + "balance_loss_mlp": 0.21493825, + "epoch": 0.6503532241094243, + "flos": 21395003201280.0, + "grad_norm": 25.939950616979125, + "language_loss": 0.79197395, + "learning_rate": 1.1509512902699174e-06, + "loss": 0.80743718, + "num_input_tokens_seen": 233469920, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.25231934, + "step": 10817, + "time_per_iteration": 2.621716022491455 + }, + { + "auxiliary_loss_clip": 0.01307837, + "auxiliary_loss_mlp": 0.00260525, + "balance_loss_clip": 1.07818115, + "balance_loss_mlp": 0.23493078, + "epoch": 0.6504133473620923, + "flos": 74740840986240.0, + "grad_norm": 18.665952518613842, + "language_loss": 0.78025353, + "learning_rate": 1.1505986813303916e-06, + "loss": 0.79593718, + "num_input_tokens_seen": 233499780, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.25610352, + "step": 10818, + "time_per_iteration": 3.0672836303710938 + }, + { + "auxiliary_loss_clip": 0.01301771, + "auxiliary_loss_mlp": 0.00239627, + "balance_loss_clip": 1.07335925, + "balance_loss_mlp": 0.21478328, + "epoch": 0.6504734706147602, + "flos": 19713270384000.0, + "grad_norm": 16.941868268918817, + "language_loss": 0.73588908, + "learning_rate": 1.150246104600249e-06, + "loss": 0.75130308, + "num_input_tokens_seen": 233518235, + "router_z_loss_clip": 2.28320312, + "router_z_loss_mlp": 0.24816895, + "step": 10819, + "time_per_iteration": 2.6197569370269775 + }, + { + "auxiliary_loss_clip": 0.01300378, + "auxiliary_loss_mlp": 0.00236663, + "balance_loss_clip": 1.07193351, + "balance_loss_mlp": 0.21030548, + "epoch": 0.6505335938674283, + "flos": 25556870390400.0, + "grad_norm": 195.40005439179754, + "language_loss": 0.90328932, + "learning_rate": 1.14989356009286e-06, + "loss": 0.91865969, + "num_input_tokens_seen": 233535215, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.26391602, + "step": 10820, + "time_per_iteration": 2.68359112739563 + }, + { + "auxiliary_loss_clip": 0.01296191, + "auxiliary_loss_mlp": 0.00240695, + "balance_loss_clip": 1.0693531, + "balance_loss_mlp": 0.21370634, + "epoch": 0.6505937171200962, + "flos": 17821424960640.0, + "grad_norm": 19.114068574446925, + "language_loss": 0.88493919, + "learning_rate": 1.1495410478215914e-06, + "loss": 0.90030807, + "num_input_tokens_seen": 233552775, + "router_z_loss_clip": 2.265625, + "router_z_loss_mlp": 0.27001953, + "step": 10821, + "time_per_iteration": 2.5827677249908447 + }, + { + "auxiliary_loss_clip": 0.01288657, + "auxiliary_loss_mlp": 0.00241711, + "balance_loss_clip": 1.06992018, + "balance_loss_mlp": 0.21740375, + "epoch": 0.6506538403727642, + "flos": 20668135582080.0, + "grad_norm": 4.616899053142082, + "language_loss": 0.8567943, + "learning_rate": 1.1491885677998126e-06, + "loss": 0.87209797, + "num_input_tokens_seen": 233572080, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.24316406, + "step": 10822, + "time_per_iteration": 2.640397787094116 + }, + { + "auxiliary_loss_clip": 0.0127793, + "auxiliary_loss_mlp": 0.00237102, + "balance_loss_clip": 1.0601728, + "balance_loss_mlp": 0.211555, + "epoch": 0.6507139636254321, + "flos": 11721422695680.0, + "grad_norm": 3.142345118238088, + "language_loss": 0.94891667, + "learning_rate": 1.1488361200408883e-06, + "loss": 0.96406704, + "num_input_tokens_seen": 233589155, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.25549316, + "step": 10823, + "time_per_iteration": 4.1874823570251465 + }, + { + "auxiliary_loss_clip": 0.01272579, + "auxiliary_loss_mlp": 0.00252436, + "balance_loss_clip": 1.05208588, + "balance_loss_mlp": 0.22727136, + "epoch": 0.6507740868781001, + "flos": 26761745226240.0, + "grad_norm": 5.444236386355637, + "language_loss": 0.73727709, + "learning_rate": 1.148483704558183e-06, + "loss": 0.7525273, + "num_input_tokens_seen": 233608180, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.25170898, + "step": 10824, + "time_per_iteration": 4.226592302322388 + }, + { + "auxiliary_loss_clip": 0.01307538, + "auxiliary_loss_mlp": 0.00279907, + "balance_loss_clip": 1.07050574, + "balance_loss_mlp": 0.25058109, + "epoch": 0.650834210130768, + "flos": 16471722487680.0, + "grad_norm": 13.282082838102562, + "language_loss": 0.95928645, + "learning_rate": 1.1481313213650607e-06, + "loss": 0.97516096, + "num_input_tokens_seen": 233625750, + "router_z_loss_clip": 2.37109375, + "router_z_loss_mlp": 0.29321289, + "step": 10825, + "time_per_iteration": 2.6215689182281494 + }, + { + "auxiliary_loss_clip": 0.01300841, + "auxiliary_loss_mlp": 0.00261407, + "balance_loss_clip": 1.06924725, + "balance_loss_mlp": 0.2328445, + "epoch": 0.650894333383436, + "flos": 17128672283520.0, + "grad_norm": 4.144501009329079, + "language_loss": 0.8600111, + "learning_rate": 1.147778970474885e-06, + "loss": 0.8756336, + "num_input_tokens_seen": 233644235, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.2857666, + "step": 10826, + "time_per_iteration": 2.6049647331237793 + }, + { + "auxiliary_loss_clip": 0.01295533, + "auxiliary_loss_mlp": 0.00251067, + "balance_loss_clip": 1.06661355, + "balance_loss_mlp": 0.22535405, + "epoch": 0.650954456636104, + "flos": 18734238311040.0, + "grad_norm": 249.5837689238325, + "language_loss": 0.77834958, + "learning_rate": 1.1474266519010157e-06, + "loss": 0.79381555, + "num_input_tokens_seen": 233662845, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.25744629, + "step": 10827, + "time_per_iteration": 2.583498001098633 + }, + { + "auxiliary_loss_clip": 0.01285052, + "auxiliary_loss_mlp": 0.00250878, + "balance_loss_clip": 1.06051934, + "balance_loss_mlp": 0.2261183, + "epoch": 0.651014579888772, + "flos": 24528244613760.0, + "grad_norm": 29.205940852598506, + "language_loss": 0.87294555, + "learning_rate": 1.1470743656568136e-06, + "loss": 0.88830489, + "num_input_tokens_seen": 233681990, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.24780273, + "step": 10828, + "time_per_iteration": 4.0123419761657715 + }, + { + "auxiliary_loss_clip": 0.01294181, + "auxiliary_loss_mlp": 0.00238217, + "balance_loss_clip": 1.07513452, + "balance_loss_mlp": 0.21315941, + "epoch": 0.65107470314144, + "flos": 24061083304320.0, + "grad_norm": 10.006285032163035, + "language_loss": 0.95966613, + "learning_rate": 1.1467221117556362e-06, + "loss": 0.97499013, + "num_input_tokens_seen": 233698930, + "router_z_loss_clip": 2.19238281, + "router_z_loss_mlp": 0.25024414, + "step": 10829, + "time_per_iteration": 2.6456191539764404 + }, + { + "auxiliary_loss_clip": 0.01223824, + "auxiliary_loss_mlp": 0.00075867, + "balance_loss_clip": 1.0825969, + "balance_loss_mlp": 0.06757022, + "epoch": 0.6511348263941079, + "flos": 72480734352000.0, + "grad_norm": 0.6307632301644442, + "language_loss": 0.5503993, + "learning_rate": 1.1463698902108428e-06, + "loss": 0.56339622, + "num_input_tokens_seen": 233769825, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.08300781, + "step": 10830, + "time_per_iteration": 3.2903075218200684 + }, + { + "auxiliary_loss_clip": 0.01314613, + "auxiliary_loss_mlp": 0.00231533, + "balance_loss_clip": 1.0787282, + "balance_loss_mlp": 0.20435371, + "epoch": 0.6511949496467759, + "flos": 23367684182400.0, + "grad_norm": 6.955038588159152, + "language_loss": 0.8252368, + "learning_rate": 1.1460177010357878e-06, + "loss": 0.84069824, + "num_input_tokens_seen": 233787095, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.27148438, + "step": 10831, + "time_per_iteration": 2.678997039794922 + }, + { + "auxiliary_loss_clip": 0.0122712, + "auxiliary_loss_mlp": 0.00068981, + "balance_loss_clip": 1.08712292, + "balance_loss_mlp": 0.06106578, + "epoch": 0.6512550728994438, + "flos": 67333191073920.0, + "grad_norm": 2.251470749504655, + "language_loss": 0.50390238, + "learning_rate": 1.145665544243828e-06, + "loss": 0.51686341, + "num_input_tokens_seen": 233853050, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.07910156, + "step": 10832, + "time_per_iteration": 4.680277109146118 + }, + { + "auxiliary_loss_clip": 0.01299993, + "auxiliary_loss_mlp": 0.00266957, + "balance_loss_clip": 1.0704875, + "balance_loss_mlp": 0.2402299, + "epoch": 0.6513151961521119, + "flos": 21141689512320.0, + "grad_norm": 7.526681362631786, + "language_loss": 0.94736075, + "learning_rate": 1.145313419848316e-06, + "loss": 0.96303028, + "num_input_tokens_seen": 233871385, + "router_z_loss_clip": 2.29785156, + "router_z_loss_mlp": 0.26721191, + "step": 10833, + "time_per_iteration": 2.6669704914093018 + }, + { + "auxiliary_loss_clip": 0.01313473, + "auxiliary_loss_mlp": 0.00231704, + "balance_loss_clip": 1.08460236, + "balance_loss_mlp": 0.20624103, + "epoch": 0.6513753194047798, + "flos": 15158828476800.0, + "grad_norm": 7.70050729489033, + "language_loss": 0.92257738, + "learning_rate": 1.1449613278626049e-06, + "loss": 0.93802917, + "num_input_tokens_seen": 233888175, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.25439453, + "step": 10834, + "time_per_iteration": 2.7741682529449463 + }, + { + "auxiliary_loss_clip": 0.01305665, + "auxiliary_loss_mlp": 0.00228555, + "balance_loss_clip": 1.07809997, + "balance_loss_mlp": 0.20234078, + "epoch": 0.6514354426574478, + "flos": 30226621933440.0, + "grad_norm": 69.16813707458631, + "language_loss": 0.84736121, + "learning_rate": 1.1446092683000455e-06, + "loss": 0.86270338, + "num_input_tokens_seen": 233911470, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.26245117, + "step": 10835, + "time_per_iteration": 2.7275969982147217 + }, + { + "auxiliary_loss_clip": 0.01293411, + "auxiliary_loss_mlp": 0.00254651, + "balance_loss_clip": 1.07137311, + "balance_loss_mlp": 0.22943833, + "epoch": 0.6514955659101157, + "flos": 24205587719040.0, + "grad_norm": 14.075011727168357, + "language_loss": 0.85170782, + "learning_rate": 1.1442572411739882e-06, + "loss": 0.86718845, + "num_input_tokens_seen": 233932135, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.25219727, + "step": 10836, + "time_per_iteration": 2.815384864807129 + }, + { + "auxiliary_loss_clip": 0.01292613, + "auxiliary_loss_mlp": 0.00249708, + "balance_loss_clip": 1.06740022, + "balance_loss_mlp": 0.22298093, + "epoch": 0.6515556891627837, + "flos": 12377761960320.0, + "grad_norm": 7.849347601684581, + "language_loss": 0.89417213, + "learning_rate": 1.143905246497783e-06, + "loss": 0.90959531, + "num_input_tokens_seen": 233947880, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.26721191, + "step": 10837, + "time_per_iteration": 2.69932222366333 + }, + { + "auxiliary_loss_clip": 0.01288782, + "auxiliary_loss_mlp": 0.00246684, + "balance_loss_clip": 1.06484973, + "balance_loss_mlp": 0.22039844, + "epoch": 0.6516158124154516, + "flos": 49601217957120.0, + "grad_norm": 33.29754927574795, + "language_loss": 0.70345891, + "learning_rate": 1.1435532842847758e-06, + "loss": 0.71881354, + "num_input_tokens_seen": 233971475, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.26257324, + "step": 10838, + "time_per_iteration": 2.9870166778564453 + }, + { + "auxiliary_loss_clip": 0.01229782, + "auxiliary_loss_mlp": 0.0005363, + "balance_loss_clip": 1.09023428, + "balance_loss_mlp": 0.04557143, + "epoch": 0.6516759356681197, + "flos": 59702748076800.0, + "grad_norm": 0.7145862166443198, + "language_loss": 0.59511012, + "learning_rate": 1.1432013545483147e-06, + "loss": 0.60794425, + "num_input_tokens_seen": 234030690, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.08056641, + "step": 10839, + "time_per_iteration": 3.2294921875 + }, + { + "auxiliary_loss_clip": 0.01303032, + "auxiliary_loss_mlp": 0.00214501, + "balance_loss_clip": 1.07881522, + "balance_loss_mlp": 0.19120756, + "epoch": 0.6517360589207876, + "flos": 37450807130880.0, + "grad_norm": 47.56012204937173, + "language_loss": 0.74625266, + "learning_rate": 1.1428494573017439e-06, + "loss": 0.761428, + "num_input_tokens_seen": 234052470, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.23291016, + "step": 10840, + "time_per_iteration": 2.785402297973633 + }, + { + "auxiliary_loss_clip": 0.01292196, + "auxiliary_loss_mlp": 0.00239624, + "balance_loss_clip": 1.06625164, + "balance_loss_mlp": 0.21525773, + "epoch": 0.6517961821734556, + "flos": 25374911068800.0, + "grad_norm": 148.77497003546088, + "language_loss": 0.83755374, + "learning_rate": 1.1424975925584071e-06, + "loss": 0.85287189, + "num_input_tokens_seen": 234071495, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.24401855, + "step": 10841, + "time_per_iteration": 2.7215096950531006 + }, + { + "auxiliary_loss_clip": 0.01283327, + "auxiliary_loss_mlp": 0.00232953, + "balance_loss_clip": 1.05874097, + "balance_loss_mlp": 0.20651235, + "epoch": 0.6518563054261236, + "flos": 28766996864640.0, + "grad_norm": 355.1152710825266, + "language_loss": 0.70957315, + "learning_rate": 1.142145760331648e-06, + "loss": 0.72473598, + "num_input_tokens_seen": 234092325, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.26403809, + "step": 10842, + "time_per_iteration": 2.710496187210083 + }, + { + "auxiliary_loss_clip": 0.01225244, + "auxiliary_loss_mlp": 0.00093816, + "balance_loss_clip": 1.08697748, + "balance_loss_mlp": 0.08690225, + "epoch": 0.6519164286787915, + "flos": 68924750797440.0, + "grad_norm": 0.798550831622932, + "language_loss": 0.55364186, + "learning_rate": 1.141793960634807e-06, + "loss": 0.56683248, + "num_input_tokens_seen": 234148005, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.06933594, + "step": 10843, + "time_per_iteration": 2.972451686859131 + }, + { + "auxiliary_loss_clip": 0.01317606, + "auxiliary_loss_mlp": 0.00262029, + "balance_loss_clip": 1.07867944, + "balance_loss_mlp": 0.23254859, + "epoch": 0.6519765519314595, + "flos": 20441933683200.0, + "grad_norm": 50.65795358658335, + "language_loss": 0.89522696, + "learning_rate": 1.1414421934812253e-06, + "loss": 0.91102326, + "num_input_tokens_seen": 234164280, + "router_z_loss_clip": 2.38867188, + "router_z_loss_mlp": 0.2947998, + "step": 10844, + "time_per_iteration": 2.602994680404663 + }, + { + "auxiliary_loss_clip": 0.01279171, + "auxiliary_loss_mlp": 0.00232202, + "balance_loss_clip": 1.05967951, + "balance_loss_mlp": 0.20713212, + "epoch": 0.6520366751841274, + "flos": 28402970480640.0, + "grad_norm": 14.618272414952026, + "language_loss": 0.69535124, + "learning_rate": 1.1410904588842421e-06, + "loss": 0.71046489, + "num_input_tokens_seen": 234185090, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.25061035, + "step": 10845, + "time_per_iteration": 2.699364423751831 + }, + { + "auxiliary_loss_clip": 0.01311695, + "auxiliary_loss_mlp": 0.0026032, + "balance_loss_clip": 1.07956731, + "balance_loss_mlp": 0.23411788, + "epoch": 0.6520967984367955, + "flos": 22273414300800.0, + "grad_norm": 42.89005955420811, + "language_loss": 0.87229002, + "learning_rate": 1.140738756857194e-06, + "loss": 0.8880102, + "num_input_tokens_seen": 234204050, + "router_z_loss_clip": 2.32324219, + "router_z_loss_mlp": 0.26208496, + "step": 10846, + "time_per_iteration": 2.6589903831481934 + }, + { + "auxiliary_loss_clip": 0.01222547, + "auxiliary_loss_mlp": 0.00080894, + "balance_loss_clip": 1.08497393, + "balance_loss_mlp": 0.07383708, + "epoch": 0.6521569216894634, + "flos": 68917140092160.0, + "grad_norm": 0.7009818564968063, + "language_loss": 0.59312677, + "learning_rate": 1.1403870874134192e-06, + "loss": 0.60616112, + "num_input_tokens_seen": 234269790, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.07080078, + "step": 10847, + "time_per_iteration": 3.221574544906616 + }, + { + "auxiliary_loss_clip": 0.01321869, + "auxiliary_loss_mlp": 0.00240488, + "balance_loss_clip": 1.08798528, + "balance_loss_mlp": 0.21372592, + "epoch": 0.6522170449421314, + "flos": 29130520458240.0, + "grad_norm": 3.9761476451735263, + "language_loss": 0.89747679, + "learning_rate": 1.1400354505662514e-06, + "loss": 0.91310036, + "num_input_tokens_seen": 234290135, + "router_z_loss_clip": 2.33789062, + "router_z_loss_mlp": 0.26782227, + "step": 10848, + "time_per_iteration": 2.7506299018859863 + }, + { + "auxiliary_loss_clip": 0.01298486, + "auxiliary_loss_mlp": 0.00278664, + "balance_loss_clip": 1.07502723, + "balance_loss_mlp": 0.25246161, + "epoch": 0.6522771681947993, + "flos": 26651930371200.0, + "grad_norm": 9.925496980787841, + "language_loss": 0.83815867, + "learning_rate": 1.1396838463290263e-06, + "loss": 0.85393018, + "num_input_tokens_seen": 234309535, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.26196289, + "step": 10849, + "time_per_iteration": 2.7233543395996094 + }, + { + "auxiliary_loss_clip": 0.01282492, + "auxiliary_loss_mlp": 0.00249557, + "balance_loss_clip": 1.06497765, + "balance_loss_mlp": 0.22212738, + "epoch": 0.6523372914474673, + "flos": 25739763465600.0, + "grad_norm": 9.19941028012679, + "language_loss": 0.75777763, + "learning_rate": 1.1393322747150752e-06, + "loss": 0.77309811, + "num_input_tokens_seen": 234328755, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.27429199, + "step": 10850, + "time_per_iteration": 2.745002031326294 + }, + { + "auxiliary_loss_clip": 0.01287758, + "auxiliary_loss_mlp": 0.00227489, + "balance_loss_clip": 1.0700624, + "balance_loss_mlp": 0.20145406, + "epoch": 0.6523974147001352, + "flos": 24827345164800.0, + "grad_norm": 2.369021006181498, + "language_loss": 0.75944996, + "learning_rate": 1.1389807357377313e-06, + "loss": 0.77460241, + "num_input_tokens_seen": 234348655, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.26037598, + "step": 10851, + "time_per_iteration": 2.7214741706848145 + }, + { + "auxiliary_loss_clip": 0.01286302, + "auxiliary_loss_mlp": 0.00254547, + "balance_loss_clip": 1.06644654, + "balance_loss_mlp": 0.22875053, + "epoch": 0.6524575379528033, + "flos": 26317637470080.0, + "grad_norm": 8.969155720989066, + "language_loss": 0.82404292, + "learning_rate": 1.1386292294103235e-06, + "loss": 0.83945149, + "num_input_tokens_seen": 234367445, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.25805664, + "step": 10852, + "time_per_iteration": 2.723938465118408 + }, + { + "auxiliary_loss_clip": 0.01274717, + "auxiliary_loss_mlp": 0.00245065, + "balance_loss_clip": 1.05061221, + "balance_loss_mlp": 0.21991169, + "epoch": 0.6525176612054712, + "flos": 19494143464320.0, + "grad_norm": 75.09056126562844, + "language_loss": 0.77297634, + "learning_rate": 1.1382777557461812e-06, + "loss": 0.78817415, + "num_input_tokens_seen": 234384825, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.25158691, + "step": 10853, + "time_per_iteration": 2.6487016677856445 + }, + { + "auxiliary_loss_clip": 0.01222563, + "auxiliary_loss_mlp": 0.0006233, + "balance_loss_clip": 1.08478498, + "balance_loss_mlp": 0.05551087, + "epoch": 0.6525777844581392, + "flos": 71706894721920.0, + "grad_norm": 0.6973560723838015, + "language_loss": 0.62520504, + "learning_rate": 1.137926314758634e-06, + "loss": 0.63805389, + "num_input_tokens_seen": 234450630, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.06835938, + "step": 10854, + "time_per_iteration": 3.280423164367676 + }, + { + "auxiliary_loss_clip": 0.01279398, + "auxiliary_loss_mlp": 0.00231685, + "balance_loss_clip": 1.06129956, + "balance_loss_mlp": 0.20632935, + "epoch": 0.6526379077108072, + "flos": 26653115520000.0, + "grad_norm": 3.103985738594293, + "language_loss": 0.86076581, + "learning_rate": 1.1375749064610072e-06, + "loss": 0.87587667, + "num_input_tokens_seen": 234473505, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.25341797, + "step": 10855, + "time_per_iteration": 2.7648627758026123 + }, + { + "auxiliary_loss_clip": 0.01288302, + "auxiliary_loss_mlp": 0.00239359, + "balance_loss_clip": 1.0703547, + "balance_loss_mlp": 0.21408656, + "epoch": 0.6526980309634751, + "flos": 22820369673600.0, + "grad_norm": 7.602607194479932, + "language_loss": 0.87962317, + "learning_rate": 1.1372235308666256e-06, + "loss": 0.89489973, + "num_input_tokens_seen": 234492485, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.25292969, + "step": 10856, + "time_per_iteration": 2.6949477195739746 + }, + { + "auxiliary_loss_clip": 0.01290298, + "auxiliary_loss_mlp": 0.00251532, + "balance_loss_clip": 1.06494153, + "balance_loss_mlp": 0.2225765, + "epoch": 0.6527581542161431, + "flos": 28365048696960.0, + "grad_norm": 35.58260499781198, + "language_loss": 0.80950785, + "learning_rate": 1.136872187988815e-06, + "loss": 0.82492614, + "num_input_tokens_seen": 234512645, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.28967285, + "step": 10857, + "time_per_iteration": 2.7428953647613525 + }, + { + "auxiliary_loss_clip": 0.0127374, + "auxiliary_loss_mlp": 0.00228445, + "balance_loss_clip": 1.05705476, + "balance_loss_mlp": 0.20308964, + "epoch": 0.652818277468811, + "flos": 18369206346240.0, + "grad_norm": 7.941383962838423, + "language_loss": 0.73880857, + "learning_rate": 1.1365208778408965e-06, + "loss": 0.75383043, + "num_input_tokens_seen": 234529310, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.25366211, + "step": 10858, + "time_per_iteration": 2.64495849609375 + }, + { + "auxiliary_loss_clip": 0.01262392, + "auxiliary_loss_mlp": 0.002086, + "balance_loss_clip": 1.05052924, + "balance_loss_mlp": 0.1858307, + "epoch": 0.6528784007214791, + "flos": 18036170421120.0, + "grad_norm": 41.214894556864316, + "language_loss": 0.84832132, + "learning_rate": 1.1361696004361939e-06, + "loss": 0.86303127, + "num_input_tokens_seen": 234546685, + "router_z_loss_clip": 2.11816406, + "router_z_loss_mlp": 0.22753906, + "step": 10859, + "time_per_iteration": 2.683326482772827 + }, + { + "auxiliary_loss_clip": 0.0128133, + "auxiliary_loss_mlp": 0.00240958, + "balance_loss_clip": 1.0593698, + "balance_loss_mlp": 0.21671087, + "epoch": 0.652938523974147, + "flos": 22382008093440.0, + "grad_norm": 12.065000921612917, + "language_loss": 0.74920744, + "learning_rate": 1.1358183557880256e-06, + "loss": 0.76443034, + "num_input_tokens_seen": 234566255, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.24230957, + "step": 10860, + "time_per_iteration": 2.714423418045044 + }, + { + "auxiliary_loss_clip": 0.01294689, + "auxiliary_loss_mlp": 0.00232013, + "balance_loss_clip": 1.0665282, + "balance_loss_mlp": 0.20504773, + "epoch": 0.652998647226815, + "flos": 16764035368320.0, + "grad_norm": 195.00025949255462, + "language_loss": 0.76952171, + "learning_rate": 1.135467143909712e-06, + "loss": 0.78478873, + "num_input_tokens_seen": 234585405, + "router_z_loss_clip": 2.28320312, + "router_z_loss_mlp": 0.27001953, + "step": 10861, + "time_per_iteration": 2.6455233097076416 + }, + { + "auxiliary_loss_clip": 0.01281819, + "auxiliary_loss_mlp": 0.00276504, + "balance_loss_clip": 1.05797172, + "balance_loss_mlp": 0.25002784, + "epoch": 0.6530587704794829, + "flos": 35772522019200.0, + "grad_norm": 3.5289487713951955, + "language_loss": 0.73840225, + "learning_rate": 1.135115964814572e-06, + "loss": 0.75398552, + "num_input_tokens_seen": 234608095, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.26489258, + "step": 10862, + "time_per_iteration": 2.7689011096954346 + }, + { + "auxiliary_loss_clip": 0.01283393, + "auxiliary_loss_mlp": 0.00242434, + "balance_loss_clip": 1.0593338, + "balance_loss_mlp": 0.21685168, + "epoch": 0.6531188937321509, + "flos": 19316134638720.0, + "grad_norm": 25.94521152131059, + "language_loss": 0.84953922, + "learning_rate": 1.13476481851592e-06, + "loss": 0.86479747, + "num_input_tokens_seen": 234627335, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.25598145, + "step": 10863, + "time_per_iteration": 2.6877613067626953 + }, + { + "auxiliary_loss_clip": 0.01245649, + "auxiliary_loss_mlp": 0.00250211, + "balance_loss_clip": 1.03676891, + "balance_loss_mlp": 0.22782359, + "epoch": 0.6531790169848188, + "flos": 22893771116160.0, + "grad_norm": 26.342126279011076, + "language_loss": 0.8174206, + "learning_rate": 1.1344137050270739e-06, + "loss": 0.83237922, + "num_input_tokens_seen": 234646540, + "router_z_loss_clip": 2.08984375, + "router_z_loss_mlp": 0.22399902, + "step": 10864, + "time_per_iteration": 2.637723684310913 + }, + { + "auxiliary_loss_clip": 0.012788, + "auxiliary_loss_mlp": 0.00240179, + "balance_loss_clip": 1.05818832, + "balance_loss_mlp": 0.21491827, + "epoch": 0.6532391402374869, + "flos": 29563530912000.0, + "grad_norm": 570.0223015947254, + "language_loss": 0.93262708, + "learning_rate": 1.1340626243613458e-06, + "loss": 0.94781685, + "num_input_tokens_seen": 234665470, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.25244141, + "step": 10865, + "time_per_iteration": 4.09613561630249 + }, + { + "auxiliary_loss_clip": 0.01256346, + "auxiliary_loss_mlp": 0.00274787, + "balance_loss_clip": 1.04206967, + "balance_loss_mlp": 0.24989566, + "epoch": 0.6532992634901548, + "flos": 23105463920640.0, + "grad_norm": 14.008411349440369, + "language_loss": 0.89589578, + "learning_rate": 1.133711576532051e-06, + "loss": 0.9112072, + "num_input_tokens_seen": 234683955, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.24853516, + "step": 10866, + "time_per_iteration": 4.1007304191589355 + }, + { + "auxiliary_loss_clip": 0.01261706, + "auxiliary_loss_mlp": 0.00277374, + "balance_loss_clip": 1.04998112, + "balance_loss_mlp": 0.25403258, + "epoch": 0.6533593867428228, + "flos": 26067340523520.0, + "grad_norm": 7.596231931343413, + "language_loss": 0.87043715, + "learning_rate": 1.1333605615524995e-06, + "loss": 0.8858279, + "num_input_tokens_seen": 234704595, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.23352051, + "step": 10867, + "time_per_iteration": 2.7413864135742188 + }, + { + "auxiliary_loss_clip": 0.01274013, + "auxiliary_loss_mlp": 0.00253397, + "balance_loss_clip": 1.05289078, + "balance_loss_mlp": 0.22787458, + "epoch": 0.6534195099954908, + "flos": 21212469262080.0, + "grad_norm": 3.624982020097747, + "language_loss": 0.91373181, + "learning_rate": 1.1330095794360016e-06, + "loss": 0.92900598, + "num_input_tokens_seen": 234724090, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.25524902, + "step": 10868, + "time_per_iteration": 2.6586413383483887 + }, + { + "auxiliary_loss_clip": 0.01275886, + "auxiliary_loss_mlp": 0.00264243, + "balance_loss_clip": 1.05205798, + "balance_loss_mlp": 0.23773122, + "epoch": 0.6534796332481587, + "flos": 19646584784640.0, + "grad_norm": 101.50793370570032, + "language_loss": 0.89552003, + "learning_rate": 1.1326586301958675e-06, + "loss": 0.91092134, + "num_input_tokens_seen": 234742560, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.26525879, + "step": 10869, + "time_per_iteration": 2.7347662448883057 + }, + { + "auxiliary_loss_clip": 0.01278368, + "auxiliary_loss_mlp": 0.0023664, + "balance_loss_clip": 1.05649662, + "balance_loss_mlp": 0.21160582, + "epoch": 0.6535397565008267, + "flos": 24022479162240.0, + "grad_norm": 11.191159583769872, + "language_loss": 0.80640614, + "learning_rate": 1.1323077138454063e-06, + "loss": 0.82155621, + "num_input_tokens_seen": 234762315, + "router_z_loss_clip": 2.21777344, + "router_z_loss_mlp": 0.25036621, + "step": 10870, + "time_per_iteration": 4.124929666519165 + }, + { + "auxiliary_loss_clip": 0.01282197, + "auxiliary_loss_mlp": 0.00244609, + "balance_loss_clip": 1.06227899, + "balance_loss_mlp": 0.21889579, + "epoch": 0.6535998797534947, + "flos": 24602759377920.0, + "grad_norm": 15.031720567961912, + "language_loss": 0.83353996, + "learning_rate": 1.1319568303979221e-06, + "loss": 0.84880805, + "num_input_tokens_seen": 234781300, + "router_z_loss_clip": 2.19824219, + "router_z_loss_mlp": 0.25732422, + "step": 10871, + "time_per_iteration": 2.761345386505127 + }, + { + "auxiliary_loss_clip": 0.01262372, + "auxiliary_loss_mlp": 0.00248098, + "balance_loss_clip": 1.04933739, + "balance_loss_mlp": 0.22269417, + "epoch": 0.6536600030061627, + "flos": 23364164649600.0, + "grad_norm": 6.777374970670331, + "language_loss": 0.63868195, + "learning_rate": 1.1316059798667227e-06, + "loss": 0.6537866, + "num_input_tokens_seen": 234801040, + "router_z_loss_clip": 2.12988281, + "router_z_loss_mlp": 0.25402832, + "step": 10872, + "time_per_iteration": 2.723134756088257 + }, + { + "auxiliary_loss_clip": 0.01294994, + "auxiliary_loss_mlp": 0.00231035, + "balance_loss_clip": 1.07190061, + "balance_loss_mlp": 0.20765772, + "epoch": 0.6537201262588306, + "flos": 23878477537920.0, + "grad_norm": 151.6286478481261, + "language_loss": 0.81741726, + "learning_rate": 1.1312551622651112e-06, + "loss": 0.8326776, + "num_input_tokens_seen": 234821415, + "router_z_loss_clip": 2.23144531, + "router_z_loss_mlp": 0.23364258, + "step": 10873, + "time_per_iteration": 2.735045909881592 + }, + { + "auxiliary_loss_clip": 0.0127064, + "auxiliary_loss_mlp": 0.00252806, + "balance_loss_clip": 1.05174458, + "balance_loss_mlp": 0.22710408, + "epoch": 0.6537802495114986, + "flos": 24354760901760.0, + "grad_norm": 3.4160412792141113, + "language_loss": 0.82272679, + "learning_rate": 1.1309043776063917e-06, + "loss": 0.83796126, + "num_input_tokens_seen": 234843795, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.25695801, + "step": 10874, + "time_per_iteration": 4.147740602493286 + }, + { + "auxiliary_loss_clip": 0.01304264, + "auxiliary_loss_mlp": 0.00238089, + "balance_loss_clip": 1.07497644, + "balance_loss_mlp": 0.21326992, + "epoch": 0.6538403727641665, + "flos": 27996892248960.0, + "grad_norm": 18.202891541769056, + "language_loss": 0.87526995, + "learning_rate": 1.1305536259038642e-06, + "loss": 0.89069349, + "num_input_tokens_seen": 234862350, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.24816895, + "step": 10875, + "time_per_iteration": 2.755470037460327 + }, + { + "auxiliary_loss_clip": 0.0127696, + "auxiliary_loss_mlp": 0.00235264, + "balance_loss_clip": 1.05871844, + "balance_loss_mlp": 0.21058796, + "epoch": 0.6539004960168345, + "flos": 27563594486400.0, + "grad_norm": 7.6632994093100315, + "language_loss": 0.76511437, + "learning_rate": 1.1302029071708314e-06, + "loss": 0.7802366, + "num_input_tokens_seen": 234881790, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.24694824, + "step": 10876, + "time_per_iteration": 2.7656705379486084 + }, + { + "auxiliary_loss_clip": 0.01306393, + "auxiliary_loss_mlp": 0.00251464, + "balance_loss_clip": 1.07597184, + "balance_loss_mlp": 0.22547595, + "epoch": 0.6539606192695024, + "flos": 14530067879040.0, + "grad_norm": 32.326042237307156, + "language_loss": 0.871952, + "learning_rate": 1.1298522214205908e-06, + "loss": 0.88753057, + "num_input_tokens_seen": 234897775, + "router_z_loss_clip": 2.30273438, + "router_z_loss_mlp": 0.2598877, + "step": 10877, + "time_per_iteration": 2.753566265106201 + }, + { + "auxiliary_loss_clip": 0.01295032, + "auxiliary_loss_mlp": 0.00260711, + "balance_loss_clip": 1.07344127, + "balance_loss_mlp": 0.23707192, + "epoch": 0.6540207425221705, + "flos": 21616356764160.0, + "grad_norm": 24.19132912524503, + "language_loss": 0.88866556, + "learning_rate": 1.1295015686664408e-06, + "loss": 0.90422308, + "num_input_tokens_seen": 234918395, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.2364502, + "step": 10878, + "time_per_iteration": 2.6627676486968994 + }, + { + "auxiliary_loss_clip": 0.0129288, + "auxiliary_loss_mlp": 0.00250057, + "balance_loss_clip": 1.06390071, + "balance_loss_mlp": 0.22367632, + "epoch": 0.6540808657748384, + "flos": 17668983640320.0, + "grad_norm": 6.753474428707174, + "language_loss": 0.93942773, + "learning_rate": 1.1291509489216797e-06, + "loss": 0.95485705, + "num_input_tokens_seen": 234936260, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.26416016, + "step": 10879, + "time_per_iteration": 2.7481799125671387 + }, + { + "auxiliary_loss_clip": 0.01284997, + "auxiliary_loss_mlp": 0.0026615, + "balance_loss_clip": 1.05776119, + "balance_loss_mlp": 0.23987672, + "epoch": 0.6541409890275064, + "flos": 14538292093440.0, + "grad_norm": 39.384032894606925, + "language_loss": 0.81656718, + "learning_rate": 1.128800362199601e-06, + "loss": 0.83207864, + "num_input_tokens_seen": 234952110, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.26269531, + "step": 10880, + "time_per_iteration": 2.6587209701538086 + }, + { + "auxiliary_loss_clip": 0.0126964, + "auxiliary_loss_mlp": 0.00241536, + "balance_loss_clip": 1.05515063, + "balance_loss_mlp": 0.21813522, + "epoch": 0.6542011122801744, + "flos": 17165301177600.0, + "grad_norm": 28.87966381069261, + "language_loss": 0.92245233, + "learning_rate": 1.1284498085135005e-06, + "loss": 0.93756407, + "num_input_tokens_seen": 234970810, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.23425293, + "step": 10881, + "time_per_iteration": 2.681338310241699 + }, + { + "auxiliary_loss_clip": 0.01263386, + "auxiliary_loss_mlp": 0.00249294, + "balance_loss_clip": 1.04325151, + "balance_loss_mlp": 0.2252848, + "epoch": 0.6542612355328423, + "flos": 18186600579840.0, + "grad_norm": 2.4609817992105563, + "language_loss": 0.86997545, + "learning_rate": 1.1280992878766699e-06, + "loss": 0.88510221, + "num_input_tokens_seen": 234989565, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.24023438, + "step": 10882, + "time_per_iteration": 2.6458663940429688 + }, + { + "auxiliary_loss_clip": 0.01295012, + "auxiliary_loss_mlp": 0.00246287, + "balance_loss_clip": 1.0626266, + "balance_loss_mlp": 0.221861, + "epoch": 0.6543213587855103, + "flos": 19792453916160.0, + "grad_norm": 14.450162427544912, + "language_loss": 0.88762236, + "learning_rate": 1.1277488003024024e-06, + "loss": 0.90303534, + "num_input_tokens_seen": 235007955, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.24450684, + "step": 10883, + "time_per_iteration": 2.721550226211548 + }, + { + "auxiliary_loss_clip": 0.01290626, + "auxiliary_loss_mlp": 0.00243798, + "balance_loss_clip": 1.06385863, + "balance_loss_mlp": 0.22017065, + "epoch": 0.6543814820381783, + "flos": 21105096531840.0, + "grad_norm": 11.487302224720281, + "language_loss": 0.96227038, + "learning_rate": 1.127398345803988e-06, + "loss": 0.97761464, + "num_input_tokens_seen": 235024860, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.2364502, + "step": 10884, + "time_per_iteration": 2.6525158882141113 + }, + { + "auxiliary_loss_clip": 0.01312682, + "auxiliary_loss_mlp": 0.00244969, + "balance_loss_clip": 1.07974529, + "balance_loss_mlp": 0.21997052, + "epoch": 0.6544416052908463, + "flos": 20194042947840.0, + "grad_norm": 172.97193056660404, + "language_loss": 0.93572617, + "learning_rate": 1.127047924394715e-06, + "loss": 0.95130265, + "num_input_tokens_seen": 235043815, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.25, + "step": 10885, + "time_per_iteration": 2.689781427383423 + }, + { + "auxiliary_loss_clip": 0.0128043, + "auxiliary_loss_mlp": 0.00236048, + "balance_loss_clip": 1.05666947, + "balance_loss_mlp": 0.21207532, + "epoch": 0.6545017285435142, + "flos": 23368258800000.0, + "grad_norm": 10.207961806352907, + "language_loss": 0.81304878, + "learning_rate": 1.1266975360878722e-06, + "loss": 0.82821357, + "num_input_tokens_seen": 235062985, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.23986816, + "step": 10886, + "time_per_iteration": 2.6667215824127197 + }, + { + "auxiliary_loss_clip": 0.01264493, + "auxiliary_loss_mlp": 0.0021361, + "balance_loss_clip": 1.04882336, + "balance_loss_mlp": 0.1903169, + "epoch": 0.6545618517961822, + "flos": 19134714021120.0, + "grad_norm": 33.35775084237621, + "language_loss": 0.84717309, + "learning_rate": 1.1263471808967468e-06, + "loss": 0.86195409, + "num_input_tokens_seen": 235081670, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.23303223, + "step": 10887, + "time_per_iteration": 2.68330454826355 + }, + { + "auxiliary_loss_clip": 0.01271209, + "auxiliary_loss_mlp": 0.0025897, + "balance_loss_clip": 1.05113912, + "balance_loss_mlp": 0.23471075, + "epoch": 0.6546219750488501, + "flos": 14938624149120.0, + "grad_norm": 12.55933496050534, + "language_loss": 0.87230873, + "learning_rate": 1.1259968588346234e-06, + "loss": 0.88761055, + "num_input_tokens_seen": 235098510, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.24267578, + "step": 10888, + "time_per_iteration": 2.589668035507202 + }, + { + "auxiliary_loss_clip": 0.01270934, + "auxiliary_loss_mlp": 0.00252172, + "balance_loss_clip": 1.05471718, + "balance_loss_mlp": 0.2292596, + "epoch": 0.6546820983015181, + "flos": 36320518886400.0, + "grad_norm": 52.45069538924412, + "language_loss": 0.73613513, + "learning_rate": 1.1256465699147874e-06, + "loss": 0.75136614, + "num_input_tokens_seen": 235119990, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.22912598, + "step": 10889, + "time_per_iteration": 2.8222599029541016 + }, + { + "auxiliary_loss_clip": 0.01285453, + "auxiliary_loss_mlp": 0.00259404, + "balance_loss_clip": 1.05499232, + "balance_loss_mlp": 0.23199777, + "epoch": 0.654742221554186, + "flos": 20411446014720.0, + "grad_norm": 2.48036852256772, + "language_loss": 0.86782783, + "learning_rate": 1.1252963141505203e-06, + "loss": 0.8832764, + "num_input_tokens_seen": 235139255, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.27416992, + "step": 10890, + "time_per_iteration": 2.6636970043182373 + }, + { + "auxiliary_loss_clip": 0.01280255, + "auxiliary_loss_mlp": 0.00275465, + "balance_loss_clip": 1.0580585, + "balance_loss_mlp": 0.24950102, + "epoch": 0.6548023448068541, + "flos": 24863650836480.0, + "grad_norm": 28.147015632290092, + "language_loss": 0.76915932, + "learning_rate": 1.1249460915551052e-06, + "loss": 0.78471649, + "num_input_tokens_seen": 235158455, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.2598877, + "step": 10891, + "time_per_iteration": 2.6773128509521484 + }, + { + "auxiliary_loss_clip": 0.01268136, + "auxiliary_loss_mlp": 0.00221296, + "balance_loss_clip": 1.05103266, + "balance_loss_mlp": 0.19894443, + "epoch": 0.654862468059522, + "flos": 21427573858560.0, + "grad_norm": 130.20932052948413, + "language_loss": 0.86174089, + "learning_rate": 1.1245959021418214e-06, + "loss": 0.87663519, + "num_input_tokens_seen": 235177350, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.22363281, + "step": 10892, + "time_per_iteration": 2.68110990524292 + }, + { + "auxiliary_loss_clip": 0.01285467, + "auxiliary_loss_mlp": 0.00249194, + "balance_loss_clip": 1.06006312, + "balance_loss_mlp": 0.22518542, + "epoch": 0.65492259131219, + "flos": 26577846570240.0, + "grad_norm": 9.075569114855618, + "language_loss": 0.86423004, + "learning_rate": 1.1242457459239497e-06, + "loss": 0.87957662, + "num_input_tokens_seen": 235196435, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.2401123, + "step": 10893, + "time_per_iteration": 2.7410171031951904 + }, + { + "auxiliary_loss_clip": 0.01282963, + "auxiliary_loss_mlp": 0.0021491, + "balance_loss_clip": 1.05641222, + "balance_loss_mlp": 0.19110423, + "epoch": 0.6549827145648579, + "flos": 21501334437120.0, + "grad_norm": 7.9243298832215885, + "language_loss": 0.7744534, + "learning_rate": 1.123895622914766e-06, + "loss": 0.78943217, + "num_input_tokens_seen": 235215430, + "router_z_loss_clip": 2.265625, + "router_z_loss_mlp": 0.23791504, + "step": 10894, + "time_per_iteration": 2.66680645942688 + }, + { + "auxiliary_loss_clip": 0.01287132, + "auxiliary_loss_mlp": 0.00218249, + "balance_loss_clip": 1.05959558, + "balance_loss_mlp": 0.19332188, + "epoch": 0.6550428378175259, + "flos": 22594275515520.0, + "grad_norm": 207.53590672908507, + "language_loss": 0.76451582, + "learning_rate": 1.123545533127549e-06, + "loss": 0.77956963, + "num_input_tokens_seen": 235232015, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.24938965, + "step": 10895, + "time_per_iteration": 2.694896936416626 + }, + { + "auxiliary_loss_clip": 0.01267342, + "auxiliary_loss_mlp": 0.00208987, + "balance_loss_clip": 1.05093527, + "balance_loss_mlp": 0.18551511, + "epoch": 0.655102961070194, + "flos": 12823809050880.0, + "grad_norm": 46.72849748378113, + "language_loss": 0.85037613, + "learning_rate": 1.1231954765755722e-06, + "loss": 0.86513948, + "num_input_tokens_seen": 235248115, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.23474121, + "step": 10896, + "time_per_iteration": 2.631767511367798 + }, + { + "auxiliary_loss_clip": 0.01262139, + "auxiliary_loss_mlp": 0.00217296, + "balance_loss_clip": 1.04549205, + "balance_loss_mlp": 0.19359674, + "epoch": 0.6551630843228619, + "flos": 24791075406720.0, + "grad_norm": 4.185231757657812, + "language_loss": 0.76614445, + "learning_rate": 1.1228454532721111e-06, + "loss": 0.78093874, + "num_input_tokens_seen": 235270785, + "router_z_loss_clip": 2.16503906, + "router_z_loss_mlp": 0.23706055, + "step": 10897, + "time_per_iteration": 2.754528045654297 + }, + { + "auxiliary_loss_clip": 0.01273615, + "auxiliary_loss_mlp": 0.00234587, + "balance_loss_clip": 1.05341506, + "balance_loss_mlp": 0.21110269, + "epoch": 0.6552232075755299, + "flos": 16724461559040.0, + "grad_norm": 186.3534395257103, + "language_loss": 0.82690084, + "learning_rate": 1.1224954632304391e-06, + "loss": 0.8419829, + "num_input_tokens_seen": 235287905, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.23474121, + "step": 10898, + "time_per_iteration": 2.692232370376587 + }, + { + "auxiliary_loss_clip": 0.01283559, + "auxiliary_loss_mlp": 0.00224523, + "balance_loss_clip": 1.06355453, + "balance_loss_mlp": 0.20069274, + "epoch": 0.6552833308281978, + "flos": 22016473338240.0, + "grad_norm": 46.320242335365464, + "language_loss": 0.84008509, + "learning_rate": 1.122145506463827e-06, + "loss": 0.85516596, + "num_input_tokens_seen": 235305525, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.23852539, + "step": 10899, + "time_per_iteration": 2.677253246307373 + }, + { + "auxiliary_loss_clip": 0.01274211, + "auxiliary_loss_mlp": 0.00229432, + "balance_loss_clip": 1.05045724, + "balance_loss_mlp": 0.20594731, + "epoch": 0.6553434540808658, + "flos": 24863399441280.0, + "grad_norm": 2.9410986951762164, + "language_loss": 0.6429019, + "learning_rate": 1.1217955829855443e-06, + "loss": 0.6579383, + "num_input_tokens_seen": 235324415, + "router_z_loss_clip": 2.23730469, + "router_z_loss_mlp": 0.23461914, + "step": 10900, + "time_per_iteration": 2.6710598468780518 + }, + { + "auxiliary_loss_clip": 0.01283385, + "auxiliary_loss_mlp": 0.00214801, + "balance_loss_clip": 1.05804515, + "balance_loss_mlp": 0.19100709, + "epoch": 0.6554035773335337, + "flos": 23221060865280.0, + "grad_norm": 4.321381735483956, + "language_loss": 0.83396643, + "learning_rate": 1.1214456928088622e-06, + "loss": 0.8489483, + "num_input_tokens_seen": 235341595, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.23779297, + "step": 10901, + "time_per_iteration": 2.676638126373291 + }, + { + "auxiliary_loss_clip": 0.01283699, + "auxiliary_loss_mlp": 0.00200524, + "balance_loss_clip": 1.05755949, + "balance_loss_mlp": 0.17610994, + "epoch": 0.6554637005862017, + "flos": 22783597125120.0, + "grad_norm": 7.4955332566519015, + "language_loss": 0.809811, + "learning_rate": 1.1210958359470463e-06, + "loss": 0.82465327, + "num_input_tokens_seen": 235361700, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.2442627, + "step": 10902, + "time_per_iteration": 2.6722092628479004 + }, + { + "auxiliary_loss_clip": 0.01268227, + "auxiliary_loss_mlp": 0.00192497, + "balance_loss_clip": 1.0517993, + "balance_loss_mlp": 0.16860729, + "epoch": 0.6555238238388696, + "flos": 21507224267520.0, + "grad_norm": 4.506528977657844, + "language_loss": 0.77858722, + "learning_rate": 1.1207460124133645e-06, + "loss": 0.79319453, + "num_input_tokens_seen": 235382065, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.23876953, + "step": 10903, + "time_per_iteration": 2.7329742908477783 + }, + { + "auxiliary_loss_clip": 0.01302214, + "auxiliary_loss_mlp": 0.002142, + "balance_loss_clip": 1.06954694, + "balance_loss_mlp": 0.18977419, + "epoch": 0.6555839470915377, + "flos": 30519473518080.0, + "grad_norm": 6.452241327104623, + "language_loss": 0.76953816, + "learning_rate": 1.1203962222210832e-06, + "loss": 0.7847023, + "num_input_tokens_seen": 235402130, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.2442627, + "step": 10904, + "time_per_iteration": 2.710002899169922 + }, + { + "auxiliary_loss_clip": 0.01289509, + "auxiliary_loss_mlp": 0.00243696, + "balance_loss_clip": 1.06266308, + "balance_loss_mlp": 0.2180897, + "epoch": 0.6556440703442056, + "flos": 24642943718400.0, + "grad_norm": 4.0679568045962835, + "language_loss": 0.96902895, + "learning_rate": 1.120046465383464e-06, + "loss": 0.98436099, + "num_input_tokens_seen": 235420435, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.25610352, + "step": 10905, + "time_per_iteration": 2.7390940189361572 + }, + { + "auxiliary_loss_clip": 0.01289449, + "auxiliary_loss_mlp": 0.00209828, + "balance_loss_clip": 1.06724226, + "balance_loss_mlp": 0.18564025, + "epoch": 0.6557041935968736, + "flos": 23732464752000.0, + "grad_norm": 5.2061903983510875, + "language_loss": 0.84607613, + "learning_rate": 1.1196967419137721e-06, + "loss": 0.86106884, + "num_input_tokens_seen": 235439960, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.24206543, + "step": 10906, + "time_per_iteration": 2.6646931171417236 + }, + { + "auxiliary_loss_clip": 0.01284026, + "auxiliary_loss_mlp": 0.00221858, + "balance_loss_clip": 1.05888224, + "balance_loss_mlp": 0.19727743, + "epoch": 0.6557643168495415, + "flos": 11102753819520.0, + "grad_norm": 4.98764644262817, + "language_loss": 0.88555413, + "learning_rate": 1.119347051825267e-06, + "loss": 0.90061301, + "num_input_tokens_seen": 235457495, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.24584961, + "step": 10907, + "time_per_iteration": 2.655547857284546 + }, + { + "auxiliary_loss_clip": 0.01288601, + "auxiliary_loss_mlp": 0.00218752, + "balance_loss_clip": 1.06228352, + "balance_loss_mlp": 0.19374156, + "epoch": 0.6558244401022095, + "flos": 30191034533760.0, + "grad_norm": 4.240968373265685, + "language_loss": 0.80724388, + "learning_rate": 1.118997395131211e-06, + "loss": 0.82231736, + "num_input_tokens_seen": 235479525, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.25, + "step": 10908, + "time_per_iteration": 4.108297824859619 + }, + { + "auxiliary_loss_clip": 0.012895, + "auxiliary_loss_mlp": 0.00216868, + "balance_loss_clip": 1.06229877, + "balance_loss_mlp": 0.19263314, + "epoch": 0.6558845633548775, + "flos": 17931060247680.0, + "grad_norm": 131.77238661328542, + "language_loss": 0.91522723, + "learning_rate": 1.118647771844861e-06, + "loss": 0.93029094, + "num_input_tokens_seen": 235496305, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.24243164, + "step": 10909, + "time_per_iteration": 4.07070779800415 + }, + { + "auxiliary_loss_clip": 0.01297113, + "auxiliary_loss_mlp": 0.00215572, + "balance_loss_clip": 1.06669688, + "balance_loss_mlp": 0.19088383, + "epoch": 0.6559446866075455, + "flos": 21904144531200.0, + "grad_norm": 18.387072844437405, + "language_loss": 0.75924754, + "learning_rate": 1.1182981819794767e-06, + "loss": 0.77437437, + "num_input_tokens_seen": 235512545, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.24682617, + "step": 10910, + "time_per_iteration": 2.723598003387451 + }, + { + "auxiliary_loss_clip": 0.01300166, + "auxiliary_loss_mlp": 0.00202458, + "balance_loss_clip": 1.06556511, + "balance_loss_mlp": 0.17687619, + "epoch": 0.6560048098602135, + "flos": 14127976056960.0, + "grad_norm": 38.10937597344694, + "language_loss": 0.91661435, + "learning_rate": 1.117948625548313e-06, + "loss": 0.93164062, + "num_input_tokens_seen": 235526045, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.25561523, + "step": 10911, + "time_per_iteration": 2.6993765830993652 + }, + { + "auxiliary_loss_clip": 0.0126964, + "auxiliary_loss_mlp": 0.00215191, + "balance_loss_clip": 1.05499494, + "balance_loss_mlp": 0.19083694, + "epoch": 0.6560649331128814, + "flos": 18807567926400.0, + "grad_norm": 3.014057129800051, + "language_loss": 0.81293213, + "learning_rate": 1.1175991025646265e-06, + "loss": 0.82778049, + "num_input_tokens_seen": 235545285, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.24365234, + "step": 10912, + "time_per_iteration": 4.177293539047241 + }, + { + "auxiliary_loss_clip": 0.01318694, + "auxiliary_loss_mlp": 0.00241726, + "balance_loss_clip": 1.0843178, + "balance_loss_mlp": 0.21490347, + "epoch": 0.6561250563655494, + "flos": 17053618815360.0, + "grad_norm": 9.418710906181504, + "language_loss": 0.87184262, + "learning_rate": 1.1172496130416697e-06, + "loss": 0.88744682, + "num_input_tokens_seen": 235563150, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.26855469, + "step": 10913, + "time_per_iteration": 2.6818864345550537 + }, + { + "auxiliary_loss_clip": 0.01270391, + "auxiliary_loss_mlp": 0.00195331, + "balance_loss_clip": 1.05455065, + "balance_loss_mlp": 0.17121519, + "epoch": 0.6561851796182173, + "flos": 22637656166400.0, + "grad_norm": 14.940546540433687, + "language_loss": 0.80994129, + "learning_rate": 1.1169001569926961e-06, + "loss": 0.82459843, + "num_input_tokens_seen": 235582535, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.24121094, + "step": 10914, + "time_per_iteration": 2.665663003921509 + }, + { + "auxiliary_loss_clip": 0.01302609, + "auxiliary_loss_mlp": 0.00213902, + "balance_loss_clip": 1.07183075, + "balance_loss_mlp": 0.18870132, + "epoch": 0.6562453028708853, + "flos": 19239213663360.0, + "grad_norm": 5.80505734613378, + "language_loss": 0.81891096, + "learning_rate": 1.116550734430958e-06, + "loss": 0.83407611, + "num_input_tokens_seen": 235601490, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.25219727, + "step": 10915, + "time_per_iteration": 2.655879259109497 + }, + { + "auxiliary_loss_clip": 0.01309884, + "auxiliary_loss_mlp": 0.0023805, + "balance_loss_clip": 1.08088863, + "balance_loss_mlp": 0.21052468, + "epoch": 0.6563054261235532, + "flos": 23801305167360.0, + "grad_norm": 156.76837143944917, + "language_loss": 0.84858847, + "learning_rate": 1.1162013453697042e-06, + "loss": 0.86406779, + "num_input_tokens_seen": 235619165, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.2755127, + "step": 10916, + "time_per_iteration": 2.6404120922088623 + }, + { + "auxiliary_loss_clip": 0.01286008, + "auxiliary_loss_mlp": 0.0019984, + "balance_loss_clip": 1.06224084, + "balance_loss_mlp": 0.17576019, + "epoch": 0.6563655493762213, + "flos": 19240039676160.0, + "grad_norm": 47.968771543372576, + "language_loss": 0.82472956, + "learning_rate": 1.1158519898221831e-06, + "loss": 0.83958805, + "num_input_tokens_seen": 235637115, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.24084473, + "step": 10917, + "time_per_iteration": 4.133657217025757 + }, + { + "auxiliary_loss_clip": 0.01301438, + "auxiliary_loss_mlp": 0.00203818, + "balance_loss_clip": 1.07371092, + "balance_loss_mlp": 0.1782712, + "epoch": 0.6564256726288892, + "flos": 25556439427200.0, + "grad_norm": 36.690825657906665, + "language_loss": 0.81118125, + "learning_rate": 1.1155026678016445e-06, + "loss": 0.82623386, + "num_input_tokens_seen": 235656330, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.25537109, + "step": 10918, + "time_per_iteration": 2.709566354751587 + }, + { + "auxiliary_loss_clip": 0.01295058, + "auxiliary_loss_mlp": 0.00198213, + "balance_loss_clip": 1.07504964, + "balance_loss_mlp": 0.17547995, + "epoch": 0.6564857958815572, + "flos": 22200623389440.0, + "grad_norm": 12.956843602357612, + "language_loss": 0.81405604, + "learning_rate": 1.115153379321332e-06, + "loss": 0.82898879, + "num_input_tokens_seen": 235674510, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.22753906, + "step": 10919, + "time_per_iteration": 2.6647236347198486 + }, + { + "auxiliary_loss_clip": 0.01202549, + "auxiliary_loss_mlp": 0.00082534, + "balance_loss_clip": 1.06641889, + "balance_loss_mlp": 0.07452312, + "epoch": 0.6565459191342251, + "flos": 58123144604160.0, + "grad_norm": 0.7187803542447121, + "language_loss": 0.52438414, + "learning_rate": 1.1148041243944931e-06, + "loss": 0.5372349, + "num_input_tokens_seen": 235735050, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.08007812, + "step": 10920, + "time_per_iteration": 3.1762983798980713 + }, + { + "auxiliary_loss_clip": 0.01284827, + "auxiliary_loss_mlp": 0.00215558, + "balance_loss_clip": 1.06644917, + "balance_loss_mlp": 0.19085824, + "epoch": 0.6566060423868931, + "flos": 30809631582720.0, + "grad_norm": 2.3507194405737066, + "language_loss": 0.7147544, + "learning_rate": 1.1144549030343697e-06, + "loss": 0.72975826, + "num_input_tokens_seen": 235757545, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.24707031, + "step": 10921, + "time_per_iteration": 2.7751264572143555 + }, + { + "auxiliary_loss_clip": 0.01287271, + "auxiliary_loss_mlp": 0.00204914, + "balance_loss_clip": 1.06796837, + "balance_loss_mlp": 0.17931968, + "epoch": 0.6566661656395612, + "flos": 23367432787200.0, + "grad_norm": 74.50217093529716, + "language_loss": 0.9043014, + "learning_rate": 1.114105715254205e-06, + "loss": 0.91922325, + "num_input_tokens_seen": 235777265, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.25634766, + "step": 10922, + "time_per_iteration": 2.7428367137908936 + }, + { + "auxiliary_loss_clip": 0.01292817, + "auxiliary_loss_mlp": 0.00226402, + "balance_loss_clip": 1.06719589, + "balance_loss_mlp": 0.20261931, + "epoch": 0.6567262888922291, + "flos": 25735597488000.0, + "grad_norm": 33.03753580718833, + "language_loss": 0.81696653, + "learning_rate": 1.1137565610672414e-06, + "loss": 0.83215874, + "num_input_tokens_seen": 235796565, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.2376709, + "step": 10923, + "time_per_iteration": 2.7667367458343506 + }, + { + "auxiliary_loss_clip": 0.01299726, + "auxiliary_loss_mlp": 0.00223101, + "balance_loss_clip": 1.07447791, + "balance_loss_mlp": 0.19819795, + "epoch": 0.6567864121448971, + "flos": 17123716206720.0, + "grad_norm": 6.581455141610187, + "language_loss": 0.88879943, + "learning_rate": 1.1134074404867169e-06, + "loss": 0.9040277, + "num_input_tokens_seen": 235814805, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.24890137, + "step": 10924, + "time_per_iteration": 2.677077054977417 + }, + { + "auxiliary_loss_clip": 0.01279698, + "auxiliary_loss_mlp": 0.00216196, + "balance_loss_clip": 1.06231809, + "balance_loss_mlp": 0.1914244, + "epoch": 0.656846535397565, + "flos": 22419319345920.0, + "grad_norm": 6.939945101015875, + "language_loss": 0.80811834, + "learning_rate": 1.1130583535258717e-06, + "loss": 0.82307726, + "num_input_tokens_seen": 235833405, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.24780273, + "step": 10925, + "time_per_iteration": 2.759979248046875 + }, + { + "auxiliary_loss_clip": 0.01285524, + "auxiliary_loss_mlp": 0.00220033, + "balance_loss_clip": 1.05921364, + "balance_loss_mlp": 0.19676325, + "epoch": 0.656906658650233, + "flos": 17704535126400.0, + "grad_norm": 5.988322027613412, + "language_loss": 0.81441486, + "learning_rate": 1.112709300197942e-06, + "loss": 0.82947046, + "num_input_tokens_seen": 235848530, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.23254395, + "step": 10926, + "time_per_iteration": 2.631743907928467 + }, + { + "auxiliary_loss_clip": 0.01312846, + "auxiliary_loss_mlp": 0.00234734, + "balance_loss_clip": 1.07759929, + "balance_loss_mlp": 0.20819815, + "epoch": 0.6569667819029009, + "flos": 21175158009600.0, + "grad_norm": 25.39745908411096, + "language_loss": 0.79045188, + "learning_rate": 1.1123602805161656e-06, + "loss": 0.80592763, + "num_input_tokens_seen": 235867225, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.26574707, + "step": 10927, + "time_per_iteration": 2.707012414932251 + }, + { + "auxiliary_loss_clip": 0.01215989, + "auxiliary_loss_mlp": 0.0011259, + "balance_loss_clip": 1.07749724, + "balance_loss_mlp": 0.10472205, + "epoch": 0.6570269051555689, + "flos": 68761897511040.0, + "grad_norm": 0.7912194219971501, + "language_loss": 0.63856637, + "learning_rate": 1.112011294493775e-06, + "loss": 0.65185213, + "num_input_tokens_seen": 235932925, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.07861328, + "step": 10928, + "time_per_iteration": 3.141432046890259 + }, + { + "auxiliary_loss_clip": 0.01285861, + "auxiliary_loss_mlp": 0.00221001, + "balance_loss_clip": 1.06227183, + "balance_loss_mlp": 0.19708753, + "epoch": 0.6570870284082369, + "flos": 26319289495680.0, + "grad_norm": 9.139986345643585, + "language_loss": 0.82578886, + "learning_rate": 1.1116623421440063e-06, + "loss": 0.84085751, + "num_input_tokens_seen": 235952680, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.23913574, + "step": 10929, + "time_per_iteration": 2.887753486633301 + }, + { + "auxiliary_loss_clip": 0.01320132, + "auxiliary_loss_mlp": 0.00238939, + "balance_loss_clip": 1.08936954, + "balance_loss_mlp": 0.21133026, + "epoch": 0.6571471516609049, + "flos": 26174749167360.0, + "grad_norm": 5.154416789691215, + "language_loss": 0.75122964, + "learning_rate": 1.1113134234800895e-06, + "loss": 0.76682037, + "num_input_tokens_seen": 235972075, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.27587891, + "step": 10930, + "time_per_iteration": 2.7492318153381348 + }, + { + "auxiliary_loss_clip": 0.01300785, + "auxiliary_loss_mlp": 0.00224395, + "balance_loss_clip": 1.06822634, + "balance_loss_mlp": 0.19953945, + "epoch": 0.6572072749135728, + "flos": 20376253664640.0, + "grad_norm": 21.246466224239313, + "language_loss": 0.80233473, + "learning_rate": 1.110964538515258e-06, + "loss": 0.81758648, + "num_input_tokens_seen": 235990340, + "router_z_loss_clip": 2.32226562, + "router_z_loss_mlp": 0.2487793, + "step": 10931, + "time_per_iteration": 2.7322959899902344 + }, + { + "auxiliary_loss_clip": 0.01300482, + "auxiliary_loss_mlp": 0.00213414, + "balance_loss_clip": 1.07229435, + "balance_loss_mlp": 0.18883292, + "epoch": 0.6572673981662408, + "flos": 17128744110720.0, + "grad_norm": 29.4599427204182, + "language_loss": 0.78084165, + "learning_rate": 1.1106156872627393e-06, + "loss": 0.79598057, + "num_input_tokens_seen": 236007470, + "router_z_loss_clip": 2.28320312, + "router_z_loss_mlp": 0.24584961, + "step": 10932, + "time_per_iteration": 2.7350847721099854 + }, + { + "auxiliary_loss_clip": 0.01308846, + "auxiliary_loss_mlp": 0.0023622, + "balance_loss_clip": 1.07833552, + "balance_loss_mlp": 0.21085247, + "epoch": 0.6573275214189087, + "flos": 41275113281280.0, + "grad_norm": 13.417427592693649, + "language_loss": 0.88223577, + "learning_rate": 1.1102668697357626e-06, + "loss": 0.89768642, + "num_input_tokens_seen": 236029030, + "router_z_loss_clip": 2.30273438, + "router_z_loss_mlp": 0.25378418, + "step": 10933, + "time_per_iteration": 2.8791182041168213 + }, + { + "auxiliary_loss_clip": 0.01315128, + "auxiliary_loss_mlp": 0.00238117, + "balance_loss_clip": 1.07944679, + "balance_loss_mlp": 0.21111655, + "epoch": 0.6573876446715767, + "flos": 22890143842560.0, + "grad_norm": 5.3218745488158286, + "language_loss": 0.82210588, + "learning_rate": 1.1099180859475571e-06, + "loss": 0.83763832, + "num_input_tokens_seen": 236047160, + "router_z_loss_clip": 2.35546875, + "router_z_loss_mlp": 0.26989746, + "step": 10934, + "time_per_iteration": 2.6923482418060303 + }, + { + "auxiliary_loss_clip": 0.01289023, + "auxiliary_loss_mlp": 0.0024096, + "balance_loss_clip": 1.06614304, + "balance_loss_mlp": 0.21597332, + "epoch": 0.6574477679242448, + "flos": 44018150273280.0, + "grad_norm": 8387.726704944685, + "language_loss": 0.82980549, + "learning_rate": 1.1095693359113454e-06, + "loss": 0.84510541, + "num_input_tokens_seen": 236069215, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.24987793, + "step": 10935, + "time_per_iteration": 2.8770923614501953 + }, + { + "auxiliary_loss_clip": 0.01312224, + "auxiliary_loss_mlp": 0.00237899, + "balance_loss_clip": 1.07944572, + "balance_loss_mlp": 0.2106479, + "epoch": 0.6575078911769127, + "flos": 24571517523840.0, + "grad_norm": 140.56214639922322, + "language_loss": 0.88265508, + "learning_rate": 1.1092206196403538e-06, + "loss": 0.89815629, + "num_input_tokens_seen": 236088335, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.27258301, + "step": 10936, + "time_per_iteration": 2.714184045791626 + }, + { + "auxiliary_loss_clip": 0.01310243, + "auxiliary_loss_mlp": 0.0021881, + "balance_loss_clip": 1.07775915, + "balance_loss_mlp": 0.19292969, + "epoch": 0.6575680144295807, + "flos": 20924035050240.0, + "grad_norm": 22.389530552888452, + "language_loss": 0.76574105, + "learning_rate": 1.1088719371478056e-06, + "loss": 0.78103155, + "num_input_tokens_seen": 236108540, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.25891113, + "step": 10937, + "time_per_iteration": 2.698570966720581 + }, + { + "auxiliary_loss_clip": 0.01311373, + "auxiliary_loss_mlp": 0.0024463, + "balance_loss_clip": 1.07815361, + "balance_loss_mlp": 0.21883342, + "epoch": 0.6576281376822486, + "flos": 10925642833920.0, + "grad_norm": 700.1767097064842, + "language_loss": 0.81609976, + "learning_rate": 1.1085232884469236e-06, + "loss": 0.83165979, + "num_input_tokens_seen": 236124495, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.25817871, + "step": 10938, + "time_per_iteration": 2.6148123741149902 + }, + { + "auxiliary_loss_clip": 0.01304209, + "auxiliary_loss_mlp": 0.00236436, + "balance_loss_clip": 1.07250428, + "balance_loss_mlp": 0.21241572, + "epoch": 0.6576882609349166, + "flos": 19281552819840.0, + "grad_norm": 15.077558527055661, + "language_loss": 0.83161622, + "learning_rate": 1.108174673550927e-06, + "loss": 0.84702265, + "num_input_tokens_seen": 236142550, + "router_z_loss_clip": 2.31835938, + "router_z_loss_mlp": 0.2401123, + "step": 10939, + "time_per_iteration": 2.6682143211364746 + }, + { + "auxiliary_loss_clip": 0.01327751, + "auxiliary_loss_mlp": 0.00231306, + "balance_loss_clip": 1.08482003, + "balance_loss_mlp": 0.20424551, + "epoch": 0.6577483841875845, + "flos": 20220544206720.0, + "grad_norm": 22.213872852261165, + "language_loss": 0.91226584, + "learning_rate": 1.107826092473037e-06, + "loss": 0.92785645, + "num_input_tokens_seen": 236156620, + "router_z_loss_clip": 2.43164062, + "router_z_loss_mlp": 0.27050781, + "step": 10940, + "time_per_iteration": 2.649789333343506 + }, + { + "auxiliary_loss_clip": 0.01304211, + "auxiliary_loss_mlp": 0.00244998, + "balance_loss_clip": 1.07020426, + "balance_loss_mlp": 0.21819958, + "epoch": 0.6578085074402525, + "flos": 34751078962560.0, + "grad_norm": 496.7056901195356, + "language_loss": 0.76998961, + "learning_rate": 1.107477545226471e-06, + "loss": 0.78548175, + "num_input_tokens_seen": 236177095, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.26794434, + "step": 10941, + "time_per_iteration": 2.7449278831481934 + }, + { + "auxiliary_loss_clip": 0.01305738, + "auxiliary_loss_mlp": 0.00243476, + "balance_loss_clip": 1.07374656, + "balance_loss_mlp": 0.21797673, + "epoch": 0.6578686306929205, + "flos": 23470998675840.0, + "grad_norm": 100.44742258074554, + "language_loss": 0.77556753, + "learning_rate": 1.1071290318244448e-06, + "loss": 0.79105967, + "num_input_tokens_seen": 236194695, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.25476074, + "step": 10942, + "time_per_iteration": 2.651858329772949 + }, + { + "auxiliary_loss_clip": 0.01337894, + "auxiliary_loss_mlp": 0.00244135, + "balance_loss_clip": 1.09581709, + "balance_loss_mlp": 0.21752764, + "epoch": 0.6579287539455885, + "flos": 18077073033600.0, + "grad_norm": 74.04648305235268, + "language_loss": 0.80645269, + "learning_rate": 1.1067805522801753e-06, + "loss": 0.82227302, + "num_input_tokens_seen": 236213885, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.26623535, + "step": 10943, + "time_per_iteration": 2.6836698055267334 + }, + { + "auxiliary_loss_clip": 0.01304352, + "auxiliary_loss_mlp": 0.00235115, + "balance_loss_clip": 1.07166183, + "balance_loss_mlp": 0.21095133, + "epoch": 0.6579888771982564, + "flos": 28661383900800.0, + "grad_norm": 28.77079501849207, + "language_loss": 0.64981973, + "learning_rate": 1.1064321066068778e-06, + "loss": 0.66521442, + "num_input_tokens_seen": 236237315, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.24133301, + "step": 10944, + "time_per_iteration": 2.717470169067383 + }, + { + "auxiliary_loss_clip": 0.01319699, + "auxiliary_loss_mlp": 0.00224005, + "balance_loss_clip": 1.08069968, + "balance_loss_mlp": 0.19633655, + "epoch": 0.6580490004509244, + "flos": 25046543911680.0, + "grad_norm": 2.7080146769026228, + "language_loss": 0.81569117, + "learning_rate": 1.1060836948177646e-06, + "loss": 0.83112824, + "num_input_tokens_seen": 236256345, + "router_z_loss_clip": 2.38867188, + "router_z_loss_mlp": 0.27685547, + "step": 10945, + "time_per_iteration": 2.705613136291504 + }, + { + "auxiliary_loss_clip": 0.01311821, + "auxiliary_loss_mlp": 0.002293, + "balance_loss_clip": 1.07361221, + "balance_loss_mlp": 0.2041229, + "epoch": 0.6581091237035923, + "flos": 43508793461760.0, + "grad_norm": 9.432859633658497, + "language_loss": 0.76627636, + "learning_rate": 1.105735316926046e-06, + "loss": 0.78168762, + "num_input_tokens_seen": 236281890, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.25170898, + "step": 10946, + "time_per_iteration": 2.874081611633301 + }, + { + "auxiliary_loss_clip": 0.01296448, + "auxiliary_loss_mlp": 0.0023116, + "balance_loss_clip": 1.06893384, + "balance_loss_mlp": 0.20548244, + "epoch": 0.6581692469562603, + "flos": 22415404763520.0, + "grad_norm": 133.22929229123142, + "language_loss": 0.89085436, + "learning_rate": 1.105386972944934e-06, + "loss": 0.90613043, + "num_input_tokens_seen": 236298370, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.25683594, + "step": 10947, + "time_per_iteration": 2.6572890281677246 + }, + { + "auxiliary_loss_clip": 0.01323954, + "auxiliary_loss_mlp": 0.00226035, + "balance_loss_clip": 1.08761859, + "balance_loss_mlp": 0.20036954, + "epoch": 0.6582293702089284, + "flos": 24859772167680.0, + "grad_norm": 16.568192679209684, + "language_loss": 0.85295796, + "learning_rate": 1.1050386628876385e-06, + "loss": 0.86845791, + "num_input_tokens_seen": 236317380, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.25695801, + "step": 10948, + "time_per_iteration": 2.6326494216918945 + }, + { + "auxiliary_loss_clip": 0.01314524, + "auxiliary_loss_mlp": 0.00242353, + "balance_loss_clip": 1.07990599, + "balance_loss_mlp": 0.21647251, + "epoch": 0.6582894934615963, + "flos": 23039676161280.0, + "grad_norm": 58.691884632954086, + "language_loss": 0.86490542, + "learning_rate": 1.1046903867673655e-06, + "loss": 0.88047421, + "num_input_tokens_seen": 236336210, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.25854492, + "step": 10949, + "time_per_iteration": 2.654794216156006 + }, + { + "auxiliary_loss_clip": 0.01257047, + "auxiliary_loss_mlp": 0.00104372, + "balance_loss_clip": 1.11039734, + "balance_loss_mlp": 0.09693316, + "epoch": 0.6583496167142643, + "flos": 72551980978560.0, + "grad_norm": 0.7145695678965415, + "language_loss": 0.61218536, + "learning_rate": 1.104342144597323e-06, + "loss": 0.62579954, + "num_input_tokens_seen": 236403090, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.07421875, + "step": 10950, + "time_per_iteration": 4.595803737640381 + }, + { + "auxiliary_loss_clip": 0.01319523, + "auxiliary_loss_mlp": 0.00213135, + "balance_loss_clip": 1.08640945, + "balance_loss_mlp": 0.18942389, + "epoch": 0.6584097399669322, + "flos": 13078846592640.0, + "grad_norm": 12.776267337934986, + "language_loss": 0.75873339, + "learning_rate": 1.1039939363907178e-06, + "loss": 0.77405989, + "num_input_tokens_seen": 236420475, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.23730469, + "step": 10951, + "time_per_iteration": 4.06748366355896 + }, + { + "auxiliary_loss_clip": 0.01308648, + "auxiliary_loss_mlp": 0.00249415, + "balance_loss_clip": 1.07509482, + "balance_loss_mlp": 0.22402298, + "epoch": 0.6584698632196002, + "flos": 28693164458880.0, + "grad_norm": 202.2166719986399, + "language_loss": 0.82806444, + "learning_rate": 1.1036457621607504e-06, + "loss": 0.84364504, + "num_input_tokens_seen": 236441915, + "router_z_loss_clip": 2.33789062, + "router_z_loss_mlp": 0.25390625, + "step": 10952, + "time_per_iteration": 2.684678554534912 + }, + { + "auxiliary_loss_clip": 0.01327985, + "auxiliary_loss_mlp": 0.00253549, + "balance_loss_clip": 1.08828282, + "balance_loss_mlp": 0.22613122, + "epoch": 0.6585299864722681, + "flos": 14319272914560.0, + "grad_norm": 13.356693260247768, + "language_loss": 0.81189388, + "learning_rate": 1.1032976219206257e-06, + "loss": 0.8277092, + "num_input_tokens_seen": 236460340, + "router_z_loss_clip": 2.3984375, + "router_z_loss_mlp": 0.27404785, + "step": 10953, + "time_per_iteration": 2.67913818359375 + }, + { + "auxiliary_loss_clip": 0.01315402, + "auxiliary_loss_mlp": 0.00247293, + "balance_loss_clip": 1.08243358, + "balance_loss_mlp": 0.22149593, + "epoch": 0.6585901097249361, + "flos": 26797907243520.0, + "grad_norm": 164.7818173694376, + "language_loss": 0.86002386, + "learning_rate": 1.102949515683546e-06, + "loss": 0.87565082, + "num_input_tokens_seen": 236478280, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.25817871, + "step": 10954, + "time_per_iteration": 4.184727191925049 + }, + { + "auxiliary_loss_clip": 0.01327734, + "auxiliary_loss_mlp": 0.00232521, + "balance_loss_clip": 1.08555329, + "balance_loss_mlp": 0.20565167, + "epoch": 0.658650232977604, + "flos": 18733124989440.0, + "grad_norm": 9.719339438860779, + "language_loss": 0.78728443, + "learning_rate": 1.1026014434627096e-06, + "loss": 0.80288696, + "num_input_tokens_seen": 236493225, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.26855469, + "step": 10955, + "time_per_iteration": 2.6890621185302734 + }, + { + "auxiliary_loss_clip": 0.01324585, + "auxiliary_loss_mlp": 0.00237878, + "balance_loss_clip": 1.08719659, + "balance_loss_mlp": 0.21389294, + "epoch": 0.6587103562302721, + "flos": 24753440931840.0, + "grad_norm": 6.486147543655871, + "language_loss": 0.89267361, + "learning_rate": 1.1022534052713172e-06, + "loss": 0.90829831, + "num_input_tokens_seen": 236514420, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.23974609, + "step": 10956, + "time_per_iteration": 2.80676007270813 + }, + { + "auxiliary_loss_clip": 0.01303168, + "auxiliary_loss_mlp": 0.0024208, + "balance_loss_clip": 1.06881762, + "balance_loss_mlp": 0.2168791, + "epoch": 0.65877047948294, + "flos": 22346133384960.0, + "grad_norm": 15.33282434247926, + "language_loss": 0.89873546, + "learning_rate": 1.1019054011225648e-06, + "loss": 0.91418791, + "num_input_tokens_seen": 236532785, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.2520752, + "step": 10957, + "time_per_iteration": 2.695183515548706 + }, + { + "auxiliary_loss_clip": 0.0130266, + "auxiliary_loss_mlp": 0.00241369, + "balance_loss_clip": 1.06982148, + "balance_loss_mlp": 0.21563169, + "epoch": 0.658830602735608, + "flos": 45180542298240.0, + "grad_norm": 40.4774447479131, + "language_loss": 0.83112955, + "learning_rate": 1.1015574310296506e-06, + "loss": 0.8465699, + "num_input_tokens_seen": 236553330, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.25756836, + "step": 10958, + "time_per_iteration": 2.8857944011688232 + }, + { + "auxiliary_loss_clip": 0.01293729, + "auxiliary_loss_mlp": 0.00239634, + "balance_loss_clip": 1.06610072, + "balance_loss_mlp": 0.21454012, + "epoch": 0.6588907259882759, + "flos": 19901622326400.0, + "grad_norm": 17.23603380923493, + "language_loss": 0.8231886, + "learning_rate": 1.1012094950057678e-06, + "loss": 0.83852226, + "num_input_tokens_seen": 236572960, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.25109863, + "step": 10959, + "time_per_iteration": 4.007513046264648 + }, + { + "auxiliary_loss_clip": 0.01296512, + "auxiliary_loss_mlp": 0.00220527, + "balance_loss_clip": 1.06243372, + "balance_loss_mlp": 0.19618435, + "epoch": 0.6589508492409439, + "flos": 24133766474880.0, + "grad_norm": 10.263285414051381, + "language_loss": 0.71028233, + "learning_rate": 1.1008615930641107e-06, + "loss": 0.72545266, + "num_input_tokens_seen": 236594090, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.2434082, + "step": 10960, + "time_per_iteration": 2.670806884765625 + }, + { + "auxiliary_loss_clip": 0.01329313, + "auxiliary_loss_mlp": 0.00236477, + "balance_loss_clip": 1.08509994, + "balance_loss_mlp": 0.20889214, + "epoch": 0.659010972493612, + "flos": 18222906251520.0, + "grad_norm": 3.6560036581069273, + "language_loss": 0.92299986, + "learning_rate": 1.1005137252178734e-06, + "loss": 0.93865776, + "num_input_tokens_seen": 236610190, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.27624512, + "step": 10961, + "time_per_iteration": 2.6301345825195312 + }, + { + "auxiliary_loss_clip": 0.0133678, + "auxiliary_loss_mlp": 0.00227304, + "balance_loss_clip": 1.09766269, + "balance_loss_mlp": 0.20141193, + "epoch": 0.6590710957462799, + "flos": 27600007898880.0, + "grad_norm": 17.381411880198446, + "language_loss": 0.8267144, + "learning_rate": 1.1001658914802453e-06, + "loss": 0.84235525, + "num_input_tokens_seen": 236631575, + "router_z_loss_clip": 2.39453125, + "router_z_loss_mlp": 0.25878906, + "step": 10962, + "time_per_iteration": 2.684784173965454 + }, + { + "auxiliary_loss_clip": 0.01308322, + "auxiliary_loss_mlp": 0.00244281, + "balance_loss_clip": 1.0752883, + "balance_loss_mlp": 0.21997431, + "epoch": 0.6591312189989479, + "flos": 20302959962880.0, + "grad_norm": 1467.8770176333844, + "language_loss": 0.87297487, + "learning_rate": 1.0998180918644165e-06, + "loss": 0.88850093, + "num_input_tokens_seen": 236649815, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.24328613, + "step": 10963, + "time_per_iteration": 2.6317331790924072 + }, + { + "auxiliary_loss_clip": 0.01306148, + "auxiliary_loss_mlp": 0.00234045, + "balance_loss_clip": 1.07489681, + "balance_loss_mlp": 0.2096073, + "epoch": 0.6591913422516158, + "flos": 12312943868160.0, + "grad_norm": 3.8741011551764006, + "language_loss": 0.87432832, + "learning_rate": 1.0994703263835754e-06, + "loss": 0.88973022, + "num_input_tokens_seen": 236668335, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.24462891, + "step": 10964, + "time_per_iteration": 2.604593276977539 + }, + { + "auxiliary_loss_clip": 0.01307227, + "auxiliary_loss_mlp": 0.00268989, + "balance_loss_clip": 1.07038546, + "balance_loss_mlp": 0.24240568, + "epoch": 0.6592514655042838, + "flos": 25884591102720.0, + "grad_norm": 28.073641793747406, + "language_loss": 0.82838809, + "learning_rate": 1.0991225950509106e-06, + "loss": 0.8441503, + "num_input_tokens_seen": 236688945, + "router_z_loss_clip": 2.3671875, + "router_z_loss_mlp": 0.26586914, + "step": 10965, + "time_per_iteration": 2.6874279975891113 + }, + { + "auxiliary_loss_clip": 0.01333385, + "auxiliary_loss_mlp": 0.00249656, + "balance_loss_clip": 1.08606446, + "balance_loss_mlp": 0.22223793, + "epoch": 0.6593115887569517, + "flos": 14063624841600.0, + "grad_norm": 131.684138310424, + "language_loss": 0.84028006, + "learning_rate": 1.0987748978796067e-06, + "loss": 0.85611045, + "num_input_tokens_seen": 236707055, + "router_z_loss_clip": 2.47265625, + "router_z_loss_mlp": 0.27404785, + "step": 10966, + "time_per_iteration": 2.6165521144866943 + }, + { + "auxiliary_loss_clip": 0.01309586, + "auxiliary_loss_mlp": 0.00246039, + "balance_loss_clip": 1.07396841, + "balance_loss_mlp": 0.21963379, + "epoch": 0.6593717120096197, + "flos": 24717925359360.0, + "grad_norm": 40.880823829445, + "language_loss": 0.84490502, + "learning_rate": 1.0984272348828487e-06, + "loss": 0.86046124, + "num_input_tokens_seen": 236725900, + "router_z_loss_clip": 2.35546875, + "router_z_loss_mlp": 0.26379395, + "step": 10967, + "time_per_iteration": 2.7135274410247803 + }, + { + "auxiliary_loss_clip": 0.01243186, + "auxiliary_loss_mlp": 0.00114221, + "balance_loss_clip": 1.09459949, + "balance_loss_mlp": 0.1051136, + "epoch": 0.6594318352622877, + "flos": 55558083502080.0, + "grad_norm": 1.8944556541337019, + "language_loss": 0.47547817, + "learning_rate": 1.0980796060738221e-06, + "loss": 0.48905224, + "num_input_tokens_seen": 236788415, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.09130859, + "step": 10968, + "time_per_iteration": 3.141249418258667 + }, + { + "auxiliary_loss_clip": 0.01317666, + "auxiliary_loss_mlp": 0.00271758, + "balance_loss_clip": 1.0785625, + "balance_loss_mlp": 0.2446851, + "epoch": 0.6594919585149557, + "flos": 17456931699840.0, + "grad_norm": 50.056463031135934, + "language_loss": 0.88547486, + "learning_rate": 1.0977320114657058e-06, + "loss": 0.90136909, + "num_input_tokens_seen": 236805155, + "router_z_loss_clip": 2.39257812, + "router_z_loss_mlp": 0.27075195, + "step": 10969, + "time_per_iteration": 2.641716718673706 + }, + { + "auxiliary_loss_clip": 0.01298271, + "auxiliary_loss_mlp": 0.00228286, + "balance_loss_clip": 1.06389475, + "balance_loss_mlp": 0.20389557, + "epoch": 0.6595520817676236, + "flos": 18223229473920.0, + "grad_norm": 19.477118489527673, + "language_loss": 0.77019823, + "learning_rate": 1.0973844510716817e-06, + "loss": 0.78546381, + "num_input_tokens_seen": 236824360, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.24401855, + "step": 10970, + "time_per_iteration": 2.602297306060791 + }, + { + "auxiliary_loss_clip": 0.01311053, + "auxiliary_loss_mlp": 0.00234884, + "balance_loss_clip": 1.07178116, + "balance_loss_mlp": 0.21035039, + "epoch": 0.6596122050202916, + "flos": 22199761463040.0, + "grad_norm": 62.23048812967663, + "language_loss": 0.84235513, + "learning_rate": 1.0970369249049308e-06, + "loss": 0.85781449, + "num_input_tokens_seen": 236844640, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.24523926, + "step": 10971, + "time_per_iteration": 2.69478440284729 + }, + { + "auxiliary_loss_clip": 0.01332136, + "auxiliary_loss_mlp": 0.00226151, + "balance_loss_clip": 1.0837965, + "balance_loss_mlp": 0.20052072, + "epoch": 0.6596723282729595, + "flos": 14173834746240.0, + "grad_norm": 35.580097653748204, + "language_loss": 0.82171786, + "learning_rate": 1.096689432978629e-06, + "loss": 0.83730072, + "num_input_tokens_seen": 236861160, + "router_z_loss_clip": 2.48242188, + "router_z_loss_mlp": 0.25646973, + "step": 10972, + "time_per_iteration": 2.5835957527160645 + }, + { + "auxiliary_loss_clip": 0.01305104, + "auxiliary_loss_mlp": 0.00245696, + "balance_loss_clip": 1.06649804, + "balance_loss_mlp": 0.21851644, + "epoch": 0.6597324515256275, + "flos": 30553193410560.0, + "grad_norm": 6.1068476374105645, + "language_loss": 0.66510361, + "learning_rate": 1.0963419753059556e-06, + "loss": 0.68061161, + "num_input_tokens_seen": 236880465, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.27160645, + "step": 10973, + "time_per_iteration": 2.693239450454712 + }, + { + "auxiliary_loss_clip": 0.01339925, + "auxiliary_loss_mlp": 0.00247567, + "balance_loss_clip": 1.08513534, + "balance_loss_mlp": 0.22113863, + "epoch": 0.6597925747782956, + "flos": 17639860688640.0, + "grad_norm": 24.41482389402415, + "language_loss": 0.87404919, + "learning_rate": 1.0959945519000839e-06, + "loss": 0.88992417, + "num_input_tokens_seen": 236897730, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.26428223, + "step": 10974, + "time_per_iteration": 2.6741955280303955 + }, + { + "auxiliary_loss_clip": 0.01333976, + "auxiliary_loss_mlp": 0.00240417, + "balance_loss_clip": 1.08381391, + "balance_loss_mlp": 0.21390441, + "epoch": 0.6598526980309635, + "flos": 22819112697600.0, + "grad_norm": 20.11649840461087, + "language_loss": 0.81470156, + "learning_rate": 1.0956471627741906e-06, + "loss": 0.83044547, + "num_input_tokens_seen": 236917300, + "router_z_loss_clip": 2.50585938, + "router_z_loss_mlp": 0.26525879, + "step": 10975, + "time_per_iteration": 2.6635823249816895 + }, + { + "auxiliary_loss_clip": 0.0132096, + "auxiliary_loss_mlp": 0.00250154, + "balance_loss_clip": 1.08096313, + "balance_loss_mlp": 0.22416642, + "epoch": 0.6599128212836315, + "flos": 21068036674560.0, + "grad_norm": 312.94091025780955, + "language_loss": 0.81301957, + "learning_rate": 1.0952998079414464e-06, + "loss": 0.8287307, + "num_input_tokens_seen": 236935590, + "router_z_loss_clip": 2.40039062, + "router_z_loss_mlp": 0.26000977, + "step": 10976, + "time_per_iteration": 2.677762031555176 + }, + { + "auxiliary_loss_clip": 0.01298929, + "auxiliary_loss_mlp": 0.00234819, + "balance_loss_clip": 1.06302679, + "balance_loss_mlp": 0.20980918, + "epoch": 0.6599729445362994, + "flos": 22163527618560.0, + "grad_norm": 38.35996794752393, + "language_loss": 0.75009131, + "learning_rate": 1.0949524874150243e-06, + "loss": 0.76542878, + "num_input_tokens_seen": 236952830, + "router_z_loss_clip": 2.35546875, + "router_z_loss_mlp": 0.25024414, + "step": 10977, + "time_per_iteration": 2.652109384536743 + }, + { + "auxiliary_loss_clip": 0.01327283, + "auxiliary_loss_mlp": 0.00228849, + "balance_loss_clip": 1.08054781, + "balance_loss_mlp": 0.20187162, + "epoch": 0.6600330677889674, + "flos": 18150079426560.0, + "grad_norm": 49.05650727122837, + "language_loss": 0.91626191, + "learning_rate": 1.0946052012080952e-06, + "loss": 0.93182319, + "num_input_tokens_seen": 236971930, + "router_z_loss_clip": 2.46484375, + "router_z_loss_mlp": 0.26965332, + "step": 10978, + "time_per_iteration": 2.6904234886169434 + }, + { + "auxiliary_loss_clip": 0.01315537, + "auxiliary_loss_mlp": 0.00243523, + "balance_loss_clip": 1.07053185, + "balance_loss_mlp": 0.21829835, + "epoch": 0.6600931910416353, + "flos": 18150115340160.0, + "grad_norm": 480.1024350451201, + "language_loss": 0.78188384, + "learning_rate": 1.0942579493338278e-06, + "loss": 0.79747438, + "num_input_tokens_seen": 236989920, + "router_z_loss_clip": 2.44921875, + "router_z_loss_mlp": 0.25219727, + "step": 10979, + "time_per_iteration": 2.6528327465057373 + }, + { + "auxiliary_loss_clip": 0.01319572, + "auxiliary_loss_mlp": 0.00230828, + "balance_loss_clip": 1.07364786, + "balance_loss_mlp": 0.20561558, + "epoch": 0.6601533142943034, + "flos": 17420733768960.0, + "grad_norm": 177.7227014423668, + "language_loss": 0.81824052, + "learning_rate": 1.0939107318053889e-06, + "loss": 0.83374453, + "num_input_tokens_seen": 237006570, + "router_z_loss_clip": 2.45898438, + "router_z_loss_mlp": 0.25219727, + "step": 10980, + "time_per_iteration": 2.652216911315918 + }, + { + "auxiliary_loss_clip": 0.01308203, + "auxiliary_loss_mlp": 0.00224118, + "balance_loss_clip": 1.07251525, + "balance_loss_mlp": 0.19915567, + "epoch": 0.6602134375469713, + "flos": 28219574615040.0, + "grad_norm": 5.082797096114045, + "language_loss": 0.80255759, + "learning_rate": 1.0935635486359459e-06, + "loss": 0.81788081, + "num_input_tokens_seen": 237028415, + "router_z_loss_clip": 2.35546875, + "router_z_loss_mlp": 0.24987793, + "step": 10981, + "time_per_iteration": 2.707818031311035 + }, + { + "auxiliary_loss_clip": 0.01303242, + "auxiliary_loss_mlp": 0.00234895, + "balance_loss_clip": 1.06438661, + "balance_loss_mlp": 0.20946792, + "epoch": 0.6602735607996393, + "flos": 29418056830080.0, + "grad_norm": 135.31517356213328, + "language_loss": 0.77355814, + "learning_rate": 1.0932163998386647e-06, + "loss": 0.78893948, + "num_input_tokens_seen": 237046595, + "router_z_loss_clip": 2.38476562, + "router_z_loss_mlp": 0.25427246, + "step": 10982, + "time_per_iteration": 2.7649765014648438 + }, + { + "auxiliary_loss_clip": 0.01311725, + "auxiliary_loss_mlp": 0.00235034, + "balance_loss_clip": 1.07347763, + "balance_loss_mlp": 0.2104297, + "epoch": 0.6603336840523072, + "flos": 18588045957120.0, + "grad_norm": 95.74729539028526, + "language_loss": 0.76812482, + "learning_rate": 1.0928692854267075e-06, + "loss": 0.78359234, + "num_input_tokens_seen": 237066150, + "router_z_loss_clip": 2.3828125, + "router_z_loss_mlp": 0.24609375, + "step": 10983, + "time_per_iteration": 2.713827133178711 + }, + { + "auxiliary_loss_clip": 0.0130095, + "auxiliary_loss_mlp": 0.00192223, + "balance_loss_clip": 1.06499481, + "balance_loss_mlp": 0.16815445, + "epoch": 0.6603938073049752, + "flos": 33254860913280.0, + "grad_norm": 13.977818283010057, + "language_loss": 0.78977567, + "learning_rate": 1.092522205413239e-06, + "loss": 0.80470741, + "num_input_tokens_seen": 237087060, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.2409668, + "step": 10984, + "time_per_iteration": 2.764824151992798 + }, + { + "auxiliary_loss_clip": 0.01311758, + "auxiliary_loss_mlp": 0.00249769, + "balance_loss_clip": 1.07171023, + "balance_loss_mlp": 0.22285204, + "epoch": 0.6604539305576431, + "flos": 17384284442880.0, + "grad_norm": 4.74309316608751, + "language_loss": 0.91286904, + "learning_rate": 1.0921751598114193e-06, + "loss": 0.92848432, + "num_input_tokens_seen": 237103825, + "router_z_loss_clip": 2.40429688, + "router_z_loss_mlp": 0.26928711, + "step": 10985, + "time_per_iteration": 2.594558000564575 + }, + { + "auxiliary_loss_clip": 0.01317484, + "auxiliary_loss_mlp": 0.00233768, + "balance_loss_clip": 1.07228851, + "balance_loss_mlp": 0.2066007, + "epoch": 0.6605140538103111, + "flos": 21251145231360.0, + "grad_norm": 8.573416578956435, + "language_loss": 0.8073436, + "learning_rate": 1.0918281486344077e-06, + "loss": 0.82285619, + "num_input_tokens_seen": 237121740, + "router_z_loss_clip": 2.45117188, + "router_z_loss_mlp": 0.2713623, + "step": 10986, + "time_per_iteration": 2.6564717292785645 + }, + { + "auxiliary_loss_clip": 0.01305338, + "auxiliary_loss_mlp": 0.00220486, + "balance_loss_clip": 1.07079828, + "balance_loss_mlp": 0.19566658, + "epoch": 0.6605741770629792, + "flos": 13881701433600.0, + "grad_norm": 53.640535409635504, + "language_loss": 0.87401903, + "learning_rate": 1.0914811718953636e-06, + "loss": 0.88927734, + "num_input_tokens_seen": 237139565, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.24816895, + "step": 10987, + "time_per_iteration": 2.618987560272217 + }, + { + "auxiliary_loss_clip": 0.01233583, + "auxiliary_loss_mlp": 0.00083578, + "balance_loss_clip": 1.073259, + "balance_loss_mlp": 0.07394554, + "epoch": 0.6606343003156471, + "flos": 69316215171840.0, + "grad_norm": 0.7906003898804458, + "language_loss": 0.53605723, + "learning_rate": 1.0911342296074454e-06, + "loss": 0.54922885, + "num_input_tokens_seen": 237201055, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.09619141, + "step": 10988, + "time_per_iteration": 3.2334463596343994 + }, + { + "auxiliary_loss_clip": 0.01294592, + "auxiliary_loss_mlp": 0.00203535, + "balance_loss_clip": 1.06407237, + "balance_loss_mlp": 0.18003868, + "epoch": 0.6606944235683151, + "flos": 27272394927360.0, + "grad_norm": 100.37237298079094, + "language_loss": 0.8505165, + "learning_rate": 1.0907873217838077e-06, + "loss": 0.86549777, + "num_input_tokens_seen": 237221805, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.23486328, + "step": 10989, + "time_per_iteration": 2.717146635055542 + }, + { + "auxiliary_loss_clip": 0.0130773, + "auxiliary_loss_mlp": 0.00196435, + "balance_loss_clip": 1.06903589, + "balance_loss_mlp": 0.17173481, + "epoch": 0.660754546820983, + "flos": 13772820332160.0, + "grad_norm": 4.740401414373968, + "language_loss": 0.85838544, + "learning_rate": 1.0904404484376064e-06, + "loss": 0.87342715, + "num_input_tokens_seen": 237238270, + "router_z_loss_clip": 2.38867188, + "router_z_loss_mlp": 0.24682617, + "step": 10990, + "time_per_iteration": 2.577291250228882 + }, + { + "auxiliary_loss_clip": 0.01314972, + "auxiliary_loss_mlp": 0.0021506, + "balance_loss_clip": 1.07301581, + "balance_loss_mlp": 0.18989488, + "epoch": 0.660814670073651, + "flos": 15705209232000.0, + "grad_norm": 10.799305458272546, + "language_loss": 0.70793259, + "learning_rate": 1.0900936095819937e-06, + "loss": 0.72323292, + "num_input_tokens_seen": 237255400, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.25195312, + "step": 10991, + "time_per_iteration": 2.6064372062683105 + }, + { + "auxiliary_loss_clip": 0.01322659, + "auxiliary_loss_mlp": 0.00206373, + "balance_loss_clip": 1.0761199, + "balance_loss_mlp": 0.17866868, + "epoch": 0.6608747933263189, + "flos": 20850023076480.0, + "grad_norm": 80.47784600718673, + "language_loss": 0.78198504, + "learning_rate": 1.0897468052301234e-06, + "loss": 0.7972753, + "num_input_tokens_seen": 237273105, + "router_z_loss_clip": 2.46679688, + "router_z_loss_mlp": 0.27685547, + "step": 10992, + "time_per_iteration": 4.077103137969971 + }, + { + "auxiliary_loss_clip": 0.01323213, + "auxiliary_loss_mlp": 0.00210364, + "balance_loss_clip": 1.07228875, + "balance_loss_mlp": 0.1823616, + "epoch": 0.660934916578987, + "flos": 20632117219200.0, + "grad_norm": 9.50510170739613, + "language_loss": 0.95711279, + "learning_rate": 1.0894000353951444e-06, + "loss": 0.97244859, + "num_input_tokens_seen": 237292650, + "router_z_loss_clip": 2.50976562, + "router_z_loss_mlp": 0.2800293, + "step": 10993, + "time_per_iteration": 4.027516841888428 + }, + { + "auxiliary_loss_clip": 0.01346513, + "auxiliary_loss_mlp": 0.00200172, + "balance_loss_clip": 1.0881567, + "balance_loss_mlp": 0.1711323, + "epoch": 0.6609950398316549, + "flos": 25113588647040.0, + "grad_norm": 17.918226289318532, + "language_loss": 0.73076451, + "learning_rate": 1.0890533000902078e-06, + "loss": 0.74623138, + "num_input_tokens_seen": 237312865, + "router_z_loss_clip": 2.5859375, + "router_z_loss_mlp": 0.29052734, + "step": 10994, + "time_per_iteration": 2.687933921813965 + }, + { + "auxiliary_loss_clip": 0.01307271, + "auxiliary_loss_mlp": 0.00217734, + "balance_loss_clip": 1.06743073, + "balance_loss_mlp": 0.19145995, + "epoch": 0.6610551630843229, + "flos": 18661196004480.0, + "grad_norm": 5.263128290323231, + "language_loss": 0.86475456, + "learning_rate": 1.0887065993284626e-06, + "loss": 0.88000458, + "num_input_tokens_seen": 237331210, + "router_z_loss_clip": 2.3984375, + "router_z_loss_mlp": 0.26306152, + "step": 10995, + "time_per_iteration": 2.616635322570801 + }, + { + "auxiliary_loss_clip": 0.01310948, + "auxiliary_loss_mlp": 0.00203989, + "balance_loss_clip": 1.0694983, + "balance_loss_mlp": 0.17903873, + "epoch": 0.6611152863369908, + "flos": 23258192549760.0, + "grad_norm": 1417.8763854982528, + "language_loss": 0.81085432, + "learning_rate": 1.088359933123053e-06, + "loss": 0.82600367, + "num_input_tokens_seen": 237349455, + "router_z_loss_clip": 2.41601562, + "router_z_loss_mlp": 0.24951172, + "step": 10996, + "time_per_iteration": 2.665783166885376 + }, + { + "auxiliary_loss_clip": 0.01317735, + "auxiliary_loss_mlp": 0.00206645, + "balance_loss_clip": 1.07689905, + "balance_loss_mlp": 0.18124199, + "epoch": 0.6611754095896588, + "flos": 22159720776960.0, + "grad_norm": 4.19632665026702, + "language_loss": 0.7679745, + "learning_rate": 1.088013301487126e-06, + "loss": 0.78321832, + "num_input_tokens_seen": 237367100, + "router_z_loss_clip": 2.4140625, + "router_z_loss_mlp": 0.25378418, + "step": 10997, + "time_per_iteration": 4.0714099407196045 + }, + { + "auxiliary_loss_clip": 0.01320009, + "auxiliary_loss_mlp": 0.00201079, + "balance_loss_clip": 1.07257915, + "balance_loss_mlp": 0.17419702, + "epoch": 0.6612355328423267, + "flos": 13991228979840.0, + "grad_norm": 31.162974855774134, + "language_loss": 0.79015601, + "learning_rate": 1.0876667044338269e-06, + "loss": 0.80536687, + "num_input_tokens_seen": 237384840, + "router_z_loss_clip": 2.47070312, + "router_z_loss_mlp": 0.26879883, + "step": 10998, + "time_per_iteration": 2.596104145050049 + }, + { + "auxiliary_loss_clip": 0.01215036, + "auxiliary_loss_mlp": 0.00121651, + "balance_loss_clip": 1.05878472, + "balance_loss_mlp": 0.11220992, + "epoch": 0.6612956560949947, + "flos": 61453716359040.0, + "grad_norm": 0.6459925793216112, + "language_loss": 0.50474203, + "learning_rate": 1.087320141976297e-06, + "loss": 0.51810884, + "num_input_tokens_seen": 237443355, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.09423828, + "step": 10999, + "time_per_iteration": 3.1296231746673584 + }, + { + "auxiliary_loss_clip": 0.01311147, + "auxiliary_loss_mlp": 0.00217273, + "balance_loss_clip": 1.06902969, + "balance_loss_mlp": 0.19283488, + "epoch": 0.6613557793476627, + "flos": 21616644072960.0, + "grad_norm": 57.00656657509133, + "language_loss": 0.7752493, + "learning_rate": 1.086973614127679e-06, + "loss": 0.79053348, + "num_input_tokens_seen": 237459205, + "router_z_loss_clip": 2.42382812, + "router_z_loss_mlp": 0.2442627, + "step": 11000, + "time_per_iteration": 2.6392862796783447 + }, + { + "auxiliary_loss_clip": 0.01298982, + "auxiliary_loss_mlp": 0.00202406, + "balance_loss_clip": 1.0637877, + "balance_loss_mlp": 0.17778933, + "epoch": 0.6614159026003307, + "flos": 34020117192960.0, + "grad_norm": 1.7262664204774734, + "language_loss": 0.71182984, + "learning_rate": 1.0866271209011133e-06, + "loss": 0.72684371, + "num_input_tokens_seen": 237483580, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.24621582, + "step": 11001, + "time_per_iteration": 4.1706671714782715 + }, + { + "auxiliary_loss_clip": 0.01309917, + "auxiliary_loss_mlp": 0.00190059, + "balance_loss_clip": 1.0708313, + "balance_loss_mlp": 0.16348717, + "epoch": 0.6614760258529987, + "flos": 24097281235200.0, + "grad_norm": 136.72045858319976, + "language_loss": 0.79074585, + "learning_rate": 1.086280662309739e-06, + "loss": 0.8057456, + "num_input_tokens_seen": 237502860, + "router_z_loss_clip": 2.39257812, + "router_z_loss_mlp": 0.26538086, + "step": 11002, + "time_per_iteration": 2.6834259033203125 + }, + { + "auxiliary_loss_clip": 0.01304313, + "auxiliary_loss_mlp": 0.00206421, + "balance_loss_clip": 1.06887889, + "balance_loss_mlp": 0.18213849, + "epoch": 0.6615361491056666, + "flos": 14903790935040.0, + "grad_norm": 6.95275925639977, + "language_loss": 0.86796963, + "learning_rate": 1.0859342383666928e-06, + "loss": 0.88307703, + "num_input_tokens_seen": 237521030, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.24304199, + "step": 11003, + "time_per_iteration": 2.6476757526397705 + }, + { + "auxiliary_loss_clip": 0.01340398, + "auxiliary_loss_mlp": 0.00217356, + "balance_loss_clip": 1.08949184, + "balance_loss_mlp": 0.18915114, + "epoch": 0.6615962723583346, + "flos": 15304877176320.0, + "grad_norm": 26.212404148920797, + "language_loss": 0.80156469, + "learning_rate": 1.0855878490851119e-06, + "loss": 0.81714225, + "num_input_tokens_seen": 237539585, + "router_z_loss_clip": 2.50585938, + "router_z_loss_mlp": 0.28198242, + "step": 11004, + "time_per_iteration": 2.6818130016326904 + }, + { + "auxiliary_loss_clip": 0.01326474, + "auxiliary_loss_mlp": 0.00220258, + "balance_loss_clip": 1.0752387, + "balance_loss_mlp": 0.1930311, + "epoch": 0.6616563956110025, + "flos": 18732586285440.0, + "grad_norm": 69.88220206356237, + "language_loss": 0.80191159, + "learning_rate": 1.085241494478132e-06, + "loss": 0.81737888, + "num_input_tokens_seen": 237557655, + "router_z_loss_clip": 2.51367188, + "router_z_loss_mlp": 0.2722168, + "step": 11005, + "time_per_iteration": 2.6136467456817627 + }, + { + "auxiliary_loss_clip": 0.01304904, + "auxiliary_loss_mlp": 0.00205018, + "balance_loss_clip": 1.06656539, + "balance_loss_mlp": 0.18114041, + "epoch": 0.6617165188636706, + "flos": 24495063425280.0, + "grad_norm": 3.3884146505108594, + "language_loss": 0.84034801, + "learning_rate": 1.0848951745588855e-06, + "loss": 0.85544729, + "num_input_tokens_seen": 237577000, + "router_z_loss_clip": 2.3828125, + "router_z_loss_mlp": 0.2388916, + "step": 11006, + "time_per_iteration": 2.713775873184204 + }, + { + "auxiliary_loss_clip": 0.01314688, + "auxiliary_loss_mlp": 0.00201662, + "balance_loss_clip": 1.06915593, + "balance_loss_mlp": 0.17559059, + "epoch": 0.6617766421163385, + "flos": 22379673709440.0, + "grad_norm": 16.716259497089375, + "language_loss": 0.83501899, + "learning_rate": 1.0845488893405068e-06, + "loss": 0.85018253, + "num_input_tokens_seen": 237597960, + "router_z_loss_clip": 2.45703125, + "router_z_loss_mlp": 0.26098633, + "step": 11007, + "time_per_iteration": 2.6609649658203125 + }, + { + "auxiliary_loss_clip": 0.01332409, + "auxiliary_loss_mlp": 0.00209655, + "balance_loss_clip": 1.08098364, + "balance_loss_mlp": 0.18402538, + "epoch": 0.6618367653690065, + "flos": 20850418126080.0, + "grad_norm": 8.099382482980749, + "language_loss": 0.86976707, + "learning_rate": 1.0842026388361248e-06, + "loss": 0.88518775, + "num_input_tokens_seen": 237616385, + "router_z_loss_clip": 2.51367188, + "router_z_loss_mlp": 0.2565918, + "step": 11008, + "time_per_iteration": 2.6517550945281982 + }, + { + "auxiliary_loss_clip": 0.0132632, + "auxiliary_loss_mlp": 0.00218429, + "balance_loss_clip": 1.07614291, + "balance_loss_mlp": 0.19109425, + "epoch": 0.6618968886216744, + "flos": 17712328377600.0, + "grad_norm": 54.17045749775656, + "language_loss": 0.90103018, + "learning_rate": 1.0838564230588715e-06, + "loss": 0.91647762, + "num_input_tokens_seen": 237634930, + "router_z_loss_clip": 2.50390625, + "router_z_loss_mlp": 0.2734375, + "step": 11009, + "time_per_iteration": 2.6340255737304688 + }, + { + "auxiliary_loss_clip": 0.0119621, + "auxiliary_loss_mlp": 0.00090149, + "balance_loss_clip": 1.03999138, + "balance_loss_mlp": 0.08128002, + "epoch": 0.6619570118743424, + "flos": 67035347498880.0, + "grad_norm": 0.9775835445113337, + "language_loss": 0.67004734, + "learning_rate": 1.0835102420218735e-06, + "loss": 0.68291098, + "num_input_tokens_seen": 237693175, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.08886719, + "step": 11010, + "time_per_iteration": 3.117248773574829 + }, + { + "auxiliary_loss_clip": 0.0132341, + "auxiliary_loss_mlp": 0.00213666, + "balance_loss_clip": 1.07631004, + "balance_loss_mlp": 0.18603356, + "epoch": 0.6620171351270103, + "flos": 18660908695680.0, + "grad_norm": 12.8246900283568, + "language_loss": 0.80893147, + "learning_rate": 1.0831640957382593e-06, + "loss": 0.82430226, + "num_input_tokens_seen": 237713160, + "router_z_loss_clip": 2.47265625, + "router_z_loss_mlp": 0.27648926, + "step": 11011, + "time_per_iteration": 2.6529664993286133 + }, + { + "auxiliary_loss_clip": 0.01311887, + "auxiliary_loss_mlp": 0.00219961, + "balance_loss_clip": 1.06801009, + "balance_loss_mlp": 0.19390163, + "epoch": 0.6620772583796783, + "flos": 24170503109760.0, + "grad_norm": 11.673914333388911, + "language_loss": 0.7977379, + "learning_rate": 1.0828179842211557e-06, + "loss": 0.81305635, + "num_input_tokens_seen": 237733600, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.26062012, + "step": 11012, + "time_per_iteration": 2.6502583026885986 + }, + { + "auxiliary_loss_clip": 0.01291868, + "auxiliary_loss_mlp": 0.00197872, + "balance_loss_clip": 1.06116235, + "balance_loss_mlp": 0.17602131, + "epoch": 0.6621373816323463, + "flos": 23623547736960.0, + "grad_norm": 3.410284685492743, + "language_loss": 0.86377776, + "learning_rate": 1.0824719074836845e-06, + "loss": 0.8786751, + "num_input_tokens_seen": 237752135, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.21862793, + "step": 11013, + "time_per_iteration": 2.6754016876220703 + }, + { + "auxiliary_loss_clip": 0.01322366, + "auxiliary_loss_mlp": 0.00215225, + "balance_loss_clip": 1.08142602, + "balance_loss_mlp": 0.18972582, + "epoch": 0.6621975048850143, + "flos": 18442212739200.0, + "grad_norm": 171.59903115980063, + "language_loss": 0.79553342, + "learning_rate": 1.082125865538971e-06, + "loss": 0.81090933, + "num_input_tokens_seen": 237770735, + "router_z_loss_clip": 2.41015625, + "router_z_loss_mlp": 0.25488281, + "step": 11014, + "time_per_iteration": 2.5936050415039062 + }, + { + "auxiliary_loss_clip": 0.01309146, + "auxiliary_loss_mlp": 0.00214715, + "balance_loss_clip": 1.0709269, + "balance_loss_mlp": 0.18941867, + "epoch": 0.6622576281376823, + "flos": 14063876236800.0, + "grad_norm": 4010.0250004299874, + "language_loss": 0.85255289, + "learning_rate": 1.081779858400137e-06, + "loss": 0.86779153, + "num_input_tokens_seen": 237789005, + "router_z_loss_clip": 2.38476562, + "router_z_loss_mlp": 0.25317383, + "step": 11015, + "time_per_iteration": 2.632601261138916 + }, + { + "auxiliary_loss_clip": 0.01327547, + "auxiliary_loss_mlp": 0.0020398, + "balance_loss_clip": 1.0788213, + "balance_loss_mlp": 0.17737275, + "epoch": 0.6623177513903502, + "flos": 17018965169280.0, + "grad_norm": 13.144387668284084, + "language_loss": 0.90288651, + "learning_rate": 1.0814338860803021e-06, + "loss": 0.9182018, + "num_input_tokens_seen": 237807740, + "router_z_loss_clip": 2.48632812, + "router_z_loss_mlp": 0.26611328, + "step": 11016, + "time_per_iteration": 2.6051058769226074 + }, + { + "auxiliary_loss_clip": 0.01340209, + "auxiliary_loss_mlp": 0.00217527, + "balance_loss_clip": 1.0878818, + "balance_loss_mlp": 0.18895274, + "epoch": 0.6623778746430182, + "flos": 17271021882240.0, + "grad_norm": 34.517595497635405, + "language_loss": 0.83487284, + "learning_rate": 1.0810879485925864e-06, + "loss": 0.85045016, + "num_input_tokens_seen": 237826340, + "router_z_loss_clip": 2.51953125, + "router_z_loss_mlp": 0.28540039, + "step": 11017, + "time_per_iteration": 2.643122434616089 + }, + { + "auxiliary_loss_clip": 0.01329295, + "auxiliary_loss_mlp": 0.00217423, + "balance_loss_clip": 1.08531141, + "balance_loss_mlp": 0.19126859, + "epoch": 0.6624379978956861, + "flos": 48792688767360.0, + "grad_norm": 4.953408435792612, + "language_loss": 0.84034628, + "learning_rate": 1.0807420459501084e-06, + "loss": 0.8558135, + "num_input_tokens_seen": 237848305, + "router_z_loss_clip": 2.43945312, + "router_z_loss_mlp": 0.26184082, + "step": 11018, + "time_per_iteration": 2.936920404434204 + }, + { + "auxiliary_loss_clip": 0.01305146, + "auxiliary_loss_mlp": 0.00206217, + "balance_loss_clip": 1.0669893, + "balance_loss_mlp": 0.18195805, + "epoch": 0.6624981211483542, + "flos": 18952431477120.0, + "grad_norm": 59.63110331423388, + "language_loss": 0.90634918, + "learning_rate": 1.0803961781659841e-06, + "loss": 0.92146277, + "num_input_tokens_seen": 237867020, + "router_z_loss_clip": 2.37890625, + "router_z_loss_mlp": 0.24255371, + "step": 11019, + "time_per_iteration": 2.680730104446411 + }, + { + "auxiliary_loss_clip": 0.01304717, + "auxiliary_loss_mlp": 0.00231849, + "balance_loss_clip": 1.07030869, + "balance_loss_mlp": 0.20605251, + "epoch": 0.6625582444010221, + "flos": 23256576437760.0, + "grad_norm": 3.345653983912836, + "language_loss": 0.77589941, + "learning_rate": 1.080050345253328e-06, + "loss": 0.79126513, + "num_input_tokens_seen": 237886710, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.25805664, + "step": 11020, + "time_per_iteration": 2.687253713607788 + }, + { + "auxiliary_loss_clip": 0.01342455, + "auxiliary_loss_mlp": 0.00224092, + "balance_loss_clip": 1.0912137, + "balance_loss_mlp": 0.19582756, + "epoch": 0.6626183676536901, + "flos": 21394823633280.0, + "grad_norm": 2.279079471693407, + "language_loss": 0.79273897, + "learning_rate": 1.0797045472252554e-06, + "loss": 0.80840445, + "num_input_tokens_seen": 237904795, + "router_z_loss_clip": 2.51171875, + "router_z_loss_mlp": 0.28271484, + "step": 11021, + "time_per_iteration": 2.7399682998657227 + }, + { + "auxiliary_loss_clip": 0.01333922, + "auxiliary_loss_mlp": 0.00227853, + "balance_loss_clip": 1.09148836, + "balance_loss_mlp": 0.19938551, + "epoch": 0.662678490906358, + "flos": 14571293713920.0, + "grad_norm": 64.35818947220129, + "language_loss": 0.90711522, + "learning_rate": 1.0793587840948793e-06, + "loss": 0.92273301, + "num_input_tokens_seen": 237921320, + "router_z_loss_clip": 2.42578125, + "router_z_loss_mlp": 0.2845459, + "step": 11022, + "time_per_iteration": 2.736555576324463 + }, + { + "auxiliary_loss_clip": 0.01363694, + "auxiliary_loss_mlp": 0.00228242, + "balance_loss_clip": 1.09674442, + "balance_loss_mlp": 0.19715185, + "epoch": 0.662738614159026, + "flos": 15992350554240.0, + "grad_norm": 9.266504744567998, + "language_loss": 0.88706428, + "learning_rate": 1.0790130558753099e-06, + "loss": 0.90298361, + "num_input_tokens_seen": 237933525, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.31103516, + "step": 11023, + "time_per_iteration": 2.6191444396972656 + }, + { + "auxiliary_loss_clip": 0.01330342, + "auxiliary_loss_mlp": 0.00224946, + "balance_loss_clip": 1.09027624, + "balance_loss_mlp": 0.19810009, + "epoch": 0.6627987374116939, + "flos": 19536338966400.0, + "grad_norm": 4.839216360389303, + "language_loss": 0.81343567, + "learning_rate": 1.0786673625796574e-06, + "loss": 0.82898855, + "num_input_tokens_seen": 237953395, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.2689209, + "step": 11024, + "time_per_iteration": 2.6421637535095215 + }, + { + "auxiliary_loss_clip": 0.01360013, + "auxiliary_loss_mlp": 0.00234272, + "balance_loss_clip": 1.1035502, + "balance_loss_mlp": 0.20438653, + "epoch": 0.662858860664362, + "flos": 15702838934400.0, + "grad_norm": 142.88119463158276, + "language_loss": 0.81744641, + "learning_rate": 1.0783217042210306e-06, + "loss": 0.83338928, + "num_input_tokens_seen": 237971445, + "router_z_loss_clip": 2.56054688, + "router_z_loss_mlp": 0.29870605, + "step": 11025, + "time_per_iteration": 2.7105233669281006 + }, + { + "auxiliary_loss_clip": 0.01362168, + "auxiliary_loss_mlp": 0.00220634, + "balance_loss_clip": 1.10154486, + "balance_loss_mlp": 0.18994993, + "epoch": 0.6629189839170299, + "flos": 20154289570560.0, + "grad_norm": 4000.398566753828, + "language_loss": 0.86473113, + "learning_rate": 1.0779760808125379e-06, + "loss": 0.88055921, + "num_input_tokens_seen": 237989965, + "router_z_loss_clip": 2.60546875, + "router_z_loss_mlp": 0.30688477, + "step": 11026, + "time_per_iteration": 2.6520049571990967 + }, + { + "auxiliary_loss_clip": 0.01324224, + "auxiliary_loss_mlp": 0.00215253, + "balance_loss_clip": 1.07702327, + "balance_loss_mlp": 0.18700041, + "epoch": 0.6629791071696979, + "flos": 20915415786240.0, + "grad_norm": 5.1897221259831285, + "language_loss": 0.83979452, + "learning_rate": 1.0776304923672842e-06, + "loss": 0.85518932, + "num_input_tokens_seen": 238006820, + "router_z_loss_clip": 2.47070312, + "router_z_loss_mlp": 0.28259277, + "step": 11027, + "time_per_iteration": 2.6647684574127197 + }, + { + "auxiliary_loss_clip": 0.01328454, + "auxiliary_loss_mlp": 0.00224332, + "balance_loss_clip": 1.08557153, + "balance_loss_mlp": 0.19586453, + "epoch": 0.6630392304223659, + "flos": 20846898593280.0, + "grad_norm": 6.02540474170768, + "language_loss": 0.81762862, + "learning_rate": 1.0772849388983742e-06, + "loss": 0.83315647, + "num_input_tokens_seen": 238022560, + "router_z_loss_clip": 2.42578125, + "router_z_loss_mlp": 0.28503418, + "step": 11028, + "time_per_iteration": 2.6518003940582275 + }, + { + "auxiliary_loss_clip": 0.01325464, + "auxiliary_loss_mlp": 0.00207194, + "balance_loss_clip": 1.0827527, + "balance_loss_mlp": 0.18000214, + "epoch": 0.6630993536750338, + "flos": 20995820380800.0, + "grad_norm": 431.64723499318114, + "language_loss": 0.8744908, + "learning_rate": 1.0769394204189138e-06, + "loss": 0.88981742, + "num_input_tokens_seen": 238041895, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.2722168, + "step": 11029, + "time_per_iteration": 2.6660776138305664 + }, + { + "auxiliary_loss_clip": 0.0132888, + "auxiliary_loss_mlp": 0.00212975, + "balance_loss_clip": 1.08609676, + "balance_loss_mlp": 0.18606925, + "epoch": 0.6631594769277018, + "flos": 18259032355200.0, + "grad_norm": 7.858498183882519, + "language_loss": 0.87515771, + "learning_rate": 1.0765939369420012e-06, + "loss": 0.89057624, + "num_input_tokens_seen": 238060445, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.26904297, + "step": 11030, + "time_per_iteration": 2.644345760345459 + }, + { + "auxiliary_loss_clip": 0.01363678, + "auxiliary_loss_mlp": 0.0022296, + "balance_loss_clip": 1.1091181, + "balance_loss_mlp": 0.19518416, + "epoch": 0.6632196001803697, + "flos": 17820491207040.0, + "grad_norm": 267.651267033203, + "language_loss": 0.87287939, + "learning_rate": 1.0762484884807391e-06, + "loss": 0.88874578, + "num_input_tokens_seen": 238077080, + "router_z_loss_clip": 2.546875, + "router_z_loss_mlp": 0.27783203, + "step": 11031, + "time_per_iteration": 2.6109282970428467 + }, + { + "auxiliary_loss_clip": 0.0134285, + "auxiliary_loss_mlp": 0.00218074, + "balance_loss_clip": 1.09337044, + "balance_loss_mlp": 0.19071537, + "epoch": 0.6632797234330378, + "flos": 12670182581760.0, + "grad_norm": 6.44422893466026, + "language_loss": 0.8637504, + "learning_rate": 1.075903075048228e-06, + "loss": 0.8793596, + "num_input_tokens_seen": 238091045, + "router_z_loss_clip": 2.49609375, + "router_z_loss_mlp": 0.27380371, + "step": 11032, + "time_per_iteration": 2.6281681060791016 + }, + { + "auxiliary_loss_clip": 0.01315055, + "auxiliary_loss_mlp": 0.00194812, + "balance_loss_clip": 1.07463562, + "balance_loss_mlp": 0.16840698, + "epoch": 0.6633398466857057, + "flos": 23584728113280.0, + "grad_norm": 6.979344023250589, + "language_loss": 0.88499534, + "learning_rate": 1.0755576966575635e-06, + "loss": 0.90009403, + "num_input_tokens_seen": 238110220, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.26403809, + "step": 11033, + "time_per_iteration": 2.7141945362091064 + }, + { + "auxiliary_loss_clip": 0.01353551, + "auxiliary_loss_mlp": 0.00236076, + "balance_loss_clip": 1.09809518, + "balance_loss_mlp": 0.20559445, + "epoch": 0.6633999699383737, + "flos": 20631686256000.0, + "grad_norm": 5.088107633825604, + "language_loss": 0.89340174, + "learning_rate": 1.0752123533218451e-06, + "loss": 0.909298, + "num_input_tokens_seen": 238130400, + "router_z_loss_clip": 2.55664062, + "router_z_loss_mlp": 0.30480957, + "step": 11034, + "time_per_iteration": 4.079174995422363 + }, + { + "auxiliary_loss_clip": 0.0130167, + "auxiliary_loss_mlp": 0.00204928, + "balance_loss_clip": 1.07167554, + "balance_loss_mlp": 0.18116991, + "epoch": 0.6634600931910416, + "flos": 21797095023360.0, + "grad_norm": 13.385411810843099, + "language_loss": 0.82952487, + "learning_rate": 1.074867045054166e-06, + "loss": 0.84459078, + "num_input_tokens_seen": 238148165, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.23754883, + "step": 11035, + "time_per_iteration": 4.090306520462036 + }, + { + "auxiliary_loss_clip": 0.01345896, + "auxiliary_loss_mlp": 0.00200781, + "balance_loss_clip": 1.09283519, + "balance_loss_mlp": 0.17193222, + "epoch": 0.6635202164437096, + "flos": 18732873594240.0, + "grad_norm": 2.7276029236074284, + "language_loss": 0.91539776, + "learning_rate": 1.074521771867622e-06, + "loss": 0.93086451, + "num_input_tokens_seen": 238166360, + "router_z_loss_clip": 2.53320312, + "router_z_loss_mlp": 0.28857422, + "step": 11036, + "time_per_iteration": 2.657353639602661 + }, + { + "auxiliary_loss_clip": 0.01270705, + "auxiliary_loss_mlp": 0.00077486, + "balance_loss_clip": 1.11072636, + "balance_loss_mlp": 0.06847345, + "epoch": 0.6635803396963775, + "flos": 60222771227520.0, + "grad_norm": 0.7435380376249618, + "language_loss": 0.51429021, + "learning_rate": 1.0741765337753044e-06, + "loss": 0.52777207, + "num_input_tokens_seen": 238227630, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.09033203, + "step": 11037, + "time_per_iteration": 3.1461713314056396 + }, + { + "auxiliary_loss_clip": 0.01319014, + "auxiliary_loss_mlp": 0.00195681, + "balance_loss_clip": 1.07755542, + "balance_loss_mlp": 0.16728488, + "epoch": 0.6636404629490456, + "flos": 29167041611520.0, + "grad_norm": 167.3148044735357, + "language_loss": 0.86027896, + "learning_rate": 1.0738313307903052e-06, + "loss": 0.87542588, + "num_input_tokens_seen": 238248435, + "router_z_loss_clip": 2.41601562, + "router_z_loss_mlp": 0.28393555, + "step": 11038, + "time_per_iteration": 2.7862954139709473 + }, + { + "auxiliary_loss_clip": 0.01328164, + "auxiliary_loss_mlp": 0.00249127, + "balance_loss_clip": 1.0874753, + "balance_loss_mlp": 0.22102985, + "epoch": 0.6637005862017135, + "flos": 38907702766080.0, + "grad_norm": 64.67917318920603, + "language_loss": 0.74512303, + "learning_rate": 1.073486162925716e-06, + "loss": 0.76089597, + "num_input_tokens_seen": 238268755, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.28088379, + "step": 11039, + "time_per_iteration": 4.272482872009277 + }, + { + "auxiliary_loss_clip": 0.01343551, + "auxiliary_loss_mlp": 0.00229736, + "balance_loss_clip": 1.09476781, + "balance_loss_mlp": 0.20088738, + "epoch": 0.6637607094543815, + "flos": 22783345729920.0, + "grad_norm": 56.57298763913959, + "language_loss": 0.72021407, + "learning_rate": 1.0731410301946237e-06, + "loss": 0.73594701, + "num_input_tokens_seen": 238290120, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.28857422, + "step": 11040, + "time_per_iteration": 2.727445125579834 + }, + { + "auxiliary_loss_clip": 0.01323888, + "auxiliary_loss_mlp": 0.00200063, + "balance_loss_clip": 1.08265293, + "balance_loss_mlp": 0.1740633, + "epoch": 0.6638208327070495, + "flos": 18114096977280.0, + "grad_norm": 12.905604380074468, + "language_loss": 0.81175852, + "learning_rate": 1.0727959326101161e-06, + "loss": 0.826998, + "num_input_tokens_seen": 238309290, + "router_z_loss_clip": 2.41210938, + "router_z_loss_mlp": 0.2598877, + "step": 11041, + "time_per_iteration": 2.7263808250427246 + }, + { + "auxiliary_loss_clip": 0.01333424, + "auxiliary_loss_mlp": 0.00225087, + "balance_loss_clip": 1.0920434, + "balance_loss_mlp": 0.19734702, + "epoch": 0.6638809559597174, + "flos": 29424880414080.0, + "grad_norm": 21.489010189778124, + "language_loss": 0.70112062, + "learning_rate": 1.0724508701852806e-06, + "loss": 0.71670568, + "num_input_tokens_seen": 238327280, + "router_z_loss_clip": 2.41601562, + "router_z_loss_mlp": 0.27758789, + "step": 11042, + "time_per_iteration": 2.741267442703247 + }, + { + "auxiliary_loss_clip": 0.01343871, + "auxiliary_loss_mlp": 0.00219225, + "balance_loss_clip": 1.09252191, + "balance_loss_mlp": 0.18854007, + "epoch": 0.6639410792123854, + "flos": 28072699902720.0, + "grad_norm": 10.173305716744704, + "language_loss": 0.78910041, + "learning_rate": 1.0721058429331998e-06, + "loss": 0.80473131, + "num_input_tokens_seen": 238346330, + "router_z_loss_clip": 2.51171875, + "router_z_loss_mlp": 0.30725098, + "step": 11043, + "time_per_iteration": 4.111525535583496 + }, + { + "auxiliary_loss_clip": 0.01309862, + "auxiliary_loss_mlp": 0.0021331, + "balance_loss_clip": 1.07616389, + "balance_loss_mlp": 0.18859768, + "epoch": 0.6640012024650533, + "flos": 25556367600000.0, + "grad_norm": 3.4964327291312283, + "language_loss": 0.88713145, + "learning_rate": 1.0717608508669587e-06, + "loss": 0.90236318, + "num_input_tokens_seen": 238364650, + "router_z_loss_clip": 2.3359375, + "router_z_loss_mlp": 0.24731445, + "step": 11044, + "time_per_iteration": 2.7383153438568115 + }, + { + "auxiliary_loss_clip": 0.01353126, + "auxiliary_loss_mlp": 0.00222285, + "balance_loss_clip": 1.10284746, + "balance_loss_mlp": 0.19336483, + "epoch": 0.6640613257177214, + "flos": 14866946559360.0, + "grad_norm": 3.818935525689628, + "language_loss": 0.80193698, + "learning_rate": 1.0714158939996392e-06, + "loss": 0.81769109, + "num_input_tokens_seen": 238381630, + "router_z_loss_clip": 2.50390625, + "router_z_loss_mlp": 0.28930664, + "step": 11045, + "time_per_iteration": 2.677267551422119 + }, + { + "auxiliary_loss_clip": 0.01338662, + "auxiliary_loss_mlp": 0.00220162, + "balance_loss_clip": 1.09320331, + "balance_loss_mlp": 0.19206476, + "epoch": 0.6641214489703893, + "flos": 23221096778880.0, + "grad_norm": 4.964796694512858, + "language_loss": 0.72012931, + "learning_rate": 1.0710709723443235e-06, + "loss": 0.73571754, + "num_input_tokens_seen": 238402595, + "router_z_loss_clip": 2.453125, + "router_z_loss_mlp": 0.28063965, + "step": 11046, + "time_per_iteration": 2.6837244033813477 + }, + { + "auxiliary_loss_clip": 0.01317976, + "auxiliary_loss_mlp": 0.00209184, + "balance_loss_clip": 1.07988834, + "balance_loss_mlp": 0.18378031, + "epoch": 0.6641815722230573, + "flos": 37742617221120.0, + "grad_norm": 14.55161845947497, + "language_loss": 0.77583802, + "learning_rate": 1.070726085914088e-06, + "loss": 0.79110956, + "num_input_tokens_seen": 238426860, + "router_z_loss_clip": 2.37890625, + "router_z_loss_mlp": 0.25366211, + "step": 11047, + "time_per_iteration": 2.764198064804077 + }, + { + "auxiliary_loss_clip": 0.01346259, + "auxiliary_loss_mlp": 0.00224767, + "balance_loss_clip": 1.10057473, + "balance_loss_mlp": 0.1959663, + "epoch": 0.6642416954757252, + "flos": 17931132074880.0, + "grad_norm": 12.91122996596213, + "language_loss": 0.83730817, + "learning_rate": 1.0703812347220126e-06, + "loss": 0.8530184, + "num_input_tokens_seen": 238443990, + "router_z_loss_clip": 2.45507812, + "router_z_loss_mlp": 0.28808594, + "step": 11048, + "time_per_iteration": 2.6463725566864014 + }, + { + "auxiliary_loss_clip": 0.01278829, + "auxiliary_loss_mlp": 0.00113365, + "balance_loss_clip": 1.11699009, + "balance_loss_mlp": 0.10525852, + "epoch": 0.6643018187283932, + "flos": 51995384104320.0, + "grad_norm": 0.7220635702194952, + "language_loss": 0.54553211, + "learning_rate": 1.0700364187811745e-06, + "loss": 0.55945402, + "num_input_tokens_seen": 238503045, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.08105469, + "step": 11049, + "time_per_iteration": 3.155355930328369 + }, + { + "auxiliary_loss_clip": 0.013384, + "auxiliary_loss_mlp": 0.00214381, + "balance_loss_clip": 1.09545112, + "balance_loss_mlp": 0.18649827, + "epoch": 0.6643619419810611, + "flos": 30226657847040.0, + "grad_norm": 4.576893290811261, + "language_loss": 0.71977448, + "learning_rate": 1.069691638104648e-06, + "loss": 0.73530233, + "num_input_tokens_seen": 238527320, + "router_z_loss_clip": 2.4296875, + "router_z_loss_mlp": 0.27893066, + "step": 11050, + "time_per_iteration": 2.7248833179473877 + }, + { + "auxiliary_loss_clip": 0.0132625, + "auxiliary_loss_mlp": 0.00217681, + "balance_loss_clip": 1.08579183, + "balance_loss_mlp": 0.18965478, + "epoch": 0.6644220652337292, + "flos": 22966131064320.0, + "grad_norm": 16.64397957206263, + "language_loss": 0.89886206, + "learning_rate": 1.0693468927055085e-06, + "loss": 0.9143014, + "num_input_tokens_seen": 238546030, + "router_z_loss_clip": 2.40820312, + "router_z_loss_mlp": 0.28027344, + "step": 11051, + "time_per_iteration": 2.6803629398345947 + }, + { + "auxiliary_loss_clip": 0.01342429, + "auxiliary_loss_mlp": 0.00226117, + "balance_loss_clip": 1.09663796, + "balance_loss_mlp": 0.19655341, + "epoch": 0.6644821884863971, + "flos": 21142228216320.0, + "grad_norm": 16.690932316827727, + "language_loss": 0.92355686, + "learning_rate": 1.0690021825968276e-06, + "loss": 0.93924236, + "num_input_tokens_seen": 238564175, + "router_z_loss_clip": 2.45898438, + "router_z_loss_mlp": 0.29541016, + "step": 11052, + "time_per_iteration": 2.660831928253174 + }, + { + "auxiliary_loss_clip": 0.01347572, + "auxiliary_loss_mlp": 0.00225497, + "balance_loss_clip": 1.0945102, + "balance_loss_mlp": 0.19705349, + "epoch": 0.6645423117390651, + "flos": 20192821885440.0, + "grad_norm": 39.628675196494356, + "language_loss": 0.84187144, + "learning_rate": 1.0686575077916776e-06, + "loss": 0.85760218, + "num_input_tokens_seen": 238581010, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.28491211, + "step": 11053, + "time_per_iteration": 2.690481424331665 + }, + { + "auxiliary_loss_clip": 0.01309955, + "auxiliary_loss_mlp": 0.00227512, + "balance_loss_clip": 1.07614756, + "balance_loss_mlp": 0.20276463, + "epoch": 0.6646024349917331, + "flos": 24351959640960.0, + "grad_norm": 2.016946055326757, + "language_loss": 0.86517549, + "learning_rate": 1.0683128683031278e-06, + "loss": 0.88055015, + "num_input_tokens_seen": 238601365, + "router_z_loss_clip": 2.33789062, + "router_z_loss_mlp": 0.24768066, + "step": 11054, + "time_per_iteration": 2.677443265914917 + }, + { + "auxiliary_loss_clip": 0.01307472, + "auxiliary_loss_mlp": 0.002151, + "balance_loss_clip": 1.077461, + "balance_loss_mlp": 0.18746775, + "epoch": 0.664662558244401, + "flos": 18806706000000.0, + "grad_norm": 2.673033141844444, + "language_loss": 0.80205917, + "learning_rate": 1.0679682641442472e-06, + "loss": 0.81728482, + "num_input_tokens_seen": 238619850, + "router_z_loss_clip": 2.30078125, + "router_z_loss_mlp": 0.27636719, + "step": 11055, + "time_per_iteration": 2.648564338684082 + }, + { + "auxiliary_loss_clip": 0.01349794, + "auxiliary_loss_mlp": 0.00221601, + "balance_loss_clip": 1.10213411, + "balance_loss_mlp": 0.19176316, + "epoch": 0.664722681497069, + "flos": 18952790613120.0, + "grad_norm": 9.98820160521406, + "language_loss": 0.84486455, + "learning_rate": 1.0676236953281042e-06, + "loss": 0.86057854, + "num_input_tokens_seen": 238637635, + "router_z_loss_clip": 2.47265625, + "router_z_loss_mlp": 0.29870605, + "step": 11056, + "time_per_iteration": 2.6633214950561523 + }, + { + "auxiliary_loss_clip": 0.01326039, + "auxiliary_loss_mlp": 0.00236661, + "balance_loss_clip": 1.08595324, + "balance_loss_mlp": 0.206954, + "epoch": 0.6647828047497369, + "flos": 19571279921280.0, + "grad_norm": 8.190865522167034, + "language_loss": 0.79716694, + "learning_rate": 1.0672791618677641e-06, + "loss": 0.81279397, + "num_input_tokens_seen": 238656200, + "router_z_loss_clip": 2.40039062, + "router_z_loss_mlp": 0.29711914, + "step": 11057, + "time_per_iteration": 2.730844736099243 + }, + { + "auxiliary_loss_clip": 0.01342792, + "auxiliary_loss_mlp": 0.00200609, + "balance_loss_clip": 1.09464884, + "balance_loss_mlp": 0.17294037, + "epoch": 0.664842928002405, + "flos": 23149455102720.0, + "grad_norm": 9.75676712505621, + "language_loss": 0.88838124, + "learning_rate": 1.066934663776291e-06, + "loss": 0.90381521, + "num_input_tokens_seen": 238675005, + "router_z_loss_clip": 2.48242188, + "router_z_loss_mlp": 0.27685547, + "step": 11058, + "time_per_iteration": 2.6712958812713623 + }, + { + "auxiliary_loss_clip": 0.01260405, + "auxiliary_loss_mlp": 0.00080382, + "balance_loss_clip": 1.10241556, + "balance_loss_mlp": 0.07294323, + "epoch": 0.6649030512550729, + "flos": 65244913148160.0, + "grad_norm": 0.7529408138088636, + "language_loss": 0.62004191, + "learning_rate": 1.0665902010667496e-06, + "loss": 0.63344979, + "num_input_tokens_seen": 238731425, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.07421875, + "step": 11059, + "time_per_iteration": 3.0669233798980713 + }, + { + "auxiliary_loss_clip": 0.01301186, + "auxiliary_loss_mlp": 0.00221929, + "balance_loss_clip": 1.07162642, + "balance_loss_mlp": 0.1962629, + "epoch": 0.6649631745077409, + "flos": 20194797133440.0, + "grad_norm": 7.5310791373914885, + "language_loss": 0.84936416, + "learning_rate": 1.0662457737522008e-06, + "loss": 0.86459535, + "num_input_tokens_seen": 238752020, + "router_z_loss_clip": 2.29101562, + "router_z_loss_mlp": 0.25634766, + "step": 11060, + "time_per_iteration": 2.636038064956665 + }, + { + "auxiliary_loss_clip": 0.0131728, + "auxiliary_loss_mlp": 0.0021398, + "balance_loss_clip": 1.07885671, + "balance_loss_mlp": 0.18709852, + "epoch": 0.6650232977604088, + "flos": 17238558965760.0, + "grad_norm": 4.739421964250357, + "language_loss": 0.86255348, + "learning_rate": 1.0659013818457055e-06, + "loss": 0.87786609, + "num_input_tokens_seen": 238769665, + "router_z_loss_clip": 2.38476562, + "router_z_loss_mlp": 0.26855469, + "step": 11061, + "time_per_iteration": 2.6438612937927246 + }, + { + "auxiliary_loss_clip": 0.01320037, + "auxiliary_loss_mlp": 0.00208231, + "balance_loss_clip": 1.08242416, + "balance_loss_mlp": 0.18009731, + "epoch": 0.6650834210130768, + "flos": 10006867825920.0, + "grad_norm": 16.70752121773909, + "language_loss": 0.65535951, + "learning_rate": 1.0655570253603243e-06, + "loss": 0.67064214, + "num_input_tokens_seen": 238782180, + "router_z_loss_clip": 2.37695312, + "router_z_loss_mlp": 0.28125, + "step": 11062, + "time_per_iteration": 2.6328303813934326 + }, + { + "auxiliary_loss_clip": 0.01314815, + "auxiliary_loss_mlp": 0.00215248, + "balance_loss_clip": 1.07255006, + "balance_loss_mlp": 0.18717435, + "epoch": 0.6651435442657447, + "flos": 10452088903680.0, + "grad_norm": 7.549475212894057, + "language_loss": 0.86532158, + "learning_rate": 1.0652127043091144e-06, + "loss": 0.88062215, + "num_input_tokens_seen": 238800315, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.28088379, + "step": 11063, + "time_per_iteration": 2.623905658721924 + }, + { + "auxiliary_loss_clip": 0.01317553, + "auxiliary_loss_mlp": 0.00196395, + "balance_loss_clip": 1.07950068, + "balance_loss_mlp": 0.16865513, + "epoch": 0.6652036675184128, + "flos": 22344229964160.0, + "grad_norm": 194.68605213943553, + "language_loss": 0.7767669, + "learning_rate": 1.0648684187051316e-06, + "loss": 0.79190642, + "num_input_tokens_seen": 238822250, + "router_z_loss_clip": 2.3828125, + "router_z_loss_mlp": 0.27709961, + "step": 11064, + "time_per_iteration": 2.808529853820801 + }, + { + "auxiliary_loss_clip": 0.01258142, + "auxiliary_loss_mlp": 0.00122799, + "balance_loss_clip": 1.10071039, + "balance_loss_mlp": 0.11464508, + "epoch": 0.6652637907710807, + "flos": 52909633998720.0, + "grad_norm": 1.104478105494719, + "language_loss": 0.62396896, + "learning_rate": 1.0645241685614322e-06, + "loss": 0.6377784, + "num_input_tokens_seen": 238877190, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.08154297, + "step": 11065, + "time_per_iteration": 3.0941576957702637 + }, + { + "auxiliary_loss_clip": 0.01335858, + "auxiliary_loss_mlp": 0.00231172, + "balance_loss_clip": 1.09126985, + "balance_loss_mlp": 0.20258537, + "epoch": 0.6653239140237487, + "flos": 23104637907840.0, + "grad_norm": 5.42160028619434, + "language_loss": 0.71016169, + "learning_rate": 1.0641799538910708e-06, + "loss": 0.72583193, + "num_input_tokens_seen": 238896010, + "router_z_loss_clip": 2.44335938, + "router_z_loss_mlp": 0.28588867, + "step": 11066, + "time_per_iteration": 2.7121167182922363 + }, + { + "auxiliary_loss_clip": 0.01310432, + "auxiliary_loss_mlp": 0.00225557, + "balance_loss_clip": 1.07764697, + "balance_loss_mlp": 0.19592187, + "epoch": 0.6653840372764167, + "flos": 25959393175680.0, + "grad_norm": 7.807055371604854, + "language_loss": 0.74738038, + "learning_rate": 1.0638357747070985e-06, + "loss": 0.76274025, + "num_input_tokens_seen": 238918990, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.29663086, + "step": 11067, + "time_per_iteration": 2.7526328563690186 + }, + { + "auxiliary_loss_clip": 0.01250534, + "auxiliary_loss_mlp": 0.00081958, + "balance_loss_clip": 1.09468865, + "balance_loss_mlp": 0.0733752, + "epoch": 0.6654441605290846, + "flos": 66041985899520.0, + "grad_norm": 0.8889512278614532, + "language_loss": 0.7135253, + "learning_rate": 1.0634916310225684e-06, + "loss": 0.72685015, + "num_input_tokens_seen": 238975735, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.0859375, + "step": 11068, + "time_per_iteration": 3.163757801055908 + }, + { + "auxiliary_loss_clip": 0.01249606, + "auxiliary_loss_mlp": 0.00109474, + "balance_loss_clip": 1.0952673, + "balance_loss_mlp": 0.10265537, + "epoch": 0.6655042837817526, + "flos": 65196112521600.0, + "grad_norm": 0.7018646067512857, + "language_loss": 0.57425451, + "learning_rate": 1.0631475228505285e-06, + "loss": 0.58784533, + "num_input_tokens_seen": 239042360, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.06835938, + "step": 11069, + "time_per_iteration": 3.2369637489318848 + }, + { + "auxiliary_loss_clip": 0.01251835, + "auxiliary_loss_mlp": 0.00070717, + "balance_loss_clip": 1.09750175, + "balance_loss_mlp": 0.06327787, + "epoch": 0.6655644070344205, + "flos": 69008746752000.0, + "grad_norm": 0.7615894395703928, + "language_loss": 0.62520063, + "learning_rate": 1.062803450204029e-06, + "loss": 0.63842607, + "num_input_tokens_seen": 239109410, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.07421875, + "step": 11070, + "time_per_iteration": 3.2223222255706787 + }, + { + "auxiliary_loss_clip": 0.01305645, + "auxiliary_loss_mlp": 0.00200235, + "balance_loss_clip": 1.07565129, + "balance_loss_mlp": 0.1740445, + "epoch": 0.6656245302870886, + "flos": 36315562809600.0, + "grad_norm": 4487.2493810219, + "language_loss": 0.68167925, + "learning_rate": 1.062459413096116e-06, + "loss": 0.696738, + "num_input_tokens_seen": 239135345, + "router_z_loss_clip": 2.30273438, + "router_z_loss_mlp": 0.26171875, + "step": 11071, + "time_per_iteration": 2.8197529315948486 + }, + { + "auxiliary_loss_clip": 0.01331726, + "auxiliary_loss_mlp": 0.0020697, + "balance_loss_clip": 1.09714079, + "balance_loss_mlp": 0.18097015, + "epoch": 0.6656846535397565, + "flos": 21794832466560.0, + "grad_norm": 18.159954072695946, + "language_loss": 0.79407573, + "learning_rate": 1.0621154115398364e-06, + "loss": 0.80946267, + "num_input_tokens_seen": 239154340, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.26000977, + "step": 11072, + "time_per_iteration": 2.6498849391937256 + }, + { + "auxiliary_loss_clip": 0.01311318, + "auxiliary_loss_mlp": 0.00214606, + "balance_loss_clip": 1.07624209, + "balance_loss_mlp": 0.18718764, + "epoch": 0.6657447767924245, + "flos": 37487615592960.0, + "grad_norm": 16.924933307414022, + "language_loss": 0.76907802, + "learning_rate": 1.0617714455482353e-06, + "loss": 0.78433722, + "num_input_tokens_seen": 239177815, + "router_z_loss_clip": 2.34960938, + "router_z_loss_mlp": 0.27429199, + "step": 11073, + "time_per_iteration": 2.7898452281951904 + }, + { + "auxiliary_loss_clip": 0.01333971, + "auxiliary_loss_mlp": 0.00194516, + "balance_loss_clip": 1.08811975, + "balance_loss_mlp": 0.16505969, + "epoch": 0.6658049000450924, + "flos": 16837688206080.0, + "grad_norm": 5.998503280476906, + "language_loss": 0.67883027, + "learning_rate": 1.061427515134354e-06, + "loss": 0.69411516, + "num_input_tokens_seen": 239195735, + "router_z_loss_clip": 2.45507812, + "router_z_loss_mlp": 0.29431152, + "step": 11074, + "time_per_iteration": 2.6236162185668945 + }, + { + "auxiliary_loss_clip": 0.01297754, + "auxiliary_loss_mlp": 0.00188976, + "balance_loss_clip": 1.06922174, + "balance_loss_mlp": 0.16313094, + "epoch": 0.6658650232977604, + "flos": 33510975863040.0, + "grad_norm": 54.94338736780369, + "language_loss": 0.78335035, + "learning_rate": 1.061083620311235e-06, + "loss": 0.79821765, + "num_input_tokens_seen": 239217535, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.25842285, + "step": 11075, + "time_per_iteration": 2.7730679512023926 + }, + { + "auxiliary_loss_clip": 0.01312984, + "auxiliary_loss_mlp": 0.00204702, + "balance_loss_clip": 1.0770936, + "balance_loss_mlp": 0.17787966, + "epoch": 0.6659251465504283, + "flos": 37706311549440.0, + "grad_norm": 43.62880066545068, + "language_loss": 0.72613734, + "learning_rate": 1.0607397610919202e-06, + "loss": 0.74131417, + "num_input_tokens_seen": 239241975, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.26806641, + "step": 11076, + "time_per_iteration": 4.205381870269775 + }, + { + "auxiliary_loss_clip": 0.01338323, + "auxiliary_loss_mlp": 0.00180675, + "balance_loss_clip": 1.08985543, + "balance_loss_mlp": 0.14906095, + "epoch": 0.6659852698030964, + "flos": 24893420232960.0, + "grad_norm": 3.7513189059400585, + "language_loss": 0.83584177, + "learning_rate": 1.0603959374894468e-06, + "loss": 0.85103178, + "num_input_tokens_seen": 239262025, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.31616211, + "step": 11077, + "time_per_iteration": 4.214414358139038 + }, + { + "auxiliary_loss_clip": 0.01320126, + "auxiliary_loss_mlp": 0.00200548, + "balance_loss_clip": 1.08035088, + "balance_loss_mlp": 0.1706382, + "epoch": 0.6660453930557643, + "flos": 24352821567360.0, + "grad_norm": 3.6882647767335794, + "language_loss": 0.76146972, + "learning_rate": 1.0600521495168538e-06, + "loss": 0.77667642, + "num_input_tokens_seen": 239282775, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.29882812, + "step": 11078, + "time_per_iteration": 2.729682683944702 + }, + { + "auxiliary_loss_clip": 0.0132357, + "auxiliary_loss_mlp": 0.00196999, + "balance_loss_clip": 1.08137536, + "balance_loss_mlp": 0.16894889, + "epoch": 0.6661055163084323, + "flos": 10597814380800.0, + "grad_norm": 13.291610269706066, + "language_loss": 0.79701859, + "learning_rate": 1.0597083971871783e-06, + "loss": 0.81222427, + "num_input_tokens_seen": 239299775, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.28063965, + "step": 11079, + "time_per_iteration": 2.6364195346832275 + }, + { + "auxiliary_loss_clip": 0.01328776, + "auxiliary_loss_mlp": 0.00195573, + "balance_loss_clip": 1.08835828, + "balance_loss_mlp": 0.16784523, + "epoch": 0.6661656395611003, + "flos": 24057491944320.0, + "grad_norm": 14.40899469674049, + "language_loss": 0.86634582, + "learning_rate": 1.0593646805134544e-06, + "loss": 0.88158929, + "num_input_tokens_seen": 239319660, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.27709961, + "step": 11080, + "time_per_iteration": 2.7041614055633545 + }, + { + "auxiliary_loss_clip": 0.01312844, + "auxiliary_loss_mlp": 0.00185565, + "balance_loss_clip": 1.08262396, + "balance_loss_mlp": 0.16090098, + "epoch": 0.6662257628137682, + "flos": 23036192542080.0, + "grad_norm": 71.88135040582911, + "language_loss": 0.84642279, + "learning_rate": 1.0590209995087157e-06, + "loss": 0.86140686, + "num_input_tokens_seen": 239339215, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.24658203, + "step": 11081, + "time_per_iteration": 4.141645193099976 + }, + { + "auxiliary_loss_clip": 0.01333128, + "auxiliary_loss_mlp": 0.00201035, + "balance_loss_clip": 1.09021413, + "balance_loss_mlp": 0.17297332, + "epoch": 0.6662858860664362, + "flos": 24754446512640.0, + "grad_norm": 2.6529501221131477, + "language_loss": 0.88095307, + "learning_rate": 1.0586773541859946e-06, + "loss": 0.89629471, + "num_input_tokens_seen": 239358545, + "router_z_loss_clip": 2.43164062, + "router_z_loss_mlp": 0.28076172, + "step": 11082, + "time_per_iteration": 2.7488412857055664 + }, + { + "auxiliary_loss_clip": 0.01301449, + "auxiliary_loss_mlp": 0.00208948, + "balance_loss_clip": 1.07121634, + "balance_loss_mlp": 0.18343765, + "epoch": 0.6663460093191041, + "flos": 20009066883840.0, + "grad_norm": 49.59402567509883, + "language_loss": 0.89077979, + "learning_rate": 1.0583337445583234e-06, + "loss": 0.90588379, + "num_input_tokens_seen": 239376665, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.25488281, + "step": 11083, + "time_per_iteration": 2.66951322555542 + }, + { + "auxiliary_loss_clip": 0.01320706, + "auxiliary_loss_mlp": 0.00208678, + "balance_loss_clip": 1.08130312, + "balance_loss_mlp": 0.18062803, + "epoch": 0.6664061325717722, + "flos": 17821389047040.0, + "grad_norm": 136.51337356621013, + "language_loss": 0.96703744, + "learning_rate": 1.057990170638731e-06, + "loss": 0.98233128, + "num_input_tokens_seen": 239394345, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.28039551, + "step": 11084, + "time_per_iteration": 2.667663097381592 + }, + { + "auxiliary_loss_clip": 0.01342587, + "auxiliary_loss_mlp": 0.0020289, + "balance_loss_clip": 1.09533024, + "balance_loss_mlp": 0.17484003, + "epoch": 0.6664662558244401, + "flos": 18076893465600.0, + "grad_norm": 6.850261241113876, + "language_loss": 0.86874938, + "learning_rate": 1.0576466324402452e-06, + "loss": 0.88420421, + "num_input_tokens_seen": 239410605, + "router_z_loss_clip": 2.47460938, + "router_z_loss_mlp": 0.28027344, + "step": 11085, + "time_per_iteration": 4.084920406341553 + }, + { + "auxiliary_loss_clip": 0.01315556, + "auxiliary_loss_mlp": 0.00189184, + "balance_loss_clip": 1.08029866, + "balance_loss_mlp": 0.16285038, + "epoch": 0.6665263790771081, + "flos": 21574197175680.0, + "grad_norm": 4.116345513658924, + "language_loss": 0.88226295, + "learning_rate": 1.057303129975894e-06, + "loss": 0.89731038, + "num_input_tokens_seen": 239427155, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.26367188, + "step": 11086, + "time_per_iteration": 2.710196018218994 + }, + { + "auxiliary_loss_clip": 0.01336157, + "auxiliary_loss_mlp": 0.00205891, + "balance_loss_clip": 1.09105086, + "balance_loss_mlp": 0.17703076, + "epoch": 0.666586502329776, + "flos": 24206629213440.0, + "grad_norm": 572.6338760507011, + "language_loss": 0.8415755, + "learning_rate": 1.056959663258702e-06, + "loss": 0.85699594, + "num_input_tokens_seen": 239445510, + "router_z_loss_clip": 2.45117188, + "router_z_loss_mlp": 0.28869629, + "step": 11087, + "time_per_iteration": 2.6710333824157715 + }, + { + "auxiliary_loss_clip": 0.01301696, + "auxiliary_loss_mlp": 0.00231223, + "balance_loss_clip": 1.072788, + "balance_loss_mlp": 0.2055814, + "epoch": 0.666646625582444, + "flos": 22200515648640.0, + "grad_norm": 21309.640937763797, + "language_loss": 0.72633266, + "learning_rate": 1.0566162323016939e-06, + "loss": 0.74166191, + "num_input_tokens_seen": 239464805, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.25634766, + "step": 11088, + "time_per_iteration": 2.6456172466278076 + }, + { + "auxiliary_loss_clip": 0.01348143, + "auxiliary_loss_mlp": 0.00209566, + "balance_loss_clip": 1.10506248, + "balance_loss_mlp": 0.18210003, + "epoch": 0.6667067488351119, + "flos": 18259930195200.0, + "grad_norm": 14.148509666635064, + "language_loss": 0.74393022, + "learning_rate": 1.0562728371178928e-06, + "loss": 0.7595073, + "num_input_tokens_seen": 239483890, + "router_z_loss_clip": 2.4296875, + "router_z_loss_mlp": 0.27441406, + "step": 11089, + "time_per_iteration": 2.619051694869995 + }, + { + "auxiliary_loss_clip": 0.01324247, + "auxiliary_loss_mlp": 0.00204829, + "balance_loss_clip": 1.09322, + "balance_loss_mlp": 0.17919894, + "epoch": 0.66676687208778, + "flos": 17236547804160.0, + "grad_norm": 5.639536938968374, + "language_loss": 0.88286352, + "learning_rate": 1.0559294777203221e-06, + "loss": 0.89815432, + "num_input_tokens_seen": 239500080, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.25622559, + "step": 11090, + "time_per_iteration": 2.5965497493743896 + }, + { + "auxiliary_loss_clip": 0.01354949, + "auxiliary_loss_mlp": 0.002151, + "balance_loss_clip": 1.10258651, + "balance_loss_mlp": 0.18522595, + "epoch": 0.6668269953404479, + "flos": 19752197748480.0, + "grad_norm": 16.49088350283455, + "language_loss": 0.87806815, + "learning_rate": 1.0555861541219984e-06, + "loss": 0.89376867, + "num_input_tokens_seen": 239517335, + "router_z_loss_clip": 2.52539062, + "router_z_loss_mlp": 0.29858398, + "step": 11091, + "time_per_iteration": 2.6317131519317627 + }, + { + "auxiliary_loss_clip": 0.01329173, + "auxiliary_loss_mlp": 0.00197128, + "balance_loss_clip": 1.08531487, + "balance_loss_mlp": 0.16706333, + "epoch": 0.6668871185931159, + "flos": 20558428467840.0, + "grad_norm": 51.896271976057314, + "language_loss": 0.89036846, + "learning_rate": 1.0552428663359425e-06, + "loss": 0.90563142, + "num_input_tokens_seen": 239536240, + "router_z_loss_clip": 2.44140625, + "router_z_loss_mlp": 0.30078125, + "step": 11092, + "time_per_iteration": 2.605680465698242 + }, + { + "auxiliary_loss_clip": 0.01259865, + "auxiliary_loss_mlp": 0.00069026, + "balance_loss_clip": 1.1029191, + "balance_loss_mlp": 0.06139618, + "epoch": 0.6669472418457839, + "flos": 58088167735680.0, + "grad_norm": 0.7386861047862212, + "language_loss": 0.57403326, + "learning_rate": 1.0548996143751724e-06, + "loss": 0.58732218, + "num_input_tokens_seen": 239598000, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.07617188, + "step": 11093, + "time_per_iteration": 3.178612470626831 + }, + { + "auxiliary_loss_clip": 0.01332201, + "auxiliary_loss_mlp": 0.00206973, + "balance_loss_clip": 1.09629369, + "balance_loss_mlp": 0.17979333, + "epoch": 0.6670073650984518, + "flos": 26065113880320.0, + "grad_norm": 32.46576738181697, + "language_loss": 0.83531493, + "learning_rate": 1.054556398252703e-06, + "loss": 0.85070664, + "num_input_tokens_seen": 239617650, + "router_z_loss_clip": 2.35546875, + "router_z_loss_mlp": 0.27172852, + "step": 11094, + "time_per_iteration": 2.7155601978302 + }, + { + "auxiliary_loss_clip": 0.01325022, + "auxiliary_loss_mlp": 0.0021249, + "balance_loss_clip": 1.08382905, + "balance_loss_mlp": 0.18360591, + "epoch": 0.6670674883511198, + "flos": 32416849635840.0, + "grad_norm": 13.045531330639157, + "language_loss": 0.82024777, + "learning_rate": 1.05421321798155e-06, + "loss": 0.83562291, + "num_input_tokens_seen": 239639825, + "router_z_loss_clip": 2.41210938, + "router_z_loss_mlp": 0.28894043, + "step": 11095, + "time_per_iteration": 2.7791244983673096 + }, + { + "auxiliary_loss_clip": 0.01320421, + "auxiliary_loss_mlp": 0.00188036, + "balance_loss_clip": 1.08649921, + "balance_loss_mlp": 0.16185766, + "epoch": 0.6671276116037878, + "flos": 18037786533120.0, + "grad_norm": 17.444688079774846, + "language_loss": 0.83250791, + "learning_rate": 1.053870073574727e-06, + "loss": 0.84759247, + "num_input_tokens_seen": 239656300, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.26147461, + "step": 11096, + "time_per_iteration": 2.639589309692383 + }, + { + "auxiliary_loss_clip": 0.01312894, + "auxiliary_loss_mlp": 0.00204797, + "balance_loss_clip": 1.0797956, + "balance_loss_mlp": 0.1795601, + "epoch": 0.6671877348564558, + "flos": 23767046570880.0, + "grad_norm": 5.704983351897757, + "language_loss": 0.72099102, + "learning_rate": 1.0535269650452456e-06, + "loss": 0.73616791, + "num_input_tokens_seen": 239676655, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.25219727, + "step": 11097, + "time_per_iteration": 2.669074058532715 + }, + { + "auxiliary_loss_clip": 0.01317717, + "auxiliary_loss_mlp": 0.00219107, + "balance_loss_clip": 1.08482218, + "balance_loss_mlp": 0.19173642, + "epoch": 0.6672478581091237, + "flos": 20918360701440.0, + "grad_norm": 8.470894060410712, + "language_loss": 0.83871341, + "learning_rate": 1.0531838924061158e-06, + "loss": 0.85408163, + "num_input_tokens_seen": 239695430, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.27368164, + "step": 11098, + "time_per_iteration": 2.6763150691986084 + }, + { + "auxiliary_loss_clip": 0.01318168, + "auxiliary_loss_mlp": 0.00214346, + "balance_loss_clip": 1.08470082, + "balance_loss_mlp": 0.18589067, + "epoch": 0.6673079813617917, + "flos": 27855799626240.0, + "grad_norm": 7.184630282611995, + "language_loss": 0.82195252, + "learning_rate": 1.0528408556703476e-06, + "loss": 0.83727765, + "num_input_tokens_seen": 239717070, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.28430176, + "step": 11099, + "time_per_iteration": 2.735762596130371 + }, + { + "auxiliary_loss_clip": 0.01312954, + "auxiliary_loss_mlp": 0.00210594, + "balance_loss_clip": 1.07816482, + "balance_loss_mlp": 0.18455821, + "epoch": 0.6673681046144596, + "flos": 21616859554560.0, + "grad_norm": 7.482101810110598, + "language_loss": 0.85700184, + "learning_rate": 1.0524978548509502e-06, + "loss": 0.87223727, + "num_input_tokens_seen": 239737105, + "router_z_loss_clip": 2.34960938, + "router_z_loss_mlp": 0.26074219, + "step": 11100, + "time_per_iteration": 2.681617259979248 + }, + { + "auxiliary_loss_clip": 0.01309243, + "auxiliary_loss_mlp": 0.00197878, + "balance_loss_clip": 1.07835829, + "balance_loss_mlp": 0.17290333, + "epoch": 0.6674282278671276, + "flos": 20889884194560.0, + "grad_norm": 25.506406302488685, + "language_loss": 0.67203647, + "learning_rate": 1.0521548899609288e-06, + "loss": 0.68710768, + "num_input_tokens_seen": 239757835, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.24987793, + "step": 11101, + "time_per_iteration": 2.69958233833313 + }, + { + "auxiliary_loss_clip": 0.01347545, + "auxiliary_loss_mlp": 0.00223254, + "balance_loss_clip": 1.09571695, + "balance_loss_mlp": 0.19333267, + "epoch": 0.6674883511197955, + "flos": 23624194181760.0, + "grad_norm": 7.003055670503995, + "language_loss": 0.82108068, + "learning_rate": 1.0518119610132884e-06, + "loss": 0.83678865, + "num_input_tokens_seen": 239775425, + "router_z_loss_clip": 2.515625, + "router_z_loss_mlp": 0.29919434, + "step": 11102, + "time_per_iteration": 2.678558826446533 + }, + { + "auxiliary_loss_clip": 0.01315001, + "auxiliary_loss_mlp": 0.00200844, + "balance_loss_clip": 1.07688498, + "balance_loss_mlp": 0.1756787, + "epoch": 0.6675484743724636, + "flos": 19609668581760.0, + "grad_norm": 2.6948460577015205, + "language_loss": 0.90081221, + "learning_rate": 1.051469068021034e-06, + "loss": 0.91597068, + "num_input_tokens_seen": 239794605, + "router_z_loss_clip": 2.3828125, + "router_z_loss_mlp": 0.25195312, + "step": 11103, + "time_per_iteration": 2.6501219272613525 + }, + { + "auxiliary_loss_clip": 0.01318945, + "auxiliary_loss_mlp": 0.00208936, + "balance_loss_clip": 1.07591355, + "balance_loss_mlp": 0.18164852, + "epoch": 0.6676085976251315, + "flos": 14319452482560.0, + "grad_norm": 2.8721578807070047, + "language_loss": 0.86369944, + "learning_rate": 1.0511262109971668e-06, + "loss": 0.87897813, + "num_input_tokens_seen": 239812135, + "router_z_loss_clip": 2.43164062, + "router_z_loss_mlp": 0.27270508, + "step": 11104, + "time_per_iteration": 2.6603171825408936 + }, + { + "auxiliary_loss_clip": 0.01325105, + "auxiliary_loss_mlp": 0.00209222, + "balance_loss_clip": 1.08781636, + "balance_loss_mlp": 0.18121944, + "epoch": 0.6676687208777995, + "flos": 38104596529920.0, + "grad_norm": 83.30280752928206, + "language_loss": 0.65825939, + "learning_rate": 1.0507833899546889e-06, + "loss": 0.6736027, + "num_input_tokens_seen": 239835845, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.28039551, + "step": 11105, + "time_per_iteration": 2.806737184524536 + }, + { + "auxiliary_loss_clip": 0.01335285, + "auxiliary_loss_mlp": 0.00203197, + "balance_loss_clip": 1.08751881, + "balance_loss_mlp": 0.1718931, + "epoch": 0.6677288441304675, + "flos": 23981576549760.0, + "grad_norm": 99.13133577267284, + "language_loss": 0.84299159, + "learning_rate": 1.0504406049066e-06, + "loss": 0.85837638, + "num_input_tokens_seen": 239853820, + "router_z_loss_clip": 2.48046875, + "router_z_loss_mlp": 0.31323242, + "step": 11106, + "time_per_iteration": 2.6882996559143066 + }, + { + "auxiliary_loss_clip": 0.01311336, + "auxiliary_loss_mlp": 0.00208662, + "balance_loss_clip": 1.07787359, + "balance_loss_mlp": 0.1815419, + "epoch": 0.6677889673831354, + "flos": 24170682677760.0, + "grad_norm": 94.49071588201153, + "language_loss": 0.84850132, + "learning_rate": 1.0500978558659e-06, + "loss": 0.86370134, + "num_input_tokens_seen": 239873365, + "router_z_loss_clip": 2.3359375, + "router_z_loss_mlp": 0.27124023, + "step": 11107, + "time_per_iteration": 2.8059704303741455 + }, + { + "auxiliary_loss_clip": 0.01314509, + "auxiliary_loss_mlp": 0.00186347, + "balance_loss_clip": 1.07883501, + "balance_loss_mlp": 0.15939346, + "epoch": 0.6678490906358034, + "flos": 22309648145280.0, + "grad_norm": 12.566571384626034, + "language_loss": 0.97566247, + "learning_rate": 1.049755142845583e-06, + "loss": 0.99067098, + "num_input_tokens_seen": 239891215, + "router_z_loss_clip": 2.35546875, + "router_z_loss_mlp": 0.26965332, + "step": 11108, + "time_per_iteration": 2.716533660888672 + }, + { + "auxiliary_loss_clip": 0.01295367, + "auxiliary_loss_mlp": 0.00199134, + "balance_loss_clip": 1.06362414, + "balance_loss_mlp": 0.1751256, + "epoch": 0.6679092138884714, + "flos": 36898752026880.0, + "grad_norm": 55.56784468847424, + "language_loss": 0.88102758, + "learning_rate": 1.049412465858646e-06, + "loss": 0.89597261, + "num_input_tokens_seen": 239913490, + "router_z_loss_clip": 2.31835938, + "router_z_loss_mlp": 0.2401123, + "step": 11109, + "time_per_iteration": 2.8695895671844482 + }, + { + "auxiliary_loss_clip": 0.01330776, + "auxiliary_loss_mlp": 0.00217463, + "balance_loss_clip": 1.0863235, + "balance_loss_mlp": 0.18764897, + "epoch": 0.6679693371411394, + "flos": 18150294908160.0, + "grad_norm": 9.017437806504473, + "language_loss": 0.79914677, + "learning_rate": 1.0490698249180847e-06, + "loss": 0.81462908, + "num_input_tokens_seen": 239931565, + "router_z_loss_clip": 2.4453125, + "router_z_loss_mlp": 0.2980957, + "step": 11110, + "time_per_iteration": 2.610011577606201 + }, + { + "auxiliary_loss_clip": 0.01336959, + "auxiliary_loss_mlp": 0.0022371, + "balance_loss_clip": 1.08530641, + "balance_loss_mlp": 0.19016463, + "epoch": 0.6680294603938073, + "flos": 27198167472000.0, + "grad_norm": 3.2974133694201213, + "language_loss": 0.82343972, + "learning_rate": 1.04872722003689e-06, + "loss": 0.83904648, + "num_input_tokens_seen": 239952395, + "router_z_loss_clip": 2.51953125, + "router_z_loss_mlp": 0.3359375, + "step": 11111, + "time_per_iteration": 2.7356085777282715 + }, + { + "auxiliary_loss_clip": 0.01313772, + "auxiliary_loss_mlp": 0.00201354, + "balance_loss_clip": 1.07758522, + "balance_loss_mlp": 0.17274374, + "epoch": 0.6680895836464753, + "flos": 21725309692800.0, + "grad_norm": 60.321112960300894, + "language_loss": 0.75017422, + "learning_rate": 1.0483846512280553e-06, + "loss": 0.76532555, + "num_input_tokens_seen": 239968910, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.28601074, + "step": 11112, + "time_per_iteration": 2.627368927001953 + }, + { + "auxiliary_loss_clip": 0.01315317, + "auxiliary_loss_mlp": 0.00231732, + "balance_loss_clip": 1.0774579, + "balance_loss_mlp": 0.20506452, + "epoch": 0.6681497068991432, + "flos": 19646477043840.0, + "grad_norm": 15.389212339500732, + "language_loss": 0.71247816, + "learning_rate": 1.048042118504569e-06, + "loss": 0.72794867, + "num_input_tokens_seen": 239987680, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.26672363, + "step": 11113, + "time_per_iteration": 2.794199228286743 + }, + { + "auxiliary_loss_clip": 0.01301784, + "auxiliary_loss_mlp": 0.00194226, + "balance_loss_clip": 1.07128799, + "balance_loss_mlp": 0.16872764, + "epoch": 0.6682098301518112, + "flos": 17419153570560.0, + "grad_norm": 836.2031743434474, + "language_loss": 0.75061989, + "learning_rate": 1.047699621879422e-06, + "loss": 0.76558, + "num_input_tokens_seen": 240005790, + "router_z_loss_clip": 2.30273438, + "router_z_loss_mlp": 0.25512695, + "step": 11114, + "time_per_iteration": 2.6205427646636963 + }, + { + "auxiliary_loss_clip": 0.01305907, + "auxiliary_loss_mlp": 0.00226428, + "balance_loss_clip": 1.07565737, + "balance_loss_mlp": 0.19948708, + "epoch": 0.6682699534044791, + "flos": 22599016110720.0, + "grad_norm": 39.257555293671246, + "language_loss": 0.8568027, + "learning_rate": 1.0473571613655998e-06, + "loss": 0.87212598, + "num_input_tokens_seen": 240025895, + "router_z_loss_clip": 2.30078125, + "router_z_loss_mlp": 0.26916504, + "step": 11115, + "time_per_iteration": 2.6771557331085205 + }, + { + "auxiliary_loss_clip": 0.01323149, + "auxiliary_loss_mlp": 0.00214554, + "balance_loss_clip": 1.07749939, + "balance_loss_mlp": 0.18482274, + "epoch": 0.6683300766571472, + "flos": 24863686750080.0, + "grad_norm": 25.94574468096145, + "language_loss": 0.87993014, + "learning_rate": 1.0470147369760896e-06, + "loss": 0.89530718, + "num_input_tokens_seen": 240044880, + "router_z_loss_clip": 2.45507812, + "router_z_loss_mlp": 0.29736328, + "step": 11116, + "time_per_iteration": 2.6875 + }, + { + "auxiliary_loss_clip": 0.01318369, + "auxiliary_loss_mlp": 0.00214739, + "balance_loss_clip": 1.07876205, + "balance_loss_mlp": 0.18705896, + "epoch": 0.6683901999098151, + "flos": 27126633536640.0, + "grad_norm": 2.1403193181815654, + "language_loss": 0.87234044, + "learning_rate": 1.0466723487238768e-06, + "loss": 0.88767159, + "num_input_tokens_seen": 240065785, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.27722168, + "step": 11117, + "time_per_iteration": 2.6962273120880127 + }, + { + "auxiliary_loss_clip": 0.01320599, + "auxiliary_loss_mlp": 0.00183809, + "balance_loss_clip": 1.07633781, + "balance_loss_mlp": 0.15529433, + "epoch": 0.6684503231624831, + "flos": 20739023072640.0, + "grad_norm": 79.70186049883748, + "language_loss": 0.72723031, + "learning_rate": 1.0463299966219441e-06, + "loss": 0.7422744, + "num_input_tokens_seen": 240085130, + "router_z_loss_clip": 2.44140625, + "router_z_loss_mlp": 0.28491211, + "step": 11118, + "time_per_iteration": 2.663332462310791 + }, + { + "auxiliary_loss_clip": 0.01308865, + "auxiliary_loss_mlp": 0.00219591, + "balance_loss_clip": 1.07405233, + "balance_loss_mlp": 0.19251896, + "epoch": 0.668510446415151, + "flos": 21762189982080.0, + "grad_norm": 5.823710943337122, + "language_loss": 0.77157974, + "learning_rate": 1.0459876806832727e-06, + "loss": 0.7868644, + "num_input_tokens_seen": 240105495, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.27075195, + "step": 11119, + "time_per_iteration": 4.132906913757324 + }, + { + "auxiliary_loss_clip": 0.01325467, + "auxiliary_loss_mlp": 0.00220301, + "balance_loss_clip": 1.084095, + "balance_loss_mlp": 0.19282301, + "epoch": 0.668570569667819, + "flos": 30191250015360.0, + "grad_norm": 5.2350465745689405, + "language_loss": 0.75270492, + "learning_rate": 1.0456454009208448e-06, + "loss": 0.76816261, + "num_input_tokens_seen": 240125455, + "router_z_loss_clip": 2.4140625, + "router_z_loss_mlp": 0.27478027, + "step": 11120, + "time_per_iteration": 4.179814100265503 + }, + { + "auxiliary_loss_clip": 0.01324784, + "auxiliary_loss_mlp": 0.00238188, + "balance_loss_clip": 1.08054221, + "balance_loss_mlp": 0.21063852, + "epoch": 0.668630692920487, + "flos": 24170646764160.0, + "grad_norm": 34.87265951134468, + "language_loss": 0.79572982, + "learning_rate": 1.045303157347638e-06, + "loss": 0.81135952, + "num_input_tokens_seen": 240143870, + "router_z_loss_clip": 2.44335938, + "router_z_loss_mlp": 0.27563477, + "step": 11121, + "time_per_iteration": 2.7327256202697754 + }, + { + "auxiliary_loss_clip": 0.01313256, + "auxiliary_loss_mlp": 0.00232732, + "balance_loss_clip": 1.07154477, + "balance_loss_mlp": 0.20590973, + "epoch": 0.668690816173155, + "flos": 17457147181440.0, + "grad_norm": 27.538550582892153, + "language_loss": 0.80934107, + "learning_rate": 1.0449609499766316e-06, + "loss": 0.82480097, + "num_input_tokens_seen": 240161020, + "router_z_loss_clip": 2.41601562, + "router_z_loss_mlp": 0.26806641, + "step": 11122, + "time_per_iteration": 2.6584713459014893 + }, + { + "auxiliary_loss_clip": 0.01311803, + "auxiliary_loss_mlp": 0.00213238, + "balance_loss_clip": 1.07400084, + "balance_loss_mlp": 0.18638057, + "epoch": 0.668750939425823, + "flos": 25005102595200.0, + "grad_norm": 16.93209366373838, + "language_loss": 0.78807271, + "learning_rate": 1.0446187788208015e-06, + "loss": 0.80332315, + "num_input_tokens_seen": 240179820, + "router_z_loss_clip": 2.37695312, + "router_z_loss_mlp": 0.26867676, + "step": 11123, + "time_per_iteration": 4.1461341381073 + }, + { + "auxiliary_loss_clip": 0.01322354, + "auxiliary_loss_mlp": 0.00232309, + "balance_loss_clip": 1.08337045, + "balance_loss_mlp": 0.20511785, + "epoch": 0.6688110626784909, + "flos": 24096778444800.0, + "grad_norm": 15.058782835814498, + "language_loss": 0.88433719, + "learning_rate": 1.0442766438931244e-06, + "loss": 0.89988387, + "num_input_tokens_seen": 240200130, + "router_z_loss_clip": 2.390625, + "router_z_loss_mlp": 0.27185059, + "step": 11124, + "time_per_iteration": 2.702263116836548 + }, + { + "auxiliary_loss_clip": 0.0132587, + "auxiliary_loss_mlp": 0.00208562, + "balance_loss_clip": 1.08840537, + "balance_loss_mlp": 0.18302739, + "epoch": 0.6688711859311589, + "flos": 21759532375680.0, + "grad_norm": 4.686264789007286, + "language_loss": 0.80359197, + "learning_rate": 1.0439345452065716e-06, + "loss": 0.81893629, + "num_input_tokens_seen": 240217945, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.25537109, + "step": 11125, + "time_per_iteration": 2.664653778076172 + }, + { + "auxiliary_loss_clip": 0.01310936, + "auxiliary_loss_mlp": 0.00221057, + "balance_loss_clip": 1.07628167, + "balance_loss_mlp": 0.19307905, + "epoch": 0.6689313091838268, + "flos": 22929645824640.0, + "grad_norm": 11.897673305683146, + "language_loss": 0.76183593, + "learning_rate": 1.043592482774116e-06, + "loss": 0.77715582, + "num_input_tokens_seen": 240237220, + "router_z_loss_clip": 2.34960938, + "router_z_loss_mlp": 0.27966309, + "step": 11126, + "time_per_iteration": 2.6937265396118164 + }, + { + "auxiliary_loss_clip": 0.01302463, + "auxiliary_loss_mlp": 0.00227183, + "balance_loss_clip": 1.06960869, + "balance_loss_mlp": 0.20045656, + "epoch": 0.6689914324364948, + "flos": 20886149180160.0, + "grad_norm": 3.9871094588706377, + "language_loss": 0.78618526, + "learning_rate": 1.0432504566087305e-06, + "loss": 0.80148172, + "num_input_tokens_seen": 240256000, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.26721191, + "step": 11127, + "time_per_iteration": 4.171586990356445 + }, + { + "auxiliary_loss_clip": 0.01351041, + "auxiliary_loss_mlp": 0.00223555, + "balance_loss_clip": 1.09441113, + "balance_loss_mlp": 0.19380051, + "epoch": 0.6690515556891627, + "flos": 22748225207040.0, + "grad_norm": 29370.680498656297, + "language_loss": 0.90261739, + "learning_rate": 1.0429084667233827e-06, + "loss": 0.91836333, + "num_input_tokens_seen": 240275845, + "router_z_loss_clip": 2.56445312, + "router_z_loss_mlp": 0.29785156, + "step": 11128, + "time_per_iteration": 2.7912137508392334 + }, + { + "auxiliary_loss_clip": 0.01322946, + "auxiliary_loss_mlp": 0.00226613, + "balance_loss_clip": 1.08219504, + "balance_loss_mlp": 0.19991052, + "epoch": 0.6691116789418308, + "flos": 23331450337920.0, + "grad_norm": 59.63433753567568, + "language_loss": 0.87685573, + "learning_rate": 1.0425665131310427e-06, + "loss": 0.89235133, + "num_input_tokens_seen": 240294095, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.26708984, + "step": 11129, + "time_per_iteration": 2.6986677646636963 + }, + { + "auxiliary_loss_clip": 0.01299244, + "auxiliary_loss_mlp": 0.00223407, + "balance_loss_clip": 1.0658927, + "balance_loss_mlp": 0.19601291, + "epoch": 0.6691718021944987, + "flos": 32447014081920.0, + "grad_norm": 70.74143421719654, + "language_loss": 0.77160859, + "learning_rate": 1.0422245958446762e-06, + "loss": 0.78683507, + "num_input_tokens_seen": 240313460, + "router_z_loss_clip": 2.3359375, + "router_z_loss_mlp": 0.27416992, + "step": 11130, + "time_per_iteration": 2.7375078201293945 + }, + { + "auxiliary_loss_clip": 0.01304307, + "auxiliary_loss_mlp": 0.00203511, + "balance_loss_clip": 1.07301176, + "balance_loss_mlp": 0.17958587, + "epoch": 0.6692319254471667, + "flos": 23731602825600.0, + "grad_norm": 303.49438202720097, + "language_loss": 0.77707672, + "learning_rate": 1.0418827148772486e-06, + "loss": 0.79215491, + "num_input_tokens_seen": 240333540, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.23925781, + "step": 11131, + "time_per_iteration": 2.692816734313965 + }, + { + "auxiliary_loss_clip": 0.0132581, + "auxiliary_loss_mlp": 0.00212334, + "balance_loss_clip": 1.08032918, + "balance_loss_mlp": 0.18431999, + "epoch": 0.6692920486998346, + "flos": 14427902620800.0, + "grad_norm": 89.21277387851875, + "language_loss": 0.81553006, + "learning_rate": 1.0415408702417243e-06, + "loss": 0.83091152, + "num_input_tokens_seen": 240350085, + "router_z_loss_clip": 2.453125, + "router_z_loss_mlp": 0.2800293, + "step": 11132, + "time_per_iteration": 2.6575253009796143 + }, + { + "auxiliary_loss_clip": 0.0132818, + "auxiliary_loss_mlp": 0.00217392, + "balance_loss_clip": 1.08811736, + "balance_loss_mlp": 0.18955675, + "epoch": 0.6693521719525026, + "flos": 21507475662720.0, + "grad_norm": 39.51931179937368, + "language_loss": 0.84746683, + "learning_rate": 1.0411990619510661e-06, + "loss": 0.86292255, + "num_input_tokens_seen": 240370015, + "router_z_loss_clip": 2.3984375, + "router_z_loss_mlp": 0.2779541, + "step": 11133, + "time_per_iteration": 2.740431785583496 + }, + { + "auxiliary_loss_clip": 0.01339219, + "auxiliary_loss_mlp": 0.00232521, + "balance_loss_clip": 1.08943141, + "balance_loss_mlp": 0.2047694, + "epoch": 0.6694122952051706, + "flos": 25406943022080.0, + "grad_norm": 15.087878586981141, + "language_loss": 0.75975776, + "learning_rate": 1.0408572900182363e-06, + "loss": 0.77547514, + "num_input_tokens_seen": 240390770, + "router_z_loss_clip": 2.5, + "router_z_loss_mlp": 0.27734375, + "step": 11134, + "time_per_iteration": 2.6746294498443604 + }, + { + "auxiliary_loss_clip": 0.0133751, + "auxiliary_loss_mlp": 0.00235386, + "balance_loss_clip": 1.09002805, + "balance_loss_mlp": 0.20701423, + "epoch": 0.6694724184578386, + "flos": 25661729168640.0, + "grad_norm": 15.50452115507411, + "language_loss": 0.87496895, + "learning_rate": 1.0405155544561943e-06, + "loss": 0.89069796, + "num_input_tokens_seen": 240409590, + "router_z_loss_clip": 2.47460938, + "router_z_loss_mlp": 0.28381348, + "step": 11135, + "time_per_iteration": 2.6890697479248047 + }, + { + "auxiliary_loss_clip": 0.01299115, + "auxiliary_loss_mlp": 0.00213938, + "balance_loss_clip": 1.06960583, + "balance_loss_mlp": 0.18753341, + "epoch": 0.6695325417105066, + "flos": 17709311635200.0, + "grad_norm": 10.35013586630069, + "language_loss": 0.80971396, + "learning_rate": 1.040173855277898e-06, + "loss": 0.82484448, + "num_input_tokens_seen": 240428180, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.26416016, + "step": 11136, + "time_per_iteration": 2.614917516708374 + }, + { + "auxiliary_loss_clip": 0.01308536, + "auxiliary_loss_mlp": 0.0020966, + "balance_loss_clip": 1.06853032, + "balance_loss_mlp": 0.18205138, + "epoch": 0.6695926649631745, + "flos": 24460050643200.0, + "grad_norm": 3.3721020561873285, + "language_loss": 0.73050344, + "learning_rate": 1.0398321924963061e-06, + "loss": 0.74568546, + "num_input_tokens_seen": 240447815, + "router_z_loss_clip": 2.40039062, + "router_z_loss_mlp": 0.27612305, + "step": 11137, + "time_per_iteration": 2.6895534992218018 + }, + { + "auxiliary_loss_clip": 0.01318249, + "auxiliary_loss_mlp": 0.00221247, + "balance_loss_clip": 1.08054447, + "balance_loss_mlp": 0.19424629, + "epoch": 0.6696527882158425, + "flos": 24280138396800.0, + "grad_norm": 10.84939039972831, + "language_loss": 0.76255822, + "learning_rate": 1.0394905661243724e-06, + "loss": 0.77795315, + "num_input_tokens_seen": 240468635, + "router_z_loss_clip": 2.37695312, + "router_z_loss_mlp": 0.27026367, + "step": 11138, + "time_per_iteration": 2.650568962097168 + }, + { + "auxiliary_loss_clip": 0.01298273, + "auxiliary_loss_mlp": 0.00220836, + "balance_loss_clip": 1.06370175, + "balance_loss_mlp": 0.19408543, + "epoch": 0.6697129114685104, + "flos": 23002759958400.0, + "grad_norm": 81.75616939443583, + "language_loss": 0.80177253, + "learning_rate": 1.039148976175053e-06, + "loss": 0.81696355, + "num_input_tokens_seen": 240488550, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.26733398, + "step": 11139, + "time_per_iteration": 2.6941585540771484 + }, + { + "auxiliary_loss_clip": 0.01304608, + "auxiliary_loss_mlp": 0.00207029, + "balance_loss_clip": 1.07178128, + "balance_loss_mlp": 0.18257928, + "epoch": 0.6697730347211784, + "flos": 22638123043200.0, + "grad_norm": 86.34494938487205, + "language_loss": 0.79949015, + "learning_rate": 1.0388074226613016e-06, + "loss": 0.81460655, + "num_input_tokens_seen": 240508330, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.24462891, + "step": 11140, + "time_per_iteration": 2.6625864505767822 + }, + { + "auxiliary_loss_clip": 0.01330353, + "auxiliary_loss_mlp": 0.0023131, + "balance_loss_clip": 1.07981801, + "balance_loss_mlp": 0.20131667, + "epoch": 0.6698331579738463, + "flos": 28877242682880.0, + "grad_norm": 37.37632069177336, + "language_loss": 0.83002639, + "learning_rate": 1.0384659055960691e-06, + "loss": 0.84564304, + "num_input_tokens_seen": 240528470, + "router_z_loss_clip": 2.50976562, + "router_z_loss_mlp": 0.29980469, + "step": 11141, + "time_per_iteration": 2.7214157581329346 + }, + { + "auxiliary_loss_clip": 0.01319799, + "auxiliary_loss_mlp": 0.00222197, + "balance_loss_clip": 1.07701063, + "balance_loss_mlp": 0.19409905, + "epoch": 0.6698932812265144, + "flos": 24207096090240.0, + "grad_norm": 1.9154356018265815, + "language_loss": 0.89778113, + "learning_rate": 1.0381244249923052e-06, + "loss": 0.91320109, + "num_input_tokens_seen": 240547815, + "router_z_loss_clip": 2.4296875, + "router_z_loss_mlp": 0.28100586, + "step": 11142, + "time_per_iteration": 2.7078449726104736 + }, + { + "auxiliary_loss_clip": 0.01299201, + "auxiliary_loss_mlp": 0.00219681, + "balance_loss_clip": 1.06559968, + "balance_loss_mlp": 0.19350302, + "epoch": 0.6699534044791823, + "flos": 22090269830400.0, + "grad_norm": 4.079309301758657, + "language_loss": 0.7705518, + "learning_rate": 1.037782980862959e-06, + "loss": 0.78574067, + "num_input_tokens_seen": 240567765, + "router_z_loss_clip": 2.3359375, + "router_z_loss_mlp": 0.26196289, + "step": 11143, + "time_per_iteration": 2.6819558143615723 + }, + { + "auxiliary_loss_clip": 0.01304142, + "auxiliary_loss_mlp": 0.00219671, + "balance_loss_clip": 1.06657827, + "balance_loss_mlp": 0.19412413, + "epoch": 0.6700135277318503, + "flos": 25192377129600.0, + "grad_norm": 11.126824571937712, + "language_loss": 0.76712799, + "learning_rate": 1.0374415732209796e-06, + "loss": 0.78236616, + "num_input_tokens_seen": 240590750, + "router_z_loss_clip": 2.37695312, + "router_z_loss_mlp": 0.25561523, + "step": 11144, + "time_per_iteration": 2.7973074913024902 + }, + { + "auxiliary_loss_clip": 0.01312998, + "auxiliary_loss_mlp": 0.00251638, + "balance_loss_clip": 1.07850671, + "balance_loss_mlp": 0.2248043, + "epoch": 0.6700736509845182, + "flos": 23440187784960.0, + "grad_norm": 21.517507189600842, + "language_loss": 0.80583131, + "learning_rate": 1.0371002020793114e-06, + "loss": 0.82147765, + "num_input_tokens_seen": 240608875, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.26831055, + "step": 11145, + "time_per_iteration": 2.730835437774658 + }, + { + "auxiliary_loss_clip": 0.01327684, + "auxiliary_loss_mlp": 0.0021291, + "balance_loss_clip": 1.08515453, + "balance_loss_mlp": 0.18592066, + "epoch": 0.6701337742371862, + "flos": 24389953251840.0, + "grad_norm": 70.07832585781325, + "language_loss": 0.80427998, + "learning_rate": 1.0367588674509008e-06, + "loss": 0.81968594, + "num_input_tokens_seen": 240628565, + "router_z_loss_clip": 2.42578125, + "router_z_loss_mlp": 0.27001953, + "step": 11146, + "time_per_iteration": 2.7705130577087402 + }, + { + "auxiliary_loss_clip": 0.01307497, + "auxiliary_loss_mlp": 0.00225625, + "balance_loss_clip": 1.07075119, + "balance_loss_mlp": 0.19944689, + "epoch": 0.6701938974898543, + "flos": 14793652857600.0, + "grad_norm": 42.14412677715837, + "language_loss": 0.88102555, + "learning_rate": 1.0364175693486905e-06, + "loss": 0.89635682, + "num_input_tokens_seen": 240646325, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.26171875, + "step": 11147, + "time_per_iteration": 2.6426525115966797 + }, + { + "auxiliary_loss_clip": 0.01329189, + "auxiliary_loss_mlp": 0.00223919, + "balance_loss_clip": 1.08586931, + "balance_loss_mlp": 0.1975261, + "epoch": 0.6702540207425222, + "flos": 20154002261760.0, + "grad_norm": 50.0046244054322, + "language_loss": 0.78156543, + "learning_rate": 1.0360763077856218e-06, + "loss": 0.79709655, + "num_input_tokens_seen": 240666145, + "router_z_loss_clip": 2.43359375, + "router_z_loss_mlp": 0.26403809, + "step": 11148, + "time_per_iteration": 2.676154851913452 + }, + { + "auxiliary_loss_clip": 0.01304097, + "auxiliary_loss_mlp": 0.00222921, + "balance_loss_clip": 1.07158685, + "balance_loss_mlp": 0.19956753, + "epoch": 0.6703141439951902, + "flos": 21214157201280.0, + "grad_norm": 5.552412564141966, + "language_loss": 0.78365409, + "learning_rate": 1.035735082774636e-06, + "loss": 0.79892427, + "num_input_tokens_seen": 240685570, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.23364258, + "step": 11149, + "time_per_iteration": 2.672943115234375 + }, + { + "auxiliary_loss_clip": 0.01289192, + "auxiliary_loss_mlp": 0.00231151, + "balance_loss_clip": 1.05793822, + "balance_loss_mlp": 0.20683183, + "epoch": 0.6703742672478581, + "flos": 23112538899840.0, + "grad_norm": 2.8394804695397107, + "language_loss": 0.82109839, + "learning_rate": 1.0353938943286727e-06, + "loss": 0.8363018, + "num_input_tokens_seen": 240706945, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.24316406, + "step": 11150, + "time_per_iteration": 2.763502836227417 + }, + { + "auxiliary_loss_clip": 0.01327913, + "auxiliary_loss_mlp": 0.00221915, + "balance_loss_clip": 1.08658934, + "balance_loss_mlp": 0.19583249, + "epoch": 0.6704343905005261, + "flos": 22528918719360.0, + "grad_norm": 18.442359967408713, + "language_loss": 0.86991942, + "learning_rate": 1.035052742460671e-06, + "loss": 0.8854177, + "num_input_tokens_seen": 240727990, + "router_z_loss_clip": 2.4140625, + "router_z_loss_mlp": 0.26086426, + "step": 11151, + "time_per_iteration": 2.7157938480377197 + }, + { + "auxiliary_loss_clip": 0.01295024, + "auxiliary_loss_mlp": 0.00171155, + "balance_loss_clip": 1.13366318, + "balance_loss_mlp": 0.1614753, + "epoch": 0.670494513753194, + "flos": 64793158773120.0, + "grad_norm": 0.7653275710376538, + "language_loss": 0.55300856, + "learning_rate": 1.0347116271835643e-06, + "loss": 0.56767035, + "num_input_tokens_seen": 240790380, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.09667969, + "step": 11152, + "time_per_iteration": 3.2375335693359375 + }, + { + "auxiliary_loss_clip": 0.01310746, + "auxiliary_loss_mlp": 0.00229638, + "balance_loss_clip": 1.06665266, + "balance_loss_mlp": 0.20124227, + "epoch": 0.670554637005862, + "flos": 23511506238720.0, + "grad_norm": 25.932823779974033, + "language_loss": 0.90204805, + "learning_rate": 1.0343705485102896e-06, + "loss": 0.91745186, + "num_input_tokens_seen": 240811545, + "router_z_loss_clip": 2.44335938, + "router_z_loss_mlp": 0.28442383, + "step": 11153, + "time_per_iteration": 2.7074735164642334 + }, + { + "auxiliary_loss_clip": 0.01312386, + "auxiliary_loss_mlp": 0.00215123, + "balance_loss_clip": 1.0714947, + "balance_loss_mlp": 0.18794341, + "epoch": 0.67061476025853, + "flos": 19463404400640.0, + "grad_norm": 3.7851652937530322, + "language_loss": 0.82565665, + "learning_rate": 1.0340295064537814e-06, + "loss": 0.84093177, + "num_input_tokens_seen": 240831380, + "router_z_loss_clip": 2.40820312, + "router_z_loss_mlp": 0.27172852, + "step": 11154, + "time_per_iteration": 2.6449761390686035 + }, + { + "auxiliary_loss_clip": 0.01324564, + "auxiliary_loss_mlp": 0.00222543, + "balance_loss_clip": 1.08121371, + "balance_loss_mlp": 0.19707972, + "epoch": 0.670674883511198, + "flos": 20519967980160.0, + "grad_norm": 7.603343091415059, + "language_loss": 0.83139914, + "learning_rate": 1.0336885010269702e-06, + "loss": 0.84687024, + "num_input_tokens_seen": 240851855, + "router_z_loss_clip": 2.43554688, + "router_z_loss_mlp": 0.25488281, + "step": 11155, + "time_per_iteration": 2.7192580699920654 + }, + { + "auxiliary_loss_clip": 0.01300291, + "auxiliary_loss_mlp": 0.00232091, + "balance_loss_clip": 1.07050157, + "balance_loss_mlp": 0.20719999, + "epoch": 0.6707350067638659, + "flos": 25483971738240.0, + "grad_norm": 25.743831235299304, + "language_loss": 0.87111139, + "learning_rate": 1.0333475322427878e-06, + "loss": 0.88643527, + "num_input_tokens_seen": 240869980, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.24902344, + "step": 11156, + "time_per_iteration": 2.727607488632202 + }, + { + "auxiliary_loss_clip": 0.01287585, + "auxiliary_loss_mlp": 0.00224908, + "balance_loss_clip": 1.05688679, + "balance_loss_mlp": 0.19931388, + "epoch": 0.6707951300165339, + "flos": 22273450214400.0, + "grad_norm": 10.500207549851753, + "language_loss": 0.82196856, + "learning_rate": 1.033006600114165e-06, + "loss": 0.83709353, + "num_input_tokens_seen": 240888680, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.25585938, + "step": 11157, + "time_per_iteration": 2.701732635498047 + }, + { + "auxiliary_loss_clip": 0.0130142, + "auxiliary_loss_mlp": 0.0020927, + "balance_loss_clip": 1.06456459, + "balance_loss_mlp": 0.18414101, + "epoch": 0.6708552532692018, + "flos": 23984593292160.0, + "grad_norm": 7.006803858516912, + "language_loss": 0.81962526, + "learning_rate": 1.0326657046540282e-06, + "loss": 0.83473217, + "num_input_tokens_seen": 240909050, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.25158691, + "step": 11158, + "time_per_iteration": 2.6897823810577393 + }, + { + "auxiliary_loss_clip": 0.01285535, + "auxiliary_loss_mlp": 0.00220889, + "balance_loss_clip": 1.05700874, + "balance_loss_mlp": 0.19598572, + "epoch": 0.6709153765218698, + "flos": 24937519155840.0, + "grad_norm": 3.0327757934913593, + "language_loss": 0.887137, + "learning_rate": 1.0323248458753044e-06, + "loss": 0.90220124, + "num_input_tokens_seen": 240930035, + "router_z_loss_clip": 2.28320312, + "router_z_loss_mlp": 0.24914551, + "step": 11159, + "time_per_iteration": 2.908073902130127 + }, + { + "auxiliary_loss_clip": 0.01321718, + "auxiliary_loss_mlp": 0.00232846, + "balance_loss_clip": 1.07968867, + "balance_loss_mlp": 0.20626272, + "epoch": 0.6709754997745379, + "flos": 17530225401600.0, + "grad_norm": 51.89727699016217, + "language_loss": 0.8589862, + "learning_rate": 1.0319840237909193e-06, + "loss": 0.87453187, + "num_input_tokens_seen": 240948895, + "router_z_loss_clip": 2.41796875, + "router_z_loss_mlp": 0.26586914, + "step": 11160, + "time_per_iteration": 2.653062105178833 + }, + { + "auxiliary_loss_clip": 0.012906, + "auxiliary_loss_mlp": 0.00231904, + "balance_loss_clip": 1.05886316, + "balance_loss_mlp": 0.20781192, + "epoch": 0.6710356230272058, + "flos": 22090880361600.0, + "grad_norm": 7.398265330509629, + "language_loss": 0.80798954, + "learning_rate": 1.0316432384137978e-06, + "loss": 0.82321459, + "num_input_tokens_seen": 240967770, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.2409668, + "step": 11161, + "time_per_iteration": 4.079337120056152 + }, + { + "auxiliary_loss_clip": 0.01318755, + "auxiliary_loss_mlp": 0.00239053, + "balance_loss_clip": 1.07195854, + "balance_loss_mlp": 0.21313679, + "epoch": 0.6710957462798738, + "flos": 24206449645440.0, + "grad_norm": 5.170603133659091, + "language_loss": 0.77752888, + "learning_rate": 1.0313024897568618e-06, + "loss": 0.79310697, + "num_input_tokens_seen": 240988985, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.25915527, + "step": 11162, + "time_per_iteration": 4.1115453243255615 + }, + { + "auxiliary_loss_clip": 0.01281767, + "auxiliary_loss_mlp": 0.00215832, + "balance_loss_clip": 1.05368853, + "balance_loss_mlp": 0.19194284, + "epoch": 0.6711558695325417, + "flos": 19093955063040.0, + "grad_norm": 1.9691619794633932, + "language_loss": 0.7583847, + "learning_rate": 1.030961777833032e-06, + "loss": 0.77336073, + "num_input_tokens_seen": 241005455, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.23913574, + "step": 11163, + "time_per_iteration": 2.6660752296447754 + }, + { + "auxiliary_loss_clip": 0.01292569, + "auxiliary_loss_mlp": 0.00235627, + "balance_loss_clip": 1.06611109, + "balance_loss_mlp": 0.21037829, + "epoch": 0.6712159927852097, + "flos": 25557875971200.0, + "grad_norm": 96.02624115754256, + "language_loss": 0.82285243, + "learning_rate": 1.0306211026552291e-06, + "loss": 0.83813441, + "num_input_tokens_seen": 241026175, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.25244141, + "step": 11164, + "time_per_iteration": 2.727402687072754 + }, + { + "auxiliary_loss_clip": 0.01318483, + "auxiliary_loss_mlp": 0.00235922, + "balance_loss_clip": 1.07845807, + "balance_loss_mlp": 0.20833676, + "epoch": 0.6712761160378776, + "flos": 22228812587520.0, + "grad_norm": 8.542920958716882, + "language_loss": 0.77385592, + "learning_rate": 1.0302804642363704e-06, + "loss": 0.78939998, + "num_input_tokens_seen": 241044040, + "router_z_loss_clip": 2.3984375, + "router_z_loss_mlp": 0.27563477, + "step": 11165, + "time_per_iteration": 2.683198928833008 + }, + { + "auxiliary_loss_clip": 0.01305686, + "auxiliary_loss_mlp": 0.00230589, + "balance_loss_clip": 1.07147026, + "balance_loss_mlp": 0.20441058, + "epoch": 0.6713362392905456, + "flos": 22455517276800.0, + "grad_norm": 6.872094082588257, + "language_loss": 0.80635619, + "learning_rate": 1.0299398625893738e-06, + "loss": 0.82171893, + "num_input_tokens_seen": 241063615, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.26171875, + "step": 11166, + "time_per_iteration": 4.1220855712890625 + }, + { + "auxiliary_loss_clip": 0.01301295, + "auxiliary_loss_mlp": 0.00202661, + "balance_loss_clip": 1.07058787, + "balance_loss_mlp": 0.1801185, + "epoch": 0.6713963625432136, + "flos": 25630200005760.0, + "grad_norm": 32.212513337903594, + "language_loss": 0.86242837, + "learning_rate": 1.0295992977271546e-06, + "loss": 0.87746793, + "num_input_tokens_seen": 241082520, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.22546387, + "step": 11167, + "time_per_iteration": 2.752408981323242 + }, + { + "auxiliary_loss_clip": 0.01301529, + "auxiliary_loss_mlp": 0.00242807, + "balance_loss_clip": 1.06565118, + "balance_loss_mlp": 0.21810687, + "epoch": 0.6714564857958816, + "flos": 35006475640320.0, + "grad_norm": 80.8812172122069, + "language_loss": 0.77141017, + "learning_rate": 1.029258769662629e-06, + "loss": 0.78685355, + "num_input_tokens_seen": 241103505, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.24694824, + "step": 11168, + "time_per_iteration": 2.834103584289551 + }, + { + "auxiliary_loss_clip": 0.01312613, + "auxiliary_loss_mlp": 0.00215109, + "balance_loss_clip": 1.07785714, + "balance_loss_mlp": 0.18986005, + "epoch": 0.6715166090485495, + "flos": 26279931168000.0, + "grad_norm": 29.58170497517036, + "language_loss": 0.82572567, + "learning_rate": 1.0289182784087068e-06, + "loss": 0.84100294, + "num_input_tokens_seen": 241122885, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.25231934, + "step": 11169, + "time_per_iteration": 2.7379508018493652 + }, + { + "auxiliary_loss_clip": 0.01304174, + "auxiliary_loss_mlp": 0.00256016, + "balance_loss_clip": 1.07173753, + "balance_loss_mlp": 0.23062465, + "epoch": 0.6715767323012175, + "flos": 15924156583680.0, + "grad_norm": 57.717844886732344, + "language_loss": 0.83811641, + "learning_rate": 1.0285778239783005e-06, + "loss": 0.85371828, + "num_input_tokens_seen": 241140865, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.25402832, + "step": 11170, + "time_per_iteration": 4.177717924118042 + }, + { + "auxiliary_loss_clip": 0.01308827, + "auxiliary_loss_mlp": 0.00252284, + "balance_loss_clip": 1.07298625, + "balance_loss_mlp": 0.22664186, + "epoch": 0.6716368555538854, + "flos": 17491441691520.0, + "grad_norm": 733.7462503720692, + "language_loss": 0.84388578, + "learning_rate": 1.0282374063843212e-06, + "loss": 0.85949689, + "num_input_tokens_seen": 241158225, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.25646973, + "step": 11171, + "time_per_iteration": 2.732903003692627 + }, + { + "auxiliary_loss_clip": 0.01306764, + "auxiliary_loss_mlp": 0.00238739, + "balance_loss_clip": 1.06903815, + "balance_loss_mlp": 0.21153502, + "epoch": 0.6716969788065534, + "flos": 16761521416320.0, + "grad_norm": 6.772319733547146, + "language_loss": 0.9219479, + "learning_rate": 1.0278970256396762e-06, + "loss": 0.9374029, + "num_input_tokens_seen": 241175215, + "router_z_loss_clip": 2.37695312, + "router_z_loss_mlp": 0.27185059, + "step": 11172, + "time_per_iteration": 2.639415979385376 + }, + { + "auxiliary_loss_clip": 0.01290413, + "auxiliary_loss_mlp": 0.00226622, + "balance_loss_clip": 1.05911851, + "balance_loss_mlp": 0.20107546, + "epoch": 0.6717571020592215, + "flos": 22709800632960.0, + "grad_norm": 31.01093720871742, + "language_loss": 0.71029425, + "learning_rate": 1.0275566817572733e-06, + "loss": 0.72546458, + "num_input_tokens_seen": 241195250, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.25537109, + "step": 11173, + "time_per_iteration": 2.698124885559082 + }, + { + "auxiliary_loss_clip": 0.01355391, + "auxiliary_loss_mlp": 0.00241722, + "balance_loss_clip": 1.09554827, + "balance_loss_mlp": 0.21227738, + "epoch": 0.6718172253118894, + "flos": 18734094656640.0, + "grad_norm": 2441.0965571876413, + "language_loss": 0.8459897, + "learning_rate": 1.02721637475002e-06, + "loss": 0.86196077, + "num_input_tokens_seen": 241210720, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.29431152, + "step": 11174, + "time_per_iteration": 2.641655206680298 + }, + { + "auxiliary_loss_clip": 0.01284579, + "auxiliary_loss_mlp": 0.00240028, + "balance_loss_clip": 1.0619334, + "balance_loss_mlp": 0.21728238, + "epoch": 0.6718773485645574, + "flos": 15632526061440.0, + "grad_norm": 107.69167646268538, + "language_loss": 0.78150815, + "learning_rate": 1.0268761046308178e-06, + "loss": 0.79675424, + "num_input_tokens_seen": 241227395, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.22741699, + "step": 11175, + "time_per_iteration": 2.638228178024292 + }, + { + "auxiliary_loss_clip": 0.01308585, + "auxiliary_loss_mlp": 0.00243061, + "balance_loss_clip": 1.07635081, + "balance_loss_mlp": 0.21908842, + "epoch": 0.6719374718172253, + "flos": 19354774694400.0, + "grad_norm": 134.6125037434017, + "language_loss": 0.81516433, + "learning_rate": 1.0265358714125714e-06, + "loss": 0.83068079, + "num_input_tokens_seen": 241246355, + "router_z_loss_clip": 2.32226562, + "router_z_loss_mlp": 0.23999023, + "step": 11176, + "time_per_iteration": 2.642571210861206 + }, + { + "auxiliary_loss_clip": 0.01329842, + "auxiliary_loss_mlp": 0.00227503, + "balance_loss_clip": 1.08468199, + "balance_loss_mlp": 0.20031121, + "epoch": 0.6719975950698933, + "flos": 21981316901760.0, + "grad_norm": 895.2823886853358, + "language_loss": 0.81995833, + "learning_rate": 1.026195675108182e-06, + "loss": 0.83553183, + "num_input_tokens_seen": 241264180, + "router_z_loss_clip": 2.453125, + "router_z_loss_mlp": 0.27185059, + "step": 11177, + "time_per_iteration": 2.6667635440826416 + }, + { + "auxiliary_loss_clip": 0.0133293, + "auxiliary_loss_mlp": 0.00259906, + "balance_loss_clip": 1.08594823, + "balance_loss_mlp": 0.23310791, + "epoch": 0.6720577183225612, + "flos": 25228072270080.0, + "grad_norm": 20.080070173550794, + "language_loss": 0.84501922, + "learning_rate": 1.025855515730551e-06, + "loss": 0.86094755, + "num_input_tokens_seen": 241282245, + "router_z_loss_clip": 2.47070312, + "router_z_loss_mlp": 0.2677002, + "step": 11178, + "time_per_iteration": 2.678718328475952 + }, + { + "auxiliary_loss_clip": 0.01332226, + "auxiliary_loss_mlp": 0.00258423, + "balance_loss_clip": 1.09080184, + "balance_loss_mlp": 0.23138602, + "epoch": 0.6721178415752292, + "flos": 16945886949120.0, + "grad_norm": 149.11966519640944, + "language_loss": 0.804452, + "learning_rate": 1.0255153932925766e-06, + "loss": 0.82035851, + "num_input_tokens_seen": 241300745, + "router_z_loss_clip": 2.40820312, + "router_z_loss_mlp": 0.27038574, + "step": 11179, + "time_per_iteration": 2.6362802982330322 + }, + { + "auxiliary_loss_clip": 0.01304602, + "auxiliary_loss_mlp": 0.00233707, + "balance_loss_clip": 1.0784744, + "balance_loss_mlp": 0.209126, + "epoch": 0.6721779648278972, + "flos": 21541375123200.0, + "grad_norm": 7.3295992695664145, + "language_loss": 0.81544036, + "learning_rate": 1.0251753078071557e-06, + "loss": 0.83082342, + "num_input_tokens_seen": 241319320, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.24560547, + "step": 11180, + "time_per_iteration": 2.633653163909912 + }, + { + "auxiliary_loss_clip": 0.01295047, + "auxiliary_loss_mlp": 0.00233745, + "balance_loss_clip": 1.06583261, + "balance_loss_mlp": 0.20921159, + "epoch": 0.6722380880805652, + "flos": 22605444645120.0, + "grad_norm": 12.187054888855709, + "language_loss": 0.82421541, + "learning_rate": 1.0248352592871848e-06, + "loss": 0.83950329, + "num_input_tokens_seen": 241342225, + "router_z_loss_clip": 2.29101562, + "router_z_loss_mlp": 0.24536133, + "step": 11181, + "time_per_iteration": 2.7172436714172363 + }, + { + "auxiliary_loss_clip": 0.01297595, + "auxiliary_loss_mlp": 0.00239285, + "balance_loss_clip": 1.06309187, + "balance_loss_mlp": 0.2151688, + "epoch": 0.6722982113332331, + "flos": 15925269905280.0, + "grad_norm": 46.70879263312581, + "language_loss": 0.84384191, + "learning_rate": 1.0244952477455585e-06, + "loss": 0.85921079, + "num_input_tokens_seen": 241358240, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.24133301, + "step": 11182, + "time_per_iteration": 2.6725025177001953 + }, + { + "auxiliary_loss_clip": 0.01298019, + "auxiliary_loss_mlp": 0.00242355, + "balance_loss_clip": 1.06790876, + "balance_loss_mlp": 0.2186321, + "epoch": 0.6723583345859011, + "flos": 20596170683520.0, + "grad_norm": 10.374757353583368, + "language_loss": 0.76454103, + "learning_rate": 1.0241552731951699e-06, + "loss": 0.77994478, + "num_input_tokens_seen": 241378420, + "router_z_loss_clip": 2.30078125, + "router_z_loss_mlp": 0.23718262, + "step": 11183, + "time_per_iteration": 2.75481915473938 + }, + { + "auxiliary_loss_clip": 0.01299911, + "auxiliary_loss_mlp": 0.00229237, + "balance_loss_clip": 1.06682992, + "balance_loss_mlp": 0.20423898, + "epoch": 0.672418457838569, + "flos": 21725848396800.0, + "grad_norm": 37.87933611582405, + "language_loss": 0.86010969, + "learning_rate": 1.0238153356489112e-06, + "loss": 0.87540114, + "num_input_tokens_seen": 241397185, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.25024414, + "step": 11184, + "time_per_iteration": 2.6840507984161377 + }, + { + "auxiliary_loss_clip": 0.01329063, + "auxiliary_loss_mlp": 0.0022966, + "balance_loss_clip": 1.09002566, + "balance_loss_mlp": 0.20032257, + "epoch": 0.672478581091237, + "flos": 21470379891840.0, + "grad_norm": 27.24076439100641, + "language_loss": 0.77170384, + "learning_rate": 1.0234754351196743e-06, + "loss": 0.78729099, + "num_input_tokens_seen": 241415785, + "router_z_loss_clip": 2.39257812, + "router_z_loss_mlp": 0.2935791, + "step": 11185, + "time_per_iteration": 2.6839869022369385 + }, + { + "auxiliary_loss_clip": 0.01312333, + "auxiliary_loss_mlp": 0.00232469, + "balance_loss_clip": 1.0757035, + "balance_loss_mlp": 0.20768562, + "epoch": 0.6725387043439051, + "flos": 30846763267200.0, + "grad_norm": 2.336651280486047, + "language_loss": 0.88213193, + "learning_rate": 1.023135571620345e-06, + "loss": 0.89758003, + "num_input_tokens_seen": 241437390, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.24780273, + "step": 11186, + "time_per_iteration": 2.776339292526245 + }, + { + "auxiliary_loss_clip": 0.01290655, + "auxiliary_loss_mlp": 0.00222937, + "balance_loss_clip": 1.06559384, + "balance_loss_mlp": 0.1997866, + "epoch": 0.672598827596573, + "flos": 24055947659520.0, + "grad_norm": 29.03659291746407, + "language_loss": 0.86465454, + "learning_rate": 1.022795745163813e-06, + "loss": 0.87979043, + "num_input_tokens_seen": 241458085, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.23144531, + "step": 11187, + "time_per_iteration": 2.6979331970214844 + }, + { + "auxiliary_loss_clip": 0.0131591, + "auxiliary_loss_mlp": 0.00247805, + "balance_loss_clip": 1.07596147, + "balance_loss_mlp": 0.22100694, + "epoch": 0.672658950849241, + "flos": 21871861182720.0, + "grad_norm": 11.116828611075734, + "language_loss": 0.79061115, + "learning_rate": 1.022455955762965e-06, + "loss": 0.80624837, + "num_input_tokens_seen": 241476880, + "router_z_loss_clip": 2.3984375, + "router_z_loss_mlp": 0.26806641, + "step": 11188, + "time_per_iteration": 2.7001514434814453 + }, + { + "auxiliary_loss_clip": 0.01298323, + "auxiliary_loss_mlp": 0.00232267, + "balance_loss_clip": 1.06988394, + "balance_loss_mlp": 0.20831782, + "epoch": 0.6727190741019089, + "flos": 23222102359680.0, + "grad_norm": 73.11593431435512, + "language_loss": 0.83189559, + "learning_rate": 1.0221162034306842e-06, + "loss": 0.84720147, + "num_input_tokens_seen": 241496535, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.23950195, + "step": 11189, + "time_per_iteration": 2.730586528778076 + }, + { + "auxiliary_loss_clip": 0.01325971, + "auxiliary_loss_mlp": 0.00220794, + "balance_loss_clip": 1.08023143, + "balance_loss_mlp": 0.1933282, + "epoch": 0.6727791973545769, + "flos": 15778610674560.0, + "grad_norm": 3.158940472573889, + "language_loss": 0.85707736, + "learning_rate": 1.0217764881798562e-06, + "loss": 0.872545, + "num_input_tokens_seen": 241513465, + "router_z_loss_clip": 2.45898438, + "router_z_loss_mlp": 0.27453613, + "step": 11190, + "time_per_iteration": 2.6533353328704834 + }, + { + "auxiliary_loss_clip": 0.0131784, + "auxiliary_loss_mlp": 0.00243074, + "balance_loss_clip": 1.07440543, + "balance_loss_mlp": 0.21446377, + "epoch": 0.6728393206072448, + "flos": 21249852341760.0, + "grad_norm": 6.975194801086423, + "language_loss": 0.84600842, + "learning_rate": 1.0214368100233612e-06, + "loss": 0.86161757, + "num_input_tokens_seen": 241534125, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.28637695, + "step": 11191, + "time_per_iteration": 2.6581954956054688 + }, + { + "auxiliary_loss_clip": 0.01299849, + "auxiliary_loss_mlp": 0.00208175, + "balance_loss_clip": 1.06998098, + "balance_loss_mlp": 0.18419032, + "epoch": 0.6728994438599128, + "flos": 32123279779200.0, + "grad_norm": 2.6161481655427306, + "language_loss": 0.91869473, + "learning_rate": 1.0210971689740802e-06, + "loss": 0.93377507, + "num_input_tokens_seen": 241556340, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.23962402, + "step": 11192, + "time_per_iteration": 2.769777297973633 + }, + { + "auxiliary_loss_clip": 0.01320469, + "auxiliary_loss_mlp": 0.00238929, + "balance_loss_clip": 1.08372831, + "balance_loss_mlp": 0.21246472, + "epoch": 0.6729595671125808, + "flos": 23112359331840.0, + "grad_norm": 73.67056136592683, + "language_loss": 0.83245581, + "learning_rate": 1.0207575650448923e-06, + "loss": 0.84804976, + "num_input_tokens_seen": 241575185, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.2644043, + "step": 11193, + "time_per_iteration": 2.6710476875305176 + }, + { + "auxiliary_loss_clip": 0.01285587, + "auxiliary_loss_mlp": 0.00213632, + "balance_loss_clip": 1.05345035, + "balance_loss_mlp": 0.18734622, + "epoch": 0.6730196903652488, + "flos": 14611406227200.0, + "grad_norm": 323.7458337071312, + "language_loss": 0.8733561, + "learning_rate": 1.0204179982486758e-06, + "loss": 0.88834834, + "num_input_tokens_seen": 241592970, + "router_z_loss_clip": 2.32226562, + "router_z_loss_mlp": 0.26306152, + "step": 11194, + "time_per_iteration": 2.704383373260498 + }, + { + "auxiliary_loss_clip": 0.01306125, + "auxiliary_loss_mlp": 0.00234157, + "balance_loss_clip": 1.06479001, + "balance_loss_mlp": 0.20794335, + "epoch": 0.6730798136179167, + "flos": 21105922544640.0, + "grad_norm": 132.2576801637099, + "language_loss": 0.99689412, + "learning_rate": 1.0200784685983075e-06, + "loss": 1.01229692, + "num_input_tokens_seen": 241610245, + "router_z_loss_clip": 2.41015625, + "router_z_loss_mlp": 0.26220703, + "step": 11195, + "time_per_iteration": 2.6532254219055176 + }, + { + "auxiliary_loss_clip": 0.01291221, + "auxiliary_loss_mlp": 0.00204549, + "balance_loss_clip": 1.06333947, + "balance_loss_mlp": 0.17903787, + "epoch": 0.6731399368705847, + "flos": 28986267438720.0, + "grad_norm": 5.0177471510128315, + "language_loss": 0.79570413, + "learning_rate": 1.019738976106662e-06, + "loss": 0.81066191, + "num_input_tokens_seen": 241630350, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.25488281, + "step": 11196, + "time_per_iteration": 2.724848747253418 + }, + { + "auxiliary_loss_clip": 0.01285756, + "auxiliary_loss_mlp": 0.0014176, + "balance_loss_clip": 1.12766671, + "balance_loss_mlp": 0.13222323, + "epoch": 0.6732000601232526, + "flos": 64743708723840.0, + "grad_norm": 0.7640002041131015, + "language_loss": 0.55892313, + "learning_rate": 1.0193995207866123e-06, + "loss": 0.57319832, + "num_input_tokens_seen": 241692380, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.09521484, + "step": 11197, + "time_per_iteration": 3.1000730991363525 + }, + { + "auxiliary_loss_clip": 0.01296749, + "auxiliary_loss_mlp": 0.00219859, + "balance_loss_clip": 1.06849587, + "balance_loss_mlp": 0.19538517, + "epoch": 0.6732601833759206, + "flos": 17201642762880.0, + "grad_norm": 6.236577681339241, + "language_loss": 0.84329802, + "learning_rate": 1.0190601026510312e-06, + "loss": 0.85846412, + "num_input_tokens_seen": 241710430, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.24499512, + "step": 11198, + "time_per_iteration": 2.6077024936676025 + }, + { + "auxiliary_loss_clip": 0.01293736, + "auxiliary_loss_mlp": 0.00237604, + "balance_loss_clip": 1.06176114, + "balance_loss_mlp": 0.20989954, + "epoch": 0.6733203066285887, + "flos": 18658861620480.0, + "grad_norm": 19.091696384327854, + "language_loss": 0.8945744, + "learning_rate": 1.0187207217127892e-06, + "loss": 0.90988779, + "num_input_tokens_seen": 241724775, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.27685547, + "step": 11199, + "time_per_iteration": 2.636986494064331 + }, + { + "auxiliary_loss_clip": 0.01300395, + "auxiliary_loss_mlp": 0.00245473, + "balance_loss_clip": 1.06099725, + "balance_loss_mlp": 0.21717232, + "epoch": 0.6733804298812566, + "flos": 35809330481280.0, + "grad_norm": 52.92740060880868, + "language_loss": 0.77838266, + "learning_rate": 1.0183813779847552e-06, + "loss": 0.79384136, + "num_input_tokens_seen": 241744440, + "router_z_loss_clip": 2.39453125, + "router_z_loss_mlp": 0.28295898, + "step": 11200, + "time_per_iteration": 2.7594549655914307 + }, + { + "auxiliary_loss_clip": 0.01301699, + "auxiliary_loss_mlp": 0.00231539, + "balance_loss_clip": 1.07031977, + "balance_loss_mlp": 0.20648101, + "epoch": 0.6734405531339246, + "flos": 61638833099520.0, + "grad_norm": 93.03523221876395, + "language_loss": 0.70245671, + "learning_rate": 1.0180420714797987e-06, + "loss": 0.71778911, + "num_input_tokens_seen": 241771705, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.25036621, + "step": 11201, + "time_per_iteration": 3.0999417304992676 + }, + { + "auxiliary_loss_clip": 0.01300747, + "auxiliary_loss_mlp": 0.00232774, + "balance_loss_clip": 1.06679022, + "balance_loss_mlp": 0.20647672, + "epoch": 0.6735006763865925, + "flos": 20522338277760.0, + "grad_norm": 5.335154939501229, + "language_loss": 0.72587299, + "learning_rate": 1.0177028022107856e-06, + "loss": 0.7412082, + "num_input_tokens_seen": 241790830, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.26269531, + "step": 11202, + "time_per_iteration": 2.7085816860198975 + }, + { + "auxiliary_loss_clip": 0.01290599, + "auxiliary_loss_mlp": 0.00225402, + "balance_loss_clip": 1.05947876, + "balance_loss_mlp": 0.1995215, + "epoch": 0.6735607996392605, + "flos": 13918869031680.0, + "grad_norm": 53.21245041917134, + "language_loss": 0.8348155, + "learning_rate": 1.0173635701905796e-06, + "loss": 0.84997547, + "num_input_tokens_seen": 241808165, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.25878906, + "step": 11203, + "time_per_iteration": 4.047514200210571 + }, + { + "auxiliary_loss_clip": 0.01324868, + "auxiliary_loss_mlp": 0.00244827, + "balance_loss_clip": 1.07979214, + "balance_loss_mlp": 0.21669312, + "epoch": 0.6736209228919284, + "flos": 18807244704000.0, + "grad_norm": 282.89564674482324, + "language_loss": 0.76716226, + "learning_rate": 1.0170243754320456e-06, + "loss": 0.78285921, + "num_input_tokens_seen": 241826925, + "router_z_loss_clip": 2.45117188, + "router_z_loss_mlp": 0.28112793, + "step": 11204, + "time_per_iteration": 4.178461074829102 + }, + { + "auxiliary_loss_clip": 0.01315272, + "auxiliary_loss_mlp": 0.00225324, + "balance_loss_clip": 1.07457805, + "balance_loss_mlp": 0.19763178, + "epoch": 0.6736810461445965, + "flos": 20373129181440.0, + "grad_norm": 190.32500804307804, + "language_loss": 0.81085718, + "learning_rate": 1.0166852179480465e-06, + "loss": 0.82626307, + "num_input_tokens_seen": 241845525, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.27697754, + "step": 11205, + "time_per_iteration": 2.723565101623535 + }, + { + "auxiliary_loss_clip": 0.01299501, + "auxiliary_loss_mlp": 0.00216269, + "balance_loss_clip": 1.06566763, + "balance_loss_mlp": 0.18999511, + "epoch": 0.6737411693972644, + "flos": 30007530927360.0, + "grad_norm": 15.619446346773517, + "language_loss": 0.80414432, + "learning_rate": 1.0163460977514416e-06, + "loss": 0.81930196, + "num_input_tokens_seen": 241866815, + "router_z_loss_clip": 2.33789062, + "router_z_loss_mlp": 0.26269531, + "step": 11206, + "time_per_iteration": 2.7050068378448486 + }, + { + "auxiliary_loss_clip": 0.01333846, + "auxiliary_loss_mlp": 0.00239511, + "balance_loss_clip": 1.08939242, + "balance_loss_mlp": 0.2116634, + "epoch": 0.6738012926499324, + "flos": 25447342844160.0, + "grad_norm": 8.978462294900178, + "language_loss": 0.78417063, + "learning_rate": 1.016007014855092e-06, + "loss": 0.79990417, + "num_input_tokens_seen": 241887050, + "router_z_loss_clip": 2.44140625, + "router_z_loss_mlp": 0.27880859, + "step": 11207, + "time_per_iteration": 2.7164790630340576 + }, + { + "auxiliary_loss_clip": 0.01277901, + "auxiliary_loss_mlp": 0.00207694, + "balance_loss_clip": 1.05443978, + "balance_loss_mlp": 0.18442425, + "epoch": 0.6738614159026003, + "flos": 20776873029120.0, + "grad_norm": 26.87810196210832, + "language_loss": 0.82907987, + "learning_rate": 1.0156679692718553e-06, + "loss": 0.84393579, + "num_input_tokens_seen": 241904280, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.23266602, + "step": 11208, + "time_per_iteration": 4.086644172668457 + }, + { + "auxiliary_loss_clip": 0.01296945, + "auxiliary_loss_mlp": 0.0023074, + "balance_loss_clip": 1.06009579, + "balance_loss_mlp": 0.20267785, + "epoch": 0.6739215391552683, + "flos": 19566898462080.0, + "grad_norm": 6.545634156104539, + "language_loss": 0.84514874, + "learning_rate": 1.0153289610145867e-06, + "loss": 0.86042559, + "num_input_tokens_seen": 241919190, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.28088379, + "step": 11209, + "time_per_iteration": 2.6883926391601562 + }, + { + "auxiliary_loss_clip": 0.01276144, + "auxiliary_loss_mlp": 0.00197979, + "balance_loss_clip": 1.05097556, + "balance_loss_mlp": 0.17306437, + "epoch": 0.6739816624079362, + "flos": 24388193485440.0, + "grad_norm": 72.91196250060742, + "language_loss": 0.77097631, + "learning_rate": 1.0149899900961428e-06, + "loss": 0.78571755, + "num_input_tokens_seen": 241940525, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.24926758, + "step": 11210, + "time_per_iteration": 2.698976993560791 + }, + { + "auxiliary_loss_clip": 0.01275776, + "auxiliary_loss_mlp": 0.0020364, + "balance_loss_clip": 1.0537796, + "balance_loss_mlp": 0.18085892, + "epoch": 0.6740417856606042, + "flos": 22528164533760.0, + "grad_norm": 163.47028337840828, + "language_loss": 0.87599576, + "learning_rate": 1.014651056529377e-06, + "loss": 0.89078987, + "num_input_tokens_seen": 241959290, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.22790527, + "step": 11211, + "time_per_iteration": 2.6795995235443115 + }, + { + "auxiliary_loss_clip": 0.01291122, + "auxiliary_loss_mlp": 0.00228971, + "balance_loss_clip": 1.0600332, + "balance_loss_mlp": 0.20408005, + "epoch": 0.6741019089132723, + "flos": 25775458606080.0, + "grad_norm": 69.66147143645453, + "language_loss": 0.82254744, + "learning_rate": 1.014312160327143e-06, + "loss": 0.83774835, + "num_input_tokens_seen": 241980715, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.2487793, + "step": 11212, + "time_per_iteration": 4.104069948196411 + }, + { + "auxiliary_loss_clip": 0.01300148, + "auxiliary_loss_mlp": 0.00225793, + "balance_loss_clip": 1.06230092, + "balance_loss_mlp": 0.19879256, + "epoch": 0.6741620321659402, + "flos": 21105671149440.0, + "grad_norm": 10.703458552470371, + "language_loss": 0.87071037, + "learning_rate": 1.0139733015022905e-06, + "loss": 0.88596982, + "num_input_tokens_seen": 241999985, + "router_z_loss_clip": 2.37890625, + "router_z_loss_mlp": 0.26977539, + "step": 11213, + "time_per_iteration": 2.6865451335906982 + }, + { + "auxiliary_loss_clip": 0.01281681, + "auxiliary_loss_mlp": 0.00217096, + "balance_loss_clip": 1.05103159, + "balance_loss_mlp": 0.19201475, + "epoch": 0.6742221554186082, + "flos": 20740423703040.0, + "grad_norm": 509.5370735052831, + "language_loss": 0.76606941, + "learning_rate": 1.0136344800676685e-06, + "loss": 0.78105724, + "num_input_tokens_seen": 242018990, + "router_z_loss_clip": 2.30273438, + "router_z_loss_mlp": 0.25097656, + "step": 11214, + "time_per_iteration": 2.76403546333313 + }, + { + "auxiliary_loss_clip": 0.01289843, + "auxiliary_loss_mlp": 0.00228712, + "balance_loss_clip": 1.05741894, + "balance_loss_mlp": 0.20335665, + "epoch": 0.6742822786712761, + "flos": 37774146384000.0, + "grad_norm": 132.8866380919774, + "language_loss": 0.78431439, + "learning_rate": 1.0132956960361263e-06, + "loss": 0.79949993, + "num_input_tokens_seen": 242039340, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.25354004, + "step": 11215, + "time_per_iteration": 2.825324773788452 + }, + { + "auxiliary_loss_clip": 0.01298552, + "auxiliary_loss_mlp": 0.00195974, + "balance_loss_clip": 1.05976915, + "balance_loss_mlp": 0.17063032, + "epoch": 0.6743424019239441, + "flos": 37263891732480.0, + "grad_norm": 3.943185180035349, + "language_loss": 0.75674945, + "learning_rate": 1.0129569494205096e-06, + "loss": 0.77169466, + "num_input_tokens_seen": 242062215, + "router_z_loss_clip": 2.38867188, + "router_z_loss_mlp": 0.25341797, + "step": 11216, + "time_per_iteration": 2.8052830696105957 + }, + { + "auxiliary_loss_clip": 0.01259489, + "auxiliary_loss_mlp": 0.00086099, + "balance_loss_clip": 1.1066761, + "balance_loss_mlp": 0.07908985, + "epoch": 0.674402525176612, + "flos": 65997746300160.0, + "grad_norm": 0.6755282063509241, + "language_loss": 0.5594821, + "learning_rate": 1.0126182402336646e-06, + "loss": 0.57293797, + "num_input_tokens_seen": 242131130, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.0703125, + "step": 11217, + "time_per_iteration": 3.246645450592041 + }, + { + "auxiliary_loss_clip": 0.01297422, + "auxiliary_loss_mlp": 0.00210943, + "balance_loss_clip": 1.06312394, + "balance_loss_mlp": 0.18516967, + "epoch": 0.67446264842928, + "flos": 26461208131200.0, + "grad_norm": 115.2914297863557, + "language_loss": 0.82990873, + "learning_rate": 1.0122795684884363e-06, + "loss": 0.8449924, + "num_input_tokens_seen": 242149720, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.25769043, + "step": 11218, + "time_per_iteration": 2.655461072921753 + }, + { + "auxiliary_loss_clip": 0.01323506, + "auxiliary_loss_mlp": 0.00224308, + "balance_loss_clip": 1.07618332, + "balance_loss_mlp": 0.19734278, + "epoch": 0.674522771681948, + "flos": 23732392924800.0, + "grad_norm": 206.36283447635344, + "language_loss": 0.75383103, + "learning_rate": 1.0119409341976639e-06, + "loss": 0.76930916, + "num_input_tokens_seen": 242168875, + "router_z_loss_clip": 2.47265625, + "router_z_loss_mlp": 0.26977539, + "step": 11219, + "time_per_iteration": 2.699918508529663 + }, + { + "auxiliary_loss_clip": 0.01286701, + "auxiliary_loss_mlp": 0.00218437, + "balance_loss_clip": 1.05775905, + "balance_loss_mlp": 0.19308147, + "epoch": 0.674582894934616, + "flos": 24754338771840.0, + "grad_norm": 7.738309191391277, + "language_loss": 0.83573312, + "learning_rate": 1.0116023373741904e-06, + "loss": 0.85078448, + "num_input_tokens_seen": 242188465, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.25366211, + "step": 11220, + "time_per_iteration": 2.6912553310394287 + }, + { + "auxiliary_loss_clip": 0.01267055, + "auxiliary_loss_mlp": 0.00215277, + "balance_loss_clip": 1.0424422, + "balance_loss_mlp": 0.1898137, + "epoch": 0.6746430181872839, + "flos": 24826626892800.0, + "grad_norm": 131.47479151310202, + "language_loss": 0.79646844, + "learning_rate": 1.0112637780308554e-06, + "loss": 0.81129175, + "num_input_tokens_seen": 242208675, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.25476074, + "step": 11221, + "time_per_iteration": 2.7170398235321045 + }, + { + "auxiliary_loss_clip": 0.0127352, + "auxiliary_loss_mlp": 0.00202385, + "balance_loss_clip": 1.04535341, + "balance_loss_mlp": 0.17907922, + "epoch": 0.6747031414399519, + "flos": 16873491087360.0, + "grad_norm": 148.23225497609852, + "language_loss": 0.68243349, + "learning_rate": 1.010925256180498e-06, + "loss": 0.69719255, + "num_input_tokens_seen": 242227440, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.23291016, + "step": 11222, + "time_per_iteration": 2.63856840133667 + }, + { + "auxiliary_loss_clip": 0.0128996, + "auxiliary_loss_mlp": 0.00240388, + "balance_loss_clip": 1.05994058, + "balance_loss_mlp": 0.21455535, + "epoch": 0.6747632646926198, + "flos": 22784925928320.0, + "grad_norm": 2.091110316342292, + "language_loss": 0.84203786, + "learning_rate": 1.0105867718359528e-06, + "loss": 0.85734141, + "num_input_tokens_seen": 242245240, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.25805664, + "step": 11223, + "time_per_iteration": 2.645334005355835 + }, + { + "auxiliary_loss_clip": 0.01286318, + "auxiliary_loss_mlp": 0.00240035, + "balance_loss_clip": 1.05398595, + "balance_loss_mlp": 0.21398789, + "epoch": 0.6748233879452878, + "flos": 20046090827520.0, + "grad_norm": 12.857476862051659, + "language_loss": 0.82169771, + "learning_rate": 1.0102483250100574e-06, + "loss": 0.83696127, + "num_input_tokens_seen": 242263435, + "router_z_loss_clip": 2.32226562, + "router_z_loss_mlp": 0.26037598, + "step": 11224, + "time_per_iteration": 2.644181489944458 + }, + { + "auxiliary_loss_clip": 0.01286945, + "auxiliary_loss_mlp": 0.00202064, + "balance_loss_clip": 1.05796456, + "balance_loss_mlp": 0.1789135, + "epoch": 0.6748835111979558, + "flos": 23002831785600.0, + "grad_norm": 5.7970593878259935, + "language_loss": 0.69243896, + "learning_rate": 1.0099099157156445e-06, + "loss": 0.70732903, + "num_input_tokens_seen": 242282765, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.23168945, + "step": 11225, + "time_per_iteration": 2.67832612991333 + }, + { + "auxiliary_loss_clip": 0.01267537, + "auxiliary_loss_mlp": 0.00210036, + "balance_loss_clip": 1.04456508, + "balance_loss_mlp": 0.18746983, + "epoch": 0.6749436344506238, + "flos": 12197311009920.0, + "grad_norm": 5.118617176185002, + "language_loss": 0.70526987, + "learning_rate": 1.0095715439655462e-06, + "loss": 0.72004557, + "num_input_tokens_seen": 242298980, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.22570801, + "step": 11226, + "time_per_iteration": 2.6403989791870117 + }, + { + "auxiliary_loss_clip": 0.01280401, + "auxiliary_loss_mlp": 0.00226755, + "balance_loss_clip": 1.05323482, + "balance_loss_mlp": 0.20056452, + "epoch": 0.6750037577032918, + "flos": 11873720361600.0, + "grad_norm": 17.00704244151231, + "language_loss": 0.81830925, + "learning_rate": 1.0092332097725945e-06, + "loss": 0.83338082, + "num_input_tokens_seen": 242315420, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.26220703, + "step": 11227, + "time_per_iteration": 2.6202220916748047 + }, + { + "auxiliary_loss_clip": 0.01266553, + "auxiliary_loss_mlp": 0.00219998, + "balance_loss_clip": 1.04112554, + "balance_loss_mlp": 0.19670437, + "epoch": 0.6750638809559597, + "flos": 17019611614080.0, + "grad_norm": 151.35065950682036, + "language_loss": 0.79755819, + "learning_rate": 1.0088949131496183e-06, + "loss": 0.81242365, + "num_input_tokens_seen": 242332805, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.2331543, + "step": 11228, + "time_per_iteration": 2.6803903579711914 + }, + { + "auxiliary_loss_clip": 0.01265982, + "auxiliary_loss_mlp": 0.00090376, + "balance_loss_clip": 1.10507941, + "balance_loss_mlp": 0.08217438, + "epoch": 0.6751240042086277, + "flos": 70951011891840.0, + "grad_norm": 0.7561516964718535, + "language_loss": 0.52518928, + "learning_rate": 1.0085566541094482e-06, + "loss": 0.53875291, + "num_input_tokens_seen": 242396160, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.08203125, + "step": 11229, + "time_per_iteration": 3.240126848220825 + }, + { + "auxiliary_loss_clip": 0.01290531, + "auxiliary_loss_mlp": 0.00202459, + "balance_loss_clip": 1.05854654, + "balance_loss_mlp": 0.17575563, + "epoch": 0.6751841274612956, + "flos": 22675146986880.0, + "grad_norm": 4.569299161972507, + "language_loss": 0.86787391, + "learning_rate": 1.0082184326649072e-06, + "loss": 0.8828038, + "num_input_tokens_seen": 242414660, + "router_z_loss_clip": 2.31835938, + "router_z_loss_mlp": 0.26721191, + "step": 11230, + "time_per_iteration": 2.696629285812378 + }, + { + "auxiliary_loss_clip": 0.01277966, + "auxiliary_loss_mlp": 0.00203423, + "balance_loss_clip": 1.05212879, + "balance_loss_mlp": 0.1796051, + "epoch": 0.6752442507139637, + "flos": 21288636051840.0, + "grad_norm": 513.3641229552558, + "language_loss": 0.74183339, + "learning_rate": 1.0078802488288228e-06, + "loss": 0.75664729, + "num_input_tokens_seen": 242434225, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.23828125, + "step": 11231, + "time_per_iteration": 2.6920838356018066 + }, + { + "auxiliary_loss_clip": 0.01348831, + "auxiliary_loss_mlp": 0.00248204, + "balance_loss_clip": 1.09460807, + "balance_loss_mlp": 0.21911699, + "epoch": 0.6753043739666316, + "flos": 28256921781120.0, + "grad_norm": 137.1191303029634, + "language_loss": 0.74597096, + "learning_rate": 1.0075421026140198e-06, + "loss": 0.76194131, + "num_input_tokens_seen": 242454355, + "router_z_loss_clip": 2.5390625, + "router_z_loss_mlp": 0.29077148, + "step": 11232, + "time_per_iteration": 2.831052541732788 + }, + { + "auxiliary_loss_clip": 0.01305869, + "auxiliary_loss_mlp": 0.0023058, + "balance_loss_clip": 1.07137322, + "balance_loss_mlp": 0.20481853, + "epoch": 0.6753644972192996, + "flos": 21360349555200.0, + "grad_norm": 15.280151081469556, + "language_loss": 0.7979033, + "learning_rate": 1.0072039940333188e-06, + "loss": 0.81326783, + "num_input_tokens_seen": 242474935, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.25720215, + "step": 11233, + "time_per_iteration": 2.6672158241271973 + }, + { + "auxiliary_loss_clip": 0.01287524, + "auxiliary_loss_mlp": 0.00236873, + "balance_loss_clip": 1.05835927, + "balance_loss_mlp": 0.2103014, + "epoch": 0.6754246204719675, + "flos": 26541971861760.0, + "grad_norm": 11.420094268039191, + "language_loss": 0.84996253, + "learning_rate": 1.0068659230995418e-06, + "loss": 0.86520648, + "num_input_tokens_seen": 242495530, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.26599121, + "step": 11234, + "time_per_iteration": 2.762568235397339 + }, + { + "auxiliary_loss_clip": 0.01314089, + "auxiliary_loss_mlp": 0.00232099, + "balance_loss_clip": 1.07786465, + "balance_loss_mlp": 0.20423998, + "epoch": 0.6754847437246355, + "flos": 25556690822400.0, + "grad_norm": 3.711701044595577, + "language_loss": 0.82746947, + "learning_rate": 1.0065278898255101e-06, + "loss": 0.84293139, + "num_input_tokens_seen": 242514550, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.27868652, + "step": 11235, + "time_per_iteration": 2.724761486053467 + }, + { + "auxiliary_loss_clip": 0.01268559, + "auxiliary_loss_mlp": 0.00098104, + "balance_loss_clip": 1.11155069, + "balance_loss_mlp": 0.08966371, + "epoch": 0.6755448669773034, + "flos": 59513318726400.0, + "grad_norm": 0.7701769422053569, + "language_loss": 0.50637114, + "learning_rate": 1.0061898942240387e-06, + "loss": 0.52003777, + "num_input_tokens_seen": 242569200, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.08447266, + "step": 11236, + "time_per_iteration": 3.120445966720581 + }, + { + "auxiliary_loss_clip": 0.01301247, + "auxiliary_loss_mlp": 0.00223249, + "balance_loss_clip": 1.06360793, + "balance_loss_mlp": 0.19707045, + "epoch": 0.6756049902299714, + "flos": 23294534135040.0, + "grad_norm": 2.6211664404715767, + "language_loss": 0.84217304, + "learning_rate": 1.0058519363079464e-06, + "loss": 0.857418, + "num_input_tokens_seen": 242586950, + "router_z_loss_clip": 2.37109375, + "router_z_loss_mlp": 0.26196289, + "step": 11237, + "time_per_iteration": 2.6781067848205566 + }, + { + "auxiliary_loss_clip": 0.01327772, + "auxiliary_loss_mlp": 0.00222063, + "balance_loss_clip": 1.08601165, + "balance_loss_mlp": 0.19477563, + "epoch": 0.6756651134826394, + "flos": 31575426566400.0, + "grad_norm": 24.203126446412714, + "language_loss": 0.87107146, + "learning_rate": 1.0055140160900482e-06, + "loss": 0.88656974, + "num_input_tokens_seen": 242607380, + "router_z_loss_clip": 2.41796875, + "router_z_loss_mlp": 0.27282715, + "step": 11238, + "time_per_iteration": 2.7278542518615723 + }, + { + "auxiliary_loss_clip": 0.01299646, + "auxiliary_loss_mlp": 0.00228279, + "balance_loss_clip": 1.06509101, + "balance_loss_mlp": 0.20033616, + "epoch": 0.6757252367353074, + "flos": 27272287186560.0, + "grad_norm": 82.4615143611275, + "language_loss": 0.7579577, + "learning_rate": 1.0051761335831587e-06, + "loss": 0.77323693, + "num_input_tokens_seen": 242628025, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.27966309, + "step": 11239, + "time_per_iteration": 2.6949501037597656 + }, + { + "auxiliary_loss_clip": 0.01287329, + "auxiliary_loss_mlp": 0.00207811, + "balance_loss_clip": 1.06155765, + "balance_loss_mlp": 0.18237215, + "epoch": 0.6757853599879754, + "flos": 16830900535680.0, + "grad_norm": 50.99771384188272, + "language_loss": 0.89265299, + "learning_rate": 1.0048382888000898e-06, + "loss": 0.90760446, + "num_input_tokens_seen": 242643825, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.25402832, + "step": 11240, + "time_per_iteration": 2.5914981365203857 + }, + { + "auxiliary_loss_clip": 0.0136126, + "auxiliary_loss_mlp": 0.00229392, + "balance_loss_clip": 1.10285079, + "balance_loss_mlp": 0.19899383, + "epoch": 0.6758454832406433, + "flos": 23220055284480.0, + "grad_norm": 5.95678381639936, + "language_loss": 0.90519518, + "learning_rate": 1.0045004817536525e-06, + "loss": 0.92110169, + "num_input_tokens_seen": 242661820, + "router_z_loss_clip": 2.58398438, + "router_z_loss_mlp": 0.30395508, + "step": 11241, + "time_per_iteration": 2.6538054943084717 + }, + { + "auxiliary_loss_clip": 0.01311054, + "auxiliary_loss_mlp": 0.00231149, + "balance_loss_clip": 1.07782602, + "balance_loss_mlp": 0.20574597, + "epoch": 0.6759056064933113, + "flos": 16289547684480.0, + "grad_norm": 29.674028975161615, + "language_loss": 0.89524812, + "learning_rate": 1.0041627124566572e-06, + "loss": 0.91067016, + "num_input_tokens_seen": 242679890, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.25415039, + "step": 11242, + "time_per_iteration": 2.622302532196045 + }, + { + "auxiliary_loss_clip": 0.01295658, + "auxiliary_loss_mlp": 0.00226701, + "balance_loss_clip": 1.06593955, + "balance_loss_mlp": 0.20022488, + "epoch": 0.6759657297459792, + "flos": 25922297404800.0, + "grad_norm": 47.5442822572552, + "language_loss": 0.8046267, + "learning_rate": 1.0038249809219109e-06, + "loss": 0.81985033, + "num_input_tokens_seen": 242699495, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.26501465, + "step": 11243, + "time_per_iteration": 2.7488772869110107 + }, + { + "auxiliary_loss_clip": 0.01292892, + "auxiliary_loss_mlp": 0.0020828, + "balance_loss_clip": 1.06154251, + "balance_loss_mlp": 0.18388994, + "epoch": 0.6760258529986473, + "flos": 23000820624000.0, + "grad_norm": 14.78352165159696, + "language_loss": 0.80294251, + "learning_rate": 1.003487287162221e-06, + "loss": 0.81795424, + "num_input_tokens_seen": 242719500, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.24365234, + "step": 11244, + "time_per_iteration": 2.6461992263793945 + }, + { + "auxiliary_loss_clip": 0.01291896, + "auxiliary_loss_mlp": 0.00199837, + "balance_loss_clip": 1.06276941, + "balance_loss_mlp": 0.17524418, + "epoch": 0.6760859762513152, + "flos": 20959335141120.0, + "grad_norm": 3.097029647035086, + "language_loss": 0.94757545, + "learning_rate": 1.003149631190393e-06, + "loss": 0.96249282, + "num_input_tokens_seen": 242738325, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.24609375, + "step": 11245, + "time_per_iteration": 4.072397708892822 + }, + { + "auxiliary_loss_clip": 0.01308007, + "auxiliary_loss_mlp": 0.00213578, + "balance_loss_clip": 1.07433724, + "balance_loss_mlp": 0.18735172, + "epoch": 0.6761460995039832, + "flos": 23622937205760.0, + "grad_norm": 6.154526126456195, + "language_loss": 0.82243657, + "learning_rate": 1.0028120130192327e-06, + "loss": 0.83765239, + "num_input_tokens_seen": 242756620, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.2623291, + "step": 11246, + "time_per_iteration": 4.251142978668213 + }, + { + "auxiliary_loss_clip": 0.0129426, + "auxiliary_loss_mlp": 0.00211815, + "balance_loss_clip": 1.06401968, + "balance_loss_mlp": 0.18742456, + "epoch": 0.6762062227566511, + "flos": 20770875457920.0, + "grad_norm": 3.325448848588042, + "language_loss": 0.943434, + "learning_rate": 1.002474432661539e-06, + "loss": 0.95849478, + "num_input_tokens_seen": 242774505, + "router_z_loss_clip": 2.30273438, + "router_z_loss_mlp": 0.24401855, + "step": 11247, + "time_per_iteration": 2.734910488128662 + }, + { + "auxiliary_loss_clip": 0.01284792, + "auxiliary_loss_mlp": 0.00216099, + "balance_loss_clip": 1.12481809, + "balance_loss_mlp": 0.20689617, + "epoch": 0.6762663460093191, + "flos": 52818099166080.0, + "grad_norm": 0.8088087879989856, + "language_loss": 0.53630304, + "learning_rate": 1.002136890130115e-06, + "loss": 0.55131191, + "num_input_tokens_seen": 242828645, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.09179688, + "step": 11248, + "time_per_iteration": 3.173039674758911 + }, + { + "auxiliary_loss_clip": 0.01299586, + "auxiliary_loss_mlp": 0.00215846, + "balance_loss_clip": 1.06996012, + "balance_loss_mlp": 0.19078809, + "epoch": 0.676326469261987, + "flos": 23696302734720.0, + "grad_norm": 11.256525265876824, + "language_loss": 0.818546, + "learning_rate": 1.001799385437761e-06, + "loss": 0.8337003, + "num_input_tokens_seen": 242850100, + "router_z_loss_clip": 2.30273438, + "router_z_loss_mlp": 0.25048828, + "step": 11249, + "time_per_iteration": 2.707488536834717 + }, + { + "auxiliary_loss_clip": 0.0132297, + "auxiliary_loss_mlp": 0.00218591, + "balance_loss_clip": 1.07952595, + "balance_loss_mlp": 0.19001648, + "epoch": 0.676386592514655, + "flos": 14063732582400.0, + "grad_norm": 23.921609418666314, + "language_loss": 0.8387078, + "learning_rate": 1.0014619185972732e-06, + "loss": 0.85412347, + "num_input_tokens_seen": 242867775, + "router_z_loss_clip": 2.4296875, + "router_z_loss_mlp": 0.28552246, + "step": 11250, + "time_per_iteration": 4.140531778335571 + }, + { + "auxiliary_loss_clip": 0.01304699, + "auxiliary_loss_mlp": 0.00245014, + "balance_loss_clip": 1.07163429, + "balance_loss_mlp": 0.21872875, + "epoch": 0.676446715767323, + "flos": 20412236113920.0, + "grad_norm": 1211.709643533875, + "language_loss": 0.81856942, + "learning_rate": 1.0011244896214497e-06, + "loss": 0.83406657, + "num_input_tokens_seen": 242886865, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.26293945, + "step": 11251, + "time_per_iteration": 2.6566455364227295 + }, + { + "auxiliary_loss_clip": 0.01332642, + "auxiliary_loss_mlp": 0.00213477, + "balance_loss_clip": 1.08879435, + "balance_loss_mlp": 0.18591629, + "epoch": 0.676506839019991, + "flos": 21288241002240.0, + "grad_norm": 118.297584514589, + "language_loss": 0.80952287, + "learning_rate": 1.0007870985230873e-06, + "loss": 0.82498407, + "num_input_tokens_seen": 242906705, + "router_z_loss_clip": 2.43945312, + "router_z_loss_mlp": 0.27563477, + "step": 11252, + "time_per_iteration": 2.6716134548187256 + }, + { + "auxiliary_loss_clip": 0.01293741, + "auxiliary_loss_mlp": 0.00200295, + "balance_loss_clip": 1.06821823, + "balance_loss_mlp": 0.17675166, + "epoch": 0.676566962272659, + "flos": 29932477459200.0, + "grad_norm": 21.117730126635536, + "language_loss": 0.75473469, + "learning_rate": 1.0004497453149765e-06, + "loss": 0.76967514, + "num_input_tokens_seen": 242925215, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.23547363, + "step": 11253, + "time_per_iteration": 2.7073874473571777 + }, + { + "auxiliary_loss_clip": 0.0132758, + "auxiliary_loss_mlp": 0.00236335, + "balance_loss_clip": 1.0827179, + "balance_loss_mlp": 0.20674753, + "epoch": 0.6766270855253269, + "flos": 17931203902080.0, + "grad_norm": 4.630326378458993, + "language_loss": 0.8581233, + "learning_rate": 1.0001124300099115e-06, + "loss": 0.87376237, + "num_input_tokens_seen": 242944750, + "router_z_loss_clip": 2.44726562, + "router_z_loss_mlp": 0.29614258, + "step": 11254, + "time_per_iteration": 4.0635294914245605 + }, + { + "auxiliary_loss_clip": 0.01321265, + "auxiliary_loss_mlp": 0.00210824, + "balance_loss_clip": 1.0840261, + "balance_loss_mlp": 0.18314409, + "epoch": 0.6766872087779949, + "flos": 23104853389440.0, + "grad_norm": 7.81418434837909, + "language_loss": 0.80079079, + "learning_rate": 9.997751526206835e-07, + "loss": 0.81611168, + "num_input_tokens_seen": 242963860, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.27709961, + "step": 11255, + "time_per_iteration": 2.6828413009643555 + }, + { + "auxiliary_loss_clip": 0.01326996, + "auxiliary_loss_mlp": 0.00231535, + "balance_loss_clip": 1.08142185, + "balance_loss_mlp": 0.20355611, + "epoch": 0.6767473320306628, + "flos": 26213137827840.0, + "grad_norm": 32.36280186990526, + "language_loss": 0.85633969, + "learning_rate": 9.994379131600828e-07, + "loss": 0.871925, + "num_input_tokens_seen": 242983050, + "router_z_loss_clip": 2.453125, + "router_z_loss_mlp": 0.27966309, + "step": 11256, + "time_per_iteration": 2.701913356781006 + }, + { + "auxiliary_loss_clip": 0.01313479, + "auxiliary_loss_mlp": 0.0020358, + "balance_loss_clip": 1.07653534, + "balance_loss_mlp": 0.17727031, + "epoch": 0.6768074552833309, + "flos": 18368739469440.0, + "grad_norm": 44.33197210832063, + "language_loss": 0.77429003, + "learning_rate": 9.991007116408965e-07, + "loss": 0.78946066, + "num_input_tokens_seen": 243001125, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.26330566, + "step": 11257, + "time_per_iteration": 2.6664254665374756 + }, + { + "auxiliary_loss_clip": 0.01286041, + "auxiliary_loss_mlp": 0.00218575, + "balance_loss_clip": 1.05911541, + "balance_loss_mlp": 0.1944114, + "epoch": 0.6768675785359988, + "flos": 23039927556480.0, + "grad_norm": 116.03369803766219, + "language_loss": 0.81370461, + "learning_rate": 9.987635480759109e-07, + "loss": 0.82875073, + "num_input_tokens_seen": 243021865, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.24157715, + "step": 11258, + "time_per_iteration": 2.6875154972076416 + }, + { + "auxiliary_loss_clip": 0.01289812, + "auxiliary_loss_mlp": 0.0021406, + "balance_loss_clip": 1.06337559, + "balance_loss_mlp": 0.19053976, + "epoch": 0.6769277017886668, + "flos": 33036524092800.0, + "grad_norm": 3.442452905056381, + "language_loss": 0.74364018, + "learning_rate": 9.984264224779127e-07, + "loss": 0.75867891, + "num_input_tokens_seen": 243042970, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.23535156, + "step": 11259, + "time_per_iteration": 2.809744119644165 + }, + { + "auxiliary_loss_clip": 0.01286401, + "auxiliary_loss_mlp": 0.00197924, + "balance_loss_clip": 1.0555532, + "balance_loss_mlp": 0.17205521, + "epoch": 0.6769878250413347, + "flos": 20848406964480.0, + "grad_norm": 9.371611021795106, + "language_loss": 0.9336834, + "learning_rate": 9.980893348596839e-07, + "loss": 0.94852662, + "num_input_tokens_seen": 243058470, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.25842285, + "step": 11260, + "time_per_iteration": 2.645406484603882 + }, + { + "auxiliary_loss_clip": 0.013201, + "auxiliary_loss_mlp": 0.00237704, + "balance_loss_clip": 1.07588589, + "balance_loss_mlp": 0.21011904, + "epoch": 0.6770479482940027, + "flos": 15595968994560.0, + "grad_norm": 186.54146122843073, + "language_loss": 0.87568104, + "learning_rate": 9.977522852340081e-07, + "loss": 0.89125907, + "num_input_tokens_seen": 243076630, + "router_z_loss_clip": 2.44335938, + "router_z_loss_mlp": 0.27612305, + "step": 11261, + "time_per_iteration": 2.636981248855591 + }, + { + "auxiliary_loss_clip": 0.01288065, + "auxiliary_loss_mlp": 0.00205718, + "balance_loss_clip": 1.05966091, + "balance_loss_mlp": 0.18054113, + "epoch": 0.6771080715466706, + "flos": 18621011664000.0, + "grad_norm": 1.9344882801306478, + "language_loss": 0.9472822, + "learning_rate": 9.97415273613666e-07, + "loss": 0.96222007, + "num_input_tokens_seen": 243092260, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.2520752, + "step": 11262, + "time_per_iteration": 2.619154453277588 + }, + { + "auxiliary_loss_clip": 0.01296493, + "auxiliary_loss_mlp": 0.00201708, + "balance_loss_clip": 1.06491542, + "balance_loss_mlp": 0.17625666, + "epoch": 0.6771681947993387, + "flos": 12495441893760.0, + "grad_norm": 55.76934331525783, + "language_loss": 0.82805419, + "learning_rate": 9.97078300011439e-07, + "loss": 0.84303617, + "num_input_tokens_seen": 243109405, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.2545166, + "step": 11263, + "time_per_iteration": 2.6332430839538574 + }, + { + "auxiliary_loss_clip": 0.01306044, + "auxiliary_loss_mlp": 0.00213262, + "balance_loss_clip": 1.06775928, + "balance_loss_mlp": 0.18449718, + "epoch": 0.6772283180520066, + "flos": 22236964974720.0, + "grad_norm": 2.9676118471325044, + "language_loss": 0.79353553, + "learning_rate": 9.967413644401016e-07, + "loss": 0.80872858, + "num_input_tokens_seen": 243128135, + "router_z_loss_clip": 2.38476562, + "router_z_loss_mlp": 0.28747559, + "step": 11264, + "time_per_iteration": 2.6286263465881348 + }, + { + "auxiliary_loss_clip": 0.01309012, + "auxiliary_loss_mlp": 0.00207027, + "balance_loss_clip": 1.07407391, + "balance_loss_mlp": 0.17853622, + "epoch": 0.6772884413046746, + "flos": 16143139848960.0, + "grad_norm": 4.539254012112613, + "language_loss": 0.84984136, + "learning_rate": 9.964044669124324e-07, + "loss": 0.8650018, + "num_input_tokens_seen": 243146785, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.28466797, + "step": 11265, + "time_per_iteration": 2.6215333938598633 + }, + { + "auxiliary_loss_clip": 0.01295043, + "auxiliary_loss_mlp": 0.00212036, + "balance_loss_clip": 1.06685257, + "balance_loss_mlp": 0.18716902, + "epoch": 0.6773485645573426, + "flos": 19135755515520.0, + "grad_norm": 193.91547836874702, + "language_loss": 0.69858325, + "learning_rate": 9.96067607441207e-07, + "loss": 0.71365404, + "num_input_tokens_seen": 243165275, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.24853516, + "step": 11266, + "time_per_iteration": 2.6507067680358887 + }, + { + "auxiliary_loss_clip": 0.01299726, + "auxiliary_loss_mlp": 0.00223054, + "balance_loss_clip": 1.0698086, + "balance_loss_mlp": 0.19649437, + "epoch": 0.6774086878100105, + "flos": 14136918543360.0, + "grad_norm": 6.073911330931136, + "language_loss": 0.79655826, + "learning_rate": 9.957307860391976e-07, + "loss": 0.81178606, + "num_input_tokens_seen": 243182845, + "router_z_loss_clip": 2.30273438, + "router_z_loss_mlp": 0.26574707, + "step": 11267, + "time_per_iteration": 2.608888864517212 + }, + { + "auxiliary_loss_clip": 0.01324473, + "auxiliary_loss_mlp": 0.00218028, + "balance_loss_clip": 1.07969809, + "balance_loss_mlp": 0.18952471, + "epoch": 0.6774688110626785, + "flos": 22197067943040.0, + "grad_norm": 9.966181283008698, + "language_loss": 0.77148485, + "learning_rate": 9.953940027191785e-07, + "loss": 0.78690988, + "num_input_tokens_seen": 243201475, + "router_z_loss_clip": 2.44726562, + "router_z_loss_mlp": 0.28540039, + "step": 11268, + "time_per_iteration": 2.723778486251831 + }, + { + "auxiliary_loss_clip": 0.0132533, + "auxiliary_loss_mlp": 0.00219414, + "balance_loss_clip": 1.08728361, + "balance_loss_mlp": 0.19073229, + "epoch": 0.6775289343153464, + "flos": 23039963470080.0, + "grad_norm": 31.458423882901357, + "language_loss": 0.8415426, + "learning_rate": 9.950572574939194e-07, + "loss": 0.85698998, + "num_input_tokens_seen": 243221850, + "router_z_loss_clip": 2.37695312, + "router_z_loss_mlp": 0.28686523, + "step": 11269, + "time_per_iteration": 2.660388469696045 + }, + { + "auxiliary_loss_clip": 0.01299854, + "auxiliary_loss_mlp": 0.00210996, + "balance_loss_clip": 1.06879091, + "balance_loss_mlp": 0.18317288, + "epoch": 0.6775890575680145, + "flos": 18293506433280.0, + "grad_norm": 170.92684964360814, + "language_loss": 0.83930421, + "learning_rate": 9.94720550376189e-07, + "loss": 0.85441267, + "num_input_tokens_seen": 243239855, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.27844238, + "step": 11270, + "time_per_iteration": 2.6789307594299316 + }, + { + "auxiliary_loss_clip": 0.01301153, + "auxiliary_loss_mlp": 0.00222821, + "balance_loss_clip": 1.06892323, + "balance_loss_mlp": 0.19506958, + "epoch": 0.6776491808206824, + "flos": 25336450581120.0, + "grad_norm": 3.692228090660706, + "language_loss": 0.79510987, + "learning_rate": 9.94383881378756e-07, + "loss": 0.81034958, + "num_input_tokens_seen": 243260085, + "router_z_loss_clip": 2.32226562, + "router_z_loss_mlp": 0.27734375, + "step": 11271, + "time_per_iteration": 2.6820244789123535 + }, + { + "auxiliary_loss_clip": 0.01314575, + "auxiliary_loss_mlp": 0.00188953, + "balance_loss_clip": 1.08086634, + "balance_loss_mlp": 0.16160686, + "epoch": 0.6777093040733504, + "flos": 26028233591040.0, + "grad_norm": 21.29938881010118, + "language_loss": 0.76385337, + "learning_rate": 9.94047250514387e-07, + "loss": 0.77888864, + "num_input_tokens_seen": 243280065, + "router_z_loss_clip": 2.33789062, + "router_z_loss_mlp": 0.2734375, + "step": 11272, + "time_per_iteration": 2.7630057334899902 + }, + { + "auxiliary_loss_clip": 0.01306895, + "auxiliary_loss_mlp": 0.00210071, + "balance_loss_clip": 1.07330513, + "balance_loss_mlp": 0.18007761, + "epoch": 0.6777694273260183, + "flos": 18003599763840.0, + "grad_norm": 4.311547946234995, + "language_loss": 0.81897163, + "learning_rate": 9.937106577958481e-07, + "loss": 0.83414137, + "num_input_tokens_seen": 243297775, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.29980469, + "step": 11273, + "time_per_iteration": 2.663538694381714 + }, + { + "auxiliary_loss_clip": 0.01300425, + "auxiliary_loss_mlp": 0.00221647, + "balance_loss_clip": 1.06985056, + "balance_loss_mlp": 0.19283403, + "epoch": 0.6778295505786863, + "flos": 23441085624960.0, + "grad_norm": 748.7022476552714, + "language_loss": 0.77919465, + "learning_rate": 9.933741032359015e-07, + "loss": 0.79441535, + "num_input_tokens_seen": 243315760, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.28820801, + "step": 11274, + "time_per_iteration": 2.753005027770996 + }, + { + "auxiliary_loss_clip": 0.01307842, + "auxiliary_loss_mlp": 0.00201453, + "balance_loss_clip": 1.07115901, + "balance_loss_mlp": 0.17343906, + "epoch": 0.6778896738313542, + "flos": 19098408349440.0, + "grad_norm": 60.24082870717384, + "language_loss": 0.74781477, + "learning_rate": 9.930375868473093e-07, + "loss": 0.76290768, + "num_input_tokens_seen": 243335715, + "router_z_loss_clip": 2.3671875, + "router_z_loss_mlp": 0.27990723, + "step": 11275, + "time_per_iteration": 2.6674227714538574 + }, + { + "auxiliary_loss_clip": 0.01295039, + "auxiliary_loss_mlp": 0.00220688, + "balance_loss_clip": 1.07061398, + "balance_loss_mlp": 0.19574933, + "epoch": 0.6779497970840223, + "flos": 26103933504000.0, + "grad_norm": 81.46913128112362, + "language_loss": 0.80885571, + "learning_rate": 9.927011086428335e-07, + "loss": 0.82401299, + "num_input_tokens_seen": 243356935, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.24926758, + "step": 11276, + "time_per_iteration": 2.754884719848633 + }, + { + "auxiliary_loss_clip": 0.01292634, + "auxiliary_loss_mlp": 0.00201544, + "balance_loss_clip": 1.06803954, + "balance_loss_mlp": 0.17503206, + "epoch": 0.6780099203366902, + "flos": 19719232041600.0, + "grad_norm": 33.89928082546112, + "language_loss": 0.85210454, + "learning_rate": 9.923646686352317e-07, + "loss": 0.86704636, + "num_input_tokens_seen": 243375625, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.26550293, + "step": 11277, + "time_per_iteration": 2.631744146347046 + }, + { + "auxiliary_loss_clip": 0.01337403, + "auxiliary_loss_mlp": 0.0022117, + "balance_loss_clip": 1.09417415, + "balance_loss_mlp": 0.19214261, + "epoch": 0.6780700435893582, + "flos": 18214538382720.0, + "grad_norm": 103.35896418452627, + "language_loss": 0.9295696, + "learning_rate": 9.920282668372627e-07, + "loss": 0.94515538, + "num_input_tokens_seen": 243390195, + "router_z_loss_clip": 2.43164062, + "router_z_loss_mlp": 0.29016113, + "step": 11278, + "time_per_iteration": 2.642486572265625 + }, + { + "auxiliary_loss_clip": 0.01300989, + "auxiliary_loss_mlp": 0.00211436, + "balance_loss_clip": 1.07519364, + "balance_loss_mlp": 0.18568656, + "epoch": 0.6781301668420262, + "flos": 25376239872000.0, + "grad_norm": 5.398722570520613, + "language_loss": 0.76297128, + "learning_rate": 9.916919032616844e-07, + "loss": 0.77809548, + "num_input_tokens_seen": 243411690, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.25744629, + "step": 11279, + "time_per_iteration": 2.7287485599517822 + }, + { + "auxiliary_loss_clip": 0.01315107, + "auxiliary_loss_mlp": 0.00208959, + "balance_loss_clip": 1.0800308, + "balance_loss_mlp": 0.18281616, + "epoch": 0.6781902900946941, + "flos": 24020432087040.0, + "grad_norm": 855.4106859756599, + "language_loss": 0.8327291, + "learning_rate": 9.913555779212485e-07, + "loss": 0.84796971, + "num_input_tokens_seen": 243430280, + "router_z_loss_clip": 2.34960938, + "router_z_loss_mlp": 0.26159668, + "step": 11280, + "time_per_iteration": 2.6987435817718506 + }, + { + "auxiliary_loss_clip": 0.01321982, + "auxiliary_loss_mlp": 0.00219451, + "balance_loss_clip": 1.08347535, + "balance_loss_mlp": 0.19283199, + "epoch": 0.6782504133473621, + "flos": 19646764352640.0, + "grad_norm": 29.511036963529197, + "language_loss": 0.80397546, + "learning_rate": 9.910192908287104e-07, + "loss": 0.81938982, + "num_input_tokens_seen": 243448690, + "router_z_loss_clip": 2.38476562, + "router_z_loss_mlp": 0.26623535, + "step": 11281, + "time_per_iteration": 2.7068095207214355 + }, + { + "auxiliary_loss_clip": 0.01330775, + "auxiliary_loss_mlp": 0.0021149, + "balance_loss_clip": 1.09729505, + "balance_loss_mlp": 0.18651575, + "epoch": 0.67831053660003, + "flos": 24932742647040.0, + "grad_norm": 101.57531249571609, + "language_loss": 0.70746022, + "learning_rate": 9.906830419968217e-07, + "loss": 0.72288287, + "num_input_tokens_seen": 243470695, + "router_z_loss_clip": 2.33789062, + "router_z_loss_mlp": 0.24975586, + "step": 11282, + "time_per_iteration": 2.714597225189209 + }, + { + "auxiliary_loss_clip": 0.01344551, + "auxiliary_loss_mlp": 0.00236908, + "balance_loss_clip": 1.09721136, + "balance_loss_mlp": 0.20850082, + "epoch": 0.6783706598526981, + "flos": 31208383440000.0, + "grad_norm": 4.191243079076073, + "language_loss": 0.81996882, + "learning_rate": 9.90346831438334e-07, + "loss": 0.83578336, + "num_input_tokens_seen": 243493345, + "router_z_loss_clip": 2.47265625, + "router_z_loss_mlp": 0.28417969, + "step": 11283, + "time_per_iteration": 2.768838882446289 + }, + { + "auxiliary_loss_clip": 0.01306603, + "auxiliary_loss_mlp": 0.00220294, + "balance_loss_clip": 1.0757277, + "balance_loss_mlp": 0.19456902, + "epoch": 0.678430783105366, + "flos": 35441317687680.0, + "grad_norm": 9.038976015586, + "language_loss": 0.62665313, + "learning_rate": 9.900106591659948e-07, + "loss": 0.64192212, + "num_input_tokens_seen": 243515670, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.25732422, + "step": 11284, + "time_per_iteration": 2.806441068649292 + }, + { + "auxiliary_loss_clip": 0.01317183, + "auxiliary_loss_mlp": 0.00218698, + "balance_loss_clip": 1.08059013, + "balance_loss_mlp": 0.19248353, + "epoch": 0.678490906358034, + "flos": 14428800460800.0, + "grad_norm": 2.772937886423487, + "language_loss": 0.8447901, + "learning_rate": 9.896745251925535e-07, + "loss": 0.86014891, + "num_input_tokens_seen": 243533625, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.26196289, + "step": 11285, + "time_per_iteration": 2.6513357162475586 + }, + { + "auxiliary_loss_clip": 0.01309576, + "auxiliary_loss_mlp": 0.0021389, + "balance_loss_clip": 1.07677889, + "balance_loss_mlp": 0.18682925, + "epoch": 0.6785510296107019, + "flos": 24311236596480.0, + "grad_norm": 28.42528101944517, + "language_loss": 0.73731726, + "learning_rate": 9.893384295307557e-07, + "loss": 0.75255191, + "num_input_tokens_seen": 243553040, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.27026367, + "step": 11286, + "time_per_iteration": 2.6624886989593506 + }, + { + "auxiliary_loss_clip": 0.01335215, + "auxiliary_loss_mlp": 0.00223106, + "balance_loss_clip": 1.08952641, + "balance_loss_mlp": 0.19426952, + "epoch": 0.6786111528633699, + "flos": 26977244872320.0, + "grad_norm": 10.122499328015232, + "language_loss": 0.65500963, + "learning_rate": 9.890023721933447e-07, + "loss": 0.6705929, + "num_input_tokens_seen": 243572590, + "router_z_loss_clip": 2.45898438, + "router_z_loss_mlp": 0.28808594, + "step": 11287, + "time_per_iteration": 4.075164794921875 + }, + { + "auxiliary_loss_clip": 0.01297356, + "auxiliary_loss_mlp": 0.00241007, + "balance_loss_clip": 1.07133138, + "balance_loss_mlp": 0.21437603, + "epoch": 0.6786712761160378, + "flos": 24317557390080.0, + "grad_norm": 4.808566460524855, + "language_loss": 0.82946181, + "learning_rate": 9.886663531930655e-07, + "loss": 0.84484541, + "num_input_tokens_seen": 243594140, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.26635742, + "step": 11288, + "time_per_iteration": 2.7399380207061768 + }, + { + "auxiliary_loss_clip": 0.0134779, + "auxiliary_loss_mlp": 0.00222315, + "balance_loss_clip": 1.10494685, + "balance_loss_mlp": 0.19493297, + "epoch": 0.6787313993687059, + "flos": 22930435923840.0, + "grad_norm": 12.800271192634636, + "language_loss": 0.80086672, + "learning_rate": 9.883303725426593e-07, + "loss": 0.81656778, + "num_input_tokens_seen": 243615170, + "router_z_loss_clip": 2.43164062, + "router_z_loss_mlp": 0.2734375, + "step": 11289, + "time_per_iteration": 4.155390739440918 + }, + { + "auxiliary_loss_clip": 0.01304629, + "auxiliary_loss_mlp": 0.00212805, + "balance_loss_clip": 1.07342029, + "balance_loss_mlp": 0.18568458, + "epoch": 0.6787915226213738, + "flos": 26868435598080.0, + "grad_norm": 5.047538352736781, + "language_loss": 0.86163843, + "learning_rate": 9.879944302548682e-07, + "loss": 0.87681276, + "num_input_tokens_seen": 243635675, + "router_z_loss_clip": 2.31054688, + "router_z_loss_mlp": 0.27111816, + "step": 11290, + "time_per_iteration": 2.746690511703491 + }, + { + "auxiliary_loss_clip": 0.01309427, + "auxiliary_loss_mlp": 0.00231763, + "balance_loss_clip": 1.08349967, + "balance_loss_mlp": 0.20718208, + "epoch": 0.6788516458740418, + "flos": 20008851402240.0, + "grad_norm": 32.34611754149499, + "language_loss": 0.80958533, + "learning_rate": 9.87658526342428e-07, + "loss": 0.82499725, + "num_input_tokens_seen": 243654950, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.24584961, + "step": 11291, + "time_per_iteration": 2.705991744995117 + }, + { + "auxiliary_loss_clip": 0.01312834, + "auxiliary_loss_mlp": 0.00211017, + "balance_loss_clip": 1.07986045, + "balance_loss_mlp": 0.18531564, + "epoch": 0.6789117691267098, + "flos": 28727099832960.0, + "grad_norm": 4.16007525496291, + "language_loss": 0.82633233, + "learning_rate": 9.873226608180785e-07, + "loss": 0.84157085, + "num_input_tokens_seen": 243674970, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.25695801, + "step": 11292, + "time_per_iteration": 4.213390588760376 + }, + { + "auxiliary_loss_clip": 0.01332597, + "auxiliary_loss_mlp": 0.00260498, + "balance_loss_clip": 1.09197164, + "balance_loss_mlp": 0.23175633, + "epoch": 0.6789718923793777, + "flos": 23403451150080.0, + "grad_norm": 8.580508335467105, + "language_loss": 0.91306704, + "learning_rate": 9.869868336945556e-07, + "loss": 0.92899799, + "num_input_tokens_seen": 243693440, + "router_z_loss_clip": 2.41210938, + "router_z_loss_mlp": 0.28747559, + "step": 11293, + "time_per_iteration": 2.721670150756836 + }, + { + "auxiliary_loss_clip": 0.01359747, + "auxiliary_loss_mlp": 0.00232706, + "balance_loss_clip": 1.11129522, + "balance_loss_mlp": 0.20271252, + "epoch": 0.6790320156320457, + "flos": 20448865008000.0, + "grad_norm": 3.8682650109924035, + "language_loss": 0.91052288, + "learning_rate": 9.866510449845929e-07, + "loss": 0.92644745, + "num_input_tokens_seen": 243710055, + "router_z_loss_clip": 2.48632812, + "router_z_loss_mlp": 0.29968262, + "step": 11294, + "time_per_iteration": 2.7330214977264404 + }, + { + "auxiliary_loss_clip": 0.01322434, + "auxiliary_loss_mlp": 0.00214486, + "balance_loss_clip": 1.08948255, + "balance_loss_mlp": 0.18921322, + "epoch": 0.6790921388847136, + "flos": 24167199058560.0, + "grad_norm": 50.93826601598886, + "language_loss": 0.85482961, + "learning_rate": 9.86315294700924e-07, + "loss": 0.87019879, + "num_input_tokens_seen": 243728635, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.25280762, + "step": 11295, + "time_per_iteration": 2.6926283836364746 + }, + { + "auxiliary_loss_clip": 0.01294288, + "auxiliary_loss_mlp": 0.00208455, + "balance_loss_clip": 1.07394433, + "balance_loss_mlp": 0.18439864, + "epoch": 0.6791522621373817, + "flos": 21908095027200.0, + "grad_norm": 3.7104553850605106, + "language_loss": 0.79499489, + "learning_rate": 9.859795828562823e-07, + "loss": 0.81002235, + "num_input_tokens_seen": 243748330, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.24047852, + "step": 11296, + "time_per_iteration": 4.196814298629761 + }, + { + "auxiliary_loss_clip": 0.01308648, + "auxiliary_loss_mlp": 0.00209024, + "balance_loss_clip": 1.08158088, + "balance_loss_mlp": 0.18347792, + "epoch": 0.6792123853900496, + "flos": 24826519152000.0, + "grad_norm": 48.418470308297486, + "language_loss": 0.79253566, + "learning_rate": 9.856439094633949e-07, + "loss": 0.80771244, + "num_input_tokens_seen": 243769380, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.25561523, + "step": 11297, + "time_per_iteration": 2.71862530708313 + }, + { + "auxiliary_loss_clip": 0.01334393, + "auxiliary_loss_mlp": 0.00231705, + "balance_loss_clip": 1.08842838, + "balance_loss_mlp": 0.20220038, + "epoch": 0.6792725086427176, + "flos": 17566279678080.0, + "grad_norm": 10.395497677228484, + "language_loss": 0.79382652, + "learning_rate": 9.853082745349918e-07, + "loss": 0.80948752, + "num_input_tokens_seen": 243785510, + "router_z_loss_clip": 2.45898438, + "router_z_loss_mlp": 0.29492188, + "step": 11298, + "time_per_iteration": 2.6678709983825684 + }, + { + "auxiliary_loss_clip": 0.01323229, + "auxiliary_loss_mlp": 0.00207873, + "balance_loss_clip": 1.08893299, + "balance_loss_mlp": 0.18262455, + "epoch": 0.6793326318953855, + "flos": 26941837040640.0, + "grad_norm": 220.97177703423156, + "language_loss": 0.7953952, + "learning_rate": 9.84972678083801e-07, + "loss": 0.81070626, + "num_input_tokens_seen": 243805545, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.25256348, + "step": 11299, + "time_per_iteration": 2.7395448684692383 + }, + { + "auxiliary_loss_clip": 0.01319469, + "auxiliary_loss_mlp": 0.00246936, + "balance_loss_clip": 1.08403754, + "balance_loss_mlp": 0.2201378, + "epoch": 0.6793927551480535, + "flos": 24318275662080.0, + "grad_norm": 59.382884595424066, + "language_loss": 0.82772326, + "learning_rate": 9.846371201225488e-07, + "loss": 0.84338737, + "num_input_tokens_seen": 243825185, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.26806641, + "step": 11300, + "time_per_iteration": 2.7566795349121094 + }, + { + "auxiliary_loss_clip": 0.01318948, + "auxiliary_loss_mlp": 0.00242405, + "balance_loss_clip": 1.0859015, + "balance_loss_mlp": 0.2138305, + "epoch": 0.6794528784007214, + "flos": 11436615757440.0, + "grad_norm": 92.33796106141028, + "language_loss": 0.73054093, + "learning_rate": 9.843016006639577e-07, + "loss": 0.74615443, + "num_input_tokens_seen": 243841600, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.28552246, + "step": 11301, + "time_per_iteration": 2.6600849628448486 + }, + { + "auxiliary_loss_clip": 0.01314419, + "auxiliary_loss_mlp": 0.00216771, + "balance_loss_clip": 1.08179462, + "balance_loss_mlp": 0.18979371, + "epoch": 0.6795130016533895, + "flos": 25229688382080.0, + "grad_norm": 42.61519046938056, + "language_loss": 0.89500463, + "learning_rate": 9.839661197207525e-07, + "loss": 0.91031653, + "num_input_tokens_seen": 243862250, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.26977539, + "step": 11302, + "time_per_iteration": 2.6935997009277344 + }, + { + "auxiliary_loss_clip": 0.01321088, + "auxiliary_loss_mlp": 0.00213752, + "balance_loss_clip": 1.08478713, + "balance_loss_mlp": 0.18677504, + "epoch": 0.6795731249060574, + "flos": 18296415434880.0, + "grad_norm": 101.81306409398864, + "language_loss": 0.78176069, + "learning_rate": 9.83630677305654e-07, + "loss": 0.79710907, + "num_input_tokens_seen": 243880560, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.26977539, + "step": 11303, + "time_per_iteration": 2.6947760581970215 + }, + { + "auxiliary_loss_clip": 0.01357681, + "auxiliary_loss_mlp": 0.00210743, + "balance_loss_clip": 1.10815465, + "balance_loss_mlp": 0.1826694, + "epoch": 0.6796332481587254, + "flos": 20300374183680.0, + "grad_norm": 2.655632736186707, + "language_loss": 0.80241382, + "learning_rate": 9.832952734313813e-07, + "loss": 0.81809807, + "num_input_tokens_seen": 243900635, + "router_z_loss_clip": 2.49609375, + "router_z_loss_mlp": 0.28088379, + "step": 11304, + "time_per_iteration": 2.668069362640381 + }, + { + "auxiliary_loss_clip": 0.01339115, + "auxiliary_loss_mlp": 0.00239059, + "balance_loss_clip": 1.10018468, + "balance_loss_mlp": 0.21028209, + "epoch": 0.6796933714113934, + "flos": 23586847015680.0, + "grad_norm": 24.265924647726155, + "language_loss": 0.82209778, + "learning_rate": 9.829599081106536e-07, + "loss": 0.83787954, + "num_input_tokens_seen": 243920160, + "router_z_loss_clip": 2.38867188, + "router_z_loss_mlp": 0.28747559, + "step": 11305, + "time_per_iteration": 2.7086780071258545 + }, + { + "auxiliary_loss_clip": 0.01333451, + "auxiliary_loss_mlp": 0.0023132, + "balance_loss_clip": 1.09788966, + "balance_loss_mlp": 0.20489147, + "epoch": 0.6797534946640613, + "flos": 27119917693440.0, + "grad_norm": 34.25944412804717, + "language_loss": 0.76861006, + "learning_rate": 9.826245813561882e-07, + "loss": 0.78425777, + "num_input_tokens_seen": 243939015, + "router_z_loss_clip": 2.35546875, + "router_z_loss_mlp": 0.26428223, + "step": 11306, + "time_per_iteration": 2.6930007934570312 + }, + { + "auxiliary_loss_clip": 0.01310056, + "auxiliary_loss_mlp": 0.00218521, + "balance_loss_clip": 1.08562684, + "balance_loss_mlp": 0.19328472, + "epoch": 0.6798136179167293, + "flos": 22127437428480.0, + "grad_norm": 39.614563097428835, + "language_loss": 0.86693501, + "learning_rate": 9.822892931807021e-07, + "loss": 0.88222086, + "num_input_tokens_seen": 243958470, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.25231934, + "step": 11307, + "time_per_iteration": 2.6930887699127197 + }, + { + "auxiliary_loss_clip": 0.01327176, + "auxiliary_loss_mlp": 0.00230771, + "balance_loss_clip": 1.09396303, + "balance_loss_mlp": 0.20427109, + "epoch": 0.6798737411693972, + "flos": 17488640430720.0, + "grad_norm": 3.113852430876501, + "language_loss": 0.94993961, + "learning_rate": 9.819540435969066e-07, + "loss": 0.96551907, + "num_input_tokens_seen": 243975450, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.26501465, + "step": 11308, + "time_per_iteration": 2.6412384510040283 + }, + { + "auxiliary_loss_clip": 0.01322778, + "auxiliary_loss_mlp": 0.00219867, + "balance_loss_clip": 1.08764672, + "balance_loss_mlp": 0.19185312, + "epoch": 0.6799338644220653, + "flos": 22892262744960.0, + "grad_norm": 1970.862576569574, + "language_loss": 0.81521916, + "learning_rate": 9.816188326175154e-07, + "loss": 0.83064568, + "num_input_tokens_seen": 243994355, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.27990723, + "step": 11309, + "time_per_iteration": 2.6705515384674072 + }, + { + "auxiliary_loss_clip": 0.01314644, + "auxiliary_loss_mlp": 0.00219119, + "balance_loss_clip": 1.0840137, + "balance_loss_mlp": 0.19116473, + "epoch": 0.6799939876747332, + "flos": 23180409648000.0, + "grad_norm": 2.0986490342564346, + "language_loss": 0.92172986, + "learning_rate": 9.812836602552411e-07, + "loss": 0.93706745, + "num_input_tokens_seen": 244011620, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.27954102, + "step": 11310, + "time_per_iteration": 2.6523659229278564 + }, + { + "auxiliary_loss_clip": 0.01301225, + "auxiliary_loss_mlp": 0.00204937, + "balance_loss_clip": 1.0768702, + "balance_loss_mlp": 0.18014117, + "epoch": 0.6800541109274012, + "flos": 19499925553920.0, + "grad_norm": 28.29991023288008, + "language_loss": 0.9104625, + "learning_rate": 9.80948526522792e-07, + "loss": 0.92552412, + "num_input_tokens_seen": 244029925, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.24780273, + "step": 11311, + "time_per_iteration": 2.711142063140869 + }, + { + "auxiliary_loss_clip": 0.01334849, + "auxiliary_loss_mlp": 0.00234293, + "balance_loss_clip": 1.09002721, + "balance_loss_mlp": 0.20477691, + "epoch": 0.6801142341800691, + "flos": 22277652105600.0, + "grad_norm": 33.71440914258333, + "language_loss": 0.8423481, + "learning_rate": 9.806134314328767e-07, + "loss": 0.85803956, + "num_input_tokens_seen": 244051225, + "router_z_loss_clip": 2.44726562, + "router_z_loss_mlp": 0.29504395, + "step": 11312, + "time_per_iteration": 2.744414806365967 + }, + { + "auxiliary_loss_clip": 0.01200189, + "auxiliary_loss_mlp": 0.00174708, + "balance_loss_clip": 1.04871559, + "balance_loss_mlp": 0.16359723, + "epoch": 0.6801743574327371, + "flos": 68714817759360.0, + "grad_norm": 0.974939239236735, + "language_loss": 0.56720746, + "learning_rate": 9.802783749982038e-07, + "loss": 0.5809564, + "num_input_tokens_seen": 244115930, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.11132812, + "step": 11313, + "time_per_iteration": 3.2752668857574463 + }, + { + "auxiliary_loss_clip": 0.01322041, + "auxiliary_loss_mlp": 0.00228143, + "balance_loss_clip": 1.0875268, + "balance_loss_mlp": 0.20282267, + "epoch": 0.680234480685405, + "flos": 29460467813760.0, + "grad_norm": 33.91793621755427, + "language_loss": 0.75060344, + "learning_rate": 9.799433572314754e-07, + "loss": 0.76610529, + "num_input_tokens_seen": 244137320, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.25341797, + "step": 11314, + "time_per_iteration": 2.7429568767547607 + }, + { + "auxiliary_loss_clip": 0.01320048, + "auxiliary_loss_mlp": 0.00225395, + "balance_loss_clip": 1.0877316, + "balance_loss_mlp": 0.19845358, + "epoch": 0.6802946039380731, + "flos": 15916866122880.0, + "grad_norm": 937.3890076649097, + "language_loss": 0.87088001, + "learning_rate": 9.796083781453972e-07, + "loss": 0.88633442, + "num_input_tokens_seen": 244152755, + "router_z_loss_clip": 2.32226562, + "router_z_loss_mlp": 0.26916504, + "step": 11315, + "time_per_iteration": 2.601853847503662 + }, + { + "auxiliary_loss_clip": 0.01319458, + "auxiliary_loss_mlp": 0.00231848, + "balance_loss_clip": 1.08410025, + "balance_loss_mlp": 0.20512152, + "epoch": 0.680354727190741, + "flos": 22018664067840.0, + "grad_norm": 25.556567692300085, + "language_loss": 0.77385712, + "learning_rate": 9.792734377526718e-07, + "loss": 0.78937018, + "num_input_tokens_seen": 244171480, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.26757812, + "step": 11316, + "time_per_iteration": 2.695786714553833 + }, + { + "auxiliary_loss_clip": 0.01324297, + "auxiliary_loss_mlp": 0.0024933, + "balance_loss_clip": 1.09068692, + "balance_loss_mlp": 0.22138724, + "epoch": 0.680414850443409, + "flos": 18441494467200.0, + "grad_norm": 4.618903987695485, + "language_loss": 0.75081897, + "learning_rate": 9.789385360660003e-07, + "loss": 0.76655531, + "num_input_tokens_seen": 244187920, + "router_z_loss_clip": 2.33789062, + "router_z_loss_mlp": 0.27954102, + "step": 11317, + "time_per_iteration": 2.6427836418151855 + }, + { + "auxiliary_loss_clip": 0.01327075, + "auxiliary_loss_mlp": 0.00223803, + "balance_loss_clip": 1.0897119, + "balance_loss_mlp": 0.19665933, + "epoch": 0.680474973696077, + "flos": 26358611909760.0, + "grad_norm": 4.1378780023082, + "language_loss": 0.83072293, + "learning_rate": 9.78603673098082e-07, + "loss": 0.84623176, + "num_input_tokens_seen": 244209565, + "router_z_loss_clip": 2.37695312, + "router_z_loss_mlp": 0.27148438, + "step": 11318, + "time_per_iteration": 2.711071729660034 + }, + { + "auxiliary_loss_clip": 0.01289979, + "auxiliary_loss_mlp": 0.0021701, + "balance_loss_clip": 1.06784201, + "balance_loss_mlp": 0.19357356, + "epoch": 0.6805350969487449, + "flos": 18333116156160.0, + "grad_norm": 2.9069256391834766, + "language_loss": 0.74351805, + "learning_rate": 9.782688488616143e-07, + "loss": 0.75858796, + "num_input_tokens_seen": 244228015, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.234375, + "step": 11319, + "time_per_iteration": 2.6153879165649414 + }, + { + "auxiliary_loss_clip": 0.01312093, + "auxiliary_loss_mlp": 0.00205154, + "balance_loss_clip": 1.08154595, + "balance_loss_mlp": 0.17873746, + "epoch": 0.6805952202014129, + "flos": 19937497034880.0, + "grad_norm": 488.93365288879244, + "language_loss": 0.83895481, + "learning_rate": 9.779340633692945e-07, + "loss": 0.85412729, + "num_input_tokens_seen": 244245615, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.26416016, + "step": 11320, + "time_per_iteration": 2.717703342437744 + }, + { + "auxiliary_loss_clip": 0.01327164, + "auxiliary_loss_mlp": 0.00232659, + "balance_loss_clip": 1.09228742, + "balance_loss_mlp": 0.20558643, + "epoch": 0.6806553434540809, + "flos": 25224301342080.0, + "grad_norm": 38.394007313471306, + "language_loss": 0.81871909, + "learning_rate": 9.77599316633817e-07, + "loss": 0.83431733, + "num_input_tokens_seen": 244263625, + "router_z_loss_clip": 2.34960938, + "router_z_loss_mlp": 0.27062988, + "step": 11321, + "time_per_iteration": 2.665618419647217 + }, + { + "auxiliary_loss_clip": 0.01326938, + "auxiliary_loss_mlp": 0.00245909, + "balance_loss_clip": 1.09138346, + "balance_loss_mlp": 0.21906283, + "epoch": 0.6807154667067489, + "flos": 17785586165760.0, + "grad_norm": 2.439947391372625, + "language_loss": 0.7961266, + "learning_rate": 9.772646086678758e-07, + "loss": 0.81185508, + "num_input_tokens_seen": 244282745, + "router_z_loss_clip": 2.35546875, + "router_z_loss_mlp": 0.26843262, + "step": 11322, + "time_per_iteration": 2.760840654373169 + }, + { + "auxiliary_loss_clip": 0.01315106, + "auxiliary_loss_mlp": 0.00229435, + "balance_loss_clip": 1.08031905, + "balance_loss_mlp": 0.2026011, + "epoch": 0.6807755899594168, + "flos": 22199905117440.0, + "grad_norm": 21.081301451474275, + "language_loss": 0.86804044, + "learning_rate": 9.769299394841638e-07, + "loss": 0.88348585, + "num_input_tokens_seen": 244303770, + "router_z_loss_clip": 2.34960938, + "router_z_loss_mlp": 0.26818848, + "step": 11323, + "time_per_iteration": 2.7937393188476562 + }, + { + "auxiliary_loss_clip": 0.0117847, + "auxiliary_loss_mlp": 0.00161085, + "balance_loss_clip": 1.02806103, + "balance_loss_mlp": 0.15107137, + "epoch": 0.6808357132120848, + "flos": 68631073200000.0, + "grad_norm": 0.7786063341856444, + "language_loss": 0.56555778, + "learning_rate": 9.765953090953714e-07, + "loss": 0.57895333, + "num_input_tokens_seen": 244355910, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.10009766, + "step": 11324, + "time_per_iteration": 2.9567527770996094 + }, + { + "auxiliary_loss_clip": 0.01335103, + "auxiliary_loss_mlp": 0.00234431, + "balance_loss_clip": 1.09821689, + "balance_loss_mlp": 0.20704909, + "epoch": 0.6808958364647527, + "flos": 23843357015040.0, + "grad_norm": 3.7369134854781287, + "language_loss": 0.7840873, + "learning_rate": 9.76260717514186e-07, + "loss": 0.79978263, + "num_input_tokens_seen": 244376610, + "router_z_loss_clip": 2.37109375, + "router_z_loss_mlp": 0.27380371, + "step": 11325, + "time_per_iteration": 2.688129186630249 + }, + { + "auxiliary_loss_clip": 0.01327312, + "auxiliary_loss_mlp": 0.00227113, + "balance_loss_clip": 1.08764386, + "balance_loss_mlp": 0.19679812, + "epoch": 0.6809559597174207, + "flos": 17711717846400.0, + "grad_norm": 12.87350082893165, + "language_loss": 0.77954024, + "learning_rate": 9.759261647532974e-07, + "loss": 0.79508448, + "num_input_tokens_seen": 244393000, + "router_z_loss_clip": 2.39453125, + "router_z_loss_mlp": 0.30285645, + "step": 11326, + "time_per_iteration": 2.6052799224853516 + }, + { + "auxiliary_loss_clip": 0.01318836, + "auxiliary_loss_mlp": 0.0022485, + "balance_loss_clip": 1.08550775, + "balance_loss_mlp": 0.19901715, + "epoch": 0.6810160829700886, + "flos": 22491894775680.0, + "grad_norm": 131.43899559331504, + "language_loss": 0.79321003, + "learning_rate": 9.75591650825392e-07, + "loss": 0.80864692, + "num_input_tokens_seen": 244409515, + "router_z_loss_clip": 2.3359375, + "router_z_loss_mlp": 0.25854492, + "step": 11327, + "time_per_iteration": 2.670639991760254 + }, + { + "auxiliary_loss_clip": 0.01332608, + "auxiliary_loss_mlp": 0.00230138, + "balance_loss_clip": 1.09620059, + "balance_loss_mlp": 0.20167072, + "epoch": 0.6810762062227567, + "flos": 16832875783680.0, + "grad_norm": 294.0619182677346, + "language_loss": 0.84627366, + "learning_rate": 9.752571757431526e-07, + "loss": 0.86190116, + "num_input_tokens_seen": 244427165, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.28491211, + "step": 11328, + "time_per_iteration": 2.6032655239105225 + }, + { + "auxiliary_loss_clip": 0.01322629, + "auxiliary_loss_mlp": 0.00230872, + "balance_loss_clip": 1.08326077, + "balance_loss_mlp": 0.20316795, + "epoch": 0.6811363294754246, + "flos": 12714676554240.0, + "grad_norm": 26.96706487279339, + "language_loss": 0.73656809, + "learning_rate": 9.74922739519265e-07, + "loss": 0.75210315, + "num_input_tokens_seen": 244445705, + "router_z_loss_clip": 2.39257812, + "router_z_loss_mlp": 0.27697754, + "step": 11329, + "time_per_iteration": 4.0281922817230225 + }, + { + "auxiliary_loss_clip": 0.01325922, + "auxiliary_loss_mlp": 0.00223582, + "balance_loss_clip": 1.08471787, + "balance_loss_mlp": 0.19649783, + "epoch": 0.6811964527280926, + "flos": 17711969241600.0, + "grad_norm": 19.541979627151445, + "language_loss": 0.8559655, + "learning_rate": 9.745883421664096e-07, + "loss": 0.87146056, + "num_input_tokens_seen": 244460415, + "router_z_loss_clip": 2.41210938, + "router_z_loss_mlp": 0.27050781, + "step": 11330, + "time_per_iteration": 2.6189651489257812 + }, + { + "auxiliary_loss_clip": 0.01320902, + "auxiliary_loss_mlp": 0.00225218, + "balance_loss_clip": 1.08924818, + "balance_loss_mlp": 0.19758505, + "epoch": 0.6812565759807605, + "flos": 24863471268480.0, + "grad_norm": 29.8491246882194, + "language_loss": 0.71888965, + "learning_rate": 9.742539836972665e-07, + "loss": 0.73435086, + "num_input_tokens_seen": 244480555, + "router_z_loss_clip": 2.31835938, + "router_z_loss_mlp": 0.27636719, + "step": 11331, + "time_per_iteration": 4.166785001754761 + }, + { + "auxiliary_loss_clip": 0.01326166, + "auxiliary_loss_mlp": 0.00221086, + "balance_loss_clip": 1.09001553, + "balance_loss_mlp": 0.19387066, + "epoch": 0.6813166992334285, + "flos": 17166019449600.0, + "grad_norm": 60.32069536375336, + "language_loss": 0.80901849, + "learning_rate": 9.739196641245148e-07, + "loss": 0.82449102, + "num_input_tokens_seen": 244498540, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.27233887, + "step": 11332, + "time_per_iteration": 2.6404590606689453 + }, + { + "auxiliary_loss_clip": 0.01318961, + "auxiliary_loss_mlp": 0.00239364, + "balance_loss_clip": 1.0839808, + "balance_loss_mlp": 0.21410345, + "epoch": 0.6813768224860965, + "flos": 18843550375680.0, + "grad_norm": 89.75654103361083, + "language_loss": 0.83005482, + "learning_rate": 9.735853834608326e-07, + "loss": 0.8456381, + "num_input_tokens_seen": 244517015, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.25280762, + "step": 11333, + "time_per_iteration": 2.6837387084960938 + }, + { + "auxiliary_loss_clip": 0.01347256, + "auxiliary_loss_mlp": 0.00249319, + "balance_loss_clip": 1.10680425, + "balance_loss_mlp": 0.22166267, + "epoch": 0.6814369457387645, + "flos": 24532733813760.0, + "grad_norm": 11.328477005705668, + "language_loss": 0.77963328, + "learning_rate": 9.732511417188963e-07, + "loss": 0.79559898, + "num_input_tokens_seen": 244537450, + "router_z_loss_clip": 2.40429688, + "router_z_loss_mlp": 0.27648926, + "step": 11334, + "time_per_iteration": 2.6901237964630127 + }, + { + "auxiliary_loss_clip": 0.01315, + "auxiliary_loss_mlp": 0.00230105, + "balance_loss_clip": 1.08718622, + "balance_loss_mlp": 0.20571497, + "epoch": 0.6814970689914325, + "flos": 18222978078720.0, + "grad_norm": 31.200579773330176, + "language_loss": 0.92576712, + "learning_rate": 9.729169389113791e-07, + "loss": 0.94121814, + "num_input_tokens_seen": 244555640, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.24389648, + "step": 11335, + "time_per_iteration": 4.090856313705444 + }, + { + "auxiliary_loss_clip": 0.01313851, + "auxiliary_loss_mlp": 0.00219836, + "balance_loss_clip": 1.08630514, + "balance_loss_mlp": 0.19438514, + "epoch": 0.6815571922441004, + "flos": 25228790542080.0, + "grad_norm": 28.67166025867169, + "language_loss": 0.89645892, + "learning_rate": 9.725827750509542e-07, + "loss": 0.91179574, + "num_input_tokens_seen": 244574005, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.25463867, + "step": 11336, + "time_per_iteration": 2.684993028640747 + }, + { + "auxiliary_loss_clip": 0.01300377, + "auxiliary_loss_mlp": 0.00237905, + "balance_loss_clip": 1.07524395, + "balance_loss_mlp": 0.2123228, + "epoch": 0.6816173154967684, + "flos": 19456078026240.0, + "grad_norm": 6.2520092393676885, + "language_loss": 0.88289475, + "learning_rate": 9.72248650150294e-07, + "loss": 0.89827752, + "num_input_tokens_seen": 244591395, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.2557373, + "step": 11337, + "time_per_iteration": 2.6603710651397705 + }, + { + "auxiliary_loss_clip": 0.01290707, + "auxiliary_loss_mlp": 0.00206683, + "balance_loss_clip": 1.06719398, + "balance_loss_mlp": 0.18092155, + "epoch": 0.6816774387494363, + "flos": 17931455297280.0, + "grad_norm": 31.692101224895776, + "language_loss": 0.78996348, + "learning_rate": 9.719145642220673e-07, + "loss": 0.80493742, + "num_input_tokens_seen": 244610400, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.25756836, + "step": 11338, + "time_per_iteration": 4.083198547363281 + }, + { + "auxiliary_loss_clip": 0.01306936, + "auxiliary_loss_mlp": 0.00229136, + "balance_loss_clip": 1.07847929, + "balance_loss_mlp": 0.2047455, + "epoch": 0.6817375620021043, + "flos": 22233014478720.0, + "grad_norm": 6.200340893422323, + "language_loss": 0.84643358, + "learning_rate": 9.715805172789435e-07, + "loss": 0.86179429, + "num_input_tokens_seen": 244630400, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.24389648, + "step": 11339, + "time_per_iteration": 2.6369268894195557 + }, + { + "auxiliary_loss_clip": 0.01328998, + "auxiliary_loss_mlp": 0.00244791, + "balance_loss_clip": 1.0965687, + "balance_loss_mlp": 0.21794489, + "epoch": 0.6817976852547722, + "flos": 25374408278400.0, + "grad_norm": 12.272080836668996, + "language_loss": 0.7915557, + "learning_rate": 9.712465093335901e-07, + "loss": 0.80729353, + "num_input_tokens_seen": 244649155, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.26855469, + "step": 11340, + "time_per_iteration": 2.7062699794769287 + }, + { + "auxiliary_loss_clip": 0.01350481, + "auxiliary_loss_mlp": 0.00213112, + "balance_loss_clip": 1.10318327, + "balance_loss_mlp": 0.18278527, + "epoch": 0.6818578085074403, + "flos": 22265764704000.0, + "grad_norm": 5.079924212906128, + "language_loss": 0.97659248, + "learning_rate": 9.709125403986722e-07, + "loss": 0.99222845, + "num_input_tokens_seen": 244665470, + "router_z_loss_clip": 2.47460938, + "router_z_loss_mlp": 0.30322266, + "step": 11341, + "time_per_iteration": 2.6394901275634766 + }, + { + "auxiliary_loss_clip": 0.01310797, + "auxiliary_loss_mlp": 0.00224593, + "balance_loss_clip": 1.07766902, + "balance_loss_mlp": 0.19685295, + "epoch": 0.6819179317601082, + "flos": 19318145800320.0, + "grad_norm": 7.757833861847879, + "language_loss": 0.76598269, + "learning_rate": 9.705786104868531e-07, + "loss": 0.78133655, + "num_input_tokens_seen": 244684390, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.27746582, + "step": 11342, + "time_per_iteration": 2.782640218734741 + }, + { + "auxiliary_loss_clip": 0.01301721, + "auxiliary_loss_mlp": 0.00233328, + "balance_loss_clip": 1.07561302, + "balance_loss_mlp": 0.2066485, + "epoch": 0.6819780550127762, + "flos": 21104126864640.0, + "grad_norm": 350.5037640289769, + "language_loss": 0.81567067, + "learning_rate": 9.702447196107963e-07, + "loss": 0.83102113, + "num_input_tokens_seen": 244703370, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.26672363, + "step": 11343, + "time_per_iteration": 2.6717472076416016 + }, + { + "auxiliary_loss_clip": 0.01326219, + "auxiliary_loss_mlp": 0.00221837, + "balance_loss_clip": 1.09693468, + "balance_loss_mlp": 0.19558708, + "epoch": 0.6820381782654441, + "flos": 29716403195520.0, + "grad_norm": 4.5198485772932635, + "language_loss": 0.86427736, + "learning_rate": 9.699108677831639e-07, + "loss": 0.87975788, + "num_input_tokens_seen": 244723325, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.2623291, + "step": 11344, + "time_per_iteration": 2.7020976543426514 + }, + { + "auxiliary_loss_clip": 0.01315913, + "auxiliary_loss_mlp": 0.00237559, + "balance_loss_clip": 1.08190072, + "balance_loss_mlp": 0.21036755, + "epoch": 0.6820983015181121, + "flos": 29242130993280.0, + "grad_norm": 18.31676394500806, + "language_loss": 0.72755826, + "learning_rate": 9.695770550166136e-07, + "loss": 0.74309295, + "num_input_tokens_seen": 244745650, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.27172852, + "step": 11345, + "time_per_iteration": 2.7447431087493896 + }, + { + "auxiliary_loss_clip": 0.01343647, + "auxiliary_loss_mlp": 0.00215412, + "balance_loss_clip": 1.10537243, + "balance_loss_mlp": 0.18639693, + "epoch": 0.6821584247707801, + "flos": 18871775487360.0, + "grad_norm": 108.89467184277882, + "language_loss": 0.75636524, + "learning_rate": 9.692432813238054e-07, + "loss": 0.77195579, + "num_input_tokens_seen": 244760270, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.2902832, + "step": 11346, + "time_per_iteration": 2.6062843799591064 + }, + { + "auxiliary_loss_clip": 0.0134568, + "auxiliary_loss_mlp": 0.00228426, + "balance_loss_clip": 1.10518646, + "balance_loss_mlp": 0.20079292, + "epoch": 0.6822185480234481, + "flos": 21324582587520.0, + "grad_norm": 3.6535129832131066, + "language_loss": 0.85008955, + "learning_rate": 9.689095467173952e-07, + "loss": 0.8658306, + "num_input_tokens_seen": 244779565, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.27648926, + "step": 11347, + "time_per_iteration": 2.6937942504882812 + }, + { + "auxiliary_loss_clip": 0.01170686, + "auxiliary_loss_mlp": 0.00099261, + "balance_loss_clip": 1.01774716, + "balance_loss_mlp": 0.09234683, + "epoch": 0.6822786712761161, + "flos": 63488306430720.0, + "grad_norm": 0.7094703262582747, + "language_loss": 0.51985794, + "learning_rate": 9.685758512100378e-07, + "loss": 0.53255737, + "num_input_tokens_seen": 244838480, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.06933594, + "step": 11348, + "time_per_iteration": 3.148404359817505 + }, + { + "auxiliary_loss_clip": 0.01328772, + "auxiliary_loss_mlp": 0.00244963, + "balance_loss_clip": 1.09647274, + "balance_loss_mlp": 0.21651936, + "epoch": 0.682338794528784, + "flos": 21068934514560.0, + "grad_norm": 14.922133044727996, + "language_loss": 0.85822189, + "learning_rate": 9.682421948143873e-07, + "loss": 0.87395924, + "num_input_tokens_seen": 244855265, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.2845459, + "step": 11349, + "time_per_iteration": 2.6567888259887695 + }, + { + "auxiliary_loss_clip": 0.01376108, + "auxiliary_loss_mlp": 0.00254506, + "balance_loss_clip": 1.12300181, + "balance_loss_mlp": 0.22241443, + "epoch": 0.682398917781452, + "flos": 36283243547520.0, + "grad_norm": 7.228374132193785, + "language_loss": 0.83979142, + "learning_rate": 9.67908577543096e-07, + "loss": 0.85609758, + "num_input_tokens_seen": 244875555, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.32104492, + "step": 11350, + "time_per_iteration": 2.7415237426757812 + }, + { + "auxiliary_loss_clip": 0.01329801, + "auxiliary_loss_mlp": 0.00235319, + "balance_loss_clip": 1.0942204, + "balance_loss_mlp": 0.20894971, + "epoch": 0.6824590410341199, + "flos": 24859197550080.0, + "grad_norm": 4.663517850575071, + "language_loss": 0.8563695, + "learning_rate": 9.675749994088161e-07, + "loss": 0.87202072, + "num_input_tokens_seen": 244895270, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.26391602, + "step": 11351, + "time_per_iteration": 2.6771934032440186 + }, + { + "auxiliary_loss_clip": 0.01320978, + "auxiliary_loss_mlp": 0.00220047, + "balance_loss_clip": 1.08888996, + "balance_loss_mlp": 0.19408366, + "epoch": 0.6825191642867879, + "flos": 22452392793600.0, + "grad_norm": 8.209618898660578, + "language_loss": 0.79626787, + "learning_rate": 9.672414604241954e-07, + "loss": 0.81167817, + "num_input_tokens_seen": 244914535, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.2598877, + "step": 11352, + "time_per_iteration": 2.6202590465545654 + }, + { + "auxiliary_loss_clip": 0.01339779, + "auxiliary_loss_mlp": 0.00219549, + "balance_loss_clip": 1.10274887, + "balance_loss_mlp": 0.19164234, + "epoch": 0.6825792875394558, + "flos": 29424377623680.0, + "grad_norm": 8.481243143774462, + "language_loss": 0.87259305, + "learning_rate": 9.669079606018814e-07, + "loss": 0.88818634, + "num_input_tokens_seen": 244936095, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.2791748, + "step": 11353, + "time_per_iteration": 2.7030444145202637 + }, + { + "auxiliary_loss_clip": 0.0134183, + "auxiliary_loss_mlp": 0.00208366, + "balance_loss_clip": 1.10182726, + "balance_loss_mlp": 0.18079241, + "epoch": 0.6826394107921239, + "flos": 18770974945920.0, + "grad_norm": 4.3522275685777, + "language_loss": 0.86253417, + "learning_rate": 9.665744999545218e-07, + "loss": 0.87803614, + "num_input_tokens_seen": 244955290, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.27575684, + "step": 11354, + "time_per_iteration": 2.6290395259857178 + }, + { + "auxiliary_loss_clip": 0.01334799, + "auxiliary_loss_mlp": 0.00217497, + "balance_loss_clip": 1.09897757, + "balance_loss_mlp": 0.18980497, + "epoch": 0.6826995340447918, + "flos": 16617591619200.0, + "grad_norm": 8.318967449939317, + "language_loss": 0.71902382, + "learning_rate": 9.662410784947599e-07, + "loss": 0.73454678, + "num_input_tokens_seen": 244972935, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.27697754, + "step": 11355, + "time_per_iteration": 2.659524917602539 + }, + { + "auxiliary_loss_clip": 0.01323209, + "auxiliary_loss_mlp": 0.00207773, + "balance_loss_clip": 1.08947253, + "balance_loss_mlp": 0.18139245, + "epoch": 0.6827596572974598, + "flos": 20848299223680.0, + "grad_norm": 6.983634545652979, + "language_loss": 0.89719748, + "learning_rate": 9.659076962352398e-07, + "loss": 0.9125073, + "num_input_tokens_seen": 244989440, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.26379395, + "step": 11356, + "time_per_iteration": 2.638920545578003 + }, + { + "auxiliary_loss_clip": 0.01352767, + "auxiliary_loss_mlp": 0.0022392, + "balance_loss_clip": 1.10921419, + "balance_loss_mlp": 0.19600117, + "epoch": 0.6828197805501277, + "flos": 22748081552640.0, + "grad_norm": 24.05930095201894, + "language_loss": 0.85512888, + "learning_rate": 9.655743531886052e-07, + "loss": 0.87089574, + "num_input_tokens_seen": 245007830, + "router_z_loss_clip": 2.43554688, + "router_z_loss_mlp": 0.27929688, + "step": 11357, + "time_per_iteration": 2.6437485218048096 + }, + { + "auxiliary_loss_clip": 0.01152857, + "auxiliary_loss_mlp": 0.00123835, + "balance_loss_clip": 1.00159669, + "balance_loss_mlp": 0.11630123, + "epoch": 0.6828799038027957, + "flos": 71646565829760.0, + "grad_norm": 0.8085626736641585, + "language_loss": 0.59049946, + "learning_rate": 9.65241049367493e-07, + "loss": 0.60326636, + "num_input_tokens_seen": 245070720, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.07519531, + "step": 11358, + "time_per_iteration": 3.210094928741455 + }, + { + "auxiliary_loss_clip": 0.0132394, + "auxiliary_loss_mlp": 0.00233212, + "balance_loss_clip": 1.08705711, + "balance_loss_mlp": 0.20475665, + "epoch": 0.6829400270554637, + "flos": 19829154637440.0, + "grad_norm": 27.771926705006624, + "language_loss": 0.88635445, + "learning_rate": 9.64907784784544e-07, + "loss": 0.90192604, + "num_input_tokens_seen": 245089070, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.28430176, + "step": 11359, + "time_per_iteration": 2.679016351699829 + }, + { + "auxiliary_loss_clip": 0.01350268, + "auxiliary_loss_mlp": 0.00240355, + "balance_loss_clip": 1.10180569, + "balance_loss_mlp": 0.2100763, + "epoch": 0.6830001503081317, + "flos": 21980634543360.0, + "grad_norm": 11.285460129189289, + "language_loss": 0.91048473, + "learning_rate": 9.645745594523958e-07, + "loss": 0.92639101, + "num_input_tokens_seen": 245106500, + "router_z_loss_clip": 2.48242188, + "router_z_loss_mlp": 0.30273438, + "step": 11360, + "time_per_iteration": 2.7041373252868652 + }, + { + "auxiliary_loss_clip": 0.01320147, + "auxiliary_loss_mlp": 0.00216439, + "balance_loss_clip": 1.08792067, + "balance_loss_mlp": 0.18823454, + "epoch": 0.6830602735607997, + "flos": 24316767290880.0, + "grad_norm": 24.022104849084034, + "language_loss": 0.80748415, + "learning_rate": 9.642413733836844e-07, + "loss": 0.82285005, + "num_input_tokens_seen": 245125260, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.28222656, + "step": 11361, + "time_per_iteration": 2.679302453994751 + }, + { + "auxiliary_loss_clip": 0.01163024, + "auxiliary_loss_mlp": 0.00105032, + "balance_loss_clip": 1.0107224, + "balance_loss_mlp": 0.09678274, + "epoch": 0.6831203968134676, + "flos": 57690062323200.0, + "grad_norm": 0.8537423254233002, + "language_loss": 0.58059853, + "learning_rate": 9.639082265910437e-07, + "loss": 0.59327906, + "num_input_tokens_seen": 245188730, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.08251953, + "step": 11362, + "time_per_iteration": 3.212855577468872 + }, + { + "auxiliary_loss_clip": 0.01323737, + "auxiliary_loss_mlp": 0.0022489, + "balance_loss_clip": 1.08710849, + "balance_loss_mlp": 0.19328766, + "epoch": 0.6831805200661356, + "flos": 14388436552320.0, + "grad_norm": 66.45598451697782, + "language_loss": 0.86763436, + "learning_rate": 9.635751190871074e-07, + "loss": 0.88312066, + "num_input_tokens_seen": 245205065, + "router_z_loss_clip": 2.37109375, + "router_z_loss_mlp": 0.31567383, + "step": 11363, + "time_per_iteration": 2.6062698364257812 + }, + { + "auxiliary_loss_clip": 0.01323461, + "auxiliary_loss_mlp": 0.00235961, + "balance_loss_clip": 1.09624028, + "balance_loss_mlp": 0.20677876, + "epoch": 0.6832406433188035, + "flos": 22820297846400.0, + "grad_norm": 14.78062015443939, + "language_loss": 0.99370527, + "learning_rate": 9.632420508845063e-07, + "loss": 1.00929952, + "num_input_tokens_seen": 245224265, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.29150391, + "step": 11364, + "time_per_iteration": 2.712170124053955 + }, + { + "auxiliary_loss_clip": 0.01330578, + "auxiliary_loss_mlp": 0.00215283, + "balance_loss_clip": 1.09652936, + "balance_loss_mlp": 0.18841347, + "epoch": 0.6833007665714715, + "flos": 17561718650880.0, + "grad_norm": 6.596449783504465, + "language_loss": 0.9476493, + "learning_rate": 9.629090219958697e-07, + "loss": 0.96310788, + "num_input_tokens_seen": 245243360, + "router_z_loss_clip": 2.3359375, + "router_z_loss_mlp": 0.26843262, + "step": 11365, + "time_per_iteration": 2.6384382247924805 + }, + { + "auxiliary_loss_clip": 0.01350287, + "auxiliary_loss_mlp": 0.00214927, + "balance_loss_clip": 1.10680246, + "balance_loss_mlp": 0.18499368, + "epoch": 0.6833608898241395, + "flos": 22445928345600.0, + "grad_norm": 146.52456472484948, + "language_loss": 0.92077136, + "learning_rate": 9.625760324338272e-07, + "loss": 0.93642348, + "num_input_tokens_seen": 245256350, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.29919434, + "step": 11366, + "time_per_iteration": 2.6060526371002197 + }, + { + "auxiliary_loss_clip": 0.01327476, + "auxiliary_loss_mlp": 0.00218719, + "balance_loss_clip": 1.09189796, + "balance_loss_mlp": 0.18977532, + "epoch": 0.6834210130768075, + "flos": 24534637234560.0, + "grad_norm": 5.405924753602497, + "language_loss": 0.83705574, + "learning_rate": 9.622430822110062e-07, + "loss": 0.85251766, + "num_input_tokens_seen": 245277575, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.28930664, + "step": 11367, + "time_per_iteration": 2.790229320526123 + }, + { + "auxiliary_loss_clip": 0.01339277, + "auxiliary_loss_mlp": 0.0022667, + "balance_loss_clip": 1.09930801, + "balance_loss_mlp": 0.19711784, + "epoch": 0.6834811363294754, + "flos": 20047132321920.0, + "grad_norm": 21.72607634374216, + "language_loss": 0.77226353, + "learning_rate": 9.619101713400312e-07, + "loss": 0.78792298, + "num_input_tokens_seen": 245296615, + "router_z_loss_clip": 2.40234375, + "router_z_loss_mlp": 0.29528809, + "step": 11368, + "time_per_iteration": 2.6209146976470947 + }, + { + "auxiliary_loss_clip": 0.01319464, + "auxiliary_loss_mlp": 0.00210065, + "balance_loss_clip": 1.08541298, + "balance_loss_mlp": 0.18253976, + "epoch": 0.6835412595821434, + "flos": 24790752184320.0, + "grad_norm": 7.468361554133743, + "language_loss": 0.81972206, + "learning_rate": 9.615772998335261e-07, + "loss": 0.83501738, + "num_input_tokens_seen": 245316275, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.27514648, + "step": 11369, + "time_per_iteration": 2.6648309230804443 + }, + { + "auxiliary_loss_clip": 0.01345137, + "auxiliary_loss_mlp": 0.0021839, + "balance_loss_clip": 1.10420227, + "balance_loss_mlp": 0.18688326, + "epoch": 0.6836013828348113, + "flos": 19500356517120.0, + "grad_norm": 11.566002239938404, + "language_loss": 0.87424445, + "learning_rate": 9.612444677041138e-07, + "loss": 0.88987976, + "num_input_tokens_seen": 245334595, + "router_z_loss_clip": 2.41210938, + "router_z_loss_mlp": 0.31506348, + "step": 11370, + "time_per_iteration": 2.6069774627685547 + }, + { + "auxiliary_loss_clip": 0.01156683, + "auxiliary_loss_mlp": 0.00127264, + "balance_loss_clip": 1.00525641, + "balance_loss_mlp": 0.11849048, + "epoch": 0.6836615060874793, + "flos": 58363999251840.0, + "grad_norm": 0.7846285360047448, + "language_loss": 0.59606874, + "learning_rate": 9.609116749644162e-07, + "loss": 0.60890818, + "num_input_tokens_seen": 245389750, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.08789062, + "step": 11371, + "time_per_iteration": 3.049044132232666 + }, + { + "auxiliary_loss_clip": 0.0130859, + "auxiliary_loss_mlp": 0.00212791, + "balance_loss_clip": 1.08044767, + "balance_loss_mlp": 0.18596914, + "epoch": 0.6837216293401474, + "flos": 12166895168640.0, + "grad_norm": 9617.15315793117, + "language_loss": 0.71199423, + "learning_rate": 9.605789216270511e-07, + "loss": 0.72720802, + "num_input_tokens_seen": 245407530, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.26806641, + "step": 11372, + "time_per_iteration": 4.057488203048706 + }, + { + "auxiliary_loss_clip": 0.01334152, + "auxiliary_loss_mlp": 0.0022438, + "balance_loss_clip": 1.1017971, + "balance_loss_mlp": 0.19762969, + "epoch": 0.6837817525928153, + "flos": 22127581082880.0, + "grad_norm": 20.63110418043011, + "language_loss": 0.79280746, + "learning_rate": 9.602462077046375e-07, + "loss": 0.80839282, + "num_input_tokens_seen": 245427000, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.26757812, + "step": 11373, + "time_per_iteration": 4.1181488037109375 + }, + { + "auxiliary_loss_clip": 0.01142562, + "auxiliary_loss_mlp": 0.00088774, + "balance_loss_clip": 0.99122751, + "balance_loss_mlp": 0.07980987, + "epoch": 0.6838418758454833, + "flos": 65005928985600.0, + "grad_norm": 1.1859871045653942, + "language_loss": 0.56307733, + "learning_rate": 9.599135332097935e-07, + "loss": 0.5753907, + "num_input_tokens_seen": 245491620, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.08984375, + "step": 11374, + "time_per_iteration": 3.2921030521392822 + }, + { + "auxiliary_loss_clip": 0.01341109, + "auxiliary_loss_mlp": 0.0021781, + "balance_loss_clip": 1.10525179, + "balance_loss_mlp": 0.1908807, + "epoch": 0.6839019990981512, + "flos": 21030833162880.0, + "grad_norm": 336.473778664986, + "language_loss": 0.82802689, + "learning_rate": 9.595808981551312e-07, + "loss": 0.84361607, + "num_input_tokens_seen": 245511285, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.26928711, + "step": 11375, + "time_per_iteration": 2.697084426879883 + }, + { + "auxiliary_loss_clip": 0.01324292, + "auxiliary_loss_mlp": 0.00215984, + "balance_loss_clip": 1.09170496, + "balance_loss_mlp": 0.18696821, + "epoch": 0.6839621223508192, + "flos": 24935543907840.0, + "grad_norm": 178.41062312990812, + "language_loss": 0.76421791, + "learning_rate": 9.592483025532651e-07, + "loss": 0.77962071, + "num_input_tokens_seen": 245532910, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.29003906, + "step": 11376, + "time_per_iteration": 2.6964218616485596 + }, + { + "auxiliary_loss_clip": 0.01338336, + "auxiliary_loss_mlp": 0.00232872, + "balance_loss_clip": 1.09691238, + "balance_loss_mlp": 0.20272407, + "epoch": 0.6840222456034871, + "flos": 26358827391360.0, + "grad_norm": 980.3329566996595, + "language_loss": 0.81719315, + "learning_rate": 9.58915746416808e-07, + "loss": 0.83290529, + "num_input_tokens_seen": 245550540, + "router_z_loss_clip": 2.41601562, + "router_z_loss_mlp": 0.30151367, + "step": 11377, + "time_per_iteration": 4.113945960998535 + }, + { + "auxiliary_loss_clip": 0.01142874, + "auxiliary_loss_mlp": 0.00102171, + "balance_loss_clip": 0.98944914, + "balance_loss_mlp": 0.09477954, + "epoch": 0.6840823688561551, + "flos": 65988336936960.0, + "grad_norm": 0.7412685148472757, + "language_loss": 0.561064, + "learning_rate": 9.585832297583707e-07, + "loss": 0.5735144, + "num_input_tokens_seen": 245619570, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.07373047, + "step": 11378, + "time_per_iteration": 3.2214090824127197 + }, + { + "auxiliary_loss_clip": 0.01321601, + "auxiliary_loss_mlp": 0.0021634, + "balance_loss_clip": 1.08924901, + "balance_loss_mlp": 0.18812314, + "epoch": 0.684142492108823, + "flos": 21397588980480.0, + "grad_norm": 57.02778976334184, + "language_loss": 0.85029751, + "learning_rate": 9.58250752590561e-07, + "loss": 0.86567688, + "num_input_tokens_seen": 245637980, + "router_z_loss_clip": 2.32226562, + "router_z_loss_mlp": 0.28222656, + "step": 11379, + "time_per_iteration": 2.640737533569336 + }, + { + "auxiliary_loss_clip": 0.01300038, + "auxiliary_loss_mlp": 0.00201014, + "balance_loss_clip": 1.08041763, + "balance_loss_mlp": 0.17550288, + "epoch": 0.6842026153614911, + "flos": 18801426700800.0, + "grad_norm": 6.837824797734919, + "language_loss": 0.77458, + "learning_rate": 9.57918314925988e-07, + "loss": 0.78959054, + "num_input_tokens_seen": 245655690, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.25512695, + "step": 11380, + "time_per_iteration": 2.6612961292266846 + }, + { + "auxiliary_loss_clip": 0.01338528, + "auxiliary_loss_mlp": 0.00214643, + "balance_loss_clip": 1.10065103, + "balance_loss_mlp": 0.1854371, + "epoch": 0.684262738614159, + "flos": 19646405216640.0, + "grad_norm": 232.71335384445288, + "language_loss": 0.86598891, + "learning_rate": 9.575859167772568e-07, + "loss": 0.88152063, + "num_input_tokens_seen": 245671525, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.29223633, + "step": 11381, + "time_per_iteration": 4.029786586761475 + }, + { + "auxiliary_loss_clip": 0.01127161, + "auxiliary_loss_mlp": 0.00095351, + "balance_loss_clip": 0.97805715, + "balance_loss_mlp": 0.08791246, + "epoch": 0.684322861866827, + "flos": 62354462739840.0, + "grad_norm": 0.8723047349986254, + "language_loss": 0.66622567, + "learning_rate": 9.572535581569713e-07, + "loss": 0.67845076, + "num_input_tokens_seen": 245724115, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.07421875, + "step": 11382, + "time_per_iteration": 2.9936907291412354 + }, + { + "auxiliary_loss_clip": 0.01122984, + "auxiliary_loss_mlp": 0.00077902, + "balance_loss_clip": 0.97599769, + "balance_loss_mlp": 0.07060599, + "epoch": 0.6843829851194949, + "flos": 65805048812160.0, + "grad_norm": 0.8142352532481685, + "language_loss": 0.57476544, + "learning_rate": 9.569212390777356e-07, + "loss": 0.58677429, + "num_input_tokens_seen": 245789245, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.07275391, + "step": 11383, + "time_per_iteration": 3.241542100906372 + }, + { + "auxiliary_loss_clip": 0.01303615, + "auxiliary_loss_mlp": 0.00201128, + "balance_loss_clip": 1.07366729, + "balance_loss_mlp": 0.17295843, + "epoch": 0.6844431083721629, + "flos": 27855153181440.0, + "grad_norm": 6.200664654815156, + "language_loss": 0.86023569, + "learning_rate": 9.565889595521517e-07, + "loss": 0.87528312, + "num_input_tokens_seen": 245812420, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.28173828, + "step": 11384, + "time_per_iteration": 2.7454993724823 + }, + { + "auxiliary_loss_clip": 0.01315681, + "auxiliary_loss_mlp": 0.00222788, + "balance_loss_clip": 1.08185744, + "balance_loss_mlp": 0.19458283, + "epoch": 0.684503231624831, + "flos": 18255010032000.0, + "grad_norm": 17.424319530594193, + "language_loss": 0.86133575, + "learning_rate": 9.562567195928187e-07, + "loss": 0.87672043, + "num_input_tokens_seen": 245829135, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.28173828, + "step": 11385, + "time_per_iteration": 2.6432549953460693 + }, + { + "auxiliary_loss_clip": 0.01341802, + "auxiliary_loss_mlp": 0.00230963, + "balance_loss_clip": 1.09680796, + "balance_loss_mlp": 0.19880068, + "epoch": 0.6845633548774989, + "flos": 17639681120640.0, + "grad_norm": 4.049947944977753, + "language_loss": 0.9180429, + "learning_rate": 9.55924519212335e-07, + "loss": 0.93377054, + "num_input_tokens_seen": 245847140, + "router_z_loss_clip": 2.44726562, + "router_z_loss_mlp": 0.3215332, + "step": 11386, + "time_per_iteration": 2.6324450969696045 + }, + { + "auxiliary_loss_clip": 0.01322191, + "auxiliary_loss_mlp": 0.00196552, + "balance_loss_clip": 1.09165668, + "balance_loss_mlp": 0.16945602, + "epoch": 0.6846234781301669, + "flos": 20807576179200.0, + "grad_norm": 291.86929868731335, + "language_loss": 0.89135408, + "learning_rate": 9.555923584232984e-07, + "loss": 0.90654153, + "num_input_tokens_seen": 245862855, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.27111816, + "step": 11387, + "time_per_iteration": 2.7209436893463135 + }, + { + "auxiliary_loss_clip": 0.01315768, + "auxiliary_loss_mlp": 0.00227502, + "balance_loss_clip": 1.09011602, + "balance_loss_mlp": 0.20163396, + "epoch": 0.6846836013828348, + "flos": 36101176485120.0, + "grad_norm": 183.31036655861934, + "language_loss": 0.78040683, + "learning_rate": 9.552602372383047e-07, + "loss": 0.79583955, + "num_input_tokens_seen": 245885415, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.25866699, + "step": 11388, + "time_per_iteration": 2.826472043991089 + }, + { + "auxiliary_loss_clip": 0.01315398, + "auxiliary_loss_mlp": 0.00205803, + "balance_loss_clip": 1.08315706, + "balance_loss_mlp": 0.17999467, + "epoch": 0.6847437246355028, + "flos": 43142468607360.0, + "grad_norm": 65.28604128838909, + "language_loss": 0.70094407, + "learning_rate": 9.549281556699469e-07, + "loss": 0.71615613, + "num_input_tokens_seen": 245906285, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.25793457, + "step": 11389, + "time_per_iteration": 2.8460347652435303 + }, + { + "auxiliary_loss_clip": 0.01126228, + "auxiliary_loss_mlp": 0.00052319, + "balance_loss_clip": 0.97536141, + "balance_loss_mlp": 0.04430835, + "epoch": 0.6848038478881707, + "flos": 71663729552640.0, + "grad_norm": 0.7088352766433441, + "language_loss": 0.55422103, + "learning_rate": 9.54596113730818e-07, + "loss": 0.56600654, + "num_input_tokens_seen": 245967620, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.08007812, + "step": 11390, + "time_per_iteration": 3.2279646396636963 + }, + { + "auxiliary_loss_clip": 0.01300576, + "auxiliary_loss_mlp": 0.00203543, + "balance_loss_clip": 1.07629502, + "balance_loss_mlp": 0.17730513, + "epoch": 0.6848639711408387, + "flos": 19937820257280.0, + "grad_norm": 7.6436062038874955, + "language_loss": 0.95745444, + "learning_rate": 9.542641114335109e-07, + "loss": 0.97249568, + "num_input_tokens_seen": 245985075, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.26269531, + "step": 11391, + "time_per_iteration": 2.656298875808716 + }, + { + "auxiliary_loss_clip": 0.01312479, + "auxiliary_loss_mlp": 0.00217588, + "balance_loss_clip": 1.07865834, + "balance_loss_mlp": 0.19114703, + "epoch": 0.6849240943935067, + "flos": 26867501844480.0, + "grad_norm": 10.482021994306256, + "language_loss": 0.85160601, + "learning_rate": 9.539321487906117e-07, + "loss": 0.86690664, + "num_input_tokens_seen": 246003560, + "router_z_loss_clip": 2.33789062, + "router_z_loss_mlp": 0.2644043, + "step": 11392, + "time_per_iteration": 2.705381155014038 + }, + { + "auxiliary_loss_clip": 0.013029, + "auxiliary_loss_mlp": 0.0021586, + "balance_loss_clip": 1.07577372, + "balance_loss_mlp": 0.18891835, + "epoch": 0.6849842176461747, + "flos": 13735365425280.0, + "grad_norm": 51.59990266304212, + "language_loss": 0.80846453, + "learning_rate": 9.536002258147104e-07, + "loss": 0.82365215, + "num_input_tokens_seen": 246019600, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.26928711, + "step": 11393, + "time_per_iteration": 2.5993099212646484 + }, + { + "auxiliary_loss_clip": 0.01323789, + "auxiliary_loss_mlp": 0.0022222, + "balance_loss_clip": 1.08896661, + "balance_loss_mlp": 0.19490942, + "epoch": 0.6850443408988426, + "flos": 24973070641920.0, + "grad_norm": 57.42161687920957, + "language_loss": 0.72971022, + "learning_rate": 9.532683425183936e-07, + "loss": 0.74517024, + "num_input_tokens_seen": 246038920, + "router_z_loss_clip": 2.34960938, + "router_z_loss_mlp": 0.27294922, + "step": 11394, + "time_per_iteration": 2.679710865020752 + }, + { + "auxiliary_loss_clip": 0.0129639, + "auxiliary_loss_mlp": 0.00228987, + "balance_loss_clip": 1.07435131, + "balance_loss_mlp": 0.20271346, + "epoch": 0.6851044641515106, + "flos": 27744225004800.0, + "grad_norm": 3.1061484806775246, + "language_loss": 0.86327988, + "learning_rate": 9.529364989142468e-07, + "loss": 0.87853366, + "num_input_tokens_seen": 246060490, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.26269531, + "step": 11395, + "time_per_iteration": 2.7165331840515137 + }, + { + "auxiliary_loss_clip": 0.01307431, + "auxiliary_loss_mlp": 0.00226719, + "balance_loss_clip": 1.08200693, + "balance_loss_mlp": 0.20189922, + "epoch": 0.6851645874041785, + "flos": 24351061800960.0, + "grad_norm": 24.407280164036642, + "language_loss": 0.79482216, + "learning_rate": 9.526046950148527e-07, + "loss": 0.81016362, + "num_input_tokens_seen": 246081465, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.24841309, + "step": 11396, + "time_per_iteration": 2.714905261993408 + }, + { + "auxiliary_loss_clip": 0.01314507, + "auxiliary_loss_mlp": 0.00223512, + "balance_loss_clip": 1.08423615, + "balance_loss_mlp": 0.19533128, + "epoch": 0.6852247106568465, + "flos": 15077849264640.0, + "grad_norm": 7.80191187678357, + "language_loss": 0.8897692, + "learning_rate": 9.522729308327931e-07, + "loss": 0.90514946, + "num_input_tokens_seen": 246096110, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.28198242, + "step": 11397, + "time_per_iteration": 2.6716971397399902 + }, + { + "auxiliary_loss_clip": 0.01305861, + "auxiliary_loss_mlp": 0.00220458, + "balance_loss_clip": 1.07964933, + "balance_loss_mlp": 0.19414845, + "epoch": 0.6852848339095146, + "flos": 18770005278720.0, + "grad_norm": 29.886382012136686, + "language_loss": 0.78346556, + "learning_rate": 9.519412063806493e-07, + "loss": 0.7987287, + "num_input_tokens_seen": 246114785, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.26318359, + "step": 11398, + "time_per_iteration": 2.6406800746917725 + }, + { + "auxiliary_loss_clip": 0.01284762, + "auxiliary_loss_mlp": 0.00215241, + "balance_loss_clip": 1.06599474, + "balance_loss_mlp": 0.18907444, + "epoch": 0.6853449571621825, + "flos": 27854363082240.0, + "grad_norm": 7.023816377315614, + "language_loss": 0.77568412, + "learning_rate": 9.516095216709996e-07, + "loss": 0.79068416, + "num_input_tokens_seen": 246136375, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.26171875, + "step": 11399, + "time_per_iteration": 2.734245777130127 + }, + { + "auxiliary_loss_clip": 0.01294316, + "auxiliary_loss_mlp": 0.00237131, + "balance_loss_clip": 1.07242477, + "balance_loss_mlp": 0.21109605, + "epoch": 0.6854050804148505, + "flos": 18150510389760.0, + "grad_norm": 155.96721789712078, + "language_loss": 0.7929548, + "learning_rate": 9.512778767164217e-07, + "loss": 0.80826932, + "num_input_tokens_seen": 246155090, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.26062012, + "step": 11400, + "time_per_iteration": 2.639631509780884 + }, + { + "auxiliary_loss_clip": 0.01329642, + "auxiliary_loss_mlp": 0.00222714, + "balance_loss_clip": 1.08656001, + "balance_loss_mlp": 0.19284041, + "epoch": 0.6854652036675184, + "flos": 16326212492160.0, + "grad_norm": 2.311747045619686, + "language_loss": 0.87323302, + "learning_rate": 9.509462715294927e-07, + "loss": 0.88875651, + "num_input_tokens_seen": 246172645, + "router_z_loss_clip": 2.43164062, + "router_z_loss_mlp": 0.29882812, + "step": 11401, + "time_per_iteration": 2.6340627670288086 + }, + { + "auxiliary_loss_clip": 0.01298828, + "auxiliary_loss_mlp": 0.00198166, + "balance_loss_clip": 1.07182455, + "balance_loss_mlp": 0.17179731, + "epoch": 0.6855253269201864, + "flos": 14940814878720.0, + "grad_norm": 3.5644813293294, + "language_loss": 0.85316801, + "learning_rate": 9.50614706122786e-07, + "loss": 0.86813796, + "num_input_tokens_seen": 246189055, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.26342773, + "step": 11402, + "time_per_iteration": 2.6001222133636475 + }, + { + "auxiliary_loss_clip": 0.01320984, + "auxiliary_loss_mlp": 0.00223365, + "balance_loss_clip": 1.08800292, + "balance_loss_mlp": 0.19638826, + "epoch": 0.6855854501728543, + "flos": 23037736826880.0, + "grad_norm": 3.1189103137928984, + "language_loss": 0.78643751, + "learning_rate": 9.502831805088742e-07, + "loss": 0.80188107, + "num_input_tokens_seen": 246207990, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.26989746, + "step": 11403, + "time_per_iteration": 2.706217050552368 + }, + { + "auxiliary_loss_clip": 0.01301826, + "auxiliary_loss_mlp": 0.00231407, + "balance_loss_clip": 1.07848895, + "balance_loss_mlp": 0.20493068, + "epoch": 0.6856455734255223, + "flos": 13253623194240.0, + "grad_norm": 18.297737702311643, + "language_loss": 0.91324198, + "learning_rate": 9.499516947003294e-07, + "loss": 0.92857432, + "num_input_tokens_seen": 246221595, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.26464844, + "step": 11404, + "time_per_iteration": 2.633016586303711 + }, + { + "auxiliary_loss_clip": 0.01326881, + "auxiliary_loss_mlp": 0.00218202, + "balance_loss_clip": 1.09741902, + "balance_loss_mlp": 0.19233397, + "epoch": 0.6857056966781903, + "flos": 23333461499520.0, + "grad_norm": 2.2190170737494523, + "language_loss": 0.83188021, + "learning_rate": 9.496202487097222e-07, + "loss": 0.84733099, + "num_input_tokens_seen": 246242970, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.25854492, + "step": 11405, + "time_per_iteration": 2.7015390396118164 + }, + { + "auxiliary_loss_clip": 0.01135956, + "auxiliary_loss_mlp": 0.00088138, + "balance_loss_clip": 0.98579991, + "balance_loss_mlp": 0.07988854, + "epoch": 0.6857658199308583, + "flos": 61852647784320.0, + "grad_norm": 0.7668166622478042, + "language_loss": 0.60166395, + "learning_rate": 9.492888425496199e-07, + "loss": 0.61390489, + "num_input_tokens_seen": 246300405, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.08251953, + "step": 11406, + "time_per_iteration": 3.2171072959899902 + }, + { + "auxiliary_loss_clip": 0.01314297, + "auxiliary_loss_mlp": 0.00232053, + "balance_loss_clip": 1.08457565, + "balance_loss_mlp": 0.20420602, + "epoch": 0.6858259431835262, + "flos": 16654543735680.0, + "grad_norm": 29.30960763978246, + "language_loss": 0.85970145, + "learning_rate": 9.489574762325907e-07, + "loss": 0.87516499, + "num_input_tokens_seen": 246318780, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.27844238, + "step": 11407, + "time_per_iteration": 2.6490190029144287 + }, + { + "auxiliary_loss_clip": 0.01309612, + "auxiliary_loss_mlp": 0.00213048, + "balance_loss_clip": 1.0779829, + "balance_loss_mlp": 0.18409257, + "epoch": 0.6858860664361942, + "flos": 21872974504320.0, + "grad_norm": 729.1861968259099, + "language_loss": 0.78635478, + "learning_rate": 9.486261497711991e-07, + "loss": 0.80158138, + "num_input_tokens_seen": 246339405, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.28930664, + "step": 11408, + "time_per_iteration": 2.7114598751068115 + }, + { + "auxiliary_loss_clip": 0.01312029, + "auxiliary_loss_mlp": 0.0022864, + "balance_loss_clip": 1.07925069, + "balance_loss_mlp": 0.20244947, + "epoch": 0.6859461896888621, + "flos": 15267637751040.0, + "grad_norm": 3.0154352531304083, + "language_loss": 0.79146254, + "learning_rate": 9.482948631780087e-07, + "loss": 0.80686921, + "num_input_tokens_seen": 246357055, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.26196289, + "step": 11409, + "time_per_iteration": 2.670933723449707 + }, + { + "auxiliary_loss_clip": 0.01291838, + "auxiliary_loss_mlp": 0.00237765, + "balance_loss_clip": 1.07363713, + "balance_loss_mlp": 0.21282688, + "epoch": 0.6860063129415301, + "flos": 18620293392000.0, + "grad_norm": 24.142668884263877, + "language_loss": 0.82742667, + "learning_rate": 9.479636164655825e-07, + "loss": 0.84272265, + "num_input_tokens_seen": 246374050, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.24963379, + "step": 11410, + "time_per_iteration": 2.655996084213257 + }, + { + "auxiliary_loss_clip": 0.01308851, + "auxiliary_loss_mlp": 0.00235948, + "balance_loss_clip": 1.07706535, + "balance_loss_mlp": 0.20843467, + "epoch": 0.6860664361941982, + "flos": 23951376190080.0, + "grad_norm": 212.0217761035595, + "language_loss": 0.79691046, + "learning_rate": 9.476324096464821e-07, + "loss": 0.8123585, + "num_input_tokens_seen": 246392910, + "router_z_loss_clip": 2.31835938, + "router_z_loss_mlp": 0.27502441, + "step": 11411, + "time_per_iteration": 2.6714401245117188 + }, + { + "auxiliary_loss_clip": 0.01303021, + "auxiliary_loss_mlp": 0.00211325, + "balance_loss_clip": 1.06847548, + "balance_loss_mlp": 0.18220216, + "epoch": 0.6861265594468661, + "flos": 20407782827520.0, + "grad_norm": 57.67596830213775, + "language_loss": 0.800192, + "learning_rate": 9.473012427332654e-07, + "loss": 0.81533551, + "num_input_tokens_seen": 246411540, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.29150391, + "step": 11412, + "time_per_iteration": 2.675710678100586 + }, + { + "auxiliary_loss_clip": 0.0128875, + "auxiliary_loss_mlp": 0.00227181, + "balance_loss_clip": 1.05943418, + "balance_loss_mlp": 0.19932142, + "epoch": 0.6861866826995341, + "flos": 11428571111040.0, + "grad_norm": 23.344048857108735, + "language_loss": 0.8084355, + "learning_rate": 9.469701157384919e-07, + "loss": 0.82359481, + "num_input_tokens_seen": 246423295, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.27856445, + "step": 11413, + "time_per_iteration": 2.5663914680480957 + }, + { + "auxiliary_loss_clip": 0.01297023, + "auxiliary_loss_mlp": 0.00241558, + "balance_loss_clip": 1.07139754, + "balance_loss_mlp": 0.21464014, + "epoch": 0.686246805952202, + "flos": 15997593939840.0, + "grad_norm": 3.7755445038198725, + "language_loss": 0.81121528, + "learning_rate": 9.466390286747164e-07, + "loss": 0.82660103, + "num_input_tokens_seen": 246441045, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.26953125, + "step": 11414, + "time_per_iteration": 3.9959309101104736 + }, + { + "auxiliary_loss_clip": 0.01325681, + "auxiliary_loss_mlp": 0.00237793, + "balance_loss_clip": 1.08960092, + "balance_loss_mlp": 0.20825326, + "epoch": 0.68630692920487, + "flos": 19826712512640.0, + "grad_norm": 3.217131902400526, + "language_loss": 0.97033232, + "learning_rate": 9.46307981554495e-07, + "loss": 0.98596704, + "num_input_tokens_seen": 246456905, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.29516602, + "step": 11415, + "time_per_iteration": 4.103572845458984 + }, + { + "auxiliary_loss_clip": 0.01317206, + "auxiliary_loss_mlp": 0.00242153, + "balance_loss_clip": 1.07944095, + "balance_loss_mlp": 0.21392423, + "epoch": 0.6863670524575379, + "flos": 26286216048000.0, + "grad_norm": 151.11963573318604, + "language_loss": 0.74313694, + "learning_rate": 9.459769743903801e-07, + "loss": 0.75873053, + "num_input_tokens_seen": 246477545, + "router_z_loss_clip": 2.37695312, + "router_z_loss_mlp": 0.2824707, + "step": 11416, + "time_per_iteration": 2.684231996536255 + }, + { + "auxiliary_loss_clip": 0.01283374, + "auxiliary_loss_mlp": 0.00224961, + "balance_loss_clip": 1.05845869, + "balance_loss_mlp": 0.19899735, + "epoch": 0.686427175710206, + "flos": 19173138595200.0, + "grad_norm": 67.70781745444148, + "language_loss": 0.82942605, + "learning_rate": 9.456460071949237e-07, + "loss": 0.84450948, + "num_input_tokens_seen": 246496705, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.25939941, + "step": 11417, + "time_per_iteration": 2.689422369003296 + }, + { + "auxiliary_loss_clip": 0.01319676, + "auxiliary_loss_mlp": 0.0023757, + "balance_loss_clip": 1.0887444, + "balance_loss_mlp": 0.21000887, + "epoch": 0.6864872989628739, + "flos": 18916628595840.0, + "grad_norm": 14.130640764940376, + "language_loss": 0.85158789, + "learning_rate": 9.45315079980678e-07, + "loss": 0.86716032, + "num_input_tokens_seen": 246514860, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.27575684, + "step": 11418, + "time_per_iteration": 2.647162914276123 + }, + { + "auxiliary_loss_clip": 0.01317399, + "auxiliary_loss_mlp": 0.00211285, + "balance_loss_clip": 1.0830307, + "balance_loss_mlp": 0.18410495, + "epoch": 0.6865474222155419, + "flos": 25956196865280.0, + "grad_norm": 16.90431747077494, + "language_loss": 0.84242475, + "learning_rate": 9.449841927601887e-07, + "loss": 0.85771155, + "num_input_tokens_seen": 246536145, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.27185059, + "step": 11419, + "time_per_iteration": 4.206003427505493 + }, + { + "auxiliary_loss_clip": 0.01279721, + "auxiliary_loss_mlp": 0.00228565, + "balance_loss_clip": 1.06157827, + "balance_loss_mlp": 0.2047466, + "epoch": 0.6866075454682098, + "flos": 18478087447680.0, + "grad_norm": 3.174319829220743, + "language_loss": 0.79345661, + "learning_rate": 9.446533455460044e-07, + "loss": 0.80853945, + "num_input_tokens_seen": 246553265, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.23815918, + "step": 11420, + "time_per_iteration": 2.6605639457702637 + }, + { + "auxiliary_loss_clip": 0.01296151, + "auxiliary_loss_mlp": 0.00238648, + "balance_loss_clip": 1.0688616, + "balance_loss_mlp": 0.21244612, + "epoch": 0.6866676687208778, + "flos": 34239998298240.0, + "grad_norm": 3.4984242446274476, + "language_loss": 0.80891299, + "learning_rate": 9.443225383506712e-07, + "loss": 0.82426101, + "num_input_tokens_seen": 246575130, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.26184082, + "step": 11421, + "time_per_iteration": 2.7796339988708496 + }, + { + "auxiliary_loss_clip": 0.01285989, + "auxiliary_loss_mlp": 0.00215982, + "balance_loss_clip": 1.0666883, + "balance_loss_mlp": 0.19144903, + "epoch": 0.6867277919735457, + "flos": 21721754246400.0, + "grad_norm": 386.212655751545, + "language_loss": 0.83543682, + "learning_rate": 9.439917711867338e-07, + "loss": 0.85045654, + "num_input_tokens_seen": 246593095, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.24560547, + "step": 11422, + "time_per_iteration": 2.6969165802001953 + }, + { + "auxiliary_loss_clip": 0.01301859, + "auxiliary_loss_mlp": 0.00236353, + "balance_loss_clip": 1.06897879, + "balance_loss_mlp": 0.20629999, + "epoch": 0.6867879152262137, + "flos": 24097999507200.0, + "grad_norm": 2.0561006585753887, + "language_loss": 0.83225816, + "learning_rate": 9.436610440667334e-07, + "loss": 0.84764028, + "num_input_tokens_seen": 246612165, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.30053711, + "step": 11423, + "time_per_iteration": 4.090404272079468 + }, + { + "auxiliary_loss_clip": 0.01301754, + "auxiliary_loss_mlp": 0.00242925, + "balance_loss_clip": 1.07724702, + "balance_loss_mlp": 0.21713987, + "epoch": 0.6868480384788818, + "flos": 21615818060160.0, + "grad_norm": 1237.0640902245777, + "language_loss": 0.80364835, + "learning_rate": 9.433303570032129e-07, + "loss": 0.81909513, + "num_input_tokens_seen": 246632065, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.25769043, + "step": 11424, + "time_per_iteration": 2.666814088821411 + }, + { + "auxiliary_loss_clip": 0.01285303, + "auxiliary_loss_mlp": 0.00228496, + "balance_loss_clip": 1.06063366, + "balance_loss_mlp": 0.20119685, + "epoch": 0.6869081617315497, + "flos": 26286144220800.0, + "grad_norm": 6.957102040376679, + "language_loss": 0.72288632, + "learning_rate": 9.429997100087112e-07, + "loss": 0.73802429, + "num_input_tokens_seen": 246651245, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.27294922, + "step": 11425, + "time_per_iteration": 2.7278475761413574 + }, + { + "auxiliary_loss_clip": 0.01282721, + "auxiliary_loss_mlp": 0.00212387, + "balance_loss_clip": 1.05600369, + "balance_loss_mlp": 0.1871857, + "epoch": 0.6869682849842177, + "flos": 21105096531840.0, + "grad_norm": 4.21643719903377, + "language_loss": 0.78518486, + "learning_rate": 9.426691030957657e-07, + "loss": 0.80013597, + "num_input_tokens_seen": 246672225, + "router_z_loss_clip": 2.265625, + "router_z_loss_mlp": 0.2520752, + "step": 11426, + "time_per_iteration": 2.659825563430786 + }, + { + "auxiliary_loss_clip": 0.01294055, + "auxiliary_loss_mlp": 0.00223845, + "balance_loss_clip": 1.06646717, + "balance_loss_mlp": 0.1960932, + "epoch": 0.6870284082368856, + "flos": 17092653920640.0, + "grad_norm": 31.160708811583568, + "language_loss": 0.92369866, + "learning_rate": 9.423385362769136e-07, + "loss": 0.9388777, + "num_input_tokens_seen": 246688385, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.27758789, + "step": 11427, + "time_per_iteration": 2.6106672286987305 + }, + { + "auxiliary_loss_clip": 0.01273237, + "auxiliary_loss_mlp": 0.00207968, + "balance_loss_clip": 1.05358458, + "balance_loss_mlp": 0.18322045, + "epoch": 0.6870885314895536, + "flos": 27308090067840.0, + "grad_norm": 10.113187348863718, + "language_loss": 0.84098458, + "learning_rate": 9.420080095646909e-07, + "loss": 0.85579669, + "num_input_tokens_seen": 246710730, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.24755859, + "step": 11428, + "time_per_iteration": 2.683837413787842 + }, + { + "auxiliary_loss_clip": 0.01315341, + "auxiliary_loss_mlp": 0.00236779, + "balance_loss_clip": 1.08288431, + "balance_loss_mlp": 0.20752494, + "epoch": 0.6871486547422215, + "flos": 20814543417600.0, + "grad_norm": 20.023143724926324, + "language_loss": 0.8116352, + "learning_rate": 9.4167752297163e-07, + "loss": 0.82715642, + "num_input_tokens_seen": 246730350, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.29284668, + "step": 11429, + "time_per_iteration": 2.707099199295044 + }, + { + "auxiliary_loss_clip": 0.01310961, + "auxiliary_loss_mlp": 0.00216918, + "balance_loss_clip": 1.07395792, + "balance_loss_mlp": 0.18966669, + "epoch": 0.6872087779948896, + "flos": 30154118330880.0, + "grad_norm": 6.7264972139513235, + "language_loss": 0.91320485, + "learning_rate": 9.413470765102643e-07, + "loss": 0.92848361, + "num_input_tokens_seen": 246751700, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.27209473, + "step": 11430, + "time_per_iteration": 2.757507085800171 + }, + { + "auxiliary_loss_clip": 0.01295505, + "auxiliary_loss_mlp": 0.00224687, + "balance_loss_clip": 1.0705049, + "balance_loss_mlp": 0.19819853, + "epoch": 0.6872689012475575, + "flos": 20704584908160.0, + "grad_norm": 25.23189301479444, + "language_loss": 0.77919334, + "learning_rate": 9.410166701931225e-07, + "loss": 0.79439527, + "num_input_tokens_seen": 246769860, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.26525879, + "step": 11431, + "time_per_iteration": 2.654498815536499 + }, + { + "auxiliary_loss_clip": 0.01283948, + "auxiliary_loss_mlp": 0.00217167, + "balance_loss_clip": 1.06463718, + "balance_loss_mlp": 0.19337302, + "epoch": 0.6873290245002255, + "flos": 25520852027520.0, + "grad_norm": 2.8009744688603475, + "language_loss": 0.87896317, + "learning_rate": 9.406863040327355e-07, + "loss": 0.8939743, + "num_input_tokens_seen": 246789905, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.23779297, + "step": 11432, + "time_per_iteration": 2.6764862537384033 + }, + { + "auxiliary_loss_clip": 0.01285565, + "auxiliary_loss_mlp": 0.00209901, + "balance_loss_clip": 1.06626046, + "balance_loss_mlp": 0.18495058, + "epoch": 0.6873891477528934, + "flos": 25191479289600.0, + "grad_norm": 13.98807157182821, + "language_loss": 0.73497874, + "learning_rate": 9.403559780416295e-07, + "loss": 0.74993336, + "num_input_tokens_seen": 246808815, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.24938965, + "step": 11433, + "time_per_iteration": 2.7252845764160156 + }, + { + "auxiliary_loss_clip": 0.01297058, + "auxiliary_loss_mlp": 0.00220511, + "balance_loss_clip": 1.06904054, + "balance_loss_mlp": 0.19510713, + "epoch": 0.6874492710055614, + "flos": 35152380685440.0, + "grad_norm": 12.055244355180793, + "language_loss": 0.81512797, + "learning_rate": 9.400256922323309e-07, + "loss": 0.83030367, + "num_input_tokens_seen": 246829775, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.25378418, + "step": 11434, + "time_per_iteration": 2.893059492111206 + }, + { + "auxiliary_loss_clip": 0.01284894, + "auxiliary_loss_mlp": 0.00195909, + "balance_loss_clip": 1.06240487, + "balance_loss_mlp": 0.1710775, + "epoch": 0.6875093942582293, + "flos": 17822215059840.0, + "grad_norm": 20.18970825482068, + "language_loss": 0.88963187, + "learning_rate": 9.396954466173657e-07, + "loss": 0.90443987, + "num_input_tokens_seen": 246848045, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.24853516, + "step": 11435, + "time_per_iteration": 2.663299560546875 + }, + { + "auxiliary_loss_clip": 0.01315451, + "auxiliary_loss_mlp": 0.00229667, + "balance_loss_clip": 1.08188188, + "balance_loss_mlp": 0.20207021, + "epoch": 0.6875695175108973, + "flos": 20704548994560.0, + "grad_norm": 6.7215323853994695, + "language_loss": 0.90158951, + "learning_rate": 9.393652412092538e-07, + "loss": 0.91704065, + "num_input_tokens_seen": 246866095, + "router_z_loss_clip": 2.3359375, + "router_z_loss_mlp": 0.27575684, + "step": 11436, + "time_per_iteration": 2.6802542209625244 + }, + { + "auxiliary_loss_clip": 0.01279014, + "auxiliary_loss_mlp": 0.00218049, + "balance_loss_clip": 1.05745292, + "balance_loss_mlp": 0.19274041, + "epoch": 0.6876296407635654, + "flos": 25374013228800.0, + "grad_norm": 10.996698648074721, + "language_loss": 0.87551749, + "learning_rate": 9.390350760205183e-07, + "loss": 0.89048809, + "num_input_tokens_seen": 246883975, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.25317383, + "step": 11437, + "time_per_iteration": 2.677687883377075 + }, + { + "auxiliary_loss_clip": 0.01330847, + "auxiliary_loss_mlp": 0.00245151, + "balance_loss_clip": 1.0939486, + "balance_loss_mlp": 0.21717215, + "epoch": 0.6876897640162333, + "flos": 23222317841280.0, + "grad_norm": 31.871484200819832, + "language_loss": 0.86550176, + "learning_rate": 9.387049510636793e-07, + "loss": 0.88126177, + "num_input_tokens_seen": 246901560, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.27966309, + "step": 11438, + "time_per_iteration": 2.67934250831604 + }, + { + "auxiliary_loss_clip": 0.01276323, + "auxiliary_loss_mlp": 0.00240806, + "balance_loss_clip": 1.05895817, + "balance_loss_mlp": 0.21560492, + "epoch": 0.6877498872689013, + "flos": 27124335066240.0, + "grad_norm": 61.5684580711111, + "language_loss": 0.79143393, + "learning_rate": 9.383748663512554e-07, + "loss": 0.80660522, + "num_input_tokens_seen": 246922655, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.2520752, + "step": 11439, + "time_per_iteration": 2.736802339553833 + }, + { + "auxiliary_loss_clip": 0.01289722, + "auxiliary_loss_mlp": 0.00224492, + "balance_loss_clip": 1.06664872, + "balance_loss_mlp": 0.1988744, + "epoch": 0.6878100105215692, + "flos": 11581658876160.0, + "grad_norm": 9.829076454592645, + "language_loss": 0.84273624, + "learning_rate": 9.380448218957623e-07, + "loss": 0.85787845, + "num_input_tokens_seen": 246940100, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.25634766, + "step": 11440, + "time_per_iteration": 2.688310146331787 + }, + { + "auxiliary_loss_clip": 0.01287884, + "auxiliary_loss_mlp": 0.00220178, + "balance_loss_clip": 1.06799459, + "balance_loss_mlp": 0.19482201, + "epoch": 0.6878701337742372, + "flos": 20303175444480.0, + "grad_norm": 6.577072896244068, + "language_loss": 0.78997684, + "learning_rate": 9.377148177097167e-07, + "loss": 0.80505753, + "num_input_tokens_seen": 246958545, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.25341797, + "step": 11441, + "time_per_iteration": 2.6336488723754883 + }, + { + "auxiliary_loss_clip": 0.01318852, + "auxiliary_loss_mlp": 0.00238361, + "balance_loss_clip": 1.08664632, + "balance_loss_mlp": 0.20903541, + "epoch": 0.6879302570269051, + "flos": 13840080549120.0, + "grad_norm": 56.1304409299792, + "language_loss": 0.7588312, + "learning_rate": 9.373848538056317e-07, + "loss": 0.77440333, + "num_input_tokens_seen": 246974805, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.29345703, + "step": 11442, + "time_per_iteration": 2.662461042404175 + }, + { + "auxiliary_loss_clip": 0.01294645, + "auxiliary_loss_mlp": 0.00218895, + "balance_loss_clip": 1.07156277, + "balance_loss_mlp": 0.19415948, + "epoch": 0.6879903802795732, + "flos": 21324654414720.0, + "grad_norm": 13.21480983694487, + "language_loss": 0.82287359, + "learning_rate": 9.370549301960189e-07, + "loss": 0.838009, + "num_input_tokens_seen": 246992505, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.24731445, + "step": 11443, + "time_per_iteration": 2.662371873855591 + }, + { + "auxiliary_loss_clip": 0.01295606, + "auxiliary_loss_mlp": 0.00202261, + "balance_loss_clip": 1.07200408, + "balance_loss_mlp": 0.17597565, + "epoch": 0.6880505035322411, + "flos": 25152049134720.0, + "grad_norm": 25.630543726804504, + "language_loss": 0.82421803, + "learning_rate": 9.367250468933893e-07, + "loss": 0.83919668, + "num_input_tokens_seen": 247013370, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.26257324, + "step": 11444, + "time_per_iteration": 2.719097852706909 + }, + { + "auxiliary_loss_clip": 0.01287885, + "auxiliary_loss_mlp": 0.00220638, + "balance_loss_clip": 1.06894398, + "balance_loss_mlp": 0.19449519, + "epoch": 0.6881106267849091, + "flos": 23215530170880.0, + "grad_norm": 147.1924596783848, + "language_loss": 0.86017859, + "learning_rate": 9.363952039102536e-07, + "loss": 0.87526387, + "num_input_tokens_seen": 247029855, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.26123047, + "step": 11445, + "time_per_iteration": 2.6388022899627686 + }, + { + "auxiliary_loss_clip": 0.01129502, + "auxiliary_loss_mlp": 0.00062938, + "balance_loss_clip": 0.9812699, + "balance_loss_mlp": 0.05497492, + "epoch": 0.688170750037577, + "flos": 48484397312640.0, + "grad_norm": 0.7925604633907269, + "language_loss": 0.5766995, + "learning_rate": 9.360654012591183e-07, + "loss": 0.58862388, + "num_input_tokens_seen": 247085030, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.07958984, + "step": 11446, + "time_per_iteration": 3.2068605422973633 + }, + { + "auxiliary_loss_clip": 0.01309983, + "auxiliary_loss_mlp": 0.00241424, + "balance_loss_clip": 1.08053744, + "balance_loss_mlp": 0.21543661, + "epoch": 0.688230873290245, + "flos": 22783633038720.0, + "grad_norm": 419.85862117588727, + "language_loss": 0.83841741, + "learning_rate": 9.357356389524886e-07, + "loss": 0.85393143, + "num_input_tokens_seen": 247104840, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.25964355, + "step": 11447, + "time_per_iteration": 2.7774975299835205 + }, + { + "auxiliary_loss_clip": 0.0131424, + "auxiliary_loss_mlp": 0.00228082, + "balance_loss_clip": 1.0862906, + "balance_loss_mlp": 0.20099753, + "epoch": 0.6882909965429129, + "flos": 22455660931200.0, + "grad_norm": 2.9850701924980143, + "language_loss": 0.80482721, + "learning_rate": 9.354059170028705e-07, + "loss": 0.82025045, + "num_input_tokens_seen": 247121905, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.27111816, + "step": 11448, + "time_per_iteration": 2.727586507797241 + }, + { + "auxiliary_loss_clip": 0.01311303, + "auxiliary_loss_mlp": 0.00231982, + "balance_loss_clip": 1.08065033, + "balance_loss_mlp": 0.20365755, + "epoch": 0.688351119795581, + "flos": 26214143408640.0, + "grad_norm": 29.28177465043976, + "language_loss": 0.84857488, + "learning_rate": 9.350762354227673e-07, + "loss": 0.86400771, + "num_input_tokens_seen": 247142375, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.2833252, + "step": 11449, + "time_per_iteration": 2.759615421295166 + }, + { + "auxiliary_loss_clip": 0.01294732, + "auxiliary_loss_mlp": 0.00216645, + "balance_loss_clip": 1.07520628, + "balance_loss_mlp": 0.1908128, + "epoch": 0.6884112430482489, + "flos": 22565260304640.0, + "grad_norm": 128.36708985493803, + "language_loss": 0.8013497, + "learning_rate": 9.34746594224679e-07, + "loss": 0.81646347, + "num_input_tokens_seen": 247161095, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.25830078, + "step": 11450, + "time_per_iteration": 2.6574368476867676 + }, + { + "auxiliary_loss_clip": 0.01335668, + "auxiliary_loss_mlp": 0.00240482, + "balance_loss_clip": 1.10048592, + "balance_loss_mlp": 0.21208677, + "epoch": 0.6884713663009169, + "flos": 17341047446400.0, + "grad_norm": 3.0134411078306163, + "language_loss": 0.86955214, + "learning_rate": 9.344169934211068e-07, + "loss": 0.88531363, + "num_input_tokens_seen": 247178565, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.28369141, + "step": 11451, + "time_per_iteration": 2.6486563682556152 + }, + { + "auxiliary_loss_clip": 0.01332841, + "auxiliary_loss_mlp": 0.00227438, + "balance_loss_clip": 1.09991384, + "balance_loss_mlp": 0.20091403, + "epoch": 0.6885314895535849, + "flos": 26470832976000.0, + "grad_norm": 5.820247415981641, + "language_loss": 0.7530145, + "learning_rate": 9.340874330245505e-07, + "loss": 0.76861727, + "num_input_tokens_seen": 247202345, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.26574707, + "step": 11452, + "time_per_iteration": 2.7194902896881104 + }, + { + "auxiliary_loss_clip": 0.01322129, + "auxiliary_loss_mlp": 0.00236674, + "balance_loss_clip": 1.09399652, + "balance_loss_mlp": 0.21082938, + "epoch": 0.6885916128062528, + "flos": 20521548178560.0, + "grad_norm": 2.580366958045525, + "language_loss": 0.79788196, + "learning_rate": 9.337579130475042e-07, + "loss": 0.81347001, + "num_input_tokens_seen": 247219240, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.25805664, + "step": 11453, + "time_per_iteration": 2.6723976135253906 + }, + { + "auxiliary_loss_clip": 0.01143268, + "auxiliary_loss_mlp": 0.00119669, + "balance_loss_clip": 0.9980197, + "balance_loss_mlp": 0.11132425, + "epoch": 0.6886517360589208, + "flos": 70715795679360.0, + "grad_norm": 0.7631602870708566, + "language_loss": 0.50005144, + "learning_rate": 9.334284335024644e-07, + "loss": 0.51268077, + "num_input_tokens_seen": 247272010, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.08349609, + "step": 11454, + "time_per_iteration": 2.998389720916748 + }, + { + "auxiliary_loss_clip": 0.01296947, + "auxiliary_loss_mlp": 0.00220319, + "balance_loss_clip": 1.08374417, + "balance_loss_mlp": 0.19516611, + "epoch": 0.6887118593115887, + "flos": 17893533513600.0, + "grad_norm": 7.760866431362832, + "language_loss": 0.8357693, + "learning_rate": 9.330989944019263e-07, + "loss": 0.85094196, + "num_input_tokens_seen": 247290630, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.25146484, + "step": 11455, + "time_per_iteration": 2.644127130508423 + }, + { + "auxiliary_loss_clip": 0.01337928, + "auxiliary_loss_mlp": 0.00229822, + "balance_loss_clip": 1.10098863, + "balance_loss_mlp": 0.19913781, + "epoch": 0.6887719825642568, + "flos": 17453017117440.0, + "grad_norm": 51.9491973848988, + "language_loss": 0.83988905, + "learning_rate": 9.327695957583803e-07, + "loss": 0.85556662, + "num_input_tokens_seen": 247304800, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.30688477, + "step": 11456, + "time_per_iteration": 4.009915590286255 + }, + { + "auxiliary_loss_clip": 0.01295133, + "auxiliary_loss_mlp": 0.00230052, + "balance_loss_clip": 1.07794356, + "balance_loss_mlp": 0.20518512, + "epoch": 0.6888321058169247, + "flos": 23070199743360.0, + "grad_norm": 8.070151590330056, + "language_loss": 0.87776458, + "learning_rate": 9.32440237584319e-07, + "loss": 0.8930164, + "num_input_tokens_seen": 247323450, + "router_z_loss_clip": 2.17285156, + "router_z_loss_mlp": 0.24865723, + "step": 11457, + "time_per_iteration": 2.6713366508483887 + }, + { + "auxiliary_loss_clip": 0.01312592, + "auxiliary_loss_mlp": 0.00229157, + "balance_loss_clip": 1.08710098, + "balance_loss_mlp": 0.20371723, + "epoch": 0.6888922290695927, + "flos": 23368833417600.0, + "grad_norm": 9.651104730989799, + "language_loss": 0.86572939, + "learning_rate": 9.321109198922301e-07, + "loss": 0.88114691, + "num_input_tokens_seen": 247343845, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.2545166, + "step": 11458, + "time_per_iteration": 4.1068079471588135 + }, + { + "auxiliary_loss_clip": 0.01303158, + "auxiliary_loss_mlp": 0.00229841, + "balance_loss_clip": 1.08345807, + "balance_loss_mlp": 0.20225623, + "epoch": 0.6889523523222606, + "flos": 17631636474240.0, + "grad_norm": 5.047134182062818, + "language_loss": 0.76095968, + "learning_rate": 9.31781642694603e-07, + "loss": 0.7762897, + "num_input_tokens_seen": 247356650, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.27575684, + "step": 11459, + "time_per_iteration": 2.6859982013702393 + }, + { + "auxiliary_loss_clip": 0.01323321, + "auxiliary_loss_mlp": 0.00226324, + "balance_loss_clip": 1.09428346, + "balance_loss_mlp": 0.19869165, + "epoch": 0.6890124755749286, + "flos": 25228144097280.0, + "grad_norm": 296.97508855727773, + "language_loss": 0.75759238, + "learning_rate": 9.314524060039221e-07, + "loss": 0.77308881, + "num_input_tokens_seen": 247377340, + "router_z_loss_clip": 2.29101562, + "router_z_loss_mlp": 0.27648926, + "step": 11460, + "time_per_iteration": 2.747553825378418 + }, + { + "auxiliary_loss_clip": 0.01334853, + "auxiliary_loss_mlp": 0.00249872, + "balance_loss_clip": 1.09870076, + "balance_loss_mlp": 0.2182336, + "epoch": 0.6890725988275965, + "flos": 20230240878720.0, + "grad_norm": 10.001968078305456, + "language_loss": 0.8805567, + "learning_rate": 9.311232098326731e-07, + "loss": 0.89640397, + "num_input_tokens_seen": 247395805, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.31640625, + "step": 11461, + "time_per_iteration": 4.19239616394043 + }, + { + "auxiliary_loss_clip": 0.01305923, + "auxiliary_loss_mlp": 0.00213396, + "balance_loss_clip": 1.07981074, + "balance_loss_mlp": 0.18701524, + "epoch": 0.6891327220802645, + "flos": 14535311264640.0, + "grad_norm": 56.64484727732651, + "language_loss": 0.7736237, + "learning_rate": 9.307940541933401e-07, + "loss": 0.78881687, + "num_input_tokens_seen": 247413165, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.26367188, + "step": 11462, + "time_per_iteration": 2.6811599731445312 + }, + { + "auxiliary_loss_clip": 0.0131152, + "auxiliary_loss_mlp": 0.00238822, + "balance_loss_clip": 1.09128034, + "balance_loss_mlp": 0.21054545, + "epoch": 0.6891928453329325, + "flos": 21139139646720.0, + "grad_norm": 158.26326199128758, + "language_loss": 0.93672299, + "learning_rate": 9.304649390984034e-07, + "loss": 0.9522264, + "num_input_tokens_seen": 247433140, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.28271484, + "step": 11463, + "time_per_iteration": 2.6819827556610107 + }, + { + "auxiliary_loss_clip": 0.01295716, + "auxiliary_loss_mlp": 0.00217814, + "balance_loss_clip": 1.07814062, + "balance_loss_mlp": 0.19365063, + "epoch": 0.6892529685856005, + "flos": 17858520731520.0, + "grad_norm": 2.3634371562557837, + "language_loss": 0.7575919, + "learning_rate": 9.301358645603428e-07, + "loss": 0.77272725, + "num_input_tokens_seen": 247451265, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.24157715, + "step": 11464, + "time_per_iteration": 2.6504580974578857 + }, + { + "auxiliary_loss_clip": 0.01309307, + "auxiliary_loss_mlp": 0.00230408, + "balance_loss_clip": 1.08251476, + "balance_loss_mlp": 0.20256107, + "epoch": 0.6893130918382685, + "flos": 29934811843200.0, + "grad_norm": 8.248440116904419, + "language_loss": 0.75306797, + "learning_rate": 9.298068305916373e-07, + "loss": 0.76846516, + "num_input_tokens_seen": 247471645, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.27844238, + "step": 11465, + "time_per_iteration": 4.1993982791900635 + }, + { + "auxiliary_loss_clip": 0.01327243, + "auxiliary_loss_mlp": 0.00255242, + "balance_loss_clip": 1.0944407, + "balance_loss_mlp": 0.22555935, + "epoch": 0.6893732150909364, + "flos": 24388516707840.0, + "grad_norm": 3.114898435955582, + "language_loss": 0.8127569, + "learning_rate": 9.294778372047649e-07, + "loss": 0.82858169, + "num_input_tokens_seen": 247491170, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.29699707, + "step": 11466, + "time_per_iteration": 2.6897146701812744 + }, + { + "auxiliary_loss_clip": 0.01314433, + "auxiliary_loss_mlp": 0.00228497, + "balance_loss_clip": 1.09010816, + "balance_loss_mlp": 0.20095937, + "epoch": 0.6894333383436044, + "flos": 16982874979200.0, + "grad_norm": 10.136319153429584, + "language_loss": 0.79929268, + "learning_rate": 9.291488844121995e-07, + "loss": 0.814722, + "num_input_tokens_seen": 247509005, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.27539062, + "step": 11467, + "time_per_iteration": 2.718916416168213 + }, + { + "auxiliary_loss_clip": 0.01322769, + "auxiliary_loss_mlp": 0.00233806, + "balance_loss_clip": 1.09065104, + "balance_loss_mlp": 0.20485018, + "epoch": 0.6894934615962723, + "flos": 18985540838400.0, + "grad_norm": 15.521384417608077, + "language_loss": 0.89928752, + "learning_rate": 9.288199722264156e-07, + "loss": 0.91485322, + "num_input_tokens_seen": 247527050, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.28942871, + "step": 11468, + "time_per_iteration": 2.636523962020874 + }, + { + "auxiliary_loss_clip": 0.01351062, + "auxiliary_loss_mlp": 0.00221488, + "balance_loss_clip": 1.11380863, + "balance_loss_mlp": 0.19181724, + "epoch": 0.6895535848489404, + "flos": 34531664734080.0, + "grad_norm": 15.486063178333204, + "language_loss": 0.73255074, + "learning_rate": 9.284911006598875e-07, + "loss": 0.74827623, + "num_input_tokens_seen": 247547765, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.29711914, + "step": 11469, + "time_per_iteration": 2.81756329536438 + }, + { + "auxiliary_loss_clip": 0.01157977, + "auxiliary_loss_mlp": 0.00076395, + "balance_loss_clip": 1.01172388, + "balance_loss_mlp": 0.06747842, + "epoch": 0.6896137081016083, + "flos": 50075852273280.0, + "grad_norm": 0.7737481553979402, + "language_loss": 0.54489303, + "learning_rate": 9.281622697250824e-07, + "loss": 0.55723679, + "num_input_tokens_seen": 247603515, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.08935547, + "step": 11470, + "time_per_iteration": 3.01958966255188 + }, + { + "auxiliary_loss_clip": 0.01316814, + "auxiliary_loss_mlp": 0.00216855, + "balance_loss_clip": 1.09283054, + "balance_loss_mlp": 0.19161856, + "epoch": 0.6896738313542763, + "flos": 19938215306880.0, + "grad_norm": 13.588526669961501, + "language_loss": 0.84462506, + "learning_rate": 9.278334794344715e-07, + "loss": 0.85996175, + "num_input_tokens_seen": 247622110, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.25244141, + "step": 11471, + "time_per_iteration": 2.662492036819458 + }, + { + "auxiliary_loss_clip": 0.01311183, + "auxiliary_loss_mlp": 0.00223916, + "balance_loss_clip": 1.0852282, + "balance_loss_mlp": 0.19730853, + "epoch": 0.6897339546069442, + "flos": 21725489260800.0, + "grad_norm": 175.97009524285994, + "language_loss": 0.86096746, + "learning_rate": 9.275047298005232e-07, + "loss": 0.87631845, + "num_input_tokens_seen": 247641905, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.26635742, + "step": 11472, + "time_per_iteration": 2.6898367404937744 + }, + { + "auxiliary_loss_clip": 0.01317316, + "auxiliary_loss_mlp": 0.00226635, + "balance_loss_clip": 1.0890255, + "balance_loss_mlp": 0.19986045, + "epoch": 0.6897940778596122, + "flos": 19826497031040.0, + "grad_norm": 6.093336395081715, + "language_loss": 0.83861232, + "learning_rate": 9.271760208357024e-07, + "loss": 0.85405183, + "num_input_tokens_seen": 247660945, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.26794434, + "step": 11473, + "time_per_iteration": 2.6662890911102295 + }, + { + "auxiliary_loss_clip": 0.0133379, + "auxiliary_loss_mlp": 0.00233931, + "balance_loss_clip": 1.0977478, + "balance_loss_mlp": 0.20437893, + "epoch": 0.6898542011122801, + "flos": 17310056987520.0, + "grad_norm": 4.394009387243449, + "language_loss": 0.82559717, + "learning_rate": 9.268473525524751e-07, + "loss": 0.84127438, + "num_input_tokens_seen": 247678395, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.29541016, + "step": 11474, + "time_per_iteration": 2.6429173946380615 + }, + { + "auxiliary_loss_clip": 0.01322662, + "auxiliary_loss_mlp": 0.0021127, + "balance_loss_clip": 1.09119391, + "balance_loss_mlp": 0.18342283, + "epoch": 0.6899143243649482, + "flos": 24754051463040.0, + "grad_norm": 37.454293746270615, + "language_loss": 0.82711792, + "learning_rate": 9.26518724963303e-07, + "loss": 0.84245729, + "num_input_tokens_seen": 247698380, + "router_z_loss_clip": 2.31054688, + "router_z_loss_mlp": 0.27844238, + "step": 11475, + "time_per_iteration": 2.789879560470581 + }, + { + "auxiliary_loss_clip": 0.01320289, + "auxiliary_loss_mlp": 0.00206093, + "balance_loss_clip": 1.08943391, + "balance_loss_mlp": 0.17900863, + "epoch": 0.6899744476176161, + "flos": 17234536642560.0, + "grad_norm": 5.201716087866568, + "language_loss": 0.96656942, + "learning_rate": 9.261901380806491e-07, + "loss": 0.98183322, + "num_input_tokens_seen": 247716370, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.27075195, + "step": 11476, + "time_per_iteration": 2.722194194793701 + }, + { + "auxiliary_loss_clip": 0.01291812, + "auxiliary_loss_mlp": 0.0022147, + "balance_loss_clip": 1.06959844, + "balance_loss_mlp": 0.19545884, + "epoch": 0.6900345708702841, + "flos": 25410678036480.0, + "grad_norm": 102.78020507535624, + "language_loss": 0.7775588, + "learning_rate": 9.258615919169724e-07, + "loss": 0.79269165, + "num_input_tokens_seen": 247737335, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.26013184, + "step": 11477, + "time_per_iteration": 2.675520658493042 + }, + { + "auxiliary_loss_clip": 0.01338302, + "auxiliary_loss_mlp": 0.00227612, + "balance_loss_clip": 1.10480499, + "balance_loss_mlp": 0.19969282, + "epoch": 0.6900946941229521, + "flos": 23434190213760.0, + "grad_norm": 15.036246598504665, + "language_loss": 0.7715174, + "learning_rate": 9.255330864847313e-07, + "loss": 0.78717649, + "num_input_tokens_seen": 247756680, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.2791748, + "step": 11478, + "time_per_iteration": 2.748300075531006 + }, + { + "auxiliary_loss_clip": 0.01315724, + "auxiliary_loss_mlp": 0.00240406, + "balance_loss_clip": 1.08830714, + "balance_loss_mlp": 0.21406046, + "epoch": 0.69015481737562, + "flos": 17820096157440.0, + "grad_norm": 3.013988505457007, + "language_loss": 0.8462944, + "learning_rate": 9.252046217963843e-07, + "loss": 0.86185569, + "num_input_tokens_seen": 247774265, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.26391602, + "step": 11479, + "time_per_iteration": 2.7033936977386475 + }, + { + "auxiliary_loss_clip": 0.0132255, + "auxiliary_loss_mlp": 0.00232283, + "balance_loss_clip": 1.09022689, + "balance_loss_mlp": 0.20290965, + "epoch": 0.690214940628288, + "flos": 17456500736640.0, + "grad_norm": 6.321882654911184, + "language_loss": 0.87872303, + "learning_rate": 9.248761978643856e-07, + "loss": 0.89427137, + "num_input_tokens_seen": 247792395, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.29345703, + "step": 11480, + "time_per_iteration": 2.751217842102051 + }, + { + "auxiliary_loss_clip": 0.01294607, + "auxiliary_loss_mlp": 0.00235298, + "balance_loss_clip": 1.06974864, + "balance_loss_mlp": 0.20942938, + "epoch": 0.6902750638809559, + "flos": 29566691308800.0, + "grad_norm": 4.40063162056622, + "language_loss": 0.82171911, + "learning_rate": 9.245478147011885e-07, + "loss": 0.83701813, + "num_input_tokens_seen": 247811985, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.25842285, + "step": 11481, + "time_per_iteration": 2.7223100662231445 + }, + { + "auxiliary_loss_clip": 0.01314608, + "auxiliary_loss_mlp": 0.00231216, + "balance_loss_clip": 1.08924234, + "balance_loss_mlp": 0.20674194, + "epoch": 0.690335187133624, + "flos": 25557121785600.0, + "grad_norm": 8.075145040513439, + "language_loss": 0.78740585, + "learning_rate": 9.24219472319246e-07, + "loss": 0.80286413, + "num_input_tokens_seen": 247831880, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.24487305, + "step": 11482, + "time_per_iteration": 2.720813035964966 + }, + { + "auxiliary_loss_clip": 0.01314744, + "auxiliary_loss_mlp": 0.00254791, + "balance_loss_clip": 1.08359361, + "balance_loss_mlp": 0.22732529, + "epoch": 0.6903953103862919, + "flos": 22488447070080.0, + "grad_norm": 5.8900817641606436, + "language_loss": 0.88369155, + "learning_rate": 9.238911707310096e-07, + "loss": 0.89938688, + "num_input_tokens_seen": 247851170, + "router_z_loss_clip": 2.31054688, + "router_z_loss_mlp": 0.27478027, + "step": 11483, + "time_per_iteration": 2.697443723678589 + }, + { + "auxiliary_loss_clip": 0.01306334, + "auxiliary_loss_mlp": 0.00242448, + "balance_loss_clip": 1.08919477, + "balance_loss_mlp": 0.21654424, + "epoch": 0.6904554336389599, + "flos": 26100521712000.0, + "grad_norm": 8.037775621594268, + "language_loss": 0.73514903, + "learning_rate": 9.235629099489273e-07, + "loss": 0.75063682, + "num_input_tokens_seen": 247868950, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.25939941, + "step": 11484, + "time_per_iteration": 2.718510389328003 + }, + { + "auxiliary_loss_clip": 0.01290372, + "auxiliary_loss_mlp": 0.0024684, + "balance_loss_clip": 1.06971526, + "balance_loss_mlp": 0.21884987, + "epoch": 0.6905155568916278, + "flos": 31171754545920.0, + "grad_norm": 10.921594418241574, + "language_loss": 0.797225, + "learning_rate": 9.232346899854479e-07, + "loss": 0.81259716, + "num_input_tokens_seen": 247889805, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.27990723, + "step": 11485, + "time_per_iteration": 2.7111570835113525 + }, + { + "auxiliary_loss_clip": 0.01302909, + "auxiliary_loss_mlp": 0.00252427, + "balance_loss_clip": 1.07802296, + "balance_loss_mlp": 0.22528341, + "epoch": 0.6905756801442958, + "flos": 17639681120640.0, + "grad_norm": 3.499777026043928, + "language_loss": 0.91353023, + "learning_rate": 9.22906510853017e-07, + "loss": 0.92908359, + "num_input_tokens_seen": 247908585, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.27111816, + "step": 11486, + "time_per_iteration": 2.7135114669799805 + }, + { + "auxiliary_loss_clip": 0.01317215, + "auxiliary_loss_mlp": 0.00249888, + "balance_loss_clip": 1.09068191, + "balance_loss_mlp": 0.22365046, + "epoch": 0.6906358033969637, + "flos": 22343691260160.0, + "grad_norm": 12.651417772532204, + "language_loss": 0.7960971, + "learning_rate": 9.225783725640786e-07, + "loss": 0.81176811, + "num_input_tokens_seen": 247928480, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.26269531, + "step": 11487, + "time_per_iteration": 2.708281993865967 + }, + { + "auxiliary_loss_clip": 0.01122203, + "auxiliary_loss_mlp": 0.00068168, + "balance_loss_clip": 0.9795078, + "balance_loss_mlp": 0.06125404, + "epoch": 0.6906959266496318, + "flos": 69747789081600.0, + "grad_norm": 0.8590823011672896, + "language_loss": 0.6569944, + "learning_rate": 9.222502751310759e-07, + "loss": 0.66889811, + "num_input_tokens_seen": 247988855, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.06933594, + "step": 11488, + "time_per_iteration": 3.1898462772369385 + }, + { + "auxiliary_loss_clip": 0.01320258, + "auxiliary_loss_mlp": 0.00295142, + "balance_loss_clip": 1.08697355, + "balance_loss_mlp": 0.2672351, + "epoch": 0.6907560499022997, + "flos": 21434253788160.0, + "grad_norm": 904.8165761070784, + "language_loss": 0.85116631, + "learning_rate": 9.219222185664519e-07, + "loss": 0.8673203, + "num_input_tokens_seen": 248007685, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.27880859, + "step": 11489, + "time_per_iteration": 2.6812314987182617 + }, + { + "auxiliary_loss_clip": 0.01308019, + "auxiliary_loss_mlp": 0.00273204, + "balance_loss_clip": 1.08388162, + "balance_loss_mlp": 0.24565518, + "epoch": 0.6908161731549677, + "flos": 14392207480320.0, + "grad_norm": 19.325312891714866, + "language_loss": 0.72146189, + "learning_rate": 9.215942028826445e-07, + "loss": 0.73727405, + "num_input_tokens_seen": 248025145, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.2755127, + "step": 11490, + "time_per_iteration": 2.628631591796875 + }, + { + "auxiliary_loss_clip": 0.01304875, + "auxiliary_loss_mlp": 0.00248708, + "balance_loss_clip": 1.08219779, + "balance_loss_mlp": 0.22336459, + "epoch": 0.6908762964076357, + "flos": 20010970304640.0, + "grad_norm": 6.866291061733434, + "language_loss": 0.77613151, + "learning_rate": 9.212662280920937e-07, + "loss": 0.79166734, + "num_input_tokens_seen": 248043750, + "router_z_loss_clip": 2.22558594, + "router_z_loss_mlp": 0.25341797, + "step": 11491, + "time_per_iteration": 2.650214195251465 + }, + { + "auxiliary_loss_clip": 0.01306502, + "auxiliary_loss_mlp": 0.00263167, + "balance_loss_clip": 1.08157241, + "balance_loss_mlp": 0.23478284, + "epoch": 0.6909364196603036, + "flos": 28769079853440.0, + "grad_norm": 6.9653540392790765, + "language_loss": 0.75853342, + "learning_rate": 9.20938294207235e-07, + "loss": 0.77423012, + "num_input_tokens_seen": 248065765, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.28417969, + "step": 11492, + "time_per_iteration": 2.7249984741210938 + }, + { + "auxiliary_loss_clip": 0.01326675, + "auxiliary_loss_mlp": 0.00255226, + "balance_loss_clip": 1.09049749, + "balance_loss_mlp": 0.22586495, + "epoch": 0.6909965429129716, + "flos": 22528128620160.0, + "grad_norm": 9.119265070271943, + "language_loss": 0.82883114, + "learning_rate": 9.206104012405049e-07, + "loss": 0.84465015, + "num_input_tokens_seen": 248083810, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.2935791, + "step": 11493, + "time_per_iteration": 2.7061524391174316 + }, + { + "auxiliary_loss_clip": 0.01307963, + "auxiliary_loss_mlp": 0.00265394, + "balance_loss_clip": 1.08132935, + "balance_loss_mlp": 0.23745167, + "epoch": 0.6910566661656395, + "flos": 18405942981120.0, + "grad_norm": 45.33405724894164, + "language_loss": 0.81686914, + "learning_rate": 9.20282549204336e-07, + "loss": 0.83260268, + "num_input_tokens_seen": 248103185, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.27929688, + "step": 11494, + "time_per_iteration": 2.6435508728027344 + }, + { + "auxiliary_loss_clip": 0.01314605, + "auxiliary_loss_mlp": 0.00279742, + "balance_loss_clip": 1.08554626, + "balance_loss_mlp": 0.25089356, + "epoch": 0.6911167894183076, + "flos": 30773972355840.0, + "grad_norm": 26.104037383112463, + "language_loss": 0.76974726, + "learning_rate": 9.19954738111161e-07, + "loss": 0.78569067, + "num_input_tokens_seen": 248125665, + "router_z_loss_clip": 2.29101562, + "router_z_loss_mlp": 0.28881836, + "step": 11495, + "time_per_iteration": 2.7480101585388184 + }, + { + "auxiliary_loss_clip": 0.01305114, + "auxiliary_loss_mlp": 0.00293233, + "balance_loss_clip": 1.08226109, + "balance_loss_mlp": 0.26647067, + "epoch": 0.6911769126709755, + "flos": 13735724561280.0, + "grad_norm": 50.2413971594535, + "language_loss": 0.82752627, + "learning_rate": 9.196269679734119e-07, + "loss": 0.84350979, + "num_input_tokens_seen": 248142545, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.26745605, + "step": 11496, + "time_per_iteration": 2.658576250076294 + }, + { + "auxiliary_loss_clip": 0.01292189, + "auxiliary_loss_mlp": 0.00256694, + "balance_loss_clip": 1.07615983, + "balance_loss_mlp": 0.23021811, + "epoch": 0.6912370359236435, + "flos": 17566854295680.0, + "grad_norm": 26.61395978688353, + "language_loss": 0.87774134, + "learning_rate": 9.19299238803515e-07, + "loss": 0.89323014, + "num_input_tokens_seen": 248160225, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.26452637, + "step": 11497, + "time_per_iteration": 2.663674831390381 + }, + { + "auxiliary_loss_clip": 0.01324258, + "auxiliary_loss_mlp": 0.00259775, + "balance_loss_clip": 1.099823, + "balance_loss_mlp": 0.23302434, + "epoch": 0.6912971591763114, + "flos": 22090772620800.0, + "grad_norm": 30.570038290386222, + "language_loss": 0.88344979, + "learning_rate": 9.189715506138993e-07, + "loss": 0.89929014, + "num_input_tokens_seen": 248180430, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.26745605, + "step": 11498, + "time_per_iteration": 2.700890064239502 + }, + { + "auxiliary_loss_clip": 0.012983, + "auxiliary_loss_mlp": 0.00263596, + "balance_loss_clip": 1.08118415, + "balance_loss_mlp": 0.23865694, + "epoch": 0.6913572824289794, + "flos": 29971476650880.0, + "grad_norm": 8.679129564184658, + "language_loss": 0.91112882, + "learning_rate": 9.186439034169915e-07, + "loss": 0.9267478, + "num_input_tokens_seen": 248202365, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.24951172, + "step": 11499, + "time_per_iteration": 4.13749098777771 + }, + { + "auxiliary_loss_clip": 0.01292083, + "auxiliary_loss_mlp": 0.00264741, + "balance_loss_clip": 1.07734871, + "balance_loss_mlp": 0.23995745, + "epoch": 0.6914174056816473, + "flos": 20448936835200.0, + "grad_norm": 6.366804741517456, + "language_loss": 0.82265496, + "learning_rate": 9.183162972252145e-07, + "loss": 0.83822322, + "num_input_tokens_seen": 248221750, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.24755859, + "step": 11500, + "time_per_iteration": 4.1396262645721436 + }, + { + "auxiliary_loss_clip": 0.01304076, + "auxiliary_loss_mlp": 0.00288907, + "balance_loss_clip": 1.07701707, + "balance_loss_mlp": 0.26126266, + "epoch": 0.6914775289343154, + "flos": 21282530739840.0, + "grad_norm": 9.237510588595658, + "language_loss": 0.85505563, + "learning_rate": 9.179887320509921e-07, + "loss": 0.87098545, + "num_input_tokens_seen": 248239535, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.27636719, + "step": 11501, + "time_per_iteration": 2.7013654708862305 + }, + { + "auxiliary_loss_clip": 0.01299511, + "auxiliary_loss_mlp": 0.00279798, + "balance_loss_clip": 1.07636714, + "balance_loss_mlp": 0.25081828, + "epoch": 0.6915376521869833, + "flos": 23878118401920.0, + "grad_norm": 6.181522988266709, + "language_loss": 0.81134921, + "learning_rate": 9.176612079067458e-07, + "loss": 0.8271423, + "num_input_tokens_seen": 248259055, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.29003906, + "step": 11502, + "time_per_iteration": 2.6507809162139893 + }, + { + "auxiliary_loss_clip": 0.01321349, + "auxiliary_loss_mlp": 0.00271048, + "balance_loss_clip": 1.09201598, + "balance_loss_mlp": 0.24218714, + "epoch": 0.6915977754396513, + "flos": 11510268595200.0, + "grad_norm": 12.07756844021101, + "language_loss": 0.84570503, + "learning_rate": 9.173337248048953e-07, + "loss": 0.86162901, + "num_input_tokens_seen": 248276765, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.28857422, + "step": 11503, + "time_per_iteration": 2.6662471294403076 + }, + { + "auxiliary_loss_clip": 0.01306542, + "auxiliary_loss_mlp": 0.0023097, + "balance_loss_clip": 1.07951152, + "balance_loss_mlp": 0.2057333, + "epoch": 0.6916578986923193, + "flos": 22601278667520.0, + "grad_norm": 118.05737838361628, + "language_loss": 0.86023146, + "learning_rate": 9.170062827578575e-07, + "loss": 0.8756066, + "num_input_tokens_seen": 248295310, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.25231934, + "step": 11504, + "time_per_iteration": 4.190938234329224 + }, + { + "auxiliary_loss_clip": 0.01332385, + "auxiliary_loss_mlp": 0.00280069, + "balance_loss_clip": 1.10184383, + "balance_loss_mlp": 0.25159025, + "epoch": 0.6917180219449872, + "flos": 23477355383040.0, + "grad_norm": 27.94622750539555, + "language_loss": 0.81244922, + "learning_rate": 9.166788817780499e-07, + "loss": 0.8285737, + "num_input_tokens_seen": 248315230, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.28479004, + "step": 11505, + "time_per_iteration": 2.6970748901367188 + }, + { + "auxiliary_loss_clip": 0.01289587, + "auxiliary_loss_mlp": 0.00276031, + "balance_loss_clip": 1.0713104, + "balance_loss_mlp": 0.25105637, + "epoch": 0.6917781451976552, + "flos": 23732536579200.0, + "grad_norm": 15.363115713337606, + "language_loss": 0.94425434, + "learning_rate": 9.163515218778886e-07, + "loss": 0.95991051, + "num_input_tokens_seen": 248332980, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.25, + "step": 11506, + "time_per_iteration": 2.7147128582000732 + }, + { + "auxiliary_loss_clip": 0.01303077, + "auxiliary_loss_mlp": 0.00269282, + "balance_loss_clip": 1.08090591, + "balance_loss_mlp": 0.24341348, + "epoch": 0.6918382684503231, + "flos": 31466760946560.0, + "grad_norm": 35.6326349563974, + "language_loss": 0.82166803, + "learning_rate": 9.160242030697856e-07, + "loss": 0.83739161, + "num_input_tokens_seen": 248352865, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.25878906, + "step": 11507, + "time_per_iteration": 4.168775796890259 + }, + { + "auxiliary_loss_clip": 0.0127711, + "auxiliary_loss_mlp": 0.00265851, + "balance_loss_clip": 1.06028605, + "balance_loss_mlp": 0.24130537, + "epoch": 0.6918983917029912, + "flos": 21650471706240.0, + "grad_norm": 3.2057044630949285, + "language_loss": 0.84432274, + "learning_rate": 9.156969253661538e-07, + "loss": 0.85975236, + "num_input_tokens_seen": 248371125, + "router_z_loss_clip": 2.16894531, + "router_z_loss_mlp": 0.24536133, + "step": 11508, + "time_per_iteration": 2.646319627761841 + }, + { + "auxiliary_loss_clip": 0.01276492, + "auxiliary_loss_mlp": 0.00283192, + "balance_loss_clip": 1.06356227, + "balance_loss_mlp": 0.25825295, + "epoch": 0.6919585149556591, + "flos": 25550082720000.0, + "grad_norm": 16.330980557234295, + "language_loss": 0.80053985, + "learning_rate": 9.153696887794027e-07, + "loss": 0.81613672, + "num_input_tokens_seen": 248390455, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.24963379, + "step": 11509, + "time_per_iteration": 2.7215075492858887 + }, + { + "auxiliary_loss_clip": 0.01286562, + "auxiliary_loss_mlp": 0.00273405, + "balance_loss_clip": 1.07140517, + "balance_loss_mlp": 0.24677365, + "epoch": 0.6920186382083271, + "flos": 23659781581440.0, + "grad_norm": 4.149489660433431, + "language_loss": 0.73249662, + "learning_rate": 9.150424933219425e-07, + "loss": 0.74809635, + "num_input_tokens_seen": 248411305, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.26611328, + "step": 11510, + "time_per_iteration": 2.7804603576660156 + }, + { + "auxiliary_loss_clip": 0.01308722, + "auxiliary_loss_mlp": 0.00246921, + "balance_loss_clip": 1.08261693, + "balance_loss_mlp": 0.22056374, + "epoch": 0.692078761460995, + "flos": 19061959023360.0, + "grad_norm": 7.677232079248114, + "language_loss": 0.84240043, + "learning_rate": 9.147153390061788e-07, + "loss": 0.85795683, + "num_input_tokens_seen": 248430190, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.26342773, + "step": 11511, + "time_per_iteration": 2.764129877090454 + }, + { + "auxiliary_loss_clip": 0.01308477, + "auxiliary_loss_mlp": 0.00259838, + "balance_loss_clip": 1.08615541, + "balance_loss_mlp": 0.23200309, + "epoch": 0.692138884713663, + "flos": 29023291382400.0, + "grad_norm": 3.126908786534533, + "language_loss": 0.71768296, + "learning_rate": 9.143882258445184e-07, + "loss": 0.73336613, + "num_input_tokens_seen": 248450830, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.27844238, + "step": 11512, + "time_per_iteration": 2.688924551010132 + }, + { + "auxiliary_loss_clip": 0.01309872, + "auxiliary_loss_mlp": 0.00265953, + "balance_loss_clip": 1.08624554, + "balance_loss_mlp": 0.24007289, + "epoch": 0.6921990079663309, + "flos": 14757849976320.0, + "grad_norm": 2.71942559268146, + "language_loss": 0.92555869, + "learning_rate": 9.140611538493666e-07, + "loss": 0.9413169, + "num_input_tokens_seen": 248468585, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.25878906, + "step": 11513, + "time_per_iteration": 2.6588733196258545 + }, + { + "auxiliary_loss_clip": 0.0128917, + "auxiliary_loss_mlp": 0.00237328, + "balance_loss_clip": 1.07374787, + "balance_loss_mlp": 0.21498813, + "epoch": 0.692259131218999, + "flos": 23841848643840.0, + "grad_norm": 377.7788695046926, + "language_loss": 0.83747172, + "learning_rate": 9.137341230331233e-07, + "loss": 0.85273671, + "num_input_tokens_seen": 248490535, + "router_z_loss_clip": 2.15527344, + "router_z_loss_mlp": 0.22363281, + "step": 11514, + "time_per_iteration": 2.6802823543548584 + }, + { + "auxiliary_loss_clip": 0.0132334, + "auxiliary_loss_mlp": 0.00257487, + "balance_loss_clip": 1.08943009, + "balance_loss_mlp": 0.23084328, + "epoch": 0.6923192544716669, + "flos": 19135073157120.0, + "grad_norm": 85.9287146165235, + "language_loss": 0.837098, + "learning_rate": 9.134071334081907e-07, + "loss": 0.85290629, + "num_input_tokens_seen": 248508575, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.26611328, + "step": 11515, + "time_per_iteration": 2.669571876525879 + }, + { + "auxiliary_loss_clip": 0.01273745, + "auxiliary_loss_mlp": 0.00229127, + "balance_loss_clip": 1.06588495, + "balance_loss_mlp": 0.2062266, + "epoch": 0.6923793777243349, + "flos": 28074639237120.0, + "grad_norm": 158.26266522000012, + "language_loss": 0.62308425, + "learning_rate": 9.130801849869694e-07, + "loss": 0.6381129, + "num_input_tokens_seen": 248527025, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.22900391, + "step": 11516, + "time_per_iteration": 2.690290689468384 + }, + { + "auxiliary_loss_clip": 0.0128466, + "auxiliary_loss_mlp": 0.0027538, + "balance_loss_clip": 1.07294619, + "balance_loss_mlp": 0.25063232, + "epoch": 0.6924395009770029, + "flos": 16581250033920.0, + "grad_norm": 7.2642672588330255, + "language_loss": 0.82342666, + "learning_rate": 9.127532777818557e-07, + "loss": 0.83902717, + "num_input_tokens_seen": 248544275, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.24755859, + "step": 11517, + "time_per_iteration": 2.6340889930725098 + }, + { + "auxiliary_loss_clip": 0.01310751, + "auxiliary_loss_mlp": 0.00274762, + "balance_loss_clip": 1.08347082, + "balance_loss_mlp": 0.2475944, + "epoch": 0.6924996242296708, + "flos": 16655297921280.0, + "grad_norm": 1297.328809777429, + "language_loss": 0.83356225, + "learning_rate": 9.124264118052465e-07, + "loss": 0.84941733, + "num_input_tokens_seen": 248561870, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.27197266, + "step": 11518, + "time_per_iteration": 2.609740734100342 + }, + { + "auxiliary_loss_clip": 0.01323837, + "auxiliary_loss_mlp": 0.00284965, + "balance_loss_clip": 1.08927631, + "balance_loss_mlp": 0.25659341, + "epoch": 0.6925597474823388, + "flos": 34754167532160.0, + "grad_norm": 14.075376235934375, + "language_loss": 0.72459018, + "learning_rate": 9.120995870695376e-07, + "loss": 0.74067819, + "num_input_tokens_seen": 248588190, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.28344727, + "step": 11519, + "time_per_iteration": 2.8003461360931396 + }, + { + "auxiliary_loss_clip": 0.01293514, + "auxiliary_loss_mlp": 0.00272941, + "balance_loss_clip": 1.07369697, + "balance_loss_mlp": 0.24825232, + "epoch": 0.6926198707350067, + "flos": 21871717528320.0, + "grad_norm": 19.650124046436865, + "language_loss": 0.70870024, + "learning_rate": 9.117728035871212e-07, + "loss": 0.72436476, + "num_input_tokens_seen": 248606460, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.24694824, + "step": 11520, + "time_per_iteration": 2.696540594100952 + }, + { + "auxiliary_loss_clip": 0.01318114, + "auxiliary_loss_mlp": 0.00291636, + "balance_loss_clip": 1.08332229, + "balance_loss_mlp": 0.26209545, + "epoch": 0.6926799939876748, + "flos": 13006271162880.0, + "grad_norm": 100.2489245075827, + "language_loss": 0.85879612, + "learning_rate": 9.114460613703887e-07, + "loss": 0.87489367, + "num_input_tokens_seen": 248623715, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.29541016, + "step": 11521, + "time_per_iteration": 2.6229183673858643 + }, + { + "auxiliary_loss_clip": 0.01295731, + "auxiliary_loss_mlp": 0.00282723, + "balance_loss_clip": 1.07679379, + "balance_loss_mlp": 0.25777262, + "epoch": 0.6927401172403427, + "flos": 16761234107520.0, + "grad_norm": 85.52897290668318, + "language_loss": 0.89349103, + "learning_rate": 9.111193604317304e-07, + "loss": 0.90927553, + "num_input_tokens_seen": 248640575, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.24951172, + "step": 11522, + "time_per_iteration": 2.7441492080688477 + }, + { + "auxiliary_loss_clip": 0.01285943, + "auxiliary_loss_mlp": 0.00285793, + "balance_loss_clip": 1.07009864, + "balance_loss_mlp": 0.26010299, + "epoch": 0.6928002404930107, + "flos": 25705648523520.0, + "grad_norm": 3.1386390383044516, + "language_loss": 0.81322896, + "learning_rate": 9.107927007835361e-07, + "loss": 0.82894635, + "num_input_tokens_seen": 248663535, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.25708008, + "step": 11523, + "time_per_iteration": 2.7605044841766357 + }, + { + "auxiliary_loss_clip": 0.01283857, + "auxiliary_loss_mlp": 0.00283119, + "balance_loss_clip": 1.07013512, + "balance_loss_mlp": 0.25901496, + "epoch": 0.6928603637456786, + "flos": 18588261438720.0, + "grad_norm": 22.09733384874968, + "language_loss": 0.74744618, + "learning_rate": 9.104660824381915e-07, + "loss": 0.76311594, + "num_input_tokens_seen": 248681125, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.24133301, + "step": 11524, + "time_per_iteration": 2.702228307723999 + }, + { + "auxiliary_loss_clip": 0.01321411, + "auxiliary_loss_mlp": 0.00268831, + "balance_loss_clip": 1.09485483, + "balance_loss_mlp": 0.24236678, + "epoch": 0.6929204869983466, + "flos": 22200874784640.0, + "grad_norm": 911.1556551453043, + "language_loss": 0.73478997, + "learning_rate": 9.101395054080815e-07, + "loss": 0.75069237, + "num_input_tokens_seen": 248700555, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.26477051, + "step": 11525, + "time_per_iteration": 2.675136089324951 + }, + { + "auxiliary_loss_clip": 0.01301136, + "auxiliary_loss_mlp": 0.00283094, + "balance_loss_clip": 1.07711029, + "balance_loss_mlp": 0.25701076, + "epoch": 0.6929806102510145, + "flos": 17894754576000.0, + "grad_norm": 64.10652842486756, + "language_loss": 0.80345589, + "learning_rate": 9.098129697055907e-07, + "loss": 0.81929815, + "num_input_tokens_seen": 248716095, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.26086426, + "step": 11526, + "time_per_iteration": 2.826284885406494 + }, + { + "auxiliary_loss_clip": 0.012791, + "auxiliary_loss_mlp": 0.00286869, + "balance_loss_clip": 1.06456995, + "balance_loss_mlp": 0.26266918, + "epoch": 0.6930407335036826, + "flos": 19755178577280.0, + "grad_norm": 13.27052534398285, + "language_loss": 0.82914507, + "learning_rate": 9.094864753431022e-07, + "loss": 0.84480482, + "num_input_tokens_seen": 248735330, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.24206543, + "step": 11527, + "time_per_iteration": 2.687304735183716 + }, + { + "auxiliary_loss_clip": 0.01301908, + "auxiliary_loss_mlp": 0.00281144, + "balance_loss_clip": 1.08015049, + "balance_loss_mlp": 0.25470322, + "epoch": 0.6931008567563505, + "flos": 21544248211200.0, + "grad_norm": 376.6884635237403, + "language_loss": 0.86169451, + "learning_rate": 9.091600223329952e-07, + "loss": 0.87752503, + "num_input_tokens_seen": 248754530, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.26464844, + "step": 11528, + "time_per_iteration": 2.6772499084472656 + }, + { + "auxiliary_loss_clip": 0.01265581, + "auxiliary_loss_mlp": 0.00255385, + "balance_loss_clip": 1.05433106, + "balance_loss_mlp": 0.2320078, + "epoch": 0.6931609800090185, + "flos": 26250018117120.0, + "grad_norm": 97.99514989415712, + "language_loss": 0.80891138, + "learning_rate": 9.088336106876491e-07, + "loss": 0.824121, + "num_input_tokens_seen": 248775825, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.23364258, + "step": 11529, + "time_per_iteration": 2.6894993782043457 + }, + { + "auxiliary_loss_clip": 0.01281166, + "auxiliary_loss_mlp": 0.00280905, + "balance_loss_clip": 1.06849575, + "balance_loss_mlp": 0.25667021, + "epoch": 0.6932211032616865, + "flos": 32343376366080.0, + "grad_norm": 2.5734612003580066, + "language_loss": 0.80186015, + "learning_rate": 9.085072404194436e-07, + "loss": 0.81748086, + "num_input_tokens_seen": 248796180, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.24243164, + "step": 11530, + "time_per_iteration": 2.770237684249878 + }, + { + "auxiliary_loss_clip": 0.01306827, + "auxiliary_loss_mlp": 0.00280672, + "balance_loss_clip": 1.08110881, + "balance_loss_mlp": 0.25328997, + "epoch": 0.6932812265143544, + "flos": 22049079909120.0, + "grad_norm": 3.8436117916387373, + "language_loss": 0.85055965, + "learning_rate": 9.081809115407513e-07, + "loss": 0.86643469, + "num_input_tokens_seen": 248814735, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.27355957, + "step": 11531, + "time_per_iteration": 2.6862869262695312 + }, + { + "auxiliary_loss_clip": 0.01287135, + "auxiliary_loss_mlp": 0.00261949, + "balance_loss_clip": 1.07043123, + "balance_loss_mlp": 0.23895399, + "epoch": 0.6933413497670224, + "flos": 26256626219520.0, + "grad_norm": 12.230624014864734, + "language_loss": 0.75474524, + "learning_rate": 9.078546240639484e-07, + "loss": 0.77023613, + "num_input_tokens_seen": 248839140, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.2298584, + "step": 11532, + "time_per_iteration": 2.769974708557129 + }, + { + "auxiliary_loss_clip": 0.01304991, + "auxiliary_loss_mlp": 0.00286053, + "balance_loss_clip": 1.08096123, + "balance_loss_mlp": 0.25972, + "epoch": 0.6934014730196904, + "flos": 19573003774080.0, + "grad_norm": 10.123490041728155, + "language_loss": 0.74096489, + "learning_rate": 9.075283780014082e-07, + "loss": 0.75687534, + "num_input_tokens_seen": 248858300, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.26342773, + "step": 11533, + "time_per_iteration": 2.680494546890259 + }, + { + "auxiliary_loss_clip": 0.01300122, + "auxiliary_loss_mlp": 0.00290187, + "balance_loss_clip": 1.07447779, + "balance_loss_mlp": 0.26434252, + "epoch": 0.6934615962723584, + "flos": 22119249127680.0, + "grad_norm": 16.665870622920476, + "language_loss": 0.70214134, + "learning_rate": 9.072021733655007e-07, + "loss": 0.71804446, + "num_input_tokens_seen": 248876310, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.25866699, + "step": 11534, + "time_per_iteration": 2.6382734775543213 + }, + { + "auxiliary_loss_clip": 0.01312722, + "auxiliary_loss_mlp": 0.00276576, + "balance_loss_clip": 1.08582258, + "balance_loss_mlp": 0.24992032, + "epoch": 0.6935217195250263, + "flos": 21360816432000.0, + "grad_norm": 19.84737164819511, + "language_loss": 0.81443524, + "learning_rate": 9.068760101685971e-07, + "loss": 0.83032823, + "num_input_tokens_seen": 248895650, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.26660156, + "step": 11535, + "time_per_iteration": 2.70357608795166 + }, + { + "auxiliary_loss_clip": 0.01203329, + "auxiliary_loss_mlp": 0.00129336, + "balance_loss_clip": 1.05727029, + "balance_loss_mlp": 0.12089604, + "epoch": 0.6935818427776943, + "flos": 64063813115520.0, + "grad_norm": 0.70360812406411, + "language_loss": 0.58621407, + "learning_rate": 9.065498884230638e-07, + "loss": 0.59954071, + "num_input_tokens_seen": 248963920, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.08447266, + "step": 11536, + "time_per_iteration": 3.2685277462005615 + }, + { + "auxiliary_loss_clip": 0.01315174, + "auxiliary_loss_mlp": 0.00287459, + "balance_loss_clip": 1.08594334, + "balance_loss_mlp": 0.25982636, + "epoch": 0.6936419660303622, + "flos": 20302564913280.0, + "grad_norm": 9.723397091747158, + "language_loss": 0.79783762, + "learning_rate": 9.062238081412692e-07, + "loss": 0.81386387, + "num_input_tokens_seen": 248983380, + "router_z_loss_clip": 2.29394531, + "router_z_loss_mlp": 0.27624512, + "step": 11537, + "time_per_iteration": 2.682908058166504 + }, + { + "auxiliary_loss_clip": 0.01206323, + "auxiliary_loss_mlp": 0.00111312, + "balance_loss_clip": 1.06132936, + "balance_loss_mlp": 0.10339644, + "epoch": 0.6937020892830302, + "flos": 67182581347200.0, + "grad_norm": 0.7723814349692734, + "language_loss": 0.55247545, + "learning_rate": 9.058977693355767e-07, + "loss": 0.56565177, + "num_input_tokens_seen": 249044680, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.07910156, + "step": 11538, + "time_per_iteration": 3.1421449184417725 + }, + { + "auxiliary_loss_clip": 0.01265707, + "auxiliary_loss_mlp": 0.00246089, + "balance_loss_clip": 1.05723953, + "balance_loss_mlp": 0.22438097, + "epoch": 0.6937622125356981, + "flos": 23878190229120.0, + "grad_norm": 140.7407802381563, + "language_loss": 0.82710326, + "learning_rate": 9.055717720183505e-07, + "loss": 0.84222126, + "num_input_tokens_seen": 249061060, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.21716309, + "step": 11539, + "time_per_iteration": 2.6570234298706055 + }, + { + "auxiliary_loss_clip": 0.0127501, + "auxiliary_loss_mlp": 0.0027453, + "balance_loss_clip": 1.05974507, + "balance_loss_mlp": 0.25052142, + "epoch": 0.6938223357883662, + "flos": 28730619365760.0, + "grad_norm": 36.40069699376993, + "language_loss": 0.7194488, + "learning_rate": 9.05245816201953e-07, + "loss": 0.73494422, + "num_input_tokens_seen": 249081430, + "router_z_loss_clip": 2.15527344, + "router_z_loss_mlp": 0.2401123, + "step": 11540, + "time_per_iteration": 2.7153890132904053 + }, + { + "auxiliary_loss_clip": 0.01284998, + "auxiliary_loss_mlp": 0.00265338, + "balance_loss_clip": 1.07114089, + "balance_loss_mlp": 0.24018469, + "epoch": 0.6938824590410341, + "flos": 28655027193600.0, + "grad_norm": 31.679835958077117, + "language_loss": 0.92464095, + "learning_rate": 9.049199018987437e-07, + "loss": 0.9401443, + "num_input_tokens_seen": 249103020, + "router_z_loss_clip": 2.13964844, + "router_z_loss_mlp": 0.25158691, + "step": 11541, + "time_per_iteration": 4.12665581703186 + }, + { + "auxiliary_loss_clip": 0.01292333, + "auxiliary_loss_mlp": 0.00288158, + "balance_loss_clip": 1.07033074, + "balance_loss_mlp": 0.26273036, + "epoch": 0.6939425822937021, + "flos": 18983062800000.0, + "grad_norm": 4.500303178715709, + "language_loss": 0.91832942, + "learning_rate": 9.04594029121081e-07, + "loss": 0.93413436, + "num_input_tokens_seen": 249120810, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.25415039, + "step": 11542, + "time_per_iteration": 2.6150009632110596 + }, + { + "auxiliary_loss_clip": 0.01313657, + "auxiliary_loss_mlp": 0.00298177, + "balance_loss_clip": 1.08547032, + "balance_loss_mlp": 0.27019882, + "epoch": 0.6940027055463701, + "flos": 23075838178560.0, + "grad_norm": 9.674469223226875, + "language_loss": 0.82523537, + "learning_rate": 9.04268197881323e-07, + "loss": 0.84135377, + "num_input_tokens_seen": 249138050, + "router_z_loss_clip": 2.28320312, + "router_z_loss_mlp": 0.2800293, + "step": 11543, + "time_per_iteration": 4.138035774230957 + }, + { + "auxiliary_loss_clip": 0.01287896, + "auxiliary_loss_mlp": 0.00272683, + "balance_loss_clip": 1.06964254, + "balance_loss_mlp": 0.24726713, + "epoch": 0.694062828799038, + "flos": 18186564666240.0, + "grad_norm": 8.504606202653616, + "language_loss": 0.81592315, + "learning_rate": 9.039424081918241e-07, + "loss": 0.8315289, + "num_input_tokens_seen": 249155570, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.25427246, + "step": 11544, + "time_per_iteration": 2.66316819190979 + }, + { + "auxiliary_loss_clip": 0.01287591, + "auxiliary_loss_mlp": 0.00278593, + "balance_loss_clip": 1.06696463, + "balance_loss_mlp": 0.25456005, + "epoch": 0.694122952051706, + "flos": 17821532701440.0, + "grad_norm": 38.40593630046677, + "language_loss": 0.79157138, + "learning_rate": 9.036166600649388e-07, + "loss": 0.80723321, + "num_input_tokens_seen": 249172960, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.24035645, + "step": 11545, + "time_per_iteration": 2.6780457496643066 + }, + { + "auxiliary_loss_clip": 0.01274024, + "auxiliary_loss_mlp": 0.00243569, + "balance_loss_clip": 1.06284881, + "balance_loss_mlp": 0.22001363, + "epoch": 0.694183075304374, + "flos": 21215306436480.0, + "grad_norm": 11.921560979595178, + "language_loss": 0.8603459, + "learning_rate": 9.0329095351302e-07, + "loss": 0.87552184, + "num_input_tokens_seen": 249192450, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.23535156, + "step": 11546, + "time_per_iteration": 4.195306777954102 + }, + { + "auxiliary_loss_clip": 0.0126741, + "auxiliary_loss_mlp": 0.00288273, + "balance_loss_clip": 1.05986381, + "balance_loss_mlp": 0.2655876, + "epoch": 0.694243198557042, + "flos": 24060508686720.0, + "grad_norm": 11.416251023040264, + "language_loss": 0.84371287, + "learning_rate": 9.029652885484194e-07, + "loss": 0.85926974, + "num_input_tokens_seen": 249214320, + "router_z_loss_clip": 2.07421875, + "router_z_loss_mlp": 0.22692871, + "step": 11547, + "time_per_iteration": 2.754603385925293 + }, + { + "auxiliary_loss_clip": 0.01265616, + "auxiliary_loss_mlp": 0.00265915, + "balance_loss_clip": 1.05521727, + "balance_loss_mlp": 0.24283652, + "epoch": 0.6943033218097099, + "flos": 21141869080320.0, + "grad_norm": 11.7015599488726, + "language_loss": 0.89103782, + "learning_rate": 9.026396651834834e-07, + "loss": 0.90635312, + "num_input_tokens_seen": 249230925, + "router_z_loss_clip": 2.10253906, + "router_z_loss_mlp": 0.23071289, + "step": 11548, + "time_per_iteration": 2.7024788856506348 + }, + { + "auxiliary_loss_clip": 0.01168362, + "auxiliary_loss_mlp": 0.0018477, + "balance_loss_clip": 1.02625871, + "balance_loss_mlp": 0.17404091, + "epoch": 0.6943634450623779, + "flos": 57812015975040.0, + "grad_norm": 0.6778900724852694, + "language_loss": 0.52989888, + "learning_rate": 9.023140834305613e-07, + "loss": 0.54343021, + "num_input_tokens_seen": 249293975, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.10742188, + "step": 11549, + "time_per_iteration": 4.53424334526062 + }, + { + "auxiliary_loss_clip": 0.01289753, + "auxiliary_loss_mlp": 0.00286806, + "balance_loss_clip": 1.06772852, + "balance_loss_mlp": 0.26010299, + "epoch": 0.6944235683150458, + "flos": 30590684231040.0, + "grad_norm": 16.492252485559327, + "language_loss": 0.80389428, + "learning_rate": 9.01988543302e-07, + "loss": 0.81965989, + "num_input_tokens_seen": 249315285, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.26708984, + "step": 11550, + "time_per_iteration": 2.75132155418396 + }, + { + "auxiliary_loss_clip": 0.01300146, + "auxiliary_loss_mlp": 0.00264391, + "balance_loss_clip": 1.07709289, + "balance_loss_mlp": 0.23833185, + "epoch": 0.6944836915677138, + "flos": 19719447523200.0, + "grad_norm": 159.66552169303074, + "language_loss": 0.82757831, + "learning_rate": 9.016630448101425e-07, + "loss": 0.84322369, + "num_input_tokens_seen": 249333505, + "router_z_loss_clip": 2.22949219, + "router_z_loss_mlp": 0.26049805, + "step": 11551, + "time_per_iteration": 2.67439603805542 + }, + { + "auxiliary_loss_clip": 0.0128735, + "auxiliary_loss_mlp": 0.00279959, + "balance_loss_clip": 1.07017446, + "balance_loss_mlp": 0.2549485, + "epoch": 0.6945438148203817, + "flos": 24863579009280.0, + "grad_norm": 8.868287160742073, + "language_loss": 0.9062829, + "learning_rate": 9.01337587967333e-07, + "loss": 0.921956, + "num_input_tokens_seen": 249354180, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.25, + "step": 11552, + "time_per_iteration": 2.7214698791503906 + }, + { + "auxiliary_loss_clip": 0.01272683, + "auxiliary_loss_mlp": 0.00268299, + "balance_loss_clip": 1.06061316, + "balance_loss_mlp": 0.2441356, + "epoch": 0.6946039380730498, + "flos": 33326646243840.0, + "grad_norm": 26.468980376466835, + "language_loss": 0.74093509, + "learning_rate": 9.010121727859117e-07, + "loss": 0.75634491, + "num_input_tokens_seen": 249377035, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.24182129, + "step": 11553, + "time_per_iteration": 2.7499876022338867 + }, + { + "auxiliary_loss_clip": 0.01324134, + "auxiliary_loss_mlp": 0.002553, + "balance_loss_clip": 1.09325838, + "balance_loss_mlp": 0.2276554, + "epoch": 0.6946640613257177, + "flos": 20850956830080.0, + "grad_norm": 8.778987608949201, + "language_loss": 0.86931819, + "learning_rate": 9.006867992782195e-07, + "loss": 0.88511252, + "num_input_tokens_seen": 249396155, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.27648926, + "step": 11554, + "time_per_iteration": 2.6144003868103027 + }, + { + "auxiliary_loss_clip": 0.01285157, + "auxiliary_loss_mlp": 0.00266138, + "balance_loss_clip": 1.06744599, + "balance_loss_mlp": 0.24130671, + "epoch": 0.6947241845783857, + "flos": 19354846521600.0, + "grad_norm": 34.265753755812305, + "language_loss": 0.82078964, + "learning_rate": 9.003614674565934e-07, + "loss": 0.83630258, + "num_input_tokens_seen": 249414555, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.24829102, + "step": 11555, + "time_per_iteration": 2.642665147781372 + }, + { + "auxiliary_loss_clip": 0.01282588, + "auxiliary_loss_mlp": 0.00278824, + "balance_loss_clip": 1.0636183, + "balance_loss_mlp": 0.25328934, + "epoch": 0.6947843078310536, + "flos": 27120240915840.0, + "grad_norm": 12.786429228100987, + "language_loss": 0.86000997, + "learning_rate": 9.000361773333705e-07, + "loss": 0.87562406, + "num_input_tokens_seen": 249433570, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.25500488, + "step": 11556, + "time_per_iteration": 2.6957011222839355 + }, + { + "auxiliary_loss_clip": 0.0129113, + "auxiliary_loss_mlp": 0.00271873, + "balance_loss_clip": 1.06796217, + "balance_loss_mlp": 0.24594478, + "epoch": 0.6948444310837216, + "flos": 28585109370240.0, + "grad_norm": 612.184728107731, + "language_loss": 0.71128988, + "learning_rate": 8.997109289208869e-07, + "loss": 0.72691989, + "num_input_tokens_seen": 249453735, + "router_z_loss_clip": 2.23144531, + "router_z_loss_mlp": 0.25915527, + "step": 11557, + "time_per_iteration": 2.7108612060546875 + }, + { + "auxiliary_loss_clip": 0.01277492, + "auxiliary_loss_mlp": 0.00260633, + "balance_loss_clip": 1.06406939, + "balance_loss_mlp": 0.23643313, + "epoch": 0.6949045543363896, + "flos": 15669262696320.0, + "grad_norm": 4.37388067809844, + "language_loss": 0.93432152, + "learning_rate": 8.993857222314752e-07, + "loss": 0.94970274, + "num_input_tokens_seen": 249470805, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.24194336, + "step": 11558, + "time_per_iteration": 2.6287479400634766 + }, + { + "auxiliary_loss_clip": 0.01296031, + "auxiliary_loss_mlp": 0.00253619, + "balance_loss_clip": 1.07310212, + "balance_loss_mlp": 0.22853768, + "epoch": 0.6949646775890576, + "flos": 23259413612160.0, + "grad_norm": 36.06726203903377, + "language_loss": 0.76844513, + "learning_rate": 8.990605572774664e-07, + "loss": 0.78394163, + "num_input_tokens_seen": 249491150, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.25061035, + "step": 11559, + "time_per_iteration": 2.70719313621521 + }, + { + "auxiliary_loss_clip": 0.01281943, + "auxiliary_loss_mlp": 0.00257209, + "balance_loss_clip": 1.06521749, + "balance_loss_mlp": 0.23385634, + "epoch": 0.6950248008417256, + "flos": 22382546797440.0, + "grad_norm": 15.232175611036396, + "language_loss": 0.87534386, + "learning_rate": 8.987354340711921e-07, + "loss": 0.89073539, + "num_input_tokens_seen": 249511560, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.23352051, + "step": 11560, + "time_per_iteration": 2.667597532272339 + }, + { + "auxiliary_loss_clip": 0.01277952, + "auxiliary_loss_mlp": 0.00229896, + "balance_loss_clip": 1.0630455, + "balance_loss_mlp": 0.20462318, + "epoch": 0.6950849240943935, + "flos": 23477355383040.0, + "grad_norm": 433.98191344196493, + "language_loss": 0.83370721, + "learning_rate": 8.9841035262498e-07, + "loss": 0.84878564, + "num_input_tokens_seen": 249531910, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.25280762, + "step": 11561, + "time_per_iteration": 2.6866745948791504 + }, + { + "auxiliary_loss_clip": 0.01290082, + "auxiliary_loss_mlp": 0.0027791, + "balance_loss_clip": 1.07095909, + "balance_loss_mlp": 0.25175595, + "epoch": 0.6951450473470615, + "flos": 17420554200960.0, + "grad_norm": 5.033589334401177, + "language_loss": 0.87175465, + "learning_rate": 8.980853129511577e-07, + "loss": 0.88743448, + "num_input_tokens_seen": 249550300, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.26159668, + "step": 11562, + "time_per_iteration": 2.628737688064575 + }, + { + "auxiliary_loss_clip": 0.01299517, + "auxiliary_loss_mlp": 0.00284839, + "balance_loss_clip": 1.07518899, + "balance_loss_mlp": 0.25749287, + "epoch": 0.6952051705997294, + "flos": 20485745297280.0, + "grad_norm": 44.8587900534598, + "language_loss": 0.7716617, + "learning_rate": 8.977603150620515e-07, + "loss": 0.78750515, + "num_input_tokens_seen": 249567740, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.2734375, + "step": 11563, + "time_per_iteration": 2.630321502685547 + }, + { + "auxiliary_loss_clip": 0.0127929, + "auxiliary_loss_mlp": 0.00266027, + "balance_loss_clip": 1.06417322, + "balance_loss_mlp": 0.24168466, + "epoch": 0.6952652938523974, + "flos": 13989541040640.0, + "grad_norm": 9.724799628348563, + "language_loss": 0.8135947, + "learning_rate": 8.974353589699846e-07, + "loss": 0.82904792, + "num_input_tokens_seen": 249582700, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.2434082, + "step": 11564, + "time_per_iteration": 2.659019947052002 + }, + { + "auxiliary_loss_clip": 0.01333262, + "auxiliary_loss_mlp": 0.00262656, + "balance_loss_clip": 1.09225178, + "balance_loss_mlp": 0.23323521, + "epoch": 0.6953254171050653, + "flos": 30953956429440.0, + "grad_norm": 49.5064919801215, + "language_loss": 0.80632931, + "learning_rate": 8.971104446872785e-07, + "loss": 0.82228851, + "num_input_tokens_seen": 249602920, + "router_z_loss_clip": 2.4140625, + "router_z_loss_mlp": 0.29418945, + "step": 11565, + "time_per_iteration": 2.7365176677703857 + }, + { + "auxiliary_loss_clip": 0.01099798, + "auxiliary_loss_mlp": 0.00097539, + "balance_loss_clip": 0.96075493, + "balance_loss_mlp": 0.08938526, + "epoch": 0.6953855403577334, + "flos": 61670257499520.0, + "grad_norm": 0.9686945605819843, + "language_loss": 0.57513863, + "learning_rate": 8.96785572226255e-07, + "loss": 0.58711201, + "num_input_tokens_seen": 249660400, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.08154297, + "step": 11566, + "time_per_iteration": 3.0226728916168213 + }, + { + "auxiliary_loss_clip": 0.01313287, + "auxiliary_loss_mlp": 0.00257598, + "balance_loss_clip": 1.08098078, + "balance_loss_mlp": 0.23032275, + "epoch": 0.6954456636104013, + "flos": 23039029716480.0, + "grad_norm": 9.856553428060108, + "language_loss": 0.85986859, + "learning_rate": 8.964607415992338e-07, + "loss": 0.87557745, + "num_input_tokens_seen": 249679335, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.27294922, + "step": 11567, + "time_per_iteration": 2.664696455001831 + }, + { + "auxiliary_loss_clip": 0.0130707, + "auxiliary_loss_mlp": 0.00268994, + "balance_loss_clip": 1.08264685, + "balance_loss_mlp": 0.24350691, + "epoch": 0.6955057868630693, + "flos": 23918518224000.0, + "grad_norm": 14.01189704846884, + "language_loss": 0.81899816, + "learning_rate": 8.961359528185313e-07, + "loss": 0.83475876, + "num_input_tokens_seen": 249701805, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.25476074, + "step": 11568, + "time_per_iteration": 2.829275608062744 + }, + { + "auxiliary_loss_clip": 0.01307153, + "auxiliary_loss_mlp": 0.0023615, + "balance_loss_clip": 1.07965851, + "balance_loss_mlp": 0.21245071, + "epoch": 0.6955659101157372, + "flos": 22594634651520.0, + "grad_norm": 9.20240329755627, + "language_loss": 0.79655039, + "learning_rate": 8.958112058964649e-07, + "loss": 0.81198347, + "num_input_tokens_seen": 249720550, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.23706055, + "step": 11569, + "time_per_iteration": 2.7245430946350098 + }, + { + "auxiliary_loss_clip": 0.0127551, + "auxiliary_loss_mlp": 0.002605, + "balance_loss_clip": 1.0589509, + "balance_loss_mlp": 0.23528756, + "epoch": 0.6956260333684052, + "flos": 24572523104640.0, + "grad_norm": 22.565521571802737, + "language_loss": 0.8379181, + "learning_rate": 8.954865008453471e-07, + "loss": 0.85327816, + "num_input_tokens_seen": 249740325, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.2520752, + "step": 11570, + "time_per_iteration": 2.708599805831909 + }, + { + "auxiliary_loss_clip": 0.01289488, + "auxiliary_loss_mlp": 0.00272472, + "balance_loss_clip": 1.07106805, + "balance_loss_mlp": 0.24648434, + "epoch": 0.6956861566210732, + "flos": 25846058787840.0, + "grad_norm": 8.536648881124693, + "language_loss": 0.81551456, + "learning_rate": 8.95161837677493e-07, + "loss": 0.8311342, + "num_input_tokens_seen": 249760570, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.2598877, + "step": 11571, + "time_per_iteration": 2.7924716472625732 + }, + { + "auxiliary_loss_clip": 0.01261003, + "auxiliary_loss_mlp": 0.00238926, + "balance_loss_clip": 1.05816984, + "balance_loss_mlp": 0.21640712, + "epoch": 0.6957462798737412, + "flos": 15301393557120.0, + "grad_norm": 646.7788151917458, + "language_loss": 0.82709289, + "learning_rate": 8.948372164052118e-07, + "loss": 0.84209216, + "num_input_tokens_seen": 249778290, + "router_z_loss_clip": 2.02832031, + "router_z_loss_mlp": 0.22521973, + "step": 11572, + "time_per_iteration": 2.7000858783721924 + }, + { + "auxiliary_loss_clip": 0.01274207, + "auxiliary_loss_mlp": 0.00264139, + "balance_loss_clip": 1.05746484, + "balance_loss_mlp": 0.23601753, + "epoch": 0.6958064031264092, + "flos": 36246830135040.0, + "grad_norm": 32.21237181777202, + "language_loss": 0.77558964, + "learning_rate": 8.94512637040814e-07, + "loss": 0.79097313, + "num_input_tokens_seen": 249800925, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.28112793, + "step": 11573, + "time_per_iteration": 2.8843913078308105 + }, + { + "auxiliary_loss_clip": 0.01311631, + "auxiliary_loss_mlp": 0.0027946, + "balance_loss_clip": 1.0895741, + "balance_loss_mlp": 0.25168464, + "epoch": 0.6958665263790771, + "flos": 19208725994880.0, + "grad_norm": 74.57106615894776, + "language_loss": 0.82960743, + "learning_rate": 8.941880995966095e-07, + "loss": 0.84551835, + "num_input_tokens_seen": 249820500, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.27783203, + "step": 11574, + "time_per_iteration": 2.7093420028686523 + }, + { + "auxiliary_loss_clip": 0.01286083, + "auxiliary_loss_mlp": 0.00223072, + "balance_loss_clip": 1.06405556, + "balance_loss_mlp": 0.19654818, + "epoch": 0.6959266496317451, + "flos": 21795838047360.0, + "grad_norm": 9926.656857561158, + "language_loss": 0.81979674, + "learning_rate": 8.938636040849014e-07, + "loss": 0.83488834, + "num_input_tokens_seen": 249839845, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.26538086, + "step": 11575, + "time_per_iteration": 2.7470943927764893 + }, + { + "auxiliary_loss_clip": 0.01294164, + "auxiliary_loss_mlp": 0.0026411, + "balance_loss_clip": 1.0732286, + "balance_loss_mlp": 0.23714501, + "epoch": 0.695986772884413, + "flos": 20558248899840.0, + "grad_norm": 21.37178932750013, + "language_loss": 0.87346935, + "learning_rate": 8.935391505179966e-07, + "loss": 0.88905215, + "num_input_tokens_seen": 249857400, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.26989746, + "step": 11576, + "time_per_iteration": 2.641602039337158 + }, + { + "auxiliary_loss_clip": 0.01316012, + "auxiliary_loss_mlp": 0.0029725, + "balance_loss_clip": 1.07997561, + "balance_loss_mlp": 0.26962882, + "epoch": 0.696046896137081, + "flos": 14936217937920.0, + "grad_norm": 200.11487955809358, + "language_loss": 0.67937887, + "learning_rate": 8.932147389081985e-07, + "loss": 0.69551152, + "num_input_tokens_seen": 249871645, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.27624512, + "step": 11577, + "time_per_iteration": 2.6142749786376953 + }, + { + "auxiliary_loss_clip": 0.01286297, + "auxiliary_loss_mlp": 0.00260182, + "balance_loss_clip": 1.07233906, + "balance_loss_mlp": 0.23514792, + "epoch": 0.696107019389749, + "flos": 30740216549760.0, + "grad_norm": 44.69229931361354, + "language_loss": 0.81236827, + "learning_rate": 8.928903692678081e-07, + "loss": 0.82783306, + "num_input_tokens_seen": 249894215, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.25024414, + "step": 11578, + "time_per_iteration": 2.768449068069458 + }, + { + "auxiliary_loss_clip": 0.01297751, + "auxiliary_loss_mlp": 0.00259658, + "balance_loss_clip": 1.0767889, + "balance_loss_mlp": 0.23345554, + "epoch": 0.696167142642417, + "flos": 20776729374720.0, + "grad_norm": 1.976882549499725, + "language_loss": 0.88561594, + "learning_rate": 8.925660416091254e-07, + "loss": 0.90119004, + "num_input_tokens_seen": 249912850, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.26208496, + "step": 11579, + "time_per_iteration": 2.6877195835113525 + }, + { + "auxiliary_loss_clip": 0.01296159, + "auxiliary_loss_mlp": 0.00262058, + "balance_loss_clip": 1.07404017, + "balance_loss_mlp": 0.23458028, + "epoch": 0.6962272658950849, + "flos": 22565152563840.0, + "grad_norm": 831.7374460481063, + "language_loss": 0.80521083, + "learning_rate": 8.922417559444502e-07, + "loss": 0.82079297, + "num_input_tokens_seen": 249932650, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.27478027, + "step": 11580, + "time_per_iteration": 2.710693836212158 + }, + { + "auxiliary_loss_clip": 0.01310908, + "auxiliary_loss_mlp": 0.00255526, + "balance_loss_clip": 1.08205593, + "balance_loss_mlp": 0.22753549, + "epoch": 0.6962873891477529, + "flos": 22200156512640.0, + "grad_norm": 13.644704821110995, + "language_loss": 0.7474578, + "learning_rate": 8.919175122860787e-07, + "loss": 0.76312214, + "num_input_tokens_seen": 249951205, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.27978516, + "step": 11581, + "time_per_iteration": 2.664170026779175 + }, + { + "auxiliary_loss_clip": 0.01280841, + "auxiliary_loss_mlp": 0.00243112, + "balance_loss_clip": 1.06221437, + "balance_loss_mlp": 0.2187573, + "epoch": 0.6963475124004208, + "flos": 12489695717760.0, + "grad_norm": 2.335573893877376, + "language_loss": 0.82963055, + "learning_rate": 8.915933106463056e-07, + "loss": 0.84487009, + "num_input_tokens_seen": 249967045, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.24328613, + "step": 11582, + "time_per_iteration": 2.640676975250244 + }, + { + "auxiliary_loss_clip": 0.01294347, + "auxiliary_loss_mlp": 0.0025676, + "balance_loss_clip": 1.07182372, + "balance_loss_mlp": 0.23172641, + "epoch": 0.6964076356530888, + "flos": 17165085696000.0, + "grad_norm": 9.739230513551702, + "language_loss": 0.77081335, + "learning_rate": 8.91269151037425e-07, + "loss": 0.78632438, + "num_input_tokens_seen": 249984565, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.25036621, + "step": 11583, + "time_per_iteration": 4.093315839767456 + }, + { + "auxiliary_loss_clip": 0.012916, + "auxiliary_loss_mlp": 0.00235724, + "balance_loss_clip": 1.07445979, + "balance_loss_mlp": 0.20997484, + "epoch": 0.6964677589057569, + "flos": 19937317466880.0, + "grad_norm": 7.3255550116584605, + "language_loss": 0.90564007, + "learning_rate": 8.909450334717301e-07, + "loss": 0.92091334, + "num_input_tokens_seen": 250004235, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.25756836, + "step": 11584, + "time_per_iteration": 2.695594310760498 + }, + { + "auxiliary_loss_clip": 0.01318985, + "auxiliary_loss_mlp": 0.00255685, + "balance_loss_clip": 1.09014976, + "balance_loss_mlp": 0.22753954, + "epoch": 0.6965278821584248, + "flos": 22784064001920.0, + "grad_norm": 122.1428154674752, + "language_loss": 0.89815623, + "learning_rate": 8.906209579615107e-07, + "loss": 0.913903, + "num_input_tokens_seen": 250017645, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.28112793, + "step": 11585, + "time_per_iteration": 4.143929719924927 + }, + { + "auxiliary_loss_clip": 0.01286089, + "auxiliary_loss_mlp": 0.00238227, + "balance_loss_clip": 1.07045984, + "balance_loss_mlp": 0.21324088, + "epoch": 0.6965880054110928, + "flos": 20047563285120.0, + "grad_norm": 2.3369090410392466, + "language_loss": 0.86410159, + "learning_rate": 8.90296924519055e-07, + "loss": 0.8793447, + "num_input_tokens_seen": 250037640, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.25012207, + "step": 11586, + "time_per_iteration": 2.645862102508545 + }, + { + "auxiliary_loss_clip": 0.01269011, + "auxiliary_loss_mlp": 0.002872, + "balance_loss_clip": 1.05791426, + "balance_loss_mlp": 0.26068833, + "epoch": 0.6966481286637607, + "flos": 21908238681600.0, + "grad_norm": 5417.557564947026, + "language_loss": 0.86292994, + "learning_rate": 8.899729331566519e-07, + "loss": 0.87849212, + "num_input_tokens_seen": 250056490, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.26513672, + "step": 11587, + "time_per_iteration": 2.7160658836364746 + }, + { + "auxiliary_loss_clip": 0.01275381, + "auxiliary_loss_mlp": 0.0022482, + "balance_loss_clip": 1.06502223, + "balance_loss_mlp": 0.20012023, + "epoch": 0.6967082519164287, + "flos": 15633172506240.0, + "grad_norm": 28.13757085737976, + "language_loss": 0.8198297, + "learning_rate": 8.896489838865857e-07, + "loss": 0.83483171, + "num_input_tokens_seen": 250074285, + "router_z_loss_clip": 2.10644531, + "router_z_loss_mlp": 0.24694824, + "step": 11588, + "time_per_iteration": 4.143243312835693 + }, + { + "auxiliary_loss_clip": 0.01286577, + "auxiliary_loss_mlp": 0.00246185, + "balance_loss_clip": 1.06497145, + "balance_loss_mlp": 0.22102007, + "epoch": 0.6967683751690966, + "flos": 24024598064640.0, + "grad_norm": 16.86093753578077, + "language_loss": 0.83650494, + "learning_rate": 8.893250767211413e-07, + "loss": 0.85183263, + "num_input_tokens_seen": 250093350, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.25170898, + "step": 11589, + "time_per_iteration": 2.705061674118042 + }, + { + "auxiliary_loss_clip": 0.01309524, + "auxiliary_loss_mlp": 0.00264188, + "balance_loss_clip": 1.08133912, + "balance_loss_mlp": 0.23746106, + "epoch": 0.6968284984217646, + "flos": 31024700265600.0, + "grad_norm": 42.006055150064, + "language_loss": 0.71743941, + "learning_rate": 8.890012116726012e-07, + "loss": 0.73317659, + "num_input_tokens_seen": 250114170, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.26745605, + "step": 11590, + "time_per_iteration": 2.799466133117676 + }, + { + "auxiliary_loss_clip": 0.01112646, + "auxiliary_loss_mlp": 0.00087347, + "balance_loss_clip": 0.97337615, + "balance_loss_mlp": 0.07938349, + "epoch": 0.6968886216744326, + "flos": 67622990002560.0, + "grad_norm": 110.62832568099462, + "language_loss": 0.60842067, + "learning_rate": 8.88677388753248e-07, + "loss": 0.62042063, + "num_input_tokens_seen": 250178250, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.07958984, + "step": 11591, + "time_per_iteration": 3.2509963512420654 + }, + { + "auxiliary_loss_clip": 0.01311449, + "auxiliary_loss_mlp": 0.00241282, + "balance_loss_clip": 1.08770108, + "balance_loss_mlp": 0.2136018, + "epoch": 0.6969487449271006, + "flos": 24863686750080.0, + "grad_norm": 10.892331944954357, + "language_loss": 0.78043956, + "learning_rate": 8.883536079753582e-07, + "loss": 0.7959668, + "num_input_tokens_seen": 250198420, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.27709961, + "step": 11592, + "time_per_iteration": 4.15174412727356 + }, + { + "auxiliary_loss_clip": 0.01297378, + "auxiliary_loss_mlp": 0.00238522, + "balance_loss_clip": 1.08008456, + "balance_loss_mlp": 0.21355975, + "epoch": 0.6970088681797685, + "flos": 28767858791040.0, + "grad_norm": 5.046033818867354, + "language_loss": 0.70891917, + "learning_rate": 8.880298693512109e-07, + "loss": 0.72427821, + "num_input_tokens_seen": 250220650, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.25, + "step": 11593, + "time_per_iteration": 2.825120210647583 + }, + { + "auxiliary_loss_clip": 0.01290676, + "auxiliary_loss_mlp": 0.00232736, + "balance_loss_clip": 1.07197762, + "balance_loss_mlp": 0.20736852, + "epoch": 0.6970689914324365, + "flos": 27308556944640.0, + "grad_norm": 7.171725400821005, + "language_loss": 0.62179899, + "learning_rate": 8.877061728930832e-07, + "loss": 0.6370331, + "num_input_tokens_seen": 250241750, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.25378418, + "step": 11594, + "time_per_iteration": 2.7425458431243896 + }, + { + "auxiliary_loss_clip": 0.01282157, + "auxiliary_loss_mlp": 0.00223667, + "balance_loss_clip": 1.06572127, + "balance_loss_mlp": 0.19840702, + "epoch": 0.6971291146851044, + "flos": 19136258305920.0, + "grad_norm": 52.191303744747515, + "language_loss": 0.85527509, + "learning_rate": 8.87382518613248e-07, + "loss": 0.87033331, + "num_input_tokens_seen": 250259445, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.25256348, + "step": 11595, + "time_per_iteration": 2.747504234313965 + }, + { + "auxiliary_loss_clip": 0.01321353, + "auxiliary_loss_mlp": 0.00256172, + "balance_loss_clip": 1.08694196, + "balance_loss_mlp": 0.22796667, + "epoch": 0.6971892379377724, + "flos": 14610508387200.0, + "grad_norm": 37.43101383925927, + "language_loss": 0.81434894, + "learning_rate": 8.870589065239793e-07, + "loss": 0.83012414, + "num_input_tokens_seen": 250275640, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.28198242, + "step": 11596, + "time_per_iteration": 2.595853090286255 + }, + { + "auxiliary_loss_clip": 0.01311298, + "auxiliary_loss_mlp": 0.0025316, + "balance_loss_clip": 1.0851264, + "balance_loss_mlp": 0.22668314, + "epoch": 0.6972493611904405, + "flos": 22307457415680.0, + "grad_norm": 3.9015003814296065, + "language_loss": 0.83983612, + "learning_rate": 8.867353366375492e-07, + "loss": 0.85548067, + "num_input_tokens_seen": 250296435, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.26464844, + "step": 11597, + "time_per_iteration": 2.6793556213378906 + }, + { + "auxiliary_loss_clip": 0.01309016, + "auxiliary_loss_mlp": 0.00272803, + "balance_loss_clip": 1.08648598, + "balance_loss_mlp": 0.24577829, + "epoch": 0.6973094844431084, + "flos": 17420374632960.0, + "grad_norm": 9.832967467763396, + "language_loss": 0.82973194, + "learning_rate": 8.864118089662267e-07, + "loss": 0.84555018, + "num_input_tokens_seen": 250314035, + "router_z_loss_clip": 2.22363281, + "router_z_loss_mlp": 0.27001953, + "step": 11598, + "time_per_iteration": 2.7300751209259033 + }, + { + "auxiliary_loss_clip": 0.01333338, + "auxiliary_loss_mlp": 0.00260279, + "balance_loss_clip": 1.09470737, + "balance_loss_mlp": 0.23176399, + "epoch": 0.6973696076957764, + "flos": 27235370983680.0, + "grad_norm": 59.86397808374666, + "language_loss": 0.96971887, + "learning_rate": 8.860883235222791e-07, + "loss": 0.98565507, + "num_input_tokens_seen": 250332995, + "router_z_loss_clip": 2.38476562, + "router_z_loss_mlp": 0.28503418, + "step": 11599, + "time_per_iteration": 2.7195026874542236 + }, + { + "auxiliary_loss_clip": 0.0132864, + "auxiliary_loss_mlp": 0.00265254, + "balance_loss_clip": 1.0947274, + "balance_loss_mlp": 0.23627406, + "epoch": 0.6974297309484443, + "flos": 22018089450240.0, + "grad_norm": 4.286044615335168, + "language_loss": 0.79549825, + "learning_rate": 8.85764880317974e-07, + "loss": 0.81143719, + "num_input_tokens_seen": 250352120, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.28967285, + "step": 11600, + "time_per_iteration": 2.688734531402588 + }, + { + "auxiliary_loss_clip": 0.01297189, + "auxiliary_loss_mlp": 0.00258396, + "balance_loss_clip": 1.07651997, + "balance_loss_mlp": 0.23308802, + "epoch": 0.6974898542011123, + "flos": 28366449327360.0, + "grad_norm": 211.55427917820833, + "language_loss": 0.84414446, + "learning_rate": 8.854414793655771e-07, + "loss": 0.85970032, + "num_input_tokens_seen": 250371705, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.2532959, + "step": 11601, + "time_per_iteration": 2.730921983718872 + }, + { + "auxiliary_loss_clip": 0.01287664, + "auxiliary_loss_mlp": 0.00227612, + "balance_loss_clip": 1.07198644, + "balance_loss_mlp": 0.20331705, + "epoch": 0.6975499774537802, + "flos": 15232050351360.0, + "grad_norm": 38.751905237347, + "language_loss": 0.81477702, + "learning_rate": 8.851181206773508e-07, + "loss": 0.82992971, + "num_input_tokens_seen": 250390485, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.24291992, + "step": 11602, + "time_per_iteration": 2.6343679428100586 + }, + { + "auxiliary_loss_clip": 0.01313076, + "auxiliary_loss_mlp": 0.00253165, + "balance_loss_clip": 1.09008121, + "balance_loss_mlp": 0.22972879, + "epoch": 0.6976101007064482, + "flos": 22157422306560.0, + "grad_norm": 571.0638020979759, + "language_loss": 0.83622706, + "learning_rate": 8.847948042655567e-07, + "loss": 0.85188949, + "num_input_tokens_seen": 250407020, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.234375, + "step": 11603, + "time_per_iteration": 2.6371474266052246 + }, + { + "auxiliary_loss_clip": 0.01304961, + "auxiliary_loss_mlp": 0.00227883, + "balance_loss_clip": 1.08270741, + "balance_loss_mlp": 0.20184797, + "epoch": 0.6976702239591162, + "flos": 22273522041600.0, + "grad_norm": 4.570734110541665, + "language_loss": 0.7036798, + "learning_rate": 8.844715301424557e-07, + "loss": 0.71900821, + "num_input_tokens_seen": 250425880, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.26049805, + "step": 11604, + "time_per_iteration": 2.6659657955169678 + }, + { + "auxiliary_loss_clip": 0.01309819, + "auxiliary_loss_mlp": 0.00242048, + "balance_loss_clip": 1.08536172, + "balance_loss_mlp": 0.21447469, + "epoch": 0.6977303472117842, + "flos": 25848608653440.0, + "grad_norm": 19.412551594633, + "language_loss": 0.87725925, + "learning_rate": 8.841482983203057e-07, + "loss": 0.89277792, + "num_input_tokens_seen": 250442925, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.27575684, + "step": 11605, + "time_per_iteration": 2.6619906425476074 + }, + { + "auxiliary_loss_clip": 0.01305137, + "auxiliary_loss_mlp": 0.00232367, + "balance_loss_clip": 1.08265674, + "balance_loss_mlp": 0.20672509, + "epoch": 0.6977904704644521, + "flos": 20959586536320.0, + "grad_norm": 22.17364236194721, + "language_loss": 0.76789093, + "learning_rate": 8.838251088113638e-07, + "loss": 0.78326601, + "num_input_tokens_seen": 250461220, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.25646973, + "step": 11606, + "time_per_iteration": 2.6805639266967773 + }, + { + "auxiliary_loss_clip": 0.0131932, + "auxiliary_loss_mlp": 0.00226688, + "balance_loss_clip": 1.09146237, + "balance_loss_mlp": 0.20003247, + "epoch": 0.6978505937171201, + "flos": 22055041566720.0, + "grad_norm": 92.43521896483705, + "language_loss": 0.90048945, + "learning_rate": 8.835019616278856e-07, + "loss": 0.91594958, + "num_input_tokens_seen": 250480975, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.26647949, + "step": 11607, + "time_per_iteration": 2.6860251426696777 + }, + { + "auxiliary_loss_clip": 0.01321572, + "auxiliary_loss_mlp": 0.00273909, + "balance_loss_clip": 1.08936143, + "balance_loss_mlp": 0.247421, + "epoch": 0.697910716969788, + "flos": 20043720529920.0, + "grad_norm": 30227.619900663347, + "language_loss": 0.87079179, + "learning_rate": 8.831788567821265e-07, + "loss": 0.88674664, + "num_input_tokens_seen": 250497980, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.26477051, + "step": 11608, + "time_per_iteration": 2.7198586463928223 + }, + { + "auxiliary_loss_clip": 0.01292183, + "auxiliary_loss_mlp": 0.00240056, + "balance_loss_clip": 1.07176399, + "balance_loss_mlp": 0.21539119, + "epoch": 0.697970840222456, + "flos": 15888245961600.0, + "grad_norm": 15.220661875825252, + "language_loss": 0.99072599, + "learning_rate": 8.828557942863357e-07, + "loss": 1.00604844, + "num_input_tokens_seen": 250511910, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.2467041, + "step": 11609, + "time_per_iteration": 2.6847362518310547 + }, + { + "auxiliary_loss_clip": 0.01309718, + "auxiliary_loss_mlp": 0.00248966, + "balance_loss_clip": 1.08363056, + "balance_loss_mlp": 0.22216761, + "epoch": 0.698030963475124, + "flos": 21215629658880.0, + "grad_norm": 50.61888558893641, + "language_loss": 0.73246491, + "learning_rate": 8.82532774152765e-07, + "loss": 0.74805176, + "num_input_tokens_seen": 250531090, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.26831055, + "step": 11610, + "time_per_iteration": 2.711545944213867 + }, + { + "auxiliary_loss_clip": 0.01296018, + "auxiliary_loss_mlp": 0.00245495, + "balance_loss_clip": 1.07844853, + "balance_loss_mlp": 0.22053289, + "epoch": 0.698091086727792, + "flos": 33759728524800.0, + "grad_norm": 149.23390057467813, + "language_loss": 0.91233128, + "learning_rate": 8.822097963936643e-07, + "loss": 0.92774642, + "num_input_tokens_seen": 250551565, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.24951172, + "step": 11611, + "time_per_iteration": 2.860970973968506 + }, + { + "auxiliary_loss_clip": 0.01304627, + "auxiliary_loss_mlp": 0.00224007, + "balance_loss_clip": 1.08079839, + "balance_loss_mlp": 0.1991404, + "epoch": 0.69815120998046, + "flos": 15887850912000.0, + "grad_norm": 11.460937148735658, + "language_loss": 0.79710072, + "learning_rate": 8.818868610212793e-07, + "loss": 0.81238711, + "num_input_tokens_seen": 250569625, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.24829102, + "step": 11612, + "time_per_iteration": 2.748175621032715 + }, + { + "auxiliary_loss_clip": 0.01308347, + "auxiliary_loss_mlp": 0.00235734, + "balance_loss_clip": 1.08936858, + "balance_loss_mlp": 0.20955601, + "epoch": 0.6982113332331279, + "flos": 18947044437120.0, + "grad_norm": 2.747968249744425, + "language_loss": 0.88038969, + "learning_rate": 8.815639680478573e-07, + "loss": 0.89583051, + "num_input_tokens_seen": 250586960, + "router_z_loss_clip": 2.18457031, + "router_z_loss_mlp": 0.26184082, + "step": 11613, + "time_per_iteration": 2.705920457839966 + }, + { + "auxiliary_loss_clip": 0.01313769, + "auxiliary_loss_mlp": 0.00256016, + "balance_loss_clip": 1.08791995, + "balance_loss_mlp": 0.22886053, + "epoch": 0.6982714564857959, + "flos": 24389594115840.0, + "grad_norm": 11.166335424784496, + "language_loss": 0.83072996, + "learning_rate": 8.812411174856411e-07, + "loss": 0.8464278, + "num_input_tokens_seen": 250605080, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.27185059, + "step": 11614, + "time_per_iteration": 2.776323080062866 + }, + { + "auxiliary_loss_clip": 0.01296321, + "auxiliary_loss_mlp": 0.00258371, + "balance_loss_clip": 1.07846117, + "balance_loss_mlp": 0.23195384, + "epoch": 0.6983315797384638, + "flos": 20083725302400.0, + "grad_norm": 7.12871923323161, + "language_loss": 0.85466999, + "learning_rate": 8.809183093468746e-07, + "loss": 0.87021685, + "num_input_tokens_seen": 250623965, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.26428223, + "step": 11615, + "time_per_iteration": 2.706289052963257 + }, + { + "auxiliary_loss_clip": 0.01303919, + "auxiliary_loss_mlp": 0.00228384, + "balance_loss_clip": 1.08291698, + "balance_loss_mlp": 0.20290911, + "epoch": 0.6983917029911318, + "flos": 13512431664000.0, + "grad_norm": 18.529615515496587, + "language_loss": 0.80607045, + "learning_rate": 8.80595543643797e-07, + "loss": 0.82139337, + "num_input_tokens_seen": 250640675, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.25488281, + "step": 11616, + "time_per_iteration": 2.638949155807495 + }, + { + "auxiliary_loss_clip": 0.01295533, + "auxiliary_loss_mlp": 0.00211921, + "balance_loss_clip": 1.07847941, + "balance_loss_mlp": 0.18613623, + "epoch": 0.6984518262437998, + "flos": 22018412672640.0, + "grad_norm": 18.766983593462218, + "language_loss": 0.9188478, + "learning_rate": 8.802728203886487e-07, + "loss": 0.93392229, + "num_input_tokens_seen": 250660295, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.25769043, + "step": 11617, + "time_per_iteration": 2.7625527381896973 + }, + { + "auxiliary_loss_clip": 0.01333784, + "auxiliary_loss_mlp": 0.00267457, + "balance_loss_clip": 1.09932852, + "balance_loss_mlp": 0.23672503, + "epoch": 0.6985119494964678, + "flos": 18770615809920.0, + "grad_norm": 7.2074438760974715, + "language_loss": 0.71430898, + "learning_rate": 8.799501395936682e-07, + "loss": 0.73032141, + "num_input_tokens_seen": 250678155, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.30725098, + "step": 11618, + "time_per_iteration": 2.7730109691619873 + }, + { + "auxiliary_loss_clip": 0.01292729, + "auxiliary_loss_mlp": 0.00257858, + "balance_loss_clip": 1.07629216, + "balance_loss_mlp": 0.23108372, + "epoch": 0.6985720727491357, + "flos": 22382834106240.0, + "grad_norm": 3.47547254924256, + "language_loss": 0.91574353, + "learning_rate": 8.796275012710903e-07, + "loss": 0.93124938, + "num_input_tokens_seen": 250697230, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.26745605, + "step": 11619, + "time_per_iteration": 2.663734197616577 + }, + { + "auxiliary_loss_clip": 0.01313394, + "auxiliary_loss_mlp": 0.00234288, + "balance_loss_clip": 1.09086156, + "balance_loss_mlp": 0.20929028, + "epoch": 0.6986321960018037, + "flos": 39567884785920.0, + "grad_norm": 156.49808108863385, + "language_loss": 0.74839717, + "learning_rate": 8.793049054331494e-07, + "loss": 0.76387399, + "num_input_tokens_seen": 250719865, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.24987793, + "step": 11620, + "time_per_iteration": 2.812398910522461 + }, + { + "auxiliary_loss_clip": 0.01338664, + "auxiliary_loss_mlp": 0.00255896, + "balance_loss_clip": 1.10368967, + "balance_loss_mlp": 0.22813249, + "epoch": 0.6986923192544716, + "flos": 17967725055360.0, + "grad_norm": 4.396020378632075, + "language_loss": 0.83254671, + "learning_rate": 8.789823520920794e-07, + "loss": 0.84849226, + "num_input_tokens_seen": 250736565, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.27783203, + "step": 11621, + "time_per_iteration": 2.62652325630188 + }, + { + "auxiliary_loss_clip": 0.01322773, + "auxiliary_loss_mlp": 0.00253854, + "balance_loss_clip": 1.09412527, + "balance_loss_mlp": 0.22523215, + "epoch": 0.6987524425071396, + "flos": 25594325297280.0, + "grad_norm": 10.812802379644133, + "language_loss": 0.77497709, + "learning_rate": 8.7865984126011e-07, + "loss": 0.79074335, + "num_input_tokens_seen": 250757235, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.28625488, + "step": 11622, + "time_per_iteration": 2.696106195449829 + }, + { + "auxiliary_loss_clip": 0.01325198, + "auxiliary_loss_mlp": 0.0022108, + "balance_loss_clip": 1.0964669, + "balance_loss_mlp": 0.19406709, + "epoch": 0.6988125657598077, + "flos": 17530081747200.0, + "grad_norm": 3.523260552001237, + "language_loss": 0.70515937, + "learning_rate": 8.783373729494721e-07, + "loss": 0.72062218, + "num_input_tokens_seen": 250775585, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.2701416, + "step": 11623, + "time_per_iteration": 2.6642913818359375 + }, + { + "auxiliary_loss_clip": 0.01326026, + "auxiliary_loss_mlp": 0.00231306, + "balance_loss_clip": 1.08964455, + "balance_loss_mlp": 0.20214784, + "epoch": 0.6988726890124756, + "flos": 39165721136640.0, + "grad_norm": 50.029928844056094, + "language_loss": 0.68231297, + "learning_rate": 8.780149471723932e-07, + "loss": 0.69788623, + "num_input_tokens_seen": 250795725, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.29150391, + "step": 11624, + "time_per_iteration": 2.9272568225860596 + }, + { + "auxiliary_loss_clip": 0.01323179, + "auxiliary_loss_mlp": 0.00234155, + "balance_loss_clip": 1.09448612, + "balance_loss_mlp": 0.20635541, + "epoch": 0.6989328122651436, + "flos": 20193468330240.0, + "grad_norm": 12.017261367937008, + "language_loss": 0.85732478, + "learning_rate": 8.776925639411017e-07, + "loss": 0.8728981, + "num_input_tokens_seen": 250814555, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.27819824, + "step": 11625, + "time_per_iteration": 4.1252405643463135 + }, + { + "auxiliary_loss_clip": 0.01313187, + "auxiliary_loss_mlp": 0.00233468, + "balance_loss_clip": 1.08870339, + "balance_loss_mlp": 0.20743248, + "epoch": 0.6989929355178115, + "flos": 21834873152640.0, + "grad_norm": 7.300717445748576, + "language_loss": 0.76181597, + "learning_rate": 8.773702232678188e-07, + "loss": 0.77728248, + "num_input_tokens_seen": 250833105, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.26025391, + "step": 11626, + "time_per_iteration": 2.653622627258301 + }, + { + "auxiliary_loss_clip": 0.01310597, + "auxiliary_loss_mlp": 0.0022246, + "balance_loss_clip": 1.0850718, + "balance_loss_mlp": 0.19361147, + "epoch": 0.6990530587704795, + "flos": 26322880855680.0, + "grad_norm": 10.038082266061474, + "language_loss": 0.80559611, + "learning_rate": 8.770479251647697e-07, + "loss": 0.82092673, + "num_input_tokens_seen": 250852570, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.28869629, + "step": 11627, + "time_per_iteration": 4.113186359405518 + }, + { + "auxiliary_loss_clip": 0.01287764, + "auxiliary_loss_mlp": 0.00229214, + "balance_loss_clip": 1.07361078, + "balance_loss_mlp": 0.20257035, + "epoch": 0.6991131820231474, + "flos": 19828975069440.0, + "grad_norm": 213.34408602385665, + "language_loss": 0.70517671, + "learning_rate": 8.767256696441768e-07, + "loss": 0.72034645, + "num_input_tokens_seen": 250870500, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.26611328, + "step": 11628, + "time_per_iteration": 2.6422126293182373 + }, + { + "auxiliary_loss_clip": 0.01302693, + "auxiliary_loss_mlp": 0.00248859, + "balance_loss_clip": 1.07807577, + "balance_loss_mlp": 0.22152387, + "epoch": 0.6991733052758154, + "flos": 33984817102080.0, + "grad_norm": 97.65910319278507, + "language_loss": 0.76003712, + "learning_rate": 8.764034567182581e-07, + "loss": 0.77555269, + "num_input_tokens_seen": 250892745, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.27307129, + "step": 11629, + "time_per_iteration": 2.7795052528381348 + }, + { + "auxiliary_loss_clip": 0.01302595, + "auxiliary_loss_mlp": 0.0024338, + "balance_loss_clip": 1.08028042, + "balance_loss_mlp": 0.21531841, + "epoch": 0.6992334285284834, + "flos": 15633136592640.0, + "grad_norm": 44.998773626826214, + "language_loss": 0.79473895, + "learning_rate": 8.760812863992337e-07, + "loss": 0.81019866, + "num_input_tokens_seen": 250910225, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.28063965, + "step": 11630, + "time_per_iteration": 4.147046804428101 + }, + { + "auxiliary_loss_clip": 0.01305602, + "auxiliary_loss_mlp": 0.00238802, + "balance_loss_clip": 1.0842793, + "balance_loss_mlp": 0.21298155, + "epoch": 0.6992935517811514, + "flos": 21726279360000.0, + "grad_norm": 13.554491626142193, + "language_loss": 0.81130159, + "learning_rate": 8.757591586993196e-07, + "loss": 0.82674563, + "num_input_tokens_seen": 250929715, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.25830078, + "step": 11631, + "time_per_iteration": 2.6665046215057373 + }, + { + "auxiliary_loss_clip": 0.01353006, + "auxiliary_loss_mlp": 0.00249862, + "balance_loss_clip": 1.10736477, + "balance_loss_mlp": 0.2197497, + "epoch": 0.6993536750338193, + "flos": 20115254465280.0, + "grad_norm": 93.71521491821949, + "language_loss": 0.98277402, + "learning_rate": 8.7543707363073e-07, + "loss": 0.99880272, + "num_input_tokens_seen": 250944230, + "router_z_loss_clip": 2.45507812, + "router_z_loss_mlp": 0.30114746, + "step": 11632, + "time_per_iteration": 2.6804420948028564 + }, + { + "auxiliary_loss_clip": 0.01317002, + "auxiliary_loss_mlp": 0.00255881, + "balance_loss_clip": 1.09397316, + "balance_loss_mlp": 0.22911859, + "epoch": 0.6994137982864873, + "flos": 22010547594240.0, + "grad_norm": 9.87550507147234, + "language_loss": 0.86164057, + "learning_rate": 8.751150312056792e-07, + "loss": 0.8773694, + "num_input_tokens_seen": 250961865, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.26794434, + "step": 11633, + "time_per_iteration": 2.698359251022339 + }, + { + "auxiliary_loss_clip": 0.01333914, + "auxiliary_loss_mlp": 0.00246051, + "balance_loss_clip": 1.09615552, + "balance_loss_mlp": 0.21672554, + "epoch": 0.6994739215391552, + "flos": 25519020433920.0, + "grad_norm": 10.89829014589353, + "language_loss": 0.79708385, + "learning_rate": 8.747930314363794e-07, + "loss": 0.8128835, + "num_input_tokens_seen": 250982025, + "router_z_loss_clip": 2.37890625, + "router_z_loss_mlp": 0.29309082, + "step": 11634, + "time_per_iteration": 4.0990142822265625 + }, + { + "auxiliary_loss_clip": 0.0116623, + "auxiliary_loss_mlp": 0.00059865, + "balance_loss_clip": 1.02436411, + "balance_loss_mlp": 0.05252205, + "epoch": 0.6995340447918232, + "flos": 59128357691520.0, + "grad_norm": 0.6575285437797487, + "language_loss": 0.52785188, + "learning_rate": 8.744710743350412e-07, + "loss": 0.54011285, + "num_input_tokens_seen": 251046900, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.07324219, + "step": 11635, + "time_per_iteration": 3.3541340827941895 + }, + { + "auxiliary_loss_clip": 0.01321406, + "auxiliary_loss_mlp": 0.00232455, + "balance_loss_clip": 1.09496069, + "balance_loss_mlp": 0.20556137, + "epoch": 0.6995941680444913, + "flos": 17967832796160.0, + "grad_norm": 199.7813421496229, + "language_loss": 0.8705588, + "learning_rate": 8.741491599138726e-07, + "loss": 0.88609743, + "num_input_tokens_seen": 251065050, + "router_z_loss_clip": 2.265625, + "router_z_loss_mlp": 0.26916504, + "step": 11636, + "time_per_iteration": 2.7324116230010986 + }, + { + "auxiliary_loss_clip": 0.01303378, + "auxiliary_loss_mlp": 0.00213857, + "balance_loss_clip": 1.079615, + "balance_loss_mlp": 0.18678461, + "epoch": 0.6996542912971592, + "flos": 21980095839360.0, + "grad_norm": 5.136567454497571, + "language_loss": 0.91567683, + "learning_rate": 8.738272881850801e-07, + "loss": 0.93084925, + "num_input_tokens_seen": 251083355, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.27087402, + "step": 11637, + "time_per_iteration": 2.711207866668701 + }, + { + "auxiliary_loss_clip": 0.01335218, + "auxiliary_loss_mlp": 0.00250357, + "balance_loss_clip": 1.10503817, + "balance_loss_mlp": 0.22292675, + "epoch": 0.6997144145498272, + "flos": 11686158518400.0, + "grad_norm": 29.557613024811893, + "language_loss": 0.78578746, + "learning_rate": 8.735054591608704e-07, + "loss": 0.80164325, + "num_input_tokens_seen": 251096420, + "router_z_loss_clip": 2.30078125, + "router_z_loss_mlp": 0.27429199, + "step": 11638, + "time_per_iteration": 2.676647424697876 + }, + { + "auxiliary_loss_clip": 0.01325551, + "auxiliary_loss_mlp": 0.00240186, + "balance_loss_clip": 1.0930655, + "balance_loss_mlp": 0.20979983, + "epoch": 0.6997745378024951, + "flos": 29607162958080.0, + "grad_norm": 8.705756134485972, + "language_loss": 0.85307497, + "learning_rate": 8.731836728534459e-07, + "loss": 0.86873233, + "num_input_tokens_seen": 251115410, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.30407715, + "step": 11639, + "time_per_iteration": 2.7238106727600098 + }, + { + "auxiliary_loss_clip": 0.01318988, + "auxiliary_loss_mlp": 0.00226685, + "balance_loss_clip": 1.08858538, + "balance_loss_mlp": 0.1990048, + "epoch": 0.6998346610551631, + "flos": 20886616056960.0, + "grad_norm": 214.1430496779924, + "language_loss": 0.90713298, + "learning_rate": 8.728619292750093e-07, + "loss": 0.92258972, + "num_input_tokens_seen": 251133530, + "router_z_loss_clip": 2.30273438, + "router_z_loss_mlp": 0.27697754, + "step": 11640, + "time_per_iteration": 2.652945041656494 + }, + { + "auxiliary_loss_clip": 0.0128748, + "auxiliary_loss_mlp": 0.00224376, + "balance_loss_clip": 1.07074511, + "balance_loss_mlp": 0.19837648, + "epoch": 0.699894784307831, + "flos": 27163046949120.0, + "grad_norm": 13.093506115514211, + "language_loss": 0.8482216, + "learning_rate": 8.725402284377619e-07, + "loss": 0.86334014, + "num_input_tokens_seen": 251153985, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.2598877, + "step": 11641, + "time_per_iteration": 2.676967144012451 + }, + { + "auxiliary_loss_clip": 0.01313616, + "auxiliary_loss_mlp": 0.00226745, + "balance_loss_clip": 1.08747435, + "balance_loss_mlp": 0.19880247, + "epoch": 0.699954907560499, + "flos": 20923640000640.0, + "grad_norm": 2.9278682398823572, + "language_loss": 0.88381433, + "learning_rate": 8.722185703539022e-07, + "loss": 0.89921796, + "num_input_tokens_seen": 251173225, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.27929688, + "step": 11642, + "time_per_iteration": 2.6657252311706543 + }, + { + "auxiliary_loss_clip": 0.01359525, + "auxiliary_loss_mlp": 0.0024995, + "balance_loss_clip": 1.11479068, + "balance_loss_mlp": 0.21927774, + "epoch": 0.700015030813167, + "flos": 28657792540800.0, + "grad_norm": 26.690227191939602, + "language_loss": 0.8398481, + "learning_rate": 8.718969550356266e-07, + "loss": 0.85594285, + "num_input_tokens_seen": 251192485, + "router_z_loss_clip": 2.44726562, + "router_z_loss_mlp": 0.30651855, + "step": 11643, + "time_per_iteration": 2.7600436210632324 + }, + { + "auxiliary_loss_clip": 0.01327381, + "auxiliary_loss_mlp": 0.00252635, + "balance_loss_clip": 1.09230638, + "balance_loss_mlp": 0.22311832, + "epoch": 0.700075154065835, + "flos": 29205286617600.0, + "grad_norm": 23.553946160485978, + "language_loss": 0.67224503, + "learning_rate": 8.715753824951315e-07, + "loss": 0.68804514, + "num_input_tokens_seen": 251214965, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.29504395, + "step": 11644, + "time_per_iteration": 2.7750518321990967 + }, + { + "auxiliary_loss_clip": 0.01323691, + "auxiliary_loss_mlp": 0.00268128, + "balance_loss_clip": 1.09639907, + "balance_loss_mlp": 0.24041153, + "epoch": 0.7001352773185029, + "flos": 23112431159040.0, + "grad_norm": 6.978426704557098, + "language_loss": 0.88433588, + "learning_rate": 8.712538527446119e-07, + "loss": 0.90025413, + "num_input_tokens_seen": 251234500, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.27709961, + "step": 11645, + "time_per_iteration": 2.67975115776062 + }, + { + "auxiliary_loss_clip": 0.01320831, + "auxiliary_loss_mlp": 0.0024532, + "balance_loss_clip": 1.0930171, + "balance_loss_mlp": 0.21543472, + "epoch": 0.7001954005711709, + "flos": 21322858734720.0, + "grad_norm": 648.8115189432855, + "language_loss": 0.75412035, + "learning_rate": 8.709323657962584e-07, + "loss": 0.76978189, + "num_input_tokens_seen": 251254360, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.29907227, + "step": 11646, + "time_per_iteration": 2.6506471633911133 + }, + { + "auxiliary_loss_clip": 0.01304137, + "auxiliary_loss_mlp": 0.00226729, + "balance_loss_clip": 1.08103848, + "balance_loss_mlp": 0.19973981, + "epoch": 0.7002555238238388, + "flos": 24535822383360.0, + "grad_norm": 2.923809909581497, + "language_loss": 0.77856195, + "learning_rate": 8.706109216622635e-07, + "loss": 0.79387057, + "num_input_tokens_seen": 251274790, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.26977539, + "step": 11647, + "time_per_iteration": 2.674961805343628 + }, + { + "auxiliary_loss_clip": 0.01319062, + "auxiliary_loss_mlp": 0.00200688, + "balance_loss_clip": 1.09230149, + "balance_loss_mlp": 0.17682266, + "epoch": 0.7003156470765068, + "flos": 39056552726400.0, + "grad_norm": 8.416306262752876, + "language_loss": 0.81521565, + "learning_rate": 8.702895203548155e-07, + "loss": 0.8304131, + "num_input_tokens_seen": 251296275, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.23864746, + "step": 11648, + "time_per_iteration": 2.802321672439575 + }, + { + "auxiliary_loss_clip": 0.01307147, + "auxiliary_loss_mlp": 0.00208628, + "balance_loss_clip": 1.08472657, + "balance_loss_mlp": 0.18284345, + "epoch": 0.7003757703291749, + "flos": 28804092635520.0, + "grad_norm": 15.004134061053056, + "language_loss": 0.84959239, + "learning_rate": 8.699681618861014e-07, + "loss": 0.86475015, + "num_input_tokens_seen": 251317375, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.25793457, + "step": 11649, + "time_per_iteration": 2.7266275882720947 + }, + { + "auxiliary_loss_clip": 0.01317337, + "auxiliary_loss_mlp": 0.00226785, + "balance_loss_clip": 1.08860934, + "balance_loss_mlp": 0.19900928, + "epoch": 0.7004358935818428, + "flos": 15953854152960.0, + "grad_norm": 9.281049190072922, + "language_loss": 0.84888721, + "learning_rate": 8.69646846268308e-07, + "loss": 0.86432844, + "num_input_tokens_seen": 251333570, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.27807617, + "step": 11650, + "time_per_iteration": 2.708414316177368 + }, + { + "auxiliary_loss_clip": 0.01305397, + "auxiliary_loss_mlp": 0.0024505, + "balance_loss_clip": 1.0839963, + "balance_loss_mlp": 0.21845399, + "epoch": 0.7004960168345108, + "flos": 20411984718720.0, + "grad_norm": 55.63922341323825, + "language_loss": 0.85771924, + "learning_rate": 8.693255735136194e-07, + "loss": 0.87322378, + "num_input_tokens_seen": 251351070, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.26574707, + "step": 11651, + "time_per_iteration": 2.6482927799224854 + }, + { + "auxiliary_loss_clip": 0.0134054, + "auxiliary_loss_mlp": 0.00242229, + "balance_loss_clip": 1.09833336, + "balance_loss_mlp": 0.21307014, + "epoch": 0.7005561400871787, + "flos": 17347547808000.0, + "grad_norm": 33.26775123222925, + "language_loss": 0.78738892, + "learning_rate": 8.690043436342198e-07, + "loss": 0.80321658, + "num_input_tokens_seen": 251370005, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.29150391, + "step": 11652, + "time_per_iteration": 2.658904790878296 + }, + { + "auxiliary_loss_clip": 0.01312984, + "auxiliary_loss_mlp": 0.00242624, + "balance_loss_clip": 1.08720446, + "balance_loss_mlp": 0.21462129, + "epoch": 0.7006162633398467, + "flos": 25302120157440.0, + "grad_norm": 1.9405637777040976, + "language_loss": 0.80430275, + "learning_rate": 8.686831566422874e-07, + "loss": 0.81985879, + "num_input_tokens_seen": 251391210, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.2800293, + "step": 11653, + "time_per_iteration": 2.718663215637207 + }, + { + "auxiliary_loss_clip": 0.01346137, + "auxiliary_loss_mlp": 0.00233235, + "balance_loss_clip": 1.10103869, + "balance_loss_mlp": 0.20171601, + "epoch": 0.7006763865925146, + "flos": 20668997508480.0, + "grad_norm": 23.747329461607578, + "language_loss": 0.81266439, + "learning_rate": 8.68362012550003e-07, + "loss": 0.82845819, + "num_input_tokens_seen": 251411505, + "router_z_loss_clip": 2.453125, + "router_z_loss_mlp": 0.31494141, + "step": 11654, + "time_per_iteration": 2.758901834487915 + }, + { + "auxiliary_loss_clip": 0.01340043, + "auxiliary_loss_mlp": 0.00265732, + "balance_loss_clip": 1.10157323, + "balance_loss_mlp": 0.23656186, + "epoch": 0.7007365098451827, + "flos": 20046449963520.0, + "grad_norm": 44.192815979898114, + "language_loss": 0.85167617, + "learning_rate": 8.680409113695453e-07, + "loss": 0.86773384, + "num_input_tokens_seen": 251428975, + "router_z_loss_clip": 2.38867188, + "router_z_loss_mlp": 0.29187012, + "step": 11655, + "time_per_iteration": 2.6764109134674072 + }, + { + "auxiliary_loss_clip": 0.01340696, + "auxiliary_loss_mlp": 0.002519, + "balance_loss_clip": 1.09821689, + "balance_loss_mlp": 0.22373088, + "epoch": 0.7007966330978506, + "flos": 20777375819520.0, + "grad_norm": 23.449489548791338, + "language_loss": 0.78931576, + "learning_rate": 8.677198531130889e-07, + "loss": 0.8052417, + "num_input_tokens_seen": 251446940, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.28137207, + "step": 11656, + "time_per_iteration": 2.685081958770752 + }, + { + "auxiliary_loss_clip": 0.01317644, + "auxiliary_loss_mlp": 0.00221561, + "balance_loss_clip": 1.08907604, + "balance_loss_mlp": 0.19272405, + "epoch": 0.7008567563505186, + "flos": 29638189330560.0, + "grad_norm": 4.513124834494237, + "language_loss": 0.85405034, + "learning_rate": 8.673988377928092e-07, + "loss": 0.86944234, + "num_input_tokens_seen": 251466205, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.28808594, + "step": 11657, + "time_per_iteration": 2.7515408992767334 + }, + { + "auxiliary_loss_clip": 0.01342975, + "auxiliary_loss_mlp": 0.00231104, + "balance_loss_clip": 1.1030066, + "balance_loss_mlp": 0.19898915, + "epoch": 0.7009168796031865, + "flos": 17092007475840.0, + "grad_norm": 27.67714548999825, + "language_loss": 0.88662696, + "learning_rate": 8.670778654208797e-07, + "loss": 0.90236783, + "num_input_tokens_seen": 251484820, + "router_z_loss_clip": 2.3984375, + "router_z_loss_mlp": 0.32116699, + "step": 11658, + "time_per_iteration": 2.620649576187134 + }, + { + "auxiliary_loss_clip": 0.01307203, + "auxiliary_loss_mlp": 0.0023092, + "balance_loss_clip": 1.08722842, + "balance_loss_mlp": 0.20463455, + "epoch": 0.7009770028558545, + "flos": 20448972748800.0, + "grad_norm": 33.60606379453234, + "language_loss": 0.89475942, + "learning_rate": 8.667569360094713e-07, + "loss": 0.91014063, + "num_input_tokens_seen": 251502670, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.26293945, + "step": 11659, + "time_per_iteration": 2.7887327671051025 + }, + { + "auxiliary_loss_clip": 0.01295383, + "auxiliary_loss_mlp": 0.00207662, + "balance_loss_clip": 1.07755494, + "balance_loss_mlp": 0.18052956, + "epoch": 0.7010371261085224, + "flos": 19245139407360.0, + "grad_norm": 4.5528509019015075, + "language_loss": 0.77040106, + "learning_rate": 8.664360495707526e-07, + "loss": 0.7854315, + "num_input_tokens_seen": 251521630, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.27124023, + "step": 11660, + "time_per_iteration": 2.7761759757995605 + }, + { + "auxiliary_loss_clip": 0.01312247, + "auxiliary_loss_mlp": 0.00228586, + "balance_loss_clip": 1.08812857, + "balance_loss_mlp": 0.20098928, + "epoch": 0.7010972493611904, + "flos": 22127581082880.0, + "grad_norm": 6.457399249868733, + "language_loss": 0.87940824, + "learning_rate": 8.661152061168924e-07, + "loss": 0.89481652, + "num_input_tokens_seen": 251540105, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.27587891, + "step": 11661, + "time_per_iteration": 2.7300469875335693 + }, + { + "auxiliary_loss_clip": 0.01318492, + "auxiliary_loss_mlp": 0.00217746, + "balance_loss_clip": 1.092731, + "balance_loss_mlp": 0.18961266, + "epoch": 0.7011573726138585, + "flos": 31391132860800.0, + "grad_norm": 7276.132771548174, + "language_loss": 0.8524704, + "learning_rate": 8.657944056600579e-07, + "loss": 0.86783278, + "num_input_tokens_seen": 251560530, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.28149414, + "step": 11662, + "time_per_iteration": 2.74375057220459 + }, + { + "auxiliary_loss_clip": 0.01326428, + "auxiliary_loss_mlp": 0.00233453, + "balance_loss_clip": 1.0976491, + "balance_loss_mlp": 0.20683353, + "epoch": 0.7012174958665264, + "flos": 18150582216960.0, + "grad_norm": 8.92457066673001, + "language_loss": 0.91757965, + "learning_rate": 8.654736482124134e-07, + "loss": 0.93317842, + "num_input_tokens_seen": 251577930, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.26586914, + "step": 11663, + "time_per_iteration": 2.684138536453247 + }, + { + "auxiliary_loss_clip": 0.01229761, + "auxiliary_loss_mlp": 0.00121707, + "balance_loss_clip": 1.07882893, + "balance_loss_mlp": 0.11202722, + "epoch": 0.7012776191191944, + "flos": 60651256567680.0, + "grad_norm": 0.7918717568430268, + "language_loss": 0.53599918, + "learning_rate": 8.651529337861209e-07, + "loss": 0.54951382, + "num_input_tokens_seen": 251638820, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.09667969, + "step": 11664, + "time_per_iteration": 3.1592986583709717 + }, + { + "auxiliary_loss_clip": 0.01337299, + "auxiliary_loss_mlp": 0.00234934, + "balance_loss_clip": 1.10293567, + "balance_loss_mlp": 0.20681244, + "epoch": 0.7013377423718623, + "flos": 27198598435200.0, + "grad_norm": 9.878759453427575, + "language_loss": 0.90925407, + "learning_rate": 8.64832262393344e-07, + "loss": 0.92497647, + "num_input_tokens_seen": 251658070, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.28137207, + "step": 11665, + "time_per_iteration": 2.6933584213256836 + }, + { + "auxiliary_loss_clip": 0.01336425, + "auxiliary_loss_mlp": 0.00216375, + "balance_loss_clip": 1.0995028, + "balance_loss_mlp": 0.18874224, + "epoch": 0.7013978656245303, + "flos": 16543543731840.0, + "grad_norm": 877.4929760936716, + "language_loss": 0.86383212, + "learning_rate": 8.645116340462404e-07, + "loss": 0.87936008, + "num_input_tokens_seen": 251671575, + "router_z_loss_clip": 2.3671875, + "router_z_loss_mlp": 0.27612305, + "step": 11666, + "time_per_iteration": 2.658090829849243 + }, + { + "auxiliary_loss_clip": 0.01344087, + "auxiliary_loss_mlp": 0.00222153, + "balance_loss_clip": 1.10629058, + "balance_loss_mlp": 0.19471067, + "epoch": 0.7014579888771982, + "flos": 23143780753920.0, + "grad_norm": 2.7742263673426657, + "language_loss": 0.87328041, + "learning_rate": 8.641910487569695e-07, + "loss": 0.88894284, + "num_input_tokens_seen": 251689350, + "router_z_loss_clip": 2.37695312, + "router_z_loss_mlp": 0.27453613, + "step": 11667, + "time_per_iteration": 4.126591682434082 + }, + { + "auxiliary_loss_clip": 0.01317407, + "auxiliary_loss_mlp": 0.00225937, + "balance_loss_clip": 1.08837581, + "balance_loss_mlp": 0.19811311, + "epoch": 0.7015181121298663, + "flos": 25082095397760.0, + "grad_norm": 43.62373346979314, + "language_loss": 0.75295365, + "learning_rate": 8.638705065376879e-07, + "loss": 0.76838708, + "num_input_tokens_seen": 251704635, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.27856445, + "step": 11668, + "time_per_iteration": 2.6964621543884277 + }, + { + "auxiliary_loss_clip": 0.01322329, + "auxiliary_loss_mlp": 0.00252634, + "balance_loss_clip": 1.09370542, + "balance_loss_mlp": 0.22545415, + "epoch": 0.7015782353825342, + "flos": 23327894891520.0, + "grad_norm": 126.83534057536173, + "language_loss": 0.85276854, + "learning_rate": 8.635500074005519e-07, + "loss": 0.86851817, + "num_input_tokens_seen": 251723035, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.27172852, + "step": 11669, + "time_per_iteration": 4.103363275527954 + }, + { + "auxiliary_loss_clip": 0.01223842, + "auxiliary_loss_mlp": 0.00094179, + "balance_loss_clip": 1.07288122, + "balance_loss_mlp": 0.08535793, + "epoch": 0.7016383586352022, + "flos": 70397161107840.0, + "grad_norm": 0.7301794816138979, + "language_loss": 0.54027987, + "learning_rate": 8.632295513577122e-07, + "loss": 0.55346012, + "num_input_tokens_seen": 251791630, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.08837891, + "step": 11670, + "time_per_iteration": 3.263697862625122 + }, + { + "auxiliary_loss_clip": 0.0130224, + "auxiliary_loss_mlp": 0.00250197, + "balance_loss_clip": 1.07986248, + "balance_loss_mlp": 0.22249281, + "epoch": 0.7016984818878701, + "flos": 19792274348160.0, + "grad_norm": 6.20706387279615, + "language_loss": 0.8969236, + "learning_rate": 8.629091384213218e-07, + "loss": 0.91244805, + "num_input_tokens_seen": 251809840, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.27709961, + "step": 11671, + "time_per_iteration": 2.6400206089019775 + }, + { + "auxiliary_loss_clip": 0.01339456, + "auxiliary_loss_mlp": 0.00241384, + "balance_loss_clip": 1.10699201, + "balance_loss_mlp": 0.21378738, + "epoch": 0.7017586051405381, + "flos": 12896923184640.0, + "grad_norm": 125.45969303259349, + "language_loss": 0.85002911, + "learning_rate": 8.625887686035313e-07, + "loss": 0.86583751, + "num_input_tokens_seen": 251827550, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.27600098, + "step": 11672, + "time_per_iteration": 4.107734441757202 + }, + { + "auxiliary_loss_clip": 0.01327859, + "auxiliary_loss_mlp": 0.00243899, + "balance_loss_clip": 1.09231639, + "balance_loss_mlp": 0.2145258, + "epoch": 0.701818728393206, + "flos": 18332828847360.0, + "grad_norm": 7.481867002414591, + "language_loss": 0.93051362, + "learning_rate": 8.622684419164883e-07, + "loss": 0.94623119, + "num_input_tokens_seen": 251844880, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.29382324, + "step": 11673, + "time_per_iteration": 2.6551406383514404 + }, + { + "auxiliary_loss_clip": 0.01311887, + "auxiliary_loss_mlp": 0.00235299, + "balance_loss_clip": 1.08491898, + "balance_loss_mlp": 0.20697439, + "epoch": 0.701878851645874, + "flos": 17384212615680.0, + "grad_norm": 14.623282361587567, + "language_loss": 0.8168571, + "learning_rate": 8.619481583723399e-07, + "loss": 0.83232892, + "num_input_tokens_seen": 251861025, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.2833252, + "step": 11674, + "time_per_iteration": 2.6849820613861084 + }, + { + "auxiliary_loss_clip": 0.0131536, + "auxiliary_loss_mlp": 0.0022559, + "balance_loss_clip": 1.09025431, + "balance_loss_mlp": 0.19936347, + "epoch": 0.701938974898542, + "flos": 23915501481600.0, + "grad_norm": 188.4768488618571, + "language_loss": 0.78091323, + "learning_rate": 8.616279179832329e-07, + "loss": 0.7963227, + "num_input_tokens_seen": 251880175, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.26220703, + "step": 11675, + "time_per_iteration": 2.7338595390319824 + }, + { + "auxiliary_loss_clip": 0.01353477, + "auxiliary_loss_mlp": 0.00242419, + "balance_loss_clip": 1.11170697, + "balance_loss_mlp": 0.21347526, + "epoch": 0.70199909815121, + "flos": 21795586652160.0, + "grad_norm": 62.16736915348335, + "language_loss": 0.61342102, + "learning_rate": 8.613077207613078e-07, + "loss": 0.62937999, + "num_input_tokens_seen": 251899005, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.28955078, + "step": 11676, + "time_per_iteration": 4.161667585372925 + }, + { + "auxiliary_loss_clip": 0.01207725, + "auxiliary_loss_mlp": 0.0009836, + "balance_loss_clip": 1.05719435, + "balance_loss_mlp": 0.08982471, + "epoch": 0.702059221403878, + "flos": 71715047109120.0, + "grad_norm": 0.7124634283139019, + "language_loss": 0.58557308, + "learning_rate": 8.609875667187079e-07, + "loss": 0.59863394, + "num_input_tokens_seen": 251966790, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.08544922, + "step": 11677, + "time_per_iteration": 3.241692543029785 + }, + { + "auxiliary_loss_clip": 0.01323668, + "auxiliary_loss_mlp": 0.00237443, + "balance_loss_clip": 1.0905242, + "balance_loss_mlp": 0.20928627, + "epoch": 0.7021193446565459, + "flos": 28111052649600.0, + "grad_norm": 5.6204315281460815, + "language_loss": 0.70921403, + "learning_rate": 8.606674558675737e-07, + "loss": 0.72482514, + "num_input_tokens_seen": 251989315, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.28186035, + "step": 11678, + "time_per_iteration": 2.7664196491241455 + }, + { + "auxiliary_loss_clip": 0.01323936, + "auxiliary_loss_mlp": 0.00251399, + "balance_loss_clip": 1.09771585, + "balance_loss_mlp": 0.22411212, + "epoch": 0.7021794679092139, + "flos": 22924905229440.0, + "grad_norm": 14.160294846085067, + "language_loss": 0.8456012, + "learning_rate": 8.603473882200444e-07, + "loss": 0.86135459, + "num_input_tokens_seen": 252006620, + "router_z_loss_clip": 2.26464844, + "router_z_loss_mlp": 0.27258301, + "step": 11679, + "time_per_iteration": 2.685023069381714 + }, + { + "auxiliary_loss_clip": 0.01326193, + "auxiliary_loss_mlp": 0.00206133, + "balance_loss_clip": 1.10198545, + "balance_loss_mlp": 0.1808008, + "epoch": 0.7022395911618818, + "flos": 18077827219200.0, + "grad_norm": 7.905375875381045, + "language_loss": 0.81314653, + "learning_rate": 8.600273637882567e-07, + "loss": 0.82846975, + "num_input_tokens_seen": 252024570, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.25341797, + "step": 11680, + "time_per_iteration": 2.6626663208007812 + }, + { + "auxiliary_loss_clip": 0.0135234, + "auxiliary_loss_mlp": 0.00259828, + "balance_loss_clip": 1.11087489, + "balance_loss_mlp": 0.22965586, + "epoch": 0.7022997144145499, + "flos": 16034294661120.0, + "grad_norm": 78.61715038692307, + "language_loss": 0.83418208, + "learning_rate": 8.597073825843446e-07, + "loss": 0.85030377, + "num_input_tokens_seen": 252042775, + "router_z_loss_clip": 2.41210938, + "router_z_loss_mlp": 0.30163574, + "step": 11681, + "time_per_iteration": 2.6673402786254883 + }, + { + "auxiliary_loss_clip": 0.01305988, + "auxiliary_loss_mlp": 0.00229545, + "balance_loss_clip": 1.08566785, + "balance_loss_mlp": 0.20209104, + "epoch": 0.7023598376672178, + "flos": 26468678160000.0, + "grad_norm": 228.57749837659802, + "language_loss": 0.83644915, + "learning_rate": 8.593874446204434e-07, + "loss": 0.85180449, + "num_input_tokens_seen": 252063690, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.2746582, + "step": 11682, + "time_per_iteration": 2.7109694480895996 + }, + { + "auxiliary_loss_clip": 0.0134591, + "auxiliary_loss_mlp": 0.00259228, + "balance_loss_clip": 1.10606432, + "balance_loss_mlp": 0.22977164, + "epoch": 0.7024199609198858, + "flos": 17055917285760.0, + "grad_norm": 34.78708347901931, + "language_loss": 0.84089476, + "learning_rate": 8.590675499086841e-07, + "loss": 0.85694611, + "num_input_tokens_seen": 252080335, + "router_z_loss_clip": 2.3984375, + "router_z_loss_mlp": 0.29455566, + "step": 11683, + "time_per_iteration": 2.6066231727600098 + }, + { + "auxiliary_loss_clip": 0.0135689, + "auxiliary_loss_mlp": 0.00262841, + "balance_loss_clip": 1.11442196, + "balance_loss_mlp": 0.23306251, + "epoch": 0.7024800841725537, + "flos": 25849039616640.0, + "grad_norm": 10.726741627333086, + "language_loss": 0.82668078, + "learning_rate": 8.587476984611976e-07, + "loss": 0.84287804, + "num_input_tokens_seen": 252101075, + "router_z_loss_clip": 2.42578125, + "router_z_loss_mlp": 0.29785156, + "step": 11684, + "time_per_iteration": 2.7096107006073 + }, + { + "auxiliary_loss_clip": 0.01316845, + "auxiliary_loss_mlp": 0.00246006, + "balance_loss_clip": 1.08983564, + "balance_loss_mlp": 0.21812263, + "epoch": 0.7025402074252217, + "flos": 23513014609920.0, + "grad_norm": 3.884303840129664, + "language_loss": 0.81274045, + "learning_rate": 8.584278902901128e-07, + "loss": 0.82836896, + "num_input_tokens_seen": 252120510, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.27905273, + "step": 11685, + "time_per_iteration": 2.6517906188964844 + }, + { + "auxiliary_loss_clip": 0.01310065, + "auxiliary_loss_mlp": 0.00234835, + "balance_loss_clip": 1.08278561, + "balance_loss_mlp": 0.20810872, + "epoch": 0.7026003306778896, + "flos": 20150985519360.0, + "grad_norm": 487.6545545311212, + "language_loss": 0.90787542, + "learning_rate": 8.581081254075582e-07, + "loss": 0.92332435, + "num_input_tokens_seen": 252137590, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.26733398, + "step": 11686, + "time_per_iteration": 2.6723053455352783 + }, + { + "auxiliary_loss_clip": 0.01191238, + "auxiliary_loss_mlp": 0.00108204, + "balance_loss_clip": 1.0427537, + "balance_loss_mlp": 0.09995484, + "epoch": 0.7026604539305576, + "flos": 64772400712320.0, + "grad_norm": 0.9613418414631639, + "language_loss": 0.69182831, + "learning_rate": 8.577884038256566e-07, + "loss": 0.70482278, + "num_input_tokens_seen": 252199830, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.08251953, + "step": 11687, + "time_per_iteration": 3.322389841079712 + }, + { + "auxiliary_loss_clip": 0.01314231, + "auxiliary_loss_mlp": 0.0026263, + "balance_loss_clip": 1.08434069, + "balance_loss_mlp": 0.23392409, + "epoch": 0.7027205771832256, + "flos": 21871466133120.0, + "grad_norm": 6.408008998817926, + "language_loss": 0.86102438, + "learning_rate": 8.574687255565329e-07, + "loss": 0.87679291, + "num_input_tokens_seen": 252217200, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.2869873, + "step": 11688, + "time_per_iteration": 2.660240411758423 + }, + { + "auxiliary_loss_clip": 0.01326162, + "auxiliary_loss_mlp": 0.00276363, + "balance_loss_clip": 1.08914804, + "balance_loss_mlp": 0.24725235, + "epoch": 0.7027807004358936, + "flos": 23367791923200.0, + "grad_norm": 2.2219332605155007, + "language_loss": 0.75302064, + "learning_rate": 8.571490906123107e-07, + "loss": 0.76904595, + "num_input_tokens_seen": 252236105, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.29125977, + "step": 11689, + "time_per_iteration": 2.682840347290039 + }, + { + "auxiliary_loss_clip": 0.0132404, + "auxiliary_loss_mlp": 0.00248816, + "balance_loss_clip": 1.08934522, + "balance_loss_mlp": 0.22006254, + "epoch": 0.7028408236885616, + "flos": 15304266645120.0, + "grad_norm": 16.83030651929631, + "language_loss": 0.89897859, + "learning_rate": 8.568294990051086e-07, + "loss": 0.91470718, + "num_input_tokens_seen": 252253315, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.28710938, + "step": 11690, + "time_per_iteration": 2.6385533809661865 + }, + { + "auxiliary_loss_clip": 0.01318776, + "auxiliary_loss_mlp": 0.00248342, + "balance_loss_clip": 1.08615899, + "balance_loss_mlp": 0.21934988, + "epoch": 0.7029009469412295, + "flos": 22018197191040.0, + "grad_norm": 564.8394970409294, + "language_loss": 0.83475941, + "learning_rate": 8.56509950747047e-07, + "loss": 0.85043061, + "num_input_tokens_seen": 252272765, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.28991699, + "step": 11691, + "time_per_iteration": 2.679696559906006 + }, + { + "auxiliary_loss_clip": 0.0129996, + "auxiliary_loss_mlp": 0.00221494, + "balance_loss_clip": 1.07930541, + "balance_loss_mlp": 0.19579235, + "epoch": 0.7029610701938975, + "flos": 21835519597440.0, + "grad_norm": 3.728597496619534, + "language_loss": 0.87768555, + "learning_rate": 8.561904458502429e-07, + "loss": 0.89290011, + "num_input_tokens_seen": 252290510, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.25720215, + "step": 11692, + "time_per_iteration": 2.6516358852386475 + }, + { + "auxiliary_loss_clip": 0.01307524, + "auxiliary_loss_mlp": 0.00235712, + "balance_loss_clip": 1.08328772, + "balance_loss_mlp": 0.20900902, + "epoch": 0.7030211934465654, + "flos": 19135647774720.0, + "grad_norm": 14.059582892251306, + "language_loss": 0.83444768, + "learning_rate": 8.558709843268111e-07, + "loss": 0.8498801, + "num_input_tokens_seen": 252309365, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.26721191, + "step": 11693, + "time_per_iteration": 2.67405366897583 + }, + { + "auxiliary_loss_clip": 0.01341414, + "auxiliary_loss_mlp": 0.0024898, + "balance_loss_clip": 1.10374928, + "balance_loss_mlp": 0.22002393, + "epoch": 0.7030813166992335, + "flos": 38546010766080.0, + "grad_norm": 16.67464394212993, + "language_loss": 0.76519525, + "learning_rate": 8.55551566188866e-07, + "loss": 0.78109914, + "num_input_tokens_seen": 252333010, + "router_z_loss_clip": 2.37695312, + "router_z_loss_mlp": 0.29003906, + "step": 11694, + "time_per_iteration": 2.8117544651031494 + }, + { + "auxiliary_loss_clip": 0.01336262, + "auxiliary_loss_mlp": 0.00243222, + "balance_loss_clip": 1.09843314, + "balance_loss_mlp": 0.21355127, + "epoch": 0.7031414399519014, + "flos": 14720897859840.0, + "grad_norm": 9.720224042527823, + "language_loss": 0.86681819, + "learning_rate": 8.552321914485203e-07, + "loss": 0.88261306, + "num_input_tokens_seen": 252351330, + "router_z_loss_clip": 2.37890625, + "router_z_loss_mlp": 0.296875, + "step": 11695, + "time_per_iteration": 2.6800084114074707 + }, + { + "auxiliary_loss_clip": 0.01320046, + "auxiliary_loss_mlp": 0.00233713, + "balance_loss_clip": 1.09479523, + "balance_loss_mlp": 0.20860752, + "epoch": 0.7032015632045694, + "flos": 14027247342720.0, + "grad_norm": 109.0917691305427, + "language_loss": 0.82383567, + "learning_rate": 8.549128601178852e-07, + "loss": 0.83937323, + "num_input_tokens_seen": 252369580, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.25109863, + "step": 11696, + "time_per_iteration": 2.697777509689331 + }, + { + "auxiliary_loss_clip": 0.01310709, + "auxiliary_loss_mlp": 0.00237588, + "balance_loss_clip": 1.08289218, + "balance_loss_mlp": 0.20987152, + "epoch": 0.7032616864572373, + "flos": 27637175496960.0, + "grad_norm": 2.190951765886415, + "language_loss": 0.82078892, + "learning_rate": 8.545935722090693e-07, + "loss": 0.83627188, + "num_input_tokens_seen": 252390525, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.27758789, + "step": 11697, + "time_per_iteration": 2.735732316970825 + }, + { + "auxiliary_loss_clip": 0.01319326, + "auxiliary_loss_mlp": 0.00251372, + "balance_loss_clip": 1.08688414, + "balance_loss_mlp": 0.22242841, + "epoch": 0.7033218097099053, + "flos": 17967294092160.0, + "grad_norm": 3.5968193162284825, + "language_loss": 0.8687942, + "learning_rate": 8.542743277341793e-07, + "loss": 0.88450116, + "num_input_tokens_seen": 252407470, + "router_z_loss_clip": 2.32226562, + "router_z_loss_mlp": 0.28955078, + "step": 11698, + "time_per_iteration": 2.6966915130615234 + }, + { + "auxiliary_loss_clip": 0.01317074, + "auxiliary_loss_mlp": 0.00246025, + "balance_loss_clip": 1.08201146, + "balance_loss_mlp": 0.21658072, + "epoch": 0.7033819329625732, + "flos": 19501721233920.0, + "grad_norm": 32.06662519110118, + "language_loss": 0.91213053, + "learning_rate": 8.539551267053222e-07, + "loss": 0.92776155, + "num_input_tokens_seen": 252427025, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.29443359, + "step": 11699, + "time_per_iteration": 2.7040956020355225 + }, + { + "auxiliary_loss_clip": 0.01325223, + "auxiliary_loss_mlp": 0.00244987, + "balance_loss_clip": 1.09637189, + "balance_loss_mlp": 0.21883294, + "epoch": 0.7034420562152413, + "flos": 23987645948160.0, + "grad_norm": 15.413865507802537, + "language_loss": 0.87578392, + "learning_rate": 8.53635969134601e-07, + "loss": 0.89148605, + "num_input_tokens_seen": 252445410, + "router_z_loss_clip": 2.28320312, + "router_z_loss_mlp": 0.26159668, + "step": 11700, + "time_per_iteration": 2.749657392501831 + }, + { + "auxiliary_loss_clip": 0.01312332, + "auxiliary_loss_mlp": 0.00271446, + "balance_loss_clip": 1.08097541, + "balance_loss_mlp": 0.24097607, + "epoch": 0.7035021794679092, + "flos": 35043427756800.0, + "grad_norm": 2.3315473357736773, + "language_loss": 0.82577407, + "learning_rate": 8.533168550341186e-07, + "loss": 0.84161192, + "num_input_tokens_seen": 252463905, + "router_z_loss_clip": 2.31054688, + "router_z_loss_mlp": 0.3046875, + "step": 11701, + "time_per_iteration": 2.820862293243408 + }, + { + "auxiliary_loss_clip": 0.01341693, + "auxiliary_loss_mlp": 0.00224941, + "balance_loss_clip": 1.09823859, + "balance_loss_mlp": 0.19563891, + "epoch": 0.7035623027205772, + "flos": 10997428164480.0, + "grad_norm": 15.444516257875412, + "language_loss": 0.94879824, + "learning_rate": 8.529977844159769e-07, + "loss": 0.9644646, + "num_input_tokens_seen": 252478655, + "router_z_loss_clip": 2.43554688, + "router_z_loss_mlp": 0.29296875, + "step": 11702, + "time_per_iteration": 2.7030301094055176 + }, + { + "auxiliary_loss_clip": 0.01320072, + "auxiliary_loss_mlp": 0.00251985, + "balance_loss_clip": 1.08684635, + "balance_loss_mlp": 0.22354189, + "epoch": 0.7036224259732452, + "flos": 23623727304960.0, + "grad_norm": 7.048535511243155, + "language_loss": 0.68671012, + "learning_rate": 8.526787572922738e-07, + "loss": 0.70243073, + "num_input_tokens_seen": 252498740, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.2845459, + "step": 11703, + "time_per_iteration": 2.7521889209747314 + }, + { + "auxiliary_loss_clip": 0.01326608, + "auxiliary_loss_mlp": 0.00251341, + "balance_loss_clip": 1.09000587, + "balance_loss_mlp": 0.22183666, + "epoch": 0.7036825492259131, + "flos": 31686175175040.0, + "grad_norm": 13.170309643139255, + "language_loss": 0.71039307, + "learning_rate": 8.523597736751067e-07, + "loss": 0.72617245, + "num_input_tokens_seen": 252517800, + "router_z_loss_clip": 2.3671875, + "router_z_loss_mlp": 0.29492188, + "step": 11704, + "time_per_iteration": 2.7554047107696533 + }, + { + "auxiliary_loss_clip": 0.01305925, + "auxiliary_loss_mlp": 0.00287263, + "balance_loss_clip": 1.08352602, + "balance_loss_mlp": 0.26088223, + "epoch": 0.7037426724785811, + "flos": 30192866127360.0, + "grad_norm": 233.88713233125634, + "language_loss": 0.77621615, + "learning_rate": 8.520408335765719e-07, + "loss": 0.79214805, + "num_input_tokens_seen": 252539620, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.26379395, + "step": 11705, + "time_per_iteration": 2.714193820953369 + }, + { + "auxiliary_loss_clip": 0.01306334, + "auxiliary_loss_mlp": 0.00232228, + "balance_loss_clip": 1.08286262, + "balance_loss_mlp": 0.20674139, + "epoch": 0.703802795731249, + "flos": 24311523905280.0, + "grad_norm": 2.7437645383781217, + "language_loss": 0.6983937, + "learning_rate": 8.517219370087645e-07, + "loss": 0.71377933, + "num_input_tokens_seen": 252557300, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.25512695, + "step": 11706, + "time_per_iteration": 2.6913063526153564 + }, + { + "auxiliary_loss_clip": 0.01327118, + "auxiliary_loss_mlp": 0.00253749, + "balance_loss_clip": 1.09640789, + "balance_loss_mlp": 0.22716531, + "epoch": 0.7038629189839171, + "flos": 22528954632960.0, + "grad_norm": 107.75134668138257, + "language_loss": 0.77105629, + "learning_rate": 8.514030839837756e-07, + "loss": 0.78686488, + "num_input_tokens_seen": 252576715, + "router_z_loss_clip": 2.30078125, + "router_z_loss_mlp": 0.26574707, + "step": 11707, + "time_per_iteration": 2.647080659866333 + }, + { + "auxiliary_loss_clip": 0.01313748, + "auxiliary_loss_mlp": 0.00227592, + "balance_loss_clip": 1.08975935, + "balance_loss_mlp": 0.20042384, + "epoch": 0.703923042236585, + "flos": 26250484993920.0, + "grad_norm": 9.098448514624287, + "language_loss": 0.82875156, + "learning_rate": 8.510842745136974e-07, + "loss": 0.84416497, + "num_input_tokens_seen": 252596190, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.27172852, + "step": 11708, + "time_per_iteration": 2.6897099018096924 + }, + { + "auxiliary_loss_clip": 0.013212, + "auxiliary_loss_mlp": 0.00229509, + "balance_loss_clip": 1.09399998, + "balance_loss_mlp": 0.20265129, + "epoch": 0.703983165489253, + "flos": 19390254353280.0, + "grad_norm": 6.275333996928926, + "language_loss": 0.80044556, + "learning_rate": 8.50765508610619e-07, + "loss": 0.8159526, + "num_input_tokens_seen": 252613410, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.26818848, + "step": 11709, + "time_per_iteration": 4.172357559204102 + }, + { + "auxiliary_loss_clip": 0.01322265, + "auxiliary_loss_mlp": 0.0025041, + "balance_loss_clip": 1.09076929, + "balance_loss_mlp": 0.22112057, + "epoch": 0.7040432887419209, + "flos": 16683630773760.0, + "grad_norm": 28.696795050176263, + "language_loss": 0.88182455, + "learning_rate": 8.504467862866267e-07, + "loss": 0.8975513, + "num_input_tokens_seen": 252629150, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.29284668, + "step": 11710, + "time_per_iteration": 2.6173975467681885 + }, + { + "auxiliary_loss_clip": 0.01312818, + "auxiliary_loss_mlp": 0.00248064, + "balance_loss_clip": 1.07825851, + "balance_loss_mlp": 0.2194659, + "epoch": 0.7041034119945889, + "flos": 21141402203520.0, + "grad_norm": 1.8958269495038547, + "language_loss": 0.8516022, + "learning_rate": 8.501281075538076e-07, + "loss": 0.86721104, + "num_input_tokens_seen": 252648225, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.2857666, + "step": 11711, + "time_per_iteration": 4.1078102588653564 + }, + { + "auxiliary_loss_clip": 0.01329073, + "auxiliary_loss_mlp": 0.00253856, + "balance_loss_clip": 1.09532356, + "balance_loss_mlp": 0.22684358, + "epoch": 0.7041635352472568, + "flos": 16910299549440.0, + "grad_norm": 4.067309756814337, + "language_loss": 0.84680963, + "learning_rate": 8.498094724242457e-07, + "loss": 0.86263895, + "num_input_tokens_seen": 252665380, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.26977539, + "step": 11712, + "time_per_iteration": 2.6760692596435547 + }, + { + "auxiliary_loss_clip": 0.01155791, + "auxiliary_loss_mlp": 0.00076112, + "balance_loss_clip": 1.00832593, + "balance_loss_mlp": 0.06724288, + "epoch": 0.7042236584999249, + "flos": 71681219475840.0, + "grad_norm": 0.855935011190982, + "language_loss": 0.63870609, + "learning_rate": 8.494908809100247e-07, + "loss": 0.65102506, + "num_input_tokens_seen": 252727950, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.08886719, + "step": 11713, + "time_per_iteration": 3.213409185409546 + }, + { + "auxiliary_loss_clip": 0.01295079, + "auxiliary_loss_mlp": 0.00235854, + "balance_loss_clip": 1.0741415, + "balance_loss_mlp": 0.20943692, + "epoch": 0.7042837817525928, + "flos": 28658187590400.0, + "grad_norm": 78.5787318600898, + "language_loss": 0.79747212, + "learning_rate": 8.49172333023225e-07, + "loss": 0.81278145, + "num_input_tokens_seen": 252746770, + "router_z_loss_clip": 2.21191406, + "router_z_loss_mlp": 0.26428223, + "step": 11714, + "time_per_iteration": 4.125335216522217 + }, + { + "auxiliary_loss_clip": 0.01318555, + "auxiliary_loss_mlp": 0.00238014, + "balance_loss_clip": 1.08986795, + "balance_loss_mlp": 0.20963044, + "epoch": 0.7043439050052608, + "flos": 19753562465280.0, + "grad_norm": 22.42929641167915, + "language_loss": 0.86562717, + "learning_rate": 8.488538287759248e-07, + "loss": 0.88119292, + "num_input_tokens_seen": 252765610, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.28356934, + "step": 11715, + "time_per_iteration": 2.6931378841400146 + }, + { + "auxiliary_loss_clip": 0.01331531, + "auxiliary_loss_mlp": 0.00253548, + "balance_loss_clip": 1.09589267, + "balance_loss_mlp": 0.22413862, + "epoch": 0.7044040282579288, + "flos": 11538529620480.0, + "grad_norm": 7.139489232483213, + "language_loss": 0.82123238, + "learning_rate": 8.485353681802037e-07, + "loss": 0.83708322, + "num_input_tokens_seen": 252781610, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.29394531, + "step": 11716, + "time_per_iteration": 2.6041252613067627 + }, + { + "auxiliary_loss_clip": 0.01336551, + "auxiliary_loss_mlp": 0.00268123, + "balance_loss_clip": 1.09819496, + "balance_loss_mlp": 0.24063377, + "epoch": 0.7044641515105967, + "flos": 33656126722560.0, + "grad_norm": 44.30289417993636, + "language_loss": 0.75555289, + "learning_rate": 8.482169512481358e-07, + "loss": 0.77159965, + "num_input_tokens_seen": 252800600, + "router_z_loss_clip": 2.38476562, + "router_z_loss_mlp": 0.27490234, + "step": 11717, + "time_per_iteration": 2.77431321144104 + }, + { + "auxiliary_loss_clip": 0.01306803, + "auxiliary_loss_mlp": 0.00242705, + "balance_loss_clip": 1.07968259, + "balance_loss_mlp": 0.21756384, + "epoch": 0.7045242747632647, + "flos": 26723859356160.0, + "grad_norm": 13.197460340871222, + "language_loss": 0.80155879, + "learning_rate": 8.478985779917967e-07, + "loss": 0.81705385, + "num_input_tokens_seen": 252822310, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.25146484, + "step": 11718, + "time_per_iteration": 4.227887153625488 + }, + { + "auxiliary_loss_clip": 0.01312114, + "auxiliary_loss_mlp": 0.00234008, + "balance_loss_clip": 1.08426023, + "balance_loss_mlp": 0.20684057, + "epoch": 0.7045843980159326, + "flos": 26797655848320.0, + "grad_norm": 5.474984375858945, + "language_loss": 0.85110229, + "learning_rate": 8.475802484232606e-07, + "loss": 0.86656356, + "num_input_tokens_seen": 252842355, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.27185059, + "step": 11719, + "time_per_iteration": 2.7273921966552734 + }, + { + "auxiliary_loss_clip": 0.013184, + "auxiliary_loss_mlp": 0.00251984, + "balance_loss_clip": 1.08964539, + "balance_loss_mlp": 0.22358853, + "epoch": 0.7046445212686007, + "flos": 41574824363520.0, + "grad_norm": 6.1303110187322645, + "language_loss": 0.72973633, + "learning_rate": 8.472619625545951e-07, + "loss": 0.74544024, + "num_input_tokens_seen": 252866785, + "router_z_loss_clip": 2.29101562, + "router_z_loss_mlp": 0.28393555, + "step": 11720, + "time_per_iteration": 2.8446900844573975 + }, + { + "auxiliary_loss_clip": 0.01325549, + "auxiliary_loss_mlp": 0.00246485, + "balance_loss_clip": 1.09359372, + "balance_loss_mlp": 0.21937671, + "epoch": 0.7047046445212686, + "flos": 15560166113280.0, + "grad_norm": 24.839738710101877, + "language_loss": 0.88498962, + "learning_rate": 8.46943720397872e-07, + "loss": 0.90070993, + "num_input_tokens_seen": 252881870, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.27087402, + "step": 11721, + "time_per_iteration": 2.6057255268096924 + }, + { + "auxiliary_loss_clip": 0.01148274, + "auxiliary_loss_mlp": 0.00090179, + "balance_loss_clip": 1.00112152, + "balance_loss_mlp": 0.08092834, + "epoch": 0.7047647677739366, + "flos": 70410269571840.0, + "grad_norm": 0.7485611837238091, + "language_loss": 0.64404643, + "learning_rate": 8.466255219651582e-07, + "loss": 0.65643096, + "num_input_tokens_seen": 252951300, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.09228516, + "step": 11722, + "time_per_iteration": 3.3200857639312744 + }, + { + "auxiliary_loss_clip": 0.01306996, + "auxiliary_loss_mlp": 0.00230338, + "balance_loss_clip": 1.08157897, + "balance_loss_mlp": 0.20439778, + "epoch": 0.7048248910266045, + "flos": 23660032976640.0, + "grad_norm": 3.2987490481092876, + "language_loss": 0.74042159, + "learning_rate": 8.463073672685211e-07, + "loss": 0.75579494, + "num_input_tokens_seen": 252971400, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.25952148, + "step": 11723, + "time_per_iteration": 2.7141129970550537 + }, + { + "auxiliary_loss_clip": 0.01294557, + "auxiliary_loss_mlp": 0.00249873, + "balance_loss_clip": 1.072896, + "balance_loss_mlp": 0.22251439, + "epoch": 0.7048850142792725, + "flos": 21397158017280.0, + "grad_norm": 4.414000866810696, + "language_loss": 0.8776021, + "learning_rate": 8.459892563200235e-07, + "loss": 0.89304638, + "num_input_tokens_seen": 252989475, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.27331543, + "step": 11724, + "time_per_iteration": 2.770357847213745 + }, + { + "auxiliary_loss_clip": 0.01314291, + "auxiliary_loss_mlp": 0.00232149, + "balance_loss_clip": 1.08776498, + "balance_loss_mlp": 0.20457634, + "epoch": 0.7049451375319404, + "flos": 21648101408640.0, + "grad_norm": 313.73858153889097, + "language_loss": 0.80119449, + "learning_rate": 8.456711891317296e-07, + "loss": 0.81665885, + "num_input_tokens_seen": 253007220, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.27575684, + "step": 11725, + "time_per_iteration": 2.6550889015197754 + }, + { + "auxiliary_loss_clip": 0.01310509, + "auxiliary_loss_mlp": 0.00216837, + "balance_loss_clip": 1.08416402, + "balance_loss_mlp": 0.1899316, + "epoch": 0.7050052607846085, + "flos": 14866802904960.0, + "grad_norm": 6.566165361528535, + "language_loss": 0.86872017, + "learning_rate": 8.453531657156998e-07, + "loss": 0.88399357, + "num_input_tokens_seen": 253025410, + "router_z_loss_clip": 2.265625, + "router_z_loss_mlp": 0.26940918, + "step": 11726, + "time_per_iteration": 2.60640025138855 + }, + { + "auxiliary_loss_clip": 0.01320555, + "auxiliary_loss_mlp": 0.00260016, + "balance_loss_clip": 1.08888531, + "balance_loss_mlp": 0.23278855, + "epoch": 0.7050653840372764, + "flos": 19241763528960.0, + "grad_norm": 2.7977418610118123, + "language_loss": 0.77398586, + "learning_rate": 8.450351860839931e-07, + "loss": 0.78979164, + "num_input_tokens_seen": 253043305, + "router_z_loss_clip": 2.31835938, + "router_z_loss_mlp": 0.27209473, + "step": 11727, + "time_per_iteration": 2.6946444511413574 + }, + { + "auxiliary_loss_clip": 0.01267666, + "auxiliary_loss_mlp": 0.00222224, + "balance_loss_clip": 1.05684149, + "balance_loss_mlp": 0.19838229, + "epoch": 0.7051255072899444, + "flos": 27780422935680.0, + "grad_norm": 2.4528021432575664, + "language_loss": 0.76114142, + "learning_rate": 8.44717250248668e-07, + "loss": 0.77604032, + "num_input_tokens_seen": 253062790, + "router_z_loss_clip": 2.10839844, + "router_z_loss_mlp": 0.23852539, + "step": 11728, + "time_per_iteration": 2.7293336391448975 + }, + { + "auxiliary_loss_clip": 0.01309001, + "auxiliary_loss_mlp": 0.00206411, + "balance_loss_clip": 1.08035278, + "balance_loss_mlp": 0.17884955, + "epoch": 0.7051856305426124, + "flos": 27892033470720.0, + "grad_norm": 15.64677542454446, + "language_loss": 0.79671085, + "learning_rate": 8.443993582217803e-07, + "loss": 0.81186485, + "num_input_tokens_seen": 253082055, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.27563477, + "step": 11729, + "time_per_iteration": 2.733164072036743 + }, + { + "auxiliary_loss_clip": 0.01327964, + "auxiliary_loss_mlp": 0.00230899, + "balance_loss_clip": 1.09201169, + "balance_loss_mlp": 0.20191908, + "epoch": 0.7052457537952803, + "flos": 25043563082880.0, + "grad_norm": 19.83512566636921, + "language_loss": 0.85310328, + "learning_rate": 8.440815100153862e-07, + "loss": 0.86869192, + "num_input_tokens_seen": 253102575, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.28991699, + "step": 11730, + "time_per_iteration": 2.6579298973083496 + }, + { + "auxiliary_loss_clip": 0.01303831, + "auxiliary_loss_mlp": 0.0021693, + "balance_loss_clip": 1.07934999, + "balance_loss_mlp": 0.19032297, + "epoch": 0.7053058770479483, + "flos": 21871717528320.0, + "grad_norm": 34.21991092124688, + "language_loss": 0.74223852, + "learning_rate": 8.437637056415359e-07, + "loss": 0.75744617, + "num_input_tokens_seen": 253121290, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.26611328, + "step": 11731, + "time_per_iteration": 2.6538937091827393 + }, + { + "auxiliary_loss_clip": 0.01302487, + "auxiliary_loss_mlp": 0.00248768, + "balance_loss_clip": 1.08030844, + "balance_loss_mlp": 0.22313754, + "epoch": 0.7053660003006162, + "flos": 16398716094720.0, + "grad_norm": 60.94869609901482, + "language_loss": 0.82757443, + "learning_rate": 8.434459451122815e-07, + "loss": 0.84308696, + "num_input_tokens_seen": 253139720, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.2565918, + "step": 11732, + "time_per_iteration": 2.6105875968933105 + }, + { + "auxiliary_loss_clip": 0.01318774, + "auxiliary_loss_mlp": 0.00226691, + "balance_loss_clip": 1.09651589, + "balance_loss_mlp": 0.19978571, + "epoch": 0.7054261235532843, + "flos": 22711560399360.0, + "grad_norm": 230.39928559629897, + "language_loss": 0.77418, + "learning_rate": 8.431282284396735e-07, + "loss": 0.78963464, + "num_input_tokens_seen": 253160250, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.26928711, + "step": 11733, + "time_per_iteration": 2.6692514419555664 + }, + { + "auxiliary_loss_clip": 0.01287465, + "auxiliary_loss_mlp": 0.00233922, + "balance_loss_clip": 1.06664228, + "balance_loss_mlp": 0.2060028, + "epoch": 0.7054862468059522, + "flos": 13589711775360.0, + "grad_norm": 2.906144032403962, + "language_loss": 0.81351227, + "learning_rate": 8.428105556357583e-07, + "loss": 0.82872611, + "num_input_tokens_seen": 253178710, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.27941895, + "step": 11734, + "time_per_iteration": 2.617344379425049 + }, + { + "auxiliary_loss_clip": 0.01335906, + "auxiliary_loss_mlp": 0.00234844, + "balance_loss_clip": 1.09389019, + "balance_loss_mlp": 0.20663913, + "epoch": 0.7055463700586202, + "flos": 15880704105600.0, + "grad_norm": 3.628348902361387, + "language_loss": 0.82472265, + "learning_rate": 8.424929267125829e-07, + "loss": 0.84043014, + "num_input_tokens_seen": 253194805, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.28186035, + "step": 11735, + "time_per_iteration": 2.658045768737793 + }, + { + "auxiliary_loss_clip": 0.01335467, + "auxiliary_loss_mlp": 0.00219686, + "balance_loss_clip": 1.09663975, + "balance_loss_mlp": 0.1872611, + "epoch": 0.7056064933112881, + "flos": 23076161400960.0, + "grad_norm": 22.46104892926085, + "language_loss": 0.81119752, + "learning_rate": 8.421753416821933e-07, + "loss": 0.82674909, + "num_input_tokens_seen": 253213895, + "router_z_loss_clip": 2.390625, + "router_z_loss_mlp": 0.32421875, + "step": 11736, + "time_per_iteration": 2.689769983291626 + }, + { + "auxiliary_loss_clip": 0.01305399, + "auxiliary_loss_mlp": 0.00219753, + "balance_loss_clip": 1.08131957, + "balance_loss_mlp": 0.19271615, + "epoch": 0.7056666165639561, + "flos": 24057168721920.0, + "grad_norm": 17.39489262798504, + "language_loss": 0.75964153, + "learning_rate": 8.41857800556629e-07, + "loss": 0.77489305, + "num_input_tokens_seen": 253231620, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.27062988, + "step": 11737, + "time_per_iteration": 2.7491648197174072 + }, + { + "auxiliary_loss_clip": 0.01335472, + "auxiliary_loss_mlp": 0.00235325, + "balance_loss_clip": 1.09588861, + "balance_loss_mlp": 0.2057377, + "epoch": 0.705726739816624, + "flos": 17493237371520.0, + "grad_norm": 15.081980981608442, + "language_loss": 0.78224897, + "learning_rate": 8.415403033479332e-07, + "loss": 0.797957, + "num_input_tokens_seen": 253249590, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.29577637, + "step": 11738, + "time_per_iteration": 2.6664719581604004 + }, + { + "auxiliary_loss_clip": 0.0132925, + "auxiliary_loss_mlp": 0.00220611, + "balance_loss_clip": 1.09363246, + "balance_loss_mlp": 0.19048658, + "epoch": 0.7057868630692921, + "flos": 51350426472960.0, + "grad_norm": 32.42702375695564, + "language_loss": 0.83672082, + "learning_rate": 8.41222850068145e-07, + "loss": 0.85221946, + "num_input_tokens_seen": 253273870, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.30114746, + "step": 11739, + "time_per_iteration": 2.9577319622039795 + }, + { + "auxiliary_loss_clip": 0.01283925, + "auxiliary_loss_mlp": 0.00219629, + "balance_loss_clip": 1.062837, + "balance_loss_mlp": 0.19330731, + "epoch": 0.70584698632196, + "flos": 26102963836800.0, + "grad_norm": 28.762442675812515, + "language_loss": 0.78459811, + "learning_rate": 8.409054407293032e-07, + "loss": 0.79963362, + "num_input_tokens_seen": 253293720, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.26281738, + "step": 11740, + "time_per_iteration": 2.7515838146209717 + }, + { + "auxiliary_loss_clip": 0.01287157, + "auxiliary_loss_mlp": 0.00212783, + "balance_loss_clip": 1.06664169, + "balance_loss_mlp": 0.18723664, + "epoch": 0.705907109574628, + "flos": 21543134889600.0, + "grad_norm": 196.53078447009077, + "language_loss": 0.89626062, + "learning_rate": 8.405880753434434e-07, + "loss": 0.91126001, + "num_input_tokens_seen": 253313700, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.25537109, + "step": 11741, + "time_per_iteration": 2.6962642669677734 + }, + { + "auxiliary_loss_clip": 0.01309614, + "auxiliary_loss_mlp": 0.00231083, + "balance_loss_clip": 1.08166611, + "balance_loss_mlp": 0.20182911, + "epoch": 0.705967232827296, + "flos": 22710842127360.0, + "grad_norm": 1441.677851573194, + "language_loss": 0.87249136, + "learning_rate": 8.402707539225993e-07, + "loss": 0.88789827, + "num_input_tokens_seen": 253332425, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.29284668, + "step": 11742, + "time_per_iteration": 2.6889233589172363 + }, + { + "auxiliary_loss_clip": 0.0131415, + "auxiliary_loss_mlp": 0.00257398, + "balance_loss_clip": 1.07974052, + "balance_loss_mlp": 0.22783412, + "epoch": 0.7060273560799639, + "flos": 28691225124480.0, + "grad_norm": 3.728993143273088, + "language_loss": 0.72883701, + "learning_rate": 8.39953476478805e-07, + "loss": 0.74455249, + "num_input_tokens_seen": 253353620, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.2956543, + "step": 11743, + "time_per_iteration": 2.7079827785491943 + }, + { + "auxiliary_loss_clip": 0.01321113, + "auxiliary_loss_mlp": 0.00240838, + "balance_loss_clip": 1.08624458, + "balance_loss_mlp": 0.21296667, + "epoch": 0.7060874793326319, + "flos": 15706178899200.0, + "grad_norm": 21.459992160720404, + "language_loss": 0.73772317, + "learning_rate": 8.396362430240902e-07, + "loss": 0.75334275, + "num_input_tokens_seen": 253370930, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.27832031, + "step": 11744, + "time_per_iteration": 2.6887686252593994 + }, + { + "auxiliary_loss_clip": 0.01307566, + "auxiliary_loss_mlp": 0.00231633, + "balance_loss_clip": 1.08528531, + "balance_loss_mlp": 0.20575297, + "epoch": 0.7061476025852998, + "flos": 21506757390720.0, + "grad_norm": 4.629380566651849, + "language_loss": 0.72482294, + "learning_rate": 8.393190535704857e-07, + "loss": 0.74021494, + "num_input_tokens_seen": 253389810, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.25866699, + "step": 11745, + "time_per_iteration": 2.632183074951172 + }, + { + "auxiliary_loss_clip": 0.01310914, + "auxiliary_loss_mlp": 0.00233282, + "balance_loss_clip": 1.08735764, + "balance_loss_mlp": 0.20460053, + "epoch": 0.7062077258379679, + "flos": 28181832399360.0, + "grad_norm": 2.5529946609183995, + "language_loss": 0.7719962, + "learning_rate": 8.390019081300188e-07, + "loss": 0.78743815, + "num_input_tokens_seen": 253408685, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.28674316, + "step": 11746, + "time_per_iteration": 2.778212070465088 + }, + { + "auxiliary_loss_clip": 0.01293444, + "auxiliary_loss_mlp": 0.00219416, + "balance_loss_clip": 1.06996655, + "balance_loss_mlp": 0.19090064, + "epoch": 0.7062678490906358, + "flos": 27853680723840.0, + "grad_norm": 7.565634419161394, + "language_loss": 0.84946394, + "learning_rate": 8.386848067147175e-07, + "loss": 0.86459249, + "num_input_tokens_seen": 253429685, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.28491211, + "step": 11747, + "time_per_iteration": 2.7247676849365234 + }, + { + "auxiliary_loss_clip": 0.0128311, + "auxiliary_loss_mlp": 0.00223814, + "balance_loss_clip": 1.06855845, + "balance_loss_mlp": 0.19920962, + "epoch": 0.7063279723433038, + "flos": 23184862934400.0, + "grad_norm": 60.60335225987283, + "language_loss": 0.72048271, + "learning_rate": 8.383677493366031e-07, + "loss": 0.73555195, + "num_input_tokens_seen": 253448260, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.24621582, + "step": 11748, + "time_per_iteration": 2.6634669303894043 + }, + { + "auxiliary_loss_clip": 0.01334053, + "auxiliary_loss_mlp": 0.00260521, + "balance_loss_clip": 1.10058284, + "balance_loss_mlp": 0.23281638, + "epoch": 0.7063880955959717, + "flos": 20188655907840.0, + "grad_norm": 15.83648273847332, + "language_loss": 0.89145207, + "learning_rate": 8.380507360077003e-07, + "loss": 0.90739775, + "num_input_tokens_seen": 253467725, + "router_z_loss_clip": 2.33789062, + "router_z_loss_mlp": 0.27709961, + "step": 11749, + "time_per_iteration": 2.7296640872955322 + }, + { + "auxiliary_loss_clip": 0.01144363, + "auxiliary_loss_mlp": 0.00071639, + "balance_loss_clip": 0.99257708, + "balance_loss_mlp": 0.06372344, + "epoch": 0.7064482188486397, + "flos": 63668182763520.0, + "grad_norm": 0.7821520493582876, + "language_loss": 0.53388321, + "learning_rate": 8.377337667400304e-07, + "loss": 0.54604316, + "num_input_tokens_seen": 253526940, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.07910156, + "step": 11750, + "time_per_iteration": 3.1415154933929443 + }, + { + "auxiliary_loss_clip": 0.01309889, + "auxiliary_loss_mlp": 0.00231918, + "balance_loss_clip": 1.0841428, + "balance_loss_mlp": 0.20672946, + "epoch": 0.7065083421013076, + "flos": 25191227894400.0, + "grad_norm": 50.22901524468032, + "language_loss": 0.88308692, + "learning_rate": 8.37416841545612e-07, + "loss": 0.89850503, + "num_input_tokens_seen": 253546160, + "router_z_loss_clip": 2.265625, + "router_z_loss_mlp": 0.25170898, + "step": 11751, + "time_per_iteration": 2.730597734451294 + }, + { + "auxiliary_loss_clip": 0.0130193, + "auxiliary_loss_mlp": 0.00201096, + "balance_loss_clip": 1.07811129, + "balance_loss_mlp": 0.17444067, + "epoch": 0.7065684653539757, + "flos": 22893699288960.0, + "grad_norm": 4.8794327112594855, + "language_loss": 0.7584548, + "learning_rate": 8.370999604364634e-07, + "loss": 0.77348506, + "num_input_tokens_seen": 253565505, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.26660156, + "step": 11752, + "time_per_iteration": 4.065606355667114 + }, + { + "auxiliary_loss_clip": 0.01317368, + "auxiliary_loss_mlp": 0.00239564, + "balance_loss_clip": 1.08648133, + "balance_loss_mlp": 0.21042901, + "epoch": 0.7066285886066436, + "flos": 23550254035200.0, + "grad_norm": 3.4223805384649832, + "language_loss": 0.87047994, + "learning_rate": 8.367831234246025e-07, + "loss": 0.88604927, + "num_input_tokens_seen": 253585125, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.29125977, + "step": 11753, + "time_per_iteration": 4.157552003860474 + }, + { + "auxiliary_loss_clip": 0.0130819, + "auxiliary_loss_mlp": 0.00221703, + "balance_loss_clip": 1.0860424, + "balance_loss_mlp": 0.1966095, + "epoch": 0.7066887118593116, + "flos": 21069293650560.0, + "grad_norm": 17.561498410063667, + "language_loss": 0.7656002, + "learning_rate": 8.364663305220405e-07, + "loss": 0.78089917, + "num_input_tokens_seen": 253604815, + "router_z_loss_clip": 2.22753906, + "router_z_loss_mlp": 0.25085449, + "step": 11754, + "time_per_iteration": 2.735171318054199 + }, + { + "auxiliary_loss_clip": 0.01312966, + "auxiliary_loss_mlp": 0.00222381, + "balance_loss_clip": 1.08722937, + "balance_loss_mlp": 0.19577345, + "epoch": 0.7067488351119796, + "flos": 21176307244800.0, + "grad_norm": 16.025329457683053, + "language_loss": 0.94642526, + "learning_rate": 8.361495817407919e-07, + "loss": 0.96177876, + "num_input_tokens_seen": 253622855, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.26599121, + "step": 11755, + "time_per_iteration": 2.6536967754364014 + }, + { + "auxiliary_loss_clip": 0.01291818, + "auxiliary_loss_mlp": 0.00235595, + "balance_loss_clip": 1.07401145, + "balance_loss_mlp": 0.20986995, + "epoch": 0.7068089583646475, + "flos": 20449224144000.0, + "grad_norm": 2.952421877152004, + "language_loss": 0.88228846, + "learning_rate": 8.358328770928678e-07, + "loss": 0.89756262, + "num_input_tokens_seen": 253642760, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.25708008, + "step": 11756, + "time_per_iteration": 4.105769872665405 + }, + { + "auxiliary_loss_clip": 0.01148786, + "auxiliary_loss_mlp": 0.00061118, + "balance_loss_clip": 0.99933922, + "balance_loss_mlp": 0.05124793, + "epoch": 0.7068690816173155, + "flos": 59109179829120.0, + "grad_norm": 0.823877723369298, + "language_loss": 0.59542727, + "learning_rate": 8.355162165902785e-07, + "loss": 0.6075263, + "num_input_tokens_seen": 253695685, + "router_z_loss_clip": 1.4921875, + "router_z_loss_mlp": 0.09863281, + "step": 11757, + "time_per_iteration": 2.967616558074951 + }, + { + "auxiliary_loss_clip": 0.01316745, + "auxiliary_loss_mlp": 0.00265007, + "balance_loss_clip": 1.09149361, + "balance_loss_mlp": 0.23794615, + "epoch": 0.7069292048699835, + "flos": 16251554073600.0, + "grad_norm": 7.8541397859173605, + "language_loss": 0.89002889, + "learning_rate": 8.351996002450307e-07, + "loss": 0.90584636, + "num_input_tokens_seen": 253713305, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.27050781, + "step": 11758, + "time_per_iteration": 2.6295113563537598 + }, + { + "auxiliary_loss_clip": 0.01306234, + "auxiliary_loss_mlp": 0.00243387, + "balance_loss_clip": 1.08393776, + "balance_loss_mlp": 0.21575466, + "epoch": 0.7069893281226515, + "flos": 41172768455040.0, + "grad_norm": 4.190992112918625, + "language_loss": 0.84691751, + "learning_rate": 8.348830280691304e-07, + "loss": 0.8624137, + "num_input_tokens_seen": 253736100, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.27648926, + "step": 11759, + "time_per_iteration": 2.8372421264648438 + }, + { + "auxiliary_loss_clip": 0.01306034, + "auxiliary_loss_mlp": 0.0020472, + "balance_loss_clip": 1.08065867, + "balance_loss_mlp": 0.17839819, + "epoch": 0.7070494513753194, + "flos": 24207275658240.0, + "grad_norm": 30.358461241256776, + "language_loss": 0.76064342, + "learning_rate": 8.34566500074583e-07, + "loss": 0.77575094, + "num_input_tokens_seen": 253757350, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.26318359, + "step": 11760, + "time_per_iteration": 4.148841619491577 + }, + { + "auxiliary_loss_clip": 0.01316615, + "auxiliary_loss_mlp": 0.00229283, + "balance_loss_clip": 1.08502531, + "balance_loss_mlp": 0.20200787, + "epoch": 0.7071095746279874, + "flos": 20185675079040.0, + "grad_norm": 735.3354563176207, + "language_loss": 0.856749, + "learning_rate": 8.342500162733899e-07, + "loss": 0.872208, + "num_input_tokens_seen": 253772855, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.27270508, + "step": 11761, + "time_per_iteration": 2.6712989807128906 + }, + { + "auxiliary_loss_clip": 0.01313266, + "auxiliary_loss_mlp": 0.00233881, + "balance_loss_clip": 1.08972406, + "balance_loss_mlp": 0.20746401, + "epoch": 0.7071696978806553, + "flos": 18183045133440.0, + "grad_norm": 7.372245888337569, + "language_loss": 0.82323182, + "learning_rate": 8.33933576677553e-07, + "loss": 0.83870327, + "num_input_tokens_seen": 253790360, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.26403809, + "step": 11762, + "time_per_iteration": 2.675609588623047 + }, + { + "auxiliary_loss_clip": 0.01299462, + "auxiliary_loss_mlp": 0.00226695, + "balance_loss_clip": 1.08113039, + "balance_loss_mlp": 0.20088658, + "epoch": 0.7072298211333233, + "flos": 24131719399680.0, + "grad_norm": 7.651089139557024, + "language_loss": 0.84264308, + "learning_rate": 8.336171812990724e-07, + "loss": 0.85790467, + "num_input_tokens_seen": 253810585, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.25793457, + "step": 11763, + "time_per_iteration": 2.663773775100708 + }, + { + "auxiliary_loss_clip": 0.0131509, + "auxiliary_loss_mlp": 0.00244894, + "balance_loss_clip": 1.0856005, + "balance_loss_mlp": 0.21600926, + "epoch": 0.7072899443859912, + "flos": 27198418867200.0, + "grad_norm": 145.79413998867074, + "language_loss": 0.87113613, + "learning_rate": 8.333008301499453e-07, + "loss": 0.88673598, + "num_input_tokens_seen": 253829080, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.28894043, + "step": 11764, + "time_per_iteration": 2.7702624797821045 + }, + { + "auxiliary_loss_clip": 0.01302811, + "auxiliary_loss_mlp": 0.0023128, + "balance_loss_clip": 1.08003044, + "balance_loss_mlp": 0.20437488, + "epoch": 0.7073500676386593, + "flos": 16435596384000.0, + "grad_norm": 7.610852136064226, + "language_loss": 0.87211096, + "learning_rate": 8.32984523242167e-07, + "loss": 0.88745183, + "num_input_tokens_seen": 253846780, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.26940918, + "step": 11765, + "time_per_iteration": 2.6413052082061768 + }, + { + "auxiliary_loss_clip": 0.01312833, + "auxiliary_loss_mlp": 0.00247549, + "balance_loss_clip": 1.08924675, + "balance_loss_mlp": 0.22028607, + "epoch": 0.7074101908913272, + "flos": 27673732563840.0, + "grad_norm": 200.82270234390404, + "language_loss": 0.75362372, + "learning_rate": 8.326682605877324e-07, + "loss": 0.76922756, + "num_input_tokens_seen": 253867075, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.27270508, + "step": 11766, + "time_per_iteration": 2.772528886795044 + }, + { + "auxiliary_loss_clip": 0.0133011, + "auxiliary_loss_mlp": 0.00244982, + "balance_loss_clip": 1.09978426, + "balance_loss_mlp": 0.21665801, + "epoch": 0.7074703141439952, + "flos": 22238078296320.0, + "grad_norm": 15.652506593538734, + "language_loss": 0.72292864, + "learning_rate": 8.323520421986352e-07, + "loss": 0.73867953, + "num_input_tokens_seen": 253885790, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.28295898, + "step": 11767, + "time_per_iteration": 2.7287936210632324 + }, + { + "auxiliary_loss_clip": 0.01315436, + "auxiliary_loss_mlp": 0.0022021, + "balance_loss_clip": 1.08451641, + "balance_loss_mlp": 0.19174254, + "epoch": 0.7075304373966632, + "flos": 29643217234560.0, + "grad_norm": 173.5241611600249, + "language_loss": 0.61132354, + "learning_rate": 8.320358680868646e-07, + "loss": 0.62667996, + "num_input_tokens_seen": 253907070, + "router_z_loss_clip": 2.31054688, + "router_z_loss_mlp": 0.28417969, + "step": 11768, + "time_per_iteration": 2.7615857124328613 + }, + { + "auxiliary_loss_clip": 0.01290216, + "auxiliary_loss_mlp": 0.00208498, + "balance_loss_clip": 1.07181776, + "balance_loss_mlp": 0.18206942, + "epoch": 0.7075905606493311, + "flos": 19755214490880.0, + "grad_norm": 2.71677225170037, + "language_loss": 0.82408965, + "learning_rate": 8.317197382644119e-07, + "loss": 0.83907682, + "num_input_tokens_seen": 253927290, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.26428223, + "step": 11769, + "time_per_iteration": 2.692446708679199 + }, + { + "auxiliary_loss_clip": 0.01174854, + "auxiliary_loss_mlp": 0.00093828, + "balance_loss_clip": 1.02405286, + "balance_loss_mlp": 0.08543548, + "epoch": 0.7076506839019991, + "flos": 65716132694400.0, + "grad_norm": 0.8362946786902048, + "language_loss": 0.61487353, + "learning_rate": 8.314036527432637e-07, + "loss": 0.62756038, + "num_input_tokens_seen": 253983440, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.08398438, + "step": 11770, + "time_per_iteration": 3.1053028106689453 + }, + { + "auxiliary_loss_clip": 0.0133586, + "auxiliary_loss_mlp": 0.00235104, + "balance_loss_clip": 1.10499501, + "balance_loss_mlp": 0.20796019, + "epoch": 0.707710807154667, + "flos": 23765286804480.0, + "grad_norm": 5.5328402097002245, + "language_loss": 0.82802981, + "learning_rate": 8.310876115354055e-07, + "loss": 0.84373939, + "num_input_tokens_seen": 254003825, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.27160645, + "step": 11771, + "time_per_iteration": 2.751283884048462 + }, + { + "auxiliary_loss_clip": 0.01305564, + "auxiliary_loss_mlp": 0.00236505, + "balance_loss_clip": 1.08369279, + "balance_loss_mlp": 0.20772775, + "epoch": 0.7077709304073351, + "flos": 21251360712960.0, + "grad_norm": 163.39558080410222, + "language_loss": 0.79734862, + "learning_rate": 8.307716146528221e-07, + "loss": 0.81276929, + "num_input_tokens_seen": 254023345, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.28808594, + "step": 11772, + "time_per_iteration": 2.6969833374023438 + }, + { + "auxiliary_loss_clip": 0.01315169, + "auxiliary_loss_mlp": 0.00217284, + "balance_loss_clip": 1.08569837, + "balance_loss_mlp": 0.18991369, + "epoch": 0.707831053660003, + "flos": 20740746925440.0, + "grad_norm": 25.933654063051723, + "language_loss": 0.80892909, + "learning_rate": 8.30455662107496e-07, + "loss": 0.82425362, + "num_input_tokens_seen": 254041815, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.27380371, + "step": 11773, + "time_per_iteration": 2.692239284515381 + }, + { + "auxiliary_loss_clip": 0.01332292, + "auxiliary_loss_mlp": 0.00255219, + "balance_loss_clip": 1.10147107, + "balance_loss_mlp": 0.22802764, + "epoch": 0.707891176912671, + "flos": 21980993679360.0, + "grad_norm": 30.936194627872844, + "language_loss": 0.77894199, + "learning_rate": 8.301397539114095e-07, + "loss": 0.79481709, + "num_input_tokens_seen": 254062065, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.27197266, + "step": 11774, + "time_per_iteration": 2.6728408336639404 + }, + { + "auxiliary_loss_clip": 0.01297431, + "auxiliary_loss_mlp": 0.00235735, + "balance_loss_clip": 1.08267379, + "balance_loss_mlp": 0.21004577, + "epoch": 0.7079513001653389, + "flos": 21068970428160.0, + "grad_norm": 1364.1114897444427, + "language_loss": 0.81514478, + "learning_rate": 8.298238900765407e-07, + "loss": 0.8304764, + "num_input_tokens_seen": 254080605, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.25695801, + "step": 11775, + "time_per_iteration": 2.6735169887542725 + }, + { + "auxiliary_loss_clip": 0.01318235, + "auxiliary_loss_mlp": 0.00262022, + "balance_loss_clip": 1.09220397, + "balance_loss_mlp": 0.23413911, + "epoch": 0.7080114234180069, + "flos": 18040659621120.0, + "grad_norm": 123.84558346286963, + "language_loss": 0.93919879, + "learning_rate": 8.295080706148665e-07, + "loss": 0.95500135, + "num_input_tokens_seen": 254098710, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.27856445, + "step": 11776, + "time_per_iteration": 2.626718759536743 + }, + { + "auxiliary_loss_clip": 0.0130368, + "auxiliary_loss_mlp": 0.00227973, + "balance_loss_clip": 1.07993364, + "balance_loss_mlp": 0.19998288, + "epoch": 0.7080715466706748, + "flos": 15122271409920.0, + "grad_norm": 68.63603195095418, + "language_loss": 0.8186906, + "learning_rate": 8.291922955383641e-07, + "loss": 0.83400714, + "num_input_tokens_seen": 254117200, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.28015137, + "step": 11777, + "time_per_iteration": 2.6779003143310547 + }, + { + "auxiliary_loss_clip": 0.01343658, + "auxiliary_loss_mlp": 0.00254166, + "balance_loss_clip": 1.10791063, + "balance_loss_mlp": 0.22535291, + "epoch": 0.7081316699233429, + "flos": 14422802889600.0, + "grad_norm": 6.3698679744293765, + "language_loss": 0.90174735, + "learning_rate": 8.288765648590066e-07, + "loss": 0.91772568, + "num_input_tokens_seen": 254132115, + "router_z_loss_clip": 2.35546875, + "router_z_loss_mlp": 0.28845215, + "step": 11778, + "time_per_iteration": 2.611539602279663 + }, + { + "auxiliary_loss_clip": 0.01311736, + "auxiliary_loss_mlp": 0.00223513, + "balance_loss_clip": 1.08899486, + "balance_loss_mlp": 0.19822881, + "epoch": 0.7081917931760108, + "flos": 23222389668480.0, + "grad_norm": 2.0723821926668036, + "language_loss": 0.906708, + "learning_rate": 8.285608785887673e-07, + "loss": 0.92206055, + "num_input_tokens_seen": 254152285, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.25280762, + "step": 11779, + "time_per_iteration": 2.7006278038024902 + }, + { + "auxiliary_loss_clip": 0.01329387, + "auxiliary_loss_mlp": 0.00238015, + "balance_loss_clip": 1.09784985, + "balance_loss_mlp": 0.21171729, + "epoch": 0.7082519164286788, + "flos": 39308429871360.0, + "grad_norm": 10.121596812940599, + "language_loss": 0.78394639, + "learning_rate": 8.28245236739618e-07, + "loss": 0.79962045, + "num_input_tokens_seen": 254172805, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.26293945, + "step": 11780, + "time_per_iteration": 2.848233699798584 + }, + { + "auxiliary_loss_clip": 0.01319947, + "auxiliary_loss_mlp": 0.00205106, + "balance_loss_clip": 1.09083533, + "balance_loss_mlp": 0.17856991, + "epoch": 0.7083120396813467, + "flos": 21651154064640.0, + "grad_norm": 244.0619349576848, + "language_loss": 0.80012584, + "learning_rate": 8.279296393235256e-07, + "loss": 0.81537634, + "num_input_tokens_seen": 254191890, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.26538086, + "step": 11781, + "time_per_iteration": 2.704643487930298 + }, + { + "auxiliary_loss_clip": 0.01299745, + "auxiliary_loss_mlp": 0.00217526, + "balance_loss_clip": 1.07875276, + "balance_loss_mlp": 0.19258785, + "epoch": 0.7083721629340147, + "flos": 17567033863680.0, + "grad_norm": 12.916079052207705, + "language_loss": 0.85229313, + "learning_rate": 8.276140863524585e-07, + "loss": 0.86746585, + "num_input_tokens_seen": 254210150, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.24975586, + "step": 11782, + "time_per_iteration": 2.6962387561798096 + }, + { + "auxiliary_loss_clip": 0.01297354, + "auxiliary_loss_mlp": 0.00214264, + "balance_loss_clip": 1.07921827, + "balance_loss_mlp": 0.19081597, + "epoch": 0.7084322861866827, + "flos": 29350509304320.0, + "grad_norm": 2.1652447612010053, + "language_loss": 0.75775671, + "learning_rate": 8.272985778383828e-07, + "loss": 0.77287292, + "num_input_tokens_seen": 254233015, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.23449707, + "step": 11783, + "time_per_iteration": 2.747670888900757 + }, + { + "auxiliary_loss_clip": 0.01320161, + "auxiliary_loss_mlp": 0.0024293, + "balance_loss_clip": 1.09177828, + "balance_loss_mlp": 0.21575026, + "epoch": 0.7084924094393507, + "flos": 20194294343040.0, + "grad_norm": 74.80175211946299, + "language_loss": 0.85525769, + "learning_rate": 8.269831137932632e-07, + "loss": 0.87088865, + "num_input_tokens_seen": 254251345, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.27185059, + "step": 11784, + "time_per_iteration": 2.701885461807251 + }, + { + "auxiliary_loss_clip": 0.01317298, + "auxiliary_loss_mlp": 0.00212169, + "balance_loss_clip": 1.09566522, + "balance_loss_mlp": 0.18513253, + "epoch": 0.7085525326920187, + "flos": 23477211728640.0, + "grad_norm": 17.08869940962551, + "language_loss": 0.85103178, + "learning_rate": 8.266676942290609e-07, + "loss": 0.86632645, + "num_input_tokens_seen": 254269905, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.27038574, + "step": 11785, + "time_per_iteration": 2.6550745964050293 + }, + { + "auxiliary_loss_clip": 0.0131782, + "auxiliary_loss_mlp": 0.00238683, + "balance_loss_clip": 1.08891094, + "balance_loss_mlp": 0.21084802, + "epoch": 0.7086126559446866, + "flos": 25958818558080.0, + "grad_norm": 4.989566989739853, + "language_loss": 0.84086692, + "learning_rate": 8.26352319157738e-07, + "loss": 0.85643196, + "num_input_tokens_seen": 254289990, + "router_z_loss_clip": 2.29101562, + "router_z_loss_mlp": 0.27832031, + "step": 11786, + "time_per_iteration": 2.75182843208313 + }, + { + "auxiliary_loss_clip": 0.01344697, + "auxiliary_loss_mlp": 0.00250227, + "balance_loss_clip": 1.10377717, + "balance_loss_mlp": 0.21985272, + "epoch": 0.7086727791973546, + "flos": 26724793109760.0, + "grad_norm": 1657.7014686987259, + "language_loss": 0.86547232, + "learning_rate": 8.260369885912526e-07, + "loss": 0.88142151, + "num_input_tokens_seen": 254309085, + "router_z_loss_clip": 2.41210938, + "router_z_loss_mlp": 0.30383301, + "step": 11787, + "time_per_iteration": 2.710374593734741 + }, + { + "auxiliary_loss_clip": 0.01319059, + "auxiliary_loss_mlp": 0.00239937, + "balance_loss_clip": 1.08884549, + "balance_loss_mlp": 0.21325818, + "epoch": 0.7087329024500225, + "flos": 21683365585920.0, + "grad_norm": 51.61730053589012, + "language_loss": 0.84958708, + "learning_rate": 8.257217025415615e-07, + "loss": 0.86517704, + "num_input_tokens_seen": 254327045, + "router_z_loss_clip": 2.30273438, + "router_z_loss_mlp": 0.26660156, + "step": 11788, + "time_per_iteration": 2.6813700199127197 + }, + { + "auxiliary_loss_clip": 0.01360167, + "auxiliary_loss_mlp": 0.00257817, + "balance_loss_clip": 1.11741817, + "balance_loss_mlp": 0.22662038, + "epoch": 0.7087930257026905, + "flos": 17931060247680.0, + "grad_norm": 15.412505344874223, + "language_loss": 0.7955755, + "learning_rate": 8.254064610206212e-07, + "loss": 0.8117553, + "num_input_tokens_seen": 254344585, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.31188965, + "step": 11789, + "time_per_iteration": 2.6369714736938477 + }, + { + "auxiliary_loss_clip": 0.01332695, + "auxiliary_loss_mlp": 0.00251012, + "balance_loss_clip": 1.10202718, + "balance_loss_mlp": 0.22255713, + "epoch": 0.7088531489553584, + "flos": 18911528864640.0, + "grad_norm": 9.018995081709546, + "language_loss": 0.84803426, + "learning_rate": 8.250912640403858e-07, + "loss": 0.86387134, + "num_input_tokens_seen": 254362470, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.2845459, + "step": 11790, + "time_per_iteration": 2.678720235824585 + }, + { + "auxiliary_loss_clip": 0.01335088, + "auxiliary_loss_mlp": 0.00257669, + "balance_loss_clip": 1.09826136, + "balance_loss_mlp": 0.22909483, + "epoch": 0.7089132722080265, + "flos": 27380880979200.0, + "grad_norm": 189.95478257218957, + "language_loss": 0.81768656, + "learning_rate": 8.247761116128085e-07, + "loss": 0.83361411, + "num_input_tokens_seen": 254383190, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.28540039, + "step": 11791, + "time_per_iteration": 2.728337049484253 + }, + { + "auxiliary_loss_clip": 0.0131831, + "auxiliary_loss_mlp": 0.00251133, + "balance_loss_clip": 1.09359479, + "balance_loss_mlp": 0.2238695, + "epoch": 0.7089733954606944, + "flos": 22162917087360.0, + "grad_norm": 52.750406647008674, + "language_loss": 0.89692217, + "learning_rate": 8.244610037498376e-07, + "loss": 0.91261661, + "num_input_tokens_seen": 254403115, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.27282715, + "step": 11792, + "time_per_iteration": 2.649060010910034 + }, + { + "auxiliary_loss_clip": 0.01333196, + "auxiliary_loss_mlp": 0.00234937, + "balance_loss_clip": 1.09487367, + "balance_loss_mlp": 0.20501527, + "epoch": 0.7090335187133624, + "flos": 24425827960320.0, + "grad_norm": 8.639870072237386, + "language_loss": 0.73816812, + "learning_rate": 8.241459404634232e-07, + "loss": 0.75384951, + "num_input_tokens_seen": 254421875, + "router_z_loss_clip": 2.3828125, + "router_z_loss_mlp": 0.29931641, + "step": 11793, + "time_per_iteration": 2.676018714904785 + }, + { + "auxiliary_loss_clip": 0.012995, + "auxiliary_loss_mlp": 0.00250214, + "balance_loss_clip": 1.07636631, + "balance_loss_mlp": 0.2238811, + "epoch": 0.7090936419660303, + "flos": 21835232288640.0, + "grad_norm": 13.30590651833809, + "language_loss": 0.77432221, + "learning_rate": 8.238309217655133e-07, + "loss": 0.78981942, + "num_input_tokens_seen": 254440765, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.26342773, + "step": 11794, + "time_per_iteration": 4.08112907409668 + }, + { + "auxiliary_loss_clip": 0.01322997, + "auxiliary_loss_mlp": 0.00244229, + "balance_loss_clip": 1.09356093, + "balance_loss_mlp": 0.21777654, + "epoch": 0.7091537652186983, + "flos": 20082360585600.0, + "grad_norm": 6.699976676901574, + "language_loss": 0.81575048, + "learning_rate": 8.23515947668052e-07, + "loss": 0.83142269, + "num_input_tokens_seen": 254459480, + "router_z_loss_clip": 2.29589844, + "router_z_loss_mlp": 0.26452637, + "step": 11795, + "time_per_iteration": 4.156214714050293 + }, + { + "auxiliary_loss_clip": 0.01312425, + "auxiliary_loss_mlp": 0.0024266, + "balance_loss_clip": 1.08181763, + "balance_loss_mlp": 0.21559973, + "epoch": 0.7092138884713663, + "flos": 13151565676800.0, + "grad_norm": 99.41797874437958, + "language_loss": 0.84658235, + "learning_rate": 8.232010181829838e-07, + "loss": 0.8621332, + "num_input_tokens_seen": 254473985, + "router_z_loss_clip": 2.30566406, + "router_z_loss_mlp": 0.27062988, + "step": 11796, + "time_per_iteration": 2.656310796737671 + }, + { + "auxiliary_loss_clip": 0.01325841, + "auxiliary_loss_mlp": 0.00266719, + "balance_loss_clip": 1.08801007, + "balance_loss_mlp": 0.23595136, + "epoch": 0.7092740117240343, + "flos": 21645982506240.0, + "grad_norm": 3.3170342866029996, + "language_loss": 0.8135649, + "learning_rate": 8.228861333222523e-07, + "loss": 0.82949054, + "num_input_tokens_seen": 254492135, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.30786133, + "step": 11797, + "time_per_iteration": 2.627934455871582 + }, + { + "auxiliary_loss_clip": 0.0131857, + "auxiliary_loss_mlp": 0.00233344, + "balance_loss_clip": 1.0920707, + "balance_loss_mlp": 0.20627198, + "epoch": 0.7093341349767023, + "flos": 21032521102080.0, + "grad_norm": 6.062186458002524, + "language_loss": 0.86124688, + "learning_rate": 8.225712930977953e-07, + "loss": 0.87676609, + "num_input_tokens_seen": 254512865, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.27087402, + "step": 11798, + "time_per_iteration": 4.1195783615112305 + }, + { + "auxiliary_loss_clip": 0.01295147, + "auxiliary_loss_mlp": 0.00237158, + "balance_loss_clip": 1.07578409, + "balance_loss_mlp": 0.21189737, + "epoch": 0.7093942582293702, + "flos": 22017658487040.0, + "grad_norm": 367.14708351276334, + "language_loss": 0.73859, + "learning_rate": 8.222564975215529e-07, + "loss": 0.7539131, + "num_input_tokens_seen": 254532605, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.25268555, + "step": 11799, + "time_per_iteration": 2.7658236026763916 + }, + { + "auxiliary_loss_clip": 0.01306503, + "auxiliary_loss_mlp": 0.00231256, + "balance_loss_clip": 1.08147573, + "balance_loss_mlp": 0.20367077, + "epoch": 0.7094543814820382, + "flos": 27235586465280.0, + "grad_norm": 51.69963050462977, + "language_loss": 0.89106786, + "learning_rate": 8.219417466054622e-07, + "loss": 0.90644544, + "num_input_tokens_seen": 254553780, + "router_z_loss_clip": 2.24902344, + "router_z_loss_mlp": 0.27624512, + "step": 11800, + "time_per_iteration": 2.759904146194458 + }, + { + "auxiliary_loss_clip": 0.01302847, + "auxiliary_loss_mlp": 0.00233453, + "balance_loss_clip": 1.08015442, + "balance_loss_mlp": 0.2060115, + "epoch": 0.7095145047347061, + "flos": 12089148180480.0, + "grad_norm": 5.529268432127161, + "language_loss": 0.94055223, + "learning_rate": 8.21627040361459e-07, + "loss": 0.95591527, + "num_input_tokens_seen": 254567510, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.27441406, + "step": 11801, + "time_per_iteration": 2.6875314712524414 + }, + { + "auxiliary_loss_clip": 0.01303428, + "auxiliary_loss_mlp": 0.00221785, + "balance_loss_clip": 1.07665348, + "balance_loss_mlp": 0.19498691, + "epoch": 0.7095746279873741, + "flos": 19383789905280.0, + "grad_norm": 15.311859806525256, + "language_loss": 0.84082711, + "learning_rate": 8.213123788014758e-07, + "loss": 0.85607922, + "num_input_tokens_seen": 254585565, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.26782227, + "step": 11802, + "time_per_iteration": 2.6406784057617188 + }, + { + "auxiliary_loss_clip": 0.01307315, + "auxiliary_loss_mlp": 0.00259105, + "balance_loss_clip": 1.07991576, + "balance_loss_mlp": 0.23037511, + "epoch": 0.709634751240042, + "flos": 21360600950400.0, + "grad_norm": 4.100908213751981, + "language_loss": 0.90386236, + "learning_rate": 8.209977619374462e-07, + "loss": 0.91952658, + "num_input_tokens_seen": 254603465, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.28735352, + "step": 11803, + "time_per_iteration": 4.037668943405151 + }, + { + "auxiliary_loss_clip": 0.01303068, + "auxiliary_loss_mlp": 0.00212109, + "balance_loss_clip": 1.07581019, + "balance_loss_mlp": 0.18593046, + "epoch": 0.7096948744927101, + "flos": 13917037438080.0, + "grad_norm": 17.77275933572515, + "language_loss": 0.77895284, + "learning_rate": 8.206831897812995e-07, + "loss": 0.7941047, + "num_input_tokens_seen": 254620500, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.26196289, + "step": 11804, + "time_per_iteration": 2.7181293964385986 + }, + { + "auxiliary_loss_clip": 0.01284986, + "auxiliary_loss_mlp": 0.0023195, + "balance_loss_clip": 1.07100308, + "balance_loss_mlp": 0.20666611, + "epoch": 0.709754997745378, + "flos": 30298335436800.0, + "grad_norm": 22.154516640777494, + "language_loss": 0.85323536, + "learning_rate": 8.203686623449637e-07, + "loss": 0.86840475, + "num_input_tokens_seen": 254638565, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.25292969, + "step": 11805, + "time_per_iteration": 2.7226836681365967 + }, + { + "auxiliary_loss_clip": 0.0129799, + "auxiliary_loss_mlp": 0.00239087, + "balance_loss_clip": 1.0788151, + "balance_loss_mlp": 0.21079835, + "epoch": 0.709815120998046, + "flos": 18515147304960.0, + "grad_norm": 7.129100229469187, + "language_loss": 0.89134413, + "learning_rate": 8.200541796403667e-07, + "loss": 0.90671492, + "num_input_tokens_seen": 254657505, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.28259277, + "step": 11806, + "time_per_iteration": 2.6834018230438232 + }, + { + "auxiliary_loss_clip": 0.01328154, + "auxiliary_loss_mlp": 0.0023504, + "balance_loss_clip": 1.09248495, + "balance_loss_mlp": 0.20606007, + "epoch": 0.7098752442507139, + "flos": 22272588288000.0, + "grad_norm": 43.02001618844886, + "language_loss": 0.69695556, + "learning_rate": 8.197397416794332e-07, + "loss": 0.71258754, + "num_input_tokens_seen": 254674730, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.29003906, + "step": 11807, + "time_per_iteration": 2.648043155670166 + }, + { + "auxiliary_loss_clip": 0.01317784, + "auxiliary_loss_mlp": 0.00253206, + "balance_loss_clip": 1.08190334, + "balance_loss_mlp": 0.22379705, + "epoch": 0.7099353675033819, + "flos": 19275447507840.0, + "grad_norm": 4.9955948137257495, + "language_loss": 0.79677719, + "learning_rate": 8.194253484740882e-07, + "loss": 0.81248707, + "num_input_tokens_seen": 254691665, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.29406738, + "step": 11808, + "time_per_iteration": 2.6850717067718506 + }, + { + "auxiliary_loss_clip": 0.01302351, + "auxiliary_loss_mlp": 0.00245787, + "balance_loss_clip": 1.07675552, + "balance_loss_mlp": 0.21972759, + "epoch": 0.70999549075605, + "flos": 21908525990400.0, + "grad_norm": 23.450290510449364, + "language_loss": 0.79483259, + "learning_rate": 8.191110000362513e-07, + "loss": 0.810314, + "num_input_tokens_seen": 254711610, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.26049805, + "step": 11809, + "time_per_iteration": 2.6580810546875 + }, + { + "auxiliary_loss_clip": 0.01138217, + "auxiliary_loss_mlp": 0.00110738, + "balance_loss_clip": 0.98690641, + "balance_loss_mlp": 0.10339484, + "epoch": 0.7100556140087179, + "flos": 70456053456000.0, + "grad_norm": 0.8189236744616847, + "language_loss": 0.58730751, + "learning_rate": 8.187966963778435e-07, + "loss": 0.59979707, + "num_input_tokens_seen": 254772615, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.07324219, + "step": 11810, + "time_per_iteration": 3.241323232650757 + }, + { + "auxiliary_loss_clip": 0.01298781, + "auxiliary_loss_mlp": 0.00236374, + "balance_loss_clip": 1.07861161, + "balance_loss_mlp": 0.21005303, + "epoch": 0.7101157372613859, + "flos": 23039568420480.0, + "grad_norm": 5.593104852310323, + "language_loss": 0.81009841, + "learning_rate": 8.18482437510784e-07, + "loss": 0.82544994, + "num_input_tokens_seen": 254791375, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.26318359, + "step": 11811, + "time_per_iteration": 2.7064311504364014 + }, + { + "auxiliary_loss_clip": 0.01289432, + "auxiliary_loss_mlp": 0.00235322, + "balance_loss_clip": 1.07041764, + "balance_loss_mlp": 0.20848814, + "epoch": 0.7101758605140538, + "flos": 23185329811200.0, + "grad_norm": 478.68339422123296, + "language_loss": 0.89451277, + "learning_rate": 8.181682234469882e-07, + "loss": 0.90976036, + "num_input_tokens_seen": 254809300, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.26843262, + "step": 11812, + "time_per_iteration": 2.683307647705078 + }, + { + "auxiliary_loss_clip": 0.01308027, + "auxiliary_loss_mlp": 0.00234162, + "balance_loss_clip": 1.07913661, + "balance_loss_mlp": 0.20754215, + "epoch": 0.7102359837667218, + "flos": 23696123166720.0, + "grad_norm": 9.538017043412362, + "language_loss": 0.78298199, + "learning_rate": 8.178540541983716e-07, + "loss": 0.79840392, + "num_input_tokens_seen": 254829325, + "router_z_loss_clip": 2.29199219, + "router_z_loss_mlp": 0.26623535, + "step": 11813, + "time_per_iteration": 2.692432165145874 + }, + { + "auxiliary_loss_clip": 0.01280867, + "auxiliary_loss_mlp": 0.00231497, + "balance_loss_clip": 1.06379867, + "balance_loss_mlp": 0.20510413, + "epoch": 0.7102961070193897, + "flos": 19391116279680.0, + "grad_norm": 3.1039371274621796, + "language_loss": 0.89275843, + "learning_rate": 8.175399297768495e-07, + "loss": 0.90788209, + "num_input_tokens_seen": 254847690, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.26379395, + "step": 11814, + "time_per_iteration": 2.6304638385772705 + }, + { + "auxiliary_loss_clip": 0.01309877, + "auxiliary_loss_mlp": 0.00237565, + "balance_loss_clip": 1.0812943, + "balance_loss_mlp": 0.21098107, + "epoch": 0.7103562302720577, + "flos": 21507511576320.0, + "grad_norm": 5.1568719017404225, + "language_loss": 0.84473801, + "learning_rate": 8.172258501943301e-07, + "loss": 0.86021245, + "num_input_tokens_seen": 254865960, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.26574707, + "step": 11815, + "time_per_iteration": 2.6646881103515625 + }, + { + "auxiliary_loss_clip": 0.01273851, + "auxiliary_loss_mlp": 0.00239697, + "balance_loss_clip": 1.06197822, + "balance_loss_mlp": 0.21513981, + "epoch": 0.7104163535247257, + "flos": 14535059869440.0, + "grad_norm": 22.050838542330087, + "language_loss": 0.86587441, + "learning_rate": 8.16911815462725e-07, + "loss": 0.88100988, + "num_input_tokens_seen": 254882815, + "router_z_loss_clip": 2.11621094, + "router_z_loss_mlp": 0.2454834, + "step": 11816, + "time_per_iteration": 2.6475400924682617 + }, + { + "auxiliary_loss_clip": 0.01264185, + "auxiliary_loss_mlp": 0.00236192, + "balance_loss_clip": 1.05082464, + "balance_loss_mlp": 0.21106282, + "epoch": 0.7104764767773937, + "flos": 11400310085760.0, + "grad_norm": 7.529657391722591, + "language_loss": 0.93358308, + "learning_rate": 8.165978255939426e-07, + "loss": 0.94858682, + "num_input_tokens_seen": 254898705, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.25146484, + "step": 11817, + "time_per_iteration": 2.623936653137207 + }, + { + "auxiliary_loss_clip": 0.01279408, + "auxiliary_loss_mlp": 0.00231503, + "balance_loss_clip": 1.05989003, + "balance_loss_mlp": 0.2066716, + "epoch": 0.7105366000300616, + "flos": 11690432236800.0, + "grad_norm": 7.0885683922989395, + "language_loss": 0.94240069, + "learning_rate": 8.162838805998897e-07, + "loss": 0.95750976, + "num_input_tokens_seen": 254913665, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.24853516, + "step": 11818, + "time_per_iteration": 2.7172231674194336 + }, + { + "auxiliary_loss_clip": 0.01271691, + "auxiliary_loss_mlp": 0.00235114, + "balance_loss_clip": 1.05837846, + "balance_loss_mlp": 0.21092631, + "epoch": 0.7105967232827296, + "flos": 19354020508800.0, + "grad_norm": 14.665763364046901, + "language_loss": 0.82658303, + "learning_rate": 8.159699804924709e-07, + "loss": 0.84165108, + "num_input_tokens_seen": 254932140, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.24206543, + "step": 11819, + "time_per_iteration": 2.6298696994781494 + }, + { + "auxiliary_loss_clip": 0.01307785, + "auxiliary_loss_mlp": 0.00249537, + "balance_loss_clip": 1.08462465, + "balance_loss_mlp": 0.22425261, + "epoch": 0.7106568465353975, + "flos": 22930400010240.0, + "grad_norm": 45.92688921509662, + "language_loss": 0.78506815, + "learning_rate": 8.156561252835883e-07, + "loss": 0.80064142, + "num_input_tokens_seen": 254951580, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.25305176, + "step": 11820, + "time_per_iteration": 2.6420037746429443 + }, + { + "auxiliary_loss_clip": 0.01298431, + "auxiliary_loss_mlp": 0.00273271, + "balance_loss_clip": 1.07215142, + "balance_loss_mlp": 0.24548347, + "epoch": 0.7107169697880655, + "flos": 19099665325440.0, + "grad_norm": 5.737542674377961, + "language_loss": 0.85577929, + "learning_rate": 8.153423149851449e-07, + "loss": 0.87149632, + "num_input_tokens_seen": 254969425, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.27807617, + "step": 11821, + "time_per_iteration": 2.6810569763183594 + }, + { + "auxiliary_loss_clip": 0.01137292, + "auxiliary_loss_mlp": 0.00161742, + "balance_loss_clip": 0.98758823, + "balance_loss_mlp": 0.15363584, + "epoch": 0.7107770930407336, + "flos": 63638054231040.0, + "grad_norm": 0.7423283992107796, + "language_loss": 0.54645759, + "learning_rate": 8.150285496090388e-07, + "loss": 0.55944794, + "num_input_tokens_seen": 255032680, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.08105469, + "step": 11822, + "time_per_iteration": 3.195051670074463 + }, + { + "auxiliary_loss_clip": 0.01265238, + "auxiliary_loss_mlp": 0.00224653, + "balance_loss_clip": 1.0574038, + "balance_loss_mlp": 0.20075171, + "epoch": 0.7108372162934015, + "flos": 22054466949120.0, + "grad_norm": 14.307011450028615, + "language_loss": 0.69920397, + "learning_rate": 8.147148291671688e-07, + "loss": 0.71410286, + "num_input_tokens_seen": 255054400, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.23876953, + "step": 11823, + "time_per_iteration": 2.7069122791290283 + }, + { + "auxiliary_loss_clip": 0.01280458, + "auxiliary_loss_mlp": 0.00255443, + "balance_loss_clip": 1.06215882, + "balance_loss_mlp": 0.22963434, + "epoch": 0.7108973395460695, + "flos": 19135144984320.0, + "grad_norm": 64.92627333505916, + "language_loss": 0.80971861, + "learning_rate": 8.144011536714322e-07, + "loss": 0.82507759, + "num_input_tokens_seen": 255072785, + "router_z_loss_clip": 2.18261719, + "router_z_loss_mlp": 0.25830078, + "step": 11824, + "time_per_iteration": 2.744117498397827 + }, + { + "auxiliary_loss_clip": 0.01270469, + "auxiliary_loss_mlp": 0.00241619, + "balance_loss_clip": 1.05627465, + "balance_loss_mlp": 0.21831334, + "epoch": 0.7109574627987374, + "flos": 17894431353600.0, + "grad_norm": 4.78040171196326, + "language_loss": 0.78983933, + "learning_rate": 8.140875231337223e-07, + "loss": 0.80496019, + "num_input_tokens_seen": 255091820, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.23278809, + "step": 11825, + "time_per_iteration": 2.6786391735076904 + }, + { + "auxiliary_loss_clip": 0.0129474, + "auxiliary_loss_mlp": 0.00223079, + "balance_loss_clip": 1.07213485, + "balance_loss_mlp": 0.19743708, + "epoch": 0.7110175860514054, + "flos": 28979623422720.0, + "grad_norm": 4.831099879472472, + "language_loss": 0.8573302, + "learning_rate": 8.137739375659321e-07, + "loss": 0.87250841, + "num_input_tokens_seen": 255111720, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.25646973, + "step": 11826, + "time_per_iteration": 2.7674899101257324 + }, + { + "auxiliary_loss_clip": 0.01286027, + "auxiliary_loss_mlp": 0.0026403, + "balance_loss_clip": 1.06656241, + "balance_loss_mlp": 0.23754169, + "epoch": 0.7110777093040733, + "flos": 26173312623360.0, + "grad_norm": 19.56752080799484, + "language_loss": 0.88714719, + "learning_rate": 8.134603969799527e-07, + "loss": 0.90264773, + "num_input_tokens_seen": 255133495, + "router_z_loss_clip": 2.19238281, + "router_z_loss_mlp": 0.26501465, + "step": 11827, + "time_per_iteration": 2.7827465534210205 + }, + { + "auxiliary_loss_clip": 0.01281431, + "auxiliary_loss_mlp": 0.00245756, + "balance_loss_clip": 1.06237221, + "balance_loss_mlp": 0.21998283, + "epoch": 0.7111378325567413, + "flos": 26869943969280.0, + "grad_norm": 3.525313372649869, + "language_loss": 0.69932628, + "learning_rate": 8.131469013876748e-07, + "loss": 0.71459806, + "num_input_tokens_seen": 255156880, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.2578125, + "step": 11828, + "time_per_iteration": 2.720226287841797 + }, + { + "auxiliary_loss_clip": 0.01280886, + "auxiliary_loss_mlp": 0.00239231, + "balance_loss_clip": 1.06122541, + "balance_loss_mlp": 0.21403056, + "epoch": 0.7111979558094093, + "flos": 27271820309760.0, + "grad_norm": 2.7471545017884225, + "language_loss": 0.79053861, + "learning_rate": 8.128334508009846e-07, + "loss": 0.80573976, + "num_input_tokens_seen": 255178920, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.25195312, + "step": 11829, + "time_per_iteration": 2.693220615386963 + }, + { + "auxiliary_loss_clip": 0.01275794, + "auxiliary_loss_mlp": 0.00246234, + "balance_loss_clip": 1.05893147, + "balance_loss_mlp": 0.2209495, + "epoch": 0.7112580790620773, + "flos": 25046938961280.0, + "grad_norm": 10.010772882863414, + "language_loss": 0.88176221, + "learning_rate": 8.125200452317697e-07, + "loss": 0.89698255, + "num_input_tokens_seen": 255198095, + "router_z_loss_clip": 2.17089844, + "router_z_loss_mlp": 0.25268555, + "step": 11830, + "time_per_iteration": 2.708855628967285 + }, + { + "auxiliary_loss_clip": 0.01279903, + "auxiliary_loss_mlp": 0.00232115, + "balance_loss_clip": 1.06499267, + "balance_loss_mlp": 0.2069979, + "epoch": 0.7113182023147452, + "flos": 21646628951040.0, + "grad_norm": 24.018989400200926, + "language_loss": 0.90403134, + "learning_rate": 8.122066846919138e-07, + "loss": 0.91915154, + "num_input_tokens_seen": 255215860, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.2512207, + "step": 11831, + "time_per_iteration": 2.6565804481506348 + }, + { + "auxiliary_loss_clip": 0.01257268, + "auxiliary_loss_mlp": 0.0020592, + "balance_loss_clip": 1.04293799, + "balance_loss_mlp": 0.18277003, + "epoch": 0.7113783255674132, + "flos": 20996287257600.0, + "grad_norm": 34.87712364751467, + "language_loss": 0.84586287, + "learning_rate": 8.118933691932985e-07, + "loss": 0.86049473, + "num_input_tokens_seen": 255235425, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.23156738, + "step": 11832, + "time_per_iteration": 2.6915061473846436 + }, + { + "auxiliary_loss_clip": 0.01153801, + "auxiliary_loss_mlp": 0.00116108, + "balance_loss_clip": 1.00534821, + "balance_loss_mlp": 0.10890808, + "epoch": 0.7114384488200811, + "flos": 66771080161920.0, + "grad_norm": 0.7446925512026675, + "language_loss": 0.56104136, + "learning_rate": 8.115800987478059e-07, + "loss": 0.57374048, + "num_input_tokens_seen": 255291680, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.07177734, + "step": 11833, + "time_per_iteration": 3.0766937732696533 + }, + { + "auxiliary_loss_clip": 0.01261284, + "auxiliary_loss_mlp": 0.00242031, + "balance_loss_clip": 1.04798818, + "balance_loss_mlp": 0.21725938, + "epoch": 0.7114985720727491, + "flos": 25010058672000.0, + "grad_norm": 164.59082177685957, + "language_loss": 0.78650331, + "learning_rate": 8.11266873367315e-07, + "loss": 0.80153644, + "num_input_tokens_seen": 255313880, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.24768066, + "step": 11834, + "time_per_iteration": 2.7964184284210205 + }, + { + "auxiliary_loss_clip": 0.01293159, + "auxiliary_loss_mlp": 0.00242862, + "balance_loss_clip": 1.06641793, + "balance_loss_mlp": 0.21741037, + "epoch": 0.7115586953254172, + "flos": 21470128496640.0, + "grad_norm": 127.5517436536806, + "language_loss": 0.87514263, + "learning_rate": 8.10953693063704e-07, + "loss": 0.89050281, + "num_input_tokens_seen": 255332390, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.25439453, + "step": 11835, + "time_per_iteration": 2.689601421356201 + }, + { + "auxiliary_loss_clip": 0.01259749, + "auxiliary_loss_mlp": 0.00235548, + "balance_loss_clip": 1.04818177, + "balance_loss_mlp": 0.21060941, + "epoch": 0.7116188185780851, + "flos": 28622600190720.0, + "grad_norm": 29.92542143175671, + "language_loss": 0.83210486, + "learning_rate": 8.10640557848848e-07, + "loss": 0.84705776, + "num_input_tokens_seen": 255354025, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.24963379, + "step": 11836, + "time_per_iteration": 4.166890859603882 + }, + { + "auxiliary_loss_clip": 0.01245875, + "auxiliary_loss_mlp": 0.00207909, + "balance_loss_clip": 1.03423977, + "balance_loss_mlp": 0.18475816, + "epoch": 0.7116789418307531, + "flos": 25293608634240.0, + "grad_norm": 1547.0627220984154, + "language_loss": 0.7631861, + "learning_rate": 8.103274677346208e-07, + "loss": 0.77772391, + "num_input_tokens_seen": 255371400, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.23168945, + "step": 11837, + "time_per_iteration": 2.713808059692383 + }, + { + "auxiliary_loss_clip": 0.01312325, + "auxiliary_loss_mlp": 0.00277977, + "balance_loss_clip": 1.0821085, + "balance_loss_mlp": 0.25020093, + "epoch": 0.711739065083421, + "flos": 25557301353600.0, + "grad_norm": 73.97627968694856, + "language_loss": 0.70998228, + "learning_rate": 8.100144227328958e-07, + "loss": 0.72588527, + "num_input_tokens_seen": 255390710, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.27758789, + "step": 11838, + "time_per_iteration": 4.191413879394531 + }, + { + "auxiliary_loss_clip": 0.01272934, + "auxiliary_loss_mlp": 0.00273094, + "balance_loss_clip": 1.0595963, + "balance_loss_mlp": 0.24794063, + "epoch": 0.711799188336089, + "flos": 26140993361280.0, + "grad_norm": 125.02997867215538, + "language_loss": 0.75171351, + "learning_rate": 8.097014228555426e-07, + "loss": 0.76717383, + "num_input_tokens_seen": 255408790, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.25158691, + "step": 11839, + "time_per_iteration": 2.6903140544891357 + }, + { + "auxiliary_loss_clip": 0.01279361, + "auxiliary_loss_mlp": 0.00249951, + "balance_loss_clip": 1.06308126, + "balance_loss_mlp": 0.22541822, + "epoch": 0.7118593115887569, + "flos": 21140648017920.0, + "grad_norm": 19.006471311540068, + "language_loss": 0.92159569, + "learning_rate": 8.093884681144305e-07, + "loss": 0.93688881, + "num_input_tokens_seen": 255426280, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.24536133, + "step": 11840, + "time_per_iteration": 4.078336477279663 + }, + { + "auxiliary_loss_clip": 0.01290042, + "auxiliary_loss_mlp": 0.00250374, + "balance_loss_clip": 1.06446135, + "balance_loss_mlp": 0.22336107, + "epoch": 0.711919434841425, + "flos": 14975684006400.0, + "grad_norm": 7.295192867157736, + "language_loss": 0.86120081, + "learning_rate": 8.090755585214277e-07, + "loss": 0.87660497, + "num_input_tokens_seen": 255442935, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.27038574, + "step": 11841, + "time_per_iteration": 2.6544392108917236 + }, + { + "auxiliary_loss_clip": 0.01271654, + "auxiliary_loss_mlp": 0.00244951, + "balance_loss_clip": 1.05924404, + "balance_loss_mlp": 0.22000009, + "epoch": 0.7119795580940929, + "flos": 16508997826560.0, + "grad_norm": 5.659541085534239, + "language_loss": 0.82240331, + "learning_rate": 8.087626940883994e-07, + "loss": 0.8375693, + "num_input_tokens_seen": 255460925, + "router_z_loss_clip": 2.12207031, + "router_z_loss_mlp": 0.24938965, + "step": 11842, + "time_per_iteration": 2.693629503250122 + }, + { + "auxiliary_loss_clip": 0.01148192, + "auxiliary_loss_mlp": 0.00167125, + "balance_loss_clip": 0.99981904, + "balance_loss_mlp": 0.15739775, + "epoch": 0.7120396813467609, + "flos": 66570736055040.0, + "grad_norm": 0.7759872523398607, + "language_loss": 0.61048806, + "learning_rate": 8.084498748272082e-07, + "loss": 0.62364125, + "num_input_tokens_seen": 255521360, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.09716797, + "step": 11843, + "time_per_iteration": 3.1240131855010986 + }, + { + "auxiliary_loss_clip": 0.0126324, + "auxiliary_loss_mlp": 0.00237256, + "balance_loss_clip": 1.05098021, + "balance_loss_mlp": 0.21219784, + "epoch": 0.7120998045994288, + "flos": 26432731624320.0, + "grad_norm": 222.66528375189867, + "language_loss": 0.8803097, + "learning_rate": 8.081371007497171e-07, + "loss": 0.89531469, + "num_input_tokens_seen": 255541435, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.25061035, + "step": 11844, + "time_per_iteration": 2.686566114425659 + }, + { + "auxiliary_loss_clip": 0.01277385, + "auxiliary_loss_mlp": 0.00245656, + "balance_loss_clip": 1.05601549, + "balance_loss_mlp": 0.21876253, + "epoch": 0.7121599278520968, + "flos": 16427982700800.0, + "grad_norm": 15.92153091131029, + "language_loss": 0.86214697, + "learning_rate": 8.078243718677873e-07, + "loss": 0.87737745, + "num_input_tokens_seen": 255558505, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.26928711, + "step": 11845, + "time_per_iteration": 4.0865888595581055 + }, + { + "auxiliary_loss_clip": 0.01259933, + "auxiliary_loss_mlp": 0.00237445, + "balance_loss_clip": 1.04729533, + "balance_loss_mlp": 0.21158886, + "epoch": 0.7122200511047647, + "flos": 28949889939840.0, + "grad_norm": 30.46951875105981, + "language_loss": 0.85683751, + "learning_rate": 8.075116881932762e-07, + "loss": 0.87181127, + "num_input_tokens_seen": 255577815, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.25854492, + "step": 11846, + "time_per_iteration": 2.7531375885009766 + }, + { + "auxiliary_loss_clip": 0.01258073, + "auxiliary_loss_mlp": 0.00256262, + "balance_loss_clip": 1.04605448, + "balance_loss_mlp": 0.23114485, + "epoch": 0.7122801743574327, + "flos": 16471866142080.0, + "grad_norm": 28.04507569741064, + "language_loss": 0.67009115, + "learning_rate": 8.071990497380421e-07, + "loss": 0.68523455, + "num_input_tokens_seen": 255595885, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.25109863, + "step": 11847, + "time_per_iteration": 2.677217483520508 + }, + { + "auxiliary_loss_clip": 0.01264927, + "auxiliary_loss_mlp": 0.00241516, + "balance_loss_clip": 1.05498052, + "balance_loss_mlp": 0.21648213, + "epoch": 0.7123402976101008, + "flos": 20631039811200.0, + "grad_norm": 3.801206611753666, + "language_loss": 0.77665049, + "learning_rate": 8.068864565139395e-07, + "loss": 0.79171497, + "num_input_tokens_seen": 255616750, + "router_z_loss_clip": 2.10253906, + "router_z_loss_mlp": 0.25024414, + "step": 11848, + "time_per_iteration": 2.696023464202881 + }, + { + "auxiliary_loss_clip": 0.01141828, + "auxiliary_loss_mlp": 0.00120446, + "balance_loss_clip": 0.99276054, + "balance_loss_mlp": 0.11071836, + "epoch": 0.7124004208627687, + "flos": 62325734837760.0, + "grad_norm": 0.8172684682466103, + "language_loss": 0.61412293, + "learning_rate": 8.065739085328211e-07, + "loss": 0.62674564, + "num_input_tokens_seen": 255677900, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.09716797, + "step": 11849, + "time_per_iteration": 3.16585373878479 + }, + { + "auxiliary_loss_clip": 0.01265861, + "auxiliary_loss_mlp": 0.00232461, + "balance_loss_clip": 1.04902875, + "balance_loss_mlp": 0.2049236, + "epoch": 0.7124605441154367, + "flos": 39675975788160.0, + "grad_norm": 2.1272482297112414, + "language_loss": 0.70605141, + "learning_rate": 8.0626140580654e-07, + "loss": 0.72103465, + "num_input_tokens_seen": 255699140, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.2755127, + "step": 11850, + "time_per_iteration": 2.88264536857605 + }, + { + "auxiliary_loss_clip": 0.01277557, + "auxiliary_loss_mlp": 0.00234697, + "balance_loss_clip": 1.05865097, + "balance_loss_mlp": 0.2088283, + "epoch": 0.7125206673681046, + "flos": 28181868312960.0, + "grad_norm": 105.07111302683754, + "language_loss": 0.77226412, + "learning_rate": 8.05948948346946e-07, + "loss": 0.78738666, + "num_input_tokens_seen": 255719640, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.25854492, + "step": 11851, + "time_per_iteration": 2.7177817821502686 + }, + { + "auxiliary_loss_clip": 0.0124309, + "auxiliary_loss_mlp": 0.00219849, + "balance_loss_clip": 1.03737426, + "balance_loss_mlp": 0.19653204, + "epoch": 0.7125807906207726, + "flos": 26176939896960.0, + "grad_norm": 25.456657458017784, + "language_loss": 0.88378161, + "learning_rate": 8.056365361658882e-07, + "loss": 0.89841098, + "num_input_tokens_seen": 255740450, + "router_z_loss_clip": 2.06054688, + "router_z_loss_mlp": 0.23291016, + "step": 11852, + "time_per_iteration": 2.6728756427764893 + }, + { + "auxiliary_loss_clip": 0.01291491, + "auxiliary_loss_mlp": 0.00246187, + "balance_loss_clip": 1.06233895, + "balance_loss_mlp": 0.21652734, + "epoch": 0.7126409138734405, + "flos": 17157328358400.0, + "grad_norm": 90.70486721673967, + "language_loss": 0.84299767, + "learning_rate": 8.053241692752126e-07, + "loss": 0.85837442, + "num_input_tokens_seen": 255758070, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.29650879, + "step": 11853, + "time_per_iteration": 2.7249643802642822 + }, + { + "auxiliary_loss_clip": 0.0124291, + "auxiliary_loss_mlp": 0.00214689, + "balance_loss_clip": 1.03495979, + "balance_loss_mlp": 0.19152692, + "epoch": 0.7127010371261085, + "flos": 18769933451520.0, + "grad_norm": 131.22078910061552, + "language_loss": 1.00342584, + "learning_rate": 8.050118476867635e-07, + "loss": 1.01800179, + "num_input_tokens_seen": 255775685, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.23181152, + "step": 11854, + "time_per_iteration": 2.618648052215576 + }, + { + "auxiliary_loss_clip": 0.01239656, + "auxiliary_loss_mlp": 0.00240753, + "balance_loss_clip": 1.0360918, + "balance_loss_mlp": 0.21730411, + "epoch": 0.7127611603787765, + "flos": 20376433232640.0, + "grad_norm": 2381.655795776995, + "language_loss": 0.84391654, + "learning_rate": 8.046995714123856e-07, + "loss": 0.85872066, + "num_input_tokens_seen": 255794750, + "router_z_loss_clip": 2.03710938, + "router_z_loss_mlp": 0.23449707, + "step": 11855, + "time_per_iteration": 2.6915297508239746 + }, + { + "auxiliary_loss_clip": 0.01280084, + "auxiliary_loss_mlp": 0.00243886, + "balance_loss_clip": 1.06152654, + "balance_loss_mlp": 0.21799409, + "epoch": 0.7128212836314445, + "flos": 20449008662400.0, + "grad_norm": 13.927002641833509, + "language_loss": 0.82375711, + "learning_rate": 8.043873404639192e-07, + "loss": 0.83899677, + "num_input_tokens_seen": 255813325, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.25927734, + "step": 11856, + "time_per_iteration": 2.659921407699585 + }, + { + "auxiliary_loss_clip": 0.01277876, + "auxiliary_loss_mlp": 0.00233068, + "balance_loss_clip": 1.05628347, + "balance_loss_mlp": 0.20816565, + "epoch": 0.7128814068841124, + "flos": 23440834229760.0, + "grad_norm": 9.777537997000486, + "language_loss": 0.78051168, + "learning_rate": 8.040751548532046e-07, + "loss": 0.79562104, + "num_input_tokens_seen": 255832470, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.24926758, + "step": 11857, + "time_per_iteration": 2.7028539180755615 + }, + { + "auxiliary_loss_clip": 0.01245092, + "auxiliary_loss_mlp": 0.00230586, + "balance_loss_clip": 1.0368495, + "balance_loss_mlp": 0.20602918, + "epoch": 0.7129415301367804, + "flos": 18222942165120.0, + "grad_norm": 759.3069654840982, + "language_loss": 0.92750919, + "learning_rate": 8.03763014592081e-07, + "loss": 0.94226593, + "num_input_tokens_seen": 255849740, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.24536133, + "step": 11858, + "time_per_iteration": 2.645064353942871 + }, + { + "auxiliary_loss_clip": 0.01264737, + "auxiliary_loss_mlp": 0.00208552, + "balance_loss_clip": 1.04824293, + "balance_loss_mlp": 0.18355414, + "epoch": 0.7130016533894483, + "flos": 15523896355200.0, + "grad_norm": 5.604398731106021, + "language_loss": 0.87606448, + "learning_rate": 8.034509196923829e-07, + "loss": 0.89079738, + "num_input_tokens_seen": 255866975, + "router_z_loss_clip": 2.16503906, + "router_z_loss_mlp": 0.25, + "step": 11859, + "time_per_iteration": 2.6407277584075928 + }, + { + "auxiliary_loss_clip": 0.01254789, + "auxiliary_loss_mlp": 0.00229623, + "balance_loss_clip": 1.0444746, + "balance_loss_mlp": 0.20535222, + "epoch": 0.7130617766421163, + "flos": 57115668960000.0, + "grad_norm": 3.1704172784753943, + "language_loss": 0.74061662, + "learning_rate": 8.031388701659456e-07, + "loss": 0.75546074, + "num_input_tokens_seen": 255892915, + "router_z_loss_clip": 2.10351562, + "router_z_loss_mlp": 0.24291992, + "step": 11860, + "time_per_iteration": 2.98284912109375 + }, + { + "auxiliary_loss_clip": 0.01277054, + "auxiliary_loss_mlp": 0.00252683, + "balance_loss_clip": 1.05309129, + "balance_loss_mlp": 0.22564626, + "epoch": 0.7131218998947844, + "flos": 19788252024960.0, + "grad_norm": 81.59333778989303, + "language_loss": 0.71766853, + "learning_rate": 8.028268660246023e-07, + "loss": 0.73296589, + "num_input_tokens_seen": 255911480, + "router_z_loss_clip": 2.23925781, + "router_z_loss_mlp": 0.27026367, + "step": 11861, + "time_per_iteration": 2.662611722946167 + }, + { + "auxiliary_loss_clip": 0.01276246, + "auxiliary_loss_mlp": 0.002345, + "balance_loss_clip": 1.06038141, + "balance_loss_mlp": 0.20935857, + "epoch": 0.7131820231474523, + "flos": 26651894457600.0, + "grad_norm": 26.459816216299515, + "language_loss": 0.75748026, + "learning_rate": 8.025149072801849e-07, + "loss": 0.77258778, + "num_input_tokens_seen": 255931140, + "router_z_loss_clip": 2.16113281, + "router_z_loss_mlp": 0.25146484, + "step": 11862, + "time_per_iteration": 2.734743118286133 + }, + { + "auxiliary_loss_clip": 0.0126215, + "auxiliary_loss_mlp": 0.0021549, + "balance_loss_clip": 1.04649734, + "balance_loss_mlp": 0.19013372, + "epoch": 0.7132421464001203, + "flos": 29205609840000.0, + "grad_norm": 24.67354832644342, + "language_loss": 0.77065599, + "learning_rate": 8.022029939445214e-07, + "loss": 0.7854324, + "num_input_tokens_seen": 255951665, + "router_z_loss_clip": 2.15332031, + "router_z_loss_mlp": 0.25366211, + "step": 11863, + "time_per_iteration": 2.7892401218414307 + }, + { + "auxiliary_loss_clip": 0.01283125, + "auxiliary_loss_mlp": 0.00215533, + "balance_loss_clip": 1.06308913, + "balance_loss_mlp": 0.19070138, + "epoch": 0.7133022696527882, + "flos": 23073611535360.0, + "grad_norm": 6.014648925128437, + "language_loss": 0.74023563, + "learning_rate": 8.018911260294414e-07, + "loss": 0.7552222, + "num_input_tokens_seen": 255970055, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.24841309, + "step": 11864, + "time_per_iteration": 2.777473211288452 + }, + { + "auxiliary_loss_clip": 0.01270052, + "auxiliary_loss_mlp": 0.00246243, + "balance_loss_clip": 1.05304718, + "balance_loss_mlp": 0.22104195, + "epoch": 0.7133623929054562, + "flos": 17457111267840.0, + "grad_norm": 2.7344733741812637, + "language_loss": 0.9412626, + "learning_rate": 8.015793035467697e-07, + "loss": 0.95642555, + "num_input_tokens_seen": 255987720, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.25219727, + "step": 11865, + "time_per_iteration": 2.6319243907928467 + }, + { + "auxiliary_loss_clip": 0.0126455, + "auxiliary_loss_mlp": 0.00235705, + "balance_loss_clip": 1.04429901, + "balance_loss_mlp": 0.20951448, + "epoch": 0.7134225161581241, + "flos": 19536554448000.0, + "grad_norm": 7.1582059509409, + "language_loss": 0.83978724, + "learning_rate": 8.012675265083304e-07, + "loss": 0.85478985, + "num_input_tokens_seen": 256005490, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.26196289, + "step": 11866, + "time_per_iteration": 2.6633992195129395 + }, + { + "auxiliary_loss_clip": 0.01285911, + "auxiliary_loss_mlp": 0.0020777, + "balance_loss_clip": 1.06485641, + "balance_loss_mlp": 0.18370193, + "epoch": 0.7134826394107922, + "flos": 26250089944320.0, + "grad_norm": 16.826330433995448, + "language_loss": 0.81603765, + "learning_rate": 8.009557949259464e-07, + "loss": 0.83097452, + "num_input_tokens_seen": 256026030, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.24072266, + "step": 11867, + "time_per_iteration": 2.676706314086914 + }, + { + "auxiliary_loss_clip": 0.01246773, + "auxiliary_loss_mlp": 0.00228992, + "balance_loss_clip": 1.03675795, + "balance_loss_mlp": 0.20344529, + "epoch": 0.7135427626634601, + "flos": 15815311395840.0, + "grad_norm": 14.918312061975374, + "language_loss": 0.78725874, + "learning_rate": 8.006441088114397e-07, + "loss": 0.80201638, + "num_input_tokens_seen": 256043680, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.25561523, + "step": 11868, + "time_per_iteration": 2.69899845123291 + }, + { + "auxiliary_loss_clip": 0.01275725, + "auxiliary_loss_mlp": 0.00231874, + "balance_loss_clip": 1.05612624, + "balance_loss_mlp": 0.20381194, + "epoch": 0.7136028859161281, + "flos": 18223409041920.0, + "grad_norm": 2.830523440944727, + "language_loss": 0.74946964, + "learning_rate": 8.003324681766286e-07, + "loss": 0.76454568, + "num_input_tokens_seen": 256059705, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.28063965, + "step": 11869, + "time_per_iteration": 2.631502151489258 + }, + { + "auxiliary_loss_clip": 0.01268007, + "auxiliary_loss_mlp": 0.00242037, + "balance_loss_clip": 1.04881299, + "balance_loss_mlp": 0.21595421, + "epoch": 0.713663009168796, + "flos": 24314827956480.0, + "grad_norm": 29.211077191918694, + "language_loss": 0.83624315, + "learning_rate": 8.000208730333298e-07, + "loss": 0.85134363, + "num_input_tokens_seen": 256079785, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.26098633, + "step": 11870, + "time_per_iteration": 2.7220349311828613 + }, + { + "auxiliary_loss_clip": 0.0125619, + "auxiliary_loss_mlp": 0.0021277, + "balance_loss_clip": 1.04481411, + "balance_loss_mlp": 0.18748602, + "epoch": 0.713723132421464, + "flos": 26538488242560.0, + "grad_norm": 5.558874879147666, + "language_loss": 0.87984353, + "learning_rate": 7.997093233933597e-07, + "loss": 0.89453316, + "num_input_tokens_seen": 256099000, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.25292969, + "step": 11871, + "time_per_iteration": 2.6989307403564453 + }, + { + "auxiliary_loss_clip": 0.01275654, + "auxiliary_loss_mlp": 0.00257414, + "balance_loss_clip": 1.05822229, + "balance_loss_mlp": 0.22878017, + "epoch": 0.7137832556741319, + "flos": 19865675790720.0, + "grad_norm": 2.1061822791508513, + "language_loss": 0.86417615, + "learning_rate": 7.993978192685331e-07, + "loss": 0.87950689, + "num_input_tokens_seen": 256117985, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.28625488, + "step": 11872, + "time_per_iteration": 2.751570701599121 + }, + { + "auxiliary_loss_clip": 0.01277149, + "auxiliary_loss_mlp": 0.00234379, + "balance_loss_clip": 1.05957198, + "balance_loss_mlp": 0.20700818, + "epoch": 0.7138433789267999, + "flos": 21688932193920.0, + "grad_norm": 30.43630265853651, + "language_loss": 0.94419801, + "learning_rate": 7.990863606706606e-07, + "loss": 0.95931333, + "num_input_tokens_seen": 256134350, + "router_z_loss_clip": 2.17871094, + "router_z_loss_mlp": 0.27392578, + "step": 11873, + "time_per_iteration": 2.727137804031372 + }, + { + "auxiliary_loss_clip": 0.01247021, + "auxiliary_loss_mlp": 0.00203735, + "balance_loss_clip": 1.03883553, + "balance_loss_mlp": 0.17996505, + "epoch": 0.713903502179468, + "flos": 17602729004160.0, + "grad_norm": 14.943641467380465, + "language_loss": 0.94634032, + "learning_rate": 7.987749476115539e-07, + "loss": 0.96084791, + "num_input_tokens_seen": 256150610, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.23742676, + "step": 11874, + "time_per_iteration": 2.625796318054199 + }, + { + "auxiliary_loss_clip": 0.01274303, + "auxiliary_loss_mlp": 0.0020825, + "balance_loss_clip": 1.05546188, + "balance_loss_mlp": 0.18186875, + "epoch": 0.7139636254321359, + "flos": 18040336398720.0, + "grad_norm": 8.930798948928826, + "language_loss": 0.92674279, + "learning_rate": 7.984635801030228e-07, + "loss": 0.94156832, + "num_input_tokens_seen": 256168620, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.26367188, + "step": 11875, + "time_per_iteration": 2.6625149250030518 + }, + { + "auxiliary_loss_clip": 0.01271912, + "auxiliary_loss_mlp": 0.00248202, + "balance_loss_clip": 1.05048513, + "balance_loss_mlp": 0.2198181, + "epoch": 0.7140237486848039, + "flos": 23331127115520.0, + "grad_norm": 10.808462844116736, + "language_loss": 0.79541701, + "learning_rate": 7.981522581568721e-07, + "loss": 0.81061816, + "num_input_tokens_seen": 256186700, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.28393555, + "step": 11876, + "time_per_iteration": 2.6693646907806396 + }, + { + "auxiliary_loss_clip": 0.01257502, + "auxiliary_loss_mlp": 0.0023419, + "balance_loss_clip": 1.04416847, + "balance_loss_mlp": 0.20772564, + "epoch": 0.7140838719374718, + "flos": 16837077674880.0, + "grad_norm": 19.283066876119175, + "language_loss": 0.86992192, + "learning_rate": 7.978409817849079e-07, + "loss": 0.88483882, + "num_input_tokens_seen": 256205390, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.26501465, + "step": 11877, + "time_per_iteration": 2.6685664653778076 + }, + { + "auxiliary_loss_clip": 0.01259093, + "auxiliary_loss_mlp": 0.00234487, + "balance_loss_clip": 1.04267073, + "balance_loss_mlp": 0.2093934, + "epoch": 0.7141439951901398, + "flos": 21142012734720.0, + "grad_norm": 13.292210243408844, + "language_loss": 0.77064592, + "learning_rate": 7.97529750998934e-07, + "loss": 0.78558171, + "num_input_tokens_seen": 256224575, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.25097656, + "step": 11878, + "time_per_iteration": 4.146072864532471 + }, + { + "auxiliary_loss_clip": 0.01260961, + "auxiliary_loss_mlp": 0.00208266, + "balance_loss_clip": 1.04362762, + "balance_loss_mlp": 0.18388772, + "epoch": 0.7142041184428077, + "flos": 24717709877760.0, + "grad_norm": 17.025928246028684, + "language_loss": 0.76790094, + "learning_rate": 7.972185658107535e-07, + "loss": 0.78259313, + "num_input_tokens_seen": 256242130, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.24377441, + "step": 11879, + "time_per_iteration": 2.6682848930358887 + }, + { + "auxiliary_loss_clip": 0.01267764, + "auxiliary_loss_mlp": 0.00221591, + "balance_loss_clip": 1.04848838, + "balance_loss_mlp": 0.19516227, + "epoch": 0.7142642416954758, + "flos": 21908202768000.0, + "grad_norm": 8.38540381463372, + "language_loss": 0.78211689, + "learning_rate": 7.969074262321646e-07, + "loss": 0.79701042, + "num_input_tokens_seen": 256261920, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.26416016, + "step": 11880, + "time_per_iteration": 4.1915342807769775 + }, + { + "auxiliary_loss_clip": 0.01277478, + "auxiliary_loss_mlp": 0.00221945, + "balance_loss_clip": 1.05591345, + "balance_loss_mlp": 0.19620734, + "epoch": 0.7143243649481437, + "flos": 20805636844800.0, + "grad_norm": 35.965645444652736, + "language_loss": 0.90640676, + "learning_rate": 7.965963322749674e-07, + "loss": 0.92140102, + "num_input_tokens_seen": 256277970, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.25744629, + "step": 11881, + "time_per_iteration": 2.6142094135284424 + }, + { + "auxiliary_loss_clip": 0.01254184, + "auxiliary_loss_mlp": 0.00204132, + "balance_loss_clip": 1.04209733, + "balance_loss_mlp": 0.17914605, + "epoch": 0.7143844882008117, + "flos": 27235011847680.0, + "grad_norm": 9.725998550650056, + "language_loss": 0.70498842, + "learning_rate": 7.962852839509579e-07, + "loss": 0.71957159, + "num_input_tokens_seen": 256298205, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.24987793, + "step": 11882, + "time_per_iteration": 4.116163492202759 + }, + { + "auxiliary_loss_clip": 0.01264404, + "auxiliary_loss_mlp": 0.00203547, + "balance_loss_clip": 1.04254198, + "balance_loss_mlp": 0.17666522, + "epoch": 0.7144446114534796, + "flos": 17929623703680.0, + "grad_norm": 8.084659908894222, + "language_loss": 0.78842235, + "learning_rate": 7.959742812719304e-07, + "loss": 0.8031019, + "num_input_tokens_seen": 256316685, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.26867676, + "step": 11883, + "time_per_iteration": 2.657250165939331 + }, + { + "auxiliary_loss_clip": 0.01266311, + "auxiliary_loss_mlp": 0.00220972, + "balance_loss_clip": 1.05439281, + "balance_loss_mlp": 0.19772655, + "epoch": 0.7145047347061476, + "flos": 20740962407040.0, + "grad_norm": 18.546685640048075, + "language_loss": 0.85822642, + "learning_rate": 7.956633242496788e-07, + "loss": 0.87309921, + "num_input_tokens_seen": 256334205, + "router_z_loss_clip": 2.12402344, + "router_z_loss_mlp": 0.23242188, + "step": 11884, + "time_per_iteration": 2.65487003326416 + }, + { + "auxiliary_loss_clip": 0.01284388, + "auxiliary_loss_mlp": 0.00235088, + "balance_loss_clip": 1.05765331, + "balance_loss_mlp": 0.20615613, + "epoch": 0.7145648579588155, + "flos": 21178605715200.0, + "grad_norm": 19.636169756243543, + "language_loss": 0.84497678, + "learning_rate": 7.953524128959954e-07, + "loss": 0.86017156, + "num_input_tokens_seen": 256353340, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.28918457, + "step": 11885, + "time_per_iteration": 2.72526478767395 + }, + { + "auxiliary_loss_clip": 0.01150098, + "auxiliary_loss_mlp": 0.00055123, + "balance_loss_clip": 1.00522554, + "balance_loss_mlp": 0.04754145, + "epoch": 0.7146249812114835, + "flos": 64784539509120.0, + "grad_norm": 0.8680475333707608, + "language_loss": 0.65902877, + "learning_rate": 7.95041547222669e-07, + "loss": 0.67108095, + "num_input_tokens_seen": 256411550, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.07568359, + "step": 11886, + "time_per_iteration": 3.123544454574585 + }, + { + "auxiliary_loss_clip": 0.01248985, + "auxiliary_loss_mlp": 0.00204516, + "balance_loss_clip": 1.03756428, + "balance_loss_mlp": 0.17969659, + "epoch": 0.7146851044641516, + "flos": 18113881495680.0, + "grad_norm": 213.60631635354207, + "language_loss": 0.84467685, + "learning_rate": 7.947307272414874e-07, + "loss": 0.85921186, + "num_input_tokens_seen": 256430360, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.2479248, + "step": 11887, + "time_per_iteration": 4.177378177642822 + }, + { + "auxiliary_loss_clip": 0.01254731, + "auxiliary_loss_mlp": 0.00223374, + "balance_loss_clip": 1.0429852, + "balance_loss_mlp": 0.19816151, + "epoch": 0.7147452277168195, + "flos": 19243846517760.0, + "grad_norm": 232.52600638433896, + "language_loss": 0.78150272, + "learning_rate": 7.944199529642372e-07, + "loss": 0.79628378, + "num_input_tokens_seen": 256449750, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.25183105, + "step": 11888, + "time_per_iteration": 2.7120697498321533 + }, + { + "auxiliary_loss_clip": 0.01242683, + "auxiliary_loss_mlp": 0.0021817, + "balance_loss_clip": 1.03116477, + "balance_loss_mlp": 0.19269545, + "epoch": 0.7148053509694875, + "flos": 23764712186880.0, + "grad_norm": 7.94569194090063, + "language_loss": 0.92294937, + "learning_rate": 7.941092244027041e-07, + "loss": 0.93755794, + "num_input_tokens_seen": 256467330, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.25500488, + "step": 11889, + "time_per_iteration": 2.654223918914795 + }, + { + "auxiliary_loss_clip": 0.01260051, + "auxiliary_loss_mlp": 0.00208632, + "balance_loss_clip": 1.04492295, + "balance_loss_mlp": 0.18211968, + "epoch": 0.7148654742221554, + "flos": 22485322586880.0, + "grad_norm": 5.443867285171067, + "language_loss": 0.8477695, + "learning_rate": 7.937985415686695e-07, + "loss": 0.86245638, + "num_input_tokens_seen": 256485705, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.26550293, + "step": 11890, + "time_per_iteration": 2.700242757797241 + }, + { + "auxiliary_loss_clip": 0.01252159, + "auxiliary_loss_mlp": 0.00193136, + "balance_loss_clip": 1.04245806, + "balance_loss_mlp": 0.1702601, + "epoch": 0.7149255974748234, + "flos": 24679213476480.0, + "grad_norm": 8.46228822807439, + "language_loss": 0.81110811, + "learning_rate": 7.934879044739147e-07, + "loss": 0.82556105, + "num_input_tokens_seen": 256504755, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.22875977, + "step": 11891, + "time_per_iteration": 2.7379469871520996 + }, + { + "auxiliary_loss_clip": 0.01266126, + "auxiliary_loss_mlp": 0.00237414, + "balance_loss_clip": 1.04849863, + "balance_loss_mlp": 0.21210569, + "epoch": 0.7149857207274913, + "flos": 18405583845120.0, + "grad_norm": 6.628701029653289, + "language_loss": 0.77453387, + "learning_rate": 7.931773131302211e-07, + "loss": 0.78956926, + "num_input_tokens_seen": 256523670, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.2532959, + "step": 11892, + "time_per_iteration": 2.6510062217712402 + }, + { + "auxiliary_loss_clip": 0.01273863, + "auxiliary_loss_mlp": 0.00235197, + "balance_loss_clip": 1.05498338, + "balance_loss_mlp": 0.20723069, + "epoch": 0.7150458439801594, + "flos": 24969515195520.0, + "grad_norm": 744.2274721602425, + "language_loss": 0.80616105, + "learning_rate": 7.928667675493632e-07, + "loss": 0.82125163, + "num_input_tokens_seen": 256542225, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.27954102, + "step": 11893, + "time_per_iteration": 2.7355520725250244 + }, + { + "auxiliary_loss_clip": 0.01273695, + "auxiliary_loss_mlp": 0.0022779, + "balance_loss_clip": 1.05404377, + "balance_loss_mlp": 0.2012538, + "epoch": 0.7151059672328273, + "flos": 16690777580160.0, + "grad_norm": 22.689845751680505, + "language_loss": 0.79345465, + "learning_rate": 7.925562677431185e-07, + "loss": 0.80846953, + "num_input_tokens_seen": 256560730, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.26550293, + "step": 11894, + "time_per_iteration": 2.6429600715637207 + }, + { + "auxiliary_loss_clip": 0.01264935, + "auxiliary_loss_mlp": 0.00208926, + "balance_loss_clip": 1.04521501, + "balance_loss_mlp": 0.18365389, + "epoch": 0.7151660904854953, + "flos": 27271820309760.0, + "grad_norm": 14.784022717953684, + "language_loss": 0.84911275, + "learning_rate": 7.922458137232613e-07, + "loss": 0.86385137, + "num_input_tokens_seen": 256580505, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.25256348, + "step": 11895, + "time_per_iteration": 2.735880136489868 + }, + { + "auxiliary_loss_clip": 0.01273099, + "auxiliary_loss_mlp": 0.0023551, + "balance_loss_clip": 1.05164599, + "balance_loss_mlp": 0.20766284, + "epoch": 0.7152262137381632, + "flos": 18332254229760.0, + "grad_norm": 10.662360226954874, + "language_loss": 0.7740047, + "learning_rate": 7.919354055015643e-07, + "loss": 0.78909075, + "num_input_tokens_seen": 256597330, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.27844238, + "step": 11896, + "time_per_iteration": 2.620870351791382 + }, + { + "auxiliary_loss_clip": 0.01258522, + "auxiliary_loss_mlp": 0.00228478, + "balance_loss_clip": 1.04067671, + "balance_loss_mlp": 0.20018944, + "epoch": 0.7152863369908312, + "flos": 21799285752960.0, + "grad_norm": 49.794996365429995, + "language_loss": 0.94091928, + "learning_rate": 7.91625043089798e-07, + "loss": 0.95578933, + "num_input_tokens_seen": 256616030, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.28295898, + "step": 11897, + "time_per_iteration": 2.6797690391540527 + }, + { + "auxiliary_loss_clip": 0.01268442, + "auxiliary_loss_mlp": 0.00211907, + "balance_loss_clip": 1.05356431, + "balance_loss_mlp": 0.18681353, + "epoch": 0.7153464602434991, + "flos": 22158427887360.0, + "grad_norm": 4.758699032591646, + "language_loss": 0.86554211, + "learning_rate": 7.913147264997304e-07, + "loss": 0.88034558, + "num_input_tokens_seen": 256635570, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.25085449, + "step": 11898, + "time_per_iteration": 2.6576380729675293 + }, + { + "auxiliary_loss_clip": 0.01268806, + "auxiliary_loss_mlp": 0.00202047, + "balance_loss_clip": 1.04604244, + "balance_loss_mlp": 0.17552334, + "epoch": 0.7154065834961671, + "flos": 24716057852160.0, + "grad_norm": 60.35270207578106, + "language_loss": 0.83088797, + "learning_rate": 7.910044557431302e-07, + "loss": 0.84559643, + "num_input_tokens_seen": 256655290, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.26513672, + "step": 11899, + "time_per_iteration": 2.722740888595581 + }, + { + "auxiliary_loss_clip": 0.01255993, + "auxiliary_loss_mlp": 0.00226396, + "balance_loss_clip": 1.04568672, + "balance_loss_mlp": 0.20231603, + "epoch": 0.7154667067488351, + "flos": 22601494149120.0, + "grad_norm": 14.633255648291215, + "language_loss": 0.84605372, + "learning_rate": 7.906942308317614e-07, + "loss": 0.86087757, + "num_input_tokens_seen": 256671605, + "router_z_loss_clip": 2.10058594, + "router_z_loss_mlp": 0.24084473, + "step": 11900, + "time_per_iteration": 2.77054500579834 + }, + { + "auxiliary_loss_clip": 0.01252181, + "auxiliary_loss_mlp": 0.00208875, + "balance_loss_clip": 1.04094946, + "balance_loss_mlp": 0.18455622, + "epoch": 0.7155268300015031, + "flos": 18771154513920.0, + "grad_norm": 45.949619770470285, + "language_loss": 0.90657091, + "learning_rate": 7.903840517773886e-07, + "loss": 0.92118144, + "num_input_tokens_seen": 256689680, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.24316406, + "step": 11901, + "time_per_iteration": 2.7028958797454834 + }, + { + "auxiliary_loss_clip": 0.01278047, + "auxiliary_loss_mlp": 0.00240053, + "balance_loss_clip": 1.05471039, + "balance_loss_mlp": 0.21195579, + "epoch": 0.7155869532541711, + "flos": 18296343607680.0, + "grad_norm": 2.6095953929097346, + "language_loss": 0.89547253, + "learning_rate": 7.900739185917744e-07, + "loss": 0.91065347, + "num_input_tokens_seen": 256707760, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.28076172, + "step": 11902, + "time_per_iteration": 2.6463263034820557 + }, + { + "auxiliary_loss_clip": 0.01263134, + "auxiliary_loss_mlp": 0.00205042, + "balance_loss_clip": 1.04559529, + "balance_loss_mlp": 0.17863756, + "epoch": 0.715647076506839, + "flos": 11980805783040.0, + "grad_norm": 3.3729937887601893, + "language_loss": 0.78673697, + "learning_rate": 7.897638312866785e-07, + "loss": 0.80141866, + "num_input_tokens_seen": 256724150, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.26416016, + "step": 11903, + "time_per_iteration": 2.636934518814087 + }, + { + "auxiliary_loss_clip": 0.01244981, + "auxiliary_loss_mlp": 0.00236749, + "balance_loss_clip": 1.03751171, + "balance_loss_mlp": 0.21279959, + "epoch": 0.715707199759507, + "flos": 18951641377920.0, + "grad_norm": 12.666410514764433, + "language_loss": 0.80936444, + "learning_rate": 7.894537898738589e-07, + "loss": 0.82418168, + "num_input_tokens_seen": 256742780, + "router_z_loss_clip": 2.07226562, + "router_z_loss_mlp": 0.23950195, + "step": 11904, + "time_per_iteration": 2.6337482929229736 + }, + { + "auxiliary_loss_clip": 0.01264309, + "auxiliary_loss_mlp": 0.00220712, + "balance_loss_clip": 1.04385495, + "balance_loss_mlp": 0.19484398, + "epoch": 0.7157673230121749, + "flos": 15304410299520.0, + "grad_norm": 12.340101771157052, + "language_loss": 0.83252442, + "learning_rate": 7.891437943650727e-07, + "loss": 0.84737468, + "num_input_tokens_seen": 256761355, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.25854492, + "step": 11905, + "time_per_iteration": 2.6642301082611084 + }, + { + "auxiliary_loss_clip": 0.01253056, + "auxiliary_loss_mlp": 0.00239458, + "balance_loss_clip": 1.0422616, + "balance_loss_mlp": 0.21450716, + "epoch": 0.715827446264843, + "flos": 23221850964480.0, + "grad_norm": 9.676597489466346, + "language_loss": 0.87730426, + "learning_rate": 7.88833844772076e-07, + "loss": 0.89222944, + "num_input_tokens_seen": 256781335, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.24975586, + "step": 11906, + "time_per_iteration": 2.6924445629119873 + }, + { + "auxiliary_loss_clip": 0.01156371, + "auxiliary_loss_mlp": 0.00081476, + "balance_loss_clip": 1.00821066, + "balance_loss_mlp": 0.07394207, + "epoch": 0.7158875695175109, + "flos": 60975421833600.0, + "grad_norm": 0.7139877342393994, + "language_loss": 0.54484075, + "learning_rate": 7.885239411066205e-07, + "loss": 0.55721921, + "num_input_tokens_seen": 256838890, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.07519531, + "step": 11907, + "time_per_iteration": 3.0979459285736084 + }, + { + "auxiliary_loss_clip": 0.01250868, + "auxiliary_loss_mlp": 0.00220317, + "balance_loss_clip": 1.03863072, + "balance_loss_mlp": 0.19580777, + "epoch": 0.7159476927701789, + "flos": 17128780024320.0, + "grad_norm": 8.90857974602353, + "language_loss": 0.78343278, + "learning_rate": 7.882140833804593e-07, + "loss": 0.79814464, + "num_input_tokens_seen": 256858145, + "router_z_loss_clip": 2.12207031, + "router_z_loss_mlp": 0.24487305, + "step": 11908, + "time_per_iteration": 2.6786036491394043 + }, + { + "auxiliary_loss_clip": 0.01265346, + "auxiliary_loss_mlp": 0.00217189, + "balance_loss_clip": 1.04540873, + "balance_loss_mlp": 0.18826842, + "epoch": 0.7160078160228468, + "flos": 22490601886080.0, + "grad_norm": 20.550082684250587, + "language_loss": 0.79345298, + "learning_rate": 7.879042716053415e-07, + "loss": 0.80827832, + "num_input_tokens_seen": 256878545, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.2890625, + "step": 11909, + "time_per_iteration": 2.6847121715545654 + }, + { + "auxiliary_loss_clip": 0.01271979, + "auxiliary_loss_mlp": 0.00227631, + "balance_loss_clip": 1.04966998, + "balance_loss_mlp": 0.19990245, + "epoch": 0.7160679392755148, + "flos": 30590935626240.0, + "grad_norm": 26.92875375630976, + "language_loss": 0.80603832, + "learning_rate": 7.875945057930144e-07, + "loss": 0.82103443, + "num_input_tokens_seen": 256899920, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.27734375, + "step": 11910, + "time_per_iteration": 2.8244686126708984 + }, + { + "auxiliary_loss_clip": 0.01254635, + "auxiliary_loss_mlp": 0.00214491, + "balance_loss_clip": 1.04168344, + "balance_loss_mlp": 0.18907547, + "epoch": 0.7161280625281827, + "flos": 21323648833920.0, + "grad_norm": 2.8697259191627458, + "language_loss": 0.81939328, + "learning_rate": 7.872847859552251e-07, + "loss": 0.83408457, + "num_input_tokens_seen": 256918460, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.25402832, + "step": 11911, + "time_per_iteration": 2.6515073776245117 + }, + { + "auxiliary_loss_clip": 0.01279842, + "auxiliary_loss_mlp": 0.00214594, + "balance_loss_clip": 1.05246496, + "balance_loss_mlp": 0.18665171, + "epoch": 0.7161881857808508, + "flos": 61860078921600.0, + "grad_norm": 32.631607560757566, + "language_loss": 0.69410354, + "learning_rate": 7.869751121037192e-07, + "loss": 0.70904791, + "num_input_tokens_seen": 256942015, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.27941895, + "step": 11912, + "time_per_iteration": 3.08223819732666 + }, + { + "auxiliary_loss_clip": 0.01262134, + "auxiliary_loss_mlp": 0.00226588, + "balance_loss_clip": 1.04938984, + "balance_loss_mlp": 0.20218542, + "epoch": 0.7162483090335187, + "flos": 20812101292800.0, + "grad_norm": 32.39114378128236, + "language_loss": 0.86930037, + "learning_rate": 7.866654842502376e-07, + "loss": 0.88418758, + "num_input_tokens_seen": 256961065, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.24401855, + "step": 11913, + "time_per_iteration": 2.7596442699432373 + }, + { + "auxiliary_loss_clip": 0.01235702, + "auxiliary_loss_mlp": 0.00227623, + "balance_loss_clip": 1.03135467, + "balance_loss_mlp": 0.20422173, + "epoch": 0.7163084322861867, + "flos": 24097532630400.0, + "grad_norm": 4.125145605369417, + "language_loss": 0.81531912, + "learning_rate": 7.863559024065234e-07, + "loss": 0.82995236, + "num_input_tokens_seen": 256982165, + "router_z_loss_clip": 2.04394531, + "router_z_loss_mlp": 0.23425293, + "step": 11914, + "time_per_iteration": 2.753490924835205 + }, + { + "auxiliary_loss_clip": 0.01257115, + "auxiliary_loss_mlp": 0.00209209, + "balance_loss_clip": 1.0509423, + "balance_loss_mlp": 0.18489072, + "epoch": 0.7163685555388547, + "flos": 20080888128000.0, + "grad_norm": 5.132760735389259, + "language_loss": 0.8006829, + "learning_rate": 7.860463665843143e-07, + "loss": 0.81534618, + "num_input_tokens_seen": 256999825, + "router_z_loss_clip": 2.05761719, + "router_z_loss_mlp": 0.24328613, + "step": 11915, + "time_per_iteration": 2.752250909805298 + }, + { + "auxiliary_loss_clip": 0.01263253, + "auxiliary_loss_mlp": 0.00229128, + "balance_loss_clip": 1.04668188, + "balance_loss_mlp": 0.20212674, + "epoch": 0.7164286787915226, + "flos": 17456967613440.0, + "grad_norm": 98.05236979291158, + "language_loss": 0.86818063, + "learning_rate": 7.85736876795349e-07, + "loss": 0.88310438, + "num_input_tokens_seen": 257017450, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.27026367, + "step": 11916, + "time_per_iteration": 2.6448185443878174 + }, + { + "auxiliary_loss_clip": 0.01277141, + "auxiliary_loss_mlp": 0.00233216, + "balance_loss_clip": 1.06015348, + "balance_loss_mlp": 0.20634601, + "epoch": 0.7164888020441906, + "flos": 19718908819200.0, + "grad_norm": 84.34837181805183, + "language_loss": 0.76712942, + "learning_rate": 7.854274330513626e-07, + "loss": 0.78223294, + "num_input_tokens_seen": 257035465, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.26843262, + "step": 11917, + "time_per_iteration": 2.6621766090393066 + }, + { + "auxiliary_loss_clip": 0.01269178, + "auxiliary_loss_mlp": 0.00227563, + "balance_loss_clip": 1.05154824, + "balance_loss_mlp": 0.20076516, + "epoch": 0.7165489252968585, + "flos": 21470523546240.0, + "grad_norm": 449.56883407250615, + "language_loss": 0.84539956, + "learning_rate": 7.851180353640896e-07, + "loss": 0.86036694, + "num_input_tokens_seen": 257053750, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.26794434, + "step": 11918, + "time_per_iteration": 2.6377766132354736 + }, + { + "auxiliary_loss_clip": 0.0117663, + "auxiliary_loss_mlp": 0.00060356, + "balance_loss_clip": 1.0258801, + "balance_loss_mlp": 0.05301319, + "epoch": 0.7166090485495266, + "flos": 69928060464000.0, + "grad_norm": 0.6377753369933555, + "language_loss": 0.52944678, + "learning_rate": 7.848086837452639e-07, + "loss": 0.54181665, + "num_input_tokens_seen": 257121215, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.07324219, + "step": 11919, + "time_per_iteration": 3.208137035369873 + }, + { + "auxiliary_loss_clip": 0.01294544, + "auxiliary_loss_mlp": 0.00231933, + "balance_loss_clip": 1.07156801, + "balance_loss_mlp": 0.20561175, + "epoch": 0.7166691718021945, + "flos": 27343892949120.0, + "grad_norm": 10.029736608656009, + "language_loss": 0.73919046, + "learning_rate": 7.844993782066132e-07, + "loss": 0.75445521, + "num_input_tokens_seen": 257143370, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.26379395, + "step": 11920, + "time_per_iteration": 4.231455564498901 + }, + { + "auxiliary_loss_clip": 0.01267635, + "auxiliary_loss_mlp": 0.00213123, + "balance_loss_clip": 1.04934192, + "balance_loss_mlp": 0.18470398, + "epoch": 0.7167292950548625, + "flos": 30408868563840.0, + "grad_norm": 8.262721794304813, + "language_loss": 0.82358849, + "learning_rate": 7.841901187598678e-07, + "loss": 0.83839607, + "num_input_tokens_seen": 257162160, + "router_z_loss_clip": 2.18457031, + "router_z_loss_mlp": 0.28442383, + "step": 11921, + "time_per_iteration": 2.7021360397338867 + }, + { + "auxiliary_loss_clip": 0.01287914, + "auxiliary_loss_mlp": 0.00250266, + "balance_loss_clip": 1.06517196, + "balance_loss_mlp": 0.22078547, + "epoch": 0.7167894183075304, + "flos": 14571257800320.0, + "grad_norm": 33.28079654794878, + "language_loss": 0.86775792, + "learning_rate": 7.83880905416755e-07, + "loss": 0.88313973, + "num_input_tokens_seen": 257179300, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.2947998, + "step": 11922, + "time_per_iteration": 4.082406997680664 + }, + { + "auxiliary_loss_clip": 0.01191035, + "auxiliary_loss_mlp": 0.00071154, + "balance_loss_clip": 1.03875232, + "balance_loss_mlp": 0.06290495, + "epoch": 0.7168495415601984, + "flos": 64110674407680.0, + "grad_norm": 0.7511269270598219, + "language_loss": 0.54569602, + "learning_rate": 7.83571738189001e-07, + "loss": 0.5583179, + "num_input_tokens_seen": 257235470, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.08251953, + "step": 11923, + "time_per_iteration": 2.9721767902374268 + }, + { + "auxiliary_loss_clip": 0.0128907, + "auxiliary_loss_mlp": 0.00215849, + "balance_loss_clip": 1.06755817, + "balance_loss_mlp": 0.18889561, + "epoch": 0.7169096648128663, + "flos": 24681440119680.0, + "grad_norm": 7.629939222497074, + "language_loss": 0.84847987, + "learning_rate": 7.832626170883279e-07, + "loss": 0.86352903, + "num_input_tokens_seen": 257255850, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.26940918, + "step": 11924, + "time_per_iteration": 2.6907904148101807 + }, + { + "auxiliary_loss_clip": 0.01246472, + "auxiliary_loss_mlp": 0.00216752, + "balance_loss_clip": 1.03681493, + "balance_loss_mlp": 0.19248073, + "epoch": 0.7169697880655344, + "flos": 20667525050880.0, + "grad_norm": 5.477023946022965, + "language_loss": 0.77063525, + "learning_rate": 7.829535421264588e-07, + "loss": 0.78526747, + "num_input_tokens_seen": 257275425, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.24267578, + "step": 11925, + "time_per_iteration": 4.1361799240112305 + }, + { + "auxiliary_loss_clip": 0.01248798, + "auxiliary_loss_mlp": 0.00193376, + "balance_loss_clip": 1.03850234, + "balance_loss_mlp": 0.16912889, + "epoch": 0.7170299113182023, + "flos": 21032700670080.0, + "grad_norm": 10.637210698786488, + "language_loss": 0.84144413, + "learning_rate": 7.826445133151133e-07, + "loss": 0.8558659, + "num_input_tokens_seen": 257295740, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.24255371, + "step": 11926, + "time_per_iteration": 2.782789468765259 + }, + { + "auxiliary_loss_clip": 0.01268474, + "auxiliary_loss_mlp": 0.00221548, + "balance_loss_clip": 1.04856217, + "balance_loss_mlp": 0.19569129, + "epoch": 0.7170900345708703, + "flos": 22893304239360.0, + "grad_norm": 9.014022248344915, + "language_loss": 0.85278666, + "learning_rate": 7.823355306660093e-07, + "loss": 0.86768693, + "num_input_tokens_seen": 257315970, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.25866699, + "step": 11927, + "time_per_iteration": 2.692638397216797 + }, + { + "auxiliary_loss_clip": 0.01277987, + "auxiliary_loss_mlp": 0.00221798, + "balance_loss_clip": 1.05713391, + "balance_loss_mlp": 0.19125679, + "epoch": 0.7171501578235383, + "flos": 15518688883200.0, + "grad_norm": 266.91809568443483, + "language_loss": 0.78218329, + "learning_rate": 7.820265941908642e-07, + "loss": 0.79718113, + "num_input_tokens_seen": 257334230, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.30505371, + "step": 11928, + "time_per_iteration": 2.686121940612793 + }, + { + "auxiliary_loss_clip": 0.01254692, + "auxiliary_loss_mlp": 0.00212798, + "balance_loss_clip": 1.04575753, + "balance_loss_mlp": 0.1870247, + "epoch": 0.7172102810762062, + "flos": 26104292640000.0, + "grad_norm": 12.646325122161398, + "language_loss": 0.71942115, + "learning_rate": 7.817177039013931e-07, + "loss": 0.73409605, + "num_input_tokens_seen": 257352145, + "router_z_loss_clip": 2.09082031, + "router_z_loss_mlp": 0.25769043, + "step": 11929, + "time_per_iteration": 4.119197368621826 + }, + { + "auxiliary_loss_clip": 0.01277903, + "auxiliary_loss_mlp": 0.00214275, + "balance_loss_clip": 1.05448365, + "balance_loss_mlp": 0.18798964, + "epoch": 0.7172704043288742, + "flos": 21506649649920.0, + "grad_norm": 12.879138468175295, + "language_loss": 0.81176162, + "learning_rate": 7.81408859809308e-07, + "loss": 0.8266834, + "num_input_tokens_seen": 257371460, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.26306152, + "step": 11930, + "time_per_iteration": 2.7036526203155518 + }, + { + "auxiliary_loss_clip": 0.01270081, + "auxiliary_loss_mlp": 0.00197007, + "balance_loss_clip": 1.05743527, + "balance_loss_mlp": 0.17203259, + "epoch": 0.7173305275815421, + "flos": 18770939032320.0, + "grad_norm": 22.95463412717161, + "language_loss": 0.90153819, + "learning_rate": 7.811000619263219e-07, + "loss": 0.9162091, + "num_input_tokens_seen": 257390800, + "router_z_loss_clip": 2.12597656, + "router_z_loss_mlp": 0.24987793, + "step": 11931, + "time_per_iteration": 2.647714138031006 + }, + { + "auxiliary_loss_clip": 0.01250316, + "auxiliary_loss_mlp": 0.00220043, + "balance_loss_clip": 1.04033279, + "balance_loss_mlp": 0.19579619, + "epoch": 0.7173906508342102, + "flos": 16179876483840.0, + "grad_norm": 481.2374985030743, + "language_loss": 0.86100888, + "learning_rate": 7.80791310264143e-07, + "loss": 0.87571251, + "num_input_tokens_seen": 257407495, + "router_z_loss_clip": 2.10253906, + "router_z_loss_mlp": 0.24267578, + "step": 11932, + "time_per_iteration": 2.6424949169158936 + }, + { + "auxiliary_loss_clip": 0.01259525, + "auxiliary_loss_mlp": 0.00211827, + "balance_loss_clip": 1.04499197, + "balance_loss_mlp": 0.18656684, + "epoch": 0.7174507740868781, + "flos": 26613864933120.0, + "grad_norm": 13.85324772332025, + "language_loss": 0.81430089, + "learning_rate": 7.804826048344803e-07, + "loss": 0.82901442, + "num_input_tokens_seen": 257429675, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.25256348, + "step": 11933, + "time_per_iteration": 2.7740328311920166 + }, + { + "auxiliary_loss_clip": 0.01288486, + "auxiliary_loss_mlp": 0.00239329, + "balance_loss_clip": 1.0577879, + "balance_loss_mlp": 0.20938373, + "epoch": 0.7175108973395461, + "flos": 18432911116800.0, + "grad_norm": 8.959877738238635, + "language_loss": 0.84645581, + "learning_rate": 7.801739456490388e-07, + "loss": 0.86173403, + "num_input_tokens_seen": 257442765, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.29956055, + "step": 11934, + "time_per_iteration": 2.5996956825256348 + }, + { + "auxiliary_loss_clip": 0.01261466, + "auxiliary_loss_mlp": 0.00206236, + "balance_loss_clip": 1.04674792, + "balance_loss_mlp": 0.17828137, + "epoch": 0.717571020592214, + "flos": 23914962777600.0, + "grad_norm": 6.618132174380191, + "language_loss": 0.93629003, + "learning_rate": 7.798653327195237e-07, + "loss": 0.95096701, + "num_input_tokens_seen": 257459310, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.2791748, + "step": 11935, + "time_per_iteration": 2.6732680797576904 + }, + { + "auxiliary_loss_clip": 0.01272979, + "auxiliary_loss_mlp": 0.00225615, + "balance_loss_clip": 1.06047201, + "balance_loss_mlp": 0.19951968, + "epoch": 0.717631143844882, + "flos": 38256930109440.0, + "grad_norm": 8.154293989888002, + "language_loss": 0.81803894, + "learning_rate": 7.795567660576388e-07, + "loss": 0.83302486, + "num_input_tokens_seen": 257484750, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.2611084, + "step": 11936, + "time_per_iteration": 2.833138942718506 + }, + { + "auxiliary_loss_clip": 0.01191469, + "auxiliary_loss_mlp": 0.00059612, + "balance_loss_clip": 1.03945732, + "balance_loss_mlp": 0.05241149, + "epoch": 0.7176912670975499, + "flos": 65515896328320.0, + "grad_norm": 0.7550919204081958, + "language_loss": 0.54969656, + "learning_rate": 7.79248245675082e-07, + "loss": 0.56220734, + "num_input_tokens_seen": 257543110, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.07177734, + "step": 11937, + "time_per_iteration": 3.2498767375946045 + }, + { + "auxiliary_loss_clip": 0.01275128, + "auxiliary_loss_mlp": 0.00229831, + "balance_loss_clip": 1.05531311, + "balance_loss_mlp": 0.20172133, + "epoch": 0.717751390350218, + "flos": 31281066610560.0, + "grad_norm": 12.2428878961128, + "language_loss": 0.61375093, + "learning_rate": 7.789397715835542e-07, + "loss": 0.62880051, + "num_input_tokens_seen": 257567410, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.28137207, + "step": 11938, + "time_per_iteration": 2.7231028079986572 + }, + { + "auxiliary_loss_clip": 0.01260255, + "auxiliary_loss_mlp": 0.00218438, + "balance_loss_clip": 1.0467056, + "balance_loss_mlp": 0.19290309, + "epoch": 0.7178115136028859, + "flos": 19859031774720.0, + "grad_norm": 28.084082258614814, + "language_loss": 0.84142596, + "learning_rate": 7.786313437947527e-07, + "loss": 0.85621285, + "num_input_tokens_seen": 257586270, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.25549316, + "step": 11939, + "time_per_iteration": 2.6487059593200684 + }, + { + "auxiliary_loss_clip": 0.0118519, + "auxiliary_loss_mlp": 0.00077497, + "balance_loss_clip": 1.03395808, + "balance_loss_mlp": 0.0699634, + "epoch": 0.7178716368555539, + "flos": 64348655967360.0, + "grad_norm": 4.143817666274519, + "language_loss": 0.6049704, + "learning_rate": 7.783229623203738e-07, + "loss": 0.61759728, + "num_input_tokens_seen": 257647415, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.07519531, + "step": 11940, + "time_per_iteration": 3.125281572341919 + }, + { + "auxiliary_loss_clip": 0.01273334, + "auxiliary_loss_mlp": 0.00197705, + "balance_loss_clip": 1.06084824, + "balance_loss_mlp": 0.17339832, + "epoch": 0.7179317601082219, + "flos": 26762607152640.0, + "grad_norm": 3.168053533812203, + "language_loss": 0.65411437, + "learning_rate": 7.780146271721097e-07, + "loss": 0.66882479, + "num_input_tokens_seen": 257669795, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.24316406, + "step": 11941, + "time_per_iteration": 2.7014000415802 + }, + { + "auxiliary_loss_clip": 0.01274934, + "auxiliary_loss_mlp": 0.00230627, + "balance_loss_clip": 1.0620079, + "balance_loss_mlp": 0.20570049, + "epoch": 0.7179918833608898, + "flos": 23513804709120.0, + "grad_norm": 4.598870124580506, + "language_loss": 0.86272597, + "learning_rate": 7.777063383616543e-07, + "loss": 0.87778163, + "num_input_tokens_seen": 257687415, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.24938965, + "step": 11942, + "time_per_iteration": 2.6897284984588623 + }, + { + "auxiliary_loss_clip": 0.01267906, + "auxiliary_loss_mlp": 0.00211709, + "balance_loss_clip": 1.05214334, + "balance_loss_mlp": 0.18681842, + "epoch": 0.7180520066135578, + "flos": 17165588486400.0, + "grad_norm": 11.195815188512125, + "language_loss": 0.75243604, + "learning_rate": 7.773980959006968e-07, + "loss": 0.76723218, + "num_input_tokens_seen": 257706215, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.24865723, + "step": 11943, + "time_per_iteration": 2.6274774074554443 + }, + { + "auxiliary_loss_clip": 0.01272674, + "auxiliary_loss_mlp": 0.00247864, + "balance_loss_clip": 1.05353844, + "balance_loss_mlp": 0.22346207, + "epoch": 0.7181121298662257, + "flos": 17566638814080.0, + "grad_norm": 1723.7773482036296, + "language_loss": 0.85947657, + "learning_rate": 7.770898998009254e-07, + "loss": 0.87468195, + "num_input_tokens_seen": 257724740, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.24438477, + "step": 11944, + "time_per_iteration": 2.694152593612671 + }, + { + "auxiliary_loss_clip": 0.01287535, + "auxiliary_loss_mlp": 0.00223375, + "balance_loss_clip": 1.06398845, + "balance_loss_mlp": 0.19494364, + "epoch": 0.7181722531188938, + "flos": 11947660508160.0, + "grad_norm": 10.15681520764219, + "language_loss": 0.75445485, + "learning_rate": 7.767817500740277e-07, + "loss": 0.76956391, + "num_input_tokens_seen": 257742060, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.28442383, + "step": 11945, + "time_per_iteration": 2.6331703662872314 + }, + { + "auxiliary_loss_clip": 0.01167605, + "auxiliary_loss_mlp": 0.00103727, + "balance_loss_clip": 1.01610684, + "balance_loss_mlp": 0.09671738, + "epoch": 0.7182323763715617, + "flos": 65503649790720.0, + "grad_norm": 0.6835535749883511, + "language_loss": 0.50433487, + "learning_rate": 7.76473646731689e-07, + "loss": 0.51704818, + "num_input_tokens_seen": 257802250, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.0703125, + "step": 11946, + "time_per_iteration": 3.096867799758911 + }, + { + "auxiliary_loss_clip": 0.01285294, + "auxiliary_loss_mlp": 0.00214961, + "balance_loss_clip": 1.06409395, + "balance_loss_mlp": 0.18786471, + "epoch": 0.7182924996242297, + "flos": 20630932070400.0, + "grad_norm": 5.742867556199986, + "language_loss": 0.82683671, + "learning_rate": 7.761655897855925e-07, + "loss": 0.84183925, + "num_input_tokens_seen": 257821155, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.27087402, + "step": 11947, + "time_per_iteration": 2.635847806930542 + }, + { + "auxiliary_loss_clip": 0.01260844, + "auxiliary_loss_mlp": 0.00219633, + "balance_loss_clip": 1.05024827, + "balance_loss_mlp": 0.19414642, + "epoch": 0.7183526228768976, + "flos": 16216433550720.0, + "grad_norm": 7.866271554400097, + "language_loss": 0.8059808, + "learning_rate": 7.758575792474187e-07, + "loss": 0.82078552, + "num_input_tokens_seen": 257839905, + "router_z_loss_clip": 2.10351562, + "router_z_loss_mlp": 0.25512695, + "step": 11948, + "time_per_iteration": 2.6462740898132324 + }, + { + "auxiliary_loss_clip": 0.01281404, + "auxiliary_loss_mlp": 0.00254019, + "balance_loss_clip": 1.06648111, + "balance_loss_mlp": 0.22723305, + "epoch": 0.7184127461295656, + "flos": 22232655342720.0, + "grad_norm": 3.7174083021791846, + "language_loss": 0.78485847, + "learning_rate": 7.755496151288483e-07, + "loss": 0.80021274, + "num_input_tokens_seen": 257860055, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.26757812, + "step": 11949, + "time_per_iteration": 2.6439085006713867 + }, + { + "auxiliary_loss_clip": 0.01267572, + "auxiliary_loss_mlp": 0.00224553, + "balance_loss_clip": 1.04945612, + "balance_loss_mlp": 0.19847031, + "epoch": 0.7184728693822335, + "flos": 27344503480320.0, + "grad_norm": 2867.13708883593, + "language_loss": 0.85918683, + "learning_rate": 7.752416974415598e-07, + "loss": 0.87410808, + "num_input_tokens_seen": 257879315, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.26074219, + "step": 11950, + "time_per_iteration": 2.8285813331604004 + }, + { + "auxiliary_loss_clip": 0.0128229, + "auxiliary_loss_mlp": 0.0024866, + "balance_loss_clip": 1.06408191, + "balance_loss_mlp": 0.22196887, + "epoch": 0.7185329926349016, + "flos": 16508530949760.0, + "grad_norm": 17.53060593532323, + "language_loss": 0.77840233, + "learning_rate": 7.749338261972282e-07, + "loss": 0.79371184, + "num_input_tokens_seen": 257896570, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.2668457, + "step": 11951, + "time_per_iteration": 2.704913854598999 + }, + { + "auxiliary_loss_clip": 0.01281651, + "auxiliary_loss_mlp": 0.00232361, + "balance_loss_clip": 1.0582664, + "balance_loss_mlp": 0.20514566, + "epoch": 0.7185931158875695, + "flos": 23951052967680.0, + "grad_norm": 17.271431252928643, + "language_loss": 0.86208498, + "learning_rate": 7.746260014075286e-07, + "loss": 0.8772251, + "num_input_tokens_seen": 257916855, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.27209473, + "step": 11952, + "time_per_iteration": 2.7066574096679688 + }, + { + "auxiliary_loss_clip": 0.01285418, + "auxiliary_loss_mlp": 0.0023973, + "balance_loss_clip": 1.06518388, + "balance_loss_mlp": 0.21359886, + "epoch": 0.7186532391402375, + "flos": 26542007775360.0, + "grad_norm": 48.16844378588123, + "language_loss": 0.82661736, + "learning_rate": 7.743182230841352e-07, + "loss": 0.84186876, + "num_input_tokens_seen": 257937140, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.26159668, + "step": 11953, + "time_per_iteration": 2.7572388648986816 + }, + { + "auxiliary_loss_clip": 0.01292815, + "auxiliary_loss_mlp": 0.00251596, + "balance_loss_clip": 1.0655787, + "balance_loss_mlp": 0.22365382, + "epoch": 0.7187133623929055, + "flos": 22383049587840.0, + "grad_norm": 3.357789755071188, + "language_loss": 0.82308835, + "learning_rate": 7.740104912387164e-07, + "loss": 0.83853245, + "num_input_tokens_seen": 257956785, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.27966309, + "step": 11954, + "time_per_iteration": 2.6839640140533447 + }, + { + "auxiliary_loss_clip": 0.012823, + "auxiliary_loss_mlp": 0.00240461, + "balance_loss_clip": 1.06385612, + "balance_loss_mlp": 0.21376991, + "epoch": 0.7187734856455734, + "flos": 15779580341760.0, + "grad_norm": 8.247387421709513, + "language_loss": 0.81956148, + "learning_rate": 7.737028058829425e-07, + "loss": 0.83478904, + "num_input_tokens_seen": 257975455, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.26672363, + "step": 11955, + "time_per_iteration": 2.843238592147827 + }, + { + "auxiliary_loss_clip": 0.0126268, + "auxiliary_loss_mlp": 0.00236231, + "balance_loss_clip": 1.04996705, + "balance_loss_mlp": 0.21216276, + "epoch": 0.7188336088982414, + "flos": 31759612531200.0, + "grad_norm": 10.352524305193992, + "language_loss": 0.81142282, + "learning_rate": 7.733951670284817e-07, + "loss": 0.82641196, + "num_input_tokens_seen": 257996850, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.24072266, + "step": 11956, + "time_per_iteration": 2.7955563068389893 + }, + { + "auxiliary_loss_clip": 0.01287323, + "auxiliary_loss_mlp": 0.00235469, + "balance_loss_clip": 1.06312215, + "balance_loss_mlp": 0.20750269, + "epoch": 0.7188937321509093, + "flos": 21465208333440.0, + "grad_norm": 5.46743126248071, + "language_loss": 0.79256272, + "learning_rate": 7.730875746869987e-07, + "loss": 0.80779064, + "num_input_tokens_seen": 258016145, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.27978516, + "step": 11957, + "time_per_iteration": 2.7715535163879395 + }, + { + "auxiliary_loss_clip": 0.01284077, + "auxiliary_loss_mlp": 0.00249224, + "balance_loss_clip": 1.06129372, + "balance_loss_mlp": 0.21896912, + "epoch": 0.7189538554035774, + "flos": 27271497087360.0, + "grad_norm": 3.5168923581034885, + "language_loss": 0.83218426, + "learning_rate": 7.727800288701582e-07, + "loss": 0.84751725, + "num_input_tokens_seen": 258035420, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.30285645, + "step": 11958, + "time_per_iteration": 2.7629034519195557 + }, + { + "auxiliary_loss_clip": 0.01259904, + "auxiliary_loss_mlp": 0.00219078, + "balance_loss_clip": 1.04090214, + "balance_loss_mlp": 0.193663, + "epoch": 0.7190139786562453, + "flos": 21580625710080.0, + "grad_norm": 16.794277923719616, + "language_loss": 0.91561925, + "learning_rate": 7.724725295896215e-07, + "loss": 0.93040907, + "num_input_tokens_seen": 258053520, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.25402832, + "step": 11959, + "time_per_iteration": 2.669038772583008 + }, + { + "auxiliary_loss_clip": 0.01281339, + "auxiliary_loss_mlp": 0.00223452, + "balance_loss_clip": 1.05732512, + "balance_loss_mlp": 0.19647522, + "epoch": 0.7190741019089133, + "flos": 26721237663360.0, + "grad_norm": 5.744386962422641, + "language_loss": 0.89042354, + "learning_rate": 7.7216507685705e-07, + "loss": 0.90547156, + "num_input_tokens_seen": 258073020, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.26953125, + "step": 11960, + "time_per_iteration": 2.6460134983062744 + }, + { + "auxiliary_loss_clip": 0.01278968, + "auxiliary_loss_mlp": 0.00249463, + "balance_loss_clip": 1.05835605, + "balance_loss_mlp": 0.22048274, + "epoch": 0.7191342251615812, + "flos": 26104759516800.0, + "grad_norm": 61.44568339644394, + "language_loss": 0.861283, + "learning_rate": 7.718576706841013e-07, + "loss": 0.8765673, + "num_input_tokens_seen": 258093155, + "router_z_loss_clip": 2.20410156, + "router_z_loss_mlp": 0.28979492, + "step": 11961, + "time_per_iteration": 2.6783933639526367 + }, + { + "auxiliary_loss_clip": 0.01270645, + "auxiliary_loss_mlp": 0.00220876, + "balance_loss_clip": 1.06094491, + "balance_loss_mlp": 0.19717672, + "epoch": 0.7191943484142492, + "flos": 22967028904320.0, + "grad_norm": 21.2956373254159, + "language_loss": 0.80758417, + "learning_rate": 7.715503110824326e-07, + "loss": 0.82249933, + "num_input_tokens_seen": 258113905, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.23693848, + "step": 11962, + "time_per_iteration": 4.067600250244141 + }, + { + "auxiliary_loss_clip": 0.01277967, + "auxiliary_loss_mlp": 0.00222357, + "balance_loss_clip": 1.0540688, + "balance_loss_mlp": 0.19571325, + "epoch": 0.7192544716669171, + "flos": 22565332131840.0, + "grad_norm": 151.07154464750238, + "language_loss": 0.82572287, + "learning_rate": 7.712429980637001e-07, + "loss": 0.84072614, + "num_input_tokens_seen": 258132820, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.26635742, + "step": 11963, + "time_per_iteration": 2.663830041885376 + }, + { + "auxiliary_loss_clip": 0.01319084, + "auxiliary_loss_mlp": 0.00242357, + "balance_loss_clip": 1.08314097, + "balance_loss_mlp": 0.21334137, + "epoch": 0.7193145949195852, + "flos": 18982200873600.0, + "grad_norm": 40.65007559096879, + "language_loss": 0.94660389, + "learning_rate": 7.709357316395564e-07, + "loss": 0.96221828, + "num_input_tokens_seen": 258148055, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.28979492, + "step": 11964, + "time_per_iteration": 4.069465398788452 + }, + { + "auxiliary_loss_clip": 0.01279608, + "auxiliary_loss_mlp": 0.00253329, + "balance_loss_clip": 1.05891967, + "balance_loss_mlp": 0.22532724, + "epoch": 0.7193747181722531, + "flos": 18004246208640.0, + "grad_norm": 108.46581405345161, + "language_loss": 0.81560117, + "learning_rate": 7.70628511821652e-07, + "loss": 0.83093053, + "num_input_tokens_seen": 258165995, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.27966309, + "step": 11965, + "time_per_iteration": 2.6524858474731445 + }, + { + "auxiliary_loss_clip": 0.01288687, + "auxiliary_loss_mlp": 0.00222494, + "balance_loss_clip": 1.06413031, + "balance_loss_mlp": 0.19579124, + "epoch": 0.7194348414249211, + "flos": 24389414547840.0, + "grad_norm": 11.657674583756176, + "language_loss": 0.86127383, + "learning_rate": 7.703213386216377e-07, + "loss": 0.87638563, + "num_input_tokens_seen": 258186165, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.26708984, + "step": 11966, + "time_per_iteration": 2.6706488132476807 + }, + { + "auxiliary_loss_clip": 0.01265701, + "auxiliary_loss_mlp": 0.00219864, + "balance_loss_clip": 1.05122292, + "balance_loss_mlp": 0.19649848, + "epoch": 0.7194949646775891, + "flos": 22163455791360.0, + "grad_norm": 6.665520391476738, + "language_loss": 0.80880672, + "learning_rate": 7.700142120511619e-07, + "loss": 0.8236624, + "num_input_tokens_seen": 258204595, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.23376465, + "step": 11967, + "time_per_iteration": 4.0812389850616455 + }, + { + "auxiliary_loss_clip": 0.01271883, + "auxiliary_loss_mlp": 0.00211527, + "balance_loss_clip": 1.05991709, + "balance_loss_mlp": 0.18701792, + "epoch": 0.719555087930257, + "flos": 20266366982400.0, + "grad_norm": 8.429505443655739, + "language_loss": 0.88601232, + "learning_rate": 7.6970713212187e-07, + "loss": 0.90084636, + "num_input_tokens_seen": 258223110, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.24499512, + "step": 11968, + "time_per_iteration": 2.685732126235962 + }, + { + "auxiliary_loss_clip": 0.01278588, + "auxiliary_loss_mlp": 0.00235189, + "balance_loss_clip": 1.05891609, + "balance_loss_mlp": 0.20984487, + "epoch": 0.719615211182925, + "flos": 24716309247360.0, + "grad_norm": 26.40673377375941, + "language_loss": 0.85160196, + "learning_rate": 7.69400098845407e-07, + "loss": 0.86673975, + "num_input_tokens_seen": 258242660, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.25354004, + "step": 11969, + "time_per_iteration": 2.7035915851593018 + }, + { + "auxiliary_loss_clip": 0.01273618, + "auxiliary_loss_mlp": 0.00236032, + "balance_loss_clip": 1.05364263, + "balance_loss_mlp": 0.20935297, + "epoch": 0.719675334435593, + "flos": 20009641501440.0, + "grad_norm": 4.077165123040124, + "language_loss": 0.78019047, + "learning_rate": 7.69093112233417e-07, + "loss": 0.79528701, + "num_input_tokens_seen": 258261850, + "router_z_loss_clip": 2.20410156, + "router_z_loss_mlp": 0.2668457, + "step": 11970, + "time_per_iteration": 2.706408977508545 + }, + { + "auxiliary_loss_clip": 0.01177366, + "auxiliary_loss_mlp": 0.00148605, + "balance_loss_clip": 1.01754701, + "balance_loss_mlp": 0.14097515, + "epoch": 0.719735457688261, + "flos": 44199861177600.0, + "grad_norm": 0.9253839073123786, + "language_loss": 0.5985446, + "learning_rate": 7.68786172297538e-07, + "loss": 0.61180431, + "num_input_tokens_seen": 258312570, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.07617188, + "step": 11971, + "time_per_iteration": 4.455890417098999 + }, + { + "auxiliary_loss_clip": 0.01275942, + "auxiliary_loss_mlp": 0.00238057, + "balance_loss_clip": 1.05220294, + "balance_loss_mlp": 0.21032888, + "epoch": 0.7197955809409289, + "flos": 16802890905600.0, + "grad_norm": 204.16617404685942, + "language_loss": 0.8965435, + "learning_rate": 7.684792790494105e-07, + "loss": 0.9116835, + "num_input_tokens_seen": 258331600, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.27722168, + "step": 11972, + "time_per_iteration": 2.687319755554199 + }, + { + "auxiliary_loss_clip": 0.01291396, + "auxiliary_loss_mlp": 0.00235537, + "balance_loss_clip": 1.07060146, + "balance_loss_mlp": 0.20944166, + "epoch": 0.7198557041935969, + "flos": 24535391420160.0, + "grad_norm": 4.312155147429244, + "language_loss": 0.82065988, + "learning_rate": 7.681724325006733e-07, + "loss": 0.83592921, + "num_input_tokens_seen": 258351785, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.26086426, + "step": 11973, + "time_per_iteration": 2.73226261138916 + }, + { + "auxiliary_loss_clip": 0.01177082, + "auxiliary_loss_mlp": 0.00083242, + "balance_loss_clip": 1.01614666, + "balance_loss_mlp": 0.07599403, + "epoch": 0.7199158274462648, + "flos": 70710839602560.0, + "grad_norm": 0.8313854656387055, + "language_loss": 0.55821055, + "learning_rate": 7.6786563266296e-07, + "loss": 0.57081383, + "num_input_tokens_seen": 258404035, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.07226562, + "step": 11974, + "time_per_iteration": 3.005429983139038 + }, + { + "auxiliary_loss_clip": 0.01269321, + "auxiliary_loss_mlp": 0.00220181, + "balance_loss_clip": 1.05064058, + "balance_loss_mlp": 0.19532628, + "epoch": 0.7199759506989328, + "flos": 29347995352320.0, + "grad_norm": 7.08770582997476, + "language_loss": 0.69264615, + "learning_rate": 7.675588795479062e-07, + "loss": 0.70754117, + "num_input_tokens_seen": 258424850, + "router_z_loss_clip": 2.18847656, + "router_z_loss_mlp": 0.24841309, + "step": 11975, + "time_per_iteration": 2.7801480293273926 + }, + { + "auxiliary_loss_clip": 0.01254887, + "auxiliary_loss_mlp": 0.00208499, + "balance_loss_clip": 1.03538144, + "balance_loss_mlp": 0.18373913, + "epoch": 0.7200360739516007, + "flos": 24640465680000.0, + "grad_norm": 21.052868106043128, + "language_loss": 0.75716591, + "learning_rate": 7.672521731671425e-07, + "loss": 0.7717998, + "num_input_tokens_seen": 258445485, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.24755859, + "step": 11976, + "time_per_iteration": 2.705580711364746 + }, + { + "auxiliary_loss_clip": 0.01269313, + "auxiliary_loss_mlp": 0.00227527, + "balance_loss_clip": 1.05587435, + "balance_loss_mlp": 0.20229049, + "epoch": 0.7200961972042688, + "flos": 20812855478400.0, + "grad_norm": 21.055087082594326, + "language_loss": 0.74992645, + "learning_rate": 7.669455135323004e-07, + "loss": 0.76489484, + "num_input_tokens_seen": 258464505, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.25244141, + "step": 11977, + "time_per_iteration": 2.686563014984131 + }, + { + "auxiliary_loss_clip": 0.01284247, + "auxiliary_loss_mlp": 0.00236879, + "balance_loss_clip": 1.06343853, + "balance_loss_mlp": 0.21147573, + "epoch": 0.7201563204569367, + "flos": 31245910174080.0, + "grad_norm": 2.883417268537661, + "language_loss": 0.83641291, + "learning_rate": 7.666389006550074e-07, + "loss": 0.85162419, + "num_input_tokens_seen": 258487190, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.25378418, + "step": 11978, + "time_per_iteration": 2.7775189876556396 + }, + { + "auxiliary_loss_clip": 0.01275724, + "auxiliary_loss_mlp": 0.00238939, + "balance_loss_clip": 1.05908537, + "balance_loss_mlp": 0.21311848, + "epoch": 0.7202164437096047, + "flos": 26651391667200.0, + "grad_norm": 71.5673744653802, + "language_loss": 0.85707206, + "learning_rate": 7.663323345468908e-07, + "loss": 0.87221873, + "num_input_tokens_seen": 258503790, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.25805664, + "step": 11979, + "time_per_iteration": 2.7086057662963867 + }, + { + "auxiliary_loss_clip": 0.01282397, + "auxiliary_loss_mlp": 0.00242937, + "balance_loss_clip": 1.05881512, + "balance_loss_mlp": 0.21612713, + "epoch": 0.7202765669622727, + "flos": 25959608657280.0, + "grad_norm": 2.34155609923816, + "language_loss": 0.71218228, + "learning_rate": 7.660258152195767e-07, + "loss": 0.72743559, + "num_input_tokens_seen": 258527335, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.2677002, + "step": 11980, + "time_per_iteration": 2.7212605476379395 + }, + { + "auxiliary_loss_clip": 0.01279393, + "auxiliary_loss_mlp": 0.00238517, + "balance_loss_clip": 1.05585742, + "balance_loss_mlp": 0.21227857, + "epoch": 0.7203366902149406, + "flos": 28512354372480.0, + "grad_norm": 16.82305431011846, + "language_loss": 0.76674664, + "learning_rate": 7.657193426846871e-07, + "loss": 0.78192574, + "num_input_tokens_seen": 258546690, + "router_z_loss_clip": 2.23535156, + "router_z_loss_mlp": 0.2623291, + "step": 11981, + "time_per_iteration": 2.7050836086273193 + }, + { + "auxiliary_loss_clip": 0.01288222, + "auxiliary_loss_mlp": 0.00236004, + "balance_loss_clip": 1.06398034, + "balance_loss_mlp": 0.20765644, + "epoch": 0.7203968134676086, + "flos": 21106030285440.0, + "grad_norm": 39.996194041635505, + "language_loss": 0.81953472, + "learning_rate": 7.65412916953843e-07, + "loss": 0.834777, + "num_input_tokens_seen": 258566340, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.2833252, + "step": 11982, + "time_per_iteration": 2.695765972137451 + }, + { + "auxiliary_loss_clip": 0.01271304, + "auxiliary_loss_mlp": 0.00227577, + "balance_loss_clip": 1.05238104, + "balance_loss_mlp": 0.20186332, + "epoch": 0.7204569367202766, + "flos": 18332146488960.0, + "grad_norm": 65.72578191298544, + "language_loss": 0.7538923, + "learning_rate": 7.65106538038665e-07, + "loss": 0.76888108, + "num_input_tokens_seen": 258584455, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.25695801, + "step": 11983, + "time_per_iteration": 2.6308202743530273 + }, + { + "auxiliary_loss_clip": 0.01278897, + "auxiliary_loss_mlp": 0.00254672, + "balance_loss_clip": 1.05486679, + "balance_loss_mlp": 0.22744417, + "epoch": 0.7205170599729446, + "flos": 23255103980160.0, + "grad_norm": 27.819114363776198, + "language_loss": 0.73824, + "learning_rate": 7.648002059507715e-07, + "loss": 0.75357574, + "num_input_tokens_seen": 258604725, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.27233887, + "step": 11984, + "time_per_iteration": 2.701201915740967 + }, + { + "auxiliary_loss_clip": 0.01316322, + "auxiliary_loss_mlp": 0.00216763, + "balance_loss_clip": 1.08471966, + "balance_loss_mlp": 0.18861744, + "epoch": 0.7205771832256125, + "flos": 20120892900480.0, + "grad_norm": 31.333632469713486, + "language_loss": 0.82802451, + "learning_rate": 7.644939207017771e-07, + "loss": 0.84335536, + "num_input_tokens_seen": 258622885, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.28161621, + "step": 11985, + "time_per_iteration": 2.641064405441284 + }, + { + "auxiliary_loss_clip": 0.01270582, + "auxiliary_loss_mlp": 0.00213202, + "balance_loss_clip": 1.05429995, + "balance_loss_mlp": 0.18754837, + "epoch": 0.7206373064782805, + "flos": 27703250565120.0, + "grad_norm": 11.52233170752679, + "language_loss": 0.68589664, + "learning_rate": 7.641876823032977e-07, + "loss": 0.7007345, + "num_input_tokens_seen": 258644305, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.25634766, + "step": 11986, + "time_per_iteration": 2.7199172973632812 + }, + { + "auxiliary_loss_clip": 0.01284087, + "auxiliary_loss_mlp": 0.00239347, + "balance_loss_clip": 1.05869222, + "balance_loss_mlp": 0.21129669, + "epoch": 0.7206974297309484, + "flos": 17968156018560.0, + "grad_norm": 19.069202993554043, + "language_loss": 0.78884959, + "learning_rate": 7.638814907669455e-07, + "loss": 0.80408382, + "num_input_tokens_seen": 258661775, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.28039551, + "step": 11987, + "time_per_iteration": 2.6895010471343994 + }, + { + "auxiliary_loss_clip": 0.01283081, + "auxiliary_loss_mlp": 0.00239176, + "balance_loss_clip": 1.05851269, + "balance_loss_mlp": 0.21153107, + "epoch": 0.7207575529836164, + "flos": 16983162288000.0, + "grad_norm": 46.19099880940014, + "language_loss": 0.86691052, + "learning_rate": 7.635753461043301e-07, + "loss": 0.88213307, + "num_input_tokens_seen": 258679830, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.27648926, + "step": 11988, + "time_per_iteration": 2.698059320449829 + }, + { + "auxiliary_loss_clip": 0.01268328, + "auxiliary_loss_mlp": 0.00221811, + "balance_loss_clip": 1.05020583, + "balance_loss_mlp": 0.19683644, + "epoch": 0.7208176762362843, + "flos": 18727594295040.0, + "grad_norm": 6.410814048068348, + "language_loss": 0.85214162, + "learning_rate": 7.632692483270618e-07, + "loss": 0.86704296, + "num_input_tokens_seen": 258697415, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.25, + "step": 11989, + "time_per_iteration": 2.6740670204162598 + }, + { + "auxiliary_loss_clip": 0.01255594, + "auxiliary_loss_mlp": 0.00248045, + "balance_loss_clip": 1.03864741, + "balance_loss_mlp": 0.2223081, + "epoch": 0.7208777994889524, + "flos": 18734489706240.0, + "grad_norm": 8.572349543196555, + "language_loss": 0.90791595, + "learning_rate": 7.629631974467481e-07, + "loss": 0.92295235, + "num_input_tokens_seen": 258716755, + "router_z_loss_clip": 2.17285156, + "router_z_loss_mlp": 0.25756836, + "step": 11990, + "time_per_iteration": 2.685666799545288 + }, + { + "auxiliary_loss_clip": 0.01289538, + "auxiliary_loss_mlp": 0.00247137, + "balance_loss_clip": 1.0622952, + "balance_loss_mlp": 0.22005308, + "epoch": 0.7209379227416203, + "flos": 14793437376000.0, + "grad_norm": 18.331604295828267, + "language_loss": 0.86277282, + "learning_rate": 7.626571934749931e-07, + "loss": 0.87813962, + "num_input_tokens_seen": 258733270, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.27075195, + "step": 11991, + "time_per_iteration": 2.617421865463257 + }, + { + "auxiliary_loss_clip": 0.01274107, + "auxiliary_loss_mlp": 0.00229003, + "balance_loss_clip": 1.05062568, + "balance_loss_mlp": 0.20262162, + "epoch": 0.7209980459942883, + "flos": 29636860527360.0, + "grad_norm": 6.388065775798868, + "language_loss": 0.78805083, + "learning_rate": 7.623512364234022e-07, + "loss": 0.80308193, + "num_input_tokens_seen": 258755270, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.2635498, + "step": 11992, + "time_per_iteration": 2.746302604675293 + }, + { + "auxiliary_loss_clip": 0.01292702, + "auxiliary_loss_mlp": 0.00241478, + "balance_loss_clip": 1.06250489, + "balance_loss_mlp": 0.21308249, + "epoch": 0.7210581692469563, + "flos": 23477175815040.0, + "grad_norm": 8.660892457368531, + "language_loss": 0.73977697, + "learning_rate": 7.620453263035755e-07, + "loss": 0.75511873, + "num_input_tokens_seen": 258775340, + "router_z_loss_clip": 2.30273438, + "router_z_loss_mlp": 0.28356934, + "step": 11993, + "time_per_iteration": 2.72078275680542 + }, + { + "auxiliary_loss_clip": 0.01275699, + "auxiliary_loss_mlp": 0.00234308, + "balance_loss_clip": 1.05721712, + "balance_loss_mlp": 0.21037072, + "epoch": 0.7211182924996242, + "flos": 26099839353600.0, + "grad_norm": 12.133936881067596, + "language_loss": 0.7320618, + "learning_rate": 7.61739463127115e-07, + "loss": 0.74716187, + "num_input_tokens_seen": 258794580, + "router_z_loss_clip": 2.18847656, + "router_z_loss_mlp": 0.23950195, + "step": 11994, + "time_per_iteration": 2.71977162361145 + }, + { + "auxiliary_loss_clip": 0.0131178, + "auxiliary_loss_mlp": 0.00227499, + "balance_loss_clip": 1.07593036, + "balance_loss_mlp": 0.19862671, + "epoch": 0.7211784157522922, + "flos": 17712076982400.0, + "grad_norm": 15.29109309836306, + "language_loss": 0.7661289, + "learning_rate": 7.614336469056172e-07, + "loss": 0.78152168, + "num_input_tokens_seen": 258812330, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.28857422, + "step": 11995, + "time_per_iteration": 2.6521332263946533 + }, + { + "auxiliary_loss_clip": 0.01288454, + "auxiliary_loss_mlp": 0.00233967, + "balance_loss_clip": 1.06649494, + "balance_loss_mlp": 0.20671612, + "epoch": 0.7212385390049602, + "flos": 24423637230720.0, + "grad_norm": 26.061642915747093, + "language_loss": 0.86751246, + "learning_rate": 7.6112787765068e-07, + "loss": 0.88273668, + "num_input_tokens_seen": 258831770, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.27233887, + "step": 11996, + "time_per_iteration": 2.692399263381958 + }, + { + "auxiliary_loss_clip": 0.01300197, + "auxiliary_loss_mlp": 0.00237727, + "balance_loss_clip": 1.07074058, + "balance_loss_mlp": 0.21058334, + "epoch": 0.7212986622576282, + "flos": 28147250580480.0, + "grad_norm": 3.8818114038970344, + "language_loss": 0.91973388, + "learning_rate": 7.60822155373899e-07, + "loss": 0.93511313, + "num_input_tokens_seen": 258849090, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.27124023, + "step": 11997, + "time_per_iteration": 2.744286298751831 + }, + { + "auxiliary_loss_clip": 0.01287665, + "auxiliary_loss_mlp": 0.0022072, + "balance_loss_clip": 1.06028461, + "balance_loss_mlp": 0.1936473, + "epoch": 0.7213587855102961, + "flos": 21835770992640.0, + "grad_norm": 1238.4657391461449, + "language_loss": 0.78906381, + "learning_rate": 7.605164800868646e-07, + "loss": 0.80414772, + "num_input_tokens_seen": 258868230, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.27062988, + "step": 11998, + "time_per_iteration": 2.687098503112793 + }, + { + "auxiliary_loss_clip": 0.01263077, + "auxiliary_loss_mlp": 0.00211432, + "balance_loss_clip": 1.04519546, + "balance_loss_mlp": 0.18553934, + "epoch": 0.7214189087629641, + "flos": 14611549881600.0, + "grad_norm": 29.24405151464381, + "language_loss": 0.79672396, + "learning_rate": 7.602108518011696e-07, + "loss": 0.81146908, + "num_input_tokens_seen": 258885525, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.2590332, + "step": 11999, + "time_per_iteration": 2.707030773162842 + }, + { + "auxiliary_loss_clip": 0.01285329, + "auxiliary_loss_mlp": 0.00233863, + "balance_loss_clip": 1.06151283, + "balance_loss_mlp": 0.20628986, + "epoch": 0.721479032015632, + "flos": 19390864884480.0, + "grad_norm": 32.891450241636356, + "language_loss": 0.9069469, + "learning_rate": 7.599052705284039e-07, + "loss": 0.92213887, + "num_input_tokens_seen": 258903245, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.27587891, + "step": 12000, + "time_per_iteration": 2.666729211807251 + }, + { + "auxiliary_loss_clip": 0.01273804, + "auxiliary_loss_mlp": 0.00226903, + "balance_loss_clip": 1.05559194, + "balance_loss_mlp": 0.20268011, + "epoch": 0.7215391552683, + "flos": 18512884748160.0, + "grad_norm": 164.78102882984254, + "language_loss": 0.85328454, + "learning_rate": 7.59599736280154e-07, + "loss": 0.86829156, + "num_input_tokens_seen": 258921245, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.24230957, + "step": 12001, + "time_per_iteration": 2.8500304222106934 + }, + { + "auxiliary_loss_clip": 0.01269797, + "auxiliary_loss_mlp": 0.00216622, + "balance_loss_clip": 1.05096877, + "balance_loss_mlp": 0.19101554, + "epoch": 0.721599278520968, + "flos": 23258731253760.0, + "grad_norm": 37.1926755820163, + "language_loss": 0.87862599, + "learning_rate": 7.592942490680066e-07, + "loss": 0.8934902, + "num_input_tokens_seen": 258939425, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.25598145, + "step": 12002, + "time_per_iteration": 2.729344606399536 + }, + { + "auxiliary_loss_clip": 0.01294095, + "auxiliary_loss_mlp": 0.00213469, + "balance_loss_clip": 1.06669271, + "balance_loss_mlp": 0.18640804, + "epoch": 0.721659401773636, + "flos": 39199045979520.0, + "grad_norm": 4.02729078884529, + "language_loss": 0.72336805, + "learning_rate": 7.589888089035462e-07, + "loss": 0.73844367, + "num_input_tokens_seen": 258960710, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.27062988, + "step": 12003, + "time_per_iteration": 2.8136518001556396 + }, + { + "auxiliary_loss_clip": 0.01298767, + "auxiliary_loss_mlp": 0.00245831, + "balance_loss_clip": 1.07014298, + "balance_loss_mlp": 0.21775705, + "epoch": 0.7217195250263039, + "flos": 14939917038720.0, + "grad_norm": 14.252274180273691, + "language_loss": 0.81094903, + "learning_rate": 7.586834157983544e-07, + "loss": 0.82639503, + "num_input_tokens_seen": 258978475, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.28088379, + "step": 12004, + "time_per_iteration": 2.613208770751953 + }, + { + "auxiliary_loss_clip": 0.01151325, + "auxiliary_loss_mlp": 0.0006073, + "balance_loss_clip": 0.99559057, + "balance_loss_mlp": 0.0521468, + "epoch": 0.7217796482789719, + "flos": 70869206666880.0, + "grad_norm": 0.8543799041770398, + "language_loss": 0.53192043, + "learning_rate": 7.583780697640112e-07, + "loss": 0.54404098, + "num_input_tokens_seen": 259037520, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.0859375, + "step": 12005, + "time_per_iteration": 4.473542928695679 + }, + { + "auxiliary_loss_clip": 0.01287666, + "auxiliary_loss_mlp": 0.00218609, + "balance_loss_clip": 1.0618, + "balance_loss_mlp": 0.19394496, + "epoch": 0.7218397715316398, + "flos": 37451525402880.0, + "grad_norm": 36.761025856332225, + "language_loss": 0.71219629, + "learning_rate": 7.580727708120962e-07, + "loss": 0.7272591, + "num_input_tokens_seen": 259061325, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.24645996, + "step": 12006, + "time_per_iteration": 4.225692510604858 + }, + { + "auxiliary_loss_clip": 0.01276949, + "auxiliary_loss_mlp": 0.00217214, + "balance_loss_clip": 1.0558089, + "balance_loss_mlp": 0.19216776, + "epoch": 0.7218998947843078, + "flos": 22710662559360.0, + "grad_norm": 173.99190768631524, + "language_loss": 0.97985613, + "learning_rate": 7.577675189541865e-07, + "loss": 0.99479777, + "num_input_tokens_seen": 259078135, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.25024414, + "step": 12007, + "time_per_iteration": 2.6619932651519775 + }, + { + "auxiliary_loss_clip": 0.01274404, + "auxiliary_loss_mlp": 0.00239581, + "balance_loss_clip": 1.05065525, + "balance_loss_mlp": 0.21352234, + "epoch": 0.7219600180369758, + "flos": 12167182477440.0, + "grad_norm": 44.4385224785224, + "language_loss": 0.74539208, + "learning_rate": 7.574623142018568e-07, + "loss": 0.7605319, + "num_input_tokens_seen": 259095910, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.26074219, + "step": 12008, + "time_per_iteration": 2.6073551177978516 + }, + { + "auxiliary_loss_clip": 0.01270335, + "auxiliary_loss_mlp": 0.00221083, + "balance_loss_clip": 1.04697537, + "balance_loss_mlp": 0.19454658, + "epoch": 0.7220201412896438, + "flos": 22596573985920.0, + "grad_norm": 33.74395203792997, + "language_loss": 0.88282555, + "learning_rate": 7.57157156566681e-07, + "loss": 0.89773977, + "num_input_tokens_seen": 259114225, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.26538086, + "step": 12009, + "time_per_iteration": 4.125509738922119 + }, + { + "auxiliary_loss_clip": 0.01300366, + "auxiliary_loss_mlp": 0.00254465, + "balance_loss_clip": 1.06899786, + "balance_loss_mlp": 0.22742838, + "epoch": 0.7220802645423118, + "flos": 26718651884160.0, + "grad_norm": 212.31168893122904, + "language_loss": 0.71120381, + "learning_rate": 7.568520460602297e-07, + "loss": 0.7267521, + "num_input_tokens_seen": 259134660, + "router_z_loss_clip": 2.31835938, + "router_z_loss_mlp": 0.27038574, + "step": 12010, + "time_per_iteration": 2.7128357887268066 + }, + { + "auxiliary_loss_clip": 0.01280105, + "auxiliary_loss_mlp": 0.00223307, + "balance_loss_clip": 1.0625205, + "balance_loss_mlp": 0.19786739, + "epoch": 0.7221403877949797, + "flos": 24420548661120.0, + "grad_norm": 94.05798383279836, + "language_loss": 0.83583134, + "learning_rate": 7.565469826940742e-07, + "loss": 0.85086548, + "num_input_tokens_seen": 259153300, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.2545166, + "step": 12011, + "time_per_iteration": 2.7237677574157715 + }, + { + "auxiliary_loss_clip": 0.01281516, + "auxiliary_loss_mlp": 0.00223253, + "balance_loss_clip": 1.06051099, + "balance_loss_mlp": 0.19717011, + "epoch": 0.7222005110476477, + "flos": 23514379326720.0, + "grad_norm": 32.05653979635765, + "language_loss": 0.86258161, + "learning_rate": 7.56241966479781e-07, + "loss": 0.87762934, + "num_input_tokens_seen": 259172115, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.26074219, + "step": 12012, + "time_per_iteration": 2.7390973567962646 + }, + { + "auxiliary_loss_clip": 0.01276219, + "auxiliary_loss_mlp": 0.00223573, + "balance_loss_clip": 1.04847956, + "balance_loss_mlp": 0.19716819, + "epoch": 0.7222606343003156, + "flos": 23112538899840.0, + "grad_norm": 5.714869652721236, + "language_loss": 0.86140341, + "learning_rate": 7.559369974289171e-07, + "loss": 0.87640131, + "num_input_tokens_seen": 259191345, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.26416016, + "step": 12013, + "time_per_iteration": 4.215647220611572 + }, + { + "auxiliary_loss_clip": 0.01292628, + "auxiliary_loss_mlp": 0.00232485, + "balance_loss_clip": 1.07058883, + "balance_loss_mlp": 0.20729646, + "epoch": 0.7223207575529836, + "flos": 24351169541760.0, + "grad_norm": 188.28572466052273, + "language_loss": 0.82655311, + "learning_rate": 7.556320755530484e-07, + "loss": 0.84180427, + "num_input_tokens_seen": 259211700, + "router_z_loss_clip": 2.22167969, + "router_z_loss_mlp": 0.25170898, + "step": 12014, + "time_per_iteration": 2.738389015197754 + }, + { + "auxiliary_loss_clip": 0.0128476, + "auxiliary_loss_mlp": 0.0021688, + "balance_loss_clip": 1.06307495, + "balance_loss_mlp": 0.19027224, + "epoch": 0.7223808808056515, + "flos": 28330179569280.0, + "grad_norm": 6.068376206425522, + "language_loss": 0.92309165, + "learning_rate": 7.553272008637346e-07, + "loss": 0.93810809, + "num_input_tokens_seen": 259233825, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.26599121, + "step": 12015, + "time_per_iteration": 2.772022247314453 + }, + { + "auxiliary_loss_clip": 0.01270758, + "auxiliary_loss_mlp": 0.0021099, + "balance_loss_clip": 1.04875052, + "balance_loss_mlp": 0.18643326, + "epoch": 0.7224410040583196, + "flos": 21069437304960.0, + "grad_norm": 44.072045500354896, + "language_loss": 0.86542636, + "learning_rate": 7.55022373372538e-07, + "loss": 0.88024384, + "num_input_tokens_seen": 259253055, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.24560547, + "step": 12016, + "time_per_iteration": 2.6919190883636475 + }, + { + "auxiliary_loss_clip": 0.01273208, + "auxiliary_loss_mlp": 0.00223728, + "balance_loss_clip": 1.05526495, + "balance_loss_mlp": 0.19968396, + "epoch": 0.7225011273109875, + "flos": 26795429205120.0, + "grad_norm": 69.01418673848607, + "language_loss": 0.84320527, + "learning_rate": 7.547175930910186e-07, + "loss": 0.85817462, + "num_input_tokens_seen": 259273420, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.24084473, + "step": 12017, + "time_per_iteration": 2.7903904914855957 + }, + { + "auxiliary_loss_clip": 0.01256766, + "auxiliary_loss_mlp": 0.00203895, + "balance_loss_clip": 1.04009604, + "balance_loss_mlp": 0.17930201, + "epoch": 0.7225612505636555, + "flos": 23583578878080.0, + "grad_norm": 33.28488959675894, + "language_loss": 0.81191063, + "learning_rate": 7.54412860030732e-07, + "loss": 0.82651728, + "num_input_tokens_seen": 259291000, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.24597168, + "step": 12018, + "time_per_iteration": 2.6843619346618652 + }, + { + "auxiliary_loss_clip": 0.01255222, + "auxiliary_loss_mlp": 0.00235381, + "balance_loss_clip": 1.04184294, + "balance_loss_mlp": 0.21010903, + "epoch": 0.7226213738163234, + "flos": 20777627214720.0, + "grad_norm": 7.513429103109149, + "language_loss": 0.84272408, + "learning_rate": 7.541081742032347e-07, + "loss": 0.85763013, + "num_input_tokens_seen": 259312390, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.25256348, + "step": 12019, + "time_per_iteration": 2.87785267829895 + }, + { + "auxiliary_loss_clip": 0.01262795, + "auxiliary_loss_mlp": 0.00220087, + "balance_loss_clip": 1.04975808, + "balance_loss_mlp": 0.19470698, + "epoch": 0.7226814970689914, + "flos": 32635832901120.0, + "grad_norm": 259.8497637730788, + "language_loss": 0.82196414, + "learning_rate": 7.53803535620081e-07, + "loss": 0.83679295, + "num_input_tokens_seen": 259332645, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.25378418, + "step": 12020, + "time_per_iteration": 2.924428701400757 + }, + { + "auxiliary_loss_clip": 0.0126936, + "auxiliary_loss_mlp": 0.0024594, + "balance_loss_clip": 1.0465467, + "balance_loss_mlp": 0.22227678, + "epoch": 0.7227416203216595, + "flos": 22454368041600.0, + "grad_norm": 4.644978591650585, + "language_loss": 0.83384532, + "learning_rate": 7.534989442928219e-07, + "loss": 0.84899831, + "num_input_tokens_seen": 259353810, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.23669434, + "step": 12021, + "time_per_iteration": 2.898688554763794 + }, + { + "auxiliary_loss_clip": 0.01282823, + "auxiliary_loss_mlp": 0.00224909, + "balance_loss_clip": 1.05838585, + "balance_loss_mlp": 0.19724092, + "epoch": 0.7228017435743274, + "flos": 21652303299840.0, + "grad_norm": 18.05328117920469, + "language_loss": 0.74715316, + "learning_rate": 7.531944002330073e-07, + "loss": 0.76223052, + "num_input_tokens_seen": 259372460, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.27661133, + "step": 12022, + "time_per_iteration": 2.7517173290252686 + }, + { + "auxiliary_loss_clip": 0.01269642, + "auxiliary_loss_mlp": 0.00246908, + "balance_loss_clip": 1.04883754, + "balance_loss_mlp": 0.22002634, + "epoch": 0.7228618668269954, + "flos": 29533474206720.0, + "grad_norm": 10.098462373193954, + "language_loss": 0.74963611, + "learning_rate": 7.528899034521858e-07, + "loss": 0.76480162, + "num_input_tokens_seen": 259393275, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.26879883, + "step": 12023, + "time_per_iteration": 2.7015674114227295 + }, + { + "auxiliary_loss_clip": 0.01268836, + "auxiliary_loss_mlp": 0.00225601, + "balance_loss_clip": 1.0461843, + "balance_loss_mlp": 0.19947004, + "epoch": 0.7229219900796633, + "flos": 27453815544960.0, + "grad_norm": 33.59654137288184, + "language_loss": 0.76428324, + "learning_rate": 7.525854539619052e-07, + "loss": 0.77922761, + "num_input_tokens_seen": 259416205, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.26123047, + "step": 12024, + "time_per_iteration": 2.707451105117798 + }, + { + "auxiliary_loss_clip": 0.01262658, + "auxiliary_loss_mlp": 0.00230141, + "balance_loss_clip": 1.03713965, + "balance_loss_mlp": 0.20298535, + "epoch": 0.7229821133323313, + "flos": 16289368116480.0, + "grad_norm": 239.46872495358159, + "language_loss": 0.83759469, + "learning_rate": 7.522810517737089e-07, + "loss": 0.85252273, + "num_input_tokens_seen": 259433115, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.27148438, + "step": 12025, + "time_per_iteration": 2.656254768371582 + }, + { + "auxiliary_loss_clip": 0.0126115, + "auxiliary_loss_mlp": 0.0022059, + "balance_loss_clip": 1.04206836, + "balance_loss_mlp": 0.19431628, + "epoch": 0.7230422365849992, + "flos": 20412343854720.0, + "grad_norm": 108.03002347626064, + "language_loss": 0.8331632, + "learning_rate": 7.519766968991395e-07, + "loss": 0.84798062, + "num_input_tokens_seen": 259450475, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.26257324, + "step": 12026, + "time_per_iteration": 2.697174072265625 + }, + { + "auxiliary_loss_clip": 0.01269011, + "auxiliary_loss_mlp": 0.00220287, + "balance_loss_clip": 1.05357909, + "balance_loss_mlp": 0.19266585, + "epoch": 0.7231023598376672, + "flos": 25593499284480.0, + "grad_norm": 7.966147561993866, + "language_loss": 0.78598845, + "learning_rate": 7.516723893497388e-07, + "loss": 0.80088139, + "num_input_tokens_seen": 259469355, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.27624512, + "step": 12027, + "time_per_iteration": 2.720675468444824 + }, + { + "auxiliary_loss_clip": 0.01272616, + "auxiliary_loss_mlp": 0.00234768, + "balance_loss_clip": 1.05283761, + "balance_loss_mlp": 0.2095907, + "epoch": 0.7231624830903352, + "flos": 25149607009920.0, + "grad_norm": 7.968967093858625, + "language_loss": 0.87871087, + "learning_rate": 7.513681291370469e-07, + "loss": 0.89378476, + "num_input_tokens_seen": 259486565, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.25170898, + "step": 12028, + "time_per_iteration": 2.823854446411133 + }, + { + "auxiliary_loss_clip": 0.01279634, + "auxiliary_loss_mlp": 0.00222456, + "balance_loss_clip": 1.05619586, + "balance_loss_mlp": 0.19766034, + "epoch": 0.7232226063430032, + "flos": 21725740656000.0, + "grad_norm": 42.96263471786511, + "language_loss": 0.88930637, + "learning_rate": 7.510639162726e-07, + "loss": 0.90432727, + "num_input_tokens_seen": 259505070, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.24816895, + "step": 12029, + "time_per_iteration": 2.7128186225891113 + }, + { + "auxiliary_loss_clip": 0.01163123, + "auxiliary_loss_mlp": 0.0011819, + "balance_loss_clip": 1.00681901, + "balance_loss_mlp": 0.1101796, + "epoch": 0.7232827295956711, + "flos": 68436798491520.0, + "grad_norm": 2.144642586085176, + "language_loss": 0.61344457, + "learning_rate": 7.507597507679347e-07, + "loss": 0.62625766, + "num_input_tokens_seen": 259569135, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.08007812, + "step": 12030, + "time_per_iteration": 3.311546564102173 + }, + { + "auxiliary_loss_clip": 0.01267744, + "auxiliary_loss_mlp": 0.00214628, + "balance_loss_clip": 1.04630268, + "balance_loss_mlp": 0.18681654, + "epoch": 0.7233428528483391, + "flos": 20192642317440.0, + "grad_norm": 44.85885372039228, + "language_loss": 0.86251915, + "learning_rate": 7.504556326345859e-07, + "loss": 0.87734282, + "num_input_tokens_seen": 259587035, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.27819824, + "step": 12031, + "time_per_iteration": 2.7131011486053467 + }, + { + "auxiliary_loss_clip": 0.01281894, + "auxiliary_loss_mlp": 0.00253826, + "balance_loss_clip": 1.05859089, + "balance_loss_mlp": 0.22676536, + "epoch": 0.723402976101007, + "flos": 23949472769280.0, + "grad_norm": 12.418482663991782, + "language_loss": 0.89295501, + "learning_rate": 7.501515618840834e-07, + "loss": 0.9083122, + "num_input_tokens_seen": 259606140, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.27075195, + "step": 12032, + "time_per_iteration": 2.705920457839966 + }, + { + "auxiliary_loss_clip": 0.01274944, + "auxiliary_loss_mlp": 0.00233493, + "balance_loss_clip": 1.05012405, + "balance_loss_mlp": 0.20633674, + "epoch": 0.723463099353675, + "flos": 20813394182400.0, + "grad_norm": 5.2748871415768415, + "language_loss": 0.85430598, + "learning_rate": 7.498475385279592e-07, + "loss": 0.86939037, + "num_input_tokens_seen": 259624275, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.27148438, + "step": 12033, + "time_per_iteration": 2.6838576793670654 + }, + { + "auxiliary_loss_clip": 0.01268165, + "auxiliary_loss_mlp": 0.00206552, + "balance_loss_clip": 1.04913151, + "balance_loss_mlp": 0.18158942, + "epoch": 0.723523222606343, + "flos": 19098013299840.0, + "grad_norm": 2.0629415054743667, + "language_loss": 0.80835652, + "learning_rate": 7.495435625777423e-07, + "loss": 0.82310373, + "num_input_tokens_seen": 259643465, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.24963379, + "step": 12034, + "time_per_iteration": 2.6600399017333984 + }, + { + "auxiliary_loss_clip": 0.0124278, + "auxiliary_loss_mlp": 0.00225557, + "balance_loss_clip": 1.02486813, + "balance_loss_mlp": 0.20018888, + "epoch": 0.723583345859011, + "flos": 26506994993280.0, + "grad_norm": 92.99637885367927, + "language_loss": 0.89139611, + "learning_rate": 7.492396340449578e-07, + "loss": 0.90607947, + "num_input_tokens_seen": 259662500, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.25378418, + "step": 12035, + "time_per_iteration": 2.7290596961975098 + }, + { + "auxiliary_loss_clip": 0.01286496, + "auxiliary_loss_mlp": 0.00226932, + "balance_loss_clip": 1.05626869, + "balance_loss_mlp": 0.19852439, + "epoch": 0.723643469111679, + "flos": 16033863697920.0, + "grad_norm": 3386.8964651176993, + "language_loss": 0.69518805, + "learning_rate": 7.489357529411326e-07, + "loss": 0.71032238, + "num_input_tokens_seen": 259680140, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.28417969, + "step": 12036, + "time_per_iteration": 2.6330206394195557 + }, + { + "auxiliary_loss_clip": 0.01236461, + "auxiliary_loss_mlp": 0.00206852, + "balance_loss_clip": 1.02612507, + "balance_loss_mlp": 0.18291432, + "epoch": 0.7237035923643469, + "flos": 21945549934080.0, + "grad_norm": 4.8235916718369705, + "language_loss": 0.75975448, + "learning_rate": 7.486319192777883e-07, + "loss": 0.77418756, + "num_input_tokens_seen": 259700160, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.23925781, + "step": 12037, + "time_per_iteration": 2.667886972427368 + }, + { + "auxiliary_loss_clip": 0.01271382, + "auxiliary_loss_mlp": 0.00238989, + "balance_loss_clip": 1.04620934, + "balance_loss_mlp": 0.21165408, + "epoch": 0.7237637156170149, + "flos": 23583112001280.0, + "grad_norm": 390.93073834807353, + "language_loss": 0.81991184, + "learning_rate": 7.483281330664479e-07, + "loss": 0.83501548, + "num_input_tokens_seen": 259720525, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.27331543, + "step": 12038, + "time_per_iteration": 2.7076480388641357 + }, + { + "auxiliary_loss_clip": 0.012825, + "auxiliary_loss_mlp": 0.00225504, + "balance_loss_clip": 1.05601335, + "balance_loss_mlp": 0.19846702, + "epoch": 0.7238238388696828, + "flos": 20594698225920.0, + "grad_norm": 26.562260516497325, + "language_loss": 0.80992603, + "learning_rate": 7.480243943186293e-07, + "loss": 0.82500601, + "num_input_tokens_seen": 259738680, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.27038574, + "step": 12039, + "time_per_iteration": 2.739544153213501 + }, + { + "auxiliary_loss_clip": 0.01250462, + "auxiliary_loss_mlp": 0.00220608, + "balance_loss_clip": 1.03822732, + "balance_loss_mlp": 0.19777903, + "epoch": 0.7238839621223508, + "flos": 24207024263040.0, + "grad_norm": 3.7866550103913323, + "language_loss": 0.85539603, + "learning_rate": 7.477207030458513e-07, + "loss": 0.87010682, + "num_input_tokens_seen": 259758790, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.2286377, + "step": 12040, + "time_per_iteration": 2.720722198486328 + }, + { + "auxiliary_loss_clip": 0.01265887, + "auxiliary_loss_mlp": 0.0021832, + "balance_loss_clip": 1.03996968, + "balance_loss_mlp": 0.19229637, + "epoch": 0.7239440853750188, + "flos": 14209745368320.0, + "grad_norm": 3.3638649095431097, + "language_loss": 0.85208243, + "learning_rate": 7.474170592596301e-07, + "loss": 0.86692446, + "num_input_tokens_seen": 259777370, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.26013184, + "step": 12041, + "time_per_iteration": 2.6597347259521484 + }, + { + "auxiliary_loss_clip": 0.01252894, + "auxiliary_loss_mlp": 0.00217207, + "balance_loss_clip": 1.03300285, + "balance_loss_mlp": 0.19223225, + "epoch": 0.7240042086276868, + "flos": 21614812479360.0, + "grad_norm": 18.3092018530224, + "language_loss": 0.74152803, + "learning_rate": 7.471134629714797e-07, + "loss": 0.75622904, + "num_input_tokens_seen": 259794665, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.24951172, + "step": 12042, + "time_per_iteration": 2.656081438064575 + }, + { + "auxiliary_loss_clip": 0.01283594, + "auxiliary_loss_mlp": 0.00247752, + "balance_loss_clip": 1.05690813, + "balance_loss_mlp": 0.22047734, + "epoch": 0.7240643318803547, + "flos": 23331450337920.0, + "grad_norm": 6.049910069362165, + "language_loss": 0.90746987, + "learning_rate": 7.468099141929116e-07, + "loss": 0.92278332, + "num_input_tokens_seen": 259811110, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.27282715, + "step": 12043, + "time_per_iteration": 2.659255027770996 + }, + { + "auxiliary_loss_clip": 0.01262366, + "auxiliary_loss_mlp": 0.0023571, + "balance_loss_clip": 1.04054368, + "balance_loss_mlp": 0.20774373, + "epoch": 0.7241244551330227, + "flos": 24024849459840.0, + "grad_norm": 28.09313221973264, + "language_loss": 0.7282508, + "learning_rate": 7.465064129354379e-07, + "loss": 0.74323153, + "num_input_tokens_seen": 259831080, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.27966309, + "step": 12044, + "time_per_iteration": 2.7109737396240234 + }, + { + "auxiliary_loss_clip": 0.01268604, + "auxiliary_loss_mlp": 0.00246281, + "balance_loss_clip": 1.04796469, + "balance_loss_mlp": 0.22018635, + "epoch": 0.7241845783856906, + "flos": 18730323728640.0, + "grad_norm": 13.42573846496366, + "language_loss": 0.88042843, + "learning_rate": 7.462029592105658e-07, + "loss": 0.89557731, + "num_input_tokens_seen": 259850135, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.26123047, + "step": 12045, + "time_per_iteration": 2.6442127227783203 + }, + { + "auxiliary_loss_clip": 0.01251475, + "auxiliary_loss_mlp": 0.00217862, + "balance_loss_clip": 1.03407288, + "balance_loss_mlp": 0.19435346, + "epoch": 0.7242447016383586, + "flos": 19498668577920.0, + "grad_norm": 1.8255434433947455, + "language_loss": 0.79080719, + "learning_rate": 7.458995530298034e-07, + "loss": 0.80550057, + "num_input_tokens_seen": 259868185, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.23498535, + "step": 12046, + "time_per_iteration": 2.6434571743011475 + }, + { + "auxiliary_loss_clip": 0.01256295, + "auxiliary_loss_mlp": 0.00229185, + "balance_loss_clip": 1.03838575, + "balance_loss_mlp": 0.20398375, + "epoch": 0.7243048248910267, + "flos": 22163491704960.0, + "grad_norm": 5.726548600967069, + "language_loss": 0.8046295, + "learning_rate": 7.455961944046553e-07, + "loss": 0.81948423, + "num_input_tokens_seen": 259887055, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.2520752, + "step": 12047, + "time_per_iteration": 4.071962356567383 + }, + { + "auxiliary_loss_clip": 0.01278115, + "auxiliary_loss_mlp": 0.00210867, + "balance_loss_clip": 1.05233371, + "balance_loss_mlp": 0.1853919, + "epoch": 0.7243649481436946, + "flos": 27672762896640.0, + "grad_norm": 7.161236229036705, + "language_loss": 0.79496324, + "learning_rate": 7.45292883346627e-07, + "loss": 0.80985308, + "num_input_tokens_seen": 259908295, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.25463867, + "step": 12048, + "time_per_iteration": 4.130521535873413 + }, + { + "auxiliary_loss_clip": 0.0112476, + "auxiliary_loss_mlp": 0.00081424, + "balance_loss_clip": 0.97220814, + "balance_loss_mlp": 0.07365181, + "epoch": 0.7244250713963626, + "flos": 63244545759360.0, + "grad_norm": 0.9352396133428663, + "language_loss": 0.53099358, + "learning_rate": 7.449896198672168e-07, + "loss": 0.54305542, + "num_input_tokens_seen": 259968475, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.07763672, + "step": 12049, + "time_per_iteration": 3.17824387550354 + }, + { + "auxiliary_loss_clip": 0.0127365, + "auxiliary_loss_mlp": 0.00258874, + "balance_loss_clip": 1.04749703, + "balance_loss_mlp": 0.23298165, + "epoch": 0.7244851946490305, + "flos": 17967114524160.0, + "grad_norm": 7.607850557255949, + "language_loss": 0.71443379, + "learning_rate": 7.446864039779258e-07, + "loss": 0.72975904, + "num_input_tokens_seen": 259984865, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.25891113, + "step": 12050, + "time_per_iteration": 2.6474475860595703 + }, + { + "auxiliary_loss_clip": 0.01116972, + "auxiliary_loss_mlp": 0.00108793, + "balance_loss_clip": 0.96234995, + "balance_loss_mlp": 0.10149699, + "epoch": 0.7245453179016985, + "flos": 70943649603840.0, + "grad_norm": 0.7148443151744304, + "language_loss": 0.52273166, + "learning_rate": 7.443832356902528e-07, + "loss": 0.53498936, + "num_input_tokens_seen": 260046735, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.07275391, + "step": 12051, + "time_per_iteration": 4.623785972595215 + }, + { + "auxiliary_loss_clip": 0.01243553, + "auxiliary_loss_mlp": 0.00252252, + "balance_loss_clip": 1.02909935, + "balance_loss_mlp": 0.22670534, + "epoch": 0.7246054411543664, + "flos": 24568464867840.0, + "grad_norm": 3.378749155927843, + "language_loss": 0.78021979, + "learning_rate": 7.440801150156927e-07, + "loss": 0.79517782, + "num_input_tokens_seen": 260067950, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.25549316, + "step": 12052, + "time_per_iteration": 2.6937434673309326 + }, + { + "auxiliary_loss_clip": 0.01251612, + "auxiliary_loss_mlp": 0.00238259, + "balance_loss_clip": 1.0302484, + "balance_loss_mlp": 0.21113876, + "epoch": 0.7246655644070344, + "flos": 32338312548480.0, + "grad_norm": 11.282816127506127, + "language_loss": 0.80879098, + "learning_rate": 7.437770419657415e-07, + "loss": 0.8236897, + "num_input_tokens_seen": 260087730, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.2713623, + "step": 12053, + "time_per_iteration": 2.7403669357299805 + }, + { + "auxiliary_loss_clip": 0.01269423, + "auxiliary_loss_mlp": 0.00230777, + "balance_loss_clip": 1.0451355, + "balance_loss_mlp": 0.20484917, + "epoch": 0.7247256876597024, + "flos": 21872471713920.0, + "grad_norm": 4.3815430562959055, + "language_loss": 0.87816846, + "learning_rate": 7.434740165518898e-07, + "loss": 0.89317054, + "num_input_tokens_seen": 260107760, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.2590332, + "step": 12054, + "time_per_iteration": 2.653071641921997 + }, + { + "auxiliary_loss_clip": 0.01249155, + "auxiliary_loss_mlp": 0.00241343, + "balance_loss_clip": 1.03106713, + "balance_loss_mlp": 0.21621388, + "epoch": 0.7247858109123704, + "flos": 16213093585920.0, + "grad_norm": 9.556909149608172, + "language_loss": 0.79154909, + "learning_rate": 7.431710387856301e-07, + "loss": 0.80645406, + "num_input_tokens_seen": 260123660, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.2512207, + "step": 12055, + "time_per_iteration": 2.6758198738098145 + }, + { + "auxiliary_loss_clip": 0.01252814, + "auxiliary_loss_mlp": 0.00242606, + "balance_loss_clip": 1.03506732, + "balance_loss_mlp": 0.21845451, + "epoch": 0.7248459341650383, + "flos": 20850705434880.0, + "grad_norm": 68.77276886785229, + "language_loss": 0.81238157, + "learning_rate": 7.428681086784496e-07, + "loss": 0.82733577, + "num_input_tokens_seen": 260142690, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.24157715, + "step": 12056, + "time_per_iteration": 4.080258131027222 + }, + { + "auxiliary_loss_clip": 0.0125064, + "auxiliary_loss_mlp": 0.00225557, + "balance_loss_clip": 1.03224885, + "balance_loss_mlp": 0.20105913, + "epoch": 0.7249060574177063, + "flos": 25921794614400.0, + "grad_norm": 6.303727030519149, + "language_loss": 0.77806783, + "learning_rate": 7.425652262418368e-07, + "loss": 0.79282987, + "num_input_tokens_seen": 260162590, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.24487305, + "step": 12057, + "time_per_iteration": 2.7776424884796143 + }, + { + "auxiliary_loss_clip": 0.01263628, + "auxiliary_loss_mlp": 0.00247192, + "balance_loss_clip": 1.04024279, + "balance_loss_mlp": 0.22104931, + "epoch": 0.7249661806703742, + "flos": 17345536646400.0, + "grad_norm": 4.345539200625184, + "language_loss": 0.70928895, + "learning_rate": 7.42262391487277e-07, + "loss": 0.72439706, + "num_input_tokens_seen": 260181065, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.26135254, + "step": 12058, + "time_per_iteration": 2.69521164894104 + }, + { + "auxiliary_loss_clip": 0.01266325, + "auxiliary_loss_mlp": 0.00248036, + "balance_loss_clip": 1.042153, + "balance_loss_mlp": 0.22296628, + "epoch": 0.7250263039230422, + "flos": 19574153009280.0, + "grad_norm": 26.976626070991806, + "language_loss": 0.81024867, + "learning_rate": 7.419596044262535e-07, + "loss": 0.82539225, + "num_input_tokens_seen": 260200330, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.25073242, + "step": 12059, + "time_per_iteration": 2.6649599075317383 + }, + { + "auxiliary_loss_clip": 0.01227025, + "auxiliary_loss_mlp": 0.00233844, + "balance_loss_clip": 1.01880765, + "balance_loss_mlp": 0.21046749, + "epoch": 0.7250864271757103, + "flos": 21976648133760.0, + "grad_norm": 12.63449340063702, + "language_loss": 0.84425902, + "learning_rate": 7.416568650702472e-07, + "loss": 0.8588677, + "num_input_tokens_seen": 260219975, + "router_z_loss_clip": 2.08203125, + "router_z_loss_mlp": 0.23388672, + "step": 12060, + "time_per_iteration": 2.7001724243164062 + }, + { + "auxiliary_loss_clip": 0.01238865, + "auxiliary_loss_mlp": 0.0022635, + "balance_loss_clip": 1.01957941, + "balance_loss_mlp": 0.20201969, + "epoch": 0.7251465504283782, + "flos": 25012608537600.0, + "grad_norm": 65.27919795730367, + "language_loss": 0.82523793, + "learning_rate": 7.413541734307393e-07, + "loss": 0.83989006, + "num_input_tokens_seen": 260242025, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.24353027, + "step": 12061, + "time_per_iteration": 2.8386805057525635 + }, + { + "auxiliary_loss_clip": 0.01248402, + "auxiliary_loss_mlp": 0.00248076, + "balance_loss_clip": 1.03756571, + "balance_loss_mlp": 0.22400814, + "epoch": 0.7252066736810462, + "flos": 16690131135360.0, + "grad_norm": 81.45767526515074, + "language_loss": 0.86743212, + "learning_rate": 7.410515295192068e-07, + "loss": 0.88239688, + "num_input_tokens_seen": 260260015, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.24072266, + "step": 12062, + "time_per_iteration": 2.675107955932617 + }, + { + "auxiliary_loss_clip": 0.01285405, + "auxiliary_loss_mlp": 0.0026109, + "balance_loss_clip": 1.05607128, + "balance_loss_mlp": 0.23342109, + "epoch": 0.7252667969337141, + "flos": 25703026830720.0, + "grad_norm": 13.638672484690883, + "language_loss": 0.8057844, + "learning_rate": 7.407489333471262e-07, + "loss": 0.82124937, + "num_input_tokens_seen": 260278635, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.27661133, + "step": 12063, + "time_per_iteration": 2.690988063812256 + }, + { + "auxiliary_loss_clip": 0.01240893, + "auxiliary_loss_mlp": 0.00235042, + "balance_loss_clip": 1.02335787, + "balance_loss_mlp": 0.20914981, + "epoch": 0.7253269201863821, + "flos": 18259930195200.0, + "grad_norm": 1001.2335629438238, + "language_loss": 0.77250719, + "learning_rate": 7.40446384925973e-07, + "loss": 0.78726649, + "num_input_tokens_seen": 260298510, + "router_z_loss_clip": 2.17480469, + "router_z_loss_mlp": 0.25891113, + "step": 12064, + "time_per_iteration": 2.6365723609924316 + }, + { + "auxiliary_loss_clip": 0.01246427, + "auxiliary_loss_mlp": 0.002567, + "balance_loss_clip": 1.02759624, + "balance_loss_mlp": 0.23170227, + "epoch": 0.72538704343905, + "flos": 20411805150720.0, + "grad_norm": 9.056626644501517, + "language_loss": 0.98317873, + "learning_rate": 7.401438842672192e-07, + "loss": 0.99821001, + "num_input_tokens_seen": 260317405, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.24975586, + "step": 12065, + "time_per_iteration": 2.589937210083008 + }, + { + "auxiliary_loss_clip": 0.01089864, + "auxiliary_loss_mlp": 0.0006551, + "balance_loss_clip": 0.9403888, + "balance_loss_mlp": 0.05897757, + "epoch": 0.725447166691718, + "flos": 70151209706880.0, + "grad_norm": 3.6546765283462435, + "language_loss": 0.55665648, + "learning_rate": 7.398414313823349e-07, + "loss": 0.56821024, + "num_input_tokens_seen": 260388085, + "router_z_loss_clip": 1.4921875, + "router_z_loss_mlp": 0.06542969, + "step": 12066, + "time_per_iteration": 3.334608793258667 + }, + { + "auxiliary_loss_clip": 0.01250917, + "auxiliary_loss_mlp": 0.00246661, + "balance_loss_clip": 1.03526139, + "balance_loss_mlp": 0.22178163, + "epoch": 0.725507289944386, + "flos": 27052334254080.0, + "grad_norm": 13.573734116625271, + "language_loss": 0.83150285, + "learning_rate": 7.395390262827897e-07, + "loss": 0.84647858, + "num_input_tokens_seen": 260406165, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.24865723, + "step": 12067, + "time_per_iteration": 2.669015407562256 + }, + { + "auxiliary_loss_clip": 0.01093861, + "auxiliary_loss_mlp": 0.00121592, + "balance_loss_clip": 0.94269562, + "balance_loss_mlp": 0.11453526, + "epoch": 0.725567413197054, + "flos": 62921924778240.0, + "grad_norm": 0.7150106117422902, + "language_loss": 0.56329787, + "learning_rate": 7.392366689800515e-07, + "loss": 0.57545245, + "num_input_tokens_seen": 260461365, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.07080078, + "step": 12068, + "time_per_iteration": 3.0415451526641846 + }, + { + "auxiliary_loss_clip": 0.01094892, + "auxiliary_loss_mlp": 0.00129063, + "balance_loss_clip": 0.9435392, + "balance_loss_mlp": 0.12210134, + "epoch": 0.7256275364497219, + "flos": 60295957188480.0, + "grad_norm": 0.6555718883480811, + "language_loss": 0.55106884, + "learning_rate": 7.389343594855848e-07, + "loss": 0.56330836, + "num_input_tokens_seen": 260523795, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.06982422, + "step": 12069, + "time_per_iteration": 3.1353037357330322 + }, + { + "auxiliary_loss_clip": 0.0123998, + "auxiliary_loss_mlp": 0.00232529, + "balance_loss_clip": 1.02880526, + "balance_loss_mlp": 0.20966437, + "epoch": 0.7256876597023899, + "flos": 24498511130880.0, + "grad_norm": 3.22236282966861, + "language_loss": 0.87662464, + "learning_rate": 7.38632097810854e-07, + "loss": 0.89134973, + "num_input_tokens_seen": 260544765, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.2286377, + "step": 12070, + "time_per_iteration": 2.6457509994506836 + }, + { + "auxiliary_loss_clip": 0.01243859, + "auxiliary_loss_mlp": 0.00224169, + "balance_loss_clip": 1.02886677, + "balance_loss_mlp": 0.20024347, + "epoch": 0.7257477829550578, + "flos": 24352749740160.0, + "grad_norm": 2.2172236758770354, + "language_loss": 0.79382384, + "learning_rate": 7.383298839673197e-07, + "loss": 0.8085041, + "num_input_tokens_seen": 260564340, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.23925781, + "step": 12071, + "time_per_iteration": 2.6753342151641846 + }, + { + "auxiliary_loss_clip": 0.01242259, + "auxiliary_loss_mlp": 0.00238651, + "balance_loss_clip": 1.03244984, + "balance_loss_mlp": 0.21469031, + "epoch": 0.7258079062077258, + "flos": 17202217380480.0, + "grad_norm": 7.945172261506096, + "language_loss": 0.75695908, + "learning_rate": 7.380277179664436e-07, + "loss": 0.77176821, + "num_input_tokens_seen": 260582565, + "router_z_loss_clip": 2.09667969, + "router_z_loss_mlp": 0.23962402, + "step": 12072, + "time_per_iteration": 2.727036237716675 + }, + { + "auxiliary_loss_clip": 0.01258998, + "auxiliary_loss_mlp": 0.00236676, + "balance_loss_clip": 1.0352037, + "balance_loss_mlp": 0.21186891, + "epoch": 0.7258680294603939, + "flos": 21580338401280.0, + "grad_norm": 10.299992987992185, + "language_loss": 0.83817196, + "learning_rate": 7.377255998196821e-07, + "loss": 0.85312873, + "num_input_tokens_seen": 260601700, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.2479248, + "step": 12073, + "time_per_iteration": 2.738556385040283 + }, + { + "auxiliary_loss_clip": 0.0126266, + "auxiliary_loss_mlp": 0.0025568, + "balance_loss_clip": 1.04257727, + "balance_loss_mlp": 0.23010972, + "epoch": 0.7259281527130618, + "flos": 34855399036800.0, + "grad_norm": 11.678413389618562, + "language_loss": 0.77105212, + "learning_rate": 7.374235295384923e-07, + "loss": 0.78623557, + "num_input_tokens_seen": 260623040, + "router_z_loss_clip": 2.19628906, + "router_z_loss_mlp": 0.25585938, + "step": 12074, + "time_per_iteration": 2.9134140014648438 + }, + { + "auxiliary_loss_clip": 0.01254104, + "auxiliary_loss_mlp": 0.00238183, + "balance_loss_clip": 1.03729093, + "balance_loss_mlp": 0.21277916, + "epoch": 0.7259882759657298, + "flos": 25404644551680.0, + "grad_norm": 8.185945630682946, + "language_loss": 0.80456036, + "learning_rate": 7.371215071343302e-07, + "loss": 0.81948316, + "num_input_tokens_seen": 260642735, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.25402832, + "step": 12075, + "time_per_iteration": 2.74428653717041 + }, + { + "auxiliary_loss_clip": 0.01256035, + "auxiliary_loss_mlp": 0.00260344, + "balance_loss_clip": 1.03316331, + "balance_loss_mlp": 0.23304555, + "epoch": 0.7260483992183977, + "flos": 62953630531200.0, + "grad_norm": 23.584045513066417, + "language_loss": 0.70743024, + "learning_rate": 7.368195326186458e-07, + "loss": 0.72259402, + "num_input_tokens_seen": 260669935, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.27355957, + "step": 12076, + "time_per_iteration": 3.070871114730835 + }, + { + "auxiliary_loss_clip": 0.01253907, + "auxiliary_loss_mlp": 0.00241512, + "balance_loss_clip": 1.03565693, + "balance_loss_mlp": 0.21735997, + "epoch": 0.7261085224710657, + "flos": 26467528924800.0, + "grad_norm": 19.495163140102978, + "language_loss": 0.86088657, + "learning_rate": 7.365176060028912e-07, + "loss": 0.87584072, + "num_input_tokens_seen": 260689605, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.24157715, + "step": 12077, + "time_per_iteration": 2.712242364883423 + }, + { + "auxiliary_loss_clip": 0.01097362, + "auxiliary_loss_mlp": 0.00092578, + "balance_loss_clip": 0.94802356, + "balance_loss_mlp": 0.08470994, + "epoch": 0.7261686457237336, + "flos": 66772732187520.0, + "grad_norm": 0.8789776441714001, + "language_loss": 0.64651084, + "learning_rate": 7.362157272985163e-07, + "loss": 0.65841031, + "num_input_tokens_seen": 260748265, + "router_z_loss_clip": 1.4921875, + "router_z_loss_mlp": 0.07861328, + "step": 12078, + "time_per_iteration": 3.171923875808716 + }, + { + "auxiliary_loss_clip": 0.01101216, + "auxiliary_loss_mlp": 0.00092906, + "balance_loss_clip": 0.94899428, + "balance_loss_mlp": 0.08575378, + "epoch": 0.7262287689764017, + "flos": 69999594399360.0, + "grad_norm": 0.7023879364798159, + "language_loss": 0.58860755, + "learning_rate": 7.359138965169671e-07, + "loss": 0.60054874, + "num_input_tokens_seen": 260816715, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.07128906, + "step": 12079, + "time_per_iteration": 3.2756714820861816 + }, + { + "auxiliary_loss_clip": 0.01255845, + "auxiliary_loss_mlp": 0.00267513, + "balance_loss_clip": 1.03924131, + "balance_loss_mlp": 0.24170418, + "epoch": 0.7262888922290696, + "flos": 23805435231360.0, + "grad_norm": 11.697391168651972, + "language_loss": 0.74452758, + "learning_rate": 7.356121136696895e-07, + "loss": 0.75976121, + "num_input_tokens_seen": 260836765, + "router_z_loss_clip": 2.16894531, + "router_z_loss_mlp": 0.25805664, + "step": 12080, + "time_per_iteration": 2.681201696395874 + }, + { + "auxiliary_loss_clip": 0.01254445, + "auxiliary_loss_mlp": 0.00267848, + "balance_loss_clip": 1.03028321, + "balance_loss_mlp": 0.24156255, + "epoch": 0.7263490154817376, + "flos": 19500320603520.0, + "grad_norm": 1353.7176604243023, + "language_loss": 0.81053519, + "learning_rate": 7.35310378768128e-07, + "loss": 0.8257581, + "num_input_tokens_seen": 260854610, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.26306152, + "step": 12081, + "time_per_iteration": 2.6446871757507324 + }, + { + "auxiliary_loss_clip": 0.01241769, + "auxiliary_loss_mlp": 0.00232757, + "balance_loss_clip": 1.02619338, + "balance_loss_mlp": 0.20719807, + "epoch": 0.7264091387344055, + "flos": 16286243633280.0, + "grad_norm": 11.711055465487718, + "language_loss": 0.89375991, + "learning_rate": 7.350086918237237e-07, + "loss": 0.9085052, + "num_input_tokens_seen": 260871620, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.25549316, + "step": 12082, + "time_per_iteration": 2.6734776496887207 + }, + { + "auxiliary_loss_clip": 0.0127715, + "auxiliary_loss_mlp": 0.00235681, + "balance_loss_clip": 1.04515159, + "balance_loss_mlp": 0.20757174, + "epoch": 0.7264692619870735, + "flos": 24352031468160.0, + "grad_norm": 10.6639709717113, + "language_loss": 0.86087918, + "learning_rate": 7.347070528479158e-07, + "loss": 0.87600756, + "num_input_tokens_seen": 260890490, + "router_z_loss_clip": 2.32226562, + "router_z_loss_mlp": 0.28125, + "step": 12083, + "time_per_iteration": 2.7450032234191895 + }, + { + "auxiliary_loss_clip": 0.01266354, + "auxiliary_loss_mlp": 0.00259185, + "balance_loss_clip": 1.04331684, + "balance_loss_mlp": 0.23328078, + "epoch": 0.7265293852397414, + "flos": 25119478477440.0, + "grad_norm": 3.665624236241452, + "language_loss": 0.80432618, + "learning_rate": 7.344054618521433e-07, + "loss": 0.81958157, + "num_input_tokens_seen": 260909700, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.2590332, + "step": 12084, + "time_per_iteration": 2.6886215209960938 + }, + { + "auxiliary_loss_clip": 0.01261847, + "auxiliary_loss_mlp": 0.00239779, + "balance_loss_clip": 1.04004526, + "balance_loss_mlp": 0.21448243, + "epoch": 0.7265895084924094, + "flos": 22638230784000.0, + "grad_norm": 3.098585695528353, + "language_loss": 0.85272646, + "learning_rate": 7.34103918847843e-07, + "loss": 0.86774266, + "num_input_tokens_seen": 260929090, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.25292969, + "step": 12085, + "time_per_iteration": 2.688434600830078 + }, + { + "auxiliary_loss_clip": 0.01264753, + "auxiliary_loss_mlp": 0.00237633, + "balance_loss_clip": 1.0411768, + "balance_loss_mlp": 0.21207437, + "epoch": 0.7266496317450775, + "flos": 23368222886400.0, + "grad_norm": 8.714331915310572, + "language_loss": 0.791704, + "learning_rate": 7.338024238464493e-07, + "loss": 0.80672789, + "num_input_tokens_seen": 260946615, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.25598145, + "step": 12086, + "time_per_iteration": 2.633610248565674 + }, + { + "auxiliary_loss_clip": 0.01253618, + "auxiliary_loss_mlp": 0.00234968, + "balance_loss_clip": 1.03811932, + "balance_loss_mlp": 0.20977867, + "epoch": 0.7267097549977454, + "flos": 28074603323520.0, + "grad_norm": 11.323773298052332, + "language_loss": 0.77264804, + "learning_rate": 7.335009768593938e-07, + "loss": 0.78753388, + "num_input_tokens_seen": 260968515, + "router_z_loss_clip": 2.15332031, + "router_z_loss_mlp": 0.25158691, + "step": 12087, + "time_per_iteration": 2.726531982421875 + }, + { + "auxiliary_loss_clip": 0.0128475, + "auxiliary_loss_mlp": 0.00276867, + "balance_loss_clip": 1.05714989, + "balance_loss_mlp": 0.24903116, + "epoch": 0.7267698782504134, + "flos": 22195523658240.0, + "grad_norm": 7.21447526576546, + "language_loss": 0.86662984, + "learning_rate": 7.331995778981088e-07, + "loss": 0.88224602, + "num_input_tokens_seen": 260986790, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.27832031, + "step": 12088, + "time_per_iteration": 2.672757148742676 + }, + { + "auxiliary_loss_clip": 0.01261395, + "auxiliary_loss_mlp": 0.00248747, + "balance_loss_clip": 1.03851187, + "balance_loss_mlp": 0.22136509, + "epoch": 0.7268300015030813, + "flos": 18514859996160.0, + "grad_norm": 12.923488893392822, + "language_loss": 0.81380951, + "learning_rate": 7.328982269740221e-07, + "loss": 0.82891089, + "num_input_tokens_seen": 261004925, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.27355957, + "step": 12089, + "time_per_iteration": 4.1307213306427 + }, + { + "auxiliary_loss_clip": 0.01261456, + "auxiliary_loss_mlp": 0.00233135, + "balance_loss_clip": 1.04430008, + "balance_loss_mlp": 0.20849448, + "epoch": 0.7268901247557493, + "flos": 23986029836160.0, + "grad_norm": 35.654303371393006, + "language_loss": 0.79146791, + "learning_rate": 7.325969240985616e-07, + "loss": 0.80641389, + "num_input_tokens_seen": 261023895, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.24633789, + "step": 12090, + "time_per_iteration": 4.115847826004028 + }, + { + "auxiliary_loss_clip": 0.01253213, + "auxiliary_loss_mlp": 0.00241123, + "balance_loss_clip": 1.03807831, + "balance_loss_mlp": 0.21471855, + "epoch": 0.7269502480084172, + "flos": 32088087429120.0, + "grad_norm": 9.928690696610657, + "language_loss": 0.84656465, + "learning_rate": 7.322956692831528e-07, + "loss": 0.86150807, + "num_input_tokens_seen": 261045445, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.26403809, + "step": 12091, + "time_per_iteration": 2.818190813064575 + }, + { + "auxiliary_loss_clip": 0.01259447, + "auxiliary_loss_mlp": 0.0022384, + "balance_loss_clip": 1.04024971, + "balance_loss_mlp": 0.1995337, + "epoch": 0.7270103712610853, + "flos": 19062785036160.0, + "grad_norm": 13.834609043654309, + "language_loss": 0.79517519, + "learning_rate": 7.319944625392205e-07, + "loss": 0.81000811, + "num_input_tokens_seen": 261064275, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.24316406, + "step": 12092, + "time_per_iteration": 2.6594552993774414 + }, + { + "auxiliary_loss_clip": 0.01267909, + "auxiliary_loss_mlp": 0.00227182, + "balance_loss_clip": 1.04900861, + "balance_loss_mlp": 0.20355485, + "epoch": 0.7270704945137532, + "flos": 34532921710080.0, + "grad_norm": 25.099596956599733, + "language_loss": 0.70042276, + "learning_rate": 7.31693303878184e-07, + "loss": 0.71537369, + "num_input_tokens_seen": 261083310, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.23620605, + "step": 12093, + "time_per_iteration": 4.21579384803772 + }, + { + "auxiliary_loss_clip": 0.01269741, + "auxiliary_loss_mlp": 0.00241915, + "balance_loss_clip": 1.0534668, + "balance_loss_mlp": 0.21839526, + "epoch": 0.7271306177664212, + "flos": 21507583403520.0, + "grad_norm": 8.501364100824022, + "language_loss": 0.80477196, + "learning_rate": 7.313921933114644e-07, + "loss": 0.81988853, + "num_input_tokens_seen": 261103460, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.23522949, + "step": 12094, + "time_per_iteration": 2.6825344562530518 + }, + { + "auxiliary_loss_clip": 0.01240656, + "auxiliary_loss_mlp": 0.00212874, + "balance_loss_clip": 1.03211808, + "balance_loss_mlp": 0.1905939, + "epoch": 0.7271907410190891, + "flos": 22272444633600.0, + "grad_norm": 44.84208940070094, + "language_loss": 0.93193215, + "learning_rate": 7.310911308504808e-07, + "loss": 0.9464674, + "num_input_tokens_seen": 261121375, + "router_z_loss_clip": 2.0859375, + "router_z_loss_mlp": 0.22265625, + "step": 12095, + "time_per_iteration": 2.671861410140991 + }, + { + "auxiliary_loss_clip": 0.01277523, + "auxiliary_loss_mlp": 0.00237402, + "balance_loss_clip": 1.05369389, + "balance_loss_mlp": 0.20981744, + "epoch": 0.7272508642717571, + "flos": 22893124671360.0, + "grad_norm": 4.664460477296896, + "language_loss": 0.87002295, + "learning_rate": 7.307901165066479e-07, + "loss": 0.88517225, + "num_input_tokens_seen": 261141105, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.27600098, + "step": 12096, + "time_per_iteration": 2.673971176147461 + }, + { + "auxiliary_loss_clip": 0.01271215, + "auxiliary_loss_mlp": 0.00227754, + "balance_loss_clip": 1.05008245, + "balance_loss_mlp": 0.20261317, + "epoch": 0.727310987524425, + "flos": 11655886331520.0, + "grad_norm": 6.731083439545469, + "language_loss": 0.8216356, + "learning_rate": 7.30489150291381e-07, + "loss": 0.83662534, + "num_input_tokens_seen": 261159255, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.25109863, + "step": 12097, + "time_per_iteration": 2.7495741844177246 + }, + { + "auxiliary_loss_clip": 0.01285212, + "auxiliary_loss_mlp": 0.0026012, + "balance_loss_clip": 1.05725908, + "balance_loss_mlp": 0.23121193, + "epoch": 0.727371110777093, + "flos": 24535319592960.0, + "grad_norm": 2.238590421368581, + "language_loss": 0.86884624, + "learning_rate": 7.301882322160935e-07, + "loss": 0.88429952, + "num_input_tokens_seen": 261177960, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.28918457, + "step": 12098, + "time_per_iteration": 4.096341371536255 + }, + { + "auxiliary_loss_clip": 0.01275339, + "auxiliary_loss_mlp": 0.00261135, + "balance_loss_clip": 1.04681957, + "balance_loss_mlp": 0.23409879, + "epoch": 0.7274312340297611, + "flos": 74739835405440.0, + "grad_norm": 39.489435095921756, + "language_loss": 0.76474637, + "learning_rate": 7.298873622921952e-07, + "loss": 0.78011107, + "num_input_tokens_seen": 261205660, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.27038574, + "step": 12099, + "time_per_iteration": 3.1368422508239746 + }, + { + "auxiliary_loss_clip": 0.01291622, + "auxiliary_loss_mlp": 0.00264759, + "balance_loss_clip": 1.05453777, + "balance_loss_mlp": 0.2349565, + "epoch": 0.727491357282429, + "flos": 22342865247360.0, + "grad_norm": 411.16012032047, + "language_loss": 0.82548571, + "learning_rate": 7.29586540531095e-07, + "loss": 0.84104949, + "num_input_tokens_seen": 261225185, + "router_z_loss_clip": 2.37109375, + "router_z_loss_mlp": 0.2980957, + "step": 12100, + "time_per_iteration": 2.7505297660827637 + }, + { + "auxiliary_loss_clip": 0.0128041, + "auxiliary_loss_mlp": 0.00242168, + "balance_loss_clip": 1.06242013, + "balance_loss_mlp": 0.21672916, + "epoch": 0.727551480535097, + "flos": 23297550877440.0, + "grad_norm": 2.972072811239429, + "language_loss": 0.81146657, + "learning_rate": 7.292857669442005e-07, + "loss": 0.82669234, + "num_input_tokens_seen": 261247965, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.25439453, + "step": 12101, + "time_per_iteration": 2.7899773120880127 + }, + { + "auxiliary_loss_clip": 0.01258146, + "auxiliary_loss_mlp": 0.00218798, + "balance_loss_clip": 1.04257107, + "balance_loss_mlp": 0.19592139, + "epoch": 0.7276116037877649, + "flos": 21470559459840.0, + "grad_norm": 15.479879239277544, + "language_loss": 0.89413977, + "learning_rate": 7.289850415429177e-07, + "loss": 0.9089092, + "num_input_tokens_seen": 261267585, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.22888184, + "step": 12102, + "time_per_iteration": 2.6946518421173096 + }, + { + "auxiliary_loss_clip": 0.01258904, + "auxiliary_loss_mlp": 0.00225409, + "balance_loss_clip": 1.042539, + "balance_loss_mlp": 0.20069724, + "epoch": 0.7276717270404329, + "flos": 21464059098240.0, + "grad_norm": 7.11643406185164, + "language_loss": 0.87777305, + "learning_rate": 7.286843643386495e-07, + "loss": 0.89261615, + "num_input_tokens_seen": 261285200, + "router_z_loss_clip": 2.16503906, + "router_z_loss_mlp": 0.24707031, + "step": 12103, + "time_per_iteration": 2.725360631942749 + }, + { + "auxiliary_loss_clip": 0.01273293, + "auxiliary_loss_mlp": 0.0021955, + "balance_loss_clip": 1.05027652, + "balance_loss_mlp": 0.19424143, + "epoch": 0.7277318502931008, + "flos": 16837221329280.0, + "grad_norm": 23.181462925983205, + "language_loss": 0.76702893, + "learning_rate": 7.283837353427968e-07, + "loss": 0.78195739, + "num_input_tokens_seen": 261303645, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.25305176, + "step": 12104, + "time_per_iteration": 2.643568515777588 + }, + { + "auxiliary_loss_clip": 0.01264537, + "auxiliary_loss_mlp": 0.00243948, + "balance_loss_clip": 1.04474926, + "balance_loss_mlp": 0.21843696, + "epoch": 0.7277919735457689, + "flos": 33400550476800.0, + "grad_norm": 3.4796838070257308, + "language_loss": 0.75092298, + "learning_rate": 7.280831545667611e-07, + "loss": 0.76600778, + "num_input_tokens_seen": 261323265, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.25537109, + "step": 12105, + "time_per_iteration": 2.7491822242736816 + }, + { + "auxiliary_loss_clip": 0.01267182, + "auxiliary_loss_mlp": 0.00239707, + "balance_loss_clip": 1.04561365, + "balance_loss_mlp": 0.21227714, + "epoch": 0.7278520967984368, + "flos": 19206499351680.0, + "grad_norm": 3.547716866409734, + "language_loss": 0.82550406, + "learning_rate": 7.27782622021939e-07, + "loss": 0.84057295, + "num_input_tokens_seen": 261339745, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.27429199, + "step": 12106, + "time_per_iteration": 2.607492208480835 + }, + { + "auxiliary_loss_clip": 0.01285073, + "auxiliary_loss_mlp": 0.00243678, + "balance_loss_clip": 1.06138706, + "balance_loss_mlp": 0.21699874, + "epoch": 0.7279122200511048, + "flos": 34094667870720.0, + "grad_norm": 334.13393317773574, + "language_loss": 0.79613519, + "learning_rate": 7.274821377197273e-07, + "loss": 0.81142271, + "num_input_tokens_seen": 261359310, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.26660156, + "step": 12107, + "time_per_iteration": 2.752103328704834 + }, + { + "auxiliary_loss_clip": 0.01260349, + "auxiliary_loss_mlp": 0.00219, + "balance_loss_clip": 1.04326308, + "balance_loss_mlp": 0.19369218, + "epoch": 0.7279723433037727, + "flos": 54599049348480.0, + "grad_norm": 28.11152081961286, + "language_loss": 0.82603687, + "learning_rate": 7.271817016715205e-07, + "loss": 0.84083039, + "num_input_tokens_seen": 261384640, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.25280762, + "step": 12108, + "time_per_iteration": 2.936485767364502 + }, + { + "auxiliary_loss_clip": 0.01287245, + "auxiliary_loss_mlp": 0.00238711, + "balance_loss_clip": 1.06352949, + "balance_loss_mlp": 0.21148404, + "epoch": 0.7280324665564407, + "flos": 36137482156800.0, + "grad_norm": 13.903603057504498, + "language_loss": 0.7230953, + "learning_rate": 7.268813138887124e-07, + "loss": 0.7383548, + "num_input_tokens_seen": 261405290, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.27209473, + "step": 12109, + "time_per_iteration": 2.768984317779541 + }, + { + "auxiliary_loss_clip": 0.01272367, + "auxiliary_loss_mlp": 0.00260698, + "balance_loss_clip": 1.05097234, + "balance_loss_mlp": 0.23305303, + "epoch": 0.7280925898091086, + "flos": 11618539165440.0, + "grad_norm": 9.346265747416028, + "language_loss": 0.74570274, + "learning_rate": 7.265809743826912e-07, + "loss": 0.76103342, + "num_input_tokens_seen": 261419710, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.27661133, + "step": 12110, + "time_per_iteration": 2.5977776050567627 + }, + { + "auxiliary_loss_clip": 0.01280578, + "auxiliary_loss_mlp": 0.0024082, + "balance_loss_clip": 1.05403578, + "balance_loss_mlp": 0.21527302, + "epoch": 0.7281527130617766, + "flos": 34277094069120.0, + "grad_norm": 30.24648436924254, + "language_loss": 0.68184721, + "learning_rate": 7.26280683164847e-07, + "loss": 0.69706118, + "num_input_tokens_seen": 261442385, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.25524902, + "step": 12111, + "time_per_iteration": 2.747955322265625 + }, + { + "auxiliary_loss_clip": 0.01294927, + "auxiliary_loss_mlp": 0.00229834, + "balance_loss_clip": 1.06058788, + "balance_loss_mlp": 0.20237993, + "epoch": 0.7282128363144446, + "flos": 13918043018880.0, + "grad_norm": 2400.037514326801, + "language_loss": 0.86554652, + "learning_rate": 7.259804402465677e-07, + "loss": 0.88079411, + "num_input_tokens_seen": 261459805, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.27453613, + "step": 12112, + "time_per_iteration": 2.6613457202911377 + }, + { + "auxiliary_loss_clip": 0.01267185, + "auxiliary_loss_mlp": 0.00231431, + "balance_loss_clip": 1.04775906, + "balance_loss_mlp": 0.20652843, + "epoch": 0.7282729595671126, + "flos": 20777627214720.0, + "grad_norm": 16.89572788049301, + "language_loss": 0.74356925, + "learning_rate": 7.25680245639237e-07, + "loss": 0.75855541, + "num_input_tokens_seen": 261477175, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.24914551, + "step": 12113, + "time_per_iteration": 2.702554702758789 + }, + { + "auxiliary_loss_clip": 0.01272549, + "auxiliary_loss_mlp": 0.00233295, + "balance_loss_clip": 1.04925847, + "balance_loss_mlp": 0.20668702, + "epoch": 0.7283330828197806, + "flos": 16325422392960.0, + "grad_norm": 44.369101287803964, + "language_loss": 0.80753446, + "learning_rate": 7.253800993542399e-07, + "loss": 0.82259291, + "num_input_tokens_seen": 261494990, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.26635742, + "step": 12114, + "time_per_iteration": 2.623629093170166 + }, + { + "auxiliary_loss_clip": 0.01257588, + "auxiliary_loss_mlp": 0.00245327, + "balance_loss_clip": 1.04145026, + "balance_loss_mlp": 0.21794438, + "epoch": 0.7283932060724485, + "flos": 27490193043840.0, + "grad_norm": 36.993132534421, + "language_loss": 0.76103586, + "learning_rate": 7.250800014029564e-07, + "loss": 0.77606499, + "num_input_tokens_seen": 261514445, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.27380371, + "step": 12115, + "time_per_iteration": 2.779085874557495 + }, + { + "auxiliary_loss_clip": 0.01286753, + "auxiliary_loss_mlp": 0.00227305, + "balance_loss_clip": 1.0594728, + "balance_loss_mlp": 0.20019686, + "epoch": 0.7284533293251165, + "flos": 18367877543040.0, + "grad_norm": 405.09157506531295, + "language_loss": 0.68120849, + "learning_rate": 7.247799517967674e-07, + "loss": 0.69634908, + "num_input_tokens_seen": 261533565, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.27087402, + "step": 12116, + "time_per_iteration": 2.732276439666748 + }, + { + "auxiliary_loss_clip": 0.01295119, + "auxiliary_loss_mlp": 0.00244203, + "balance_loss_clip": 1.068488, + "balance_loss_mlp": 0.21434067, + "epoch": 0.7285134525777844, + "flos": 21725525174400.0, + "grad_norm": 9.506354351601134, + "language_loss": 0.81813252, + "learning_rate": 7.2447995054705e-07, + "loss": 0.83352578, + "num_input_tokens_seen": 261553795, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.29870605, + "step": 12117, + "time_per_iteration": 2.682467460632324 + }, + { + "auxiliary_loss_clip": 0.01280491, + "auxiliary_loss_mlp": 0.00245108, + "balance_loss_clip": 1.0593195, + "balance_loss_mlp": 0.21898963, + "epoch": 0.7285735758304525, + "flos": 20741357456640.0, + "grad_norm": 160.7165542498642, + "language_loss": 0.77343655, + "learning_rate": 7.241799976651807e-07, + "loss": 0.78869247, + "num_input_tokens_seen": 261572565, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.26135254, + "step": 12118, + "time_per_iteration": 2.736356735229492 + }, + { + "auxiliary_loss_clip": 0.01267246, + "auxiliary_loss_mlp": 0.00256369, + "balance_loss_clip": 1.05497098, + "balance_loss_mlp": 0.23300368, + "epoch": 0.7286336990831204, + "flos": 17310954827520.0, + "grad_norm": 146.89779056020166, + "language_loss": 0.91368973, + "learning_rate": 7.238800931625346e-07, + "loss": 0.92892587, + "num_input_tokens_seen": 261590910, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.23376465, + "step": 12119, + "time_per_iteration": 2.7646474838256836 + }, + { + "auxiliary_loss_clip": 0.0125879, + "auxiliary_loss_mlp": 0.00215246, + "balance_loss_clip": 1.04147148, + "balance_loss_mlp": 0.19060591, + "epoch": 0.7286938223357884, + "flos": 19787390098560.0, + "grad_norm": 117.53962686666505, + "language_loss": 0.92112124, + "learning_rate": 7.235802370504831e-07, + "loss": 0.93586153, + "num_input_tokens_seen": 261606005, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.24645996, + "step": 12120, + "time_per_iteration": 2.678436756134033 + }, + { + "auxiliary_loss_clip": 0.0126597, + "auxiliary_loss_mlp": 0.00215232, + "balance_loss_clip": 1.04909921, + "balance_loss_mlp": 0.19040096, + "epoch": 0.7287539455884563, + "flos": 15340859625600.0, + "grad_norm": 106.98424610063344, + "language_loss": 0.86961353, + "learning_rate": 7.232804293403963e-07, + "loss": 0.88442552, + "num_input_tokens_seen": 261622305, + "router_z_loss_clip": 2.17089844, + "router_z_loss_mlp": 0.24841309, + "step": 12121, + "time_per_iteration": 2.6567625999450684 + }, + { + "auxiliary_loss_clip": 0.01288645, + "auxiliary_loss_mlp": 0.00234258, + "balance_loss_clip": 1.05922365, + "balance_loss_mlp": 0.20760277, + "epoch": 0.7288140688411243, + "flos": 25192484870400.0, + "grad_norm": 8.186345709562667, + "language_loss": 0.78831816, + "learning_rate": 7.229806700436441e-07, + "loss": 0.80354726, + "num_input_tokens_seen": 261642465, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.26672363, + "step": 12122, + "time_per_iteration": 2.7476046085357666 + }, + { + "auxiliary_loss_clip": 0.01236315, + "auxiliary_loss_mlp": 0.00227351, + "balance_loss_clip": 1.02819109, + "balance_loss_mlp": 0.20381942, + "epoch": 0.7288741920937922, + "flos": 23984162328960.0, + "grad_norm": 20.501816134195007, + "language_loss": 0.93881905, + "learning_rate": 7.226809591715923e-07, + "loss": 0.95345581, + "num_input_tokens_seen": 261661420, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.23547363, + "step": 12123, + "time_per_iteration": 2.7230021953582764 + }, + { + "auxiliary_loss_clip": 0.01262324, + "auxiliary_loss_mlp": 0.00234401, + "balance_loss_clip": 1.04146433, + "balance_loss_mlp": 0.208652, + "epoch": 0.7289343153464602, + "flos": 22744921155840.0, + "grad_norm": 4.896195921757098, + "language_loss": 0.89278805, + "learning_rate": 7.223812967356065e-07, + "loss": 0.90775532, + "num_input_tokens_seen": 261680865, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.25756836, + "step": 12124, + "time_per_iteration": 2.665356159210205 + }, + { + "auxiliary_loss_clip": 0.01276073, + "auxiliary_loss_mlp": 0.00255583, + "balance_loss_clip": 1.05493462, + "balance_loss_mlp": 0.23100173, + "epoch": 0.7289944385991282, + "flos": 24900028335360.0, + "grad_norm": 2.5127645729155597, + "language_loss": 0.75379413, + "learning_rate": 7.220816827470499e-07, + "loss": 0.76911068, + "num_input_tokens_seen": 261701455, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.24572754, + "step": 12125, + "time_per_iteration": 2.7525475025177 + }, + { + "auxiliary_loss_clip": 0.01294423, + "auxiliary_loss_mlp": 0.00257196, + "balance_loss_clip": 1.0609014, + "balance_loss_mlp": 0.22921745, + "epoch": 0.7290545618517962, + "flos": 22967064817920.0, + "grad_norm": 1.90918260976406, + "language_loss": 0.82583708, + "learning_rate": 7.217821172172855e-07, + "loss": 0.8413533, + "num_input_tokens_seen": 261721260, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.27978516, + "step": 12126, + "time_per_iteration": 2.8510475158691406 + }, + { + "auxiliary_loss_clip": 0.0109545, + "auxiliary_loss_mlp": 0.00054484, + "balance_loss_clip": 0.94712412, + "balance_loss_mlp": 0.04747497, + "epoch": 0.7291146851044642, + "flos": 61901523216000.0, + "grad_norm": 0.8272256417272956, + "language_loss": 0.57929534, + "learning_rate": 7.2148260015767e-07, + "loss": 0.59079468, + "num_input_tokens_seen": 261779370, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.0703125, + "step": 12127, + "time_per_iteration": 3.1270980834960938 + }, + { + "auxiliary_loss_clip": 0.012569, + "auxiliary_loss_mlp": 0.00203943, + "balance_loss_clip": 1.0408206, + "balance_loss_mlp": 0.18145992, + "epoch": 0.7291748083571321, + "flos": 23330947547520.0, + "grad_norm": 195.3503783303676, + "language_loss": 0.78244293, + "learning_rate": 7.21183131579562e-07, + "loss": 0.79705143, + "num_input_tokens_seen": 261798050, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.22497559, + "step": 12128, + "time_per_iteration": 2.658074140548706 + }, + { + "auxiliary_loss_clip": 0.01256408, + "auxiliary_loss_mlp": 0.00210009, + "balance_loss_clip": 1.03474355, + "balance_loss_mlp": 0.18414059, + "epoch": 0.7292349316098001, + "flos": 28330000001280.0, + "grad_norm": 5.324848188690332, + "language_loss": 0.74013877, + "learning_rate": 7.20883711494319e-07, + "loss": 0.75480294, + "num_input_tokens_seen": 261817660, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.25866699, + "step": 12129, + "time_per_iteration": 2.712419271469116 + }, + { + "auxiliary_loss_clip": 0.01251271, + "auxiliary_loss_mlp": 0.0024342, + "balance_loss_clip": 1.03789699, + "balance_loss_mlp": 0.21871966, + "epoch": 0.729295054862468, + "flos": 24132222190080.0, + "grad_norm": 5.418545663474037, + "language_loss": 0.81653589, + "learning_rate": 7.205843399132927e-07, + "loss": 0.83148277, + "num_input_tokens_seen": 261837935, + "router_z_loss_clip": 2.13378906, + "router_z_loss_mlp": 0.24731445, + "step": 12130, + "time_per_iteration": 2.7416512966156006 + }, + { + "auxiliary_loss_clip": 0.01268837, + "auxiliary_loss_mlp": 0.00215431, + "balance_loss_clip": 1.04671371, + "balance_loss_mlp": 0.18922891, + "epoch": 0.7293551781151361, + "flos": 22816239609600.0, + "grad_norm": 2.952511640622813, + "language_loss": 0.78170836, + "learning_rate": 7.202850168478374e-07, + "loss": 0.79655111, + "num_input_tokens_seen": 261857575, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.26196289, + "step": 12131, + "time_per_iteration": 4.110085964202881 + }, + { + "auxiliary_loss_clip": 0.01252354, + "auxiliary_loss_mlp": 0.00228204, + "balance_loss_clip": 1.03707969, + "balance_loss_mlp": 0.20287196, + "epoch": 0.729415301367804, + "flos": 22126683242880.0, + "grad_norm": 5.709691268912697, + "language_loss": 0.84980553, + "learning_rate": 7.199857423093025e-07, + "loss": 0.86461115, + "num_input_tokens_seen": 261877265, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.2532959, + "step": 12132, + "time_per_iteration": 2.6924636363983154 + }, + { + "auxiliary_loss_clip": 0.0125906, + "auxiliary_loss_mlp": 0.00199988, + "balance_loss_clip": 1.04413831, + "balance_loss_mlp": 0.17533529, + "epoch": 0.729475424620472, + "flos": 12349608675840.0, + "grad_norm": 57.45528915869271, + "language_loss": 0.88451552, + "learning_rate": 7.196865163090358e-07, + "loss": 0.89910603, + "num_input_tokens_seen": 261893695, + "router_z_loss_clip": 2.15136719, + "router_z_loss_mlp": 0.24658203, + "step": 12133, + "time_per_iteration": 4.1014063358306885 + }, + { + "auxiliary_loss_clip": 0.01254546, + "auxiliary_loss_mlp": 0.00225997, + "balance_loss_clip": 1.04056239, + "balance_loss_mlp": 0.19980642, + "epoch": 0.7295355478731399, + "flos": 22195308176640.0, + "grad_norm": 21.99897843270376, + "language_loss": 0.80623585, + "learning_rate": 7.193873388583846e-07, + "loss": 0.82104135, + "num_input_tokens_seen": 261911825, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.26171875, + "step": 12134, + "time_per_iteration": 2.641759157180786 + }, + { + "auxiliary_loss_clip": 0.01282985, + "auxiliary_loss_mlp": 0.00238043, + "balance_loss_clip": 1.06008005, + "balance_loss_mlp": 0.21226948, + "epoch": 0.7295956711258079, + "flos": 23222030532480.0, + "grad_norm": 3.3257628560320813, + "language_loss": 0.78871799, + "learning_rate": 7.190882099686939e-07, + "loss": 0.80392826, + "num_input_tokens_seen": 261931190, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.25769043, + "step": 12135, + "time_per_iteration": 2.6655220985412598 + }, + { + "auxiliary_loss_clip": 0.0125792, + "auxiliary_loss_mlp": 0.0024135, + "balance_loss_clip": 1.0364908, + "balance_loss_mlp": 0.21637535, + "epoch": 0.7296557943784758, + "flos": 31869104163840.0, + "grad_norm": 17.645662666385217, + "language_loss": 0.7283504, + "learning_rate": 7.187891296513075e-07, + "loss": 0.74334311, + "num_input_tokens_seen": 261951240, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.24963379, + "step": 12136, + "time_per_iteration": 4.3085548877716064 + }, + { + "auxiliary_loss_clip": 0.01263145, + "auxiliary_loss_mlp": 0.00238278, + "balance_loss_clip": 1.04467785, + "balance_loss_mlp": 0.21422121, + "epoch": 0.7297159176311439, + "flos": 26651714889600.0, + "grad_norm": 33.888076463111965, + "language_loss": 0.81474495, + "learning_rate": 7.184900979175654e-07, + "loss": 0.82975912, + "num_input_tokens_seen": 261971605, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.24060059, + "step": 12137, + "time_per_iteration": 2.729807138442993 + }, + { + "auxiliary_loss_clip": 0.01281273, + "auxiliary_loss_mlp": 0.00220139, + "balance_loss_clip": 1.05800378, + "balance_loss_mlp": 0.19430621, + "epoch": 0.7297760408838118, + "flos": 24749562263040.0, + "grad_norm": 5.483370348608696, + "language_loss": 0.82298255, + "learning_rate": 7.181911147788069e-07, + "loss": 0.8379966, + "num_input_tokens_seen": 261990830, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.25817871, + "step": 12138, + "time_per_iteration": 2.7305657863616943 + }, + { + "auxiliary_loss_clip": 0.01260243, + "auxiliary_loss_mlp": 0.0022573, + "balance_loss_clip": 1.04041719, + "balance_loss_mlp": 0.20070788, + "epoch": 0.7298361641364798, + "flos": 18073768982400.0, + "grad_norm": 62.106252395720624, + "language_loss": 0.82268447, + "learning_rate": 7.178921802463702e-07, + "loss": 0.8375442, + "num_input_tokens_seen": 262008190, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.25024414, + "step": 12139, + "time_per_iteration": 2.6892127990722656 + }, + { + "auxiliary_loss_clip": 0.01259101, + "auxiliary_loss_mlp": 0.00236483, + "balance_loss_clip": 1.04480171, + "balance_loss_mlp": 0.21247463, + "epoch": 0.7298962873891478, + "flos": 29895597169920.0, + "grad_norm": 3.2464281567487423, + "language_loss": 0.79798996, + "learning_rate": 7.175932943315898e-07, + "loss": 0.81294584, + "num_input_tokens_seen": 262030460, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.24023438, + "step": 12140, + "time_per_iteration": 4.135778903961182 + }, + { + "auxiliary_loss_clip": 0.01277539, + "auxiliary_loss_mlp": 0.00242378, + "balance_loss_clip": 1.05390823, + "balance_loss_mlp": 0.21759447, + "epoch": 0.7299564106418157, + "flos": 32266096254720.0, + "grad_norm": 11.939390105143806, + "language_loss": 0.63581771, + "learning_rate": 7.172944570458003e-07, + "loss": 0.65101689, + "num_input_tokens_seen": 262050830, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.24768066, + "step": 12141, + "time_per_iteration": 2.7798407077789307 + }, + { + "auxiliary_loss_clip": 0.01276991, + "auxiliary_loss_mlp": 0.00204162, + "balance_loss_clip": 1.05970395, + "balance_loss_mlp": 0.180177, + "epoch": 0.7300165338944837, + "flos": 22930292269440.0, + "grad_norm": 4.528240783951071, + "language_loss": 0.80143863, + "learning_rate": 7.169956684003342e-07, + "loss": 0.81625009, + "num_input_tokens_seen": 262071245, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.23986816, + "step": 12142, + "time_per_iteration": 2.682826042175293 + }, + { + "auxiliary_loss_clip": 0.01248634, + "auxiliary_loss_mlp": 0.00201696, + "balance_loss_clip": 1.03736854, + "balance_loss_mlp": 0.17787784, + "epoch": 0.7300766571471516, + "flos": 19828795501440.0, + "grad_norm": 31.202185643079506, + "language_loss": 0.79662842, + "learning_rate": 7.16696928406521e-07, + "loss": 0.81113172, + "num_input_tokens_seen": 262087525, + "router_z_loss_clip": 2.11230469, + "router_z_loss_mlp": 0.23803711, + "step": 12143, + "time_per_iteration": 2.7022361755371094 + }, + { + "auxiliary_loss_clip": 0.01279051, + "auxiliary_loss_mlp": 0.00229261, + "balance_loss_clip": 1.06027055, + "balance_loss_mlp": 0.20296367, + "epoch": 0.7301367803998197, + "flos": 24347829576960.0, + "grad_norm": 56.01705730540878, + "language_loss": 0.74144959, + "learning_rate": 7.163982370756882e-07, + "loss": 0.75653267, + "num_input_tokens_seen": 262107355, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.26318359, + "step": 12144, + "time_per_iteration": 2.709817409515381 + }, + { + "auxiliary_loss_clip": 0.01292461, + "auxiliary_loss_mlp": 0.00249697, + "balance_loss_clip": 1.06854498, + "balance_loss_mlp": 0.22423397, + "epoch": 0.7301969036524876, + "flos": 15304518040320.0, + "grad_norm": 47.16023626613984, + "language_loss": 0.86033523, + "learning_rate": 7.160995944191627e-07, + "loss": 0.87575674, + "num_input_tokens_seen": 262125645, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.25488281, + "step": 12145, + "time_per_iteration": 2.67399263381958 + }, + { + "auxiliary_loss_clip": 0.0125892, + "auxiliary_loss_mlp": 0.00259122, + "balance_loss_clip": 1.03997445, + "balance_loss_mlp": 0.23266964, + "epoch": 0.7302570269051556, + "flos": 23507268433920.0, + "grad_norm": 11.66439997265892, + "language_loss": 0.98708725, + "learning_rate": 7.158010004482702e-07, + "loss": 1.0022676, + "num_input_tokens_seen": 262144075, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.2644043, + "step": 12146, + "time_per_iteration": 2.666377544403076 + }, + { + "auxiliary_loss_clip": 0.01276984, + "auxiliary_loss_mlp": 0.00227848, + "balance_loss_clip": 1.05465388, + "balance_loss_mlp": 0.20197991, + "epoch": 0.7303171501578235, + "flos": 20523056549760.0, + "grad_norm": 12.743594054439544, + "language_loss": 0.67436433, + "learning_rate": 7.155024551743316e-07, + "loss": 0.68941265, + "num_input_tokens_seen": 262165940, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.25866699, + "step": 12147, + "time_per_iteration": 2.79508900642395 + }, + { + "auxiliary_loss_clip": 0.01271738, + "auxiliary_loss_mlp": 0.00239774, + "balance_loss_clip": 1.05278039, + "balance_loss_mlp": 0.21539578, + "epoch": 0.7303772734104915, + "flos": 18332613365760.0, + "grad_norm": 30.759902019143446, + "language_loss": 0.82518113, + "learning_rate": 7.152039586086693e-07, + "loss": 0.84029627, + "num_input_tokens_seen": 262184520, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.24353027, + "step": 12148, + "time_per_iteration": 2.634248733520508 + }, + { + "auxiliary_loss_clip": 0.01140812, + "auxiliary_loss_mlp": 0.00099413, + "balance_loss_clip": 0.98543155, + "balance_loss_mlp": 0.09235552, + "epoch": 0.7304373966631594, + "flos": 60654776100480.0, + "grad_norm": 0.7445455277795336, + "language_loss": 0.56129277, + "learning_rate": 7.149055107626017e-07, + "loss": 0.57369506, + "num_input_tokens_seen": 262247070, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.07080078, + "step": 12149, + "time_per_iteration": 3.118022918701172 + }, + { + "auxiliary_loss_clip": 0.01277485, + "auxiliary_loss_mlp": 0.00221265, + "balance_loss_clip": 1.06115007, + "balance_loss_mlp": 0.19698198, + "epoch": 0.7304975199158275, + "flos": 19828077229440.0, + "grad_norm": 24.4361253284381, + "language_loss": 0.82950854, + "learning_rate": 7.146071116474451e-07, + "loss": 0.84449601, + "num_input_tokens_seen": 262266605, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.24304199, + "step": 12150, + "time_per_iteration": 2.6864802837371826 + }, + { + "auxiliary_loss_clip": 0.01272267, + "auxiliary_loss_mlp": 0.00218958, + "balance_loss_clip": 1.05141258, + "balance_loss_mlp": 0.19449672, + "epoch": 0.7305576431684954, + "flos": 13223997452160.0, + "grad_norm": 342.07665874240166, + "language_loss": 0.93157494, + "learning_rate": 7.143087612745158e-07, + "loss": 0.94648719, + "num_input_tokens_seen": 262283880, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.24462891, + "step": 12151, + "time_per_iteration": 2.6406688690185547 + }, + { + "auxiliary_loss_clip": 0.01278782, + "auxiliary_loss_mlp": 0.0023185, + "balance_loss_clip": 1.05816436, + "balance_loss_mlp": 0.206864, + "epoch": 0.7306177664211634, + "flos": 24060472773120.0, + "grad_norm": 26.780871819756527, + "language_loss": 0.85945439, + "learning_rate": 7.14010459655127e-07, + "loss": 0.87456071, + "num_input_tokens_seen": 262304155, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.24987793, + "step": 12152, + "time_per_iteration": 2.671936273574829 + }, + { + "auxiliary_loss_clip": 0.01275775, + "auxiliary_loss_mlp": 0.00239622, + "balance_loss_clip": 1.06043971, + "balance_loss_mlp": 0.21510062, + "epoch": 0.7306778896738314, + "flos": 27089106802560.0, + "grad_norm": 3.964205802481182, + "language_loss": 0.86515439, + "learning_rate": 7.137122068005919e-07, + "loss": 0.88030833, + "num_input_tokens_seen": 262325660, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.2454834, + "step": 12153, + "time_per_iteration": 2.6952896118164062 + }, + { + "auxiliary_loss_clip": 0.01278821, + "auxiliary_loss_mlp": 0.00255096, + "balance_loss_clip": 1.0590694, + "balance_loss_mlp": 0.22909603, + "epoch": 0.7307380129264993, + "flos": 16690669839360.0, + "grad_norm": 13.605643333848507, + "language_loss": 0.75762582, + "learning_rate": 7.134140027222173e-07, + "loss": 0.77296501, + "num_input_tokens_seen": 262344075, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.26000977, + "step": 12154, + "time_per_iteration": 2.6677534580230713 + }, + { + "auxiliary_loss_clip": 0.01288358, + "auxiliary_loss_mlp": 0.00227959, + "balance_loss_clip": 1.06170559, + "balance_loss_mlp": 0.20209068, + "epoch": 0.7307981361791673, + "flos": 21725740656000.0, + "grad_norm": 134.8578983523739, + "language_loss": 0.73469603, + "learning_rate": 7.131158474313128e-07, + "loss": 0.74985915, + "num_input_tokens_seen": 262363305, + "router_z_loss_clip": 2.265625, + "router_z_loss_mlp": 0.25891113, + "step": 12155, + "time_per_iteration": 2.672449827194214 + }, + { + "auxiliary_loss_clip": 0.01238045, + "auxiliary_loss_mlp": 0.00213469, + "balance_loss_clip": 1.03113317, + "balance_loss_mlp": 0.1896987, + "epoch": 0.7308582594318352, + "flos": 18040659621120.0, + "grad_norm": 70.83322231286152, + "language_loss": 0.90266061, + "learning_rate": 7.128177409391851e-07, + "loss": 0.91717577, + "num_input_tokens_seen": 262380730, + "router_z_loss_clip": 2.06738281, + "router_z_loss_mlp": 0.23791504, + "step": 12156, + "time_per_iteration": 2.653552293777466 + }, + { + "auxiliary_loss_clip": 0.01267039, + "auxiliary_loss_mlp": 0.00244476, + "balance_loss_clip": 1.05033934, + "balance_loss_mlp": 0.22041979, + "epoch": 0.7309183826845033, + "flos": 13844964798720.0, + "grad_norm": 8.408825542314908, + "language_loss": 0.83606195, + "learning_rate": 7.125196832571367e-07, + "loss": 0.8511771, + "num_input_tokens_seen": 262395480, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.24072266, + "step": 12157, + "time_per_iteration": 2.5982203483581543 + }, + { + "auxiliary_loss_clip": 0.01257122, + "auxiliary_loss_mlp": 0.00230656, + "balance_loss_clip": 1.042907, + "balance_loss_mlp": 0.20806572, + "epoch": 0.7309785059371712, + "flos": 17019216564480.0, + "grad_norm": 3.963865641527486, + "language_loss": 0.80986285, + "learning_rate": 7.122216743964713e-07, + "loss": 0.82474053, + "num_input_tokens_seen": 262413340, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.22607422, + "step": 12158, + "time_per_iteration": 2.772718906402588 + }, + { + "auxiliary_loss_clip": 0.01271984, + "auxiliary_loss_mlp": 0.00254722, + "balance_loss_clip": 1.05377889, + "balance_loss_mlp": 0.22770953, + "epoch": 0.7310386291898392, + "flos": 26502398052480.0, + "grad_norm": 53.408192145423634, + "language_loss": 0.92519587, + "learning_rate": 7.119237143684896e-07, + "loss": 0.94046295, + "num_input_tokens_seen": 262433455, + "router_z_loss_clip": 2.18066406, + "router_z_loss_mlp": 0.27026367, + "step": 12159, + "time_per_iteration": 2.7398695945739746 + }, + { + "auxiliary_loss_clip": 0.01283192, + "auxiliary_loss_mlp": 0.00227394, + "balance_loss_clip": 1.05536687, + "balance_loss_mlp": 0.20060745, + "epoch": 0.7310987524425071, + "flos": 16945922862720.0, + "grad_norm": 97.29042053401565, + "language_loss": 0.83942783, + "learning_rate": 7.116258031844895e-07, + "loss": 0.85453367, + "num_input_tokens_seen": 262450335, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.26782227, + "step": 12160, + "time_per_iteration": 2.6845180988311768 + }, + { + "auxiliary_loss_clip": 0.01301116, + "auxiliary_loss_mlp": 0.00233843, + "balance_loss_clip": 1.06780934, + "balance_loss_mlp": 0.205888, + "epoch": 0.7311588756951751, + "flos": 13845288021120.0, + "grad_norm": 16.752251947680907, + "language_loss": 0.85269803, + "learning_rate": 7.113279408557675e-07, + "loss": 0.8680476, + "num_input_tokens_seen": 262468240, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.27954102, + "step": 12161, + "time_per_iteration": 2.716829299926758 + }, + { + "auxiliary_loss_clip": 0.01317814, + "auxiliary_loss_mlp": 0.0022959, + "balance_loss_clip": 1.08046627, + "balance_loss_mlp": 0.20151666, + "epoch": 0.731218998947843, + "flos": 28767894704640.0, + "grad_norm": 8.323234721399743, + "language_loss": 0.79385924, + "learning_rate": 7.110301273936192e-07, + "loss": 0.80933326, + "num_input_tokens_seen": 262487045, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.28076172, + "step": 12162, + "time_per_iteration": 2.741257667541504 + }, + { + "auxiliary_loss_clip": 0.01294616, + "auxiliary_loss_mlp": 0.00228259, + "balance_loss_clip": 1.06719136, + "balance_loss_mlp": 0.20110363, + "epoch": 0.7312791222005111, + "flos": 27088783580160.0, + "grad_norm": 2.4667831219465746, + "language_loss": 0.75072217, + "learning_rate": 7.107323628093382e-07, + "loss": 0.76595092, + "num_input_tokens_seen": 262504855, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.2713623, + "step": 12163, + "time_per_iteration": 2.805196523666382 + }, + { + "auxiliary_loss_clip": 0.01272227, + "auxiliary_loss_mlp": 0.00254396, + "balance_loss_clip": 1.0485363, + "balance_loss_mlp": 0.22683522, + "epoch": 0.731339245453179, + "flos": 20924035050240.0, + "grad_norm": 4.775194471855563, + "language_loss": 0.76254904, + "learning_rate": 7.104346471142153e-07, + "loss": 0.77781528, + "num_input_tokens_seen": 262524920, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.2755127, + "step": 12164, + "time_per_iteration": 2.801875114440918 + }, + { + "auxiliary_loss_clip": 0.01258309, + "auxiliary_loss_mlp": 0.00217241, + "balance_loss_clip": 1.04501724, + "balance_loss_mlp": 0.19233775, + "epoch": 0.731399368705847, + "flos": 23075694524160.0, + "grad_norm": 11.3491048727447, + "language_loss": 0.83028877, + "learning_rate": 7.101369803195391e-07, + "loss": 0.84504426, + "num_input_tokens_seen": 262545725, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.24914551, + "step": 12165, + "time_per_iteration": 2.913952589035034 + }, + { + "auxiliary_loss_clip": 0.0127009, + "auxiliary_loss_mlp": 0.0022034, + "balance_loss_clip": 1.05070865, + "balance_loss_mlp": 0.19508001, + "epoch": 0.731459491958515, + "flos": 23582681038080.0, + "grad_norm": 4.637734226455805, + "language_loss": 0.84056485, + "learning_rate": 7.098393624365988e-07, + "loss": 0.85546911, + "num_input_tokens_seen": 262565480, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.25280762, + "step": 12166, + "time_per_iteration": 2.6711525917053223 + }, + { + "auxiliary_loss_clip": 0.01263408, + "auxiliary_loss_mlp": 0.00213778, + "balance_loss_clip": 1.04532099, + "balance_loss_mlp": 0.18994758, + "epoch": 0.7315196152111829, + "flos": 22379278659840.0, + "grad_norm": 10.142232686521435, + "language_loss": 0.85356009, + "learning_rate": 7.095417934766781e-07, + "loss": 0.86833197, + "num_input_tokens_seen": 262584145, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.23840332, + "step": 12167, + "time_per_iteration": 2.654996633529663 + }, + { + "auxiliary_loss_clip": 0.01258735, + "auxiliary_loss_mlp": 0.00216753, + "balance_loss_clip": 1.04230118, + "balance_loss_mlp": 0.19231471, + "epoch": 0.7315797384638509, + "flos": 26177047637760.0, + "grad_norm": 323.4310408174081, + "language_loss": 0.83065581, + "learning_rate": 7.092442734510622e-07, + "loss": 0.8454107, + "num_input_tokens_seen": 262604045, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.24438477, + "step": 12168, + "time_per_iteration": 2.6749041080474854 + }, + { + "auxiliary_loss_clip": 0.01286397, + "auxiliary_loss_mlp": 0.00246034, + "balance_loss_clip": 1.0573802, + "balance_loss_mlp": 0.21935451, + "epoch": 0.7316398617165188, + "flos": 21506326427520.0, + "grad_norm": 2.2619963381915604, + "language_loss": 0.885867, + "learning_rate": 7.089468023710326e-07, + "loss": 0.90119135, + "num_input_tokens_seen": 262624540, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.26696777, + "step": 12169, + "time_per_iteration": 2.7780306339263916 + }, + { + "auxiliary_loss_clip": 0.01280988, + "auxiliary_loss_mlp": 0.00218518, + "balance_loss_clip": 1.05864859, + "balance_loss_mlp": 0.19388901, + "epoch": 0.7316999849691869, + "flos": 30482557315200.0, + "grad_norm": 2.1851908474256687, + "language_loss": 0.79907095, + "learning_rate": 7.08649380247871e-07, + "loss": 0.81406605, + "num_input_tokens_seen": 262644545, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.24633789, + "step": 12170, + "time_per_iteration": 2.7666494846343994 + }, + { + "auxiliary_loss_clip": 0.01261713, + "auxiliary_loss_mlp": 0.00212773, + "balance_loss_clip": 1.04254615, + "balance_loss_mlp": 0.18741766, + "epoch": 0.7317601082218548, + "flos": 21543781334400.0, + "grad_norm": 469.80286867599347, + "language_loss": 0.78160077, + "learning_rate": 7.083520070928533e-07, + "loss": 0.79634571, + "num_input_tokens_seen": 262662570, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.25341797, + "step": 12171, + "time_per_iteration": 2.644022226333618 + }, + { + "auxiliary_loss_clip": 0.01279001, + "auxiliary_loss_mlp": 0.00256153, + "balance_loss_clip": 1.05425644, + "balance_loss_mlp": 0.22878218, + "epoch": 0.7318202314745228, + "flos": 33251592775680.0, + "grad_norm": 7.887623573219813, + "language_loss": 0.73686242, + "learning_rate": 7.080546829172564e-07, + "loss": 0.75221395, + "num_input_tokens_seen": 262683245, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.27380371, + "step": 12172, + "time_per_iteration": 2.714815616607666 + }, + { + "auxiliary_loss_clip": 0.01304492, + "auxiliary_loss_mlp": 0.00246365, + "balance_loss_clip": 1.06990004, + "balance_loss_mlp": 0.21869631, + "epoch": 0.7318803547271907, + "flos": 20157054917760.0, + "grad_norm": 71.33397749093749, + "language_loss": 0.72884965, + "learning_rate": 7.077574077323564e-07, + "loss": 0.74435818, + "num_input_tokens_seen": 262701585, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.27685547, + "step": 12173, + "time_per_iteration": 4.131450176239014 + }, + { + "auxiliary_loss_clip": 0.01258729, + "auxiliary_loss_mlp": 0.00216844, + "balance_loss_clip": 1.04371715, + "balance_loss_mlp": 0.19409913, + "epoch": 0.7319404779798587, + "flos": 20558536208640.0, + "grad_norm": 5.603741076141377, + "language_loss": 0.83231294, + "learning_rate": 7.074601815494243e-07, + "loss": 0.84706867, + "num_input_tokens_seen": 262719295, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.22741699, + "step": 12174, + "time_per_iteration": 2.7303740978240967 + }, + { + "auxiliary_loss_clip": 0.01267554, + "auxiliary_loss_mlp": 0.00226459, + "balance_loss_clip": 1.05161023, + "balance_loss_mlp": 0.20212884, + "epoch": 0.7320006012325266, + "flos": 28695391102080.0, + "grad_norm": 780.7006529920528, + "language_loss": 0.8703745, + "learning_rate": 7.071630043797317e-07, + "loss": 0.88531458, + "num_input_tokens_seen": 262739995, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.24316406, + "step": 12175, + "time_per_iteration": 4.141963005065918 + }, + { + "auxiliary_loss_clip": 0.01254214, + "auxiliary_loss_mlp": 0.00222725, + "balance_loss_clip": 1.03667653, + "balance_loss_mlp": 0.19703519, + "epoch": 0.7320607244851947, + "flos": 16362697731840.0, + "grad_norm": 8.10099619468523, + "language_loss": 0.85550559, + "learning_rate": 7.068658762345488e-07, + "loss": 0.87027502, + "num_input_tokens_seen": 262757680, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.25708008, + "step": 12176, + "time_per_iteration": 2.6761393547058105 + }, + { + "auxiliary_loss_clip": 0.01284451, + "auxiliary_loss_mlp": 0.00212368, + "balance_loss_clip": 1.05937147, + "balance_loss_mlp": 0.18764408, + "epoch": 0.7321208477378626, + "flos": 20955097336320.0, + "grad_norm": 14.24967872280871, + "language_loss": 0.83636749, + "learning_rate": 7.065687971251399e-07, + "loss": 0.85133564, + "num_input_tokens_seen": 262776990, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.24719238, + "step": 12177, + "time_per_iteration": 2.6815991401672363 + }, + { + "auxiliary_loss_clip": 0.01240888, + "auxiliary_loss_mlp": 0.00214544, + "balance_loss_clip": 1.02842069, + "balance_loss_mlp": 0.19110793, + "epoch": 0.7321809709905306, + "flos": 13845072539520.0, + "grad_norm": 11.042537130963975, + "language_loss": 0.83400738, + "learning_rate": 7.06271767062772e-07, + "loss": 0.84856176, + "num_input_tokens_seen": 262795440, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.23461914, + "step": 12178, + "time_per_iteration": 4.016367435455322 + }, + { + "auxiliary_loss_clip": 0.01266145, + "auxiliary_loss_mlp": 0.00240069, + "balance_loss_clip": 1.04301989, + "balance_loss_mlp": 0.21409304, + "epoch": 0.7322410942431986, + "flos": 26979938392320.0, + "grad_norm": 67.55388268571134, + "language_loss": 0.91153467, + "learning_rate": 7.059747860587084e-07, + "loss": 0.92659682, + "num_input_tokens_seen": 262816385, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.26000977, + "step": 12179, + "time_per_iteration": 2.8037075996398926 + }, + { + "auxiliary_loss_clip": 0.01264007, + "auxiliary_loss_mlp": 0.00218374, + "balance_loss_clip": 1.04673243, + "balance_loss_mlp": 0.19422197, + "epoch": 0.7323012174958665, + "flos": 17639717034240.0, + "grad_norm": 103.44882232937525, + "language_loss": 0.82304072, + "learning_rate": 7.056778541242115e-07, + "loss": 0.83786452, + "num_input_tokens_seen": 262834955, + "router_z_loss_clip": 2.17675781, + "router_z_loss_mlp": 0.24169922, + "step": 12180, + "time_per_iteration": 2.682004690170288 + }, + { + "auxiliary_loss_clip": 0.01285535, + "auxiliary_loss_mlp": 0.00255643, + "balance_loss_clip": 1.05356669, + "balance_loss_mlp": 0.22702043, + "epoch": 0.7323613407485345, + "flos": 32342765834880.0, + "grad_norm": 5.939929901887024, + "language_loss": 0.88825202, + "learning_rate": 7.053809712705396e-07, + "loss": 0.90366375, + "num_input_tokens_seen": 262853555, + "router_z_loss_clip": 2.32226562, + "router_z_loss_mlp": 0.28601074, + "step": 12181, + "time_per_iteration": 2.7430319786071777 + }, + { + "auxiliary_loss_clip": 0.01283357, + "auxiliary_loss_mlp": 0.00239218, + "balance_loss_clip": 1.06197321, + "balance_loss_mlp": 0.21308698, + "epoch": 0.7324214640012024, + "flos": 18362777811840.0, + "grad_norm": 42.91402216894036, + "language_loss": 0.8140195, + "learning_rate": 7.050841375089506e-07, + "loss": 0.82924521, + "num_input_tokens_seen": 262870975, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.26123047, + "step": 12182, + "time_per_iteration": 2.743802785873413 + }, + { + "auxiliary_loss_clip": 0.01281285, + "auxiliary_loss_mlp": 0.00237065, + "balance_loss_clip": 1.05710793, + "balance_loss_mlp": 0.21114917, + "epoch": 0.7324815872538705, + "flos": 30812289189120.0, + "grad_norm": 614.1968038986224, + "language_loss": 0.78858173, + "learning_rate": 7.047873528507015e-07, + "loss": 0.8037653, + "num_input_tokens_seen": 262892635, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.25952148, + "step": 12183, + "time_per_iteration": 4.1820068359375 + }, + { + "auxiliary_loss_clip": 0.01277743, + "auxiliary_loss_mlp": 0.00234202, + "balance_loss_clip": 1.04870903, + "balance_loss_mlp": 0.20814292, + "epoch": 0.7325417105065384, + "flos": 21505069451520.0, + "grad_norm": 67.96327613811766, + "language_loss": 0.81205863, + "learning_rate": 7.04490617307045e-07, + "loss": 0.82717812, + "num_input_tokens_seen": 262910725, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.26074219, + "step": 12184, + "time_per_iteration": 2.6547834873199463 + }, + { + "auxiliary_loss_clip": 0.01106719, + "auxiliary_loss_mlp": 0.00065223, + "balance_loss_clip": 0.95135605, + "balance_loss_mlp": 0.05892899, + "epoch": 0.7326018337592064, + "flos": 67257742556160.0, + "grad_norm": 0.7509799004240644, + "language_loss": 0.6467225, + "learning_rate": 7.041939308892344e-07, + "loss": 0.65844196, + "num_input_tokens_seen": 262974150, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.06298828, + "step": 12185, + "time_per_iteration": 3.1555488109588623 + }, + { + "auxiliary_loss_clip": 0.01264927, + "auxiliary_loss_mlp": 0.00224355, + "balance_loss_clip": 1.04145074, + "balance_loss_mlp": 0.19717555, + "epoch": 0.7326619570118743, + "flos": 22857070394880.0, + "grad_norm": 196.23057870594036, + "language_loss": 0.91929048, + "learning_rate": 7.038972936085197e-07, + "loss": 0.9341833, + "num_input_tokens_seen": 262993370, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.27185059, + "step": 12186, + "time_per_iteration": 2.6969404220581055 + }, + { + "auxiliary_loss_clip": 0.01273289, + "auxiliary_loss_mlp": 0.0025166, + "balance_loss_clip": 1.04388297, + "balance_loss_mlp": 0.2242177, + "epoch": 0.7327220802645423, + "flos": 23327499841920.0, + "grad_norm": 17.847073571981564, + "language_loss": 0.83119917, + "learning_rate": 7.036007054761508e-07, + "loss": 0.8464486, + "num_input_tokens_seen": 263012665, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.2746582, + "step": 12187, + "time_per_iteration": 2.83419132232666 + }, + { + "auxiliary_loss_clip": 0.01279156, + "auxiliary_loss_mlp": 0.00205811, + "balance_loss_clip": 1.05269766, + "balance_loss_mlp": 0.18144508, + "epoch": 0.7327822035172102, + "flos": 23180661043200.0, + "grad_norm": 281.889061545406, + "language_loss": 0.9766494, + "learning_rate": 7.033041665033716e-07, + "loss": 0.99149907, + "num_input_tokens_seen": 263031475, + "router_z_loss_clip": 2.265625, + "router_z_loss_mlp": 0.24377441, + "step": 12188, + "time_per_iteration": 2.7800092697143555 + }, + { + "auxiliary_loss_clip": 0.01255528, + "auxiliary_loss_mlp": 0.00241614, + "balance_loss_clip": 1.03438497, + "balance_loss_mlp": 0.21597221, + "epoch": 0.7328423267698783, + "flos": 21066600130560.0, + "grad_norm": 16.85740685926344, + "language_loss": 0.8419463, + "learning_rate": 7.030076767014284e-07, + "loss": 0.85691774, + "num_input_tokens_seen": 263051445, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.2565918, + "step": 12189, + "time_per_iteration": 2.729074478149414 + }, + { + "auxiliary_loss_clip": 0.01261451, + "auxiliary_loss_mlp": 0.00220687, + "balance_loss_clip": 1.03563344, + "balance_loss_mlp": 0.19701242, + "epoch": 0.7329024500225462, + "flos": 21689578638720.0, + "grad_norm": 6.726875137754761, + "language_loss": 0.90031266, + "learning_rate": 7.027112360815648e-07, + "loss": 0.91513407, + "num_input_tokens_seen": 263070835, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.23657227, + "step": 12190, + "time_per_iteration": 2.799989938735962 + }, + { + "auxiliary_loss_clip": 0.01267386, + "auxiliary_loss_mlp": 0.00246187, + "balance_loss_clip": 1.04563653, + "balance_loss_mlp": 0.22062883, + "epoch": 0.7329625732752142, + "flos": 24164038661760.0, + "grad_norm": 69.61236772254864, + "language_loss": 0.79888946, + "learning_rate": 7.024148446550204e-07, + "loss": 0.81402522, + "num_input_tokens_seen": 263090070, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.2557373, + "step": 12191, + "time_per_iteration": 2.6681509017944336 + }, + { + "auxiliary_loss_clip": 0.01247066, + "auxiliary_loss_mlp": 0.00221881, + "balance_loss_clip": 1.03003633, + "balance_loss_mlp": 0.1978122, + "epoch": 0.7330226965278822, + "flos": 30077915627520.0, + "grad_norm": 113.43333543725988, + "language_loss": 0.77050745, + "learning_rate": 7.021185024330361e-07, + "loss": 0.7851969, + "num_input_tokens_seen": 263110030, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.24072266, + "step": 12192, + "time_per_iteration": 2.7498695850372314 + }, + { + "auxiliary_loss_clip": 0.01259162, + "auxiliary_loss_mlp": 0.00225464, + "balance_loss_clip": 1.03853345, + "balance_loss_mlp": 0.20217048, + "epoch": 0.7330828197805501, + "flos": 23368294713600.0, + "grad_norm": 12.84282469944012, + "language_loss": 0.8311457, + "learning_rate": 7.01822209426848e-07, + "loss": 0.84599197, + "num_input_tokens_seen": 263129735, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.23278809, + "step": 12193, + "time_per_iteration": 2.6332759857177734 + }, + { + "auxiliary_loss_clip": 0.01256932, + "auxiliary_loss_mlp": 0.00219056, + "balance_loss_clip": 1.03406835, + "balance_loss_mlp": 0.19319907, + "epoch": 0.7331429430332181, + "flos": 21032808410880.0, + "grad_norm": 9.671674478962926, + "language_loss": 0.86712027, + "learning_rate": 7.015259656476911e-07, + "loss": 0.8818801, + "num_input_tokens_seen": 263149100, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.25866699, + "step": 12194, + "time_per_iteration": 2.656219482421875 + }, + { + "auxiliary_loss_clip": 0.01262493, + "auxiliary_loss_mlp": 0.00206023, + "balance_loss_clip": 1.04265618, + "balance_loss_mlp": 0.18091738, + "epoch": 0.733203066285886, + "flos": 14647891466880.0, + "grad_norm": 333.6559140077547, + "language_loss": 0.78979433, + "learning_rate": 7.012297711067998e-07, + "loss": 0.80447954, + "num_input_tokens_seen": 263166620, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.25097656, + "step": 12195, + "time_per_iteration": 2.6064982414245605 + }, + { + "auxiliary_loss_clip": 0.01234148, + "auxiliary_loss_mlp": 0.00213088, + "balance_loss_clip": 1.01904356, + "balance_loss_mlp": 0.18977109, + "epoch": 0.7332631895385541, + "flos": 17165301177600.0, + "grad_norm": 11.733237573276215, + "language_loss": 0.8046937, + "learning_rate": 7.009336258154057e-07, + "loss": 0.81916606, + "num_input_tokens_seen": 263184780, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.23291016, + "step": 12196, + "time_per_iteration": 2.643955945968628 + }, + { + "auxiliary_loss_clip": 0.01250739, + "auxiliary_loss_mlp": 0.00233229, + "balance_loss_clip": 1.03438807, + "balance_loss_mlp": 0.20957758, + "epoch": 0.733323312791222, + "flos": 28658151676800.0, + "grad_norm": 7.126596125794442, + "language_loss": 0.80327433, + "learning_rate": 7.006375297847394e-07, + "loss": 0.81811404, + "num_input_tokens_seen": 263204625, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.23620605, + "step": 12197, + "time_per_iteration": 2.6887831687927246 + }, + { + "auxiliary_loss_clip": 0.01269759, + "auxiliary_loss_mlp": 0.00241738, + "balance_loss_clip": 1.04090214, + "balance_loss_mlp": 0.21403354, + "epoch": 0.73338343604389, + "flos": 16618417632000.0, + "grad_norm": 6.411773261507132, + "language_loss": 0.86613178, + "learning_rate": 7.003414830260282e-07, + "loss": 0.88124669, + "num_input_tokens_seen": 263221565, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.27746582, + "step": 12198, + "time_per_iteration": 2.6116392612457275 + }, + { + "auxiliary_loss_clip": 0.01237183, + "auxiliary_loss_mlp": 0.0020892, + "balance_loss_clip": 1.02175963, + "balance_loss_mlp": 0.18599597, + "epoch": 0.7334435592965579, + "flos": 21142084561920.0, + "grad_norm": 239251.0291879866, + "language_loss": 0.83455348, + "learning_rate": 7.000454855504974e-07, + "loss": 0.84901452, + "num_input_tokens_seen": 263240620, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.22937012, + "step": 12199, + "time_per_iteration": 2.628767490386963 + }, + { + "auxiliary_loss_clip": 0.01254743, + "auxiliary_loss_mlp": 0.00203604, + "balance_loss_clip": 1.0278126, + "balance_loss_mlp": 0.1761265, + "epoch": 0.7335036825492259, + "flos": 17125332318720.0, + "grad_norm": 6.958209072996161, + "language_loss": 0.89729589, + "learning_rate": 6.997495373693729e-07, + "loss": 0.91187936, + "num_input_tokens_seen": 263254365, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.27490234, + "step": 12200, + "time_per_iteration": 2.6664860248565674 + }, + { + "auxiliary_loss_clip": 0.01245045, + "auxiliary_loss_mlp": 0.00233915, + "balance_loss_clip": 1.02357662, + "balance_loss_mlp": 0.20922714, + "epoch": 0.7335638058018938, + "flos": 23731818307200.0, + "grad_norm": 8.00007684665419, + "language_loss": 0.71158361, + "learning_rate": 6.994536384938754e-07, + "loss": 0.7263732, + "num_input_tokens_seen": 263275880, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.24682617, + "step": 12201, + "time_per_iteration": 2.6951498985290527 + }, + { + "auxiliary_loss_clip": 0.01251487, + "auxiliary_loss_mlp": 0.00218167, + "balance_loss_clip": 1.03392673, + "balance_loss_mlp": 0.19563606, + "epoch": 0.7336239290545619, + "flos": 34933289679360.0, + "grad_norm": 8.373727642710868, + "language_loss": 0.60221046, + "learning_rate": 6.991577889352264e-07, + "loss": 0.616907, + "num_input_tokens_seen": 263298315, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.22521973, + "step": 12202, + "time_per_iteration": 2.8626182079315186 + }, + { + "auxiliary_loss_clip": 0.01257428, + "auxiliary_loss_mlp": 0.00221719, + "balance_loss_clip": 1.03579271, + "balance_loss_mlp": 0.19775796, + "epoch": 0.7336840523072298, + "flos": 21103049456640.0, + "grad_norm": 70.88155269797817, + "language_loss": 0.77400792, + "learning_rate": 6.98861988704645e-07, + "loss": 0.78879941, + "num_input_tokens_seen": 263318615, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.23962402, + "step": 12203, + "time_per_iteration": 2.6359329223632812 + }, + { + "auxiliary_loss_clip": 0.01279326, + "auxiliary_loss_mlp": 0.00222506, + "balance_loss_clip": 1.05555391, + "balance_loss_mlp": 0.19766238, + "epoch": 0.7337441755598978, + "flos": 24024418496640.0, + "grad_norm": 1367.2553819419488, + "language_loss": 0.75781077, + "learning_rate": 6.985662378133474e-07, + "loss": 0.77282912, + "num_input_tokens_seen": 263336705, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.24829102, + "step": 12204, + "time_per_iteration": 2.690642833709717 + }, + { + "auxiliary_loss_clip": 0.01236832, + "auxiliary_loss_mlp": 0.00197067, + "balance_loss_clip": 1.0209862, + "balance_loss_mlp": 0.17452441, + "epoch": 0.7338042988125658, + "flos": 22711309004160.0, + "grad_norm": 11.726102968434004, + "language_loss": 0.85817206, + "learning_rate": 6.982705362725479e-07, + "loss": 0.87251103, + "num_input_tokens_seen": 263355065, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.22546387, + "step": 12205, + "time_per_iteration": 2.687021017074585 + }, + { + "auxiliary_loss_clip": 0.0126336, + "auxiliary_loss_mlp": 0.00205971, + "balance_loss_clip": 1.04496169, + "balance_loss_mlp": 0.1823799, + "epoch": 0.7338644220652337, + "flos": 21360996000000.0, + "grad_norm": 3.1268134892893085, + "language_loss": 0.87110847, + "learning_rate": 6.979748840934601e-07, + "loss": 0.88580179, + "num_input_tokens_seen": 263374460, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.23596191, + "step": 12206, + "time_per_iteration": 2.649634838104248 + }, + { + "auxiliary_loss_clip": 0.01246771, + "auxiliary_loss_mlp": 0.00208928, + "balance_loss_clip": 1.02573454, + "balance_loss_mlp": 0.18463324, + "epoch": 0.7339245453179017, + "flos": 30920236536960.0, + "grad_norm": 6.541478045751026, + "language_loss": 0.80366164, + "learning_rate": 6.976792812872958e-07, + "loss": 0.81821859, + "num_input_tokens_seen": 263393610, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.24316406, + "step": 12207, + "time_per_iteration": 2.7695722579956055 + }, + { + "auxiliary_loss_clip": 0.01102047, + "auxiliary_loss_mlp": 0.00068776, + "balance_loss_clip": 0.94867128, + "balance_loss_mlp": 0.06252918, + "epoch": 0.7339846685705697, + "flos": 67899429072000.0, + "grad_norm": 0.7581809320744195, + "language_loss": 0.54125774, + "learning_rate": 6.97383727865263e-07, + "loss": 0.552966, + "num_input_tokens_seen": 263450340, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.0625, + "step": 12208, + "time_per_iteration": 3.2372446060180664 + }, + { + "auxiliary_loss_clip": 0.01253109, + "auxiliary_loss_mlp": 0.00202862, + "balance_loss_clip": 1.03301466, + "balance_loss_mlp": 0.17736267, + "epoch": 0.7340447918232377, + "flos": 22236749493120.0, + "grad_norm": 3.814983797700515, + "language_loss": 0.86065829, + "learning_rate": 6.970882238385703e-07, + "loss": 0.87521797, + "num_input_tokens_seen": 263471735, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.25512695, + "step": 12209, + "time_per_iteration": 2.7253241539001465 + }, + { + "auxiliary_loss_clip": 0.01251215, + "auxiliary_loss_mlp": 0.00235788, + "balance_loss_clip": 1.03127217, + "balance_loss_mlp": 0.21278061, + "epoch": 0.7341049150759056, + "flos": 23764784014080.0, + "grad_norm": 256.62531283024583, + "language_loss": 0.85295904, + "learning_rate": 6.96792769218423e-07, + "loss": 0.86782902, + "num_input_tokens_seen": 263493245, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.23010254, + "step": 12210, + "time_per_iteration": 2.695286273956299 + }, + { + "auxiliary_loss_clip": 0.01256626, + "auxiliary_loss_mlp": 0.0022581, + "balance_loss_clip": 1.03647196, + "balance_loss_mlp": 0.20025128, + "epoch": 0.7341650383285736, + "flos": 17236547804160.0, + "grad_norm": 16.09707281974319, + "language_loss": 0.85869777, + "learning_rate": 6.964973640160236e-07, + "loss": 0.87352216, + "num_input_tokens_seen": 263511660, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.25549316, + "step": 12211, + "time_per_iteration": 2.8366668224334717 + }, + { + "auxiliary_loss_clip": 0.01249513, + "auxiliary_loss_mlp": 0.00229781, + "balance_loss_clip": 1.03117752, + "balance_loss_mlp": 0.20597512, + "epoch": 0.7342251615812415, + "flos": 23403953940480.0, + "grad_norm": 31.586438709728164, + "language_loss": 0.80335844, + "learning_rate": 6.962020082425748e-07, + "loss": 0.81815135, + "num_input_tokens_seen": 263530875, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.23803711, + "step": 12212, + "time_per_iteration": 2.703235387802124 + }, + { + "auxiliary_loss_clip": 0.01259261, + "auxiliary_loss_mlp": 0.00219448, + "balance_loss_clip": 1.03468311, + "balance_loss_mlp": 0.19335268, + "epoch": 0.7342852848339095, + "flos": 22747183712640.0, + "grad_norm": 12.473432506520094, + "language_loss": 0.7771523, + "learning_rate": 6.959067019092766e-07, + "loss": 0.79193938, + "num_input_tokens_seen": 263551585, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.26074219, + "step": 12213, + "time_per_iteration": 2.7583351135253906 + }, + { + "auxiliary_loss_clip": 0.01127181, + "auxiliary_loss_mlp": 0.0005275, + "balance_loss_clip": 0.9695009, + "balance_loss_mlp": 0.04683714, + "epoch": 0.7343454080865774, + "flos": 53942353925760.0, + "grad_norm": 0.6990467631280239, + "language_loss": 0.53397632, + "learning_rate": 6.956114450273276e-07, + "loss": 0.54577565, + "num_input_tokens_seen": 263609545, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.05908203, + "step": 12214, + "time_per_iteration": 3.0913808345794678 + }, + { + "auxiliary_loss_clip": 0.01278366, + "auxiliary_loss_mlp": 0.00229165, + "balance_loss_clip": 1.04895651, + "balance_loss_mlp": 0.20267701, + "epoch": 0.7344055313392455, + "flos": 12166859255040.0, + "grad_norm": 8.957653096051944, + "language_loss": 0.81924713, + "learning_rate": 6.953162376079233e-07, + "loss": 0.83432245, + "num_input_tokens_seen": 263627880, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.26489258, + "step": 12215, + "time_per_iteration": 4.0502448081970215 + }, + { + "auxiliary_loss_clip": 0.01252465, + "auxiliary_loss_mlp": 0.00233662, + "balance_loss_clip": 1.03781128, + "balance_loss_mlp": 0.20960578, + "epoch": 0.7344656545919134, + "flos": 18550052346240.0, + "grad_norm": 6.307657519449156, + "language_loss": 0.79802155, + "learning_rate": 6.950210796622573e-07, + "loss": 0.81288284, + "num_input_tokens_seen": 263645665, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.24060059, + "step": 12216, + "time_per_iteration": 2.6409900188446045 + }, + { + "auxiliary_loss_clip": 0.0130708, + "auxiliary_loss_mlp": 0.00235184, + "balance_loss_clip": 1.06701159, + "balance_loss_mlp": 0.20746818, + "epoch": 0.7345257778445814, + "flos": 23661649088640.0, + "grad_norm": 14.056888863624494, + "language_loss": 0.85474944, + "learning_rate": 6.947259712015236e-07, + "loss": 0.87017202, + "num_input_tokens_seen": 263668170, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.27722168, + "step": 12217, + "time_per_iteration": 4.1446638107299805 + }, + { + "auxiliary_loss_clip": 0.01243839, + "auxiliary_loss_mlp": 0.00212813, + "balance_loss_clip": 1.02753735, + "balance_loss_mlp": 0.18913837, + "epoch": 0.7345859010972494, + "flos": 13808659127040.0, + "grad_norm": 38.96895091069983, + "language_loss": 0.85401666, + "learning_rate": 6.94430912236911e-07, + "loss": 0.8685832, + "num_input_tokens_seen": 263684190, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.23718262, + "step": 12218, + "time_per_iteration": 2.6875252723693848 + }, + { + "auxiliary_loss_clip": 0.0126522, + "auxiliary_loss_mlp": 0.00222183, + "balance_loss_clip": 1.04405284, + "balance_loss_mlp": 0.19757792, + "epoch": 0.7346460243499173, + "flos": 22272731942400.0, + "grad_norm": 4.511590587235688, + "language_loss": 0.81287509, + "learning_rate": 6.941359027796092e-07, + "loss": 0.82774913, + "num_input_tokens_seen": 263702095, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.24633789, + "step": 12219, + "time_per_iteration": 2.672482967376709 + }, + { + "auxiliary_loss_clip": 0.01250081, + "auxiliary_loss_mlp": 0.00219944, + "balance_loss_clip": 1.03445029, + "balance_loss_mlp": 0.19505268, + "epoch": 0.7347061476025853, + "flos": 23255247634560.0, + "grad_norm": 17.65719708544549, + "language_loss": 0.82680631, + "learning_rate": 6.938409428408061e-07, + "loss": 0.8415066, + "num_input_tokens_seen": 263721385, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.24902344, + "step": 12220, + "time_per_iteration": 5.093493700027466 + }, + { + "auxiliary_loss_clip": 0.01263343, + "auxiliary_loss_mlp": 0.00256998, + "balance_loss_clip": 1.03845549, + "balance_loss_mlp": 0.23115307, + "epoch": 0.7347662708552533, + "flos": 15267565923840.0, + "grad_norm": 11.973503502536625, + "language_loss": 0.7375415, + "learning_rate": 6.93546032431684e-07, + "loss": 0.75274485, + "num_input_tokens_seen": 263737835, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.25878906, + "step": 12221, + "time_per_iteration": 2.620224714279175 + }, + { + "auxiliary_loss_clip": 0.01269429, + "auxiliary_loss_mlp": 0.00222492, + "balance_loss_clip": 1.0473547, + "balance_loss_mlp": 0.19816187, + "epoch": 0.7348263941079213, + "flos": 24859987649280.0, + "grad_norm": 3.3630889793842984, + "language_loss": 0.77436095, + "learning_rate": 6.932511715634273e-07, + "loss": 0.78928018, + "num_input_tokens_seen": 263756480, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.24316406, + "step": 12222, + "time_per_iteration": 2.7200722694396973 + }, + { + "auxiliary_loss_clip": 0.01244837, + "auxiliary_loss_mlp": 0.00210666, + "balance_loss_clip": 1.03376508, + "balance_loss_mlp": 0.18862469, + "epoch": 0.7348865173605892, + "flos": 24352103295360.0, + "grad_norm": 21.688633760418604, + "language_loss": 0.74086571, + "learning_rate": 6.92956360247217e-07, + "loss": 0.75542068, + "num_input_tokens_seen": 263776440, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.22009277, + "step": 12223, + "time_per_iteration": 2.6943159103393555 + }, + { + "auxiliary_loss_clip": 0.01261103, + "auxiliary_loss_mlp": 0.00223432, + "balance_loss_clip": 1.04229546, + "balance_loss_mlp": 0.1988984, + "epoch": 0.7349466406132572, + "flos": 20004613597440.0, + "grad_norm": 8.444874809512603, + "language_loss": 0.83091146, + "learning_rate": 6.926615984942332e-07, + "loss": 0.84575689, + "num_input_tokens_seen": 263793700, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.24523926, + "step": 12224, + "time_per_iteration": 2.645768165588379 + }, + { + "auxiliary_loss_clip": 0.0126054, + "auxiliary_loss_mlp": 0.00222041, + "balance_loss_clip": 1.04120553, + "balance_loss_mlp": 0.19704258, + "epoch": 0.7350067638659251, + "flos": 29825068815360.0, + "grad_norm": 302.36220094809863, + "language_loss": 0.81954235, + "learning_rate": 6.92366886315652e-07, + "loss": 0.83436811, + "num_input_tokens_seen": 263814620, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.25024414, + "step": 12225, + "time_per_iteration": 4.283280611038208 + }, + { + "auxiliary_loss_clip": 0.01276311, + "auxiliary_loss_mlp": 0.00218037, + "balance_loss_clip": 1.04549336, + "balance_loss_mlp": 0.1910121, + "epoch": 0.7350668871185931, + "flos": 21866150920320.0, + "grad_norm": 133.67544045158786, + "language_loss": 0.84840178, + "learning_rate": 6.920722237226501e-07, + "loss": 0.86334527, + "num_input_tokens_seen": 263832725, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.27026367, + "step": 12226, + "time_per_iteration": 2.6750435829162598 + }, + { + "auxiliary_loss_clip": 0.01261556, + "auxiliary_loss_mlp": 0.00248888, + "balance_loss_clip": 1.04276633, + "balance_loss_mlp": 0.22219706, + "epoch": 0.735127010371261, + "flos": 22566122231040.0, + "grad_norm": 50.88158691581524, + "language_loss": 0.72746432, + "learning_rate": 6.917776107264008e-07, + "loss": 0.74256873, + "num_input_tokens_seen": 263853850, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.2668457, + "step": 12227, + "time_per_iteration": 2.8338546752929688 + }, + { + "auxiliary_loss_clip": 0.01268912, + "auxiliary_loss_mlp": 0.0022113, + "balance_loss_clip": 1.04984307, + "balance_loss_mlp": 0.19635789, + "epoch": 0.7351871336239291, + "flos": 25884339707520.0, + "grad_norm": 194.11611459073075, + "language_loss": 0.71510661, + "learning_rate": 6.914830473380749e-07, + "loss": 0.73000699, + "num_input_tokens_seen": 263874760, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.24768066, + "step": 12228, + "time_per_iteration": 2.7638187408447266 + }, + { + "auxiliary_loss_clip": 0.01262472, + "auxiliary_loss_mlp": 0.0022036, + "balance_loss_clip": 1.04187822, + "balance_loss_mlp": 0.1953263, + "epoch": 0.735247256876597, + "flos": 17932173569280.0, + "grad_norm": 42.22187265005329, + "language_loss": 0.71574557, + "learning_rate": 6.911885335688427e-07, + "loss": 0.73057389, + "num_input_tokens_seen": 263893390, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.25036621, + "step": 12229, + "time_per_iteration": 2.665720224380493 + }, + { + "auxiliary_loss_clip": 0.01275328, + "auxiliary_loss_mlp": 0.002302, + "balance_loss_clip": 1.04597902, + "balance_loss_mlp": 0.20355719, + "epoch": 0.735307380129265, + "flos": 28875159694080.0, + "grad_norm": 55.77157764527744, + "language_loss": 0.82570344, + "learning_rate": 6.908940694298726e-07, + "loss": 0.84075874, + "num_input_tokens_seen": 263911180, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.26660156, + "step": 12230, + "time_per_iteration": 2.7665233612060547 + }, + { + "auxiliary_loss_clip": 0.01298793, + "auxiliary_loss_mlp": 0.00244973, + "balance_loss_clip": 1.06391835, + "balance_loss_mlp": 0.21588579, + "epoch": 0.7353675033819329, + "flos": 13625658311040.0, + "grad_norm": 7.822588390996206, + "language_loss": 0.85930753, + "learning_rate": 6.90599654932332e-07, + "loss": 0.87474513, + "num_input_tokens_seen": 263928975, + "router_z_loss_clip": 2.34960938, + "router_z_loss_mlp": 0.29064941, + "step": 12231, + "time_per_iteration": 2.694552421569824 + }, + { + "auxiliary_loss_clip": 0.0128441, + "auxiliary_loss_mlp": 0.00246278, + "balance_loss_clip": 1.05896544, + "balance_loss_mlp": 0.21974234, + "epoch": 0.7354276266346009, + "flos": 19463081178240.0, + "grad_norm": 35.51980100489979, + "language_loss": 0.75331759, + "learning_rate": 6.903052900873823e-07, + "loss": 0.76862442, + "num_input_tokens_seen": 263944495, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.26550293, + "step": 12232, + "time_per_iteration": 2.75712513923645 + }, + { + "auxiliary_loss_clip": 0.01271946, + "auxiliary_loss_mlp": 0.00210207, + "balance_loss_clip": 1.04557455, + "balance_loss_mlp": 0.18549535, + "epoch": 0.735487749887269, + "flos": 15771858917760.0, + "grad_norm": 5.443384656002718, + "language_loss": 0.8333686, + "learning_rate": 6.900109749061874e-07, + "loss": 0.84819013, + "num_input_tokens_seen": 263961325, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.24707031, + "step": 12233, + "time_per_iteration": 2.691032648086548 + }, + { + "auxiliary_loss_clip": 0.0128442, + "auxiliary_loss_mlp": 0.00253925, + "balance_loss_clip": 1.05261266, + "balance_loss_mlp": 0.22709076, + "epoch": 0.7355478731399369, + "flos": 18260648467200.0, + "grad_norm": 7.922902665656344, + "language_loss": 0.80829012, + "learning_rate": 6.897167093999079e-07, + "loss": 0.82367355, + "num_input_tokens_seen": 263980445, + "router_z_loss_clip": 2.31835938, + "router_z_loss_mlp": 0.26843262, + "step": 12234, + "time_per_iteration": 2.741530179977417 + }, + { + "auxiliary_loss_clip": 0.01272458, + "auxiliary_loss_mlp": 0.0021435, + "balance_loss_clip": 1.05261934, + "balance_loss_mlp": 0.18892264, + "epoch": 0.7356079963926049, + "flos": 26542043688960.0, + "grad_norm": 79.38112799000211, + "language_loss": 0.71639633, + "learning_rate": 6.894224935797017e-07, + "loss": 0.73126435, + "num_input_tokens_seen": 263999330, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.25427246, + "step": 12235, + "time_per_iteration": 2.729766368865967 + }, + { + "auxiliary_loss_clip": 0.0126282, + "auxiliary_loss_mlp": 0.00233053, + "balance_loss_clip": 1.04017735, + "balance_loss_mlp": 0.2076378, + "epoch": 0.7356681196452728, + "flos": 10778624467200.0, + "grad_norm": 8.520145619201891, + "language_loss": 0.94865608, + "learning_rate": 6.891283274567259e-07, + "loss": 0.9636147, + "num_input_tokens_seen": 264014150, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.25415039, + "step": 12236, + "time_per_iteration": 2.6683595180511475 + }, + { + "auxiliary_loss_clip": 0.012584, + "auxiliary_loss_mlp": 0.00234542, + "balance_loss_clip": 1.03573036, + "balance_loss_mlp": 0.20963949, + "epoch": 0.7357282428979408, + "flos": 19718693337600.0, + "grad_norm": 16.37717794990735, + "language_loss": 0.76231301, + "learning_rate": 6.888342110421364e-07, + "loss": 0.77724242, + "num_input_tokens_seen": 264033140, + "router_z_loss_clip": 2.22753906, + "router_z_loss_mlp": 0.2487793, + "step": 12237, + "time_per_iteration": 2.6573774814605713 + }, + { + "auxiliary_loss_clip": 0.01275953, + "auxiliary_loss_mlp": 0.00230275, + "balance_loss_clip": 1.04905462, + "balance_loss_mlp": 0.20447797, + "epoch": 0.7357883661506087, + "flos": 19464014931840.0, + "grad_norm": 27.347344206863042, + "language_loss": 0.79293674, + "learning_rate": 6.885401443470839e-07, + "loss": 0.80799901, + "num_input_tokens_seen": 264052105, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.25793457, + "step": 12238, + "time_per_iteration": 2.754409074783325 + }, + { + "auxiliary_loss_clip": 0.01289567, + "auxiliary_loss_mlp": 0.00241605, + "balance_loss_clip": 1.06276691, + "balance_loss_mlp": 0.21448524, + "epoch": 0.7358484894032767, + "flos": 27123006263040.0, + "grad_norm": 18.08212878230921, + "language_loss": 0.80910665, + "learning_rate": 6.882461273827205e-07, + "loss": 0.82441843, + "num_input_tokens_seen": 264070690, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.27111816, + "step": 12239, + "time_per_iteration": 2.7685506343841553 + }, + { + "auxiliary_loss_clip": 0.01243972, + "auxiliary_loss_mlp": 0.00195166, + "balance_loss_clip": 1.02618229, + "balance_loss_mlp": 0.17114535, + "epoch": 0.7359086126559446, + "flos": 24502282058880.0, + "grad_norm": 22.413252839123324, + "language_loss": 0.84449756, + "learning_rate": 6.879521601601954e-07, + "loss": 0.85888892, + "num_input_tokens_seen": 264094225, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.2401123, + "step": 12240, + "time_per_iteration": 2.7528834342956543 + }, + { + "auxiliary_loss_clip": 0.01250583, + "auxiliary_loss_mlp": 0.00243607, + "balance_loss_clip": 1.03741741, + "balance_loss_mlp": 0.21798879, + "epoch": 0.7359687359086127, + "flos": 23331270769920.0, + "grad_norm": 4.239121124459072, + "language_loss": 0.91398537, + "learning_rate": 6.876582426906565e-07, + "loss": 0.9289273, + "num_input_tokens_seen": 264113190, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.25610352, + "step": 12241, + "time_per_iteration": 2.7032997608184814 + }, + { + "auxiliary_loss_clip": 0.01256251, + "auxiliary_loss_mlp": 0.00217863, + "balance_loss_clip": 1.03619242, + "balance_loss_mlp": 0.19164938, + "epoch": 0.7360288591612806, + "flos": 20193396503040.0, + "grad_norm": 8.581886236059724, + "language_loss": 0.86556304, + "learning_rate": 6.873643749852484e-07, + "loss": 0.88030416, + "num_input_tokens_seen": 264132050, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.26196289, + "step": 12242, + "time_per_iteration": 2.682818651199341 + }, + { + "auxiliary_loss_clip": 0.01255408, + "auxiliary_loss_mlp": 0.00227713, + "balance_loss_clip": 1.03914332, + "balance_loss_mlp": 0.20134404, + "epoch": 0.7360889824139486, + "flos": 24972783333120.0, + "grad_norm": 65.50764383341328, + "language_loss": 0.85587615, + "learning_rate": 6.870705570551145e-07, + "loss": 0.87070733, + "num_input_tokens_seen": 264152800, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.26391602, + "step": 12243, + "time_per_iteration": 2.7442879676818848 + }, + { + "auxiliary_loss_clip": 0.01264216, + "auxiliary_loss_mlp": 0.00209392, + "balance_loss_clip": 1.04589295, + "balance_loss_mlp": 0.18248695, + "epoch": 0.7361491056666165, + "flos": 15012312900480.0, + "grad_norm": 48.845477013922654, + "language_loss": 0.85658562, + "learning_rate": 6.867767889113969e-07, + "loss": 0.87132168, + "num_input_tokens_seen": 264169650, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.26916504, + "step": 12244, + "time_per_iteration": 2.760718822479248 + }, + { + "auxiliary_loss_clip": 0.01269564, + "auxiliary_loss_mlp": 0.00232674, + "balance_loss_clip": 1.04654789, + "balance_loss_mlp": 0.20759188, + "epoch": 0.7362092289192845, + "flos": 22930400010240.0, + "grad_norm": 19.286526711407813, + "language_loss": 0.80209786, + "learning_rate": 6.864830705652347e-07, + "loss": 0.81712019, + "num_input_tokens_seen": 264190530, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.25061035, + "step": 12245, + "time_per_iteration": 2.7162952423095703 + }, + { + "auxiliary_loss_clip": 0.01253265, + "auxiliary_loss_mlp": 0.00235442, + "balance_loss_clip": 1.04460156, + "balance_loss_mlp": 0.21042004, + "epoch": 0.7362693521719526, + "flos": 20702681487360.0, + "grad_norm": 29.47710665905208, + "language_loss": 0.81483656, + "learning_rate": 6.861894020277658e-07, + "loss": 0.82972366, + "num_input_tokens_seen": 264210820, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.25012207, + "step": 12246, + "time_per_iteration": 2.6916232109069824 + }, + { + "auxiliary_loss_clip": 0.01246749, + "auxiliary_loss_mlp": 0.00218126, + "balance_loss_clip": 1.03444624, + "balance_loss_mlp": 0.19229308, + "epoch": 0.7363294754246205, + "flos": 13111381336320.0, + "grad_norm": 66.78115809199367, + "language_loss": 0.79229516, + "learning_rate": 6.858957833101266e-07, + "loss": 0.80694389, + "num_input_tokens_seen": 264227430, + "router_z_loss_clip": 2.12402344, + "router_z_loss_mlp": 0.25830078, + "step": 12247, + "time_per_iteration": 2.6968791484832764 + }, + { + "auxiliary_loss_clip": 0.01265549, + "auxiliary_loss_mlp": 0.00196554, + "balance_loss_clip": 1.04399419, + "balance_loss_mlp": 0.17159148, + "epoch": 0.7363895986772885, + "flos": 14027426910720.0, + "grad_norm": 58.29796735175838, + "language_loss": 0.80837429, + "learning_rate": 6.856022144234526e-07, + "loss": 0.82299531, + "num_input_tokens_seen": 264245230, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.24951172, + "step": 12248, + "time_per_iteration": 2.672248125076294 + }, + { + "auxiliary_loss_clip": 0.01280096, + "auxiliary_loss_mlp": 0.00220817, + "balance_loss_clip": 1.05487525, + "balance_loss_mlp": 0.19548526, + "epoch": 0.7364497219299564, + "flos": 19719986227200.0, + "grad_norm": 12.911212611115042, + "language_loss": 0.82963467, + "learning_rate": 6.853086953788727e-07, + "loss": 0.84464377, + "num_input_tokens_seen": 264263945, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.25354004, + "step": 12249, + "time_per_iteration": 2.694141387939453 + }, + { + "auxiliary_loss_clip": 0.01279544, + "auxiliary_loss_mlp": 0.0023667, + "balance_loss_clip": 1.05527186, + "balance_loss_mlp": 0.2096695, + "epoch": 0.7365098451826244, + "flos": 21361391049600.0, + "grad_norm": 8.137717936342712, + "language_loss": 0.86324573, + "learning_rate": 6.850152261875189e-07, + "loss": 0.87840784, + "num_input_tokens_seen": 264281500, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.27001953, + "step": 12250, + "time_per_iteration": 2.700054407119751 + }, + { + "auxiliary_loss_clip": 0.0126658, + "auxiliary_loss_mlp": 0.00217077, + "balance_loss_clip": 1.04075527, + "balance_loss_mlp": 0.19006424, + "epoch": 0.7365699684352923, + "flos": 23368222886400.0, + "grad_norm": 10.453047546331705, + "language_loss": 0.79185766, + "learning_rate": 6.8472180686052e-07, + "loss": 0.80669427, + "num_input_tokens_seen": 264301625, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.26989746, + "step": 12251, + "time_per_iteration": 2.7506723403930664 + }, + { + "auxiliary_loss_clip": 0.01245708, + "auxiliary_loss_mlp": 0.00208095, + "balance_loss_clip": 1.03491604, + "balance_loss_mlp": 0.18322781, + "epoch": 0.7366300916879603, + "flos": 59524879927680.0, + "grad_norm": 6.429797757639255, + "language_loss": 0.7217083, + "learning_rate": 6.844284374090015e-07, + "loss": 0.73624635, + "num_input_tokens_seen": 264323975, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.24865723, + "step": 12252, + "time_per_iteration": 3.099086284637451 + }, + { + "auxiliary_loss_clip": 0.01272283, + "auxiliary_loss_mlp": 0.00215835, + "balance_loss_clip": 1.05416679, + "balance_loss_mlp": 0.19075368, + "epoch": 0.7366902149406283, + "flos": 20923137210240.0, + "grad_norm": 5.581459251337042, + "language_loss": 0.85783219, + "learning_rate": 6.841351178440884e-07, + "loss": 0.87271339, + "num_input_tokens_seen": 264343785, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.25097656, + "step": 12253, + "time_per_iteration": 2.6945266723632812 + }, + { + "auxiliary_loss_clip": 0.01242517, + "auxiliary_loss_mlp": 0.0021622, + "balance_loss_clip": 1.02738881, + "balance_loss_mlp": 0.19159172, + "epoch": 0.7367503381932963, + "flos": 17348158339200.0, + "grad_norm": 49.658501464984745, + "language_loss": 0.84993726, + "learning_rate": 6.83841848176905e-07, + "loss": 0.86452466, + "num_input_tokens_seen": 264361130, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.24621582, + "step": 12254, + "time_per_iteration": 2.725686550140381 + }, + { + "auxiliary_loss_clip": 0.0126258, + "auxiliary_loss_mlp": 0.00225532, + "balance_loss_clip": 1.04435849, + "balance_loss_mlp": 0.1987696, + "epoch": 0.7368104614459642, + "flos": 17821317219840.0, + "grad_norm": 103.06309669692513, + "language_loss": 0.80921793, + "learning_rate": 6.835486284185692e-07, + "loss": 0.824099, + "num_input_tokens_seen": 264376965, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.26757812, + "step": 12255, + "time_per_iteration": 2.67702054977417 + }, + { + "auxiliary_loss_clip": 0.01284419, + "auxiliary_loss_mlp": 0.00242107, + "balance_loss_clip": 1.05895603, + "balance_loss_mlp": 0.21622637, + "epoch": 0.7368705846986322, + "flos": 24606099342720.0, + "grad_norm": 3.4084831402422093, + "language_loss": 0.83403742, + "learning_rate": 6.832554585802012e-07, + "loss": 0.84930265, + "num_input_tokens_seen": 264396310, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.25878906, + "step": 12256, + "time_per_iteration": 2.7805869579315186 + }, + { + "auxiliary_loss_clip": 0.01249566, + "auxiliary_loss_mlp": 0.00219453, + "balance_loss_clip": 1.03456879, + "balance_loss_mlp": 0.19482456, + "epoch": 0.7369307079513001, + "flos": 34970169968640.0, + "grad_norm": 39.36458451922757, + "language_loss": 0.80820298, + "learning_rate": 6.829623386729182e-07, + "loss": 0.82289314, + "num_input_tokens_seen": 264418085, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.24597168, + "step": 12257, + "time_per_iteration": 4.218636512756348 + }, + { + "auxiliary_loss_clip": 0.01268385, + "auxiliary_loss_mlp": 0.00219858, + "balance_loss_clip": 1.05114746, + "balance_loss_mlp": 0.19440649, + "epoch": 0.7369908312039681, + "flos": 21214588164480.0, + "grad_norm": 18.97561532502363, + "language_loss": 0.8515234, + "learning_rate": 6.826692687078362e-07, + "loss": 0.86640584, + "num_input_tokens_seen": 264437595, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.2545166, + "step": 12258, + "time_per_iteration": 2.675668239593506 + }, + { + "auxiliary_loss_clip": 0.01255636, + "auxiliary_loss_mlp": 0.00215023, + "balance_loss_clip": 1.03875804, + "balance_loss_mlp": 0.18904749, + "epoch": 0.7370509544566362, + "flos": 23623655477760.0, + "grad_norm": 4.43346054533031, + "language_loss": 0.74266171, + "learning_rate": 6.823762486960674e-07, + "loss": 0.75736833, + "num_input_tokens_seen": 264457385, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.25976562, + "step": 12259, + "time_per_iteration": 4.159494400024414 + }, + { + "auxiliary_loss_clip": 0.01258834, + "auxiliary_loss_mlp": 0.00226602, + "balance_loss_clip": 1.04239488, + "balance_loss_mlp": 0.20260543, + "epoch": 0.7371110777093041, + "flos": 24827704300800.0, + "grad_norm": 18.795991038646026, + "language_loss": 0.81597346, + "learning_rate": 6.820832786487225e-07, + "loss": 0.83082783, + "num_input_tokens_seen": 264477205, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.23999023, + "step": 12260, + "time_per_iteration": 2.8093581199645996 + }, + { + "auxiliary_loss_clip": 0.01271884, + "auxiliary_loss_mlp": 0.00228302, + "balance_loss_clip": 1.05089712, + "balance_loss_mlp": 0.20111074, + "epoch": 0.7371712009619721, + "flos": 23149491016320.0, + "grad_norm": 54.59212498556115, + "language_loss": 0.80190647, + "learning_rate": 6.817903585769125e-07, + "loss": 0.81690836, + "num_input_tokens_seen": 264497195, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.27160645, + "step": 12261, + "time_per_iteration": 2.665038585662842 + }, + { + "auxiliary_loss_clip": 0.01262975, + "auxiliary_loss_mlp": 0.00239868, + "balance_loss_clip": 1.04378593, + "balance_loss_mlp": 0.21432137, + "epoch": 0.73723132421464, + "flos": 23112898035840.0, + "grad_norm": 100.14797100938209, + "language_loss": 0.80279839, + "learning_rate": 6.814974884917438e-07, + "loss": 0.81782681, + "num_input_tokens_seen": 264516950, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.25512695, + "step": 12262, + "time_per_iteration": 4.3127405643463135 + }, + { + "auxiliary_loss_clip": 0.01243933, + "auxiliary_loss_mlp": 0.00223468, + "balance_loss_clip": 1.02967763, + "balance_loss_mlp": 0.20029336, + "epoch": 0.737291447467308, + "flos": 19273328605440.0, + "grad_norm": 12.025232204211028, + "language_loss": 0.95066363, + "learning_rate": 6.81204668404322e-07, + "loss": 0.96533763, + "num_input_tokens_seen": 264532675, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.23181152, + "step": 12263, + "time_per_iteration": 2.6801939010620117 + }, + { + "auxiliary_loss_clip": 0.01247732, + "auxiliary_loss_mlp": 0.00204938, + "balance_loss_clip": 1.03684068, + "balance_loss_mlp": 0.18138236, + "epoch": 0.7373515707199759, + "flos": 25118257415040.0, + "grad_norm": 5.028312613136822, + "language_loss": 0.7357589, + "learning_rate": 6.809118983257522e-07, + "loss": 0.75028563, + "num_input_tokens_seen": 264555635, + "router_z_loss_clip": 2.10839844, + "router_z_loss_mlp": 0.23583984, + "step": 12264, + "time_per_iteration": 2.7366175651550293 + }, + { + "auxiliary_loss_clip": 0.01242374, + "auxiliary_loss_mlp": 0.00217348, + "balance_loss_clip": 1.02962935, + "balance_loss_mlp": 0.19367269, + "epoch": 0.737411693972644, + "flos": 32408481767040.0, + "grad_norm": 28.678861410749903, + "language_loss": 0.87356931, + "learning_rate": 6.806191782671356e-07, + "loss": 0.88816655, + "num_input_tokens_seen": 264573140, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.23693848, + "step": 12265, + "time_per_iteration": 2.8255434036254883 + }, + { + "auxiliary_loss_clip": 0.01285659, + "auxiliary_loss_mlp": 0.00228911, + "balance_loss_clip": 1.05307961, + "balance_loss_mlp": 0.20180288, + "epoch": 0.7374718172253119, + "flos": 24315797623680.0, + "grad_norm": 18.420178953401184, + "language_loss": 0.80040622, + "learning_rate": 6.803265082395711e-07, + "loss": 0.81555194, + "num_input_tokens_seen": 264591610, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.27111816, + "step": 12266, + "time_per_iteration": 2.7268521785736084 + }, + { + "auxiliary_loss_clip": 0.01245628, + "auxiliary_loss_mlp": 0.0023059, + "balance_loss_clip": 1.03062201, + "balance_loss_mlp": 0.20611608, + "epoch": 0.7375319404779799, + "flos": 27156115624320.0, + "grad_norm": 6.097709618072031, + "language_loss": 0.82546854, + "learning_rate": 6.800338882541576e-07, + "loss": 0.8402307, + "num_input_tokens_seen": 264611170, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.24499512, + "step": 12267, + "time_per_iteration": 4.131671667098999 + }, + { + "auxiliary_loss_clip": 0.01238468, + "auxiliary_loss_mlp": 0.00227645, + "balance_loss_clip": 1.02487183, + "balance_loss_mlp": 0.20437524, + "epoch": 0.7375920637306478, + "flos": 18879999701760.0, + "grad_norm": 74.53478304120783, + "language_loss": 0.9193908, + "learning_rate": 6.797413183219923e-07, + "loss": 0.93405187, + "num_input_tokens_seen": 264629365, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.23254395, + "step": 12268, + "time_per_iteration": 2.709599018096924 + }, + { + "auxiliary_loss_clip": 0.01263877, + "auxiliary_loss_mlp": 0.00237554, + "balance_loss_clip": 1.04738247, + "balance_loss_mlp": 0.21116114, + "epoch": 0.7376521869833158, + "flos": 15669765486720.0, + "grad_norm": 18.0545236519316, + "language_loss": 0.82559305, + "learning_rate": 6.794487984541677e-07, + "loss": 0.84060735, + "num_input_tokens_seen": 264647915, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.26367188, + "step": 12269, + "time_per_iteration": 2.6595513820648193 + }, + { + "auxiliary_loss_clip": 0.01288952, + "auxiliary_loss_mlp": 0.00251844, + "balance_loss_clip": 1.06072092, + "balance_loss_mlp": 0.22573701, + "epoch": 0.7377123102359837, + "flos": 36971973901440.0, + "grad_norm": 21.540670517611055, + "language_loss": 0.77562159, + "learning_rate": 6.791563286617776e-07, + "loss": 0.79102951, + "num_input_tokens_seen": 264669620, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.26123047, + "step": 12270, + "time_per_iteration": 2.8648931980133057 + }, + { + "auxiliary_loss_clip": 0.01272303, + "auxiliary_loss_mlp": 0.00239467, + "balance_loss_clip": 1.05074787, + "balance_loss_mlp": 0.21603025, + "epoch": 0.7377724334886517, + "flos": 24496284487680.0, + "grad_norm": 7.907991593908506, + "language_loss": 0.76789749, + "learning_rate": 6.788639089559119e-07, + "loss": 0.78301513, + "num_input_tokens_seen": 264689345, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.234375, + "step": 12271, + "time_per_iteration": 2.689436435699463 + }, + { + "auxiliary_loss_clip": 0.01275713, + "auxiliary_loss_mlp": 0.00215179, + "balance_loss_clip": 1.0491749, + "balance_loss_mlp": 0.19115818, + "epoch": 0.7378325567413198, + "flos": 24390025079040.0, + "grad_norm": 9.651710794190384, + "language_loss": 0.78309858, + "learning_rate": 6.785715393476586e-07, + "loss": 0.79800749, + "num_input_tokens_seen": 264707625, + "router_z_loss_clip": 2.26269531, + "router_z_loss_mlp": 0.2401123, + "step": 12272, + "time_per_iteration": 2.7671549320220947 + }, + { + "auxiliary_loss_clip": 0.01251069, + "auxiliary_loss_mlp": 0.00222435, + "balance_loss_clip": 1.03780067, + "balance_loss_mlp": 0.19935597, + "epoch": 0.7378926799939877, + "flos": 17416388223360.0, + "grad_norm": 34.057397833390596, + "language_loss": 0.84933525, + "learning_rate": 6.782792198481049e-07, + "loss": 0.8640703, + "num_input_tokens_seen": 264725575, + "router_z_loss_clip": 2.13183594, + "router_z_loss_mlp": 0.23071289, + "step": 12273, + "time_per_iteration": 2.841933012008667 + }, + { + "auxiliary_loss_clip": 0.0124326, + "auxiliary_loss_mlp": 0.0022524, + "balance_loss_clip": 1.03170455, + "balance_loss_mlp": 0.20181563, + "epoch": 0.7379528032466557, + "flos": 18474208778880.0, + "grad_norm": 1698.6095848733887, + "language_loss": 0.89481866, + "learning_rate": 6.779869504683355e-07, + "loss": 0.9095037, + "num_input_tokens_seen": 264742855, + "router_z_loss_clip": 2.11425781, + "router_z_loss_mlp": 0.23388672, + "step": 12274, + "time_per_iteration": 2.704786539077759 + }, + { + "auxiliary_loss_clip": 0.01283354, + "auxiliary_loss_mlp": 0.00241358, + "balance_loss_clip": 1.05621898, + "balance_loss_mlp": 0.2143811, + "epoch": 0.7380129264993236, + "flos": 17821999578240.0, + "grad_norm": 52.27709756389123, + "language_loss": 0.85283536, + "learning_rate": 6.776947312194341e-07, + "loss": 0.86808252, + "num_input_tokens_seen": 264761155, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.26977539, + "step": 12275, + "time_per_iteration": 2.8179750442504883 + }, + { + "auxiliary_loss_clip": 0.01281855, + "auxiliary_loss_mlp": 0.00244288, + "balance_loss_clip": 1.05314374, + "balance_loss_mlp": 0.21851538, + "epoch": 0.7380730497519916, + "flos": 22997372918400.0, + "grad_norm": 8.648323019593299, + "language_loss": 0.82045901, + "learning_rate": 6.774025621124813e-07, + "loss": 0.83572048, + "num_input_tokens_seen": 264780660, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.25769043, + "step": 12276, + "time_per_iteration": 2.9179494380950928 + }, + { + "auxiliary_loss_clip": 0.01271118, + "auxiliary_loss_mlp": 0.00250761, + "balance_loss_clip": 1.05032635, + "balance_loss_mlp": 0.2261202, + "epoch": 0.7381331730046595, + "flos": 20266259241600.0, + "grad_norm": 83.0724155161785, + "language_loss": 0.85610789, + "learning_rate": 6.771104431585551e-07, + "loss": 0.87132668, + "num_input_tokens_seen": 264798850, + "router_z_loss_clip": 2.20996094, + "router_z_loss_mlp": 0.24682617, + "step": 12277, + "time_per_iteration": 2.7011587619781494 + }, + { + "auxiliary_loss_clip": 0.01254575, + "auxiliary_loss_mlp": 0.00221872, + "balance_loss_clip": 1.03911924, + "balance_loss_mlp": 0.19782773, + "epoch": 0.7381932962573275, + "flos": 19754532132480.0, + "grad_norm": 491.14116470886864, + "language_loss": 0.87245536, + "learning_rate": 6.768183743687338e-07, + "loss": 0.88721979, + "num_input_tokens_seen": 264816795, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.24060059, + "step": 12278, + "time_per_iteration": 2.656581163406372 + }, + { + "auxiliary_loss_clip": 0.01268226, + "auxiliary_loss_mlp": 0.00235979, + "balance_loss_clip": 1.04583597, + "balance_loss_mlp": 0.2112309, + "epoch": 0.7382534195099955, + "flos": 17305316392320.0, + "grad_norm": 6.10151170029027, + "language_loss": 0.80919015, + "learning_rate": 6.765263557540921e-07, + "loss": 0.82423222, + "num_input_tokens_seen": 264834105, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.24755859, + "step": 12279, + "time_per_iteration": 2.675281524658203 + }, + { + "auxiliary_loss_clip": 0.01251746, + "auxiliary_loss_mlp": 0.00222206, + "balance_loss_clip": 1.03133798, + "balance_loss_mlp": 0.19717184, + "epoch": 0.7383135427626635, + "flos": 18697358021760.0, + "grad_norm": 99.61362887716027, + "language_loss": 0.93835306, + "learning_rate": 6.762343873257034e-07, + "loss": 0.95309258, + "num_input_tokens_seen": 264850895, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.25024414, + "step": 12280, + "time_per_iteration": 2.6653239727020264 + }, + { + "auxiliary_loss_clip": 0.01265731, + "auxiliary_loss_mlp": 0.00249372, + "balance_loss_clip": 1.04545259, + "balance_loss_mlp": 0.2250298, + "epoch": 0.7383736660153314, + "flos": 20881300844160.0, + "grad_norm": 19.500705371729683, + "language_loss": 0.80838352, + "learning_rate": 6.759424690946408e-07, + "loss": 0.82353455, + "num_input_tokens_seen": 264869505, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.2434082, + "step": 12281, + "time_per_iteration": 2.713010311126709 + }, + { + "auxiliary_loss_clip": 0.01272837, + "auxiliary_loss_mlp": 0.00235428, + "balance_loss_clip": 1.04801512, + "balance_loss_mlp": 0.20780769, + "epoch": 0.7384337892679994, + "flos": 20663215418880.0, + "grad_norm": 24.65418401691631, + "language_loss": 0.72160614, + "learning_rate": 6.756506010719711e-07, + "loss": 0.73668879, + "num_input_tokens_seen": 264886915, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.27612305, + "step": 12282, + "time_per_iteration": 2.6481986045837402 + }, + { + "auxiliary_loss_clip": 0.01288764, + "auxiliary_loss_mlp": 0.00238928, + "balance_loss_clip": 1.05980253, + "balance_loss_mlp": 0.21090215, + "epoch": 0.7384939125206673, + "flos": 29169627390720.0, + "grad_norm": 7.163832964918908, + "language_loss": 0.76737976, + "learning_rate": 6.753587832687632e-07, + "loss": 0.78265667, + "num_input_tokens_seen": 264910350, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.28039551, + "step": 12283, + "time_per_iteration": 2.7424838542938232 + }, + { + "auxiliary_loss_clip": 0.01249694, + "auxiliary_loss_mlp": 0.00260917, + "balance_loss_clip": 1.0344255, + "balance_loss_mlp": 0.23699155, + "epoch": 0.7385540357733353, + "flos": 36312833376000.0, + "grad_norm": 10.627468975322424, + "language_loss": 0.82942367, + "learning_rate": 6.750670156960832e-07, + "loss": 0.84452981, + "num_input_tokens_seen": 264930705, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.23937988, + "step": 12284, + "time_per_iteration": 2.768044948577881 + }, + { + "auxiliary_loss_clip": 0.01263038, + "auxiliary_loss_mlp": 0.00233724, + "balance_loss_clip": 1.04163456, + "balance_loss_mlp": 0.2076173, + "epoch": 0.7386141590260034, + "flos": 20302600826880.0, + "grad_norm": 144.40660542347334, + "language_loss": 0.79421377, + "learning_rate": 6.747752983649954e-07, + "loss": 0.80918139, + "num_input_tokens_seen": 264946975, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.26098633, + "step": 12285, + "time_per_iteration": 2.692070960998535 + }, + { + "auxiliary_loss_clip": 0.01283501, + "auxiliary_loss_mlp": 0.00258466, + "balance_loss_clip": 1.05635345, + "balance_loss_mlp": 0.23338485, + "epoch": 0.7386742822786713, + "flos": 25483792170240.0, + "grad_norm": 1270.9824745499013, + "language_loss": 0.86845928, + "learning_rate": 6.744836312865602e-07, + "loss": 0.88387901, + "num_input_tokens_seen": 264967665, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.25085449, + "step": 12286, + "time_per_iteration": 2.698803663253784 + }, + { + "auxiliary_loss_clip": 0.01266553, + "auxiliary_loss_mlp": 0.00231125, + "balance_loss_clip": 1.04712057, + "balance_loss_mlp": 0.20725946, + "epoch": 0.7387344055313393, + "flos": 13771958405760.0, + "grad_norm": 8.221250154825556, + "language_loss": 0.74456543, + "learning_rate": 6.741920144718396e-07, + "loss": 0.75954217, + "num_input_tokens_seen": 264985480, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.23852539, + "step": 12287, + "time_per_iteration": 2.71813702583313 + }, + { + "auxiliary_loss_clip": 0.01273176, + "auxiliary_loss_mlp": 0.00247231, + "balance_loss_clip": 1.05340946, + "balance_loss_mlp": 0.2247127, + "epoch": 0.7387945287840072, + "flos": 27855189095040.0, + "grad_norm": 6.610940292304719, + "language_loss": 0.83611321, + "learning_rate": 6.739004479318903e-07, + "loss": 0.85131729, + "num_input_tokens_seen": 265004790, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.22509766, + "step": 12288, + "time_per_iteration": 2.780893325805664 + }, + { + "auxiliary_loss_clip": 0.01290982, + "auxiliary_loss_mlp": 0.00231034, + "balance_loss_clip": 1.05982661, + "balance_loss_mlp": 0.20630975, + "epoch": 0.7388546520366752, + "flos": 44233039388160.0, + "grad_norm": 3.663666178520096, + "language_loss": 0.6531955, + "learning_rate": 6.736089316777684e-07, + "loss": 0.66841567, + "num_input_tokens_seen": 265028790, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.24707031, + "step": 12289, + "time_per_iteration": 2.921173095703125 + }, + { + "auxiliary_loss_clip": 0.01182236, + "auxiliary_loss_mlp": 0.00084526, + "balance_loss_clip": 1.02642405, + "balance_loss_mlp": 0.0781362, + "epoch": 0.7389147752893431, + "flos": 70680890638080.0, + "grad_norm": 0.6368397395786621, + "language_loss": 0.48806775, + "learning_rate": 6.733174657205287e-07, + "loss": 0.5007354, + "num_input_tokens_seen": 265096660, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.06396484, + "step": 12290, + "time_per_iteration": 3.355513334274292 + }, + { + "auxiliary_loss_clip": 0.01276853, + "auxiliary_loss_mlp": 0.00237956, + "balance_loss_clip": 1.05529928, + "balance_loss_mlp": 0.21231395, + "epoch": 0.7389748985420111, + "flos": 25994980575360.0, + "grad_norm": 71.85350166931549, + "language_loss": 0.78167808, + "learning_rate": 6.730260500712237e-07, + "loss": 0.79682612, + "num_input_tokens_seen": 265116375, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.25634766, + "step": 12291, + "time_per_iteration": 2.7469441890716553 + }, + { + "auxiliary_loss_clip": 0.01182127, + "auxiliary_loss_mlp": 0.00079594, + "balance_loss_clip": 1.02547216, + "balance_loss_mlp": 0.07287049, + "epoch": 0.7390350217946791, + "flos": 54403661318400.0, + "grad_norm": 0.9859024954114693, + "language_loss": 0.60809267, + "learning_rate": 6.727346847409052e-07, + "loss": 0.6207099, + "num_input_tokens_seen": 265161230, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.06738281, + "step": 12292, + "time_per_iteration": 2.833390474319458 + }, + { + "auxiliary_loss_clip": 0.01262156, + "auxiliary_loss_mlp": 0.00242475, + "balance_loss_clip": 1.04064691, + "balance_loss_mlp": 0.21652311, + "epoch": 0.7390951450473471, + "flos": 32196968530560.0, + "grad_norm": 5.959172625707286, + "language_loss": 0.74711865, + "learning_rate": 6.724433697406191e-07, + "loss": 0.76216495, + "num_input_tokens_seen": 265182515, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.25952148, + "step": 12293, + "time_per_iteration": 2.7658944129943848 + }, + { + "auxiliary_loss_clip": 0.01283685, + "auxiliary_loss_mlp": 0.00216719, + "balance_loss_clip": 1.0599376, + "balance_loss_mlp": 0.19246016, + "epoch": 0.739155268300015, + "flos": 16684241304960.0, + "grad_norm": 8.730579500182467, + "language_loss": 0.90405679, + "learning_rate": 6.721521050814134e-07, + "loss": 0.91906095, + "num_input_tokens_seen": 265198160, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.24267578, + "step": 12294, + "time_per_iteration": 2.7008557319641113 + }, + { + "auxiliary_loss_clip": 0.01256504, + "auxiliary_loss_mlp": 0.00220483, + "balance_loss_clip": 1.03518474, + "balance_loss_mlp": 0.19590238, + "epoch": 0.739215391552683, + "flos": 31649761762560.0, + "grad_norm": 2751.298062486261, + "language_loss": 0.80066085, + "learning_rate": 6.718608907743337e-07, + "loss": 0.81543076, + "num_input_tokens_seen": 265218480, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.24584961, + "step": 12295, + "time_per_iteration": 2.756542921066284 + }, + { + "auxiliary_loss_clip": 0.01249615, + "auxiliary_loss_mlp": 0.00251775, + "balance_loss_clip": 1.0364728, + "balance_loss_mlp": 0.22807637, + "epoch": 0.7392755148053509, + "flos": 29718522097920.0, + "grad_norm": 2.962426504495809, + "language_loss": 0.83596742, + "learning_rate": 6.715697268304215e-07, + "loss": 0.85098135, + "num_input_tokens_seen": 265240165, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.23693848, + "step": 12296, + "time_per_iteration": 2.757657051086426 + }, + { + "auxiliary_loss_clip": 0.01255579, + "auxiliary_loss_mlp": 0.0022476, + "balance_loss_clip": 1.03605461, + "balance_loss_mlp": 0.19922489, + "epoch": 0.7393356380580189, + "flos": 37050475075200.0, + "grad_norm": 7.591795453232218, + "language_loss": 0.76146621, + "learning_rate": 6.712786132607182e-07, + "loss": 0.77626961, + "num_input_tokens_seen": 265263295, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.25524902, + "step": 12297, + "time_per_iteration": 2.8081440925598145 + }, + { + "auxiliary_loss_clip": 0.01249793, + "auxiliary_loss_mlp": 0.00233276, + "balance_loss_clip": 1.03533149, + "balance_loss_mlp": 0.20731206, + "epoch": 0.739395761310687, + "flos": 19719627091200.0, + "grad_norm": 46.22600802887809, + "language_loss": 0.76785457, + "learning_rate": 6.709875500762645e-07, + "loss": 0.78268522, + "num_input_tokens_seen": 265282740, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.25964355, + "step": 12298, + "time_per_iteration": 2.6965579986572266 + }, + { + "auxiliary_loss_clip": 0.01267205, + "auxiliary_loss_mlp": 0.00255648, + "balance_loss_clip": 1.0432874, + "balance_loss_mlp": 0.22925499, + "epoch": 0.7394558845633549, + "flos": 11801504067840.0, + "grad_norm": 4.182858771359563, + "language_loss": 0.8288182, + "learning_rate": 6.706965372880946e-07, + "loss": 0.84404671, + "num_input_tokens_seen": 265300175, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.26403809, + "step": 12299, + "time_per_iteration": 2.6821885108947754 + }, + { + "auxiliary_loss_clip": 0.01187769, + "auxiliary_loss_mlp": 0.00071432, + "balance_loss_clip": 1.03289342, + "balance_loss_mlp": 0.06385061, + "epoch": 0.7395160078160229, + "flos": 66195827850240.0, + "grad_norm": 0.7342932719568352, + "language_loss": 0.59923506, + "learning_rate": 6.704055749072455e-07, + "loss": 0.61182708, + "num_input_tokens_seen": 265363275, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.07568359, + "step": 12300, + "time_per_iteration": 4.596611976623535 + }, + { + "auxiliary_loss_clip": 0.01271922, + "auxiliary_loss_mlp": 0.00235956, + "balance_loss_clip": 1.0516479, + "balance_loss_mlp": 0.21145865, + "epoch": 0.7395761310686908, + "flos": 21249708687360.0, + "grad_norm": 3.1507778748855384, + "language_loss": 0.86943674, + "learning_rate": 6.7011466294475e-07, + "loss": 0.88451552, + "num_input_tokens_seen": 265382935, + "router_z_loss_clip": 2.20019531, + "router_z_loss_mlp": 0.24475098, + "step": 12301, + "time_per_iteration": 4.1425461769104 + }, + { + "auxiliary_loss_clip": 0.0125001, + "auxiliary_loss_mlp": 0.00240975, + "balance_loss_clip": 1.03897214, + "balance_loss_mlp": 0.21811107, + "epoch": 0.7396362543213588, + "flos": 25955299025280.0, + "grad_norm": 37.06473847077831, + "language_loss": 0.78968471, + "learning_rate": 6.698238014116406e-07, + "loss": 0.80459452, + "num_input_tokens_seen": 265403245, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.22875977, + "step": 12302, + "time_per_iteration": 2.790836811065674 + }, + { + "auxiliary_loss_clip": 0.01278539, + "auxiliary_loss_mlp": 0.00228829, + "balance_loss_clip": 1.05216038, + "balance_loss_mlp": 0.20471299, + "epoch": 0.7396963775740267, + "flos": 27377936064000.0, + "grad_norm": 35.62010155916454, + "language_loss": 0.82004601, + "learning_rate": 6.695329903189451e-07, + "loss": 0.83511966, + "num_input_tokens_seen": 265423105, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.24145508, + "step": 12303, + "time_per_iteration": 2.725104570388794 + }, + { + "auxiliary_loss_clip": 0.01249287, + "auxiliary_loss_mlp": 0.00232886, + "balance_loss_clip": 1.03585553, + "balance_loss_mlp": 0.20813853, + "epoch": 0.7397565008266948, + "flos": 25520133755520.0, + "grad_norm": 8.614418395697774, + "language_loss": 0.60980415, + "learning_rate": 6.692422296776927e-07, + "loss": 0.62462592, + "num_input_tokens_seen": 265443445, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.24755859, + "step": 12304, + "time_per_iteration": 4.96955680847168 + }, + { + "auxiliary_loss_clip": 0.01261649, + "auxiliary_loss_mlp": 0.0024685, + "balance_loss_clip": 1.04144168, + "balance_loss_mlp": 0.22093381, + "epoch": 0.7398166240793627, + "flos": 23727760070400.0, + "grad_norm": 28.143512243882352, + "language_loss": 0.90468025, + "learning_rate": 6.689515194989084e-07, + "loss": 0.91976523, + "num_input_tokens_seen": 265462085, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.2590332, + "step": 12305, + "time_per_iteration": 2.683502197265625 + }, + { + "auxiliary_loss_clip": 0.01201364, + "auxiliary_loss_mlp": 0.00068651, + "balance_loss_clip": 1.04203963, + "balance_loss_mlp": 0.06202263, + "epoch": 0.7398767473320307, + "flos": 67267582882560.0, + "grad_norm": 0.8631616631077789, + "language_loss": 0.574377, + "learning_rate": 6.68660859793615e-07, + "loss": 0.58707714, + "num_input_tokens_seen": 265521190, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.06640625, + "step": 12306, + "time_per_iteration": 3.205760955810547 + }, + { + "auxiliary_loss_clip": 0.01278385, + "auxiliary_loss_mlp": 0.00210827, + "balance_loss_clip": 1.0519197, + "balance_loss_mlp": 0.18368325, + "epoch": 0.7399368705846986, + "flos": 22018699981440.0, + "grad_norm": 12.261800877866241, + "language_loss": 0.90508467, + "learning_rate": 6.683702505728355e-07, + "loss": 0.91997677, + "num_input_tokens_seen": 265539705, + "router_z_loss_clip": 2.265625, + "router_z_loss_mlp": 0.2713623, + "step": 12307, + "time_per_iteration": 2.7140860557556152 + }, + { + "auxiliary_loss_clip": 0.0124699, + "auxiliary_loss_mlp": 0.00233962, + "balance_loss_clip": 1.03276002, + "balance_loss_mlp": 0.21027523, + "epoch": 0.7399969938373666, + "flos": 14173870659840.0, + "grad_norm": 41.94612032727318, + "language_loss": 0.78864467, + "learning_rate": 6.680796918475893e-07, + "loss": 0.80345416, + "num_input_tokens_seen": 265555855, + "router_z_loss_clip": 2.13964844, + "router_z_loss_mlp": 0.23681641, + "step": 12308, + "time_per_iteration": 2.680783271789551 + }, + { + "auxiliary_loss_clip": 0.01249394, + "auxiliary_loss_mlp": 0.00246044, + "balance_loss_clip": 1.03488445, + "balance_loss_mlp": 0.22300103, + "epoch": 0.7400571170900345, + "flos": 25301473712640.0, + "grad_norm": 88.13468179700475, + "language_loss": 0.89085853, + "learning_rate": 6.67789183628896e-07, + "loss": 0.90581292, + "num_input_tokens_seen": 265575455, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.23046875, + "step": 12309, + "time_per_iteration": 4.143434286117554 + }, + { + "auxiliary_loss_clip": 0.0127052, + "auxiliary_loss_mlp": 0.00246445, + "balance_loss_clip": 1.04760885, + "balance_loss_mlp": 0.2202428, + "epoch": 0.7401172403427025, + "flos": 22711344917760.0, + "grad_norm": 2.131792418124988, + "language_loss": 0.80502224, + "learning_rate": 6.674987259277692e-07, + "loss": 0.82019192, + "num_input_tokens_seen": 265595250, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.26220703, + "step": 12310, + "time_per_iteration": 2.70511531829834 + }, + { + "auxiliary_loss_clip": 0.01271008, + "auxiliary_loss_mlp": 0.00245146, + "balance_loss_clip": 1.05160046, + "balance_loss_mlp": 0.22015955, + "epoch": 0.7401773635953706, + "flos": 18067448188800.0, + "grad_norm": 6.525572646256948, + "language_loss": 0.9635663, + "learning_rate": 6.672083187552239e-07, + "loss": 0.97872782, + "num_input_tokens_seen": 265606945, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.25, + "step": 12311, + "time_per_iteration": 2.6612541675567627 + }, + { + "auxiliary_loss_clip": 0.0127298, + "auxiliary_loss_mlp": 0.00249347, + "balance_loss_clip": 1.0553329, + "balance_loss_mlp": 0.2237291, + "epoch": 0.7402374868480385, + "flos": 22712135016960.0, + "grad_norm": 18.776523666994663, + "language_loss": 0.86374891, + "learning_rate": 6.669179621222738e-07, + "loss": 0.87897211, + "num_input_tokens_seen": 265626115, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.25646973, + "step": 12312, + "time_per_iteration": 2.7320120334625244 + }, + { + "auxiliary_loss_clip": 0.0126986, + "auxiliary_loss_mlp": 0.00240103, + "balance_loss_clip": 1.05643106, + "balance_loss_mlp": 0.21617815, + "epoch": 0.7402976101007065, + "flos": 22856675345280.0, + "grad_norm": 6.934692127465287, + "language_loss": 0.84799212, + "learning_rate": 6.666276560399273e-07, + "loss": 0.86309177, + "num_input_tokens_seen": 265646520, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.23925781, + "step": 12313, + "time_per_iteration": 2.760917901992798 + }, + { + "auxiliary_loss_clip": 0.01298998, + "auxiliary_loss_mlp": 0.00245384, + "balance_loss_clip": 1.06722915, + "balance_loss_mlp": 0.21887223, + "epoch": 0.7403577333533744, + "flos": 12345801834240.0, + "grad_norm": 24.84446095604338, + "language_loss": 0.85565025, + "learning_rate": 6.663374005191937e-07, + "loss": 0.87109405, + "num_input_tokens_seen": 265661875, + "router_z_loss_clip": 2.32226562, + "router_z_loss_mlp": 0.26489258, + "step": 12314, + "time_per_iteration": 2.6825313568115234 + }, + { + "auxiliary_loss_clip": 0.01180918, + "auxiliary_loss_mlp": 0.0006326, + "balance_loss_clip": 1.02496767, + "balance_loss_mlp": 0.05644093, + "epoch": 0.7404178566060424, + "flos": 60327270869760.0, + "grad_norm": 0.8222137382726563, + "language_loss": 0.54746795, + "learning_rate": 6.660471955710809e-07, + "loss": 0.5599097, + "num_input_tokens_seen": 265721255, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.06835938, + "step": 12315, + "time_per_iteration": 3.180222511291504 + }, + { + "auxiliary_loss_clip": 0.01261389, + "auxiliary_loss_mlp": 0.00237329, + "balance_loss_clip": 1.04536736, + "balance_loss_mlp": 0.21372585, + "epoch": 0.7404779798587103, + "flos": 32014650072960.0, + "grad_norm": 27.87922019102722, + "language_loss": 0.85977137, + "learning_rate": 6.65757041206591e-07, + "loss": 0.8747586, + "num_input_tokens_seen": 265743970, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.23596191, + "step": 12316, + "time_per_iteration": 2.7724194526672363 + }, + { + "auxiliary_loss_clip": 0.0126291, + "auxiliary_loss_mlp": 0.00264082, + "balance_loss_clip": 1.0444746, + "balance_loss_mlp": 0.23892914, + "epoch": 0.7405381031113784, + "flos": 12889704551040.0, + "grad_norm": 9.211984609038568, + "language_loss": 0.84669304, + "learning_rate": 6.654669374367275e-07, + "loss": 0.86196297, + "num_input_tokens_seen": 265760890, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.25134277, + "step": 12317, + "time_per_iteration": 2.6675539016723633 + }, + { + "auxiliary_loss_clip": 0.0124324, + "auxiliary_loss_mlp": 0.00243878, + "balance_loss_clip": 1.03436852, + "balance_loss_mlp": 0.22021475, + "epoch": 0.7405982263640463, + "flos": 20229127557120.0, + "grad_norm": 3.8344483651347314, + "language_loss": 0.88487709, + "learning_rate": 6.651768842724917e-07, + "loss": 0.89974827, + "num_input_tokens_seen": 265779600, + "router_z_loss_clip": 2.08691406, + "router_z_loss_mlp": 0.23657227, + "step": 12318, + "time_per_iteration": 2.7700178623199463 + }, + { + "auxiliary_loss_clip": 0.01279761, + "auxiliary_loss_mlp": 0.00219607, + "balance_loss_clip": 1.05663967, + "balance_loss_mlp": 0.19603959, + "epoch": 0.7406583496167143, + "flos": 17567213431680.0, + "grad_norm": 50.350757860801636, + "language_loss": 0.85339892, + "learning_rate": 6.648868817248827e-07, + "loss": 0.86839259, + "num_input_tokens_seen": 265797030, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.23571777, + "step": 12319, + "time_per_iteration": 2.645869016647339 + }, + { + "auxiliary_loss_clip": 0.01264088, + "auxiliary_loss_mlp": 0.00252986, + "balance_loss_clip": 1.04723525, + "balance_loss_mlp": 0.23008566, + "epoch": 0.7407184728693822, + "flos": 18295733076480.0, + "grad_norm": 9.4393129128646, + "language_loss": 0.73316777, + "learning_rate": 6.64596929804897e-07, + "loss": 0.74833852, + "num_input_tokens_seen": 265815055, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.22912598, + "step": 12320, + "time_per_iteration": 2.6533761024475098 + }, + { + "auxiliary_loss_clip": 0.01290181, + "auxiliary_loss_mlp": 0.00231631, + "balance_loss_clip": 1.06447577, + "balance_loss_mlp": 0.20510654, + "epoch": 0.7407785961220502, + "flos": 16690562098560.0, + "grad_norm": 12.206445506125954, + "language_loss": 0.93864954, + "learning_rate": 6.643070285235288e-07, + "loss": 0.95386755, + "num_input_tokens_seen": 265828480, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.26525879, + "step": 12321, + "time_per_iteration": 2.6335253715515137 + }, + { + "auxiliary_loss_clip": 0.01311885, + "auxiliary_loss_mlp": 0.00259949, + "balance_loss_clip": 1.07730031, + "balance_loss_mlp": 0.23176795, + "epoch": 0.7408387193747181, + "flos": 22088330496000.0, + "grad_norm": 114.08934743370307, + "language_loss": 0.80890179, + "learning_rate": 6.640171778917727e-07, + "loss": 0.82462025, + "num_input_tokens_seen": 265845825, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.28186035, + "step": 12322, + "time_per_iteration": 2.6503727436065674 + }, + { + "auxiliary_loss_clip": 0.01273417, + "auxiliary_loss_mlp": 0.00221632, + "balance_loss_clip": 1.05282807, + "balance_loss_mlp": 0.19689611, + "epoch": 0.7408988426273861, + "flos": 24236721832320.0, + "grad_norm": 144.39479727663962, + "language_loss": 0.71623683, + "learning_rate": 6.637273779206183e-07, + "loss": 0.73118734, + "num_input_tokens_seen": 265866335, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.24743652, + "step": 12323, + "time_per_iteration": 2.6856162548065186 + }, + { + "auxiliary_loss_clip": 0.01274207, + "auxiliary_loss_mlp": 0.00237923, + "balance_loss_clip": 1.04965639, + "balance_loss_mlp": 0.21180406, + "epoch": 0.7409589658800542, + "flos": 29023004073600.0, + "grad_norm": 1152.7223428246532, + "language_loss": 0.82406098, + "learning_rate": 6.634376286210559e-07, + "loss": 0.83918226, + "num_input_tokens_seen": 265888945, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.26123047, + "step": 12324, + "time_per_iteration": 2.7285168170928955 + }, + { + "auxiliary_loss_clip": 0.01259609, + "auxiliary_loss_mlp": 0.00254554, + "balance_loss_clip": 1.03711486, + "balance_loss_mlp": 0.23116533, + "epoch": 0.7410190891327221, + "flos": 19351362902400.0, + "grad_norm": 12.753875034701235, + "language_loss": 0.8112638, + "learning_rate": 6.63147930004073e-07, + "loss": 0.82640541, + "num_input_tokens_seen": 265908030, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.23388672, + "step": 12325, + "time_per_iteration": 2.6304686069488525 + }, + { + "auxiliary_loss_clip": 0.01272615, + "auxiliary_loss_mlp": 0.00232745, + "balance_loss_clip": 1.04718268, + "balance_loss_mlp": 0.20691228, + "epoch": 0.7410792123853901, + "flos": 22747650589440.0, + "grad_norm": 51.61392070808152, + "language_loss": 0.79252505, + "learning_rate": 6.628582820806545e-07, + "loss": 0.80757862, + "num_input_tokens_seen": 265927030, + "router_z_loss_clip": 2.25488281, + "router_z_loss_mlp": 0.25842285, + "step": 12326, + "time_per_iteration": 2.6819567680358887 + }, + { + "auxiliary_loss_clip": 0.01268437, + "auxiliary_loss_mlp": 0.00223752, + "balance_loss_clip": 1.04557002, + "balance_loss_mlp": 0.20054199, + "epoch": 0.741139335638058, + "flos": 25372433030400.0, + "grad_norm": 212.69080790820658, + "language_loss": 0.94677323, + "learning_rate": 6.625686848617835e-07, + "loss": 0.96169513, + "num_input_tokens_seen": 265945490, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.2322998, + "step": 12327, + "time_per_iteration": 2.7108561992645264 + }, + { + "auxiliary_loss_clip": 0.01292858, + "auxiliary_loss_mlp": 0.00236716, + "balance_loss_clip": 1.06301355, + "balance_loss_mlp": 0.20919031, + "epoch": 0.741199458890726, + "flos": 18585639745920.0, + "grad_norm": 16.131904842700603, + "language_loss": 0.94526333, + "learning_rate": 6.62279138358442e-07, + "loss": 0.96055907, + "num_input_tokens_seen": 265963265, + "router_z_loss_clip": 2.30078125, + "router_z_loss_mlp": 0.27526855, + "step": 12328, + "time_per_iteration": 2.6415703296661377 + }, + { + "auxiliary_loss_clip": 0.01267565, + "auxiliary_loss_mlp": 0.00251723, + "balance_loss_clip": 1.04815733, + "balance_loss_mlp": 0.22618826, + "epoch": 0.7412595821433939, + "flos": 22127078292480.0, + "grad_norm": 58.537342001614725, + "language_loss": 0.73510647, + "learning_rate": 6.619896425816103e-07, + "loss": 0.75029939, + "num_input_tokens_seen": 265982270, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.25524902, + "step": 12329, + "time_per_iteration": 2.747649908065796 + }, + { + "auxiliary_loss_clip": 0.01307838, + "auxiliary_loss_mlp": 0.00282692, + "balance_loss_clip": 1.07589221, + "balance_loss_mlp": 0.25651357, + "epoch": 0.741319705396062, + "flos": 29169699217920.0, + "grad_norm": 19.722489259761467, + "language_loss": 0.73948866, + "learning_rate": 6.617001975422647e-07, + "loss": 0.75539398, + "num_input_tokens_seen": 266003835, + "router_z_loss_clip": 2.32226562, + "router_z_loss_mlp": 0.26196289, + "step": 12330, + "time_per_iteration": 2.708306074142456 + }, + { + "auxiliary_loss_clip": 0.012972, + "auxiliary_loss_mlp": 0.00266641, + "balance_loss_clip": 1.06849432, + "balance_loss_mlp": 0.23894837, + "epoch": 0.7413798286487299, + "flos": 20667489137280.0, + "grad_norm": 1907.6995867907722, + "language_loss": 0.95358223, + "learning_rate": 6.614108032513823e-07, + "loss": 0.96922064, + "num_input_tokens_seen": 266021595, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.27697754, + "step": 12331, + "time_per_iteration": 2.6816039085388184 + }, + { + "auxiliary_loss_clip": 0.0126682, + "auxiliary_loss_mlp": 0.00247874, + "balance_loss_clip": 1.04758477, + "balance_loss_mlp": 0.22437817, + "epoch": 0.7414399519013979, + "flos": 16398895662720.0, + "grad_norm": 13.610097619950784, + "language_loss": 0.77901328, + "learning_rate": 6.611214597199364e-07, + "loss": 0.79416019, + "num_input_tokens_seen": 266039860, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.23474121, + "step": 12332, + "time_per_iteration": 2.6564199924468994 + }, + { + "auxiliary_loss_clip": 0.01269705, + "auxiliary_loss_mlp": 0.00243835, + "balance_loss_clip": 1.047719, + "balance_loss_mlp": 0.21684563, + "epoch": 0.7415000751540658, + "flos": 25630235919360.0, + "grad_norm": 52.988527000328595, + "language_loss": 0.73411912, + "learning_rate": 6.608321669588984e-07, + "loss": 0.74925447, + "num_input_tokens_seen": 266058050, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.27001953, + "step": 12333, + "time_per_iteration": 2.7015342712402344 + }, + { + "auxiliary_loss_clip": 0.01260696, + "auxiliary_loss_mlp": 0.00231982, + "balance_loss_clip": 1.04454207, + "balance_loss_mlp": 0.20750877, + "epoch": 0.7415601984067338, + "flos": 24499732193280.0, + "grad_norm": 3.870097106515917, + "language_loss": 0.77805012, + "learning_rate": 6.605429249792387e-07, + "loss": 0.79297698, + "num_input_tokens_seen": 266078060, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.24499512, + "step": 12334, + "time_per_iteration": 2.7093048095703125 + }, + { + "auxiliary_loss_clip": 0.01286268, + "auxiliary_loss_mlp": 0.00251532, + "balance_loss_clip": 1.06230116, + "balance_loss_mlp": 0.22714148, + "epoch": 0.7416203216594017, + "flos": 20887154760960.0, + "grad_norm": 6.0518320904831056, + "language_loss": 0.88798124, + "learning_rate": 6.602537337919257e-07, + "loss": 0.90335923, + "num_input_tokens_seen": 266097110, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.24401855, + "step": 12335, + "time_per_iteration": 2.7232179641723633 + }, + { + "auxiliary_loss_clip": 0.01282102, + "auxiliary_loss_mlp": 0.0025603, + "balance_loss_clip": 1.05613661, + "balance_loss_mlp": 0.22916003, + "epoch": 0.7416804449120697, + "flos": 15624265933440.0, + "grad_norm": 49.1363773580001, + "language_loss": 0.85179108, + "learning_rate": 6.599645934079259e-07, + "loss": 0.86717236, + "num_input_tokens_seen": 266110870, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.26855469, + "step": 12336, + "time_per_iteration": 2.588747501373291 + }, + { + "auxiliary_loss_clip": 0.01290757, + "auxiliary_loss_mlp": 0.00229421, + "balance_loss_clip": 1.06101012, + "balance_loss_mlp": 0.20373128, + "epoch": 0.7417405681647377, + "flos": 17120483982720.0, + "grad_norm": 13.1220028417644, + "language_loss": 0.82790154, + "learning_rate": 6.596755038382029e-07, + "loss": 0.84310329, + "num_input_tokens_seen": 266127845, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.25708008, + "step": 12337, + "time_per_iteration": 2.7125251293182373 + }, + { + "auxiliary_loss_clip": 0.01288277, + "auxiliary_loss_mlp": 0.00255678, + "balance_loss_clip": 1.06232846, + "balance_loss_mlp": 0.23076329, + "epoch": 0.7418006914174057, + "flos": 18880322924160.0, + "grad_norm": 7.855338871381207, + "language_loss": 0.83500534, + "learning_rate": 6.593864650937186e-07, + "loss": 0.85044491, + "num_input_tokens_seen": 266145400, + "router_z_loss_clip": 2.25878906, + "router_z_loss_mlp": 0.24890137, + "step": 12338, + "time_per_iteration": 2.6354596614837646 + }, + { + "auxiliary_loss_clip": 0.01259641, + "auxiliary_loss_mlp": 0.00220942, + "balance_loss_clip": 1.04086375, + "balance_loss_mlp": 0.1967068, + "epoch": 0.7418608146700737, + "flos": 21580733450880.0, + "grad_norm": 10.402588911340676, + "language_loss": 0.79122186, + "learning_rate": 6.590974771854345e-07, + "loss": 0.80602771, + "num_input_tokens_seen": 266164430, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.24230957, + "step": 12339, + "time_per_iteration": 2.713149309158325 + }, + { + "auxiliary_loss_clip": 0.01263644, + "auxiliary_loss_mlp": 0.00248999, + "balance_loss_clip": 1.04104662, + "balance_loss_mlp": 0.22375041, + "epoch": 0.7419209379227416, + "flos": 22340459036160.0, + "grad_norm": 6.226652093977864, + "language_loss": 0.88181746, + "learning_rate": 6.588085401243077e-07, + "loss": 0.89694393, + "num_input_tokens_seen": 266183855, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.25280762, + "step": 12340, + "time_per_iteration": 2.7532715797424316 + }, + { + "auxiliary_loss_clip": 0.0126604, + "auxiliary_loss_mlp": 0.00240102, + "balance_loss_clip": 1.04737139, + "balance_loss_mlp": 0.21486507, + "epoch": 0.7419810611754096, + "flos": 16762275601920.0, + "grad_norm": 6.058950939536265, + "language_loss": 0.81424636, + "learning_rate": 6.585196539212958e-07, + "loss": 0.82930779, + "num_input_tokens_seen": 266202085, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.25244141, + "step": 12341, + "time_per_iteration": 2.6485252380371094 + }, + { + "auxiliary_loss_clip": 0.01247024, + "auxiliary_loss_mlp": 0.00254853, + "balance_loss_clip": 1.0386281, + "balance_loss_mlp": 0.23052262, + "epoch": 0.7420411844280775, + "flos": 26212958259840.0, + "grad_norm": 12.425008886197322, + "language_loss": 0.84882271, + "learning_rate": 6.582308185873535e-07, + "loss": 0.86384153, + "num_input_tokens_seen": 266223445, + "router_z_loss_clip": 2.08496094, + "router_z_loss_mlp": 0.24328613, + "step": 12342, + "time_per_iteration": 4.115522861480713 + }, + { + "auxiliary_loss_clip": 0.01276249, + "auxiliary_loss_mlp": 0.00234358, + "balance_loss_clip": 1.05618191, + "balance_loss_mlp": 0.20921716, + "epoch": 0.7421013076807456, + "flos": 68529371840640.0, + "grad_norm": 2.419586108572292, + "language_loss": 0.84504092, + "learning_rate": 6.57942034133433e-07, + "loss": 0.860147, + "num_input_tokens_seen": 266246575, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.25146484, + "step": 12343, + "time_per_iteration": 3.0371694564819336 + }, + { + "auxiliary_loss_clip": 0.01275644, + "auxiliary_loss_mlp": 0.00234607, + "balance_loss_clip": 1.05202043, + "balance_loss_mlp": 0.20954902, + "epoch": 0.7421614309334135, + "flos": 24425325169920.0, + "grad_norm": 9.241459295385848, + "language_loss": 0.74975848, + "learning_rate": 6.576533005704843e-07, + "loss": 0.76486099, + "num_input_tokens_seen": 266266055, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.25061035, + "step": 12344, + "time_per_iteration": 4.090264320373535 + }, + { + "auxiliary_loss_clip": 0.01280745, + "auxiliary_loss_mlp": 0.00239145, + "balance_loss_clip": 1.05608773, + "balance_loss_mlp": 0.21345538, + "epoch": 0.7422215541860815, + "flos": 12311076360960.0, + "grad_norm": 26.787580886230938, + "language_loss": 0.92116559, + "learning_rate": 6.573646179094572e-07, + "loss": 0.93636447, + "num_input_tokens_seen": 266282240, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.25708008, + "step": 12345, + "time_per_iteration": 2.674975872039795 + }, + { + "auxiliary_loss_clip": 0.01291667, + "auxiliary_loss_mlp": 0.00273714, + "balance_loss_clip": 1.06229436, + "balance_loss_mlp": 0.24769068, + "epoch": 0.7422816774387494, + "flos": 19645579203840.0, + "grad_norm": 37.43950005349612, + "language_loss": 0.81339109, + "learning_rate": 6.570759861612988e-07, + "loss": 0.82904494, + "num_input_tokens_seen": 266300980, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.26000977, + "step": 12346, + "time_per_iteration": 2.7408361434936523 + }, + { + "auxiliary_loss_clip": 0.01263868, + "auxiliary_loss_mlp": 0.00236924, + "balance_loss_clip": 1.04140091, + "balance_loss_mlp": 0.21183026, + "epoch": 0.7423418006914174, + "flos": 32015978876160.0, + "grad_norm": 10.113905430718432, + "language_loss": 0.8014791, + "learning_rate": 6.56787405336953e-07, + "loss": 0.81648695, + "num_input_tokens_seen": 266322215, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.25097656, + "step": 12347, + "time_per_iteration": 4.377183198928833 + }, + { + "auxiliary_loss_clip": 0.01304925, + "auxiliary_loss_mlp": 0.00233684, + "balance_loss_clip": 1.07192802, + "balance_loss_mlp": 0.20799448, + "epoch": 0.7424019239440853, + "flos": 18916951818240.0, + "grad_norm": 9.218649138874824, + "language_loss": 0.89985913, + "learning_rate": 6.564988754473642e-07, + "loss": 0.91524518, + "num_input_tokens_seen": 266341600, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.2565918, + "step": 12348, + "time_per_iteration": 2.6815731525421143 + }, + { + "auxiliary_loss_clip": 0.01282174, + "auxiliary_loss_mlp": 0.00240599, + "balance_loss_clip": 1.05747008, + "balance_loss_mlp": 0.21477875, + "epoch": 0.7424620471967533, + "flos": 35876518871040.0, + "grad_norm": 9.864778043461547, + "language_loss": 0.7859416, + "learning_rate": 6.562103965034724e-07, + "loss": 0.80116934, + "num_input_tokens_seen": 266362895, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.25817871, + "step": 12349, + "time_per_iteration": 2.7848119735717773 + }, + { + "auxiliary_loss_clip": 0.01297856, + "auxiliary_loss_mlp": 0.00267954, + "balance_loss_clip": 1.06320059, + "balance_loss_mlp": 0.24028581, + "epoch": 0.7425221704494213, + "flos": 27016603200000.0, + "grad_norm": 15.764120348421692, + "language_loss": 0.87475222, + "learning_rate": 6.559219685162165e-07, + "loss": 0.8904103, + "num_input_tokens_seen": 266384015, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.27697754, + "step": 12350, + "time_per_iteration": 2.812830924987793 + }, + { + "auxiliary_loss_clip": 0.01283197, + "auxiliary_loss_mlp": 0.0025219, + "balance_loss_clip": 1.06045175, + "balance_loss_mlp": 0.22727554, + "epoch": 0.7425822937020893, + "flos": 34167135559680.0, + "grad_norm": 3.790930262190436, + "language_loss": 0.81908989, + "learning_rate": 6.556335914965343e-07, + "loss": 0.83444381, + "num_input_tokens_seen": 266405990, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.24926758, + "step": 12351, + "time_per_iteration": 4.25305962562561 + }, + { + "auxiliary_loss_clip": 0.01281712, + "auxiliary_loss_mlp": 0.00246324, + "balance_loss_clip": 1.05209661, + "balance_loss_mlp": 0.22001421, + "epoch": 0.7426424169547573, + "flos": 21283572234240.0, + "grad_norm": 7.3218196399769395, + "language_loss": 0.90289998, + "learning_rate": 6.553452654553611e-07, + "loss": 0.91818035, + "num_input_tokens_seen": 266424260, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.26318359, + "step": 12352, + "time_per_iteration": 2.653043270111084 + }, + { + "auxiliary_loss_clip": 0.01295569, + "auxiliary_loss_mlp": 0.0024227, + "balance_loss_clip": 1.06440449, + "balance_loss_mlp": 0.21561509, + "epoch": 0.7427025402074252, + "flos": 22448442297600.0, + "grad_norm": 48.575608289907755, + "language_loss": 0.79113585, + "learning_rate": 6.550569904036307e-07, + "loss": 0.8065142, + "num_input_tokens_seen": 266444580, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.26647949, + "step": 12353, + "time_per_iteration": 2.705878496170044 + }, + { + "auxiliary_loss_clip": 0.01292525, + "auxiliary_loss_mlp": 0.00251562, + "balance_loss_clip": 1.07066965, + "balance_loss_mlp": 0.22539514, + "epoch": 0.7427626634600932, + "flos": 22524609087360.0, + "grad_norm": 276.0480346472409, + "language_loss": 0.7935183, + "learning_rate": 6.547687663522739e-07, + "loss": 0.80895913, + "num_input_tokens_seen": 266465640, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.26135254, + "step": 12354, + "time_per_iteration": 2.6751458644866943 + }, + { + "auxiliary_loss_clip": 0.01193975, + "auxiliary_loss_mlp": 0.00130749, + "balance_loss_clip": 1.04040718, + "balance_loss_mlp": 0.12192784, + "epoch": 0.7428227867127611, + "flos": 67209477655680.0, + "grad_norm": 0.6817748025487731, + "language_loss": 0.58936971, + "learning_rate": 6.544805933122199e-07, + "loss": 0.60261697, + "num_input_tokens_seen": 266531950, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.08837891, + "step": 12355, + "time_per_iteration": 3.273772716522217 + }, + { + "auxiliary_loss_clip": 0.01285365, + "auxiliary_loss_mlp": 0.00243105, + "balance_loss_clip": 1.05826378, + "balance_loss_mlp": 0.21575797, + "epoch": 0.7428829099654292, + "flos": 14721221082240.0, + "grad_norm": 10.105509002819252, + "language_loss": 0.76742649, + "learning_rate": 6.541924712943971e-07, + "loss": 0.78271121, + "num_input_tokens_seen": 266550665, + "router_z_loss_clip": 2.265625, + "router_z_loss_mlp": 0.27307129, + "step": 12356, + "time_per_iteration": 2.6638338565826416 + }, + { + "auxiliary_loss_clip": 0.01290599, + "auxiliary_loss_mlp": 0.00246597, + "balance_loss_clip": 1.06291962, + "balance_loss_mlp": 0.22109783, + "epoch": 0.7429430332180971, + "flos": 48646496413440.0, + "grad_norm": 12.248818908404452, + "language_loss": 0.80559218, + "learning_rate": 6.539044003097301e-07, + "loss": 0.82096416, + "num_input_tokens_seen": 266572455, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.25500488, + "step": 12357, + "time_per_iteration": 2.9150428771972656 + }, + { + "auxiliary_loss_clip": 0.01270678, + "auxiliary_loss_mlp": 0.00239347, + "balance_loss_clip": 1.05195463, + "balance_loss_mlp": 0.21517158, + "epoch": 0.7430031564707651, + "flos": 16764071281920.0, + "grad_norm": 27.279979428767117, + "language_loss": 0.72751474, + "learning_rate": 6.53616380369143e-07, + "loss": 0.74261498, + "num_input_tokens_seen": 266590895, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.24157715, + "step": 12358, + "time_per_iteration": 2.635324716567993 + }, + { + "auxiliary_loss_clip": 0.01292273, + "auxiliary_loss_mlp": 0.00240456, + "balance_loss_clip": 1.06031752, + "balance_loss_mlp": 0.21367016, + "epoch": 0.743063279723433, + "flos": 23870576545920.0, + "grad_norm": 6.8226567260965725, + "language_loss": 0.88673192, + "learning_rate": 6.533284114835591e-07, + "loss": 0.90205926, + "num_input_tokens_seen": 266607660, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.26818848, + "step": 12359, + "time_per_iteration": 2.6608457565307617 + }, + { + "auxiliary_loss_clip": 0.01276022, + "auxiliary_loss_mlp": 0.00233179, + "balance_loss_clip": 1.05333138, + "balance_loss_mlp": 0.2072387, + "epoch": 0.743123402976101, + "flos": 14391704689920.0, + "grad_norm": 14.263246819026817, + "language_loss": 0.76515102, + "learning_rate": 6.530404936638956e-07, + "loss": 0.78024298, + "num_input_tokens_seen": 266624260, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.25939941, + "step": 12360, + "time_per_iteration": 2.7221288681030273 + }, + { + "auxiliary_loss_clip": 0.01271665, + "auxiliary_loss_mlp": 0.00248721, + "balance_loss_clip": 1.04852223, + "balance_loss_mlp": 0.22213764, + "epoch": 0.7431835262287689, + "flos": 27454318335360.0, + "grad_norm": 3.3429178163519744, + "language_loss": 0.80355501, + "learning_rate": 6.527526269210715e-07, + "loss": 0.81875885, + "num_input_tokens_seen": 266644210, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.26611328, + "step": 12361, + "time_per_iteration": 2.6906986236572266 + }, + { + "auxiliary_loss_clip": 0.0129508, + "auxiliary_loss_mlp": 0.0023771, + "balance_loss_clip": 1.06704521, + "balance_loss_mlp": 0.21129346, + "epoch": 0.743243649481437, + "flos": 20959514709120.0, + "grad_norm": 3.900412409653168, + "language_loss": 0.66069233, + "learning_rate": 6.524648112660027e-07, + "loss": 0.67602026, + "num_input_tokens_seen": 266664230, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.26428223, + "step": 12362, + "time_per_iteration": 2.7057554721832275 + }, + { + "auxiliary_loss_clip": 0.01288297, + "auxiliary_loss_mlp": 0.00272153, + "balance_loss_clip": 1.0560267, + "balance_loss_mlp": 0.24342361, + "epoch": 0.7433037727341049, + "flos": 22783166161920.0, + "grad_norm": 78.43477996246736, + "language_loss": 0.84827423, + "learning_rate": 6.521770467096039e-07, + "loss": 0.86387873, + "num_input_tokens_seen": 266683270, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.28759766, + "step": 12363, + "time_per_iteration": 2.6924140453338623 + }, + { + "auxiliary_loss_clip": 0.01265658, + "auxiliary_loss_mlp": 0.00245473, + "balance_loss_clip": 1.04470825, + "balance_loss_mlp": 0.22023611, + "epoch": 0.7433638959867729, + "flos": 22196708807040.0, + "grad_norm": 189.29369760420508, + "language_loss": 0.83714485, + "learning_rate": 6.518893332627862e-07, + "loss": 0.85225606, + "num_input_tokens_seen": 266701235, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.25244141, + "step": 12364, + "time_per_iteration": 2.7528648376464844 + }, + { + "auxiliary_loss_clip": 0.01285204, + "auxiliary_loss_mlp": 0.00272287, + "balance_loss_clip": 1.05910265, + "balance_loss_mlp": 0.24380806, + "epoch": 0.7434240192394409, + "flos": 23296760778240.0, + "grad_norm": 59.61805312879276, + "language_loss": 0.85393119, + "learning_rate": 6.516016709364604e-07, + "loss": 0.86950606, + "num_input_tokens_seen": 266721495, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.28466797, + "step": 12365, + "time_per_iteration": 2.6700475215911865 + }, + { + "auxiliary_loss_clip": 0.01283363, + "auxiliary_loss_mlp": 0.00267221, + "balance_loss_clip": 1.05538273, + "balance_loss_mlp": 0.24030387, + "epoch": 0.7434841424921088, + "flos": 54009575251200.0, + "grad_norm": 45.02109916581162, + "language_loss": 0.82717741, + "learning_rate": 6.513140597415346e-07, + "loss": 0.8426832, + "num_input_tokens_seen": 266747400, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.26904297, + "step": 12366, + "time_per_iteration": 2.9759387969970703 + }, + { + "auxiliary_loss_clip": 0.01278904, + "auxiliary_loss_mlp": 0.00263262, + "balance_loss_clip": 1.05798829, + "balance_loss_mlp": 0.23818043, + "epoch": 0.7435442657447768, + "flos": 21433966479360.0, + "grad_norm": 14.352109722750543, + "language_loss": 0.77897823, + "learning_rate": 6.510264996889141e-07, + "loss": 0.79439986, + "num_input_tokens_seen": 266767630, + "router_z_loss_clip": 2.20996094, + "router_z_loss_mlp": 0.25073242, + "step": 12367, + "time_per_iteration": 2.671523332595825 + }, + { + "auxiliary_loss_clip": 0.0129871, + "auxiliary_loss_mlp": 0.0025631, + "balance_loss_clip": 1.06542468, + "balance_loss_mlp": 0.22781873, + "epoch": 0.7436043889974447, + "flos": 24499408970880.0, + "grad_norm": 43.78658361438385, + "language_loss": 0.82542038, + "learning_rate": 6.507389907895038e-07, + "loss": 0.84097058, + "num_input_tokens_seen": 266788015, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.28491211, + "step": 12368, + "time_per_iteration": 2.706244468688965 + }, + { + "auxiliary_loss_clip": 0.01265705, + "auxiliary_loss_mlp": 0.0022569, + "balance_loss_clip": 1.04983366, + "balance_loss_mlp": 0.20288537, + "epoch": 0.7436645122501128, + "flos": 40698388512000.0, + "grad_norm": 5.241704048347659, + "language_loss": 0.76046824, + "learning_rate": 6.50451533054207e-07, + "loss": 0.77538216, + "num_input_tokens_seen": 266809010, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.22814941, + "step": 12369, + "time_per_iteration": 2.8648805618286133 + }, + { + "auxiliary_loss_clip": 0.0128662, + "auxiliary_loss_mlp": 0.00268538, + "balance_loss_clip": 1.05788827, + "balance_loss_mlp": 0.24257454, + "epoch": 0.7437246355027807, + "flos": 18908835344640.0, + "grad_norm": 6.1420304554672045, + "language_loss": 0.81279796, + "learning_rate": 6.501641264939233e-07, + "loss": 0.82834953, + "num_input_tokens_seen": 266825390, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.25964355, + "step": 12370, + "time_per_iteration": 2.620091438293457 + }, + { + "auxiliary_loss_clip": 0.01267887, + "auxiliary_loss_mlp": 0.00241462, + "balance_loss_clip": 1.04701579, + "balance_loss_mlp": 0.21733364, + "epoch": 0.7437847587554487, + "flos": 21543817248000.0, + "grad_norm": 4.497211577674663, + "language_loss": 0.84656221, + "learning_rate": 6.498767711195503e-07, + "loss": 0.86165571, + "num_input_tokens_seen": 266844675, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.24133301, + "step": 12371, + "time_per_iteration": 2.716639280319214 + }, + { + "auxiliary_loss_clip": 0.01271885, + "auxiliary_loss_mlp": 0.00252943, + "balance_loss_clip": 1.04447174, + "balance_loss_mlp": 0.22737269, + "epoch": 0.7438448820081166, + "flos": 27782470010880.0, + "grad_norm": 75.66619695928526, + "language_loss": 0.79598427, + "learning_rate": 6.495894669419857e-07, + "loss": 0.81123257, + "num_input_tokens_seen": 266865160, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.25585938, + "step": 12372, + "time_per_iteration": 2.7481513023376465 + }, + { + "auxiliary_loss_clip": 0.01279511, + "auxiliary_loss_mlp": 0.00231568, + "balance_loss_clip": 1.05363297, + "balance_loss_mlp": 0.20614058, + "epoch": 0.7439050052607846, + "flos": 17967832796160.0, + "grad_norm": 110.88070416135915, + "language_loss": 0.85569847, + "learning_rate": 6.493022139721245e-07, + "loss": 0.87080932, + "num_input_tokens_seen": 266883285, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.25439453, + "step": 12373, + "time_per_iteration": 2.680837392807007 + }, + { + "auxiliary_loss_clip": 0.01298554, + "auxiliary_loss_mlp": 0.00229455, + "balance_loss_clip": 1.06582522, + "balance_loss_mlp": 0.20104736, + "epoch": 0.7439651285134525, + "flos": 22958696949120.0, + "grad_norm": 12.226254449920933, + "language_loss": 0.85953283, + "learning_rate": 6.49015012220858e-07, + "loss": 0.87481284, + "num_input_tokens_seen": 266900960, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.28381348, + "step": 12374, + "time_per_iteration": 2.651879072189331 + }, + { + "auxiliary_loss_clip": 0.01284219, + "auxiliary_loss_mlp": 0.00253195, + "balance_loss_clip": 1.05427241, + "balance_loss_mlp": 0.22681428, + "epoch": 0.7440252517661206, + "flos": 18806777827200.0, + "grad_norm": 27.562222894382376, + "language_loss": 0.85499287, + "learning_rate": 6.487278616990774e-07, + "loss": 0.87036705, + "num_input_tokens_seen": 266917710, + "router_z_loss_clip": 2.30078125, + "router_z_loss_mlp": 0.26367188, + "step": 12375, + "time_per_iteration": 2.686805248260498 + }, + { + "auxiliary_loss_clip": 0.01266241, + "auxiliary_loss_mlp": 0.00222396, + "balance_loss_clip": 1.04675543, + "balance_loss_mlp": 0.19793417, + "epoch": 0.7440853750187885, + "flos": 20266295155200.0, + "grad_norm": 230.2157192614666, + "language_loss": 0.83396804, + "learning_rate": 6.484407624176733e-07, + "loss": 0.84885442, + "num_input_tokens_seen": 266934220, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.24462891, + "step": 12376, + "time_per_iteration": 2.730562925338745 + }, + { + "auxiliary_loss_clip": 0.01275455, + "auxiliary_loss_mlp": 0.00228869, + "balance_loss_clip": 1.04596317, + "balance_loss_mlp": 0.20315555, + "epoch": 0.7441454982714565, + "flos": 25337276593920.0, + "grad_norm": 32.79194256582312, + "language_loss": 0.8724041, + "learning_rate": 6.481537143875296e-07, + "loss": 0.88744736, + "num_input_tokens_seen": 266955210, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.25720215, + "step": 12377, + "time_per_iteration": 2.816422700881958 + }, + { + "auxiliary_loss_clip": 0.01278755, + "auxiliary_loss_mlp": 0.00237772, + "balance_loss_clip": 1.05042863, + "balance_loss_mlp": 0.21219015, + "epoch": 0.7442056215241245, + "flos": 64480910866560.0, + "grad_norm": 19.934309582573576, + "language_loss": 0.77322984, + "learning_rate": 6.478667176195322e-07, + "loss": 0.78839505, + "num_input_tokens_seen": 266976555, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.2557373, + "step": 12378, + "time_per_iteration": 3.0794167518615723 + }, + { + "auxiliary_loss_clip": 0.01284742, + "auxiliary_loss_mlp": 0.00235941, + "balance_loss_clip": 1.05257106, + "balance_loss_mlp": 0.20988205, + "epoch": 0.7442657447767924, + "flos": 31285376242560.0, + "grad_norm": 4.344385977652243, + "language_loss": 0.79314446, + "learning_rate": 6.475797721245648e-07, + "loss": 0.80835128, + "num_input_tokens_seen": 266997640, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.26049805, + "step": 12379, + "time_per_iteration": 2.768495798110962 + }, + { + "auxiliary_loss_clip": 0.01288495, + "auxiliary_loss_mlp": 0.00217756, + "balance_loss_clip": 1.05734742, + "balance_loss_mlp": 0.19364002, + "epoch": 0.7443258680294604, + "flos": 20807899401600.0, + "grad_norm": 51.77284803597454, + "language_loss": 0.72787005, + "learning_rate": 6.472928779135085e-07, + "loss": 0.74293256, + "num_input_tokens_seen": 267016165, + "router_z_loss_clip": 2.31054688, + "router_z_loss_mlp": 0.2409668, + "step": 12380, + "time_per_iteration": 2.667865753173828 + }, + { + "auxiliary_loss_clip": 0.01301483, + "auxiliary_loss_mlp": 0.00234768, + "balance_loss_clip": 1.06299663, + "balance_loss_mlp": 0.20657468, + "epoch": 0.7443859912821283, + "flos": 22199833290240.0, + "grad_norm": 9.923404981933457, + "language_loss": 0.88546801, + "learning_rate": 6.470060349972411e-07, + "loss": 0.90083051, + "num_input_tokens_seen": 267034075, + "router_z_loss_clip": 2.3828125, + "router_z_loss_mlp": 0.28198242, + "step": 12381, + "time_per_iteration": 2.627962589263916 + }, + { + "auxiliary_loss_clip": 0.01309854, + "auxiliary_loss_mlp": 0.00236682, + "balance_loss_clip": 1.06616163, + "balance_loss_mlp": 0.20806003, + "epoch": 0.7444461145347964, + "flos": 22017838055040.0, + "grad_norm": 80.39967161813905, + "language_loss": 0.81887794, + "learning_rate": 6.467192433866411e-07, + "loss": 0.83434325, + "num_input_tokens_seen": 267053645, + "router_z_loss_clip": 2.43554688, + "router_z_loss_mlp": 0.28625488, + "step": 12382, + "time_per_iteration": 2.693512201309204 + }, + { + "auxiliary_loss_clip": 0.01193766, + "auxiliary_loss_mlp": 0.00107046, + "balance_loss_clip": 1.03070068, + "balance_loss_mlp": 0.09922616, + "epoch": 0.7445062377874643, + "flos": 70559047704960.0, + "grad_norm": 0.6450958150143737, + "language_loss": 0.54216093, + "learning_rate": 6.464325030925831e-07, + "loss": 0.55516905, + "num_input_tokens_seen": 267121830, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.078125, + "step": 12383, + "time_per_iteration": 3.310544967651367 + }, + { + "auxiliary_loss_clip": 0.01293448, + "auxiliary_loss_mlp": 0.00239641, + "balance_loss_clip": 1.05959868, + "balance_loss_mlp": 0.21291403, + "epoch": 0.7445663610401323, + "flos": 22164425458560.0, + "grad_norm": 5.726818046322329, + "language_loss": 0.86037654, + "learning_rate": 6.461458141259395e-07, + "loss": 0.87570745, + "num_input_tokens_seen": 267141145, + "router_z_loss_clip": 2.3359375, + "router_z_loss_mlp": 0.26733398, + "step": 12384, + "time_per_iteration": 4.026790380477905 + }, + { + "auxiliary_loss_clip": 0.01264071, + "auxiliary_loss_mlp": 0.00238835, + "balance_loss_clip": 1.04010928, + "balance_loss_mlp": 0.21437326, + "epoch": 0.7446264842928002, + "flos": 24170251714560.0, + "grad_norm": 4.0005015649022955, + "language_loss": 0.87229031, + "learning_rate": 6.458591764975823e-07, + "loss": 0.88731933, + "num_input_tokens_seen": 267159280, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.24438477, + "step": 12385, + "time_per_iteration": 2.692669630050659 + }, + { + "auxiliary_loss_clip": 0.01290889, + "auxiliary_loss_mlp": 0.00254687, + "balance_loss_clip": 1.06226802, + "balance_loss_mlp": 0.22794867, + "epoch": 0.7446866075454682, + "flos": 24134556574080.0, + "grad_norm": 19.81489543662182, + "language_loss": 0.88985926, + "learning_rate": 6.455725902183813e-07, + "loss": 0.90531492, + "num_input_tokens_seen": 267179390, + "router_z_loss_clip": 2.29101562, + "router_z_loss_mlp": 0.26757812, + "step": 12386, + "time_per_iteration": 4.113672256469727 + }, + { + "auxiliary_loss_clip": 0.01277621, + "auxiliary_loss_mlp": 0.00243938, + "balance_loss_clip": 1.05296683, + "balance_loss_mlp": 0.21805757, + "epoch": 0.7447467307981361, + "flos": 23548063305600.0, + "grad_norm": 5.92792226246144, + "language_loss": 0.78860497, + "learning_rate": 6.452860552992037e-07, + "loss": 0.80382055, + "num_input_tokens_seen": 267198165, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.25891113, + "step": 12387, + "time_per_iteration": 2.66705060005188 + }, + { + "auxiliary_loss_clip": 0.0126685, + "auxiliary_loss_mlp": 0.00232102, + "balance_loss_clip": 1.04342568, + "balance_loss_mlp": 0.20409974, + "epoch": 0.7448068540508042, + "flos": 19567832215680.0, + "grad_norm": 742.1653866570434, + "language_loss": 0.77712011, + "learning_rate": 6.449995717509138e-07, + "loss": 0.79210973, + "num_input_tokens_seen": 267214520, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.27990723, + "step": 12388, + "time_per_iteration": 2.6976475715637207 + }, + { + "auxiliary_loss_clip": 0.01293874, + "auxiliary_loss_mlp": 0.00240577, + "balance_loss_clip": 1.0617367, + "balance_loss_mlp": 0.21416013, + "epoch": 0.7448669773034721, + "flos": 21839721488640.0, + "grad_norm": 1.7014135736479443, + "language_loss": 0.9127807, + "learning_rate": 6.447131395843761e-07, + "loss": 0.92812514, + "num_input_tokens_seen": 267236555, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.26428223, + "step": 12389, + "time_per_iteration": 4.328755140304565 + }, + { + "auxiliary_loss_clip": 0.01302026, + "auxiliary_loss_mlp": 0.00250697, + "balance_loss_clip": 1.06577468, + "balance_loss_mlp": 0.22165796, + "epoch": 0.7449271005561401, + "flos": 25155389099520.0, + "grad_norm": 3.8722107796805805, + "language_loss": 0.85906965, + "learning_rate": 6.444267588104526e-07, + "loss": 0.87459689, + "num_input_tokens_seen": 267254800, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.29052734, + "step": 12390, + "time_per_iteration": 2.6958730220794678 + }, + { + "auxiliary_loss_clip": 0.01278131, + "auxiliary_loss_mlp": 0.00254236, + "balance_loss_clip": 1.04984343, + "balance_loss_mlp": 0.22796208, + "epoch": 0.7449872238088081, + "flos": 22273342473600.0, + "grad_norm": 18.855089381267018, + "language_loss": 0.92224497, + "learning_rate": 6.441404294400014e-07, + "loss": 0.93756866, + "num_input_tokens_seen": 267274610, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.26306152, + "step": 12391, + "time_per_iteration": 2.883460760116577 + }, + { + "auxiliary_loss_clip": 0.01288717, + "auxiliary_loss_mlp": 0.00242433, + "balance_loss_clip": 1.05871189, + "balance_loss_mlp": 0.21588509, + "epoch": 0.745047347061476, + "flos": 20594805966720.0, + "grad_norm": 44.992353707124785, + "language_loss": 0.80435711, + "learning_rate": 6.438541514838811e-07, + "loss": 0.81966865, + "num_input_tokens_seen": 267292600, + "router_z_loss_clip": 2.30273438, + "router_z_loss_mlp": 0.26574707, + "step": 12392, + "time_per_iteration": 2.7058165073394775 + }, + { + "auxiliary_loss_clip": 0.01251459, + "auxiliary_loss_mlp": 0.00223474, + "balance_loss_clip": 1.03113675, + "balance_loss_mlp": 0.19873855, + "epoch": 0.745107470314144, + "flos": 22127545169280.0, + "grad_norm": 3.348419013263166, + "language_loss": 0.83503616, + "learning_rate": 6.435679249529487e-07, + "loss": 0.84978551, + "num_input_tokens_seen": 267311295, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.24731445, + "step": 12393, + "time_per_iteration": 4.093194484710693 + }, + { + "auxiliary_loss_clip": 0.01297842, + "auxiliary_loss_mlp": 0.00242586, + "balance_loss_clip": 1.06504297, + "balance_loss_mlp": 0.21594289, + "epoch": 0.745167593566812, + "flos": 22236498097920.0, + "grad_norm": 2.262205135280922, + "language_loss": 0.80751741, + "learning_rate": 6.432817498580552e-07, + "loss": 0.82292163, + "num_input_tokens_seen": 267328390, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.26635742, + "step": 12394, + "time_per_iteration": 2.69937801361084 + }, + { + "auxiliary_loss_clip": 0.01277901, + "auxiliary_loss_mlp": 0.00246, + "balance_loss_clip": 1.05307567, + "balance_loss_mlp": 0.22141866, + "epoch": 0.74522771681948, + "flos": 20666232161280.0, + "grad_norm": 240.3197244175798, + "language_loss": 0.8965801, + "learning_rate": 6.429956262100535e-07, + "loss": 0.9118191, + "num_input_tokens_seen": 267348185, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.24572754, + "step": 12395, + "time_per_iteration": 2.666973829269409 + }, + { + "auxiliary_loss_clip": 0.01297445, + "auxiliary_loss_mlp": 0.00225705, + "balance_loss_clip": 1.06316793, + "balance_loss_mlp": 0.19750038, + "epoch": 0.7452878400721479, + "flos": 21106999952640.0, + "grad_norm": 3.5553123204209647, + "language_loss": 0.81212288, + "learning_rate": 6.427095540197937e-07, + "loss": 0.82735443, + "num_input_tokens_seen": 267367010, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.28234863, + "step": 12396, + "time_per_iteration": 2.721423387527466 + }, + { + "auxiliary_loss_clip": 0.01292979, + "auxiliary_loss_mlp": 0.00251085, + "balance_loss_clip": 1.05974543, + "balance_loss_mlp": 0.22369048, + "epoch": 0.7453479633248159, + "flos": 26688056474880.0, + "grad_norm": 7.009803116813096, + "language_loss": 0.76768494, + "learning_rate": 6.424235332981245e-07, + "loss": 0.78312564, + "num_input_tokens_seen": 267386605, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.27392578, + "step": 12397, + "time_per_iteration": 2.7134506702423096 + }, + { + "auxiliary_loss_clip": 0.01268432, + "auxiliary_loss_mlp": 0.00248522, + "balance_loss_clip": 1.04440212, + "balance_loss_mlp": 0.22259434, + "epoch": 0.7454080865774838, + "flos": 17016056167680.0, + "grad_norm": 9.552688690261743, + "language_loss": 0.83439463, + "learning_rate": 6.421375640558908e-07, + "loss": 0.84956419, + "num_input_tokens_seen": 267404135, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.25915527, + "step": 12398, + "time_per_iteration": 2.6651053428649902 + }, + { + "auxiliary_loss_clip": 0.01260576, + "auxiliary_loss_mlp": 0.00231011, + "balance_loss_clip": 1.03970695, + "balance_loss_mlp": 0.20642973, + "epoch": 0.7454682098301518, + "flos": 21323900229120.0, + "grad_norm": 49.356317118422396, + "language_loss": 0.84524423, + "learning_rate": 6.418516463039363e-07, + "loss": 0.86016011, + "num_input_tokens_seen": 267423120, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.24584961, + "step": 12399, + "time_per_iteration": 2.7186365127563477 + }, + { + "auxiliary_loss_clip": 0.01268781, + "auxiliary_loss_mlp": 0.00218168, + "balance_loss_clip": 1.05316925, + "balance_loss_mlp": 0.19423096, + "epoch": 0.7455283330828197, + "flos": 17858341163520.0, + "grad_norm": 2276.7774202439846, + "language_loss": 0.83101988, + "learning_rate": 6.415657800531038e-07, + "loss": 0.84588945, + "num_input_tokens_seen": 267441250, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.23937988, + "step": 12400, + "time_per_iteration": 2.64639949798584 + }, + { + "auxiliary_loss_clip": 0.01290704, + "auxiliary_loss_mlp": 0.00248645, + "balance_loss_clip": 1.06010342, + "balance_loss_mlp": 0.22254997, + "epoch": 0.7455884563354878, + "flos": 30774259664640.0, + "grad_norm": 7.192174103342073, + "language_loss": 0.90471488, + "learning_rate": 6.412799653142327e-07, + "loss": 0.92010838, + "num_input_tokens_seen": 267462820, + "router_z_loss_clip": 2.30273438, + "router_z_loss_mlp": 0.26098633, + "step": 12401, + "time_per_iteration": 2.753737211227417 + }, + { + "auxiliary_loss_clip": 0.01274145, + "auxiliary_loss_mlp": 0.002348, + "balance_loss_clip": 1.04820061, + "balance_loss_mlp": 0.20943213, + "epoch": 0.7456485795881557, + "flos": 23185545292800.0, + "grad_norm": 13.452444529356061, + "language_loss": 0.74011016, + "learning_rate": 6.409942020981611e-07, + "loss": 0.75519967, + "num_input_tokens_seen": 267483065, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.25390625, + "step": 12402, + "time_per_iteration": 2.7102510929107666 + }, + { + "auxiliary_loss_clip": 0.01271939, + "auxiliary_loss_mlp": 0.00244476, + "balance_loss_clip": 1.04390335, + "balance_loss_mlp": 0.21909648, + "epoch": 0.7457087028408237, + "flos": 38727144074880.0, + "grad_norm": 30.01593075038537, + "language_loss": 0.78173125, + "learning_rate": 6.407084904157265e-07, + "loss": 0.79689538, + "num_input_tokens_seen": 267504825, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.25402832, + "step": 12403, + "time_per_iteration": 2.867724657058716 + }, + { + "auxiliary_loss_clip": 0.01190411, + "auxiliary_loss_mlp": 0.00109701, + "balance_loss_clip": 1.02783716, + "balance_loss_mlp": 0.10097501, + "epoch": 0.7457688260934917, + "flos": 56043737337600.0, + "grad_norm": 0.8292392905431775, + "language_loss": 0.57982534, + "learning_rate": 6.404228302777621e-07, + "loss": 0.59282649, + "num_input_tokens_seen": 267559260, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.08740234, + "step": 12404, + "time_per_iteration": 3.0095009803771973 + }, + { + "auxiliary_loss_clip": 0.01273114, + "auxiliary_loss_mlp": 0.00231975, + "balance_loss_clip": 1.0476433, + "balance_loss_mlp": 0.20673881, + "epoch": 0.7458289493461596, + "flos": 20116152305280.0, + "grad_norm": 14.577041088594148, + "language_loss": 0.82479846, + "learning_rate": 6.401372216950995e-07, + "loss": 0.83984941, + "num_input_tokens_seen": 267578720, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.25268555, + "step": 12405, + "time_per_iteration": 2.700167655944824 + }, + { + "auxiliary_loss_clip": 0.01285501, + "auxiliary_loss_mlp": 0.00233246, + "balance_loss_clip": 1.05791855, + "balance_loss_mlp": 0.20740189, + "epoch": 0.7458890725988276, + "flos": 20193073280640.0, + "grad_norm": 2.5200607139981797, + "language_loss": 0.75775492, + "learning_rate": 6.398516646785698e-07, + "loss": 0.77294242, + "num_input_tokens_seen": 267598250, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.25830078, + "step": 12406, + "time_per_iteration": 2.637152671813965 + }, + { + "auxiliary_loss_clip": 0.01304577, + "auxiliary_loss_mlp": 0.00263168, + "balance_loss_clip": 1.06675076, + "balance_loss_mlp": 0.2362625, + "epoch": 0.7459491958514956, + "flos": 17018749687680.0, + "grad_norm": 56.100494575367264, + "language_loss": 0.74431694, + "learning_rate": 6.39566159239002e-07, + "loss": 0.75999439, + "num_input_tokens_seen": 267615430, + "router_z_loss_clip": 2.37890625, + "router_z_loss_mlp": 0.26953125, + "step": 12407, + "time_per_iteration": 2.6780710220336914 + }, + { + "auxiliary_loss_clip": 0.01297081, + "auxiliary_loss_mlp": 0.00261815, + "balance_loss_clip": 1.06239498, + "balance_loss_mlp": 0.23307331, + "epoch": 0.7460093191041636, + "flos": 25078719519360.0, + "grad_norm": 18.79105783653529, + "language_loss": 0.80049407, + "learning_rate": 6.392807053872212e-07, + "loss": 0.81608301, + "num_input_tokens_seen": 267635075, + "router_z_loss_clip": 2.34960938, + "router_z_loss_mlp": 0.28771973, + "step": 12408, + "time_per_iteration": 2.680427312850952 + }, + { + "auxiliary_loss_clip": 0.01311472, + "auxiliary_loss_mlp": 0.00253502, + "balance_loss_clip": 1.06838894, + "balance_loss_mlp": 0.22399761, + "epoch": 0.7460694423568315, + "flos": 21908525990400.0, + "grad_norm": 78.38220481612734, + "language_loss": 0.81576025, + "learning_rate": 6.38995303134053e-07, + "loss": 0.83141005, + "num_input_tokens_seen": 267654105, + "router_z_loss_clip": 2.4296875, + "router_z_loss_mlp": 0.29492188, + "step": 12409, + "time_per_iteration": 2.741215705871582 + }, + { + "auxiliary_loss_clip": 0.01267308, + "auxiliary_loss_mlp": 0.00228251, + "balance_loss_clip": 1.04708946, + "balance_loss_mlp": 0.2032885, + "epoch": 0.7461295656094995, + "flos": 21215737399680.0, + "grad_norm": 2.6778373324063804, + "language_loss": 0.7350111, + "learning_rate": 6.38709952490319e-07, + "loss": 0.74996674, + "num_input_tokens_seen": 267673090, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.24951172, + "step": 12410, + "time_per_iteration": 2.6356825828552246 + }, + { + "auxiliary_loss_clip": 0.01281969, + "auxiliary_loss_mlp": 0.00246556, + "balance_loss_clip": 1.05749989, + "balance_loss_mlp": 0.21820824, + "epoch": 0.7461896888621674, + "flos": 22346851656960.0, + "grad_norm": 65.04907313168127, + "language_loss": 0.92111272, + "learning_rate": 6.384246534668396e-07, + "loss": 0.93639803, + "num_input_tokens_seen": 267690605, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.28356934, + "step": 12411, + "time_per_iteration": 2.7312209606170654 + }, + { + "auxiliary_loss_clip": 0.01284468, + "auxiliary_loss_mlp": 0.00254435, + "balance_loss_clip": 1.05698776, + "balance_loss_mlp": 0.22880477, + "epoch": 0.7462498121148354, + "flos": 25482930243840.0, + "grad_norm": 11.199930551329091, + "language_loss": 0.83679616, + "learning_rate": 6.381394060744339e-07, + "loss": 0.85218513, + "num_input_tokens_seen": 267710540, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.2565918, + "step": 12412, + "time_per_iteration": 2.697864532470703 + }, + { + "auxiliary_loss_clip": 0.01281042, + "auxiliary_loss_mlp": 0.00245284, + "balance_loss_clip": 1.05516672, + "balance_loss_mlp": 0.21923685, + "epoch": 0.7463099353675033, + "flos": 33947936812800.0, + "grad_norm": 6.391154669273732, + "language_loss": 0.69768006, + "learning_rate": 6.378542103239188e-07, + "loss": 0.71294332, + "num_input_tokens_seen": 267730780, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.26049805, + "step": 12413, + "time_per_iteration": 2.78680157661438 + }, + { + "auxiliary_loss_clip": 0.01187104, + "auxiliary_loss_mlp": 0.0017087, + "balance_loss_clip": 1.02489126, + "balance_loss_mlp": 0.16138119, + "epoch": 0.7463700586201714, + "flos": 62767723691520.0, + "grad_norm": 0.6955332882386145, + "language_loss": 0.53928834, + "learning_rate": 6.375690662261082e-07, + "loss": 0.55286807, + "num_input_tokens_seen": 267794240, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.09472656, + "step": 12414, + "time_per_iteration": 3.2042486667633057 + }, + { + "auxiliary_loss_clip": 0.01289319, + "auxiliary_loss_mlp": 0.0022769, + "balance_loss_clip": 1.06502759, + "balance_loss_mlp": 0.20121375, + "epoch": 0.7464301818728393, + "flos": 33432654257280.0, + "grad_norm": 2.3321665912276783, + "language_loss": 0.62606871, + "learning_rate": 6.372839737918154e-07, + "loss": 0.64123881, + "num_input_tokens_seen": 267817190, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.26464844, + "step": 12415, + "time_per_iteration": 2.8101730346679688 + }, + { + "auxiliary_loss_clip": 0.01285883, + "auxiliary_loss_mlp": 0.00257619, + "balance_loss_clip": 1.05857885, + "balance_loss_mlp": 0.23141706, + "epoch": 0.7464903051255073, + "flos": 26869872142080.0, + "grad_norm": 11.028694911496736, + "language_loss": 0.79957098, + "learning_rate": 6.369989330318506e-07, + "loss": 0.81500602, + "num_input_tokens_seen": 267836245, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.26208496, + "step": 12416, + "time_per_iteration": 2.74772310256958 + }, + { + "auxiliary_loss_clip": 0.01286547, + "auxiliary_loss_mlp": 0.00250269, + "balance_loss_clip": 1.05757499, + "balance_loss_mlp": 0.22418566, + "epoch": 0.7465504283781753, + "flos": 44086954775040.0, + "grad_norm": 8.254002445508346, + "language_loss": 0.74568462, + "learning_rate": 6.367139439570233e-07, + "loss": 0.76105273, + "num_input_tokens_seen": 267858310, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.2611084, + "step": 12417, + "time_per_iteration": 2.868565559387207 + }, + { + "auxiliary_loss_clip": 0.01311202, + "auxiliary_loss_mlp": 0.00263687, + "balance_loss_clip": 1.0754962, + "balance_loss_mlp": 0.23488617, + "epoch": 0.7466105516308432, + "flos": 19676102785920.0, + "grad_norm": 95.52785367263712, + "language_loss": 0.82160151, + "learning_rate": 6.364290065781392e-07, + "loss": 0.83735037, + "num_input_tokens_seen": 267876345, + "router_z_loss_clip": 2.35546875, + "router_z_loss_mlp": 0.28771973, + "step": 12418, + "time_per_iteration": 2.693126678466797 + }, + { + "auxiliary_loss_clip": 0.01299607, + "auxiliary_loss_mlp": 0.00243018, + "balance_loss_clip": 1.06589246, + "balance_loss_mlp": 0.2165532, + "epoch": 0.7466706748835112, + "flos": 20520722165760.0, + "grad_norm": 6.0430711610527394, + "language_loss": 0.75715959, + "learning_rate": 6.361441209060039e-07, + "loss": 0.77258581, + "num_input_tokens_seen": 267896740, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.26428223, + "step": 12419, + "time_per_iteration": 2.713282585144043 + }, + { + "auxiliary_loss_clip": 0.01296149, + "auxiliary_loss_mlp": 0.00269517, + "balance_loss_clip": 1.06967402, + "balance_loss_mlp": 0.24131219, + "epoch": 0.7467307981361792, + "flos": 21690260997120.0, + "grad_norm": 70.1823341488199, + "language_loss": 0.80930758, + "learning_rate": 6.358592869514216e-07, + "loss": 0.82496428, + "num_input_tokens_seen": 267914765, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.28222656, + "step": 12420, + "time_per_iteration": 2.7412707805633545 + }, + { + "auxiliary_loss_clip": 0.01326187, + "auxiliary_loss_mlp": 0.00245494, + "balance_loss_clip": 1.08379841, + "balance_loss_mlp": 0.21867147, + "epoch": 0.7467909213888472, + "flos": 19573686132480.0, + "grad_norm": 37.431306091564174, + "language_loss": 0.7650528, + "learning_rate": 6.355745047251904e-07, + "loss": 0.78076959, + "num_input_tokens_seen": 267934085, + "router_z_loss_clip": 2.42382812, + "router_z_loss_mlp": 0.26818848, + "step": 12421, + "time_per_iteration": 2.662130117416382 + }, + { + "auxiliary_loss_clip": 0.01312654, + "auxiliary_loss_mlp": 0.00254735, + "balance_loss_clip": 1.07094622, + "balance_loss_mlp": 0.22732881, + "epoch": 0.7468510446415151, + "flos": 23695225326720.0, + "grad_norm": 6.26495430022639, + "language_loss": 0.79937553, + "learning_rate": 6.352897742381107e-07, + "loss": 0.81504935, + "num_input_tokens_seen": 267955170, + "router_z_loss_clip": 2.41796875, + "router_z_loss_mlp": 0.27429199, + "step": 12422, + "time_per_iteration": 2.7077560424804688 + }, + { + "auxiliary_loss_clip": 0.01282305, + "auxiliary_loss_mlp": 0.00244064, + "balance_loss_clip": 1.05937862, + "balance_loss_mlp": 0.21936424, + "epoch": 0.7469111678941831, + "flos": 29315783831040.0, + "grad_norm": 26.663280231138664, + "language_loss": 0.8167643, + "learning_rate": 6.350050955009796e-07, + "loss": 0.83202803, + "num_input_tokens_seen": 267974980, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.24682617, + "step": 12423, + "time_per_iteration": 2.7593326568603516 + }, + { + "auxiliary_loss_clip": 0.0126573, + "auxiliary_loss_mlp": 0.00237009, + "balance_loss_clip": 1.05032659, + "balance_loss_mlp": 0.21385877, + "epoch": 0.746971291146851, + "flos": 21798639308160.0, + "grad_norm": 351.72251038160005, + "language_loss": 0.73658907, + "learning_rate": 6.347204685245929e-07, + "loss": 0.75161648, + "num_input_tokens_seen": 267994985, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.23144531, + "step": 12424, + "time_per_iteration": 2.647552490234375 + }, + { + "auxiliary_loss_clip": 0.01301523, + "auxiliary_loss_mlp": 0.00225925, + "balance_loss_clip": 1.06561923, + "balance_loss_mlp": 0.20022357, + "epoch": 0.747031414399519, + "flos": 36245070368640.0, + "grad_norm": 9.253019840254478, + "language_loss": 0.83324164, + "learning_rate": 6.344358933197418e-07, + "loss": 0.84851611, + "num_input_tokens_seen": 268014985, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.25695801, + "step": 12425, + "time_per_iteration": 2.8415040969848633 + }, + { + "auxiliary_loss_clip": 0.01304204, + "auxiliary_loss_mlp": 0.00244615, + "balance_loss_clip": 1.0711937, + "balance_loss_mlp": 0.21727985, + "epoch": 0.7470915376521869, + "flos": 19974916028160.0, + "grad_norm": 5.311829231920295, + "language_loss": 0.78847528, + "learning_rate": 6.341513698972194e-07, + "loss": 0.80396342, + "num_input_tokens_seen": 268034395, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.27355957, + "step": 12426, + "time_per_iteration": 4.128875255584717 + }, + { + "auxiliary_loss_clip": 0.01301592, + "auxiliary_loss_mlp": 0.00268904, + "balance_loss_clip": 1.07128525, + "balance_loss_mlp": 0.24178348, + "epoch": 0.747151660904855, + "flos": 20084299920000.0, + "grad_norm": 3.85979496344621, + "language_loss": 0.70673186, + "learning_rate": 6.338668982678139e-07, + "loss": 0.72243679, + "num_input_tokens_seen": 268054485, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.27111816, + "step": 12427, + "time_per_iteration": 2.671505928039551 + }, + { + "auxiliary_loss_clip": 0.01283496, + "auxiliary_loss_mlp": 0.00270577, + "balance_loss_clip": 1.05641925, + "balance_loss_mlp": 0.24269393, + "epoch": 0.7472117841575229, + "flos": 16290373697280.0, + "grad_norm": 248.06693845235992, + "language_loss": 0.81249487, + "learning_rate": 6.335824784423118e-07, + "loss": 0.82803571, + "num_input_tokens_seen": 268072250, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.27868652, + "step": 12428, + "time_per_iteration": 4.04669976234436 + }, + { + "auxiliary_loss_clip": 0.01309002, + "auxiliary_loss_mlp": 0.002583, + "balance_loss_clip": 1.06995523, + "balance_loss_mlp": 0.23150128, + "epoch": 0.7472719074101909, + "flos": 21389939383680.0, + "grad_norm": 96.669855394889, + "language_loss": 0.67841893, + "learning_rate": 6.33298110431499e-07, + "loss": 0.69409192, + "num_input_tokens_seen": 268089840, + "router_z_loss_clip": 2.390625, + "router_z_loss_mlp": 0.26782227, + "step": 12429, + "time_per_iteration": 2.65919828414917 + }, + { + "auxiliary_loss_clip": 0.01318963, + "auxiliary_loss_mlp": 0.00262488, + "balance_loss_clip": 1.08271956, + "balance_loss_mlp": 0.23502181, + "epoch": 0.7473320306628589, + "flos": 29643289061760.0, + "grad_norm": 38.88807674918153, + "language_loss": 0.70444399, + "learning_rate": 6.330137942461595e-07, + "loss": 0.72025847, + "num_input_tokens_seen": 268109360, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.27478027, + "step": 12430, + "time_per_iteration": 2.7940196990966797 + }, + { + "auxiliary_loss_clip": 0.01281924, + "auxiliary_loss_mlp": 0.00229488, + "balance_loss_clip": 1.05595946, + "balance_loss_mlp": 0.20569423, + "epoch": 0.7473921539155268, + "flos": 24136100858880.0, + "grad_norm": 2.904324735805579, + "language_loss": 0.81321746, + "learning_rate": 6.327295298970734e-07, + "loss": 0.82833159, + "num_input_tokens_seen": 268131840, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.23815918, + "step": 12431, + "time_per_iteration": 4.268065929412842 + }, + { + "auxiliary_loss_clip": 0.01293511, + "auxiliary_loss_mlp": 0.00225509, + "balance_loss_clip": 1.06304407, + "balance_loss_mlp": 0.19989048, + "epoch": 0.7474522771681948, + "flos": 17487958072320.0, + "grad_norm": 17.012543359342867, + "language_loss": 0.82843435, + "learning_rate": 6.32445317395021e-07, + "loss": 0.84362447, + "num_input_tokens_seen": 268148300, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.25634766, + "step": 12432, + "time_per_iteration": 2.6053214073181152 + }, + { + "auxiliary_loss_clip": 0.01316533, + "auxiliary_loss_mlp": 0.00269219, + "balance_loss_clip": 1.07738245, + "balance_loss_mlp": 0.23982242, + "epoch": 0.7475124004208628, + "flos": 16727298733440.0, + "grad_norm": 181.96907705238738, + "language_loss": 0.82488036, + "learning_rate": 6.321611567507787e-07, + "loss": 0.84073794, + "num_input_tokens_seen": 268166450, + "router_z_loss_clip": 2.38867188, + "router_z_loss_mlp": 0.29394531, + "step": 12433, + "time_per_iteration": 2.7276611328125 + }, + { + "auxiliary_loss_clip": 0.01295057, + "auxiliary_loss_mlp": 0.00256894, + "balance_loss_clip": 1.06436563, + "balance_loss_mlp": 0.2306318, + "epoch": 0.7475725236735308, + "flos": 19720237622400.0, + "grad_norm": 6.192264673162611, + "language_loss": 0.75323588, + "learning_rate": 6.318770479751232e-07, + "loss": 0.76875538, + "num_input_tokens_seen": 268186165, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.26257324, + "step": 12434, + "time_per_iteration": 2.67352557182312 + }, + { + "auxiliary_loss_clip": 0.01279982, + "auxiliary_loss_mlp": 0.0026715, + "balance_loss_clip": 1.05288982, + "balance_loss_mlp": 0.2413051, + "epoch": 0.7476326469261987, + "flos": 26286000566400.0, + "grad_norm": 29.164364973577765, + "language_loss": 0.8484478, + "learning_rate": 6.315929910788263e-07, + "loss": 0.86391914, + "num_input_tokens_seen": 268208145, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.25842285, + "step": 12435, + "time_per_iteration": 4.334566354751587 + }, + { + "auxiliary_loss_clip": 0.01299365, + "auxiliary_loss_mlp": 0.00245918, + "balance_loss_clip": 1.05941975, + "balance_loss_mlp": 0.21911931, + "epoch": 0.7476927701788667, + "flos": 31831828824960.0, + "grad_norm": 6.025619875154452, + "language_loss": 0.76502311, + "learning_rate": 6.313089860726604e-07, + "loss": 0.78047597, + "num_input_tokens_seen": 268228345, + "router_z_loss_clip": 2.3984375, + "router_z_loss_mlp": 0.26831055, + "step": 12436, + "time_per_iteration": 2.846079111099243 + }, + { + "auxiliary_loss_clip": 0.01286189, + "auxiliary_loss_mlp": 0.00261829, + "balance_loss_clip": 1.05283809, + "balance_loss_mlp": 0.2342554, + "epoch": 0.7477528934315346, + "flos": 31795487239680.0, + "grad_norm": 13.71546663338658, + "language_loss": 0.77624583, + "learning_rate": 6.31025032967396e-07, + "loss": 0.79172593, + "num_input_tokens_seen": 268250260, + "router_z_loss_clip": 2.3359375, + "router_z_loss_mlp": 0.27575684, + "step": 12437, + "time_per_iteration": 2.8230979442596436 + }, + { + "auxiliary_loss_clip": 0.01289427, + "auxiliary_loss_mlp": 0.00251839, + "balance_loss_clip": 1.06222093, + "balance_loss_mlp": 0.22542182, + "epoch": 0.7478130166842026, + "flos": 20371979946240.0, + "grad_norm": 20.905735110851772, + "language_loss": 0.73955882, + "learning_rate": 6.307411317737986e-07, + "loss": 0.7549715, + "num_input_tokens_seen": 268268440, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.2644043, + "step": 12438, + "time_per_iteration": 2.7023043632507324 + }, + { + "auxiliary_loss_clip": 0.01280721, + "auxiliary_loss_mlp": 0.00246922, + "balance_loss_clip": 1.05633533, + "balance_loss_mlp": 0.22108956, + "epoch": 0.7478731399368705, + "flos": 18148930191360.0, + "grad_norm": 19.469690781636253, + "language_loss": 0.86717588, + "learning_rate": 6.304572825026344e-07, + "loss": 0.88245237, + "num_input_tokens_seen": 268285765, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.25866699, + "step": 12439, + "time_per_iteration": 2.657804250717163 + }, + { + "auxiliary_loss_clip": 0.01278915, + "auxiliary_loss_mlp": 0.0025599, + "balance_loss_clip": 1.05518389, + "balance_loss_mlp": 0.22954978, + "epoch": 0.7479332631895386, + "flos": 15267889146240.0, + "grad_norm": 30.335116169125083, + "language_loss": 0.80764031, + "learning_rate": 6.301734851646674e-07, + "loss": 0.82298934, + "num_input_tokens_seen": 268304015, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.26464844, + "step": 12440, + "time_per_iteration": 2.691300630569458 + }, + { + "auxiliary_loss_clip": 0.01284818, + "auxiliary_loss_mlp": 0.00240947, + "balance_loss_clip": 1.05764151, + "balance_loss_mlp": 0.21581808, + "epoch": 0.7479933864422065, + "flos": 21142515525120.0, + "grad_norm": 3.4695347437616553, + "language_loss": 0.80233204, + "learning_rate": 6.298897397706597e-07, + "loss": 0.8175897, + "num_input_tokens_seen": 268323290, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.25134277, + "step": 12441, + "time_per_iteration": 2.702354669570923 + }, + { + "auxiliary_loss_clip": 0.01289677, + "auxiliary_loss_mlp": 0.00233658, + "balance_loss_clip": 1.05759811, + "balance_loss_mlp": 0.20552427, + "epoch": 0.7480535096948745, + "flos": 14392027912320.0, + "grad_norm": 61.08820436121606, + "language_loss": 0.92358434, + "learning_rate": 6.296060463313698e-07, + "loss": 0.93881768, + "num_input_tokens_seen": 268339490, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.28137207, + "step": 12442, + "time_per_iteration": 2.715532064437866 + }, + { + "auxiliary_loss_clip": 0.01323655, + "auxiliary_loss_mlp": 0.00251724, + "balance_loss_clip": 1.0865407, + "balance_loss_mlp": 0.22218373, + "epoch": 0.7481136329475425, + "flos": 27344683048320.0, + "grad_norm": 633.9063510640644, + "language_loss": 0.73650146, + "learning_rate": 6.293224048575565e-07, + "loss": 0.7522552, + "num_input_tokens_seen": 268359865, + "router_z_loss_clip": 2.37109375, + "router_z_loss_mlp": 0.29541016, + "step": 12443, + "time_per_iteration": 2.761531352996826 + }, + { + "auxiliary_loss_clip": 0.01277485, + "auxiliary_loss_mlp": 0.00245653, + "balance_loss_clip": 1.05500722, + "balance_loss_mlp": 0.22076166, + "epoch": 0.7481737562002104, + "flos": 19531454716800.0, + "grad_norm": 4.229804734088964, + "language_loss": 0.79228228, + "learning_rate": 6.29038815359975e-07, + "loss": 0.80751365, + "num_input_tokens_seen": 268377065, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.24865723, + "step": 12444, + "time_per_iteration": 2.616154193878174 + }, + { + "auxiliary_loss_clip": 0.01301418, + "auxiliary_loss_mlp": 0.00260726, + "balance_loss_clip": 1.06554246, + "balance_loss_mlp": 0.2323541, + "epoch": 0.7482338794528784, + "flos": 21760035166080.0, + "grad_norm": 2.582532633697953, + "language_loss": 0.75342238, + "learning_rate": 6.287552778493786e-07, + "loss": 0.76904392, + "num_input_tokens_seen": 268396935, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.28369141, + "step": 12445, + "time_per_iteration": 2.6954331398010254 + }, + { + "auxiliary_loss_clip": 0.01292614, + "auxiliary_loss_mlp": 0.00255013, + "balance_loss_clip": 1.06419539, + "balance_loss_mlp": 0.22828668, + "epoch": 0.7482940027055464, + "flos": 18697358021760.0, + "grad_norm": 3.970613056152344, + "language_loss": 0.82249463, + "learning_rate": 6.28471792336519e-07, + "loss": 0.83797091, + "num_input_tokens_seen": 268414460, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.26721191, + "step": 12446, + "time_per_iteration": 2.675278425216675 + }, + { + "auxiliary_loss_clip": 0.01307459, + "auxiliary_loss_mlp": 0.00246838, + "balance_loss_clip": 1.06894207, + "balance_loss_mlp": 0.21747719, + "epoch": 0.7483541259582144, + "flos": 15998024903040.0, + "grad_norm": 3.0063082973114192, + "language_loss": 0.81684119, + "learning_rate": 6.281883588321475e-07, + "loss": 0.83238411, + "num_input_tokens_seen": 268432225, + "router_z_loss_clip": 2.3828125, + "router_z_loss_mlp": 0.29382324, + "step": 12447, + "time_per_iteration": 2.6692981719970703 + }, + { + "auxiliary_loss_clip": 0.0129974, + "auxiliary_loss_mlp": 0.00252928, + "balance_loss_clip": 1.06376588, + "balance_loss_mlp": 0.22565269, + "epoch": 0.7484142492108823, + "flos": 25556295772800.0, + "grad_norm": 21.10796898332828, + "language_loss": 0.85289109, + "learning_rate": 6.279049773470109e-07, + "loss": 0.8684178, + "num_input_tokens_seen": 268449270, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.27294922, + "step": 12448, + "time_per_iteration": 2.6656105518341064 + }, + { + "auxiliary_loss_clip": 0.01291148, + "auxiliary_loss_mlp": 0.00231799, + "balance_loss_clip": 1.05845881, + "balance_loss_mlp": 0.20608565, + "epoch": 0.7484743724635503, + "flos": 22887737631360.0, + "grad_norm": 5.308758201220384, + "language_loss": 0.82126027, + "learning_rate": 6.276216478918543e-07, + "loss": 0.83648974, + "num_input_tokens_seen": 268467250, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.25671387, + "step": 12449, + "time_per_iteration": 2.697805404663086 + }, + { + "auxiliary_loss_clip": 0.01318559, + "auxiliary_loss_mlp": 0.00284347, + "balance_loss_clip": 1.07292449, + "balance_loss_mlp": 0.2567139, + "epoch": 0.7485344957162182, + "flos": 25300288563840.0, + "grad_norm": 26.40440656248313, + "language_loss": 0.69458467, + "learning_rate": 6.273383704774225e-07, + "loss": 0.71061373, + "num_input_tokens_seen": 268487270, + "router_z_loss_clip": 2.45703125, + "router_z_loss_mlp": 0.27636719, + "step": 12450, + "time_per_iteration": 2.6614151000976562 + }, + { + "auxiliary_loss_clip": 0.01286867, + "auxiliary_loss_mlp": 0.00234283, + "balance_loss_clip": 1.06566, + "balance_loss_mlp": 0.20870087, + "epoch": 0.7485946189688862, + "flos": 27053016612480.0, + "grad_norm": 4.3254156307020875, + "language_loss": 0.78348982, + "learning_rate": 6.270551451144577e-07, + "loss": 0.79870129, + "num_input_tokens_seen": 268508020, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.25598145, + "step": 12451, + "time_per_iteration": 2.784985303878784 + }, + { + "auxiliary_loss_clip": 0.01330058, + "auxiliary_loss_mlp": 0.00271919, + "balance_loss_clip": 1.08226073, + "balance_loss_mlp": 0.24296349, + "epoch": 0.7486547422215541, + "flos": 26906752431360.0, + "grad_norm": 7.8391319733960625, + "language_loss": 0.8972227, + "learning_rate": 6.267719718136988e-07, + "loss": 0.91324246, + "num_input_tokens_seen": 268527375, + "router_z_loss_clip": 2.47851562, + "router_z_loss_mlp": 0.28967285, + "step": 12452, + "time_per_iteration": 2.7311184406280518 + }, + { + "auxiliary_loss_clip": 0.01352617, + "auxiliary_loss_mlp": 0.00269555, + "balance_loss_clip": 1.10264874, + "balance_loss_mlp": 0.24015787, + "epoch": 0.7487148654742222, + "flos": 22346277039360.0, + "grad_norm": 4.224820433888867, + "language_loss": 0.79306298, + "learning_rate": 6.264888505858843e-07, + "loss": 0.80928469, + "num_input_tokens_seen": 268544870, + "router_z_loss_clip": 2.5, + "router_z_loss_mlp": 0.29345703, + "step": 12453, + "time_per_iteration": 2.7425343990325928 + }, + { + "auxiliary_loss_clip": 0.01302447, + "auxiliary_loss_mlp": 0.00236681, + "balance_loss_clip": 1.06788874, + "balance_loss_mlp": 0.20978743, + "epoch": 0.7487749887268901, + "flos": 23038814234880.0, + "grad_norm": 4.872183837831684, + "language_loss": 0.81901705, + "learning_rate": 6.262057814417517e-07, + "loss": 0.8344084, + "num_input_tokens_seen": 268564580, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.26928711, + "step": 12454, + "time_per_iteration": 2.694247245788574 + }, + { + "auxiliary_loss_clip": 0.01202878, + "auxiliary_loss_mlp": 0.00123985, + "balance_loss_clip": 1.03798032, + "balance_loss_mlp": 0.11511607, + "epoch": 0.7488351119795581, + "flos": 71525294536320.0, + "grad_norm": 0.7183918615209325, + "language_loss": 0.58715725, + "learning_rate": 6.259227643920322e-07, + "loss": 0.60042584, + "num_input_tokens_seen": 268629550, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.08886719, + "step": 12455, + "time_per_iteration": 3.423179864883423 + }, + { + "auxiliary_loss_clip": 0.01280562, + "auxiliary_loss_mlp": 0.00228411, + "balance_loss_clip": 1.05695462, + "balance_loss_mlp": 0.20281699, + "epoch": 0.748895235232226, + "flos": 17196255722880.0, + "grad_norm": 71.8676819485333, + "language_loss": 0.88212538, + "learning_rate": 6.256397994474592e-07, + "loss": 0.89721513, + "num_input_tokens_seen": 268646645, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.25634766, + "step": 12456, + "time_per_iteration": 2.6498279571533203 + }, + { + "auxiliary_loss_clip": 0.0119982, + "auxiliary_loss_mlp": 0.001679, + "balance_loss_clip": 1.03432882, + "balance_loss_mlp": 0.15774377, + "epoch": 0.748955358484894, + "flos": 58979256336000.0, + "grad_norm": 0.8906802383869041, + "language_loss": 0.60946584, + "learning_rate": 6.25356886618763e-07, + "loss": 0.62314302, + "num_input_tokens_seen": 268702275, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.1015625, + "step": 12457, + "time_per_iteration": 3.0612950325012207 + }, + { + "auxiliary_loss_clip": 0.01307413, + "auxiliary_loss_mlp": 0.00236567, + "balance_loss_clip": 1.07175362, + "balance_loss_mlp": 0.21144973, + "epoch": 0.749015481737562, + "flos": 11360413054080.0, + "grad_norm": 242.61831801093996, + "language_loss": 0.78290343, + "learning_rate": 6.250740259166711e-07, + "loss": 0.79834318, + "num_input_tokens_seen": 268716265, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.25134277, + "step": 12458, + "time_per_iteration": 2.6288180351257324 + }, + { + "auxiliary_loss_clip": 0.01284408, + "auxiliary_loss_mlp": 0.00247584, + "balance_loss_clip": 1.06072474, + "balance_loss_mlp": 0.22194239, + "epoch": 0.74907560499023, + "flos": 21106497162240.0, + "grad_norm": 19.929779847057716, + "language_loss": 0.83523142, + "learning_rate": 6.247912173519106e-07, + "loss": 0.85055137, + "num_input_tokens_seen": 268734330, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.2565918, + "step": 12459, + "time_per_iteration": 2.6447603702545166 + }, + { + "auxiliary_loss_clip": 0.0129858, + "auxiliary_loss_mlp": 0.00240002, + "balance_loss_clip": 1.06886172, + "balance_loss_mlp": 0.21295339, + "epoch": 0.749135728242898, + "flos": 22268027260800.0, + "grad_norm": 3.973942140528668, + "language_loss": 0.87779355, + "learning_rate": 6.245084609352043e-07, + "loss": 0.89317942, + "num_input_tokens_seen": 268753500, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.27062988, + "step": 12460, + "time_per_iteration": 2.6817595958709717 + }, + { + "auxiliary_loss_clip": 0.01293962, + "auxiliary_loss_mlp": 0.0025254, + "balance_loss_clip": 1.06412017, + "balance_loss_mlp": 0.22594473, + "epoch": 0.7491958514955659, + "flos": 24057527857920.0, + "grad_norm": 29.63831961050431, + "language_loss": 0.93207854, + "learning_rate": 6.242257566772755e-07, + "loss": 0.9475435, + "num_input_tokens_seen": 268772055, + "router_z_loss_clip": 2.30078125, + "router_z_loss_mlp": 0.26611328, + "step": 12461, + "time_per_iteration": 2.6822867393493652 + }, + { + "auxiliary_loss_clip": 0.01281368, + "auxiliary_loss_mlp": 0.00224036, + "balance_loss_clip": 1.06000102, + "balance_loss_mlp": 0.20018236, + "epoch": 0.7492559747482339, + "flos": 24492118510080.0, + "grad_norm": 24.70839756217448, + "language_loss": 0.77877223, + "learning_rate": 6.239431045888435e-07, + "loss": 0.79382634, + "num_input_tokens_seen": 268792265, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.2388916, + "step": 12462, + "time_per_iteration": 2.6966402530670166 + }, + { + "auxiliary_loss_clip": 0.01301773, + "auxiliary_loss_mlp": 0.00272502, + "balance_loss_clip": 1.06982982, + "balance_loss_mlp": 0.24535772, + "epoch": 0.7493160980009018, + "flos": 27745338326400.0, + "grad_norm": 15.424165432874085, + "language_loss": 0.79411924, + "learning_rate": 6.236605046806267e-07, + "loss": 0.80986202, + "num_input_tokens_seen": 268812735, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.27111816, + "step": 12463, + "time_per_iteration": 2.789400100708008 + }, + { + "auxiliary_loss_clip": 0.01275622, + "auxiliary_loss_mlp": 0.00243021, + "balance_loss_clip": 1.05285597, + "balance_loss_mlp": 0.21838091, + "epoch": 0.7493762212535698, + "flos": 30226190970240.0, + "grad_norm": 12.00355858197271, + "language_loss": 0.84720933, + "learning_rate": 6.233779569633419e-07, + "loss": 0.86239576, + "num_input_tokens_seen": 268833090, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.24645996, + "step": 12464, + "time_per_iteration": 2.80342435836792 + }, + { + "auxiliary_loss_clip": 0.01283159, + "auxiliary_loss_mlp": 0.00253574, + "balance_loss_clip": 1.05407906, + "balance_loss_mlp": 0.22756258, + "epoch": 0.7494363445062378, + "flos": 21944472526080.0, + "grad_norm": 290.1565301506593, + "language_loss": 0.86069226, + "learning_rate": 6.230954614477034e-07, + "loss": 0.87605959, + "num_input_tokens_seen": 268851880, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.26062012, + "step": 12465, + "time_per_iteration": 2.722250461578369 + }, + { + "auxiliary_loss_clip": 0.01336415, + "auxiliary_loss_mlp": 0.00261003, + "balance_loss_clip": 1.08316112, + "balance_loss_mlp": 0.23105793, + "epoch": 0.7494964677589058, + "flos": 12490342162560.0, + "grad_norm": 4.999096187530889, + "language_loss": 0.85676384, + "learning_rate": 6.22813018144422e-07, + "loss": 0.87273806, + "num_input_tokens_seen": 268867910, + "router_z_loss_clip": 2.53320312, + "router_z_loss_mlp": 0.29931641, + "step": 12466, + "time_per_iteration": 2.6257405281066895 + }, + { + "auxiliary_loss_clip": 0.01318774, + "auxiliary_loss_mlp": 0.00269332, + "balance_loss_clip": 1.08019865, + "balance_loss_mlp": 0.24166401, + "epoch": 0.7495565910115737, + "flos": 21653057485440.0, + "grad_norm": 37.22856354617648, + "language_loss": 0.74850595, + "learning_rate": 6.22530627064209e-07, + "loss": 0.76438701, + "num_input_tokens_seen": 268887260, + "router_z_loss_clip": 2.38867188, + "router_z_loss_mlp": 0.27648926, + "step": 12467, + "time_per_iteration": 2.643465757369995 + }, + { + "auxiliary_loss_clip": 0.0131683, + "auxiliary_loss_mlp": 0.00248706, + "balance_loss_clip": 1.07432556, + "balance_loss_mlp": 0.22021481, + "epoch": 0.7496167142642417, + "flos": 15268535591040.0, + "grad_norm": 14.814807135000269, + "language_loss": 0.86569273, + "learning_rate": 6.222482882177735e-07, + "loss": 0.88134813, + "num_input_tokens_seen": 268902520, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.28503418, + "step": 12468, + "time_per_iteration": 4.113052845001221 + }, + { + "auxiliary_loss_clip": 0.01299763, + "auxiliary_loss_mlp": 0.00229603, + "balance_loss_clip": 1.06468976, + "balance_loss_mlp": 0.20305556, + "epoch": 0.7496768375169096, + "flos": 22054933825920.0, + "grad_norm": 11.517291407864052, + "language_loss": 0.78028494, + "learning_rate": 6.219660016158201e-07, + "loss": 0.7955786, + "num_input_tokens_seen": 268920970, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.26501465, + "step": 12469, + "time_per_iteration": 2.6290242671966553 + }, + { + "auxiliary_loss_clip": 0.01313349, + "auxiliary_loss_mlp": 0.00235918, + "balance_loss_clip": 1.07408953, + "balance_loss_mlp": 0.20798752, + "epoch": 0.7497369607695776, + "flos": 19057038860160.0, + "grad_norm": 6.490601707817686, + "language_loss": 0.77652812, + "learning_rate": 6.216837672690543e-07, + "loss": 0.7920208, + "num_input_tokens_seen": 268936600, + "router_z_loss_clip": 2.39453125, + "router_z_loss_mlp": 0.2791748, + "step": 12470, + "time_per_iteration": 4.075212001800537 + }, + { + "auxiliary_loss_clip": 0.01305011, + "auxiliary_loss_mlp": 0.00269618, + "balance_loss_clip": 1.06188774, + "balance_loss_mlp": 0.23914862, + "epoch": 0.7497970840222457, + "flos": 21617434172160.0, + "grad_norm": 13.297279286671179, + "language_loss": 0.85294437, + "learning_rate": 6.214015851881793e-07, + "loss": 0.86869067, + "num_input_tokens_seen": 268956560, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.30419922, + "step": 12471, + "time_per_iteration": 2.6477580070495605 + }, + { + "auxiliary_loss_clip": 0.01288424, + "auxiliary_loss_mlp": 0.002364, + "balance_loss_clip": 1.05461848, + "balance_loss_mlp": 0.2089697, + "epoch": 0.7498572072749136, + "flos": 13735580906880.0, + "grad_norm": 13.17358645911383, + "language_loss": 0.91231173, + "learning_rate": 6.211194553838929e-07, + "loss": 0.92755997, + "num_input_tokens_seen": 268973945, + "router_z_loss_clip": 2.3359375, + "router_z_loss_mlp": 0.27441406, + "step": 12472, + "time_per_iteration": 2.6355648040771484 + }, + { + "auxiliary_loss_clip": 0.012938, + "auxiliary_loss_mlp": 0.00244721, + "balance_loss_clip": 1.06477666, + "balance_loss_mlp": 0.21886453, + "epoch": 0.7499173305275816, + "flos": 22966526113920.0, + "grad_norm": 9.70357782466278, + "language_loss": 0.91755533, + "learning_rate": 6.208373778668951e-07, + "loss": 0.9329406, + "num_input_tokens_seen": 268993245, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.25891113, + "step": 12473, + "time_per_iteration": 2.65549635887146 + }, + { + "auxiliary_loss_clip": 0.01322755, + "auxiliary_loss_mlp": 0.0026459, + "balance_loss_clip": 1.07973015, + "balance_loss_mlp": 0.2375415, + "epoch": 0.7499774537802495, + "flos": 22740467869440.0, + "grad_norm": 5.020482024584094, + "language_loss": 0.82620728, + "learning_rate": 6.205553526478829e-07, + "loss": 0.84208071, + "num_input_tokens_seen": 269012125, + "router_z_loss_clip": 2.4296875, + "router_z_loss_mlp": 0.27026367, + "step": 12474, + "time_per_iteration": 4.17205286026001 + }, + { + "auxiliary_loss_clip": 0.0131679, + "auxiliary_loss_mlp": 0.00285978, + "balance_loss_clip": 1.07670116, + "balance_loss_mlp": 0.25767735, + "epoch": 0.7500375770329175, + "flos": 18296559089280.0, + "grad_norm": 14.7656479144248, + "language_loss": 0.84062934, + "learning_rate": 6.202733797375492e-07, + "loss": 0.85665703, + "num_input_tokens_seen": 269030545, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.28308105, + "step": 12475, + "time_per_iteration": 2.7308402061462402 + }, + { + "auxiliary_loss_clip": 0.01307308, + "auxiliary_loss_mlp": 0.00266274, + "balance_loss_clip": 1.06817961, + "balance_loss_mlp": 0.23697272, + "epoch": 0.7500977002855854, + "flos": 19169978198400.0, + "grad_norm": 4.001296191613778, + "language_loss": 0.90004545, + "learning_rate": 6.199914591465878e-07, + "loss": 0.91578126, + "num_input_tokens_seen": 269048180, + "router_z_loss_clip": 2.390625, + "router_z_loss_mlp": 0.29296875, + "step": 12476, + "time_per_iteration": 2.7059619426727295 + }, + { + "auxiliary_loss_clip": 0.01306546, + "auxiliary_loss_mlp": 0.00247041, + "balance_loss_clip": 1.06416786, + "balance_loss_mlp": 0.22025496, + "epoch": 0.7501578235382534, + "flos": 22163886754560.0, + "grad_norm": 6.048106329043422, + "language_loss": 0.84372127, + "learning_rate": 6.19709590885688e-07, + "loss": 0.8592571, + "num_input_tokens_seen": 269068600, + "router_z_loss_clip": 2.42382812, + "router_z_loss_mlp": 0.26794434, + "step": 12477, + "time_per_iteration": 2.7879600524902344 + }, + { + "auxiliary_loss_clip": 0.01205541, + "auxiliary_loss_mlp": 0.00154205, + "balance_loss_clip": 1.04353869, + "balance_loss_mlp": 0.14347588, + "epoch": 0.7502179467909214, + "flos": 64465040033280.0, + "grad_norm": 0.8004231533549595, + "language_loss": 0.53527057, + "learning_rate": 6.194277749655394e-07, + "loss": 0.54886806, + "num_input_tokens_seen": 269119045, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.10742188, + "step": 12478, + "time_per_iteration": 4.65759015083313 + }, + { + "auxiliary_loss_clip": 0.01277932, + "auxiliary_loss_mlp": 0.00223087, + "balance_loss_clip": 1.05358434, + "balance_loss_mlp": 0.19889933, + "epoch": 0.7502780700435894, + "flos": 20478275268480.0, + "grad_norm": 2.868545559724577, + "language_loss": 0.88888717, + "learning_rate": 6.191460113968272e-07, + "loss": 0.9038974, + "num_input_tokens_seen": 269136755, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.24157715, + "step": 12479, + "time_per_iteration": 2.6429803371429443 + }, + { + "auxiliary_loss_clip": 0.01301853, + "auxiliary_loss_mlp": 0.00286126, + "balance_loss_clip": 1.06565082, + "balance_loss_mlp": 0.25632399, + "epoch": 0.7503381932962573, + "flos": 20445273648000.0, + "grad_norm": 2.811533641580032, + "language_loss": 0.74383867, + "learning_rate": 6.188643001902369e-07, + "loss": 0.75971854, + "num_input_tokens_seen": 269156120, + "router_z_loss_clip": 2.3671875, + "router_z_loss_mlp": 0.2980957, + "step": 12480, + "time_per_iteration": 2.672666072845459 + }, + { + "auxiliary_loss_clip": 0.01268096, + "auxiliary_loss_mlp": 0.00238407, + "balance_loss_clip": 1.04691315, + "balance_loss_mlp": 0.21338516, + "epoch": 0.7503983165489253, + "flos": 22381936266240.0, + "grad_norm": 7.870035776576065, + "language_loss": 0.84729689, + "learning_rate": 6.185826413564512e-07, + "loss": 0.86236191, + "num_input_tokens_seen": 269175650, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.25024414, + "step": 12481, + "time_per_iteration": 2.6653196811676025 + }, + { + "auxiliary_loss_clip": 0.0129337, + "auxiliary_loss_mlp": 0.00261026, + "balance_loss_clip": 1.05579889, + "balance_loss_mlp": 0.23257121, + "epoch": 0.7504584398015932, + "flos": 24899453717760.0, + "grad_norm": 288.5599406885729, + "language_loss": 0.79852653, + "learning_rate": 6.183010349061501e-07, + "loss": 0.81407046, + "num_input_tokens_seen": 269197080, + "router_z_loss_clip": 2.37695312, + "router_z_loss_mlp": 0.28417969, + "step": 12482, + "time_per_iteration": 2.8244144916534424 + }, + { + "auxiliary_loss_clip": 0.01284348, + "auxiliary_loss_mlp": 0.00296765, + "balance_loss_clip": 1.05648255, + "balance_loss_mlp": 0.26859558, + "epoch": 0.7505185630542612, + "flos": 25885237547520.0, + "grad_norm": 24.31309298517661, + "language_loss": 0.78583848, + "learning_rate": 6.180194808500118e-07, + "loss": 0.80164957, + "num_input_tokens_seen": 269218600, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.28186035, + "step": 12483, + "time_per_iteration": 2.769561290740967 + }, + { + "auxiliary_loss_clip": 0.01282325, + "auxiliary_loss_mlp": 0.00271098, + "balance_loss_clip": 1.054829, + "balance_loss_mlp": 0.24508618, + "epoch": 0.7505786863069293, + "flos": 23143852581120.0, + "grad_norm": 6.132587859533722, + "language_loss": 0.8102119, + "learning_rate": 6.177379791987131e-07, + "loss": 0.82574606, + "num_input_tokens_seen": 269239245, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.26000977, + "step": 12484, + "time_per_iteration": 2.7096052169799805 + }, + { + "auxiliary_loss_clip": 0.01286151, + "auxiliary_loss_mlp": 0.00274137, + "balance_loss_clip": 1.05648732, + "balance_loss_mlp": 0.2474346, + "epoch": 0.7506388095595972, + "flos": 16983377769600.0, + "grad_norm": 60.29314029086098, + "language_loss": 0.9203254, + "learning_rate": 6.174565299629295e-07, + "loss": 0.93592823, + "num_input_tokens_seen": 269258520, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.26708984, + "step": 12485, + "time_per_iteration": 2.6579833030700684 + }, + { + "auxiliary_loss_clip": 0.01270257, + "auxiliary_loss_mlp": 0.00235856, + "balance_loss_clip": 1.05176163, + "balance_loss_mlp": 0.211513, + "epoch": 0.7506989328122652, + "flos": 22344984149760.0, + "grad_norm": 27.486481467458674, + "language_loss": 0.84489346, + "learning_rate": 6.171751331533323e-07, + "loss": 0.8599546, + "num_input_tokens_seen": 269278320, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.24353027, + "step": 12486, + "time_per_iteration": 2.650078535079956 + }, + { + "auxiliary_loss_clip": 0.01293108, + "auxiliary_loss_mlp": 0.00275052, + "balance_loss_clip": 1.06411994, + "balance_loss_mlp": 0.24774104, + "epoch": 0.7507590560649331, + "flos": 25776069137280.0, + "grad_norm": 3.47378740253378, + "language_loss": 0.80091572, + "learning_rate": 6.168937887805932e-07, + "loss": 0.81659722, + "num_input_tokens_seen": 269298025, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.27282715, + "step": 12487, + "time_per_iteration": 2.7029314041137695 + }, + { + "auxiliary_loss_clip": 0.01298912, + "auxiliary_loss_mlp": 0.00252273, + "balance_loss_clip": 1.06399584, + "balance_loss_mlp": 0.22585654, + "epoch": 0.7508191793176011, + "flos": 24279420124800.0, + "grad_norm": 27.92391169644297, + "language_loss": 0.77902055, + "learning_rate": 6.166124968553801e-07, + "loss": 0.79453236, + "num_input_tokens_seen": 269316770, + "router_z_loss_clip": 2.34960938, + "router_z_loss_mlp": 0.26428223, + "step": 12488, + "time_per_iteration": 2.672670841217041 + }, + { + "auxiliary_loss_clip": 0.01291467, + "auxiliary_loss_mlp": 0.00274003, + "balance_loss_clip": 1.06560755, + "balance_loss_mlp": 0.24665666, + "epoch": 0.750879302570269, + "flos": 19899575251200.0, + "grad_norm": 726.1232372969641, + "language_loss": 0.83207226, + "learning_rate": 6.163312573883592e-07, + "loss": 0.84772694, + "num_input_tokens_seen": 269334755, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.2734375, + "step": 12489, + "time_per_iteration": 2.7408132553100586 + }, + { + "auxiliary_loss_clip": 0.01291517, + "auxiliary_loss_mlp": 0.00286601, + "balance_loss_clip": 1.06298804, + "balance_loss_mlp": 0.2585634, + "epoch": 0.750939425822937, + "flos": 29205681667200.0, + "grad_norm": 18.042870867786547, + "language_loss": 0.8295821, + "learning_rate": 6.160500703901956e-07, + "loss": 0.84536326, + "num_input_tokens_seen": 269353810, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.28027344, + "step": 12490, + "time_per_iteration": 2.7784836292266846 + }, + { + "auxiliary_loss_clip": 0.01305439, + "auxiliary_loss_mlp": 0.00267094, + "balance_loss_clip": 1.06732559, + "balance_loss_mlp": 0.23989066, + "epoch": 0.750999549075605, + "flos": 21142300043520.0, + "grad_norm": 55.00373735662941, + "language_loss": 0.86044168, + "learning_rate": 6.157689358715527e-07, + "loss": 0.876167, + "num_input_tokens_seen": 269372910, + "router_z_loss_clip": 2.37890625, + "router_z_loss_mlp": 0.27233887, + "step": 12491, + "time_per_iteration": 2.725717782974243 + }, + { + "auxiliary_loss_clip": 0.01288395, + "auxiliary_loss_mlp": 0.00258993, + "balance_loss_clip": 1.05843759, + "balance_loss_mlp": 0.23275481, + "epoch": 0.751059672328273, + "flos": 23547740083200.0, + "grad_norm": 79.15258017327297, + "language_loss": 0.83069456, + "learning_rate": 6.154878538430899e-07, + "loss": 0.84616846, + "num_input_tokens_seen": 269391545, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.26245117, + "step": 12492, + "time_per_iteration": 2.653517007827759 + }, + { + "auxiliary_loss_clip": 0.01289564, + "auxiliary_loss_mlp": 0.00267695, + "balance_loss_clip": 1.05643845, + "balance_loss_mlp": 0.2414809, + "epoch": 0.7511197955809409, + "flos": 18989742729600.0, + "grad_norm": 311.69382688040923, + "language_loss": 0.78475136, + "learning_rate": 6.152068243154671e-07, + "loss": 0.80032396, + "num_input_tokens_seen": 269408530, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.26245117, + "step": 12493, + "time_per_iteration": 2.655421495437622 + }, + { + "auxiliary_loss_clip": 0.01279525, + "auxiliary_loss_mlp": 0.00258024, + "balance_loss_clip": 1.04832113, + "balance_loss_mlp": 0.23047486, + "epoch": 0.7511799188336089, + "flos": 22046961006720.0, + "grad_norm": 6.877721721534211, + "language_loss": 0.87289226, + "learning_rate": 6.149258472993395e-07, + "loss": 0.8882677, + "num_input_tokens_seen": 269425930, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.27526855, + "step": 12494, + "time_per_iteration": 2.6517417430877686 + }, + { + "auxiliary_loss_clip": 0.0129887, + "auxiliary_loss_mlp": 0.00259407, + "balance_loss_clip": 1.06207478, + "balance_loss_mlp": 0.23163159, + "epoch": 0.7512400420862768, + "flos": 16467125546880.0, + "grad_norm": 43.441595010049866, + "language_loss": 0.85959351, + "learning_rate": 6.146449228053634e-07, + "loss": 0.87517631, + "num_input_tokens_seen": 269443945, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.27746582, + "step": 12495, + "time_per_iteration": 2.660835027694702 + }, + { + "auxiliary_loss_clip": 0.01290041, + "auxiliary_loss_mlp": 0.00260641, + "balance_loss_clip": 1.05773044, + "balance_loss_mlp": 0.23416433, + "epoch": 0.7513001653389448, + "flos": 20448326304000.0, + "grad_norm": 11.343752149526738, + "language_loss": 0.77690554, + "learning_rate": 6.143640508441898e-07, + "loss": 0.7924124, + "num_input_tokens_seen": 269463625, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.26513672, + "step": 12496, + "time_per_iteration": 2.746290445327759 + }, + { + "auxiliary_loss_clip": 0.01293642, + "auxiliary_loss_mlp": 0.00291954, + "balance_loss_clip": 1.05971646, + "balance_loss_mlp": 0.26391554, + "epoch": 0.7513602885916129, + "flos": 23476816679040.0, + "grad_norm": 6.647271950431287, + "language_loss": 0.83953071, + "learning_rate": 6.140832314264705e-07, + "loss": 0.85538673, + "num_input_tokens_seen": 269483415, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.28063965, + "step": 12497, + "time_per_iteration": 2.670529842376709 + }, + { + "auxiliary_loss_clip": 0.01278589, + "auxiliary_loss_mlp": 0.0027212, + "balance_loss_clip": 1.04744637, + "balance_loss_mlp": 0.24463038, + "epoch": 0.7514204118442808, + "flos": 26797224885120.0, + "grad_norm": 94.4727330028092, + "language_loss": 0.83975422, + "learning_rate": 6.13802464562855e-07, + "loss": 0.85526133, + "num_input_tokens_seen": 269504635, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.27502441, + "step": 12498, + "time_per_iteration": 2.739048480987549 + }, + { + "auxiliary_loss_clip": 0.01276354, + "auxiliary_loss_mlp": 0.00278369, + "balance_loss_clip": 1.05811501, + "balance_loss_mlp": 0.25344205, + "epoch": 0.7514805350969488, + "flos": 19865639877120.0, + "grad_norm": 31.528500084232327, + "language_loss": 0.81650496, + "learning_rate": 6.135217502639878e-07, + "loss": 0.83205223, + "num_input_tokens_seen": 269523955, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.24914551, + "step": 12499, + "time_per_iteration": 2.673887014389038 + }, + { + "auxiliary_loss_clip": 0.01272023, + "auxiliary_loss_mlp": 0.00247632, + "balance_loss_clip": 1.05097985, + "balance_loss_mlp": 0.22392094, + "epoch": 0.7515406583496167, + "flos": 24571553437440.0, + "grad_norm": 3.1076282040512657, + "language_loss": 0.86736447, + "learning_rate": 6.132410885405148e-07, + "loss": 0.88256097, + "num_input_tokens_seen": 269544410, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.23706055, + "step": 12500, + "time_per_iteration": 2.740064859390259 + }, + { + "auxiliary_loss_clip": 0.01329509, + "auxiliary_loss_mlp": 0.00281974, + "balance_loss_clip": 1.08002019, + "balance_loss_mlp": 0.25291044, + "epoch": 0.7516007816022847, + "flos": 20120246455680.0, + "grad_norm": 8.565206812812418, + "language_loss": 0.82216161, + "learning_rate": 6.129604794030794e-07, + "loss": 0.83827645, + "num_input_tokens_seen": 269563315, + "router_z_loss_clip": 2.4921875, + "router_z_loss_mlp": 0.29064941, + "step": 12501, + "time_per_iteration": 2.686832904815674 + }, + { + "auxiliary_loss_clip": 0.01271144, + "auxiliary_loss_mlp": 0.00291392, + "balance_loss_clip": 1.04486442, + "balance_loss_mlp": 0.26498717, + "epoch": 0.7516609048549526, + "flos": 22784638619520.0, + "grad_norm": 7.29073159647736, + "language_loss": 0.85006958, + "learning_rate": 6.126799228623207e-07, + "loss": 0.86569494, + "num_input_tokens_seen": 269583950, + "router_z_loss_clip": 2.265625, + "router_z_loss_mlp": 0.26403809, + "step": 12502, + "time_per_iteration": 2.7141616344451904 + }, + { + "auxiliary_loss_clip": 0.0131533, + "auxiliary_loss_mlp": 0.00309306, + "balance_loss_clip": 1.0738889, + "balance_loss_mlp": 0.2800279, + "epoch": 0.7517210281076206, + "flos": 10634012311680.0, + "grad_norm": 41.083480787449474, + "language_loss": 0.81287336, + "learning_rate": 6.123994189288786e-07, + "loss": 0.82911974, + "num_input_tokens_seen": 269600120, + "router_z_loss_clip": 2.4140625, + "router_z_loss_mlp": 0.29321289, + "step": 12503, + "time_per_iteration": 2.6630327701568604 + }, + { + "auxiliary_loss_clip": 0.01208753, + "auxiliary_loss_mlp": 0.00132461, + "balance_loss_clip": 1.05116463, + "balance_loss_mlp": 0.12158871, + "epoch": 0.7517811513602886, + "flos": 66052221275520.0, + "grad_norm": 1.0450516033114592, + "language_loss": 0.63403547, + "learning_rate": 6.121189676133903e-07, + "loss": 0.64744759, + "num_input_tokens_seen": 269659815, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.10888672, + "step": 12504, + "time_per_iteration": 3.122180938720703 + }, + { + "auxiliary_loss_clip": 0.01258409, + "auxiliary_loss_mlp": 0.00251675, + "balance_loss_clip": 1.03749323, + "balance_loss_mlp": 0.22660543, + "epoch": 0.7518412746129566, + "flos": 37268345018880.0, + "grad_norm": 2.223151359437166, + "language_loss": 0.7471661, + "learning_rate": 6.118385689264896e-07, + "loss": 0.76226693, + "num_input_tokens_seen": 269684565, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.25097656, + "step": 12505, + "time_per_iteration": 2.9461798667907715 + }, + { + "auxiliary_loss_clip": 0.01201916, + "auxiliary_loss_mlp": 0.00159854, + "balance_loss_clip": 1.04451919, + "balance_loss_mlp": 0.14869569, + "epoch": 0.7519013978656245, + "flos": 60518567727360.0, + "grad_norm": 0.6312058656466595, + "language_loss": 0.54449302, + "learning_rate": 6.11558222878809e-07, + "loss": 0.55811071, + "num_input_tokens_seen": 269752325, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.11181641, + "step": 12506, + "time_per_iteration": 3.262106418609619 + }, + { + "auxiliary_loss_clip": 0.01321564, + "auxiliary_loss_mlp": 0.00274025, + "balance_loss_clip": 1.07706225, + "balance_loss_mlp": 0.24487823, + "epoch": 0.7519615211182925, + "flos": 18806885568000.0, + "grad_norm": 7.161409393359197, + "language_loss": 0.87816656, + "learning_rate": 6.112779294809796e-07, + "loss": 0.89412242, + "num_input_tokens_seen": 269770630, + "router_z_loss_clip": 2.4453125, + "router_z_loss_mlp": 0.29138184, + "step": 12507, + "time_per_iteration": 2.687849998474121 + }, + { + "auxiliary_loss_clip": 0.01256848, + "auxiliary_loss_mlp": 0.00274981, + "balance_loss_clip": 1.04216886, + "balance_loss_mlp": 0.24976851, + "epoch": 0.7520216443709604, + "flos": 14575244209920.0, + "grad_norm": 8.834018747233726, + "language_loss": 0.77965659, + "learning_rate": 6.10997688743631e-07, + "loss": 0.79497486, + "num_input_tokens_seen": 269787280, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.25231934, + "step": 12508, + "time_per_iteration": 2.6248879432678223 + }, + { + "auxiliary_loss_clip": 0.01280049, + "auxiliary_loss_mlp": 0.00273786, + "balance_loss_clip": 1.05550373, + "balance_loss_mlp": 0.24746479, + "epoch": 0.7520817676236284, + "flos": 17056599644160.0, + "grad_norm": 2.1860823527440867, + "language_loss": 0.78590477, + "learning_rate": 6.107175006773885e-07, + "loss": 0.8014431, + "num_input_tokens_seen": 269805205, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.26318359, + "step": 12509, + "time_per_iteration": 2.679819345474243 + }, + { + "auxiliary_loss_clip": 0.01296876, + "auxiliary_loss_mlp": 0.00284961, + "balance_loss_clip": 1.0597986, + "balance_loss_mlp": 0.25525451, + "epoch": 0.7521418908762965, + "flos": 25666397936640.0, + "grad_norm": 5.43598499479025, + "language_loss": 0.7035917, + "learning_rate": 6.104373652928785e-07, + "loss": 0.71941012, + "num_input_tokens_seen": 269824820, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.29736328, + "step": 12510, + "time_per_iteration": 2.6889264583587646 + }, + { + "auxiliary_loss_clip": 0.01281272, + "auxiliary_loss_mlp": 0.00273775, + "balance_loss_clip": 1.05777895, + "balance_loss_mlp": 0.2468572, + "epoch": 0.7522020141289644, + "flos": 20886759711360.0, + "grad_norm": 74.46407791765485, + "language_loss": 0.88450778, + "learning_rate": 6.10157282600722e-07, + "loss": 0.90005827, + "num_input_tokens_seen": 269842825, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.26940918, + "step": 12511, + "time_per_iteration": 4.0594775676727295 + }, + { + "auxiliary_loss_clip": 0.01285776, + "auxiliary_loss_mlp": 0.00279549, + "balance_loss_clip": 1.05185652, + "balance_loss_mlp": 0.25235745, + "epoch": 0.7522621373816324, + "flos": 12640305444480.0, + "grad_norm": 27.423544250743607, + "language_loss": 0.84609419, + "learning_rate": 6.098772526115412e-07, + "loss": 0.86174744, + "num_input_tokens_seen": 269859000, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.27172852, + "step": 12512, + "time_per_iteration": 4.028371572494507 + }, + { + "auxiliary_loss_clip": 0.01261673, + "auxiliary_loss_mlp": 0.00263273, + "balance_loss_clip": 1.04122293, + "balance_loss_mlp": 0.23761943, + "epoch": 0.7523222606343003, + "flos": 25626141768960.0, + "grad_norm": 497.15509450032374, + "language_loss": 0.88279641, + "learning_rate": 6.095972753359537e-07, + "loss": 0.8980459, + "num_input_tokens_seen": 269878895, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.25683594, + "step": 12513, + "time_per_iteration": 2.6903774738311768 + }, + { + "auxiliary_loss_clip": 0.01289522, + "auxiliary_loss_mlp": 0.00249272, + "balance_loss_clip": 1.0565114, + "balance_loss_mlp": 0.21895757, + "epoch": 0.7523823838869683, + "flos": 20448900921600.0, + "grad_norm": 10.297269541552067, + "language_loss": 0.82456231, + "learning_rate": 6.093173507845771e-07, + "loss": 0.8399502, + "num_input_tokens_seen": 269897280, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.30285645, + "step": 12514, + "time_per_iteration": 2.6166131496429443 + }, + { + "auxiliary_loss_clip": 0.01260069, + "auxiliary_loss_mlp": 0.00252455, + "balance_loss_clip": 1.04254842, + "balance_loss_mlp": 0.22763579, + "epoch": 0.7524425071396362, + "flos": 14720610551040.0, + "grad_norm": 368.2859388430732, + "language_loss": 0.76657128, + "learning_rate": 6.090374789680271e-07, + "loss": 0.7816965, + "num_input_tokens_seen": 269914640, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.24829102, + "step": 12515, + "time_per_iteration": 2.6741838455200195 + }, + { + "auxiliary_loss_clip": 0.01287164, + "auxiliary_loss_mlp": 0.00262893, + "balance_loss_clip": 1.05463862, + "balance_loss_mlp": 0.23702507, + "epoch": 0.7525026303923043, + "flos": 30592048947840.0, + "grad_norm": 11.022183281724217, + "language_loss": 0.801332, + "learning_rate": 6.087576598969137e-07, + "loss": 0.81683254, + "num_input_tokens_seen": 269934960, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.25854492, + "step": 12516, + "time_per_iteration": 4.233945369720459 + }, + { + "auxiliary_loss_clip": 0.01270501, + "auxiliary_loss_mlp": 0.00263315, + "balance_loss_clip": 1.05038011, + "balance_loss_mlp": 0.23588482, + "epoch": 0.7525627536449722, + "flos": 24791757765120.0, + "grad_norm": 1.6634787136932223, + "language_loss": 0.94560826, + "learning_rate": 6.084778935818495e-07, + "loss": 0.96094638, + "num_input_tokens_seen": 269956655, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.27404785, + "step": 12517, + "time_per_iteration": 2.7058098316192627 + }, + { + "auxiliary_loss_clip": 0.01300128, + "auxiliary_loss_mlp": 0.00245428, + "balance_loss_clip": 1.06806004, + "balance_loss_mlp": 0.2192972, + "epoch": 0.7526228768976402, + "flos": 20779782030720.0, + "grad_norm": 438.0698607567295, + "language_loss": 0.82059324, + "learning_rate": 6.081981800334437e-07, + "loss": 0.83604884, + "num_input_tokens_seen": 269976835, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.26123047, + "step": 12518, + "time_per_iteration": 2.722987651824951 + }, + { + "auxiliary_loss_clip": 0.01189494, + "auxiliary_loss_mlp": 0.00105926, + "balance_loss_clip": 1.03650677, + "balance_loss_mlp": 0.09724726, + "epoch": 0.7526830001503081, + "flos": 66559243703040.0, + "grad_norm": 0.6869482806994321, + "language_loss": 0.54946476, + "learning_rate": 6.079185192623017e-07, + "loss": 0.56241894, + "num_input_tokens_seen": 270040630, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.08691406, + "step": 12519, + "time_per_iteration": 3.253899097442627 + }, + { + "auxiliary_loss_clip": 0.01277343, + "auxiliary_loss_mlp": 0.00295608, + "balance_loss_clip": 1.0510025, + "balance_loss_mlp": 0.2677722, + "epoch": 0.7527431234029761, + "flos": 23477894087040.0, + "grad_norm": 14.68673891562467, + "language_loss": 0.8293404, + "learning_rate": 6.07638911279029e-07, + "loss": 0.84506989, + "num_input_tokens_seen": 270059695, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.27832031, + "step": 12520, + "time_per_iteration": 4.172483682632446 + }, + { + "auxiliary_loss_clip": 0.01261482, + "auxiliary_loss_mlp": 0.00246523, + "balance_loss_clip": 1.0380336, + "balance_loss_mlp": 0.21955773, + "epoch": 0.752803246655644, + "flos": 22049546785920.0, + "grad_norm": 13.618211179454727, + "language_loss": 0.8071208, + "learning_rate": 6.07359356094229e-07, + "loss": 0.82220083, + "num_input_tokens_seen": 270078420, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.27001953, + "step": 12521, + "time_per_iteration": 2.7003703117370605 + }, + { + "auxiliary_loss_clip": 0.01305562, + "auxiliary_loss_mlp": 0.0027563, + "balance_loss_clip": 1.06695807, + "balance_loss_mlp": 0.24643555, + "epoch": 0.752863369908312, + "flos": 30153795108480.0, + "grad_norm": 2.843850777901777, + "language_loss": 0.75286829, + "learning_rate": 6.070798537185016e-07, + "loss": 0.76868021, + "num_input_tokens_seen": 270097040, + "router_z_loss_clip": 2.38476562, + "router_z_loss_mlp": 0.29187012, + "step": 12522, + "time_per_iteration": 2.7394142150878906 + }, + { + "auxiliary_loss_clip": 0.01292765, + "auxiliary_loss_mlp": 0.0026829, + "balance_loss_clip": 1.05979729, + "balance_loss_mlp": 0.24095546, + "epoch": 0.7529234931609801, + "flos": 24567638855040.0, + "grad_norm": 13.137864832565525, + "language_loss": 0.84102857, + "learning_rate": 6.068004041624453e-07, + "loss": 0.85663915, + "num_input_tokens_seen": 270116365, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.2734375, + "step": 12523, + "time_per_iteration": 2.7307329177856445 + }, + { + "auxiliary_loss_clip": 0.01270207, + "auxiliary_loss_mlp": 0.00275655, + "balance_loss_clip": 1.04512155, + "balance_loss_mlp": 0.24946511, + "epoch": 0.752983616413648, + "flos": 23112395245440.0, + "grad_norm": 76.4800201302663, + "language_loss": 0.87523299, + "learning_rate": 6.065210074366571e-07, + "loss": 0.89069158, + "num_input_tokens_seen": 270135395, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.26208496, + "step": 12524, + "time_per_iteration": 2.6739490032196045 + }, + { + "auxiliary_loss_clip": 0.01267706, + "auxiliary_loss_mlp": 0.00270431, + "balance_loss_clip": 1.04447746, + "balance_loss_mlp": 0.2429176, + "epoch": 0.753043739666316, + "flos": 24316946858880.0, + "grad_norm": 3.5212036375808347, + "language_loss": 0.80610561, + "learning_rate": 6.062416635517326e-07, + "loss": 0.82148695, + "num_input_tokens_seen": 270156425, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.27526855, + "step": 12525, + "time_per_iteration": 2.724648952484131 + }, + { + "auxiliary_loss_clip": 0.0129138, + "auxiliary_loss_mlp": 0.00269133, + "balance_loss_clip": 1.0628463, + "balance_loss_mlp": 0.24265708, + "epoch": 0.7531038629189839, + "flos": 24243294021120.0, + "grad_norm": 5.9211812250633855, + "language_loss": 0.82918763, + "learning_rate": 6.059623725182641e-07, + "loss": 0.84479272, + "num_input_tokens_seen": 270176905, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.26477051, + "step": 12526, + "time_per_iteration": 2.692064046859741 + }, + { + "auxiliary_loss_clip": 0.01264706, + "auxiliary_loss_mlp": 0.00233559, + "balance_loss_clip": 1.03973317, + "balance_loss_mlp": 0.20668897, + "epoch": 0.7531639861716519, + "flos": 30188807890560.0, + "grad_norm": 22078.638196319032, + "language_loss": 0.80052811, + "learning_rate": 6.056831343468414e-07, + "loss": 0.81551075, + "num_input_tokens_seen": 270196640, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.2689209, + "step": 12527, + "time_per_iteration": 2.783945083618164 + }, + { + "auxiliary_loss_clip": 0.01262449, + "auxiliary_loss_mlp": 0.00258853, + "balance_loss_clip": 1.04267716, + "balance_loss_mlp": 0.23352104, + "epoch": 0.7532241094243198, + "flos": 18223193560320.0, + "grad_norm": 8.776421866891916, + "language_loss": 0.89242077, + "learning_rate": 6.054039490480539e-07, + "loss": 0.90763384, + "num_input_tokens_seen": 270213905, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.2532959, + "step": 12528, + "time_per_iteration": 2.7970943450927734 + }, + { + "auxiliary_loss_clip": 0.01298147, + "auxiliary_loss_mlp": 0.00242171, + "balance_loss_clip": 1.06018329, + "balance_loss_mlp": 0.21524188, + "epoch": 0.7532842326769879, + "flos": 20881049448960.0, + "grad_norm": 135.24776129383724, + "language_loss": 0.92432958, + "learning_rate": 6.051248166324892e-07, + "loss": 0.93973279, + "num_input_tokens_seen": 270231995, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.26940918, + "step": 12529, + "time_per_iteration": 2.6813876628875732 + }, + { + "auxiliary_loss_clip": 0.01306175, + "auxiliary_loss_mlp": 0.00238553, + "balance_loss_clip": 1.06689787, + "balance_loss_mlp": 0.20939425, + "epoch": 0.7533443559296558, + "flos": 18078689145600.0, + "grad_norm": 8.903961227739362, + "language_loss": 0.86469889, + "learning_rate": 6.048457371107303e-07, + "loss": 0.88014615, + "num_input_tokens_seen": 270251480, + "router_z_loss_clip": 2.39257812, + "router_z_loss_mlp": 0.29174805, + "step": 12530, + "time_per_iteration": 2.638803243637085 + }, + { + "auxiliary_loss_clip": 0.01174664, + "auxiliary_loss_mlp": 0.00093409, + "balance_loss_clip": 1.02113891, + "balance_loss_mlp": 0.08382466, + "epoch": 0.7534044791823238, + "flos": 50254830766080.0, + "grad_norm": 1.2527930130117408, + "language_loss": 0.63276398, + "learning_rate": 6.045667104933612e-07, + "loss": 0.64544475, + "num_input_tokens_seen": 270306480, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.09570312, + "step": 12531, + "time_per_iteration": 3.053140163421631 + }, + { + "auxiliary_loss_clip": 0.01299357, + "auxiliary_loss_mlp": 0.00239048, + "balance_loss_clip": 1.06436253, + "balance_loss_mlp": 0.21267864, + "epoch": 0.7534646024349917, + "flos": 20850274471680.0, + "grad_norm": 61.40977783520651, + "language_loss": 0.80663502, + "learning_rate": 6.042877367909633e-07, + "loss": 0.8220191, + "num_input_tokens_seen": 270324595, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.2635498, + "step": 12532, + "time_per_iteration": 2.660615921020508 + }, + { + "auxiliary_loss_clip": 0.01253328, + "auxiliary_loss_mlp": 0.00255399, + "balance_loss_clip": 1.03461874, + "balance_loss_mlp": 0.23083034, + "epoch": 0.7535247256876597, + "flos": 23071779941760.0, + "grad_norm": 89.48599880176604, + "language_loss": 0.82729959, + "learning_rate": 6.040088160141132e-07, + "loss": 0.84238684, + "num_input_tokens_seen": 270344375, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.24572754, + "step": 12533, + "time_per_iteration": 2.645151376724243 + }, + { + "auxiliary_loss_clip": 0.011794, + "auxiliary_loss_mlp": 0.00144149, + "balance_loss_clip": 1.02438033, + "balance_loss_mlp": 0.13308619, + "epoch": 0.7535848489403276, + "flos": 58623418252800.0, + "grad_norm": 52.961888828116145, + "language_loss": 0.56878179, + "learning_rate": 6.037299481733886e-07, + "loss": 0.58201724, + "num_input_tokens_seen": 270405235, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.11083984, + "step": 12534, + "time_per_iteration": 3.174124002456665 + }, + { + "auxiliary_loss_clip": 0.01275219, + "auxiliary_loss_mlp": 0.00265573, + "balance_loss_clip": 1.04858744, + "balance_loss_mlp": 0.23858392, + "epoch": 0.7536449721929956, + "flos": 26577882483840.0, + "grad_norm": 14.209548779578869, + "language_loss": 0.78306687, + "learning_rate": 6.03451133279365e-07, + "loss": 0.79847479, + "num_input_tokens_seen": 270425820, + "router_z_loss_clip": 2.265625, + "router_z_loss_mlp": 0.27026367, + "step": 12535, + "time_per_iteration": 2.6726419925689697 + }, + { + "auxiliary_loss_clip": 0.01293014, + "auxiliary_loss_mlp": 0.0026946, + "balance_loss_clip": 1.0580883, + "balance_loss_mlp": 0.24109977, + "epoch": 0.7537050954456637, + "flos": 25735992537600.0, + "grad_norm": 2.508772958061081, + "language_loss": 0.87550139, + "learning_rate": 6.031723713426135e-07, + "loss": 0.8911261, + "num_input_tokens_seen": 270447120, + "router_z_loss_clip": 2.34960938, + "router_z_loss_mlp": 0.2833252, + "step": 12536, + "time_per_iteration": 2.780121088027954 + }, + { + "auxiliary_loss_clip": 0.01300985, + "auxiliary_loss_mlp": 0.00300677, + "balance_loss_clip": 1.06703949, + "balance_loss_mlp": 0.27101731, + "epoch": 0.7537652186983316, + "flos": 30224431203840.0, + "grad_norm": 17.30369915258467, + "language_loss": 0.8193472, + "learning_rate": 6.028936623737067e-07, + "loss": 0.83536386, + "num_input_tokens_seen": 270468680, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.29675293, + "step": 12537, + "time_per_iteration": 2.735922336578369 + }, + { + "auxiliary_loss_clip": 0.0129252, + "auxiliary_loss_mlp": 0.00285615, + "balance_loss_clip": 1.06010818, + "balance_loss_mlp": 0.25661093, + "epoch": 0.7538253419509996, + "flos": 12641239198080.0, + "grad_norm": 39.5538844123437, + "language_loss": 0.82553971, + "learning_rate": 6.026150063832111e-07, + "loss": 0.84132099, + "num_input_tokens_seen": 270486310, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.29003906, + "step": 12538, + "time_per_iteration": 2.644432306289673 + }, + { + "auxiliary_loss_clip": 0.0129547, + "auxiliary_loss_mlp": 0.00261515, + "balance_loss_clip": 1.06125176, + "balance_loss_mlp": 0.23253497, + "epoch": 0.7538854652036675, + "flos": 23185976256000.0, + "grad_norm": 4.6002542754149, + "language_loss": 0.75051856, + "learning_rate": 6.023364033816956e-07, + "loss": 0.76608843, + "num_input_tokens_seen": 270507210, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.29016113, + "step": 12539, + "time_per_iteration": 2.6887168884277344 + }, + { + "auxiliary_loss_clip": 0.01260781, + "auxiliary_loss_mlp": 0.00260483, + "balance_loss_clip": 1.0433532, + "balance_loss_mlp": 0.23518707, + "epoch": 0.7539455884563355, + "flos": 23186227651200.0, + "grad_norm": 52.010915548921965, + "language_loss": 0.82014805, + "learning_rate": 6.020578533797229e-07, + "loss": 0.83536065, + "num_input_tokens_seen": 270525250, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.25292969, + "step": 12540, + "time_per_iteration": 2.729484796524048 + }, + { + "auxiliary_loss_clip": 0.01282964, + "auxiliary_loss_mlp": 0.00247675, + "balance_loss_clip": 1.05145824, + "balance_loss_mlp": 0.21865904, + "epoch": 0.7540057117090034, + "flos": 13181155505280.0, + "grad_norm": 56.20360401116509, + "language_loss": 0.83856404, + "learning_rate": 6.017793563878566e-07, + "loss": 0.85387045, + "num_input_tokens_seen": 270539295, + "router_z_loss_clip": 2.31835938, + "router_z_loss_mlp": 0.29052734, + "step": 12541, + "time_per_iteration": 2.6377341747283936 + }, + { + "auxiliary_loss_clip": 0.01262733, + "auxiliary_loss_mlp": 0.00259378, + "balance_loss_clip": 1.03940952, + "balance_loss_mlp": 0.23308066, + "epoch": 0.7540658349616715, + "flos": 45478134478080.0, + "grad_norm": 3.6197868505049327, + "language_loss": 0.80541992, + "learning_rate": 6.015009124166576e-07, + "loss": 0.82064098, + "num_input_tokens_seen": 270562815, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.26306152, + "step": 12542, + "time_per_iteration": 2.924018383026123 + }, + { + "auxiliary_loss_clip": 0.01285554, + "auxiliary_loss_mlp": 0.00284719, + "balance_loss_clip": 1.05707562, + "balance_loss_mlp": 0.25830221, + "epoch": 0.7541259582143394, + "flos": 19930817105280.0, + "grad_norm": 18.167900916337516, + "language_loss": 0.91760945, + "learning_rate": 6.012225214766844e-07, + "loss": 0.93331218, + "num_input_tokens_seen": 270579055, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.26391602, + "step": 12543, + "time_per_iteration": 2.633511781692505 + }, + { + "auxiliary_loss_clip": 0.01290186, + "auxiliary_loss_mlp": 0.00260785, + "balance_loss_clip": 1.06228173, + "balance_loss_mlp": 0.23575103, + "epoch": 0.7541860814670074, + "flos": 27198239299200.0, + "grad_norm": 3.4645240274852496, + "language_loss": 0.82237589, + "learning_rate": 6.009441835784927e-07, + "loss": 0.83788568, + "num_input_tokens_seen": 270599080, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.25036621, + "step": 12544, + "time_per_iteration": 2.701096296310425 + }, + { + "auxiliary_loss_clip": 0.0125423, + "auxiliary_loss_mlp": 0.00262276, + "balance_loss_clip": 1.03582418, + "balance_loss_mlp": 0.23559725, + "epoch": 0.7542462047196753, + "flos": 21324151624320.0, + "grad_norm": 7.108094156774898, + "language_loss": 0.76123655, + "learning_rate": 6.006658987326383e-07, + "loss": 0.77640164, + "num_input_tokens_seen": 270618715, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.26672363, + "step": 12545, + "time_per_iteration": 2.7560884952545166 + }, + { + "auxiliary_loss_clip": 0.01278962, + "auxiliary_loss_mlp": 0.00253756, + "balance_loss_clip": 1.05050683, + "balance_loss_mlp": 0.2282453, + "epoch": 0.7543063279723433, + "flos": 11940944664960.0, + "grad_norm": 27.469529816908917, + "language_loss": 0.78644276, + "learning_rate": 6.003876669496728e-07, + "loss": 0.80176985, + "num_input_tokens_seen": 270635695, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.25524902, + "step": 12546, + "time_per_iteration": 2.688429117202759 + }, + { + "auxiliary_loss_clip": 0.01288387, + "auxiliary_loss_mlp": 0.00265212, + "balance_loss_clip": 1.06065989, + "balance_loss_mlp": 0.23763841, + "epoch": 0.7543664512250112, + "flos": 22819974624000.0, + "grad_norm": 12.208186040597882, + "language_loss": 0.83310521, + "learning_rate": 6.00109488240147e-07, + "loss": 0.84864116, + "num_input_tokens_seen": 270654325, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.27587891, + "step": 12547, + "time_per_iteration": 2.7195005416870117 + }, + { + "auxiliary_loss_clip": 0.01277849, + "auxiliary_loss_mlp": 0.00279761, + "balance_loss_clip": 1.04689026, + "balance_loss_mlp": 0.25060257, + "epoch": 0.7544265744776792, + "flos": 20923855482240.0, + "grad_norm": 22.016145584227246, + "language_loss": 0.76250511, + "learning_rate": 5.998313626146099e-07, + "loss": 0.77808118, + "num_input_tokens_seen": 270674260, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.29162598, + "step": 12548, + "time_per_iteration": 2.703922748565674 + }, + { + "auxiliary_loss_clip": 0.01263001, + "auxiliary_loss_mlp": 0.00253722, + "balance_loss_clip": 1.04117882, + "balance_loss_mlp": 0.22883126, + "epoch": 0.7544866977303473, + "flos": 15195493284480.0, + "grad_norm": 14.19015353989516, + "language_loss": 0.94910872, + "learning_rate": 5.995532900836088e-07, + "loss": 0.9642759, + "num_input_tokens_seen": 270692200, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.24902344, + "step": 12549, + "time_per_iteration": 2.662792921066284 + }, + { + "auxiliary_loss_clip": 0.01258432, + "auxiliary_loss_mlp": 0.00291913, + "balance_loss_clip": 1.03951645, + "balance_loss_mlp": 0.26616424, + "epoch": 0.7545468209830152, + "flos": 27083683848960.0, + "grad_norm": 10.842840776974802, + "language_loss": 0.83559668, + "learning_rate": 5.992752706576865e-07, + "loss": 0.85110009, + "num_input_tokens_seen": 270709675, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.25744629, + "step": 12550, + "time_per_iteration": 2.6908328533172607 + }, + { + "auxiliary_loss_clip": 0.01280867, + "auxiliary_loss_mlp": 0.00265079, + "balance_loss_clip": 1.05433667, + "balance_loss_mlp": 0.24074849, + "epoch": 0.7546069442356832, + "flos": 26871703735680.0, + "grad_norm": 2.927849313669267, + "language_loss": 0.74807966, + "learning_rate": 5.98997304347386e-07, + "loss": 0.76353908, + "num_input_tokens_seen": 270733055, + "router_z_loss_clip": 2.265625, + "router_z_loss_mlp": 0.24328613, + "step": 12551, + "time_per_iteration": 2.7266247272491455 + }, + { + "auxiliary_loss_clip": 0.01267254, + "auxiliary_loss_mlp": 0.00279397, + "balance_loss_clip": 1.04925203, + "balance_loss_mlp": 0.25394559, + "epoch": 0.7546670674883511, + "flos": 15743131015680.0, + "grad_norm": 4.09233882679482, + "language_loss": 0.93833673, + "learning_rate": 5.987193911632487e-07, + "loss": 0.9538033, + "num_input_tokens_seen": 270749275, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.25463867, + "step": 12552, + "time_per_iteration": 2.6632258892059326 + }, + { + "auxiliary_loss_clip": 0.0127739, + "auxiliary_loss_mlp": 0.00273798, + "balance_loss_clip": 1.05166101, + "balance_loss_mlp": 0.24446087, + "epoch": 0.7547271907410191, + "flos": 23477714519040.0, + "grad_norm": 7.322244541135616, + "language_loss": 0.8736338, + "learning_rate": 5.98441531115812e-07, + "loss": 0.88914573, + "num_input_tokens_seen": 270768230, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.29333496, + "step": 12553, + "time_per_iteration": 4.13367772102356 + }, + { + "auxiliary_loss_clip": 0.01278944, + "auxiliary_loss_mlp": 0.00280686, + "balance_loss_clip": 1.05279756, + "balance_loss_mlp": 0.25291035, + "epoch": 0.754787313993687, + "flos": 31722804069120.0, + "grad_norm": 8.549356686428661, + "language_loss": 0.71913821, + "learning_rate": 5.981637242156135e-07, + "loss": 0.73473454, + "num_input_tokens_seen": 270786285, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.2779541, + "step": 12554, + "time_per_iteration": 4.186604022979736 + }, + { + "auxiliary_loss_clip": 0.01268286, + "auxiliary_loss_mlp": 0.00249743, + "balance_loss_clip": 1.04278708, + "balance_loss_mlp": 0.22313583, + "epoch": 0.7548474372463551, + "flos": 27563055782400.0, + "grad_norm": 7.407070774084945, + "language_loss": 0.79990304, + "learning_rate": 5.978859704731864e-07, + "loss": 0.81508338, + "num_input_tokens_seen": 270805505, + "router_z_loss_clip": 2.25097656, + "router_z_loss_mlp": 0.26599121, + "step": 12555, + "time_per_iteration": 2.7251288890838623 + }, + { + "auxiliary_loss_clip": 0.01294224, + "auxiliary_loss_mlp": 0.00303189, + "balance_loss_clip": 1.06109297, + "balance_loss_mlp": 0.27479362, + "epoch": 0.754907560499023, + "flos": 19318576763520.0, + "grad_norm": 6.53011522799755, + "language_loss": 0.86766225, + "learning_rate": 5.976082698990645e-07, + "loss": 0.88363642, + "num_input_tokens_seen": 270824610, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.28405762, + "step": 12556, + "time_per_iteration": 2.6453497409820557 + }, + { + "auxiliary_loss_clip": 0.0113702, + "auxiliary_loss_mlp": 0.00119119, + "balance_loss_clip": 0.98849726, + "balance_loss_mlp": 0.10939135, + "epoch": 0.754967683751691, + "flos": 69744628684800.0, + "grad_norm": 0.6889693573164366, + "language_loss": 0.49941641, + "learning_rate": 5.973306225037769e-07, + "loss": 0.51197779, + "num_input_tokens_seen": 270886155, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.09716797, + "step": 12557, + "time_per_iteration": 3.1473546028137207 + }, + { + "auxiliary_loss_clip": 0.01271145, + "auxiliary_loss_mlp": 0.00276877, + "balance_loss_clip": 1.04742861, + "balance_loss_mlp": 0.25011468, + "epoch": 0.7550278070043589, + "flos": 24421913377920.0, + "grad_norm": 100.52683122455679, + "language_loss": 0.78067625, + "learning_rate": 5.970530282978525e-07, + "loss": 0.79615647, + "num_input_tokens_seen": 270905325, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.2677002, + "step": 12558, + "time_per_iteration": 4.23699688911438 + }, + { + "auxiliary_loss_clip": 0.01249345, + "auxiliary_loss_mlp": 0.00276625, + "balance_loss_clip": 1.02962708, + "balance_loss_mlp": 0.25090003, + "epoch": 0.7550879302570269, + "flos": 32634611838720.0, + "grad_norm": 25.711680231316258, + "language_loss": 0.86599177, + "learning_rate": 5.967754872918187e-07, + "loss": 0.88125145, + "num_input_tokens_seen": 270927535, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.25756836, + "step": 12559, + "time_per_iteration": 2.7501254081726074 + }, + { + "auxiliary_loss_clip": 0.01265397, + "auxiliary_loss_mlp": 0.00294187, + "balance_loss_clip": 1.04664207, + "balance_loss_mlp": 0.26736513, + "epoch": 0.7551480535096948, + "flos": 21795550738560.0, + "grad_norm": 166.87885934552375, + "language_loss": 0.84781212, + "learning_rate": 5.96497999496199e-07, + "loss": 0.86340791, + "num_input_tokens_seen": 270946920, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.26843262, + "step": 12560, + "time_per_iteration": 2.7232937812805176 + }, + { + "auxiliary_loss_clip": 0.01256736, + "auxiliary_loss_mlp": 0.00243705, + "balance_loss_clip": 1.03952837, + "balance_loss_mlp": 0.21906416, + "epoch": 0.7552081767623628, + "flos": 18515111391360.0, + "grad_norm": 20.19498346912948, + "language_loss": 0.78798455, + "learning_rate": 5.96220564921515e-07, + "loss": 0.80298895, + "num_input_tokens_seen": 270965705, + "router_z_loss_clip": 2.17089844, + "router_z_loss_mlp": 0.24633789, + "step": 12561, + "time_per_iteration": 2.636481285095215 + }, + { + "auxiliary_loss_clip": 0.01268747, + "auxiliary_loss_mlp": 0.00282554, + "balance_loss_clip": 1.04913831, + "balance_loss_mlp": 0.25648266, + "epoch": 0.7552683000150308, + "flos": 27634805199360.0, + "grad_norm": 5.241544416770227, + "language_loss": 0.81670702, + "learning_rate": 5.959431835782889e-07, + "loss": 0.83222008, + "num_input_tokens_seen": 270986550, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.26049805, + "step": 12562, + "time_per_iteration": 2.724472761154175 + }, + { + "auxiliary_loss_clip": 0.01266133, + "auxiliary_loss_mlp": 0.00286783, + "balance_loss_clip": 1.04428625, + "balance_loss_mlp": 0.26030686, + "epoch": 0.7553284232676988, + "flos": 20302924049280.0, + "grad_norm": 6.871216483248377, + "language_loss": 0.83931816, + "learning_rate": 5.956658554770371e-07, + "loss": 0.85484731, + "num_input_tokens_seen": 271006250, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.26489258, + "step": 12563, + "time_per_iteration": 4.192183494567871 + }, + { + "auxiliary_loss_clip": 0.01323538, + "auxiliary_loss_mlp": 0.00302228, + "balance_loss_clip": 1.08272994, + "balance_loss_mlp": 0.27297398, + "epoch": 0.7553885465203668, + "flos": 33255471444480.0, + "grad_norm": 45.04692056439169, + "language_loss": 0.80006814, + "learning_rate": 5.953885806282768e-07, + "loss": 0.81632578, + "num_input_tokens_seen": 271025575, + "router_z_loss_clip": 2.41015625, + "router_z_loss_mlp": 0.29272461, + "step": 12564, + "time_per_iteration": 2.8034846782684326 + }, + { + "auxiliary_loss_clip": 0.01287776, + "auxiliary_loss_mlp": 0.00247257, + "balance_loss_clip": 1.05433059, + "balance_loss_mlp": 0.22119831, + "epoch": 0.7554486697730347, + "flos": 21616249023360.0, + "grad_norm": 76.28864582420485, + "language_loss": 0.75403601, + "learning_rate": 5.951113590425228e-07, + "loss": 0.76938629, + "num_input_tokens_seen": 271045805, + "router_z_loss_clip": 2.3359375, + "router_z_loss_mlp": 0.26062012, + "step": 12565, + "time_per_iteration": 2.7463457584381104 + }, + { + "auxiliary_loss_clip": 0.01284181, + "auxiliary_loss_mlp": 0.00292228, + "balance_loss_clip": 1.05259383, + "balance_loss_mlp": 0.26321211, + "epoch": 0.7555087930257027, + "flos": 27632973605760.0, + "grad_norm": 5.648656447087321, + "language_loss": 0.81027186, + "learning_rate": 5.94834190730287e-07, + "loss": 0.82603592, + "num_input_tokens_seen": 271066065, + "router_z_loss_clip": 2.31835938, + "router_z_loss_mlp": 0.2902832, + "step": 12566, + "time_per_iteration": 2.7320828437805176 + }, + { + "auxiliary_loss_clip": 0.01306097, + "auxiliary_loss_mlp": 0.00274594, + "balance_loss_clip": 1.07022619, + "balance_loss_mlp": 0.24513784, + "epoch": 0.7555689162783706, + "flos": 23621644316160.0, + "grad_norm": 22.477733111148375, + "language_loss": 0.83153403, + "learning_rate": 5.945570757020789e-07, + "loss": 0.84734094, + "num_input_tokens_seen": 271085870, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.29455566, + "step": 12567, + "time_per_iteration": 2.6708714962005615 + }, + { + "auxiliary_loss_clip": 0.01263047, + "auxiliary_loss_mlp": 0.00288346, + "balance_loss_clip": 1.0424459, + "balance_loss_mlp": 0.26190504, + "epoch": 0.7556290395310387, + "flos": 24863076218880.0, + "grad_norm": 3.490698674764972, + "language_loss": 0.72501671, + "learning_rate": 5.942800139684073e-07, + "loss": 0.74053073, + "num_input_tokens_seen": 271104260, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.26452637, + "step": 12568, + "time_per_iteration": 2.6525750160217285 + }, + { + "auxiliary_loss_clip": 0.01272095, + "auxiliary_loss_mlp": 0.00275494, + "balance_loss_clip": 1.04968023, + "balance_loss_mlp": 0.24933949, + "epoch": 0.7556891627837066, + "flos": 43543770330240.0, + "grad_norm": 638.7975334153308, + "language_loss": 0.74204254, + "learning_rate": 5.940030055397789e-07, + "loss": 0.75751841, + "num_input_tokens_seen": 271125745, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.26147461, + "step": 12569, + "time_per_iteration": 2.8404653072357178 + }, + { + "auxiliary_loss_clip": 0.0131165, + "auxiliary_loss_mlp": 0.0027282, + "balance_loss_clip": 1.06767452, + "balance_loss_mlp": 0.24448434, + "epoch": 0.7557492860363746, + "flos": 26650924790400.0, + "grad_norm": 238.6576073557996, + "language_loss": 0.76253539, + "learning_rate": 5.93726050426697e-07, + "loss": 0.7783801, + "num_input_tokens_seen": 271147145, + "router_z_loss_clip": 2.44140625, + "router_z_loss_mlp": 0.2833252, + "step": 12570, + "time_per_iteration": 2.75126576423645 + }, + { + "auxiliary_loss_clip": 0.01291792, + "auxiliary_loss_mlp": 0.00258513, + "balance_loss_clip": 1.05851102, + "balance_loss_mlp": 0.2297121, + "epoch": 0.7558094092890425, + "flos": 55182885010560.0, + "grad_norm": 4.3016094254758075, + "language_loss": 0.80007285, + "learning_rate": 5.934491486396647e-07, + "loss": 0.8155759, + "num_input_tokens_seen": 271170865, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.28796387, + "step": 12571, + "time_per_iteration": 2.928222179412842 + }, + { + "auxiliary_loss_clip": 0.01293595, + "auxiliary_loss_mlp": 0.00262335, + "balance_loss_clip": 1.05892646, + "balance_loss_mlp": 0.23658538, + "epoch": 0.7558695325417105, + "flos": 23988292392960.0, + "grad_norm": 23.0514948782356, + "language_loss": 0.83836651, + "learning_rate": 5.931723001891811e-07, + "loss": 0.85392576, + "num_input_tokens_seen": 271191450, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.25744629, + "step": 12572, + "time_per_iteration": 2.696108341217041 + }, + { + "auxiliary_loss_clip": 0.01296738, + "auxiliary_loss_mlp": 0.00257288, + "balance_loss_clip": 1.06761456, + "balance_loss_mlp": 0.23242137, + "epoch": 0.7559296557943784, + "flos": 14611262572800.0, + "grad_norm": 14.960785294166346, + "language_loss": 0.83309734, + "learning_rate": 5.928955050857456e-07, + "loss": 0.84863764, + "num_input_tokens_seen": 271207335, + "router_z_loss_clip": 2.29101562, + "router_z_loss_mlp": 0.24890137, + "step": 12573, + "time_per_iteration": 2.650101900100708 + }, + { + "auxiliary_loss_clip": 0.01280719, + "auxiliary_loss_mlp": 0.00267023, + "balance_loss_clip": 1.05063748, + "balance_loss_mlp": 0.24072564, + "epoch": 0.7559897790470465, + "flos": 18550483309440.0, + "grad_norm": 3.5564087991668676, + "language_loss": 0.75676733, + "learning_rate": 5.926187633398527e-07, + "loss": 0.77224475, + "num_input_tokens_seen": 271226895, + "router_z_loss_clip": 2.30078125, + "router_z_loss_mlp": 0.26318359, + "step": 12574, + "time_per_iteration": 2.8007190227508545 + }, + { + "auxiliary_loss_clip": 0.01278192, + "auxiliary_loss_mlp": 0.00268934, + "balance_loss_clip": 1.05106759, + "balance_loss_mlp": 0.24230258, + "epoch": 0.7560499022997144, + "flos": 17967868709760.0, + "grad_norm": 39.15720658549678, + "language_loss": 0.81326264, + "learning_rate": 5.923420749619974e-07, + "loss": 0.82873386, + "num_input_tokens_seen": 271244375, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.26623535, + "step": 12575, + "time_per_iteration": 2.624263048171997 + }, + { + "auxiliary_loss_clip": 0.01268369, + "auxiliary_loss_mlp": 0.00268048, + "balance_loss_clip": 1.04317641, + "balance_loss_mlp": 0.24129739, + "epoch": 0.7561100255523824, + "flos": 15737815802880.0, + "grad_norm": 502.9876403381837, + "language_loss": 0.80762756, + "learning_rate": 5.92065439962673e-07, + "loss": 0.82299173, + "num_input_tokens_seen": 271259530, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.26757812, + "step": 12576, + "time_per_iteration": 2.696828603744507 + }, + { + "auxiliary_loss_clip": 0.01278936, + "auxiliary_loss_mlp": 0.00243727, + "balance_loss_clip": 1.05451381, + "balance_loss_mlp": 0.21866971, + "epoch": 0.7561701488050504, + "flos": 15888102307200.0, + "grad_norm": 34.68879549866784, + "language_loss": 0.75470036, + "learning_rate": 5.917888583523669e-07, + "loss": 0.76992702, + "num_input_tokens_seen": 271276835, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.25048828, + "step": 12577, + "time_per_iteration": 2.6409530639648438 + }, + { + "auxiliary_loss_clip": 0.01269531, + "auxiliary_loss_mlp": 0.00290093, + "balance_loss_clip": 1.04831672, + "balance_loss_mlp": 0.26355749, + "epoch": 0.7562302720577183, + "flos": 20339157893760.0, + "grad_norm": 4.635938863816431, + "language_loss": 0.84905559, + "learning_rate": 5.915123301415685e-07, + "loss": 0.8646518, + "num_input_tokens_seen": 271296275, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.26525879, + "step": 12578, + "time_per_iteration": 2.7034356594085693 + }, + { + "auxiliary_loss_clip": 0.01265784, + "auxiliary_loss_mlp": 0.00270896, + "balance_loss_clip": 1.04065955, + "balance_loss_mlp": 0.24381204, + "epoch": 0.7562903953103863, + "flos": 20812209033600.0, + "grad_norm": 39.136947299815056, + "language_loss": 0.82099438, + "learning_rate": 5.912358553407641e-07, + "loss": 0.83636117, + "num_input_tokens_seen": 271315685, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.27099609, + "step": 12579, + "time_per_iteration": 2.675410747528076 + }, + { + "auxiliary_loss_clip": 0.0128751, + "auxiliary_loss_mlp": 0.00246635, + "balance_loss_clip": 1.05344439, + "balance_loss_mlp": 0.21865697, + "epoch": 0.7563505185630542, + "flos": 37596999484800.0, + "grad_norm": 37.30202361841782, + "language_loss": 0.73043334, + "learning_rate": 5.90959433960437e-07, + "loss": 0.74577475, + "num_input_tokens_seen": 271336790, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.27966309, + "step": 12580, + "time_per_iteration": 2.8137407302856445 + }, + { + "auxiliary_loss_clip": 0.01284534, + "auxiliary_loss_mlp": 0.00269217, + "balance_loss_clip": 1.05865526, + "balance_loss_mlp": 0.2422995, + "epoch": 0.7564106418157223, + "flos": 20230995064320.0, + "grad_norm": 4.968352436654494, + "language_loss": 0.82333988, + "learning_rate": 5.906830660110691e-07, + "loss": 0.83887738, + "num_input_tokens_seen": 271355470, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.26904297, + "step": 12581, + "time_per_iteration": 2.6574835777282715 + }, + { + "auxiliary_loss_clip": 0.01283779, + "auxiliary_loss_mlp": 0.0026938, + "balance_loss_clip": 1.05549562, + "balance_loss_mlp": 0.24205756, + "epoch": 0.7564707650683902, + "flos": 24754877475840.0, + "grad_norm": 57.20042768766058, + "language_loss": 0.70908368, + "learning_rate": 5.904067515031412e-07, + "loss": 0.72461528, + "num_input_tokens_seen": 271375810, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.27331543, + "step": 12582, + "time_per_iteration": 2.705822706222534 + }, + { + "auxiliary_loss_clip": 0.01148343, + "auxiliary_loss_mlp": 0.00099745, + "balance_loss_clip": 0.99493361, + "balance_loss_mlp": 0.08963581, + "epoch": 0.7565308883210582, + "flos": 48530076433920.0, + "grad_norm": 0.9784831049619438, + "language_loss": 0.59936279, + "learning_rate": 5.901304904471307e-07, + "loss": 0.61184365, + "num_input_tokens_seen": 271424775, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.10107422, + "step": 12583, + "time_per_iteration": 2.937648057937622 + }, + { + "auxiliary_loss_clip": 0.01277807, + "auxiliary_loss_mlp": 0.00256743, + "balance_loss_clip": 1.05547667, + "balance_loss_mlp": 0.23102966, + "epoch": 0.7565910115737261, + "flos": 12495082757760.0, + "grad_norm": 39.040208325157366, + "language_loss": 0.86805999, + "learning_rate": 5.898542828535125e-07, + "loss": 0.88340545, + "num_input_tokens_seen": 271440500, + "router_z_loss_clip": 2.22167969, + "router_z_loss_mlp": 0.25732422, + "step": 12584, + "time_per_iteration": 2.63339900970459 + }, + { + "auxiliary_loss_clip": 0.01273121, + "auxiliary_loss_mlp": 0.002681, + "balance_loss_clip": 1.05493593, + "balance_loss_mlp": 0.24353062, + "epoch": 0.7566511348263941, + "flos": 21173003193600.0, + "grad_norm": 55.50629824096023, + "language_loss": 0.83621335, + "learning_rate": 5.895781287327612e-07, + "loss": 0.85162556, + "num_input_tokens_seen": 271458180, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.24572754, + "step": 12585, + "time_per_iteration": 2.735262870788574 + }, + { + "auxiliary_loss_clip": 0.0128889, + "auxiliary_loss_mlp": 0.00269298, + "balance_loss_clip": 1.06136286, + "balance_loss_mlp": 0.24091466, + "epoch": 0.756711258079062, + "flos": 21754827694080.0, + "grad_norm": 3.0701566017468713, + "language_loss": 0.91759735, + "learning_rate": 5.893020280953493e-07, + "loss": 0.93317926, + "num_input_tokens_seen": 271475730, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.28369141, + "step": 12586, + "time_per_iteration": 2.6567022800445557 + }, + { + "auxiliary_loss_clip": 0.01282714, + "auxiliary_loss_mlp": 0.00264983, + "balance_loss_clip": 1.0556401, + "balance_loss_mlp": 0.23861401, + "epoch": 0.75677138133173, + "flos": 22382905933440.0, + "grad_norm": 38.83871699993469, + "language_loss": 0.90284902, + "learning_rate": 5.890259809517459e-07, + "loss": 0.91832602, + "num_input_tokens_seen": 271495030, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.2635498, + "step": 12587, + "time_per_iteration": 2.725794792175293 + }, + { + "auxiliary_loss_clip": 0.01268969, + "auxiliary_loss_mlp": 0.00262621, + "balance_loss_clip": 1.04268456, + "balance_loss_mlp": 0.23619233, + "epoch": 0.756831504584398, + "flos": 22708974620160.0, + "grad_norm": 33.240489860849316, + "language_loss": 0.77507627, + "learning_rate": 5.88749987312418e-07, + "loss": 0.79039228, + "num_input_tokens_seen": 271515355, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.2644043, + "step": 12588, + "time_per_iteration": 2.6744072437286377 + }, + { + "auxiliary_loss_clip": 0.01274744, + "auxiliary_loss_mlp": 0.00277852, + "balance_loss_clip": 1.04738772, + "balance_loss_mlp": 0.24927762, + "epoch": 0.756891627837066, + "flos": 24098358643200.0, + "grad_norm": 57.770339864760835, + "language_loss": 0.77647471, + "learning_rate": 5.884740471878327e-07, + "loss": 0.79200065, + "num_input_tokens_seen": 271535090, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.2857666, + "step": 12589, + "time_per_iteration": 2.7360613346099854 + }, + { + "auxiliary_loss_clip": 0.01256179, + "auxiliary_loss_mlp": 0.00252045, + "balance_loss_clip": 1.0351932, + "balance_loss_mlp": 0.22778553, + "epoch": 0.756951751089734, + "flos": 19749001438080.0, + "grad_norm": 7.0240027113802315, + "language_loss": 0.98409581, + "learning_rate": 5.881981605884522e-07, + "loss": 0.99917805, + "num_input_tokens_seen": 271551075, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.24267578, + "step": 12590, + "time_per_iteration": 2.687446355819702 + }, + { + "auxiliary_loss_clip": 0.01256725, + "auxiliary_loss_mlp": 0.00246167, + "balance_loss_clip": 1.03906274, + "balance_loss_mlp": 0.22288562, + "epoch": 0.7570118743424019, + "flos": 35079266551680.0, + "grad_norm": 6.521065267318732, + "language_loss": 0.73348808, + "learning_rate": 5.879223275247391e-07, + "loss": 0.74851692, + "num_input_tokens_seen": 271571035, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.2331543, + "step": 12591, + "time_per_iteration": 2.81270432472229 + }, + { + "auxiliary_loss_clip": 0.01289671, + "auxiliary_loss_mlp": 0.0026128, + "balance_loss_clip": 1.0650785, + "balance_loss_mlp": 0.23719969, + "epoch": 0.7570719975950699, + "flos": 25594540778880.0, + "grad_norm": 23.680648251269407, + "language_loss": 0.82076234, + "learning_rate": 5.876465480071528e-07, + "loss": 0.83627188, + "num_input_tokens_seen": 271592950, + "router_z_loss_clip": 2.24707031, + "router_z_loss_mlp": 0.24084473, + "step": 12592, + "time_per_iteration": 2.7410266399383545 + }, + { + "auxiliary_loss_clip": 0.01272408, + "auxiliary_loss_mlp": 0.00268506, + "balance_loss_clip": 1.0492835, + "balance_loss_mlp": 0.24286392, + "epoch": 0.7571321208477378, + "flos": 10816223028480.0, + "grad_norm": 32.371427686539455, + "language_loss": 0.79320997, + "learning_rate": 5.873708220461522e-07, + "loss": 0.80861914, + "num_input_tokens_seen": 271608835, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.25683594, + "step": 12593, + "time_per_iteration": 2.634935140609741 + }, + { + "auxiliary_loss_clip": 0.01299561, + "auxiliary_loss_mlp": 0.00295063, + "balance_loss_clip": 1.06778622, + "balance_loss_mlp": 0.26781133, + "epoch": 0.7571922441004059, + "flos": 18260109763200.0, + "grad_norm": 3.925951052265435, + "language_loss": 0.7572028, + "learning_rate": 5.870951496521903e-07, + "loss": 0.77314901, + "num_input_tokens_seen": 271627730, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.27270508, + "step": 12594, + "time_per_iteration": 2.6398978233337402 + }, + { + "auxiliary_loss_clip": 0.01293201, + "auxiliary_loss_mlp": 0.00268497, + "balance_loss_clip": 1.05990028, + "balance_loss_mlp": 0.24051866, + "epoch": 0.7572523673530738, + "flos": 22890502978560.0, + "grad_norm": 8.653490560480336, + "language_loss": 0.86076421, + "learning_rate": 5.86819530835722e-07, + "loss": 0.87638116, + "num_input_tokens_seen": 271646415, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.27978516, + "step": 12595, + "time_per_iteration": 4.041696786880493 + }, + { + "auxiliary_loss_clip": 0.01267362, + "auxiliary_loss_mlp": 0.0025428, + "balance_loss_clip": 1.04447174, + "balance_loss_mlp": 0.22828063, + "epoch": 0.7573124906057418, + "flos": 20996323171200.0, + "grad_norm": 2.2734712116758544, + "language_loss": 0.8071245, + "learning_rate": 5.865439656071993e-07, + "loss": 0.82234085, + "num_input_tokens_seen": 271666240, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.26000977, + "step": 12596, + "time_per_iteration": 2.695888042449951 + }, + { + "auxiliary_loss_clip": 0.01258592, + "auxiliary_loss_mlp": 0.00252769, + "balance_loss_clip": 1.03999555, + "balance_loss_mlp": 0.22989321, + "epoch": 0.7573726138584097, + "flos": 20886292834560.0, + "grad_norm": 3.257173489484663, + "language_loss": 0.87189436, + "learning_rate": 5.862684539770706e-07, + "loss": 0.88700801, + "num_input_tokens_seen": 271686370, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.22900391, + "step": 12597, + "time_per_iteration": 4.169264554977417 + }, + { + "auxiliary_loss_clip": 0.01304768, + "auxiliary_loss_mlp": 0.00258672, + "balance_loss_clip": 1.0709374, + "balance_loss_mlp": 0.23075327, + "epoch": 0.7574327371110777, + "flos": 24530507170560.0, + "grad_norm": 5.332323887170776, + "language_loss": 0.90126687, + "learning_rate": 5.859929959557835e-07, + "loss": 0.91690129, + "num_input_tokens_seen": 271705050, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.27929688, + "step": 12598, + "time_per_iteration": 2.6514058113098145 + }, + { + "auxiliary_loss_clip": 0.0125961, + "auxiliary_loss_mlp": 0.00239529, + "balance_loss_clip": 1.04087782, + "balance_loss_mlp": 0.21577027, + "epoch": 0.7574928603637456, + "flos": 23364523785600.0, + "grad_norm": 15.122121758953583, + "language_loss": 0.71768618, + "learning_rate": 5.857175915537845e-07, + "loss": 0.73267758, + "num_input_tokens_seen": 271724915, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.2376709, + "step": 12599, + "time_per_iteration": 2.658071756362915 + }, + { + "auxiliary_loss_clip": 0.01299192, + "auxiliary_loss_mlp": 0.0027053, + "balance_loss_clip": 1.06284833, + "balance_loss_mlp": 0.24304014, + "epoch": 0.7575529836164137, + "flos": 13516274419200.0, + "grad_norm": 1377.5974003012827, + "language_loss": 0.74537373, + "learning_rate": 5.854422407815161e-07, + "loss": 0.76107097, + "num_input_tokens_seen": 271742410, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.27478027, + "step": 12600, + "time_per_iteration": 4.143930196762085 + }, + { + "auxiliary_loss_clip": 0.01258436, + "auxiliary_loss_mlp": 0.00258371, + "balance_loss_clip": 1.04450476, + "balance_loss_mlp": 0.2339924, + "epoch": 0.7576131068690816, + "flos": 19646584784640.0, + "grad_norm": 19.00681050888721, + "language_loss": 0.72866392, + "learning_rate": 5.851669436494191e-07, + "loss": 0.74383199, + "num_input_tokens_seen": 271761425, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.24365234, + "step": 12601, + "time_per_iteration": 2.6707918643951416 + }, + { + "auxiliary_loss_clip": 0.01263784, + "auxiliary_loss_mlp": 0.00275171, + "balance_loss_clip": 1.04520547, + "balance_loss_mlp": 0.24921891, + "epoch": 0.7576732301217496, + "flos": 20048245643520.0, + "grad_norm": 6.362487913297703, + "language_loss": 0.75549436, + "learning_rate": 5.848917001679335e-07, + "loss": 0.77088392, + "num_input_tokens_seen": 271780875, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.25964355, + "step": 12602, + "time_per_iteration": 2.7143306732177734 + }, + { + "auxiliary_loss_clip": 0.01277474, + "auxiliary_loss_mlp": 0.00268849, + "balance_loss_clip": 1.04946852, + "balance_loss_mlp": 0.24189578, + "epoch": 0.7577333533744176, + "flos": 15377093470080.0, + "grad_norm": 120.4616137320142, + "language_loss": 0.78941083, + "learning_rate": 5.846165103474967e-07, + "loss": 0.80487406, + "num_input_tokens_seen": 271799490, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.26965332, + "step": 12603, + "time_per_iteration": 2.652600049972534 + }, + { + "auxiliary_loss_clip": 0.0125423, + "auxiliary_loss_mlp": 0.0026171, + "balance_loss_clip": 1.03539145, + "balance_loss_mlp": 0.23619951, + "epoch": 0.7577934766270855, + "flos": 17894862316800.0, + "grad_norm": 283.60226430667416, + "language_loss": 0.71249551, + "learning_rate": 5.843413741985439e-07, + "loss": 0.72765493, + "num_input_tokens_seen": 271817040, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.25524902, + "step": 12604, + "time_per_iteration": 2.6781163215637207 + }, + { + "auxiliary_loss_clip": 0.01270425, + "auxiliary_loss_mlp": 0.00250694, + "balance_loss_clip": 1.05086613, + "balance_loss_mlp": 0.22552937, + "epoch": 0.7578535998797535, + "flos": 21613770984960.0, + "grad_norm": 11.855018287092424, + "language_loss": 0.86560136, + "learning_rate": 5.840662917315076e-07, + "loss": 0.88081253, + "num_input_tokens_seen": 271835480, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.25158691, + "step": 12605, + "time_per_iteration": 4.124801158905029 + }, + { + "auxiliary_loss_clip": 0.01291651, + "auxiliary_loss_mlp": 0.00276388, + "balance_loss_clip": 1.05555749, + "balance_loss_mlp": 0.2490537, + "epoch": 0.7579137231324214, + "flos": 18478374756480.0, + "grad_norm": 67.2861050938665, + "language_loss": 0.8986423, + "learning_rate": 5.837912629568198e-07, + "loss": 0.91432261, + "num_input_tokens_seen": 271849835, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.2734375, + "step": 12606, + "time_per_iteration": 2.7105586528778076 + }, + { + "auxiliary_loss_clip": 0.01247936, + "auxiliary_loss_mlp": 0.00252308, + "balance_loss_clip": 1.03410411, + "balance_loss_mlp": 0.22797781, + "epoch": 0.7579738463850895, + "flos": 23255032152960.0, + "grad_norm": 1940.682417772098, + "language_loss": 0.77309573, + "learning_rate": 5.835162878849087e-07, + "loss": 0.7880981, + "num_input_tokens_seen": 271869560, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.2434082, + "step": 12607, + "time_per_iteration": 2.7186691761016846 + }, + { + "auxiliary_loss_clip": 0.01290886, + "auxiliary_loss_mlp": 0.00264587, + "balance_loss_clip": 1.06013417, + "balance_loss_mlp": 0.23802763, + "epoch": 0.7580339696377574, + "flos": 14027031861120.0, + "grad_norm": 357.2157857184072, + "language_loss": 0.83808541, + "learning_rate": 5.83241366526202e-07, + "loss": 0.85364014, + "num_input_tokens_seen": 271887950, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.26574707, + "step": 12608, + "time_per_iteration": 2.639857053756714 + }, + { + "auxiliary_loss_clip": 0.01260125, + "auxiliary_loss_mlp": 0.0026024, + "balance_loss_clip": 1.03633499, + "balance_loss_mlp": 0.23395406, + "epoch": 0.7580940928904254, + "flos": 25082777756160.0, + "grad_norm": 12.991315400472335, + "language_loss": 0.78760445, + "learning_rate": 5.829664988911245e-07, + "loss": 0.80280817, + "num_input_tokens_seen": 271907700, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.26281738, + "step": 12609, + "time_per_iteration": 2.7654175758361816 + }, + { + "auxiliary_loss_clip": 0.01286621, + "auxiliary_loss_mlp": 0.00266975, + "balance_loss_clip": 1.05747306, + "balance_loss_mlp": 0.23893689, + "epoch": 0.7581542161430933, + "flos": 23836425690240.0, + "grad_norm": 50.44850031646102, + "language_loss": 0.8809157, + "learning_rate": 5.826916849901007e-07, + "loss": 0.89645171, + "num_input_tokens_seen": 271926840, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.28051758, + "step": 12610, + "time_per_iteration": 2.6466822624206543 + }, + { + "auxiliary_loss_clip": 0.01300169, + "auxiliary_loss_mlp": 0.00251779, + "balance_loss_clip": 1.07140255, + "balance_loss_mlp": 0.22689992, + "epoch": 0.7582143393957613, + "flos": 22237000888320.0, + "grad_norm": 12.661808579729744, + "language_loss": 0.78265715, + "learning_rate": 5.824169248335488e-07, + "loss": 0.79817665, + "num_input_tokens_seen": 271946465, + "router_z_loss_clip": 2.29101562, + "router_z_loss_mlp": 0.2487793, + "step": 12611, + "time_per_iteration": 2.645425319671631 + }, + { + "auxiliary_loss_clip": 0.01276062, + "auxiliary_loss_mlp": 0.00283668, + "balance_loss_clip": 1.05458832, + "balance_loss_mlp": 0.25728744, + "epoch": 0.7582744626484292, + "flos": 21106389421440.0, + "grad_norm": 173.99691862823158, + "language_loss": 0.78785467, + "learning_rate": 5.821422184318893e-07, + "loss": 0.80345201, + "num_input_tokens_seen": 271967295, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.26367188, + "step": 12612, + "time_per_iteration": 2.6826963424682617 + }, + { + "auxiliary_loss_clip": 0.01284326, + "auxiliary_loss_mlp": 0.00270409, + "balance_loss_clip": 1.05636096, + "balance_loss_mlp": 0.2406545, + "epoch": 0.7583345859010973, + "flos": 24604770539520.0, + "grad_norm": 5.00513809428715, + "language_loss": 0.66623664, + "learning_rate": 5.818675657955397e-07, + "loss": 0.68178397, + "num_input_tokens_seen": 271987960, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.29760742, + "step": 12613, + "time_per_iteration": 2.756448268890381 + }, + { + "auxiliary_loss_clip": 0.01286203, + "auxiliary_loss_mlp": 0.00276426, + "balance_loss_clip": 1.05910778, + "balance_loss_mlp": 0.24918643, + "epoch": 0.7583947091537652, + "flos": 33546814657920.0, + "grad_norm": 20.448332639058837, + "language_loss": 0.67450863, + "learning_rate": 5.815929669349135e-07, + "loss": 0.69013494, + "num_input_tokens_seen": 272011780, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.2722168, + "step": 12614, + "time_per_iteration": 2.844142198562622 + }, + { + "auxiliary_loss_clip": 0.01286181, + "auxiliary_loss_mlp": 0.00250118, + "balance_loss_clip": 1.05913627, + "balance_loss_mlp": 0.22204378, + "epoch": 0.7584548324064332, + "flos": 20121000641280.0, + "grad_norm": 2.6642660521020476, + "language_loss": 0.82105446, + "learning_rate": 5.813184218604246e-07, + "loss": 0.83641744, + "num_input_tokens_seen": 272030825, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.28039551, + "step": 12615, + "time_per_iteration": 2.653377056121826 + }, + { + "auxiliary_loss_clip": 0.01155203, + "auxiliary_loss_mlp": 0.0014425, + "balance_loss_clip": 1.00228357, + "balance_loss_mlp": 0.13604794, + "epoch": 0.7585149556591012, + "flos": 70402584061440.0, + "grad_norm": 0.7947148579938159, + "language_loss": 0.66735852, + "learning_rate": 5.810439305824828e-07, + "loss": 0.68035305, + "num_input_tokens_seen": 272095825, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.08203125, + "step": 12616, + "time_per_iteration": 3.18774151802063 + }, + { + "auxiliary_loss_clip": 0.01289107, + "auxiliary_loss_mlp": 0.00272185, + "balance_loss_clip": 1.06086469, + "balance_loss_mlp": 0.24519598, + "epoch": 0.7585750789117691, + "flos": 16143786293760.0, + "grad_norm": 12.831242012564738, + "language_loss": 0.93436027, + "learning_rate": 5.807694931114979e-07, + "loss": 0.94997311, + "num_input_tokens_seen": 272113950, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.26989746, + "step": 12617, + "time_per_iteration": 2.691293954849243 + }, + { + "auxiliary_loss_clip": 0.01287759, + "auxiliary_loss_mlp": 0.00263153, + "balance_loss_clip": 1.05862057, + "balance_loss_mlp": 0.23768964, + "epoch": 0.7586352021644371, + "flos": 17493165544320.0, + "grad_norm": 45.27324442694777, + "language_loss": 0.87045026, + "learning_rate": 5.804951094578757e-07, + "loss": 0.88595939, + "num_input_tokens_seen": 272130315, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.25500488, + "step": 12618, + "time_per_iteration": 2.622885227203369 + }, + { + "auxiliary_loss_clip": 0.013212, + "auxiliary_loss_mlp": 0.00289755, + "balance_loss_clip": 1.08122754, + "balance_loss_mlp": 0.26026216, + "epoch": 0.758695325417105, + "flos": 17275187859840.0, + "grad_norm": 2.841622946630219, + "language_loss": 0.85881615, + "learning_rate": 5.802207796320209e-07, + "loss": 0.87492573, + "num_input_tokens_seen": 272149080, + "router_z_loss_clip": 2.40039062, + "router_z_loss_mlp": 0.29516602, + "step": 12619, + "time_per_iteration": 2.6291160583496094 + }, + { + "auxiliary_loss_clip": 0.01272431, + "auxiliary_loss_mlp": 0.00267998, + "balance_loss_clip": 1.04869795, + "balance_loss_mlp": 0.24228457, + "epoch": 0.7587554486697731, + "flos": 29495660163840.0, + "grad_norm": 21.134744816363824, + "language_loss": 0.89424145, + "learning_rate": 5.79946503644337e-07, + "loss": 0.90964574, + "num_input_tokens_seen": 272168285, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.25720215, + "step": 12620, + "time_per_iteration": 2.8846263885498047 + }, + { + "auxiliary_loss_clip": 0.01310052, + "auxiliary_loss_mlp": 0.0027743, + "balance_loss_clip": 1.07201004, + "balance_loss_mlp": 0.24972543, + "epoch": 0.758815571922441, + "flos": 16100800692480.0, + "grad_norm": 4.765544676669795, + "language_loss": 0.93704116, + "learning_rate": 5.796722815052242e-07, + "loss": 0.95291603, + "num_input_tokens_seen": 272184585, + "router_z_loss_clip": 2.37890625, + "router_z_loss_mlp": 0.27685547, + "step": 12621, + "time_per_iteration": 2.633944272994995 + }, + { + "auxiliary_loss_clip": 0.01261424, + "auxiliary_loss_mlp": 0.00249991, + "balance_loss_clip": 1.04121304, + "balance_loss_mlp": 0.22525528, + "epoch": 0.758875695175109, + "flos": 16143714466560.0, + "grad_norm": 7.506009552144079, + "language_loss": 0.80361575, + "learning_rate": 5.7939811322508e-07, + "loss": 0.81872988, + "num_input_tokens_seen": 272200205, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.24743652, + "step": 12622, + "time_per_iteration": 2.660008192062378 + }, + { + "auxiliary_loss_clip": 0.01151874, + "auxiliary_loss_mlp": 0.00109612, + "balance_loss_clip": 0.99706173, + "balance_loss_mlp": 0.10222103, + "epoch": 0.7589358184277769, + "flos": 68462006860800.0, + "grad_norm": 0.8254282923909504, + "language_loss": 0.6035229, + "learning_rate": 5.791239988143024e-07, + "loss": 0.61613774, + "num_input_tokens_seen": 272259670, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.07373047, + "step": 12623, + "time_per_iteration": 3.2179250717163086 + }, + { + "auxiliary_loss_clip": 0.01268189, + "auxiliary_loss_mlp": 0.00249212, + "balance_loss_clip": 1.05212879, + "balance_loss_mlp": 0.22491759, + "epoch": 0.7589959416804449, + "flos": 20047311889920.0, + "grad_norm": 9.339506264648017, + "language_loss": 0.75714028, + "learning_rate": 5.788499382832847e-07, + "loss": 0.77231431, + "num_input_tokens_seen": 272277925, + "router_z_loss_clip": 2.16113281, + "router_z_loss_mlp": 0.24316406, + "step": 12624, + "time_per_iteration": 2.6495935916900635 + }, + { + "auxiliary_loss_clip": 0.01282629, + "auxiliary_loss_mlp": 0.00253033, + "balance_loss_clip": 1.05373907, + "balance_loss_mlp": 0.22760592, + "epoch": 0.7590560649331128, + "flos": 18771800958720.0, + "grad_norm": 3.707521337048524, + "language_loss": 0.83972198, + "learning_rate": 5.785759316424196e-07, + "loss": 0.85507858, + "num_input_tokens_seen": 272296010, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.25439453, + "step": 12625, + "time_per_iteration": 2.6595163345336914 + }, + { + "auxiliary_loss_clip": 0.012789, + "auxiliary_loss_mlp": 0.00255717, + "balance_loss_clip": 1.05914307, + "balance_loss_mlp": 0.23003972, + "epoch": 0.7591161881857809, + "flos": 29825284296960.0, + "grad_norm": 22.9179340869197, + "language_loss": 0.70150089, + "learning_rate": 5.783019789020977e-07, + "loss": 0.71684712, + "num_input_tokens_seen": 272318330, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.25683594, + "step": 12626, + "time_per_iteration": 2.7379026412963867 + }, + { + "auxiliary_loss_clip": 0.01319321, + "auxiliary_loss_mlp": 0.00264171, + "balance_loss_clip": 1.07746744, + "balance_loss_mlp": 0.23508358, + "epoch": 0.7591763114384488, + "flos": 20302708567680.0, + "grad_norm": 5.530054788789839, + "language_loss": 0.85549206, + "learning_rate": 5.780280800727084e-07, + "loss": 0.87132698, + "num_input_tokens_seen": 272335265, + "router_z_loss_clip": 2.41796875, + "router_z_loss_mlp": 0.29101562, + "step": 12627, + "time_per_iteration": 2.7339189052581787 + }, + { + "auxiliary_loss_clip": 0.01277781, + "auxiliary_loss_mlp": 0.00263399, + "balance_loss_clip": 1.05247498, + "balance_loss_mlp": 0.23663664, + "epoch": 0.7592364346911168, + "flos": 20813609664000.0, + "grad_norm": 16.72962765557675, + "language_loss": 0.77677977, + "learning_rate": 5.777542351646356e-07, + "loss": 0.79219151, + "num_input_tokens_seen": 272354795, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.2677002, + "step": 12628, + "time_per_iteration": 2.78608775138855 + }, + { + "auxiliary_loss_clip": 0.01335463, + "auxiliary_loss_mlp": 0.00239361, + "balance_loss_clip": 1.09049714, + "balance_loss_mlp": 0.21168074, + "epoch": 0.7592965579437848, + "flos": 21251504367360.0, + "grad_norm": 26.778772125990404, + "language_loss": 0.71522987, + "learning_rate": 5.774804441882648e-07, + "loss": 0.73097813, + "num_input_tokens_seen": 272372875, + "router_z_loss_clip": 2.453125, + "router_z_loss_mlp": 0.27697754, + "step": 12629, + "time_per_iteration": 2.683120012283325 + }, + { + "auxiliary_loss_clip": 0.01266568, + "auxiliary_loss_mlp": 0.00248088, + "balance_loss_clip": 1.04522228, + "balance_loss_mlp": 0.22386429, + "epoch": 0.7593566811964527, + "flos": 26213604704640.0, + "grad_norm": 4.876445934758837, + "language_loss": 0.84743214, + "learning_rate": 5.772067071539786e-07, + "loss": 0.86257869, + "num_input_tokens_seen": 272394715, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.24243164, + "step": 12630, + "time_per_iteration": 2.712991237640381 + }, + { + "auxiliary_loss_clip": 0.01130936, + "auxiliary_loss_mlp": 0.00081728, + "balance_loss_clip": 0.98053002, + "balance_loss_mlp": 0.0743847, + "epoch": 0.7594168044491207, + "flos": 71237255374080.0, + "grad_norm": 0.9229598482375224, + "language_loss": 0.60790735, + "learning_rate": 5.769330240721562e-07, + "loss": 0.62003404, + "num_input_tokens_seen": 272458775, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.07324219, + "step": 12631, + "time_per_iteration": 3.2272579669952393 + }, + { + "auxiliary_loss_clip": 0.01316104, + "auxiliary_loss_mlp": 0.00268762, + "balance_loss_clip": 1.07628369, + "balance_loss_mlp": 0.2395077, + "epoch": 0.7594769277017887, + "flos": 26613326229120.0, + "grad_norm": 40.33735774445904, + "language_loss": 0.81648147, + "learning_rate": 5.766593949531767e-07, + "loss": 0.83233011, + "num_input_tokens_seen": 272479355, + "router_z_loss_clip": 2.40039062, + "router_z_loss_mlp": 0.29260254, + "step": 12632, + "time_per_iteration": 2.6948697566986084 + }, + { + "auxiliary_loss_clip": 0.01283622, + "auxiliary_loss_mlp": 0.00253244, + "balance_loss_clip": 1.05537271, + "balance_loss_mlp": 0.22680296, + "epoch": 0.7595370509544567, + "flos": 17595941333760.0, + "grad_norm": 16.656169885579107, + "language_loss": 0.8086071, + "learning_rate": 5.763858198074154e-07, + "loss": 0.82397574, + "num_input_tokens_seen": 272493555, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.26464844, + "step": 12633, + "time_per_iteration": 2.597329616546631 + }, + { + "auxiliary_loss_clip": 0.01279593, + "auxiliary_loss_mlp": 0.00255621, + "balance_loss_clip": 1.05361652, + "balance_loss_mlp": 0.23124278, + "epoch": 0.7595971742071246, + "flos": 18002953319040.0, + "grad_norm": 26.41190958748143, + "language_loss": 0.81153089, + "learning_rate": 5.76112298645246e-07, + "loss": 0.82688302, + "num_input_tokens_seen": 272508925, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.24377441, + "step": 12634, + "time_per_iteration": 2.634786605834961 + }, + { + "auxiliary_loss_clip": 0.01272389, + "auxiliary_loss_mlp": 0.00271048, + "balance_loss_clip": 1.05065203, + "balance_loss_mlp": 0.24607371, + "epoch": 0.7596572974597926, + "flos": 28840326480000.0, + "grad_norm": 114.58533133703563, + "language_loss": 0.73354024, + "learning_rate": 5.758388314770408e-07, + "loss": 0.74897462, + "num_input_tokens_seen": 272528805, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.24987793, + "step": 12635, + "time_per_iteration": 2.713315725326538 + }, + { + "auxiliary_loss_clip": 0.01276518, + "auxiliary_loss_mlp": 0.00271325, + "balance_loss_clip": 1.04679203, + "balance_loss_mlp": 0.2457072, + "epoch": 0.7597174207124605, + "flos": 14282823588480.0, + "grad_norm": 45.822509611410574, + "language_loss": 0.77312601, + "learning_rate": 5.7556541831317e-07, + "loss": 0.78860438, + "num_input_tokens_seen": 272546655, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.25622559, + "step": 12636, + "time_per_iteration": 2.650327444076538 + }, + { + "auxiliary_loss_clip": 0.01291774, + "auxiliary_loss_mlp": 0.00273509, + "balance_loss_clip": 1.06379139, + "balance_loss_mlp": 0.24691375, + "epoch": 0.7597775439651285, + "flos": 21688932193920.0, + "grad_norm": 42.08502502354406, + "language_loss": 0.89285862, + "learning_rate": 5.752920591640018e-07, + "loss": 0.9085114, + "num_input_tokens_seen": 272564010, + "router_z_loss_clip": 2.28320312, + "router_z_loss_mlp": 0.26635742, + "step": 12637, + "time_per_iteration": 4.087862968444824 + }, + { + "auxiliary_loss_clip": 0.01257843, + "auxiliary_loss_mlp": 0.00267085, + "balance_loss_clip": 1.0385505, + "balance_loss_mlp": 0.24137157, + "epoch": 0.7598376672177964, + "flos": 36101248312320.0, + "grad_norm": 77.76543173872622, + "language_loss": 0.73894453, + "learning_rate": 5.750187540399017e-07, + "loss": 0.75419384, + "num_input_tokens_seen": 272585840, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.25732422, + "step": 12638, + "time_per_iteration": 2.7655234336853027 + }, + { + "auxiliary_loss_clip": 0.0127536, + "auxiliary_loss_mlp": 0.0028638, + "balance_loss_clip": 1.0484786, + "balance_loss_mlp": 0.25943831, + "epoch": 0.7598977904704645, + "flos": 18332326056960.0, + "grad_norm": 212.26535849888418, + "language_loss": 0.7605871, + "learning_rate": 5.747455029512323e-07, + "loss": 0.77620453, + "num_input_tokens_seen": 272602300, + "router_z_loss_clip": 2.265625, + "router_z_loss_mlp": 0.26928711, + "step": 12639, + "time_per_iteration": 4.165729522705078 + }, + { + "auxiliary_loss_clip": 0.01283776, + "auxiliary_loss_mlp": 0.00266291, + "balance_loss_clip": 1.05757165, + "balance_loss_mlp": 0.23983884, + "epoch": 0.7599579137231324, + "flos": 20192642317440.0, + "grad_norm": 6.207100336406484, + "language_loss": 0.80703259, + "learning_rate": 5.744723059083572e-07, + "loss": 0.82253325, + "num_input_tokens_seen": 272619595, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.26464844, + "step": 12640, + "time_per_iteration": 2.6329703330993652 + }, + { + "auxiliary_loss_clip": 0.01298569, + "auxiliary_loss_mlp": 0.00280582, + "balance_loss_clip": 1.06817937, + "balance_loss_mlp": 0.25263932, + "epoch": 0.7600180369758004, + "flos": 24024849459840.0, + "grad_norm": 24.075018322635245, + "language_loss": 0.74821579, + "learning_rate": 5.741991629216343e-07, + "loss": 0.76400721, + "num_input_tokens_seen": 272638825, + "router_z_loss_clip": 2.30273438, + "router_z_loss_mlp": 0.27954102, + "step": 12641, + "time_per_iteration": 2.664675712585449 + }, + { + "auxiliary_loss_clip": 0.01306156, + "auxiliary_loss_mlp": 0.00266602, + "balance_loss_clip": 1.07398677, + "balance_loss_mlp": 0.2375747, + "epoch": 0.7600781602284684, + "flos": 18989527248000.0, + "grad_norm": 34.21978896635616, + "language_loss": 0.76469827, + "learning_rate": 5.73926074001422e-07, + "loss": 0.78042591, + "num_input_tokens_seen": 272657240, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.2902832, + "step": 12642, + "time_per_iteration": 4.229751110076904 + }, + { + "auxiliary_loss_clip": 0.01269324, + "auxiliary_loss_mlp": 0.00253097, + "balance_loss_clip": 1.04378843, + "balance_loss_mlp": 0.22765759, + "epoch": 0.7601382834811363, + "flos": 26067520091520.0, + "grad_norm": 9.081573487650436, + "language_loss": 0.8439492, + "learning_rate": 5.736530391580765e-07, + "loss": 0.85917336, + "num_input_tokens_seen": 272677520, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.25476074, + "step": 12643, + "time_per_iteration": 2.699097156524658 + }, + { + "auxiliary_loss_clip": 0.01283213, + "auxiliary_loss_mlp": 0.00274452, + "balance_loss_clip": 1.05398703, + "balance_loss_mlp": 0.24511465, + "epoch": 0.7601984067338043, + "flos": 18844232734080.0, + "grad_norm": 13.611206102780388, + "language_loss": 0.8581003, + "learning_rate": 5.733800584019508e-07, + "loss": 0.8736769, + "num_input_tokens_seen": 272696770, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.29370117, + "step": 12644, + "time_per_iteration": 2.6784486770629883 + }, + { + "auxiliary_loss_clip": 0.0127139, + "auxiliary_loss_mlp": 0.00250781, + "balance_loss_clip": 1.04880834, + "balance_loss_mlp": 0.22453126, + "epoch": 0.7602585299864723, + "flos": 24646391424000.0, + "grad_norm": 2.3854985964746147, + "language_loss": 0.8597672, + "learning_rate": 5.731071317433957e-07, + "loss": 0.87498885, + "num_input_tokens_seen": 272718340, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.2623291, + "step": 12645, + "time_per_iteration": 2.692143201828003 + }, + { + "auxiliary_loss_clip": 0.01284925, + "auxiliary_loss_mlp": 0.00270207, + "balance_loss_clip": 1.0599364, + "balance_loss_mlp": 0.24289608, + "epoch": 0.7603186532391403, + "flos": 23842100039040.0, + "grad_norm": 9.401164668335099, + "language_loss": 0.79878932, + "learning_rate": 5.728342591927611e-07, + "loss": 0.81434065, + "num_input_tokens_seen": 272739575, + "router_z_loss_clip": 2.25097656, + "router_z_loss_mlp": 0.27319336, + "step": 12646, + "time_per_iteration": 2.684187173843384 + }, + { + "auxiliary_loss_clip": 0.01272319, + "auxiliary_loss_mlp": 0.0027154, + "balance_loss_clip": 1.04854774, + "balance_loss_mlp": 0.24562384, + "epoch": 0.7603787764918082, + "flos": 22199905117440.0, + "grad_norm": 6.955633531893365, + "language_loss": 0.76680887, + "learning_rate": 5.725614407603949e-07, + "loss": 0.78224748, + "num_input_tokens_seen": 272758710, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.25927734, + "step": 12647, + "time_per_iteration": 4.080297231674194 + }, + { + "auxiliary_loss_clip": 0.01117088, + "auxiliary_loss_mlp": 0.00101286, + "balance_loss_clip": 0.96495438, + "balance_loss_mlp": 0.09303628, + "epoch": 0.7604388997444762, + "flos": 54086894254080.0, + "grad_norm": 0.6637491776424435, + "language_loss": 0.48739842, + "learning_rate": 5.722886764566415e-07, + "loss": 0.49958214, + "num_input_tokens_seen": 272814855, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.08251953, + "step": 12648, + "time_per_iteration": 3.1484665870666504 + }, + { + "auxiliary_loss_clip": 0.01248064, + "auxiliary_loss_mlp": 0.00249525, + "balance_loss_clip": 1.03371191, + "balance_loss_mlp": 0.22539718, + "epoch": 0.7604990229971441, + "flos": 19681920789120.0, + "grad_norm": 6.520246822194471, + "language_loss": 0.83053517, + "learning_rate": 5.720159662918451e-07, + "loss": 0.84551102, + "num_input_tokens_seen": 272834400, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.24145508, + "step": 12649, + "time_per_iteration": 2.7060465812683105 + }, + { + "auxiliary_loss_clip": 0.01260904, + "auxiliary_loss_mlp": 0.00256847, + "balance_loss_clip": 1.04230392, + "balance_loss_mlp": 0.23057294, + "epoch": 0.7605591462498121, + "flos": 25228036356480.0, + "grad_norm": 7.298749613808355, + "language_loss": 0.73971081, + "learning_rate": 5.717433102763462e-07, + "loss": 0.7548883, + "num_input_tokens_seen": 272854760, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.26293945, + "step": 12650, + "time_per_iteration": 2.7326879501342773 + }, + { + "auxiliary_loss_clip": 0.01113344, + "auxiliary_loss_mlp": 0.00089957, + "balance_loss_clip": 0.96237659, + "balance_loss_mlp": 0.08270862, + "epoch": 0.76061926950248, + "flos": 66783757662720.0, + "grad_norm": 0.7736981918977436, + "language_loss": 0.62260449, + "learning_rate": 5.714707084204838e-07, + "loss": 0.63463748, + "num_input_tokens_seen": 272919030, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.07226562, + "step": 12651, + "time_per_iteration": 3.1622626781463623 + }, + { + "auxiliary_loss_clip": 0.01272256, + "auxiliary_loss_mlp": 0.00262288, + "balance_loss_clip": 1.04965115, + "balance_loss_mlp": 0.23682462, + "epoch": 0.7606793927551481, + "flos": 25338354001920.0, + "grad_norm": 4.646937430167028, + "language_loss": 0.79676628, + "learning_rate": 5.711981607345951e-07, + "loss": 0.81211174, + "num_input_tokens_seen": 272938925, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.25488281, + "step": 12652, + "time_per_iteration": 2.73091721534729 + }, + { + "auxiliary_loss_clip": 0.01281946, + "auxiliary_loss_mlp": 0.0027673, + "balance_loss_clip": 1.05346537, + "balance_loss_mlp": 0.24910918, + "epoch": 0.760739516007816, + "flos": 18223624523520.0, + "grad_norm": 201.46983676024675, + "language_loss": 0.85167873, + "learning_rate": 5.709256672290152e-07, + "loss": 0.86726546, + "num_input_tokens_seen": 272954945, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.27636719, + "step": 12653, + "time_per_iteration": 2.602353811264038 + }, + { + "auxiliary_loss_clip": 0.01278103, + "auxiliary_loss_mlp": 0.00270844, + "balance_loss_clip": 1.05287051, + "balance_loss_mlp": 0.24507064, + "epoch": 0.760799639260484, + "flos": 22559119079040.0, + "grad_norm": 41.408303239484404, + "language_loss": 0.87368715, + "learning_rate": 5.706532279140785e-07, + "loss": 0.88917661, + "num_input_tokens_seen": 272972855, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.25817871, + "step": 12654, + "time_per_iteration": 2.750601291656494 + }, + { + "auxiliary_loss_clip": 0.01279952, + "auxiliary_loss_mlp": 0.00273694, + "balance_loss_clip": 1.05165052, + "balance_loss_mlp": 0.24548939, + "epoch": 0.760859762513152, + "flos": 22309324922880.0, + "grad_norm": 6.10955388938357, + "language_loss": 0.87304151, + "learning_rate": 5.703808428001136e-07, + "loss": 0.88857806, + "num_input_tokens_seen": 272989895, + "router_z_loss_clip": 2.28320312, + "router_z_loss_mlp": 0.28173828, + "step": 12655, + "time_per_iteration": 2.694735288619995 + }, + { + "auxiliary_loss_clip": 0.01265614, + "auxiliary_loss_mlp": 0.00265036, + "balance_loss_clip": 1.04605639, + "balance_loss_mlp": 0.24088424, + "epoch": 0.7609198857658199, + "flos": 24863902231680.0, + "grad_norm": 95.45395242506218, + "language_loss": 0.74830914, + "learning_rate": 5.701085118974505e-07, + "loss": 0.76361567, + "num_input_tokens_seen": 273011695, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.24157715, + "step": 12656, + "time_per_iteration": 2.7514352798461914 + }, + { + "auxiliary_loss_clip": 0.0128328, + "auxiliary_loss_mlp": 0.00295887, + "balance_loss_clip": 1.05648553, + "balance_loss_mlp": 0.26809952, + "epoch": 0.760980009018488, + "flos": 16836790366080.0, + "grad_norm": 163.06514292778806, + "language_loss": 0.82111639, + "learning_rate": 5.698362352164164e-07, + "loss": 0.83690804, + "num_input_tokens_seen": 273028815, + "router_z_loss_clip": 2.265625, + "router_z_loss_mlp": 0.27807617, + "step": 12657, + "time_per_iteration": 2.6058833599090576 + }, + { + "auxiliary_loss_clip": 0.01109289, + "auxiliary_loss_mlp": 0.00074764, + "balance_loss_clip": 0.95733702, + "balance_loss_mlp": 0.06665795, + "epoch": 0.7610401322711559, + "flos": 61230603029760.0, + "grad_norm": 0.8528212645030652, + "language_loss": 0.64307958, + "learning_rate": 5.695640127673347e-07, + "loss": 0.6549201, + "num_input_tokens_seen": 273084080, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.08105469, + "step": 12658, + "time_per_iteration": 3.1242079734802246 + }, + { + "auxiliary_loss_clip": 0.01271993, + "auxiliary_loss_mlp": 0.00258595, + "balance_loss_clip": 1.0530467, + "balance_loss_mlp": 0.23177341, + "epoch": 0.7611002555238239, + "flos": 19640730867840.0, + "grad_norm": 9.664089974298758, + "language_loss": 0.86907721, + "learning_rate": 5.692918445605293e-07, + "loss": 0.88438308, + "num_input_tokens_seen": 273102295, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.26831055, + "step": 12659, + "time_per_iteration": 2.716341257095337 + }, + { + "auxiliary_loss_clip": 0.01259726, + "auxiliary_loss_mlp": 0.00281491, + "balance_loss_clip": 1.04040647, + "balance_loss_mlp": 0.2549426, + "epoch": 0.7611603787764918, + "flos": 26872206526080.0, + "grad_norm": 52.066370294333396, + "language_loss": 0.75392377, + "learning_rate": 5.690197306063209e-07, + "loss": 0.76933599, + "num_input_tokens_seen": 273123400, + "router_z_loss_clip": 2.19433594, + "router_z_loss_mlp": 0.26538086, + "step": 12660, + "time_per_iteration": 2.7115330696105957 + }, + { + "auxiliary_loss_clip": 0.01276889, + "auxiliary_loss_mlp": 0.00247881, + "balance_loss_clip": 1.05101657, + "balance_loss_mlp": 0.22178647, + "epoch": 0.7612205020291598, + "flos": 27344252085120.0, + "grad_norm": 9.117382853551334, + "language_loss": 0.7732867, + "learning_rate": 5.687476709150281e-07, + "loss": 0.7885344, + "num_input_tokens_seen": 273145150, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.2611084, + "step": 12661, + "time_per_iteration": 2.695098876953125 + }, + { + "auxiliary_loss_clip": 0.01265023, + "auxiliary_loss_mlp": 0.00270499, + "balance_loss_clip": 1.04531229, + "balance_loss_mlp": 0.24360535, + "epoch": 0.7612806252818277, + "flos": 29314598682240.0, + "grad_norm": 40.14297210166979, + "language_loss": 0.88851571, + "learning_rate": 5.68475665496966e-07, + "loss": 0.90387082, + "num_input_tokens_seen": 273165180, + "router_z_loss_clip": 2.20019531, + "router_z_loss_mlp": 0.26867676, + "step": 12662, + "time_per_iteration": 2.70656418800354 + }, + { + "auxiliary_loss_clip": 0.01265797, + "auxiliary_loss_mlp": 0.00271589, + "balance_loss_clip": 1.04287803, + "balance_loss_mlp": 0.24531564, + "epoch": 0.7613407485344957, + "flos": 19026048401280.0, + "grad_norm": 60.62080104282488, + "language_loss": 0.77893865, + "learning_rate": 5.682037143624505e-07, + "loss": 0.79431248, + "num_input_tokens_seen": 273184005, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.26269531, + "step": 12663, + "time_per_iteration": 2.693967819213867 + }, + { + "auxiliary_loss_clip": 0.01286552, + "auxiliary_loss_mlp": 0.00248802, + "balance_loss_clip": 1.06293273, + "balance_loss_mlp": 0.22472212, + "epoch": 0.7614008717871636, + "flos": 23256037733760.0, + "grad_norm": 3.598346751993354, + "language_loss": 0.76048672, + "learning_rate": 5.67931817521794e-07, + "loss": 0.77584022, + "num_input_tokens_seen": 273203565, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.2409668, + "step": 12664, + "time_per_iteration": 2.7175943851470947 + }, + { + "auxiliary_loss_clip": 0.01300724, + "auxiliary_loss_mlp": 0.0025871, + "balance_loss_clip": 1.06969571, + "balance_loss_mlp": 0.23318724, + "epoch": 0.7614609950398317, + "flos": 21579907438080.0, + "grad_norm": 80.05537205254802, + "language_loss": 0.86880851, + "learning_rate": 5.676599749853066e-07, + "loss": 0.88440287, + "num_input_tokens_seen": 273221645, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.25537109, + "step": 12665, + "time_per_iteration": 2.83748459815979 + }, + { + "auxiliary_loss_clip": 0.01284264, + "auxiliary_loss_mlp": 0.00266606, + "balance_loss_clip": 1.06446576, + "balance_loss_mlp": 0.24136898, + "epoch": 0.7615211182924996, + "flos": 29277897960960.0, + "grad_norm": 5.28276408912887, + "language_loss": 0.94114959, + "learning_rate": 5.673881867632959e-07, + "loss": 0.95665824, + "num_input_tokens_seen": 273242040, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.25231934, + "step": 12666, + "time_per_iteration": 2.7577264308929443 + }, + { + "auxiliary_loss_clip": 0.01276448, + "auxiliary_loss_mlp": 0.00256307, + "balance_loss_clip": 1.05534613, + "balance_loss_mlp": 0.23126142, + "epoch": 0.7615812415451676, + "flos": 13261129136640.0, + "grad_norm": 6.9619219239999, + "language_loss": 0.90980184, + "learning_rate": 5.671164528660693e-07, + "loss": 0.92512941, + "num_input_tokens_seen": 273257365, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.25036621, + "step": 12667, + "time_per_iteration": 2.6422979831695557 + }, + { + "auxiliary_loss_clip": 0.01261752, + "auxiliary_loss_mlp": 0.00257461, + "balance_loss_clip": 1.04759967, + "balance_loss_mlp": 0.23230821, + "epoch": 0.7616413647978356, + "flos": 18584741905920.0, + "grad_norm": 20.24145768387223, + "language_loss": 0.85315037, + "learning_rate": 5.668447733039296e-07, + "loss": 0.86834246, + "num_input_tokens_seen": 273274710, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.25183105, + "step": 12668, + "time_per_iteration": 2.6042826175689697 + }, + { + "auxiliary_loss_clip": 0.01262361, + "auxiliary_loss_mlp": 0.00260202, + "balance_loss_clip": 1.04206777, + "balance_loss_mlp": 0.23439345, + "epoch": 0.7617014880505035, + "flos": 18516188799360.0, + "grad_norm": 405.1028242604791, + "language_loss": 0.71782362, + "learning_rate": 5.6657314808718e-07, + "loss": 0.73304927, + "num_input_tokens_seen": 273292870, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.25805664, + "step": 12669, + "time_per_iteration": 2.662238836288452 + }, + { + "auxiliary_loss_clip": 0.01298241, + "auxiliary_loss_mlp": 0.00265572, + "balance_loss_clip": 1.06825781, + "balance_loss_mlp": 0.24024013, + "epoch": 0.7617616113031715, + "flos": 24973178382720.0, + "grad_norm": 141.2824557550159, + "language_loss": 0.72387612, + "learning_rate": 5.663015772261202e-07, + "loss": 0.73951423, + "num_input_tokens_seen": 273312375, + "router_z_loss_clip": 2.30078125, + "router_z_loss_mlp": 0.25317383, + "step": 12670, + "time_per_iteration": 2.6809232234954834 + }, + { + "auxiliary_loss_clip": 0.01285112, + "auxiliary_loss_mlp": 0.00258062, + "balance_loss_clip": 1.05987263, + "balance_loss_mlp": 0.23264615, + "epoch": 0.7618217345558395, + "flos": 23295036925440.0, + "grad_norm": 8.908977739018141, + "language_loss": 0.79546922, + "learning_rate": 5.660300607310493e-07, + "loss": 0.81090099, + "num_input_tokens_seen": 273332590, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.25402832, + "step": 12671, + "time_per_iteration": 2.718287706375122 + }, + { + "auxiliary_loss_clip": 0.01270738, + "auxiliary_loss_mlp": 0.00275106, + "balance_loss_clip": 1.04946101, + "balance_loss_mlp": 0.24957107, + "epoch": 0.7618818578085075, + "flos": 25482894330240.0, + "grad_norm": 14.662966719418453, + "language_loss": 0.78192908, + "learning_rate": 5.657585986122613e-07, + "loss": 0.79738754, + "num_input_tokens_seen": 273352885, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.25537109, + "step": 12672, + "time_per_iteration": 2.724764108657837 + }, + { + "auxiliary_loss_clip": 0.01111847, + "auxiliary_loss_mlp": 0.00090118, + "balance_loss_clip": 0.96094966, + "balance_loss_mlp": 0.08201206, + "epoch": 0.7619419810611754, + "flos": 61151994115200.0, + "grad_norm": 0.7369080867025939, + "language_loss": 0.55961502, + "learning_rate": 5.654871908800506e-07, + "loss": 0.57163465, + "num_input_tokens_seen": 273411730, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.08105469, + "step": 12673, + "time_per_iteration": 3.147416114807129 + }, + { + "auxiliary_loss_clip": 0.01289936, + "auxiliary_loss_mlp": 0.00284863, + "balance_loss_clip": 1.05865884, + "balance_loss_mlp": 0.2559672, + "epoch": 0.7620021043138434, + "flos": 23258659426560.0, + "grad_norm": 7.194931304048916, + "language_loss": 0.82926953, + "learning_rate": 5.652158375447102e-07, + "loss": 0.84501755, + "num_input_tokens_seen": 273430020, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.28918457, + "step": 12674, + "time_per_iteration": 2.6847071647644043 + }, + { + "auxiliary_loss_clip": 0.01264105, + "auxiliary_loss_mlp": 0.0025168, + "balance_loss_clip": 1.05056691, + "balance_loss_mlp": 0.22812402, + "epoch": 0.7620622275665113, + "flos": 25082490447360.0, + "grad_norm": 32.151163972297255, + "language_loss": 0.81374478, + "learning_rate": 5.649445386165286e-07, + "loss": 0.8289026, + "num_input_tokens_seen": 273448690, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.23535156, + "step": 12675, + "time_per_iteration": 2.6854469776153564 + }, + { + "auxiliary_loss_clip": 0.01271661, + "auxiliary_loss_mlp": 0.00278195, + "balance_loss_clip": 1.05157578, + "balance_loss_mlp": 0.25165898, + "epoch": 0.7621223508191793, + "flos": 20155007842560.0, + "grad_norm": 21.91298783165497, + "language_loss": 0.79442024, + "learning_rate": 5.646732941057936e-07, + "loss": 0.80991882, + "num_input_tokens_seen": 273465190, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.26513672, + "step": 12676, + "time_per_iteration": 2.6442995071411133 + }, + { + "auxiliary_loss_clip": 0.01309317, + "auxiliary_loss_mlp": 0.00262022, + "balance_loss_clip": 1.07254887, + "balance_loss_mlp": 0.23461571, + "epoch": 0.7621824740718472, + "flos": 18000187971840.0, + "grad_norm": 11.096268159823534, + "language_loss": 0.686059, + "learning_rate": 5.644021040227927e-07, + "loss": 0.70177239, + "num_input_tokens_seen": 273478620, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.27404785, + "step": 12677, + "time_per_iteration": 2.609644651412964 + }, + { + "auxiliary_loss_clip": 0.01289093, + "auxiliary_loss_mlp": 0.00258249, + "balance_loss_clip": 1.06207156, + "balance_loss_mlp": 0.23375136, + "epoch": 0.7622425973245153, + "flos": 21725668828800.0, + "grad_norm": 22.316303070910823, + "language_loss": 0.87141776, + "learning_rate": 5.641309683778064e-07, + "loss": 0.88689125, + "num_input_tokens_seen": 273497635, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.24487305, + "step": 12678, + "time_per_iteration": 2.686627149581909 + }, + { + "auxiliary_loss_clip": 0.01263653, + "auxiliary_loss_mlp": 0.0022637, + "balance_loss_clip": 1.043823, + "balance_loss_mlp": 0.20207542, + "epoch": 0.7623027205771832, + "flos": 19718549683200.0, + "grad_norm": 107.04999422075765, + "language_loss": 0.84730542, + "learning_rate": 5.638598871811175e-07, + "loss": 0.86220568, + "num_input_tokens_seen": 273513955, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.24316406, + "step": 12679, + "time_per_iteration": 4.0583176612854 + }, + { + "auxiliary_loss_clip": 0.01266933, + "auxiliary_loss_mlp": 0.00243007, + "balance_loss_clip": 1.04432011, + "balance_loss_mlp": 0.21655481, + "epoch": 0.7623628438298512, + "flos": 23988831096960.0, + "grad_norm": 18.349422831314328, + "language_loss": 0.85801864, + "learning_rate": 5.635888604430059e-07, + "loss": 0.87311804, + "num_input_tokens_seen": 273533970, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.26452637, + "step": 12680, + "time_per_iteration": 2.6646339893341064 + }, + { + "auxiliary_loss_clip": 0.01288256, + "auxiliary_loss_mlp": 0.00250237, + "balance_loss_clip": 1.06416309, + "balance_loss_mlp": 0.22576348, + "epoch": 0.7624229670825191, + "flos": 22345702421760.0, + "grad_norm": 6.95107515506403, + "language_loss": 0.73563689, + "learning_rate": 5.633178881737493e-07, + "loss": 0.75102186, + "num_input_tokens_seen": 273553090, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.24487305, + "step": 12681, + "time_per_iteration": 4.145059108734131 + }, + { + "auxiliary_loss_clip": 0.0126622, + "auxiliary_loss_mlp": 0.00245147, + "balance_loss_clip": 1.04822731, + "balance_loss_mlp": 0.21827716, + "epoch": 0.7624830903351871, + "flos": 22711775880960.0, + "grad_norm": 323.65011493987265, + "language_loss": 0.83983397, + "learning_rate": 5.63046970383622e-07, + "loss": 0.85494757, + "num_input_tokens_seen": 273572460, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.26879883, + "step": 12682, + "time_per_iteration": 2.6562650203704834 + }, + { + "auxiliary_loss_clip": 0.01275874, + "auxiliary_loss_mlp": 0.00238138, + "balance_loss_clip": 1.05261517, + "balance_loss_mlp": 0.21352145, + "epoch": 0.7625432135878552, + "flos": 25593714766080.0, + "grad_norm": 5.722497589168924, + "language_loss": 0.75437444, + "learning_rate": 5.627761070828974e-07, + "loss": 0.76951456, + "num_input_tokens_seen": 273592815, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.24633789, + "step": 12683, + "time_per_iteration": 2.684774160385132 + }, + { + "auxiliary_loss_clip": 0.01264959, + "auxiliary_loss_mlp": 0.00272959, + "balance_loss_clip": 1.04152226, + "balance_loss_mlp": 0.24550475, + "epoch": 0.7626033368405231, + "flos": 23987645948160.0, + "grad_norm": 4.2393519134954625, + "language_loss": 0.90975869, + "learning_rate": 5.625052982818472e-07, + "loss": 0.92513788, + "num_input_tokens_seen": 273611790, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.2746582, + "step": 12684, + "time_per_iteration": 2.695608139038086 + }, + { + "auxiliary_loss_clip": 0.01269426, + "auxiliary_loss_mlp": 0.00236577, + "balance_loss_clip": 1.05039001, + "balance_loss_mlp": 0.2122938, + "epoch": 0.7626634600931911, + "flos": 12599115523200.0, + "grad_norm": 34.77029736604417, + "language_loss": 0.90014654, + "learning_rate": 5.622345439907396e-07, + "loss": 0.91520655, + "num_input_tokens_seen": 273628340, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.24267578, + "step": 12685, + "time_per_iteration": 4.18950343132019 + }, + { + "auxiliary_loss_clip": 0.01286823, + "auxiliary_loss_mlp": 0.00241017, + "balance_loss_clip": 1.06376743, + "balance_loss_mlp": 0.21600702, + "epoch": 0.762723583345859, + "flos": 26322593546880.0, + "grad_norm": 8.987446404860817, + "language_loss": 0.84869647, + "learning_rate": 5.619638442198422e-07, + "loss": 0.86397481, + "num_input_tokens_seen": 273646585, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.25012207, + "step": 12686, + "time_per_iteration": 2.6924245357513428 + }, + { + "auxiliary_loss_clip": 0.01276487, + "auxiliary_loss_mlp": 0.00244735, + "balance_loss_clip": 1.04911041, + "balance_loss_mlp": 0.2182703, + "epoch": 0.762783706598527, + "flos": 21907053532800.0, + "grad_norm": 502.954820396623, + "language_loss": 0.81199074, + "learning_rate": 5.616931989794198e-07, + "loss": 0.82720292, + "num_input_tokens_seen": 273665410, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.26477051, + "step": 12687, + "time_per_iteration": 2.6972367763519287 + }, + { + "auxiliary_loss_clip": 0.01273548, + "auxiliary_loss_mlp": 0.00235935, + "balance_loss_clip": 1.05090976, + "balance_loss_mlp": 0.21088867, + "epoch": 0.7628438298511949, + "flos": 15339782217600.0, + "grad_norm": 14.665675381086281, + "language_loss": 0.74053788, + "learning_rate": 5.614226082797369e-07, + "loss": 0.7556327, + "num_input_tokens_seen": 273683035, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.25061035, + "step": 12688, + "time_per_iteration": 2.6413238048553467 + }, + { + "auxiliary_loss_clip": 0.01267946, + "auxiliary_loss_mlp": 0.00239779, + "balance_loss_clip": 1.0488596, + "balance_loss_mlp": 0.21504264, + "epoch": 0.7629039531038629, + "flos": 13006307076480.0, + "grad_norm": 23.48138802857968, + "language_loss": 0.78331769, + "learning_rate": 5.611520721310515e-07, + "loss": 0.79839498, + "num_input_tokens_seen": 273700130, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.24768066, + "step": 12689, + "time_per_iteration": 4.041190147399902 + }, + { + "auxiliary_loss_clip": 0.01290872, + "auxiliary_loss_mlp": 0.00264731, + "balance_loss_clip": 1.05914736, + "balance_loss_mlp": 0.23719397, + "epoch": 0.7629640763565309, + "flos": 26171660597760.0, + "grad_norm": 49.167961375443426, + "language_loss": 0.78509408, + "learning_rate": 5.608815905436238e-07, + "loss": 0.80065012, + "num_input_tokens_seen": 273720310, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.27539062, + "step": 12690, + "time_per_iteration": 2.721982717514038 + }, + { + "auxiliary_loss_clip": 0.0128425, + "auxiliary_loss_mlp": 0.00271096, + "balance_loss_clip": 1.05851793, + "balance_loss_mlp": 0.24341583, + "epoch": 0.7630241996091989, + "flos": 36793713680640.0, + "grad_norm": 9.449557159645515, + "language_loss": 0.76459599, + "learning_rate": 5.606111635277109e-07, + "loss": 0.78014934, + "num_input_tokens_seen": 273744475, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.27697754, + "step": 12691, + "time_per_iteration": 2.90408992767334 + }, + { + "auxiliary_loss_clip": 0.01278683, + "auxiliary_loss_mlp": 0.00244073, + "balance_loss_clip": 1.05329657, + "balance_loss_mlp": 0.2170368, + "epoch": 0.7630843228618668, + "flos": 21835160461440.0, + "grad_norm": 32.35369533795213, + "language_loss": 0.92016912, + "learning_rate": 5.603407910935662e-07, + "loss": 0.93539667, + "num_input_tokens_seen": 273764635, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.27026367, + "step": 12692, + "time_per_iteration": 2.704926013946533 + }, + { + "auxiliary_loss_clip": 0.01283587, + "auxiliary_loss_mlp": 0.00219272, + "balance_loss_clip": 1.05760288, + "balance_loss_mlp": 0.19629988, + "epoch": 0.7631444461145348, + "flos": 12640520926080.0, + "grad_norm": 84.43506609685507, + "language_loss": 0.88809896, + "learning_rate": 5.600704732514438e-07, + "loss": 0.90312755, + "num_input_tokens_seen": 273780115, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.22961426, + "step": 12693, + "time_per_iteration": 2.6399025917053223 + }, + { + "auxiliary_loss_clip": 0.01308596, + "auxiliary_loss_mlp": 0.00242748, + "balance_loss_clip": 1.07464433, + "balance_loss_mlp": 0.21500841, + "epoch": 0.7632045693672027, + "flos": 16836610798080.0, + "grad_norm": 4.616658233314432, + "language_loss": 0.83732796, + "learning_rate": 5.598002100115933e-07, + "loss": 0.85284138, + "num_input_tokens_seen": 273796605, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.27734375, + "step": 12694, + "time_per_iteration": 2.7257773876190186 + }, + { + "auxiliary_loss_clip": 0.01270864, + "auxiliary_loss_mlp": 0.00243649, + "balance_loss_clip": 1.04878175, + "balance_loss_mlp": 0.21818569, + "epoch": 0.7632646926198707, + "flos": 22017335264640.0, + "grad_norm": 19.912793942683418, + "language_loss": 0.77847421, + "learning_rate": 5.595300013842625e-07, + "loss": 0.79361939, + "num_input_tokens_seen": 273816515, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.25488281, + "step": 12695, + "time_per_iteration": 2.6410582065582275 + }, + { + "auxiliary_loss_clip": 0.01275267, + "auxiliary_loss_mlp": 0.00243775, + "balance_loss_clip": 1.05210817, + "balance_loss_mlp": 0.21924129, + "epoch": 0.7633248158725388, + "flos": 23114011357440.0, + "grad_norm": 55.45335177371007, + "language_loss": 0.8049857, + "learning_rate": 5.592598473796985e-07, + "loss": 0.82017612, + "num_input_tokens_seen": 273837060, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.24536133, + "step": 12696, + "time_per_iteration": 2.6885006427764893 + }, + { + "auxiliary_loss_clip": 0.01282975, + "auxiliary_loss_mlp": 0.00248455, + "balance_loss_clip": 1.05732501, + "balance_loss_mlp": 0.22180003, + "epoch": 0.7633849391252067, + "flos": 10889839952640.0, + "grad_norm": 10.786900769057517, + "language_loss": 0.81736147, + "learning_rate": 5.589897480081453e-07, + "loss": 0.83267581, + "num_input_tokens_seen": 273853365, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.26623535, + "step": 12697, + "time_per_iteration": 2.6312382221221924 + }, + { + "auxiliary_loss_clip": 0.01274418, + "auxiliary_loss_mlp": 0.00245323, + "balance_loss_clip": 1.058375, + "balance_loss_mlp": 0.22022918, + "epoch": 0.7634450623778747, + "flos": 20994168355200.0, + "grad_norm": 3.09226679857225, + "language_loss": 0.75251657, + "learning_rate": 5.587197032798461e-07, + "loss": 0.76771396, + "num_input_tokens_seen": 273870750, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.25097656, + "step": 12698, + "time_per_iteration": 2.6544764041900635 + }, + { + "auxiliary_loss_clip": 0.01272114, + "auxiliary_loss_mlp": 0.00272343, + "balance_loss_clip": 1.04830515, + "balance_loss_mlp": 0.24575932, + "epoch": 0.7635051856305426, + "flos": 18882046776960.0, + "grad_norm": 10.136195151070405, + "language_loss": 0.80456507, + "learning_rate": 5.5844971320504e-07, + "loss": 0.82000959, + "num_input_tokens_seen": 273890890, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.26550293, + "step": 12699, + "time_per_iteration": 2.6564512252807617 + }, + { + "auxiliary_loss_clip": 0.01271575, + "auxiliary_loss_mlp": 0.00253628, + "balance_loss_clip": 1.05176628, + "balance_loss_mlp": 0.22789089, + "epoch": 0.7635653088832106, + "flos": 34786989584640.0, + "grad_norm": 11.121182999298837, + "language_loss": 0.79414809, + "learning_rate": 5.581797777939648e-07, + "loss": 0.80940008, + "num_input_tokens_seen": 273914015, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.25756836, + "step": 12700, + "time_per_iteration": 2.771686315536499 + }, + { + "auxiliary_loss_clip": 0.01290633, + "auxiliary_loss_mlp": 0.00250093, + "balance_loss_clip": 1.06334662, + "balance_loss_mlp": 0.22263895, + "epoch": 0.7636254321358785, + "flos": 23178434400000.0, + "grad_norm": 615.6056670487602, + "language_loss": 0.77938581, + "learning_rate": 5.579098970568574e-07, + "loss": 0.79479301, + "num_input_tokens_seen": 273927415, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.27453613, + "step": 12701, + "time_per_iteration": 2.627302408218384 + }, + { + "auxiliary_loss_clip": 0.01276414, + "auxiliary_loss_mlp": 0.00235115, + "balance_loss_clip": 1.04677916, + "balance_loss_mlp": 0.20747024, + "epoch": 0.7636855553885465, + "flos": 21325229032320.0, + "grad_norm": 17.529886644067208, + "language_loss": 0.73591101, + "learning_rate": 5.576400710039508e-07, + "loss": 0.75102627, + "num_input_tokens_seen": 273946690, + "router_z_loss_clip": 2.30078125, + "router_z_loss_mlp": 0.27661133, + "step": 12702, + "time_per_iteration": 2.6611268520355225 + }, + { + "auxiliary_loss_clip": 0.012812, + "auxiliary_loss_mlp": 0.00238816, + "balance_loss_clip": 1.05792093, + "balance_loss_mlp": 0.21150544, + "epoch": 0.7637456786412145, + "flos": 28658079849600.0, + "grad_norm": 10.753575628267152, + "language_loss": 0.74613917, + "learning_rate": 5.57370299645477e-07, + "loss": 0.76133937, + "num_input_tokens_seen": 273966870, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.27282715, + "step": 12703, + "time_per_iteration": 2.744915008544922 + }, + { + "auxiliary_loss_clip": 0.01269462, + "auxiliary_loss_mlp": 0.00259686, + "balance_loss_clip": 1.04492784, + "balance_loss_mlp": 0.23294729, + "epoch": 0.7638058018938825, + "flos": 21907269014400.0, + "grad_norm": 5.375119436500927, + "language_loss": 0.90944815, + "learning_rate": 5.571005829916668e-07, + "loss": 0.9247396, + "num_input_tokens_seen": 273986360, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.26745605, + "step": 12704, + "time_per_iteration": 2.654968500137329 + }, + { + "auxiliary_loss_clip": 0.01262819, + "auxiliary_loss_mlp": 0.00228144, + "balance_loss_clip": 1.03703642, + "balance_loss_mlp": 0.20359899, + "epoch": 0.7638659251465504, + "flos": 29643899592960.0, + "grad_norm": 1084.79590878452, + "language_loss": 0.73452079, + "learning_rate": 5.568309210527469e-07, + "loss": 0.74943042, + "num_input_tokens_seen": 274009745, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.24572754, + "step": 12705, + "time_per_iteration": 2.796926736831665 + }, + { + "auxiliary_loss_clip": 0.01285243, + "auxiliary_loss_mlp": 0.00247589, + "balance_loss_clip": 1.05447924, + "balance_loss_mlp": 0.22098112, + "epoch": 0.7639260483992184, + "flos": 26141172929280.0, + "grad_norm": 2.027374859401982, + "language_loss": 0.82405746, + "learning_rate": 5.565613138389427e-07, + "loss": 0.83938575, + "num_input_tokens_seen": 274028775, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.26611328, + "step": 12706, + "time_per_iteration": 2.696748733520508 + }, + { + "auxiliary_loss_clip": 0.01283742, + "auxiliary_loss_mlp": 0.00225739, + "balance_loss_clip": 1.05814266, + "balance_loss_mlp": 0.20050268, + "epoch": 0.7639861716518863, + "flos": 20156695781760.0, + "grad_norm": 27.113752520732504, + "language_loss": 0.86145842, + "learning_rate": 5.562917613604781e-07, + "loss": 0.8765533, + "num_input_tokens_seen": 274047520, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.25256348, + "step": 12707, + "time_per_iteration": 2.668889045715332 + }, + { + "auxiliary_loss_clip": 0.01279369, + "auxiliary_loss_mlp": 0.00233706, + "balance_loss_clip": 1.05518937, + "balance_loss_mlp": 0.20821877, + "epoch": 0.7640462949045543, + "flos": 18583125793920.0, + "grad_norm": 56.16637415276084, + "language_loss": 0.86474192, + "learning_rate": 5.560222636275751e-07, + "loss": 0.87987268, + "num_input_tokens_seen": 274065350, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.25476074, + "step": 12708, + "time_per_iteration": 2.62629771232605 + }, + { + "auxiliary_loss_clip": 0.01110695, + "auxiliary_loss_mlp": 0.00060206, + "balance_loss_clip": 0.95458335, + "balance_loss_mlp": 0.0542698, + "epoch": 0.7641064181572224, + "flos": 68321991646080.0, + "grad_norm": 0.8679497125263992, + "language_loss": 0.55385458, + "learning_rate": 5.557528206504521e-07, + "loss": 0.56556368, + "num_input_tokens_seen": 274122315, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.05932617, + "step": 12709, + "time_per_iteration": 3.225857734680176 + }, + { + "auxiliary_loss_clip": 0.01286009, + "auxiliary_loss_mlp": 0.00269587, + "balance_loss_clip": 1.05525255, + "balance_loss_mlp": 0.24233535, + "epoch": 0.7641665414098903, + "flos": 17968982031360.0, + "grad_norm": 1562.6074145075663, + "language_loss": 0.73497874, + "learning_rate": 5.554834324393271e-07, + "loss": 0.75053465, + "num_input_tokens_seen": 274140555, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.27233887, + "step": 12710, + "time_per_iteration": 2.621039867401123 + }, + { + "auxiliary_loss_clip": 0.01296989, + "auxiliary_loss_mlp": 0.00262345, + "balance_loss_clip": 1.0657177, + "balance_loss_mlp": 0.23324564, + "epoch": 0.7642266646625583, + "flos": 21252078984960.0, + "grad_norm": 5.014917441913007, + "language_loss": 0.77292621, + "learning_rate": 5.552140990044154e-07, + "loss": 0.78851956, + "num_input_tokens_seen": 274161125, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.2911377, + "step": 12711, + "time_per_iteration": 2.7208259105682373 + }, + { + "auxiliary_loss_clip": 0.01272561, + "auxiliary_loss_mlp": 0.00241193, + "balance_loss_clip": 1.04996061, + "balance_loss_mlp": 0.21263005, + "epoch": 0.7642867879152262, + "flos": 22747794243840.0, + "grad_norm": 44.25669233893523, + "language_loss": 0.7947675, + "learning_rate": 5.549448203559293e-07, + "loss": 0.80990505, + "num_input_tokens_seen": 274180835, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.28527832, + "step": 12712, + "time_per_iteration": 2.743647575378418 + }, + { + "auxiliary_loss_clip": 0.01278613, + "auxiliary_loss_mlp": 0.0023695, + "balance_loss_clip": 1.05612421, + "balance_loss_mlp": 0.21154688, + "epoch": 0.7643469111678942, + "flos": 23332132696320.0, + "grad_norm": 7.590128367457276, + "language_loss": 0.88161051, + "learning_rate": 5.546755965040804e-07, + "loss": 0.89676613, + "num_input_tokens_seen": 274201190, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.25415039, + "step": 12713, + "time_per_iteration": 2.7385001182556152 + }, + { + "auxiliary_loss_clip": 0.0129941, + "auxiliary_loss_mlp": 0.00242214, + "balance_loss_clip": 1.06469238, + "balance_loss_mlp": 0.21330589, + "epoch": 0.7644070344205621, + "flos": 19857092440320.0, + "grad_norm": 82.01749561819447, + "language_loss": 0.91353804, + "learning_rate": 5.544064274590776e-07, + "loss": 0.9289543, + "num_input_tokens_seen": 274217595, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.2890625, + "step": 12714, + "time_per_iteration": 2.7090137004852295 + }, + { + "auxiliary_loss_clip": 0.01275007, + "auxiliary_loss_mlp": 0.00243453, + "balance_loss_clip": 1.04885674, + "balance_loss_mlp": 0.21694098, + "epoch": 0.7644671576732301, + "flos": 22090628966400.0, + "grad_norm": 38.02727192804331, + "language_loss": 0.81575394, + "learning_rate": 5.541373132311287e-07, + "loss": 0.83093858, + "num_input_tokens_seen": 274237885, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.26525879, + "step": 12715, + "time_per_iteration": 2.6875364780426025 + }, + { + "auxiliary_loss_clip": 0.01259474, + "auxiliary_loss_mlp": 0.00259965, + "balance_loss_clip": 1.03612244, + "balance_loss_mlp": 0.2340486, + "epoch": 0.7645272809258981, + "flos": 25481421872640.0, + "grad_norm": 2.2656378483611017, + "language_loss": 0.71774685, + "learning_rate": 5.538682538304376e-07, + "loss": 0.73294127, + "num_input_tokens_seen": 274258820, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.25915527, + "step": 12716, + "time_per_iteration": 2.7384514808654785 + }, + { + "auxiliary_loss_clip": 0.01287503, + "auxiliary_loss_mlp": 0.00253005, + "balance_loss_clip": 1.0624764, + "balance_loss_mlp": 0.22695763, + "epoch": 0.7645874041785661, + "flos": 21541877913600.0, + "grad_norm": 11.324470998960981, + "language_loss": 0.87118495, + "learning_rate": 5.535992492672068e-07, + "loss": 0.88659, + "num_input_tokens_seen": 274278835, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.26086426, + "step": 12717, + "time_per_iteration": 2.7705345153808594 + }, + { + "auxiliary_loss_clip": 0.0125312, + "auxiliary_loss_mlp": 0.0022754, + "balance_loss_clip": 1.03434587, + "balance_loss_mlp": 0.2047708, + "epoch": 0.764647527431234, + "flos": 20630896156800.0, + "grad_norm": 67.24863182386605, + "language_loss": 0.77688497, + "learning_rate": 5.53330299551638e-07, + "loss": 0.79169154, + "num_input_tokens_seen": 274297110, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.2277832, + "step": 12718, + "time_per_iteration": 2.7335708141326904 + }, + { + "auxiliary_loss_clip": 0.01267337, + "auxiliary_loss_mlp": 0.00243088, + "balance_loss_clip": 1.04746997, + "balance_loss_mlp": 0.21650398, + "epoch": 0.764707650683902, + "flos": 21434074220160.0, + "grad_norm": 65.99842473659072, + "language_loss": 0.88414466, + "learning_rate": 5.530614046939286e-07, + "loss": 0.89924884, + "num_input_tokens_seen": 274315610, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.26611328, + "step": 12719, + "time_per_iteration": 2.737826108932495 + }, + { + "auxiliary_loss_clip": 0.01280435, + "auxiliary_loss_mlp": 0.00239035, + "balance_loss_clip": 1.05423522, + "balance_loss_mlp": 0.21299939, + "epoch": 0.7647677739365699, + "flos": 22711201263360.0, + "grad_norm": 15.414486890632242, + "language_loss": 0.77339286, + "learning_rate": 5.527925647042754e-07, + "loss": 0.78858757, + "num_input_tokens_seen": 274333975, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.26049805, + "step": 12720, + "time_per_iteration": 2.7290849685668945 + }, + { + "auxiliary_loss_clip": 0.01292575, + "auxiliary_loss_mlp": 0.00234336, + "balance_loss_clip": 1.06210136, + "balance_loss_mlp": 0.20890856, + "epoch": 0.7648278971892379, + "flos": 21324115710720.0, + "grad_norm": 124.49053410507939, + "language_loss": 0.79750955, + "learning_rate": 5.52523779592875e-07, + "loss": 0.81277865, + "num_input_tokens_seen": 274353695, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.25402832, + "step": 12721, + "time_per_iteration": 4.193897724151611 + }, + { + "auxiliary_loss_clip": 0.01270147, + "auxiliary_loss_mlp": 0.00235622, + "balance_loss_clip": 1.04014158, + "balance_loss_mlp": 0.20759566, + "epoch": 0.764888020441906, + "flos": 20667345482880.0, + "grad_norm": 786.9545835555739, + "language_loss": 0.82688308, + "learning_rate": 5.522550493699163e-07, + "loss": 0.84194082, + "num_input_tokens_seen": 274371120, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.28015137, + "step": 12722, + "time_per_iteration": 2.6784377098083496 + }, + { + "auxiliary_loss_clip": 0.01249505, + "auxiliary_loss_mlp": 0.00242784, + "balance_loss_clip": 1.03159904, + "balance_loss_mlp": 0.217154, + "epoch": 0.7649481436945739, + "flos": 25082526360960.0, + "grad_norm": 3.157824594671958, + "language_loss": 0.81988513, + "learning_rate": 5.519863740455912e-07, + "loss": 0.83480799, + "num_input_tokens_seen": 274389665, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.25610352, + "step": 12723, + "time_per_iteration": 4.115354776382446 + }, + { + "auxiliary_loss_clip": 0.01277163, + "auxiliary_loss_mlp": 0.00246097, + "balance_loss_clip": 1.04906011, + "balance_loss_mlp": 0.21919173, + "epoch": 0.7650082669472419, + "flos": 24900890261760.0, + "grad_norm": 26.635030133750274, + "language_loss": 0.79632115, + "learning_rate": 5.517177536300881e-07, + "loss": 0.81155372, + "num_input_tokens_seen": 274408750, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.26953125, + "step": 12724, + "time_per_iteration": 2.7480392456054688 + }, + { + "auxiliary_loss_clip": 0.01256614, + "auxiliary_loss_mlp": 0.00241887, + "balance_loss_clip": 1.04081035, + "balance_loss_mlp": 0.21708001, + "epoch": 0.7650683901999098, + "flos": 14647388676480.0, + "grad_norm": 12.331624723612606, + "language_loss": 0.90284145, + "learning_rate": 5.514491881335935e-07, + "loss": 0.91782641, + "num_input_tokens_seen": 274424600, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.2479248, + "step": 12725, + "time_per_iteration": 2.5969669818878174 + }, + { + "auxiliary_loss_clip": 0.01256535, + "auxiliary_loss_mlp": 0.00241947, + "balance_loss_clip": 1.03298926, + "balance_loss_mlp": 0.21607873, + "epoch": 0.7651285134525778, + "flos": 26352434770560.0, + "grad_norm": 5.319726394561995, + "language_loss": 0.87116241, + "learning_rate": 5.511806775662901e-07, + "loss": 0.88614726, + "num_input_tokens_seen": 274443075, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.25878906, + "step": 12726, + "time_per_iteration": 2.736837148666382 + }, + { + "auxiliary_loss_clip": 0.0128045, + "auxiliary_loss_mlp": 0.00232887, + "balance_loss_clip": 1.05383837, + "balance_loss_mlp": 0.20849666, + "epoch": 0.7651886367052457, + "flos": 26646866553600.0, + "grad_norm": 24.876471836161723, + "language_loss": 0.77201301, + "learning_rate": 5.509122219383615e-07, + "loss": 0.78714633, + "num_input_tokens_seen": 274463240, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.24401855, + "step": 12727, + "time_per_iteration": 4.272839784622192 + }, + { + "auxiliary_loss_clip": 0.01235149, + "auxiliary_loss_mlp": 0.0024854, + "balance_loss_clip": 1.02609611, + "balance_loss_mlp": 0.22358923, + "epoch": 0.7652487599579137, + "flos": 25702847262720.0, + "grad_norm": 5.938427740939801, + "language_loss": 0.86825329, + "learning_rate": 5.506438212599864e-07, + "loss": 0.8830902, + "num_input_tokens_seen": 274482750, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.24938965, + "step": 12728, + "time_per_iteration": 2.6754238605499268 + }, + { + "auxiliary_loss_clip": 0.01294576, + "auxiliary_loss_mlp": 0.00268453, + "balance_loss_clip": 1.06279206, + "balance_loss_mlp": 0.24049857, + "epoch": 0.7653088832105817, + "flos": 28585576247040.0, + "grad_norm": 5.1412127835635975, + "language_loss": 0.6720134, + "learning_rate": 5.503754755413424e-07, + "loss": 0.68764365, + "num_input_tokens_seen": 274503545, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.27941895, + "step": 12729, + "time_per_iteration": 2.7920708656311035 + }, + { + "auxiliary_loss_clip": 0.01258531, + "auxiliary_loss_mlp": 0.00239111, + "balance_loss_clip": 1.04177094, + "balance_loss_mlp": 0.21419594, + "epoch": 0.7653690064632497, + "flos": 23366750428800.0, + "grad_norm": 1.6766076977020532, + "language_loss": 0.85918164, + "learning_rate": 5.501071847926055e-07, + "loss": 0.87415808, + "num_input_tokens_seen": 274523825, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.24926758, + "step": 12730, + "time_per_iteration": 2.67775297164917 + }, + { + "auxiliary_loss_clip": 0.01269265, + "auxiliary_loss_mlp": 0.00233231, + "balance_loss_clip": 1.04637349, + "balance_loss_mlp": 0.20655197, + "epoch": 0.7654291297159176, + "flos": 15773905992960.0, + "grad_norm": 40.693625287112404, + "language_loss": 0.79025698, + "learning_rate": 5.498389490239495e-07, + "loss": 0.80528188, + "num_input_tokens_seen": 274541625, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.26660156, + "step": 12731, + "time_per_iteration": 2.675168752670288 + }, + { + "auxiliary_loss_clip": 0.01271209, + "auxiliary_loss_mlp": 0.00243559, + "balance_loss_clip": 1.04941106, + "balance_loss_mlp": 0.21838206, + "epoch": 0.7654892529685856, + "flos": 18033800123520.0, + "grad_norm": 22.84271856590658, + "language_loss": 0.79287505, + "learning_rate": 5.495707682455471e-07, + "loss": 0.80802274, + "num_input_tokens_seen": 274557580, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.25158691, + "step": 12732, + "time_per_iteration": 4.00103497505188 + }, + { + "auxiliary_loss_clip": 0.01261296, + "auxiliary_loss_mlp": 0.00265011, + "balance_loss_clip": 1.04081023, + "balance_loss_mlp": 0.23901157, + "epoch": 0.7655493762212535, + "flos": 27236017428480.0, + "grad_norm": 380.5987507806138, + "language_loss": 0.84754133, + "learning_rate": 5.493026424675653e-07, + "loss": 0.86280435, + "num_input_tokens_seen": 274578135, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.26037598, + "step": 12733, + "time_per_iteration": 2.6978349685668945 + }, + { + "auxiliary_loss_clip": 0.01262183, + "auxiliary_loss_mlp": 0.00218941, + "balance_loss_clip": 1.04018617, + "balance_loss_mlp": 0.19415781, + "epoch": 0.7656094994739215, + "flos": 20773964027520.0, + "grad_norm": 13.649359085294176, + "language_loss": 0.83450544, + "learning_rate": 5.490345717001726e-07, + "loss": 0.8493166, + "num_input_tokens_seen": 274595655, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.24780273, + "step": 12734, + "time_per_iteration": 2.7412548065185547 + }, + { + "auxiliary_loss_clip": 0.01290823, + "auxiliary_loss_mlp": 0.00241746, + "balance_loss_clip": 1.05789089, + "balance_loss_mlp": 0.21372037, + "epoch": 0.7656696227265896, + "flos": 23039245198080.0, + "grad_norm": 1166.7471497503159, + "language_loss": 0.84016854, + "learning_rate": 5.48766555953535e-07, + "loss": 0.85549426, + "num_input_tokens_seen": 274616305, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.28039551, + "step": 12735, + "time_per_iteration": 2.7041659355163574 + }, + { + "auxiliary_loss_clip": 0.01250975, + "auxiliary_loss_mlp": 0.00258053, + "balance_loss_clip": 1.03314424, + "balance_loss_mlp": 0.23170736, + "epoch": 0.7657297459792575, + "flos": 27525636789120.0, + "grad_norm": 33.62679524328525, + "language_loss": 0.78910881, + "learning_rate": 5.484985952378145e-07, + "loss": 0.8041991, + "num_input_tokens_seen": 274638110, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.26342773, + "step": 12736, + "time_per_iteration": 2.7407126426696777 + }, + { + "auxiliary_loss_clip": 0.01291567, + "auxiliary_loss_mlp": 0.00266699, + "balance_loss_clip": 1.05670094, + "balance_loss_mlp": 0.23783794, + "epoch": 0.7657898692319255, + "flos": 17128456801920.0, + "grad_norm": 18.022030605774834, + "language_loss": 0.85962236, + "learning_rate": 5.482306895631728e-07, + "loss": 0.87520504, + "num_input_tokens_seen": 274656565, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.28845215, + "step": 12737, + "time_per_iteration": 2.650926113128662 + }, + { + "auxiliary_loss_clip": 0.01257372, + "auxiliary_loss_mlp": 0.00255671, + "balance_loss_clip": 1.0319289, + "balance_loss_mlp": 0.22819373, + "epoch": 0.7658499924845934, + "flos": 21465747037440.0, + "grad_norm": 8.187678095109984, + "language_loss": 0.84014106, + "learning_rate": 5.479628389397699e-07, + "loss": 0.85527146, + "num_input_tokens_seen": 274674215, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.2746582, + "step": 12738, + "time_per_iteration": 2.698997974395752 + }, + { + "auxiliary_loss_clip": 0.0128569, + "auxiliary_loss_mlp": 0.00240278, + "balance_loss_clip": 1.05474877, + "balance_loss_mlp": 0.21260919, + "epoch": 0.7659101157372614, + "flos": 29496665744640.0, + "grad_norm": 7.275680849010905, + "language_loss": 0.72027612, + "learning_rate": 5.476950433777603e-07, + "loss": 0.7355358, + "num_input_tokens_seen": 274693445, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.2767334, + "step": 12739, + "time_per_iteration": 2.6929633617401123 + }, + { + "auxiliary_loss_clip": 0.01268896, + "auxiliary_loss_mlp": 0.00257463, + "balance_loss_clip": 1.04310226, + "balance_loss_mlp": 0.22966355, + "epoch": 0.7659702389899293, + "flos": 18551812112640.0, + "grad_norm": 4.721233462991793, + "language_loss": 0.91074562, + "learning_rate": 5.474273028873004e-07, + "loss": 0.92600918, + "num_input_tokens_seen": 274712815, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.27783203, + "step": 12740, + "time_per_iteration": 2.6642727851867676 + }, + { + "auxiliary_loss_clip": 0.01280059, + "auxiliary_loss_mlp": 0.00241045, + "balance_loss_clip": 1.05366015, + "balance_loss_mlp": 0.21464038, + "epoch": 0.7660303622425974, + "flos": 23549176627200.0, + "grad_norm": 2.5096109225099874, + "language_loss": 0.73922372, + "learning_rate": 5.471596174785429e-07, + "loss": 0.75443476, + "num_input_tokens_seen": 274732690, + "router_z_loss_clip": 2.265625, + "router_z_loss_mlp": 0.26416016, + "step": 12741, + "time_per_iteration": 2.7189760208129883 + }, + { + "auxiliary_loss_clip": 0.01276061, + "auxiliary_loss_mlp": 0.00256909, + "balance_loss_clip": 1.05047274, + "balance_loss_mlp": 0.23045608, + "epoch": 0.7660904854952653, + "flos": 18916736336640.0, + "grad_norm": 9.530630448051854, + "language_loss": 0.82543206, + "learning_rate": 5.468919871616386e-07, + "loss": 0.84076178, + "num_input_tokens_seen": 274752460, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.26464844, + "step": 12742, + "time_per_iteration": 2.694627523422241 + }, + { + "auxiliary_loss_clip": 0.01256592, + "auxiliary_loss_mlp": 0.002597, + "balance_loss_clip": 1.03885436, + "balance_loss_mlp": 0.23527369, + "epoch": 0.7661506087479333, + "flos": 23147515768320.0, + "grad_norm": 26.52227494653124, + "language_loss": 0.82514238, + "learning_rate": 5.46624411946736e-07, + "loss": 0.84030533, + "num_input_tokens_seen": 274773070, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.24438477, + "step": 12743, + "time_per_iteration": 2.6362171173095703 + }, + { + "auxiliary_loss_clip": 0.01242203, + "auxiliary_loss_mlp": 0.00219153, + "balance_loss_clip": 1.02869725, + "balance_loss_mlp": 0.19515643, + "epoch": 0.7662107320006012, + "flos": 17565776887680.0, + "grad_norm": 6.491529606636794, + "language_loss": 0.83096552, + "learning_rate": 5.463568918439805e-07, + "loss": 0.84557915, + "num_input_tokens_seen": 274790220, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.23999023, + "step": 12744, + "time_per_iteration": 2.625882863998413 + }, + { + "auxiliary_loss_clip": 0.01278073, + "auxiliary_loss_mlp": 0.00248683, + "balance_loss_clip": 1.04803503, + "balance_loss_mlp": 0.22312406, + "epoch": 0.7662708552532692, + "flos": 22303075956480.0, + "grad_norm": 12.430210442778753, + "language_loss": 0.83055806, + "learning_rate": 5.460894268635181e-07, + "loss": 0.84582567, + "num_input_tokens_seen": 274805095, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.2557373, + "step": 12745, + "time_per_iteration": 2.619189500808716 + }, + { + "auxiliary_loss_clip": 0.01265823, + "auxiliary_loss_mlp": 0.00232247, + "balance_loss_clip": 1.04093623, + "balance_loss_mlp": 0.20585383, + "epoch": 0.7663309785059371, + "flos": 15742053607680.0, + "grad_norm": 5.937222466819913, + "language_loss": 0.86783063, + "learning_rate": 5.458220170154896e-07, + "loss": 0.88281137, + "num_input_tokens_seen": 274821800, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.26416016, + "step": 12746, + "time_per_iteration": 2.660491943359375 + }, + { + "auxiliary_loss_clip": 0.01114477, + "auxiliary_loss_mlp": 0.00102529, + "balance_loss_clip": 0.95968866, + "balance_loss_mlp": 0.09470846, + "epoch": 0.7663911017586051, + "flos": 62163312514560.0, + "grad_norm": 0.6828071238977172, + "language_loss": 0.56251711, + "learning_rate": 5.455546623100362e-07, + "loss": 0.57468712, + "num_input_tokens_seen": 274886970, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.078125, + "step": 12747, + "time_per_iteration": 3.1987884044647217 + }, + { + "auxiliary_loss_clip": 0.01256404, + "auxiliary_loss_mlp": 0.0022519, + "balance_loss_clip": 1.0385989, + "balance_loss_mlp": 0.20146723, + "epoch": 0.7664512250112732, + "flos": 26506025326080.0, + "grad_norm": 22.496614400677295, + "language_loss": 0.77997625, + "learning_rate": 5.452873627572956e-07, + "loss": 0.79479218, + "num_input_tokens_seen": 274907240, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.23706055, + "step": 12748, + "time_per_iteration": 2.677819013595581 + }, + { + "auxiliary_loss_clip": 0.01253444, + "auxiliary_loss_mlp": 0.00244907, + "balance_loss_clip": 1.03091586, + "balance_loss_mlp": 0.22008777, + "epoch": 0.7665113482639411, + "flos": 16249542912000.0, + "grad_norm": 21.440581026298105, + "language_loss": 0.78483546, + "learning_rate": 5.450201183674052e-07, + "loss": 0.79981893, + "num_input_tokens_seen": 274924650, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.24816895, + "step": 12749, + "time_per_iteration": 2.6415834426879883 + }, + { + "auxiliary_loss_clip": 0.01269609, + "auxiliary_loss_mlp": 0.00255309, + "balance_loss_clip": 1.04483795, + "balance_loss_mlp": 0.22853467, + "epoch": 0.7665714715166091, + "flos": 27197880163200.0, + "grad_norm": 9.114735132843384, + "language_loss": 0.80513895, + "learning_rate": 5.447529291504967e-07, + "loss": 0.82038808, + "num_input_tokens_seen": 274944550, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.26794434, + "step": 12750, + "time_per_iteration": 2.691922426223755 + }, + { + "auxiliary_loss_clip": 0.01251441, + "auxiliary_loss_mlp": 0.00224304, + "balance_loss_clip": 1.02965724, + "balance_loss_mlp": 0.20005706, + "epoch": 0.766631594769277, + "flos": 21067785279360.0, + "grad_norm": 169.1865444981657, + "language_loss": 0.83856225, + "learning_rate": 5.444857951167026e-07, + "loss": 0.85331964, + "num_input_tokens_seen": 274961330, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.24243164, + "step": 12751, + "time_per_iteration": 2.7842702865600586 + }, + { + "auxiliary_loss_clip": 0.01247276, + "auxiliary_loss_mlp": 0.00219848, + "balance_loss_clip": 1.03050613, + "balance_loss_mlp": 0.19607788, + "epoch": 0.766691718021945, + "flos": 24097963593600.0, + "grad_norm": 17.82081923906857, + "language_loss": 0.69288468, + "learning_rate": 5.442187162761537e-07, + "loss": 0.70755595, + "num_input_tokens_seen": 274981655, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.23791504, + "step": 12752, + "time_per_iteration": 2.7001657485961914 + }, + { + "auxiliary_loss_clip": 0.01294148, + "auxiliary_loss_mlp": 0.00230022, + "balance_loss_clip": 1.06202507, + "balance_loss_mlp": 0.20386712, + "epoch": 0.7667518412746129, + "flos": 23440654661760.0, + "grad_norm": 104.38853276876567, + "language_loss": 0.78793627, + "learning_rate": 5.439516926389767e-07, + "loss": 0.80317795, + "num_input_tokens_seen": 274999970, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.26135254, + "step": 12753, + "time_per_iteration": 2.6957192420959473 + }, + { + "auxiliary_loss_clip": 0.01267438, + "auxiliary_loss_mlp": 0.00230183, + "balance_loss_clip": 1.04411399, + "balance_loss_mlp": 0.20516106, + "epoch": 0.766811964527281, + "flos": 18148786536960.0, + "grad_norm": 5.898997145829721, + "language_loss": 0.69458109, + "learning_rate": 5.436847242152971e-07, + "loss": 0.70955729, + "num_input_tokens_seen": 275015805, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.25024414, + "step": 12754, + "time_per_iteration": 2.652019739151001 + }, + { + "auxiliary_loss_clip": 0.01267333, + "auxiliary_loss_mlp": 0.00242097, + "balance_loss_clip": 1.0463506, + "balance_loss_mlp": 0.21663356, + "epoch": 0.7668720877799489, + "flos": 19536051657600.0, + "grad_norm": 3.373882237469164, + "language_loss": 0.88521528, + "learning_rate": 5.434178110152401e-07, + "loss": 0.90030956, + "num_input_tokens_seen": 275031810, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.2545166, + "step": 12755, + "time_per_iteration": 2.6875650882720947 + }, + { + "auxiliary_loss_clip": 0.01256224, + "auxiliary_loss_mlp": 0.00251833, + "balance_loss_clip": 1.03609157, + "balance_loss_mlp": 0.22746709, + "epoch": 0.7669322110326169, + "flos": 22674320974080.0, + "grad_norm": 47.01766026647117, + "language_loss": 0.78717953, + "learning_rate": 5.431509530489242e-07, + "loss": 0.8022601, + "num_input_tokens_seen": 275049325, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.24353027, + "step": 12756, + "time_per_iteration": 2.668938159942627 + }, + { + "auxiliary_loss_clip": 0.01271031, + "auxiliary_loss_mlp": 0.00258082, + "balance_loss_clip": 1.04951322, + "balance_loss_mlp": 0.23308364, + "epoch": 0.7669923342852848, + "flos": 26469396432000.0, + "grad_norm": 2.721122924380877, + "language_loss": 0.75982213, + "learning_rate": 5.428841503264706e-07, + "loss": 0.77511322, + "num_input_tokens_seen": 275070865, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.25, + "step": 12757, + "time_per_iteration": 2.7869956493377686 + }, + { + "auxiliary_loss_clip": 0.01279347, + "auxiliary_loss_mlp": 0.00250541, + "balance_loss_clip": 1.05030942, + "balance_loss_mlp": 0.22252665, + "epoch": 0.7670524575379528, + "flos": 22856136641280.0, + "grad_norm": 4.827882037691112, + "language_loss": 0.85117406, + "learning_rate": 5.426174028579955e-07, + "loss": 0.8664729, + "num_input_tokens_seen": 275088015, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.28027344, + "step": 12758, + "time_per_iteration": 2.8065409660339355 + }, + { + "auxiliary_loss_clip": 0.01256663, + "auxiliary_loss_mlp": 0.00211963, + "balance_loss_clip": 1.03689814, + "balance_loss_mlp": 0.18620166, + "epoch": 0.7671125807906207, + "flos": 22452141398400.0, + "grad_norm": 59.11108311295037, + "language_loss": 0.83566093, + "learning_rate": 5.423507106536156e-07, + "loss": 0.85034722, + "num_input_tokens_seen": 275106975, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.25744629, + "step": 12759, + "time_per_iteration": 2.678190231323242 + }, + { + "auxiliary_loss_clip": 0.01252208, + "auxiliary_loss_mlp": 0.00245098, + "balance_loss_clip": 1.03189945, + "balance_loss_mlp": 0.2176802, + "epoch": 0.7671727040432887, + "flos": 35371543518720.0, + "grad_norm": 5.794891571131718, + "language_loss": 0.76103199, + "learning_rate": 5.420840737234425e-07, + "loss": 0.77600503, + "num_input_tokens_seen": 275129560, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.27416992, + "step": 12760, + "time_per_iteration": 2.80131459236145 + }, + { + "auxiliary_loss_clip": 0.01263752, + "auxiliary_loss_mlp": 0.00237329, + "balance_loss_clip": 1.0398314, + "balance_loss_mlp": 0.2121879, + "epoch": 0.7672328272959568, + "flos": 22494947431680.0, + "grad_norm": 5.477351256672551, + "language_loss": 0.85129118, + "learning_rate": 5.418174920775871e-07, + "loss": 0.86630201, + "num_input_tokens_seen": 275151180, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.25134277, + "step": 12761, + "time_per_iteration": 2.74959397315979 + }, + { + "auxiliary_loss_clip": 0.01251971, + "auxiliary_loss_mlp": 0.00260744, + "balance_loss_clip": 1.03325737, + "balance_loss_mlp": 0.23478061, + "epoch": 0.7672929505486247, + "flos": 22815557251200.0, + "grad_norm": 24.75385870329156, + "language_loss": 0.74162734, + "learning_rate": 5.415509657261589e-07, + "loss": 0.75675452, + "num_input_tokens_seen": 275170605, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.25964355, + "step": 12762, + "time_per_iteration": 2.689042329788208 + }, + { + "auxiliary_loss_clip": 0.01256268, + "auxiliary_loss_mlp": 0.00240576, + "balance_loss_clip": 1.03616214, + "balance_loss_mlp": 0.21504137, + "epoch": 0.7673530738012927, + "flos": 20338834671360.0, + "grad_norm": 33.28088805144394, + "language_loss": 0.82671475, + "learning_rate": 5.412844946792639e-07, + "loss": 0.84168315, + "num_input_tokens_seen": 275188750, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.25537109, + "step": 12763, + "time_per_iteration": 4.099484205245972 + }, + { + "auxiliary_loss_clip": 0.01258688, + "auxiliary_loss_mlp": 0.00242249, + "balance_loss_clip": 1.03772998, + "balance_loss_mlp": 0.21740536, + "epoch": 0.7674131970539606, + "flos": 34933576988160.0, + "grad_norm": 43.36362877368881, + "language_loss": 0.77780318, + "learning_rate": 5.410180789470067e-07, + "loss": 0.79281253, + "num_input_tokens_seen": 275211365, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.24841309, + "step": 12764, + "time_per_iteration": 2.769517183303833 + }, + { + "auxiliary_loss_clip": 0.01248571, + "auxiliary_loss_mlp": 0.00252455, + "balance_loss_clip": 1.02458501, + "balance_loss_mlp": 0.22455966, + "epoch": 0.7674733203066286, + "flos": 28328850766080.0, + "grad_norm": 7.68558741050245, + "language_loss": 0.7582649, + "learning_rate": 5.40751718539491e-07, + "loss": 0.77327514, + "num_input_tokens_seen": 275231670, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.2791748, + "step": 12765, + "time_per_iteration": 4.150561571121216 + }, + { + "auxiliary_loss_clip": 0.01240176, + "auxiliary_loss_mlp": 0.00241069, + "balance_loss_clip": 1.02504456, + "balance_loss_mlp": 0.21616621, + "epoch": 0.7675334435592965, + "flos": 16289727252480.0, + "grad_norm": 76.95806779318754, + "language_loss": 0.6739161, + "learning_rate": 5.404854134668162e-07, + "loss": 0.68872857, + "num_input_tokens_seen": 275249425, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.24902344, + "step": 12766, + "time_per_iteration": 2.6815874576568604 + }, + { + "auxiliary_loss_clip": 0.01115284, + "auxiliary_loss_mlp": 0.00079075, + "balance_loss_clip": 0.96415013, + "balance_loss_mlp": 0.07101665, + "epoch": 0.7675935668119646, + "flos": 64826232220800.0, + "grad_norm": 0.7146374040159169, + "language_loss": 0.5997687, + "learning_rate": 5.402191637390803e-07, + "loss": 0.61171234, + "num_input_tokens_seen": 275312485, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.08056641, + "step": 12767, + "time_per_iteration": 3.322126626968384 + }, + { + "auxiliary_loss_clip": 0.01250942, + "auxiliary_loss_mlp": 0.00234575, + "balance_loss_clip": 1.03345978, + "balance_loss_mlp": 0.20862299, + "epoch": 0.7676536900646325, + "flos": 22675398382080.0, + "grad_norm": 29.32571923397089, + "language_loss": 0.75578046, + "learning_rate": 5.399529693663801e-07, + "loss": 0.7706356, + "num_input_tokens_seen": 275331680, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.25964355, + "step": 12768, + "time_per_iteration": 2.678720712661743 + }, + { + "auxiliary_loss_clip": 0.01284966, + "auxiliary_loss_mlp": 0.00263423, + "balance_loss_clip": 1.0596174, + "balance_loss_mlp": 0.23773351, + "epoch": 0.7677138133173005, + "flos": 26939682224640.0, + "grad_norm": 161.8080709229524, + "language_loss": 0.77301073, + "learning_rate": 5.3968683035881e-07, + "loss": 0.78849465, + "num_input_tokens_seen": 275351615, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.25720215, + "step": 12769, + "time_per_iteration": 4.329387187957764 + }, + { + "auxiliary_loss_clip": 0.0127335, + "auxiliary_loss_mlp": 0.00265324, + "balance_loss_clip": 1.04595065, + "balance_loss_mlp": 0.23808505, + "epoch": 0.7677739365699684, + "flos": 23799545400960.0, + "grad_norm": 9.157587945078788, + "language_loss": 0.87461472, + "learning_rate": 5.394207467264611e-07, + "loss": 0.89000142, + "num_input_tokens_seen": 275368815, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.27233887, + "step": 12770, + "time_per_iteration": 2.6807053089141846 + }, + { + "auxiliary_loss_clip": 0.01245268, + "auxiliary_loss_mlp": 0.00239003, + "balance_loss_clip": 1.03429174, + "balance_loss_mlp": 0.21471974, + "epoch": 0.7678340598226364, + "flos": 34455497944320.0, + "grad_norm": 5.451477807429747, + "language_loss": 0.83993512, + "learning_rate": 5.391547184794245e-07, + "loss": 0.85477781, + "num_input_tokens_seen": 275389345, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.24316406, + "step": 12771, + "time_per_iteration": 2.8015010356903076 + }, + { + "auxiliary_loss_clip": 0.01243942, + "auxiliary_loss_mlp": 0.00251948, + "balance_loss_clip": 1.02079487, + "balance_loss_mlp": 0.22501831, + "epoch": 0.7678941830753043, + "flos": 23841740903040.0, + "grad_norm": 16.04302037539354, + "language_loss": 0.75150484, + "learning_rate": 5.388887456277876e-07, + "loss": 0.7664637, + "num_input_tokens_seen": 275411240, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.26916504, + "step": 12772, + "time_per_iteration": 2.7077622413635254 + }, + { + "auxiliary_loss_clip": 0.01242806, + "auxiliary_loss_mlp": 0.00247381, + "balance_loss_clip": 1.03008413, + "balance_loss_mlp": 0.22460055, + "epoch": 0.7679543063279723, + "flos": 25410929431680.0, + "grad_norm": 4.447219982833494, + "language_loss": 0.81149346, + "learning_rate": 5.386228281816349e-07, + "loss": 0.82639533, + "num_input_tokens_seen": 275432010, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.22790527, + "step": 12773, + "time_per_iteration": 2.742201089859009 + }, + { + "auxiliary_loss_clip": 0.01231313, + "auxiliary_loss_mlp": 0.00247216, + "balance_loss_clip": 1.02138758, + "balance_loss_mlp": 0.22243203, + "epoch": 0.7680144295806404, + "flos": 27962382257280.0, + "grad_norm": 3.731964194899917, + "language_loss": 0.8717888, + "learning_rate": 5.383569661510512e-07, + "loss": 0.88657403, + "num_input_tokens_seen": 275453710, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.24780273, + "step": 12774, + "time_per_iteration": 4.227788686752319 + }, + { + "auxiliary_loss_clip": 0.01239843, + "auxiliary_loss_mlp": 0.00234787, + "balance_loss_clip": 1.02231514, + "balance_loss_mlp": 0.20974085, + "epoch": 0.7680745528333083, + "flos": 20412810731520.0, + "grad_norm": 13.011509617105395, + "language_loss": 0.7883333, + "learning_rate": 5.380911595461177e-07, + "loss": 0.80307955, + "num_input_tokens_seen": 275472915, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.25036621, + "step": 12775, + "time_per_iteration": 2.6849753856658936 + }, + { + "auxiliary_loss_clip": 0.01109838, + "auxiliary_loss_mlp": 0.00154538, + "balance_loss_clip": 0.96294641, + "balance_loss_mlp": 0.14624126, + "epoch": 0.7681346760859763, + "flos": 68401103351040.0, + "grad_norm": 1.6920924159462867, + "language_loss": 0.56240648, + "learning_rate": 5.378254083769147e-07, + "loss": 0.57505029, + "num_input_tokens_seen": 275534785, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.08300781, + "step": 12776, + "time_per_iteration": 3.2437031269073486 + }, + { + "auxiliary_loss_clip": 0.01242781, + "auxiliary_loss_mlp": 0.00242188, + "balance_loss_clip": 1.02857709, + "balance_loss_mlp": 0.21715453, + "epoch": 0.7681947993386442, + "flos": 21251468453760.0, + "grad_norm": 10.363392722661375, + "language_loss": 0.81097567, + "learning_rate": 5.375597126535188e-07, + "loss": 0.82582545, + "num_input_tokens_seen": 275553205, + "router_z_loss_clip": 2.14160156, + "router_z_loss_mlp": 0.25012207, + "step": 12777, + "time_per_iteration": 2.7425320148468018 + }, + { + "auxiliary_loss_clip": 0.01245754, + "auxiliary_loss_mlp": 0.00266371, + "balance_loss_clip": 1.02885389, + "balance_loss_mlp": 0.24153966, + "epoch": 0.7682549225913122, + "flos": 21397696721280.0, + "grad_norm": 1502.4512965038634, + "language_loss": 0.78464556, + "learning_rate": 5.372940723860043e-07, + "loss": 0.79976678, + "num_input_tokens_seen": 275571490, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.24841309, + "step": 12778, + "time_per_iteration": 2.751849412918091 + }, + { + "auxiliary_loss_clip": 0.0124717, + "auxiliary_loss_mlp": 0.00254668, + "balance_loss_clip": 1.02384222, + "balance_loss_mlp": 0.2271069, + "epoch": 0.7683150458439801, + "flos": 23038921975680.0, + "grad_norm": 9.371413915076598, + "language_loss": 0.7808255, + "learning_rate": 5.37028487584446e-07, + "loss": 0.79584384, + "num_input_tokens_seen": 275589665, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.27575684, + "step": 12779, + "time_per_iteration": 2.7256972789764404 + }, + { + "auxiliary_loss_clip": 0.01249397, + "auxiliary_loss_mlp": 0.00235687, + "balance_loss_clip": 1.02964473, + "balance_loss_mlp": 0.21163017, + "epoch": 0.7683751690966482, + "flos": 67332397996800.0, + "grad_norm": 34.47332643898498, + "language_loss": 0.6667282, + "learning_rate": 5.367629582589133e-07, + "loss": 0.68157899, + "num_input_tokens_seen": 275615605, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.24060059, + "step": 12780, + "time_per_iteration": 3.1422266960144043 + }, + { + "auxiliary_loss_clip": 0.01260624, + "auxiliary_loss_mlp": 0.00242225, + "balance_loss_clip": 1.03852618, + "balance_loss_mlp": 0.21692899, + "epoch": 0.7684352923493161, + "flos": 21798890703360.0, + "grad_norm": 8.054529717520078, + "language_loss": 0.76280546, + "learning_rate": 5.364974844194759e-07, + "loss": 0.77783394, + "num_input_tokens_seen": 275634965, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.25305176, + "step": 12781, + "time_per_iteration": 2.682741641998291 + }, + { + "auxiliary_loss_clip": 0.01256131, + "auxiliary_loss_mlp": 0.00250875, + "balance_loss_clip": 1.03852296, + "balance_loss_mlp": 0.22574615, + "epoch": 0.7684954156019841, + "flos": 25847603072640.0, + "grad_norm": 56.455211922491735, + "language_loss": 0.84407294, + "learning_rate": 5.362320660762016e-07, + "loss": 0.85914296, + "num_input_tokens_seen": 275655785, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.2512207, + "step": 12782, + "time_per_iteration": 2.668347120285034 + }, + { + "auxiliary_loss_clip": 0.01247444, + "auxiliary_loss_mlp": 0.00242906, + "balance_loss_clip": 1.02475286, + "balance_loss_mlp": 0.21678728, + "epoch": 0.768555538854652, + "flos": 25447378757760.0, + "grad_norm": 8.460205142153198, + "language_loss": 0.72366273, + "learning_rate": 5.35966703239153e-07, + "loss": 0.73856628, + "num_input_tokens_seen": 275676160, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.26098633, + "step": 12783, + "time_per_iteration": 2.6936240196228027 + }, + { + "auxiliary_loss_clip": 0.01241992, + "auxiliary_loss_mlp": 0.00234109, + "balance_loss_clip": 1.02128315, + "balance_loss_mlp": 0.20937276, + "epoch": 0.76861566210732, + "flos": 19646369303040.0, + "grad_norm": 56.299664212602245, + "language_loss": 0.78881752, + "learning_rate": 5.357013959183938e-07, + "loss": 0.80357856, + "num_input_tokens_seen": 275695660, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.24743652, + "step": 12784, + "time_per_iteration": 2.6265580654144287 + }, + { + "auxiliary_loss_clip": 0.01226024, + "auxiliary_loss_mlp": 0.00234172, + "balance_loss_clip": 1.01468849, + "balance_loss_mlp": 0.2113438, + "epoch": 0.7686757853599879, + "flos": 22419032037120.0, + "grad_norm": 13.466406444856814, + "language_loss": 0.86019087, + "learning_rate": 5.354361441239843e-07, + "loss": 0.87479281, + "num_input_tokens_seen": 275714025, + "router_z_loss_clip": 2.11230469, + "router_z_loss_mlp": 0.22839355, + "step": 12785, + "time_per_iteration": 2.6630618572235107 + }, + { + "auxiliary_loss_clip": 0.01248811, + "auxiliary_loss_mlp": 0.00241306, + "balance_loss_clip": 1.02921677, + "balance_loss_mlp": 0.21683228, + "epoch": 0.768735908612656, + "flos": 47774262453120.0, + "grad_norm": 848.0115892992427, + "language_loss": 0.83202511, + "learning_rate": 5.351709478659836e-07, + "loss": 0.84692633, + "num_input_tokens_seen": 275737300, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.24475098, + "step": 12786, + "time_per_iteration": 2.867196798324585 + }, + { + "auxiliary_loss_clip": 0.01235084, + "auxiliary_loss_mlp": 0.00240973, + "balance_loss_clip": 1.01598954, + "balance_loss_mlp": 0.21620145, + "epoch": 0.7687960318653239, + "flos": 30263179000320.0, + "grad_norm": 29.137828322949566, + "language_loss": 0.66260588, + "learning_rate": 5.349058071544468e-07, + "loss": 0.6773665, + "num_input_tokens_seen": 275757895, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.24780273, + "step": 12787, + "time_per_iteration": 2.7086031436920166 + }, + { + "auxiliary_loss_clip": 0.01229445, + "auxiliary_loss_mlp": 0.002381, + "balance_loss_clip": 1.01959097, + "balance_loss_mlp": 0.21431811, + "epoch": 0.7688561551179919, + "flos": 19573434737280.0, + "grad_norm": 32.584040721325294, + "language_loss": 0.82144153, + "learning_rate": 5.346407219994292e-07, + "loss": 0.83611697, + "num_input_tokens_seen": 275776745, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.23779297, + "step": 12788, + "time_per_iteration": 2.6373941898345947 + }, + { + "auxiliary_loss_clip": 0.01256185, + "auxiliary_loss_mlp": 0.00230168, + "balance_loss_clip": 1.0329566, + "balance_loss_mlp": 0.20466903, + "epoch": 0.7689162783706599, + "flos": 22783776693120.0, + "grad_norm": 4.254322809034814, + "language_loss": 0.7646172, + "learning_rate": 5.343756924109821e-07, + "loss": 0.77948076, + "num_input_tokens_seen": 275797205, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.25488281, + "step": 12789, + "time_per_iteration": 2.6536967754364014 + }, + { + "auxiliary_loss_clip": 0.01256, + "auxiliary_loss_mlp": 0.00259081, + "balance_loss_clip": 1.03314495, + "balance_loss_mlp": 0.23330814, + "epoch": 0.7689764016233278, + "flos": 34204195416960.0, + "grad_norm": 677.5004762750588, + "language_loss": 0.79197329, + "learning_rate": 5.341107183991553e-07, + "loss": 0.80712408, + "num_input_tokens_seen": 275817935, + "router_z_loss_clip": 2.22558594, + "router_z_loss_mlp": 0.25769043, + "step": 12790, + "time_per_iteration": 2.7505643367767334 + }, + { + "auxiliary_loss_clip": 0.01237223, + "auxiliary_loss_mlp": 0.00225463, + "balance_loss_clip": 1.0203464, + "balance_loss_mlp": 0.2021699, + "epoch": 0.7690365248759958, + "flos": 17274469587840.0, + "grad_norm": 103.32874232176789, + "language_loss": 0.76718795, + "learning_rate": 5.338457999739969e-07, + "loss": 0.78181481, + "num_input_tokens_seen": 275837145, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.2331543, + "step": 12791, + "time_per_iteration": 2.6691551208496094 + }, + { + "auxiliary_loss_clip": 0.01224948, + "auxiliary_loss_mlp": 0.00227389, + "balance_loss_clip": 1.01738763, + "balance_loss_mlp": 0.20388064, + "epoch": 0.7690966481286637, + "flos": 18223157646720.0, + "grad_norm": 22.345017699084416, + "language_loss": 0.86684692, + "learning_rate": 5.335809371455526e-07, + "loss": 0.88137025, + "num_input_tokens_seen": 275855705, + "router_z_loss_clip": 2.07617188, + "router_z_loss_mlp": 0.23510742, + "step": 12792, + "time_per_iteration": 2.683351993560791 + }, + { + "auxiliary_loss_clip": 0.01267852, + "auxiliary_loss_mlp": 0.00258303, + "balance_loss_clip": 1.03658795, + "balance_loss_mlp": 0.2321005, + "epoch": 0.7691567713813318, + "flos": 21537568281600.0, + "grad_norm": 20.245631456603423, + "language_loss": 0.80971551, + "learning_rate": 5.333161299238673e-07, + "loss": 0.82497704, + "num_input_tokens_seen": 275873930, + "router_z_loss_clip": 2.31054688, + "router_z_loss_mlp": 0.26220703, + "step": 12793, + "time_per_iteration": 2.698831796646118 + }, + { + "auxiliary_loss_clip": 0.01251237, + "auxiliary_loss_mlp": 0.00254092, + "balance_loss_clip": 1.03178692, + "balance_loss_mlp": 0.22760388, + "epoch": 0.7692168946339997, + "flos": 39379999720320.0, + "grad_norm": 20.334903625162124, + "language_loss": 0.70203805, + "learning_rate": 5.330513783189803e-07, + "loss": 0.71709132, + "num_input_tokens_seen": 275895895, + "router_z_loss_clip": 2.19433594, + "router_z_loss_mlp": 0.26489258, + "step": 12794, + "time_per_iteration": 2.8655974864959717 + }, + { + "auxiliary_loss_clip": 0.01246201, + "auxiliary_loss_mlp": 0.00250928, + "balance_loss_clip": 1.02845645, + "balance_loss_mlp": 0.22625154, + "epoch": 0.7692770178866677, + "flos": 25009950931200.0, + "grad_norm": 2.5167676398897307, + "language_loss": 0.82426447, + "learning_rate": 5.327866823409319e-07, + "loss": 0.83923578, + "num_input_tokens_seen": 275917825, + "router_z_loss_clip": 2.17480469, + "router_z_loss_mlp": 0.2467041, + "step": 12795, + "time_per_iteration": 2.7106339931488037 + }, + { + "auxiliary_loss_clip": 0.01260355, + "auxiliary_loss_mlp": 0.00238246, + "balance_loss_clip": 1.03918695, + "balance_loss_mlp": 0.21334332, + "epoch": 0.7693371411393356, + "flos": 24716273333760.0, + "grad_norm": 7.1433705368088045, + "language_loss": 0.78210145, + "learning_rate": 5.325220419997601e-07, + "loss": 0.79708743, + "num_input_tokens_seen": 275937890, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.24890137, + "step": 12796, + "time_per_iteration": 2.7988545894622803 + }, + { + "auxiliary_loss_clip": 0.0125417, + "auxiliary_loss_mlp": 0.00225719, + "balance_loss_clip": 1.03386736, + "balance_loss_mlp": 0.20094767, + "epoch": 0.7693972643920036, + "flos": 15924803028480.0, + "grad_norm": 25.771394593026752, + "language_loss": 0.7316137, + "learning_rate": 5.32257457305499e-07, + "loss": 0.74641258, + "num_input_tokens_seen": 275954495, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.24755859, + "step": 12797, + "time_per_iteration": 2.648627281188965 + }, + { + "auxiliary_loss_clip": 0.01265608, + "auxiliary_loss_mlp": 0.00273659, + "balance_loss_clip": 1.03974771, + "balance_loss_mlp": 0.24711077, + "epoch": 0.7694573876446715, + "flos": 25405901527680.0, + "grad_norm": 17.25864488602954, + "language_loss": 0.99742502, + "learning_rate": 5.319929282681823e-07, + "loss": 1.01281762, + "num_input_tokens_seen": 275972395, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.26550293, + "step": 12798, + "time_per_iteration": 2.73410964012146 + }, + { + "auxiliary_loss_clip": 0.0124246, + "auxiliary_loss_mlp": 0.0025183, + "balance_loss_clip": 1.02800894, + "balance_loss_mlp": 0.22692743, + "epoch": 0.7695175108973396, + "flos": 16654220513280.0, + "grad_norm": 16.722873040305767, + "language_loss": 0.90120739, + "learning_rate": 5.317284548978418e-07, + "loss": 0.91615033, + "num_input_tokens_seen": 275989020, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.24914551, + "step": 12799, + "time_per_iteration": 2.652498960494995 + }, + { + "auxiliary_loss_clip": 0.01274251, + "auxiliary_loss_mlp": 0.00242206, + "balance_loss_clip": 1.0536294, + "balance_loss_mlp": 0.21673059, + "epoch": 0.7695776341500075, + "flos": 13626520237440.0, + "grad_norm": 37.93211436444691, + "language_loss": 0.89368302, + "learning_rate": 5.314640372045045e-07, + "loss": 0.90884757, + "num_input_tokens_seen": 276006525, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.25488281, + "step": 12800, + "time_per_iteration": 2.687779426574707 + }, + { + "auxiliary_loss_clip": 0.01272011, + "auxiliary_loss_mlp": 0.00244378, + "balance_loss_clip": 1.03965878, + "balance_loss_mlp": 0.21771087, + "epoch": 0.7696377574026755, + "flos": 24276690691200.0, + "grad_norm": 38.423968930710124, + "language_loss": 0.91974056, + "learning_rate": 5.31199675198198e-07, + "loss": 0.93490446, + "num_input_tokens_seen": 276027130, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.26647949, + "step": 12801, + "time_per_iteration": 2.6648640632629395 + }, + { + "auxiliary_loss_clip": 0.01239167, + "auxiliary_loss_mlp": 0.00219941, + "balance_loss_clip": 1.02163815, + "balance_loss_mlp": 0.19543186, + "epoch": 0.7696978806553435, + "flos": 20923137210240.0, + "grad_norm": 92.60099329687579, + "language_loss": 0.79604137, + "learning_rate": 5.30935368888947e-07, + "loss": 0.81063247, + "num_input_tokens_seen": 276045715, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.24499512, + "step": 12802, + "time_per_iteration": 2.696763753890991 + }, + { + "auxiliary_loss_clip": 0.01223515, + "auxiliary_loss_mlp": 0.002538, + "balance_loss_clip": 1.01327872, + "balance_loss_mlp": 0.22976753, + "epoch": 0.7697580039080114, + "flos": 22929609911040.0, + "grad_norm": 5.101028591003924, + "language_loss": 0.83631873, + "learning_rate": 5.306711182867747e-07, + "loss": 0.85109186, + "num_input_tokens_seen": 276065375, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.24023438, + "step": 12803, + "time_per_iteration": 2.7735984325408936 + }, + { + "auxiliary_loss_clip": 0.01090598, + "auxiliary_loss_mlp": 0.0008457, + "balance_loss_clip": 0.94428754, + "balance_loss_mlp": 0.07751252, + "epoch": 0.7698181271606794, + "flos": 68717654933760.0, + "grad_norm": 14.12486255727315, + "language_loss": 0.5484736, + "learning_rate": 5.304069234017001e-07, + "loss": 0.56022525, + "num_input_tokens_seen": 276131405, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.07080078, + "step": 12804, + "time_per_iteration": 3.169898271560669 + }, + { + "auxiliary_loss_clip": 0.01089941, + "auxiliary_loss_mlp": 0.00122016, + "balance_loss_clip": 0.94300497, + "balance_loss_mlp": 0.11452992, + "epoch": 0.7698782504133473, + "flos": 67409716999680.0, + "grad_norm": 0.7186985638953662, + "language_loss": 0.53442359, + "learning_rate": 5.301427842437429e-07, + "loss": 0.54654312, + "num_input_tokens_seen": 276200755, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.07470703, + "step": 12805, + "time_per_iteration": 4.705320835113525 + }, + { + "auxiliary_loss_clip": 0.01245336, + "auxiliary_loss_mlp": 0.00244739, + "balance_loss_clip": 1.02985287, + "balance_loss_mlp": 0.22030109, + "epoch": 0.7699383736660154, + "flos": 22488842119680.0, + "grad_norm": 95.88905504619305, + "language_loss": 0.80587304, + "learning_rate": 5.298787008229187e-07, + "loss": 0.82077372, + "num_input_tokens_seen": 276217880, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.24438477, + "step": 12806, + "time_per_iteration": 2.6428346633911133 + }, + { + "auxiliary_loss_clip": 0.0124617, + "auxiliary_loss_mlp": 0.00254248, + "balance_loss_clip": 1.02663159, + "balance_loss_mlp": 0.22841486, + "epoch": 0.7699984969186833, + "flos": 21539723097600.0, + "grad_norm": 12.666929612873329, + "language_loss": 0.79728353, + "learning_rate": 5.296146731492408e-07, + "loss": 0.81228769, + "num_input_tokens_seen": 276234810, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.25842285, + "step": 12807, + "time_per_iteration": 4.037569046020508 + }, + { + "auxiliary_loss_clip": 0.01257979, + "auxiliary_loss_mlp": 0.00236382, + "balance_loss_clip": 1.03667402, + "balance_loss_mlp": 0.21141946, + "epoch": 0.7700586201713513, + "flos": 21719096640000.0, + "grad_norm": 3.377475029599023, + "language_loss": 0.87583435, + "learning_rate": 5.293507012327218e-07, + "loss": 0.89077795, + "num_input_tokens_seen": 276252850, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.24951172, + "step": 12808, + "time_per_iteration": 2.652200937271118 + }, + { + "auxiliary_loss_clip": 0.01279304, + "auxiliary_loss_mlp": 0.0024346, + "balance_loss_clip": 1.05462325, + "balance_loss_mlp": 0.21811619, + "epoch": 0.7701187434240192, + "flos": 27856015107840.0, + "grad_norm": 7.907202183619345, + "language_loss": 0.8897723, + "learning_rate": 5.290867850833718e-07, + "loss": 0.90499997, + "num_input_tokens_seen": 276272525, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.25341797, + "step": 12809, + "time_per_iteration": 2.756920337677002 + }, + { + "auxiliary_loss_clip": 0.01235024, + "auxiliary_loss_mlp": 0.00240931, + "balance_loss_clip": 1.02069736, + "balance_loss_mlp": 0.21503851, + "epoch": 0.7701788666766872, + "flos": 28621307301120.0, + "grad_norm": 60.22562829298659, + "language_loss": 0.75515711, + "learning_rate": 5.288229247111993e-07, + "loss": 0.76991671, + "num_input_tokens_seen": 276294210, + "router_z_loss_clip": 2.14160156, + "router_z_loss_mlp": 0.25891113, + "step": 12810, + "time_per_iteration": 2.755636692047119 + }, + { + "auxiliary_loss_clip": 0.01263043, + "auxiliary_loss_mlp": 0.00259015, + "balance_loss_clip": 1.0359472, + "balance_loss_mlp": 0.23336154, + "epoch": 0.7702389899293551, + "flos": 14246446089600.0, + "grad_norm": 95.36877056906921, + "language_loss": 0.8636421, + "learning_rate": 5.285591201262079e-07, + "loss": 0.87886262, + "num_input_tokens_seen": 276310290, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.25646973, + "step": 12811, + "time_per_iteration": 4.175467491149902 + }, + { + "auxiliary_loss_clip": 0.01106857, + "auxiliary_loss_mlp": 0.00065002, + "balance_loss_clip": 0.96334791, + "balance_loss_mlp": 0.05851712, + "epoch": 0.7702991131820232, + "flos": 70574128439040.0, + "grad_norm": 0.7824459749948809, + "language_loss": 0.56284046, + "learning_rate": 5.28295371338402e-07, + "loss": 0.57455909, + "num_input_tokens_seen": 276371715, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.06494141, + "step": 12812, + "time_per_iteration": 3.224522352218628 + }, + { + "auxiliary_loss_clip": 0.0125949, + "auxiliary_loss_mlp": 0.002223, + "balance_loss_clip": 1.03889394, + "balance_loss_mlp": 0.19702789, + "epoch": 0.7703592364346911, + "flos": 25480021242240.0, + "grad_norm": 5.191329591667375, + "language_loss": 0.78299904, + "learning_rate": 5.280316783577836e-07, + "loss": 0.79781699, + "num_input_tokens_seen": 276389895, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.25268555, + "step": 12813, + "time_per_iteration": 2.684845447540283 + }, + { + "auxiliary_loss_clip": 0.01255821, + "auxiliary_loss_mlp": 0.002301, + "balance_loss_clip": 1.03662276, + "balance_loss_mlp": 0.20401676, + "epoch": 0.7704193596873591, + "flos": 19280906375040.0, + "grad_norm": 9.671210290650016, + "language_loss": 0.74771637, + "learning_rate": 5.27768041194351e-07, + "loss": 0.76257557, + "num_input_tokens_seen": 276408990, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.2611084, + "step": 12814, + "time_per_iteration": 2.6678366661071777 + }, + { + "auxiliary_loss_clip": 0.01240973, + "auxiliary_loss_mlp": 0.00250283, + "balance_loss_clip": 1.0235796, + "balance_loss_mlp": 0.2253084, + "epoch": 0.7704794829400271, + "flos": 23658452778240.0, + "grad_norm": 231.99494554801817, + "language_loss": 0.73251247, + "learning_rate": 5.275044598581018e-07, + "loss": 0.74742496, + "num_input_tokens_seen": 276428190, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.24975586, + "step": 12815, + "time_per_iteration": 2.6907424926757812 + }, + { + "auxiliary_loss_clip": 0.01262281, + "auxiliary_loss_mlp": 0.00227385, + "balance_loss_clip": 1.03436816, + "balance_loss_mlp": 0.20092073, + "epoch": 0.770539606192695, + "flos": 18989311766400.0, + "grad_norm": 171.41782999605877, + "language_loss": 0.7744323, + "learning_rate": 5.272409343590322e-07, + "loss": 0.78932899, + "num_input_tokens_seen": 276446855, + "router_z_loss_clip": 2.28320312, + "router_z_loss_mlp": 0.26428223, + "step": 12816, + "time_per_iteration": 4.143450736999512 + }, + { + "auxiliary_loss_clip": 0.01268433, + "auxiliary_loss_mlp": 0.00239017, + "balance_loss_clip": 1.04677951, + "balance_loss_mlp": 0.21038258, + "epoch": 0.770599729445363, + "flos": 11830160142720.0, + "grad_norm": 4.912886046158793, + "language_loss": 0.82197392, + "learning_rate": 5.26977464707133e-07, + "loss": 0.83704841, + "num_input_tokens_seen": 276462000, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.28649902, + "step": 12817, + "time_per_iteration": 2.679966688156128 + }, + { + "auxiliary_loss_clip": 0.01265389, + "auxiliary_loss_mlp": 0.00252555, + "balance_loss_clip": 1.04155219, + "balance_loss_mlp": 0.22641255, + "epoch": 0.770659852698031, + "flos": 17822610109440.0, + "grad_norm": 12.765487179601418, + "language_loss": 0.72021091, + "learning_rate": 5.267140509123957e-07, + "loss": 0.73539031, + "num_input_tokens_seen": 276481190, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.26123047, + "step": 12818, + "time_per_iteration": 2.677358627319336 + }, + { + "auxiliary_loss_clip": 0.01260737, + "auxiliary_loss_mlp": 0.0024163, + "balance_loss_clip": 1.04541934, + "balance_loss_mlp": 0.2186821, + "epoch": 0.770719975950699, + "flos": 21871968923520.0, + "grad_norm": 56.19294915430246, + "language_loss": 0.74881297, + "learning_rate": 5.264506929848093e-07, + "loss": 0.76383662, + "num_input_tokens_seen": 276499520, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.22924805, + "step": 12819, + "time_per_iteration": 2.736767053604126 + }, + { + "auxiliary_loss_clip": 0.01259278, + "auxiliary_loss_mlp": 0.00242181, + "balance_loss_clip": 1.03816032, + "balance_loss_mlp": 0.21454839, + "epoch": 0.7707800992033669, + "flos": 21325049464320.0, + "grad_norm": 8.875041999697002, + "language_loss": 0.64651591, + "learning_rate": 5.261873909343608e-07, + "loss": 0.66153049, + "num_input_tokens_seen": 276519110, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.27636719, + "step": 12820, + "time_per_iteration": 2.715465784072876 + }, + { + "auxiliary_loss_clip": 0.01249562, + "auxiliary_loss_mlp": 0.00243273, + "balance_loss_clip": 1.03094065, + "balance_loss_mlp": 0.21772684, + "epoch": 0.7708402224560349, + "flos": 28179426188160.0, + "grad_norm": 5.8549475423893265, + "language_loss": 0.87290388, + "learning_rate": 5.259241447710343e-07, + "loss": 0.88783216, + "num_input_tokens_seen": 276538805, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.2557373, + "step": 12821, + "time_per_iteration": 2.7437856197357178 + }, + { + "auxiliary_loss_clip": 0.01263862, + "auxiliary_loss_mlp": 0.0024577, + "balance_loss_clip": 1.04219222, + "balance_loss_mlp": 0.22215472, + "epoch": 0.7709003457087028, + "flos": 15377057556480.0, + "grad_norm": 596.48649033779, + "language_loss": 0.75355709, + "learning_rate": 5.256609545048114e-07, + "loss": 0.76865339, + "num_input_tokens_seen": 276554770, + "router_z_loss_clip": 2.21777344, + "router_z_loss_mlp": 0.23620605, + "step": 12822, + "time_per_iteration": 2.626981496810913 + }, + { + "auxiliary_loss_clip": 0.01248649, + "auxiliary_loss_mlp": 0.00242897, + "balance_loss_clip": 1.03384018, + "balance_loss_mlp": 0.21755311, + "epoch": 0.7709604689613708, + "flos": 30621854257920.0, + "grad_norm": 43.22973834210171, + "language_loss": 0.78331959, + "learning_rate": 5.253978201456733e-07, + "loss": 0.798235, + "num_input_tokens_seen": 276574535, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.25366211, + "step": 12823, + "time_per_iteration": 2.799835443496704 + }, + { + "auxiliary_loss_clip": 0.01268138, + "auxiliary_loss_mlp": 0.00232351, + "balance_loss_clip": 1.03773236, + "balance_loss_mlp": 0.20320465, + "epoch": 0.7710205922140387, + "flos": 20301272023680.0, + "grad_norm": 6.994742579040156, + "language_loss": 0.84538782, + "learning_rate": 5.251347417035969e-07, + "loss": 0.86039275, + "num_input_tokens_seen": 276592925, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.29138184, + "step": 12824, + "time_per_iteration": 2.638211250305176 + }, + { + "auxiliary_loss_clip": 0.01264593, + "auxiliary_loss_mlp": 0.00244935, + "balance_loss_clip": 1.04784942, + "balance_loss_mlp": 0.21918556, + "epoch": 0.7710807154667068, + "flos": 19644214487040.0, + "grad_norm": 7727.985272493031, + "language_loss": 0.80743349, + "learning_rate": 5.248717191885592e-07, + "loss": 0.82252884, + "num_input_tokens_seen": 276610540, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.25756836, + "step": 12825, + "time_per_iteration": 2.648982524871826 + }, + { + "auxiliary_loss_clip": 0.01224659, + "auxiliary_loss_mlp": 0.00251678, + "balance_loss_clip": 1.0155226, + "balance_loss_mlp": 0.22741912, + "epoch": 0.7711408387193747, + "flos": 20006337450240.0, + "grad_norm": 10.63089926117252, + "language_loss": 0.79380828, + "learning_rate": 5.246087526105343e-07, + "loss": 0.8085717, + "num_input_tokens_seen": 276629200, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.24243164, + "step": 12826, + "time_per_iteration": 2.638233184814453 + }, + { + "auxiliary_loss_clip": 0.01242415, + "auxiliary_loss_mlp": 0.00229366, + "balance_loss_clip": 1.02582037, + "balance_loss_mlp": 0.20485696, + "epoch": 0.7712009619720427, + "flos": 24971131307520.0, + "grad_norm": 6.5452617396536725, + "language_loss": 0.87831104, + "learning_rate": 5.243458419794933e-07, + "loss": 0.89302886, + "num_input_tokens_seen": 276648655, + "router_z_loss_clip": 2.16894531, + "router_z_loss_mlp": 0.24511719, + "step": 12827, + "time_per_iteration": 2.6716842651367188 + }, + { + "auxiliary_loss_clip": 0.01103828, + "auxiliary_loss_mlp": 0.00074637, + "balance_loss_clip": 0.96020627, + "balance_loss_mlp": 0.06819975, + "epoch": 0.7712610852247107, + "flos": 63249681404160.0, + "grad_norm": 0.8652510827315144, + "language_loss": 0.54834723, + "learning_rate": 5.240829873054051e-07, + "loss": 0.56013191, + "num_input_tokens_seen": 276716500, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.06445312, + "step": 12828, + "time_per_iteration": 3.325289487838745 + }, + { + "auxiliary_loss_clip": 0.0124603, + "auxiliary_loss_mlp": 0.0023572, + "balance_loss_clip": 1.03263152, + "balance_loss_mlp": 0.21299908, + "epoch": 0.7713212084773786, + "flos": 18697860812160.0, + "grad_norm": 16.10629913990984, + "language_loss": 0.76313281, + "learning_rate": 5.23820188598238e-07, + "loss": 0.77795035, + "num_input_tokens_seen": 276733535, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.22729492, + "step": 12829, + "time_per_iteration": 2.6299617290496826 + }, + { + "auxiliary_loss_clip": 0.01252897, + "auxiliary_loss_mlp": 0.00233517, + "balance_loss_clip": 1.03210759, + "balance_loss_mlp": 0.20808931, + "epoch": 0.7713813317300466, + "flos": 14173367869440.0, + "grad_norm": 17.65675293336867, + "language_loss": 0.89257574, + "learning_rate": 5.235574458679579e-07, + "loss": 0.90743983, + "num_input_tokens_seen": 276749575, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.25476074, + "step": 12830, + "time_per_iteration": 2.618144989013672 + }, + { + "auxiliary_loss_clip": 0.0126284, + "auxiliary_loss_mlp": 0.00239348, + "balance_loss_clip": 1.03452277, + "balance_loss_mlp": 0.21095249, + "epoch": 0.7714414549827145, + "flos": 25703960584320.0, + "grad_norm": 11.267234802506424, + "language_loss": 0.86726522, + "learning_rate": 5.232947591245269e-07, + "loss": 0.88228714, + "num_input_tokens_seen": 276769460, + "router_z_loss_clip": 2.28320312, + "router_z_loss_mlp": 0.28430176, + "step": 12831, + "time_per_iteration": 2.7140369415283203 + }, + { + "auxiliary_loss_clip": 0.01248467, + "auxiliary_loss_mlp": 0.00264307, + "balance_loss_clip": 1.02920675, + "balance_loss_mlp": 0.23827131, + "epoch": 0.7715015782353826, + "flos": 30555312312960.0, + "grad_norm": 6.224111820153066, + "language_loss": 0.67697883, + "learning_rate": 5.230321283779071e-07, + "loss": 0.6921066, + "num_input_tokens_seen": 276790820, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.26037598, + "step": 12832, + "time_per_iteration": 2.7435131072998047 + }, + { + "auxiliary_loss_clip": 0.01271558, + "auxiliary_loss_mlp": 0.00261734, + "balance_loss_clip": 1.04828501, + "balance_loss_mlp": 0.23537712, + "epoch": 0.7715617014880505, + "flos": 20229343038720.0, + "grad_norm": 20.224523601322257, + "language_loss": 0.86119008, + "learning_rate": 5.227695536380572e-07, + "loss": 0.87652302, + "num_input_tokens_seen": 276811345, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.2635498, + "step": 12833, + "time_per_iteration": 2.639610528945923 + }, + { + "auxiliary_loss_clip": 0.01096235, + "auxiliary_loss_mlp": 0.0007292, + "balance_loss_clip": 0.95523751, + "balance_loss_mlp": 0.06638706, + "epoch": 0.7716218247407185, + "flos": 63664770971520.0, + "grad_norm": 0.8052558704835378, + "language_loss": 0.54174203, + "learning_rate": 5.22507034914933e-07, + "loss": 0.5534336, + "num_input_tokens_seen": 276870950, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.06542969, + "step": 12834, + "time_per_iteration": 3.150012731552124 + }, + { + "auxiliary_loss_clip": 0.01250611, + "auxiliary_loss_mlp": 0.0028307, + "balance_loss_clip": 1.03174353, + "balance_loss_mlp": 0.25645041, + "epoch": 0.7716819479933864, + "flos": 19791807471360.0, + "grad_norm": 30.25373318608111, + "language_loss": 0.82050693, + "learning_rate": 5.222445722184903e-07, + "loss": 0.8358438, + "num_input_tokens_seen": 276890760, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.26611328, + "step": 12835, + "time_per_iteration": 2.6310555934906006 + }, + { + "auxiliary_loss_clip": 0.01261926, + "auxiliary_loss_mlp": 0.00246856, + "balance_loss_clip": 1.04230297, + "balance_loss_mlp": 0.22153637, + "epoch": 0.7717420712460544, + "flos": 18442176825600.0, + "grad_norm": 562.5870816413186, + "language_loss": 0.81561518, + "learning_rate": 5.219821655586814e-07, + "loss": 0.83070296, + "num_input_tokens_seen": 276909625, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.25341797, + "step": 12836, + "time_per_iteration": 2.66184139251709 + }, + { + "auxiliary_loss_clip": 0.01257812, + "auxiliary_loss_mlp": 0.00238728, + "balance_loss_clip": 1.04291272, + "balance_loss_mlp": 0.21367061, + "epoch": 0.7718021944987223, + "flos": 35189476456320.0, + "grad_norm": 14.38765697721291, + "language_loss": 0.68767595, + "learning_rate": 5.217198149454575e-07, + "loss": 0.70264137, + "num_input_tokens_seen": 276930760, + "router_z_loss_clip": 2.14746094, + "router_z_loss_mlp": 0.25073242, + "step": 12837, + "time_per_iteration": 2.8509364128112793 + }, + { + "auxiliary_loss_clip": 0.01092524, + "auxiliary_loss_mlp": 0.00060548, + "balance_loss_clip": 0.94791818, + "balance_loss_mlp": 0.054611, + "epoch": 0.7718623177513904, + "flos": 67923167961600.0, + "grad_norm": 0.8201031157277134, + "language_loss": 0.55044687, + "learning_rate": 5.214575203887666e-07, + "loss": 0.56197762, + "num_input_tokens_seen": 276989580, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.05932617, + "step": 12838, + "time_per_iteration": 3.0957891941070557 + }, + { + "auxiliary_loss_clip": 0.01251353, + "auxiliary_loss_mlp": 0.0022867, + "balance_loss_clip": 1.03492177, + "balance_loss_mlp": 0.20476812, + "epoch": 0.7719224410040583, + "flos": 18581401941120.0, + "grad_norm": 1815.8216406940726, + "language_loss": 0.77343464, + "learning_rate": 5.211952818985538e-07, + "loss": 0.78823495, + "num_input_tokens_seen": 277005450, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.23913574, + "step": 12839, + "time_per_iteration": 2.635469675064087 + }, + { + "auxiliary_loss_clip": 0.01242992, + "auxiliary_loss_mlp": 0.00241635, + "balance_loss_clip": 1.02984405, + "balance_loss_mlp": 0.21638671, + "epoch": 0.7719825642567263, + "flos": 23075802264960.0, + "grad_norm": 5.398951413699886, + "language_loss": 0.87118489, + "learning_rate": 5.209330994847647e-07, + "loss": 0.88603115, + "num_input_tokens_seen": 277023055, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.25256348, + "step": 12840, + "time_per_iteration": 2.655405044555664 + }, + { + "auxiliary_loss_clip": 0.01237591, + "auxiliary_loss_mlp": 0.00239968, + "balance_loss_clip": 1.02275515, + "balance_loss_mlp": 0.21616162, + "epoch": 0.7720426875093943, + "flos": 20339086066560.0, + "grad_norm": 3.40554664932091, + "language_loss": 0.86430502, + "learning_rate": 5.206709731573402e-07, + "loss": 0.87908059, + "num_input_tokens_seen": 277041150, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.23828125, + "step": 12841, + "time_per_iteration": 2.7080938816070557 + }, + { + "auxiliary_loss_clip": 0.01253386, + "auxiliary_loss_mlp": 0.00243955, + "balance_loss_clip": 1.03272057, + "balance_loss_mlp": 0.21875432, + "epoch": 0.7721028107620622, + "flos": 23880704181120.0, + "grad_norm": 7.732467457494695, + "language_loss": 0.82000911, + "learning_rate": 5.204089029262208e-07, + "loss": 0.83498251, + "num_input_tokens_seen": 277063895, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.25219727, + "step": 12842, + "time_per_iteration": 2.835198402404785 + }, + { + "auxiliary_loss_clip": 0.01256554, + "auxiliary_loss_mlp": 0.00251219, + "balance_loss_clip": 1.03572512, + "balance_loss_mlp": 0.22600672, + "epoch": 0.7721629340147302, + "flos": 26651571235200.0, + "grad_norm": 78.87335811360593, + "language_loss": 0.75566822, + "learning_rate": 5.201468888013445e-07, + "loss": 0.77074599, + "num_input_tokens_seen": 277084045, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.25231934, + "step": 12843, + "time_per_iteration": 2.823942184448242 + }, + { + "auxiliary_loss_clip": 0.01256806, + "auxiliary_loss_mlp": 0.00266694, + "balance_loss_clip": 1.03622162, + "balance_loss_mlp": 0.24103998, + "epoch": 0.7722230572673981, + "flos": 21178857110400.0, + "grad_norm": 102.7039147772605, + "language_loss": 0.83170915, + "learning_rate": 5.198849307926465e-07, + "loss": 0.84694409, + "num_input_tokens_seen": 277102625, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.25634766, + "step": 12844, + "time_per_iteration": 2.663763999938965 + }, + { + "auxiliary_loss_clip": 0.01231494, + "auxiliary_loss_mlp": 0.00243468, + "balance_loss_clip": 1.02236283, + "balance_loss_mlp": 0.22012734, + "epoch": 0.7722831805200662, + "flos": 27964644814080.0, + "grad_norm": 45.246408589918914, + "language_loss": 0.78400964, + "learning_rate": 5.196230289100596e-07, + "loss": 0.79875928, + "num_input_tokens_seen": 277123210, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.2331543, + "step": 12845, + "time_per_iteration": 2.716320514678955 + }, + { + "auxiliary_loss_clip": 0.01221911, + "auxiliary_loss_mlp": 0.00227264, + "balance_loss_clip": 1.01300359, + "balance_loss_mlp": 0.20466161, + "epoch": 0.7723433037727341, + "flos": 33875576864640.0, + "grad_norm": 5.611056447270926, + "language_loss": 0.71425295, + "learning_rate": 5.193611831635159e-07, + "loss": 0.72874475, + "num_input_tokens_seen": 277144895, + "router_z_loss_clip": 2.08886719, + "router_z_loss_mlp": 0.22583008, + "step": 12846, + "time_per_iteration": 2.753615617752075 + }, + { + "auxiliary_loss_clip": 0.01100416, + "auxiliary_loss_mlp": 0.00032445, + "balance_loss_clip": 0.95630026, + "balance_loss_mlp": 0.02646067, + "epoch": 0.7724034270254021, + "flos": 62848271940480.0, + "grad_norm": 67.97940199374975, + "language_loss": 0.60842174, + "learning_rate": 5.19099393562945e-07, + "loss": 0.61975032, + "num_input_tokens_seen": 277205160, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.05981445, + "step": 12847, + "time_per_iteration": 4.572547197341919 + }, + { + "auxiliary_loss_clip": 0.01235022, + "auxiliary_loss_mlp": 0.00268119, + "balance_loss_clip": 1.02058935, + "balance_loss_mlp": 0.24357402, + "epoch": 0.77246355027807, + "flos": 23295467888640.0, + "grad_norm": 4.624236479668811, + "language_loss": 0.86120176, + "learning_rate": 5.188376601182732e-07, + "loss": 0.8762331, + "num_input_tokens_seen": 277223005, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.2454834, + "step": 12848, + "time_per_iteration": 2.77409029006958 + }, + { + "auxiliary_loss_clip": 0.0126538, + "auxiliary_loss_mlp": 0.00249479, + "balance_loss_clip": 1.03962719, + "balance_loss_mlp": 0.22325256, + "epoch": 0.772523673530738, + "flos": 20121287950080.0, + "grad_norm": 38.36378532454981, + "language_loss": 0.79855978, + "learning_rate": 5.185759828394261e-07, + "loss": 0.81370831, + "num_input_tokens_seen": 277241785, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.26257324, + "step": 12849, + "time_per_iteration": 2.648344039916992 + }, + { + "auxiliary_loss_clip": 0.01258105, + "auxiliary_loss_mlp": 0.00236208, + "balance_loss_clip": 1.04068947, + "balance_loss_mlp": 0.21156734, + "epoch": 0.7725837967834059, + "flos": 17820096157440.0, + "grad_norm": 13.351689425649063, + "language_loss": 0.8722592, + "learning_rate": 5.183143617363261e-07, + "loss": 0.88720232, + "num_input_tokens_seen": 277259050, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.24658203, + "step": 12850, + "time_per_iteration": 4.033052921295166 + }, + { + "auxiliary_loss_clip": 0.01253329, + "auxiliary_loss_mlp": 0.0023898, + "balance_loss_clip": 1.03426695, + "balance_loss_mlp": 0.21255097, + "epoch": 0.772643920036074, + "flos": 27198921657600.0, + "grad_norm": 12.392744519384172, + "language_loss": 0.87751281, + "learning_rate": 5.180527968188935e-07, + "loss": 0.89243591, + "num_input_tokens_seen": 277278235, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.26452637, + "step": 12851, + "time_per_iteration": 2.7129900455474854 + }, + { + "auxiliary_loss_clip": 0.01240523, + "auxiliary_loss_mlp": 0.00222649, + "balance_loss_clip": 1.02472639, + "balance_loss_mlp": 0.19848561, + "epoch": 0.7727040432887419, + "flos": 21579512388480.0, + "grad_norm": 198.11304910982685, + "language_loss": 0.81199026, + "learning_rate": 5.177912880970474e-07, + "loss": 0.82662201, + "num_input_tokens_seen": 277298355, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.24157715, + "step": 12852, + "time_per_iteration": 2.7089927196502686 + }, + { + "auxiliary_loss_clip": 0.01226376, + "auxiliary_loss_mlp": 0.00238518, + "balance_loss_clip": 1.0154314, + "balance_loss_mlp": 0.21363913, + "epoch": 0.7727641665414099, + "flos": 22236641752320.0, + "grad_norm": 13.575638587301421, + "language_loss": 0.89070535, + "learning_rate": 5.17529835580704e-07, + "loss": 0.90535426, + "num_input_tokens_seen": 277316095, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.24890137, + "step": 12853, + "time_per_iteration": 4.197924852371216 + }, + { + "auxiliary_loss_clip": 0.01100769, + "auxiliary_loss_mlp": 0.00048537, + "balance_loss_clip": 0.95566106, + "balance_loss_mlp": 0.04205186, + "epoch": 0.7728242897940779, + "flos": 54832221463680.0, + "grad_norm": 0.7826021142164474, + "language_loss": 0.5357641, + "learning_rate": 5.172684392797786e-07, + "loss": 0.54725718, + "num_input_tokens_seen": 277380130, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.06494141, + "step": 12854, + "time_per_iteration": 3.2553722858428955 + }, + { + "auxiliary_loss_clip": 0.01274509, + "auxiliary_loss_mlp": 0.0023304, + "balance_loss_clip": 1.05084336, + "balance_loss_mlp": 0.20525248, + "epoch": 0.7728844130467458, + "flos": 34461962392320.0, + "grad_norm": 15.500718585212713, + "language_loss": 0.8212567, + "learning_rate": 5.170070992041826e-07, + "loss": 0.8363322, + "num_input_tokens_seen": 277404015, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.27783203, + "step": 12855, + "time_per_iteration": 2.7610433101654053 + }, + { + "auxiliary_loss_clip": 0.01261009, + "auxiliary_loss_mlp": 0.00216057, + "balance_loss_clip": 1.03454733, + "balance_loss_mlp": 0.19012864, + "epoch": 0.7729445362994138, + "flos": 18916341287040.0, + "grad_norm": 202.5300899485053, + "language_loss": 0.78207308, + "learning_rate": 5.167458153638254e-07, + "loss": 0.79684377, + "num_input_tokens_seen": 277421375, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.25964355, + "step": 12856, + "time_per_iteration": 2.6504650115966797 + }, + { + "auxiliary_loss_clip": 0.01254497, + "auxiliary_loss_mlp": 0.00218478, + "balance_loss_clip": 1.03470612, + "balance_loss_mlp": 0.19314599, + "epoch": 0.7730046595520818, + "flos": 22200048771840.0, + "grad_norm": 5.670016457707896, + "language_loss": 0.86097628, + "learning_rate": 5.164845877686162e-07, + "loss": 0.87570608, + "num_input_tokens_seen": 277440170, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.25317383, + "step": 12857, + "time_per_iteration": 2.6666340827941895 + }, + { + "auxiliary_loss_clip": 0.01228201, + "auxiliary_loss_mlp": 0.00230501, + "balance_loss_clip": 1.01768041, + "balance_loss_mlp": 0.20680171, + "epoch": 0.7730647828047498, + "flos": 13552328695680.0, + "grad_norm": 14.467035711416884, + "language_loss": 0.85646433, + "learning_rate": 5.162234164284591e-07, + "loss": 0.87105131, + "num_input_tokens_seen": 277456880, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.23718262, + "step": 12858, + "time_per_iteration": 4.027954578399658 + }, + { + "auxiliary_loss_clip": 0.01250913, + "auxiliary_loss_mlp": 0.00244081, + "balance_loss_clip": 1.03131199, + "balance_loss_mlp": 0.21746175, + "epoch": 0.7731249060574177, + "flos": 21976037602560.0, + "grad_norm": 12.087293472504545, + "language_loss": 0.85196459, + "learning_rate": 5.159623013532591e-07, + "loss": 0.86691451, + "num_input_tokens_seen": 277475365, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.26586914, + "step": 12859, + "time_per_iteration": 2.7285454273223877 + }, + { + "auxiliary_loss_clip": 0.01243081, + "auxiliary_loss_mlp": 0.00202088, + "balance_loss_clip": 1.03400326, + "balance_loss_mlp": 0.18061796, + "epoch": 0.7731850293100857, + "flos": 22601817371520.0, + "grad_norm": 7.523682611188777, + "language_loss": 0.74703711, + "learning_rate": 5.157012425529186e-07, + "loss": 0.7614888, + "num_input_tokens_seen": 277494975, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.21484375, + "step": 12860, + "time_per_iteration": 2.6868538856506348 + }, + { + "auxiliary_loss_clip": 0.01269117, + "auxiliary_loss_mlp": 0.00221612, + "balance_loss_clip": 1.04273272, + "balance_loss_mlp": 0.19401537, + "epoch": 0.7732451525627536, + "flos": 14098422142080.0, + "grad_norm": 97.88251598367559, + "language_loss": 0.88067472, + "learning_rate": 5.154402400373343e-07, + "loss": 0.89558196, + "num_input_tokens_seen": 277510520, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.27612305, + "step": 12861, + "time_per_iteration": 2.6787497997283936 + }, + { + "auxiliary_loss_clip": 0.01274055, + "auxiliary_loss_mlp": 0.00235257, + "balance_loss_clip": 1.0511055, + "balance_loss_mlp": 0.20856632, + "epoch": 0.7733052758154216, + "flos": 21470020755840.0, + "grad_norm": 2.932567642750793, + "language_loss": 0.83150017, + "learning_rate": 5.15179293816405e-07, + "loss": 0.84659326, + "num_input_tokens_seen": 277530505, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.26647949, + "step": 12862, + "time_per_iteration": 2.7404251098632812 + }, + { + "auxiliary_loss_clip": 0.01238265, + "auxiliary_loss_mlp": 0.00235647, + "balance_loss_clip": 1.02582455, + "balance_loss_mlp": 0.21076852, + "epoch": 0.7733653990680895, + "flos": 21394284929280.0, + "grad_norm": 9.76090935719173, + "language_loss": 0.8750608, + "learning_rate": 5.149184039000256e-07, + "loss": 0.88979995, + "num_input_tokens_seen": 277550810, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.24902344, + "step": 12863, + "time_per_iteration": 2.7067933082580566 + }, + { + "auxiliary_loss_clip": 0.01252533, + "auxiliary_loss_mlp": 0.00222971, + "balance_loss_clip": 1.03417289, + "balance_loss_mlp": 0.1982944, + "epoch": 0.7734255223207576, + "flos": 17676058619520.0, + "grad_norm": 9.485917790931303, + "language_loss": 0.79530221, + "learning_rate": 5.146575702980898e-07, + "loss": 0.81005728, + "num_input_tokens_seen": 277567680, + "router_z_loss_clip": 2.18457031, + "router_z_loss_mlp": 0.24694824, + "step": 12864, + "time_per_iteration": 2.7051379680633545 + }, + { + "auxiliary_loss_clip": 0.01246079, + "auxiliary_loss_mlp": 0.00218817, + "balance_loss_clip": 1.03176343, + "balance_loss_mlp": 0.19483194, + "epoch": 0.7734856455734255, + "flos": 25230837617280.0, + "grad_norm": 4.729697439862633, + "language_loss": 0.86672622, + "learning_rate": 5.143967930204871e-07, + "loss": 0.88137519, + "num_input_tokens_seen": 277588970, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.23986816, + "step": 12865, + "time_per_iteration": 2.742351770401001 + }, + { + "auxiliary_loss_clip": 0.01273122, + "auxiliary_loss_mlp": 0.00260245, + "balance_loss_clip": 1.04525065, + "balance_loss_mlp": 0.23208769, + "epoch": 0.7735457688260935, + "flos": 23433112805760.0, + "grad_norm": 32.3943662400385, + "language_loss": 0.80434179, + "learning_rate": 5.141360720771077e-07, + "loss": 0.81967545, + "num_input_tokens_seen": 277605450, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.28173828, + "step": 12866, + "time_per_iteration": 2.7930376529693604 + }, + { + "auxiliary_loss_clip": 0.01275507, + "auxiliary_loss_mlp": 0.00225657, + "balance_loss_clip": 1.05099916, + "balance_loss_mlp": 0.19856054, + "epoch": 0.7736058920787615, + "flos": 18729246320640.0, + "grad_norm": 150.04346315802664, + "language_loss": 0.72458982, + "learning_rate": 5.138754074778371e-07, + "loss": 0.73960143, + "num_input_tokens_seen": 277622530, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.27075195, + "step": 12867, + "time_per_iteration": 2.6135456562042236 + }, + { + "auxiliary_loss_clip": 0.01236851, + "auxiliary_loss_mlp": 0.00224775, + "balance_loss_clip": 1.02354383, + "balance_loss_mlp": 0.19850126, + "epoch": 0.7736660153314294, + "flos": 22893304239360.0, + "grad_norm": 92.313965213826, + "language_loss": 0.76230502, + "learning_rate": 5.136147992325595e-07, + "loss": 0.77692127, + "num_input_tokens_seen": 277642700, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.26281738, + "step": 12868, + "time_per_iteration": 2.6858370304107666 + }, + { + "auxiliary_loss_clip": 0.0125775, + "auxiliary_loss_mlp": 0.00217617, + "balance_loss_clip": 1.04190183, + "balance_loss_mlp": 0.19178453, + "epoch": 0.7737261385840974, + "flos": 13800901789440.0, + "grad_norm": 7.273265223688193, + "language_loss": 0.865152, + "learning_rate": 5.133542473511578e-07, + "loss": 0.8799057, + "num_input_tokens_seen": 277660005, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.25830078, + "step": 12869, + "time_per_iteration": 2.607187271118164 + }, + { + "auxiliary_loss_clip": 0.01247175, + "auxiliary_loss_mlp": 0.00250024, + "balance_loss_clip": 1.03018761, + "balance_loss_mlp": 0.22413133, + "epoch": 0.7737862618367654, + "flos": 28730727106560.0, + "grad_norm": 4.39831510983338, + "language_loss": 0.81009841, + "learning_rate": 5.130937518435124e-07, + "loss": 0.82507038, + "num_input_tokens_seen": 277682890, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.25915527, + "step": 12870, + "time_per_iteration": 2.7541966438293457 + }, + { + "auxiliary_loss_clip": 0.01256278, + "auxiliary_loss_mlp": 0.00212046, + "balance_loss_clip": 1.03877234, + "balance_loss_mlp": 0.18671355, + "epoch": 0.7738463850894334, + "flos": 17018570119680.0, + "grad_norm": 4.1264017274205536, + "language_loss": 0.84140241, + "learning_rate": 5.12833312719501e-07, + "loss": 0.85608554, + "num_input_tokens_seen": 277699330, + "router_z_loss_clip": 2.17675781, + "router_z_loss_mlp": 0.25341797, + "step": 12871, + "time_per_iteration": 2.6025922298431396 + }, + { + "auxiliary_loss_clip": 0.01220366, + "auxiliary_loss_mlp": 0.00244197, + "balance_loss_clip": 1.01034617, + "balance_loss_mlp": 0.22004545, + "epoch": 0.7739065083421013, + "flos": 20704010290560.0, + "grad_norm": 40.57034174177533, + "language_loss": 0.77485663, + "learning_rate": 5.12572929988999e-07, + "loss": 0.78950226, + "num_input_tokens_seen": 277718750, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.24157715, + "step": 12872, + "time_per_iteration": 2.6933298110961914 + }, + { + "auxiliary_loss_clip": 0.0125718, + "auxiliary_loss_mlp": 0.0022768, + "balance_loss_clip": 1.0372653, + "balance_loss_mlp": 0.20207383, + "epoch": 0.7739666315947693, + "flos": 20697222620160.0, + "grad_norm": 122.61065511138243, + "language_loss": 0.93445128, + "learning_rate": 5.123126036618804e-07, + "loss": 0.94929981, + "num_input_tokens_seen": 277734645, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.25598145, + "step": 12873, + "time_per_iteration": 2.644747495651245 + }, + { + "auxiliary_loss_clip": 0.01264844, + "auxiliary_loss_mlp": 0.00226941, + "balance_loss_clip": 1.04551315, + "balance_loss_mlp": 0.20251517, + "epoch": 0.7740267548474372, + "flos": 29570677718400.0, + "grad_norm": 252.929357831653, + "language_loss": 0.74547589, + "learning_rate": 5.120523337480174e-07, + "loss": 0.76039374, + "num_input_tokens_seen": 277755535, + "router_z_loss_clip": 2.19628906, + "router_z_loss_mlp": 0.24450684, + "step": 12874, + "time_per_iteration": 2.734696388244629 + }, + { + "auxiliary_loss_clip": 0.01260723, + "auxiliary_loss_mlp": 0.0022603, + "balance_loss_clip": 1.0435648, + "balance_loss_mlp": 0.20078164, + "epoch": 0.7740868781001052, + "flos": 23659099223040.0, + "grad_norm": 63.76851603315143, + "language_loss": 0.70320332, + "learning_rate": 5.117921202572785e-07, + "loss": 0.71807075, + "num_input_tokens_seen": 277775585, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.25256348, + "step": 12875, + "time_per_iteration": 2.674215078353882 + }, + { + "auxiliary_loss_clip": 0.01254101, + "auxiliary_loss_mlp": 0.00237563, + "balance_loss_clip": 1.03203773, + "balance_loss_mlp": 0.21056256, + "epoch": 0.7741470013527731, + "flos": 24717314828160.0, + "grad_norm": 43.599243578066805, + "language_loss": 0.72143799, + "learning_rate": 5.115319631995318e-07, + "loss": 0.73635465, + "num_input_tokens_seen": 277794795, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.26977539, + "step": 12876, + "time_per_iteration": 2.702072858810425 + }, + { + "auxiliary_loss_clip": 0.01248004, + "auxiliary_loss_mlp": 0.00231079, + "balance_loss_clip": 1.02965868, + "balance_loss_mlp": 0.20642647, + "epoch": 0.7742071246054412, + "flos": 21871645701120.0, + "grad_norm": 17.87293561592834, + "language_loss": 0.7967748, + "learning_rate": 5.112718625846433e-07, + "loss": 0.81156558, + "num_input_tokens_seen": 277813235, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.24645996, + "step": 12877, + "time_per_iteration": 2.6738336086273193 + }, + { + "auxiliary_loss_clip": 0.01263484, + "auxiliary_loss_mlp": 0.00229647, + "balance_loss_clip": 1.0399189, + "balance_loss_mlp": 0.20289619, + "epoch": 0.7742672478581091, + "flos": 22674249146880.0, + "grad_norm": 548.3668551485101, + "language_loss": 0.8991099, + "learning_rate": 5.110118184224736e-07, + "loss": 0.91404116, + "num_input_tokens_seen": 277832560, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.2677002, + "step": 12878, + "time_per_iteration": 2.7053792476654053 + }, + { + "auxiliary_loss_clip": 0.01257652, + "auxiliary_loss_mlp": 0.00239467, + "balance_loss_clip": 1.0371865, + "balance_loss_mlp": 0.21382508, + "epoch": 0.7743273711107771, + "flos": 18840892769280.0, + "grad_norm": 8.65563500519384, + "language_loss": 0.79984903, + "learning_rate": 5.10751830722885e-07, + "loss": 0.81482023, + "num_input_tokens_seen": 277850120, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.25646973, + "step": 12879, + "time_per_iteration": 2.7803151607513428 + }, + { + "auxiliary_loss_clip": 0.0123907, + "auxiliary_loss_mlp": 0.00246107, + "balance_loss_clip": 1.02596426, + "balance_loss_mlp": 0.22063223, + "epoch": 0.7743874943634451, + "flos": 28729326476160.0, + "grad_norm": 292.72859435305935, + "language_loss": 0.85317731, + "learning_rate": 5.104918994957364e-07, + "loss": 0.868029, + "num_input_tokens_seen": 277871020, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.25463867, + "step": 12880, + "time_per_iteration": 2.812614917755127 + }, + { + "auxiliary_loss_clip": 0.01245972, + "auxiliary_loss_mlp": 0.00234347, + "balance_loss_clip": 1.0286088, + "balance_loss_mlp": 0.20834711, + "epoch": 0.774447617616113, + "flos": 21909639312000.0, + "grad_norm": 3.8653551791372496, + "language_loss": 0.78073287, + "learning_rate": 5.102320247508847e-07, + "loss": 0.79553604, + "num_input_tokens_seen": 277891525, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.26000977, + "step": 12881, + "time_per_iteration": 2.7884013652801514 + }, + { + "auxiliary_loss_clip": 0.01269778, + "auxiliary_loss_mlp": 0.00264359, + "balance_loss_clip": 1.04443073, + "balance_loss_mlp": 0.23496228, + "epoch": 0.774507740868781, + "flos": 19500643825920.0, + "grad_norm": 15.672923749609533, + "language_loss": 0.91374868, + "learning_rate": 5.099722064981832e-07, + "loss": 0.92909002, + "num_input_tokens_seen": 277910425, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.29418945, + "step": 12882, + "time_per_iteration": 2.6561548709869385 + }, + { + "auxiliary_loss_clip": 0.01076604, + "auxiliary_loss_mlp": 0.00050599, + "balance_loss_clip": 0.92966306, + "balance_loss_mlp": 0.04306469, + "epoch": 0.774567864121449, + "flos": 59426560402560.0, + "grad_norm": 0.7544995522018929, + "language_loss": 0.59548444, + "learning_rate": 5.097124447474858e-07, + "loss": 0.60675645, + "num_input_tokens_seen": 277972795, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.07519531, + "step": 12883, + "time_per_iteration": 3.128261089324951 + }, + { + "auxiliary_loss_clip": 0.01254095, + "auxiliary_loss_mlp": 0.00231005, + "balance_loss_clip": 1.03616977, + "balance_loss_mlp": 0.20480278, + "epoch": 0.774627987374117, + "flos": 13225326255360.0, + "grad_norm": 15.412545171402407, + "language_loss": 0.82867396, + "learning_rate": 5.094527395086416e-07, + "loss": 0.84352493, + "num_input_tokens_seen": 277990675, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.26171875, + "step": 12884, + "time_per_iteration": 2.6274242401123047 + }, + { + "auxiliary_loss_clip": 0.01248067, + "auxiliary_loss_mlp": 0.00228196, + "balance_loss_clip": 1.03743172, + "balance_loss_mlp": 0.20418699, + "epoch": 0.7746881106267849, + "flos": 21394033534080.0, + "grad_norm": 84.65330546812044, + "language_loss": 0.85873926, + "learning_rate": 5.091930907914986e-07, + "loss": 0.87350184, + "num_input_tokens_seen": 278010050, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.23999023, + "step": 12885, + "time_per_iteration": 2.753474473953247 + }, + { + "auxiliary_loss_clip": 0.01243964, + "auxiliary_loss_mlp": 0.00214413, + "balance_loss_clip": 1.02215183, + "balance_loss_mlp": 0.18949872, + "epoch": 0.7747482338794529, + "flos": 25629338079360.0, + "grad_norm": 11.768135936337396, + "language_loss": 0.74697912, + "learning_rate": 5.089334986059029e-07, + "loss": 0.76156288, + "num_input_tokens_seen": 278030660, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.24926758, + "step": 12886, + "time_per_iteration": 2.723646640777588 + }, + { + "auxiliary_loss_clip": 0.01245465, + "auxiliary_loss_mlp": 0.00221034, + "balance_loss_clip": 1.02917004, + "balance_loss_mlp": 0.19695359, + "epoch": 0.7748083571321208, + "flos": 11546933402880.0, + "grad_norm": 3.511893999971291, + "language_loss": 0.76930857, + "learning_rate": 5.086739629616987e-07, + "loss": 0.78397352, + "num_input_tokens_seen": 278047645, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.24084473, + "step": 12887, + "time_per_iteration": 2.681191921234131 + }, + { + "auxiliary_loss_clip": 0.01221921, + "auxiliary_loss_mlp": 0.00208193, + "balance_loss_clip": 1.01107955, + "balance_loss_mlp": 0.18506634, + "epoch": 0.7748684803847888, + "flos": 19062425900160.0, + "grad_norm": 4.348727360250039, + "language_loss": 0.80538106, + "learning_rate": 5.084144838687275e-07, + "loss": 0.81968218, + "num_input_tokens_seen": 278066170, + "router_z_loss_clip": 2.10839844, + "router_z_loss_mlp": 0.23132324, + "step": 12888, + "time_per_iteration": 2.689887523651123 + }, + { + "auxiliary_loss_clip": 0.01244472, + "auxiliary_loss_mlp": 0.002437, + "balance_loss_clip": 1.02637637, + "balance_loss_mlp": 0.21820083, + "epoch": 0.7749286036374567, + "flos": 22273162905600.0, + "grad_norm": 49.732738148633715, + "language_loss": 0.89527822, + "learning_rate": 5.081550613368279e-07, + "loss": 0.91015989, + "num_input_tokens_seen": 278085545, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.25512695, + "step": 12889, + "time_per_iteration": 4.058763742446899 + }, + { + "auxiliary_loss_clip": 0.01273057, + "auxiliary_loss_mlp": 0.00252601, + "balance_loss_clip": 1.04872, + "balance_loss_mlp": 0.22592232, + "epoch": 0.7749887268901248, + "flos": 20192462749440.0, + "grad_norm": 15.78685214344116, + "language_loss": 0.86309278, + "learning_rate": 5.07895695375838e-07, + "loss": 0.87834942, + "num_input_tokens_seen": 278102995, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.26672363, + "step": 12890, + "time_per_iteration": 2.6511127948760986 + }, + { + "auxiliary_loss_clip": 0.01272772, + "auxiliary_loss_mlp": 0.00241751, + "balance_loss_clip": 1.0477531, + "balance_loss_mlp": 0.2153701, + "epoch": 0.7750488501427927, + "flos": 20337541781760.0, + "grad_norm": 11.571241039085566, + "language_loss": 0.7536521, + "learning_rate": 5.076363859955932e-07, + "loss": 0.76879734, + "num_input_tokens_seen": 278121460, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.26379395, + "step": 12891, + "time_per_iteration": 2.66204833984375 + }, + { + "auxiliary_loss_clip": 0.01271856, + "auxiliary_loss_mlp": 0.00216212, + "balance_loss_clip": 1.04791903, + "balance_loss_mlp": 0.19085652, + "epoch": 0.7751089733954607, + "flos": 28364043116160.0, + "grad_norm": 28.187115181631835, + "language_loss": 0.85474479, + "learning_rate": 5.073771332059257e-07, + "loss": 0.86962545, + "num_input_tokens_seen": 278143905, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.25366211, + "step": 12892, + "time_per_iteration": 4.158003091812134 + }, + { + "auxiliary_loss_clip": 0.01287846, + "auxiliary_loss_mlp": 0.00243394, + "balance_loss_clip": 1.05800271, + "balance_loss_mlp": 0.2148906, + "epoch": 0.7751690966481286, + "flos": 16943803960320.0, + "grad_norm": 27.914766266578305, + "language_loss": 0.79496288, + "learning_rate": 5.071179370166669e-07, + "loss": 0.81027526, + "num_input_tokens_seen": 278160850, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.28466797, + "step": 12893, + "time_per_iteration": 2.765023946762085 + }, + { + "auxiliary_loss_clip": 0.01085096, + "auxiliary_loss_mlp": 0.00023706, + "balance_loss_clip": 0.93632233, + "balance_loss_mlp": 0.01564701, + "epoch": 0.7752292199007966, + "flos": 65668050339840.0, + "grad_norm": 2.0906801949986775, + "language_loss": 0.57774091, + "learning_rate": 5.068587974376468e-07, + "loss": 0.58882892, + "num_input_tokens_seen": 278219950, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.08056641, + "step": 12894, + "time_per_iteration": 3.3085410594940186 + }, + { + "auxiliary_loss_clip": 0.01253448, + "auxiliary_loss_mlp": 0.00223614, + "balance_loss_clip": 1.0317471, + "balance_loss_mlp": 0.19651753, + "epoch": 0.7752893431534646, + "flos": 20594662312320.0, + "grad_norm": 11.22484025629422, + "language_loss": 0.87018561, + "learning_rate": 5.065997144786895e-07, + "loss": 0.88495624, + "num_input_tokens_seen": 278237805, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.27124023, + "step": 12895, + "time_per_iteration": 2.716477632522583 + }, + { + "auxiliary_loss_clip": 0.01258332, + "auxiliary_loss_mlp": 0.00233313, + "balance_loss_clip": 1.03430784, + "balance_loss_mlp": 0.20711058, + "epoch": 0.7753494664061326, + "flos": 20485350247680.0, + "grad_norm": 5184.048696331857, + "language_loss": 0.75281, + "learning_rate": 5.063406881496209e-07, + "loss": 0.76772648, + "num_input_tokens_seen": 278257660, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.26220703, + "step": 12896, + "time_per_iteration": 4.217605113983154 + }, + { + "auxiliary_loss_clip": 0.01266195, + "auxiliary_loss_mlp": 0.00234265, + "balance_loss_clip": 1.04849482, + "balance_loss_mlp": 0.21038762, + "epoch": 0.7754095896588006, + "flos": 20265900105600.0, + "grad_norm": 12.04671452490976, + "language_loss": 0.74984336, + "learning_rate": 5.060817184602629e-07, + "loss": 0.76484793, + "num_input_tokens_seen": 278275110, + "router_z_loss_clip": 2.17675781, + "router_z_loss_mlp": 0.23876953, + "step": 12897, + "time_per_iteration": 2.6508378982543945 + }, + { + "auxiliary_loss_clip": 0.01259441, + "auxiliary_loss_mlp": 0.00222146, + "balance_loss_clip": 1.0395267, + "balance_loss_mlp": 0.19544317, + "epoch": 0.7754697129114685, + "flos": 23331091201920.0, + "grad_norm": 4.243216019129883, + "language_loss": 0.8178463, + "learning_rate": 5.058228054204364e-07, + "loss": 0.83266217, + "num_input_tokens_seen": 278293035, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.2668457, + "step": 12898, + "time_per_iteration": 2.7206027507781982 + }, + { + "auxiliary_loss_clip": 0.01271332, + "auxiliary_loss_mlp": 0.00228821, + "balance_loss_clip": 1.04489064, + "balance_loss_mlp": 0.20065166, + "epoch": 0.7755298361641365, + "flos": 17347619635200.0, + "grad_norm": 15.815378932386963, + "language_loss": 0.78160596, + "learning_rate": 5.055639490399588e-07, + "loss": 0.79660749, + "num_input_tokens_seen": 278311010, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.28149414, + "step": 12899, + "time_per_iteration": 2.7522242069244385 + }, + { + "auxiliary_loss_clip": 0.0127699, + "auxiliary_loss_mlp": 0.00224502, + "balance_loss_clip": 1.05230784, + "balance_loss_mlp": 0.19754831, + "epoch": 0.7755899594168044, + "flos": 19645866512640.0, + "grad_norm": 16.617663377278298, + "language_loss": 0.85282987, + "learning_rate": 5.053051493286453e-07, + "loss": 0.86784482, + "num_input_tokens_seen": 278329900, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.26977539, + "step": 12900, + "time_per_iteration": 4.044504404067993 + }, + { + "auxiliary_loss_clip": 0.01242908, + "auxiliary_loss_mlp": 0.00219852, + "balance_loss_clip": 1.03070295, + "balance_loss_mlp": 0.19533044, + "epoch": 0.7756500826694724, + "flos": 27414457217280.0, + "grad_norm": 17.922385941984402, + "language_loss": 0.83115011, + "learning_rate": 5.050464062963113e-07, + "loss": 0.84577775, + "num_input_tokens_seen": 278349980, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.24523926, + "step": 12901, + "time_per_iteration": 2.70332670211792 + }, + { + "auxiliary_loss_clip": 0.01246146, + "auxiliary_loss_mlp": 0.00214791, + "balance_loss_clip": 1.03430629, + "balance_loss_mlp": 0.19066326, + "epoch": 0.7757102059221404, + "flos": 28730511624960.0, + "grad_norm": 2.6693492486493993, + "language_loss": 0.84294105, + "learning_rate": 5.047877199527666e-07, + "loss": 0.85755044, + "num_input_tokens_seen": 278372485, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.24108887, + "step": 12902, + "time_per_iteration": 2.7551798820495605 + }, + { + "auxiliary_loss_clip": 0.01258864, + "auxiliary_loss_mlp": 0.00207996, + "balance_loss_clip": 1.04043698, + "balance_loss_mlp": 0.18279502, + "epoch": 0.7757703291748084, + "flos": 22486795044480.0, + "grad_norm": 20.72398791325989, + "language_loss": 0.80304325, + "learning_rate": 5.045290903078215e-07, + "loss": 0.81771177, + "num_input_tokens_seen": 278391660, + "router_z_loss_clip": 2.18457031, + "router_z_loss_mlp": 0.25219727, + "step": 12903, + "time_per_iteration": 2.7267367839813232 + }, + { + "auxiliary_loss_clip": 0.012726, + "auxiliary_loss_mlp": 0.00233261, + "balance_loss_clip": 1.04903889, + "balance_loss_mlp": 0.20589103, + "epoch": 0.7758304524274763, + "flos": 21430159637760.0, + "grad_norm": 9.44599644494363, + "language_loss": 0.86103565, + "learning_rate": 5.042705173712835e-07, + "loss": 0.87609422, + "num_input_tokens_seen": 278409125, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.27368164, + "step": 12904, + "time_per_iteration": 2.712670087814331 + }, + { + "auxiliary_loss_clip": 0.01249326, + "auxiliary_loss_mlp": 0.00219907, + "balance_loss_clip": 1.03347564, + "balance_loss_mlp": 0.19537354, + "epoch": 0.7758905756801443, + "flos": 23659242877440.0, + "grad_norm": 145.01968003158225, + "language_loss": 0.76398766, + "learning_rate": 5.040120011529576e-07, + "loss": 0.77867997, + "num_input_tokens_seen": 278429450, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.24523926, + "step": 12905, + "time_per_iteration": 2.7801804542541504 + }, + { + "auxiliary_loss_clip": 0.01271277, + "auxiliary_loss_mlp": 0.0023445, + "balance_loss_clip": 1.04967964, + "balance_loss_mlp": 0.2074133, + "epoch": 0.7759506989328122, + "flos": 28365479660160.0, + "grad_norm": 12.369513367734433, + "language_loss": 0.75895476, + "learning_rate": 5.037535416626459e-07, + "loss": 0.77401197, + "num_input_tokens_seen": 278449925, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.27038574, + "step": 12906, + "time_per_iteration": 2.726865768432617 + }, + { + "auxiliary_loss_clip": 0.01285394, + "auxiliary_loss_mlp": 0.00239827, + "balance_loss_clip": 1.05774939, + "balance_loss_mlp": 0.20934498, + "epoch": 0.7760108221854802, + "flos": 14902785354240.0, + "grad_norm": 8.730028173344728, + "language_loss": 0.90438485, + "learning_rate": 5.034951389101498e-07, + "loss": 0.91963708, + "num_input_tokens_seen": 278467255, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.30493164, + "step": 12907, + "time_per_iteration": 2.6092119216918945 + }, + { + "auxiliary_loss_clip": 0.01242608, + "auxiliary_loss_mlp": 0.00215528, + "balance_loss_clip": 1.0313859, + "balance_loss_mlp": 0.19113818, + "epoch": 0.7760709454381483, + "flos": 14792503622400.0, + "grad_norm": 88.41672455471947, + "language_loss": 0.77226973, + "learning_rate": 5.032367929052685e-07, + "loss": 0.78685105, + "num_input_tokens_seen": 278484250, + "router_z_loss_clip": 2.11230469, + "router_z_loss_mlp": 0.24377441, + "step": 12908, + "time_per_iteration": 2.660560369491577 + }, + { + "auxiliary_loss_clip": 0.01252986, + "auxiliary_loss_mlp": 0.00226527, + "balance_loss_clip": 1.02843559, + "balance_loss_mlp": 0.20046771, + "epoch": 0.7761310686908162, + "flos": 17379831156480.0, + "grad_norm": 2.286484694379444, + "language_loss": 0.79393625, + "learning_rate": 5.029785036577976e-07, + "loss": 0.80873132, + "num_input_tokens_seen": 278502740, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.26074219, + "step": 12909, + "time_per_iteration": 2.6452951431274414 + }, + { + "auxiliary_loss_clip": 0.01240783, + "auxiliary_loss_mlp": 0.00209074, + "balance_loss_clip": 1.02746296, + "balance_loss_mlp": 0.18443379, + "epoch": 0.7761911919434842, + "flos": 25556547168000.0, + "grad_norm": 15.299219019368067, + "language_loss": 0.74973387, + "learning_rate": 5.027202711775324e-07, + "loss": 0.76423252, + "num_input_tokens_seen": 278523890, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.24633789, + "step": 12910, + "time_per_iteration": 2.7392613887786865 + }, + { + "auxiliary_loss_clip": 0.01258299, + "auxiliary_loss_mlp": 0.00219211, + "balance_loss_clip": 1.03491282, + "balance_loss_mlp": 0.19361669, + "epoch": 0.7762513151961521, + "flos": 23179763203200.0, + "grad_norm": 151.03230315895183, + "language_loss": 0.79733413, + "learning_rate": 5.024620954742646e-07, + "loss": 0.81210923, + "num_input_tokens_seen": 278543185, + "router_z_loss_clip": 2.23144531, + "router_z_loss_mlp": 0.25610352, + "step": 12911, + "time_per_iteration": 2.679025888442993 + }, + { + "auxiliary_loss_clip": 0.0128329, + "auxiliary_loss_mlp": 0.00230341, + "balance_loss_clip": 1.05688322, + "balance_loss_mlp": 0.20409098, + "epoch": 0.7763114384488201, + "flos": 21689614552320.0, + "grad_norm": 10.750717420329636, + "language_loss": 0.75663757, + "learning_rate": 5.022039765577836e-07, + "loss": 0.77177382, + "num_input_tokens_seen": 278559220, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.2623291, + "step": 12912, + "time_per_iteration": 2.6502342224121094 + }, + { + "auxiliary_loss_clip": 0.01085966, + "auxiliary_loss_mlp": 0.00065452, + "balance_loss_clip": 0.94050872, + "balance_loss_mlp": 0.05624895, + "epoch": 0.776371561701488, + "flos": 69025554316800.0, + "grad_norm": 0.765642590694399, + "language_loss": 0.52766335, + "learning_rate": 5.019459144378779e-07, + "loss": 0.53917754, + "num_input_tokens_seen": 278618185, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.09179688, + "step": 12913, + "time_per_iteration": 3.237210750579834 + }, + { + "auxiliary_loss_clip": 0.01269822, + "auxiliary_loss_mlp": 0.00224147, + "balance_loss_clip": 1.04403853, + "balance_loss_mlp": 0.19640742, + "epoch": 0.776431684954156, + "flos": 22893914770560.0, + "grad_norm": 24.807566301629816, + "language_loss": 0.71640491, + "learning_rate": 5.016879091243338e-07, + "loss": 0.73134458, + "num_input_tokens_seen": 278636210, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.27758789, + "step": 12914, + "time_per_iteration": 2.680666208267212 + }, + { + "auxiliary_loss_clip": 0.01257012, + "auxiliary_loss_mlp": 0.00225214, + "balance_loss_clip": 1.03812194, + "balance_loss_mlp": 0.19973868, + "epoch": 0.776491808206824, + "flos": 20261554560000.0, + "grad_norm": 1.8491742747754158, + "language_loss": 0.88483894, + "learning_rate": 5.014299606269339e-07, + "loss": 0.89966118, + "num_input_tokens_seen": 278653305, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.25500488, + "step": 12915, + "time_per_iteration": 2.655639171600342 + }, + { + "auxiliary_loss_clip": 0.0125733, + "auxiliary_loss_mlp": 0.00230356, + "balance_loss_clip": 1.03608108, + "balance_loss_mlp": 0.20362934, + "epoch": 0.776551931459492, + "flos": 26759051706240.0, + "grad_norm": 331.1432131360405, + "language_loss": 0.83203906, + "learning_rate": 5.011720689554603e-07, + "loss": 0.84691596, + "num_input_tokens_seen": 278671850, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.26745605, + "step": 12916, + "time_per_iteration": 2.7593867778778076 + }, + { + "auxiliary_loss_clip": 0.01255338, + "auxiliary_loss_mlp": 0.00221497, + "balance_loss_clip": 1.03653932, + "balance_loss_mlp": 0.19616537, + "epoch": 0.7766120547121599, + "flos": 52665080250240.0, + "grad_norm": 10.859453580105704, + "language_loss": 0.7156496, + "learning_rate": 5.009142341196919e-07, + "loss": 0.73041797, + "num_input_tokens_seen": 278697860, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.25354004, + "step": 12917, + "time_per_iteration": 3.0413341522216797 + }, + { + "auxiliary_loss_clip": 0.0124412, + "auxiliary_loss_mlp": 0.00213506, + "balance_loss_clip": 1.03346169, + "balance_loss_mlp": 0.18960452, + "epoch": 0.7766721779648279, + "flos": 25156215112320.0, + "grad_norm": 34.22556083535453, + "language_loss": 0.69995278, + "learning_rate": 5.006564561294065e-07, + "loss": 0.71452904, + "num_input_tokens_seen": 278720655, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.23876953, + "step": 12918, + "time_per_iteration": 2.7178475856781006 + }, + { + "auxiliary_loss_clip": 0.01249507, + "auxiliary_loss_mlp": 0.00224154, + "balance_loss_clip": 1.03486586, + "balance_loss_mlp": 0.19882166, + "epoch": 0.7767323012174958, + "flos": 23760761690880.0, + "grad_norm": 125.96937677754636, + "language_loss": 0.80689877, + "learning_rate": 5.003987349943777e-07, + "loss": 0.82163537, + "num_input_tokens_seen": 278737375, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.2532959, + "step": 12919, + "time_per_iteration": 2.6825029850006104 + }, + { + "auxiliary_loss_clip": 0.01281837, + "auxiliary_loss_mlp": 0.00258804, + "balance_loss_clip": 1.05441177, + "balance_loss_mlp": 0.22965762, + "epoch": 0.7767924244701638, + "flos": 22086642556800.0, + "grad_norm": 79.62037085795713, + "language_loss": 0.87163216, + "learning_rate": 5.001410707243792e-07, + "loss": 0.88703859, + "num_input_tokens_seen": 278756510, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.29125977, + "step": 12920, + "time_per_iteration": 2.6795122623443604 + }, + { + "auxiliary_loss_clip": 0.01265953, + "auxiliary_loss_mlp": 0.00210512, + "balance_loss_clip": 1.03912246, + "balance_loss_mlp": 0.18224788, + "epoch": 0.7768525477228319, + "flos": 21981640124160.0, + "grad_norm": 8.43449608872046, + "language_loss": 0.77926683, + "learning_rate": 4.998834633291829e-07, + "loss": 0.7940315, + "num_input_tokens_seen": 278775410, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.28259277, + "step": 12921, + "time_per_iteration": 2.655284881591797 + }, + { + "auxiliary_loss_clip": 0.0127707, + "auxiliary_loss_mlp": 0.002303, + "balance_loss_clip": 1.04927933, + "balance_loss_mlp": 0.20270288, + "epoch": 0.7769126709754998, + "flos": 21794581071360.0, + "grad_norm": 470.82967839974094, + "language_loss": 0.84249735, + "learning_rate": 4.996259128185547e-07, + "loss": 0.85757107, + "num_input_tokens_seen": 278794260, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.27600098, + "step": 12922, + "time_per_iteration": 2.7056233882904053 + }, + { + "auxiliary_loss_clip": 0.01250101, + "auxiliary_loss_mlp": 0.00226739, + "balance_loss_clip": 1.02947593, + "balance_loss_mlp": 0.201252, + "epoch": 0.7769727942281678, + "flos": 20047994248320.0, + "grad_norm": 18.312790646180392, + "language_loss": 0.87952638, + "learning_rate": 4.993684192022625e-07, + "loss": 0.89429474, + "num_input_tokens_seen": 278813290, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.25476074, + "step": 12923, + "time_per_iteration": 2.6798768043518066 + }, + { + "auxiliary_loss_clip": 0.01246017, + "auxiliary_loss_mlp": 0.00239879, + "balance_loss_clip": 1.0312078, + "balance_loss_mlp": 0.21303311, + "epoch": 0.7770329174808357, + "flos": 21686777377920.0, + "grad_norm": 18.863714335620223, + "language_loss": 0.98769879, + "learning_rate": 4.991109824900699e-07, + "loss": 1.00255775, + "num_input_tokens_seen": 278830610, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.26867676, + "step": 12924, + "time_per_iteration": 2.693669319152832 + }, + { + "auxiliary_loss_clip": 0.01255561, + "auxiliary_loss_mlp": 0.00226561, + "balance_loss_clip": 1.03395247, + "balance_loss_mlp": 0.19982252, + "epoch": 0.7770930407335037, + "flos": 25849255098240.0, + "grad_norm": 8.431127502711172, + "language_loss": 0.74633074, + "learning_rate": 4.988536026917401e-07, + "loss": 0.76115197, + "num_input_tokens_seen": 278849530, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.2677002, + "step": 12925, + "time_per_iteration": 2.71305513381958 + }, + { + "auxiliary_loss_clip": 0.01265649, + "auxiliary_loss_mlp": 0.00226469, + "balance_loss_clip": 1.03910577, + "balance_loss_mlp": 0.19948016, + "epoch": 0.7771531639861716, + "flos": 24347865490560.0, + "grad_norm": 96.54782489989756, + "language_loss": 0.80509764, + "learning_rate": 4.985962798170314e-07, + "loss": 0.82001883, + "num_input_tokens_seen": 278869005, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.26965332, + "step": 12926, + "time_per_iteration": 2.7438793182373047 + }, + { + "auxiliary_loss_clip": 0.01275697, + "auxiliary_loss_mlp": 0.00224276, + "balance_loss_clip": 1.04665422, + "balance_loss_mlp": 0.19579718, + "epoch": 0.7772132872388396, + "flos": 25629948610560.0, + "grad_norm": 90.73486134156663, + "language_loss": 0.76587486, + "learning_rate": 4.983390138757027e-07, + "loss": 0.78087461, + "num_input_tokens_seen": 278888790, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.28491211, + "step": 12927, + "time_per_iteration": 2.7893826961517334 + }, + { + "auxiliary_loss_clip": 0.0126902, + "auxiliary_loss_mlp": 0.00236288, + "balance_loss_clip": 1.04317927, + "balance_loss_mlp": 0.20876263, + "epoch": 0.7772734104915076, + "flos": 26067412350720.0, + "grad_norm": 35.11736696828609, + "language_loss": 0.84290171, + "learning_rate": 4.980818048775093e-07, + "loss": 0.8579548, + "num_input_tokens_seen": 278908150, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.27502441, + "step": 12928, + "time_per_iteration": 2.7146034240722656 + }, + { + "auxiliary_loss_clip": 0.01235724, + "auxiliary_loss_mlp": 0.00205411, + "balance_loss_clip": 1.01890206, + "balance_loss_mlp": 0.18099679, + "epoch": 0.7773335337441756, + "flos": 22925048883840.0, + "grad_norm": 8.402953225994409, + "language_loss": 0.82317048, + "learning_rate": 4.978246528322036e-07, + "loss": 0.83758181, + "num_input_tokens_seen": 278927425, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.24389648, + "step": 12929, + "time_per_iteration": 2.657606363296509 + }, + { + "auxiliary_loss_clip": 0.01265275, + "auxiliary_loss_mlp": 0.00227506, + "balance_loss_clip": 1.04352915, + "balance_loss_mlp": 0.20138764, + "epoch": 0.7773936569968435, + "flos": 20776765288320.0, + "grad_norm": 11.003820954019623, + "language_loss": 0.87013519, + "learning_rate": 4.975675577495377e-07, + "loss": 0.88506299, + "num_input_tokens_seen": 278946475, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.26135254, + "step": 12930, + "time_per_iteration": 2.6961331367492676 + }, + { + "auxiliary_loss_clip": 0.01255047, + "auxiliary_loss_mlp": 0.00224875, + "balance_loss_clip": 1.03906083, + "balance_loss_mlp": 0.20021111, + "epoch": 0.7774537802495115, + "flos": 20372267255040.0, + "grad_norm": 22.861825214722263, + "language_loss": 0.86518145, + "learning_rate": 4.973105196392613e-07, + "loss": 0.87998068, + "num_input_tokens_seen": 278964345, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.2467041, + "step": 12931, + "time_per_iteration": 2.6199100017547607 + }, + { + "auxiliary_loss_clip": 0.01081813, + "auxiliary_loss_mlp": 0.00033296, + "balance_loss_clip": 0.93627524, + "balance_loss_mlp": 0.02380712, + "epoch": 0.7775139035021794, + "flos": 53912081738880.0, + "grad_norm": 0.7757071626892396, + "language_loss": 0.58954066, + "learning_rate": 4.970535385111199e-07, + "loss": 0.60069174, + "num_input_tokens_seen": 279022380, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.09472656, + "step": 12932, + "time_per_iteration": 4.556074142456055 + }, + { + "auxiliary_loss_clip": 0.01259647, + "auxiliary_loss_mlp": 0.00197477, + "balance_loss_clip": 1.03353846, + "balance_loss_mlp": 0.17187075, + "epoch": 0.7775740267548474, + "flos": 28842481296000.0, + "grad_norm": 13.834573187728951, + "language_loss": 0.82843697, + "learning_rate": 4.967966143748595e-07, + "loss": 0.84300816, + "num_input_tokens_seen": 279044275, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.25622559, + "step": 12933, + "time_per_iteration": 2.8430984020233154 + }, + { + "auxiliary_loss_clip": 0.01245341, + "auxiliary_loss_mlp": 0.00215289, + "balance_loss_clip": 1.02513433, + "balance_loss_mlp": 0.188944, + "epoch": 0.7776341500075155, + "flos": 21872471713920.0, + "grad_norm": 209.9332184943851, + "language_loss": 0.82001579, + "learning_rate": 4.965397472402215e-07, + "loss": 0.83462203, + "num_input_tokens_seen": 279063375, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.26330566, + "step": 12934, + "time_per_iteration": 4.1079394817352295 + }, + { + "auxiliary_loss_clip": 0.01257879, + "auxiliary_loss_mlp": 0.00215124, + "balance_loss_clip": 1.03379285, + "balance_loss_mlp": 0.18768221, + "epoch": 0.7776942732601834, + "flos": 20229845829120.0, + "grad_norm": 33.36217034258308, + "language_loss": 0.79719198, + "learning_rate": 4.962829371169475e-07, + "loss": 0.81192201, + "num_input_tokens_seen": 279082680, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.27429199, + "step": 12935, + "time_per_iteration": 2.644845962524414 + }, + { + "auxiliary_loss_clip": 0.01251427, + "auxiliary_loss_mlp": 0.00219841, + "balance_loss_clip": 1.03038359, + "balance_loss_mlp": 0.19392487, + "epoch": 0.7777543965128514, + "flos": 22231829329920.0, + "grad_norm": 85.07181807440844, + "language_loss": 0.89775121, + "learning_rate": 4.960261840147746e-07, + "loss": 0.91246384, + "num_input_tokens_seen": 279099805, + "router_z_loss_clip": 2.21386719, + "router_z_loss_mlp": 0.25915527, + "step": 12936, + "time_per_iteration": 2.6791605949401855 + }, + { + "auxiliary_loss_clip": 0.01250215, + "auxiliary_loss_mlp": 0.00222835, + "balance_loss_clip": 1.03075504, + "balance_loss_mlp": 0.19608428, + "epoch": 0.7778145197655193, + "flos": 14501950508160.0, + "grad_norm": 25.013280593991354, + "language_loss": 0.7850377, + "learning_rate": 4.957694879434397e-07, + "loss": 0.79976821, + "num_input_tokens_seen": 279117975, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.26733398, + "step": 12937, + "time_per_iteration": 2.6188995838165283 + }, + { + "auxiliary_loss_clip": 0.01252039, + "auxiliary_loss_mlp": 0.00202465, + "balance_loss_clip": 1.03184533, + "balance_loss_mlp": 0.17760953, + "epoch": 0.7778746430181873, + "flos": 21140288881920.0, + "grad_norm": 6.48571790881416, + "language_loss": 0.940229, + "learning_rate": 4.955128489126777e-07, + "loss": 0.95477402, + "num_input_tokens_seen": 279137255, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.24865723, + "step": 12938, + "time_per_iteration": 4.297469139099121 + }, + { + "auxiliary_loss_clip": 0.01257314, + "auxiliary_loss_mlp": 0.00213585, + "balance_loss_clip": 1.03442109, + "balance_loss_mlp": 0.18626186, + "epoch": 0.7779347662708552, + "flos": 20266366982400.0, + "grad_norm": 9.324392793500177, + "language_loss": 0.95170045, + "learning_rate": 4.95256266932218e-07, + "loss": 0.96640944, + "num_input_tokens_seen": 279154500, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.2734375, + "step": 12939, + "time_per_iteration": 2.7526421546936035 + }, + { + "auxiliary_loss_clip": 0.01248391, + "auxiliary_loss_mlp": 0.00215915, + "balance_loss_clip": 1.03437912, + "balance_loss_mlp": 0.19055909, + "epoch": 0.7779948895235232, + "flos": 19209013303680.0, + "grad_norm": 58.520284483370894, + "language_loss": 0.79045373, + "learning_rate": 4.949997420117915e-07, + "loss": 0.80509675, + "num_input_tokens_seen": 279173635, + "router_z_loss_clip": 2.14355469, + "router_z_loss_mlp": 0.25366211, + "step": 12940, + "time_per_iteration": 2.668226957321167 + }, + { + "auxiliary_loss_clip": 0.0125539, + "auxiliary_loss_mlp": 0.0022813, + "balance_loss_clip": 1.02977371, + "balance_loss_mlp": 0.20095035, + "epoch": 0.7780550127761912, + "flos": 23914711382400.0, + "grad_norm": 14.499276255945697, + "language_loss": 0.84494936, + "learning_rate": 4.947432741611255e-07, + "loss": 0.8597846, + "num_input_tokens_seen": 279194430, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.27172852, + "step": 12941, + "time_per_iteration": 2.714939594268799 + }, + { + "auxiliary_loss_clip": 0.01275798, + "auxiliary_loss_mlp": 0.00215149, + "balance_loss_clip": 1.04087067, + "balance_loss_mlp": 0.18767163, + "epoch": 0.7781151360288592, + "flos": 32415951795840.0, + "grad_norm": 159.18229052610346, + "language_loss": 0.83205521, + "learning_rate": 4.944868633899462e-07, + "loss": 0.84696472, + "num_input_tokens_seen": 279212920, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.2746582, + "step": 12942, + "time_per_iteration": 4.159067392349243 + }, + { + "auxiliary_loss_clip": 0.01231383, + "auxiliary_loss_mlp": 0.00224709, + "balance_loss_clip": 1.0160296, + "balance_loss_mlp": 0.1995559, + "epoch": 0.7781752592815271, + "flos": 22346384780160.0, + "grad_norm": 7.21692366259594, + "language_loss": 0.75777292, + "learning_rate": 4.942305097079751e-07, + "loss": 0.77233386, + "num_input_tokens_seen": 279232310, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.25134277, + "step": 12943, + "time_per_iteration": 2.6589066982269287 + }, + { + "auxiliary_loss_clip": 0.01070536, + "auxiliary_loss_mlp": 0.00016015, + "balance_loss_clip": 0.92267847, + "balance_loss_mlp": 0.0056672, + "epoch": 0.7782353825341951, + "flos": 70460183520000.0, + "grad_norm": 0.7806757165398607, + "language_loss": 0.57897693, + "learning_rate": 4.939742131249347e-07, + "loss": 0.58984244, + "num_input_tokens_seen": 279295375, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.10351562, + "step": 12944, + "time_per_iteration": 3.2966887950897217 + }, + { + "auxiliary_loss_clip": 0.01253556, + "auxiliary_loss_mlp": 0.00231397, + "balance_loss_clip": 1.02905142, + "balance_loss_mlp": 0.20202357, + "epoch": 0.778295505786863, + "flos": 19062569554560.0, + "grad_norm": 125.77300155175298, + "language_loss": 0.78307807, + "learning_rate": 4.937179736505428e-07, + "loss": 0.79792756, + "num_input_tokens_seen": 279313660, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.2935791, + "step": 12945, + "time_per_iteration": 2.7075395584106445 + }, + { + "auxiliary_loss_clip": 0.01257527, + "auxiliary_loss_mlp": 0.00227002, + "balance_loss_clip": 1.03424597, + "balance_loss_mlp": 0.19975048, + "epoch": 0.778355629039531, + "flos": 20999734963200.0, + "grad_norm": 29.269429304424282, + "language_loss": 0.75892615, + "learning_rate": 4.93461791294516e-07, + "loss": 0.77377146, + "num_input_tokens_seen": 279334495, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.27246094, + "step": 12946, + "time_per_iteration": 2.7571144104003906 + }, + { + "auxiliary_loss_clip": 0.01247471, + "auxiliary_loss_mlp": 0.00207958, + "balance_loss_clip": 1.02820122, + "balance_loss_mlp": 0.18415204, + "epoch": 0.7784157522921991, + "flos": 21398091770880.0, + "grad_norm": 3.5927227979435012, + "language_loss": 0.71665233, + "learning_rate": 4.932056660665689e-07, + "loss": 0.73120666, + "num_input_tokens_seen": 279352985, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.23791504, + "step": 12947, + "time_per_iteration": 2.683212995529175 + }, + { + "auxiliary_loss_clip": 0.01231799, + "auxiliary_loss_mlp": 0.00218739, + "balance_loss_clip": 1.01541758, + "balance_loss_mlp": 0.19272722, + "epoch": 0.778475875544867, + "flos": 20813861059200.0, + "grad_norm": 36.38523109227311, + "language_loss": 0.75843167, + "learning_rate": 4.929495979764147e-07, + "loss": 0.77293706, + "num_input_tokens_seen": 279371360, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.26000977, + "step": 12948, + "time_per_iteration": 2.6894073486328125 + }, + { + "auxiliary_loss_clip": 0.01240841, + "auxiliary_loss_mlp": 0.00210164, + "balance_loss_clip": 1.01845264, + "balance_loss_mlp": 0.18377143, + "epoch": 0.778535998797535, + "flos": 14355363104640.0, + "grad_norm": 19.909588198773193, + "language_loss": 0.84644222, + "learning_rate": 4.926935870337625e-07, + "loss": 0.86095232, + "num_input_tokens_seen": 279389400, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.26391602, + "step": 12949, + "time_per_iteration": 2.5946786403656006 + }, + { + "auxiliary_loss_clip": 0.01282023, + "auxiliary_loss_mlp": 0.00225854, + "balance_loss_clip": 1.05426669, + "balance_loss_mlp": 0.1989727, + "epoch": 0.7785961220502029, + "flos": 19209552007680.0, + "grad_norm": 5.195926441817377, + "language_loss": 0.73760796, + "learning_rate": 4.924376332483202e-07, + "loss": 0.75268674, + "num_input_tokens_seen": 279409715, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.26879883, + "step": 12950, + "time_per_iteration": 2.706953287124634 + }, + { + "auxiliary_loss_clip": 0.01251696, + "auxiliary_loss_mlp": 0.00221118, + "balance_loss_clip": 1.0297606, + "balance_loss_mlp": 0.19513038, + "epoch": 0.7786562453028709, + "flos": 25738757884800.0, + "grad_norm": 128.07106660444126, + "language_loss": 0.80252647, + "learning_rate": 4.921817366297938e-07, + "loss": 0.8172546, + "num_input_tokens_seen": 279427705, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.2598877, + "step": 12951, + "time_per_iteration": 2.640488386154175 + }, + { + "auxiliary_loss_clip": 0.01242397, + "auxiliary_loss_mlp": 0.00225071, + "balance_loss_clip": 1.02335501, + "balance_loss_mlp": 0.1993103, + "epoch": 0.7787163685555388, + "flos": 25739440243200.0, + "grad_norm": 30.611847150403097, + "language_loss": 0.7687552, + "learning_rate": 4.919258971878877e-07, + "loss": 0.78342992, + "num_input_tokens_seen": 279448215, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.25744629, + "step": 12952, + "time_per_iteration": 2.737388849258423 + }, + { + "auxiliary_loss_clip": 0.01212014, + "auxiliary_loss_mlp": 0.00191579, + "balance_loss_clip": 1.00455987, + "balance_loss_mlp": 0.16690287, + "epoch": 0.7787764918082068, + "flos": 22747722416640.0, + "grad_norm": 52.40796550303848, + "language_loss": 0.87100834, + "learning_rate": 4.916701149323022e-07, + "loss": 0.88504422, + "num_input_tokens_seen": 279466260, + "router_z_loss_clip": 2.07421875, + "router_z_loss_mlp": 0.24658203, + "step": 12953, + "time_per_iteration": 2.684340715408325 + }, + { + "auxiliary_loss_clip": 0.01248908, + "auxiliary_loss_mlp": 0.00196229, + "balance_loss_clip": 1.02683055, + "balance_loss_mlp": 0.17180267, + "epoch": 0.7788366150608748, + "flos": 15190860430080.0, + "grad_norm": 22.125562718542316, + "language_loss": 0.8595252, + "learning_rate": 4.91414389872737e-07, + "loss": 0.87397659, + "num_input_tokens_seen": 279484520, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.2442627, + "step": 12954, + "time_per_iteration": 2.6449499130249023 + }, + { + "auxiliary_loss_clip": 0.01240587, + "auxiliary_loss_mlp": 0.00223882, + "balance_loss_clip": 1.01812279, + "balance_loss_mlp": 0.19870487, + "epoch": 0.7788967383135428, + "flos": 21210242618880.0, + "grad_norm": 6.9378641327134964, + "language_loss": 0.786762, + "learning_rate": 4.911587220188905e-07, + "loss": 0.80140674, + "num_input_tokens_seen": 279503130, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.25183105, + "step": 12955, + "time_per_iteration": 2.6917474269866943 + }, + { + "auxiliary_loss_clip": 0.01227341, + "auxiliary_loss_mlp": 0.0020817, + "balance_loss_clip": 1.01394022, + "balance_loss_mlp": 0.18268323, + "epoch": 0.7789568615662107, + "flos": 21682970536320.0, + "grad_norm": 15.924674813752498, + "language_loss": 0.75073588, + "learning_rate": 4.909031113804551e-07, + "loss": 0.76509094, + "num_input_tokens_seen": 279521930, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.25488281, + "step": 12956, + "time_per_iteration": 2.6332123279571533 + }, + { + "auxiliary_loss_clip": 0.01229458, + "auxiliary_loss_mlp": 0.00236013, + "balance_loss_clip": 1.01116848, + "balance_loss_mlp": 0.21071669, + "epoch": 0.7790169848188787, + "flos": 26360371676160.0, + "grad_norm": 216.30364465727632, + "language_loss": 0.84790164, + "learning_rate": 4.906475579671252e-07, + "loss": 0.86255634, + "num_input_tokens_seen": 279542375, + "router_z_loss_clip": 2.18066406, + "router_z_loss_mlp": 0.25292969, + "step": 12957, + "time_per_iteration": 2.6980607509613037 + }, + { + "auxiliary_loss_clip": 0.0123716, + "auxiliary_loss_mlp": 0.00228278, + "balance_loss_clip": 1.01836574, + "balance_loss_mlp": 0.2020279, + "epoch": 0.7790771080715466, + "flos": 25516183259520.0, + "grad_norm": 3.388980000085202, + "language_loss": 0.84069812, + "learning_rate": 4.903920617885917e-07, + "loss": 0.8553524, + "num_input_tokens_seen": 279561885, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.26245117, + "step": 12958, + "time_per_iteration": 2.675447940826416 + }, + { + "auxiliary_loss_clip": 0.012428, + "auxiliary_loss_mlp": 0.00219841, + "balance_loss_clip": 1.02209425, + "balance_loss_mlp": 0.1935908, + "epoch": 0.7791372313242146, + "flos": 16034186920320.0, + "grad_norm": 6.4599828169369875, + "language_loss": 0.79464966, + "learning_rate": 4.901366228545418e-07, + "loss": 0.80927604, + "num_input_tokens_seen": 279579965, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.26269531, + "step": 12959, + "time_per_iteration": 2.7068676948547363 + }, + { + "auxiliary_loss_clip": 0.0124765, + "auxiliary_loss_mlp": 0.00223672, + "balance_loss_clip": 1.02352118, + "balance_loss_mlp": 0.19805422, + "epoch": 0.7791973545768827, + "flos": 23842207779840.0, + "grad_norm": 10.154486464000113, + "language_loss": 0.84727383, + "learning_rate": 4.898812411746632e-07, + "loss": 0.86198705, + "num_input_tokens_seen": 279599030, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.25610352, + "step": 12960, + "time_per_iteration": 2.655852794647217 + }, + { + "auxiliary_loss_clip": 0.01270769, + "auxiliary_loss_mlp": 0.00210845, + "balance_loss_clip": 1.04062307, + "balance_loss_mlp": 0.18610877, + "epoch": 0.7792574778295506, + "flos": 24168384207360.0, + "grad_norm": 214.15558939047776, + "language_loss": 0.84161729, + "learning_rate": 4.896259167586385e-07, + "loss": 0.85643339, + "num_input_tokens_seen": 279614400, + "router_z_loss_clip": 2.30273438, + "router_z_loss_mlp": 0.24731445, + "step": 12961, + "time_per_iteration": 2.686166763305664 + }, + { + "auxiliary_loss_clip": 0.01229885, + "auxiliary_loss_mlp": 0.00211233, + "balance_loss_clip": 1.01827526, + "balance_loss_mlp": 0.18771276, + "epoch": 0.7793176010822186, + "flos": 21464921024640.0, + "grad_norm": 4.710212835537806, + "language_loss": 0.79902619, + "learning_rate": 4.893706496161511e-07, + "loss": 0.81343734, + "num_input_tokens_seen": 279633745, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.23522949, + "step": 12962, + "time_per_iteration": 2.6445977687835693 + }, + { + "auxiliary_loss_clip": 0.01224431, + "auxiliary_loss_mlp": 0.00193022, + "balance_loss_clip": 1.00726497, + "balance_loss_mlp": 0.16832227, + "epoch": 0.7793777243348865, + "flos": 20666699038080.0, + "grad_norm": 176.05614417214215, + "language_loss": 0.79526484, + "learning_rate": 4.891154397568795e-07, + "loss": 0.80943936, + "num_input_tokens_seen": 279651165, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.24694824, + "step": 12963, + "time_per_iteration": 2.6311538219451904 + }, + { + "auxiliary_loss_clip": 0.01226244, + "auxiliary_loss_mlp": 0.00205776, + "balance_loss_clip": 1.01272655, + "balance_loss_mlp": 0.18171927, + "epoch": 0.7794378475875545, + "flos": 27125771610240.0, + "grad_norm": 61.57601564144057, + "language_loss": 0.735165, + "learning_rate": 4.888602871905019e-07, + "loss": 0.74948519, + "num_input_tokens_seen": 279671175, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.24060059, + "step": 12964, + "time_per_iteration": 2.6878037452697754 + }, + { + "auxiliary_loss_clip": 0.01238857, + "auxiliary_loss_mlp": 0.00209892, + "balance_loss_clip": 1.01887214, + "balance_loss_mlp": 0.1834397, + "epoch": 0.7794979708402224, + "flos": 28074136446720.0, + "grad_norm": 76.4365587660423, + "language_loss": 0.82380384, + "learning_rate": 4.88605191926694e-07, + "loss": 0.83829129, + "num_input_tokens_seen": 279688675, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.26452637, + "step": 12965, + "time_per_iteration": 2.7241263389587402 + }, + { + "auxiliary_loss_clip": 0.01215041, + "auxiliary_loss_mlp": 0.00208863, + "balance_loss_clip": 1.00483048, + "balance_loss_mlp": 0.18348336, + "epoch": 0.7795580940928905, + "flos": 26869548919680.0, + "grad_norm": 64.23062658846396, + "language_loss": 0.77093256, + "learning_rate": 4.883501539751289e-07, + "loss": 0.78517157, + "num_input_tokens_seen": 279710245, + "router_z_loss_clip": 2.10351562, + "router_z_loss_mlp": 0.25378418, + "step": 12966, + "time_per_iteration": 2.7625341415405273 + }, + { + "auxiliary_loss_clip": 0.01239835, + "auxiliary_loss_mlp": 0.00183471, + "balance_loss_clip": 1.02278447, + "balance_loss_mlp": 0.16016531, + "epoch": 0.7796182173455584, + "flos": 23835384195840.0, + "grad_norm": 16.47102550936087, + "language_loss": 0.81968176, + "learning_rate": 4.880951733454768e-07, + "loss": 0.83391482, + "num_input_tokens_seen": 279729045, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.23303223, + "step": 12967, + "time_per_iteration": 2.732750177383423 + }, + { + "auxiliary_loss_clip": 0.01236622, + "auxiliary_loss_mlp": 0.00206884, + "balance_loss_clip": 1.01646042, + "balance_loss_mlp": 0.18146902, + "epoch": 0.7796783405982264, + "flos": 19792238434560.0, + "grad_norm": 19.816011279085792, + "language_loss": 0.83481407, + "learning_rate": 4.878402500474073e-07, + "loss": 0.84924912, + "num_input_tokens_seen": 279748350, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.25415039, + "step": 12968, + "time_per_iteration": 2.706890344619751 + }, + { + "auxiliary_loss_clip": 0.01238233, + "auxiliary_loss_mlp": 0.00206575, + "balance_loss_clip": 1.01947582, + "balance_loss_mlp": 0.18168369, + "epoch": 0.7797384638508943, + "flos": 15450207603840.0, + "grad_norm": 1194.8211577592128, + "language_loss": 0.7152757, + "learning_rate": 4.875853840905874e-07, + "loss": 0.72972375, + "num_input_tokens_seen": 279765620, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.24914551, + "step": 12969, + "time_per_iteration": 2.6947693824768066 + }, + { + "auxiliary_loss_clip": 0.01204048, + "auxiliary_loss_mlp": 0.00201587, + "balance_loss_clip": 0.9946388, + "balance_loss_mlp": 0.17854388, + "epoch": 0.7797985871035623, + "flos": 20922742160640.0, + "grad_norm": 24.520613527178615, + "language_loss": 0.77240443, + "learning_rate": 4.873305754846811e-07, + "loss": 0.78646076, + "num_input_tokens_seen": 279782485, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.23059082, + "step": 12970, + "time_per_iteration": 2.675076723098755 + }, + { + "auxiliary_loss_clip": 0.01240123, + "auxiliary_loss_mlp": 0.00205205, + "balance_loss_clip": 1.02601004, + "balance_loss_mlp": 0.18007557, + "epoch": 0.7798587103562302, + "flos": 36937212514560.0, + "grad_norm": 6.627705101011302, + "language_loss": 0.80549705, + "learning_rate": 4.870758242393507e-07, + "loss": 0.81995034, + "num_input_tokens_seen": 279804170, + "router_z_loss_clip": 2.13964844, + "router_z_loss_mlp": 0.2512207, + "step": 12971, + "time_per_iteration": 2.7911126613616943 + }, + { + "auxiliary_loss_clip": 0.01247053, + "auxiliary_loss_mlp": 0.00226441, + "balance_loss_clip": 1.02416813, + "balance_loss_mlp": 0.19857022, + "epoch": 0.7799188336088982, + "flos": 22419283432320.0, + "grad_norm": 572.0207609201108, + "language_loss": 0.82789207, + "learning_rate": 4.868211303642578e-07, + "loss": 0.84262699, + "num_input_tokens_seen": 279823730, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.27880859, + "step": 12972, + "time_per_iteration": 2.6765475273132324 + }, + { + "auxiliary_loss_clip": 0.01255881, + "auxiliary_loss_mlp": 0.00218576, + "balance_loss_clip": 1.03476477, + "balance_loss_mlp": 0.19148004, + "epoch": 0.7799789568615663, + "flos": 18880466578560.0, + "grad_norm": 5.244344554093659, + "language_loss": 0.81381452, + "learning_rate": 4.865664938690584e-07, + "loss": 0.82855904, + "num_input_tokens_seen": 279843035, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.27111816, + "step": 12973, + "time_per_iteration": 2.623101234436035 + }, + { + "auxiliary_loss_clip": 0.01234888, + "auxiliary_loss_mlp": 0.00202184, + "balance_loss_clip": 1.02031136, + "balance_loss_mlp": 0.17891453, + "epoch": 0.7800390801142342, + "flos": 20262272832000.0, + "grad_norm": 53.311556130900925, + "language_loss": 0.85902131, + "learning_rate": 4.863119147634089e-07, + "loss": 0.87339199, + "num_input_tokens_seen": 279861450, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.23266602, + "step": 12974, + "time_per_iteration": 4.19887113571167 + }, + { + "auxiliary_loss_clip": 0.01239219, + "auxiliary_loss_mlp": 0.00214949, + "balance_loss_clip": 1.02610302, + "balance_loss_mlp": 0.19039237, + "epoch": 0.7800992033669022, + "flos": 16690310703360.0, + "grad_norm": 23.134253299711727, + "language_loss": 0.7643441, + "learning_rate": 4.86057393056964e-07, + "loss": 0.77888578, + "num_input_tokens_seen": 279878660, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.24584961, + "step": 12975, + "time_per_iteration": 2.6556384563446045 + }, + { + "auxiliary_loss_clip": 0.01235871, + "auxiliary_loss_mlp": 0.00212663, + "balance_loss_clip": 1.01733637, + "balance_loss_mlp": 0.18771216, + "epoch": 0.7801593266195701, + "flos": 18585208782720.0, + "grad_norm": 3966.892530472151, + "language_loss": 0.89908206, + "learning_rate": 4.858029287593739e-07, + "loss": 0.91356742, + "num_input_tokens_seen": 279895685, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.24963379, + "step": 12976, + "time_per_iteration": 4.028537273406982 + }, + { + "auxiliary_loss_clip": 0.01240761, + "auxiliary_loss_mlp": 0.00196871, + "balance_loss_clip": 1.02153945, + "balance_loss_mlp": 0.17151567, + "epoch": 0.7802194498722381, + "flos": 25484941405440.0, + "grad_norm": 7.270277207444462, + "language_loss": 0.72818255, + "learning_rate": 4.85548521880289e-07, + "loss": 0.74255884, + "num_input_tokens_seen": 279917240, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.2532959, + "step": 12977, + "time_per_iteration": 2.685765504837036 + }, + { + "auxiliary_loss_clip": 0.01243859, + "auxiliary_loss_mlp": 0.0021668, + "balance_loss_clip": 1.02448618, + "balance_loss_mlp": 0.18978646, + "epoch": 0.780279573124906, + "flos": 31176315573120.0, + "grad_norm": 144.7963509457103, + "language_loss": 0.81020421, + "learning_rate": 4.852941724293554e-07, + "loss": 0.82480961, + "num_input_tokens_seen": 279938665, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.26916504, + "step": 12978, + "time_per_iteration": 2.840775728225708 + }, + { + "auxiliary_loss_clip": 0.01265322, + "auxiliary_loss_mlp": 0.00208623, + "balance_loss_clip": 1.03361225, + "balance_loss_mlp": 0.18205127, + "epoch": 0.780339696377574, + "flos": 26944027770240.0, + "grad_norm": 13.276446925128756, + "language_loss": 0.71842343, + "learning_rate": 4.85039880416219e-07, + "loss": 0.73316288, + "num_input_tokens_seen": 279957965, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.26623535, + "step": 12979, + "time_per_iteration": 2.690765380859375 + }, + { + "auxiliary_loss_clip": 0.01226807, + "auxiliary_loss_mlp": 0.00203819, + "balance_loss_clip": 1.01396048, + "balance_loss_mlp": 0.18033446, + "epoch": 0.780399819630242, + "flos": 27957426180480.0, + "grad_norm": 42.83677708973445, + "language_loss": 0.85718369, + "learning_rate": 4.847856458505217e-07, + "loss": 0.87148988, + "num_input_tokens_seen": 279977490, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.23486328, + "step": 12980, + "time_per_iteration": 4.339831829071045 + }, + { + "auxiliary_loss_clip": 0.01248059, + "auxiliary_loss_mlp": 0.00214052, + "balance_loss_clip": 1.02365279, + "balance_loss_mlp": 0.18727753, + "epoch": 0.78045994288291, + "flos": 22486795044480.0, + "grad_norm": 11.340840567328787, + "language_loss": 0.85048962, + "learning_rate": 4.845314687419046e-07, + "loss": 0.86511075, + "num_input_tokens_seen": 279994220, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.26794434, + "step": 12981, + "time_per_iteration": 2.688406229019165 + }, + { + "auxiliary_loss_clip": 0.01236756, + "auxiliary_loss_mlp": 0.00225578, + "balance_loss_clip": 1.01801801, + "balance_loss_mlp": 0.19903025, + "epoch": 0.7805200661355779, + "flos": 20850849089280.0, + "grad_norm": 6.018409720981341, + "language_loss": 0.82019055, + "learning_rate": 4.842773491000067e-07, + "loss": 0.83481389, + "num_input_tokens_seen": 280012590, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.26550293, + "step": 12982, + "time_per_iteration": 2.6811201572418213 + }, + { + "auxiliary_loss_clip": 0.01234522, + "auxiliary_loss_mlp": 0.00189131, + "balance_loss_clip": 1.01832247, + "balance_loss_mlp": 0.16486031, + "epoch": 0.7805801893882459, + "flos": 25665966973440.0, + "grad_norm": 3.336172361233026, + "language_loss": 0.79078364, + "learning_rate": 4.840232869344636e-07, + "loss": 0.80502015, + "num_input_tokens_seen": 280033700, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.24255371, + "step": 12983, + "time_per_iteration": 2.7027289867401123 + }, + { + "auxiliary_loss_clip": 0.01238815, + "auxiliary_loss_mlp": 0.00205977, + "balance_loss_clip": 1.02448559, + "balance_loss_mlp": 0.18212286, + "epoch": 0.7806403126409138, + "flos": 11327806483200.0, + "grad_norm": 5.2158170134815665, + "language_loss": 0.84790134, + "learning_rate": 4.837692822549086e-07, + "loss": 0.86234927, + "num_input_tokens_seen": 280052215, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.23840332, + "step": 12984, + "time_per_iteration": 2.655400514602661 + }, + { + "auxiliary_loss_clip": 0.01260578, + "auxiliary_loss_mlp": 0.00207973, + "balance_loss_clip": 1.03319991, + "balance_loss_mlp": 0.18391635, + "epoch": 0.7807004358935818, + "flos": 19573362910080.0, + "grad_norm": 31.016882211534934, + "language_loss": 0.91902816, + "learning_rate": 4.835153350709746e-07, + "loss": 0.93371367, + "num_input_tokens_seen": 280070525, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.24060059, + "step": 12985, + "time_per_iteration": 4.185423135757446 + }, + { + "auxiliary_loss_clip": 0.01223625, + "auxiliary_loss_mlp": 0.00201937, + "balance_loss_clip": 1.00615382, + "balance_loss_mlp": 0.17641392, + "epoch": 0.7807605591462499, + "flos": 19135827342720.0, + "grad_norm": 8.515762815659226, + "language_loss": 0.84195393, + "learning_rate": 4.832614453922915e-07, + "loss": 0.85620958, + "num_input_tokens_seen": 280089855, + "router_z_loss_clip": 2.17285156, + "router_z_loss_mlp": 0.25512695, + "step": 12986, + "time_per_iteration": 2.68963885307312 + }, + { + "auxiliary_loss_clip": 0.01240006, + "auxiliary_loss_mlp": 0.00205923, + "balance_loss_clip": 1.01931369, + "balance_loss_mlp": 0.18091336, + "epoch": 0.7808206823989178, + "flos": 32374654133760.0, + "grad_norm": 216.57746776625424, + "language_loss": 0.82788873, + "learning_rate": 4.830076132284859e-07, + "loss": 0.84234804, + "num_input_tokens_seen": 280109960, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.25, + "step": 12987, + "time_per_iteration": 2.752666473388672 + }, + { + "auxiliary_loss_clip": 0.01081914, + "auxiliary_loss_mlp": 0.00092796, + "balance_loss_clip": 0.92777896, + "balance_loss_mlp": 0.08440325, + "epoch": 0.7808808056515858, + "flos": 55050235061760.0, + "grad_norm": 0.7157086139320568, + "language_loss": 0.54215562, + "learning_rate": 4.82753838589184e-07, + "loss": 0.55390275, + "num_input_tokens_seen": 280169805, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.08398438, + "step": 12988, + "time_per_iteration": 3.160268783569336 + }, + { + "auxiliary_loss_clip": 0.012414, + "auxiliary_loss_mlp": 0.00226416, + "balance_loss_clip": 1.02134717, + "balance_loss_mlp": 0.20128691, + "epoch": 0.7809409289042537, + "flos": 12859468277760.0, + "grad_norm": 348.35507267326375, + "language_loss": 0.89019382, + "learning_rate": 4.82500121484009e-07, + "loss": 0.90487194, + "num_input_tokens_seen": 280184630, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.25146484, + "step": 12989, + "time_per_iteration": 2.6547319889068604 + }, + { + "auxiliary_loss_clip": 0.01214229, + "auxiliary_loss_mlp": 0.00215859, + "balance_loss_clip": 1.00558805, + "balance_loss_mlp": 0.19134969, + "epoch": 0.7810010521569217, + "flos": 21687244254720.0, + "grad_norm": 27.594626530983863, + "language_loss": 0.78495353, + "learning_rate": 4.822464619225806e-07, + "loss": 0.79925442, + "num_input_tokens_seen": 280203880, + "router_z_loss_clip": 2.0859375, + "router_z_loss_mlp": 0.24487305, + "step": 12990, + "time_per_iteration": 2.692359209060669 + }, + { + "auxiliary_loss_clip": 0.0123957, + "auxiliary_loss_mlp": 0.00224916, + "balance_loss_clip": 1.02092791, + "balance_loss_mlp": 0.19759315, + "epoch": 0.7810611754095896, + "flos": 16757068129920.0, + "grad_norm": 65.40421069665753, + "language_loss": 0.8392899, + "learning_rate": 4.819928599145184e-07, + "loss": 0.85393476, + "num_input_tokens_seen": 280220460, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.27355957, + "step": 12991, + "time_per_iteration": 2.6360480785369873 + }, + { + "auxiliary_loss_clip": 0.01240331, + "auxiliary_loss_mlp": 0.00210779, + "balance_loss_clip": 1.02231383, + "balance_loss_mlp": 0.18620971, + "epoch": 0.7811212986622577, + "flos": 43507464658560.0, + "grad_norm": 46.72349023065151, + "language_loss": 0.74909538, + "learning_rate": 4.817393154694398e-07, + "loss": 0.76360643, + "num_input_tokens_seen": 280242680, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.24572754, + "step": 12992, + "time_per_iteration": 2.8671185970306396 + }, + { + "auxiliary_loss_clip": 0.01244849, + "auxiliary_loss_mlp": 0.00200938, + "balance_loss_clip": 1.02108812, + "balance_loss_mlp": 0.1743902, + "epoch": 0.7811814219149256, + "flos": 21757700782080.0, + "grad_norm": 16.704388674453845, + "language_loss": 0.7069115, + "learning_rate": 4.814858285969578e-07, + "loss": 0.72136939, + "num_input_tokens_seen": 280260655, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.265625, + "step": 12993, + "time_per_iteration": 2.6308515071868896 + }, + { + "auxiliary_loss_clip": 0.01224138, + "auxiliary_loss_mlp": 0.00208969, + "balance_loss_clip": 1.00886285, + "balance_loss_mlp": 0.18305308, + "epoch": 0.7812415451675936, + "flos": 24061514267520.0, + "grad_norm": 40.30184772489457, + "language_loss": 0.77144712, + "learning_rate": 4.812323993066862e-07, + "loss": 0.78577816, + "num_input_tokens_seen": 280281185, + "router_z_loss_clip": 2.15722656, + "router_z_loss_mlp": 0.25915527, + "step": 12994, + "time_per_iteration": 2.68083119392395 + }, + { + "auxiliary_loss_clip": 0.01240716, + "auxiliary_loss_mlp": 0.00213214, + "balance_loss_clip": 1.02509665, + "balance_loss_mlp": 0.18969376, + "epoch": 0.7813016684202615, + "flos": 18989706816000.0, + "grad_norm": 21.700218086742687, + "language_loss": 0.79209661, + "learning_rate": 4.809790276082335e-07, + "loss": 0.80663592, + "num_input_tokens_seen": 280298255, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.23547363, + "step": 12995, + "time_per_iteration": 2.6370160579681396 + }, + { + "auxiliary_loss_clip": 0.01213747, + "auxiliary_loss_mlp": 0.00198187, + "balance_loss_clip": 1.00520778, + "balance_loss_mlp": 0.17379683, + "epoch": 0.7813617916729295, + "flos": 25260786581760.0, + "grad_norm": 126.24769943570153, + "language_loss": 0.80518121, + "learning_rate": 4.807257135112088e-07, + "loss": 0.81930053, + "num_input_tokens_seen": 280319000, + "router_z_loss_clip": 2.0859375, + "router_z_loss_mlp": 0.24389648, + "step": 12996, + "time_per_iteration": 2.6807973384857178 + }, + { + "auxiliary_loss_clip": 0.01273483, + "auxiliary_loss_mlp": 0.00219812, + "balance_loss_clip": 1.04696953, + "balance_loss_mlp": 0.19371663, + "epoch": 0.7814219149255974, + "flos": 17966037116160.0, + "grad_norm": 79.12754483325044, + "language_loss": 0.84496796, + "learning_rate": 4.804724570252167e-07, + "loss": 0.85990089, + "num_input_tokens_seen": 280336375, + "router_z_loss_clip": 2.265625, + "router_z_loss_mlp": 0.26086426, + "step": 12997, + "time_per_iteration": 2.6297659873962402 + }, + { + "auxiliary_loss_clip": 0.01264085, + "auxiliary_loss_mlp": 0.00220584, + "balance_loss_clip": 1.03296685, + "balance_loss_mlp": 0.191688, + "epoch": 0.7814820381782654, + "flos": 25776176878080.0, + "grad_norm": 3.1003174886589413, + "language_loss": 0.89390993, + "learning_rate": 4.802192581598614e-07, + "loss": 0.90875655, + "num_input_tokens_seen": 280358760, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.28857422, + "step": 12998, + "time_per_iteration": 2.6963419914245605 + }, + { + "auxiliary_loss_clip": 0.01269822, + "auxiliary_loss_mlp": 0.00230108, + "balance_loss_clip": 1.03637862, + "balance_loss_mlp": 0.20386991, + "epoch": 0.7815421614309335, + "flos": 20519572930560.0, + "grad_norm": 66.53097259659732, + "language_loss": 0.83727586, + "learning_rate": 4.799661169247453e-07, + "loss": 0.85227519, + "num_input_tokens_seen": 280377085, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.2623291, + "step": 12999, + "time_per_iteration": 2.674636125564575 + }, + { + "auxiliary_loss_clip": 0.01261524, + "auxiliary_loss_mlp": 0.00213301, + "balance_loss_clip": 1.03362322, + "balance_loss_mlp": 0.18690783, + "epoch": 0.7816022846836014, + "flos": 21287666384640.0, + "grad_norm": 4.5006573519597906, + "language_loss": 0.92253995, + "learning_rate": 4.797130333294652e-07, + "loss": 0.93728817, + "num_input_tokens_seen": 280395465, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.2635498, + "step": 13000, + "time_per_iteration": 2.630629062652588 + }, + { + "auxiliary_loss_clip": 0.01263142, + "auxiliary_loss_mlp": 0.00225536, + "balance_loss_clip": 1.04046845, + "balance_loss_mlp": 0.19956063, + "epoch": 0.7816624079362694, + "flos": 19208402772480.0, + "grad_norm": 74.64648707972255, + "language_loss": 0.74299186, + "learning_rate": 4.794600073836192e-07, + "loss": 0.7578786, + "num_input_tokens_seen": 280412775, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.25964355, + "step": 13001, + "time_per_iteration": 2.634693145751953 + }, + { + "auxiliary_loss_clip": 0.01250455, + "auxiliary_loss_mlp": 0.00207509, + "balance_loss_clip": 1.0311811, + "balance_loss_mlp": 0.18252289, + "epoch": 0.7817225311889373, + "flos": 26104687689600.0, + "grad_norm": 5.850436765836123, + "language_loss": 0.75330031, + "learning_rate": 4.792070390968027e-07, + "loss": 0.76787996, + "num_input_tokens_seen": 280432905, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.24987793, + "step": 13002, + "time_per_iteration": 2.7151730060577393 + }, + { + "auxiliary_loss_clip": 0.01266662, + "auxiliary_loss_mlp": 0.00206818, + "balance_loss_clip": 1.04057598, + "balance_loss_mlp": 0.1788041, + "epoch": 0.7817826544416053, + "flos": 21250929749760.0, + "grad_norm": 41.13321969585581, + "language_loss": 0.85435021, + "learning_rate": 4.78954128478607e-07, + "loss": 0.86908495, + "num_input_tokens_seen": 280450785, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.2800293, + "step": 13003, + "time_per_iteration": 2.676223039627075 + }, + { + "auxiliary_loss_clip": 0.01235943, + "auxiliary_loss_mlp": 0.00212234, + "balance_loss_clip": 1.01645446, + "balance_loss_mlp": 0.18784334, + "epoch": 0.7818427776942732, + "flos": 19932181822080.0, + "grad_norm": 7.402755922226686, + "language_loss": 0.70831764, + "learning_rate": 4.787012755386233e-07, + "loss": 0.72279942, + "num_input_tokens_seen": 280468400, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.24401855, + "step": 13004, + "time_per_iteration": 2.6530966758728027 + }, + { + "auxiliary_loss_clip": 0.01220766, + "auxiliary_loss_mlp": 0.00194772, + "balance_loss_clip": 1.00721192, + "balance_loss_mlp": 0.16985707, + "epoch": 0.7819029009469413, + "flos": 11363753018880.0, + "grad_norm": 7.087857769835801, + "language_loss": 0.91359985, + "learning_rate": 4.784484802864403e-07, + "loss": 0.92775518, + "num_input_tokens_seen": 280483930, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.24951172, + "step": 13005, + "time_per_iteration": 2.679480791091919 + }, + { + "auxiliary_loss_clip": 0.01232972, + "auxiliary_loss_mlp": 0.00202456, + "balance_loss_clip": 1.0177834, + "balance_loss_mlp": 0.17841189, + "epoch": 0.7819630241996092, + "flos": 24279276470400.0, + "grad_norm": 2.8750236232222655, + "language_loss": 0.83666992, + "learning_rate": 4.781957427316432e-07, + "loss": 0.85102427, + "num_input_tokens_seen": 280503465, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.24060059, + "step": 13006, + "time_per_iteration": 2.713557720184326 + }, + { + "auxiliary_loss_clip": 0.01263596, + "auxiliary_loss_mlp": 0.0023736, + "balance_loss_clip": 1.03612828, + "balance_loss_mlp": 0.20952451, + "epoch": 0.7820231474522772, + "flos": 22708902792960.0, + "grad_norm": 2.895107710596516, + "language_loss": 0.79188609, + "learning_rate": 4.779430628838157e-07, + "loss": 0.80689561, + "num_input_tokens_seen": 280523375, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.27844238, + "step": 13007, + "time_per_iteration": 2.6942782402038574 + }, + { + "auxiliary_loss_clip": 0.01235858, + "auxiliary_loss_mlp": 0.00210831, + "balance_loss_clip": 1.01869071, + "balance_loss_mlp": 0.18602347, + "epoch": 0.7820832707049451, + "flos": 20047419630720.0, + "grad_norm": 45.3370219888529, + "language_loss": 0.7908752, + "learning_rate": 4.776904407525397e-07, + "loss": 0.80534208, + "num_input_tokens_seen": 280542920, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.24804688, + "step": 13008, + "time_per_iteration": 2.6401000022888184 + }, + { + "auxiliary_loss_clip": 0.01257086, + "auxiliary_loss_mlp": 0.00206938, + "balance_loss_clip": 1.03838873, + "balance_loss_mlp": 0.1824764, + "epoch": 0.7821433939576131, + "flos": 27162795553920.0, + "grad_norm": 39.13966341514411, + "language_loss": 0.76791108, + "learning_rate": 4.774378763473954e-07, + "loss": 0.78255135, + "num_input_tokens_seen": 280561700, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.24487305, + "step": 13009, + "time_per_iteration": 2.7246603965759277 + }, + { + "auxiliary_loss_clip": 0.01248991, + "auxiliary_loss_mlp": 0.00202685, + "balance_loss_clip": 1.02678323, + "balance_loss_mlp": 0.17667331, + "epoch": 0.782203517210281, + "flos": 22602068766720.0, + "grad_norm": 2.009513380332088, + "language_loss": 0.89049268, + "learning_rate": 4.771853696779586e-07, + "loss": 0.90500951, + "num_input_tokens_seen": 280580605, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.26000977, + "step": 13010, + "time_per_iteration": 2.686748743057251 + }, + { + "auxiliary_loss_clip": 0.01234883, + "auxiliary_loss_mlp": 0.0020735, + "balance_loss_clip": 1.02029049, + "balance_loss_mlp": 0.18343671, + "epoch": 0.782263640462949, + "flos": 29059812535680.0, + "grad_norm": 3.1547837062307393, + "language_loss": 0.70061827, + "learning_rate": 4.76932920753806e-07, + "loss": 0.71504056, + "num_input_tokens_seen": 280601495, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.23950195, + "step": 13011, + "time_per_iteration": 2.7727651596069336 + }, + { + "auxiliary_loss_clip": 0.01249579, + "auxiliary_loss_mlp": 0.00221657, + "balance_loss_clip": 1.03215432, + "balance_loss_mlp": 0.19863737, + "epoch": 0.782323763715617, + "flos": 25299498464640.0, + "grad_norm": 54.66002192406816, + "language_loss": 0.7620362, + "learning_rate": 4.7668052958450913e-07, + "loss": 0.77674854, + "num_input_tokens_seen": 280622760, + "router_z_loss_clip": 2.17285156, + "router_z_loss_mlp": 0.22998047, + "step": 13012, + "time_per_iteration": 2.751333236694336 + }, + { + "auxiliary_loss_clip": 0.01083054, + "auxiliary_loss_mlp": 0.00037655, + "balance_loss_clip": 0.92865467, + "balance_loss_mlp": 0.03136034, + "epoch": 0.782383886968285, + "flos": 65194388668800.0, + "grad_norm": 0.6884724403952267, + "language_loss": 0.54362071, + "learning_rate": 4.764281961796395e-07, + "loss": 0.55482781, + "num_input_tokens_seen": 280687115, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.06298828, + "step": 13013, + "time_per_iteration": 3.2975046634674072 + }, + { + "auxiliary_loss_clip": 0.01258402, + "auxiliary_loss_mlp": 0.00220531, + "balance_loss_clip": 1.03880501, + "balance_loss_mlp": 0.19527024, + "epoch": 0.782444010220953, + "flos": 18405440190720.0, + "grad_norm": 12.226302738309105, + "language_loss": 0.75910866, + "learning_rate": 4.76175920548765e-07, + "loss": 0.77389801, + "num_input_tokens_seen": 280705000, + "router_z_loss_clip": 2.19238281, + "router_z_loss_mlp": 0.25280762, + "step": 13014, + "time_per_iteration": 2.6687326431274414 + }, + { + "auxiliary_loss_clip": 0.01077512, + "auxiliary_loss_mlp": 0.00052925, + "balance_loss_clip": 0.92539084, + "balance_loss_mlp": 0.04629658, + "epoch": 0.7825041334736209, + "flos": 63955003841280.0, + "grad_norm": 0.6692692896484363, + "language_loss": 0.57643449, + "learning_rate": 4.759237027014524e-07, + "loss": 0.58773881, + "num_input_tokens_seen": 280773525, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.06640625, + "step": 13015, + "time_per_iteration": 3.2053847312927246 + }, + { + "auxiliary_loss_clip": 0.0121904, + "auxiliary_loss_mlp": 0.00210926, + "balance_loss_clip": 1.00859833, + "balance_loss_mlp": 0.18845467, + "epoch": 0.7825642567262889, + "flos": 20339373375360.0, + "grad_norm": 6.656662790512305, + "language_loss": 0.82373726, + "learning_rate": 4.756715426472666e-07, + "loss": 0.83803689, + "num_input_tokens_seen": 280791915, + "router_z_loss_clip": 2.10351562, + "router_z_loss_mlp": 0.22473145, + "step": 13016, + "time_per_iteration": 4.061009883880615 + }, + { + "auxiliary_loss_clip": 0.01259278, + "auxiliary_loss_mlp": 0.00227876, + "balance_loss_clip": 1.03537583, + "balance_loss_mlp": 0.20167403, + "epoch": 0.7826243799789568, + "flos": 20262955190400.0, + "grad_norm": 39.2249696446195, + "language_loss": 0.82910532, + "learning_rate": 4.7541944039576766e-07, + "loss": 0.84397686, + "num_input_tokens_seen": 280811460, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.26196289, + "step": 13017, + "time_per_iteration": 2.6817708015441895 + }, + { + "auxiliary_loss_clip": 0.01244934, + "auxiliary_loss_mlp": 0.00233364, + "balance_loss_clip": 1.02665389, + "balance_loss_mlp": 0.20879519, + "epoch": 0.7826845032316249, + "flos": 21132926593920.0, + "grad_norm": 4.399713972484514, + "language_loss": 0.84374785, + "learning_rate": 4.7516739595651636e-07, + "loss": 0.85853082, + "num_input_tokens_seen": 280825415, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.24560547, + "step": 13018, + "time_per_iteration": 4.076875686645508 + }, + { + "auxiliary_loss_clip": 0.01242933, + "auxiliary_loss_mlp": 0.00203621, + "balance_loss_clip": 1.02561557, + "balance_loss_mlp": 0.17995746, + "epoch": 0.7827446264842928, + "flos": 22492253911680.0, + "grad_norm": 2.5648425131604657, + "language_loss": 0.82442319, + "learning_rate": 4.749154093390708e-07, + "loss": 0.83888865, + "num_input_tokens_seen": 280845335, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.23693848, + "step": 13019, + "time_per_iteration": 2.6467161178588867 + }, + { + "auxiliary_loss_clip": 0.01239388, + "auxiliary_loss_mlp": 0.00233619, + "balance_loss_clip": 1.02553689, + "balance_loss_mlp": 0.20897821, + "epoch": 0.7828047497369608, + "flos": 28840649702400.0, + "grad_norm": 7.780043470914796, + "language_loss": 0.74295777, + "learning_rate": 4.746634805529852e-07, + "loss": 0.75768781, + "num_input_tokens_seen": 280867145, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.24621582, + "step": 13020, + "time_per_iteration": 2.722555160522461 + }, + { + "auxiliary_loss_clip": 0.01238974, + "auxiliary_loss_mlp": 0.00218469, + "balance_loss_clip": 1.02777386, + "balance_loss_mlp": 0.19546141, + "epoch": 0.7828648729896287, + "flos": 23257689759360.0, + "grad_norm": 12.88693257840787, + "language_loss": 0.70819384, + "learning_rate": 4.7441160960781325e-07, + "loss": 0.72276831, + "num_input_tokens_seen": 280886185, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.23010254, + "step": 13021, + "time_per_iteration": 2.635263204574585 + }, + { + "auxiliary_loss_clip": 0.01231098, + "auxiliary_loss_mlp": 0.00196948, + "balance_loss_clip": 1.01811063, + "balance_loss_mlp": 0.17507286, + "epoch": 0.7829249962422967, + "flos": 25265670831360.0, + "grad_norm": 34.12789000003095, + "language_loss": 0.77330399, + "learning_rate": 4.7415979651310636e-07, + "loss": 0.78758448, + "num_input_tokens_seen": 280907665, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.21887207, + "step": 13022, + "time_per_iteration": 4.201209545135498 + }, + { + "auxiliary_loss_clip": 0.01074264, + "auxiliary_loss_mlp": 0.00048265, + "balance_loss_clip": 0.92379767, + "balance_loss_mlp": 0.04039736, + "epoch": 0.7829851194949646, + "flos": 70722044645760.0, + "grad_norm": 0.6325930228435039, + "language_loss": 0.55720735, + "learning_rate": 4.739080412784131e-07, + "loss": 0.56843269, + "num_input_tokens_seen": 280971405, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.07861328, + "step": 13023, + "time_per_iteration": 3.3985989093780518 + }, + { + "auxiliary_loss_clip": 0.01218991, + "auxiliary_loss_mlp": 0.00219868, + "balance_loss_clip": 1.01012921, + "balance_loss_mlp": 0.19674164, + "epoch": 0.7830452427476327, + "flos": 25660795415040.0, + "grad_norm": 33.78440349031529, + "language_loss": 0.71955562, + "learning_rate": 4.736563439132792e-07, + "loss": 0.73394418, + "num_input_tokens_seen": 280989615, + "router_z_loss_clip": 2.0859375, + "router_z_loss_mlp": 0.23120117, + "step": 13024, + "time_per_iteration": 2.7439496517181396 + }, + { + "auxiliary_loss_clip": 0.01258648, + "auxiliary_loss_mlp": 0.002068, + "balance_loss_clip": 1.03457975, + "balance_loss_mlp": 0.18364927, + "epoch": 0.7831053660003006, + "flos": 22784315397120.0, + "grad_norm": 22.344008753407408, + "language_loss": 0.8430829, + "learning_rate": 4.734047044272498e-07, + "loss": 0.85773742, + "num_input_tokens_seen": 281009450, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.23132324, + "step": 13025, + "time_per_iteration": 2.732992649078369 + }, + { + "auxiliary_loss_clip": 0.01244034, + "auxiliary_loss_mlp": 0.00202701, + "balance_loss_clip": 1.02824318, + "balance_loss_mlp": 0.18039647, + "epoch": 0.7831654892529686, + "flos": 25812267068160.0, + "grad_norm": 5.263063896171608, + "language_loss": 0.87672424, + "learning_rate": 4.731531228298673e-07, + "loss": 0.8911916, + "num_input_tokens_seen": 281028120, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.22302246, + "step": 13026, + "time_per_iteration": 2.696462631225586 + }, + { + "auxiliary_loss_clip": 0.01242468, + "auxiliary_loss_mlp": 0.00199341, + "balance_loss_clip": 1.01849294, + "balance_loss_mlp": 0.17492652, + "epoch": 0.7832256125056366, + "flos": 20771557816320.0, + "grad_norm": 10.898651691031937, + "language_loss": 0.85091627, + "learning_rate": 4.729015991306715e-07, + "loss": 0.86533439, + "num_input_tokens_seen": 281042130, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.2442627, + "step": 13027, + "time_per_iteration": 4.0654919147491455 + }, + { + "auxiliary_loss_clip": 0.0123033, + "auxiliary_loss_mlp": 0.00212278, + "balance_loss_clip": 1.01831841, + "balance_loss_mlp": 0.18875808, + "epoch": 0.7832857357583045, + "flos": 21506541909120.0, + "grad_norm": 2.3374400542342926, + "language_loss": 0.77251518, + "learning_rate": 4.726501333391997e-07, + "loss": 0.78694117, + "num_input_tokens_seen": 281060945, + "router_z_loss_clip": 2.12207031, + "router_z_loss_mlp": 0.23522949, + "step": 13028, + "time_per_iteration": 2.730891704559326 + }, + { + "auxiliary_loss_clip": 0.01257358, + "auxiliary_loss_mlp": 0.00241685, + "balance_loss_clip": 1.0342443, + "balance_loss_mlp": 0.21473137, + "epoch": 0.7833458590109725, + "flos": 18077791305600.0, + "grad_norm": 4.0330594424493516, + "language_loss": 0.76213658, + "learning_rate": 4.7239872546498774e-07, + "loss": 0.77712703, + "num_input_tokens_seen": 281079270, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.26965332, + "step": 13029, + "time_per_iteration": 2.7300047874450684 + }, + { + "auxiliary_loss_clip": 0.01256742, + "auxiliary_loss_mlp": 0.0021924, + "balance_loss_clip": 1.03525209, + "balance_loss_mlp": 0.19397929, + "epoch": 0.7834059822636404, + "flos": 28288738252800.0, + "grad_norm": 34.6268436411174, + "language_loss": 0.88494962, + "learning_rate": 4.721473755175698e-07, + "loss": 0.89970934, + "num_input_tokens_seen": 281099500, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.25244141, + "step": 13030, + "time_per_iteration": 2.941351890563965 + }, + { + "auxiliary_loss_clip": 0.01257371, + "auxiliary_loss_mlp": 0.00207474, + "balance_loss_clip": 1.0352658, + "balance_loss_mlp": 0.18266661, + "epoch": 0.7834661055163085, + "flos": 31686211088640.0, + "grad_norm": 9.172777555960634, + "language_loss": 0.80518484, + "learning_rate": 4.71896083506476e-07, + "loss": 0.81983334, + "num_input_tokens_seen": 281121250, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.24816895, + "step": 13031, + "time_per_iteration": 2.805762767791748 + }, + { + "auxiliary_loss_clip": 0.01254901, + "auxiliary_loss_mlp": 0.00203685, + "balance_loss_clip": 1.02784944, + "balance_loss_mlp": 0.17972437, + "epoch": 0.7835262287689764, + "flos": 12933192942720.0, + "grad_norm": 18.464868939818373, + "language_loss": 0.88580614, + "learning_rate": 4.7164484944123574e-07, + "loss": 0.90039194, + "num_input_tokens_seen": 281138760, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.23937988, + "step": 13032, + "time_per_iteration": 2.6394622325897217 + }, + { + "auxiliary_loss_clip": 0.01251297, + "auxiliary_loss_mlp": 0.0022326, + "balance_loss_clip": 1.03220415, + "balance_loss_mlp": 0.19742706, + "epoch": 0.7835863520216444, + "flos": 16143211676160.0, + "grad_norm": 10.315943972139483, + "language_loss": 0.71066725, + "learning_rate": 4.7139367333137726e-07, + "loss": 0.72541285, + "num_input_tokens_seen": 281157420, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.25842285, + "step": 13033, + "time_per_iteration": 2.609969139099121 + }, + { + "auxiliary_loss_clip": 0.01248926, + "auxiliary_loss_mlp": 0.00216945, + "balance_loss_clip": 1.02854812, + "balance_loss_mlp": 0.19036084, + "epoch": 0.7836464752743123, + "flos": 11509909459200.0, + "grad_norm": 66.33187607592848, + "language_loss": 0.80270666, + "learning_rate": 4.7114255518642255e-07, + "loss": 0.81736529, + "num_input_tokens_seen": 281174620, + "router_z_loss_clip": 2.20214844, + "router_z_loss_mlp": 0.26599121, + "step": 13034, + "time_per_iteration": 2.6741387844085693 + }, + { + "auxiliary_loss_clip": 0.01247765, + "auxiliary_loss_mlp": 0.00206026, + "balance_loss_clip": 1.03020334, + "balance_loss_mlp": 0.18064663, + "epoch": 0.7837065985269803, + "flos": 18223696350720.0, + "grad_norm": 4.670648793059117, + "language_loss": 0.80269861, + "learning_rate": 4.7089149501589555e-07, + "loss": 0.81723654, + "num_input_tokens_seen": 281193865, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.25390625, + "step": 13035, + "time_per_iteration": 2.6385042667388916 + }, + { + "auxiliary_loss_clip": 0.01267538, + "auxiliary_loss_mlp": 0.00225532, + "balance_loss_clip": 1.04205298, + "balance_loss_mlp": 0.19978327, + "epoch": 0.7837667217796482, + "flos": 24754410599040.0, + "grad_norm": 1372.0967908456291, + "language_loss": 0.7580899, + "learning_rate": 4.7064049282931664e-07, + "loss": 0.77302063, + "num_input_tokens_seen": 281212250, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.25769043, + "step": 13036, + "time_per_iteration": 2.665850877761841 + }, + { + "auxiliary_loss_clip": 0.01279603, + "auxiliary_loss_mlp": 0.00226324, + "balance_loss_clip": 1.04964328, + "balance_loss_mlp": 0.20039597, + "epoch": 0.7838268450323163, + "flos": 22383121415040.0, + "grad_norm": 15.54750865131778, + "language_loss": 0.80640656, + "learning_rate": 4.703895486362031e-07, + "loss": 0.82146585, + "num_input_tokens_seen": 281230850, + "router_z_loss_clip": 2.30078125, + "router_z_loss_mlp": 0.25927734, + "step": 13037, + "time_per_iteration": 2.695424795150757 + }, + { + "auxiliary_loss_clip": 0.01248459, + "auxiliary_loss_mlp": 0.00231957, + "balance_loss_clip": 1.02813625, + "balance_loss_mlp": 0.20719719, + "epoch": 0.7838869682849842, + "flos": 19500284689920.0, + "grad_norm": 13.402209979701386, + "language_loss": 0.70922554, + "learning_rate": 4.701386624460717e-07, + "loss": 0.72402966, + "num_input_tokens_seen": 281249810, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.24780273, + "step": 13038, + "time_per_iteration": 2.60798716545105 + }, + { + "auxiliary_loss_clip": 0.0124983, + "auxiliary_loss_mlp": 0.00204265, + "balance_loss_clip": 1.03196955, + "balance_loss_mlp": 0.17954105, + "epoch": 0.7839470915376522, + "flos": 32892845690880.0, + "grad_norm": 2.454463442078956, + "language_loss": 0.76992869, + "learning_rate": 4.698878342684349e-07, + "loss": 0.78446966, + "num_input_tokens_seen": 281273730, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.24743652, + "step": 13039, + "time_per_iteration": 2.7684693336486816 + }, + { + "auxiliary_loss_clip": 0.01223497, + "auxiliary_loss_mlp": 0.00194096, + "balance_loss_clip": 1.01106453, + "balance_loss_mlp": 0.17143445, + "epoch": 0.7840072147903202, + "flos": 29676003373440.0, + "grad_norm": 17.50059297099022, + "language_loss": 0.75294912, + "learning_rate": 4.6963706411280537e-07, + "loss": 0.76712501, + "num_input_tokens_seen": 281293670, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.2265625, + "step": 13040, + "time_per_iteration": 2.720083475112915 + }, + { + "auxiliary_loss_clip": 0.01243058, + "auxiliary_loss_mlp": 0.00223918, + "balance_loss_clip": 1.02125573, + "balance_loss_mlp": 0.19924167, + "epoch": 0.7840673380429881, + "flos": 18186744234240.0, + "grad_norm": 22.435007703621835, + "language_loss": 0.7550056, + "learning_rate": 4.6938635198869116e-07, + "loss": 0.76967537, + "num_input_tokens_seen": 281313070, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.2467041, + "step": 13041, + "time_per_iteration": 2.7100839614868164 + }, + { + "auxiliary_loss_clip": 0.01079114, + "auxiliary_loss_mlp": 0.00054833, + "balance_loss_clip": 0.92805266, + "balance_loss_mlp": 0.04815757, + "epoch": 0.7841274612956561, + "flos": 66346006613760.0, + "grad_norm": 0.6424405194281785, + "language_loss": 0.56741017, + "learning_rate": 4.691356979055998e-07, + "loss": 0.57874966, + "num_input_tokens_seen": 281374880, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.06689453, + "step": 13042, + "time_per_iteration": 3.1545472145080566 + }, + { + "auxiliary_loss_clip": 0.01243556, + "auxiliary_loss_mlp": 0.00219837, + "balance_loss_clip": 1.02498233, + "balance_loss_mlp": 0.19605453, + "epoch": 0.784187584548324, + "flos": 26648482665600.0, + "grad_norm": 104.9232247249522, + "language_loss": 0.93292975, + "learning_rate": 4.688851018730369e-07, + "loss": 0.94756365, + "num_input_tokens_seen": 281392620, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.23791504, + "step": 13043, + "time_per_iteration": 2.7251017093658447 + }, + { + "auxiliary_loss_clip": 0.01255686, + "auxiliary_loss_mlp": 0.00203846, + "balance_loss_clip": 1.03604794, + "balance_loss_mlp": 0.17961107, + "epoch": 0.7842477078009921, + "flos": 25740158515200.0, + "grad_norm": 8.969652152178636, + "language_loss": 0.92640638, + "learning_rate": 4.6863456390050425e-07, + "loss": 0.94100171, + "num_input_tokens_seen": 281413140, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.24230957, + "step": 13044, + "time_per_iteration": 2.7122583389282227 + }, + { + "auxiliary_loss_clip": 0.01265759, + "auxiliary_loss_mlp": 0.00221516, + "balance_loss_clip": 1.03971148, + "balance_loss_mlp": 0.19695854, + "epoch": 0.78430783105366, + "flos": 21980957765760.0, + "grad_norm": 19.717843262073053, + "language_loss": 0.86739218, + "learning_rate": 4.6838408399750195e-07, + "loss": 0.88226491, + "num_input_tokens_seen": 281430860, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.24560547, + "step": 13045, + "time_per_iteration": 2.7338812351226807 + }, + { + "auxiliary_loss_clip": 0.01258803, + "auxiliary_loss_mlp": 0.00201446, + "balance_loss_clip": 1.04050756, + "balance_loss_mlp": 0.17694855, + "epoch": 0.784367954306328, + "flos": 23842279607040.0, + "grad_norm": 8.84356058119435, + "language_loss": 0.80112714, + "learning_rate": 4.6813366217352925e-07, + "loss": 0.81572962, + "num_input_tokens_seen": 281451385, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.24475098, + "step": 13046, + "time_per_iteration": 2.7146244049072266 + }, + { + "auxiliary_loss_clip": 0.01246176, + "auxiliary_loss_mlp": 0.00219421, + "balance_loss_clip": 1.02548015, + "balance_loss_mlp": 0.19542462, + "epoch": 0.7844280775589959, + "flos": 24826662806400.0, + "grad_norm": 119.57206159597919, + "language_loss": 0.71731758, + "learning_rate": 4.678832984380809e-07, + "loss": 0.73197353, + "num_input_tokens_seen": 281472255, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.2401123, + "step": 13047, + "time_per_iteration": 2.710035800933838 + }, + { + "auxiliary_loss_clip": 0.01222737, + "auxiliary_loss_mlp": 0.0020589, + "balance_loss_clip": 1.01211011, + "balance_loss_mlp": 0.18282318, + "epoch": 0.7844882008116639, + "flos": 22455660931200.0, + "grad_norm": 25.165524483891314, + "language_loss": 0.79723573, + "learning_rate": 4.676329928006515e-07, + "loss": 0.81152195, + "num_input_tokens_seen": 281492860, + "router_z_loss_clip": 2.10351562, + "router_z_loss_mlp": 0.23071289, + "step": 13048, + "time_per_iteration": 2.697352647781372 + }, + { + "auxiliary_loss_clip": 0.01250691, + "auxiliary_loss_mlp": 0.00228025, + "balance_loss_clip": 1.02926397, + "balance_loss_mlp": 0.20330065, + "epoch": 0.7845483240643318, + "flos": 26104041244800.0, + "grad_norm": 2.5906656046106873, + "language_loss": 0.82603014, + "learning_rate": 4.6738274527073243e-07, + "loss": 0.84081733, + "num_input_tokens_seen": 281511815, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.24731445, + "step": 13049, + "time_per_iteration": 2.6875557899475098 + }, + { + "auxiliary_loss_clip": 0.01274445, + "auxiliary_loss_mlp": 0.00227156, + "balance_loss_clip": 1.0449661, + "balance_loss_mlp": 0.20031022, + "epoch": 0.7846084473169999, + "flos": 19354307817600.0, + "grad_norm": 35.54122270729712, + "language_loss": 0.83165973, + "learning_rate": 4.6713255585781454e-07, + "loss": 0.84667575, + "num_input_tokens_seen": 281530090, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.26855469, + "step": 13050, + "time_per_iteration": 2.660062551498413 + }, + { + "auxiliary_loss_clip": 0.01233323, + "auxiliary_loss_mlp": 0.00212935, + "balance_loss_clip": 1.01770568, + "balance_loss_mlp": 0.18806756, + "epoch": 0.7846685705696678, + "flos": 23325811902720.0, + "grad_norm": 20.36139741035254, + "language_loss": 0.82327789, + "learning_rate": 4.668824245713825e-07, + "loss": 0.83774054, + "num_input_tokens_seen": 281547075, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.24853516, + "step": 13051, + "time_per_iteration": 2.7238576412200928 + }, + { + "auxiliary_loss_clip": 0.01268143, + "auxiliary_loss_mlp": 0.00254886, + "balance_loss_clip": 1.04352582, + "balance_loss_mlp": 0.22780119, + "epoch": 0.7847286938223358, + "flos": 35809545962880.0, + "grad_norm": 2.6602771594759385, + "language_loss": 0.80853069, + "learning_rate": 4.666323514209227e-07, + "loss": 0.82376093, + "num_input_tokens_seen": 281568080, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.27075195, + "step": 13052, + "time_per_iteration": 2.8782267570495605 + }, + { + "auxiliary_loss_clip": 0.01243015, + "auxiliary_loss_mlp": 0.00218407, + "balance_loss_clip": 1.02660441, + "balance_loss_mlp": 0.19455352, + "epoch": 0.7847888170750038, + "flos": 18478159274880.0, + "grad_norm": 54.66998068921527, + "language_loss": 0.77906525, + "learning_rate": 4.663823364159183e-07, + "loss": 0.79367954, + "num_input_tokens_seen": 281586925, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.23864746, + "step": 13053, + "time_per_iteration": 2.6475422382354736 + }, + { + "auxiliary_loss_clip": 0.01237255, + "auxiliary_loss_mlp": 0.00213047, + "balance_loss_clip": 1.02235389, + "balance_loss_mlp": 0.18952695, + "epoch": 0.7848489403276717, + "flos": 25119155255040.0, + "grad_norm": 5.119599131863661, + "language_loss": 0.78693271, + "learning_rate": 4.6613237956584893e-07, + "loss": 0.80143577, + "num_input_tokens_seen": 281603915, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.23535156, + "step": 13054, + "time_per_iteration": 2.747056007385254 + }, + { + "auxiliary_loss_clip": 0.01251052, + "auxiliary_loss_mlp": 0.00203217, + "balance_loss_clip": 1.03145981, + "balance_loss_mlp": 0.17725322, + "epoch": 0.7849090635803397, + "flos": 26502433966080.0, + "grad_norm": 6.89441626237142, + "language_loss": 0.82416868, + "learning_rate": 4.658824808801938e-07, + "loss": 0.83871138, + "num_input_tokens_seen": 281624220, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.25952148, + "step": 13055, + "time_per_iteration": 2.699556350708008 + }, + { + "auxiliary_loss_clip": 0.01261788, + "auxiliary_loss_mlp": 0.00226374, + "balance_loss_clip": 1.03651452, + "balance_loss_mlp": 0.20069641, + "epoch": 0.7849691868330076, + "flos": 20959658363520.0, + "grad_norm": 8.528910884071719, + "language_loss": 0.82497597, + "learning_rate": 4.656326403684283e-07, + "loss": 0.83985752, + "num_input_tokens_seen": 281642325, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.25671387, + "step": 13056, + "time_per_iteration": 2.720327854156494 + }, + { + "auxiliary_loss_clip": 0.01256415, + "auxiliary_loss_mlp": 0.00224227, + "balance_loss_clip": 1.03729546, + "balance_loss_mlp": 0.20001557, + "epoch": 0.7850293100856757, + "flos": 26067484177920.0, + "grad_norm": 14.53267089711101, + "language_loss": 0.77817178, + "learning_rate": 4.6538285804002744e-07, + "loss": 0.79297823, + "num_input_tokens_seen": 281663065, + "router_z_loss_clip": 2.19238281, + "router_z_loss_mlp": 0.24230957, + "step": 13057, + "time_per_iteration": 2.6988909244537354 + }, + { + "auxiliary_loss_clip": 0.01242186, + "auxiliary_loss_mlp": 0.00210268, + "balance_loss_clip": 1.02214932, + "balance_loss_mlp": 0.18655753, + "epoch": 0.7850894333383436, + "flos": 22491894775680.0, + "grad_norm": 15.62411640202038, + "language_loss": 0.83583605, + "learning_rate": 4.6513313390446175e-07, + "loss": 0.85036057, + "num_input_tokens_seen": 281681005, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.23730469, + "step": 13058, + "time_per_iteration": 4.021360874176025 + }, + { + "auxiliary_loss_clip": 0.01256554, + "auxiliary_loss_mlp": 0.00225808, + "balance_loss_clip": 1.03612185, + "balance_loss_mlp": 0.20318258, + "epoch": 0.7851495565910116, + "flos": 20558643949440.0, + "grad_norm": 8.83705305339014, + "language_loss": 0.77888083, + "learning_rate": 4.6488346797120146e-07, + "loss": 0.79370445, + "num_input_tokens_seen": 281697965, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.22619629, + "step": 13059, + "time_per_iteration": 2.6702728271484375 + }, + { + "auxiliary_loss_clip": 0.0127392, + "auxiliary_loss_mlp": 0.00225076, + "balance_loss_clip": 1.04111922, + "balance_loss_mlp": 0.19772945, + "epoch": 0.7852096798436795, + "flos": 15924838942080.0, + "grad_norm": 67.4797474011675, + "language_loss": 0.84787637, + "learning_rate": 4.646338602497144e-07, + "loss": 0.86286628, + "num_input_tokens_seen": 281716035, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.2734375, + "step": 13060, + "time_per_iteration": 4.072642803192139 + }, + { + "auxiliary_loss_clip": 0.01240023, + "auxiliary_loss_mlp": 0.0021619, + "balance_loss_clip": 1.02268314, + "balance_loss_mlp": 0.19123991, + "epoch": 0.7852698030963475, + "flos": 19062282245760.0, + "grad_norm": 57.16406017676538, + "language_loss": 0.83511078, + "learning_rate": 4.643843107494654e-07, + "loss": 0.84967291, + "num_input_tokens_seen": 281732815, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.24975586, + "step": 13061, + "time_per_iteration": 2.626573085784912 + }, + { + "auxiliary_loss_clip": 0.0124453, + "auxiliary_loss_mlp": 0.00225041, + "balance_loss_clip": 1.02267694, + "balance_loss_mlp": 0.19942263, + "epoch": 0.7853299263490154, + "flos": 24644380262400.0, + "grad_norm": 3.3030291971987813, + "language_loss": 0.82970297, + "learning_rate": 4.641348194799164e-07, + "loss": 0.84439862, + "num_input_tokens_seen": 281751980, + "router_z_loss_clip": 2.21777344, + "router_z_loss_mlp": 0.25610352, + "step": 13062, + "time_per_iteration": 2.7087514400482178 + }, + { + "auxiliary_loss_clip": 0.01251326, + "auxiliary_loss_mlp": 0.00212026, + "balance_loss_clip": 1.03339624, + "balance_loss_mlp": 0.18913826, + "epoch": 0.7853900496016835, + "flos": 22017981709440.0, + "grad_norm": 629.9631445142546, + "language_loss": 0.76033688, + "learning_rate": 4.638853864505297e-07, + "loss": 0.77497041, + "num_input_tokens_seen": 281772670, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.2286377, + "step": 13063, + "time_per_iteration": 2.6595499515533447 + }, + { + "auxiliary_loss_clip": 0.01245062, + "auxiliary_loss_mlp": 0.00194329, + "balance_loss_clip": 1.02734733, + "balance_loss_mlp": 0.17130992, + "epoch": 0.7854501728543514, + "flos": 30227412032640.0, + "grad_norm": 10.251212742206356, + "language_loss": 0.82266629, + "learning_rate": 4.636360116707625e-07, + "loss": 0.83706015, + "num_input_tokens_seen": 281792930, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.22998047, + "step": 13064, + "time_per_iteration": 2.8023855686187744 + }, + { + "auxiliary_loss_clip": 0.01251132, + "auxiliary_loss_mlp": 0.00237278, + "balance_loss_clip": 1.02927804, + "balance_loss_mlp": 0.21016955, + "epoch": 0.7855102961070194, + "flos": 18843694030080.0, + "grad_norm": 45.48408710665011, + "language_loss": 0.76130617, + "learning_rate": 4.633866951500718e-07, + "loss": 0.77619028, + "num_input_tokens_seen": 281811805, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.27124023, + "step": 13065, + "time_per_iteration": 4.26900577545166 + }, + { + "auxiliary_loss_clip": 0.01252109, + "auxiliary_loss_mlp": 0.00218849, + "balance_loss_clip": 1.03359783, + "balance_loss_mlp": 0.19358817, + "epoch": 0.7855704193596874, + "flos": 22309971367680.0, + "grad_norm": 21.91147439367599, + "language_loss": 0.86657178, + "learning_rate": 4.6313743689791196e-07, + "loss": 0.88128138, + "num_input_tokens_seen": 281831885, + "router_z_loss_clip": 2.18652344, + "router_z_loss_mlp": 0.25256348, + "step": 13066, + "time_per_iteration": 2.802011251449585 + }, + { + "auxiliary_loss_clip": 0.01095634, + "auxiliary_loss_mlp": 0.00063961, + "balance_loss_clip": 0.9420166, + "balance_loss_mlp": 0.05752374, + "epoch": 0.7856305426123553, + "flos": 60004434407040.0, + "grad_norm": 0.7741542542799573, + "language_loss": 0.52848983, + "learning_rate": 4.628882369237346e-07, + "loss": 0.54008579, + "num_input_tokens_seen": 281900310, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.06445312, + "step": 13067, + "time_per_iteration": 3.318704128265381 + }, + { + "auxiliary_loss_clip": 0.01238884, + "auxiliary_loss_mlp": 0.00214695, + "balance_loss_clip": 1.02281761, + "balance_loss_mlp": 0.18852885, + "epoch": 0.7856906658650233, + "flos": 21868593045120.0, + "grad_norm": 23.93218982729872, + "language_loss": 0.75047886, + "learning_rate": 4.62639095236989e-07, + "loss": 0.76501465, + "num_input_tokens_seen": 281918870, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.26184082, + "step": 13068, + "time_per_iteration": 2.7167530059814453 + }, + { + "auxiliary_loss_clip": 0.0123139, + "auxiliary_loss_mlp": 0.00184386, + "balance_loss_clip": 1.01737952, + "balance_loss_mlp": 0.16165242, + "epoch": 0.7857507891176913, + "flos": 23622937205760.0, + "grad_norm": 170.3444216191764, + "language_loss": 0.75893724, + "learning_rate": 4.6239001184712267e-07, + "loss": 0.77309501, + "num_input_tokens_seen": 281936905, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.22741699, + "step": 13069, + "time_per_iteration": 4.166734933853149 + }, + { + "auxiliary_loss_clip": 0.01268275, + "auxiliary_loss_mlp": 0.00231444, + "balance_loss_clip": 1.04289544, + "balance_loss_mlp": 0.20542045, + "epoch": 0.7858109123703593, + "flos": 25520061928320.0, + "grad_norm": 4.347305228647228, + "language_loss": 0.83049756, + "learning_rate": 4.6214098676358195e-07, + "loss": 0.84549475, + "num_input_tokens_seen": 281955625, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.26037598, + "step": 13070, + "time_per_iteration": 2.700254201889038 + }, + { + "auxiliary_loss_clip": 0.01245984, + "auxiliary_loss_mlp": 0.001988, + "balance_loss_clip": 1.02641809, + "balance_loss_mlp": 0.17395645, + "epoch": 0.7858710356230272, + "flos": 17457398576640.0, + "grad_norm": 82.31310919932727, + "language_loss": 0.75251716, + "learning_rate": 4.618920199958083e-07, + "loss": 0.76696503, + "num_input_tokens_seen": 281973285, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.24853516, + "step": 13071, + "time_per_iteration": 2.6336936950683594 + }, + { + "auxiliary_loss_clip": 0.01253651, + "auxiliary_loss_mlp": 0.0022711, + "balance_loss_clip": 1.03150141, + "balance_loss_mlp": 0.20114625, + "epoch": 0.7859311588756952, + "flos": 24679680353280.0, + "grad_norm": 9.606408581346285, + "language_loss": 0.81247544, + "learning_rate": 4.616431115532442e-07, + "loss": 0.82728314, + "num_input_tokens_seen": 281991410, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.25952148, + "step": 13072, + "time_per_iteration": 2.7319447994232178 + }, + { + "auxiliary_loss_clip": 0.01275098, + "auxiliary_loss_mlp": 0.00224839, + "balance_loss_clip": 1.04585826, + "balance_loss_mlp": 0.19888671, + "epoch": 0.7859912821283631, + "flos": 21799142098560.0, + "grad_norm": 36.38784875649851, + "language_loss": 0.79449034, + "learning_rate": 4.613942614453268e-07, + "loss": 0.80948973, + "num_input_tokens_seen": 282010845, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.25952148, + "step": 13073, + "time_per_iteration": 2.678715467453003 + }, + { + "auxiliary_loss_clip": 0.01257764, + "auxiliary_loss_mlp": 0.00205981, + "balance_loss_clip": 1.03239286, + "balance_loss_mlp": 0.17937383, + "epoch": 0.7860514053810311, + "flos": 20847293642880.0, + "grad_norm": 11.757560147393978, + "language_loss": 0.83593822, + "learning_rate": 4.611454696814938e-07, + "loss": 0.85057569, + "num_input_tokens_seen": 282029635, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.26586914, + "step": 13074, + "time_per_iteration": 2.6301026344299316 + }, + { + "auxiliary_loss_clip": 0.01234978, + "auxiliary_loss_mlp": 0.0022508, + "balance_loss_clip": 1.02272117, + "balance_loss_mlp": 0.20057088, + "epoch": 0.786111528633699, + "flos": 24315689882880.0, + "grad_norm": 28.5573641494948, + "language_loss": 0.8124553, + "learning_rate": 4.608967362711782e-07, + "loss": 0.82705587, + "num_input_tokens_seen": 282050285, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.24523926, + "step": 13075, + "time_per_iteration": 2.7052865028381348 + }, + { + "auxiliary_loss_clip": 0.01280986, + "auxiliary_loss_mlp": 0.00203081, + "balance_loss_clip": 1.05631244, + "balance_loss_mlp": 0.17830959, + "epoch": 0.7861716518863671, + "flos": 24353180703360.0, + "grad_norm": 4.8978619358706705, + "language_loss": 0.75557268, + "learning_rate": 4.6064806122381283e-07, + "loss": 0.77041334, + "num_input_tokens_seen": 282071040, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.2479248, + "step": 13076, + "time_per_iteration": 2.7693278789520264 + }, + { + "auxiliary_loss_clip": 0.0127347, + "auxiliary_loss_mlp": 0.00212272, + "balance_loss_clip": 1.05130911, + "balance_loss_mlp": 0.18621311, + "epoch": 0.786231775139035, + "flos": 14022399006720.0, + "grad_norm": 118.56588888833174, + "language_loss": 0.8937996, + "learning_rate": 4.603994445488282e-07, + "loss": 0.90865701, + "num_input_tokens_seen": 282086610, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.26074219, + "step": 13077, + "time_per_iteration": 2.635636806488037 + }, + { + "auxiliary_loss_clip": 0.0126148, + "auxiliary_loss_mlp": 0.0022634, + "balance_loss_clip": 1.0379374, + "balance_loss_mlp": 0.20124659, + "epoch": 0.786291898391703, + "flos": 33724248865920.0, + "grad_norm": 9.719030716110984, + "language_loss": 0.78089058, + "learning_rate": 4.6015088625564956e-07, + "loss": 0.7957688, + "num_input_tokens_seen": 282107440, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.25085449, + "step": 13078, + "time_per_iteration": 2.7345705032348633 + }, + { + "auxiliary_loss_clip": 0.01231946, + "auxiliary_loss_mlp": 0.00218679, + "balance_loss_clip": 1.02042389, + "balance_loss_mlp": 0.19593361, + "epoch": 0.786352021644371, + "flos": 25811476968960.0, + "grad_norm": 4.613682705558808, + "language_loss": 0.87161756, + "learning_rate": 4.599023863537039e-07, + "loss": 0.88612378, + "num_input_tokens_seen": 282127290, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.22753906, + "step": 13079, + "time_per_iteration": 2.711592674255371 + }, + { + "auxiliary_loss_clip": 0.01242254, + "auxiliary_loss_mlp": 0.001999, + "balance_loss_clip": 1.02938843, + "balance_loss_mlp": 0.17655879, + "epoch": 0.7864121448970389, + "flos": 28910818920960.0, + "grad_norm": 33.304927685251634, + "language_loss": 0.74722457, + "learning_rate": 4.596539448524146e-07, + "loss": 0.76164615, + "num_input_tokens_seen": 282147505, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.23339844, + "step": 13080, + "time_per_iteration": 2.7042531967163086 + }, + { + "auxiliary_loss_clip": 0.01257215, + "auxiliary_loss_mlp": 0.00220437, + "balance_loss_clip": 1.03587055, + "balance_loss_mlp": 0.19498578, + "epoch": 0.7864722681497069, + "flos": 19208833735680.0, + "grad_norm": 6.386351040065943, + "language_loss": 0.77981174, + "learning_rate": 4.594055617612016e-07, + "loss": 0.79458827, + "num_input_tokens_seen": 282166450, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.25427246, + "step": 13081, + "time_per_iteration": 2.6706178188323975 + }, + { + "auxiliary_loss_clip": 0.01246974, + "auxiliary_loss_mlp": 0.00206747, + "balance_loss_clip": 1.03098726, + "balance_loss_mlp": 0.18192723, + "epoch": 0.7865323914023749, + "flos": 21871573873920.0, + "grad_norm": 44.67230090614582, + "language_loss": 0.75002217, + "learning_rate": 4.591572370894838e-07, + "loss": 0.76455939, + "num_input_tokens_seen": 282186465, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.24804688, + "step": 13082, + "time_per_iteration": 2.668178081512451 + }, + { + "auxiliary_loss_clip": 0.01246228, + "auxiliary_loss_mlp": 0.00207342, + "balance_loss_clip": 1.02694869, + "balance_loss_mlp": 0.18366739, + "epoch": 0.7865925146550429, + "flos": 25520313323520.0, + "grad_norm": 21.4314543709459, + "language_loss": 0.73138857, + "learning_rate": 4.589089708466789e-07, + "loss": 0.74592429, + "num_input_tokens_seen": 282207180, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.23681641, + "step": 13083, + "time_per_iteration": 2.701592206954956 + }, + { + "auxiliary_loss_clip": 0.01265691, + "auxiliary_loss_mlp": 0.00208188, + "balance_loss_clip": 1.03676045, + "balance_loss_mlp": 0.1807934, + "epoch": 0.7866526379077108, + "flos": 19097366855040.0, + "grad_norm": 15.756268684197083, + "language_loss": 0.86132008, + "learning_rate": 4.5866076304220015e-07, + "loss": 0.87605882, + "num_input_tokens_seen": 282225865, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.27429199, + "step": 13084, + "time_per_iteration": 2.595592498779297 + }, + { + "auxiliary_loss_clip": 0.01239776, + "auxiliary_loss_mlp": 0.00206039, + "balance_loss_clip": 1.02471042, + "balance_loss_mlp": 0.18356802, + "epoch": 0.7867127611603788, + "flos": 16173771171840.0, + "grad_norm": 37.5908463465442, + "language_loss": 0.76650655, + "learning_rate": 4.584126136854591e-07, + "loss": 0.78096461, + "num_input_tokens_seen": 282242895, + "router_z_loss_clip": 2.15136719, + "router_z_loss_mlp": 0.22497559, + "step": 13085, + "time_per_iteration": 2.6232354640960693 + }, + { + "auxiliary_loss_clip": 0.01263217, + "auxiliary_loss_mlp": 0.00222143, + "balance_loss_clip": 1.03492999, + "balance_loss_mlp": 0.19694185, + "epoch": 0.7867728844130467, + "flos": 20773640805120.0, + "grad_norm": 253.69442056822695, + "language_loss": 0.81442571, + "learning_rate": 4.5816452278586617e-07, + "loss": 0.8292793, + "num_input_tokens_seen": 282260425, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.25231934, + "step": 13086, + "time_per_iteration": 2.640385389328003 + }, + { + "auxiliary_loss_clip": 0.01245619, + "auxiliary_loss_mlp": 0.00195666, + "balance_loss_clip": 1.02866435, + "balance_loss_mlp": 0.17033441, + "epoch": 0.7868330076657147, + "flos": 21760106993280.0, + "grad_norm": 97.8055042392618, + "language_loss": 0.81574726, + "learning_rate": 4.5791649035282965e-07, + "loss": 0.83016014, + "num_input_tokens_seen": 282279335, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.2532959, + "step": 13087, + "time_per_iteration": 2.6687729358673096 + }, + { + "auxiliary_loss_clip": 0.0122832, + "auxiliary_loss_mlp": 0.00224298, + "balance_loss_clip": 1.01362491, + "balance_loss_mlp": 0.20036089, + "epoch": 0.7868931309183826, + "flos": 25700692446720.0, + "grad_norm": 10.3180264076929, + "language_loss": 0.76497924, + "learning_rate": 4.5766851639575456e-07, + "loss": 0.77950549, + "num_input_tokens_seen": 282299905, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.23937988, + "step": 13088, + "time_per_iteration": 2.7521519660949707 + }, + { + "auxiliary_loss_clip": 0.01096319, + "auxiliary_loss_mlp": 0.00057634, + "balance_loss_clip": 0.94434589, + "balance_loss_mlp": 0.05033831, + "epoch": 0.7869532541710507, + "flos": 64644883430400.0, + "grad_norm": 0.7172201046128968, + "language_loss": 0.54680371, + "learning_rate": 4.574206009240431e-07, + "loss": 0.55834317, + "num_input_tokens_seen": 282367620, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.07275391, + "step": 13089, + "time_per_iteration": 3.2705702781677246 + }, + { + "auxiliary_loss_clip": 0.01097265, + "auxiliary_loss_mlp": 0.00067981, + "balance_loss_clip": 0.94668317, + "balance_loss_mlp": 0.06116207, + "epoch": 0.7870133774237186, + "flos": 67453600440960.0, + "grad_norm": 0.7144238226787382, + "language_loss": 0.49266508, + "learning_rate": 4.571727439470976e-07, + "loss": 0.50431752, + "num_input_tokens_seen": 282435695, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.06835938, + "step": 13090, + "time_per_iteration": 3.2774274349212646 + }, + { + "auxiliary_loss_clip": 0.0123296, + "auxiliary_loss_mlp": 0.00195209, + "balance_loss_clip": 1.01739967, + "balance_loss_mlp": 0.17217743, + "epoch": 0.7870735006763866, + "flos": 26068310190720.0, + "grad_norm": 3.548803143607071, + "language_loss": 0.89414251, + "learning_rate": 4.5692494547431583e-07, + "loss": 0.9084242, + "num_input_tokens_seen": 282456025, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.23034668, + "step": 13091, + "time_per_iteration": 2.6999571323394775 + }, + { + "auxiliary_loss_clip": 0.01094368, + "auxiliary_loss_mlp": 0.00066974, + "balance_loss_clip": 0.9429391, + "balance_loss_mlp": 0.06039357, + "epoch": 0.7871336239290546, + "flos": 70289572896000.0, + "grad_norm": 0.8079349075148051, + "language_loss": 0.63622558, + "learning_rate": 4.566772055150947e-07, + "loss": 0.64783901, + "num_input_tokens_seen": 282520995, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.06591797, + "step": 13092, + "time_per_iteration": 3.172266721725464 + }, + { + "auxiliary_loss_clip": 0.01246168, + "auxiliary_loss_mlp": 0.00218712, + "balance_loss_clip": 1.02521038, + "balance_loss_mlp": 0.19316536, + "epoch": 0.7871937471817225, + "flos": 15778574760960.0, + "grad_norm": 29.59435047614907, + "language_loss": 0.88944411, + "learning_rate": 4.564295240788285e-07, + "loss": 0.90409291, + "num_input_tokens_seen": 282539355, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.25561523, + "step": 13093, + "time_per_iteration": 2.721970558166504 + }, + { + "auxiliary_loss_clip": 0.01230744, + "auxiliary_loss_mlp": 0.00188286, + "balance_loss_clip": 1.0125078, + "balance_loss_mlp": 0.1641935, + "epoch": 0.7872538704343905, + "flos": 20485242506880.0, + "grad_norm": 81.92766753511157, + "language_loss": 0.83719295, + "learning_rate": 4.561819011749106e-07, + "loss": 0.85138333, + "num_input_tokens_seen": 282555735, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.2409668, + "step": 13094, + "time_per_iteration": 2.670842170715332 + }, + { + "auxiliary_loss_clip": 0.01243754, + "auxiliary_loss_mlp": 0.00224845, + "balance_loss_clip": 1.02097404, + "balance_loss_mlp": 0.19959605, + "epoch": 0.7873139936870585, + "flos": 25082670015360.0, + "grad_norm": 56.51454818069856, + "language_loss": 0.86204833, + "learning_rate": 4.5593433681272884e-07, + "loss": 0.87673432, + "num_input_tokens_seen": 282574550, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.25280762, + "step": 13095, + "time_per_iteration": 2.7483232021331787 + }, + { + "auxiliary_loss_clip": 0.01255456, + "auxiliary_loss_mlp": 0.00209981, + "balance_loss_clip": 1.02951622, + "balance_loss_mlp": 0.18458909, + "epoch": 0.7873741169397265, + "flos": 30883176679680.0, + "grad_norm": 15.033749463980687, + "language_loss": 0.75488102, + "learning_rate": 4.556868310016715e-07, + "loss": 0.76953542, + "num_input_tokens_seen": 282596520, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.25402832, + "step": 13096, + "time_per_iteration": 2.7231228351593018 + }, + { + "auxiliary_loss_clip": 0.01222488, + "auxiliary_loss_mlp": 0.00191981, + "balance_loss_clip": 1.01089239, + "balance_loss_mlp": 0.16922414, + "epoch": 0.7874342401923944, + "flos": 46791962242560.0, + "grad_norm": 14.02274298696027, + "language_loss": 0.75246024, + "learning_rate": 4.55439383751125e-07, + "loss": 0.76660496, + "num_input_tokens_seen": 282620560, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.22766113, + "step": 13097, + "time_per_iteration": 2.927507162094116 + }, + { + "auxiliary_loss_clip": 0.01273149, + "auxiliary_loss_mlp": 0.00207477, + "balance_loss_clip": 1.04525805, + "balance_loss_mlp": 0.18096499, + "epoch": 0.7874943634450624, + "flos": 23584548545280.0, + "grad_norm": 13.943764542811582, + "language_loss": 0.87328094, + "learning_rate": 4.5519199507047126e-07, + "loss": 0.88808721, + "num_input_tokens_seen": 282639830, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.26513672, + "step": 13098, + "time_per_iteration": 2.684392213821411 + }, + { + "auxiliary_loss_clip": 0.012314, + "auxiliary_loss_mlp": 0.00195737, + "balance_loss_clip": 1.01677382, + "balance_loss_mlp": 0.17336093, + "epoch": 0.7875544866977303, + "flos": 20191169859840.0, + "grad_norm": 3.762547610573993, + "language_loss": 0.81369442, + "learning_rate": 4.5494466496909177e-07, + "loss": 0.82796574, + "num_input_tokens_seen": 282660130, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.22399902, + "step": 13099, + "time_per_iteration": 2.7116940021514893 + }, + { + "auxiliary_loss_clip": 0.01257023, + "auxiliary_loss_mlp": 0.00212495, + "balance_loss_clip": 1.03776729, + "balance_loss_mlp": 0.18799752, + "epoch": 0.7876146099503983, + "flos": 22602571557120.0, + "grad_norm": 5.3487021793504255, + "language_loss": 0.84484303, + "learning_rate": 4.5469739345636603e-07, + "loss": 0.8595382, + "num_input_tokens_seen": 282681125, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.24511719, + "step": 13100, + "time_per_iteration": 4.147493124008179 + }, + { + "auxiliary_loss_clip": 0.01265451, + "auxiliary_loss_mlp": 0.00220247, + "balance_loss_clip": 1.03575182, + "balance_loss_mlp": 0.19349624, + "epoch": 0.7876747332030662, + "flos": 10705833555840.0, + "grad_norm": 7.551240129450193, + "language_loss": 0.78539664, + "learning_rate": 4.5445018054167007e-07, + "loss": 0.80025369, + "num_input_tokens_seen": 282696690, + "router_z_loss_clip": 2.29101562, + "router_z_loss_mlp": 0.26757812, + "step": 13101, + "time_per_iteration": 2.6222496032714844 + }, + { + "auxiliary_loss_clip": 0.01243931, + "auxiliary_loss_mlp": 0.00198407, + "balance_loss_clip": 1.02555871, + "balance_loss_mlp": 0.17415984, + "epoch": 0.7877348564557343, + "flos": 38399315621760.0, + "grad_norm": 10.70158512516379, + "language_loss": 0.83275193, + "learning_rate": 4.5420302623437745e-07, + "loss": 0.84717524, + "num_input_tokens_seen": 282721210, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.24243164, + "step": 13102, + "time_per_iteration": 4.23903751373291 + }, + { + "auxiliary_loss_clip": 0.01228151, + "auxiliary_loss_mlp": 0.00221666, + "balance_loss_clip": 1.01357841, + "balance_loss_mlp": 0.19784829, + "epoch": 0.7877949797084022, + "flos": 18329524796160.0, + "grad_norm": 11.949221136122025, + "language_loss": 0.87855607, + "learning_rate": 4.5395593054386093e-07, + "loss": 0.89305425, + "num_input_tokens_seen": 282738505, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.23840332, + "step": 13103, + "time_per_iteration": 2.6434707641601562 + }, + { + "auxiliary_loss_clip": 0.01271238, + "auxiliary_loss_mlp": 0.00212824, + "balance_loss_clip": 1.04433572, + "balance_loss_mlp": 0.18843356, + "epoch": 0.7878551029610702, + "flos": 25806736373760.0, + "grad_norm": 13.96612637455344, + "language_loss": 0.88321817, + "learning_rate": 4.537088934794913e-07, + "loss": 0.89805877, + "num_input_tokens_seen": 282756895, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.24401855, + "step": 13104, + "time_per_iteration": 2.7515950202941895 + }, + { + "auxiliary_loss_clip": 0.01243135, + "auxiliary_loss_mlp": 0.00205882, + "balance_loss_clip": 1.02230227, + "balance_loss_mlp": 0.18246968, + "epoch": 0.7879152262137382, + "flos": 22342685679360.0, + "grad_norm": 137.18513435967049, + "language_loss": 0.80301124, + "learning_rate": 4.5346191505063515e-07, + "loss": 0.81750143, + "num_input_tokens_seen": 282774955, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.23388672, + "step": 13105, + "time_per_iteration": 2.7590067386627197 + }, + { + "auxiliary_loss_clip": 0.01239488, + "auxiliary_loss_mlp": 0.00214814, + "balance_loss_clip": 1.01376462, + "balance_loss_mlp": 0.18848091, + "epoch": 0.7879753494664061, + "flos": 24785329230720.0, + "grad_norm": 9.0969655750624, + "language_loss": 0.84954017, + "learning_rate": 4.5321499526665776e-07, + "loss": 0.86408317, + "num_input_tokens_seen": 282793165, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.26342773, + "step": 13106, + "time_per_iteration": 2.6987545490264893 + }, + { + "auxiliary_loss_clip": 0.01241434, + "auxiliary_loss_mlp": 0.00201656, + "balance_loss_clip": 1.02001894, + "balance_loss_mlp": 0.17712313, + "epoch": 0.7880354727190741, + "flos": 16909078487040.0, + "grad_norm": 3867.6299648925824, + "language_loss": 0.84275711, + "learning_rate": 4.5296813413692337e-07, + "loss": 0.85718799, + "num_input_tokens_seen": 282809820, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.24523926, + "step": 13107, + "time_per_iteration": 4.21003532409668 + }, + { + "auxiliary_loss_clip": 0.01231679, + "auxiliary_loss_mlp": 0.00188028, + "balance_loss_clip": 1.01895452, + "balance_loss_mlp": 0.16609317, + "epoch": 0.7880955959717421, + "flos": 22230500526720.0, + "grad_norm": 21.72220197822257, + "language_loss": 0.79695749, + "learning_rate": 4.5272133167079165e-07, + "loss": 0.81115454, + "num_input_tokens_seen": 282828600, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.21948242, + "step": 13108, + "time_per_iteration": 2.7177891731262207 + }, + { + "auxiliary_loss_clip": 0.01101296, + "auxiliary_loss_mlp": 0.00042951, + "balance_loss_clip": 0.94741952, + "balance_loss_mlp": 0.0355125, + "epoch": 0.7881557192244101, + "flos": 69183200131200.0, + "grad_norm": 0.8672798269681281, + "language_loss": 0.59915078, + "learning_rate": 4.5247458787762216e-07, + "loss": 0.61059326, + "num_input_tokens_seen": 282882775, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.07421875, + "step": 13109, + "time_per_iteration": 3.145371913909912 + }, + { + "auxiliary_loss_clip": 0.01253546, + "auxiliary_loss_mlp": 0.00226862, + "balance_loss_clip": 1.03490686, + "balance_loss_mlp": 0.2012682, + "epoch": 0.788215842477078, + "flos": 24935436167040.0, + "grad_norm": 4.682680974527127, + "language_loss": 0.79875278, + "learning_rate": 4.5222790276677126e-07, + "loss": 0.81355691, + "num_input_tokens_seen": 282902680, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.25585938, + "step": 13110, + "time_per_iteration": 2.738661766052246 + }, + { + "auxiliary_loss_clip": 0.01233867, + "auxiliary_loss_mlp": 0.00200286, + "balance_loss_clip": 1.01813579, + "balance_loss_mlp": 0.1778146, + "epoch": 0.788275965729746, + "flos": 26106483369600.0, + "grad_norm": 4.585450469427106, + "language_loss": 0.80837965, + "learning_rate": 4.5198127634759455e-07, + "loss": 0.82272112, + "num_input_tokens_seen": 282923625, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.22473145, + "step": 13111, + "time_per_iteration": 4.137788772583008 + }, + { + "auxiliary_loss_clip": 0.01236055, + "auxiliary_loss_mlp": 0.00206228, + "balance_loss_clip": 1.01692128, + "balance_loss_mlp": 0.18121809, + "epoch": 0.7883360889824139, + "flos": 21214803646080.0, + "grad_norm": 54.55076188146769, + "language_loss": 0.71983802, + "learning_rate": 4.5173470862944206e-07, + "loss": 0.73426086, + "num_input_tokens_seen": 282941955, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.25012207, + "step": 13112, + "time_per_iteration": 2.67777156829834 + }, + { + "auxiliary_loss_clip": 0.01245348, + "auxiliary_loss_mlp": 0.0020892, + "balance_loss_clip": 1.0217247, + "balance_loss_mlp": 0.18271767, + "epoch": 0.7883962122350819, + "flos": 21142551438720.0, + "grad_norm": 6.176783462314498, + "language_loss": 0.76971227, + "learning_rate": 4.514881996216644e-07, + "loss": 0.78425491, + "num_input_tokens_seen": 282961280, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.26196289, + "step": 13113, + "time_per_iteration": 2.7192180156707764 + }, + { + "auxiliary_loss_clip": 0.01229696, + "auxiliary_loss_mlp": 0.00195209, + "balance_loss_clip": 1.01497841, + "balance_loss_mlp": 0.17121205, + "epoch": 0.7884563354877498, + "flos": 15302901928320.0, + "grad_norm": 39.82539489126125, + "language_loss": 0.727, + "learning_rate": 4.5124174933361e-07, + "loss": 0.74124908, + "num_input_tokens_seen": 282978210, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.23986816, + "step": 13114, + "time_per_iteration": 2.6741318702697754 + }, + { + "auxiliary_loss_clip": 0.0123617, + "auxiliary_loss_mlp": 0.00212697, + "balance_loss_clip": 1.01674557, + "balance_loss_mlp": 0.18687651, + "epoch": 0.7885164587404179, + "flos": 24388301226240.0, + "grad_norm": 6.09079784322184, + "language_loss": 0.72265738, + "learning_rate": 4.5099535777462306e-07, + "loss": 0.73714608, + "num_input_tokens_seen": 282998845, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.25830078, + "step": 13115, + "time_per_iteration": 2.768845796585083 + }, + { + "auxiliary_loss_clip": 0.01242816, + "auxiliary_loss_mlp": 0.00225846, + "balance_loss_clip": 1.02059329, + "balance_loss_mlp": 0.19991848, + "epoch": 0.7885765819930858, + "flos": 14385886686720.0, + "grad_norm": 216.363041099254, + "language_loss": 0.93706167, + "learning_rate": 4.50749024954048e-07, + "loss": 0.95174825, + "num_input_tokens_seen": 283015200, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.25915527, + "step": 13116, + "time_per_iteration": 2.6353321075439453 + }, + { + "auxiliary_loss_clip": 0.01264942, + "auxiliary_loss_mlp": 0.00232123, + "balance_loss_clip": 1.03548419, + "balance_loss_mlp": 0.20477596, + "epoch": 0.7886367052457538, + "flos": 18259930195200.0, + "grad_norm": 8.405841604036748, + "language_loss": 0.82281935, + "learning_rate": 4.505027508812245e-07, + "loss": 0.83779001, + "num_input_tokens_seen": 283033680, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.27355957, + "step": 13117, + "time_per_iteration": 2.667958974838257 + }, + { + "auxiliary_loss_clip": 0.0123191, + "auxiliary_loss_mlp": 0.00232184, + "balance_loss_clip": 1.01750243, + "balance_loss_mlp": 0.20661348, + "epoch": 0.7886968284984217, + "flos": 15305092657920.0, + "grad_norm": 261.08196233536484, + "language_loss": 0.85610044, + "learning_rate": 4.502565355654926e-07, + "loss": 0.87074137, + "num_input_tokens_seen": 283050620, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.25561523, + "step": 13118, + "time_per_iteration": 2.6074297428131104 + }, + { + "auxiliary_loss_clip": 0.01244131, + "auxiliary_loss_mlp": 0.002063, + "balance_loss_clip": 1.02941346, + "balance_loss_mlp": 0.18311419, + "epoch": 0.7887569517510897, + "flos": 21215450090880.0, + "grad_norm": 12.318056355060778, + "language_loss": 0.80504251, + "learning_rate": 4.500103790161878e-07, + "loss": 0.81954676, + "num_input_tokens_seen": 283070215, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.23193359, + "step": 13119, + "time_per_iteration": 2.711944818496704 + }, + { + "auxiliary_loss_clip": 0.01225992, + "auxiliary_loss_mlp": 0.00203451, + "balance_loss_clip": 1.01234937, + "balance_loss_mlp": 0.18193324, + "epoch": 0.7888170750037578, + "flos": 22711237176960.0, + "grad_norm": 9.50903849518193, + "language_loss": 0.79511863, + "learning_rate": 4.4976428124264454e-07, + "loss": 0.80941302, + "num_input_tokens_seen": 283091485, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.21508789, + "step": 13120, + "time_per_iteration": 2.6600284576416016 + }, + { + "auxiliary_loss_clip": 0.0125422, + "auxiliary_loss_mlp": 0.00207113, + "balance_loss_clip": 1.03531528, + "balance_loss_mlp": 0.18406957, + "epoch": 0.7888771982564257, + "flos": 36429148592640.0, + "grad_norm": 22.90084099356108, + "language_loss": 0.85500205, + "learning_rate": 4.4951824225419564e-07, + "loss": 0.86961538, + "num_input_tokens_seen": 283115040, + "router_z_loss_clip": 2.19042969, + "router_z_loss_mlp": 0.23034668, + "step": 13121, + "time_per_iteration": 2.8684659004211426 + }, + { + "auxiliary_loss_clip": 0.01230962, + "auxiliary_loss_mlp": 0.00198764, + "balance_loss_clip": 1.01230073, + "balance_loss_mlp": 0.17587595, + "epoch": 0.7889373215090937, + "flos": 27309993488640.0, + "grad_norm": 34.844666367351714, + "language_loss": 0.85216224, + "learning_rate": 4.4927226206017057e-07, + "loss": 0.86645949, + "num_input_tokens_seen": 283136925, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.22900391, + "step": 13122, + "time_per_iteration": 2.8993124961853027 + }, + { + "auxiliary_loss_clip": 0.01230287, + "auxiliary_loss_mlp": 0.00214074, + "balance_loss_clip": 1.01449418, + "balance_loss_mlp": 0.19155537, + "epoch": 0.7889974447617616, + "flos": 19829010983040.0, + "grad_norm": 29.369414586679785, + "language_loss": 0.85792381, + "learning_rate": 4.4902634066989597e-07, + "loss": 0.87236738, + "num_input_tokens_seen": 283155725, + "router_z_loss_clip": 2.15527344, + "router_z_loss_mlp": 0.22521973, + "step": 13123, + "time_per_iteration": 2.6595842838287354 + }, + { + "auxiliary_loss_clip": 0.01252011, + "auxiliary_loss_mlp": 0.00241603, + "balance_loss_clip": 1.02676809, + "balance_loss_mlp": 0.21444687, + "epoch": 0.7890575680144296, + "flos": 17271201450240.0, + "grad_norm": 4.012332436894762, + "language_loss": 0.76117778, + "learning_rate": 4.487804780926985e-07, + "loss": 0.77611387, + "num_input_tokens_seen": 283173845, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.27172852, + "step": 13124, + "time_per_iteration": 2.675924777984619 + }, + { + "auxiliary_loss_clip": 0.01260255, + "auxiliary_loss_mlp": 0.00214571, + "balance_loss_clip": 1.03343773, + "balance_loss_mlp": 0.19072931, + "epoch": 0.7891176912670975, + "flos": 27600151553280.0, + "grad_norm": 13.56917860188531, + "language_loss": 0.84415758, + "learning_rate": 4.4853467433790036e-07, + "loss": 0.85890579, + "num_input_tokens_seen": 283191985, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.23852539, + "step": 13125, + "time_per_iteration": 2.7045187950134277 + }, + { + "auxiliary_loss_clip": 0.01234826, + "auxiliary_loss_mlp": 0.0019693, + "balance_loss_clip": 1.01568198, + "balance_loss_mlp": 0.17286195, + "epoch": 0.7891778145197655, + "flos": 22711668140160.0, + "grad_norm": 4.238614892885009, + "language_loss": 0.80538845, + "learning_rate": 4.4828892941482267e-07, + "loss": 0.81970608, + "num_input_tokens_seen": 283210855, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.2409668, + "step": 13126, + "time_per_iteration": 2.706866979598999 + }, + { + "auxiliary_loss_clip": 0.01250799, + "auxiliary_loss_mlp": 0.00213781, + "balance_loss_clip": 1.02960837, + "balance_loss_mlp": 0.1884492, + "epoch": 0.7892379377724335, + "flos": 17310775259520.0, + "grad_norm": 6.887634762417076, + "language_loss": 0.85745382, + "learning_rate": 4.480432433327845e-07, + "loss": 0.87209964, + "num_input_tokens_seen": 283229665, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.2532959, + "step": 13127, + "time_per_iteration": 2.6812210083007812 + }, + { + "auxiliary_loss_clip": 0.01242287, + "auxiliary_loss_mlp": 0.00201399, + "balance_loss_clip": 1.02718496, + "balance_loss_mlp": 0.179548, + "epoch": 0.7892980610251015, + "flos": 25775674087680.0, + "grad_norm": 2.4039406194226496, + "language_loss": 0.90982485, + "learning_rate": 4.47797616101103e-07, + "loss": 0.92426169, + "num_input_tokens_seen": 283248615, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.21826172, + "step": 13128, + "time_per_iteration": 2.7352020740509033 + }, + { + "auxiliary_loss_clip": 0.01231984, + "auxiliary_loss_mlp": 0.00198285, + "balance_loss_clip": 1.01847172, + "balance_loss_mlp": 0.17602846, + "epoch": 0.7893581842777694, + "flos": 21579943351680.0, + "grad_norm": 3.6184867666961296, + "language_loss": 0.77697676, + "learning_rate": 4.475520477290904e-07, + "loss": 0.79127944, + "num_input_tokens_seen": 283267135, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.22241211, + "step": 13129, + "time_per_iteration": 2.671686887741089 + }, + { + "auxiliary_loss_clip": 0.01127858, + "auxiliary_loss_mlp": 0.00116509, + "balance_loss_clip": 0.97202402, + "balance_loss_mlp": 0.1083555, + "epoch": 0.7894183075304374, + "flos": 69016468176000.0, + "grad_norm": 0.7043971965011516, + "language_loss": 0.60531402, + "learning_rate": 4.473065382260597e-07, + "loss": 0.61775768, + "num_input_tokens_seen": 283328940, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.08154297, + "step": 13130, + "time_per_iteration": 3.2569127082824707 + }, + { + "auxiliary_loss_clip": 0.0123912, + "auxiliary_loss_mlp": 0.00199526, + "balance_loss_clip": 1.02332401, + "balance_loss_mlp": 0.17667392, + "epoch": 0.7894784307831053, + "flos": 24243258107520.0, + "grad_norm": 34.485588000734914, + "language_loss": 0.81149918, + "learning_rate": 4.4706108760132124e-07, + "loss": 0.82588565, + "num_input_tokens_seen": 283350000, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.22875977, + "step": 13131, + "time_per_iteration": 2.7640187740325928 + }, + { + "auxiliary_loss_clip": 0.01298344, + "auxiliary_loss_mlp": 0.00232407, + "balance_loss_clip": 1.05476594, + "balance_loss_mlp": 0.20335577, + "epoch": 0.7895385540357733, + "flos": 20266546550400.0, + "grad_norm": 12.027626235370969, + "language_loss": 0.83917534, + "learning_rate": 4.4681569586418153e-07, + "loss": 0.85448283, + "num_input_tokens_seen": 283368020, + "router_z_loss_clip": 2.43164062, + "router_z_loss_mlp": 0.29040527, + "step": 13132, + "time_per_iteration": 2.6529951095581055 + }, + { + "auxiliary_loss_clip": 0.01246538, + "auxiliary_loss_mlp": 0.00215539, + "balance_loss_clip": 1.02457094, + "balance_loss_mlp": 0.18964721, + "epoch": 0.7895986772884414, + "flos": 20996574566400.0, + "grad_norm": 109.54053729387593, + "language_loss": 0.71526027, + "learning_rate": 4.465703630239468e-07, + "loss": 0.72988105, + "num_input_tokens_seen": 283387030, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.25891113, + "step": 13133, + "time_per_iteration": 2.7360644340515137 + }, + { + "auxiliary_loss_clip": 0.01281821, + "auxiliary_loss_mlp": 0.00224337, + "balance_loss_clip": 1.04716468, + "balance_loss_mlp": 0.1971333, + "epoch": 0.7896588005411093, + "flos": 18657999694080.0, + "grad_norm": 10.911504595053357, + "language_loss": 0.93020445, + "learning_rate": 4.463250890899195e-07, + "loss": 0.94526613, + "num_input_tokens_seen": 283402090, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.27233887, + "step": 13134, + "time_per_iteration": 2.6385700702667236 + }, + { + "auxiliary_loss_clip": 0.01251974, + "auxiliary_loss_mlp": 0.00207788, + "balance_loss_clip": 1.03216195, + "balance_loss_mlp": 0.18239652, + "epoch": 0.7897189237937773, + "flos": 18405907067520.0, + "grad_norm": 21.440628764261056, + "language_loss": 0.88818669, + "learning_rate": 4.460798740713998e-07, + "loss": 0.90278435, + "num_input_tokens_seen": 283421035, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.25366211, + "step": 13135, + "time_per_iteration": 2.690096378326416 + }, + { + "auxiliary_loss_clip": 0.01240022, + "auxiliary_loss_mlp": 0.00211723, + "balance_loss_clip": 1.02267337, + "balance_loss_mlp": 0.18590248, + "epoch": 0.7897790470464452, + "flos": 23731602825600.0, + "grad_norm": 7.297266625746801, + "language_loss": 0.7847479, + "learning_rate": 4.4583471797768733e-07, + "loss": 0.79926533, + "num_input_tokens_seen": 283441830, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.25842285, + "step": 13136, + "time_per_iteration": 2.7181344032287598 + }, + { + "auxiliary_loss_clip": 0.01259345, + "auxiliary_loss_mlp": 0.0023445, + "balance_loss_clip": 1.03295732, + "balance_loss_mlp": 0.20853405, + "epoch": 0.7898391702991132, + "flos": 15918949111680.0, + "grad_norm": 8.703370579101698, + "language_loss": 0.82353795, + "learning_rate": 4.455896208180778e-07, + "loss": 0.83847582, + "num_input_tokens_seen": 283459540, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.25915527, + "step": 13137, + "time_per_iteration": 2.6891896724700928 + }, + { + "auxiliary_loss_clip": 0.01237656, + "auxiliary_loss_mlp": 0.00191615, + "balance_loss_clip": 1.02043045, + "balance_loss_mlp": 0.16674845, + "epoch": 0.7898992935517811, + "flos": 19829046896640.0, + "grad_norm": 5.157149637968335, + "language_loss": 0.83120191, + "learning_rate": 4.4534458260186645e-07, + "loss": 0.84549463, + "num_input_tokens_seen": 283478790, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.24853516, + "step": 13138, + "time_per_iteration": 2.6567556858062744 + }, + { + "auxiliary_loss_clip": 0.01236067, + "auxiliary_loss_mlp": 0.00215149, + "balance_loss_clip": 1.01839113, + "balance_loss_mlp": 0.19296461, + "epoch": 0.7899594168044491, + "flos": 16216253982720.0, + "grad_norm": 277.4102441836117, + "language_loss": 0.76490963, + "learning_rate": 4.4509960333834426e-07, + "loss": 0.77942175, + "num_input_tokens_seen": 283495720, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.22192383, + "step": 13139, + "time_per_iteration": 2.6820759773254395 + }, + { + "auxiliary_loss_clip": 0.01116914, + "auxiliary_loss_mlp": 0.00143769, + "balance_loss_clip": 0.96133077, + "balance_loss_mlp": 0.13594885, + "epoch": 0.790019540057117, + "flos": 68331005959680.0, + "grad_norm": 0.8653991395872381, + "language_loss": 0.59560609, + "learning_rate": 4.448546830368003e-07, + "loss": 0.60821289, + "num_input_tokens_seen": 283558795, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.078125, + "step": 13140, + "time_per_iteration": 3.244492769241333 + }, + { + "auxiliary_loss_clip": 0.01245114, + "auxiliary_loss_mlp": 0.00214232, + "balance_loss_clip": 1.0266788, + "balance_loss_mlp": 0.18961535, + "epoch": 0.7900796633097851, + "flos": 30332773601280.0, + "grad_norm": 6.965899795399672, + "language_loss": 0.83940393, + "learning_rate": 4.4460982170652304e-07, + "loss": 0.85399741, + "num_input_tokens_seen": 283579305, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.24609375, + "step": 13141, + "time_per_iteration": 2.7538228034973145 + }, + { + "auxiliary_loss_clip": 0.01245932, + "auxiliary_loss_mlp": 0.00233424, + "balance_loss_clip": 1.0266813, + "balance_loss_mlp": 0.20930827, + "epoch": 0.790139786562453, + "flos": 22126790983680.0, + "grad_norm": 224.6797147980219, + "language_loss": 0.755557, + "learning_rate": 4.4436501935679694e-07, + "loss": 0.77035058, + "num_input_tokens_seen": 283597840, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.24145508, + "step": 13142, + "time_per_iteration": 2.6963860988616943 + }, + { + "auxiliary_loss_clip": 0.01120505, + "auxiliary_loss_mlp": 0.00130961, + "balance_loss_clip": 0.96438807, + "balance_loss_mlp": 0.12395179, + "epoch": 0.790199909815121, + "flos": 58207284213120.0, + "grad_norm": 1.763560783369387, + "language_loss": 0.59229243, + "learning_rate": 4.441202759969049e-07, + "loss": 0.60480714, + "num_input_tokens_seen": 283647950, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.0703125, + "step": 13143, + "time_per_iteration": 4.3365209102630615 + }, + { + "auxiliary_loss_clip": 0.01248984, + "auxiliary_loss_mlp": 0.00212757, + "balance_loss_clip": 1.0304693, + "balance_loss_mlp": 0.18781862, + "epoch": 0.7902600330677889, + "flos": 34533316759680.0, + "grad_norm": 5.112342301571589, + "language_loss": 0.80957496, + "learning_rate": 4.4387559163612875e-07, + "loss": 0.82419235, + "num_input_tokens_seen": 283670645, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.24951172, + "step": 13144, + "time_per_iteration": 4.173142433166504 + }, + { + "auxiliary_loss_clip": 0.01248011, + "auxiliary_loss_mlp": 0.00216706, + "balance_loss_clip": 1.02486897, + "balance_loss_mlp": 0.19017038, + "epoch": 0.7903201563204569, + "flos": 22346384780160.0, + "grad_norm": 86.44753568940614, + "language_loss": 0.91272599, + "learning_rate": 4.4363096628374605e-07, + "loss": 0.92737317, + "num_input_tokens_seen": 283688830, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.26538086, + "step": 13145, + "time_per_iteration": 2.6889121532440186 + }, + { + "auxiliary_loss_clip": 0.01232988, + "auxiliary_loss_mlp": 0.00212511, + "balance_loss_clip": 1.01718891, + "balance_loss_mlp": 0.18870467, + "epoch": 0.790380279573125, + "flos": 22053533195520.0, + "grad_norm": 38.16247682095744, + "language_loss": 0.81929249, + "learning_rate": 4.4338639994903235e-07, + "loss": 0.83374751, + "num_input_tokens_seen": 283708625, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.23791504, + "step": 13146, + "time_per_iteration": 2.683138370513916 + }, + { + "auxiliary_loss_clip": 0.01243475, + "auxiliary_loss_mlp": 0.00220908, + "balance_loss_clip": 1.02109373, + "balance_loss_mlp": 0.19745927, + "epoch": 0.7904404028257929, + "flos": 20302600826880.0, + "grad_norm": 23.53552496340858, + "language_loss": 0.84964561, + "learning_rate": 4.4314189264126246e-07, + "loss": 0.8642894, + "num_input_tokens_seen": 283725710, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.23461914, + "step": 13147, + "time_per_iteration": 2.695978879928589 + }, + { + "auxiliary_loss_clip": 0.01230792, + "auxiliary_loss_mlp": 0.00215251, + "balance_loss_clip": 1.01400828, + "balance_loss_mlp": 0.19055064, + "epoch": 0.7905005260784609, + "flos": 20008923229440.0, + "grad_norm": 39.2000993983356, + "language_loss": 0.79277962, + "learning_rate": 4.428974443697087e-07, + "loss": 0.80724001, + "num_input_tokens_seen": 283744150, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.24682617, + "step": 13148, + "time_per_iteration": 2.671116352081299 + }, + { + "auxiliary_loss_clip": 0.01249605, + "auxiliary_loss_mlp": 0.00211103, + "balance_loss_clip": 1.02770734, + "balance_loss_mlp": 0.18625955, + "epoch": 0.7905606493311288, + "flos": 26905926418560.0, + "grad_norm": 22.198634697221973, + "language_loss": 0.80182499, + "learning_rate": 4.4265305514363913e-07, + "loss": 0.81643206, + "num_input_tokens_seen": 283764170, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.24853516, + "step": 13149, + "time_per_iteration": 4.245004653930664 + }, + { + "auxiliary_loss_clip": 0.01259075, + "auxiliary_loss_mlp": 0.00232606, + "balance_loss_clip": 1.02790499, + "balance_loss_mlp": 0.20609362, + "epoch": 0.7906207725837968, + "flos": 23696230907520.0, + "grad_norm": 120.1933606834315, + "language_loss": 0.74069345, + "learning_rate": 4.424087249723225e-07, + "loss": 0.75561023, + "num_input_tokens_seen": 283784305, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.26513672, + "step": 13150, + "time_per_iteration": 2.750566005706787 + }, + { + "auxiliary_loss_clip": 0.01229731, + "auxiliary_loss_mlp": 0.00206046, + "balance_loss_clip": 1.01613855, + "balance_loss_mlp": 0.18331242, + "epoch": 0.7906808958364647, + "flos": 20848837927680.0, + "grad_norm": 5.366986753067599, + "language_loss": 0.78122079, + "learning_rate": 4.421644538650231e-07, + "loss": 0.79557854, + "num_input_tokens_seen": 283804040, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.22729492, + "step": 13151, + "time_per_iteration": 2.796795129776001 + }, + { + "auxiliary_loss_clip": 0.01252112, + "auxiliary_loss_mlp": 0.0022148, + "balance_loss_clip": 1.02905989, + "balance_loss_mlp": 0.1980079, + "epoch": 0.7907410190891327, + "flos": 40735196974080.0, + "grad_norm": 2825.305667922748, + "language_loss": 0.77175272, + "learning_rate": 4.4192024183100306e-07, + "loss": 0.78648865, + "num_input_tokens_seen": 283827120, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.23498535, + "step": 13152, + "time_per_iteration": 2.885089635848999 + }, + { + "auxiliary_loss_clip": 0.01252121, + "auxiliary_loss_mlp": 0.00206774, + "balance_loss_clip": 1.0329206, + "balance_loss_mlp": 0.18368277, + "epoch": 0.7908011423418007, + "flos": 13261165050240.0, + "grad_norm": 12.785550469853707, + "language_loss": 0.81839502, + "learning_rate": 4.4167608887952367e-07, + "loss": 0.83298397, + "num_input_tokens_seen": 283844820, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.2310791, + "step": 13153, + "time_per_iteration": 2.6655986309051514 + }, + { + "auxiliary_loss_clip": 0.01257073, + "auxiliary_loss_mlp": 0.00213389, + "balance_loss_clip": 1.03523517, + "balance_loss_mlp": 0.1884149, + "epoch": 0.7908612655944687, + "flos": 19754747614080.0, + "grad_norm": 27.55603175311605, + "language_loss": 0.85902524, + "learning_rate": 4.4143199501984306e-07, + "loss": 0.87372983, + "num_input_tokens_seen": 283862870, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.24951172, + "step": 13154, + "time_per_iteration": 4.128228425979614 + }, + { + "auxiliary_loss_clip": 0.01273522, + "auxiliary_loss_mlp": 0.00220697, + "balance_loss_clip": 1.03573024, + "balance_loss_mlp": 0.1944351, + "epoch": 0.7909213888471366, + "flos": 21287738211840.0, + "grad_norm": 60.99277426490816, + "language_loss": 0.82499558, + "learning_rate": 4.411879602612185e-07, + "loss": 0.83993781, + "num_input_tokens_seen": 283882405, + "router_z_loss_clip": 2.3828125, + "router_z_loss_mlp": 0.26269531, + "step": 13155, + "time_per_iteration": 2.714562177658081 + }, + { + "auxiliary_loss_clip": 0.01253492, + "auxiliary_loss_mlp": 0.00197817, + "balance_loss_clip": 1.03034902, + "balance_loss_mlp": 0.17235386, + "epoch": 0.7909815120998046, + "flos": 22528882805760.0, + "grad_norm": 19.17917202892157, + "language_loss": 0.8447305, + "learning_rate": 4.4094398461290174e-07, + "loss": 0.85924351, + "num_input_tokens_seen": 283902070, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.25476074, + "step": 13156, + "time_per_iteration": 2.755249261856079 + }, + { + "auxiliary_loss_clip": 0.01235883, + "auxiliary_loss_mlp": 0.00191411, + "balance_loss_clip": 1.01635957, + "balance_loss_mlp": 0.16654375, + "epoch": 0.7910416353524725, + "flos": 26727702111360.0, + "grad_norm": 73.62265239592028, + "language_loss": 0.71521187, + "learning_rate": 4.4070006808414526e-07, + "loss": 0.7294848, + "num_input_tokens_seen": 283924100, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.24890137, + "step": 13157, + "time_per_iteration": 2.7404820919036865 + }, + { + "auxiliary_loss_clip": 0.01248506, + "auxiliary_loss_mlp": 0.00222514, + "balance_loss_clip": 1.02491736, + "balance_loss_mlp": 0.19724202, + "epoch": 0.7911017586051405, + "flos": 24644847139200.0, + "grad_norm": 4.330373624204686, + "language_loss": 0.82718408, + "learning_rate": 4.4045621068419894e-07, + "loss": 0.84189427, + "num_input_tokens_seen": 283944955, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.25256348, + "step": 13158, + "time_per_iteration": 2.7005462646484375 + }, + { + "auxiliary_loss_clip": 0.01224128, + "auxiliary_loss_mlp": 0.00192748, + "balance_loss_clip": 1.01337028, + "balance_loss_mlp": 0.17033666, + "epoch": 0.7911618818578086, + "flos": 17565489578880.0, + "grad_norm": 72.34876072229818, + "language_loss": 0.78789663, + "learning_rate": 4.40212412422309e-07, + "loss": 0.80206537, + "num_input_tokens_seen": 283963125, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.22412109, + "step": 13159, + "time_per_iteration": 2.626145362854004 + }, + { + "auxiliary_loss_clip": 0.01243581, + "auxiliary_loss_mlp": 0.00208023, + "balance_loss_clip": 1.02116323, + "balance_loss_mlp": 0.1827271, + "epoch": 0.7912220051104765, + "flos": 16721660298240.0, + "grad_norm": 3.3629252908345726, + "language_loss": 0.74565017, + "learning_rate": 4.399686733077206e-07, + "loss": 0.76016629, + "num_input_tokens_seen": 283982850, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.25305176, + "step": 13160, + "time_per_iteration": 2.6828629970550537 + }, + { + "auxiliary_loss_clip": 0.01217916, + "auxiliary_loss_mlp": 0.00207833, + "balance_loss_clip": 1.00897551, + "balance_loss_mlp": 0.18591064, + "epoch": 0.7912821283631445, + "flos": 13698736531200.0, + "grad_norm": 18.340203382754183, + "language_loss": 0.80170017, + "learning_rate": 4.3972499334967694e-07, + "loss": 0.81595773, + "num_input_tokens_seen": 283998275, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.21923828, + "step": 13161, + "time_per_iteration": 2.702373504638672 + }, + { + "auxiliary_loss_clip": 0.01232035, + "auxiliary_loss_mlp": 0.0020021, + "balance_loss_clip": 1.01702118, + "balance_loss_mlp": 0.17769149, + "epoch": 0.7913422516158124, + "flos": 23769021818880.0, + "grad_norm": 5.5870736735245785, + "language_loss": 0.81671053, + "learning_rate": 4.39481372557418e-07, + "loss": 0.83103293, + "num_input_tokens_seen": 284018750, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.22521973, + "step": 13162, + "time_per_iteration": 2.771310806274414 + }, + { + "auxiliary_loss_clip": 0.01262062, + "auxiliary_loss_mlp": 0.00207527, + "balance_loss_clip": 1.03806627, + "balance_loss_mlp": 0.18170607, + "epoch": 0.7914023748684804, + "flos": 19938251220480.0, + "grad_norm": 56.46613946293597, + "language_loss": 0.79814899, + "learning_rate": 4.392378109401811e-07, + "loss": 0.81284487, + "num_input_tokens_seen": 284037850, + "router_z_loss_clip": 2.24121094, + "router_z_loss_mlp": 0.25842285, + "step": 13163, + "time_per_iteration": 2.6334853172302246 + }, + { + "auxiliary_loss_clip": 0.01236097, + "auxiliary_loss_mlp": 0.00192129, + "balance_loss_clip": 1.02058673, + "balance_loss_mlp": 0.16802511, + "epoch": 0.7914624981211483, + "flos": 20594805966720.0, + "grad_norm": 6.670564976617729, + "language_loss": 0.80734742, + "learning_rate": 4.3899430850720296e-07, + "loss": 0.8216297, + "num_input_tokens_seen": 284056380, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.2409668, + "step": 13164, + "time_per_iteration": 2.6926498413085938 + }, + { + "auxiliary_loss_clip": 0.01245384, + "auxiliary_loss_mlp": 0.00210715, + "balance_loss_clip": 1.02469134, + "balance_loss_mlp": 0.18658713, + "epoch": 0.7915226213738163, + "flos": 21799465320960.0, + "grad_norm": 27.839520021736906, + "language_loss": 0.75325143, + "learning_rate": 4.387508652677177e-07, + "loss": 0.76781237, + "num_input_tokens_seen": 284074945, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.24108887, + "step": 13165, + "time_per_iteration": 2.6458113193511963 + }, + { + "auxiliary_loss_clip": 0.01215843, + "auxiliary_loss_mlp": 0.00198295, + "balance_loss_clip": 1.0081296, + "balance_loss_mlp": 0.17698073, + "epoch": 0.7915827446264843, + "flos": 16288362535680.0, + "grad_norm": 23.01351783449072, + "language_loss": 0.79140174, + "learning_rate": 4.385074812309557e-07, + "loss": 0.80554318, + "num_input_tokens_seen": 284092070, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.21313477, + "step": 13166, + "time_per_iteration": 2.6624362468719482 + }, + { + "auxiliary_loss_clip": 0.01251044, + "auxiliary_loss_mlp": 0.00216196, + "balance_loss_clip": 1.02996492, + "balance_loss_mlp": 0.19335584, + "epoch": 0.7916428678791523, + "flos": 25702595867520.0, + "grad_norm": 6.824687890987323, + "language_loss": 0.86259949, + "learning_rate": 4.382641564061462e-07, + "loss": 0.87727189, + "num_input_tokens_seen": 284112255, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.22839355, + "step": 13167, + "time_per_iteration": 2.89607310295105 + }, + { + "auxiliary_loss_clip": 0.01230473, + "auxiliary_loss_mlp": 0.00209847, + "balance_loss_clip": 1.01242733, + "balance_loss_mlp": 0.18545653, + "epoch": 0.7917029911318202, + "flos": 23878513451520.0, + "grad_norm": 2.361629165066597, + "language_loss": 0.92184865, + "learning_rate": 4.3802089080251713e-07, + "loss": 0.93625188, + "num_input_tokens_seen": 284132330, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.24389648, + "step": 13168, + "time_per_iteration": 2.948854446411133 + }, + { + "auxiliary_loss_clip": 0.01249058, + "auxiliary_loss_mlp": 0.00196824, + "balance_loss_clip": 1.02704024, + "balance_loss_mlp": 0.17173012, + "epoch": 0.7917631143844882, + "flos": 21646593037440.0, + "grad_norm": 2.6795114093287844, + "language_loss": 0.81002945, + "learning_rate": 4.3777768442929155e-07, + "loss": 0.82448828, + "num_input_tokens_seen": 284150640, + "router_z_loss_clip": 2.22167969, + "router_z_loss_mlp": 0.25109863, + "step": 13169, + "time_per_iteration": 2.7106986045837402 + }, + { + "auxiliary_loss_clip": 0.01252743, + "auxiliary_loss_mlp": 0.00206429, + "balance_loss_clip": 1.02745843, + "balance_loss_mlp": 0.18252765, + "epoch": 0.7918232376371561, + "flos": 38874198355200.0, + "grad_norm": 5.883455657817193, + "language_loss": 0.77751338, + "learning_rate": 4.3753453729569287e-07, + "loss": 0.79210508, + "num_input_tokens_seen": 284171910, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.23913574, + "step": 13170, + "time_per_iteration": 2.789294481277466 + }, + { + "auxiliary_loss_clip": 0.01227806, + "auxiliary_loss_mlp": 0.00200479, + "balance_loss_clip": 1.01495171, + "balance_loss_mlp": 0.17888981, + "epoch": 0.7918833608898241, + "flos": 20775544225920.0, + "grad_norm": 14.732063678976541, + "language_loss": 0.78285348, + "learning_rate": 4.372914494109412e-07, + "loss": 0.79713631, + "num_input_tokens_seen": 284191340, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.21606445, + "step": 13171, + "time_per_iteration": 2.6892974376678467 + }, + { + "auxiliary_loss_clip": 0.01239914, + "auxiliary_loss_mlp": 0.0020925, + "balance_loss_clip": 1.02061033, + "balance_loss_mlp": 0.18512245, + "epoch": 0.7919434841424922, + "flos": 33910122769920.0, + "grad_norm": 10.134422282176, + "language_loss": 0.76717472, + "learning_rate": 4.370484207842553e-07, + "loss": 0.78166634, + "num_input_tokens_seen": 284212495, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.24157715, + "step": 13172, + "time_per_iteration": 2.7374603748321533 + }, + { + "auxiliary_loss_clip": 0.01224686, + "auxiliary_loss_mlp": 0.00199895, + "balance_loss_clip": 1.00955963, + "balance_loss_mlp": 0.17799667, + "epoch": 0.7920036073951601, + "flos": 21064660796160.0, + "grad_norm": 29.289104506830444, + "language_loss": 0.86052388, + "learning_rate": 4.3680545142484893e-07, + "loss": 0.87476969, + "num_input_tokens_seen": 284230825, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.21899414, + "step": 13173, + "time_per_iteration": 2.7438483238220215 + }, + { + "auxiliary_loss_clip": 0.01227213, + "auxiliary_loss_mlp": 0.002092, + "balance_loss_clip": 1.01473081, + "balance_loss_mlp": 0.18653879, + "epoch": 0.7920637306478281, + "flos": 23655974739840.0, + "grad_norm": 3.9399529907613378, + "language_loss": 0.83291602, + "learning_rate": 4.365625413419365e-07, + "loss": 0.84728014, + "num_input_tokens_seen": 284250365, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.22680664, + "step": 13174, + "time_per_iteration": 2.713083267211914 + }, + { + "auxiliary_loss_clip": 0.01217537, + "auxiliary_loss_mlp": 0.00193682, + "balance_loss_clip": 1.00541139, + "balance_loss_mlp": 0.17000748, + "epoch": 0.792123853900496, + "flos": 27195438038400.0, + "grad_norm": 336.06490431539424, + "language_loss": 0.76501799, + "learning_rate": 4.363196905447297e-07, + "loss": 0.77913022, + "num_input_tokens_seen": 284269635, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.23669434, + "step": 13175, + "time_per_iteration": 2.7407984733581543 + }, + { + "auxiliary_loss_clip": 0.01243053, + "auxiliary_loss_mlp": 0.00208765, + "balance_loss_clip": 1.02545094, + "balance_loss_mlp": 0.1866276, + "epoch": 0.792183977153164, + "flos": 19098659744640.0, + "grad_norm": 7.59355990813891, + "language_loss": 0.69647914, + "learning_rate": 4.360768990424364e-07, + "loss": 0.71099734, + "num_input_tokens_seen": 284288380, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.22155762, + "step": 13176, + "time_per_iteration": 2.6199002265930176 + }, + { + "auxiliary_loss_clip": 0.01234879, + "auxiliary_loss_mlp": 0.0018657, + "balance_loss_clip": 1.02074289, + "balance_loss_mlp": 0.16438517, + "epoch": 0.7922441004058319, + "flos": 17128851851520.0, + "grad_norm": 2.305721745379097, + "language_loss": 0.83627439, + "learning_rate": 4.3583416684426376e-07, + "loss": 0.8504889, + "num_input_tokens_seen": 284306920, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.22192383, + "step": 13177, + "time_per_iteration": 2.695570468902588 + }, + { + "auxiliary_loss_clip": 0.01236885, + "auxiliary_loss_mlp": 0.00187289, + "balance_loss_clip": 1.02313697, + "balance_loss_mlp": 0.16467464, + "epoch": 0.7923042236585, + "flos": 17821640442240.0, + "grad_norm": 15.416012550430255, + "language_loss": 0.73075724, + "learning_rate": 4.355914939594174e-07, + "loss": 0.74499893, + "num_input_tokens_seen": 284324700, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.22607422, + "step": 13178, + "time_per_iteration": 2.620849370956421 + }, + { + "auxiliary_loss_clip": 0.01232952, + "auxiliary_loss_mlp": 0.00190259, + "balance_loss_clip": 1.01799262, + "balance_loss_mlp": 0.16760913, + "epoch": 0.7923643469111679, + "flos": 29935206892800.0, + "grad_norm": 112.6410339015841, + "language_loss": 0.76272768, + "learning_rate": 4.3534888039709726e-07, + "loss": 0.77695984, + "num_input_tokens_seen": 284345985, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.22680664, + "step": 13179, + "time_per_iteration": 2.7470221519470215 + }, + { + "auxiliary_loss_clip": 0.01258509, + "auxiliary_loss_mlp": 0.00227193, + "balance_loss_clip": 1.03357983, + "balance_loss_mlp": 0.19996569, + "epoch": 0.7924244701638359, + "flos": 22674716023680.0, + "grad_norm": 5.928043640276092, + "language_loss": 0.85032141, + "learning_rate": 4.3510632616650444e-07, + "loss": 0.86517847, + "num_input_tokens_seen": 284364475, + "router_z_loss_clip": 2.25097656, + "router_z_loss_mlp": 0.2722168, + "step": 13180, + "time_per_iteration": 2.673217296600342 + }, + { + "auxiliary_loss_clip": 0.01252826, + "auxiliary_loss_mlp": 0.00210144, + "balance_loss_clip": 1.03362322, + "balance_loss_mlp": 0.1856699, + "epoch": 0.7924845934165038, + "flos": 17968156018560.0, + "grad_norm": 10.196099968253366, + "language_loss": 0.90316534, + "learning_rate": 4.3486383127683646e-07, + "loss": 0.91779506, + "num_input_tokens_seen": 284382125, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.24487305, + "step": 13181, + "time_per_iteration": 2.6615023612976074 + }, + { + "auxiliary_loss_clip": 0.01228021, + "auxiliary_loss_mlp": 0.00206846, + "balance_loss_clip": 1.01442766, + "balance_loss_mlp": 0.18327856, + "epoch": 0.7925447166691718, + "flos": 23476960333440.0, + "grad_norm": 80.87985500191917, + "language_loss": 0.84591544, + "learning_rate": 4.346213957372895e-07, + "loss": 0.86026406, + "num_input_tokens_seen": 284401585, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.23571777, + "step": 13182, + "time_per_iteration": 2.708810567855835 + }, + { + "auxiliary_loss_clip": 0.01277994, + "auxiliary_loss_mlp": 0.00225701, + "balance_loss_clip": 1.04214764, + "balance_loss_mlp": 0.19843769, + "epoch": 0.7926048399218397, + "flos": 20447572118400.0, + "grad_norm": 12.67059254104185, + "language_loss": 0.84017777, + "learning_rate": 4.34379019557056e-07, + "loss": 0.85521472, + "num_input_tokens_seen": 284419125, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.27258301, + "step": 13183, + "time_per_iteration": 2.663673162460327 + }, + { + "auxiliary_loss_clip": 0.01257402, + "auxiliary_loss_mlp": 0.00212196, + "balance_loss_clip": 1.03160453, + "balance_loss_mlp": 0.18796109, + "epoch": 0.7926649631745077, + "flos": 37160038535040.0, + "grad_norm": 27.688113463298706, + "language_loss": 0.78536052, + "learning_rate": 4.341367027453264e-07, + "loss": 0.80005652, + "num_input_tokens_seen": 284440445, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.24267578, + "step": 13184, + "time_per_iteration": 2.9393982887268066 + }, + { + "auxiliary_loss_clip": 0.01250622, + "auxiliary_loss_mlp": 0.00209306, + "balance_loss_clip": 1.02900505, + "balance_loss_mlp": 0.18470135, + "epoch": 0.7927250864271758, + "flos": 17018606033280.0, + "grad_norm": 58.15230256867171, + "language_loss": 0.807877, + "learning_rate": 4.338944453112907e-07, + "loss": 0.82247627, + "num_input_tokens_seen": 284459370, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.24621582, + "step": 13185, + "time_per_iteration": 4.127445220947266 + }, + { + "auxiliary_loss_clip": 0.01246261, + "auxiliary_loss_mlp": 0.00211811, + "balance_loss_clip": 1.02436459, + "balance_loss_mlp": 0.18705113, + "epoch": 0.7927852096798437, + "flos": 17749208666880.0, + "grad_norm": 97.7198930677389, + "language_loss": 0.74513733, + "learning_rate": 4.3365224726413375e-07, + "loss": 0.759718, + "num_input_tokens_seen": 284477525, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.24743652, + "step": 13186, + "time_per_iteration": 2.612457275390625 + }, + { + "auxiliary_loss_clip": 0.01231266, + "auxiliary_loss_mlp": 0.00210116, + "balance_loss_clip": 1.01809263, + "balance_loss_mlp": 0.18715589, + "epoch": 0.7928453329325117, + "flos": 23838436851840.0, + "grad_norm": 62.64566417811416, + "language_loss": 0.8240723, + "learning_rate": 4.334101086130408e-07, + "loss": 0.83848614, + "num_input_tokens_seen": 284496590, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.22973633, + "step": 13187, + "time_per_iteration": 4.140455484390259 + }, + { + "auxiliary_loss_clip": 0.01229977, + "auxiliary_loss_mlp": 0.00208689, + "balance_loss_clip": 1.01439929, + "balance_loss_mlp": 0.18502587, + "epoch": 0.7929054561851796, + "flos": 17454920538240.0, + "grad_norm": 23.054145446397193, + "language_loss": 0.81074381, + "learning_rate": 4.3316802936719334e-07, + "loss": 0.82513052, + "num_input_tokens_seen": 284511470, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.2364502, + "step": 13188, + "time_per_iteration": 2.587362051010132 + }, + { + "auxiliary_loss_clip": 0.01246613, + "auxiliary_loss_mlp": 0.0021818, + "balance_loss_clip": 1.02142119, + "balance_loss_mlp": 0.19080992, + "epoch": 0.7929655794378476, + "flos": 21981280988160.0, + "grad_norm": 261.1744806867916, + "language_loss": 0.74255085, + "learning_rate": 4.329260095357725e-07, + "loss": 0.75719875, + "num_input_tokens_seen": 284531125, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.2734375, + "step": 13189, + "time_per_iteration": 2.6871156692504883 + }, + { + "auxiliary_loss_clip": 0.01235591, + "auxiliary_loss_mlp": 0.00210785, + "balance_loss_clip": 1.01297092, + "balance_loss_mlp": 0.18726525, + "epoch": 0.7930257026905155, + "flos": 17273930883840.0, + "grad_norm": 19.950613366442127, + "language_loss": 0.83568513, + "learning_rate": 4.3268404912795307e-07, + "loss": 0.85014886, + "num_input_tokens_seen": 284549340, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.23510742, + "step": 13190, + "time_per_iteration": 2.6609575748443604 + }, + { + "auxiliary_loss_clip": 0.01238991, + "auxiliary_loss_mlp": 0.00203032, + "balance_loss_clip": 1.02864206, + "balance_loss_mlp": 0.18066819, + "epoch": 0.7930858259431836, + "flos": 27300584125440.0, + "grad_norm": 18.692852160756857, + "language_loss": 0.79087365, + "learning_rate": 4.3244214815291166e-07, + "loss": 0.8052938, + "num_input_tokens_seen": 284567060, + "router_z_loss_clip": 2.10253906, + "router_z_loss_mlp": 0.22351074, + "step": 13191, + "time_per_iteration": 4.354615688323975 + }, + { + "auxiliary_loss_clip": 0.01259762, + "auxiliary_loss_mlp": 0.00221318, + "balance_loss_clip": 1.03346813, + "balance_loss_mlp": 0.19577141, + "epoch": 0.7931459491958515, + "flos": 19863736456320.0, + "grad_norm": 5.748731444252395, + "language_loss": 0.7596311, + "learning_rate": 4.322003066198219e-07, + "loss": 0.7744419, + "num_input_tokens_seen": 284586600, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.25549316, + "step": 13192, + "time_per_iteration": 2.6866166591644287 + }, + { + "auxiliary_loss_clip": 0.01255357, + "auxiliary_loss_mlp": 0.00217831, + "balance_loss_clip": 1.03312588, + "balance_loss_mlp": 0.19357198, + "epoch": 0.7932060724485195, + "flos": 23147120718720.0, + "grad_norm": 2.1316742100931996, + "language_loss": 0.82553607, + "learning_rate": 4.3195852453785274e-07, + "loss": 0.84026796, + "num_input_tokens_seen": 284605715, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.24279785, + "step": 13193, + "time_per_iteration": 2.709317684173584 + }, + { + "auxiliary_loss_clip": 0.01249201, + "auxiliary_loss_mlp": 0.00225588, + "balance_loss_clip": 1.03111243, + "balance_loss_mlp": 0.2017104, + "epoch": 0.7932661957011874, + "flos": 29934847756800.0, + "grad_norm": 12.473432993562028, + "language_loss": 0.79882789, + "learning_rate": 4.317168019161741e-07, + "loss": 0.81357574, + "num_input_tokens_seen": 284628540, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.23876953, + "step": 13194, + "time_per_iteration": 2.8075287342071533 + }, + { + "auxiliary_loss_clip": 0.01262409, + "auxiliary_loss_mlp": 0.00223307, + "balance_loss_clip": 1.03632534, + "balance_loss_mlp": 0.19793949, + "epoch": 0.7933263189538554, + "flos": 22559119079040.0, + "grad_norm": 66.74401134756553, + "language_loss": 0.78635406, + "learning_rate": 4.314751387639517e-07, + "loss": 0.80121118, + "num_input_tokens_seen": 284646040, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.25366211, + "step": 13195, + "time_per_iteration": 2.7143137454986572 + }, + { + "auxiliary_loss_clip": 0.01252222, + "auxiliary_loss_mlp": 0.00214503, + "balance_loss_clip": 1.02815485, + "balance_loss_mlp": 0.18883723, + "epoch": 0.7933864422065233, + "flos": 25479051575040.0, + "grad_norm": 20.6456130027667, + "language_loss": 0.85126543, + "learning_rate": 4.3123353509034844e-07, + "loss": 0.8659327, + "num_input_tokens_seen": 284665110, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.2565918, + "step": 13196, + "time_per_iteration": 4.142575025558472 + }, + { + "auxiliary_loss_clip": 0.01257809, + "auxiliary_loss_mlp": 0.00212112, + "balance_loss_clip": 1.03633583, + "balance_loss_mlp": 0.185707, + "epoch": 0.7934465654591913, + "flos": 33583156243200.0, + "grad_norm": 5.59741657169601, + "language_loss": 0.76953828, + "learning_rate": 4.309919909045268e-07, + "loss": 0.7842375, + "num_input_tokens_seen": 284686515, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.26403809, + "step": 13197, + "time_per_iteration": 2.8046646118164062 + }, + { + "auxiliary_loss_clip": 0.0124231, + "auxiliary_loss_mlp": 0.00208924, + "balance_loss_clip": 1.02434397, + "balance_loss_mlp": 0.18297246, + "epoch": 0.7935066887118594, + "flos": 31432538263680.0, + "grad_norm": 17.536607378824073, + "language_loss": 0.74391472, + "learning_rate": 4.30750506215646e-07, + "loss": 0.75842708, + "num_input_tokens_seen": 284707300, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.25952148, + "step": 13198, + "time_per_iteration": 2.780355215072632 + }, + { + "auxiliary_loss_clip": 0.01247987, + "auxiliary_loss_mlp": 0.00213172, + "balance_loss_clip": 1.0222739, + "balance_loss_mlp": 0.18614721, + "epoch": 0.7935668119645273, + "flos": 14682616940160.0, + "grad_norm": 1532.1523214880265, + "language_loss": 0.83622634, + "learning_rate": 4.30509081032864e-07, + "loss": 0.85083795, + "num_input_tokens_seen": 284723545, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.27038574, + "step": 13199, + "time_per_iteration": 2.644821882247925 + }, + { + "auxiliary_loss_clip": 0.01247623, + "auxiliary_loss_mlp": 0.00209656, + "balance_loss_clip": 1.02907395, + "balance_loss_mlp": 0.18545701, + "epoch": 0.7936269352171953, + "flos": 18004246208640.0, + "grad_norm": 3.697888002567617, + "language_loss": 0.88435066, + "learning_rate": 4.302677153653349e-07, + "loss": 0.89892352, + "num_input_tokens_seen": 284742650, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.24206543, + "step": 13200, + "time_per_iteration": 2.648928165435791 + }, + { + "auxiliary_loss_clip": 0.01234573, + "auxiliary_loss_mlp": 0.00199022, + "balance_loss_clip": 1.02245903, + "balance_loss_mlp": 0.17709893, + "epoch": 0.7936870584698632, + "flos": 18880215183360.0, + "grad_norm": 24.47996604661523, + "language_loss": 0.82951379, + "learning_rate": 4.3002640922221077e-07, + "loss": 0.84384978, + "num_input_tokens_seen": 284760955, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.21923828, + "step": 13201, + "time_per_iteration": 2.610272169113159 + }, + { + "auxiliary_loss_clip": 0.01245044, + "auxiliary_loss_mlp": 0.00213825, + "balance_loss_clip": 1.02506566, + "balance_loss_mlp": 0.18889804, + "epoch": 0.7937471817225312, + "flos": 23367001824000.0, + "grad_norm": 30.200425334299304, + "language_loss": 0.75145042, + "learning_rate": 4.2978516261264296e-07, + "loss": 0.76603913, + "num_input_tokens_seen": 284780745, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.24938965, + "step": 13202, + "time_per_iteration": 2.6895530223846436 + }, + { + "auxiliary_loss_clip": 0.01241447, + "auxiliary_loss_mlp": 0.00222387, + "balance_loss_clip": 1.02177119, + "balance_loss_mlp": 0.19666177, + "epoch": 0.7938073049751991, + "flos": 22674428714880.0, + "grad_norm": 6.443916083163891, + "language_loss": 0.82362688, + "learning_rate": 4.2954397554577884e-07, + "loss": 0.83826524, + "num_input_tokens_seen": 284799000, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.25732422, + "step": 13203, + "time_per_iteration": 2.634995460510254 + }, + { + "auxiliary_loss_clip": 0.01261458, + "auxiliary_loss_mlp": 0.00215912, + "balance_loss_clip": 1.04227877, + "balance_loss_mlp": 0.19172452, + "epoch": 0.7938674282278672, + "flos": 22851431959680.0, + "grad_norm": 199.7582469653167, + "language_loss": 0.77239949, + "learning_rate": 4.293028480307643e-07, + "loss": 0.78717315, + "num_input_tokens_seen": 284817450, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.24206543, + "step": 13204, + "time_per_iteration": 2.6831390857696533 + }, + { + "auxiliary_loss_clip": 0.01232503, + "auxiliary_loss_mlp": 0.00220603, + "balance_loss_clip": 1.01867604, + "balance_loss_mlp": 0.19690405, + "epoch": 0.7939275514805351, + "flos": 27012509049600.0, + "grad_norm": 20.840402908972248, + "language_loss": 0.83891642, + "learning_rate": 4.290617800767438e-07, + "loss": 0.85344756, + "num_input_tokens_seen": 284838865, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.23681641, + "step": 13205, + "time_per_iteration": 2.7248647212982178 + }, + { + "auxiliary_loss_clip": 0.01240337, + "auxiliary_loss_mlp": 0.00208007, + "balance_loss_clip": 1.02185595, + "balance_loss_mlp": 0.18429592, + "epoch": 0.7939876747332031, + "flos": 21142838747520.0, + "grad_norm": 4.426987165718106, + "language_loss": 0.84820265, + "learning_rate": 4.28820771692858e-07, + "loss": 0.86268616, + "num_input_tokens_seen": 284857975, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.23730469, + "step": 13206, + "time_per_iteration": 2.6756465435028076 + }, + { + "auxiliary_loss_clip": 0.01262079, + "auxiliary_loss_mlp": 0.00205245, + "balance_loss_clip": 1.03710186, + "balance_loss_mlp": 0.18031834, + "epoch": 0.794047797985871, + "flos": 23289075267840.0, + "grad_norm": 4.393236190342007, + "language_loss": 0.86152279, + "learning_rate": 4.285798228882456e-07, + "loss": 0.87619603, + "num_input_tokens_seen": 284877145, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.24938965, + "step": 13207, + "time_per_iteration": 2.741239309310913 + }, + { + "auxiliary_loss_clip": 0.01250608, + "auxiliary_loss_mlp": 0.0020701, + "balance_loss_clip": 1.03106022, + "balance_loss_mlp": 0.18141584, + "epoch": 0.794107921238539, + "flos": 24608074590720.0, + "grad_norm": 5.868907872537064, + "language_loss": 0.90563428, + "learning_rate": 4.2833893367204375e-07, + "loss": 0.92021048, + "num_input_tokens_seen": 284895560, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.25622559, + "step": 13208, + "time_per_iteration": 2.651477336883545 + }, + { + "auxiliary_loss_clip": 0.01123604, + "auxiliary_loss_mlp": 0.00062154, + "balance_loss_clip": 0.97412688, + "balance_loss_mlp": 0.05414264, + "epoch": 0.7941680444912069, + "flos": 64093690252800.0, + "grad_norm": 0.7252432645717167, + "language_loss": 0.57905078, + "learning_rate": 4.280981040533875e-07, + "loss": 0.59090841, + "num_input_tokens_seen": 284963135, + "router_z_loss_clip": 1.4921875, + "router_z_loss_mlp": 0.08007812, + "step": 13209, + "time_per_iteration": 3.232123613357544 + }, + { + "auxiliary_loss_clip": 0.01256523, + "auxiliary_loss_mlp": 0.00219065, + "balance_loss_clip": 1.03098381, + "balance_loss_mlp": 0.19415006, + "epoch": 0.794228167743875, + "flos": 24388839930240.0, + "grad_norm": 28.496231512800005, + "language_loss": 0.72121608, + "learning_rate": 4.2785733404140825e-07, + "loss": 0.73597193, + "num_input_tokens_seen": 284981755, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.24926758, + "step": 13210, + "time_per_iteration": 2.6655592918395996 + }, + { + "auxiliary_loss_clip": 0.01231483, + "auxiliary_loss_mlp": 0.00196521, + "balance_loss_clip": 1.01876211, + "balance_loss_mlp": 0.17349012, + "epoch": 0.794288290996543, + "flos": 28512498026880.0, + "grad_norm": 162.71740213261276, + "language_loss": 0.76888514, + "learning_rate": 4.2761662364523676e-07, + "loss": 0.78316516, + "num_input_tokens_seen": 285003060, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.23022461, + "step": 13211, + "time_per_iteration": 2.7732200622558594 + }, + { + "auxiliary_loss_clip": 0.01253916, + "auxiliary_loss_mlp": 0.00231163, + "balance_loss_clip": 1.02654088, + "balance_loss_mlp": 0.20586649, + "epoch": 0.7943484142492109, + "flos": 25922117836800.0, + "grad_norm": 32.546236792785756, + "language_loss": 0.79482865, + "learning_rate": 4.2737597287400074e-07, + "loss": 0.80967945, + "num_input_tokens_seen": 285021640, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.25280762, + "step": 13212, + "time_per_iteration": 2.7046329975128174 + }, + { + "auxiliary_loss_clip": 0.01215762, + "auxiliary_loss_mlp": 0.00198424, + "balance_loss_clip": 1.00960922, + "balance_loss_mlp": 0.17749101, + "epoch": 0.7944085375018789, + "flos": 23915286000000.0, + "grad_norm": 8.352817809473706, + "language_loss": 0.86004496, + "learning_rate": 4.271353817368246e-07, + "loss": 0.87418675, + "num_input_tokens_seen": 285040490, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.20922852, + "step": 13213, + "time_per_iteration": 2.8237009048461914 + }, + { + "auxiliary_loss_clip": 0.01259992, + "auxiliary_loss_mlp": 0.00204988, + "balance_loss_clip": 1.03695428, + "balance_loss_mlp": 0.18112186, + "epoch": 0.7944686607545468, + "flos": 20229953569920.0, + "grad_norm": 3747.4457343000236, + "language_loss": 0.81679916, + "learning_rate": 4.268948502428327e-07, + "loss": 0.83144897, + "num_input_tokens_seen": 285059270, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.23852539, + "step": 13214, + "time_per_iteration": 2.6512110233306885 + }, + { + "auxiliary_loss_clip": 0.01224659, + "auxiliary_loss_mlp": 0.00185573, + "balance_loss_clip": 1.00659502, + "balance_loss_mlp": 0.16251804, + "epoch": 0.7945287840072148, + "flos": 21980993679360.0, + "grad_norm": 60.139786442736494, + "language_loss": 0.80935949, + "learning_rate": 4.2665437840114535e-07, + "loss": 0.82346177, + "num_input_tokens_seen": 285075390, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.23071289, + "step": 13215, + "time_per_iteration": 2.664173126220703 + }, + { + "auxiliary_loss_clip": 0.0124367, + "auxiliary_loss_mlp": 0.00212705, + "balance_loss_clip": 1.02894068, + "balance_loss_mlp": 0.18867245, + "epoch": 0.7945889072598827, + "flos": 26397718842240.0, + "grad_norm": 2.177067793063461, + "language_loss": 0.86096895, + "learning_rate": 4.2641396622088253e-07, + "loss": 0.87553269, + "num_input_tokens_seen": 285096290, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.24023438, + "step": 13216, + "time_per_iteration": 2.709200620651245 + }, + { + "auxiliary_loss_clip": 0.01243606, + "auxiliary_loss_mlp": 0.00214103, + "balance_loss_clip": 1.02349782, + "balance_loss_mlp": 0.18861577, + "epoch": 0.7946490305125508, + "flos": 25810255906560.0, + "grad_norm": 8.098278788984295, + "language_loss": 0.81597036, + "learning_rate": 4.261736137111598e-07, + "loss": 0.83054745, + "num_input_tokens_seen": 285116020, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.25500488, + "step": 13217, + "time_per_iteration": 2.8105363845825195 + }, + { + "auxiliary_loss_clip": 0.01230251, + "auxiliary_loss_mlp": 0.00201744, + "balance_loss_clip": 1.01817632, + "balance_loss_mlp": 0.17990527, + "epoch": 0.7947091537652187, + "flos": 15960965045760.0, + "grad_norm": 28.35786312450135, + "language_loss": 0.81684959, + "learning_rate": 4.259333208810907e-07, + "loss": 0.83116955, + "num_input_tokens_seen": 285133510, + "router_z_loss_clip": 2.12207031, + "router_z_loss_mlp": 0.21826172, + "step": 13218, + "time_per_iteration": 2.6333329677581787 + }, + { + "auxiliary_loss_clip": 0.01253807, + "auxiliary_loss_mlp": 0.00218518, + "balance_loss_clip": 1.03306508, + "balance_loss_mlp": 0.19452128, + "epoch": 0.7947692770178867, + "flos": 18587866389120.0, + "grad_norm": 17.89620280451043, + "language_loss": 0.94080561, + "learning_rate": 4.2569308773978817e-07, + "loss": 0.95552886, + "num_input_tokens_seen": 285151690, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.24023438, + "step": 13219, + "time_per_iteration": 2.712911605834961 + }, + { + "auxiliary_loss_clip": 0.01248302, + "auxiliary_loss_mlp": 0.00200053, + "balance_loss_clip": 1.02847815, + "balance_loss_mlp": 0.17429203, + "epoch": 0.7948294002705546, + "flos": 20442220992000.0, + "grad_norm": 2.558484521118442, + "language_loss": 0.85173559, + "learning_rate": 4.2545291429636123e-07, + "loss": 0.8662191, + "num_input_tokens_seen": 285170485, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.2578125, + "step": 13220, + "time_per_iteration": 2.725888252258301 + }, + { + "auxiliary_loss_clip": 0.01261714, + "auxiliary_loss_mlp": 0.00223578, + "balance_loss_clip": 1.03298163, + "balance_loss_mlp": 0.1973879, + "epoch": 0.7948895235232226, + "flos": 38181194282880.0, + "grad_norm": 171.99765494257588, + "language_loss": 0.78144515, + "learning_rate": 4.252128005599176e-07, + "loss": 0.79629809, + "num_input_tokens_seen": 285191050, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.26220703, + "step": 13221, + "time_per_iteration": 2.7997210025787354 + }, + { + "auxiliary_loss_clip": 0.01246565, + "auxiliary_loss_mlp": 0.0019741, + "balance_loss_clip": 1.03223431, + "balance_loss_mlp": 0.17357981, + "epoch": 0.7949496467758905, + "flos": 15559806977280.0, + "grad_norm": 50.89350430174263, + "language_loss": 0.83295095, + "learning_rate": 4.249727465395634e-07, + "loss": 0.84739065, + "num_input_tokens_seen": 285208750, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.23852539, + "step": 13222, + "time_per_iteration": 2.64424729347229 + }, + { + "auxiliary_loss_clip": 0.01128851, + "auxiliary_loss_mlp": 0.00113711, + "balance_loss_clip": 0.97379756, + "balance_loss_mlp": 0.10522332, + "epoch": 0.7950097700285585, + "flos": 70897036728960.0, + "grad_norm": 0.7655919837487744, + "language_loss": 0.65554649, + "learning_rate": 4.247327522443993e-07, + "loss": 0.66797215, + "num_input_tokens_seen": 285264605, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.08496094, + "step": 13223, + "time_per_iteration": 3.0239062309265137 + }, + { + "auxiliary_loss_clip": 0.01237705, + "auxiliary_loss_mlp": 0.00221256, + "balance_loss_clip": 1.01925445, + "balance_loss_mlp": 0.19607854, + "epoch": 0.7950698932812266, + "flos": 23951627585280.0, + "grad_norm": 5.8261822460486075, + "language_loss": 0.7997967, + "learning_rate": 4.2449281768352717e-07, + "loss": 0.81438631, + "num_input_tokens_seen": 285283940, + "router_z_loss_clip": 2.18457031, + "router_z_loss_mlp": 0.25170898, + "step": 13224, + "time_per_iteration": 2.6759393215179443 + }, + { + "auxiliary_loss_clip": 0.01118175, + "auxiliary_loss_mlp": 0.00067091, + "balance_loss_clip": 0.9702636, + "balance_loss_mlp": 0.0595095, + "epoch": 0.7951300165338945, + "flos": 60282561415680.0, + "grad_norm": 597.1156136479367, + "language_loss": 0.54299188, + "learning_rate": 4.2425294286604527e-07, + "loss": 0.55484462, + "num_input_tokens_seen": 285349525, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.07568359, + "step": 13225, + "time_per_iteration": 3.177720308303833 + }, + { + "auxiliary_loss_clip": 0.01239284, + "auxiliary_loss_mlp": 0.00199766, + "balance_loss_clip": 1.02355361, + "balance_loss_mlp": 0.17674688, + "epoch": 0.7951901397865625, + "flos": 22819004956800.0, + "grad_norm": 172.89885551278357, + "language_loss": 0.7294836, + "learning_rate": 4.2401312780105034e-07, + "loss": 0.74387407, + "num_input_tokens_seen": 285367355, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.23010254, + "step": 13226, + "time_per_iteration": 2.636798143386841 + }, + { + "auxiliary_loss_clip": 0.012577, + "auxiliary_loss_mlp": 0.00196597, + "balance_loss_clip": 1.03537607, + "balance_loss_mlp": 0.17077598, + "epoch": 0.7952502630392304, + "flos": 35695672871040.0, + "grad_norm": 8.864164496337763, + "language_loss": 0.78903735, + "learning_rate": 4.237733724976349e-07, + "loss": 0.80358034, + "num_input_tokens_seen": 285386190, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.25830078, + "step": 13227, + "time_per_iteration": 4.30753231048584 + }, + { + "auxiliary_loss_clip": 0.01232542, + "auxiliary_loss_mlp": 0.00195765, + "balance_loss_clip": 1.019449, + "balance_loss_mlp": 0.17441472, + "epoch": 0.7953103862918984, + "flos": 25629840869760.0, + "grad_norm": 76.95280039873832, + "language_loss": 0.77834803, + "learning_rate": 4.2353367696489184e-07, + "loss": 0.79263109, + "num_input_tokens_seen": 285406150, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.21325684, + "step": 13228, + "time_per_iteration": 2.643220901489258 + }, + { + "auxiliary_loss_clip": 0.01237182, + "auxiliary_loss_mlp": 0.0021127, + "balance_loss_clip": 1.02140403, + "balance_loss_mlp": 0.18760739, + "epoch": 0.7953705095445663, + "flos": 40551980676480.0, + "grad_norm": 95.22465435190553, + "language_loss": 0.7666254, + "learning_rate": 4.232940412119095e-07, + "loss": 0.78110993, + "num_input_tokens_seen": 285429900, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.23669434, + "step": 13229, + "time_per_iteration": 4.18335747718811 + }, + { + "auxiliary_loss_clip": 0.01278565, + "auxiliary_loss_mlp": 0.00207421, + "balance_loss_clip": 1.0498724, + "balance_loss_mlp": 0.17928779, + "epoch": 0.7954306327972344, + "flos": 27636672706560.0, + "grad_norm": 18.006080560959152, + "language_loss": 0.7969172, + "learning_rate": 4.2305446524777457e-07, + "loss": 0.81177711, + "num_input_tokens_seen": 285452555, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.28112793, + "step": 13230, + "time_per_iteration": 2.7070672512054443 + }, + { + "auxiliary_loss_clip": 0.01156302, + "auxiliary_loss_mlp": 0.00056207, + "balance_loss_clip": 1.00118375, + "balance_loss_mlp": 0.0491974, + "epoch": 0.7954907560499023, + "flos": 59504055995520.0, + "grad_norm": 0.9696029197366931, + "language_loss": 0.6290611, + "learning_rate": 4.2281494908157247e-07, + "loss": 0.64118612, + "num_input_tokens_seen": 285515700, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.0703125, + "step": 13231, + "time_per_iteration": 3.2635040283203125 + }, + { + "auxiliary_loss_clip": 0.01248838, + "auxiliary_loss_mlp": 0.00219729, + "balance_loss_clip": 1.02860308, + "balance_loss_mlp": 0.19618568, + "epoch": 0.7955508793025703, + "flos": 20120533764480.0, + "grad_norm": 3.516625775772951, + "language_loss": 0.77261776, + "learning_rate": 4.2257549272238566e-07, + "loss": 0.78730345, + "num_input_tokens_seen": 285533910, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.23547363, + "step": 13232, + "time_per_iteration": 2.8252370357513428 + }, + { + "auxiliary_loss_clip": 0.01237148, + "auxiliary_loss_mlp": 0.00198851, + "balance_loss_clip": 1.0200963, + "balance_loss_mlp": 0.17505634, + "epoch": 0.7956110025552382, + "flos": 26505378881280.0, + "grad_norm": 211.27522839723017, + "language_loss": 0.84432185, + "learning_rate": 4.223360961792952e-07, + "loss": 0.8586818, + "num_input_tokens_seen": 285554080, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.23803711, + "step": 13233, + "time_per_iteration": 4.2158613204956055 + }, + { + "auxiliary_loss_clip": 0.0125078, + "auxiliary_loss_mlp": 0.00205236, + "balance_loss_clip": 1.0344069, + "balance_loss_mlp": 0.18241939, + "epoch": 0.7956711258079062, + "flos": 22565475786240.0, + "grad_norm": 12.849951536221331, + "language_loss": 0.85120881, + "learning_rate": 4.220967594613769e-07, + "loss": 0.86576891, + "num_input_tokens_seen": 285572325, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.2277832, + "step": 13234, + "time_per_iteration": 2.6405181884765625 + }, + { + "auxiliary_loss_clip": 0.01225035, + "auxiliary_loss_mlp": 0.00201068, + "balance_loss_clip": 1.01100349, + "balance_loss_mlp": 0.17809603, + "epoch": 0.7957312490605741, + "flos": 17379005143680.0, + "grad_norm": 6.2163458496577375, + "language_loss": 0.79627568, + "learning_rate": 4.218574825777077e-07, + "loss": 0.81053668, + "num_input_tokens_seen": 285589770, + "router_z_loss_clip": 2.14160156, + "router_z_loss_mlp": 0.22961426, + "step": 13235, + "time_per_iteration": 2.61320424079895 + }, + { + "auxiliary_loss_clip": 0.01239627, + "auxiliary_loss_mlp": 0.0019168, + "balance_loss_clip": 1.02083945, + "balance_loss_mlp": 0.16854101, + "epoch": 0.7957913723132422, + "flos": 22491427898880.0, + "grad_norm": 6.994640592956855, + "language_loss": 0.76434493, + "learning_rate": 4.2161826553736145e-07, + "loss": 0.77865803, + "num_input_tokens_seen": 285610065, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.23144531, + "step": 13236, + "time_per_iteration": 2.6703128814697266 + }, + { + "auxiliary_loss_clip": 0.01233962, + "auxiliary_loss_mlp": 0.0020676, + "balance_loss_clip": 1.01629305, + "balance_loss_mlp": 0.18288217, + "epoch": 0.7958514955659101, + "flos": 22638087129600.0, + "grad_norm": 143.18470435375372, + "language_loss": 0.81850857, + "learning_rate": 4.2137910834940826e-07, + "loss": 0.83291578, + "num_input_tokens_seen": 285628480, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.23852539, + "step": 13237, + "time_per_iteration": 2.705169439315796 + }, + { + "auxiliary_loss_clip": 0.01269457, + "auxiliary_loss_mlp": 0.00218972, + "balance_loss_clip": 1.04133892, + "balance_loss_mlp": 0.19217353, + "epoch": 0.7959116188185781, + "flos": 20704225772160.0, + "grad_norm": 3.884567771219963, + "language_loss": 0.82213843, + "learning_rate": 4.211400110229175e-07, + "loss": 0.83702266, + "num_input_tokens_seen": 285647805, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.2677002, + "step": 13238, + "time_per_iteration": 4.120609521865845 + }, + { + "auxiliary_loss_clip": 0.01247004, + "auxiliary_loss_mlp": 0.00217391, + "balance_loss_clip": 1.02469039, + "balance_loss_mlp": 0.19147514, + "epoch": 0.7959717420712461, + "flos": 19024683684480.0, + "grad_norm": 35.05938039467809, + "language_loss": 0.83483315, + "learning_rate": 4.2090097356695684e-07, + "loss": 0.84947711, + "num_input_tokens_seen": 285665505, + "router_z_loss_clip": 2.22167969, + "router_z_loss_mlp": 0.25866699, + "step": 13239, + "time_per_iteration": 2.6624863147735596 + }, + { + "auxiliary_loss_clip": 0.01248082, + "auxiliary_loss_mlp": 0.00218798, + "balance_loss_clip": 1.02833486, + "balance_loss_mlp": 0.19434775, + "epoch": 0.796031865323914, + "flos": 26356636661760.0, + "grad_norm": 4.710936968232002, + "language_loss": 0.78840083, + "learning_rate": 4.2066199599058814e-07, + "loss": 0.80306965, + "num_input_tokens_seen": 285685855, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.24450684, + "step": 13240, + "time_per_iteration": 2.727114677429199 + }, + { + "auxiliary_loss_clip": 0.01163333, + "auxiliary_loss_mlp": 0.00076045, + "balance_loss_clip": 1.01387215, + "balance_loss_mlp": 0.06903511, + "epoch": 0.796091988576582, + "flos": 62069440320000.0, + "grad_norm": 0.8749732617329505, + "language_loss": 0.57883584, + "learning_rate": 4.2042307830287526e-07, + "loss": 0.59122962, + "num_input_tokens_seen": 285735710, + "router_z_loss_clip": 1.4921875, + "router_z_loss_mlp": 0.0703125, + "step": 13241, + "time_per_iteration": 2.9187567234039307 + }, + { + "auxiliary_loss_clip": 0.01226674, + "auxiliary_loss_mlp": 0.00202949, + "balance_loss_clip": 1.01931608, + "balance_loss_mlp": 0.18180105, + "epoch": 0.7961521118292499, + "flos": 39020103400320.0, + "grad_norm": 10.618924393334641, + "language_loss": 0.72368616, + "learning_rate": 4.201842205128772e-07, + "loss": 0.73798239, + "num_input_tokens_seen": 285757045, + "router_z_loss_clip": 2.07324219, + "router_z_loss_mlp": 0.21166992, + "step": 13242, + "time_per_iteration": 2.810734272003174 + }, + { + "auxiliary_loss_clip": 0.0124891, + "auxiliary_loss_mlp": 0.00227788, + "balance_loss_clip": 1.02607584, + "balance_loss_mlp": 0.20354125, + "epoch": 0.796212235081918, + "flos": 21762836426880.0, + "grad_norm": 613.423243443754, + "language_loss": 0.87043697, + "learning_rate": 4.199454226296526e-07, + "loss": 0.88520396, + "num_input_tokens_seen": 285776050, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.24255371, + "step": 13243, + "time_per_iteration": 2.6423768997192383 + }, + { + "auxiliary_loss_clip": 0.01236461, + "auxiliary_loss_mlp": 0.00243101, + "balance_loss_clip": 1.01945519, + "balance_loss_mlp": 0.21741116, + "epoch": 0.7962723583345859, + "flos": 21178857110400.0, + "grad_norm": 10.08545002060913, + "language_loss": 0.85709357, + "learning_rate": 4.1970668466225565e-07, + "loss": 0.87188917, + "num_input_tokens_seen": 285796830, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.25695801, + "step": 13244, + "time_per_iteration": 2.7063939571380615 + }, + { + "auxiliary_loss_clip": 0.01237093, + "auxiliary_loss_mlp": 0.00207172, + "balance_loss_clip": 1.01740098, + "balance_loss_mlp": 0.18203031, + "epoch": 0.7963324815872539, + "flos": 17128636369920.0, + "grad_norm": 3.8905176519017783, + "language_loss": 0.81318265, + "learning_rate": 4.1946800661973934e-07, + "loss": 0.82762527, + "num_input_tokens_seen": 285814755, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.25146484, + "step": 13245, + "time_per_iteration": 2.592916965484619 + }, + { + "auxiliary_loss_clip": 0.01256042, + "auxiliary_loss_mlp": 0.00207187, + "balance_loss_clip": 1.03243637, + "balance_loss_mlp": 0.18043642, + "epoch": 0.7963926048399218, + "flos": 21397481239680.0, + "grad_norm": 1.905362855459799, + "language_loss": 0.84302843, + "learning_rate": 4.192293885111549e-07, + "loss": 0.85766071, + "num_input_tokens_seen": 285834255, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.26782227, + "step": 13246, + "time_per_iteration": 2.627763032913208 + }, + { + "auxiliary_loss_clip": 0.01268426, + "auxiliary_loss_mlp": 0.00212958, + "balance_loss_clip": 1.04013824, + "balance_loss_mlp": 0.18595758, + "epoch": 0.7964527280925898, + "flos": 25184188828800.0, + "grad_norm": 36.689555643999014, + "language_loss": 0.75735408, + "learning_rate": 4.1899083034555007e-07, + "loss": 0.77216792, + "num_input_tokens_seen": 285853540, + "router_z_loss_clip": 2.28808594, + "router_z_loss_mlp": 0.27001953, + "step": 13247, + "time_per_iteration": 2.672680377960205 + }, + { + "auxiliary_loss_clip": 0.0125334, + "auxiliary_loss_mlp": 0.00218061, + "balance_loss_clip": 1.03584433, + "balance_loss_mlp": 0.19426635, + "epoch": 0.7965128513452577, + "flos": 27015884928000.0, + "grad_norm": 5.99808075913781, + "language_loss": 0.81285286, + "learning_rate": 4.1875233213197123e-07, + "loss": 0.82756686, + "num_input_tokens_seen": 285872705, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.23791504, + "step": 13248, + "time_per_iteration": 2.6568360328674316 + }, + { + "auxiliary_loss_clip": 0.01252524, + "auxiliary_loss_mlp": 0.00204968, + "balance_loss_clip": 1.02921319, + "balance_loss_mlp": 0.17904013, + "epoch": 0.7965729745979258, + "flos": 24419578993920.0, + "grad_norm": 28.348975452962573, + "language_loss": 0.86089468, + "learning_rate": 4.1851389387946255e-07, + "loss": 0.87546962, + "num_input_tokens_seen": 285890290, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.25891113, + "step": 13249, + "time_per_iteration": 2.669780731201172 + }, + { + "auxiliary_loss_clip": 0.01241924, + "auxiliary_loss_mlp": 0.00199298, + "balance_loss_clip": 1.02697301, + "balance_loss_mlp": 0.17725602, + "epoch": 0.7966330978505937, + "flos": 18840389978880.0, + "grad_norm": 29.443996353918894, + "language_loss": 0.71149516, + "learning_rate": 4.1827551559706674e-07, + "loss": 0.72590739, + "num_input_tokens_seen": 285909190, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.22033691, + "step": 13250, + "time_per_iteration": 2.598623275756836 + }, + { + "auxiliary_loss_clip": 0.01232331, + "auxiliary_loss_mlp": 0.00201876, + "balance_loss_clip": 1.01433635, + "balance_loss_mlp": 0.17730746, + "epoch": 0.7966932211032617, + "flos": 13152319862400.0, + "grad_norm": 5971.878748756533, + "language_loss": 0.85414362, + "learning_rate": 4.180371972938206e-07, + "loss": 0.86848569, + "num_input_tokens_seen": 285927570, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.24572754, + "step": 13251, + "time_per_iteration": 2.6243185997009277 + }, + { + "auxiliary_loss_clip": 0.01256545, + "auxiliary_loss_mlp": 0.00224979, + "balance_loss_clip": 1.03550673, + "balance_loss_mlp": 0.20006457, + "epoch": 0.7967533443559297, + "flos": 23949760078080.0, + "grad_norm": 18.410349546700377, + "language_loss": 0.82523894, + "learning_rate": 4.177989389787624e-07, + "loss": 0.84005421, + "num_input_tokens_seen": 285945810, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.24938965, + "step": 13252, + "time_per_iteration": 2.714676856994629 + }, + { + "auxiliary_loss_clip": 0.01238074, + "auxiliary_loss_mlp": 0.00224177, + "balance_loss_clip": 1.02136123, + "balance_loss_mlp": 0.19975097, + "epoch": 0.7968134676085976, + "flos": 30368791964160.0, + "grad_norm": 92.33062904126038, + "language_loss": 0.75979692, + "learning_rate": 4.175607406609278e-07, + "loss": 0.77441943, + "num_input_tokens_seen": 285964235, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.24401855, + "step": 13253, + "time_per_iteration": 2.788999080657959 + }, + { + "auxiliary_loss_clip": 0.01260426, + "auxiliary_loss_mlp": 0.00236851, + "balance_loss_clip": 1.03470922, + "balance_loss_mlp": 0.210053, + "epoch": 0.7968735908612656, + "flos": 23075048079360.0, + "grad_norm": 7.027724365399763, + "language_loss": 0.74032688, + "learning_rate": 4.1732260234934767e-07, + "loss": 0.75529963, + "num_input_tokens_seen": 285983710, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.26806641, + "step": 13254, + "time_per_iteration": 2.6461777687072754 + }, + { + "auxiliary_loss_clip": 0.01249116, + "auxiliary_loss_mlp": 0.00228008, + "balance_loss_clip": 1.03081238, + "balance_loss_mlp": 0.20318857, + "epoch": 0.7969337141139335, + "flos": 23582250074880.0, + "grad_norm": 13.999731745913033, + "language_loss": 0.7729373, + "learning_rate": 4.1708452405305314e-07, + "loss": 0.78770858, + "num_input_tokens_seen": 286003425, + "router_z_loss_clip": 2.18261719, + "router_z_loss_mlp": 0.24829102, + "step": 13255, + "time_per_iteration": 2.708155393600464 + }, + { + "auxiliary_loss_clip": 0.01240576, + "auxiliary_loss_mlp": 0.00221637, + "balance_loss_clip": 1.027596, + "balance_loss_mlp": 0.19692516, + "epoch": 0.7969938373666016, + "flos": 19755860935680.0, + "grad_norm": 3.6896892666756016, + "language_loss": 0.86227167, + "learning_rate": 4.168465057810733e-07, + "loss": 0.87689376, + "num_input_tokens_seen": 286020130, + "router_z_loss_clip": 2.12792969, + "router_z_loss_mlp": 0.24731445, + "step": 13256, + "time_per_iteration": 2.6654441356658936 + }, + { + "auxiliary_loss_clip": 0.01246149, + "auxiliary_loss_mlp": 0.00217313, + "balance_loss_clip": 1.02855635, + "balance_loss_mlp": 0.19407919, + "epoch": 0.7970539606192695, + "flos": 24134089697280.0, + "grad_norm": 12.955025671501444, + "language_loss": 0.73218632, + "learning_rate": 4.166085475424315e-07, + "loss": 0.74682093, + "num_input_tokens_seen": 286040230, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.2322998, + "step": 13257, + "time_per_iteration": 2.72440767288208 + }, + { + "auxiliary_loss_clip": 0.01262098, + "auxiliary_loss_mlp": 0.00219857, + "balance_loss_clip": 1.04333007, + "balance_loss_mlp": 0.19577712, + "epoch": 0.7971140838719375, + "flos": 17968622895360.0, + "grad_norm": 4.45887707512312, + "language_loss": 0.81917036, + "learning_rate": 4.163706493461523e-07, + "loss": 0.83398998, + "num_input_tokens_seen": 286059475, + "router_z_loss_clip": 2.18652344, + "router_z_loss_mlp": 0.24060059, + "step": 13258, + "time_per_iteration": 2.637598991394043 + }, + { + "auxiliary_loss_clip": 0.0126559, + "auxiliary_loss_mlp": 0.00225018, + "balance_loss_clip": 1.03946364, + "balance_loss_mlp": 0.20015061, + "epoch": 0.7971742071246054, + "flos": 19169547235200.0, + "grad_norm": 57.18527935281728, + "language_loss": 0.78919804, + "learning_rate": 4.1613281120125655e-07, + "loss": 0.80410415, + "num_input_tokens_seen": 286077820, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.2487793, + "step": 13259, + "time_per_iteration": 2.6689794063568115 + }, + { + "auxiliary_loss_clip": 0.01239908, + "auxiliary_loss_mlp": 0.00210493, + "balance_loss_clip": 1.02539289, + "balance_loss_mlp": 0.18660372, + "epoch": 0.7972343303772734, + "flos": 27125951178240.0, + "grad_norm": 308.72907683486204, + "language_loss": 0.79128015, + "learning_rate": 4.158950331167641e-07, + "loss": 0.80578417, + "num_input_tokens_seen": 286097285, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.23901367, + "step": 13260, + "time_per_iteration": 2.681950092315674 + }, + { + "auxiliary_loss_clip": 0.01231801, + "auxiliary_loss_mlp": 0.00220988, + "balance_loss_clip": 1.01819932, + "balance_loss_mlp": 0.19643098, + "epoch": 0.7972944536299413, + "flos": 20996646393600.0, + "grad_norm": 15.39389867487627, + "language_loss": 0.85922754, + "learning_rate": 4.1565731510169065e-07, + "loss": 0.8737554, + "num_input_tokens_seen": 286116000, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.24560547, + "step": 13261, + "time_per_iteration": 2.6668612957000732 + }, + { + "auxiliary_loss_clip": 0.01230568, + "auxiliary_loss_mlp": 0.00184488, + "balance_loss_clip": 1.01966941, + "balance_loss_mlp": 0.16100393, + "epoch": 0.7973545768826094, + "flos": 21580015178880.0, + "grad_norm": 37.78197553372923, + "language_loss": 0.82144725, + "learning_rate": 4.154196571650501e-07, + "loss": 0.83559787, + "num_input_tokens_seen": 286135110, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.23486328, + "step": 13262, + "time_per_iteration": 2.6787497997283936 + }, + { + "auxiliary_loss_clip": 0.01267038, + "auxiliary_loss_mlp": 0.00204518, + "balance_loss_clip": 1.04288781, + "balance_loss_mlp": 0.17977056, + "epoch": 0.7974147001352773, + "flos": 20558536208640.0, + "grad_norm": 3.2947057289004933, + "language_loss": 0.82142001, + "learning_rate": 4.1518205931585524e-07, + "loss": 0.83613563, + "num_input_tokens_seen": 286152835, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.24768066, + "step": 13263, + "time_per_iteration": 2.663862705230713 + }, + { + "auxiliary_loss_clip": 0.01264221, + "auxiliary_loss_mlp": 0.00214448, + "balance_loss_clip": 1.03878093, + "balance_loss_mlp": 0.18884197, + "epoch": 0.7974748233879453, + "flos": 20996790048000.0, + "grad_norm": 13.556732671349918, + "language_loss": 0.80818999, + "learning_rate": 4.149445215631153e-07, + "loss": 0.82297665, + "num_input_tokens_seen": 286171785, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.25610352, + "step": 13264, + "time_per_iteration": 2.642371654510498 + }, + { + "auxiliary_loss_clip": 0.01231332, + "auxiliary_loss_mlp": 0.00217145, + "balance_loss_clip": 1.01769638, + "balance_loss_mlp": 0.19383919, + "epoch": 0.7975349466406133, + "flos": 22565188477440.0, + "grad_norm": 3.174423683047426, + "language_loss": 0.83249474, + "learning_rate": 4.1470704391583776e-07, + "loss": 0.8469795, + "num_input_tokens_seen": 286190420, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.2331543, + "step": 13265, + "time_per_iteration": 2.6306214332580566 + }, + { + "auxiliary_loss_clip": 0.01257571, + "auxiliary_loss_mlp": 0.00205521, + "balance_loss_clip": 1.03494143, + "balance_loss_mlp": 0.17946199, + "epoch": 0.7975950698932812, + "flos": 21689542725120.0, + "grad_norm": 10.192205676234801, + "language_loss": 0.83780837, + "learning_rate": 4.144696263830285e-07, + "loss": 0.85243928, + "num_input_tokens_seen": 286210105, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.26037598, + "step": 13266, + "time_per_iteration": 2.6510586738586426 + }, + { + "auxiliary_loss_clip": 0.01227841, + "auxiliary_loss_mlp": 0.00201751, + "balance_loss_clip": 1.0214808, + "balance_loss_mlp": 0.17975655, + "epoch": 0.7976551931459492, + "flos": 19604568850560.0, + "grad_norm": 6.581833675913213, + "language_loss": 0.91058421, + "learning_rate": 4.1423226897369015e-07, + "loss": 0.92488015, + "num_input_tokens_seen": 286228180, + "router_z_loss_clip": 2.06152344, + "router_z_loss_mlp": 0.2199707, + "step": 13267, + "time_per_iteration": 2.601616859436035 + }, + { + "auxiliary_loss_clip": 0.0124195, + "auxiliary_loss_mlp": 0.00209478, + "balance_loss_clip": 1.02785587, + "balance_loss_mlp": 0.18545707, + "epoch": 0.7977153163986171, + "flos": 21687603390720.0, + "grad_norm": 5.085661535240184, + "language_loss": 0.85334927, + "learning_rate": 4.139949716968223e-07, + "loss": 0.86786354, + "num_input_tokens_seen": 286247305, + "router_z_loss_clip": 2.14550781, + "router_z_loss_mlp": 0.23999023, + "step": 13268, + "time_per_iteration": 2.650585651397705 + }, + { + "auxiliary_loss_clip": 0.01243569, + "auxiliary_loss_mlp": 0.00207279, + "balance_loss_clip": 1.0267936, + "balance_loss_mlp": 0.18319836, + "epoch": 0.7977754396512852, + "flos": 23476780765440.0, + "grad_norm": 18.335315957361097, + "language_loss": 0.85906976, + "learning_rate": 4.1375773456142403e-07, + "loss": 0.87357819, + "num_input_tokens_seen": 286268145, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.2409668, + "step": 13269, + "time_per_iteration": 4.156991243362427 + }, + { + "auxiliary_loss_clip": 0.01224028, + "auxiliary_loss_mlp": 0.00202644, + "balance_loss_clip": 1.01885498, + "balance_loss_mlp": 0.18124625, + "epoch": 0.7978355629039531, + "flos": 22382223575040.0, + "grad_norm": 10.933165859344685, + "language_loss": 0.89740264, + "learning_rate": 4.135205575764922e-07, + "loss": 0.91166937, + "num_input_tokens_seen": 286286775, + "router_z_loss_clip": 2.05273438, + "router_z_loss_mlp": 0.21411133, + "step": 13270, + "time_per_iteration": 2.692385196685791 + }, + { + "auxiliary_loss_clip": 0.01241541, + "auxiliary_loss_mlp": 0.00212751, + "balance_loss_clip": 1.02469683, + "balance_loss_mlp": 0.18673998, + "epoch": 0.7978956861566211, + "flos": 20266331068800.0, + "grad_norm": 14.904026597244542, + "language_loss": 0.68777651, + "learning_rate": 4.1328344075101905e-07, + "loss": 0.70231944, + "num_input_tokens_seen": 286305590, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.2598877, + "step": 13271, + "time_per_iteration": 4.041945695877075 + }, + { + "auxiliary_loss_clip": 0.01261708, + "auxiliary_loss_mlp": 0.00220989, + "balance_loss_clip": 1.0445298, + "balance_loss_mlp": 0.19770767, + "epoch": 0.797955809409289, + "flos": 28112417366400.0, + "grad_norm": 4.544745822813709, + "language_loss": 0.79141891, + "learning_rate": 4.130463840939975e-07, + "loss": 0.80624592, + "num_input_tokens_seen": 286328050, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.23291016, + "step": 13272, + "time_per_iteration": 2.7085559368133545 + }, + { + "auxiliary_loss_clip": 0.01241733, + "auxiliary_loss_mlp": 0.00193647, + "balance_loss_clip": 1.02284515, + "balance_loss_mlp": 0.1694233, + "epoch": 0.798015932661957, + "flos": 15559591495680.0, + "grad_norm": 23.355581968652665, + "language_loss": 0.79204094, + "learning_rate": 4.128093876144161e-07, + "loss": 0.80639476, + "num_input_tokens_seen": 286345265, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.24230957, + "step": 13273, + "time_per_iteration": 2.608564853668213 + }, + { + "auxiliary_loss_clip": 0.01259425, + "auxiliary_loss_mlp": 0.00197025, + "balance_loss_clip": 1.03751481, + "balance_loss_mlp": 0.17158604, + "epoch": 0.7980760559146249, + "flos": 23951196622080.0, + "grad_norm": 7.885431739925885, + "language_loss": 0.85231531, + "learning_rate": 4.1257245132126117e-07, + "loss": 0.86687976, + "num_input_tokens_seen": 286364465, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.25463867, + "step": 13274, + "time_per_iteration": 2.7092010974884033 + }, + { + "auxiliary_loss_clip": 0.01226261, + "auxiliary_loss_mlp": 0.00200304, + "balance_loss_clip": 1.01959479, + "balance_loss_mlp": 0.17828585, + "epoch": 0.798136179167293, + "flos": 28038082170240.0, + "grad_norm": 11.377340524631531, + "language_loss": 0.83395571, + "learning_rate": 4.12335575223518e-07, + "loss": 0.8482213, + "num_input_tokens_seen": 286385565, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.22021484, + "step": 13275, + "time_per_iteration": 2.676877021789551 + }, + { + "auxiliary_loss_clip": 0.01249595, + "auxiliary_loss_mlp": 0.0021799, + "balance_loss_clip": 1.02726173, + "balance_loss_mlp": 0.19083381, + "epoch": 0.7981963024199609, + "flos": 35984538046080.0, + "grad_norm": 23.762643393299246, + "language_loss": 0.73619854, + "learning_rate": 4.1209875933016877e-07, + "loss": 0.7508744, + "num_input_tokens_seen": 286403950, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.27160645, + "step": 13276, + "time_per_iteration": 4.21284031867981 + }, + { + "auxiliary_loss_clip": 0.01230608, + "auxiliary_loss_mlp": 0.00210285, + "balance_loss_clip": 1.02312124, + "balance_loss_mlp": 0.18787396, + "epoch": 0.7982564256726289, + "flos": 25884914325120.0, + "grad_norm": 7.656751525461095, + "language_loss": 0.6824652, + "learning_rate": 4.118620036501945e-07, + "loss": 0.69687414, + "num_input_tokens_seen": 286426160, + "router_z_loss_clip": 2.07617188, + "router_z_loss_mlp": 0.22399902, + "step": 13277, + "time_per_iteration": 2.6657140254974365 + }, + { + "auxiliary_loss_clip": 0.01260605, + "auxiliary_loss_mlp": 0.00198557, + "balance_loss_clip": 1.03984082, + "balance_loss_mlp": 0.17501345, + "epoch": 0.7983165489252969, + "flos": 25739152934400.0, + "grad_norm": 88.33504493823, + "language_loss": 0.87225646, + "learning_rate": 4.1162530819257227e-07, + "loss": 0.88684809, + "num_input_tokens_seen": 286446610, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.23547363, + "step": 13278, + "time_per_iteration": 2.6657557487487793 + }, + { + "auxiliary_loss_clip": 0.01251856, + "auxiliary_loss_mlp": 0.00207872, + "balance_loss_clip": 1.02739692, + "balance_loss_mlp": 0.18178937, + "epoch": 0.7983766721779648, + "flos": 21908202768000.0, + "grad_norm": 8.684533412749031, + "language_loss": 0.74095225, + "learning_rate": 4.113886729662768e-07, + "loss": 0.75554955, + "num_input_tokens_seen": 286465460, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.26098633, + "step": 13279, + "time_per_iteration": 2.656404972076416 + }, + { + "auxiliary_loss_clip": 0.01213158, + "auxiliary_loss_mlp": 0.00196755, + "balance_loss_clip": 1.00886178, + "balance_loss_mlp": 0.17514281, + "epoch": 0.7984367954306328, + "flos": 29347420734720.0, + "grad_norm": 18.428940427105243, + "language_loss": 0.76573193, + "learning_rate": 4.111520979802825e-07, + "loss": 0.77983105, + "num_input_tokens_seen": 286485720, + "router_z_loss_clip": 2.04296875, + "router_z_loss_mlp": 0.21618652, + "step": 13280, + "time_per_iteration": 2.8659298419952393 + }, + { + "auxiliary_loss_clip": 0.01243522, + "auxiliary_loss_mlp": 0.00199001, + "balance_loss_clip": 1.02312231, + "balance_loss_mlp": 0.17442033, + "epoch": 0.7984969186833007, + "flos": 31357772104320.0, + "grad_norm": 515.4138826758893, + "language_loss": 0.72482026, + "learning_rate": 4.1091558324355955e-07, + "loss": 0.73924541, + "num_input_tokens_seen": 286507465, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.24572754, + "step": 13281, + "time_per_iteration": 4.318729877471924 + }, + { + "auxiliary_loss_clip": 0.01258966, + "auxiliary_loss_mlp": 0.00228273, + "balance_loss_clip": 1.03815639, + "balance_loss_mlp": 0.20226142, + "epoch": 0.7985570419359688, + "flos": 24312924535680.0, + "grad_norm": 31.443577200114223, + "language_loss": 0.88542247, + "learning_rate": 4.1067912876507683e-07, + "loss": 0.90029484, + "num_input_tokens_seen": 286526345, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.26000977, + "step": 13282, + "time_per_iteration": 2.7402892112731934 + }, + { + "auxiliary_loss_clip": 0.01241226, + "auxiliary_loss_mlp": 0.00204027, + "balance_loss_clip": 1.01963496, + "balance_loss_mlp": 0.17755023, + "epoch": 0.7986171651886367, + "flos": 15742233175680.0, + "grad_norm": 3.3706505418667887, + "language_loss": 0.82110262, + "learning_rate": 4.10442734553802e-07, + "loss": 0.83555508, + "num_input_tokens_seen": 286544095, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.26501465, + "step": 13283, + "time_per_iteration": 2.6869969367980957 + }, + { + "auxiliary_loss_clip": 0.01234921, + "auxiliary_loss_mlp": 0.00199514, + "balance_loss_clip": 1.02385449, + "balance_loss_mlp": 0.17632751, + "epoch": 0.7986772884413047, + "flos": 11619401091840.0, + "grad_norm": 5.440364273001664, + "language_loss": 0.82918674, + "learning_rate": 4.102064006186967e-07, + "loss": 0.84353113, + "num_input_tokens_seen": 286560960, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.23193359, + "step": 13284, + "time_per_iteration": 2.6739907264709473 + }, + { + "auxiliary_loss_clip": 0.01236863, + "auxiliary_loss_mlp": 0.00210331, + "balance_loss_clip": 1.02268004, + "balance_loss_mlp": 0.1869894, + "epoch": 0.7987374116939726, + "flos": 22091059929600.0, + "grad_norm": 68.32600856545905, + "language_loss": 0.78582126, + "learning_rate": 4.0997012696872415e-07, + "loss": 0.80029321, + "num_input_tokens_seen": 286579865, + "router_z_loss_clip": 2.14160156, + "router_z_loss_mlp": 0.23352051, + "step": 13285, + "time_per_iteration": 2.6498446464538574 + }, + { + "auxiliary_loss_clip": 0.01247595, + "auxiliary_loss_mlp": 0.00217143, + "balance_loss_clip": 1.03112841, + "balance_loss_mlp": 0.19309838, + "epoch": 0.7987975349466406, + "flos": 17890696339200.0, + "grad_norm": 85.38747576390287, + "language_loss": 0.82599223, + "learning_rate": 4.097339136128437e-07, + "loss": 0.84063965, + "num_input_tokens_seen": 286597295, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.24047852, + "step": 13286, + "time_per_iteration": 2.616816997528076 + }, + { + "auxiliary_loss_clip": 0.01260792, + "auxiliary_loss_mlp": 0.00198883, + "balance_loss_clip": 1.04132032, + "balance_loss_mlp": 0.175482, + "epoch": 0.7988576581993085, + "flos": 19719232041600.0, + "grad_norm": 120.20844151050508, + "language_loss": 0.8500905, + "learning_rate": 4.0949776056001296e-07, + "loss": 0.8646872, + "num_input_tokens_seen": 286616270, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.23388672, + "step": 13287, + "time_per_iteration": 2.605144500732422 + }, + { + "auxiliary_loss_clip": 0.01236683, + "auxiliary_loss_mlp": 0.00211557, + "balance_loss_clip": 1.0235188, + "balance_loss_mlp": 0.18716723, + "epoch": 0.7989177814519766, + "flos": 28036358317440.0, + "grad_norm": 21.582041640416687, + "language_loss": 0.70979893, + "learning_rate": 4.092616678191863e-07, + "loss": 0.72428131, + "num_input_tokens_seen": 286638315, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.24414062, + "step": 13288, + "time_per_iteration": 2.7114150524139404 + }, + { + "auxiliary_loss_clip": 0.01241629, + "auxiliary_loss_mlp": 0.00194477, + "balance_loss_clip": 1.02550125, + "balance_loss_mlp": 0.17361543, + "epoch": 0.7989779047046445, + "flos": 28871029630080.0, + "grad_norm": 81.99177668884035, + "language_loss": 0.79208148, + "learning_rate": 4.090256353993169e-07, + "loss": 0.80644244, + "num_input_tokens_seen": 286658630, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.20874023, + "step": 13289, + "time_per_iteration": 2.666494369506836 + }, + { + "auxiliary_loss_clip": 0.01226885, + "auxiliary_loss_mlp": 0.0019104, + "balance_loss_clip": 1.01781988, + "balance_loss_mlp": 0.16714999, + "epoch": 0.7990380279573125, + "flos": 18186887888640.0, + "grad_norm": 14.443015563584334, + "language_loss": 0.73252249, + "learning_rate": 4.0878966330935506e-07, + "loss": 0.74670172, + "num_input_tokens_seen": 286676870, + "router_z_loss_clip": 2.08886719, + "router_z_loss_mlp": 0.23876953, + "step": 13290, + "time_per_iteration": 2.6194076538085938 + }, + { + "auxiliary_loss_clip": 0.01250475, + "auxiliary_loss_mlp": 0.00196182, + "balance_loss_clip": 1.0329417, + "balance_loss_mlp": 0.17322268, + "epoch": 0.7990981512099805, + "flos": 20879936127360.0, + "grad_norm": 12.189201392432244, + "language_loss": 0.79401398, + "learning_rate": 4.08553751558248e-07, + "loss": 0.8084805, + "num_input_tokens_seen": 286694300, + "router_z_loss_clip": 2.17480469, + "router_z_loss_mlp": 0.22961426, + "step": 13291, + "time_per_iteration": 2.7054336071014404 + }, + { + "auxiliary_loss_clip": 0.01237396, + "auxiliary_loss_mlp": 0.00201053, + "balance_loss_clip": 1.02459979, + "balance_loss_mlp": 0.17759246, + "epoch": 0.7991582744626484, + "flos": 26099911180800.0, + "grad_norm": 8.769158630480351, + "language_loss": 0.70491695, + "learning_rate": 4.083179001549422e-07, + "loss": 0.71930146, + "num_input_tokens_seen": 286714545, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.23461914, + "step": 13292, + "time_per_iteration": 2.6829683780670166 + }, + { + "auxiliary_loss_clip": 0.0124162, + "auxiliary_loss_mlp": 0.00227892, + "balance_loss_clip": 1.02610767, + "balance_loss_mlp": 0.20360877, + "epoch": 0.7992183977153164, + "flos": 35295843605760.0, + "grad_norm": 9.850675043188652, + "language_loss": 0.62243438, + "learning_rate": 4.0808210910838105e-07, + "loss": 0.63712949, + "num_input_tokens_seen": 286734525, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.24291992, + "step": 13293, + "time_per_iteration": 2.7421510219573975 + }, + { + "auxiliary_loss_clip": 0.01246654, + "auxiliary_loss_mlp": 0.00206435, + "balance_loss_clip": 1.02940965, + "balance_loss_mlp": 0.18229511, + "epoch": 0.7992785209679844, + "flos": 51853426577280.0, + "grad_norm": 10.660722634331274, + "language_loss": 0.82678676, + "learning_rate": 4.0784637842750704e-07, + "loss": 0.84131765, + "num_input_tokens_seen": 286753430, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.24157715, + "step": 13294, + "time_per_iteration": 2.8733322620391846 + }, + { + "auxiliary_loss_clip": 0.01260881, + "auxiliary_loss_mlp": 0.00218382, + "balance_loss_clip": 1.03981543, + "balance_loss_mlp": 0.19306178, + "epoch": 0.7993386442206524, + "flos": 22565116650240.0, + "grad_norm": 87.16052505754796, + "language_loss": 0.81738609, + "learning_rate": 4.0761070812125675e-07, + "loss": 0.83217871, + "num_input_tokens_seen": 286771915, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.2532959, + "step": 13295, + "time_per_iteration": 2.6623427867889404 + }, + { + "auxiliary_loss_clip": 0.01244481, + "auxiliary_loss_mlp": 0.00220041, + "balance_loss_clip": 1.03223658, + "balance_loss_mlp": 0.19850026, + "epoch": 0.7993987674733203, + "flos": 18800277465600.0, + "grad_norm": 12.605196911419164, + "language_loss": 0.84981537, + "learning_rate": 4.0737509819856797e-07, + "loss": 0.86446059, + "num_input_tokens_seen": 286789835, + "router_z_loss_clip": 2.12207031, + "router_z_loss_mlp": 0.2154541, + "step": 13296, + "time_per_iteration": 2.6478521823883057 + }, + { + "auxiliary_loss_clip": 0.01165275, + "auxiliary_loss_mlp": 0.00097526, + "balance_loss_clip": 1.02666259, + "balance_loss_mlp": 0.08922907, + "epoch": 0.7994588907259883, + "flos": 69421720394880.0, + "grad_norm": 0.6826416606507184, + "language_loss": 0.60390925, + "learning_rate": 4.0713954866837573e-07, + "loss": 0.61653721, + "num_input_tokens_seen": 286855580, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.08300781, + "step": 13297, + "time_per_iteration": 3.286343812942505 + }, + { + "auxiliary_loss_clip": 0.01234687, + "auxiliary_loss_mlp": 0.00198198, + "balance_loss_clip": 1.02513433, + "balance_loss_mlp": 0.17621566, + "epoch": 0.7995190139786562, + "flos": 13480327883520.0, + "grad_norm": 5.409362748078585, + "language_loss": 0.80898333, + "learning_rate": 4.0690405953961073e-07, + "loss": 0.82331216, + "num_input_tokens_seen": 286874360, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.21972656, + "step": 13298, + "time_per_iteration": 2.6769845485687256 + }, + { + "auxiliary_loss_clip": 0.01270649, + "auxiliary_loss_mlp": 0.00217794, + "balance_loss_clip": 1.04625058, + "balance_loss_mlp": 0.19206814, + "epoch": 0.7995791372313242, + "flos": 21652842003840.0, + "grad_norm": 6.023256678626334, + "language_loss": 0.83364415, + "learning_rate": 4.066686308212037e-07, + "loss": 0.84852862, + "num_input_tokens_seen": 286891950, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.25720215, + "step": 13299, + "time_per_iteration": 2.781677722930908 + }, + { + "auxiliary_loss_clip": 0.01246528, + "auxiliary_loss_mlp": 0.00216899, + "balance_loss_clip": 1.03461349, + "balance_loss_mlp": 0.1943799, + "epoch": 0.7996392604839921, + "flos": 26068130622720.0, + "grad_norm": 6.344094059529175, + "language_loss": 0.84375632, + "learning_rate": 4.064332625220828e-07, + "loss": 0.85839057, + "num_input_tokens_seen": 286911725, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.2253418, + "step": 13300, + "time_per_iteration": 2.7108616828918457 + }, + { + "auxiliary_loss_clip": 0.01256214, + "auxiliary_loss_mlp": 0.0022044, + "balance_loss_clip": 1.03594065, + "balance_loss_mlp": 0.19576378, + "epoch": 0.7996993837366602, + "flos": 24606889441920.0, + "grad_norm": 23.92639154449928, + "language_loss": 0.72664154, + "learning_rate": 4.0619795465117115e-07, + "loss": 0.74140811, + "num_input_tokens_seen": 286931400, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.24707031, + "step": 13301, + "time_per_iteration": 2.701855182647705 + }, + { + "auxiliary_loss_clip": 0.01261743, + "auxiliary_loss_mlp": 0.0024257, + "balance_loss_clip": 1.04379177, + "balance_loss_mlp": 0.2154264, + "epoch": 0.7997595069893281, + "flos": 20992049452800.0, + "grad_norm": 18.535204670228858, + "language_loss": 0.79499316, + "learning_rate": 4.059627072173928e-07, + "loss": 0.81003624, + "num_input_tokens_seen": 286949795, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.27172852, + "step": 13302, + "time_per_iteration": 2.6812937259674072 + }, + { + "auxiliary_loss_clip": 0.01256816, + "auxiliary_loss_mlp": 0.00238105, + "balance_loss_clip": 1.03721011, + "balance_loss_mlp": 0.21327372, + "epoch": 0.7998196302419961, + "flos": 24426510318720.0, + "grad_norm": 17.022303540396287, + "language_loss": 0.91394842, + "learning_rate": 4.057275202296684e-07, + "loss": 0.92889762, + "num_input_tokens_seen": 286968805, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.24829102, + "step": 13303, + "time_per_iteration": 2.699009895324707 + }, + { + "auxiliary_loss_clip": 0.0122414, + "auxiliary_loss_mlp": 0.00212711, + "balance_loss_clip": 1.01629722, + "balance_loss_mlp": 0.1913729, + "epoch": 0.7998797534946641, + "flos": 30264651457920.0, + "grad_norm": 3.110660103770848, + "language_loss": 0.67668688, + "learning_rate": 4.054923936969166e-07, + "loss": 0.69105542, + "num_input_tokens_seen": 286990235, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.21350098, + "step": 13304, + "time_per_iteration": 2.7082479000091553 + }, + { + "auxiliary_loss_clip": 0.01233327, + "auxiliary_loss_mlp": 0.00207822, + "balance_loss_clip": 1.01777673, + "balance_loss_mlp": 0.18467206, + "epoch": 0.799939876747332, + "flos": 23513984277120.0, + "grad_norm": 38.885444674562834, + "language_loss": 0.76425844, + "learning_rate": 4.0525732762805265e-07, + "loss": 0.77866995, + "num_input_tokens_seen": 287011060, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.23156738, + "step": 13305, + "time_per_iteration": 2.6777825355529785 + }, + { + "auxiliary_loss_clip": 0.01236867, + "auxiliary_loss_mlp": 0.0023555, + "balance_loss_clip": 1.02734983, + "balance_loss_mlp": 0.21251842, + "epoch": 0.8, + "flos": 19318109886720.0, + "grad_norm": 6.234388367366129, + "language_loss": 0.76192635, + "learning_rate": 4.0502232203199107e-07, + "loss": 0.77665055, + "num_input_tokens_seen": 287029215, + "router_z_loss_clip": 2.09472656, + "router_z_loss_mlp": 0.23022461, + "step": 13306, + "time_per_iteration": 2.6380460262298584 + }, + { + "auxiliary_loss_clip": 0.01248651, + "auxiliary_loss_mlp": 0.00236471, + "balance_loss_clip": 1.03186142, + "balance_loss_mlp": 0.21171147, + "epoch": 0.800060123252668, + "flos": 32412432263040.0, + "grad_norm": 4725.8635618591225, + "language_loss": 0.76579297, + "learning_rate": 4.0478737691764286e-07, + "loss": 0.78064418, + "num_input_tokens_seen": 287050855, + "router_z_loss_clip": 2.16894531, + "router_z_loss_mlp": 0.24731445, + "step": 13307, + "time_per_iteration": 2.736438274383545 + }, + { + "auxiliary_loss_clip": 0.01236823, + "auxiliary_loss_mlp": 0.00232907, + "balance_loss_clip": 1.02257633, + "balance_loss_mlp": 0.2094464, + "epoch": 0.800120246505336, + "flos": 20010611168640.0, + "grad_norm": 12.619989079260478, + "language_loss": 0.85518718, + "learning_rate": 4.0455249229391677e-07, + "loss": 0.86988443, + "num_input_tokens_seen": 287069915, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.23461914, + "step": 13308, + "time_per_iteration": 2.633929491043091 + }, + { + "auxiliary_loss_clip": 0.01264141, + "auxiliary_loss_mlp": 0.00229141, + "balance_loss_clip": 1.03814209, + "balance_loss_mlp": 0.20397568, + "epoch": 0.8001803697580039, + "flos": 31868278151040.0, + "grad_norm": 5.514510663790233, + "language_loss": 0.84383506, + "learning_rate": 4.0431766816972e-07, + "loss": 0.85876787, + "num_input_tokens_seen": 287091450, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.25158691, + "step": 13309, + "time_per_iteration": 2.7503931522369385 + }, + { + "auxiliary_loss_clip": 0.01169723, + "auxiliary_loss_mlp": 0.00147439, + "balance_loss_clip": 1.02973199, + "balance_loss_mlp": 0.13866562, + "epoch": 0.8002404930106719, + "flos": 63392066916480.0, + "grad_norm": 0.8903896692886496, + "language_loss": 0.63918954, + "learning_rate": 4.040829045539571e-07, + "loss": 0.65236115, + "num_input_tokens_seen": 287148365, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.08789062, + "step": 13310, + "time_per_iteration": 3.0954532623291016 + }, + { + "auxiliary_loss_clip": 0.01250569, + "auxiliary_loss_mlp": 0.00239907, + "balance_loss_clip": 1.03254592, + "balance_loss_mlp": 0.21587458, + "epoch": 0.8003006162633398, + "flos": 27855476403840.0, + "grad_norm": 12.37433616509396, + "language_loss": 0.91808617, + "learning_rate": 4.0384820145553156e-07, + "loss": 0.93299097, + "num_input_tokens_seen": 287168280, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.24023438, + "step": 13311, + "time_per_iteration": 4.143649339675903 + }, + { + "auxiliary_loss_clip": 0.01251982, + "auxiliary_loss_mlp": 0.00217726, + "balance_loss_clip": 1.03278494, + "balance_loss_mlp": 0.1937415, + "epoch": 0.8003607395160078, + "flos": 18223337214720.0, + "grad_norm": 14.19547829874809, + "language_loss": 0.74998879, + "learning_rate": 4.0361355888334116e-07, + "loss": 0.76468581, + "num_input_tokens_seen": 287185980, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.23986816, + "step": 13312, + "time_per_iteration": 2.5976626873016357 + }, + { + "auxiliary_loss_clip": 0.01260148, + "auxiliary_loss_mlp": 0.00236474, + "balance_loss_clip": 1.03470135, + "balance_loss_mlp": 0.21095164, + "epoch": 0.8004208627686757, + "flos": 20886975192960.0, + "grad_norm": 19.623763667929808, + "language_loss": 0.82742548, + "learning_rate": 4.033789768462843e-07, + "loss": 0.84239173, + "num_input_tokens_seen": 287203875, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.25549316, + "step": 13313, + "time_per_iteration": 4.166054725646973 + }, + { + "auxiliary_loss_clip": 0.01246592, + "auxiliary_loss_mlp": 0.002576, + "balance_loss_clip": 1.02906442, + "balance_loss_mlp": 0.23251849, + "epoch": 0.8004809860213438, + "flos": 26436143416320.0, + "grad_norm": 10.741185378911528, + "language_loss": 0.81799328, + "learning_rate": 4.031444553532575e-07, + "loss": 0.83303523, + "num_input_tokens_seen": 287226445, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.25085449, + "step": 13314, + "time_per_iteration": 2.8518967628479004 + }, + { + "auxiliary_loss_clip": 0.01169184, + "auxiliary_loss_mlp": 0.00067298, + "balance_loss_clip": 1.03642321, + "balance_loss_mlp": 0.0589054, + "epoch": 0.8005411092740117, + "flos": 63648612829440.0, + "grad_norm": 0.7505227835315214, + "language_loss": 0.53426361, + "learning_rate": 4.029099944131522e-07, + "loss": 0.54662842, + "num_input_tokens_seen": 287286240, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.08398438, + "step": 13315, + "time_per_iteration": 3.0841362476348877 + }, + { + "auxiliary_loss_clip": 0.01239473, + "auxiliary_loss_mlp": 0.00233234, + "balance_loss_clip": 1.02565324, + "balance_loss_mlp": 0.20921339, + "epoch": 0.8006012325266797, + "flos": 36138056774400.0, + "grad_norm": 139.2078915974386, + "language_loss": 0.77404112, + "learning_rate": 4.026755940348603e-07, + "loss": 0.78876811, + "num_input_tokens_seen": 287310265, + "router_z_loss_clip": 2.13769531, + "router_z_loss_mlp": 0.24047852, + "step": 13316, + "time_per_iteration": 2.8092644214630127 + }, + { + "auxiliary_loss_clip": 0.01262383, + "auxiliary_loss_mlp": 0.00234517, + "balance_loss_clip": 1.04227352, + "balance_loss_mlp": 0.21110471, + "epoch": 0.8006613557793477, + "flos": 33838947970560.0, + "grad_norm": 9.200924746423478, + "language_loss": 0.73046958, + "learning_rate": 4.024412542272706e-07, + "loss": 0.74543858, + "num_input_tokens_seen": 287331610, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.23413086, + "step": 13317, + "time_per_iteration": 2.8291587829589844 + }, + { + "auxiliary_loss_clip": 0.01156032, + "auxiliary_loss_mlp": 0.00092233, + "balance_loss_clip": 1.02565026, + "balance_loss_mlp": 0.08264848, + "epoch": 0.8007214790320156, + "flos": 67348310699520.0, + "grad_norm": 37.0108842143121, + "language_loss": 0.57998139, + "learning_rate": 4.0220697499926783e-07, + "loss": 0.59246403, + "num_input_tokens_seen": 287394795, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.09570312, + "step": 13318, + "time_per_iteration": 4.730731010437012 + }, + { + "auxiliary_loss_clip": 0.01248159, + "auxiliary_loss_mlp": 0.00241581, + "balance_loss_clip": 1.03264701, + "balance_loss_mlp": 0.21736917, + "epoch": 0.8007816022846836, + "flos": 23185653033600.0, + "grad_norm": 10.644071373184, + "language_loss": 0.72693861, + "learning_rate": 4.019727563597366e-07, + "loss": 0.74183601, + "num_input_tokens_seen": 287414595, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.24243164, + "step": 13319, + "time_per_iteration": 2.7234225273132324 + }, + { + "auxiliary_loss_clip": 0.01272517, + "auxiliary_loss_mlp": 0.00244091, + "balance_loss_clip": 1.04718542, + "balance_loss_mlp": 0.21707842, + "epoch": 0.8008417255373516, + "flos": 21981388728960.0, + "grad_norm": 5.50458695864645, + "language_loss": 0.82008284, + "learning_rate": 4.0173859831755873e-07, + "loss": 0.83524895, + "num_input_tokens_seen": 287434395, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.27026367, + "step": 13320, + "time_per_iteration": 2.6549532413482666 + }, + { + "auxiliary_loss_clip": 0.01247052, + "auxiliary_loss_mlp": 0.00246568, + "balance_loss_clip": 1.02928138, + "balance_loss_mlp": 0.21984088, + "epoch": 0.8009018487900196, + "flos": 16727334647040.0, + "grad_norm": 9.865385512779099, + "language_loss": 0.87542582, + "learning_rate": 4.015045008816138e-07, + "loss": 0.89036202, + "num_input_tokens_seen": 287450590, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.26708984, + "step": 13321, + "time_per_iteration": 2.658527374267578 + }, + { + "auxiliary_loss_clip": 0.01226318, + "auxiliary_loss_mlp": 0.00218291, + "balance_loss_clip": 1.01878607, + "balance_loss_mlp": 0.19400841, + "epoch": 0.8009619720426875, + "flos": 20813609664000.0, + "grad_norm": 13.794431951452433, + "language_loss": 0.75439554, + "learning_rate": 4.0127046406077825e-07, + "loss": 0.76884162, + "num_input_tokens_seen": 287468455, + "router_z_loss_clip": 2.07617188, + "router_z_loss_mlp": 0.24279785, + "step": 13322, + "time_per_iteration": 4.093682527542114 + }, + { + "auxiliary_loss_clip": 0.01255104, + "auxiliary_loss_mlp": 0.00239711, + "balance_loss_clip": 1.03626621, + "balance_loss_mlp": 0.21507019, + "epoch": 0.8010220952953555, + "flos": 17931096161280.0, + "grad_norm": 6.0271863287017435, + "language_loss": 0.85842311, + "learning_rate": 4.010364878639265e-07, + "loss": 0.87337124, + "num_input_tokens_seen": 287486485, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.24645996, + "step": 13323, + "time_per_iteration": 2.638410806655884 + }, + { + "auxiliary_loss_clip": 0.01254425, + "auxiliary_loss_mlp": 0.00232007, + "balance_loss_clip": 1.0309484, + "balance_loss_mlp": 0.20663974, + "epoch": 0.8010822185480234, + "flos": 24572235795840.0, + "grad_norm": 30.850466085548575, + "language_loss": 0.81758034, + "learning_rate": 4.00802572299932e-07, + "loss": 0.83244467, + "num_input_tokens_seen": 287503940, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.25378418, + "step": 13324, + "time_per_iteration": 2.7644364833831787 + }, + { + "auxiliary_loss_clip": 0.01271601, + "auxiliary_loss_mlp": 0.00237172, + "balance_loss_clip": 1.04468071, + "balance_loss_mlp": 0.21120842, + "epoch": 0.8011423418006914, + "flos": 21829988903040.0, + "grad_norm": 30.963216070949777, + "language_loss": 0.84231734, + "learning_rate": 4.005687173776635e-07, + "loss": 0.85740507, + "num_input_tokens_seen": 287521660, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.25939941, + "step": 13325, + "time_per_iteration": 2.6473357677459717 + }, + { + "auxiliary_loss_clip": 0.01232688, + "auxiliary_loss_mlp": 0.0023196, + "balance_loss_clip": 1.02338266, + "balance_loss_mlp": 0.210788, + "epoch": 0.8012024650533593, + "flos": 23915178259200.0, + "grad_norm": 161.34635696752164, + "language_loss": 0.86002553, + "learning_rate": 4.003349231059898e-07, + "loss": 0.87467194, + "num_input_tokens_seen": 287541505, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.21179199, + "step": 13326, + "time_per_iteration": 2.7384510040283203 + }, + { + "auxiliary_loss_clip": 0.01236845, + "auxiliary_loss_mlp": 0.00210055, + "balance_loss_clip": 1.02598953, + "balance_loss_mlp": 0.18697609, + "epoch": 0.8012625883060274, + "flos": 23587062497280.0, + "grad_norm": 25.35427691245447, + "language_loss": 0.73929614, + "learning_rate": 4.001011894937765e-07, + "loss": 0.75376511, + "num_input_tokens_seen": 287560015, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.2310791, + "step": 13327, + "time_per_iteration": 2.68664813041687 + }, + { + "auxiliary_loss_clip": 0.01222487, + "auxiliary_loss_mlp": 0.00220997, + "balance_loss_clip": 1.01402974, + "balance_loss_mlp": 0.19797784, + "epoch": 0.8013227115586953, + "flos": 20813932886400.0, + "grad_norm": 46.80935288864243, + "language_loss": 0.80416256, + "learning_rate": 3.9986751654988636e-07, + "loss": 0.81859744, + "num_input_tokens_seen": 287579150, + "router_z_loss_clip": 2.08496094, + "router_z_loss_mlp": 0.22998047, + "step": 13328, + "time_per_iteration": 2.6755244731903076 + }, + { + "auxiliary_loss_clip": 0.01262573, + "auxiliary_loss_mlp": 0.00222169, + "balance_loss_clip": 1.0359478, + "balance_loss_mlp": 0.19564503, + "epoch": 0.8013828348113633, + "flos": 15888317788800.0, + "grad_norm": 23.857715401436934, + "language_loss": 0.83983791, + "learning_rate": 3.996339042831798e-07, + "loss": 0.85468537, + "num_input_tokens_seen": 287597420, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.26525879, + "step": 13329, + "time_per_iteration": 2.643158435821533 + }, + { + "auxiliary_loss_clip": 0.0114824, + "auxiliary_loss_mlp": 0.0009657, + "balance_loss_clip": 1.01178992, + "balance_loss_mlp": 0.08874957, + "epoch": 0.8014429580640313, + "flos": 71062981562880.0, + "grad_norm": 0.7281421273228869, + "language_loss": 0.5176186, + "learning_rate": 3.9940035270251605e-07, + "loss": 0.53006667, + "num_input_tokens_seen": 287667280, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.078125, + "step": 13330, + "time_per_iteration": 3.2616822719573975 + }, + { + "auxiliary_loss_clip": 0.01273737, + "auxiliary_loss_mlp": 0.00239242, + "balance_loss_clip": 1.04442763, + "balance_loss_mlp": 0.21081066, + "epoch": 0.8015030813166992, + "flos": 23076340968960.0, + "grad_norm": 141.26607903430966, + "language_loss": 0.82844937, + "learning_rate": 3.991668618167519e-07, + "loss": 0.84357917, + "num_input_tokens_seen": 287687375, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.2845459, + "step": 13331, + "time_per_iteration": 2.76023006439209 + }, + { + "auxiliary_loss_clip": 0.01253461, + "auxiliary_loss_mlp": 0.00225075, + "balance_loss_clip": 1.03474998, + "balance_loss_mlp": 0.20241365, + "epoch": 0.8015632045693672, + "flos": 21872328059520.0, + "grad_norm": 15.966703536998855, + "language_loss": 0.85656261, + "learning_rate": 3.989334316347401e-07, + "loss": 0.87134796, + "num_input_tokens_seen": 287707895, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.22668457, + "step": 13332, + "time_per_iteration": 2.7064309120178223 + }, + { + "auxiliary_loss_clip": 0.01248391, + "auxiliary_loss_mlp": 0.00210581, + "balance_loss_clip": 1.03058589, + "balance_loss_mlp": 0.1872994, + "epoch": 0.8016233278220352, + "flos": 23656728925440.0, + "grad_norm": 550.6250363063909, + "language_loss": 0.92105079, + "learning_rate": 3.987000621653338e-07, + "loss": 0.93564057, + "num_input_tokens_seen": 287723990, + "router_z_loss_clip": 2.17480469, + "router_z_loss_mlp": 0.23291016, + "step": 13333, + "time_per_iteration": 2.6614484786987305 + }, + { + "auxiliary_loss_clip": 0.01261054, + "auxiliary_loss_mlp": 0.00237031, + "balance_loss_clip": 1.04259574, + "balance_loss_mlp": 0.21084026, + "epoch": 0.8016834510747032, + "flos": 16253170185600.0, + "grad_norm": 3.1050735853742153, + "language_loss": 0.83754075, + "learning_rate": 3.9846675341738133e-07, + "loss": 0.85252154, + "num_input_tokens_seen": 287742380, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.26196289, + "step": 13334, + "time_per_iteration": 2.6660256385803223 + }, + { + "auxiliary_loss_clip": 0.01243336, + "auxiliary_loss_mlp": 0.00213839, + "balance_loss_clip": 1.03162324, + "balance_loss_mlp": 0.19189245, + "epoch": 0.8017435743273711, + "flos": 12276027665280.0, + "grad_norm": 10.9289256331366, + "language_loss": 0.84959412, + "learning_rate": 3.9823350539972967e-07, + "loss": 0.86416584, + "num_input_tokens_seen": 287760130, + "router_z_loss_clip": 2.11816406, + "router_z_loss_mlp": 0.21960449, + "step": 13335, + "time_per_iteration": 2.789978265762329 + }, + { + "auxiliary_loss_clip": 0.01239604, + "auxiliary_loss_mlp": 0.00238372, + "balance_loss_clip": 1.02638066, + "balance_loss_mlp": 0.21423197, + "epoch": 0.8018036975800391, + "flos": 17196112068480.0, + "grad_norm": 100.94040730779551, + "language_loss": 0.83407229, + "learning_rate": 3.9800031812122416e-07, + "loss": 0.84885204, + "num_input_tokens_seen": 287777565, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.24145508, + "step": 13336, + "time_per_iteration": 2.6316394805908203 + }, + { + "auxiliary_loss_clip": 0.01289011, + "auxiliary_loss_mlp": 0.00246087, + "balance_loss_clip": 1.05656385, + "balance_loss_mlp": 0.21826372, + "epoch": 0.801863820832707, + "flos": 20631865824000.0, + "grad_norm": 41.0829768475869, + "language_loss": 0.85865968, + "learning_rate": 3.977671915907068e-07, + "loss": 0.87401068, + "num_input_tokens_seen": 287796310, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.27832031, + "step": 13337, + "time_per_iteration": 2.66713547706604 + }, + { + "auxiliary_loss_clip": 0.01253497, + "auxiliary_loss_mlp": 0.00238593, + "balance_loss_clip": 1.03026509, + "balance_loss_mlp": 0.21359463, + "epoch": 0.801923944085375, + "flos": 30445569285120.0, + "grad_norm": 4.1154273739625475, + "language_loss": 0.87915128, + "learning_rate": 3.9753412581701883e-07, + "loss": 0.89407218, + "num_input_tokens_seen": 287817330, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.25024414, + "step": 13338, + "time_per_iteration": 2.7267210483551025 + }, + { + "auxiliary_loss_clip": 0.01262631, + "auxiliary_loss_mlp": 0.00243513, + "balance_loss_clip": 1.03338671, + "balance_loss_mlp": 0.21617785, + "epoch": 0.801984067338043, + "flos": 20010575255040.0, + "grad_norm": 40.87659454273491, + "language_loss": 0.83817852, + "learning_rate": 3.9730112080899733e-07, + "loss": 0.85323995, + "num_input_tokens_seen": 287835095, + "router_z_loss_clip": 2.29101562, + "router_z_loss_mlp": 0.27331543, + "step": 13339, + "time_per_iteration": 2.6773712635040283 + }, + { + "auxiliary_loss_clip": 0.01230098, + "auxiliary_loss_mlp": 0.00235633, + "balance_loss_clip": 1.01570952, + "balance_loss_mlp": 0.21189804, + "epoch": 0.802044190590711, + "flos": 22784028088320.0, + "grad_norm": 28.600044238528948, + "language_loss": 0.84815347, + "learning_rate": 3.970681765754775e-07, + "loss": 0.86281079, + "num_input_tokens_seen": 287854595, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.23718262, + "step": 13340, + "time_per_iteration": 2.694871664047241 + }, + { + "auxiliary_loss_clip": 0.01247634, + "auxiliary_loss_mlp": 0.00228216, + "balance_loss_clip": 1.03291106, + "balance_loss_mlp": 0.20611452, + "epoch": 0.8021043138433789, + "flos": 27600115639680.0, + "grad_norm": 8.035370125091926, + "language_loss": 0.7485615, + "learning_rate": 3.968352931252936e-07, + "loss": 0.76332003, + "num_input_tokens_seen": 287876960, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.22106934, + "step": 13341, + "time_per_iteration": 2.707085371017456 + }, + { + "auxiliary_loss_clip": 0.01130056, + "auxiliary_loss_mlp": 0.00091603, + "balance_loss_clip": 0.9924407, + "balance_loss_mlp": 0.08344873, + "epoch": 0.8021644370960469, + "flos": 62063730057600.0, + "grad_norm": 0.7893093306031082, + "language_loss": 0.60218072, + "learning_rate": 3.9660247046727547e-07, + "loss": 0.61439729, + "num_input_tokens_seen": 287936530, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.08154297, + "step": 13342, + "time_per_iteration": 3.132772922515869 + }, + { + "auxiliary_loss_clip": 0.01267858, + "auxiliary_loss_mlp": 0.00252464, + "balance_loss_clip": 1.04352903, + "balance_loss_mlp": 0.22663185, + "epoch": 0.8022245603487148, + "flos": 23361794352000.0, + "grad_norm": 23.99942322000829, + "language_loss": 0.70732796, + "learning_rate": 3.963697086102522e-07, + "loss": 0.72253114, + "num_input_tokens_seen": 287954285, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.25805664, + "step": 13343, + "time_per_iteration": 2.681363582611084 + }, + { + "auxiliary_loss_clip": 0.0124556, + "auxiliary_loss_mlp": 0.00218167, + "balance_loss_clip": 1.03194833, + "balance_loss_mlp": 0.19690034, + "epoch": 0.8022846836013828, + "flos": 10853354712960.0, + "grad_norm": 57.01862378702845, + "language_loss": 0.78110051, + "learning_rate": 3.96137007563051e-07, + "loss": 0.7957378, + "num_input_tokens_seen": 287971595, + "router_z_loss_clip": 2.13574219, + "router_z_loss_mlp": 0.21264648, + "step": 13344, + "time_per_iteration": 2.76662015914917 + }, + { + "auxiliary_loss_clip": 0.01256586, + "auxiliary_loss_mlp": 0.00236444, + "balance_loss_clip": 1.03482771, + "balance_loss_mlp": 0.21192294, + "epoch": 0.8023448068540509, + "flos": 29240443054080.0, + "grad_norm": 5.9417861497591336, + "language_loss": 0.77748871, + "learning_rate": 3.9590436733449506e-07, + "loss": 0.79241902, + "num_input_tokens_seen": 287992540, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.24536133, + "step": 13345, + "time_per_iteration": 2.7502329349517822 + }, + { + "auxiliary_loss_clip": 0.01135481, + "auxiliary_loss_mlp": 0.00107311, + "balance_loss_clip": 0.9968884, + "balance_loss_mlp": 0.09844204, + "epoch": 0.8024049301067188, + "flos": 64153588181760.0, + "grad_norm": 0.8927187641369104, + "language_loss": 0.62292808, + "learning_rate": 3.956717879334059e-07, + "loss": 0.63535601, + "num_input_tokens_seen": 288052810, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.08886719, + "step": 13346, + "time_per_iteration": 3.198486089706421 + }, + { + "auxiliary_loss_clip": 0.01245775, + "auxiliary_loss_mlp": 0.00225083, + "balance_loss_clip": 1.02880907, + "balance_loss_mlp": 0.20183763, + "epoch": 0.8024650533593868, + "flos": 28585360765440.0, + "grad_norm": 13.437239300558874, + "language_loss": 0.82039237, + "learning_rate": 3.9543926936860327e-07, + "loss": 0.83510101, + "num_input_tokens_seen": 288073045, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.23242188, + "step": 13347, + "time_per_iteration": 2.7025835514068604 + }, + { + "auxiliary_loss_clip": 0.01266283, + "auxiliary_loss_mlp": 0.00230912, + "balance_loss_clip": 1.03794754, + "balance_loss_mlp": 0.20530613, + "epoch": 0.8025251766120547, + "flos": 16982264448000.0, + "grad_norm": 70.36423090610836, + "language_loss": 0.83024544, + "learning_rate": 3.9520681164890493e-07, + "loss": 0.84521747, + "num_input_tokens_seen": 288091165, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.25598145, + "step": 13348, + "time_per_iteration": 2.6590094566345215 + }, + { + "auxiliary_loss_clip": 0.01261738, + "auxiliary_loss_mlp": 0.00238203, + "balance_loss_clip": 1.04118407, + "balance_loss_mlp": 0.21322861, + "epoch": 0.8025852998647227, + "flos": 22163671272960.0, + "grad_norm": 11.006415164490289, + "language_loss": 0.85313785, + "learning_rate": 3.9497441478312444e-07, + "loss": 0.86813724, + "num_input_tokens_seen": 288110595, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.24963379, + "step": 13349, + "time_per_iteration": 2.712337017059326 + }, + { + "auxiliary_loss_clip": 0.01245711, + "auxiliary_loss_mlp": 0.00219925, + "balance_loss_clip": 1.03271246, + "balance_loss_mlp": 0.1979664, + "epoch": 0.8026454231173906, + "flos": 22017012042240.0, + "grad_norm": 7.580473379573667, + "language_loss": 0.90387166, + "learning_rate": 3.947420787800755e-07, + "loss": 0.91852802, + "num_input_tokens_seen": 288128995, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.21960449, + "step": 13350, + "time_per_iteration": 2.6711838245391846 + }, + { + "auxiliary_loss_clip": 0.01238474, + "auxiliary_loss_mlp": 0.00196337, + "balance_loss_clip": 1.02410853, + "balance_loss_mlp": 0.17335302, + "epoch": 0.8027055463700586, + "flos": 22491320158080.0, + "grad_norm": 27.22096605772479, + "language_loss": 0.7778008, + "learning_rate": 3.945098036485679e-07, + "loss": 0.79214895, + "num_input_tokens_seen": 288149265, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.2298584, + "step": 13351, + "time_per_iteration": 2.7243239879608154 + }, + { + "auxiliary_loss_clip": 0.01257721, + "auxiliary_loss_mlp": 0.00254811, + "balance_loss_clip": 1.03902221, + "balance_loss_mlp": 0.23024245, + "epoch": 0.8027656696227266, + "flos": 28912901909760.0, + "grad_norm": 59.958548515775725, + "language_loss": 0.68042499, + "learning_rate": 3.9427758939740885e-07, + "loss": 0.69555032, + "num_input_tokens_seen": 288170745, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.24584961, + "step": 13352, + "time_per_iteration": 2.7727866172790527 + }, + { + "auxiliary_loss_clip": 0.0124882, + "auxiliary_loss_mlp": 0.00238562, + "balance_loss_clip": 1.03333926, + "balance_loss_mlp": 0.21423134, + "epoch": 0.8028257928753946, + "flos": 18589374760320.0, + "grad_norm": 180.43140815835739, + "language_loss": 0.84499592, + "learning_rate": 3.940454360354046e-07, + "loss": 0.85986972, + "num_input_tokens_seen": 288189415, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.24353027, + "step": 13353, + "time_per_iteration": 2.6403348445892334 + }, + { + "auxiliary_loss_clip": 0.01290813, + "auxiliary_loss_mlp": 0.00258958, + "balance_loss_clip": 1.06163442, + "balance_loss_mlp": 0.23235051, + "epoch": 0.8028859161280625, + "flos": 19130009339520.0, + "grad_norm": 13.38424889889698, + "language_loss": 0.82958329, + "learning_rate": 3.938133435713582e-07, + "loss": 0.84508103, + "num_input_tokens_seen": 288206900, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.26623535, + "step": 13354, + "time_per_iteration": 4.05318546295166 + }, + { + "auxiliary_loss_clip": 0.0123208, + "auxiliary_loss_mlp": 0.00232791, + "balance_loss_clip": 1.01566601, + "balance_loss_mlp": 0.20863955, + "epoch": 0.8029460393807305, + "flos": 20229881742720.0, + "grad_norm": 83.5248597671148, + "language_loss": 0.74910235, + "learning_rate": 3.935813120140714e-07, + "loss": 0.76375109, + "num_input_tokens_seen": 288224800, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.24169922, + "step": 13355, + "time_per_iteration": 4.0628321170806885 + }, + { + "auxiliary_loss_clip": 0.0127832, + "auxiliary_loss_mlp": 0.00221015, + "balance_loss_clip": 1.05140519, + "balance_loss_mlp": 0.19651759, + "epoch": 0.8030061626333984, + "flos": 49783320933120.0, + "grad_norm": 108.85149059242262, + "language_loss": 0.78861445, + "learning_rate": 3.9334934137234235e-07, + "loss": 0.80360782, + "num_input_tokens_seen": 288249400, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.24523926, + "step": 13356, + "time_per_iteration": 2.8984999656677246 + }, + { + "auxiliary_loss_clip": 0.01230792, + "auxiliary_loss_mlp": 0.00242097, + "balance_loss_clip": 1.01560855, + "balance_loss_mlp": 0.21969806, + "epoch": 0.8030662858860664, + "flos": 21615243442560.0, + "grad_norm": 45.49889289481716, + "language_loss": 0.8343066, + "learning_rate": 3.931174316549666e-07, + "loss": 0.8490355, + "num_input_tokens_seen": 288268780, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.22399902, + "step": 13357, + "time_per_iteration": 2.6517579555511475 + }, + { + "auxiliary_loss_clip": 0.01250197, + "auxiliary_loss_mlp": 0.0025898, + "balance_loss_clip": 1.02914035, + "balance_loss_mlp": 0.23425552, + "epoch": 0.8031264091387345, + "flos": 25630056351360.0, + "grad_norm": 47.7594139089667, + "language_loss": 0.84940434, + "learning_rate": 3.9288558287073937e-07, + "loss": 0.86449617, + "num_input_tokens_seen": 288290830, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.24719238, + "step": 13358, + "time_per_iteration": 2.6944169998168945 + }, + { + "auxiliary_loss_clip": 0.01244925, + "auxiliary_loss_mlp": 0.0023035, + "balance_loss_clip": 1.03067589, + "balance_loss_mlp": 0.20550701, + "epoch": 0.8031865323914024, + "flos": 19646225648640.0, + "grad_norm": 7.223557595053155, + "language_loss": 0.89886284, + "learning_rate": 3.9265379502845143e-07, + "loss": 0.91361558, + "num_input_tokens_seen": 288308865, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.24829102, + "step": 13359, + "time_per_iteration": 2.6465065479278564 + }, + { + "auxiliary_loss_clip": 0.01229162, + "auxiliary_loss_mlp": 0.00222954, + "balance_loss_clip": 1.01619279, + "balance_loss_mlp": 0.20059049, + "epoch": 0.8032466556440704, + "flos": 26169110732160.0, + "grad_norm": 5.69112239375238, + "language_loss": 0.80982065, + "learning_rate": 3.924220681368928e-07, + "loss": 0.82434177, + "num_input_tokens_seen": 288327325, + "router_z_loss_clip": 2.12597656, + "router_z_loss_mlp": 0.22375488, + "step": 13360, + "time_per_iteration": 4.235654354095459 + }, + { + "auxiliary_loss_clip": 0.01248028, + "auxiliary_loss_mlp": 0.00244401, + "balance_loss_clip": 1.02648234, + "balance_loss_mlp": 0.21972466, + "epoch": 0.8033067788967383, + "flos": 25520026014720.0, + "grad_norm": 78.73548626470712, + "language_loss": 0.77857566, + "learning_rate": 3.921904022048512e-07, + "loss": 0.79349989, + "num_input_tokens_seen": 288347285, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.24694824, + "step": 13361, + "time_per_iteration": 2.684387683868408 + }, + { + "auxiliary_loss_clip": 0.0127472, + "auxiliary_loss_mlp": 0.00240106, + "balance_loss_clip": 1.04683375, + "balance_loss_mlp": 0.21440491, + "epoch": 0.8033669021494063, + "flos": 24024274842240.0, + "grad_norm": 136.24253691707455, + "language_loss": 0.76891148, + "learning_rate": 3.919587972411098e-07, + "loss": 0.7840597, + "num_input_tokens_seen": 288367785, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.25708008, + "step": 13362, + "time_per_iteration": 2.6669068336486816 + }, + { + "auxiliary_loss_clip": 0.01292887, + "auxiliary_loss_mlp": 0.00239366, + "balance_loss_clip": 1.05437064, + "balance_loss_mlp": 0.21288951, + "epoch": 0.8034270254020742, + "flos": 13588059749760.0, + "grad_norm": 55.04409575539353, + "language_loss": 0.88798356, + "learning_rate": 3.91727253254452e-07, + "loss": 0.90330613, + "num_input_tokens_seen": 288384135, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.26501465, + "step": 13363, + "time_per_iteration": 2.6693735122680664 + }, + { + "auxiliary_loss_clip": 0.01253219, + "auxiliary_loss_mlp": 0.00238332, + "balance_loss_clip": 1.03125858, + "balance_loss_mlp": 0.21215346, + "epoch": 0.8034871486547422, + "flos": 27412661537280.0, + "grad_norm": 49.48004097073689, + "language_loss": 0.83734316, + "learning_rate": 3.9149577025365787e-07, + "loss": 0.85225868, + "num_input_tokens_seen": 288403805, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.26208496, + "step": 13364, + "time_per_iteration": 4.135625600814819 + }, + { + "auxiliary_loss_clip": 0.01250596, + "auxiliary_loss_mlp": 0.00211924, + "balance_loss_clip": 1.03789449, + "balance_loss_mlp": 0.18817732, + "epoch": 0.8035472719074102, + "flos": 32598593475840.0, + "grad_norm": 7.316695139101207, + "language_loss": 0.70202166, + "learning_rate": 3.9126434824750596e-07, + "loss": 0.71664685, + "num_input_tokens_seen": 288424895, + "router_z_loss_clip": 2.12988281, + "router_z_loss_mlp": 0.23754883, + "step": 13365, + "time_per_iteration": 2.7325398921966553 + }, + { + "auxiliary_loss_clip": 0.01258917, + "auxiliary_loss_mlp": 0.0023792, + "balance_loss_clip": 1.03571105, + "balance_loss_mlp": 0.2125161, + "epoch": 0.8036073951600782, + "flos": 21287989607040.0, + "grad_norm": 190.10057254573832, + "language_loss": 0.76276237, + "learning_rate": 3.910329872447706e-07, + "loss": 0.77773076, + "num_input_tokens_seen": 288443865, + "router_z_loss_clip": 2.23144531, + "router_z_loss_mlp": 0.25390625, + "step": 13366, + "time_per_iteration": 2.7344837188720703 + }, + { + "auxiliary_loss_clip": 0.01239281, + "auxiliary_loss_mlp": 0.00227204, + "balance_loss_clip": 1.02651763, + "balance_loss_mlp": 0.20398186, + "epoch": 0.8036675184127461, + "flos": 18113845582080.0, + "grad_norm": 25.887449963571843, + "language_loss": 0.82626504, + "learning_rate": 3.908016872542259e-07, + "loss": 0.84092987, + "num_input_tokens_seen": 288461065, + "router_z_loss_clip": 2.12988281, + "router_z_loss_mlp": 0.2322998, + "step": 13367, + "time_per_iteration": 2.668088436126709 + }, + { + "auxiliary_loss_clip": 0.01236995, + "auxiliary_loss_mlp": 0.00255311, + "balance_loss_clip": 1.02092957, + "balance_loss_mlp": 0.23249485, + "epoch": 0.8037276416654141, + "flos": 26030280666240.0, + "grad_norm": 31.075213940065026, + "language_loss": 0.79836792, + "learning_rate": 3.905704482846428e-07, + "loss": 0.81329101, + "num_input_tokens_seen": 288481865, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.22839355, + "step": 13368, + "time_per_iteration": 2.70517635345459 + }, + { + "auxiliary_loss_clip": 0.01266416, + "auxiliary_loss_mlp": 0.00232032, + "balance_loss_clip": 1.04115427, + "balance_loss_mlp": 0.2067356, + "epoch": 0.803787764918082, + "flos": 18802180886400.0, + "grad_norm": 113.14697651812297, + "language_loss": 0.77455181, + "learning_rate": 3.90339270344789e-07, + "loss": 0.78953624, + "num_input_tokens_seen": 288499345, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.25268555, + "step": 13369, + "time_per_iteration": 2.637619733810425 + }, + { + "auxiliary_loss_clip": 0.0124085, + "auxiliary_loss_mlp": 0.00249227, + "balance_loss_clip": 1.02665401, + "balance_loss_mlp": 0.22390737, + "epoch": 0.80384788817075, + "flos": 20225787592320.0, + "grad_norm": 1.8972763831888355, + "language_loss": 0.80979264, + "learning_rate": 3.901081534434312e-07, + "loss": 0.82469338, + "num_input_tokens_seen": 288517660, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.25341797, + "step": 13370, + "time_per_iteration": 2.657428503036499 + }, + { + "auxiliary_loss_clip": 0.01259928, + "auxiliary_loss_mlp": 0.00235682, + "balance_loss_clip": 1.03652656, + "balance_loss_mlp": 0.20988479, + "epoch": 0.8039080114234181, + "flos": 18515290959360.0, + "grad_norm": 176.30559898324182, + "language_loss": 0.98139298, + "learning_rate": 3.898770975893342e-07, + "loss": 0.99634904, + "num_input_tokens_seen": 288534180, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.25805664, + "step": 13371, + "time_per_iteration": 2.6572108268737793 + }, + { + "auxiliary_loss_clip": 0.01257792, + "auxiliary_loss_mlp": 0.00251467, + "balance_loss_clip": 1.03254783, + "balance_loss_mlp": 0.22447772, + "epoch": 0.803968134676086, + "flos": 22382510883840.0, + "grad_norm": 18.45679510602033, + "language_loss": 0.82192779, + "learning_rate": 3.89646102791259e-07, + "loss": 0.83702034, + "num_input_tokens_seen": 288553350, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.26965332, + "step": 13372, + "time_per_iteration": 2.6380958557128906 + }, + { + "auxiliary_loss_clip": 0.01258101, + "auxiliary_loss_mlp": 0.00216393, + "balance_loss_clip": 1.03403425, + "balance_loss_mlp": 0.19215785, + "epoch": 0.804028257928754, + "flos": 23842566915840.0, + "grad_norm": 76.62934500683733, + "language_loss": 0.89280176, + "learning_rate": 3.894151690579646e-07, + "loss": 0.90754664, + "num_input_tokens_seen": 288571325, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.24255371, + "step": 13373, + "time_per_iteration": 2.717031717300415 + }, + { + "auxiliary_loss_clip": 0.01238785, + "auxiliary_loss_mlp": 0.00236687, + "balance_loss_clip": 1.02605939, + "balance_loss_mlp": 0.21263021, + "epoch": 0.8040883811814219, + "flos": 23550720912000.0, + "grad_norm": 20.43892996652995, + "language_loss": 0.83311069, + "learning_rate": 3.8918429639820815e-07, + "loss": 0.84786546, + "num_input_tokens_seen": 288592100, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.24035645, + "step": 13374, + "time_per_iteration": 2.6644508838653564 + }, + { + "auxiliary_loss_clip": 0.01273537, + "auxiliary_loss_mlp": 0.00262213, + "balance_loss_clip": 1.04318357, + "balance_loss_mlp": 0.23524803, + "epoch": 0.8041485044340899, + "flos": 19026263882880.0, + "grad_norm": 1343.961670477193, + "language_loss": 0.8037284, + "learning_rate": 3.889534848207452e-07, + "loss": 0.81908584, + "num_input_tokens_seen": 288612305, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.26965332, + "step": 13375, + "time_per_iteration": 2.6703829765319824 + }, + { + "auxiliary_loss_clip": 0.01138887, + "auxiliary_loss_mlp": 0.0011901, + "balance_loss_clip": 1.00126243, + "balance_loss_mlp": 0.11023647, + "epoch": 0.8042086276867578, + "flos": 70005663797760.0, + "grad_norm": 8.922791356050814, + "language_loss": 0.55136746, + "learning_rate": 3.887227343343271e-07, + "loss": 0.56394643, + "num_input_tokens_seen": 288676015, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.08789062, + "step": 13376, + "time_per_iteration": 3.22613263130188 + }, + { + "auxiliary_loss_clip": 0.01259717, + "auxiliary_loss_mlp": 0.00235692, + "balance_loss_clip": 1.03685713, + "balance_loss_mlp": 0.20977575, + "epoch": 0.8042687509394258, + "flos": 21872435800320.0, + "grad_norm": 8.220750806448063, + "language_loss": 0.81260562, + "learning_rate": 3.8849204494770425e-07, + "loss": 0.82755971, + "num_input_tokens_seen": 288696455, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.2590332, + "step": 13377, + "time_per_iteration": 2.680286407470703 + }, + { + "auxiliary_loss_clip": 0.01249995, + "auxiliary_loss_mlp": 0.00224674, + "balance_loss_clip": 1.03266716, + "balance_loss_mlp": 0.20160744, + "epoch": 0.8043288741920938, + "flos": 26614870513920.0, + "grad_norm": 84.97066140119793, + "language_loss": 0.79709738, + "learning_rate": 3.8826141666962567e-07, + "loss": 0.81184411, + "num_input_tokens_seen": 288715560, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.23059082, + "step": 13378, + "time_per_iteration": 2.813237190246582 + }, + { + "auxiliary_loss_clip": 0.01262085, + "auxiliary_loss_mlp": 0.00221871, + "balance_loss_clip": 1.03893769, + "balance_loss_mlp": 0.1978502, + "epoch": 0.8043889974447618, + "flos": 33403387651200.0, + "grad_norm": 6.824453897003627, + "language_loss": 0.75001442, + "learning_rate": 3.880308495088347e-07, + "loss": 0.76485395, + "num_input_tokens_seen": 288739485, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.24023438, + "step": 13379, + "time_per_iteration": 2.7630767822265625 + }, + { + "auxiliary_loss_clip": 0.01277426, + "auxiliary_loss_mlp": 0.00265951, + "balance_loss_clip": 1.04223514, + "balance_loss_mlp": 0.23625553, + "epoch": 0.8044491206974297, + "flos": 20375966355840.0, + "grad_norm": 28.812975997072872, + "language_loss": 0.87465799, + "learning_rate": 3.8780034347407533e-07, + "loss": 0.89009178, + "num_input_tokens_seen": 288757420, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.29663086, + "step": 13380, + "time_per_iteration": 2.713365316390991 + }, + { + "auxiliary_loss_clip": 0.01250515, + "auxiliary_loss_mlp": 0.00250771, + "balance_loss_clip": 1.0334667, + "balance_loss_mlp": 0.22558242, + "epoch": 0.8045092439500977, + "flos": 23403810286080.0, + "grad_norm": 2.659097751368952, + "language_loss": 0.76742923, + "learning_rate": 3.875698985740887e-07, + "loss": 0.78244209, + "num_input_tokens_seen": 288775535, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.25183105, + "step": 13381, + "time_per_iteration": 2.671079397201538 + }, + { + "auxiliary_loss_clip": 0.01241161, + "auxiliary_loss_mlp": 0.00221605, + "balance_loss_clip": 1.02440238, + "balance_loss_mlp": 0.19787082, + "epoch": 0.8045693672027656, + "flos": 24097245321600.0, + "grad_norm": 40.2647674719375, + "language_loss": 0.74620104, + "learning_rate": 3.873395148176135e-07, + "loss": 0.76082873, + "num_input_tokens_seen": 288795035, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.23706055, + "step": 13382, + "time_per_iteration": 2.687542200088501 + }, + { + "auxiliary_loss_clip": 0.01250689, + "auxiliary_loss_mlp": 0.00237702, + "balance_loss_clip": 1.02764618, + "balance_loss_mlp": 0.21369284, + "epoch": 0.8046294904554336, + "flos": 27707165147520.0, + "grad_norm": 84.00647880272892, + "language_loss": 0.85339087, + "learning_rate": 3.8710919221338487e-07, + "loss": 0.86827475, + "num_input_tokens_seen": 288816270, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.24023438, + "step": 13383, + "time_per_iteration": 2.6875877380371094 + }, + { + "auxiliary_loss_clip": 0.01252318, + "auxiliary_loss_mlp": 0.00216847, + "balance_loss_clip": 1.03127384, + "balance_loss_mlp": 0.19542484, + "epoch": 0.8046896137081017, + "flos": 24972998814720.0, + "grad_norm": 377.4702004881045, + "language_loss": 0.7905826, + "learning_rate": 3.868789307701381e-07, + "loss": 0.80527425, + "num_input_tokens_seen": 288836050, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.21435547, + "step": 13384, + "time_per_iteration": 2.693364143371582 + }, + { + "auxiliary_loss_clip": 0.01275083, + "auxiliary_loss_mlp": 0.0026227, + "balance_loss_clip": 1.04468215, + "balance_loss_mlp": 0.23625837, + "epoch": 0.8047497369607696, + "flos": 17675484001920.0, + "grad_norm": 31.032553415830986, + "language_loss": 0.91479015, + "learning_rate": 3.8664873049660375e-07, + "loss": 0.93016374, + "num_input_tokens_seen": 288852900, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.26025391, + "step": 13385, + "time_per_iteration": 2.6032614707946777 + }, + { + "auxiliary_loss_clip": 0.01251802, + "auxiliary_loss_mlp": 0.00233957, + "balance_loss_clip": 1.03419256, + "balance_loss_mlp": 0.209746, + "epoch": 0.8048098602134376, + "flos": 22382079920640.0, + "grad_norm": 90.19486286047395, + "language_loss": 0.80491412, + "learning_rate": 3.864185914015108e-07, + "loss": 0.81977177, + "num_input_tokens_seen": 288872625, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.24230957, + "step": 13386, + "time_per_iteration": 2.6752798557281494 + }, + { + "auxiliary_loss_clip": 0.01142112, + "auxiliary_loss_mlp": 0.000791, + "balance_loss_clip": 1.00483704, + "balance_loss_mlp": 0.07118483, + "epoch": 0.8048699834661055, + "flos": 71200949702400.0, + "grad_norm": 0.6384686952769673, + "language_loss": 0.50608557, + "learning_rate": 3.861885134935865e-07, + "loss": 0.51829773, + "num_input_tokens_seen": 288939180, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.07910156, + "step": 13387, + "time_per_iteration": 3.2328684329986572 + }, + { + "auxiliary_loss_clip": 0.0124773, + "auxiliary_loss_mlp": 0.00237973, + "balance_loss_clip": 1.03156507, + "balance_loss_mlp": 0.21249783, + "epoch": 0.8049301067187735, + "flos": 23660320285440.0, + "grad_norm": 23.020309983108582, + "language_loss": 0.80344731, + "learning_rate": 3.859584967815559e-07, + "loss": 0.81830436, + "num_input_tokens_seen": 288958925, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.25463867, + "step": 13388, + "time_per_iteration": 2.7053186893463135 + }, + { + "auxiliary_loss_clip": 0.01273152, + "auxiliary_loss_mlp": 0.00253128, + "balance_loss_clip": 1.05116785, + "balance_loss_mlp": 0.22820163, + "epoch": 0.8049902299714414, + "flos": 24426330750720.0, + "grad_norm": 71.27830891852007, + "language_loss": 0.80172086, + "learning_rate": 3.857285412741411e-07, + "loss": 0.81698364, + "num_input_tokens_seen": 288980935, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.24902344, + "step": 13389, + "time_per_iteration": 2.7365095615386963 + }, + { + "auxiliary_loss_clip": 0.01247211, + "auxiliary_loss_mlp": 0.00225011, + "balance_loss_clip": 1.03143477, + "balance_loss_mlp": 0.20128885, + "epoch": 0.8050503532241094, + "flos": 17492626840320.0, + "grad_norm": 56.10731137447746, + "language_loss": 0.89683944, + "learning_rate": 3.8549864698006097e-07, + "loss": 0.91156167, + "num_input_tokens_seen": 288996780, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.23706055, + "step": 13390, + "time_per_iteration": 2.6669185161590576 + }, + { + "auxiliary_loss_clip": 0.01145253, + "auxiliary_loss_mlp": 0.0009024, + "balance_loss_clip": 1.0076654, + "balance_loss_mlp": 0.08232447, + "epoch": 0.8051104764767774, + "flos": 57658030369920.0, + "grad_norm": 0.7667739381795756, + "language_loss": 0.55112123, + "learning_rate": 3.8526881390803424e-07, + "loss": 0.5634762, + "num_input_tokens_seen": 289057590, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.07910156, + "step": 13391, + "time_per_iteration": 3.1184849739074707 + }, + { + "auxiliary_loss_clip": 0.01248163, + "auxiliary_loss_mlp": 0.00237319, + "balance_loss_clip": 1.03437996, + "balance_loss_mlp": 0.21345332, + "epoch": 0.8051705997294454, + "flos": 18003456109440.0, + "grad_norm": 450.2839108287048, + "language_loss": 0.91094708, + "learning_rate": 3.850390420667762e-07, + "loss": 0.92580187, + "num_input_tokens_seen": 289076285, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.23864746, + "step": 13392, + "time_per_iteration": 2.598285675048828 + }, + { + "auxiliary_loss_clip": 0.01259479, + "auxiliary_loss_mlp": 0.00225605, + "balance_loss_clip": 1.03335166, + "balance_loss_mlp": 0.20011824, + "epoch": 0.8052307229821133, + "flos": 26397754755840.0, + "grad_norm": 1.865151941120161, + "language_loss": 0.76227707, + "learning_rate": 3.8480933146499914e-07, + "loss": 0.77712786, + "num_input_tokens_seen": 289097585, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.25463867, + "step": 13393, + "time_per_iteration": 2.7135493755340576 + }, + { + "auxiliary_loss_clip": 0.01248537, + "auxiliary_loss_mlp": 0.00219981, + "balance_loss_clip": 1.02663803, + "balance_loss_mlp": 0.195317, + "epoch": 0.8052908462347813, + "flos": 21757018423680.0, + "grad_norm": 4.49333863306604, + "language_loss": 0.85536569, + "learning_rate": 3.84579682111414e-07, + "loss": 0.87005085, + "num_input_tokens_seen": 289116890, + "router_z_loss_clip": 2.21777344, + "router_z_loss_mlp": 0.24658203, + "step": 13394, + "time_per_iteration": 2.6541073322296143 + }, + { + "auxiliary_loss_clip": 0.01240466, + "auxiliary_loss_mlp": 0.00234984, + "balance_loss_clip": 1.02460265, + "balance_loss_mlp": 0.21270406, + "epoch": 0.8053509694874492, + "flos": 25442279026560.0, + "grad_norm": 9.060740892138583, + "language_loss": 0.71725887, + "learning_rate": 3.843500940147304e-07, + "loss": 0.73201334, + "num_input_tokens_seen": 289136670, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.22265625, + "step": 13395, + "time_per_iteration": 2.6910953521728516 + }, + { + "auxiliary_loss_clip": 0.01146215, + "auxiliary_loss_mlp": 0.00092398, + "balance_loss_clip": 1.00726295, + "balance_loss_mlp": 0.08419637, + "epoch": 0.8054110927401172, + "flos": 57668122091520.0, + "grad_norm": 0.7287275401128784, + "language_loss": 0.56923556, + "learning_rate": 3.8412056718365206e-07, + "loss": 0.58162177, + "num_input_tokens_seen": 289200150, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.08203125, + "step": 13396, + "time_per_iteration": 4.699950933456421 + }, + { + "auxiliary_loss_clip": 0.01255019, + "auxiliary_loss_mlp": 0.00246364, + "balance_loss_clip": 1.03394127, + "balance_loss_mlp": 0.22130668, + "epoch": 0.8054712159927853, + "flos": 19276201693440.0, + "grad_norm": 121.12208683514432, + "language_loss": 0.85006285, + "learning_rate": 3.8389110162688353e-07, + "loss": 0.86507666, + "num_input_tokens_seen": 289218125, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.25048828, + "step": 13397, + "time_per_iteration": 4.187336206436157 + }, + { + "auxiliary_loss_clip": 0.01259086, + "auxiliary_loss_mlp": 0.00235568, + "balance_loss_clip": 1.03968883, + "balance_loss_mlp": 0.21107073, + "epoch": 0.8055313392454532, + "flos": 17967617314560.0, + "grad_norm": 57.827894557795126, + "language_loss": 0.77488101, + "learning_rate": 3.836616973531266e-07, + "loss": 0.78982747, + "num_input_tokens_seen": 289237115, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.24499512, + "step": 13398, + "time_per_iteration": 2.6124346256256104 + }, + { + "auxiliary_loss_clip": 0.01254785, + "auxiliary_loss_mlp": 0.00236582, + "balance_loss_clip": 1.03636885, + "balance_loss_mlp": 0.21241796, + "epoch": 0.8055914624981212, + "flos": 13478352635520.0, + "grad_norm": 3.2574701233077494, + "language_loss": 0.79885721, + "learning_rate": 3.834323543710805e-07, + "loss": 0.81377089, + "num_input_tokens_seen": 289253635, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.24169922, + "step": 13399, + "time_per_iteration": 2.7147955894470215 + }, + { + "auxiliary_loss_clip": 0.01264085, + "auxiliary_loss_mlp": 0.0022882, + "balance_loss_clip": 1.03995109, + "balance_loss_mlp": 0.20496653, + "epoch": 0.8056515857507891, + "flos": 13224787551360.0, + "grad_norm": 7.993489262583065, + "language_loss": 0.81231809, + "learning_rate": 3.8320307268944153e-07, + "loss": 0.82724714, + "num_input_tokens_seen": 289270085, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.23864746, + "step": 13400, + "time_per_iteration": 2.601632833480835 + }, + { + "auxiliary_loss_clip": 0.01248706, + "auxiliary_loss_mlp": 0.00255825, + "balance_loss_clip": 1.03121829, + "balance_loss_mlp": 0.23149444, + "epoch": 0.8057117090034571, + "flos": 23878190229120.0, + "grad_norm": 90.83406184252226, + "language_loss": 0.71074671, + "learning_rate": 3.829738523169037e-07, + "loss": 0.72579199, + "num_input_tokens_seen": 289289645, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.2434082, + "step": 13401, + "time_per_iteration": 2.7417078018188477 + }, + { + "auxiliary_loss_clip": 0.01264047, + "auxiliary_loss_mlp": 0.0022414, + "balance_loss_clip": 1.04393911, + "balance_loss_mlp": 0.20046508, + "epoch": 0.805771832256125, + "flos": 21214300855680.0, + "grad_norm": 16.430105362590332, + "language_loss": 0.94417059, + "learning_rate": 3.8274469326215985e-07, + "loss": 0.95905244, + "num_input_tokens_seen": 289306630, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.23669434, + "step": 13402, + "time_per_iteration": 4.176253318786621 + }, + { + "auxiliary_loss_clip": 0.01279083, + "auxiliary_loss_mlp": 0.00250118, + "balance_loss_clip": 1.05207396, + "balance_loss_mlp": 0.22339134, + "epoch": 0.805831955508793, + "flos": 17566818382080.0, + "grad_norm": 172.07423420212947, + "language_loss": 0.78020012, + "learning_rate": 3.8251559553389876e-07, + "loss": 0.79549217, + "num_input_tokens_seen": 289324960, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.26745605, + "step": 13403, + "time_per_iteration": 2.6191420555114746 + }, + { + "auxiliary_loss_clip": 0.01261014, + "auxiliary_loss_mlp": 0.00255452, + "balance_loss_clip": 1.04163229, + "balance_loss_mlp": 0.23133603, + "epoch": 0.805892078761461, + "flos": 26907542530560.0, + "grad_norm": 18.2882141141593, + "language_loss": 0.9095974, + "learning_rate": 3.822865591408084e-07, + "loss": 0.92476213, + "num_input_tokens_seen": 289344980, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.2409668, + "step": 13404, + "time_per_iteration": 2.7151122093200684 + }, + { + "auxiliary_loss_clip": 0.01244529, + "auxiliary_loss_mlp": 0.00226377, + "balance_loss_clip": 1.02918839, + "balance_loss_mlp": 0.20394158, + "epoch": 0.805952202014129, + "flos": 31506442496640.0, + "grad_norm": 4.523370585123993, + "language_loss": 0.78012609, + "learning_rate": 3.820575840915743e-07, + "loss": 0.79483509, + "num_input_tokens_seen": 289367500, + "router_z_loss_clip": 2.15136719, + "router_z_loss_mlp": 0.22436523, + "step": 13405, + "time_per_iteration": 2.7438855171203613 + }, + { + "auxiliary_loss_clip": 0.01257641, + "auxiliary_loss_mlp": 0.00228806, + "balance_loss_clip": 1.03842735, + "balance_loss_mlp": 0.2045826, + "epoch": 0.8060123252667969, + "flos": 24389953251840.0, + "grad_norm": 284.6279377324054, + "language_loss": 0.84087789, + "learning_rate": 3.818286703948788e-07, + "loss": 0.85574234, + "num_input_tokens_seen": 289385930, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.24206543, + "step": 13406, + "time_per_iteration": 4.094002962112427 + }, + { + "auxiliary_loss_clip": 0.01245383, + "auxiliary_loss_mlp": 0.00228872, + "balance_loss_clip": 1.02690315, + "balance_loss_mlp": 0.20506588, + "epoch": 0.8060724485194649, + "flos": 23479941162240.0, + "grad_norm": 2.285205576482345, + "language_loss": 0.81345505, + "learning_rate": 3.815998180594018e-07, + "loss": 0.8281976, + "num_input_tokens_seen": 289408025, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.23803711, + "step": 13407, + "time_per_iteration": 2.691359281539917 + }, + { + "auxiliary_loss_clip": 0.01249637, + "auxiliary_loss_mlp": 0.00227407, + "balance_loss_clip": 1.0286411, + "balance_loss_mlp": 0.20257543, + "epoch": 0.8061325717721328, + "flos": 18624495283200.0, + "grad_norm": 22.84082826215416, + "language_loss": 0.83983898, + "learning_rate": 3.81371027093822e-07, + "loss": 0.85460943, + "num_input_tokens_seen": 289426575, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.2479248, + "step": 13408, + "time_per_iteration": 2.6235780715942383 + }, + { + "auxiliary_loss_clip": 0.0124423, + "auxiliary_loss_mlp": 0.00230476, + "balance_loss_clip": 1.02462709, + "balance_loss_mlp": 0.20579922, + "epoch": 0.8061926950248008, + "flos": 23582752865280.0, + "grad_norm": 30.60667076260256, + "language_loss": 0.78524947, + "learning_rate": 3.8114229750681523e-07, + "loss": 0.7999965, + "num_input_tokens_seen": 289447760, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.24682617, + "step": 13409, + "time_per_iteration": 2.7211503982543945 + }, + { + "auxiliary_loss_clip": 0.01262282, + "auxiliary_loss_mlp": 0.00231293, + "balance_loss_clip": 1.03244066, + "balance_loss_mlp": 0.20467384, + "epoch": 0.8062528182774689, + "flos": 11143333209600.0, + "grad_norm": 23.054110592069275, + "language_loss": 0.85434699, + "learning_rate": 3.809136293070545e-07, + "loss": 0.86928272, + "num_input_tokens_seen": 289463920, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.26599121, + "step": 13410, + "time_per_iteration": 2.656146764755249 + }, + { + "auxiliary_loss_clip": 0.01251801, + "auxiliary_loss_mlp": 0.00238477, + "balance_loss_clip": 1.03421736, + "balance_loss_mlp": 0.21501659, + "epoch": 0.8063129415301368, + "flos": 22346815743360.0, + "grad_norm": 11.28384225965076, + "language_loss": 0.7496655, + "learning_rate": 3.806850225032117e-07, + "loss": 0.76456833, + "num_input_tokens_seen": 289482635, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.234375, + "step": 13411, + "time_per_iteration": 2.6835126876831055 + }, + { + "auxiliary_loss_clip": 0.01252042, + "auxiliary_loss_mlp": 0.00249117, + "balance_loss_clip": 1.03376555, + "balance_loss_mlp": 0.22318941, + "epoch": 0.8063730647828048, + "flos": 23988400133760.0, + "grad_norm": 32.174117926128616, + "language_loss": 0.75467443, + "learning_rate": 3.804564771039551e-07, + "loss": 0.76968604, + "num_input_tokens_seen": 289502040, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.25915527, + "step": 13412, + "time_per_iteration": 2.7118241786956787 + }, + { + "auxiliary_loss_clip": 0.01264257, + "auxiliary_loss_mlp": 0.00268769, + "balance_loss_clip": 1.03252089, + "balance_loss_mlp": 0.24032545, + "epoch": 0.8064331880354727, + "flos": 21321494017920.0, + "grad_norm": 12.08115915949616, + "language_loss": 0.87558705, + "learning_rate": 3.8022799311795064e-07, + "loss": 0.8909173, + "num_input_tokens_seen": 289520740, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.2845459, + "step": 13413, + "time_per_iteration": 2.6894147396087646 + }, + { + "auxiliary_loss_clip": 0.01250243, + "auxiliary_loss_mlp": 0.00244408, + "balance_loss_clip": 1.03484488, + "balance_loss_mlp": 0.22140096, + "epoch": 0.8064933112881407, + "flos": 19682890456320.0, + "grad_norm": 3.687088213967103, + "language_loss": 0.91515368, + "learning_rate": 3.7999957055386303e-07, + "loss": 0.9301002, + "num_input_tokens_seen": 289535840, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.23010254, + "step": 13414, + "time_per_iteration": 2.641120672225952 + }, + { + "auxiliary_loss_clip": 0.01245008, + "auxiliary_loss_mlp": 0.00220543, + "balance_loss_clip": 1.02949715, + "balance_loss_mlp": 0.19748822, + "epoch": 0.8065534345408086, + "flos": 19279721226240.0, + "grad_norm": 2.280350307751352, + "language_loss": 0.74744481, + "learning_rate": 3.7977120942035467e-07, + "loss": 0.76210034, + "num_input_tokens_seen": 289555205, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.23071289, + "step": 13415, + "time_per_iteration": 2.6619577407836914 + }, + { + "auxiliary_loss_clip": 0.0123263, + "auxiliary_loss_mlp": 0.00222261, + "balance_loss_clip": 1.02014446, + "balance_loss_mlp": 0.19993338, + "epoch": 0.8066135577934767, + "flos": 19677718897920.0, + "grad_norm": 24.11568833660313, + "language_loss": 0.83011311, + "learning_rate": 3.7954290972608383e-07, + "loss": 0.84466195, + "num_input_tokens_seen": 289573000, + "router_z_loss_clip": 2.12792969, + "router_z_loss_mlp": 0.2232666, + "step": 13416, + "time_per_iteration": 2.6445231437683105 + }, + { + "auxiliary_loss_clip": 0.01252896, + "auxiliary_loss_mlp": 0.00237302, + "balance_loss_clip": 1.03214204, + "balance_loss_mlp": 0.2140439, + "epoch": 0.8066736810461446, + "flos": 21143592933120.0, + "grad_norm": 9.15248673146207, + "language_loss": 0.74198294, + "learning_rate": 3.793146714797086e-07, + "loss": 0.75688493, + "num_input_tokens_seen": 289592625, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.2322998, + "step": 13417, + "time_per_iteration": 2.681936740875244 + }, + { + "auxiliary_loss_clip": 0.01251826, + "auxiliary_loss_mlp": 0.00252742, + "balance_loss_clip": 1.03338814, + "balance_loss_mlp": 0.22729042, + "epoch": 0.8067338042988126, + "flos": 22598261925120.0, + "grad_norm": 123.29715642451211, + "language_loss": 0.88614208, + "learning_rate": 3.7908649468988306e-07, + "loss": 0.90118778, + "num_input_tokens_seen": 289610780, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.2545166, + "step": 13418, + "time_per_iteration": 2.6361660957336426 + }, + { + "auxiliary_loss_clip": 0.01254563, + "auxiliary_loss_mlp": 0.00233818, + "balance_loss_clip": 1.03731489, + "balance_loss_mlp": 0.20935667, + "epoch": 0.8067939275514805, + "flos": 16508423208960.0, + "grad_norm": 15.925859379614634, + "language_loss": 0.92935592, + "learning_rate": 3.7885837936526066e-07, + "loss": 0.94423974, + "num_input_tokens_seen": 289628890, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.24462891, + "step": 13419, + "time_per_iteration": 2.6347508430480957 + }, + { + "auxiliary_loss_clip": 0.01261074, + "auxiliary_loss_mlp": 0.00227731, + "balance_loss_clip": 1.03413439, + "balance_loss_mlp": 0.20251787, + "epoch": 0.8068540508041485, + "flos": 28541836460160.0, + "grad_norm": 40.16399378297997, + "language_loss": 0.82942653, + "learning_rate": 3.7863032551449047e-07, + "loss": 0.84431458, + "num_input_tokens_seen": 289647220, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.2520752, + "step": 13420, + "time_per_iteration": 2.665339708328247 + }, + { + "auxiliary_loss_clip": 0.01238303, + "auxiliary_loss_mlp": 0.00231488, + "balance_loss_clip": 1.0232482, + "balance_loss_mlp": 0.20854029, + "epoch": 0.8069141740568164, + "flos": 21652482867840.0, + "grad_norm": 9.009966713304287, + "language_loss": 0.86823291, + "learning_rate": 3.784023331462207e-07, + "loss": 0.88293087, + "num_input_tokens_seen": 289665800, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.22949219, + "step": 13421, + "time_per_iteration": 2.669095039367676 + }, + { + "auxiliary_loss_clip": 0.01256225, + "auxiliary_loss_mlp": 0.00233196, + "balance_loss_clip": 1.03630042, + "balance_loss_mlp": 0.20903192, + "epoch": 0.8069742973094844, + "flos": 17529327561600.0, + "grad_norm": 1716.2104624425883, + "language_loss": 0.85891342, + "learning_rate": 3.78174402269098e-07, + "loss": 0.87380767, + "num_input_tokens_seen": 289682705, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.24157715, + "step": 13422, + "time_per_iteration": 2.5828311443328857 + }, + { + "auxiliary_loss_clip": 0.01234135, + "auxiliary_loss_mlp": 0.00223859, + "balance_loss_clip": 1.01759672, + "balance_loss_mlp": 0.20196025, + "epoch": 0.8070344205621525, + "flos": 23367037737600.0, + "grad_norm": 105.13362525872822, + "language_loss": 0.74513805, + "learning_rate": 3.7794653289176347e-07, + "loss": 0.75971794, + "num_input_tokens_seen": 289702920, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.21899414, + "step": 13423, + "time_per_iteration": 2.6549413204193115 + }, + { + "auxiliary_loss_clip": 0.01268307, + "auxiliary_loss_mlp": 0.00225823, + "balance_loss_clip": 1.03782344, + "balance_loss_mlp": 0.19897754, + "epoch": 0.8070945438148204, + "flos": 22930184528640.0, + "grad_norm": 10.001271829873442, + "language_loss": 0.87968123, + "learning_rate": 3.7771872502285904e-07, + "loss": 0.8946225, + "num_input_tokens_seen": 289723280, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.26855469, + "step": 13424, + "time_per_iteration": 2.6913044452667236 + }, + { + "auxiliary_loss_clip": 0.01245122, + "auxiliary_loss_mlp": 0.00263436, + "balance_loss_clip": 1.03073287, + "balance_loss_mlp": 0.23883098, + "epoch": 0.8071546670674884, + "flos": 25300683613440.0, + "grad_norm": 6.258808657573291, + "language_loss": 0.86463463, + "learning_rate": 3.774909786710232e-07, + "loss": 0.87972021, + "num_input_tokens_seen": 289743475, + "router_z_loss_clip": 2.14355469, + "router_z_loss_mlp": 0.24597168, + "step": 13425, + "time_per_iteration": 2.6728477478027344 + }, + { + "auxiliary_loss_clip": 0.0124957, + "auxiliary_loss_mlp": 0.00228333, + "balance_loss_clip": 1.02787173, + "balance_loss_mlp": 0.20451505, + "epoch": 0.8072147903201563, + "flos": 18113701927680.0, + "grad_norm": 8558.256844419511, + "language_loss": 0.89631563, + "learning_rate": 3.772632938448923e-07, + "loss": 0.91109467, + "num_input_tokens_seen": 289761400, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.23803711, + "step": 13426, + "time_per_iteration": 2.731123447418213 + }, + { + "auxiliary_loss_clip": 0.01262484, + "auxiliary_loss_mlp": 0.00219362, + "balance_loss_clip": 1.03890538, + "balance_loss_mlp": 0.19579436, + "epoch": 0.8072749135728243, + "flos": 26688164215680.0, + "grad_norm": 3.0227160680949874, + "language_loss": 0.82497156, + "learning_rate": 3.770356705530997e-07, + "loss": 0.83978999, + "num_input_tokens_seen": 289781025, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.23571777, + "step": 13427, + "time_per_iteration": 2.841608762741089 + }, + { + "auxiliary_loss_clip": 0.01257025, + "auxiliary_loss_mlp": 0.00235949, + "balance_loss_clip": 1.03769422, + "balance_loss_mlp": 0.21157047, + "epoch": 0.8073350368254922, + "flos": 19240291071360.0, + "grad_norm": 320.01965037648597, + "language_loss": 0.76852983, + "learning_rate": 3.768081088042774e-07, + "loss": 0.78345954, + "num_input_tokens_seen": 289798380, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.24389648, + "step": 13428, + "time_per_iteration": 2.689815044403076 + }, + { + "auxiliary_loss_clip": 0.01252751, + "auxiliary_loss_mlp": 0.00243252, + "balance_loss_clip": 1.02789426, + "balance_loss_mlp": 0.21886221, + "epoch": 0.8073951600781603, + "flos": 13334530579200.0, + "grad_norm": 2942.461241852755, + "language_loss": 0.83870494, + "learning_rate": 3.765806086070544e-07, + "loss": 0.85366499, + "num_input_tokens_seen": 289814515, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.24389648, + "step": 13429, + "time_per_iteration": 2.625699520111084 + }, + { + "auxiliary_loss_clip": 0.01220719, + "auxiliary_loss_mlp": 0.00206126, + "balance_loss_clip": 1.01021051, + "balance_loss_mlp": 0.18286823, + "epoch": 0.8074552833308282, + "flos": 22853191726080.0, + "grad_norm": 23.499952320300277, + "language_loss": 0.74466062, + "learning_rate": 3.763531699700568e-07, + "loss": 0.75892901, + "num_input_tokens_seen": 289834315, + "router_z_loss_clip": 2.10839844, + "router_z_loss_mlp": 0.23254395, + "step": 13430, + "time_per_iteration": 2.7014336585998535 + }, + { + "auxiliary_loss_clip": 0.01262704, + "auxiliary_loss_mlp": 0.00252309, + "balance_loss_clip": 1.04387772, + "balance_loss_mlp": 0.22791901, + "epoch": 0.8075154065834962, + "flos": 20339409288960.0, + "grad_norm": 13.211270407531273, + "language_loss": 0.8742609, + "learning_rate": 3.7612579290190994e-07, + "loss": 0.88941109, + "num_input_tokens_seen": 289853770, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.24377441, + "step": 13431, + "time_per_iteration": 2.6349031925201416 + }, + { + "auxiliary_loss_clip": 0.01249615, + "auxiliary_loss_mlp": 0.00234586, + "balance_loss_clip": 1.03149223, + "balance_loss_mlp": 0.20984988, + "epoch": 0.8075755298361641, + "flos": 21908059113600.0, + "grad_norm": 4.6482697957036505, + "language_loss": 0.8944813, + "learning_rate": 3.7589847741123593e-07, + "loss": 0.90932333, + "num_input_tokens_seen": 289870480, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.24743652, + "step": 13432, + "time_per_iteration": 2.718703508377075 + }, + { + "auxiliary_loss_clip": 0.01261095, + "auxiliary_loss_mlp": 0.0025805, + "balance_loss_clip": 1.03807998, + "balance_loss_mlp": 0.23085867, + "epoch": 0.8076356530888321, + "flos": 15669298609920.0, + "grad_norm": 22.80120514440815, + "language_loss": 0.79597014, + "learning_rate": 3.7567122350665415e-07, + "loss": 0.81116164, + "num_input_tokens_seen": 289888275, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.27197266, + "step": 13433, + "time_per_iteration": 2.6393189430236816 + }, + { + "auxiliary_loss_clip": 0.01236564, + "auxiliary_loss_mlp": 0.00237755, + "balance_loss_clip": 1.02514303, + "balance_loss_mlp": 0.21402074, + "epoch": 0.8076957763415, + "flos": 37777414521600.0, + "grad_norm": 4.748352995834898, + "language_loss": 0.78486615, + "learning_rate": 3.754440311967828e-07, + "loss": 0.79960936, + "num_input_tokens_seen": 289911495, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.23754883, + "step": 13434, + "time_per_iteration": 2.8145241737365723 + }, + { + "auxiliary_loss_clip": 0.01271945, + "auxiliary_loss_mlp": 0.00231238, + "balance_loss_clip": 1.05169308, + "balance_loss_mlp": 0.20745617, + "epoch": 0.807755899594168, + "flos": 19610781903360.0, + "grad_norm": 19700.835075823325, + "language_loss": 0.76070607, + "learning_rate": 3.752169004902361e-07, + "loss": 0.77573788, + "num_input_tokens_seen": 289930045, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.23779297, + "step": 13435, + "time_per_iteration": 2.698435068130493 + }, + { + "auxiliary_loss_clip": 0.0127035, + "auxiliary_loss_mlp": 0.00241399, + "balance_loss_clip": 1.04275489, + "balance_loss_mlp": 0.21541128, + "epoch": 0.8078160228468361, + "flos": 23294893271040.0, + "grad_norm": 3.2848726158730615, + "language_loss": 0.81739044, + "learning_rate": 3.749898313956279e-07, + "loss": 0.83250797, + "num_input_tokens_seen": 289950815, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.2598877, + "step": 13436, + "time_per_iteration": 2.6979031562805176 + }, + { + "auxiliary_loss_clip": 0.01240251, + "auxiliary_loss_mlp": 0.00243392, + "balance_loss_clip": 1.02127647, + "balance_loss_mlp": 0.21858504, + "epoch": 0.807876146099504, + "flos": 27162651899520.0, + "grad_norm": 15.111051984232606, + "language_loss": 0.78022313, + "learning_rate": 3.747628239215674e-07, + "loss": 0.79505956, + "num_input_tokens_seen": 289971730, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.24841309, + "step": 13437, + "time_per_iteration": 2.7338192462921143 + }, + { + "auxiliary_loss_clip": 0.01254632, + "auxiliary_loss_mlp": 0.00225478, + "balance_loss_clip": 1.03665984, + "balance_loss_mlp": 0.20187488, + "epoch": 0.807936269352172, + "flos": 27160030206720.0, + "grad_norm": 3.25190176416486, + "language_loss": 0.8013736, + "learning_rate": 3.745358780766636e-07, + "loss": 0.81617463, + "num_input_tokens_seen": 289992995, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.23571777, + "step": 13438, + "time_per_iteration": 4.0928168296813965 + }, + { + "auxiliary_loss_clip": 0.01247094, + "auxiliary_loss_mlp": 0.00252324, + "balance_loss_clip": 1.02789414, + "balance_loss_mlp": 0.22717035, + "epoch": 0.8079963926048399, + "flos": 20740423703040.0, + "grad_norm": 65.55299533951268, + "language_loss": 0.84924513, + "learning_rate": 3.7430899386952344e-07, + "loss": 0.86423934, + "num_input_tokens_seen": 290009405, + "router_z_loss_clip": 2.19433594, + "router_z_loss_mlp": 0.25158691, + "step": 13439, + "time_per_iteration": 4.104784965515137 + }, + { + "auxiliary_loss_clip": 0.01254695, + "auxiliary_loss_mlp": 0.00218037, + "balance_loss_clip": 1.03483987, + "balance_loss_mlp": 0.19495818, + "epoch": 0.8080565158575079, + "flos": 25009663622400.0, + "grad_norm": 10.132168332826955, + "language_loss": 0.85041952, + "learning_rate": 3.7408217130874786e-07, + "loss": 0.86514688, + "num_input_tokens_seen": 290031085, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.23059082, + "step": 13440, + "time_per_iteration": 2.7142832279205322 + }, + { + "auxiliary_loss_clip": 0.0123807, + "auxiliary_loss_mlp": 0.00233642, + "balance_loss_clip": 1.02188754, + "balance_loss_mlp": 0.20842946, + "epoch": 0.8081166391101758, + "flos": 18698076293760.0, + "grad_norm": 6.882992382563688, + "language_loss": 0.69572771, + "learning_rate": 3.7385541040293946e-07, + "loss": 0.71044481, + "num_input_tokens_seen": 290048670, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.25231934, + "step": 13441, + "time_per_iteration": 2.6069788932800293 + }, + { + "auxiliary_loss_clip": 0.01245987, + "auxiliary_loss_mlp": 0.00234686, + "balance_loss_clip": 1.02653992, + "balance_loss_mlp": 0.20875832, + "epoch": 0.8081767623628439, + "flos": 19828651847040.0, + "grad_norm": 6.922772703735724, + "language_loss": 0.83093703, + "learning_rate": 3.7362871116069684e-07, + "loss": 0.84574378, + "num_input_tokens_seen": 290064085, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.25915527, + "step": 13442, + "time_per_iteration": 2.712170362472534 + }, + { + "auxiliary_loss_clip": 0.012328, + "auxiliary_loss_mlp": 0.00247998, + "balance_loss_clip": 1.01818562, + "balance_loss_mlp": 0.22386965, + "epoch": 0.8082368856155118, + "flos": 35772952982400.0, + "grad_norm": 5.111849033620663, + "language_loss": 0.77814567, + "learning_rate": 3.734020735906169e-07, + "loss": 0.79295361, + "num_input_tokens_seen": 290086255, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.24108887, + "step": 13443, + "time_per_iteration": 2.797401189804077 + }, + { + "auxiliary_loss_clip": 0.01244169, + "auxiliary_loss_mlp": 0.00236437, + "balance_loss_clip": 1.02836001, + "balance_loss_mlp": 0.21290547, + "epoch": 0.8082970088681798, + "flos": 17198015489280.0, + "grad_norm": 61.29089920834056, + "language_loss": 0.88807452, + "learning_rate": 3.7317549770129286e-07, + "loss": 0.90288055, + "num_input_tokens_seen": 290103995, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.23547363, + "step": 13444, + "time_per_iteration": 4.214062690734863 + }, + { + "auxiliary_loss_clip": 0.01166099, + "auxiliary_loss_mlp": 0.00060127, + "balance_loss_clip": 1.01682425, + "balance_loss_mlp": 0.05178274, + "epoch": 0.8083571321208477, + "flos": 63555207511680.0, + "grad_norm": 0.7942436192855066, + "language_loss": 0.53062141, + "learning_rate": 3.7294898350131754e-07, + "loss": 0.54288363, + "num_input_tokens_seen": 290157245, + "router_z_loss_clip": 1.4921875, + "router_z_loss_mlp": 0.08349609, + "step": 13445, + "time_per_iteration": 3.015890598297119 + }, + { + "auxiliary_loss_clip": 0.01256207, + "auxiliary_loss_mlp": 0.00218155, + "balance_loss_clip": 1.03670609, + "balance_loss_mlp": 0.19427773, + "epoch": 0.8084172553735157, + "flos": 17930701111680.0, + "grad_norm": 27.272448777501047, + "language_loss": 0.81041145, + "learning_rate": 3.7272253099927964e-07, + "loss": 0.82515514, + "num_input_tokens_seen": 290174970, + "router_z_loss_clip": 2.19433594, + "router_z_loss_mlp": 0.23864746, + "step": 13446, + "time_per_iteration": 2.6572768688201904 + }, + { + "auxiliary_loss_clip": 0.01257751, + "auxiliary_loss_mlp": 0.00227467, + "balance_loss_clip": 1.03479505, + "balance_loss_mlp": 0.20236169, + "epoch": 0.8084773786261836, + "flos": 24097999507200.0, + "grad_norm": 17.767417368868184, + "language_loss": 0.79819924, + "learning_rate": 3.7249614020376606e-07, + "loss": 0.8130514, + "num_input_tokens_seen": 290194395, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.25097656, + "step": 13447, + "time_per_iteration": 2.716792106628418 + }, + { + "auxiliary_loss_clip": 0.01271721, + "auxiliary_loss_mlp": 0.002428, + "balance_loss_clip": 1.04046988, + "balance_loss_mlp": 0.21657398, + "epoch": 0.8085375018788516, + "flos": 15588211656960.0, + "grad_norm": 53.94329727117832, + "language_loss": 0.86619854, + "learning_rate": 3.7226981112336197e-07, + "loss": 0.88134378, + "num_input_tokens_seen": 290209200, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.26245117, + "step": 13448, + "time_per_iteration": 3.999521493911743 + }, + { + "auxiliary_loss_clip": 0.01168722, + "auxiliary_loss_mlp": 0.00052994, + "balance_loss_clip": 1.01880574, + "balance_loss_mlp": 0.0460319, + "epoch": 0.8085976251315197, + "flos": 67561296393600.0, + "grad_norm": 0.7137493339030268, + "language_loss": 0.63215363, + "learning_rate": 3.7204354376665024e-07, + "loss": 0.64437079, + "num_input_tokens_seen": 290274565, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.06982422, + "step": 13449, + "time_per_iteration": 3.1849558353424072 + }, + { + "auxiliary_loss_clip": 0.01251931, + "auxiliary_loss_mlp": 0.00222075, + "balance_loss_clip": 1.03155947, + "balance_loss_mlp": 0.19809011, + "epoch": 0.8086577483841876, + "flos": 22561453463040.0, + "grad_norm": 11.27390570574001, + "language_loss": 0.81659973, + "learning_rate": 3.718173381422105e-07, + "loss": 0.83133972, + "num_input_tokens_seen": 290293630, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.2401123, + "step": 13450, + "time_per_iteration": 2.6623189449310303 + }, + { + "auxiliary_loss_clip": 0.01261065, + "auxiliary_loss_mlp": 0.00245368, + "balance_loss_clip": 1.03610611, + "balance_loss_mlp": 0.21978578, + "epoch": 0.8087178716368556, + "flos": 17968084191360.0, + "grad_norm": 120.09069910165458, + "language_loss": 0.80615884, + "learning_rate": 3.7159119425861986e-07, + "loss": 0.82122314, + "num_input_tokens_seen": 290311450, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.25561523, + "step": 13451, + "time_per_iteration": 2.640828847885132 + }, + { + "auxiliary_loss_clip": 0.01268009, + "auxiliary_loss_mlp": 0.00259933, + "balance_loss_clip": 1.03761303, + "balance_loss_mlp": 0.23382597, + "epoch": 0.8087779948895235, + "flos": 21719527603200.0, + "grad_norm": 11.117121114421733, + "language_loss": 0.89087069, + "learning_rate": 3.713651121244543e-07, + "loss": 0.9061501, + "num_input_tokens_seen": 290330165, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.26147461, + "step": 13452, + "time_per_iteration": 2.6674225330352783 + }, + { + "auxiliary_loss_clip": 0.01255948, + "auxiliary_loss_mlp": 0.00231264, + "balance_loss_clip": 1.03117549, + "balance_loss_mlp": 0.20705296, + "epoch": 0.8088381181421915, + "flos": 29092885983360.0, + "grad_norm": 129.28119465182687, + "language_loss": 0.85309124, + "learning_rate": 3.711390917482875e-07, + "loss": 0.86796331, + "num_input_tokens_seen": 290350815, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.24206543, + "step": 13453, + "time_per_iteration": 2.782118797302246 + }, + { + "auxiliary_loss_clip": 0.01254033, + "auxiliary_loss_mlp": 0.00221158, + "balance_loss_clip": 1.0299964, + "balance_loss_mlp": 0.19506305, + "epoch": 0.8088982413948594, + "flos": 22198432659840.0, + "grad_norm": 25.01543612638971, + "language_loss": 0.87455785, + "learning_rate": 3.709131331386892e-07, + "loss": 0.88930976, + "num_input_tokens_seen": 290367380, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.26098633, + "step": 13454, + "time_per_iteration": 2.640681028366089 + }, + { + "auxiliary_loss_clip": 0.01248811, + "auxiliary_loss_mlp": 0.00236798, + "balance_loss_clip": 1.03214443, + "balance_loss_mlp": 0.21258703, + "epoch": 0.8089583646475275, + "flos": 28036717453440.0, + "grad_norm": 62.69702436607521, + "language_loss": 0.83993047, + "learning_rate": 3.7068723630422795e-07, + "loss": 0.85478657, + "num_input_tokens_seen": 290387965, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.2421875, + "step": 13455, + "time_per_iteration": 2.729024887084961 + }, + { + "auxiliary_loss_clip": 0.01257772, + "auxiliary_loss_mlp": 0.00244552, + "balance_loss_clip": 1.03380275, + "balance_loss_mlp": 0.2181707, + "epoch": 0.8090184879001954, + "flos": 16617735273600.0, + "grad_norm": 75.55141927589207, + "language_loss": 0.88773578, + "learning_rate": 3.70461401253471e-07, + "loss": 0.90275902, + "num_input_tokens_seen": 290404150, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.26379395, + "step": 13456, + "time_per_iteration": 2.6238224506378174 + }, + { + "auxiliary_loss_clip": 0.0124892, + "auxiliary_loss_mlp": 0.00221462, + "balance_loss_clip": 1.03315544, + "balance_loss_mlp": 0.19828725, + "epoch": 0.8090786111528634, + "flos": 27340804379520.0, + "grad_norm": 5.054235932248078, + "language_loss": 0.81554586, + "learning_rate": 3.702356279949801e-07, + "loss": 0.83024967, + "num_input_tokens_seen": 290422370, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.23156738, + "step": 13457, + "time_per_iteration": 2.7002880573272705 + }, + { + "auxiliary_loss_clip": 0.01234952, + "auxiliary_loss_mlp": 0.00224633, + "balance_loss_clip": 1.01485848, + "balance_loss_mlp": 0.20052855, + "epoch": 0.8091387344055313, + "flos": 21105742976640.0, + "grad_norm": 37.360141017949616, + "language_loss": 0.81626058, + "learning_rate": 3.700099165373176e-07, + "loss": 0.83085644, + "num_input_tokens_seen": 290442645, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.24108887, + "step": 13458, + "time_per_iteration": 2.674607276916504 + }, + { + "auxiliary_loss_clip": 0.01240459, + "auxiliary_loss_mlp": 0.00259654, + "balance_loss_clip": 1.02433372, + "balance_loss_mlp": 0.23492986, + "epoch": 0.8091988576581993, + "flos": 11655060318720.0, + "grad_norm": 12.502761481466584, + "language_loss": 0.88668311, + "learning_rate": 3.6978426688904275e-07, + "loss": 0.90168428, + "num_input_tokens_seen": 290458520, + "router_z_loss_clip": 2.15917969, + "router_z_loss_mlp": 0.24707031, + "step": 13459, + "time_per_iteration": 2.6068167686462402 + }, + { + "auxiliary_loss_clip": 0.01270368, + "auxiliary_loss_mlp": 0.00240305, + "balance_loss_clip": 1.04043043, + "balance_loss_mlp": 0.21407925, + "epoch": 0.8092589809108672, + "flos": 22963329803520.0, + "grad_norm": 84.06053804408687, + "language_loss": 0.87887156, + "learning_rate": 3.695586790587113e-07, + "loss": 0.8939783, + "num_input_tokens_seen": 290474465, + "router_z_loss_clip": 2.30078125, + "router_z_loss_mlp": 0.26220703, + "step": 13460, + "time_per_iteration": 2.651014566421509 + }, + { + "auxiliary_loss_clip": 0.01258589, + "auxiliary_loss_mlp": 0.0023657, + "balance_loss_clip": 1.03490579, + "balance_loss_mlp": 0.21035601, + "epoch": 0.8093191041635353, + "flos": 13260985482240.0, + "grad_norm": 46.08530000831884, + "language_loss": 0.9338938, + "learning_rate": 3.693331530548789e-07, + "loss": 0.94884539, + "num_input_tokens_seen": 290492060, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.26196289, + "step": 13461, + "time_per_iteration": 2.6302249431610107 + }, + { + "auxiliary_loss_clip": 0.01274259, + "auxiliary_loss_mlp": 0.00250197, + "balance_loss_clip": 1.05459976, + "balance_loss_mlp": 0.22531852, + "epoch": 0.8093792274162032, + "flos": 25516003691520.0, + "grad_norm": 50.08833199619352, + "language_loss": 0.8498919, + "learning_rate": 3.69107688886096e-07, + "loss": 0.8651365, + "num_input_tokens_seen": 290511510, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.24890137, + "step": 13462, + "time_per_iteration": 2.6727752685546875 + }, + { + "auxiliary_loss_clip": 0.01259616, + "auxiliary_loss_mlp": 0.00245837, + "balance_loss_clip": 1.03407502, + "balance_loss_mlp": 0.22101754, + "epoch": 0.8094393506688712, + "flos": 23546483107200.0, + "grad_norm": 10.587487505662676, + "language_loss": 0.90515327, + "learning_rate": 3.6888228656091357e-07, + "loss": 0.92020786, + "num_input_tokens_seen": 290530035, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.24829102, + "step": 13463, + "time_per_iteration": 2.6616151332855225 + }, + { + "auxiliary_loss_clip": 0.01244722, + "auxiliary_loss_mlp": 0.002404, + "balance_loss_clip": 1.02837491, + "balance_loss_mlp": 0.2185128, + "epoch": 0.8094994739215392, + "flos": 17055917285760.0, + "grad_norm": 10.358524995193848, + "language_loss": 0.69346321, + "learning_rate": 3.686569460878779e-07, + "loss": 0.70831442, + "num_input_tokens_seen": 290548245, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.21887207, + "step": 13464, + "time_per_iteration": 2.648324966430664 + }, + { + "auxiliary_loss_clip": 0.01253712, + "auxiliary_loss_mlp": 0.00249693, + "balance_loss_clip": 1.03348136, + "balance_loss_mlp": 0.22475445, + "epoch": 0.8095595971742071, + "flos": 23551223702400.0, + "grad_norm": 2.266254937630501, + "language_loss": 0.69911921, + "learning_rate": 3.684316674755341e-07, + "loss": 0.71415323, + "num_input_tokens_seen": 290568625, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.24951172, + "step": 13465, + "time_per_iteration": 2.705716609954834 + }, + { + "auxiliary_loss_clip": 0.01239152, + "auxiliary_loss_mlp": 0.00243431, + "balance_loss_clip": 1.02808952, + "balance_loss_mlp": 0.21910049, + "epoch": 0.8096197204268751, + "flos": 20373201008640.0, + "grad_norm": 6.528605361067902, + "language_loss": 0.88202685, + "learning_rate": 3.682064507324256e-07, + "loss": 0.89685267, + "num_input_tokens_seen": 290586575, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.24328613, + "step": 13466, + "time_per_iteration": 2.640038251876831 + }, + { + "auxiliary_loss_clip": 0.01255895, + "auxiliary_loss_mlp": 0.00228271, + "balance_loss_clip": 1.03482461, + "balance_loss_mlp": 0.20322526, + "epoch": 0.809679843679543, + "flos": 27818775682560.0, + "grad_norm": 10102.504244300411, + "language_loss": 0.8321321, + "learning_rate": 3.6798129586709204e-07, + "loss": 0.84697372, + "num_input_tokens_seen": 290606790, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.25048828, + "step": 13467, + "time_per_iteration": 2.7206127643585205 + }, + { + "auxiliary_loss_clip": 0.01258224, + "auxiliary_loss_mlp": 0.00256395, + "balance_loss_clip": 1.03612494, + "balance_loss_mlp": 0.22940651, + "epoch": 0.8097399669322111, + "flos": 22014103040640.0, + "grad_norm": 8.818521342053408, + "language_loss": 0.84682059, + "learning_rate": 3.6775620288807073e-07, + "loss": 0.86196679, + "num_input_tokens_seen": 290625525, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.2701416, + "step": 13468, + "time_per_iteration": 2.6615004539489746 + }, + { + "auxiliary_loss_clip": 0.01240269, + "auxiliary_loss_mlp": 0.00226938, + "balance_loss_clip": 1.027794, + "balance_loss_mlp": 0.20383494, + "epoch": 0.809800090184879, + "flos": 18988988544000.0, + "grad_norm": 12.989857332840073, + "language_loss": 0.76582122, + "learning_rate": 3.675311718038978e-07, + "loss": 0.78049326, + "num_input_tokens_seen": 290644935, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.2310791, + "step": 13469, + "time_per_iteration": 2.653299331665039 + }, + { + "auxiliary_loss_clip": 0.01161235, + "auxiliary_loss_mlp": 0.00079509, + "balance_loss_clip": 1.01505029, + "balance_loss_mlp": 0.07197522, + "epoch": 0.809860213437547, + "flos": 66099516508800.0, + "grad_norm": 0.6683568957295799, + "language_loss": 0.53867906, + "learning_rate": 3.6730620262310683e-07, + "loss": 0.55108649, + "num_input_tokens_seen": 290710735, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.07519531, + "step": 13470, + "time_per_iteration": 3.243690252304077 + }, + { + "auxiliary_loss_clip": 0.01251549, + "auxiliary_loss_mlp": 0.0025069, + "balance_loss_clip": 1.03085601, + "balance_loss_mlp": 0.22615676, + "epoch": 0.8099203366902149, + "flos": 20882485992960.0, + "grad_norm": 3630.459010881376, + "language_loss": 0.76924002, + "learning_rate": 3.670812953542279e-07, + "loss": 0.78426242, + "num_input_tokens_seen": 290729565, + "router_z_loss_clip": 2.20800781, + "router_z_loss_mlp": 0.24536133, + "step": 13471, + "time_per_iteration": 2.658543586730957 + }, + { + "auxiliary_loss_clip": 0.01248919, + "auxiliary_loss_mlp": 0.00229943, + "balance_loss_clip": 1.03209651, + "balance_loss_mlp": 0.20699526, + "epoch": 0.8099804599428829, + "flos": 26030927111040.0, + "grad_norm": 44.05005542012157, + "language_loss": 0.87078762, + "learning_rate": 3.6685645000579003e-07, + "loss": 0.88557625, + "num_input_tokens_seen": 290749360, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.22949219, + "step": 13472, + "time_per_iteration": 2.7330634593963623 + }, + { + "auxiliary_loss_clip": 0.01163377, + "auxiliary_loss_mlp": 0.00070328, + "balance_loss_clip": 1.01320767, + "balance_loss_mlp": 0.06308004, + "epoch": 0.8100405831955508, + "flos": 69303573584640.0, + "grad_norm": 0.7262934514640456, + "language_loss": 0.56908512, + "learning_rate": 3.666316665863201e-07, + "loss": 0.58142221, + "num_input_tokens_seen": 290812145, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.07226562, + "step": 13473, + "time_per_iteration": 3.180678606033325 + }, + { + "auxiliary_loss_clip": 0.01254491, + "auxiliary_loss_mlp": 0.00215422, + "balance_loss_clip": 1.0359025, + "balance_loss_mlp": 0.19184276, + "epoch": 0.8101007064482189, + "flos": 15012492468480.0, + "grad_norm": 39.93573334662261, + "language_loss": 0.84900194, + "learning_rate": 3.664069451043399e-07, + "loss": 0.86370111, + "num_input_tokens_seen": 290829845, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.23608398, + "step": 13474, + "time_per_iteration": 2.667511463165283 + }, + { + "auxiliary_loss_clip": 0.01254042, + "auxiliary_loss_mlp": 0.00251525, + "balance_loss_clip": 1.03554177, + "balance_loss_mlp": 0.2252396, + "epoch": 0.8101608297008868, + "flos": 21067210661760.0, + "grad_norm": 8.068543430294122, + "language_loss": 0.84820509, + "learning_rate": 3.661822855683723e-07, + "loss": 0.86326075, + "num_input_tokens_seen": 290848815, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.26318359, + "step": 13475, + "time_per_iteration": 2.7101924419403076 + }, + { + "auxiliary_loss_clip": 0.01234668, + "auxiliary_loss_mlp": 0.00228539, + "balance_loss_clip": 1.02062738, + "balance_loss_mlp": 0.20592535, + "epoch": 0.8102209529535548, + "flos": 23731279603200.0, + "grad_norm": 46.92043242221108, + "language_loss": 0.8151347, + "learning_rate": 3.659576879869364e-07, + "loss": 0.82976675, + "num_input_tokens_seen": 290868580, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.22631836, + "step": 13476, + "time_per_iteration": 2.712204694747925 + }, + { + "auxiliary_loss_clip": 0.01273496, + "auxiliary_loss_mlp": 0.0021759, + "balance_loss_clip": 1.04355752, + "balance_loss_mlp": 0.19126877, + "epoch": 0.8102810762062228, + "flos": 10955879107200.0, + "grad_norm": 62.36986046615791, + "language_loss": 0.85849738, + "learning_rate": 3.657331523685485e-07, + "loss": 0.8734082, + "num_input_tokens_seen": 290883540, + "router_z_loss_clip": 2.30273438, + "router_z_loss_mlp": 0.26330566, + "step": 13477, + "time_per_iteration": 2.694314956665039 + }, + { + "auxiliary_loss_clip": 0.01257463, + "auxiliary_loss_mlp": 0.0023305, + "balance_loss_clip": 1.03599453, + "balance_loss_mlp": 0.20733692, + "epoch": 0.8103411994588907, + "flos": 14648825220480.0, + "grad_norm": 315.63889248006694, + "language_loss": 0.78968084, + "learning_rate": 3.6550867872172365e-07, + "loss": 0.80458599, + "num_input_tokens_seen": 290901560, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.25720215, + "step": 13478, + "time_per_iteration": 2.720038890838623 + }, + { + "auxiliary_loss_clip": 0.01167602, + "auxiliary_loss_mlp": 0.00071688, + "balance_loss_clip": 1.01758814, + "balance_loss_mlp": 0.06429717, + "epoch": 0.8104013227115587, + "flos": 59153314665600.0, + "grad_norm": 0.6706795497544106, + "language_loss": 0.51671851, + "learning_rate": 3.6528426705497293e-07, + "loss": 0.52911139, + "num_input_tokens_seen": 290959185, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.07373047, + "step": 13479, + "time_per_iteration": 3.130262613296509 + }, + { + "auxiliary_loss_clip": 0.01258519, + "auxiliary_loss_mlp": 0.00233337, + "balance_loss_clip": 1.04199517, + "balance_loss_mlp": 0.20906594, + "epoch": 0.8104614459642266, + "flos": 19828687760640.0, + "grad_norm": 7.293655027982415, + "language_loss": 0.78299069, + "learning_rate": 3.650599173768072e-07, + "loss": 0.79790926, + "num_input_tokens_seen": 290979585, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.24291992, + "step": 13480, + "time_per_iteration": 4.092426300048828 + }, + { + "auxiliary_loss_clip": 0.01251655, + "auxiliary_loss_mlp": 0.00247765, + "balance_loss_clip": 1.0314492, + "balance_loss_mlp": 0.22302948, + "epoch": 0.8105215692168947, + "flos": 25374264624000.0, + "grad_norm": 4.327641230476764, + "language_loss": 0.88600904, + "learning_rate": 3.648356296957327e-07, + "loss": 0.90100324, + "num_input_tokens_seen": 291000865, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.24780273, + "step": 13481, + "time_per_iteration": 2.718433380126953 + }, + { + "auxiliary_loss_clip": 0.0125255, + "auxiliary_loss_mlp": 0.00234808, + "balance_loss_clip": 1.029212, + "balance_loss_mlp": 0.21091846, + "epoch": 0.8105816924695626, + "flos": 20481722974080.0, + "grad_norm": 108.82492594393477, + "language_loss": 0.82044578, + "learning_rate": 3.646114040202548e-07, + "loss": 0.8353194, + "num_input_tokens_seen": 291018285, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.23876953, + "step": 13482, + "time_per_iteration": 4.042736768722534 + }, + { + "auxiliary_loss_clip": 0.01251376, + "auxiliary_loss_mlp": 0.00238652, + "balance_loss_clip": 1.03075814, + "balance_loss_mlp": 0.21414214, + "epoch": 0.8106418157222306, + "flos": 14538687143040.0, + "grad_norm": 3.507069146609087, + "language_loss": 0.75156182, + "learning_rate": 3.6438724035887705e-07, + "loss": 0.76646209, + "num_input_tokens_seen": 291035745, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.24487305, + "step": 13483, + "time_per_iteration": 2.5852248668670654 + }, + { + "auxiliary_loss_clip": 0.01245531, + "auxiliary_loss_mlp": 0.00241605, + "balance_loss_clip": 1.02626669, + "balance_loss_mlp": 0.21568921, + "epoch": 0.8107019389748985, + "flos": 22564470205440.0, + "grad_norm": 3.523924127071528, + "language_loss": 0.82763863, + "learning_rate": 3.641631387200992e-07, + "loss": 0.84250998, + "num_input_tokens_seen": 291053280, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.25878906, + "step": 13484, + "time_per_iteration": 2.6638400554656982 + }, + { + "auxiliary_loss_clip": 0.01280877, + "auxiliary_loss_mlp": 0.00219944, + "balance_loss_clip": 1.05031717, + "balance_loss_mlp": 0.19508937, + "epoch": 0.8107620622275665, + "flos": 19609560840960.0, + "grad_norm": 8.298796791088785, + "language_loss": 0.81433129, + "learning_rate": 3.639390991124183e-07, + "loss": 0.8293395, + "num_input_tokens_seen": 291072855, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.24853516, + "step": 13485, + "time_per_iteration": 2.6407179832458496 + }, + { + "auxiliary_loss_clip": 0.01252005, + "auxiliary_loss_mlp": 0.00218867, + "balance_loss_clip": 1.03444028, + "balance_loss_mlp": 0.19625337, + "epoch": 0.8108221854802344, + "flos": 16143498984960.0, + "grad_norm": 28.134921208988604, + "language_loss": 0.83524251, + "learning_rate": 3.637151215443308e-07, + "loss": 0.84995121, + "num_input_tokens_seen": 291090285, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.22607422, + "step": 13486, + "time_per_iteration": 2.6572391986846924 + }, + { + "auxiliary_loss_clip": 0.01282986, + "auxiliary_loss_mlp": 0.00233951, + "balance_loss_clip": 1.0525105, + "balance_loss_mlp": 0.20743914, + "epoch": 0.8108823087329025, + "flos": 21106209853440.0, + "grad_norm": 9.704399337916424, + "language_loss": 0.79523718, + "learning_rate": 3.6349120602433045e-07, + "loss": 0.81040657, + "num_input_tokens_seen": 291107675, + "router_z_loss_clip": 2.30273438, + "router_z_loss_mlp": 0.26513672, + "step": 13487, + "time_per_iteration": 4.175585985183716 + }, + { + "auxiliary_loss_clip": 0.01240634, + "auxiliary_loss_mlp": 0.00218999, + "balance_loss_clip": 1.02551043, + "balance_loss_mlp": 0.19742167, + "epoch": 0.8109424319855704, + "flos": 29199648182400.0, + "grad_norm": 6.018014934480455, + "language_loss": 0.90040052, + "learning_rate": 3.6326735256090715e-07, + "loss": 0.91499686, + "num_input_tokens_seen": 291126900, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.21557617, + "step": 13488, + "time_per_iteration": 2.7000784873962402 + }, + { + "auxiliary_loss_clip": 0.01255296, + "auxiliary_loss_mlp": 0.00246364, + "balance_loss_clip": 1.03717256, + "balance_loss_mlp": 0.22140174, + "epoch": 0.8110025552382384, + "flos": 23111856541440.0, + "grad_norm": 47.022319750060696, + "language_loss": 0.81009358, + "learning_rate": 3.630435611625502e-07, + "loss": 0.8251102, + "num_input_tokens_seen": 291145285, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.24951172, + "step": 13489, + "time_per_iteration": 2.857109785079956 + }, + { + "auxiliary_loss_clip": 0.01242099, + "auxiliary_loss_mlp": 0.00229604, + "balance_loss_clip": 1.02636719, + "balance_loss_mlp": 0.2045466, + "epoch": 0.8110626784909064, + "flos": 22379961018240.0, + "grad_norm": 7.672187099651406, + "language_loss": 0.78728807, + "learning_rate": 3.628198318377453e-07, + "loss": 0.80200511, + "num_input_tokens_seen": 291163485, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.25048828, + "step": 13490, + "time_per_iteration": 4.135535955429077 + }, + { + "auxiliary_loss_clip": 0.01262773, + "auxiliary_loss_mlp": 0.00237792, + "balance_loss_clip": 1.04339886, + "balance_loss_mlp": 0.2148326, + "epoch": 0.8111228017435743, + "flos": 23368043318400.0, + "grad_norm": 17.62562101508556, + "language_loss": 0.80451554, + "learning_rate": 3.625961645949762e-07, + "loss": 0.81952125, + "num_input_tokens_seen": 291182215, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.22949219, + "step": 13491, + "time_per_iteration": 2.6563916206359863 + }, + { + "auxiliary_loss_clip": 0.01257886, + "auxiliary_loss_mlp": 0.00255024, + "balance_loss_clip": 1.0354414, + "balance_loss_mlp": 0.22962052, + "epoch": 0.8111829249962423, + "flos": 21286553063040.0, + "grad_norm": 146.71934536026149, + "language_loss": 0.74184042, + "learning_rate": 3.623725594427245e-07, + "loss": 0.75696957, + "num_input_tokens_seen": 291203145, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.25402832, + "step": 13492, + "time_per_iteration": 2.7401821613311768 + }, + { + "auxiliary_loss_clip": 0.01252518, + "auxiliary_loss_mlp": 0.00250427, + "balance_loss_clip": 1.03022861, + "balance_loss_mlp": 0.22522576, + "epoch": 0.8112430482489102, + "flos": 22345558767360.0, + "grad_norm": 13.185582932723104, + "language_loss": 0.79659981, + "learning_rate": 3.6214901638947006e-07, + "loss": 0.8116293, + "num_input_tokens_seen": 291220600, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.25183105, + "step": 13493, + "time_per_iteration": 2.6355457305908203 + }, + { + "auxiliary_loss_clip": 0.01256699, + "auxiliary_loss_mlp": 0.00220188, + "balance_loss_clip": 1.03442144, + "balance_loss_mlp": 0.19511791, + "epoch": 0.8113031715015783, + "flos": 31138321962240.0, + "grad_norm": 2.5824214517009487, + "language_loss": 0.79434687, + "learning_rate": 3.619255354436885e-07, + "loss": 0.80911577, + "num_input_tokens_seen": 291241195, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.25085449, + "step": 13494, + "time_per_iteration": 2.7185592651367188 + }, + { + "auxiliary_loss_clip": 0.01277924, + "auxiliary_loss_mlp": 0.00260696, + "balance_loss_clip": 1.04791403, + "balance_loss_mlp": 0.23392203, + "epoch": 0.8113632947542462, + "flos": 25335445000320.0, + "grad_norm": 5.148851858492411, + "language_loss": 0.85103476, + "learning_rate": 3.6170211661385543e-07, + "loss": 0.86642098, + "num_input_tokens_seen": 291258715, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.26782227, + "step": 13495, + "time_per_iteration": 2.672102928161621 + }, + { + "auxiliary_loss_clip": 0.01270557, + "auxiliary_loss_mlp": 0.00233225, + "balance_loss_clip": 1.0428679, + "balance_loss_mlp": 0.20585501, + "epoch": 0.8114234180069142, + "flos": 28439168411520.0, + "grad_norm": 15.246778347304039, + "language_loss": 0.88522822, + "learning_rate": 3.614787599084417e-07, + "loss": 0.90026605, + "num_input_tokens_seen": 291278030, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.27380371, + "step": 13496, + "time_per_iteration": 2.742910861968994 + }, + { + "auxiliary_loss_clip": 0.01261879, + "auxiliary_loss_mlp": 0.00219011, + "balance_loss_clip": 1.03656793, + "balance_loss_mlp": 0.19526415, + "epoch": 0.8114835412595821, + "flos": 20338870584960.0, + "grad_norm": 5.946275623870035, + "language_loss": 0.79858351, + "learning_rate": 3.6125546533591787e-07, + "loss": 0.8133924, + "num_input_tokens_seen": 291296740, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.23742676, + "step": 13497, + "time_per_iteration": 2.6759040355682373 + }, + { + "auxiliary_loss_clip": 0.01262295, + "auxiliary_loss_mlp": 0.0026259, + "balance_loss_clip": 1.04006422, + "balance_loss_mlp": 0.23700735, + "epoch": 0.8115436645122501, + "flos": 22490889194880.0, + "grad_norm": 2.9476671653097664, + "language_loss": 0.83227056, + "learning_rate": 3.610322329047508e-07, + "loss": 0.8475194, + "num_input_tokens_seen": 291318730, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.2557373, + "step": 13498, + "time_per_iteration": 2.702817678451538 + }, + { + "auxiliary_loss_clip": 0.01247738, + "auxiliary_loss_mlp": 0.0023631, + "balance_loss_clip": 1.02524114, + "balance_loss_mlp": 0.21071552, + "epoch": 0.811603787764918, + "flos": 13845288021120.0, + "grad_norm": 31.149791138252063, + "language_loss": 0.93538445, + "learning_rate": 3.608090626234055e-07, + "loss": 0.95022488, + "num_input_tokens_seen": 291336755, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.25598145, + "step": 13499, + "time_per_iteration": 2.6531219482421875 + }, + { + "auxiliary_loss_clip": 0.01264085, + "auxiliary_loss_mlp": 0.00260407, + "balance_loss_clip": 1.03901768, + "balance_loss_mlp": 0.23236893, + "epoch": 0.8116639110175861, + "flos": 21614632911360.0, + "grad_norm": 19.788418810415305, + "language_loss": 0.82724428, + "learning_rate": 3.6058595450034603e-07, + "loss": 0.84248918, + "num_input_tokens_seen": 291356795, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.28039551, + "step": 13500, + "time_per_iteration": 2.6419479846954346 + }, + { + "auxiliary_loss_clip": 0.01147936, + "auxiliary_loss_mlp": 0.00111114, + "balance_loss_clip": 0.99212372, + "balance_loss_mlp": 0.10286444, + "epoch": 0.811724034270254, + "flos": 64459799625600.0, + "grad_norm": 0.8394028406525358, + "language_loss": 0.59294742, + "learning_rate": 3.603629085440303e-07, + "loss": 0.60553795, + "num_input_tokens_seen": 291416005, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.08251953, + "step": 13501, + "time_per_iteration": 3.202393054962158 + }, + { + "auxiliary_loss_clip": 0.01235177, + "auxiliary_loss_mlp": 0.00217401, + "balance_loss_clip": 1.02623451, + "balance_loss_mlp": 0.19420266, + "epoch": 0.811784157522922, + "flos": 24754123290240.0, + "grad_norm": 14.430684631741803, + "language_loss": 0.86995828, + "learning_rate": 3.6013992476291753e-07, + "loss": 0.88448411, + "num_input_tokens_seen": 291434870, + "router_z_loss_clip": 2.0859375, + "router_z_loss_mlp": 0.23217773, + "step": 13502, + "time_per_iteration": 2.716451406478882 + }, + { + "auxiliary_loss_clip": 0.01245939, + "auxiliary_loss_mlp": 0.00228484, + "balance_loss_clip": 1.02616572, + "balance_loss_mlp": 0.20560782, + "epoch": 0.81184428077559, + "flos": 12167146563840.0, + "grad_norm": 13.38976488702154, + "language_loss": 0.79775119, + "learning_rate": 3.599170031654635e-07, + "loss": 0.81249535, + "num_input_tokens_seen": 291452230, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.22888184, + "step": 13503, + "time_per_iteration": 2.7014784812927246 + }, + { + "auxiliary_loss_clip": 0.01262009, + "auxiliary_loss_mlp": 0.00231519, + "balance_loss_clip": 1.03855157, + "balance_loss_mlp": 0.20631804, + "epoch": 0.8119044040282579, + "flos": 44422037775360.0, + "grad_norm": 18.60518469167934, + "language_loss": 0.7355299, + "learning_rate": 3.5969414376012065e-07, + "loss": 0.75046521, + "num_input_tokens_seen": 291477425, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.25195312, + "step": 13504, + "time_per_iteration": 2.917557716369629 + }, + { + "auxiliary_loss_clip": 0.01254924, + "auxiliary_loss_mlp": 0.00245292, + "balance_loss_clip": 1.03387463, + "balance_loss_mlp": 0.22024632, + "epoch": 0.8119645272809259, + "flos": 52155507957120.0, + "grad_norm": 28.133769306885604, + "language_loss": 0.81264168, + "learning_rate": 3.594713465553403e-07, + "loss": 0.82764387, + "num_input_tokens_seen": 291501070, + "router_z_loss_clip": 2.20996094, + "router_z_loss_mlp": 0.25036621, + "step": 13505, + "time_per_iteration": 2.922914981842041 + }, + { + "auxiliary_loss_clip": 0.01258322, + "auxiliary_loss_mlp": 0.00246376, + "balance_loss_clip": 1.03827882, + "balance_loss_mlp": 0.22090077, + "epoch": 0.8120246505335939, + "flos": 30232978640640.0, + "grad_norm": 37.19981979744804, + "language_loss": 0.79809898, + "learning_rate": 3.5924861155957123e-07, + "loss": 0.813146, + "num_input_tokens_seen": 291524945, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.25476074, + "step": 13506, + "time_per_iteration": 2.7759575843811035 + }, + { + "auxiliary_loss_clip": 0.01268009, + "auxiliary_loss_mlp": 0.00244, + "balance_loss_clip": 1.04112148, + "balance_loss_mlp": 0.2184175, + "epoch": 0.8120847737862619, + "flos": 22127652910080.0, + "grad_norm": 7.700564288680627, + "language_loss": 0.85545897, + "learning_rate": 3.590259387812593e-07, + "loss": 0.87057906, + "num_input_tokens_seen": 291544605, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.2557373, + "step": 13507, + "time_per_iteration": 2.6925323009490967 + }, + { + "auxiliary_loss_clip": 0.01265869, + "auxiliary_loss_mlp": 0.00238473, + "balance_loss_clip": 1.04006195, + "balance_loss_mlp": 0.2140951, + "epoch": 0.8121448970389298, + "flos": 23295180579840.0, + "grad_norm": 15.370127337959591, + "language_loss": 0.78955495, + "learning_rate": 3.5880332822884783e-07, + "loss": 0.80459839, + "num_input_tokens_seen": 291563850, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.24389648, + "step": 13508, + "time_per_iteration": 2.675222396850586 + }, + { + "auxiliary_loss_clip": 0.01257579, + "auxiliary_loss_mlp": 0.00230644, + "balance_loss_clip": 1.03625321, + "balance_loss_mlp": 0.20606278, + "epoch": 0.8122050202915978, + "flos": 22164138149760.0, + "grad_norm": 2.3275325830914553, + "language_loss": 0.83311975, + "learning_rate": 3.585807799107785e-07, + "loss": 0.84800196, + "num_input_tokens_seen": 291581730, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.24560547, + "step": 13509, + "time_per_iteration": 2.656312942504883 + }, + { + "auxiliary_loss_clip": 0.01271578, + "auxiliary_loss_mlp": 0.00235412, + "balance_loss_clip": 1.04183292, + "balance_loss_mlp": 0.20898351, + "epoch": 0.8122651435442657, + "flos": 23258946735360.0, + "grad_norm": 3120.5613210437077, + "language_loss": 0.8447907, + "learning_rate": 3.58358293835491e-07, + "loss": 0.8598606, + "num_input_tokens_seen": 291601225, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.26416016, + "step": 13510, + "time_per_iteration": 2.7643418312072754 + }, + { + "auxiliary_loss_clip": 0.01266319, + "auxiliary_loss_mlp": 0.00235606, + "balance_loss_clip": 1.04228115, + "balance_loss_mlp": 0.20966636, + "epoch": 0.8123252667969337, + "flos": 16140015365760.0, + "grad_norm": 6.037697463409731, + "language_loss": 0.78869438, + "learning_rate": 3.581358700114212e-07, + "loss": 0.80371362, + "num_input_tokens_seen": 291616995, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.25952148, + "step": 13511, + "time_per_iteration": 2.640000104904175 + }, + { + "auxiliary_loss_clip": 0.01243744, + "auxiliary_loss_mlp": 0.00252353, + "balance_loss_clip": 1.02436304, + "balance_loss_mlp": 0.22779605, + "epoch": 0.8123853900496016, + "flos": 21245399055360.0, + "grad_norm": 36.32082310917654, + "language_loss": 0.87225795, + "learning_rate": 3.57913508447004e-07, + "loss": 0.88721901, + "num_input_tokens_seen": 291636145, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.24560547, + "step": 13512, + "time_per_iteration": 2.7179341316223145 + }, + { + "auxiliary_loss_clip": 0.01250444, + "auxiliary_loss_mlp": 0.00214029, + "balance_loss_clip": 1.02953935, + "balance_loss_mlp": 0.19027032, + "epoch": 0.8124455133022697, + "flos": 64377596373120.0, + "grad_norm": 3.514086413204601, + "language_loss": 0.72342539, + "learning_rate": 3.5769120915067076e-07, + "loss": 0.73807013, + "num_input_tokens_seen": 291662440, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.23754883, + "step": 13513, + "time_per_iteration": 3.0391275882720947 + }, + { + "auxiliary_loss_clip": 0.01254053, + "auxiliary_loss_mlp": 0.00227965, + "balance_loss_clip": 1.03053796, + "balance_loss_mlp": 0.20091683, + "epoch": 0.8125056365549376, + "flos": 23842207779840.0, + "grad_norm": 19.704419048971104, + "language_loss": 0.80756259, + "learning_rate": 3.5746897213085194e-07, + "loss": 0.82238281, + "num_input_tokens_seen": 291680950, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.27026367, + "step": 13514, + "time_per_iteration": 2.670525074005127 + }, + { + "auxiliary_loss_clip": 0.01243149, + "auxiliary_loss_mlp": 0.00227046, + "balance_loss_clip": 1.0277276, + "balance_loss_mlp": 0.20296581, + "epoch": 0.8125657598076056, + "flos": 23550325862400.0, + "grad_norm": 52.36381222954289, + "language_loss": 0.70208389, + "learning_rate": 3.5724679739597364e-07, + "loss": 0.71678579, + "num_input_tokens_seen": 291702395, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.2409668, + "step": 13515, + "time_per_iteration": 2.7418272495269775 + }, + { + "auxiliary_loss_clip": 0.0122438, + "auxiliary_loss_mlp": 0.00228461, + "balance_loss_clip": 1.01415718, + "balance_loss_mlp": 0.20546573, + "epoch": 0.8126258830602736, + "flos": 20704225772160.0, + "grad_norm": 27.059677875910104, + "language_loss": 0.83650333, + "learning_rate": 3.570246849544616e-07, + "loss": 0.85103172, + "num_input_tokens_seen": 291721135, + "router_z_loss_clip": 2.10058594, + "router_z_loss_mlp": 0.2298584, + "step": 13516, + "time_per_iteration": 2.737856864929199 + }, + { + "auxiliary_loss_clip": 0.01266226, + "auxiliary_loss_mlp": 0.00250108, + "balance_loss_clip": 1.04750431, + "balance_loss_mlp": 0.22683883, + "epoch": 0.8126860063129415, + "flos": 23618160696960.0, + "grad_norm": 37.1275351238903, + "language_loss": 0.97356945, + "learning_rate": 3.5680263481473907e-07, + "loss": 0.98873281, + "num_input_tokens_seen": 291741235, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.23266602, + "step": 13517, + "time_per_iteration": 2.730692148208618 + }, + { + "auxiliary_loss_clip": 0.01248602, + "auxiliary_loss_mlp": 0.00239723, + "balance_loss_clip": 1.0362941, + "balance_loss_mlp": 0.21653718, + "epoch": 0.8127461295656095, + "flos": 25007149670400.0, + "grad_norm": 16.297532896476618, + "language_loss": 0.85162956, + "learning_rate": 3.565806469852244e-07, + "loss": 0.86651284, + "num_input_tokens_seen": 291761430, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.23168945, + "step": 13518, + "time_per_iteration": 2.8045406341552734 + }, + { + "auxiliary_loss_clip": 0.01251434, + "auxiliary_loss_mlp": 0.00226166, + "balance_loss_clip": 1.03558755, + "balance_loss_mlp": 0.20311141, + "epoch": 0.8128062528182775, + "flos": 27342169096320.0, + "grad_norm": 100.88082534939616, + "language_loss": 0.86301112, + "learning_rate": 3.56358721474336e-07, + "loss": 0.87778717, + "num_input_tokens_seen": 291781755, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.23046875, + "step": 13519, + "time_per_iteration": 2.8490850925445557 + }, + { + "auxiliary_loss_clip": 0.01263778, + "auxiliary_loss_mlp": 0.00235977, + "balance_loss_clip": 1.03726745, + "balance_loss_mlp": 0.20815358, + "epoch": 0.8128663760709455, + "flos": 26506312634880.0, + "grad_norm": 18269.55922526264, + "language_loss": 0.80655849, + "learning_rate": 3.561368582904905e-07, + "loss": 0.82155603, + "num_input_tokens_seen": 291804410, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.27819824, + "step": 13520, + "time_per_iteration": 2.7178120613098145 + }, + { + "auxiliary_loss_clip": 0.01253184, + "auxiliary_loss_mlp": 0.00247703, + "balance_loss_clip": 1.03528988, + "balance_loss_mlp": 0.22370584, + "epoch": 0.8129264993236134, + "flos": 17931239815680.0, + "grad_norm": 32.71165318826856, + "language_loss": 0.78387046, + "learning_rate": 3.5591505744209925e-07, + "loss": 0.79887938, + "num_input_tokens_seen": 291823285, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.24023438, + "step": 13521, + "time_per_iteration": 2.70349383354187 + }, + { + "auxiliary_loss_clip": 0.01268956, + "auxiliary_loss_mlp": 0.00246545, + "balance_loss_clip": 1.04180801, + "balance_loss_mlp": 0.22018735, + "epoch": 0.8129866225762814, + "flos": 26177694082560.0, + "grad_norm": 39.94981684135408, + "language_loss": 0.77574682, + "learning_rate": 3.5569331893757394e-07, + "loss": 0.79090184, + "num_input_tokens_seen": 291845305, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.2635498, + "step": 13522, + "time_per_iteration": 4.14990758895874 + }, + { + "auxiliary_loss_clip": 0.01250102, + "auxiliary_loss_mlp": 0.00249203, + "balance_loss_clip": 1.03625107, + "balance_loss_mlp": 0.2257902, + "epoch": 0.8130467458289493, + "flos": 21032197879680.0, + "grad_norm": 5.278705218801356, + "language_loss": 0.75606155, + "learning_rate": 3.554716427853233e-07, + "loss": 0.77105457, + "num_input_tokens_seen": 291863715, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.23413086, + "step": 13523, + "time_per_iteration": 2.6319921016693115 + }, + { + "auxiliary_loss_clip": 0.01246901, + "auxiliary_loss_mlp": 0.00232413, + "balance_loss_clip": 1.02951372, + "balance_loss_mlp": 0.20770073, + "epoch": 0.8131068690816173, + "flos": 15487051979520.0, + "grad_norm": 7.362166997872346, + "language_loss": 0.79548568, + "learning_rate": 3.5525002899375256e-07, + "loss": 0.81027877, + "num_input_tokens_seen": 291880735, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.24731445, + "step": 13524, + "time_per_iteration": 4.1123669147491455 + }, + { + "auxiliary_loss_clip": 0.01245566, + "auxiliary_loss_mlp": 0.00228439, + "balance_loss_clip": 1.02713466, + "balance_loss_mlp": 0.20427507, + "epoch": 0.8131669923342852, + "flos": 29351227576320.0, + "grad_norm": 7.652571253702531, + "language_loss": 0.69684613, + "learning_rate": 3.550284775712653e-07, + "loss": 0.71158612, + "num_input_tokens_seen": 291900535, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.24157715, + "step": 13525, + "time_per_iteration": 2.736398458480835 + }, + { + "auxiliary_loss_clip": 0.01244978, + "auxiliary_loss_mlp": 0.00206285, + "balance_loss_clip": 1.02882326, + "balance_loss_mlp": 0.18293181, + "epoch": 0.8132271155869533, + "flos": 35256162055680.0, + "grad_norm": 60.08572391206648, + "language_loss": 0.71770406, + "learning_rate": 3.548069885262628e-07, + "loss": 0.73221666, + "num_input_tokens_seen": 291919760, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.23364258, + "step": 13526, + "time_per_iteration": 2.751244068145752 + }, + { + "auxiliary_loss_clip": 0.01233431, + "auxiliary_loss_mlp": 0.00231526, + "balance_loss_clip": 1.01870847, + "balance_loss_mlp": 0.20739798, + "epoch": 0.8132872388396212, + "flos": 27781895393280.0, + "grad_norm": 4.8078015626904635, + "language_loss": 0.82497138, + "learning_rate": 3.5458556186714473e-07, + "loss": 0.83962095, + "num_input_tokens_seen": 291938915, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.24121094, + "step": 13527, + "time_per_iteration": 2.734947443008423 + }, + { + "auxiliary_loss_clip": 0.01262773, + "auxiliary_loss_mlp": 0.00248904, + "balance_loss_clip": 1.04149556, + "balance_loss_mlp": 0.22340514, + "epoch": 0.8133473620922892, + "flos": 27819601695360.0, + "grad_norm": 56.61141286664804, + "language_loss": 0.77315533, + "learning_rate": 3.5436419760230706e-07, + "loss": 0.78827214, + "num_input_tokens_seen": 291958145, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.25512695, + "step": 13528, + "time_per_iteration": 2.7453033924102783 + }, + { + "auxiliary_loss_clip": 0.01248625, + "auxiliary_loss_mlp": 0.00241789, + "balance_loss_clip": 1.03063548, + "balance_loss_mlp": 0.21630171, + "epoch": 0.8134074853449572, + "flos": 18989527248000.0, + "grad_norm": 46.322956932422244, + "language_loss": 0.79638535, + "learning_rate": 3.5414289574014357e-07, + "loss": 0.81128949, + "num_input_tokens_seen": 291976860, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.25463867, + "step": 13529, + "time_per_iteration": 4.208867311477661 + }, + { + "auxiliary_loss_clip": 0.01241699, + "auxiliary_loss_mlp": 0.00217009, + "balance_loss_clip": 1.0257498, + "balance_loss_mlp": 0.19444311, + "epoch": 0.8134676085976251, + "flos": 24242863057920.0, + "grad_norm": 67.89003158133195, + "language_loss": 0.84055507, + "learning_rate": 3.5392165628904635e-07, + "loss": 0.85514218, + "num_input_tokens_seen": 291998085, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.22570801, + "step": 13530, + "time_per_iteration": 2.685152292251587 + }, + { + "auxiliary_loss_clip": 0.01233501, + "auxiliary_loss_mlp": 0.00220808, + "balance_loss_clip": 1.02189028, + "balance_loss_mlp": 0.19880182, + "epoch": 0.8135277318502931, + "flos": 19062389986560.0, + "grad_norm": 3.1681398095051905, + "language_loss": 0.90336001, + "learning_rate": 3.537004792574052e-07, + "loss": 0.91790307, + "num_input_tokens_seen": 292016585, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.21984863, + "step": 13531, + "time_per_iteration": 2.625089645385742 + }, + { + "auxiliary_loss_clip": 0.01264161, + "auxiliary_loss_mlp": 0.00256322, + "balance_loss_clip": 1.03897822, + "balance_loss_mlp": 0.22960687, + "epoch": 0.813587855102961, + "flos": 17269728992640.0, + "grad_norm": 52.915002268697386, + "language_loss": 0.83871138, + "learning_rate": 3.534793646536065e-07, + "loss": 0.85391623, + "num_input_tokens_seen": 292033255, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.26721191, + "step": 13532, + "time_per_iteration": 4.045852899551392 + }, + { + "auxiliary_loss_clip": 0.01244592, + "auxiliary_loss_mlp": 0.00213317, + "balance_loss_clip": 1.02739048, + "balance_loss_mlp": 0.18946359, + "epoch": 0.8136479783556291, + "flos": 20157593621760.0, + "grad_norm": 29.089776172542667, + "language_loss": 0.84558058, + "learning_rate": 3.5325831248603533e-07, + "loss": 0.86015964, + "num_input_tokens_seen": 292051800, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.23840332, + "step": 13533, + "time_per_iteration": 2.6456258296966553 + }, + { + "auxiliary_loss_clip": 0.01268743, + "auxiliary_loss_mlp": 0.00246846, + "balance_loss_clip": 1.0437994, + "balance_loss_mlp": 0.22128725, + "epoch": 0.813708101608297, + "flos": 22052348046720.0, + "grad_norm": 46.692679162384145, + "language_loss": 0.83607823, + "learning_rate": 3.5303732276307495e-07, + "loss": 0.85123408, + "num_input_tokens_seen": 292072215, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.25549316, + "step": 13534, + "time_per_iteration": 2.7209250926971436 + }, + { + "auxiliary_loss_clip": 0.01253147, + "auxiliary_loss_mlp": 0.00226829, + "balance_loss_clip": 1.03528214, + "balance_loss_mlp": 0.20413205, + "epoch": 0.813768224860965, + "flos": 16173412035840.0, + "grad_norm": 185.24598784764217, + "language_loss": 1.00213301, + "learning_rate": 3.5281639549310336e-07, + "loss": 1.01693273, + "num_input_tokens_seen": 292088830, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.22680664, + "step": 13535, + "time_per_iteration": 2.6354846954345703 + }, + { + "auxiliary_loss_clip": 0.01248226, + "auxiliary_loss_mlp": 0.00219428, + "balance_loss_clip": 1.03362453, + "balance_loss_mlp": 0.19727927, + "epoch": 0.8138283481136329, + "flos": 24352318776960.0, + "grad_norm": 53.770768321391905, + "language_loss": 0.77499425, + "learning_rate": 3.52595530684499e-07, + "loss": 0.78967083, + "num_input_tokens_seen": 292109225, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.22143555, + "step": 13536, + "time_per_iteration": 2.6855294704437256 + }, + { + "auxiliary_loss_clip": 0.01255797, + "auxiliary_loss_mlp": 0.00223386, + "balance_loss_clip": 1.03305876, + "balance_loss_mlp": 0.19904372, + "epoch": 0.8138884713663009, + "flos": 25516362827520.0, + "grad_norm": 5.104108752004816, + "language_loss": 0.83032143, + "learning_rate": 3.5237472834563775e-07, + "loss": 0.84511328, + "num_input_tokens_seen": 292129660, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.24353027, + "step": 13537, + "time_per_iteration": 2.7276642322540283 + }, + { + "auxiliary_loss_clip": 0.01230128, + "auxiliary_loss_mlp": 0.00217179, + "balance_loss_clip": 1.01595473, + "balance_loss_mlp": 0.1943627, + "epoch": 0.8139485946189688, + "flos": 22454368041600.0, + "grad_norm": 9.5294011274582, + "language_loss": 0.82874942, + "learning_rate": 3.5215398848489163e-07, + "loss": 0.8432225, + "num_input_tokens_seen": 292149090, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.22814941, + "step": 13538, + "time_per_iteration": 2.6948652267456055 + }, + { + "auxiliary_loss_clip": 0.0126248, + "auxiliary_loss_mlp": 0.00231052, + "balance_loss_clip": 1.03921175, + "balance_loss_mlp": 0.20682889, + "epoch": 0.8140087178716369, + "flos": 21250391045760.0, + "grad_norm": 4.691297662153734, + "language_loss": 0.84092104, + "learning_rate": 3.5193331111063176e-07, + "loss": 0.85585636, + "num_input_tokens_seen": 292169260, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.2421875, + "step": 13539, + "time_per_iteration": 2.718554735183716 + }, + { + "auxiliary_loss_clip": 0.01231138, + "auxiliary_loss_mlp": 0.00217939, + "balance_loss_clip": 1.01783752, + "balance_loss_mlp": 0.19581358, + "epoch": 0.8140688411243048, + "flos": 39415730774400.0, + "grad_norm": 20.362097618469463, + "language_loss": 0.72214556, + "learning_rate": 3.5171269623122533e-07, + "loss": 0.73663634, + "num_input_tokens_seen": 292188145, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.22143555, + "step": 13540, + "time_per_iteration": 2.784166097640991 + }, + { + "auxiliary_loss_clip": 0.01247539, + "auxiliary_loss_mlp": 0.00221948, + "balance_loss_clip": 1.02965927, + "balance_loss_mlp": 0.19882125, + "epoch": 0.8141289643769728, + "flos": 25415885508480.0, + "grad_norm": 6780.4690583192, + "language_loss": 0.72740412, + "learning_rate": 3.5149214385503913e-07, + "loss": 0.74209899, + "num_input_tokens_seen": 292212135, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.2310791, + "step": 13541, + "time_per_iteration": 2.8120169639587402 + }, + { + "auxiliary_loss_clip": 0.01269454, + "auxiliary_loss_mlp": 0.00235907, + "balance_loss_clip": 1.0427022, + "balance_loss_mlp": 0.20954967, + "epoch": 0.8141890876296408, + "flos": 12568053237120.0, + "grad_norm": 6.040106645177595, + "language_loss": 0.80084497, + "learning_rate": 3.512716539904355e-07, + "loss": 0.81589854, + "num_input_tokens_seen": 292230645, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.26367188, + "step": 13542, + "time_per_iteration": 2.6247639656066895 + }, + { + "auxiliary_loss_clip": 0.01266185, + "auxiliary_loss_mlp": 0.00236832, + "balance_loss_clip": 1.0423429, + "balance_loss_mlp": 0.21219176, + "epoch": 0.8142492108823087, + "flos": 14967172483200.0, + "grad_norm": 8.897830887097742, + "language_loss": 0.91117233, + "learning_rate": 3.5105122664577613e-07, + "loss": 0.92620248, + "num_input_tokens_seen": 292243540, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.2467041, + "step": 13543, + "time_per_iteration": 2.619767904281616 + }, + { + "auxiliary_loss_clip": 0.01265477, + "auxiliary_loss_mlp": 0.00222101, + "balance_loss_clip": 1.04156959, + "balance_loss_mlp": 0.19719842, + "epoch": 0.8143093341349767, + "flos": 12422004537600.0, + "grad_norm": 27.09973744512186, + "language_loss": 0.89328361, + "learning_rate": 3.5083086182942003e-07, + "loss": 0.90815938, + "num_input_tokens_seen": 292261715, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.24902344, + "step": 13544, + "time_per_iteration": 2.5914998054504395 + }, + { + "auxiliary_loss_clip": 0.01290137, + "auxiliary_loss_mlp": 0.00238956, + "balance_loss_clip": 1.05172324, + "balance_loss_mlp": 0.21281339, + "epoch": 0.8143694573876447, + "flos": 11910564737280.0, + "grad_norm": 5.448066730464954, + "language_loss": 0.86469084, + "learning_rate": 3.5061055954972264e-07, + "loss": 0.87998176, + "num_input_tokens_seen": 292275080, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.26159668, + "step": 13545, + "time_per_iteration": 2.61784029006958 + }, + { + "auxiliary_loss_clip": 0.01222455, + "auxiliary_loss_mlp": 0.00227487, + "balance_loss_clip": 1.01280403, + "balance_loss_mlp": 0.20470604, + "epoch": 0.8144295806403127, + "flos": 21212900225280.0, + "grad_norm": 29.99562815298172, + "language_loss": 0.8350544, + "learning_rate": 3.5039031981503776e-07, + "loss": 0.84955382, + "num_input_tokens_seen": 292294635, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.22802734, + "step": 13546, + "time_per_iteration": 2.649095058441162 + }, + { + "auxiliary_loss_clip": 0.01251074, + "auxiliary_loss_mlp": 0.0022316, + "balance_loss_clip": 1.03612459, + "balance_loss_mlp": 0.19985431, + "epoch": 0.8144897038929806, + "flos": 19865280741120.0, + "grad_norm": 208.5203966441145, + "language_loss": 0.80781567, + "learning_rate": 3.501701426337178e-07, + "loss": 0.82255793, + "num_input_tokens_seen": 292312695, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.23291016, + "step": 13547, + "time_per_iteration": 2.6567447185516357 + }, + { + "auxiliary_loss_clip": 0.01276007, + "auxiliary_loss_mlp": 0.00238432, + "balance_loss_clip": 1.04938865, + "balance_loss_mlp": 0.21259919, + "epoch": 0.8145498271456486, + "flos": 24571733005440.0, + "grad_norm": 17.22913828946049, + "language_loss": 0.80540508, + "learning_rate": 3.49950028014111e-07, + "loss": 0.82054949, + "num_input_tokens_seen": 292332005, + "router_z_loss_clip": 2.265625, + "router_z_loss_mlp": 0.25805664, + "step": 13548, + "time_per_iteration": 2.678844690322876 + }, + { + "auxiliary_loss_clip": 0.0126454, + "auxiliary_loss_mlp": 0.00213653, + "balance_loss_clip": 1.03935695, + "balance_loss_mlp": 0.18964434, + "epoch": 0.8146099503983165, + "flos": 20193037367040.0, + "grad_norm": 6.643646694199728, + "language_loss": 0.86246407, + "learning_rate": 3.4972997596456444e-07, + "loss": 0.87724602, + "num_input_tokens_seen": 292348365, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.23999023, + "step": 13549, + "time_per_iteration": 2.6857943534851074 + }, + { + "auxiliary_loss_clip": 0.01251135, + "auxiliary_loss_mlp": 0.00249869, + "balance_loss_clip": 1.03351092, + "balance_loss_mlp": 0.22452548, + "epoch": 0.8146700736509845, + "flos": 19536949497600.0, + "grad_norm": 15.35205066156056, + "language_loss": 0.80989861, + "learning_rate": 3.4950998649342233e-07, + "loss": 0.82490861, + "num_input_tokens_seen": 292368050, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.25305176, + "step": 13550, + "time_per_iteration": 2.7402467727661133 + }, + { + "auxiliary_loss_clip": 0.0122302, + "auxiliary_loss_mlp": 0.00216701, + "balance_loss_clip": 1.0188179, + "balance_loss_mlp": 0.19411072, + "epoch": 0.8147301969036524, + "flos": 18041341979520.0, + "grad_norm": 13.738913912677365, + "language_loss": 0.79102993, + "learning_rate": 3.4929005960902826e-07, + "loss": 0.80542713, + "num_input_tokens_seen": 292385315, + "router_z_loss_clip": 2.04394531, + "router_z_loss_mlp": 0.22607422, + "step": 13551, + "time_per_iteration": 2.618367910385132 + }, + { + "auxiliary_loss_clip": 0.01289034, + "auxiliary_loss_mlp": 0.00230997, + "balance_loss_clip": 1.05350697, + "balance_loss_mlp": 0.20403174, + "epoch": 0.8147903201563205, + "flos": 18004713085440.0, + "grad_norm": 3.383675758626476, + "language_loss": 0.80767846, + "learning_rate": 3.4907019531971926e-07, + "loss": 0.82287872, + "num_input_tokens_seen": 292403375, + "router_z_loss_clip": 2.34960938, + "router_z_loss_mlp": 0.26977539, + "step": 13552, + "time_per_iteration": 2.630678653717041 + }, + { + "auxiliary_loss_clip": 0.01245952, + "auxiliary_loss_mlp": 0.00219802, + "balance_loss_clip": 1.02690101, + "balance_loss_mlp": 0.19522101, + "epoch": 0.8148504434089884, + "flos": 20259327916800.0, + "grad_norm": 182.160143087172, + "language_loss": 0.89881992, + "learning_rate": 3.4885039363383407e-07, + "loss": 0.91347742, + "num_input_tokens_seen": 292419260, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.24584961, + "step": 13553, + "time_per_iteration": 2.622894763946533 + }, + { + "auxiliary_loss_clip": 0.01254226, + "auxiliary_loss_mlp": 0.00220035, + "balance_loss_clip": 1.03327179, + "balance_loss_mlp": 0.19603767, + "epoch": 0.8149105666616564, + "flos": 12494723621760.0, + "grad_norm": 50.88388042489687, + "language_loss": 0.78257149, + "learning_rate": 3.4863065455970795e-07, + "loss": 0.79731411, + "num_input_tokens_seen": 292436095, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.23950195, + "step": 13554, + "time_per_iteration": 2.632939338684082 + }, + { + "auxiliary_loss_clip": 0.01258477, + "auxiliary_loss_mlp": 0.00219552, + "balance_loss_clip": 1.03604412, + "balance_loss_mlp": 0.19475633, + "epoch": 0.8149706899143244, + "flos": 32523683662080.0, + "grad_norm": 152.0857071431594, + "language_loss": 0.7466628, + "learning_rate": 3.484109781056723e-07, + "loss": 0.76144314, + "num_input_tokens_seen": 292457190, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.24804688, + "step": 13555, + "time_per_iteration": 2.7388505935668945 + }, + { + "auxiliary_loss_clip": 0.01247562, + "auxiliary_loss_mlp": 0.00232719, + "balance_loss_clip": 1.02454376, + "balance_loss_mlp": 0.20770884, + "epoch": 0.8150308131669923, + "flos": 19386088375680.0, + "grad_norm": 52.69184183143722, + "language_loss": 0.83178413, + "learning_rate": 3.4819136428005844e-07, + "loss": 0.84658694, + "num_input_tokens_seen": 292474300, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.25, + "step": 13556, + "time_per_iteration": 2.688929319381714 + }, + { + "auxiliary_loss_clip": 0.01240909, + "auxiliary_loss_mlp": 0.0021488, + "balance_loss_clip": 1.03059494, + "balance_loss_mlp": 0.19285055, + "epoch": 0.8150909364196604, + "flos": 17421380213760.0, + "grad_norm": 159.82228728202358, + "language_loss": 0.86893535, + "learning_rate": 3.4797181309119307e-07, + "loss": 0.88349319, + "num_input_tokens_seen": 292492420, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.2199707, + "step": 13557, + "time_per_iteration": 2.627779960632324 + }, + { + "auxiliary_loss_clip": 0.01255051, + "auxiliary_loss_mlp": 0.00244111, + "balance_loss_clip": 1.03243959, + "balance_loss_mlp": 0.21950589, + "epoch": 0.8151510596723283, + "flos": 27162795553920.0, + "grad_norm": 7.883874783048359, + "language_loss": 0.73363245, + "learning_rate": 3.4775232454740255e-07, + "loss": 0.74862409, + "num_input_tokens_seen": 292512895, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.24609375, + "step": 13558, + "time_per_iteration": 2.711686372756958 + }, + { + "auxiliary_loss_clip": 0.0113699, + "auxiliary_loss_mlp": 0.00124357, + "balance_loss_clip": 0.9798395, + "balance_loss_mlp": 0.11625078, + "epoch": 0.8152111829249963, + "flos": 64219052718720.0, + "grad_norm": 0.8814042396580454, + "language_loss": 0.56171191, + "learning_rate": 3.4753289865700896e-07, + "loss": 0.57432532, + "num_input_tokens_seen": 292566580, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.08105469, + "step": 13559, + "time_per_iteration": 3.138695240020752 + }, + { + "auxiliary_loss_clip": 0.01141877, + "auxiliary_loss_mlp": 0.00124073, + "balance_loss_clip": 0.98326218, + "balance_loss_mlp": 0.1168248, + "epoch": 0.8152713061776642, + "flos": 67072012306560.0, + "grad_norm": 0.7451755935656752, + "language_loss": 0.54796052, + "learning_rate": 3.473135354283334e-07, + "loss": 0.56061995, + "num_input_tokens_seen": 292621490, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.07226562, + "step": 13560, + "time_per_iteration": 3.0047426223754883 + }, + { + "auxiliary_loss_clip": 0.01239645, + "auxiliary_loss_mlp": 0.00231474, + "balance_loss_clip": 1.02363825, + "balance_loss_mlp": 0.20785867, + "epoch": 0.8153314294303322, + "flos": 14391130072320.0, + "grad_norm": 19.74038884258692, + "language_loss": 0.75670397, + "learning_rate": 3.470942348696948e-07, + "loss": 0.77141517, + "num_input_tokens_seen": 292638660, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.23608398, + "step": 13561, + "time_per_iteration": 2.7067933082580566 + }, + { + "auxiliary_loss_clip": 0.01255107, + "auxiliary_loss_mlp": 0.00224621, + "balance_loss_clip": 1.03103495, + "balance_loss_mlp": 0.20104158, + "epoch": 0.8153915526830001, + "flos": 25623520076160.0, + "grad_norm": 25.807826330386735, + "language_loss": 0.89785993, + "learning_rate": 3.468749969894085e-07, + "loss": 0.91265714, + "num_input_tokens_seen": 292658545, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.23571777, + "step": 13562, + "time_per_iteration": 2.73317813873291 + }, + { + "auxiliary_loss_clip": 0.01252275, + "auxiliary_loss_mlp": 0.00238546, + "balance_loss_clip": 1.03084004, + "balance_loss_mlp": 0.21533608, + "epoch": 0.8154516759356681, + "flos": 23369156640000.0, + "grad_norm": 62.71155646591662, + "language_loss": 0.81014389, + "learning_rate": 3.4665582179578734e-07, + "loss": 0.82505214, + "num_input_tokens_seen": 292678460, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.23217773, + "step": 13563, + "time_per_iteration": 2.688458204269409 + }, + { + "auxiliary_loss_clip": 0.01237161, + "auxiliary_loss_mlp": 0.00246645, + "balance_loss_clip": 1.01733816, + "balance_loss_mlp": 0.22078824, + "epoch": 0.815511799188336, + "flos": 28149189914880.0, + "grad_norm": 180.43029810059005, + "language_loss": 0.77392042, + "learning_rate": 3.4643670929714387e-07, + "loss": 0.78875852, + "num_input_tokens_seen": 292699815, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.25866699, + "step": 13564, + "time_per_iteration": 4.268319845199585 + }, + { + "auxiliary_loss_clip": 0.01254612, + "auxiliary_loss_mlp": 0.00229452, + "balance_loss_clip": 1.03008997, + "balance_loss_mlp": 0.20474008, + "epoch": 0.8155719224410041, + "flos": 16983413683200.0, + "grad_norm": 8.919595631482935, + "language_loss": 0.78983355, + "learning_rate": 3.462176595017854e-07, + "loss": 0.80467421, + "num_input_tokens_seen": 292717370, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.24731445, + "step": 13565, + "time_per_iteration": 2.6007192134857178 + }, + { + "auxiliary_loss_clip": 0.01247101, + "auxiliary_loss_mlp": 0.00242206, + "balance_loss_clip": 1.02832258, + "balance_loss_mlp": 0.21570534, + "epoch": 0.815632045693672, + "flos": 24681727428480.0, + "grad_norm": 13.571364120840183, + "language_loss": 0.86213732, + "learning_rate": 3.459986724180188e-07, + "loss": 0.87703037, + "num_input_tokens_seen": 292737110, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.26477051, + "step": 13566, + "time_per_iteration": 4.108477592468262 + }, + { + "auxiliary_loss_clip": 0.01235443, + "auxiliary_loss_mlp": 0.00231436, + "balance_loss_clip": 1.02697802, + "balance_loss_mlp": 0.20993076, + "epoch": 0.81569216894634, + "flos": 19938323047680.0, + "grad_norm": 3.0367861806755676, + "language_loss": 0.88884473, + "learning_rate": 3.457797480541491e-07, + "loss": 0.90351355, + "num_input_tokens_seen": 292756510, + "router_z_loss_clip": 2.08105469, + "router_z_loss_mlp": 0.21520996, + "step": 13567, + "time_per_iteration": 2.632107973098755 + }, + { + "auxiliary_loss_clip": 0.01231667, + "auxiliary_loss_mlp": 0.00206362, + "balance_loss_clip": 1.01802564, + "balance_loss_mlp": 0.18365213, + "epoch": 0.8157522921990079, + "flos": 21799393493760.0, + "grad_norm": 14.739333905282908, + "language_loss": 0.88111109, + "learning_rate": 3.455608864184771e-07, + "loss": 0.89549136, + "num_input_tokens_seen": 292776710, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.22729492, + "step": 13568, + "time_per_iteration": 2.6391797065734863 + }, + { + "auxiliary_loss_clip": 0.01228165, + "auxiliary_loss_mlp": 0.00234236, + "balance_loss_clip": 1.01481688, + "balance_loss_mlp": 0.21084669, + "epoch": 0.8158124154516759, + "flos": 18508323720960.0, + "grad_norm": 26.06239235038625, + "language_loss": 0.84840405, + "learning_rate": 3.453420875193016e-07, + "loss": 0.86302811, + "num_input_tokens_seen": 292794350, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.23388672, + "step": 13569, + "time_per_iteration": 2.6246917247772217 + }, + { + "auxiliary_loss_clip": 0.01245487, + "auxiliary_loss_mlp": 0.00204075, + "balance_loss_clip": 1.02761078, + "balance_loss_mlp": 0.17945787, + "epoch": 0.815872538704344, + "flos": 26830801123200.0, + "grad_norm": 318.78422639666746, + "language_loss": 0.70006704, + "learning_rate": 3.451233513649199e-07, + "loss": 0.71456259, + "num_input_tokens_seen": 292814005, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.24633789, + "step": 13570, + "time_per_iteration": 2.6746420860290527 + }, + { + "auxiliary_loss_clip": 0.01283323, + "auxiliary_loss_mlp": 0.00237169, + "balance_loss_clip": 1.05420828, + "balance_loss_mlp": 0.21281411, + "epoch": 0.8159326619570119, + "flos": 21725704742400.0, + "grad_norm": 12.152664016810554, + "language_loss": 0.89691299, + "learning_rate": 3.4490467796362687e-07, + "loss": 0.91211784, + "num_input_tokens_seen": 292833485, + "router_z_loss_clip": 2.29101562, + "router_z_loss_mlp": 0.24353027, + "step": 13571, + "time_per_iteration": 4.226847887039185 + }, + { + "auxiliary_loss_clip": 0.01250047, + "auxiliary_loss_mlp": 0.00249437, + "balance_loss_clip": 1.02639675, + "balance_loss_mlp": 0.22435494, + "epoch": 0.8159927852096799, + "flos": 13840726993920.0, + "grad_norm": 43.00434104765732, + "language_loss": 0.91894388, + "learning_rate": 3.446860673237142e-07, + "loss": 0.93393874, + "num_input_tokens_seen": 292848045, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.25097656, + "step": 13572, + "time_per_iteration": 2.5651023387908936 + }, + { + "auxiliary_loss_clip": 0.01237195, + "auxiliary_loss_mlp": 0.00229114, + "balance_loss_clip": 1.01704848, + "balance_loss_mlp": 0.20582101, + "epoch": 0.8160529084623478, + "flos": 24499516711680.0, + "grad_norm": 179.05023055646618, + "language_loss": 0.73213691, + "learning_rate": 3.4446751945347186e-07, + "loss": 0.74680007, + "num_input_tokens_seen": 292869965, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.23278809, + "step": 13573, + "time_per_iteration": 2.7188475131988525 + }, + { + "auxiliary_loss_clip": 0.01234683, + "auxiliary_loss_mlp": 0.00236156, + "balance_loss_clip": 1.01657486, + "balance_loss_mlp": 0.21215871, + "epoch": 0.8161130317150158, + "flos": 24826339584000.0, + "grad_norm": 6.761420486317194, + "language_loss": 0.837883, + "learning_rate": 3.442490343611868e-07, + "loss": 0.8525914, + "num_input_tokens_seen": 292889680, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.23999023, + "step": 13574, + "time_per_iteration": 2.6817822456359863 + }, + { + "auxiliary_loss_clip": 0.01260971, + "auxiliary_loss_mlp": 0.00250538, + "balance_loss_clip": 1.03393531, + "balance_loss_mlp": 0.22357282, + "epoch": 0.8161731549676837, + "flos": 30956542208640.0, + "grad_norm": 25.963301999029163, + "language_loss": 0.68143773, + "learning_rate": 3.4403061205514485e-07, + "loss": 0.69655281, + "num_input_tokens_seen": 292912360, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.26977539, + "step": 13575, + "time_per_iteration": 4.205013751983643 + }, + { + "auxiliary_loss_clip": 0.01259114, + "auxiliary_loss_mlp": 0.00251215, + "balance_loss_clip": 1.03528738, + "balance_loss_mlp": 0.22538272, + "epoch": 0.8162332782203517, + "flos": 18551991680640.0, + "grad_norm": 5.731836077073053, + "language_loss": 0.81350482, + "learning_rate": 3.4381225254362736e-07, + "loss": 0.82860816, + "num_input_tokens_seen": 292928325, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.25830078, + "step": 13576, + "time_per_iteration": 2.7013890743255615 + }, + { + "auxiliary_loss_clip": 0.01138795, + "auxiliary_loss_mlp": 0.0012653, + "balance_loss_clip": 0.97863472, + "balance_loss_mlp": 0.11832883, + "epoch": 0.8162934014730197, + "flos": 70386853904640.0, + "grad_norm": 0.8001935386036532, + "language_loss": 0.58063459, + "learning_rate": 3.435939558349155e-07, + "loss": 0.59328783, + "num_input_tokens_seen": 292992795, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.08203125, + "step": 13577, + "time_per_iteration": 3.1637229919433594 + }, + { + "auxiliary_loss_clip": 0.0123236, + "auxiliary_loss_mlp": 0.00214354, + "balance_loss_clip": 1.0170964, + "balance_loss_mlp": 0.19230068, + "epoch": 0.8163535247256877, + "flos": 21214839559680.0, + "grad_norm": 178.10629830551696, + "language_loss": 0.79151565, + "learning_rate": 3.4337572193728747e-07, + "loss": 0.80598283, + "num_input_tokens_seen": 293011950, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.22058105, + "step": 13578, + "time_per_iteration": 2.706305503845215 + }, + { + "auxiliary_loss_clip": 0.01243091, + "auxiliary_loss_mlp": 0.00233693, + "balance_loss_clip": 1.0265851, + "balance_loss_mlp": 0.20896855, + "epoch": 0.8164136479783556, + "flos": 21098847565440.0, + "grad_norm": 7.194957940691047, + "language_loss": 0.81192064, + "learning_rate": 3.431575508590172e-07, + "loss": 0.82668847, + "num_input_tokens_seen": 293030175, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.24731445, + "step": 13579, + "time_per_iteration": 2.7310259342193604 + }, + { + "auxiliary_loss_clip": 0.0124422, + "auxiliary_loss_mlp": 0.00219615, + "balance_loss_clip": 1.02251065, + "balance_loss_mlp": 0.19578488, + "epoch": 0.8164737712310236, + "flos": 21720640924800.0, + "grad_norm": 17.678489813359178, + "language_loss": 0.84979862, + "learning_rate": 3.4293944260837873e-07, + "loss": 0.86443698, + "num_input_tokens_seen": 293047980, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.23840332, + "step": 13580, + "time_per_iteration": 2.7038021087646484 + }, + { + "auxiliary_loss_clip": 0.01218598, + "auxiliary_loss_mlp": 0.0021832, + "balance_loss_clip": 1.00785804, + "balance_loss_mlp": 0.19654022, + "epoch": 0.8165338944836915, + "flos": 19536805843200.0, + "grad_norm": 39.928527283867055, + "language_loss": 0.76388764, + "learning_rate": 3.4272139719364314e-07, + "loss": 0.77825689, + "num_input_tokens_seen": 293067030, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.21789551, + "step": 13581, + "time_per_iteration": 2.6899499893188477 + }, + { + "auxiliary_loss_clip": 0.01249858, + "auxiliary_loss_mlp": 0.00233051, + "balance_loss_clip": 1.03340292, + "balance_loss_mlp": 0.20930448, + "epoch": 0.8165940177363595, + "flos": 22928568416640.0, + "grad_norm": 29.207434561582446, + "language_loss": 0.67811364, + "learning_rate": 3.4250341462307786e-07, + "loss": 0.69294274, + "num_input_tokens_seen": 293085575, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.23754883, + "step": 13582, + "time_per_iteration": 2.6897215843200684 + }, + { + "auxiliary_loss_clip": 0.01222838, + "auxiliary_loss_mlp": 0.00203583, + "balance_loss_clip": 1.013592, + "balance_loss_mlp": 0.18152896, + "epoch": 0.8166541409890276, + "flos": 23370377702400.0, + "grad_norm": 8.581124132876525, + "language_loss": 0.86669493, + "learning_rate": 3.4228549490494897e-07, + "loss": 0.88095915, + "num_input_tokens_seen": 293108200, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.22045898, + "step": 13583, + "time_per_iteration": 2.7488291263580322 + }, + { + "auxiliary_loss_clip": 0.01245953, + "auxiliary_loss_mlp": 0.00217859, + "balance_loss_clip": 1.02464128, + "balance_loss_mlp": 0.19337381, + "epoch": 0.8167142642416955, + "flos": 18441997257600.0, + "grad_norm": 9.803979082924888, + "language_loss": 0.81997538, + "learning_rate": 3.4206763804752093e-07, + "loss": 0.83461356, + "num_input_tokens_seen": 293126020, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.24475098, + "step": 13584, + "time_per_iteration": 2.6698060035705566 + }, + { + "auxiliary_loss_clip": 0.01262901, + "auxiliary_loss_mlp": 0.00241194, + "balance_loss_clip": 1.04090929, + "balance_loss_mlp": 0.2158021, + "epoch": 0.8167743874943635, + "flos": 21214983214080.0, + "grad_norm": 4.467093834016869, + "language_loss": 0.81975222, + "learning_rate": 3.4184984405905405e-07, + "loss": 0.83479315, + "num_input_tokens_seen": 293144620, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.25402832, + "step": 13585, + "time_per_iteration": 2.6799745559692383 + }, + { + "auxiliary_loss_clip": 0.01246515, + "auxiliary_loss_mlp": 0.00216529, + "balance_loss_clip": 1.02520871, + "balance_loss_mlp": 0.1906372, + "epoch": 0.8168345107470314, + "flos": 18697681244160.0, + "grad_norm": 33.48706628664923, + "language_loss": 0.79981363, + "learning_rate": 3.416321129478068e-07, + "loss": 0.81444407, + "num_input_tokens_seen": 293162850, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.25878906, + "step": 13586, + "time_per_iteration": 2.590622663497925 + }, + { + "auxiliary_loss_clip": 0.01234704, + "auxiliary_loss_mlp": 0.00229572, + "balance_loss_clip": 1.0162338, + "balance_loss_mlp": 0.20645729, + "epoch": 0.8168946339996994, + "flos": 16253098358400.0, + "grad_norm": 40.25104903012679, + "language_loss": 0.68641245, + "learning_rate": 3.4141444472203594e-07, + "loss": 0.70105523, + "num_input_tokens_seen": 293181620, + "router_z_loss_clip": 2.18457031, + "router_z_loss_mlp": 0.2310791, + "step": 13587, + "time_per_iteration": 2.6386237144470215 + }, + { + "auxiliary_loss_clip": 0.01265695, + "auxiliary_loss_mlp": 0.00248065, + "balance_loss_clip": 1.04007125, + "balance_loss_mlp": 0.22316191, + "epoch": 0.8169547572523673, + "flos": 26941585645440.0, + "grad_norm": 24.991072115904384, + "language_loss": 0.78783035, + "learning_rate": 3.4119683938999624e-07, + "loss": 0.80296797, + "num_input_tokens_seen": 293200270, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.24914551, + "step": 13588, + "time_per_iteration": 2.6782288551330566 + }, + { + "auxiliary_loss_clip": 0.01263433, + "auxiliary_loss_mlp": 0.00234098, + "balance_loss_clip": 1.03752255, + "balance_loss_mlp": 0.20738363, + "epoch": 0.8170148805050353, + "flos": 18952323736320.0, + "grad_norm": 10.732826624707533, + "language_loss": 0.82071978, + "learning_rate": 3.4097929695993854e-07, + "loss": 0.83569509, + "num_input_tokens_seen": 293218960, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.26745605, + "step": 13589, + "time_per_iteration": 2.6707208156585693 + }, + { + "auxiliary_loss_clip": 0.01234079, + "auxiliary_loss_mlp": 0.00224027, + "balance_loss_clip": 1.01830506, + "balance_loss_mlp": 0.19952931, + "epoch": 0.8170750037577033, + "flos": 21834909066240.0, + "grad_norm": 23.35409681224662, + "language_loss": 0.80224895, + "learning_rate": 3.4076181744011166e-07, + "loss": 0.81683004, + "num_input_tokens_seen": 293236450, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.24511719, + "step": 13590, + "time_per_iteration": 2.6321022510528564 + }, + { + "auxiliary_loss_clip": 0.01270869, + "auxiliary_loss_mlp": 0.00236293, + "balance_loss_clip": 1.04300213, + "balance_loss_mlp": 0.20903006, + "epoch": 0.8171351270103713, + "flos": 33507169021440.0, + "grad_norm": 62.07627934848193, + "language_loss": 0.75208771, + "learning_rate": 3.4054440083876345e-07, + "loss": 0.76715934, + "num_input_tokens_seen": 293256480, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.27270508, + "step": 13591, + "time_per_iteration": 2.7721383571624756 + }, + { + "auxiliary_loss_clip": 0.01255017, + "auxiliary_loss_mlp": 0.00234418, + "balance_loss_clip": 1.02899265, + "balance_loss_mlp": 0.20931205, + "epoch": 0.8171952502630392, + "flos": 22708184520960.0, + "grad_norm": 20.53249106273902, + "language_loss": 0.79607373, + "learning_rate": 3.403270471641373e-07, + "loss": 0.81096804, + "num_input_tokens_seen": 293274960, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.25109863, + "step": 13592, + "time_per_iteration": 2.722709894180298 + }, + { + "auxiliary_loss_clip": 0.01247602, + "auxiliary_loss_mlp": 0.00239584, + "balance_loss_clip": 1.02709758, + "balance_loss_mlp": 0.21590948, + "epoch": 0.8172553735157072, + "flos": 26723715701760.0, + "grad_norm": 14.577371307776817, + "language_loss": 0.74675333, + "learning_rate": 3.401097564244759e-07, + "loss": 0.76162529, + "num_input_tokens_seen": 293295945, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.23706055, + "step": 13593, + "time_per_iteration": 2.696730613708496 + }, + { + "auxiliary_loss_clip": 0.01245893, + "auxiliary_loss_mlp": 0.00210375, + "balance_loss_clip": 1.02647471, + "balance_loss_mlp": 0.1856395, + "epoch": 0.8173154967683751, + "flos": 15961072786560.0, + "grad_norm": 2.115531180191339, + "language_loss": 0.7745508, + "learning_rate": 3.398925286280188e-07, + "loss": 0.78911352, + "num_input_tokens_seen": 293313300, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.24731445, + "step": 13594, + "time_per_iteration": 2.651156425476074 + }, + { + "auxiliary_loss_clip": 0.0125225, + "auxiliary_loss_mlp": 0.00244313, + "balance_loss_clip": 1.03416109, + "balance_loss_mlp": 0.21861155, + "epoch": 0.8173756200210431, + "flos": 25986720447360.0, + "grad_norm": 13.323615664979247, + "language_loss": 0.75671184, + "learning_rate": 3.3967536378300456e-07, + "loss": 0.77167743, + "num_input_tokens_seen": 293333085, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.25671387, + "step": 13595, + "time_per_iteration": 2.676764965057373 + }, + { + "auxiliary_loss_clip": 0.01265339, + "auxiliary_loss_mlp": 0.00240987, + "balance_loss_clip": 1.03218341, + "balance_loss_mlp": 0.21412951, + "epoch": 0.8174357432737112, + "flos": 25664422688640.0, + "grad_norm": 9.44417395291569, + "language_loss": 0.8448534, + "learning_rate": 3.394582618976658e-07, + "loss": 0.85991663, + "num_input_tokens_seen": 293351895, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.26855469, + "step": 13596, + "time_per_iteration": 2.821202516555786 + }, + { + "auxiliary_loss_clip": 0.01222711, + "auxiliary_loss_mlp": 0.00232833, + "balance_loss_clip": 1.00785422, + "balance_loss_mlp": 0.20893104, + "epoch": 0.8174958665263791, + "flos": 21835088634240.0, + "grad_norm": 22.063513547718664, + "language_loss": 0.70612109, + "learning_rate": 3.392412229802362e-07, + "loss": 0.72067654, + "num_input_tokens_seen": 293371165, + "router_z_loss_clip": 2.14941406, + "router_z_loss_mlp": 0.2388916, + "step": 13597, + "time_per_iteration": 2.6506845951080322 + }, + { + "auxiliary_loss_clip": 0.01225278, + "auxiliary_loss_mlp": 0.00217049, + "balance_loss_clip": 1.01099396, + "balance_loss_mlp": 0.19356504, + "epoch": 0.8175559897790471, + "flos": 22455517276800.0, + "grad_norm": 9.429242738081662, + "language_loss": 0.88628447, + "learning_rate": 3.390242470389462e-07, + "loss": 0.90070772, + "num_input_tokens_seen": 293391150, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.23474121, + "step": 13598, + "time_per_iteration": 2.707749605178833 + }, + { + "auxiliary_loss_clip": 0.01256129, + "auxiliary_loss_mlp": 0.00231405, + "balance_loss_clip": 1.03481412, + "balance_loss_mlp": 0.206478, + "epoch": 0.817616113031715, + "flos": 23615790399360.0, + "grad_norm": 11.046739670019871, + "language_loss": 0.89472878, + "learning_rate": 3.3880733408202277e-07, + "loss": 0.90960413, + "num_input_tokens_seen": 293409440, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.24951172, + "step": 13599, + "time_per_iteration": 2.6525466442108154 + }, + { + "auxiliary_loss_clip": 0.01225443, + "auxiliary_loss_mlp": 0.00244937, + "balance_loss_clip": 1.01437676, + "balance_loss_mlp": 0.22090447, + "epoch": 0.817676236284383, + "flos": 27672260106240.0, + "grad_norm": 18.18627347229968, + "language_loss": 0.90638965, + "learning_rate": 3.3859048411769186e-07, + "loss": 0.92109346, + "num_input_tokens_seen": 293428995, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.2401123, + "step": 13600, + "time_per_iteration": 2.7490286827087402 + }, + { + "auxiliary_loss_clip": 0.01242171, + "auxiliary_loss_mlp": 0.00218399, + "balance_loss_clip": 1.02467167, + "balance_loss_mlp": 0.1953322, + "epoch": 0.8177363595370509, + "flos": 24681009156480.0, + "grad_norm": 92.63417949838149, + "language_loss": 0.81137621, + "learning_rate": 3.383736971541766e-07, + "loss": 0.82598186, + "num_input_tokens_seen": 293449155, + "router_z_loss_clip": 2.17480469, + "router_z_loss_mlp": 0.23095703, + "step": 13601, + "time_per_iteration": 2.6697206497192383 + }, + { + "auxiliary_loss_clip": 0.01258376, + "auxiliary_loss_mlp": 0.00240428, + "balance_loss_clip": 1.03257287, + "balance_loss_mlp": 0.21498901, + "epoch": 0.817796482789719, + "flos": 17346326745600.0, + "grad_norm": 7.01483989835848, + "language_loss": 0.78517038, + "learning_rate": 3.3815697319969737e-07, + "loss": 0.80015838, + "num_input_tokens_seen": 293466125, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.2545166, + "step": 13602, + "time_per_iteration": 2.635955333709717 + }, + { + "auxiliary_loss_clip": 0.01247971, + "auxiliary_loss_mlp": 0.0022649, + "balance_loss_clip": 1.02880645, + "balance_loss_mlp": 0.20207639, + "epoch": 0.8178566060423869, + "flos": 17778475272960.0, + "grad_norm": 239.31191441023182, + "language_loss": 0.92977273, + "learning_rate": 3.379403122624718e-07, + "loss": 0.94451725, + "num_input_tokens_seen": 293481345, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.2442627, + "step": 13603, + "time_per_iteration": 2.622375726699829 + }, + { + "auxiliary_loss_clip": 0.01233078, + "auxiliary_loss_mlp": 0.00249555, + "balance_loss_clip": 1.01996124, + "balance_loss_mlp": 0.22535545, + "epoch": 0.8179167292950549, + "flos": 24973250209920.0, + "grad_norm": 43.46399181354795, + "language_loss": 0.79644322, + "learning_rate": 3.377237143507159e-07, + "loss": 0.81126952, + "num_input_tokens_seen": 293502330, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.24194336, + "step": 13604, + "time_per_iteration": 2.735285520553589 + }, + { + "auxiliary_loss_clip": 0.01242408, + "auxiliary_loss_mlp": 0.00223806, + "balance_loss_clip": 1.02613473, + "balance_loss_mlp": 0.20180006, + "epoch": 0.8179768525477228, + "flos": 22856783086080.0, + "grad_norm": 17.268153239676447, + "language_loss": 0.82432437, + "learning_rate": 3.3750717947264406e-07, + "loss": 0.83898652, + "num_input_tokens_seen": 293521415, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.22009277, + "step": 13605, + "time_per_iteration": 2.7484636306762695 + }, + { + "auxiliary_loss_clip": 0.01244341, + "auxiliary_loss_mlp": 0.00229201, + "balance_loss_clip": 1.02103996, + "balance_loss_mlp": 0.20515689, + "epoch": 0.8180369758003908, + "flos": 18515147304960.0, + "grad_norm": 6.872064212114467, + "language_loss": 0.83944142, + "learning_rate": 3.372907076364666e-07, + "loss": 0.85417694, + "num_input_tokens_seen": 293539245, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.24047852, + "step": 13606, + "time_per_iteration": 4.157891035079956 + }, + { + "auxiliary_loss_clip": 0.01229637, + "auxiliary_loss_mlp": 0.00216448, + "balance_loss_clip": 1.01210761, + "balance_loss_mlp": 0.19296405, + "epoch": 0.8180970990530587, + "flos": 33182105915520.0, + "grad_norm": 10.03262438283251, + "language_loss": 0.75111127, + "learning_rate": 3.370742988503916e-07, + "loss": 0.76557207, + "num_input_tokens_seen": 293560640, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.23486328, + "step": 13607, + "time_per_iteration": 2.7612736225128174 + }, + { + "auxiliary_loss_clip": 0.01235232, + "auxiliary_loss_mlp": 0.00226194, + "balance_loss_clip": 1.0163734, + "balance_loss_mlp": 0.20151812, + "epoch": 0.8181572223057267, + "flos": 25010022758400.0, + "grad_norm": 42.01879452164553, + "language_loss": 0.77668351, + "learning_rate": 3.3685795312262634e-07, + "loss": 0.79129773, + "num_input_tokens_seen": 293579465, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.24707031, + "step": 13608, + "time_per_iteration": 4.223840951919556 + }, + { + "auxiliary_loss_clip": 0.01238061, + "auxiliary_loss_mlp": 0.00232788, + "balance_loss_clip": 1.02119541, + "balance_loss_mlp": 0.2089695, + "epoch": 0.8182173455583948, + "flos": 28548731871360.0, + "grad_norm": 6.872353997768753, + "language_loss": 0.88280845, + "learning_rate": 3.366416704613735e-07, + "loss": 0.89751691, + "num_input_tokens_seen": 293600540, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.23815918, + "step": 13609, + "time_per_iteration": 2.6917612552642822 + }, + { + "auxiliary_loss_clip": 0.01124973, + "auxiliary_loss_mlp": 0.00123548, + "balance_loss_clip": 0.96447891, + "balance_loss_mlp": 0.11658575, + "epoch": 0.8182774688110627, + "flos": 72028043245440.0, + "grad_norm": 0.7383993673933895, + "language_loss": 0.55270004, + "learning_rate": 3.3642545087483544e-07, + "loss": 0.56518525, + "num_input_tokens_seen": 293665160, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.06982422, + "step": 13610, + "time_per_iteration": 3.278946876525879 + }, + { + "auxiliary_loss_clip": 0.01230903, + "auxiliary_loss_mlp": 0.0023003, + "balance_loss_clip": 1.01862049, + "balance_loss_mlp": 0.20652156, + "epoch": 0.8183375920637307, + "flos": 19755358145280.0, + "grad_norm": 7.907689107263748, + "language_loss": 0.86046761, + "learning_rate": 3.362092943712107e-07, + "loss": 0.87507695, + "num_input_tokens_seen": 293683995, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.23535156, + "step": 13611, + "time_per_iteration": 2.6614105701446533 + }, + { + "auxiliary_loss_clip": 0.0127349, + "auxiliary_loss_mlp": 0.00254554, + "balance_loss_clip": 1.04217577, + "balance_loss_mlp": 0.22782713, + "epoch": 0.8183977153163986, + "flos": 22341895580160.0, + "grad_norm": 59.677783021721815, + "language_loss": 0.84386289, + "learning_rate": 3.3599320095869745e-07, + "loss": 0.85914338, + "num_input_tokens_seen": 293704115, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.26757812, + "step": 13612, + "time_per_iteration": 2.686920166015625 + }, + { + "auxiliary_loss_clip": 0.01232405, + "auxiliary_loss_mlp": 0.00216965, + "balance_loss_clip": 1.01718616, + "balance_loss_mlp": 0.19405346, + "epoch": 0.8184578385690666, + "flos": 17712472032000.0, + "grad_norm": 4.573402672039386, + "language_loss": 0.94057643, + "learning_rate": 3.3577717064548793e-07, + "loss": 0.95507014, + "num_input_tokens_seen": 293722225, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.22900391, + "step": 13613, + "time_per_iteration": 4.182492733001709 + }, + { + "auxiliary_loss_clip": 0.01252191, + "auxiliary_loss_mlp": 0.00214689, + "balance_loss_clip": 1.0335834, + "balance_loss_mlp": 0.19174162, + "epoch": 0.8185179618217345, + "flos": 25701159323520.0, + "grad_norm": 6.1772431034183, + "language_loss": 0.78254831, + "learning_rate": 3.355612034397746e-07, + "loss": 0.79721713, + "num_input_tokens_seen": 293743995, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.22949219, + "step": 13614, + "time_per_iteration": 2.702536106109619 + }, + { + "auxiliary_loss_clip": 0.01250215, + "auxiliary_loss_mlp": 0.00235987, + "balance_loss_clip": 1.02632284, + "balance_loss_mlp": 0.21046394, + "epoch": 0.8185780850744026, + "flos": 25960326929280.0, + "grad_norm": 161.6201602206527, + "language_loss": 0.885768, + "learning_rate": 3.353452993497479e-07, + "loss": 0.90063, + "num_input_tokens_seen": 293764935, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.25524902, + "step": 13615, + "time_per_iteration": 2.73157000541687 + }, + { + "auxiliary_loss_clip": 0.01233571, + "auxiliary_loss_mlp": 0.00236708, + "balance_loss_clip": 1.01857722, + "balance_loss_mlp": 0.21207955, + "epoch": 0.8186382083270705, + "flos": 25228431406080.0, + "grad_norm": 136.58700573919373, + "language_loss": 0.81148791, + "learning_rate": 3.3512945838359375e-07, + "loss": 0.82619071, + "num_input_tokens_seen": 293784035, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.24645996, + "step": 13616, + "time_per_iteration": 2.6968929767608643 + }, + { + "auxiliary_loss_clip": 0.01229197, + "auxiliary_loss_mlp": 0.00212616, + "balance_loss_clip": 1.01175332, + "balance_loss_mlp": 0.18705788, + "epoch": 0.8186983315797385, + "flos": 22415009713920.0, + "grad_norm": 138.82314073262458, + "language_loss": 0.81712729, + "learning_rate": 3.349136805494979e-07, + "loss": 0.83154535, + "num_input_tokens_seen": 293803360, + "router_z_loss_clip": 2.17480469, + "router_z_loss_mlp": 0.25524902, + "step": 13617, + "time_per_iteration": 4.097262144088745 + }, + { + "auxiliary_loss_clip": 0.01215602, + "auxiliary_loss_mlp": 0.00234467, + "balance_loss_clip": 1.00343537, + "balance_loss_mlp": 0.21175721, + "epoch": 0.8187584548324064, + "flos": 22018017623040.0, + "grad_norm": 12.372772789887094, + "language_loss": 0.77010036, + "learning_rate": 3.346979658556415e-07, + "loss": 0.78460109, + "num_input_tokens_seen": 293821325, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.22705078, + "step": 13618, + "time_per_iteration": 2.7285943031311035 + }, + { + "auxiliary_loss_clip": 0.012695, + "auxiliary_loss_mlp": 0.00256034, + "balance_loss_clip": 1.03741097, + "balance_loss_mlp": 0.2294504, + "epoch": 0.8188185780850744, + "flos": 29241664116480.0, + "grad_norm": 10.199501280667347, + "language_loss": 0.77605677, + "learning_rate": 3.344823143102058e-07, + "loss": 0.7913121, + "num_input_tokens_seen": 293840315, + "router_z_loss_clip": 2.32226562, + "router_z_loss_mlp": 0.26550293, + "step": 13619, + "time_per_iteration": 2.7123286724090576 + }, + { + "auxiliary_loss_clip": 0.01243323, + "auxiliary_loss_mlp": 0.00229191, + "balance_loss_clip": 1.02051568, + "balance_loss_mlp": 0.20367992, + "epoch": 0.8188787013377423, + "flos": 20696504348160.0, + "grad_norm": 170.26012773334634, + "language_loss": 0.83210528, + "learning_rate": 3.3426672592136694e-07, + "loss": 0.84683043, + "num_input_tokens_seen": 293855685, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.25512695, + "step": 13620, + "time_per_iteration": 2.673306465148926 + }, + { + "auxiliary_loss_clip": 0.01220856, + "auxiliary_loss_mlp": 0.00199678, + "balance_loss_clip": 1.00759304, + "balance_loss_mlp": 0.17664658, + "epoch": 0.8189388245904103, + "flos": 23732967542400.0, + "grad_norm": 31.292526239485994, + "language_loss": 0.83273578, + "learning_rate": 3.340512006973011e-07, + "loss": 0.84694111, + "num_input_tokens_seen": 293875540, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.23046875, + "step": 13621, + "time_per_iteration": 2.6782937049865723 + }, + { + "auxiliary_loss_clip": 0.01240932, + "auxiliary_loss_mlp": 0.00229672, + "balance_loss_clip": 1.02131569, + "balance_loss_mlp": 0.20648529, + "epoch": 0.8189989478430784, + "flos": 28255090187520.0, + "grad_norm": 155.54383010912207, + "language_loss": 0.7555353, + "learning_rate": 3.3383573864618076e-07, + "loss": 0.77024132, + "num_input_tokens_seen": 293896570, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.23168945, + "step": 13622, + "time_per_iteration": 2.6988582611083984 + }, + { + "auxiliary_loss_clip": 0.01253285, + "auxiliary_loss_mlp": 0.00229265, + "balance_loss_clip": 1.02757955, + "balance_loss_mlp": 0.2031108, + "epoch": 0.8190590710957463, + "flos": 21397696721280.0, + "grad_norm": 20.514331944934668, + "language_loss": 0.83619738, + "learning_rate": 3.3362033977617653e-07, + "loss": 0.85102284, + "num_input_tokens_seen": 293914680, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.26147461, + "step": 13623, + "time_per_iteration": 2.681013345718384 + }, + { + "auxiliary_loss_clip": 0.01252441, + "auxiliary_loss_mlp": 0.00234643, + "balance_loss_clip": 1.03097248, + "balance_loss_mlp": 0.20958543, + "epoch": 0.8191191943484143, + "flos": 38796451367040.0, + "grad_norm": 578.633403426441, + "language_loss": 0.7105062, + "learning_rate": 3.3340500409545527e-07, + "loss": 0.72537702, + "num_input_tokens_seen": 293936480, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.25048828, + "step": 13624, + "time_per_iteration": 2.7900772094726562 + }, + { + "auxiliary_loss_clip": 0.01230671, + "auxiliary_loss_mlp": 0.00217994, + "balance_loss_clip": 1.01641321, + "balance_loss_mlp": 0.19423531, + "epoch": 0.8191793176010822, + "flos": 25446516831360.0, + "grad_norm": 142.43244871160977, + "language_loss": 0.85976791, + "learning_rate": 3.3318973161218386e-07, + "loss": 0.87425458, + "num_input_tokens_seen": 293957815, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.23779297, + "step": 13625, + "time_per_iteration": 2.7486565113067627 + }, + { + "auxiliary_loss_clip": 0.01259118, + "auxiliary_loss_mlp": 0.00229021, + "balance_loss_clip": 1.03150463, + "balance_loss_mlp": 0.20210335, + "epoch": 0.8192394408537502, + "flos": 25083029151360.0, + "grad_norm": 6.637832144209108, + "language_loss": 0.87028491, + "learning_rate": 3.329745223345244e-07, + "loss": 0.88516629, + "num_input_tokens_seen": 293975440, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.26904297, + "step": 13626, + "time_per_iteration": 2.6799509525299072 + }, + { + "auxiliary_loss_clip": 0.01239807, + "auxiliary_loss_mlp": 0.00221908, + "balance_loss_clip": 1.02225757, + "balance_loss_mlp": 0.19921096, + "epoch": 0.8192995641064181, + "flos": 27673732563840.0, + "grad_norm": 9.5238976292753, + "language_loss": 0.80082422, + "learning_rate": 3.3275937627063823e-07, + "loss": 0.81544137, + "num_input_tokens_seen": 293997540, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.22668457, + "step": 13627, + "time_per_iteration": 2.823000907897949 + }, + { + "auxiliary_loss_clip": 0.01270842, + "auxiliary_loss_mlp": 0.00223706, + "balance_loss_clip": 1.038517, + "balance_loss_mlp": 0.19782606, + "epoch": 0.8193596873590862, + "flos": 21288492397440.0, + "grad_norm": 393.7156227157432, + "language_loss": 0.76812488, + "learning_rate": 3.3254429342868353e-07, + "loss": 0.78307033, + "num_input_tokens_seen": 294017030, + "router_z_loss_clip": 2.32226562, + "router_z_loss_mlp": 0.2590332, + "step": 13628, + "time_per_iteration": 2.6596760749816895 + }, + { + "auxiliary_loss_clip": 0.01254449, + "auxiliary_loss_mlp": 0.00229448, + "balance_loss_clip": 1.03008604, + "balance_loss_mlp": 0.20439002, + "epoch": 0.8194198106117541, + "flos": 17492626840320.0, + "grad_norm": 2.417840381882748, + "language_loss": 0.92729759, + "learning_rate": 3.323292738168171e-07, + "loss": 0.94213653, + "num_input_tokens_seen": 294035700, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.25061035, + "step": 13629, + "time_per_iteration": 2.724249839782715 + }, + { + "auxiliary_loss_clip": 0.01251406, + "auxiliary_loss_mlp": 0.00247312, + "balance_loss_clip": 1.02811038, + "balance_loss_mlp": 0.22122872, + "epoch": 0.8194799338644221, + "flos": 15267925059840.0, + "grad_norm": 181.3411595057849, + "language_loss": 0.83143896, + "learning_rate": 3.3211431744319084e-07, + "loss": 0.84642613, + "num_input_tokens_seen": 294049730, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.26086426, + "step": 13630, + "time_per_iteration": 2.66106915473938 + }, + { + "auxiliary_loss_clip": 0.01248923, + "auxiliary_loss_mlp": 0.00244801, + "balance_loss_clip": 1.02701235, + "balance_loss_mlp": 0.21880096, + "epoch": 0.81954005711709, + "flos": 14718814871040.0, + "grad_norm": 614.7116099961211, + "language_loss": 0.8014667, + "learning_rate": 3.31899424315957e-07, + "loss": 0.81640387, + "num_input_tokens_seen": 294066545, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.2598877, + "step": 13631, + "time_per_iteration": 2.6554653644561768 + }, + { + "auxiliary_loss_clip": 0.01244195, + "auxiliary_loss_mlp": 0.0021925, + "balance_loss_clip": 1.02374268, + "balance_loss_mlp": 0.19493136, + "epoch": 0.819600180369758, + "flos": 23074042498560.0, + "grad_norm": 56.88657449291879, + "language_loss": 0.82867718, + "learning_rate": 3.3168459444326447e-07, + "loss": 0.84331167, + "num_input_tokens_seen": 294087455, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.24328613, + "step": 13632, + "time_per_iteration": 2.659135580062866 + }, + { + "auxiliary_loss_clip": 0.01245434, + "auxiliary_loss_mlp": 0.00210328, + "balance_loss_clip": 1.02502787, + "balance_loss_mlp": 0.18686718, + "epoch": 0.8196603036224259, + "flos": 27599792417280.0, + "grad_norm": 1277.1665420570603, + "language_loss": 0.73602366, + "learning_rate": 3.314698278332588e-07, + "loss": 0.75058126, + "num_input_tokens_seen": 294107480, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.23449707, + "step": 13633, + "time_per_iteration": 2.7224810123443604 + }, + { + "auxiliary_loss_clip": 0.01226567, + "auxiliary_loss_mlp": 0.00215108, + "balance_loss_clip": 1.01582181, + "balance_loss_mlp": 0.19245851, + "epoch": 0.8197204268750939, + "flos": 28582020800640.0, + "grad_norm": 16.838950097140543, + "language_loss": 0.82175523, + "learning_rate": 3.3125512449408513e-07, + "loss": 0.83617198, + "num_input_tokens_seen": 294130115, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.2265625, + "step": 13634, + "time_per_iteration": 2.746833086013794 + }, + { + "auxiliary_loss_clip": 0.01249416, + "auxiliary_loss_mlp": 0.00222323, + "balance_loss_clip": 1.03161204, + "balance_loss_mlp": 0.1983735, + "epoch": 0.819780550127762, + "flos": 23258300290560.0, + "grad_norm": 23.25011803649152, + "language_loss": 0.8891021, + "learning_rate": 3.310404844338841e-07, + "loss": 0.9038195, + "num_input_tokens_seen": 294148495, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.23962402, + "step": 13635, + "time_per_iteration": 2.6425766944885254 + }, + { + "auxiliary_loss_clip": 0.01237155, + "auxiliary_loss_mlp": 0.00240142, + "balance_loss_clip": 1.01828504, + "balance_loss_mlp": 0.21531102, + "epoch": 0.8198406733804299, + "flos": 26685255214080.0, + "grad_norm": 32.2447068419712, + "language_loss": 0.81934237, + "learning_rate": 3.308259076607949e-07, + "loss": 0.83411539, + "num_input_tokens_seen": 294169595, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.24829102, + "step": 13636, + "time_per_iteration": 2.737190008163452 + }, + { + "auxiliary_loss_clip": 0.01246973, + "auxiliary_loss_mlp": 0.00229042, + "balance_loss_clip": 1.02858293, + "balance_loss_mlp": 0.20251754, + "epoch": 0.8199007966330979, + "flos": 20084084438400.0, + "grad_norm": 7.296427976224378, + "language_loss": 0.89477926, + "learning_rate": 3.3061139418295445e-07, + "loss": 0.9095394, + "num_input_tokens_seen": 294183885, + "router_z_loss_clip": 2.18261719, + "router_z_loss_mlp": 0.26513672, + "step": 13637, + "time_per_iteration": 2.61794114112854 + }, + { + "auxiliary_loss_clip": 0.01257128, + "auxiliary_loss_mlp": 0.00214053, + "balance_loss_clip": 1.0336715, + "balance_loss_mlp": 0.18713582, + "epoch": 0.8199609198857658, + "flos": 31902788142720.0, + "grad_norm": 13.16731069067959, + "language_loss": 0.7988708, + "learning_rate": 3.3039694400849725e-07, + "loss": 0.8135826, + "num_input_tokens_seen": 294200150, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.26928711, + "step": 13638, + "time_per_iteration": 2.7373180389404297 + }, + { + "auxiliary_loss_clip": 0.01257292, + "auxiliary_loss_mlp": 0.00229786, + "balance_loss_clip": 1.03119087, + "balance_loss_mlp": 0.20381021, + "epoch": 0.8200210431384338, + "flos": 26470150617600.0, + "grad_norm": 4.791338275655386, + "language_loss": 0.89958721, + "learning_rate": 3.3018255714555564e-07, + "loss": 0.91445804, + "num_input_tokens_seen": 294220385, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.2598877, + "step": 13639, + "time_per_iteration": 2.6779372692108154 + }, + { + "auxiliary_loss_clip": 0.01224772, + "auxiliary_loss_mlp": 0.0021258, + "balance_loss_clip": 1.01015687, + "balance_loss_mlp": 0.19045471, + "epoch": 0.8200811663911017, + "flos": 22091454979200.0, + "grad_norm": 5.225023142683345, + "language_loss": 0.87684983, + "learning_rate": 3.299682336022589e-07, + "loss": 0.89122337, + "num_input_tokens_seen": 294239355, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.22155762, + "step": 13640, + "time_per_iteration": 2.679725170135498 + }, + { + "auxiliary_loss_clip": 0.01267753, + "auxiliary_loss_mlp": 0.00224948, + "balance_loss_clip": 1.03786564, + "balance_loss_mlp": 0.19959234, + "epoch": 0.8201412896437698, + "flos": 37593659520000.0, + "grad_norm": 8.439023030084238, + "language_loss": 0.70380116, + "learning_rate": 3.297539733867336e-07, + "loss": 0.71872818, + "num_input_tokens_seen": 294259395, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.25354004, + "step": 13641, + "time_per_iteration": 2.782400131225586 + }, + { + "auxiliary_loss_clip": 0.01239016, + "auxiliary_loss_mlp": 0.0021963, + "balance_loss_clip": 1.01888204, + "balance_loss_mlp": 0.19560906, + "epoch": 0.8202014128964377, + "flos": 19646333389440.0, + "grad_norm": 2.932030131352737, + "language_loss": 0.81939214, + "learning_rate": 3.295397765071055e-07, + "loss": 0.83397859, + "num_input_tokens_seen": 294277365, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.23999023, + "step": 13642, + "time_per_iteration": 2.7147645950317383 + }, + { + "auxiliary_loss_clip": 0.01243662, + "auxiliary_loss_mlp": 0.00222272, + "balance_loss_clip": 1.02568531, + "balance_loss_mlp": 0.20018294, + "epoch": 0.8202615361491057, + "flos": 31467335564160.0, + "grad_norm": 14.787210729213797, + "language_loss": 0.80226874, + "learning_rate": 3.2932564297149615e-07, + "loss": 0.81692809, + "num_input_tokens_seen": 294297555, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.22094727, + "step": 13643, + "time_per_iteration": 2.7551143169403076 + }, + { + "auxiliary_loss_clip": 0.01232984, + "auxiliary_loss_mlp": 0.00236832, + "balance_loss_clip": 1.01829958, + "balance_loss_mlp": 0.2126442, + "epoch": 0.8203216594017736, + "flos": 24715555061760.0, + "grad_norm": 6.8590470912508215, + "language_loss": 0.72676098, + "learning_rate": 3.291115727880256e-07, + "loss": 0.74145919, + "num_input_tokens_seen": 294317600, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.24169922, + "step": 13644, + "time_per_iteration": 2.6882035732269287 + }, + { + "auxiliary_loss_clip": 0.01236148, + "auxiliary_loss_mlp": 0.00243983, + "balance_loss_clip": 1.02211404, + "balance_loss_mlp": 0.22049864, + "epoch": 0.8203817826544416, + "flos": 26031824951040.0, + "grad_norm": 23.820883903321622, + "language_loss": 0.77343839, + "learning_rate": 3.2889756596481234e-07, + "loss": 0.78823966, + "num_input_tokens_seen": 294340215, + "router_z_loss_clip": 2.13769531, + "router_z_loss_mlp": 0.23474121, + "step": 13645, + "time_per_iteration": 2.723560094833374 + }, + { + "auxiliary_loss_clip": 0.0123103, + "auxiliary_loss_mlp": 0.00212428, + "balance_loss_clip": 1.01705456, + "balance_loss_mlp": 0.18785886, + "epoch": 0.8204419059071095, + "flos": 25954544839680.0, + "grad_norm": 14.35527184315142, + "language_loss": 0.78650713, + "learning_rate": 3.286836225099707e-07, + "loss": 0.80094177, + "num_input_tokens_seen": 294358590, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.24584961, + "step": 13646, + "time_per_iteration": 2.7297866344451904 + }, + { + "auxiliary_loss_clip": 0.01271724, + "auxiliary_loss_mlp": 0.00243041, + "balance_loss_clip": 1.04672635, + "balance_loss_mlp": 0.21491942, + "epoch": 0.8205020291597775, + "flos": 23580059345280.0, + "grad_norm": 5.120213029104192, + "language_loss": 0.86889851, + "learning_rate": 3.284697424316132e-07, + "loss": 0.88404614, + "num_input_tokens_seen": 294375825, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.28137207, + "step": 13647, + "time_per_iteration": 2.6841328144073486 + }, + { + "auxiliary_loss_clip": 0.01243732, + "auxiliary_loss_mlp": 0.00218895, + "balance_loss_clip": 1.02677619, + "balance_loss_mlp": 0.19617419, + "epoch": 0.8205621524124456, + "flos": 26799164219520.0, + "grad_norm": 15.570122333779041, + "language_loss": 0.75166202, + "learning_rate": 3.2825592573785034e-07, + "loss": 0.76628828, + "num_input_tokens_seen": 294398500, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.22753906, + "step": 13648, + "time_per_iteration": 2.7461588382720947 + }, + { + "auxiliary_loss_clip": 0.01257447, + "auxiliary_loss_mlp": 0.00228439, + "balance_loss_clip": 1.03442454, + "balance_loss_mlp": 0.20396492, + "epoch": 0.8206222756651135, + "flos": 27527863432320.0, + "grad_norm": 5.399678894828169, + "language_loss": 0.87199652, + "learning_rate": 3.28042172436791e-07, + "loss": 0.88685536, + "num_input_tokens_seen": 294418840, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.24462891, + "step": 13649, + "time_per_iteration": 4.1821746826171875 + }, + { + "auxiliary_loss_clip": 0.01260453, + "auxiliary_loss_mlp": 0.00232517, + "balance_loss_clip": 1.03754437, + "balance_loss_mlp": 0.20716138, + "epoch": 0.8206823989177815, + "flos": 21178605715200.0, + "grad_norm": 2.934184487465175, + "language_loss": 0.76703769, + "learning_rate": 3.278284825365396e-07, + "loss": 0.7819674, + "num_input_tokens_seen": 294438215, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.25341797, + "step": 13650, + "time_per_iteration": 4.081311225891113 + }, + { + "auxiliary_loss_clip": 0.01263501, + "auxiliary_loss_mlp": 0.00221003, + "balance_loss_clip": 1.04180765, + "balance_loss_mlp": 0.19666037, + "epoch": 0.8207425221704494, + "flos": 11509622150400.0, + "grad_norm": 68.8148422625759, + "language_loss": 0.73510504, + "learning_rate": 3.276148560452001e-07, + "loss": 0.74995005, + "num_input_tokens_seen": 294455260, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.2434082, + "step": 13651, + "time_per_iteration": 2.6158719062805176 + }, + { + "auxiliary_loss_clip": 0.01272527, + "auxiliary_loss_mlp": 0.00243324, + "balance_loss_clip": 1.04788935, + "balance_loss_mlp": 0.21757466, + "epoch": 0.8208026454231174, + "flos": 19791987039360.0, + "grad_norm": 21.950805584674466, + "language_loss": 0.79784769, + "learning_rate": 3.2740129297087293e-07, + "loss": 0.81300622, + "num_input_tokens_seen": 294473205, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.25732422, + "step": 13652, + "time_per_iteration": 2.6387953758239746 + }, + { + "auxiliary_loss_clip": 0.01219008, + "auxiliary_loss_mlp": 0.00215244, + "balance_loss_clip": 1.01057911, + "balance_loss_mlp": 0.19483548, + "epoch": 0.8208627686757853, + "flos": 15667538843520.0, + "grad_norm": 181.75661215739916, + "language_loss": 0.80228043, + "learning_rate": 3.271877933216558e-07, + "loss": 0.81662297, + "num_input_tokens_seen": 294490645, + "router_z_loss_clip": 2.0859375, + "router_z_loss_mlp": 0.20422363, + "step": 13653, + "time_per_iteration": 2.6369502544403076 + }, + { + "auxiliary_loss_clip": 0.01271833, + "auxiliary_loss_mlp": 0.00227553, + "balance_loss_clip": 1.04706132, + "balance_loss_mlp": 0.20175579, + "epoch": 0.8209228919284534, + "flos": 37482659516160.0, + "grad_norm": 17.004547955933646, + "language_loss": 0.70912635, + "learning_rate": 3.269743571056451e-07, + "loss": 0.72412014, + "num_input_tokens_seen": 294513500, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.25830078, + "step": 13654, + "time_per_iteration": 2.7935566902160645 + }, + { + "auxiliary_loss_clip": 0.01254462, + "auxiliary_loss_mlp": 0.00221711, + "balance_loss_clip": 1.03050542, + "balance_loss_mlp": 0.19722551, + "epoch": 0.8209830151811213, + "flos": 23112969863040.0, + "grad_norm": 2.707119339929814, + "language_loss": 0.76125014, + "learning_rate": 3.2676098433093447e-07, + "loss": 0.77601182, + "num_input_tokens_seen": 294535710, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.24487305, + "step": 13655, + "time_per_iteration": 4.268051624298096 + }, + { + "auxiliary_loss_clip": 0.01239907, + "auxiliary_loss_mlp": 0.00218908, + "balance_loss_clip": 1.02497005, + "balance_loss_mlp": 0.19565003, + "epoch": 0.8210431384337893, + "flos": 21288169175040.0, + "grad_norm": 81.889303743954, + "language_loss": 0.89060974, + "learning_rate": 3.265476750056162e-07, + "loss": 0.90519786, + "num_input_tokens_seen": 294554055, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.23242188, + "step": 13656, + "time_per_iteration": 2.804494857788086 + }, + { + "auxiliary_loss_clip": 0.01245342, + "auxiliary_loss_mlp": 0.0024035, + "balance_loss_clip": 1.02566564, + "balance_loss_mlp": 0.21424335, + "epoch": 0.8211032616864572, + "flos": 11502403516800.0, + "grad_norm": 26.257839822303094, + "language_loss": 0.82505226, + "learning_rate": 3.2633442913777654e-07, + "loss": 0.83990914, + "num_input_tokens_seen": 294570390, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.2611084, + "step": 13657, + "time_per_iteration": 2.8462464809417725 + }, + { + "auxiliary_loss_clip": 0.01238647, + "auxiliary_loss_mlp": 0.0018821, + "balance_loss_clip": 1.02005768, + "balance_loss_mlp": 0.16380775, + "epoch": 0.8211633849391252, + "flos": 29821477455360.0, + "grad_norm": 16.67974773419019, + "language_loss": 0.64542651, + "learning_rate": 3.2612124673550325e-07, + "loss": 0.65969503, + "num_input_tokens_seen": 294593050, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.24401855, + "step": 13658, + "time_per_iteration": 2.7173759937286377 + }, + { + "auxiliary_loss_clip": 0.01235294, + "auxiliary_loss_mlp": 0.00213987, + "balance_loss_clip": 1.01976478, + "balance_loss_mlp": 0.1910753, + "epoch": 0.8212235081917931, + "flos": 13115439573120.0, + "grad_norm": 41.57184216171161, + "language_loss": 0.89648181, + "learning_rate": 3.259081278068805e-07, + "loss": 0.91097462, + "num_input_tokens_seen": 294608550, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.22924805, + "step": 13659, + "time_per_iteration": 4.0472471714019775 + }, + { + "auxiliary_loss_clip": 0.01216562, + "auxiliary_loss_mlp": 0.00224556, + "balance_loss_clip": 1.01067019, + "balance_loss_mlp": 0.20283625, + "epoch": 0.8212836314444611, + "flos": 40515351782400.0, + "grad_norm": 7.205116579726877, + "language_loss": 0.65633023, + "learning_rate": 3.256950723599887e-07, + "loss": 0.67074138, + "num_input_tokens_seen": 294630380, + "router_z_loss_clip": 2.05761719, + "router_z_loss_mlp": 0.21716309, + "step": 13660, + "time_per_iteration": 2.860856056213379 + }, + { + "auxiliary_loss_clip": 0.01250479, + "auxiliary_loss_mlp": 0.0024773, + "balance_loss_clip": 1.03152609, + "balance_loss_mlp": 0.22205274, + "epoch": 0.8213437546971292, + "flos": 18770543982720.0, + "grad_norm": 9.301822166249279, + "language_loss": 0.81717086, + "learning_rate": 3.254820804029075e-07, + "loss": 0.83215296, + "num_input_tokens_seen": 294648655, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.2565918, + "step": 13661, + "time_per_iteration": 2.6207244396209717 + }, + { + "auxiliary_loss_clip": 0.01279097, + "auxiliary_loss_mlp": 0.00251919, + "balance_loss_clip": 1.05228782, + "balance_loss_mlp": 0.22558588, + "epoch": 0.8214038779497971, + "flos": 19682279925120.0, + "grad_norm": 12.792296415836011, + "language_loss": 0.82918072, + "learning_rate": 3.252691519437143e-07, + "loss": 0.84449089, + "num_input_tokens_seen": 294666915, + "router_z_loss_clip": 2.265625, + "router_z_loss_mlp": 0.2635498, + "step": 13662, + "time_per_iteration": 2.689063549041748 + }, + { + "auxiliary_loss_clip": 0.01108874, + "auxiliary_loss_mlp": 0.00155879, + "balance_loss_clip": 0.95874131, + "balance_loss_mlp": 0.146008, + "epoch": 0.8214640012024651, + "flos": 71602969697280.0, + "grad_norm": 0.7408562411072988, + "language_loss": 0.5355463, + "learning_rate": 3.250562869904825e-07, + "loss": 0.54819381, + "num_input_tokens_seen": 294731545, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.09863281, + "step": 13663, + "time_per_iteration": 3.314476251602173 + }, + { + "auxiliary_loss_clip": 0.0125228, + "auxiliary_loss_mlp": 0.00243212, + "balance_loss_clip": 1.03496218, + "balance_loss_mlp": 0.21904835, + "epoch": 0.821524124455133, + "flos": 14757203531520.0, + "grad_norm": 29.046762703257798, + "language_loss": 0.74119705, + "learning_rate": 3.248434855512838e-07, + "loss": 0.75615197, + "num_input_tokens_seen": 294748745, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.24169922, + "step": 13664, + "time_per_iteration": 2.6688344478607178 + }, + { + "auxiliary_loss_clip": 0.01227981, + "auxiliary_loss_mlp": 0.00235818, + "balance_loss_clip": 1.01918674, + "balance_loss_mlp": 0.21283421, + "epoch": 0.821584247707801, + "flos": 25082274965760.0, + "grad_norm": 5.613330639938815, + "language_loss": 0.81126535, + "learning_rate": 3.246307476341881e-07, + "loss": 0.8259033, + "num_input_tokens_seen": 294768955, + "router_z_loss_clip": 2.08886719, + "router_z_loss_mlp": 0.22998047, + "step": 13665, + "time_per_iteration": 2.6938154697418213 + }, + { + "auxiliary_loss_clip": 0.01240915, + "auxiliary_loss_mlp": 0.0022119, + "balance_loss_clip": 1.02450442, + "balance_loss_mlp": 0.19694272, + "epoch": 0.8216443709604689, + "flos": 36830701710720.0, + "grad_norm": 19.312257949647538, + "language_loss": 0.75910032, + "learning_rate": 3.2441807324726256e-07, + "loss": 0.77372134, + "num_input_tokens_seen": 294789250, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.24243164, + "step": 13666, + "time_per_iteration": 2.773989677429199 + }, + { + "auxiliary_loss_clip": 0.01249972, + "auxiliary_loss_mlp": 0.00212411, + "balance_loss_clip": 1.03317082, + "balance_loss_mlp": 0.18883166, + "epoch": 0.821704494213137, + "flos": 25081808088960.0, + "grad_norm": 2.3392429262338355, + "language_loss": 0.84126705, + "learning_rate": 3.2420546239857174e-07, + "loss": 0.85589087, + "num_input_tokens_seen": 294809760, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.2355957, + "step": 13667, + "time_per_iteration": 2.7081947326660156 + }, + { + "auxiliary_loss_clip": 0.01250573, + "auxiliary_loss_mlp": 0.00220989, + "balance_loss_clip": 1.03033924, + "balance_loss_mlp": 0.19720653, + "epoch": 0.8217646174658049, + "flos": 14356117290240.0, + "grad_norm": 32.91499290911049, + "language_loss": 0.84639972, + "learning_rate": 3.239929150961773e-07, + "loss": 0.86111534, + "num_input_tokens_seen": 294826495, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.2376709, + "step": 13668, + "time_per_iteration": 2.6379787921905518 + }, + { + "auxiliary_loss_clip": 0.01247541, + "auxiliary_loss_mlp": 0.00222743, + "balance_loss_clip": 1.03325963, + "balance_loss_mlp": 0.19836485, + "epoch": 0.8218247407184729, + "flos": 22090557139200.0, + "grad_norm": 66.67600582677461, + "language_loss": 0.83786201, + "learning_rate": 3.2378043134813984e-07, + "loss": 0.85256481, + "num_input_tokens_seen": 294845370, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.24389648, + "step": 13669, + "time_per_iteration": 2.7121329307556152 + }, + { + "auxiliary_loss_clip": 0.01235252, + "auxiliary_loss_mlp": 0.00204454, + "balance_loss_clip": 1.02056801, + "balance_loss_mlp": 0.18193524, + "epoch": 0.8218848639711408, + "flos": 16764035368320.0, + "grad_norm": 10.91392769205187, + "language_loss": 0.84687203, + "learning_rate": 3.235680111625161e-07, + "loss": 0.86126906, + "num_input_tokens_seen": 294863740, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.22521973, + "step": 13670, + "time_per_iteration": 2.6542582511901855 + }, + { + "auxiliary_loss_clip": 0.01265687, + "auxiliary_loss_mlp": 0.00245999, + "balance_loss_clip": 1.04445469, + "balance_loss_mlp": 0.22017856, + "epoch": 0.8219449872238088, + "flos": 25994801007360.0, + "grad_norm": 440.46525290282216, + "language_loss": 0.82288915, + "learning_rate": 3.2335565454736123e-07, + "loss": 0.83800602, + "num_input_tokens_seen": 294882815, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.25830078, + "step": 13671, + "time_per_iteration": 2.68607759475708 + }, + { + "auxiliary_loss_clip": 0.01288915, + "auxiliary_loss_mlp": 0.0023902, + "balance_loss_clip": 1.05405521, + "balance_loss_mlp": 0.2109459, + "epoch": 0.8220051104764767, + "flos": 20778094091520.0, + "grad_norm": 38.35309260010062, + "language_loss": 0.84440565, + "learning_rate": 3.23143361510728e-07, + "loss": 0.859685, + "num_input_tokens_seen": 294901985, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.28063965, + "step": 13672, + "time_per_iteration": 2.6699037551879883 + }, + { + "auxiliary_loss_clip": 0.0124324, + "auxiliary_loss_mlp": 0.00252, + "balance_loss_clip": 1.02960837, + "balance_loss_mlp": 0.22704962, + "epoch": 0.8220652337291448, + "flos": 14574849160320.0, + "grad_norm": 18.142793899282335, + "language_loss": 0.81932831, + "learning_rate": 3.2293113206066733e-07, + "loss": 0.83428061, + "num_input_tokens_seen": 294919705, + "router_z_loss_clip": 2.13574219, + "router_z_loss_mlp": 0.24951172, + "step": 13673, + "time_per_iteration": 2.7024781703948975 + }, + { + "auxiliary_loss_clip": 0.01256177, + "auxiliary_loss_mlp": 0.00232818, + "balance_loss_clip": 1.03212619, + "balance_loss_mlp": 0.20828438, + "epoch": 0.8221253569818128, + "flos": 23805866194560.0, + "grad_norm": 72.21136026046727, + "language_loss": 0.87313193, + "learning_rate": 3.227189662052254e-07, + "loss": 0.88802189, + "num_input_tokens_seen": 294939900, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.24523926, + "step": 13674, + "time_per_iteration": 2.6698179244995117 + }, + { + "auxiliary_loss_clip": 0.01263377, + "auxiliary_loss_mlp": 0.00253738, + "balance_loss_clip": 1.03981853, + "balance_loss_mlp": 0.22835883, + "epoch": 0.8221854802344807, + "flos": 21288241002240.0, + "grad_norm": 5.878062521689124, + "language_loss": 0.79801971, + "learning_rate": 3.225068639524484e-07, + "loss": 0.81319082, + "num_input_tokens_seen": 294959110, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.25390625, + "step": 13675, + "time_per_iteration": 2.6710855960845947 + }, + { + "auxiliary_loss_clip": 0.01234729, + "auxiliary_loss_mlp": 0.00231161, + "balance_loss_clip": 1.02227378, + "balance_loss_mlp": 0.20801061, + "epoch": 0.8222456034871487, + "flos": 20956785275520.0, + "grad_norm": 6.541662131526932, + "language_loss": 0.80523026, + "learning_rate": 3.2229482531037965e-07, + "loss": 0.81988919, + "num_input_tokens_seen": 294978660, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.23156738, + "step": 13676, + "time_per_iteration": 2.7572646141052246 + }, + { + "auxiliary_loss_clip": 0.01229733, + "auxiliary_loss_mlp": 0.00242074, + "balance_loss_clip": 1.02137756, + "balance_loss_mlp": 0.21922135, + "epoch": 0.8223057267398166, + "flos": 21397517153280.0, + "grad_norm": 25.893425971811478, + "language_loss": 0.87010145, + "learning_rate": 3.2208285028705893e-07, + "loss": 0.88481957, + "num_input_tokens_seen": 294998075, + "router_z_loss_clip": 2.08203125, + "router_z_loss_mlp": 0.2286377, + "step": 13677, + "time_per_iteration": 2.6480190753936768 + }, + { + "auxiliary_loss_clip": 0.01246649, + "auxiliary_loss_mlp": 0.0024106, + "balance_loss_clip": 1.03006721, + "balance_loss_mlp": 0.21681239, + "epoch": 0.8223658499924846, + "flos": 15268212368640.0, + "grad_norm": 172.844779936462, + "language_loss": 0.81316245, + "learning_rate": 3.218709388905245e-07, + "loss": 0.82803953, + "num_input_tokens_seen": 295015950, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.24243164, + "step": 13678, + "time_per_iteration": 2.678593635559082 + }, + { + "auxiliary_loss_clip": 0.01249737, + "auxiliary_loss_mlp": 0.00238603, + "balance_loss_clip": 1.03460026, + "balance_loss_mlp": 0.21358114, + "epoch": 0.8224259732451525, + "flos": 31249537447680.0, + "grad_norm": 34.378082543398804, + "language_loss": 0.80116951, + "learning_rate": 3.216590911288133e-07, + "loss": 0.81605291, + "num_input_tokens_seen": 295036800, + "router_z_loss_clip": 2.15332031, + "router_z_loss_mlp": 0.25036621, + "step": 13679, + "time_per_iteration": 2.7520902156829834 + }, + { + "auxiliary_loss_clip": 0.01229325, + "auxiliary_loss_mlp": 0.00213411, + "balance_loss_clip": 1.01534271, + "balance_loss_mlp": 0.18999822, + "epoch": 0.8224860964978206, + "flos": 21574628138880.0, + "grad_norm": 4.042398476127678, + "language_loss": 0.7843374, + "learning_rate": 3.214473070099564e-07, + "loss": 0.79876477, + "num_input_tokens_seen": 295055300, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.23413086, + "step": 13680, + "time_per_iteration": 2.720616340637207 + }, + { + "auxiliary_loss_clip": 0.01247513, + "auxiliary_loss_mlp": 0.00234018, + "balance_loss_clip": 1.03880715, + "balance_loss_mlp": 0.2103077, + "epoch": 0.8225462197504885, + "flos": 25483217552640.0, + "grad_norm": 47.33663552588526, + "language_loss": 0.67661369, + "learning_rate": 3.21235586541986e-07, + "loss": 0.69142902, + "num_input_tokens_seen": 295076420, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.23693848, + "step": 13681, + "time_per_iteration": 2.686305284500122 + }, + { + "auxiliary_loss_clip": 0.01243938, + "auxiliary_loss_mlp": 0.00240993, + "balance_loss_clip": 1.02640605, + "balance_loss_mlp": 0.21648324, + "epoch": 0.8226063430031565, + "flos": 39385458587520.0, + "grad_norm": 13.530218311318416, + "language_loss": 0.77126098, + "learning_rate": 3.2102392973293047e-07, + "loss": 0.78611028, + "num_input_tokens_seen": 295100540, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.24499512, + "step": 13682, + "time_per_iteration": 2.858214855194092 + }, + { + "auxiliary_loss_clip": 0.01250528, + "auxiliary_loss_mlp": 0.00235185, + "balance_loss_clip": 1.03342938, + "balance_loss_mlp": 0.20962654, + "epoch": 0.8226664662558244, + "flos": 22815269942400.0, + "grad_norm": 6.374335967324102, + "language_loss": 0.87066352, + "learning_rate": 3.20812336590816e-07, + "loss": 0.8855207, + "num_input_tokens_seen": 295120180, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.25549316, + "step": 13683, + "time_per_iteration": 2.6456854343414307 + }, + { + "auxiliary_loss_clip": 0.01237029, + "auxiliary_loss_mlp": 0.0022887, + "balance_loss_clip": 1.02489209, + "balance_loss_mlp": 0.20567146, + "epoch": 0.8227265895084924, + "flos": 25665607837440.0, + "grad_norm": 54.25876249103956, + "language_loss": 0.95874703, + "learning_rate": 3.206008071236661e-07, + "loss": 0.97340602, + "num_input_tokens_seen": 295138530, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.23193359, + "step": 13684, + "time_per_iteration": 2.737603187561035 + }, + { + "auxiliary_loss_clip": 0.01228565, + "auxiliary_loss_mlp": 0.00213356, + "balance_loss_clip": 1.01550329, + "balance_loss_mlp": 0.18901309, + "epoch": 0.8227867127611603, + "flos": 26179274280960.0, + "grad_norm": 71.91474888039888, + "language_loss": 0.84901667, + "learning_rate": 3.2038934133950157e-07, + "loss": 0.86343592, + "num_input_tokens_seen": 295160260, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.24365234, + "step": 13685, + "time_per_iteration": 2.7387423515319824 + }, + { + "auxiliary_loss_clip": 0.01245198, + "auxiliary_loss_mlp": 0.0021491, + "balance_loss_clip": 1.02883542, + "balance_loss_mlp": 0.19067426, + "epoch": 0.8228468360138284, + "flos": 22018053536640.0, + "grad_norm": 9.175162194788308, + "language_loss": 0.75161052, + "learning_rate": 3.2017793924634194e-07, + "loss": 0.76621151, + "num_input_tokens_seen": 295177055, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.24243164, + "step": 13686, + "time_per_iteration": 2.6856203079223633 + }, + { + "auxiliary_loss_clip": 0.01252212, + "auxiliary_loss_mlp": 0.00212692, + "balance_loss_clip": 1.03255177, + "balance_loss_mlp": 0.18710992, + "epoch": 0.8229069592664963, + "flos": 14903359971840.0, + "grad_norm": 6.618273250106911, + "language_loss": 0.8738755, + "learning_rate": 3.1996660085220263e-07, + "loss": 0.88852453, + "num_input_tokens_seen": 295193870, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.25622559, + "step": 13687, + "time_per_iteration": 2.642263412475586 + }, + { + "auxiliary_loss_clip": 0.01250415, + "auxiliary_loss_mlp": 0.0022075, + "balance_loss_clip": 1.03233647, + "balance_loss_mlp": 0.19613338, + "epoch": 0.8229670825191643, + "flos": 15669478177920.0, + "grad_norm": 8.116875019613335, + "language_loss": 0.80769032, + "learning_rate": 3.1975532616509825e-07, + "loss": 0.82240194, + "num_input_tokens_seen": 295211040, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.24597168, + "step": 13688, + "time_per_iteration": 2.706721305847168 + }, + { + "auxiliary_loss_clip": 0.01237136, + "auxiliary_loss_mlp": 0.00221888, + "balance_loss_clip": 1.02440405, + "balance_loss_mlp": 0.19907084, + "epoch": 0.8230272057718323, + "flos": 23183498217600.0, + "grad_norm": 11.290484975133644, + "language_loss": 0.79901791, + "learning_rate": 3.1954411519304025e-07, + "loss": 0.81360817, + "num_input_tokens_seen": 295231300, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.22802734, + "step": 13689, + "time_per_iteration": 2.7404541969299316 + }, + { + "auxiliary_loss_clip": 0.01257332, + "auxiliary_loss_mlp": 0.0022619, + "balance_loss_clip": 1.03241658, + "balance_loss_mlp": 0.1988195, + "epoch": 0.8230873290245002, + "flos": 21032413361280.0, + "grad_norm": 2.713952788341214, + "language_loss": 0.75348747, + "learning_rate": 3.1933296794403887e-07, + "loss": 0.76832271, + "num_input_tokens_seen": 295251045, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.27416992, + "step": 13690, + "time_per_iteration": 2.7195749282836914 + }, + { + "auxiliary_loss_clip": 0.01242649, + "auxiliary_loss_mlp": 0.002309, + "balance_loss_clip": 1.02938831, + "balance_loss_mlp": 0.20780903, + "epoch": 0.8231474522771682, + "flos": 21250139650560.0, + "grad_norm": 70.44654865624261, + "language_loss": 0.92159998, + "learning_rate": 3.191218844260988e-07, + "loss": 0.93633544, + "num_input_tokens_seen": 295270225, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.23083496, + "step": 13691, + "time_per_iteration": 4.133214235305786 + }, + { + "auxiliary_loss_clip": 0.01253323, + "auxiliary_loss_mlp": 0.00233414, + "balance_loss_clip": 1.03353584, + "balance_loss_mlp": 0.2098701, + "epoch": 0.8232075755298361, + "flos": 23842028211840.0, + "grad_norm": 188.86806102349396, + "language_loss": 0.83057982, + "learning_rate": 3.189108646472252e-07, + "loss": 0.84544718, + "num_input_tokens_seen": 295288950, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.2355957, + "step": 13692, + "time_per_iteration": 4.138348817825317 + }, + { + "auxiliary_loss_clip": 0.01256276, + "auxiliary_loss_mlp": 0.00219079, + "balance_loss_clip": 1.03657186, + "balance_loss_mlp": 0.19503444, + "epoch": 0.8232676987825042, + "flos": 21653955325440.0, + "grad_norm": 1.703064595473075, + "language_loss": 0.76884782, + "learning_rate": 3.186999086154205e-07, + "loss": 0.78360134, + "num_input_tokens_seen": 295309405, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.24047852, + "step": 13693, + "time_per_iteration": 2.6602659225463867 + }, + { + "auxiliary_loss_clip": 0.01235106, + "auxiliary_loss_mlp": 0.00227539, + "balance_loss_clip": 1.02553749, + "balance_loss_mlp": 0.20618893, + "epoch": 0.8233278220351721, + "flos": 26322701287680.0, + "grad_norm": 5.9562302222932, + "language_loss": 0.88839418, + "learning_rate": 3.1848901633868355e-07, + "loss": 0.90302062, + "num_input_tokens_seen": 295331115, + "router_z_loss_clip": 2.09667969, + "router_z_loss_mlp": 0.21350098, + "step": 13694, + "time_per_iteration": 2.747666358947754 + }, + { + "auxiliary_loss_clip": 0.01239788, + "auxiliary_loss_mlp": 0.00217655, + "balance_loss_clip": 1.0233674, + "balance_loss_mlp": 0.19175132, + "epoch": 0.8233879452878401, + "flos": 21725812483200.0, + "grad_norm": 5.869027643170198, + "language_loss": 0.84043533, + "learning_rate": 3.182781878250118e-07, + "loss": 0.85500979, + "num_input_tokens_seen": 295350495, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.2590332, + "step": 13695, + "time_per_iteration": 2.6517174243927 + }, + { + "auxiliary_loss_clip": 0.01220973, + "auxiliary_loss_mlp": 0.0020508, + "balance_loss_clip": 1.01205659, + "balance_loss_mlp": 0.18319336, + "epoch": 0.823448068540508, + "flos": 20557746109440.0, + "grad_norm": 5.090813151301775, + "language_loss": 0.87449217, + "learning_rate": 3.1806742308239985e-07, + "loss": 0.8887527, + "num_input_tokens_seen": 295368225, + "router_z_loss_clip": 2.09082031, + "router_z_loss_mlp": 0.21875, + "step": 13696, + "time_per_iteration": 2.6309125423431396 + }, + { + "auxiliary_loss_clip": 0.01105385, + "auxiliary_loss_mlp": 0.00110521, + "balance_loss_clip": 0.95935684, + "balance_loss_mlp": 0.10322586, + "epoch": 0.823508191793176, + "flos": 67273688194560.0, + "grad_norm": 0.7262349095254194, + "language_loss": 0.63090885, + "learning_rate": 3.178567221188393e-07, + "loss": 0.64306796, + "num_input_tokens_seen": 295430035, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.07275391, + "step": 13697, + "time_per_iteration": 4.700345754623413 + }, + { + "auxiliary_loss_clip": 0.01237537, + "auxiliary_loss_mlp": 0.00217337, + "balance_loss_clip": 1.02631581, + "balance_loss_mlp": 0.19574869, + "epoch": 0.8235683150458439, + "flos": 17928402641280.0, + "grad_norm": 26.340340670157833, + "language_loss": 0.79147303, + "learning_rate": 3.1764608494232037e-07, + "loss": 0.80602169, + "num_input_tokens_seen": 295447765, + "router_z_loss_clip": 2.11230469, + "router_z_loss_mlp": 0.21569824, + "step": 13698, + "time_per_iteration": 2.605940818786621 + }, + { + "auxiliary_loss_clip": 0.01239521, + "auxiliary_loss_mlp": 0.002269, + "balance_loss_clip": 1.01893389, + "balance_loss_mlp": 0.20248568, + "epoch": 0.823628438298512, + "flos": 18916089891840.0, + "grad_norm": 3.23361335005492, + "language_loss": 0.80313373, + "learning_rate": 3.174355115608305e-07, + "loss": 0.81779796, + "num_input_tokens_seen": 295464810, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.24389648, + "step": 13699, + "time_per_iteration": 2.7057089805603027 + }, + { + "auxiliary_loss_clip": 0.01230184, + "auxiliary_loss_mlp": 0.002096, + "balance_loss_clip": 1.01932991, + "balance_loss_mlp": 0.18745068, + "epoch": 0.8236885615511799, + "flos": 18696460181760.0, + "grad_norm": 92.54941802529576, + "language_loss": 0.90949273, + "learning_rate": 3.1722500198235526e-07, + "loss": 0.92389059, + "num_input_tokens_seen": 295482605, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.22167969, + "step": 13700, + "time_per_iteration": 2.659273386001587 + }, + { + "auxiliary_loss_clip": 0.01240607, + "auxiliary_loss_mlp": 0.00220151, + "balance_loss_clip": 1.0226512, + "balance_loss_mlp": 0.19561726, + "epoch": 0.8237486848038479, + "flos": 23695009845120.0, + "grad_norm": 63.76990812076491, + "language_loss": 0.80563623, + "learning_rate": 3.170145562148763e-07, + "loss": 0.82024384, + "num_input_tokens_seen": 295503780, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.2454834, + "step": 13701, + "time_per_iteration": 4.130049705505371 + }, + { + "auxiliary_loss_clip": 0.01252803, + "auxiliary_loss_mlp": 0.00236022, + "balance_loss_clip": 1.03082907, + "balance_loss_mlp": 0.21122669, + "epoch": 0.8238088080565159, + "flos": 23441301106560.0, + "grad_norm": 45.3421641611917, + "language_loss": 0.80697507, + "learning_rate": 3.1680417426637384e-07, + "loss": 0.82186329, + "num_input_tokens_seen": 295522035, + "router_z_loss_clip": 2.21582031, + "router_z_loss_mlp": 0.2479248, + "step": 13702, + "time_per_iteration": 2.6792609691619873 + }, + { + "auxiliary_loss_clip": 0.01255957, + "auxiliary_loss_mlp": 0.00223299, + "balance_loss_clip": 1.03338742, + "balance_loss_mlp": 0.19701307, + "epoch": 0.8238689313091838, + "flos": 22746537267840.0, + "grad_norm": 98.87493402274612, + "language_loss": 0.8288964, + "learning_rate": 3.1659385614482603e-07, + "loss": 0.84368896, + "num_input_tokens_seen": 295541190, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.26269531, + "step": 13703, + "time_per_iteration": 2.6331515312194824 + }, + { + "auxiliary_loss_clip": 0.01260561, + "auxiliary_loss_mlp": 0.00230415, + "balance_loss_clip": 1.038872, + "balance_loss_mlp": 0.20386674, + "epoch": 0.8239290545618518, + "flos": 25630092264960.0, + "grad_norm": 12.134572155476082, + "language_loss": 0.79572046, + "learning_rate": 3.1638360185820755e-07, + "loss": 0.8106302, + "num_input_tokens_seen": 295558860, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.26501465, + "step": 13704, + "time_per_iteration": 2.7313072681427 + }, + { + "auxiliary_loss_clip": 0.01247891, + "auxiliary_loss_mlp": 0.00227987, + "balance_loss_clip": 1.03171611, + "balance_loss_mlp": 0.20433626, + "epoch": 0.8239891778145197, + "flos": 26026473824640.0, + "grad_norm": 32.560862376204454, + "language_loss": 0.71583915, + "learning_rate": 3.161734114144916e-07, + "loss": 0.73059791, + "num_input_tokens_seen": 295578155, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.23669434, + "step": 13705, + "time_per_iteration": 2.739715337753296 + }, + { + "auxiliary_loss_clip": 0.0124168, + "auxiliary_loss_mlp": 0.00254802, + "balance_loss_clip": 1.0227921, + "balance_loss_mlp": 0.22955298, + "epoch": 0.8240493010671878, + "flos": 21833257040640.0, + "grad_norm": 13.816152728300471, + "language_loss": 0.78797925, + "learning_rate": 3.1596328482164915e-07, + "loss": 0.80294406, + "num_input_tokens_seen": 295599170, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.25256348, + "step": 13706, + "time_per_iteration": 2.724320650100708 + }, + { + "auxiliary_loss_clip": 0.01250745, + "auxiliary_loss_mlp": 0.0021616, + "balance_loss_clip": 1.03473306, + "balance_loss_mlp": 0.19157901, + "epoch": 0.8241094243198557, + "flos": 18551919853440.0, + "grad_norm": 11.329205204278571, + "language_loss": 0.78145373, + "learning_rate": 3.157532220876475e-07, + "loss": 0.79612279, + "num_input_tokens_seen": 295617465, + "router_z_loss_clip": 2.15917969, + "router_z_loss_mlp": 0.24609375, + "step": 13707, + "time_per_iteration": 2.712522268295288 + }, + { + "auxiliary_loss_clip": 0.01243916, + "auxiliary_loss_mlp": 0.00235078, + "balance_loss_clip": 1.02715433, + "balance_loss_mlp": 0.2104378, + "epoch": 0.8241695475725237, + "flos": 25447163276160.0, + "grad_norm": 9.378367201950107, + "language_loss": 0.85314208, + "learning_rate": 3.1554322322045226e-07, + "loss": 0.86793208, + "num_input_tokens_seen": 295634960, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.24658203, + "step": 13708, + "time_per_iteration": 2.687114715576172 + }, + { + "auxiliary_loss_clip": 0.01244743, + "auxiliary_loss_mlp": 0.00212005, + "balance_loss_clip": 1.0303781, + "balance_loss_mlp": 0.18861581, + "epoch": 0.8242296708251916, + "flos": 18989670902400.0, + "grad_norm": 13.98955378160717, + "language_loss": 0.77171767, + "learning_rate": 3.1533328822802664e-07, + "loss": 0.78628516, + "num_input_tokens_seen": 295652725, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.23400879, + "step": 13709, + "time_per_iteration": 2.6638786792755127 + }, + { + "auxiliary_loss_clip": 0.01250175, + "auxiliary_loss_mlp": 0.00234275, + "balance_loss_clip": 1.03313017, + "balance_loss_mlp": 0.21003985, + "epoch": 0.8242897940778596, + "flos": 22600883617920.0, + "grad_norm": 27.04804398585057, + "language_loss": 0.91761935, + "learning_rate": 3.151234171183319e-07, + "loss": 0.93246394, + "num_input_tokens_seen": 295671195, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.24206543, + "step": 13710, + "time_per_iteration": 2.7124404907226562 + }, + { + "auxiliary_loss_clip": 0.01227089, + "auxiliary_loss_mlp": 0.00213394, + "balance_loss_clip": 1.01686203, + "balance_loss_mlp": 0.19113779, + "epoch": 0.8243499173305275, + "flos": 21468153248640.0, + "grad_norm": 99.74923823045009, + "language_loss": 0.84715009, + "learning_rate": 3.149136098993257e-07, + "loss": 0.86155498, + "num_input_tokens_seen": 295689130, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.22241211, + "step": 13711, + "time_per_iteration": 2.7122344970703125 + }, + { + "auxiliary_loss_clip": 0.01241908, + "auxiliary_loss_mlp": 0.00212587, + "balance_loss_clip": 1.02648282, + "balance_loss_mlp": 0.1877677, + "epoch": 0.8244100405831956, + "flos": 20010359773440.0, + "grad_norm": 28.45550344578042, + "language_loss": 0.73181266, + "learning_rate": 3.1470386657896473e-07, + "loss": 0.74635756, + "num_input_tokens_seen": 295706385, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.24816895, + "step": 13712, + "time_per_iteration": 2.632077693939209 + }, + { + "auxiliary_loss_clip": 0.01237037, + "auxiliary_loss_mlp": 0.00232018, + "balance_loss_clip": 1.02369022, + "balance_loss_mlp": 0.2094878, + "epoch": 0.8244701638358635, + "flos": 26430684549120.0, + "grad_norm": 4.149625348004869, + "language_loss": 0.81302315, + "learning_rate": 3.14494187165202e-07, + "loss": 0.82771373, + "num_input_tokens_seen": 295727925, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.2253418, + "step": 13713, + "time_per_iteration": 2.730330228805542 + }, + { + "auxiliary_loss_clip": 0.01254818, + "auxiliary_loss_mlp": 0.00215109, + "balance_loss_clip": 1.03582156, + "balance_loss_mlp": 0.19237542, + "epoch": 0.8245302870885315, + "flos": 17640004343040.0, + "grad_norm": 2.886085383691765, + "language_loss": 0.89391631, + "learning_rate": 3.1428457166598833e-07, + "loss": 0.90861559, + "num_input_tokens_seen": 295744420, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.22717285, + "step": 13714, + "time_per_iteration": 2.610612630844116 + }, + { + "auxiliary_loss_clip": 0.01236079, + "auxiliary_loss_mlp": 0.00235219, + "balance_loss_clip": 1.02101314, + "balance_loss_mlp": 0.21067351, + "epoch": 0.8245904103411995, + "flos": 26209510554240.0, + "grad_norm": 68.65541869933067, + "language_loss": 0.73131657, + "learning_rate": 3.1407502008927235e-07, + "loss": 0.7460295, + "num_input_tokens_seen": 295765105, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.24536133, + "step": 13715, + "time_per_iteration": 2.736536741256714 + }, + { + "auxiliary_loss_clip": 0.01231519, + "auxiliary_loss_mlp": 0.00234092, + "balance_loss_clip": 1.01908624, + "balance_loss_mlp": 0.20933272, + "epoch": 0.8246505335938674, + "flos": 24205084928640.0, + "grad_norm": 5.250951576341245, + "language_loss": 0.8265897, + "learning_rate": 3.1386553244300086e-07, + "loss": 0.84124577, + "num_input_tokens_seen": 295784200, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.24755859, + "step": 13716, + "time_per_iteration": 2.648739814758301 + }, + { + "auxiliary_loss_clip": 0.01120863, + "auxiliary_loss_mlp": 0.00075684, + "balance_loss_clip": 0.97442561, + "balance_loss_mlp": 0.06862691, + "epoch": 0.8247106568465354, + "flos": 67092195749760.0, + "grad_norm": 0.7110320997913081, + "language_loss": 0.58483642, + "learning_rate": 3.136561087351175e-07, + "loss": 0.59680194, + "num_input_tokens_seen": 295846555, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.07080078, + "step": 13717, + "time_per_iteration": 3.3186609745025635 + }, + { + "auxiliary_loss_clip": 0.01235099, + "auxiliary_loss_mlp": 0.00201219, + "balance_loss_clip": 1.02276647, + "balance_loss_mlp": 0.17840239, + "epoch": 0.8247707800992033, + "flos": 12568232805120.0, + "grad_norm": 35.70377066259417, + "language_loss": 0.91626281, + "learning_rate": 3.1344674897356373e-07, + "loss": 0.93062598, + "num_input_tokens_seen": 295863425, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.22814941, + "step": 13718, + "time_per_iteration": 2.6424427032470703 + }, + { + "auxiliary_loss_clip": 0.01233468, + "auxiliary_loss_mlp": 0.00228118, + "balance_loss_clip": 1.02066708, + "balance_loss_mlp": 0.20379902, + "epoch": 0.8248309033518714, + "flos": 15923617879680.0, + "grad_norm": 73.73984528235015, + "language_loss": 0.75610912, + "learning_rate": 3.132374531662778e-07, + "loss": 0.77072489, + "num_input_tokens_seen": 295880925, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.24328613, + "step": 13719, + "time_per_iteration": 2.733717203140259 + }, + { + "auxiliary_loss_clip": 0.01243472, + "auxiliary_loss_mlp": 0.0023118, + "balance_loss_clip": 1.02771688, + "balance_loss_mlp": 0.20752853, + "epoch": 0.8248910266045393, + "flos": 17564735393280.0, + "grad_norm": 4.756571579354355, + "language_loss": 0.80945337, + "learning_rate": 3.13028221321197e-07, + "loss": 0.82419991, + "num_input_tokens_seen": 295898205, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.2364502, + "step": 13720, + "time_per_iteration": 2.6479713916778564 + }, + { + "auxiliary_loss_clip": 0.01246253, + "auxiliary_loss_mlp": 0.0023609, + "balance_loss_clip": 1.02906942, + "balance_loss_mlp": 0.21158043, + "epoch": 0.8249511498572073, + "flos": 28619655275520.0, + "grad_norm": 20.576203577300436, + "language_loss": 0.80852336, + "learning_rate": 3.1281905344625467e-07, + "loss": 0.82334685, + "num_input_tokens_seen": 295918130, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.24511719, + "step": 13721, + "time_per_iteration": 2.702624559402466 + }, + { + "auxiliary_loss_clip": 0.01236102, + "auxiliary_loss_mlp": 0.00211034, + "balance_loss_clip": 1.02380729, + "balance_loss_mlp": 0.18961197, + "epoch": 0.8250112731098752, + "flos": 25556583081600.0, + "grad_norm": 108.77752603267722, + "language_loss": 0.84982389, + "learning_rate": 3.1260994954938305e-07, + "loss": 0.86429524, + "num_input_tokens_seen": 295937760, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.21398926, + "step": 13722, + "time_per_iteration": 2.799842119216919 + }, + { + "auxiliary_loss_clip": 0.01228403, + "auxiliary_loss_mlp": 0.00215937, + "balance_loss_clip": 1.01953173, + "balance_loss_mlp": 0.19319192, + "epoch": 0.8250713963625432, + "flos": 27746164339200.0, + "grad_norm": 2.309025607945843, + "language_loss": 0.71246827, + "learning_rate": 3.1240090963851205e-07, + "loss": 0.72691166, + "num_input_tokens_seen": 295957585, + "router_z_loss_clip": 2.0859375, + "router_z_loss_mlp": 0.2277832, + "step": 13723, + "time_per_iteration": 2.675764560699463 + }, + { + "auxiliary_loss_clip": 0.01232261, + "auxiliary_loss_mlp": 0.00224283, + "balance_loss_clip": 1.01723194, + "balance_loss_mlp": 0.20138261, + "epoch": 0.8251315196152111, + "flos": 21610610588160.0, + "grad_norm": 6.623763668634026, + "language_loss": 0.80538511, + "learning_rate": 3.121919337215666e-07, + "loss": 0.81995058, + "num_input_tokens_seen": 295977135, + "router_z_loss_clip": 2.15332031, + "router_z_loss_mlp": 0.22912598, + "step": 13724, + "time_per_iteration": 2.66599702835083 + }, + { + "auxiliary_loss_clip": 0.01241447, + "auxiliary_loss_mlp": 0.00222984, + "balance_loss_clip": 1.0249002, + "balance_loss_mlp": 0.19784269, + "epoch": 0.8251916428678792, + "flos": 28579363194240.0, + "grad_norm": 2.3863167282345517, + "language_loss": 0.72928286, + "learning_rate": 3.1198302180647253e-07, + "loss": 0.74392718, + "num_input_tokens_seen": 295996265, + "router_z_loss_clip": 2.16894531, + "router_z_loss_mlp": 0.2512207, + "step": 13725, + "time_per_iteration": 2.671313762664795 + }, + { + "auxiliary_loss_clip": 0.01236678, + "auxiliary_loss_mlp": 0.00237116, + "balance_loss_clip": 1.02170467, + "balance_loss_mlp": 0.21291672, + "epoch": 0.8252517661205471, + "flos": 23075191733760.0, + "grad_norm": 15.060454260926559, + "language_loss": 0.88056469, + "learning_rate": 3.1177417390115125e-07, + "loss": 0.89530265, + "num_input_tokens_seen": 296014745, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.24194336, + "step": 13726, + "time_per_iteration": 2.685744285583496 + }, + { + "auxiliary_loss_clip": 0.01230692, + "auxiliary_loss_mlp": 0.00223333, + "balance_loss_clip": 1.02219033, + "balance_loss_mlp": 0.2010649, + "epoch": 0.8253118893732151, + "flos": 31759576617600.0, + "grad_norm": 9.879298219852705, + "language_loss": 0.77446032, + "learning_rate": 3.1156539001352286e-07, + "loss": 0.78900057, + "num_input_tokens_seen": 296036960, + "router_z_loss_clip": 2.08496094, + "router_z_loss_mlp": 0.22277832, + "step": 13727, + "time_per_iteration": 2.725353956222534 + }, + { + "auxiliary_loss_clip": 0.01262121, + "auxiliary_loss_mlp": 0.00215735, + "balance_loss_clip": 1.03982079, + "balance_loss_mlp": 0.19129729, + "epoch": 0.8253720126258831, + "flos": 18296415434880.0, + "grad_norm": 12.68690984681354, + "language_loss": 0.73351455, + "learning_rate": 3.113566701515036e-07, + "loss": 0.74829304, + "num_input_tokens_seen": 296056540, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.24450684, + "step": 13728, + "time_per_iteration": 2.654900312423706 + }, + { + "auxiliary_loss_clip": 0.01271623, + "auxiliary_loss_mlp": 0.00219986, + "balance_loss_clip": 1.04398966, + "balance_loss_mlp": 0.19365218, + "epoch": 0.825432135878551, + "flos": 26797332625920.0, + "grad_norm": 25.478564419143616, + "language_loss": 0.80749714, + "learning_rate": 3.111480143230092e-07, + "loss": 0.82241321, + "num_input_tokens_seen": 296077950, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.26330566, + "step": 13729, + "time_per_iteration": 2.716139316558838 + }, + { + "auxiliary_loss_clip": 0.0112564, + "auxiliary_loss_mlp": 0.0010179, + "balance_loss_clip": 0.97979748, + "balance_loss_mlp": 0.0930166, + "epoch": 0.825492259131219, + "flos": 54219116217600.0, + "grad_norm": 0.8720807073653413, + "language_loss": 0.61714756, + "learning_rate": 3.109394225359514e-07, + "loss": 0.62942183, + "num_input_tokens_seen": 296127060, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.08789062, + "step": 13730, + "time_per_iteration": 2.9684746265411377 + }, + { + "auxiliary_loss_clip": 0.01240568, + "auxiliary_loss_mlp": 0.00248936, + "balance_loss_clip": 1.0288353, + "balance_loss_mlp": 0.22424799, + "epoch": 0.825552382383887, + "flos": 43756145493120.0, + "grad_norm": 12.075186337833602, + "language_loss": 0.73284853, + "learning_rate": 3.1073089479823945e-07, + "loss": 0.74774361, + "num_input_tokens_seen": 296147775, + "router_z_loss_clip": 2.12011719, + "router_z_loss_mlp": 0.24707031, + "step": 13731, + "time_per_iteration": 2.8733692169189453 + }, + { + "auxiliary_loss_clip": 0.01273518, + "auxiliary_loss_mlp": 0.00232276, + "balance_loss_clip": 1.04332185, + "balance_loss_mlp": 0.20593053, + "epoch": 0.825612505636555, + "flos": 12602814624000.0, + "grad_norm": 5.1594736656851286, + "language_loss": 0.78695595, + "learning_rate": 3.105224311177812e-07, + "loss": 0.80201387, + "num_input_tokens_seen": 296163560, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.2635498, + "step": 13732, + "time_per_iteration": 2.6765167713165283 + }, + { + "auxiliary_loss_clip": 0.01256289, + "auxiliary_loss_mlp": 0.00228551, + "balance_loss_clip": 1.0344367, + "balance_loss_mlp": 0.20382693, + "epoch": 0.8256726288892229, + "flos": 17595618111360.0, + "grad_norm": 167.93281129424227, + "language_loss": 0.84550118, + "learning_rate": 3.103140315024817e-07, + "loss": 0.8603496, + "num_input_tokens_seen": 296178730, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.24731445, + "step": 13733, + "time_per_iteration": 4.1236958503723145 + }, + { + "auxiliary_loss_clip": 0.0123679, + "auxiliary_loss_mlp": 0.00233739, + "balance_loss_clip": 1.0230757, + "balance_loss_mlp": 0.20964727, + "epoch": 0.8257327521418909, + "flos": 23805794367360.0, + "grad_norm": 5.594116131344165, + "language_loss": 0.86728245, + "learning_rate": 3.1010569596024437e-07, + "loss": 0.88198775, + "num_input_tokens_seen": 296200175, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.2409668, + "step": 13734, + "time_per_iteration": 2.7720351219177246 + }, + { + "auxiliary_loss_clip": 0.01236384, + "auxiliary_loss_mlp": 0.00215903, + "balance_loss_clip": 1.02265763, + "balance_loss_mlp": 0.19482657, + "epoch": 0.8257928753945588, + "flos": 19281121856640.0, + "grad_norm": 51.600220434275315, + "language_loss": 0.9053669, + "learning_rate": 3.098974244989676e-07, + "loss": 0.91988981, + "num_input_tokens_seen": 296219305, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.21081543, + "step": 13735, + "time_per_iteration": 4.0675859451293945 + }, + { + "auxiliary_loss_clip": 0.01243093, + "auxiliary_loss_mlp": 0.00254172, + "balance_loss_clip": 1.02663863, + "balance_loss_mlp": 0.22870913, + "epoch": 0.8258529986472268, + "flos": 18478841633280.0, + "grad_norm": 8.567213070340127, + "language_loss": 0.78533185, + "learning_rate": 3.096892171265497e-07, + "loss": 0.80030453, + "num_input_tokens_seen": 296236945, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.25463867, + "step": 13736, + "time_per_iteration": 2.7923378944396973 + }, + { + "auxiliary_loss_clip": 0.01135499, + "auxiliary_loss_mlp": 0.00082045, + "balance_loss_clip": 0.9878093, + "balance_loss_mlp": 0.07412942, + "epoch": 0.8259131218998947, + "flos": 62137957512960.0, + "grad_norm": 0.8361744366926249, + "language_loss": 0.67033142, + "learning_rate": 3.0948107385088665e-07, + "loss": 0.68250686, + "num_input_tokens_seen": 296294685, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.07910156, + "step": 13737, + "time_per_iteration": 3.2065865993499756 + }, + { + "auxiliary_loss_clip": 0.01245297, + "auxiliary_loss_mlp": 0.00206937, + "balance_loss_clip": 1.03305912, + "balance_loss_mlp": 0.18457344, + "epoch": 0.8259732451525628, + "flos": 22159038418560.0, + "grad_norm": 3.863960758361799, + "language_loss": 0.7635408, + "learning_rate": 3.0927299467987e-07, + "loss": 0.77806318, + "num_input_tokens_seen": 296314790, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.22363281, + "step": 13738, + "time_per_iteration": 2.700362205505371 + }, + { + "auxiliary_loss_clip": 0.01263875, + "auxiliary_loss_mlp": 0.00234355, + "balance_loss_clip": 1.04243517, + "balance_loss_mlp": 0.20933321, + "epoch": 0.8260333684052307, + "flos": 38361645233280.0, + "grad_norm": 8.493720784494672, + "language_loss": 0.7580108, + "learning_rate": 3.090649796213911e-07, + "loss": 0.77299309, + "num_input_tokens_seen": 296335355, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.25024414, + "step": 13739, + "time_per_iteration": 4.382062196731567 + }, + { + "auxiliary_loss_clip": 0.01135154, + "auxiliary_loss_mlp": 0.00144387, + "balance_loss_clip": 0.9886443, + "balance_loss_mlp": 0.13527916, + "epoch": 0.8260934916578987, + "flos": 62185611882240.0, + "grad_norm": 0.8663924147076932, + "language_loss": 0.58351749, + "learning_rate": 3.0885702868333853e-07, + "loss": 0.59631288, + "num_input_tokens_seen": 296399885, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.09130859, + "step": 13740, + "time_per_iteration": 3.189614772796631 + }, + { + "auxiliary_loss_clip": 0.01277336, + "auxiliary_loss_mlp": 0.00223443, + "balance_loss_clip": 1.04884434, + "balance_loss_mlp": 0.19677539, + "epoch": 0.8261536149105667, + "flos": 22565475786240.0, + "grad_norm": 13.364197431927689, + "language_loss": 0.854689, + "learning_rate": 3.086491418735959e-07, + "loss": 0.8696968, + "num_input_tokens_seen": 296417660, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.26647949, + "step": 13741, + "time_per_iteration": 2.6721713542938232 + }, + { + "auxiliary_loss_clip": 0.01248233, + "auxiliary_loss_mlp": 0.00240956, + "balance_loss_clip": 1.02797616, + "balance_loss_mlp": 0.21505208, + "epoch": 0.8262137381632346, + "flos": 32525479342080.0, + "grad_norm": 106.6984855606408, + "language_loss": 0.70320237, + "learning_rate": 3.0844131920004726e-07, + "loss": 0.71809423, + "num_input_tokens_seen": 296438255, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.25927734, + "step": 13742, + "time_per_iteration": 2.7714455127716064 + }, + { + "auxiliary_loss_clip": 0.012794, + "auxiliary_loss_mlp": 0.00221907, + "balance_loss_clip": 1.04593682, + "balance_loss_mlp": 0.19586015, + "epoch": 0.8262738614159026, + "flos": 14136451666560.0, + "grad_norm": 6.206290013136441, + "language_loss": 0.85225707, + "learning_rate": 3.0823356067057327e-07, + "loss": 0.86727011, + "num_input_tokens_seen": 296454485, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.26037598, + "step": 13743, + "time_per_iteration": 4.133354425430298 + }, + { + "auxiliary_loss_clip": 0.01261048, + "auxiliary_loss_mlp": 0.00219147, + "balance_loss_clip": 1.04239273, + "balance_loss_mlp": 0.1942682, + "epoch": 0.8263339846685706, + "flos": 19825347795840.0, + "grad_norm": 74.9525593955595, + "language_loss": 0.73916334, + "learning_rate": 3.0802586629305283e-07, + "loss": 0.75396532, + "num_input_tokens_seen": 296473740, + "router_z_loss_clip": 2.18652344, + "router_z_loss_mlp": 0.24914551, + "step": 13744, + "time_per_iteration": 2.665152072906494 + }, + { + "auxiliary_loss_clip": 0.01239497, + "auxiliary_loss_mlp": 0.00223062, + "balance_loss_clip": 1.02517974, + "balance_loss_mlp": 0.19832581, + "epoch": 0.8263941079212386, + "flos": 22745962650240.0, + "grad_norm": 28.219475610407027, + "language_loss": 0.83585417, + "learning_rate": 3.078182360753612e-07, + "loss": 0.85047972, + "num_input_tokens_seen": 296493355, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.24731445, + "step": 13745, + "time_per_iteration": 2.655303716659546 + }, + { + "auxiliary_loss_clip": 0.01212838, + "auxiliary_loss_mlp": 0.00205452, + "balance_loss_clip": 1.0059278, + "balance_loss_mlp": 0.18331502, + "epoch": 0.8264542311739065, + "flos": 20120641505280.0, + "grad_norm": 41.39163883663468, + "language_loss": 0.85554135, + "learning_rate": 3.076106700253709e-07, + "loss": 0.86972421, + "num_input_tokens_seen": 296510520, + "router_z_loss_clip": 2.0703125, + "router_z_loss_mlp": 0.22119141, + "step": 13746, + "time_per_iteration": 2.7531418800354004 + }, + { + "auxiliary_loss_clip": 0.01268524, + "auxiliary_loss_mlp": 0.00231303, + "balance_loss_clip": 1.04368258, + "balance_loss_mlp": 0.20636395, + "epoch": 0.8265143544265745, + "flos": 16837149502080.0, + "grad_norm": 250.147131077403, + "language_loss": 0.77533954, + "learning_rate": 3.0740316815095415e-07, + "loss": 0.7903378, + "num_input_tokens_seen": 296528265, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.24926758, + "step": 13747, + "time_per_iteration": 2.795776128768921 + }, + { + "auxiliary_loss_clip": 0.01246858, + "auxiliary_loss_mlp": 0.00220894, + "balance_loss_clip": 1.02465999, + "balance_loss_mlp": 0.19500187, + "epoch": 0.8265744776792424, + "flos": 22018592240640.0, + "grad_norm": 136.48325268213713, + "language_loss": 0.82071501, + "learning_rate": 3.0719573045997835e-07, + "loss": 0.83539248, + "num_input_tokens_seen": 296547810, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.25866699, + "step": 13748, + "time_per_iteration": 2.711371421813965 + }, + { + "auxiliary_loss_clip": 0.01240226, + "auxiliary_loss_mlp": 0.00206766, + "balance_loss_clip": 1.02271843, + "balance_loss_mlp": 0.18245879, + "epoch": 0.8266346009319104, + "flos": 19244852098560.0, + "grad_norm": 20.258204629541055, + "language_loss": 0.71682024, + "learning_rate": 3.069883569603102e-07, + "loss": 0.7312901, + "num_input_tokens_seen": 296565940, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.24304199, + "step": 13749, + "time_per_iteration": 2.7066643238067627 + }, + { + "auxiliary_loss_clip": 0.01235561, + "auxiliary_loss_mlp": 0.00232813, + "balance_loss_clip": 1.0225836, + "balance_loss_mlp": 0.20932877, + "epoch": 0.8266947241845783, + "flos": 24166768095360.0, + "grad_norm": 8457.750649150728, + "language_loss": 0.80598569, + "learning_rate": 3.067810476598132e-07, + "loss": 0.82066941, + "num_input_tokens_seen": 296585090, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.23461914, + "step": 13750, + "time_per_iteration": 2.661062240600586 + }, + { + "auxiliary_loss_clip": 0.0126055, + "auxiliary_loss_mlp": 0.00208175, + "balance_loss_clip": 1.03888261, + "balance_loss_mlp": 0.18259311, + "epoch": 0.8267548474372464, + "flos": 21105814803840.0, + "grad_norm": 8.791103582183815, + "language_loss": 0.74327219, + "learning_rate": 3.065738025663496e-07, + "loss": 0.75795943, + "num_input_tokens_seen": 296604950, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.25549316, + "step": 13751, + "time_per_iteration": 2.6904919147491455 + }, + { + "auxiliary_loss_clip": 0.01230282, + "auxiliary_loss_mlp": 0.00195705, + "balance_loss_clip": 1.02074802, + "balance_loss_mlp": 0.17287596, + "epoch": 0.8268149706899143, + "flos": 39968288668800.0, + "grad_norm": 7.942913869885441, + "language_loss": 0.68371159, + "learning_rate": 3.0636662168777607e-07, + "loss": 0.69797146, + "num_input_tokens_seen": 296627780, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.22839355, + "step": 13752, + "time_per_iteration": 2.8289999961853027 + }, + { + "auxiliary_loss_clip": 0.01134559, + "auxiliary_loss_mlp": 0.00088488, + "balance_loss_clip": 0.98859489, + "balance_loss_mlp": 0.08047757, + "epoch": 0.8268750939425823, + "flos": 65782423244160.0, + "grad_norm": 0.7630124809574861, + "language_loss": 0.56689179, + "learning_rate": 3.0615950503194986e-07, + "loss": 0.57912225, + "num_input_tokens_seen": 296683850, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.08007812, + "step": 13753, + "time_per_iteration": 3.1919877529144287 + }, + { + "auxiliary_loss_clip": 0.0113349, + "auxiliary_loss_mlp": 0.00077252, + "balance_loss_clip": 0.98879313, + "balance_loss_mlp": 0.07095728, + "epoch": 0.8269352171952503, + "flos": 52981455242880.0, + "grad_norm": 0.6762874096634176, + "language_loss": 0.54082799, + "learning_rate": 3.0595245260672563e-07, + "loss": 0.55293542, + "num_input_tokens_seen": 296741420, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.06298828, + "step": 13754, + "time_per_iteration": 3.257965564727783 + }, + { + "auxiliary_loss_clip": 0.01243352, + "auxiliary_loss_mlp": 0.00219648, + "balance_loss_clip": 1.02978277, + "balance_loss_mlp": 0.1975587, + "epoch": 0.8269953404479182, + "flos": 23076125487360.0, + "grad_norm": 3.581537670597949, + "language_loss": 0.78144348, + "learning_rate": 3.0574546441995354e-07, + "loss": 0.7960735, + "num_input_tokens_seen": 296759620, + "router_z_loss_clip": 2.13574219, + "router_z_loss_mlp": 0.2208252, + "step": 13755, + "time_per_iteration": 2.675516366958618 + }, + { + "auxiliary_loss_clip": 0.01228141, + "auxiliary_loss_mlp": 0.00209441, + "balance_loss_clip": 1.01589537, + "balance_loss_mlp": 0.18640991, + "epoch": 0.8270554637005862, + "flos": 14209996763520.0, + "grad_norm": 5.466438829741084, + "language_loss": 0.77077186, + "learning_rate": 3.0553854047948324e-07, + "loss": 0.78514767, + "num_input_tokens_seen": 296777275, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.23034668, + "step": 13756, + "time_per_iteration": 2.6372978687286377 + }, + { + "auxiliary_loss_clip": 0.01253849, + "auxiliary_loss_mlp": 0.00226293, + "balance_loss_clip": 1.03122413, + "balance_loss_mlp": 0.200353, + "epoch": 0.8271155869532542, + "flos": 21762046327680.0, + "grad_norm": 97.28484533715145, + "language_loss": 0.83036524, + "learning_rate": 3.053316807931623e-07, + "loss": 0.84516662, + "num_input_tokens_seen": 296796655, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.25927734, + "step": 13757, + "time_per_iteration": 2.652130126953125 + }, + { + "auxiliary_loss_clip": 0.01263736, + "auxiliary_loss_mlp": 0.00205901, + "balance_loss_clip": 1.04183459, + "balance_loss_mlp": 0.18067625, + "epoch": 0.8271757102059222, + "flos": 15120475729920.0, + "grad_norm": 136.89634758473167, + "language_loss": 0.75142902, + "learning_rate": 3.0512488536883283e-07, + "loss": 0.76612532, + "num_input_tokens_seen": 296813705, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.25219727, + "step": 13758, + "time_per_iteration": 2.63655161857605 + }, + { + "auxiliary_loss_clip": 0.01217864, + "auxiliary_loss_mlp": 0.00204591, + "balance_loss_clip": 1.01014304, + "balance_loss_mlp": 0.18271625, + "epoch": 0.8272358334585901, + "flos": 24133730561280.0, + "grad_norm": 6.956377693169685, + "language_loss": 0.75452864, + "learning_rate": 3.0491815421433775e-07, + "loss": 0.76875317, + "num_input_tokens_seen": 296833985, + "router_z_loss_clip": 2.07910156, + "router_z_loss_mlp": 0.21875, + "step": 13759, + "time_per_iteration": 2.6333537101745605 + }, + { + "auxiliary_loss_clip": 0.01246996, + "auxiliary_loss_mlp": 0.00205426, + "balance_loss_clip": 1.02925563, + "balance_loss_mlp": 0.18064271, + "epoch": 0.8272959567112581, + "flos": 18990712396800.0, + "grad_norm": 4.66962923167097, + "language_loss": 0.75595659, + "learning_rate": 3.047114873375161e-07, + "loss": 0.77048081, + "num_input_tokens_seen": 296850150, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.24768066, + "step": 13760, + "time_per_iteration": 2.639218330383301 + }, + { + "auxiliary_loss_clip": 0.01242132, + "auxiliary_loss_mlp": 0.00200869, + "balance_loss_clip": 1.02916789, + "balance_loss_mlp": 0.17863688, + "epoch": 0.827356079963926, + "flos": 20631614428800.0, + "grad_norm": 37.267361356554645, + "language_loss": 0.86831021, + "learning_rate": 3.0450488474620505e-07, + "loss": 0.88274026, + "num_input_tokens_seen": 296869585, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.22229004, + "step": 13761, + "time_per_iteration": 2.664045572280884 + }, + { + "auxiliary_loss_clip": 0.01228514, + "auxiliary_loss_mlp": 0.00214966, + "balance_loss_clip": 1.01805639, + "balance_loss_mlp": 0.19100484, + "epoch": 0.827416203216594, + "flos": 22416625825920.0, + "grad_norm": 5.238561914498069, + "language_loss": 0.77642775, + "learning_rate": 3.042983464482387e-07, + "loss": 0.79086256, + "num_input_tokens_seen": 296887710, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.23950195, + "step": 13762, + "time_per_iteration": 2.6790876388549805 + }, + { + "auxiliary_loss_clip": 0.0122878, + "auxiliary_loss_mlp": 0.00206595, + "balance_loss_clip": 1.01617765, + "balance_loss_mlp": 0.18430313, + "epoch": 0.827476326469262, + "flos": 19026192055680.0, + "grad_norm": 14.024822236878371, + "language_loss": 0.78175646, + "learning_rate": 3.0409187245144853e-07, + "loss": 0.79611015, + "num_input_tokens_seen": 296906265, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.22290039, + "step": 13763, + "time_per_iteration": 2.63822078704834 + }, + { + "auxiliary_loss_clip": 0.01136945, + "auxiliary_loss_mlp": 0.0008619, + "balance_loss_clip": 0.98955786, + "balance_loss_mlp": 0.07822638, + "epoch": 0.82753644972193, + "flos": 68500575089280.0, + "grad_norm": 0.8200110864949555, + "language_loss": 0.64361113, + "learning_rate": 3.038854627636651e-07, + "loss": 0.65584248, + "num_input_tokens_seen": 296971290, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.07958984, + "step": 13764, + "time_per_iteration": 3.2747726440429688 + }, + { + "auxiliary_loss_clip": 0.01251143, + "auxiliary_loss_mlp": 0.00215161, + "balance_loss_clip": 1.03639734, + "balance_loss_mlp": 0.19191527, + "epoch": 0.8275965729745979, + "flos": 18405404277120.0, + "grad_norm": 89.32998378167966, + "language_loss": 0.88119102, + "learning_rate": 3.0367911739271423e-07, + "loss": 0.89585406, + "num_input_tokens_seen": 296989060, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.2322998, + "step": 13765, + "time_per_iteration": 2.670809030532837 + }, + { + "auxiliary_loss_clip": 0.01260283, + "auxiliary_loss_mlp": 0.00243591, + "balance_loss_clip": 1.03580236, + "balance_loss_mlp": 0.21722168, + "epoch": 0.8276566962272659, + "flos": 28512067063680.0, + "grad_norm": 9.07144145268564, + "language_loss": 0.7236082, + "learning_rate": 3.034728363464214e-07, + "loss": 0.73864692, + "num_input_tokens_seen": 297011300, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.26391602, + "step": 13766, + "time_per_iteration": 2.6884818077087402 + }, + { + "auxiliary_loss_clip": 0.01251381, + "auxiliary_loss_mlp": 0.00228193, + "balance_loss_clip": 1.0340941, + "balance_loss_mlp": 0.20417231, + "epoch": 0.8277168194799339, + "flos": 20230240878720.0, + "grad_norm": 3.7253424323374036, + "language_loss": 0.91214973, + "learning_rate": 3.03266619632609e-07, + "loss": 0.92694545, + "num_input_tokens_seen": 297030350, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.23999023, + "step": 13767, + "time_per_iteration": 2.645993232727051 + }, + { + "auxiliary_loss_clip": 0.01260142, + "auxiliary_loss_mlp": 0.0024018, + "balance_loss_clip": 1.04393899, + "balance_loss_mlp": 0.21582527, + "epoch": 0.8277769427326018, + "flos": 28476623318400.0, + "grad_norm": 3.6185304549008523, + "language_loss": 0.75436783, + "learning_rate": 3.030604672590964e-07, + "loss": 0.76937103, + "num_input_tokens_seen": 297049710, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.24365234, + "step": 13768, + "time_per_iteration": 2.701990842819214 + }, + { + "auxiliary_loss_clip": 0.0123752, + "auxiliary_loss_mlp": 0.00219366, + "balance_loss_clip": 1.02300525, + "balance_loss_mlp": 0.19614369, + "epoch": 0.8278370659852698, + "flos": 27197628768000.0, + "grad_norm": 32.86982186399373, + "language_loss": 0.82507348, + "learning_rate": 3.028543792337006e-07, + "loss": 0.83964229, + "num_input_tokens_seen": 297070510, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.23205566, + "step": 13769, + "time_per_iteration": 2.7151389122009277 + }, + { + "auxiliary_loss_clip": 0.01252721, + "auxiliary_loss_mlp": 0.00224346, + "balance_loss_clip": 1.03177619, + "balance_loss_mlp": 0.19990838, + "epoch": 0.8278971892379378, + "flos": 37816126404480.0, + "grad_norm": 475.8146310245161, + "language_loss": 0.81802207, + "learning_rate": 3.0264835556423675e-07, + "loss": 0.83279276, + "num_input_tokens_seen": 297092585, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.24450684, + "step": 13770, + "time_per_iteration": 2.780045747756958 + }, + { + "auxiliary_loss_clip": 0.01257034, + "auxiliary_loss_mlp": 0.00230311, + "balance_loss_clip": 1.03644466, + "balance_loss_mlp": 0.20489573, + "epoch": 0.8279573124906058, + "flos": 22560160573440.0, + "grad_norm": 6.622580544150541, + "language_loss": 0.82319808, + "learning_rate": 3.0244239625851785e-07, + "loss": 0.83807153, + "num_input_tokens_seen": 297110055, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.25415039, + "step": 13771, + "time_per_iteration": 2.6750035285949707 + }, + { + "auxiliary_loss_clip": 0.01255859, + "auxiliary_loss_mlp": 0.00233195, + "balance_loss_clip": 1.03467369, + "balance_loss_mlp": 0.20895989, + "epoch": 0.8280174357432737, + "flos": 36064619418240.0, + "grad_norm": 12.371922511913198, + "language_loss": 0.78636062, + "learning_rate": 3.0223650132435284e-07, + "loss": 0.80125123, + "num_input_tokens_seen": 297132170, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.24230957, + "step": 13772, + "time_per_iteration": 2.7810404300689697 + }, + { + "auxiliary_loss_clip": 0.01240538, + "auxiliary_loss_mlp": 0.00212527, + "balance_loss_clip": 1.02186298, + "balance_loss_mlp": 0.18652776, + "epoch": 0.8280775589959417, + "flos": 22961067246720.0, + "grad_norm": 38.98230412585889, + "language_loss": 0.84209853, + "learning_rate": 3.0203067076955035e-07, + "loss": 0.85662913, + "num_input_tokens_seen": 297149515, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.26000977, + "step": 13773, + "time_per_iteration": 2.6616311073303223 + }, + { + "auxiliary_loss_clip": 0.01241788, + "auxiliary_loss_mlp": 0.00215048, + "balance_loss_clip": 1.02787054, + "balance_loss_mlp": 0.19021642, + "epoch": 0.8281376822486096, + "flos": 26063282286720.0, + "grad_norm": 26.76171681635544, + "language_loss": 0.81892121, + "learning_rate": 3.01824904601915e-07, + "loss": 0.8334896, + "num_input_tokens_seen": 297170320, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.24865723, + "step": 13774, + "time_per_iteration": 2.7025716304779053 + }, + { + "auxiliary_loss_clip": 0.01283138, + "auxiliary_loss_mlp": 0.00239619, + "balance_loss_clip": 1.05093694, + "balance_loss_mlp": 0.2148833, + "epoch": 0.8281978055012776, + "flos": 20667776446080.0, + "grad_norm": 212.2076913752154, + "language_loss": 0.80505908, + "learning_rate": 3.01619202829249e-07, + "loss": 0.82028663, + "num_input_tokens_seen": 297189935, + "router_z_loss_clip": 2.32226562, + "router_z_loss_mlp": 0.24743652, + "step": 13775, + "time_per_iteration": 4.053435564041138 + }, + { + "auxiliary_loss_clip": 0.01264522, + "auxiliary_loss_mlp": 0.00232782, + "balance_loss_clip": 1.04104233, + "balance_loss_mlp": 0.20656797, + "epoch": 0.8282579287539455, + "flos": 29315281040640.0, + "grad_norm": 188.62565141071192, + "language_loss": 0.82478911, + "learning_rate": 3.01413565459353e-07, + "loss": 0.83976215, + "num_input_tokens_seen": 297210885, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.26220703, + "step": 13776, + "time_per_iteration": 2.7121739387512207 + }, + { + "auxiliary_loss_clip": 0.01256664, + "auxiliary_loss_mlp": 0.00248622, + "balance_loss_clip": 1.03604829, + "balance_loss_mlp": 0.22187141, + "epoch": 0.8283180520066136, + "flos": 15706178899200.0, + "grad_norm": 4.625787613198145, + "language_loss": 0.86910814, + "learning_rate": 3.0120799250002483e-07, + "loss": 0.884161, + "num_input_tokens_seen": 297228500, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.26733398, + "step": 13777, + "time_per_iteration": 4.086107015609741 + }, + { + "auxiliary_loss_clip": 0.01259472, + "auxiliary_loss_mlp": 0.0021859, + "balance_loss_clip": 1.04062665, + "balance_loss_mlp": 0.19471279, + "epoch": 0.8283781752592815, + "flos": 24791470456320.0, + "grad_norm": 4.9575915670400015, + "language_loss": 0.90455806, + "learning_rate": 3.010024839590604e-07, + "loss": 0.9193387, + "num_input_tokens_seen": 297249470, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.2388916, + "step": 13778, + "time_per_iteration": 2.6484200954437256 + }, + { + "auxiliary_loss_clip": 0.01236732, + "auxiliary_loss_mlp": 0.00224769, + "balance_loss_clip": 1.02502775, + "balance_loss_mlp": 0.20078456, + "epoch": 0.8284382985119495, + "flos": 18982811404800.0, + "grad_norm": 11.578791714006718, + "language_loss": 0.82244647, + "learning_rate": 3.0079703984425187e-07, + "loss": 0.83706146, + "num_input_tokens_seen": 297265970, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.23999023, + "step": 13779, + "time_per_iteration": 2.7026782035827637 + }, + { + "auxiliary_loss_clip": 0.01143771, + "auxiliary_loss_mlp": 0.00085905, + "balance_loss_clip": 0.99478519, + "balance_loss_mlp": 0.07884748, + "epoch": 0.8284984217646175, + "flos": 61034460814080.0, + "grad_norm": 0.9798142161237802, + "language_loss": 0.55828029, + "learning_rate": 3.0059166016338954e-07, + "loss": 0.57057703, + "num_input_tokens_seen": 297325525, + "router_z_loss_clip": 1.4921875, + "router_z_loss_mlp": 0.07080078, + "step": 13780, + "time_per_iteration": 3.1780967712402344 + }, + { + "auxiliary_loss_clip": 0.01246513, + "auxiliary_loss_mlp": 0.00228055, + "balance_loss_clip": 1.02917981, + "balance_loss_mlp": 0.20485698, + "epoch": 0.8285585450172854, + "flos": 19714635100800.0, + "grad_norm": 323.5546785607675, + "language_loss": 0.87015879, + "learning_rate": 3.0038634492426205e-07, + "loss": 0.8849045, + "num_input_tokens_seen": 297345025, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.23205566, + "step": 13781, + "time_per_iteration": 4.179501056671143 + }, + { + "auxiliary_loss_clip": 0.01255693, + "auxiliary_loss_mlp": 0.0023605, + "balance_loss_clip": 1.03313041, + "balance_loss_mlp": 0.2096334, + "epoch": 0.8286186682699535, + "flos": 21688896280320.0, + "grad_norm": 94.09848572989968, + "language_loss": 0.85934108, + "learning_rate": 3.001810941346543e-07, + "loss": 0.87425852, + "num_input_tokens_seen": 297363570, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.26416016, + "step": 13782, + "time_per_iteration": 2.6848580837249756 + }, + { + "auxiliary_loss_clip": 0.01233149, + "auxiliary_loss_mlp": 0.00222206, + "balance_loss_clip": 1.01688385, + "balance_loss_mlp": 0.19751783, + "epoch": 0.8286787915226214, + "flos": 25775566346880.0, + "grad_norm": 442.2845867364426, + "language_loss": 0.83576179, + "learning_rate": 2.9997590780234983e-07, + "loss": 0.85031533, + "num_input_tokens_seen": 297385385, + "router_z_loss_clip": 2.16113281, + "router_z_loss_mlp": 0.24719238, + "step": 13783, + "time_per_iteration": 2.6668589115142822 + }, + { + "auxiliary_loss_clip": 0.01257395, + "auxiliary_loss_mlp": 0.00232133, + "balance_loss_clip": 1.03773832, + "balance_loss_mlp": 0.20822006, + "epoch": 0.8287389147752894, + "flos": 21288348743040.0, + "grad_norm": 13.899776357603388, + "language_loss": 0.81127763, + "learning_rate": 2.997707859351304e-07, + "loss": 0.82617289, + "num_input_tokens_seen": 297403950, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.23901367, + "step": 13784, + "time_per_iteration": 2.7319283485412598 + }, + { + "auxiliary_loss_clip": 0.01244266, + "auxiliary_loss_mlp": 0.00222873, + "balance_loss_clip": 1.02401578, + "balance_loss_mlp": 0.19705245, + "epoch": 0.8287990380279573, + "flos": 33544875323520.0, + "grad_norm": 16.89689404276723, + "language_loss": 0.78871685, + "learning_rate": 2.99565728540772e-07, + "loss": 0.80338824, + "num_input_tokens_seen": 297424565, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.25830078, + "step": 13785, + "time_per_iteration": 2.7777044773101807 + }, + { + "auxiliary_loss_clip": 0.01259473, + "auxiliary_loss_mlp": 0.00218284, + "balance_loss_clip": 1.03781331, + "balance_loss_mlp": 0.19345284, + "epoch": 0.8288591612806253, + "flos": 22966346545920.0, + "grad_norm": 17.233553605888478, + "language_loss": 0.76175666, + "learning_rate": 2.993607356270516e-07, + "loss": 0.7765342, + "num_input_tokens_seen": 297445180, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.24804688, + "step": 13786, + "time_per_iteration": 4.111645936965942 + }, + { + "auxiliary_loss_clip": 0.01284486, + "auxiliary_loss_mlp": 0.00246875, + "balance_loss_clip": 1.05260921, + "balance_loss_mlp": 0.22013602, + "epoch": 0.8289192845332932, + "flos": 18588979710720.0, + "grad_norm": 4.60534145889195, + "language_loss": 0.85015708, + "learning_rate": 2.991558072017426e-07, + "loss": 0.86547065, + "num_input_tokens_seen": 297463790, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.26745605, + "step": 13787, + "time_per_iteration": 2.651737928390503 + }, + { + "auxiliary_loss_clip": 0.01247581, + "auxiliary_loss_mlp": 0.00248749, + "balance_loss_clip": 1.03205323, + "balance_loss_mlp": 0.22458486, + "epoch": 0.8289794077859612, + "flos": 15450423085440.0, + "grad_norm": 94.98577729245056, + "language_loss": 0.88310397, + "learning_rate": 2.989509432726163e-07, + "loss": 0.8980673, + "num_input_tokens_seen": 297480100, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.24169922, + "step": 13788, + "time_per_iteration": 2.5857040882110596 + }, + { + "auxiliary_loss_clip": 0.01228949, + "auxiliary_loss_mlp": 0.00244201, + "balance_loss_clip": 1.01715589, + "balance_loss_mlp": 0.21847609, + "epoch": 0.8290395310386292, + "flos": 28877853214080.0, + "grad_norm": 7.475433475822789, + "language_loss": 0.77819824, + "learning_rate": 2.9874614384744014e-07, + "loss": 0.79292977, + "num_input_tokens_seen": 297499890, + "router_z_loss_clip": 2.11621094, + "router_z_loss_mlp": 0.25732422, + "step": 13789, + "time_per_iteration": 2.7031240463256836 + }, + { + "auxiliary_loss_clip": 0.01237254, + "auxiliary_loss_mlp": 0.00223346, + "balance_loss_clip": 1.02214837, + "balance_loss_mlp": 0.19915839, + "epoch": 0.8290996542912972, + "flos": 36576274700160.0, + "grad_norm": 70.60422017996845, + "language_loss": 0.74819934, + "learning_rate": 2.985414089339813e-07, + "loss": 0.7628054, + "num_input_tokens_seen": 297521440, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.24194336, + "step": 13790, + "time_per_iteration": 2.8739311695098877 + }, + { + "auxiliary_loss_clip": 0.01245451, + "auxiliary_loss_mlp": 0.00235216, + "balance_loss_clip": 1.02744913, + "balance_loss_mlp": 0.2095736, + "epoch": 0.8291597775439651, + "flos": 23623009032960.0, + "grad_norm": 9.795337496227742, + "language_loss": 0.83472574, + "learning_rate": 2.9833673854000265e-07, + "loss": 0.84953243, + "num_input_tokens_seen": 297539920, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.25622559, + "step": 13791, + "time_per_iteration": 2.6970527172088623 + }, + { + "auxiliary_loss_clip": 0.01231132, + "auxiliary_loss_mlp": 0.00224889, + "balance_loss_clip": 1.02288365, + "balance_loss_mlp": 0.20271645, + "epoch": 0.8292199007966331, + "flos": 21397481239680.0, + "grad_norm": 12.586594012563165, + "language_loss": 0.75925064, + "learning_rate": 2.981321326732651e-07, + "loss": 0.77381086, + "num_input_tokens_seen": 297560000, + "router_z_loss_clip": 2.08105469, + "router_z_loss_mlp": 0.22167969, + "step": 13792, + "time_per_iteration": 2.7274858951568604 + }, + { + "auxiliary_loss_clip": 0.01246715, + "auxiliary_loss_mlp": 0.00236668, + "balance_loss_clip": 1.02426505, + "balance_loss_mlp": 0.21127626, + "epoch": 0.829280024049301, + "flos": 28767607395840.0, + "grad_norm": 97.49957148964462, + "language_loss": 0.7233448, + "learning_rate": 2.9792759134152736e-07, + "loss": 0.73817873, + "num_input_tokens_seen": 297579300, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.25427246, + "step": 13793, + "time_per_iteration": 2.8064236640930176 + }, + { + "auxiliary_loss_clip": 0.01240316, + "auxiliary_loss_mlp": 0.00240087, + "balance_loss_clip": 1.0207932, + "balance_loss_mlp": 0.21490952, + "epoch": 0.829340147301969, + "flos": 19938071652480.0, + "grad_norm": 16.533549050206364, + "language_loss": 0.75018191, + "learning_rate": 2.977231145525461e-07, + "loss": 0.76498592, + "num_input_tokens_seen": 297598095, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.25195312, + "step": 13794, + "time_per_iteration": 2.648599863052368 + }, + { + "auxiliary_loss_clip": 0.01252489, + "auxiliary_loss_mlp": 0.00231258, + "balance_loss_clip": 1.03053331, + "balance_loss_mlp": 0.20393501, + "epoch": 0.829400270554637, + "flos": 25228575060480.0, + "grad_norm": 184.98972570930403, + "language_loss": 0.76529086, + "learning_rate": 2.975187023140757e-07, + "loss": 0.78012836, + "num_input_tokens_seen": 297615955, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.27319336, + "step": 13795, + "time_per_iteration": 2.6704583168029785 + }, + { + "auxiliary_loss_clip": 0.0123443, + "auxiliary_loss_mlp": 0.00220766, + "balance_loss_clip": 1.02103209, + "balance_loss_mlp": 0.19793773, + "epoch": 0.829460393807305, + "flos": 24463570176000.0, + "grad_norm": 18.064916733766907, + "language_loss": 0.72891682, + "learning_rate": 2.973143546338661e-07, + "loss": 0.74346876, + "num_input_tokens_seen": 297636285, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.22802734, + "step": 13796, + "time_per_iteration": 2.7402515411376953 + }, + { + "auxiliary_loss_clip": 0.01231135, + "auxiliary_loss_mlp": 0.00217583, + "balance_loss_clip": 1.01510262, + "balance_loss_mlp": 0.19385993, + "epoch": 0.829520517059973, + "flos": 15122486891520.0, + "grad_norm": 7.301935890668897, + "language_loss": 0.78551757, + "learning_rate": 2.971100715196666e-07, + "loss": 0.80000478, + "num_input_tokens_seen": 297653315, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.23693848, + "step": 13797, + "time_per_iteration": 2.6100783348083496 + }, + { + "auxiliary_loss_clip": 0.01254978, + "auxiliary_loss_mlp": 0.00212997, + "balance_loss_clip": 1.03668964, + "balance_loss_mlp": 0.19076477, + "epoch": 0.8295806403126409, + "flos": 21579979265280.0, + "grad_norm": 897.2442972134277, + "language_loss": 0.83165491, + "learning_rate": 2.969058529792243e-07, + "loss": 0.84633464, + "num_input_tokens_seen": 297673480, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.22253418, + "step": 13798, + "time_per_iteration": 2.6724605560302734 + }, + { + "auxiliary_loss_clip": 0.01221082, + "auxiliary_loss_mlp": 0.00210501, + "balance_loss_clip": 1.01250827, + "balance_loss_mlp": 0.18866198, + "epoch": 0.8296407635653089, + "flos": 21726566668800.0, + "grad_norm": 2.9183843256278994, + "language_loss": 0.82121563, + "learning_rate": 2.967016990202822e-07, + "loss": 0.83553147, + "num_input_tokens_seen": 297693250, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.21838379, + "step": 13799, + "time_per_iteration": 2.6565942764282227 + }, + { + "auxiliary_loss_clip": 0.01238572, + "auxiliary_loss_mlp": 0.00238203, + "balance_loss_clip": 1.02502453, + "balance_loss_mlp": 0.21487333, + "epoch": 0.8297008868179768, + "flos": 11181147252480.0, + "grad_norm": 97.39358959105127, + "language_loss": 0.77144474, + "learning_rate": 2.9649760965058245e-07, + "loss": 0.78621244, + "num_input_tokens_seen": 297710975, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.2331543, + "step": 13800, + "time_per_iteration": 2.6554486751556396 + }, + { + "auxiliary_loss_clip": 0.0125845, + "auxiliary_loss_mlp": 0.00233744, + "balance_loss_clip": 1.03791928, + "balance_loss_mlp": 0.21035497, + "epoch": 0.8297610100706448, + "flos": 20664041431680.0, + "grad_norm": 125.94898943565737, + "language_loss": 0.85036457, + "learning_rate": 2.9629358487786515e-07, + "loss": 0.86528647, + "num_input_tokens_seen": 297730860, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.23388672, + "step": 13801, + "time_per_iteration": 2.6669509410858154 + }, + { + "auxiliary_loss_clip": 0.01237897, + "auxiliary_loss_mlp": 0.00230582, + "balance_loss_clip": 1.0238452, + "balance_loss_mlp": 0.20885043, + "epoch": 0.8298211333233128, + "flos": 20376325491840.0, + "grad_norm": 312.94258918611513, + "language_loss": 0.81359339, + "learning_rate": 2.9608962470986476e-07, + "loss": 0.82827818, + "num_input_tokens_seen": 297749765, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.2175293, + "step": 13802, + "time_per_iteration": 2.7154736518859863 + }, + { + "auxiliary_loss_clip": 0.01243505, + "auxiliary_loss_mlp": 0.00248267, + "balance_loss_clip": 1.03043866, + "balance_loss_mlp": 0.22412759, + "epoch": 0.8298812565759808, + "flos": 21508696725120.0, + "grad_norm": 21.000970331903588, + "language_loss": 0.80955434, + "learning_rate": 2.9588572915431644e-07, + "loss": 0.82447207, + "num_input_tokens_seen": 297770380, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.24133301, + "step": 13803, + "time_per_iteration": 2.646883249282837 + }, + { + "auxiliary_loss_clip": 0.01239857, + "auxiliary_loss_mlp": 0.00228813, + "balance_loss_clip": 1.02238083, + "balance_loss_mlp": 0.20492311, + "epoch": 0.8299413798286487, + "flos": 22818681734400.0, + "grad_norm": 6.107876303975471, + "language_loss": 0.83322006, + "learning_rate": 2.9568189821895215e-07, + "loss": 0.84790677, + "num_input_tokens_seen": 297789440, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.23876953, + "step": 13804, + "time_per_iteration": 2.6580870151519775 + }, + { + "auxiliary_loss_clip": 0.01252406, + "auxiliary_loss_mlp": 0.00236474, + "balance_loss_clip": 1.03563797, + "balance_loss_mlp": 0.21240589, + "epoch": 0.8300015030813167, + "flos": 29679199683840.0, + "grad_norm": 4035.036957262985, + "language_loss": 0.81072009, + "learning_rate": 2.954781319115016e-07, + "loss": 0.82560891, + "num_input_tokens_seen": 297810425, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.24084473, + "step": 13805, + "time_per_iteration": 2.7219719886779785 + }, + { + "auxiliary_loss_clip": 0.01250442, + "auxiliary_loss_mlp": 0.0023703, + "balance_loss_clip": 1.02985334, + "balance_loss_mlp": 0.21181759, + "epoch": 0.8300616263339846, + "flos": 19719483436800.0, + "grad_norm": 9.063772738240289, + "language_loss": 0.85948133, + "learning_rate": 2.952744302396906e-07, + "loss": 0.87435603, + "num_input_tokens_seen": 297827680, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.25195312, + "step": 13806, + "time_per_iteration": 2.678441286087036 + }, + { + "auxiliary_loss_clip": 0.012555, + "auxiliary_loss_mlp": 0.00235658, + "balance_loss_clip": 1.03208649, + "balance_loss_mlp": 0.20964643, + "epoch": 0.8301217495866526, + "flos": 19901945548800.0, + "grad_norm": 64.40860713094932, + "language_loss": 0.69716597, + "learning_rate": 2.950707932112444e-07, + "loss": 0.7120775, + "num_input_tokens_seen": 297848005, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.26025391, + "step": 13807, + "time_per_iteration": 2.6556942462921143 + }, + { + "auxiliary_loss_clip": 0.01260434, + "auxiliary_loss_mlp": 0.00217766, + "balance_loss_clip": 1.04056144, + "balance_loss_mlp": 0.19307749, + "epoch": 0.8301818728393207, + "flos": 19715784336000.0, + "grad_norm": 144.61951363751692, + "language_loss": 0.82023251, + "learning_rate": 2.948672208338847e-07, + "loss": 0.83501446, + "num_input_tokens_seen": 297866730, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.24707031, + "step": 13808, + "time_per_iteration": 2.662076234817505 + }, + { + "auxiliary_loss_clip": 0.01258321, + "auxiliary_loss_mlp": 0.00253403, + "balance_loss_clip": 1.03631282, + "balance_loss_mlp": 0.22739121, + "epoch": 0.8302419960919886, + "flos": 28293658416000.0, + "grad_norm": 19.076714820311846, + "language_loss": 0.76192862, + "learning_rate": 2.9466371311533046e-07, + "loss": 0.77704585, + "num_input_tokens_seen": 297886390, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.26013184, + "step": 13809, + "time_per_iteration": 2.715114116668701 + }, + { + "auxiliary_loss_clip": 0.01247243, + "auxiliary_loss_mlp": 0.00204683, + "balance_loss_clip": 1.03241348, + "balance_loss_mlp": 0.18273634, + "epoch": 0.8303021193446566, + "flos": 18223444955520.0, + "grad_norm": 46.186096444210534, + "language_loss": 0.83046895, + "learning_rate": 2.9446027006329896e-07, + "loss": 0.84498823, + "num_input_tokens_seen": 297905110, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.21948242, + "step": 13810, + "time_per_iteration": 2.6643331050872803 + }, + { + "auxiliary_loss_clip": 0.01237264, + "auxiliary_loss_mlp": 0.00211861, + "balance_loss_clip": 1.02535307, + "balance_loss_mlp": 0.1893547, + "epoch": 0.8303622425973245, + "flos": 23111425578240.0, + "grad_norm": 6.199573626404562, + "language_loss": 0.89054924, + "learning_rate": 2.94256891685505e-07, + "loss": 0.9050405, + "num_input_tokens_seen": 297925460, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.22521973, + "step": 13811, + "time_per_iteration": 2.71201753616333 + }, + { + "auxiliary_loss_clip": 0.012571, + "auxiliary_loss_mlp": 0.00228161, + "balance_loss_clip": 1.03825879, + "balance_loss_mlp": 0.20447424, + "epoch": 0.8304223658499925, + "flos": 19572860119680.0, + "grad_norm": 10.428280280625035, + "language_loss": 0.81417882, + "learning_rate": 2.9405357798966156e-07, + "loss": 0.82903147, + "num_input_tokens_seen": 297941760, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.23669434, + "step": 13812, + "time_per_iteration": 2.6210925579071045 + }, + { + "auxiliary_loss_clip": 0.01229768, + "auxiliary_loss_mlp": 0.0021238, + "balance_loss_clip": 1.02218461, + "balance_loss_mlp": 0.18988499, + "epoch": 0.8304824891026604, + "flos": 24426115269120.0, + "grad_norm": 257.95596626486974, + "language_loss": 0.84575832, + "learning_rate": 2.9385032898347664e-07, + "loss": 0.8601799, + "num_input_tokens_seen": 297959745, + "router_z_loss_clip": 2.07617188, + "router_z_loss_mlp": 0.22485352, + "step": 13813, + "time_per_iteration": 2.6970741748809814 + }, + { + "auxiliary_loss_clip": 0.01265045, + "auxiliary_loss_mlp": 0.00244689, + "balance_loss_clip": 1.04274559, + "balance_loss_mlp": 0.21863002, + "epoch": 0.8305426123553284, + "flos": 22381792611840.0, + "grad_norm": 14.818326722933298, + "language_loss": 0.79712784, + "learning_rate": 2.93647144674658e-07, + "loss": 0.81222522, + "num_input_tokens_seen": 297977665, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.26049805, + "step": 13814, + "time_per_iteration": 2.6786389350891113 + }, + { + "auxiliary_loss_clip": 0.01290516, + "auxiliary_loss_mlp": 0.00246012, + "balance_loss_clip": 1.0536474, + "balance_loss_mlp": 0.21896353, + "epoch": 0.8306027356079964, + "flos": 14903575453440.0, + "grad_norm": 888.0188581491986, + "language_loss": 0.78344643, + "learning_rate": 2.9344402507091116e-07, + "loss": 0.79881167, + "num_input_tokens_seen": 297993525, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.27062988, + "step": 13815, + "time_per_iteration": 2.6335439682006836 + }, + { + "auxiliary_loss_clip": 0.01257862, + "auxiliary_loss_mlp": 0.00211582, + "balance_loss_clip": 1.03760135, + "balance_loss_mlp": 0.18645266, + "epoch": 0.8306628588606644, + "flos": 19644573623040.0, + "grad_norm": 20.944854037308055, + "language_loss": 0.85415787, + "learning_rate": 2.9324097017993745e-07, + "loss": 0.86885226, + "num_input_tokens_seen": 298012920, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.2512207, + "step": 13816, + "time_per_iteration": 2.617687702178955 + }, + { + "auxiliary_loss_clip": 0.01240626, + "auxiliary_loss_mlp": 0.0024419, + "balance_loss_clip": 1.02802896, + "balance_loss_mlp": 0.21997905, + "epoch": 0.8307229821133323, + "flos": 24389737770240.0, + "grad_norm": 7.831650968266981, + "language_loss": 0.88076079, + "learning_rate": 2.930379800094371e-07, + "loss": 0.8956089, + "num_input_tokens_seen": 298033310, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.2421875, + "step": 13817, + "time_per_iteration": 4.0785300731658936 + }, + { + "auxiliary_loss_clip": 0.01243724, + "auxiliary_loss_mlp": 0.00245404, + "balance_loss_clip": 1.02863467, + "balance_loss_mlp": 0.22020333, + "epoch": 0.8307831053660003, + "flos": 20996933702400.0, + "grad_norm": 3.7922020437824355, + "language_loss": 0.84956449, + "learning_rate": 2.9283505456710875e-07, + "loss": 0.86445582, + "num_input_tokens_seen": 298053530, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.25195312, + "step": 13818, + "time_per_iteration": 2.6947429180145264 + }, + { + "auxiliary_loss_clip": 0.01264067, + "auxiliary_loss_mlp": 0.00230884, + "balance_loss_clip": 1.04279721, + "balance_loss_mlp": 0.2056717, + "epoch": 0.8308432286186682, + "flos": 21397301671680.0, + "grad_norm": 6.3221500605381165, + "language_loss": 0.89299858, + "learning_rate": 2.926321938606453e-07, + "loss": 0.90794814, + "num_input_tokens_seen": 298069305, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.2520752, + "step": 13819, + "time_per_iteration": 4.141093969345093 + }, + { + "auxiliary_loss_clip": 0.01137863, + "auxiliary_loss_mlp": 0.00079354, + "balance_loss_clip": 0.9876911, + "balance_loss_mlp": 0.07124736, + "epoch": 0.8309033518713362, + "flos": 62533656714240.0, + "grad_norm": 0.8030089267201549, + "language_loss": 0.55680835, + "learning_rate": 2.924293978977399e-07, + "loss": 0.56898057, + "num_input_tokens_seen": 298125830, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.08105469, + "step": 13820, + "time_per_iteration": 3.1619043350219727 + }, + { + "auxiliary_loss_clip": 0.01253437, + "auxiliary_loss_mlp": 0.00225351, + "balance_loss_clip": 1.03535271, + "balance_loss_mlp": 0.20113996, + "epoch": 0.8309634751240043, + "flos": 16979104051200.0, + "grad_norm": 10.749788927244932, + "language_loss": 0.77753973, + "learning_rate": 2.922266666860831e-07, + "loss": 0.79232764, + "num_input_tokens_seen": 298142320, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.2421875, + "step": 13821, + "time_per_iteration": 2.6801838874816895 + }, + { + "auxiliary_loss_clip": 0.01261663, + "auxiliary_loss_mlp": 0.00245768, + "balance_loss_clip": 1.03595054, + "balance_loss_mlp": 0.22096057, + "epoch": 0.8310235983766722, + "flos": 22674464628480.0, + "grad_norm": 2.9733022639774545, + "language_loss": 0.77193981, + "learning_rate": 2.920240002333625e-07, + "loss": 0.78701413, + "num_input_tokens_seen": 298161845, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.24804688, + "step": 13822, + "time_per_iteration": 2.693302631378174 + }, + { + "auxiliary_loss_clip": 0.01238213, + "auxiliary_loss_mlp": 0.00245006, + "balance_loss_clip": 1.02748787, + "balance_loss_mlp": 0.22079468, + "epoch": 0.8310837216293402, + "flos": 30811463176320.0, + "grad_norm": 98.96779557857764, + "language_loss": 0.69136167, + "learning_rate": 2.918213985472631e-07, + "loss": 0.70619392, + "num_input_tokens_seen": 298184165, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.2421875, + "step": 13823, + "time_per_iteration": 4.348609685897827 + }, + { + "auxiliary_loss_clip": 0.01118151, + "auxiliary_loss_mlp": 0.0005821, + "balance_loss_clip": 0.96950424, + "balance_loss_mlp": 0.04977007, + "epoch": 0.8311438448820081, + "flos": 71276074997760.0, + "grad_norm": 0.9693701903317519, + "language_loss": 0.60529602, + "learning_rate": 2.916188616354669e-07, + "loss": 0.61705959, + "num_input_tokens_seen": 298251720, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.08447266, + "step": 13824, + "time_per_iteration": 3.2749969959259033 + }, + { + "auxiliary_loss_clip": 0.01231812, + "auxiliary_loss_mlp": 0.00248031, + "balance_loss_clip": 1.02023482, + "balance_loss_mlp": 0.2245353, + "epoch": 0.8312039681346761, + "flos": 20887082933760.0, + "grad_norm": 6.116696708296442, + "language_loss": 0.81350416, + "learning_rate": 2.914163895056552e-07, + "loss": 0.82830256, + "num_input_tokens_seen": 298271910, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.23498535, + "step": 13825, + "time_per_iteration": 2.640084981918335 + }, + { + "auxiliary_loss_clip": 0.01262407, + "auxiliary_loss_mlp": 0.00244593, + "balance_loss_clip": 1.04014015, + "balance_loss_mlp": 0.21985707, + "epoch": 0.831264091387344, + "flos": 17017528625280.0, + "grad_norm": 56.322355776331285, + "language_loss": 0.8763839, + "learning_rate": 2.9121398216550486e-07, + "loss": 0.89145386, + "num_input_tokens_seen": 298288105, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.24743652, + "step": 13826, + "time_per_iteration": 2.6370654106140137 + }, + { + "auxiliary_loss_clip": 0.01254225, + "auxiliary_loss_mlp": 0.00238169, + "balance_loss_clip": 1.03845787, + "balance_loss_mlp": 0.21270634, + "epoch": 0.831324214640012, + "flos": 24419578993920.0, + "grad_norm": 13.043725121205929, + "language_loss": 0.75174606, + "learning_rate": 2.910116396226914e-07, + "loss": 0.76666999, + "num_input_tokens_seen": 298307600, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.2545166, + "step": 13827, + "time_per_iteration": 2.6431050300598145 + }, + { + "auxiliary_loss_clip": 0.01247462, + "auxiliary_loss_mlp": 0.00231504, + "balance_loss_clip": 1.02937961, + "balance_loss_mlp": 0.20645835, + "epoch": 0.83138433789268, + "flos": 13545576938880.0, + "grad_norm": 9.226943275741856, + "language_loss": 0.81125385, + "learning_rate": 2.9080936188488834e-07, + "loss": 0.82604349, + "num_input_tokens_seen": 298323055, + "router_z_loss_clip": 2.18261719, + "router_z_loss_mlp": 0.25048828, + "step": 13828, + "time_per_iteration": 4.006492614746094 + }, + { + "auxiliary_loss_clip": 0.012479, + "auxiliary_loss_mlp": 0.00239689, + "balance_loss_clip": 1.03409505, + "balance_loss_mlp": 0.21520318, + "epoch": 0.831444461145348, + "flos": 44492386561920.0, + "grad_norm": 41.296919260227604, + "language_loss": 0.74980307, + "learning_rate": 2.906071489597657e-07, + "loss": 0.76467896, + "num_input_tokens_seen": 298346950, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.24511719, + "step": 13829, + "time_per_iteration": 2.8562862873077393 + }, + { + "auxiliary_loss_clip": 0.01261573, + "auxiliary_loss_mlp": 0.0023478, + "balance_loss_clip": 1.04317141, + "balance_loss_mlp": 0.20905453, + "epoch": 0.8315045843980159, + "flos": 22705024124160.0, + "grad_norm": 72.56369163897446, + "language_loss": 0.90519452, + "learning_rate": 2.9040500085499054e-07, + "loss": 0.92015803, + "num_input_tokens_seen": 298366315, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.25732422, + "step": 13830, + "time_per_iteration": 2.623713493347168 + }, + { + "auxiliary_loss_clip": 0.01252222, + "auxiliary_loss_mlp": 0.00233893, + "balance_loss_clip": 1.03937364, + "balance_loss_mlp": 0.21057522, + "epoch": 0.8315647076506839, + "flos": 16873491087360.0, + "grad_norm": 163.1740020647691, + "language_loss": 0.82869506, + "learning_rate": 2.9020291757822925e-07, + "loss": 0.84355617, + "num_input_tokens_seen": 298385185, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.2331543, + "step": 13831, + "time_per_iteration": 2.6503617763519287 + }, + { + "auxiliary_loss_clip": 0.0125491, + "auxiliary_loss_mlp": 0.00236162, + "balance_loss_clip": 1.03866494, + "balance_loss_mlp": 0.21278527, + "epoch": 0.8316248309033518, + "flos": 13808730954240.0, + "grad_norm": 4.712171340617204, + "language_loss": 0.80023301, + "learning_rate": 2.9000089913714523e-07, + "loss": 0.8151437, + "num_input_tokens_seen": 298402335, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.23400879, + "step": 13832, + "time_per_iteration": 2.5967254638671875 + }, + { + "auxiliary_loss_clip": 0.01262648, + "auxiliary_loss_mlp": 0.00239962, + "balance_loss_clip": 1.04241788, + "balance_loss_mlp": 0.2158581, + "epoch": 0.8316849541560198, + "flos": 23512511819520.0, + "grad_norm": 24.672247831249784, + "language_loss": 0.91111326, + "learning_rate": 2.897989455393979e-07, + "loss": 0.92613935, + "num_input_tokens_seen": 298423370, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.24084473, + "step": 13833, + "time_per_iteration": 2.6808390617370605 + }, + { + "auxiliary_loss_clip": 0.01262378, + "auxiliary_loss_mlp": 0.00229977, + "balance_loss_clip": 1.04490662, + "balance_loss_mlp": 0.20611136, + "epoch": 0.8317450774086879, + "flos": 23771356202880.0, + "grad_norm": 296.2131734149485, + "language_loss": 0.82004309, + "learning_rate": 2.8959705679264625e-07, + "loss": 0.83496666, + "num_input_tokens_seen": 298444835, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.23864746, + "step": 13834, + "time_per_iteration": 2.7046937942504883 + }, + { + "auxiliary_loss_clip": 0.01260389, + "auxiliary_loss_mlp": 0.00227554, + "balance_loss_clip": 1.04146039, + "balance_loss_mlp": 0.2034376, + "epoch": 0.8318052006613558, + "flos": 16215535710720.0, + "grad_norm": 23.7430650338012, + "language_loss": 0.87938696, + "learning_rate": 2.893952329045459e-07, + "loss": 0.89426637, + "num_input_tokens_seen": 298461845, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.24133301, + "step": 13835, + "time_per_iteration": 2.716485023498535 + }, + { + "auxiliary_loss_clip": 0.01258448, + "auxiliary_loss_mlp": 0.00223151, + "balance_loss_clip": 1.0358125, + "balance_loss_mlp": 0.19699669, + "epoch": 0.8318653239140238, + "flos": 19974556892160.0, + "grad_norm": 24.017322734328975, + "language_loss": 0.89827585, + "learning_rate": 2.8919347388274905e-07, + "loss": 0.91309184, + "num_input_tokens_seen": 298479095, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.26171875, + "step": 13836, + "time_per_iteration": 2.7292585372924805 + }, + { + "auxiliary_loss_clip": 0.01244848, + "auxiliary_loss_mlp": 0.00222658, + "balance_loss_clip": 1.03304005, + "balance_loss_mlp": 0.19848201, + "epoch": 0.8319254471666917, + "flos": 17704714694400.0, + "grad_norm": 5.28082408937824, + "language_loss": 0.86214459, + "learning_rate": 2.8899177973490727e-07, + "loss": 0.87681961, + "num_input_tokens_seen": 298494475, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.24182129, + "step": 13837, + "time_per_iteration": 2.6462252140045166 + }, + { + "auxiliary_loss_clip": 0.01260141, + "auxiliary_loss_mlp": 0.00250577, + "balance_loss_clip": 1.0381186, + "balance_loss_mlp": 0.22539988, + "epoch": 0.8319855704193597, + "flos": 19536554448000.0, + "grad_norm": 11.090970803105646, + "language_loss": 0.9020195, + "learning_rate": 2.887901504686685e-07, + "loss": 0.91712666, + "num_input_tokens_seen": 298513185, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.25170898, + "step": 13838, + "time_per_iteration": 2.646807909011841 + }, + { + "auxiliary_loss_clip": 0.01263165, + "auxiliary_loss_mlp": 0.00244577, + "balance_loss_clip": 1.04890537, + "balance_loss_mlp": 0.22041366, + "epoch": 0.8320456936720276, + "flos": 21178067011200.0, + "grad_norm": 6.811739518405201, + "language_loss": 0.81996512, + "learning_rate": 2.885885860916795e-07, + "loss": 0.8350426, + "num_input_tokens_seen": 298531885, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.24169922, + "step": 13839, + "time_per_iteration": 2.77290940284729 + }, + { + "auxiliary_loss_clip": 0.01257815, + "auxiliary_loss_mlp": 0.00246635, + "balance_loss_clip": 1.03885841, + "balance_loss_mlp": 0.22187537, + "epoch": 0.8321058169246957, + "flos": 33250874503680.0, + "grad_norm": 16.771998866196643, + "language_loss": 0.74395764, + "learning_rate": 2.8838708661158253e-07, + "loss": 0.75900221, + "num_input_tokens_seen": 298554905, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.24731445, + "step": 13840, + "time_per_iteration": 2.839007616043091 + }, + { + "auxiliary_loss_clip": 0.01259811, + "auxiliary_loss_mlp": 0.00237555, + "balance_loss_clip": 1.03717017, + "balance_loss_mlp": 0.21173392, + "epoch": 0.8321659401773636, + "flos": 14208129256320.0, + "grad_norm": 10.63976154055097, + "language_loss": 0.85496867, + "learning_rate": 2.8818565203601843e-07, + "loss": 0.86994231, + "num_input_tokens_seen": 298571185, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.25854492, + "step": 13841, + "time_per_iteration": 2.6224141120910645 + }, + { + "auxiliary_loss_clip": 0.01260438, + "auxiliary_loss_mlp": 0.0023773, + "balance_loss_clip": 1.04176474, + "balance_loss_mlp": 0.21170625, + "epoch": 0.8322260634300316, + "flos": 15158253859200.0, + "grad_norm": 6.565811622883925, + "language_loss": 0.77105165, + "learning_rate": 2.879842823726262e-07, + "loss": 0.78603333, + "num_input_tokens_seen": 298588505, + "router_z_loss_clip": 2.18847656, + "router_z_loss_mlp": 0.26037598, + "step": 13842, + "time_per_iteration": 2.6358580589294434 + }, + { + "auxiliary_loss_clip": 0.0124941, + "auxiliary_loss_mlp": 0.00220551, + "balance_loss_clip": 1.0357275, + "balance_loss_mlp": 0.19554067, + "epoch": 0.8322861866826995, + "flos": 25300827267840.0, + "grad_norm": 19.97146697020617, + "language_loss": 0.78260374, + "learning_rate": 2.8778297762904124e-07, + "loss": 0.79730332, + "num_input_tokens_seen": 298609295, + "router_z_loss_clip": 2.13574219, + "router_z_loss_mlp": 0.25024414, + "step": 13843, + "time_per_iteration": 2.6698038578033447 + }, + { + "auxiliary_loss_clip": 0.01241542, + "auxiliary_loss_mlp": 0.00221936, + "balance_loss_clip": 1.03088486, + "balance_loss_mlp": 0.1994893, + "epoch": 0.8323463099353675, + "flos": 17019360218880.0, + "grad_norm": 13.71192230217096, + "language_loss": 0.85910898, + "learning_rate": 2.875817378128975e-07, + "loss": 0.87374377, + "num_input_tokens_seen": 298625765, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.22436523, + "step": 13844, + "time_per_iteration": 2.6656699180603027 + }, + { + "auxiliary_loss_clip": 0.01141178, + "auxiliary_loss_mlp": 0.00106275, + "balance_loss_clip": 0.99043518, + "balance_loss_mlp": 0.09840752, + "epoch": 0.8324064331880354, + "flos": 55607889709440.0, + "grad_norm": 0.7651262316004078, + "language_loss": 0.5482384, + "learning_rate": 2.8738056293182624e-07, + "loss": 0.56071293, + "num_input_tokens_seen": 298683005, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.07861328, + "step": 13845, + "time_per_iteration": 3.0707898139953613 + }, + { + "auxiliary_loss_clip": 0.01261067, + "auxiliary_loss_mlp": 0.00232365, + "balance_loss_clip": 1.04176593, + "balance_loss_mlp": 0.20841566, + "epoch": 0.8324665564407034, + "flos": 26138623063680.0, + "grad_norm": 7.793613726941194, + "language_loss": 0.82155013, + "learning_rate": 2.871794529934555e-07, + "loss": 0.83648443, + "num_input_tokens_seen": 298703060, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.23950195, + "step": 13846, + "time_per_iteration": 2.7012441158294678 + }, + { + "auxiliary_loss_clip": 0.0127525, + "auxiliary_loss_mlp": 0.00231281, + "balance_loss_clip": 1.04942322, + "balance_loss_mlp": 0.20504341, + "epoch": 0.8325266796933715, + "flos": 22049187649920.0, + "grad_norm": 24.363714453125773, + "language_loss": 0.86209196, + "learning_rate": 2.8697840800541115e-07, + "loss": 0.87715721, + "num_input_tokens_seen": 298721765, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.26257324, + "step": 13847, + "time_per_iteration": 2.68108868598938 + }, + { + "auxiliary_loss_clip": 0.01240156, + "auxiliary_loss_mlp": 0.00213072, + "balance_loss_clip": 1.02685642, + "balance_loss_mlp": 0.18894404, + "epoch": 0.8325868029460394, + "flos": 22816634659200.0, + "grad_norm": 6.646408599914071, + "language_loss": 0.81511569, + "learning_rate": 2.867774279753175e-07, + "loss": 0.82964802, + "num_input_tokens_seen": 298740825, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.24133301, + "step": 13848, + "time_per_iteration": 2.7149817943573 + }, + { + "auxiliary_loss_clip": 0.01252564, + "auxiliary_loss_mlp": 0.00219491, + "balance_loss_clip": 1.03701997, + "balance_loss_mlp": 0.19640031, + "epoch": 0.8326469261987074, + "flos": 14757454926720.0, + "grad_norm": 24.19997684299332, + "language_loss": 0.70498061, + "learning_rate": 2.8657651291079554e-07, + "loss": 0.71970117, + "num_input_tokens_seen": 298758515, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.23071289, + "step": 13849, + "time_per_iteration": 2.626472234725952 + }, + { + "auxiliary_loss_clip": 0.01245963, + "auxiliary_loss_mlp": 0.0022354, + "balance_loss_clip": 1.03249753, + "balance_loss_mlp": 0.20124771, + "epoch": 0.8327070494513753, + "flos": 22926126291840.0, + "grad_norm": 3.80235230795064, + "language_loss": 0.90057361, + "learning_rate": 2.863756628194638e-07, + "loss": 0.91526866, + "num_input_tokens_seen": 298776375, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.22277832, + "step": 13850, + "time_per_iteration": 2.7565460205078125 + }, + { + "auxiliary_loss_clip": 0.01238535, + "auxiliary_loss_mlp": 0.00227404, + "balance_loss_clip": 1.02730107, + "balance_loss_mlp": 0.20446862, + "epoch": 0.8327671727040433, + "flos": 20665334321280.0, + "grad_norm": 8.60012012176768, + "language_loss": 0.85319155, + "learning_rate": 2.8617487770893877e-07, + "loss": 0.8678509, + "num_input_tokens_seen": 298795135, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.22961426, + "step": 13851, + "time_per_iteration": 2.7233338356018066 + }, + { + "auxiliary_loss_clip": 0.01139699, + "auxiliary_loss_mlp": 0.00137306, + "balance_loss_clip": 0.9915055, + "balance_loss_mlp": 0.12934296, + "epoch": 0.8328272959567112, + "flos": 56060760384000.0, + "grad_norm": 0.9620154445664564, + "language_loss": 0.55278707, + "learning_rate": 2.859741575868344e-07, + "loss": 0.56555712, + "num_input_tokens_seen": 298855475, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.07958984, + "step": 13852, + "time_per_iteration": 3.133488178253174 + }, + { + "auxiliary_loss_clip": 0.0123702, + "auxiliary_loss_mlp": 0.00218012, + "balance_loss_clip": 1.02560973, + "balance_loss_mlp": 0.19628012, + "epoch": 0.8328874192093793, + "flos": 32303084284800.0, + "grad_norm": 10.282039438913827, + "language_loss": 0.74003565, + "learning_rate": 2.8577350246076125e-07, + "loss": 0.75458598, + "num_input_tokens_seen": 298875875, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.21728516, + "step": 13853, + "time_per_iteration": 2.73909330368042 + }, + { + "auxiliary_loss_clip": 0.01246707, + "auxiliary_loss_mlp": 0.00237239, + "balance_loss_clip": 1.02768183, + "balance_loss_mlp": 0.21275368, + "epoch": 0.8329475424620472, + "flos": 23512691387520.0, + "grad_norm": 110.96289802698962, + "language_loss": 0.86351681, + "learning_rate": 2.855729123383286e-07, + "loss": 0.87835628, + "num_input_tokens_seen": 298895950, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.24487305, + "step": 13854, + "time_per_iteration": 2.637152671813965 + }, + { + "auxiliary_loss_clip": 0.01149375, + "auxiliary_loss_mlp": 0.00195855, + "balance_loss_clip": 0.99815136, + "balance_loss_mlp": 0.18736708, + "epoch": 0.8330076657147152, + "flos": 67840680378240.0, + "grad_norm": 0.757006895129408, + "language_loss": 0.57889485, + "learning_rate": 2.8537238722714295e-07, + "loss": 0.59234715, + "num_input_tokens_seen": 298955770, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.08496094, + "step": 13855, + "time_per_iteration": 3.023240327835083 + }, + { + "auxiliary_loss_clip": 0.01251637, + "auxiliary_loss_mlp": 0.00228334, + "balance_loss_clip": 1.03302813, + "balance_loss_mlp": 0.20299006, + "epoch": 0.8330677889673831, + "flos": 22892801448960.0, + "grad_norm": 309.77100567679184, + "language_loss": 0.8242563, + "learning_rate": 2.8517192713480853e-07, + "loss": 0.83905602, + "num_input_tokens_seen": 298976545, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.25341797, + "step": 13856, + "time_per_iteration": 2.7564315795898438 + }, + { + "auxiliary_loss_clip": 0.0125073, + "auxiliary_loss_mlp": 0.00245995, + "balance_loss_clip": 1.034464, + "balance_loss_mlp": 0.22037646, + "epoch": 0.8331279122200511, + "flos": 27345042184320.0, + "grad_norm": 9.537398100261386, + "language_loss": 0.82104933, + "learning_rate": 2.8497153206892677e-07, + "loss": 0.83601665, + "num_input_tokens_seen": 298996750, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.25622559, + "step": 13857, + "time_per_iteration": 2.7504165172576904 + }, + { + "auxiliary_loss_clip": 0.01235697, + "auxiliary_loss_mlp": 0.00212058, + "balance_loss_clip": 1.0282675, + "balance_loss_mlp": 0.18995686, + "epoch": 0.833188035472719, + "flos": 19938179393280.0, + "grad_norm": 3.615291738024865, + "language_loss": 0.79325855, + "learning_rate": 2.847712020370958e-07, + "loss": 0.80773616, + "num_input_tokens_seen": 299014895, + "router_z_loss_clip": 2.07617188, + "router_z_loss_mlp": 0.22106934, + "step": 13858, + "time_per_iteration": 2.6877708435058594 + }, + { + "auxiliary_loss_clip": 0.01290425, + "auxiliary_loss_mlp": 0.00250024, + "balance_loss_clip": 1.06108475, + "balance_loss_mlp": 0.22175951, + "epoch": 0.833248158725387, + "flos": 15232624968960.0, + "grad_norm": 32.232577787365834, + "language_loss": 0.86521351, + "learning_rate": 2.8457093704691316e-07, + "loss": 0.88061798, + "num_input_tokens_seen": 299032855, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.28259277, + "step": 13859, + "time_per_iteration": 2.654614210128784 + }, + { + "auxiliary_loss_clip": 0.01262294, + "auxiliary_loss_mlp": 0.00237027, + "balance_loss_clip": 1.04462814, + "balance_loss_mlp": 0.21223134, + "epoch": 0.8333082819780551, + "flos": 24535535074560.0, + "grad_norm": 4.441699606250712, + "language_loss": 0.86301148, + "learning_rate": 2.8437073710597205e-07, + "loss": 0.87800473, + "num_input_tokens_seen": 299052055, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.24780273, + "step": 13860, + "time_per_iteration": 4.134536266326904 + }, + { + "auxiliary_loss_clip": 0.01246247, + "auxiliary_loss_mlp": 0.00245111, + "balance_loss_clip": 1.03449273, + "balance_loss_mlp": 0.22188941, + "epoch": 0.833368405230723, + "flos": 31467407391360.0, + "grad_norm": 16.14362777445629, + "language_loss": 0.88738424, + "learning_rate": 2.841706022218644e-07, + "loss": 0.90229785, + "num_input_tokens_seen": 299075285, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.2322998, + "step": 13861, + "time_per_iteration": 4.2075395584106445 + }, + { + "auxiliary_loss_clip": 0.01256041, + "auxiliary_loss_mlp": 0.00235356, + "balance_loss_clip": 1.03746939, + "balance_loss_mlp": 0.20998821, + "epoch": 0.833428528483391, + "flos": 14902713527040.0, + "grad_norm": 11.473391171840477, + "language_loss": 0.87088877, + "learning_rate": 2.839705324021806e-07, + "loss": 0.88580275, + "num_input_tokens_seen": 299092520, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.25402832, + "step": 13862, + "time_per_iteration": 2.651503324508667 + }, + { + "auxiliary_loss_clip": 0.01256325, + "auxiliary_loss_mlp": 0.00258129, + "balance_loss_clip": 1.03655362, + "balance_loss_mlp": 0.23267823, + "epoch": 0.8334886517360589, + "flos": 22199833290240.0, + "grad_norm": 282.1429552772105, + "language_loss": 0.84314531, + "learning_rate": 2.83770527654505e-07, + "loss": 0.85828984, + "num_input_tokens_seen": 299109450, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.25463867, + "step": 13863, + "time_per_iteration": 2.718950033187866 + }, + { + "auxiliary_loss_clip": 0.01249039, + "auxiliary_loss_mlp": 0.0021998, + "balance_loss_clip": 1.03782582, + "balance_loss_mlp": 0.19747353, + "epoch": 0.8335487749887269, + "flos": 30372562892160.0, + "grad_norm": 267.19248680580796, + "language_loss": 0.8169986, + "learning_rate": 2.835705879864232e-07, + "loss": 0.83168876, + "num_input_tokens_seen": 299129540, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.2253418, + "step": 13864, + "time_per_iteration": 2.7328476905822754 + }, + { + "auxiliary_loss_clip": 0.01253752, + "auxiliary_loss_mlp": 0.00227082, + "balance_loss_clip": 1.03708994, + "balance_loss_mlp": 0.20285875, + "epoch": 0.8336088982413948, + "flos": 24681152810880.0, + "grad_norm": 2.652391819235367, + "language_loss": 0.76489562, + "learning_rate": 2.833707134055168e-07, + "loss": 0.77970392, + "num_input_tokens_seen": 299148670, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.24243164, + "step": 13865, + "time_per_iteration": 2.7088258266448975 + }, + { + "auxiliary_loss_clip": 0.01258892, + "auxiliary_loss_mlp": 0.00235096, + "balance_loss_clip": 1.04841781, + "balance_loss_mlp": 0.2116951, + "epoch": 0.8336690214940629, + "flos": 38177207873280.0, + "grad_norm": 3.591509157395617, + "language_loss": 0.82437032, + "learning_rate": 2.831709039193653e-07, + "loss": 0.83931017, + "num_input_tokens_seen": 299169330, + "router_z_loss_clip": 2.10449219, + "router_z_loss_mlp": 0.23413086, + "step": 13866, + "time_per_iteration": 4.330663204193115 + }, + { + "auxiliary_loss_clip": 0.01129108, + "auxiliary_loss_mlp": 0.00077726, + "balance_loss_clip": 0.98731381, + "balance_loss_mlp": 0.07000114, + "epoch": 0.8337291447467308, + "flos": 55565119589760.0, + "grad_norm": 0.8408839763035625, + "language_loss": 0.61918813, + "learning_rate": 2.8297115953554465e-07, + "loss": 0.63125646, + "num_input_tokens_seen": 299220980, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.07714844, + "step": 13867, + "time_per_iteration": 3.1018972396850586 + }, + { + "auxiliary_loss_clip": 0.01263233, + "auxiliary_loss_mlp": 0.00237365, + "balance_loss_clip": 1.0476172, + "balance_loss_mlp": 0.21286753, + "epoch": 0.8337892679993988, + "flos": 24133550993280.0, + "grad_norm": 31.42192113352572, + "language_loss": 0.79189903, + "learning_rate": 2.827714802616301e-07, + "loss": 0.80690503, + "num_input_tokens_seen": 299240130, + "router_z_loss_clip": 2.15917969, + "router_z_loss_mlp": 0.24499512, + "step": 13868, + "time_per_iteration": 2.670517921447754 + }, + { + "auxiliary_loss_clip": 0.01266506, + "auxiliary_loss_mlp": 0.00227662, + "balance_loss_clip": 1.04924393, + "balance_loss_mlp": 0.20255625, + "epoch": 0.8338493912520667, + "flos": 28183915388160.0, + "grad_norm": 2.6745417749301867, + "language_loss": 0.86488926, + "learning_rate": 2.8257186610519325e-07, + "loss": 0.87983096, + "num_input_tokens_seen": 299260705, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.2512207, + "step": 13869, + "time_per_iteration": 2.7124533653259277 + }, + { + "auxiliary_loss_clip": 0.01268606, + "auxiliary_loss_mlp": 0.002235, + "balance_loss_clip": 1.04994631, + "balance_loss_mlp": 0.20038566, + "epoch": 0.8339095145047347, + "flos": 22158356060160.0, + "grad_norm": 961.539759478965, + "language_loss": 0.88789672, + "learning_rate": 2.823723170738028e-07, + "loss": 0.90281779, + "num_input_tokens_seen": 299278925, + "router_z_loss_clip": 2.18847656, + "router_z_loss_mlp": 0.2310791, + "step": 13870, + "time_per_iteration": 4.051081895828247 + }, + { + "auxiliary_loss_clip": 0.01248756, + "auxiliary_loss_mlp": 0.00223357, + "balance_loss_clip": 1.0317229, + "balance_loss_mlp": 0.19996831, + "epoch": 0.8339696377574026, + "flos": 17307112072320.0, + "grad_norm": 94.67120819769241, + "language_loss": 0.80142617, + "learning_rate": 2.821728331750264e-07, + "loss": 0.81614733, + "num_input_tokens_seen": 299291580, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.23376465, + "step": 13871, + "time_per_iteration": 2.7701234817504883 + }, + { + "auxiliary_loss_clip": 0.01240075, + "auxiliary_loss_mlp": 0.00222306, + "balance_loss_clip": 1.02707708, + "balance_loss_mlp": 0.19902432, + "epoch": 0.8340297610100706, + "flos": 20668351063680.0, + "grad_norm": 21.48627631839179, + "language_loss": 0.7712326, + "learning_rate": 2.8197341441642853e-07, + "loss": 0.78585637, + "num_input_tokens_seen": 299310385, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.23278809, + "step": 13872, + "time_per_iteration": 2.6460704803466797 + }, + { + "auxiliary_loss_clip": 0.01266975, + "auxiliary_loss_mlp": 0.00233412, + "balance_loss_clip": 1.04515982, + "balance_loss_mlp": 0.20769854, + "epoch": 0.8340898842627387, + "flos": 20515442866560.0, + "grad_norm": 44.73213539929616, + "language_loss": 0.82164693, + "learning_rate": 2.817740608055712e-07, + "loss": 0.83665085, + "num_input_tokens_seen": 299327660, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.25708008, + "step": 13873, + "time_per_iteration": 2.6297760009765625 + }, + { + "auxiliary_loss_clip": 0.01283428, + "auxiliary_loss_mlp": 0.00228477, + "balance_loss_clip": 1.05421901, + "balance_loss_mlp": 0.19886523, + "epoch": 0.8341500075154066, + "flos": 21425850005760.0, + "grad_norm": 15.683187883502741, + "language_loss": 0.84218472, + "learning_rate": 2.81574772350013e-07, + "loss": 0.8573038, + "num_input_tokens_seen": 299343685, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.29614258, + "step": 13874, + "time_per_iteration": 2.6503989696502686 + }, + { + "auxiliary_loss_clip": 0.01260701, + "auxiliary_loss_mlp": 0.00239877, + "balance_loss_clip": 1.04756773, + "balance_loss_mlp": 0.2170721, + "epoch": 0.8342101307680746, + "flos": 22090988102400.0, + "grad_norm": 65.10395329525184, + "language_loss": 0.74930525, + "learning_rate": 2.813755490573118e-07, + "loss": 0.76431108, + "num_input_tokens_seen": 299363305, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.22802734, + "step": 13875, + "time_per_iteration": 2.7033369541168213 + }, + { + "auxiliary_loss_clip": 0.01246972, + "auxiliary_loss_mlp": 0.0024061, + "balance_loss_clip": 1.03336716, + "balance_loss_mlp": 0.21664852, + "epoch": 0.8342702540207425, + "flos": 21871466133120.0, + "grad_norm": 35.6247521691611, + "language_loss": 0.86562002, + "learning_rate": 2.8117639093502243e-07, + "loss": 0.88049579, + "num_input_tokens_seen": 299382630, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.23950195, + "step": 13876, + "time_per_iteration": 2.67905330657959 + }, + { + "auxiliary_loss_clip": 0.01238116, + "auxiliary_loss_mlp": 0.0020385, + "balance_loss_clip": 1.02627707, + "balance_loss_mlp": 0.18119985, + "epoch": 0.8343303772734105, + "flos": 22528487756160.0, + "grad_norm": 178.4478056406995, + "language_loss": 0.95420909, + "learning_rate": 2.8097729799069615e-07, + "loss": 0.96862876, + "num_input_tokens_seen": 299402385, + "router_z_loss_clip": 2.11425781, + "router_z_loss_mlp": 0.2265625, + "step": 13877, + "time_per_iteration": 2.6645212173461914 + }, + { + "auxiliary_loss_clip": 0.01250246, + "auxiliary_loss_mlp": 0.00235273, + "balance_loss_clip": 1.03231502, + "balance_loss_mlp": 0.21044208, + "epoch": 0.8343905005260784, + "flos": 14939773384320.0, + "grad_norm": 13.320538036946756, + "language_loss": 0.75795174, + "learning_rate": 2.807782702318828e-07, + "loss": 0.77280694, + "num_input_tokens_seen": 299419820, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.24829102, + "step": 13878, + "time_per_iteration": 2.6764557361602783 + }, + { + "auxiliary_loss_clip": 0.01271151, + "auxiliary_loss_mlp": 0.00226007, + "balance_loss_clip": 1.04528713, + "balance_loss_mlp": 0.20026982, + "epoch": 0.8344506237787465, + "flos": 15012456554880.0, + "grad_norm": 31.465089763112232, + "language_loss": 0.87459135, + "learning_rate": 2.805793076661309e-07, + "loss": 0.88956296, + "num_input_tokens_seen": 299436265, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.25720215, + "step": 13879, + "time_per_iteration": 2.687739372253418 + }, + { + "auxiliary_loss_clip": 0.01248172, + "auxiliary_loss_mlp": 0.00227844, + "balance_loss_clip": 1.03312039, + "balance_loss_mlp": 0.20277466, + "epoch": 0.8345107470314144, + "flos": 17560389847680.0, + "grad_norm": 135.67830840457447, + "language_loss": 0.89776331, + "learning_rate": 2.803804103009828e-07, + "loss": 0.91252351, + "num_input_tokens_seen": 299451660, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.25085449, + "step": 13880, + "time_per_iteration": 2.6845197677612305 + }, + { + "auxiliary_loss_clip": 0.01252255, + "auxiliary_loss_mlp": 0.00218039, + "balance_loss_clip": 1.03555536, + "balance_loss_mlp": 0.19413757, + "epoch": 0.8345708702840824, + "flos": 25187277398400.0, + "grad_norm": 15.31731083723965, + "language_loss": 0.83332235, + "learning_rate": 2.80181578143982e-07, + "loss": 0.84802532, + "num_input_tokens_seen": 299472070, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.23913574, + "step": 13881, + "time_per_iteration": 2.679426670074463 + }, + { + "auxiliary_loss_clip": 0.01244117, + "auxiliary_loss_mlp": 0.00244759, + "balance_loss_clip": 1.03107023, + "balance_loss_mlp": 0.22057158, + "epoch": 0.8346309935367503, + "flos": 15083559527040.0, + "grad_norm": 93.65799037542583, + "language_loss": 0.87864429, + "learning_rate": 2.7998281120266807e-07, + "loss": 0.89353305, + "num_input_tokens_seen": 299486725, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.24194336, + "step": 13882, + "time_per_iteration": 2.7177250385284424 + }, + { + "auxiliary_loss_clip": 0.01249713, + "auxiliary_loss_mlp": 0.0025268, + "balance_loss_clip": 1.03407633, + "balance_loss_mlp": 0.22688295, + "epoch": 0.8346911167894183, + "flos": 22930615491840.0, + "grad_norm": 4.713681502623207, + "language_loss": 0.89110625, + "learning_rate": 2.79784109484579e-07, + "loss": 0.90613019, + "num_input_tokens_seen": 299505435, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.25817871, + "step": 13883, + "time_per_iteration": 2.6467316150665283 + }, + { + "auxiliary_loss_clip": 0.01259921, + "auxiliary_loss_mlp": 0.00239717, + "balance_loss_clip": 1.04154325, + "balance_loss_mlp": 0.21514761, + "epoch": 0.8347512400420862, + "flos": 20193037367040.0, + "grad_norm": 12.04935402057782, + "language_loss": 0.82619524, + "learning_rate": 2.795854729972482e-07, + "loss": 0.84119165, + "num_input_tokens_seen": 299523555, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.24523926, + "step": 13884, + "time_per_iteration": 2.6880276203155518 + }, + { + "auxiliary_loss_clip": 0.01305213, + "auxiliary_loss_mlp": 0.00245295, + "balance_loss_clip": 1.06747818, + "balance_loss_mlp": 0.21689951, + "epoch": 0.8348113632947542, + "flos": 25954832148480.0, + "grad_norm": 10.799399774436171, + "language_loss": 0.79262841, + "learning_rate": 2.7938690174820913e-07, + "loss": 0.80813348, + "num_input_tokens_seen": 299541660, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.28381348, + "step": 13885, + "time_per_iteration": 2.843804121017456 + }, + { + "auxiliary_loss_clip": 0.01249722, + "auxiliary_loss_mlp": 0.00226991, + "balance_loss_clip": 1.03582919, + "balance_loss_mlp": 0.20335157, + "epoch": 0.8348714865474223, + "flos": 34204554552960.0, + "grad_norm": 620.8634900510882, + "language_loss": 0.76443416, + "learning_rate": 2.791883957449912e-07, + "loss": 0.77920127, + "num_input_tokens_seen": 299562465, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.23632812, + "step": 13886, + "time_per_iteration": 2.8130409717559814 + }, + { + "auxiliary_loss_clip": 0.01258431, + "auxiliary_loss_mlp": 0.00207397, + "balance_loss_clip": 1.04096532, + "balance_loss_mlp": 0.1828759, + "epoch": 0.8349316098000902, + "flos": 24390132819840.0, + "grad_norm": 87.53446254820078, + "language_loss": 0.85654044, + "learning_rate": 2.7898995499512134e-07, + "loss": 0.87119871, + "num_input_tokens_seen": 299582700, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.24536133, + "step": 13887, + "time_per_iteration": 2.7548434734344482 + }, + { + "auxiliary_loss_clip": 0.01274019, + "auxiliary_loss_mlp": 0.00233154, + "balance_loss_clip": 1.05274653, + "balance_loss_mlp": 0.20779788, + "epoch": 0.8349917330527582, + "flos": 23032744836480.0, + "grad_norm": 106.57875416138464, + "language_loss": 0.7918399, + "learning_rate": 2.7879157950612467e-07, + "loss": 0.80691165, + "num_input_tokens_seen": 299600310, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.25341797, + "step": 13888, + "time_per_iteration": 2.6814167499542236 + }, + { + "auxiliary_loss_clip": 0.01278125, + "auxiliary_loss_mlp": 0.0023903, + "balance_loss_clip": 1.05050695, + "balance_loss_mlp": 0.21398437, + "epoch": 0.8350518563054261, + "flos": 13625873792640.0, + "grad_norm": 16.859763353888123, + "language_loss": 0.77963853, + "learning_rate": 2.785932692855244e-07, + "loss": 0.79481012, + "num_input_tokens_seen": 299617025, + "router_z_loss_clip": 2.27246094, + "router_z_loss_mlp": 0.25036621, + "step": 13889, + "time_per_iteration": 2.664193868637085 + }, + { + "auxiliary_loss_clip": 0.01258472, + "auxiliary_loss_mlp": 0.00220093, + "balance_loss_clip": 1.03914452, + "balance_loss_mlp": 0.19651377, + "epoch": 0.8351119795580941, + "flos": 21579799697280.0, + "grad_norm": 9.659830748195038, + "language_loss": 0.7722019, + "learning_rate": 2.783950243408399e-07, + "loss": 0.78698754, + "num_input_tokens_seen": 299633050, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.23571777, + "step": 13890, + "time_per_iteration": 2.6464157104492188 + }, + { + "auxiliary_loss_clip": 0.01276042, + "auxiliary_loss_mlp": 0.00238988, + "balance_loss_clip": 1.05443811, + "balance_loss_mlp": 0.21434793, + "epoch": 0.835172102810762, + "flos": 20038297576320.0, + "grad_norm": 22.692925171413645, + "language_loss": 0.69700474, + "learning_rate": 2.7819684467958817e-07, + "loss": 0.71215498, + "num_input_tokens_seen": 299646445, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.24621582, + "step": 13891, + "time_per_iteration": 2.6518824100494385 + }, + { + "auxiliary_loss_clip": 0.01247365, + "auxiliary_loss_mlp": 0.00226557, + "balance_loss_clip": 1.03565359, + "balance_loss_mlp": 0.20303681, + "epoch": 0.8352322260634301, + "flos": 25111577485440.0, + "grad_norm": 11.072407850125444, + "language_loss": 0.76764309, + "learning_rate": 2.779987303092846e-07, + "loss": 0.78238225, + "num_input_tokens_seen": 299662665, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.23522949, + "step": 13892, + "time_per_iteration": 2.679222583770752 + }, + { + "auxiliary_loss_clip": 0.01252581, + "auxiliary_loss_mlp": 0.00216724, + "balance_loss_clip": 1.03643095, + "balance_loss_mlp": 0.19399107, + "epoch": 0.835292349316098, + "flos": 24863758577280.0, + "grad_norm": 2.4501881940684442, + "language_loss": 0.72604823, + "learning_rate": 2.7780068123744207e-07, + "loss": 0.74074125, + "num_input_tokens_seen": 299683585, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.22741699, + "step": 13893, + "time_per_iteration": 2.703325033187866 + }, + { + "auxiliary_loss_clip": 0.01259902, + "auxiliary_loss_mlp": 0.00251394, + "balance_loss_clip": 1.04173207, + "balance_loss_mlp": 0.22424977, + "epoch": 0.835352472568766, + "flos": 19865568049920.0, + "grad_norm": 3.5430756064374047, + "language_loss": 0.87483215, + "learning_rate": 2.7760269747156996e-07, + "loss": 0.88994515, + "num_input_tokens_seen": 299702680, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.27160645, + "step": 13894, + "time_per_iteration": 2.737489700317383 + }, + { + "auxiliary_loss_clip": 0.01241356, + "auxiliary_loss_mlp": 0.00219781, + "balance_loss_clip": 1.03006768, + "balance_loss_mlp": 0.19584361, + "epoch": 0.8354125958214339, + "flos": 22054754257920.0, + "grad_norm": 159.0754439182516, + "language_loss": 0.80475384, + "learning_rate": 2.7740477901917625e-07, + "loss": 0.8193652, + "num_input_tokens_seen": 299721050, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.23937988, + "step": 13895, + "time_per_iteration": 2.688227415084839 + }, + { + "auxiliary_loss_clip": 0.0126959, + "auxiliary_loss_mlp": 0.00257711, + "balance_loss_clip": 1.05105221, + "balance_loss_mlp": 0.23090053, + "epoch": 0.8354727190741019, + "flos": 21397804462080.0, + "grad_norm": 86.04088801196643, + "language_loss": 0.8140502, + "learning_rate": 2.772069258877667e-07, + "loss": 0.82932323, + "num_input_tokens_seen": 299738255, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.26843262, + "step": 13896, + "time_per_iteration": 2.6331627368927 + }, + { + "auxiliary_loss_clip": 0.01252634, + "auxiliary_loss_mlp": 0.00207199, + "balance_loss_clip": 1.03647256, + "balance_loss_mlp": 0.18497854, + "epoch": 0.8355328423267698, + "flos": 50840997834240.0, + "grad_norm": 20.817570793137097, + "language_loss": 0.67555076, + "learning_rate": 2.770091380848423e-07, + "loss": 0.69014907, + "num_input_tokens_seen": 299761315, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.22241211, + "step": 13897, + "time_per_iteration": 2.9100496768951416 + }, + { + "auxiliary_loss_clip": 0.01159732, + "auxiliary_loss_mlp": 0.00148947, + "balance_loss_clip": 1.01592469, + "balance_loss_mlp": 0.13945788, + "epoch": 0.8355929655794379, + "flos": 65551052764800.0, + "grad_norm": 0.7071299721380457, + "language_loss": 0.56999946, + "learning_rate": 2.7681141561790423e-07, + "loss": 0.58308625, + "num_input_tokens_seen": 299828735, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.09472656, + "step": 13898, + "time_per_iteration": 3.202359437942505 + }, + { + "auxiliary_loss_clip": 0.01269084, + "auxiliary_loss_mlp": 0.00237732, + "balance_loss_clip": 1.0478816, + "balance_loss_mlp": 0.21170908, + "epoch": 0.8356530888321058, + "flos": 19170516902400.0, + "grad_norm": 37.669097836609154, + "language_loss": 0.89813882, + "learning_rate": 2.7661375849444967e-07, + "loss": 0.91320705, + "num_input_tokens_seen": 299848395, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.26000977, + "step": 13899, + "time_per_iteration": 2.6208600997924805 + }, + { + "auxiliary_loss_clip": 0.01269404, + "auxiliary_loss_mlp": 0.00232851, + "balance_loss_clip": 1.05295885, + "balance_loss_mlp": 0.20796055, + "epoch": 0.8357132120847738, + "flos": 44126672238720.0, + "grad_norm": 4.028850471628516, + "language_loss": 0.74851757, + "learning_rate": 2.764161667219749e-07, + "loss": 0.76354015, + "num_input_tokens_seen": 299871665, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.24890137, + "step": 13900, + "time_per_iteration": 2.8973681926727295 + }, + { + "auxiliary_loss_clip": 0.01262192, + "auxiliary_loss_mlp": 0.0024598, + "balance_loss_clip": 1.04271197, + "balance_loss_mlp": 0.22101757, + "epoch": 0.8357733353374418, + "flos": 24389701856640.0, + "grad_norm": 9.11913424850274, + "language_loss": 0.79403883, + "learning_rate": 2.762186403079716e-07, + "loss": 0.80912054, + "num_input_tokens_seen": 299891960, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.24975586, + "step": 13901, + "time_per_iteration": 2.694429874420166 + }, + { + "auxiliary_loss_clip": 0.01274922, + "auxiliary_loss_mlp": 0.00225132, + "balance_loss_clip": 1.04883909, + "balance_loss_mlp": 0.20022893, + "epoch": 0.8358334585901097, + "flos": 20916313626240.0, + "grad_norm": 18.071865079606397, + "language_loss": 0.88884592, + "learning_rate": 2.7602117925992963e-07, + "loss": 0.90384644, + "num_input_tokens_seen": 299905070, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.24902344, + "step": 13902, + "time_per_iteration": 4.090671539306641 + }, + { + "auxiliary_loss_clip": 0.01248042, + "auxiliary_loss_mlp": 0.0022048, + "balance_loss_clip": 1.03912222, + "balance_loss_mlp": 0.19845045, + "epoch": 0.8358935818427777, + "flos": 19244169740160.0, + "grad_norm": 52.9527669075588, + "language_loss": 0.68714893, + "learning_rate": 2.758237835853379e-07, + "loss": 0.70183408, + "num_input_tokens_seen": 299925130, + "router_z_loss_clip": 2.08984375, + "router_z_loss_mlp": 0.22033691, + "step": 13903, + "time_per_iteration": 4.066463947296143 + }, + { + "auxiliary_loss_clip": 0.01272468, + "auxiliary_loss_mlp": 0.00214589, + "balance_loss_clip": 1.05705845, + "balance_loss_mlp": 0.19118802, + "epoch": 0.8359537050954456, + "flos": 24134053783680.0, + "grad_norm": 23.195158359126104, + "language_loss": 0.83453977, + "learning_rate": 2.7562645329168054e-07, + "loss": 0.84941041, + "num_input_tokens_seen": 299943845, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.23413086, + "step": 13904, + "time_per_iteration": 2.690842628479004 + }, + { + "auxiliary_loss_clip": 0.01256503, + "auxiliary_loss_mlp": 0.00220832, + "balance_loss_clip": 1.03979468, + "balance_loss_mlp": 0.19690718, + "epoch": 0.8360138283481137, + "flos": 16180415187840.0, + "grad_norm": 12.83640728736064, + "language_loss": 0.79010296, + "learning_rate": 2.7542918838644104e-07, + "loss": 0.80487633, + "num_input_tokens_seen": 299961620, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.23950195, + "step": 13905, + "time_per_iteration": 2.6769862174987793 + }, + { + "auxiliary_loss_clip": 0.01257222, + "auxiliary_loss_mlp": 0.00241276, + "balance_loss_clip": 1.042117, + "balance_loss_mlp": 0.21648088, + "epoch": 0.8360739516007816, + "flos": 22198899536640.0, + "grad_norm": 14.981439188764158, + "language_loss": 0.73058736, + "learning_rate": 2.752319888771e-07, + "loss": 0.74557227, + "num_input_tokens_seen": 299982170, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.24829102, + "step": 13906, + "time_per_iteration": 2.725395441055298 + }, + { + "auxiliary_loss_clip": 0.01242499, + "auxiliary_loss_mlp": 0.00236044, + "balance_loss_clip": 1.03205478, + "balance_loss_mlp": 0.21190438, + "epoch": 0.8361340748534496, + "flos": 20923137210240.0, + "grad_norm": 15.748675870859278, + "language_loss": 0.8014397, + "learning_rate": 2.7503485477113475e-07, + "loss": 0.81622517, + "num_input_tokens_seen": 300001330, + "router_z_loss_clip": 2.10644531, + "router_z_loss_mlp": 0.24121094, + "step": 13907, + "time_per_iteration": 2.672337293624878 + }, + { + "auxiliary_loss_clip": 0.01269193, + "auxiliary_loss_mlp": 0.00234724, + "balance_loss_clip": 1.04534733, + "balance_loss_mlp": 0.20769969, + "epoch": 0.8361941981061175, + "flos": 26173599932160.0, + "grad_norm": 513.2316372148265, + "language_loss": 0.83139789, + "learning_rate": 2.7483778607602005e-07, + "loss": 0.8464371, + "num_input_tokens_seen": 300020645, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.27026367, + "step": 13908, + "time_per_iteration": 4.256725788116455 + }, + { + "auxiliary_loss_clip": 0.01276215, + "auxiliary_loss_mlp": 0.00225057, + "balance_loss_clip": 1.05707824, + "balance_loss_mlp": 0.19988018, + "epoch": 0.8362543213587855, + "flos": 24419363512320.0, + "grad_norm": 16.24378684280321, + "language_loss": 0.80263126, + "learning_rate": 2.7464078279922964e-07, + "loss": 0.817644, + "num_input_tokens_seen": 300039945, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.25170898, + "step": 13909, + "time_per_iteration": 2.7075142860412598 + }, + { + "auxiliary_loss_clip": 0.01270751, + "auxiliary_loss_mlp": 0.0023731, + "balance_loss_clip": 1.04642332, + "balance_loss_mlp": 0.21085739, + "epoch": 0.8363144446114534, + "flos": 17202396948480.0, + "grad_norm": 5.58731252211104, + "language_loss": 0.82637393, + "learning_rate": 2.744438449482338e-07, + "loss": 0.84145457, + "num_input_tokens_seen": 300058260, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.2644043, + "step": 13910, + "time_per_iteration": 2.666461944580078 + }, + { + "auxiliary_loss_clip": 0.01252514, + "auxiliary_loss_mlp": 0.0022947, + "balance_loss_clip": 1.0412991, + "balance_loss_mlp": 0.20494872, + "epoch": 0.8363745678641215, + "flos": 19279398003840.0, + "grad_norm": 8.918759493762801, + "language_loss": 0.78302813, + "learning_rate": 2.742469725305001e-07, + "loss": 0.79784799, + "num_input_tokens_seen": 300076720, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.24511719, + "step": 13911, + "time_per_iteration": 2.688788890838623 + }, + { + "auxiliary_loss_clip": 0.01268634, + "auxiliary_loss_mlp": 0.00216656, + "balance_loss_clip": 1.04951394, + "balance_loss_mlp": 0.1933153, + "epoch": 0.8364346911167894, + "flos": 11874869596800.0, + "grad_norm": 7.762831398043462, + "language_loss": 0.87745452, + "learning_rate": 2.740501655534946e-07, + "loss": 0.89230746, + "num_input_tokens_seen": 300092950, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.23339844, + "step": 13912, + "time_per_iteration": 4.064759016036987 + }, + { + "auxiliary_loss_clip": 0.01264924, + "auxiliary_loss_mlp": 0.00225741, + "balance_loss_clip": 1.04764915, + "balance_loss_mlp": 0.2027452, + "epoch": 0.8364948143694574, + "flos": 20225212974720.0, + "grad_norm": 25.522307630256012, + "language_loss": 0.85650092, + "learning_rate": 2.738534240246797e-07, + "loss": 0.87140757, + "num_input_tokens_seen": 300110950, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.23010254, + "step": 13913, + "time_per_iteration": 2.689507246017456 + }, + { + "auxiliary_loss_clip": 0.01286693, + "auxiliary_loss_mlp": 0.00245873, + "balance_loss_clip": 1.05971551, + "balance_loss_mlp": 0.21862128, + "epoch": 0.8365549376221254, + "flos": 21612909058560.0, + "grad_norm": 29.225574226905263, + "language_loss": 0.82212669, + "learning_rate": 2.736567479515153e-07, + "loss": 0.83745235, + "num_input_tokens_seen": 300128705, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.27233887, + "step": 13914, + "time_per_iteration": 2.661478281021118 + }, + { + "auxiliary_loss_clip": 0.01261056, + "auxiliary_loss_mlp": 0.00202886, + "balance_loss_clip": 1.04191732, + "balance_loss_mlp": 0.17828095, + "epoch": 0.8366150608747933, + "flos": 23294210912640.0, + "grad_norm": 15.104196534922421, + "language_loss": 0.79016417, + "learning_rate": 2.7346013734146025e-07, + "loss": 0.80480361, + "num_input_tokens_seen": 300148635, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.24633789, + "step": 13915, + "time_per_iteration": 2.6597394943237305 + }, + { + "auxiliary_loss_clip": 0.0125355, + "auxiliary_loss_mlp": 0.00228887, + "balance_loss_clip": 1.03569674, + "balance_loss_mlp": 0.20353135, + "epoch": 0.8366751841274613, + "flos": 15267673664640.0, + "grad_norm": 632.6852116784831, + "language_loss": 0.81227958, + "learning_rate": 2.7326359220197035e-07, + "loss": 0.82710397, + "num_input_tokens_seen": 300165490, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.25341797, + "step": 13916, + "time_per_iteration": 2.659848690032959 + }, + { + "auxiliary_loss_clip": 0.01266162, + "auxiliary_loss_mlp": 0.00213863, + "balance_loss_clip": 1.04410255, + "balance_loss_mlp": 0.18938935, + "epoch": 0.8367353073801292, + "flos": 13224931205760.0, + "grad_norm": 3.04085323022098, + "language_loss": 0.85492694, + "learning_rate": 2.7306711254049755e-07, + "loss": 0.86972719, + "num_input_tokens_seen": 300182130, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.24487305, + "step": 13917, + "time_per_iteration": 2.6027884483337402 + }, + { + "auxiliary_loss_clip": 0.01252289, + "auxiliary_loss_mlp": 0.00215195, + "balance_loss_clip": 1.03924417, + "balance_loss_mlp": 0.19180582, + "epoch": 0.8367954306327973, + "flos": 24205084928640.0, + "grad_norm": 6.1174381507574225, + "language_loss": 0.85702944, + "learning_rate": 2.728706983644933e-07, + "loss": 0.87170434, + "num_input_tokens_seen": 300203050, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.23388672, + "step": 13918, + "time_per_iteration": 2.661808967590332 + }, + { + "auxiliary_loss_clip": 0.01280333, + "auxiliary_loss_mlp": 0.00215187, + "balance_loss_clip": 1.06167126, + "balance_loss_mlp": 0.18981965, + "epoch": 0.8368555538854652, + "flos": 24534744975360.0, + "grad_norm": 2.2152554588916584, + "language_loss": 0.7656858, + "learning_rate": 2.7267434968140457e-07, + "loss": 0.78064096, + "num_input_tokens_seen": 300224380, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.25366211, + "step": 13919, + "time_per_iteration": 2.6720550060272217 + }, + { + "auxiliary_loss_clip": 0.01245836, + "auxiliary_loss_mlp": 0.0019923, + "balance_loss_clip": 1.03207779, + "balance_loss_mlp": 0.17665219, + "epoch": 0.8369156771381332, + "flos": 20259363830400.0, + "grad_norm": 1841.8518532644093, + "language_loss": 0.82361603, + "learning_rate": 2.7247806649867835e-07, + "loss": 0.8380667, + "num_input_tokens_seen": 300242915, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.22595215, + "step": 13920, + "time_per_iteration": 2.6577064990997314 + }, + { + "auxiliary_loss_clip": 0.01262326, + "auxiliary_loss_mlp": 0.00229384, + "balance_loss_clip": 1.04381847, + "balance_loss_mlp": 0.20338476, + "epoch": 0.8369758003908011, + "flos": 21835555511040.0, + "grad_norm": 5.225476596093305, + "language_loss": 0.78463978, + "learning_rate": 2.722818488237566e-07, + "loss": 0.79955685, + "num_input_tokens_seen": 300261905, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.26013184, + "step": 13921, + "time_per_iteration": 2.6572532653808594 + }, + { + "auxiliary_loss_clip": 0.01272105, + "auxiliary_loss_mlp": 0.00227712, + "balance_loss_clip": 1.04838717, + "balance_loss_mlp": 0.20290503, + "epoch": 0.8370359236434691, + "flos": 21719312121600.0, + "grad_norm": 10.886515060421619, + "language_loss": 0.94580519, + "learning_rate": 2.720856966640801e-07, + "loss": 0.96080339, + "num_input_tokens_seen": 300281145, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.24804688, + "step": 13922, + "time_per_iteration": 2.6863386631011963 + }, + { + "auxiliary_loss_clip": 0.01251117, + "auxiliary_loss_mlp": 0.00236419, + "balance_loss_clip": 1.0387814, + "balance_loss_mlp": 0.21262479, + "epoch": 0.837096046896137, + "flos": 23148880485120.0, + "grad_norm": 6.744313947971173, + "language_loss": 0.79015112, + "learning_rate": 2.71889610027088e-07, + "loss": 0.80502641, + "num_input_tokens_seen": 300301610, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.23803711, + "step": 13923, + "time_per_iteration": 2.6720070838928223 + }, + { + "auxiliary_loss_clip": 0.01261759, + "auxiliary_loss_mlp": 0.00218053, + "balance_loss_clip": 1.04367173, + "balance_loss_mlp": 0.19326895, + "epoch": 0.8371561701488051, + "flos": 24492872695680.0, + "grad_norm": 2.2910633016194133, + "language_loss": 0.82147062, + "learning_rate": 2.7169358892021433e-07, + "loss": 0.83626878, + "num_input_tokens_seen": 300319420, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.2479248, + "step": 13924, + "time_per_iteration": 2.712542772293091 + }, + { + "auxiliary_loss_clip": 0.01257452, + "auxiliary_loss_mlp": 0.00223915, + "balance_loss_clip": 1.0416559, + "balance_loss_mlp": 0.1975583, + "epoch": 0.837216293401473, + "flos": 29206723161600.0, + "grad_norm": 80.36577754254857, + "language_loss": 0.72673297, + "learning_rate": 2.7149763335089293e-07, + "loss": 0.74154663, + "num_input_tokens_seen": 300341325, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.26330566, + "step": 13925, + "time_per_iteration": 2.7430009841918945 + }, + { + "auxiliary_loss_clip": 0.01277761, + "auxiliary_loss_mlp": 0.0023465, + "balance_loss_clip": 1.05522656, + "balance_loss_mlp": 0.20875771, + "epoch": 0.837276416654141, + "flos": 25265275781760.0, + "grad_norm": 17.529108224919884, + "language_loss": 0.82064283, + "learning_rate": 2.713017433265543e-07, + "loss": 0.83576691, + "num_input_tokens_seen": 300361620, + "router_z_loss_clip": 2.22753906, + "router_z_loss_mlp": 0.25891113, + "step": 13926, + "time_per_iteration": 2.741399049758911 + }, + { + "auxiliary_loss_clip": 0.01267672, + "auxiliary_loss_mlp": 0.00225136, + "balance_loss_clip": 1.0466584, + "balance_loss_mlp": 0.19906479, + "epoch": 0.837336539906809, + "flos": 13882024656000.0, + "grad_norm": 23.50575167140102, + "language_loss": 0.80315131, + "learning_rate": 2.711059188546274e-07, + "loss": 0.81807941, + "num_input_tokens_seen": 300378675, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.26049805, + "step": 13927, + "time_per_iteration": 2.6098484992980957 + }, + { + "auxiliary_loss_clip": 0.0117027, + "auxiliary_loss_mlp": 0.00106876, + "balance_loss_clip": 1.02789187, + "balance_loss_mlp": 0.09900796, + "epoch": 0.8373966631594769, + "flos": 68870599044480.0, + "grad_norm": 4.755317234961843, + "language_loss": 0.57414454, + "learning_rate": 2.7091015994253695e-07, + "loss": 0.58691597, + "num_input_tokens_seen": 300449740, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.07861328, + "step": 13928, + "time_per_iteration": 3.3357577323913574 + }, + { + "auxiliary_loss_clip": 0.01276222, + "auxiliary_loss_mlp": 0.00238733, + "balance_loss_clip": 1.05475283, + "balance_loss_mlp": 0.21288809, + "epoch": 0.8374567864121449, + "flos": 20448972748800.0, + "grad_norm": 31.70877768193827, + "language_loss": 0.7686702, + "learning_rate": 2.707144665977068e-07, + "loss": 0.78381974, + "num_input_tokens_seen": 300470000, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.25854492, + "step": 13929, + "time_per_iteration": 2.6872711181640625 + }, + { + "auxiliary_loss_clip": 0.01296682, + "auxiliary_loss_mlp": 0.00236851, + "balance_loss_clip": 1.07096171, + "balance_loss_mlp": 0.20957586, + "epoch": 0.8375169096648128, + "flos": 41904197101440.0, + "grad_norm": 25.659412138874394, + "language_loss": 0.7586025, + "learning_rate": 2.705188388275574e-07, + "loss": 0.77393782, + "num_input_tokens_seen": 300494975, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.27282715, + "step": 13930, + "time_per_iteration": 2.841099500656128 + }, + { + "auxiliary_loss_clip": 0.01275952, + "auxiliary_loss_mlp": 0.00226285, + "balance_loss_clip": 1.05656004, + "balance_loss_mlp": 0.20258623, + "epoch": 0.8375770329174809, + "flos": 20009354192640.0, + "grad_norm": 99.13957253613962, + "language_loss": 0.79536068, + "learning_rate": 2.703232766395067e-07, + "loss": 0.81038308, + "num_input_tokens_seen": 300513175, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.23706055, + "step": 13931, + "time_per_iteration": 2.7806365489959717 + }, + { + "auxiliary_loss_clip": 0.01274415, + "auxiliary_loss_mlp": 0.00238313, + "balance_loss_clip": 1.05061293, + "balance_loss_mlp": 0.21268326, + "epoch": 0.8376371561701488, + "flos": 22783597125120.0, + "grad_norm": 2.7933431953655234, + "language_loss": 0.77998757, + "learning_rate": 2.701277800409705e-07, + "loss": 0.79511482, + "num_input_tokens_seen": 300533770, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.25622559, + "step": 13932, + "time_per_iteration": 2.644869327545166 + }, + { + "auxiliary_loss_clip": 0.01266481, + "auxiliary_loss_mlp": 0.00217034, + "balance_loss_clip": 1.05003142, + "balance_loss_mlp": 0.19428851, + "epoch": 0.8376972794228168, + "flos": 23914459987200.0, + "grad_norm": 144.291333514061, + "language_loss": 0.75322503, + "learning_rate": 2.699323490393628e-07, + "loss": 0.76806015, + "num_input_tokens_seen": 300552995, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.22753906, + "step": 13933, + "time_per_iteration": 2.729759931564331 + }, + { + "auxiliary_loss_clip": 0.01253531, + "auxiliary_loss_mlp": 0.00224049, + "balance_loss_clip": 1.04054713, + "balance_loss_mlp": 0.20166129, + "epoch": 0.8377574026754847, + "flos": 13734718980480.0, + "grad_norm": 39.78640973605216, + "language_loss": 0.84810811, + "learning_rate": 2.697369836420933e-07, + "loss": 0.86288393, + "num_input_tokens_seen": 300570275, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.22387695, + "step": 13934, + "time_per_iteration": 2.5917294025421143 + }, + { + "auxiliary_loss_clip": 0.01260149, + "auxiliary_loss_mlp": 0.00240307, + "balance_loss_clip": 1.04596412, + "balance_loss_mlp": 0.21527344, + "epoch": 0.8378175259281527, + "flos": 21651333632640.0, + "grad_norm": 7.748708248045925, + "language_loss": 0.83090466, + "learning_rate": 2.6954168385657115e-07, + "loss": 0.84590924, + "num_input_tokens_seen": 300590875, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.25024414, + "step": 13935, + "time_per_iteration": 2.679527997970581 + }, + { + "auxiliary_loss_clip": 0.01269643, + "auxiliary_loss_mlp": 0.00235445, + "balance_loss_clip": 1.04846334, + "balance_loss_mlp": 0.21044701, + "epoch": 0.8378776491808206, + "flos": 15448806973440.0, + "grad_norm": 105.05106889364808, + "language_loss": 0.63662493, + "learning_rate": 2.6934644969020135e-07, + "loss": 0.65167582, + "num_input_tokens_seen": 300607490, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.24987793, + "step": 13936, + "time_per_iteration": 2.599281072616577 + }, + { + "auxiliary_loss_clip": 0.01267872, + "auxiliary_loss_mlp": 0.00229066, + "balance_loss_clip": 1.05098498, + "balance_loss_mlp": 0.20305455, + "epoch": 0.8379377724334887, + "flos": 14720395069440.0, + "grad_norm": 71.15572203225985, + "language_loss": 0.97416025, + "learning_rate": 2.691512811503882e-07, + "loss": 0.98912966, + "num_input_tokens_seen": 300623635, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.26000977, + "step": 13937, + "time_per_iteration": 2.699110984802246 + }, + { + "auxiliary_loss_clip": 0.01277433, + "auxiliary_loss_mlp": 0.00224315, + "balance_loss_clip": 1.05849957, + "balance_loss_mlp": 0.1996147, + "epoch": 0.8379978956861566, + "flos": 24535247765760.0, + "grad_norm": 14.004522642010809, + "language_loss": 0.88583255, + "learning_rate": 2.689561782445313e-07, + "loss": 0.90085006, + "num_input_tokens_seen": 300643835, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.24707031, + "step": 13938, + "time_per_iteration": 2.6620261669158936 + }, + { + "auxiliary_loss_clip": 0.01290076, + "auxiliary_loss_mlp": 0.00227993, + "balance_loss_clip": 1.06465292, + "balance_loss_mlp": 0.20106338, + "epoch": 0.8380580189388246, + "flos": 18952611045120.0, + "grad_norm": 250.0823117499765, + "language_loss": 0.80014718, + "learning_rate": 2.6876114098002965e-07, + "loss": 0.81532788, + "num_input_tokens_seen": 300662500, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.26953125, + "step": 13939, + "time_per_iteration": 2.6878933906555176 + }, + { + "auxiliary_loss_clip": 0.01282742, + "auxiliary_loss_mlp": 0.00250464, + "balance_loss_clip": 1.059376, + "balance_loss_mlp": 0.22397591, + "epoch": 0.8381181421914926, + "flos": 26540283922560.0, + "grad_norm": 10.17151221752381, + "language_loss": 0.85913253, + "learning_rate": 2.6856616936428e-07, + "loss": 0.87446457, + "num_input_tokens_seen": 300681480, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.26477051, + "step": 13940, + "time_per_iteration": 2.740241050720215 + }, + { + "auxiliary_loss_clip": 0.01280946, + "auxiliary_loss_mlp": 0.00226396, + "balance_loss_clip": 1.05963922, + "balance_loss_mlp": 0.20070647, + "epoch": 0.8381782654441605, + "flos": 23291481479040.0, + "grad_norm": 11.110725506108894, + "language_loss": 0.85168648, + "learning_rate": 2.6837126340467374e-07, + "loss": 0.8667599, + "num_input_tokens_seen": 300699165, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.25695801, + "step": 13941, + "time_per_iteration": 2.6624722480773926 + }, + { + "auxiliary_loss_clip": 0.01277193, + "auxiliary_loss_mlp": 0.00237691, + "balance_loss_clip": 1.05428839, + "balance_loss_mlp": 0.20949784, + "epoch": 0.8382383886968285, + "flos": 26758800311040.0, + "grad_norm": 77.09613246811952, + "language_loss": 0.80998206, + "learning_rate": 2.6817642310860276e-07, + "loss": 0.82513088, + "num_input_tokens_seen": 300714615, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.28210449, + "step": 13942, + "time_per_iteration": 2.696963310241699 + }, + { + "auxiliary_loss_clip": 0.01279089, + "auxiliary_loss_mlp": 0.00231718, + "balance_loss_clip": 1.05181265, + "balance_loss_mlp": 0.20391804, + "epoch": 0.8382985119494964, + "flos": 26104544035200.0, + "grad_norm": 3.250924616566831, + "language_loss": 0.88596189, + "learning_rate": 2.679816484834554e-07, + "loss": 0.90107, + "num_input_tokens_seen": 300734860, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.27819824, + "step": 13943, + "time_per_iteration": 2.6827361583709717 + }, + { + "auxiliary_loss_clip": 0.01278026, + "auxiliary_loss_mlp": 0.00247836, + "balance_loss_clip": 1.06103706, + "balance_loss_mlp": 0.223744, + "epoch": 0.8383586352021645, + "flos": 16435129507200.0, + "grad_norm": 20.53703365661822, + "language_loss": 0.9302007, + "learning_rate": 2.6778693953661766e-07, + "loss": 0.94545925, + "num_input_tokens_seen": 300752735, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.2409668, + "step": 13944, + "time_per_iteration": 4.021123647689819 + }, + { + "auxiliary_loss_clip": 0.0119787, + "auxiliary_loss_mlp": 0.00099874, + "balance_loss_clip": 1.0530982, + "balance_loss_mlp": 0.09172006, + "epoch": 0.8384187584548324, + "flos": 64195532288640.0, + "grad_norm": 0.611124694016949, + "language_loss": 0.49782342, + "learning_rate": 2.6759229627547263e-07, + "loss": 0.51080084, + "num_input_tokens_seen": 300820760, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.08154297, + "step": 13945, + "time_per_iteration": 4.74060583114624 + }, + { + "auxiliary_loss_clip": 0.01267386, + "auxiliary_loss_mlp": 0.00237416, + "balance_loss_clip": 1.04957461, + "balance_loss_mlp": 0.21201268, + "epoch": 0.8384788817075004, + "flos": 22382905933440.0, + "grad_norm": 5.358383429730548, + "language_loss": 0.72720081, + "learning_rate": 2.673977187074017e-07, + "loss": 0.74224883, + "num_input_tokens_seen": 300840025, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.25402832, + "step": 13946, + "time_per_iteration": 2.635892868041992 + }, + { + "auxiliary_loss_clip": 0.01275162, + "auxiliary_loss_mlp": 0.00239381, + "balance_loss_clip": 1.05226827, + "balance_loss_mlp": 0.21293995, + "epoch": 0.8385390049601683, + "flos": 29496845312640.0, + "grad_norm": 167.68311514220244, + "language_loss": 0.7498821, + "learning_rate": 2.672032068397829e-07, + "loss": 0.76502758, + "num_input_tokens_seen": 300860380, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.26452637, + "step": 13947, + "time_per_iteration": 2.8435580730438232 + }, + { + "auxiliary_loss_clip": 0.01267617, + "auxiliary_loss_mlp": 0.00222048, + "balance_loss_clip": 1.04637957, + "balance_loss_mlp": 0.19749051, + "epoch": 0.8385991282128363, + "flos": 32707797799680.0, + "grad_norm": 3.902823358744054, + "language_loss": 0.76863939, + "learning_rate": 2.6700876067999176e-07, + "loss": 0.78353596, + "num_input_tokens_seen": 300881895, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.24584961, + "step": 13948, + "time_per_iteration": 2.7680132389068604 + }, + { + "auxiliary_loss_clip": 0.01253004, + "auxiliary_loss_mlp": 0.00220852, + "balance_loss_clip": 1.04005146, + "balance_loss_mlp": 0.19742718, + "epoch": 0.8386592514655042, + "flos": 25441022050560.0, + "grad_norm": 11.961047084478684, + "language_loss": 0.9141674, + "learning_rate": 2.6681438023540194e-07, + "loss": 0.92890584, + "num_input_tokens_seen": 300901575, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.234375, + "step": 13949, + "time_per_iteration": 2.6776645183563232 + }, + { + "auxiliary_loss_clip": 0.01264518, + "auxiliary_loss_mlp": 0.00229168, + "balance_loss_clip": 1.04746461, + "balance_loss_mlp": 0.20382375, + "epoch": 0.8387193747181723, + "flos": 22015898720640.0, + "grad_norm": 8.503177703267914, + "language_loss": 0.77417529, + "learning_rate": 2.66620065513385e-07, + "loss": 0.78911209, + "num_input_tokens_seen": 300919735, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.25341797, + "step": 13950, + "time_per_iteration": 4.543719530105591 + }, + { + "auxiliary_loss_clip": 0.01255278, + "auxiliary_loss_mlp": 0.0021306, + "balance_loss_clip": 1.04271841, + "balance_loss_mlp": 0.18831176, + "epoch": 0.8387794979708402, + "flos": 18150223080960.0, + "grad_norm": 19.55744785010543, + "language_loss": 0.74438208, + "learning_rate": 2.6642581652130913e-07, + "loss": 0.75906545, + "num_input_tokens_seen": 300939150, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.24755859, + "step": 13951, + "time_per_iteration": 2.6871209144592285 + }, + { + "auxiliary_loss_clip": 0.0127688, + "auxiliary_loss_mlp": 0.00234391, + "balance_loss_clip": 1.05856633, + "balance_loss_mlp": 0.20948833, + "epoch": 0.8388396212235082, + "flos": 25411216740480.0, + "grad_norm": 29.690176796545643, + "language_loss": 0.77554786, + "learning_rate": 2.662316332665393e-07, + "loss": 0.79066062, + "num_input_tokens_seen": 300959730, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.24914551, + "step": 13952, + "time_per_iteration": 2.6965112686157227 + }, + { + "auxiliary_loss_clip": 0.01263246, + "auxiliary_loss_mlp": 0.0022951, + "balance_loss_clip": 1.05071235, + "balance_loss_mlp": 0.2064072, + "epoch": 0.8388997444761762, + "flos": 22273055164800.0, + "grad_norm": 11.512120387139356, + "language_loss": 0.801395, + "learning_rate": 2.6603751575643987e-07, + "loss": 0.81632257, + "num_input_tokens_seen": 300976120, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.23132324, + "step": 13953, + "time_per_iteration": 2.639967679977417 + }, + { + "auxiliary_loss_clip": 0.0127762, + "auxiliary_loss_mlp": 0.00242024, + "balance_loss_clip": 1.05923414, + "balance_loss_mlp": 0.21622682, + "epoch": 0.8389598677288441, + "flos": 19573219255680.0, + "grad_norm": 8.46106643499843, + "language_loss": 0.7669431, + "learning_rate": 2.6584346399837176e-07, + "loss": 0.78213954, + "num_input_tokens_seen": 300995080, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.25793457, + "step": 13954, + "time_per_iteration": 4.101582050323486 + }, + { + "auxiliary_loss_clip": 0.01269302, + "auxiliary_loss_mlp": 0.00222005, + "balance_loss_clip": 1.05040956, + "balance_loss_mlp": 0.1984973, + "epoch": 0.8390199909815121, + "flos": 17384715406080.0, + "grad_norm": 7.769362956734998, + "language_loss": 0.80384803, + "learning_rate": 2.656494779996932e-07, + "loss": 0.81876111, + "num_input_tokens_seen": 301012920, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.23522949, + "step": 13955, + "time_per_iteration": 2.618305206298828 + }, + { + "auxiliary_loss_clip": 0.01269775, + "auxiliary_loss_mlp": 0.00220411, + "balance_loss_clip": 1.0531739, + "balance_loss_mlp": 0.19511506, + "epoch": 0.83908011423418, + "flos": 24639639667200.0, + "grad_norm": 4.3014605308103615, + "language_loss": 0.75612891, + "learning_rate": 2.6545555776775995e-07, + "loss": 0.77103078, + "num_input_tokens_seen": 301028875, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.25280762, + "step": 13956, + "time_per_iteration": 2.6459875106811523 + }, + { + "auxiliary_loss_clip": 0.01287014, + "auxiliary_loss_mlp": 0.00247352, + "balance_loss_clip": 1.05836701, + "balance_loss_mlp": 0.21954064, + "epoch": 0.8391402374868481, + "flos": 24718356322560.0, + "grad_norm": 2.437416335577208, + "language_loss": 0.88572514, + "learning_rate": 2.6526170330992667e-07, + "loss": 0.90106881, + "num_input_tokens_seen": 301050115, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.27832031, + "step": 13957, + "time_per_iteration": 2.677393674850464 + }, + { + "auxiliary_loss_clip": 0.01180447, + "auxiliary_loss_mlp": 0.00120704, + "balance_loss_clip": 1.03565145, + "balance_loss_mlp": 0.11393332, + "epoch": 0.839200360739516, + "flos": 56871695784960.0, + "grad_norm": 0.8740684704022128, + "language_loss": 0.52930307, + "learning_rate": 2.6506791463354283e-07, + "loss": 0.54231453, + "num_input_tokens_seen": 301114155, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.06787109, + "step": 13958, + "time_per_iteration": 3.2448062896728516 + }, + { + "auxiliary_loss_clip": 0.01262846, + "auxiliary_loss_mlp": 0.00223601, + "balance_loss_clip": 1.04599857, + "balance_loss_mlp": 0.19990182, + "epoch": 0.839260483992184, + "flos": 18332792933760.0, + "grad_norm": 67.6244181750782, + "language_loss": 0.8284837, + "learning_rate": 2.648741917459574e-07, + "loss": 0.84334815, + "num_input_tokens_seen": 301133150, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.23693848, + "step": 13959, + "time_per_iteration": 2.6084694862365723 + }, + { + "auxiliary_loss_clip": 0.01277079, + "auxiliary_loss_mlp": 0.00216378, + "balance_loss_clip": 1.06162381, + "balance_loss_mlp": 0.19323955, + "epoch": 0.8393206072448519, + "flos": 27087921653760.0, + "grad_norm": 2.247092613246207, + "language_loss": 0.64732629, + "learning_rate": 2.646805346545169e-07, + "loss": 0.66226089, + "num_input_tokens_seen": 301153600, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.23144531, + "step": 13960, + "time_per_iteration": 2.6975579261779785 + }, + { + "auxiliary_loss_clip": 0.01185462, + "auxiliary_loss_mlp": 0.00136759, + "balance_loss_clip": 1.0398773, + "balance_loss_mlp": 0.12836701, + "epoch": 0.8393807304975199, + "flos": 61521192057600.0, + "grad_norm": 0.7625886597988187, + "language_loss": 0.60222661, + "learning_rate": 2.6448694336656397e-07, + "loss": 0.61544883, + "num_input_tokens_seen": 301214335, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.08398438, + "step": 13961, + "time_per_iteration": 3.2253010272979736 + }, + { + "auxiliary_loss_clip": 0.01255966, + "auxiliary_loss_mlp": 0.00212916, + "balance_loss_clip": 1.04110706, + "balance_loss_mlp": 0.18760775, + "epoch": 0.8394408537501878, + "flos": 14894848448640.0, + "grad_norm": 5.782051190822292, + "language_loss": 0.76425219, + "learning_rate": 2.642934178894405e-07, + "loss": 0.77894098, + "num_input_tokens_seen": 301228960, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.25317383, + "step": 13962, + "time_per_iteration": 2.6460254192352295 + }, + { + "auxiliary_loss_clip": 0.01266736, + "auxiliary_loss_mlp": 0.00232463, + "balance_loss_clip": 1.05093193, + "balance_loss_mlp": 0.20879972, + "epoch": 0.8395009770028559, + "flos": 17412186332160.0, + "grad_norm": 71.36521566754654, + "language_loss": 0.84126723, + "learning_rate": 2.640999582304841e-07, + "loss": 0.85625923, + "num_input_tokens_seen": 301245875, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.23681641, + "step": 13963, + "time_per_iteration": 2.6018378734588623 + }, + { + "auxiliary_loss_clip": 0.01263098, + "auxiliary_loss_mlp": 0.0023291, + "balance_loss_clip": 1.04694939, + "balance_loss_mlp": 0.2081501, + "epoch": 0.8395611002555238, + "flos": 27924747782400.0, + "grad_norm": 81.07888821627522, + "language_loss": 0.82646412, + "learning_rate": 2.6390656439703173e-07, + "loss": 0.84142423, + "num_input_tokens_seen": 301265550, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.24743652, + "step": 13964, + "time_per_iteration": 2.7339107990264893 + }, + { + "auxiliary_loss_clip": 0.01296086, + "auxiliary_loss_mlp": 0.0024667, + "balance_loss_clip": 1.0644325, + "balance_loss_mlp": 0.21832225, + "epoch": 0.8396212235081918, + "flos": 11100922225920.0, + "grad_norm": 16.370444358047873, + "language_loss": 0.86728007, + "learning_rate": 2.637132363964161e-07, + "loss": 0.8827076, + "num_input_tokens_seen": 301282035, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.28356934, + "step": 13965, + "time_per_iteration": 2.6139590740203857 + }, + { + "auxiliary_loss_clip": 0.0126589, + "auxiliary_loss_mlp": 0.00210391, + "balance_loss_clip": 1.04746461, + "balance_loss_mlp": 0.18685859, + "epoch": 0.8396813467608598, + "flos": 35735641729920.0, + "grad_norm": 20.853476678434387, + "language_loss": 0.73400128, + "learning_rate": 2.635199742359684e-07, + "loss": 0.74876416, + "num_input_tokens_seen": 301305210, + "router_z_loss_clip": 2.18652344, + "router_z_loss_mlp": 0.23547363, + "step": 13966, + "time_per_iteration": 2.859145402908325 + }, + { + "auxiliary_loss_clip": 0.01267226, + "auxiliary_loss_mlp": 0.00214056, + "balance_loss_clip": 1.05027223, + "balance_loss_mlp": 0.18921243, + "epoch": 0.8397414700135277, + "flos": 26176724415360.0, + "grad_norm": 3.501532706318258, + "language_loss": 0.83554041, + "learning_rate": 2.633267779230177e-07, + "loss": 0.85035324, + "num_input_tokens_seen": 301324885, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.24841309, + "step": 13967, + "time_per_iteration": 2.7299818992614746 + }, + { + "auxiliary_loss_clip": 0.01270882, + "auxiliary_loss_mlp": 0.00215889, + "balance_loss_clip": 1.05152917, + "balance_loss_mlp": 0.19143942, + "epoch": 0.8398015932661957, + "flos": 18333116156160.0, + "grad_norm": 32.53385813177828, + "language_loss": 0.91122562, + "learning_rate": 2.6313364746488974e-07, + "loss": 0.92609334, + "num_input_tokens_seen": 301343070, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.24450684, + "step": 13968, + "time_per_iteration": 2.7362773418426514 + }, + { + "auxiliary_loss_clip": 0.01280679, + "auxiliary_loss_mlp": 0.0024018, + "balance_loss_clip": 1.05338097, + "balance_loss_mlp": 0.21404958, + "epoch": 0.8398617165188637, + "flos": 17379507934080.0, + "grad_norm": 55.253414839769, + "language_loss": 0.85783273, + "learning_rate": 2.629405828689075e-07, + "loss": 0.87304127, + "num_input_tokens_seen": 301359280, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.26159668, + "step": 13969, + "time_per_iteration": 2.6690313816070557 + }, + { + "auxiliary_loss_clip": 0.01279974, + "auxiliary_loss_mlp": 0.00240589, + "balance_loss_clip": 1.05761075, + "balance_loss_mlp": 0.21414848, + "epoch": 0.8399218397715317, + "flos": 22929681738240.0, + "grad_norm": 69.13581942101045, + "language_loss": 0.88020313, + "learning_rate": 2.627475841423923e-07, + "loss": 0.89540875, + "num_input_tokens_seen": 301376465, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.26452637, + "step": 13970, + "time_per_iteration": 2.692054033279419 + }, + { + "auxiliary_loss_clip": 0.01278556, + "auxiliary_loss_mlp": 0.00230897, + "balance_loss_clip": 1.0579927, + "balance_loss_mlp": 0.20675747, + "epoch": 0.8399819630241996, + "flos": 23149562843520.0, + "grad_norm": 65.91319592646211, + "language_loss": 0.80723947, + "learning_rate": 2.625546512926633e-07, + "loss": 0.82233405, + "num_input_tokens_seen": 301396000, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.24145508, + "step": 13971, + "time_per_iteration": 2.7439663410186768 + }, + { + "auxiliary_loss_clip": 0.01272405, + "auxiliary_loss_mlp": 0.00210293, + "balance_loss_clip": 1.05148578, + "balance_loss_mlp": 0.18568808, + "epoch": 0.8400420862768676, + "flos": 16397423205120.0, + "grad_norm": 19.828319703721718, + "language_loss": 0.8457849, + "learning_rate": 2.623617843270358e-07, + "loss": 0.8606118, + "num_input_tokens_seen": 301413160, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.24609375, + "step": 13972, + "time_per_iteration": 2.6380109786987305 + }, + { + "auxiliary_loss_clip": 0.01257994, + "auxiliary_loss_mlp": 0.00211719, + "balance_loss_clip": 1.04328823, + "balance_loss_mlp": 0.18840155, + "epoch": 0.8401022095295355, + "flos": 21287486816640.0, + "grad_norm": 25.12813324193505, + "language_loss": 0.7385999, + "learning_rate": 2.6216898325282333e-07, + "loss": 0.75329709, + "num_input_tokens_seen": 301433325, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.2331543, + "step": 13973, + "time_per_iteration": 2.7582948207855225 + }, + { + "auxiliary_loss_clip": 0.0126188, + "auxiliary_loss_mlp": 0.00215751, + "balance_loss_clip": 1.04536331, + "balance_loss_mlp": 0.19181341, + "epoch": 0.8401623327822035, + "flos": 17311313963520.0, + "grad_norm": 24.650643762721263, + "language_loss": 0.86998272, + "learning_rate": 2.619762480773382e-07, + "loss": 0.88475895, + "num_input_tokens_seen": 301450265, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.23913574, + "step": 13974, + "time_per_iteration": 2.632596254348755 + }, + { + "auxiliary_loss_clip": 0.01265336, + "auxiliary_loss_mlp": 0.00218124, + "balance_loss_clip": 1.04441333, + "balance_loss_mlp": 0.19208905, + "epoch": 0.8402224560348714, + "flos": 22236677665920.0, + "grad_norm": 1527.4370311244913, + "language_loss": 0.78664309, + "learning_rate": 2.617835788078868e-07, + "loss": 0.80147767, + "num_input_tokens_seen": 301470760, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.26037598, + "step": 13975, + "time_per_iteration": 2.703493118286133 + }, + { + "auxiliary_loss_clip": 0.01282998, + "auxiliary_loss_mlp": 0.00221357, + "balance_loss_clip": 1.05842292, + "balance_loss_mlp": 0.1963827, + "epoch": 0.8402825792875395, + "flos": 20229953569920.0, + "grad_norm": 1.8136460375866832, + "language_loss": 0.79276448, + "learning_rate": 2.6159097545177645e-07, + "loss": 0.80780804, + "num_input_tokens_seen": 301489425, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.24975586, + "step": 13976, + "time_per_iteration": 2.720053195953369 + }, + { + "auxiliary_loss_clip": 0.01268408, + "auxiliary_loss_mlp": 0.00215979, + "balance_loss_clip": 1.05076194, + "balance_loss_mlp": 0.19087306, + "epoch": 0.8403427025402074, + "flos": 23289973107840.0, + "grad_norm": 12.164063095766762, + "language_loss": 0.80680668, + "learning_rate": 2.61398438016311e-07, + "loss": 0.82165056, + "num_input_tokens_seen": 301508885, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.25097656, + "step": 13977, + "time_per_iteration": 2.874589443206787 + }, + { + "auxiliary_loss_clip": 0.01258913, + "auxiliary_loss_mlp": 0.00242669, + "balance_loss_clip": 1.0417552, + "balance_loss_mlp": 0.21571594, + "epoch": 0.8404028257928754, + "flos": 32675586278400.0, + "grad_norm": 4.4853341926233306, + "language_loss": 0.75114465, + "learning_rate": 2.6120596650879043e-07, + "loss": 0.76616049, + "num_input_tokens_seen": 301533780, + "router_z_loss_clip": 2.17089844, + "router_z_loss_mlp": 0.26953125, + "step": 13978, + "time_per_iteration": 2.8083550930023193 + }, + { + "auxiliary_loss_clip": 0.01250602, + "auxiliary_loss_mlp": 0.00209808, + "balance_loss_clip": 1.03376949, + "balance_loss_mlp": 0.18292651, + "epoch": 0.8404629490455434, + "flos": 16180522928640.0, + "grad_norm": 7.4489054513968735, + "language_loss": 0.84363657, + "learning_rate": 2.610135609365145e-07, + "loss": 0.85824072, + "num_input_tokens_seen": 301551775, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.26879883, + "step": 13979, + "time_per_iteration": 2.663464307785034 + }, + { + "auxiliary_loss_clip": 0.01274324, + "auxiliary_loss_mlp": 0.00226796, + "balance_loss_clip": 1.0542618, + "balance_loss_mlp": 0.20077229, + "epoch": 0.8405230722982113, + "flos": 15194451790080.0, + "grad_norm": 4.101976717903875, + "language_loss": 0.88438129, + "learning_rate": 2.60821221306778e-07, + "loss": 0.89939249, + "num_input_tokens_seen": 301570495, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.26000977, + "step": 13980, + "time_per_iteration": 2.6902098655700684 + }, + { + "auxiliary_loss_clip": 0.0127382, + "auxiliary_loss_mlp": 0.00227364, + "balance_loss_clip": 1.05822134, + "balance_loss_mlp": 0.20385621, + "epoch": 0.8405831955508793, + "flos": 27812418975360.0, + "grad_norm": 1501.9146508019476, + "language_loss": 0.92055917, + "learning_rate": 2.606289476268757e-07, + "loss": 0.93557107, + "num_input_tokens_seen": 301591705, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.23522949, + "step": 13981, + "time_per_iteration": 2.7011451721191406 + }, + { + "auxiliary_loss_clip": 0.01278871, + "auxiliary_loss_mlp": 0.0024547, + "balance_loss_clip": 1.05939662, + "balance_loss_mlp": 0.22144887, + "epoch": 0.8406433188035473, + "flos": 23769452782080.0, + "grad_norm": 21943.214773252053, + "language_loss": 0.75856405, + "learning_rate": 2.6043673990409745e-07, + "loss": 0.77380747, + "num_input_tokens_seen": 301611670, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.24023438, + "step": 13982, + "time_per_iteration": 2.72743558883667 + }, + { + "auxiliary_loss_clip": 0.01276725, + "auxiliary_loss_mlp": 0.0024094, + "balance_loss_clip": 1.05604506, + "balance_loss_mlp": 0.21502344, + "epoch": 0.8407034420562153, + "flos": 29205681667200.0, + "grad_norm": 14.056379069046912, + "language_loss": 0.76159024, + "learning_rate": 2.602445981457324e-07, + "loss": 0.7767669, + "num_input_tokens_seen": 301632540, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.25939941, + "step": 13983, + "time_per_iteration": 2.71828293800354 + }, + { + "auxiliary_loss_clip": 0.01269153, + "auxiliary_loss_mlp": 0.00223229, + "balance_loss_clip": 1.04633582, + "balance_loss_mlp": 0.19680071, + "epoch": 0.8407635653088832, + "flos": 26360084367360.0, + "grad_norm": 152.6457426716722, + "language_loss": 0.87017286, + "learning_rate": 2.6005252235906684e-07, + "loss": 0.88509667, + "num_input_tokens_seen": 301651480, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.26416016, + "step": 13984, + "time_per_iteration": 2.7417914867401123 + }, + { + "auxiliary_loss_clip": 0.01264244, + "auxiliary_loss_mlp": 0.00212303, + "balance_loss_clip": 1.04897952, + "balance_loss_mlp": 0.18860447, + "epoch": 0.8408236885615512, + "flos": 21468799693440.0, + "grad_norm": 6.592828898921302, + "language_loss": 0.68671715, + "learning_rate": 2.598605125513842e-07, + "loss": 0.70148259, + "num_input_tokens_seen": 301670010, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.23718262, + "step": 13985, + "time_per_iteration": 2.724541664123535 + }, + { + "auxiliary_loss_clip": 0.01258745, + "auxiliary_loss_mlp": 0.00232057, + "balance_loss_clip": 1.04057014, + "balance_loss_mlp": 0.20701112, + "epoch": 0.8408838118142191, + "flos": 22963724853120.0, + "grad_norm": 14.571582443690344, + "language_loss": 0.88594091, + "learning_rate": 2.5966856872996467e-07, + "loss": 0.90084898, + "num_input_tokens_seen": 301689785, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.25061035, + "step": 13986, + "time_per_iteration": 4.095928192138672 + }, + { + "auxiliary_loss_clip": 0.01251404, + "auxiliary_loss_mlp": 0.00218152, + "balance_loss_clip": 1.03704834, + "balance_loss_mlp": 0.19340453, + "epoch": 0.8409439350668871, + "flos": 26800026145920.0, + "grad_norm": 18.759014926764046, + "language_loss": 0.74330288, + "learning_rate": 2.5947669090208755e-07, + "loss": 0.75799847, + "num_input_tokens_seen": 301712225, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.24768066, + "step": 13987, + "time_per_iteration": 2.7979485988616943 + }, + { + "auxiliary_loss_clip": 0.01260644, + "auxiliary_loss_mlp": 0.00224605, + "balance_loss_clip": 1.04833746, + "balance_loss_mlp": 0.20151402, + "epoch": 0.841004058319555, + "flos": 26578672583040.0, + "grad_norm": 8.080809504440852, + "language_loss": 0.74491924, + "learning_rate": 2.5928487907502906e-07, + "loss": 0.7597717, + "num_input_tokens_seen": 301730955, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.2310791, + "step": 13988, + "time_per_iteration": 4.183348178863525 + }, + { + "auxiliary_loss_clip": 0.01285437, + "auxiliary_loss_mlp": 0.00210245, + "balance_loss_clip": 1.05954361, + "balance_loss_mlp": 0.18292238, + "epoch": 0.8410641815722231, + "flos": 14501878680960.0, + "grad_norm": 32.878146827716776, + "language_loss": 0.89551914, + "learning_rate": 2.590931332560622e-07, + "loss": 0.91047597, + "num_input_tokens_seen": 301746930, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.2734375, + "step": 13989, + "time_per_iteration": 2.762453317642212 + }, + { + "auxiliary_loss_clip": 0.01263935, + "auxiliary_loss_mlp": 0.00226763, + "balance_loss_clip": 1.04351807, + "balance_loss_mlp": 0.2014551, + "epoch": 0.841124304824891, + "flos": 29166682475520.0, + "grad_norm": 34.10868208700546, + "language_loss": 0.82225299, + "learning_rate": 2.5890145345245826e-07, + "loss": 0.83715993, + "num_input_tokens_seen": 301766945, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.2532959, + "step": 13990, + "time_per_iteration": 2.8175392150878906 + }, + { + "auxiliary_loss_clip": 0.01261456, + "auxiliary_loss_mlp": 0.00225204, + "balance_loss_clip": 1.04837775, + "balance_loss_mlp": 0.19962144, + "epoch": 0.841184428077559, + "flos": 22412028885120.0, + "grad_norm": 13.224706549165322, + "language_loss": 0.86726564, + "learning_rate": 2.5870983967148597e-07, + "loss": 0.88213223, + "num_input_tokens_seen": 301785460, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.2557373, + "step": 13991, + "time_per_iteration": 2.738900661468506 + }, + { + "auxiliary_loss_clip": 0.01268327, + "auxiliary_loss_mlp": 0.0022452, + "balance_loss_clip": 1.04612124, + "balance_loss_mlp": 0.19952157, + "epoch": 0.841244551330227, + "flos": 22962791099520.0, + "grad_norm": 2.484890188185025, + "language_loss": 0.79861104, + "learning_rate": 2.585182919204105e-07, + "loss": 0.81353951, + "num_input_tokens_seen": 301804180, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.24987793, + "step": 13992, + "time_per_iteration": 5.041700839996338 + }, + { + "auxiliary_loss_clip": 0.01267776, + "auxiliary_loss_mlp": 0.0022857, + "balance_loss_clip": 1.05032814, + "balance_loss_mlp": 0.2033449, + "epoch": 0.8413046745828949, + "flos": 21032736583680.0, + "grad_norm": 42.25221563334634, + "language_loss": 0.8457284, + "learning_rate": 2.583268102064959e-07, + "loss": 0.86069191, + "num_input_tokens_seen": 301823670, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.25256348, + "step": 13993, + "time_per_iteration": 2.719926118850708 + }, + { + "auxiliary_loss_clip": 0.01291427, + "auxiliary_loss_mlp": 0.00256216, + "balance_loss_clip": 1.06071115, + "balance_loss_mlp": 0.22858283, + "epoch": 0.841364797835563, + "flos": 27052082858880.0, + "grad_norm": 6.302978340984394, + "language_loss": 0.81181192, + "learning_rate": 2.5813539453700393e-07, + "loss": 0.82728839, + "num_input_tokens_seen": 301845890, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.27636719, + "step": 13994, + "time_per_iteration": 2.7070443630218506 + }, + { + "auxiliary_loss_clip": 0.01258157, + "auxiliary_loss_mlp": 0.00218, + "balance_loss_clip": 1.04442573, + "balance_loss_mlp": 0.19365761, + "epoch": 0.8414249210882309, + "flos": 17895688329600.0, + "grad_norm": 71.22266275041795, + "language_loss": 0.6729182, + "learning_rate": 2.5794404491919163e-07, + "loss": 0.68767983, + "num_input_tokens_seen": 301863985, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.24365234, + "step": 13995, + "time_per_iteration": 2.670283079147339 + }, + { + "auxiliary_loss_clip": 0.01272776, + "auxiliary_loss_mlp": 0.00211611, + "balance_loss_clip": 1.04984009, + "balance_loss_mlp": 0.18580171, + "epoch": 0.8414850443408989, + "flos": 25441201618560.0, + "grad_norm": 37.216884591099664, + "language_loss": 0.78422993, + "learning_rate": 2.577527613603163e-07, + "loss": 0.79907382, + "num_input_tokens_seen": 301882765, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.25817871, + "step": 13996, + "time_per_iteration": 4.200998783111572 + }, + { + "auxiliary_loss_clip": 0.01246438, + "auxiliary_loss_mlp": 0.00209083, + "balance_loss_clip": 1.03330624, + "balance_loss_mlp": 0.18596821, + "epoch": 0.8415451675935668, + "flos": 23220055284480.0, + "grad_norm": 5.72831959240996, + "language_loss": 0.70877647, + "learning_rate": 2.5756154386763017e-07, + "loss": 0.72333169, + "num_input_tokens_seen": 301902720, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.2310791, + "step": 13997, + "time_per_iteration": 2.6707754135131836 + }, + { + "auxiliary_loss_clip": 0.01288686, + "auxiliary_loss_mlp": 0.00231592, + "balance_loss_clip": 1.05911422, + "balance_loss_mlp": 0.20554501, + "epoch": 0.8416052908462348, + "flos": 18546496899840.0, + "grad_norm": 25.318301090682894, + "language_loss": 0.90128314, + "learning_rate": 2.5737039244838565e-07, + "loss": 0.91648591, + "num_input_tokens_seen": 301921245, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.26037598, + "step": 13998, + "time_per_iteration": 2.733452558517456 + }, + { + "auxiliary_loss_clip": 0.01252315, + "auxiliary_loss_mlp": 0.0021404, + "balance_loss_clip": 1.03460789, + "balance_loss_mlp": 0.18758713, + "epoch": 0.8416654140989027, + "flos": 26105190480000.0, + "grad_norm": 23.55003162153108, + "language_loss": 0.88497126, + "learning_rate": 2.5717930710982984e-07, + "loss": 0.89963484, + "num_input_tokens_seen": 301942320, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.2644043, + "step": 13999, + "time_per_iteration": 2.7387781143188477 + }, + { + "auxiliary_loss_clip": 0.01271326, + "auxiliary_loss_mlp": 0.00222276, + "balance_loss_clip": 1.04920459, + "balance_loss_mlp": 0.1974805, + "epoch": 0.8417255373515707, + "flos": 26433270328320.0, + "grad_norm": 237.1274672528101, + "language_loss": 0.78272688, + "learning_rate": 2.569882878592096e-07, + "loss": 0.79766291, + "num_input_tokens_seen": 301963110, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.24816895, + "step": 14000, + "time_per_iteration": 2.764411211013794 + }, + { + "auxiliary_loss_clip": 0.01266572, + "auxiliary_loss_mlp": 0.00194331, + "balance_loss_clip": 1.0473485, + "balance_loss_mlp": 0.16883188, + "epoch": 0.8417856606042387, + "flos": 24717745791360.0, + "grad_norm": 63.75137308499209, + "language_loss": 0.86843073, + "learning_rate": 2.5679733470376885e-07, + "loss": 0.88303971, + "num_input_tokens_seen": 301984915, + "router_z_loss_clip": 2.19628906, + "router_z_loss_mlp": 0.25488281, + "step": 14001, + "time_per_iteration": 2.7190444469451904 + }, + { + "auxiliary_loss_clip": 0.0125592, + "auxiliary_loss_mlp": 0.0026046, + "balance_loss_clip": 1.03594685, + "balance_loss_mlp": 0.23362592, + "epoch": 0.8418457838569067, + "flos": 20850849089280.0, + "grad_norm": 10.680479492459558, + "language_loss": 0.87011051, + "learning_rate": 2.5660644765074703e-07, + "loss": 0.88527429, + "num_input_tokens_seen": 302004095, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.26831055, + "step": 14002, + "time_per_iteration": 2.724395275115967 + }, + { + "auxiliary_loss_clip": 0.01263291, + "auxiliary_loss_mlp": 0.00225845, + "balance_loss_clip": 1.04410505, + "balance_loss_mlp": 0.20039354, + "epoch": 0.8419059071095746, + "flos": 28660629715200.0, + "grad_norm": 35.072387256192116, + "language_loss": 0.84267151, + "learning_rate": 2.5641562670738334e-07, + "loss": 0.8575629, + "num_input_tokens_seen": 302027250, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.2545166, + "step": 14003, + "time_per_iteration": 2.794793128967285 + }, + { + "auxiliary_loss_clip": 0.01272869, + "auxiliary_loss_mlp": 0.00228872, + "balance_loss_clip": 1.05268145, + "balance_loss_mlp": 0.20235974, + "epoch": 0.8419660303622426, + "flos": 21653596189440.0, + "grad_norm": 115.46181395103311, + "language_loss": 0.73497665, + "learning_rate": 2.5622487188091436e-07, + "loss": 0.74999404, + "num_input_tokens_seen": 302046950, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.26501465, + "step": 14004, + "time_per_iteration": 2.7022056579589844 + }, + { + "auxiliary_loss_clip": 0.01272238, + "auxiliary_loss_mlp": 0.00230877, + "balance_loss_clip": 1.052984, + "balance_loss_mlp": 0.20381683, + "epoch": 0.8420261536149106, + "flos": 25301114576640.0, + "grad_norm": 32.40092808493116, + "language_loss": 0.84520221, + "learning_rate": 2.560341831785724e-07, + "loss": 0.86023343, + "num_input_tokens_seen": 302065470, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.27026367, + "step": 14005, + "time_per_iteration": 2.7497127056121826 + }, + { + "auxiliary_loss_clip": 0.01274589, + "auxiliary_loss_mlp": 0.00225895, + "balance_loss_clip": 1.0496254, + "balance_loss_mlp": 0.19933489, + "epoch": 0.8420862768675785, + "flos": 18763397176320.0, + "grad_norm": 7.164986285210702, + "language_loss": 0.86648333, + "learning_rate": 2.5584356060758906e-07, + "loss": 0.8814882, + "num_input_tokens_seen": 302083190, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.265625, + "step": 14006, + "time_per_iteration": 2.6687023639678955 + }, + { + "auxiliary_loss_clip": 0.01270553, + "auxiliary_loss_mlp": 0.00223435, + "balance_loss_clip": 1.0454762, + "balance_loss_mlp": 0.19813836, + "epoch": 0.8421464001202466, + "flos": 18328052338560.0, + "grad_norm": 3.9317029902522704, + "language_loss": 0.8465097, + "learning_rate": 2.556530041751932e-07, + "loss": 0.86144954, + "num_input_tokens_seen": 302098820, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.25256348, + "step": 14007, + "time_per_iteration": 2.7602672576904297 + }, + { + "auxiliary_loss_clip": 0.01265337, + "auxiliary_loss_mlp": 0.00210624, + "balance_loss_clip": 1.04715014, + "balance_loss_mlp": 0.18694906, + "epoch": 0.8422065233729145, + "flos": 31537181560320.0, + "grad_norm": 54.68118402480354, + "language_loss": 0.71705461, + "learning_rate": 2.554625138886102e-07, + "loss": 0.73181415, + "num_input_tokens_seen": 302117075, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.23681641, + "step": 14008, + "time_per_iteration": 2.765855550765991 + }, + { + "auxiliary_loss_clip": 0.0116503, + "auxiliary_loss_mlp": 0.00221126, + "balance_loss_clip": 1.02045262, + "balance_loss_mlp": 0.21244717, + "epoch": 0.8422666466255825, + "flos": 64298128510080.0, + "grad_norm": 0.6956672199384136, + "language_loss": 0.56064594, + "learning_rate": 2.552720897550631e-07, + "loss": 0.57450747, + "num_input_tokens_seen": 302179735, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.08691406, + "step": 14009, + "time_per_iteration": 3.2657644748687744 + }, + { + "auxiliary_loss_clip": 0.01255092, + "auxiliary_loss_mlp": 0.00228425, + "balance_loss_clip": 1.04145586, + "balance_loss_mlp": 0.20444019, + "epoch": 0.8423267698782504, + "flos": 24316731377280.0, + "grad_norm": 3.7245767969624217, + "language_loss": 0.83635223, + "learning_rate": 2.5508173178177304e-07, + "loss": 0.85118735, + "num_input_tokens_seen": 302202055, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.23974609, + "step": 14010, + "time_per_iteration": 2.847505569458008 + }, + { + "auxiliary_loss_clip": 0.012913, + "auxiliary_loss_mlp": 0.002161, + "balance_loss_clip": 1.06277239, + "balance_loss_mlp": 0.18799087, + "epoch": 0.8423868931309184, + "flos": 18296092212480.0, + "grad_norm": 6.768011939093078, + "language_loss": 0.81054318, + "learning_rate": 2.548914399759592e-07, + "loss": 0.82561719, + "num_input_tokens_seen": 302221360, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.28100586, + "step": 14011, + "time_per_iteration": 2.71439528465271 + }, + { + "auxiliary_loss_clip": 0.01280825, + "auxiliary_loss_mlp": 0.00240754, + "balance_loss_clip": 1.05578852, + "balance_loss_mlp": 0.21591097, + "epoch": 0.8424470163835863, + "flos": 23550218121600.0, + "grad_norm": 25.379258480558768, + "language_loss": 0.92963314, + "learning_rate": 2.5470121434483636e-07, + "loss": 0.9448489, + "num_input_tokens_seen": 302240715, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.24841309, + "step": 14012, + "time_per_iteration": 2.741568088531494 + }, + { + "auxiliary_loss_clip": 0.01247332, + "auxiliary_loss_mlp": 0.0021036, + "balance_loss_clip": 1.03948021, + "balance_loss_mlp": 0.18704297, + "epoch": 0.8425071396362543, + "flos": 23769488695680.0, + "grad_norm": 44.77893567290176, + "language_loss": 0.74933469, + "learning_rate": 2.5451105489561884e-07, + "loss": 0.7639116, + "num_input_tokens_seen": 302260950, + "router_z_loss_clip": 2.08203125, + "router_z_loss_mlp": 0.23339844, + "step": 14013, + "time_per_iteration": 2.783193349838257 + }, + { + "auxiliary_loss_clip": 0.0129298, + "auxiliary_loss_mlp": 0.00242759, + "balance_loss_clip": 1.05698299, + "balance_loss_mlp": 0.21574655, + "epoch": 0.8425672628889223, + "flos": 16178906816640.0, + "grad_norm": 129.15117056879146, + "language_loss": 0.88692003, + "learning_rate": 2.5432096163551644e-07, + "loss": 0.90227747, + "num_input_tokens_seen": 302277500, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.27050781, + "step": 14014, + "time_per_iteration": 2.7176311016082764 + }, + { + "auxiliary_loss_clip": 0.01283512, + "auxiliary_loss_mlp": 0.00211696, + "balance_loss_clip": 1.06075275, + "balance_loss_mlp": 0.18629217, + "epoch": 0.8426273861415903, + "flos": 23149131880320.0, + "grad_norm": 21.79858222166037, + "language_loss": 0.73534817, + "learning_rate": 2.5413093457173884e-07, + "loss": 0.75030029, + "num_input_tokens_seen": 302297930, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.25415039, + "step": 14015, + "time_per_iteration": 2.696579694747925 + }, + { + "auxiliary_loss_clip": 0.01278098, + "auxiliary_loss_mlp": 0.00194859, + "balance_loss_clip": 1.05542433, + "balance_loss_mlp": 0.17026588, + "epoch": 0.8426875093942582, + "flos": 17457757712640.0, + "grad_norm": 145.3584906614496, + "language_loss": 0.85114706, + "learning_rate": 2.5394097371149036e-07, + "loss": 0.86587662, + "num_input_tokens_seen": 302315735, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.24597168, + "step": 14016, + "time_per_iteration": 2.6907215118408203 + }, + { + "auxiliary_loss_clip": 0.01269797, + "auxiliary_loss_mlp": 0.00248541, + "balance_loss_clip": 1.04662323, + "balance_loss_mlp": 0.2210747, + "epoch": 0.8427476326469262, + "flos": 19640551299840.0, + "grad_norm": 6.079571490263519, + "language_loss": 0.86336768, + "learning_rate": 2.5375107906197544e-07, + "loss": 0.87855107, + "num_input_tokens_seen": 302332790, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.27490234, + "step": 14017, + "time_per_iteration": 2.659961700439453 + }, + { + "auxiliary_loss_clip": 0.01248289, + "auxiliary_loss_mlp": 0.00247034, + "balance_loss_clip": 1.03124976, + "balance_loss_mlp": 0.2221193, + "epoch": 0.8428077558995941, + "flos": 11941160146560.0, + "grad_norm": 15.22548712496516, + "language_loss": 0.70648134, + "learning_rate": 2.5356125063039525e-07, + "loss": 0.72143459, + "num_input_tokens_seen": 302346490, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.24914551, + "step": 14018, + "time_per_iteration": 2.6481215953826904 + }, + { + "auxiliary_loss_clip": 0.01253069, + "auxiliary_loss_mlp": 0.0023965, + "balance_loss_clip": 1.03636205, + "balance_loss_mlp": 0.21638064, + "epoch": 0.8428678791522621, + "flos": 10451729767680.0, + "grad_norm": 16.097535745195668, + "language_loss": 0.87098992, + "learning_rate": 2.5337148842394687e-07, + "loss": 0.88591707, + "num_input_tokens_seen": 302363235, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.23291016, + "step": 14019, + "time_per_iteration": 2.694059371948242 + }, + { + "auxiliary_loss_clip": 0.01261656, + "auxiliary_loss_mlp": 0.0020344, + "balance_loss_clip": 1.03983688, + "balance_loss_mlp": 0.17621225, + "epoch": 0.8429280024049302, + "flos": 28767248259840.0, + "grad_norm": 53.53796164333151, + "language_loss": 0.87307113, + "learning_rate": 2.531817924498265e-07, + "loss": 0.88772213, + "num_input_tokens_seen": 302383270, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.27233887, + "step": 14020, + "time_per_iteration": 2.73976469039917 + }, + { + "auxiliary_loss_clip": 0.01258832, + "auxiliary_loss_mlp": 0.00214249, + "balance_loss_clip": 1.04087949, + "balance_loss_mlp": 0.18971595, + "epoch": 0.8429881256575981, + "flos": 19537093152000.0, + "grad_norm": 32.55733787938635, + "language_loss": 0.80703974, + "learning_rate": 2.5299216271522805e-07, + "loss": 0.82177055, + "num_input_tokens_seen": 302401355, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.24560547, + "step": 14021, + "time_per_iteration": 2.725313186645508 + }, + { + "auxiliary_loss_clip": 0.01271076, + "auxiliary_loss_mlp": 0.00231743, + "balance_loss_clip": 1.05303645, + "balance_loss_mlp": 0.20617214, + "epoch": 0.8430482489102661, + "flos": 24790931752320.0, + "grad_norm": 5.8127512413606395, + "language_loss": 0.76984274, + "learning_rate": 2.5280259922734125e-07, + "loss": 0.78487092, + "num_input_tokens_seen": 302419515, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.2557373, + "step": 14022, + "time_per_iteration": 3.1866438388824463 + }, + { + "auxiliary_loss_clip": 0.01285278, + "auxiliary_loss_mlp": 0.00240323, + "balance_loss_clip": 1.05468154, + "balance_loss_mlp": 0.2121893, + "epoch": 0.843108372162934, + "flos": 21544248211200.0, + "grad_norm": 6.932843184152693, + "language_loss": 0.80988622, + "learning_rate": 2.526131019933553e-07, + "loss": 0.8251422, + "num_input_tokens_seen": 302438280, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.28125, + "step": 14023, + "time_per_iteration": 2.7496917247772217 + }, + { + "auxiliary_loss_clip": 0.01265786, + "auxiliary_loss_mlp": 0.00239138, + "balance_loss_clip": 1.04953289, + "balance_loss_mlp": 0.21398523, + "epoch": 0.843168495415602, + "flos": 24608792862720.0, + "grad_norm": 8.93944562162873, + "language_loss": 0.72238129, + "learning_rate": 2.524236710204559e-07, + "loss": 0.73743057, + "num_input_tokens_seen": 302460860, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.25170898, + "step": 14024, + "time_per_iteration": 2.7442147731781006 + }, + { + "auxiliary_loss_clip": 0.01270138, + "auxiliary_loss_mlp": 0.00222973, + "balance_loss_clip": 1.04770613, + "balance_loss_mlp": 0.1965677, + "epoch": 0.8432286186682699, + "flos": 15122738286720.0, + "grad_norm": 3.1497919157267638, + "language_loss": 0.8812058, + "learning_rate": 2.522343063158261e-07, + "loss": 0.89613682, + "num_input_tokens_seen": 302476980, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.2644043, + "step": 14025, + "time_per_iteration": 2.7435178756713867 + }, + { + "auxiliary_loss_clip": 0.0124802, + "auxiliary_loss_mlp": 0.00195565, + "balance_loss_clip": 1.03614008, + "balance_loss_mlp": 0.17292677, + "epoch": 0.843288741920938, + "flos": 20301882554880.0, + "grad_norm": 211.0821485460889, + "language_loss": 0.83575284, + "learning_rate": 2.5204500788664606e-07, + "loss": 0.85018873, + "num_input_tokens_seen": 302496380, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.22644043, + "step": 14026, + "time_per_iteration": 2.7158002853393555 + }, + { + "auxiliary_loss_clip": 0.01262236, + "auxiliary_loss_mlp": 0.00213162, + "balance_loss_clip": 1.04261017, + "balance_loss_mlp": 0.18929663, + "epoch": 0.8433488651736059, + "flos": 23332096782720.0, + "grad_norm": 5.493664511309784, + "language_loss": 0.88440001, + "learning_rate": 2.518557757400945e-07, + "loss": 0.89915395, + "num_input_tokens_seen": 302516845, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.23852539, + "step": 14027, + "time_per_iteration": 2.7522008419036865 + }, + { + "auxiliary_loss_clip": 0.01261958, + "auxiliary_loss_mlp": 0.00221306, + "balance_loss_clip": 1.04087305, + "balance_loss_mlp": 0.19583064, + "epoch": 0.8434089884262739, + "flos": 39458105844480.0, + "grad_norm": 6.248941638744556, + "language_loss": 0.65307248, + "learning_rate": 2.5166660988334754e-07, + "loss": 0.66790509, + "num_input_tokens_seen": 302538865, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.25476074, + "step": 14028, + "time_per_iteration": 4.248548984527588 + }, + { + "auxiliary_loss_clip": 0.0127096, + "auxiliary_loss_mlp": 0.00233874, + "balance_loss_clip": 1.05176139, + "balance_loss_mlp": 0.2080178, + "epoch": 0.8434691116789418, + "flos": 23768842250880.0, + "grad_norm": 15.446024729894063, + "language_loss": 0.72728854, + "learning_rate": 2.51477510323578e-07, + "loss": 0.74233687, + "num_input_tokens_seen": 302557970, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.25842285, + "step": 14029, + "time_per_iteration": 2.703129529953003 + }, + { + "auxiliary_loss_clip": 0.01263678, + "auxiliary_loss_mlp": 0.00236292, + "balance_loss_clip": 1.04966879, + "balance_loss_mlp": 0.21335599, + "epoch": 0.8435292349316098, + "flos": 22671411972480.0, + "grad_norm": 2.062747530672219, + "language_loss": 0.80962425, + "learning_rate": 2.51288477067956e-07, + "loss": 0.82462394, + "num_input_tokens_seen": 302578915, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.22949219, + "step": 14030, + "time_per_iteration": 4.138928413391113 + }, + { + "auxiliary_loss_clip": 0.01255644, + "auxiliary_loss_mlp": 0.00235156, + "balance_loss_clip": 1.04119682, + "balance_loss_mlp": 0.2116721, + "epoch": 0.8435893581842777, + "flos": 18843622202880.0, + "grad_norm": 5.4298796130119875, + "language_loss": 0.89683944, + "learning_rate": 2.510995101236502e-07, + "loss": 0.9117474, + "num_input_tokens_seen": 302596300, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.23498535, + "step": 14031, + "time_per_iteration": 2.7077152729034424 + }, + { + "auxiliary_loss_clip": 0.0127432, + "auxiliary_loss_mlp": 0.00227373, + "balance_loss_clip": 1.04908073, + "balance_loss_mlp": 0.20279205, + "epoch": 0.8436494814369457, + "flos": 20704225772160.0, + "grad_norm": 474.6650455922936, + "language_loss": 0.91162241, + "learning_rate": 2.509106094978266e-07, + "loss": 0.92663932, + "num_input_tokens_seen": 302614975, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.24597168, + "step": 14032, + "time_per_iteration": 2.74483060836792 + }, + { + "auxiliary_loss_clip": 0.01267845, + "auxiliary_loss_mlp": 0.0022677, + "balance_loss_clip": 1.04891682, + "balance_loss_mlp": 0.19999588, + "epoch": 0.8437096046896138, + "flos": 22674177319680.0, + "grad_norm": 3.0641845685395266, + "language_loss": 0.82002974, + "learning_rate": 2.507217751976478e-07, + "loss": 0.83497584, + "num_input_tokens_seen": 302636415, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.26757812, + "step": 14033, + "time_per_iteration": 2.7558698654174805 + }, + { + "auxiliary_loss_clip": 0.01251235, + "auxiliary_loss_mlp": 0.00229682, + "balance_loss_clip": 1.03886509, + "balance_loss_mlp": 0.20496954, + "epoch": 0.8437697279422817, + "flos": 16180127879040.0, + "grad_norm": 6.410283049050331, + "language_loss": 0.91078633, + "learning_rate": 2.505330072302743e-07, + "loss": 0.92559552, + "num_input_tokens_seen": 302653605, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.24719238, + "step": 14034, + "time_per_iteration": 2.6484436988830566 + }, + { + "auxiliary_loss_clip": 0.01269006, + "auxiliary_loss_mlp": 0.00221539, + "balance_loss_clip": 1.04422498, + "balance_loss_mlp": 0.19615975, + "epoch": 0.8438298511949497, + "flos": 28765847629440.0, + "grad_norm": 20.125297607174176, + "language_loss": 0.85310841, + "learning_rate": 2.503443056028656e-07, + "loss": 0.86801392, + "num_input_tokens_seen": 302673965, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.25378418, + "step": 14035, + "time_per_iteration": 4.292035818099976 + }, + { + "auxiliary_loss_clip": 0.01267692, + "auxiliary_loss_mlp": 0.00244856, + "balance_loss_clip": 1.04692876, + "balance_loss_mlp": 0.22007269, + "epoch": 0.8438899744476176, + "flos": 33724284779520.0, + "grad_norm": 3.8938413731683164, + "language_loss": 0.78328383, + "learning_rate": 2.501556703225751e-07, + "loss": 0.79840934, + "num_input_tokens_seen": 302695560, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.2479248, + "step": 14036, + "time_per_iteration": 2.8351244926452637 + }, + { + "auxiliary_loss_clip": 0.01235348, + "auxiliary_loss_mlp": 0.00208411, + "balance_loss_clip": 1.02818418, + "balance_loss_mlp": 0.18688136, + "epoch": 0.8439500977002856, + "flos": 25110787386240.0, + "grad_norm": 24.710618631353306, + "language_loss": 0.75454032, + "learning_rate": 2.49967101396557e-07, + "loss": 0.76897788, + "num_input_tokens_seen": 302713480, + "router_z_loss_clip": 2.07128906, + "router_z_loss_mlp": 0.21533203, + "step": 14037, + "time_per_iteration": 2.823664426803589 + }, + { + "auxiliary_loss_clip": 0.01253043, + "auxiliary_loss_mlp": 0.0023648, + "balance_loss_clip": 1.03765678, + "balance_loss_mlp": 0.21233991, + "epoch": 0.8440102209529535, + "flos": 32850362880000.0, + "grad_norm": 5.309624110040808, + "language_loss": 0.75874865, + "learning_rate": 2.4977859883196227e-07, + "loss": 0.77364391, + "num_input_tokens_seen": 302736860, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.24145508, + "step": 14038, + "time_per_iteration": 4.3892295360565186 + }, + { + "auxiliary_loss_clip": 0.01261415, + "auxiliary_loss_mlp": 0.00217334, + "balance_loss_clip": 1.04561388, + "balance_loss_mlp": 0.19395688, + "epoch": 0.8440703442056215, + "flos": 23730202195200.0, + "grad_norm": 23.784884832634344, + "language_loss": 0.81205165, + "learning_rate": 2.49590162635938e-07, + "loss": 0.82683909, + "num_input_tokens_seen": 302757745, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.23376465, + "step": 14039, + "time_per_iteration": 2.777498483657837 + }, + { + "auxiliary_loss_clip": 0.01267261, + "auxiliary_loss_mlp": 0.00238907, + "balance_loss_clip": 1.04274011, + "balance_loss_mlp": 0.21237113, + "epoch": 0.8441304674582895, + "flos": 20193719725440.0, + "grad_norm": 78.13862815315439, + "language_loss": 0.88687563, + "learning_rate": 2.4940179281563046e-07, + "loss": 0.90193737, + "num_input_tokens_seen": 302774885, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.26513672, + "step": 14040, + "time_per_iteration": 2.7399117946624756 + }, + { + "auxiliary_loss_clip": 0.01274281, + "auxiliary_loss_mlp": 0.0025049, + "balance_loss_clip": 1.05075121, + "balance_loss_mlp": 0.22385831, + "epoch": 0.8441905907109575, + "flos": 20219897761920.0, + "grad_norm": 6.2609120064525285, + "language_loss": 0.78004873, + "learning_rate": 2.492134893781821e-07, + "loss": 0.79529643, + "num_input_tokens_seen": 302791035, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.26647949, + "step": 14041, + "time_per_iteration": 2.698157548904419 + }, + { + "auxiliary_loss_clip": 0.01267118, + "auxiliary_loss_mlp": 0.00241382, + "balance_loss_clip": 1.04532278, + "balance_loss_mlp": 0.21808863, + "epoch": 0.8442507139636254, + "flos": 13516453987200.0, + "grad_norm": 21.477807277810186, + "language_loss": 0.79660845, + "learning_rate": 2.490252523307341e-07, + "loss": 0.81169343, + "num_input_tokens_seen": 302808650, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.23278809, + "step": 14042, + "time_per_iteration": 2.699533224105835 + }, + { + "auxiliary_loss_clip": 0.01264277, + "auxiliary_loss_mlp": 0.00211476, + "balance_loss_clip": 1.04449964, + "balance_loss_mlp": 0.18650174, + "epoch": 0.8443108372162934, + "flos": 18220212731520.0, + "grad_norm": 47.14073108112217, + "language_loss": 0.84691018, + "learning_rate": 2.4883708168042373e-07, + "loss": 0.86166769, + "num_input_tokens_seen": 302824605, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.24975586, + "step": 14043, + "time_per_iteration": 2.6643974781036377 + }, + { + "auxiliary_loss_clip": 0.01247684, + "auxiliary_loss_mlp": 0.00242272, + "balance_loss_clip": 1.03593874, + "balance_loss_mlp": 0.21896631, + "epoch": 0.8443709604689613, + "flos": 16105110324480.0, + "grad_norm": 5.407239718842649, + "language_loss": 0.80905783, + "learning_rate": 2.486489774343865e-07, + "loss": 0.82395738, + "num_input_tokens_seen": 302840170, + "router_z_loss_clip": 2.11816406, + "router_z_loss_mlp": 0.2331543, + "step": 14044, + "time_per_iteration": 2.6336328983306885 + }, + { + "auxiliary_loss_clip": 0.01241704, + "auxiliary_loss_mlp": 0.00228307, + "balance_loss_clip": 1.02856016, + "balance_loss_mlp": 0.20498976, + "epoch": 0.8444310837216293, + "flos": 18512130562560.0, + "grad_norm": 9.393127127822908, + "language_loss": 0.80821854, + "learning_rate": 2.484609395997559e-07, + "loss": 0.82291865, + "num_input_tokens_seen": 302858320, + "router_z_loss_clip": 2.12988281, + "router_z_loss_mlp": 0.23327637, + "step": 14045, + "time_per_iteration": 2.6563990116119385 + }, + { + "auxiliary_loss_clip": 0.01252322, + "auxiliary_loss_mlp": 0.00226113, + "balance_loss_clip": 1.03426003, + "balance_loss_mlp": 0.2003991, + "epoch": 0.8444912069742974, + "flos": 14939845211520.0, + "grad_norm": 10.761057257524433, + "language_loss": 0.85689843, + "learning_rate": 2.4827296818366216e-07, + "loss": 0.87168276, + "num_input_tokens_seen": 302875255, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.25732422, + "step": 14046, + "time_per_iteration": 2.665637969970703 + }, + { + "auxiliary_loss_clip": 0.01266792, + "auxiliary_loss_mlp": 0.00224128, + "balance_loss_clip": 1.04430485, + "balance_loss_mlp": 0.19861752, + "epoch": 0.8445513302269653, + "flos": 20120318282880.0, + "grad_norm": 28.55373748882652, + "language_loss": 0.85852003, + "learning_rate": 2.4808506319323255e-07, + "loss": 0.87342924, + "num_input_tokens_seen": 302894690, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.25512695, + "step": 14047, + "time_per_iteration": 2.659726858139038 + }, + { + "auxiliary_loss_clip": 0.01271117, + "auxiliary_loss_mlp": 0.00214199, + "balance_loss_clip": 1.05016792, + "balance_loss_mlp": 0.19096512, + "epoch": 0.8446114534796333, + "flos": 31170928533120.0, + "grad_norm": 16.16873100854314, + "language_loss": 0.78338915, + "learning_rate": 2.478972246355935e-07, + "loss": 0.79824233, + "num_input_tokens_seen": 302912405, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.23254395, + "step": 14048, + "time_per_iteration": 2.756404161453247 + }, + { + "auxiliary_loss_clip": 0.01264712, + "auxiliary_loss_mlp": 0.00212213, + "balance_loss_clip": 1.04532516, + "balance_loss_mlp": 0.18730995, + "epoch": 0.8446715767323012, + "flos": 23948323534080.0, + "grad_norm": 112.54317219372383, + "language_loss": 0.81598675, + "learning_rate": 2.477094525178667e-07, + "loss": 0.83075595, + "num_input_tokens_seen": 302932525, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.24902344, + "step": 14049, + "time_per_iteration": 2.697051763534546 + }, + { + "auxiliary_loss_clip": 0.01147784, + "auxiliary_loss_mlp": 0.00221709, + "balance_loss_clip": 1.00206423, + "balance_loss_mlp": 0.21102758, + "epoch": 0.8447316999849692, + "flos": 67984897484160.0, + "grad_norm": 1.4283108240590596, + "language_loss": 0.5982374, + "learning_rate": 2.475217468471729e-07, + "loss": 0.61193234, + "num_input_tokens_seen": 302991285, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.10693359, + "step": 14050, + "time_per_iteration": 3.156367540359497 + }, + { + "auxiliary_loss_clip": 0.01261348, + "auxiliary_loss_mlp": 0.00243198, + "balance_loss_clip": 1.03849828, + "balance_loss_mlp": 0.21893871, + "epoch": 0.8447918232376371, + "flos": 22418924296320.0, + "grad_norm": 3.599125665390115, + "language_loss": 0.81381518, + "learning_rate": 2.473341076306303e-07, + "loss": 0.82886064, + "num_input_tokens_seen": 303009515, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.24243164, + "step": 14051, + "time_per_iteration": 2.705198287963867 + }, + { + "auxiliary_loss_clip": 0.01261475, + "auxiliary_loss_mlp": 0.00245951, + "balance_loss_clip": 1.04097402, + "balance_loss_mlp": 0.21945064, + "epoch": 0.8448519464903052, + "flos": 23694147918720.0, + "grad_norm": 13.037807389236216, + "language_loss": 0.82608008, + "learning_rate": 2.471465348753547e-07, + "loss": 0.84115434, + "num_input_tokens_seen": 303026905, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.26477051, + "step": 14052, + "time_per_iteration": 2.7819175720214844 + }, + { + "auxiliary_loss_clip": 0.01247202, + "auxiliary_loss_mlp": 0.002333, + "balance_loss_clip": 1.03445256, + "balance_loss_mlp": 0.20983969, + "epoch": 0.8449120697429731, + "flos": 13735904129280.0, + "grad_norm": 673.0970583227521, + "language_loss": 0.81142759, + "learning_rate": 2.469590285884575e-07, + "loss": 0.82623261, + "num_input_tokens_seen": 303045245, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.23449707, + "step": 14053, + "time_per_iteration": 2.6766607761383057 + }, + { + "auxiliary_loss_clip": 0.01255827, + "auxiliary_loss_mlp": 0.00236165, + "balance_loss_clip": 1.03815079, + "balance_loss_mlp": 0.21090417, + "epoch": 0.8449721929956411, + "flos": 20886795624960.0, + "grad_norm": 20.546676823314833, + "language_loss": 0.8097654, + "learning_rate": 2.467715887770494e-07, + "loss": 0.82468534, + "num_input_tokens_seen": 303065205, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.25256348, + "step": 14054, + "time_per_iteration": 2.6964504718780518 + }, + { + "auxiliary_loss_clip": 0.01260598, + "auxiliary_loss_mlp": 0.00228661, + "balance_loss_clip": 1.03999007, + "balance_loss_mlp": 0.20386508, + "epoch": 0.845032316248309, + "flos": 33216939129600.0, + "grad_norm": 9.96260890670144, + "language_loss": 0.84211373, + "learning_rate": 2.4658421544823895e-07, + "loss": 0.85700631, + "num_input_tokens_seen": 303088250, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.24804688, + "step": 14055, + "time_per_iteration": 2.803569793701172 + }, + { + "auxiliary_loss_clip": 0.01247686, + "auxiliary_loss_mlp": 0.00207616, + "balance_loss_clip": 1.03410661, + "balance_loss_mlp": 0.18527614, + "epoch": 0.845092439500977, + "flos": 23585230903680.0, + "grad_norm": 12.517435762866613, + "language_loss": 0.7859987, + "learning_rate": 2.463969086091302e-07, + "loss": 0.80055165, + "num_input_tokens_seen": 303109280, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.22338867, + "step": 14056, + "time_per_iteration": 2.7259232997894287 + }, + { + "auxiliary_loss_clip": 0.01262052, + "auxiliary_loss_mlp": 0.00250526, + "balance_loss_clip": 1.03932977, + "balance_loss_mlp": 0.22439513, + "epoch": 0.8451525627536449, + "flos": 13333920048000.0, + "grad_norm": 7.866318057504293, + "language_loss": 0.79103327, + "learning_rate": 2.4620966826682686e-07, + "loss": 0.80615902, + "num_input_tokens_seen": 303126075, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.26123047, + "step": 14057, + "time_per_iteration": 2.697777271270752 + }, + { + "auxiliary_loss_clip": 0.01264352, + "auxiliary_loss_mlp": 0.00230871, + "balance_loss_clip": 1.0437963, + "balance_loss_mlp": 0.2054916, + "epoch": 0.8452126860063129, + "flos": 27817985583360.0, + "grad_norm": 23.358679683453353, + "language_loss": 0.84614348, + "learning_rate": 2.460224944284284e-07, + "loss": 0.86109573, + "num_input_tokens_seen": 303146920, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.25415039, + "step": 14058, + "time_per_iteration": 2.702638864517212 + }, + { + "auxiliary_loss_clip": 0.01257869, + "auxiliary_loss_mlp": 0.00231224, + "balance_loss_clip": 1.04061759, + "balance_loss_mlp": 0.20832399, + "epoch": 0.845272809258981, + "flos": 27124694202240.0, + "grad_norm": 122.40228373534026, + "language_loss": 0.75649643, + "learning_rate": 2.45835387101033e-07, + "loss": 0.77138734, + "num_input_tokens_seen": 303167885, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.22912598, + "step": 14059, + "time_per_iteration": 2.718773365020752 + }, + { + "auxiliary_loss_clip": 0.0129206, + "auxiliary_loss_mlp": 0.0025045, + "balance_loss_clip": 1.05706143, + "balance_loss_mlp": 0.21913409, + "epoch": 0.8453329325116489, + "flos": 18332577452160.0, + "grad_norm": 213.13374059988325, + "language_loss": 0.69253707, + "learning_rate": 2.4564834629173516e-07, + "loss": 0.70796216, + "num_input_tokens_seen": 303185000, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.31335449, + "step": 14060, + "time_per_iteration": 2.663581609725952 + }, + { + "auxiliary_loss_clip": 0.01259691, + "auxiliary_loss_mlp": 0.00256876, + "balance_loss_clip": 1.03801107, + "balance_loss_mlp": 0.22803971, + "epoch": 0.8453930557643169, + "flos": 22675254727680.0, + "grad_norm": 9.282157571589982, + "language_loss": 0.84858519, + "learning_rate": 2.454613720076277e-07, + "loss": 0.86375087, + "num_input_tokens_seen": 303205210, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.28833008, + "step": 14061, + "time_per_iteration": 2.6942708492279053 + }, + { + "auxiliary_loss_clip": 0.01276604, + "auxiliary_loss_mlp": 0.00258766, + "balance_loss_clip": 1.04797077, + "balance_loss_mlp": 0.23177691, + "epoch": 0.8454531790169848, + "flos": 22487261921280.0, + "grad_norm": 10.55864614907716, + "language_loss": 0.83522391, + "learning_rate": 2.452744642558013e-07, + "loss": 0.85057765, + "num_input_tokens_seen": 303224655, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.27026367, + "step": 14062, + "time_per_iteration": 2.6752195358276367 + }, + { + "auxiliary_loss_clip": 0.01140734, + "auxiliary_loss_mlp": 0.00149485, + "balance_loss_clip": 0.99171311, + "balance_loss_mlp": 0.14209385, + "epoch": 0.8455133022696528, + "flos": 58277848481280.0, + "grad_norm": 0.6361557136777004, + "language_loss": 0.52120697, + "learning_rate": 2.450876230433432e-07, + "loss": 0.53410912, + "num_input_tokens_seen": 303289645, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.07373047, + "step": 14063, + "time_per_iteration": 3.261197328567505 + }, + { + "auxiliary_loss_clip": 0.01242818, + "auxiliary_loss_mlp": 0.00238466, + "balance_loss_clip": 1.03105962, + "balance_loss_mlp": 0.21507749, + "epoch": 0.8455734255223207, + "flos": 21361283308800.0, + "grad_norm": 21.005204697699202, + "language_loss": 0.88202095, + "learning_rate": 2.449008483773378e-07, + "loss": 0.89683378, + "num_input_tokens_seen": 303308350, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.23413086, + "step": 14064, + "time_per_iteration": 2.771409034729004 + }, + { + "auxiliary_loss_clip": 0.01256762, + "auxiliary_loss_mlp": 0.00226586, + "balance_loss_clip": 1.0374769, + "balance_loss_mlp": 0.2018625, + "epoch": 0.8456335487749888, + "flos": 20449260057600.0, + "grad_norm": 6.59050265536563, + "language_loss": 0.80290765, + "learning_rate": 2.447141402648685e-07, + "loss": 0.81774116, + "num_input_tokens_seen": 303325230, + "router_z_loss_clip": 2.19238281, + "router_z_loss_mlp": 0.24755859, + "step": 14065, + "time_per_iteration": 2.719221591949463 + }, + { + "auxiliary_loss_clip": 0.01231659, + "auxiliary_loss_mlp": 0.0020354, + "balance_loss_clip": 1.01737452, + "balance_loss_mlp": 0.17975801, + "epoch": 0.8456936720276567, + "flos": 28840901097600.0, + "grad_norm": 105.0441383144679, + "language_loss": 0.82850051, + "learning_rate": 2.445274987130146e-07, + "loss": 0.84285253, + "num_input_tokens_seen": 303345810, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.23803711, + "step": 14066, + "time_per_iteration": 2.8389899730682373 + }, + { + "auxiliary_loss_clip": 0.01263968, + "auxiliary_loss_mlp": 0.00222359, + "balance_loss_clip": 1.04332972, + "balance_loss_mlp": 0.19763537, + "epoch": 0.8457537952803247, + "flos": 22672884430080.0, + "grad_norm": 184.4396474122943, + "language_loss": 0.76552528, + "learning_rate": 2.4434092372885363e-07, + "loss": 0.78038859, + "num_input_tokens_seen": 303365140, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.24731445, + "step": 14067, + "time_per_iteration": 2.8848531246185303 + }, + { + "auxiliary_loss_clip": 0.01250397, + "auxiliary_loss_mlp": 0.00228623, + "balance_loss_clip": 1.03628683, + "balance_loss_mlp": 0.2043401, + "epoch": 0.8458139185329926, + "flos": 33802929607680.0, + "grad_norm": 385.1729473342786, + "language_loss": 0.78192103, + "learning_rate": 2.4415441531946144e-07, + "loss": 0.79671121, + "num_input_tokens_seen": 303386150, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.24267578, + "step": 14068, + "time_per_iteration": 2.8889315128326416 + }, + { + "auxiliary_loss_clip": 0.01146942, + "auxiliary_loss_mlp": 0.00161957, + "balance_loss_clip": 0.99849856, + "balance_loss_mlp": 0.15237299, + "epoch": 0.8458740417856606, + "flos": 70295929603200.0, + "grad_norm": 0.677644693516346, + "language_loss": 0.59635663, + "learning_rate": 2.4396797349190976e-07, + "loss": 0.60944557, + "num_input_tokens_seen": 303453770, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.09570312, + "step": 14069, + "time_per_iteration": 3.2998690605163574 + }, + { + "auxiliary_loss_clip": 0.01251007, + "auxiliary_loss_mlp": 0.00210993, + "balance_loss_clip": 1.03064847, + "balance_loss_mlp": 0.18771149, + "epoch": 0.8459341650383285, + "flos": 24170862245760.0, + "grad_norm": 24.726926691772483, + "language_loss": 0.82489204, + "learning_rate": 2.4378159825326804e-07, + "loss": 0.83951205, + "num_input_tokens_seen": 303474520, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.23291016, + "step": 14070, + "time_per_iteration": 2.741861581802368 + }, + { + "auxiliary_loss_clip": 0.01250387, + "auxiliary_loss_mlp": 0.00205635, + "balance_loss_clip": 1.03539085, + "balance_loss_mlp": 0.18144755, + "epoch": 0.8459942882909965, + "flos": 38181158369280.0, + "grad_norm": 4.984591778324076, + "language_loss": 0.71972829, + "learning_rate": 2.435952896106039e-07, + "loss": 0.73428845, + "num_input_tokens_seen": 303497345, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.24182129, + "step": 14071, + "time_per_iteration": 4.362313747406006 + }, + { + "auxiliary_loss_clip": 0.01152296, + "auxiliary_loss_mlp": 0.00120599, + "balance_loss_clip": 1.00421214, + "balance_loss_mlp": 0.11287411, + "epoch": 0.8460544115436646, + "flos": 64118252177280.0, + "grad_norm": 0.7219923093408456, + "language_loss": 0.60493946, + "learning_rate": 2.4340904757098313e-07, + "loss": 0.61766839, + "num_input_tokens_seen": 303554890, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.07714844, + "step": 14072, + "time_per_iteration": 4.404930353164673 + }, + { + "auxiliary_loss_clip": 0.01264627, + "auxiliary_loss_mlp": 0.00229774, + "balance_loss_clip": 1.03897214, + "balance_loss_mlp": 0.20316616, + "epoch": 0.8461145347963325, + "flos": 24170826332160.0, + "grad_norm": 2.907804182309453, + "language_loss": 0.80631471, + "learning_rate": 2.4322287214146664e-07, + "loss": 0.82125866, + "num_input_tokens_seen": 303574380, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.26635742, + "step": 14073, + "time_per_iteration": 2.6860878467559814 + }, + { + "auxiliary_loss_clip": 0.01279891, + "auxiliary_loss_mlp": 0.00230006, + "balance_loss_clip": 1.05206299, + "balance_loss_mlp": 0.20308848, + "epoch": 0.8461746580490005, + "flos": 34893787697280.0, + "grad_norm": 81.81823270753848, + "language_loss": 0.86622536, + "learning_rate": 2.430367633291155e-07, + "loss": 0.88132429, + "num_input_tokens_seen": 303594910, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.26928711, + "step": 14074, + "time_per_iteration": 2.9131650924682617 + }, + { + "auxiliary_loss_clip": 0.01253525, + "auxiliary_loss_mlp": 0.0020974, + "balance_loss_clip": 1.03345346, + "balance_loss_mlp": 0.18558797, + "epoch": 0.8462347813016684, + "flos": 25557014044800.0, + "grad_norm": 13.670863615595035, + "language_loss": 0.83395565, + "learning_rate": 2.4285072114098583e-07, + "loss": 0.84858835, + "num_input_tokens_seen": 303613520, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.24157715, + "step": 14075, + "time_per_iteration": 2.719479560852051 + }, + { + "auxiliary_loss_clip": 0.01248128, + "auxiliary_loss_mlp": 0.00253724, + "balance_loss_clip": 1.03147554, + "balance_loss_mlp": 0.22867835, + "epoch": 0.8462949045543364, + "flos": 21325336773120.0, + "grad_norm": 443.37297686075846, + "language_loss": 0.81333846, + "learning_rate": 2.4266474558413355e-07, + "loss": 0.82835698, + "num_input_tokens_seen": 303631225, + "router_z_loss_clip": 2.16503906, + "router_z_loss_mlp": 0.25036621, + "step": 14076, + "time_per_iteration": 2.7429816722869873 + }, + { + "auxiliary_loss_clip": 0.01275695, + "auxiliary_loss_mlp": 0.00240329, + "balance_loss_clip": 1.04798913, + "balance_loss_mlp": 0.21504499, + "epoch": 0.8463550278070043, + "flos": 22637440684800.0, + "grad_norm": 117.19735803105011, + "language_loss": 0.88294631, + "learning_rate": 2.4247883666560945e-07, + "loss": 0.89810658, + "num_input_tokens_seen": 303649175, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.25280762, + "step": 14077, + "time_per_iteration": 4.145796060562134 + }, + { + "auxiliary_loss_clip": 0.01269474, + "auxiliary_loss_mlp": 0.00219273, + "balance_loss_clip": 1.0459342, + "balance_loss_mlp": 0.19615883, + "epoch": 0.8464151510596724, + "flos": 13005588804480.0, + "grad_norm": 17.998876682724976, + "language_loss": 0.87838531, + "learning_rate": 2.422929943924643e-07, + "loss": 0.89327276, + "num_input_tokens_seen": 303665915, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.23132324, + "step": 14078, + "time_per_iteration": 2.6467270851135254 + }, + { + "auxiliary_loss_clip": 0.01261952, + "auxiliary_loss_mlp": 0.00221644, + "balance_loss_clip": 1.0449152, + "balance_loss_mlp": 0.19722964, + "epoch": 0.8464752743123403, + "flos": 15704921923200.0, + "grad_norm": 4.322404547187289, + "language_loss": 0.92045605, + "learning_rate": 2.4210721877174565e-07, + "loss": 0.93529201, + "num_input_tokens_seen": 303679985, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.24401855, + "step": 14079, + "time_per_iteration": 2.680760622024536 + }, + { + "auxiliary_loss_clip": 0.01285628, + "auxiliary_loss_mlp": 0.00220904, + "balance_loss_clip": 1.0554359, + "balance_loss_mlp": 0.19418889, + "epoch": 0.8465353975650083, + "flos": 21653955325440.0, + "grad_norm": 6.906334873987103, + "language_loss": 0.70808482, + "learning_rate": 2.419215098104965e-07, + "loss": 0.72315013, + "num_input_tokens_seen": 303698470, + "router_z_loss_clip": 2.30371094, + "router_z_loss_mlp": 0.26721191, + "step": 14080, + "time_per_iteration": 4.069557189941406 + }, + { + "auxiliary_loss_clip": 0.01283967, + "auxiliary_loss_mlp": 0.00241176, + "balance_loss_clip": 1.05689371, + "balance_loss_mlp": 0.21503344, + "epoch": 0.8465955208176762, + "flos": 18515650095360.0, + "grad_norm": 27.127250637039708, + "language_loss": 0.76507938, + "learning_rate": 2.4173586751576014e-07, + "loss": 0.78033078, + "num_input_tokens_seen": 303716415, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.26159668, + "step": 14081, + "time_per_iteration": 2.6432697772979736 + }, + { + "auxiliary_loss_clip": 0.01261036, + "auxiliary_loss_mlp": 0.00231412, + "balance_loss_clip": 1.04142118, + "balance_loss_mlp": 0.20752245, + "epoch": 0.8466556440703442, + "flos": 24200559815040.0, + "grad_norm": 12.67107487481706, + "language_loss": 0.81159133, + "learning_rate": 2.41550291894576e-07, + "loss": 0.82651579, + "num_input_tokens_seen": 303734490, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.23864746, + "step": 14082, + "time_per_iteration": 2.809234619140625 + }, + { + "auxiliary_loss_clip": 0.01251232, + "auxiliary_loss_mlp": 0.00236419, + "balance_loss_clip": 1.03237343, + "balance_loss_mlp": 0.21198145, + "epoch": 0.8467157673230121, + "flos": 20375894528640.0, + "grad_norm": 7.694821415779886, + "language_loss": 0.82511455, + "learning_rate": 2.413647829539809e-07, + "loss": 0.83999109, + "num_input_tokens_seen": 303752310, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.24414062, + "step": 14083, + "time_per_iteration": 2.6410090923309326 + }, + { + "auxiliary_loss_clip": 0.01267304, + "auxiliary_loss_mlp": 0.00222399, + "balance_loss_clip": 1.04659963, + "balance_loss_mlp": 0.19644761, + "epoch": 0.8467758905756801, + "flos": 28473642489600.0, + "grad_norm": 10.658477980105587, + "language_loss": 0.73626757, + "learning_rate": 2.411793407010092e-07, + "loss": 0.75116467, + "num_input_tokens_seen": 303776065, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.2598877, + "step": 14084, + "time_per_iteration": 2.7728354930877686 + }, + { + "auxiliary_loss_clip": 0.01258596, + "auxiliary_loss_mlp": 0.00215777, + "balance_loss_clip": 1.04191828, + "balance_loss_mlp": 0.19257893, + "epoch": 0.8468360138283482, + "flos": 11692551139200.0, + "grad_norm": 23.416390631339503, + "language_loss": 0.81210387, + "learning_rate": 2.409939651426938e-07, + "loss": 0.82684767, + "num_input_tokens_seen": 303793500, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.23205566, + "step": 14085, + "time_per_iteration": 2.619007110595703 + }, + { + "auxiliary_loss_clip": 0.0124512, + "auxiliary_loss_mlp": 0.00217676, + "balance_loss_clip": 1.02935553, + "balance_loss_mlp": 0.19465648, + "epoch": 0.8468961370810161, + "flos": 24607859109120.0, + "grad_norm": 651.5973739362198, + "language_loss": 0.77483928, + "learning_rate": 2.408086562860634e-07, + "loss": 0.78946716, + "num_input_tokens_seen": 303814835, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.23022461, + "step": 14086, + "time_per_iteration": 2.7769453525543213 + }, + { + "auxiliary_loss_clip": 0.01252688, + "auxiliary_loss_mlp": 0.00232502, + "balance_loss_clip": 1.03933918, + "balance_loss_mlp": 0.20960158, + "epoch": 0.8469562603336841, + "flos": 19609812236160.0, + "grad_norm": 5.244998139712387, + "language_loss": 0.82359004, + "learning_rate": 2.4062341413814445e-07, + "loss": 0.83844191, + "num_input_tokens_seen": 303834505, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.22912598, + "step": 14087, + "time_per_iteration": 2.7103874683380127 + }, + { + "auxiliary_loss_clip": 0.01254748, + "auxiliary_loss_mlp": 0.00191097, + "balance_loss_clip": 1.03789663, + "balance_loss_mlp": 0.1689598, + "epoch": 0.847016383586352, + "flos": 22638949056000.0, + "grad_norm": 4.413488480277439, + "language_loss": 0.80530077, + "learning_rate": 2.4043823870596227e-07, + "loss": 0.81975925, + "num_input_tokens_seen": 303855050, + "router_z_loss_clip": 2.16699219, + "router_z_loss_mlp": 0.22155762, + "step": 14088, + "time_per_iteration": 2.743617057800293 + }, + { + "auxiliary_loss_clip": 0.01269175, + "auxiliary_loss_mlp": 0.00223899, + "balance_loss_clip": 1.04665124, + "balance_loss_mlp": 0.19837667, + "epoch": 0.84707650683902, + "flos": 20960161153920.0, + "grad_norm": 52.729474088172445, + "language_loss": 0.83298182, + "learning_rate": 2.402531299965387e-07, + "loss": 0.84791255, + "num_input_tokens_seen": 303875635, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.25549316, + "step": 14089, + "time_per_iteration": 2.6954169273376465 + }, + { + "auxiliary_loss_clip": 0.01253194, + "auxiliary_loss_mlp": 0.00236657, + "balance_loss_clip": 1.04022348, + "balance_loss_mlp": 0.21308948, + "epoch": 0.8471366300916879, + "flos": 24093007516800.0, + "grad_norm": 8.236392737766359, + "language_loss": 0.84101272, + "learning_rate": 2.400680880168928e-07, + "loss": 0.85591125, + "num_input_tokens_seen": 303896750, + "router_z_loss_clip": 2.13378906, + "router_z_loss_mlp": 0.23547363, + "step": 14090, + "time_per_iteration": 2.6903371810913086 + }, + { + "auxiliary_loss_clip": 0.01284905, + "auxiliary_loss_mlp": 0.00247046, + "balance_loss_clip": 1.0569706, + "balance_loss_mlp": 0.22055751, + "epoch": 0.847196753344356, + "flos": 18332900674560.0, + "grad_norm": 10.320729654247975, + "language_loss": 0.86870825, + "learning_rate": 2.3988311277404085e-07, + "loss": 0.88402778, + "num_input_tokens_seen": 303915435, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.26489258, + "step": 14091, + "time_per_iteration": 2.7105090618133545 + }, + { + "auxiliary_loss_clip": 0.01149553, + "auxiliary_loss_mlp": 0.0015225, + "balance_loss_clip": 1.00085616, + "balance_loss_mlp": 0.1440963, + "epoch": 0.8472568765970239, + "flos": 49567536956160.0, + "grad_norm": 0.8071600436633227, + "language_loss": 0.58941996, + "learning_rate": 2.396982042749982e-07, + "loss": 0.60243797, + "num_input_tokens_seen": 303977245, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.08154297, + "step": 14092, + "time_per_iteration": 3.2294692993164062 + }, + { + "auxiliary_loss_clip": 0.01251169, + "auxiliary_loss_mlp": 0.00232854, + "balance_loss_clip": 1.03750587, + "balance_loss_mlp": 0.20866624, + "epoch": 0.8473169998496919, + "flos": 19279074781440.0, + "grad_norm": 141.2053472442435, + "language_loss": 0.78046072, + "learning_rate": 2.395133625267756e-07, + "loss": 0.79530096, + "num_input_tokens_seen": 303996055, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.24157715, + "step": 14093, + "time_per_iteration": 2.751455307006836 + }, + { + "auxiliary_loss_clip": 0.01248976, + "auxiliary_loss_mlp": 0.00213087, + "balance_loss_clip": 1.03222334, + "balance_loss_mlp": 0.18781486, + "epoch": 0.8473771231023598, + "flos": 17675555829120.0, + "grad_norm": 36.27473339707779, + "language_loss": 0.91806626, + "learning_rate": 2.3932858753638263e-07, + "loss": 0.93268687, + "num_input_tokens_seen": 304012205, + "router_z_loss_clip": 2.16699219, + "router_z_loss_mlp": 0.25256348, + "step": 14094, + "time_per_iteration": 2.655585765838623 + }, + { + "auxiliary_loss_clip": 0.01234815, + "auxiliary_loss_mlp": 0.00203717, + "balance_loss_clip": 1.02606869, + "balance_loss_mlp": 0.18149662, + "epoch": 0.8474372463550278, + "flos": 26359761144960.0, + "grad_norm": 13.101543195701902, + "language_loss": 0.78148043, + "learning_rate": 2.3914387931082626e-07, + "loss": 0.79586577, + "num_input_tokens_seen": 304033475, + "router_z_loss_clip": 2.0859375, + "router_z_loss_mlp": 0.2220459, + "step": 14095, + "time_per_iteration": 2.8285298347473145 + }, + { + "auxiliary_loss_clip": 0.01257774, + "auxiliary_loss_mlp": 0.00222226, + "balance_loss_clip": 1.0393275, + "balance_loss_mlp": 0.19826454, + "epoch": 0.8474973696076957, + "flos": 23402050519680.0, + "grad_norm": 9.41505447614017, + "language_loss": 0.88214773, + "learning_rate": 2.3895923785711105e-07, + "loss": 0.89694774, + "num_input_tokens_seen": 304051845, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.23950195, + "step": 14096, + "time_per_iteration": 2.737312078475952 + }, + { + "auxiliary_loss_clip": 0.01270966, + "auxiliary_loss_mlp": 0.00224257, + "balance_loss_clip": 1.0443604, + "balance_loss_mlp": 0.19911599, + "epoch": 0.8475574928603637, + "flos": 25075666863360.0, + "grad_norm": 2.6094696484212063, + "language_loss": 0.85985255, + "learning_rate": 2.387746631822374e-07, + "loss": 0.87480474, + "num_input_tokens_seen": 304069965, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.25158691, + "step": 14097, + "time_per_iteration": 2.8089165687561035 + }, + { + "auxiliary_loss_clip": 0.01250043, + "auxiliary_loss_mlp": 0.00218608, + "balance_loss_clip": 1.03325486, + "balance_loss_mlp": 0.1948491, + "epoch": 0.8476176161130318, + "flos": 19966691813760.0, + "grad_norm": 4.260956208722383, + "language_loss": 0.89849591, + "learning_rate": 2.385901552932048e-07, + "loss": 0.91318238, + "num_input_tokens_seen": 304086805, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.23742676, + "step": 14098, + "time_per_iteration": 2.7213079929351807 + }, + { + "auxiliary_loss_clip": 0.01246599, + "auxiliary_loss_mlp": 0.00242739, + "balance_loss_clip": 1.03473687, + "balance_loss_mlp": 0.21858755, + "epoch": 0.8476777393656997, + "flos": 21285834791040.0, + "grad_norm": 45.30672232737521, + "language_loss": 0.79892719, + "learning_rate": 2.3840571419701062e-07, + "loss": 0.8138206, + "num_input_tokens_seen": 304105865, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.24157715, + "step": 14099, + "time_per_iteration": 2.7016491889953613 + }, + { + "auxiliary_loss_clip": 0.01270871, + "auxiliary_loss_mlp": 0.00239829, + "balance_loss_clip": 1.04989886, + "balance_loss_mlp": 0.21368605, + "epoch": 0.8477378626183677, + "flos": 29971476650880.0, + "grad_norm": 94.46462381579936, + "language_loss": 0.71570277, + "learning_rate": 2.3822133990064787e-07, + "loss": 0.73080981, + "num_input_tokens_seen": 304128300, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.26147461, + "step": 14100, + "time_per_iteration": 2.7585854530334473 + }, + { + "auxiliary_loss_clip": 0.01275299, + "auxiliary_loss_mlp": 0.00235978, + "balance_loss_clip": 1.05028582, + "balance_loss_mlp": 0.20758194, + "epoch": 0.8477979858710356, + "flos": 24237727413120.0, + "grad_norm": 15.33281906425421, + "language_loss": 0.84967434, + "learning_rate": 2.380370324111085e-07, + "loss": 0.8647871, + "num_input_tokens_seen": 304143695, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.28381348, + "step": 14101, + "time_per_iteration": 2.7331595420837402 + }, + { + "auxiliary_loss_clip": 0.01255919, + "auxiliary_loss_mlp": 0.00220749, + "balance_loss_clip": 1.03790855, + "balance_loss_mlp": 0.19557217, + "epoch": 0.8478581091237036, + "flos": 25593678852480.0, + "grad_norm": 25.509507941137116, + "language_loss": 0.78099775, + "learning_rate": 2.3785279173538163e-07, + "loss": 0.79576445, + "num_input_tokens_seen": 304165800, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.25183105, + "step": 14102, + "time_per_iteration": 2.7837982177734375 + }, + { + "auxiliary_loss_clip": 0.01266244, + "auxiliary_loss_mlp": 0.00214421, + "balance_loss_clip": 1.04443264, + "balance_loss_mlp": 0.19044799, + "epoch": 0.8479182323763715, + "flos": 12057116227200.0, + "grad_norm": 14.359700034406025, + "language_loss": 0.92986453, + "learning_rate": 2.3766861788045366e-07, + "loss": 0.94467115, + "num_input_tokens_seen": 304182910, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.23986816, + "step": 14103, + "time_per_iteration": 2.69295597076416 + }, + { + "auxiliary_loss_clip": 0.01264984, + "auxiliary_loss_mlp": 0.00228756, + "balance_loss_clip": 1.04814458, + "balance_loss_mlp": 0.20450863, + "epoch": 0.8479783556290396, + "flos": 21433391861760.0, + "grad_norm": 24.58196023644449, + "language_loss": 0.85956657, + "learning_rate": 2.374845108533079e-07, + "loss": 0.87450397, + "num_input_tokens_seen": 304200175, + "router_z_loss_clip": 2.16699219, + "router_z_loss_mlp": 0.24243164, + "step": 14104, + "time_per_iteration": 2.7226693630218506 + }, + { + "auxiliary_loss_clip": 0.01262404, + "auxiliary_loss_mlp": 0.00260778, + "balance_loss_clip": 1.0462532, + "balance_loss_mlp": 0.23440938, + "epoch": 0.8480384788817075, + "flos": 19642634288640.0, + "grad_norm": 2.264858473742614, + "language_loss": 0.85331905, + "learning_rate": 2.3730047066092607e-07, + "loss": 0.86855078, + "num_input_tokens_seen": 304217775, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.26391602, + "step": 14105, + "time_per_iteration": 2.6739699840545654 + }, + { + "auxiliary_loss_clip": 0.01285318, + "auxiliary_loss_mlp": 0.00232369, + "balance_loss_clip": 1.05945766, + "balance_loss_mlp": 0.20670286, + "epoch": 0.8480986021343755, + "flos": 22489201255680.0, + "grad_norm": 3.955443851655256, + "language_loss": 0.60108554, + "learning_rate": 2.3711649731028749e-07, + "loss": 0.61626244, + "num_input_tokens_seen": 304235760, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.25683594, + "step": 14106, + "time_per_iteration": 2.694535255432129 + }, + { + "auxiliary_loss_clip": 0.01262549, + "auxiliary_loss_mlp": 0.00230923, + "balance_loss_clip": 1.04189181, + "balance_loss_mlp": 0.20579395, + "epoch": 0.8481587253870434, + "flos": 22090557139200.0, + "grad_norm": 20.44814551509585, + "language_loss": 0.85315531, + "learning_rate": 2.3693259080836792e-07, + "loss": 0.86809003, + "num_input_tokens_seen": 304253985, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.25146484, + "step": 14107, + "time_per_iteration": 2.750936985015869 + }, + { + "auxiliary_loss_clip": 0.01256576, + "auxiliary_loss_mlp": 0.00227948, + "balance_loss_clip": 1.03652954, + "balance_loss_mlp": 0.20314068, + "epoch": 0.8482188486397114, + "flos": 33582689366400.0, + "grad_norm": 68.31997429341723, + "language_loss": 0.79512757, + "learning_rate": 2.3674875116214087e-07, + "loss": 0.80997288, + "num_input_tokens_seen": 304276785, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.24816895, + "step": 14108, + "time_per_iteration": 2.832409620285034 + }, + { + "auxiliary_loss_clip": 0.01240837, + "auxiliary_loss_mlp": 0.00222879, + "balance_loss_clip": 1.02939689, + "balance_loss_mlp": 0.20016935, + "epoch": 0.8482789718923793, + "flos": 20919402195840.0, + "grad_norm": 18.644032999838654, + "language_loss": 0.80032754, + "learning_rate": 2.3656497837857836e-07, + "loss": 0.81496471, + "num_input_tokens_seen": 304296310, + "router_z_loss_clip": 2.11230469, + "router_z_loss_mlp": 0.22705078, + "step": 14109, + "time_per_iteration": 2.812953472137451 + }, + { + "auxiliary_loss_clip": 0.01256874, + "auxiliary_loss_mlp": 0.00209659, + "balance_loss_clip": 1.0404774, + "balance_loss_mlp": 0.18598372, + "epoch": 0.8483390951450474, + "flos": 12896204912640.0, + "grad_norm": 36.23868347492057, + "language_loss": 0.84204096, + "learning_rate": 2.3638127246464811e-07, + "loss": 0.8567062, + "num_input_tokens_seen": 304311715, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.23669434, + "step": 14110, + "time_per_iteration": 2.730633020401001 + }, + { + "auxiliary_loss_clip": 0.01258243, + "auxiliary_loss_mlp": 0.00211833, + "balance_loss_clip": 1.04025102, + "balance_loss_mlp": 0.18571442, + "epoch": 0.8483992183977154, + "flos": 25081628520960.0, + "grad_norm": 343.00451382520555, + "language_loss": 0.83997172, + "learning_rate": 2.3619763342731658e-07, + "loss": 0.85467249, + "num_input_tokens_seen": 304331910, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.26135254, + "step": 14111, + "time_per_iteration": 2.7635891437530518 + }, + { + "auxiliary_loss_clip": 0.01239634, + "auxiliary_loss_mlp": 0.00229985, + "balance_loss_clip": 1.0235877, + "balance_loss_mlp": 0.20590471, + "epoch": 0.8484593416503833, + "flos": 25557445008000.0, + "grad_norm": 31.4414191370526, + "language_loss": 0.74891275, + "learning_rate": 2.3601406127354772e-07, + "loss": 0.76360893, + "num_input_tokens_seen": 304351405, + "router_z_loss_clip": 2.16308594, + "router_z_loss_mlp": 0.24060059, + "step": 14112, + "time_per_iteration": 2.7937867641448975 + }, + { + "auxiliary_loss_clip": 0.01250598, + "auxiliary_loss_mlp": 0.00226829, + "balance_loss_clip": 1.03441668, + "balance_loss_mlp": 0.20441729, + "epoch": 0.8485194649030513, + "flos": 27198454780800.0, + "grad_norm": 3.822794192762103, + "language_loss": 0.81300449, + "learning_rate": 2.3583055601030312e-07, + "loss": 0.82777882, + "num_input_tokens_seen": 304372935, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.22399902, + "step": 14113, + "time_per_iteration": 4.265378713607788 + }, + { + "auxiliary_loss_clip": 0.01275761, + "auxiliary_loss_mlp": 0.00227763, + "balance_loss_clip": 1.05256355, + "balance_loss_mlp": 0.20216875, + "epoch": 0.8485795881557192, + "flos": 24205910941440.0, + "grad_norm": 12.176067959296192, + "language_loss": 0.7483176, + "learning_rate": 2.3564711764454003e-07, + "loss": 0.76335287, + "num_input_tokens_seen": 304393070, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.2557373, + "step": 14114, + "time_per_iteration": 4.134046316146851 + }, + { + "auxiliary_loss_clip": 0.01257531, + "auxiliary_loss_mlp": 0.00250931, + "balance_loss_clip": 1.03940737, + "balance_loss_mlp": 0.22382301, + "epoch": 0.8486397114083872, + "flos": 21141653598720.0, + "grad_norm": 3.9953943630432582, + "language_loss": 0.87336671, + "learning_rate": 2.3546374618321495e-07, + "loss": 0.8884514, + "num_input_tokens_seen": 304411195, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.27111816, + "step": 14115, + "time_per_iteration": 2.757525682449341 + }, + { + "auxiliary_loss_clip": 0.01258725, + "auxiliary_loss_mlp": 0.00223367, + "balance_loss_clip": 1.04056549, + "balance_loss_mlp": 0.19826113, + "epoch": 0.8486998346610551, + "flos": 19974772373760.0, + "grad_norm": 2.7301004416734154, + "language_loss": 0.88763273, + "learning_rate": 2.3528044163328187e-07, + "loss": 0.90245366, + "num_input_tokens_seen": 304429425, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.25109863, + "step": 14116, + "time_per_iteration": 2.7475719451904297 + }, + { + "auxiliary_loss_clip": 0.01254848, + "auxiliary_loss_mlp": 0.00196711, + "balance_loss_clip": 1.03672659, + "balance_loss_mlp": 0.17246418, + "epoch": 0.8487599579137232, + "flos": 19792310261760.0, + "grad_norm": 5.701828901919393, + "language_loss": 0.75548327, + "learning_rate": 2.3509720400169076e-07, + "loss": 0.76999891, + "num_input_tokens_seen": 304447460, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.24279785, + "step": 14117, + "time_per_iteration": 2.7552623748779297 + }, + { + "auxiliary_loss_clip": 0.01251233, + "auxiliary_loss_mlp": 0.00220911, + "balance_loss_clip": 1.03465474, + "balance_loss_mlp": 0.19756992, + "epoch": 0.8488200811663911, + "flos": 26396030903040.0, + "grad_norm": 3.5873541300646625, + "language_loss": 0.74546659, + "learning_rate": 2.3491403329539096e-07, + "loss": 0.76018798, + "num_input_tokens_seen": 304468230, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.23339844, + "step": 14118, + "time_per_iteration": 2.767672538757324 + }, + { + "auxiliary_loss_clip": 0.01244739, + "auxiliary_loss_mlp": 0.00206844, + "balance_loss_clip": 1.03162897, + "balance_loss_mlp": 0.18529111, + "epoch": 0.8488802044190591, + "flos": 16359285939840.0, + "grad_norm": 68.0383087711467, + "language_loss": 0.80324948, + "learning_rate": 2.3473092952132757e-07, + "loss": 0.8177653, + "num_input_tokens_seen": 304484860, + "router_z_loss_clip": 2.12988281, + "router_z_loss_mlp": 0.21557617, + "step": 14119, + "time_per_iteration": 4.2154765129089355 + }, + { + "auxiliary_loss_clip": 0.01263208, + "auxiliary_loss_mlp": 0.00240461, + "balance_loss_clip": 1.04622245, + "balance_loss_mlp": 0.21536735, + "epoch": 0.848940327671727, + "flos": 19208869649280.0, + "grad_norm": 287.1657993689893, + "language_loss": 0.85462862, + "learning_rate": 2.345478926864446e-07, + "loss": 0.86966532, + "num_input_tokens_seen": 304503575, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.25085449, + "step": 14120, + "time_per_iteration": 2.7213051319122314 + }, + { + "auxiliary_loss_clip": 0.01264774, + "auxiliary_loss_mlp": 0.00238491, + "balance_loss_clip": 1.04411745, + "balance_loss_mlp": 0.21298024, + "epoch": 0.849000450924395, + "flos": 21871178824320.0, + "grad_norm": 101.1119628733246, + "language_loss": 0.82321262, + "learning_rate": 2.3436492279768227e-07, + "loss": 0.83824527, + "num_input_tokens_seen": 304525005, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.25500488, + "step": 14121, + "time_per_iteration": 2.74642276763916 + }, + { + "auxiliary_loss_clip": 0.01136366, + "auxiliary_loss_mlp": 0.00087442, + "balance_loss_clip": 0.99316621, + "balance_loss_mlp": 0.08095723, + "epoch": 0.8490605741770629, + "flos": 71166475624320.0, + "grad_norm": 0.7801563130309834, + "language_loss": 0.59344292, + "learning_rate": 2.3418201986197883e-07, + "loss": 0.605681, + "num_input_tokens_seen": 304585220, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.06494141, + "step": 14122, + "time_per_iteration": 4.655341625213623 + }, + { + "auxiliary_loss_clip": 0.0125317, + "auxiliary_loss_mlp": 0.00205015, + "balance_loss_clip": 1.03498101, + "balance_loss_mlp": 0.18072012, + "epoch": 0.849120697429731, + "flos": 24973357950720.0, + "grad_norm": 10.537211010791724, + "language_loss": 0.90467894, + "learning_rate": 2.3399918388627048e-07, + "loss": 0.91926074, + "num_input_tokens_seen": 304604665, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.24291992, + "step": 14123, + "time_per_iteration": 2.7931089401245117 + }, + { + "auxiliary_loss_clip": 0.01234166, + "auxiliary_loss_mlp": 0.00221183, + "balance_loss_clip": 1.02519751, + "balance_loss_mlp": 0.19729345, + "epoch": 0.8491808206823989, + "flos": 23032277959680.0, + "grad_norm": 12.96170640249753, + "language_loss": 0.90388352, + "learning_rate": 2.3381641487749016e-07, + "loss": 0.918437, + "num_input_tokens_seen": 304620600, + "router_z_loss_clip": 2.08984375, + "router_z_loss_mlp": 0.2388916, + "step": 14124, + "time_per_iteration": 2.7274348735809326 + }, + { + "auxiliary_loss_clip": 0.01261295, + "auxiliary_loss_mlp": 0.00217126, + "balance_loss_clip": 1.04186189, + "balance_loss_mlp": 0.19398715, + "epoch": 0.8492409439350669, + "flos": 23878549365120.0, + "grad_norm": 16.56102732774694, + "language_loss": 0.80170667, + "learning_rate": 2.3363371284256805e-07, + "loss": 0.81649089, + "num_input_tokens_seen": 304639540, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.23132324, + "step": 14125, + "time_per_iteration": 2.7850608825683594 + }, + { + "auxiliary_loss_clip": 0.01289088, + "auxiliary_loss_mlp": 0.00238259, + "balance_loss_clip": 1.05904818, + "balance_loss_mlp": 0.21063834, + "epoch": 0.8493010671877349, + "flos": 22419893963520.0, + "grad_norm": 181.77009602047218, + "language_loss": 0.81040549, + "learning_rate": 2.3345107778843288e-07, + "loss": 0.82567894, + "num_input_tokens_seen": 304660595, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.27612305, + "step": 14126, + "time_per_iteration": 2.7433271408081055 + }, + { + "auxiliary_loss_clip": 0.01233414, + "auxiliary_loss_mlp": 0.00221314, + "balance_loss_clip": 1.02325749, + "balance_loss_mlp": 0.19779383, + "epoch": 0.8493611904404028, + "flos": 17529435302400.0, + "grad_norm": 87.36279460721846, + "language_loss": 0.75485253, + "learning_rate": 2.3326850972200928e-07, + "loss": 0.76939976, + "num_input_tokens_seen": 304679580, + "router_z_loss_clip": 2.10058594, + "router_z_loss_mlp": 0.23498535, + "step": 14127, + "time_per_iteration": 2.842017650604248 + }, + { + "auxiliary_loss_clip": 0.01271351, + "auxiliary_loss_mlp": 0.00212525, + "balance_loss_clip": 1.04503644, + "balance_loss_mlp": 0.18533294, + "epoch": 0.8494213136930708, + "flos": 19462937523840.0, + "grad_norm": 6.515291597690145, + "language_loss": 0.8004117, + "learning_rate": 2.330860086502211e-07, + "loss": 0.8152504, + "num_input_tokens_seen": 304698385, + "router_z_loss_clip": 2.26660156, + "router_z_loss_mlp": 0.27209473, + "step": 14128, + "time_per_iteration": 2.680050849914551 + }, + { + "auxiliary_loss_clip": 0.0124726, + "auxiliary_loss_mlp": 0.00226506, + "balance_loss_clip": 1.03300488, + "balance_loss_mlp": 0.20097157, + "epoch": 0.8494814369457387, + "flos": 18770292587520.0, + "grad_norm": 6.078610346995285, + "language_loss": 0.85391974, + "learning_rate": 2.3290357457998855e-07, + "loss": 0.86865735, + "num_input_tokens_seen": 304715430, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.25549316, + "step": 14129, + "time_per_iteration": 2.7344493865966797 + }, + { + "auxiliary_loss_clip": 0.01251618, + "auxiliary_loss_mlp": 0.00211252, + "balance_loss_clip": 1.03616929, + "balance_loss_mlp": 0.18776768, + "epoch": 0.8495415601984068, + "flos": 23331486251520.0, + "grad_norm": 6.749478107763598, + "language_loss": 0.73935407, + "learning_rate": 2.3272120751823031e-07, + "loss": 0.75398278, + "num_input_tokens_seen": 304734345, + "router_z_loss_clip": 2.15722656, + "router_z_loss_mlp": 0.23498535, + "step": 14130, + "time_per_iteration": 2.7283401489257812 + }, + { + "auxiliary_loss_clip": 0.01244213, + "auxiliary_loss_mlp": 0.00219866, + "balance_loss_clip": 1.03160143, + "balance_loss_mlp": 0.19572575, + "epoch": 0.8496016834510747, + "flos": 26612859352320.0, + "grad_norm": 20.954668869267543, + "language_loss": 0.78296667, + "learning_rate": 2.3253890747186e-07, + "loss": 0.79760742, + "num_input_tokens_seen": 304755030, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.24145508, + "step": 14131, + "time_per_iteration": 2.805588960647583 + }, + { + "auxiliary_loss_clip": 0.01240878, + "auxiliary_loss_mlp": 0.00219402, + "balance_loss_clip": 1.0303576, + "balance_loss_mlp": 0.19542943, + "epoch": 0.8496618067037427, + "flos": 25480380378240.0, + "grad_norm": 18.793284172914905, + "language_loss": 0.76539636, + "learning_rate": 2.3235667444779162e-07, + "loss": 0.77999914, + "num_input_tokens_seen": 304774320, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.23999023, + "step": 14132, + "time_per_iteration": 2.79803204536438 + }, + { + "auxiliary_loss_clip": 0.01248447, + "auxiliary_loss_mlp": 0.00203569, + "balance_loss_clip": 1.03594398, + "balance_loss_mlp": 0.17934605, + "epoch": 0.8497219299564106, + "flos": 25374587846400.0, + "grad_norm": 4.604222589321168, + "language_loss": 0.77048945, + "learning_rate": 2.3217450845293564e-07, + "loss": 0.78500962, + "num_input_tokens_seen": 304795355, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.2421875, + "step": 14133, + "time_per_iteration": 2.7623770236968994 + }, + { + "auxiliary_loss_clip": 0.01152714, + "auxiliary_loss_mlp": 0.00140858, + "balance_loss_clip": 1.0050354, + "balance_loss_mlp": 0.13423008, + "epoch": 0.8497820532090786, + "flos": 67780279658880.0, + "grad_norm": 0.7232400725600282, + "language_loss": 0.5688439, + "learning_rate": 2.3199240949419918e-07, + "loss": 0.5817796, + "num_input_tokens_seen": 304863915, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.06640625, + "step": 14134, + "time_per_iteration": 3.3380932807922363 + }, + { + "auxiliary_loss_clip": 0.01275764, + "auxiliary_loss_mlp": 0.00205, + "balance_loss_clip": 1.05127311, + "balance_loss_mlp": 0.18056196, + "epoch": 0.8498421764617465, + "flos": 23440546920960.0, + "grad_norm": 1321.4126025728785, + "language_loss": 0.87890685, + "learning_rate": 2.3181037757848787e-07, + "loss": 0.89371443, + "num_input_tokens_seen": 304881555, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.24401855, + "step": 14135, + "time_per_iteration": 2.8014698028564453 + }, + { + "auxiliary_loss_clip": 0.01278478, + "auxiliary_loss_mlp": 0.00224012, + "balance_loss_clip": 1.05362344, + "balance_loss_mlp": 0.1973331, + "epoch": 0.8499022997144146, + "flos": 17712615686400.0, + "grad_norm": 11.641768875716627, + "language_loss": 0.75155783, + "learning_rate": 2.316284127127044e-07, + "loss": 0.76658273, + "num_input_tokens_seen": 304898760, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.2668457, + "step": 14136, + "time_per_iteration": 2.726651906967163 + }, + { + "auxiliary_loss_clip": 0.01272199, + "auxiliary_loss_mlp": 0.00232084, + "balance_loss_clip": 1.05070877, + "balance_loss_mlp": 0.20712194, + "epoch": 0.8499624229670825, + "flos": 18588512833920.0, + "grad_norm": 2.869216246269516, + "language_loss": 0.90049547, + "learning_rate": 2.3144651490374835e-07, + "loss": 0.91553825, + "num_input_tokens_seen": 304915465, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.24938965, + "step": 14137, + "time_per_iteration": 2.8696117401123047 + }, + { + "auxiliary_loss_clip": 0.01241545, + "auxiliary_loss_mlp": 0.00239165, + "balance_loss_clip": 1.03206205, + "balance_loss_mlp": 0.21564505, + "epoch": 0.8500225462197505, + "flos": 24345854328960.0, + "grad_norm": 17.93503697433748, + "language_loss": 0.86302823, + "learning_rate": 2.3126468415851773e-07, + "loss": 0.87783533, + "num_input_tokens_seen": 304933190, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.23535156, + "step": 14138, + "time_per_iteration": 2.8463594913482666 + }, + { + "auxiliary_loss_clip": 0.01253133, + "auxiliary_loss_mlp": 0.00222783, + "balance_loss_clip": 1.03627801, + "balance_loss_mlp": 0.19783264, + "epoch": 0.8500826694724185, + "flos": 16545518979840.0, + "grad_norm": 4.956328391128377, + "language_loss": 0.71652687, + "learning_rate": 2.310829204839073e-07, + "loss": 0.73128605, + "num_input_tokens_seen": 304951110, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.24951172, + "step": 14139, + "time_per_iteration": 2.8149728775024414 + }, + { + "auxiliary_loss_clip": 0.01259505, + "auxiliary_loss_mlp": 0.00213819, + "balance_loss_clip": 1.0448662, + "balance_loss_mlp": 0.19002488, + "epoch": 0.8501427927250864, + "flos": 16289404030080.0, + "grad_norm": 23.224531579259896, + "language_loss": 0.78402573, + "learning_rate": 2.3090122388681043e-07, + "loss": 0.79875892, + "num_input_tokens_seen": 304969095, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.23791504, + "step": 14140, + "time_per_iteration": 2.6956145763397217 + }, + { + "auxiliary_loss_clip": 0.01271501, + "auxiliary_loss_mlp": 0.00228264, + "balance_loss_clip": 1.04758298, + "balance_loss_mlp": 0.20210969, + "epoch": 0.8502029159777544, + "flos": 26687912820480.0, + "grad_norm": 422682.61986657494, + "language_loss": 0.73715216, + "learning_rate": 2.3071959437411648e-07, + "loss": 0.75214988, + "num_input_tokens_seen": 304989315, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.26171875, + "step": 14141, + "time_per_iteration": 2.8010926246643066 + }, + { + "auxiliary_loss_clip": 0.01244884, + "auxiliary_loss_mlp": 0.0020312, + "balance_loss_clip": 1.03163457, + "balance_loss_mlp": 0.18091121, + "epoch": 0.8502630392304223, + "flos": 35590778179200.0, + "grad_norm": 6.317127456306902, + "language_loss": 0.79366952, + "learning_rate": 2.3053803195271214e-07, + "loss": 0.80814958, + "num_input_tokens_seen": 305011020, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.2220459, + "step": 14142, + "time_per_iteration": 2.8930771350860596 + }, + { + "auxiliary_loss_clip": 0.01249127, + "auxiliary_loss_mlp": 0.00220752, + "balance_loss_clip": 1.03442669, + "balance_loss_mlp": 0.19645722, + "epoch": 0.8503231624830904, + "flos": 21649466125440.0, + "grad_norm": 2.9688085940762234, + "language_loss": 0.72382146, + "learning_rate": 2.3035653662948375e-07, + "loss": 0.73852026, + "num_input_tokens_seen": 305033550, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.24267578, + "step": 14143, + "time_per_iteration": 2.785146713256836 + }, + { + "auxiliary_loss_clip": 0.01258572, + "auxiliary_loss_mlp": 0.00217581, + "balance_loss_clip": 1.03711879, + "balance_loss_mlp": 0.1923086, + "epoch": 0.8503832857357583, + "flos": 22417451838720.0, + "grad_norm": 26.37499376342691, + "language_loss": 0.77764839, + "learning_rate": 2.3017510841131216e-07, + "loss": 0.7924099, + "num_input_tokens_seen": 305052885, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.25268555, + "step": 14144, + "time_per_iteration": 2.730778455734253 + }, + { + "auxiliary_loss_clip": 0.01256321, + "auxiliary_loss_mlp": 0.00216549, + "balance_loss_clip": 1.04071748, + "balance_loss_mlp": 0.19121677, + "epoch": 0.8504434089884263, + "flos": 18697968552960.0, + "grad_norm": 9.68333736707444, + "language_loss": 0.75621581, + "learning_rate": 2.299937473050777e-07, + "loss": 0.77094448, + "num_input_tokens_seen": 305071995, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.25341797, + "step": 14145, + "time_per_iteration": 2.758537769317627 + }, + { + "auxiliary_loss_clip": 0.01246672, + "auxiliary_loss_mlp": 0.00221849, + "balance_loss_clip": 1.03219318, + "balance_loss_mlp": 0.19713709, + "epoch": 0.8505035322410942, + "flos": 20007989475840.0, + "grad_norm": 3.6430930492698974, + "language_loss": 0.91224205, + "learning_rate": 2.2981245331765842e-07, + "loss": 0.92692727, + "num_input_tokens_seen": 305090190, + "router_z_loss_clip": 2.14550781, + "router_z_loss_mlp": 0.24707031, + "step": 14146, + "time_per_iteration": 2.7445316314697266 + }, + { + "auxiliary_loss_clip": 0.0124437, + "auxiliary_loss_mlp": 0.00203949, + "balance_loss_clip": 1.02754736, + "balance_loss_mlp": 0.18000022, + "epoch": 0.8505636554937622, + "flos": 20812173120000.0, + "grad_norm": 7.266980494620317, + "language_loss": 0.91117001, + "learning_rate": 2.2963122645592814e-07, + "loss": 0.92565322, + "num_input_tokens_seen": 305109355, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.23962402, + "step": 14147, + "time_per_iteration": 2.7156193256378174 + }, + { + "auxiliary_loss_clip": 0.01258753, + "auxiliary_loss_mlp": 0.00216077, + "balance_loss_clip": 1.03892171, + "balance_loss_mlp": 0.19000548, + "epoch": 0.8506237787464301, + "flos": 14174445277440.0, + "grad_norm": 137.41608493811913, + "language_loss": 0.96643162, + "learning_rate": 2.2945006672675894e-07, + "loss": 0.98117995, + "num_input_tokens_seen": 305124165, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.26037598, + "step": 14148, + "time_per_iteration": 2.7488412857055664 + }, + { + "auxiliary_loss_clip": 0.01256812, + "auxiliary_loss_mlp": 0.00220813, + "balance_loss_clip": 1.04044628, + "balance_loss_mlp": 0.19734097, + "epoch": 0.8506839019990982, + "flos": 23258372117760.0, + "grad_norm": 28.918373224598636, + "language_loss": 0.81515455, + "learning_rate": 2.292689741370204e-07, + "loss": 0.82993078, + "num_input_tokens_seen": 305143940, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.23498535, + "step": 14149, + "time_per_iteration": 2.715012788772583 + }, + { + "auxiliary_loss_clip": 0.01257824, + "auxiliary_loss_mlp": 0.00219584, + "balance_loss_clip": 1.04107273, + "balance_loss_mlp": 0.19559897, + "epoch": 0.8507440252517661, + "flos": 23659206963840.0, + "grad_norm": 2.4495845945974124, + "language_loss": 0.85488224, + "learning_rate": 2.290879486935804e-07, + "loss": 0.86965638, + "num_input_tokens_seen": 305163505, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.23999023, + "step": 14150, + "time_per_iteration": 2.79972767829895 + }, + { + "auxiliary_loss_clip": 0.01246294, + "auxiliary_loss_mlp": 0.00214921, + "balance_loss_clip": 1.03439963, + "balance_loss_mlp": 0.19206865, + "epoch": 0.8508041485044341, + "flos": 18661339658880.0, + "grad_norm": 39.54143597893323, + "language_loss": 0.80160046, + "learning_rate": 2.2890699040330231e-07, + "loss": 0.81621259, + "num_input_tokens_seen": 305182325, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.22851562, + "step": 14151, + "time_per_iteration": 2.75498104095459 + }, + { + "auxiliary_loss_clip": 0.01145488, + "auxiliary_loss_mlp": 0.00145734, + "balance_loss_clip": 1.00044394, + "balance_loss_mlp": 0.13724671, + "epoch": 0.8508642717571021, + "flos": 52510918055040.0, + "grad_norm": 0.8465537791046179, + "language_loss": 0.58914316, + "learning_rate": 2.2872609927304909e-07, + "loss": 0.60205531, + "num_input_tokens_seen": 305230775, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.08496094, + "step": 14152, + "time_per_iteration": 3.049924373626709 + }, + { + "auxiliary_loss_clip": 0.0114835, + "auxiliary_loss_mlp": 0.00157564, + "balance_loss_clip": 1.00429809, + "balance_loss_mlp": 0.14850378, + "epoch": 0.85092439500977, + "flos": 69297145050240.0, + "grad_norm": 0.6832510463338531, + "language_loss": 0.59631598, + "learning_rate": 2.285452753096797e-07, + "loss": 0.60937512, + "num_input_tokens_seen": 305296000, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.09082031, + "step": 14153, + "time_per_iteration": 3.218358278274536 + }, + { + "auxiliary_loss_clip": 0.01239757, + "auxiliary_loss_mlp": 0.00251728, + "balance_loss_clip": 1.02858853, + "balance_loss_mlp": 0.22694454, + "epoch": 0.850984518262438, + "flos": 24389737770240.0, + "grad_norm": 4.144986153690338, + "language_loss": 0.86882555, + "learning_rate": 2.2836451852005067e-07, + "loss": 0.88374037, + "num_input_tokens_seen": 305314705, + "router_z_loss_clip": 2.11035156, + "router_z_loss_mlp": 0.24768066, + "step": 14154, + "time_per_iteration": 2.809767961502075 + }, + { + "auxiliary_loss_clip": 0.01227748, + "auxiliary_loss_mlp": 0.00209223, + "balance_loss_clip": 1.02105343, + "balance_loss_mlp": 0.18576288, + "epoch": 0.851044641515106, + "flos": 23294821443840.0, + "grad_norm": 3.0287336926344937, + "language_loss": 0.85319293, + "learning_rate": 2.281838289110165e-07, + "loss": 0.86756271, + "num_input_tokens_seen": 305333870, + "router_z_loss_clip": 2.06738281, + "router_z_loss_mlp": 0.23449707, + "step": 14155, + "time_per_iteration": 4.153446197509766 + }, + { + "auxiliary_loss_clip": 0.01258539, + "auxiliary_loss_mlp": 0.00216193, + "balance_loss_clip": 1.03894281, + "balance_loss_mlp": 0.19143328, + "epoch": 0.851104764767774, + "flos": 22050085489920.0, + "grad_norm": 243.16628309335073, + "language_loss": 0.78534478, + "learning_rate": 2.2800320648942904e-07, + "loss": 0.8000921, + "num_input_tokens_seen": 305352780, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.24768066, + "step": 14156, + "time_per_iteration": 4.191095590591431 + }, + { + "auxiliary_loss_clip": 0.01236125, + "auxiliary_loss_mlp": 0.00220166, + "balance_loss_clip": 1.02624655, + "balance_loss_mlp": 0.1943213, + "epoch": 0.8511648880204419, + "flos": 20704728562560.0, + "grad_norm": 3.593363054292012, + "language_loss": 0.8346017, + "learning_rate": 2.278226512621386e-07, + "loss": 0.84916461, + "num_input_tokens_seen": 305371370, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.25830078, + "step": 14157, + "time_per_iteration": 2.715684175491333 + }, + { + "auxiliary_loss_clip": 0.01229575, + "auxiliary_loss_mlp": 0.00205617, + "balance_loss_clip": 1.01995468, + "balance_loss_mlp": 0.18123919, + "epoch": 0.8512250112731099, + "flos": 24024669891840.0, + "grad_norm": 7.890857554098454, + "language_loss": 0.88135487, + "learning_rate": 2.2764216323598995e-07, + "loss": 0.89570683, + "num_input_tokens_seen": 305387955, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.24365234, + "step": 14158, + "time_per_iteration": 2.8615832328796387 + }, + { + "auxiliary_loss_clip": 0.01255845, + "auxiliary_loss_mlp": 0.00225706, + "balance_loss_clip": 1.0383265, + "balance_loss_mlp": 0.19982539, + "epoch": 0.8512851345257778, + "flos": 22015467757440.0, + "grad_norm": 10.897707131527621, + "language_loss": 0.88103527, + "learning_rate": 2.27461742417828e-07, + "loss": 0.89585078, + "num_input_tokens_seen": 305406285, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.25854492, + "step": 14159, + "time_per_iteration": 2.7339324951171875 + }, + { + "auxiliary_loss_clip": 0.01250811, + "auxiliary_loss_mlp": 0.00210282, + "balance_loss_clip": 1.0354948, + "balance_loss_mlp": 0.18648778, + "epoch": 0.8513452577784458, + "flos": 14830209924480.0, + "grad_norm": 6.123466150954596, + "language_loss": 0.80008125, + "learning_rate": 2.2728138881449488e-07, + "loss": 0.81469214, + "num_input_tokens_seen": 305424500, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.23779297, + "step": 14160, + "time_per_iteration": 2.7723586559295654 + }, + { + "auxiliary_loss_clip": 0.01280752, + "auxiliary_loss_mlp": 0.00258613, + "balance_loss_clip": 1.05445862, + "balance_loss_mlp": 0.23213705, + "epoch": 0.8514053810311137, + "flos": 33035662166400.0, + "grad_norm": 6.725414165892664, + "language_loss": 0.79317331, + "learning_rate": 2.2710110243282866e-07, + "loss": 0.80856693, + "num_input_tokens_seen": 305442990, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.26464844, + "step": 14161, + "time_per_iteration": 4.26505184173584 + }, + { + "auxiliary_loss_clip": 0.01259131, + "auxiliary_loss_mlp": 0.00206078, + "balance_loss_clip": 1.03965068, + "balance_loss_mlp": 0.18302271, + "epoch": 0.8514655042837818, + "flos": 27564456412800.0, + "grad_norm": 86.94211923624658, + "language_loss": 0.88269043, + "learning_rate": 2.2692088327966653e-07, + "loss": 0.89734256, + "num_input_tokens_seen": 305463065, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.23059082, + "step": 14162, + "time_per_iteration": 2.7833611965179443 + }, + { + "auxiliary_loss_clip": 0.0125095, + "auxiliary_loss_mlp": 0.00234886, + "balance_loss_clip": 1.03492117, + "balance_loss_mlp": 0.21130598, + "epoch": 0.8515256275364497, + "flos": 35556052705920.0, + "grad_norm": 2.575726209116466, + "language_loss": 0.82904845, + "learning_rate": 2.2674073136184235e-07, + "loss": 0.84390676, + "num_input_tokens_seen": 305489070, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.23596191, + "step": 14163, + "time_per_iteration": 2.9315714836120605 + }, + { + "auxiliary_loss_clip": 0.01129777, + "auxiliary_loss_mlp": 0.00154082, + "balance_loss_clip": 0.98360658, + "balance_loss_mlp": 0.14592844, + "epoch": 0.8515857507891177, + "flos": 70207372621440.0, + "grad_norm": 0.7091481400333814, + "language_loss": 0.54074597, + "learning_rate": 2.2656064668618735e-07, + "loss": 0.55358452, + "num_input_tokens_seen": 305551490, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.08154297, + "step": 14164, + "time_per_iteration": 3.2960622310638428 + }, + { + "auxiliary_loss_clip": 0.01255076, + "auxiliary_loss_mlp": 0.0020716, + "balance_loss_clip": 1.03193331, + "balance_loss_mlp": 0.18112427, + "epoch": 0.8516458740417857, + "flos": 22675290641280.0, + "grad_norm": 4.18781060008492, + "language_loss": 0.82237101, + "learning_rate": 2.2638062925953005e-07, + "loss": 0.83699334, + "num_input_tokens_seen": 305570535, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.26025391, + "step": 14165, + "time_per_iteration": 4.171326637268066 + }, + { + "auxiliary_loss_clip": 0.01233282, + "auxiliary_loss_mlp": 0.0020967, + "balance_loss_clip": 1.02059472, + "balance_loss_mlp": 0.1856972, + "epoch": 0.8517059972944536, + "flos": 22747435107840.0, + "grad_norm": 13.844316007808242, + "language_loss": 0.75686961, + "learning_rate": 2.26200679088697e-07, + "loss": 0.77129912, + "num_input_tokens_seen": 305590800, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.23974609, + "step": 14166, + "time_per_iteration": 2.7469704151153564 + }, + { + "auxiliary_loss_clip": 0.01241911, + "auxiliary_loss_mlp": 0.00217067, + "balance_loss_clip": 1.02496386, + "balance_loss_mlp": 0.19272463, + "epoch": 0.8517661205471216, + "flos": 21689147675520.0, + "grad_norm": 34.60444735854965, + "language_loss": 0.81544453, + "learning_rate": 2.260207961805125e-07, + "loss": 0.83003432, + "num_input_tokens_seen": 305609495, + "router_z_loss_clip": 2.16894531, + "router_z_loss_mlp": 0.24316406, + "step": 14167, + "time_per_iteration": 2.7881689071655273 + }, + { + "auxiliary_loss_clip": 0.01252032, + "auxiliary_loss_mlp": 0.00225565, + "balance_loss_clip": 1.03736687, + "balance_loss_mlp": 0.20257001, + "epoch": 0.8518262437997896, + "flos": 25374839241600.0, + "grad_norm": 83.19535898597242, + "language_loss": 0.88236332, + "learning_rate": 2.258409805417969e-07, + "loss": 0.89713925, + "num_input_tokens_seen": 305629420, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.23010254, + "step": 14168, + "time_per_iteration": 2.7600932121276855 + }, + { + "auxiliary_loss_clip": 0.01236427, + "auxiliary_loss_mlp": 0.00201441, + "balance_loss_clip": 1.0263294, + "balance_loss_mlp": 0.17817168, + "epoch": 0.8518863670524576, + "flos": 27235406897280.0, + "grad_norm": 1823.1925775208308, + "language_loss": 0.87161303, + "learning_rate": 2.2566123217936893e-07, + "loss": 0.88599169, + "num_input_tokens_seen": 305649835, + "router_z_loss_clip": 2.10058594, + "router_z_loss_mlp": 0.23278809, + "step": 14169, + "time_per_iteration": 2.786895275115967 + }, + { + "auxiliary_loss_clip": 0.01244102, + "auxiliary_loss_mlp": 0.00224189, + "balance_loss_clip": 1.03056037, + "balance_loss_mlp": 0.19978639, + "epoch": 0.8519464903051255, + "flos": 20959514709120.0, + "grad_norm": 1051.1813453464063, + "language_loss": 0.75335133, + "learning_rate": 2.254815511000452e-07, + "loss": 0.76803422, + "num_input_tokens_seen": 305668840, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.24438477, + "step": 14170, + "time_per_iteration": 2.6893014907836914 + }, + { + "auxiliary_loss_clip": 0.01241256, + "auxiliary_loss_mlp": 0.00221648, + "balance_loss_clip": 1.02853346, + "balance_loss_mlp": 0.19858147, + "epoch": 0.8520066135577935, + "flos": 18441745862400.0, + "grad_norm": 29.92199104197921, + "language_loss": 0.97138882, + "learning_rate": 2.253019373106384e-07, + "loss": 0.98601782, + "num_input_tokens_seen": 305686955, + "router_z_loss_clip": 2.12988281, + "router_z_loss_mlp": 0.23059082, + "step": 14171, + "time_per_iteration": 2.7783195972442627 + }, + { + "auxiliary_loss_clip": 0.0126444, + "auxiliary_loss_mlp": 0.00224358, + "balance_loss_clip": 1.04521179, + "balance_loss_mlp": 0.19885918, + "epoch": 0.8520667368104614, + "flos": 29130233149440.0, + "grad_norm": 209.04452275394402, + "language_loss": 0.63514304, + "learning_rate": 2.2512239081796003e-07, + "loss": 0.65003109, + "num_input_tokens_seen": 305706290, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.25537109, + "step": 14172, + "time_per_iteration": 2.8143856525421143 + }, + { + "auxiliary_loss_clip": 0.01237391, + "auxiliary_loss_mlp": 0.00236081, + "balance_loss_clip": 1.02709079, + "balance_loss_mlp": 0.21253714, + "epoch": 0.8521268600631294, + "flos": 16034366488320.0, + "grad_norm": 12.651143309092951, + "language_loss": 0.78006637, + "learning_rate": 2.2494291162881862e-07, + "loss": 0.79480106, + "num_input_tokens_seen": 305723835, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.23547363, + "step": 14173, + "time_per_iteration": 2.6988277435302734 + }, + { + "auxiliary_loss_clip": 0.01254228, + "auxiliary_loss_mlp": 0.00224707, + "balance_loss_clip": 1.03449416, + "balance_loss_mlp": 0.19868377, + "epoch": 0.8521869833157973, + "flos": 22454870832000.0, + "grad_norm": 11.174476138820927, + "language_loss": 0.86767799, + "learning_rate": 2.247634997500205e-07, + "loss": 0.88246727, + "num_input_tokens_seen": 305741655, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.26025391, + "step": 14174, + "time_per_iteration": 2.7365152835845947 + }, + { + "auxiliary_loss_clip": 0.01273239, + "auxiliary_loss_mlp": 0.00213262, + "balance_loss_clip": 1.0491271, + "balance_loss_mlp": 0.1892415, + "epoch": 0.8522471065684654, + "flos": 24972029147520.0, + "grad_norm": 6.958771907138971, + "language_loss": 0.90573955, + "learning_rate": 2.245841551883676e-07, + "loss": 0.92060453, + "num_input_tokens_seen": 305761890, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.24023438, + "step": 14175, + "time_per_iteration": 2.8263590335845947 + }, + { + "auxiliary_loss_clip": 0.01267034, + "auxiliary_loss_mlp": 0.00220182, + "balance_loss_clip": 1.04449284, + "balance_loss_mlp": 0.19539826, + "epoch": 0.8523072298211333, + "flos": 17710604524800.0, + "grad_norm": 453.03026976741785, + "language_loss": 0.76684785, + "learning_rate": 2.2440487795066153e-07, + "loss": 0.78171992, + "num_input_tokens_seen": 305779190, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.24780273, + "step": 14176, + "time_per_iteration": 2.7904422283172607 + }, + { + "auxiliary_loss_clip": 0.01255723, + "auxiliary_loss_mlp": 0.00205423, + "balance_loss_clip": 1.04191041, + "balance_loss_mlp": 0.18147346, + "epoch": 0.8523673530738013, + "flos": 25446193608960.0, + "grad_norm": 5.515385238069052, + "language_loss": 0.85774612, + "learning_rate": 2.2422566804370068e-07, + "loss": 0.87235761, + "num_input_tokens_seen": 305799870, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.23962402, + "step": 14177, + "time_per_iteration": 2.7860147953033447 + }, + { + "auxiliary_loss_clip": 0.01240433, + "auxiliary_loss_mlp": 0.00196464, + "balance_loss_clip": 1.02781177, + "balance_loss_mlp": 0.17345662, + "epoch": 0.8524274763264693, + "flos": 31429593348480.0, + "grad_norm": 2.0381909218283703, + "language_loss": 0.82174653, + "learning_rate": 2.2404652547428026e-07, + "loss": 0.83611548, + "num_input_tokens_seen": 305819695, + "router_z_loss_clip": 2.12207031, + "router_z_loss_mlp": 0.23010254, + "step": 14178, + "time_per_iteration": 2.793318748474121 + }, + { + "auxiliary_loss_clip": 0.01251478, + "auxiliary_loss_mlp": 0.00211358, + "balance_loss_clip": 1.03298807, + "balance_loss_mlp": 0.18705107, + "epoch": 0.8524875995791372, + "flos": 17712651600000.0, + "grad_norm": 13.658459694090839, + "language_loss": 0.83910435, + "learning_rate": 2.238674502491935e-07, + "loss": 0.85373271, + "num_input_tokens_seen": 305837270, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.24353027, + "step": 14179, + "time_per_iteration": 2.7665469646453857 + }, + { + "auxiliary_loss_clip": 0.01230958, + "auxiliary_loss_mlp": 0.00195981, + "balance_loss_clip": 1.0220437, + "balance_loss_mlp": 0.17358187, + "epoch": 0.8525477228318052, + "flos": 21687316081920.0, + "grad_norm": 47.22761672787492, + "language_loss": 0.91867322, + "learning_rate": 2.2368844237523165e-07, + "loss": 0.93294257, + "num_input_tokens_seen": 305855250, + "router_z_loss_clip": 2.08691406, + "router_z_loss_mlp": 0.22412109, + "step": 14180, + "time_per_iteration": 2.8783514499664307 + }, + { + "auxiliary_loss_clip": 0.01241492, + "auxiliary_loss_mlp": 0.00230801, + "balance_loss_clip": 1.02845502, + "balance_loss_mlp": 0.20602965, + "epoch": 0.8526078460844732, + "flos": 24827057856000.0, + "grad_norm": 904.0512974710341, + "language_loss": 0.72388911, + "learning_rate": 2.235095018591815e-07, + "loss": 0.738612, + "num_input_tokens_seen": 305875660, + "router_z_loss_clip": 2.13183594, + "router_z_loss_mlp": 0.24755859, + "step": 14181, + "time_per_iteration": 2.7740259170532227 + }, + { + "auxiliary_loss_clip": 0.01223399, + "auxiliary_loss_mlp": 0.00207869, + "balance_loss_clip": 1.01396203, + "balance_loss_mlp": 0.18593405, + "epoch": 0.8526679693371412, + "flos": 13516418073600.0, + "grad_norm": 134.33695646772333, + "language_loss": 0.79605019, + "learning_rate": 2.2333062870782894e-07, + "loss": 0.81036282, + "num_input_tokens_seen": 305892415, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.21936035, + "step": 14182, + "time_per_iteration": 2.826763153076172 + }, + { + "auxiliary_loss_clip": 0.0123744, + "auxiliary_loss_mlp": 0.00215363, + "balance_loss_clip": 1.02801037, + "balance_loss_mlp": 0.19191477, + "epoch": 0.8527280925898091, + "flos": 23514092017920.0, + "grad_norm": 2.267210944939354, + "language_loss": 0.77431792, + "learning_rate": 2.2315182292795697e-07, + "loss": 0.78884596, + "num_input_tokens_seen": 305912665, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.23474121, + "step": 14183, + "time_per_iteration": 2.770207166671753 + }, + { + "auxiliary_loss_clip": 0.01240955, + "auxiliary_loss_mlp": 0.00190677, + "balance_loss_clip": 1.02794707, + "balance_loss_mlp": 0.1674436, + "epoch": 0.8527882158424771, + "flos": 20303031790080.0, + "grad_norm": 5.980100954137653, + "language_loss": 0.83422756, + "learning_rate": 2.2297308452634644e-07, + "loss": 0.84854388, + "num_input_tokens_seen": 305931515, + "router_z_loss_clip": 2.12988281, + "router_z_loss_mlp": 0.2322998, + "step": 14184, + "time_per_iteration": 2.731628656387329 + }, + { + "auxiliary_loss_clip": 0.01238322, + "auxiliary_loss_mlp": 0.00206753, + "balance_loss_clip": 1.02404821, + "balance_loss_mlp": 0.18372178, + "epoch": 0.852848339095145, + "flos": 17202504689280.0, + "grad_norm": 150.94206999242556, + "language_loss": 0.83370531, + "learning_rate": 2.2279441350977457e-07, + "loss": 0.84815598, + "num_input_tokens_seen": 305949965, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.23022461, + "step": 14185, + "time_per_iteration": 2.7558999061584473 + }, + { + "auxiliary_loss_clip": 0.01254934, + "auxiliary_loss_mlp": 0.00202255, + "balance_loss_clip": 1.03302288, + "balance_loss_mlp": 0.17681536, + "epoch": 0.852908462347813, + "flos": 18368990864640.0, + "grad_norm": 4.761952652462049, + "language_loss": 0.8812983, + "learning_rate": 2.2261580988501637e-07, + "loss": 0.89587021, + "num_input_tokens_seen": 305967820, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.25476074, + "step": 14186, + "time_per_iteration": 2.6657252311706543 + }, + { + "auxiliary_loss_clip": 0.01256535, + "auxiliary_loss_mlp": 0.00206505, + "balance_loss_clip": 1.03513074, + "balance_loss_mlp": 0.18232939, + "epoch": 0.8529685856004809, + "flos": 18624890332800.0, + "grad_norm": 36.93822021850735, + "language_loss": 0.70882916, + "learning_rate": 2.224372736588449e-07, + "loss": 0.72345954, + "num_input_tokens_seen": 305985505, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.24169922, + "step": 14187, + "time_per_iteration": 2.7156872749328613 + }, + { + "auxiliary_loss_clip": 0.01259433, + "auxiliary_loss_mlp": 0.00224786, + "balance_loss_clip": 1.03800917, + "balance_loss_mlp": 0.19904888, + "epoch": 0.853028708853149, + "flos": 29607665748480.0, + "grad_norm": 3.41686128624052, + "language_loss": 0.83694756, + "learning_rate": 2.2225880483803005e-07, + "loss": 0.85178977, + "num_input_tokens_seen": 306005220, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.25744629, + "step": 14188, + "time_per_iteration": 2.7682626247406006 + }, + { + "auxiliary_loss_clip": 0.01247338, + "auxiliary_loss_mlp": 0.00241969, + "balance_loss_clip": 1.03028679, + "balance_loss_mlp": 0.21585047, + "epoch": 0.8530888321058169, + "flos": 26353153042560.0, + "grad_norm": 9.325566898454634, + "language_loss": 0.85525542, + "learning_rate": 2.2208040342933932e-07, + "loss": 0.87014854, + "num_input_tokens_seen": 306023785, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.26135254, + "step": 14189, + "time_per_iteration": 2.786848783493042 + }, + { + "auxiliary_loss_clip": 0.01235753, + "auxiliary_loss_mlp": 0.00191638, + "balance_loss_clip": 1.02354801, + "balance_loss_mlp": 0.16760573, + "epoch": 0.8531489553584849, + "flos": 20521979141760.0, + "grad_norm": 16.54709832786347, + "language_loss": 0.8872025, + "learning_rate": 2.2190206943953793e-07, + "loss": 0.90147638, + "num_input_tokens_seen": 306041600, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.24023438, + "step": 14190, + "time_per_iteration": 2.7175333499908447 + }, + { + "auxiliary_loss_clip": 0.01246729, + "auxiliary_loss_mlp": 0.00217318, + "balance_loss_clip": 1.0326997, + "balance_loss_mlp": 0.19272476, + "epoch": 0.8532090786111529, + "flos": 20704297599360.0, + "grad_norm": 14.624858553508252, + "language_loss": 0.8688097, + "learning_rate": 2.2172380287538894e-07, + "loss": 0.88345015, + "num_input_tokens_seen": 306060345, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.24584961, + "step": 14191, + "time_per_iteration": 2.699312925338745 + }, + { + "auxiliary_loss_clip": 0.01236335, + "auxiliary_loss_mlp": 0.00192488, + "balance_loss_clip": 1.02334034, + "balance_loss_mlp": 0.1693258, + "epoch": 0.8532692018638208, + "flos": 19828903242240.0, + "grad_norm": 92.29535001274854, + "language_loss": 0.78834581, + "learning_rate": 2.2154560374365073e-07, + "loss": 0.80263406, + "num_input_tokens_seen": 306078285, + "router_z_loss_clip": 2.12792969, + "router_z_loss_mlp": 0.23168945, + "step": 14192, + "time_per_iteration": 2.664381504058838 + }, + { + "auxiliary_loss_clip": 0.012741, + "auxiliary_loss_mlp": 0.00236348, + "balance_loss_clip": 1.04508138, + "balance_loss_mlp": 0.20751174, + "epoch": 0.8533293251164888, + "flos": 20996790048000.0, + "grad_norm": 45.9291340734518, + "language_loss": 0.73302686, + "learning_rate": 2.2136747205108164e-07, + "loss": 0.74813133, + "num_input_tokens_seen": 306093760, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.28833008, + "step": 14193, + "time_per_iteration": 2.713536500930786 + }, + { + "auxiliary_loss_clip": 0.01240803, + "auxiliary_loss_mlp": 0.00207579, + "balance_loss_clip": 1.02741015, + "balance_loss_mlp": 0.18512008, + "epoch": 0.8533894483691568, + "flos": 22419606654720.0, + "grad_norm": 48.19269509271729, + "language_loss": 0.85302424, + "learning_rate": 2.211894078044365e-07, + "loss": 0.86750805, + "num_input_tokens_seen": 306112595, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.2244873, + "step": 14194, + "time_per_iteration": 2.7124104499816895 + }, + { + "auxiliary_loss_clip": 0.01244693, + "auxiliary_loss_mlp": 0.00215583, + "balance_loss_clip": 1.03163457, + "balance_loss_mlp": 0.19186017, + "epoch": 0.8534495716218248, + "flos": 21616536332160.0, + "grad_norm": 7.6247856326891785, + "language_loss": 0.794029, + "learning_rate": 2.2101141101046705e-07, + "loss": 0.80863172, + "num_input_tokens_seen": 306131800, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.23730469, + "step": 14195, + "time_per_iteration": 2.7437386512756348 + }, + { + "auxiliary_loss_clip": 0.01244265, + "auxiliary_loss_mlp": 0.00216609, + "balance_loss_clip": 1.02499604, + "balance_loss_mlp": 0.19182587, + "epoch": 0.8535096948744927, + "flos": 22346277039360.0, + "grad_norm": 11.18822885220872, + "language_loss": 0.95375526, + "learning_rate": 2.2083348167592343e-07, + "loss": 0.96836388, + "num_input_tokens_seen": 306150590, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.24780273, + "step": 14196, + "time_per_iteration": 2.7912211418151855 + }, + { + "auxiliary_loss_clip": 0.01133135, + "auxiliary_loss_mlp": 0.00129823, + "balance_loss_clip": 0.98284769, + "balance_loss_mlp": 0.12262288, + "epoch": 0.8535698181271607, + "flos": 52762507891200.0, + "grad_norm": 0.7383048099582208, + "language_loss": 0.54241753, + "learning_rate": 2.2065561980755243e-07, + "loss": 0.55504704, + "num_input_tokens_seen": 306205850, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.07177734, + "step": 14197, + "time_per_iteration": 4.51965069770813 + }, + { + "auxiliary_loss_clip": 0.01239716, + "auxiliary_loss_mlp": 0.00208712, + "balance_loss_clip": 1.02559638, + "balance_loss_mlp": 0.18310626, + "epoch": 0.8536299413798286, + "flos": 19062892776960.0, + "grad_norm": 426.1396147951634, + "language_loss": 0.87044412, + "learning_rate": 2.2047782541209826e-07, + "loss": 0.88492841, + "num_input_tokens_seen": 306225220, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.25622559, + "step": 14198, + "time_per_iteration": 4.123647212982178 + }, + { + "auxiliary_loss_clip": 0.012326, + "auxiliary_loss_mlp": 0.00214106, + "balance_loss_clip": 1.02599359, + "balance_loss_mlp": 0.19242185, + "epoch": 0.8536900646324966, + "flos": 49344743871360.0, + "grad_norm": 93.84193705560455, + "language_loss": 0.75962377, + "learning_rate": 2.203000984963035e-07, + "loss": 0.77409077, + "num_input_tokens_seen": 306249865, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.2166748, + "step": 14199, + "time_per_iteration": 2.9536290168762207 + }, + { + "auxiliary_loss_clip": 0.01219949, + "auxiliary_loss_mlp": 0.0019128, + "balance_loss_clip": 1.01662898, + "balance_loss_mlp": 0.17004904, + "epoch": 0.8537501878851645, + "flos": 21762333636480.0, + "grad_norm": 3.5701672379447653, + "language_loss": 0.92954105, + "learning_rate": 2.201224390669072e-07, + "loss": 0.94365335, + "num_input_tokens_seen": 306270215, + "router_z_loss_clip": 2.03027344, + "router_z_loss_mlp": 0.21228027, + "step": 14200, + "time_per_iteration": 2.7563672065734863 + }, + { + "auxiliary_loss_clip": 0.01240328, + "auxiliary_loss_mlp": 0.00218898, + "balance_loss_clip": 1.02623713, + "balance_loss_mlp": 0.1941622, + "epoch": 0.8538103111378326, + "flos": 22269176496000.0, + "grad_norm": 4.714167957645843, + "language_loss": 0.85537696, + "learning_rate": 2.1994484713064666e-07, + "loss": 0.86996919, + "num_input_tokens_seen": 306288960, + "router_z_loss_clip": 2.14355469, + "router_z_loss_mlp": 0.24731445, + "step": 14201, + "time_per_iteration": 2.8623204231262207 + }, + { + "auxiliary_loss_clip": 0.01234749, + "auxiliary_loss_mlp": 0.00212696, + "balance_loss_clip": 1.02659726, + "balance_loss_mlp": 0.19068956, + "epoch": 0.8538704343905005, + "flos": 20303929630080.0, + "grad_norm": 13.516607827578337, + "language_loss": 0.75938499, + "learning_rate": 2.19767322694256e-07, + "loss": 0.7738595, + "num_input_tokens_seen": 306308735, + "router_z_loss_clip": 2.08203125, + "router_z_loss_mlp": 0.2199707, + "step": 14202, + "time_per_iteration": 2.6796255111694336 + }, + { + "auxiliary_loss_clip": 0.01254674, + "auxiliary_loss_mlp": 0.00186243, + "balance_loss_clip": 1.03566587, + "balance_loss_mlp": 0.16373622, + "epoch": 0.8539305576431685, + "flos": 24755164784640.0, + "grad_norm": 5.023878378164333, + "language_loss": 0.89638662, + "learning_rate": 2.195898657644666e-07, + "loss": 0.91079581, + "num_input_tokens_seen": 306329015, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.22509766, + "step": 14203, + "time_per_iteration": 4.127129316329956 + }, + { + "auxiliary_loss_clip": 0.0125949, + "auxiliary_loss_mlp": 0.00232702, + "balance_loss_clip": 1.03650784, + "balance_loss_mlp": 0.20630893, + "epoch": 0.8539906808958365, + "flos": 26687625511680.0, + "grad_norm": 65.8060890687957, + "language_loss": 0.7682395, + "learning_rate": 2.1941247634800808e-07, + "loss": 0.78316146, + "num_input_tokens_seen": 306349085, + "router_z_loss_clip": 2.22949219, + "router_z_loss_mlp": 0.26379395, + "step": 14204, + "time_per_iteration": 2.824389696121216 + }, + { + "auxiliary_loss_clip": 0.01249102, + "auxiliary_loss_mlp": 0.00229699, + "balance_loss_clip": 1.03178847, + "balance_loss_mlp": 0.20598796, + "epoch": 0.8540508041485044, + "flos": 13365521038080.0, + "grad_norm": 6.436336562745407, + "language_loss": 0.72891641, + "learning_rate": 2.1923515445160667e-07, + "loss": 0.74370438, + "num_input_tokens_seen": 306365385, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.23718262, + "step": 14205, + "time_per_iteration": 2.6321768760681152 + }, + { + "auxiliary_loss_clip": 0.01251596, + "auxiliary_loss_mlp": 0.00217161, + "balance_loss_clip": 1.03855133, + "balance_loss_mlp": 0.1929739, + "epoch": 0.8541109274011724, + "flos": 32780876019840.0, + "grad_norm": 4.21767213452589, + "language_loss": 0.80382228, + "learning_rate": 2.1905790008198655e-07, + "loss": 0.81850988, + "num_input_tokens_seen": 306384585, + "router_z_loss_clip": 2.13378906, + "router_z_loss_mlp": 0.24182129, + "step": 14206, + "time_per_iteration": 2.7675914764404297 + }, + { + "auxiliary_loss_clip": 0.01260044, + "auxiliary_loss_mlp": 0.00224687, + "balance_loss_clip": 1.03719652, + "balance_loss_mlp": 0.19896166, + "epoch": 0.8541710506538404, + "flos": 17639286071040.0, + "grad_norm": 54.9764765995017, + "language_loss": 0.88888097, + "learning_rate": 2.1888071324586987e-07, + "loss": 0.90372825, + "num_input_tokens_seen": 306401565, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.25756836, + "step": 14207, + "time_per_iteration": 4.073173761367798 + }, + { + "auxiliary_loss_clip": 0.01252617, + "auxiliary_loss_mlp": 0.00229623, + "balance_loss_clip": 1.03595424, + "balance_loss_mlp": 0.20303899, + "epoch": 0.8542311739065084, + "flos": 20263062931200.0, + "grad_norm": 12.718075233346875, + "language_loss": 0.91574287, + "learning_rate": 2.1870359394997485e-07, + "loss": 0.93056524, + "num_input_tokens_seen": 306419995, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.26586914, + "step": 14208, + "time_per_iteration": 2.648850679397583 + }, + { + "auxiliary_loss_clip": 0.01230692, + "auxiliary_loss_mlp": 0.00211794, + "balance_loss_clip": 1.02388048, + "balance_loss_mlp": 0.18851198, + "epoch": 0.8542912971591763, + "flos": 17785657992960.0, + "grad_norm": 14.432771171216665, + "language_loss": 0.77405137, + "learning_rate": 2.1852654220101785e-07, + "loss": 0.78847623, + "num_input_tokens_seen": 306439240, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.23303223, + "step": 14209, + "time_per_iteration": 2.6812827587127686 + }, + { + "auxiliary_loss_clip": 0.01232195, + "auxiliary_loss_mlp": 0.0019257, + "balance_loss_clip": 1.02178574, + "balance_loss_mlp": 0.1709339, + "epoch": 0.8543514204118443, + "flos": 26979507429120.0, + "grad_norm": 3.5673983151773903, + "language_loss": 0.76126301, + "learning_rate": 2.1834955800571287e-07, + "loss": 0.77551067, + "num_input_tokens_seen": 306458425, + "router_z_loss_clip": 2.10351562, + "router_z_loss_mlp": 0.21643066, + "step": 14210, + "time_per_iteration": 2.706014633178711 + }, + { + "auxiliary_loss_clip": 0.01253471, + "auxiliary_loss_mlp": 0.00225906, + "balance_loss_clip": 1.03278923, + "balance_loss_mlp": 0.19968036, + "epoch": 0.8544115436645122, + "flos": 24024598064640.0, + "grad_norm": 29.596345502021187, + "language_loss": 0.76811945, + "learning_rate": 2.1817264137077141e-07, + "loss": 0.78291321, + "num_input_tokens_seen": 306477210, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.26220703, + "step": 14211, + "time_per_iteration": 2.7154407501220703 + }, + { + "auxiliary_loss_clip": 0.01252576, + "auxiliary_loss_mlp": 0.00214072, + "balance_loss_clip": 1.03380501, + "balance_loss_mlp": 0.19087401, + "epoch": 0.8544716669171802, + "flos": 16617986668800.0, + "grad_norm": 3.813895657578689, + "language_loss": 0.92968673, + "learning_rate": 2.1799579230290166e-07, + "loss": 0.94435322, + "num_input_tokens_seen": 306495820, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.23193359, + "step": 14212, + "time_per_iteration": 2.65909743309021 + }, + { + "auxiliary_loss_clip": 0.01246094, + "auxiliary_loss_mlp": 0.00209979, + "balance_loss_clip": 1.02941751, + "balance_loss_mlp": 0.18533805, + "epoch": 0.8545317901698481, + "flos": 40005779489280.0, + "grad_norm": 25.032864588699223, + "language_loss": 0.77062875, + "learning_rate": 2.178190108088105e-07, + "loss": 0.78518945, + "num_input_tokens_seen": 306516420, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.24621582, + "step": 14213, + "time_per_iteration": 2.8303985595703125 + }, + { + "auxiliary_loss_clip": 0.01232528, + "auxiliary_loss_mlp": 0.00202601, + "balance_loss_clip": 1.02179742, + "balance_loss_mlp": 0.17831811, + "epoch": 0.8545919134225162, + "flos": 19902520166400.0, + "grad_norm": 150.61969479948013, + "language_loss": 0.8651402, + "learning_rate": 2.1764229689520098e-07, + "loss": 0.87949145, + "num_input_tokens_seen": 306534785, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.24316406, + "step": 14214, + "time_per_iteration": 2.667465925216675 + }, + { + "auxiliary_loss_clip": 0.01277145, + "auxiliary_loss_mlp": 0.00206996, + "balance_loss_clip": 1.04636216, + "balance_loss_mlp": 0.18028107, + "epoch": 0.8546520366751841, + "flos": 18952970181120.0, + "grad_norm": 11.804320066745388, + "language_loss": 0.79449391, + "learning_rate": 2.1746565056877397e-07, + "loss": 0.80933535, + "num_input_tokens_seen": 306552440, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.26733398, + "step": 14215, + "time_per_iteration": 2.644312858581543 + }, + { + "auxiliary_loss_clip": 0.01226209, + "auxiliary_loss_mlp": 0.00206477, + "balance_loss_clip": 1.01645863, + "balance_loss_mlp": 0.1832906, + "epoch": 0.8547121599278521, + "flos": 35621445415680.0, + "grad_norm": 118.47439639825717, + "language_loss": 0.70666015, + "learning_rate": 2.172890718362279e-07, + "loss": 0.72098702, + "num_input_tokens_seen": 306573600, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.23205566, + "step": 14216, + "time_per_iteration": 2.809391736984253 + }, + { + "auxiliary_loss_clip": 0.0125998, + "auxiliary_loss_mlp": 0.00225415, + "balance_loss_clip": 1.03471088, + "balance_loss_mlp": 0.19972566, + "epoch": 0.8547722831805201, + "flos": 16910048154240.0, + "grad_norm": 21.00571313300758, + "language_loss": 0.75093275, + "learning_rate": 2.17112560704259e-07, + "loss": 0.76578665, + "num_input_tokens_seen": 306592840, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.25720215, + "step": 14217, + "time_per_iteration": 2.7738723754882812 + }, + { + "auxiliary_loss_clip": 0.0124733, + "auxiliary_loss_mlp": 0.00208726, + "balance_loss_clip": 1.0378468, + "balance_loss_mlp": 0.18551588, + "epoch": 0.854832406433188, + "flos": 23002616304000.0, + "grad_norm": 22.947881151272544, + "language_loss": 0.72966808, + "learning_rate": 2.1693611717956072e-07, + "loss": 0.7442286, + "num_input_tokens_seen": 306613210, + "router_z_loss_clip": 2.09082031, + "router_z_loss_mlp": 0.23217773, + "step": 14218, + "time_per_iteration": 2.8099868297576904 + }, + { + "auxiliary_loss_clip": 0.01248666, + "auxiliary_loss_mlp": 0.00219945, + "balance_loss_clip": 1.02973199, + "balance_loss_mlp": 0.19476837, + "epoch": 0.854892529685856, + "flos": 20412595249920.0, + "grad_norm": 4.40077728852622, + "language_loss": 0.7817682, + "learning_rate": 2.167597412688238e-07, + "loss": 0.79645431, + "num_input_tokens_seen": 306631620, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.25170898, + "step": 14219, + "time_per_iteration": 2.696798086166382 + }, + { + "auxiliary_loss_clip": 0.01265443, + "auxiliary_loss_mlp": 0.00218855, + "balance_loss_clip": 1.04214346, + "balance_loss_mlp": 0.19253349, + "epoch": 0.854952652938524, + "flos": 16398716094720.0, + "grad_norm": 13.047788904656594, + "language_loss": 0.81010783, + "learning_rate": 2.1658343297873549e-07, + "loss": 0.82495081, + "num_input_tokens_seen": 306646695, + "router_z_loss_clip": 2.23339844, + "router_z_loss_mlp": 0.26318359, + "step": 14220, + "time_per_iteration": 2.6991024017333984 + }, + { + "auxiliary_loss_clip": 0.01239899, + "auxiliary_loss_mlp": 0.00214999, + "balance_loss_clip": 1.02666926, + "balance_loss_mlp": 0.19047767, + "epoch": 0.855012776191192, + "flos": 21178677542400.0, + "grad_norm": 139.84482742079985, + "language_loss": 0.78693753, + "learning_rate": 2.164071923159827e-07, + "loss": 0.80148649, + "num_input_tokens_seen": 306665465, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.24536133, + "step": 14221, + "time_per_iteration": 2.7621543407440186 + }, + { + "auxiliary_loss_clip": 0.01272582, + "auxiliary_loss_mlp": 0.00210878, + "balance_loss_clip": 1.04956591, + "balance_loss_mlp": 0.18529549, + "epoch": 0.8550728994438599, + "flos": 26140993361280.0, + "grad_norm": 3.9214346050682143, + "language_loss": 0.70486623, + "learning_rate": 2.1623101928724763e-07, + "loss": 0.71970081, + "num_input_tokens_seen": 306685950, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.25585938, + "step": 14222, + "time_per_iteration": 2.784186363220215 + }, + { + "auxiliary_loss_clip": 0.01223894, + "auxiliary_loss_mlp": 0.00198918, + "balance_loss_clip": 1.0126735, + "balance_loss_mlp": 0.1750527, + "epoch": 0.8551330226965279, + "flos": 22786793435520.0, + "grad_norm": 31.995705457320025, + "language_loss": 0.88908088, + "learning_rate": 2.1605491389921093e-07, + "loss": 0.90330899, + "num_input_tokens_seen": 306705740, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.23876953, + "step": 14223, + "time_per_iteration": 2.75124192237854 + }, + { + "auxiliary_loss_clip": 0.0125259, + "auxiliary_loss_mlp": 0.00213652, + "balance_loss_clip": 1.03844619, + "balance_loss_mlp": 0.1898818, + "epoch": 0.8551931459491958, + "flos": 22419032037120.0, + "grad_norm": 48.36460215251908, + "language_loss": 0.82145512, + "learning_rate": 2.158788761585515e-07, + "loss": 0.83611757, + "num_input_tokens_seen": 306725065, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.2376709, + "step": 14224, + "time_per_iteration": 2.7249348163604736 + }, + { + "auxiliary_loss_clip": 0.01241366, + "auxiliary_loss_mlp": 0.0020376, + "balance_loss_clip": 1.02866447, + "balance_loss_mlp": 0.17983516, + "epoch": 0.8552532692018638, + "flos": 19573183342080.0, + "grad_norm": 45.07361865871097, + "language_loss": 0.84686255, + "learning_rate": 2.1570290607194307e-07, + "loss": 0.86131382, + "num_input_tokens_seen": 306743630, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.23950195, + "step": 14225, + "time_per_iteration": 2.725727081298828 + }, + { + "auxiliary_loss_clip": 0.01244314, + "auxiliary_loss_mlp": 0.00226987, + "balance_loss_clip": 1.03027296, + "balance_loss_mlp": 0.20343086, + "epoch": 0.8553133924545318, + "flos": 26432767537920.0, + "grad_norm": 16.08362745847981, + "language_loss": 0.85277921, + "learning_rate": 2.1552700364605925e-07, + "loss": 0.8674922, + "num_input_tokens_seen": 306763105, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.23535156, + "step": 14226, + "time_per_iteration": 2.9028964042663574 + }, + { + "auxiliary_loss_clip": 0.01262046, + "auxiliary_loss_mlp": 0.00217729, + "balance_loss_clip": 1.0395658, + "balance_loss_mlp": 0.19335033, + "epoch": 0.8553735157071998, + "flos": 16362446336640.0, + "grad_norm": 24.65284595382217, + "language_loss": 0.63698936, + "learning_rate": 2.153511688875702e-07, + "loss": 0.65178716, + "num_input_tokens_seen": 306779875, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.24377441, + "step": 14227, + "time_per_iteration": 2.6965513229370117 + }, + { + "auxiliary_loss_clip": 0.0122455, + "auxiliary_loss_mlp": 0.00211632, + "balance_loss_clip": 1.01162446, + "balance_loss_mlp": 0.18770671, + "epoch": 0.8554336389598677, + "flos": 20887334328960.0, + "grad_norm": 10.423763194389622, + "language_loss": 0.75225562, + "learning_rate": 2.151754018031442e-07, + "loss": 0.76661742, + "num_input_tokens_seen": 306800015, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.23925781, + "step": 14228, + "time_per_iteration": 2.6728408336639404 + }, + { + "auxiliary_loss_clip": 0.01255689, + "auxiliary_loss_mlp": 0.00211058, + "balance_loss_clip": 1.03510046, + "balance_loss_mlp": 0.1854037, + "epoch": 0.8554937622125357, + "flos": 21284721469440.0, + "grad_norm": 6.898935780436777, + "language_loss": 0.82907015, + "learning_rate": 2.1499970239944542e-07, + "loss": 0.84373766, + "num_input_tokens_seen": 306814160, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.25646973, + "step": 14229, + "time_per_iteration": 2.674943685531616 + }, + { + "auxiliary_loss_clip": 0.01245156, + "auxiliary_loss_mlp": 0.00212763, + "balance_loss_clip": 1.03038049, + "balance_loss_mlp": 0.18964782, + "epoch": 0.8555538854652037, + "flos": 22413178120320.0, + "grad_norm": 26.720097611356532, + "language_loss": 0.79197478, + "learning_rate": 2.1482407068313724e-07, + "loss": 0.80655402, + "num_input_tokens_seen": 306833310, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.23120117, + "step": 14230, + "time_per_iteration": 2.656473159790039 + }, + { + "auxiliary_loss_clip": 0.01255848, + "auxiliary_loss_mlp": 0.0021881, + "balance_loss_clip": 1.04243016, + "balance_loss_mlp": 0.19534962, + "epoch": 0.8556140087178716, + "flos": 20193719725440.0, + "grad_norm": 4.436299622163843, + "language_loss": 0.89884806, + "learning_rate": 2.1464850666087897e-07, + "loss": 0.91359466, + "num_input_tokens_seen": 306851345, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.23449707, + "step": 14231, + "time_per_iteration": 2.6727588176727295 + }, + { + "auxiliary_loss_clip": 0.01270307, + "auxiliary_loss_mlp": 0.00224638, + "balance_loss_clip": 1.04724705, + "balance_loss_mlp": 0.19857889, + "epoch": 0.8556741319705397, + "flos": 22638123043200.0, + "grad_norm": 9.644639556269928, + "language_loss": 0.77335703, + "learning_rate": 2.1447301033932796e-07, + "loss": 0.78830647, + "num_input_tokens_seen": 306871040, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.26062012, + "step": 14232, + "time_per_iteration": 2.658348321914673 + }, + { + "auxiliary_loss_clip": 0.01258288, + "auxiliary_loss_mlp": 0.00195865, + "balance_loss_clip": 1.03983426, + "balance_loss_mlp": 0.17154679, + "epoch": 0.8557342552232076, + "flos": 23549320281600.0, + "grad_norm": 7.897309040130791, + "language_loss": 0.74132168, + "learning_rate": 2.1429758172513955e-07, + "loss": 0.75586319, + "num_input_tokens_seen": 306891625, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.24304199, + "step": 14233, + "time_per_iteration": 2.756861448287964 + }, + { + "auxiliary_loss_clip": 0.01245443, + "auxiliary_loss_mlp": 0.00207841, + "balance_loss_clip": 1.02933812, + "balance_loss_mlp": 0.18348616, + "epoch": 0.8557943784758756, + "flos": 19609884063360.0, + "grad_norm": 22.157818050144954, + "language_loss": 0.84035808, + "learning_rate": 2.1412222082496556e-07, + "loss": 0.85489088, + "num_input_tokens_seen": 306910020, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.24353027, + "step": 14234, + "time_per_iteration": 2.6609466075897217 + }, + { + "auxiliary_loss_clip": 0.01145936, + "auxiliary_loss_mlp": 0.00167452, + "balance_loss_clip": 0.99543345, + "balance_loss_mlp": 0.15886906, + "epoch": 0.8558545017285435, + "flos": 70641891446400.0, + "grad_norm": 0.8681460424089202, + "language_loss": 0.57574439, + "learning_rate": 2.1394692764545684e-07, + "loss": 0.58887827, + "num_input_tokens_seen": 306969505, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.0859375, + "step": 14235, + "time_per_iteration": 3.1162047386169434 + }, + { + "auxiliary_loss_clip": 0.01142364, + "auxiliary_loss_mlp": 0.00176964, + "balance_loss_clip": 0.99363172, + "balance_loss_mlp": 0.16852359, + "epoch": 0.8559146249812115, + "flos": 56649983086080.0, + "grad_norm": 0.7739741556005995, + "language_loss": 0.5603987, + "learning_rate": 2.1377170219325858e-07, + "loss": 0.57359201, + "num_input_tokens_seen": 307027710, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.08447266, + "step": 14236, + "time_per_iteration": 3.032259702682495 + }, + { + "auxiliary_loss_clip": 0.01253851, + "auxiliary_loss_mlp": 0.00228232, + "balance_loss_clip": 1.03674555, + "balance_loss_mlp": 0.20288828, + "epoch": 0.8559747482338794, + "flos": 22888240421760.0, + "grad_norm": 42.736410455997586, + "language_loss": 0.7899183, + "learning_rate": 2.1359654447501673e-07, + "loss": 0.80473906, + "num_input_tokens_seen": 307045515, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.25341797, + "step": 14237, + "time_per_iteration": 2.6695728302001953 + }, + { + "auxiliary_loss_clip": 0.01221004, + "auxiliary_loss_mlp": 0.00205229, + "balance_loss_clip": 1.01238441, + "balance_loss_mlp": 0.1825431, + "epoch": 0.8560348714865474, + "flos": 22601925112320.0, + "grad_norm": 21.689396647985976, + "language_loss": 0.71575499, + "learning_rate": 2.1342145449737314e-07, + "loss": 0.7300173, + "num_input_tokens_seen": 307064470, + "router_z_loss_clip": 2.08691406, + "router_z_loss_mlp": 0.22680664, + "step": 14238, + "time_per_iteration": 2.7927229404449463 + }, + { + "auxiliary_loss_clip": 0.01212926, + "auxiliary_loss_mlp": 0.00194772, + "balance_loss_clip": 1.00898623, + "balance_loss_mlp": 0.17438744, + "epoch": 0.8560949947392154, + "flos": 17931455297280.0, + "grad_norm": 332.58174176644667, + "language_loss": 0.75380278, + "learning_rate": 2.1324643226696648e-07, + "loss": 0.76787972, + "num_input_tokens_seen": 307083900, + "router_z_loss_clip": 2.0390625, + "router_z_loss_mlp": 0.20361328, + "step": 14239, + "time_per_iteration": 4.12511134147644 + }, + { + "auxiliary_loss_clip": 0.01267578, + "auxiliary_loss_mlp": 0.00235068, + "balance_loss_clip": 1.04420662, + "balance_loss_mlp": 0.20878235, + "epoch": 0.8561551179918834, + "flos": 31026208636800.0, + "grad_norm": 13.16943897769031, + "language_loss": 0.75299382, + "learning_rate": 2.1307147779043455e-07, + "loss": 0.76802027, + "num_input_tokens_seen": 307104590, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.26269531, + "step": 14240, + "time_per_iteration": 2.7529990673065186 + }, + { + "auxiliary_loss_clip": 0.01253142, + "auxiliary_loss_mlp": 0.00197938, + "balance_loss_clip": 1.03523231, + "balance_loss_mlp": 0.17353548, + "epoch": 0.8562152412445513, + "flos": 30665198995200.0, + "grad_norm": 2.160219111091949, + "language_loss": 0.7285167, + "learning_rate": 2.1289659107441182e-07, + "loss": 0.74302745, + "num_input_tokens_seen": 307125580, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.24389648, + "step": 14241, + "time_per_iteration": 4.186947584152222 + }, + { + "auxiliary_loss_clip": 0.01265034, + "auxiliary_loss_mlp": 0.00225168, + "balance_loss_clip": 1.03962266, + "balance_loss_mlp": 0.19691503, + "epoch": 0.8562753644972193, + "flos": 31576144838400.0, + "grad_norm": 5.472323926969982, + "language_loss": 0.80293143, + "learning_rate": 2.1272177212552855e-07, + "loss": 0.81783342, + "num_input_tokens_seen": 307147625, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.2824707, + "step": 14242, + "time_per_iteration": 2.7701523303985596 + }, + { + "auxiliary_loss_clip": 0.01253444, + "auxiliary_loss_mlp": 0.00201492, + "balance_loss_clip": 1.0357058, + "balance_loss_mlp": 0.17668432, + "epoch": 0.8563354877498872, + "flos": 26213640618240.0, + "grad_norm": 130.20637617507347, + "language_loss": 0.86712551, + "learning_rate": 2.1254702095041498e-07, + "loss": 0.88167489, + "num_input_tokens_seen": 307164665, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.24780273, + "step": 14243, + "time_per_iteration": 2.7446744441986084 + }, + { + "auxiliary_loss_clip": 0.01252421, + "auxiliary_loss_mlp": 0.00218071, + "balance_loss_clip": 1.03692508, + "balance_loss_mlp": 0.19456294, + "epoch": 0.8563956110025552, + "flos": 24134341092480.0, + "grad_norm": 2.2189827980436974, + "language_loss": 0.76102471, + "learning_rate": 2.123723375556974e-07, + "loss": 0.77572966, + "num_input_tokens_seen": 307182530, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.23498535, + "step": 14244, + "time_per_iteration": 2.7434844970703125 + }, + { + "auxiliary_loss_clip": 0.0114069, + "auxiliary_loss_mlp": 0.00149915, + "balance_loss_clip": 0.99311423, + "balance_loss_mlp": 0.14176062, + "epoch": 0.8564557342552233, + "flos": 56271986311680.0, + "grad_norm": 0.8095457134742159, + "language_loss": 0.57555372, + "learning_rate": 2.1219772194800046e-07, + "loss": 0.58845973, + "num_input_tokens_seen": 307241240, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.08154297, + "step": 14245, + "time_per_iteration": 4.5041303634643555 + }, + { + "auxiliary_loss_clip": 0.01269832, + "auxiliary_loss_mlp": 0.00214895, + "balance_loss_clip": 1.04705715, + "balance_loss_mlp": 0.19008774, + "epoch": 0.8565158575078912, + "flos": 23440618748160.0, + "grad_norm": 478.312023433773, + "language_loss": 0.87507021, + "learning_rate": 2.1202317413394488e-07, + "loss": 0.88991749, + "num_input_tokens_seen": 307261485, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.24841309, + "step": 14246, + "time_per_iteration": 2.8442184925079346 + }, + { + "auxiliary_loss_clip": 0.01238404, + "auxiliary_loss_mlp": 0.0019973, + "balance_loss_clip": 1.02711785, + "balance_loss_mlp": 0.17735487, + "epoch": 0.8565759807605592, + "flos": 20375930442240.0, + "grad_norm": 20.99201303334203, + "language_loss": 0.90182281, + "learning_rate": 2.1184869412014938e-07, + "loss": 0.91620415, + "num_input_tokens_seen": 307279160, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.22363281, + "step": 14247, + "time_per_iteration": 2.6937243938446045 + }, + { + "auxiliary_loss_clip": 0.01262438, + "auxiliary_loss_mlp": 0.00239529, + "balance_loss_clip": 1.04131222, + "balance_loss_mlp": 0.21165764, + "epoch": 0.8566361040132271, + "flos": 18807101049600.0, + "grad_norm": 247.94705937489417, + "language_loss": 0.86517954, + "learning_rate": 2.1167428191323112e-07, + "loss": 0.88019919, + "num_input_tokens_seen": 307297920, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.27844238, + "step": 14248, + "time_per_iteration": 2.642778158187866 + }, + { + "auxiliary_loss_clip": 0.01253245, + "auxiliary_loss_mlp": 0.00254475, + "balance_loss_clip": 1.03356254, + "balance_loss_mlp": 0.22799878, + "epoch": 0.8566962272658951, + "flos": 24535355506560.0, + "grad_norm": 13.409694701844776, + "language_loss": 0.86520642, + "learning_rate": 2.1149993751980278e-07, + "loss": 0.88028359, + "num_input_tokens_seen": 307318320, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.26477051, + "step": 14249, + "time_per_iteration": 4.104541063308716 + }, + { + "auxiliary_loss_clip": 0.01247946, + "auxiliary_loss_mlp": 0.00225099, + "balance_loss_clip": 1.03042459, + "balance_loss_mlp": 0.20125695, + "epoch": 0.856756350518563, + "flos": 23178506227200.0, + "grad_norm": 63.172656568682534, + "language_loss": 0.87011147, + "learning_rate": 2.1132566094647597e-07, + "loss": 0.8848418, + "num_input_tokens_seen": 307336720, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.23876953, + "step": 14250, + "time_per_iteration": 2.69720196723938 + }, + { + "auxiliary_loss_clip": 0.01243152, + "auxiliary_loss_mlp": 0.00207087, + "balance_loss_clip": 1.03123665, + "balance_loss_mlp": 0.1847589, + "epoch": 0.856816473771231, + "flos": 20808581760000.0, + "grad_norm": 14.101776121304919, + "language_loss": 0.86379099, + "learning_rate": 2.1115145219985942e-07, + "loss": 0.8782934, + "num_input_tokens_seen": 307354120, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.22314453, + "step": 14251, + "time_per_iteration": 2.6781182289123535 + }, + { + "auxiliary_loss_clip": 0.01238898, + "auxiliary_loss_mlp": 0.00215566, + "balance_loss_clip": 1.02551973, + "balance_loss_mlp": 0.19239151, + "epoch": 0.856876597023899, + "flos": 20228157889920.0, + "grad_norm": 252.13311251672474, + "language_loss": 0.70115763, + "learning_rate": 2.1097731128656005e-07, + "loss": 0.7157023, + "num_input_tokens_seen": 307373165, + "router_z_loss_clip": 2.13378906, + "router_z_loss_mlp": 0.23193359, + "step": 14252, + "time_per_iteration": 2.7052817344665527 + }, + { + "auxiliary_loss_clip": 0.01263105, + "auxiliary_loss_mlp": 0.00219233, + "balance_loss_clip": 1.03997445, + "balance_loss_mlp": 0.19278002, + "epoch": 0.856936720276567, + "flos": 18296128126080.0, + "grad_norm": 21.217400493091702, + "language_loss": 0.82216579, + "learning_rate": 2.1080323821317924e-07, + "loss": 0.83698916, + "num_input_tokens_seen": 307391000, + "router_z_loss_clip": 2.23339844, + "router_z_loss_mlp": 0.26428223, + "step": 14253, + "time_per_iteration": 2.669016122817993 + }, + { + "auxiliary_loss_clip": 0.01137357, + "auxiliary_loss_mlp": 0.00154473, + "balance_loss_clip": 0.99082667, + "balance_loss_mlp": 0.14665264, + "epoch": 0.8569968435292349, + "flos": 69878394933120.0, + "grad_norm": 0.7945412955892998, + "language_loss": 0.5825845, + "learning_rate": 2.1062923298631907e-07, + "loss": 0.59550279, + "num_input_tokens_seen": 307452865, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.078125, + "step": 14254, + "time_per_iteration": 3.2103559970855713 + }, + { + "auxiliary_loss_clip": 0.01239695, + "auxiliary_loss_mlp": 0.00227129, + "balance_loss_clip": 1.02735806, + "balance_loss_mlp": 0.20192836, + "epoch": 0.8570569667819029, + "flos": 25848572739840.0, + "grad_norm": 126.12193401530179, + "language_loss": 0.89209735, + "learning_rate": 2.1045529561257825e-07, + "loss": 0.90676558, + "num_input_tokens_seen": 307471940, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.25195312, + "step": 14255, + "time_per_iteration": 2.672116279602051 + }, + { + "auxiliary_loss_clip": 0.01225835, + "auxiliary_loss_mlp": 0.00200686, + "balance_loss_clip": 1.01705182, + "balance_loss_mlp": 0.17895389, + "epoch": 0.8571170900345708, + "flos": 23257115141760.0, + "grad_norm": 451.84424446651053, + "language_loss": 0.75935209, + "learning_rate": 2.1028142609855126e-07, + "loss": 0.77361727, + "num_input_tokens_seen": 307488745, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.21740723, + "step": 14256, + "time_per_iteration": 2.682065963745117 + }, + { + "auxiliary_loss_clip": 0.01255972, + "auxiliary_loss_mlp": 0.0021348, + "balance_loss_clip": 1.04047441, + "balance_loss_mlp": 0.18967399, + "epoch": 0.8571772132872388, + "flos": 18917670090240.0, + "grad_norm": 17.451434839614016, + "language_loss": 0.76373816, + "learning_rate": 2.1010762445083218e-07, + "loss": 0.77843261, + "num_input_tokens_seen": 307506855, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.23791504, + "step": 14257, + "time_per_iteration": 2.6956496238708496 + }, + { + "auxiliary_loss_clip": 0.01246871, + "auxiliary_loss_mlp": 0.0021101, + "balance_loss_clip": 1.03511131, + "balance_loss_mlp": 0.18607193, + "epoch": 0.8572373365399069, + "flos": 33250120318080.0, + "grad_norm": 421.0781086393169, + "language_loss": 0.86409974, + "learning_rate": 2.0993389067601197e-07, + "loss": 0.87867856, + "num_input_tokens_seen": 307526115, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.24951172, + "step": 14258, + "time_per_iteration": 2.7596445083618164 + }, + { + "auxiliary_loss_clip": 0.01250371, + "auxiliary_loss_mlp": 0.00209502, + "balance_loss_clip": 1.0356133, + "balance_loss_mlp": 0.18444464, + "epoch": 0.8572974597925748, + "flos": 23327535755520.0, + "grad_norm": 7.332504997617393, + "language_loss": 0.76492095, + "learning_rate": 2.0976022478067735e-07, + "loss": 0.77951974, + "num_input_tokens_seen": 307545230, + "router_z_loss_clip": 2.14746094, + "router_z_loss_mlp": 0.25085449, + "step": 14259, + "time_per_iteration": 2.6724648475646973 + }, + { + "auxiliary_loss_clip": 0.01253252, + "auxiliary_loss_mlp": 0.00214059, + "balance_loss_clip": 1.03006542, + "balance_loss_mlp": 0.18833363, + "epoch": 0.8573575830452428, + "flos": 24535858296960.0, + "grad_norm": 14.316764368645496, + "language_loss": 0.85032153, + "learning_rate": 2.0958662677141437e-07, + "loss": 0.86499465, + "num_input_tokens_seen": 307564900, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.25708008, + "step": 14260, + "time_per_iteration": 2.702571392059326 + }, + { + "auxiliary_loss_clip": 0.0126376, + "auxiliary_loss_mlp": 0.00218801, + "balance_loss_clip": 1.04155612, + "balance_loss_mlp": 0.19252746, + "epoch": 0.8574177062979107, + "flos": 24165403378560.0, + "grad_norm": 107.37341959075485, + "language_loss": 0.83875191, + "learning_rate": 2.09413096654806e-07, + "loss": 0.85357749, + "num_input_tokens_seen": 307583500, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.26293945, + "step": 14261, + "time_per_iteration": 2.7587006092071533 + }, + { + "auxiliary_loss_clip": 0.01259759, + "auxiliary_loss_mlp": 0.00219734, + "balance_loss_clip": 1.03430367, + "balance_loss_mlp": 0.19357976, + "epoch": 0.8574778295505787, + "flos": 17930737025280.0, + "grad_norm": 12.959787545880534, + "language_loss": 0.87616968, + "learning_rate": 2.0923963443743276e-07, + "loss": 0.89096463, + "num_input_tokens_seen": 307601430, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.26171875, + "step": 14262, + "time_per_iteration": 2.688798427581787 + }, + { + "auxiliary_loss_clip": 0.01250473, + "auxiliary_loss_mlp": 0.00233687, + "balance_loss_clip": 1.03125215, + "balance_loss_mlp": 0.21004814, + "epoch": 0.8575379528032466, + "flos": 21580697537280.0, + "grad_norm": 10.794874046347948, + "language_loss": 0.76255536, + "learning_rate": 2.0906624012587203e-07, + "loss": 0.77739692, + "num_input_tokens_seen": 307621495, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.23632812, + "step": 14263, + "time_per_iteration": 2.7800662517547607 + }, + { + "auxiliary_loss_clip": 0.01252886, + "auxiliary_loss_mlp": 0.00236508, + "balance_loss_clip": 1.0336957, + "balance_loss_mlp": 0.2101268, + "epoch": 0.8575980760559146, + "flos": 21761579450880.0, + "grad_norm": 6.782057349313423, + "language_loss": 0.83983827, + "learning_rate": 2.088929137266986e-07, + "loss": 0.85473228, + "num_input_tokens_seen": 307640840, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.26403809, + "step": 14264, + "time_per_iteration": 2.7383668422698975 + }, + { + "auxiliary_loss_clip": 0.01261393, + "auxiliary_loss_mlp": 0.00217883, + "balance_loss_clip": 1.04293895, + "balance_loss_mlp": 0.19414882, + "epoch": 0.8576581993085826, + "flos": 34386442047360.0, + "grad_norm": 4.629990956394929, + "language_loss": 0.75873566, + "learning_rate": 2.0871965524648582e-07, + "loss": 0.77352846, + "num_input_tokens_seen": 307663820, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.23754883, + "step": 14265, + "time_per_iteration": 2.8090732097625732 + }, + { + "auxiliary_loss_clip": 0.01246011, + "auxiliary_loss_mlp": 0.0020993, + "balance_loss_clip": 1.0319587, + "balance_loss_mlp": 0.18441892, + "epoch": 0.8577183225612506, + "flos": 23222497409280.0, + "grad_norm": 2.307621639587455, + "language_loss": 0.75131786, + "learning_rate": 2.085464646918027e-07, + "loss": 0.76587725, + "num_input_tokens_seen": 307682385, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.25500488, + "step": 14266, + "time_per_iteration": 2.694284200668335 + }, + { + "auxiliary_loss_clip": 0.01252363, + "auxiliary_loss_mlp": 0.0023714, + "balance_loss_clip": 1.03744626, + "balance_loss_mlp": 0.21315494, + "epoch": 0.8577784458139185, + "flos": 28804164462720.0, + "grad_norm": 1296.5274129180618, + "language_loss": 0.81742656, + "learning_rate": 2.0837334206921731e-07, + "loss": 0.83232164, + "num_input_tokens_seen": 307704680, + "router_z_loss_clip": 2.14550781, + "router_z_loss_mlp": 0.24023438, + "step": 14267, + "time_per_iteration": 2.7416114807128906 + }, + { + "auxiliary_loss_clip": 0.01230744, + "auxiliary_loss_mlp": 0.00206518, + "balance_loss_clip": 1.01973212, + "balance_loss_mlp": 0.18280689, + "epoch": 0.8578385690665865, + "flos": 19755573626880.0, + "grad_norm": 3.278480720812314, + "language_loss": 0.92467439, + "learning_rate": 2.082002873852946e-07, + "loss": 0.93904698, + "num_input_tokens_seen": 307723245, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.23718262, + "step": 14268, + "time_per_iteration": 2.696329116821289 + }, + { + "auxiliary_loss_clip": 0.01282432, + "auxiliary_loss_mlp": 0.00231512, + "balance_loss_clip": 1.05616486, + "balance_loss_mlp": 0.20529763, + "epoch": 0.8578986923192544, + "flos": 20704082117760.0, + "grad_norm": 66.76546193887017, + "language_loss": 0.83218187, + "learning_rate": 2.0802730064659667e-07, + "loss": 0.84732127, + "num_input_tokens_seen": 307742510, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.26208496, + "step": 14269, + "time_per_iteration": 2.6659796237945557 + }, + { + "auxiliary_loss_clip": 0.01247504, + "auxiliary_loss_mlp": 0.00221455, + "balance_loss_clip": 1.03133237, + "balance_loss_mlp": 0.19682601, + "epoch": 0.8579588155719224, + "flos": 36101715189120.0, + "grad_norm": 841.940393210129, + "language_loss": 0.74384427, + "learning_rate": 2.0785438185968252e-07, + "loss": 0.7585339, + "num_input_tokens_seen": 307766030, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.24584961, + "step": 14270, + "time_per_iteration": 2.8010268211364746 + }, + { + "auxiliary_loss_clip": 0.01233801, + "auxiliary_loss_mlp": 0.00206677, + "balance_loss_clip": 1.02399266, + "balance_loss_mlp": 0.18426615, + "epoch": 0.8580189388245905, + "flos": 22853479034880.0, + "grad_norm": 2.319321775058704, + "language_loss": 0.8055557, + "learning_rate": 2.0768153103110997e-07, + "loss": 0.81996047, + "num_input_tokens_seen": 307785800, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.22412109, + "step": 14271, + "time_per_iteration": 2.637087345123291 + }, + { + "auxiliary_loss_clip": 0.01148827, + "auxiliary_loss_mlp": 0.00111957, + "balance_loss_clip": 1.00114214, + "balance_loss_mlp": 0.10470901, + "epoch": 0.8580790620772584, + "flos": 69642104290560.0, + "grad_norm": 0.7535652686782242, + "language_loss": 0.57494354, + "learning_rate": 2.0750874816743358e-07, + "loss": 0.58755136, + "num_input_tokens_seen": 307850995, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.07226562, + "step": 14272, + "time_per_iteration": 3.2118399143218994 + }, + { + "auxiliary_loss_clip": 0.012597, + "auxiliary_loss_mlp": 0.00223491, + "balance_loss_clip": 1.039554, + "balance_loss_mlp": 0.19737247, + "epoch": 0.8581391853299264, + "flos": 13334243270400.0, + "grad_norm": 3.08451155555398, + "language_loss": 0.83491278, + "learning_rate": 2.0733603327520499e-07, + "loss": 0.84974468, + "num_input_tokens_seen": 307868585, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.26098633, + "step": 14273, + "time_per_iteration": 2.6116373538970947 + }, + { + "auxiliary_loss_clip": 0.01237436, + "auxiliary_loss_mlp": 0.00209423, + "balance_loss_clip": 1.02327704, + "balance_loss_mlp": 0.18566465, + "epoch": 0.8581993085825943, + "flos": 19645651031040.0, + "grad_norm": 5.913474749737479, + "language_loss": 0.90088665, + "learning_rate": 2.0716338636097385e-07, + "loss": 0.91535527, + "num_input_tokens_seen": 307886820, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.23779297, + "step": 14274, + "time_per_iteration": 2.6973061561584473 + }, + { + "auxiliary_loss_clip": 0.0114684, + "auxiliary_loss_mlp": 0.0015676, + "balance_loss_clip": 1.00051332, + "balance_loss_mlp": 0.14841536, + "epoch": 0.8582594318352623, + "flos": 55825077294720.0, + "grad_norm": 0.7874807662382023, + "language_loss": 0.60385394, + "learning_rate": 2.0699080743128672e-07, + "loss": 0.61688995, + "num_input_tokens_seen": 307944020, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.08349609, + "step": 14275, + "time_per_iteration": 3.207456588745117 + }, + { + "auxiliary_loss_clip": 0.012544, + "auxiliary_loss_mlp": 0.00224171, + "balance_loss_clip": 1.03397512, + "balance_loss_mlp": 0.198553, + "epoch": 0.8583195550879302, + "flos": 24279563779200.0, + "grad_norm": 330.7119002521883, + "language_loss": 0.71332109, + "learning_rate": 2.0681829649268768e-07, + "loss": 0.7281068, + "num_input_tokens_seen": 307961055, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.25634766, + "step": 14276, + "time_per_iteration": 2.693603992462158 + }, + { + "auxiliary_loss_clip": 0.01234012, + "auxiliary_loss_mlp": 0.00218095, + "balance_loss_clip": 1.02466607, + "balance_loss_mlp": 0.19560069, + "epoch": 0.8583796783405983, + "flos": 13444129952640.0, + "grad_norm": 12.364494643425802, + "language_loss": 0.86362749, + "learning_rate": 2.0664585355171838e-07, + "loss": 0.87814862, + "num_input_tokens_seen": 307978690, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.22497559, + "step": 14277, + "time_per_iteration": 2.6557676792144775 + }, + { + "auxiliary_loss_clip": 0.01252497, + "auxiliary_loss_mlp": 0.00206613, + "balance_loss_clip": 1.03161979, + "balance_loss_mlp": 0.18238945, + "epoch": 0.8584398015932662, + "flos": 16180271533440.0, + "grad_norm": 4.15342012842555, + "language_loss": 0.910402, + "learning_rate": 2.0647347861491803e-07, + "loss": 0.92499304, + "num_input_tokens_seen": 307995870, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.2421875, + "step": 14278, + "time_per_iteration": 2.6382579803466797 + }, + { + "auxiliary_loss_clip": 0.01264746, + "auxiliary_loss_mlp": 0.00237536, + "balance_loss_clip": 1.04039919, + "balance_loss_mlp": 0.21244246, + "epoch": 0.8584999248459342, + "flos": 17450431338240.0, + "grad_norm": 3.7236580033652964, + "language_loss": 0.85325491, + "learning_rate": 2.0630117168882366e-07, + "loss": 0.86827773, + "num_input_tokens_seen": 308013645, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.25073242, + "step": 14279, + "time_per_iteration": 2.6680517196655273 + }, + { + "auxiliary_loss_clip": 0.01243442, + "auxiliary_loss_mlp": 0.00208452, + "balance_loss_clip": 1.02872276, + "balance_loss_mlp": 0.18377607, + "epoch": 0.8585600480986021, + "flos": 23441013797760.0, + "grad_norm": 9.197209967371144, + "language_loss": 0.7663424, + "learning_rate": 2.0612893277996845e-07, + "loss": 0.78086138, + "num_input_tokens_seen": 308032490, + "router_z_loss_clip": 2.14746094, + "router_z_loss_mlp": 0.24682617, + "step": 14280, + "time_per_iteration": 2.6727077960968018 + }, + { + "auxiliary_loss_clip": 0.01235198, + "auxiliary_loss_mlp": 0.00210174, + "balance_loss_clip": 1.02190399, + "balance_loss_mlp": 0.18791756, + "epoch": 0.8586201713512701, + "flos": 19937927998080.0, + "grad_norm": 276.3227323059585, + "language_loss": 0.73023939, + "learning_rate": 2.0595676189488343e-07, + "loss": 0.74469304, + "num_input_tokens_seen": 308052110, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.22241211, + "step": 14281, + "time_per_iteration": 4.073981761932373 + }, + { + "auxiliary_loss_clip": 0.01232453, + "auxiliary_loss_mlp": 0.0022749, + "balance_loss_clip": 1.02371848, + "balance_loss_mlp": 0.20482847, + "epoch": 0.858680294603938, + "flos": 15304769435520.0, + "grad_norm": 28.45370534995835, + "language_loss": 0.80572695, + "learning_rate": 2.0578465904009845e-07, + "loss": 0.82032645, + "num_input_tokens_seen": 308070660, + "router_z_loss_clip": 2.08691406, + "router_z_loss_mlp": 0.2265625, + "step": 14282, + "time_per_iteration": 2.677732467651367 + }, + { + "auxiliary_loss_clip": 0.01238792, + "auxiliary_loss_mlp": 0.00216136, + "balance_loss_clip": 1.02645159, + "balance_loss_mlp": 0.19260404, + "epoch": 0.858740417856606, + "flos": 22711237176960.0, + "grad_norm": 12.532271124313478, + "language_loss": 0.84757268, + "learning_rate": 2.0561262422213832e-07, + "loss": 0.86212194, + "num_input_tokens_seen": 308089520, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.23535156, + "step": 14283, + "time_per_iteration": 4.044692754745483 + }, + { + "auxiliary_loss_clip": 0.01251512, + "auxiliary_loss_mlp": 0.00218314, + "balance_loss_clip": 1.03472495, + "balance_loss_mlp": 0.19405511, + "epoch": 0.8588005411092741, + "flos": 34054303962240.0, + "grad_norm": 19.20677327188584, + "language_loss": 0.67822373, + "learning_rate": 2.0544065744752736e-07, + "loss": 0.692922, + "num_input_tokens_seen": 308111545, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.24267578, + "step": 14284, + "time_per_iteration": 2.7587804794311523 + }, + { + "auxiliary_loss_clip": 0.01232667, + "auxiliary_loss_mlp": 0.00206928, + "balance_loss_clip": 1.02169275, + "balance_loss_mlp": 0.18382561, + "epoch": 0.858860664361942, + "flos": 28913584268160.0, + "grad_norm": 2273.7161764439893, + "language_loss": 0.83787256, + "learning_rate": 2.0526875872278749e-07, + "loss": 0.85226852, + "num_input_tokens_seen": 308129690, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.23095703, + "step": 14285, + "time_per_iteration": 2.7597591876983643 + }, + { + "auxiliary_loss_clip": 0.01261176, + "auxiliary_loss_mlp": 0.00219527, + "balance_loss_clip": 1.0393815, + "balance_loss_mlp": 0.19467151, + "epoch": 0.85892078761461, + "flos": 19792525743360.0, + "grad_norm": 728.0003018366128, + "language_loss": 0.82124317, + "learning_rate": 2.0509692805443524e-07, + "loss": 0.83605021, + "num_input_tokens_seen": 308147410, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.2487793, + "step": 14286, + "time_per_iteration": 2.645179510116577 + }, + { + "auxiliary_loss_clip": 0.01144544, + "auxiliary_loss_mlp": 0.00200483, + "balance_loss_clip": 0.99846327, + "balance_loss_mlp": 0.19104135, + "epoch": 0.8589809108672779, + "flos": 67106630039040.0, + "grad_norm": 0.7486414605097893, + "language_loss": 0.48384076, + "learning_rate": 2.0492516544898718e-07, + "loss": 0.49729103, + "num_input_tokens_seen": 308204875, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.09423828, + "step": 14287, + "time_per_iteration": 3.1338818073272705 + }, + { + "auxiliary_loss_clip": 0.0125582, + "auxiliary_loss_mlp": 0.00210887, + "balance_loss_clip": 1.03612638, + "balance_loss_mlp": 0.18772511, + "epoch": 0.8590410341199459, + "flos": 29716259541120.0, + "grad_norm": 56.51074552107692, + "language_loss": 0.86784911, + "learning_rate": 2.0475347091295704e-07, + "loss": 0.88251615, + "num_input_tokens_seen": 308225690, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.23181152, + "step": 14288, + "time_per_iteration": 4.218093156814575 + }, + { + "auxiliary_loss_clip": 0.01251037, + "auxiliary_loss_mlp": 0.00229823, + "balance_loss_clip": 1.03458655, + "balance_loss_mlp": 0.204539, + "epoch": 0.8591011573726138, + "flos": 23987430466560.0, + "grad_norm": 9.228872453372185, + "language_loss": 0.86378682, + "learning_rate": 2.045818444528553e-07, + "loss": 0.87859547, + "num_input_tokens_seen": 308245255, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.25256348, + "step": 14289, + "time_per_iteration": 2.697650194168091 + }, + { + "auxiliary_loss_clip": 0.01254786, + "auxiliary_loss_mlp": 0.0022271, + "balance_loss_clip": 1.0379324, + "balance_loss_mlp": 0.19880879, + "epoch": 0.8591612806252819, + "flos": 14428656806400.0, + "grad_norm": 2.9407045378278576, + "language_loss": 0.73513216, + "learning_rate": 2.0441028607518973e-07, + "loss": 0.74990714, + "num_input_tokens_seen": 308261755, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.23913574, + "step": 14290, + "time_per_iteration": 2.6409811973571777 + }, + { + "auxiliary_loss_clip": 0.01266541, + "auxiliary_loss_mlp": 0.002315, + "balance_loss_clip": 1.04234648, + "balance_loss_mlp": 0.2053691, + "epoch": 0.8592214038779498, + "flos": 31577150419200.0, + "grad_norm": 3.9677036981674134, + "language_loss": 0.63132137, + "learning_rate": 2.0423879578646642e-07, + "loss": 0.64630181, + "num_input_tokens_seen": 308285145, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.26135254, + "step": 14291, + "time_per_iteration": 4.1673712730407715 + }, + { + "auxiliary_loss_clip": 0.0125107, + "auxiliary_loss_mlp": 0.00235166, + "balance_loss_clip": 1.03076088, + "balance_loss_mlp": 0.20990574, + "epoch": 0.8592815271306178, + "flos": 17457290835840.0, + "grad_norm": 13.157478427225579, + "language_loss": 0.81299424, + "learning_rate": 2.0406737359318792e-07, + "loss": 0.82785654, + "num_input_tokens_seen": 308304130, + "router_z_loss_clip": 2.20214844, + "router_z_loss_mlp": 0.25268555, + "step": 14292, + "time_per_iteration": 2.738149642944336 + }, + { + "auxiliary_loss_clip": 0.01233486, + "auxiliary_loss_mlp": 0.00230597, + "balance_loss_clip": 1.02081954, + "balance_loss_mlp": 0.20756546, + "epoch": 0.8593416503832857, + "flos": 25411360394880.0, + "grad_norm": 2.457561689828223, + "language_loss": 0.77970266, + "learning_rate": 2.038960195018542e-07, + "loss": 0.79434353, + "num_input_tokens_seen": 308324670, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.23034668, + "step": 14293, + "time_per_iteration": 2.6700170040130615 + }, + { + "auxiliary_loss_clip": 0.01226072, + "auxiliary_loss_mlp": 0.00204637, + "balance_loss_clip": 1.01532793, + "balance_loss_mlp": 0.18091398, + "epoch": 0.8594017736359537, + "flos": 20996646393600.0, + "grad_norm": 6.172904225191042, + "language_loss": 0.77314413, + "learning_rate": 2.0372473351896358e-07, + "loss": 0.78745127, + "num_input_tokens_seen": 308344215, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.23742676, + "step": 14294, + "time_per_iteration": 2.682424306869507 + }, + { + "auxiliary_loss_clip": 0.01240742, + "auxiliary_loss_mlp": 0.00226973, + "balance_loss_clip": 1.02667487, + "balance_loss_mlp": 0.20319089, + "epoch": 0.8594618968886216, + "flos": 22091059929600.0, + "grad_norm": 9.968371465590897, + "language_loss": 0.83532596, + "learning_rate": 2.0355351565101087e-07, + "loss": 0.85000312, + "num_input_tokens_seen": 308360520, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.23754883, + "step": 14295, + "time_per_iteration": 2.7045810222625732 + }, + { + "auxiliary_loss_clip": 0.01268286, + "auxiliary_loss_mlp": 0.00219409, + "balance_loss_clip": 1.0413543, + "balance_loss_mlp": 0.19323094, + "epoch": 0.8595220201412896, + "flos": 11656245467520.0, + "grad_norm": 25.3900259771761, + "language_loss": 0.79837954, + "learning_rate": 2.0338236590448975e-07, + "loss": 0.8132565, + "num_input_tokens_seen": 308376865, + "router_z_loss_clip": 2.26660156, + "router_z_loss_mlp": 0.26147461, + "step": 14296, + "time_per_iteration": 2.6724467277526855 + }, + { + "auxiliary_loss_clip": 0.01246636, + "auxiliary_loss_mlp": 0.00210508, + "balance_loss_clip": 1.02909946, + "balance_loss_mlp": 0.18641546, + "epoch": 0.8595821433939577, + "flos": 25040366772480.0, + "grad_norm": 48.242175239895644, + "language_loss": 0.89327174, + "learning_rate": 2.0321128428588842e-07, + "loss": 0.90784317, + "num_input_tokens_seen": 308395870, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.2409668, + "step": 14297, + "time_per_iteration": 2.7072982788085938 + }, + { + "auxiliary_loss_clip": 0.01241408, + "auxiliary_loss_mlp": 0.00229948, + "balance_loss_clip": 1.02939034, + "balance_loss_mlp": 0.20682183, + "epoch": 0.8596422666466256, + "flos": 28511528359680.0, + "grad_norm": 6.735101612182346, + "language_loss": 0.74938452, + "learning_rate": 2.030402708016954e-07, + "loss": 0.76409811, + "num_input_tokens_seen": 308417250, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.23156738, + "step": 14298, + "time_per_iteration": 2.700176954269409 + }, + { + "auxiliary_loss_clip": 0.0124909, + "auxiliary_loss_mlp": 0.00184158, + "balance_loss_clip": 1.02726173, + "balance_loss_mlp": 0.1589098, + "epoch": 0.8597023898992936, + "flos": 13589137157760.0, + "grad_norm": 379.52249149472163, + "language_loss": 0.79026198, + "learning_rate": 2.0286932545839576e-07, + "loss": 0.80459452, + "num_input_tokens_seen": 308434565, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.25280762, + "step": 14299, + "time_per_iteration": 2.6490731239318848 + }, + { + "auxiliary_loss_clip": 0.0126339, + "auxiliary_loss_mlp": 0.0023086, + "balance_loss_clip": 1.04150224, + "balance_loss_mlp": 0.20372796, + "epoch": 0.8597625131519615, + "flos": 32300821728000.0, + "grad_norm": 26.704762055105466, + "language_loss": 0.81316125, + "learning_rate": 2.0269844826247096e-07, + "loss": 0.82810378, + "num_input_tokens_seen": 308450040, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.27124023, + "step": 14300, + "time_per_iteration": 2.7466633319854736 + }, + { + "auxiliary_loss_clip": 0.01229364, + "auxiliary_loss_mlp": 0.00233057, + "balance_loss_clip": 1.01799047, + "balance_loss_mlp": 0.21063358, + "epoch": 0.8598226364046295, + "flos": 28730367970560.0, + "grad_norm": 13.746523944417254, + "language_loss": 0.7713027, + "learning_rate": 2.0252763922040116e-07, + "loss": 0.78592694, + "num_input_tokens_seen": 308470545, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.22424316, + "step": 14301, + "time_per_iteration": 2.7922587394714355 + }, + { + "auxiliary_loss_clip": 0.01247407, + "auxiliary_loss_mlp": 0.00234459, + "balance_loss_clip": 1.03254986, + "balance_loss_mlp": 0.21024749, + "epoch": 0.8598827596572974, + "flos": 21871825269120.0, + "grad_norm": 23.47135442212001, + "language_loss": 0.83736861, + "learning_rate": 2.023568983386641e-07, + "loss": 0.85218728, + "num_input_tokens_seen": 308490020, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.24206543, + "step": 14302, + "time_per_iteration": 2.666097640991211 + }, + { + "auxiliary_loss_clip": 0.01241396, + "auxiliary_loss_mlp": 0.00217925, + "balance_loss_clip": 1.02721143, + "balance_loss_mlp": 0.19338022, + "epoch": 0.8599428829099655, + "flos": 23767297966080.0, + "grad_norm": 13.920340769344048, + "language_loss": 0.91125941, + "learning_rate": 2.02186225623733e-07, + "loss": 0.9258526, + "num_input_tokens_seen": 308509065, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.24536133, + "step": 14303, + "time_per_iteration": 2.7480568885803223 + }, + { + "auxiliary_loss_clip": 0.01258624, + "auxiliary_loss_mlp": 0.00224475, + "balance_loss_clip": 1.03666973, + "balance_loss_mlp": 0.20014438, + "epoch": 0.8600030061626334, + "flos": 16212770363520.0, + "grad_norm": 218.36325184219, + "language_loss": 0.84679443, + "learning_rate": 2.0201562108208025e-07, + "loss": 0.86162537, + "num_input_tokens_seen": 308524725, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.2434082, + "step": 14304, + "time_per_iteration": 2.6200146675109863 + }, + { + "auxiliary_loss_clip": 0.01256459, + "auxiliary_loss_mlp": 0.00228898, + "balance_loss_clip": 1.03210235, + "balance_loss_mlp": 0.20181355, + "epoch": 0.8600631294153014, + "flos": 15669370437120.0, + "grad_norm": 181.07559485472635, + "language_loss": 0.70408297, + "learning_rate": 2.0184508472017537e-07, + "loss": 0.71893656, + "num_input_tokens_seen": 308543525, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.27111816, + "step": 14305, + "time_per_iteration": 2.6677825450897217 + }, + { + "auxiliary_loss_clip": 0.01265678, + "auxiliary_loss_mlp": 0.00239869, + "balance_loss_clip": 1.04019427, + "balance_loss_mlp": 0.21183084, + "epoch": 0.8601232526679693, + "flos": 17493093717120.0, + "grad_norm": 14.925509709869033, + "language_loss": 0.93329149, + "learning_rate": 2.0167461654448558e-07, + "loss": 0.94834697, + "num_input_tokens_seen": 308557995, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.28051758, + "step": 14306, + "time_per_iteration": 2.6223297119140625 + }, + { + "auxiliary_loss_clip": 0.01233218, + "auxiliary_loss_mlp": 0.00222665, + "balance_loss_clip": 1.02083206, + "balance_loss_mlp": 0.20034912, + "epoch": 0.8601833759206373, + "flos": 26985935963520.0, + "grad_norm": 32.017748354199014, + "language_loss": 0.77064097, + "learning_rate": 2.01504216561474e-07, + "loss": 0.78519976, + "num_input_tokens_seen": 308582750, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.22314453, + "step": 14307, + "time_per_iteration": 2.8089101314544678 + }, + { + "auxiliary_loss_clip": 0.01263983, + "auxiliary_loss_mlp": 0.00224528, + "balance_loss_clip": 1.03773952, + "balance_loss_mlp": 0.19786115, + "epoch": 0.8602434991733052, + "flos": 25229760209280.0, + "grad_norm": 19.06802311021845, + "language_loss": 0.73759198, + "learning_rate": 2.0133388477760316e-07, + "loss": 0.75247711, + "num_input_tokens_seen": 308603770, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.26672363, + "step": 14308, + "time_per_iteration": 2.7378528118133545 + }, + { + "auxiliary_loss_clip": 0.01153493, + "auxiliary_loss_mlp": 0.00093069, + "balance_loss_clip": 1.0069809, + "balance_loss_mlp": 0.08591661, + "epoch": 0.8603036224259732, + "flos": 71015363107200.0, + "grad_norm": 0.6010551850814191, + "language_loss": 0.47220272, + "learning_rate": 2.0116362119933172e-07, + "loss": 0.48466831, + "num_input_tokens_seen": 308667735, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.07128906, + "step": 14309, + "time_per_iteration": 3.253805637359619 + }, + { + "auxiliary_loss_clip": 0.01267065, + "auxiliary_loss_mlp": 0.00223409, + "balance_loss_clip": 1.03951955, + "balance_loss_mlp": 0.19856568, + "epoch": 0.8603637456786413, + "flos": 20300625578880.0, + "grad_norm": 17.84586207525321, + "language_loss": 0.77946472, + "learning_rate": 2.0099342583311563e-07, + "loss": 0.79436946, + "num_input_tokens_seen": 308686300, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.24841309, + "step": 14310, + "time_per_iteration": 2.6889407634735107 + }, + { + "auxiliary_loss_clip": 0.01238764, + "auxiliary_loss_mlp": 0.00205525, + "balance_loss_clip": 1.024894, + "balance_loss_mlp": 0.18144467, + "epoch": 0.8604238689313092, + "flos": 21835842819840.0, + "grad_norm": 1.8048999818184683, + "language_loss": 0.85941613, + "learning_rate": 2.0082329868540905e-07, + "loss": 0.87385905, + "num_input_tokens_seen": 308705825, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.2409668, + "step": 14311, + "time_per_iteration": 2.667060136795044 + }, + { + "auxiliary_loss_clip": 0.01237735, + "auxiliary_loss_mlp": 0.00216095, + "balance_loss_clip": 1.02492154, + "balance_loss_mlp": 0.19295655, + "epoch": 0.8604839921839772, + "flos": 18004210295040.0, + "grad_norm": 387.2728827644302, + "language_loss": 0.80917555, + "learning_rate": 2.006532397626639e-07, + "loss": 0.82371384, + "num_input_tokens_seen": 308723340, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.23156738, + "step": 14312, + "time_per_iteration": 2.6780643463134766 + }, + { + "auxiliary_loss_clip": 0.01245058, + "auxiliary_loss_mlp": 0.00228344, + "balance_loss_clip": 1.02727175, + "balance_loss_mlp": 0.20364425, + "epoch": 0.8605441154366451, + "flos": 16252164604800.0, + "grad_norm": 3.1805744663613695, + "language_loss": 0.86688519, + "learning_rate": 2.0048324907132797e-07, + "loss": 0.88161922, + "num_input_tokens_seen": 308741280, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.24707031, + "step": 14313, + "time_per_iteration": 2.6578269004821777 + }, + { + "auxiliary_loss_clip": 0.01235702, + "auxiliary_loss_mlp": 0.00217008, + "balance_loss_clip": 1.02257299, + "balance_loss_mlp": 0.19332138, + "epoch": 0.8606042386893131, + "flos": 32267065921920.0, + "grad_norm": 11.509375284897356, + "language_loss": 0.79717314, + "learning_rate": 2.003133266178474e-07, + "loss": 0.81170034, + "num_input_tokens_seen": 308762875, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.23681641, + "step": 14314, + "time_per_iteration": 2.7836146354675293 + }, + { + "auxiliary_loss_clip": 0.01250945, + "auxiliary_loss_mlp": 0.00216829, + "balance_loss_clip": 1.03014445, + "balance_loss_mlp": 0.19187877, + "epoch": 0.860664361941981, + "flos": 20229774001920.0, + "grad_norm": 19.70023189289947, + "language_loss": 0.77398324, + "learning_rate": 2.001434724086657e-07, + "loss": 0.788661, + "num_input_tokens_seen": 308780315, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.24975586, + "step": 14315, + "time_per_iteration": 2.604783058166504 + }, + { + "auxiliary_loss_clip": 0.01248589, + "auxiliary_loss_mlp": 0.00205525, + "balance_loss_clip": 1.0315578, + "balance_loss_mlp": 0.18125379, + "epoch": 0.8607244851946491, + "flos": 25191622944000.0, + "grad_norm": 2.5755637093846078, + "language_loss": 0.80716789, + "learning_rate": 1.9997368645022418e-07, + "loss": 0.82170904, + "num_input_tokens_seen": 308799435, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.24255371, + "step": 14316, + "time_per_iteration": 2.6834583282470703 + }, + { + "auxiliary_loss_clip": 0.01258441, + "auxiliary_loss_mlp": 0.00206838, + "balance_loss_clip": 1.03599524, + "balance_loss_mlp": 0.18251923, + "epoch": 0.860784608447317, + "flos": 20482082110080.0, + "grad_norm": 6.771762125545195, + "language_loss": 0.92403257, + "learning_rate": 1.9980396874896056e-07, + "loss": 0.9386853, + "num_input_tokens_seen": 308817730, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.24328613, + "step": 14317, + "time_per_iteration": 2.63031268119812 + }, + { + "auxiliary_loss_clip": 0.01228379, + "auxiliary_loss_mlp": 0.00226613, + "balance_loss_clip": 1.0168916, + "balance_loss_mlp": 0.20302147, + "epoch": 0.860844731699985, + "flos": 50476037696640.0, + "grad_norm": 251.04769574112913, + "language_loss": 0.74866951, + "learning_rate": 1.996343193113108e-07, + "loss": 0.76321948, + "num_input_tokens_seen": 308841735, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.23620605, + "step": 14318, + "time_per_iteration": 2.946009874343872 + }, + { + "auxiliary_loss_clip": 0.01225821, + "auxiliary_loss_mlp": 0.00207246, + "balance_loss_clip": 1.01812184, + "balance_loss_mlp": 0.18461998, + "epoch": 0.8609048549526529, + "flos": 41172768455040.0, + "grad_norm": 39.1483966258169, + "language_loss": 0.7813105, + "learning_rate": 1.9946473814370911e-07, + "loss": 0.79564118, + "num_input_tokens_seen": 308865050, + "router_z_loss_clip": 2.07617188, + "router_z_loss_mlp": 0.22631836, + "step": 14319, + "time_per_iteration": 2.8476593494415283 + }, + { + "auxiliary_loss_clip": 0.01242921, + "auxiliary_loss_mlp": 0.00226406, + "balance_loss_clip": 1.02641153, + "balance_loss_mlp": 0.20208696, + "epoch": 0.8609649782053209, + "flos": 23951196622080.0, + "grad_norm": 9.820531759385522, + "language_loss": 0.75517696, + "learning_rate": 1.992952252525839e-07, + "loss": 0.76987022, + "num_input_tokens_seen": 308885375, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.24316406, + "step": 14320, + "time_per_iteration": 2.6529202461242676 + }, + { + "auxiliary_loss_clip": 0.0125914, + "auxiliary_loss_mlp": 0.0022577, + "balance_loss_clip": 1.03447497, + "balance_loss_mlp": 0.19938931, + "epoch": 0.8610251014579888, + "flos": 23112574813440.0, + "grad_norm": 9.90602648964567, + "language_loss": 0.86779189, + "learning_rate": 1.9912578064436446e-07, + "loss": 0.88264096, + "num_input_tokens_seen": 308904700, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.26391602, + "step": 14321, + "time_per_iteration": 2.689584970474243 + }, + { + "auxiliary_loss_clip": 0.01236682, + "auxiliary_loss_mlp": 0.00211729, + "balance_loss_clip": 1.02270532, + "balance_loss_mlp": 0.18698151, + "epoch": 0.8610852247106568, + "flos": 19426811420160.0, + "grad_norm": 6.788019643222985, + "language_loss": 0.77982271, + "learning_rate": 1.9895640432547567e-07, + "loss": 0.79430681, + "num_input_tokens_seen": 308922985, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.24755859, + "step": 14322, + "time_per_iteration": 2.632833480834961 + }, + { + "auxiliary_loss_clip": 0.01273274, + "auxiliary_loss_mlp": 0.00235802, + "balance_loss_clip": 1.04496181, + "balance_loss_mlp": 0.20898037, + "epoch": 0.8611453479633249, + "flos": 19312076401920.0, + "grad_norm": 28.22211880581904, + "language_loss": 0.6758281, + "learning_rate": 1.9878709630234102e-07, + "loss": 0.6909188, + "num_input_tokens_seen": 308940765, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.26806641, + "step": 14323, + "time_per_iteration": 4.142085552215576 + }, + { + "auxiliary_loss_clip": 0.0123689, + "auxiliary_loss_mlp": 0.00206269, + "balance_loss_clip": 1.02532268, + "balance_loss_mlp": 0.18187837, + "epoch": 0.8612054712159928, + "flos": 23253667436160.0, + "grad_norm": 85.37818325092238, + "language_loss": 0.81365889, + "learning_rate": 1.986178565813801e-07, + "loss": 0.82809055, + "num_input_tokens_seen": 308960110, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.24401855, + "step": 14324, + "time_per_iteration": 2.714848518371582 + }, + { + "auxiliary_loss_clip": 0.01235674, + "auxiliary_loss_mlp": 0.00219148, + "balance_loss_clip": 1.01920474, + "balance_loss_mlp": 0.19481775, + "epoch": 0.8612655944686608, + "flos": 16028440744320.0, + "grad_norm": 50.58293871152654, + "language_loss": 0.77916586, + "learning_rate": 1.9844868516901036e-07, + "loss": 0.79371411, + "num_input_tokens_seen": 308976665, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.24328613, + "step": 14325, + "time_per_iteration": 4.117102146148682 + }, + { + "auxiliary_loss_clip": 0.01258962, + "auxiliary_loss_mlp": 0.00250357, + "balance_loss_clip": 1.03451443, + "balance_loss_mlp": 0.22401144, + "epoch": 0.8613257177213287, + "flos": 22492720788480.0, + "grad_norm": 276.38481228771303, + "language_loss": 0.728046, + "learning_rate": 1.982795820716472e-07, + "loss": 0.74313915, + "num_input_tokens_seen": 308997015, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.26342773, + "step": 14326, + "time_per_iteration": 2.7022030353546143 + }, + { + "auxiliary_loss_clip": 0.01273889, + "auxiliary_loss_mlp": 0.00218757, + "balance_loss_clip": 1.04661286, + "balance_loss_mlp": 0.19398537, + "epoch": 0.8613858409739967, + "flos": 17238056175360.0, + "grad_norm": 41.06210846109242, + "language_loss": 0.93708766, + "learning_rate": 1.9811054729570253e-07, + "loss": 0.95201409, + "num_input_tokens_seen": 309015250, + "router_z_loss_clip": 2.27441406, + "router_z_loss_mlp": 0.24780273, + "step": 14327, + "time_per_iteration": 2.6531853675842285 + }, + { + "auxiliary_loss_clip": 0.012223, + "auxiliary_loss_mlp": 0.00204245, + "balance_loss_clip": 1.01259267, + "balance_loss_mlp": 0.18185705, + "epoch": 0.8614459642266646, + "flos": 22821123859200.0, + "grad_norm": 5.6945903910028415, + "language_loss": 0.83614099, + "learning_rate": 1.9794158084758661e-07, + "loss": 0.85040647, + "num_input_tokens_seen": 309034140, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.22399902, + "step": 14328, + "time_per_iteration": 2.68983793258667 + }, + { + "auxiliary_loss_clip": 0.01252826, + "auxiliary_loss_mlp": 0.00218735, + "balance_loss_clip": 1.03040707, + "balance_loss_mlp": 0.19410643, + "epoch": 0.8615060874793327, + "flos": 26504301473280.0, + "grad_norm": 379.6709337268802, + "language_loss": 0.85471326, + "learning_rate": 1.9777268273370673e-07, + "loss": 0.86942887, + "num_input_tokens_seen": 309055075, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.24633789, + "step": 14329, + "time_per_iteration": 2.714632749557495 + }, + { + "auxiliary_loss_clip": 0.01245959, + "auxiliary_loss_mlp": 0.00231886, + "balance_loss_clip": 1.03169346, + "balance_loss_mlp": 0.20893838, + "epoch": 0.8615662107320006, + "flos": 24061011477120.0, + "grad_norm": 14.69481423834983, + "language_loss": 0.85849476, + "learning_rate": 1.9760385296046757e-07, + "loss": 0.87327325, + "num_input_tokens_seen": 309074650, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.22937012, + "step": 14330, + "time_per_iteration": 4.145712852478027 + }, + { + "auxiliary_loss_clip": 0.01244551, + "auxiliary_loss_mlp": 0.00245292, + "balance_loss_clip": 1.02671695, + "balance_loss_mlp": 0.22081783, + "epoch": 0.8616263339846686, + "flos": 24165044242560.0, + "grad_norm": 4.039480731002817, + "language_loss": 0.74224758, + "learning_rate": 1.974350915342702e-07, + "loss": 0.757146, + "num_input_tokens_seen": 309094385, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.24462891, + "step": 14331, + "time_per_iteration": 2.7027318477630615 + }, + { + "auxiliary_loss_clip": 0.01238425, + "auxiliary_loss_mlp": 0.0021436, + "balance_loss_clip": 1.02265549, + "balance_loss_mlp": 0.19031528, + "epoch": 0.8616864572373365, + "flos": 21724340025600.0, + "grad_norm": 67.42012706042104, + "language_loss": 0.82751942, + "learning_rate": 1.9726639846151506e-07, + "loss": 0.84204733, + "num_input_tokens_seen": 309111815, + "router_z_loss_clip": 2.15917969, + "router_z_loss_mlp": 0.24047852, + "step": 14332, + "time_per_iteration": 2.735325574874878 + }, + { + "auxiliary_loss_clip": 0.01246717, + "auxiliary_loss_mlp": 0.00221991, + "balance_loss_clip": 1.02774119, + "balance_loss_mlp": 0.19674239, + "epoch": 0.8617465804900045, + "flos": 23766651521280.0, + "grad_norm": 6.565313810215746, + "language_loss": 0.75286162, + "learning_rate": 1.9709777374859904e-07, + "loss": 0.76754868, + "num_input_tokens_seen": 309131385, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.25256348, + "step": 14333, + "time_per_iteration": 4.040869951248169 + }, + { + "auxiliary_loss_clip": 0.0127761, + "auxiliary_loss_mlp": 0.00227043, + "balance_loss_clip": 1.04987252, + "balance_loss_mlp": 0.20183027, + "epoch": 0.8618067037426724, + "flos": 37703941251840.0, + "grad_norm": 33.145796839878535, + "language_loss": 0.72671592, + "learning_rate": 1.969292174019157e-07, + "loss": 0.7417624, + "num_input_tokens_seen": 309155020, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.25231934, + "step": 14334, + "time_per_iteration": 2.856536388397217 + }, + { + "auxiliary_loss_clip": 0.01272205, + "auxiliary_loss_mlp": 0.00225183, + "balance_loss_clip": 1.04761028, + "balance_loss_mlp": 0.19936225, + "epoch": 0.8618668269953405, + "flos": 21471026336640.0, + "grad_norm": 111.86925210857656, + "language_loss": 0.78761065, + "learning_rate": 1.967607294278577e-07, + "loss": 0.80258453, + "num_input_tokens_seen": 309172865, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.25805664, + "step": 14335, + "time_per_iteration": 2.632023811340332 + }, + { + "auxiliary_loss_clip": 0.01241716, + "auxiliary_loss_mlp": 0.00218707, + "balance_loss_clip": 1.02734387, + "balance_loss_mlp": 0.19494833, + "epoch": 0.8619269502480085, + "flos": 22232691256320.0, + "grad_norm": 19.85125057850752, + "language_loss": 0.88200456, + "learning_rate": 1.965923098328135e-07, + "loss": 0.89660877, + "num_input_tokens_seen": 309193575, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.2376709, + "step": 14336, + "time_per_iteration": 2.673985481262207 + }, + { + "auxiliary_loss_clip": 0.01262036, + "auxiliary_loss_mlp": 0.00231699, + "balance_loss_clip": 1.03660357, + "balance_loss_mlp": 0.20571114, + "epoch": 0.8619870735006764, + "flos": 22710626645760.0, + "grad_norm": 9.65960412518409, + "language_loss": 0.75280297, + "learning_rate": 1.9642395862316907e-07, + "loss": 0.76774037, + "num_input_tokens_seen": 309212680, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.26000977, + "step": 14337, + "time_per_iteration": 2.807074546813965 + }, + { + "auxiliary_loss_clip": 0.01230098, + "auxiliary_loss_mlp": 0.00230485, + "balance_loss_clip": 1.01722264, + "balance_loss_mlp": 0.20622629, + "epoch": 0.8620471967533444, + "flos": 37520293991040.0, + "grad_norm": 38.4651219976966, + "language_loss": 0.72933251, + "learning_rate": 1.962556758053089e-07, + "loss": 0.74393833, + "num_input_tokens_seen": 309234485, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.24255371, + "step": 14338, + "time_per_iteration": 2.8973278999328613 + }, + { + "auxiliary_loss_clip": 0.01252958, + "auxiliary_loss_mlp": 0.00238902, + "balance_loss_clip": 1.03430283, + "balance_loss_mlp": 0.212974, + "epoch": 0.8621073200060123, + "flos": 19682459493120.0, + "grad_norm": 37.21203645997719, + "language_loss": 0.70217675, + "learning_rate": 1.9608746138561448e-07, + "loss": 0.71709538, + "num_input_tokens_seen": 309253630, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.25927734, + "step": 14339, + "time_per_iteration": 2.6548831462860107 + }, + { + "auxiliary_loss_clip": 0.01237745, + "auxiliary_loss_mlp": 0.00234094, + "balance_loss_clip": 1.02421546, + "balance_loss_mlp": 0.20870277, + "epoch": 0.8621674432586803, + "flos": 14536855549440.0, + "grad_norm": 35.120360982631944, + "language_loss": 0.71843028, + "learning_rate": 1.9591931537046458e-07, + "loss": 0.73314863, + "num_input_tokens_seen": 309270950, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.25427246, + "step": 14340, + "time_per_iteration": 2.617051362991333 + }, + { + "auxiliary_loss_clip": 0.01237481, + "auxiliary_loss_mlp": 0.00217822, + "balance_loss_clip": 1.02617717, + "balance_loss_mlp": 0.19468337, + "epoch": 0.8622275665113482, + "flos": 20740100480640.0, + "grad_norm": 18.820973438108375, + "language_loss": 0.85916257, + "learning_rate": 1.9575123776623493e-07, + "loss": 0.87371558, + "num_input_tokens_seen": 309288780, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.23156738, + "step": 14341, + "time_per_iteration": 2.637075901031494 + }, + { + "auxiliary_loss_clip": 0.01222583, + "auxiliary_loss_mlp": 0.00200893, + "balance_loss_clip": 1.01464963, + "balance_loss_mlp": 0.18012658, + "epoch": 0.8622876897640163, + "flos": 24715914197760.0, + "grad_norm": 236.92999977855038, + "language_loss": 0.78968704, + "learning_rate": 1.9558322857929887e-07, + "loss": 0.80392182, + "num_input_tokens_seen": 309310875, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.20776367, + "step": 14342, + "time_per_iteration": 2.684673309326172 + }, + { + "auxiliary_loss_clip": 0.01257241, + "auxiliary_loss_mlp": 0.0021048, + "balance_loss_clip": 1.0328083, + "balance_loss_mlp": 0.18464765, + "epoch": 0.8623478130166842, + "flos": 17457362663040.0, + "grad_norm": 6293.630149970105, + "language_loss": 0.79391992, + "learning_rate": 1.95415287816028e-07, + "loss": 0.80859715, + "num_input_tokens_seen": 309329900, + "router_z_loss_clip": 2.24316406, + "router_z_loss_mlp": 0.25854492, + "step": 14343, + "time_per_iteration": 2.6458966732025146 + }, + { + "auxiliary_loss_clip": 0.01249332, + "auxiliary_loss_mlp": 0.00236264, + "balance_loss_clip": 1.02634406, + "balance_loss_mlp": 0.21049069, + "epoch": 0.8624079362693522, + "flos": 18109176814080.0, + "grad_norm": 77.95940398322014, + "language_loss": 0.76330376, + "learning_rate": 1.9524741548278967e-07, + "loss": 0.77815974, + "num_input_tokens_seen": 309347870, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.25769043, + "step": 14344, + "time_per_iteration": 2.6135497093200684 + }, + { + "auxiliary_loss_clip": 0.01240198, + "auxiliary_loss_mlp": 0.00208787, + "balance_loss_clip": 1.02272487, + "balance_loss_mlp": 0.18539776, + "epoch": 0.8624680595220201, + "flos": 30666455971200.0, + "grad_norm": 18.00560328856228, + "language_loss": 0.87214923, + "learning_rate": 1.9507961158595054e-07, + "loss": 0.88663912, + "num_input_tokens_seen": 309371695, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.23400879, + "step": 14345, + "time_per_iteration": 2.77447509765625 + }, + { + "auxiliary_loss_clip": 0.01259151, + "auxiliary_loss_mlp": 0.00217177, + "balance_loss_clip": 1.03877342, + "balance_loss_mlp": 0.18938884, + "epoch": 0.8625281827746881, + "flos": 37998588516480.0, + "grad_norm": 179.58916523535865, + "language_loss": 0.62478083, + "learning_rate": 1.9491187613187355e-07, + "loss": 0.63954413, + "num_input_tokens_seen": 309394645, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.27770996, + "step": 14346, + "time_per_iteration": 2.7872304916381836 + }, + { + "auxiliary_loss_clip": 0.01232899, + "auxiliary_loss_mlp": 0.0021131, + "balance_loss_clip": 1.01698112, + "balance_loss_mlp": 0.18625259, + "epoch": 0.862588306027356, + "flos": 26249730808320.0, + "grad_norm": 37.43958634301877, + "language_loss": 0.83957326, + "learning_rate": 1.9474420912691913e-07, + "loss": 0.85401535, + "num_input_tokens_seen": 309413170, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.25061035, + "step": 14347, + "time_per_iteration": 2.6904377937316895 + }, + { + "auxiliary_loss_clip": 0.01266959, + "auxiliary_loss_mlp": 0.00229098, + "balance_loss_clip": 1.04513276, + "balance_loss_mlp": 0.20339648, + "epoch": 0.862648429280024, + "flos": 25878809013120.0, + "grad_norm": 122.14639244932326, + "language_loss": 0.87871295, + "learning_rate": 1.945766105774449e-07, + "loss": 0.89367354, + "num_input_tokens_seen": 309431315, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.25708008, + "step": 14348, + "time_per_iteration": 2.649545192718506 + }, + { + "auxiliary_loss_clip": 0.01224494, + "auxiliary_loss_mlp": 0.00215761, + "balance_loss_clip": 1.01605952, + "balance_loss_mlp": 0.1923601, + "epoch": 0.862708552532692, + "flos": 37816413713280.0, + "grad_norm": 3.0570185635311615, + "language_loss": 0.73612404, + "learning_rate": 1.9440908048980665e-07, + "loss": 0.75052655, + "num_input_tokens_seen": 309453020, + "router_z_loss_clip": 2.08203125, + "router_z_loss_mlp": 0.23413086, + "step": 14349, + "time_per_iteration": 2.79958438873291 + }, + { + "auxiliary_loss_clip": 0.01240793, + "auxiliary_loss_mlp": 0.00219494, + "balance_loss_clip": 1.02562046, + "balance_loss_mlp": 0.19522266, + "epoch": 0.86276867578536, + "flos": 19091800247040.0, + "grad_norm": 107.14628554680039, + "language_loss": 0.80353284, + "learning_rate": 1.942416188703573e-07, + "loss": 0.81813562, + "num_input_tokens_seen": 309469780, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.24291992, + "step": 14350, + "time_per_iteration": 2.681988000869751 + }, + { + "auxiliary_loss_clip": 0.01248175, + "auxiliary_loss_mlp": 0.00227534, + "balance_loss_clip": 1.03102517, + "balance_loss_mlp": 0.20208269, + "epoch": 0.862828799038028, + "flos": 22164281804160.0, + "grad_norm": 59.396901337585575, + "language_loss": 0.85445797, + "learning_rate": 1.9407422572544618e-07, + "loss": 0.86921501, + "num_input_tokens_seen": 309489610, + "router_z_loss_clip": 2.17285156, + "router_z_loss_mlp": 0.25476074, + "step": 14351, + "time_per_iteration": 2.7239506244659424 + }, + { + "auxiliary_loss_clip": 0.01237947, + "auxiliary_loss_mlp": 0.00216097, + "balance_loss_clip": 1.02265263, + "balance_loss_mlp": 0.19163534, + "epoch": 0.8628889222906959, + "flos": 23145576433920.0, + "grad_norm": 4.773737585955737, + "language_loss": 0.91561306, + "learning_rate": 1.9390690106142204e-07, + "loss": 0.93015355, + "num_input_tokens_seen": 309508295, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.24475098, + "step": 14352, + "time_per_iteration": 2.7037534713745117 + }, + { + "auxiliary_loss_clip": 0.01146844, + "auxiliary_loss_mlp": 0.00108772, + "balance_loss_clip": 1.00298798, + "balance_loss_mlp": 0.10042701, + "epoch": 0.8629490455433639, + "flos": 57817762151040.0, + "grad_norm": 0.7958328391907492, + "language_loss": 0.60930324, + "learning_rate": 1.9373964488462913e-07, + "loss": 0.62185943, + "num_input_tokens_seen": 309567960, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.08349609, + "step": 14353, + "time_per_iteration": 3.1632461547851562 + }, + { + "auxiliary_loss_clip": 0.01247492, + "auxiliary_loss_mlp": 0.00199716, + "balance_loss_clip": 1.0292995, + "balance_loss_mlp": 0.17679223, + "epoch": 0.8630091687960318, + "flos": 15919667383680.0, + "grad_norm": 6.630218251854488, + "language_loss": 0.88305432, + "learning_rate": 1.9357245720140948e-07, + "loss": 0.89752638, + "num_input_tokens_seen": 309586050, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.22924805, + "step": 14354, + "time_per_iteration": 2.7023520469665527 + }, + { + "auxiliary_loss_clip": 0.01252252, + "auxiliary_loss_mlp": 0.00243267, + "balance_loss_clip": 1.03060102, + "balance_loss_mlp": 0.2168379, + "epoch": 0.8630692920486999, + "flos": 17961691570560.0, + "grad_norm": 33.88813979553913, + "language_loss": 0.93429881, + "learning_rate": 1.934053380181031e-07, + "loss": 0.94925404, + "num_input_tokens_seen": 309602910, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.26452637, + "step": 14355, + "time_per_iteration": 2.6136744022369385 + }, + { + "auxiliary_loss_clip": 0.01242973, + "auxiliary_loss_mlp": 0.00225865, + "balance_loss_clip": 1.02763867, + "balance_loss_mlp": 0.20092668, + "epoch": 0.8631294153013678, + "flos": 22455158140800.0, + "grad_norm": 8.856742713015937, + "language_loss": 0.68566531, + "learning_rate": 1.9323828734104763e-07, + "loss": 0.70035374, + "num_input_tokens_seen": 309621175, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.24963379, + "step": 14356, + "time_per_iteration": 2.6748008728027344 + }, + { + "auxiliary_loss_clip": 0.01250772, + "auxiliary_loss_mlp": 0.00219202, + "balance_loss_clip": 1.03030515, + "balance_loss_mlp": 0.19493124, + "epoch": 0.8631895385540358, + "flos": 16837005847680.0, + "grad_norm": 168.52669492376054, + "language_loss": 0.8197459, + "learning_rate": 1.9307130517657756e-07, + "loss": 0.83444566, + "num_input_tokens_seen": 309639395, + "router_z_loss_clip": 2.20214844, + "router_z_loss_mlp": 0.24316406, + "step": 14357, + "time_per_iteration": 2.6073689460754395 + }, + { + "auxiliary_loss_clip": 0.01245239, + "auxiliary_loss_mlp": 0.00206098, + "balance_loss_clip": 1.03124833, + "balance_loss_mlp": 0.18297181, + "epoch": 0.8632496618067037, + "flos": 18697214367360.0, + "grad_norm": 317.4374917359018, + "language_loss": 0.86893272, + "learning_rate": 1.9290439153102468e-07, + "loss": 0.88344616, + "num_input_tokens_seen": 309657265, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.23120117, + "step": 14358, + "time_per_iteration": 2.614860773086548 + }, + { + "auxiliary_loss_clip": 0.01246348, + "auxiliary_loss_mlp": 0.00221088, + "balance_loss_clip": 1.02995276, + "balance_loss_mlp": 0.19630449, + "epoch": 0.8633097850593717, + "flos": 24279922915200.0, + "grad_norm": 1.9281351862610359, + "language_loss": 0.81084478, + "learning_rate": 1.9273754641071816e-07, + "loss": 0.82551914, + "num_input_tokens_seen": 309678610, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.24829102, + "step": 14359, + "time_per_iteration": 2.6473844051361084 + }, + { + "auxiliary_loss_clip": 0.01245017, + "auxiliary_loss_mlp": 0.00222493, + "balance_loss_clip": 1.03252745, + "balance_loss_mlp": 0.19907999, + "epoch": 0.8633699083120396, + "flos": 21178569801600.0, + "grad_norm": 4.3550136305399185, + "language_loss": 0.80530798, + "learning_rate": 1.9257076982198517e-07, + "loss": 0.81998312, + "num_input_tokens_seen": 309697710, + "router_z_loss_clip": 2.12011719, + "router_z_loss_mlp": 0.234375, + "step": 14360, + "time_per_iteration": 2.632931709289551 + }, + { + "auxiliary_loss_clip": 0.01271001, + "auxiliary_loss_mlp": 0.00204415, + "balance_loss_clip": 1.04647851, + "balance_loss_mlp": 0.17721179, + "epoch": 0.8634300315647077, + "flos": 19244888012160.0, + "grad_norm": 35.57493504981243, + "language_loss": 0.84705859, + "learning_rate": 1.9240406177114953e-07, + "loss": 0.86181277, + "num_input_tokens_seen": 309715985, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.27197266, + "step": 14361, + "time_per_iteration": 2.647881269454956 + }, + { + "auxiliary_loss_clip": 0.01147759, + "auxiliary_loss_mlp": 0.00158342, + "balance_loss_clip": 1.00676966, + "balance_loss_mlp": 0.14975844, + "epoch": 0.8634901548173756, + "flos": 66195648282240.0, + "grad_norm": 0.9326061620710311, + "language_loss": 0.57640785, + "learning_rate": 1.922374222645329e-07, + "loss": 0.58946884, + "num_input_tokens_seen": 309779930, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.0859375, + "step": 14362, + "time_per_iteration": 3.147634506225586 + }, + { + "auxiliary_loss_clip": 0.01287004, + "auxiliary_loss_mlp": 0.00229038, + "balance_loss_clip": 1.05525565, + "balance_loss_mlp": 0.20142871, + "epoch": 0.8635502780700436, + "flos": 24789531121920.0, + "grad_norm": 34.015331800517096, + "language_loss": 0.86901867, + "learning_rate": 1.9207085130845524e-07, + "loss": 0.88417906, + "num_input_tokens_seen": 309800580, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.27624512, + "step": 14363, + "time_per_iteration": 2.7080166339874268 + }, + { + "auxiliary_loss_clip": 0.01256848, + "auxiliary_loss_mlp": 0.00227319, + "balance_loss_clip": 1.03181982, + "balance_loss_mlp": 0.20177273, + "epoch": 0.8636104013227116, + "flos": 25189970918400.0, + "grad_norm": 154.8147800357241, + "language_loss": 0.83278322, + "learning_rate": 1.9190434890923112e-07, + "loss": 0.8476249, + "num_input_tokens_seen": 309821725, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.25537109, + "step": 14364, + "time_per_iteration": 2.66694974899292 + }, + { + "auxiliary_loss_clip": 0.01248369, + "auxiliary_loss_mlp": 0.00228952, + "balance_loss_clip": 1.02545726, + "balance_loss_mlp": 0.20369177, + "epoch": 0.8636705245753795, + "flos": 23878441624320.0, + "grad_norm": 12.115761788400455, + "language_loss": 0.79367566, + "learning_rate": 1.917379150731755e-07, + "loss": 0.80844879, + "num_input_tokens_seen": 309841565, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.25231934, + "step": 14365, + "time_per_iteration": 4.091534376144409 + }, + { + "auxiliary_loss_clip": 0.01252736, + "auxiliary_loss_mlp": 0.00220843, + "balance_loss_clip": 1.03029251, + "balance_loss_mlp": 0.1928165, + "epoch": 0.8637306478280475, + "flos": 23110455911040.0, + "grad_norm": 5.848553572346952, + "language_loss": 0.80688179, + "learning_rate": 1.915715498065993e-07, + "loss": 0.82161748, + "num_input_tokens_seen": 309858635, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.27990723, + "step": 14366, + "time_per_iteration": 2.686221122741699 + }, + { + "auxiliary_loss_clip": 0.01221728, + "auxiliary_loss_mlp": 0.00213732, + "balance_loss_clip": 1.01367617, + "balance_loss_mlp": 0.19079632, + "epoch": 0.8637907710807154, + "flos": 21906802137600.0, + "grad_norm": 56.70632066217224, + "language_loss": 0.88949651, + "learning_rate": 1.9140525311581146e-07, + "loss": 0.90385103, + "num_input_tokens_seen": 309877885, + "router_z_loss_clip": 2.08203125, + "router_z_loss_mlp": 0.22961426, + "step": 14367, + "time_per_iteration": 4.041132211685181 + }, + { + "auxiliary_loss_clip": 0.01272835, + "auxiliary_loss_mlp": 0.00225237, + "balance_loss_clip": 1.0456388, + "balance_loss_mlp": 0.19860587, + "epoch": 0.8638508943333835, + "flos": 23580526222080.0, + "grad_norm": 12.398817603443801, + "language_loss": 0.68545729, + "learning_rate": 1.9123902500711743e-07, + "loss": 0.70043802, + "num_input_tokens_seen": 309893140, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.26623535, + "step": 14368, + "time_per_iteration": 2.685426712036133 + }, + { + "auxiliary_loss_clip": 0.01252904, + "auxiliary_loss_mlp": 0.00210983, + "balance_loss_clip": 1.03583288, + "balance_loss_mlp": 0.18700959, + "epoch": 0.8639110175860514, + "flos": 25775853655680.0, + "grad_norm": 13.654067418643026, + "language_loss": 0.83020103, + "learning_rate": 1.91072865486821e-07, + "loss": 0.84483993, + "num_input_tokens_seen": 309914175, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.23986816, + "step": 14369, + "time_per_iteration": 2.6966726779937744 + }, + { + "auxiliary_loss_clip": 0.01244145, + "auxiliary_loss_mlp": 0.00226549, + "balance_loss_clip": 1.02698576, + "balance_loss_mlp": 0.19888015, + "epoch": 0.8639711408387194, + "flos": 23369443948800.0, + "grad_norm": 32.181832929791405, + "language_loss": 0.7200315, + "learning_rate": 1.9090677456122294e-07, + "loss": 0.73473847, + "num_input_tokens_seen": 309932395, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.27661133, + "step": 14370, + "time_per_iteration": 2.6568968296051025 + }, + { + "auxiliary_loss_clip": 0.01253268, + "auxiliary_loss_mlp": 0.00222785, + "balance_loss_clip": 1.03275692, + "balance_loss_mlp": 0.19778684, + "epoch": 0.8640312640913873, + "flos": 22127221946880.0, + "grad_norm": 13.265204141229706, + "language_loss": 0.71840596, + "learning_rate": 1.907407522366209e-07, + "loss": 0.73316646, + "num_input_tokens_seen": 309951720, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.25024414, + "step": 14371, + "time_per_iteration": 2.6812589168548584 + }, + { + "auxiliary_loss_clip": 0.01149475, + "auxiliary_loss_mlp": 0.00103631, + "balance_loss_clip": 1.0086782, + "balance_loss_mlp": 0.09657407, + "epoch": 0.8640913873440553, + "flos": 57571735944960.0, + "grad_norm": 0.8590224396306199, + "language_loss": 0.56119263, + "learning_rate": 1.905747985193107e-07, + "loss": 0.57372367, + "num_input_tokens_seen": 310006120, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.07080078, + "step": 14372, + "time_per_iteration": 4.544673681259155 + }, + { + "auxiliary_loss_clip": 0.01236562, + "auxiliary_loss_mlp": 0.00210337, + "balance_loss_clip": 1.02008569, + "balance_loss_mlp": 0.18579221, + "epoch": 0.8641515105967232, + "flos": 23987430466560.0, + "grad_norm": 3.376889187154304, + "language_loss": 0.8695749, + "learning_rate": 1.9040891341558597e-07, + "loss": 0.88404387, + "num_input_tokens_seen": 310026740, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.24536133, + "step": 14373, + "time_per_iteration": 2.7180488109588623 + }, + { + "auxiliary_loss_clip": 0.0124318, + "auxiliary_loss_mlp": 0.00211091, + "balance_loss_clip": 1.02477527, + "balance_loss_mlp": 0.18710628, + "epoch": 0.8642116338493913, + "flos": 19062749122560.0, + "grad_norm": 5.543940126072593, + "language_loss": 0.7110002, + "learning_rate": 1.9024309693173656e-07, + "loss": 0.7255429, + "num_input_tokens_seen": 310044135, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.23962402, + "step": 14374, + "time_per_iteration": 2.628518581390381 + }, + { + "auxiliary_loss_clip": 0.01242312, + "auxiliary_loss_mlp": 0.002086, + "balance_loss_clip": 1.02750063, + "balance_loss_mlp": 0.18394795, + "epoch": 0.8642717571020592, + "flos": 18254148105600.0, + "grad_norm": 6.601886478847673, + "language_loss": 0.84035861, + "learning_rate": 1.9007734907404993e-07, + "loss": 0.85486776, + "num_input_tokens_seen": 310061560, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.24645996, + "step": 14375, + "time_per_iteration": 4.065406799316406 + }, + { + "auxiliary_loss_clip": 0.01248851, + "auxiliary_loss_mlp": 0.00219008, + "balance_loss_clip": 1.02858782, + "balance_loss_mlp": 0.19154185, + "epoch": 0.8643318803547272, + "flos": 57663270777600.0, + "grad_norm": 205.30196786310208, + "language_loss": 0.6869123, + "learning_rate": 1.899116698488117e-07, + "loss": 0.7015909, + "num_input_tokens_seen": 310087310, + "router_z_loss_clip": 2.20410156, + "router_z_loss_mlp": 0.27453613, + "step": 14376, + "time_per_iteration": 3.010939598083496 + }, + { + "auxiliary_loss_clip": 0.01237049, + "auxiliary_loss_mlp": 0.00214506, + "balance_loss_clip": 1.02083826, + "balance_loss_mlp": 0.19074759, + "epoch": 0.8643920036073952, + "flos": 19609524927360.0, + "grad_norm": 24.677814633856915, + "language_loss": 0.72927237, + "learning_rate": 1.8974605926230457e-07, + "loss": 0.74378788, + "num_input_tokens_seen": 310106260, + "router_z_loss_clip": 2.16113281, + "router_z_loss_mlp": 0.23742676, + "step": 14377, + "time_per_iteration": 2.6152167320251465 + }, + { + "auxiliary_loss_clip": 0.0122932, + "auxiliary_loss_mlp": 0.00230271, + "balance_loss_clip": 1.01664686, + "balance_loss_mlp": 0.20713274, + "epoch": 0.8644521268600631, + "flos": 20850346298880.0, + "grad_norm": 29.18133767689712, + "language_loss": 0.77176362, + "learning_rate": 1.8958051732080804e-07, + "loss": 0.78635955, + "num_input_tokens_seen": 310125305, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.23144531, + "step": 14378, + "time_per_iteration": 2.6468923091888428 + }, + { + "auxiliary_loss_clip": 0.01154119, + "auxiliary_loss_mlp": 0.00105537, + "balance_loss_clip": 1.01197362, + "balance_loss_mlp": 0.09795562, + "epoch": 0.8645122501127311, + "flos": 66719550101760.0, + "grad_norm": 0.8294144576564058, + "language_loss": 0.59613699, + "learning_rate": 1.894150440305995e-07, + "loss": 0.60873348, + "num_input_tokens_seen": 310189270, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.07568359, + "step": 14379, + "time_per_iteration": 3.152498722076416 + }, + { + "auxiliary_loss_clip": 0.01225399, + "auxiliary_loss_mlp": 0.00215313, + "balance_loss_clip": 1.01280916, + "balance_loss_mlp": 0.19176956, + "epoch": 0.864572373365399, + "flos": 21690009601920.0, + "grad_norm": 6.588715925072324, + "language_loss": 0.821769, + "learning_rate": 1.8924963939795478e-07, + "loss": 0.83617616, + "num_input_tokens_seen": 310208395, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.2355957, + "step": 14380, + "time_per_iteration": 2.687246561050415 + }, + { + "auxiliary_loss_clip": 0.01273462, + "auxiliary_loss_mlp": 0.00241655, + "balance_loss_clip": 1.04668736, + "balance_loss_mlp": 0.21534538, + "epoch": 0.8646324966180671, + "flos": 20266402896000.0, + "grad_norm": 2.641306064148575, + "language_loss": 0.83977842, + "learning_rate": 1.8908430342914473e-07, + "loss": 0.85492969, + "num_input_tokens_seen": 310227415, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.26306152, + "step": 14381, + "time_per_iteration": 2.711496114730835 + }, + { + "auxiliary_loss_clip": 0.01238861, + "auxiliary_loss_mlp": 0.00215655, + "balance_loss_clip": 1.0222789, + "balance_loss_mlp": 0.19090761, + "epoch": 0.864692619870735, + "flos": 11946188050560.0, + "grad_norm": 18.57091060437416, + "language_loss": 0.93912113, + "learning_rate": 1.8891903613043892e-07, + "loss": 0.95366633, + "num_input_tokens_seen": 310242625, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.24743652, + "step": 14382, + "time_per_iteration": 2.5789554119110107 + }, + { + "auxiliary_loss_clip": 0.01252127, + "auxiliary_loss_mlp": 0.00208841, + "balance_loss_clip": 1.03431284, + "balance_loss_mlp": 0.18563122, + "epoch": 0.864752743123403, + "flos": 21470703114240.0, + "grad_norm": 4.963112963675815, + "language_loss": 0.83482778, + "learning_rate": 1.8875383750810504e-07, + "loss": 0.84943748, + "num_input_tokens_seen": 310260585, + "router_z_loss_clip": 2.18457031, + "router_z_loss_mlp": 0.23217773, + "step": 14383, + "time_per_iteration": 2.7833220958709717 + }, + { + "auxiliary_loss_clip": 0.01237835, + "auxiliary_loss_mlp": 0.00226563, + "balance_loss_clip": 1.02435386, + "balance_loss_mlp": 0.20412821, + "epoch": 0.8648128663760709, + "flos": 19530018172800.0, + "grad_norm": 108.81259752096503, + "language_loss": 0.90615225, + "learning_rate": 1.8858870756840738e-07, + "loss": 0.92079622, + "num_input_tokens_seen": 310277210, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.22436523, + "step": 14384, + "time_per_iteration": 2.700066089630127 + }, + { + "auxiliary_loss_clip": 0.01243933, + "auxiliary_loss_mlp": 0.00208449, + "balance_loss_clip": 1.02140188, + "balance_loss_mlp": 0.18472642, + "epoch": 0.8648729896287389, + "flos": 21287953693440.0, + "grad_norm": 11.03662883926231, + "language_loss": 0.87645662, + "learning_rate": 1.884236463176072e-07, + "loss": 0.89098036, + "num_input_tokens_seen": 310296610, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.23742676, + "step": 14385, + "time_per_iteration": 2.696552276611328 + }, + { + "auxiliary_loss_clip": 0.012517, + "auxiliary_loss_mlp": 0.00235551, + "balance_loss_clip": 1.03415573, + "balance_loss_mlp": 0.21071957, + "epoch": 0.8649331128814068, + "flos": 24604483230720.0, + "grad_norm": 8.216199750306064, + "language_loss": 0.81087625, + "learning_rate": 1.8825865376196437e-07, + "loss": 0.8257488, + "num_input_tokens_seen": 310316830, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.24853516, + "step": 14386, + "time_per_iteration": 2.651942014694214 + }, + { + "auxiliary_loss_clip": 0.01239124, + "auxiliary_loss_mlp": 0.00195709, + "balance_loss_clip": 1.02496767, + "balance_loss_mlp": 0.17364338, + "epoch": 0.8649932361340749, + "flos": 15377811742080.0, + "grad_norm": 12.942378939306655, + "language_loss": 0.92440283, + "learning_rate": 1.8809372990773476e-07, + "loss": 0.9387511, + "num_input_tokens_seen": 310334355, + "router_z_loss_clip": 2.14355469, + "router_z_loss_mlp": 0.22070312, + "step": 14387, + "time_per_iteration": 2.6702756881713867 + }, + { + "auxiliary_loss_clip": 0.01240542, + "auxiliary_loss_mlp": 0.00207042, + "balance_loss_clip": 1.02479422, + "balance_loss_mlp": 0.18334323, + "epoch": 0.8650533593867428, + "flos": 19901227276800.0, + "grad_norm": 20.909125636602628, + "language_loss": 0.77960038, + "learning_rate": 1.8792887476117224e-07, + "loss": 0.7940762, + "num_input_tokens_seen": 310352900, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.23681641, + "step": 14388, + "time_per_iteration": 2.6260225772857666 + }, + { + "auxiliary_loss_clip": 0.01233868, + "auxiliary_loss_mlp": 0.00216564, + "balance_loss_clip": 1.02431917, + "balance_loss_mlp": 0.19383046, + "epoch": 0.8651134826394108, + "flos": 25626931868160.0, + "grad_norm": 7.5446668959443395, + "language_loss": 0.95925349, + "learning_rate": 1.877640883285283e-07, + "loss": 0.97375786, + "num_input_tokens_seen": 310372855, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.22741699, + "step": 14389, + "time_per_iteration": 2.7499094009399414 + }, + { + "auxiliary_loss_clip": 0.01234203, + "auxiliary_loss_mlp": 0.00214515, + "balance_loss_clip": 1.02179801, + "balance_loss_mlp": 0.19157892, + "epoch": 0.8651736058920788, + "flos": 18734525619840.0, + "grad_norm": 122.43791970226535, + "language_loss": 0.79264939, + "learning_rate": 1.8759937061605212e-07, + "loss": 0.80713665, + "num_input_tokens_seen": 310391595, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.22937012, + "step": 14390, + "time_per_iteration": 2.671577215194702 + }, + { + "auxiliary_loss_clip": 0.01233919, + "auxiliary_loss_mlp": 0.00211421, + "balance_loss_clip": 1.02254355, + "balance_loss_mlp": 0.18842587, + "epoch": 0.8652337291447467, + "flos": 20776765288320.0, + "grad_norm": 5.258311608757628, + "language_loss": 0.87733889, + "learning_rate": 1.8743472162998941e-07, + "loss": 0.8917923, + "num_input_tokens_seen": 310410090, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.22998047, + "step": 14391, + "time_per_iteration": 2.6889514923095703 + }, + { + "auxiliary_loss_clip": 0.01140267, + "auxiliary_loss_mlp": 0.00117403, + "balance_loss_clip": 1.00013411, + "balance_loss_mlp": 0.11153827, + "epoch": 0.8652938523974147, + "flos": 64227887464320.0, + "grad_norm": 0.7747449395480951, + "language_loss": 0.6727401, + "learning_rate": 1.8727014137658337e-07, + "loss": 0.6853168, + "num_input_tokens_seen": 310470055, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.05859375, + "step": 14392, + "time_per_iteration": 3.097878932952881 + }, + { + "auxiliary_loss_clip": 0.01260194, + "auxiliary_loss_mlp": 0.00238431, + "balance_loss_clip": 1.03528833, + "balance_loss_mlp": 0.21071491, + "epoch": 0.8653539756500827, + "flos": 18040587793920.0, + "grad_norm": 18.329437619150884, + "language_loss": 0.85503328, + "learning_rate": 1.8710562986207523e-07, + "loss": 0.87001956, + "num_input_tokens_seen": 310487665, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.27734375, + "step": 14393, + "time_per_iteration": 2.656834602355957 + }, + { + "auxiliary_loss_clip": 0.01245595, + "auxiliary_loss_mlp": 0.00223805, + "balance_loss_clip": 1.02764022, + "balance_loss_mlp": 0.19822286, + "epoch": 0.8654140989027507, + "flos": 17382416935680.0, + "grad_norm": 21.978875759767707, + "language_loss": 0.82920671, + "learning_rate": 1.8694118709270357e-07, + "loss": 0.84390068, + "num_input_tokens_seen": 310506130, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.25598145, + "step": 14394, + "time_per_iteration": 2.6428256034851074 + }, + { + "auxiliary_loss_clip": 0.01239165, + "auxiliary_loss_mlp": 0.00213702, + "balance_loss_clip": 1.02353001, + "balance_loss_mlp": 0.18914476, + "epoch": 0.8654742221554186, + "flos": 53284862448000.0, + "grad_norm": 8.883806201888433, + "language_loss": 0.72564995, + "learning_rate": 1.867768130747036e-07, + "loss": 0.74017859, + "num_input_tokens_seen": 310532445, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.24560547, + "step": 14395, + "time_per_iteration": 2.9636669158935547 + }, + { + "auxiliary_loss_clip": 0.01253879, + "auxiliary_loss_mlp": 0.00235001, + "balance_loss_clip": 1.04208183, + "balance_loss_mlp": 0.21104044, + "epoch": 0.8655343454080866, + "flos": 23914711382400.0, + "grad_norm": 14.085665904854377, + "language_loss": 0.76858211, + "learning_rate": 1.8661250781430838e-07, + "loss": 0.78347093, + "num_input_tokens_seen": 310552300, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.23937988, + "step": 14396, + "time_per_iteration": 2.774465322494507 + }, + { + "auxiliary_loss_clip": 0.01249969, + "auxiliary_loss_mlp": 0.00219636, + "balance_loss_clip": 1.02960896, + "balance_loss_mlp": 0.19667596, + "epoch": 0.8655944686607545, + "flos": 24097209408000.0, + "grad_norm": 38.363896738058294, + "language_loss": 0.78758061, + "learning_rate": 1.8644827131774954e-07, + "loss": 0.80227661, + "num_input_tokens_seen": 310572710, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.22973633, + "step": 14397, + "time_per_iteration": 2.6789660453796387 + }, + { + "auxiliary_loss_clip": 0.01233836, + "auxiliary_loss_mlp": 0.00228449, + "balance_loss_clip": 1.01672173, + "balance_loss_mlp": 0.20322424, + "epoch": 0.8656545919134225, + "flos": 23112718467840.0, + "grad_norm": 7.862851656968512, + "language_loss": 0.72673047, + "learning_rate": 1.86284103591253e-07, + "loss": 0.74135333, + "num_input_tokens_seen": 310592460, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.25244141, + "step": 14398, + "time_per_iteration": 2.703394651412964 + }, + { + "auxiliary_loss_clip": 0.01239356, + "auxiliary_loss_mlp": 0.00201403, + "balance_loss_clip": 1.01894426, + "balance_loss_mlp": 0.17798999, + "epoch": 0.8657147151660904, + "flos": 21141761339520.0, + "grad_norm": 292.5650207115117, + "language_loss": 0.8509922, + "learning_rate": 1.8612000464104517e-07, + "loss": 0.86539984, + "num_input_tokens_seen": 310609375, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.23425293, + "step": 14399, + "time_per_iteration": 2.6535515785217285 + }, + { + "auxiliary_loss_clip": 0.01231446, + "auxiliary_loss_mlp": 0.00208243, + "balance_loss_clip": 1.01775622, + "balance_loss_mlp": 0.1857129, + "epoch": 0.8657748384187585, + "flos": 16289439943680.0, + "grad_norm": 16.96858046722188, + "language_loss": 0.99793309, + "learning_rate": 1.8595597447334855e-07, + "loss": 1.01233006, + "num_input_tokens_seen": 310627405, + "router_z_loss_clip": 2.13574219, + "router_z_loss_mlp": 0.22546387, + "step": 14400, + "time_per_iteration": 2.6258764266967773 + }, + { + "auxiliary_loss_clip": 0.01229325, + "auxiliary_loss_mlp": 0.00236553, + "balance_loss_clip": 1.01848197, + "balance_loss_mlp": 0.21088734, + "epoch": 0.8658349616714264, + "flos": 30843890179200.0, + "grad_norm": 34.879371449753165, + "language_loss": 0.73249573, + "learning_rate": 1.8579201309438353e-07, + "loss": 0.74715453, + "num_input_tokens_seen": 310649945, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.25671387, + "step": 14401, + "time_per_iteration": 2.754730701446533 + }, + { + "auxiliary_loss_clip": 0.01266294, + "auxiliary_loss_mlp": 0.00218836, + "balance_loss_clip": 1.04048908, + "balance_loss_mlp": 0.19163197, + "epoch": 0.8658950849240944, + "flos": 18952862440320.0, + "grad_norm": 5.126452328879962, + "language_loss": 0.84613448, + "learning_rate": 1.8562812051036714e-07, + "loss": 0.86098576, + "num_input_tokens_seen": 310668285, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.2722168, + "step": 14402, + "time_per_iteration": 2.666607141494751 + }, + { + "auxiliary_loss_clip": 0.01228121, + "auxiliary_loss_mlp": 0.00203018, + "balance_loss_clip": 1.01738036, + "balance_loss_mlp": 0.18110704, + "epoch": 0.8659552081767624, + "flos": 23364344217600.0, + "grad_norm": 8.192125364689412, + "language_loss": 0.82053989, + "learning_rate": 1.8546429672751397e-07, + "loss": 0.83485126, + "num_input_tokens_seen": 310687015, + "router_z_loss_clip": 2.10351562, + "router_z_loss_mlp": 0.21899414, + "step": 14403, + "time_per_iteration": 2.6747066974639893 + }, + { + "auxiliary_loss_clip": 0.01251817, + "auxiliary_loss_mlp": 0.00226781, + "balance_loss_clip": 1.03836632, + "balance_loss_mlp": 0.2016044, + "epoch": 0.8660153314294303, + "flos": 23841992298240.0, + "grad_norm": 86.69113300028728, + "language_loss": 0.8095206, + "learning_rate": 1.853005417520368e-07, + "loss": 0.82430661, + "num_input_tokens_seen": 310707580, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.25183105, + "step": 14404, + "time_per_iteration": 2.71597957611084 + }, + { + "auxiliary_loss_clip": 0.01243052, + "auxiliary_loss_mlp": 0.00212941, + "balance_loss_clip": 1.02468002, + "balance_loss_mlp": 0.18800277, + "epoch": 0.8660754546820983, + "flos": 23112467072640.0, + "grad_norm": 19.89051435043802, + "language_loss": 0.79810834, + "learning_rate": 1.851368555901447e-07, + "loss": 0.8126682, + "num_input_tokens_seen": 310727300, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.24975586, + "step": 14405, + "time_per_iteration": 2.6641123294830322 + }, + { + "auxiliary_loss_clip": 0.01268758, + "auxiliary_loss_mlp": 0.00227738, + "balance_loss_clip": 1.04121065, + "balance_loss_mlp": 0.20048696, + "epoch": 0.8661355779347663, + "flos": 14391991998720.0, + "grad_norm": 16.150614698109077, + "language_loss": 0.76327878, + "learning_rate": 1.8497323824804467e-07, + "loss": 0.77824372, + "num_input_tokens_seen": 310744935, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.2722168, + "step": 14406, + "time_per_iteration": 2.620616912841797 + }, + { + "auxiliary_loss_clip": 0.01243063, + "auxiliary_loss_mlp": 0.00227335, + "balance_loss_clip": 1.02711117, + "balance_loss_mlp": 0.20411265, + "epoch": 0.8661957011874343, + "flos": 21870137329920.0, + "grad_norm": 4.359759604987049, + "language_loss": 0.89192414, + "learning_rate": 1.8480968973194177e-07, + "loss": 0.90662813, + "num_input_tokens_seen": 310765085, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.23205566, + "step": 14407, + "time_per_iteration": 4.053891658782959 + }, + { + "auxiliary_loss_clip": 0.01252002, + "auxiliary_loss_mlp": 0.00216195, + "balance_loss_clip": 1.03314495, + "balance_loss_mlp": 0.19290137, + "epoch": 0.8662558244401022, + "flos": 21835160461440.0, + "grad_norm": 63.02038596845861, + "language_loss": 0.80491132, + "learning_rate": 1.8464621004803748e-07, + "loss": 0.81959331, + "num_input_tokens_seen": 310783260, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.23303223, + "step": 14408, + "time_per_iteration": 2.6535091400146484 + }, + { + "auxiliary_loss_clip": 0.01222907, + "auxiliary_loss_mlp": 0.00190276, + "balance_loss_clip": 1.01665354, + "balance_loss_mlp": 0.16794786, + "epoch": 0.8663159476927702, + "flos": 17384104874880.0, + "grad_norm": 40.455557470668964, + "language_loss": 0.86307043, + "learning_rate": 1.844827992025304e-07, + "loss": 0.87720227, + "num_input_tokens_seen": 310801970, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.2232666, + "step": 14409, + "time_per_iteration": 4.104370594024658 + }, + { + "auxiliary_loss_clip": 0.01264265, + "auxiliary_loss_mlp": 0.00222431, + "balance_loss_clip": 1.04012597, + "balance_loss_mlp": 0.1964554, + "epoch": 0.8663760709454381, + "flos": 22747722416640.0, + "grad_norm": 6.367403859302036, + "language_loss": 0.87569445, + "learning_rate": 1.8431945720161757e-07, + "loss": 0.89056146, + "num_input_tokens_seen": 310822070, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.25952148, + "step": 14410, + "time_per_iteration": 2.7467525005340576 + }, + { + "auxiliary_loss_clip": 0.01246463, + "auxiliary_loss_mlp": 0.00211102, + "balance_loss_clip": 1.02793646, + "balance_loss_mlp": 0.18826175, + "epoch": 0.8664361941981061, + "flos": 17376850327680.0, + "grad_norm": 12.454712624612602, + "language_loss": 0.86189008, + "learning_rate": 1.8415618405149315e-07, + "loss": 0.87646574, + "num_input_tokens_seen": 310838355, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.22827148, + "step": 14411, + "time_per_iteration": 2.644090414047241 + }, + { + "auxiliary_loss_clip": 0.01220528, + "auxiliary_loss_mlp": 0.00222443, + "balance_loss_clip": 1.01042342, + "balance_loss_mlp": 0.19973385, + "epoch": 0.866496317450774, + "flos": 16034438315520.0, + "grad_norm": 10.227516009729678, + "language_loss": 0.81355405, + "learning_rate": 1.8399297975834794e-07, + "loss": 0.8279838, + "num_input_tokens_seen": 310856055, + "router_z_loss_clip": 2.10351562, + "router_z_loss_mlp": 0.22741699, + "step": 14412, + "time_per_iteration": 2.6722054481506348 + }, + { + "auxiliary_loss_clip": 0.01243708, + "auxiliary_loss_mlp": 0.00201883, + "balance_loss_clip": 1.02777004, + "balance_loss_mlp": 0.17895931, + "epoch": 0.8665564407034421, + "flos": 20814830726400.0, + "grad_norm": 30.21029381658901, + "language_loss": 0.77283454, + "learning_rate": 1.83829844328371e-07, + "loss": 0.78729039, + "num_input_tokens_seen": 310876695, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.22937012, + "step": 14413, + "time_per_iteration": 2.671900510787964 + }, + { + "auxiliary_loss_clip": 0.01242969, + "auxiliary_loss_mlp": 0.0022233, + "balance_loss_clip": 1.02377892, + "balance_loss_mlp": 0.19934618, + "epoch": 0.86661656395611, + "flos": 15815167741440.0, + "grad_norm": 21.145821436005885, + "language_loss": 0.73288125, + "learning_rate": 1.8366677776774874e-07, + "loss": 0.74753428, + "num_input_tokens_seen": 310893880, + "router_z_loss_clip": 2.19238281, + "router_z_loss_mlp": 0.22973633, + "step": 14414, + "time_per_iteration": 2.6783015727996826 + }, + { + "auxiliary_loss_clip": 0.01236498, + "auxiliary_loss_mlp": 0.00216513, + "balance_loss_clip": 1.02131832, + "balance_loss_mlp": 0.1914556, + "epoch": 0.866676687208778, + "flos": 23036910814080.0, + "grad_norm": 37.670684273387245, + "language_loss": 0.70843768, + "learning_rate": 1.8350378008266377e-07, + "loss": 0.72296774, + "num_input_tokens_seen": 310914145, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.25036621, + "step": 14415, + "time_per_iteration": 4.13609766960144 + }, + { + "auxiliary_loss_clip": 0.01128294, + "auxiliary_loss_mlp": 0.00078283, + "balance_loss_clip": 0.99091297, + "balance_loss_mlp": 0.07198897, + "epoch": 0.866736810461446, + "flos": 63802275212160.0, + "grad_norm": 0.787172171129337, + "language_loss": 0.59926486, + "learning_rate": 1.8334085127929754e-07, + "loss": 0.61133063, + "num_input_tokens_seen": 310972825, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.06298828, + "step": 14416, + "time_per_iteration": 3.2247490882873535 + }, + { + "auxiliary_loss_clip": 0.01251512, + "auxiliary_loss_mlp": 0.00220719, + "balance_loss_clip": 1.03084087, + "balance_loss_mlp": 0.19510069, + "epoch": 0.8667969337141139, + "flos": 20449367798400.0, + "grad_norm": 980.640818664037, + "language_loss": 0.8297472, + "learning_rate": 1.831779913638285e-07, + "loss": 0.84446955, + "num_input_tokens_seen": 310992050, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.25646973, + "step": 14417, + "time_per_iteration": 2.6243443489074707 + }, + { + "auxiliary_loss_clip": 0.01241865, + "auxiliary_loss_mlp": 0.00215285, + "balance_loss_clip": 1.02233267, + "balance_loss_mlp": 0.19073938, + "epoch": 0.866857056966782, + "flos": 21653703930240.0, + "grad_norm": 5.087363103491599, + "language_loss": 0.83104503, + "learning_rate": 1.830152003424319e-07, + "loss": 0.84561652, + "num_input_tokens_seen": 311011105, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.2454834, + "step": 14418, + "time_per_iteration": 4.033541917800903 + }, + { + "auxiliary_loss_clip": 0.01217321, + "auxiliary_loss_mlp": 0.00189642, + "balance_loss_clip": 1.00653458, + "balance_loss_mlp": 0.1660144, + "epoch": 0.8669171802194499, + "flos": 22852832590080.0, + "grad_norm": 6.681436647483804, + "language_loss": 0.75300062, + "learning_rate": 1.8285247822128126e-07, + "loss": 0.76707023, + "num_input_tokens_seen": 311032080, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.2364502, + "step": 14419, + "time_per_iteration": 2.6571156978607178 + }, + { + "auxiliary_loss_clip": 0.01234311, + "auxiliary_loss_mlp": 0.00204928, + "balance_loss_clip": 1.02071548, + "balance_loss_mlp": 0.18029936, + "epoch": 0.8669773034721179, + "flos": 18734166483840.0, + "grad_norm": 11.676271653736977, + "language_loss": 0.86335301, + "learning_rate": 1.826898250065465e-07, + "loss": 0.87774539, + "num_input_tokens_seen": 311049735, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.24633789, + "step": 14420, + "time_per_iteration": 2.6447904109954834 + }, + { + "auxiliary_loss_clip": 0.01232858, + "auxiliary_loss_mlp": 0.00211396, + "balance_loss_clip": 1.01879215, + "balance_loss_mlp": 0.18750647, + "epoch": 0.8670374267247858, + "flos": 18916018064640.0, + "grad_norm": 140.98921641469622, + "language_loss": 0.89397126, + "learning_rate": 1.8252724070439586e-07, + "loss": 0.90841377, + "num_input_tokens_seen": 311067675, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.2388916, + "step": 14421, + "time_per_iteration": 2.696129083633423 + }, + { + "auxiliary_loss_clip": 0.01131538, + "auxiliary_loss_mlp": 0.00151688, + "balance_loss_clip": 0.99352878, + "balance_loss_mlp": 0.14420199, + "epoch": 0.8670975499774538, + "flos": 48814527214080.0, + "grad_norm": 0.7276496043649967, + "language_loss": 0.48243839, + "learning_rate": 1.823647253209941e-07, + "loss": 0.49527067, + "num_input_tokens_seen": 311126605, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.07470703, + "step": 14422, + "time_per_iteration": 3.156510829925537 + }, + { + "auxiliary_loss_clip": 0.01243428, + "auxiliary_loss_mlp": 0.00211733, + "balance_loss_clip": 1.02738857, + "balance_loss_mlp": 0.18859471, + "epoch": 0.8671576732301217, + "flos": 26136145025280.0, + "grad_norm": 2.1205543439032493, + "language_loss": 0.82059848, + "learning_rate": 1.8220227886250417e-07, + "loss": 0.83515012, + "num_input_tokens_seen": 311147325, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.23132324, + "step": 14423, + "time_per_iteration": 2.6857030391693115 + }, + { + "auxiliary_loss_clip": 0.01217235, + "auxiliary_loss_mlp": 0.00212977, + "balance_loss_clip": 1.00788581, + "balance_loss_mlp": 0.18944532, + "epoch": 0.8672177964827897, + "flos": 18367446579840.0, + "grad_norm": 4.721070032725136, + "language_loss": 0.82072103, + "learning_rate": 1.8203990133508684e-07, + "loss": 0.83502316, + "num_input_tokens_seen": 311165385, + "router_z_loss_clip": 2.09277344, + "router_z_loss_mlp": 0.23535156, + "step": 14424, + "time_per_iteration": 2.626976251602173 + }, + { + "auxiliary_loss_clip": 0.01212536, + "auxiliary_loss_mlp": 0.00196343, + "balance_loss_clip": 1.00832975, + "balance_loss_mlp": 0.17351469, + "epoch": 0.8672779197354576, + "flos": 28545355992960.0, + "grad_norm": 655.0083146338258, + "language_loss": 0.77178931, + "learning_rate": 1.8187759274489767e-07, + "loss": 0.78587812, + "num_input_tokens_seen": 311185860, + "router_z_loss_clip": 2.04492188, + "router_z_loss_mlp": 0.22827148, + "step": 14425, + "time_per_iteration": 2.7270257472991943 + }, + { + "auxiliary_loss_clip": 0.01254177, + "auxiliary_loss_mlp": 0.0023119, + "balance_loss_clip": 1.03431845, + "balance_loss_mlp": 0.20631056, + "epoch": 0.8673380429881257, + "flos": 22382474970240.0, + "grad_norm": 16.336243606853476, + "language_loss": 0.75594604, + "learning_rate": 1.817153530980926e-07, + "loss": 0.7707997, + "num_input_tokens_seen": 311205810, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.24865723, + "step": 14426, + "time_per_iteration": 2.6846866607666016 + }, + { + "auxiliary_loss_clip": 0.01263695, + "auxiliary_loss_mlp": 0.00224122, + "balance_loss_clip": 1.03689492, + "balance_loss_mlp": 0.19646585, + "epoch": 0.8673981662407936, + "flos": 20996430912000.0, + "grad_norm": 24.370743604756942, + "language_loss": 0.79621732, + "learning_rate": 1.815531824008234e-07, + "loss": 0.81109554, + "num_input_tokens_seen": 311226080, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.27648926, + "step": 14427, + "time_per_iteration": 2.692060947418213 + }, + { + "auxiliary_loss_clip": 0.01225461, + "auxiliary_loss_mlp": 0.00219228, + "balance_loss_clip": 1.0140028, + "balance_loss_mlp": 0.19564791, + "epoch": 0.8674582894934616, + "flos": 24426797627520.0, + "grad_norm": 166.29935808867512, + "language_loss": 0.76380277, + "learning_rate": 1.8139108065924004e-07, + "loss": 0.77824962, + "num_input_tokens_seen": 311246380, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.2355957, + "step": 14428, + "time_per_iteration": 2.7054030895233154 + }, + { + "auxiliary_loss_clip": 0.01237441, + "auxiliary_loss_mlp": 0.00215559, + "balance_loss_clip": 1.02201843, + "balance_loss_mlp": 0.19220544, + "epoch": 0.8675184127461296, + "flos": 20737514701440.0, + "grad_norm": 4.269687509827742, + "language_loss": 0.78764737, + "learning_rate": 1.812290478794889e-07, + "loss": 0.80217737, + "num_input_tokens_seen": 311266465, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.23352051, + "step": 14429, + "time_per_iteration": 2.8552002906799316 + }, + { + "auxiliary_loss_clip": 0.01238557, + "auxiliary_loss_mlp": 0.00198632, + "balance_loss_clip": 1.02138162, + "balance_loss_mlp": 0.17501622, + "epoch": 0.8675785359987975, + "flos": 19135647774720.0, + "grad_norm": 4.204298931177658, + "language_loss": 0.77968502, + "learning_rate": 1.810670840677151e-07, + "loss": 0.79405689, + "num_input_tokens_seen": 311285075, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.23608398, + "step": 14430, + "time_per_iteration": 2.6675286293029785 + }, + { + "auxiliary_loss_clip": 0.01244113, + "auxiliary_loss_mlp": 0.00222239, + "balance_loss_clip": 1.02326143, + "balance_loss_mlp": 0.19808714, + "epoch": 0.8676386592514655, + "flos": 22710662559360.0, + "grad_norm": 71.64296434698727, + "language_loss": 0.76824564, + "learning_rate": 1.8090518923005948e-07, + "loss": 0.78290915, + "num_input_tokens_seen": 311303230, + "router_z_loss_clip": 2.20800781, + "router_z_loss_mlp": 0.24157715, + "step": 14431, + "time_per_iteration": 2.6766357421875 + }, + { + "auxiliary_loss_clip": 0.01257566, + "auxiliary_loss_mlp": 0.00205884, + "balance_loss_clip": 1.0330106, + "balance_loss_mlp": 0.1805045, + "epoch": 0.8676987825041335, + "flos": 14209853109120.0, + "grad_norm": 29.788487520721386, + "language_loss": 0.75343692, + "learning_rate": 1.8074336337266116e-07, + "loss": 0.76807141, + "num_input_tokens_seen": 311318070, + "router_z_loss_clip": 2.24316406, + "router_z_loss_mlp": 0.25390625, + "step": 14432, + "time_per_iteration": 2.6552889347076416 + }, + { + "auxiliary_loss_clip": 0.01252492, + "auxiliary_loss_mlp": 0.00209199, + "balance_loss_clip": 1.03458095, + "balance_loss_mlp": 0.185166, + "epoch": 0.8677589057568015, + "flos": 13589927256960.0, + "grad_norm": 70.93752956941222, + "language_loss": 0.8876788, + "learning_rate": 1.8058160650165656e-07, + "loss": 0.90229571, + "num_input_tokens_seen": 311334885, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.24035645, + "step": 14433, + "time_per_iteration": 2.6424641609191895 + }, + { + "auxiliary_loss_clip": 0.01134133, + "auxiliary_loss_mlp": 0.00089919, + "balance_loss_clip": 0.99428189, + "balance_loss_mlp": 0.08398274, + "epoch": 0.8678190290094694, + "flos": 68933657370240.0, + "grad_norm": 0.7024979810915418, + "language_loss": 0.57549173, + "learning_rate": 1.804199186231805e-07, + "loss": 0.58773226, + "num_input_tokens_seen": 311399780, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.05932617, + "step": 14434, + "time_per_iteration": 3.229625940322876 + }, + { + "auxiliary_loss_clip": 0.01221965, + "auxiliary_loss_mlp": 0.0020923, + "balance_loss_clip": 1.01252651, + "balance_loss_mlp": 0.18646134, + "epoch": 0.8678791522621374, + "flos": 32557726776960.0, + "grad_norm": 17.720751389474906, + "language_loss": 0.85944796, + "learning_rate": 1.802582997433628e-07, + "loss": 0.87375993, + "num_input_tokens_seen": 311419610, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.22766113, + "step": 14435, + "time_per_iteration": 2.747859239578247 + }, + { + "auxiliary_loss_clip": 0.01249511, + "auxiliary_loss_mlp": 0.00222805, + "balance_loss_clip": 1.02756238, + "balance_loss_mlp": 0.19584012, + "epoch": 0.8679392755148053, + "flos": 35042637657600.0, + "grad_norm": 8.90087199043531, + "language_loss": 0.72887003, + "learning_rate": 1.8009674986833322e-07, + "loss": 0.74359322, + "num_input_tokens_seen": 311440045, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.26965332, + "step": 14436, + "time_per_iteration": 2.859226703643799 + }, + { + "auxiliary_loss_clip": 0.01257131, + "auxiliary_loss_mlp": 0.00206988, + "balance_loss_clip": 1.03548217, + "balance_loss_mlp": 0.18059464, + "epoch": 0.8679993987674733, + "flos": 18552494471040.0, + "grad_norm": 5.72616863708048, + "language_loss": 0.77667844, + "learning_rate": 1.7993526900421706e-07, + "loss": 0.79131961, + "num_input_tokens_seen": 311456660, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.26416016, + "step": 14437, + "time_per_iteration": 2.6287810802459717 + }, + { + "auxiliary_loss_clip": 0.01252688, + "auxiliary_loss_mlp": 0.00224476, + "balance_loss_clip": 1.03579092, + "balance_loss_mlp": 0.20113534, + "epoch": 0.8680595220201412, + "flos": 27454390162560.0, + "grad_norm": 5.427303325734982, + "language_loss": 0.87289059, + "learning_rate": 1.797738571571381e-07, + "loss": 0.88766229, + "num_input_tokens_seen": 311475460, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.23327637, + "step": 14438, + "time_per_iteration": 2.772761821746826 + }, + { + "auxiliary_loss_clip": 0.01226867, + "auxiliary_loss_mlp": 0.00208268, + "balance_loss_clip": 1.01828671, + "balance_loss_mlp": 0.18443799, + "epoch": 0.8681196452728093, + "flos": 19208797822080.0, + "grad_norm": 26.92967881897053, + "language_loss": 0.75124097, + "learning_rate": 1.7961251433321656e-07, + "loss": 0.76559234, + "num_input_tokens_seen": 311494575, + "router_z_loss_clip": 2.0859375, + "router_z_loss_mlp": 0.23815918, + "step": 14439, + "time_per_iteration": 2.660846471786499 + }, + { + "auxiliary_loss_clip": 0.0122355, + "auxiliary_loss_mlp": 0.00214087, + "balance_loss_clip": 1.01394606, + "balance_loss_mlp": 0.19081748, + "epoch": 0.8681797685254772, + "flos": 37560442417920.0, + "grad_norm": 48.79252300773218, + "language_loss": 0.72423911, + "learning_rate": 1.7945124053857085e-07, + "loss": 0.73861539, + "num_input_tokens_seen": 311515805, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.23254395, + "step": 14440, + "time_per_iteration": 2.941610097885132 + }, + { + "auxiliary_loss_clip": 0.0123467, + "auxiliary_loss_mlp": 0.00214835, + "balance_loss_clip": 1.02068806, + "balance_loss_mlp": 0.18947887, + "epoch": 0.8682398917781452, + "flos": 23289937194240.0, + "grad_norm": 77.91177420800938, + "language_loss": 0.73050857, + "learning_rate": 1.7929003577931722e-07, + "loss": 0.74500358, + "num_input_tokens_seen": 311536000, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.25354004, + "step": 14441, + "time_per_iteration": 2.808337688446045 + }, + { + "auxiliary_loss_clip": 0.01224636, + "auxiliary_loss_mlp": 0.00204069, + "balance_loss_clip": 1.0165, + "balance_loss_mlp": 0.18038228, + "epoch": 0.8683000150308132, + "flos": 21872794936320.0, + "grad_norm": 45.48090989881931, + "language_loss": 0.74201071, + "learning_rate": 1.7912890006156722e-07, + "loss": 0.75629783, + "num_input_tokens_seen": 311556220, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.23669434, + "step": 14442, + "time_per_iteration": 2.7357115745544434 + }, + { + "auxiliary_loss_clip": 0.01250109, + "auxiliary_loss_mlp": 0.00231354, + "balance_loss_clip": 1.02687919, + "balance_loss_mlp": 0.20577207, + "epoch": 0.8683601382834811, + "flos": 14647209108480.0, + "grad_norm": 4.047999401479111, + "language_loss": 0.79764593, + "learning_rate": 1.7896783339143195e-07, + "loss": 0.81246054, + "num_input_tokens_seen": 311572530, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.25585938, + "step": 14443, + "time_per_iteration": 2.6591262817382812 + }, + { + "auxiliary_loss_clip": 0.01251148, + "auxiliary_loss_mlp": 0.00214168, + "balance_loss_clip": 1.03292799, + "balance_loss_mlp": 0.19055283, + "epoch": 0.8684202615361492, + "flos": 26359904799360.0, + "grad_norm": 2.2218030562216695, + "language_loss": 0.8985728, + "learning_rate": 1.7880683577501877e-07, + "loss": 0.91322601, + "num_input_tokens_seen": 311591105, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.23620605, + "step": 14444, + "time_per_iteration": 2.698276996612549 + }, + { + "auxiliary_loss_clip": 0.01251323, + "auxiliary_loss_mlp": 0.00217001, + "balance_loss_clip": 1.02832842, + "balance_loss_mlp": 0.19020227, + "epoch": 0.8684803847888171, + "flos": 20704010290560.0, + "grad_norm": 12.313134326894172, + "language_loss": 0.85530615, + "learning_rate": 1.7864590721843342e-07, + "loss": 0.8699894, + "num_input_tokens_seen": 311608350, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.26782227, + "step": 14445, + "time_per_iteration": 2.662186622619629 + }, + { + "auxiliary_loss_clip": 0.01257648, + "auxiliary_loss_mlp": 0.00206508, + "balance_loss_clip": 1.04279423, + "balance_loss_mlp": 0.18228433, + "epoch": 0.8685405080414851, + "flos": 22638123043200.0, + "grad_norm": 36.1010968906973, + "language_loss": 0.76787734, + "learning_rate": 1.7848504772777728e-07, + "loss": 0.78251892, + "num_input_tokens_seen": 311626380, + "router_z_loss_clip": 2.14746094, + "router_z_loss_mlp": 0.24206543, + "step": 14446, + "time_per_iteration": 2.7264111042022705 + }, + { + "auxiliary_loss_clip": 0.01240262, + "auxiliary_loss_mlp": 0.00229536, + "balance_loss_clip": 1.02345335, + "balance_loss_mlp": 0.20625478, + "epoch": 0.868600631294153, + "flos": 24822065865600.0, + "grad_norm": 18.59204443926526, + "language_loss": 0.88548744, + "learning_rate": 1.7832425730915102e-07, + "loss": 0.90018547, + "num_input_tokens_seen": 311644345, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.23291016, + "step": 14447, + "time_per_iteration": 2.7392091751098633 + }, + { + "auxiliary_loss_clip": 0.01238199, + "auxiliary_loss_mlp": 0.00208946, + "balance_loss_clip": 1.02486157, + "balance_loss_mlp": 0.18493719, + "epoch": 0.868660754546821, + "flos": 25113983696640.0, + "grad_norm": 218.7240625937966, + "language_loss": 0.79137343, + "learning_rate": 1.781635359686515e-07, + "loss": 0.8058449, + "num_input_tokens_seen": 311663340, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.2401123, + "step": 14448, + "time_per_iteration": 2.681224822998047 + }, + { + "auxiliary_loss_clip": 0.01244249, + "auxiliary_loss_mlp": 0.00208597, + "balance_loss_clip": 1.02916217, + "balance_loss_mlp": 0.18527961, + "epoch": 0.8687208777994889, + "flos": 12677832178560.0, + "grad_norm": 5.216532892274374, + "language_loss": 0.87833142, + "learning_rate": 1.7800288371237303e-07, + "loss": 0.89285988, + "num_input_tokens_seen": 311679860, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.23327637, + "step": 14449, + "time_per_iteration": 4.085373163223267 + }, + { + "auxiliary_loss_clip": 0.01136429, + "auxiliary_loss_mlp": 0.00105901, + "balance_loss_clip": 0.9980613, + "balance_loss_mlp": 0.09970205, + "epoch": 0.8687810010521569, + "flos": 65617235573760.0, + "grad_norm": 0.8330900777375345, + "language_loss": 0.59758663, + "learning_rate": 1.7784230054640758e-07, + "loss": 0.61000991, + "num_input_tokens_seen": 311738135, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.06176758, + "step": 14450, + "time_per_iteration": 3.105616807937622 + }, + { + "auxiliary_loss_clip": 0.01243383, + "auxiliary_loss_mlp": 0.00217641, + "balance_loss_clip": 1.02726805, + "balance_loss_mlp": 0.19264235, + "epoch": 0.8688411243048249, + "flos": 24244012293120.0, + "grad_norm": 37.30710260819788, + "language_loss": 0.83444375, + "learning_rate": 1.7768178647684517e-07, + "loss": 0.84905398, + "num_input_tokens_seen": 311756975, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.25012207, + "step": 14451, + "time_per_iteration": 4.143481731414795 + }, + { + "auxiliary_loss_clip": 0.01252032, + "auxiliary_loss_mlp": 0.0022744, + "balance_loss_clip": 1.03320909, + "balance_loss_mlp": 0.20186959, + "epoch": 0.8689012475574929, + "flos": 18221828843520.0, + "grad_norm": 15.213254317624477, + "language_loss": 0.79059625, + "learning_rate": 1.7752134150977205e-07, + "loss": 0.80539101, + "num_input_tokens_seen": 311771830, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.25610352, + "step": 14452, + "time_per_iteration": 2.6442818641662598 + }, + { + "auxiliary_loss_clip": 0.01272597, + "auxiliary_loss_mlp": 0.00234817, + "balance_loss_clip": 1.04509568, + "balance_loss_mlp": 0.20747015, + "epoch": 0.8689613708101608, + "flos": 19646728439040.0, + "grad_norm": 7.578078017777437, + "language_loss": 0.79736006, + "learning_rate": 1.7736096565127201e-07, + "loss": 0.81243414, + "num_input_tokens_seen": 311790130, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.27331543, + "step": 14453, + "time_per_iteration": 2.708402156829834 + }, + { + "auxiliary_loss_clip": 0.0123231, + "auxiliary_loss_mlp": 0.00199639, + "balance_loss_clip": 1.01972747, + "balance_loss_mlp": 0.1762263, + "epoch": 0.8690214940628288, + "flos": 11728749070080.0, + "grad_norm": 10.049987043127816, + "language_loss": 0.83503371, + "learning_rate": 1.7720065890742664e-07, + "loss": 0.84935319, + "num_input_tokens_seen": 311808360, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.23388672, + "step": 14454, + "time_per_iteration": 2.642094373703003 + }, + { + "auxiliary_loss_clip": 0.01263672, + "auxiliary_loss_mlp": 0.00198473, + "balance_loss_clip": 1.04338551, + "balance_loss_mlp": 0.1756202, + "epoch": 0.8690816173154968, + "flos": 34936450076160.0, + "grad_norm": 65.83516551867336, + "language_loss": 0.66331506, + "learning_rate": 1.7704042128431552e-07, + "loss": 0.67793649, + "num_input_tokens_seen": 311831325, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.22827148, + "step": 14455, + "time_per_iteration": 2.880167245864868 + }, + { + "auxiliary_loss_clip": 0.01236798, + "auxiliary_loss_mlp": 0.00211579, + "balance_loss_clip": 1.02215004, + "balance_loss_mlp": 0.18879855, + "epoch": 0.8691417405681647, + "flos": 11614804151040.0, + "grad_norm": 138.72649612675872, + "language_loss": 0.91189021, + "learning_rate": 1.7688025278801378e-07, + "loss": 0.92637396, + "num_input_tokens_seen": 311848090, + "router_z_loss_clip": 2.14160156, + "router_z_loss_mlp": 0.22753906, + "step": 14456, + "time_per_iteration": 2.593683958053589 + }, + { + "auxiliary_loss_clip": 0.01271912, + "auxiliary_loss_mlp": 0.00214227, + "balance_loss_clip": 1.04475009, + "balance_loss_mlp": 0.1868448, + "epoch": 0.8692018638208328, + "flos": 24608038677120.0, + "grad_norm": 15.095050253067795, + "language_loss": 0.8905015, + "learning_rate": 1.7672015342459568e-07, + "loss": 0.90536284, + "num_input_tokens_seen": 311867855, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.27368164, + "step": 14457, + "time_per_iteration": 4.158725738525391 + }, + { + "auxiliary_loss_clip": 0.01229659, + "auxiliary_loss_mlp": 0.00209541, + "balance_loss_clip": 1.02386093, + "balance_loss_mlp": 0.18667704, + "epoch": 0.8692619870735007, + "flos": 25995124229760.0, + "grad_norm": 26.58316767470622, + "language_loss": 0.85039681, + "learning_rate": 1.765601232001328e-07, + "loss": 0.86478883, + "num_input_tokens_seen": 311888675, + "router_z_loss_clip": 2.05664062, + "router_z_loss_mlp": 0.2286377, + "step": 14458, + "time_per_iteration": 2.7336814403533936 + }, + { + "auxiliary_loss_clip": 0.0126188, + "auxiliary_loss_mlp": 0.00220396, + "balance_loss_clip": 1.04235756, + "balance_loss_mlp": 0.19478981, + "epoch": 0.8693221103261687, + "flos": 18041808856320.0, + "grad_norm": 11.062678181025008, + "language_loss": 0.78712213, + "learning_rate": 1.7640016212069187e-07, + "loss": 0.80194485, + "num_input_tokens_seen": 311907310, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.25646973, + "step": 14459, + "time_per_iteration": 2.6156773567199707 + }, + { + "auxiliary_loss_clip": 0.01214763, + "auxiliary_loss_mlp": 0.00220933, + "balance_loss_clip": 1.0098691, + "balance_loss_mlp": 0.19797295, + "epoch": 0.8693822335788366, + "flos": 27492347859840.0, + "grad_norm": 110.63369094961008, + "language_loss": 0.79828227, + "learning_rate": 1.762402701923398e-07, + "loss": 0.81263924, + "num_input_tokens_seen": 311929635, + "router_z_loss_clip": 2.04980469, + "router_z_loss_mlp": 0.22973633, + "step": 14460, + "time_per_iteration": 4.112328052520752 + }, + { + "auxiliary_loss_clip": 0.01254736, + "auxiliary_loss_mlp": 0.00228546, + "balance_loss_clip": 1.0295577, + "balance_loss_mlp": 0.20226039, + "epoch": 0.8694423568315046, + "flos": 24097712198400.0, + "grad_norm": 12.034693757717, + "language_loss": 0.73447168, + "learning_rate": 1.7608044742113947e-07, + "loss": 0.74930453, + "num_input_tokens_seen": 311948800, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.26293945, + "step": 14461, + "time_per_iteration": 2.6623528003692627 + }, + { + "auxiliary_loss_clip": 0.01237624, + "auxiliary_loss_mlp": 0.00217633, + "balance_loss_clip": 1.01984239, + "balance_loss_mlp": 0.19189554, + "epoch": 0.8695024800841725, + "flos": 18362131367040.0, + "grad_norm": 27.844154832878786, + "language_loss": 0.91448706, + "learning_rate": 1.7592069381315123e-07, + "loss": 0.9290396, + "num_input_tokens_seen": 311964090, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.25769043, + "step": 14462, + "time_per_iteration": 2.604078769683838 + }, + { + "auxiliary_loss_clip": 0.01266346, + "auxiliary_loss_mlp": 0.00215595, + "balance_loss_clip": 1.04371762, + "balance_loss_mlp": 0.19050106, + "epoch": 0.8695626033368405, + "flos": 14027750133120.0, + "grad_norm": 16.422594601264986, + "language_loss": 0.74908972, + "learning_rate": 1.757610093744335e-07, + "loss": 0.7639091, + "num_input_tokens_seen": 311981460, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.25109863, + "step": 14463, + "time_per_iteration": 2.6005260944366455 + }, + { + "auxiliary_loss_clip": 0.01256872, + "auxiliary_loss_mlp": 0.00233202, + "balance_loss_clip": 1.03645873, + "balance_loss_mlp": 0.20578426, + "epoch": 0.8696227265895085, + "flos": 16836862193280.0, + "grad_norm": 46.14021836808044, + "language_loss": 0.76520407, + "learning_rate": 1.7560139411104058e-07, + "loss": 0.78010488, + "num_input_tokens_seen": 312000115, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.27429199, + "step": 14464, + "time_per_iteration": 2.5898923873901367 + }, + { + "auxiliary_loss_clip": 0.01271989, + "auxiliary_loss_mlp": 0.00200165, + "balance_loss_clip": 1.04126239, + "balance_loss_mlp": 0.17329511, + "epoch": 0.8696828498421765, + "flos": 21799070271360.0, + "grad_norm": 50.39975482149876, + "language_loss": 0.73274297, + "learning_rate": 1.7544184802902607e-07, + "loss": 0.74746454, + "num_input_tokens_seen": 312020770, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.26843262, + "step": 14465, + "time_per_iteration": 2.6800997257232666 + }, + { + "auxiliary_loss_clip": 0.01225815, + "auxiliary_loss_mlp": 0.00217368, + "balance_loss_clip": 1.01467586, + "balance_loss_mlp": 0.1928224, + "epoch": 0.8697429730948444, + "flos": 22894812610560.0, + "grad_norm": 3.410334224477989, + "language_loss": 0.89551115, + "learning_rate": 1.7528237113443934e-07, + "loss": 0.90994298, + "num_input_tokens_seen": 312041870, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.2454834, + "step": 14466, + "time_per_iteration": 2.670989990234375 + }, + { + "auxiliary_loss_clip": 0.01272732, + "auxiliary_loss_mlp": 0.00233061, + "balance_loss_clip": 1.04244208, + "balance_loss_mlp": 0.20611998, + "epoch": 0.8698030963475124, + "flos": 24717458482560.0, + "grad_norm": 21.046532184981153, + "language_loss": 0.73686492, + "learning_rate": 1.7512296343332779e-07, + "loss": 0.75192285, + "num_input_tokens_seen": 312058210, + "router_z_loss_clip": 2.30273438, + "router_z_loss_mlp": 0.26965332, + "step": 14467, + "time_per_iteration": 2.7799267768859863 + }, + { + "auxiliary_loss_clip": 0.01231399, + "auxiliary_loss_mlp": 0.00204426, + "balance_loss_clip": 1.02191401, + "balance_loss_mlp": 0.18033394, + "epoch": 0.8698632196001803, + "flos": 28442221067520.0, + "grad_norm": 22.081772116788926, + "language_loss": 0.74699962, + "learning_rate": 1.7496362493173655e-07, + "loss": 0.7613579, + "num_input_tokens_seen": 312082665, + "router_z_loss_clip": 2.09472656, + "router_z_loss_mlp": 0.24084473, + "step": 14468, + "time_per_iteration": 2.777477502822876 + }, + { + "auxiliary_loss_clip": 0.01226509, + "auxiliary_loss_mlp": 0.00201817, + "balance_loss_clip": 1.01625562, + "balance_loss_mlp": 0.17861933, + "epoch": 0.8699233428528483, + "flos": 27636457224960.0, + "grad_norm": 311.412655219317, + "language_loss": 0.77070272, + "learning_rate": 1.7480435563570773e-07, + "loss": 0.7849859, + "num_input_tokens_seen": 312101960, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.23205566, + "step": 14469, + "time_per_iteration": 2.705764055252075 + }, + { + "auxiliary_loss_clip": 0.01228708, + "auxiliary_loss_mlp": 0.00179351, + "balance_loss_clip": 1.01698804, + "balance_loss_mlp": 0.15517579, + "epoch": 0.8699834661055164, + "flos": 20045659864320.0, + "grad_norm": 12.120257713655684, + "language_loss": 0.9197787, + "learning_rate": 1.7464515555128024e-07, + "loss": 0.93385923, + "num_input_tokens_seen": 312117125, + "router_z_loss_clip": 2.11816406, + "router_z_loss_mlp": 0.24169922, + "step": 14470, + "time_per_iteration": 2.6190690994262695 + }, + { + "auxiliary_loss_clip": 0.0123979, + "auxiliary_loss_mlp": 0.00208896, + "balance_loss_clip": 1.02494907, + "balance_loss_mlp": 0.1839934, + "epoch": 0.8700435893581843, + "flos": 23732787974400.0, + "grad_norm": 78.3740547772297, + "language_loss": 0.79336393, + "learning_rate": 1.7448602468449148e-07, + "loss": 0.80785084, + "num_input_tokens_seen": 312135775, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.24902344, + "step": 14471, + "time_per_iteration": 2.671668529510498 + }, + { + "auxiliary_loss_clip": 0.01239632, + "auxiliary_loss_mlp": 0.00208555, + "balance_loss_clip": 1.02742243, + "balance_loss_mlp": 0.18560722, + "epoch": 0.8701037126108523, + "flos": 23548422441600.0, + "grad_norm": 17.623798832514534, + "language_loss": 0.84735155, + "learning_rate": 1.7432696304137573e-07, + "loss": 0.86183345, + "num_input_tokens_seen": 312156070, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.22949219, + "step": 14472, + "time_per_iteration": 2.673552989959717 + }, + { + "auxiliary_loss_clip": 0.01247144, + "auxiliary_loss_mlp": 0.0020041, + "balance_loss_clip": 1.02669406, + "balance_loss_mlp": 0.17746247, + "epoch": 0.8701638358635202, + "flos": 18843442634880.0, + "grad_norm": 3.2535886430242464, + "language_loss": 0.82449841, + "learning_rate": 1.741679706279644e-07, + "loss": 0.838974, + "num_input_tokens_seen": 312174380, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.22973633, + "step": 14473, + "time_per_iteration": 2.6134870052337646 + }, + { + "auxiliary_loss_clip": 0.01244219, + "auxiliary_loss_mlp": 0.00199802, + "balance_loss_clip": 1.02539849, + "balance_loss_mlp": 0.17535236, + "epoch": 0.8702239591161882, + "flos": 27928339142400.0, + "grad_norm": 54.33032493520876, + "language_loss": 0.78827953, + "learning_rate": 1.7400904745028644e-07, + "loss": 0.80271971, + "num_input_tokens_seen": 312195130, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.2442627, + "step": 14474, + "time_per_iteration": 2.7181267738342285 + }, + { + "auxiliary_loss_clip": 0.01263635, + "auxiliary_loss_mlp": 0.00224834, + "balance_loss_clip": 1.03855753, + "balance_loss_mlp": 0.1987514, + "epoch": 0.8702840823688561, + "flos": 17233997938560.0, + "grad_norm": 957.754079875734, + "language_loss": 0.79931957, + "learning_rate": 1.7385019351436925e-07, + "loss": 0.81420428, + "num_input_tokens_seen": 312212300, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.2611084, + "step": 14475, + "time_per_iteration": 2.7456400394439697 + }, + { + "auxiliary_loss_clip": 0.01253454, + "auxiliary_loss_mlp": 0.0021785, + "balance_loss_clip": 1.02880073, + "balance_loss_mlp": 0.19145715, + "epoch": 0.8703442056215241, + "flos": 19427565605760.0, + "grad_norm": 27.707442234599654, + "language_loss": 0.86105067, + "learning_rate": 1.736914088262349e-07, + "loss": 0.87576365, + "num_input_tokens_seen": 312231735, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.26403809, + "step": 14476, + "time_per_iteration": 2.7972958087921143 + }, + { + "auxiliary_loss_clip": 0.01240347, + "auxiliary_loss_mlp": 0.0021517, + "balance_loss_clip": 1.02869904, + "balance_loss_mlp": 0.19125688, + "epoch": 0.8704043288741921, + "flos": 22273845264000.0, + "grad_norm": 14.411463146296816, + "language_loss": 0.79428113, + "learning_rate": 1.7353269339190525e-07, + "loss": 0.80883634, + "num_input_tokens_seen": 312253060, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.23925781, + "step": 14477, + "time_per_iteration": 2.6834914684295654 + }, + { + "auxiliary_loss_clip": 0.01253155, + "auxiliary_loss_mlp": 0.00199069, + "balance_loss_clip": 1.03177059, + "balance_loss_mlp": 0.17372511, + "epoch": 0.8704644521268601, + "flos": 16648725732480.0, + "grad_norm": 63.436235071279214, + "language_loss": 0.6904639, + "learning_rate": 1.7337404721739946e-07, + "loss": 0.7049861, + "num_input_tokens_seen": 312269460, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.25354004, + "step": 14478, + "time_per_iteration": 2.7032220363616943 + }, + { + "auxiliary_loss_clip": 0.0124302, + "auxiliary_loss_mlp": 0.00208214, + "balance_loss_clip": 1.02921748, + "balance_loss_mlp": 0.18407427, + "epoch": 0.870524575379528, + "flos": 24280210224000.0, + "grad_norm": 83.51673532397983, + "language_loss": 0.80450118, + "learning_rate": 1.732154703087323e-07, + "loss": 0.81901348, + "num_input_tokens_seen": 312289830, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.24133301, + "step": 14479, + "time_per_iteration": 2.7134392261505127 + }, + { + "auxiliary_loss_clip": 0.01247509, + "auxiliary_loss_mlp": 0.00221546, + "balance_loss_clip": 1.02786589, + "balance_loss_mlp": 0.19539195, + "epoch": 0.870584698632196, + "flos": 28768684803840.0, + "grad_norm": 5.945497997819187, + "language_loss": 0.80040467, + "learning_rate": 1.7305696267191805e-07, + "loss": 0.81509519, + "num_input_tokens_seen": 312311320, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.26171875, + "step": 14480, + "time_per_iteration": 2.7847492694854736 + }, + { + "auxiliary_loss_clip": 0.01262746, + "auxiliary_loss_mlp": 0.00216319, + "balance_loss_clip": 1.03735423, + "balance_loss_mlp": 0.18907958, + "epoch": 0.8706448218848639, + "flos": 32449635774720.0, + "grad_norm": 7.4329716029140736, + "language_loss": 0.77999079, + "learning_rate": 1.728985243129666e-07, + "loss": 0.79478145, + "num_input_tokens_seen": 312332095, + "router_z_loss_clip": 2.25292969, + "router_z_loss_mlp": 0.27233887, + "step": 14481, + "time_per_iteration": 2.713491678237915 + }, + { + "auxiliary_loss_clip": 0.01228056, + "auxiliary_loss_mlp": 0.00193971, + "balance_loss_clip": 1.0169729, + "balance_loss_mlp": 0.17216748, + "epoch": 0.8707049451375319, + "flos": 22748009725440.0, + "grad_norm": 16.10345414681128, + "language_loss": 0.83947051, + "learning_rate": 1.7274015523788643e-07, + "loss": 0.85369086, + "num_input_tokens_seen": 312351225, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.21801758, + "step": 14482, + "time_per_iteration": 2.695855140686035 + }, + { + "auxiliary_loss_clip": 0.01248239, + "auxiliary_loss_mlp": 0.0021345, + "balance_loss_clip": 1.03366458, + "balance_loss_mlp": 0.18852319, + "epoch": 0.8707650683902, + "flos": 15851976203520.0, + "grad_norm": 177.12846440245409, + "language_loss": 0.84467399, + "learning_rate": 1.7258185545268234e-07, + "loss": 0.85929084, + "num_input_tokens_seen": 312369730, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.24951172, + "step": 14483, + "time_per_iteration": 2.6260478496551514 + }, + { + "auxiliary_loss_clip": 0.01265195, + "auxiliary_loss_mlp": 0.00222933, + "balance_loss_clip": 1.04477072, + "balance_loss_mlp": 0.19595626, + "epoch": 0.8708251916428679, + "flos": 16468131127680.0, + "grad_norm": 30.611077869350186, + "language_loss": 0.71669275, + "learning_rate": 1.7242362496335749e-07, + "loss": 0.73157406, + "num_input_tokens_seen": 312386780, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.26965332, + "step": 14484, + "time_per_iteration": 2.6987297534942627 + }, + { + "auxiliary_loss_clip": 0.01264318, + "auxiliary_loss_mlp": 0.00201224, + "balance_loss_clip": 1.0437218, + "balance_loss_mlp": 0.17728674, + "epoch": 0.8708853148955359, + "flos": 15377847655680.0, + "grad_norm": 117.77642804006734, + "language_loss": 0.80282366, + "learning_rate": 1.7226546377591222e-07, + "loss": 0.81747907, + "num_input_tokens_seen": 312404875, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.23937988, + "step": 14485, + "time_per_iteration": 2.6970574855804443 + }, + { + "auxiliary_loss_clip": 0.01241223, + "auxiliary_loss_mlp": 0.00220512, + "balance_loss_clip": 1.02571332, + "balance_loss_mlp": 0.19473846, + "epoch": 0.8709454381482038, + "flos": 30551325903360.0, + "grad_norm": 3.04471261074546, + "language_loss": 0.6957981, + "learning_rate": 1.7210737189634373e-07, + "loss": 0.71041542, + "num_input_tokens_seen": 312425280, + "router_z_loss_clip": 2.15917969, + "router_z_loss_mlp": 0.25744629, + "step": 14486, + "time_per_iteration": 2.7306582927703857 + }, + { + "auxiliary_loss_clip": 0.01271254, + "auxiliary_loss_mlp": 0.00221297, + "balance_loss_clip": 1.0481956, + "balance_loss_mlp": 0.19578664, + "epoch": 0.8710055614008718, + "flos": 22601422321920.0, + "grad_norm": 60.62939393813725, + "language_loss": 0.7203474, + "learning_rate": 1.7194934933064653e-07, + "loss": 0.73527288, + "num_input_tokens_seen": 312443835, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.25512695, + "step": 14487, + "time_per_iteration": 2.651566743850708 + }, + { + "auxiliary_loss_clip": 0.01248176, + "auxiliary_loss_mlp": 0.00211615, + "balance_loss_clip": 1.03054225, + "balance_loss_mlp": 0.18854778, + "epoch": 0.8710656846535397, + "flos": 18443146492800.0, + "grad_norm": 175.7333165485703, + "language_loss": 0.76993775, + "learning_rate": 1.7179139608481318e-07, + "loss": 0.78453577, + "num_input_tokens_seen": 312460830, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.23046875, + "step": 14488, + "time_per_iteration": 2.603893518447876 + }, + { + "auxiliary_loss_clip": 0.01270253, + "auxiliary_loss_mlp": 0.00218098, + "balance_loss_clip": 1.04663801, + "balance_loss_mlp": 0.19269493, + "epoch": 0.8711258079062077, + "flos": 16503862181760.0, + "grad_norm": 4.901576352529993, + "language_loss": 0.92841756, + "learning_rate": 1.716335121648338e-07, + "loss": 0.94330114, + "num_input_tokens_seen": 312477575, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.25390625, + "step": 14489, + "time_per_iteration": 2.6193275451660156 + }, + { + "auxiliary_loss_clip": 0.01280146, + "auxiliary_loss_mlp": 0.00220063, + "balance_loss_clip": 1.05003071, + "balance_loss_mlp": 0.19428957, + "epoch": 0.8711859311588757, + "flos": 15663336952320.0, + "grad_norm": 35.41614737443463, + "language_loss": 0.85983276, + "learning_rate": 1.7147569757669445e-07, + "loss": 0.87483484, + "num_input_tokens_seen": 312492140, + "router_z_loss_clip": 2.30078125, + "router_z_loss_mlp": 0.25756836, + "step": 14490, + "time_per_iteration": 2.6201045513153076 + }, + { + "auxiliary_loss_clip": 0.01255727, + "auxiliary_loss_mlp": 0.00224123, + "balance_loss_clip": 1.03357077, + "balance_loss_mlp": 0.19818312, + "epoch": 0.8712460544115437, + "flos": 15557544420480.0, + "grad_norm": 51.918846993825625, + "language_loss": 0.8554275, + "learning_rate": 1.7131795232638012e-07, + "loss": 0.87022603, + "num_input_tokens_seen": 312508400, + "router_z_loss_clip": 2.22167969, + "router_z_loss_mlp": 0.25952148, + "step": 14491, + "time_per_iteration": 2.6245675086975098 + }, + { + "auxiliary_loss_clip": 0.01238034, + "auxiliary_loss_mlp": 0.00201111, + "balance_loss_clip": 1.02710462, + "balance_loss_mlp": 0.17718577, + "epoch": 0.8713061776642116, + "flos": 16763568491520.0, + "grad_norm": 5.789398285383175, + "language_loss": 0.73789054, + "learning_rate": 1.711602764198723e-07, + "loss": 0.7522819, + "num_input_tokens_seen": 312525915, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.23937988, + "step": 14492, + "time_per_iteration": 4.085930824279785 + }, + { + "auxiliary_loss_clip": 0.01231492, + "auxiliary_loss_mlp": 0.00190984, + "balance_loss_clip": 1.01867545, + "balance_loss_mlp": 0.16842982, + "epoch": 0.8713663009168796, + "flos": 24279887001600.0, + "grad_norm": 23.8760136361393, + "language_loss": 0.77669191, + "learning_rate": 1.7100266986314992e-07, + "loss": 0.79091668, + "num_input_tokens_seen": 312544735, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.22546387, + "step": 14493, + "time_per_iteration": 4.167388439178467 + }, + { + "auxiliary_loss_clip": 0.01256785, + "auxiliary_loss_mlp": 0.00200736, + "balance_loss_clip": 1.03577781, + "balance_loss_mlp": 0.17411667, + "epoch": 0.8714264241695475, + "flos": 23795594904960.0, + "grad_norm": 23.48311690135246, + "language_loss": 0.97903329, + "learning_rate": 1.7084513266218936e-07, + "loss": 0.99360847, + "num_input_tokens_seen": 312557910, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.26635742, + "step": 14494, + "time_per_iteration": 2.6229453086853027 + }, + { + "auxiliary_loss_clip": 0.01230192, + "auxiliary_loss_mlp": 0.00192327, + "balance_loss_clip": 1.02203214, + "balance_loss_mlp": 0.16965359, + "epoch": 0.8714865474222155, + "flos": 37997942071680.0, + "grad_norm": 3.7509431244749916, + "language_loss": 0.66597086, + "learning_rate": 1.7068766482296514e-07, + "loss": 0.68019605, + "num_input_tokens_seen": 312580360, + "router_z_loss_clip": 2.07910156, + "router_z_loss_mlp": 0.2265625, + "step": 14495, + "time_per_iteration": 2.777500629425049 + }, + { + "auxiliary_loss_clip": 0.0125138, + "auxiliary_loss_mlp": 0.00200682, + "balance_loss_clip": 1.03066242, + "balance_loss_mlp": 0.17701942, + "epoch": 0.8715466706748836, + "flos": 22455696844800.0, + "grad_norm": 77.24618230417248, + "language_loss": 0.9146384, + "learning_rate": 1.7053026635144762e-07, + "loss": 0.92915899, + "num_input_tokens_seen": 312597550, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.23669434, + "step": 14496, + "time_per_iteration": 2.6776983737945557 + }, + { + "auxiliary_loss_clip": 0.01256233, + "auxiliary_loss_mlp": 0.00216165, + "balance_loss_clip": 1.03795922, + "balance_loss_mlp": 0.19022501, + "epoch": 0.8716067939275515, + "flos": 21215126868480.0, + "grad_norm": 125.42606296148755, + "language_loss": 0.86040181, + "learning_rate": 1.7037293725360624e-07, + "loss": 0.87512583, + "num_input_tokens_seen": 312616435, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.25964355, + "step": 14497, + "time_per_iteration": 2.686739444732666 + }, + { + "auxiliary_loss_clip": 0.01257207, + "auxiliary_loss_mlp": 0.00197405, + "balance_loss_clip": 1.03715181, + "balance_loss_mlp": 0.17108358, + "epoch": 0.8716669171802195, + "flos": 22997732054400.0, + "grad_norm": 101.82940045582598, + "language_loss": 0.76165682, + "learning_rate": 1.70215677535406e-07, + "loss": 0.77620292, + "num_input_tokens_seen": 312632770, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.26342773, + "step": 14498, + "time_per_iteration": 2.6759791374206543 + }, + { + "auxiliary_loss_clip": 0.01262519, + "auxiliary_loss_mlp": 0.00206095, + "balance_loss_clip": 1.04067349, + "balance_loss_mlp": 0.17992896, + "epoch": 0.8717270404328874, + "flos": 29784058462080.0, + "grad_norm": 6.6126582225960915, + "language_loss": 0.65094918, + "learning_rate": 1.700584872028108e-07, + "loss": 0.66563535, + "num_input_tokens_seen": 312651900, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.26184082, + "step": 14499, + "time_per_iteration": 4.149197578430176 + }, + { + "auxiliary_loss_clip": 0.01258395, + "auxiliary_loss_mlp": 0.00200321, + "balance_loss_clip": 1.03567219, + "balance_loss_mlp": 0.17492931, + "epoch": 0.8717871636855554, + "flos": 22018125363840.0, + "grad_norm": 7.065797763527555, + "language_loss": 0.90485406, + "learning_rate": 1.6990136626178097e-07, + "loss": 0.91944128, + "num_input_tokens_seen": 312671380, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.25415039, + "step": 14500, + "time_per_iteration": 2.6655802726745605 + }, + { + "auxiliary_loss_clip": 0.01244913, + "auxiliary_loss_mlp": 0.00210672, + "balance_loss_clip": 1.02843976, + "balance_loss_mlp": 0.18692577, + "epoch": 0.8718472869382233, + "flos": 16654256426880.0, + "grad_norm": 9.121639187295402, + "language_loss": 0.83436751, + "learning_rate": 1.6974431471827466e-07, + "loss": 0.84892333, + "num_input_tokens_seen": 312689215, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.23742676, + "step": 14501, + "time_per_iteration": 2.626265048980713 + }, + { + "auxiliary_loss_clip": 0.01261501, + "auxiliary_loss_mlp": 0.00215915, + "balance_loss_clip": 1.03455353, + "balance_loss_mlp": 0.18880659, + "epoch": 0.8719074101908914, + "flos": 19495328613120.0, + "grad_norm": 302.4525894446234, + "language_loss": 0.749596, + "learning_rate": 1.695873325782482e-07, + "loss": 0.76437008, + "num_input_tokens_seen": 312706400, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.27124023, + "step": 14502, + "time_per_iteration": 4.041804075241089 + }, + { + "auxiliary_loss_clip": 0.01246184, + "auxiliary_loss_mlp": 0.00200023, + "balance_loss_clip": 1.03113723, + "balance_loss_mlp": 0.17615743, + "epoch": 0.8719675334435593, + "flos": 33070890430080.0, + "grad_norm": 37.583777503916146, + "language_loss": 0.74528849, + "learning_rate": 1.6943041984765262e-07, + "loss": 0.75975055, + "num_input_tokens_seen": 312727985, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.2388916, + "step": 14503, + "time_per_iteration": 2.7424938678741455 + }, + { + "auxiliary_loss_clip": 0.01234114, + "auxiliary_loss_mlp": 0.00205079, + "balance_loss_clip": 1.02095342, + "balance_loss_mlp": 0.18018788, + "epoch": 0.8720276566962273, + "flos": 13626268842240.0, + "grad_norm": 19706.802810613754, + "language_loss": 0.77401066, + "learning_rate": 1.6927357653243912e-07, + "loss": 0.78840256, + "num_input_tokens_seen": 312745025, + "router_z_loss_clip": 2.13183594, + "router_z_loss_mlp": 0.2487793, + "step": 14504, + "time_per_iteration": 2.665323257446289 + }, + { + "auxiliary_loss_clip": 0.01242403, + "auxiliary_loss_mlp": 0.002203, + "balance_loss_clip": 1.0238682, + "balance_loss_mlp": 0.19409731, + "epoch": 0.8720877799488952, + "flos": 23514163845120.0, + "grad_norm": 69.77825874882038, + "language_loss": 0.78929836, + "learning_rate": 1.691168026385552e-07, + "loss": 0.8039254, + "num_input_tokens_seen": 312764170, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.26196289, + "step": 14505, + "time_per_iteration": 2.650418758392334 + }, + { + "auxiliary_loss_clip": 0.01244384, + "auxiliary_loss_mlp": 0.00197278, + "balance_loss_clip": 1.03169751, + "balance_loss_mlp": 0.17294714, + "epoch": 0.8721479032015632, + "flos": 20814148368000.0, + "grad_norm": 15.284107560560482, + "language_loss": 0.84737837, + "learning_rate": 1.6896009817194545e-07, + "loss": 0.86179507, + "num_input_tokens_seen": 312783830, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.24328613, + "step": 14506, + "time_per_iteration": 2.746166706085205 + }, + { + "auxiliary_loss_clip": 0.01260336, + "auxiliary_loss_mlp": 0.00206585, + "balance_loss_clip": 1.03819811, + "balance_loss_mlp": 0.18244514, + "epoch": 0.8722080264542311, + "flos": 19463655795840.0, + "grad_norm": 174.46938398317207, + "language_loss": 0.84305286, + "learning_rate": 1.6880346313855221e-07, + "loss": 0.8577221, + "num_input_tokens_seen": 312802015, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.24133301, + "step": 14507, + "time_per_iteration": 2.628892183303833 + }, + { + "auxiliary_loss_clip": 0.01283119, + "auxiliary_loss_mlp": 0.00234862, + "balance_loss_clip": 1.04877019, + "balance_loss_mlp": 0.20702681, + "epoch": 0.8722681497068991, + "flos": 21761866759680.0, + "grad_norm": 14.746888390435984, + "language_loss": 0.82811856, + "learning_rate": 1.686468975443156e-07, + "loss": 0.84329832, + "num_input_tokens_seen": 312820650, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.27832031, + "step": 14508, + "time_per_iteration": 2.6429219245910645 + }, + { + "auxiliary_loss_clip": 0.01272709, + "auxiliary_loss_mlp": 0.00214786, + "balance_loss_clip": 1.04505205, + "balance_loss_mlp": 0.18820238, + "epoch": 0.8723282729595672, + "flos": 28877134942080.0, + "grad_norm": 14.309454550351468, + "language_loss": 0.75320399, + "learning_rate": 1.6849040139517202e-07, + "loss": 0.76807892, + "num_input_tokens_seen": 312841310, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.26574707, + "step": 14509, + "time_per_iteration": 2.673492670059204 + }, + { + "auxiliary_loss_clip": 0.01253163, + "auxiliary_loss_mlp": 0.00229361, + "balance_loss_clip": 1.04009438, + "balance_loss_mlp": 0.20509028, + "epoch": 0.8723883962122351, + "flos": 26469145036800.0, + "grad_norm": 24.423605678364538, + "language_loss": 0.66766095, + "learning_rate": 1.683339746970558e-07, + "loss": 0.68248618, + "num_input_tokens_seen": 312862100, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.24267578, + "step": 14510, + "time_per_iteration": 2.67177152633667 + }, + { + "auxiliary_loss_clip": 0.01291977, + "auxiliary_loss_mlp": 0.00231419, + "balance_loss_clip": 1.05663514, + "balance_loss_mlp": 0.20208187, + "epoch": 0.8724485194649031, + "flos": 20521476351360.0, + "grad_norm": 1825.2609354814567, + "language_loss": 0.78407907, + "learning_rate": 1.6817761745589865e-07, + "loss": 0.79931307, + "num_input_tokens_seen": 312880220, + "router_z_loss_clip": 2.35449219, + "router_z_loss_mlp": 0.29345703, + "step": 14511, + "time_per_iteration": 2.6482410430908203 + }, + { + "auxiliary_loss_clip": 0.01237272, + "auxiliary_loss_mlp": 0.00188479, + "balance_loss_clip": 1.02357984, + "balance_loss_mlp": 0.165472, + "epoch": 0.872508642717571, + "flos": 24353360271360.0, + "grad_norm": 73.75506718892468, + "language_loss": 0.89352709, + "learning_rate": 1.6802132967763027e-07, + "loss": 0.90778458, + "num_input_tokens_seen": 312900765, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.22998047, + "step": 14512, + "time_per_iteration": 2.6537091732025146 + }, + { + "auxiliary_loss_clip": 0.01158281, + "auxiliary_loss_mlp": 0.00077728, + "balance_loss_clip": 1.01876462, + "balance_loss_mlp": 0.07038489, + "epoch": 0.872568765970239, + "flos": 61410012485760.0, + "grad_norm": 0.7800547381198412, + "language_loss": 0.58200127, + "learning_rate": 1.6786511136817617e-07, + "loss": 0.59436136, + "num_input_tokens_seen": 312955840, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.07324219, + "step": 14513, + "time_per_iteration": 3.0596351623535156 + }, + { + "auxiliary_loss_clip": 0.01245737, + "auxiliary_loss_mlp": 0.00213228, + "balance_loss_clip": 1.02959204, + "balance_loss_mlp": 0.18595287, + "epoch": 0.8726288892229069, + "flos": 22598046443520.0, + "grad_norm": 17.079614424013812, + "language_loss": 0.82390845, + "learning_rate": 1.6770896253346112e-07, + "loss": 0.83849806, + "num_input_tokens_seen": 312973565, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.27294922, + "step": 14514, + "time_per_iteration": 2.694761276245117 + }, + { + "auxiliary_loss_clip": 0.01273473, + "auxiliary_loss_mlp": 0.00238421, + "balance_loss_clip": 1.04231715, + "balance_loss_mlp": 0.21170656, + "epoch": 0.872689012475575, + "flos": 25885201633920.0, + "grad_norm": 13.839089380117997, + "language_loss": 0.747379, + "learning_rate": 1.675528831794055e-07, + "loss": 0.7624979, + "num_input_tokens_seen": 312994660, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.26696777, + "step": 14515, + "time_per_iteration": 2.692723035812378 + }, + { + "auxiliary_loss_clip": 0.0125394, + "auxiliary_loss_mlp": 0.00211249, + "balance_loss_clip": 1.03028607, + "balance_loss_mlp": 0.18430801, + "epoch": 0.8727491357282429, + "flos": 21506721477120.0, + "grad_norm": 9.442429702474277, + "language_loss": 0.86964959, + "learning_rate": 1.6739687331192842e-07, + "loss": 0.88430148, + "num_input_tokens_seen": 313009860, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.26940918, + "step": 14516, + "time_per_iteration": 2.6394665241241455 + }, + { + "auxiliary_loss_clip": 0.01255419, + "auxiliary_loss_mlp": 0.00233192, + "balance_loss_clip": 1.03203619, + "balance_loss_mlp": 0.20646548, + "epoch": 0.8728092589809109, + "flos": 19207504932480.0, + "grad_norm": 43.78887780087233, + "language_loss": 0.8524487, + "learning_rate": 1.672409329369453e-07, + "loss": 0.86733484, + "num_input_tokens_seen": 313027025, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.26733398, + "step": 14517, + "time_per_iteration": 2.7066662311553955 + }, + { + "auxiliary_loss_clip": 0.0122895, + "auxiliary_loss_mlp": 0.00200712, + "balance_loss_clip": 1.0175705, + "balance_loss_mlp": 0.17691831, + "epoch": 0.8728693822335788, + "flos": 20595308757120.0, + "grad_norm": 6.4835541553170835, + "language_loss": 0.80909604, + "learning_rate": 1.6708506206036966e-07, + "loss": 0.82339263, + "num_input_tokens_seen": 313046830, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.23803711, + "step": 14518, + "time_per_iteration": 2.6162405014038086 + }, + { + "auxiliary_loss_clip": 0.01217015, + "auxiliary_loss_mlp": 0.00191458, + "balance_loss_clip": 1.00917768, + "balance_loss_mlp": 0.16733032, + "epoch": 0.8729295054862468, + "flos": 21728613744000.0, + "grad_norm": 4.465150732347615, + "language_loss": 0.795259, + "learning_rate": 1.6692926068811275e-07, + "loss": 0.8093437, + "num_input_tokens_seen": 313067715, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.24157715, + "step": 14519, + "time_per_iteration": 2.746877670288086 + }, + { + "auxiliary_loss_clip": 0.01259853, + "auxiliary_loss_mlp": 0.00219701, + "balance_loss_clip": 1.03870106, + "balance_loss_mlp": 0.19339153, + "epoch": 0.8729896287389147, + "flos": 17673436926720.0, + "grad_norm": 14.016767935769975, + "language_loss": 0.90666008, + "learning_rate": 1.6677352882608142e-07, + "loss": 0.92145568, + "num_input_tokens_seen": 313082305, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.26293945, + "step": 14520, + "time_per_iteration": 2.574380874633789 + }, + { + "auxiliary_loss_clip": 0.01262586, + "auxiliary_loss_mlp": 0.00225774, + "balance_loss_clip": 1.03580523, + "balance_loss_mlp": 0.19970301, + "epoch": 0.8730497519915827, + "flos": 24571804832640.0, + "grad_norm": 7.980171804249051, + "language_loss": 0.89436966, + "learning_rate": 1.666178664801816e-07, + "loss": 0.90925336, + "num_input_tokens_seen": 313101190, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.26086426, + "step": 14521, + "time_per_iteration": 2.693101167678833 + }, + { + "auxiliary_loss_clip": 0.01254818, + "auxiliary_loss_mlp": 0.00197077, + "balance_loss_clip": 1.02795672, + "balance_loss_mlp": 0.17185271, + "epoch": 0.8731098752442508, + "flos": 13443734903040.0, + "grad_norm": 25.438994952392257, + "language_loss": 0.88901007, + "learning_rate": 1.6646227365631616e-07, + "loss": 0.90352905, + "num_input_tokens_seen": 313118965, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.25219727, + "step": 14522, + "time_per_iteration": 2.822897434234619 + }, + { + "auxiliary_loss_clip": 0.01243142, + "auxiliary_loss_mlp": 0.00202754, + "balance_loss_clip": 1.02571726, + "balance_loss_mlp": 0.18089069, + "epoch": 0.8731699984969187, + "flos": 23474446381440.0, + "grad_norm": 10.617856775214662, + "language_loss": 0.82096308, + "learning_rate": 1.66306750360385e-07, + "loss": 0.83542204, + "num_input_tokens_seen": 313139280, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.21850586, + "step": 14523, + "time_per_iteration": 2.732360601425171 + }, + { + "auxiliary_loss_clip": 0.01241012, + "auxiliary_loss_mlp": 0.0021995, + "balance_loss_clip": 1.02290118, + "balance_loss_mlp": 0.19546488, + "epoch": 0.8732301217495867, + "flos": 17712651600000.0, + "grad_norm": 34.88794815190918, + "language_loss": 0.86957246, + "learning_rate": 1.6615129659828542e-07, + "loss": 0.8841821, + "num_input_tokens_seen": 313156655, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.24487305, + "step": 14524, + "time_per_iteration": 2.6596908569335938 + }, + { + "auxiliary_loss_clip": 0.01234402, + "auxiliary_loss_mlp": 0.00214026, + "balance_loss_clip": 1.02287292, + "balance_loss_mlp": 0.1898739, + "epoch": 0.8732902450022546, + "flos": 22054359208320.0, + "grad_norm": 58.39445124055214, + "language_loss": 0.87794304, + "learning_rate": 1.6599591237591272e-07, + "loss": 0.89242733, + "num_input_tokens_seen": 313174050, + "router_z_loss_clip": 2.11621094, + "router_z_loss_mlp": 0.24133301, + "step": 14525, + "time_per_iteration": 2.6477370262145996 + }, + { + "auxiliary_loss_clip": 0.01255455, + "auxiliary_loss_mlp": 0.00215169, + "balance_loss_clip": 1.03321958, + "balance_loss_mlp": 0.18964586, + "epoch": 0.8733503682549226, + "flos": 22272983337600.0, + "grad_norm": 5.90751090232, + "language_loss": 0.77355409, + "learning_rate": 1.6584059769915902e-07, + "loss": 0.78826028, + "num_input_tokens_seen": 313192765, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.25500488, + "step": 14526, + "time_per_iteration": 2.689669132232666 + }, + { + "auxiliary_loss_clip": 0.01283104, + "auxiliary_loss_mlp": 0.00230433, + "balance_loss_clip": 1.05151498, + "balance_loss_mlp": 0.2028597, + "epoch": 0.8734104915075905, + "flos": 23364344217600.0, + "grad_norm": 6.226968744488414, + "language_loss": 0.70248789, + "learning_rate": 1.6568535257391326e-07, + "loss": 0.71762323, + "num_input_tokens_seen": 313210925, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.27575684, + "step": 14527, + "time_per_iteration": 2.650578737258911 + }, + { + "auxiliary_loss_clip": 0.01281303, + "auxiliary_loss_mlp": 0.00244042, + "balance_loss_clip": 1.0441283, + "balance_loss_mlp": 0.21242815, + "epoch": 0.8734706147602586, + "flos": 17712292464000.0, + "grad_norm": 26.76379282293426, + "language_loss": 0.78376186, + "learning_rate": 1.6553017700606265e-07, + "loss": 0.79901534, + "num_input_tokens_seen": 313228250, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.31604004, + "step": 14528, + "time_per_iteration": 2.6415998935699463 + }, + { + "auxiliary_loss_clip": 0.01254414, + "auxiliary_loss_mlp": 0.0020593, + "balance_loss_clip": 1.03336692, + "balance_loss_mlp": 0.18149218, + "epoch": 0.8735307380129265, + "flos": 22049367217920.0, + "grad_norm": 15.305147364567022, + "language_loss": 0.98546195, + "learning_rate": 1.6537507100149205e-07, + "loss": 1.00006545, + "num_input_tokens_seen": 313247880, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.24450684, + "step": 14529, + "time_per_iteration": 2.7355544567108154 + }, + { + "auxiliary_loss_clip": 0.01237975, + "auxiliary_loss_mlp": 0.00190719, + "balance_loss_clip": 1.02263701, + "balance_loss_mlp": 0.16711578, + "epoch": 0.8735908612655945, + "flos": 25338425829120.0, + "grad_norm": 167.28797000664858, + "language_loss": 0.91196448, + "learning_rate": 1.6522003456608258e-07, + "loss": 0.92625141, + "num_input_tokens_seen": 313266790, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.23596191, + "step": 14530, + "time_per_iteration": 2.725600481033325 + }, + { + "auxiliary_loss_clip": 0.01246012, + "auxiliary_loss_mlp": 0.00227354, + "balance_loss_clip": 1.02715135, + "balance_loss_mlp": 0.20257026, + "epoch": 0.8736509845182624, + "flos": 21540908246400.0, + "grad_norm": 11.526513833678468, + "language_loss": 0.79919672, + "learning_rate": 1.650650677057128e-07, + "loss": 0.81393033, + "num_input_tokens_seen": 313286805, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.24755859, + "step": 14531, + "time_per_iteration": 2.64861798286438 + }, + { + "auxiliary_loss_clip": 0.01237747, + "auxiliary_loss_mlp": 0.00213005, + "balance_loss_clip": 1.02162838, + "balance_loss_mlp": 0.18804277, + "epoch": 0.8737111077709304, + "flos": 22017227523840.0, + "grad_norm": 38.98710614497059, + "language_loss": 0.71583307, + "learning_rate": 1.6491017042625966e-07, + "loss": 0.7303406, + "num_input_tokens_seen": 313305415, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.24951172, + "step": 14532, + "time_per_iteration": 2.648535966873169 + }, + { + "auxiliary_loss_clip": 0.01154884, + "auxiliary_loss_mlp": 0.00098904, + "balance_loss_clip": 1.0167079, + "balance_loss_mlp": 0.09113135, + "epoch": 0.8737712310235983, + "flos": 70066315912320.0, + "grad_norm": 0.788467110515332, + "language_loss": 0.57423538, + "learning_rate": 1.6475534273359704e-07, + "loss": 0.58677322, + "num_input_tokens_seen": 313369940, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.07763672, + "step": 14533, + "time_per_iteration": 3.254159450531006 + }, + { + "auxiliary_loss_clip": 0.0123717, + "auxiliary_loss_mlp": 0.0019981, + "balance_loss_clip": 1.02262223, + "balance_loss_mlp": 0.1731783, + "epoch": 0.8738313542762663, + "flos": 28658331244800.0, + "grad_norm": 2.126952454035014, + "language_loss": 0.83186692, + "learning_rate": 1.646005846335954e-07, + "loss": 0.84623671, + "num_input_tokens_seen": 313390965, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.26647949, + "step": 14534, + "time_per_iteration": 4.099276781082153 + }, + { + "auxiliary_loss_clip": 0.01259685, + "auxiliary_loss_mlp": 0.00224599, + "balance_loss_clip": 1.04108405, + "balance_loss_mlp": 0.20030379, + "epoch": 0.8738914775289344, + "flos": 22346384780160.0, + "grad_norm": 7.962579669046744, + "language_loss": 0.83105052, + "learning_rate": 1.6444589613212357e-07, + "loss": 0.84589332, + "num_input_tokens_seen": 313409680, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.24291992, + "step": 14535, + "time_per_iteration": 2.7131760120391846 + }, + { + "auxiliary_loss_clip": 0.01253249, + "auxiliary_loss_mlp": 0.00212112, + "balance_loss_clip": 1.03022504, + "balance_loss_mlp": 0.18687564, + "epoch": 0.8739516007816023, + "flos": 31759648444800.0, + "grad_norm": 39.762676088623955, + "language_loss": 0.81876302, + "learning_rate": 1.64291277235048e-07, + "loss": 0.83341664, + "num_input_tokens_seen": 313431335, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.25256348, + "step": 14536, + "time_per_iteration": 4.143074035644531 + }, + { + "auxiliary_loss_clip": 0.0123962, + "auxiliary_loss_mlp": 0.00210999, + "balance_loss_clip": 1.02379513, + "balance_loss_mlp": 0.18535727, + "epoch": 0.8740117240342703, + "flos": 21211715076480.0, + "grad_norm": 1739.1905529959822, + "language_loss": 0.72566676, + "learning_rate": 1.641367279482304e-07, + "loss": 0.74017298, + "num_input_tokens_seen": 313449225, + "router_z_loss_clip": 2.15917969, + "router_z_loss_mlp": 0.2565918, + "step": 14537, + "time_per_iteration": 2.6269357204437256 + }, + { + "auxiliary_loss_clip": 0.01245392, + "auxiliary_loss_mlp": 0.00217275, + "balance_loss_clip": 1.02797532, + "balance_loss_mlp": 0.19307575, + "epoch": 0.8740718472869382, + "flos": 25186666867200.0, + "grad_norm": 11.882555518556183, + "language_loss": 0.65900123, + "learning_rate": 1.6398224827753216e-07, + "loss": 0.67362785, + "num_input_tokens_seen": 313467715, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.24194336, + "step": 14538, + "time_per_iteration": 2.683023691177368 + }, + { + "auxiliary_loss_clip": 0.01248221, + "auxiliary_loss_mlp": 0.00202654, + "balance_loss_clip": 1.03168225, + "balance_loss_mlp": 0.17865711, + "epoch": 0.8741319705396062, + "flos": 19500931134720.0, + "grad_norm": 6.56253320767131, + "language_loss": 0.76495147, + "learning_rate": 1.6382783822881142e-07, + "loss": 0.77946019, + "num_input_tokens_seen": 313486805, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.2401123, + "step": 14539, + "time_per_iteration": 2.61826229095459 + }, + { + "auxiliary_loss_clip": 0.0125679, + "auxiliary_loss_mlp": 0.00195659, + "balance_loss_clip": 1.03174794, + "balance_loss_mlp": 0.16974315, + "epoch": 0.8741920937922741, + "flos": 14100900180480.0, + "grad_norm": 20.521007293217092, + "language_loss": 0.83824599, + "learning_rate": 1.6367349780792262e-07, + "loss": 0.85277045, + "num_input_tokens_seen": 313504880, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.25915527, + "step": 14540, + "time_per_iteration": 2.66563081741333 + }, + { + "auxiliary_loss_clip": 0.01237492, + "auxiliary_loss_mlp": 0.0021962, + "balance_loss_clip": 1.02221298, + "balance_loss_mlp": 0.19366789, + "epoch": 0.8742522170449422, + "flos": 27709858667520.0, + "grad_norm": 22.78457974400633, + "language_loss": 0.86261803, + "learning_rate": 1.635192270207193e-07, + "loss": 0.87718916, + "num_input_tokens_seen": 313524995, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.25927734, + "step": 14541, + "time_per_iteration": 4.265751123428345 + }, + { + "auxiliary_loss_clip": 0.01276732, + "auxiliary_loss_mlp": 0.00217734, + "balance_loss_clip": 1.04537249, + "balance_loss_mlp": 0.18836093, + "epoch": 0.8743123402976101, + "flos": 21142587352320.0, + "grad_norm": 41.83784164916678, + "language_loss": 0.79256535, + "learning_rate": 1.6336502587305035e-07, + "loss": 0.80751002, + "num_input_tokens_seen": 313541740, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.29370117, + "step": 14542, + "time_per_iteration": 2.731696367263794 + }, + { + "auxiliary_loss_clip": 0.01158004, + "auxiliary_loss_mlp": 0.00096783, + "balance_loss_clip": 1.01898479, + "balance_loss_mlp": 0.08944009, + "epoch": 0.8743724635502781, + "flos": 60870024351360.0, + "grad_norm": 0.7788095516614272, + "language_loss": 0.54224908, + "learning_rate": 1.632108943707642e-07, + "loss": 0.55479705, + "num_input_tokens_seen": 313593445, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.07324219, + "step": 14543, + "time_per_iteration": 3.0078442096710205 + }, + { + "auxiliary_loss_clip": 0.0126975, + "auxiliary_loss_mlp": 0.00211818, + "balance_loss_clip": 1.04630899, + "balance_loss_mlp": 0.18630758, + "epoch": 0.874432586802946, + "flos": 28109292883200.0, + "grad_norm": 22.360090352621437, + "language_loss": 0.79928833, + "learning_rate": 1.6305683251970458e-07, + "loss": 0.81410396, + "num_input_tokens_seen": 313615640, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.25500488, + "step": 14544, + "time_per_iteration": 2.6958084106445312 + }, + { + "auxiliary_loss_clip": 0.01236003, + "auxiliary_loss_mlp": 0.0020293, + "balance_loss_clip": 1.02481341, + "balance_loss_mlp": 0.17931432, + "epoch": 0.874492710055614, + "flos": 23550289948800.0, + "grad_norm": 24.054184562291052, + "language_loss": 0.82829952, + "learning_rate": 1.62902840325714e-07, + "loss": 0.84268892, + "num_input_tokens_seen": 313635550, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.23608398, + "step": 14545, + "time_per_iteration": 4.152297019958496 + }, + { + "auxiliary_loss_clip": 0.01248083, + "auxiliary_loss_mlp": 0.0019814, + "balance_loss_clip": 1.02830172, + "balance_loss_mlp": 0.17193739, + "epoch": 0.8745528333082819, + "flos": 40915647924480.0, + "grad_norm": 8.113883812030293, + "language_loss": 0.73732299, + "learning_rate": 1.6274891779463217e-07, + "loss": 0.75178522, + "num_input_tokens_seen": 313659275, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.26171875, + "step": 14546, + "time_per_iteration": 2.893653392791748 + }, + { + "auxiliary_loss_clip": 0.01250944, + "auxiliary_loss_mlp": 0.00205026, + "balance_loss_clip": 1.03161621, + "balance_loss_mlp": 0.17864524, + "epoch": 0.87461295656095, + "flos": 23622901292160.0, + "grad_norm": 25.10941401131916, + "language_loss": 0.80051172, + "learning_rate": 1.6259506493229536e-07, + "loss": 0.8150714, + "num_input_tokens_seen": 313680595, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.26391602, + "step": 14547, + "time_per_iteration": 2.648446798324585 + }, + { + "auxiliary_loss_clip": 0.01276999, + "auxiliary_loss_mlp": 0.00216257, + "balance_loss_clip": 1.04413199, + "balance_loss_mlp": 0.1888984, + "epoch": 0.874673079813618, + "flos": 38794116983040.0, + "grad_norm": 35.801354675897024, + "language_loss": 0.81123096, + "learning_rate": 1.6244128174453752e-07, + "loss": 0.82616353, + "num_input_tokens_seen": 313699730, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.27355957, + "step": 14548, + "time_per_iteration": 2.8567323684692383 + }, + { + "auxiliary_loss_clip": 0.01259365, + "auxiliary_loss_mlp": 0.00212792, + "balance_loss_clip": 1.0340265, + "balance_loss_mlp": 0.18610096, + "epoch": 0.8747332030662859, + "flos": 23696159080320.0, + "grad_norm": 13.24738804444235, + "language_loss": 0.81571257, + "learning_rate": 1.6228756823719093e-07, + "loss": 0.83043408, + "num_input_tokens_seen": 313720090, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.2668457, + "step": 14549, + "time_per_iteration": 2.724985361099243 + }, + { + "auxiliary_loss_clip": 0.01270388, + "auxiliary_loss_mlp": 0.00215038, + "balance_loss_clip": 1.03869975, + "balance_loss_mlp": 0.18729818, + "epoch": 0.8747933263189539, + "flos": 24462456854400.0, + "grad_norm": 5.675299623368861, + "language_loss": 0.93509483, + "learning_rate": 1.6213392441608352e-07, + "loss": 0.94994903, + "num_input_tokens_seen": 313736795, + "router_z_loss_clip": 2.31835938, + "router_z_loss_mlp": 0.27783203, + "step": 14550, + "time_per_iteration": 2.760155200958252 + }, + { + "auxiliary_loss_clip": 0.0123687, + "auxiliary_loss_mlp": 0.00221149, + "balance_loss_clip": 1.02078044, + "balance_loss_mlp": 0.19470805, + "epoch": 0.8748534495716218, + "flos": 13809161917440.0, + "grad_norm": 12.440311560659946, + "language_loss": 0.80690396, + "learning_rate": 1.6198035028704183e-07, + "loss": 0.82148415, + "num_input_tokens_seen": 313754820, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.2644043, + "step": 14551, + "time_per_iteration": 2.6184983253479004 + }, + { + "auxiliary_loss_clip": 0.01232713, + "auxiliary_loss_mlp": 0.00187335, + "balance_loss_clip": 1.02174067, + "balance_loss_mlp": 0.16367176, + "epoch": 0.8749135728242898, + "flos": 29862092759040.0, + "grad_norm": 161.2816906188908, + "language_loss": 0.72438318, + "learning_rate": 1.6182684585588934e-07, + "loss": 0.73858368, + "num_input_tokens_seen": 313775830, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.23669434, + "step": 14552, + "time_per_iteration": 2.708862066268921 + }, + { + "auxiliary_loss_clip": 0.01250628, + "auxiliary_loss_mlp": 0.00222459, + "balance_loss_clip": 1.03080821, + "balance_loss_mlp": 0.19609033, + "epoch": 0.8749736960769577, + "flos": 24133479166080.0, + "grad_norm": 44.40525694049383, + "language_loss": 0.8864572, + "learning_rate": 1.616734111284479e-07, + "loss": 0.90118808, + "num_input_tokens_seen": 313795745, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.2635498, + "step": 14553, + "time_per_iteration": 2.7477195262908936 + }, + { + "auxiliary_loss_clip": 0.01239815, + "auxiliary_loss_mlp": 0.00208795, + "balance_loss_clip": 1.02292216, + "balance_loss_mlp": 0.18380827, + "epoch": 0.8750338193296258, + "flos": 17202540602880.0, + "grad_norm": 28.107059691979547, + "language_loss": 0.78392988, + "learning_rate": 1.6152004611053416e-07, + "loss": 0.79841602, + "num_input_tokens_seen": 313813895, + "router_z_loss_clip": 2.17089844, + "router_z_loss_mlp": 0.25, + "step": 14554, + "time_per_iteration": 2.7716212272644043 + }, + { + "auxiliary_loss_clip": 0.01224264, + "auxiliary_loss_mlp": 0.00205097, + "balance_loss_clip": 1.01183426, + "balance_loss_mlp": 0.18238741, + "epoch": 0.8750939425822937, + "flos": 23733218937600.0, + "grad_norm": 63.364867669872574, + "language_loss": 0.90828866, + "learning_rate": 1.6136675080796457e-07, + "loss": 0.92258227, + "num_input_tokens_seen": 313834225, + "router_z_loss_clip": 2.12402344, + "router_z_loss_mlp": 0.22705078, + "step": 14555, + "time_per_iteration": 2.7751495838165283 + }, + { + "auxiliary_loss_clip": 0.01242299, + "auxiliary_loss_mlp": 0.00201622, + "balance_loss_clip": 1.02488625, + "balance_loss_mlp": 0.177542, + "epoch": 0.8751540658349617, + "flos": 26541684552960.0, + "grad_norm": 2.7408883011704677, + "language_loss": 0.76455456, + "learning_rate": 1.6121352522655252e-07, + "loss": 0.77899379, + "num_input_tokens_seen": 313854430, + "router_z_loss_clip": 2.17480469, + "router_z_loss_mlp": 0.24084473, + "step": 14556, + "time_per_iteration": 2.7581534385681152 + }, + { + "auxiliary_loss_clip": 0.01252456, + "auxiliary_loss_mlp": 0.00221452, + "balance_loss_clip": 1.03227758, + "balance_loss_mlp": 0.19322361, + "epoch": 0.8752141890876296, + "flos": 19386806647680.0, + "grad_norm": 4.609049320203973, + "language_loss": 0.85478264, + "learning_rate": 1.6106036937210732e-07, + "loss": 0.86952168, + "num_input_tokens_seen": 313871600, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.28234863, + "step": 14557, + "time_per_iteration": 2.723848581314087 + }, + { + "auxiliary_loss_clip": 0.01238499, + "auxiliary_loss_mlp": 0.00216866, + "balance_loss_clip": 1.02340174, + "balance_loss_mlp": 0.19283327, + "epoch": 0.8752743123402976, + "flos": 25374408278400.0, + "grad_norm": 47.59507182795253, + "language_loss": 0.89762759, + "learning_rate": 1.6090728325043767e-07, + "loss": 0.91218114, + "num_input_tokens_seen": 313891570, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.24047852, + "step": 14558, + "time_per_iteration": 2.662726879119873 + }, + { + "auxiliary_loss_clip": 0.01158558, + "auxiliary_loss_mlp": 0.00108438, + "balance_loss_clip": 1.01760697, + "balance_loss_mlp": 0.10195326, + "epoch": 0.8753344355929655, + "flos": 59952398578560.0, + "grad_norm": 0.7967228675909831, + "language_loss": 0.55482024, + "learning_rate": 1.6075426686734784e-07, + "loss": 0.56749022, + "num_input_tokens_seen": 313951290, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.06494141, + "step": 14559, + "time_per_iteration": 3.1683766841888428 + }, + { + "auxiliary_loss_clip": 0.01243195, + "auxiliary_loss_mlp": 0.002033, + "balance_loss_clip": 1.02580166, + "balance_loss_mlp": 0.17736025, + "epoch": 0.8753945588456336, + "flos": 17894646835200.0, + "grad_norm": 96.81975940087634, + "language_loss": 0.73204255, + "learning_rate": 1.606013202286407e-07, + "loss": 0.74650753, + "num_input_tokens_seen": 313968645, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.2590332, + "step": 14560, + "time_per_iteration": 2.632150411605835 + }, + { + "auxiliary_loss_clip": 0.01244731, + "auxiliary_loss_mlp": 0.00208678, + "balance_loss_clip": 1.03031039, + "balance_loss_mlp": 0.18557553, + "epoch": 0.8754546820983016, + "flos": 30914885410560.0, + "grad_norm": 10.280182517602949, + "language_loss": 0.8667134, + "learning_rate": 1.6044844334011541e-07, + "loss": 0.88124758, + "num_input_tokens_seen": 313987580, + "router_z_loss_clip": 2.13769531, + "router_z_loss_mlp": 0.2310791, + "step": 14561, + "time_per_iteration": 2.726731300354004 + }, + { + "auxiliary_loss_clip": 0.01249084, + "auxiliary_loss_mlp": 0.00213954, + "balance_loss_clip": 1.02642417, + "balance_loss_mlp": 0.18781181, + "epoch": 0.8755148053509695, + "flos": 20631075724800.0, + "grad_norm": 47.106853704488444, + "language_loss": 0.88041127, + "learning_rate": 1.6029563620756982e-07, + "loss": 0.89504158, + "num_input_tokens_seen": 314004460, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.26123047, + "step": 14562, + "time_per_iteration": 2.6602189540863037 + }, + { + "auxiliary_loss_clip": 0.01218267, + "auxiliary_loss_mlp": 0.00200956, + "balance_loss_clip": 1.00942087, + "balance_loss_mlp": 0.17675611, + "epoch": 0.8755749286036375, + "flos": 34969739005440.0, + "grad_norm": 48.44945553023005, + "language_loss": 0.77327377, + "learning_rate": 1.601428988367981e-07, + "loss": 0.78746599, + "num_input_tokens_seen": 314026855, + "router_z_loss_clip": 2.08984375, + "router_z_loss_mlp": 0.2421875, + "step": 14563, + "time_per_iteration": 2.749040126800537 + }, + { + "auxiliary_loss_clip": 0.01260546, + "auxiliary_loss_mlp": 0.00214275, + "balance_loss_clip": 1.03411889, + "balance_loss_mlp": 0.18745241, + "epoch": 0.8756350518563054, + "flos": 18186456925440.0, + "grad_norm": 11.609146757677005, + "language_loss": 0.74719626, + "learning_rate": 1.5999023123359235e-07, + "loss": 0.76194441, + "num_input_tokens_seen": 314042830, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.26831055, + "step": 14564, + "time_per_iteration": 2.6852827072143555 + }, + { + "auxiliary_loss_clip": 0.01228518, + "auxiliary_loss_mlp": 0.00190885, + "balance_loss_clip": 1.01553297, + "balance_loss_mlp": 0.16782945, + "epoch": 0.8756951751089734, + "flos": 20084012611200.0, + "grad_norm": 2.600381599361483, + "language_loss": 0.78095287, + "learning_rate": 1.598376334037408e-07, + "loss": 0.79514694, + "num_input_tokens_seen": 314062225, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.23071289, + "step": 14565, + "time_per_iteration": 2.675233840942383 + }, + { + "auxiliary_loss_clip": 0.0128842, + "auxiliary_loss_mlp": 0.00219811, + "balance_loss_clip": 1.04987371, + "balance_loss_mlp": 0.1905808, + "epoch": 0.8757552983616413, + "flos": 27525241739520.0, + "grad_norm": 1035.8602754126143, + "language_loss": 0.86270118, + "learning_rate": 1.5968510535303102e-07, + "loss": 0.87778342, + "num_input_tokens_seen": 314082325, + "router_z_loss_clip": 2.38476562, + "router_z_loss_mlp": 0.2923584, + "step": 14566, + "time_per_iteration": 2.716942548751831 + }, + { + "auxiliary_loss_clip": 0.01249248, + "auxiliary_loss_mlp": 0.00206207, + "balance_loss_clip": 1.03227925, + "balance_loss_mlp": 0.1827701, + "epoch": 0.8758154216143094, + "flos": 18073014796800.0, + "grad_norm": 6.743684987493111, + "language_loss": 0.80332142, + "learning_rate": 1.5953264708724624e-07, + "loss": 0.81787592, + "num_input_tokens_seen": 314100310, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.23449707, + "step": 14567, + "time_per_iteration": 2.668977975845337 + }, + { + "auxiliary_loss_clip": 0.01238216, + "auxiliary_loss_mlp": 0.00196824, + "balance_loss_clip": 1.0240705, + "balance_loss_mlp": 0.17297, + "epoch": 0.8758755448669773, + "flos": 25045681985280.0, + "grad_norm": 9.093998061303829, + "language_loss": 0.81467044, + "learning_rate": 1.5938025861216776e-07, + "loss": 0.8290208, + "num_input_tokens_seen": 314121330, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.23840332, + "step": 14568, + "time_per_iteration": 2.895484209060669 + }, + { + "auxiliary_loss_clip": 0.01246797, + "auxiliary_loss_mlp": 0.00206794, + "balance_loss_clip": 1.03487349, + "balance_loss_mlp": 0.18390587, + "epoch": 0.8759356681196453, + "flos": 22856818999680.0, + "grad_norm": 139.57934808100515, + "language_loss": 0.94490796, + "learning_rate": 1.5922793993357475e-07, + "loss": 0.95944393, + "num_input_tokens_seen": 314139875, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.22875977, + "step": 14569, + "time_per_iteration": 2.6754565238952637 + }, + { + "auxiliary_loss_clip": 0.01238891, + "auxiliary_loss_mlp": 0.00220344, + "balance_loss_clip": 1.02370846, + "balance_loss_mlp": 0.19563225, + "epoch": 0.8759957913723132, + "flos": 21032521102080.0, + "grad_norm": 2.3891493480551627, + "language_loss": 0.81959653, + "learning_rate": 1.5907569105724284e-07, + "loss": 0.83418894, + "num_input_tokens_seen": 314157850, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.24731445, + "step": 14570, + "time_per_iteration": 2.7163448333740234 + }, + { + "auxiliary_loss_clip": 0.01259379, + "auxiliary_loss_mlp": 0.00220069, + "balance_loss_clip": 1.03570819, + "balance_loss_mlp": 0.19598868, + "epoch": 0.8760559146249812, + "flos": 20010467514240.0, + "grad_norm": 151.43724567602834, + "language_loss": 0.76096004, + "learning_rate": 1.5892351198894472e-07, + "loss": 0.77575445, + "num_input_tokens_seen": 314176720, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.24084473, + "step": 14571, + "time_per_iteration": 2.657125949859619 + }, + { + "auxiliary_loss_clip": 0.01247677, + "auxiliary_loss_mlp": 0.00214267, + "balance_loss_clip": 1.03034234, + "balance_loss_mlp": 0.19041277, + "epoch": 0.8761160378776491, + "flos": 19974161842560.0, + "grad_norm": 56.95716268715254, + "language_loss": 0.72477996, + "learning_rate": 1.5877140273445156e-07, + "loss": 0.73939943, + "num_input_tokens_seen": 314196645, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.2388916, + "step": 14572, + "time_per_iteration": 2.6856110095977783 + }, + { + "auxiliary_loss_clip": 0.0122924, + "auxiliary_loss_mlp": 0.00198088, + "balance_loss_clip": 1.01513267, + "balance_loss_mlp": 0.17381679, + "epoch": 0.8761761611303172, + "flos": 28804415857920.0, + "grad_norm": 3.2689841689013, + "language_loss": 0.82700497, + "learning_rate": 1.5861936329953162e-07, + "loss": 0.8412782, + "num_input_tokens_seen": 314217430, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.24267578, + "step": 14573, + "time_per_iteration": 2.790480136871338 + }, + { + "auxiliary_loss_clip": 0.01220241, + "auxiliary_loss_mlp": 0.00190009, + "balance_loss_clip": 1.01223993, + "balance_loss_mlp": 0.16747804, + "epoch": 0.8762362843829851, + "flos": 18332505624960.0, + "grad_norm": 10.668956481498908, + "language_loss": 0.81072527, + "learning_rate": 1.5846739368994966e-07, + "loss": 0.82482779, + "num_input_tokens_seen": 314235310, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.2253418, + "step": 14574, + "time_per_iteration": 2.6432807445526123 + }, + { + "auxiliary_loss_clip": 0.01250759, + "auxiliary_loss_mlp": 0.00190161, + "balance_loss_clip": 1.03475654, + "balance_loss_mlp": 0.16631949, + "epoch": 0.8762964076356531, + "flos": 15779149378560.0, + "grad_norm": 2.708278683630191, + "language_loss": 0.82937771, + "learning_rate": 1.5831549391146903e-07, + "loss": 0.8437869, + "num_input_tokens_seen": 314252355, + "router_z_loss_clip": 2.16113281, + "router_z_loss_mlp": 0.23852539, + "step": 14575, + "time_per_iteration": 2.649977207183838 + }, + { + "auxiliary_loss_clip": 0.01220707, + "auxiliary_loss_mlp": 0.00221574, + "balance_loss_clip": 1.01296175, + "balance_loss_mlp": 0.1984714, + "epoch": 0.8763565308883211, + "flos": 33176754789120.0, + "grad_norm": 27.33318817678827, + "language_loss": 0.72395641, + "learning_rate": 1.5816366396984916e-07, + "loss": 0.73837924, + "num_input_tokens_seen": 314272755, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.23095703, + "step": 14576, + "time_per_iteration": 4.193576335906982 + }, + { + "auxiliary_loss_clip": 0.01231114, + "auxiliary_loss_mlp": 0.00218554, + "balance_loss_clip": 1.02066541, + "balance_loss_mlp": 0.19448574, + "epoch": 0.876416654140989, + "flos": 15888102307200.0, + "grad_norm": 66.09588055863574, + "language_loss": 0.74392718, + "learning_rate": 1.5801190387084806e-07, + "loss": 0.75842381, + "num_input_tokens_seen": 314291365, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.24084473, + "step": 14577, + "time_per_iteration": 2.7047746181488037 + }, + { + "auxiliary_loss_clip": 0.0128187, + "auxiliary_loss_mlp": 0.00213177, + "balance_loss_clip": 1.05122542, + "balance_loss_mlp": 0.18585438, + "epoch": 0.876476777393657, + "flos": 25885237547520.0, + "grad_norm": 23.319994319975244, + "language_loss": 0.80586708, + "learning_rate": 1.5786021362021962e-07, + "loss": 0.82081747, + "num_input_tokens_seen": 314310075, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.27319336, + "step": 14578, + "time_per_iteration": 4.164779424667358 + }, + { + "auxiliary_loss_clip": 0.01269376, + "auxiliary_loss_mlp": 0.00204452, + "balance_loss_clip": 1.04363847, + "balance_loss_mlp": 0.1783925, + "epoch": 0.876536900646325, + "flos": 13589675861760.0, + "grad_norm": 3.795488010824798, + "language_loss": 0.80867249, + "learning_rate": 1.5770859322371676e-07, + "loss": 0.82341075, + "num_input_tokens_seen": 314325695, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.26086426, + "step": 14579, + "time_per_iteration": 2.5782058238983154 + }, + { + "auxiliary_loss_clip": 0.01226835, + "auxiliary_loss_mlp": 0.00191798, + "balance_loss_clip": 1.01539087, + "balance_loss_mlp": 0.1687783, + "epoch": 0.876597023898993, + "flos": 12203344494720.0, + "grad_norm": 26.455425653127794, + "language_loss": 0.78176737, + "learning_rate": 1.5755704268708912e-07, + "loss": 0.79595375, + "num_input_tokens_seen": 314343605, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.23022461, + "step": 14580, + "time_per_iteration": 2.6377501487731934 + }, + { + "auxiliary_loss_clip": 0.01239171, + "auxiliary_loss_mlp": 0.00212826, + "balance_loss_clip": 1.02970707, + "balance_loss_mlp": 0.18909146, + "epoch": 0.8766571471516609, + "flos": 25336773803520.0, + "grad_norm": 4.996658510276659, + "language_loss": 0.73672557, + "learning_rate": 1.5740556201608256e-07, + "loss": 0.75124556, + "num_input_tokens_seen": 314364275, + "router_z_loss_clip": 2.09472656, + "router_z_loss_mlp": 0.23742676, + "step": 14581, + "time_per_iteration": 2.67145037651062 + }, + { + "auxiliary_loss_clip": 0.0122522, + "auxiliary_loss_mlp": 0.00201621, + "balance_loss_clip": 1.0114522, + "balance_loss_mlp": 0.17830411, + "epoch": 0.8767172704043289, + "flos": 30113287545600.0, + "grad_norm": 4.456378252789214, + "language_loss": 0.78447974, + "learning_rate": 1.572541512164416e-07, + "loss": 0.79874814, + "num_input_tokens_seen": 314385140, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.23303223, + "step": 14582, + "time_per_iteration": 2.727855920791626 + }, + { + "auxiliary_loss_clip": 0.01235644, + "auxiliary_loss_mlp": 0.00220275, + "balance_loss_clip": 1.01924264, + "balance_loss_mlp": 0.19578928, + "epoch": 0.8767773936569968, + "flos": 19281157770240.0, + "grad_norm": 166.28602104886014, + "language_loss": 0.76763874, + "learning_rate": 1.5710281029390826e-07, + "loss": 0.78219795, + "num_input_tokens_seen": 314403715, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.24499512, + "step": 14583, + "time_per_iteration": 4.070705890655518 + }, + { + "auxiliary_loss_clip": 0.01263964, + "auxiliary_loss_mlp": 0.00197656, + "balance_loss_clip": 1.04085279, + "balance_loss_mlp": 0.17150171, + "epoch": 0.8768375169096648, + "flos": 21247230648960.0, + "grad_norm": 9.996074619429974, + "language_loss": 0.84345907, + "learning_rate": 1.5695153925422067e-07, + "loss": 0.85807526, + "num_input_tokens_seen": 314421880, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.26171875, + "step": 14584, + "time_per_iteration": 2.6554510593414307 + }, + { + "auxiliary_loss_clip": 0.01249325, + "auxiliary_loss_mlp": 0.00197331, + "balance_loss_clip": 1.03239441, + "balance_loss_mlp": 0.172571, + "epoch": 0.8768976401623327, + "flos": 23295539715840.0, + "grad_norm": 5.222666518443242, + "language_loss": 0.79361904, + "learning_rate": 1.5680033810311555e-07, + "loss": 0.80808556, + "num_input_tokens_seen": 314441585, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.24755859, + "step": 14585, + "time_per_iteration": 2.6986677646636963 + }, + { + "auxiliary_loss_clip": 0.01239485, + "auxiliary_loss_mlp": 0.00217825, + "balance_loss_clip": 1.02522087, + "balance_loss_mlp": 0.19273192, + "epoch": 0.8769577634150008, + "flos": 21361247395200.0, + "grad_norm": 171.40198163867146, + "language_loss": 0.85352206, + "learning_rate": 1.5664920684632654e-07, + "loss": 0.86809516, + "num_input_tokens_seen": 314459020, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.25085449, + "step": 14586, + "time_per_iteration": 2.7267696857452393 + }, + { + "auxiliary_loss_clip": 0.01237449, + "auxiliary_loss_mlp": 0.00193574, + "balance_loss_clip": 1.02314425, + "balance_loss_mlp": 0.16859969, + "epoch": 0.8770178866676687, + "flos": 23514056104320.0, + "grad_norm": 2.183645230567387, + "language_loss": 0.86796963, + "learning_rate": 1.564981454895844e-07, + "loss": 0.88227987, + "num_input_tokens_seen": 314478935, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.24975586, + "step": 14587, + "time_per_iteration": 4.106330871582031 + }, + { + "auxiliary_loss_clip": 0.01256939, + "auxiliary_loss_mlp": 0.00221559, + "balance_loss_clip": 1.03581858, + "balance_loss_mlp": 0.19549988, + "epoch": 0.8770780099203367, + "flos": 19719052473600.0, + "grad_norm": 16.303160895652475, + "language_loss": 0.82239532, + "learning_rate": 1.5634715403861697e-07, + "loss": 0.83718026, + "num_input_tokens_seen": 314497635, + "router_z_loss_clip": 2.21386719, + "router_z_loss_mlp": 0.26086426, + "step": 14588, + "time_per_iteration": 2.7327075004577637 + }, + { + "auxiliary_loss_clip": 0.01229503, + "auxiliary_loss_mlp": 0.00192659, + "balance_loss_clip": 1.01723611, + "balance_loss_mlp": 0.16743411, + "epoch": 0.8771381331730047, + "flos": 21395901041280.0, + "grad_norm": 2.603843820372525, + "language_loss": 0.75424004, + "learning_rate": 1.5619623249915016e-07, + "loss": 0.7684617, + "num_input_tokens_seen": 314515445, + "router_z_loss_clip": 2.12402344, + "router_z_loss_mlp": 0.25231934, + "step": 14589, + "time_per_iteration": 2.6294307708740234 + }, + { + "auxiliary_loss_clip": 0.01260357, + "auxiliary_loss_mlp": 0.00233976, + "balance_loss_clip": 1.03969049, + "balance_loss_mlp": 0.20896566, + "epoch": 0.8771982564256726, + "flos": 20261770041600.0, + "grad_norm": 13.307063405463575, + "language_loss": 0.8075614, + "learning_rate": 1.5604538087690732e-07, + "loss": 0.82250476, + "num_input_tokens_seen": 314533040, + "router_z_loss_clip": 2.20605469, + "router_z_loss_mlp": 0.25036621, + "step": 14590, + "time_per_iteration": 2.681377410888672 + }, + { + "auxiliary_loss_clip": 0.01277666, + "auxiliary_loss_mlp": 0.00224263, + "balance_loss_clip": 1.04114151, + "balance_loss_mlp": 0.19579543, + "epoch": 0.8772583796783406, + "flos": 12489372495360.0, + "grad_norm": 7.080004984949966, + "language_loss": 0.83791763, + "learning_rate": 1.558945991776086e-07, + "loss": 0.85293692, + "num_input_tokens_seen": 314548280, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.2845459, + "step": 14591, + "time_per_iteration": 2.5977182388305664 + }, + { + "auxiliary_loss_clip": 0.01242974, + "auxiliary_loss_mlp": 0.00213938, + "balance_loss_clip": 1.03008902, + "balance_loss_mlp": 0.19076365, + "epoch": 0.8773185029310085, + "flos": 15921103927680.0, + "grad_norm": 5.458893668530817, + "language_loss": 0.86886346, + "learning_rate": 1.5574388740697096e-07, + "loss": 0.88343257, + "num_input_tokens_seen": 314565345, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.23168945, + "step": 14592, + "time_per_iteration": 2.6402688026428223 + }, + { + "auxiliary_loss_clip": 0.0122842, + "auxiliary_loss_mlp": 0.00202196, + "balance_loss_clip": 1.01724088, + "balance_loss_mlp": 0.17862871, + "epoch": 0.8773786261836766, + "flos": 21504530747520.0, + "grad_norm": 156.58574978599952, + "language_loss": 0.89689583, + "learning_rate": 1.5559324557071052e-07, + "loss": 0.91120195, + "num_input_tokens_seen": 314584190, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.23583984, + "step": 14593, + "time_per_iteration": 2.6624386310577393 + }, + { + "auxiliary_loss_clip": 0.01236862, + "auxiliary_loss_mlp": 0.00175975, + "balance_loss_clip": 1.02327061, + "balance_loss_mlp": 0.15271729, + "epoch": 0.8774387494363445, + "flos": 26761493831040.0, + "grad_norm": 859.4081847641895, + "language_loss": 0.82837224, + "learning_rate": 1.5544267367453845e-07, + "loss": 0.84250063, + "num_input_tokens_seen": 314605625, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.23266602, + "step": 14594, + "time_per_iteration": 2.7501978874206543 + }, + { + "auxiliary_loss_clip": 0.01239648, + "auxiliary_loss_mlp": 0.0021344, + "balance_loss_clip": 1.02192259, + "balance_loss_mlp": 0.18940759, + "epoch": 0.8774988726890125, + "flos": 18478841633280.0, + "grad_norm": 12.577624378309755, + "language_loss": 0.84011662, + "learning_rate": 1.552921717241651e-07, + "loss": 0.85464746, + "num_input_tokens_seen": 314622630, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.2401123, + "step": 14595, + "time_per_iteration": 2.602811336517334 + }, + { + "auxiliary_loss_clip": 0.01246394, + "auxiliary_loss_mlp": 0.0023349, + "balance_loss_clip": 1.02932334, + "balance_loss_mlp": 0.20856312, + "epoch": 0.8775589959416804, + "flos": 24426366664320.0, + "grad_norm": 5.629836949590199, + "language_loss": 0.79592705, + "learning_rate": 1.5514173972529743e-07, + "loss": 0.81072581, + "num_input_tokens_seen": 314642460, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.24951172, + "step": 14596, + "time_per_iteration": 2.6414475440979004 + }, + { + "auxiliary_loss_clip": 0.01242781, + "auxiliary_loss_mlp": 0.00207717, + "balance_loss_clip": 1.0283078, + "balance_loss_mlp": 0.18267064, + "epoch": 0.8776191191943484, + "flos": 23440151871360.0, + "grad_norm": 2.386298678785432, + "language_loss": 0.91946256, + "learning_rate": 1.5499137768364067e-07, + "loss": 0.93396759, + "num_input_tokens_seen": 314659875, + "router_z_loss_clip": 2.14746094, + "router_z_loss_mlp": 0.25048828, + "step": 14597, + "time_per_iteration": 2.717808246612549 + }, + { + "auxiliary_loss_clip": 0.01248149, + "auxiliary_loss_mlp": 0.00212457, + "balance_loss_clip": 1.03022242, + "balance_loss_mlp": 0.18727967, + "epoch": 0.8776792424470163, + "flos": 26830872950400.0, + "grad_norm": 12.567023401236579, + "language_loss": 0.78799808, + "learning_rate": 1.5484108560489494e-07, + "loss": 0.8026042, + "num_input_tokens_seen": 314680260, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.25170898, + "step": 14598, + "time_per_iteration": 2.699855327606201 + }, + { + "auxiliary_loss_clip": 0.0124858, + "auxiliary_loss_mlp": 0.00177655, + "balance_loss_clip": 1.02958703, + "balance_loss_mlp": 0.15358624, + "epoch": 0.8777393656996844, + "flos": 15626169354240.0, + "grad_norm": 2.1616581632719405, + "language_loss": 0.86978734, + "learning_rate": 1.5469086349476036e-07, + "loss": 0.88404959, + "num_input_tokens_seen": 314696260, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.24047852, + "step": 14599, + "time_per_iteration": 2.621403932571411 + }, + { + "auxiliary_loss_clip": 0.01258508, + "auxiliary_loss_mlp": 0.00218631, + "balance_loss_clip": 1.03654301, + "balance_loss_mlp": 0.19350141, + "epoch": 0.8777994889523523, + "flos": 18879999701760.0, + "grad_norm": 2.8994859335993333, + "language_loss": 0.79882121, + "learning_rate": 1.545407113589332e-07, + "loss": 0.81359261, + "num_input_tokens_seen": 314714215, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.2512207, + "step": 14600, + "time_per_iteration": 2.6168739795684814 + }, + { + "auxiliary_loss_clip": 0.01262535, + "auxiliary_loss_mlp": 0.00215028, + "balance_loss_clip": 1.03650749, + "balance_loss_mlp": 0.19006555, + "epoch": 0.8778596122050203, + "flos": 48826516400640.0, + "grad_norm": 2.5222066752054064, + "language_loss": 0.7676267, + "learning_rate": 1.543906292031072e-07, + "loss": 0.78240234, + "num_input_tokens_seen": 314735700, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.24938965, + "step": 14601, + "time_per_iteration": 3.027595043182373 + }, + { + "auxiliary_loss_clip": 0.01264264, + "auxiliary_loss_mlp": 0.00231191, + "balance_loss_clip": 1.04037809, + "balance_loss_mlp": 0.20454788, + "epoch": 0.8779197354576883, + "flos": 25660184883840.0, + "grad_norm": 11.016274885561202, + "language_loss": 0.81276774, + "learning_rate": 1.542406170329733e-07, + "loss": 0.82772237, + "num_input_tokens_seen": 314753335, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.26623535, + "step": 14602, + "time_per_iteration": 2.689143657684326 + }, + { + "auxiliary_loss_clip": 0.0123009, + "auxiliary_loss_mlp": 0.00210538, + "balance_loss_clip": 1.02100456, + "balance_loss_mlp": 0.18915194, + "epoch": 0.8779798587103562, + "flos": 18843227153280.0, + "grad_norm": 110.71171628474467, + "language_loss": 0.77154112, + "learning_rate": 1.5409067485422056e-07, + "loss": 0.78594744, + "num_input_tokens_seen": 314770800, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.21362305, + "step": 14603, + "time_per_iteration": 2.6671180725097656 + }, + { + "auxiliary_loss_clip": 0.01142758, + "auxiliary_loss_mlp": 0.00069364, + "balance_loss_clip": 1.0040673, + "balance_loss_mlp": 0.06116246, + "epoch": 0.8780399819630242, + "flos": 68613119377920.0, + "grad_norm": 0.7349116157251885, + "language_loss": 0.53582758, + "learning_rate": 1.539408026725344e-07, + "loss": 0.54794878, + "num_input_tokens_seen": 314837275, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.08203125, + "step": 14604, + "time_per_iteration": 3.190201759338379 + }, + { + "auxiliary_loss_clip": 0.01145479, + "auxiliary_loss_mlp": 0.00080339, + "balance_loss_clip": 1.00840402, + "balance_loss_mlp": 0.07337709, + "epoch": 0.8781001052156922, + "flos": 65734807766400.0, + "grad_norm": 0.6889713135709393, + "language_loss": 0.5784229, + "learning_rate": 1.537910004935976e-07, + "loss": 0.59068108, + "num_input_tokens_seen": 314902220, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.06982422, + "step": 14605, + "time_per_iteration": 3.1572859287261963 + }, + { + "auxiliary_loss_clip": 0.01242357, + "auxiliary_loss_mlp": 0.00208706, + "balance_loss_clip": 1.0280875, + "balance_loss_mlp": 0.18410078, + "epoch": 0.8781602284683602, + "flos": 22049654526720.0, + "grad_norm": 20.24468496330013, + "language_loss": 0.92179638, + "learning_rate": 1.536412683230912e-07, + "loss": 0.93630695, + "num_input_tokens_seen": 314921645, + "router_z_loss_clip": 2.14160156, + "router_z_loss_mlp": 0.24609375, + "step": 14606, + "time_per_iteration": 2.755272388458252 + }, + { + "auxiliary_loss_clip": 0.01265823, + "auxiliary_loss_mlp": 0.00228795, + "balance_loss_clip": 1.04136038, + "balance_loss_mlp": 0.20116207, + "epoch": 0.8782203517210281, + "flos": 17562939713280.0, + "grad_norm": 227.28301414421742, + "language_loss": 0.80491662, + "learning_rate": 1.534916061666931e-07, + "loss": 0.81986284, + "num_input_tokens_seen": 314939390, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.27661133, + "step": 14607, + "time_per_iteration": 2.6171460151672363 + }, + { + "auxiliary_loss_clip": 0.01231875, + "auxiliary_loss_mlp": 0.0020044, + "balance_loss_clip": 1.0227077, + "balance_loss_mlp": 0.17646727, + "epoch": 0.8782804749736961, + "flos": 25520421064320.0, + "grad_norm": 7.108441214532879, + "language_loss": 0.79673266, + "learning_rate": 1.533420140300785e-07, + "loss": 0.81105578, + "num_input_tokens_seen": 314959205, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.23974609, + "step": 14608, + "time_per_iteration": 2.7729554176330566 + }, + { + "auxiliary_loss_clip": 0.01250675, + "auxiliary_loss_mlp": 0.00229415, + "balance_loss_clip": 1.03111148, + "balance_loss_mlp": 0.20525165, + "epoch": 0.878340598226364, + "flos": 21798747048960.0, + "grad_norm": 409.60870711053053, + "language_loss": 0.96345317, + "learning_rate": 1.5319249191891936e-07, + "loss": 0.97825408, + "num_input_tokens_seen": 314977485, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.24169922, + "step": 14609, + "time_per_iteration": 2.622631549835205 + }, + { + "auxiliary_loss_clip": 0.01260979, + "auxiliary_loss_mlp": 0.00217918, + "balance_loss_clip": 1.03690839, + "balance_loss_mlp": 0.19280057, + "epoch": 0.878400721479032, + "flos": 21102403011840.0, + "grad_norm": 6.341992656864886, + "language_loss": 0.77331185, + "learning_rate": 1.5304303983888643e-07, + "loss": 0.78810084, + "num_input_tokens_seen": 314997830, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.2512207, + "step": 14610, + "time_per_iteration": 2.75335431098938 + }, + { + "auxiliary_loss_clip": 0.01241751, + "auxiliary_loss_mlp": 0.00207522, + "balance_loss_clip": 1.0256238, + "balance_loss_mlp": 0.18354857, + "epoch": 0.8784608447316999, + "flos": 20923532259840.0, + "grad_norm": 11.121690678331516, + "language_loss": 0.88660067, + "learning_rate": 1.5289365779564612e-07, + "loss": 0.90109342, + "num_input_tokens_seen": 315016480, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.23986816, + "step": 14611, + "time_per_iteration": 2.6261003017425537 + }, + { + "auxiliary_loss_clip": 0.01251774, + "auxiliary_loss_mlp": 0.00209429, + "balance_loss_clip": 1.03082442, + "balance_loss_mlp": 0.18580216, + "epoch": 0.878520967984368, + "flos": 23330660238720.0, + "grad_norm": 11.023422556483977, + "language_loss": 0.8358717, + "learning_rate": 1.5274434579486338e-07, + "loss": 0.85048378, + "num_input_tokens_seen": 315036135, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.2364502, + "step": 14612, + "time_per_iteration": 2.707693576812744 + }, + { + "auxiliary_loss_clip": 0.01243054, + "auxiliary_loss_mlp": 0.00195835, + "balance_loss_clip": 1.02655423, + "balance_loss_mlp": 0.17274451, + "epoch": 0.8785810912370359, + "flos": 25518984520320.0, + "grad_norm": 370.2296197082719, + "language_loss": 0.78430372, + "learning_rate": 1.525951038422002e-07, + "loss": 0.79869258, + "num_input_tokens_seen": 315057995, + "router_z_loss_clip": 2.16503906, + "router_z_loss_mlp": 0.2310791, + "step": 14613, + "time_per_iteration": 2.6760265827178955 + }, + { + "auxiliary_loss_clip": 0.01139881, + "auxiliary_loss_mlp": 0.00137464, + "balance_loss_clip": 1.0013231, + "balance_loss_mlp": 0.12854719, + "epoch": 0.8786412144897039, + "flos": 61841047691520.0, + "grad_norm": 0.9952612940458774, + "language_loss": 0.6366232, + "learning_rate": 1.5244593194331667e-07, + "loss": 0.64939666, + "num_input_tokens_seen": 315104010, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.08935547, + "step": 14614, + "time_per_iteration": 3.011845827102661 + }, + { + "auxiliary_loss_clip": 0.01138479, + "auxiliary_loss_mlp": 0.0018543, + "balance_loss_clip": 1.00004768, + "balance_loss_mlp": 0.17694235, + "epoch": 0.8787013377423719, + "flos": 70989364638720.0, + "grad_norm": 0.6747015530389926, + "language_loss": 0.57317495, + "learning_rate": 1.5229683010386762e-07, + "loss": 0.5864141, + "num_input_tokens_seen": 315174550, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.08496094, + "step": 14615, + "time_per_iteration": 3.1996216773986816 + }, + { + "auxiliary_loss_clip": 0.01229591, + "auxiliary_loss_mlp": 0.00195566, + "balance_loss_clip": 1.01358366, + "balance_loss_mlp": 0.17072304, + "epoch": 0.8787614609950398, + "flos": 17347404153600.0, + "grad_norm": 4.741661414565293, + "language_loss": 0.83353543, + "learning_rate": 1.5214779832950807e-07, + "loss": 0.84778702, + "num_input_tokens_seen": 315191825, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.24841309, + "step": 14616, + "time_per_iteration": 2.616708517074585 + }, + { + "auxiliary_loss_clip": 0.01140752, + "auxiliary_loss_mlp": 0.00161044, + "balance_loss_clip": 1.00127816, + "balance_loss_mlp": 0.15236598, + "epoch": 0.8788215842477078, + "flos": 72511401588480.0, + "grad_norm": 0.7813506904411884, + "language_loss": 0.57107425, + "learning_rate": 1.5199883662588953e-07, + "loss": 0.5840922, + "num_input_tokens_seen": 315255075, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.08691406, + "step": 14617, + "time_per_iteration": 3.2486398220062256 + }, + { + "auxiliary_loss_clip": 0.01239515, + "auxiliary_loss_mlp": 0.00229066, + "balance_loss_clip": 1.02190185, + "balance_loss_mlp": 0.20386526, + "epoch": 0.8788817075003758, + "flos": 24827452905600.0, + "grad_norm": 5.167972963115753, + "language_loss": 0.90056866, + "learning_rate": 1.5184994499865987e-07, + "loss": 0.91525447, + "num_input_tokens_seen": 315273995, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.25183105, + "step": 14618, + "time_per_iteration": 4.082475662231445 + }, + { + "auxiliary_loss_clip": 0.01237054, + "auxiliary_loss_mlp": 0.00231362, + "balance_loss_clip": 1.02541447, + "balance_loss_mlp": 0.20619714, + "epoch": 0.8789418307530438, + "flos": 22638769488000.0, + "grad_norm": 48.26608417631701, + "language_loss": 0.76573575, + "learning_rate": 1.5170112345346598e-07, + "loss": 0.78041989, + "num_input_tokens_seen": 315294485, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.25134277, + "step": 14619, + "time_per_iteration": 2.738611936569214 + }, + { + "auxiliary_loss_clip": 0.01241426, + "auxiliary_loss_mlp": 0.00202289, + "balance_loss_clip": 1.0251925, + "balance_loss_mlp": 0.17864934, + "epoch": 0.8790019540057117, + "flos": 19785738072960.0, + "grad_norm": 135.51900608121278, + "language_loss": 0.84347105, + "learning_rate": 1.5155237199595016e-07, + "loss": 0.85790819, + "num_input_tokens_seen": 315310420, + "router_z_loss_clip": 2.16503906, + "router_z_loss_mlp": 0.23657227, + "step": 14620, + "time_per_iteration": 4.061131000518799 + }, + { + "auxiliary_loss_clip": 0.01253777, + "auxiliary_loss_mlp": 0.00231344, + "balance_loss_clip": 1.03716898, + "balance_loss_mlp": 0.20507036, + "epoch": 0.8790620772583797, + "flos": 20229774001920.0, + "grad_norm": 5.0138116011689124, + "language_loss": 0.89048719, + "learning_rate": 1.514036906317542e-07, + "loss": 0.90533835, + "num_input_tokens_seen": 315330110, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.26281738, + "step": 14621, + "time_per_iteration": 2.7099084854125977 + }, + { + "auxiliary_loss_clip": 0.01267571, + "auxiliary_loss_mlp": 0.00214538, + "balance_loss_clip": 1.04112887, + "balance_loss_mlp": 0.18899122, + "epoch": 0.8791222005110476, + "flos": 24130785646080.0, + "grad_norm": 26.932774059664062, + "language_loss": 0.7386089, + "learning_rate": 1.5125507936651506e-07, + "loss": 0.75343001, + "num_input_tokens_seen": 315350080, + "router_z_loss_clip": 2.265625, + "router_z_loss_mlp": 0.25561523, + "step": 14622, + "time_per_iteration": 2.6987380981445312 + }, + { + "auxiliary_loss_clip": 0.01251025, + "auxiliary_loss_mlp": 0.00185973, + "balance_loss_clip": 1.02710021, + "balance_loss_mlp": 0.1589362, + "epoch": 0.8791823237637156, + "flos": 21614201948160.0, + "grad_norm": 11.433020326348347, + "language_loss": 0.8004688, + "learning_rate": 1.511065382058687e-07, + "loss": 0.81483883, + "num_input_tokens_seen": 315366360, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.27026367, + "step": 14623, + "time_per_iteration": 2.6417529582977295 + }, + { + "auxiliary_loss_clip": 0.01229896, + "auxiliary_loss_mlp": 0.0020364, + "balance_loss_clip": 1.01542604, + "balance_loss_mlp": 0.18088251, + "epoch": 0.8792424470163835, + "flos": 24243401761920.0, + "grad_norm": 3.7370957623426158, + "language_loss": 0.86750436, + "learning_rate": 1.5095806715544801e-07, + "loss": 0.88183975, + "num_input_tokens_seen": 315385890, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.22753906, + "step": 14624, + "time_per_iteration": 2.6745190620422363 + }, + { + "auxiliary_loss_clip": 0.01246464, + "auxiliary_loss_mlp": 0.00227951, + "balance_loss_clip": 1.02765107, + "balance_loss_mlp": 0.20181994, + "epoch": 0.8793025702690516, + "flos": 24893204751360.0, + "grad_norm": 40.16130120357125, + "language_loss": 0.88377905, + "learning_rate": 1.5080966622088265e-07, + "loss": 0.89852315, + "num_input_tokens_seen": 315403400, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.26135254, + "step": 14625, + "time_per_iteration": 4.113278865814209 + }, + { + "auxiliary_loss_clip": 0.0122681, + "auxiliary_loss_mlp": 0.00200583, + "balance_loss_clip": 1.01100683, + "balance_loss_mlp": 0.17541805, + "epoch": 0.8793626935217195, + "flos": 25373115388800.0, + "grad_norm": 14.319227897277162, + "language_loss": 0.80040765, + "learning_rate": 1.5066133540779967e-07, + "loss": 0.81468153, + "num_input_tokens_seen": 315423670, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.25158691, + "step": 14626, + "time_per_iteration": 2.7406954765319824 + }, + { + "auxiliary_loss_clip": 0.01239209, + "auxiliary_loss_mlp": 0.00199145, + "balance_loss_clip": 1.02145064, + "balance_loss_mlp": 0.17288366, + "epoch": 0.8794228167743875, + "flos": 34678000742400.0, + "grad_norm": 3.5238374791680114, + "language_loss": 0.78820407, + "learning_rate": 1.505130747218246e-07, + "loss": 0.80258763, + "num_input_tokens_seen": 315446265, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.26257324, + "step": 14627, + "time_per_iteration": 2.7868871688842773 + }, + { + "auxiliary_loss_clip": 0.01244392, + "auxiliary_loss_mlp": 0.00206259, + "balance_loss_clip": 1.02619386, + "balance_loss_mlp": 0.18074787, + "epoch": 0.8794829400270555, + "flos": 19464014931840.0, + "grad_norm": 20.27071204848293, + "language_loss": 0.78856128, + "learning_rate": 1.5036488416857873e-07, + "loss": 0.8030678, + "num_input_tokens_seen": 315464655, + "router_z_loss_clip": 2.18652344, + "router_z_loss_mlp": 0.25488281, + "step": 14628, + "time_per_iteration": 2.674473285675049 + }, + { + "auxiliary_loss_clip": 0.01241599, + "auxiliary_loss_mlp": 0.00205399, + "balance_loss_clip": 1.02439499, + "balance_loss_mlp": 0.17987639, + "epoch": 0.8795430632797234, + "flos": 15231403906560.0, + "grad_norm": 20.739233241421587, + "language_loss": 0.813519, + "learning_rate": 1.5021676375368175e-07, + "loss": 0.82798904, + "num_input_tokens_seen": 315481090, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.25549316, + "step": 14629, + "time_per_iteration": 2.6836891174316406 + }, + { + "auxiliary_loss_clip": 0.01228817, + "auxiliary_loss_mlp": 0.00223782, + "balance_loss_clip": 1.01707864, + "balance_loss_mlp": 0.20048892, + "epoch": 0.8796031865323914, + "flos": 27744727795200.0, + "grad_norm": 3.908048739034047, + "language_loss": 0.75191039, + "learning_rate": 1.5006871348275053e-07, + "loss": 0.7664364, + "num_input_tokens_seen": 315502010, + "router_z_loss_clip": 2.11816406, + "router_z_loss_mlp": 0.2331543, + "step": 14630, + "time_per_iteration": 4.160262823104858 + }, + { + "auxiliary_loss_clip": 0.01230264, + "auxiliary_loss_mlp": 0.00205031, + "balance_loss_clip": 1.01881266, + "balance_loss_mlp": 0.18161789, + "epoch": 0.8796633097850594, + "flos": 31285412156160.0, + "grad_norm": 29.687521108392104, + "language_loss": 0.8043642, + "learning_rate": 1.499207333613999e-07, + "loss": 0.81871718, + "num_input_tokens_seen": 315523040, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.23413086, + "step": 14631, + "time_per_iteration": 2.7756783962249756 + }, + { + "auxiliary_loss_clip": 0.01237367, + "auxiliary_loss_mlp": 0.00207757, + "balance_loss_clip": 1.02112317, + "balance_loss_mlp": 0.18262736, + "epoch": 0.8797234330377274, + "flos": 24243150366720.0, + "grad_norm": 18.463355010957255, + "language_loss": 0.75962138, + "learning_rate": 1.4977282339523954e-07, + "loss": 0.77407265, + "num_input_tokens_seen": 315541865, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.2512207, + "step": 14632, + "time_per_iteration": 2.6979832649230957 + }, + { + "auxiliary_loss_clip": 0.01251767, + "auxiliary_loss_mlp": 0.00202145, + "balance_loss_clip": 1.03452122, + "balance_loss_mlp": 0.17850557, + "epoch": 0.8797835562903953, + "flos": 24167414540160.0, + "grad_norm": 4.110969139897216, + "language_loss": 0.75456822, + "learning_rate": 1.4962498358987929e-07, + "loss": 0.76910734, + "num_input_tokens_seen": 315561470, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.2364502, + "step": 14633, + "time_per_iteration": 2.6748769283294678 + }, + { + "auxiliary_loss_clip": 0.01241561, + "auxiliary_loss_mlp": 0.00212968, + "balance_loss_clip": 1.02167618, + "balance_loss_mlp": 0.18899456, + "epoch": 0.8798436795430633, + "flos": 19284677303040.0, + "grad_norm": 2.5114088293170442, + "language_loss": 0.89020437, + "learning_rate": 1.4947721395092528e-07, + "loss": 0.90474963, + "num_input_tokens_seen": 315583140, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.23986816, + "step": 14634, + "time_per_iteration": 2.7000691890716553 + }, + { + "auxiliary_loss_clip": 0.01244361, + "auxiliary_loss_mlp": 0.00213229, + "balance_loss_clip": 1.02582145, + "balance_loss_mlp": 0.18787315, + "epoch": 0.8799038027957312, + "flos": 28179390274560.0, + "grad_norm": 99.5129776238023, + "language_loss": 0.87621319, + "learning_rate": 1.4932951448398056e-07, + "loss": 0.89078903, + "num_input_tokens_seen": 315601935, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.25354004, + "step": 14635, + "time_per_iteration": 2.730013847351074 + }, + { + "auxiliary_loss_clip": 0.0125314, + "auxiliary_loss_mlp": 0.00205208, + "balance_loss_clip": 1.03108573, + "balance_loss_mlp": 0.17887495, + "epoch": 0.8799639260483992, + "flos": 24644703484800.0, + "grad_norm": 13.087960564262703, + "language_loss": 0.73733461, + "learning_rate": 1.4918188519464648e-07, + "loss": 0.75191808, + "num_input_tokens_seen": 315619995, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.26318359, + "step": 14636, + "time_per_iteration": 2.6831247806549072 + }, + { + "auxiliary_loss_clip": 0.01230301, + "auxiliary_loss_mlp": 0.00204146, + "balance_loss_clip": 1.01495647, + "balance_loss_mlp": 0.17927849, + "epoch": 0.8800240493010671, + "flos": 22200479735040.0, + "grad_norm": 2.830732921815829, + "language_loss": 0.77855396, + "learning_rate": 1.4903432608852074e-07, + "loss": 0.79289842, + "num_input_tokens_seen": 315637895, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.24853516, + "step": 14637, + "time_per_iteration": 2.6743836402893066 + }, + { + "auxiliary_loss_clip": 0.01249995, + "auxiliary_loss_mlp": 0.00200929, + "balance_loss_clip": 1.03531623, + "balance_loss_mlp": 0.17810075, + "epoch": 0.8800841725537352, + "flos": 14246086953600.0, + "grad_norm": 13.028554837422135, + "language_loss": 0.75146008, + "learning_rate": 1.4888683717119843e-07, + "loss": 0.76596934, + "num_input_tokens_seen": 315655520, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.22814941, + "step": 14638, + "time_per_iteration": 2.624941110610962 + }, + { + "auxiliary_loss_clip": 0.01241134, + "auxiliary_loss_mlp": 0.00193256, + "balance_loss_clip": 1.0269537, + "balance_loss_mlp": 0.17066596, + "epoch": 0.8801442958064031, + "flos": 37415794348800.0, + "grad_norm": 4.012448640835184, + "language_loss": 0.65899128, + "learning_rate": 1.4873941844827286e-07, + "loss": 0.67333519, + "num_input_tokens_seen": 315678955, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.22607422, + "step": 14639, + "time_per_iteration": 2.8232548236846924 + }, + { + "auxiliary_loss_clip": 0.01235423, + "auxiliary_loss_mlp": 0.00205233, + "balance_loss_clip": 1.02245569, + "balance_loss_mlp": 0.18140276, + "epoch": 0.8802044190590711, + "flos": 25047334010880.0, + "grad_norm": 40.51349960442892, + "language_loss": 0.81971192, + "learning_rate": 1.4859206992533402e-07, + "loss": 0.83411849, + "num_input_tokens_seen": 315700360, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.23840332, + "step": 14640, + "time_per_iteration": 2.690929651260376 + }, + { + "auxiliary_loss_clip": 0.01239025, + "auxiliary_loss_mlp": 0.00196386, + "balance_loss_clip": 1.01909184, + "balance_loss_mlp": 0.17160256, + "epoch": 0.8802645423117391, + "flos": 24133874215680.0, + "grad_norm": 249.49906959825012, + "language_loss": 0.77314454, + "learning_rate": 1.4844479160796985e-07, + "loss": 0.78749859, + "num_input_tokens_seen": 315719270, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.24743652, + "step": 14641, + "time_per_iteration": 2.695016622543335 + }, + { + "auxiliary_loss_clip": 0.0126547, + "auxiliary_loss_mlp": 0.00237034, + "balance_loss_clip": 1.03764319, + "balance_loss_mlp": 0.20950896, + "epoch": 0.880324665564407, + "flos": 17931203902080.0, + "grad_norm": 15.462335048879238, + "language_loss": 0.95374441, + "learning_rate": 1.4829758350176457e-07, + "loss": 0.96876955, + "num_input_tokens_seen": 315737425, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.2755127, + "step": 14642, + "time_per_iteration": 2.6634738445281982 + }, + { + "auxiliary_loss_clip": 0.01241527, + "auxiliary_loss_mlp": 0.0020865, + "balance_loss_clip": 1.02401817, + "balance_loss_mlp": 0.18276951, + "epoch": 0.880384788817075, + "flos": 21287630471040.0, + "grad_norm": 8.555387416619876, + "language_loss": 0.85945487, + "learning_rate": 1.4815044561230038e-07, + "loss": 0.87395656, + "num_input_tokens_seen": 315755725, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.25915527, + "step": 14643, + "time_per_iteration": 2.678297281265259 + }, + { + "auxiliary_loss_clip": 0.01224129, + "auxiliary_loss_mlp": 0.00218979, + "balance_loss_clip": 1.00961804, + "balance_loss_mlp": 0.19262151, + "epoch": 0.880444912069743, + "flos": 12458489777280.0, + "grad_norm": 322.32196068194935, + "language_loss": 0.80925012, + "learning_rate": 1.4800337794515705e-07, + "loss": 0.82368112, + "num_input_tokens_seen": 315773835, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.2635498, + "step": 14644, + "time_per_iteration": 2.6891934871673584 + }, + { + "auxiliary_loss_clip": 0.01254103, + "auxiliary_loss_mlp": 0.00221241, + "balance_loss_clip": 1.02959645, + "balance_loss_mlp": 0.19536027, + "epoch": 0.880505035322411, + "flos": 13625945619840.0, + "grad_norm": 13.299011058239252, + "language_loss": 0.89939505, + "learning_rate": 1.47856380505911e-07, + "loss": 0.91414845, + "num_input_tokens_seen": 315790615, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.25891113, + "step": 14645, + "time_per_iteration": 2.6585636138916016 + }, + { + "auxiliary_loss_clip": 0.01234985, + "auxiliary_loss_mlp": 0.00224473, + "balance_loss_clip": 1.02243018, + "balance_loss_mlp": 0.19937953, + "epoch": 0.8805651585750789, + "flos": 23183067254400.0, + "grad_norm": 16.830985805200758, + "language_loss": 0.71650374, + "learning_rate": 1.477094533001364e-07, + "loss": 0.73109829, + "num_input_tokens_seen": 315811010, + "router_z_loss_clip": 2.12402344, + "router_z_loss_mlp": 0.25061035, + "step": 14646, + "time_per_iteration": 2.652214288711548 + }, + { + "auxiliary_loss_clip": 0.01261394, + "auxiliary_loss_mlp": 0.00223313, + "balance_loss_clip": 1.03624225, + "balance_loss_mlp": 0.19752851, + "epoch": 0.8806252818277469, + "flos": 14903000835840.0, + "grad_norm": 9.637728397416852, + "language_loss": 0.88839787, + "learning_rate": 1.475625963334055e-07, + "loss": 0.90324497, + "num_input_tokens_seen": 315828130, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.25817871, + "step": 14647, + "time_per_iteration": 2.5911874771118164 + }, + { + "auxiliary_loss_clip": 0.01242461, + "auxiliary_loss_mlp": 0.00211461, + "balance_loss_clip": 1.02702856, + "balance_loss_mlp": 0.18728489, + "epoch": 0.8806854050804148, + "flos": 17639178330240.0, + "grad_norm": 3.055741766769449, + "language_loss": 0.84167427, + "learning_rate": 1.4741580961128652e-07, + "loss": 0.85621351, + "num_input_tokens_seen": 315844900, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.24182129, + "step": 14648, + "time_per_iteration": 2.6600260734558105 + }, + { + "auxiliary_loss_clip": 0.01235981, + "auxiliary_loss_mlp": 0.0021762, + "balance_loss_clip": 1.02156258, + "balance_loss_mlp": 0.19374272, + "epoch": 0.8807455283330828, + "flos": 25332392344320.0, + "grad_norm": 4.214813449353809, + "language_loss": 0.72544032, + "learning_rate": 1.4726909313934522e-07, + "loss": 0.73997635, + "num_input_tokens_seen": 315863745, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.23864746, + "step": 14649, + "time_per_iteration": 2.6548690795898438 + }, + { + "auxiliary_loss_clip": 0.01240199, + "auxiliary_loss_mlp": 0.00211031, + "balance_loss_clip": 1.02259135, + "balance_loss_mlp": 0.18583035, + "epoch": 0.8808056515857507, + "flos": 25265168040960.0, + "grad_norm": 9.976858269062886, + "language_loss": 0.68802983, + "learning_rate": 1.4712244692314578e-07, + "loss": 0.70254213, + "num_input_tokens_seen": 315885765, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.25195312, + "step": 14650, + "time_per_iteration": 2.700778007507324 + }, + { + "auxiliary_loss_clip": 0.01236399, + "auxiliary_loss_mlp": 0.00215135, + "balance_loss_clip": 1.02431452, + "balance_loss_mlp": 0.18939802, + "epoch": 0.8808657748384188, + "flos": 26578852151040.0, + "grad_norm": 5.082270517712404, + "language_loss": 0.78509367, + "learning_rate": 1.4697587096824914e-07, + "loss": 0.79960907, + "num_input_tokens_seen": 315907340, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.25708008, + "step": 14651, + "time_per_iteration": 2.7909634113311768 + }, + { + "auxiliary_loss_clip": 0.01257177, + "auxiliary_loss_mlp": 0.00191376, + "balance_loss_clip": 1.03491592, + "balance_loss_mlp": 0.1666525, + "epoch": 0.8809258980910867, + "flos": 18661231918080.0, + "grad_norm": 2.429150804684975, + "language_loss": 0.78341973, + "learning_rate": 1.4682936528021284e-07, + "loss": 0.79790521, + "num_input_tokens_seen": 315924935, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.24719238, + "step": 14652, + "time_per_iteration": 2.674785852432251 + }, + { + "auxiliary_loss_clip": 0.01226887, + "auxiliary_loss_mlp": 0.00192402, + "balance_loss_clip": 1.0115875, + "balance_loss_mlp": 0.1684055, + "epoch": 0.8809860213437547, + "flos": 19792274348160.0, + "grad_norm": 478.45755585983363, + "language_loss": 0.85242909, + "learning_rate": 1.4668292986459286e-07, + "loss": 0.86662197, + "num_input_tokens_seen": 315943165, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.23999023, + "step": 14653, + "time_per_iteration": 2.6306660175323486 + }, + { + "auxiliary_loss_clip": 0.01257958, + "auxiliary_loss_mlp": 0.00209164, + "balance_loss_clip": 1.0343883, + "balance_loss_mlp": 0.18300985, + "epoch": 0.8810461445964227, + "flos": 17894467267200.0, + "grad_norm": 152.27389710197167, + "language_loss": 0.80716908, + "learning_rate": 1.465365647269421e-07, + "loss": 0.82184029, + "num_input_tokens_seen": 315961340, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.26135254, + "step": 14654, + "time_per_iteration": 2.670576810836792 + }, + { + "auxiliary_loss_clip": 0.01232634, + "auxiliary_loss_mlp": 0.0020995, + "balance_loss_clip": 1.01438069, + "balance_loss_mlp": 0.18297267, + "epoch": 0.8811062678490906, + "flos": 29163917128320.0, + "grad_norm": 51.40218536582138, + "language_loss": 0.79025245, + "learning_rate": 1.4639026987281012e-07, + "loss": 0.8046782, + "num_input_tokens_seen": 315981335, + "router_z_loss_clip": 2.18066406, + "router_z_loss_mlp": 0.26989746, + "step": 14655, + "time_per_iteration": 2.7009658813476562 + }, + { + "auxiliary_loss_clip": 0.01242798, + "auxiliary_loss_mlp": 0.00212105, + "balance_loss_clip": 1.0256449, + "balance_loss_mlp": 0.18766747, + "epoch": 0.8811663911017587, + "flos": 20338834671360.0, + "grad_norm": 18.21658981170416, + "language_loss": 0.8823477, + "learning_rate": 1.462440453077449e-07, + "loss": 0.89689672, + "num_input_tokens_seen": 316001325, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.24438477, + "step": 14656, + "time_per_iteration": 2.6468594074249268 + }, + { + "auxiliary_loss_clip": 0.01262175, + "auxiliary_loss_mlp": 0.00203047, + "balance_loss_clip": 1.03819251, + "balance_loss_mlp": 0.17812021, + "epoch": 0.8812265143544266, + "flos": 25885704424320.0, + "grad_norm": 4.431932756179847, + "language_loss": 0.77183688, + "learning_rate": 1.460978910372914e-07, + "loss": 0.78648913, + "num_input_tokens_seen": 316022540, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.24938965, + "step": 14657, + "time_per_iteration": 2.7759833335876465 + }, + { + "auxiliary_loss_clip": 0.01247002, + "auxiliary_loss_mlp": 0.00181672, + "balance_loss_clip": 1.02782404, + "balance_loss_mlp": 0.15699555, + "epoch": 0.8812866376070946, + "flos": 27195509865600.0, + "grad_norm": 7.8039857556862335, + "language_loss": 0.93415344, + "learning_rate": 1.4595180706699207e-07, + "loss": 0.94844019, + "num_input_tokens_seen": 316037735, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.24694824, + "step": 14658, + "time_per_iteration": 2.746023178100586 + }, + { + "auxiliary_loss_clip": 0.01262232, + "auxiliary_loss_mlp": 0.00210433, + "balance_loss_clip": 1.0377934, + "balance_loss_mlp": 0.18146518, + "epoch": 0.8813467608597625, + "flos": 23807194997760.0, + "grad_norm": 81.5324246702588, + "language_loss": 0.85448045, + "learning_rate": 1.4580579340238554e-07, + "loss": 0.86920714, + "num_input_tokens_seen": 316058105, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.28979492, + "step": 14659, + "time_per_iteration": 2.734225273132324 + }, + { + "auxiliary_loss_clip": 0.01233571, + "auxiliary_loss_mlp": 0.00220373, + "balance_loss_clip": 1.01897383, + "balance_loss_mlp": 0.19607805, + "epoch": 0.8814068841124305, + "flos": 21105455667840.0, + "grad_norm": 96.1752773942248, + "language_loss": 0.69550681, + "learning_rate": 1.4565985004900894e-07, + "loss": 0.71004617, + "num_input_tokens_seen": 316074415, + "router_z_loss_clip": 2.14160156, + "router_z_loss_mlp": 0.24279785, + "step": 14660, + "time_per_iteration": 4.215841293334961 + }, + { + "auxiliary_loss_clip": 0.01234385, + "auxiliary_loss_mlp": 0.00199558, + "balance_loss_clip": 1.0176928, + "balance_loss_mlp": 0.17378452, + "epoch": 0.8814670073650984, + "flos": 24716991605760.0, + "grad_norm": 11.68168917412, + "language_loss": 0.86500096, + "learning_rate": 1.455139770123972e-07, + "loss": 0.87934041, + "num_input_tokens_seen": 316094405, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.2578125, + "step": 14661, + "time_per_iteration": 2.7820184230804443 + }, + { + "auxiliary_loss_clip": 0.01232175, + "auxiliary_loss_mlp": 0.00210241, + "balance_loss_clip": 1.01739562, + "balance_loss_mlp": 0.18531425, + "epoch": 0.8815271306177664, + "flos": 22966274718720.0, + "grad_norm": 18.992304143602148, + "language_loss": 0.83568907, + "learning_rate": 1.45368174298081e-07, + "loss": 0.85011327, + "num_input_tokens_seen": 316113390, + "router_z_loss_clip": 2.14746094, + "router_z_loss_mlp": 0.24951172, + "step": 14662, + "time_per_iteration": 4.058739900588989 + }, + { + "auxiliary_loss_clip": 0.01207693, + "auxiliary_loss_mlp": 0.00186262, + "balance_loss_clip": 1.00162315, + "balance_loss_mlp": 0.16435108, + "epoch": 0.8815872538704344, + "flos": 19460064435840.0, + "grad_norm": 133.0019995448286, + "language_loss": 0.80989254, + "learning_rate": 1.4522244191158929e-07, + "loss": 0.82383215, + "num_input_tokens_seen": 316131085, + "router_z_loss_clip": 2.05664062, + "router_z_loss_mlp": 0.21911621, + "step": 14663, + "time_per_iteration": 2.584728479385376 + }, + { + "auxiliary_loss_clip": 0.01240075, + "auxiliary_loss_mlp": 0.00195035, + "balance_loss_clip": 1.02184319, + "balance_loss_mlp": 0.17068025, + "epoch": 0.8816473771231024, + "flos": 32156604622080.0, + "grad_norm": 64.46199582649771, + "language_loss": 0.77097166, + "learning_rate": 1.450767798584489e-07, + "loss": 0.78532273, + "num_input_tokens_seen": 316151440, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.24365234, + "step": 14664, + "time_per_iteration": 2.7726173400878906 + }, + { + "auxiliary_loss_clip": 0.01221989, + "auxiliary_loss_mlp": 0.00212993, + "balance_loss_clip": 1.00846207, + "balance_loss_mlp": 0.18913949, + "epoch": 0.8817075003757703, + "flos": 19682279925120.0, + "grad_norm": 80.82836312143671, + "language_loss": 0.87654269, + "learning_rate": 1.449311881441828e-07, + "loss": 0.89089251, + "num_input_tokens_seen": 316170750, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.23876953, + "step": 14665, + "time_per_iteration": 2.610426902770996 + }, + { + "auxiliary_loss_clip": 0.01245069, + "auxiliary_loss_mlp": 0.00224831, + "balance_loss_clip": 1.02904272, + "balance_loss_mlp": 0.19811629, + "epoch": 0.8817676236284383, + "flos": 15668616251520.0, + "grad_norm": 14.000775480360062, + "language_loss": 0.71191186, + "learning_rate": 1.447856667743117e-07, + "loss": 0.7266109, + "num_input_tokens_seen": 316187265, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.26708984, + "step": 14666, + "time_per_iteration": 2.6398582458496094 + }, + { + "auxiliary_loss_clip": 0.0124949, + "auxiliary_loss_mlp": 0.00214043, + "balance_loss_clip": 1.02895355, + "balance_loss_mlp": 0.18770933, + "epoch": 0.8818277468811063, + "flos": 17895185539200.0, + "grad_norm": 4.151952001996691, + "language_loss": 0.92142582, + "learning_rate": 1.4464021575435403e-07, + "loss": 0.93606114, + "num_input_tokens_seen": 316206555, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.26342773, + "step": 14667, + "time_per_iteration": 2.6231319904327393 + }, + { + "auxiliary_loss_clip": 0.0124303, + "auxiliary_loss_mlp": 0.00223839, + "balance_loss_clip": 1.02239716, + "balance_loss_mlp": 0.19724298, + "epoch": 0.8818878701337742, + "flos": 18770508069120.0, + "grad_norm": 59.92313416709527, + "language_loss": 0.70968878, + "learning_rate": 1.4449483508982563e-07, + "loss": 0.72435749, + "num_input_tokens_seen": 316225210, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.265625, + "step": 14668, + "time_per_iteration": 4.080662250518799 + }, + { + "auxiliary_loss_clip": 0.01245054, + "auxiliary_loss_mlp": 0.00203339, + "balance_loss_clip": 1.02730906, + "balance_loss_mlp": 0.17950867, + "epoch": 0.8819479933864423, + "flos": 17712292464000.0, + "grad_norm": 262.2267646469065, + "language_loss": 0.68045986, + "learning_rate": 1.4434952478623918e-07, + "loss": 0.69494379, + "num_input_tokens_seen": 316242685, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.23828125, + "step": 14669, + "time_per_iteration": 2.657020330429077 + }, + { + "auxiliary_loss_clip": 0.01240416, + "auxiliary_loss_mlp": 0.00208692, + "balance_loss_clip": 1.02762687, + "balance_loss_mlp": 0.18425432, + "epoch": 0.8820081166391102, + "flos": 11728749070080.0, + "grad_norm": 111.58850295884521, + "language_loss": 0.81086338, + "learning_rate": 1.442042848491043e-07, + "loss": 0.82535446, + "num_input_tokens_seen": 316260935, + "router_z_loss_clip": 2.12988281, + "router_z_loss_mlp": 0.24438477, + "step": 14670, + "time_per_iteration": 2.6029529571533203 + }, + { + "auxiliary_loss_clip": 0.01237135, + "auxiliary_loss_mlp": 0.00212812, + "balance_loss_clip": 1.01499295, + "balance_loss_mlp": 0.18691954, + "epoch": 0.8820682398917782, + "flos": 27490372611840.0, + "grad_norm": 4.770168390484394, + "language_loss": 0.8418957, + "learning_rate": 1.44059115283929e-07, + "loss": 0.85639513, + "num_input_tokens_seen": 316281190, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.25891113, + "step": 14671, + "time_per_iteration": 2.719602584838867 + }, + { + "auxiliary_loss_clip": 0.01246653, + "auxiliary_loss_mlp": 0.00215362, + "balance_loss_clip": 1.02510667, + "balance_loss_mlp": 0.1897081, + "epoch": 0.8821283631444461, + "flos": 16873850223360.0, + "grad_norm": 24.339113006566468, + "language_loss": 0.94037211, + "learning_rate": 1.43914016096218e-07, + "loss": 0.95499223, + "num_input_tokens_seen": 316297115, + "router_z_loss_clip": 2.21386719, + "router_z_loss_mlp": 0.25646973, + "step": 14672, + "time_per_iteration": 4.064741611480713 + }, + { + "auxiliary_loss_clip": 0.01224112, + "auxiliary_loss_mlp": 0.00189148, + "balance_loss_clip": 1.01219428, + "balance_loss_mlp": 0.16504414, + "epoch": 0.8821884863971141, + "flos": 24280964409600.0, + "grad_norm": 7.5916824220498444, + "language_loss": 0.81336719, + "learning_rate": 1.4376898729147336e-07, + "loss": 0.82749975, + "num_input_tokens_seen": 316318235, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.2409668, + "step": 14673, + "time_per_iteration": 2.6906182765960693 + }, + { + "auxiliary_loss_clip": 0.0112683, + "auxiliary_loss_mlp": 0.00163249, + "balance_loss_clip": 0.98742968, + "balance_loss_mlp": 0.15352184, + "epoch": 0.882248609649782, + "flos": 59432342492160.0, + "grad_norm": 0.8740379852358127, + "language_loss": 0.48230854, + "learning_rate": 1.4362402887519487e-07, + "loss": 0.49520937, + "num_input_tokens_seen": 316384705, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.09716797, + "step": 14674, + "time_per_iteration": 3.2636678218841553 + }, + { + "auxiliary_loss_clip": 0.01241598, + "auxiliary_loss_mlp": 0.00200932, + "balance_loss_clip": 1.02035117, + "balance_loss_mlp": 0.17512369, + "epoch": 0.88230873290245, + "flos": 19937784343680.0, + "grad_norm": 7.298368459313634, + "language_loss": 0.86231601, + "learning_rate": 1.4347914085287971e-07, + "loss": 0.87674129, + "num_input_tokens_seen": 316401165, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.2578125, + "step": 14675, + "time_per_iteration": 2.624028205871582 + }, + { + "auxiliary_loss_clip": 0.01239347, + "auxiliary_loss_mlp": 0.00201087, + "balance_loss_clip": 1.02032685, + "balance_loss_mlp": 0.17577839, + "epoch": 0.882368856155118, + "flos": 16362769559040.0, + "grad_norm": 12.504372927636396, + "language_loss": 0.88308454, + "learning_rate": 1.4333432323002105e-07, + "loss": 0.89748889, + "num_input_tokens_seen": 316418780, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.2532959, + "step": 14676, + "time_per_iteration": 2.6801092624664307 + }, + { + "auxiliary_loss_clip": 0.01135198, + "auxiliary_loss_mlp": 0.00170786, + "balance_loss_clip": 0.99346673, + "balance_loss_mlp": 0.16158313, + "epoch": 0.882428979407786, + "flos": 70594563277440.0, + "grad_norm": 0.6748812105169887, + "language_loss": 0.54113507, + "learning_rate": 1.431895760121109e-07, + "loss": 0.55419493, + "num_input_tokens_seen": 316482030, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.09179688, + "step": 14677, + "time_per_iteration": 3.228390693664551 + }, + { + "auxiliary_loss_clip": 0.01238542, + "auxiliary_loss_mlp": 0.00201575, + "balance_loss_clip": 1.02031446, + "balance_loss_mlp": 0.17724393, + "epoch": 0.8824891026604539, + "flos": 18150294908160.0, + "grad_norm": 12.425328324724813, + "language_loss": 0.80194914, + "learning_rate": 1.4304489920463847e-07, + "loss": 0.81635034, + "num_input_tokens_seen": 316499175, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.24316406, + "step": 14678, + "time_per_iteration": 2.6402156352996826 + }, + { + "auxiliary_loss_clip": 0.01250859, + "auxiliary_loss_mlp": 0.00205493, + "balance_loss_clip": 1.02816927, + "balance_loss_mlp": 0.17964804, + "epoch": 0.8825492259131219, + "flos": 27232713377280.0, + "grad_norm": 19.073233712294453, + "language_loss": 0.7931664, + "learning_rate": 1.4290029281308936e-07, + "loss": 0.80772996, + "num_input_tokens_seen": 316519495, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.25842285, + "step": 14679, + "time_per_iteration": 2.716090202331543 + }, + { + "auxiliary_loss_clip": 0.01226431, + "auxiliary_loss_mlp": 0.00198438, + "balance_loss_clip": 1.01674962, + "balance_loss_mlp": 0.17471512, + "epoch": 0.8826093491657898, + "flos": 22274419881600.0, + "grad_norm": 9.51942845860056, + "language_loss": 0.70791435, + "learning_rate": 1.4275575684294694e-07, + "loss": 0.72216296, + "num_input_tokens_seen": 316538180, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.23742676, + "step": 14680, + "time_per_iteration": 2.670997142791748 + }, + { + "auxiliary_loss_clip": 0.0124433, + "auxiliary_loss_mlp": 0.00202585, + "balance_loss_clip": 1.02935529, + "balance_loss_mlp": 0.1778726, + "epoch": 0.8826694724184578, + "flos": 14204753377920.0, + "grad_norm": 74.40897921032519, + "language_loss": 0.85891134, + "learning_rate": 1.4261129129969328e-07, + "loss": 0.87338054, + "num_input_tokens_seen": 316551750, + "router_z_loss_clip": 2.15136719, + "router_z_loss_mlp": 0.24731445, + "step": 14681, + "time_per_iteration": 2.6426925659179688 + }, + { + "auxiliary_loss_clip": 0.0124049, + "auxiliary_loss_mlp": 0.00224348, + "balance_loss_clip": 1.02511823, + "balance_loss_mlp": 0.19932608, + "epoch": 0.8827295956711259, + "flos": 20631686256000.0, + "grad_norm": 103.90839054202644, + "language_loss": 0.80049372, + "learning_rate": 1.424668961888047e-07, + "loss": 0.81514204, + "num_input_tokens_seen": 316570680, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.25024414, + "step": 14682, + "time_per_iteration": 2.6768016815185547 + }, + { + "auxiliary_loss_clip": 0.01258845, + "auxiliary_loss_mlp": 0.00233551, + "balance_loss_clip": 1.03264713, + "balance_loss_mlp": 0.20745608, + "epoch": 0.8827897189237938, + "flos": 18513064316160.0, + "grad_norm": 45.87329545389999, + "language_loss": 0.83762318, + "learning_rate": 1.4232257151575765e-07, + "loss": 0.85254717, + "num_input_tokens_seen": 316588635, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.26074219, + "step": 14683, + "time_per_iteration": 2.715405225753784 + }, + { + "auxiliary_loss_clip": 0.01250298, + "auxiliary_loss_mlp": 0.00208005, + "balance_loss_clip": 1.03025651, + "balance_loss_mlp": 0.18408018, + "epoch": 0.8828498421764618, + "flos": 22747399194240.0, + "grad_norm": 8.853996792302022, + "language_loss": 0.74168873, + "learning_rate": 1.4217831728602492e-07, + "loss": 0.75627172, + "num_input_tokens_seen": 316607550, + "router_z_loss_clip": 2.19824219, + "router_z_loss_mlp": 0.23937988, + "step": 14684, + "time_per_iteration": 2.627473831176758 + }, + { + "auxiliary_loss_clip": 0.01226993, + "auxiliary_loss_mlp": 0.00199811, + "balance_loss_clip": 1.00826311, + "balance_loss_mlp": 0.17524171, + "epoch": 0.8829099654291297, + "flos": 15012384727680.0, + "grad_norm": 16.69880685740119, + "language_loss": 0.80787444, + "learning_rate": 1.4203413350507677e-07, + "loss": 0.82214248, + "num_input_tokens_seen": 316624460, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.24609375, + "step": 14685, + "time_per_iteration": 2.672715902328491 + }, + { + "auxiliary_loss_clip": 0.0126385, + "auxiliary_loss_mlp": 0.00221116, + "balance_loss_clip": 1.03707147, + "balance_loss_mlp": 0.19435333, + "epoch": 0.8829700886817977, + "flos": 16720546976640.0, + "grad_norm": 27.777383588340935, + "language_loss": 0.81507003, + "learning_rate": 1.418900201783806e-07, + "loss": 0.82991976, + "num_input_tokens_seen": 316640765, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.26745605, + "step": 14686, + "time_per_iteration": 2.591649293899536 + }, + { + "auxiliary_loss_clip": 0.01240278, + "auxiliary_loss_mlp": 0.00206779, + "balance_loss_clip": 1.02564323, + "balance_loss_mlp": 0.18379524, + "epoch": 0.8830302119344656, + "flos": 15263256291840.0, + "grad_norm": 9.90285958477027, + "language_loss": 0.71533227, + "learning_rate": 1.417459773114007e-07, + "loss": 0.72980285, + "num_input_tokens_seen": 316656120, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.22998047, + "step": 14687, + "time_per_iteration": 2.696958065032959 + }, + { + "auxiliary_loss_clip": 0.01238212, + "auxiliary_loss_mlp": 0.00227476, + "balance_loss_clip": 1.0217669, + "balance_loss_mlp": 0.20307374, + "epoch": 0.8830903351871336, + "flos": 28617751854720.0, + "grad_norm": 21.475176425463243, + "language_loss": 0.760934, + "learning_rate": 1.4160200490959984e-07, + "loss": 0.7755909, + "num_input_tokens_seen": 316676095, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.24389648, + "step": 14688, + "time_per_iteration": 2.702655076980591 + }, + { + "auxiliary_loss_clip": 0.012356, + "auxiliary_loss_mlp": 0.00195222, + "balance_loss_clip": 1.02405095, + "balance_loss_mlp": 0.17291823, + "epoch": 0.8831504584398016, + "flos": 28001632844160.0, + "grad_norm": 697.4893449547667, + "language_loss": 0.73646438, + "learning_rate": 1.4145810297843697e-07, + "loss": 0.7507726, + "num_input_tokens_seen": 316696235, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.22302246, + "step": 14689, + "time_per_iteration": 2.7074341773986816 + }, + { + "auxiliary_loss_clip": 0.01232734, + "auxiliary_loss_mlp": 0.00203823, + "balance_loss_clip": 1.02487648, + "balance_loss_mlp": 0.18104178, + "epoch": 0.8832105816924696, + "flos": 26579642250240.0, + "grad_norm": 4.3972042108469696, + "language_loss": 0.78604245, + "learning_rate": 1.4131427152336905e-07, + "loss": 0.80040801, + "num_input_tokens_seen": 316719680, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.2277832, + "step": 14690, + "time_per_iteration": 2.7105960845947266 + }, + { + "auxiliary_loss_clip": 0.01251003, + "auxiliary_loss_mlp": 0.00207252, + "balance_loss_clip": 1.03268993, + "balance_loss_mlp": 0.1839347, + "epoch": 0.8832707049451375, + "flos": 24898771359360.0, + "grad_norm": 59.277559105340515, + "language_loss": 0.80029374, + "learning_rate": 1.4117051054985018e-07, + "loss": 0.81487632, + "num_input_tokens_seen": 316739830, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.2331543, + "step": 14691, + "time_per_iteration": 2.669515371322632 + }, + { + "auxiliary_loss_clip": 0.01279709, + "auxiliary_loss_mlp": 0.00221412, + "balance_loss_clip": 1.04881454, + "balance_loss_mlp": 0.19451821, + "epoch": 0.8833308281978055, + "flos": 15451141357440.0, + "grad_norm": 22.279993737572884, + "language_loss": 0.60980463, + "learning_rate": 1.4102682006333243e-07, + "loss": 0.62481594, + "num_input_tokens_seen": 316758105, + "router_z_loss_clip": 2.31054688, + "router_z_loss_mlp": 0.26940918, + "step": 14692, + "time_per_iteration": 2.6290252208709717 + }, + { + "auxiliary_loss_clip": 0.01267491, + "auxiliary_loss_mlp": 0.00225231, + "balance_loss_clip": 1.03846383, + "balance_loss_mlp": 0.20037603, + "epoch": 0.8833909514504734, + "flos": 20301523418880.0, + "grad_norm": 33.269456652717686, + "language_loss": 0.7083593, + "learning_rate": 1.4088320006926346e-07, + "loss": 0.72328651, + "num_input_tokens_seen": 316777455, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.24853516, + "step": 14693, + "time_per_iteration": 2.608477830886841 + }, + { + "auxiliary_loss_clip": 0.01232883, + "auxiliary_loss_mlp": 0.00194671, + "balance_loss_clip": 1.02394485, + "balance_loss_mlp": 0.17211697, + "epoch": 0.8834510747031414, + "flos": 20374027021440.0, + "grad_norm": 6.924358597653154, + "language_loss": 0.81907088, + "learning_rate": 1.407396505730898e-07, + "loss": 0.83334643, + "num_input_tokens_seen": 316796300, + "router_z_loss_clip": 2.08984375, + "router_z_loss_mlp": 0.22570801, + "step": 14694, + "time_per_iteration": 2.631570339202881 + }, + { + "auxiliary_loss_clip": 0.01250181, + "auxiliary_loss_mlp": 0.00215469, + "balance_loss_clip": 1.02725613, + "balance_loss_mlp": 0.19166242, + "epoch": 0.8835111979558095, + "flos": 29752026508800.0, + "grad_norm": 79.62749844412413, + "language_loss": 0.82377708, + "learning_rate": 1.4059617158025527e-07, + "loss": 0.8384335, + "num_input_tokens_seen": 316819090, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.23840332, + "step": 14695, + "time_per_iteration": 2.7290844917297363 + }, + { + "auxiliary_loss_clip": 0.01241772, + "auxiliary_loss_mlp": 0.00200135, + "balance_loss_clip": 1.02973473, + "balance_loss_mlp": 0.17675805, + "epoch": 0.8835713212084774, + "flos": 24134556574080.0, + "grad_norm": 8.86482051632961, + "language_loss": 0.87528312, + "learning_rate": 1.404527630961998e-07, + "loss": 0.8897022, + "num_input_tokens_seen": 316839250, + "router_z_loss_clip": 2.12207031, + "router_z_loss_mlp": 0.23388672, + "step": 14696, + "time_per_iteration": 2.6965370178222656 + }, + { + "auxiliary_loss_clip": 0.01252208, + "auxiliary_loss_mlp": 0.00218437, + "balance_loss_clip": 1.03018069, + "balance_loss_mlp": 0.19314043, + "epoch": 0.8836314444611454, + "flos": 27672331933440.0, + "grad_norm": 1.7616264259153829, + "language_loss": 0.82806128, + "learning_rate": 1.4030942512636236e-07, + "loss": 0.84276772, + "num_input_tokens_seen": 316861315, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.25280762, + "step": 14697, + "time_per_iteration": 2.732086658477783 + }, + { + "auxiliary_loss_clip": 0.01235921, + "auxiliary_loss_mlp": 0.00227707, + "balance_loss_clip": 1.02160466, + "balance_loss_mlp": 0.20386487, + "epoch": 0.8836915677138133, + "flos": 16836969934080.0, + "grad_norm": 223.8172143185505, + "language_loss": 0.78797674, + "learning_rate": 1.401661576761779e-07, + "loss": 0.80261302, + "num_input_tokens_seen": 316879325, + "router_z_loss_clip": 2.14355469, + "router_z_loss_mlp": 0.23864746, + "step": 14698, + "time_per_iteration": 2.7728383541107178 + }, + { + "auxiliary_loss_clip": 0.01141814, + "auxiliary_loss_mlp": 0.00113995, + "balance_loss_clip": 1.00084865, + "balance_loss_mlp": 0.10684266, + "epoch": 0.8837516909664813, + "flos": 69310540823040.0, + "grad_norm": 0.7637945995492635, + "language_loss": 0.53091764, + "learning_rate": 1.4002296075107856e-07, + "loss": 0.54347575, + "num_input_tokens_seen": 316936425, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.07128906, + "step": 14699, + "time_per_iteration": 3.1652109622955322 + }, + { + "auxiliary_loss_clip": 0.0123698, + "auxiliary_loss_mlp": 0.00205419, + "balance_loss_clip": 1.01772213, + "balance_loss_mlp": 0.1811479, + "epoch": 0.8838118142191492, + "flos": 21324726241920.0, + "grad_norm": 3.643324967016273, + "language_loss": 0.86915839, + "learning_rate": 1.3987983435649508e-07, + "loss": 0.88358229, + "num_input_tokens_seen": 316956360, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.24267578, + "step": 14700, + "time_per_iteration": 2.6609997749328613 + }, + { + "auxiliary_loss_clip": 0.01235206, + "auxiliary_loss_mlp": 0.00197099, + "balance_loss_clip": 1.02005672, + "balance_loss_mlp": 0.17409131, + "epoch": 0.8838719374718172, + "flos": 21470559459840.0, + "grad_norm": 14.579911230540528, + "language_loss": 0.81771207, + "learning_rate": 1.3973677849785494e-07, + "loss": 0.83203506, + "num_input_tokens_seen": 316975295, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.23022461, + "step": 14701, + "time_per_iteration": 2.6913814544677734 + }, + { + "auxiliary_loss_clip": 0.01266466, + "auxiliary_loss_mlp": 0.00238951, + "balance_loss_clip": 1.03674245, + "balance_loss_mlp": 0.21167552, + "epoch": 0.8839320607244852, + "flos": 26468929555200.0, + "grad_norm": 10.853367756405776, + "language_loss": 0.7967189, + "learning_rate": 1.3959379318058262e-07, + "loss": 0.81177306, + "num_input_tokens_seen": 316994520, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.27294922, + "step": 14702, + "time_per_iteration": 2.7025229930877686 + }, + { + "auxiliary_loss_clip": 0.01260478, + "auxiliary_loss_mlp": 0.00235837, + "balance_loss_clip": 1.0404197, + "balance_loss_mlp": 0.21174484, + "epoch": 0.8839921839771532, + "flos": 45222270923520.0, + "grad_norm": 547.140188799923, + "language_loss": 0.78151608, + "learning_rate": 1.3945087841010006e-07, + "loss": 0.79647923, + "num_input_tokens_seen": 317018095, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.24121094, + "step": 14703, + "time_per_iteration": 4.360274314880371 + }, + { + "auxiliary_loss_clip": 0.01220636, + "auxiliary_loss_mlp": 0.0020428, + "balance_loss_clip": 1.01174474, + "balance_loss_mlp": 0.18148738, + "epoch": 0.8840523072298211, + "flos": 20006876154240.0, + "grad_norm": 16.124461153745127, + "language_loss": 0.74086428, + "learning_rate": 1.3930803419182645e-07, + "loss": 0.75511348, + "num_input_tokens_seen": 317035755, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.22790527, + "step": 14704, + "time_per_iteration": 4.118948698043823 + }, + { + "auxiliary_loss_clip": 0.01228666, + "auxiliary_loss_mlp": 0.00202132, + "balance_loss_clip": 1.01865542, + "balance_loss_mlp": 0.18012556, + "epoch": 0.8841124304824891, + "flos": 24426007528320.0, + "grad_norm": 59.81905263893535, + "language_loss": 0.78129488, + "learning_rate": 1.3916526053117905e-07, + "loss": 0.79560292, + "num_input_tokens_seen": 317055765, + "router_z_loss_clip": 2.09863281, + "router_z_loss_mlp": 0.22009277, + "step": 14705, + "time_per_iteration": 2.7672982215881348 + }, + { + "auxiliary_loss_clip": 0.01224239, + "auxiliary_loss_mlp": 0.00190829, + "balance_loss_clip": 1.01501322, + "balance_loss_mlp": 0.16794077, + "epoch": 0.884172553735157, + "flos": 31284622056960.0, + "grad_norm": 33.00697817258948, + "language_loss": 0.77858293, + "learning_rate": 1.3902255743357104e-07, + "loss": 0.79273367, + "num_input_tokens_seen": 317077955, + "router_z_loss_clip": 2.09082031, + "router_z_loss_mlp": 0.22900391, + "step": 14706, + "time_per_iteration": 2.7416415214538574 + }, + { + "auxiliary_loss_clip": 0.0124017, + "auxiliary_loss_mlp": 0.00191915, + "balance_loss_clip": 1.02462041, + "balance_loss_mlp": 0.16858621, + "epoch": 0.884232676987825, + "flos": 21391160446080.0, + "grad_norm": 19.78912078166092, + "language_loss": 0.8179155, + "learning_rate": 1.3887992490441413e-07, + "loss": 0.83223635, + "num_input_tokens_seen": 317095825, + "router_z_loss_clip": 2.15332031, + "router_z_loss_mlp": 0.23327637, + "step": 14707, + "time_per_iteration": 2.6320641040802 + }, + { + "auxiliary_loss_clip": 0.01141808, + "auxiliary_loss_mlp": 0.00151849, + "balance_loss_clip": 1.00118279, + "balance_loss_mlp": 0.14317098, + "epoch": 0.8842928002404931, + "flos": 57911451799680.0, + "grad_norm": 0.7882163729747542, + "language_loss": 0.59675199, + "learning_rate": 1.387373629491173e-07, + "loss": 0.60968858, + "num_input_tokens_seen": 317152875, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.08691406, + "step": 14708, + "time_per_iteration": 3.0161643028259277 + }, + { + "auxiliary_loss_clip": 0.01204058, + "auxiliary_loss_mlp": 0.00211459, + "balance_loss_clip": 1.00029969, + "balance_loss_mlp": 0.18805826, + "epoch": 0.884352923493161, + "flos": 41463896186880.0, + "grad_norm": 21.255959003701594, + "language_loss": 0.72973096, + "learning_rate": 1.3859487157308625e-07, + "loss": 0.74388611, + "num_input_tokens_seen": 317176725, + "router_z_loss_clip": 2.0390625, + "router_z_loss_mlp": 0.23400879, + "step": 14709, + "time_per_iteration": 2.8503310680389404 + }, + { + "auxiliary_loss_clip": 0.01252168, + "auxiliary_loss_mlp": 0.0023239, + "balance_loss_clip": 1.0300523, + "balance_loss_mlp": 0.20696232, + "epoch": 0.884413046745829, + "flos": 46541234332800.0, + "grad_norm": 2.5397855808006473, + "language_loss": 0.69634676, + "learning_rate": 1.3845245078172373e-07, + "loss": 0.71119225, + "num_input_tokens_seen": 317206880, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.25427246, + "step": 14710, + "time_per_iteration": 4.3776774406433105 + }, + { + "auxiliary_loss_clip": 0.01238829, + "auxiliary_loss_mlp": 0.00193162, + "balance_loss_clip": 1.02528369, + "balance_loss_mlp": 0.17027363, + "epoch": 0.8844731699984969, + "flos": 19135324552320.0, + "grad_norm": 72.6991784042285, + "language_loss": 0.72249603, + "learning_rate": 1.38310100580431e-07, + "loss": 0.73681587, + "num_input_tokens_seen": 317224135, + "router_z_loss_clip": 2.13574219, + "router_z_loss_mlp": 0.22900391, + "step": 14711, + "time_per_iteration": 2.6472487449645996 + }, + { + "auxiliary_loss_clip": 0.01258108, + "auxiliary_loss_mlp": 0.00234661, + "balance_loss_clip": 1.03471732, + "balance_loss_mlp": 0.20812455, + "epoch": 0.8845332932511649, + "flos": 23260634674560.0, + "grad_norm": 21.278099370276966, + "language_loss": 0.83436543, + "learning_rate": 1.38167820974606e-07, + "loss": 0.84929311, + "num_input_tokens_seen": 317244505, + "router_z_loss_clip": 2.23535156, + "router_z_loss_mlp": 0.26550293, + "step": 14712, + "time_per_iteration": 2.6864051818847656 + }, + { + "auxiliary_loss_clip": 0.01254546, + "auxiliary_loss_mlp": 0.00211633, + "balance_loss_clip": 1.03211689, + "balance_loss_mlp": 0.18659942, + "epoch": 0.8845934165038328, + "flos": 17564591738880.0, + "grad_norm": 16.57193039971631, + "language_loss": 0.91806209, + "learning_rate": 1.3802561196964368e-07, + "loss": 0.93272388, + "num_input_tokens_seen": 317257830, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.25036621, + "step": 14713, + "time_per_iteration": 2.6288020610809326 + }, + { + "auxiliary_loss_clip": 0.01219512, + "auxiliary_loss_mlp": 0.00190797, + "balance_loss_clip": 1.00917578, + "balance_loss_mlp": 0.16823025, + "epoch": 0.8846535397565009, + "flos": 27485739757440.0, + "grad_norm": 4.30061428507834, + "language_loss": 0.63034725, + "learning_rate": 1.3788347357093688e-07, + "loss": 0.64445037, + "num_input_tokens_seen": 317278430, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.22570801, + "step": 14714, + "time_per_iteration": 2.7148754596710205 + }, + { + "auxiliary_loss_clip": 0.01228403, + "auxiliary_loss_mlp": 0.0020987, + "balance_loss_clip": 1.01456141, + "balance_loss_mlp": 0.18460976, + "epoch": 0.8847136630091688, + "flos": 28761430256640.0, + "grad_norm": 594.9196128781076, + "language_loss": 0.80749762, + "learning_rate": 1.377414057838755e-07, + "loss": 0.82188034, + "num_input_tokens_seen": 317295970, + "router_z_loss_clip": 2.14160156, + "router_z_loss_mlp": 0.25231934, + "step": 14715, + "time_per_iteration": 4.145519018173218 + }, + { + "auxiliary_loss_clip": 0.01239681, + "auxiliary_loss_mlp": 0.00225019, + "balance_loss_clip": 1.02515769, + "balance_loss_mlp": 0.20072442, + "epoch": 0.8847737862618368, + "flos": 23476924419840.0, + "grad_norm": 260.8045641395967, + "language_loss": 0.81236911, + "learning_rate": 1.375994086138461e-07, + "loss": 0.82701606, + "num_input_tokens_seen": 317316185, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.24279785, + "step": 14716, + "time_per_iteration": 2.713517189025879 + }, + { + "auxiliary_loss_clip": 0.01250119, + "auxiliary_loss_mlp": 0.00217161, + "balance_loss_clip": 1.03137553, + "balance_loss_mlp": 0.19161427, + "epoch": 0.8848339095145047, + "flos": 18660872782080.0, + "grad_norm": 241.98286960747376, + "language_loss": 0.79654944, + "learning_rate": 1.3745748206623397e-07, + "loss": 0.81122231, + "num_input_tokens_seen": 317333275, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.25524902, + "step": 14717, + "time_per_iteration": 2.6731925010681152 + }, + { + "auxiliary_loss_clip": 0.01224575, + "auxiliary_loss_mlp": 0.00183498, + "balance_loss_clip": 1.01904988, + "balance_loss_mlp": 0.1611703, + "epoch": 0.8848940327671727, + "flos": 32270298145920.0, + "grad_norm": 70.04279209807329, + "language_loss": 0.79754728, + "learning_rate": 1.373156261464208e-07, + "loss": 0.81162798, + "num_input_tokens_seen": 317351245, + "router_z_loss_clip": 2.0546875, + "router_z_loss_mlp": 0.22338867, + "step": 14718, + "time_per_iteration": 2.725996494293213 + }, + { + "auxiliary_loss_clip": 0.01245275, + "auxiliary_loss_mlp": 0.00190232, + "balance_loss_clip": 1.02174675, + "balance_loss_mlp": 0.16392267, + "epoch": 0.8849541560198406, + "flos": 24021832717440.0, + "grad_norm": 24.197588879023172, + "language_loss": 0.85692811, + "learning_rate": 1.3717384085978602e-07, + "loss": 0.87128317, + "num_input_tokens_seen": 317370740, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.26318359, + "step": 14719, + "time_per_iteration": 2.718214988708496 + }, + { + "auxiliary_loss_clip": 0.01246566, + "auxiliary_loss_mlp": 0.00211736, + "balance_loss_clip": 1.02533436, + "balance_loss_mlp": 0.18821582, + "epoch": 0.8850142792725086, + "flos": 16873060124160.0, + "grad_norm": 11.496224948543954, + "language_loss": 0.78372121, + "learning_rate": 1.3703212621170579e-07, + "loss": 0.7983042, + "num_input_tokens_seen": 317388370, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.23522949, + "step": 14720, + "time_per_iteration": 2.6510961055755615 + }, + { + "auxiliary_loss_clip": 0.01250763, + "auxiliary_loss_mlp": 0.00222327, + "balance_loss_clip": 1.03087139, + "balance_loss_mlp": 0.19558857, + "epoch": 0.8850744025251767, + "flos": 24024059360640.0, + "grad_norm": 112.12904758839495, + "language_loss": 0.91653514, + "learning_rate": 1.3689048220755383e-07, + "loss": 0.93126607, + "num_input_tokens_seen": 317407390, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.26757812, + "step": 14721, + "time_per_iteration": 2.6652679443359375 + }, + { + "auxiliary_loss_clip": 0.01258796, + "auxiliary_loss_mlp": 0.00207002, + "balance_loss_clip": 1.03336728, + "balance_loss_mlp": 0.17839147, + "epoch": 0.8851345257778446, + "flos": 47955575329920.0, + "grad_norm": 8.63124123148216, + "language_loss": 0.71990705, + "learning_rate": 1.3674890885270186e-07, + "loss": 0.73456502, + "num_input_tokens_seen": 317430825, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.2857666, + "step": 14722, + "time_per_iteration": 2.8640894889831543 + }, + { + "auxiliary_loss_clip": 0.01240439, + "auxiliary_loss_mlp": 0.00182203, + "balance_loss_clip": 1.02525413, + "balance_loss_mlp": 0.15782472, + "epoch": 0.8851946490305126, + "flos": 36611000173440.0, + "grad_norm": 24.651356149789887, + "language_loss": 0.78260469, + "learning_rate": 1.3660740615251754e-07, + "loss": 0.79683113, + "num_input_tokens_seen": 317451905, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.24389648, + "step": 14723, + "time_per_iteration": 2.8277499675750732 + }, + { + "auxiliary_loss_clip": 0.01233335, + "auxiliary_loss_mlp": 0.00201011, + "balance_loss_clip": 1.02071774, + "balance_loss_mlp": 0.17795607, + "epoch": 0.8852547722831805, + "flos": 21544248211200.0, + "grad_norm": 2.5383805255559153, + "language_loss": 0.86220711, + "learning_rate": 1.3646597411236703e-07, + "loss": 0.87655056, + "num_input_tokens_seen": 317470030, + "router_z_loss_clip": 2.12988281, + "router_z_loss_mlp": 0.23059082, + "step": 14724, + "time_per_iteration": 2.646821975708008 + }, + { + "auxiliary_loss_clip": 0.01136657, + "auxiliary_loss_mlp": 0.00180114, + "balance_loss_clip": 0.99567902, + "balance_loss_mlp": 0.16995783, + "epoch": 0.8853148955358485, + "flos": 63059246472960.0, + "grad_norm": 0.7909354004589548, + "language_loss": 0.58287889, + "learning_rate": 1.363246127376143e-07, + "loss": 0.59604657, + "num_input_tokens_seen": 317527460, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.1015625, + "step": 14725, + "time_per_iteration": 3.0173439979553223 + }, + { + "auxiliary_loss_clip": 0.01269636, + "auxiliary_loss_mlp": 0.00235458, + "balance_loss_clip": 1.03883874, + "balance_loss_mlp": 0.2071097, + "epoch": 0.8853750187885164, + "flos": 18149828031360.0, + "grad_norm": 33.07999910585455, + "language_loss": 0.80525249, + "learning_rate": 1.3618332203361837e-07, + "loss": 0.82030344, + "num_input_tokens_seen": 317544070, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.28356934, + "step": 14726, + "time_per_iteration": 2.675887107849121 + }, + { + "auxiliary_loss_clip": 0.01240907, + "auxiliary_loss_mlp": 0.00207792, + "balance_loss_clip": 1.02463865, + "balance_loss_mlp": 0.18230531, + "epoch": 0.8854351420411845, + "flos": 39570542392320.0, + "grad_norm": 338.1914952440052, + "language_loss": 0.74775285, + "learning_rate": 1.3604210200573785e-07, + "loss": 0.76223981, + "num_input_tokens_seen": 317570275, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.25524902, + "step": 14727, + "time_per_iteration": 2.8345322608947754 + }, + { + "auxiliary_loss_clip": 0.0123739, + "auxiliary_loss_mlp": 0.00199687, + "balance_loss_clip": 1.02717912, + "balance_loss_mlp": 0.17861083, + "epoch": 0.8854952652938524, + "flos": 23769309127680.0, + "grad_norm": 15.928599101948274, + "language_loss": 0.78632843, + "learning_rate": 1.3590095265932733e-07, + "loss": 0.80069917, + "num_input_tokens_seen": 317590160, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.21069336, + "step": 14728, + "time_per_iteration": 2.668821096420288 + }, + { + "auxiliary_loss_clip": 0.0123231, + "auxiliary_loss_mlp": 0.00199257, + "balance_loss_clip": 1.0167222, + "balance_loss_mlp": 0.17527181, + "epoch": 0.8855553885465204, + "flos": 18290310122880.0, + "grad_norm": 5.508606094082406, + "language_loss": 0.77391303, + "learning_rate": 1.3575987399973987e-07, + "loss": 0.78822875, + "num_input_tokens_seen": 317608340, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.2401123, + "step": 14729, + "time_per_iteration": 2.606114387512207 + }, + { + "auxiliary_loss_clip": 0.01233458, + "auxiliary_loss_mlp": 0.00190115, + "balance_loss_clip": 1.02094615, + "balance_loss_mlp": 0.16778669, + "epoch": 0.8856155117991883, + "flos": 36867402432000.0, + "grad_norm": 6.012381092998263, + "language_loss": 0.71389079, + "learning_rate": 1.3561886603232453e-07, + "loss": 0.72812647, + "num_input_tokens_seen": 317629910, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.22338867, + "step": 14730, + "time_per_iteration": 2.7688257694244385 + }, + { + "auxiliary_loss_clip": 0.01231604, + "auxiliary_loss_mlp": 0.00197299, + "balance_loss_clip": 1.02381968, + "balance_loss_mlp": 0.17506675, + "epoch": 0.8856756350518563, + "flos": 22163886754560.0, + "grad_norm": 473.99953936545285, + "language_loss": 0.85778463, + "learning_rate": 1.3547792876242904e-07, + "loss": 0.87207365, + "num_input_tokens_seen": 317650265, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.22229004, + "step": 14731, + "time_per_iteration": 2.6504909992218018 + }, + { + "auxiliary_loss_clip": 0.0124342, + "auxiliary_loss_mlp": 0.00217554, + "balance_loss_clip": 1.02406108, + "balance_loss_mlp": 0.191495, + "epoch": 0.8857357583045242, + "flos": 20740962407040.0, + "grad_norm": 82.3850920338073, + "language_loss": 0.91579926, + "learning_rate": 1.3533706219539708e-07, + "loss": 0.93040907, + "num_input_tokens_seen": 317669045, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.26037598, + "step": 14732, + "time_per_iteration": 2.6966965198516846 + }, + { + "auxiliary_loss_clip": 0.01138767, + "auxiliary_loss_mlp": 0.00184638, + "balance_loss_clip": 0.99782854, + "balance_loss_mlp": 0.17481494, + "epoch": 0.8857958815571922, + "flos": 69892329409920.0, + "grad_norm": 0.9239748013275888, + "language_loss": 0.58870566, + "learning_rate": 1.3519626633657045e-07, + "loss": 0.60193968, + "num_input_tokens_seen": 317728065, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.09814453, + "step": 14733, + "time_per_iteration": 3.1644692420959473 + }, + { + "auxiliary_loss_clip": 0.01236603, + "auxiliary_loss_mlp": 0.00197861, + "balance_loss_clip": 1.02468729, + "balance_loss_mlp": 0.17373255, + "epoch": 0.8858560048098603, + "flos": 15121948187520.0, + "grad_norm": 6.236357589746058, + "language_loss": 0.76392972, + "learning_rate": 1.3505554119128838e-07, + "loss": 0.77827442, + "num_input_tokens_seen": 317746120, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.2409668, + "step": 14734, + "time_per_iteration": 2.7121474742889404 + }, + { + "auxiliary_loss_clip": 0.01237073, + "auxiliary_loss_mlp": 0.00196984, + "balance_loss_clip": 1.02955818, + "balance_loss_mlp": 0.17506097, + "epoch": 0.8859161280625282, + "flos": 16611019430400.0, + "grad_norm": 44.88065343617451, + "language_loss": 0.82479, + "learning_rate": 1.3491488676488682e-07, + "loss": 0.83913052, + "num_input_tokens_seen": 317762280, + "router_z_loss_clip": 2.07421875, + "router_z_loss_mlp": 0.21923828, + "step": 14735, + "time_per_iteration": 2.6151435375213623 + }, + { + "auxiliary_loss_clip": 0.01242018, + "auxiliary_loss_mlp": 0.00213227, + "balance_loss_clip": 1.02467108, + "balance_loss_mlp": 0.18775165, + "epoch": 0.8859762513151962, + "flos": 18694484933760.0, + "grad_norm": 7.157788703512122, + "language_loss": 0.79107082, + "learning_rate": 1.3477430306270066e-07, + "loss": 0.80562329, + "num_input_tokens_seen": 317780615, + "router_z_loss_clip": 2.17285156, + "router_z_loss_mlp": 0.25463867, + "step": 14736, + "time_per_iteration": 2.714839458465576 + }, + { + "auxiliary_loss_clip": 0.01236339, + "auxiliary_loss_mlp": 0.00201176, + "balance_loss_clip": 1.02517939, + "balance_loss_mlp": 0.17772718, + "epoch": 0.8860363745678641, + "flos": 19536877670400.0, + "grad_norm": 256.4452416786309, + "language_loss": 0.92459756, + "learning_rate": 1.3463379009005892e-07, + "loss": 0.93897271, + "num_input_tokens_seen": 317798830, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.23449707, + "step": 14737, + "time_per_iteration": 2.6794421672821045 + }, + { + "auxiliary_loss_clip": 0.01251855, + "auxiliary_loss_mlp": 0.00219483, + "balance_loss_clip": 1.0376693, + "balance_loss_mlp": 0.1937104, + "epoch": 0.8860964978205321, + "flos": 35954912304000.0, + "grad_norm": 30.746100708611035, + "language_loss": 0.77321517, + "learning_rate": 1.3449334785229093e-07, + "loss": 0.78792858, + "num_input_tokens_seen": 317819235, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.25769043, + "step": 14738, + "time_per_iteration": 2.7835981845855713 + }, + { + "auxiliary_loss_clip": 0.01255107, + "auxiliary_loss_mlp": 0.00214654, + "balance_loss_clip": 1.02978778, + "balance_loss_mlp": 0.18833217, + "epoch": 0.8861566210732, + "flos": 21212577002880.0, + "grad_norm": 14.527332357269145, + "language_loss": 0.83336455, + "learning_rate": 1.343529763547222e-07, + "loss": 0.84806216, + "num_input_tokens_seen": 317836785, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.26330566, + "step": 14739, + "time_per_iteration": 2.6709787845611572 + }, + { + "auxiliary_loss_clip": 0.01229832, + "auxiliary_loss_mlp": 0.00209107, + "balance_loss_clip": 1.0188508, + "balance_loss_mlp": 0.18509871, + "epoch": 0.886216744325868, + "flos": 14609071843200.0, + "grad_norm": 14.006959975435175, + "language_loss": 0.93290395, + "learning_rate": 1.3421267560267559e-07, + "loss": 0.94729328, + "num_input_tokens_seen": 317854225, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.23999023, + "step": 14740, + "time_per_iteration": 2.6033682823181152 + }, + { + "auxiliary_loss_clip": 0.01232416, + "auxiliary_loss_mlp": 0.0022007, + "balance_loss_clip": 1.0190444, + "balance_loss_mlp": 0.19571508, + "epoch": 0.886276867578536, + "flos": 26651643062400.0, + "grad_norm": 37.813440093416524, + "language_loss": 0.71888936, + "learning_rate": 1.34072445601471e-07, + "loss": 0.73341423, + "num_input_tokens_seen": 317874865, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.24353027, + "step": 14741, + "time_per_iteration": 2.6978201866149902 + }, + { + "auxiliary_loss_clip": 0.01248034, + "auxiliary_loss_mlp": 0.00217692, + "balance_loss_clip": 1.03103518, + "balance_loss_mlp": 0.19157293, + "epoch": 0.886336990831204, + "flos": 16764071281920.0, + "grad_norm": 27.38537539454488, + "language_loss": 0.82577485, + "learning_rate": 1.3393228635642717e-07, + "loss": 0.84043217, + "num_input_tokens_seen": 317892830, + "router_z_loss_clip": 2.16894531, + "router_z_loss_mlp": 0.2611084, + "step": 14742, + "time_per_iteration": 2.628241777420044 + }, + { + "auxiliary_loss_clip": 0.012238, + "auxiliary_loss_mlp": 0.00209766, + "balance_loss_clip": 1.01256871, + "balance_loss_mlp": 0.18604338, + "epoch": 0.8863971140838719, + "flos": 25265275781760.0, + "grad_norm": 27.7719153692272, + "language_loss": 0.67324054, + "learning_rate": 1.3379219787285733e-07, + "loss": 0.68757623, + "num_input_tokens_seen": 317911780, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.23718262, + "step": 14743, + "time_per_iteration": 2.860140562057495 + }, + { + "auxiliary_loss_clip": 0.01257017, + "auxiliary_loss_mlp": 0.00214389, + "balance_loss_clip": 1.0342195, + "balance_loss_mlp": 0.18661365, + "epoch": 0.8864572373365399, + "flos": 23404313076480.0, + "grad_norm": 10.071489109421657, + "language_loss": 0.67864847, + "learning_rate": 1.3365218015607437e-07, + "loss": 0.69336253, + "num_input_tokens_seen": 317932855, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.27770996, + "step": 14744, + "time_per_iteration": 2.66629695892334 + }, + { + "auxiliary_loss_clip": 0.01234161, + "auxiliary_loss_mlp": 0.00218347, + "balance_loss_clip": 1.01898837, + "balance_loss_mlp": 0.19256178, + "epoch": 0.8865173605892078, + "flos": 18548759456640.0, + "grad_norm": 6.586067973907986, + "language_loss": 0.84048098, + "learning_rate": 1.3351223321138762e-07, + "loss": 0.8550061, + "num_input_tokens_seen": 317952090, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.2578125, + "step": 14745, + "time_per_iteration": 4.181748390197754 + }, + { + "auxiliary_loss_clip": 0.01234048, + "auxiliary_loss_mlp": 0.00202109, + "balance_loss_clip": 1.02069712, + "balance_loss_mlp": 0.17838621, + "epoch": 0.8865774838418758, + "flos": 19025868833280.0, + "grad_norm": 39.64515260491521, + "language_loss": 0.8649857, + "learning_rate": 1.3337235704410454e-07, + "loss": 0.87934726, + "num_input_tokens_seen": 317970370, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.23718262, + "step": 14746, + "time_per_iteration": 4.153962850570679 + }, + { + "auxiliary_loss_clip": 0.01258901, + "auxiliary_loss_mlp": 0.00197276, + "balance_loss_clip": 1.03134692, + "balance_loss_mlp": 0.17121691, + "epoch": 0.8866376070945439, + "flos": 22163168482560.0, + "grad_norm": 3.922959358141624, + "language_loss": 0.85235119, + "learning_rate": 1.3323255165952873e-07, + "loss": 0.86691296, + "num_input_tokens_seen": 317989125, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.26062012, + "step": 14747, + "time_per_iteration": 2.6865029335021973 + }, + { + "auxiliary_loss_clip": 0.01228439, + "auxiliary_loss_mlp": 0.00240471, + "balance_loss_clip": 1.01500547, + "balance_loss_mlp": 0.21579504, + "epoch": 0.8866977303472118, + "flos": 20704261685760.0, + "grad_norm": 11.243385949710184, + "language_loss": 0.90725338, + "learning_rate": 1.3309281706296127e-07, + "loss": 0.92194253, + "num_input_tokens_seen": 318007820, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.24694824, + "step": 14748, + "time_per_iteration": 2.6466941833496094 + }, + { + "auxiliary_loss_clip": 0.01248031, + "auxiliary_loss_mlp": 0.00206547, + "balance_loss_clip": 1.02928936, + "balance_loss_mlp": 0.18178675, + "epoch": 0.8867578535998798, + "flos": 48794448533760.0, + "grad_norm": 15.32179602818951, + "language_loss": 0.83754945, + "learning_rate": 1.3295315325970148e-07, + "loss": 0.85209513, + "num_input_tokens_seen": 318030435, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.2479248, + "step": 14749, + "time_per_iteration": 2.9730517864227295 + }, + { + "auxiliary_loss_clip": 0.01252489, + "auxiliary_loss_mlp": 0.00201429, + "balance_loss_clip": 1.02972555, + "balance_loss_mlp": 0.17476191, + "epoch": 0.8868179768525477, + "flos": 21105312013440.0, + "grad_norm": 4.920100825621436, + "language_loss": 0.79544407, + "learning_rate": 1.328135602550451e-07, + "loss": 0.80998325, + "num_input_tokens_seen": 318049465, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.2668457, + "step": 14750, + "time_per_iteration": 2.65580153465271 + }, + { + "auxiliary_loss_clip": 0.01237761, + "auxiliary_loss_mlp": 0.00215572, + "balance_loss_clip": 1.02317929, + "balance_loss_mlp": 0.19224297, + "epoch": 0.8868781001052157, + "flos": 21830922656640.0, + "grad_norm": 10.802415020196346, + "language_loss": 0.6825816, + "learning_rate": 1.3267403805428546e-07, + "loss": 0.69711494, + "num_input_tokens_seen": 318067760, + "router_z_loss_clip": 2.14550781, + "router_z_loss_mlp": 0.23339844, + "step": 14751, + "time_per_iteration": 2.773310899734497 + }, + { + "auxiliary_loss_clip": 0.01251682, + "auxiliary_loss_mlp": 0.00217684, + "balance_loss_clip": 1.03348362, + "balance_loss_mlp": 0.19247118, + "epoch": 0.8869382233578836, + "flos": 13516418073600.0, + "grad_norm": 23.16617959842969, + "language_loss": 0.90325141, + "learning_rate": 1.3253458666271344e-07, + "loss": 0.91794509, + "num_input_tokens_seen": 318082785, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.2520752, + "step": 14752, + "time_per_iteration": 4.029150485992432 + }, + { + "auxiliary_loss_clip": 0.01252162, + "auxiliary_loss_mlp": 0.00223627, + "balance_loss_clip": 1.02629507, + "balance_loss_mlp": 0.19598264, + "epoch": 0.8869983466105517, + "flos": 22704988210560.0, + "grad_norm": 17.728027801765545, + "language_loss": 0.87223834, + "learning_rate": 1.3239520608561793e-07, + "loss": 0.88699621, + "num_input_tokens_seen": 318101925, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.27648926, + "step": 14753, + "time_per_iteration": 2.6618664264678955 + }, + { + "auxiliary_loss_clip": 0.01252566, + "auxiliary_loss_mlp": 0.00213285, + "balance_loss_clip": 1.03252077, + "balance_loss_mlp": 0.18841785, + "epoch": 0.8870584698632196, + "flos": 15340751884800.0, + "grad_norm": 214.27015660362002, + "language_loss": 0.7496686, + "learning_rate": 1.3225589632828248e-07, + "loss": 0.76432705, + "num_input_tokens_seen": 318119945, + "router_z_loss_clip": 2.20019531, + "router_z_loss_mlp": 0.2487793, + "step": 14754, + "time_per_iteration": 2.6466569900512695 + }, + { + "auxiliary_loss_clip": 0.0124995, + "auxiliary_loss_mlp": 0.00207426, + "balance_loss_clip": 1.03139591, + "balance_loss_mlp": 0.18300045, + "epoch": 0.8871185931158876, + "flos": 26615624699520.0, + "grad_norm": 18.597687385218368, + "language_loss": 0.84435606, + "learning_rate": 1.3211665739599065e-07, + "loss": 0.85892987, + "num_input_tokens_seen": 318139685, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.24438477, + "step": 14755, + "time_per_iteration": 2.6790030002593994 + }, + { + "auxiliary_loss_clip": 0.01238464, + "auxiliary_loss_mlp": 0.00216439, + "balance_loss_clip": 1.02015734, + "balance_loss_mlp": 0.18911585, + "epoch": 0.8871787163685555, + "flos": 21799034357760.0, + "grad_norm": 8.841719774390247, + "language_loss": 0.83539402, + "learning_rate": 1.3197748929402262e-07, + "loss": 0.8499431, + "num_input_tokens_seen": 318160375, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.2734375, + "step": 14756, + "time_per_iteration": 2.707352876663208 + }, + { + "auxiliary_loss_clip": 0.01244117, + "auxiliary_loss_mlp": 0.00218655, + "balance_loss_clip": 1.02428746, + "balance_loss_mlp": 0.19382419, + "epoch": 0.8872388396212235, + "flos": 14902964922240.0, + "grad_norm": 18.19898344030165, + "language_loss": 0.85407734, + "learning_rate": 1.3183839202765535e-07, + "loss": 0.86870503, + "num_input_tokens_seen": 318177995, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.24816895, + "step": 14757, + "time_per_iteration": 4.07853889465332 + }, + { + "auxiliary_loss_clip": 0.01221029, + "auxiliary_loss_mlp": 0.0020661, + "balance_loss_clip": 1.00837159, + "balance_loss_mlp": 0.18337612, + "epoch": 0.8872989628738914, + "flos": 26432157006720.0, + "grad_norm": 6.982756854302053, + "language_loss": 0.75985229, + "learning_rate": 1.316993656021632e-07, + "loss": 0.77412868, + "num_input_tokens_seen": 318197030, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.23254395, + "step": 14758, + "time_per_iteration": 2.8234407901763916 + }, + { + "auxiliary_loss_clip": 0.01260079, + "auxiliary_loss_mlp": 0.00225863, + "balance_loss_clip": 1.04188788, + "balance_loss_mlp": 0.20001872, + "epoch": 0.8873590861265594, + "flos": 48142562555520.0, + "grad_norm": 73.20494585873374, + "language_loss": 0.76197946, + "learning_rate": 1.3156041002281915e-07, + "loss": 0.7768389, + "num_input_tokens_seen": 318221780, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.25817871, + "step": 14759, + "time_per_iteration": 2.9613053798675537 + }, + { + "auxiliary_loss_clip": 0.01216966, + "auxiliary_loss_mlp": 0.00211678, + "balance_loss_clip": 1.00351095, + "balance_loss_mlp": 0.18782456, + "epoch": 0.8874192093792275, + "flos": 18332972501760.0, + "grad_norm": 4.934560025686785, + "language_loss": 0.83832526, + "learning_rate": 1.3142152529489092e-07, + "loss": 0.85261172, + "num_input_tokens_seen": 318239710, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.23864746, + "step": 14760, + "time_per_iteration": 2.6534879207611084 + }, + { + "auxiliary_loss_clip": 0.01250032, + "auxiliary_loss_mlp": 0.00202303, + "balance_loss_clip": 1.02884221, + "balance_loss_mlp": 0.17654169, + "epoch": 0.8874793326318954, + "flos": 17894215872000.0, + "grad_norm": 182.49121923634502, + "language_loss": 0.87312514, + "learning_rate": 1.3128271142364565e-07, + "loss": 0.88764846, + "num_input_tokens_seen": 318257425, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.25769043, + "step": 14761, + "time_per_iteration": 2.623445749282837 + }, + { + "auxiliary_loss_clip": 0.01224673, + "auxiliary_loss_mlp": 0.00215614, + "balance_loss_clip": 1.01293731, + "balance_loss_mlp": 0.19242817, + "epoch": 0.8875394558845634, + "flos": 31102231772160.0, + "grad_norm": 32.79207795417152, + "language_loss": 0.71176195, + "learning_rate": 1.3114396841434717e-07, + "loss": 0.72616482, + "num_input_tokens_seen": 318278485, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.23168945, + "step": 14762, + "time_per_iteration": 2.7377376556396484 + }, + { + "auxiliary_loss_clip": 0.01237343, + "auxiliary_loss_mlp": 0.00208586, + "balance_loss_clip": 1.02290916, + "balance_loss_mlp": 0.18427958, + "epoch": 0.8875995791372313, + "flos": 21142048648320.0, + "grad_norm": 28.374094330900785, + "language_loss": 0.74814999, + "learning_rate": 1.3100529627225697e-07, + "loss": 0.76260924, + "num_input_tokens_seen": 318297560, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.2434082, + "step": 14763, + "time_per_iteration": 2.727959156036377 + }, + { + "auxiliary_loss_clip": 0.01255143, + "auxiliary_loss_mlp": 0.00204986, + "balance_loss_clip": 1.03000903, + "balance_loss_mlp": 0.17912953, + "epoch": 0.8876597023898993, + "flos": 17455136019840.0, + "grad_norm": 115.8381253482719, + "language_loss": 0.78874862, + "learning_rate": 1.3086669500263335e-07, + "loss": 0.80334997, + "num_input_tokens_seen": 318313060, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.25866699, + "step": 14764, + "time_per_iteration": 2.6225690841674805 + }, + { + "auxiliary_loss_clip": 0.01245192, + "auxiliary_loss_mlp": 0.002189, + "balance_loss_clip": 1.02342641, + "balance_loss_mlp": 0.19454512, + "epoch": 0.8877198256425672, + "flos": 22707933125760.0, + "grad_norm": 24.110255011175106, + "language_loss": 0.76687437, + "learning_rate": 1.3072816461073166e-07, + "loss": 0.78151536, + "num_input_tokens_seen": 318332030, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.24328613, + "step": 14765, + "time_per_iteration": 2.7009658813476562 + }, + { + "auxiliary_loss_clip": 0.01226327, + "auxiliary_loss_mlp": 0.00221086, + "balance_loss_clip": 1.01518691, + "balance_loss_mlp": 0.1960997, + "epoch": 0.8877799488952353, + "flos": 24535104111360.0, + "grad_norm": 4.555663669142268, + "language_loss": 0.84501046, + "learning_rate": 1.3058970510180568e-07, + "loss": 0.85948461, + "num_input_tokens_seen": 318351090, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.24987793, + "step": 14766, + "time_per_iteration": 2.699216842651367 + }, + { + "auxiliary_loss_clip": 0.01232446, + "auxiliary_loss_mlp": 0.00209461, + "balance_loss_clip": 1.01773584, + "balance_loss_mlp": 0.18601286, + "epoch": 0.8878400721479032, + "flos": 20959191486720.0, + "grad_norm": 79.07665593989941, + "language_loss": 0.81851208, + "learning_rate": 1.3045131648110496e-07, + "loss": 0.83293122, + "num_input_tokens_seen": 318372000, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.23474121, + "step": 14767, + "time_per_iteration": 2.748370409011841 + }, + { + "auxiliary_loss_clip": 0.01225174, + "auxiliary_loss_mlp": 0.00190151, + "balance_loss_clip": 1.01352239, + "balance_loss_mlp": 0.16716768, + "epoch": 0.8879001954005712, + "flos": 25295260659840.0, + "grad_norm": 12.27886566546524, + "language_loss": 0.79586577, + "learning_rate": 1.303129987538778e-07, + "loss": 0.81001902, + "num_input_tokens_seen": 318391530, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.22961426, + "step": 14768, + "time_per_iteration": 2.6915369033813477 + }, + { + "auxiliary_loss_clip": 0.01240405, + "auxiliary_loss_mlp": 0.0019566, + "balance_loss_clip": 1.02681005, + "balance_loss_mlp": 0.17128217, + "epoch": 0.8879603186532391, + "flos": 23185329811200.0, + "grad_norm": 11.141767622818953, + "language_loss": 0.78676951, + "learning_rate": 1.3017475192536932e-07, + "loss": 0.80113024, + "num_input_tokens_seen": 318410690, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.24389648, + "step": 14769, + "time_per_iteration": 2.7202377319335938 + }, + { + "auxiliary_loss_clip": 0.01218863, + "auxiliary_loss_mlp": 0.00202504, + "balance_loss_clip": 1.00854242, + "balance_loss_mlp": 0.18000922, + "epoch": 0.8880204419059071, + "flos": 13655427707520.0, + "grad_norm": 19989.14465877103, + "language_loss": 0.74721152, + "learning_rate": 1.3003657600082174e-07, + "loss": 0.7614252, + "num_input_tokens_seen": 318427380, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.22509766, + "step": 14770, + "time_per_iteration": 2.70035982131958 + }, + { + "auxiliary_loss_clip": 0.01219162, + "auxiliary_loss_mlp": 0.00221951, + "balance_loss_clip": 1.00925684, + "balance_loss_mlp": 0.19763236, + "epoch": 0.888080565158575, + "flos": 20631865824000.0, + "grad_norm": 463.96450495673207, + "language_loss": 0.72878861, + "learning_rate": 1.2989847098547424e-07, + "loss": 0.74319971, + "num_input_tokens_seen": 318448530, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.24304199, + "step": 14771, + "time_per_iteration": 2.695709705352783 + }, + { + "auxiliary_loss_clip": 0.01236276, + "auxiliary_loss_mlp": 0.00227089, + "balance_loss_clip": 1.02139473, + "balance_loss_mlp": 0.20311588, + "epoch": 0.888140688411243, + "flos": 28620014411520.0, + "grad_norm": 52.38675223208279, + "language_loss": 0.87537813, + "learning_rate": 1.2976043688456396e-07, + "loss": 0.89001173, + "num_input_tokens_seen": 318468655, + "router_z_loss_clip": 2.15136719, + "router_z_loss_mlp": 0.23986816, + "step": 14772, + "time_per_iteration": 2.738405466079712 + }, + { + "auxiliary_loss_clip": 0.01216898, + "auxiliary_loss_mlp": 0.00219376, + "balance_loss_clip": 1.01016068, + "balance_loss_mlp": 0.19607024, + "epoch": 0.8882008116639111, + "flos": 25520241496320.0, + "grad_norm": 4.1739591762975685, + "language_loss": 0.82001591, + "learning_rate": 1.296224737033258e-07, + "loss": 0.8343786, + "num_input_tokens_seen": 318488740, + "router_z_loss_clip": 2.06542969, + "router_z_loss_mlp": 0.23303223, + "step": 14773, + "time_per_iteration": 2.6990530490875244 + }, + { + "auxiliary_loss_clip": 0.01237028, + "auxiliary_loss_mlp": 0.0019712, + "balance_loss_clip": 1.02510619, + "balance_loss_mlp": 0.17442302, + "epoch": 0.888260934916579, + "flos": 27673696650240.0, + "grad_norm": 681.7323883765392, + "language_loss": 0.82029927, + "learning_rate": 1.294845814469907e-07, + "loss": 0.83464074, + "num_input_tokens_seen": 318508810, + "router_z_loss_clip": 2.12207031, + "router_z_loss_mlp": 0.2265625, + "step": 14774, + "time_per_iteration": 2.729085922241211 + }, + { + "auxiliary_loss_clip": 0.01254208, + "auxiliary_loss_mlp": 0.00225846, + "balance_loss_clip": 1.03203201, + "balance_loss_mlp": 0.19959661, + "epoch": 0.888321058169247, + "flos": 21611077464960.0, + "grad_norm": 9.49057400519304, + "language_loss": 0.84317327, + "learning_rate": 1.2934676012078783e-07, + "loss": 0.85797381, + "num_input_tokens_seen": 318526860, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.26245117, + "step": 14775, + "time_per_iteration": 2.6381781101226807 + }, + { + "auxiliary_loss_clip": 0.01231492, + "auxiliary_loss_mlp": 0.0019848, + "balance_loss_clip": 1.01850867, + "balance_loss_mlp": 0.17503199, + "epoch": 0.8883811814219149, + "flos": 18149109759360.0, + "grad_norm": 19.47064955627887, + "language_loss": 0.87001908, + "learning_rate": 1.292090097299432e-07, + "loss": 0.88431883, + "num_input_tokens_seen": 318545180, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.23425293, + "step": 14776, + "time_per_iteration": 2.643582582473755 + }, + { + "auxiliary_loss_clip": 0.0125934, + "auxiliary_loss_mlp": 0.00214253, + "balance_loss_clip": 1.03377032, + "balance_loss_mlp": 0.1888133, + "epoch": 0.8884413046745829, + "flos": 28324648874880.0, + "grad_norm": 14.786214627757134, + "language_loss": 0.7939924, + "learning_rate": 1.290713302796802e-07, + "loss": 0.80872834, + "num_input_tokens_seen": 318564350, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.25463867, + "step": 14777, + "time_per_iteration": 2.711566686630249 + }, + { + "auxiliary_loss_clip": 0.01226045, + "auxiliary_loss_mlp": 0.00201912, + "balance_loss_clip": 1.01427567, + "balance_loss_mlp": 0.17812979, + "epoch": 0.8885014279272508, + "flos": 15158756649600.0, + "grad_norm": 6.87632635900233, + "language_loss": 0.79748499, + "learning_rate": 1.2893372177522e-07, + "loss": 0.8117646, + "num_input_tokens_seen": 318582275, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.23791504, + "step": 14778, + "time_per_iteration": 2.6354029178619385 + }, + { + "auxiliary_loss_clip": 0.01239512, + "auxiliary_loss_mlp": 0.00214121, + "balance_loss_clip": 1.02278554, + "balance_loss_mlp": 0.19027874, + "epoch": 0.8885615511799189, + "flos": 19099593498240.0, + "grad_norm": 43.0590915770348, + "language_loss": 0.83736879, + "learning_rate": 1.287961842217804e-07, + "loss": 0.85190517, + "num_input_tokens_seen": 318601230, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.23828125, + "step": 14779, + "time_per_iteration": 2.639014959335327 + }, + { + "auxiliary_loss_clip": 0.01125381, + "auxiliary_loss_mlp": 0.00158438, + "balance_loss_clip": 0.98438537, + "balance_loss_mlp": 0.15018876, + "epoch": 0.8886216744325868, + "flos": 51186567605760.0, + "grad_norm": 0.8466552501689502, + "language_loss": 0.55715346, + "learning_rate": 1.2865871762457747e-07, + "loss": 0.56999165, + "num_input_tokens_seen": 318645595, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.08251953, + "step": 14780, + "time_per_iteration": 2.971055507659912 + }, + { + "auxiliary_loss_clip": 0.0112768, + "auxiliary_loss_mlp": 0.00080199, + "balance_loss_clip": 0.98505974, + "balance_loss_mlp": 0.07366651, + "epoch": 0.8886817976852548, + "flos": 61612981263360.0, + "grad_norm": 0.7735029368922866, + "language_loss": 0.61341631, + "learning_rate": 1.2852132198882326e-07, + "loss": 0.62549508, + "num_input_tokens_seen": 318707850, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.06542969, + "step": 14781, + "time_per_iteration": 3.202974557876587 + }, + { + "auxiliary_loss_clip": 0.01122802, + "auxiliary_loss_mlp": 0.00179859, + "balance_loss_clip": 0.98479784, + "balance_loss_mlp": 0.17060831, + "epoch": 0.8887419209379227, + "flos": 60646946935680.0, + "grad_norm": 0.7798984368725708, + "language_loss": 0.57402527, + "learning_rate": 1.2838399731972805e-07, + "loss": 0.58705187, + "num_input_tokens_seen": 318764915, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.09228516, + "step": 14782, + "time_per_iteration": 2.9798190593719482 + }, + { + "auxiliary_loss_clip": 0.01219296, + "auxiliary_loss_mlp": 0.00222758, + "balance_loss_clip": 1.01412058, + "balance_loss_mlp": 0.19880924, + "epoch": 0.8888020441905907, + "flos": 29205861235200.0, + "grad_norm": 28.947117193984937, + "language_loss": 0.72636372, + "learning_rate": 1.2824674362249922e-07, + "loss": 0.74078423, + "num_input_tokens_seen": 318785660, + "router_z_loss_clip": 2.05175781, + "router_z_loss_mlp": 0.23950195, + "step": 14783, + "time_per_iteration": 2.800751209259033 + }, + { + "auxiliary_loss_clip": 0.01256312, + "auxiliary_loss_mlp": 0.0022295, + "balance_loss_clip": 1.03535509, + "balance_loss_mlp": 0.19878623, + "epoch": 0.8888621674432586, + "flos": 22162701605760.0, + "grad_norm": 46.33082866041383, + "language_loss": 0.84973061, + "learning_rate": 1.281095609023415e-07, + "loss": 0.86452323, + "num_input_tokens_seen": 318806080, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.24145508, + "step": 14784, + "time_per_iteration": 2.663804769515991 + }, + { + "auxiliary_loss_clip": 0.01272192, + "auxiliary_loss_mlp": 0.00219273, + "balance_loss_clip": 1.04370308, + "balance_loss_mlp": 0.19359508, + "epoch": 0.8889222906959267, + "flos": 27672834723840.0, + "grad_norm": 185.7006610821539, + "language_loss": 0.71530616, + "learning_rate": 1.279724491644565e-07, + "loss": 0.73022079, + "num_input_tokens_seen": 318826445, + "router_z_loss_clip": 2.28320312, + "router_z_loss_mlp": 0.2565918, + "step": 14785, + "time_per_iteration": 2.7417757511138916 + }, + { + "auxiliary_loss_clip": 0.01250592, + "auxiliary_loss_mlp": 0.00185635, + "balance_loss_clip": 1.03232384, + "balance_loss_mlp": 0.16049415, + "epoch": 0.8889824139485947, + "flos": 14168627274240.0, + "grad_norm": 36.21589165396545, + "language_loss": 0.72570443, + "learning_rate": 1.278354084140445e-07, + "loss": 0.74006677, + "num_input_tokens_seen": 318843915, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.25146484, + "step": 14786, + "time_per_iteration": 2.5951390266418457 + }, + { + "auxiliary_loss_clip": 0.01251956, + "auxiliary_loss_mlp": 0.00210768, + "balance_loss_clip": 1.02966595, + "balance_loss_mlp": 0.18529284, + "epoch": 0.8890425372012626, + "flos": 12853003829760.0, + "grad_norm": 110.93755951110654, + "language_loss": 0.94508791, + "learning_rate": 1.276984386563009e-07, + "loss": 0.95971519, + "num_input_tokens_seen": 318859670, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.25476074, + "step": 14787, + "time_per_iteration": 2.7349014282226562 + }, + { + "auxiliary_loss_clip": 0.01259026, + "auxiliary_loss_mlp": 0.00216646, + "balance_loss_clip": 1.03612363, + "balance_loss_mlp": 0.19038454, + "epoch": 0.8891026604539306, + "flos": 21689291329920.0, + "grad_norm": 7.674388644363673, + "language_loss": 0.8274104, + "learning_rate": 1.2756153989642027e-07, + "loss": 0.84216714, + "num_input_tokens_seen": 318877855, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.26293945, + "step": 14788, + "time_per_iteration": 4.149017810821533 + }, + { + "auxiliary_loss_clip": 0.01222195, + "auxiliary_loss_mlp": 0.00204446, + "balance_loss_clip": 1.00908387, + "balance_loss_mlp": 0.18053274, + "epoch": 0.8891627837065985, + "flos": 21871430219520.0, + "grad_norm": 226.03722063454896, + "language_loss": 0.76243305, + "learning_rate": 1.274247121395935e-07, + "loss": 0.77669954, + "num_input_tokens_seen": 318896045, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.23913574, + "step": 14789, + "time_per_iteration": 4.26997971534729 + }, + { + "auxiliary_loss_clip": 0.01247561, + "auxiliary_loss_mlp": 0.00224437, + "balance_loss_clip": 1.02744579, + "balance_loss_mlp": 0.19879508, + "epoch": 0.8892229069592665, + "flos": 21580230660480.0, + "grad_norm": 28.73973978282685, + "language_loss": 0.78513843, + "learning_rate": 1.2728795539100956e-07, + "loss": 0.79985845, + "num_input_tokens_seen": 318915515, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.25622559, + "step": 14790, + "time_per_iteration": 2.650146007537842 + }, + { + "auxiliary_loss_clip": 0.01233262, + "auxiliary_loss_mlp": 0.00210409, + "balance_loss_clip": 1.02166188, + "balance_loss_mlp": 0.18606693, + "epoch": 0.8892830302119344, + "flos": 23075981832960.0, + "grad_norm": 117.5096818322478, + "language_loss": 0.79814732, + "learning_rate": 1.2715126965585387e-07, + "loss": 0.81258404, + "num_input_tokens_seen": 318934305, + "router_z_loss_clip": 2.11425781, + "router_z_loss_mlp": 0.24353027, + "step": 14791, + "time_per_iteration": 2.654984474182129 + }, + { + "auxiliary_loss_clip": 0.01228967, + "auxiliary_loss_mlp": 0.00212495, + "balance_loss_clip": 1.0181551, + "balance_loss_mlp": 0.18926108, + "epoch": 0.8893431534646025, + "flos": 23072139077760.0, + "grad_norm": 2.918491780417806, + "language_loss": 0.82131052, + "learning_rate": 1.2701465493931008e-07, + "loss": 0.83572513, + "num_input_tokens_seen": 318953880, + "router_z_loss_clip": 2.11230469, + "router_z_loss_mlp": 0.23242188, + "step": 14792, + "time_per_iteration": 2.65254545211792 + }, + { + "auxiliary_loss_clip": 0.01258557, + "auxiliary_loss_mlp": 0.00217442, + "balance_loss_clip": 1.03378868, + "balance_loss_mlp": 0.19262311, + "epoch": 0.8894032767172704, + "flos": 22454978572800.0, + "grad_norm": 93.5160236475789, + "language_loss": 0.76350605, + "learning_rate": 1.2687811124655801e-07, + "loss": 0.77826607, + "num_input_tokens_seen": 318971395, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.24816895, + "step": 14793, + "time_per_iteration": 2.700469970703125 + }, + { + "auxiliary_loss_clip": 0.01245714, + "auxiliary_loss_mlp": 0.00213111, + "balance_loss_clip": 1.02591848, + "balance_loss_mlp": 0.18659928, + "epoch": 0.8894633999699384, + "flos": 25338246261120.0, + "grad_norm": 9.87267713560225, + "language_loss": 0.7969892, + "learning_rate": 1.2674163858277552e-07, + "loss": 0.81157744, + "num_input_tokens_seen": 318990580, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.26538086, + "step": 14794, + "time_per_iteration": 4.128849267959595 + }, + { + "auxiliary_loss_clip": 0.01267707, + "auxiliary_loss_mlp": 0.00221531, + "balance_loss_clip": 1.04281092, + "balance_loss_mlp": 0.19512579, + "epoch": 0.8895235232226063, + "flos": 20994096528000.0, + "grad_norm": 19.064622345903363, + "language_loss": 0.80703443, + "learning_rate": 1.2660523695313785e-07, + "loss": 0.82192683, + "num_input_tokens_seen": 319010040, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.26416016, + "step": 14795, + "time_per_iteration": 2.678644895553589 + }, + { + "auxiliary_loss_clip": 0.01124133, + "auxiliary_loss_mlp": 0.00140264, + "balance_loss_clip": 0.98415107, + "balance_loss_mlp": 0.13330229, + "epoch": 0.8895836464752743, + "flos": 69732956764800.0, + "grad_norm": 0.9491654958233726, + "language_loss": 0.55576581, + "learning_rate": 1.2646890636281727e-07, + "loss": 0.56840974, + "num_input_tokens_seen": 319063860, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.06982422, + "step": 14796, + "time_per_iteration": 3.1199796199798584 + }, + { + "auxiliary_loss_clip": 0.0125792, + "auxiliary_loss_mlp": 0.00216331, + "balance_loss_clip": 1.03164339, + "balance_loss_mlp": 0.18952067, + "epoch": 0.8896437697279422, + "flos": 23221815050880.0, + "grad_norm": 24.179088519443376, + "language_loss": 0.79388213, + "learning_rate": 1.263326468169843e-07, + "loss": 0.80862468, + "num_input_tokens_seen": 319082335, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.26831055, + "step": 14797, + "time_per_iteration": 2.679567575454712 + }, + { + "auxiliary_loss_clip": 0.01123606, + "auxiliary_loss_mlp": 0.00099335, + "balance_loss_clip": 0.98241985, + "balance_loss_mlp": 0.09175347, + "epoch": 0.8897038929806103, + "flos": 70752711882240.0, + "grad_norm": 0.7197303182924487, + "language_loss": 0.57121325, + "learning_rate": 1.2619645832080417e-07, + "loss": 0.58344269, + "num_input_tokens_seen": 319147075, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.07568359, + "step": 14798, + "time_per_iteration": 3.1688547134399414 + }, + { + "auxiliary_loss_clip": 0.0124992, + "auxiliary_loss_mlp": 0.00219303, + "balance_loss_clip": 1.02669811, + "balance_loss_mlp": 0.19540203, + "epoch": 0.8897640162332782, + "flos": 19245103493760.0, + "grad_norm": 85.31840614827937, + "language_loss": 0.86229837, + "learning_rate": 1.2606034087944251e-07, + "loss": 0.87699062, + "num_input_tokens_seen": 319166630, + "router_z_loss_clip": 2.23339844, + "router_z_loss_mlp": 0.23913574, + "step": 14799, + "time_per_iteration": 2.6265149116516113 + }, + { + "auxiliary_loss_clip": 0.01132845, + "auxiliary_loss_mlp": 0.00118479, + "balance_loss_clip": 0.99042368, + "balance_loss_mlp": 0.11123097, + "epoch": 0.8898241394859462, + "flos": 41356275039360.0, + "grad_norm": 0.8676691905921, + "language_loss": 0.57487869, + "learning_rate": 1.2592429449806053e-07, + "loss": 0.58739191, + "num_input_tokens_seen": 319221865, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.07226562, + "step": 14800, + "time_per_iteration": 4.4376749992370605 + }, + { + "auxiliary_loss_clip": 0.01237079, + "auxiliary_loss_mlp": 0.00218917, + "balance_loss_clip": 1.02419186, + "balance_loss_mlp": 0.19614792, + "epoch": 0.8898842627386142, + "flos": 18986295024000.0, + "grad_norm": 9.551569550010948, + "language_loss": 0.74067724, + "learning_rate": 1.2578831918181698e-07, + "loss": 0.75523722, + "num_input_tokens_seen": 319240710, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.2277832, + "step": 14801, + "time_per_iteration": 2.6785690784454346 + }, + { + "auxiliary_loss_clip": 0.0126925, + "auxiliary_loss_mlp": 0.00227267, + "balance_loss_clip": 1.03896523, + "balance_loss_mlp": 0.20147052, + "epoch": 0.8899443859912821, + "flos": 13217173868160.0, + "grad_norm": 9.24054606211364, + "language_loss": 0.84493649, + "learning_rate": 1.256524149358682e-07, + "loss": 0.85990167, + "num_input_tokens_seen": 319256495, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.25817871, + "step": 14802, + "time_per_iteration": 2.6918041706085205 + }, + { + "auxiliary_loss_clip": 0.01247535, + "auxiliary_loss_mlp": 0.00219812, + "balance_loss_clip": 1.02889109, + "balance_loss_mlp": 0.19481392, + "epoch": 0.8900045092439501, + "flos": 22674680110080.0, + "grad_norm": 76.71293541428848, + "language_loss": 0.80280274, + "learning_rate": 1.2551658176536805e-07, + "loss": 0.81747627, + "num_input_tokens_seen": 319273620, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.25, + "step": 14803, + "time_per_iteration": 2.6857666969299316 + }, + { + "auxiliary_loss_clip": 0.01241617, + "auxiliary_loss_mlp": 0.00218198, + "balance_loss_clip": 1.02168894, + "balance_loss_mlp": 0.1927114, + "epoch": 0.890064632496618, + "flos": 21141617685120.0, + "grad_norm": 6.649836246285387, + "language_loss": 0.81687403, + "learning_rate": 1.2538081967546664e-07, + "loss": 0.83147216, + "num_input_tokens_seen": 319291720, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.25488281, + "step": 14804, + "time_per_iteration": 2.677724599838257 + }, + { + "auxiliary_loss_clip": 0.01232283, + "auxiliary_loss_mlp": 0.00213891, + "balance_loss_clip": 1.01835513, + "balance_loss_mlp": 0.19029987, + "epoch": 0.8901247557492861, + "flos": 23397058529280.0, + "grad_norm": 166.05009596566248, + "language_loss": 0.89395332, + "learning_rate": 1.252451286713123e-07, + "loss": 0.90841508, + "num_input_tokens_seen": 319310380, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.23596191, + "step": 14805, + "time_per_iteration": 2.6983046531677246 + }, + { + "auxiliary_loss_clip": 0.01268964, + "auxiliary_loss_mlp": 0.00200886, + "balance_loss_clip": 1.03953671, + "balance_loss_mlp": 0.1737659, + "epoch": 0.890184879001954, + "flos": 29169591477120.0, + "grad_norm": 51.99318907439467, + "language_loss": 0.75778282, + "learning_rate": 1.251095087580505e-07, + "loss": 0.77248138, + "num_input_tokens_seen": 319331765, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.27148438, + "step": 14806, + "time_per_iteration": 2.739058017730713 + }, + { + "auxiliary_loss_clip": 0.01247624, + "auxiliary_loss_mlp": 0.00205453, + "balance_loss_clip": 1.027246, + "balance_loss_mlp": 0.17911951, + "epoch": 0.890245002254622, + "flos": 14427830793600.0, + "grad_norm": 68.02772651196445, + "language_loss": 0.77944112, + "learning_rate": 1.2497395994082438e-07, + "loss": 0.7939719, + "num_input_tokens_seen": 319349135, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.26330566, + "step": 14807, + "time_per_iteration": 2.7281761169433594 + }, + { + "auxiliary_loss_clip": 0.01223408, + "auxiliary_loss_mlp": 0.00202035, + "balance_loss_clip": 1.01330471, + "balance_loss_mlp": 0.1789327, + "epoch": 0.8903051255072899, + "flos": 22382187661440.0, + "grad_norm": 7.658479076201395, + "language_loss": 0.82582355, + "learning_rate": 1.248384822247732e-07, + "loss": 0.840078, + "num_input_tokens_seen": 319368410, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.2310791, + "step": 14808, + "time_per_iteration": 2.7291417121887207 + }, + { + "auxiliary_loss_clip": 0.01230668, + "auxiliary_loss_mlp": 0.00218341, + "balance_loss_clip": 1.01641273, + "balance_loss_mlp": 0.19345005, + "epoch": 0.8903652487599579, + "flos": 20777375819520.0, + "grad_norm": 2.072279906844782, + "language_loss": 0.89640951, + "learning_rate": 1.2470307561503513e-07, + "loss": 0.91089964, + "num_input_tokens_seen": 319387535, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.24890137, + "step": 14809, + "time_per_iteration": 2.716245412826538 + }, + { + "auxiliary_loss_clip": 0.01240685, + "auxiliary_loss_mlp": 0.00220367, + "balance_loss_clip": 1.01976418, + "balance_loss_mlp": 0.19430819, + "epoch": 0.8904253720126258, + "flos": 24424499157120.0, + "grad_norm": 6.173717875626312, + "language_loss": 0.77740586, + "learning_rate": 1.2456774011674442e-07, + "loss": 0.79201639, + "num_input_tokens_seen": 319407210, + "router_z_loss_clip": 2.20605469, + "router_z_loss_mlp": 0.26086426, + "step": 14810, + "time_per_iteration": 2.8604414463043213 + }, + { + "auxiliary_loss_clip": 0.01242735, + "auxiliary_loss_mlp": 0.0020587, + "balance_loss_clip": 1.01845574, + "balance_loss_mlp": 0.18147957, + "epoch": 0.8904854952652939, + "flos": 19463871277440.0, + "grad_norm": 8.62194124345209, + "language_loss": 0.8095479, + "learning_rate": 1.2443247573503257e-07, + "loss": 0.82403392, + "num_input_tokens_seen": 319425340, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.24377441, + "step": 14811, + "time_per_iteration": 2.626422643661499 + }, + { + "auxiliary_loss_clip": 0.01254396, + "auxiliary_loss_mlp": 0.0021714, + "balance_loss_clip": 1.03248501, + "balance_loss_mlp": 0.19308399, + "epoch": 0.8905456185179618, + "flos": 50800741666560.0, + "grad_norm": 287.5271858341366, + "language_loss": 0.73163366, + "learning_rate": 1.2429728247502924e-07, + "loss": 0.74634898, + "num_input_tokens_seen": 319448150, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.24072266, + "step": 14812, + "time_per_iteration": 2.955007314682007 + }, + { + "auxiliary_loss_clip": 0.01239499, + "auxiliary_loss_mlp": 0.00213644, + "balance_loss_clip": 1.02467608, + "balance_loss_mlp": 0.19061311, + "epoch": 0.8906057417706298, + "flos": 17784867893760.0, + "grad_norm": 5.91270169639237, + "language_loss": 0.76715583, + "learning_rate": 1.24162160341861e-07, + "loss": 0.78168726, + "num_input_tokens_seen": 319466115, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.23059082, + "step": 14813, + "time_per_iteration": 2.608144998550415 + }, + { + "auxiliary_loss_clip": 0.01280395, + "auxiliary_loss_mlp": 0.00246547, + "balance_loss_clip": 1.05211663, + "balance_loss_mlp": 0.21774645, + "epoch": 0.8906658650232978, + "flos": 21944867575680.0, + "grad_norm": 1610.5879510496973, + "language_loss": 0.85230052, + "learning_rate": 1.2402710934065198e-07, + "loss": 0.86756992, + "num_input_tokens_seen": 319485255, + "router_z_loss_clip": 2.28320312, + "router_z_loss_mlp": 0.28808594, + "step": 14814, + "time_per_iteration": 2.6677498817443848 + }, + { + "auxiliary_loss_clip": 0.01252038, + "auxiliary_loss_mlp": 0.00238602, + "balance_loss_clip": 1.02986407, + "balance_loss_mlp": 0.2116611, + "epoch": 0.8907259882759657, + "flos": 21287810039040.0, + "grad_norm": 30.926198115583258, + "language_loss": 0.81736851, + "learning_rate": 1.2389212947652229e-07, + "loss": 0.83227491, + "num_input_tokens_seen": 319501800, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.26965332, + "step": 14815, + "time_per_iteration": 2.6490190029144287 + }, + { + "auxiliary_loss_clip": 0.01225417, + "auxiliary_loss_mlp": 0.00183965, + "balance_loss_clip": 1.01701987, + "balance_loss_mlp": 0.16265076, + "epoch": 0.8907861115286337, + "flos": 20120426023680.0, + "grad_norm": 9.781992650451725, + "language_loss": 0.83325088, + "learning_rate": 1.237572207545914e-07, + "loss": 0.84734476, + "num_input_tokens_seen": 319520415, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.21313477, + "step": 14816, + "time_per_iteration": 2.6856508255004883 + }, + { + "auxiliary_loss_clip": 0.01254989, + "auxiliary_loss_mlp": 0.0020033, + "balance_loss_clip": 1.02934718, + "balance_loss_mlp": 0.17537947, + "epoch": 0.8908462347813016, + "flos": 20084156265600.0, + "grad_norm": 86.79647985338453, + "language_loss": 0.85851753, + "learning_rate": 1.2362238317997476e-07, + "loss": 0.87307072, + "num_input_tokens_seen": 319538410, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.24975586, + "step": 14817, + "time_per_iteration": 2.6623480319976807 + }, + { + "auxiliary_loss_clip": 0.01114056, + "auxiliary_loss_mlp": 0.0006722, + "balance_loss_clip": 0.97441614, + "balance_loss_mlp": 0.06130673, + "epoch": 0.8909063580339697, + "flos": 65503649790720.0, + "grad_norm": 0.7265400415956129, + "language_loss": 0.5625028, + "learning_rate": 1.2348761675778517e-07, + "loss": 0.57431555, + "num_input_tokens_seen": 319602565, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.05908203, + "step": 14818, + "time_per_iteration": 3.201524019241333 + }, + { + "auxiliary_loss_clip": 0.01244608, + "auxiliary_loss_mlp": 0.00235171, + "balance_loss_clip": 1.02713323, + "balance_loss_mlp": 0.21037576, + "epoch": 0.8909664812866376, + "flos": 29863062426240.0, + "grad_norm": 4.916925331198541, + "language_loss": 0.72850049, + "learning_rate": 1.2335292149313325e-07, + "loss": 0.74329829, + "num_input_tokens_seen": 319624645, + "router_z_loss_clip": 2.17675781, + "router_z_loss_mlp": 0.24816895, + "step": 14819, + "time_per_iteration": 2.7588136196136475 + }, + { + "auxiliary_loss_clip": 0.01253094, + "auxiliary_loss_mlp": 0.0021228, + "balance_loss_clip": 1.03090167, + "balance_loss_mlp": 0.18750793, + "epoch": 0.8910266045393056, + "flos": 25447127362560.0, + "grad_norm": 5.18757025518573, + "language_loss": 0.8735764, + "learning_rate": 1.2321829739112731e-07, + "loss": 0.88823009, + "num_input_tokens_seen": 319644040, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.24768066, + "step": 14820, + "time_per_iteration": 2.701744556427002 + }, + { + "auxiliary_loss_clip": 0.01257081, + "auxiliary_loss_mlp": 0.00224542, + "balance_loss_clip": 1.03552222, + "balance_loss_mlp": 0.20130767, + "epoch": 0.8910867277919735, + "flos": 24499121662080.0, + "grad_norm": 256.891619464761, + "language_loss": 0.82331216, + "learning_rate": 1.2308374445687087e-07, + "loss": 0.83812833, + "num_input_tokens_seen": 319663930, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.23217773, + "step": 14821, + "time_per_iteration": 2.818347454071045 + }, + { + "auxiliary_loss_clip": 0.01120462, + "auxiliary_loss_mlp": 0.00090126, + "balance_loss_clip": 0.97976369, + "balance_loss_mlp": 0.08230567, + "epoch": 0.8911468510446415, + "flos": 60688136856960.0, + "grad_norm": 0.7873941092082194, + "language_loss": 0.5886057, + "learning_rate": 1.2294926269546712e-07, + "loss": 0.60071158, + "num_input_tokens_seen": 319721245, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.078125, + "step": 14822, + "time_per_iteration": 3.0373570919036865 + }, + { + "auxiliary_loss_clip": 0.01237797, + "auxiliary_loss_mlp": 0.0021959, + "balance_loss_clip": 1.02145052, + "balance_loss_mlp": 0.1937688, + "epoch": 0.8912069742973094, + "flos": 25337492075520.0, + "grad_norm": 14.60635346939878, + "language_loss": 0.7886734, + "learning_rate": 1.2281485211201515e-07, + "loss": 0.80324721, + "num_input_tokens_seen": 319741200, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.25830078, + "step": 14823, + "time_per_iteration": 2.750028371810913 + }, + { + "auxiliary_loss_clip": 0.01232151, + "auxiliary_loss_mlp": 0.00204127, + "balance_loss_clip": 1.0218246, + "balance_loss_mlp": 0.17936744, + "epoch": 0.8912670975499775, + "flos": 18223516782720.0, + "grad_norm": 6.325837786881918, + "language_loss": 0.78734505, + "learning_rate": 1.2268051271161262e-07, + "loss": 0.8017078, + "num_input_tokens_seen": 319759265, + "router_z_loss_clip": 2.10253906, + "router_z_loss_mlp": 0.24743652, + "step": 14824, + "time_per_iteration": 2.658195972442627 + }, + { + "auxiliary_loss_clip": 0.0125162, + "auxiliary_loss_mlp": 0.00208858, + "balance_loss_clip": 1.02940893, + "balance_loss_mlp": 0.18370491, + "epoch": 0.8913272208026454, + "flos": 26504481041280.0, + "grad_norm": 5.528170510190306, + "language_loss": 0.79716051, + "learning_rate": 1.2254624449935303e-07, + "loss": 0.81176525, + "num_input_tokens_seen": 319777560, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.2512207, + "step": 14825, + "time_per_iteration": 2.6786162853240967 + }, + { + "auxiliary_loss_clip": 0.01231714, + "auxiliary_loss_mlp": 0.00211441, + "balance_loss_clip": 1.02007818, + "balance_loss_mlp": 0.1872535, + "epoch": 0.8913873440553134, + "flos": 18802324540800.0, + "grad_norm": 39.61555353336122, + "language_loss": 0.7962687, + "learning_rate": 1.2241204748032786e-07, + "loss": 0.8107003, + "num_input_tokens_seen": 319794125, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.24194336, + "step": 14826, + "time_per_iteration": 2.6381447315216064 + }, + { + "auxiliary_loss_clip": 0.01244597, + "auxiliary_loss_mlp": 0.00220881, + "balance_loss_clip": 1.02837753, + "balance_loss_mlp": 0.19643164, + "epoch": 0.8914474673079814, + "flos": 20884892204160.0, + "grad_norm": 37.614104551475776, + "language_loss": 0.83316326, + "learning_rate": 1.2227792165962615e-07, + "loss": 0.84781802, + "num_input_tokens_seen": 319810310, + "router_z_loss_clip": 2.15917969, + "router_z_loss_mlp": 0.24450684, + "step": 14827, + "time_per_iteration": 2.644540786743164 + }, + { + "auxiliary_loss_clip": 0.01234176, + "auxiliary_loss_mlp": 0.00218427, + "balance_loss_clip": 1.0165751, + "balance_loss_mlp": 0.193977, + "epoch": 0.8915075905606493, + "flos": 20952439729920.0, + "grad_norm": 12.613404511314892, + "language_loss": 0.85591078, + "learning_rate": 1.221438670423336e-07, + "loss": 0.87043679, + "num_input_tokens_seen": 319828505, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.24487305, + "step": 14828, + "time_per_iteration": 2.6872520446777344 + }, + { + "auxiliary_loss_clip": 0.01235416, + "auxiliary_loss_mlp": 0.00231833, + "balance_loss_clip": 1.02008867, + "balance_loss_mlp": 0.2072404, + "epoch": 0.8915677138133173, + "flos": 23076305055360.0, + "grad_norm": 12.50503033540821, + "language_loss": 0.83322352, + "learning_rate": 1.2200988363353392e-07, + "loss": 0.84789598, + "num_input_tokens_seen": 319848680, + "router_z_loss_clip": 2.15332031, + "router_z_loss_mlp": 0.24572754, + "step": 14829, + "time_per_iteration": 2.6504921913146973 + }, + { + "auxiliary_loss_clip": 0.01249732, + "auxiliary_loss_mlp": 0.00218052, + "balance_loss_clip": 1.03211641, + "balance_loss_mlp": 0.19266048, + "epoch": 0.8916278370659853, + "flos": 23440259612160.0, + "grad_norm": 53.89431160423309, + "language_loss": 0.90450025, + "learning_rate": 1.2187597143830773e-07, + "loss": 0.91917801, + "num_input_tokens_seen": 319868835, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.25402832, + "step": 14830, + "time_per_iteration": 4.0738701820373535 + }, + { + "auxiliary_loss_clip": 0.01222595, + "auxiliary_loss_mlp": 0.00203891, + "balance_loss_clip": 1.01375258, + "balance_loss_mlp": 0.17977522, + "epoch": 0.8916879603186533, + "flos": 25160488830720.0, + "grad_norm": 6.3285340902086835, + "language_loss": 0.82234514, + "learning_rate": 1.2174213046173299e-07, + "loss": 0.83661002, + "num_input_tokens_seen": 319891585, + "router_z_loss_clip": 2.0859375, + "router_z_loss_mlp": 0.24133301, + "step": 14831, + "time_per_iteration": 4.372467756271362 + }, + { + "auxiliary_loss_clip": 0.01245391, + "auxiliary_loss_mlp": 0.00225497, + "balance_loss_clip": 1.02592611, + "balance_loss_mlp": 0.19927144, + "epoch": 0.8917480835713212, + "flos": 20229845829120.0, + "grad_norm": 12.360240526737428, + "language_loss": 0.82681191, + "learning_rate": 1.216083607088847e-07, + "loss": 0.84152079, + "num_input_tokens_seen": 319910315, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.26220703, + "step": 14832, + "time_per_iteration": 2.642322540283203 + }, + { + "auxiliary_loss_clip": 0.0125456, + "auxiliary_loss_mlp": 0.00221831, + "balance_loss_clip": 1.02941644, + "balance_loss_mlp": 0.19595017, + "epoch": 0.8918082068239892, + "flos": 26101922342400.0, + "grad_norm": 5.8449961262459516, + "language_loss": 0.7491858, + "learning_rate": 1.214746621848355e-07, + "loss": 0.76394975, + "num_input_tokens_seen": 319932275, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.25891113, + "step": 14833, + "time_per_iteration": 2.805919647216797 + }, + { + "auxiliary_loss_clip": 0.01274489, + "auxiliary_loss_mlp": 0.0023033, + "balance_loss_clip": 1.04482889, + "balance_loss_mlp": 0.20332953, + "epoch": 0.8918683300766571, + "flos": 24831439315200.0, + "grad_norm": 859.5092743737114, + "language_loss": 0.81592774, + "learning_rate": 1.2134103489465575e-07, + "loss": 0.83097595, + "num_input_tokens_seen": 319955335, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.26965332, + "step": 14834, + "time_per_iteration": 2.8268256187438965 + }, + { + "auxiliary_loss_clip": 0.01240603, + "auxiliary_loss_mlp": 0.00217363, + "balance_loss_clip": 1.02228057, + "balance_loss_mlp": 0.19141105, + "epoch": 0.8919284533293251, + "flos": 22305158945280.0, + "grad_norm": 3.6783586374603554, + "language_loss": 0.8695839, + "learning_rate": 1.2120747884341188e-07, + "loss": 0.88416362, + "num_input_tokens_seen": 319973990, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.25952148, + "step": 14835, + "time_per_iteration": 2.6570937633514404 + }, + { + "auxiliary_loss_clip": 0.01219503, + "auxiliary_loss_mlp": 0.0020631, + "balance_loss_clip": 1.01050782, + "balance_loss_mlp": 0.18306457, + "epoch": 0.891988576581993, + "flos": 30373532559360.0, + "grad_norm": 9.260615786289174, + "language_loss": 0.81403726, + "learning_rate": 1.210739940361689e-07, + "loss": 0.82829535, + "num_input_tokens_seen": 319995555, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.23278809, + "step": 14836, + "time_per_iteration": 4.125575304031372 + }, + { + "auxiliary_loss_clip": 0.01236228, + "auxiliary_loss_mlp": 0.00221021, + "balance_loss_clip": 1.02201164, + "balance_loss_mlp": 0.19517647, + "epoch": 0.8920486998346611, + "flos": 15552947479680.0, + "grad_norm": 20.781063359923706, + "language_loss": 0.77457535, + "learning_rate": 1.2094058047798838e-07, + "loss": 0.78914785, + "num_input_tokens_seen": 320012385, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.25854492, + "step": 14837, + "time_per_iteration": 2.6045784950256348 + }, + { + "auxiliary_loss_clip": 0.01249348, + "auxiliary_loss_mlp": 0.00214772, + "balance_loss_clip": 1.02460027, + "balance_loss_mlp": 0.18957132, + "epoch": 0.892108823087329, + "flos": 21214983214080.0, + "grad_norm": 2.853782607972406, + "language_loss": 0.76885271, + "learning_rate": 1.2080723817392913e-07, + "loss": 0.78349388, + "num_input_tokens_seen": 320032390, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.25231934, + "step": 14838, + "time_per_iteration": 2.6806321144104004 + }, + { + "auxiliary_loss_clip": 0.01228371, + "auxiliary_loss_mlp": 0.00193679, + "balance_loss_clip": 1.01714444, + "balance_loss_mlp": 0.16983762, + "epoch": 0.892168946339997, + "flos": 21978982517760.0, + "grad_norm": 4.336444467047846, + "language_loss": 0.84749532, + "learning_rate": 1.2067396712904777e-07, + "loss": 0.86171591, + "num_input_tokens_seen": 320052885, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.23852539, + "step": 14839, + "time_per_iteration": 2.657876968383789 + }, + { + "auxiliary_loss_clip": 0.01125157, + "auxiliary_loss_mlp": 0.00052874, + "balance_loss_clip": 0.98203182, + "balance_loss_mlp": 0.04624606, + "epoch": 0.892229069592665, + "flos": 67475289277440.0, + "grad_norm": 0.6650822541864144, + "language_loss": 0.48695818, + "learning_rate": 1.205407673483978e-07, + "loss": 0.49873853, + "num_input_tokens_seen": 320113685, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.06640625, + "step": 14840, + "time_per_iteration": 3.128323793411255 + }, + { + "auxiliary_loss_clip": 0.01277941, + "auxiliary_loss_mlp": 0.00233937, + "balance_loss_clip": 1.04344153, + "balance_loss_mlp": 0.20568401, + "epoch": 0.8922891928453329, + "flos": 19459561645440.0, + "grad_norm": 16.6828403638237, + "language_loss": 0.76165986, + "learning_rate": 1.2040763883703074e-07, + "loss": 0.77677864, + "num_input_tokens_seen": 320130810, + "router_z_loss_clip": 2.34960938, + "router_z_loss_mlp": 0.28271484, + "step": 14841, + "time_per_iteration": 2.8260655403137207 + }, + { + "auxiliary_loss_clip": 0.01229768, + "auxiliary_loss_mlp": 0.00217183, + "balance_loss_clip": 1.02122414, + "balance_loss_mlp": 0.19273366, + "epoch": 0.8923493160980009, + "flos": 23367396873600.0, + "grad_norm": 17.911179315408763, + "language_loss": 0.75907832, + "learning_rate": 1.2027458159999438e-07, + "loss": 0.77354789, + "num_input_tokens_seen": 320152170, + "router_z_loss_clip": 2.08496094, + "router_z_loss_mlp": 0.24475098, + "step": 14842, + "time_per_iteration": 4.229801893234253 + }, + { + "auxiliary_loss_clip": 0.01235799, + "auxiliary_loss_mlp": 0.00184018, + "balance_loss_clip": 1.02294803, + "balance_loss_mlp": 0.16208351, + "epoch": 0.8924094393506689, + "flos": 26177047637760.0, + "grad_norm": 3.8910229966174437, + "language_loss": 0.88382542, + "learning_rate": 1.2014159564233373e-07, + "loss": 0.89802355, + "num_input_tokens_seen": 320172360, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.21923828, + "step": 14843, + "time_per_iteration": 2.756709098815918 + }, + { + "auxiliary_loss_clip": 0.01266228, + "auxiliary_loss_mlp": 0.00215423, + "balance_loss_clip": 1.03937185, + "balance_loss_mlp": 0.1883032, + "epoch": 0.8924695626033369, + "flos": 22018520413440.0, + "grad_norm": 149.62640504620046, + "language_loss": 0.79110599, + "learning_rate": 1.2000868096909257e-07, + "loss": 0.80592251, + "num_input_tokens_seen": 320192130, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.27111816, + "step": 14844, + "time_per_iteration": 2.6724138259887695 + }, + { + "auxiliary_loss_clip": 0.01243136, + "auxiliary_loss_mlp": 0.00210846, + "balance_loss_clip": 1.02374411, + "balance_loss_mlp": 0.1851801, + "epoch": 0.8925296858560048, + "flos": 14793940166400.0, + "grad_norm": 5.397478653430712, + "language_loss": 1.01091218, + "learning_rate": 1.1987583758531038e-07, + "loss": 1.02545202, + "num_input_tokens_seen": 320207760, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.2565918, + "step": 14845, + "time_per_iteration": 2.658132791519165 + }, + { + "auxiliary_loss_clip": 0.01227747, + "auxiliary_loss_mlp": 0.00205687, + "balance_loss_clip": 1.01657057, + "balance_loss_mlp": 0.18141562, + "epoch": 0.8925898091086728, + "flos": 22346636175360.0, + "grad_norm": 188.36342225498996, + "language_loss": 0.79057074, + "learning_rate": 1.1974306549602476e-07, + "loss": 0.80490506, + "num_input_tokens_seen": 320225325, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.24267578, + "step": 14846, + "time_per_iteration": 2.7409181594848633 + }, + { + "auxiliary_loss_clip": 0.01266366, + "auxiliary_loss_mlp": 0.00215062, + "balance_loss_clip": 1.04028511, + "balance_loss_mlp": 0.18795379, + "epoch": 0.8926499323613407, + "flos": 45806322067200.0, + "grad_norm": 10.500963278811344, + "language_loss": 0.6361925, + "learning_rate": 1.1961036470627094e-07, + "loss": 0.65100682, + "num_input_tokens_seen": 320247645, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.27099609, + "step": 14847, + "time_per_iteration": 2.8560523986816406 + }, + { + "auxiliary_loss_clip": 0.01233721, + "auxiliary_loss_mlp": 0.00200799, + "balance_loss_clip": 1.02059031, + "balance_loss_mlp": 0.17657575, + "epoch": 0.8927100556140087, + "flos": 22127042378880.0, + "grad_norm": 20.721331744980745, + "language_loss": 0.86188722, + "learning_rate": 1.1947773522108052e-07, + "loss": 0.87623239, + "num_input_tokens_seen": 320266005, + "router_z_loss_clip": 2.13378906, + "router_z_loss_mlp": 0.24230957, + "step": 14848, + "time_per_iteration": 2.7141618728637695 + }, + { + "auxiliary_loss_clip": 0.01241322, + "auxiliary_loss_mlp": 0.0021771, + "balance_loss_clip": 1.02187753, + "balance_loss_mlp": 0.19298638, + "epoch": 0.8927701788666766, + "flos": 28330143655680.0, + "grad_norm": 380.81301208904455, + "language_loss": 0.77658772, + "learning_rate": 1.1934517704548251e-07, + "loss": 0.79117799, + "num_input_tokens_seen": 320285555, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.24731445, + "step": 14849, + "time_per_iteration": 2.6829285621643066 + }, + { + "auxiliary_loss_clip": 0.01268587, + "auxiliary_loss_mlp": 0.00224293, + "balance_loss_clip": 1.0420289, + "balance_loss_mlp": 0.1989013, + "epoch": 0.8928303021193447, + "flos": 25294973351040.0, + "grad_norm": 199.94475898237332, + "language_loss": 0.87671149, + "learning_rate": 1.1921269018450364e-07, + "loss": 0.89164025, + "num_input_tokens_seen": 320305395, + "router_z_loss_clip": 2.265625, + "router_z_loss_mlp": 0.25378418, + "step": 14850, + "time_per_iteration": 2.7260923385620117 + }, + { + "auxiliary_loss_clip": 0.01235606, + "auxiliary_loss_mlp": 0.00188173, + "balance_loss_clip": 1.01994824, + "balance_loss_mlp": 0.16431889, + "epoch": 0.8928904253720126, + "flos": 22236713579520.0, + "grad_norm": 33.21290279012587, + "language_loss": 0.82818496, + "learning_rate": 1.1908027464316872e-07, + "loss": 0.84242278, + "num_input_tokens_seen": 320324220, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.23876953, + "step": 14851, + "time_per_iteration": 2.6568989753723145 + }, + { + "auxiliary_loss_clip": 0.01251309, + "auxiliary_loss_mlp": 0.00207923, + "balance_loss_clip": 1.02861559, + "balance_loss_mlp": 0.18231666, + "epoch": 0.8929505486246806, + "flos": 27092374940160.0, + "grad_norm": 9.972798701593826, + "language_loss": 0.86626792, + "learning_rate": 1.1894793042649775e-07, + "loss": 0.88086027, + "num_input_tokens_seen": 320347195, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.25585938, + "step": 14852, + "time_per_iteration": 2.766901969909668 + }, + { + "auxiliary_loss_clip": 0.01240627, + "auxiliary_loss_mlp": 0.0021307, + "balance_loss_clip": 1.02639961, + "balance_loss_mlp": 0.18990776, + "epoch": 0.8930106718773486, + "flos": 23039352938880.0, + "grad_norm": 20.829924590031997, + "language_loss": 0.74958408, + "learning_rate": 1.1881565753951006e-07, + "loss": 0.76412106, + "num_input_tokens_seen": 320366850, + "router_z_loss_clip": 2.14550781, + "router_z_loss_mlp": 0.23144531, + "step": 14853, + "time_per_iteration": 2.696295976638794 + }, + { + "auxiliary_loss_clip": 0.0124547, + "auxiliary_loss_mlp": 0.00223209, + "balance_loss_clip": 1.02889109, + "balance_loss_mlp": 0.1992119, + "epoch": 0.8930707951300165, + "flos": 35626652887680.0, + "grad_norm": 47.832109320132965, + "language_loss": 0.75434983, + "learning_rate": 1.1868345598722118e-07, + "loss": 0.76903659, + "num_input_tokens_seen": 320388895, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.23986816, + "step": 14854, + "time_per_iteration": 2.826183795928955 + }, + { + "auxiliary_loss_clip": 0.01222914, + "auxiliary_loss_mlp": 0.00180828, + "balance_loss_clip": 1.01212811, + "balance_loss_mlp": 0.1580828, + "epoch": 0.8931309183826845, + "flos": 23039891642880.0, + "grad_norm": 10.466801041761865, + "language_loss": 0.82697409, + "learning_rate": 1.1855132577464399e-07, + "loss": 0.84101152, + "num_input_tokens_seen": 320408520, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.22741699, + "step": 14855, + "time_per_iteration": 2.6699206829071045 + }, + { + "auxiliary_loss_clip": 0.01232462, + "auxiliary_loss_mlp": 0.00210859, + "balance_loss_clip": 1.01725149, + "balance_loss_mlp": 0.18497908, + "epoch": 0.8931910416353525, + "flos": 26504624695680.0, + "grad_norm": 52.56467186414891, + "language_loss": 0.71928483, + "learning_rate": 1.1841926690678893e-07, + "loss": 0.73371804, + "num_input_tokens_seen": 320427400, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.25854492, + "step": 14856, + "time_per_iteration": 2.6590194702148438 + }, + { + "auxiliary_loss_clip": 0.01230556, + "auxiliary_loss_mlp": 0.0020583, + "balance_loss_clip": 1.01644158, + "balance_loss_mlp": 0.18033111, + "epoch": 0.8932511648880205, + "flos": 24973609345920.0, + "grad_norm": 8.92687435828467, + "language_loss": 0.74370265, + "learning_rate": 1.1828727938866378e-07, + "loss": 0.75806653, + "num_input_tokens_seen": 320447570, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.25512695, + "step": 14857, + "time_per_iteration": 2.696143627166748 + }, + { + "auxiliary_loss_clip": 0.01263935, + "auxiliary_loss_mlp": 0.00215307, + "balance_loss_clip": 1.0350225, + "balance_loss_mlp": 0.18929586, + "epoch": 0.8933112881406884, + "flos": 24460733001600.0, + "grad_norm": 1852.4414127373195, + "language_loss": 0.83326125, + "learning_rate": 1.1815536322527408e-07, + "loss": 0.84805357, + "num_input_tokens_seen": 320464405, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.2598877, + "step": 14858, + "time_per_iteration": 2.6553304195404053 + }, + { + "auxiliary_loss_clip": 0.0125149, + "auxiliary_loss_mlp": 0.00210086, + "balance_loss_clip": 1.0307008, + "balance_loss_mlp": 0.18515918, + "epoch": 0.8933714113933564, + "flos": 28293083798400.0, + "grad_norm": 12.088713937699753, + "language_loss": 0.77089417, + "learning_rate": 1.1802351842162139e-07, + "loss": 0.78550994, + "num_input_tokens_seen": 320485525, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.24926758, + "step": 14859, + "time_per_iteration": 2.7873921394348145 + }, + { + "auxiliary_loss_clip": 0.01234914, + "auxiliary_loss_mlp": 0.00187413, + "balance_loss_clip": 1.02383947, + "balance_loss_mlp": 0.16361919, + "epoch": 0.8934315346460243, + "flos": 21434864319360.0, + "grad_norm": 9.229172856493989, + "language_loss": 0.82898581, + "learning_rate": 1.1789174498270526e-07, + "loss": 0.84320909, + "num_input_tokens_seen": 320506725, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.23791504, + "step": 14860, + "time_per_iteration": 2.694922685623169 + }, + { + "auxiliary_loss_clip": 0.01259093, + "auxiliary_loss_mlp": 0.00190816, + "balance_loss_clip": 1.0393374, + "balance_loss_mlp": 0.16569835, + "epoch": 0.8934916578986923, + "flos": 23769596436480.0, + "grad_norm": 4.263036557437016, + "language_loss": 0.67248487, + "learning_rate": 1.1776004291352303e-07, + "loss": 0.68698394, + "num_input_tokens_seen": 320525425, + "router_z_loss_clip": 2.19433594, + "router_z_loss_mlp": 0.25134277, + "step": 14861, + "time_per_iteration": 2.7063214778900146 + }, + { + "auxiliary_loss_clip": 0.01225369, + "auxiliary_loss_mlp": 0.001877, + "balance_loss_clip": 1.01869524, + "balance_loss_mlp": 0.16440681, + "epoch": 0.8935517811513602, + "flos": 18916161719040.0, + "grad_norm": 46.444723478929205, + "language_loss": 0.7330786, + "learning_rate": 1.176284122190685e-07, + "loss": 0.74720931, + "num_input_tokens_seen": 320543010, + "router_z_loss_clip": 2.06738281, + "router_z_loss_mlp": 0.23303223, + "step": 14862, + "time_per_iteration": 2.657008409500122 + }, + { + "auxiliary_loss_clip": 0.01244888, + "auxiliary_loss_mlp": 0.00205301, + "balance_loss_clip": 1.02915537, + "balance_loss_mlp": 0.18046945, + "epoch": 0.8936119044040283, + "flos": 24061370613120.0, + "grad_norm": 34.46431282728762, + "language_loss": 0.85547006, + "learning_rate": 1.1749685290433298e-07, + "loss": 0.86997199, + "num_input_tokens_seen": 320562180, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.24829102, + "step": 14863, + "time_per_iteration": 2.6638290882110596 + }, + { + "auxiliary_loss_clip": 0.01232708, + "auxiliary_loss_mlp": 0.00201736, + "balance_loss_clip": 1.01962495, + "balance_loss_mlp": 0.1766305, + "epoch": 0.8936720276566962, + "flos": 21324079797120.0, + "grad_norm": 200.2983161733313, + "language_loss": 0.78342396, + "learning_rate": 1.1736536497430627e-07, + "loss": 0.79776841, + "num_input_tokens_seen": 320580395, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.25085449, + "step": 14864, + "time_per_iteration": 2.6464576721191406 + }, + { + "auxiliary_loss_clip": 0.01273099, + "auxiliary_loss_mlp": 0.00226766, + "balance_loss_clip": 1.04695022, + "balance_loss_mlp": 0.19986048, + "epoch": 0.8937321509093642, + "flos": 18406122549120.0, + "grad_norm": 19.154182163904455, + "language_loss": 0.86349857, + "learning_rate": 1.1723394843397283e-07, + "loss": 0.87849724, + "num_input_tokens_seen": 320599505, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.26879883, + "step": 14865, + "time_per_iteration": 2.6811389923095703 + }, + { + "auxiliary_loss_clip": 0.0122777, + "auxiliary_loss_mlp": 0.00232439, + "balance_loss_clip": 1.01905251, + "balance_loss_mlp": 0.20857351, + "epoch": 0.8937922741620322, + "flos": 22054754257920.0, + "grad_norm": 2.5048396708619736, + "language_loss": 0.78339553, + "learning_rate": 1.1710260328831668e-07, + "loss": 0.79799759, + "num_input_tokens_seen": 320619825, + "router_z_loss_clip": 2.0859375, + "router_z_loss_mlp": 0.23864746, + "step": 14866, + "time_per_iteration": 2.835833787918091 + }, + { + "auxiliary_loss_clip": 0.01254616, + "auxiliary_loss_mlp": 0.00224335, + "balance_loss_clip": 1.03524685, + "balance_loss_mlp": 0.19788255, + "epoch": 0.8938523974147001, + "flos": 25664386775040.0, + "grad_norm": 34.75473691416527, + "language_loss": 0.90462148, + "learning_rate": 1.1697132954231869e-07, + "loss": 0.91941094, + "num_input_tokens_seen": 320638515, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.26428223, + "step": 14867, + "time_per_iteration": 2.687352418899536 + }, + { + "auxiliary_loss_clip": 0.01238342, + "auxiliary_loss_mlp": 0.00213028, + "balance_loss_clip": 1.02169335, + "balance_loss_mlp": 0.18863782, + "epoch": 0.8939125206673681, + "flos": 25742852035200.0, + "grad_norm": 15.499879308896707, + "language_loss": 0.87227082, + "learning_rate": 1.168401272009567e-07, + "loss": 0.88678455, + "num_input_tokens_seen": 320659430, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.24389648, + "step": 14868, + "time_per_iteration": 2.7114531993865967 + }, + { + "auxiliary_loss_clip": 0.01259891, + "auxiliary_loss_mlp": 0.00213249, + "balance_loss_clip": 1.03573346, + "balance_loss_mlp": 0.18716551, + "epoch": 0.8939726439200361, + "flos": 27344503480320.0, + "grad_norm": 2.2801984206606853, + "language_loss": 0.84265888, + "learning_rate": 1.167089962692056e-07, + "loss": 0.85739028, + "num_input_tokens_seen": 320679295, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.26062012, + "step": 14869, + "time_per_iteration": 2.67999005317688 + }, + { + "auxiliary_loss_clip": 0.01232773, + "auxiliary_loss_mlp": 0.00220049, + "balance_loss_clip": 1.01740527, + "balance_loss_mlp": 0.19567063, + "epoch": 0.8940327671727041, + "flos": 20338834671360.0, + "grad_norm": 24.125237453787392, + "language_loss": 0.74091721, + "learning_rate": 1.1657793675203853e-07, + "loss": 0.75544548, + "num_input_tokens_seen": 320697535, + "router_z_loss_clip": 2.15722656, + "router_z_loss_mlp": 0.24365234, + "step": 14870, + "time_per_iteration": 2.6744472980499268 + }, + { + "auxiliary_loss_clip": 0.01116429, + "auxiliary_loss_mlp": 0.00059624, + "balance_loss_clip": 0.97637308, + "balance_loss_mlp": 0.05304325, + "epoch": 0.894092890425372, + "flos": 58410573235200.0, + "grad_norm": 0.8051386583939307, + "language_loss": 0.55274737, + "learning_rate": 1.1644694865442461e-07, + "loss": 0.56450796, + "num_input_tokens_seen": 320758635, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.06591797, + "step": 14871, + "time_per_iteration": 3.223461151123047 + }, + { + "auxiliary_loss_clip": 0.01235516, + "auxiliary_loss_mlp": 0.00211304, + "balance_loss_clip": 1.02201581, + "balance_loss_mlp": 0.18829665, + "epoch": 0.89415301367804, + "flos": 19829657427840.0, + "grad_norm": 151.41287997382028, + "language_loss": 0.84048122, + "learning_rate": 1.16316031981331e-07, + "loss": 0.85494941, + "num_input_tokens_seen": 320777175, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.22998047, + "step": 14872, + "time_per_iteration": 4.028707981109619 + }, + { + "auxiliary_loss_clip": 0.01230811, + "auxiliary_loss_mlp": 0.00209454, + "balance_loss_clip": 1.02060008, + "balance_loss_mlp": 0.18667261, + "epoch": 0.8942131369307079, + "flos": 25775781828480.0, + "grad_norm": 4.684741295558105, + "language_loss": 0.74516928, + "learning_rate": 1.1618518673772215e-07, + "loss": 0.75957191, + "num_input_tokens_seen": 320797670, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.22766113, + "step": 14873, + "time_per_iteration": 4.170468807220459 + }, + { + "auxiliary_loss_clip": 0.01232001, + "auxiliary_loss_mlp": 0.00228012, + "balance_loss_clip": 1.0217371, + "balance_loss_mlp": 0.20270348, + "epoch": 0.8942732601833759, + "flos": 23149024139520.0, + "grad_norm": 41.66496865154406, + "language_loss": 0.67082536, + "learning_rate": 1.1605441292856033e-07, + "loss": 0.68542552, + "num_input_tokens_seen": 320817410, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.25317383, + "step": 14874, + "time_per_iteration": 2.6883206367492676 + }, + { + "auxiliary_loss_clip": 0.01248819, + "auxiliary_loss_mlp": 0.00195387, + "balance_loss_clip": 1.02937627, + "balance_loss_mlp": 0.17026934, + "epoch": 0.8943333834360438, + "flos": 27855548231040.0, + "grad_norm": 39.66718117383615, + "language_loss": 0.83637321, + "learning_rate": 1.1592371055880356e-07, + "loss": 0.8508153, + "num_input_tokens_seen": 320836745, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.25134277, + "step": 14875, + "time_per_iteration": 2.725451946258545 + }, + { + "auxiliary_loss_clip": 0.01279977, + "auxiliary_loss_mlp": 0.00225378, + "balance_loss_clip": 1.04881978, + "balance_loss_mlp": 0.19531363, + "epoch": 0.8943935066887119, + "flos": 22163958581760.0, + "grad_norm": 10.350736933167141, + "language_loss": 0.87279665, + "learning_rate": 1.1579307963340857e-07, + "loss": 0.88785022, + "num_input_tokens_seen": 320853305, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.30053711, + "step": 14876, + "time_per_iteration": 2.7310967445373535 + }, + { + "auxiliary_loss_clip": 0.01243767, + "auxiliary_loss_mlp": 0.0021449, + "balance_loss_clip": 1.02492011, + "balance_loss_mlp": 0.1903621, + "epoch": 0.8944536299413798, + "flos": 21470056669440.0, + "grad_norm": 9.484179460297234, + "language_loss": 0.89338028, + "learning_rate": 1.156625201573287e-07, + "loss": 0.90796286, + "num_input_tokens_seen": 320872885, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.24145508, + "step": 14877, + "time_per_iteration": 2.6130852699279785 + }, + { + "auxiliary_loss_clip": 0.01239861, + "auxiliary_loss_mlp": 0.00201193, + "balance_loss_clip": 1.02054262, + "balance_loss_mlp": 0.17642151, + "epoch": 0.8945137531940478, + "flos": 17748777703680.0, + "grad_norm": 25.41839400662891, + "language_loss": 0.85382843, + "learning_rate": 1.155320321355151e-07, + "loss": 0.86823905, + "num_input_tokens_seen": 320889755, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.2479248, + "step": 14878, + "time_per_iteration": 2.6247568130493164 + }, + { + "auxiliary_loss_clip": 0.01262022, + "auxiliary_loss_mlp": 0.00206134, + "balance_loss_clip": 1.03823876, + "balance_loss_mlp": 0.17887044, + "epoch": 0.8945738764467158, + "flos": 21142264129920.0, + "grad_norm": 9.674992373795435, + "language_loss": 0.85473847, + "learning_rate": 1.1540161557291539e-07, + "loss": 0.86941999, + "num_input_tokens_seen": 320907860, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.27246094, + "step": 14879, + "time_per_iteration": 4.158797740936279 + }, + { + "auxiliary_loss_clip": 0.01254494, + "auxiliary_loss_mlp": 0.00237151, + "balance_loss_clip": 1.03289831, + "balance_loss_mlp": 0.20923197, + "epoch": 0.8946339996993837, + "flos": 14903000835840.0, + "grad_norm": 5.243175669435129, + "language_loss": 0.84220946, + "learning_rate": 1.1527127047447538e-07, + "loss": 0.85712588, + "num_input_tokens_seen": 320925825, + "router_z_loss_clip": 2.21777344, + "router_z_loss_mlp": 0.27929688, + "step": 14880, + "time_per_iteration": 2.60955548286438 + }, + { + "auxiliary_loss_clip": 0.01224773, + "auxiliary_loss_mlp": 0.00209524, + "balance_loss_clip": 1.01426387, + "balance_loss_mlp": 0.18503842, + "epoch": 0.8946941229520518, + "flos": 27382173868800.0, + "grad_norm": 2.2514669889732124, + "language_loss": 0.89800489, + "learning_rate": 1.1514099684513822e-07, + "loss": 0.91234791, + "num_input_tokens_seen": 320946165, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.24511719, + "step": 14881, + "time_per_iteration": 2.72633695602417 + }, + { + "auxiliary_loss_clip": 0.01220277, + "auxiliary_loss_mlp": 0.00195777, + "balance_loss_clip": 1.00721765, + "balance_loss_mlp": 0.17049289, + "epoch": 0.8947542462047197, + "flos": 31796277338880.0, + "grad_norm": 1739.1108916410462, + "language_loss": 0.74675477, + "learning_rate": 1.1501079468984287e-07, + "loss": 0.76091528, + "num_input_tokens_seen": 320969330, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.25268555, + "step": 14882, + "time_per_iteration": 2.724710464477539 + }, + { + "auxiliary_loss_clip": 0.01267393, + "auxiliary_loss_mlp": 0.00217465, + "balance_loss_clip": 1.04332852, + "balance_loss_mlp": 0.19140592, + "epoch": 0.8948143694573877, + "flos": 20883599314560.0, + "grad_norm": 21.766555815258275, + "language_loss": 0.85760981, + "learning_rate": 1.1488066401352691e-07, + "loss": 0.87245834, + "num_input_tokens_seen": 320985055, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.26062012, + "step": 14883, + "time_per_iteration": 2.62906551361084 + }, + { + "auxiliary_loss_clip": 0.01232122, + "auxiliary_loss_mlp": 0.00217335, + "balance_loss_clip": 1.01946998, + "balance_loss_mlp": 0.19251531, + "epoch": 0.8948744927100556, + "flos": 28215552291840.0, + "grad_norm": 15.458897611550313, + "language_loss": 0.79834986, + "learning_rate": 1.147506048211253e-07, + "loss": 0.81284446, + "num_input_tokens_seen": 321004720, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.24816895, + "step": 14884, + "time_per_iteration": 4.165149450302124 + }, + { + "auxiliary_loss_clip": 0.01228291, + "auxiliary_loss_mlp": 0.00206157, + "balance_loss_clip": 1.01442385, + "balance_loss_mlp": 0.18249437, + "epoch": 0.8949346159627236, + "flos": 21902672073600.0, + "grad_norm": 6.1570117006665885, + "language_loss": 0.82196987, + "learning_rate": 1.1462061711756987e-07, + "loss": 0.83631438, + "num_input_tokens_seen": 321022350, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.23657227, + "step": 14885, + "time_per_iteration": 2.658698558807373 + }, + { + "auxiliary_loss_clip": 0.0125706, + "auxiliary_loss_mlp": 0.00204819, + "balance_loss_clip": 1.03502202, + "balance_loss_mlp": 0.17923693, + "epoch": 0.8949947392153915, + "flos": 21359128492800.0, + "grad_norm": 1294.972478455038, + "language_loss": 0.89970279, + "learning_rate": 1.1449070090778911e-07, + "loss": 0.91432154, + "num_input_tokens_seen": 321040450, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.2557373, + "step": 14886, + "time_per_iteration": 2.748791217803955 + }, + { + "auxiliary_loss_clip": 0.01247392, + "auxiliary_loss_mlp": 0.00216869, + "balance_loss_clip": 1.02884293, + "balance_loss_mlp": 0.19203812, + "epoch": 0.8950548624680595, + "flos": 52445342799360.0, + "grad_norm": 6.998018999586399, + "language_loss": 0.72703046, + "learning_rate": 1.1436085619671043e-07, + "loss": 0.74167305, + "num_input_tokens_seen": 321063970, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.24804688, + "step": 14887, + "time_per_iteration": 2.9542315006256104 + }, + { + "auxiliary_loss_clip": 0.01267134, + "auxiliary_loss_mlp": 0.00204785, + "balance_loss_clip": 1.03998756, + "balance_loss_mlp": 0.17795102, + "epoch": 0.8951149857207275, + "flos": 20121323863680.0, + "grad_norm": 6.0896160092350975, + "language_loss": 0.69890332, + "learning_rate": 1.1423108298925698e-07, + "loss": 0.71362251, + "num_input_tokens_seen": 321083840, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.26843262, + "step": 14888, + "time_per_iteration": 2.694272518157959 + }, + { + "auxiliary_loss_clip": 0.01260187, + "auxiliary_loss_mlp": 0.00226161, + "balance_loss_clip": 1.03654337, + "balance_loss_mlp": 0.19991133, + "epoch": 0.8951751089733955, + "flos": 29862631463040.0, + "grad_norm": 268.725308443291, + "language_loss": 0.78572226, + "learning_rate": 1.1410138129034952e-07, + "loss": 0.80058575, + "num_input_tokens_seen": 321104165, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.26281738, + "step": 14889, + "time_per_iteration": 2.716782808303833 + }, + { + "auxiliary_loss_clip": 0.01243646, + "auxiliary_loss_mlp": 0.00183294, + "balance_loss_clip": 1.02345359, + "balance_loss_mlp": 0.15792647, + "epoch": 0.8952352322260634, + "flos": 15262789415040.0, + "grad_norm": 127.51551791418738, + "language_loss": 0.8166256, + "learning_rate": 1.1397175110490676e-07, + "loss": 0.83089495, + "num_input_tokens_seen": 321117290, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.25378418, + "step": 14890, + "time_per_iteration": 2.6683952808380127 + }, + { + "auxiliary_loss_clip": 0.01230793, + "auxiliary_loss_mlp": 0.00200441, + "balance_loss_clip": 1.01752269, + "balance_loss_mlp": 0.1752163, + "epoch": 0.8952953554787314, + "flos": 26798338206720.0, + "grad_norm": 12.86940924997561, + "language_loss": 0.83239025, + "learning_rate": 1.1384219243784454e-07, + "loss": 0.84670264, + "num_input_tokens_seen": 321137115, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.25219727, + "step": 14891, + "time_per_iteration": 2.7002155780792236 + }, + { + "auxiliary_loss_clip": 0.0125159, + "auxiliary_loss_mlp": 0.0023655, + "balance_loss_clip": 1.02668715, + "balance_loss_mlp": 0.21091953, + "epoch": 0.8953554787313994, + "flos": 14137205852160.0, + "grad_norm": 3.6486318434882565, + "language_loss": 0.84806776, + "learning_rate": 1.1371270529407517e-07, + "loss": 0.86294913, + "num_input_tokens_seen": 321154490, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.25610352, + "step": 14892, + "time_per_iteration": 2.6455674171447754 + }, + { + "auxiliary_loss_clip": 0.01231531, + "auxiliary_loss_mlp": 0.00224347, + "balance_loss_clip": 1.01788735, + "balance_loss_mlp": 0.19959939, + "epoch": 0.8954156019840673, + "flos": 25703314139520.0, + "grad_norm": 4.292466524690189, + "language_loss": 0.86547476, + "learning_rate": 1.1358328967850895e-07, + "loss": 0.88003349, + "num_input_tokens_seen": 321175625, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.24768066, + "step": 14893, + "time_per_iteration": 2.6705162525177 + }, + { + "auxiliary_loss_clip": 0.01223411, + "auxiliary_loss_mlp": 0.00207649, + "balance_loss_clip": 1.0131948, + "balance_loss_mlp": 0.1838069, + "epoch": 0.8954757252367354, + "flos": 21907987286400.0, + "grad_norm": 211.8540200969957, + "language_loss": 0.81782115, + "learning_rate": 1.1345394559605348e-07, + "loss": 0.83213174, + "num_input_tokens_seen": 321193895, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.23852539, + "step": 14894, + "time_per_iteration": 2.67779278755188 + }, + { + "auxiliary_loss_clip": 0.01261211, + "auxiliary_loss_mlp": 0.0023601, + "balance_loss_clip": 1.03628671, + "balance_loss_mlp": 0.21077336, + "epoch": 0.8955358484894033, + "flos": 12970396454400.0, + "grad_norm": 155.07973440325128, + "language_loss": 0.75956595, + "learning_rate": 1.1332467305161352e-07, + "loss": 0.77453816, + "num_input_tokens_seen": 321211610, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.25244141, + "step": 14895, + "time_per_iteration": 2.645392656326294 + }, + { + "auxiliary_loss_clip": 0.01280652, + "auxiliary_loss_mlp": 0.00218988, + "balance_loss_clip": 1.05055094, + "balance_loss_mlp": 0.19017556, + "epoch": 0.8955959717420713, + "flos": 17273966797440.0, + "grad_norm": 31.027033728606206, + "language_loss": 0.75903201, + "learning_rate": 1.1319547205009094e-07, + "loss": 0.77402842, + "num_input_tokens_seen": 321229805, + "router_z_loss_clip": 2.30078125, + "router_z_loss_mlp": 0.28796387, + "step": 14896, + "time_per_iteration": 2.665987491607666 + }, + { + "auxiliary_loss_clip": 0.01244541, + "auxiliary_loss_mlp": 0.00207774, + "balance_loss_clip": 1.02637005, + "balance_loss_mlp": 0.18208435, + "epoch": 0.8956560949947392, + "flos": 14793868339200.0, + "grad_norm": 205.63545778838653, + "language_loss": 0.83122611, + "learning_rate": 1.1306634259638492e-07, + "loss": 0.84574932, + "num_input_tokens_seen": 321247165, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.25720215, + "step": 14897, + "time_per_iteration": 2.671221971511841 + }, + { + "auxiliary_loss_clip": 0.01118846, + "auxiliary_loss_mlp": 0.0007545, + "balance_loss_clip": 0.97980624, + "balance_loss_mlp": 0.06906012, + "epoch": 0.8957162182474072, + "flos": 63607817957760.0, + "grad_norm": 0.9296164641937676, + "language_loss": 0.54259098, + "learning_rate": 1.129372846953931e-07, + "loss": 0.5545339, + "num_input_tokens_seen": 321308425, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.06396484, + "step": 14898, + "time_per_iteration": 3.2240471839904785 + }, + { + "auxiliary_loss_clip": 0.01250511, + "auxiliary_loss_mlp": 0.00212549, + "balance_loss_clip": 1.02825022, + "balance_loss_mlp": 0.18564339, + "epoch": 0.8957763415000751, + "flos": 25009843190400.0, + "grad_norm": 32.094575480965304, + "language_loss": 0.7954216, + "learning_rate": 1.12808298352008e-07, + "loss": 0.81005216, + "num_input_tokens_seen": 321329295, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.26916504, + "step": 14899, + "time_per_iteration": 2.6835107803344727 + }, + { + "auxiliary_loss_clip": 0.01258776, + "auxiliary_loss_mlp": 0.00213995, + "balance_loss_clip": 1.03573573, + "balance_loss_mlp": 0.18875885, + "epoch": 0.8958364647527431, + "flos": 19828615933440.0, + "grad_norm": 3.318614135482341, + "language_loss": 0.82058334, + "learning_rate": 1.1267938357112106e-07, + "loss": 0.83531106, + "num_input_tokens_seen": 321347580, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.25256348, + "step": 14900, + "time_per_iteration": 2.624495267868042 + }, + { + "auxiliary_loss_clip": 0.01116554, + "auxiliary_loss_mlp": 0.00069945, + "balance_loss_clip": 0.97715807, + "balance_loss_mlp": 0.06379421, + "epoch": 0.895896588005411, + "flos": 65537190115200.0, + "grad_norm": 0.7633707096228676, + "language_loss": 0.61177325, + "learning_rate": 1.1255054035762124e-07, + "loss": 0.62363827, + "num_input_tokens_seen": 321407820, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.0612793, + "step": 14901, + "time_per_iteration": 3.126039981842041 + }, + { + "auxiliary_loss_clip": 0.01259957, + "auxiliary_loss_mlp": 0.00233895, + "balance_loss_clip": 1.0341475, + "balance_loss_mlp": 0.20440264, + "epoch": 0.8959567112580791, + "flos": 25591021246080.0, + "grad_norm": 5.77365035114095, + "language_loss": 0.79208589, + "learning_rate": 1.1242176871639441e-07, + "loss": 0.80702442, + "num_input_tokens_seen": 321426745, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.29455566, + "step": 14902, + "time_per_iteration": 2.684617280960083 + }, + { + "auxiliary_loss_clip": 0.01247109, + "auxiliary_loss_mlp": 0.00204835, + "balance_loss_clip": 1.03010988, + "balance_loss_mlp": 0.18095696, + "epoch": 0.896016834510747, + "flos": 24201780877440.0, + "grad_norm": 9252.533347940654, + "language_loss": 0.85250258, + "learning_rate": 1.1229306865232313e-07, + "loss": 0.86702204, + "num_input_tokens_seen": 321446165, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.23852539, + "step": 14903, + "time_per_iteration": 2.7374775409698486 + }, + { + "auxiliary_loss_clip": 0.01264072, + "auxiliary_loss_mlp": 0.00203088, + "balance_loss_clip": 1.03139746, + "balance_loss_mlp": 0.17671871, + "epoch": 0.896076957763415, + "flos": 23075945919360.0, + "grad_norm": 3.6554740042741214, + "language_loss": 0.81586432, + "learning_rate": 1.121644401702877e-07, + "loss": 0.83053589, + "num_input_tokens_seen": 321465285, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.26367188, + "step": 14904, + "time_per_iteration": 2.6761982440948486 + }, + { + "auxiliary_loss_clip": 0.01248171, + "auxiliary_loss_mlp": 0.00205812, + "balance_loss_clip": 1.02551126, + "balance_loss_mlp": 0.18001559, + "epoch": 0.8961370810160829, + "flos": 22236605838720.0, + "grad_norm": 4.704703435507026, + "language_loss": 0.85276836, + "learning_rate": 1.12035883275166e-07, + "loss": 0.86730814, + "num_input_tokens_seen": 321483670, + "router_z_loss_clip": 2.22949219, + "router_z_loss_mlp": 0.25817871, + "step": 14905, + "time_per_iteration": 2.6656103134155273 + }, + { + "auxiliary_loss_clip": 0.01218326, + "auxiliary_loss_mlp": 0.00191646, + "balance_loss_clip": 1.0063138, + "balance_loss_mlp": 0.16631427, + "epoch": 0.8961972042687509, + "flos": 23072318645760.0, + "grad_norm": 7.276421528706706, + "language_loss": 0.85066879, + "learning_rate": 1.1190739797183279e-07, + "loss": 0.86476856, + "num_input_tokens_seen": 321501190, + "router_z_loss_clip": 2.12207031, + "router_z_loss_mlp": 0.25305176, + "step": 14906, + "time_per_iteration": 2.6711812019348145 + }, + { + "auxiliary_loss_clip": 0.01234832, + "auxiliary_loss_mlp": 0.00216141, + "balance_loss_clip": 1.01929009, + "balance_loss_mlp": 0.19032043, + "epoch": 0.896257327521419, + "flos": 18185882307840.0, + "grad_norm": 6.8631218638820295, + "language_loss": 0.81299973, + "learning_rate": 1.1177898426515996e-07, + "loss": 0.8275094, + "num_input_tokens_seen": 321518540, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.25817871, + "step": 14907, + "time_per_iteration": 2.6302411556243896 + }, + { + "auxiliary_loss_clip": 0.01249484, + "auxiliary_loss_mlp": 0.00208817, + "balance_loss_clip": 1.03366256, + "balance_loss_mlp": 0.18394998, + "epoch": 0.8963174507740869, + "flos": 17895472848000.0, + "grad_norm": 349.6875715332795, + "language_loss": 0.89244008, + "learning_rate": 1.1165064216001785e-07, + "loss": 0.90702307, + "num_input_tokens_seen": 321536555, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.24853516, + "step": 14908, + "time_per_iteration": 2.702270269393921 + }, + { + "auxiliary_loss_clip": 0.01253661, + "auxiliary_loss_mlp": 0.00207236, + "balance_loss_clip": 1.02725101, + "balance_loss_mlp": 0.18222591, + "epoch": 0.8963775740267549, + "flos": 21032269706880.0, + "grad_norm": 73.121583651027, + "language_loss": 0.79264671, + "learning_rate": 1.1152237166127232e-07, + "loss": 0.80725563, + "num_input_tokens_seen": 321557655, + "router_z_loss_clip": 2.265625, + "router_z_loss_mlp": 0.25024414, + "step": 14909, + "time_per_iteration": 2.742488384246826 + }, + { + "auxiliary_loss_clip": 0.01247296, + "auxiliary_loss_mlp": 0.00235666, + "balance_loss_clip": 1.03345108, + "balance_loss_mlp": 0.20833114, + "epoch": 0.8964376972794228, + "flos": 23179619548800.0, + "grad_norm": 2.38879957370183, + "language_loss": 0.81014156, + "learning_rate": 1.113941727737877e-07, + "loss": 0.8249712, + "num_input_tokens_seen": 321576160, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.2734375, + "step": 14910, + "time_per_iteration": 2.745143175125122 + }, + { + "auxiliary_loss_clip": 0.01237415, + "auxiliary_loss_mlp": 0.00208708, + "balance_loss_clip": 1.02577353, + "balance_loss_mlp": 0.18562931, + "epoch": 0.8964978205320908, + "flos": 24972998814720.0, + "grad_norm": 48.08724786895627, + "language_loss": 0.7113111, + "learning_rate": 1.1126604550242502e-07, + "loss": 0.72577238, + "num_input_tokens_seen": 321596205, + "router_z_loss_clip": 2.11816406, + "router_z_loss_mlp": 0.23083496, + "step": 14911, + "time_per_iteration": 2.6836836338043213 + }, + { + "auxiliary_loss_clip": 0.01235407, + "auxiliary_loss_mlp": 0.00200513, + "balance_loss_clip": 1.02227128, + "balance_loss_mlp": 0.1759437, + "epoch": 0.8965579437847587, + "flos": 19172025273600.0, + "grad_norm": 239.4121125826751, + "language_loss": 0.82412565, + "learning_rate": 1.111379898520437e-07, + "loss": 0.83848488, + "num_input_tokens_seen": 321614800, + "router_z_loss_clip": 2.13378906, + "router_z_loss_mlp": 0.2454834, + "step": 14912, + "time_per_iteration": 2.6898725032806396 + }, + { + "auxiliary_loss_clip": 0.01247873, + "auxiliary_loss_mlp": 0.00209591, + "balance_loss_clip": 1.0268116, + "balance_loss_mlp": 0.1839253, + "epoch": 0.8966180670374267, + "flos": 24276690691200.0, + "grad_norm": 39.85399785911896, + "language_loss": 0.88258862, + "learning_rate": 1.1101000582749876e-07, + "loss": 0.89716327, + "num_input_tokens_seen": 321633445, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.2565918, + "step": 14913, + "time_per_iteration": 2.6727206707000732 + }, + { + "auxiliary_loss_clip": 0.01254826, + "auxiliary_loss_mlp": 0.00206993, + "balance_loss_clip": 1.03061688, + "balance_loss_mlp": 0.1821855, + "epoch": 0.8966781902900947, + "flos": 13553190622080.0, + "grad_norm": 677.1273277128881, + "language_loss": 0.7282145, + "learning_rate": 1.1088209343364407e-07, + "loss": 0.74283266, + "num_input_tokens_seen": 321650890, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.24804688, + "step": 14914, + "time_per_iteration": 4.046157360076904 + }, + { + "auxiliary_loss_clip": 0.01129885, + "auxiliary_loss_mlp": 0.00104202, + "balance_loss_clip": 0.98933768, + "balance_loss_mlp": 0.09733513, + "epoch": 0.8967383135427627, + "flos": 65066114223360.0, + "grad_norm": 0.6910881396979742, + "language_loss": 0.54442871, + "learning_rate": 1.1075425267532956e-07, + "loss": 0.55676961, + "num_input_tokens_seen": 321710960, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.06884766, + "step": 14915, + "time_per_iteration": 4.5932536125183105 + }, + { + "auxiliary_loss_clip": 0.01240723, + "auxiliary_loss_mlp": 0.00193091, + "balance_loss_clip": 1.0265224, + "balance_loss_mlp": 0.1694157, + "epoch": 0.8967984367954306, + "flos": 29713027317120.0, + "grad_norm": 11.047714427941733, + "language_loss": 0.76387024, + "learning_rate": 1.1062648355740289e-07, + "loss": 0.77820837, + "num_input_tokens_seen": 321733290, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.23657227, + "step": 14916, + "time_per_iteration": 2.7729239463806152 + }, + { + "auxiliary_loss_clip": 0.0124317, + "auxiliary_loss_mlp": 0.0021983, + "balance_loss_clip": 1.0253278, + "balance_loss_mlp": 0.19421148, + "epoch": 0.8968585600480986, + "flos": 25702488126720.0, + "grad_norm": 77.62803926768352, + "language_loss": 0.82934314, + "learning_rate": 1.1049878608470931e-07, + "loss": 0.84397316, + "num_input_tokens_seen": 321753120, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.25610352, + "step": 14917, + "time_per_iteration": 2.7189583778381348 + }, + { + "auxiliary_loss_clip": 0.01273607, + "auxiliary_loss_mlp": 0.00214446, + "balance_loss_clip": 1.04546523, + "balance_loss_mlp": 0.18880436, + "epoch": 0.8969186833007665, + "flos": 30044698525440.0, + "grad_norm": 17.299562947711497, + "language_loss": 0.78603637, + "learning_rate": 1.1037116026209137e-07, + "loss": 0.80091691, + "num_input_tokens_seen": 321772840, + "router_z_loss_clip": 2.28320312, + "router_z_loss_mlp": 0.25671387, + "step": 14918, + "time_per_iteration": 2.738684892654419 + }, + { + "auxiliary_loss_clip": 0.01235112, + "auxiliary_loss_mlp": 0.0019697, + "balance_loss_clip": 1.01912069, + "balance_loss_mlp": 0.17263922, + "epoch": 0.8969788065534345, + "flos": 22818143030400.0, + "grad_norm": 5.021255303331505, + "language_loss": 0.91791582, + "learning_rate": 1.102436060943881e-07, + "loss": 0.93223661, + "num_input_tokens_seen": 321791020, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.24304199, + "step": 14919, + "time_per_iteration": 2.7326104640960693 + }, + { + "auxiliary_loss_clip": 0.01248667, + "auxiliary_loss_mlp": 0.00215958, + "balance_loss_clip": 1.02693725, + "balance_loss_mlp": 0.19017296, + "epoch": 0.8970389298061026, + "flos": 13261488272640.0, + "grad_norm": 268.3894563915832, + "language_loss": 0.83353704, + "learning_rate": 1.1011612358643696e-07, + "loss": 0.84818327, + "num_input_tokens_seen": 321810075, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.25756836, + "step": 14920, + "time_per_iteration": 2.663797616958618 + }, + { + "auxiliary_loss_clip": 0.01280855, + "auxiliary_loss_mlp": 0.00219723, + "balance_loss_clip": 1.05180907, + "balance_loss_mlp": 0.19268647, + "epoch": 0.8970990530587705, + "flos": 10266071345280.0, + "grad_norm": 65.26910504402719, + "language_loss": 1.02947497, + "learning_rate": 1.0998871274307164e-07, + "loss": 1.0444808, + "num_input_tokens_seen": 321822635, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.27050781, + "step": 14921, + "time_per_iteration": 4.0310893058776855 + }, + { + "auxiliary_loss_clip": 0.01244157, + "auxiliary_loss_mlp": 0.00219192, + "balance_loss_clip": 1.02621126, + "balance_loss_mlp": 0.19396758, + "epoch": 0.8971591763114385, + "flos": 20302708567680.0, + "grad_norm": 56.202299483909805, + "language_loss": 0.82127792, + "learning_rate": 1.0986137356912384e-07, + "loss": 0.83591139, + "num_input_tokens_seen": 321841130, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.25268555, + "step": 14922, + "time_per_iteration": 2.670257091522217 + }, + { + "auxiliary_loss_clip": 0.01224959, + "auxiliary_loss_mlp": 0.00202533, + "balance_loss_clip": 1.01024175, + "balance_loss_mlp": 0.17739178, + "epoch": 0.8972192995641064, + "flos": 23257043314560.0, + "grad_norm": 32.60584575256128, + "language_loss": 0.79709589, + "learning_rate": 1.097341060694219e-07, + "loss": 0.81137085, + "num_input_tokens_seen": 321859855, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.25146484, + "step": 14923, + "time_per_iteration": 2.7233080863952637 + }, + { + "auxiliary_loss_clip": 0.01244624, + "auxiliary_loss_mlp": 0.00230823, + "balance_loss_clip": 1.02233529, + "balance_loss_mlp": 0.20514517, + "epoch": 0.8972794228167744, + "flos": 18369601395840.0, + "grad_norm": 5.062318966090202, + "language_loss": 0.8253721, + "learning_rate": 1.0960691024879221e-07, + "loss": 0.84012657, + "num_input_tokens_seen": 321877990, + "router_z_loss_clip": 2.22363281, + "router_z_loss_mlp": 0.25695801, + "step": 14924, + "time_per_iteration": 2.6173830032348633 + }, + { + "auxiliary_loss_clip": 0.01233371, + "auxiliary_loss_mlp": 0.00207756, + "balance_loss_clip": 1.01943994, + "balance_loss_mlp": 0.1838906, + "epoch": 0.8973395460694423, + "flos": 23952058548480.0, + "grad_norm": 1.868105357906014, + "language_loss": 0.78902251, + "learning_rate": 1.0947978611205844e-07, + "loss": 0.80343378, + "num_input_tokens_seen": 321898120, + "router_z_loss_clip": 2.14160156, + "router_z_loss_mlp": 0.2388916, + "step": 14925, + "time_per_iteration": 2.787172317504883 + }, + { + "auxiliary_loss_clip": 0.01247305, + "auxiliary_loss_mlp": 0.00206331, + "balance_loss_clip": 1.03084016, + "balance_loss_mlp": 0.1815477, + "epoch": 0.8973996693221103, + "flos": 24970843998720.0, + "grad_norm": 6.751678155647724, + "language_loss": 0.89512944, + "learning_rate": 1.0935273366404008e-07, + "loss": 0.90966576, + "num_input_tokens_seen": 321918140, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.24768066, + "step": 14926, + "time_per_iteration": 4.121737241744995 + }, + { + "auxiliary_loss_clip": 0.01247271, + "auxiliary_loss_mlp": 0.00204423, + "balance_loss_clip": 1.02998042, + "balance_loss_mlp": 0.17962779, + "epoch": 0.8974597925747783, + "flos": 25738937452800.0, + "grad_norm": 3.621688418629379, + "language_loss": 0.83336234, + "learning_rate": 1.092257529095555e-07, + "loss": 0.84787929, + "num_input_tokens_seen": 321938580, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.2479248, + "step": 14927, + "time_per_iteration": 2.6818904876708984 + }, + { + "auxiliary_loss_clip": 0.01240776, + "auxiliary_loss_mlp": 0.00199696, + "balance_loss_clip": 1.02268779, + "balance_loss_mlp": 0.17554426, + "epoch": 0.8975199158274463, + "flos": 38071918131840.0, + "grad_norm": 17.366552440849855, + "language_loss": 0.74233413, + "learning_rate": 1.0909884385341994e-07, + "loss": 0.75673878, + "num_input_tokens_seen": 321961135, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.24145508, + "step": 14928, + "time_per_iteration": 2.834700107574463 + }, + { + "auxiliary_loss_clip": 0.01243823, + "auxiliary_loss_mlp": 0.00211023, + "balance_loss_clip": 1.02016735, + "balance_loss_mlp": 0.18370037, + "epoch": 0.8975800390801142, + "flos": 25411683617280.0, + "grad_norm": 92.9321280913213, + "language_loss": 0.78432691, + "learning_rate": 1.0897200650044602e-07, + "loss": 0.79887539, + "num_input_tokens_seen": 321980945, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.27282715, + "step": 14929, + "time_per_iteration": 2.724102258682251 + }, + { + "auxiliary_loss_clip": 0.0125347, + "auxiliary_loss_mlp": 0.00203917, + "balance_loss_clip": 1.03059876, + "balance_loss_mlp": 0.17934775, + "epoch": 0.8976401623327822, + "flos": 21759604202880.0, + "grad_norm": 18.223368721477254, + "language_loss": 0.75322521, + "learning_rate": 1.0884524085544256e-07, + "loss": 0.76779908, + "num_input_tokens_seen": 322000350, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.24560547, + "step": 14930, + "time_per_iteration": 2.7955880165100098 + }, + { + "auxiliary_loss_clip": 0.01233709, + "auxiliary_loss_mlp": 0.00210492, + "balance_loss_clip": 1.02059436, + "balance_loss_mlp": 0.18601838, + "epoch": 0.8977002855854501, + "flos": 13845323934720.0, + "grad_norm": 13.858817171283862, + "language_loss": 0.84161973, + "learning_rate": 1.0871854692321769e-07, + "loss": 0.85606176, + "num_input_tokens_seen": 322018980, + "router_z_loss_clip": 2.13183594, + "router_z_loss_mlp": 0.24475098, + "step": 14931, + "time_per_iteration": 2.649012565612793 + }, + { + "auxiliary_loss_clip": 0.01239969, + "auxiliary_loss_mlp": 0.00239012, + "balance_loss_clip": 1.02622283, + "balance_loss_mlp": 0.21487188, + "epoch": 0.8977604088381181, + "flos": 19427529692160.0, + "grad_norm": 14.433160476951599, + "language_loss": 0.70812166, + "learning_rate": 1.0859192470857492e-07, + "loss": 0.72291148, + "num_input_tokens_seen": 322037675, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.24121094, + "step": 14932, + "time_per_iteration": 2.6979901790618896 + }, + { + "auxiliary_loss_clip": 0.01226469, + "auxiliary_loss_mlp": 0.00200339, + "balance_loss_clip": 1.01715732, + "balance_loss_mlp": 0.17622329, + "epoch": 0.8978205320907862, + "flos": 22742083981440.0, + "grad_norm": 36.215801962970815, + "language_loss": 0.79690289, + "learning_rate": 1.0846537421631552e-07, + "loss": 0.81117094, + "num_input_tokens_seen": 322055130, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.2409668, + "step": 14933, + "time_per_iteration": 2.884436845779419 + }, + { + "auxiliary_loss_clip": 0.01250491, + "auxiliary_loss_mlp": 0.00239473, + "balance_loss_clip": 1.03080475, + "balance_loss_mlp": 0.21321163, + "epoch": 0.8978806553434541, + "flos": 21360529123200.0, + "grad_norm": 43.79648961753964, + "language_loss": 0.8114295, + "learning_rate": 1.0833889545123898e-07, + "loss": 0.82632917, + "num_input_tokens_seen": 322074850, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.26269531, + "step": 14934, + "time_per_iteration": 2.7523908615112305 + }, + { + "auxiliary_loss_clip": 0.01248054, + "auxiliary_loss_mlp": 0.0020361, + "balance_loss_clip": 1.02975976, + "balance_loss_mlp": 0.17858809, + "epoch": 0.8979407785961221, + "flos": 20924178704640.0, + "grad_norm": 172.50392258456108, + "language_loss": 0.69241166, + "learning_rate": 1.0821248841814123e-07, + "loss": 0.70692837, + "num_input_tokens_seen": 322093315, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.25048828, + "step": 14935, + "time_per_iteration": 2.7395992279052734 + }, + { + "auxiliary_loss_clip": 0.01247748, + "auxiliary_loss_mlp": 0.002071, + "balance_loss_clip": 1.03126669, + "balance_loss_mlp": 0.18226869, + "epoch": 0.89800090184879, + "flos": 25228934196480.0, + "grad_norm": 12.280423408814, + "language_loss": 0.86964774, + "learning_rate": 1.0808615312181512e-07, + "loss": 0.88419622, + "num_input_tokens_seen": 322112555, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.24853516, + "step": 14936, + "time_per_iteration": 2.722834587097168 + }, + { + "auxiliary_loss_clip": 0.01235056, + "auxiliary_loss_mlp": 0.00216379, + "balance_loss_clip": 1.0201261, + "balance_loss_mlp": 0.19161928, + "epoch": 0.898061025101458, + "flos": 22562674525440.0, + "grad_norm": 87.77459027009446, + "language_loss": 0.81259215, + "learning_rate": 1.0795988956705193e-07, + "loss": 0.82710648, + "num_input_tokens_seen": 322130440, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.24755859, + "step": 14937, + "time_per_iteration": 2.725440263748169 + }, + { + "auxiliary_loss_clip": 0.01137467, + "auxiliary_loss_mlp": 0.00084904, + "balance_loss_clip": 0.9967854, + "balance_loss_mlp": 0.07636867, + "epoch": 0.8981211483541259, + "flos": 56192551384320.0, + "grad_norm": 0.8211870962886032, + "language_loss": 0.62918413, + "learning_rate": 1.0783369775863915e-07, + "loss": 0.64140785, + "num_input_tokens_seen": 322187295, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.08544922, + "step": 14938, + "time_per_iteration": 3.0692741870880127 + }, + { + "auxiliary_loss_clip": 0.01231197, + "auxiliary_loss_mlp": 0.00213722, + "balance_loss_clip": 1.01905966, + "balance_loss_mlp": 0.18993935, + "epoch": 0.898181271606794, + "flos": 16392718523520.0, + "grad_norm": 1281.2972670911786, + "language_loss": 0.87422413, + "learning_rate": 1.0770757770136251e-07, + "loss": 0.88867331, + "num_input_tokens_seen": 322202965, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.23754883, + "step": 14939, + "time_per_iteration": 2.7186875343322754 + }, + { + "auxiliary_loss_clip": 0.01146299, + "auxiliary_loss_mlp": 0.00091835, + "balance_loss_clip": 1.00304806, + "balance_loss_mlp": 0.08396759, + "epoch": 0.8982413948594619, + "flos": 63440259989760.0, + "grad_norm": 0.700137245070063, + "language_loss": 0.52256083, + "learning_rate": 1.0758152940000375e-07, + "loss": 0.53494215, + "num_input_tokens_seen": 322269490, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.07861328, + "step": 14940, + "time_per_iteration": 3.26021671295166 + }, + { + "auxiliary_loss_clip": 0.01260193, + "auxiliary_loss_mlp": 0.00228671, + "balance_loss_clip": 1.03309739, + "balance_loss_mlp": 0.19975102, + "epoch": 0.8983015181121299, + "flos": 21835340029440.0, + "grad_norm": 6.46360948043896, + "language_loss": 0.88220465, + "learning_rate": 1.0745555285934327e-07, + "loss": 0.8970933, + "num_input_tokens_seen": 322288060, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.28930664, + "step": 14941, + "time_per_iteration": 2.6849465370178223 + }, + { + "auxiliary_loss_clip": 0.01251025, + "auxiliary_loss_mlp": 0.00207093, + "balance_loss_clip": 1.03518128, + "balance_loss_mlp": 0.18209529, + "epoch": 0.8983616413647978, + "flos": 28949961767040.0, + "grad_norm": 20.65981664697821, + "language_loss": 0.81697428, + "learning_rate": 1.0732964808415834e-07, + "loss": 0.83155549, + "num_input_tokens_seen": 322307930, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.25, + "step": 14942, + "time_per_iteration": 2.7346599102020264 + }, + { + "auxiliary_loss_clip": 0.01253703, + "auxiliary_loss_mlp": 0.00225828, + "balance_loss_clip": 1.03357244, + "balance_loss_mlp": 0.19895832, + "epoch": 0.8984217646174658, + "flos": 17785083375360.0, + "grad_norm": 13.98619710078044, + "language_loss": 0.91064227, + "learning_rate": 1.0720381507922205e-07, + "loss": 0.92543763, + "num_input_tokens_seen": 322326155, + "router_z_loss_clip": 2.19824219, + "router_z_loss_mlp": 0.26879883, + "step": 14943, + "time_per_iteration": 2.6412899494171143 + }, + { + "auxiliary_loss_clip": 0.01258633, + "auxiliary_loss_mlp": 0.00229126, + "balance_loss_clip": 1.03419089, + "balance_loss_mlp": 0.20243502, + "epoch": 0.8984818878701337, + "flos": 23404528558080.0, + "grad_norm": 324.571002444918, + "language_loss": 0.78422594, + "learning_rate": 1.0707805384930701e-07, + "loss": 0.7991035, + "num_input_tokens_seen": 322345850, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.26721191, + "step": 14944, + "time_per_iteration": 2.7357442378997803 + }, + { + "auxiliary_loss_clip": 0.01252062, + "auxiliary_loss_mlp": 0.00231042, + "balance_loss_clip": 1.02920866, + "balance_loss_mlp": 0.20404083, + "epoch": 0.8985420111228017, + "flos": 22346061557760.0, + "grad_norm": 63.189301355003245, + "language_loss": 0.86581004, + "learning_rate": 1.0695236439918187e-07, + "loss": 0.88064104, + "num_input_tokens_seen": 322364715, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.27001953, + "step": 14945, + "time_per_iteration": 2.6971487998962402 + }, + { + "auxiliary_loss_clip": 0.01276143, + "auxiliary_loss_mlp": 0.00227949, + "balance_loss_clip": 1.04336965, + "balance_loss_mlp": 0.19902879, + "epoch": 0.8986021343754698, + "flos": 21392776558080.0, + "grad_norm": 50.090817721132986, + "language_loss": 0.86000299, + "learning_rate": 1.0682674673361302e-07, + "loss": 0.87504399, + "num_input_tokens_seen": 322383570, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.28955078, + "step": 14946, + "time_per_iteration": 2.736772298812866 + }, + { + "auxiliary_loss_clip": 0.01240341, + "auxiliary_loss_mlp": 0.00217429, + "balance_loss_clip": 1.02447557, + "balance_loss_mlp": 0.1928007, + "epoch": 0.8986622576281377, + "flos": 21325372686720.0, + "grad_norm": 14.776985114773739, + "language_loss": 0.7191866, + "learning_rate": 1.0670120085736334e-07, + "loss": 0.73376429, + "num_input_tokens_seen": 322401375, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.24633789, + "step": 14947, + "time_per_iteration": 2.681806802749634 + }, + { + "auxiliary_loss_clip": 0.01244099, + "auxiliary_loss_mlp": 0.00206692, + "balance_loss_clip": 1.02717495, + "balance_loss_mlp": 0.1804423, + "epoch": 0.8987223808808057, + "flos": 23988292392960.0, + "grad_norm": 73.39639464959784, + "language_loss": 0.79218835, + "learning_rate": 1.0657572677519411e-07, + "loss": 0.8066963, + "num_input_tokens_seen": 322421890, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.26257324, + "step": 14948, + "time_per_iteration": 2.7832841873168945 + }, + { + "auxiliary_loss_clip": 0.012472, + "auxiliary_loss_mlp": 0.00222209, + "balance_loss_clip": 1.02591908, + "balance_loss_mlp": 0.19519581, + "epoch": 0.8987825041334736, + "flos": 41500956044160.0, + "grad_norm": 40.060983661418824, + "language_loss": 0.82363653, + "learning_rate": 1.0645032449186309e-07, + "loss": 0.83833063, + "num_input_tokens_seen": 322445730, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.26989746, + "step": 14949, + "time_per_iteration": 2.83032488822937 + }, + { + "auxiliary_loss_clip": 0.01273991, + "auxiliary_loss_mlp": 0.00252956, + "balance_loss_clip": 1.04165447, + "balance_loss_mlp": 0.22293895, + "epoch": 0.8988426273861416, + "flos": 27564276844800.0, + "grad_norm": 167.59344068171558, + "language_loss": 0.8353703, + "learning_rate": 1.0632499401212513e-07, + "loss": 0.85063982, + "num_input_tokens_seen": 322464595, + "router_z_loss_clip": 2.32226562, + "router_z_loss_mlp": 0.30029297, + "step": 14950, + "time_per_iteration": 2.7130472660064697 + }, + { + "auxiliary_loss_clip": 0.01249907, + "auxiliary_loss_mlp": 0.00203447, + "balance_loss_clip": 1.03350878, + "balance_loss_mlp": 0.17852066, + "epoch": 0.8989027506388095, + "flos": 17092653920640.0, + "grad_norm": 23.051235351621226, + "language_loss": 0.76185572, + "learning_rate": 1.0619973534073334e-07, + "loss": 0.7763893, + "num_input_tokens_seen": 322483305, + "router_z_loss_clip": 2.15917969, + "router_z_loss_mlp": 0.24926758, + "step": 14951, + "time_per_iteration": 2.67946195602417 + }, + { + "auxiliary_loss_clip": 0.01263989, + "auxiliary_loss_mlp": 0.00222952, + "balance_loss_clip": 1.03887558, + "balance_loss_mlp": 0.19598684, + "epoch": 0.8989628738914776, + "flos": 20555124416640.0, + "grad_norm": 8.361169087087884, + "language_loss": 0.8240701, + "learning_rate": 1.0607454848243769e-07, + "loss": 0.83893949, + "num_input_tokens_seen": 322501905, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.26977539, + "step": 14952, + "time_per_iteration": 2.707854747772217 + }, + { + "auxiliary_loss_clip": 0.01243847, + "auxiliary_loss_mlp": 0.00215298, + "balance_loss_clip": 1.02521133, + "balance_loss_mlp": 0.18750995, + "epoch": 0.8990229971441455, + "flos": 16251087196800.0, + "grad_norm": 41.63146965420085, + "language_loss": 0.69290012, + "learning_rate": 1.0594943344198481e-07, + "loss": 0.70749158, + "num_input_tokens_seen": 322518135, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.2779541, + "step": 14953, + "time_per_iteration": 2.7253971099853516 + }, + { + "auxiliary_loss_clip": 0.01247771, + "auxiliary_loss_mlp": 0.00222013, + "balance_loss_clip": 1.02739906, + "balance_loss_mlp": 0.19662173, + "epoch": 0.8990831203968135, + "flos": 21981316901760.0, + "grad_norm": 6.80418576062488, + "language_loss": 0.90877491, + "learning_rate": 1.0582439022411915e-07, + "loss": 0.92347276, + "num_input_tokens_seen": 322537905, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.25366211, + "step": 14954, + "time_per_iteration": 2.728855848312378 + }, + { + "auxiliary_loss_clip": 0.01242923, + "auxiliary_loss_mlp": 0.00210529, + "balance_loss_clip": 1.02753043, + "balance_loss_mlp": 0.18729469, + "epoch": 0.8991432436494814, + "flos": 27447171528960.0, + "grad_norm": 51.0196201556688, + "language_loss": 0.69048452, + "learning_rate": 1.0569941883358224e-07, + "loss": 0.70501906, + "num_input_tokens_seen": 322557945, + "router_z_loss_clip": 2.15722656, + "router_z_loss_mlp": 0.23266602, + "step": 14955, + "time_per_iteration": 2.749107599258423 + }, + { + "auxiliary_loss_clip": 0.0124407, + "auxiliary_loss_mlp": 0.00207663, + "balance_loss_clip": 1.03125906, + "balance_loss_mlp": 0.18448859, + "epoch": 0.8992033669021494, + "flos": 21579835610880.0, + "grad_norm": 1194.1623971174467, + "language_loss": 0.62840426, + "learning_rate": 1.0557451927511341e-07, + "loss": 0.64292163, + "num_input_tokens_seen": 322575765, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.23181152, + "step": 14956, + "time_per_iteration": 4.052279949188232 + }, + { + "auxiliary_loss_clip": 0.01241129, + "auxiliary_loss_mlp": 0.00219022, + "balance_loss_clip": 1.02749491, + "balance_loss_mlp": 0.19438177, + "epoch": 0.8992634901548173, + "flos": 28584211530240.0, + "grad_norm": 27.52854732823561, + "language_loss": 0.87340331, + "learning_rate": 1.0544969155344863e-07, + "loss": 0.88800478, + "num_input_tokens_seen": 322595665, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.24658203, + "step": 14957, + "time_per_iteration": 4.257422685623169 + }, + { + "auxiliary_loss_clip": 0.01256411, + "auxiliary_loss_mlp": 0.00230246, + "balance_loss_clip": 1.03139627, + "balance_loss_mlp": 0.20428193, + "epoch": 0.8993236134074853, + "flos": 19867435557120.0, + "grad_norm": 2.861413126913298, + "language_loss": 0.86749423, + "learning_rate": 1.0532493567332123e-07, + "loss": 0.88236082, + "num_input_tokens_seen": 322614755, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.25952148, + "step": 14958, + "time_per_iteration": 2.7474870681762695 + }, + { + "auxiliary_loss_clip": 0.01235628, + "auxiliary_loss_mlp": 0.00201742, + "balance_loss_clip": 1.02549732, + "balance_loss_mlp": 0.17739901, + "epoch": 0.8993837366601534, + "flos": 19390649402880.0, + "grad_norm": 874.5527326568764, + "language_loss": 0.82959729, + "learning_rate": 1.0520025163946277e-07, + "loss": 0.84397095, + "num_input_tokens_seen": 322633425, + "router_z_loss_clip": 2.10058594, + "router_z_loss_mlp": 0.24353027, + "step": 14959, + "time_per_iteration": 2.643650531768799 + }, + { + "auxiliary_loss_clip": 0.01228531, + "auxiliary_loss_mlp": 0.00211901, + "balance_loss_clip": 1.01512825, + "balance_loss_mlp": 0.18759465, + "epoch": 0.8994438599128213, + "flos": 18551740285440.0, + "grad_norm": 33.49251383292649, + "language_loss": 0.79342508, + "learning_rate": 1.0507563945660015e-07, + "loss": 0.80782938, + "num_input_tokens_seen": 322652065, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.24304199, + "step": 14960, + "time_per_iteration": 2.8261499404907227 + }, + { + "auxiliary_loss_clip": 0.01247549, + "auxiliary_loss_mlp": 0.00214777, + "balance_loss_clip": 1.02898049, + "balance_loss_mlp": 0.19029126, + "epoch": 0.8995039831654893, + "flos": 24427587726720.0, + "grad_norm": 34.942523289377846, + "language_loss": 0.72275305, + "learning_rate": 1.049510991294591e-07, + "loss": 0.73737627, + "num_input_tokens_seen": 322673275, + "router_z_loss_clip": 2.18652344, + "router_z_loss_mlp": 0.24499512, + "step": 14961, + "time_per_iteration": 2.770008087158203 + }, + { + "auxiliary_loss_clip": 0.01223343, + "auxiliary_loss_mlp": 0.00225923, + "balance_loss_clip": 1.0132761, + "balance_loss_mlp": 0.20262982, + "epoch": 0.8995641064181572, + "flos": 21251324799360.0, + "grad_norm": 3.1274646550465204, + "language_loss": 0.88090205, + "learning_rate": 1.0482663066276254e-07, + "loss": 0.89539468, + "num_input_tokens_seen": 322693375, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.2331543, + "step": 14962, + "time_per_iteration": 2.6767005920410156 + }, + { + "auxiliary_loss_clip": 0.01268258, + "auxiliary_loss_mlp": 0.00217217, + "balance_loss_clip": 1.04023874, + "balance_loss_mlp": 0.19004926, + "epoch": 0.8996242296708252, + "flos": 23513661054720.0, + "grad_norm": 3.047560081478307, + "language_loss": 0.83801186, + "learning_rate": 1.047022340612298e-07, + "loss": 0.85286659, + "num_input_tokens_seen": 322712615, + "router_z_loss_clip": 2.28320312, + "router_z_loss_mlp": 0.27209473, + "step": 14963, + "time_per_iteration": 4.177760601043701 + }, + { + "auxiliary_loss_clip": 0.01146478, + "auxiliary_loss_mlp": 0.00100254, + "balance_loss_clip": 1.00604069, + "balance_loss_mlp": 0.0900974, + "epoch": 0.8996843529234931, + "flos": 62403230430720.0, + "grad_norm": 0.7536273048986821, + "language_loss": 0.56641686, + "learning_rate": 1.0457790932957867e-07, + "loss": 0.57888424, + "num_input_tokens_seen": 322766855, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.1015625, + "step": 14964, + "time_per_iteration": 2.9920525550842285 + }, + { + "auxiliary_loss_clip": 0.01293745, + "auxiliary_loss_mlp": 0.00227775, + "balance_loss_clip": 1.05464208, + "balance_loss_mlp": 0.2003208, + "epoch": 0.8997444761761612, + "flos": 24236829573120.0, + "grad_norm": 15.72876555013831, + "language_loss": 0.82030475, + "learning_rate": 1.0445365647252269e-07, + "loss": 0.83551991, + "num_input_tokens_seen": 322781130, + "router_z_loss_clip": 2.390625, + "router_z_loss_mlp": 0.27453613, + "step": 14965, + "time_per_iteration": 2.68679141998291 + }, + { + "auxiliary_loss_clip": 0.01252561, + "auxiliary_loss_mlp": 0.00195397, + "balance_loss_clip": 1.03033555, + "balance_loss_mlp": 0.17134012, + "epoch": 0.8998045994288291, + "flos": 21361103740800.0, + "grad_norm": 435.17392338682373, + "language_loss": 0.79898834, + "learning_rate": 1.0432947549477433e-07, + "loss": 0.81346798, + "num_input_tokens_seen": 322800310, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.24060059, + "step": 14966, + "time_per_iteration": 2.6923253536224365 + }, + { + "auxiliary_loss_clip": 0.01241951, + "auxiliary_loss_mlp": 0.0022636, + "balance_loss_clip": 1.02683759, + "balance_loss_mlp": 0.19920461, + "epoch": 0.8998647226814971, + "flos": 28986159697920.0, + "grad_norm": 11.711113219074862, + "language_loss": 0.80172276, + "learning_rate": 1.0420536640104205e-07, + "loss": 0.81640589, + "num_input_tokens_seen": 322820955, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.27172852, + "step": 14967, + "time_per_iteration": 2.715650796890259 + }, + { + "auxiliary_loss_clip": 0.012261, + "auxiliary_loss_mlp": 0.00206238, + "balance_loss_clip": 1.01155901, + "balance_loss_mlp": 0.18170491, + "epoch": 0.899924845934165, + "flos": 13625909706240.0, + "grad_norm": 3.1484606151643733, + "language_loss": 0.82622993, + "learning_rate": 1.040813291960323e-07, + "loss": 0.84055334, + "num_input_tokens_seen": 322838780, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.24536133, + "step": 14968, + "time_per_iteration": 3.988271474838257 + }, + { + "auxiliary_loss_clip": 0.0123975, + "auxiliary_loss_mlp": 0.00222064, + "balance_loss_clip": 1.02459717, + "balance_loss_mlp": 0.19545677, + "epoch": 0.899984969186833, + "flos": 20882629647360.0, + "grad_norm": 34.791924160654126, + "language_loss": 0.78474104, + "learning_rate": 1.0395736388444864e-07, + "loss": 0.7993592, + "num_input_tokens_seen": 322856710, + "router_z_loss_clip": 2.15136719, + "router_z_loss_mlp": 0.26586914, + "step": 14969, + "time_per_iteration": 2.6075313091278076 + }, + { + "auxiliary_loss_clip": 0.01248789, + "auxiliary_loss_mlp": 0.00221236, + "balance_loss_clip": 1.02465081, + "balance_loss_mlp": 0.19524834, + "epoch": 0.9000450924395009, + "flos": 20921808407040.0, + "grad_norm": 7.344813267680482, + "language_loss": 0.86877692, + "learning_rate": 1.0383347047099201e-07, + "loss": 0.88347709, + "num_input_tokens_seen": 322876070, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.26025391, + "step": 14970, + "time_per_iteration": 2.78420352935791 + }, + { + "auxiliary_loss_clip": 0.01245624, + "auxiliary_loss_mlp": 0.00193198, + "balance_loss_clip": 1.02533197, + "balance_loss_mlp": 0.16868848, + "epoch": 0.900105215692169, + "flos": 17165049782400.0, + "grad_norm": 284.53999455683527, + "language_loss": 0.8279804, + "learning_rate": 1.0370964896035972e-07, + "loss": 0.8423686, + "num_input_tokens_seen": 322895095, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.24523926, + "step": 14971, + "time_per_iteration": 2.737246513366699 + }, + { + "auxiliary_loss_clip": 0.01256025, + "auxiliary_loss_mlp": 0.00216024, + "balance_loss_clip": 1.02922845, + "balance_loss_mlp": 0.19040546, + "epoch": 0.900165338944837, + "flos": 19931930426880.0, + "grad_norm": 89.09426343475955, + "language_loss": 0.9284265, + "learning_rate": 1.035858993572476e-07, + "loss": 0.94314706, + "num_input_tokens_seen": 322911845, + "router_z_loss_clip": 2.265625, + "router_z_loss_mlp": 0.25622559, + "step": 14972, + "time_per_iteration": 2.7411112785339355 + }, + { + "auxiliary_loss_clip": 0.01252405, + "auxiliary_loss_mlp": 0.00241899, + "balance_loss_clip": 1.0286355, + "balance_loss_mlp": 0.21561375, + "epoch": 0.9002254621975049, + "flos": 16107085572480.0, + "grad_norm": 6.270421693920907, + "language_loss": 0.90324378, + "learning_rate": 1.0346222166634855e-07, + "loss": 0.91818684, + "num_input_tokens_seen": 322928170, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.26293945, + "step": 14973, + "time_per_iteration": 2.720637559890747 + }, + { + "auxiliary_loss_clip": 0.01236139, + "auxiliary_loss_mlp": 0.00215326, + "balance_loss_clip": 1.01683497, + "balance_loss_mlp": 0.19036314, + "epoch": 0.9002855854501729, + "flos": 28476120528000.0, + "grad_norm": 3.8680362197281375, + "language_loss": 0.66735727, + "learning_rate": 1.0333861589235193e-07, + "loss": 0.68187189, + "num_input_tokens_seen": 322948165, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.24987793, + "step": 14974, + "time_per_iteration": 2.7946276664733887 + }, + { + "auxiliary_loss_clip": 0.01263999, + "auxiliary_loss_mlp": 0.00216925, + "balance_loss_clip": 1.0406996, + "balance_loss_mlp": 0.19149731, + "epoch": 0.9003457087028408, + "flos": 25630307746560.0, + "grad_norm": 3989.179351830267, + "language_loss": 0.69692379, + "learning_rate": 1.0321508203994489e-07, + "loss": 0.71173298, + "num_input_tokens_seen": 322968880, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.25427246, + "step": 14975, + "time_per_iteration": 2.831637382507324 + }, + { + "auxiliary_loss_clip": 0.01245793, + "auxiliary_loss_mlp": 0.00192163, + "balance_loss_clip": 1.02908397, + "balance_loss_mlp": 0.16634245, + "epoch": 0.9004058319555088, + "flos": 24389414547840.0, + "grad_norm": 95.32687079638082, + "language_loss": 0.81172788, + "learning_rate": 1.0309162011381257e-07, + "loss": 0.82610744, + "num_input_tokens_seen": 322989395, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.25805664, + "step": 14976, + "time_per_iteration": 2.7177388668060303 + }, + { + "auxiliary_loss_clip": 0.01240682, + "auxiliary_loss_mlp": 0.00214049, + "balance_loss_clip": 1.0276649, + "balance_loss_mlp": 0.18953922, + "epoch": 0.9004659552081767, + "flos": 29059345658880.0, + "grad_norm": 70.3143908033979, + "language_loss": 0.7604627, + "learning_rate": 1.0296823011863565e-07, + "loss": 0.77500999, + "num_input_tokens_seen": 323009060, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.24487305, + "step": 14977, + "time_per_iteration": 2.7912211418151855 + }, + { + "auxiliary_loss_clip": 0.01251024, + "auxiliary_loss_mlp": 0.00214199, + "balance_loss_clip": 1.03062081, + "balance_loss_mlp": 0.18920061, + "epoch": 0.9005260784608448, + "flos": 16763855800320.0, + "grad_norm": 6.3485233288736005, + "language_loss": 0.80022693, + "learning_rate": 1.0284491205909351e-07, + "loss": 0.81487918, + "num_input_tokens_seen": 323027530, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.25012207, + "step": 14978, + "time_per_iteration": 2.6954894065856934 + }, + { + "auxiliary_loss_clip": 0.01265534, + "auxiliary_loss_mlp": 0.00209336, + "balance_loss_clip": 1.03948212, + "balance_loss_mlp": 0.18381356, + "epoch": 0.9005862017135127, + "flos": 20376002269440.0, + "grad_norm": 3.1716216884038717, + "language_loss": 0.88810337, + "learning_rate": 1.0272166593986286e-07, + "loss": 0.90285212, + "num_input_tokens_seen": 323045370, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.25524902, + "step": 14979, + "time_per_iteration": 2.6934101581573486 + }, + { + "auxiliary_loss_clip": 0.01135075, + "auxiliary_loss_mlp": 0.00161343, + "balance_loss_clip": 0.99371183, + "balance_loss_mlp": 0.15123378, + "epoch": 0.9006463249661807, + "flos": 67580255796480.0, + "grad_norm": 0.8520949452415648, + "language_loss": 0.52137387, + "learning_rate": 1.0259849176561642e-07, + "loss": 0.53433812, + "num_input_tokens_seen": 323105660, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.10107422, + "step": 14980, + "time_per_iteration": 3.2253005504608154 + }, + { + "auxiliary_loss_clip": 0.01259691, + "auxiliary_loss_mlp": 0.00215841, + "balance_loss_clip": 1.03542447, + "balance_loss_mlp": 0.18782638, + "epoch": 0.9007064482188486, + "flos": 28293335193600.0, + "grad_norm": 401.9227631423909, + "language_loss": 0.90689969, + "learning_rate": 1.0247538954102553e-07, + "loss": 0.92165506, + "num_input_tokens_seen": 323126365, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.2800293, + "step": 14981, + "time_per_iteration": 2.77797794342041 + }, + { + "auxiliary_loss_clip": 0.01235473, + "auxiliary_loss_mlp": 0.00216084, + "balance_loss_clip": 1.01986003, + "balance_loss_mlp": 0.1907759, + "epoch": 0.9007665714715166, + "flos": 21616320850560.0, + "grad_norm": 108.65820250838966, + "language_loss": 0.885566, + "learning_rate": 1.0235235927075758e-07, + "loss": 0.90008163, + "num_input_tokens_seen": 323145655, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.25305176, + "step": 14982, + "time_per_iteration": 2.7323219776153564 + }, + { + "auxiliary_loss_clip": 0.0124407, + "auxiliary_loss_mlp": 0.00209516, + "balance_loss_clip": 1.02718997, + "balance_loss_mlp": 0.18563813, + "epoch": 0.9008266947241845, + "flos": 26541864120960.0, + "grad_norm": 3.4063121311916356, + "language_loss": 0.7767309, + "learning_rate": 1.0222940095947885e-07, + "loss": 0.7912668, + "num_input_tokens_seen": 323164540, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.23876953, + "step": 14983, + "time_per_iteration": 2.701016664505005 + }, + { + "auxiliary_loss_clip": 0.01235756, + "auxiliary_loss_mlp": 0.00221991, + "balance_loss_clip": 1.02026498, + "balance_loss_mlp": 0.19675469, + "epoch": 0.9008868179768525, + "flos": 23110527738240.0, + "grad_norm": 258.16302489021706, + "language_loss": 0.80706918, + "learning_rate": 1.0210651461185115e-07, + "loss": 0.82164669, + "num_input_tokens_seen": 323186960, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.25244141, + "step": 14984, + "time_per_iteration": 2.726374626159668 + }, + { + "auxiliary_loss_clip": 0.01219603, + "auxiliary_loss_mlp": 0.00198793, + "balance_loss_clip": 1.01146698, + "balance_loss_mlp": 0.17551103, + "epoch": 0.9009469412295206, + "flos": 19060809788160.0, + "grad_norm": 7.621481403019943, + "language_loss": 0.76561058, + "learning_rate": 1.0198370023253456e-07, + "loss": 0.77979451, + "num_input_tokens_seen": 323206135, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.23278809, + "step": 14985, + "time_per_iteration": 2.6914021968841553 + }, + { + "auxiliary_loss_clip": 0.01234466, + "auxiliary_loss_mlp": 0.00215528, + "balance_loss_clip": 1.01728356, + "balance_loss_mlp": 0.19087572, + "epoch": 0.9010070644821885, + "flos": 23222281927680.0, + "grad_norm": 12.885990101630478, + "language_loss": 0.79401976, + "learning_rate": 1.0186095782618643e-07, + "loss": 0.80851972, + "num_input_tokens_seen": 323225980, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.24658203, + "step": 14986, + "time_per_iteration": 2.7460954189300537 + }, + { + "auxiliary_loss_clip": 0.01228712, + "auxiliary_loss_mlp": 0.00208977, + "balance_loss_clip": 1.01319194, + "balance_loss_mlp": 0.18501607, + "epoch": 0.9010671877348565, + "flos": 17384823146880.0, + "grad_norm": 10.092715810862806, + "language_loss": 0.83258355, + "learning_rate": 1.0173828739746104e-07, + "loss": 0.84696043, + "num_input_tokens_seen": 323243700, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.23962402, + "step": 14987, + "time_per_iteration": 2.6339879035949707 + }, + { + "auxiliary_loss_clip": 0.01240797, + "auxiliary_loss_mlp": 0.00240234, + "balance_loss_clip": 1.01660895, + "balance_loss_mlp": 0.21269655, + "epoch": 0.9011273109875244, + "flos": 21908166854400.0, + "grad_norm": 14.769897462053658, + "language_loss": 0.83201468, + "learning_rate": 1.0161568895100981e-07, + "loss": 0.846825, + "num_input_tokens_seen": 323261535, + "router_z_loss_clip": 2.23535156, + "router_z_loss_mlp": 0.27514648, + "step": 14988, + "time_per_iteration": 2.6589250564575195 + }, + { + "auxiliary_loss_clip": 0.01254417, + "auxiliary_loss_mlp": 0.00215225, + "balance_loss_clip": 1.033499, + "balance_loss_mlp": 0.19027415, + "epoch": 0.9011874342401924, + "flos": 24060831909120.0, + "grad_norm": 209.30486048181731, + "language_loss": 0.80368507, + "learning_rate": 1.0149316249148188e-07, + "loss": 0.81838149, + "num_input_tokens_seen": 323281855, + "router_z_loss_clip": 2.21386719, + "router_z_loss_mlp": 0.24951172, + "step": 14989, + "time_per_iteration": 2.6847009658813477 + }, + { + "auxiliary_loss_clip": 0.01247215, + "auxiliary_loss_mlp": 0.00227391, + "balance_loss_clip": 1.02619159, + "balance_loss_mlp": 0.20053352, + "epoch": 0.9012475574928603, + "flos": 16758791982720.0, + "grad_norm": 723.814252877338, + "language_loss": 0.90443444, + "learning_rate": 1.0137070802352376e-07, + "loss": 0.91918051, + "num_input_tokens_seen": 323299505, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.26843262, + "step": 14990, + "time_per_iteration": 2.7019410133361816 + }, + { + "auxiliary_loss_clip": 0.01257715, + "auxiliary_loss_mlp": 0.00234428, + "balance_loss_clip": 1.03443456, + "balance_loss_mlp": 0.20759422, + "epoch": 0.9013076807455284, + "flos": 19971109186560.0, + "grad_norm": 14.414215547132091, + "language_loss": 0.86783975, + "learning_rate": 1.0124832555177842e-07, + "loss": 0.88276118, + "num_input_tokens_seen": 323318365, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.26818848, + "step": 14991, + "time_per_iteration": 2.7014145851135254 + }, + { + "auxiliary_loss_clip": 0.01144953, + "auxiliary_loss_mlp": 0.00099595, + "balance_loss_clip": 1.00031066, + "balance_loss_mlp": 0.09063063, + "epoch": 0.9013678039981963, + "flos": 65180274624000.0, + "grad_norm": 1.3993841061653867, + "language_loss": 0.59377682, + "learning_rate": 1.0112601508088726e-07, + "loss": 0.60622227, + "num_input_tokens_seen": 323371835, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.08984375, + "step": 14992, + "time_per_iteration": 3.072889566421509 + }, + { + "auxiliary_loss_clip": 0.01233509, + "auxiliary_loss_mlp": 0.00201345, + "balance_loss_clip": 1.01843739, + "balance_loss_mlp": 0.1773358, + "epoch": 0.9014279272508643, + "flos": 20521224956160.0, + "grad_norm": 23.281941032500054, + "language_loss": 0.90760767, + "learning_rate": 1.0100377661548764e-07, + "loss": 0.92195618, + "num_input_tokens_seen": 323388495, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.23999023, + "step": 14993, + "time_per_iteration": 2.6695525646209717 + }, + { + "auxiliary_loss_clip": 0.01244423, + "auxiliary_loss_mlp": 0.00219307, + "balance_loss_clip": 1.02775025, + "balance_loss_mlp": 0.19445223, + "epoch": 0.9014880505035322, + "flos": 17309051406720.0, + "grad_norm": 3.8018175954967703, + "language_loss": 0.82005048, + "learning_rate": 1.0088161016021502e-07, + "loss": 0.83468777, + "num_input_tokens_seen": 323405280, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.24841309, + "step": 14994, + "time_per_iteration": 2.6932692527770996 + }, + { + "auxiliary_loss_clip": 0.01233382, + "auxiliary_loss_mlp": 0.00214092, + "balance_loss_clip": 1.02075565, + "balance_loss_mlp": 0.18979757, + "epoch": 0.9015481737562002, + "flos": 28402862739840.0, + "grad_norm": 3.97973985741406, + "language_loss": 0.73189175, + "learning_rate": 1.0075951571970187e-07, + "loss": 0.74636656, + "num_input_tokens_seen": 323425310, + "router_z_loss_clip": 2.12597656, + "router_z_loss_mlp": 0.24267578, + "step": 14995, + "time_per_iteration": 2.763733148574829 + }, + { + "auxiliary_loss_clip": 0.01239586, + "auxiliary_loss_mlp": 0.00213411, + "balance_loss_clip": 1.01792383, + "balance_loss_mlp": 0.18823402, + "epoch": 0.9016082970088681, + "flos": 29752672953600.0, + "grad_norm": 18.84629864485391, + "language_loss": 0.75751364, + "learning_rate": 1.0063749329857873e-07, + "loss": 0.77204359, + "num_input_tokens_seen": 323447805, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.25195312, + "step": 14996, + "time_per_iteration": 2.741243839263916 + }, + { + "auxiliary_loss_clip": 0.01235753, + "auxiliary_loss_mlp": 0.00188115, + "balance_loss_clip": 1.02156007, + "balance_loss_mlp": 0.16463102, + "epoch": 0.9016684202615362, + "flos": 23513230091520.0, + "grad_norm": 10.1944999174036, + "language_loss": 0.75761247, + "learning_rate": 1.0051554290147168e-07, + "loss": 0.77185112, + "num_input_tokens_seen": 323467150, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.23486328, + "step": 14997, + "time_per_iteration": 2.7784781455993652 + }, + { + "auxiliary_loss_clip": 0.01251361, + "auxiliary_loss_mlp": 0.00225079, + "balance_loss_clip": 1.03141117, + "balance_loss_mlp": 0.19938916, + "epoch": 0.9017285435142042, + "flos": 16979247705600.0, + "grad_norm": 24.4270080199845, + "language_loss": 0.84484929, + "learning_rate": 1.0039366453300613e-07, + "loss": 0.85961366, + "num_input_tokens_seen": 323484250, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.25695801, + "step": 14998, + "time_per_iteration": 2.6779465675354004 + }, + { + "auxiliary_loss_clip": 0.01238647, + "auxiliary_loss_mlp": 0.00215491, + "balance_loss_clip": 1.0206393, + "balance_loss_mlp": 0.19009905, + "epoch": 0.9017886667668721, + "flos": 21393351175680.0, + "grad_norm": 16.63805499006334, + "language_loss": 0.83103526, + "learning_rate": 1.0027185819780281e-07, + "loss": 0.84557664, + "num_input_tokens_seen": 323502910, + "router_z_loss_clip": 2.18261719, + "router_z_loss_mlp": 0.25378418, + "step": 14999, + "time_per_iteration": 5.574976921081543 + }, + { + "auxiliary_loss_clip": 0.01258981, + "auxiliary_loss_mlp": 0.00222543, + "balance_loss_clip": 1.03729272, + "balance_loss_mlp": 0.19518444, + "epoch": 0.9018487900195401, + "flos": 20996574566400.0, + "grad_norm": 10.038773970461397, + "language_loss": 0.84175283, + "learning_rate": 1.0015012390048117e-07, + "loss": 0.85656804, + "num_input_tokens_seen": 323521820, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.27331543, + "step": 15000, + "time_per_iteration": 2.642618179321289 + }, + { + "auxiliary_loss_clip": 0.01236722, + "auxiliary_loss_mlp": 0.00212158, + "balance_loss_clip": 1.02321672, + "balance_loss_mlp": 0.18719539, + "epoch": 0.901908913272208, + "flos": 53358443458560.0, + "grad_norm": 58.16988022019603, + "language_loss": 0.89850819, + "learning_rate": 1.0002846164565704e-07, + "loss": 0.91299701, + "num_input_tokens_seen": 323543200, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.24951172, + "step": 15001, + "time_per_iteration": 2.9734010696411133 + }, + { + "auxiliary_loss_clip": 0.01232716, + "auxiliary_loss_mlp": 0.00201111, + "balance_loss_clip": 1.02037251, + "balance_loss_mlp": 0.17892662, + "epoch": 0.901969036524876, + "flos": 22089838867200.0, + "grad_norm": 422.18531102711256, + "language_loss": 0.84160447, + "learning_rate": 9.990687143794407e-08, + "loss": 0.85594279, + "num_input_tokens_seen": 323563075, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.22167969, + "step": 15002, + "time_per_iteration": 2.6874022483825684 + }, + { + "auxiliary_loss_clip": 0.01242063, + "auxiliary_loss_mlp": 0.00194303, + "balance_loss_clip": 1.02581549, + "balance_loss_mlp": 0.16949505, + "epoch": 0.9020291597775439, + "flos": 23835025059840.0, + "grad_norm": 20.664040522650563, + "language_loss": 0.77737236, + "learning_rate": 9.978535328195347e-08, + "loss": 0.79173607, + "num_input_tokens_seen": 323579065, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.24816895, + "step": 15003, + "time_per_iteration": 2.6824169158935547 + }, + { + "auxiliary_loss_clip": 0.0123673, + "auxiliary_loss_mlp": 0.00228464, + "balance_loss_clip": 1.02109659, + "balance_loss_mlp": 0.20456275, + "epoch": 0.902089283030212, + "flos": 18326005263360.0, + "grad_norm": 108.4412815014333, + "language_loss": 0.92965311, + "learning_rate": 9.9663907182292e-08, + "loss": 0.94430506, + "num_input_tokens_seen": 323594835, + "router_z_loss_clip": 2.15527344, + "router_z_loss_mlp": 0.23901367, + "step": 15004, + "time_per_iteration": 2.734975814819336 + }, + { + "auxiliary_loss_clip": 0.01241901, + "auxiliary_loss_mlp": 0.00199595, + "balance_loss_clip": 1.02341461, + "balance_loss_mlp": 0.17454875, + "epoch": 0.9021494062828799, + "flos": 24170359455360.0, + "grad_norm": 47.6089388395283, + "language_loss": 0.8227073, + "learning_rate": 9.954253314356575e-08, + "loss": 0.8371222, + "num_input_tokens_seen": 323611475, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.25048828, + "step": 15005, + "time_per_iteration": 4.190165758132935 + }, + { + "auxiliary_loss_clip": 0.01251231, + "auxiliary_loss_mlp": 0.00228183, + "balance_loss_clip": 1.02950597, + "balance_loss_mlp": 0.20021638, + "epoch": 0.9022095295355479, + "flos": 21616859554560.0, + "grad_norm": 5.438884616713143, + "language_loss": 0.81397831, + "learning_rate": 9.942123117037748e-08, + "loss": 0.82877243, + "num_input_tokens_seen": 323629730, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.2800293, + "step": 15006, + "time_per_iteration": 2.6893153190612793 + }, + { + "auxiliary_loss_clip": 0.01259467, + "auxiliary_loss_mlp": 0.00226935, + "balance_loss_clip": 1.0371778, + "balance_loss_mlp": 0.20203194, + "epoch": 0.9022696527882158, + "flos": 18726229578240.0, + "grad_norm": 57.048777572370646, + "language_loss": 0.92812419, + "learning_rate": 9.930000126732618e-08, + "loss": 0.94298822, + "num_input_tokens_seen": 323646000, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.24902344, + "step": 15007, + "time_per_iteration": 2.6015121936798096 + }, + { + "auxiliary_loss_clip": 0.01239834, + "auxiliary_loss_mlp": 0.00211302, + "balance_loss_clip": 1.02301955, + "balance_loss_mlp": 0.18611346, + "epoch": 0.9023297760408838, + "flos": 26761206522240.0, + "grad_norm": 5933.435773313606, + "language_loss": 0.84724581, + "learning_rate": 9.917884343900928e-08, + "loss": 0.86175716, + "num_input_tokens_seen": 323667250, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.2520752, + "step": 15008, + "time_per_iteration": 2.7353920936584473 + }, + { + "auxiliary_loss_clip": 0.01216398, + "auxiliary_loss_mlp": 0.00202642, + "balance_loss_clip": 1.00735974, + "balance_loss_mlp": 0.1786336, + "epoch": 0.9023898992935517, + "flos": 20522553759360.0, + "grad_norm": 21.35877510816422, + "language_loss": 0.81319511, + "learning_rate": 9.905775769002156e-08, + "loss": 0.82738554, + "num_input_tokens_seen": 323687150, + "router_z_loss_clip": 2.08984375, + "router_z_loss_mlp": 0.2401123, + "step": 15009, + "time_per_iteration": 2.6589205265045166 + }, + { + "auxiliary_loss_clip": 0.0125335, + "auxiliary_loss_mlp": 0.00189278, + "balance_loss_clip": 1.03505707, + "balance_loss_mlp": 0.16246751, + "epoch": 0.9024500225462198, + "flos": 17456644391040.0, + "grad_norm": 12.598554133260299, + "language_loss": 0.82041001, + "learning_rate": 9.893674402495399e-08, + "loss": 0.83483636, + "num_input_tokens_seen": 323703660, + "router_z_loss_clip": 2.18066406, + "router_z_loss_mlp": 0.26818848, + "step": 15010, + "time_per_iteration": 4.252178907394409 + }, + { + "auxiliary_loss_clip": 0.01250605, + "auxiliary_loss_mlp": 0.00201241, + "balance_loss_clip": 1.03203559, + "balance_loss_mlp": 0.17599216, + "epoch": 0.9025101457988878, + "flos": 20813609664000.0, + "grad_norm": 26.24036777057758, + "language_loss": 0.83770376, + "learning_rate": 9.881580244839538e-08, + "loss": 0.8522222, + "num_input_tokens_seen": 323722060, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.25219727, + "step": 15011, + "time_per_iteration": 2.6820366382598877 + }, + { + "auxiliary_loss_clip": 0.01254051, + "auxiliary_loss_mlp": 0.00220471, + "balance_loss_clip": 1.03369093, + "balance_loss_mlp": 0.19622347, + "epoch": 0.9025702690515557, + "flos": 19026371623680.0, + "grad_norm": 44.21652215068953, + "language_loss": 0.8581804, + "learning_rate": 9.869493296493204e-08, + "loss": 0.8729257, + "num_input_tokens_seen": 323740645, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.2421875, + "step": 15012, + "time_per_iteration": 2.624124526977539 + }, + { + "auxiliary_loss_clip": 0.01226091, + "auxiliary_loss_mlp": 0.00196769, + "balance_loss_clip": 1.01225019, + "balance_loss_mlp": 0.17168745, + "epoch": 0.9026303923042237, + "flos": 19682818629120.0, + "grad_norm": 38.560705028470515, + "language_loss": 0.75973207, + "learning_rate": 9.857413557914763e-08, + "loss": 0.77396065, + "num_input_tokens_seen": 323758905, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.25085449, + "step": 15013, + "time_per_iteration": 2.6967415809631348 + }, + { + "auxiliary_loss_clip": 0.01228064, + "auxiliary_loss_mlp": 0.00203292, + "balance_loss_clip": 1.01747966, + "balance_loss_mlp": 0.18027228, + "epoch": 0.9026905155568916, + "flos": 24608110504320.0, + "grad_norm": 32.44093781396152, + "language_loss": 0.79160237, + "learning_rate": 9.845341029562249e-08, + "loss": 0.80591589, + "num_input_tokens_seen": 323780595, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.23022461, + "step": 15014, + "time_per_iteration": 2.715789556503296 + }, + { + "auxiliary_loss_clip": 0.01242167, + "auxiliary_loss_mlp": 0.00212673, + "balance_loss_clip": 1.02445698, + "balance_loss_mlp": 0.18694755, + "epoch": 0.9027506388095596, + "flos": 20521799573760.0, + "grad_norm": 327.26455813075864, + "language_loss": 0.79709738, + "learning_rate": 9.833275711893474e-08, + "loss": 0.81164581, + "num_input_tokens_seen": 323798160, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.25708008, + "step": 15015, + "time_per_iteration": 2.6807641983032227 + }, + { + "auxiliary_loss_clip": 0.01257758, + "auxiliary_loss_mlp": 0.00222392, + "balance_loss_clip": 1.03500414, + "balance_loss_mlp": 0.19670205, + "epoch": 0.9028107620622275, + "flos": 22784494965120.0, + "grad_norm": 132.9685644941984, + "language_loss": 0.77976662, + "learning_rate": 9.821217605365895e-08, + "loss": 0.79456812, + "num_input_tokens_seen": 323816810, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.25671387, + "step": 15016, + "time_per_iteration": 2.7219746112823486 + }, + { + "auxiliary_loss_clip": 0.01234741, + "auxiliary_loss_mlp": 0.00203063, + "balance_loss_clip": 1.01997948, + "balance_loss_mlp": 0.17945912, + "epoch": 0.9028708853148956, + "flos": 25410534382080.0, + "grad_norm": 38918.254669354974, + "language_loss": 0.8149693, + "learning_rate": 9.809166710436855e-08, + "loss": 0.82934725, + "num_input_tokens_seen": 323836900, + "router_z_loss_clip": 2.14941406, + "router_z_loss_mlp": 0.23608398, + "step": 15017, + "time_per_iteration": 2.7588536739349365 + }, + { + "auxiliary_loss_clip": 0.0125365, + "auxiliary_loss_mlp": 0.00211258, + "balance_loss_clip": 1.03704882, + "balance_loss_mlp": 0.18655832, + "epoch": 0.9029310085675635, + "flos": 21871322478720.0, + "grad_norm": 7.485255498032207, + "language_loss": 0.75926203, + "learning_rate": 9.797123027563237e-08, + "loss": 0.77391112, + "num_input_tokens_seen": 323855325, + "router_z_loss_clip": 2.16894531, + "router_z_loss_mlp": 0.24719238, + "step": 15018, + "time_per_iteration": 2.642148017883301 + }, + { + "auxiliary_loss_clip": 0.01261323, + "auxiliary_loss_mlp": 0.001934, + "balance_loss_clip": 1.03631151, + "balance_loss_mlp": 0.16767484, + "epoch": 0.9029911318202315, + "flos": 26214394803840.0, + "grad_norm": 3.429631674040516, + "language_loss": 0.76770735, + "learning_rate": 9.785086557201782e-08, + "loss": 0.78225452, + "num_input_tokens_seen": 323875650, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.25769043, + "step": 15019, + "time_per_iteration": 2.750429391860962 + }, + { + "auxiliary_loss_clip": 0.01218463, + "auxiliary_loss_mlp": 0.00217933, + "balance_loss_clip": 1.00944829, + "balance_loss_mlp": 0.19463921, + "epoch": 0.9030512550728994, + "flos": 15961360095360.0, + "grad_norm": 6.985222440715066, + "language_loss": 0.80988622, + "learning_rate": 9.773057299808951e-08, + "loss": 0.82425022, + "num_input_tokens_seen": 323892920, + "router_z_loss_clip": 2.08984375, + "router_z_loss_mlp": 0.23291016, + "step": 15020, + "time_per_iteration": 2.6244869232177734 + }, + { + "auxiliary_loss_clip": 0.01250545, + "auxiliary_loss_mlp": 0.00198223, + "balance_loss_clip": 1.0289669, + "balance_loss_mlp": 0.17392772, + "epoch": 0.9031113783255674, + "flos": 23987610034560.0, + "grad_norm": 58.35458515489077, + "language_loss": 0.8183198, + "learning_rate": 9.7610352558408e-08, + "loss": 0.83280742, + "num_input_tokens_seen": 323913835, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.24291992, + "step": 15021, + "time_per_iteration": 2.704535484313965 + }, + { + "auxiliary_loss_clip": 0.01269444, + "auxiliary_loss_mlp": 0.00216036, + "balance_loss_clip": 1.03860939, + "balance_loss_mlp": 0.18908289, + "epoch": 0.9031715015782353, + "flos": 22237216369920.0, + "grad_norm": 7.958720477811071, + "language_loss": 0.85382843, + "learning_rate": 9.749020425753251e-08, + "loss": 0.86868322, + "num_input_tokens_seen": 323933440, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.26940918, + "step": 15022, + "time_per_iteration": 2.7219390869140625 + }, + { + "auxiliary_loss_clip": 0.01222853, + "auxiliary_loss_mlp": 0.00205274, + "balance_loss_clip": 1.01196778, + "balance_loss_mlp": 0.18065687, + "epoch": 0.9032316248309034, + "flos": 26323168164480.0, + "grad_norm": 37.15080853215362, + "language_loss": 0.80887413, + "learning_rate": 9.737012810001943e-08, + "loss": 0.82315534, + "num_input_tokens_seen": 323954090, + "router_z_loss_clip": 2.10839844, + "router_z_loss_mlp": 0.24597168, + "step": 15023, + "time_per_iteration": 2.7446141242980957 + }, + { + "auxiliary_loss_clip": 0.01244111, + "auxiliary_loss_mlp": 0.00211184, + "balance_loss_clip": 1.02728534, + "balance_loss_mlp": 0.18630549, + "epoch": 0.9032917480835713, + "flos": 22636686499200.0, + "grad_norm": 16.161374012163932, + "language_loss": 0.8959893, + "learning_rate": 9.725012409042155e-08, + "loss": 0.91054225, + "num_input_tokens_seen": 323974040, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.2487793, + "step": 15024, + "time_per_iteration": 2.8138136863708496 + }, + { + "auxiliary_loss_clip": 0.01240403, + "auxiliary_loss_mlp": 0.00213474, + "balance_loss_clip": 1.02427244, + "balance_loss_mlp": 0.18977487, + "epoch": 0.9033518713362393, + "flos": 23878764846720.0, + "grad_norm": 17.408762510992197, + "language_loss": 0.77023172, + "learning_rate": 9.713019223328966e-08, + "loss": 0.78477049, + "num_input_tokens_seen": 323996125, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.23706055, + "step": 15025, + "time_per_iteration": 2.708176612854004 + }, + { + "auxiliary_loss_clip": 0.01237489, + "auxiliary_loss_mlp": 0.0021092, + "balance_loss_clip": 1.02087045, + "balance_loss_mlp": 0.18708968, + "epoch": 0.9034119945889073, + "flos": 26905279973760.0, + "grad_norm": 40.5976459823537, + "language_loss": 0.84079719, + "learning_rate": 9.70103325331717e-08, + "loss": 0.85528123, + "num_input_tokens_seen": 324017645, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.23828125, + "step": 15026, + "time_per_iteration": 2.74302077293396 + }, + { + "auxiliary_loss_clip": 0.01257144, + "auxiliary_loss_mlp": 0.00224993, + "balance_loss_clip": 1.03505433, + "balance_loss_mlp": 0.2002926, + "epoch": 0.9034721178415752, + "flos": 20850166730880.0, + "grad_norm": 120.89512557012259, + "language_loss": 0.79067469, + "learning_rate": 9.68905449946129e-08, + "loss": 0.8054961, + "num_input_tokens_seen": 324036875, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.24694824, + "step": 15027, + "time_per_iteration": 2.6471056938171387 + }, + { + "auxiliary_loss_clip": 0.01219451, + "auxiliary_loss_mlp": 0.00204295, + "balance_loss_clip": 1.0079695, + "balance_loss_mlp": 0.18038186, + "epoch": 0.9035322410942432, + "flos": 22234307368320.0, + "grad_norm": 11.176804098902096, + "language_loss": 0.8193146, + "learning_rate": 9.677082962215477e-08, + "loss": 0.83355206, + "num_input_tokens_seen": 324057045, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.23937988, + "step": 15028, + "time_per_iteration": 2.743190050125122 + }, + { + "auxiliary_loss_clip": 0.01242065, + "auxiliary_loss_mlp": 0.00199508, + "balance_loss_clip": 1.02302802, + "balance_loss_mlp": 0.17361566, + "epoch": 0.9035923643469111, + "flos": 25923410726400.0, + "grad_norm": 19.43433257442062, + "language_loss": 0.76673383, + "learning_rate": 9.665118642033765e-08, + "loss": 0.78114951, + "num_input_tokens_seen": 324079735, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.25891113, + "step": 15029, + "time_per_iteration": 2.6872169971466064 + }, + { + "auxiliary_loss_clip": 0.0125927, + "auxiliary_loss_mlp": 0.00212059, + "balance_loss_clip": 1.03443682, + "balance_loss_mlp": 0.18722773, + "epoch": 0.9036524875995792, + "flos": 20339804338560.0, + "grad_norm": 7.602857031576354, + "language_loss": 0.8199864, + "learning_rate": 9.653161539369858e-08, + "loss": 0.83469975, + "num_input_tokens_seen": 324097785, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.24816895, + "step": 15030, + "time_per_iteration": 2.647810459136963 + }, + { + "auxiliary_loss_clip": 0.01245594, + "auxiliary_loss_mlp": 0.00231025, + "balance_loss_clip": 1.02689338, + "balance_loss_mlp": 0.20395218, + "epoch": 0.9037126108522471, + "flos": 40114624677120.0, + "grad_norm": 4.16667491061631, + "language_loss": 0.74191982, + "learning_rate": 9.641211654677151e-08, + "loss": 0.75668597, + "num_input_tokens_seen": 324121625, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.27062988, + "step": 15031, + "time_per_iteration": 2.8130578994750977 + }, + { + "auxiliary_loss_clip": 0.01217254, + "auxiliary_loss_mlp": 0.00211663, + "balance_loss_clip": 1.00680375, + "balance_loss_mlp": 0.18799964, + "epoch": 0.9037727341049151, + "flos": 23332024955520.0, + "grad_norm": 3.4867909371867034, + "language_loss": 0.84336489, + "learning_rate": 9.629268988408723e-08, + "loss": 0.85765409, + "num_input_tokens_seen": 324142535, + "router_z_loss_clip": 2.10449219, + "router_z_loss_mlp": 0.23657227, + "step": 15032, + "time_per_iteration": 2.694162368774414 + }, + { + "auxiliary_loss_clip": 0.01249032, + "auxiliary_loss_mlp": 0.00214095, + "balance_loss_clip": 1.03249586, + "balance_loss_mlp": 0.18863192, + "epoch": 0.903832857357583, + "flos": 12822659815680.0, + "grad_norm": 21.771482162021986, + "language_loss": 0.83669615, + "learning_rate": 9.617333541017502e-08, + "loss": 0.85132742, + "num_input_tokens_seen": 324159610, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.25439453, + "step": 15033, + "time_per_iteration": 2.6575756072998047 + }, + { + "auxiliary_loss_clip": 0.01247066, + "auxiliary_loss_mlp": 0.00229222, + "balance_loss_clip": 1.02635205, + "balance_loss_mlp": 0.20169654, + "epoch": 0.903892980610251, + "flos": 25703026830720.0, + "grad_norm": 8.055241139084632, + "language_loss": 0.80451047, + "learning_rate": 9.605405312956105e-08, + "loss": 0.81927329, + "num_input_tokens_seen": 324182510, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.27575684, + "step": 15034, + "time_per_iteration": 2.7057321071624756 + }, + { + "auxiliary_loss_clip": 0.01255427, + "auxiliary_loss_mlp": 0.00228398, + "balance_loss_clip": 1.0361774, + "balance_loss_mlp": 0.2039123, + "epoch": 0.9039531038629189, + "flos": 14684089397760.0, + "grad_norm": 25.403456083190346, + "language_loss": 0.74443537, + "learning_rate": 9.593484304676791e-08, + "loss": 0.75927353, + "num_input_tokens_seen": 324200555, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.24499512, + "step": 15035, + "time_per_iteration": 2.663966417312622 + }, + { + "auxiliary_loss_clip": 0.01252194, + "auxiliary_loss_mlp": 0.00230476, + "balance_loss_clip": 1.03388143, + "balance_loss_mlp": 0.2047269, + "epoch": 0.904013227115587, + "flos": 24024921287040.0, + "grad_norm": 3.8192492047623503, + "language_loss": 0.71859443, + "learning_rate": 9.581570516631643e-08, + "loss": 0.73342109, + "num_input_tokens_seen": 324220255, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.25793457, + "step": 15036, + "time_per_iteration": 2.6970818042755127 + }, + { + "auxiliary_loss_clip": 0.01219838, + "auxiliary_loss_mlp": 0.00199582, + "balance_loss_clip": 1.0113219, + "balance_loss_mlp": 0.17655092, + "epoch": 0.9040733503682549, + "flos": 22856459863680.0, + "grad_norm": 6.558841982692202, + "language_loss": 0.89793628, + "learning_rate": 9.569663949272455e-08, + "loss": 0.91213048, + "num_input_tokens_seen": 324237855, + "router_z_loss_clip": 2.08203125, + "router_z_loss_mlp": 0.23046875, + "step": 15037, + "time_per_iteration": 2.746861219406128 + }, + { + "auxiliary_loss_clip": 0.01255162, + "auxiliary_loss_mlp": 0.00204625, + "balance_loss_clip": 1.03036976, + "balance_loss_mlp": 0.17845826, + "epoch": 0.9041334736209229, + "flos": 19974951941760.0, + "grad_norm": 6.364061276610379, + "language_loss": 0.75059867, + "learning_rate": 9.557764603050667e-08, + "loss": 0.76519644, + "num_input_tokens_seen": 324257050, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.26196289, + "step": 15038, + "time_per_iteration": 2.6521472930908203 + }, + { + "auxiliary_loss_clip": 0.01241138, + "auxiliary_loss_mlp": 0.00231446, + "balance_loss_clip": 1.02302015, + "balance_loss_mlp": 0.20567343, + "epoch": 0.9041935968735909, + "flos": 17530548624000.0, + "grad_norm": 53.80233959122208, + "language_loss": 0.86583424, + "learning_rate": 9.545872478417494e-08, + "loss": 0.88056004, + "num_input_tokens_seen": 324275510, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.25769043, + "step": 15039, + "time_per_iteration": 2.6551008224487305 + }, + { + "auxiliary_loss_clip": 0.01247572, + "auxiliary_loss_mlp": 0.00203026, + "balance_loss_clip": 1.03472018, + "balance_loss_mlp": 0.17913681, + "epoch": 0.9042537201262588, + "flos": 22780149419520.0, + "grad_norm": 4.940922162979636, + "language_loss": 0.77555156, + "learning_rate": 9.533987575823977e-08, + "loss": 0.79005754, + "num_input_tokens_seen": 324295150, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.23901367, + "step": 15040, + "time_per_iteration": 2.6549835205078125 + }, + { + "auxiliary_loss_clip": 0.01242728, + "auxiliary_loss_mlp": 0.00209928, + "balance_loss_clip": 1.0311501, + "balance_loss_mlp": 0.18464354, + "epoch": 0.9043138433789268, + "flos": 20595416497920.0, + "grad_norm": 5.373407049316351, + "language_loss": 0.75432664, + "learning_rate": 9.522109895720709e-08, + "loss": 0.76885319, + "num_input_tokens_seen": 324313855, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.25317383, + "step": 15041, + "time_per_iteration": 5.611579179763794 + }, + { + "auxiliary_loss_clip": 0.0124051, + "auxiliary_loss_mlp": 0.0020961, + "balance_loss_clip": 1.01960647, + "balance_loss_mlp": 0.1833483, + "epoch": 0.9043739666315948, + "flos": 32962978995840.0, + "grad_norm": 25.58658441269707, + "language_loss": 0.68277156, + "learning_rate": 9.510239438558155e-08, + "loss": 0.69727272, + "num_input_tokens_seen": 324338465, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.26245117, + "step": 15042, + "time_per_iteration": 2.772920608520508 + }, + { + "auxiliary_loss_clip": 0.01125352, + "auxiliary_loss_mlp": 0.00067865, + "balance_loss_clip": 0.98505402, + "balance_loss_mlp": 0.0613323, + "epoch": 0.9044340898842628, + "flos": 67296418525440.0, + "grad_norm": 0.7606041998341159, + "language_loss": 0.56024939, + "learning_rate": 9.498376204786351e-08, + "loss": 0.57218158, + "num_input_tokens_seen": 324398740, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.06542969, + "step": 15043, + "time_per_iteration": 3.145738124847412 + }, + { + "auxiliary_loss_clip": 0.01250768, + "auxiliary_loss_mlp": 0.00207375, + "balance_loss_clip": 1.03080821, + "balance_loss_mlp": 0.18136342, + "epoch": 0.9044942131369307, + "flos": 17713154390400.0, + "grad_norm": 69.28521764241187, + "language_loss": 0.76745999, + "learning_rate": 9.486520194855274e-08, + "loss": 0.78204143, + "num_input_tokens_seen": 324417335, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.26013184, + "step": 15044, + "time_per_iteration": 2.679252862930298 + }, + { + "auxiliary_loss_clip": 0.0123927, + "auxiliary_loss_mlp": 0.00207372, + "balance_loss_clip": 1.02295411, + "balance_loss_mlp": 0.18379208, + "epoch": 0.9045543363895987, + "flos": 17820563034240.0, + "grad_norm": 54.56605930279235, + "language_loss": 0.80326009, + "learning_rate": 9.474671409214407e-08, + "loss": 0.81772649, + "num_input_tokens_seen": 324433240, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.23583984, + "step": 15045, + "time_per_iteration": 2.637418746948242 + }, + { + "auxiliary_loss_clip": 0.01257307, + "auxiliary_loss_mlp": 0.00223462, + "balance_loss_clip": 1.03708231, + "balance_loss_mlp": 0.19861889, + "epoch": 0.9046144596422666, + "flos": 21872723109120.0, + "grad_norm": 49.42887324531474, + "language_loss": 0.74995816, + "learning_rate": 9.462829848313081e-08, + "loss": 0.76476586, + "num_input_tokens_seen": 324452675, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.24816895, + "step": 15046, + "time_per_iteration": 2.6340174674987793 + }, + { + "auxiliary_loss_clip": 0.01268102, + "auxiliary_loss_mlp": 0.00226371, + "balance_loss_clip": 1.04299045, + "balance_loss_mlp": 0.20121777, + "epoch": 0.9046745828949346, + "flos": 17672646827520.0, + "grad_norm": 3.8820145326455275, + "language_loss": 0.73854256, + "learning_rate": 9.450995512600379e-08, + "loss": 0.75348723, + "num_input_tokens_seen": 324467865, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.25170898, + "step": 15047, + "time_per_iteration": 4.136888265609741 + }, + { + "auxiliary_loss_clip": 0.01246118, + "auxiliary_loss_mlp": 0.00216091, + "balance_loss_clip": 1.03217435, + "balance_loss_mlp": 0.19199833, + "epoch": 0.9047347061476025, + "flos": 25702559953920.0, + "grad_norm": 6.788652520151275, + "language_loss": 0.77219164, + "learning_rate": 9.439168402525032e-08, + "loss": 0.78681374, + "num_input_tokens_seen": 324490430, + "router_z_loss_clip": 2.13769531, + "router_z_loss_mlp": 0.24121094, + "step": 15048, + "time_per_iteration": 2.764289140701294 + }, + { + "auxiliary_loss_clip": 0.01241472, + "auxiliary_loss_mlp": 0.00222862, + "balance_loss_clip": 1.02097404, + "balance_loss_mlp": 0.19657679, + "epoch": 0.9047948294002706, + "flos": 15158146118400.0, + "grad_norm": 99.25524063793468, + "language_loss": 0.84381485, + "learning_rate": 9.427348518535483e-08, + "loss": 0.85845816, + "num_input_tokens_seen": 324506620, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.26281738, + "step": 15049, + "time_per_iteration": 2.644822835922241 + }, + { + "auxiliary_loss_clip": 0.01224864, + "auxiliary_loss_mlp": 0.0021946, + "balance_loss_clip": 1.01253474, + "balance_loss_mlp": 0.19496228, + "epoch": 0.9048549526529385, + "flos": 21872292145920.0, + "grad_norm": 21.77818935330412, + "language_loss": 0.82271332, + "learning_rate": 9.415535861079993e-08, + "loss": 0.83715653, + "num_input_tokens_seen": 324525505, + "router_z_loss_clip": 2.12402344, + "router_z_loss_mlp": 0.24487305, + "step": 15050, + "time_per_iteration": 2.699599027633667 + }, + { + "auxiliary_loss_clip": 0.01251714, + "auxiliary_loss_mlp": 0.00238938, + "balance_loss_clip": 1.0348134, + "balance_loss_mlp": 0.21322486, + "epoch": 0.9049150759056065, + "flos": 23546626761600.0, + "grad_norm": 2.20513850135261, + "language_loss": 0.88811105, + "learning_rate": 9.403730430606472e-08, + "loss": 0.90301758, + "num_input_tokens_seen": 324544415, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.25695801, + "step": 15051, + "time_per_iteration": 2.639695167541504 + }, + { + "auxiliary_loss_clip": 0.01247381, + "auxiliary_loss_mlp": 0.00209568, + "balance_loss_clip": 1.03042948, + "balance_loss_mlp": 0.18384305, + "epoch": 0.9049751991582745, + "flos": 19645902426240.0, + "grad_norm": 7.698691086967648, + "language_loss": 0.98279965, + "learning_rate": 9.391932227562582e-08, + "loss": 0.99736911, + "num_input_tokens_seen": 324562555, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.25708008, + "step": 15052, + "time_per_iteration": 2.6397194862365723 + }, + { + "auxiliary_loss_clip": 0.0125922, + "auxiliary_loss_mlp": 0.00206401, + "balance_loss_clip": 1.03406215, + "balance_loss_mlp": 0.17987646, + "epoch": 0.9050353224109424, + "flos": 15596220389760.0, + "grad_norm": 328.56707369052475, + "language_loss": 0.84397388, + "learning_rate": 9.380141252395724e-08, + "loss": 0.85863012, + "num_input_tokens_seen": 324580865, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.26538086, + "step": 15053, + "time_per_iteration": 4.010976076126099 + }, + { + "auxiliary_loss_clip": 0.01240055, + "auxiliary_loss_mlp": 0.00221796, + "balance_loss_clip": 1.02529359, + "balance_loss_mlp": 0.19558194, + "epoch": 0.9050954456636104, + "flos": 28183592165760.0, + "grad_norm": 17.40690448539482, + "language_loss": 0.80424678, + "learning_rate": 9.368357505553049e-08, + "loss": 0.8188653, + "num_input_tokens_seen": 324600665, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.26196289, + "step": 15054, + "time_per_iteration": 2.680037021636963 + }, + { + "auxiliary_loss_clip": 0.01231497, + "auxiliary_loss_mlp": 0.00211519, + "balance_loss_clip": 1.0173924, + "balance_loss_mlp": 0.18867886, + "epoch": 0.9051555689162784, + "flos": 25731611078400.0, + "grad_norm": 11.452298958362615, + "language_loss": 0.8891806, + "learning_rate": 9.356580987481333e-08, + "loss": 0.90361077, + "num_input_tokens_seen": 324618145, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.22839355, + "step": 15055, + "time_per_iteration": 2.691591262817383 + }, + { + "auxiliary_loss_clip": 0.01220137, + "auxiliary_loss_mlp": 0.00216767, + "balance_loss_clip": 1.01067305, + "balance_loss_mlp": 0.19216223, + "epoch": 0.9052156921689464, + "flos": 23257258796160.0, + "grad_norm": 2.4553447549121947, + "language_loss": 0.90707922, + "learning_rate": 9.344811698627176e-08, + "loss": 0.92144823, + "num_input_tokens_seen": 324638165, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.24572754, + "step": 15056, + "time_per_iteration": 2.6667087078094482 + }, + { + "auxiliary_loss_clip": 0.01257281, + "auxiliary_loss_mlp": 0.00214001, + "balance_loss_clip": 1.0360564, + "balance_loss_mlp": 0.18902668, + "epoch": 0.9052758154216143, + "flos": 29564285097600.0, + "grad_norm": 60.87470699828128, + "language_loss": 0.80745453, + "learning_rate": 9.333049639436863e-08, + "loss": 0.8221674, + "num_input_tokens_seen": 324658560, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.25, + "step": 15057, + "time_per_iteration": 2.7902650833129883 + }, + { + "auxiliary_loss_clip": 0.0122259, + "auxiliary_loss_mlp": 0.00185063, + "balance_loss_clip": 1.01541901, + "balance_loss_mlp": 0.16186459, + "epoch": 0.9053359386742823, + "flos": 22127688823680.0, + "grad_norm": 19.676869144073425, + "language_loss": 0.87628412, + "learning_rate": 9.321294810356418e-08, + "loss": 0.89036071, + "num_input_tokens_seen": 324679185, + "router_z_loss_clip": 2.07128906, + "router_z_loss_mlp": 0.23168945, + "step": 15058, + "time_per_iteration": 2.6526734828948975 + }, + { + "auxiliary_loss_clip": 0.01132894, + "auxiliary_loss_mlp": 0.00113673, + "balance_loss_clip": 0.9926368, + "balance_loss_mlp": 0.10618699, + "epoch": 0.9053960619269502, + "flos": 67090112760960.0, + "grad_norm": 0.6630899415322499, + "language_loss": 0.51111734, + "learning_rate": 9.309547211831592e-08, + "loss": 0.52358294, + "num_input_tokens_seen": 324744830, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.07470703, + "step": 15059, + "time_per_iteration": 3.2797129154205322 + }, + { + "auxiliary_loss_clip": 0.01253599, + "auxiliary_loss_mlp": 0.00224636, + "balance_loss_clip": 1.03144813, + "balance_loss_mlp": 0.19962636, + "epoch": 0.9054561851796182, + "flos": 15815419136640.0, + "grad_norm": 14.721323178055718, + "language_loss": 0.75801444, + "learning_rate": 9.297806844307831e-08, + "loss": 0.77279687, + "num_input_tokens_seen": 324762905, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.25036621, + "step": 15060, + "time_per_iteration": 2.6126139163970947 + }, + { + "auxiliary_loss_clip": 0.01255966, + "auxiliary_loss_mlp": 0.0020405, + "balance_loss_clip": 1.03423333, + "balance_loss_mlp": 0.1783964, + "epoch": 0.9055163084322861, + "flos": 17566997950080.0, + "grad_norm": 7.252747886340646, + "language_loss": 0.75526571, + "learning_rate": 9.286073708230357e-08, + "loss": 0.76986593, + "num_input_tokens_seen": 324781905, + "router_z_loss_clip": 2.21777344, + "router_z_loss_mlp": 0.25671387, + "step": 15061, + "time_per_iteration": 2.6981401443481445 + }, + { + "auxiliary_loss_clip": 0.01257317, + "auxiliary_loss_mlp": 0.00213082, + "balance_loss_clip": 1.03380799, + "balance_loss_mlp": 0.18578282, + "epoch": 0.9055764316849542, + "flos": 17639573379840.0, + "grad_norm": 19.80352392666029, + "language_loss": 0.7981838, + "learning_rate": 9.274347804044058e-08, + "loss": 0.81288785, + "num_input_tokens_seen": 324799260, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.2734375, + "step": 15062, + "time_per_iteration": 2.7697815895080566 + }, + { + "auxiliary_loss_clip": 0.01234704, + "auxiliary_loss_mlp": 0.00220701, + "balance_loss_clip": 1.0193646, + "balance_loss_mlp": 0.19592966, + "epoch": 0.9056365549376221, + "flos": 20120856986880.0, + "grad_norm": 3.258175765459346, + "language_loss": 0.79177296, + "learning_rate": 9.2626291321936e-08, + "loss": 0.80632704, + "num_input_tokens_seen": 324817800, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.24780273, + "step": 15063, + "time_per_iteration": 2.682821273803711 + }, + { + "auxiliary_loss_clip": 0.01235163, + "auxiliary_loss_mlp": 0.00212702, + "balance_loss_clip": 1.01971662, + "balance_loss_mlp": 0.18882427, + "epoch": 0.9056966781902901, + "flos": 27598786836480.0, + "grad_norm": 16.767583688336437, + "language_loss": 0.78474969, + "learning_rate": 9.250917693123406e-08, + "loss": 0.79922831, + "num_input_tokens_seen": 324838445, + "router_z_loss_clip": 2.15527344, + "router_z_loss_mlp": 0.23901367, + "step": 15064, + "time_per_iteration": 2.7116708755493164 + }, + { + "auxiliary_loss_clip": 0.01228542, + "auxiliary_loss_mlp": 0.00225534, + "balance_loss_clip": 1.015185, + "balance_loss_mlp": 0.20066693, + "epoch": 0.9057568014429581, + "flos": 25920106675200.0, + "grad_norm": 17.407795647337224, + "language_loss": 0.8083747, + "learning_rate": 9.23921348727752e-08, + "loss": 0.82291543, + "num_input_tokens_seen": 324859895, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.24865723, + "step": 15065, + "time_per_iteration": 2.724809169769287 + }, + { + "auxiliary_loss_clip": 0.01242824, + "auxiliary_loss_mlp": 0.00236926, + "balance_loss_clip": 1.02507246, + "balance_loss_mlp": 0.21247645, + "epoch": 0.905816924695626, + "flos": 22930364096640.0, + "grad_norm": 2.41019544858577, + "language_loss": 0.71044892, + "learning_rate": 9.227516515099743e-08, + "loss": 0.72524643, + "num_input_tokens_seen": 324879580, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.24438477, + "step": 15066, + "time_per_iteration": 2.713212728500366 + }, + { + "auxiliary_loss_clip": 0.01256911, + "auxiliary_loss_mlp": 0.0022578, + "balance_loss_clip": 1.02863157, + "balance_loss_mlp": 0.19962578, + "epoch": 0.905877047948294, + "flos": 22157422306560.0, + "grad_norm": 41.314273779982535, + "language_loss": 0.89548862, + "learning_rate": 9.215826777033675e-08, + "loss": 0.91031557, + "num_input_tokens_seen": 324898950, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.26147461, + "step": 15067, + "time_per_iteration": 2.6788675785064697 + }, + { + "auxiliary_loss_clip": 0.01260513, + "auxiliary_loss_mlp": 0.00236981, + "balance_loss_clip": 1.03641963, + "balance_loss_mlp": 0.20783424, + "epoch": 0.905937171200962, + "flos": 15304805349120.0, + "grad_norm": 7.757862125018227, + "language_loss": 0.79646313, + "learning_rate": 9.204144273522563e-08, + "loss": 0.81143808, + "num_input_tokens_seen": 324917455, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.29125977, + "step": 15068, + "time_per_iteration": 2.695805788040161 + }, + { + "auxiliary_loss_clip": 0.01214816, + "auxiliary_loss_mlp": 0.00220227, + "balance_loss_clip": 1.00533056, + "balance_loss_mlp": 0.19578908, + "epoch": 0.90599729445363, + "flos": 19462973437440.0, + "grad_norm": 366.18637109233424, + "language_loss": 0.91774505, + "learning_rate": 9.19246900500943e-08, + "loss": 0.93209553, + "num_input_tokens_seen": 324934495, + "router_z_loss_clip": 2.09472656, + "router_z_loss_mlp": 0.24462891, + "step": 15069, + "time_per_iteration": 2.6887691020965576 + }, + { + "auxiliary_loss_clip": 0.01263863, + "auxiliary_loss_mlp": 0.00217759, + "balance_loss_clip": 1.03680921, + "balance_loss_mlp": 0.19162877, + "epoch": 0.9060574177062979, + "flos": 23732967542400.0, + "grad_norm": 9.096928066843194, + "language_loss": 0.69115496, + "learning_rate": 9.180800971936987e-08, + "loss": 0.70597118, + "num_input_tokens_seen": 324953230, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.26147461, + "step": 15070, + "time_per_iteration": 2.8041584491729736 + }, + { + "auxiliary_loss_clip": 0.0125159, + "auxiliary_loss_mlp": 0.00193013, + "balance_loss_clip": 1.0315845, + "balance_loss_mlp": 0.16749041, + "epoch": 0.9061175409589659, + "flos": 17311134395520.0, + "grad_norm": 13.74168586258266, + "language_loss": 0.90748107, + "learning_rate": 9.169140174747724e-08, + "loss": 0.92192709, + "num_input_tokens_seen": 324969880, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.25537109, + "step": 15071, + "time_per_iteration": 2.639390707015991 + }, + { + "auxiliary_loss_clip": 0.01251893, + "auxiliary_loss_mlp": 0.00209283, + "balance_loss_clip": 1.03058577, + "balance_loss_mlp": 0.18414164, + "epoch": 0.9061776642116338, + "flos": 17778439359360.0, + "grad_norm": 1345.976684688048, + "language_loss": 0.71626198, + "learning_rate": 9.157486613883758e-08, + "loss": 0.7308737, + "num_input_tokens_seen": 324987005, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.25158691, + "step": 15072, + "time_per_iteration": 2.632375717163086 + }, + { + "auxiliary_loss_clip": 0.01236947, + "auxiliary_loss_mlp": 0.00224533, + "balance_loss_clip": 1.02012348, + "balance_loss_mlp": 0.19846226, + "epoch": 0.9062377874643018, + "flos": 42777688037760.0, + "grad_norm": 27.184692014518347, + "language_loss": 0.80857313, + "learning_rate": 9.145840289787021e-08, + "loss": 0.82318789, + "num_input_tokens_seen": 325010700, + "router_z_loss_clip": 2.16699219, + "router_z_loss_mlp": 0.26074219, + "step": 15073, + "time_per_iteration": 2.827106475830078 + }, + { + "auxiliary_loss_clip": 0.01235424, + "auxiliary_loss_mlp": 0.00200039, + "balance_loss_clip": 1.02393186, + "balance_loss_mlp": 0.17563653, + "epoch": 0.9062979107169697, + "flos": 16361620323840.0, + "grad_norm": 30.36749580305017, + "language_loss": 0.89951783, + "learning_rate": 9.134201202899161e-08, + "loss": 0.91387248, + "num_input_tokens_seen": 325028760, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.24389648, + "step": 15074, + "time_per_iteration": 2.6496243476867676 + }, + { + "auxiliary_loss_clip": 0.01119971, + "auxiliary_loss_mlp": 0.00086248, + "balance_loss_clip": 0.98014736, + "balance_loss_mlp": 0.07938162, + "epoch": 0.9063580339696378, + "flos": 69313988528640.0, + "grad_norm": 0.7857841252478698, + "language_loss": 0.5131793, + "learning_rate": 9.122569353661513e-08, + "loss": 0.52524149, + "num_input_tokens_seen": 325093545, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.06884766, + "step": 15075, + "time_per_iteration": 3.248792886734009 + }, + { + "auxiliary_loss_clip": 0.01127077, + "auxiliary_loss_mlp": 0.00121023, + "balance_loss_clip": 0.98719192, + "balance_loss_mlp": 0.11429983, + "epoch": 0.9064181572223057, + "flos": 58794747148800.0, + "grad_norm": 0.9170012894173143, + "language_loss": 0.61106682, + "learning_rate": 9.11094474251517e-08, + "loss": 0.62354779, + "num_input_tokens_seen": 325152295, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.06738281, + "step": 15076, + "time_per_iteration": 3.055506706237793 + }, + { + "auxiliary_loss_clip": 0.01247597, + "auxiliary_loss_mlp": 0.00205407, + "balance_loss_clip": 1.02706838, + "balance_loss_mlp": 0.1799562, + "epoch": 0.9064782804749737, + "flos": 21762692772480.0, + "grad_norm": 15.225410818512412, + "language_loss": 0.89323127, + "learning_rate": 9.09932736990091e-08, + "loss": 0.90776134, + "num_input_tokens_seen": 325169705, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.25439453, + "step": 15077, + "time_per_iteration": 2.7565386295318604 + }, + { + "auxiliary_loss_clip": 0.01227635, + "auxiliary_loss_mlp": 0.00210026, + "balance_loss_clip": 1.02176189, + "balance_loss_mlp": 0.18590955, + "epoch": 0.9065384037276417, + "flos": 21397373498880.0, + "grad_norm": 28.884494557437772, + "language_loss": 0.91270387, + "learning_rate": 9.08771723625934e-08, + "loss": 0.92708045, + "num_input_tokens_seen": 325189175, + "router_z_loss_clip": 2.05664062, + "router_z_loss_mlp": 0.24108887, + "step": 15078, + "time_per_iteration": 2.643357038497925 + }, + { + "auxiliary_loss_clip": 0.01215996, + "auxiliary_loss_mlp": 0.00199724, + "balance_loss_clip": 1.007303, + "balance_loss_mlp": 0.17697909, + "epoch": 0.9065985269803096, + "flos": 38283646849920.0, + "grad_norm": 59.57053548544907, + "language_loss": 0.70754468, + "learning_rate": 9.076114342030617e-08, + "loss": 0.72170186, + "num_input_tokens_seen": 325211020, + "router_z_loss_clip": 2.08691406, + "router_z_loss_mlp": 0.22753906, + "step": 15079, + "time_per_iteration": 2.8747503757476807 + }, + { + "auxiliary_loss_clip": 0.01236994, + "auxiliary_loss_mlp": 0.00209585, + "balance_loss_clip": 1.02166367, + "balance_loss_mlp": 0.18631576, + "epoch": 0.9066586502329776, + "flos": 44818562989440.0, + "grad_norm": 33.734746487688554, + "language_loss": 0.75382996, + "learning_rate": 9.064518687654765e-08, + "loss": 0.76829576, + "num_input_tokens_seen": 325236970, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.23254395, + "step": 15080, + "time_per_iteration": 2.864539384841919 + }, + { + "auxiliary_loss_clip": 0.01264189, + "auxiliary_loss_mlp": 0.00232653, + "balance_loss_clip": 1.03954911, + "balance_loss_mlp": 0.2062602, + "epoch": 0.9067187734856456, + "flos": 18623992492800.0, + "grad_norm": 3.624187685730049, + "language_loss": 0.79542333, + "learning_rate": 9.052930273571547e-08, + "loss": 0.81039178, + "num_input_tokens_seen": 325252670, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.26379395, + "step": 15081, + "time_per_iteration": 2.6636626720428467 + }, + { + "auxiliary_loss_clip": 0.01233527, + "auxiliary_loss_mlp": 0.00200619, + "balance_loss_clip": 1.01616502, + "balance_loss_mlp": 0.1762287, + "epoch": 0.9067788967383136, + "flos": 22747578762240.0, + "grad_norm": 2.188990414311764, + "language_loss": 0.83381546, + "learning_rate": 9.04134910022032e-08, + "loss": 0.84815693, + "num_input_tokens_seen": 325273860, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.24377441, + "step": 15082, + "time_per_iteration": 2.6981866359710693 + }, + { + "auxiliary_loss_clip": 0.01246443, + "auxiliary_loss_mlp": 0.0022132, + "balance_loss_clip": 1.03265393, + "balance_loss_mlp": 0.19596386, + "epoch": 0.9068390199909815, + "flos": 27670787648640.0, + "grad_norm": 129.76593258799082, + "language_loss": 0.85626936, + "learning_rate": 9.029775168040266e-08, + "loss": 0.870947, + "num_input_tokens_seen": 325294140, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.25341797, + "step": 15083, + "time_per_iteration": 4.116838455200195 + }, + { + "auxiliary_loss_clip": 0.01230221, + "auxiliary_loss_mlp": 0.00194108, + "balance_loss_clip": 1.02190971, + "balance_loss_mlp": 0.17065978, + "epoch": 0.9068991432436495, + "flos": 24244012293120.0, + "grad_norm": 20.50254076558301, + "language_loss": 0.76427549, + "learning_rate": 9.01820847747028e-08, + "loss": 0.77851874, + "num_input_tokens_seen": 325313130, + "router_z_loss_clip": 2.07910156, + "router_z_loss_mlp": 0.234375, + "step": 15084, + "time_per_iteration": 4.092199087142944 + }, + { + "auxiliary_loss_clip": 0.0123983, + "auxiliary_loss_mlp": 0.00217054, + "balance_loss_clip": 1.02413511, + "balance_loss_mlp": 0.19265193, + "epoch": 0.9069592664963174, + "flos": 28033305661440.0, + "grad_norm": 5.315620677173247, + "language_loss": 0.75092357, + "learning_rate": 9.006649028948965e-08, + "loss": 0.76549244, + "num_input_tokens_seen": 325334880, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.24401855, + "step": 15085, + "time_per_iteration": 2.7476320266723633 + }, + { + "auxiliary_loss_clip": 0.01136052, + "auxiliary_loss_mlp": 0.0013217, + "balance_loss_clip": 0.99547577, + "balance_loss_mlp": 0.12353907, + "epoch": 0.9070193897489854, + "flos": 68778414789120.0, + "grad_norm": 0.767234597043579, + "language_loss": 0.60632211, + "learning_rate": 8.995096822914638e-08, + "loss": 0.61900431, + "num_input_tokens_seen": 325394175, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.08642578, + "step": 15086, + "time_per_iteration": 3.200314998626709 + }, + { + "auxiliary_loss_clip": 0.01228494, + "auxiliary_loss_mlp": 0.0021478, + "balance_loss_clip": 1.01424241, + "balance_loss_mlp": 0.18998429, + "epoch": 0.9070795130016533, + "flos": 23441624328960.0, + "grad_norm": 15.25925326289114, + "language_loss": 0.79128355, + "learning_rate": 8.983551859805416e-08, + "loss": 0.80571628, + "num_input_tokens_seen": 325415020, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.2479248, + "step": 15087, + "time_per_iteration": 2.7514586448669434 + }, + { + "auxiliary_loss_clip": 0.01238176, + "auxiliary_loss_mlp": 0.00224636, + "balance_loss_clip": 1.022367, + "balance_loss_mlp": 0.19871947, + "epoch": 0.9071396362543214, + "flos": 18916413114240.0, + "grad_norm": 6.249442292165258, + "language_loss": 0.86346328, + "learning_rate": 8.972014140059058e-08, + "loss": 0.87809145, + "num_input_tokens_seen": 325433595, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.25939941, + "step": 15088, + "time_per_iteration": 2.612811326980591 + }, + { + "auxiliary_loss_clip": 0.01233095, + "auxiliary_loss_mlp": 0.00209611, + "balance_loss_clip": 1.02050114, + "balance_loss_mlp": 0.18648484, + "epoch": 0.9071997595069893, + "flos": 25228646887680.0, + "grad_norm": 40.50116486826753, + "language_loss": 0.80907714, + "learning_rate": 8.960483664113038e-08, + "loss": 0.82350421, + "num_input_tokens_seen": 325451605, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.23144531, + "step": 15089, + "time_per_iteration": 4.1578288078308105 + }, + { + "auxiliary_loss_clip": 0.01221737, + "auxiliary_loss_mlp": 0.00196197, + "balance_loss_clip": 1.01267004, + "balance_loss_mlp": 0.17285571, + "epoch": 0.9072598827596573, + "flos": 24346608514560.0, + "grad_norm": 29.44201264311004, + "language_loss": 0.83462071, + "learning_rate": 8.948960432404628e-08, + "loss": 0.84880006, + "num_input_tokens_seen": 325470645, + "router_z_loss_clip": 2.09082031, + "router_z_loss_mlp": 0.23327637, + "step": 15090, + "time_per_iteration": 2.7164530754089355 + }, + { + "auxiliary_loss_clip": 0.01252495, + "auxiliary_loss_mlp": 0.00204724, + "balance_loss_clip": 1.03330922, + "balance_loss_mlp": 0.17731753, + "epoch": 0.9073200060123253, + "flos": 22674967418880.0, + "grad_norm": 3.429803787378421, + "language_loss": 0.87925315, + "learning_rate": 8.93744444537079e-08, + "loss": 0.89382529, + "num_input_tokens_seen": 325488070, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.27416992, + "step": 15091, + "time_per_iteration": 2.6655592918395996 + }, + { + "auxiliary_loss_clip": 0.01208229, + "auxiliary_loss_mlp": 0.00191232, + "balance_loss_clip": 1.00533557, + "balance_loss_mlp": 0.1676878, + "epoch": 0.9073801292649932, + "flos": 23695476721920.0, + "grad_norm": 2.8121111096515468, + "language_loss": 0.91888523, + "learning_rate": 8.925935703448217e-08, + "loss": 0.93287981, + "num_input_tokens_seen": 325509285, + "router_z_loss_clip": 2.02929688, + "router_z_loss_mlp": 0.2355957, + "step": 15092, + "time_per_iteration": 2.735358715057373 + }, + { + "auxiliary_loss_clip": 0.01246462, + "auxiliary_loss_mlp": 0.00214864, + "balance_loss_clip": 1.03124654, + "balance_loss_mlp": 0.18900782, + "epoch": 0.9074402525176612, + "flos": 25375413859200.0, + "grad_norm": 3.159380672791971, + "language_loss": 0.86188495, + "learning_rate": 8.914434207073296e-08, + "loss": 0.87649822, + "num_input_tokens_seen": 325529360, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.25878906, + "step": 15093, + "time_per_iteration": 2.712634325027466 + }, + { + "auxiliary_loss_clip": 0.01133426, + "auxiliary_loss_mlp": 0.00138811, + "balance_loss_clip": 0.99418342, + "balance_loss_mlp": 0.12960845, + "epoch": 0.9075003757703292, + "flos": 67649024384640.0, + "grad_norm": 157.92540449499535, + "language_loss": 0.56773734, + "learning_rate": 8.902939956682188e-08, + "loss": 0.58045971, + "num_input_tokens_seen": 325583565, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.09179688, + "step": 15094, + "time_per_iteration": 3.0915162563323975 + }, + { + "auxiliary_loss_clip": 0.01250555, + "auxiliary_loss_mlp": 0.00246789, + "balance_loss_clip": 1.03101373, + "balance_loss_mlp": 0.2213856, + "epoch": 0.9075604990229972, + "flos": 22453649769600.0, + "grad_norm": 9.978801788903164, + "language_loss": 0.79981124, + "learning_rate": 8.891452952710742e-08, + "loss": 0.81478465, + "num_input_tokens_seen": 325603690, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.25390625, + "step": 15095, + "time_per_iteration": 4.0524492263793945 + }, + { + "auxiliary_loss_clip": 0.0123619, + "auxiliary_loss_mlp": 0.00213183, + "balance_loss_clip": 1.01988697, + "balance_loss_mlp": 0.18769637, + "epoch": 0.9076206222756651, + "flos": 19536662188800.0, + "grad_norm": 4.072849385024415, + "language_loss": 0.81629556, + "learning_rate": 8.879973195594526e-08, + "loss": 0.83078927, + "num_input_tokens_seen": 325622255, + "router_z_loss_clip": 2.16308594, + "router_z_loss_mlp": 0.25512695, + "step": 15096, + "time_per_iteration": 2.6207001209259033 + }, + { + "auxiliary_loss_clip": 0.01243863, + "auxiliary_loss_mlp": 0.00225437, + "balance_loss_clip": 1.02224326, + "balance_loss_mlp": 0.19993815, + "epoch": 0.9076807455283331, + "flos": 30116914819200.0, + "grad_norm": 2.465318536860071, + "language_loss": 0.64549816, + "learning_rate": 8.868500685768898e-08, + "loss": 0.66019118, + "num_input_tokens_seen": 325640165, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.25524902, + "step": 15097, + "time_per_iteration": 2.7904675006866455 + }, + { + "auxiliary_loss_clip": 0.01225939, + "auxiliary_loss_mlp": 0.00201376, + "balance_loss_clip": 1.01281285, + "balance_loss_mlp": 0.17826165, + "epoch": 0.907740868781001, + "flos": 18697537589760.0, + "grad_norm": 2.4900984502435937, + "language_loss": 0.8769545, + "learning_rate": 8.857035423668935e-08, + "loss": 0.89122766, + "num_input_tokens_seen": 325659455, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.2310791, + "step": 15098, + "time_per_iteration": 2.6595869064331055 + }, + { + "auxiliary_loss_clip": 0.01242544, + "auxiliary_loss_mlp": 0.00226007, + "balance_loss_clip": 1.02072239, + "balance_loss_mlp": 0.20136656, + "epoch": 0.907800992033669, + "flos": 22638805401600.0, + "grad_norm": 15.397503075972372, + "language_loss": 0.75125802, + "learning_rate": 8.845577409729266e-08, + "loss": 0.76594341, + "num_input_tokens_seen": 325678095, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.24645996, + "step": 15099, + "time_per_iteration": 2.7422943115234375 + }, + { + "auxiliary_loss_clip": 0.01252487, + "auxiliary_loss_mlp": 0.00203974, + "balance_loss_clip": 1.0276978, + "balance_loss_mlp": 0.17717618, + "epoch": 0.907861115286337, + "flos": 21287666384640.0, + "grad_norm": 47.06191861049389, + "language_loss": 0.80273789, + "learning_rate": 8.834126644384477e-08, + "loss": 0.81730247, + "num_input_tokens_seen": 325695825, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.26818848, + "step": 15100, + "time_per_iteration": 2.757944345474243 + }, + { + "auxiliary_loss_clip": 0.01124307, + "auxiliary_loss_mlp": 0.00129092, + "balance_loss_clip": 0.98522198, + "balance_loss_mlp": 0.12084267, + "epoch": 0.907921238539005, + "flos": 69739493040000.0, + "grad_norm": 0.6748972980382554, + "language_loss": 0.52677315, + "learning_rate": 8.822683128068775e-08, + "loss": 0.53930712, + "num_input_tokens_seen": 325764515, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.08251953, + "step": 15101, + "time_per_iteration": 3.2404656410217285 + }, + { + "auxiliary_loss_clip": 0.01240007, + "auxiliary_loss_mlp": 0.00202785, + "balance_loss_clip": 1.02215886, + "balance_loss_mlp": 0.17851359, + "epoch": 0.9079813617916729, + "flos": 23477391296640.0, + "grad_norm": 18.37181079044459, + "language_loss": 0.77482307, + "learning_rate": 8.811246861216081e-08, + "loss": 0.78925097, + "num_input_tokens_seen": 325783235, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.24291992, + "step": 15102, + "time_per_iteration": 2.705183982849121 + }, + { + "auxiliary_loss_clip": 0.01227949, + "auxiliary_loss_mlp": 0.0021583, + "balance_loss_clip": 1.01556945, + "balance_loss_mlp": 0.19089113, + "epoch": 0.9080414850443409, + "flos": 22929933133440.0, + "grad_norm": 7.597860036340499, + "language_loss": 0.85398698, + "learning_rate": 8.799817844260049e-08, + "loss": 0.86842477, + "num_input_tokens_seen": 325800195, + "router_z_loss_clip": 2.12402344, + "router_z_loss_mlp": 0.24951172, + "step": 15103, + "time_per_iteration": 2.6312315464019775 + }, + { + "auxiliary_loss_clip": 0.01237917, + "auxiliary_loss_mlp": 0.00220827, + "balance_loss_clip": 1.01806092, + "balance_loss_mlp": 0.19597185, + "epoch": 0.9081016082970089, + "flos": 26177083551360.0, + "grad_norm": 5868.071002686604, + "language_loss": 0.83157855, + "learning_rate": 8.78839607763413e-08, + "loss": 0.84616601, + "num_input_tokens_seen": 325820215, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.24865723, + "step": 15104, + "time_per_iteration": 2.7698302268981934 + }, + { + "auxiliary_loss_clip": 0.01223447, + "auxiliary_loss_mlp": 0.00202653, + "balance_loss_clip": 1.01040947, + "balance_loss_mlp": 0.17807201, + "epoch": 0.9081617315496768, + "flos": 24462169545600.0, + "grad_norm": 7.242070262012642, + "language_loss": 0.85065055, + "learning_rate": 8.77698156177138e-08, + "loss": 0.86491156, + "num_input_tokens_seen": 325838415, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.2454834, + "step": 15105, + "time_per_iteration": 2.6815779209136963 + }, + { + "auxiliary_loss_clip": 0.0124397, + "auxiliary_loss_mlp": 0.00200823, + "balance_loss_clip": 1.0253098, + "balance_loss_mlp": 0.17577687, + "epoch": 0.9082218548023449, + "flos": 24746868743040.0, + "grad_norm": 711.1571996051864, + "language_loss": 0.80861306, + "learning_rate": 8.765574297104628e-08, + "loss": 0.82306099, + "num_input_tokens_seen": 325855580, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.25036621, + "step": 15106, + "time_per_iteration": 2.695061683654785 + }, + { + "auxiliary_loss_clip": 0.0126261, + "auxiliary_loss_mlp": 0.00222301, + "balance_loss_clip": 1.03856325, + "balance_loss_mlp": 0.19696906, + "epoch": 0.9082819780550128, + "flos": 24421302846720.0, + "grad_norm": 2.5213840991520535, + "language_loss": 0.87746572, + "learning_rate": 8.754174284066462e-08, + "loss": 0.89231485, + "num_input_tokens_seen": 325874890, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.25341797, + "step": 15107, + "time_per_iteration": 2.7746129035949707 + }, + { + "auxiliary_loss_clip": 0.0111161, + "auxiliary_loss_mlp": 0.00106895, + "balance_loss_clip": 0.9742924, + "balance_loss_mlp": 0.09878902, + "epoch": 0.9083421013076808, + "flos": 59609704872960.0, + "grad_norm": 0.8049611643896945, + "language_loss": 0.59566903, + "learning_rate": 8.742781523089205e-08, + "loss": 0.60785407, + "num_input_tokens_seen": 325935835, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.08105469, + "step": 15108, + "time_per_iteration": 3.1508679389953613 + }, + { + "auxiliary_loss_clip": 0.01236532, + "auxiliary_loss_mlp": 0.0020167, + "balance_loss_clip": 1.01838923, + "balance_loss_mlp": 0.17701723, + "epoch": 0.9084022245603487, + "flos": 33620216100480.0, + "grad_norm": 4.804797469814861, + "language_loss": 0.82869053, + "learning_rate": 8.73139601460482e-08, + "loss": 0.84307259, + "num_input_tokens_seen": 325958035, + "router_z_loss_clip": 2.18066406, + "router_z_loss_mlp": 0.24621582, + "step": 15109, + "time_per_iteration": 2.7531652450561523 + }, + { + "auxiliary_loss_clip": 0.01226428, + "auxiliary_loss_mlp": 0.00204566, + "balance_loss_clip": 1.01422763, + "balance_loss_mlp": 0.17971104, + "epoch": 0.9084623478130167, + "flos": 24971705925120.0, + "grad_norm": 75.78143263442574, + "language_loss": 0.76817852, + "learning_rate": 8.720017759045073e-08, + "loss": 0.78248847, + "num_input_tokens_seen": 325979870, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.24853516, + "step": 15110, + "time_per_iteration": 2.6808688640594482 + }, + { + "auxiliary_loss_clip": 0.01213939, + "auxiliary_loss_mlp": 0.00204852, + "balance_loss_clip": 1.00954294, + "balance_loss_mlp": 0.18208304, + "epoch": 0.9085224710656846, + "flos": 31461804869760.0, + "grad_norm": 4.603348557740948, + "language_loss": 0.76388788, + "learning_rate": 8.708646756841421e-08, + "loss": 0.7780757, + "num_input_tokens_seen": 325998245, + "router_z_loss_clip": 2.04394531, + "router_z_loss_mlp": 0.22790527, + "step": 15111, + "time_per_iteration": 2.792809009552002 + }, + { + "auxiliary_loss_clip": 0.01110996, + "auxiliary_loss_mlp": 0.00113014, + "balance_loss_clip": 0.97311807, + "balance_loss_mlp": 0.10519421, + "epoch": 0.9085825943183526, + "flos": 64917012867840.0, + "grad_norm": 0.7158046271429627, + "language_loss": 0.50811899, + "learning_rate": 8.697283008425026e-08, + "loss": 0.52035904, + "num_input_tokens_seen": 326061770, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.078125, + "step": 15112, + "time_per_iteration": 3.18013596534729 + }, + { + "auxiliary_loss_clip": 0.01233814, + "auxiliary_loss_mlp": 0.0020634, + "balance_loss_clip": 1.0171442, + "balance_loss_mlp": 0.18124676, + "epoch": 0.9086427175710206, + "flos": 18953221576320.0, + "grad_norm": 177.9269583961014, + "language_loss": 0.79181123, + "learning_rate": 8.685926514226837e-08, + "loss": 0.80621284, + "num_input_tokens_seen": 326080945, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.25109863, + "step": 15113, + "time_per_iteration": 2.6208336353302 + }, + { + "auxiliary_loss_clip": 0.01224803, + "auxiliary_loss_mlp": 0.00196854, + "balance_loss_clip": 1.01194012, + "balance_loss_mlp": 0.17360786, + "epoch": 0.9087028408236886, + "flos": 34014873807360.0, + "grad_norm": 24.350724078749924, + "language_loss": 0.86321765, + "learning_rate": 8.674577274677508e-08, + "loss": 0.87743413, + "num_input_tokens_seen": 326100630, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.2322998, + "step": 15114, + "time_per_iteration": 2.733224630355835 + }, + { + "auxiliary_loss_clip": 0.01260497, + "auxiliary_loss_mlp": 0.00238578, + "balance_loss_clip": 1.03590631, + "balance_loss_mlp": 0.20930003, + "epoch": 0.9087629640763565, + "flos": 21944580266880.0, + "grad_norm": 5.080671438722505, + "language_loss": 0.81702667, + "learning_rate": 8.663235290207405e-08, + "loss": 0.83201742, + "num_input_tokens_seen": 326120145, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.29248047, + "step": 15115, + "time_per_iteration": 2.751502752304077 + }, + { + "auxiliary_loss_clip": 0.01253233, + "auxiliary_loss_mlp": 0.00232364, + "balance_loss_clip": 1.0299145, + "balance_loss_mlp": 0.20712715, + "epoch": 0.9088230873290245, + "flos": 21762908254080.0, + "grad_norm": 17.315585459801266, + "language_loss": 0.7295385, + "learning_rate": 8.651900561246561e-08, + "loss": 0.74439442, + "num_input_tokens_seen": 326140715, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.25219727, + "step": 15116, + "time_per_iteration": 2.750324249267578 + }, + { + "auxiliary_loss_clip": 0.01231621, + "auxiliary_loss_mlp": 0.00227707, + "balance_loss_clip": 1.01902962, + "balance_loss_mlp": 0.20230371, + "epoch": 0.9088832105816925, + "flos": 21541267382400.0, + "grad_norm": 6.086512907286031, + "language_loss": 0.76762772, + "learning_rate": 8.640573088224812e-08, + "loss": 0.78222102, + "num_input_tokens_seen": 326159130, + "router_z_loss_clip": 2.12597656, + "router_z_loss_mlp": 0.25402832, + "step": 15117, + "time_per_iteration": 2.627537250518799 + }, + { + "auxiliary_loss_clip": 0.0122826, + "auxiliary_loss_mlp": 0.00225797, + "balance_loss_clip": 1.01467538, + "balance_loss_mlp": 0.20101309, + "epoch": 0.9089433338343604, + "flos": 25996704428160.0, + "grad_norm": 277.3562179158369, + "language_loss": 0.81654751, + "learning_rate": 8.629252871571745e-08, + "loss": 0.83108807, + "num_input_tokens_seen": 326181375, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.24804688, + "step": 15118, + "time_per_iteration": 2.714554786682129 + }, + { + "auxiliary_loss_clip": 0.0125389, + "auxiliary_loss_mlp": 0.00246262, + "balance_loss_clip": 1.02963173, + "balance_loss_mlp": 0.21725804, + "epoch": 0.9090034570870285, + "flos": 21178426147200.0, + "grad_norm": 28.39081463998199, + "language_loss": 0.81093049, + "learning_rate": 8.617939911716554e-08, + "loss": 0.82593197, + "num_input_tokens_seen": 326199740, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.28991699, + "step": 15119, + "time_per_iteration": 2.727130889892578 + }, + { + "auxiliary_loss_clip": 0.01277118, + "auxiliary_loss_mlp": 0.00222221, + "balance_loss_clip": 1.04419303, + "balance_loss_mlp": 0.19541082, + "epoch": 0.9090635803396964, + "flos": 16141811045760.0, + "grad_norm": 7.983057635328793, + "language_loss": 0.8344717, + "learning_rate": 8.60663420908827e-08, + "loss": 0.84946513, + "num_input_tokens_seen": 326214350, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.26818848, + "step": 15120, + "time_per_iteration": 2.6598029136657715 + }, + { + "auxiliary_loss_clip": 0.01246671, + "auxiliary_loss_mlp": 0.00208842, + "balance_loss_clip": 1.02900279, + "balance_loss_mlp": 0.1849764, + "epoch": 0.9091237035923644, + "flos": 20591537829120.0, + "grad_norm": 21.751956817441865, + "language_loss": 0.75376219, + "learning_rate": 8.595335764115596e-08, + "loss": 0.76831734, + "num_input_tokens_seen": 326234580, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.23864746, + "step": 15121, + "time_per_iteration": 2.7122533321380615 + }, + { + "auxiliary_loss_clip": 0.01232927, + "auxiliary_loss_mlp": 0.00229382, + "balance_loss_clip": 1.01855206, + "balance_loss_mlp": 0.20096242, + "epoch": 0.9091838268450323, + "flos": 52227760164480.0, + "grad_norm": 13.785364555830267, + "language_loss": 0.79699314, + "learning_rate": 8.58404457722699e-08, + "loss": 0.8116163, + "num_input_tokens_seen": 326259080, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.28417969, + "step": 15122, + "time_per_iteration": 2.9996819496154785 + }, + { + "auxiliary_loss_clip": 0.01226085, + "auxiliary_loss_mlp": 0.00210293, + "balance_loss_clip": 1.01330447, + "balance_loss_mlp": 0.18608175, + "epoch": 0.9092439500977003, + "flos": 20559613616640.0, + "grad_norm": 14.825985257247167, + "language_loss": 0.79905462, + "learning_rate": 8.572760648850575e-08, + "loss": 0.81341839, + "num_input_tokens_seen": 326280175, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.24194336, + "step": 15123, + "time_per_iteration": 2.66717267036438 + }, + { + "auxiliary_loss_clip": 0.01216738, + "auxiliary_loss_mlp": 0.00210263, + "balance_loss_clip": 1.01142848, + "balance_loss_mlp": 0.18664718, + "epoch": 0.9093040733503682, + "flos": 28617859595520.0, + "grad_norm": 5.8845346947950485, + "language_loss": 0.84050941, + "learning_rate": 8.561483979414253e-08, + "loss": 0.85477942, + "num_input_tokens_seen": 326297990, + "router_z_loss_clip": 2.05175781, + "router_z_loss_mlp": 0.23632812, + "step": 15124, + "time_per_iteration": 2.734279155731201 + }, + { + "auxiliary_loss_clip": 0.01234298, + "auxiliary_loss_mlp": 0.00209191, + "balance_loss_clip": 1.02115321, + "balance_loss_mlp": 0.18389477, + "epoch": 0.9093641966030362, + "flos": 23440187784960.0, + "grad_norm": 16.523058782115832, + "language_loss": 0.81799531, + "learning_rate": 8.55021456934566e-08, + "loss": 0.83243024, + "num_input_tokens_seen": 326316735, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.25305176, + "step": 15125, + "time_per_iteration": 4.1393773555755615 + }, + { + "auxiliary_loss_clip": 0.01231239, + "auxiliary_loss_mlp": 0.00220865, + "balance_loss_clip": 1.02083254, + "balance_loss_mlp": 0.19698757, + "epoch": 0.9094243198557042, + "flos": 16800197385600.0, + "grad_norm": 39.37313926393241, + "language_loss": 0.85384774, + "learning_rate": 8.538952419072143e-08, + "loss": 0.86836874, + "num_input_tokens_seen": 326334370, + "router_z_loss_clip": 2.10351562, + "router_z_loss_mlp": 0.23876953, + "step": 15126, + "time_per_iteration": 4.127735376358032 + }, + { + "auxiliary_loss_clip": 0.01233311, + "auxiliary_loss_mlp": 0.00227436, + "balance_loss_clip": 1.02269173, + "balance_loss_mlp": 0.2022118, + "epoch": 0.9094844431083722, + "flos": 24273278899200.0, + "grad_norm": 30.433627003449057, + "language_loss": 0.82546598, + "learning_rate": 8.527697529020694e-08, + "loss": 0.84007347, + "num_input_tokens_seen": 326353435, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.25231934, + "step": 15127, + "time_per_iteration": 2.6523330211639404 + }, + { + "auxiliary_loss_clip": 0.01245982, + "auxiliary_loss_mlp": 0.00220862, + "balance_loss_clip": 1.02838814, + "balance_loss_mlp": 0.19606636, + "epoch": 0.9095445663610401, + "flos": 21944652094080.0, + "grad_norm": 40.97874751747872, + "language_loss": 0.7123149, + "learning_rate": 8.516449899618173e-08, + "loss": 0.72698331, + "num_input_tokens_seen": 326371810, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.24780273, + "step": 15128, + "time_per_iteration": 2.7147955894470215 + }, + { + "auxiliary_loss_clip": 0.01225352, + "auxiliary_loss_mlp": 0.00238594, + "balance_loss_clip": 1.01307487, + "balance_loss_mlp": 0.21454987, + "epoch": 0.9096046896137081, + "flos": 19792848965760.0, + "grad_norm": 8.084603448831395, + "language_loss": 0.83631045, + "learning_rate": 8.505209531291013e-08, + "loss": 0.85094988, + "num_input_tokens_seen": 326391380, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.24060059, + "step": 15129, + "time_per_iteration": 2.729275941848755 + }, + { + "auxiliary_loss_clip": 0.01250309, + "auxiliary_loss_mlp": 0.00227397, + "balance_loss_clip": 1.02889276, + "balance_loss_mlp": 0.20154038, + "epoch": 0.909664812866376, + "flos": 22638087129600.0, + "grad_norm": 179.25023300608527, + "language_loss": 0.92864645, + "learning_rate": 8.49397642446552e-08, + "loss": 0.94342351, + "num_input_tokens_seen": 326408800, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.25891113, + "step": 15130, + "time_per_iteration": 2.7799782752990723 + }, + { + "auxiliary_loss_clip": 0.01254229, + "auxiliary_loss_mlp": 0.00217971, + "balance_loss_clip": 1.03194284, + "balance_loss_mlp": 0.19124451, + "epoch": 0.909724936119044, + "flos": 39852153020160.0, + "grad_norm": 16.980449542495236, + "language_loss": 0.84180367, + "learning_rate": 8.482750579567644e-08, + "loss": 0.85652566, + "num_input_tokens_seen": 326431565, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.26708984, + "step": 15131, + "time_per_iteration": 2.851686954498291 + }, + { + "auxiliary_loss_clip": 0.01240352, + "auxiliary_loss_mlp": 0.00224074, + "balance_loss_clip": 1.02298355, + "balance_loss_mlp": 0.1986113, + "epoch": 0.9097850593717121, + "flos": 35071616954880.0, + "grad_norm": 20.160192051177717, + "language_loss": 0.67898977, + "learning_rate": 8.471531997023085e-08, + "loss": 0.69363403, + "num_input_tokens_seen": 326451715, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.25476074, + "step": 15132, + "time_per_iteration": 4.156470537185669 + }, + { + "auxiliary_loss_clip": 0.01232612, + "auxiliary_loss_mlp": 0.00180529, + "balance_loss_clip": 1.02338982, + "balance_loss_mlp": 0.15718766, + "epoch": 0.90984518262438, + "flos": 23367468700800.0, + "grad_norm": 11.62064872941279, + "language_loss": 0.88703758, + "learning_rate": 8.460320677257193e-08, + "loss": 0.901169, + "num_input_tokens_seen": 326470855, + "router_z_loss_clip": 2.09277344, + "router_z_loss_mlp": 0.23327637, + "step": 15133, + "time_per_iteration": 2.673035144805908 + }, + { + "auxiliary_loss_clip": 0.01229971, + "auxiliary_loss_mlp": 0.00221458, + "balance_loss_clip": 1.01463473, + "balance_loss_mlp": 0.19531512, + "epoch": 0.909905305877048, + "flos": 27523302405120.0, + "grad_norm": 4.662234059870096, + "language_loss": 0.82193661, + "learning_rate": 8.449116620695118e-08, + "loss": 0.83645087, + "num_input_tokens_seen": 326490480, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.26184082, + "step": 15134, + "time_per_iteration": 2.7051069736480713 + }, + { + "auxiliary_loss_clip": 0.0127149, + "auxiliary_loss_mlp": 0.0021566, + "balance_loss_clip": 1.04379392, + "balance_loss_mlp": 0.18825416, + "epoch": 0.9099654291297159, + "flos": 24347865490560.0, + "grad_norm": 4.135031815618909, + "language_loss": 0.80658638, + "learning_rate": 8.437919827761786e-08, + "loss": 0.82145792, + "num_input_tokens_seen": 326509445, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.27429199, + "step": 15135, + "time_per_iteration": 2.739424228668213 + }, + { + "auxiliary_loss_clip": 0.01226047, + "auxiliary_loss_mlp": 0.00207471, + "balance_loss_clip": 1.01436102, + "balance_loss_mlp": 0.18409391, + "epoch": 0.9100255523823839, + "flos": 21215234609280.0, + "grad_norm": 2.285088322402059, + "language_loss": 0.77890015, + "learning_rate": 8.426730298881702e-08, + "loss": 0.7932353, + "num_input_tokens_seen": 326528380, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.23376465, + "step": 15136, + "time_per_iteration": 2.6302878856658936 + }, + { + "auxiliary_loss_clip": 0.01093167, + "auxiliary_loss_mlp": 0.00084483, + "balance_loss_clip": 0.95630264, + "balance_loss_mlp": 0.07742613, + "epoch": 0.9100856756350518, + "flos": 46052276446080.0, + "grad_norm": 0.8046831985286951, + "language_loss": 0.58657587, + "learning_rate": 8.415548034479214e-08, + "loss": 0.59835237, + "num_input_tokens_seen": 326576940, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.07080078, + "step": 15137, + "time_per_iteration": 4.2811033725738525 + }, + { + "auxiliary_loss_clip": 0.01237998, + "auxiliary_loss_mlp": 0.0022448, + "balance_loss_clip": 1.01955235, + "balance_loss_mlp": 0.19863513, + "epoch": 0.9101457988877198, + "flos": 20229917656320.0, + "grad_norm": 13.151123540772781, + "language_loss": 0.89351696, + "learning_rate": 8.40437303497834e-08, + "loss": 0.90814173, + "num_input_tokens_seen": 326596100, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.25817871, + "step": 15138, + "time_per_iteration": 2.9926295280456543 + }, + { + "auxiliary_loss_clip": 0.01226286, + "auxiliary_loss_mlp": 0.00232432, + "balance_loss_clip": 1.01699555, + "balance_loss_mlp": 0.20841157, + "epoch": 0.9102059221403878, + "flos": 26615157822720.0, + "grad_norm": 4.762353008110366, + "language_loss": 0.86079192, + "learning_rate": 8.39320530080283e-08, + "loss": 0.87537909, + "num_input_tokens_seen": 326615700, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.24047852, + "step": 15139, + "time_per_iteration": 2.784212589263916 + }, + { + "auxiliary_loss_clip": 0.01239484, + "auxiliary_loss_mlp": 0.00218255, + "balance_loss_clip": 1.0214417, + "balance_loss_mlp": 0.19394815, + "epoch": 0.9102660453930558, + "flos": 21908561904000.0, + "grad_norm": 3.658266142626434, + "language_loss": 0.82913721, + "learning_rate": 8.382044832376167e-08, + "loss": 0.84371459, + "num_input_tokens_seen": 326635905, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.24328613, + "step": 15140, + "time_per_iteration": 2.8381917476654053 + }, + { + "auxiliary_loss_clip": 0.01234533, + "auxiliary_loss_mlp": 0.00201101, + "balance_loss_clip": 1.02145123, + "balance_loss_mlp": 0.17809312, + "epoch": 0.9103261686457237, + "flos": 36176660916480.0, + "grad_norm": 235.10382414086962, + "language_loss": 0.74030578, + "learning_rate": 8.370891630121569e-08, + "loss": 0.75466216, + "num_input_tokens_seen": 326661855, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.2298584, + "step": 15141, + "time_per_iteration": 2.9178528785705566 + }, + { + "auxiliary_loss_clip": 0.01241723, + "auxiliary_loss_mlp": 0.00220705, + "balance_loss_clip": 1.02146423, + "balance_loss_mlp": 0.19426477, + "epoch": 0.9103862918983917, + "flos": 23878549365120.0, + "grad_norm": 73.88859369565351, + "language_loss": 0.8326655, + "learning_rate": 8.359745694462005e-08, + "loss": 0.8472898, + "num_input_tokens_seen": 326679320, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.26464844, + "step": 15142, + "time_per_iteration": 2.892758846282959 + }, + { + "auxiliary_loss_clip": 0.01224074, + "auxiliary_loss_mlp": 0.00208755, + "balance_loss_clip": 1.01092994, + "balance_loss_mlp": 0.18599752, + "epoch": 0.9104464151510596, + "flos": 14939521989120.0, + "grad_norm": 13.582807217951162, + "language_loss": 0.71899247, + "learning_rate": 8.348607025820076e-08, + "loss": 0.73332077, + "num_input_tokens_seen": 326698110, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.22753906, + "step": 15143, + "time_per_iteration": 2.8068759441375732 + }, + { + "auxiliary_loss_clip": 0.01240837, + "auxiliary_loss_mlp": 0.00213262, + "balance_loss_clip": 1.02172256, + "balance_loss_mlp": 0.18543839, + "epoch": 0.9105065384037276, + "flos": 33655803500160.0, + "grad_norm": 10.196176277840399, + "language_loss": 0.7037164, + "learning_rate": 8.337475624618152e-08, + "loss": 0.71825743, + "num_input_tokens_seen": 326718370, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.27819824, + "step": 15144, + "time_per_iteration": 2.914198637008667 + }, + { + "auxiliary_loss_clip": 0.01212203, + "auxiliary_loss_mlp": 0.00198976, + "balance_loss_clip": 1.00518608, + "balance_loss_mlp": 0.17500326, + "epoch": 0.9105666616563957, + "flos": 24316695463680.0, + "grad_norm": 53.53801923569034, + "language_loss": 0.78150618, + "learning_rate": 8.326351491278382e-08, + "loss": 0.79561794, + "num_input_tokens_seen": 326738445, + "router_z_loss_clip": 2.07128906, + "router_z_loss_mlp": 0.23950195, + "step": 15145, + "time_per_iteration": 2.7549173831939697 + }, + { + "auxiliary_loss_clip": 0.01204832, + "auxiliary_loss_mlp": 0.00210952, + "balance_loss_clip": 0.9999218, + "balance_loss_mlp": 0.18770599, + "epoch": 0.9106267849090636, + "flos": 29971692132480.0, + "grad_norm": 2.3018574022160094, + "language_loss": 0.78138876, + "learning_rate": 8.315234626222545e-08, + "loss": 0.79554659, + "num_input_tokens_seen": 326758855, + "router_z_loss_clip": 2.04980469, + "router_z_loss_mlp": 0.23242188, + "step": 15146, + "time_per_iteration": 2.7828433513641357 + }, + { + "auxiliary_loss_clip": 0.01238532, + "auxiliary_loss_mlp": 0.00210142, + "balance_loss_clip": 1.02327538, + "balance_loss_mlp": 0.18527508, + "epoch": 0.9106869081617316, + "flos": 25337743470720.0, + "grad_norm": 200.22926889246204, + "language_loss": 0.82619298, + "learning_rate": 8.304125029872233e-08, + "loss": 0.84067976, + "num_input_tokens_seen": 326777140, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.2487793, + "step": 15147, + "time_per_iteration": 2.7467541694641113 + }, + { + "auxiliary_loss_clip": 0.01245322, + "auxiliary_loss_mlp": 0.0020246, + "balance_loss_clip": 1.02495563, + "balance_loss_mlp": 0.17739055, + "epoch": 0.9107470314143995, + "flos": 18187031543040.0, + "grad_norm": 4.997713145314226, + "language_loss": 0.88533688, + "learning_rate": 8.293022702648711e-08, + "loss": 0.89981467, + "num_input_tokens_seen": 326794070, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.25061035, + "step": 15148, + "time_per_iteration": 2.669952392578125 + }, + { + "auxiliary_loss_clip": 0.01233273, + "auxiliary_loss_mlp": 0.00223649, + "balance_loss_clip": 1.01673818, + "balance_loss_mlp": 0.19978349, + "epoch": 0.9108071546670675, + "flos": 23550828652800.0, + "grad_norm": 411.2638205817862, + "language_loss": 0.76953816, + "learning_rate": 8.281927644972996e-08, + "loss": 0.78410739, + "num_input_tokens_seen": 326814695, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.23876953, + "step": 15149, + "time_per_iteration": 2.7897896766662598 + }, + { + "auxiliary_loss_clip": 0.01237365, + "auxiliary_loss_mlp": 0.00211159, + "balance_loss_clip": 1.01795673, + "balance_loss_mlp": 0.18297818, + "epoch": 0.9108672779197354, + "flos": 25630307746560.0, + "grad_norm": 29.0189455799177, + "language_loss": 0.71968496, + "learning_rate": 8.270839857265776e-08, + "loss": 0.7341702, + "num_input_tokens_seen": 326835295, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.28222656, + "step": 15150, + "time_per_iteration": 2.790698766708374 + }, + { + "auxiliary_loss_clip": 0.01240637, + "auxiliary_loss_mlp": 0.00224068, + "balance_loss_clip": 1.02360678, + "balance_loss_mlp": 0.19846165, + "epoch": 0.9109274011724035, + "flos": 22339094319360.0, + "grad_norm": 4.404218003408579, + "language_loss": 0.81157684, + "learning_rate": 8.259759339947514e-08, + "loss": 0.82622385, + "num_input_tokens_seen": 326853350, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.25585938, + "step": 15151, + "time_per_iteration": 2.655329465866089 + }, + { + "auxiliary_loss_clip": 0.01211022, + "auxiliary_loss_mlp": 0.00210151, + "balance_loss_clip": 1.00517368, + "balance_loss_mlp": 0.18709597, + "epoch": 0.9109875244250714, + "flos": 26688200129280.0, + "grad_norm": 1775.8284102356038, + "language_loss": 0.71813691, + "learning_rate": 8.248686093438429e-08, + "loss": 0.73234868, + "num_input_tokens_seen": 326873425, + "router_z_loss_clip": 2.05371094, + "router_z_loss_mlp": 0.23046875, + "step": 15152, + "time_per_iteration": 2.7658543586730957 + }, + { + "auxiliary_loss_clip": 0.0124139, + "auxiliary_loss_mlp": 0.00217355, + "balance_loss_clip": 1.01996422, + "balance_loss_mlp": 0.19221354, + "epoch": 0.9110476476777394, + "flos": 22930112701440.0, + "grad_norm": 57.10333627716773, + "language_loss": 0.80128002, + "learning_rate": 8.23762011815834e-08, + "loss": 0.81586748, + "num_input_tokens_seen": 326893455, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.25146484, + "step": 15153, + "time_per_iteration": 2.7493207454681396 + }, + { + "auxiliary_loss_clip": 0.01243548, + "auxiliary_loss_mlp": 0.00221812, + "balance_loss_clip": 1.02788627, + "balance_loss_mlp": 0.19724312, + "epoch": 0.9111077709304073, + "flos": 13472857854720.0, + "grad_norm": 4.699183421207225, + "language_loss": 0.79303461, + "learning_rate": 8.226561414526956e-08, + "loss": 0.80768824, + "num_input_tokens_seen": 326910210, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.24584961, + "step": 15154, + "time_per_iteration": 2.6382365226745605 + }, + { + "auxiliary_loss_clip": 0.01234451, + "auxiliary_loss_mlp": 0.00215111, + "balance_loss_clip": 1.0237031, + "balance_loss_mlp": 0.19022022, + "epoch": 0.9111678941830753, + "flos": 20850561780480.0, + "grad_norm": 3.2425487081174853, + "language_loss": 0.91590154, + "learning_rate": 8.215509982963564e-08, + "loss": 0.93039721, + "num_input_tokens_seen": 326929350, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.2487793, + "step": 15155, + "time_per_iteration": 2.6384549140930176 + }, + { + "auxiliary_loss_clip": 0.01236431, + "auxiliary_loss_mlp": 0.00218317, + "balance_loss_clip": 1.0227834, + "balance_loss_mlp": 0.19455847, + "epoch": 0.9112280174357432, + "flos": 19682244011520.0, + "grad_norm": 43.41043724460105, + "language_loss": 0.66947699, + "learning_rate": 8.204465823887252e-08, + "loss": 0.68402445, + "num_input_tokens_seen": 326949060, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.23742676, + "step": 15156, + "time_per_iteration": 2.6806352138519287 + }, + { + "auxiliary_loss_clip": 0.01241259, + "auxiliary_loss_mlp": 0.00208573, + "balance_loss_clip": 1.02099764, + "balance_loss_mlp": 0.1825735, + "epoch": 0.9112881406884112, + "flos": 25447163276160.0, + "grad_norm": 700.4247347549093, + "language_loss": 0.81445742, + "learning_rate": 8.193428937716796e-08, + "loss": 0.82895577, + "num_input_tokens_seen": 326968950, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.26013184, + "step": 15157, + "time_per_iteration": 2.670891761779785 + }, + { + "auxiliary_loss_clip": 0.01215914, + "auxiliary_loss_mlp": 0.00186535, + "balance_loss_clip": 1.00579393, + "balance_loss_mlp": 0.1640761, + "epoch": 0.9113482639410793, + "flos": 33066975847680.0, + "grad_norm": 3.632969599038402, + "language_loss": 0.68521476, + "learning_rate": 8.182399324870747e-08, + "loss": 0.69923925, + "num_input_tokens_seen": 326989455, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.22460938, + "step": 15158, + "time_per_iteration": 2.784550189971924 + }, + { + "auxiliary_loss_clip": 0.01221766, + "auxiliary_loss_mlp": 0.00220605, + "balance_loss_clip": 1.01060426, + "balance_loss_mlp": 0.19570239, + "epoch": 0.9114083871937472, + "flos": 21835591424640.0, + "grad_norm": 2543.343420082134, + "language_loss": 0.75065589, + "learning_rate": 8.171376985767375e-08, + "loss": 0.76507962, + "num_input_tokens_seen": 327009640, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.24890137, + "step": 15159, + "time_per_iteration": 2.6260087490081787 + }, + { + "auxiliary_loss_clip": 0.01225778, + "auxiliary_loss_mlp": 0.00210869, + "balance_loss_clip": 1.01403475, + "balance_loss_mlp": 0.18565577, + "epoch": 0.9114685104464152, + "flos": 27088999061760.0, + "grad_norm": 2.4633408166723356, + "language_loss": 0.85357761, + "learning_rate": 8.160361920824588e-08, + "loss": 0.86794412, + "num_input_tokens_seen": 327027690, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.25231934, + "step": 15160, + "time_per_iteration": 2.6778383255004883 + }, + { + "auxiliary_loss_clip": 0.01265714, + "auxiliary_loss_mlp": 0.00234041, + "balance_loss_clip": 1.04277742, + "balance_loss_mlp": 0.20487064, + "epoch": 0.9115286336990831, + "flos": 17967042696960.0, + "grad_norm": 244.95996680243576, + "language_loss": 0.78334773, + "learning_rate": 8.149354130460073e-08, + "loss": 0.79834521, + "num_input_tokens_seen": 327045915, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.29187012, + "step": 15161, + "time_per_iteration": 2.819289445877075 + }, + { + "auxiliary_loss_clip": 0.01242967, + "auxiliary_loss_mlp": 0.00213949, + "balance_loss_clip": 1.01964033, + "balance_loss_mlp": 0.18790133, + "epoch": 0.9115887569517511, + "flos": 22929861306240.0, + "grad_norm": 15.838985480114276, + "language_loss": 0.85780823, + "learning_rate": 8.138353615091321e-08, + "loss": 0.8723774, + "num_input_tokens_seen": 327066355, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.26037598, + "step": 15162, + "time_per_iteration": 2.7309370040893555 + }, + { + "auxiliary_loss_clip": 0.01224224, + "auxiliary_loss_mlp": 0.00233217, + "balance_loss_clip": 1.01448941, + "balance_loss_mlp": 0.2092796, + "epoch": 0.911648880204419, + "flos": 23988436047360.0, + "grad_norm": 26.511323279638745, + "language_loss": 0.73819649, + "learning_rate": 8.127360375135395e-08, + "loss": 0.7527709, + "num_input_tokens_seen": 327086735, + "router_z_loss_clip": 2.10058594, + "router_z_loss_mlp": 0.23925781, + "step": 15163, + "time_per_iteration": 2.7200591564178467 + }, + { + "auxiliary_loss_clip": 0.01251838, + "auxiliary_loss_mlp": 0.00242752, + "balance_loss_clip": 1.02505422, + "balance_loss_mlp": 0.21620445, + "epoch": 0.911709003457087, + "flos": 17055306754560.0, + "grad_norm": 368.0479644577485, + "language_loss": 0.81335402, + "learning_rate": 8.116374411009186e-08, + "loss": 0.82829988, + "num_input_tokens_seen": 327104035, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.26574707, + "step": 15164, + "time_per_iteration": 2.6807126998901367 + }, + { + "auxiliary_loss_clip": 0.01226789, + "auxiliary_loss_mlp": 0.00199334, + "balance_loss_clip": 1.01723647, + "balance_loss_mlp": 0.17582585, + "epoch": 0.911769126709755, + "flos": 21653344794240.0, + "grad_norm": 22.141503100476793, + "language_loss": 0.82143152, + "learning_rate": 8.105395723129315e-08, + "loss": 0.83569276, + "num_input_tokens_seen": 327124370, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.23510742, + "step": 15165, + "time_per_iteration": 2.6981706619262695 + }, + { + "auxiliary_loss_clip": 0.01248853, + "auxiliary_loss_mlp": 0.00211277, + "balance_loss_clip": 1.02917433, + "balance_loss_mlp": 0.18599312, + "epoch": 0.911829249962423, + "flos": 24790321221120.0, + "grad_norm": 3.601440051723143, + "language_loss": 0.82748806, + "learning_rate": 8.094424311912074e-08, + "loss": 0.84208935, + "num_input_tokens_seen": 327140915, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.25292969, + "step": 15166, + "time_per_iteration": 2.6600568294525146 + }, + { + "auxiliary_loss_clip": 0.01263865, + "auxiliary_loss_mlp": 0.00226583, + "balance_loss_clip": 1.03252041, + "balance_loss_mlp": 0.19923644, + "epoch": 0.9118893732150909, + "flos": 20959406968320.0, + "grad_norm": 3.6419729031099175, + "language_loss": 0.82391262, + "learning_rate": 8.083460177773482e-08, + "loss": 0.83881712, + "num_input_tokens_seen": 327158940, + "router_z_loss_clip": 2.31054688, + "router_z_loss_mlp": 0.27355957, + "step": 15167, + "time_per_iteration": 4.059603691101074 + }, + { + "auxiliary_loss_clip": 0.01086792, + "auxiliary_loss_mlp": 0.00074483, + "balance_loss_clip": 0.95079195, + "balance_loss_mlp": 0.06795068, + "epoch": 0.9119494964677589, + "flos": 67917385872000.0, + "grad_norm": 0.7555371186955577, + "language_loss": 0.64838064, + "learning_rate": 8.072503321129298e-08, + "loss": 0.65999341, + "num_input_tokens_seen": 327217450, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.06542969, + "step": 15168, + "time_per_iteration": 4.550852298736572 + }, + { + "auxiliary_loss_clip": 0.0123159, + "auxiliary_loss_mlp": 0.00210662, + "balance_loss_clip": 1.02025867, + "balance_loss_mlp": 0.18728471, + "epoch": 0.9120096197204268, + "flos": 18551524803840.0, + "grad_norm": 11.23093551234543, + "language_loss": 0.85729623, + "learning_rate": 8.061553742395033e-08, + "loss": 0.87171876, + "num_input_tokens_seen": 327233905, + "router_z_loss_clip": 2.11230469, + "router_z_loss_mlp": 0.23376465, + "step": 15169, + "time_per_iteration": 2.6620635986328125 + }, + { + "auxiliary_loss_clip": 0.01234909, + "auxiliary_loss_mlp": 0.00213502, + "balance_loss_clip": 1.01660764, + "balance_loss_mlp": 0.18782394, + "epoch": 0.9120697429730948, + "flos": 19025725178880.0, + "grad_norm": 75.19426090336482, + "language_loss": 0.89947945, + "learning_rate": 8.05061144198591e-08, + "loss": 0.91396362, + "num_input_tokens_seen": 327252430, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.25695801, + "step": 15170, + "time_per_iteration": 2.6526594161987305 + }, + { + "auxiliary_loss_clip": 0.01239277, + "auxiliary_loss_mlp": 0.00213506, + "balance_loss_clip": 1.01981449, + "balance_loss_mlp": 0.18722011, + "epoch": 0.9121298662257629, + "flos": 17163685065600.0, + "grad_norm": 219.02244106901773, + "language_loss": 0.88929844, + "learning_rate": 8.039676420316799e-08, + "loss": 0.90382624, + "num_input_tokens_seen": 327269215, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.26269531, + "step": 15171, + "time_per_iteration": 2.6428802013397217 + }, + { + "auxiliary_loss_clip": 0.01229491, + "auxiliary_loss_mlp": 0.00224893, + "balance_loss_clip": 1.01352739, + "balance_loss_mlp": 0.2013019, + "epoch": 0.9121899894784308, + "flos": 19682710888320.0, + "grad_norm": 70.6049377733483, + "language_loss": 0.72601545, + "learning_rate": 8.02874867780241e-08, + "loss": 0.74055934, + "num_input_tokens_seen": 327290320, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.23596191, + "step": 15172, + "time_per_iteration": 2.6725051403045654 + }, + { + "auxiliary_loss_clip": 0.01243418, + "auxiliary_loss_mlp": 0.00239439, + "balance_loss_clip": 1.02744496, + "balance_loss_mlp": 0.21324843, + "epoch": 0.9122501127310988, + "flos": 22235743912320.0, + "grad_norm": 132.2154748583167, + "language_loss": 0.82434773, + "learning_rate": 8.017828214857103e-08, + "loss": 0.8391763, + "num_input_tokens_seen": 327310150, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.26184082, + "step": 15173, + "time_per_iteration": 2.69443678855896 + }, + { + "auxiliary_loss_clip": 0.01256779, + "auxiliary_loss_mlp": 0.00221783, + "balance_loss_clip": 1.03077793, + "balance_loss_mlp": 0.19428153, + "epoch": 0.9123102359837667, + "flos": 15957122290560.0, + "grad_norm": 61.906340312503744, + "language_loss": 0.75600952, + "learning_rate": 8.00691503189499e-08, + "loss": 0.77079517, + "num_input_tokens_seen": 327326660, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.27526855, + "step": 15174, + "time_per_iteration": 4.156372785568237 + }, + { + "auxiliary_loss_clip": 0.01227495, + "auxiliary_loss_mlp": 0.00191075, + "balance_loss_clip": 1.01310587, + "balance_loss_mlp": 0.16655341, + "epoch": 0.9123703592364347, + "flos": 25155784149120.0, + "grad_norm": 2.9696271677495165, + "language_loss": 0.83573443, + "learning_rate": 7.996009129329894e-08, + "loss": 0.84992015, + "num_input_tokens_seen": 327346700, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.24536133, + "step": 15175, + "time_per_iteration": 2.6867010593414307 + }, + { + "auxiliary_loss_clip": 0.01089776, + "auxiliary_loss_mlp": 0.00068723, + "balance_loss_clip": 0.95233476, + "balance_loss_mlp": 0.06266695, + "epoch": 0.9124304824891026, + "flos": 60801650812800.0, + "grad_norm": 0.9574020081588935, + "language_loss": 0.57630992, + "learning_rate": 7.985110507575421e-08, + "loss": 0.58789492, + "num_input_tokens_seen": 327403050, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.06054688, + "step": 15176, + "time_per_iteration": 3.2126123905181885 + }, + { + "auxiliary_loss_clip": 0.01235554, + "auxiliary_loss_mlp": 0.00210934, + "balance_loss_clip": 1.01904273, + "balance_loss_mlp": 0.18431483, + "epoch": 0.9124906057417707, + "flos": 18150941352960.0, + "grad_norm": 1127.9117966502836, + "language_loss": 0.76152539, + "learning_rate": 7.97421916704475e-08, + "loss": 0.77599025, + "num_input_tokens_seen": 327422225, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.26635742, + "step": 15177, + "time_per_iteration": 2.6383533477783203 + }, + { + "auxiliary_loss_clip": 0.01229141, + "auxiliary_loss_mlp": 0.00203188, + "balance_loss_clip": 1.01862979, + "balance_loss_mlp": 0.17934623, + "epoch": 0.9125507289944386, + "flos": 11686769049600.0, + "grad_norm": 5.130844342602503, + "language_loss": 0.8923496, + "learning_rate": 7.963335108150926e-08, + "loss": 0.90667284, + "num_input_tokens_seen": 327437025, + "router_z_loss_clip": 2.10449219, + "router_z_loss_mlp": 0.23852539, + "step": 15178, + "time_per_iteration": 2.6197571754455566 + }, + { + "auxiliary_loss_clip": 0.01229213, + "auxiliary_loss_mlp": 0.00248999, + "balance_loss_clip": 1.01261961, + "balance_loss_mlp": 0.22314274, + "epoch": 0.9126108522471066, + "flos": 17748813617280.0, + "grad_norm": 12.750744013220011, + "language_loss": 0.86715716, + "learning_rate": 7.952458331306711e-08, + "loss": 0.88193929, + "num_input_tokens_seen": 327453915, + "router_z_loss_clip": 2.16503906, + "router_z_loss_mlp": 0.25830078, + "step": 15179, + "time_per_iteration": 4.0271079540252686 + }, + { + "auxiliary_loss_clip": 0.01217052, + "auxiliary_loss_mlp": 0.00179149, + "balance_loss_clip": 1.00913072, + "balance_loss_mlp": 0.15509218, + "epoch": 0.9126709754997745, + "flos": 27635738952960.0, + "grad_norm": 449.8073812144304, + "language_loss": 0.74556196, + "learning_rate": 7.941588836924507e-08, + "loss": 0.75952399, + "num_input_tokens_seen": 327474415, + "router_z_loss_clip": 2.08105469, + "router_z_loss_mlp": 0.24047852, + "step": 15180, + "time_per_iteration": 2.7400448322296143 + }, + { + "auxiliary_loss_clip": 0.01215482, + "auxiliary_loss_mlp": 0.00200521, + "balance_loss_clip": 1.00917506, + "balance_loss_mlp": 0.1771203, + "epoch": 0.9127310987524425, + "flos": 15924982596480.0, + "grad_norm": 6.166886355935559, + "language_loss": 0.8377676, + "learning_rate": 7.930726625416495e-08, + "loss": 0.85192764, + "num_input_tokens_seen": 327492750, + "router_z_loss_clip": 2.06542969, + "router_z_loss_mlp": 0.23388672, + "step": 15181, + "time_per_iteration": 2.6591107845306396 + }, + { + "auxiliary_loss_clip": 0.01250395, + "auxiliary_loss_mlp": 0.00207713, + "balance_loss_clip": 1.02755022, + "balance_loss_mlp": 0.18104537, + "epoch": 0.9127912220051104, + "flos": 21536885923200.0, + "grad_norm": 16.176513667530834, + "language_loss": 0.83381116, + "learning_rate": 7.919871697194614e-08, + "loss": 0.84839219, + "num_input_tokens_seen": 327509470, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.26660156, + "step": 15182, + "time_per_iteration": 2.668879985809326 + }, + { + "auxiliary_loss_clip": 0.01247845, + "auxiliary_loss_mlp": 0.00219359, + "balance_loss_clip": 1.02350307, + "balance_loss_mlp": 0.19341953, + "epoch": 0.9128513452577784, + "flos": 24063561342720.0, + "grad_norm": 2.6778235083100212, + "language_loss": 0.83988869, + "learning_rate": 7.909024052670421e-08, + "loss": 0.85456073, + "num_input_tokens_seen": 327530520, + "router_z_loss_clip": 2.24511719, + "router_z_loss_mlp": 0.25939941, + "step": 15183, + "time_per_iteration": 2.705538511276245 + }, + { + "auxiliary_loss_clip": 0.01259019, + "auxiliary_loss_mlp": 0.0022853, + "balance_loss_clip": 1.03534579, + "balance_loss_mlp": 0.20338893, + "epoch": 0.9129114685104465, + "flos": 16216469464320.0, + "grad_norm": 8.317495280846062, + "language_loss": 0.8492372, + "learning_rate": 7.898183692255256e-08, + "loss": 0.86411273, + "num_input_tokens_seen": 327546960, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.25146484, + "step": 15184, + "time_per_iteration": 2.634359359741211 + }, + { + "auxiliary_loss_clip": 0.01227068, + "auxiliary_loss_mlp": 0.00195445, + "balance_loss_clip": 1.01634467, + "balance_loss_mlp": 0.17341486, + "epoch": 0.9129715917631144, + "flos": 19384364522880.0, + "grad_norm": 7.524071213030906, + "language_loss": 0.83471233, + "learning_rate": 7.887350616360233e-08, + "loss": 0.84893751, + "num_input_tokens_seen": 327564830, + "router_z_loss_clip": 2.10839844, + "router_z_loss_mlp": 0.22033691, + "step": 15185, + "time_per_iteration": 2.7252070903778076 + }, + { + "auxiliary_loss_clip": 0.01230929, + "auxiliary_loss_mlp": 0.00211495, + "balance_loss_clip": 1.01518011, + "balance_loss_mlp": 0.18588866, + "epoch": 0.9130317150157824, + "flos": 20590460421120.0, + "grad_norm": 4.870786995984541, + "language_loss": 0.78439516, + "learning_rate": 7.876524825396158e-08, + "loss": 0.79881936, + "num_input_tokens_seen": 327583675, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.25634766, + "step": 15186, + "time_per_iteration": 2.657087564468384 + }, + { + "auxiliary_loss_clip": 0.01245716, + "auxiliary_loss_mlp": 0.00212773, + "balance_loss_clip": 1.02639318, + "balance_loss_mlp": 0.18831134, + "epoch": 0.9130918382684503, + "flos": 20189230525440.0, + "grad_norm": 19.45421687575443, + "language_loss": 0.86118162, + "learning_rate": 7.865706319773502e-08, + "loss": 0.87576652, + "num_input_tokens_seen": 327602280, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.24475098, + "step": 15187, + "time_per_iteration": 2.6904025077819824 + }, + { + "auxiliary_loss_clip": 0.01244566, + "auxiliary_loss_mlp": 0.00214419, + "balance_loss_clip": 1.02381456, + "balance_loss_mlp": 0.18942064, + "epoch": 0.9131519615211183, + "flos": 25556870390400.0, + "grad_norm": 561.3572068252946, + "language_loss": 0.73943448, + "learning_rate": 7.854895099902515e-08, + "loss": 0.75402439, + "num_input_tokens_seen": 327623515, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.25024414, + "step": 15188, + "time_per_iteration": 2.680144786834717 + }, + { + "auxiliary_loss_clip": 0.01228962, + "auxiliary_loss_mlp": 0.00224844, + "balance_loss_clip": 1.0164156, + "balance_loss_mlp": 0.19998923, + "epoch": 0.9132120847737862, + "flos": 17931563038080.0, + "grad_norm": 2.3920337343922795, + "language_loss": 0.85091245, + "learning_rate": 7.844091166193157e-08, + "loss": 0.86545062, + "num_input_tokens_seen": 327642875, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.24853516, + "step": 15189, + "time_per_iteration": 2.670252799987793 + }, + { + "auxiliary_loss_clip": 0.01226291, + "auxiliary_loss_mlp": 0.00207947, + "balance_loss_clip": 1.01066208, + "balance_loss_mlp": 0.18425986, + "epoch": 0.9132722080264543, + "flos": 20047635112320.0, + "grad_norm": 51.18450463493105, + "language_loss": 0.83647001, + "learning_rate": 7.8332945190551e-08, + "loss": 0.85081238, + "num_input_tokens_seen": 327662450, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.23693848, + "step": 15190, + "time_per_iteration": 2.7182059288024902 + }, + { + "auxiliary_loss_clip": 0.01083239, + "auxiliary_loss_mlp": 0.00091578, + "balance_loss_clip": 0.94682646, + "balance_loss_mlp": 0.0839009, + "epoch": 0.9133323312791222, + "flos": 70439967141120.0, + "grad_norm": 0.7027294052999026, + "language_loss": 0.56410134, + "learning_rate": 7.822505158897797e-08, + "loss": 0.57584947, + "num_input_tokens_seen": 327723845, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.07666016, + "step": 15191, + "time_per_iteration": 3.1930105686187744 + }, + { + "auxiliary_loss_clip": 0.01233394, + "auxiliary_loss_mlp": 0.00200004, + "balance_loss_clip": 1.02109194, + "balance_loss_mlp": 0.17491055, + "epoch": 0.9133924545317902, + "flos": 25483792170240.0, + "grad_norm": 5.216472784138341, + "language_loss": 0.80534714, + "learning_rate": 7.81172308613034e-08, + "loss": 0.81968111, + "num_input_tokens_seen": 327742590, + "router_z_loss_clip": 2.12207031, + "router_z_loss_mlp": 0.25097656, + "step": 15192, + "time_per_iteration": 2.768113613128662 + }, + { + "auxiliary_loss_clip": 0.01230177, + "auxiliary_loss_mlp": 0.00207486, + "balance_loss_clip": 1.01518404, + "balance_loss_mlp": 0.18372795, + "epoch": 0.9134525777844581, + "flos": 39930690107520.0, + "grad_norm": 424.1350886672366, + "language_loss": 0.77126062, + "learning_rate": 7.800948301161647e-08, + "loss": 0.78563726, + "num_input_tokens_seen": 327764350, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.23779297, + "step": 15193, + "time_per_iteration": 2.8360421657562256 + }, + { + "auxiliary_loss_clip": 0.01235898, + "auxiliary_loss_mlp": 0.0021955, + "balance_loss_clip": 1.02089691, + "balance_loss_mlp": 0.19500509, + "epoch": 0.9135127010371261, + "flos": 20886723797760.0, + "grad_norm": 2229.675209293125, + "language_loss": 0.80308104, + "learning_rate": 7.790180804400215e-08, + "loss": 0.81763542, + "num_input_tokens_seen": 327783120, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.2454834, + "step": 15194, + "time_per_iteration": 2.6483874320983887 + }, + { + "auxiliary_loss_clip": 0.01244318, + "auxiliary_loss_mlp": 0.00236826, + "balance_loss_clip": 1.02529311, + "balance_loss_mlp": 0.20955065, + "epoch": 0.913572824289794, + "flos": 20813250528000.0, + "grad_norm": 20.92280030603179, + "language_loss": 0.72246176, + "learning_rate": 7.779420596254383e-08, + "loss": 0.73727322, + "num_input_tokens_seen": 327801960, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.27282715, + "step": 15195, + "time_per_iteration": 2.6423444747924805 + }, + { + "auxiliary_loss_clip": 0.01235802, + "auxiliary_loss_mlp": 0.00224938, + "balance_loss_clip": 1.01794624, + "balance_loss_mlp": 0.19871241, + "epoch": 0.913632947542462, + "flos": 25703278225920.0, + "grad_norm": 10.094544826301764, + "language_loss": 0.79457068, + "learning_rate": 7.768667677132201e-08, + "loss": 0.80917799, + "num_input_tokens_seen": 327823795, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.26269531, + "step": 15196, + "time_per_iteration": 2.7720067501068115 + }, + { + "auxiliary_loss_clip": 0.01236305, + "auxiliary_loss_mlp": 0.00211033, + "balance_loss_clip": 1.02253544, + "balance_loss_mlp": 0.1874655, + "epoch": 0.9136930707951301, + "flos": 26286216048000.0, + "grad_norm": 2.5689518077882143, + "language_loss": 0.7730189, + "learning_rate": 7.757922047441411e-08, + "loss": 0.78749228, + "num_input_tokens_seen": 327845175, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.23571777, + "step": 15197, + "time_per_iteration": 2.6641390323638916 + }, + { + "auxiliary_loss_clip": 0.01244292, + "auxiliary_loss_mlp": 0.00221147, + "balance_loss_clip": 1.02214587, + "balance_loss_mlp": 0.19465879, + "epoch": 0.913753194047798, + "flos": 22091885942400.0, + "grad_norm": 28.933515667311795, + "language_loss": 0.85417068, + "learning_rate": 7.747183707589489e-08, + "loss": 0.86882508, + "num_input_tokens_seen": 327863150, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.26501465, + "step": 15198, + "time_per_iteration": 2.769416093826294 + }, + { + "auxiliary_loss_clip": 0.01234326, + "auxiliary_loss_mlp": 0.00203303, + "balance_loss_clip": 1.02150559, + "balance_loss_mlp": 0.17687465, + "epoch": 0.913813317300466, + "flos": 23587206151680.0, + "grad_norm": 49.53372887669639, + "language_loss": 0.74102563, + "learning_rate": 7.736452657983616e-08, + "loss": 0.75540197, + "num_input_tokens_seen": 327883445, + "router_z_loss_clip": 2.12792969, + "router_z_loss_mlp": 0.2644043, + "step": 15199, + "time_per_iteration": 2.7137839794158936 + }, + { + "auxiliary_loss_clip": 0.01249806, + "auxiliary_loss_mlp": 0.00215566, + "balance_loss_clip": 1.03012824, + "balance_loss_mlp": 0.18949476, + "epoch": 0.9138734405531339, + "flos": 28876452583680.0, + "grad_norm": 50.33344386873857, + "language_loss": 0.74128026, + "learning_rate": 7.725728899030714e-08, + "loss": 0.755934, + "num_input_tokens_seen": 327905745, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.26074219, + "step": 15200, + "time_per_iteration": 2.726574182510376 + }, + { + "auxiliary_loss_clip": 0.01231855, + "auxiliary_loss_mlp": 0.00216258, + "balance_loss_clip": 1.01907432, + "balance_loss_mlp": 0.19178393, + "epoch": 0.9139335638058019, + "flos": 22821087945600.0, + "grad_norm": 6.275974848505211, + "language_loss": 0.79504609, + "learning_rate": 7.715012431137435e-08, + "loss": 0.80952722, + "num_input_tokens_seen": 327925435, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.24475098, + "step": 15201, + "time_per_iteration": 2.650866746902466 + }, + { + "auxiliary_loss_clip": 0.01226285, + "auxiliary_loss_mlp": 0.002421, + "balance_loss_clip": 1.01515508, + "balance_loss_mlp": 0.21699433, + "epoch": 0.9139936870584698, + "flos": 18004174381440.0, + "grad_norm": 22.370980122587003, + "language_loss": 0.78725278, + "learning_rate": 7.704303254710165e-08, + "loss": 0.80193663, + "num_input_tokens_seen": 327944145, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.25134277, + "step": 15202, + "time_per_iteration": 2.6384048461914062 + }, + { + "auxiliary_loss_clip": 0.01236251, + "auxiliary_loss_mlp": 0.00211777, + "balance_loss_clip": 1.0210743, + "balance_loss_mlp": 0.18438303, + "epoch": 0.9140538103111379, + "flos": 15813767111040.0, + "grad_norm": 67.20164936036782, + "language_loss": 0.79440361, + "learning_rate": 7.693601370155001e-08, + "loss": 0.80888391, + "num_input_tokens_seen": 327960565, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.27416992, + "step": 15203, + "time_per_iteration": 2.649212121963501 + }, + { + "auxiliary_loss_clip": 0.01234347, + "auxiliary_loss_mlp": 0.00211462, + "balance_loss_clip": 1.022717, + "balance_loss_mlp": 0.18674992, + "epoch": 0.9141139335638058, + "flos": 23987035416960.0, + "grad_norm": 118.10637971284312, + "language_loss": 0.77199733, + "learning_rate": 7.682906777877751e-08, + "loss": 0.78645545, + "num_input_tokens_seen": 327981180, + "router_z_loss_clip": 2.11425781, + "router_z_loss_mlp": 0.24694824, + "step": 15204, + "time_per_iteration": 2.743765354156494 + }, + { + "auxiliary_loss_clip": 0.01237347, + "auxiliary_loss_mlp": 0.00242253, + "balance_loss_clip": 1.01805544, + "balance_loss_mlp": 0.21553841, + "epoch": 0.9141740568164738, + "flos": 24024418496640.0, + "grad_norm": 11.599380362575976, + "language_loss": 0.701024, + "learning_rate": 7.672219478283915e-08, + "loss": 0.71581995, + "num_input_tokens_seen": 328001500, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.26721191, + "step": 15205, + "time_per_iteration": 2.7192745208740234 + }, + { + "auxiliary_loss_clip": 0.01230902, + "auxiliary_loss_mlp": 0.00204756, + "balance_loss_clip": 1.01944852, + "balance_loss_mlp": 0.18013912, + "epoch": 0.9142341800691417, + "flos": 27018291139200.0, + "grad_norm": 59.03333719654073, + "language_loss": 0.89449704, + "learning_rate": 7.661539471778811e-08, + "loss": 0.90885359, + "num_input_tokens_seen": 328023025, + "router_z_loss_clip": 2.11230469, + "router_z_loss_mlp": 0.24609375, + "step": 15206, + "time_per_iteration": 2.7348172664642334 + }, + { + "auxiliary_loss_clip": 0.01227972, + "auxiliary_loss_mlp": 0.00217726, + "balance_loss_clip": 1.01562572, + "balance_loss_mlp": 0.19318049, + "epoch": 0.9142943033218097, + "flos": 20412487509120.0, + "grad_norm": 20.402348763291076, + "language_loss": 0.8304742, + "learning_rate": 7.650866758767382e-08, + "loss": 0.84493124, + "num_input_tokens_seen": 328041410, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.24536133, + "step": 15207, + "time_per_iteration": 2.921351671218872 + }, + { + "auxiliary_loss_clip": 0.01237938, + "auxiliary_loss_mlp": 0.00217112, + "balance_loss_clip": 1.01989019, + "balance_loss_mlp": 0.19408101, + "epoch": 0.9143544265744776, + "flos": 19755322231680.0, + "grad_norm": 4.525769776494969, + "language_loss": 0.81761432, + "learning_rate": 7.640201339654373e-08, + "loss": 0.83216488, + "num_input_tokens_seen": 328060495, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.23059082, + "step": 15208, + "time_per_iteration": 2.7250730991363525 + }, + { + "auxiliary_loss_clip": 0.01230924, + "auxiliary_loss_mlp": 0.00215395, + "balance_loss_clip": 1.02181101, + "balance_loss_mlp": 0.1923995, + "epoch": 0.9144145498271457, + "flos": 17165444832000.0, + "grad_norm": 5.290281620066998, + "language_loss": 0.94196707, + "learning_rate": 7.629543214844237e-08, + "loss": 0.9564302, + "num_input_tokens_seen": 328076905, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.23010254, + "step": 15209, + "time_per_iteration": 4.105029106140137 + }, + { + "auxiliary_loss_clip": 0.01248252, + "auxiliary_loss_mlp": 0.00233749, + "balance_loss_clip": 1.02507555, + "balance_loss_mlp": 0.20609234, + "epoch": 0.9144746730798137, + "flos": 23726072131200.0, + "grad_norm": 17.456356301018843, + "language_loss": 0.84735978, + "learning_rate": 7.618892384741093e-08, + "loss": 0.86217976, + "num_input_tokens_seen": 328096960, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.2767334, + "step": 15210, + "time_per_iteration": 4.1807167530059814 + }, + { + "auxiliary_loss_clip": 0.01227831, + "auxiliary_loss_mlp": 0.00210856, + "balance_loss_clip": 1.01666069, + "balance_loss_mlp": 0.18805096, + "epoch": 0.9145347963324816, + "flos": 25847854467840.0, + "grad_norm": 8.697062765935094, + "language_loss": 0.84546208, + "learning_rate": 7.6082488497488e-08, + "loss": 0.85984898, + "num_input_tokens_seen": 328115445, + "router_z_loss_clip": 2.11230469, + "router_z_loss_mlp": 0.22814941, + "step": 15211, + "time_per_iteration": 2.7157561779022217 + }, + { + "auxiliary_loss_clip": 0.01229772, + "auxiliary_loss_mlp": 0.00218953, + "balance_loss_clip": 1.01388025, + "balance_loss_mlp": 0.19499183, + "epoch": 0.9145949195851496, + "flos": 19242769109760.0, + "grad_norm": 7.7657828846739045, + "language_loss": 0.90300107, + "learning_rate": 7.597612610270986e-08, + "loss": 0.91748834, + "num_input_tokens_seen": 328133965, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.23962402, + "step": 15212, + "time_per_iteration": 2.685877561569214 + }, + { + "auxiliary_loss_clip": 0.01235772, + "auxiliary_loss_mlp": 0.00212989, + "balance_loss_clip": 1.02221012, + "balance_loss_mlp": 0.18746585, + "epoch": 0.9146550428378175, + "flos": 18296379521280.0, + "grad_norm": 9.87575385844662, + "language_loss": 0.89776421, + "learning_rate": 7.586983666711022e-08, + "loss": 0.91225183, + "num_input_tokens_seen": 328151520, + "router_z_loss_clip": 2.13378906, + "router_z_loss_mlp": 0.25524902, + "step": 15213, + "time_per_iteration": 2.666491985321045 + }, + { + "auxiliary_loss_clip": 0.0123718, + "auxiliary_loss_mlp": 0.00205856, + "balance_loss_clip": 1.01946259, + "balance_loss_mlp": 0.18072656, + "epoch": 0.9147151660904855, + "flos": 20084264006400.0, + "grad_norm": 7.330480119635453, + "language_loss": 0.82511938, + "learning_rate": 7.576362019471894e-08, + "loss": 0.83954972, + "num_input_tokens_seen": 328171275, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.25109863, + "step": 15214, + "time_per_iteration": 2.6956350803375244 + }, + { + "auxiliary_loss_clip": 0.01256634, + "auxiliary_loss_mlp": 0.0021571, + "balance_loss_clip": 1.03601694, + "balance_loss_mlp": 0.19078302, + "epoch": 0.9147752893431534, + "flos": 24389127239040.0, + "grad_norm": 14.629216149161605, + "language_loss": 0.72229058, + "learning_rate": 7.565747668956413e-08, + "loss": 0.737014, + "num_input_tokens_seen": 328192115, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.24938965, + "step": 15215, + "time_per_iteration": 2.7841944694519043 + }, + { + "auxiliary_loss_clip": 0.01252862, + "auxiliary_loss_mlp": 0.00214201, + "balance_loss_clip": 1.03127599, + "balance_loss_mlp": 0.18736693, + "epoch": 0.9148354125958215, + "flos": 18150402648960.0, + "grad_norm": 43.99972806887092, + "language_loss": 0.86701262, + "learning_rate": 7.555140615567058e-08, + "loss": 0.88168323, + "num_input_tokens_seen": 328208990, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.26855469, + "step": 15216, + "time_per_iteration": 4.031267404556274 + }, + { + "auxiliary_loss_clip": 0.01228105, + "auxiliary_loss_mlp": 0.00228824, + "balance_loss_clip": 1.01361799, + "balance_loss_mlp": 0.20334873, + "epoch": 0.9148955358484894, + "flos": 23367540528000.0, + "grad_norm": 20.142432060383065, + "language_loss": 0.76259696, + "learning_rate": 7.544540859706062e-08, + "loss": 0.77716625, + "num_input_tokens_seen": 328227840, + "router_z_loss_clip": 2.14746094, + "router_z_loss_mlp": 0.25476074, + "step": 15217, + "time_per_iteration": 2.7114360332489014 + }, + { + "auxiliary_loss_clip": 0.01247707, + "auxiliary_loss_mlp": 0.0020675, + "balance_loss_clip": 1.02893353, + "balance_loss_mlp": 0.18231168, + "epoch": 0.9149556591011574, + "flos": 18076498416000.0, + "grad_norm": 13.50187486799799, + "language_loss": 0.88517094, + "learning_rate": 7.533948401775347e-08, + "loss": 0.89971554, + "num_input_tokens_seen": 328246250, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.24450684, + "step": 15218, + "time_per_iteration": 2.648017644882202 + }, + { + "auxiliary_loss_clip": 0.01083006, + "auxiliary_loss_mlp": 0.00070228, + "balance_loss_clip": 0.94717634, + "balance_loss_mlp": 0.06293241, + "epoch": 0.9150157823538253, + "flos": 54586374825600.0, + "grad_norm": 0.8209146784577512, + "language_loss": 0.58300436, + "learning_rate": 7.523363242176595e-08, + "loss": 0.59453666, + "num_input_tokens_seen": 328303625, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.07275391, + "step": 15219, + "time_per_iteration": 3.110989570617676 + }, + { + "auxiliary_loss_clip": 0.01222978, + "auxiliary_loss_mlp": 0.0021889, + "balance_loss_clip": 1.01032019, + "balance_loss_mlp": 0.19534624, + "epoch": 0.9150759056064933, + "flos": 17893102550400.0, + "grad_norm": 14.45182193405094, + "language_loss": 0.8638345, + "learning_rate": 7.512785381311216e-08, + "loss": 0.87825316, + "num_input_tokens_seen": 328322135, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.23522949, + "step": 15220, + "time_per_iteration": 2.630849838256836 + }, + { + "auxiliary_loss_clip": 0.01254282, + "auxiliary_loss_mlp": 0.00226496, + "balance_loss_clip": 1.02812433, + "balance_loss_mlp": 0.20091371, + "epoch": 0.9151360288591612, + "flos": 18073517587200.0, + "grad_norm": 12.200188971529789, + "language_loss": 0.74002695, + "learning_rate": 7.50221481958031e-08, + "loss": 0.75483477, + "num_input_tokens_seen": 328340750, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.25585938, + "step": 15221, + "time_per_iteration": 4.133982181549072 + }, + { + "auxiliary_loss_clip": 0.01233092, + "auxiliary_loss_mlp": 0.00214712, + "balance_loss_clip": 1.01648331, + "balance_loss_mlp": 0.19005959, + "epoch": 0.9151961521118293, + "flos": 19354523299200.0, + "grad_norm": 378.26645009248045, + "language_loss": 0.94844764, + "learning_rate": 7.491651557384692e-08, + "loss": 0.96292567, + "num_input_tokens_seen": 328359995, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.24645996, + "step": 15222, + "time_per_iteration": 2.6632165908813477 + }, + { + "auxiliary_loss_clip": 0.01077301, + "auxiliary_loss_mlp": 0.00062585, + "balance_loss_clip": 0.94213009, + "balance_loss_mlp": 0.05605267, + "epoch": 0.9152562753644973, + "flos": 72146621018880.0, + "grad_norm": 0.7051164639632026, + "language_loss": 0.48844492, + "learning_rate": 7.481095595124953e-08, + "loss": 0.49984378, + "num_input_tokens_seen": 328426865, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.06542969, + "step": 15223, + "time_per_iteration": 3.184323787689209 + }, + { + "auxiliary_loss_clip": 0.0124005, + "auxiliary_loss_mlp": 0.00222751, + "balance_loss_clip": 1.02590489, + "balance_loss_mlp": 0.19801475, + "epoch": 0.9153163986171652, + "flos": 20777016683520.0, + "grad_norm": 24.520946316790237, + "language_loss": 0.81810486, + "learning_rate": 7.470546933201349e-08, + "loss": 0.83273292, + "num_input_tokens_seen": 328445970, + "router_z_loss_clip": 2.13964844, + "router_z_loss_mlp": 0.24743652, + "step": 15224, + "time_per_iteration": 2.6698310375213623 + }, + { + "auxiliary_loss_clip": 0.0124129, + "auxiliary_loss_mlp": 0.00215056, + "balance_loss_clip": 1.0219512, + "balance_loss_mlp": 0.18981916, + "epoch": 0.9153765218698332, + "flos": 23040107124480.0, + "grad_norm": 15.44462375530467, + "language_loss": 0.89967287, + "learning_rate": 7.460005572013895e-08, + "loss": 0.91423631, + "num_input_tokens_seen": 328464585, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.25244141, + "step": 15225, + "time_per_iteration": 2.7057507038116455 + }, + { + "auxiliary_loss_clip": 0.01249935, + "auxiliary_loss_mlp": 0.00213159, + "balance_loss_clip": 1.03072762, + "balance_loss_mlp": 0.18829212, + "epoch": 0.9154366451225011, + "flos": 28990900293120.0, + "grad_norm": 692.0240411450426, + "language_loss": 0.76450008, + "learning_rate": 7.44947151196238e-08, + "loss": 0.77913105, + "num_input_tokens_seen": 328490155, + "router_z_loss_clip": 2.18847656, + "router_z_loss_mlp": 0.24890137, + "step": 15226, + "time_per_iteration": 2.7581610679626465 + }, + { + "auxiliary_loss_clip": 0.01242394, + "auxiliary_loss_mlp": 0.00212402, + "balance_loss_clip": 1.02766395, + "balance_loss_mlp": 0.18771356, + "epoch": 0.9154967683751691, + "flos": 22309504490880.0, + "grad_norm": 11.297041058300382, + "language_loss": 0.84784067, + "learning_rate": 7.43894475344613e-08, + "loss": 0.86238861, + "num_input_tokens_seen": 328508275, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.24658203, + "step": 15227, + "time_per_iteration": 2.676530122756958 + }, + { + "auxiliary_loss_clip": 0.01234126, + "auxiliary_loss_mlp": 0.00214145, + "balance_loss_clip": 1.01830733, + "balance_loss_mlp": 0.18994585, + "epoch": 0.915556891627837, + "flos": 24571481610240.0, + "grad_norm": 4.0706522416841215, + "language_loss": 0.81174254, + "learning_rate": 7.428425296864404e-08, + "loss": 0.82622522, + "num_input_tokens_seen": 328529425, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.24206543, + "step": 15228, + "time_per_iteration": 2.7147719860076904 + }, + { + "auxiliary_loss_clip": 0.01222631, + "auxiliary_loss_mlp": 0.00216629, + "balance_loss_clip": 1.00890386, + "balance_loss_mlp": 0.19141592, + "epoch": 0.9156170148805051, + "flos": 22164676853760.0, + "grad_norm": 39.24322981409838, + "language_loss": 0.78504753, + "learning_rate": 7.417913142616106e-08, + "loss": 0.79944015, + "num_input_tokens_seen": 328550200, + "router_z_loss_clip": 2.13769531, + "router_z_loss_mlp": 0.25219727, + "step": 15229, + "time_per_iteration": 2.691774845123291 + }, + { + "auxiliary_loss_clip": 0.01234605, + "auxiliary_loss_mlp": 0.0019957, + "balance_loss_clip": 1.01610923, + "balance_loss_mlp": 0.17541781, + "epoch": 0.915677138133173, + "flos": 20920659171840.0, + "grad_norm": 25.828114866928644, + "language_loss": 0.9096486, + "learning_rate": 7.407408291099848e-08, + "loss": 0.92399037, + "num_input_tokens_seen": 328568540, + "router_z_loss_clip": 2.18457031, + "router_z_loss_mlp": 0.24133301, + "step": 15230, + "time_per_iteration": 2.651468276977539 + }, + { + "auxiliary_loss_clip": 0.01220622, + "auxiliary_loss_mlp": 0.00216028, + "balance_loss_clip": 1.00998187, + "balance_loss_mlp": 0.19154264, + "epoch": 0.915737261385841, + "flos": 24345136056960.0, + "grad_norm": 82.04429104477137, + "language_loss": 0.90565991, + "learning_rate": 7.396910742713957e-08, + "loss": 0.92002636, + "num_input_tokens_seen": 328587300, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.24511719, + "step": 15231, + "time_per_iteration": 2.682678461074829 + }, + { + "auxiliary_loss_clip": 0.01216622, + "auxiliary_loss_mlp": 0.00222345, + "balance_loss_clip": 1.00579381, + "balance_loss_mlp": 0.19868165, + "epoch": 0.9157973846385089, + "flos": 26761386090240.0, + "grad_norm": 1.973290299637632, + "language_loss": 0.78802264, + "learning_rate": 7.386420497856516e-08, + "loss": 0.80241227, + "num_input_tokens_seen": 328610055, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.23681641, + "step": 15232, + "time_per_iteration": 2.7297003269195557 + }, + { + "auxiliary_loss_clip": 0.01247151, + "auxiliary_loss_mlp": 0.00224966, + "balance_loss_clip": 1.02766979, + "balance_loss_mlp": 0.19912139, + "epoch": 0.9158575078911769, + "flos": 18478733892480.0, + "grad_norm": 33.873154415169026, + "language_loss": 0.79134047, + "learning_rate": 7.375937556925338e-08, + "loss": 0.80606163, + "num_input_tokens_seen": 328626815, + "router_z_loss_clip": 2.19433594, + "router_z_loss_mlp": 0.25854492, + "step": 15233, + "time_per_iteration": 2.6318297386169434 + }, + { + "auxiliary_loss_clip": 0.01254195, + "auxiliary_loss_mlp": 0.00223757, + "balance_loss_clip": 1.03160906, + "balance_loss_mlp": 0.19713759, + "epoch": 0.9159176311438448, + "flos": 21798926616960.0, + "grad_norm": 3.6578985422361194, + "language_loss": 0.76668859, + "learning_rate": 7.365461920317861e-08, + "loss": 0.78146803, + "num_input_tokens_seen": 328643995, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.26623535, + "step": 15234, + "time_per_iteration": 2.6570727825164795 + }, + { + "auxiliary_loss_clip": 0.01254788, + "auxiliary_loss_mlp": 0.00239875, + "balance_loss_clip": 1.03397107, + "balance_loss_mlp": 0.21348169, + "epoch": 0.9159777543965129, + "flos": 24783749032320.0, + "grad_norm": 12.859115663110945, + "language_loss": 0.95915759, + "learning_rate": 7.354993588431391e-08, + "loss": 0.97410429, + "num_input_tokens_seen": 328659565, + "router_z_loss_clip": 2.20800781, + "router_z_loss_mlp": 0.26403809, + "step": 15235, + "time_per_iteration": 2.757220983505249 + }, + { + "auxiliary_loss_clip": 0.01241548, + "auxiliary_loss_mlp": 0.00213908, + "balance_loss_clip": 1.02796006, + "balance_loss_mlp": 0.18768154, + "epoch": 0.9160378776491809, + "flos": 26868758820480.0, + "grad_norm": 3.0306463752634074, + "language_loss": 0.85290563, + "learning_rate": 7.344532561662853e-08, + "loss": 0.86746019, + "num_input_tokens_seen": 328679045, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.26245117, + "step": 15236, + "time_per_iteration": 2.802663803100586 + }, + { + "auxiliary_loss_clip": 0.01080787, + "auxiliary_loss_mlp": 0.00087023, + "balance_loss_clip": 0.94271892, + "balance_loss_mlp": 0.08044279, + "epoch": 0.9160980009018488, + "flos": 70578222589440.0, + "grad_norm": 0.6488012212840875, + "language_loss": 0.61121917, + "learning_rate": 7.334078840409019e-08, + "loss": 0.62289727, + "num_input_tokens_seen": 328744565, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.06591797, + "step": 15237, + "time_per_iteration": 3.10310697555542 + }, + { + "auxiliary_loss_clip": 0.01263112, + "auxiliary_loss_mlp": 0.00235205, + "balance_loss_clip": 1.03639233, + "balance_loss_mlp": 0.20872837, + "epoch": 0.9161581241545168, + "flos": 16289332202880.0, + "grad_norm": 708.5518001103608, + "language_loss": 0.8266868, + "learning_rate": 7.323632425066151e-08, + "loss": 0.84166998, + "num_input_tokens_seen": 328762455, + "router_z_loss_clip": 2.265625, + "router_z_loss_mlp": 0.26501465, + "step": 15238, + "time_per_iteration": 2.6327779293060303 + }, + { + "auxiliary_loss_clip": 0.0122918, + "auxiliary_loss_mlp": 0.00221395, + "balance_loss_clip": 1.01660609, + "balance_loss_mlp": 0.1977323, + "epoch": 0.9162182474071847, + "flos": 18438154502400.0, + "grad_norm": 58.219639065668666, + "language_loss": 0.82888085, + "learning_rate": 7.313193316030464e-08, + "loss": 0.84338653, + "num_input_tokens_seen": 328780320, + "router_z_loss_clip": 2.12402344, + "router_z_loss_mlp": 0.23693848, + "step": 15239, + "time_per_iteration": 2.727893590927124 + }, + { + "auxiliary_loss_clip": 0.01259558, + "auxiliary_loss_mlp": 0.00208556, + "balance_loss_clip": 1.03452539, + "balance_loss_mlp": 0.18228218, + "epoch": 0.9162783706598527, + "flos": 19167248764800.0, + "grad_norm": 3.2536242414954337, + "language_loss": 0.74819469, + "learning_rate": 7.302761513697819e-08, + "loss": 0.7628758, + "num_input_tokens_seen": 328797570, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.26269531, + "step": 15240, + "time_per_iteration": 2.646620273590088 + }, + { + "auxiliary_loss_clip": 0.01240761, + "auxiliary_loss_mlp": 0.00215278, + "balance_loss_clip": 1.02403069, + "balance_loss_mlp": 0.1911615, + "epoch": 0.9163384939125206, + "flos": 20412990299520.0, + "grad_norm": 26.679201270703295, + "language_loss": 0.81102395, + "learning_rate": 7.292337018463746e-08, + "loss": 0.82558429, + "num_input_tokens_seen": 328814075, + "router_z_loss_clip": 2.16894531, + "router_z_loss_mlp": 0.24133301, + "step": 15241, + "time_per_iteration": 2.614020824432373 + }, + { + "auxiliary_loss_clip": 0.01270738, + "auxiliary_loss_mlp": 0.00235004, + "balance_loss_clip": 1.04144418, + "balance_loss_mlp": 0.20901629, + "epoch": 0.9163986171651887, + "flos": 19645902426240.0, + "grad_norm": 5.735059735816699, + "language_loss": 0.84074992, + "learning_rate": 7.281919830723549e-08, + "loss": 0.8558073, + "num_input_tokens_seen": 328831990, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.26013184, + "step": 15242, + "time_per_iteration": 2.662379503250122 + }, + { + "auxiliary_loss_clip": 0.01228829, + "auxiliary_loss_mlp": 0.00209476, + "balance_loss_clip": 1.01622343, + "balance_loss_mlp": 0.18519349, + "epoch": 0.9164587404178566, + "flos": 12823054865280.0, + "grad_norm": 6.330474422754946, + "language_loss": 0.89370787, + "learning_rate": 7.271509950872334e-08, + "loss": 0.90809095, + "num_input_tokens_seen": 328849105, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.24291992, + "step": 15243, + "time_per_iteration": 2.62955379486084 + }, + { + "auxiliary_loss_clip": 0.01255269, + "auxiliary_loss_mlp": 0.00223216, + "balance_loss_clip": 1.03392112, + "balance_loss_mlp": 0.19908835, + "epoch": 0.9165188636705246, + "flos": 22309396750080.0, + "grad_norm": 390.9932462753688, + "language_loss": 0.88470531, + "learning_rate": 7.261107379304721e-08, + "loss": 0.89949024, + "num_input_tokens_seen": 328866810, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.24133301, + "step": 15244, + "time_per_iteration": 2.709573745727539 + }, + { + "auxiliary_loss_clip": 0.01256705, + "auxiliary_loss_mlp": 0.00233347, + "balance_loss_clip": 1.02790213, + "balance_loss_mlp": 0.20666757, + "epoch": 0.9165789869231925, + "flos": 18223337214720.0, + "grad_norm": 6.10810663887689, + "language_loss": 0.83797425, + "learning_rate": 7.250712116415214e-08, + "loss": 0.85287476, + "num_input_tokens_seen": 328885325, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.2668457, + "step": 15245, + "time_per_iteration": 2.7091357707977295 + }, + { + "auxiliary_loss_clip": 0.01228069, + "auxiliary_loss_mlp": 0.00211796, + "balance_loss_clip": 1.01694226, + "balance_loss_mlp": 0.18763183, + "epoch": 0.9166391101758605, + "flos": 13691553811200.0, + "grad_norm": 3.449491965752427, + "language_loss": 0.84198064, + "learning_rate": 7.240324162598033e-08, + "loss": 0.85637927, + "num_input_tokens_seen": 328902655, + "router_z_loss_clip": 2.11621094, + "router_z_loss_mlp": 0.24133301, + "step": 15246, + "time_per_iteration": 2.5992815494537354 + }, + { + "auxiliary_loss_clip": 0.01236224, + "auxiliary_loss_mlp": 0.0024139, + "balance_loss_clip": 1.02011013, + "balance_loss_mlp": 0.21375775, + "epoch": 0.9166992334285284, + "flos": 17346793622400.0, + "grad_norm": 73.58223654399899, + "language_loss": 0.85180962, + "learning_rate": 7.229943518247106e-08, + "loss": 0.86658573, + "num_input_tokens_seen": 328918440, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.27648926, + "step": 15247, + "time_per_iteration": 2.649966239929199 + }, + { + "auxiliary_loss_clip": 0.01243553, + "auxiliary_loss_mlp": 0.00221301, + "balance_loss_clip": 1.02482259, + "balance_loss_mlp": 0.19661269, + "epoch": 0.9167593566811965, + "flos": 23731135948800.0, + "grad_norm": 7.946335431247394, + "language_loss": 0.85047311, + "learning_rate": 7.219570183756052e-08, + "loss": 0.8651216, + "num_input_tokens_seen": 328938055, + "router_z_loss_clip": 2.18847656, + "router_z_loss_mlp": 0.24658203, + "step": 15248, + "time_per_iteration": 2.6834778785705566 + }, + { + "auxiliary_loss_clip": 0.01255173, + "auxiliary_loss_mlp": 0.00210652, + "balance_loss_clip": 1.03327823, + "balance_loss_mlp": 0.185225, + "epoch": 0.9168194799338644, + "flos": 27818201064960.0, + "grad_norm": 13.496206006993104, + "language_loss": 0.83480394, + "learning_rate": 7.209204159518178e-08, + "loss": 0.84946227, + "num_input_tokens_seen": 328957895, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.25415039, + "step": 15249, + "time_per_iteration": 2.7630374431610107 + }, + { + "auxiliary_loss_clip": 0.01247851, + "auxiliary_loss_mlp": 0.00243682, + "balance_loss_clip": 1.02380109, + "balance_loss_mlp": 0.21737224, + "epoch": 0.9168796031865324, + "flos": 21717552355200.0, + "grad_norm": 5.921877204822616, + "language_loss": 0.88501585, + "learning_rate": 7.198845445926616e-08, + "loss": 0.89993119, + "num_input_tokens_seen": 328971365, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.26306152, + "step": 15250, + "time_per_iteration": 2.708756446838379 + }, + { + "auxiliary_loss_clip": 0.01235439, + "auxiliary_loss_mlp": 0.00219121, + "balance_loss_clip": 1.02224088, + "balance_loss_mlp": 0.19357446, + "epoch": 0.9169397264392004, + "flos": 23404420817280.0, + "grad_norm": 25.610293426937552, + "language_loss": 0.83965361, + "learning_rate": 7.188494043374138e-08, + "loss": 0.85419929, + "num_input_tokens_seen": 328990830, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.25524902, + "step": 15251, + "time_per_iteration": 4.174715042114258 + }, + { + "auxiliary_loss_clip": 0.01242968, + "auxiliary_loss_mlp": 0.0021346, + "balance_loss_clip": 1.02825499, + "balance_loss_mlp": 0.1888914, + "epoch": 0.9169998496918683, + "flos": 23950981140480.0, + "grad_norm": 9.472703206867905, + "language_loss": 0.91088712, + "learning_rate": 7.178149952253298e-08, + "loss": 0.9254514, + "num_input_tokens_seen": 329008345, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.24572754, + "step": 15252, + "time_per_iteration": 4.159137010574341 + }, + { + "auxiliary_loss_clip": 0.01246428, + "auxiliary_loss_mlp": 0.00241633, + "balance_loss_clip": 1.02583218, + "balance_loss_mlp": 0.21584851, + "epoch": 0.9170599729445363, + "flos": 18332469711360.0, + "grad_norm": 7.693590633640967, + "language_loss": 0.84130275, + "learning_rate": 7.167813172956316e-08, + "loss": 0.85618329, + "num_input_tokens_seen": 329027440, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.25793457, + "step": 15253, + "time_per_iteration": 2.667346954345703 + }, + { + "auxiliary_loss_clip": 0.01247579, + "auxiliary_loss_mlp": 0.00213937, + "balance_loss_clip": 1.02733028, + "balance_loss_mlp": 0.18757981, + "epoch": 0.9171200961972042, + "flos": 22674859678080.0, + "grad_norm": 40.724287550347, + "language_loss": 0.81644267, + "learning_rate": 7.157483705875256e-08, + "loss": 0.83105785, + "num_input_tokens_seen": 329046445, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.26342773, + "step": 15254, + "time_per_iteration": 2.6539664268493652 + }, + { + "auxiliary_loss_clip": 0.01240535, + "auxiliary_loss_mlp": 0.00208489, + "balance_loss_clip": 1.02666926, + "balance_loss_mlp": 0.18477848, + "epoch": 0.9171802194498723, + "flos": 26719298328960.0, + "grad_norm": 4.163976068138394, + "language_loss": 0.85309458, + "learning_rate": 7.14716155140167e-08, + "loss": 0.86758476, + "num_input_tokens_seen": 329065555, + "router_z_loss_clip": 2.13964844, + "router_z_loss_mlp": 0.23730469, + "step": 15255, + "time_per_iteration": 2.7292609214782715 + }, + { + "auxiliary_loss_clip": 0.01227298, + "auxiliary_loss_mlp": 0.00224407, + "balance_loss_clip": 1.01094568, + "balance_loss_mlp": 0.19877703, + "epoch": 0.9172403427025402, + "flos": 37889240538240.0, + "grad_norm": 21.579931478258025, + "language_loss": 0.76632261, + "learning_rate": 7.136846709927047e-08, + "loss": 0.78083968, + "num_input_tokens_seen": 329087515, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.25634766, + "step": 15256, + "time_per_iteration": 2.831430435180664 + }, + { + "auxiliary_loss_clip": 0.01237775, + "auxiliary_loss_mlp": 0.00241754, + "balance_loss_clip": 1.02435303, + "balance_loss_mlp": 0.21642169, + "epoch": 0.9173004659552082, + "flos": 17055163100160.0, + "grad_norm": 12.051486105726836, + "language_loss": 0.89350551, + "learning_rate": 7.126539181842561e-08, + "loss": 0.90830076, + "num_input_tokens_seen": 329106820, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.25354004, + "step": 15257, + "time_per_iteration": 2.7379651069641113 + }, + { + "auxiliary_loss_clip": 0.0122592, + "auxiliary_loss_mlp": 0.00214251, + "balance_loss_clip": 1.01491857, + "balance_loss_mlp": 0.19050473, + "epoch": 0.9173605892078761, + "flos": 22201593056640.0, + "grad_norm": 8.887843174491286, + "language_loss": 0.84089363, + "learning_rate": 7.116238967539012e-08, + "loss": 0.85529524, + "num_input_tokens_seen": 329126515, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.23754883, + "step": 15258, + "time_per_iteration": 4.167887449264526 + }, + { + "auxiliary_loss_clip": 0.01235626, + "auxiliary_loss_mlp": 0.00214155, + "balance_loss_clip": 1.01930416, + "balance_loss_mlp": 0.18878764, + "epoch": 0.9174207124605441, + "flos": 16507776764160.0, + "grad_norm": 14.929708630150316, + "language_loss": 0.90036774, + "learning_rate": 7.105946067406999e-08, + "loss": 0.91486549, + "num_input_tokens_seen": 329142660, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.25341797, + "step": 15259, + "time_per_iteration": 2.607780694961548 + }, + { + "auxiliary_loss_clip": 0.01232582, + "auxiliary_loss_mlp": 0.00210118, + "balance_loss_clip": 1.02115643, + "balance_loss_mlp": 0.18721783, + "epoch": 0.917480835713212, + "flos": 24535606901760.0, + "grad_norm": 5.208437935640262, + "language_loss": 0.82100642, + "learning_rate": 7.095660481836895e-08, + "loss": 0.83543348, + "num_input_tokens_seen": 329162575, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.22900391, + "step": 15260, + "time_per_iteration": 2.7098939418792725 + }, + { + "auxiliary_loss_clip": 0.01225415, + "auxiliary_loss_mlp": 0.00205013, + "balance_loss_clip": 1.01408434, + "balance_loss_mlp": 0.18096897, + "epoch": 0.9175409589658801, + "flos": 20880726226560.0, + "grad_norm": 175.564638717051, + "language_loss": 0.68369496, + "learning_rate": 7.085382211218637e-08, + "loss": 0.69799924, + "num_input_tokens_seen": 329182090, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.24072266, + "step": 15261, + "time_per_iteration": 2.658200740814209 + }, + { + "auxiliary_loss_clip": 0.01227196, + "auxiliary_loss_mlp": 0.00219122, + "balance_loss_clip": 1.01669526, + "balance_loss_mlp": 0.19396912, + "epoch": 0.917601082218548, + "flos": 14276035918080.0, + "grad_norm": 14.00533231983276, + "language_loss": 0.79871935, + "learning_rate": 7.075111255942002e-08, + "loss": 0.81318253, + "num_input_tokens_seen": 329196535, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.25170898, + "step": 15262, + "time_per_iteration": 2.6448192596435547 + }, + { + "auxiliary_loss_clip": 0.01242606, + "auxiliary_loss_mlp": 0.00241145, + "balance_loss_clip": 1.01693153, + "balance_loss_mlp": 0.21398944, + "epoch": 0.917661205471216, + "flos": 19099234362240.0, + "grad_norm": 6.465028146443321, + "language_loss": 0.85869133, + "learning_rate": 7.064847616396496e-08, + "loss": 0.87352884, + "num_input_tokens_seen": 329215135, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.27160645, + "step": 15263, + "time_per_iteration": 4.205925941467285 + }, + { + "auxiliary_loss_clip": 0.01252377, + "auxiliary_loss_mlp": 0.0020399, + "balance_loss_clip": 1.03046036, + "balance_loss_mlp": 0.17491525, + "epoch": 0.917721328723884, + "flos": 21106568989440.0, + "grad_norm": 8.296636653531964, + "language_loss": 0.8413012, + "learning_rate": 7.054591292971324e-08, + "loss": 0.85586488, + "num_input_tokens_seen": 329235150, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.29077148, + "step": 15264, + "time_per_iteration": 2.6572518348693848 + }, + { + "auxiliary_loss_clip": 0.01245573, + "auxiliary_loss_mlp": 0.00229071, + "balance_loss_clip": 1.03184533, + "balance_loss_mlp": 0.20419209, + "epoch": 0.9177814519765519, + "flos": 21943215550080.0, + "grad_norm": 3.7175220362405645, + "language_loss": 0.88561678, + "learning_rate": 7.044342286055394e-08, + "loss": 0.90036321, + "num_input_tokens_seen": 329254365, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.2487793, + "step": 15265, + "time_per_iteration": 2.6789212226867676 + }, + { + "auxiliary_loss_clip": 0.01255804, + "auxiliary_loss_mlp": 0.00241416, + "balance_loss_clip": 1.03115511, + "balance_loss_mlp": 0.21578641, + "epoch": 0.9178415752292199, + "flos": 24205982768640.0, + "grad_norm": 25.974736321782235, + "language_loss": 0.79649544, + "learning_rate": 7.034100596037306e-08, + "loss": 0.81146765, + "num_input_tokens_seen": 329274385, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.25646973, + "step": 15266, + "time_per_iteration": 2.698477029800415 + }, + { + "auxiliary_loss_clip": 0.01235142, + "auxiliary_loss_mlp": 0.00207204, + "balance_loss_clip": 1.02207375, + "balance_loss_mlp": 0.18351689, + "epoch": 0.9179016984818879, + "flos": 20042068504320.0, + "grad_norm": 3.8308938558620036, + "language_loss": 0.84162819, + "learning_rate": 7.023866223305486e-08, + "loss": 0.85605174, + "num_input_tokens_seen": 329292160, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.23693848, + "step": 15267, + "time_per_iteration": 2.695997714996338 + }, + { + "auxiliary_loss_clip": 0.01078258, + "auxiliary_loss_mlp": 0.00085925, + "balance_loss_clip": 0.94492429, + "balance_loss_mlp": 0.0790583, + "epoch": 0.9179618217345559, + "flos": 65555901100800.0, + "grad_norm": 0.7221236347431655, + "language_loss": 0.55181789, + "learning_rate": 7.013639168247975e-08, + "loss": 0.56345975, + "num_input_tokens_seen": 329351870, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.06884766, + "step": 15268, + "time_per_iteration": 3.1861915588378906 + }, + { + "auxiliary_loss_clip": 0.01249132, + "auxiliary_loss_mlp": 0.00206615, + "balance_loss_clip": 1.03358579, + "balance_loss_mlp": 0.18317857, + "epoch": 0.9180219449872238, + "flos": 21324618501120.0, + "grad_norm": 17.037016916240564, + "language_loss": 0.8605454, + "learning_rate": 7.0034194312526e-08, + "loss": 0.87510288, + "num_input_tokens_seen": 329370930, + "router_z_loss_clip": 2.15332031, + "router_z_loss_mlp": 0.234375, + "step": 15269, + "time_per_iteration": 2.7049005031585693 + }, + { + "auxiliary_loss_clip": 0.01234565, + "auxiliary_loss_mlp": 0.00215544, + "balance_loss_clip": 1.02269387, + "balance_loss_mlp": 0.19134471, + "epoch": 0.9180820682398918, + "flos": 41060008684800.0, + "grad_norm": 64.37252766424696, + "language_loss": 0.79542136, + "learning_rate": 6.993207012706936e-08, + "loss": 0.8099224, + "num_input_tokens_seen": 329391275, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.24194336, + "step": 15270, + "time_per_iteration": 2.815192461013794 + }, + { + "auxiliary_loss_clip": 0.01235036, + "auxiliary_loss_mlp": 0.00219005, + "balance_loss_clip": 1.02023232, + "balance_loss_mlp": 0.19531783, + "epoch": 0.9181421914925597, + "flos": 28072915384320.0, + "grad_norm": 6.440304225107744, + "language_loss": 0.85127318, + "learning_rate": 6.98300191299821e-08, + "loss": 0.86581355, + "num_input_tokens_seen": 329412775, + "router_z_loss_clip": 2.14941406, + "router_z_loss_mlp": 0.23693848, + "step": 15271, + "time_per_iteration": 2.7710206508636475 + }, + { + "auxiliary_loss_clip": 0.01251315, + "auxiliary_loss_mlp": 0.0021525, + "balance_loss_clip": 1.02785635, + "balance_loss_mlp": 0.18780807, + "epoch": 0.9182023147452277, + "flos": 29169411909120.0, + "grad_norm": 83.26397951214456, + "language_loss": 0.79940724, + "learning_rate": 6.972804132513355e-08, + "loss": 0.81407291, + "num_input_tokens_seen": 329432440, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.27429199, + "step": 15272, + "time_per_iteration": 2.7304728031158447 + }, + { + "auxiliary_loss_clip": 0.01240653, + "auxiliary_loss_mlp": 0.00239493, + "balance_loss_clip": 1.02312875, + "balance_loss_mlp": 0.21407795, + "epoch": 0.9182624379978956, + "flos": 24060831909120.0, + "grad_norm": 3.318749045153596, + "language_loss": 0.80270207, + "learning_rate": 6.962613671639105e-08, + "loss": 0.81750357, + "num_input_tokens_seen": 329450605, + "router_z_loss_clip": 2.17285156, + "router_z_loss_mlp": 0.25427246, + "step": 15273, + "time_per_iteration": 2.687993288040161 + }, + { + "auxiliary_loss_clip": 0.01211769, + "auxiliary_loss_mlp": 0.00193815, + "balance_loss_clip": 1.00383496, + "balance_loss_mlp": 0.17191647, + "epoch": 0.9183225612505637, + "flos": 23293528554240.0, + "grad_norm": 4.59984661871367, + "language_loss": 0.80897021, + "learning_rate": 6.952430530761933e-08, + "loss": 0.82302606, + "num_input_tokens_seen": 329470550, + "router_z_loss_clip": 2.07910156, + "router_z_loss_mlp": 0.21923828, + "step": 15274, + "time_per_iteration": 2.7292332649230957 + }, + { + "auxiliary_loss_clip": 0.01235545, + "auxiliary_loss_mlp": 0.00227701, + "balance_loss_clip": 1.01863647, + "balance_loss_mlp": 0.20275098, + "epoch": 0.9183826845032316, + "flos": 19609237618560.0, + "grad_norm": 2.4407560737734366, + "language_loss": 0.77363271, + "learning_rate": 6.942254710267902e-08, + "loss": 0.78826517, + "num_input_tokens_seen": 329489765, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.24963379, + "step": 15275, + "time_per_iteration": 2.6649746894836426 + }, + { + "auxiliary_loss_clip": 0.01240171, + "auxiliary_loss_mlp": 0.00221093, + "balance_loss_clip": 1.01936758, + "balance_loss_mlp": 0.1953914, + "epoch": 0.9184428077558996, + "flos": 18479057114880.0, + "grad_norm": 2.7821880033777435, + "language_loss": 0.81497943, + "learning_rate": 6.932086210542953e-08, + "loss": 0.82959211, + "num_input_tokens_seen": 329507040, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.25708008, + "step": 15276, + "time_per_iteration": 2.665553092956543 + }, + { + "auxiliary_loss_clip": 0.01231385, + "auxiliary_loss_mlp": 0.00229151, + "balance_loss_clip": 1.01843989, + "balance_loss_mlp": 0.20646539, + "epoch": 0.9185029310085676, + "flos": 20741034234240.0, + "grad_norm": 28.1593548207596, + "language_loss": 0.80210531, + "learning_rate": 6.921925031972642e-08, + "loss": 0.81671059, + "num_input_tokens_seen": 329525540, + "router_z_loss_clip": 2.12792969, + "router_z_loss_mlp": 0.22692871, + "step": 15277, + "time_per_iteration": 2.6705400943756104 + }, + { + "auxiliary_loss_clip": 0.01082335, + "auxiliary_loss_mlp": 0.0009502, + "balance_loss_clip": 0.94956112, + "balance_loss_mlp": 0.08820107, + "epoch": 0.9185630542612355, + "flos": 68209231875840.0, + "grad_norm": 0.7123128988714621, + "language_loss": 0.58686674, + "learning_rate": 6.91177117494226e-08, + "loss": 0.59864032, + "num_input_tokens_seen": 329592905, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.06835938, + "step": 15278, + "time_per_iteration": 3.315192222595215 + }, + { + "auxiliary_loss_clip": 0.01209664, + "auxiliary_loss_mlp": 0.00246944, + "balance_loss_clip": 1.00413394, + "balance_loss_mlp": 0.22255367, + "epoch": 0.9186231775139035, + "flos": 12239470598400.0, + "grad_norm": 2.3475673876315373, + "language_loss": 0.72408712, + "learning_rate": 6.901624639836879e-08, + "loss": 0.73865318, + "num_input_tokens_seen": 329610150, + "router_z_loss_clip": 2.05371094, + "router_z_loss_mlp": 0.24414062, + "step": 15279, + "time_per_iteration": 2.6280343532562256 + }, + { + "auxiliary_loss_clip": 0.01087031, + "auxiliary_loss_mlp": 0.00131632, + "balance_loss_clip": 0.95293975, + "balance_loss_mlp": 0.12304907, + "epoch": 0.9186833007665715, + "flos": 63939237770880.0, + "grad_norm": 0.8372111121220113, + "language_loss": 0.59556425, + "learning_rate": 6.891485427041211e-08, + "loss": 0.60775089, + "num_input_tokens_seen": 329673650, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.0859375, + "step": 15280, + "time_per_iteration": 3.1077654361724854 + }, + { + "auxiliary_loss_clip": 0.01242463, + "auxiliary_loss_mlp": 0.0024838, + "balance_loss_clip": 1.02319634, + "balance_loss_mlp": 0.22301212, + "epoch": 0.9187434240192395, + "flos": 19974700546560.0, + "grad_norm": 18.53952195789101, + "language_loss": 0.78686625, + "learning_rate": 6.881353536939815e-08, + "loss": 0.80177468, + "num_input_tokens_seen": 329692520, + "router_z_loss_clip": 2.19042969, + "router_z_loss_mlp": 0.25366211, + "step": 15281, + "time_per_iteration": 2.6522364616394043 + }, + { + "auxiliary_loss_clip": 0.01242145, + "auxiliary_loss_mlp": 0.00243939, + "balance_loss_clip": 1.0224762, + "balance_loss_mlp": 0.21698585, + "epoch": 0.9188035472719074, + "flos": 25227820874880.0, + "grad_norm": 3.0651160219570617, + "language_loss": 0.91297817, + "learning_rate": 6.871228969916831e-08, + "loss": 0.92783904, + "num_input_tokens_seen": 329713750, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.26940918, + "step": 15282, + "time_per_iteration": 2.703188180923462 + }, + { + "auxiliary_loss_clip": 0.01241946, + "auxiliary_loss_mlp": 0.00222379, + "balance_loss_clip": 1.02716625, + "balance_loss_mlp": 0.19581893, + "epoch": 0.9188636705245754, + "flos": 18405547931520.0, + "grad_norm": 16.772361842433988, + "language_loss": 0.69731379, + "learning_rate": 6.861111726356194e-08, + "loss": 0.71195704, + "num_input_tokens_seen": 329730960, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.26525879, + "step": 15283, + "time_per_iteration": 2.6577792167663574 + }, + { + "auxiliary_loss_clip": 0.01251672, + "auxiliary_loss_mlp": 0.00214511, + "balance_loss_clip": 1.02844071, + "balance_loss_mlp": 0.18896404, + "epoch": 0.9189237937772433, + "flos": 23769129559680.0, + "grad_norm": 22.55970126777019, + "language_loss": 0.74312842, + "learning_rate": 6.851001806641554e-08, + "loss": 0.75779027, + "num_input_tokens_seen": 329750975, + "router_z_loss_clip": 2.23339844, + "router_z_loss_mlp": 0.25537109, + "step": 15284, + "time_per_iteration": 2.6397571563720703 + }, + { + "auxiliary_loss_clip": 0.01242151, + "auxiliary_loss_mlp": 0.00217894, + "balance_loss_clip": 1.02446747, + "balance_loss_mlp": 0.19315851, + "epoch": 0.9189839170299113, + "flos": 21214624078080.0, + "grad_norm": 31.03367851559166, + "language_loss": 0.8085686, + "learning_rate": 6.840899211156292e-08, + "loss": 0.82316905, + "num_input_tokens_seen": 329769645, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.24743652, + "step": 15285, + "time_per_iteration": 2.711258888244629 + }, + { + "auxiliary_loss_clip": 0.01227408, + "auxiliary_loss_mlp": 0.00212901, + "balance_loss_clip": 1.01268458, + "balance_loss_mlp": 0.18935709, + "epoch": 0.9190440402825792, + "flos": 16727370560640.0, + "grad_norm": 9.692133386208775, + "language_loss": 0.81612039, + "learning_rate": 6.830803940283458e-08, + "loss": 0.83052343, + "num_input_tokens_seen": 329788185, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.23535156, + "step": 15286, + "time_per_iteration": 2.627272844314575 + }, + { + "auxiliary_loss_clip": 0.01230945, + "auxiliary_loss_mlp": 0.00217129, + "balance_loss_clip": 1.01746297, + "balance_loss_mlp": 0.1927273, + "epoch": 0.9191041635352473, + "flos": 23441193365760.0, + "grad_norm": 5.371200508514221, + "language_loss": 0.8087092, + "learning_rate": 6.820715994405945e-08, + "loss": 0.82318997, + "num_input_tokens_seen": 329806780, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.24401855, + "step": 15287, + "time_per_iteration": 2.753887414932251 + }, + { + "auxiliary_loss_clip": 0.01256757, + "auxiliary_loss_mlp": 0.00237274, + "balance_loss_clip": 1.03458261, + "balance_loss_mlp": 0.2108096, + "epoch": 0.9191642867879152, + "flos": 18807532012800.0, + "grad_norm": 36.441656266204625, + "language_loss": 0.76337284, + "learning_rate": 6.810635373906226e-08, + "loss": 0.7783131, + "num_input_tokens_seen": 329826350, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.26477051, + "step": 15288, + "time_per_iteration": 2.6368422508239746 + }, + { + "auxiliary_loss_clip": 0.01230417, + "auxiliary_loss_mlp": 0.0024084, + "balance_loss_clip": 1.01624584, + "balance_loss_mlp": 0.21670061, + "epoch": 0.9192244100405832, + "flos": 32160950167680.0, + "grad_norm": 15.822950260766937, + "language_loss": 0.76872575, + "learning_rate": 6.800562079166549e-08, + "loss": 0.78343832, + "num_input_tokens_seen": 329846160, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.24145508, + "step": 15289, + "time_per_iteration": 2.7910735607147217 + }, + { + "auxiliary_loss_clip": 0.01238704, + "auxiliary_loss_mlp": 0.00223942, + "balance_loss_clip": 1.02350521, + "balance_loss_mlp": 0.19890842, + "epoch": 0.9192845332932512, + "flos": 16357669827840.0, + "grad_norm": 29.960995938785068, + "language_loss": 0.83717263, + "learning_rate": 6.790496110568921e-08, + "loss": 0.85179913, + "num_input_tokens_seen": 329862020, + "router_z_loss_clip": 2.15332031, + "router_z_loss_mlp": 0.25048828, + "step": 15290, + "time_per_iteration": 2.6840367317199707 + }, + { + "auxiliary_loss_clip": 0.01224534, + "auxiliary_loss_mlp": 0.00223018, + "balance_loss_clip": 1.01384521, + "balance_loss_mlp": 0.19894917, + "epoch": 0.9193446565459191, + "flos": 26614475464320.0, + "grad_norm": 2.2201347891948604, + "language_loss": 0.80299628, + "learning_rate": 6.78043746849506e-08, + "loss": 0.81747174, + "num_input_tokens_seen": 329880185, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.24060059, + "step": 15291, + "time_per_iteration": 2.7109055519104004 + }, + { + "auxiliary_loss_clip": 0.01235803, + "auxiliary_loss_mlp": 0.00218325, + "balance_loss_clip": 1.02102137, + "balance_loss_mlp": 0.19450735, + "epoch": 0.9194047797985871, + "flos": 22492182084480.0, + "grad_norm": 12.557589802992355, + "language_loss": 0.76791626, + "learning_rate": 6.770386153326346e-08, + "loss": 0.78245753, + "num_input_tokens_seen": 329900255, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.23815918, + "step": 15292, + "time_per_iteration": 2.6725354194641113 + }, + { + "auxiliary_loss_clip": 0.01244618, + "auxiliary_loss_mlp": 0.00227921, + "balance_loss_clip": 1.0232625, + "balance_loss_mlp": 0.20128971, + "epoch": 0.9194649030512551, + "flos": 25078791346560.0, + "grad_norm": 21.49134076103688, + "language_loss": 0.8009395, + "learning_rate": 6.760342165443988e-08, + "loss": 0.81566489, + "num_input_tokens_seen": 329919095, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.26635742, + "step": 15293, + "time_per_iteration": 4.148429870605469 + }, + { + "auxiliary_loss_clip": 0.01238417, + "auxiliary_loss_mlp": 0.0022747, + "balance_loss_clip": 1.02547407, + "balance_loss_mlp": 0.20223336, + "epoch": 0.9195250263039231, + "flos": 11911139354880.0, + "grad_norm": 1902.7063289848104, + "language_loss": 0.87338561, + "learning_rate": 6.750305505228837e-08, + "loss": 0.88804448, + "num_input_tokens_seen": 329936505, + "router_z_loss_clip": 2.12988281, + "router_z_loss_mlp": 0.25231934, + "step": 15294, + "time_per_iteration": 4.13311767578125 + }, + { + "auxiliary_loss_clip": 0.01246392, + "auxiliary_loss_mlp": 0.00247898, + "balance_loss_clip": 1.02404284, + "balance_loss_mlp": 0.22121876, + "epoch": 0.919585149556591, + "flos": 21834154880640.0, + "grad_norm": 9.712696001186586, + "language_loss": 0.84959567, + "learning_rate": 6.74027617306141e-08, + "loss": 0.86453861, + "num_input_tokens_seen": 329956795, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.26696777, + "step": 15295, + "time_per_iteration": 2.73774790763855 + }, + { + "auxiliary_loss_clip": 0.01226356, + "auxiliary_loss_mlp": 0.00196547, + "balance_loss_clip": 1.01674688, + "balance_loss_mlp": 0.17431399, + "epoch": 0.919645272809259, + "flos": 28184059042560.0, + "grad_norm": 20.270979338862993, + "language_loss": 0.80451691, + "learning_rate": 6.730254169322114e-08, + "loss": 0.81874597, + "num_input_tokens_seen": 329977195, + "router_z_loss_clip": 2.09667969, + "router_z_loss_mlp": 0.22241211, + "step": 15296, + "time_per_iteration": 2.6866872310638428 + }, + { + "auxiliary_loss_clip": 0.01234431, + "auxiliary_loss_mlp": 0.002216, + "balance_loss_clip": 1.01966202, + "balance_loss_mlp": 0.19558904, + "epoch": 0.9197053960619269, + "flos": 18332828847360.0, + "grad_norm": 98.63096330836115, + "language_loss": 0.83136034, + "learning_rate": 6.720239494390912e-08, + "loss": 0.84592068, + "num_input_tokens_seen": 329992095, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.26013184, + "step": 15297, + "time_per_iteration": 2.676393985748291 + }, + { + "auxiliary_loss_clip": 0.01222983, + "auxiliary_loss_mlp": 0.0020615, + "balance_loss_clip": 1.00886679, + "balance_loss_mlp": 0.18046071, + "epoch": 0.9197655193145949, + "flos": 28183448511360.0, + "grad_norm": 53.31273874762472, + "language_loss": 0.82930678, + "learning_rate": 6.710232148647676e-08, + "loss": 0.84359813, + "num_input_tokens_seen": 330011490, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.25695801, + "step": 15298, + "time_per_iteration": 2.7990834712982178 + }, + { + "auxiliary_loss_clip": 0.01255361, + "auxiliary_loss_mlp": 0.00208199, + "balance_loss_clip": 1.02774668, + "balance_loss_mlp": 0.1813416, + "epoch": 0.9198256425672628, + "flos": 17306321973120.0, + "grad_norm": 13.522842732772652, + "language_loss": 0.90569818, + "learning_rate": 6.70023213247175e-08, + "loss": 0.92033386, + "num_input_tokens_seen": 330027885, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.26855469, + "step": 15299, + "time_per_iteration": 2.7258474826812744 + }, + { + "auxiliary_loss_clip": 0.01245524, + "auxiliary_loss_mlp": 0.00224132, + "balance_loss_clip": 1.02786469, + "balance_loss_mlp": 0.19932503, + "epoch": 0.9198857658199309, + "flos": 17858520731520.0, + "grad_norm": 6.321690960178277, + "language_loss": 0.73426592, + "learning_rate": 6.690239446242385e-08, + "loss": 0.74896246, + "num_input_tokens_seen": 330046230, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.24829102, + "step": 15300, + "time_per_iteration": 4.154743671417236 + }, + { + "auxiliary_loss_clip": 0.01211147, + "auxiliary_loss_mlp": 0.00206231, + "balance_loss_clip": 1.00759935, + "balance_loss_mlp": 0.18434413, + "epoch": 0.9199458890725988, + "flos": 22127545169280.0, + "grad_norm": 7.224584397551697, + "language_loss": 0.76258111, + "learning_rate": 6.680254090338545e-08, + "loss": 0.77675486, + "num_input_tokens_seen": 330065535, + "router_z_loss_clip": 2.03320312, + "router_z_loss_mlp": 0.21887207, + "step": 15301, + "time_per_iteration": 2.643026351928711 + }, + { + "auxiliary_loss_clip": 0.01246734, + "auxiliary_loss_mlp": 0.0023401, + "balance_loss_clip": 1.02972639, + "balance_loss_mlp": 0.20843919, + "epoch": 0.9200060123252668, + "flos": 16034043265920.0, + "grad_norm": 45.836394793817696, + "language_loss": 0.78099191, + "learning_rate": 6.670276065138814e-08, + "loss": 0.79579926, + "num_input_tokens_seen": 330082920, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.25585938, + "step": 15302, + "time_per_iteration": 2.6429622173309326 + }, + { + "auxiliary_loss_clip": 0.01247148, + "auxiliary_loss_mlp": 0.00227276, + "balance_loss_clip": 1.03035223, + "balance_loss_mlp": 0.20181285, + "epoch": 0.9200661355779348, + "flos": 26864521015680.0, + "grad_norm": 11.245016100516198, + "language_loss": 0.84606034, + "learning_rate": 6.660305371021579e-08, + "loss": 0.86080456, + "num_input_tokens_seen": 330101165, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.25488281, + "step": 15303, + "time_per_iteration": 2.7643415927886963 + }, + { + "auxiliary_loss_clip": 0.01240125, + "auxiliary_loss_mlp": 0.00225508, + "balance_loss_clip": 1.02451491, + "balance_loss_mlp": 0.20099851, + "epoch": 0.9201262588306027, + "flos": 12786749193600.0, + "grad_norm": 114.32055516751215, + "language_loss": 0.9703477, + "learning_rate": 6.650342008365006e-08, + "loss": 0.98500407, + "num_input_tokens_seen": 330118775, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.24536133, + "step": 15304, + "time_per_iteration": 2.6702382564544678 + }, + { + "auxiliary_loss_clip": 0.01255629, + "auxiliary_loss_mlp": 0.00212582, + "balance_loss_clip": 1.03220272, + "balance_loss_mlp": 0.18539028, + "epoch": 0.9201863820832707, + "flos": 20631614428800.0, + "grad_norm": 30.849733841774707, + "language_loss": 0.88800526, + "learning_rate": 6.64038597754677e-08, + "loss": 0.90268731, + "num_input_tokens_seen": 330135570, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.27197266, + "step": 15305, + "time_per_iteration": 4.0837318897247314 + }, + { + "auxiliary_loss_clip": 0.01234456, + "auxiliary_loss_mlp": 0.00218563, + "balance_loss_clip": 1.01756334, + "balance_loss_mlp": 0.19331464, + "epoch": 0.9202465053359387, + "flos": 26395815421440.0, + "grad_norm": 17.749543791388774, + "language_loss": 0.91627693, + "learning_rate": 6.630437278944501e-08, + "loss": 0.93080711, + "num_input_tokens_seen": 330152840, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.25256348, + "step": 15306, + "time_per_iteration": 2.685586452484131 + }, + { + "auxiliary_loss_clip": 0.01237776, + "auxiliary_loss_mlp": 0.00240746, + "balance_loss_clip": 1.02097166, + "balance_loss_mlp": 0.2146277, + "epoch": 0.9203066285886067, + "flos": 10488179093760.0, + "grad_norm": 12.233686508978892, + "language_loss": 0.79577363, + "learning_rate": 6.62049591293541e-08, + "loss": 0.81055892, + "num_input_tokens_seen": 330168605, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.26135254, + "step": 15307, + "time_per_iteration": 2.641709566116333 + }, + { + "auxiliary_loss_clip": 0.01253597, + "auxiliary_loss_mlp": 0.00220229, + "balance_loss_clip": 1.02958274, + "balance_loss_mlp": 0.19366881, + "epoch": 0.9203667518412746, + "flos": 19390721230080.0, + "grad_norm": 13.615322489437727, + "language_loss": 0.86169922, + "learning_rate": 6.610561879896526e-08, + "loss": 0.87643749, + "num_input_tokens_seen": 330186160, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.265625, + "step": 15308, + "time_per_iteration": 2.684983015060425 + }, + { + "auxiliary_loss_clip": 0.01229162, + "auxiliary_loss_mlp": 0.00207777, + "balance_loss_clip": 1.0174129, + "balance_loss_mlp": 0.18349406, + "epoch": 0.9204268750939426, + "flos": 15924982596480.0, + "grad_norm": 15.952259017432539, + "language_loss": 0.86467457, + "learning_rate": 6.600635180204484e-08, + "loss": 0.87904394, + "num_input_tokens_seen": 330201780, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.24279785, + "step": 15309, + "time_per_iteration": 2.61946964263916 + }, + { + "auxiliary_loss_clip": 0.01235756, + "auxiliary_loss_mlp": 0.00223344, + "balance_loss_clip": 1.01601887, + "balance_loss_mlp": 0.19788124, + "epoch": 0.9204869983466105, + "flos": 16471758401280.0, + "grad_norm": 12.246883344516585, + "language_loss": 0.76735079, + "learning_rate": 6.590715814235781e-08, + "loss": 0.78194177, + "num_input_tokens_seen": 330219165, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.25463867, + "step": 15310, + "time_per_iteration": 2.608581066131592 + }, + { + "auxiliary_loss_clip": 0.01246457, + "auxiliary_loss_mlp": 0.00233486, + "balance_loss_clip": 1.0281775, + "balance_loss_mlp": 0.20780876, + "epoch": 0.9205471215992785, + "flos": 21539220307200.0, + "grad_norm": 56.82261477842547, + "language_loss": 0.73029888, + "learning_rate": 6.580803782366495e-08, + "loss": 0.74509823, + "num_input_tokens_seen": 330238975, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.25671387, + "step": 15311, + "time_per_iteration": 2.73721981048584 + }, + { + "auxiliary_loss_clip": 0.01232977, + "auxiliary_loss_mlp": 0.00208217, + "balance_loss_clip": 1.01618123, + "balance_loss_mlp": 0.18362378, + "epoch": 0.9206072448519464, + "flos": 25005892694400.0, + "grad_norm": 9.566399790910664, + "language_loss": 0.82912457, + "learning_rate": 6.570899084972503e-08, + "loss": 0.8435365, + "num_input_tokens_seen": 330259755, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.24597168, + "step": 15312, + "time_per_iteration": 2.688659906387329 + }, + { + "auxiliary_loss_clip": 0.01233338, + "auxiliary_loss_mlp": 0.00223262, + "balance_loss_clip": 1.02035475, + "balance_loss_mlp": 0.19751312, + "epoch": 0.9206673681046145, + "flos": 20522661500160.0, + "grad_norm": 7.760239934815944, + "language_loss": 0.85198343, + "learning_rate": 6.561001722429394e-08, + "loss": 0.86654937, + "num_input_tokens_seen": 330277660, + "router_z_loss_clip": 2.12597656, + "router_z_loss_mlp": 0.25744629, + "step": 15313, + "time_per_iteration": 2.7138595581054688 + }, + { + "auxiliary_loss_clip": 0.01249434, + "auxiliary_loss_mlp": 0.0021691, + "balance_loss_clip": 1.02643085, + "balance_loss_mlp": 0.1897537, + "epoch": 0.9207274913572824, + "flos": 20883455660160.0, + "grad_norm": 50.95621584759037, + "language_loss": 0.86005044, + "learning_rate": 6.55111169511251e-08, + "loss": 0.8747139, + "num_input_tokens_seen": 330295455, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.27160645, + "step": 15314, + "time_per_iteration": 2.690624237060547 + }, + { + "auxiliary_loss_clip": 0.0125752, + "auxiliary_loss_mlp": 0.00223664, + "balance_loss_clip": 1.03109622, + "balance_loss_mlp": 0.19680631, + "epoch": 0.9207876146099504, + "flos": 22708256348160.0, + "grad_norm": 42.618944020461235, + "language_loss": 0.87133527, + "learning_rate": 6.541229003396864e-08, + "loss": 0.88614714, + "num_input_tokens_seen": 330315310, + "router_z_loss_clip": 2.265625, + "router_z_loss_mlp": 0.26867676, + "step": 15315, + "time_per_iteration": 2.7019407749176025 + }, + { + "auxiliary_loss_clip": 0.01242967, + "auxiliary_loss_mlp": 0.00233222, + "balance_loss_clip": 1.02232504, + "balance_loss_mlp": 0.20858151, + "epoch": 0.9208477378626184, + "flos": 18507354053760.0, + "grad_norm": 34.47360047615683, + "language_loss": 0.83131719, + "learning_rate": 6.531353647657156e-08, + "loss": 0.84607911, + "num_input_tokens_seen": 330333260, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.24609375, + "step": 15316, + "time_per_iteration": 2.6290252208709717 + }, + { + "auxiliary_loss_clip": 0.01246044, + "auxiliary_loss_mlp": 0.00241989, + "balance_loss_clip": 1.02588439, + "balance_loss_mlp": 0.21460652, + "epoch": 0.9209078611152863, + "flos": 22999635475200.0, + "grad_norm": 20.460919845514542, + "language_loss": 0.76775414, + "learning_rate": 6.521485628267931e-08, + "loss": 0.78263444, + "num_input_tokens_seen": 330352465, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.27429199, + "step": 15317, + "time_per_iteration": 2.666034698486328 + }, + { + "auxiliary_loss_clip": 0.01249252, + "auxiliary_loss_mlp": 0.00234328, + "balance_loss_clip": 1.03263068, + "balance_loss_mlp": 0.21015222, + "epoch": 0.9209679843679544, + "flos": 24061514267520.0, + "grad_norm": 2.7586696599966123, + "language_loss": 0.90960515, + "learning_rate": 6.511624945603378e-08, + "loss": 0.92444086, + "num_input_tokens_seen": 330372685, + "router_z_loss_clip": 2.16503906, + "router_z_loss_mlp": 0.24182129, + "step": 15318, + "time_per_iteration": 2.756600856781006 + }, + { + "auxiliary_loss_clip": 0.01228411, + "auxiliary_loss_mlp": 0.00228529, + "balance_loss_clip": 1.01308489, + "balance_loss_mlp": 0.2042342, + "epoch": 0.9210281076206223, + "flos": 13553370190080.0, + "grad_norm": 12.52983353865456, + "language_loss": 0.94388306, + "learning_rate": 6.501771600037354e-08, + "loss": 0.95845246, + "num_input_tokens_seen": 330388860, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.24291992, + "step": 15319, + "time_per_iteration": 2.636667251586914 + }, + { + "auxiliary_loss_clip": 0.01090359, + "auxiliary_loss_mlp": 0.00084861, + "balance_loss_clip": 0.95447314, + "balance_loss_mlp": 0.07694574, + "epoch": 0.9210882308732903, + "flos": 71426289674880.0, + "grad_norm": 0.748546950482804, + "language_loss": 0.55282009, + "learning_rate": 6.491925591943559e-08, + "loss": 0.56457233, + "num_input_tokens_seen": 330448735, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.07910156, + "step": 15320, + "time_per_iteration": 3.188377618789673 + }, + { + "auxiliary_loss_clip": 0.01258396, + "auxiliary_loss_mlp": 0.00246512, + "balance_loss_clip": 1.03000534, + "balance_loss_mlp": 0.21718684, + "epoch": 0.9211483541259582, + "flos": 18509113820160.0, + "grad_norm": 4.211996314704572, + "language_loss": 0.75683224, + "learning_rate": 6.482086921695384e-08, + "loss": 0.77188134, + "num_input_tokens_seen": 330465600, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.29333496, + "step": 15321, + "time_per_iteration": 2.683562994003296 + }, + { + "auxiliary_loss_clip": 0.0121749, + "auxiliary_loss_mlp": 0.00229636, + "balance_loss_clip": 1.0129385, + "balance_loss_mlp": 0.2058423, + "epoch": 0.9212084773786262, + "flos": 23258228463360.0, + "grad_norm": 130.35455703233765, + "language_loss": 0.76560378, + "learning_rate": 6.47225558966582e-08, + "loss": 0.78007507, + "num_input_tokens_seen": 330485770, + "router_z_loss_clip": 2.04589844, + "router_z_loss_mlp": 0.23803711, + "step": 15322, + "time_per_iteration": 2.781521797180176 + }, + { + "auxiliary_loss_clip": 0.01225399, + "auxiliary_loss_mlp": 0.00216477, + "balance_loss_clip": 1.01512003, + "balance_loss_mlp": 0.19361255, + "epoch": 0.9212686006312941, + "flos": 16289511770880.0, + "grad_norm": 152.0715259139222, + "language_loss": 0.78138793, + "learning_rate": 6.462431596227725e-08, + "loss": 0.79580677, + "num_input_tokens_seen": 330504255, + "router_z_loss_clip": 2.10351562, + "router_z_loss_mlp": 0.22851562, + "step": 15323, + "time_per_iteration": 2.6777901649475098 + }, + { + "auxiliary_loss_clip": 0.01246457, + "auxiliary_loss_mlp": 0.00224137, + "balance_loss_clip": 1.02817822, + "balance_loss_mlp": 0.19855484, + "epoch": 0.9213287238839621, + "flos": 19785773986560.0, + "grad_norm": 5.333550870987672, + "language_loss": 0.83140063, + "learning_rate": 6.452614941753597e-08, + "loss": 0.84610659, + "num_input_tokens_seen": 330520705, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.25585938, + "step": 15324, + "time_per_iteration": 2.7237203121185303 + }, + { + "auxiliary_loss_clip": 0.01236969, + "auxiliary_loss_mlp": 0.00229991, + "balance_loss_clip": 1.02253866, + "balance_loss_mlp": 0.20523092, + "epoch": 0.92138884713663, + "flos": 21030402199680.0, + "grad_norm": 9.42029925662829, + "language_loss": 0.77130294, + "learning_rate": 6.442805626615744e-08, + "loss": 0.78597254, + "num_input_tokens_seen": 330539245, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.24768066, + "step": 15325, + "time_per_iteration": 2.7086875438690186 + }, + { + "auxiliary_loss_clip": 0.01221, + "auxiliary_loss_mlp": 0.00222605, + "balance_loss_clip": 1.00984001, + "balance_loss_mlp": 0.19838132, + "epoch": 0.9214489703892981, + "flos": 28587264186240.0, + "grad_norm": 45.13668281868633, + "language_loss": 0.83145273, + "learning_rate": 6.433003651186109e-08, + "loss": 0.84588879, + "num_input_tokens_seen": 330561815, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.24255371, + "step": 15326, + "time_per_iteration": 2.7338593006134033 + }, + { + "auxiliary_loss_clip": 0.01239823, + "auxiliary_loss_mlp": 0.00230698, + "balance_loss_clip": 1.02160943, + "balance_loss_mlp": 0.20573595, + "epoch": 0.921509093641966, + "flos": 16361476669440.0, + "grad_norm": 19.407726215618105, + "language_loss": 0.80849981, + "learning_rate": 6.42320901583635e-08, + "loss": 0.82320505, + "num_input_tokens_seen": 330579760, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.24963379, + "step": 15327, + "time_per_iteration": 2.6413962841033936 + }, + { + "auxiliary_loss_clip": 0.01257244, + "auxiliary_loss_mlp": 0.00209077, + "balance_loss_clip": 1.03740966, + "balance_loss_mlp": 0.18201697, + "epoch": 0.921569216894634, + "flos": 26830837036800.0, + "grad_norm": 69.6432180867408, + "language_loss": 0.84818256, + "learning_rate": 6.413421720937906e-08, + "loss": 0.86284572, + "num_input_tokens_seen": 330598545, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.27050781, + "step": 15328, + "time_per_iteration": 2.6970250606536865 + }, + { + "auxiliary_loss_clip": 0.01214644, + "auxiliary_loss_mlp": 0.00231089, + "balance_loss_clip": 1.00717187, + "balance_loss_mlp": 0.207748, + "epoch": 0.921629340147302, + "flos": 24645134448000.0, + "grad_norm": 3.764740678393423, + "language_loss": 0.80438179, + "learning_rate": 6.4036417668619e-08, + "loss": 0.81883907, + "num_input_tokens_seen": 330616700, + "router_z_loss_clip": 2.0703125, + "router_z_loss_mlp": 0.23339844, + "step": 15329, + "time_per_iteration": 2.732074737548828 + }, + { + "auxiliary_loss_clip": 0.01223836, + "auxiliary_loss_mlp": 0.0022093, + "balance_loss_clip": 1.01228261, + "balance_loss_mlp": 0.19566919, + "epoch": 0.9216894633999699, + "flos": 15086504442240.0, + "grad_norm": 12.741446299019758, + "language_loss": 0.93771327, + "learning_rate": 6.393869153979192e-08, + "loss": 0.95216089, + "num_input_tokens_seen": 330633355, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.25256348, + "step": 15330, + "time_per_iteration": 2.640230894088745 + }, + { + "auxiliary_loss_clip": 0.01250877, + "auxiliary_loss_mlp": 0.00222075, + "balance_loss_clip": 1.02779758, + "balance_loss_mlp": 0.19716024, + "epoch": 0.921749586652638, + "flos": 19204524103680.0, + "grad_norm": 21.859772856401584, + "language_loss": 0.89749742, + "learning_rate": 6.384103882660397e-08, + "loss": 0.91222697, + "num_input_tokens_seen": 330651470, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.24902344, + "step": 15331, + "time_per_iteration": 2.645536422729492 + }, + { + "auxiliary_loss_clip": 0.01240529, + "auxiliary_loss_mlp": 0.00234356, + "balance_loss_clip": 1.02154326, + "balance_loss_mlp": 0.20832051, + "epoch": 0.9218097099053059, + "flos": 20522446018560.0, + "grad_norm": 171.22508102793736, + "language_loss": 0.82126319, + "learning_rate": 6.374345953275794e-08, + "loss": 0.83601207, + "num_input_tokens_seen": 330669170, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.26049805, + "step": 15332, + "time_per_iteration": 2.655120849609375 + }, + { + "auxiliary_loss_clip": 0.0123554, + "auxiliary_loss_mlp": 0.00207315, + "balance_loss_clip": 1.02240002, + "balance_loss_mlp": 0.18303214, + "epoch": 0.9218698331579739, + "flos": 17348625216000.0, + "grad_norm": 6.678415478215065, + "language_loss": 0.82528263, + "learning_rate": 6.364595366195358e-08, + "loss": 0.83971119, + "num_input_tokens_seen": 330686635, + "router_z_loss_clip": 2.13574219, + "router_z_loss_mlp": 0.24304199, + "step": 15333, + "time_per_iteration": 2.6898014545440674 + }, + { + "auxiliary_loss_clip": 0.01093429, + "auxiliary_loss_mlp": 0.0007597, + "balance_loss_clip": 0.95603228, + "balance_loss_mlp": 0.06867441, + "epoch": 0.9219299564106418, + "flos": 61958332575360.0, + "grad_norm": 0.7770724287987117, + "language_loss": 0.52294946, + "learning_rate": 6.354852121788879e-08, + "loss": 0.53464347, + "num_input_tokens_seen": 330749160, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.07275391, + "step": 15334, + "time_per_iteration": 3.134171724319458 + }, + { + "auxiliary_loss_clip": 0.01221937, + "auxiliary_loss_mlp": 0.00216909, + "balance_loss_clip": 1.01376915, + "balance_loss_mlp": 0.19372304, + "epoch": 0.9219900796633098, + "flos": 15701761526400.0, + "grad_norm": 7.491733649023814, + "language_loss": 0.68898314, + "learning_rate": 6.345116220425839e-08, + "loss": 0.70337164, + "num_input_tokens_seen": 330766840, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.23181152, + "step": 15335, + "time_per_iteration": 4.112954378128052 + }, + { + "auxiliary_loss_clip": 0.01231001, + "auxiliary_loss_mlp": 0.00198696, + "balance_loss_clip": 1.01832783, + "balance_loss_mlp": 0.17429426, + "epoch": 0.9220502029159777, + "flos": 24932670819840.0, + "grad_norm": 10.71374001787533, + "language_loss": 0.78857481, + "learning_rate": 6.335387662475366e-08, + "loss": 0.80287182, + "num_input_tokens_seen": 330785585, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.24389648, + "step": 15336, + "time_per_iteration": 4.304477214813232 + }, + { + "auxiliary_loss_clip": 0.0123668, + "auxiliary_loss_mlp": 0.00213382, + "balance_loss_clip": 1.02075315, + "balance_loss_mlp": 0.18986225, + "epoch": 0.9221103261686457, + "flos": 15667215621120.0, + "grad_norm": 2912.9963774508815, + "language_loss": 0.79053593, + "learning_rate": 6.325666448306433e-08, + "loss": 0.80503654, + "num_input_tokens_seen": 330800750, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.23547363, + "step": 15337, + "time_per_iteration": 2.742065668106079 + }, + { + "auxiliary_loss_clip": 0.01088631, + "auxiliary_loss_mlp": 0.00071211, + "balance_loss_clip": 0.95347273, + "balance_loss_mlp": 0.06386812, + "epoch": 0.9221704494213137, + "flos": 67516299630720.0, + "grad_norm": 11293.29688537772, + "language_loss": 0.64511508, + "learning_rate": 6.31595257828763e-08, + "loss": 0.65671349, + "num_input_tokens_seen": 330863640, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.07324219, + "step": 15338, + "time_per_iteration": 3.108013391494751 + }, + { + "auxiliary_loss_clip": 0.01228602, + "auxiliary_loss_mlp": 0.00225208, + "balance_loss_clip": 1.01621258, + "balance_loss_mlp": 0.20108044, + "epoch": 0.9222305726739817, + "flos": 30226945155840.0, + "grad_norm": 5.589196113420816, + "language_loss": 0.76537651, + "learning_rate": 6.306246052787289e-08, + "loss": 0.77991462, + "num_input_tokens_seen": 330884675, + "router_z_loss_clip": 2.12402344, + "router_z_loss_mlp": 0.24108887, + "step": 15339, + "time_per_iteration": 2.757204532623291 + }, + { + "auxiliary_loss_clip": 0.01244058, + "auxiliary_loss_mlp": 0.0021511, + "balance_loss_clip": 1.02414834, + "balance_loss_mlp": 0.18965848, + "epoch": 0.9222906959266496, + "flos": 25337204766720.0, + "grad_norm": 38.44551113492542, + "language_loss": 0.80322164, + "learning_rate": 6.296546872173513e-08, + "loss": 0.81781328, + "num_input_tokens_seen": 330904125, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.25476074, + "step": 15340, + "time_per_iteration": 2.71545147895813 + }, + { + "auxiliary_loss_clip": 0.0123725, + "auxiliary_loss_mlp": 0.00220631, + "balance_loss_clip": 1.02657199, + "balance_loss_mlp": 0.19667016, + "epoch": 0.9223508191793176, + "flos": 27599864244480.0, + "grad_norm": 489.91558296082377, + "language_loss": 0.76353323, + "learning_rate": 6.286855036814098e-08, + "loss": 0.77811199, + "num_input_tokens_seen": 330925140, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.23937988, + "step": 15341, + "time_per_iteration": 2.7345080375671387 + }, + { + "auxiliary_loss_clip": 0.01210797, + "auxiliary_loss_mlp": 0.00196656, + "balance_loss_clip": 1.00626314, + "balance_loss_mlp": 0.17505553, + "epoch": 0.9224109424319856, + "flos": 27307587277440.0, + "grad_norm": 6.719607869016103, + "language_loss": 0.75536454, + "learning_rate": 6.277170547076571e-08, + "loss": 0.7694391, + "num_input_tokens_seen": 330946625, + "router_z_loss_clip": 2.04492188, + "router_z_loss_mlp": 0.21594238, + "step": 15342, + "time_per_iteration": 4.145627975463867 + }, + { + "auxiliary_loss_clip": 0.0124725, + "auxiliary_loss_mlp": 0.00218181, + "balance_loss_clip": 1.02620196, + "balance_loss_mlp": 0.19438672, + "epoch": 0.9224710656846535, + "flos": 48208314401280.0, + "grad_norm": 69.13730524489941, + "language_loss": 0.77598196, + "learning_rate": 6.26749340332815e-08, + "loss": 0.7906363, + "num_input_tokens_seen": 330967795, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.23803711, + "step": 15343, + "time_per_iteration": 2.9954402446746826 + }, + { + "auxiliary_loss_clip": 0.01084438, + "auxiliary_loss_mlp": 0.00085679, + "balance_loss_clip": 0.94927239, + "balance_loss_mlp": 0.07814451, + "epoch": 0.9225311889373216, + "flos": 66722171794560.0, + "grad_norm": 0.901252464271142, + "language_loss": 0.51237071, + "learning_rate": 6.257823605935786e-08, + "loss": 0.52407181, + "num_input_tokens_seen": 331040850, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.07519531, + "step": 15344, + "time_per_iteration": 3.340118646621704 + }, + { + "auxiliary_loss_clip": 0.01226288, + "auxiliary_loss_mlp": 0.00228251, + "balance_loss_clip": 1.02150035, + "balance_loss_mlp": 0.20488602, + "epoch": 0.9225913121899895, + "flos": 22271295398400.0, + "grad_norm": 143.06440907060596, + "language_loss": 0.77120876, + "learning_rate": 6.248161155266162e-08, + "loss": 0.7857542, + "num_input_tokens_seen": 331060595, + "router_z_loss_clip": 2.04980469, + "router_z_loss_mlp": 0.23376465, + "step": 15345, + "time_per_iteration": 2.6779019832611084 + }, + { + "auxiliary_loss_clip": 0.0124057, + "auxiliary_loss_mlp": 0.00230562, + "balance_loss_clip": 1.02710509, + "balance_loss_mlp": 0.20601705, + "epoch": 0.9226514354426575, + "flos": 20082719721600.0, + "grad_norm": 26.327256530466336, + "language_loss": 0.85298455, + "learning_rate": 6.238506051685677e-08, + "loss": 0.86769587, + "num_input_tokens_seen": 331080195, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.2454834, + "step": 15346, + "time_per_iteration": 2.689383029937744 + }, + { + "auxiliary_loss_clip": 0.01253895, + "auxiliary_loss_mlp": 0.00215322, + "balance_loss_clip": 1.03329539, + "balance_loss_mlp": 0.19006133, + "epoch": 0.9227115586953254, + "flos": 16070851728000.0, + "grad_norm": 7.909285276172332, + "language_loss": 0.84329021, + "learning_rate": 6.228858295560457e-08, + "loss": 0.8579824, + "num_input_tokens_seen": 331097645, + "router_z_loss_clip": 2.20410156, + "router_z_loss_mlp": 0.25268555, + "step": 15347, + "time_per_iteration": 2.6222996711730957 + }, + { + "auxiliary_loss_clip": 0.0121129, + "auxiliary_loss_mlp": 0.00225162, + "balance_loss_clip": 1.00563991, + "balance_loss_mlp": 0.20195223, + "epoch": 0.9227716819479934, + "flos": 20446027833600.0, + "grad_norm": 5.819183806436241, + "language_loss": 0.81629491, + "learning_rate": 6.219217887256367e-08, + "loss": 0.83065945, + "num_input_tokens_seen": 331116830, + "router_z_loss_clip": 2.05273438, + "router_z_loss_mlp": 0.23193359, + "step": 15348, + "time_per_iteration": 4.045306205749512 + }, + { + "auxiliary_loss_clip": 0.01256006, + "auxiliary_loss_mlp": 0.00242209, + "balance_loss_clip": 1.03048825, + "balance_loss_mlp": 0.21495757, + "epoch": 0.9228318052006613, + "flos": 25007401065600.0, + "grad_norm": 23.88420904454497, + "language_loss": 0.76054108, + "learning_rate": 6.209584827138959e-08, + "loss": 0.77552325, + "num_input_tokens_seen": 331137235, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.27246094, + "step": 15349, + "time_per_iteration": 2.738607406616211 + }, + { + "auxiliary_loss_clip": 0.01245739, + "auxiliary_loss_mlp": 0.00231603, + "balance_loss_clip": 1.02329564, + "balance_loss_mlp": 0.20557973, + "epoch": 0.9228919284533293, + "flos": 12677257560960.0, + "grad_norm": 860.5284308247421, + "language_loss": 0.97601151, + "learning_rate": 6.199959115573495e-08, + "loss": 0.990785, + "num_input_tokens_seen": 331153155, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.26025391, + "step": 15350, + "time_per_iteration": 2.651982545852661 + }, + { + "auxiliary_loss_clip": 0.01091124, + "auxiliary_loss_mlp": 0.00084868, + "balance_loss_clip": 0.95464194, + "balance_loss_mlp": 0.07766781, + "epoch": 0.9229520517059973, + "flos": 69986162712960.0, + "grad_norm": 0.7441270979252695, + "language_loss": 0.59533286, + "learning_rate": 6.190340752924994e-08, + "loss": 0.60709274, + "num_input_tokens_seen": 331214895, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.07177734, + "step": 15351, + "time_per_iteration": 3.1568312644958496 + }, + { + "auxiliary_loss_clip": 0.01255853, + "auxiliary_loss_mlp": 0.00216724, + "balance_loss_clip": 1.0333153, + "balance_loss_mlp": 0.1911414, + "epoch": 0.9230121749586653, + "flos": 14793832425600.0, + "grad_norm": 15.278472013009086, + "language_loss": 0.86329484, + "learning_rate": 6.180729739558233e-08, + "loss": 0.87802052, + "num_input_tokens_seen": 331232185, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.2557373, + "step": 15352, + "time_per_iteration": 2.6653170585632324 + }, + { + "auxiliary_loss_clip": 0.01258948, + "auxiliary_loss_mlp": 0.00227479, + "balance_loss_clip": 1.03398824, + "balance_loss_mlp": 0.20106186, + "epoch": 0.9230722982113332, + "flos": 22967208472320.0, + "grad_norm": 54.87972018831437, + "language_loss": 0.70298755, + "learning_rate": 6.171126075837585e-08, + "loss": 0.71785188, + "num_input_tokens_seen": 331251065, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.2644043, + "step": 15353, + "time_per_iteration": 2.6592440605163574 + }, + { + "auxiliary_loss_clip": 0.01231864, + "auxiliary_loss_mlp": 0.00229508, + "balance_loss_clip": 1.01973927, + "balance_loss_mlp": 0.20428352, + "epoch": 0.9231324214640012, + "flos": 18551452976640.0, + "grad_norm": 72.7246187514909, + "language_loss": 0.81879979, + "learning_rate": 6.161529762127293e-08, + "loss": 0.83341348, + "num_input_tokens_seen": 331269110, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.25231934, + "step": 15354, + "time_per_iteration": 2.658181667327881 + }, + { + "auxiliary_loss_clip": 0.01267133, + "auxiliary_loss_mlp": 0.00236957, + "balance_loss_clip": 1.0322578, + "balance_loss_mlp": 0.20846568, + "epoch": 0.9231925447166691, + "flos": 22082727974400.0, + "grad_norm": 2.3001549587716346, + "language_loss": 0.76944131, + "learning_rate": 6.1519407987912e-08, + "loss": 0.78448224, + "num_input_tokens_seen": 331286555, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.28479004, + "step": 15355, + "time_per_iteration": 2.666051149368286 + }, + { + "auxiliary_loss_clip": 0.01228062, + "auxiliary_loss_mlp": 0.00207969, + "balance_loss_clip": 1.01752496, + "balance_loss_mlp": 0.18407995, + "epoch": 0.9232526679693371, + "flos": 26541145848960.0, + "grad_norm": 4.274135170336573, + "language_loss": 0.82605147, + "learning_rate": 6.142359186192947e-08, + "loss": 0.84041178, + "num_input_tokens_seen": 331307660, + "router_z_loss_clip": 2.10839844, + "router_z_loss_mlp": 0.23901367, + "step": 15356, + "time_per_iteration": 2.7975668907165527 + }, + { + "auxiliary_loss_clip": 0.01238243, + "auxiliary_loss_mlp": 0.00211484, + "balance_loss_clip": 1.02105856, + "balance_loss_mlp": 0.18602064, + "epoch": 0.9233127912220052, + "flos": 14756664827520.0, + "grad_norm": 34.17062730295717, + "language_loss": 0.70029581, + "learning_rate": 6.132784924695844e-08, + "loss": 0.71479309, + "num_input_tokens_seen": 331324885, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.25463867, + "step": 15357, + "time_per_iteration": 2.603517770767212 + }, + { + "auxiliary_loss_clip": 0.01261887, + "auxiliary_loss_mlp": 0.00240655, + "balance_loss_clip": 1.03290462, + "balance_loss_mlp": 0.21175879, + "epoch": 0.9233729144746731, + "flos": 25261792162560.0, + "grad_norm": 12.759676932185963, + "language_loss": 0.76764011, + "learning_rate": 6.123218014662956e-08, + "loss": 0.78266549, + "num_input_tokens_seen": 331345885, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.28881836, + "step": 15358, + "time_per_iteration": 2.7020230293273926 + }, + { + "auxiliary_loss_clip": 0.01222182, + "auxiliary_loss_mlp": 0.00221347, + "balance_loss_clip": 1.00912523, + "balance_loss_mlp": 0.1974102, + "epoch": 0.9234330377273411, + "flos": 27849837968640.0, + "grad_norm": 109.05223959264094, + "language_loss": 0.81372929, + "learning_rate": 6.113658456457104e-08, + "loss": 0.82816452, + "num_input_tokens_seen": 331364320, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.23950195, + "step": 15359, + "time_per_iteration": 2.696103096008301 + }, + { + "auxiliary_loss_clip": 0.01232412, + "auxiliary_loss_mlp": 0.0023068, + "balance_loss_clip": 1.02222824, + "balance_loss_mlp": 0.20724364, + "epoch": 0.923493160980009, + "flos": 24608361899520.0, + "grad_norm": 4.49111588569745, + "language_loss": 0.73751003, + "learning_rate": 6.104106250440732e-08, + "loss": 0.75214094, + "num_input_tokens_seen": 331384135, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.23425293, + "step": 15360, + "time_per_iteration": 2.696033477783203 + }, + { + "auxiliary_loss_clip": 0.01090349, + "auxiliary_loss_mlp": 0.00073956, + "balance_loss_clip": 0.95406818, + "balance_loss_mlp": 0.06651688, + "epoch": 0.923553284232677, + "flos": 67700916558720.0, + "grad_norm": 0.808797484907436, + "language_loss": 0.54470754, + "learning_rate": 6.094561396976083e-08, + "loss": 0.55635059, + "num_input_tokens_seen": 331440645, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.07421875, + "step": 15361, + "time_per_iteration": 3.1378085613250732 + }, + { + "auxiliary_loss_clip": 0.0125153, + "auxiliary_loss_mlp": 0.00214875, + "balance_loss_clip": 1.02846515, + "balance_loss_mlp": 0.18813637, + "epoch": 0.9236134074853449, + "flos": 18807244704000.0, + "grad_norm": 5.657518793349361, + "language_loss": 0.8110621, + "learning_rate": 6.085023896425112e-08, + "loss": 0.82572609, + "num_input_tokens_seen": 331459580, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.26757812, + "step": 15362, + "time_per_iteration": 2.7207212448120117 + }, + { + "auxiliary_loss_clip": 0.01255381, + "auxiliary_loss_mlp": 0.00228866, + "balance_loss_clip": 1.02984977, + "balance_loss_mlp": 0.20008841, + "epoch": 0.923673530738013, + "flos": 27782362270080.0, + "grad_norm": 53.039070556140345, + "language_loss": 0.83675373, + "learning_rate": 6.075493749149463e-08, + "loss": 0.85159612, + "num_input_tokens_seen": 331481560, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.28796387, + "step": 15363, + "time_per_iteration": 2.8429746627807617 + }, + { + "auxiliary_loss_clip": 0.0123037, + "auxiliary_loss_mlp": 0.00206227, + "balance_loss_clip": 1.02038455, + "balance_loss_mlp": 0.18277867, + "epoch": 0.9237336539906809, + "flos": 26797117144320.0, + "grad_norm": 110.31931657423164, + "language_loss": 0.91319996, + "learning_rate": 6.065970955510514e-08, + "loss": 0.92756593, + "num_input_tokens_seen": 331499090, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.23449707, + "step": 15364, + "time_per_iteration": 2.729325771331787 + }, + { + "auxiliary_loss_clip": 0.01246752, + "auxiliary_loss_mlp": 0.0024959, + "balance_loss_clip": 1.02877557, + "balance_loss_mlp": 0.2247709, + "epoch": 0.9237937772433489, + "flos": 23587708942080.0, + "grad_norm": 41.16634062103163, + "language_loss": 0.7502054, + "learning_rate": 6.056455515869419e-08, + "loss": 0.76516879, + "num_input_tokens_seen": 331519420, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.24816895, + "step": 15365, + "time_per_iteration": 2.745678186416626 + }, + { + "auxiliary_loss_clip": 0.01253513, + "auxiliary_loss_mlp": 0.00225038, + "balance_loss_clip": 1.03120744, + "balance_loss_mlp": 0.20123202, + "epoch": 0.9238539004960168, + "flos": 26140562398080.0, + "grad_norm": 5.5534525582082335, + "language_loss": 0.72215492, + "learning_rate": 6.046947430586913e-08, + "loss": 0.7369405, + "num_input_tokens_seen": 331538720, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.23803711, + "step": 15366, + "time_per_iteration": 2.682913303375244 + }, + { + "auxiliary_loss_clip": 0.01228352, + "auxiliary_loss_mlp": 0.0021597, + "balance_loss_clip": 1.01487827, + "balance_loss_mlp": 0.19223532, + "epoch": 0.9239140237486848, + "flos": 21068000760960.0, + "grad_norm": 13.820170409697463, + "language_loss": 0.81325698, + "learning_rate": 6.037446700023619e-08, + "loss": 0.8277002, + "num_input_tokens_seen": 331558505, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.23742676, + "step": 15367, + "time_per_iteration": 2.7145352363586426 + }, + { + "auxiliary_loss_clip": 0.01228119, + "auxiliary_loss_mlp": 0.00222978, + "balance_loss_clip": 1.01832902, + "balance_loss_mlp": 0.20097163, + "epoch": 0.9239741470013527, + "flos": 24607930936320.0, + "grad_norm": 4.106528577847107, + "language_loss": 0.72543359, + "learning_rate": 6.027953324539759e-08, + "loss": 0.73994458, + "num_input_tokens_seen": 331578440, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.2199707, + "step": 15368, + "time_per_iteration": 2.6830554008483887 + }, + { + "auxiliary_loss_clip": 0.01244265, + "auxiliary_loss_mlp": 0.00241331, + "balance_loss_clip": 1.02521479, + "balance_loss_mlp": 0.21723834, + "epoch": 0.9240342702540207, + "flos": 24718248581760.0, + "grad_norm": 15.870974565261431, + "language_loss": 0.83601928, + "learning_rate": 6.018467304495401e-08, + "loss": 0.8508752, + "num_input_tokens_seen": 331598945, + "router_z_loss_clip": 2.19238281, + "router_z_loss_mlp": 0.24072266, + "step": 15369, + "time_per_iteration": 2.6846280097961426 + }, + { + "auxiliary_loss_clip": 0.01255114, + "auxiliary_loss_mlp": 0.00240793, + "balance_loss_clip": 1.02987778, + "balance_loss_mlp": 0.21296942, + "epoch": 0.9240943935066888, + "flos": 20849987162880.0, + "grad_norm": 256.9684957686365, + "language_loss": 0.86173916, + "learning_rate": 6.008988640250145e-08, + "loss": 0.87669826, + "num_input_tokens_seen": 331616700, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.2779541, + "step": 15370, + "time_per_iteration": 2.6263363361358643 + }, + { + "auxiliary_loss_clip": 0.01225801, + "auxiliary_loss_mlp": 0.00204191, + "balance_loss_clip": 1.01355028, + "balance_loss_mlp": 0.18034896, + "epoch": 0.9241545167593567, + "flos": 24462313200000.0, + "grad_norm": 7.458546260609154, + "language_loss": 0.73180103, + "learning_rate": 5.999517332163528e-08, + "loss": 0.7461009, + "num_input_tokens_seen": 331635625, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.23852539, + "step": 15371, + "time_per_iteration": 2.705002784729004 + }, + { + "auxiliary_loss_clip": 0.01102119, + "auxiliary_loss_mlp": 0.00070443, + "balance_loss_clip": 0.96192944, + "balance_loss_mlp": 0.06271869, + "epoch": 0.9242146400120247, + "flos": 61827259847040.0, + "grad_norm": 0.7056982990241214, + "language_loss": 0.57049716, + "learning_rate": 5.99005338059464e-08, + "loss": 0.58222276, + "num_input_tokens_seen": 331698595, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.07714844, + "step": 15372, + "time_per_iteration": 3.112746477127075 + }, + { + "auxiliary_loss_clip": 0.01218972, + "auxiliary_loss_mlp": 0.00233313, + "balance_loss_clip": 1.00784683, + "balance_loss_mlp": 0.20984066, + "epoch": 0.9242747632646926, + "flos": 22048397550720.0, + "grad_norm": 117.29596536092251, + "language_loss": 0.78218961, + "learning_rate": 5.98059678590237e-08, + "loss": 0.79671246, + "num_input_tokens_seen": 331717975, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.23461914, + "step": 15373, + "time_per_iteration": 2.6335248947143555 + }, + { + "auxiliary_loss_clip": 0.0123575, + "auxiliary_loss_mlp": 0.00204225, + "balance_loss_clip": 1.02327001, + "balance_loss_mlp": 0.17969128, + "epoch": 0.9243348865173606, + "flos": 18478338842880.0, + "grad_norm": 8.292031415015979, + "language_loss": 0.83609653, + "learning_rate": 5.971147548445299e-08, + "loss": 0.85049629, + "num_input_tokens_seen": 331737220, + "router_z_loss_clip": 2.12207031, + "router_z_loss_mlp": 0.24523926, + "step": 15374, + "time_per_iteration": 2.699777126312256 + }, + { + "auxiliary_loss_clip": 0.01222035, + "auxiliary_loss_mlp": 0.00204693, + "balance_loss_clip": 1.01225567, + "balance_loss_mlp": 0.18184046, + "epoch": 0.9243950097700285, + "flos": 23258767167360.0, + "grad_norm": 30.512391122485294, + "language_loss": 0.73964083, + "learning_rate": 5.961705668581784e-08, + "loss": 0.75390804, + "num_input_tokens_seen": 331757300, + "router_z_loss_clip": 2.09667969, + "router_z_loss_mlp": 0.2286377, + "step": 15375, + "time_per_iteration": 2.724558115005493 + }, + { + "auxiliary_loss_clip": 0.01237317, + "auxiliary_loss_mlp": 0.0022464, + "balance_loss_clip": 1.02203512, + "balance_loss_mlp": 0.19849706, + "epoch": 0.9244551330226966, + "flos": 29749081593600.0, + "grad_norm": 11.098159840546234, + "language_loss": 0.74095106, + "learning_rate": 5.952271146669829e-08, + "loss": 0.75557065, + "num_input_tokens_seen": 331776995, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.26135254, + "step": 15376, + "time_per_iteration": 2.8236963748931885 + }, + { + "auxiliary_loss_clip": 0.01092531, + "auxiliary_loss_mlp": 0.00067225, + "balance_loss_clip": 0.95526063, + "balance_loss_mlp": 0.05992972, + "epoch": 0.9245152562753645, + "flos": 68864960609280.0, + "grad_norm": 0.6430496470717083, + "language_loss": 0.60542929, + "learning_rate": 5.94284398306717e-08, + "loss": 0.61702693, + "num_input_tokens_seen": 331845015, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.07275391, + "step": 15377, + "time_per_iteration": 4.672845840454102 + }, + { + "auxiliary_loss_clip": 0.01235185, + "auxiliary_loss_mlp": 0.00224418, + "balance_loss_clip": 1.02219439, + "balance_loss_mlp": 0.20043309, + "epoch": 0.9245753795280325, + "flos": 21579260993280.0, + "grad_norm": 34.89366483582198, + "language_loss": 0.80964327, + "learning_rate": 5.933424178131341e-08, + "loss": 0.82423931, + "num_input_tokens_seen": 331862795, + "router_z_loss_clip": 2.13183594, + "router_z_loss_mlp": 0.23999023, + "step": 15378, + "time_per_iteration": 4.307880163192749 + }, + { + "auxiliary_loss_clip": 0.01246127, + "auxiliary_loss_mlp": 0.00237299, + "balance_loss_clip": 1.02853, + "balance_loss_mlp": 0.21081088, + "epoch": 0.9246355027807004, + "flos": 34496077334400.0, + "grad_norm": 24.511777334342728, + "language_loss": 0.70571208, + "learning_rate": 5.924011732219503e-08, + "loss": 0.72054631, + "num_input_tokens_seen": 331882535, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.26489258, + "step": 15379, + "time_per_iteration": 2.8104381561279297 + }, + { + "auxiliary_loss_clip": 0.01221433, + "auxiliary_loss_mlp": 0.00221841, + "balance_loss_clip": 1.01115608, + "balance_loss_mlp": 0.19826122, + "epoch": 0.9246956260333684, + "flos": 15953854152960.0, + "grad_norm": 6.138477461793837, + "language_loss": 0.92177683, + "learning_rate": 5.914606645688591e-08, + "loss": 0.93620956, + "num_input_tokens_seen": 331899335, + "router_z_loss_clip": 2.10351562, + "router_z_loss_mlp": 0.23608398, + "step": 15380, + "time_per_iteration": 2.740817070007324 + }, + { + "auxiliary_loss_clip": 0.01249843, + "auxiliary_loss_mlp": 0.00239739, + "balance_loss_clip": 1.02763748, + "balance_loss_mlp": 0.21344116, + "epoch": 0.9247557492860363, + "flos": 23368366540800.0, + "grad_norm": 20.383857026752406, + "language_loss": 0.80592942, + "learning_rate": 5.905208918895233e-08, + "loss": 0.82082522, + "num_input_tokens_seen": 331919030, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.26330566, + "step": 15381, + "time_per_iteration": 2.692462205886841 + }, + { + "auxiliary_loss_clip": 0.01254667, + "auxiliary_loss_mlp": 0.00231802, + "balance_loss_clip": 1.03533721, + "balance_loss_mlp": 0.20593357, + "epoch": 0.9248158725387043, + "flos": 23039855729280.0, + "grad_norm": 23.510702157914203, + "language_loss": 0.85819733, + "learning_rate": 5.8958185521958524e-08, + "loss": 0.87306201, + "num_input_tokens_seen": 331936465, + "router_z_loss_clip": 2.19433594, + "router_z_loss_mlp": 0.25842285, + "step": 15382, + "time_per_iteration": 2.7246627807617188 + }, + { + "auxiliary_loss_clip": 0.01249729, + "auxiliary_loss_mlp": 0.00238122, + "balance_loss_clip": 1.02864122, + "balance_loss_mlp": 0.21122883, + "epoch": 0.9248759957913724, + "flos": 22522418357760.0, + "grad_norm": 120.78709099181876, + "language_loss": 0.81093585, + "learning_rate": 5.886435545946455e-08, + "loss": 0.82581437, + "num_input_tokens_seen": 331954625, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.2689209, + "step": 15383, + "time_per_iteration": 2.6523597240448 + }, + { + "auxiliary_loss_clip": 0.01235319, + "auxiliary_loss_mlp": 0.00213477, + "balance_loss_clip": 1.0232625, + "balance_loss_mlp": 0.18899174, + "epoch": 0.9249361190440403, + "flos": 25447271016960.0, + "grad_norm": 3.040619293590424, + "language_loss": 0.82654673, + "learning_rate": 5.8770599005028456e-08, + "loss": 0.84103471, + "num_input_tokens_seen": 331975865, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.24487305, + "step": 15384, + "time_per_iteration": 2.7744874954223633 + }, + { + "auxiliary_loss_clip": 0.01227227, + "auxiliary_loss_mlp": 0.00205788, + "balance_loss_clip": 1.01126051, + "balance_loss_mlp": 0.18117164, + "epoch": 0.9249962422967083, + "flos": 12378623886720.0, + "grad_norm": 5.965523551749448, + "language_loss": 0.78471905, + "learning_rate": 5.8676916162206045e-08, + "loss": 0.79904926, + "num_input_tokens_seen": 331992760, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.24621582, + "step": 15385, + "time_per_iteration": 4.175187110900879 + }, + { + "auxiliary_loss_clip": 0.01250287, + "auxiliary_loss_mlp": 0.0023084, + "balance_loss_clip": 1.03284109, + "balance_loss_mlp": 0.20417309, + "epoch": 0.9250563655493762, + "flos": 22929430343040.0, + "grad_norm": 47.04090976642357, + "language_loss": 0.90319276, + "learning_rate": 5.85833069345496e-08, + "loss": 0.91800404, + "num_input_tokens_seen": 332011890, + "router_z_loss_clip": 2.17285156, + "router_z_loss_mlp": 0.26672363, + "step": 15386, + "time_per_iteration": 2.662161111831665 + }, + { + "auxiliary_loss_clip": 0.01228508, + "auxiliary_loss_mlp": 0.00209124, + "balance_loss_clip": 1.01728344, + "balance_loss_mlp": 0.18563968, + "epoch": 0.9251164888020442, + "flos": 18478662065280.0, + "grad_norm": 9.686475516662387, + "language_loss": 0.84393728, + "learning_rate": 5.8489771325608504e-08, + "loss": 0.85831356, + "num_input_tokens_seen": 332029485, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.23474121, + "step": 15387, + "time_per_iteration": 2.7189342975616455 + }, + { + "auxiliary_loss_clip": 0.01229599, + "auxiliary_loss_mlp": 0.00227088, + "balance_loss_clip": 1.01562595, + "balance_loss_mlp": 0.20145822, + "epoch": 0.9251766120547121, + "flos": 33037062796800.0, + "grad_norm": 304.56368967306435, + "language_loss": 0.757936, + "learning_rate": 5.839630933893014e-08, + "loss": 0.7725029, + "num_input_tokens_seen": 332052970, + "router_z_loss_clip": 2.14160156, + "router_z_loss_mlp": 0.25634766, + "step": 15388, + "time_per_iteration": 2.8978002071380615 + }, + { + "auxiliary_loss_clip": 0.01252511, + "auxiliary_loss_mlp": 0.00227099, + "balance_loss_clip": 1.02769732, + "balance_loss_mlp": 0.20179096, + "epoch": 0.9252367353073802, + "flos": 24387906176640.0, + "grad_norm": 3.9951890913897277, + "language_loss": 0.90933061, + "learning_rate": 5.8302920978058115e-08, + "loss": 0.92412674, + "num_input_tokens_seen": 332070395, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.25305176, + "step": 15389, + "time_per_iteration": 2.655665397644043 + }, + { + "auxiliary_loss_clip": 0.01278288, + "auxiliary_loss_mlp": 0.00249882, + "balance_loss_clip": 1.04782283, + "balance_loss_mlp": 0.2224396, + "epoch": 0.9252968585600481, + "flos": 18916844077440.0, + "grad_norm": 285.7073213668664, + "language_loss": 0.86137009, + "learning_rate": 5.820960624653381e-08, + "loss": 0.87665182, + "num_input_tokens_seen": 332090185, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.27429199, + "step": 15390, + "time_per_iteration": 4.13965630531311 + }, + { + "auxiliary_loss_clip": 0.01253677, + "auxiliary_loss_mlp": 0.00216289, + "balance_loss_clip": 1.03088343, + "balance_loss_mlp": 0.19071874, + "epoch": 0.9253569818127161, + "flos": 21725345606400.0, + "grad_norm": 3.0044628113383665, + "language_loss": 0.83562493, + "learning_rate": 5.811636514789597e-08, + "loss": 0.85032463, + "num_input_tokens_seen": 332109050, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.2557373, + "step": 15391, + "time_per_iteration": 2.6316168308258057 + }, + { + "auxiliary_loss_clip": 0.01273696, + "auxiliary_loss_mlp": 0.00236449, + "balance_loss_clip": 1.04480505, + "balance_loss_mlp": 0.20849422, + "epoch": 0.925417105065384, + "flos": 34240357434240.0, + "grad_norm": 99.47517312716941, + "language_loss": 0.62251544, + "learning_rate": 5.80231976856802e-08, + "loss": 0.63761687, + "num_input_tokens_seen": 332131180, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.27929688, + "step": 15392, + "time_per_iteration": 2.804828405380249 + }, + { + "auxiliary_loss_clip": 0.01240635, + "auxiliary_loss_mlp": 0.00245686, + "balance_loss_clip": 1.02355027, + "balance_loss_mlp": 0.22044985, + "epoch": 0.925477228318052, + "flos": 25959536830080.0, + "grad_norm": 8.858574184139616, + "language_loss": 0.84770155, + "learning_rate": 5.7930103863419454e-08, + "loss": 0.8625648, + "num_input_tokens_seen": 332149555, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.25244141, + "step": 15393, + "time_per_iteration": 2.6844425201416016 + }, + { + "auxiliary_loss_clip": 0.01232339, + "auxiliary_loss_mlp": 0.00217855, + "balance_loss_clip": 1.01721978, + "balance_loss_mlp": 0.19183135, + "epoch": 0.9255373515707199, + "flos": 11838240702720.0, + "grad_norm": 3.426160207552794, + "language_loss": 0.78342563, + "learning_rate": 5.783708368464357e-08, + "loss": 0.79792756, + "num_input_tokens_seen": 332165830, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.26037598, + "step": 15394, + "time_per_iteration": 2.700195074081421 + }, + { + "auxiliary_loss_clip": 0.01232808, + "auxiliary_loss_mlp": 0.00235915, + "balance_loss_clip": 1.02119303, + "balance_loss_mlp": 0.21011797, + "epoch": 0.925597474823388, + "flos": 21434325615360.0, + "grad_norm": 55.25787419169357, + "language_loss": 0.80679655, + "learning_rate": 5.7744137152879956e-08, + "loss": 0.82148379, + "num_input_tokens_seen": 332185130, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.25793457, + "step": 15395, + "time_per_iteration": 2.625373125076294 + }, + { + "auxiliary_loss_clip": 0.0123418, + "auxiliary_loss_mlp": 0.00226917, + "balance_loss_clip": 1.01731181, + "balance_loss_mlp": 0.20197842, + "epoch": 0.925657598076056, + "flos": 22857573185280.0, + "grad_norm": 113.15015449971071, + "language_loss": 0.80808151, + "learning_rate": 5.7651264271653785e-08, + "loss": 0.82269251, + "num_input_tokens_seen": 332203695, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.24963379, + "step": 15396, + "time_per_iteration": 2.696953296661377 + }, + { + "auxiliary_loss_clip": 0.01246989, + "auxiliary_loss_mlp": 0.00227104, + "balance_loss_clip": 1.02816606, + "balance_loss_mlp": 0.19902982, + "epoch": 0.9257177213287239, + "flos": 25704032411520.0, + "grad_norm": 691.4068881012736, + "language_loss": 0.9361918, + "learning_rate": 5.755846504448603e-08, + "loss": 0.95093274, + "num_input_tokens_seen": 332224850, + "router_z_loss_clip": 2.18847656, + "router_z_loss_mlp": 0.28088379, + "step": 15397, + "time_per_iteration": 2.71514892578125 + }, + { + "auxiliary_loss_clip": 0.01093142, + "auxiliary_loss_mlp": 0.00075514, + "balance_loss_clip": 0.9586888, + "balance_loss_mlp": 0.06759874, + "epoch": 0.9257778445813919, + "flos": 59592933221760.0, + "grad_norm": 0.7917543502888512, + "language_loss": 0.54809833, + "learning_rate": 5.746573947489586e-08, + "loss": 0.55978489, + "num_input_tokens_seen": 332278085, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.07910156, + "step": 15398, + "time_per_iteration": 3.0459885597229004 + }, + { + "auxiliary_loss_clip": 0.01263374, + "auxiliary_loss_mlp": 0.00248683, + "balance_loss_clip": 1.0362134, + "balance_loss_mlp": 0.21945331, + "epoch": 0.9258379678340598, + "flos": 27709427704320.0, + "grad_norm": 56.72210130941292, + "language_loss": 0.86134928, + "learning_rate": 5.7373087566400025e-08, + "loss": 0.87646985, + "num_input_tokens_seen": 332297875, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.29248047, + "step": 15399, + "time_per_iteration": 2.772329092025757 + }, + { + "auxiliary_loss_clip": 0.01214567, + "auxiliary_loss_mlp": 0.00217432, + "balance_loss_clip": 1.00759935, + "balance_loss_mlp": 0.19373342, + "epoch": 0.9258980910867278, + "flos": 24863543095680.0, + "grad_norm": 26.39246507986695, + "language_loss": 0.84525532, + "learning_rate": 5.7280509322510826e-08, + "loss": 0.85957533, + "num_input_tokens_seen": 332318500, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.23718262, + "step": 15400, + "time_per_iteration": 2.7075726985931396 + }, + { + "auxiliary_loss_clip": 0.01094257, + "auxiliary_loss_mlp": 0.00091956, + "balance_loss_clip": 0.95918465, + "balance_loss_mlp": 0.08394549, + "epoch": 0.9259582143393957, + "flos": 63134587249920.0, + "grad_norm": 0.7105062275276695, + "language_loss": 0.51001406, + "learning_rate": 5.718800474673946e-08, + "loss": 0.52187622, + "num_input_tokens_seen": 332381980, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.08007812, + "step": 15401, + "time_per_iteration": 3.1205708980560303 + }, + { + "auxiliary_loss_clip": 0.01218453, + "auxiliary_loss_mlp": 0.00210331, + "balance_loss_clip": 1.00835109, + "balance_loss_mlp": 0.18616766, + "epoch": 0.9260183375920638, + "flos": 24127122458880.0, + "grad_norm": 123.38493707269622, + "language_loss": 0.87598801, + "learning_rate": 5.709557384259378e-08, + "loss": 0.89027584, + "num_input_tokens_seen": 332399510, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.24145508, + "step": 15402, + "time_per_iteration": 2.6664953231811523 + }, + { + "auxiliary_loss_clip": 0.01096122, + "auxiliary_loss_mlp": 0.00050064, + "balance_loss_clip": 0.95960307, + "balance_loss_mlp": 0.04281577, + "epoch": 0.9260784608447317, + "flos": 63042872849280.0, + "grad_norm": 0.7342002143172576, + "language_loss": 0.50281346, + "learning_rate": 5.700321661357876e-08, + "loss": 0.51427537, + "num_input_tokens_seen": 332459130, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.07226562, + "step": 15403, + "time_per_iteration": 3.2268285751342773 + }, + { + "auxiliary_loss_clip": 0.01090815, + "auxiliary_loss_mlp": 0.00062987, + "balance_loss_clip": 0.95345581, + "balance_loss_mlp": 0.05597761, + "epoch": 0.9261385840973997, + "flos": 70585979927040.0, + "grad_norm": 0.6684257328871466, + "language_loss": 0.58284312, + "learning_rate": 5.69109330631965e-08, + "loss": 0.59438115, + "num_input_tokens_seen": 332526555, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.0703125, + "step": 15404, + "time_per_iteration": 3.1576764583587646 + }, + { + "auxiliary_loss_clip": 0.01231953, + "auxiliary_loss_mlp": 0.00224953, + "balance_loss_clip": 1.01626897, + "balance_loss_mlp": 0.1994426, + "epoch": 0.9261987073500676, + "flos": 20229917656320.0, + "grad_norm": 12.797307021393562, + "language_loss": 0.8090415, + "learning_rate": 5.681872319494596e-08, + "loss": 0.82361054, + "num_input_tokens_seen": 332544005, + "router_z_loss_clip": 2.16113281, + "router_z_loss_mlp": 0.25500488, + "step": 15405, + "time_per_iteration": 2.688830852508545 + }, + { + "auxiliary_loss_clip": 0.01246156, + "auxiliary_loss_mlp": 0.00213156, + "balance_loss_clip": 1.02895427, + "balance_loss_mlp": 0.18791978, + "epoch": 0.9262588306027356, + "flos": 20954163582720.0, + "grad_norm": 7.239018238409776, + "language_loss": 0.78608048, + "learning_rate": 5.672658701232458e-08, + "loss": 0.80067366, + "num_input_tokens_seen": 332563070, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.25256348, + "step": 15406, + "time_per_iteration": 2.7194502353668213 + }, + { + "auxiliary_loss_clip": 0.01239221, + "auxiliary_loss_mlp": 0.00231666, + "balance_loss_clip": 1.01987612, + "balance_loss_mlp": 0.20559523, + "epoch": 0.9263189538554035, + "flos": 22158679282560.0, + "grad_norm": 16.029974416878353, + "language_loss": 0.85996878, + "learning_rate": 5.663452451882555e-08, + "loss": 0.87467766, + "num_input_tokens_seen": 332579620, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.26049805, + "step": 15407, + "time_per_iteration": 2.656231641769409 + }, + { + "auxiliary_loss_clip": 0.01255842, + "auxiliary_loss_mlp": 0.00249534, + "balance_loss_clip": 1.02937126, + "balance_loss_mlp": 0.22138873, + "epoch": 0.9263790771080715, + "flos": 18187211111040.0, + "grad_norm": 20.81688336069759, + "language_loss": 0.82317507, + "learning_rate": 5.6542535717940096e-08, + "loss": 0.83822882, + "num_input_tokens_seen": 332597795, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.28161621, + "step": 15408, + "time_per_iteration": 2.758786678314209 + }, + { + "auxiliary_loss_clip": 0.01219084, + "auxiliary_loss_mlp": 0.00223272, + "balance_loss_clip": 1.00960231, + "balance_loss_mlp": 0.19971617, + "epoch": 0.9264392003607396, + "flos": 48178545004800.0, + "grad_norm": 11.855922670118483, + "language_loss": 0.75139117, + "learning_rate": 5.645062061315675e-08, + "loss": 0.76581472, + "num_input_tokens_seen": 332620375, + "router_z_loss_clip": 2.09277344, + "router_z_loss_mlp": 0.2355957, + "step": 15409, + "time_per_iteration": 2.867767810821533 + }, + { + "auxiliary_loss_clip": 0.01244262, + "auxiliary_loss_mlp": 0.00224548, + "balance_loss_clip": 1.02479768, + "balance_loss_mlp": 0.1996925, + "epoch": 0.9264993236134075, + "flos": 26389458714240.0, + "grad_norm": 4.338239355233278, + "language_loss": 0.84387076, + "learning_rate": 5.6358779207960506e-08, + "loss": 0.85855889, + "num_input_tokens_seen": 332639510, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.2487793, + "step": 15410, + "time_per_iteration": 2.7551653385162354 + }, + { + "auxiliary_loss_clip": 0.01235532, + "auxiliary_loss_mlp": 0.00233255, + "balance_loss_clip": 1.02154255, + "balance_loss_mlp": 0.20831636, + "epoch": 0.9265594468660755, + "flos": 20920084554240.0, + "grad_norm": 31.344764043631727, + "language_loss": 0.87671173, + "learning_rate": 5.6267011505833905e-08, + "loss": 0.89139962, + "num_input_tokens_seen": 332658350, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.24902344, + "step": 15411, + "time_per_iteration": 2.6602351665496826 + }, + { + "auxiliary_loss_clip": 0.01256055, + "auxiliary_loss_mlp": 0.00224014, + "balance_loss_clip": 1.03661084, + "balance_loss_mlp": 0.1997072, + "epoch": 0.9266195701187434, + "flos": 17525017929600.0, + "grad_norm": 475.1065275754509, + "language_loss": 0.81985337, + "learning_rate": 5.617531751025728e-08, + "loss": 0.83465403, + "num_input_tokens_seen": 332676715, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.24316406, + "step": 15412, + "time_per_iteration": 2.7413437366485596 + }, + { + "auxiliary_loss_clip": 0.01221193, + "auxiliary_loss_mlp": 0.0021906, + "balance_loss_clip": 1.01097882, + "balance_loss_mlp": 0.19465759, + "epoch": 0.9266796933714114, + "flos": 33688733293440.0, + "grad_norm": 2.953455408945847, + "language_loss": 0.74784946, + "learning_rate": 5.6083697224707406e-08, + "loss": 0.76225197, + "num_input_tokens_seen": 332701470, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.24401855, + "step": 15413, + "time_per_iteration": 2.8012936115264893 + }, + { + "auxiliary_loss_clip": 0.01245679, + "auxiliary_loss_mlp": 0.002155, + "balance_loss_clip": 1.02884781, + "balance_loss_mlp": 0.1919919, + "epoch": 0.9267398166240793, + "flos": 18916520855040.0, + "grad_norm": 251.37320504146476, + "language_loss": 0.83851433, + "learning_rate": 5.5992150652658167e-08, + "loss": 0.85312605, + "num_input_tokens_seen": 332719060, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.23510742, + "step": 15414, + "time_per_iteration": 2.6309783458709717 + }, + { + "auxiliary_loss_clip": 0.01218026, + "auxiliary_loss_mlp": 0.0021318, + "balance_loss_clip": 1.00928402, + "balance_loss_mlp": 0.18933846, + "epoch": 0.9267999398767474, + "flos": 20478957626880.0, + "grad_norm": 36.481733428654216, + "language_loss": 0.8790074, + "learning_rate": 5.59006777975819e-08, + "loss": 0.89331949, + "num_input_tokens_seen": 332736345, + "router_z_loss_clip": 2.0859375, + "router_z_loss_mlp": 0.23840332, + "step": 15415, + "time_per_iteration": 2.655496835708618 + }, + { + "auxiliary_loss_clip": 0.01240004, + "auxiliary_loss_mlp": 0.0023148, + "balance_loss_clip": 1.02087736, + "balance_loss_mlp": 0.20624366, + "epoch": 0.9268600631294153, + "flos": 24789351553920.0, + "grad_norm": 23.465515359641316, + "language_loss": 0.62257993, + "learning_rate": 5.580927866294671e-08, + "loss": 0.63729483, + "num_input_tokens_seen": 332756270, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.25231934, + "step": 15416, + "time_per_iteration": 2.664907932281494 + }, + { + "auxiliary_loss_clip": 0.01235416, + "auxiliary_loss_mlp": 0.00211292, + "balance_loss_clip": 1.02199161, + "balance_loss_mlp": 0.18767682, + "epoch": 0.9269201863820833, + "flos": 18697178453760.0, + "grad_norm": 540.6228057440904, + "language_loss": 0.80163038, + "learning_rate": 5.571795325221807e-08, + "loss": 0.81609744, + "num_input_tokens_seen": 332775185, + "router_z_loss_clip": 2.13378906, + "router_z_loss_mlp": 0.23608398, + "step": 15417, + "time_per_iteration": 2.6940248012542725 + }, + { + "auxiliary_loss_clip": 0.01243053, + "auxiliary_loss_mlp": 0.00212867, + "balance_loss_clip": 1.02325141, + "balance_loss_mlp": 0.18536542, + "epoch": 0.9269803096347512, + "flos": 20923999136640.0, + "grad_norm": 96.38788273348412, + "language_loss": 0.85124457, + "learning_rate": 5.5626701568859624e-08, + "loss": 0.86580372, + "num_input_tokens_seen": 332794320, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.27490234, + "step": 15418, + "time_per_iteration": 2.6684722900390625 + }, + { + "auxiliary_loss_clip": 0.01235837, + "auxiliary_loss_mlp": 0.00236786, + "balance_loss_clip": 1.01892781, + "balance_loss_mlp": 0.21126322, + "epoch": 0.9270404328874192, + "flos": 28002710252160.0, + "grad_norm": 99.37403375480774, + "language_loss": 0.82889682, + "learning_rate": 5.553552361633174e-08, + "loss": 0.84362304, + "num_input_tokens_seen": 332818095, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.25537109, + "step": 15419, + "time_per_iteration": 2.776309013366699 + }, + { + "auxiliary_loss_clip": 0.0120391, + "auxiliary_loss_mlp": 0.00208726, + "balance_loss_clip": 0.9979459, + "balance_loss_mlp": 0.18588509, + "epoch": 0.9271005561400871, + "flos": 25889870401920.0, + "grad_norm": 33.83367208887662, + "language_loss": 0.81639642, + "learning_rate": 5.5444419398091636e-08, + "loss": 0.83052278, + "num_input_tokens_seen": 332839860, + "router_z_loss_clip": 2.05761719, + "router_z_loss_mlp": 0.22851562, + "step": 15420, + "time_per_iteration": 4.248643398284912 + }, + { + "auxiliary_loss_clip": 0.01244144, + "auxiliary_loss_mlp": 0.00218947, + "balance_loss_clip": 1.02475071, + "balance_loss_mlp": 0.19319771, + "epoch": 0.9271606793927551, + "flos": 27053914452480.0, + "grad_norm": 32.38280326417553, + "language_loss": 0.83703107, + "learning_rate": 5.535338891759389e-08, + "loss": 0.85166204, + "num_input_tokens_seen": 332861155, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.2578125, + "step": 15421, + "time_per_iteration": 4.149538278579712 + }, + { + "auxiliary_loss_clip": 0.01231068, + "auxiliary_loss_mlp": 0.0023037, + "balance_loss_clip": 1.01532614, + "balance_loss_mlp": 0.20611057, + "epoch": 0.9272208026454232, + "flos": 26209869690240.0, + "grad_norm": 9.87297963187902, + "language_loss": 0.81245023, + "learning_rate": 5.526243217829041e-08, + "loss": 0.82706457, + "num_input_tokens_seen": 332881110, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.24255371, + "step": 15422, + "time_per_iteration": 2.7476139068603516 + }, + { + "auxiliary_loss_clip": 0.01248174, + "auxiliary_loss_mlp": 0.00221279, + "balance_loss_clip": 1.02582037, + "balance_loss_mlp": 0.19610187, + "epoch": 0.9272809258980911, + "flos": 12458453863680.0, + "grad_norm": 60.2545033265279, + "language_loss": 0.9046604, + "learning_rate": 5.517154918363065e-08, + "loss": 0.91935492, + "num_input_tokens_seen": 332899350, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.25195312, + "step": 15423, + "time_per_iteration": 2.7256083488464355 + }, + { + "auxiliary_loss_clip": 0.01246643, + "auxiliary_loss_mlp": 0.00240787, + "balance_loss_clip": 1.0281266, + "balance_loss_mlp": 0.21763682, + "epoch": 0.9273410491507591, + "flos": 22856890826880.0, + "grad_norm": 4.1344896653316505, + "language_loss": 0.83677971, + "learning_rate": 5.508073993706053e-08, + "loss": 0.85165405, + "num_input_tokens_seen": 332918105, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.23168945, + "step": 15424, + "time_per_iteration": 2.7141287326812744 + }, + { + "auxiliary_loss_clip": 0.01091804, + "auxiliary_loss_mlp": 0.00046536, + "balance_loss_clip": 0.9562608, + "balance_loss_mlp": 0.03986066, + "epoch": 0.927401172403427, + "flos": 47665384329600.0, + "grad_norm": 1.5649922388032202, + "language_loss": 0.59617722, + "learning_rate": 5.499000444202351e-08, + "loss": 0.60756063, + "num_input_tokens_seen": 332969490, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.06689453, + "step": 15425, + "time_per_iteration": 2.9562928676605225 + }, + { + "auxiliary_loss_clip": 0.01233868, + "auxiliary_loss_mlp": 0.00220378, + "balance_loss_clip": 1.02046156, + "balance_loss_mlp": 0.19613121, + "epoch": 0.927461295656095, + "flos": 29972374490880.0, + "grad_norm": 29.46208262031019, + "language_loss": 0.77396286, + "learning_rate": 5.489934270196106e-08, + "loss": 0.78850543, + "num_input_tokens_seen": 332988805, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.24243164, + "step": 15426, + "time_per_iteration": 2.7989273071289062 + }, + { + "auxiliary_loss_clip": 0.01238984, + "auxiliary_loss_mlp": 0.00225005, + "balance_loss_clip": 1.02936614, + "balance_loss_mlp": 0.20212841, + "epoch": 0.9275214189087629, + "flos": 20375427651840.0, + "grad_norm": 35.783362862748156, + "language_loss": 0.89252079, + "learning_rate": 5.480875472030977e-08, + "loss": 0.9071607, + "num_input_tokens_seen": 333007960, + "router_z_loss_clip": 2.09667969, + "router_z_loss_mlp": 0.2286377, + "step": 15427, + "time_per_iteration": 4.263451099395752 + }, + { + "auxiliary_loss_clip": 0.01257041, + "auxiliary_loss_mlp": 0.00237804, + "balance_loss_clip": 1.03172207, + "balance_loss_mlp": 0.21163712, + "epoch": 0.927581542161431, + "flos": 22383193242240.0, + "grad_norm": 15.895171789224603, + "language_loss": 0.83966625, + "learning_rate": 5.471824050050555e-08, + "loss": 0.85461462, + "num_input_tokens_seen": 333026035, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.26147461, + "step": 15428, + "time_per_iteration": 2.6723058223724365 + }, + { + "auxiliary_loss_clip": 0.01234385, + "auxiliary_loss_mlp": 0.0022815, + "balance_loss_clip": 1.01877129, + "balance_loss_mlp": 0.20175725, + "epoch": 0.9276416654140989, + "flos": 23952453598080.0, + "grad_norm": 10.683578051879332, + "language_loss": 0.81623709, + "learning_rate": 5.4627800045980555e-08, + "loss": 0.8308624, + "num_input_tokens_seen": 333045590, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.26403809, + "step": 15429, + "time_per_iteration": 2.6646621227264404 + }, + { + "auxiliary_loss_clip": 0.01222625, + "auxiliary_loss_mlp": 0.00199499, + "balance_loss_clip": 1.01080656, + "balance_loss_mlp": 0.17613395, + "epoch": 0.9277017886667669, + "flos": 13917719796480.0, + "grad_norm": 12.693469777870273, + "language_loss": 0.83456677, + "learning_rate": 5.45374333601647e-08, + "loss": 0.84878802, + "num_input_tokens_seen": 333063355, + "router_z_loss_clip": 2.11816406, + "router_z_loss_mlp": 0.23388672, + "step": 15430, + "time_per_iteration": 2.695611000061035 + }, + { + "auxiliary_loss_clip": 0.01250393, + "auxiliary_loss_mlp": 0.00209474, + "balance_loss_clip": 1.03111744, + "balance_loss_mlp": 0.18280694, + "epoch": 0.9277619119194348, + "flos": 35666478092160.0, + "grad_norm": 112.84610767742186, + "language_loss": 0.83005768, + "learning_rate": 5.444714044648391e-08, + "loss": 0.84465635, + "num_input_tokens_seen": 333088045, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.26660156, + "step": 15431, + "time_per_iteration": 2.829538345336914 + }, + { + "auxiliary_loss_clip": 0.01235576, + "auxiliary_loss_mlp": 0.00231653, + "balance_loss_clip": 1.02095723, + "balance_loss_mlp": 0.20654801, + "epoch": 0.9278220351721028, + "flos": 23841238112640.0, + "grad_norm": 10.578138407385245, + "language_loss": 0.77570283, + "learning_rate": 5.4356921308363e-08, + "loss": 0.79037511, + "num_input_tokens_seen": 333108005, + "router_z_loss_clip": 2.14550781, + "router_z_loss_mlp": 0.25134277, + "step": 15432, + "time_per_iteration": 4.078392028808594 + }, + { + "auxiliary_loss_clip": 0.01242865, + "auxiliary_loss_mlp": 0.00211934, + "balance_loss_clip": 1.02430975, + "balance_loss_mlp": 0.18769917, + "epoch": 0.9278821584247707, + "flos": 15228135768960.0, + "grad_norm": 15.095740712043044, + "language_loss": 0.91149449, + "learning_rate": 5.4266775949222354e-08, + "loss": 0.92604244, + "num_input_tokens_seen": 333124335, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.24243164, + "step": 15433, + "time_per_iteration": 2.737490653991699 + }, + { + "auxiliary_loss_clip": 0.0122307, + "auxiliary_loss_mlp": 0.00210827, + "balance_loss_clip": 1.01346827, + "balance_loss_mlp": 0.18697347, + "epoch": 0.9279422816774388, + "flos": 24681404206080.0, + "grad_norm": 34.48764169784444, + "language_loss": 0.76146126, + "learning_rate": 5.417670437248056e-08, + "loss": 0.77580017, + "num_input_tokens_seen": 333143995, + "router_z_loss_clip": 2.09472656, + "router_z_loss_mlp": 0.23840332, + "step": 15434, + "time_per_iteration": 2.6718480587005615 + }, + { + "auxiliary_loss_clip": 0.01217391, + "auxiliary_loss_mlp": 0.00207503, + "balance_loss_clip": 1.01230395, + "balance_loss_mlp": 0.1843403, + "epoch": 0.9280024049301068, + "flos": 19169188099200.0, + "grad_norm": 13.365002507407802, + "language_loss": 0.76086974, + "learning_rate": 5.40867065815529e-08, + "loss": 0.77511871, + "num_input_tokens_seen": 333162805, + "router_z_loss_clip": 2.04980469, + "router_z_loss_mlp": 0.23144531, + "step": 15435, + "time_per_iteration": 2.675955057144165 + }, + { + "auxiliary_loss_clip": 0.01231558, + "auxiliary_loss_mlp": 0.00226819, + "balance_loss_clip": 1.01704216, + "balance_loss_mlp": 0.19998461, + "epoch": 0.9280625281827747, + "flos": 11393701983360.0, + "grad_norm": 7.897831613507709, + "language_loss": 0.82580185, + "learning_rate": 5.399678257985263e-08, + "loss": 0.84038556, + "num_input_tokens_seen": 333175770, + "router_z_loss_clip": 2.14355469, + "router_z_loss_mlp": 0.26843262, + "step": 15436, + "time_per_iteration": 2.6557416915893555 + }, + { + "auxiliary_loss_clip": 0.01229081, + "auxiliary_loss_mlp": 0.00226733, + "balance_loss_clip": 1.01195335, + "balance_loss_mlp": 0.20095977, + "epoch": 0.9281226514354427, + "flos": 24785616539520.0, + "grad_norm": 3.410574679440539, + "language_loss": 0.76510274, + "learning_rate": 5.390693237078925e-08, + "loss": 0.77966088, + "num_input_tokens_seen": 333194775, + "router_z_loss_clip": 2.16699219, + "router_z_loss_mlp": 0.2578125, + "step": 15437, + "time_per_iteration": 2.7620935440063477 + }, + { + "auxiliary_loss_clip": 0.01251424, + "auxiliary_loss_mlp": 0.00232681, + "balance_loss_clip": 1.02520299, + "balance_loss_mlp": 0.20479837, + "epoch": 0.9281827746881106, + "flos": 15083128563840.0, + "grad_norm": 64.48551733551943, + "language_loss": 0.80897009, + "learning_rate": 5.3817155957770254e-08, + "loss": 0.82381111, + "num_input_tokens_seen": 333208920, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.27893066, + "step": 15438, + "time_per_iteration": 2.634073257446289 + }, + { + "auxiliary_loss_clip": 0.01253177, + "auxiliary_loss_mlp": 0.00224176, + "balance_loss_clip": 1.02737308, + "balance_loss_mlp": 0.19738963, + "epoch": 0.9282428979407786, + "flos": 24135059364480.0, + "grad_norm": 9.374278134354405, + "language_loss": 0.71927619, + "learning_rate": 5.3727453344199366e-08, + "loss": 0.73404974, + "num_input_tokens_seen": 333229350, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.26794434, + "step": 15439, + "time_per_iteration": 2.725388288497925 + }, + { + "auxiliary_loss_clip": 0.01230895, + "auxiliary_loss_mlp": 0.00227124, + "balance_loss_clip": 1.01841593, + "balance_loss_mlp": 0.20157781, + "epoch": 0.9283030211934465, + "flos": 24823215100800.0, + "grad_norm": 9.778166603539537, + "language_loss": 0.77077419, + "learning_rate": 5.363782453347876e-08, + "loss": 0.78535438, + "num_input_tokens_seen": 333246125, + "router_z_loss_clip": 2.12402344, + "router_z_loss_mlp": 0.2557373, + "step": 15440, + "time_per_iteration": 2.6651241779327393 + }, + { + "auxiliary_loss_clip": 0.01237681, + "auxiliary_loss_mlp": 0.00246698, + "balance_loss_clip": 1.01726091, + "balance_loss_mlp": 0.22144917, + "epoch": 0.9283631444461146, + "flos": 23981037845760.0, + "grad_norm": 2.112013319051999, + "language_loss": 0.8250463, + "learning_rate": 5.354826952900682e-08, + "loss": 0.83989012, + "num_input_tokens_seen": 333263685, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.25256348, + "step": 15441, + "time_per_iteration": 2.7204153537750244 + }, + { + "auxiliary_loss_clip": 0.01214674, + "auxiliary_loss_mlp": 0.00205929, + "balance_loss_clip": 1.00779951, + "balance_loss_mlp": 0.18298107, + "epoch": 0.9284232676987825, + "flos": 22784530878720.0, + "grad_norm": 14.956388082679284, + "language_loss": 0.70298028, + "learning_rate": 5.345878833417949e-08, + "loss": 0.71718633, + "num_input_tokens_seen": 333282435, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.22961426, + "step": 15442, + "time_per_iteration": 2.7121894359588623 + }, + { + "auxiliary_loss_clip": 0.01251747, + "auxiliary_loss_mlp": 0.0022517, + "balance_loss_clip": 1.02826154, + "balance_loss_mlp": 0.19880095, + "epoch": 0.9284833909514505, + "flos": 19500500171520.0, + "grad_norm": 12.623367352212918, + "language_loss": 0.89953172, + "learning_rate": 5.3369380952390295e-08, + "loss": 0.91430092, + "num_input_tokens_seen": 333300400, + "router_z_loss_clip": 2.23535156, + "router_z_loss_mlp": 0.26379395, + "step": 15443, + "time_per_iteration": 2.6618549823760986 + }, + { + "auxiliary_loss_clip": 0.01252066, + "auxiliary_loss_mlp": 0.00197737, + "balance_loss_clip": 1.03493381, + "balance_loss_mlp": 0.17428844, + "epoch": 0.9285435142041184, + "flos": 23185976256000.0, + "grad_norm": 2.5320436532044543, + "language_loss": 0.71918237, + "learning_rate": 5.328004738702896e-08, + "loss": 0.73368049, + "num_input_tokens_seen": 333318980, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.234375, + "step": 15444, + "time_per_iteration": 2.697817802429199 + }, + { + "auxiliary_loss_clip": 0.01233663, + "auxiliary_loss_mlp": 0.00235936, + "balance_loss_clip": 1.01830661, + "balance_loss_mlp": 0.21099713, + "epoch": 0.9286036374567864, + "flos": 17675519915520.0, + "grad_norm": 72.69278201598813, + "language_loss": 0.79319853, + "learning_rate": 5.3190787641483215e-08, + "loss": 0.80789447, + "num_input_tokens_seen": 333334135, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.24951172, + "step": 15445, + "time_per_iteration": 2.62845778465271 + }, + { + "auxiliary_loss_clip": 0.01244415, + "auxiliary_loss_mlp": 0.00209982, + "balance_loss_clip": 1.02491009, + "balance_loss_mlp": 0.18298069, + "epoch": 0.9286637607094543, + "flos": 20886687884160.0, + "grad_norm": 27.972911554193367, + "language_loss": 0.77522588, + "learning_rate": 5.3101601719138135e-08, + "loss": 0.78976983, + "num_input_tokens_seen": 333353325, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.26989746, + "step": 15446, + "time_per_iteration": 2.672994613647461 + }, + { + "auxiliary_loss_clip": 0.01279289, + "auxiliary_loss_mlp": 0.00262221, + "balance_loss_clip": 1.04764438, + "balance_loss_mlp": 0.23390892, + "epoch": 0.9287238839621224, + "flos": 19026012487680.0, + "grad_norm": 60.488169106855096, + "language_loss": 0.78522128, + "learning_rate": 5.301248962337523e-08, + "loss": 0.80063635, + "num_input_tokens_seen": 333371110, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.28308105, + "step": 15447, + "time_per_iteration": 2.6597719192504883 + }, + { + "auxiliary_loss_clip": 0.0120294, + "auxiliary_loss_mlp": 0.00220234, + "balance_loss_clip": 0.99938792, + "balance_loss_mlp": 0.19638059, + "epoch": 0.9287840072147904, + "flos": 20557027837440.0, + "grad_norm": 13.694989020778324, + "language_loss": 0.79132259, + "learning_rate": 5.292345135757403e-08, + "loss": 0.80555433, + "num_input_tokens_seen": 333391420, + "router_z_loss_clip": 2.03515625, + "router_z_loss_mlp": 0.23864746, + "step": 15448, + "time_per_iteration": 2.7015221118927 + }, + { + "auxiliary_loss_clip": 0.01230049, + "auxiliary_loss_mlp": 0.00221086, + "balance_loss_clip": 1.01696599, + "balance_loss_mlp": 0.19565871, + "epoch": 0.9288441304674583, + "flos": 21250822008960.0, + "grad_norm": 29.05478418133339, + "language_loss": 0.85479891, + "learning_rate": 5.283448692511072e-08, + "loss": 0.86931026, + "num_input_tokens_seen": 333410365, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.25427246, + "step": 15449, + "time_per_iteration": 2.6801469326019287 + }, + { + "auxiliary_loss_clip": 0.01243124, + "auxiliary_loss_mlp": 0.00213154, + "balance_loss_clip": 1.02463973, + "balance_loss_mlp": 0.18748796, + "epoch": 0.9289042537201263, + "flos": 27669853895040.0, + "grad_norm": 25.401955523269958, + "language_loss": 0.77358103, + "learning_rate": 5.27455963293586e-08, + "loss": 0.78814381, + "num_input_tokens_seen": 333430000, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.25695801, + "step": 15450, + "time_per_iteration": 2.8047618865966797 + }, + { + "auxiliary_loss_clip": 0.01242109, + "auxiliary_loss_mlp": 0.002317, + "balance_loss_clip": 1.02153742, + "balance_loss_mlp": 0.20738147, + "epoch": 0.9289643769727942, + "flos": 19317750750720.0, + "grad_norm": 4.416802477589287, + "language_loss": 0.80745065, + "learning_rate": 5.265677957368875e-08, + "loss": 0.82218874, + "num_input_tokens_seen": 333445800, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.24328613, + "step": 15451, + "time_per_iteration": 2.623225212097168 + }, + { + "auxiliary_loss_clip": 0.01248777, + "auxiliary_loss_mlp": 0.00231782, + "balance_loss_clip": 1.02945876, + "balance_loss_mlp": 0.20695093, + "epoch": 0.9290245002254622, + "flos": 14058058233600.0, + "grad_norm": 31.36450287562341, + "language_loss": 0.83301497, + "learning_rate": 5.25680366614687e-08, + "loss": 0.84782052, + "num_input_tokens_seen": 333461550, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.24829102, + "step": 15452, + "time_per_iteration": 2.6763408184051514 + }, + { + "auxiliary_loss_clip": 0.0124301, + "auxiliary_loss_mlp": 0.00210139, + "balance_loss_clip": 1.02449918, + "balance_loss_mlp": 0.18460417, + "epoch": 0.9290846234781301, + "flos": 20047132321920.0, + "grad_norm": 65.14658438178863, + "language_loss": 0.80083323, + "learning_rate": 5.2479367596064196e-08, + "loss": 0.81536472, + "num_input_tokens_seen": 333478835, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.25524902, + "step": 15453, + "time_per_iteration": 2.660738468170166 + }, + { + "auxiliary_loss_clip": 0.01088241, + "auxiliary_loss_mlp": 0.00065802, + "balance_loss_clip": 0.95374775, + "balance_loss_mlp": 0.05850658, + "epoch": 0.9291447467307982, + "flos": 61227514460160.0, + "grad_norm": 1.230976824539382, + "language_loss": 0.59923208, + "learning_rate": 5.2390772380837226e-08, + "loss": 0.61077261, + "num_input_tokens_seen": 333535250, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.07275391, + "step": 15454, + "time_per_iteration": 3.0820119380950928 + }, + { + "auxiliary_loss_clip": 0.01246608, + "auxiliary_loss_mlp": 0.0022695, + "balance_loss_clip": 1.02837026, + "balance_loss_mlp": 0.20024714, + "epoch": 0.9292048699834661, + "flos": 20553328736640.0, + "grad_norm": 66.97817840952578, + "language_loss": 0.77798808, + "learning_rate": 5.230225101914709e-08, + "loss": 0.79272366, + "num_input_tokens_seen": 333553805, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.26708984, + "step": 15455, + "time_per_iteration": 2.660629987716675 + }, + { + "auxiliary_loss_clip": 0.01244018, + "auxiliary_loss_mlp": 0.00246188, + "balance_loss_clip": 1.02732718, + "balance_loss_mlp": 0.22101054, + "epoch": 0.9292649932361341, + "flos": 23623655477760.0, + "grad_norm": 87.6129157084128, + "language_loss": 0.73047358, + "learning_rate": 5.22138035143509e-08, + "loss": 0.74537569, + "num_input_tokens_seen": 333572800, + "router_z_loss_clip": 2.16699219, + "router_z_loss_mlp": 0.25158691, + "step": 15456, + "time_per_iteration": 2.661466598510742 + }, + { + "auxiliary_loss_clip": 0.01249393, + "auxiliary_loss_mlp": 0.00226714, + "balance_loss_clip": 1.03147602, + "balance_loss_mlp": 0.20058292, + "epoch": 0.929325116488802, + "flos": 15009942602880.0, + "grad_norm": 4.9440371565594665, + "language_loss": 0.78255892, + "learning_rate": 5.2125429869802615e-08, + "loss": 0.79732001, + "num_input_tokens_seen": 333588520, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.26123047, + "step": 15457, + "time_per_iteration": 2.6429173946380615 + }, + { + "auxiliary_loss_clip": 0.01248005, + "auxiliary_loss_mlp": 0.00223556, + "balance_loss_clip": 1.02876019, + "balance_loss_mlp": 0.19743752, + "epoch": 0.92938523974147, + "flos": 17967365919360.0, + "grad_norm": 12.03429764172215, + "language_loss": 0.88776553, + "learning_rate": 5.203713008885291e-08, + "loss": 0.9024812, + "num_input_tokens_seen": 333603435, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.26123047, + "step": 15458, + "time_per_iteration": 2.598708391189575 + }, + { + "auxiliary_loss_clip": 0.01238741, + "auxiliary_loss_mlp": 0.00221963, + "balance_loss_clip": 1.02409399, + "balance_loss_mlp": 0.19827592, + "epoch": 0.9294453629941379, + "flos": 23003047267200.0, + "grad_norm": 15.499761655774588, + "language_loss": 0.82309443, + "learning_rate": 5.194890417485065e-08, + "loss": 0.83770144, + "num_input_tokens_seen": 333623305, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.2364502, + "step": 15459, + "time_per_iteration": 2.6708805561065674 + }, + { + "auxiliary_loss_clip": 0.01248146, + "auxiliary_loss_mlp": 0.00215847, + "balance_loss_clip": 1.02908945, + "balance_loss_mlp": 0.19113502, + "epoch": 0.929505486246806, + "flos": 17055234927360.0, + "grad_norm": 6.340206885576306, + "language_loss": 0.71513146, + "learning_rate": 5.1860752131141384e-08, + "loss": 0.72977138, + "num_input_tokens_seen": 333641205, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.24682617, + "step": 15460, + "time_per_iteration": 2.6326968669891357 + }, + { + "auxiliary_loss_clip": 0.01264171, + "auxiliary_loss_mlp": 0.00218966, + "balance_loss_clip": 1.03812444, + "balance_loss_mlp": 0.19309726, + "epoch": 0.9295656094994739, + "flos": 27340409329920.0, + "grad_norm": 13.685073338605157, + "language_loss": 0.8783499, + "learning_rate": 5.177267396106733e-08, + "loss": 0.89318132, + "num_input_tokens_seen": 333659615, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.25854492, + "step": 15461, + "time_per_iteration": 2.7172281742095947 + }, + { + "auxiliary_loss_clip": 0.01225661, + "auxiliary_loss_mlp": 0.00207019, + "balance_loss_clip": 1.0134747, + "balance_loss_mlp": 0.18367782, + "epoch": 0.9296257327521419, + "flos": 21470954509440.0, + "grad_norm": 10.701968105336944, + "language_loss": 0.86268675, + "learning_rate": 5.168466966796869e-08, + "loss": 0.87701356, + "num_input_tokens_seen": 333678985, + "router_z_loss_clip": 2.12011719, + "router_z_loss_mlp": 0.23364258, + "step": 15462, + "time_per_iteration": 4.160700559616089 + }, + { + "auxiliary_loss_clip": 0.01234077, + "auxiliary_loss_mlp": 0.00237493, + "balance_loss_clip": 1.02017164, + "balance_loss_mlp": 0.21355619, + "epoch": 0.9296858560048099, + "flos": 16362661818240.0, + "grad_norm": 15.135776101207872, + "language_loss": 0.73867691, + "learning_rate": 5.159673925518282e-08, + "loss": 0.75339264, + "num_input_tokens_seen": 333696410, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.23937988, + "step": 15463, + "time_per_iteration": 4.0837531089782715 + }, + { + "auxiliary_loss_clip": 0.01218658, + "auxiliary_loss_mlp": 0.00229536, + "balance_loss_clip": 1.01146626, + "balance_loss_mlp": 0.20688581, + "epoch": 0.9297459792574778, + "flos": 29858609139840.0, + "grad_norm": 21.596395820870754, + "language_loss": 0.76781756, + "learning_rate": 5.15088827260437e-08, + "loss": 0.78229952, + "num_input_tokens_seen": 333716615, + "router_z_loss_clip": 2.07128906, + "router_z_loss_mlp": 0.22644043, + "step": 15464, + "time_per_iteration": 2.7974557876586914 + }, + { + "auxiliary_loss_clip": 0.01251649, + "auxiliary_loss_mlp": 0.00243067, + "balance_loss_clip": 1.03019643, + "balance_loss_mlp": 0.21805689, + "epoch": 0.9298061025101458, + "flos": 15924838942080.0, + "grad_norm": 12.807057274972777, + "language_loss": 0.8574459, + "learning_rate": 5.1421100083883115e-08, + "loss": 0.87239301, + "num_input_tokens_seen": 333732800, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.25024414, + "step": 15465, + "time_per_iteration": 2.6604020595550537 + }, + { + "auxiliary_loss_clip": 0.01096896, + "auxiliary_loss_mlp": 0.00080994, + "balance_loss_clip": 0.96154726, + "balance_loss_mlp": 0.07403195, + "epoch": 0.9298662257628137, + "flos": 64096994304000.0, + "grad_norm": 0.6907225337915476, + "language_loss": 0.55872661, + "learning_rate": 5.133339133202952e-08, + "loss": 0.57050556, + "num_input_tokens_seen": 333799300, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.06982422, + "step": 15466, + "time_per_iteration": 3.3077917098999023 + }, + { + "auxiliary_loss_clip": 0.01243545, + "auxiliary_loss_mlp": 0.00221354, + "balance_loss_clip": 1.03032172, + "balance_loss_mlp": 0.19649887, + "epoch": 0.9299263490154818, + "flos": 24280210224000.0, + "grad_norm": 15.377848730370221, + "language_loss": 0.80041087, + "learning_rate": 5.1245756473809355e-08, + "loss": 0.8150599, + "num_input_tokens_seen": 333820360, + "router_z_loss_clip": 2.13183594, + "router_z_loss_mlp": 0.24853516, + "step": 15467, + "time_per_iteration": 2.7261979579925537 + }, + { + "auxiliary_loss_clip": 0.01236818, + "auxiliary_loss_mlp": 0.00224169, + "balance_loss_clip": 1.02116442, + "balance_loss_mlp": 0.19783539, + "epoch": 0.9299864722681497, + "flos": 23294354567040.0, + "grad_norm": 11.802700606354401, + "language_loss": 0.7858752, + "learning_rate": 5.1158195512545076e-08, + "loss": 0.80048507, + "num_input_tokens_seen": 333840415, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.26330566, + "step": 15468, + "time_per_iteration": 2.711766242980957 + }, + { + "auxiliary_loss_clip": 0.01244421, + "auxiliary_loss_mlp": 0.00227721, + "balance_loss_clip": 1.0255456, + "balance_loss_mlp": 0.20077963, + "epoch": 0.9300465955208177, + "flos": 21395972868480.0, + "grad_norm": 16.389506539490718, + "language_loss": 0.84429395, + "learning_rate": 5.107070845155737e-08, + "loss": 0.85901535, + "num_input_tokens_seen": 333859910, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.26977539, + "step": 15469, + "time_per_iteration": 2.6973044872283936 + }, + { + "auxiliary_loss_clip": 0.01242412, + "auxiliary_loss_mlp": 0.00223063, + "balance_loss_clip": 1.02134085, + "balance_loss_mlp": 0.19775447, + "epoch": 0.9301067187734856, + "flos": 24571445696640.0, + "grad_norm": 36.74656877367761, + "language_loss": 0.83713901, + "learning_rate": 5.098329529416379e-08, + "loss": 0.85179371, + "num_input_tokens_seen": 333880495, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.25292969, + "step": 15470, + "time_per_iteration": 4.127457618713379 + }, + { + "auxiliary_loss_clip": 0.01228206, + "auxiliary_loss_mlp": 0.0022244, + "balance_loss_clip": 1.01867056, + "balance_loss_mlp": 0.19887258, + "epoch": 0.9301668420261536, + "flos": 22196960202240.0, + "grad_norm": 33.89218153413759, + "language_loss": 0.82427436, + "learning_rate": 5.089595604367902e-08, + "loss": 0.83878082, + "num_input_tokens_seen": 333897640, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.23583984, + "step": 15471, + "time_per_iteration": 2.7078909873962402 + }, + { + "auxiliary_loss_clip": 0.01231167, + "auxiliary_loss_mlp": 0.00230245, + "balance_loss_clip": 1.0143801, + "balance_loss_mlp": 0.20603326, + "epoch": 0.9302269652788215, + "flos": 17747628468480.0, + "grad_norm": 17.24508435760109, + "language_loss": 0.79655349, + "learning_rate": 5.080869070341487e-08, + "loss": 0.8111676, + "num_input_tokens_seen": 333913670, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.24206543, + "step": 15472, + "time_per_iteration": 2.670297145843506 + }, + { + "auxiliary_loss_clip": 0.01229453, + "auxiliary_loss_mlp": 0.00234498, + "balance_loss_clip": 1.01732922, + "balance_loss_mlp": 0.21145469, + "epoch": 0.9302870885314896, + "flos": 19390793057280.0, + "grad_norm": 10.897087659480508, + "language_loss": 0.94844586, + "learning_rate": 5.0721499276680233e-08, + "loss": 0.96308535, + "num_input_tokens_seen": 333934105, + "router_z_loss_clip": 2.12011719, + "router_z_loss_mlp": 0.23059082, + "step": 15473, + "time_per_iteration": 2.6460723876953125 + }, + { + "auxiliary_loss_clip": 0.01267102, + "auxiliary_loss_mlp": 0.00223341, + "balance_loss_clip": 1.03866553, + "balance_loss_mlp": 0.19650725, + "epoch": 0.9303472117841575, + "flos": 21760286561280.0, + "grad_norm": 5.783071209879749, + "language_loss": 0.73836851, + "learning_rate": 5.063438176678203e-08, + "loss": 0.75327301, + "num_input_tokens_seen": 333953635, + "router_z_loss_clip": 2.28320312, + "router_z_loss_mlp": 0.26831055, + "step": 15474, + "time_per_iteration": 4.187880516052246 + }, + { + "auxiliary_loss_clip": 0.0123614, + "auxiliary_loss_mlp": 0.00220144, + "balance_loss_clip": 1.02219272, + "balance_loss_mlp": 0.19458561, + "epoch": 0.9304073350368255, + "flos": 19609740408960.0, + "grad_norm": 14.115748776263876, + "language_loss": 0.8240056, + "learning_rate": 5.054733817702339e-08, + "loss": 0.83856845, + "num_input_tokens_seen": 333971825, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.25561523, + "step": 15475, + "time_per_iteration": 2.7446584701538086 + }, + { + "auxiliary_loss_clip": 0.01235134, + "auxiliary_loss_mlp": 0.00224152, + "balance_loss_clip": 1.01704502, + "balance_loss_mlp": 0.19958329, + "epoch": 0.9304674582894935, + "flos": 30441582875520.0, + "grad_norm": 87.18662172199377, + "language_loss": 0.73458278, + "learning_rate": 5.0460368510704786e-08, + "loss": 0.74917567, + "num_input_tokens_seen": 333990120, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.24572754, + "step": 15476, + "time_per_iteration": 2.732071876525879 + }, + { + "auxiliary_loss_clip": 0.0125577, + "auxiliary_loss_mlp": 0.00236201, + "balance_loss_clip": 1.03362644, + "balance_loss_mlp": 0.20973611, + "epoch": 0.9305275815421614, + "flos": 17785693906560.0, + "grad_norm": 812.3675798868563, + "language_loss": 0.79174459, + "learning_rate": 5.0373472771124914e-08, + "loss": 0.80666435, + "num_input_tokens_seen": 334007970, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.26477051, + "step": 15477, + "time_per_iteration": 2.660285234451294 + }, + { + "auxiliary_loss_clip": 0.01224356, + "auxiliary_loss_mlp": 0.00211395, + "balance_loss_clip": 1.01292205, + "balance_loss_mlp": 0.18676677, + "epoch": 0.9305877047948294, + "flos": 25298456970240.0, + "grad_norm": 13.979079667178993, + "language_loss": 0.65259892, + "learning_rate": 5.0286650961578027e-08, + "loss": 0.66695642, + "num_input_tokens_seen": 334027120, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.24621582, + "step": 15478, + "time_per_iteration": 2.701500177383423 + }, + { + "auxiliary_loss_clip": 0.01257167, + "auxiliary_loss_mlp": 0.0022098, + "balance_loss_clip": 1.03082728, + "balance_loss_mlp": 0.1951827, + "epoch": 0.9306478280474973, + "flos": 16977236544000.0, + "grad_norm": 14910.929320770738, + "language_loss": 0.88611376, + "learning_rate": 5.01999030853566e-08, + "loss": 0.90089524, + "num_input_tokens_seen": 334042785, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.25842285, + "step": 15479, + "time_per_iteration": 2.7905688285827637 + }, + { + "auxiliary_loss_clip": 0.01231, + "auxiliary_loss_mlp": 0.00213483, + "balance_loss_clip": 1.01902533, + "balance_loss_mlp": 0.18925937, + "epoch": 0.9307079513001654, + "flos": 35663353608960.0, + "grad_norm": 13.445204572926379, + "language_loss": 0.75940138, + "learning_rate": 5.0113229145750445e-08, + "loss": 0.77384621, + "num_input_tokens_seen": 334063480, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.24230957, + "step": 15480, + "time_per_iteration": 2.794933795928955 + }, + { + "auxiliary_loss_clip": 0.01254186, + "auxiliary_loss_mlp": 0.00234092, + "balance_loss_clip": 1.03166437, + "balance_loss_mlp": 0.2080685, + "epoch": 0.9307680745528333, + "flos": 19208151377280.0, + "grad_norm": 2.950933596683998, + "language_loss": 0.76738226, + "learning_rate": 5.002662914604583e-08, + "loss": 0.78226507, + "num_input_tokens_seen": 334082005, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.26013184, + "step": 15481, + "time_per_iteration": 2.689924955368042 + }, + { + "auxiliary_loss_clip": 0.01212846, + "auxiliary_loss_mlp": 0.00198854, + "balance_loss_clip": 1.00586867, + "balance_loss_mlp": 0.17579883, + "epoch": 0.9308281978055013, + "flos": 19062641381760.0, + "grad_norm": 25.814423419619395, + "language_loss": 0.82025772, + "learning_rate": 4.994010308952701e-08, + "loss": 0.83437467, + "num_input_tokens_seen": 334101375, + "router_z_loss_clip": 2.0703125, + "router_z_loss_mlp": 0.23059082, + "step": 15482, + "time_per_iteration": 2.673771619796753 + }, + { + "auxiliary_loss_clip": 0.01227749, + "auxiliary_loss_mlp": 0.00231244, + "balance_loss_clip": 1.01626086, + "balance_loss_mlp": 0.20742594, + "epoch": 0.9308883210581692, + "flos": 20521548178560.0, + "grad_norm": 19.709567275406044, + "language_loss": 0.87705886, + "learning_rate": 4.985365097947469e-08, + "loss": 0.89164883, + "num_input_tokens_seen": 334119460, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.23815918, + "step": 15483, + "time_per_iteration": 2.6858410835266113 + }, + { + "auxiliary_loss_clip": 0.01239762, + "auxiliary_loss_mlp": 0.00217081, + "balance_loss_clip": 1.01961625, + "balance_loss_mlp": 0.19242826, + "epoch": 0.9309484443108372, + "flos": 13001422826880.0, + "grad_norm": 21.0727337646115, + "language_loss": 0.83137584, + "learning_rate": 4.976727281916782e-08, + "loss": 0.84594429, + "num_input_tokens_seen": 334136065, + "router_z_loss_clip": 2.20214844, + "router_z_loss_mlp": 0.24658203, + "step": 15484, + "time_per_iteration": 2.6589295864105225 + }, + { + "auxiliary_loss_clip": 0.01251489, + "auxiliary_loss_mlp": 0.00242227, + "balance_loss_clip": 1.03537869, + "balance_loss_mlp": 0.21905278, + "epoch": 0.9310085675635051, + "flos": 12567765928320.0, + "grad_norm": 19.440832302242182, + "language_loss": 0.85495472, + "learning_rate": 4.968096861188087e-08, + "loss": 0.86989188, + "num_input_tokens_seen": 334153690, + "router_z_loss_clip": 2.15917969, + "router_z_loss_mlp": 0.23193359, + "step": 15485, + "time_per_iteration": 2.677751302719116 + }, + { + "auxiliary_loss_clip": 0.01249054, + "auxiliary_loss_mlp": 0.00253241, + "balance_loss_clip": 1.02576423, + "balance_loss_mlp": 0.22483294, + "epoch": 0.9310686908161732, + "flos": 23477570864640.0, + "grad_norm": 13.281361744337142, + "language_loss": 0.87557727, + "learning_rate": 4.959473836088723e-08, + "loss": 0.8906002, + "num_input_tokens_seen": 334171880, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.28442383, + "step": 15486, + "time_per_iteration": 2.8261735439300537 + }, + { + "auxiliary_loss_clip": 0.01271222, + "auxiliary_loss_mlp": 0.00229329, + "balance_loss_clip": 1.04101694, + "balance_loss_mlp": 0.20230439, + "epoch": 0.9311288140688411, + "flos": 24170287628160.0, + "grad_norm": 752.9272591946858, + "language_loss": 0.85245502, + "learning_rate": 4.950858206945674e-08, + "loss": 0.86746049, + "num_input_tokens_seen": 334190005, + "router_z_loss_clip": 2.30078125, + "router_z_loss_mlp": 0.27038574, + "step": 15487, + "time_per_iteration": 2.702105760574341 + }, + { + "auxiliary_loss_clip": 0.0122979, + "auxiliary_loss_mlp": 0.00219266, + "balance_loss_clip": 1.01353359, + "balance_loss_mlp": 0.19422054, + "epoch": 0.9311889373215091, + "flos": 35590203561600.0, + "grad_norm": 18.321833798089116, + "language_loss": 0.78016311, + "learning_rate": 4.942249974085633e-08, + "loss": 0.79465365, + "num_input_tokens_seen": 334209545, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.25085449, + "step": 15488, + "time_per_iteration": 2.8490989208221436 + }, + { + "auxiliary_loss_clip": 0.01233265, + "auxiliary_loss_mlp": 0.00228283, + "balance_loss_clip": 1.01833093, + "balance_loss_mlp": 0.2032015, + "epoch": 0.9312490605741771, + "flos": 20230528187520.0, + "grad_norm": 7.744922456808966, + "language_loss": 0.82670254, + "learning_rate": 4.933649137834983e-08, + "loss": 0.84131801, + "num_input_tokens_seen": 334228900, + "router_z_loss_clip": 2.14746094, + "router_z_loss_mlp": 0.25085449, + "step": 15489, + "time_per_iteration": 2.6741156578063965 + }, + { + "auxiliary_loss_clip": 0.01247513, + "auxiliary_loss_mlp": 0.00215597, + "balance_loss_clip": 1.02884591, + "balance_loss_mlp": 0.19064625, + "epoch": 0.931309183826845, + "flos": 13950577762560.0, + "grad_norm": 18.649659816719712, + "language_loss": 0.90163684, + "learning_rate": 4.925055698519931e-08, + "loss": 0.91626799, + "num_input_tokens_seen": 334245500, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.24938965, + "step": 15490, + "time_per_iteration": 2.6627843379974365 + }, + { + "auxiliary_loss_clip": 0.01255839, + "auxiliary_loss_mlp": 0.00243519, + "balance_loss_clip": 1.03119755, + "balance_loss_mlp": 0.21680379, + "epoch": 0.931369307079513, + "flos": 20156731695360.0, + "grad_norm": 39.22206894379515, + "language_loss": 0.79995775, + "learning_rate": 4.9164696564663264e-08, + "loss": 0.81495136, + "num_input_tokens_seen": 334264370, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.26745605, + "step": 15491, + "time_per_iteration": 2.632183313369751 + }, + { + "auxiliary_loss_clip": 0.01216413, + "auxiliary_loss_mlp": 0.0021113, + "balance_loss_clip": 1.01107883, + "balance_loss_mlp": 0.18809851, + "epoch": 0.931429430332181, + "flos": 25338569483520.0, + "grad_norm": 91.45650051874382, + "language_loss": 0.80665493, + "learning_rate": 4.9078910119997096e-08, + "loss": 0.82093036, + "num_input_tokens_seen": 334283905, + "router_z_loss_clip": 2.05371094, + "router_z_loss_mlp": 0.23034668, + "step": 15492, + "time_per_iteration": 2.687941312789917 + }, + { + "auxiliary_loss_clip": 0.01091922, + "auxiliary_loss_mlp": 0.00068038, + "balance_loss_clip": 0.95552862, + "balance_loss_mlp": 0.06026599, + "epoch": 0.931489553584849, + "flos": 71226193985280.0, + "grad_norm": 0.6925480551560913, + "language_loss": 0.52825314, + "learning_rate": 4.899319765445442e-08, + "loss": 0.53985274, + "num_input_tokens_seen": 334339925, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.07763672, + "step": 15493, + "time_per_iteration": 3.053372621536255 + }, + { + "auxiliary_loss_clip": 0.01231403, + "auxiliary_loss_mlp": 0.00213889, + "balance_loss_clip": 1.02271199, + "balance_loss_mlp": 0.19159693, + "epoch": 0.9315496768375169, + "flos": 14643653662080.0, + "grad_norm": 3.4132665830314646, + "language_loss": 0.79786146, + "learning_rate": 4.890755917128531e-08, + "loss": 0.81231439, + "num_input_tokens_seen": 334357225, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.22302246, + "step": 15494, + "time_per_iteration": 2.6551594734191895 + }, + { + "auxiliary_loss_clip": 0.01244472, + "auxiliary_loss_mlp": 0.00236924, + "balance_loss_clip": 1.02683342, + "balance_loss_mlp": 0.21171159, + "epoch": 0.9316098000901849, + "flos": 28329928174080.0, + "grad_norm": 2.2123506609612127, + "language_loss": 0.75733805, + "learning_rate": 4.882199467373671e-08, + "loss": 0.77215207, + "num_input_tokens_seen": 334375945, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.2520752, + "step": 15495, + "time_per_iteration": 2.7256836891174316 + }, + { + "auxiliary_loss_clip": 0.0122573, + "auxiliary_loss_mlp": 0.00206538, + "balance_loss_clip": 1.01742911, + "balance_loss_mlp": 0.18387654, + "epoch": 0.9316699233428528, + "flos": 28512677594880.0, + "grad_norm": 3.6845568476541923, + "language_loss": 0.69156337, + "learning_rate": 4.8736504165053815e-08, + "loss": 0.70588607, + "num_input_tokens_seen": 334395310, + "router_z_loss_clip": 2.0859375, + "router_z_loss_mlp": 0.22668457, + "step": 15496, + "time_per_iteration": 2.8016929626464844 + }, + { + "auxiliary_loss_clip": 0.01242709, + "auxiliary_loss_mlp": 0.0021414, + "balance_loss_clip": 1.02077127, + "balance_loss_mlp": 0.18887925, + "epoch": 0.9317300465955208, + "flos": 33693402061440.0, + "grad_norm": 9.005002246276026, + "language_loss": 0.8419674, + "learning_rate": 4.865108764847825e-08, + "loss": 0.85653591, + "num_input_tokens_seen": 334416965, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.25256348, + "step": 15497, + "time_per_iteration": 2.7524819374084473 + }, + { + "auxiliary_loss_clip": 0.01250715, + "auxiliary_loss_mlp": 0.00236085, + "balance_loss_clip": 1.03259695, + "balance_loss_mlp": 0.21106258, + "epoch": 0.9317901698481887, + "flos": 23658237296640.0, + "grad_norm": 15.880221765807132, + "language_loss": 0.73605359, + "learning_rate": 4.856574512724898e-08, + "loss": 0.75092149, + "num_input_tokens_seen": 334435620, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.25012207, + "step": 15498, + "time_per_iteration": 2.646432876586914 + }, + { + "auxiliary_loss_clip": 0.01229678, + "auxiliary_loss_mlp": 0.00224613, + "balance_loss_clip": 1.01724124, + "balance_loss_mlp": 0.19966222, + "epoch": 0.9318502931008568, + "flos": 20960017499520.0, + "grad_norm": 32.044361141111786, + "language_loss": 0.8720597, + "learning_rate": 4.8480476604602305e-08, + "loss": 0.88660264, + "num_input_tokens_seen": 334456210, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.24951172, + "step": 15499, + "time_per_iteration": 2.722696542739868 + }, + { + "auxiliary_loss_clip": 0.01221112, + "auxiliary_loss_mlp": 0.00237041, + "balance_loss_clip": 1.01025677, + "balance_loss_mlp": 0.21273404, + "epoch": 0.9319104163535247, + "flos": 23441049711360.0, + "grad_norm": 9.168177780059693, + "language_loss": 0.83027387, + "learning_rate": 4.8395282083771196e-08, + "loss": 0.84485543, + "num_input_tokens_seen": 334475485, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.24304199, + "step": 15500, + "time_per_iteration": 2.718877077102661 + }, + { + "auxiliary_loss_clip": 0.01220328, + "auxiliary_loss_mlp": 0.00226831, + "balance_loss_clip": 1.01140952, + "balance_loss_mlp": 0.20279843, + "epoch": 0.9319705396061927, + "flos": 22347426274560.0, + "grad_norm": 16.492030218114675, + "language_loss": 0.80968034, + "learning_rate": 4.8310161567987064e-08, + "loss": 0.82415193, + "num_input_tokens_seen": 334494740, + "router_z_loss_clip": 2.08496094, + "router_z_loss_mlp": 0.24047852, + "step": 15501, + "time_per_iteration": 2.7366244792938232 + }, + { + "auxiliary_loss_clip": 0.0124354, + "auxiliary_loss_mlp": 0.00220032, + "balance_loss_clip": 1.02674437, + "balance_loss_mlp": 0.19539149, + "epoch": 0.9320306628588607, + "flos": 20993557824000.0, + "grad_norm": 51.83051139109661, + "language_loss": 0.75780427, + "learning_rate": 4.822511506047666e-08, + "loss": 0.77244002, + "num_input_tokens_seen": 334511910, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.24621582, + "step": 15502, + "time_per_iteration": 2.6186678409576416 + }, + { + "auxiliary_loss_clip": 0.012387, + "auxiliary_loss_mlp": 0.00230005, + "balance_loss_clip": 1.02341819, + "balance_loss_mlp": 0.20678331, + "epoch": 0.9320907861115286, + "flos": 24538300421760.0, + "grad_norm": 51.65767327422452, + "language_loss": 0.72489476, + "learning_rate": 4.814014256446586e-08, + "loss": 0.73958182, + "num_input_tokens_seen": 334533150, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.23254395, + "step": 15503, + "time_per_iteration": 2.704935073852539 + }, + { + "auxiliary_loss_clip": 0.01270001, + "auxiliary_loss_mlp": 0.00221947, + "balance_loss_clip": 1.04071879, + "balance_loss_mlp": 0.19584017, + "epoch": 0.9321509093641966, + "flos": 19785414850560.0, + "grad_norm": 9.618496916442938, + "language_loss": 0.84545398, + "learning_rate": 4.805524408317652e-08, + "loss": 0.8603735, + "num_input_tokens_seen": 334550940, + "router_z_loss_clip": 2.29101562, + "router_z_loss_mlp": 0.26123047, + "step": 15504, + "time_per_iteration": 4.061170816421509 + }, + { + "auxiliary_loss_clip": 0.01256987, + "auxiliary_loss_mlp": 0.00243746, + "balance_loss_clip": 1.03354073, + "balance_loss_mlp": 0.21788917, + "epoch": 0.9322110326168646, + "flos": 24972675592320.0, + "grad_norm": 5.905805331187033, + "language_loss": 0.77954328, + "learning_rate": 4.797041961982762e-08, + "loss": 0.79455066, + "num_input_tokens_seen": 334570935, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.25878906, + "step": 15505, + "time_per_iteration": 4.122279167175293 + }, + { + "auxiliary_loss_clip": 0.01233726, + "auxiliary_loss_mlp": 0.00202062, + "balance_loss_clip": 1.0185709, + "balance_loss_mlp": 0.17701565, + "epoch": 0.9322711558695326, + "flos": 16143642639360.0, + "grad_norm": 5.6433078891458655, + "language_loss": 0.8498345, + "learning_rate": 4.788566917763614e-08, + "loss": 0.86419237, + "num_input_tokens_seen": 334589315, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.25073242, + "step": 15506, + "time_per_iteration": 2.6875457763671875 + }, + { + "auxiliary_loss_clip": 0.01225181, + "auxiliary_loss_mlp": 0.00212231, + "balance_loss_clip": 1.01530147, + "balance_loss_mlp": 0.18775725, + "epoch": 0.9323312791222005, + "flos": 23732428838400.0, + "grad_norm": 10.882925030968346, + "language_loss": 0.91896451, + "learning_rate": 4.780099275981597e-08, + "loss": 0.93333858, + "num_input_tokens_seen": 334608990, + "router_z_loss_clip": 2.09472656, + "router_z_loss_mlp": 0.24450684, + "step": 15507, + "time_per_iteration": 2.7391045093536377 + }, + { + "auxiliary_loss_clip": 0.01235301, + "auxiliary_loss_mlp": 0.00222746, + "balance_loss_clip": 1.01846838, + "balance_loss_mlp": 0.19795053, + "epoch": 0.9323914023748685, + "flos": 20777914523520.0, + "grad_norm": 18.52536809762023, + "language_loss": 0.75559568, + "learning_rate": 4.771639036957742e-08, + "loss": 0.77017617, + "num_input_tokens_seen": 334628655, + "router_z_loss_clip": 2.17089844, + "router_z_loss_mlp": 0.24768066, + "step": 15508, + "time_per_iteration": 2.654867649078369 + }, + { + "auxiliary_loss_clip": 0.01230641, + "auxiliary_loss_mlp": 0.00224501, + "balance_loss_clip": 1.01589751, + "balance_loss_mlp": 0.19927633, + "epoch": 0.9324515256275364, + "flos": 23915178259200.0, + "grad_norm": 2.9002203082277833, + "language_loss": 0.79565138, + "learning_rate": 4.7631862010129033e-08, + "loss": 0.81020278, + "num_input_tokens_seen": 334648295, + "router_z_loss_clip": 2.14550781, + "router_z_loss_mlp": 0.25219727, + "step": 15509, + "time_per_iteration": 2.713151454925537 + }, + { + "auxiliary_loss_clip": 0.01235252, + "auxiliary_loss_mlp": 0.00207212, + "balance_loss_clip": 1.02074277, + "balance_loss_mlp": 0.18373968, + "epoch": 0.9325116488802044, + "flos": 18005215875840.0, + "grad_norm": 12.066863616672919, + "language_loss": 0.8277154, + "learning_rate": 4.754740768467624e-08, + "loss": 0.84214008, + "num_input_tokens_seen": 334666280, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.23498535, + "step": 15510, + "time_per_iteration": 2.6322970390319824 + }, + { + "auxiliary_loss_clip": 0.0125055, + "auxiliary_loss_mlp": 0.00227269, + "balance_loss_clip": 1.02875185, + "balance_loss_mlp": 0.20218728, + "epoch": 0.9325717721328723, + "flos": 29021603443200.0, + "grad_norm": 17.47599519994485, + "language_loss": 0.7764684, + "learning_rate": 4.746302739642161e-08, + "loss": 0.79124653, + "num_input_tokens_seen": 334688830, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.25073242, + "step": 15511, + "time_per_iteration": 2.7939913272857666 + }, + { + "auxiliary_loss_clip": 0.01230038, + "auxiliary_loss_mlp": 0.00215744, + "balance_loss_clip": 1.01755381, + "balance_loss_mlp": 0.19349989, + "epoch": 0.9326318953855404, + "flos": 21646341642240.0, + "grad_norm": 8.00560178063444, + "language_loss": 0.84622926, + "learning_rate": 4.737872114856412e-08, + "loss": 0.86068714, + "num_input_tokens_seen": 334705205, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.22241211, + "step": 15512, + "time_per_iteration": 4.22195839881897 + }, + { + "auxiliary_loss_clip": 0.01248631, + "auxiliary_loss_mlp": 0.00230279, + "balance_loss_clip": 1.02797878, + "balance_loss_mlp": 0.20592441, + "epoch": 0.9326920186382083, + "flos": 26065724411520.0, + "grad_norm": 19.61232645726312, + "language_loss": 0.8652485, + "learning_rate": 4.7294488944301436e-08, + "loss": 0.88003761, + "num_input_tokens_seen": 334723830, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.24365234, + "step": 15513, + "time_per_iteration": 2.6946003437042236 + }, + { + "auxiliary_loss_clip": 0.01256047, + "auxiliary_loss_mlp": 0.00225027, + "balance_loss_clip": 1.03334332, + "balance_loss_mlp": 0.19895561, + "epoch": 0.9327521418908763, + "flos": 12057116227200.0, + "grad_norm": 31.560437466450825, + "language_loss": 0.90314716, + "learning_rate": 4.721033078682768e-08, + "loss": 0.9179579, + "num_input_tokens_seen": 334740825, + "router_z_loss_clip": 2.22558594, + "router_z_loss_mlp": 0.26086426, + "step": 15514, + "time_per_iteration": 2.641972064971924 + }, + { + "auxiliary_loss_clip": 0.01245659, + "auxiliary_loss_mlp": 0.00221297, + "balance_loss_clip": 1.03095388, + "balance_loss_mlp": 0.1964661, + "epoch": 0.9328122651435443, + "flos": 43834395271680.0, + "grad_norm": 3.687854607931068, + "language_loss": 0.77548623, + "learning_rate": 4.7126246679333626e-08, + "loss": 0.79015583, + "num_input_tokens_seen": 334765825, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.24804688, + "step": 15515, + "time_per_iteration": 2.8462512493133545 + }, + { + "auxiliary_loss_clip": 0.01260853, + "auxiliary_loss_mlp": 0.0022578, + "balance_loss_clip": 1.0389564, + "balance_loss_mlp": 0.19932747, + "epoch": 0.9328723883962122, + "flos": 15194954580480.0, + "grad_norm": 4.204943427560678, + "language_loss": 0.91693336, + "learning_rate": 4.704223662500806e-08, + "loss": 0.93179965, + "num_input_tokens_seen": 334782680, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.26452637, + "step": 15516, + "time_per_iteration": 4.080990791320801 + }, + { + "auxiliary_loss_clip": 0.01245114, + "auxiliary_loss_mlp": 0.00231733, + "balance_loss_clip": 1.02424598, + "balance_loss_mlp": 0.20760551, + "epoch": 0.9329325116488802, + "flos": 20261770041600.0, + "grad_norm": 11.559954983735405, + "language_loss": 0.88071436, + "learning_rate": 4.695830062703643e-08, + "loss": 0.89548278, + "num_input_tokens_seen": 334800160, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.24121094, + "step": 15517, + "time_per_iteration": 2.7151999473571777 + }, + { + "auxiliary_loss_clip": 0.01248165, + "auxiliary_loss_mlp": 0.00231828, + "balance_loss_clip": 1.02773428, + "balance_loss_mlp": 0.20376575, + "epoch": 0.9329926349015482, + "flos": 13115008609920.0, + "grad_norm": 61.327149960495774, + "language_loss": 0.84014827, + "learning_rate": 4.687443868860219e-08, + "loss": 0.85494816, + "num_input_tokens_seen": 334815840, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.28063965, + "step": 15518, + "time_per_iteration": 2.599527359008789 + }, + { + "auxiliary_loss_clip": 0.01221871, + "auxiliary_loss_mlp": 0.00214335, + "balance_loss_clip": 1.01233196, + "balance_loss_mlp": 0.19087471, + "epoch": 0.9330527581542162, + "flos": 23040250778880.0, + "grad_norm": 12.27627608655398, + "language_loss": 0.84600842, + "learning_rate": 4.679065081288458e-08, + "loss": 0.86037046, + "num_input_tokens_seen": 334834735, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.23474121, + "step": 15519, + "time_per_iteration": 2.6852145195007324 + }, + { + "auxiliary_loss_clip": 0.01232684, + "auxiliary_loss_mlp": 0.00207922, + "balance_loss_clip": 1.01802492, + "balance_loss_mlp": 0.184247, + "epoch": 0.9331128814068841, + "flos": 15559627409280.0, + "grad_norm": 13.316388415540754, + "language_loss": 0.91839033, + "learning_rate": 4.6706937003061275e-08, + "loss": 0.93279648, + "num_input_tokens_seen": 334853490, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.23693848, + "step": 15520, + "time_per_iteration": 2.606900930404663 + }, + { + "auxiliary_loss_clip": 0.01224766, + "auxiliary_loss_mlp": 0.00216111, + "balance_loss_clip": 1.01268375, + "balance_loss_mlp": 0.19100554, + "epoch": 0.9331730046595521, + "flos": 22271762275200.0, + "grad_norm": 4.387365544225445, + "language_loss": 0.83864427, + "learning_rate": 4.6623297262306846e-08, + "loss": 0.85305303, + "num_input_tokens_seen": 334873675, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.25097656, + "step": 15521, + "time_per_iteration": 2.6989078521728516 + }, + { + "auxiliary_loss_clip": 0.0125224, + "auxiliary_loss_mlp": 0.00223274, + "balance_loss_clip": 1.03495526, + "balance_loss_mlp": 0.19961125, + "epoch": 0.93323312791222, + "flos": 15777641007360.0, + "grad_norm": 26.049468875339286, + "language_loss": 0.84981799, + "learning_rate": 4.6539731593792545e-08, + "loss": 0.86457312, + "num_input_tokens_seen": 334890970, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.23669434, + "step": 15522, + "time_per_iteration": 2.6587812900543213 + }, + { + "auxiliary_loss_clip": 0.01248878, + "auxiliary_loss_mlp": 0.00232952, + "balance_loss_clip": 1.02317739, + "balance_loss_mlp": 0.20462805, + "epoch": 0.933293251164888, + "flos": 22010978557440.0, + "grad_norm": 19.39291013424816, + "language_loss": 0.73706931, + "learning_rate": 4.6456240000687373e-08, + "loss": 0.75188762, + "num_input_tokens_seen": 334906635, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.2833252, + "step": 15523, + "time_per_iteration": 2.68953537940979 + }, + { + "auxiliary_loss_clip": 0.01242004, + "auxiliary_loss_mlp": 0.00236803, + "balance_loss_clip": 1.02637148, + "balance_loss_mlp": 0.20921823, + "epoch": 0.933353374417556, + "flos": 26031358074240.0, + "grad_norm": 76.22521592717139, + "language_loss": 0.76272827, + "learning_rate": 4.63728224861577e-08, + "loss": 0.77751637, + "num_input_tokens_seen": 334926230, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.27575684, + "step": 15524, + "time_per_iteration": 2.7391645908355713 + }, + { + "auxiliary_loss_clip": 0.01254464, + "auxiliary_loss_mlp": 0.0025468, + "balance_loss_clip": 1.03059983, + "balance_loss_mlp": 0.22682112, + "epoch": 0.933413497670224, + "flos": 24900100162560.0, + "grad_norm": 15.508547714207198, + "language_loss": 0.80024397, + "learning_rate": 4.628947905336589e-08, + "loss": 0.81533539, + "num_input_tokens_seen": 334946680, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.27844238, + "step": 15525, + "time_per_iteration": 2.8457634449005127 + }, + { + "auxiliary_loss_clip": 0.01243214, + "auxiliary_loss_mlp": 0.00231414, + "balance_loss_clip": 1.02434111, + "balance_loss_mlp": 0.20595071, + "epoch": 0.9334736209228919, + "flos": 23688689051520.0, + "grad_norm": 15.468032369957692, + "language_loss": 0.90071893, + "learning_rate": 4.6206209705473175e-08, + "loss": 0.91546524, + "num_input_tokens_seen": 334964785, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.2545166, + "step": 15526, + "time_per_iteration": 2.716691732406616 + }, + { + "auxiliary_loss_clip": 0.01233928, + "auxiliary_loss_mlp": 0.00205292, + "balance_loss_clip": 1.01618671, + "balance_loss_mlp": 0.18174858, + "epoch": 0.9335337441755599, + "flos": 15377344865280.0, + "grad_norm": 36.61411475946276, + "language_loss": 0.77313459, + "learning_rate": 4.61230144456366e-08, + "loss": 0.78752685, + "num_input_tokens_seen": 334982400, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.2355957, + "step": 15527, + "time_per_iteration": 2.6246745586395264 + }, + { + "auxiliary_loss_clip": 0.01257442, + "auxiliary_loss_mlp": 0.0022481, + "balance_loss_clip": 1.02996945, + "balance_loss_mlp": 0.1992994, + "epoch": 0.9335938674282279, + "flos": 16106726436480.0, + "grad_norm": 59.26182965531216, + "language_loss": 0.75865006, + "learning_rate": 4.603989327701141e-08, + "loss": 0.77347261, + "num_input_tokens_seen": 334999685, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.25512695, + "step": 15528, + "time_per_iteration": 2.709789514541626 + }, + { + "auxiliary_loss_clip": 0.0125258, + "auxiliary_loss_mlp": 0.00223806, + "balance_loss_clip": 1.02966237, + "balance_loss_mlp": 0.19791359, + "epoch": 0.9336539906808958, + "flos": 18952898353920.0, + "grad_norm": 49.28586282885602, + "language_loss": 0.84202397, + "learning_rate": 4.5956846202748867e-08, + "loss": 0.8567878, + "num_input_tokens_seen": 335019160, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.25878906, + "step": 15529, + "time_per_iteration": 2.641139268875122 + }, + { + "auxiliary_loss_clip": 0.01226716, + "auxiliary_loss_mlp": 0.0021441, + "balance_loss_clip": 1.0143379, + "balance_loss_mlp": 0.18962659, + "epoch": 0.9337141139335638, + "flos": 18109104986880.0, + "grad_norm": 5.2112185238403494, + "language_loss": 0.69594067, + "learning_rate": 4.5873873225998674e-08, + "loss": 0.71035194, + "num_input_tokens_seen": 335037350, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.24804688, + "step": 15530, + "time_per_iteration": 2.670599937438965 + }, + { + "auxiliary_loss_clip": 0.01218967, + "auxiliary_loss_mlp": 0.00208297, + "balance_loss_clip": 1.00858235, + "balance_loss_mlp": 0.18338257, + "epoch": 0.9337742371862318, + "flos": 17345716214400.0, + "grad_norm": 14.006072876486176, + "language_loss": 0.80056256, + "learning_rate": 4.5790974349907194e-08, + "loss": 0.81483519, + "num_input_tokens_seen": 335056060, + "router_z_loss_clip": 2.10351562, + "router_z_loss_mlp": 0.24902344, + "step": 15531, + "time_per_iteration": 2.6670663356781006 + }, + { + "auxiliary_loss_clip": 0.01245819, + "auxiliary_loss_mlp": 0.00240735, + "balance_loss_clip": 1.02840185, + "balance_loss_mlp": 0.21464014, + "epoch": 0.9338343604388998, + "flos": 29058986522880.0, + "grad_norm": 350.9700647075944, + "language_loss": 0.78348601, + "learning_rate": 4.5708149577617925e-08, + "loss": 0.79835153, + "num_input_tokens_seen": 335075410, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.26098633, + "step": 15532, + "time_per_iteration": 2.7141005992889404 + }, + { + "auxiliary_loss_clip": 0.0124342, + "auxiliary_loss_mlp": 0.00202897, + "balance_loss_clip": 1.02464247, + "balance_loss_mlp": 0.1773148, + "epoch": 0.9338944836915677, + "flos": 18660908695680.0, + "grad_norm": 2.4038024899809955, + "language_loss": 0.79661453, + "learning_rate": 4.5625398912271016e-08, + "loss": 0.81107771, + "num_input_tokens_seen": 335095190, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.25598145, + "step": 15533, + "time_per_iteration": 2.6496822834014893 + }, + { + "auxiliary_loss_clip": 0.01230257, + "auxiliary_loss_mlp": 0.00216423, + "balance_loss_clip": 1.02264214, + "balance_loss_mlp": 0.19299865, + "epoch": 0.9339546069442357, + "flos": 16617735273600.0, + "grad_norm": 14.54597768126562, + "language_loss": 0.88832355, + "learning_rate": 4.554272235700507e-08, + "loss": 0.90279043, + "num_input_tokens_seen": 335113825, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.23413086, + "step": 15534, + "time_per_iteration": 2.6481070518493652 + }, + { + "auxiliary_loss_clip": 0.01210589, + "auxiliary_loss_mlp": 0.00217563, + "balance_loss_clip": 1.00770319, + "balance_loss_mlp": 0.19670171, + "epoch": 0.9340147301969036, + "flos": 23693106424320.0, + "grad_norm": 39.9084949307438, + "language_loss": 0.81992388, + "learning_rate": 4.546011991495513e-08, + "loss": 0.83420539, + "num_input_tokens_seen": 335136425, + "router_z_loss_clip": 2.02832031, + "router_z_loss_mlp": 0.20861816, + "step": 15535, + "time_per_iteration": 2.663069009780884 + }, + { + "auxiliary_loss_clip": 0.0124136, + "auxiliary_loss_mlp": 0.00227957, + "balance_loss_clip": 1.02724993, + "balance_loss_mlp": 0.20425861, + "epoch": 0.9340748534495716, + "flos": 28654452576000.0, + "grad_norm": 4.519069718895035, + "language_loss": 0.85826504, + "learning_rate": 4.537759158925292e-08, + "loss": 0.87295818, + "num_input_tokens_seen": 335157925, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.23718262, + "step": 15536, + "time_per_iteration": 2.7499163150787354 + }, + { + "auxiliary_loss_clip": 0.01239147, + "auxiliary_loss_mlp": 0.00231833, + "balance_loss_clip": 1.0244782, + "balance_loss_mlp": 0.20658422, + "epoch": 0.9341349767022396, + "flos": 24899633285760.0, + "grad_norm": 9.236322965020197, + "language_loss": 0.87169814, + "learning_rate": 4.5295137383028593e-08, + "loss": 0.88640785, + "num_input_tokens_seen": 335177840, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.25256348, + "step": 15537, + "time_per_iteration": 2.6641745567321777 + }, + { + "auxiliary_loss_clip": 0.01247896, + "auxiliary_loss_mlp": 0.00216095, + "balance_loss_clip": 1.03049016, + "balance_loss_mlp": 0.1923603, + "epoch": 0.9341950999549076, + "flos": 29059525226880.0, + "grad_norm": 11.83637431301546, + "language_loss": 0.8600992, + "learning_rate": 4.5212757299408764e-08, + "loss": 0.87473911, + "num_input_tokens_seen": 335199470, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.23718262, + "step": 15538, + "time_per_iteration": 2.7360665798187256 + }, + { + "auxiliary_loss_clip": 0.01223555, + "auxiliary_loss_mlp": 0.00237166, + "balance_loss_clip": 1.01377082, + "balance_loss_mlp": 0.21185784, + "epoch": 0.9342552232075755, + "flos": 23587062497280.0, + "grad_norm": 14.040589423277787, + "language_loss": 0.80585945, + "learning_rate": 4.513045134151672e-08, + "loss": 0.82046664, + "num_input_tokens_seen": 335218885, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.2532959, + "step": 15539, + "time_per_iteration": 2.6802611351013184 + }, + { + "auxiliary_loss_clip": 0.01233453, + "auxiliary_loss_mlp": 0.00224281, + "balance_loss_clip": 1.0204258, + "balance_loss_mlp": 0.2002241, + "epoch": 0.9343153464602435, + "flos": 36721389646080.0, + "grad_norm": 3.909132210147867, + "language_loss": 0.72396731, + "learning_rate": 4.504821951247373e-08, + "loss": 0.7385447, + "num_input_tokens_seen": 335239485, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.24047852, + "step": 15540, + "time_per_iteration": 2.7938761711120605 + }, + { + "auxiliary_loss_clip": 0.01243459, + "auxiliary_loss_mlp": 0.00241075, + "balance_loss_clip": 1.02693975, + "balance_loss_mlp": 0.21596974, + "epoch": 0.9343754697129115, + "flos": 22236498097920.0, + "grad_norm": 21.168169693624638, + "language_loss": 0.80411768, + "learning_rate": 4.496606181539864e-08, + "loss": 0.81896305, + "num_input_tokens_seen": 335258355, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.2512207, + "step": 15541, + "time_per_iteration": 2.632828712463379 + }, + { + "auxiliary_loss_clip": 0.01242288, + "auxiliary_loss_mlp": 0.00222963, + "balance_loss_clip": 1.02550721, + "balance_loss_mlp": 0.19798848, + "epoch": 0.9344355929655794, + "flos": 29710333797120.0, + "grad_norm": 369.3265493594942, + "language_loss": 0.76219708, + "learning_rate": 4.4883978253406066e-08, + "loss": 0.77684957, + "num_input_tokens_seen": 335276835, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.24975586, + "step": 15542, + "time_per_iteration": 2.7179527282714844 + }, + { + "auxiliary_loss_clip": 0.01238268, + "auxiliary_loss_mlp": 0.00233228, + "balance_loss_clip": 1.02498233, + "balance_loss_mlp": 0.20938663, + "epoch": 0.9344957162182475, + "flos": 18880394751360.0, + "grad_norm": 24.40880566007105, + "language_loss": 0.77448571, + "learning_rate": 4.480196882960907e-08, + "loss": 0.78920066, + "num_input_tokens_seen": 335296220, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.23840332, + "step": 15543, + "time_per_iteration": 2.6407833099365234 + }, + { + "auxiliary_loss_clip": 0.01261871, + "auxiliary_loss_mlp": 0.00217431, + "balance_loss_clip": 1.03780186, + "balance_loss_mlp": 0.19164571, + "epoch": 0.9345558394709154, + "flos": 27417761268480.0, + "grad_norm": 7.796354761430544, + "language_loss": 0.78977919, + "learning_rate": 4.4720033547117394e-08, + "loss": 0.80457217, + "num_input_tokens_seen": 335316335, + "router_z_loss_clip": 2.23925781, + "router_z_loss_mlp": 0.2578125, + "step": 15544, + "time_per_iteration": 2.705824613571167 + }, + { + "auxiliary_loss_clip": 0.01251484, + "auxiliary_loss_mlp": 0.00228718, + "balance_loss_clip": 1.03190482, + "balance_loss_mlp": 0.20321959, + "epoch": 0.9346159627235834, + "flos": 20741285629440.0, + "grad_norm": 3.8186101469634823, + "language_loss": 0.86884946, + "learning_rate": 4.463817240903789e-08, + "loss": 0.88365149, + "num_input_tokens_seen": 335335545, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.25512695, + "step": 15545, + "time_per_iteration": 2.667757034301758 + }, + { + "auxiliary_loss_clip": 0.01247583, + "auxiliary_loss_mlp": 0.00224977, + "balance_loss_clip": 1.0273304, + "balance_loss_mlp": 0.2004915, + "epoch": 0.9346760859762513, + "flos": 21069221823360.0, + "grad_norm": 3.57802594515728, + "language_loss": 0.78580326, + "learning_rate": 4.455638541847495e-08, + "loss": 0.80052888, + "num_input_tokens_seen": 335355350, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.24487305, + "step": 15546, + "time_per_iteration": 2.661632776260376 + }, + { + "auxiliary_loss_clip": 0.01223394, + "auxiliary_loss_mlp": 0.00224726, + "balance_loss_clip": 1.01364005, + "balance_loss_mlp": 0.20027612, + "epoch": 0.9347362092289193, + "flos": 29204927481600.0, + "grad_norm": 3.088034252620135, + "language_loss": 0.88482535, + "learning_rate": 4.447467257852966e-08, + "loss": 0.8993066, + "num_input_tokens_seen": 335375160, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.24475098, + "step": 15547, + "time_per_iteration": 5.526913404464722 + }, + { + "auxiliary_loss_clip": 0.01216998, + "auxiliary_loss_mlp": 0.00222849, + "balance_loss_clip": 1.01392639, + "balance_loss_mlp": 0.2009626, + "epoch": 0.9347963324815872, + "flos": 19427350124160.0, + "grad_norm": 9.51847517466302, + "language_loss": 0.91040492, + "learning_rate": 4.439303389230087e-08, + "loss": 0.92480338, + "num_input_tokens_seen": 335394080, + "router_z_loss_clip": 2.03417969, + "router_z_loss_mlp": 0.21887207, + "step": 15548, + "time_per_iteration": 2.652658700942993 + }, + { + "auxiliary_loss_clip": 0.01267922, + "auxiliary_loss_mlp": 0.0023947, + "balance_loss_clip": 1.03837454, + "balance_loss_mlp": 0.21330363, + "epoch": 0.9348564557342552, + "flos": 36901840596480.0, + "grad_norm": 7.947792243132131, + "language_loss": 0.73651099, + "learning_rate": 4.4311469362884326e-08, + "loss": 0.75158489, + "num_input_tokens_seen": 335414230, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.26171875, + "step": 15549, + "time_per_iteration": 2.8802950382232666 + }, + { + "auxiliary_loss_clip": 0.01253469, + "auxiliary_loss_mlp": 0.0023129, + "balance_loss_clip": 1.03775668, + "balance_loss_mlp": 0.20684013, + "epoch": 0.9349165789869232, + "flos": 21690117342720.0, + "grad_norm": 6.7874475544550315, + "language_loss": 0.87089157, + "learning_rate": 4.4229978993372665e-08, + "loss": 0.88573909, + "num_input_tokens_seen": 335432890, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.24462891, + "step": 15550, + "time_per_iteration": 2.6804585456848145 + }, + { + "auxiliary_loss_clip": 0.01233588, + "auxiliary_loss_mlp": 0.00217591, + "balance_loss_clip": 1.01946604, + "balance_loss_mlp": 0.19353443, + "epoch": 0.9349767022395912, + "flos": 18844053166080.0, + "grad_norm": 61.962439854561005, + "language_loss": 0.82515919, + "learning_rate": 4.4148562786856524e-08, + "loss": 0.83967102, + "num_input_tokens_seen": 335452085, + "router_z_loss_clip": 2.14160156, + "router_z_loss_mlp": 0.24072266, + "step": 15551, + "time_per_iteration": 2.7184085845947266 + }, + { + "auxiliary_loss_clip": 0.01218079, + "auxiliary_loss_mlp": 0.00220913, + "balance_loss_clip": 1.01544821, + "balance_loss_mlp": 0.19945522, + "epoch": 0.9350368254922591, + "flos": 24973429777920.0, + "grad_norm": 3.953087860862921, + "language_loss": 0.80305195, + "learning_rate": 4.406722074642255e-08, + "loss": 0.81744182, + "num_input_tokens_seen": 335472130, + "router_z_loss_clip": 2.02539062, + "router_z_loss_mlp": 0.21459961, + "step": 15552, + "time_per_iteration": 2.757685422897339 + }, + { + "auxiliary_loss_clip": 0.01246181, + "auxiliary_loss_mlp": 0.00226797, + "balance_loss_clip": 1.02892375, + "balance_loss_mlp": 0.20157282, + "epoch": 0.9350969487449271, + "flos": 23070594792960.0, + "grad_norm": 6.023764845713289, + "language_loss": 0.85063016, + "learning_rate": 4.3985952875155386e-08, + "loss": 0.8653599, + "num_input_tokens_seen": 335489970, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.25256348, + "step": 15553, + "time_per_iteration": 2.681281805038452 + }, + { + "auxiliary_loss_clip": 0.01246224, + "auxiliary_loss_mlp": 0.00248471, + "balance_loss_clip": 1.02575111, + "balance_loss_mlp": 0.22058779, + "epoch": 0.9351570719975951, + "flos": 18625177641600.0, + "grad_norm": 2.8859647275255322, + "language_loss": 0.87742686, + "learning_rate": 4.390475917613723e-08, + "loss": 0.8923738, + "num_input_tokens_seen": 335509125, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.27868652, + "step": 15554, + "time_per_iteration": 2.660900831222534 + }, + { + "auxiliary_loss_clip": 0.01217031, + "auxiliary_loss_mlp": 0.00236714, + "balance_loss_clip": 1.01236379, + "balance_loss_mlp": 0.21451716, + "epoch": 0.935217195250263, + "flos": 15888353702400.0, + "grad_norm": 46.62174090583487, + "language_loss": 0.76082867, + "learning_rate": 4.382363965244695e-08, + "loss": 0.77536613, + "num_input_tokens_seen": 335525620, + "router_z_loss_clip": 2.04882812, + "router_z_loss_mlp": 0.22192383, + "step": 15555, + "time_per_iteration": 4.031850337982178 + }, + { + "auxiliary_loss_clip": 0.01247607, + "auxiliary_loss_mlp": 0.00220453, + "balance_loss_clip": 1.03314972, + "balance_loss_mlp": 0.19469154, + "epoch": 0.935277318502931, + "flos": 24390312387840.0, + "grad_norm": 2.957483627653401, + "language_loss": 0.81505609, + "learning_rate": 4.374259430715965e-08, + "loss": 0.82973665, + "num_input_tokens_seen": 335547565, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.25756836, + "step": 15556, + "time_per_iteration": 2.669058322906494 + }, + { + "auxiliary_loss_clip": 0.0124056, + "auxiliary_loss_mlp": 0.00220572, + "balance_loss_clip": 1.02509272, + "balance_loss_mlp": 0.19535917, + "epoch": 0.935337441755599, + "flos": 27600259294080.0, + "grad_norm": 37.4779983918222, + "language_loss": 0.82186794, + "learning_rate": 4.366162314334953e-08, + "loss": 0.83647931, + "num_input_tokens_seen": 335570285, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.25219727, + "step": 15557, + "time_per_iteration": 2.736241340637207 + }, + { + "auxiliary_loss_clip": 0.01254665, + "auxiliary_loss_mlp": 0.00240419, + "balance_loss_clip": 1.03342152, + "balance_loss_mlp": 0.21348988, + "epoch": 0.935397565008267, + "flos": 20482872209280.0, + "grad_norm": 15.154014205114041, + "language_loss": 0.6955837, + "learning_rate": 4.358072616408681e-08, + "loss": 0.71053451, + "num_input_tokens_seen": 335588600, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.26940918, + "step": 15558, + "time_per_iteration": 3.998095750808716 + }, + { + "auxiliary_loss_clip": 0.01247285, + "auxiliary_loss_mlp": 0.00228277, + "balance_loss_clip": 1.02841163, + "balance_loss_mlp": 0.20385063, + "epoch": 0.9354576882609349, + "flos": 23654394541440.0, + "grad_norm": 54.270410743296786, + "language_loss": 0.80125457, + "learning_rate": 4.34999033724388e-08, + "loss": 0.81601024, + "num_input_tokens_seen": 335606235, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.24438477, + "step": 15559, + "time_per_iteration": 2.625577926635742 + }, + { + "auxiliary_loss_clip": 0.01229919, + "auxiliary_loss_mlp": 0.00215407, + "balance_loss_clip": 1.01653337, + "balance_loss_mlp": 0.1925907, + "epoch": 0.9355178115136029, + "flos": 36684904406400.0, + "grad_norm": 131.66655623220962, + "language_loss": 0.71224546, + "learning_rate": 4.341915477147062e-08, + "loss": 0.72669876, + "num_input_tokens_seen": 335628240, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.22802734, + "step": 15560, + "time_per_iteration": 2.8129944801330566 + }, + { + "auxiliary_loss_clip": 0.01288174, + "auxiliary_loss_mlp": 0.00258969, + "balance_loss_clip": 1.05146217, + "balance_loss_mlp": 0.22991754, + "epoch": 0.9355779347662708, + "flos": 14460401450880.0, + "grad_norm": 102.84219749090408, + "language_loss": 0.74849677, + "learning_rate": 4.3338480364244034e-08, + "loss": 0.76396823, + "num_input_tokens_seen": 335643755, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.2902832, + "step": 15561, + "time_per_iteration": 2.570456027984619 + }, + { + "auxiliary_loss_clip": 0.01225772, + "auxiliary_loss_mlp": 0.00215949, + "balance_loss_clip": 1.01322699, + "balance_loss_mlp": 0.19046235, + "epoch": 0.9356380580189388, + "flos": 23185976256000.0, + "grad_norm": 6.653802437921711, + "language_loss": 0.83289671, + "learning_rate": 4.325788015381859e-08, + "loss": 0.84731388, + "num_input_tokens_seen": 335665160, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.25500488, + "step": 15562, + "time_per_iteration": 2.683290719985962 + }, + { + "auxiliary_loss_clip": 0.01097375, + "auxiliary_loss_mlp": 0.00084832, + "balance_loss_clip": 0.96184158, + "balance_loss_mlp": 0.07701227, + "epoch": 0.9356981812716068, + "flos": 67471626090240.0, + "grad_norm": 0.9204581917690923, + "language_loss": 0.61278003, + "learning_rate": 4.31773541432503e-08, + "loss": 0.62460208, + "num_input_tokens_seen": 335715240, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.078125, + "step": 15563, + "time_per_iteration": 3.068084955215454 + }, + { + "auxiliary_loss_clip": 0.01229546, + "auxiliary_loss_mlp": 0.0022949, + "balance_loss_clip": 1.02093279, + "balance_loss_mlp": 0.20445579, + "epoch": 0.9357583045242748, + "flos": 24681619687680.0, + "grad_norm": 35.71172037073797, + "language_loss": 0.85818642, + "learning_rate": 4.3096902335592714e-08, + "loss": 0.87277675, + "num_input_tokens_seen": 335734970, + "router_z_loss_clip": 2.0859375, + "router_z_loss_mlp": 0.25012207, + "step": 15564, + "time_per_iteration": 2.736680269241333 + }, + { + "auxiliary_loss_clip": 0.01246226, + "auxiliary_loss_mlp": 0.00226285, + "balance_loss_clip": 1.02477789, + "balance_loss_mlp": 0.19925982, + "epoch": 0.9358184277769427, + "flos": 19463727623040.0, + "grad_norm": 40.68632055537719, + "language_loss": 0.8832193, + "learning_rate": 4.301652473389694e-08, + "loss": 0.89794439, + "num_input_tokens_seen": 335753435, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.27026367, + "step": 15565, + "time_per_iteration": 2.648409843444824 + }, + { + "auxiliary_loss_clip": 0.01226955, + "auxiliary_loss_mlp": 0.00223557, + "balance_loss_clip": 1.01460111, + "balance_loss_mlp": 0.199274, + "epoch": 0.9358785510296107, + "flos": 18916987731840.0, + "grad_norm": 12.105627541937784, + "language_loss": 0.81814492, + "learning_rate": 4.2936221341210774e-08, + "loss": 0.83265007, + "num_input_tokens_seen": 335772105, + "router_z_loss_clip": 2.12597656, + "router_z_loss_mlp": 0.24267578, + "step": 15566, + "time_per_iteration": 2.6648855209350586 + }, + { + "auxiliary_loss_clip": 0.0124249, + "auxiliary_loss_mlp": 0.00248258, + "balance_loss_clip": 1.02351069, + "balance_loss_mlp": 0.22230583, + "epoch": 0.9359386742822787, + "flos": 23441265192960.0, + "grad_norm": 10.502668450642062, + "language_loss": 0.75556701, + "learning_rate": 4.285599216057889e-08, + "loss": 0.77047449, + "num_input_tokens_seen": 335789125, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.25952148, + "step": 15567, + "time_per_iteration": 2.675158977508545 + }, + { + "auxiliary_loss_clip": 0.01239153, + "auxiliary_loss_mlp": 0.00233083, + "balance_loss_clip": 1.02288496, + "balance_loss_mlp": 0.20826343, + "epoch": 0.9359987975349466, + "flos": 32744067557760.0, + "grad_norm": 57.76794043675989, + "language_loss": 0.6951375, + "learning_rate": 4.277583719504418e-08, + "loss": 0.70985979, + "num_input_tokens_seen": 335810995, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.24804688, + "step": 15568, + "time_per_iteration": 2.7356882095336914 + }, + { + "auxiliary_loss_clip": 0.01229397, + "auxiliary_loss_mlp": 0.00224151, + "balance_loss_clip": 1.01685834, + "balance_loss_mlp": 0.19873545, + "epoch": 0.9360589207876147, + "flos": 22819651401600.0, + "grad_norm": 91.7720085321227, + "language_loss": 0.85990536, + "learning_rate": 4.269575644764556e-08, + "loss": 0.87444085, + "num_input_tokens_seen": 335830580, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.25415039, + "step": 15569, + "time_per_iteration": 2.7031190395355225 + }, + { + "auxiliary_loss_clip": 0.01257587, + "auxiliary_loss_mlp": 0.00210591, + "balance_loss_clip": 1.03071713, + "balance_loss_mlp": 0.18341164, + "epoch": 0.9361190440402826, + "flos": 20885251340160.0, + "grad_norm": 28.16502857516713, + "language_loss": 0.8329919, + "learning_rate": 4.261574992142014e-08, + "loss": 0.84767365, + "num_input_tokens_seen": 335846515, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.27185059, + "step": 15570, + "time_per_iteration": 2.6091902256011963 + }, + { + "auxiliary_loss_clip": 0.01244664, + "auxiliary_loss_mlp": 0.00241007, + "balance_loss_clip": 1.02232325, + "balance_loss_mlp": 0.21671197, + "epoch": 0.9361791672929506, + "flos": 19317822577920.0, + "grad_norm": 2.6804914802671735, + "language_loss": 0.8715207, + "learning_rate": 4.2535817619401726e-08, + "loss": 0.88637745, + "num_input_tokens_seen": 335863350, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.24291992, + "step": 15571, + "time_per_iteration": 2.686521053314209 + }, + { + "auxiliary_loss_clip": 0.01229389, + "auxiliary_loss_mlp": 0.00233028, + "balance_loss_clip": 1.014678, + "balance_loss_mlp": 0.20880422, + "epoch": 0.9362392905456185, + "flos": 15158182032000.0, + "grad_norm": 67.00881105714546, + "language_loss": 0.83190626, + "learning_rate": 4.2455959544621224e-08, + "loss": 0.84653044, + "num_input_tokens_seen": 335880510, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.24206543, + "step": 15572, + "time_per_iteration": 2.729785203933716 + }, + { + "auxiliary_loss_clip": 0.0121357, + "auxiliary_loss_mlp": 0.00227544, + "balance_loss_clip": 1.00843155, + "balance_loss_mlp": 0.20288, + "epoch": 0.9362994137982865, + "flos": 22085888371200.0, + "grad_norm": 81.56616342439204, + "language_loss": 0.83900851, + "learning_rate": 4.237617570010688e-08, + "loss": 0.85341966, + "num_input_tokens_seen": 335899440, + "router_z_loss_clip": 2.05175781, + "router_z_loss_mlp": 0.24682617, + "step": 15573, + "time_per_iteration": 2.7123303413391113 + }, + { + "auxiliary_loss_clip": 0.01235444, + "auxiliary_loss_mlp": 0.0023282, + "balance_loss_clip": 1.02573013, + "balance_loss_mlp": 0.20970556, + "epoch": 0.9363595370509544, + "flos": 23512260424320.0, + "grad_norm": 6.776627446798069, + "language_loss": 0.80763531, + "learning_rate": 4.2296466088884044e-08, + "loss": 0.82231796, + "num_input_tokens_seen": 335919540, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.23120117, + "step": 15574, + "time_per_iteration": 2.63759708404541 + }, + { + "auxiliary_loss_clip": 0.01231699, + "auxiliary_loss_mlp": 0.00229333, + "balance_loss_clip": 1.01854086, + "balance_loss_mlp": 0.20414406, + "epoch": 0.9364196603036224, + "flos": 27123473139840.0, + "grad_norm": 2.9230057169037162, + "language_loss": 0.77109325, + "learning_rate": 4.221683071397564e-08, + "loss": 0.78570354, + "num_input_tokens_seen": 335939665, + "router_z_loss_clip": 2.13378906, + "router_z_loss_mlp": 0.25170898, + "step": 15575, + "time_per_iteration": 2.7454733848571777 + }, + { + "auxiliary_loss_clip": 0.01226149, + "auxiliary_loss_mlp": 0.00228041, + "balance_loss_clip": 1.01702785, + "balance_loss_mlp": 0.2038776, + "epoch": 0.9364797835562904, + "flos": 18479057114880.0, + "grad_norm": 14.086532551305359, + "language_loss": 0.73572528, + "learning_rate": 4.2137269578401026e-08, + "loss": 0.75026721, + "num_input_tokens_seen": 335958580, + "router_z_loss_clip": 2.09082031, + "router_z_loss_mlp": 0.24157715, + "step": 15576, + "time_per_iteration": 2.66841983795166 + }, + { + "auxiliary_loss_clip": 0.01253144, + "auxiliary_loss_mlp": 0.00258524, + "balance_loss_clip": 1.03214765, + "balance_loss_mlp": 0.23111805, + "epoch": 0.9365399068089584, + "flos": 13005552890880.0, + "grad_norm": 6.22552093588715, + "language_loss": 0.84477103, + "learning_rate": 4.2057782685177566e-08, + "loss": 0.85988772, + "num_input_tokens_seen": 335974965, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.27416992, + "step": 15577, + "time_per_iteration": 2.6386115550994873 + }, + { + "auxiliary_loss_clip": 0.01249341, + "auxiliary_loss_mlp": 0.00231461, + "balance_loss_clip": 1.02621555, + "balance_loss_mlp": 0.20673685, + "epoch": 0.9366000300616263, + "flos": 25666433850240.0, + "grad_norm": 53.30732995077369, + "language_loss": 0.6215694, + "learning_rate": 4.1978370037318855e-08, + "loss": 0.63637745, + "num_input_tokens_seen": 335996575, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.24755859, + "step": 15578, + "time_per_iteration": 2.7307257652282715 + }, + { + "auxiliary_loss_clip": 0.01214792, + "auxiliary_loss_mlp": 0.00218386, + "balance_loss_clip": 1.00597119, + "balance_loss_mlp": 0.19568864, + "epoch": 0.9366601533142943, + "flos": 21433355948160.0, + "grad_norm": 167.97711694274545, + "language_loss": 0.7594949, + "learning_rate": 4.189903163783692e-08, + "loss": 0.77382672, + "num_input_tokens_seen": 336017265, + "router_z_loss_clip": 2.0859375, + "router_z_loss_mlp": 0.22705078, + "step": 15579, + "time_per_iteration": 2.659555435180664 + }, + { + "auxiliary_loss_clip": 0.01238792, + "auxiliary_loss_mlp": 0.00212741, + "balance_loss_clip": 1.02287889, + "balance_loss_mlp": 0.18851727, + "epoch": 0.9367202765669622, + "flos": 24093222998400.0, + "grad_norm": 48.5082654826905, + "language_loss": 0.83760655, + "learning_rate": 4.181976748973959e-08, + "loss": 0.85212189, + "num_input_tokens_seen": 336035905, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.2421875, + "step": 15580, + "time_per_iteration": 2.6931943893432617 + }, + { + "auxiliary_loss_clip": 0.01247764, + "auxiliary_loss_mlp": 0.00206262, + "balance_loss_clip": 1.02630711, + "balance_loss_mlp": 0.18124047, + "epoch": 0.9367803998196302, + "flos": 20888842700160.0, + "grad_norm": 6.418798410295584, + "language_loss": 0.75445402, + "learning_rate": 4.1740577596033114e-08, + "loss": 0.76899427, + "num_input_tokens_seen": 336055585, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.25024414, + "step": 15581, + "time_per_iteration": 2.6866257190704346 + }, + { + "auxiliary_loss_clip": 0.01235759, + "auxiliary_loss_mlp": 0.00234573, + "balance_loss_clip": 1.02097654, + "balance_loss_mlp": 0.21024235, + "epoch": 0.9368405230722983, + "flos": 22564362464640.0, + "grad_norm": 13.165358457303302, + "language_loss": 0.8197577, + "learning_rate": 4.166146195972042e-08, + "loss": 0.83446103, + "num_input_tokens_seen": 336076695, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.2434082, + "step": 15582, + "time_per_iteration": 2.7942373752593994 + }, + { + "auxiliary_loss_clip": 0.01231657, + "auxiliary_loss_mlp": 0.00215656, + "balance_loss_clip": 1.01513743, + "balance_loss_mlp": 0.19044307, + "epoch": 0.9369006463249662, + "flos": 18880215183360.0, + "grad_norm": 7.122201322974767, + "language_loss": 0.82150388, + "learning_rate": 4.1582420583800905e-08, + "loss": 0.83597702, + "num_input_tokens_seen": 336094740, + "router_z_loss_clip": 2.16503906, + "router_z_loss_mlp": 0.25183105, + "step": 15583, + "time_per_iteration": 2.6281843185424805 + }, + { + "auxiliary_loss_clip": 0.012542, + "auxiliary_loss_mlp": 0.00242512, + "balance_loss_clip": 1.03360689, + "balance_loss_mlp": 0.21713229, + "epoch": 0.9369607695776342, + "flos": 26432516142720.0, + "grad_norm": 12.27016678872365, + "language_loss": 0.92929423, + "learning_rate": 4.1503453471272376e-08, + "loss": 0.94426131, + "num_input_tokens_seen": 336113985, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.25390625, + "step": 15584, + "time_per_iteration": 2.726494073867798 + }, + { + "auxiliary_loss_clip": 0.01260023, + "auxiliary_loss_mlp": 0.002265, + "balance_loss_clip": 1.03512144, + "balance_loss_mlp": 0.20022628, + "epoch": 0.9370208928303021, + "flos": 39567346081920.0, + "grad_norm": 10.217554247871849, + "language_loss": 0.7957384, + "learning_rate": 4.1424560625129334e-08, + "loss": 0.81060362, + "num_input_tokens_seen": 336136395, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.26293945, + "step": 15585, + "time_per_iteration": 2.786827564239502 + }, + { + "auxiliary_loss_clip": 0.01230696, + "auxiliary_loss_mlp": 0.00208964, + "balance_loss_clip": 1.02005827, + "balance_loss_mlp": 0.18578991, + "epoch": 0.9370810160829701, + "flos": 22963114321920.0, + "grad_norm": 11.745579038144808, + "language_loss": 0.89168096, + "learning_rate": 4.134574204836316e-08, + "loss": 0.90607756, + "num_input_tokens_seen": 336156345, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.23156738, + "step": 15586, + "time_per_iteration": 2.695124864578247 + }, + { + "auxiliary_loss_clip": 0.01232509, + "auxiliary_loss_mlp": 0.00233349, + "balance_loss_clip": 1.01655364, + "balance_loss_mlp": 0.20929249, + "epoch": 0.937141139335638, + "flos": 23075048079360.0, + "grad_norm": 32.09513910952519, + "language_loss": 0.83060992, + "learning_rate": 4.126699774396258e-08, + "loss": 0.84526849, + "num_input_tokens_seen": 336176760, + "router_z_loss_clip": 2.15917969, + "router_z_loss_mlp": 0.24023438, + "step": 15587, + "time_per_iteration": 2.6536314487457275 + }, + { + "auxiliary_loss_clip": 0.01259399, + "auxiliary_loss_mlp": 0.00224455, + "balance_loss_clip": 1.03489554, + "balance_loss_mlp": 0.19746608, + "epoch": 0.937201262588306, + "flos": 16356664247040.0, + "grad_norm": 29.025155728378124, + "language_loss": 0.95954072, + "learning_rate": 4.118832771491387e-08, + "loss": 0.9743793, + "num_input_tokens_seen": 336193285, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.26977539, + "step": 15588, + "time_per_iteration": 2.6686251163482666 + }, + { + "auxiliary_loss_clip": 0.01216351, + "auxiliary_loss_mlp": 0.00196638, + "balance_loss_clip": 1.0103935, + "balance_loss_mlp": 0.17444128, + "epoch": 0.937261385840974, + "flos": 20194078861440.0, + "grad_norm": 9.909509425637324, + "language_loss": 0.86493671, + "learning_rate": 4.11097319642002e-08, + "loss": 0.87906659, + "num_input_tokens_seen": 336211425, + "router_z_loss_clip": 2.05566406, + "router_z_loss_mlp": 0.22180176, + "step": 15589, + "time_per_iteration": 5.517268657684326 + }, + { + "auxiliary_loss_clip": 0.01226828, + "auxiliary_loss_mlp": 0.00217464, + "balance_loss_clip": 1.01574111, + "balance_loss_mlp": 0.19304956, + "epoch": 0.937321509093642, + "flos": 18295948558080.0, + "grad_norm": 2.475140703648732, + "language_loss": 0.86413586, + "learning_rate": 4.103121049480163e-08, + "loss": 0.87857878, + "num_input_tokens_seen": 336230205, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.24414062, + "step": 15590, + "time_per_iteration": 2.684887409210205 + }, + { + "auxiliary_loss_clip": 0.01267152, + "auxiliary_loss_mlp": 0.00227692, + "balance_loss_clip": 1.04278207, + "balance_loss_mlp": 0.20002401, + "epoch": 0.9373816323463099, + "flos": 25884662929920.0, + "grad_norm": 52.10301991218458, + "language_loss": 0.7804935, + "learning_rate": 4.095276330969577e-08, + "loss": 0.79544193, + "num_input_tokens_seen": 336252440, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.27697754, + "step": 15591, + "time_per_iteration": 2.696643829345703 + }, + { + "auxiliary_loss_clip": 0.01273426, + "auxiliary_loss_mlp": 0.00249396, + "balance_loss_clip": 1.04427814, + "balance_loss_mlp": 0.22129866, + "epoch": 0.9374417555989779, + "flos": 27198849830400.0, + "grad_norm": 155.75189421282445, + "language_loss": 0.66441083, + "learning_rate": 4.0874390411857804e-08, + "loss": 0.67963898, + "num_input_tokens_seen": 336273845, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.28088379, + "step": 15592, + "time_per_iteration": 2.744729995727539 + }, + { + "auxiliary_loss_clip": 0.01229341, + "auxiliary_loss_mlp": 0.00209789, + "balance_loss_clip": 1.01821697, + "balance_loss_mlp": 0.18551832, + "epoch": 0.9375018788516458, + "flos": 23621249266560.0, + "grad_norm": 17.407201122891987, + "language_loss": 0.73709363, + "learning_rate": 4.0796091804259136e-08, + "loss": 0.75148493, + "num_input_tokens_seen": 336292790, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.24267578, + "step": 15593, + "time_per_iteration": 2.6868793964385986 + }, + { + "auxiliary_loss_clip": 0.01241029, + "auxiliary_loss_mlp": 0.00216193, + "balance_loss_clip": 1.02632666, + "balance_loss_mlp": 0.19180259, + "epoch": 0.9375620021043138, + "flos": 22678774260480.0, + "grad_norm": 5.59303225658966, + "language_loss": 0.80763471, + "learning_rate": 4.0717867489868715e-08, + "loss": 0.82220697, + "num_input_tokens_seen": 336312600, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.24389648, + "step": 15594, + "time_per_iteration": 2.7398180961608887 + }, + { + "auxiliary_loss_clip": 0.01233022, + "auxiliary_loss_mlp": 0.00232054, + "balance_loss_clip": 1.02151453, + "balance_loss_mlp": 0.20747307, + "epoch": 0.9376221253569819, + "flos": 27560254521600.0, + "grad_norm": 44.027464868045485, + "language_loss": 0.79140443, + "learning_rate": 4.063971747165351e-08, + "loss": 0.80605519, + "num_input_tokens_seen": 336332770, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.24572754, + "step": 15595, + "time_per_iteration": 2.6829793453216553 + }, + { + "auxiliary_loss_clip": 0.0124043, + "auxiliary_loss_mlp": 0.00230295, + "balance_loss_clip": 1.02151012, + "balance_loss_mlp": 0.20441501, + "epoch": 0.9376822486096498, + "flos": 24129887806080.0, + "grad_norm": 9.34100108349207, + "language_loss": 0.81811535, + "learning_rate": 4.056164175257626e-08, + "loss": 0.83282256, + "num_input_tokens_seen": 336351445, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.25878906, + "step": 15596, + "time_per_iteration": 2.71138334274292 + }, + { + "auxiliary_loss_clip": 0.01247395, + "auxiliary_loss_mlp": 0.00231901, + "balance_loss_clip": 1.03117943, + "balance_loss_mlp": 0.20712924, + "epoch": 0.9377423718623178, + "flos": 22784028088320.0, + "grad_norm": 20.196752903894055, + "language_loss": 0.8462072, + "learning_rate": 4.0483640335597926e-08, + "loss": 0.86100018, + "num_input_tokens_seen": 336368690, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.24780273, + "step": 15597, + "time_per_iteration": 4.131103515625 + }, + { + "auxiliary_loss_clip": 0.01250066, + "auxiliary_loss_mlp": 0.00236423, + "balance_loss_clip": 1.02876329, + "balance_loss_mlp": 0.21115083, + "epoch": 0.9378024951149857, + "flos": 19168900790400.0, + "grad_norm": 13.748635552159744, + "language_loss": 0.87158203, + "learning_rate": 4.0405713223676363e-08, + "loss": 0.88644695, + "num_input_tokens_seen": 336388165, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.25292969, + "step": 15598, + "time_per_iteration": 2.6725594997406006 + }, + { + "auxiliary_loss_clip": 0.01280679, + "auxiliary_loss_mlp": 0.00237021, + "balance_loss_clip": 1.04786325, + "balance_loss_mlp": 0.20918593, + "epoch": 0.9378626183676537, + "flos": 23505508667520.0, + "grad_norm": 4.911277721057267, + "language_loss": 0.73466897, + "learning_rate": 4.0327860419766994e-08, + "loss": 0.74984598, + "num_input_tokens_seen": 336406475, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.27819824, + "step": 15599, + "time_per_iteration": 2.6827194690704346 + }, + { + "auxiliary_loss_clip": 0.01238463, + "auxiliary_loss_mlp": 0.00209474, + "balance_loss_clip": 1.02073383, + "balance_loss_mlp": 0.18447554, + "epoch": 0.9379227416203216, + "flos": 18405655672320.0, + "grad_norm": 443.5368653062814, + "language_loss": 0.8340435, + "learning_rate": 4.0250081926821e-08, + "loss": 0.84852278, + "num_input_tokens_seen": 336424690, + "router_z_loss_clip": 2.17871094, + "router_z_loss_mlp": 0.24975586, + "step": 15600, + "time_per_iteration": 2.6001124382019043 + }, + { + "auxiliary_loss_clip": 0.01229441, + "auxiliary_loss_mlp": 0.00218745, + "balance_loss_clip": 1.02088976, + "balance_loss_mlp": 0.19549896, + "epoch": 0.9379828648729897, + "flos": 17821855923840.0, + "grad_norm": 3.4352255670744762, + "language_loss": 0.79057831, + "learning_rate": 4.0172377747788474e-08, + "loss": 0.80506015, + "num_input_tokens_seen": 336443055, + "router_z_loss_clip": 2.08496094, + "router_z_loss_mlp": 0.23242188, + "step": 15601, + "time_per_iteration": 4.02301025390625 + }, + { + "auxiliary_loss_clip": 0.01090783, + "auxiliary_loss_mlp": 0.00101922, + "balance_loss_clip": 0.95688087, + "balance_loss_mlp": 0.09381577, + "epoch": 0.9380429881256576, + "flos": 68024399466240.0, + "grad_norm": 0.7371018664960794, + "language_loss": 0.57574141, + "learning_rate": 4.009474788561573e-08, + "loss": 0.58766854, + "num_input_tokens_seen": 336510190, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.08105469, + "step": 15602, + "time_per_iteration": 3.3500747680664062 + }, + { + "auxiliary_loss_clip": 0.0124523, + "auxiliary_loss_mlp": 0.00244236, + "balance_loss_clip": 1.02723718, + "balance_loss_mlp": 0.22007236, + "epoch": 0.9381031113783256, + "flos": 20776980769920.0, + "grad_norm": 7.549429443611423, + "language_loss": 0.82028437, + "learning_rate": 4.001719234324663e-08, + "loss": 0.83517909, + "num_input_tokens_seen": 336529250, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.24169922, + "step": 15603, + "time_per_iteration": 2.671347141265869 + }, + { + "auxiliary_loss_clip": 0.01216283, + "auxiliary_loss_mlp": 0.00230527, + "balance_loss_clip": 1.00939953, + "balance_loss_mlp": 0.20663723, + "epoch": 0.9381632346309935, + "flos": 19025078734080.0, + "grad_norm": 318.5546859022231, + "language_loss": 0.81854486, + "learning_rate": 3.993971112362171e-08, + "loss": 0.833013, + "num_input_tokens_seen": 336548530, + "router_z_loss_clip": 2.06738281, + "router_z_loss_mlp": 0.23901367, + "step": 15604, + "time_per_iteration": 2.6916964054107666 + }, + { + "auxiliary_loss_clip": 0.01250634, + "auxiliary_loss_mlp": 0.00232621, + "balance_loss_clip": 1.03052628, + "balance_loss_mlp": 0.2067645, + "epoch": 0.9382233578836615, + "flos": 23513840622720.0, + "grad_norm": 79.64482429123558, + "language_loss": 0.75084627, + "learning_rate": 3.9862304229679734e-08, + "loss": 0.76567876, + "num_input_tokens_seen": 336568510, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.25842285, + "step": 15605, + "time_per_iteration": 2.6507906913757324 + }, + { + "auxiliary_loss_clip": 0.0125165, + "auxiliary_loss_mlp": 0.00242309, + "balance_loss_clip": 1.03123748, + "balance_loss_mlp": 0.21479563, + "epoch": 0.9382834811363294, + "flos": 43067882016000.0, + "grad_norm": 14.286230486228872, + "language_loss": 0.74980503, + "learning_rate": 3.9784971664355683e-08, + "loss": 0.76474464, + "num_input_tokens_seen": 336592020, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.27526855, + "step": 15606, + "time_per_iteration": 2.892580986022949 + }, + { + "auxiliary_loss_clip": 0.01233699, + "auxiliary_loss_mlp": 0.00240415, + "balance_loss_clip": 1.02205586, + "balance_loss_mlp": 0.21684712, + "epoch": 0.9383436043889974, + "flos": 16436242828800.0, + "grad_norm": 49.432217073185825, + "language_loss": 0.84331143, + "learning_rate": 3.970771343058166e-08, + "loss": 0.85805255, + "num_input_tokens_seen": 336610010, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.23583984, + "step": 15607, + "time_per_iteration": 2.6296467781066895 + }, + { + "auxiliary_loss_clip": 0.01251161, + "auxiliary_loss_mlp": 0.00222664, + "balance_loss_clip": 1.03408122, + "balance_loss_mlp": 0.19937068, + "epoch": 0.9384037276416655, + "flos": 20740603271040.0, + "grad_norm": 5.86093155042477, + "language_loss": 0.89507985, + "learning_rate": 3.963052953128776e-08, + "loss": 0.90981811, + "num_input_tokens_seen": 336628520, + "router_z_loss_clip": 2.16894531, + "router_z_loss_mlp": 0.23303223, + "step": 15608, + "time_per_iteration": 2.7089502811431885 + }, + { + "auxiliary_loss_clip": 0.01255307, + "auxiliary_loss_mlp": 0.00216828, + "balance_loss_clip": 1.04004169, + "balance_loss_mlp": 0.19355837, + "epoch": 0.9384638508943334, + "flos": 19062677295360.0, + "grad_norm": 5.202069831087432, + "language_loss": 0.78269136, + "learning_rate": 3.9553419969400536e-08, + "loss": 0.79741275, + "num_input_tokens_seen": 336647365, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.23278809, + "step": 15609, + "time_per_iteration": 2.6806883811950684 + }, + { + "auxiliary_loss_clip": 0.01242111, + "auxiliary_loss_mlp": 0.00234266, + "balance_loss_clip": 1.02071404, + "balance_loss_mlp": 0.20888655, + "epoch": 0.9385239741470014, + "flos": 23404887694080.0, + "grad_norm": 6.457780001609047, + "language_loss": 0.83411562, + "learning_rate": 3.9476384747844316e-08, + "loss": 0.84887934, + "num_input_tokens_seen": 336667165, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.25415039, + "step": 15610, + "time_per_iteration": 2.767110586166382 + }, + { + "auxiliary_loss_clip": 0.01235812, + "auxiliary_loss_mlp": 0.00235364, + "balance_loss_clip": 1.02456987, + "balance_loss_mlp": 0.21189138, + "epoch": 0.9385840973996693, + "flos": 12824742804480.0, + "grad_norm": 13.079266632995255, + "language_loss": 0.83590078, + "learning_rate": 3.939942386953987e-08, + "loss": 0.85061252, + "num_input_tokens_seen": 336684130, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.23498535, + "step": 15611, + "time_per_iteration": 2.6698975563049316 + }, + { + "auxiliary_loss_clip": 0.01247282, + "auxiliary_loss_mlp": 0.00212267, + "balance_loss_clip": 1.03052449, + "balance_loss_mlp": 0.18831766, + "epoch": 0.9386442206523373, + "flos": 15486980152320.0, + "grad_norm": 53.94276312771302, + "language_loss": 0.74979019, + "learning_rate": 3.9322537337405756e-08, + "loss": 0.76438558, + "num_input_tokens_seen": 336701520, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.23950195, + "step": 15612, + "time_per_iteration": 2.6834936141967773 + }, + { + "auxiliary_loss_clip": 0.01229521, + "auxiliary_loss_mlp": 0.00216509, + "balance_loss_clip": 1.01656771, + "balance_loss_mlp": 0.19010389, + "epoch": 0.9387043439050052, + "flos": 21178821196800.0, + "grad_norm": 16.945195546177292, + "language_loss": 0.66490912, + "learning_rate": 3.924572515435742e-08, + "loss": 0.67936945, + "num_input_tokens_seen": 336720675, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.26416016, + "step": 15613, + "time_per_iteration": 2.6714515686035156 + }, + { + "auxiliary_loss_clip": 0.01241736, + "auxiliary_loss_mlp": 0.00234087, + "balance_loss_clip": 1.02465951, + "balance_loss_mlp": 0.20869544, + "epoch": 0.9387644671576733, + "flos": 27668273696640.0, + "grad_norm": 11.60002047952367, + "language_loss": 0.78522861, + "learning_rate": 3.916898732330764e-08, + "loss": 0.79998684, + "num_input_tokens_seen": 336741005, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.25402832, + "step": 15614, + "time_per_iteration": 2.779069423675537 + }, + { + "auxiliary_loss_clip": 0.01256194, + "auxiliary_loss_mlp": 0.00239693, + "balance_loss_clip": 1.0326736, + "balance_loss_mlp": 0.21253729, + "epoch": 0.9388245904103412, + "flos": 18836331742080.0, + "grad_norm": 15.474924624744057, + "language_loss": 0.90762484, + "learning_rate": 3.9092323847166544e-08, + "loss": 0.9225837, + "num_input_tokens_seen": 336757990, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.27160645, + "step": 15615, + "time_per_iteration": 2.6993496417999268 + }, + { + "auxiliary_loss_clip": 0.01243446, + "auxiliary_loss_mlp": 0.00223868, + "balance_loss_clip": 1.02536201, + "balance_loss_mlp": 0.19971621, + "epoch": 0.9388847136630092, + "flos": 25483828083840.0, + "grad_norm": 12.284332679502512, + "language_loss": 0.77541935, + "learning_rate": 3.901573472884134e-08, + "loss": 0.79009253, + "num_input_tokens_seen": 336777705, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.24145508, + "step": 15616, + "time_per_iteration": 2.699192523956299 + }, + { + "auxiliary_loss_clip": 0.01243905, + "auxiliary_loss_mlp": 0.00220704, + "balance_loss_clip": 1.02896047, + "balance_loss_mlp": 0.19777964, + "epoch": 0.9389448369156771, + "flos": 18734992496640.0, + "grad_norm": 39.26288293255402, + "language_loss": 0.74589604, + "learning_rate": 3.89392199712355e-08, + "loss": 0.76054209, + "num_input_tokens_seen": 336798275, + "router_z_loss_clip": 2.14746094, + "router_z_loss_mlp": 0.22937012, + "step": 15617, + "time_per_iteration": 2.6532461643218994 + }, + { + "auxiliary_loss_clip": 0.01238896, + "auxiliary_loss_mlp": 0.00220168, + "balance_loss_clip": 1.02008963, + "balance_loss_mlp": 0.19508649, + "epoch": 0.9390049601683451, + "flos": 21717839664000.0, + "grad_norm": 10.006928979725021, + "language_loss": 0.83380651, + "learning_rate": 3.886277957725092e-08, + "loss": 0.8483972, + "num_input_tokens_seen": 336813835, + "router_z_loss_clip": 2.19042969, + "router_z_loss_mlp": 0.25073242, + "step": 15618, + "time_per_iteration": 2.7879798412323 + }, + { + "auxiliary_loss_clip": 0.01246857, + "auxiliary_loss_mlp": 0.00200213, + "balance_loss_clip": 1.02245426, + "balance_loss_mlp": 0.17451145, + "epoch": 0.939065083421013, + "flos": 19391224020480.0, + "grad_norm": 10.80969959299753, + "language_loss": 0.78913873, + "learning_rate": 3.878641354978662e-08, + "loss": 0.80360943, + "num_input_tokens_seen": 336832210, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.25695801, + "step": 15619, + "time_per_iteration": 2.663057804107666 + }, + { + "auxiliary_loss_clip": 0.01254989, + "auxiliary_loss_mlp": 0.00233386, + "balance_loss_clip": 1.03325677, + "balance_loss_mlp": 0.20804235, + "epoch": 0.939125206673681, + "flos": 24681511946880.0, + "grad_norm": 13.383182424451329, + "language_loss": 0.87282836, + "learning_rate": 3.8710121891737834e-08, + "loss": 0.88771206, + "num_input_tokens_seen": 336851380, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.25354004, + "step": 15620, + "time_per_iteration": 2.734651565551758 + }, + { + "auxiliary_loss_clip": 0.012471, + "auxiliary_loss_mlp": 0.00235201, + "balance_loss_clip": 1.02617967, + "balance_loss_mlp": 0.20827135, + "epoch": 0.9391853299263491, + "flos": 16325961096960.0, + "grad_norm": 13.772363416544232, + "language_loss": 0.82441831, + "learning_rate": 3.8633904605998025e-08, + "loss": 0.83924127, + "num_input_tokens_seen": 336868525, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.26916504, + "step": 15621, + "time_per_iteration": 2.664625644683838 + }, + { + "auxiliary_loss_clip": 0.01265256, + "auxiliary_loss_mlp": 0.00219515, + "balance_loss_clip": 1.03538752, + "balance_loss_mlp": 0.19240654, + "epoch": 0.939245453179017, + "flos": 11655778590720.0, + "grad_norm": 37.20460682349716, + "language_loss": 0.80092055, + "learning_rate": 3.855776169545688e-08, + "loss": 0.81576824, + "num_input_tokens_seen": 336886200, + "router_z_loss_clip": 2.29785156, + "router_z_loss_mlp": 0.27087402, + "step": 15622, + "time_per_iteration": 2.645827293395996 + }, + { + "auxiliary_loss_clip": 0.01230015, + "auxiliary_loss_mlp": 0.00213784, + "balance_loss_clip": 1.01795435, + "balance_loss_mlp": 0.19121777, + "epoch": 0.939305576431685, + "flos": 23148700917120.0, + "grad_norm": 159.65286078472315, + "language_loss": 0.80688196, + "learning_rate": 3.848169316300209e-08, + "loss": 0.82132006, + "num_input_tokens_seen": 336905815, + "router_z_loss_clip": 2.12207031, + "router_z_loss_mlp": 0.22570801, + "step": 15623, + "time_per_iteration": 2.6366541385650635 + }, + { + "auxiliary_loss_clip": 0.01258747, + "auxiliary_loss_mlp": 0.00229461, + "balance_loss_clip": 1.03830314, + "balance_loss_mlp": 0.20302007, + "epoch": 0.9393656996843529, + "flos": 33287790706560.0, + "grad_norm": 4.426365744157299, + "language_loss": 0.80840802, + "learning_rate": 3.84056990115178e-08, + "loss": 0.82329011, + "num_input_tokens_seen": 336928460, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.2644043, + "step": 15624, + "time_per_iteration": 2.811030149459839 + }, + { + "auxiliary_loss_clip": 0.01237323, + "auxiliary_loss_mlp": 0.00210727, + "balance_loss_clip": 1.02009726, + "balance_loss_mlp": 0.18616965, + "epoch": 0.9394258229370209, + "flos": 21689434984320.0, + "grad_norm": 34.411038116492236, + "language_loss": 0.9703005, + "learning_rate": 3.832977924388614e-08, + "loss": 0.98478091, + "num_input_tokens_seen": 336948320, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.24572754, + "step": 15625, + "time_per_iteration": 2.650332450866699 + }, + { + "auxiliary_loss_clip": 0.01229424, + "auxiliary_loss_mlp": 0.00213564, + "balance_loss_clip": 1.01394463, + "balance_loss_mlp": 0.18841124, + "epoch": 0.9394859461896888, + "flos": 23874203819520.0, + "grad_norm": 28.30830936233703, + "language_loss": 0.92614198, + "learning_rate": 3.825393386298592e-08, + "loss": 0.9405719, + "num_input_tokens_seen": 336967670, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.25158691, + "step": 15626, + "time_per_iteration": 2.783263921737671 + }, + { + "auxiliary_loss_clip": 0.01090569, + "auxiliary_loss_mlp": 0.00096645, + "balance_loss_clip": 0.95351541, + "balance_loss_mlp": 0.08849104, + "epoch": 0.9395460694423569, + "flos": 61566116993280.0, + "grad_norm": 2.2826832672892454, + "language_loss": 0.55237955, + "learning_rate": 3.8178162871693284e-08, + "loss": 0.56425166, + "num_input_tokens_seen": 337028395, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.08154297, + "step": 15627, + "time_per_iteration": 3.1157634258270264 + }, + { + "auxiliary_loss_clip": 0.01241674, + "auxiliary_loss_mlp": 0.00218691, + "balance_loss_clip": 1.02499735, + "balance_loss_mlp": 0.1939434, + "epoch": 0.9396061926950248, + "flos": 20995712640000.0, + "grad_norm": 3.597551364923694, + "language_loss": 0.77822161, + "learning_rate": 3.810246627288105e-08, + "loss": 0.79282522, + "num_input_tokens_seen": 337048150, + "router_z_loss_clip": 2.16503906, + "router_z_loss_mlp": 0.24755859, + "step": 15628, + "time_per_iteration": 2.6912858486175537 + }, + { + "auxiliary_loss_clip": 0.01233095, + "auxiliary_loss_mlp": 0.00219292, + "balance_loss_clip": 1.02182245, + "balance_loss_mlp": 0.19666627, + "epoch": 0.9396663159476928, + "flos": 27487786832640.0, + "grad_norm": 11.761129850436555, + "language_loss": 0.81836796, + "learning_rate": 3.8026844069420025e-08, + "loss": 0.83289182, + "num_input_tokens_seen": 337069315, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.22631836, + "step": 15629, + "time_per_iteration": 2.7018728256225586 + }, + { + "auxiliary_loss_clip": 0.01230228, + "auxiliary_loss_mlp": 0.00208662, + "balance_loss_clip": 1.02030718, + "balance_loss_mlp": 0.1860723, + "epoch": 0.9397264392003607, + "flos": 19427457864960.0, + "grad_norm": 176.1296119216745, + "language_loss": 0.80932754, + "learning_rate": 3.795129626417748e-08, + "loss": 0.82371646, + "num_input_tokens_seen": 337087765, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.22583008, + "step": 15630, + "time_per_iteration": 2.6834423542022705 + }, + { + "auxiliary_loss_clip": 0.01238088, + "auxiliary_loss_mlp": 0.00215468, + "balance_loss_clip": 1.02429032, + "balance_loss_mlp": 0.19170983, + "epoch": 0.9397865624530287, + "flos": 18004820826240.0, + "grad_norm": 10.15756685524353, + "language_loss": 0.77395606, + "learning_rate": 3.787582286001845e-08, + "loss": 0.78849167, + "num_input_tokens_seen": 337106265, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.23742676, + "step": 15631, + "time_per_iteration": 3.9837496280670166 + }, + { + "auxiliary_loss_clip": 0.01231066, + "auxiliary_loss_mlp": 0.00216114, + "balance_loss_clip": 1.02007151, + "balance_loss_mlp": 0.19380975, + "epoch": 0.9398466857056966, + "flos": 22564613859840.0, + "grad_norm": 10.352757855643768, + "language_loss": 0.80600321, + "learning_rate": 3.7800423859805086e-08, + "loss": 0.82047504, + "num_input_tokens_seen": 337126090, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.22314453, + "step": 15632, + "time_per_iteration": 4.11297082901001 + }, + { + "auxiliary_loss_clip": 0.01274906, + "auxiliary_loss_mlp": 0.00223225, + "balance_loss_clip": 1.04364145, + "balance_loss_mlp": 0.19518733, + "epoch": 0.9399068089583646, + "flos": 24535678728960.0, + "grad_norm": 4.163891407541728, + "language_loss": 0.82799459, + "learning_rate": 3.772509926639622e-08, + "loss": 0.84297597, + "num_input_tokens_seen": 337145655, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.28063965, + "step": 15633, + "time_per_iteration": 2.6861555576324463 + }, + { + "auxiliary_loss_clip": 0.01250414, + "auxiliary_loss_mlp": 0.00242885, + "balance_loss_clip": 1.03059173, + "balance_loss_mlp": 0.21636045, + "epoch": 0.9399669322110327, + "flos": 25630343660160.0, + "grad_norm": 22.90618264421856, + "language_loss": 0.79794091, + "learning_rate": 3.764984908264823e-08, + "loss": 0.81287396, + "num_input_tokens_seen": 337164805, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.26525879, + "step": 15634, + "time_per_iteration": 2.697537660598755 + }, + { + "auxiliary_loss_clip": 0.01241188, + "auxiliary_loss_mlp": 0.00227368, + "balance_loss_clip": 1.02376604, + "balance_loss_mlp": 0.20240557, + "epoch": 0.9400270554637006, + "flos": 17089385783040.0, + "grad_norm": 27.88210834808814, + "language_loss": 0.77165049, + "learning_rate": 3.75746733114144e-08, + "loss": 0.78633606, + "num_input_tokens_seen": 337182280, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.24987793, + "step": 15635, + "time_per_iteration": 2.638913631439209 + }, + { + "auxiliary_loss_clip": 0.01216617, + "auxiliary_loss_mlp": 0.0022427, + "balance_loss_clip": 1.00301909, + "balance_loss_mlp": 0.20040408, + "epoch": 0.9400871787163686, + "flos": 22055113393920.0, + "grad_norm": 5.059159360943777, + "language_loss": 0.80568051, + "learning_rate": 3.7499571955545985e-08, + "loss": 0.82008934, + "num_input_tokens_seen": 337203495, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.2388916, + "step": 15636, + "time_per_iteration": 2.7413289546966553 + }, + { + "auxiliary_loss_clip": 0.01247489, + "auxiliary_loss_mlp": 0.00234239, + "balance_loss_clip": 1.03246784, + "balance_loss_mlp": 0.2095392, + "epoch": 0.9401473019690365, + "flos": 16982767238400.0, + "grad_norm": 9.00614526166143, + "language_loss": 0.92175972, + "learning_rate": 3.7424545017890054e-08, + "loss": 0.93657708, + "num_input_tokens_seen": 337220435, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.24743652, + "step": 15637, + "time_per_iteration": 2.5982284545898438 + }, + { + "auxiliary_loss_clip": 0.0125005, + "auxiliary_loss_mlp": 0.00246433, + "balance_loss_clip": 1.0272119, + "balance_loss_mlp": 0.21983761, + "epoch": 0.9402074252217045, + "flos": 19681956702720.0, + "grad_norm": 7.283390182518419, + "language_loss": 0.77404106, + "learning_rate": 3.7349592501292325e-08, + "loss": 0.78900588, + "num_input_tokens_seen": 337238095, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.26611328, + "step": 15638, + "time_per_iteration": 2.7237775325775146 + }, + { + "auxiliary_loss_clip": 0.01224397, + "auxiliary_loss_mlp": 0.00226458, + "balance_loss_clip": 1.01642239, + "balance_loss_mlp": 0.20302191, + "epoch": 0.9402675484743724, + "flos": 24754302858240.0, + "grad_norm": 97.55517157476578, + "language_loss": 0.909621, + "learning_rate": 3.727471440859498e-08, + "loss": 0.92412961, + "num_input_tokens_seen": 337256645, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.234375, + "step": 15639, + "time_per_iteration": 2.6810500621795654 + }, + { + "auxiliary_loss_clip": 0.01232658, + "auxiliary_loss_mlp": 0.00228331, + "balance_loss_clip": 1.01874518, + "balance_loss_mlp": 0.20402375, + "epoch": 0.9403276717270405, + "flos": 25558630156800.0, + "grad_norm": 20.55576841693115, + "language_loss": 0.83593059, + "learning_rate": 3.719991074263662e-08, + "loss": 0.85054046, + "num_input_tokens_seen": 337278360, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.24328613, + "step": 15640, + "time_per_iteration": 4.236801624298096 + }, + { + "auxiliary_loss_clip": 0.01246773, + "auxiliary_loss_mlp": 0.00237241, + "balance_loss_clip": 1.02834892, + "balance_loss_mlp": 0.21223059, + "epoch": 0.9403877949797084, + "flos": 26689852154880.0, + "grad_norm": 2.687163274392545, + "language_loss": 0.79833561, + "learning_rate": 3.7125181506254544e-08, + "loss": 0.8131758, + "num_input_tokens_seen": 337302480, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.25024414, + "step": 15641, + "time_per_iteration": 2.7950894832611084 + }, + { + "auxiliary_loss_clip": 0.01265237, + "auxiliary_loss_mlp": 0.00261548, + "balance_loss_clip": 1.03668773, + "balance_loss_mlp": 0.23112628, + "epoch": 0.9404479182323764, + "flos": 15011666455680.0, + "grad_norm": 158.97083110046825, + "language_loss": 0.923531, + "learning_rate": 3.7050526702282256e-08, + "loss": 0.9387989, + "num_input_tokens_seen": 337316600, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.30407715, + "step": 15642, + "time_per_iteration": 2.636763334274292 + }, + { + "auxiliary_loss_clip": 0.01239127, + "auxiliary_loss_mlp": 0.00209139, + "balance_loss_clip": 1.026124, + "balance_loss_mlp": 0.18411711, + "epoch": 0.9405080414850443, + "flos": 24973573432320.0, + "grad_norm": 15.072632171253085, + "language_loss": 0.76460469, + "learning_rate": 3.697594633355084e-08, + "loss": 0.77908742, + "num_input_tokens_seen": 337336895, + "router_z_loss_clip": 2.12988281, + "router_z_loss_mlp": 0.25, + "step": 15643, + "time_per_iteration": 4.086908340454102 + }, + { + "auxiliary_loss_clip": 0.0125455, + "auxiliary_loss_mlp": 0.00232653, + "balance_loss_clip": 1.03472161, + "balance_loss_mlp": 0.20544964, + "epoch": 0.9405681647377123, + "flos": 20844743777280.0, + "grad_norm": 27.530910808664522, + "language_loss": 0.83704811, + "learning_rate": 3.6901440402888226e-08, + "loss": 0.85192013, + "num_input_tokens_seen": 337355105, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.27209473, + "step": 15644, + "time_per_iteration": 2.7179343700408936 + }, + { + "auxiliary_loss_clip": 0.0122738, + "auxiliary_loss_mlp": 0.00215655, + "balance_loss_clip": 1.01503801, + "balance_loss_mlp": 0.19257598, + "epoch": 0.9406282879903802, + "flos": 23805578885760.0, + "grad_norm": 3.591609952433629, + "language_loss": 0.75482804, + "learning_rate": 3.682700891311974e-08, + "loss": 0.76925844, + "num_input_tokens_seen": 337374905, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.23095703, + "step": 15645, + "time_per_iteration": 2.7667152881622314 + }, + { + "auxiliary_loss_clip": 0.01228871, + "auxiliary_loss_mlp": 0.00216694, + "balance_loss_clip": 1.01516581, + "balance_loss_mlp": 0.1922922, + "epoch": 0.9406884112430483, + "flos": 27674953626240.0, + "grad_norm": 5.865773232753186, + "language_loss": 0.75676346, + "learning_rate": 3.6752651867067774e-08, + "loss": 0.77121913, + "num_input_tokens_seen": 337397130, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.24389648, + "step": 15646, + "time_per_iteration": 2.8120265007019043 + }, + { + "auxiliary_loss_clip": 0.01227737, + "auxiliary_loss_mlp": 0.00226074, + "balance_loss_clip": 1.01789224, + "balance_loss_mlp": 0.20211318, + "epoch": 0.9407485344957163, + "flos": 23075048079360.0, + "grad_norm": 18.661525358271636, + "language_loss": 0.80407864, + "learning_rate": 3.667836926755208e-08, + "loss": 0.81861675, + "num_input_tokens_seen": 337418660, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.23974609, + "step": 15647, + "time_per_iteration": 2.6581857204437256 + }, + { + "auxiliary_loss_clip": 0.01095832, + "auxiliary_loss_mlp": 0.00068686, + "balance_loss_clip": 0.95981598, + "balance_loss_mlp": 0.0615331, + "epoch": 0.9408086577483842, + "flos": 71014034304000.0, + "grad_norm": 1.0260680415798746, + "language_loss": 0.62633562, + "learning_rate": 3.660416111738907e-08, + "loss": 0.63798082, + "num_input_tokens_seen": 337478055, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.07128906, + "step": 15648, + "time_per_iteration": 3.289170026779175 + }, + { + "auxiliary_loss_clip": 0.01219973, + "auxiliary_loss_mlp": 0.00212746, + "balance_loss_clip": 1.01106191, + "balance_loss_mlp": 0.19067986, + "epoch": 0.9408687810010522, + "flos": 23730956380800.0, + "grad_norm": 8.370204705130892, + "language_loss": 0.72940803, + "learning_rate": 3.653002741939337e-08, + "loss": 0.74373519, + "num_input_tokens_seen": 337499405, + "router_z_loss_clip": 2.09472656, + "router_z_loss_mlp": 0.2208252, + "step": 15649, + "time_per_iteration": 2.723191261291504 + }, + { + "auxiliary_loss_clip": 0.01247651, + "auxiliary_loss_mlp": 0.00209631, + "balance_loss_clip": 1.0252775, + "balance_loss_mlp": 0.18444172, + "epoch": 0.9409289042537201, + "flos": 18369314087040.0, + "grad_norm": 7.189506878780832, + "language_loss": 0.85659873, + "learning_rate": 3.645596817637586e-08, + "loss": 0.87117159, + "num_input_tokens_seen": 337517195, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.2520752, + "step": 15650, + "time_per_iteration": 2.652817964553833 + }, + { + "auxiliary_loss_clip": 0.01245194, + "auxiliary_loss_mlp": 0.00215154, + "balance_loss_clip": 1.02594757, + "balance_loss_mlp": 0.19114533, + "epoch": 0.9409890275063881, + "flos": 23878333883520.0, + "grad_norm": 147.28767515227554, + "language_loss": 0.81290758, + "learning_rate": 3.638198339114451e-08, + "loss": 0.82751107, + "num_input_tokens_seen": 337535245, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.24035645, + "step": 15651, + "time_per_iteration": 2.732922077178955 + }, + { + "auxiliary_loss_clip": 0.01227989, + "auxiliary_loss_mlp": 0.00213045, + "balance_loss_clip": 1.0156666, + "balance_loss_mlp": 0.18891658, + "epoch": 0.941049150759056, + "flos": 16545088016640.0, + "grad_norm": 142.28307556640908, + "language_loss": 0.80218947, + "learning_rate": 3.630807306650507e-08, + "loss": 0.81659979, + "num_input_tokens_seen": 337553040, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.24121094, + "step": 15652, + "time_per_iteration": 2.6501452922821045 + }, + { + "auxiliary_loss_clip": 0.01252789, + "auxiliary_loss_mlp": 0.00227181, + "balance_loss_clip": 1.02980638, + "balance_loss_mlp": 0.19904782, + "epoch": 0.9411092740117241, + "flos": 25118401069440.0, + "grad_norm": 5.349472972063294, + "language_loss": 0.73626351, + "learning_rate": 3.6234237205260645e-08, + "loss": 0.75106329, + "num_input_tokens_seen": 337574580, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.28173828, + "step": 15653, + "time_per_iteration": 2.7915754318237305 + }, + { + "auxiliary_loss_clip": 0.01260927, + "auxiliary_loss_mlp": 0.00235411, + "balance_loss_clip": 1.03963065, + "balance_loss_mlp": 0.2102699, + "epoch": 0.941169397264392, + "flos": 21142264129920.0, + "grad_norm": 7.607088992613365, + "language_loss": 0.87590325, + "learning_rate": 3.6160475810210536e-08, + "loss": 0.89086658, + "num_input_tokens_seen": 337593010, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.25146484, + "step": 15654, + "time_per_iteration": 2.714036226272583 + }, + { + "auxiliary_loss_clip": 0.01244065, + "auxiliary_loss_mlp": 0.00238477, + "balance_loss_clip": 1.02544415, + "balance_loss_mlp": 0.21253729, + "epoch": 0.94122952051706, + "flos": 38508914995200.0, + "grad_norm": 2.284868444639572, + "language_loss": 0.75053543, + "learning_rate": 3.6086788884152065e-08, + "loss": 0.76536083, + "num_input_tokens_seen": 337616170, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.25952148, + "step": 15655, + "time_per_iteration": 2.8940610885620117 + }, + { + "auxiliary_loss_clip": 0.01240458, + "auxiliary_loss_mlp": 0.00234995, + "balance_loss_clip": 1.02299833, + "balance_loss_mlp": 0.20954387, + "epoch": 0.9412896437697279, + "flos": 18369206346240.0, + "grad_norm": 1983.1586769552289, + "language_loss": 0.81338745, + "learning_rate": 3.601317642987944e-08, + "loss": 0.82814199, + "num_input_tokens_seen": 337635215, + "router_z_loss_clip": 2.17480469, + "router_z_loss_mlp": 0.25439453, + "step": 15656, + "time_per_iteration": 2.728567600250244 + }, + { + "auxiliary_loss_clip": 0.01223967, + "auxiliary_loss_mlp": 0.0020751, + "balance_loss_clip": 1.0130322, + "balance_loss_mlp": 0.1824642, + "epoch": 0.9413497670223959, + "flos": 25884950238720.0, + "grad_norm": 1629.7223447638899, + "language_loss": 0.86420512, + "learning_rate": 3.593963845018377e-08, + "loss": 0.87851983, + "num_input_tokens_seen": 337654195, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.25048828, + "step": 15657, + "time_per_iteration": 2.8645620346069336 + }, + { + "auxiliary_loss_clip": 0.01244838, + "auxiliary_loss_mlp": 0.00213085, + "balance_loss_clip": 1.02200747, + "balance_loss_mlp": 0.18832564, + "epoch": 0.9414098902750638, + "flos": 16618309891200.0, + "grad_norm": 5.645295654178704, + "language_loss": 0.93124795, + "learning_rate": 3.586617494785371e-08, + "loss": 0.94582713, + "num_input_tokens_seen": 337671810, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.24780273, + "step": 15658, + "time_per_iteration": 2.70286226272583 + }, + { + "auxiliary_loss_clip": 0.01257021, + "auxiliary_loss_mlp": 0.00226153, + "balance_loss_clip": 1.0328474, + "balance_loss_mlp": 0.19995122, + "epoch": 0.9414700135277319, + "flos": 18625033987200.0, + "grad_norm": 255.95684633953167, + "language_loss": 0.80776066, + "learning_rate": 3.5792785925675254e-08, + "loss": 0.82259238, + "num_input_tokens_seen": 337689410, + "router_z_loss_clip": 2.24121094, + "router_z_loss_mlp": 0.26208496, + "step": 15659, + "time_per_iteration": 2.656669855117798 + }, + { + "auxiliary_loss_clip": 0.01240213, + "auxiliary_loss_mlp": 0.002079, + "balance_loss_clip": 1.02769852, + "balance_loss_mlp": 0.18352146, + "epoch": 0.9415301367803999, + "flos": 26280146649600.0, + "grad_norm": 11.09209277650587, + "language_loss": 0.8641237, + "learning_rate": 3.571947138643172e-08, + "loss": 0.87860483, + "num_input_tokens_seen": 337709950, + "router_z_loss_clip": 2.12597656, + "router_z_loss_mlp": 0.24377441, + "step": 15660, + "time_per_iteration": 2.727726697921753 + }, + { + "auxiliary_loss_clip": 0.01219394, + "auxiliary_loss_mlp": 0.00214335, + "balance_loss_clip": 1.00951374, + "balance_loss_mlp": 0.1901716, + "epoch": 0.9415902600330678, + "flos": 23261388860160.0, + "grad_norm": 22.883985712888748, + "language_loss": 0.73730528, + "learning_rate": 3.564623133290201e-08, + "loss": 0.75164258, + "num_input_tokens_seen": 337731320, + "router_z_loss_clip": 2.10058594, + "router_z_loss_mlp": 0.24169922, + "step": 15661, + "time_per_iteration": 2.7433485984802246 + }, + { + "auxiliary_loss_clip": 0.01227765, + "auxiliary_loss_mlp": 0.00212382, + "balance_loss_clip": 1.01228809, + "balance_loss_mlp": 0.18824178, + "epoch": 0.9416503832857358, + "flos": 14719138093440.0, + "grad_norm": 5.525631363688128, + "language_loss": 0.76446772, + "learning_rate": 3.557306576786434e-08, + "loss": 0.77886915, + "num_input_tokens_seen": 337747720, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.24145508, + "step": 15662, + "time_per_iteration": 2.7142772674560547 + }, + { + "auxiliary_loss_clip": 0.01092847, + "auxiliary_loss_mlp": 0.00078251, + "balance_loss_clip": 0.95503354, + "balance_loss_mlp": 0.0702879, + "epoch": 0.9417105065384037, + "flos": 70312698276480.0, + "grad_norm": 0.7501407916938616, + "language_loss": 0.58655727, + "learning_rate": 3.5499974694092935e-08, + "loss": 0.59826827, + "num_input_tokens_seen": 337806930, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.07958984, + "step": 15663, + "time_per_iteration": 3.2916345596313477 + }, + { + "auxiliary_loss_clip": 0.01265942, + "auxiliary_loss_mlp": 0.00244355, + "balance_loss_clip": 1.03843009, + "balance_loss_mlp": 0.21722302, + "epoch": 0.9417706297910717, + "flos": 34057895322240.0, + "grad_norm": 3.549241326502264, + "language_loss": 0.75702524, + "learning_rate": 3.542695811435914e-08, + "loss": 0.77212822, + "num_input_tokens_seen": 337828100, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.27111816, + "step": 15664, + "time_per_iteration": 2.9771177768707275 + }, + { + "auxiliary_loss_clip": 0.01240319, + "auxiliary_loss_mlp": 0.00237972, + "balance_loss_clip": 1.02715683, + "balance_loss_mlp": 0.21237752, + "epoch": 0.9418307530437396, + "flos": 16471614746880.0, + "grad_norm": 11.405582327334304, + "language_loss": 0.83135545, + "learning_rate": 3.535401603143207e-08, + "loss": 0.84613836, + "num_input_tokens_seen": 337844805, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.25585938, + "step": 15665, + "time_per_iteration": 2.663813829421997 + }, + { + "auxiliary_loss_clip": 0.01224038, + "auxiliary_loss_mlp": 0.00229821, + "balance_loss_clip": 1.01609814, + "balance_loss_mlp": 0.20588349, + "epoch": 0.9418908762964077, + "flos": 11253543114240.0, + "grad_norm": 81.93067326715413, + "language_loss": 0.71682018, + "learning_rate": 3.528114844807773e-08, + "loss": 0.73135877, + "num_input_tokens_seen": 337860490, + "router_z_loss_clip": 2.08300781, + "router_z_loss_mlp": 0.23937988, + "step": 15666, + "time_per_iteration": 2.6352028846740723 + }, + { + "auxiliary_loss_clip": 0.01250229, + "auxiliary_loss_mlp": 0.00238396, + "balance_loss_clip": 1.02857482, + "balance_loss_mlp": 0.21208695, + "epoch": 0.9419509995490756, + "flos": 18438836860800.0, + "grad_norm": 93.83440938262397, + "language_loss": 0.86540508, + "learning_rate": 3.520835536705902e-08, + "loss": 0.8802914, + "num_input_tokens_seen": 337878360, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.26293945, + "step": 15667, + "time_per_iteration": 2.6307830810546875 + }, + { + "auxiliary_loss_clip": 0.0121948, + "auxiliary_loss_mlp": 0.00209995, + "balance_loss_clip": 1.01027179, + "balance_loss_mlp": 0.18635587, + "epoch": 0.9420111228017436, + "flos": 20737945664640.0, + "grad_norm": 10.796567202950989, + "language_loss": 0.82885945, + "learning_rate": 3.5135636791136404e-08, + "loss": 0.84315419, + "num_input_tokens_seen": 337895635, + "router_z_loss_clip": 2.09277344, + "router_z_loss_mlp": 0.23669434, + "step": 15668, + "time_per_iteration": 2.6976938247680664 + }, + { + "auxiliary_loss_clip": 0.01246405, + "auxiliary_loss_mlp": 0.00226576, + "balance_loss_clip": 1.02378416, + "balance_loss_mlp": 0.20086288, + "epoch": 0.9420712460544115, + "flos": 21141940907520.0, + "grad_norm": 688.5155290487963, + "language_loss": 0.71732587, + "learning_rate": 3.506299272306723e-08, + "loss": 0.73205566, + "num_input_tokens_seen": 337913940, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.25744629, + "step": 15669, + "time_per_iteration": 2.6740849018096924 + }, + { + "auxiliary_loss_clip": 0.01230405, + "auxiliary_loss_mlp": 0.00222815, + "balance_loss_clip": 1.0191896, + "balance_loss_mlp": 0.1988782, + "epoch": 0.9421313693070795, + "flos": 15851760721920.0, + "grad_norm": 29.01230211677344, + "language_loss": 0.84367359, + "learning_rate": 3.4990423165606406e-08, + "loss": 0.8582058, + "num_input_tokens_seen": 337932015, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.23937988, + "step": 15670, + "time_per_iteration": 2.63840651512146 + }, + { + "auxiliary_loss_clip": 0.01265909, + "auxiliary_loss_mlp": 0.00249883, + "balance_loss_clip": 1.03801453, + "balance_loss_mlp": 0.22264403, + "epoch": 0.9421914925597474, + "flos": 32415915882240.0, + "grad_norm": 8.769852245690254, + "language_loss": 0.72789866, + "learning_rate": 3.491792812150574e-08, + "loss": 0.7430566, + "num_input_tokens_seen": 337953345, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.27246094, + "step": 15671, + "time_per_iteration": 2.737896680831909 + }, + { + "auxiliary_loss_clip": 0.01232351, + "auxiliary_loss_mlp": 0.00223913, + "balance_loss_clip": 1.01753783, + "balance_loss_mlp": 0.19918934, + "epoch": 0.9422516158124155, + "flos": 19718513769600.0, + "grad_norm": 2.3917286994247577, + "language_loss": 0.85724956, + "learning_rate": 3.48455075935139e-08, + "loss": 0.87181222, + "num_input_tokens_seen": 337973685, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.24731445, + "step": 15672, + "time_per_iteration": 2.6532561779022217 + }, + { + "auxiliary_loss_clip": 0.01249036, + "auxiliary_loss_mlp": 0.00240271, + "balance_loss_clip": 1.02469921, + "balance_loss_mlp": 0.21307981, + "epoch": 0.9423117390650835, + "flos": 16253277926400.0, + "grad_norm": 10.062525107733494, + "language_loss": 0.82210726, + "learning_rate": 3.47731615843776e-08, + "loss": 0.83700037, + "num_input_tokens_seen": 337989175, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.27209473, + "step": 15673, + "time_per_iteration": 4.03102707862854 + }, + { + "auxiliary_loss_clip": 0.01233968, + "auxiliary_loss_mlp": 0.00206832, + "balance_loss_clip": 1.01470399, + "balance_loss_mlp": 0.18157168, + "epoch": 0.9423718623177514, + "flos": 31796564647680.0, + "grad_norm": 23.47217890119892, + "language_loss": 0.76989591, + "learning_rate": 3.470089009683974e-08, + "loss": 0.78430396, + "num_input_tokens_seen": 338011800, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.25244141, + "step": 15674, + "time_per_iteration": 4.169728994369507 + }, + { + "auxiliary_loss_clip": 0.01238294, + "auxiliary_loss_mlp": 0.00220918, + "balance_loss_clip": 1.02121854, + "balance_loss_mlp": 0.19646837, + "epoch": 0.9424319855704194, + "flos": 23331809473920.0, + "grad_norm": 37.91519345134589, + "language_loss": 0.88790083, + "learning_rate": 3.462869313364125e-08, + "loss": 0.90249288, + "num_input_tokens_seen": 338032120, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.24450684, + "step": 15675, + "time_per_iteration": 2.6434576511383057 + }, + { + "auxiliary_loss_clip": 0.01235955, + "auxiliary_loss_mlp": 0.00233184, + "balance_loss_clip": 1.02253628, + "balance_loss_mlp": 0.20912716, + "epoch": 0.9424921088230873, + "flos": 20777627214720.0, + "grad_norm": 115.8410655340488, + "language_loss": 0.69207919, + "learning_rate": 3.4556570697519494e-08, + "loss": 0.70677054, + "num_input_tokens_seen": 338051880, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.24035645, + "step": 15676, + "time_per_iteration": 2.7256834506988525 + }, + { + "auxiliary_loss_clip": 0.01246416, + "auxiliary_loss_mlp": 0.00232017, + "balance_loss_clip": 1.02499676, + "balance_loss_mlp": 0.20602974, + "epoch": 0.9425522320757553, + "flos": 19026658932480.0, + "grad_norm": 9.621141295280562, + "language_loss": 0.74754894, + "learning_rate": 3.448452279120984e-08, + "loss": 0.76233327, + "num_input_tokens_seen": 338069665, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.25964355, + "step": 15677, + "time_per_iteration": 2.6897754669189453 + }, + { + "auxiliary_loss_clip": 0.0125112, + "auxiliary_loss_mlp": 0.00227882, + "balance_loss_clip": 1.03010416, + "balance_loss_mlp": 0.20090494, + "epoch": 0.9426123553284232, + "flos": 25155353185920.0, + "grad_norm": 11.644822652736954, + "language_loss": 0.7399869, + "learning_rate": 3.441254941744387e-08, + "loss": 0.7547769, + "num_input_tokens_seen": 338090490, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.26989746, + "step": 15678, + "time_per_iteration": 2.7791101932525635 + }, + { + "auxiliary_loss_clip": 0.01233465, + "auxiliary_loss_mlp": 0.00212267, + "balance_loss_clip": 1.01925719, + "balance_loss_mlp": 0.18621978, + "epoch": 0.9426724785810913, + "flos": 21179359900800.0, + "grad_norm": 34.98105525144641, + "language_loss": 0.83807099, + "learning_rate": 3.434065057895097e-08, + "loss": 0.85252827, + "num_input_tokens_seen": 338109825, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.26037598, + "step": 15679, + "time_per_iteration": 2.700272798538208 + }, + { + "auxiliary_loss_clip": 0.01243998, + "auxiliary_loss_mlp": 0.00218446, + "balance_loss_clip": 1.02684402, + "balance_loss_mlp": 0.19281596, + "epoch": 0.9427326018337592, + "flos": 14756916222720.0, + "grad_norm": 9.987865765847612, + "language_loss": 0.87023199, + "learning_rate": 3.426882627845762e-08, + "loss": 0.8848564, + "num_input_tokens_seen": 338125790, + "router_z_loss_clip": 2.17285156, + "router_z_loss_mlp": 0.25646973, + "step": 15680, + "time_per_iteration": 2.6354875564575195 + }, + { + "auxiliary_loss_clip": 0.01241609, + "auxiliary_loss_mlp": 0.00233103, + "balance_loss_clip": 1.02758193, + "balance_loss_mlp": 0.20883152, + "epoch": 0.9427927250864272, + "flos": 20923640000640.0, + "grad_norm": 34791.49453295058, + "language_loss": 0.82811165, + "learning_rate": 3.419707651868742e-08, + "loss": 0.84285873, + "num_input_tokens_seen": 338145610, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.24291992, + "step": 15681, + "time_per_iteration": 2.6990833282470703 + }, + { + "auxiliary_loss_clip": 0.01252683, + "auxiliary_loss_mlp": 0.00219697, + "balance_loss_clip": 1.03142881, + "balance_loss_mlp": 0.19336328, + "epoch": 0.9428528483390951, + "flos": 19752520970880.0, + "grad_norm": 27.788291523080776, + "language_loss": 0.75010806, + "learning_rate": 3.412540130236086e-08, + "loss": 0.76483184, + "num_input_tokens_seen": 338165960, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.26342773, + "step": 15682, + "time_per_iteration": 4.171631097793579 + }, + { + "auxiliary_loss_clip": 0.01243218, + "auxiliary_loss_mlp": 0.00231426, + "balance_loss_clip": 1.02283573, + "balance_loss_mlp": 0.20518845, + "epoch": 0.9429129715917631, + "flos": 24534996370560.0, + "grad_norm": 39.45542646452124, + "language_loss": 0.85814542, + "learning_rate": 3.405380063219665e-08, + "loss": 0.87289184, + "num_input_tokens_seen": 338187215, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.2623291, + "step": 15683, + "time_per_iteration": 2.7601516246795654 + }, + { + "auxiliary_loss_clip": 0.01259847, + "auxiliary_loss_mlp": 0.00233324, + "balance_loss_clip": 1.0363636, + "balance_loss_mlp": 0.20736055, + "epoch": 0.942973094844431, + "flos": 17959824063360.0, + "grad_norm": 22.70085955848421, + "language_loss": 0.87751251, + "learning_rate": 3.398227451090885e-08, + "loss": 0.89244425, + "num_input_tokens_seen": 338201825, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.25976562, + "step": 15684, + "time_per_iteration": 2.648496150970459 + }, + { + "auxiliary_loss_clip": 0.01228428, + "auxiliary_loss_mlp": 0.00225953, + "balance_loss_clip": 1.01907372, + "balance_loss_mlp": 0.20252812, + "epoch": 0.9430332180970991, + "flos": 26137689310080.0, + "grad_norm": 138.5949563535763, + "language_loss": 0.84160048, + "learning_rate": 3.391082294121017e-08, + "loss": 0.85614431, + "num_input_tokens_seen": 338220865, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.234375, + "step": 15685, + "time_per_iteration": 4.080591440200806 + }, + { + "auxiliary_loss_clip": 0.01212939, + "auxiliary_loss_mlp": 0.00192793, + "balance_loss_clip": 1.01077056, + "balance_loss_mlp": 0.16839108, + "epoch": 0.943093341349767, + "flos": 23951376190080.0, + "grad_norm": 156.00964018211724, + "language_loss": 0.84126246, + "learning_rate": 3.383944592581023e-08, + "loss": 0.8553198, + "num_input_tokens_seen": 338240160, + "router_z_loss_clip": 2.02441406, + "router_z_loss_mlp": 0.24377441, + "step": 15686, + "time_per_iteration": 2.6758170127868652 + }, + { + "auxiliary_loss_clip": 0.01231645, + "auxiliary_loss_mlp": 0.00231346, + "balance_loss_clip": 1.02198625, + "balance_loss_mlp": 0.20619301, + "epoch": 0.943153464602435, + "flos": 17968407413760.0, + "grad_norm": 15.387021263275203, + "language_loss": 0.88993758, + "learning_rate": 3.376814346741575e-08, + "loss": 0.90456754, + "num_input_tokens_seen": 338259305, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.25195312, + "step": 15687, + "time_per_iteration": 2.639714241027832 + }, + { + "auxiliary_loss_clip": 0.01252664, + "auxiliary_loss_mlp": 0.00237813, + "balance_loss_clip": 1.02946949, + "balance_loss_mlp": 0.21091893, + "epoch": 0.943213587855103, + "flos": 14501519544960.0, + "grad_norm": 52.46079674870258, + "language_loss": 0.86028248, + "learning_rate": 3.369691556873011e-08, + "loss": 0.87518728, + "num_input_tokens_seen": 338274950, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.26904297, + "step": 15688, + "time_per_iteration": 2.6492702960968018 + }, + { + "auxiliary_loss_clip": 0.0122704, + "auxiliary_loss_mlp": 0.00200298, + "balance_loss_clip": 1.01605868, + "balance_loss_mlp": 0.17656311, + "epoch": 0.9432737111077709, + "flos": 28986411093120.0, + "grad_norm": 56.278707544026695, + "language_loss": 0.75295079, + "learning_rate": 3.3625762232454504e-08, + "loss": 0.76722413, + "num_input_tokens_seen": 338295585, + "router_z_loss_clip": 2.11230469, + "router_z_loss_mlp": 0.23718262, + "step": 15689, + "time_per_iteration": 2.813739538192749 + }, + { + "auxiliary_loss_clip": 0.01206354, + "auxiliary_loss_mlp": 0.00209702, + "balance_loss_clip": 1.00180483, + "balance_loss_mlp": 0.18801789, + "epoch": 0.9433338343604389, + "flos": 21609066303360.0, + "grad_norm": 49.62747263622572, + "language_loss": 0.87407422, + "learning_rate": 3.35546834612872e-08, + "loss": 0.88823485, + "num_input_tokens_seen": 338314555, + "router_z_loss_clip": 2.04492188, + "router_z_loss_mlp": 0.21691895, + "step": 15690, + "time_per_iteration": 2.659653902053833 + }, + { + "auxiliary_loss_clip": 0.01233055, + "auxiliary_loss_mlp": 0.00239025, + "balance_loss_clip": 1.02285016, + "balance_loss_mlp": 0.21469411, + "epoch": 0.9433939576131068, + "flos": 33182285483520.0, + "grad_norm": 25.80031863183923, + "language_loss": 0.67235577, + "learning_rate": 3.348367925792317e-08, + "loss": 0.68707657, + "num_input_tokens_seen": 338336260, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.24328613, + "step": 15691, + "time_per_iteration": 2.7773056030273438 + }, + { + "auxiliary_loss_clip": 0.01243276, + "auxiliary_loss_mlp": 0.00217586, + "balance_loss_clip": 1.02443421, + "balance_loss_mlp": 0.19260009, + "epoch": 0.9434540808657749, + "flos": 20486391742080.0, + "grad_norm": 26.147835712215763, + "language_loss": 0.75869465, + "learning_rate": 3.341274962505514e-08, + "loss": 0.77330327, + "num_input_tokens_seen": 338354680, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.25, + "step": 15692, + "time_per_iteration": 2.6313483715057373 + }, + { + "auxiliary_loss_clip": 0.01246486, + "auxiliary_loss_mlp": 0.00225613, + "balance_loss_clip": 1.0254612, + "balance_loss_mlp": 0.19941071, + "epoch": 0.9435142041184428, + "flos": 21542955321600.0, + "grad_norm": 264.6690311241186, + "language_loss": 0.83295, + "learning_rate": 3.334189456537251e-08, + "loss": 0.84767097, + "num_input_tokens_seen": 338372490, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.2623291, + "step": 15693, + "time_per_iteration": 2.6765332221984863 + }, + { + "auxiliary_loss_clip": 0.0124293, + "auxiliary_loss_mlp": 0.00222085, + "balance_loss_clip": 1.02768743, + "balance_loss_mlp": 0.19793274, + "epoch": 0.9435743273711108, + "flos": 25009089004800.0, + "grad_norm": 7.865532122250188, + "language_loss": 0.80489045, + "learning_rate": 3.327111408156291e-08, + "loss": 0.81954062, + "num_input_tokens_seen": 338390870, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.24145508, + "step": 15694, + "time_per_iteration": 2.675974130630493 + }, + { + "auxiliary_loss_clip": 0.01091121, + "auxiliary_loss_mlp": 0.0006953, + "balance_loss_clip": 0.95436841, + "balance_loss_mlp": 0.06252004, + "epoch": 0.9436344506237787, + "flos": 60158707320960.0, + "grad_norm": 2.517113411525512, + "language_loss": 0.49771762, + "learning_rate": 3.3200408176309316e-08, + "loss": 0.50932413, + "num_input_tokens_seen": 338453075, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.0703125, + "step": 15695, + "time_per_iteration": 3.23744797706604 + }, + { + "auxiliary_loss_clip": 0.01210919, + "auxiliary_loss_mlp": 0.00204321, + "balance_loss_clip": 1.00638103, + "balance_loss_mlp": 0.18204062, + "epoch": 0.9436945738764467, + "flos": 22237252283520.0, + "grad_norm": 65.58963433816983, + "language_loss": 0.73341918, + "learning_rate": 3.312977685229335e-08, + "loss": 0.74757159, + "num_input_tokens_seen": 338471770, + "router_z_loss_clip": 2.04492188, + "router_z_loss_mlp": 0.22290039, + "step": 15696, + "time_per_iteration": 2.6998977661132812 + }, + { + "auxiliary_loss_clip": 0.01237733, + "auxiliary_loss_mlp": 0.00204366, + "balance_loss_clip": 1.02534473, + "balance_loss_mlp": 0.1807034, + "epoch": 0.9437546971291146, + "flos": 25045179194880.0, + "grad_norm": 95.31960541045768, + "language_loss": 0.75372344, + "learning_rate": 3.305922011219353e-08, + "loss": 0.76814437, + "num_input_tokens_seen": 338492190, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.2364502, + "step": 15697, + "time_per_iteration": 2.728760004043579 + }, + { + "auxiliary_loss_clip": 0.01090505, + "auxiliary_loss_mlp": 0.00058958, + "balance_loss_clip": 0.95396364, + "balance_loss_mlp": 0.05180521, + "epoch": 0.9438148203817827, + "flos": 56790788400000.0, + "grad_norm": 0.8212770561454543, + "language_loss": 0.61694348, + "learning_rate": 3.298873795868506e-08, + "loss": 0.62843812, + "num_input_tokens_seen": 338552560, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.07128906, + "step": 15698, + "time_per_iteration": 3.1211838722229004 + }, + { + "auxiliary_loss_clip": 0.01244883, + "auxiliary_loss_mlp": 0.00229928, + "balance_loss_clip": 1.02251661, + "balance_loss_mlp": 0.20111531, + "epoch": 0.9438749436344506, + "flos": 22346384780160.0, + "grad_norm": 21.6318022610425, + "language_loss": 0.77989185, + "learning_rate": 3.291833039444092e-08, + "loss": 0.79463995, + "num_input_tokens_seen": 338571770, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.28845215, + "step": 15699, + "time_per_iteration": 2.6793012619018555 + }, + { + "auxiliary_loss_clip": 0.01223978, + "auxiliary_loss_mlp": 0.00229896, + "balance_loss_clip": 1.01636434, + "balance_loss_mlp": 0.20663871, + "epoch": 0.9439350668871186, + "flos": 13370800337280.0, + "grad_norm": 25.727182048325727, + "language_loss": 0.83786809, + "learning_rate": 3.2847997422130734e-08, + "loss": 0.85240686, + "num_input_tokens_seen": 338587310, + "router_z_loss_clip": 2.07714844, + "router_z_loss_mlp": 0.23278809, + "step": 15700, + "time_per_iteration": 2.6991260051727295 + }, + { + "auxiliary_loss_clip": 0.01232981, + "auxiliary_loss_mlp": 0.00213098, + "balance_loss_clip": 1.0251534, + "balance_loss_mlp": 0.18844587, + "epoch": 0.9439951901397866, + "flos": 17785334770560.0, + "grad_norm": 55.62835655494158, + "language_loss": 0.79562187, + "learning_rate": 3.2777739044421495e-08, + "loss": 0.81008261, + "num_input_tokens_seen": 338606235, + "router_z_loss_clip": 2.08105469, + "router_z_loss_mlp": 0.24633789, + "step": 15701, + "time_per_iteration": 2.7022478580474854 + }, + { + "auxiliary_loss_clip": 0.0125177, + "auxiliary_loss_mlp": 0.00250024, + "balance_loss_clip": 1.03014827, + "balance_loss_mlp": 0.22472805, + "epoch": 0.9440553133924545, + "flos": 18879568738560.0, + "grad_norm": 119.6071011945671, + "language_loss": 0.86477602, + "learning_rate": 3.2707555263977505e-08, + "loss": 0.879794, + "num_input_tokens_seen": 338624090, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.25305176, + "step": 15702, + "time_per_iteration": 2.698657274246216 + }, + { + "auxiliary_loss_clip": 0.01268919, + "auxiliary_loss_mlp": 0.00242156, + "balance_loss_clip": 1.04316044, + "balance_loss_mlp": 0.21484488, + "epoch": 0.9441154366451225, + "flos": 19572967860480.0, + "grad_norm": 394.0047743146646, + "language_loss": 0.7644282, + "learning_rate": 3.2637446083460194e-08, + "loss": 0.77953893, + "num_input_tokens_seen": 338643695, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.2734375, + "step": 15703, + "time_per_iteration": 2.660036325454712 + }, + { + "auxiliary_loss_clip": 0.01252622, + "auxiliary_loss_mlp": 0.00244581, + "balance_loss_clip": 1.02913916, + "balance_loss_mlp": 0.21816452, + "epoch": 0.9441755598977905, + "flos": 30294995472000.0, + "grad_norm": 77.58822439235817, + "language_loss": 0.81658566, + "learning_rate": 3.256741150552833e-08, + "loss": 0.83155775, + "num_input_tokens_seen": 338664725, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.26428223, + "step": 15704, + "time_per_iteration": 2.755995273590088 + }, + { + "auxiliary_loss_clip": 0.01229649, + "auxiliary_loss_mlp": 0.00222651, + "balance_loss_clip": 1.01328778, + "balance_loss_mlp": 0.19767642, + "epoch": 0.9442356831504585, + "flos": 20667884186880.0, + "grad_norm": 163.94844616316863, + "language_loss": 0.85442102, + "learning_rate": 3.2497451532837336e-08, + "loss": 0.86894393, + "num_input_tokens_seen": 338683990, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.24987793, + "step": 15705, + "time_per_iteration": 2.648963212966919 + }, + { + "auxiliary_loss_clip": 0.01237604, + "auxiliary_loss_mlp": 0.00203458, + "balance_loss_clip": 1.02403951, + "balance_loss_mlp": 0.1801286, + "epoch": 0.9442958064031264, + "flos": 16107265140480.0, + "grad_norm": 183.21745023682266, + "language_loss": 0.8626045, + "learning_rate": 3.2427566168039986e-08, + "loss": 0.87701511, + "num_input_tokens_seen": 338702025, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.23352051, + "step": 15706, + "time_per_iteration": 2.681553602218628 + }, + { + "auxiliary_loss_clip": 0.01213753, + "auxiliary_loss_mlp": 0.00237072, + "balance_loss_clip": 1.00709081, + "balance_loss_mlp": 0.21380232, + "epoch": 0.9443559296557944, + "flos": 20447392550400.0, + "grad_norm": 13.469468247638869, + "language_loss": 0.75027323, + "learning_rate": 3.23577554137866e-08, + "loss": 0.76478148, + "num_input_tokens_seen": 338720920, + "router_z_loss_clip": 2.06933594, + "router_z_loss_mlp": 0.23254395, + "step": 15707, + "time_per_iteration": 2.6461386680603027 + }, + { + "auxiliary_loss_clip": 0.01227866, + "auxiliary_loss_mlp": 0.00209166, + "balance_loss_clip": 1.02189827, + "balance_loss_mlp": 0.18617094, + "epoch": 0.9444160529084623, + "flos": 21610897896960.0, + "grad_norm": 78.13475560854229, + "language_loss": 0.76378375, + "learning_rate": 3.22880192727244e-08, + "loss": 0.77815408, + "num_input_tokens_seen": 338739590, + "router_z_loss_clip": 2.06054688, + "router_z_loss_mlp": 0.2298584, + "step": 15708, + "time_per_iteration": 2.670027017593384 + }, + { + "auxiliary_loss_clip": 0.01242581, + "auxiliary_loss_mlp": 0.00215688, + "balance_loss_clip": 1.02978802, + "balance_loss_mlp": 0.19222759, + "epoch": 0.9444761761611303, + "flos": 18441781776000.0, + "grad_norm": 244.58267413296477, + "language_loss": 0.80655605, + "learning_rate": 3.221835774749748e-08, + "loss": 0.82113874, + "num_input_tokens_seen": 338757240, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.23461914, + "step": 15709, + "time_per_iteration": 2.6113405227661133 + }, + { + "auxiliary_loss_clip": 0.01219952, + "auxiliary_loss_mlp": 0.00216348, + "balance_loss_clip": 1.01004136, + "balance_loss_mlp": 0.19546258, + "epoch": 0.9445362994137982, + "flos": 20957144411520.0, + "grad_norm": 1810.358016725857, + "language_loss": 0.92839062, + "learning_rate": 3.214877084074774e-08, + "loss": 0.94275361, + "num_input_tokens_seen": 338773750, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.2088623, + "step": 15710, + "time_per_iteration": 2.778749942779541 + }, + { + "auxiliary_loss_clip": 0.01234657, + "auxiliary_loss_mlp": 0.00238363, + "balance_loss_clip": 1.0176537, + "balance_loss_mlp": 0.21117173, + "epoch": 0.9445964226664663, + "flos": 20303283185280.0, + "grad_norm": 9.97297737973938, + "language_loss": 0.78154385, + "learning_rate": 3.2079258555113956e-08, + "loss": 0.79627407, + "num_input_tokens_seen": 338792115, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.27197266, + "step": 15711, + "time_per_iteration": 2.7165844440460205 + }, + { + "auxiliary_loss_clip": 0.01243434, + "auxiliary_loss_mlp": 0.00221197, + "balance_loss_clip": 1.02868664, + "balance_loss_mlp": 0.19718787, + "epoch": 0.9446565459191342, + "flos": 26396030903040.0, + "grad_norm": 15.056205502301392, + "language_loss": 0.76234686, + "learning_rate": 3.200982089323179e-08, + "loss": 0.77699322, + "num_input_tokens_seen": 338812480, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.2401123, + "step": 15712, + "time_per_iteration": 2.6959824562072754 + }, + { + "auxiliary_loss_clip": 0.01261868, + "auxiliary_loss_mlp": 0.00243505, + "balance_loss_clip": 1.03405583, + "balance_loss_mlp": 0.21423946, + "epoch": 0.9447166691718022, + "flos": 16544764794240.0, + "grad_norm": 31.438380638228004, + "language_loss": 0.79215163, + "learning_rate": 3.1940457857734246e-08, + "loss": 0.80720532, + "num_input_tokens_seen": 338829105, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.29248047, + "step": 15713, + "time_per_iteration": 2.6425936222076416 + }, + { + "auxiliary_loss_clip": 0.01234022, + "auxiliary_loss_mlp": 0.00225362, + "balance_loss_clip": 1.02211618, + "balance_loss_mlp": 0.20260471, + "epoch": 0.9447767924244702, + "flos": 29164635400320.0, + "grad_norm": 6.538795757938669, + "language_loss": 0.8395859, + "learning_rate": 3.187116945125212e-08, + "loss": 0.8541798, + "num_input_tokens_seen": 338850670, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.22766113, + "step": 15714, + "time_per_iteration": 2.711700677871704 + }, + { + "auxiliary_loss_clip": 0.01256644, + "auxiliary_loss_mlp": 0.00231958, + "balance_loss_clip": 1.03074455, + "balance_loss_mlp": 0.20589897, + "epoch": 0.9448369156771381, + "flos": 19274908803840.0, + "grad_norm": 113.25434964580083, + "language_loss": 0.75276303, + "learning_rate": 3.1801955676412194e-08, + "loss": 0.76764905, + "num_input_tokens_seen": 338867795, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.26074219, + "step": 15715, + "time_per_iteration": 4.067935466766357 + }, + { + "auxiliary_loss_clip": 0.0123552, + "auxiliary_loss_mlp": 0.00240659, + "balance_loss_clip": 1.02324867, + "balance_loss_mlp": 0.21543387, + "epoch": 0.9448970389298061, + "flos": 23841166285440.0, + "grad_norm": 15.808150731392649, + "language_loss": 0.82287467, + "learning_rate": 3.173281653583948e-08, + "loss": 0.83763647, + "num_input_tokens_seen": 338887205, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.25244141, + "step": 15716, + "time_per_iteration": 2.6892614364624023 + }, + { + "auxiliary_loss_clip": 0.01261524, + "auxiliary_loss_mlp": 0.00228553, + "balance_loss_clip": 1.03557158, + "balance_loss_mlp": 0.20074144, + "epoch": 0.944957162182474, + "flos": 22382259488640.0, + "grad_norm": 3.0024220774128745, + "language_loss": 0.69863164, + "learning_rate": 3.166375203215565e-08, + "loss": 0.71353245, + "num_input_tokens_seen": 338906130, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.27807617, + "step": 15717, + "time_per_iteration": 4.1310882568359375 + }, + { + "auxiliary_loss_clip": 0.01236044, + "auxiliary_loss_mlp": 0.00228448, + "balance_loss_clip": 1.02387428, + "balance_loss_mlp": 0.20468968, + "epoch": 0.9450172854351421, + "flos": 17383889393280.0, + "grad_norm": 3.1921625785407772, + "language_loss": 0.84888691, + "learning_rate": 3.1594762167979514e-08, + "loss": 0.86353183, + "num_input_tokens_seen": 338923045, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.23779297, + "step": 15718, + "time_per_iteration": 2.672595262527466 + }, + { + "auxiliary_loss_clip": 0.01087381, + "auxiliary_loss_mlp": 0.00070449, + "balance_loss_clip": 0.9510597, + "balance_loss_mlp": 0.06343907, + "epoch": 0.94507740868781, + "flos": 68466352406400.0, + "grad_norm": 0.7299572399325407, + "language_loss": 0.57400268, + "learning_rate": 3.152584694592719e-08, + "loss": 0.58558095, + "num_input_tokens_seen": 338987545, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.0703125, + "step": 15719, + "time_per_iteration": 3.190312147140503 + }, + { + "auxiliary_loss_clip": 0.01237589, + "auxiliary_loss_mlp": 0.00228009, + "balance_loss_clip": 1.02214622, + "balance_loss_mlp": 0.2030105, + "epoch": 0.945137531940478, + "flos": 21142479611520.0, + "grad_norm": 4.700827870151682, + "language_loss": 0.81297356, + "learning_rate": 3.145700636861193e-08, + "loss": 0.82762957, + "num_input_tokens_seen": 339007830, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.24987793, + "step": 15720, + "time_per_iteration": 2.6379942893981934 + }, + { + "auxiliary_loss_clip": 0.01216733, + "auxiliary_loss_mlp": 0.00221522, + "balance_loss_clip": 1.00888824, + "balance_loss_mlp": 0.19820431, + "epoch": 0.9451976551931459, + "flos": 24533918962560.0, + "grad_norm": 69.73470005751041, + "language_loss": 0.78503847, + "learning_rate": 3.138824043864452e-08, + "loss": 0.79942101, + "num_input_tokens_seen": 339028980, + "router_z_loss_clip": 2.07910156, + "router_z_loss_mlp": 0.23327637, + "step": 15721, + "time_per_iteration": 2.7217938899993896 + }, + { + "auxiliary_loss_clip": 0.01243563, + "auxiliary_loss_mlp": 0.00256708, + "balance_loss_clip": 1.02677286, + "balance_loss_mlp": 0.23140022, + "epoch": 0.9452577784458139, + "flos": 23440582834560.0, + "grad_norm": 8.265496637982888, + "language_loss": 0.93670821, + "learning_rate": 3.131954915863244e-08, + "loss": 0.95171082, + "num_input_tokens_seen": 339047950, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.2532959, + "step": 15722, + "time_per_iteration": 2.6794915199279785 + }, + { + "auxiliary_loss_clip": 0.01090123, + "auxiliary_loss_mlp": 0.00084107, + "balance_loss_clip": 0.95304322, + "balance_loss_mlp": 0.07695486, + "epoch": 0.9453179016984818, + "flos": 52017686449920.0, + "grad_norm": 0.9419840470695992, + "language_loss": 0.63580012, + "learning_rate": 3.125093253118005e-08, + "loss": 0.64754242, + "num_input_tokens_seen": 339104535, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.07128906, + "step": 15723, + "time_per_iteration": 3.0852644443511963 + }, + { + "auxiliary_loss_clip": 0.01238377, + "auxiliary_loss_mlp": 0.0021768, + "balance_loss_clip": 1.02648282, + "balance_loss_mlp": 0.19365975, + "epoch": 0.9453780249511499, + "flos": 13473001509120.0, + "grad_norm": 10.189135454611666, + "language_loss": 0.81973696, + "learning_rate": 3.1182390558889715e-08, + "loss": 0.83429748, + "num_input_tokens_seen": 339122050, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.24023438, + "step": 15724, + "time_per_iteration": 2.620089054107666 + }, + { + "auxiliary_loss_clip": 0.01221228, + "auxiliary_loss_mlp": 0.0021731, + "balance_loss_clip": 1.0114727, + "balance_loss_mlp": 0.19371887, + "epoch": 0.9454381482038178, + "flos": 23258515772160.0, + "grad_norm": 5.267125747039644, + "language_loss": 0.93175691, + "learning_rate": 3.111392324436024e-08, + "loss": 0.94614232, + "num_input_tokens_seen": 339138940, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.23583984, + "step": 15725, + "time_per_iteration": 4.122671842575073 + }, + { + "auxiliary_loss_clip": 0.01232633, + "auxiliary_loss_mlp": 0.00218278, + "balance_loss_clip": 1.02010369, + "balance_loss_mlp": 0.19412634, + "epoch": 0.9454982714564858, + "flos": 19496621502720.0, + "grad_norm": 22.0289124579766, + "language_loss": 0.77926028, + "learning_rate": 3.104553059018822e-08, + "loss": 0.79376936, + "num_input_tokens_seen": 339158245, + "router_z_loss_clip": 2.12402344, + "router_z_loss_mlp": 0.24133301, + "step": 15726, + "time_per_iteration": 2.6906869411468506 + }, + { + "auxiliary_loss_clip": 0.01228729, + "auxiliary_loss_mlp": 0.00232868, + "balance_loss_clip": 1.014153, + "balance_loss_mlp": 0.20587894, + "epoch": 0.9455583947091538, + "flos": 23258120722560.0, + "grad_norm": 8.573864251115046, + "language_loss": 0.73305929, + "learning_rate": 3.097721259896735e-08, + "loss": 0.74767518, + "num_input_tokens_seen": 339178200, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.26977539, + "step": 15727, + "time_per_iteration": 4.192553520202637 + }, + { + "auxiliary_loss_clip": 0.01222463, + "auxiliary_loss_mlp": 0.00247646, + "balance_loss_clip": 1.01489723, + "balance_loss_mlp": 0.22305307, + "epoch": 0.9456185179618217, + "flos": 17673041877120.0, + "grad_norm": 53.16807866616446, + "language_loss": 0.88463551, + "learning_rate": 3.0908969273287566e-08, + "loss": 0.89933658, + "num_input_tokens_seen": 339193950, + "router_z_loss_clip": 2.07519531, + "router_z_loss_mlp": 0.24584961, + "step": 15728, + "time_per_iteration": 2.6553094387054443 + }, + { + "auxiliary_loss_clip": 0.01086142, + "auxiliary_loss_mlp": 0.00087755, + "balance_loss_clip": 0.95176065, + "balance_loss_mlp": 0.07917231, + "epoch": 0.9456786412144897, + "flos": 61415040389760.0, + "grad_norm": 0.7009839136374952, + "language_loss": 0.577613, + "learning_rate": 3.08408006157368e-08, + "loss": 0.58935201, + "num_input_tokens_seen": 339252330, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.0859375, + "step": 15729, + "time_per_iteration": 3.0930285453796387 + }, + { + "auxiliary_loss_clip": 0.01232755, + "auxiliary_loss_mlp": 0.0020865, + "balance_loss_clip": 1.01551235, + "balance_loss_mlp": 0.18294895, + "epoch": 0.9457387644671577, + "flos": 18588369179520.0, + "grad_norm": 65.70178223015573, + "language_loss": 0.86070645, + "learning_rate": 3.077270662890052e-08, + "loss": 0.87512052, + "num_input_tokens_seen": 339270325, + "router_z_loss_clip": 2.16894531, + "router_z_loss_mlp": 0.25695801, + "step": 15730, + "time_per_iteration": 2.6084976196289062 + }, + { + "auxiliary_loss_clip": 0.01236314, + "auxiliary_loss_mlp": 0.00226189, + "balance_loss_clip": 1.02001715, + "balance_loss_mlp": 0.20040429, + "epoch": 0.9457988877198257, + "flos": 21108544237440.0, + "grad_norm": 2.9301037077457672, + "language_loss": 0.7307173, + "learning_rate": 3.070468731536047e-08, + "loss": 0.74534237, + "num_input_tokens_seen": 339291980, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.25793457, + "step": 15731, + "time_per_iteration": 2.698867082595825 + }, + { + "auxiliary_loss_clip": 0.01244389, + "auxiliary_loss_mlp": 0.00246418, + "balance_loss_clip": 1.02668273, + "balance_loss_mlp": 0.22205195, + "epoch": 0.9458590109724936, + "flos": 26688379697280.0, + "grad_norm": 7.795217950399212, + "language_loss": 0.71666729, + "learning_rate": 3.063674267769589e-08, + "loss": 0.73157537, + "num_input_tokens_seen": 339311795, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.24401855, + "step": 15732, + "time_per_iteration": 2.6917245388031006 + }, + { + "auxiliary_loss_clip": 0.01267152, + "auxiliary_loss_mlp": 0.00241307, + "balance_loss_clip": 1.04171801, + "balance_loss_mlp": 0.21499777, + "epoch": 0.9459191342251616, + "flos": 18661591054080.0, + "grad_norm": 14.405907350351852, + "language_loss": 0.92965657, + "learning_rate": 3.056887271848363e-08, + "loss": 0.94474113, + "num_input_tokens_seen": 339327745, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.26318359, + "step": 15733, + "time_per_iteration": 2.6496026515960693 + }, + { + "auxiliary_loss_clip": 0.01229095, + "auxiliary_loss_mlp": 0.0023959, + "balance_loss_clip": 1.02185869, + "balance_loss_mlp": 0.21672592, + "epoch": 0.9459792574778295, + "flos": 23398459159680.0, + "grad_norm": 8.288664939624198, + "language_loss": 0.79992259, + "learning_rate": 3.0501077440297173e-08, + "loss": 0.81460941, + "num_input_tokens_seen": 339346445, + "router_z_loss_clip": 2.07519531, + "router_z_loss_mlp": 0.22888184, + "step": 15734, + "time_per_iteration": 2.6284732818603516 + }, + { + "auxiliary_loss_clip": 0.01212266, + "auxiliary_loss_mlp": 0.00197771, + "balance_loss_clip": 1.00999486, + "balance_loss_mlp": 0.17714766, + "epoch": 0.9460393807304975, + "flos": 24392969994240.0, + "grad_norm": 21.57427035347676, + "language_loss": 0.91685855, + "learning_rate": 3.043335684570692e-08, + "loss": 0.93095893, + "num_input_tokens_seen": 339367945, + "router_z_loss_clip": 2.02441406, + "router_z_loss_mlp": 0.20629883, + "step": 15735, + "time_per_iteration": 2.712923049926758 + }, + { + "auxiliary_loss_clip": 0.01254401, + "auxiliary_loss_mlp": 0.00217316, + "balance_loss_clip": 1.03567362, + "balance_loss_mlp": 0.19441627, + "epoch": 0.9460995039831654, + "flos": 21939408708480.0, + "grad_norm": 48.86565973331044, + "language_loss": 0.79086906, + "learning_rate": 3.036571093728102e-08, + "loss": 0.80558622, + "num_input_tokens_seen": 339386060, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.22900391, + "step": 15736, + "time_per_iteration": 2.632089853286743 + }, + { + "auxiliary_loss_clip": 0.01088819, + "auxiliary_loss_mlp": 0.0008576, + "balance_loss_clip": 0.95198214, + "balance_loss_mlp": 0.078178, + "epoch": 0.9461596272358335, + "flos": 70322466775680.0, + "grad_norm": 0.8597885364064569, + "language_loss": 0.64887071, + "learning_rate": 3.029813971758499e-08, + "loss": 0.66061652, + "num_input_tokens_seen": 339446695, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.07568359, + "step": 15737, + "time_per_iteration": 3.154442071914673 + }, + { + "auxiliary_loss_clip": 0.01091013, + "auxiliary_loss_mlp": 0.00068646, + "balance_loss_clip": 0.95336968, + "balance_loss_mlp": 0.06125479, + "epoch": 0.9462197504885014, + "flos": 58591242645120.0, + "grad_norm": 0.774236130461802, + "language_loss": 0.58061618, + "learning_rate": 3.0230643189181225e-08, + "loss": 0.5922128, + "num_input_tokens_seen": 339510080, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.07373047, + "step": 15738, + "time_per_iteration": 3.1332743167877197 + }, + { + "auxiliary_loss_clip": 0.01217825, + "auxiliary_loss_mlp": 0.00225028, + "balance_loss_clip": 1.00871992, + "balance_loss_mlp": 0.20204467, + "epoch": 0.9462798737411694, + "flos": 23433759250560.0, + "grad_norm": 6.228703859454261, + "language_loss": 0.8036027, + "learning_rate": 3.016322135462834e-08, + "loss": 0.81803119, + "num_input_tokens_seen": 339529335, + "router_z_loss_clip": 2.08984375, + "router_z_loss_mlp": 0.2298584, + "step": 15739, + "time_per_iteration": 2.7031428813934326 + }, + { + "auxiliary_loss_clip": 0.01236207, + "auxiliary_loss_mlp": 0.00246449, + "balance_loss_clip": 1.02193117, + "balance_loss_mlp": 0.22093853, + "epoch": 0.9463399969938374, + "flos": 25046077034880.0, + "grad_norm": 9.390772681550105, + "language_loss": 0.7414313, + "learning_rate": 3.009587421648363e-08, + "loss": 0.75625789, + "num_input_tokens_seen": 339548820, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.25537109, + "step": 15740, + "time_per_iteration": 2.743234395980835 + }, + { + "auxiliary_loss_clip": 0.01218703, + "auxiliary_loss_mlp": 0.00201707, + "balance_loss_clip": 1.01201117, + "balance_loss_mlp": 0.17849684, + "epoch": 0.9464001202465053, + "flos": 24352606085760.0, + "grad_norm": 20.35511196366351, + "language_loss": 0.72139323, + "learning_rate": 3.0028601777301045e-08, + "loss": 0.73559737, + "num_input_tokens_seen": 339566775, + "router_z_loss_clip": 2.06445312, + "router_z_loss_mlp": 0.23205566, + "step": 15741, + "time_per_iteration": 2.6970322132110596 + }, + { + "auxiliary_loss_clip": 0.01241691, + "auxiliary_loss_mlp": 0.00246218, + "balance_loss_clip": 1.02480066, + "balance_loss_mlp": 0.22080255, + "epoch": 0.9464602434991733, + "flos": 17165444832000.0, + "grad_norm": 17.96833780584023, + "language_loss": 0.82542169, + "learning_rate": 2.9961404039630987e-08, + "loss": 0.8403008, + "num_input_tokens_seen": 339581905, + "router_z_loss_clip": 2.16699219, + "router_z_loss_mlp": 0.25427246, + "step": 15742, + "time_per_iteration": 2.681450128555298 + }, + { + "auxiliary_loss_clip": 0.01219229, + "auxiliary_loss_mlp": 0.00213326, + "balance_loss_clip": 1.01065385, + "balance_loss_mlp": 0.1911885, + "epoch": 0.9465203667518413, + "flos": 19938107566080.0, + "grad_norm": 4.826208602805238, + "language_loss": 0.79283613, + "learning_rate": 2.989428100602187e-08, + "loss": 0.80716169, + "num_input_tokens_seen": 339599870, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.22155762, + "step": 15743, + "time_per_iteration": 2.6483638286590576 + }, + { + "auxiliary_loss_clip": 0.01252093, + "auxiliary_loss_mlp": 0.00216228, + "balance_loss_clip": 1.03150129, + "balance_loss_mlp": 0.18945324, + "epoch": 0.9465804900045093, + "flos": 20120318282880.0, + "grad_norm": 38.115111743554834, + "language_loss": 0.87346917, + "learning_rate": 2.982723267901943e-08, + "loss": 0.88815236, + "num_input_tokens_seen": 339620250, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.2677002, + "step": 15744, + "time_per_iteration": 2.7056326866149902 + }, + { + "auxiliary_loss_clip": 0.01253355, + "auxiliary_loss_mlp": 0.00242275, + "balance_loss_clip": 1.03040123, + "balance_loss_mlp": 0.21746796, + "epoch": 0.9466406132571772, + "flos": 23911622812800.0, + "grad_norm": 12.972836540878014, + "language_loss": 0.86229932, + "learning_rate": 2.9760259061165417e-08, + "loss": 0.87725568, + "num_input_tokens_seen": 339639900, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.24816895, + "step": 15745, + "time_per_iteration": 2.664133310317993 + }, + { + "auxiliary_loss_clip": 0.01247693, + "auxiliary_loss_mlp": 0.00241633, + "balance_loss_clip": 1.02964687, + "balance_loss_mlp": 0.21680146, + "epoch": 0.9467007365098452, + "flos": 19933223316480.0, + "grad_norm": 9.456357144756616, + "language_loss": 0.76226425, + "learning_rate": 2.9693360155000014e-08, + "loss": 0.77715755, + "num_input_tokens_seen": 339658970, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.24853516, + "step": 15746, + "time_per_iteration": 2.6889138221740723 + }, + { + "auxiliary_loss_clip": 0.01254136, + "auxiliary_loss_mlp": 0.00221886, + "balance_loss_clip": 1.03328156, + "balance_loss_mlp": 0.19761544, + "epoch": 0.9467608597625131, + "flos": 19310496203520.0, + "grad_norm": 12.576369917924946, + "language_loss": 0.67153013, + "learning_rate": 2.962653596305964e-08, + "loss": 0.68629038, + "num_input_tokens_seen": 339675600, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.24267578, + "step": 15747, + "time_per_iteration": 2.745791435241699 + }, + { + "auxiliary_loss_clip": 0.01087785, + "auxiliary_loss_mlp": 0.00078588, + "balance_loss_clip": 0.95149469, + "balance_loss_mlp": 0.07095869, + "epoch": 0.9468209830151811, + "flos": 69630252802560.0, + "grad_norm": 0.6825674353355761, + "language_loss": 0.52817667, + "learning_rate": 2.955978648787871e-08, + "loss": 0.53984034, + "num_input_tokens_seen": 339744505, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.07617188, + "step": 15748, + "time_per_iteration": 3.378309726715088 + }, + { + "auxiliary_loss_clip": 0.01252578, + "auxiliary_loss_mlp": 0.00223012, + "balance_loss_clip": 1.03039503, + "balance_loss_mlp": 0.19778773, + "epoch": 0.946881106267849, + "flos": 27016639113600.0, + "grad_norm": 38.917986448243745, + "language_loss": 0.74875224, + "learning_rate": 2.9493111731988096e-08, + "loss": 0.76350808, + "num_input_tokens_seen": 339765810, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.25219727, + "step": 15749, + "time_per_iteration": 2.728712558746338 + }, + { + "auxiliary_loss_clip": 0.01238102, + "auxiliary_loss_mlp": 0.00223402, + "balance_loss_clip": 1.01977825, + "balance_loss_mlp": 0.19741407, + "epoch": 0.9469412295205171, + "flos": 20190092451840.0, + "grad_norm": 9.430849423535632, + "language_loss": 0.85508168, + "learning_rate": 2.942651169791621e-08, + "loss": 0.86969674, + "num_input_tokens_seen": 339784125, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.26000977, + "step": 15750, + "time_per_iteration": 2.6390621662139893 + }, + { + "auxiliary_loss_clip": 0.01232143, + "auxiliary_loss_mlp": 0.00211763, + "balance_loss_clip": 1.02197218, + "balance_loss_mlp": 0.18905368, + "epoch": 0.947001352773185, + "flos": 21324905809920.0, + "grad_norm": 38.39850773149587, + "language_loss": 0.74682248, + "learning_rate": 2.9359986388188372e-08, + "loss": 0.76126152, + "num_input_tokens_seen": 339803450, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.22729492, + "step": 15751, + "time_per_iteration": 2.7050259113311768 + }, + { + "auxiliary_loss_clip": 0.01254736, + "auxiliary_loss_mlp": 0.00243836, + "balance_loss_clip": 1.03303623, + "balance_loss_mlp": 0.21877824, + "epoch": 0.947061476025853, + "flos": 21944041562880.0, + "grad_norm": 149.4918828541659, + "language_loss": 0.71580291, + "learning_rate": 2.929353580532723e-08, + "loss": 0.73078865, + "num_input_tokens_seen": 339823215, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.25048828, + "step": 15752, + "time_per_iteration": 2.6872706413269043 + }, + { + "auxiliary_loss_clip": 0.01234937, + "auxiliary_loss_mlp": 0.00220482, + "balance_loss_clip": 1.02004671, + "balance_loss_mlp": 0.19437516, + "epoch": 0.947121599278521, + "flos": 21394715892480.0, + "grad_norm": 200.15320783420603, + "language_loss": 0.780828, + "learning_rate": 2.9227159951852764e-08, + "loss": 0.79538214, + "num_input_tokens_seen": 339842230, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.2611084, + "step": 15753, + "time_per_iteration": 2.7307941913604736 + }, + { + "auxiliary_loss_clip": 0.01251588, + "auxiliary_loss_mlp": 0.00220368, + "balance_loss_clip": 1.02381897, + "balance_loss_mlp": 0.19440463, + "epoch": 0.9471817225311889, + "flos": 23075730437760.0, + "grad_norm": 609.5336300258261, + "language_loss": 0.81315756, + "learning_rate": 2.9160858830281855e-08, + "loss": 0.8278771, + "num_input_tokens_seen": 339861640, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.25952148, + "step": 15754, + "time_per_iteration": 2.7217957973480225 + }, + { + "auxiliary_loss_clip": 0.01246131, + "auxiliary_loss_mlp": 0.00211269, + "balance_loss_clip": 1.02846432, + "balance_loss_mlp": 0.18560307, + "epoch": 0.947241845783857, + "flos": 11910744305280.0, + "grad_norm": 10.436702092921726, + "language_loss": 0.89188612, + "learning_rate": 2.9094632443129153e-08, + "loss": 0.90646017, + "num_input_tokens_seen": 339878210, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.25646973, + "step": 15755, + "time_per_iteration": 2.7593705654144287 + }, + { + "auxiliary_loss_clip": 0.01282824, + "auxiliary_loss_mlp": 0.00251221, + "balance_loss_clip": 1.04882717, + "balance_loss_mlp": 0.22273028, + "epoch": 0.9473019690365249, + "flos": 20740675098240.0, + "grad_norm": 4.953453320097887, + "language_loss": 0.85347211, + "learning_rate": 2.9028480792904876e-08, + "loss": 0.86881262, + "num_input_tokens_seen": 339894255, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.28503418, + "step": 15756, + "time_per_iteration": 2.786654233932495 + }, + { + "auxiliary_loss_clip": 0.01235052, + "auxiliary_loss_mlp": 0.00241769, + "balance_loss_clip": 1.02004814, + "balance_loss_mlp": 0.21700966, + "epoch": 0.9473620922891929, + "flos": 17639896602240.0, + "grad_norm": 4.183806560923486, + "language_loss": 0.8446905, + "learning_rate": 2.8962403882118347e-08, + "loss": 0.85945874, + "num_input_tokens_seen": 339912425, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.24743652, + "step": 15757, + "time_per_iteration": 2.647167444229126 + }, + { + "auxiliary_loss_clip": 0.01245534, + "auxiliary_loss_mlp": 0.00228905, + "balance_loss_clip": 1.02720034, + "balance_loss_mlp": 0.20371626, + "epoch": 0.9474222155418608, + "flos": 23550002640000.0, + "grad_norm": 48.00510411600294, + "language_loss": 0.87263107, + "learning_rate": 2.889640171327512e-08, + "loss": 0.88737547, + "num_input_tokens_seen": 339929635, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.25195312, + "step": 15758, + "time_per_iteration": 4.057866334915161 + }, + { + "auxiliary_loss_clip": 0.01231825, + "auxiliary_loss_mlp": 0.0023367, + "balance_loss_clip": 1.01764512, + "balance_loss_mlp": 0.20799184, + "epoch": 0.9474823387945288, + "flos": 27089753247360.0, + "grad_norm": 540.8298597907101, + "language_loss": 0.78374803, + "learning_rate": 2.8830474288877638e-08, + "loss": 0.79840297, + "num_input_tokens_seen": 339951200, + "router_z_loss_clip": 2.14355469, + "router_z_loss_mlp": 0.25683594, + "step": 15759, + "time_per_iteration": 4.179820775985718 + }, + { + "auxiliary_loss_clip": 0.0123342, + "auxiliary_loss_mlp": 0.00209597, + "balance_loss_clip": 1.02932382, + "balance_loss_mlp": 0.18916428, + "epoch": 0.9475424620471967, + "flos": 22966526113920.0, + "grad_norm": 21.675174209074235, + "language_loss": 0.83364701, + "learning_rate": 2.8764621611426344e-08, + "loss": 0.84807718, + "num_input_tokens_seen": 339971820, + "router_z_loss_clip": 2.04199219, + "router_z_loss_mlp": 0.20446777, + "step": 15760, + "time_per_iteration": 2.6980113983154297 + }, + { + "auxiliary_loss_clip": 0.01230835, + "auxiliary_loss_mlp": 0.00228127, + "balance_loss_clip": 1.01790071, + "balance_loss_mlp": 0.20401117, + "epoch": 0.9476025852998647, + "flos": 20047671025920.0, + "grad_norm": 31.952381981582032, + "language_loss": 0.80883062, + "learning_rate": 2.8698843683418128e-08, + "loss": 0.82342029, + "num_input_tokens_seen": 339989420, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.24121094, + "step": 15761, + "time_per_iteration": 2.665698289871216 + }, + { + "auxiliary_loss_clip": 0.01235686, + "auxiliary_loss_mlp": 0.00227016, + "balance_loss_clip": 1.02176118, + "balance_loss_mlp": 0.20362671, + "epoch": 0.9476627085525327, + "flos": 14975468524800.0, + "grad_norm": 28.185537648324015, + "language_loss": 0.80800015, + "learning_rate": 2.863314050734722e-08, + "loss": 0.82262719, + "num_input_tokens_seen": 340006690, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.23388672, + "step": 15762, + "time_per_iteration": 2.6210334300994873 + }, + { + "auxiliary_loss_clip": 0.0125514, + "auxiliary_loss_mlp": 0.0023915, + "balance_loss_clip": 1.034464, + "balance_loss_mlp": 0.21371059, + "epoch": 0.9477228318052007, + "flos": 18697788984960.0, + "grad_norm": 11.080046364427545, + "language_loss": 0.76733065, + "learning_rate": 2.856751208570518e-08, + "loss": 0.78227359, + "num_input_tokens_seen": 340025480, + "router_z_loss_clip": 2.20800781, + "router_z_loss_mlp": 0.25463867, + "step": 15763, + "time_per_iteration": 2.7327163219451904 + }, + { + "auxiliary_loss_clip": 0.01237916, + "auxiliary_loss_mlp": 0.00218537, + "balance_loss_clip": 1.02472913, + "balance_loss_mlp": 0.19334812, + "epoch": 0.9477829550578686, + "flos": 23875065745920.0, + "grad_norm": 2.3104922000517476, + "language_loss": 0.769871, + "learning_rate": 2.8501958420980466e-08, + "loss": 0.78443551, + "num_input_tokens_seen": 340043785, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.2520752, + "step": 15764, + "time_per_iteration": 2.725487232208252 + }, + { + "auxiliary_loss_clip": 0.01213303, + "auxiliary_loss_mlp": 0.00223323, + "balance_loss_clip": 1.01144016, + "balance_loss_mlp": 0.20087565, + "epoch": 0.9478430783105366, + "flos": 22562890007040.0, + "grad_norm": 7.051341868670931, + "language_loss": 0.76267946, + "learning_rate": 2.8436479515659306e-08, + "loss": 0.77704573, + "num_input_tokens_seen": 340064360, + "router_z_loss_clip": 2.01757812, + "router_z_loss_mlp": 0.22460938, + "step": 15765, + "time_per_iteration": 2.7210450172424316 + }, + { + "auxiliary_loss_clip": 0.01090633, + "auxiliary_loss_mlp": 0.00080987, + "balance_loss_clip": 0.95254612, + "balance_loss_mlp": 0.07359651, + "epoch": 0.9479032015632046, + "flos": 60857885554560.0, + "grad_norm": 0.7836456366488854, + "language_loss": 0.57481772, + "learning_rate": 2.8371075372224384e-08, + "loss": 0.5865339, + "num_input_tokens_seen": 340114425, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.07373047, + "step": 15766, + "time_per_iteration": 2.984362840652466 + }, + { + "auxiliary_loss_clip": 0.01225778, + "auxiliary_loss_mlp": 0.00223566, + "balance_loss_clip": 1.01168919, + "balance_loss_mlp": 0.19931872, + "epoch": 0.9479633248158725, + "flos": 14683873916160.0, + "grad_norm": 49.301189647079575, + "language_loss": 0.83082867, + "learning_rate": 2.8305745993155938e-08, + "loss": 0.84532213, + "num_input_tokens_seen": 340132200, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.2421875, + "step": 15767, + "time_per_iteration": 4.086865663528442 + }, + { + "auxiliary_loss_clip": 0.01259603, + "auxiliary_loss_mlp": 0.00233726, + "balance_loss_clip": 1.03918386, + "balance_loss_mlp": 0.20842943, + "epoch": 0.9480234480685406, + "flos": 20333878594560.0, + "grad_norm": 264.75682910197935, + "language_loss": 0.82018417, + "learning_rate": 2.8240491380931096e-08, + "loss": 0.83511746, + "num_input_tokens_seen": 340149175, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.25292969, + "step": 15768, + "time_per_iteration": 2.7303524017333984 + }, + { + "auxiliary_loss_clip": 0.0109065, + "auxiliary_loss_mlp": 0.00118778, + "balance_loss_clip": 0.9520539, + "balance_loss_mlp": 0.10957488, + "epoch": 0.9480835713212085, + "flos": 70293092428800.0, + "grad_norm": 0.7330031000873575, + "language_loss": 0.54329777, + "learning_rate": 2.8175311538024326e-08, + "loss": 0.55539203, + "num_input_tokens_seen": 340208155, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.09179688, + "step": 15769, + "time_per_iteration": 4.572947978973389 + }, + { + "auxiliary_loss_clip": 0.0123529, + "auxiliary_loss_mlp": 0.00225996, + "balance_loss_clip": 1.01852429, + "balance_loss_mlp": 0.20203494, + "epoch": 0.9481436945738765, + "flos": 25449749055360.0, + "grad_norm": 6.904436941537562, + "language_loss": 0.83348733, + "learning_rate": 2.8110206466907428e-08, + "loss": 0.84810019, + "num_input_tokens_seen": 340229275, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.23937988, + "step": 15770, + "time_per_iteration": 2.7193915843963623 + }, + { + "auxiliary_loss_clip": 0.01239983, + "auxiliary_loss_mlp": 0.00215979, + "balance_loss_clip": 1.02394617, + "balance_loss_mlp": 0.18895395, + "epoch": 0.9482038178265444, + "flos": 26979902478720.0, + "grad_norm": 41.590468030805695, + "language_loss": 0.85442245, + "learning_rate": 2.8045176170049313e-08, + "loss": 0.86898208, + "num_input_tokens_seen": 340248920, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.27001953, + "step": 15771, + "time_per_iteration": 2.723235845565796 + }, + { + "auxiliary_loss_clip": 0.01230519, + "auxiliary_loss_mlp": 0.00234975, + "balance_loss_clip": 1.01606631, + "balance_loss_mlp": 0.2099527, + "epoch": 0.9482639410792124, + "flos": 17785442511360.0, + "grad_norm": 20.034405428830496, + "language_loss": 0.77176154, + "learning_rate": 2.7980220649915566e-08, + "loss": 0.78641641, + "num_input_tokens_seen": 340266775, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.25012207, + "step": 15772, + "time_per_iteration": 2.6266589164733887 + }, + { + "auxiliary_loss_clip": 0.01230866, + "auxiliary_loss_mlp": 0.00213337, + "balance_loss_clip": 1.01743269, + "balance_loss_mlp": 0.18908992, + "epoch": 0.9483240643318803, + "flos": 20996682307200.0, + "grad_norm": 18.174757000235882, + "language_loss": 0.81349337, + "learning_rate": 2.7915339908969327e-08, + "loss": 0.82793534, + "num_input_tokens_seen": 340285295, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.24255371, + "step": 15773, + "time_per_iteration": 2.6718454360961914 + }, + { + "auxiliary_loss_clip": 0.01249343, + "auxiliary_loss_mlp": 0.00204598, + "balance_loss_clip": 1.0286994, + "balance_loss_mlp": 0.17803809, + "epoch": 0.9483841875845483, + "flos": 20083294339200.0, + "grad_norm": 3.6625026049680387, + "language_loss": 0.74839157, + "learning_rate": 2.7850533949671072e-08, + "loss": 0.76293099, + "num_input_tokens_seen": 340304265, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.265625, + "step": 15774, + "time_per_iteration": 2.7701330184936523 + }, + { + "auxiliary_loss_clip": 0.01249747, + "auxiliary_loss_mlp": 0.00217617, + "balance_loss_clip": 1.03127599, + "balance_loss_mlp": 0.19173664, + "epoch": 0.9484443108372163, + "flos": 20813645577600.0, + "grad_norm": 3.885092001277878, + "language_loss": 0.6703856, + "learning_rate": 2.7785802774478396e-08, + "loss": 0.68505919, + "num_input_tokens_seen": 340323690, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.25915527, + "step": 15775, + "time_per_iteration": 2.66629695892334 + }, + { + "auxiliary_loss_clip": 0.01243273, + "auxiliary_loss_mlp": 0.00213421, + "balance_loss_clip": 1.02426481, + "balance_loss_mlp": 0.18898261, + "epoch": 0.9485044340898843, + "flos": 36429184506240.0, + "grad_norm": 15154.756483698007, + "language_loss": 0.67898792, + "learning_rate": 2.772114638584555e-08, + "loss": 0.69355488, + "num_input_tokens_seen": 340345830, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.24475098, + "step": 15776, + "time_per_iteration": 2.8397116661071777 + }, + { + "auxiliary_loss_clip": 0.01247983, + "auxiliary_loss_mlp": 0.00227118, + "balance_loss_clip": 1.02523685, + "balance_loss_mlp": 0.19962817, + "epoch": 0.9485645573425522, + "flos": 22602535643520.0, + "grad_norm": 12.385746189089813, + "language_loss": 0.8462075, + "learning_rate": 2.765656478622458e-08, + "loss": 0.86095852, + "num_input_tokens_seen": 340365910, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.27502441, + "step": 15777, + "time_per_iteration": 2.672736406326294 + }, + { + "auxiliary_loss_clip": 0.01267159, + "auxiliary_loss_mlp": 0.00234097, + "balance_loss_clip": 1.04010332, + "balance_loss_mlp": 0.20642836, + "epoch": 0.9486246805952202, + "flos": 22017766227840.0, + "grad_norm": 19.722782116912896, + "language_loss": 0.8686105, + "learning_rate": 2.759205797806441e-08, + "loss": 0.88362312, + "num_input_tokens_seen": 340383935, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.2767334, + "step": 15778, + "time_per_iteration": 2.6551849842071533 + }, + { + "auxiliary_loss_clip": 0.01209231, + "auxiliary_loss_mlp": 0.00219015, + "balance_loss_clip": 1.00675309, + "balance_loss_mlp": 0.19698542, + "epoch": 0.9486848038478882, + "flos": 16508674604160.0, + "grad_norm": 4.436244724610979, + "language_loss": 0.77652967, + "learning_rate": 2.7527625963810865e-08, + "loss": 0.79081213, + "num_input_tokens_seen": 340402760, + "router_z_loss_clip": 2.02734375, + "router_z_loss_mlp": 0.22045898, + "step": 15779, + "time_per_iteration": 2.6172664165496826 + }, + { + "auxiliary_loss_clip": 0.01245489, + "auxiliary_loss_mlp": 0.00249872, + "balance_loss_clip": 1.02498829, + "balance_loss_mlp": 0.2222158, + "epoch": 0.9487449271005561, + "flos": 19244385221760.0, + "grad_norm": 6.107454060978959, + "language_loss": 0.88772881, + "learning_rate": 2.7463268745907542e-08, + "loss": 0.90268242, + "num_input_tokens_seen": 340422105, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.27661133, + "step": 15780, + "time_per_iteration": 2.6514127254486084 + }, + { + "auxiliary_loss_clip": 0.01237805, + "auxiliary_loss_mlp": 0.00222132, + "balance_loss_clip": 1.02231085, + "balance_loss_mlp": 0.19781324, + "epoch": 0.9488050503532242, + "flos": 21762692772480.0, + "grad_norm": 3.6662047746337016, + "language_loss": 0.73469532, + "learning_rate": 2.7398986326794494e-08, + "loss": 0.74929464, + "num_input_tokens_seen": 340441160, + "router_z_loss_clip": 2.15332031, + "router_z_loss_mlp": 0.2434082, + "step": 15781, + "time_per_iteration": 2.645259141921997 + }, + { + "auxiliary_loss_clip": 0.01234895, + "auxiliary_loss_mlp": 0.00231383, + "balance_loss_clip": 1.02092183, + "balance_loss_mlp": 0.20482326, + "epoch": 0.9488651736058921, + "flos": 18368919037440.0, + "grad_norm": 42.214615418477166, + "language_loss": 0.88175082, + "learning_rate": 2.733477870890999e-08, + "loss": 0.89641368, + "num_input_tokens_seen": 340458200, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.265625, + "step": 15782, + "time_per_iteration": 2.871408224105835 + }, + { + "auxiliary_loss_clip": 0.01084597, + "auxiliary_loss_mlp": 0.0004825, + "balance_loss_clip": 0.94735539, + "balance_loss_mlp": 0.04228922, + "epoch": 0.9489252968585601, + "flos": 70084057230720.0, + "grad_norm": 1.5440127501073468, + "language_loss": 0.5933212, + "learning_rate": 2.7270645894688082e-08, + "loss": 0.60464966, + "num_input_tokens_seen": 340526420, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.05957031, + "step": 15783, + "time_per_iteration": 3.280482769012451 + }, + { + "auxiliary_loss_clip": 0.01238698, + "auxiliary_loss_mlp": 0.00219474, + "balance_loss_clip": 1.02144945, + "balance_loss_mlp": 0.19380793, + "epoch": 0.948985420111228, + "flos": 27855440490240.0, + "grad_norm": 55.29551359691361, + "language_loss": 0.79139811, + "learning_rate": 2.720658788656105e-08, + "loss": 0.80597979, + "num_input_tokens_seen": 340546325, + "router_z_loss_clip": 2.17480469, + "router_z_loss_mlp": 0.25671387, + "step": 15784, + "time_per_iteration": 2.7334346771240234 + }, + { + "auxiliary_loss_clip": 0.01234985, + "auxiliary_loss_mlp": 0.0023144, + "balance_loss_clip": 1.02104867, + "balance_loss_mlp": 0.20681109, + "epoch": 0.949045543363896, + "flos": 24316049018880.0, + "grad_norm": 5.669556360518885, + "language_loss": 0.77159965, + "learning_rate": 2.714260468695806e-08, + "loss": 0.78626394, + "num_input_tokens_seen": 340565145, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.24621582, + "step": 15785, + "time_per_iteration": 2.728456974029541 + }, + { + "auxiliary_loss_clip": 0.0124846, + "auxiliary_loss_mlp": 0.00247464, + "balance_loss_clip": 1.02793288, + "balance_loss_mlp": 0.22145236, + "epoch": 0.9491056666165639, + "flos": 24241677909120.0, + "grad_norm": 21.004689177963268, + "language_loss": 0.82455122, + "learning_rate": 2.707869629830495e-08, + "loss": 0.83951044, + "num_input_tokens_seen": 340585465, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.26025391, + "step": 15786, + "time_per_iteration": 2.703324556350708 + }, + { + "auxiliary_loss_clip": 0.01231533, + "auxiliary_loss_mlp": 0.00202256, + "balance_loss_clip": 1.01870847, + "balance_loss_mlp": 0.17744815, + "epoch": 0.949165789869232, + "flos": 24531261356160.0, + "grad_norm": 6.7348755986581725, + "language_loss": 0.86257833, + "learning_rate": 2.7014862723025335e-08, + "loss": 0.87691629, + "num_input_tokens_seen": 340606010, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.24780273, + "step": 15787, + "time_per_iteration": 2.723062753677368 + }, + { + "auxiliary_loss_clip": 0.012397, + "auxiliary_loss_mlp": 0.00204989, + "balance_loss_clip": 1.02552867, + "balance_loss_mlp": 0.18065877, + "epoch": 0.9492259131218999, + "flos": 22235348862720.0, + "grad_norm": 21.212599265297058, + "language_loss": 0.82351696, + "learning_rate": 2.6951103963540388e-08, + "loss": 0.83796388, + "num_input_tokens_seen": 340626135, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.2434082, + "step": 15788, + "time_per_iteration": 2.6704752445220947 + }, + { + "auxiliary_loss_clip": 0.01257553, + "auxiliary_loss_mlp": 0.00249951, + "balance_loss_clip": 1.03421152, + "balance_loss_mlp": 0.22153118, + "epoch": 0.9492860363745679, + "flos": 22966310632320.0, + "grad_norm": 75.5700422038416, + "language_loss": 0.80905122, + "learning_rate": 2.6887420022266848e-08, + "loss": 0.82412618, + "num_input_tokens_seen": 340644870, + "router_z_loss_clip": 2.22949219, + "router_z_loss_mlp": 0.28430176, + "step": 15789, + "time_per_iteration": 2.6900794506073 + }, + { + "auxiliary_loss_clip": 0.01250251, + "auxiliary_loss_mlp": 0.00245906, + "balance_loss_clip": 1.03248119, + "balance_loss_mlp": 0.21951291, + "epoch": 0.9493461596272358, + "flos": 18370283754240.0, + "grad_norm": 4.450523661736236, + "language_loss": 0.83416915, + "learning_rate": 2.682381090161989e-08, + "loss": 0.84913075, + "num_input_tokens_seen": 340663695, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.26379395, + "step": 15790, + "time_per_iteration": 2.6581103801727295 + }, + { + "auxiliary_loss_clip": 0.01257591, + "auxiliary_loss_mlp": 0.00220605, + "balance_loss_clip": 1.03429842, + "balance_loss_mlp": 0.19468907, + "epoch": 0.9494062828799038, + "flos": 20011724490240.0, + "grad_norm": 5.2308508273767105, + "language_loss": 0.88449275, + "learning_rate": 2.6760276604012033e-08, + "loss": 0.89927471, + "num_input_tokens_seen": 340682970, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.25927734, + "step": 15791, + "time_per_iteration": 2.695176124572754 + }, + { + "auxiliary_loss_clip": 0.01267563, + "auxiliary_loss_mlp": 0.00243452, + "balance_loss_clip": 1.04052901, + "balance_loss_mlp": 0.21707103, + "epoch": 0.9494664061325718, + "flos": 27228583313280.0, + "grad_norm": 70.8311351801471, + "language_loss": 0.83533746, + "learning_rate": 2.6696817131852234e-08, + "loss": 0.85044765, + "num_input_tokens_seen": 340702275, + "router_z_loss_clip": 2.26855469, + "router_z_loss_mlp": 0.26403809, + "step": 15792, + "time_per_iteration": 2.6934125423431396 + }, + { + "auxiliary_loss_clip": 0.01247655, + "auxiliary_loss_mlp": 0.00241071, + "balance_loss_clip": 1.0255326, + "balance_loss_mlp": 0.21396306, + "epoch": 0.9495265293852397, + "flos": 18369816877440.0, + "grad_norm": 3.441796488048126, + "language_loss": 0.86009514, + "learning_rate": 2.663343248754679e-08, + "loss": 0.87498236, + "num_input_tokens_seen": 340719060, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.27111816, + "step": 15793, + "time_per_iteration": 2.8286287784576416 + }, + { + "auxiliary_loss_clip": 0.01231329, + "auxiliary_loss_mlp": 0.00226113, + "balance_loss_clip": 1.01307726, + "balance_loss_mlp": 0.19985139, + "epoch": 0.9495866526379078, + "flos": 23075766351360.0, + "grad_norm": 2.3425633336079636, + "language_loss": 0.85555297, + "learning_rate": 2.6570122673499562e-08, + "loss": 0.87012738, + "num_input_tokens_seen": 340737815, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.26293945, + "step": 15794, + "time_per_iteration": 2.665341377258301 + }, + { + "auxiliary_loss_clip": 0.01255144, + "auxiliary_loss_mlp": 0.00232887, + "balance_loss_clip": 1.03382874, + "balance_loss_mlp": 0.2048724, + "epoch": 0.9496467758905757, + "flos": 17529902179200.0, + "grad_norm": 11.99780752676853, + "language_loss": 0.70593619, + "learning_rate": 2.650688769211107e-08, + "loss": 0.72081649, + "num_input_tokens_seen": 340756035, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.28039551, + "step": 15795, + "time_per_iteration": 2.6450936794281006 + }, + { + "auxiliary_loss_clip": 0.01241955, + "auxiliary_loss_mlp": 0.00203781, + "balance_loss_clip": 1.0286814, + "balance_loss_mlp": 0.17782864, + "epoch": 0.9497068991432437, + "flos": 24133910129280.0, + "grad_norm": 2.215066795912897, + "language_loss": 0.89060235, + "learning_rate": 2.644372754577895e-08, + "loss": 0.9050597, + "num_input_tokens_seen": 340775620, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.25964355, + "step": 15796, + "time_per_iteration": 2.7032852172851562 + }, + { + "auxiliary_loss_clip": 0.01242385, + "auxiliary_loss_mlp": 0.00226981, + "balance_loss_clip": 1.0270977, + "balance_loss_mlp": 0.20019495, + "epoch": 0.9497670223959116, + "flos": 20303319098880.0, + "grad_norm": 162.80154829777493, + "language_loss": 0.86418766, + "learning_rate": 2.6380642236898398e-08, + "loss": 0.87888134, + "num_input_tokens_seen": 340794510, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.26794434, + "step": 15797, + "time_per_iteration": 2.6758038997650146 + }, + { + "auxiliary_loss_clip": 0.01254165, + "auxiliary_loss_mlp": 0.00232599, + "balance_loss_clip": 1.03138411, + "balance_loss_mlp": 0.20582479, + "epoch": 0.9498271456485796, + "flos": 13698916099200.0, + "grad_norm": 188.7339248069587, + "language_loss": 0.79422653, + "learning_rate": 2.6317631767861727e-08, + "loss": 0.80909419, + "num_input_tokens_seen": 340812955, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.26782227, + "step": 15798, + "time_per_iteration": 2.691800594329834 + }, + { + "auxiliary_loss_clip": 0.01257335, + "auxiliary_loss_mlp": 0.00238321, + "balance_loss_clip": 1.03290868, + "balance_loss_mlp": 0.21165362, + "epoch": 0.9498872689012475, + "flos": 20814004713600.0, + "grad_norm": 28.58240724760987, + "language_loss": 0.84338123, + "learning_rate": 2.6254696141058575e-08, + "loss": 0.85833776, + "num_input_tokens_seen": 340829200, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.26696777, + "step": 15799, + "time_per_iteration": 2.652902603149414 + }, + { + "auxiliary_loss_clip": 0.01230441, + "auxiliary_loss_mlp": 0.00219745, + "balance_loss_clip": 1.02278316, + "balance_loss_mlp": 0.19730966, + "epoch": 0.9499473921539155, + "flos": 21032700670080.0, + "grad_norm": 8.430299119264365, + "language_loss": 0.77819777, + "learning_rate": 2.6191835358874814e-08, + "loss": 0.79269964, + "num_input_tokens_seen": 340848035, + "router_z_loss_clip": 2.07714844, + "router_z_loss_mlp": 0.22424316, + "step": 15800, + "time_per_iteration": 4.0798609256744385 + }, + { + "auxiliary_loss_clip": 0.01233619, + "auxiliary_loss_mlp": 0.00226236, + "balance_loss_clip": 1.02303946, + "balance_loss_mlp": 0.20087993, + "epoch": 0.9500075154065835, + "flos": 20998693468800.0, + "grad_norm": 13.177532421379135, + "language_loss": 0.77782845, + "learning_rate": 2.6129049423694315e-08, + "loss": 0.792427, + "num_input_tokens_seen": 340870025, + "router_z_loss_clip": 2.10839844, + "router_z_loss_mlp": 0.25366211, + "step": 15801, + "time_per_iteration": 4.2112648487091064 + }, + { + "auxiliary_loss_clip": 0.01241503, + "auxiliary_loss_mlp": 0.00221205, + "balance_loss_clip": 1.02448666, + "balance_loss_mlp": 0.19583774, + "epoch": 0.9500676386592515, + "flos": 25121956515840.0, + "grad_norm": 2.906936185526266, + "language_loss": 0.8666169, + "learning_rate": 2.6066338337898508e-08, + "loss": 0.88124394, + "num_input_tokens_seen": 340892290, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.25390625, + "step": 15802, + "time_per_iteration": 2.7257747650146484 + }, + { + "auxiliary_loss_clip": 0.01248146, + "auxiliary_loss_mlp": 0.00213281, + "balance_loss_clip": 1.02761185, + "balance_loss_mlp": 0.18692344, + "epoch": 0.9501277619119194, + "flos": 27523625627520.0, + "grad_norm": 91.95782266895279, + "language_loss": 0.76250482, + "learning_rate": 2.60037021038646e-08, + "loss": 0.77711904, + "num_input_tokens_seen": 340912260, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.26379395, + "step": 15803, + "time_per_iteration": 2.771003246307373 + }, + { + "auxiliary_loss_clip": 0.01244187, + "auxiliary_loss_mlp": 0.00236462, + "balance_loss_clip": 1.02728593, + "balance_loss_mlp": 0.2106652, + "epoch": 0.9501878851645874, + "flos": 20813968800000.0, + "grad_norm": 38.62721439579709, + "language_loss": 0.82368028, + "learning_rate": 2.5941140723968247e-08, + "loss": 0.83848679, + "num_input_tokens_seen": 340928930, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.25805664, + "step": 15804, + "time_per_iteration": 2.6887571811676025 + }, + { + "auxiliary_loss_clip": 0.01257666, + "auxiliary_loss_mlp": 0.00242847, + "balance_loss_clip": 1.03739059, + "balance_loss_mlp": 0.21699041, + "epoch": 0.9502480084172553, + "flos": 18369385914240.0, + "grad_norm": 27.974530026838927, + "language_loss": 0.80130172, + "learning_rate": 2.5878654200581775e-08, + "loss": 0.81630683, + "num_input_tokens_seen": 340946615, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.25842285, + "step": 15805, + "time_per_iteration": 2.723966360092163 + }, + { + "auxiliary_loss_clip": 0.01251312, + "auxiliary_loss_mlp": 0.00228244, + "balance_loss_clip": 1.0321523, + "balance_loss_mlp": 0.20392539, + "epoch": 0.9503081316699233, + "flos": 23549607590400.0, + "grad_norm": 5.876291151439991, + "language_loss": 0.86986649, + "learning_rate": 2.5816242536074618e-08, + "loss": 0.88466203, + "num_input_tokens_seen": 340967545, + "router_z_loss_clip": 2.18847656, + "router_z_loss_mlp": 0.24304199, + "step": 15806, + "time_per_iteration": 2.7442362308502197 + }, + { + "auxiliary_loss_clip": 0.01255911, + "auxiliary_loss_mlp": 0.00212589, + "balance_loss_clip": 1.03265548, + "balance_loss_mlp": 0.18810326, + "epoch": 0.9503682549225914, + "flos": 18040444139520.0, + "grad_norm": 13.598647980095388, + "language_loss": 0.91444814, + "learning_rate": 2.5753905732813108e-08, + "loss": 0.92913318, + "num_input_tokens_seen": 340984955, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.24511719, + "step": 15807, + "time_per_iteration": 2.665917158126831 + }, + { + "auxiliary_loss_clip": 0.01230484, + "auxiliary_loss_mlp": 0.00225407, + "balance_loss_clip": 1.02362561, + "balance_loss_mlp": 0.20231596, + "epoch": 0.9504283781752593, + "flos": 25886135387520.0, + "grad_norm": 432.7997886520582, + "language_loss": 0.8009311, + "learning_rate": 2.5691643793161355e-08, + "loss": 0.81549001, + "num_input_tokens_seen": 341007300, + "router_z_loss_clip": 2.0703125, + "router_z_loss_mlp": 0.2310791, + "step": 15808, + "time_per_iteration": 2.744380235671997 + }, + { + "auxiliary_loss_clip": 0.01238264, + "auxiliary_loss_mlp": 0.00231731, + "balance_loss_clip": 1.02418756, + "balance_loss_mlp": 0.20608936, + "epoch": 0.9504885014279273, + "flos": 22124025636480.0, + "grad_norm": 54.90380081068397, + "language_loss": 0.75802827, + "learning_rate": 2.562945671948058e-08, + "loss": 0.7727282, + "num_input_tokens_seen": 341026695, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.25671387, + "step": 15809, + "time_per_iteration": 4.1642937660217285 + }, + { + "auxiliary_loss_clip": 0.01236566, + "auxiliary_loss_mlp": 0.00218693, + "balance_loss_clip": 1.02084446, + "balance_loss_mlp": 0.1942198, + "epoch": 0.9505486246805952, + "flos": 21615961714560.0, + "grad_norm": 62.046279382403114, + "language_loss": 0.83705997, + "learning_rate": 2.5567344514128452e-08, + "loss": 0.85161257, + "num_input_tokens_seen": 341047080, + "router_z_loss_clip": 2.15722656, + "router_z_loss_mlp": 0.24487305, + "step": 15810, + "time_per_iteration": 2.7499430179595947 + }, + { + "auxiliary_loss_clip": 0.01248478, + "auxiliary_loss_mlp": 0.00206142, + "balance_loss_clip": 1.02388275, + "balance_loss_mlp": 0.18070242, + "epoch": 0.9506087479332632, + "flos": 22528236360960.0, + "grad_norm": 19.51031163321085, + "language_loss": 0.88401151, + "learning_rate": 2.5505307179460643e-08, + "loss": 0.89855772, + "num_input_tokens_seen": 341067310, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.25427246, + "step": 15811, + "time_per_iteration": 4.119182109832764 + }, + { + "auxiliary_loss_clip": 0.01236521, + "auxiliary_loss_mlp": 0.00210729, + "balance_loss_clip": 1.02386701, + "balance_loss_mlp": 0.18710166, + "epoch": 0.9506688711859311, + "flos": 27527360641920.0, + "grad_norm": 37.65993977337847, + "language_loss": 0.7821309, + "learning_rate": 2.5443344717829495e-08, + "loss": 0.79660344, + "num_input_tokens_seen": 341085110, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.23632812, + "step": 15812, + "time_per_iteration": 2.6937525272369385 + }, + { + "auxiliary_loss_clip": 0.01259926, + "auxiliary_loss_mlp": 0.00231235, + "balance_loss_clip": 1.03974652, + "balance_loss_mlp": 0.20568863, + "epoch": 0.9507289944385992, + "flos": 19865783531520.0, + "grad_norm": 38.169982487320624, + "language_loss": 0.7111901, + "learning_rate": 2.538145713158446e-08, + "loss": 0.72610164, + "num_input_tokens_seen": 341103190, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.25524902, + "step": 15813, + "time_per_iteration": 2.711503028869629 + }, + { + "auxiliary_loss_clip": 0.01249919, + "auxiliary_loss_mlp": 0.00228486, + "balance_loss_clip": 1.02822697, + "balance_loss_mlp": 0.20350008, + "epoch": 0.9507891176912671, + "flos": 25193274969600.0, + "grad_norm": 17.16236292377557, + "language_loss": 0.76454437, + "learning_rate": 2.5319644423072327e-08, + "loss": 0.77932847, + "num_input_tokens_seen": 341125695, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.25, + "step": 15814, + "time_per_iteration": 2.7025868892669678 + }, + { + "auxiliary_loss_clip": 0.01226529, + "auxiliary_loss_mlp": 0.00230603, + "balance_loss_clip": 1.01698017, + "balance_loss_mlp": 0.20846581, + "epoch": 0.9508492409439351, + "flos": 24899561458560.0, + "grad_norm": 65.49330742687987, + "language_loss": 0.73092973, + "learning_rate": 2.5257906594637445e-08, + "loss": 0.74550104, + "num_input_tokens_seen": 341143930, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.22143555, + "step": 15815, + "time_per_iteration": 2.714184045791626 + }, + { + "auxiliary_loss_clip": 0.01235814, + "auxiliary_loss_mlp": 0.00213862, + "balance_loss_clip": 1.02276266, + "balance_loss_mlp": 0.19133148, + "epoch": 0.950909364196603, + "flos": 29784094375680.0, + "grad_norm": 4.768734191581832, + "language_loss": 0.63186324, + "learning_rate": 2.519624364862061e-08, + "loss": 0.64635998, + "num_input_tokens_seen": 341164280, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.22521973, + "step": 15816, + "time_per_iteration": 2.7738850116729736 + }, + { + "auxiliary_loss_clip": 0.01255216, + "auxiliary_loss_mlp": 0.00248135, + "balance_loss_clip": 1.03621101, + "balance_loss_mlp": 0.22311309, + "epoch": 0.950969487449271, + "flos": 24717781704960.0, + "grad_norm": 63.51649822574304, + "language_loss": 0.791291, + "learning_rate": 2.513465558735994e-08, + "loss": 0.80632454, + "num_input_tokens_seen": 341183670, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.25012207, + "step": 15817, + "time_per_iteration": 2.767761707305908 + }, + { + "auxiliary_loss_clip": 0.01245371, + "auxiliary_loss_mlp": 0.00233579, + "balance_loss_clip": 1.02439094, + "balance_loss_mlp": 0.20663744, + "epoch": 0.9510296107019389, + "flos": 13699167494400.0, + "grad_norm": 13.631983546026476, + "language_loss": 0.68452245, + "learning_rate": 2.5073142413190918e-08, + "loss": 0.69931197, + "num_input_tokens_seen": 341201900, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.26953125, + "step": 15818, + "time_per_iteration": 2.633450984954834 + }, + { + "auxiliary_loss_clip": 0.01251864, + "auxiliary_loss_mlp": 0.00233413, + "balance_loss_clip": 1.03431439, + "balance_loss_mlp": 0.20730636, + "epoch": 0.9510897339546069, + "flos": 17311852667520.0, + "grad_norm": 16.665778810156606, + "language_loss": 0.77738762, + "learning_rate": 2.5011704128446552e-08, + "loss": 0.79224038, + "num_input_tokens_seen": 341218340, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.26135254, + "step": 15819, + "time_per_iteration": 2.612177610397339 + }, + { + "auxiliary_loss_clip": 0.01254017, + "auxiliary_loss_mlp": 0.00219544, + "balance_loss_clip": 1.0339365, + "balance_loss_mlp": 0.19448633, + "epoch": 0.951149857207275, + "flos": 14793940166400.0, + "grad_norm": 69.09766270961251, + "language_loss": 0.819929, + "learning_rate": 2.49503407354561e-08, + "loss": 0.83466464, + "num_input_tokens_seen": 341235885, + "router_z_loss_clip": 2.20019531, + "router_z_loss_mlp": 0.25061035, + "step": 15820, + "time_per_iteration": 2.659472703933716 + }, + { + "auxiliary_loss_clip": 0.01266789, + "auxiliary_loss_mlp": 0.00234878, + "balance_loss_clip": 1.04197502, + "balance_loss_mlp": 0.20742384, + "epoch": 0.9512099804599429, + "flos": 19391152193280.0, + "grad_norm": 16.6682541806507, + "language_loss": 0.8548826, + "learning_rate": 2.4889052236546804e-08, + "loss": 0.86989927, + "num_input_tokens_seen": 341255280, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.2746582, + "step": 15821, + "time_per_iteration": 2.611767530441284 + }, + { + "auxiliary_loss_clip": 0.01229726, + "auxiliary_loss_mlp": 0.00222343, + "balance_loss_clip": 1.01658607, + "balance_loss_mlp": 0.19724897, + "epoch": 0.9512701037126109, + "flos": 36757874885760.0, + "grad_norm": 21.824817660886406, + "language_loss": 0.7621069, + "learning_rate": 2.4827838634042586e-08, + "loss": 0.7766276, + "num_input_tokens_seen": 341279055, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.25085449, + "step": 15822, + "time_per_iteration": 2.821018934249878 + }, + { + "auxiliary_loss_clip": 0.01233222, + "auxiliary_loss_mlp": 0.00211161, + "balance_loss_clip": 1.01894116, + "balance_loss_mlp": 0.18787916, + "epoch": 0.9513302269652788, + "flos": 22638266697600.0, + "grad_norm": 5.441965348864809, + "language_loss": 0.74043095, + "learning_rate": 2.47666999302647e-08, + "loss": 0.75487483, + "num_input_tokens_seen": 341298560, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.23291016, + "step": 15823, + "time_per_iteration": 2.642061710357666 + }, + { + "auxiliary_loss_clip": 0.01230477, + "auxiliary_loss_mlp": 0.00234623, + "balance_loss_clip": 1.01905215, + "balance_loss_mlp": 0.20825371, + "epoch": 0.9513903502179468, + "flos": 22893232412160.0, + "grad_norm": 214.19550912266715, + "language_loss": 0.84060454, + "learning_rate": 2.4705636127531292e-08, + "loss": 0.8552556, + "num_input_tokens_seen": 341316650, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.26367188, + "step": 15824, + "time_per_iteration": 2.685042381286621 + }, + { + "auxiliary_loss_clip": 0.01257571, + "auxiliary_loss_mlp": 0.00204022, + "balance_loss_clip": 1.02976084, + "balance_loss_mlp": 0.17783222, + "epoch": 0.9514504734706147, + "flos": 27928626451200.0, + "grad_norm": 26.62635312743441, + "language_loss": 0.85215592, + "learning_rate": 2.4644647228158065e-08, + "loss": 0.86677188, + "num_input_tokens_seen": 341336185, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.26208496, + "step": 15825, + "time_per_iteration": 2.716860055923462 + }, + { + "auxiliary_loss_clip": 0.01091333, + "auxiliary_loss_mlp": 0.00075097, + "balance_loss_clip": 0.95536971, + "balance_loss_mlp": 0.06746782, + "epoch": 0.9515105967232828, + "flos": 67366767312000.0, + "grad_norm": 0.7928457181296782, + "language_loss": 0.52059829, + "learning_rate": 2.458373323445806e-08, + "loss": 0.53226256, + "num_input_tokens_seen": 341395795, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.07617188, + "step": 15826, + "time_per_iteration": 3.1069536209106445 + }, + { + "auxiliary_loss_clip": 0.01244294, + "auxiliary_loss_mlp": 0.00216215, + "balance_loss_clip": 1.02161682, + "balance_loss_mlp": 0.19081165, + "epoch": 0.9515707199759507, + "flos": 25846525664640.0, + "grad_norm": 120.43605357139815, + "language_loss": 0.82052052, + "learning_rate": 2.452289414874076e-08, + "loss": 0.83512557, + "num_input_tokens_seen": 341415675, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.25390625, + "step": 15827, + "time_per_iteration": 2.74379825592041 + }, + { + "auxiliary_loss_clip": 0.01237683, + "auxiliary_loss_mlp": 0.00232185, + "balance_loss_clip": 1.02304697, + "balance_loss_mlp": 0.20823607, + "epoch": 0.9516308432286187, + "flos": 21828983322240.0, + "grad_norm": 453.8631536641007, + "language_loss": 0.82970583, + "learning_rate": 2.4462129973313207e-08, + "loss": 0.84440458, + "num_input_tokens_seen": 341432990, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.23925781, + "step": 15828, + "time_per_iteration": 2.7452690601348877 + }, + { + "auxiliary_loss_clip": 0.01220302, + "auxiliary_loss_mlp": 0.00224835, + "balance_loss_clip": 1.00999951, + "balance_loss_mlp": 0.20008703, + "epoch": 0.9516909664812866, + "flos": 27269593666560.0, + "grad_norm": 32.369517875903504, + "language_loss": 0.7913242, + "learning_rate": 2.440144071047978e-08, + "loss": 0.80577558, + "num_input_tokens_seen": 341454100, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.24731445, + "step": 15829, + "time_per_iteration": 2.729369640350342 + }, + { + "auxiliary_loss_clip": 0.01239557, + "auxiliary_loss_mlp": 0.00216856, + "balance_loss_clip": 1.0223707, + "balance_loss_mlp": 0.19388404, + "epoch": 0.9517510897339546, + "flos": 21215342350080.0, + "grad_norm": 87.64141959212265, + "language_loss": 0.69429004, + "learning_rate": 2.4340826362541533e-08, + "loss": 0.7088542, + "num_input_tokens_seen": 341472955, + "router_z_loss_clip": 2.17285156, + "router_z_loss_mlp": 0.22973633, + "step": 15830, + "time_per_iteration": 2.649383068084717 + }, + { + "auxiliary_loss_clip": 0.01248624, + "auxiliary_loss_mlp": 0.00228171, + "balance_loss_clip": 1.02809107, + "balance_loss_mlp": 0.20237377, + "epoch": 0.9518112129866225, + "flos": 18733986915840.0, + "grad_norm": 33.45550860641747, + "language_loss": 0.81951559, + "learning_rate": 2.428028693179729e-08, + "loss": 0.83428359, + "num_input_tokens_seen": 341490165, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.2578125, + "step": 15831, + "time_per_iteration": 2.696910858154297 + }, + { + "auxiliary_loss_clip": 0.01221581, + "auxiliary_loss_mlp": 0.00214675, + "balance_loss_clip": 1.0099299, + "balance_loss_mlp": 0.19169113, + "epoch": 0.9518713362392905, + "flos": 16763676232320.0, + "grad_norm": 1766.2300499468508, + "language_loss": 0.74046969, + "learning_rate": 2.4219822420542545e-08, + "loss": 0.75483221, + "num_input_tokens_seen": 341508055, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.2298584, + "step": 15832, + "time_per_iteration": 2.718071222305298 + }, + { + "auxiliary_loss_clip": 0.01221477, + "auxiliary_loss_mlp": 0.00210261, + "balance_loss_clip": 1.01167262, + "balance_loss_mlp": 0.18700278, + "epoch": 0.9519314594919586, + "flos": 15230649720960.0, + "grad_norm": 14.52903486153226, + "language_loss": 0.86416197, + "learning_rate": 2.4159432831070135e-08, + "loss": 0.87847936, + "num_input_tokens_seen": 341526155, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.23266602, + "step": 15833, + "time_per_iteration": 2.6821751594543457 + }, + { + "auxiliary_loss_clip": 0.01238676, + "auxiliary_loss_mlp": 0.00227988, + "balance_loss_clip": 1.02014637, + "balance_loss_mlp": 0.20363332, + "epoch": 0.9519915827446265, + "flos": 19352943100800.0, + "grad_norm": 7.299862758045733, + "language_loss": 0.85776585, + "learning_rate": 2.4099118165670007e-08, + "loss": 0.87243253, + "num_input_tokens_seen": 341540450, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.2434082, + "step": 15834, + "time_per_iteration": 2.6397647857666016 + }, + { + "auxiliary_loss_clip": 0.01253608, + "auxiliary_loss_mlp": 0.00221663, + "balance_loss_clip": 1.02820647, + "balance_loss_mlp": 0.19447155, + "epoch": 0.9520517059972945, + "flos": 22266303408000.0, + "grad_norm": 31.505206182640507, + "language_loss": 0.84827602, + "learning_rate": 2.4038878426629216e-08, + "loss": 0.86302871, + "num_input_tokens_seen": 341557865, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.27172852, + "step": 15835, + "time_per_iteration": 2.6756808757781982 + }, + { + "auxiliary_loss_clip": 0.01247187, + "auxiliary_loss_mlp": 0.00235921, + "balance_loss_clip": 1.02731669, + "balance_loss_mlp": 0.21025509, + "epoch": 0.9521118292499624, + "flos": 14862313704960.0, + "grad_norm": 149.68162431699727, + "language_loss": 0.76962686, + "learning_rate": 2.397871361623238e-08, + "loss": 0.78445792, + "num_input_tokens_seen": 341573890, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.25671387, + "step": 15836, + "time_per_iteration": 2.608229398727417 + }, + { + "auxiliary_loss_clip": 0.01245938, + "auxiliary_loss_mlp": 0.00218451, + "balance_loss_clip": 1.03066587, + "balance_loss_mlp": 0.19327396, + "epoch": 0.9521719525026304, + "flos": 23508812718720.0, + "grad_norm": 34.112026010186405, + "language_loss": 0.76744372, + "learning_rate": 2.391862373676057e-08, + "loss": 0.78208756, + "num_input_tokens_seen": 341593770, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.25183105, + "step": 15837, + "time_per_iteration": 2.683856725692749 + }, + { + "auxiliary_loss_clip": 0.01233756, + "auxiliary_loss_mlp": 0.0024266, + "balance_loss_clip": 1.01159692, + "balance_loss_mlp": 0.21540861, + "epoch": 0.9522320757552983, + "flos": 19714922409600.0, + "grad_norm": 91.022437355982, + "language_loss": 0.81142086, + "learning_rate": 2.3858608790492617e-08, + "loss": 0.82618505, + "num_input_tokens_seen": 341612065, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.2722168, + "step": 15838, + "time_per_iteration": 2.6518428325653076 + }, + { + "auxiliary_loss_clip": 0.0125046, + "auxiliary_loss_mlp": 0.00226315, + "balance_loss_clip": 1.02868783, + "balance_loss_mlp": 0.20087577, + "epoch": 0.9522921990079664, + "flos": 25921291824000.0, + "grad_norm": 38.86694199492652, + "language_loss": 0.86085773, + "learning_rate": 2.379866877970449e-08, + "loss": 0.87562549, + "num_input_tokens_seen": 341631365, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.25427246, + "step": 15839, + "time_per_iteration": 2.7992429733276367 + }, + { + "auxiliary_loss_clip": 0.01236438, + "auxiliary_loss_mlp": 0.00208777, + "balance_loss_clip": 1.01970875, + "balance_loss_mlp": 0.18344527, + "epoch": 0.9523523222606343, + "flos": 19208115463680.0, + "grad_norm": 3.7964539939132242, + "language_loss": 0.87128985, + "learning_rate": 2.3738803706668585e-08, + "loss": 0.88574201, + "num_input_tokens_seen": 341650300, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.2532959, + "step": 15840, + "time_per_iteration": 2.688999652862549 + }, + { + "auxiliary_loss_clip": 0.01223907, + "auxiliary_loss_mlp": 0.00227297, + "balance_loss_clip": 1.01324701, + "balance_loss_mlp": 0.20347893, + "epoch": 0.9524124455133023, + "flos": 20921269703040.0, + "grad_norm": 5.581072108015814, + "language_loss": 0.79206991, + "learning_rate": 2.3679013573655314e-08, + "loss": 0.80658197, + "num_input_tokens_seen": 341667680, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.23803711, + "step": 15841, + "time_per_iteration": 2.6731691360473633 + }, + { + "auxiliary_loss_clip": 0.0123923, + "auxiliary_loss_mlp": 0.00228328, + "balance_loss_clip": 1.02756381, + "balance_loss_mlp": 0.20381817, + "epoch": 0.9524725687659702, + "flos": 18843550375680.0, + "grad_norm": 5.3916060844006575, + "language_loss": 0.8748824, + "learning_rate": 2.3619298382931972e-08, + "loss": 0.88955796, + "num_input_tokens_seen": 341685760, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.24487305, + "step": 15842, + "time_per_iteration": 4.064564943313599 + }, + { + "auxiliary_loss_clip": 0.01253513, + "auxiliary_loss_mlp": 0.00220816, + "balance_loss_clip": 1.03129387, + "balance_loss_mlp": 0.19394675, + "epoch": 0.9525326920186382, + "flos": 22674680110080.0, + "grad_norm": 4.697284272879988, + "language_loss": 0.81310779, + "learning_rate": 2.3559658136762973e-08, + "loss": 0.82785112, + "num_input_tokens_seen": 341705300, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.26904297, + "step": 15843, + "time_per_iteration": 2.647260904312134 + }, + { + "auxiliary_loss_clip": 0.01249675, + "auxiliary_loss_mlp": 0.0021396, + "balance_loss_clip": 1.02622592, + "balance_loss_mlp": 0.1886514, + "epoch": 0.9525928152713061, + "flos": 22086642556800.0, + "grad_norm": 46.88213696738439, + "language_loss": 0.84954143, + "learning_rate": 2.3500092837409612e-08, + "loss": 0.86417782, + "num_input_tokens_seen": 341724565, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.25305176, + "step": 15844, + "time_per_iteration": 4.122177362442017 + }, + { + "auxiliary_loss_clip": 0.01251968, + "auxiliary_loss_mlp": 0.00240796, + "balance_loss_clip": 1.02378774, + "balance_loss_mlp": 0.2146412, + "epoch": 0.9526529385239741, + "flos": 20704728562560.0, + "grad_norm": 10.241567782462893, + "language_loss": 0.82662791, + "learning_rate": 2.3440602487130977e-08, + "loss": 0.8415556, + "num_input_tokens_seen": 341743605, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.26159668, + "step": 15845, + "time_per_iteration": 2.706728219985962 + }, + { + "auxiliary_loss_clip": 0.01262623, + "auxiliary_loss_mlp": 0.00217962, + "balance_loss_clip": 1.03659177, + "balance_loss_mlp": 0.19292837, + "epoch": 0.9527130617766422, + "flos": 23368043318400.0, + "grad_norm": 61.06808207929735, + "language_loss": 0.81801969, + "learning_rate": 2.338118708818282e-08, + "loss": 0.83282554, + "num_input_tokens_seen": 341763475, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.25048828, + "step": 15846, + "time_per_iteration": 2.6651999950408936 + }, + { + "auxiliary_loss_clip": 0.01226909, + "auxiliary_loss_mlp": 0.00204859, + "balance_loss_clip": 1.01405263, + "balance_loss_mlp": 0.18137513, + "epoch": 0.9527731850293101, + "flos": 18985935888000.0, + "grad_norm": 14.655008896201254, + "language_loss": 0.85936385, + "learning_rate": 2.3321846642817998e-08, + "loss": 0.87368155, + "num_input_tokens_seen": 341781265, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.23474121, + "step": 15847, + "time_per_iteration": 2.6938254833221436 + }, + { + "auxiliary_loss_clip": 0.01236661, + "auxiliary_loss_mlp": 0.00214498, + "balance_loss_clip": 1.02083135, + "balance_loss_mlp": 0.18940419, + "epoch": 0.9528333082819781, + "flos": 19318038059520.0, + "grad_norm": 12.248967435863854, + "language_loss": 0.85911191, + "learning_rate": 2.326258115328672e-08, + "loss": 0.87362349, + "num_input_tokens_seen": 341798825, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.25109863, + "step": 15848, + "time_per_iteration": 2.6351847648620605 + }, + { + "auxiliary_loss_clip": 0.01265045, + "auxiliary_loss_mlp": 0.00248797, + "balance_loss_clip": 1.04193234, + "balance_loss_mlp": 0.22062817, + "epoch": 0.952893431534646, + "flos": 23951340276480.0, + "grad_norm": 14.14838595563198, + "language_loss": 0.82728994, + "learning_rate": 2.320339062183674e-08, + "loss": 0.84242839, + "num_input_tokens_seen": 341819480, + "router_z_loss_clip": 2.23144531, + "router_z_loss_mlp": 0.28161621, + "step": 15849, + "time_per_iteration": 2.6979191303253174 + }, + { + "auxiliary_loss_clip": 0.01264244, + "auxiliary_loss_mlp": 0.00258588, + "balance_loss_clip": 1.03667545, + "balance_loss_mlp": 0.22994165, + "epoch": 0.952953554787314, + "flos": 21030545854080.0, + "grad_norm": 18.23138867739095, + "language_loss": 0.81977415, + "learning_rate": 2.314427505071226e-08, + "loss": 0.83500254, + "num_input_tokens_seen": 341838035, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.28637695, + "step": 15850, + "time_per_iteration": 2.6633379459381104 + }, + { + "auxiliary_loss_clip": 0.01238741, + "auxiliary_loss_mlp": 0.00237195, + "balance_loss_clip": 1.02703977, + "balance_loss_mlp": 0.21356785, + "epoch": 0.9530136780399819, + "flos": 22382870019840.0, + "grad_norm": 27.608871880433526, + "language_loss": 0.80675769, + "learning_rate": 2.308523444215482e-08, + "loss": 0.82151711, + "num_input_tokens_seen": 341855895, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.23608398, + "step": 15851, + "time_per_iteration": 4.108676195144653 + }, + { + "auxiliary_loss_clip": 0.01243376, + "auxiliary_loss_mlp": 0.00213787, + "balance_loss_clip": 1.02339792, + "balance_loss_mlp": 0.18915869, + "epoch": 0.95307380129265, + "flos": 22159613036160.0, + "grad_norm": 3.4991070710054166, + "language_loss": 0.87152362, + "learning_rate": 2.3026268798403525e-08, + "loss": 0.88609529, + "num_input_tokens_seen": 341875240, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.24621582, + "step": 15852, + "time_per_iteration": 2.705815076828003 + }, + { + "auxiliary_loss_clip": 0.0124305, + "auxiliary_loss_mlp": 0.00218618, + "balance_loss_clip": 1.02584934, + "balance_loss_mlp": 0.19367908, + "epoch": 0.9531339245453179, + "flos": 44022747214080.0, + "grad_norm": 23.695794842561423, + "language_loss": 0.67846072, + "learning_rate": 2.2967378121694138e-08, + "loss": 0.69307733, + "num_input_tokens_seen": 341901020, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.24938965, + "step": 15853, + "time_per_iteration": 4.358322381973267 + }, + { + "auxiliary_loss_clip": 0.01214701, + "auxiliary_loss_mlp": 0.00198441, + "balance_loss_clip": 1.0070343, + "balance_loss_mlp": 0.17525478, + "epoch": 0.9531940477979859, + "flos": 20266690204800.0, + "grad_norm": 5.8159073689184995, + "language_loss": 0.80295396, + "learning_rate": 2.290856241425998e-08, + "loss": 0.81708539, + "num_input_tokens_seen": 341919365, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.23181152, + "step": 15854, + "time_per_iteration": 2.713099479675293 + }, + { + "auxiliary_loss_clip": 0.01243084, + "auxiliary_loss_mlp": 0.00211816, + "balance_loss_clip": 1.02424479, + "balance_loss_mlp": 0.18690088, + "epoch": 0.9532541710506538, + "flos": 25335732309120.0, + "grad_norm": 2.1595460111099425, + "language_loss": 0.75753617, + "learning_rate": 2.284982167833127e-08, + "loss": 0.77208513, + "num_input_tokens_seen": 341939985, + "router_z_loss_clip": 2.19042969, + "router_z_loss_mlp": 0.24902344, + "step": 15855, + "time_per_iteration": 2.7499821186065674 + }, + { + "auxiliary_loss_clip": 0.01233651, + "auxiliary_loss_mlp": 0.00232343, + "balance_loss_clip": 1.01551747, + "balance_loss_mlp": 0.20670119, + "epoch": 0.9533142943033218, + "flos": 26469288691200.0, + "grad_norm": 440.85673392297986, + "language_loss": 0.84044361, + "learning_rate": 2.279115591613556e-08, + "loss": 0.85510361, + "num_input_tokens_seen": 341959255, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.25634766, + "step": 15856, + "time_per_iteration": 2.7109339237213135 + }, + { + "auxiliary_loss_clip": 0.0123782, + "auxiliary_loss_mlp": 0.00230438, + "balance_loss_clip": 1.02240372, + "balance_loss_mlp": 0.20596385, + "epoch": 0.9533744175559897, + "flos": 23656944407040.0, + "grad_norm": 24.924025841971236, + "language_loss": 0.85832417, + "learning_rate": 2.2732565129897075e-08, + "loss": 0.8730067, + "num_input_tokens_seen": 341977205, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.24475098, + "step": 15857, + "time_per_iteration": 2.705632448196411 + }, + { + "auxiliary_loss_clip": 0.01091418, + "auxiliary_loss_mlp": 0.00072602, + "balance_loss_clip": 0.95501828, + "balance_loss_mlp": 0.06630792, + "epoch": 0.9534345408086577, + "flos": 61052055500160.0, + "grad_norm": 0.6852831605729798, + "language_loss": 0.61741602, + "learning_rate": 2.267404932183803e-08, + "loss": 0.62905627, + "num_input_tokens_seen": 342038545, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.06298828, + "step": 15858, + "time_per_iteration": 3.152625322341919 + }, + { + "auxiliary_loss_clip": 0.01223646, + "auxiliary_loss_mlp": 0.0022611, + "balance_loss_clip": 1.01015496, + "balance_loss_mlp": 0.20055123, + "epoch": 0.9534946640613258, + "flos": 18951677291520.0, + "grad_norm": 245.6938377183869, + "language_loss": 0.63467133, + "learning_rate": 2.2615608494177097e-08, + "loss": 0.64916891, + "num_input_tokens_seen": 342058195, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.2557373, + "step": 15859, + "time_per_iteration": 2.6484739780426025 + }, + { + "auxiliary_loss_clip": 0.0123293, + "auxiliary_loss_mlp": 0.00226585, + "balance_loss_clip": 1.0207448, + "balance_loss_mlp": 0.2032681, + "epoch": 0.9535547873139937, + "flos": 16654292340480.0, + "grad_norm": 18.507550229277935, + "language_loss": 0.90853316, + "learning_rate": 2.2557242649130504e-08, + "loss": 0.92312831, + "num_input_tokens_seen": 342075025, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.23327637, + "step": 15860, + "time_per_iteration": 2.718653440475464 + }, + { + "auxiliary_loss_clip": 0.01242793, + "auxiliary_loss_mlp": 0.00219144, + "balance_loss_clip": 1.02823436, + "balance_loss_mlp": 0.19501579, + "epoch": 0.9536149105666617, + "flos": 20667776446080.0, + "grad_norm": 15.936233713435355, + "language_loss": 0.76070714, + "learning_rate": 2.249895178891159e-08, + "loss": 0.77532649, + "num_input_tokens_seen": 342094595, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.24108887, + "step": 15861, + "time_per_iteration": 2.641321897506714 + }, + { + "auxiliary_loss_clip": 0.0126462, + "auxiliary_loss_mlp": 0.00233473, + "balance_loss_clip": 1.03959179, + "balance_loss_mlp": 0.20789102, + "epoch": 0.9536750338193296, + "flos": 30700499086080.0, + "grad_norm": 86.24497226355157, + "language_loss": 0.73860776, + "learning_rate": 2.244073591573037e-08, + "loss": 0.75358868, + "num_input_tokens_seen": 342115970, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.25585938, + "step": 15862, + "time_per_iteration": 2.736992597579956 + }, + { + "auxiliary_loss_clip": 0.01221601, + "auxiliary_loss_mlp": 0.00203956, + "balance_loss_clip": 1.01649141, + "balance_loss_mlp": 0.18286785, + "epoch": 0.9537351570719976, + "flos": 20405484357120.0, + "grad_norm": 8.196515165240898, + "language_loss": 0.75634778, + "learning_rate": 2.238259503179485e-08, + "loss": 0.77060342, + "num_input_tokens_seen": 342134080, + "router_z_loss_clip": 2.04882812, + "router_z_loss_mlp": 0.2109375, + "step": 15863, + "time_per_iteration": 2.640151262283325 + }, + { + "auxiliary_loss_clip": 0.01238076, + "auxiliary_loss_mlp": 0.00245316, + "balance_loss_clip": 1.02377737, + "balance_loss_mlp": 0.2198168, + "epoch": 0.9537952803246655, + "flos": 29929245235200.0, + "grad_norm": 18.601700691740344, + "language_loss": 0.85299253, + "learning_rate": 2.2324529139309267e-08, + "loss": 0.8678264, + "num_input_tokens_seen": 342154725, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.25524902, + "step": 15864, + "time_per_iteration": 2.7463579177856445 + }, + { + "auxiliary_loss_clip": 0.01229441, + "auxiliary_loss_mlp": 0.00220741, + "balance_loss_clip": 1.01817966, + "balance_loss_mlp": 0.19633929, + "epoch": 0.9538554035773336, + "flos": 20521404524160.0, + "grad_norm": 30.934684706518723, + "language_loss": 0.71287978, + "learning_rate": 2.226653824047586e-08, + "loss": 0.72738159, + "num_input_tokens_seen": 342172275, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.24401855, + "step": 15865, + "time_per_iteration": 2.636202812194824 + }, + { + "auxiliary_loss_clip": 0.01244949, + "auxiliary_loss_mlp": 0.00216886, + "balance_loss_clip": 1.0236721, + "balance_loss_mlp": 0.19334209, + "epoch": 0.9539155268300015, + "flos": 18406517598720.0, + "grad_norm": 34.359420817967816, + "language_loss": 0.7856338, + "learning_rate": 2.2208622337493765e-08, + "loss": 0.8002522, + "num_input_tokens_seen": 342190880, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.23547363, + "step": 15866, + "time_per_iteration": 2.7195987701416016 + }, + { + "auxiliary_loss_clip": 0.01251148, + "auxiliary_loss_mlp": 0.00234279, + "balance_loss_clip": 1.03313816, + "balance_loss_mlp": 0.20881617, + "epoch": 0.9539756500826695, + "flos": 26213281482240.0, + "grad_norm": 6.295607558188785, + "language_loss": 0.94648206, + "learning_rate": 2.215078143255855e-08, + "loss": 0.96133637, + "num_input_tokens_seen": 342208165, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.25488281, + "step": 15867, + "time_per_iteration": 2.692237138748169 + }, + { + "auxiliary_loss_clip": 0.01084833, + "auxiliary_loss_mlp": 0.00085535, + "balance_loss_clip": 0.94769204, + "balance_loss_mlp": 0.07814423, + "epoch": 0.9540357733353374, + "flos": 68289097766400.0, + "grad_norm": 0.7573580884529483, + "language_loss": 0.61441362, + "learning_rate": 2.2093015527864024e-08, + "loss": 0.62611729, + "num_input_tokens_seen": 342277110, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.07373047, + "step": 15868, + "time_per_iteration": 3.2424678802490234 + }, + { + "auxiliary_loss_clip": 0.01246118, + "auxiliary_loss_mlp": 0.00227553, + "balance_loss_clip": 1.02882481, + "balance_loss_mlp": 0.20138651, + "epoch": 0.9540958965880054, + "flos": 21288276915840.0, + "grad_norm": 25.27196440063528, + "language_loss": 0.69949901, + "learning_rate": 2.2035324625600425e-08, + "loss": 0.71423566, + "num_input_tokens_seen": 342294695, + "router_z_loss_clip": 2.16894531, + "router_z_loss_mlp": 0.26208496, + "step": 15869, + "time_per_iteration": 2.731685161590576 + }, + { + "auxiliary_loss_clip": 0.01239651, + "auxiliary_loss_mlp": 0.00222193, + "balance_loss_clip": 1.02631593, + "balance_loss_mlp": 0.19773081, + "epoch": 0.9541560198406733, + "flos": 19751407649280.0, + "grad_norm": 23.23193031247423, + "language_loss": 0.77223408, + "learning_rate": 2.197770872795579e-08, + "loss": 0.78685248, + "num_input_tokens_seen": 342314970, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.24462891, + "step": 15870, + "time_per_iteration": 2.6886661052703857 + }, + { + "auxiliary_loss_clip": 0.01223564, + "auxiliary_loss_mlp": 0.00219146, + "balance_loss_clip": 1.0134989, + "balance_loss_mlp": 0.19517308, + "epoch": 0.9542161430933414, + "flos": 24715626888960.0, + "grad_norm": 10.392400797034151, + "language_loss": 0.84210193, + "learning_rate": 2.1920167837114368e-08, + "loss": 0.856529, + "num_input_tokens_seen": 342334255, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.23974609, + "step": 15871, + "time_per_iteration": 2.801481246948242 + }, + { + "auxiliary_loss_clip": 0.01249257, + "auxiliary_loss_mlp": 0.00222733, + "balance_loss_clip": 1.03103185, + "balance_loss_mlp": 0.19644764, + "epoch": 0.9542762663460094, + "flos": 31065818359680.0, + "grad_norm": 58.441071624515274, + "language_loss": 0.67652518, + "learning_rate": 2.1862701955258634e-08, + "loss": 0.69124508, + "num_input_tokens_seen": 342354730, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.26293945, + "step": 15872, + "time_per_iteration": 2.7740161418914795 + }, + { + "auxiliary_loss_clip": 0.01251181, + "auxiliary_loss_mlp": 0.00246494, + "balance_loss_clip": 1.03285646, + "balance_loss_mlp": 0.22036333, + "epoch": 0.9543363895986773, + "flos": 20776729374720.0, + "grad_norm": 5.141404940545341, + "language_loss": 0.80183136, + "learning_rate": 2.1805311084567514e-08, + "loss": 0.8168081, + "num_input_tokens_seen": 342374565, + "router_z_loss_clip": 2.18457031, + "router_z_loss_mlp": 0.26171875, + "step": 15873, + "time_per_iteration": 2.7065377235412598 + }, + { + "auxiliary_loss_clip": 0.0123988, + "auxiliary_loss_mlp": 0.0023038, + "balance_loss_clip": 1.02119684, + "balance_loss_mlp": 0.20566808, + "epoch": 0.9543965128513453, + "flos": 24462744163200.0, + "grad_norm": 93.19581214879203, + "language_loss": 0.71859574, + "learning_rate": 2.1747995227217265e-08, + "loss": 0.7332983, + "num_input_tokens_seen": 342394590, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.24731445, + "step": 15874, + "time_per_iteration": 2.719271659851074 + }, + { + "auxiliary_loss_clip": 0.01220718, + "auxiliary_loss_mlp": 0.00193123, + "balance_loss_clip": 1.01087427, + "balance_loss_mlp": 0.16966246, + "epoch": 0.9544566361040132, + "flos": 15261532439040.0, + "grad_norm": 26.911968920757936, + "language_loss": 0.96972775, + "learning_rate": 2.169075438538104e-08, + "loss": 0.98386621, + "num_input_tokens_seen": 342410445, + "router_z_loss_clip": 2.09667969, + "router_z_loss_mlp": 0.23486328, + "step": 15875, + "time_per_iteration": 2.679400682449341 + }, + { + "auxiliary_loss_clip": 0.01244825, + "auxiliary_loss_mlp": 0.0021989, + "balance_loss_clip": 1.02358747, + "balance_loss_mlp": 0.19502321, + "epoch": 0.9545167593566812, + "flos": 25918777872000.0, + "grad_norm": 25.495137398754864, + "language_loss": 0.73820359, + "learning_rate": 2.1633588561229765e-08, + "loss": 0.75285077, + "num_input_tokens_seen": 342430970, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.24865723, + "step": 15876, + "time_per_iteration": 2.705110549926758 + }, + { + "auxiliary_loss_clip": 0.01271853, + "auxiliary_loss_mlp": 0.00236146, + "balance_loss_clip": 1.04285049, + "balance_loss_mlp": 0.20919245, + "epoch": 0.9545768826093491, + "flos": 25628188844160.0, + "grad_norm": 20.488919263616246, + "language_loss": 0.80516779, + "learning_rate": 2.1576497756931267e-08, + "loss": 0.82024777, + "num_input_tokens_seen": 342449505, + "router_z_loss_clip": 2.29101562, + "router_z_loss_mlp": 0.26940918, + "step": 15877, + "time_per_iteration": 2.7232794761657715 + }, + { + "auxiliary_loss_clip": 0.01261815, + "auxiliary_loss_mlp": 0.0024166, + "balance_loss_clip": 1.03770602, + "balance_loss_mlp": 0.21408646, + "epoch": 0.9546370058620172, + "flos": 22491499726080.0, + "grad_norm": 3.2663031324487535, + "language_loss": 0.79137534, + "learning_rate": 2.1519481974650035e-08, + "loss": 0.80641007, + "num_input_tokens_seen": 342470390, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.27612305, + "step": 15878, + "time_per_iteration": 2.666131019592285 + }, + { + "auxiliary_loss_clip": 0.01235739, + "auxiliary_loss_mlp": 0.00211912, + "balance_loss_clip": 1.0204823, + "balance_loss_mlp": 0.18704453, + "epoch": 0.9546971291146851, + "flos": 24609582961920.0, + "grad_norm": 38.054938263771966, + "language_loss": 0.74891102, + "learning_rate": 2.1462541216548335e-08, + "loss": 0.76338756, + "num_input_tokens_seen": 342492560, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.24890137, + "step": 15879, + "time_per_iteration": 2.737482786178589 + }, + { + "auxiliary_loss_clip": 0.01223149, + "auxiliary_loss_mlp": 0.00208683, + "balance_loss_clip": 1.01309276, + "balance_loss_mlp": 0.18524694, + "epoch": 0.9547572523673531, + "flos": 28657756627200.0, + "grad_norm": 45.26124743866386, + "language_loss": 0.92222697, + "learning_rate": 2.1405675484785334e-08, + "loss": 0.93654525, + "num_input_tokens_seen": 342512315, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.234375, + "step": 15880, + "time_per_iteration": 2.7568304538726807 + }, + { + "auxiliary_loss_clip": 0.01219021, + "auxiliary_loss_mlp": 0.00205189, + "balance_loss_clip": 1.0048039, + "balance_loss_mlp": 0.18029803, + "epoch": 0.954817375620021, + "flos": 33802606385280.0, + "grad_norm": 4.5791957522684825, + "language_loss": 0.82277012, + "learning_rate": 2.134888478151753e-08, + "loss": 0.83701217, + "num_input_tokens_seen": 342533060, + "router_z_loss_clip": 2.14355469, + "router_z_loss_mlp": 0.24890137, + "step": 15881, + "time_per_iteration": 2.737898826599121 + }, + { + "auxiliary_loss_clip": 0.01241171, + "auxiliary_loss_mlp": 0.00212549, + "balance_loss_clip": 1.02163482, + "balance_loss_mlp": 0.18750308, + "epoch": 0.954877498872689, + "flos": 14428225843200.0, + "grad_norm": 6.658612232628167, + "language_loss": 0.79098082, + "learning_rate": 2.1292169108898083e-08, + "loss": 0.80551809, + "num_input_tokens_seen": 342550830, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.25048828, + "step": 15882, + "time_per_iteration": 2.639922618865967 + }, + { + "auxiliary_loss_clip": 0.01252194, + "auxiliary_loss_mlp": 0.0021495, + "balance_loss_clip": 1.03489816, + "balance_loss_mlp": 0.19045229, + "epoch": 0.9549376221253569, + "flos": 59269447336320.0, + "grad_norm": 12.996853146148107, + "language_loss": 0.7463342, + "learning_rate": 2.1235528469078168e-08, + "loss": 0.76100564, + "num_input_tokens_seen": 342575070, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.24487305, + "step": 15883, + "time_per_iteration": 3.019486665725708 + }, + { + "auxiliary_loss_clip": 0.01265475, + "auxiliary_loss_mlp": 0.00255203, + "balance_loss_clip": 1.04186869, + "balance_loss_mlp": 0.22826147, + "epoch": 0.954997745378025, + "flos": 17274397760640.0, + "grad_norm": 9.83511828139019, + "language_loss": 0.87885416, + "learning_rate": 2.1178962864205175e-08, + "loss": 0.89406085, + "num_input_tokens_seen": 342592215, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.26940918, + "step": 15884, + "time_per_iteration": 4.1777870655059814 + }, + { + "auxiliary_loss_clip": 0.0126492, + "auxiliary_loss_mlp": 0.00241198, + "balance_loss_clip": 1.03959882, + "balance_loss_mlp": 0.21487637, + "epoch": 0.955057868630693, + "flos": 13006378903680.0, + "grad_norm": 21.07892828914089, + "language_loss": 0.85587341, + "learning_rate": 2.1122472296424054e-08, + "loss": 0.87093461, + "num_input_tokens_seen": 342610030, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.26330566, + "step": 15885, + "time_per_iteration": 2.7532761096954346 + }, + { + "auxiliary_loss_clip": 0.01259575, + "auxiliary_loss_mlp": 0.00231756, + "balance_loss_clip": 1.03556466, + "balance_loss_mlp": 0.20542276, + "epoch": 0.9551179918833609, + "flos": 22637692080000.0, + "grad_norm": 6.197469903524882, + "language_loss": 0.79985321, + "learning_rate": 2.1066056767877317e-08, + "loss": 0.81476653, + "num_input_tokens_seen": 342626475, + "router_z_loss_clip": 2.24121094, + "router_z_loss_mlp": 0.26306152, + "step": 15886, + "time_per_iteration": 4.0955681800842285 + }, + { + "auxiliary_loss_clip": 0.01256602, + "auxiliary_loss_mlp": 0.00237615, + "balance_loss_clip": 1.03632498, + "balance_loss_mlp": 0.21167536, + "epoch": 0.9551781151360289, + "flos": 21542811667200.0, + "grad_norm": 21.99549910262301, + "language_loss": 0.82715786, + "learning_rate": 2.1009716280703916e-08, + "loss": 0.84209996, + "num_input_tokens_seen": 342646645, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.25939941, + "step": 15887, + "time_per_iteration": 2.696613311767578 + }, + { + "auxiliary_loss_clip": 0.01224235, + "auxiliary_loss_mlp": 0.0021052, + "balance_loss_clip": 1.01441276, + "balance_loss_mlp": 0.18813211, + "epoch": 0.9552382383886968, + "flos": 20702250524160.0, + "grad_norm": 476.64486757951545, + "language_loss": 0.66141391, + "learning_rate": 2.0953450837040364e-08, + "loss": 0.67576146, + "num_input_tokens_seen": 342663615, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.22387695, + "step": 15888, + "time_per_iteration": 2.682880401611328 + }, + { + "auxiliary_loss_clip": 0.01081951, + "auxiliary_loss_mlp": 0.00084177, + "balance_loss_clip": 0.94551998, + "balance_loss_mlp": 0.07716745, + "epoch": 0.9552983616413648, + "flos": 67769792887680.0, + "grad_norm": 0.6991587068506511, + "language_loss": 0.57269555, + "learning_rate": 2.0897260439020514e-08, + "loss": 0.58435684, + "num_input_tokens_seen": 342728275, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.0703125, + "step": 15889, + "time_per_iteration": 3.2431600093841553 + }, + { + "auxiliary_loss_clip": 0.01247425, + "auxiliary_loss_mlp": 0.00245909, + "balance_loss_clip": 1.0262599, + "balance_loss_mlp": 0.21980263, + "epoch": 0.9553584848940327, + "flos": 21579979265280.0, + "grad_norm": 8.039679440392607, + "language_loss": 0.74330539, + "learning_rate": 2.084114508877466e-08, + "loss": 0.75823873, + "num_input_tokens_seen": 342748860, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.26123047, + "step": 15890, + "time_per_iteration": 2.729304313659668 + }, + { + "auxiliary_loss_clip": 0.01235052, + "auxiliary_loss_mlp": 0.0020429, + "balance_loss_clip": 1.02020967, + "balance_loss_mlp": 0.18125868, + "epoch": 0.9554186081467008, + "flos": 24208173498240.0, + "grad_norm": 20.985417470580906, + "language_loss": 0.81045008, + "learning_rate": 2.0785104788430874e-08, + "loss": 0.82484353, + "num_input_tokens_seen": 342769705, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.23022461, + "step": 15891, + "time_per_iteration": 2.7491800785064697 + }, + { + "auxiliary_loss_clip": 0.01234446, + "auxiliary_loss_mlp": 0.00188142, + "balance_loss_clip": 1.02403879, + "balance_loss_mlp": 0.16515833, + "epoch": 0.9554787313993687, + "flos": 16251554073600.0, + "grad_norm": 28.090973489665902, + "language_loss": 0.8379274, + "learning_rate": 2.072913954011435e-08, + "loss": 0.8521533, + "num_input_tokens_seen": 342787000, + "router_z_loss_clip": 2.10644531, + "router_z_loss_mlp": 0.2298584, + "step": 15892, + "time_per_iteration": 2.7048635482788086 + }, + { + "auxiliary_loss_clip": 0.01239115, + "auxiliary_loss_mlp": 0.00238278, + "balance_loss_clip": 1.02312851, + "balance_loss_mlp": 0.21264753, + "epoch": 0.9555388546520367, + "flos": 23404133508480.0, + "grad_norm": 14.104615642358187, + "language_loss": 0.77913976, + "learning_rate": 2.0673249345947386e-08, + "loss": 0.7939136, + "num_input_tokens_seen": 342807795, + "router_z_loss_clip": 2.16113281, + "router_z_loss_mlp": 0.25646973, + "step": 15893, + "time_per_iteration": 2.7344796657562256 + }, + { + "auxiliary_loss_clip": 0.01236045, + "auxiliary_loss_mlp": 0.00224489, + "balance_loss_clip": 1.02442586, + "balance_loss_mlp": 0.19900244, + "epoch": 0.9555989779047046, + "flos": 14794047907200.0, + "grad_norm": 12.166697816160342, + "language_loss": 0.74041295, + "learning_rate": 2.0617434208048955e-08, + "loss": 0.75501829, + "num_input_tokens_seen": 342825490, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.25476074, + "step": 15894, + "time_per_iteration": 4.1294496059417725 + }, + { + "auxiliary_loss_clip": 0.01243219, + "auxiliary_loss_mlp": 0.00233433, + "balance_loss_clip": 1.02385354, + "balance_loss_mlp": 0.20633629, + "epoch": 0.9556591011573726, + "flos": 22236749493120.0, + "grad_norm": 12.616301325545763, + "language_loss": 0.89595377, + "learning_rate": 2.056169412853581e-08, + "loss": 0.91072029, + "num_input_tokens_seen": 342844965, + "router_z_loss_clip": 2.19238281, + "router_z_loss_mlp": 0.27075195, + "step": 15895, + "time_per_iteration": 2.766310691833496 + }, + { + "auxiliary_loss_clip": 0.01250457, + "auxiliary_loss_mlp": 0.00218546, + "balance_loss_clip": 1.03313613, + "balance_loss_mlp": 0.19457263, + "epoch": 0.9557192244100405, + "flos": 27855296835840.0, + "grad_norm": 10.920498968007214, + "language_loss": 0.78938699, + "learning_rate": 2.0506029109521593e-08, + "loss": 0.80407703, + "num_input_tokens_seen": 342865915, + "router_z_loss_clip": 2.17480469, + "router_z_loss_mlp": 0.23986816, + "step": 15896, + "time_per_iteration": 4.172297477722168 + }, + { + "auxiliary_loss_clip": 0.01233666, + "auxiliary_loss_mlp": 0.0023288, + "balance_loss_clip": 1.01953185, + "balance_loss_mlp": 0.20875227, + "epoch": 0.9557793476627086, + "flos": 17602800831360.0, + "grad_norm": 65.119240543668, + "language_loss": 0.86861992, + "learning_rate": 2.045043915311706e-08, + "loss": 0.8832854, + "num_input_tokens_seen": 342884000, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.24145508, + "step": 15897, + "time_per_iteration": 2.641435146331787 + }, + { + "auxiliary_loss_clip": 0.01246921, + "auxiliary_loss_mlp": 0.00229887, + "balance_loss_clip": 1.02592659, + "balance_loss_mlp": 0.20538951, + "epoch": 0.9558394709153766, + "flos": 23875496709120.0, + "grad_norm": 6.7600612071742106, + "language_loss": 0.79458272, + "learning_rate": 2.03949242614303e-08, + "loss": 0.80935079, + "num_input_tokens_seen": 342903095, + "router_z_loss_clip": 2.21191406, + "router_z_loss_mlp": 0.24487305, + "step": 15898, + "time_per_iteration": 2.6880149841308594 + }, + { + "auxiliary_loss_clip": 0.01084355, + "auxiliary_loss_mlp": 0.00059335, + "balance_loss_clip": 0.94837749, + "balance_loss_mlp": 0.05289748, + "epoch": 0.9558995941680445, + "flos": 53682001171200.0, + "grad_norm": 0.8166010222731609, + "language_loss": 0.51703405, + "learning_rate": 2.033948443656652e-08, + "loss": 0.52847099, + "num_input_tokens_seen": 342958155, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.06445312, + "step": 15899, + "time_per_iteration": 3.111462354660034 + }, + { + "auxiliary_loss_clip": 0.01286395, + "auxiliary_loss_mlp": 0.0022905, + "balance_loss_clip": 1.05215681, + "balance_loss_mlp": 0.20177488, + "epoch": 0.9559597174207125, + "flos": 13764488376960.0, + "grad_norm": 35.826620685942984, + "language_loss": 0.80884004, + "learning_rate": 2.028411968062782e-08, + "loss": 0.82399452, + "num_input_tokens_seen": 342972500, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.27282715, + "step": 15900, + "time_per_iteration": 2.6648924350738525 + }, + { + "auxiliary_loss_clip": 0.01245139, + "auxiliary_loss_mlp": 0.0024248, + "balance_loss_clip": 1.02468693, + "balance_loss_mlp": 0.21599191, + "epoch": 0.9560198406733804, + "flos": 19936347799680.0, + "grad_norm": 270.2633841270569, + "language_loss": 0.90506506, + "learning_rate": 2.0228829995713627e-08, + "loss": 0.91994119, + "num_input_tokens_seen": 342989035, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.26477051, + "step": 15901, + "time_per_iteration": 2.6180145740509033 + }, + { + "auxiliary_loss_clip": 0.01086872, + "auxiliary_loss_mlp": 0.00098638, + "balance_loss_clip": 0.95079273, + "balance_loss_mlp": 0.09086575, + "epoch": 0.9560799639260484, + "flos": 57289550699520.0, + "grad_norm": 0.6986916228166169, + "language_loss": 0.5348382, + "learning_rate": 2.0173615383920485e-08, + "loss": 0.54669333, + "num_input_tokens_seen": 343051675, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.07763672, + "step": 15902, + "time_per_iteration": 3.218428611755371 + }, + { + "auxiliary_loss_clip": 0.01223441, + "auxiliary_loss_mlp": 0.0022353, + "balance_loss_clip": 1.01533866, + "balance_loss_mlp": 0.20036739, + "epoch": 0.9561400871787163, + "flos": 18917167299840.0, + "grad_norm": 97.16681958650014, + "language_loss": 0.91564459, + "learning_rate": 2.01184758473425e-08, + "loss": 0.93011433, + "num_input_tokens_seen": 343068895, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.23144531, + "step": 15903, + "time_per_iteration": 2.730149984359741 + }, + { + "auxiliary_loss_clip": 0.01228674, + "auxiliary_loss_mlp": 0.00213939, + "balance_loss_clip": 1.01597691, + "balance_loss_mlp": 0.18945304, + "epoch": 0.9562002104313844, + "flos": 18038576632320.0, + "grad_norm": 83.76818372810664, + "language_loss": 0.88065505, + "learning_rate": 2.0063411388070217e-08, + "loss": 0.89508116, + "num_input_tokens_seen": 343087115, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.24475098, + "step": 15904, + "time_per_iteration": 2.659135580062866 + }, + { + "auxiliary_loss_clip": 0.01244994, + "auxiliary_loss_mlp": 0.00239197, + "balance_loss_clip": 1.026245, + "balance_loss_mlp": 0.21376985, + "epoch": 0.9562603336840523, + "flos": 24717673964160.0, + "grad_norm": 14.650404858455657, + "language_loss": 0.70043761, + "learning_rate": 2.0008422008191972e-08, + "loss": 0.71527958, + "num_input_tokens_seen": 343105575, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.25402832, + "step": 15905, + "time_per_iteration": 2.723362684249878 + }, + { + "auxiliary_loss_clip": 0.01222498, + "auxiliary_loss_mlp": 0.00212819, + "balance_loss_clip": 1.01297975, + "balance_loss_mlp": 0.18964455, + "epoch": 0.9563204569367203, + "flos": 21177205084800.0, + "grad_norm": 434.7777526391976, + "language_loss": 0.78855145, + "learning_rate": 1.995350770979254e-08, + "loss": 0.80290455, + "num_input_tokens_seen": 343123025, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.23193359, + "step": 15906, + "time_per_iteration": 2.6490638256073 + }, + { + "auxiliary_loss_clip": 0.01244364, + "auxiliary_loss_mlp": 0.00229822, + "balance_loss_clip": 1.02605832, + "balance_loss_mlp": 0.20562238, + "epoch": 0.9563805801893882, + "flos": 20229738088320.0, + "grad_norm": 44.1766920936188, + "language_loss": 0.80780828, + "learning_rate": 1.9898668494954473e-08, + "loss": 0.82255018, + "num_input_tokens_seen": 343141625, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.24182129, + "step": 15907, + "time_per_iteration": 2.686408042907715 + }, + { + "auxiliary_loss_clip": 0.01224997, + "auxiliary_loss_mlp": 0.00201581, + "balance_loss_clip": 1.0126971, + "balance_loss_mlp": 0.17835921, + "epoch": 0.9564407034420562, + "flos": 25411001258880.0, + "grad_norm": 7.123229354793934, + "language_loss": 0.80737293, + "learning_rate": 1.9843904365757447e-08, + "loss": 0.8216387, + "num_input_tokens_seen": 343161300, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.23217773, + "step": 15908, + "time_per_iteration": 2.6864986419677734 + }, + { + "auxiliary_loss_clip": 0.01244838, + "auxiliary_loss_mlp": 0.00207321, + "balance_loss_clip": 1.02783036, + "balance_loss_mlp": 0.18139294, + "epoch": 0.9565008266947241, + "flos": 18623884752000.0, + "grad_norm": 3.9155384182783446, + "language_loss": 0.90314496, + "learning_rate": 1.978921532427802e-08, + "loss": 0.91766655, + "num_input_tokens_seen": 343177815, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.25964355, + "step": 15909, + "time_per_iteration": 2.673283576965332 + }, + { + "auxiliary_loss_clip": 0.01227449, + "auxiliary_loss_mlp": 0.00237502, + "balance_loss_clip": 1.01648045, + "balance_loss_mlp": 0.21321906, + "epoch": 0.9565609499473922, + "flos": 24862142465280.0, + "grad_norm": 823.1594420849292, + "language_loss": 0.76585096, + "learning_rate": 1.9734601372590086e-08, + "loss": 0.78050041, + "num_input_tokens_seen": 343198140, + "router_z_loss_clip": 2.11035156, + "router_z_loss_mlp": 0.24304199, + "step": 15910, + "time_per_iteration": 2.6603896617889404 + }, + { + "auxiliary_loss_clip": 0.0125583, + "auxiliary_loss_mlp": 0.00231482, + "balance_loss_clip": 1.03454518, + "balance_loss_mlp": 0.20598298, + "epoch": 0.9566210732000601, + "flos": 21798459740160.0, + "grad_norm": 11.98535381646701, + "language_loss": 0.83550346, + "learning_rate": 1.968006251276444e-08, + "loss": 0.85037655, + "num_input_tokens_seen": 343218280, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.25476074, + "step": 15911, + "time_per_iteration": 2.673964500427246 + }, + { + "auxiliary_loss_clip": 0.01230797, + "auxiliary_loss_mlp": 0.00233122, + "balance_loss_clip": 1.019081, + "balance_loss_mlp": 0.20829055, + "epoch": 0.9566811964527281, + "flos": 18697609416960.0, + "grad_norm": 65.8967945486838, + "language_loss": 0.77356827, + "learning_rate": 1.9625598746869198e-08, + "loss": 0.78820753, + "num_input_tokens_seen": 343236850, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.24841309, + "step": 15912, + "time_per_iteration": 2.7108404636383057 + }, + { + "auxiliary_loss_clip": 0.01272181, + "auxiliary_loss_mlp": 0.00220251, + "balance_loss_clip": 1.04558802, + "balance_loss_mlp": 0.19416848, + "epoch": 0.9567413197053961, + "flos": 13000632727680.0, + "grad_norm": 14.548586147972852, + "language_loss": 0.82852709, + "learning_rate": 1.95712100769696e-08, + "loss": 0.84345144, + "num_input_tokens_seen": 343253065, + "router_z_loss_clip": 2.26464844, + "router_z_loss_mlp": 0.26098633, + "step": 15913, + "time_per_iteration": 2.8356926441192627 + }, + { + "auxiliary_loss_clip": 0.01235255, + "auxiliary_loss_mlp": 0.00214964, + "balance_loss_clip": 1.02257252, + "balance_loss_mlp": 0.19242191, + "epoch": 0.956801442958064, + "flos": 19719267955200.0, + "grad_norm": 58.905304445457475, + "language_loss": 0.81291282, + "learning_rate": 1.9516896505128444e-08, + "loss": 0.82741499, + "num_input_tokens_seen": 343270330, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.22521973, + "step": 15914, + "time_per_iteration": 2.8147265911102295 + }, + { + "auxiliary_loss_clip": 0.01241895, + "auxiliary_loss_mlp": 0.0022546, + "balance_loss_clip": 1.02374911, + "balance_loss_mlp": 0.20024735, + "epoch": 0.956861566210732, + "flos": 18222834424320.0, + "grad_norm": 12.858078953240724, + "language_loss": 0.7499243, + "learning_rate": 1.9462658033404965e-08, + "loss": 0.76459789, + "num_input_tokens_seen": 343289625, + "router_z_loss_clip": 2.18066406, + "router_z_loss_mlp": 0.25183105, + "step": 15915, + "time_per_iteration": 2.743666887283325 + }, + { + "auxiliary_loss_clip": 0.0121754, + "auxiliary_loss_mlp": 0.00221154, + "balance_loss_clip": 1.01005113, + "balance_loss_mlp": 0.1992196, + "epoch": 0.9569216894634, + "flos": 22196960202240.0, + "grad_norm": 11.794568840850472, + "language_loss": 0.722453, + "learning_rate": 1.9408494663855967e-08, + "loss": 0.73683995, + "num_input_tokens_seen": 343309200, + "router_z_loss_clip": 2.07421875, + "router_z_loss_mlp": 0.21948242, + "step": 15916, + "time_per_iteration": 2.768366813659668 + }, + { + "auxiliary_loss_clip": 0.01237621, + "auxiliary_loss_mlp": 0.00214565, + "balance_loss_clip": 1.02704382, + "balance_loss_mlp": 0.19105729, + "epoch": 0.956981812716068, + "flos": 21689291329920.0, + "grad_norm": 28.93940060099328, + "language_loss": 0.87089527, + "learning_rate": 1.935440639853536e-08, + "loss": 0.8854171, + "num_input_tokens_seen": 343326270, + "router_z_loss_clip": 2.10351562, + "router_z_loss_mlp": 0.23522949, + "step": 15917, + "time_per_iteration": 2.7304859161376953 + }, + { + "auxiliary_loss_clip": 0.01234627, + "auxiliary_loss_mlp": 0.00204015, + "balance_loss_clip": 1.01789427, + "balance_loss_mlp": 0.17814687, + "epoch": 0.9570419359687359, + "flos": 13990905757440.0, + "grad_norm": 4.944997785652027, + "language_loss": 0.83538622, + "learning_rate": 1.9300393239494172e-08, + "loss": 0.84977269, + "num_input_tokens_seen": 343344430, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.25891113, + "step": 15918, + "time_per_iteration": 2.703498363494873 + }, + { + "auxiliary_loss_clip": 0.01075093, + "auxiliary_loss_mlp": 0.00056329, + "balance_loss_clip": 0.93959868, + "balance_loss_mlp": 0.04960566, + "epoch": 0.9571020592214039, + "flos": 65196938534400.0, + "grad_norm": 0.6864530837422297, + "language_loss": 0.52609986, + "learning_rate": 1.924645518878032e-08, + "loss": 0.53741407, + "num_input_tokens_seen": 343416155, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.06738281, + "step": 15919, + "time_per_iteration": 3.250814914703369 + }, + { + "auxiliary_loss_clip": 0.01260112, + "auxiliary_loss_mlp": 0.00250543, + "balance_loss_clip": 1.03600049, + "balance_loss_mlp": 0.22355425, + "epoch": 0.9571621824740718, + "flos": 17384068961280.0, + "grad_norm": 11.674950824952175, + "language_loss": 0.88344324, + "learning_rate": 1.919259224843972e-08, + "loss": 0.8985498, + "num_input_tokens_seen": 343431715, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.26989746, + "step": 15920, + "time_per_iteration": 2.7001428604125977 + }, + { + "auxiliary_loss_clip": 0.01247687, + "auxiliary_loss_mlp": 0.0021763, + "balance_loss_clip": 1.02853179, + "balance_loss_mlp": 0.19248873, + "epoch": 0.9572223057267398, + "flos": 14538184352640.0, + "grad_norm": 62.815352327567325, + "language_loss": 0.89102972, + "learning_rate": 1.9138804420514298e-08, + "loss": 0.90568286, + "num_input_tokens_seen": 343450425, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.25170898, + "step": 15921, + "time_per_iteration": 2.6014506816864014 + }, + { + "auxiliary_loss_clip": 0.0126305, + "auxiliary_loss_mlp": 0.00250177, + "balance_loss_clip": 1.03516173, + "balance_loss_mlp": 0.22333059, + "epoch": 0.9572824289794077, + "flos": 33947793158400.0, + "grad_norm": 3.5020670111838963, + "language_loss": 0.6195761, + "learning_rate": 1.9085091707044197e-08, + "loss": 0.6347084, + "num_input_tokens_seen": 343470445, + "router_z_loss_clip": 2.27832031, + "router_z_loss_mlp": 0.26867676, + "step": 15922, + "time_per_iteration": 2.7867624759674072 + }, + { + "auxiliary_loss_clip": 0.01232018, + "auxiliary_loss_mlp": 0.00228065, + "balance_loss_clip": 1.01958013, + "balance_loss_mlp": 0.20338902, + "epoch": 0.9573425522320758, + "flos": 18694915896960.0, + "grad_norm": 221.7724046475925, + "language_loss": 0.89966863, + "learning_rate": 1.903145411006557e-08, + "loss": 0.91426945, + "num_input_tokens_seen": 343485200, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.2467041, + "step": 15923, + "time_per_iteration": 2.6207916736602783 + }, + { + "auxiliary_loss_clip": 0.01237178, + "auxiliary_loss_mlp": 0.00225679, + "balance_loss_clip": 1.02546835, + "balance_loss_mlp": 0.20102663, + "epoch": 0.9574026754847437, + "flos": 28510307297280.0, + "grad_norm": 3.0565194644960925, + "language_loss": 0.83020705, + "learning_rate": 1.8977891631613008e-08, + "loss": 0.84483564, + "num_input_tokens_seen": 343505080, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.24658203, + "step": 15924, + "time_per_iteration": 2.722047805786133 + }, + { + "auxiliary_loss_clip": 0.01240791, + "auxiliary_loss_mlp": 0.00208001, + "balance_loss_clip": 1.02231252, + "balance_loss_mlp": 0.18347955, + "epoch": 0.9574627987374117, + "flos": 24352390604160.0, + "grad_norm": 10.07935776101723, + "language_loss": 0.93399519, + "learning_rate": 1.892440427371711e-08, + "loss": 0.94848311, + "num_input_tokens_seen": 343523995, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.24523926, + "step": 15925, + "time_per_iteration": 2.8003714084625244 + }, + { + "auxiliary_loss_clip": 0.01239344, + "auxiliary_loss_mlp": 0.00230738, + "balance_loss_clip": 1.02159047, + "balance_loss_mlp": 0.2069913, + "epoch": 0.9575229219900797, + "flos": 23510680225920.0, + "grad_norm": 18.498261909115836, + "language_loss": 0.85128474, + "learning_rate": 1.8870992038406474e-08, + "loss": 0.86598551, + "num_input_tokens_seen": 343542015, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.23718262, + "step": 15926, + "time_per_iteration": 4.083993434906006 + }, + { + "auxiliary_loss_clip": 0.01218041, + "auxiliary_loss_mlp": 0.00223916, + "balance_loss_clip": 1.01163673, + "balance_loss_mlp": 0.20132574, + "epoch": 0.9575830452427476, + "flos": 22674823764480.0, + "grad_norm": 9.913960155852877, + "language_loss": 0.85872042, + "learning_rate": 1.8817654927706373e-08, + "loss": 0.87313998, + "num_input_tokens_seen": 343561680, + "router_z_loss_clip": 2.06347656, + "router_z_loss_mlp": 0.22595215, + "step": 15927, + "time_per_iteration": 2.6955087184906006 + }, + { + "auxiliary_loss_clip": 0.01244503, + "auxiliary_loss_mlp": 0.00208445, + "balance_loss_clip": 1.02574754, + "balance_loss_mlp": 0.1826123, + "epoch": 0.9576431684954156, + "flos": 30485250835200.0, + "grad_norm": 6.365563804515498, + "language_loss": 0.79112267, + "learning_rate": 1.8764392943639183e-08, + "loss": 0.80565214, + "num_input_tokens_seen": 343585290, + "router_z_loss_clip": 2.18847656, + "router_z_loss_mlp": 0.25830078, + "step": 15928, + "time_per_iteration": 4.2189271450042725 + }, + { + "auxiliary_loss_clip": 0.01257034, + "auxiliary_loss_mlp": 0.00244113, + "balance_loss_clip": 1.03594422, + "balance_loss_mlp": 0.21927023, + "epoch": 0.9577032917480836, + "flos": 21687387909120.0, + "grad_norm": 57.161611549720675, + "language_loss": 0.89095914, + "learning_rate": 1.871120608822485e-08, + "loss": 0.90597057, + "num_input_tokens_seen": 343604045, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.24853516, + "step": 15929, + "time_per_iteration": 2.695303201675415 + }, + { + "auxiliary_loss_clip": 0.012664, + "auxiliary_loss_mlp": 0.00234275, + "balance_loss_clip": 1.03987908, + "balance_loss_mlp": 0.20770347, + "epoch": 0.9577634150007516, + "flos": 29023147728000.0, + "grad_norm": 10.340216679243799, + "language_loss": 0.79556477, + "learning_rate": 1.8658094363480202e-08, + "loss": 0.81057143, + "num_input_tokens_seen": 343626595, + "router_z_loss_clip": 2.26269531, + "router_z_loss_mlp": 0.26574707, + "step": 15930, + "time_per_iteration": 2.76374888420105 + }, + { + "auxiliary_loss_clip": 0.0123089, + "auxiliary_loss_mlp": 0.00217518, + "balance_loss_clip": 1.01788306, + "balance_loss_mlp": 0.19443919, + "epoch": 0.9578235382534195, + "flos": 19282235178240.0, + "grad_norm": 6.731238866114594, + "language_loss": 0.70711613, + "learning_rate": 1.8605057771419185e-08, + "loss": 0.72160023, + "num_input_tokens_seen": 343646195, + "router_z_loss_clip": 2.12988281, + "router_z_loss_mlp": 0.23083496, + "step": 15931, + "time_per_iteration": 2.640101432800293 + }, + { + "auxiliary_loss_clip": 0.01233203, + "auxiliary_loss_mlp": 0.00216386, + "balance_loss_clip": 1.02476335, + "balance_loss_mlp": 0.19327137, + "epoch": 0.9578836615060875, + "flos": 13699275235200.0, + "grad_norm": 6.34569102565622, + "language_loss": 0.78714442, + "learning_rate": 1.8552096314052633e-08, + "loss": 0.80164033, + "num_input_tokens_seen": 343663665, + "router_z_loss_clip": 2.08691406, + "router_z_loss_mlp": 0.23132324, + "step": 15932, + "time_per_iteration": 2.675977945327759 + }, + { + "auxiliary_loss_clip": 0.0124068, + "auxiliary_loss_mlp": 0.00217293, + "balance_loss_clip": 1.02264941, + "balance_loss_mlp": 0.19217509, + "epoch": 0.9579437847587554, + "flos": 17054516655360.0, + "grad_norm": 5.062825875338505, + "language_loss": 0.83797127, + "learning_rate": 1.849920999338961e-08, + "loss": 0.85255098, + "num_input_tokens_seen": 343682145, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.2512207, + "step": 15933, + "time_per_iteration": 2.6474924087524414 + }, + { + "auxiliary_loss_clip": 0.01085492, + "auxiliary_loss_mlp": 0.00091758, + "balance_loss_clip": 0.9460727, + "balance_loss_mlp": 0.08532095, + "epoch": 0.9580039080114234, + "flos": 60570887886720.0, + "grad_norm": 0.771698629647379, + "language_loss": 0.56679499, + "learning_rate": 1.8446398811434948e-08, + "loss": 0.5785675, + "num_input_tokens_seen": 343744685, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.06445312, + "step": 15934, + "time_per_iteration": 3.2700588703155518 + }, + { + "auxiliary_loss_clip": 0.01085803, + "auxiliary_loss_mlp": 0.00076792, + "balance_loss_clip": 0.94735265, + "balance_loss_mlp": 0.06997342, + "epoch": 0.9580640312640913, + "flos": 66235365745920.0, + "grad_norm": 0.9231450210099374, + "language_loss": 0.65114504, + "learning_rate": 1.8393662770191277e-08, + "loss": 0.66277099, + "num_input_tokens_seen": 343801835, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.06835938, + "step": 15935, + "time_per_iteration": 3.1240625381469727 + }, + { + "auxiliary_loss_clip": 0.01084516, + "auxiliary_loss_mlp": 0.00077481, + "balance_loss_clip": 0.94810927, + "balance_loss_mlp": 0.07061409, + "epoch": 0.9581241545167594, + "flos": 62218002971520.0, + "grad_norm": 1.2682911897164928, + "language_loss": 0.56021047, + "learning_rate": 1.8341001871658546e-08, + "loss": 0.57183039, + "num_input_tokens_seen": 343861515, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.06884766, + "step": 15936, + "time_per_iteration": 4.5569727420806885 + }, + { + "auxiliary_loss_clip": 0.01235018, + "auxiliary_loss_mlp": 0.00215317, + "balance_loss_clip": 1.02064705, + "balance_loss_mlp": 0.19002058, + "epoch": 0.9581842777694273, + "flos": 23768088065280.0, + "grad_norm": 227.98601711354974, + "language_loss": 0.84960777, + "learning_rate": 1.8288416117833825e-08, + "loss": 0.86411107, + "num_input_tokens_seen": 343881240, + "router_z_loss_clip": 2.14355469, + "router_z_loss_mlp": 0.25317383, + "step": 15937, + "time_per_iteration": 2.685835361480713 + }, + { + "auxiliary_loss_clip": 0.01235263, + "auxiliary_loss_mlp": 0.00215358, + "balance_loss_clip": 1.02149701, + "balance_loss_mlp": 0.19171843, + "epoch": 0.9582444010220953, + "flos": 21213079793280.0, + "grad_norm": 201.49208286132023, + "language_loss": 0.75102943, + "learning_rate": 1.8235905510710636e-08, + "loss": 0.76553565, + "num_input_tokens_seen": 343900885, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.23657227, + "step": 15938, + "time_per_iteration": 4.153298377990723 + }, + { + "auxiliary_loss_clip": 0.0124105, + "auxiliary_loss_mlp": 0.00208033, + "balance_loss_clip": 1.02339959, + "balance_loss_mlp": 0.18279657, + "epoch": 0.9583045242747633, + "flos": 23805147922560.0, + "grad_norm": 29.778663762207774, + "language_loss": 0.75155139, + "learning_rate": 1.8183470052280712e-08, + "loss": 0.76604223, + "num_input_tokens_seen": 343918460, + "router_z_loss_clip": 2.17675781, + "router_z_loss_mlp": 0.25256348, + "step": 15939, + "time_per_iteration": 2.6777939796447754 + }, + { + "auxiliary_loss_clip": 0.01236298, + "auxiliary_loss_mlp": 0.00212226, + "balance_loss_clip": 1.02033818, + "balance_loss_mlp": 0.1882056, + "epoch": 0.9583646475274312, + "flos": 24131468004480.0, + "grad_norm": 40.63321780662848, + "language_loss": 0.813739, + "learning_rate": 1.8131109744532025e-08, + "loss": 0.82822424, + "num_input_tokens_seen": 343938030, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.2401123, + "step": 15940, + "time_per_iteration": 2.747972011566162 + }, + { + "auxiliary_loss_clip": 0.01244301, + "auxiliary_loss_mlp": 0.00233206, + "balance_loss_clip": 1.02958333, + "balance_loss_mlp": 0.20869663, + "epoch": 0.9584247707800992, + "flos": 20886651970560.0, + "grad_norm": 5.615152832245876, + "language_loss": 0.78436553, + "learning_rate": 1.8078824589450535e-08, + "loss": 0.79914063, + "num_input_tokens_seen": 343956635, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.24499512, + "step": 15941, + "time_per_iteration": 2.691894292831421 + }, + { + "auxiliary_loss_clip": 0.01237007, + "auxiliary_loss_mlp": 0.00216618, + "balance_loss_clip": 1.01822734, + "balance_loss_mlp": 0.19092828, + "epoch": 0.9584848940327672, + "flos": 26067591918720.0, + "grad_norm": 10.932385313947638, + "language_loss": 0.80878943, + "learning_rate": 1.8026614589018442e-08, + "loss": 0.82332563, + "num_input_tokens_seen": 343976625, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.25695801, + "step": 15942, + "time_per_iteration": 2.7199442386627197 + }, + { + "auxiliary_loss_clip": 0.01263565, + "auxiliary_loss_mlp": 0.00248738, + "balance_loss_clip": 1.03975105, + "balance_loss_mlp": 0.221523, + "epoch": 0.9585450172854352, + "flos": 34492988764800.0, + "grad_norm": 12.585601489018357, + "language_loss": 0.77954161, + "learning_rate": 1.797447974521571e-08, + "loss": 0.79466462, + "num_input_tokens_seen": 343997790, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.27197266, + "step": 15943, + "time_per_iteration": 2.797304391860962 + }, + { + "auxiliary_loss_clip": 0.01241538, + "auxiliary_loss_mlp": 0.00236064, + "balance_loss_clip": 1.02659464, + "balance_loss_mlp": 0.21234107, + "epoch": 0.9586051405381031, + "flos": 23110743219840.0, + "grad_norm": 16.70943876026597, + "language_loss": 0.77835703, + "learning_rate": 1.792242006001965e-08, + "loss": 0.79313302, + "num_input_tokens_seen": 344016935, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.23730469, + "step": 15944, + "time_per_iteration": 2.6319422721862793 + }, + { + "auxiliary_loss_clip": 0.01237677, + "auxiliary_loss_mlp": 0.00219569, + "balance_loss_clip": 1.02231467, + "balance_loss_mlp": 0.1941779, + "epoch": 0.9586652637907711, + "flos": 19603994232960.0, + "grad_norm": 41.47750343514497, + "language_loss": 0.74929971, + "learning_rate": 1.7870435535403795e-08, + "loss": 0.76387215, + "num_input_tokens_seen": 344035590, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.25415039, + "step": 15945, + "time_per_iteration": 2.7991485595703125 + }, + { + "auxiliary_loss_clip": 0.01081458, + "auxiliary_loss_mlp": 0.00051716, + "balance_loss_clip": 0.9447782, + "balance_loss_mlp": 0.04577913, + "epoch": 0.958725387043439, + "flos": 72073327317120.0, + "grad_norm": 1.3892362992953164, + "language_loss": 0.60752285, + "learning_rate": 1.7818526173339678e-08, + "loss": 0.61885464, + "num_input_tokens_seen": 344100845, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.05932617, + "step": 15946, + "time_per_iteration": 3.212904930114746 + }, + { + "auxiliary_loss_clip": 0.01218549, + "auxiliary_loss_mlp": 0.00219589, + "balance_loss_clip": 1.00850153, + "balance_loss_mlp": 0.19491276, + "epoch": 0.958785510296107, + "flos": 28911932242560.0, + "grad_norm": 27.749108004146336, + "language_loss": 0.83144337, + "learning_rate": 1.7766691975795723e-08, + "loss": 0.84582472, + "num_input_tokens_seen": 344121780, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.24682617, + "step": 15947, + "time_per_iteration": 2.7232885360717773 + }, + { + "auxiliary_loss_clip": 0.01232539, + "auxiliary_loss_mlp": 0.00223633, + "balance_loss_clip": 1.01982069, + "balance_loss_mlp": 0.19836034, + "epoch": 0.958845633548775, + "flos": 18477189607680.0, + "grad_norm": 7.771897169768314, + "language_loss": 0.80773497, + "learning_rate": 1.771493294473747e-08, + "loss": 0.82229668, + "num_input_tokens_seen": 344140150, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.25292969, + "step": 15948, + "time_per_iteration": 2.6499760150909424 + }, + { + "auxiliary_loss_clip": 0.01242454, + "auxiliary_loss_mlp": 0.00219088, + "balance_loss_clip": 1.02668405, + "balance_loss_mlp": 0.19432794, + "epoch": 0.958905756801443, + "flos": 24206916522240.0, + "grad_norm": 20.473381154955653, + "language_loss": 0.87777555, + "learning_rate": 1.7663249082127574e-08, + "loss": 0.89239097, + "num_input_tokens_seen": 344158200, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.24768066, + "step": 15949, + "time_per_iteration": 2.6811866760253906 + }, + { + "auxiliary_loss_clip": 0.01256278, + "auxiliary_loss_mlp": 0.00235527, + "balance_loss_clip": 1.03542233, + "balance_loss_mlp": 0.210815, + "epoch": 0.9589658800541109, + "flos": 25007939769600.0, + "grad_norm": 57.84139942192315, + "language_loss": 0.75275302, + "learning_rate": 1.761164038992602e-08, + "loss": 0.76767105, + "num_input_tokens_seen": 344174720, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.24731445, + "step": 15950, + "time_per_iteration": 2.779010534286499 + }, + { + "auxiliary_loss_clip": 0.01237786, + "auxiliary_loss_mlp": 0.00229319, + "balance_loss_clip": 1.02109885, + "balance_loss_mlp": 0.20466611, + "epoch": 0.9590260033067789, + "flos": 23514558894720.0, + "grad_norm": 6.531131279492148, + "language_loss": 0.9166795, + "learning_rate": 1.7560106870089687e-08, + "loss": 0.93135053, + "num_input_tokens_seen": 344192580, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.24609375, + "step": 15951, + "time_per_iteration": 2.689840316772461 + }, + { + "auxiliary_loss_clip": 0.01273048, + "auxiliary_loss_mlp": 0.0023996, + "balance_loss_clip": 1.0399456, + "balance_loss_mlp": 0.21287596, + "epoch": 0.9590861265594469, + "flos": 25520349237120.0, + "grad_norm": 13.949562302174128, + "language_loss": 0.91226876, + "learning_rate": 1.7508648524572568e-08, + "loss": 0.9273988, + "num_input_tokens_seen": 344210345, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.27075195, + "step": 15952, + "time_per_iteration": 2.7003345489501953 + }, + { + "auxiliary_loss_clip": 0.01246013, + "auxiliary_loss_mlp": 0.00232762, + "balance_loss_clip": 1.02843928, + "balance_loss_mlp": 0.2077879, + "epoch": 0.9591462498121148, + "flos": 21179323987200.0, + "grad_norm": 4.644159547058579, + "language_loss": 0.76747394, + "learning_rate": 1.7457265355326434e-08, + "loss": 0.78226167, + "num_input_tokens_seen": 344229540, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.24975586, + "step": 15953, + "time_per_iteration": 2.69694185256958 + }, + { + "auxiliary_loss_clip": 0.01241264, + "auxiliary_loss_mlp": 0.00239911, + "balance_loss_clip": 1.01898241, + "balance_loss_mlp": 0.21406657, + "epoch": 0.9592063730647828, + "flos": 21723047136000.0, + "grad_norm": 32.270364495961495, + "language_loss": 0.69964105, + "learning_rate": 1.7405957364299285e-08, + "loss": 0.71445286, + "num_input_tokens_seen": 344247830, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.25842285, + "step": 15954, + "time_per_iteration": 2.7470951080322266 + }, + { + "auxiliary_loss_clip": 0.01255116, + "auxiliary_loss_mlp": 0.00240695, + "balance_loss_clip": 1.03096199, + "balance_loss_mlp": 0.21378908, + "epoch": 0.9592664963174508, + "flos": 29891395278720.0, + "grad_norm": 15.190123707692134, + "language_loss": 0.80602998, + "learning_rate": 1.7354724553437117e-08, + "loss": 0.82098812, + "num_input_tokens_seen": 344267760, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.26916504, + "step": 15955, + "time_per_iteration": 2.718113422393799 + }, + { + "auxiliary_loss_clip": 0.01226891, + "auxiliary_loss_mlp": 0.00227759, + "balance_loss_clip": 1.01027513, + "balance_loss_mlp": 0.20119914, + "epoch": 0.9593266195701188, + "flos": 17999613354240.0, + "grad_norm": 3.0765382166215622, + "language_loss": 0.71874583, + "learning_rate": 1.7303566924682378e-08, + "loss": 0.73329228, + "num_input_tokens_seen": 344284905, + "router_z_loss_clip": 2.16503906, + "router_z_loss_mlp": 0.26550293, + "step": 15956, + "time_per_iteration": 2.7358133792877197 + }, + { + "auxiliary_loss_clip": 0.01236686, + "auxiliary_loss_mlp": 0.00209601, + "balance_loss_clip": 1.01601887, + "balance_loss_mlp": 0.18288597, + "epoch": 0.9593867428227867, + "flos": 18838271076480.0, + "grad_norm": 81.37440947153556, + "language_loss": 0.69996786, + "learning_rate": 1.725248447997507e-08, + "loss": 0.71443063, + "num_input_tokens_seen": 344302025, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.26696777, + "step": 15957, + "time_per_iteration": 2.6533398628234863 + }, + { + "auxiliary_loss_clip": 0.01278504, + "auxiliary_loss_mlp": 0.00224121, + "balance_loss_clip": 1.04912722, + "balance_loss_mlp": 0.19715557, + "epoch": 0.9594468660754547, + "flos": 29567050444800.0, + "grad_norm": 75.96590726456219, + "language_loss": 0.83091795, + "learning_rate": 1.7201477221252314e-08, + "loss": 0.84594423, + "num_input_tokens_seen": 344321935, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.2701416, + "step": 15958, + "time_per_iteration": 2.7950828075408936 + }, + { + "auxiliary_loss_clip": 0.01224881, + "auxiliary_loss_mlp": 0.00219703, + "balance_loss_clip": 1.01188004, + "balance_loss_mlp": 0.19559871, + "epoch": 0.9595069893281226, + "flos": 20703256104960.0, + "grad_norm": 7.730235145322037, + "language_loss": 0.81198502, + "learning_rate": 1.7150545150448116e-08, + "loss": 0.8264308, + "num_input_tokens_seen": 344340405, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.24133301, + "step": 15959, + "time_per_iteration": 2.735825777053833 + }, + { + "auxiliary_loss_clip": 0.01241596, + "auxiliary_loss_mlp": 0.00217817, + "balance_loss_clip": 1.02264559, + "balance_loss_mlp": 0.1921279, + "epoch": 0.9595671125807906, + "flos": 22453613856000.0, + "grad_norm": 8.87321350925989, + "language_loss": 0.77466559, + "learning_rate": 1.7099688269493816e-08, + "loss": 0.78925973, + "num_input_tokens_seen": 344359925, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.25695801, + "step": 15960, + "time_per_iteration": 2.700773239135742 + }, + { + "auxiliary_loss_clip": 0.01230214, + "auxiliary_loss_mlp": 0.00231906, + "balance_loss_clip": 1.01732588, + "balance_loss_mlp": 0.20570403, + "epoch": 0.9596272358334585, + "flos": 23915214172800.0, + "grad_norm": 5.25175740173542, + "language_loss": 0.8489219, + "learning_rate": 1.7048906580318544e-08, + "loss": 0.86354315, + "num_input_tokens_seen": 344379100, + "router_z_loss_clip": 2.12988281, + "router_z_loss_mlp": 0.26196289, + "step": 15961, + "time_per_iteration": 2.686474323272705 + }, + { + "auxiliary_loss_clip": 0.01229335, + "auxiliary_loss_mlp": 0.00230954, + "balance_loss_clip": 1.01717007, + "balance_loss_mlp": 0.20794667, + "epoch": 0.9596873590861266, + "flos": 17672539086720.0, + "grad_norm": 74.53591552744773, + "language_loss": 0.83661747, + "learning_rate": 1.699820008484698e-08, + "loss": 0.85122037, + "num_input_tokens_seen": 344396895, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.23010254, + "step": 15962, + "time_per_iteration": 2.619694471359253 + }, + { + "auxiliary_loss_clip": 0.0124226, + "auxiliary_loss_mlp": 0.00266638, + "balance_loss_clip": 1.02593005, + "balance_loss_mlp": 0.23802751, + "epoch": 0.9597474823387945, + "flos": 25808532053760.0, + "grad_norm": 7.391797460805174, + "language_loss": 0.8199175, + "learning_rate": 1.6947568785002698e-08, + "loss": 0.83500648, + "num_input_tokens_seen": 344415115, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.28601074, + "step": 15963, + "time_per_iteration": 2.6970748901367188 + }, + { + "auxiliary_loss_clip": 0.01218121, + "auxiliary_loss_mlp": 0.00203776, + "balance_loss_clip": 1.01123083, + "balance_loss_mlp": 0.18140058, + "epoch": 0.9598076055914625, + "flos": 23768519028480.0, + "grad_norm": 24.28100492726069, + "language_loss": 0.80507231, + "learning_rate": 1.689701268270527e-08, + "loss": 0.81929123, + "num_input_tokens_seen": 344435185, + "router_z_loss_clip": 2.0703125, + "router_z_loss_mlp": 0.22399902, + "step": 15964, + "time_per_iteration": 2.6624326705932617 + }, + { + "auxiliary_loss_clip": 0.0107452, + "auxiliary_loss_mlp": 0.00043124, + "balance_loss_clip": 0.93739879, + "balance_loss_mlp": 0.03733046, + "epoch": 0.9598677288441305, + "flos": 56515962464640.0, + "grad_norm": 1.0605869786334077, + "language_loss": 0.56964982, + "learning_rate": 1.684653177987161e-08, + "loss": 0.58082628, + "num_input_tokens_seen": 344488950, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.05786133, + "step": 15965, + "time_per_iteration": 3.176532506942749 + }, + { + "auxiliary_loss_clip": 0.01248922, + "auxiliary_loss_mlp": 0.00218392, + "balance_loss_clip": 1.0289762, + "balance_loss_mlp": 0.19458576, + "epoch": 0.9599278520967984, + "flos": 22997480659200.0, + "grad_norm": 3.522406266792881, + "language_loss": 0.84665602, + "learning_rate": 1.6796126078416627e-08, + "loss": 0.8613292, + "num_input_tokens_seen": 344506740, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.23828125, + "step": 15966, + "time_per_iteration": 2.6461853981018066 + }, + { + "auxiliary_loss_clip": 0.01235602, + "auxiliary_loss_mlp": 0.00227526, + "balance_loss_clip": 1.02135539, + "balance_loss_mlp": 0.20168176, + "epoch": 0.9599879753494664, + "flos": 23039676161280.0, + "grad_norm": 12.85437816837059, + "language_loss": 0.86602283, + "learning_rate": 1.674579558025102e-08, + "loss": 0.8806541, + "num_input_tokens_seen": 344526670, + "router_z_loss_clip": 2.14746094, + "router_z_loss_mlp": 0.25854492, + "step": 15967, + "time_per_iteration": 2.730241298675537 + }, + { + "auxiliary_loss_clip": 0.01258545, + "auxiliary_loss_mlp": 0.00227131, + "balance_loss_clip": 1.03032541, + "balance_loss_mlp": 0.20052361, + "epoch": 0.9600480986021344, + "flos": 16392287560320.0, + "grad_norm": 31.311893056058967, + "language_loss": 0.90425193, + "learning_rate": 1.669554028728348e-08, + "loss": 0.91910863, + "num_input_tokens_seen": 344541995, + "router_z_loss_clip": 2.28320312, + "router_z_loss_mlp": 0.26611328, + "step": 15968, + "time_per_iteration": 2.6620426177978516 + }, + { + "auxiliary_loss_clip": 0.01260228, + "auxiliary_loss_mlp": 0.00241758, + "balance_loss_clip": 1.03422403, + "balance_loss_mlp": 0.21599722, + "epoch": 0.9601082218548024, + "flos": 24276439296000.0, + "grad_norm": 111.08449222666309, + "language_loss": 0.787875, + "learning_rate": 1.6645360201420044e-08, + "loss": 0.80289483, + "num_input_tokens_seen": 344559980, + "router_z_loss_clip": 2.25683594, + "router_z_loss_mlp": 0.25744629, + "step": 15969, + "time_per_iteration": 4.0765769481658936 + }, + { + "auxiliary_loss_clip": 0.01242111, + "auxiliary_loss_mlp": 0.00237599, + "balance_loss_clip": 1.02597213, + "balance_loss_mlp": 0.21289897, + "epoch": 0.9601683451074703, + "flos": 19609991804160.0, + "grad_norm": 98.57274819820601, + "language_loss": 0.88086843, + "learning_rate": 1.6595255324563186e-08, + "loss": 0.89566553, + "num_input_tokens_seen": 344577765, + "router_z_loss_clip": 2.16699219, + "router_z_loss_mlp": 0.24719238, + "step": 15970, + "time_per_iteration": 4.077553987503052 + }, + { + "auxiliary_loss_clip": 0.01242014, + "auxiliary_loss_mlp": 0.0022549, + "balance_loss_clip": 1.02610326, + "balance_loss_mlp": 0.20094456, + "epoch": 0.9602284683601383, + "flos": 26651104358400.0, + "grad_norm": 12.85000161317417, + "language_loss": 0.83179367, + "learning_rate": 1.654522565861316e-08, + "loss": 0.84646869, + "num_input_tokens_seen": 344597650, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.24536133, + "step": 15971, + "time_per_iteration": 2.7990846633911133 + }, + { + "auxiliary_loss_clip": 0.01251964, + "auxiliary_loss_mlp": 0.00253374, + "balance_loss_clip": 1.02635455, + "balance_loss_mlp": 0.22358373, + "epoch": 0.9602885916128062, + "flos": 15554096714880.0, + "grad_norm": 20.811595878718666, + "language_loss": 0.7506392, + "learning_rate": 1.64952712054669e-08, + "loss": 0.76569259, + "num_input_tokens_seen": 344613580, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.29797363, + "step": 15972, + "time_per_iteration": 2.6459720134735107 + }, + { + "auxiliary_loss_clip": 0.01233009, + "auxiliary_loss_mlp": 0.00228846, + "balance_loss_clip": 1.01955748, + "balance_loss_mlp": 0.20474172, + "epoch": 0.9603487148654742, + "flos": 16502353810560.0, + "grad_norm": 45.95099964452309, + "language_loss": 0.83929384, + "learning_rate": 1.644539196701844e-08, + "loss": 0.85391235, + "num_input_tokens_seen": 344626910, + "router_z_loss_clip": 2.13574219, + "router_z_loss_mlp": 0.2409668, + "step": 15973, + "time_per_iteration": 2.6219522953033447 + }, + { + "auxiliary_loss_clip": 0.01225111, + "auxiliary_loss_mlp": 0.00212336, + "balance_loss_clip": 1.01701701, + "balance_loss_mlp": 0.18913756, + "epoch": 0.9604088381181421, + "flos": 20845354308480.0, + "grad_norm": 24.423503758393455, + "language_loss": 0.75223374, + "learning_rate": 1.639558794515983e-08, + "loss": 0.76660824, + "num_input_tokens_seen": 344644330, + "router_z_loss_clip": 2.07910156, + "router_z_loss_mlp": 0.23193359, + "step": 15974, + "time_per_iteration": 2.6559197902679443 + }, + { + "auxiliary_loss_clip": 0.0124353, + "auxiliary_loss_mlp": 0.00209047, + "balance_loss_clip": 1.0263015, + "balance_loss_mlp": 0.18381086, + "epoch": 0.9604689613708102, + "flos": 19683105937920.0, + "grad_norm": 567.6978911996267, + "language_loss": 0.75696886, + "learning_rate": 1.6345859141779105e-08, + "loss": 0.77149469, + "num_input_tokens_seen": 344663910, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.25219727, + "step": 15975, + "time_per_iteration": 2.708954095840454 + }, + { + "auxiliary_loss_clip": 0.01220537, + "auxiliary_loss_mlp": 0.00200794, + "balance_loss_clip": 1.01334453, + "balance_loss_mlp": 0.17702374, + "epoch": 0.9605290846234781, + "flos": 24097568544000.0, + "grad_norm": 406.0729957797124, + "language_loss": 0.65627599, + "learning_rate": 1.6296205558762322e-08, + "loss": 0.67048931, + "num_input_tokens_seen": 344682320, + "router_z_loss_clip": 2.07421875, + "router_z_loss_mlp": 0.2376709, + "step": 15976, + "time_per_iteration": 2.6741058826446533 + }, + { + "auxiliary_loss_clip": 0.01218083, + "auxiliary_loss_mlp": 0.0022483, + "balance_loss_clip": 1.01001501, + "balance_loss_mlp": 0.20005873, + "epoch": 0.9605892078761461, + "flos": 27122575299840.0, + "grad_norm": 627.1399249780445, + "language_loss": 0.74704051, + "learning_rate": 1.624662719799219e-08, + "loss": 0.76146966, + "num_input_tokens_seen": 344701355, + "router_z_loss_clip": 2.08203125, + "router_z_loss_mlp": 0.24780273, + "step": 15977, + "time_per_iteration": 2.7639200687408447 + }, + { + "auxiliary_loss_clip": 0.01220298, + "auxiliary_loss_mlp": 0.00209768, + "balance_loss_clip": 1.00671709, + "balance_loss_mlp": 0.18593821, + "epoch": 0.9606493311288141, + "flos": 14136918543360.0, + "grad_norm": 5.203916626320149, + "language_loss": 0.90661657, + "learning_rate": 1.6197124061348766e-08, + "loss": 0.92091721, + "num_input_tokens_seen": 344717980, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.23840332, + "step": 15978, + "time_per_iteration": 4.174534320831299 + }, + { + "auxiliary_loss_clip": 0.01268796, + "auxiliary_loss_mlp": 0.00221151, + "balance_loss_clip": 1.03908932, + "balance_loss_mlp": 0.19364971, + "epoch": 0.960709454381482, + "flos": 15813336147840.0, + "grad_norm": 663.7601205374656, + "language_loss": 0.92558652, + "learning_rate": 1.614769615070921e-08, + "loss": 0.94048595, + "num_input_tokens_seen": 344733480, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.27514648, + "step": 15979, + "time_per_iteration": 2.6592459678649902 + }, + { + "auxiliary_loss_clip": 0.01255919, + "auxiliary_loss_mlp": 0.00234165, + "balance_loss_clip": 1.03442955, + "balance_loss_mlp": 0.21012081, + "epoch": 0.96076957763415, + "flos": 22565403959040.0, + "grad_norm": 9.062463266687596, + "language_loss": 0.88011748, + "learning_rate": 1.6098343467947805e-08, + "loss": 0.89501834, + "num_input_tokens_seen": 344752130, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.24060059, + "step": 15980, + "time_per_iteration": 4.120023488998413 + }, + { + "auxiliary_loss_clip": 0.01250198, + "auxiliary_loss_mlp": 0.00229712, + "balance_loss_clip": 1.02711618, + "balance_loss_mlp": 0.20304462, + "epoch": 0.960829700886818, + "flos": 24681260551680.0, + "grad_norm": 223.5245389722119, + "language_loss": 0.76244473, + "learning_rate": 1.6049066014935942e-08, + "loss": 0.77724385, + "num_input_tokens_seen": 344771195, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.26672363, + "step": 15981, + "time_per_iteration": 2.741119623184204 + }, + { + "auxiliary_loss_clip": 0.01226565, + "auxiliary_loss_mlp": 0.00225621, + "balance_loss_clip": 1.01539111, + "balance_loss_mlp": 0.20280465, + "epoch": 0.960889824139486, + "flos": 26542223256960.0, + "grad_norm": 22.682792721485242, + "language_loss": 0.77175117, + "learning_rate": 1.5999863793542344e-08, + "loss": 0.78627306, + "num_input_tokens_seen": 344793150, + "router_z_loss_clip": 2.11425781, + "router_z_loss_mlp": 0.22827148, + "step": 15982, + "time_per_iteration": 2.705253839492798 + }, + { + "auxiliary_loss_clip": 0.0107976, + "auxiliary_loss_mlp": 0.00115943, + "balance_loss_clip": 0.94331849, + "balance_loss_mlp": 0.10821827, + "epoch": 0.9609499473921539, + "flos": 71114942586240.0, + "grad_norm": 0.6669234889349531, + "language_loss": 0.5283258, + "learning_rate": 1.595073680563286e-08, + "loss": 0.54028285, + "num_input_tokens_seen": 344852855, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.07714844, + "step": 15983, + "time_per_iteration": 3.2680556774139404 + }, + { + "auxiliary_loss_clip": 0.01240779, + "auxiliary_loss_mlp": 0.00232799, + "balance_loss_clip": 1.02168238, + "balance_loss_mlp": 0.20566732, + "epoch": 0.9610100706448219, + "flos": 20552466810240.0, + "grad_norm": 2127.070369428902, + "language_loss": 0.77831501, + "learning_rate": 1.5901685053070212e-08, + "loss": 0.79305077, + "num_input_tokens_seen": 344869830, + "router_z_loss_clip": 2.19433594, + "router_z_loss_mlp": 0.27124023, + "step": 15984, + "time_per_iteration": 2.6434197425842285 + }, + { + "auxiliary_loss_clip": 0.01215898, + "auxiliary_loss_mlp": 0.00229265, + "balance_loss_clip": 1.01143622, + "balance_loss_mlp": 0.20698512, + "epoch": 0.9610701938974898, + "flos": 14064199459200.0, + "grad_norm": 91.73398460032372, + "language_loss": 0.75889838, + "learning_rate": 1.5852708537714477e-08, + "loss": 0.77335006, + "num_input_tokens_seen": 344888905, + "router_z_loss_clip": 2.046875, + "router_z_loss_mlp": 0.22277832, + "step": 15985, + "time_per_iteration": 2.701310634613037 + }, + { + "auxiliary_loss_clip": 0.01243757, + "auxiliary_loss_mlp": 0.00239023, + "balance_loss_clip": 1.02986622, + "balance_loss_mlp": 0.21547963, + "epoch": 0.9611303171501578, + "flos": 20229989483520.0, + "grad_norm": 16.688619520016573, + "language_loss": 0.86604214, + "learning_rate": 1.580380726142283e-08, + "loss": 0.88086998, + "num_input_tokens_seen": 344907160, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.23535156, + "step": 15986, + "time_per_iteration": 2.647540330886841 + }, + { + "auxiliary_loss_clip": 0.0125287, + "auxiliary_loss_mlp": 0.00234686, + "balance_loss_clip": 1.0329144, + "balance_loss_mlp": 0.20971204, + "epoch": 0.9611904404028258, + "flos": 20951075013120.0, + "grad_norm": 109.37862831168249, + "language_loss": 0.72853833, + "learning_rate": 1.5754981226049792e-08, + "loss": 0.74341393, + "num_input_tokens_seen": 344922400, + "router_z_loss_clip": 2.20214844, + "router_z_loss_mlp": 0.25, + "step": 15987, + "time_per_iteration": 2.6470534801483154 + }, + { + "auxiliary_loss_clip": 0.01241391, + "auxiliary_loss_mlp": 0.00213684, + "balance_loss_clip": 1.02690446, + "balance_loss_mlp": 0.18907873, + "epoch": 0.9612505636554938, + "flos": 24827740214400.0, + "grad_norm": 30.380931107448582, + "language_loss": 0.71649033, + "learning_rate": 1.5706230433446544e-08, + "loss": 0.73104101, + "num_input_tokens_seen": 344941910, + "router_z_loss_clip": 2.14746094, + "router_z_loss_mlp": 0.24597168, + "step": 15988, + "time_per_iteration": 2.699697494506836 + }, + { + "auxiliary_loss_clip": 0.01227558, + "auxiliary_loss_mlp": 0.00239966, + "balance_loss_clip": 1.01680911, + "balance_loss_mlp": 0.21666095, + "epoch": 0.9613106869081617, + "flos": 17164977955200.0, + "grad_norm": 266.73201484310965, + "language_loss": 0.8165729, + "learning_rate": 1.5657554885462055e-08, + "loss": 0.83124816, + "num_input_tokens_seen": 344960020, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.23303223, + "step": 15989, + "time_per_iteration": 2.629465341567993 + }, + { + "auxiliary_loss_clip": 0.01077481, + "auxiliary_loss_mlp": 0.00064872, + "balance_loss_clip": 0.93940878, + "balance_loss_mlp": 0.05941209, + "epoch": 0.9613708101608297, + "flos": 61563818522880.0, + "grad_norm": 0.799933855185931, + "language_loss": 0.62860483, + "learning_rate": 1.5608954583941737e-08, + "loss": 0.64002836, + "num_input_tokens_seen": 345018290, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.0546875, + "step": 15990, + "time_per_iteration": 3.054628372192383 + }, + { + "auxiliary_loss_clip": 0.01238695, + "auxiliary_loss_mlp": 0.00234281, + "balance_loss_clip": 1.02034342, + "balance_loss_mlp": 0.20879391, + "epoch": 0.9614309334134977, + "flos": 27417904922880.0, + "grad_norm": 29.91382051003049, + "language_loss": 0.87859643, + "learning_rate": 1.5560429530729003e-08, + "loss": 0.89332616, + "num_input_tokens_seen": 345040235, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.25439453, + "step": 15991, + "time_per_iteration": 2.731978178024292 + }, + { + "auxiliary_loss_clip": 0.01244318, + "auxiliary_loss_mlp": 0.00227816, + "balance_loss_clip": 1.02296591, + "balance_loss_mlp": 0.20275854, + "epoch": 0.9614910566661656, + "flos": 22819148611200.0, + "grad_norm": 19.79734238262538, + "language_loss": 0.96620643, + "learning_rate": 1.5511979727663493e-08, + "loss": 0.98092777, + "num_input_tokens_seen": 345054540, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.25073242, + "step": 15992, + "time_per_iteration": 2.725853681564331 + }, + { + "auxiliary_loss_clip": 0.01241396, + "auxiliary_loss_mlp": 0.00248356, + "balance_loss_clip": 1.02346909, + "balance_loss_mlp": 0.22389433, + "epoch": 0.9615511799188337, + "flos": 20667812359680.0, + "grad_norm": 250.5092454152884, + "language_loss": 0.81260806, + "learning_rate": 1.5463605176582406e-08, + "loss": 0.82750559, + "num_input_tokens_seen": 345074035, + "router_z_loss_clip": 2.18066406, + "router_z_loss_mlp": 0.2442627, + "step": 15993, + "time_per_iteration": 2.6639461517333984 + }, + { + "auxiliary_loss_clip": 0.01239158, + "auxiliary_loss_mlp": 0.00233515, + "balance_loss_clip": 1.02370954, + "balance_loss_mlp": 0.20832609, + "epoch": 0.9616113031715016, + "flos": 33149212035840.0, + "grad_norm": 68.48854326557878, + "language_loss": 0.73996973, + "learning_rate": 1.5415305879320716e-08, + "loss": 0.75469649, + "num_input_tokens_seen": 345099270, + "router_z_loss_clip": 2.15527344, + "router_z_loss_mlp": 0.2520752, + "step": 15994, + "time_per_iteration": 2.8319759368896484 + }, + { + "auxiliary_loss_clip": 0.01224656, + "auxiliary_loss_mlp": 0.00242739, + "balance_loss_clip": 1.01208007, + "balance_loss_mlp": 0.21914771, + "epoch": 0.9616714264241696, + "flos": 25009807276800.0, + "grad_norm": 47.79394965923078, + "language_loss": 0.89550966, + "learning_rate": 1.5367081837709183e-08, + "loss": 0.91018361, + "num_input_tokens_seen": 345116975, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.23583984, + "step": 15995, + "time_per_iteration": 2.6702170372009277 + }, + { + "auxiliary_loss_clip": 0.01246743, + "auxiliary_loss_mlp": 0.00231196, + "balance_loss_clip": 1.02606797, + "balance_loss_mlp": 0.20541114, + "epoch": 0.9617315496768375, + "flos": 13547480359680.0, + "grad_norm": 2.364739993183151, + "language_loss": 0.84167957, + "learning_rate": 1.5318933053576788e-08, + "loss": 0.8564589, + "num_input_tokens_seen": 345133645, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.25793457, + "step": 15996, + "time_per_iteration": 2.7916643619537354 + }, + { + "auxiliary_loss_clip": 0.0123787, + "auxiliary_loss_mlp": 0.00224075, + "balance_loss_clip": 1.01823974, + "balance_loss_mlp": 0.19859987, + "epoch": 0.9617916729295055, + "flos": 11254512781440.0, + "grad_norm": 157.89000651975698, + "language_loss": 0.87579119, + "learning_rate": 1.52708595287494e-08, + "loss": 0.8904106, + "num_input_tokens_seen": 345150740, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.25439453, + "step": 15997, + "time_per_iteration": 2.6400372982025146 + }, + { + "auxiliary_loss_clip": 0.01233563, + "auxiliary_loss_mlp": 0.00225415, + "balance_loss_clip": 1.02158356, + "balance_loss_mlp": 0.20079809, + "epoch": 0.9618517961821734, + "flos": 22819723228800.0, + "grad_norm": 3.108762394418708, + "language_loss": 0.74650019, + "learning_rate": 1.522286126505001e-08, + "loss": 0.76108992, + "num_input_tokens_seen": 345170365, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.24633789, + "step": 15998, + "time_per_iteration": 2.7263853549957275 + }, + { + "auxiliary_loss_clip": 0.01234491, + "auxiliary_loss_mlp": 0.00220357, + "balance_loss_clip": 1.01914299, + "balance_loss_mlp": 0.1954782, + "epoch": 0.9619119194348414, + "flos": 16617340224000.0, + "grad_norm": 119.83350670735315, + "language_loss": 0.80151176, + "learning_rate": 1.5174938264298498e-08, + "loss": 0.81606025, + "num_input_tokens_seen": 345188930, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.24865723, + "step": 15999, + "time_per_iteration": 2.698713779449463 + }, + { + "auxiliary_loss_clip": 0.01224555, + "auxiliary_loss_mlp": 0.00208292, + "balance_loss_clip": 1.01315427, + "balance_loss_mlp": 0.18360405, + "epoch": 0.9619720426875094, + "flos": 24535140024960.0, + "grad_norm": 7.668157566458209, + "language_loss": 0.73685062, + "learning_rate": 1.5127090528312514e-08, + "loss": 0.7511791, + "num_input_tokens_seen": 345209615, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.24694824, + "step": 16000, + "time_per_iteration": 2.704021453857422 + }, + { + "auxiliary_loss_clip": 0.01238268, + "auxiliary_loss_mlp": 0.00234363, + "balance_loss_clip": 1.02433038, + "balance_loss_mlp": 0.21072398, + "epoch": 0.9620321659401774, + "flos": 20632224960000.0, + "grad_norm": 3.653878887377098, + "language_loss": 0.80802357, + "learning_rate": 1.5079318058905723e-08, + "loss": 0.82274985, + "num_input_tokens_seen": 345229175, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.23620605, + "step": 16001, + "time_per_iteration": 2.723879337310791 + }, + { + "auxiliary_loss_clip": 0.01243041, + "auxiliary_loss_mlp": 0.0023468, + "balance_loss_clip": 1.02037251, + "balance_loss_mlp": 0.20835811, + "epoch": 0.9620922891928453, + "flos": 18515290959360.0, + "grad_norm": 10.436337070351978, + "language_loss": 0.76543903, + "learning_rate": 1.5031620857890447e-08, + "loss": 0.78021622, + "num_input_tokens_seen": 345247815, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.26306152, + "step": 16002, + "time_per_iteration": 2.6429059505462646 + }, + { + "auxiliary_loss_clip": 0.01236214, + "auxiliary_loss_mlp": 0.00237863, + "balance_loss_clip": 1.02395248, + "balance_loss_mlp": 0.21120733, + "epoch": 0.9621524124455133, + "flos": 28767391914240.0, + "grad_norm": 5.387286623586943, + "language_loss": 0.71115541, + "learning_rate": 1.4983998927074804e-08, + "loss": 0.72589624, + "num_input_tokens_seen": 345269935, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.26635742, + "step": 16003, + "time_per_iteration": 2.7399063110351562 + }, + { + "auxiliary_loss_clip": 0.01257569, + "auxiliary_loss_mlp": 0.00221566, + "balance_loss_clip": 1.03325307, + "balance_loss_mlp": 0.19697304, + "epoch": 0.9622125356981813, + "flos": 19098875226240.0, + "grad_norm": 91.99047213810094, + "language_loss": 0.86478066, + "learning_rate": 1.493645226826512e-08, + "loss": 0.87957203, + "num_input_tokens_seen": 345288310, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.24597168, + "step": 16004, + "time_per_iteration": 2.761401891708374 + }, + { + "auxiliary_loss_clip": 0.0124251, + "auxiliary_loss_mlp": 0.00231512, + "balance_loss_clip": 1.02787447, + "balance_loss_mlp": 0.20505928, + "epoch": 0.9622726589508492, + "flos": 20302816308480.0, + "grad_norm": 6.7286130954680505, + "language_loss": 0.8936283, + "learning_rate": 1.4888980883263958e-08, + "loss": 0.90836853, + "num_input_tokens_seen": 345306615, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.26477051, + "step": 16005, + "time_per_iteration": 2.823108434677124 + }, + { + "auxiliary_loss_clip": 0.01242692, + "auxiliary_loss_mlp": 0.00221077, + "balance_loss_clip": 1.02632642, + "balance_loss_mlp": 0.1956259, + "epoch": 0.9623327822035173, + "flos": 54929750889600.0, + "grad_norm": 281.06047501978907, + "language_loss": 0.74683022, + "learning_rate": 1.4841584773871652e-08, + "loss": 0.76146793, + "num_input_tokens_seen": 345331935, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.25463867, + "step": 16006, + "time_per_iteration": 2.952655553817749 + }, + { + "auxiliary_loss_clip": 0.01215302, + "auxiliary_loss_mlp": 0.00219938, + "balance_loss_clip": 1.01034999, + "balance_loss_mlp": 0.19707385, + "epoch": 0.9623929054561852, + "flos": 21759029585280.0, + "grad_norm": 21.865208298661358, + "language_loss": 0.83234167, + "learning_rate": 1.479426394188521e-08, + "loss": 0.84669399, + "num_input_tokens_seen": 345351510, + "router_z_loss_clip": 2.04882812, + "router_z_loss_mlp": 0.2286377, + "step": 16007, + "time_per_iteration": 2.6530075073242188 + }, + { + "auxiliary_loss_clip": 0.01270499, + "auxiliary_loss_mlp": 0.00242255, + "balance_loss_clip": 1.04483199, + "balance_loss_mlp": 0.2141096, + "epoch": 0.9624530287088532, + "flos": 17931563038080.0, + "grad_norm": 27.51757825185796, + "language_loss": 0.75370044, + "learning_rate": 1.4747018389099198e-08, + "loss": 0.76882803, + "num_input_tokens_seen": 345367750, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.28186035, + "step": 16008, + "time_per_iteration": 2.627997875213623 + }, + { + "auxiliary_loss_clip": 0.0125697, + "auxiliary_loss_mlp": 0.00233993, + "balance_loss_clip": 1.03220487, + "balance_loss_mlp": 0.20911407, + "epoch": 0.9625131519615211, + "flos": 23253739263360.0, + "grad_norm": 54.213821397284015, + "language_loss": 0.84549916, + "learning_rate": 1.469984811730529e-08, + "loss": 0.86040878, + "num_input_tokens_seen": 345384790, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.24890137, + "step": 16009, + "time_per_iteration": 2.656085252761841 + }, + { + "auxiliary_loss_clip": 0.01238935, + "auxiliary_loss_mlp": 0.00222662, + "balance_loss_clip": 1.02529073, + "balance_loss_mlp": 0.19673394, + "epoch": 0.9625732752141891, + "flos": 18916628595840.0, + "grad_norm": 15.2611642599492, + "language_loss": 0.84141439, + "learning_rate": 1.4652753128292061e-08, + "loss": 0.85603034, + "num_input_tokens_seen": 345403390, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.25915527, + "step": 16010, + "time_per_iteration": 2.6238789558410645 + }, + { + "auxiliary_loss_clip": 0.01264377, + "auxiliary_loss_mlp": 0.00230664, + "balance_loss_clip": 1.03553188, + "balance_loss_mlp": 0.20379402, + "epoch": 0.962633398466857, + "flos": 16252918790400.0, + "grad_norm": 29.545010267224107, + "language_loss": 0.78018612, + "learning_rate": 1.4605733423845635e-08, + "loss": 0.79513651, + "num_input_tokens_seen": 345418685, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.26831055, + "step": 16011, + "time_per_iteration": 4.0425310134887695 + }, + { + "auxiliary_loss_clip": 0.01229991, + "auxiliary_loss_mlp": 0.00230799, + "balance_loss_clip": 1.01702547, + "balance_loss_mlp": 0.20768446, + "epoch": 0.962693521719525, + "flos": 54197424403200.0, + "grad_norm": 38.86128334567636, + "language_loss": 0.77808642, + "learning_rate": 1.4558789005748585e-08, + "loss": 0.79269433, + "num_input_tokens_seen": 345442380, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.23083496, + "step": 16012, + "time_per_iteration": 4.448180198669434 + }, + { + "auxiliary_loss_clip": 0.01276264, + "auxiliary_loss_mlp": 0.00227461, + "balance_loss_clip": 1.04357457, + "balance_loss_mlp": 0.19921988, + "epoch": 0.962753644972193, + "flos": 33105795471360.0, + "grad_norm": 8.464249907310474, + "language_loss": 0.8292551, + "learning_rate": 1.4511919875781264e-08, + "loss": 0.84429228, + "num_input_tokens_seen": 345463815, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.28234863, + "step": 16013, + "time_per_iteration": 2.7786550521850586 + }, + { + "auxiliary_loss_clip": 0.01245501, + "auxiliary_loss_mlp": 0.00220325, + "balance_loss_clip": 1.0302422, + "balance_loss_mlp": 0.19475421, + "epoch": 0.962813768224861, + "flos": 42230660837760.0, + "grad_norm": 12.255167723647027, + "language_loss": 0.75574982, + "learning_rate": 1.4465126035720698e-08, + "loss": 0.77040809, + "num_input_tokens_seen": 345484525, + "router_z_loss_clip": 2.15332031, + "router_z_loss_mlp": 0.25561523, + "step": 16014, + "time_per_iteration": 2.827484607696533 + }, + { + "auxiliary_loss_clip": 0.01231364, + "auxiliary_loss_mlp": 0.002251, + "balance_loss_clip": 1.02143335, + "balance_loss_mlp": 0.20136544, + "epoch": 0.9628738914775289, + "flos": 43944677003520.0, + "grad_norm": 33.61944243503026, + "language_loss": 0.79991311, + "learning_rate": 1.4418407487341688e-08, + "loss": 0.81447768, + "num_input_tokens_seen": 345508295, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.23730469, + "step": 16015, + "time_per_iteration": 2.8674893379211426 + }, + { + "auxiliary_loss_clip": 0.01244533, + "auxiliary_loss_mlp": 0.00245413, + "balance_loss_clip": 1.02367663, + "balance_loss_mlp": 0.21918671, + "epoch": 0.9629340147301969, + "flos": 15596184476160.0, + "grad_norm": 59.693810804997355, + "language_loss": 0.85606247, + "learning_rate": 1.4371764232415707e-08, + "loss": 0.8709619, + "num_input_tokens_seen": 345525155, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.2623291, + "step": 16016, + "time_per_iteration": 2.6473381519317627 + }, + { + "auxiliary_loss_clip": 0.01079679, + "auxiliary_loss_mlp": 0.00061333, + "balance_loss_clip": 0.94173801, + "balance_loss_mlp": 0.0549906, + "epoch": 0.9629941379828649, + "flos": 62951011816320.0, + "grad_norm": 0.79992087670393, + "language_loss": 0.62327689, + "learning_rate": 1.4325196272711337e-08, + "loss": 0.63468701, + "num_input_tokens_seen": 345578905, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.06347656, + "step": 16017, + "time_per_iteration": 3.078737258911133 + }, + { + "auxiliary_loss_clip": 0.01246887, + "auxiliary_loss_mlp": 0.00234507, + "balance_loss_clip": 1.02648914, + "balance_loss_mlp": 0.21086776, + "epoch": 0.9630542612355328, + "flos": 29899116702720.0, + "grad_norm": 53.02526087777699, + "language_loss": 0.77861971, + "learning_rate": 1.4278703609994502e-08, + "loss": 0.79343367, + "num_input_tokens_seen": 345598965, + "router_z_loss_clip": 2.20605469, + "router_z_loss_mlp": 0.23632812, + "step": 16018, + "time_per_iteration": 2.837155818939209 + }, + { + "auxiliary_loss_clip": 0.01229788, + "auxiliary_loss_mlp": 0.00208995, + "balance_loss_clip": 1.0210197, + "balance_loss_mlp": 0.18517706, + "epoch": 0.9631143844882009, + "flos": 17894575008000.0, + "grad_norm": 3.869184366893518, + "language_loss": 0.88202071, + "learning_rate": 1.4232286246028457e-08, + "loss": 0.8964085, + "num_input_tokens_seen": 345617945, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.23815918, + "step": 16019, + "time_per_iteration": 2.639145851135254 + }, + { + "auxiliary_loss_clip": 0.01218472, + "auxiliary_loss_mlp": 0.00211638, + "balance_loss_clip": 1.01169574, + "balance_loss_mlp": 0.18919113, + "epoch": 0.9631745077408688, + "flos": 26139161767680.0, + "grad_norm": 320.13845137603016, + "language_loss": 0.77001917, + "learning_rate": 1.4185944182572907e-08, + "loss": 0.78432029, + "num_input_tokens_seen": 345637920, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.2244873, + "step": 16020, + "time_per_iteration": 2.725978136062622 + }, + { + "auxiliary_loss_clip": 0.01234489, + "auxiliary_loss_mlp": 0.00230425, + "balance_loss_clip": 1.02101707, + "balance_loss_mlp": 0.20648748, + "epoch": 0.9632346309935368, + "flos": 24973645259520.0, + "grad_norm": 7.270055386161854, + "language_loss": 0.84972632, + "learning_rate": 1.4139677421385331e-08, + "loss": 0.86437547, + "num_input_tokens_seen": 345656195, + "router_z_loss_clip": 2.13574219, + "router_z_loss_mlp": 0.23937988, + "step": 16021, + "time_per_iteration": 4.176476716995239 + }, + { + "auxiliary_loss_clip": 0.01255277, + "auxiliary_loss_mlp": 0.00237463, + "balance_loss_clip": 1.03592038, + "balance_loss_mlp": 0.20832866, + "epoch": 0.9632947542462047, + "flos": 23617226943360.0, + "grad_norm": 3.0421000489921677, + "language_loss": 0.73653698, + "learning_rate": 1.4093485964220331e-08, + "loss": 0.75146443, + "num_input_tokens_seen": 345676700, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.29138184, + "step": 16022, + "time_per_iteration": 4.086475849151611 + }, + { + "auxiliary_loss_clip": 0.01245801, + "auxiliary_loss_mlp": 0.00221793, + "balance_loss_clip": 1.0310024, + "balance_loss_mlp": 0.19702142, + "epoch": 0.9633548774988727, + "flos": 26395599939840.0, + "grad_norm": 98.90981022054248, + "language_loss": 0.81558239, + "learning_rate": 1.4047369812829168e-08, + "loss": 0.83025837, + "num_input_tokens_seen": 345696725, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.2479248, + "step": 16023, + "time_per_iteration": 2.735689640045166 + }, + { + "auxiliary_loss_clip": 0.01234759, + "auxiliary_loss_mlp": 0.00234188, + "balance_loss_clip": 1.01863194, + "balance_loss_mlp": 0.20982145, + "epoch": 0.9634150007515406, + "flos": 23767728929280.0, + "grad_norm": 20.806852464153888, + "language_loss": 0.88528061, + "learning_rate": 1.4001328968960891e-08, + "loss": 0.89997005, + "num_input_tokens_seen": 345716245, + "router_z_loss_clip": 2.16113281, + "router_z_loss_mlp": 0.24353027, + "step": 16024, + "time_per_iteration": 2.7104156017303467 + }, + { + "auxiliary_loss_clip": 0.01251302, + "auxiliary_loss_mlp": 0.00232544, + "balance_loss_clip": 1.02921748, + "balance_loss_mlp": 0.20745055, + "epoch": 0.9634751240042086, + "flos": 24135346673280.0, + "grad_norm": 250.80005476418194, + "language_loss": 0.8673138, + "learning_rate": 1.3955363434361212e-08, + "loss": 0.88215232, + "num_input_tokens_seen": 345739060, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.25097656, + "step": 16025, + "time_per_iteration": 2.7182412147521973 + }, + { + "auxiliary_loss_clip": 0.01239826, + "auxiliary_loss_mlp": 0.00233286, + "balance_loss_clip": 1.02412081, + "balance_loss_mlp": 0.20752476, + "epoch": 0.9635352472568766, + "flos": 24349086552960.0, + "grad_norm": 30.952100972337007, + "language_loss": 0.83572853, + "learning_rate": 1.3909473210773181e-08, + "loss": 0.85045969, + "num_input_tokens_seen": 345758325, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.25756836, + "step": 16026, + "time_per_iteration": 2.7038135528564453 + }, + { + "auxiliary_loss_clip": 0.01233223, + "auxiliary_loss_mlp": 0.00215872, + "balance_loss_clip": 1.01661646, + "balance_loss_mlp": 0.19070736, + "epoch": 0.9635953705095446, + "flos": 23984772860160.0, + "grad_norm": 35.289960653634324, + "language_loss": 0.72313213, + "learning_rate": 1.3863658299936965e-08, + "loss": 0.73762304, + "num_input_tokens_seen": 345778530, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.25170898, + "step": 16027, + "time_per_iteration": 2.6863901615142822 + }, + { + "auxiliary_loss_clip": 0.01247694, + "auxiliary_loss_mlp": 0.0022811, + "balance_loss_clip": 1.02986646, + "balance_loss_mlp": 0.20245585, + "epoch": 0.9636554937622125, + "flos": 19828436365440.0, + "grad_norm": 23.35389592962768, + "language_loss": 0.9500103, + "learning_rate": 1.3817918703589837e-08, + "loss": 0.96476829, + "num_input_tokens_seen": 345796535, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.25646973, + "step": 16028, + "time_per_iteration": 2.674448251724243 + }, + { + "auxiliary_loss_clip": 0.01080138, + "auxiliary_loss_mlp": 0.00069902, + "balance_loss_clip": 0.9434604, + "balance_loss_mlp": 0.06391731, + "epoch": 0.9637156170148805, + "flos": 67435499986560.0, + "grad_norm": 0.7004204566291904, + "language_loss": 0.52196383, + "learning_rate": 1.3772254423466412e-08, + "loss": 0.53346419, + "num_input_tokens_seen": 345859700, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.05981445, + "step": 16029, + "time_per_iteration": 3.1287684440612793 + }, + { + "auxiliary_loss_clip": 0.01247187, + "auxiliary_loss_mlp": 0.0024951, + "balance_loss_clip": 1.02532721, + "balance_loss_mlp": 0.22467905, + "epoch": 0.9637757402675484, + "flos": 20300912887680.0, + "grad_norm": 733.3273720037168, + "language_loss": 0.80289453, + "learning_rate": 1.372666546129797e-08, + "loss": 0.8178615, + "num_input_tokens_seen": 345878760, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.24841309, + "step": 16030, + "time_per_iteration": 2.726562738418579 + }, + { + "auxiliary_loss_clip": 0.01233166, + "auxiliary_loss_mlp": 0.00235527, + "balance_loss_clip": 1.02386749, + "balance_loss_mlp": 0.21150681, + "epoch": 0.9638358635202164, + "flos": 27234544970880.0, + "grad_norm": 147.555808896212, + "language_loss": 0.72499293, + "learning_rate": 1.3681151818813575e-08, + "loss": 0.73967993, + "num_input_tokens_seen": 345900445, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.24023438, + "step": 16031, + "time_per_iteration": 2.6884586811065674 + }, + { + "auxiliary_loss_clip": 0.01079563, + "auxiliary_loss_mlp": 0.00063665, + "balance_loss_clip": 0.94269633, + "balance_loss_mlp": 0.05627393, + "epoch": 0.9638959867728845, + "flos": 70288998278400.0, + "grad_norm": 0.8073286176547437, + "language_loss": 0.59893489, + "learning_rate": 1.3635713497738955e-08, + "loss": 0.61036718, + "num_input_tokens_seen": 345961020, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.07373047, + "step": 16032, + "time_per_iteration": 3.168043851852417 + }, + { + "auxiliary_loss_clip": 0.01218073, + "auxiliary_loss_mlp": 0.00203991, + "balance_loss_clip": 1.01032019, + "balance_loss_mlp": 0.18089983, + "epoch": 0.9639561100255524, + "flos": 25407517639680.0, + "grad_norm": 47.19508905505992, + "language_loss": 0.75502861, + "learning_rate": 1.3590350499796954e-08, + "loss": 0.7692492, + "num_input_tokens_seen": 345980210, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.23083496, + "step": 16033, + "time_per_iteration": 2.64340877532959 + }, + { + "auxiliary_loss_clip": 0.01224122, + "auxiliary_loss_mlp": 0.00207235, + "balance_loss_clip": 1.01228762, + "balance_loss_mlp": 0.18036509, + "epoch": 0.9640162332782204, + "flos": 18113881495680.0, + "grad_norm": 57.4227907816985, + "language_loss": 0.74824029, + "learning_rate": 1.3545062826707976e-08, + "loss": 0.76255387, + "num_input_tokens_seen": 345998280, + "router_z_loss_clip": 2.12011719, + "router_z_loss_mlp": 0.26843262, + "step": 16034, + "time_per_iteration": 2.6642842292785645 + }, + { + "auxiliary_loss_clip": 0.01247581, + "auxiliary_loss_mlp": 0.00218701, + "balance_loss_clip": 1.02397656, + "balance_loss_mlp": 0.19232035, + "epoch": 0.9640763565308883, + "flos": 23440295525760.0, + "grad_norm": 122.12069369765945, + "language_loss": 0.83964038, + "learning_rate": 1.3499850480189313e-08, + "loss": 0.85430324, + "num_input_tokens_seen": 346015545, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.26379395, + "step": 16035, + "time_per_iteration": 2.649488925933838 + }, + { + "auxiliary_loss_clip": 0.01239232, + "auxiliary_loss_mlp": 0.002166, + "balance_loss_clip": 1.02888918, + "balance_loss_mlp": 0.19172059, + "epoch": 0.9641364797835563, + "flos": 22419355259520.0, + "grad_norm": 4.58348700199132, + "language_loss": 0.90196854, + "learning_rate": 1.3454713461955591e-08, + "loss": 0.91652679, + "num_input_tokens_seen": 346034055, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.2487793, + "step": 16036, + "time_per_iteration": 2.6674163341522217 + }, + { + "auxiliary_loss_clip": 0.01234053, + "auxiliary_loss_mlp": 0.00223416, + "balance_loss_clip": 1.01938927, + "balance_loss_mlp": 0.19829878, + "epoch": 0.9641966030362242, + "flos": 30622357048320.0, + "grad_norm": 17.91679758735931, + "language_loss": 0.76210952, + "learning_rate": 1.340965177371789e-08, + "loss": 0.77668417, + "num_input_tokens_seen": 346054130, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.2512207, + "step": 16037, + "time_per_iteration": 2.7418813705444336 + }, + { + "auxiliary_loss_clip": 0.01233621, + "auxiliary_loss_mlp": 0.00230228, + "balance_loss_clip": 1.02035666, + "balance_loss_mlp": 0.2074708, + "epoch": 0.9642567262888923, + "flos": 20953122088320.0, + "grad_norm": 14.065202096460201, + "language_loss": 0.69478887, + "learning_rate": 1.3364665417185506e-08, + "loss": 0.70942736, + "num_input_tokens_seen": 346072990, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.22741699, + "step": 16038, + "time_per_iteration": 2.659046173095703 + }, + { + "auxiliary_loss_clip": 0.01248821, + "auxiliary_loss_mlp": 0.00232834, + "balance_loss_clip": 1.03330028, + "balance_loss_mlp": 0.20801473, + "epoch": 0.9643168495415602, + "flos": 22639415932800.0, + "grad_norm": 227.94115804067437, + "language_loss": 0.78803706, + "learning_rate": 1.3319754394064187e-08, + "loss": 0.80285358, + "num_input_tokens_seen": 346093745, + "router_z_loss_clip": 2.15332031, + "router_z_loss_mlp": 0.24829102, + "step": 16039, + "time_per_iteration": 2.7354507446289062 + }, + { + "auxiliary_loss_clip": 0.01257119, + "auxiliary_loss_mlp": 0.00216227, + "balance_loss_clip": 1.03519535, + "balance_loss_mlp": 0.19093126, + "epoch": 0.9643769727942282, + "flos": 20266259241600.0, + "grad_norm": 15.870405697518342, + "language_loss": 0.85004413, + "learning_rate": 1.327491870605657e-08, + "loss": 0.86477757, + "num_input_tokens_seen": 346110115, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.25317383, + "step": 16040, + "time_per_iteration": 2.6476831436157227 + }, + { + "auxiliary_loss_clip": 0.0123178, + "auxiliary_loss_mlp": 0.00222095, + "balance_loss_clip": 1.01946509, + "balance_loss_mlp": 0.19880167, + "epoch": 0.9644370960468961, + "flos": 13881845088000.0, + "grad_norm": 159.44970579045307, + "language_loss": 0.83261269, + "learning_rate": 1.3230158354863296e-08, + "loss": 0.8471514, + "num_input_tokens_seen": 346127165, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.23266602, + "step": 16041, + "time_per_iteration": 2.662736654281616 + }, + { + "auxiliary_loss_clip": 0.01225236, + "auxiliary_loss_mlp": 0.0022784, + "balance_loss_clip": 1.02179646, + "balance_loss_mlp": 0.20429617, + "epoch": 0.9644972192995641, + "flos": 17238199829760.0, + "grad_norm": 28.368779296668453, + "language_loss": 0.80384845, + "learning_rate": 1.3185473342181674e-08, + "loss": 0.81837922, + "num_input_tokens_seen": 346145950, + "router_z_loss_clip": 2.03222656, + "router_z_loss_mlp": 0.23522949, + "step": 16042, + "time_per_iteration": 2.8859167098999023 + }, + { + "auxiliary_loss_clip": 0.01244769, + "auxiliary_loss_mlp": 0.00242585, + "balance_loss_clip": 1.02647448, + "balance_loss_mlp": 0.21836179, + "epoch": 0.964557342552232, + "flos": 23840340272640.0, + "grad_norm": 690.989249903127, + "language_loss": 0.87576449, + "learning_rate": 1.3140863669705683e-08, + "loss": 0.89063799, + "num_input_tokens_seen": 346165005, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.24230957, + "step": 16043, + "time_per_iteration": 2.8195996284484863 + }, + { + "auxiliary_loss_clip": 0.01243456, + "auxiliary_loss_mlp": 0.00231267, + "balance_loss_clip": 1.0251565, + "balance_loss_mlp": 0.20700774, + "epoch": 0.9646174658049, + "flos": 21653129312640.0, + "grad_norm": 12.357221797274441, + "language_loss": 0.79001093, + "learning_rate": 1.3096329339127522e-08, + "loss": 0.80475819, + "num_input_tokens_seen": 346185095, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.24255371, + "step": 16044, + "time_per_iteration": 2.6677896976470947 + }, + { + "auxiliary_loss_clip": 0.01223471, + "auxiliary_loss_mlp": 0.00212885, + "balance_loss_clip": 1.0124011, + "balance_loss_mlp": 0.18812475, + "epoch": 0.9646775890575681, + "flos": 17129570123520.0, + "grad_norm": 9.234018698491953, + "language_loss": 0.76792979, + "learning_rate": 1.3051870352135397e-08, + "loss": 0.78229338, + "num_input_tokens_seen": 346202580, + "router_z_loss_clip": 2.11035156, + "router_z_loss_mlp": 0.24755859, + "step": 16045, + "time_per_iteration": 2.6754860877990723 + }, + { + "auxiliary_loss_clip": 0.01243115, + "auxiliary_loss_mlp": 0.00220753, + "balance_loss_clip": 1.02417874, + "balance_loss_mlp": 0.19693494, + "epoch": 0.964737712310236, + "flos": 13005732458880.0, + "grad_norm": 19.70780348796743, + "language_loss": 0.84962606, + "learning_rate": 1.3007486710415737e-08, + "loss": 0.86426473, + "num_input_tokens_seen": 346219395, + "router_z_loss_clip": 2.18652344, + "router_z_loss_mlp": 0.23828125, + "step": 16046, + "time_per_iteration": 2.62587833404541 + }, + { + "auxiliary_loss_clip": 0.01249644, + "auxiliary_loss_mlp": 0.00238038, + "balance_loss_clip": 1.02752352, + "balance_loss_mlp": 0.21073928, + "epoch": 0.964797835562904, + "flos": 24279240556800.0, + "grad_norm": 14.9231426429612, + "language_loss": 0.7069878, + "learning_rate": 1.2963178415651199e-08, + "loss": 0.72186464, + "num_input_tokens_seen": 346239715, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.27270508, + "step": 16047, + "time_per_iteration": 2.7213146686553955 + }, + { + "auxiliary_loss_clip": 0.01230453, + "auxiliary_loss_mlp": 0.00226928, + "balance_loss_clip": 1.02066231, + "balance_loss_mlp": 0.2032773, + "epoch": 0.9648579588155719, + "flos": 20522697413760.0, + "grad_norm": 40.308074546095426, + "language_loss": 0.78927666, + "learning_rate": 1.2918945469521992e-08, + "loss": 0.80385047, + "num_input_tokens_seen": 346258500, + "router_z_loss_clip": 2.10058594, + "router_z_loss_mlp": 0.2364502, + "step": 16048, + "time_per_iteration": 2.66385817527771 + }, + { + "auxiliary_loss_clip": 0.01229334, + "auxiliary_loss_mlp": 0.00229304, + "balance_loss_clip": 1.01680589, + "balance_loss_mlp": 0.20461527, + "epoch": 0.9649180820682399, + "flos": 32154844855680.0, + "grad_norm": 37.66494921235357, + "language_loss": 0.70861268, + "learning_rate": 1.2874787873705662e-08, + "loss": 0.72319901, + "num_input_tokens_seen": 346279110, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.24694824, + "step": 16049, + "time_per_iteration": 2.7230241298675537 + }, + { + "auxiliary_loss_clip": 0.01239714, + "auxiliary_loss_mlp": 0.00220159, + "balance_loss_clip": 1.02529359, + "balance_loss_mlp": 0.19417137, + "epoch": 0.9649782053209078, + "flos": 20522589672960.0, + "grad_norm": 4.18536910132653, + "language_loss": 0.78156394, + "learning_rate": 1.2830705629876427e-08, + "loss": 0.79616266, + "num_input_tokens_seen": 346297860, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.2598877, + "step": 16050, + "time_per_iteration": 2.7811851501464844 + }, + { + "auxiliary_loss_clip": 0.01262523, + "auxiliary_loss_mlp": 0.00254795, + "balance_loss_clip": 1.03670716, + "balance_loss_mlp": 0.22710255, + "epoch": 0.9650383285735759, + "flos": 43067953843200.0, + "grad_norm": 25.248348732429484, + "language_loss": 0.7908054, + "learning_rate": 1.278669873970606e-08, + "loss": 0.8059786, + "num_input_tokens_seen": 346319860, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.27685547, + "step": 16051, + "time_per_iteration": 2.8524117469787598 + }, + { + "auxiliary_loss_clip": 0.010845, + "auxiliary_loss_mlp": 0.00067519, + "balance_loss_clip": 0.9487431, + "balance_loss_mlp": 0.06122473, + "epoch": 0.9650984518262438, + "flos": 61748255882880.0, + "grad_norm": 1.1735954464003318, + "language_loss": 0.58413297, + "learning_rate": 1.2742767204863004e-08, + "loss": 0.59565318, + "num_input_tokens_seen": 346379025, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.06298828, + "step": 16052, + "time_per_iteration": 3.2335007190704346 + }, + { + "auxiliary_loss_clip": 0.01252899, + "auxiliary_loss_mlp": 0.00224525, + "balance_loss_clip": 1.03534567, + "balance_loss_mlp": 0.19945529, + "epoch": 0.9651585750789118, + "flos": 29789337761280.0, + "grad_norm": 6.759906974662552, + "language_loss": 0.8341375, + "learning_rate": 1.2698911027013482e-08, + "loss": 0.84891164, + "num_input_tokens_seen": 346402250, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.25085449, + "step": 16053, + "time_per_iteration": 4.137972593307495 + }, + { + "auxiliary_loss_clip": 0.01242267, + "auxiliary_loss_mlp": 0.00213632, + "balance_loss_clip": 1.02542102, + "balance_loss_mlp": 0.18928899, + "epoch": 0.9652186983315797, + "flos": 16873060124160.0, + "grad_norm": 6.834424232153116, + "language_loss": 0.77356136, + "learning_rate": 1.2655130207820386e-08, + "loss": 0.78812027, + "num_input_tokens_seen": 346419555, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.24365234, + "step": 16054, + "time_per_iteration": 2.59033203125 + }, + { + "auxiliary_loss_clip": 0.01227844, + "auxiliary_loss_mlp": 0.0023117, + "balance_loss_clip": 1.01742721, + "balance_loss_mlp": 0.20764959, + "epoch": 0.9652788215842477, + "flos": 31649761762560.0, + "grad_norm": 1358.806664114813, + "language_loss": 0.69074845, + "learning_rate": 1.2611424748943944e-08, + "loss": 0.7053386, + "num_input_tokens_seen": 346441245, + "router_z_loss_clip": 2.10351562, + "router_z_loss_mlp": 0.23535156, + "step": 16055, + "time_per_iteration": 4.151713848114014 + }, + { + "auxiliary_loss_clip": 0.01239327, + "auxiliary_loss_mlp": 0.00213715, + "balance_loss_clip": 1.02547514, + "balance_loss_mlp": 0.18816891, + "epoch": 0.9653389448369156, + "flos": 24754266944640.0, + "grad_norm": 5.286538478779013, + "language_loss": 0.83959264, + "learning_rate": 1.2567794652041719e-08, + "loss": 0.85412306, + "num_input_tokens_seen": 346460065, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.25524902, + "step": 16056, + "time_per_iteration": 2.782090187072754 + }, + { + "auxiliary_loss_clip": 0.0124927, + "auxiliary_loss_mlp": 0.00247447, + "balance_loss_clip": 1.0299046, + "balance_loss_mlp": 0.22190058, + "epoch": 0.9653990680895836, + "flos": 20297249700480.0, + "grad_norm": 106.34545396185382, + "language_loss": 0.80572629, + "learning_rate": 1.2524239918767498e-08, + "loss": 0.82069343, + "num_input_tokens_seen": 346478005, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.25561523, + "step": 16057, + "time_per_iteration": 2.6287038326263428 + }, + { + "auxiliary_loss_clip": 0.01239761, + "auxiliary_loss_mlp": 0.00225339, + "balance_loss_clip": 1.02648568, + "balance_loss_mlp": 0.20136558, + "epoch": 0.9654591913422517, + "flos": 22528775064960.0, + "grad_norm": 21.586023371900346, + "language_loss": 0.78263527, + "learning_rate": 1.2480760550773295e-08, + "loss": 0.79728627, + "num_input_tokens_seen": 346497575, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.23986816, + "step": 16058, + "time_per_iteration": 2.6838817596435547 + }, + { + "auxiliary_loss_clip": 0.0123785, + "auxiliary_loss_mlp": 0.00192192, + "balance_loss_clip": 1.0279448, + "balance_loss_mlp": 0.16918433, + "epoch": 0.9655193145949196, + "flos": 26763002202240.0, + "grad_norm": 19.02626840679074, + "language_loss": 0.82097745, + "learning_rate": 1.2437356549708011e-08, + "loss": 0.83527786, + "num_input_tokens_seen": 346520000, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.23010254, + "step": 16059, + "time_per_iteration": 2.745992422103882 + }, + { + "auxiliary_loss_clip": 0.01252805, + "auxiliary_loss_mlp": 0.00256187, + "balance_loss_clip": 1.02906477, + "balance_loss_mlp": 0.2282919, + "epoch": 0.9655794378475876, + "flos": 41970703132800.0, + "grad_norm": 134.7781265006225, + "language_loss": 0.80284786, + "learning_rate": 1.239402791721722e-08, + "loss": 0.81793773, + "num_input_tokens_seen": 346541605, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.27905273, + "step": 16060, + "time_per_iteration": 2.85612154006958 + }, + { + "auxiliary_loss_clip": 0.01228715, + "auxiliary_loss_mlp": 0.00199633, + "balance_loss_clip": 1.02012515, + "balance_loss_mlp": 0.17695987, + "epoch": 0.9656395611002555, + "flos": 27709427704320.0, + "grad_norm": 31.15301023410572, + "language_loss": 0.83108926, + "learning_rate": 1.2350774654944273e-08, + "loss": 0.8453728, + "num_input_tokens_seen": 346560955, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.22680664, + "step": 16061, + "time_per_iteration": 2.737530469894409 + }, + { + "auxiliary_loss_clip": 0.01080239, + "auxiliary_loss_mlp": 0.0008479, + "balance_loss_clip": 0.94041443, + "balance_loss_mlp": 0.07844776, + "epoch": 0.9656996843529235, + "flos": 68968562411520.0, + "grad_norm": 0.7100571784626574, + "language_loss": 0.63340652, + "learning_rate": 1.2307596764528749e-08, + "loss": 0.64505678, + "num_input_tokens_seen": 346621615, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.06347656, + "step": 16062, + "time_per_iteration": 3.187025785446167 + }, + { + "auxiliary_loss_clip": 0.01228986, + "auxiliary_loss_mlp": 0.00221133, + "balance_loss_clip": 1.01367259, + "balance_loss_mlp": 0.19781616, + "epoch": 0.9657598076055914, + "flos": 20631327120000.0, + "grad_norm": 25.919293353340915, + "language_loss": 1.01887298, + "learning_rate": 1.226449424760867e-08, + "loss": 1.03337419, + "num_input_tokens_seen": 346637460, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.23303223, + "step": 16063, + "time_per_iteration": 4.122468709945679 + }, + { + "auxiliary_loss_clip": 0.01244846, + "auxiliary_loss_mlp": 0.0022323, + "balance_loss_clip": 1.02610791, + "balance_loss_mlp": 0.19900671, + "epoch": 0.9658199308582595, + "flos": 20448577699200.0, + "grad_norm": 68.34429802492816, + "language_loss": 0.88882756, + "learning_rate": 1.2221467105818062e-08, + "loss": 0.90350831, + "num_input_tokens_seen": 346655625, + "router_z_loss_clip": 2.19042969, + "router_z_loss_mlp": 0.24230957, + "step": 16064, + "time_per_iteration": 4.067443132400513 + }, + { + "auxiliary_loss_clip": 0.0124312, + "auxiliary_loss_mlp": 0.00211311, + "balance_loss_clip": 1.03405166, + "balance_loss_mlp": 0.18767229, + "epoch": 0.9658800541109274, + "flos": 24718033100160.0, + "grad_norm": 8.291143834987027, + "language_loss": 0.90097052, + "learning_rate": 1.2178515340788731e-08, + "loss": 0.91551483, + "num_input_tokens_seen": 346675220, + "router_z_loss_clip": 2.08984375, + "router_z_loss_mlp": 0.2364502, + "step": 16065, + "time_per_iteration": 2.678403854370117 + }, + { + "auxiliary_loss_clip": 0.01240591, + "auxiliary_loss_mlp": 0.00216324, + "balance_loss_clip": 1.02351415, + "balance_loss_mlp": 0.19031286, + "epoch": 0.9659401773635954, + "flos": 21610035970560.0, + "grad_norm": 20.490582944723464, + "language_loss": 0.74095845, + "learning_rate": 1.2135638954149151e-08, + "loss": 0.75552762, + "num_input_tokens_seen": 346694710, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.2598877, + "step": 16066, + "time_per_iteration": 2.7183218002319336 + }, + { + "auxiliary_loss_clip": 0.01237954, + "auxiliary_loss_mlp": 0.00234252, + "balance_loss_clip": 1.02466965, + "balance_loss_mlp": 0.21010032, + "epoch": 0.9660003006162633, + "flos": 20301200196480.0, + "grad_norm": 86.76660110778045, + "language_loss": 0.86932957, + "learning_rate": 1.209283794752558e-08, + "loss": 0.88405162, + "num_input_tokens_seen": 346712645, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.24157715, + "step": 16067, + "time_per_iteration": 2.704571485519409 + }, + { + "auxiliary_loss_clip": 0.01232128, + "auxiliary_loss_mlp": 0.0021376, + "balance_loss_clip": 1.0198586, + "balance_loss_mlp": 0.19022846, + "epoch": 0.9660604238689313, + "flos": 24461954064000.0, + "grad_norm": 32.15760322170617, + "language_loss": 0.77083075, + "learning_rate": 1.2050112322540496e-08, + "loss": 0.78528959, + "num_input_tokens_seen": 346732375, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.23547363, + "step": 16068, + "time_per_iteration": 2.7286553382873535 + }, + { + "auxiliary_loss_clip": 0.01210744, + "auxiliary_loss_mlp": 0.00209306, + "balance_loss_clip": 1.00820589, + "balance_loss_mlp": 0.18588185, + "epoch": 0.9661205471215992, + "flos": 19864023765120.0, + "grad_norm": 6.737551698146475, + "language_loss": 0.7539562, + "learning_rate": 1.20074620808146e-08, + "loss": 0.76815677, + "num_input_tokens_seen": 346750430, + "router_z_loss_clip": 2.02539062, + "router_z_loss_mlp": 0.23413086, + "step": 16069, + "time_per_iteration": 2.624368190765381 + }, + { + "auxiliary_loss_clip": 0.01234567, + "auxiliary_loss_mlp": 0.00211083, + "balance_loss_clip": 1.02290881, + "balance_loss_mlp": 0.18820685, + "epoch": 0.9661806703742672, + "flos": 20557889763840.0, + "grad_norm": 3.8803610642127073, + "language_loss": 0.95437944, + "learning_rate": 1.1964887223964826e-08, + "loss": 0.96883595, + "num_input_tokens_seen": 346768455, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.22888184, + "step": 16070, + "time_per_iteration": 2.6665706634521484 + }, + { + "auxiliary_loss_clip": 0.01254747, + "auxiliary_loss_mlp": 0.00250542, + "balance_loss_clip": 1.03235102, + "balance_loss_mlp": 0.22398247, + "epoch": 0.9662407936269353, + "flos": 21430949736960.0, + "grad_norm": 33.27540364110205, + "language_loss": 0.85548556, + "learning_rate": 1.1922387753605878e-08, + "loss": 0.87053847, + "num_input_tokens_seen": 346786530, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.26574707, + "step": 16071, + "time_per_iteration": 2.6475629806518555 + }, + { + "auxiliary_loss_clip": 0.01236508, + "auxiliary_loss_mlp": 0.00203119, + "balance_loss_clip": 1.02290416, + "balance_loss_mlp": 0.17967048, + "epoch": 0.9663009168796032, + "flos": 14902893095040.0, + "grad_norm": 13.31786272731078, + "language_loss": 0.76811254, + "learning_rate": 1.1879963671349137e-08, + "loss": 0.78250885, + "num_input_tokens_seen": 346804635, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.234375, + "step": 16072, + "time_per_iteration": 2.697726011276245 + }, + { + "auxiliary_loss_clip": 0.01241605, + "auxiliary_loss_mlp": 0.00215316, + "balance_loss_clip": 1.02516127, + "balance_loss_mlp": 0.18972206, + "epoch": 0.9663610401322712, + "flos": 24310877460480.0, + "grad_norm": 12.83078097193414, + "language_loss": 0.83839679, + "learning_rate": 1.1837614978803534e-08, + "loss": 0.85296595, + "num_input_tokens_seen": 346823070, + "router_z_loss_clip": 2.16503906, + "router_z_loss_mlp": 0.25610352, + "step": 16073, + "time_per_iteration": 2.7032251358032227 + }, + { + "auxiliary_loss_clip": 0.01255378, + "auxiliary_loss_mlp": 0.00236716, + "balance_loss_clip": 1.03215456, + "balance_loss_mlp": 0.21075241, + "epoch": 0.9664211633849391, + "flos": 17637849527040.0, + "grad_norm": 128.12336563022203, + "language_loss": 0.85517573, + "learning_rate": 1.1795341677574677e-08, + "loss": 0.87009668, + "num_input_tokens_seen": 346841180, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.2598877, + "step": 16074, + "time_per_iteration": 2.6254944801330566 + }, + { + "auxiliary_loss_clip": 0.0124155, + "auxiliary_loss_mlp": 0.00233484, + "balance_loss_clip": 1.02683616, + "balance_loss_mlp": 0.20779479, + "epoch": 0.9664812866376071, + "flos": 29789409588480.0, + "grad_norm": 8.868503954418076, + "language_loss": 0.82085472, + "learning_rate": 1.1753143769265728e-08, + "loss": 0.83560503, + "num_input_tokens_seen": 346864250, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.25671387, + "step": 16075, + "time_per_iteration": 2.7335364818573 + }, + { + "auxiliary_loss_clip": 0.01235204, + "auxiliary_loss_mlp": 0.00203798, + "balance_loss_clip": 1.02386856, + "balance_loss_mlp": 0.17940757, + "epoch": 0.966541409890275, + "flos": 14282320798080.0, + "grad_norm": 320.97164014551845, + "language_loss": 0.87752211, + "learning_rate": 1.171102125547696e-08, + "loss": 0.89191216, + "num_input_tokens_seen": 346881955, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.24401855, + "step": 16076, + "time_per_iteration": 2.625866413116455 + }, + { + "auxiliary_loss_clip": 0.01248496, + "auxiliary_loss_mlp": 0.00250752, + "balance_loss_clip": 1.02826619, + "balance_loss_mlp": 0.22381061, + "epoch": 0.9666015331429431, + "flos": 19860432405120.0, + "grad_norm": 8.025939785534883, + "language_loss": 0.77308279, + "learning_rate": 1.166897413780532e-08, + "loss": 0.78807521, + "num_input_tokens_seen": 346900445, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.26904297, + "step": 16077, + "time_per_iteration": 2.724830389022827 + }, + { + "auxiliary_loss_clip": 0.0124553, + "auxiliary_loss_mlp": 0.0022611, + "balance_loss_clip": 1.02727222, + "balance_loss_mlp": 0.20102805, + "epoch": 0.966661656395611, + "flos": 27125951178240.0, + "grad_norm": 3.392560170517307, + "language_loss": 0.68255758, + "learning_rate": 1.1627002417845533e-08, + "loss": 0.69727397, + "num_input_tokens_seen": 346920135, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.25109863, + "step": 16078, + "time_per_iteration": 2.6919450759887695 + }, + { + "auxiliary_loss_clip": 0.01251089, + "auxiliary_loss_mlp": 0.00233187, + "balance_loss_clip": 1.0327239, + "balance_loss_mlp": 0.20859386, + "epoch": 0.966721779648279, + "flos": 21508229848320.0, + "grad_norm": 3.0807456211007618, + "language_loss": 0.80859441, + "learning_rate": 1.158510609718899e-08, + "loss": 0.82343721, + "num_input_tokens_seen": 346940450, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.24621582, + "step": 16079, + "time_per_iteration": 2.6972713470458984 + }, + { + "auxiliary_loss_clip": 0.01221111, + "auxiliary_loss_mlp": 0.00213379, + "balance_loss_clip": 1.01140475, + "balance_loss_mlp": 0.1904674, + "epoch": 0.9667819029009469, + "flos": 23878118401920.0, + "grad_norm": 17.879021800709342, + "language_loss": 0.78431839, + "learning_rate": 1.1543285177424644e-08, + "loss": 0.79866326, + "num_input_tokens_seen": 346960935, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.22924805, + "step": 16080, + "time_per_iteration": 2.7048211097717285 + }, + { + "auxiliary_loss_clip": 0.01247863, + "auxiliary_loss_mlp": 0.00212207, + "balance_loss_clip": 1.02765965, + "balance_loss_mlp": 0.18804336, + "epoch": 0.9668420261536149, + "flos": 21507224267520.0, + "grad_norm": 6.474950784707148, + "language_loss": 0.85457557, + "learning_rate": 1.1501539660138115e-08, + "loss": 0.86917627, + "num_input_tokens_seen": 346980100, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.24133301, + "step": 16081, + "time_per_iteration": 2.7167351245880127 + }, + { + "auxiliary_loss_clip": 0.01229996, + "auxiliary_loss_mlp": 0.0021256, + "balance_loss_clip": 1.01990521, + "balance_loss_mlp": 0.18970716, + "epoch": 0.9669021494062828, + "flos": 26687266375680.0, + "grad_norm": 30.666187238825437, + "language_loss": 0.74176311, + "learning_rate": 1.145986954691236e-08, + "loss": 0.75618863, + "num_input_tokens_seen": 347001250, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.2286377, + "step": 16082, + "time_per_iteration": 2.699021100997925 + }, + { + "auxiliary_loss_clip": 0.01217567, + "auxiliary_loss_mlp": 0.00212382, + "balance_loss_clip": 1.00807619, + "balance_loss_mlp": 0.18985185, + "epoch": 0.9669622726589508, + "flos": 29825032901760.0, + "grad_norm": 4.82468993756781, + "language_loss": 0.83906853, + "learning_rate": 1.141827483932789e-08, + "loss": 0.85336804, + "num_input_tokens_seen": 347022975, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.2253418, + "step": 16083, + "time_per_iteration": 2.7624294757843018 + }, + { + "auxiliary_loss_clip": 0.01239618, + "auxiliary_loss_mlp": 0.00232577, + "balance_loss_clip": 1.02222252, + "balance_loss_mlp": 0.20880625, + "epoch": 0.9670223959116189, + "flos": 22922499018240.0, + "grad_norm": 6.545236104497162, + "language_loss": 0.86622488, + "learning_rate": 1.1376755538961669e-08, + "loss": 0.88094687, + "num_input_tokens_seen": 347038780, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.2376709, + "step": 16084, + "time_per_iteration": 2.6289479732513428 + }, + { + "auxiliary_loss_clip": 0.0126003, + "auxiliary_loss_mlp": 0.00241102, + "balance_loss_clip": 1.03129029, + "balance_loss_mlp": 0.21447098, + "epoch": 0.9670825191642868, + "flos": 18624495283200.0, + "grad_norm": 107.25675004009703, + "language_loss": 0.80954093, + "learning_rate": 1.1335311647387991e-08, + "loss": 0.82455224, + "num_input_tokens_seen": 347056705, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.26635742, + "step": 16085, + "time_per_iteration": 2.749258041381836 + }, + { + "auxiliary_loss_clip": 0.01260526, + "auxiliary_loss_mlp": 0.00233739, + "balance_loss_clip": 1.0360868, + "balance_loss_mlp": 0.20689301, + "epoch": 0.9671426424169548, + "flos": 24497936513280.0, + "grad_norm": 106.67068275163957, + "language_loss": 0.78121781, + "learning_rate": 1.1293943166178709e-08, + "loss": 0.79616046, + "num_input_tokens_seen": 347075710, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.26855469, + "step": 16086, + "time_per_iteration": 2.697448492050171 + }, + { + "auxiliary_loss_clip": 0.01253822, + "auxiliary_loss_mlp": 0.00234794, + "balance_loss_clip": 1.03109562, + "balance_loss_mlp": 0.2087705, + "epoch": 0.9672027656696227, + "flos": 20371189847040.0, + "grad_norm": 7.851238971414784, + "language_loss": 0.86185652, + "learning_rate": 1.125265009690235e-08, + "loss": 0.87674272, + "num_input_tokens_seen": 347092325, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.26037598, + "step": 16087, + "time_per_iteration": 2.722717761993408 + }, + { + "auxiliary_loss_clip": 0.01241534, + "auxiliary_loss_mlp": 0.00230275, + "balance_loss_clip": 1.02307081, + "balance_loss_mlp": 0.20424002, + "epoch": 0.9672628889222907, + "flos": 18880179269760.0, + "grad_norm": 53.778709715209565, + "language_loss": 0.83198428, + "learning_rate": 1.1211432441124769e-08, + "loss": 0.84670234, + "num_input_tokens_seen": 347110595, + "router_z_loss_clip": 2.18652344, + "router_z_loss_mlp": 0.26062012, + "step": 16088, + "time_per_iteration": 2.7862236499786377 + }, + { + "auxiliary_loss_clip": 0.01224409, + "auxiliary_loss_mlp": 0.00212828, + "balance_loss_clip": 1.01403594, + "balance_loss_mlp": 0.18920034, + "epoch": 0.9673230121749586, + "flos": 28695247447680.0, + "grad_norm": 9.188387246578614, + "language_loss": 0.77402389, + "learning_rate": 1.117029020040916e-08, + "loss": 0.78839624, + "num_input_tokens_seen": 347131625, + "router_z_loss_clip": 2.10253906, + "router_z_loss_mlp": 0.23632812, + "step": 16089, + "time_per_iteration": 2.737764835357666 + }, + { + "auxiliary_loss_clip": 0.01251492, + "auxiliary_loss_mlp": 0.00229818, + "balance_loss_clip": 1.03609085, + "balance_loss_mlp": 0.20448619, + "epoch": 0.9673831354276267, + "flos": 20484452407680.0, + "grad_norm": 32.48472914990659, + "language_loss": 0.84526157, + "learning_rate": 1.1129223376315167e-08, + "loss": 0.86007464, + "num_input_tokens_seen": 347147910, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.25292969, + "step": 16090, + "time_per_iteration": 2.699033260345459 + }, + { + "auxiliary_loss_clip": 0.01255913, + "auxiliary_loss_mlp": 0.00219504, + "balance_loss_clip": 1.03343391, + "balance_loss_mlp": 0.19314706, + "epoch": 0.9674432586802946, + "flos": 26797548107520.0, + "grad_norm": 11.695896630996101, + "language_loss": 0.76526582, + "learning_rate": 1.1088231970400653e-08, + "loss": 0.78002, + "num_input_tokens_seen": 347168805, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.26367188, + "step": 16091, + "time_per_iteration": 2.744002342224121 + }, + { + "auxiliary_loss_clip": 0.01219191, + "auxiliary_loss_mlp": 0.00209476, + "balance_loss_clip": 1.01402569, + "balance_loss_mlp": 0.18584868, + "epoch": 0.9675033819329626, + "flos": 22310941034880.0, + "grad_norm": 4.95826007215197, + "language_loss": 0.83764458, + "learning_rate": 1.1047315984219484e-08, + "loss": 0.85193121, + "num_input_tokens_seen": 347189455, + "router_z_loss_clip": 2.05175781, + "router_z_loss_mlp": 0.2364502, + "step": 16092, + "time_per_iteration": 2.706245183944702 + }, + { + "auxiliary_loss_clip": 0.01234271, + "auxiliary_loss_mlp": 0.00214687, + "balance_loss_clip": 1.02002501, + "balance_loss_mlp": 0.19082169, + "epoch": 0.9675635051856305, + "flos": 12675713276160.0, + "grad_norm": 167.75898387761623, + "language_loss": 0.86145008, + "learning_rate": 1.1006475419323313e-08, + "loss": 0.87593961, + "num_input_tokens_seen": 347206030, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.2388916, + "step": 16093, + "time_per_iteration": 2.6254518032073975 + }, + { + "auxiliary_loss_clip": 0.01236436, + "auxiliary_loss_mlp": 0.00261138, + "balance_loss_clip": 1.01974225, + "balance_loss_mlp": 0.23493563, + "epoch": 0.9676236284382985, + "flos": 24608469640320.0, + "grad_norm": 35.127874809093534, + "language_loss": 0.76325339, + "learning_rate": 1.096571027726112e-08, + "loss": 0.77822918, + "num_input_tokens_seen": 347226250, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.26196289, + "step": 16094, + "time_per_iteration": 2.791022300720215 + }, + { + "auxiliary_loss_clip": 0.01243058, + "auxiliary_loss_mlp": 0.00228812, + "balance_loss_clip": 1.02477884, + "balance_loss_mlp": 0.20303863, + "epoch": 0.9676837516909664, + "flos": 23367145478400.0, + "grad_norm": 6.562757356260783, + "language_loss": 0.83825397, + "learning_rate": 1.0925020559578557e-08, + "loss": 0.85297263, + "num_input_tokens_seen": 347247350, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.25805664, + "step": 16095, + "time_per_iteration": 4.2886643409729 + }, + { + "auxiliary_loss_clip": 0.01254482, + "auxiliary_loss_mlp": 0.00245084, + "balance_loss_clip": 1.02969384, + "balance_loss_mlp": 0.21876267, + "epoch": 0.9677438749436345, + "flos": 20486894532480.0, + "grad_norm": 7.581739807520376, + "language_loss": 0.80317825, + "learning_rate": 1.0884406267818392e-08, + "loss": 0.81817389, + "num_input_tokens_seen": 347266870, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.26318359, + "step": 16096, + "time_per_iteration": 2.732398748397827 + }, + { + "auxiliary_loss_clip": 0.0127083, + "auxiliary_loss_mlp": 0.00239652, + "balance_loss_clip": 1.04782891, + "balance_loss_mlp": 0.21193582, + "epoch": 0.9678039981963025, + "flos": 47555889719040.0, + "grad_norm": 27.65889983206515, + "language_loss": 0.79284012, + "learning_rate": 1.0843867403520946e-08, + "loss": 0.80794495, + "num_input_tokens_seen": 347290120, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.2767334, + "step": 16097, + "time_per_iteration": 4.2704455852508545 + }, + { + "auxiliary_loss_clip": 0.01237039, + "auxiliary_loss_mlp": 0.00236001, + "balance_loss_clip": 1.0242666, + "balance_loss_mlp": 0.21096659, + "epoch": 0.9678641214489704, + "flos": 25040474513280.0, + "grad_norm": 9.429070093084126, + "language_loss": 0.84187287, + "learning_rate": 1.0803403968223434e-08, + "loss": 0.85660326, + "num_input_tokens_seen": 347308785, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.25048828, + "step": 16098, + "time_per_iteration": 2.705796241760254 + }, + { + "auxiliary_loss_clip": 0.01217245, + "auxiliary_loss_mlp": 0.00209617, + "balance_loss_clip": 1.01264465, + "balance_loss_mlp": 0.186777, + "epoch": 0.9679242447016384, + "flos": 19240937516160.0, + "grad_norm": 7.4084666126506225, + "language_loss": 0.95897555, + "learning_rate": 1.0763015963459965e-08, + "loss": 0.97324419, + "num_input_tokens_seen": 347326375, + "router_z_loss_clip": 2.04785156, + "router_z_loss_mlp": 0.22839355, + "step": 16099, + "time_per_iteration": 2.642177104949951 + }, + { + "auxiliary_loss_clip": 0.01240338, + "auxiliary_loss_mlp": 0.00225028, + "balance_loss_clip": 1.02286744, + "balance_loss_mlp": 0.20070945, + "epoch": 0.9679843679543063, + "flos": 33254681345280.0, + "grad_norm": 104.52143700240686, + "language_loss": 0.74254668, + "learning_rate": 1.0722703390762643e-08, + "loss": 0.75720036, + "num_input_tokens_seen": 347348250, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.24304199, + "step": 16100, + "time_per_iteration": 2.781959056854248 + }, + { + "auxiliary_loss_clip": 0.01235055, + "auxiliary_loss_mlp": 0.00222138, + "balance_loss_clip": 1.01937461, + "balance_loss_mlp": 0.19686571, + "epoch": 0.9680444912069743, + "flos": 22783633038720.0, + "grad_norm": 4.795172440562857, + "language_loss": 0.81104398, + "learning_rate": 1.0682466251659584e-08, + "loss": 0.82561588, + "num_input_tokens_seen": 347367400, + "router_z_loss_clip": 2.15332031, + "router_z_loss_mlp": 0.25280762, + "step": 16101, + "time_per_iteration": 2.674607038497925 + }, + { + "auxiliary_loss_clip": 0.01245257, + "auxiliary_loss_mlp": 0.00224702, + "balance_loss_clip": 1.02607715, + "balance_loss_mlp": 0.19894043, + "epoch": 0.9681046144596422, + "flos": 24024095274240.0, + "grad_norm": 13.191644825324829, + "language_loss": 0.82361829, + "learning_rate": 1.0642304547676672e-08, + "loss": 0.83831787, + "num_input_tokens_seen": 347387600, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.25769043, + "step": 16102, + "time_per_iteration": 2.809123992919922 + }, + { + "auxiliary_loss_clip": 0.01259428, + "auxiliary_loss_mlp": 0.00244189, + "balance_loss_clip": 1.03445828, + "balance_loss_mlp": 0.21689023, + "epoch": 0.9681647377123103, + "flos": 23441013797760.0, + "grad_norm": 77.6135917296932, + "language_loss": 0.87260389, + "learning_rate": 1.0602218280337139e-08, + "loss": 0.88764006, + "num_input_tokens_seen": 347406915, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.27294922, + "step": 16103, + "time_per_iteration": 2.6587398052215576 + }, + { + "auxiliary_loss_clip": 0.01250877, + "auxiliary_loss_mlp": 0.0021419, + "balance_loss_clip": 1.03190804, + "balance_loss_mlp": 0.18923935, + "epoch": 0.9682248609649782, + "flos": 22675075159680.0, + "grad_norm": 394.30577439037637, + "language_loss": 0.89178312, + "learning_rate": 1.0562207451160655e-08, + "loss": 0.90643376, + "num_input_tokens_seen": 347425140, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.24951172, + "step": 16104, + "time_per_iteration": 2.6975202560424805 + }, + { + "auxiliary_loss_clip": 0.01228999, + "auxiliary_loss_mlp": 0.00225192, + "balance_loss_clip": 1.01742816, + "balance_loss_mlp": 0.20098066, + "epoch": 0.9682849842176462, + "flos": 24428413739520.0, + "grad_norm": 7.476333050915163, + "language_loss": 0.84878832, + "learning_rate": 1.0522272061664672e-08, + "loss": 0.86333019, + "num_input_tokens_seen": 347446350, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.2421875, + "step": 16105, + "time_per_iteration": 4.2865309715271 + }, + { + "auxiliary_loss_clip": 0.01074759, + "auxiliary_loss_mlp": 0.00069345, + "balance_loss_clip": 0.93851793, + "balance_loss_mlp": 0.06286027, + "epoch": 0.9683451074703141, + "flos": 59995132784640.0, + "grad_norm": 0.8254122886656305, + "language_loss": 0.56179547, + "learning_rate": 1.0482412113363536e-08, + "loss": 0.57323647, + "num_input_tokens_seen": 347510135, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.06494141, + "step": 16106, + "time_per_iteration": 4.731706142425537 + }, + { + "auxiliary_loss_clip": 0.0108216, + "auxiliary_loss_mlp": 0.00072546, + "balance_loss_clip": 0.94513786, + "balance_loss_mlp": 0.06587067, + "epoch": 0.9684052307229821, + "flos": 52696145514240.0, + "grad_norm": 0.852318865068155, + "language_loss": 0.60928154, + "learning_rate": 1.0442627607768707e-08, + "loss": 0.62082863, + "num_input_tokens_seen": 347562505, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.06689453, + "step": 16107, + "time_per_iteration": 3.042938470840454 + }, + { + "auxiliary_loss_clip": 0.01276594, + "auxiliary_loss_mlp": 0.00234989, + "balance_loss_clip": 1.04686487, + "balance_loss_mlp": 0.20752297, + "epoch": 0.96846535397565, + "flos": 22783848520320.0, + "grad_norm": 73.80572342415742, + "language_loss": 0.83229673, + "learning_rate": 1.040291854638875e-08, + "loss": 0.84741253, + "num_input_tokens_seen": 347579150, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.2746582, + "step": 16108, + "time_per_iteration": 2.701876640319824 + }, + { + "auxiliary_loss_clip": 0.01258148, + "auxiliary_loss_mlp": 0.00230521, + "balance_loss_clip": 1.03415668, + "balance_loss_mlp": 0.20428297, + "epoch": 0.968525477228318, + "flos": 23323980309120.0, + "grad_norm": 15.698614204420853, + "language_loss": 0.68577373, + "learning_rate": 1.0363284930729576e-08, + "loss": 0.70066035, + "num_input_tokens_seen": 347596705, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.26196289, + "step": 16109, + "time_per_iteration": 2.6440136432647705 + }, + { + "auxiliary_loss_clip": 0.01080268, + "auxiliary_loss_mlp": 0.00087428, + "balance_loss_clip": 0.94336665, + "balance_loss_mlp": 0.0807521, + "epoch": 0.9685856004809861, + "flos": 67882947707520.0, + "grad_norm": 0.6558160345838183, + "language_loss": 0.53616893, + "learning_rate": 1.0323726762294205e-08, + "loss": 0.54784596, + "num_input_tokens_seen": 347661870, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.06689453, + "step": 16110, + "time_per_iteration": 3.116230010986328 + }, + { + "auxiliary_loss_clip": 0.01268428, + "auxiliary_loss_mlp": 0.00235304, + "balance_loss_clip": 1.04123223, + "balance_loss_mlp": 0.20649149, + "epoch": 0.968645723733654, + "flos": 33947900899200.0, + "grad_norm": 3.4842799127336725, + "language_loss": 0.71407557, + "learning_rate": 1.0284244042582325e-08, + "loss": 0.72911292, + "num_input_tokens_seen": 347684295, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.28833008, + "step": 16111, + "time_per_iteration": 2.806518793106079 + }, + { + "auxiliary_loss_clip": 0.01218812, + "auxiliary_loss_mlp": 0.00232599, + "balance_loss_clip": 1.01250768, + "balance_loss_mlp": 0.20985344, + "epoch": 0.968705846986322, + "flos": 18551488890240.0, + "grad_norm": 17.356067521667402, + "language_loss": 0.81794864, + "learning_rate": 1.024483677309118e-08, + "loss": 0.83246273, + "num_input_tokens_seen": 347702585, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.22753906, + "step": 16112, + "time_per_iteration": 2.636869430541992 + }, + { + "auxiliary_loss_clip": 0.01216439, + "auxiliary_loss_mlp": 0.00206427, + "balance_loss_clip": 1.00825775, + "balance_loss_mlp": 0.18355042, + "epoch": 0.9687659702389899, + "flos": 17420913336960.0, + "grad_norm": 211.05151314804283, + "language_loss": 0.77653462, + "learning_rate": 1.020550495531558e-08, + "loss": 0.79076326, + "num_input_tokens_seen": 347721810, + "router_z_loss_clip": 2.08496094, + "router_z_loss_mlp": 0.2286377, + "step": 16113, + "time_per_iteration": 2.7376270294189453 + }, + { + "auxiliary_loss_clip": 0.01078656, + "auxiliary_loss_mlp": 0.00061109, + "balance_loss_clip": 0.94231987, + "balance_loss_mlp": 0.05495805, + "epoch": 0.9688260934916579, + "flos": 62047176865920.0, + "grad_norm": 0.7907912796422909, + "language_loss": 0.5589307, + "learning_rate": 1.0166248590746329e-08, + "loss": 0.57032835, + "num_input_tokens_seen": 347782330, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.06152344, + "step": 16114, + "time_per_iteration": 3.154376983642578 + }, + { + "auxiliary_loss_clip": 0.012459, + "auxiliary_loss_mlp": 0.00238793, + "balance_loss_clip": 1.02996266, + "balance_loss_mlp": 0.21313956, + "epoch": 0.9688862167443258, + "flos": 15076520461440.0, + "grad_norm": 89.848044004011, + "language_loss": 0.88673425, + "learning_rate": 1.0127067680872458e-08, + "loss": 0.90158117, + "num_input_tokens_seen": 347794835, + "router_z_loss_clip": 2.15917969, + "router_z_loss_mlp": 0.2565918, + "step": 16115, + "time_per_iteration": 2.6489615440368652 + }, + { + "auxiliary_loss_clip": 0.01225309, + "auxiliary_loss_mlp": 0.00217477, + "balance_loss_clip": 1.02159011, + "balance_loss_mlp": 0.19356328, + "epoch": 0.9689463399969939, + "flos": 19938215306880.0, + "grad_norm": 2.2789533367332893, + "language_loss": 0.77319521, + "learning_rate": 1.0087962227179448e-08, + "loss": 0.78762311, + "num_input_tokens_seen": 347814320, + "router_z_loss_clip": 2.03808594, + "router_z_loss_mlp": 0.23913574, + "step": 16116, + "time_per_iteration": 2.7303414344787598 + }, + { + "auxiliary_loss_clip": 0.01257576, + "auxiliary_loss_mlp": 0.00214654, + "balance_loss_clip": 1.03393865, + "balance_loss_mlp": 0.19035889, + "epoch": 0.9690064632496618, + "flos": 19573039687680.0, + "grad_norm": 56.387747263273624, + "language_loss": 0.87247378, + "learning_rate": 1.0048932231150553e-08, + "loss": 0.88719606, + "num_input_tokens_seen": 347832125, + "router_z_loss_clip": 2.23535156, + "router_z_loss_mlp": 0.24279785, + "step": 16117, + "time_per_iteration": 2.709587335586548 + }, + { + "auxiliary_loss_clip": 0.01251106, + "auxiliary_loss_mlp": 0.0021898, + "balance_loss_clip": 1.02801251, + "balance_loss_mlp": 0.19362424, + "epoch": 0.9690665865023298, + "flos": 21872292145920.0, + "grad_norm": 5.122109792731385, + "language_loss": 0.86402804, + "learning_rate": 1.000997769426548e-08, + "loss": 0.87872893, + "num_input_tokens_seen": 347850765, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.25366211, + "step": 16118, + "time_per_iteration": 2.6755597591400146 + }, + { + "auxiliary_loss_clip": 0.01267629, + "auxiliary_loss_mlp": 0.00220592, + "balance_loss_clip": 1.04355097, + "balance_loss_mlp": 0.19561791, + "epoch": 0.9691267097549977, + "flos": 20994491577600.0, + "grad_norm": 8.288463906834535, + "language_loss": 0.84269589, + "learning_rate": 9.971098618001272e-09, + "loss": 0.85757816, + "num_input_tokens_seen": 347870125, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.24975586, + "step": 16119, + "time_per_iteration": 2.6615943908691406 + }, + { + "auxiliary_loss_clip": 0.01218188, + "auxiliary_loss_mlp": 0.00204486, + "balance_loss_clip": 1.01408279, + "balance_loss_mlp": 0.18112087, + "epoch": 0.9691868330076657, + "flos": 24279132816000.0, + "grad_norm": 2469.0820264052363, + "language_loss": 0.82214129, + "learning_rate": 9.932295003832747e-09, + "loss": 0.83636808, + "num_input_tokens_seen": 347890615, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.23364258, + "step": 16120, + "time_per_iteration": 2.7635676860809326 + }, + { + "auxiliary_loss_clip": 0.01254123, + "auxiliary_loss_mlp": 0.00211269, + "balance_loss_clip": 1.03321433, + "balance_loss_mlp": 0.18660414, + "epoch": 0.9692469562603336, + "flos": 17675699483520.0, + "grad_norm": 18.07260513677884, + "language_loss": 0.7875793, + "learning_rate": 9.89356685323095e-09, + "loss": 0.80223322, + "num_input_tokens_seen": 347908685, + "router_z_loss_clip": 2.20996094, + "router_z_loss_mlp": 0.24658203, + "step": 16121, + "time_per_iteration": 2.6621673107147217 + }, + { + "auxiliary_loss_clip": 0.01233041, + "auxiliary_loss_mlp": 0.00219676, + "balance_loss_clip": 1.01579773, + "balance_loss_mlp": 0.19576298, + "epoch": 0.9693070795130017, + "flos": 26834392483200.0, + "grad_norm": 11.87878769602746, + "language_loss": 0.78171158, + "learning_rate": 9.854914167664486e-09, + "loss": 0.79623878, + "num_input_tokens_seen": 347926385, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.23913574, + "step": 16122, + "time_per_iteration": 2.7750794887542725 + }, + { + "auxiliary_loss_clip": 0.01242132, + "auxiliary_loss_mlp": 0.00224186, + "balance_loss_clip": 1.02266312, + "balance_loss_mlp": 0.19840136, + "epoch": 0.9693672027656697, + "flos": 18077288515200.0, + "grad_norm": 13.529469625731599, + "language_loss": 0.86865222, + "learning_rate": 9.81633694859907e-09, + "loss": 0.88331544, + "num_input_tokens_seen": 347945290, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.25756836, + "step": 16123, + "time_per_iteration": 2.6430623531341553 + }, + { + "auxiliary_loss_clip": 0.01237606, + "auxiliary_loss_mlp": 0.00237776, + "balance_loss_clip": 1.02049685, + "balance_loss_mlp": 0.21263444, + "epoch": 0.9694273260183376, + "flos": 21763015994880.0, + "grad_norm": 10.248951420221792, + "language_loss": 0.79751807, + "learning_rate": 9.777835197497753e-09, + "loss": 0.81227189, + "num_input_tokens_seen": 347966330, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.25146484, + "step": 16124, + "time_per_iteration": 2.7112903594970703 + }, + { + "auxiliary_loss_clip": 0.01243099, + "auxiliary_loss_mlp": 0.00228569, + "balance_loss_clip": 1.03272748, + "balance_loss_mlp": 0.20513274, + "epoch": 0.9694874492710056, + "flos": 24426115269120.0, + "grad_norm": 7.377086402552472, + "language_loss": 0.8091563, + "learning_rate": 9.739408915820258e-09, + "loss": 0.82387292, + "num_input_tokens_seen": 347982590, + "router_z_loss_clip": 2.10351562, + "router_z_loss_mlp": 0.23425293, + "step": 16125, + "time_per_iteration": 2.7148406505584717 + }, + { + "auxiliary_loss_clip": 0.01080609, + "auxiliary_loss_mlp": 0.00071942, + "balance_loss_clip": 0.94335878, + "balance_loss_mlp": 0.06607695, + "epoch": 0.9695475725236735, + "flos": 67650748237440.0, + "grad_norm": 0.8720065858694876, + "language_loss": 0.614043, + "learning_rate": 9.70105810502364e-09, + "loss": 0.62556851, + "num_input_tokens_seen": 348043310, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.05859375, + "step": 16126, + "time_per_iteration": 3.194054365158081 + }, + { + "auxiliary_loss_clip": 0.01223991, + "auxiliary_loss_mlp": 0.00193215, + "balance_loss_clip": 1.01540494, + "balance_loss_mlp": 0.17110123, + "epoch": 0.9696076957763415, + "flos": 19129326981120.0, + "grad_norm": 144.57010919841505, + "language_loss": 0.82249486, + "learning_rate": 9.662782766562738e-09, + "loss": 0.83666688, + "num_input_tokens_seen": 348062200, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.22119141, + "step": 16127, + "time_per_iteration": 2.6848373413085938 + }, + { + "auxiliary_loss_clip": 0.01258241, + "auxiliary_loss_mlp": 0.00225474, + "balance_loss_clip": 1.03138983, + "balance_loss_mlp": 0.19715011, + "epoch": 0.9696678190290094, + "flos": 15486836497920.0, + "grad_norm": 34.14115045495221, + "language_loss": 0.77493697, + "learning_rate": 9.62458290188839e-09, + "loss": 0.78977406, + "num_input_tokens_seen": 348080685, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.2833252, + "step": 16128, + "time_per_iteration": 2.7133736610412598 + }, + { + "auxiliary_loss_clip": 0.01243167, + "auxiliary_loss_mlp": 0.00221751, + "balance_loss_clip": 1.02712631, + "balance_loss_mlp": 0.19814759, + "epoch": 0.9697279422816775, + "flos": 36208692869760.0, + "grad_norm": 35.57161310597105, + "language_loss": 0.70640939, + "learning_rate": 9.586458512449213e-09, + "loss": 0.72105861, + "num_input_tokens_seen": 348102500, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.23620605, + "step": 16129, + "time_per_iteration": 2.811979055404663 + }, + { + "auxiliary_loss_clip": 0.01264735, + "auxiliary_loss_mlp": 0.00224374, + "balance_loss_clip": 1.04146671, + "balance_loss_mlp": 0.1979091, + "epoch": 0.9697880655343454, + "flos": 25484007651840.0, + "grad_norm": 8.76165874049283, + "language_loss": 0.7428351, + "learning_rate": 9.548409599691166e-09, + "loss": 0.75772619, + "num_input_tokens_seen": 348122515, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.26464844, + "step": 16130, + "time_per_iteration": 2.7834250926971436 + }, + { + "auxiliary_loss_clip": 0.01260944, + "auxiliary_loss_mlp": 0.00235311, + "balance_loss_clip": 1.0343138, + "balance_loss_mlp": 0.20722538, + "epoch": 0.9698481887870134, + "flos": 15333533251200.0, + "grad_norm": 47.21777294756925, + "language_loss": 0.82213926, + "learning_rate": 9.510436165056867e-09, + "loss": 0.83710182, + "num_input_tokens_seen": 348138775, + "router_z_loss_clip": 2.265625, + "router_z_loss_mlp": 0.28076172, + "step": 16131, + "time_per_iteration": 2.692765951156616 + }, + { + "auxiliary_loss_clip": 0.0125186, + "auxiliary_loss_mlp": 0.00242891, + "balance_loss_clip": 1.03289044, + "balance_loss_mlp": 0.21677238, + "epoch": 0.9699083120396813, + "flos": 21982250655360.0, + "grad_norm": 150.89689424080146, + "language_loss": 0.84399819, + "learning_rate": 9.472538209986058e-09, + "loss": 0.85894573, + "num_input_tokens_seen": 348157115, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.26135254, + "step": 16132, + "time_per_iteration": 2.6768553256988525 + }, + { + "auxiliary_loss_clip": 0.01251638, + "auxiliary_loss_mlp": 0.00232319, + "balance_loss_clip": 1.02947271, + "balance_loss_mlp": 0.20468658, + "epoch": 0.9699684352923493, + "flos": 15664055224320.0, + "grad_norm": 13.97366293206911, + "language_loss": 0.87451875, + "learning_rate": 9.434715735916477e-09, + "loss": 0.88935828, + "num_input_tokens_seen": 348173035, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.27624512, + "step": 16133, + "time_per_iteration": 2.7264420986175537 + }, + { + "auxiliary_loss_clip": 0.01234135, + "auxiliary_loss_mlp": 0.00210453, + "balance_loss_clip": 1.02178681, + "balance_loss_mlp": 0.1870876, + "epoch": 0.9700285585450172, + "flos": 21908382336000.0, + "grad_norm": 181.75962266216848, + "language_loss": 0.7301681, + "learning_rate": 9.396968744281863e-09, + "loss": 0.74461401, + "num_input_tokens_seen": 348192960, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.23352051, + "step": 16134, + "time_per_iteration": 2.6613759994506836 + }, + { + "auxiliary_loss_clip": 0.01239777, + "auxiliary_loss_mlp": 0.00222467, + "balance_loss_clip": 1.02300024, + "balance_loss_mlp": 0.19681363, + "epoch": 0.9700886817976853, + "flos": 23914890950400.0, + "grad_norm": 70.87027077835961, + "language_loss": 0.90880895, + "learning_rate": 9.359297236513519e-09, + "loss": 0.92343134, + "num_input_tokens_seen": 348212805, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.2565918, + "step": 16135, + "time_per_iteration": 2.685509204864502 + }, + { + "auxiliary_loss_clip": 0.01253677, + "auxiliary_loss_mlp": 0.00233731, + "balance_loss_clip": 1.03474391, + "balance_loss_mlp": 0.20639601, + "epoch": 0.9701488050503532, + "flos": 25447845634560.0, + "grad_norm": 1177.6285166939435, + "language_loss": 0.79777145, + "learning_rate": 9.321701214040079e-09, + "loss": 0.81264555, + "num_input_tokens_seen": 348232900, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.27355957, + "step": 16136, + "time_per_iteration": 2.672851324081421 + }, + { + "auxiliary_loss_clip": 0.01234455, + "auxiliary_loss_mlp": 0.0021769, + "balance_loss_clip": 1.01886761, + "balance_loss_mlp": 0.19290632, + "epoch": 0.9702089283030212, + "flos": 20590855470720.0, + "grad_norm": 192.44384770132842, + "language_loss": 0.82551211, + "learning_rate": 9.28418067828729e-09, + "loss": 0.84003353, + "num_input_tokens_seen": 348253065, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.24768066, + "step": 16137, + "time_per_iteration": 4.075519323348999 + }, + { + "auxiliary_loss_clip": 0.01078142, + "auxiliary_loss_mlp": 0.00068368, + "balance_loss_clip": 0.94221455, + "balance_loss_mlp": 0.06221697, + "epoch": 0.9702690515556892, + "flos": 70651516291200.0, + "grad_norm": 1.3490824488432207, + "language_loss": 0.54268032, + "learning_rate": 9.246735630678015e-09, + "loss": 0.5541454, + "num_input_tokens_seen": 348316075, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.0612793, + "step": 16138, + "time_per_iteration": 3.257861852645874 + }, + { + "auxiliary_loss_clip": 0.01240679, + "auxiliary_loss_mlp": 0.00231803, + "balance_loss_clip": 1.02357888, + "balance_loss_mlp": 0.20638794, + "epoch": 0.9703291748083571, + "flos": 35881439034240.0, + "grad_norm": 3.1882888792733466, + "language_loss": 0.78377271, + "learning_rate": 9.209366072632007e-09, + "loss": 0.79849756, + "num_input_tokens_seen": 348337605, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.25402832, + "step": 16139, + "time_per_iteration": 4.29990553855896 + }, + { + "auxiliary_loss_clip": 0.01248983, + "auxiliary_loss_mlp": 0.00238955, + "balance_loss_clip": 1.03197443, + "balance_loss_mlp": 0.21455257, + "epoch": 0.9703892980610251, + "flos": 24316479982080.0, + "grad_norm": 18.37017949837744, + "language_loss": 0.79655123, + "learning_rate": 9.172072005566134e-09, + "loss": 0.81143057, + "num_input_tokens_seen": 348359430, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.24401855, + "step": 16140, + "time_per_iteration": 2.8178303241729736 + }, + { + "auxiliary_loss_clip": 0.01264005, + "auxiliary_loss_mlp": 0.00243737, + "balance_loss_clip": 1.03666019, + "balance_loss_mlp": 0.21745145, + "epoch": 0.970449421313693, + "flos": 18003743418240.0, + "grad_norm": 29.671142634656846, + "language_loss": 0.78326499, + "learning_rate": 9.13485343089504e-09, + "loss": 0.79834247, + "num_input_tokens_seen": 348377890, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.26269531, + "step": 16141, + "time_per_iteration": 2.720022439956665 + }, + { + "auxiliary_loss_clip": 0.01235025, + "auxiliary_loss_mlp": 0.00215588, + "balance_loss_clip": 1.02289987, + "balance_loss_mlp": 0.19302157, + "epoch": 0.9705095445663611, + "flos": 25337994865920.0, + "grad_norm": 34.986783928083504, + "language_loss": 0.77097392, + "learning_rate": 9.097710350029597e-09, + "loss": 0.78548002, + "num_input_tokens_seen": 348396550, + "router_z_loss_clip": 2.12011719, + "router_z_loss_mlp": 0.22558594, + "step": 16142, + "time_per_iteration": 2.736443281173706 + }, + { + "auxiliary_loss_clip": 0.01225813, + "auxiliary_loss_mlp": 0.00220516, + "balance_loss_clip": 1.01228178, + "balance_loss_mlp": 0.19591075, + "epoch": 0.970569667819029, + "flos": 26833602384000.0, + "grad_norm": 19.24213589651548, + "language_loss": 0.63621467, + "learning_rate": 9.060642764378457e-09, + "loss": 0.65067792, + "num_input_tokens_seen": 348417120, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.24609375, + "step": 16143, + "time_per_iteration": 2.770338773727417 + }, + { + "auxiliary_loss_clip": 0.01241893, + "auxiliary_loss_mlp": 0.00225787, + "balance_loss_clip": 1.02787375, + "balance_loss_mlp": 0.20148018, + "epoch": 0.970629791071697, + "flos": 25848644567040.0, + "grad_norm": 3.6916547797739354, + "language_loss": 0.78038508, + "learning_rate": 9.023650675347382e-09, + "loss": 0.79506189, + "num_input_tokens_seen": 348437750, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.24291992, + "step": 16144, + "time_per_iteration": 2.726402759552002 + }, + { + "auxiliary_loss_clip": 0.0123442, + "auxiliary_loss_mlp": 0.00224021, + "balance_loss_clip": 1.02225304, + "balance_loss_mlp": 0.19938016, + "epoch": 0.9706899143243649, + "flos": 36540184510080.0, + "grad_norm": 4.532033751261966, + "language_loss": 0.78374505, + "learning_rate": 8.986734084339253e-09, + "loss": 0.79832947, + "num_input_tokens_seen": 348460935, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.24633789, + "step": 16145, + "time_per_iteration": 2.8008430004119873 + }, + { + "auxiliary_loss_clip": 0.01240872, + "auxiliary_loss_mlp": 0.00219732, + "balance_loss_clip": 1.02254915, + "balance_loss_mlp": 0.19364899, + "epoch": 0.9707500375770329, + "flos": 12268234414080.0, + "grad_norm": 91.54954527634001, + "language_loss": 0.89441991, + "learning_rate": 8.949892992753395e-09, + "loss": 0.90902591, + "num_input_tokens_seen": 348474480, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.26062012, + "step": 16146, + "time_per_iteration": 2.6258480548858643 + }, + { + "auxiliary_loss_clip": 0.01080837, + "auxiliary_loss_mlp": 0.00067316, + "balance_loss_clip": 0.94352353, + "balance_loss_mlp": 0.06073585, + "epoch": 0.9708101608297008, + "flos": 60853040196480.0, + "grad_norm": 0.7432567410949653, + "language_loss": 0.53896737, + "learning_rate": 8.91312740198713e-09, + "loss": 0.55044889, + "num_input_tokens_seen": 348541220, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.06591797, + "step": 16147, + "time_per_iteration": 4.592820167541504 + }, + { + "auxiliary_loss_clip": 0.01247352, + "auxiliary_loss_mlp": 0.00229051, + "balance_loss_clip": 1.02795458, + "balance_loss_mlp": 0.20281324, + "epoch": 0.9708702840823689, + "flos": 27124766029440.0, + "grad_norm": 9.600658139443269, + "language_loss": 0.70865583, + "learning_rate": 8.876437313434682e-09, + "loss": 0.72341985, + "num_input_tokens_seen": 348559230, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.2623291, + "step": 16148, + "time_per_iteration": 4.1794867515563965 + }, + { + "auxiliary_loss_clip": 0.01240603, + "auxiliary_loss_mlp": 0.00239185, + "balance_loss_clip": 1.02641475, + "balance_loss_mlp": 0.21294671, + "epoch": 0.9709304073350368, + "flos": 20777699041920.0, + "grad_norm": 205.34246988543683, + "language_loss": 0.8147409, + "learning_rate": 8.839822728487155e-09, + "loss": 0.82953882, + "num_input_tokens_seen": 348577850, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.26220703, + "step": 16149, + "time_per_iteration": 2.682129383087158 + }, + { + "auxiliary_loss_clip": 0.01225253, + "auxiliary_loss_mlp": 0.00236385, + "balance_loss_clip": 1.01515365, + "balance_loss_mlp": 0.21257904, + "epoch": 0.9709905305877048, + "flos": 41934541115520.0, + "grad_norm": 64.46422473559372, + "language_loss": 0.85759497, + "learning_rate": 8.803283648533222e-09, + "loss": 0.87221128, + "num_input_tokens_seen": 348598345, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.23815918, + "step": 16150, + "time_per_iteration": 2.827500343322754 + }, + { + "auxiliary_loss_clip": 0.01266345, + "auxiliary_loss_mlp": 0.00260517, + "balance_loss_clip": 1.04052186, + "balance_loss_mlp": 0.23359931, + "epoch": 0.9710506538403728, + "flos": 17165588486400.0, + "grad_norm": 37.19044690109298, + "language_loss": 0.82765192, + "learning_rate": 8.766820074958214e-09, + "loss": 0.84292054, + "num_input_tokens_seen": 348616300, + "router_z_loss_clip": 2.25488281, + "router_z_loss_mlp": 0.2689209, + "step": 16151, + "time_per_iteration": 2.69598650932312 + }, + { + "auxiliary_loss_clip": 0.01237168, + "auxiliary_loss_mlp": 0.00227746, + "balance_loss_clip": 1.02139425, + "balance_loss_mlp": 0.20287867, + "epoch": 0.9711107770930407, + "flos": 21173470070400.0, + "grad_norm": 41.044402825625895, + "language_loss": 0.81737089, + "learning_rate": 8.730432009145027e-09, + "loss": 0.83201998, + "num_input_tokens_seen": 348633845, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.24902344, + "step": 16152, + "time_per_iteration": 2.698352575302124 + }, + { + "auxiliary_loss_clip": 0.01239556, + "auxiliary_loss_mlp": 0.00210185, + "balance_loss_clip": 1.02617252, + "balance_loss_mlp": 0.18580648, + "epoch": 0.9711709003457087, + "flos": 22237072715520.0, + "grad_norm": 17.490892697082142, + "language_loss": 0.76405615, + "learning_rate": 8.694119452473448e-09, + "loss": 0.77855355, + "num_input_tokens_seen": 348653070, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.24353027, + "step": 16153, + "time_per_iteration": 2.723904848098755 + }, + { + "auxiliary_loss_clip": 0.01235683, + "auxiliary_loss_mlp": 0.00232406, + "balance_loss_clip": 1.02232003, + "balance_loss_mlp": 0.20882687, + "epoch": 0.9712310235983767, + "flos": 26213856099840.0, + "grad_norm": 10.236122073091812, + "language_loss": 0.78827095, + "learning_rate": 8.65788240632037e-09, + "loss": 0.80295187, + "num_input_tokens_seen": 348672145, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.23596191, + "step": 16154, + "time_per_iteration": 2.7038395404815674 + }, + { + "auxiliary_loss_clip": 0.01254454, + "auxiliary_loss_mlp": 0.00216314, + "balance_loss_clip": 1.02963018, + "balance_loss_mlp": 0.18943188, + "epoch": 0.9712911468510447, + "flos": 20668171495680.0, + "grad_norm": 6.90298909560916, + "language_loss": 0.89571005, + "learning_rate": 8.621720872059812e-09, + "loss": 0.9104178, + "num_input_tokens_seen": 348690615, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.2689209, + "step": 16155, + "time_per_iteration": 2.6977810859680176 + }, + { + "auxiliary_loss_clip": 0.0124701, + "auxiliary_loss_mlp": 0.00222661, + "balance_loss_clip": 1.02359617, + "balance_loss_mlp": 0.19473039, + "epoch": 0.9713512701037126, + "flos": 13552903313280.0, + "grad_norm": 15.777199612942702, + "language_loss": 0.8013922, + "learning_rate": 8.58563485106334e-09, + "loss": 0.81608886, + "num_input_tokens_seen": 348708665, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.27905273, + "step": 16156, + "time_per_iteration": 2.7109899520874023 + }, + { + "auxiliary_loss_clip": 0.01249901, + "auxiliary_loss_mlp": 0.00238973, + "balance_loss_clip": 1.02684224, + "balance_loss_mlp": 0.21300924, + "epoch": 0.9714113933563806, + "flos": 25848752307840.0, + "grad_norm": 232.3388311567096, + "language_loss": 1.01259995, + "learning_rate": 8.54962434469919e-09, + "loss": 1.02748871, + "num_input_tokens_seen": 348726105, + "router_z_loss_clip": 2.23144531, + "router_z_loss_mlp": 0.25964355, + "step": 16157, + "time_per_iteration": 2.688114881515503 + }, + { + "auxiliary_loss_clip": 0.01236267, + "auxiliary_loss_mlp": 0.00223658, + "balance_loss_clip": 1.02211106, + "balance_loss_mlp": 0.19825462, + "epoch": 0.9714715166090485, + "flos": 12743081233920.0, + "grad_norm": 9.730551903531165, + "language_loss": 0.80624837, + "learning_rate": 8.513689354332721e-09, + "loss": 0.82084763, + "num_input_tokens_seen": 348743360, + "router_z_loss_clip": 2.14160156, + "router_z_loss_mlp": 0.25378418, + "step": 16158, + "time_per_iteration": 2.6883795261383057 + }, + { + "auxiliary_loss_clip": 0.01234582, + "auxiliary_loss_mlp": 0.0021457, + "balance_loss_clip": 1.02396512, + "balance_loss_mlp": 0.19158642, + "epoch": 0.9715316398617165, + "flos": 18405547931520.0, + "grad_norm": 81.52111117810479, + "language_loss": 0.6791296, + "learning_rate": 8.477829881326836e-09, + "loss": 0.6936211, + "num_input_tokens_seen": 348759045, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.23010254, + "step": 16159, + "time_per_iteration": 2.6136252880096436 + }, + { + "auxiliary_loss_clip": 0.01221756, + "auxiliary_loss_mlp": 0.00215913, + "balance_loss_clip": 1.01616788, + "balance_loss_mlp": 0.19247648, + "epoch": 0.9715917631143844, + "flos": 28913799749760.0, + "grad_norm": 33812.07552406592, + "language_loss": 0.86918032, + "learning_rate": 8.44204592704112e-09, + "loss": 0.88355702, + "num_input_tokens_seen": 348779910, + "router_z_loss_clip": 2.05273438, + "router_z_loss_mlp": 0.234375, + "step": 16160, + "time_per_iteration": 2.7283012866973877 + }, + { + "auxiliary_loss_clip": 0.0108412, + "auxiliary_loss_mlp": 0.00079899, + "balance_loss_clip": 0.94620144, + "balance_loss_mlp": 0.07312768, + "epoch": 0.9716518863670525, + "flos": 65939712900480.0, + "grad_norm": 0.7613475340704952, + "language_loss": 0.53800535, + "learning_rate": 8.406337492832704e-09, + "loss": 0.54964554, + "num_input_tokens_seen": 348838995, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.06787109, + "step": 16161, + "time_per_iteration": 3.194183826446533 + }, + { + "auxiliary_loss_clip": 0.01244118, + "auxiliary_loss_mlp": 0.00196578, + "balance_loss_clip": 1.02702737, + "balance_loss_mlp": 0.17073366, + "epoch": 0.9717120096197204, + "flos": 17712759340800.0, + "grad_norm": 6.982070291968716, + "language_loss": 0.80206621, + "learning_rate": 8.3707045800554e-09, + "loss": 0.81647325, + "num_input_tokens_seen": 348858090, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.25854492, + "step": 16162, + "time_per_iteration": 2.671966791152954 + }, + { + "auxiliary_loss_clip": 0.01220877, + "auxiliary_loss_mlp": 0.00213328, + "balance_loss_clip": 1.0107528, + "balance_loss_mlp": 0.18868747, + "epoch": 0.9717721328723884, + "flos": 24463426521600.0, + "grad_norm": 368.37018411811397, + "language_loss": 0.86874235, + "learning_rate": 8.335147190060787e-09, + "loss": 0.88308436, + "num_input_tokens_seen": 348877885, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.24645996, + "step": 16163, + "time_per_iteration": 2.6728408336639404 + }, + { + "auxiliary_loss_clip": 0.0122648, + "auxiliary_loss_mlp": 0.00226789, + "balance_loss_clip": 1.01667655, + "balance_loss_mlp": 0.20385271, + "epoch": 0.9718322561250564, + "flos": 20776477979520.0, + "grad_norm": 51.6533614962847, + "language_loss": 0.80211997, + "learning_rate": 8.299665324196903e-09, + "loss": 0.81665266, + "num_input_tokens_seen": 348897720, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.22937012, + "step": 16164, + "time_per_iteration": 2.6946167945861816 + }, + { + "auxiliary_loss_clip": 0.01255407, + "auxiliary_loss_mlp": 0.00228748, + "balance_loss_clip": 1.03621793, + "balance_loss_mlp": 0.20212862, + "epoch": 0.9718923793777243, + "flos": 19025904746880.0, + "grad_norm": 4.367374204647696, + "language_loss": 0.93567157, + "learning_rate": 8.264258983809114e-09, + "loss": 0.95051312, + "num_input_tokens_seen": 348915410, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.26599121, + "step": 16165, + "time_per_iteration": 2.6646156311035156 + }, + { + "auxiliary_loss_clip": 0.01234666, + "auxiliary_loss_mlp": 0.00209051, + "balance_loss_clip": 1.01787448, + "balance_loss_mlp": 0.18710414, + "epoch": 0.9719525026303923, + "flos": 21871717528320.0, + "grad_norm": 8.957334054696027, + "language_loss": 0.86505729, + "learning_rate": 8.228928170240345e-09, + "loss": 0.87949443, + "num_input_tokens_seen": 348934335, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.21948242, + "step": 16166, + "time_per_iteration": 2.6677448749542236 + }, + { + "auxiliary_loss_clip": 0.01241484, + "auxiliary_loss_mlp": 0.00210096, + "balance_loss_clip": 1.02509177, + "balance_loss_mlp": 0.1864689, + "epoch": 0.9720126258830603, + "flos": 14429303251200.0, + "grad_norm": 2166.687489652843, + "language_loss": 0.78464788, + "learning_rate": 8.193672884830195e-09, + "loss": 0.7991637, + "num_input_tokens_seen": 348952405, + "router_z_loss_clip": 2.16308594, + "router_z_loss_mlp": 0.23632812, + "step": 16167, + "time_per_iteration": 2.6740047931671143 + }, + { + "auxiliary_loss_clip": 0.01231765, + "auxiliary_loss_mlp": 0.00247937, + "balance_loss_clip": 1.01965153, + "balance_loss_mlp": 0.22316484, + "epoch": 0.9720727491357283, + "flos": 26251167352320.0, + "grad_norm": 7.151812035628794, + "language_loss": 0.81795907, + "learning_rate": 8.158493128915812e-09, + "loss": 0.83275604, + "num_input_tokens_seen": 348973580, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.2479248, + "step": 16168, + "time_per_iteration": 2.7491254806518555 + }, + { + "auxiliary_loss_clip": 0.0124733, + "auxiliary_loss_mlp": 0.00217115, + "balance_loss_clip": 1.02703047, + "balance_loss_mlp": 0.18980427, + "epoch": 0.9721328723883962, + "flos": 22674105492480.0, + "grad_norm": 7.349375676582137, + "language_loss": 0.85337591, + "learning_rate": 8.123388903830797e-09, + "loss": 0.8680203, + "num_input_tokens_seen": 348992035, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.27319336, + "step": 16169, + "time_per_iteration": 2.7156646251678467 + }, + { + "auxiliary_loss_clip": 0.01259349, + "auxiliary_loss_mlp": 0.0024774, + "balance_loss_clip": 1.03627825, + "balance_loss_mlp": 0.22010753, + "epoch": 0.9721929956410642, + "flos": 28074172360320.0, + "grad_norm": 10.223338073570817, + "language_loss": 0.6727283, + "learning_rate": 8.088360210906309e-09, + "loss": 0.68779916, + "num_input_tokens_seen": 349013160, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.27636719, + "step": 16170, + "time_per_iteration": 2.7602391242980957 + }, + { + "auxiliary_loss_clip": 0.01236262, + "auxiliary_loss_mlp": 0.00209421, + "balance_loss_clip": 1.01808178, + "balance_loss_mlp": 0.18512601, + "epoch": 0.9722531188937321, + "flos": 20996251344000.0, + "grad_norm": 1972.6106664433446, + "language_loss": 0.79689497, + "learning_rate": 8.053407051471062e-09, + "loss": 0.81135178, + "num_input_tokens_seen": 349033485, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.24316406, + "step": 16171, + "time_per_iteration": 2.7074804306030273 + }, + { + "auxiliary_loss_clip": 0.01240606, + "auxiliary_loss_mlp": 0.00214624, + "balance_loss_clip": 1.01697755, + "balance_loss_mlp": 0.18976936, + "epoch": 0.9723132421464001, + "flos": 16070600332800.0, + "grad_norm": 38.61372859961805, + "language_loss": 0.76917857, + "learning_rate": 8.018529426850218e-09, + "loss": 0.78373092, + "num_input_tokens_seen": 349051705, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.2487793, + "step": 16172, + "time_per_iteration": 2.7399237155914307 + }, + { + "auxiliary_loss_clip": 0.01232322, + "auxiliary_loss_mlp": 0.00195862, + "balance_loss_clip": 1.02010179, + "balance_loss_mlp": 0.17054188, + "epoch": 0.972373365399068, + "flos": 27745769289600.0, + "grad_norm": 8.016700891334938, + "language_loss": 0.93527317, + "learning_rate": 7.983727338366274e-09, + "loss": 0.94955504, + "num_input_tokens_seen": 349070825, + "router_z_loss_clip": 2.12207031, + "router_z_loss_mlp": 0.25305176, + "step": 16173, + "time_per_iteration": 2.7563977241516113 + }, + { + "auxiliary_loss_clip": 0.01258625, + "auxiliary_loss_mlp": 0.00254419, + "balance_loss_clip": 1.03185296, + "balance_loss_mlp": 0.22721532, + "epoch": 0.9724334886517361, + "flos": 23002939526400.0, + "grad_norm": 15.489471920503966, + "language_loss": 0.75836974, + "learning_rate": 7.949000787339289e-09, + "loss": 0.7735002, + "num_input_tokens_seen": 349089730, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.27185059, + "step": 16174, + "time_per_iteration": 2.657973527908325 + }, + { + "auxiliary_loss_clip": 0.01237604, + "auxiliary_loss_mlp": 0.00230455, + "balance_loss_clip": 1.02613235, + "balance_loss_mlp": 0.20467016, + "epoch": 0.972493611904404, + "flos": 25447055535360.0, + "grad_norm": 24.64826117803069, + "language_loss": 0.83066183, + "learning_rate": 7.914349775085538e-09, + "loss": 0.8453424, + "num_input_tokens_seen": 349111315, + "router_z_loss_clip": 2.11621094, + "router_z_loss_mlp": 0.25769043, + "step": 16175, + "time_per_iteration": 2.7292118072509766 + }, + { + "auxiliary_loss_clip": 0.01238017, + "auxiliary_loss_mlp": 0.00217031, + "balance_loss_clip": 1.02247787, + "balance_loss_mlp": 0.19169861, + "epoch": 0.972553735157072, + "flos": 16983054547200.0, + "grad_norm": 72724.42511937155, + "language_loss": 0.70742083, + "learning_rate": 7.879774302919307e-09, + "loss": 0.72197127, + "num_input_tokens_seen": 349129495, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.25317383, + "step": 16176, + "time_per_iteration": 2.625964641571045 + }, + { + "auxiliary_loss_clip": 0.01247369, + "auxiliary_loss_mlp": 0.00211207, + "balance_loss_clip": 1.03487349, + "balance_loss_mlp": 0.1864592, + "epoch": 0.97261385840974, + "flos": 26104651776000.0, + "grad_norm": 14.995685107146182, + "language_loss": 0.82433259, + "learning_rate": 7.845274372151545e-09, + "loss": 0.83891833, + "num_input_tokens_seen": 349148850, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.24755859, + "step": 16177, + "time_per_iteration": 2.685791015625 + }, + { + "auxiliary_loss_clip": 0.01222909, + "auxiliary_loss_mlp": 0.00211821, + "balance_loss_clip": 1.01164556, + "balance_loss_mlp": 0.18707332, + "epoch": 0.9726739816624079, + "flos": 25447881548160.0, + "grad_norm": 60.14388732392215, + "language_loss": 0.76493841, + "learning_rate": 7.810849984090984e-09, + "loss": 0.77928573, + "num_input_tokens_seen": 349167620, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.24755859, + "step": 16178, + "time_per_iteration": 2.7299063205718994 + }, + { + "auxiliary_loss_clip": 0.01255829, + "auxiliary_loss_mlp": 0.00212962, + "balance_loss_clip": 1.03445256, + "balance_loss_mlp": 0.18720068, + "epoch": 0.972734104915076, + "flos": 29014923513600.0, + "grad_norm": 154.81021133459683, + "language_loss": 0.78787154, + "learning_rate": 7.776501140042358e-09, + "loss": 0.80255949, + "num_input_tokens_seen": 349185845, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.25744629, + "step": 16179, + "time_per_iteration": 2.7671890258789062 + }, + { + "auxiliary_loss_clip": 0.01221544, + "auxiliary_loss_mlp": 0.00239817, + "balance_loss_clip": 1.01183665, + "balance_loss_mlp": 0.21564104, + "epoch": 0.9727942281677439, + "flos": 23437637919360.0, + "grad_norm": 1186.0420506668936, + "language_loss": 0.83634329, + "learning_rate": 7.742227841308624e-09, + "loss": 0.85095692, + "num_input_tokens_seen": 349204525, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.24169922, + "step": 16180, + "time_per_iteration": 4.115322828292847 + }, + { + "auxiliary_loss_clip": 0.01259639, + "auxiliary_loss_mlp": 0.00218757, + "balance_loss_clip": 1.0353719, + "balance_loss_mlp": 0.19241145, + "epoch": 0.9728543514204119, + "flos": 31724599749120.0, + "grad_norm": 36.53420727656358, + "language_loss": 0.8421303, + "learning_rate": 7.708030089189188e-09, + "loss": 0.85691428, + "num_input_tokens_seen": 349228075, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.26342773, + "step": 16181, + "time_per_iteration": 4.219792127609253 + }, + { + "auxiliary_loss_clip": 0.01225926, + "auxiliary_loss_mlp": 0.00214165, + "balance_loss_clip": 1.01842844, + "balance_loss_mlp": 0.19118121, + "epoch": 0.9729144746730798, + "flos": 16289368116480.0, + "grad_norm": 6.0717880781844675, + "language_loss": 0.72154367, + "learning_rate": 7.67390788498079e-09, + "loss": 0.73594463, + "num_input_tokens_seen": 349246990, + "router_z_loss_clip": 2.07617188, + "router_z_loss_mlp": 0.2298584, + "step": 16182, + "time_per_iteration": 2.625229597091675 + }, + { + "auxiliary_loss_clip": 0.01243928, + "auxiliary_loss_mlp": 0.00229666, + "balance_loss_clip": 1.0245353, + "balance_loss_mlp": 0.20506072, + "epoch": 0.9729745979257478, + "flos": 25041408266880.0, + "grad_norm": 3.4798191984159295, + "language_loss": 0.71040291, + "learning_rate": 7.639861229977507e-09, + "loss": 0.72513884, + "num_input_tokens_seen": 349265890, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.24609375, + "step": 16183, + "time_per_iteration": 2.72348952293396 + }, + { + "auxiliary_loss_clip": 0.01234486, + "auxiliary_loss_mlp": 0.00215357, + "balance_loss_clip": 1.02252817, + "balance_loss_mlp": 0.19002491, + "epoch": 0.9730347211784157, + "flos": 22638733574400.0, + "grad_norm": 41.992670404513156, + "language_loss": 0.84214449, + "learning_rate": 7.605890125470527e-09, + "loss": 0.85664284, + "num_input_tokens_seen": 349285275, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.25317383, + "step": 16184, + "time_per_iteration": 2.657150983810425 + }, + { + "auxiliary_loss_clip": 0.01226817, + "auxiliary_loss_mlp": 0.00218618, + "balance_loss_clip": 1.01556838, + "balance_loss_mlp": 0.19556344, + "epoch": 0.9730948444310837, + "flos": 10998613313280.0, + "grad_norm": 333.62272018975904, + "language_loss": 0.90088528, + "learning_rate": 7.571994572747709e-09, + "loss": 0.91533959, + "num_input_tokens_seen": 349301515, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.23071289, + "step": 16185, + "time_per_iteration": 2.6689112186431885 + }, + { + "auxiliary_loss_clip": 0.01236555, + "auxiliary_loss_mlp": 0.00243423, + "balance_loss_clip": 1.02673626, + "balance_loss_mlp": 0.2191759, + "epoch": 0.9731549676837516, + "flos": 16799479113600.0, + "grad_norm": 9.545509518770686, + "language_loss": 0.86543292, + "learning_rate": 7.538174573094469e-09, + "loss": 0.88023269, + "num_input_tokens_seen": 349319590, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.24243164, + "step": 16186, + "time_per_iteration": 2.842618227005005 + }, + { + "auxiliary_loss_clip": 0.01225718, + "auxiliary_loss_mlp": 0.00214563, + "balance_loss_clip": 1.01651073, + "balance_loss_mlp": 0.19058996, + "epoch": 0.9732150909364197, + "flos": 21141761339520.0, + "grad_norm": 10.16871007870013, + "language_loss": 0.73288918, + "learning_rate": 7.504430127793337e-09, + "loss": 0.74729204, + "num_input_tokens_seen": 349339230, + "router_z_loss_clip": 2.08984375, + "router_z_loss_mlp": 0.23974609, + "step": 16187, + "time_per_iteration": 2.6906025409698486 + }, + { + "auxiliary_loss_clip": 0.01224174, + "auxiliary_loss_mlp": 0.00228449, + "balance_loss_clip": 1.01837254, + "balance_loss_mlp": 0.205704, + "epoch": 0.9732752141890876, + "flos": 33727337435520.0, + "grad_norm": 98.99851540636057, + "language_loss": 0.86266941, + "learning_rate": 7.47076123812418e-09, + "loss": 0.87719566, + "num_input_tokens_seen": 349361155, + "router_z_loss_clip": 2.05761719, + "router_z_loss_mlp": 0.22766113, + "step": 16188, + "time_per_iteration": 2.81126070022583 + }, + { + "auxiliary_loss_clip": 0.01223556, + "auxiliary_loss_mlp": 0.00224608, + "balance_loss_clip": 1.01663637, + "balance_loss_mlp": 0.20268494, + "epoch": 0.9733353374417556, + "flos": 23404384903680.0, + "grad_norm": 3288.8542512170943, + "language_loss": 0.85514325, + "learning_rate": 7.437167905363084e-09, + "loss": 0.86962485, + "num_input_tokens_seen": 349379335, + "router_z_loss_clip": 2.0703125, + "router_z_loss_mlp": 0.21899414, + "step": 16189, + "time_per_iteration": 4.122826337814331 + }, + { + "auxiliary_loss_clip": 0.01239729, + "auxiliary_loss_mlp": 0.00238461, + "balance_loss_clip": 1.01984239, + "balance_loss_mlp": 0.21089996, + "epoch": 0.9733954606944236, + "flos": 39165792963840.0, + "grad_norm": 23.73755643716483, + "language_loss": 0.63533264, + "learning_rate": 7.403650130784367e-09, + "loss": 0.65011454, + "num_input_tokens_seen": 349401575, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.27587891, + "step": 16190, + "time_per_iteration": 4.2825987339019775 + }, + { + "auxiliary_loss_clip": 0.01229352, + "auxiliary_loss_mlp": 0.00220153, + "balance_loss_clip": 1.01536942, + "balance_loss_mlp": 0.19585842, + "epoch": 0.9734555839470915, + "flos": 21981819692160.0, + "grad_norm": 36.96559132832664, + "language_loss": 0.88826704, + "learning_rate": 7.3702079156590105e-09, + "loss": 0.90276206, + "num_input_tokens_seen": 349420650, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.24316406, + "step": 16191, + "time_per_iteration": 2.7092318534851074 + }, + { + "auxiliary_loss_clip": 0.01231305, + "auxiliary_loss_mlp": 0.00211123, + "balance_loss_clip": 1.02100742, + "balance_loss_mlp": 0.18719798, + "epoch": 0.9735157071997596, + "flos": 16575539771520.0, + "grad_norm": 15.841699905906337, + "language_loss": 0.88652003, + "learning_rate": 7.336841261255111e-09, + "loss": 0.90094435, + "num_input_tokens_seen": 349436830, + "router_z_loss_clip": 2.10351562, + "router_z_loss_mlp": 0.23937988, + "step": 16192, + "time_per_iteration": 2.668213367462158 + }, + { + "auxiliary_loss_clip": 0.01248944, + "auxiliary_loss_mlp": 0.00218819, + "balance_loss_clip": 1.03405869, + "balance_loss_mlp": 0.19352314, + "epoch": 0.9735758304524275, + "flos": 20223237726720.0, + "grad_norm": 16.508139538438147, + "language_loss": 0.82374287, + "learning_rate": 7.303550168837658e-09, + "loss": 0.83842051, + "num_input_tokens_seen": 349454325, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.25292969, + "step": 16193, + "time_per_iteration": 2.711557149887085 + }, + { + "auxiliary_loss_clip": 0.01221649, + "auxiliary_loss_mlp": 0.00203723, + "balance_loss_clip": 1.01299214, + "balance_loss_mlp": 0.17972568, + "epoch": 0.9736359537050955, + "flos": 23653353047040.0, + "grad_norm": 7.090746767104533, + "language_loss": 0.89834678, + "learning_rate": 7.270334639669417e-09, + "loss": 0.91260052, + "num_input_tokens_seen": 349470230, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.2401123, + "step": 16194, + "time_per_iteration": 2.6752498149871826 + }, + { + "auxiliary_loss_clip": 0.01219762, + "auxiliary_loss_mlp": 0.00210079, + "balance_loss_clip": 1.0103004, + "balance_loss_mlp": 0.18736999, + "epoch": 0.9736960769577634, + "flos": 15560202026880.0, + "grad_norm": 43.90341564661956, + "language_loss": 0.82619631, + "learning_rate": 7.237194675009828e-09, + "loss": 0.84049469, + "num_input_tokens_seen": 349486250, + "router_z_loss_clip": 2.09277344, + "router_z_loss_mlp": 0.22717285, + "step": 16195, + "time_per_iteration": 2.7164461612701416 + }, + { + "auxiliary_loss_clip": 0.01078079, + "auxiliary_loss_mlp": 0.00075072, + "balance_loss_clip": 0.94246942, + "balance_loss_mlp": 0.06863426, + "epoch": 0.9737562002104314, + "flos": 65351783088000.0, + "grad_norm": 0.7422950747260958, + "language_loss": 0.52011031, + "learning_rate": 7.204130276115439e-09, + "loss": 0.53164184, + "num_input_tokens_seen": 349545865, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.06445312, + "step": 16196, + "time_per_iteration": 3.1543633937835693 + }, + { + "auxiliary_loss_clip": 0.0124413, + "auxiliary_loss_mlp": 0.00217083, + "balance_loss_clip": 1.02611375, + "balance_loss_mlp": 0.19071397, + "epoch": 0.9738163234630993, + "flos": 27196730928000.0, + "grad_norm": 26.70620439471528, + "language_loss": 0.84123582, + "learning_rate": 7.171141444240136e-09, + "loss": 0.85584795, + "num_input_tokens_seen": 349566080, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.26379395, + "step": 16197, + "time_per_iteration": 2.717935562133789 + }, + { + "auxiliary_loss_clip": 0.01248778, + "auxiliary_loss_mlp": 0.00235586, + "balance_loss_clip": 1.03062844, + "balance_loss_mlp": 0.20946687, + "epoch": 0.9738764467157673, + "flos": 21069365477760.0, + "grad_norm": 34.91683933718339, + "language_loss": 0.74952787, + "learning_rate": 7.13822818063492e-09, + "loss": 0.76437151, + "num_input_tokens_seen": 349585665, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.26123047, + "step": 16198, + "time_per_iteration": 2.654101610183716 + }, + { + "auxiliary_loss_clip": 0.0123683, + "auxiliary_loss_mlp": 0.00218029, + "balance_loss_clip": 1.02176261, + "balance_loss_mlp": 0.19232726, + "epoch": 0.9739365699684353, + "flos": 21361211481600.0, + "grad_norm": 56.73073765595318, + "language_loss": 0.86036587, + "learning_rate": 7.10539048654768e-09, + "loss": 0.87491441, + "num_input_tokens_seen": 349605125, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.25683594, + "step": 16199, + "time_per_iteration": 2.7174887657165527 + }, + { + "auxiliary_loss_clip": 0.01245056, + "auxiliary_loss_mlp": 0.00227704, + "balance_loss_clip": 1.02454495, + "balance_loss_mlp": 0.20119202, + "epoch": 0.9739966932211033, + "flos": 21902061542400.0, + "grad_norm": 2.890389247197782, + "language_loss": 0.86548007, + "learning_rate": 7.072628363223865e-09, + "loss": 0.88020766, + "num_input_tokens_seen": 349623360, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.26525879, + "step": 16200, + "time_per_iteration": 2.6927223205566406 + }, + { + "auxiliary_loss_clip": 0.01260854, + "auxiliary_loss_mlp": 0.00223019, + "balance_loss_clip": 1.03597021, + "balance_loss_mlp": 0.19467102, + "epoch": 0.9740568164737712, + "flos": 24827345164800.0, + "grad_norm": 202.21681968890906, + "language_loss": 0.79432976, + "learning_rate": 7.039941811905592e-09, + "loss": 0.80916852, + "num_input_tokens_seen": 349644390, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.28356934, + "step": 16201, + "time_per_iteration": 2.756596565246582 + }, + { + "auxiliary_loss_clip": 0.01233989, + "auxiliary_loss_mlp": 0.00224309, + "balance_loss_clip": 1.01973724, + "balance_loss_mlp": 0.19929919, + "epoch": 0.9741169397264392, + "flos": 23623583650560.0, + "grad_norm": 136.1084581379726, + "language_loss": 0.78499126, + "learning_rate": 7.0073308338325364e-09, + "loss": 0.79957426, + "num_input_tokens_seen": 349663200, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.25, + "step": 16202, + "time_per_iteration": 2.7647225856781006 + }, + { + "auxiliary_loss_clip": 0.01245748, + "auxiliary_loss_mlp": 0.00232941, + "balance_loss_clip": 1.02516258, + "balance_loss_mlp": 0.20635705, + "epoch": 0.9741770629791072, + "flos": 18841144164480.0, + "grad_norm": 14.717247721433038, + "language_loss": 0.80242538, + "learning_rate": 6.974795430241265e-09, + "loss": 0.81721228, + "num_input_tokens_seen": 349681975, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.26611328, + "step": 16203, + "time_per_iteration": 2.6258127689361572 + }, + { + "auxiliary_loss_clip": 0.01220691, + "auxiliary_loss_mlp": 0.00212219, + "balance_loss_clip": 1.01120138, + "balance_loss_mlp": 0.18991506, + "epoch": 0.9742371862317751, + "flos": 22346241125760.0, + "grad_norm": 118.90728882985239, + "language_loss": 0.84442729, + "learning_rate": 6.942335602365235e-09, + "loss": 0.85875642, + "num_input_tokens_seen": 349701185, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.22302246, + "step": 16204, + "time_per_iteration": 2.7884714603424072 + }, + { + "auxiliary_loss_clip": 0.01239175, + "auxiliary_loss_mlp": 0.00211778, + "balance_loss_clip": 1.02524495, + "balance_loss_mlp": 0.18843725, + "epoch": 0.9742973094844432, + "flos": 21762764599680.0, + "grad_norm": 6.89950299509813, + "language_loss": 0.88741112, + "learning_rate": 6.909951351435905e-09, + "loss": 0.90192062, + "num_input_tokens_seen": 349720360, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.23352051, + "step": 16205, + "time_per_iteration": 2.694004774093628 + }, + { + "auxiliary_loss_clip": 0.01223879, + "auxiliary_loss_mlp": 0.00236659, + "balance_loss_clip": 1.01132464, + "balance_loss_mlp": 0.21324611, + "epoch": 0.9743574327371111, + "flos": 26248725227520.0, + "grad_norm": 18.83700736305046, + "language_loss": 0.81524849, + "learning_rate": 6.87764267868074e-09, + "loss": 0.82985389, + "num_input_tokens_seen": 349741040, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.23425293, + "step": 16206, + "time_per_iteration": 2.796604871749878 + }, + { + "auxiliary_loss_clip": 0.01235296, + "auxiliary_loss_mlp": 0.00217832, + "balance_loss_clip": 1.01692772, + "balance_loss_mlp": 0.1927381, + "epoch": 0.9744175559897791, + "flos": 12349321367040.0, + "grad_norm": 624.1739698970022, + "language_loss": 0.94442916, + "learning_rate": 6.8454095853252015e-09, + "loss": 0.95896041, + "num_input_tokens_seen": 349758895, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.25134277, + "step": 16207, + "time_per_iteration": 2.6921350955963135 + }, + { + "auxiliary_loss_clip": 0.01229222, + "auxiliary_loss_mlp": 0.00222562, + "balance_loss_clip": 1.01878762, + "balance_loss_mlp": 0.19732541, + "epoch": 0.974477679242447, + "flos": 28397834835840.0, + "grad_norm": 254.7772606763417, + "language_loss": 0.76936287, + "learning_rate": 6.813252072591425e-09, + "loss": 0.78388071, + "num_input_tokens_seen": 349779740, + "router_z_loss_clip": 2.10351562, + "router_z_loss_mlp": 0.25244141, + "step": 16208, + "time_per_iteration": 2.767425537109375 + }, + { + "auxiliary_loss_clip": 0.01219456, + "auxiliary_loss_mlp": 0.00214422, + "balance_loss_clip": 1.01535034, + "balance_loss_mlp": 0.19233257, + "epoch": 0.974537802495115, + "flos": 17785370684160.0, + "grad_norm": 108.3852404222588, + "language_loss": 0.8151387, + "learning_rate": 6.781170141698878e-09, + "loss": 0.82947743, + "num_input_tokens_seen": 349796820, + "router_z_loss_clip": 2.04003906, + "router_z_loss_mlp": 0.22094727, + "step": 16209, + "time_per_iteration": 2.7001402378082275 + }, + { + "auxiliary_loss_clip": 0.01221134, + "auxiliary_loss_mlp": 0.00210737, + "balance_loss_clip": 1.01225102, + "balance_loss_mlp": 0.18767044, + "epoch": 0.9745979257477829, + "flos": 23842315520640.0, + "grad_norm": 9.514206310568769, + "language_loss": 0.88109934, + "learning_rate": 6.749163793864144e-09, + "loss": 0.89541805, + "num_input_tokens_seen": 349816550, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.23059082, + "step": 16210, + "time_per_iteration": 2.7096471786499023 + }, + { + "auxiliary_loss_clip": 0.0123871, + "auxiliary_loss_mlp": 0.00231179, + "balance_loss_clip": 1.01951218, + "balance_loss_mlp": 0.20608559, + "epoch": 0.9746580490004509, + "flos": 27016172236800.0, + "grad_norm": 13.173118903806367, + "language_loss": 0.88108277, + "learning_rate": 6.7172330303009176e-09, + "loss": 0.8957817, + "num_input_tokens_seen": 349834350, + "router_z_loss_clip": 2.19433594, + "router_z_loss_mlp": 0.25109863, + "step": 16211, + "time_per_iteration": 2.7420713901519775 + }, + { + "auxiliary_loss_clip": 0.01253969, + "auxiliary_loss_mlp": 0.00228304, + "balance_loss_clip": 1.03026175, + "balance_loss_mlp": 0.20253114, + "epoch": 0.9747181722531189, + "flos": 19792022952960.0, + "grad_norm": 39.76931154596857, + "language_loss": 0.89896613, + "learning_rate": 6.685377852219787e-09, + "loss": 0.9137888, + "num_input_tokens_seen": 349853460, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.25793457, + "step": 16212, + "time_per_iteration": 2.75981068611145 + }, + { + "auxiliary_loss_clip": 0.01223371, + "auxiliary_loss_mlp": 0.00220139, + "balance_loss_clip": 1.01234388, + "balance_loss_mlp": 0.19567722, + "epoch": 0.9747782955057869, + "flos": 31430598929280.0, + "grad_norm": 507.7371110739221, + "language_loss": 0.86659813, + "learning_rate": 6.653598260829118e-09, + "loss": 0.88103318, + "num_input_tokens_seen": 349874830, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.24462891, + "step": 16213, + "time_per_iteration": 2.8016834259033203 + }, + { + "auxiliary_loss_clip": 0.01227509, + "auxiliary_loss_mlp": 0.00225488, + "balance_loss_clip": 1.01001608, + "balance_loss_mlp": 0.19984564, + "epoch": 0.9748384187584548, + "flos": 15961288268160.0, + "grad_norm": 8.754681251521369, + "language_loss": 0.77145827, + "learning_rate": 6.6218942573335044e-09, + "loss": 0.78598821, + "num_input_tokens_seen": 349893690, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.25634766, + "step": 16214, + "time_per_iteration": 2.6834027767181396 + }, + { + "auxiliary_loss_clip": 0.01253431, + "auxiliary_loss_mlp": 0.00241599, + "balance_loss_clip": 1.02692294, + "balance_loss_mlp": 0.21280977, + "epoch": 0.9748985420111228, + "flos": 20558715776640.0, + "grad_norm": 107.0960892995615, + "language_loss": 0.86019433, + "learning_rate": 6.5902658429355386e-09, + "loss": 0.8751446, + "num_input_tokens_seen": 349912480, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.28796387, + "step": 16215, + "time_per_iteration": 2.7155840396881104 + }, + { + "auxiliary_loss_clip": 0.0122002, + "auxiliary_loss_mlp": 0.00220182, + "balance_loss_clip": 1.01131856, + "balance_loss_mlp": 0.19575624, + "epoch": 0.9749586652637908, + "flos": 36721605127680.0, + "grad_norm": 44.35629571278392, + "language_loss": 0.75383413, + "learning_rate": 6.558713018834483e-09, + "loss": 0.76823616, + "num_input_tokens_seen": 349932470, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.24438477, + "step": 16216, + "time_per_iteration": 2.8109610080718994 + }, + { + "auxiliary_loss_clip": 0.01246105, + "auxiliary_loss_mlp": 0.00234674, + "balance_loss_clip": 1.02770948, + "balance_loss_mlp": 0.20799449, + "epoch": 0.9750187885164587, + "flos": 10999223844480.0, + "grad_norm": 36.24676825344804, + "language_loss": 0.80790502, + "learning_rate": 6.527235786226937e-09, + "loss": 0.82271278, + "num_input_tokens_seen": 349949060, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.26696777, + "step": 16217, + "time_per_iteration": 2.6724417209625244 + }, + { + "auxiliary_loss_clip": 0.01231546, + "auxiliary_loss_mlp": 0.00211608, + "balance_loss_clip": 1.02012753, + "balance_loss_mlp": 0.18730086, + "epoch": 0.9750789117691268, + "flos": 25739512070400.0, + "grad_norm": 124.51398956129525, + "language_loss": 0.84845591, + "learning_rate": 6.495834146306167e-09, + "loss": 0.8628875, + "num_input_tokens_seen": 349968010, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.24316406, + "step": 16218, + "time_per_iteration": 2.7818009853363037 + }, + { + "auxiliary_loss_clip": 0.01217979, + "auxiliary_loss_mlp": 0.00220499, + "balance_loss_clip": 1.00905657, + "balance_loss_mlp": 0.19663364, + "epoch": 0.9751390350217947, + "flos": 13333955961600.0, + "grad_norm": 22.242053712478572, + "language_loss": 0.89091384, + "learning_rate": 6.464508100263222e-09, + "loss": 0.90529859, + "num_input_tokens_seen": 349985270, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.23840332, + "step": 16219, + "time_per_iteration": 2.647954225540161 + }, + { + "auxiliary_loss_clip": 0.01240323, + "auxiliary_loss_mlp": 0.00233117, + "balance_loss_clip": 1.02108002, + "balance_loss_mlp": 0.2079404, + "epoch": 0.9751991582744627, + "flos": 22820621068800.0, + "grad_norm": 7.2529682929922705, + "language_loss": 0.87173021, + "learning_rate": 6.433257649285817e-09, + "loss": 0.8864646, + "num_input_tokens_seen": 350003935, + "router_z_loss_clip": 2.19238281, + "router_z_loss_mlp": 0.25183105, + "step": 16220, + "time_per_iteration": 2.706794500350952 + }, + { + "auxiliary_loss_clip": 0.01227577, + "auxiliary_loss_mlp": 0.00216821, + "balance_loss_clip": 1.0131408, + "balance_loss_mlp": 0.1916679, + "epoch": 0.9752592815271306, + "flos": 19646189735040.0, + "grad_norm": 44.00887711918814, + "language_loss": 0.82914245, + "learning_rate": 6.402082794559227e-09, + "loss": 0.84358644, + "num_input_tokens_seen": 350023595, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.25146484, + "step": 16221, + "time_per_iteration": 2.6257498264312744 + }, + { + "auxiliary_loss_clip": 0.01227366, + "auxiliary_loss_mlp": 0.00221809, + "balance_loss_clip": 1.01502728, + "balance_loss_mlp": 0.19691788, + "epoch": 0.9753194047797986, + "flos": 26690462686080.0, + "grad_norm": 5.316208194262544, + "language_loss": 0.72151834, + "learning_rate": 6.370983537265395e-09, + "loss": 0.73601007, + "num_input_tokens_seen": 350045920, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.24890137, + "step": 16222, + "time_per_iteration": 4.162256240844727 + }, + { + "auxiliary_loss_clip": 0.0122852, + "auxiliary_loss_mlp": 0.00224673, + "balance_loss_clip": 1.01826024, + "balance_loss_mlp": 0.20034245, + "epoch": 0.9753795280324665, + "flos": 23221779137280.0, + "grad_norm": 11.257583943360041, + "language_loss": 0.96743888, + "learning_rate": 6.3399598785836004e-09, + "loss": 0.98197079, + "num_input_tokens_seen": 350063925, + "router_z_loss_clip": 2.10449219, + "router_z_loss_mlp": 0.24316406, + "step": 16223, + "time_per_iteration": 4.177023410797119 + }, + { + "auxiliary_loss_clip": 0.01213111, + "auxiliary_loss_mlp": 0.00213872, + "balance_loss_clip": 1.00939155, + "balance_loss_mlp": 0.18982738, + "epoch": 0.9754396512851345, + "flos": 19463835363840.0, + "grad_norm": 3.3906571597513087, + "language_loss": 0.81838667, + "learning_rate": 6.309011819690457e-09, + "loss": 0.8326565, + "num_input_tokens_seen": 350080900, + "router_z_loss_clip": 2.03613281, + "router_z_loss_mlp": 0.24035645, + "step": 16224, + "time_per_iteration": 2.689598321914673 + }, + { + "auxiliary_loss_clip": 0.01081995, + "auxiliary_loss_mlp": 0.00095061, + "balance_loss_clip": 0.94338739, + "balance_loss_mlp": 0.08790831, + "epoch": 0.9754997745378025, + "flos": 68459313340800.0, + "grad_norm": 0.822374471881492, + "language_loss": 0.57825315, + "learning_rate": 6.278139361759249e-09, + "loss": 0.59002376, + "num_input_tokens_seen": 350144550, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.07128906, + "step": 16225, + "time_per_iteration": 3.1108663082122803 + }, + { + "auxiliary_loss_clip": 0.01239274, + "auxiliary_loss_mlp": 0.00225892, + "balance_loss_clip": 1.02353072, + "balance_loss_mlp": 0.20236011, + "epoch": 0.9755598977904705, + "flos": 26395168976640.0, + "grad_norm": 29.162433941785103, + "language_loss": 0.75766957, + "learning_rate": 6.247342505960818e-09, + "loss": 0.77232122, + "num_input_tokens_seen": 350164050, + "router_z_loss_clip": 2.15917969, + "router_z_loss_mlp": 0.23547363, + "step": 16226, + "time_per_iteration": 2.7377209663391113 + }, + { + "auxiliary_loss_clip": 0.01226362, + "auxiliary_loss_mlp": 0.00236603, + "balance_loss_clip": 1.01267838, + "balance_loss_mlp": 0.21026964, + "epoch": 0.9756200210431384, + "flos": 16617663446400.0, + "grad_norm": 77.69331581242963, + "language_loss": 0.89924324, + "learning_rate": 6.216621253462894e-09, + "loss": 0.91387284, + "num_input_tokens_seen": 350181350, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.26330566, + "step": 16227, + "time_per_iteration": 2.6521334648132324 + }, + { + "auxiliary_loss_clip": 0.01226087, + "auxiliary_loss_mlp": 0.00216341, + "balance_loss_clip": 1.01378274, + "balance_loss_mlp": 0.19372651, + "epoch": 0.9756801442958064, + "flos": 23623044946560.0, + "grad_norm": 48.88447484243357, + "language_loss": 0.82017934, + "learning_rate": 6.185975605430549e-09, + "loss": 0.83460367, + "num_input_tokens_seen": 350199765, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.22631836, + "step": 16228, + "time_per_iteration": 2.719388961791992 + }, + { + "auxiliary_loss_clip": 0.01081534, + "auxiliary_loss_mlp": 0.00100444, + "balance_loss_clip": 0.94448125, + "balance_loss_mlp": 0.09319642, + "epoch": 0.9757402675484744, + "flos": 61625799440640.0, + "grad_norm": 0.8205117420542077, + "language_loss": 0.54928732, + "learning_rate": 6.155405563025962e-09, + "loss": 0.5611071, + "num_input_tokens_seen": 350256420, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.07226562, + "step": 16229, + "time_per_iteration": 3.0592432022094727 + }, + { + "auxiliary_loss_clip": 0.01244379, + "auxiliary_loss_mlp": 0.00221963, + "balance_loss_clip": 1.02703214, + "balance_loss_mlp": 0.19707173, + "epoch": 0.9758003908011423, + "flos": 24058964401920.0, + "grad_norm": 9.702327605002319, + "language_loss": 0.81072885, + "learning_rate": 6.124911127407984e-09, + "loss": 0.82539225, + "num_input_tokens_seen": 350276270, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.24890137, + "step": 16230, + "time_per_iteration": 2.7876923084259033 + }, + { + "auxiliary_loss_clip": 0.01233081, + "auxiliary_loss_mlp": 0.00213242, + "balance_loss_clip": 1.02079844, + "balance_loss_mlp": 0.18851823, + "epoch": 0.9758605140538104, + "flos": 17493093717120.0, + "grad_norm": 27.675026655545082, + "language_loss": 0.78647989, + "learning_rate": 6.094492299733245e-09, + "loss": 0.80094314, + "num_input_tokens_seen": 350295000, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.24719238, + "step": 16231, + "time_per_iteration": 4.214065790176392 + }, + { + "auxiliary_loss_clip": 0.01257066, + "auxiliary_loss_mlp": 0.00227646, + "balance_loss_clip": 1.03174448, + "balance_loss_mlp": 0.2022664, + "epoch": 0.9759206373064783, + "flos": 24826950115200.0, + "grad_norm": 464.5336924419608, + "language_loss": 0.87547898, + "learning_rate": 6.064149081155267e-09, + "loss": 0.89032608, + "num_input_tokens_seen": 350314980, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.25366211, + "step": 16232, + "time_per_iteration": 4.183711767196655 + }, + { + "auxiliary_loss_clip": 0.0108685, + "auxiliary_loss_mlp": 0.00079617, + "balance_loss_clip": 0.94937658, + "balance_loss_mlp": 0.07317932, + "epoch": 0.9759807605591463, + "flos": 68161182456960.0, + "grad_norm": 0.7148727283411961, + "language_loss": 0.53418088, + "learning_rate": 6.033881472824465e-09, + "loss": 0.54584551, + "num_input_tokens_seen": 350371985, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.06445312, + "step": 16233, + "time_per_iteration": 2.9971158504486084 + }, + { + "auxiliary_loss_clip": 0.01223377, + "auxiliary_loss_mlp": 0.00220019, + "balance_loss_clip": 1.01064324, + "balance_loss_mlp": 0.19435287, + "epoch": 0.9760408838118142, + "flos": 18989239939200.0, + "grad_norm": 1559.686644095083, + "language_loss": 0.79997528, + "learning_rate": 6.003689475888807e-09, + "loss": 0.81440926, + "num_input_tokens_seen": 350390590, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.25671387, + "step": 16234, + "time_per_iteration": 2.667994499206543 + }, + { + "auxiliary_loss_clip": 0.01255573, + "auxiliary_loss_mlp": 0.00263549, + "balance_loss_clip": 1.0342927, + "balance_loss_mlp": 0.23621476, + "epoch": 0.9761010070644822, + "flos": 17125978763520.0, + "grad_norm": 9117.643117787062, + "language_loss": 0.88877207, + "learning_rate": 5.973573091493156e-09, + "loss": 0.90396321, + "num_input_tokens_seen": 350403770, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.27294922, + "step": 16235, + "time_per_iteration": 2.6228573322296143 + }, + { + "auxiliary_loss_clip": 0.01241167, + "auxiliary_loss_mlp": 0.00225065, + "balance_loss_clip": 1.02503586, + "balance_loss_mlp": 0.19947119, + "epoch": 0.9761611303171501, + "flos": 22052599441920.0, + "grad_norm": 42.85426679509969, + "language_loss": 0.84784937, + "learning_rate": 5.943532320779265e-09, + "loss": 0.86251163, + "num_input_tokens_seen": 350421870, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.25610352, + "step": 16236, + "time_per_iteration": 2.6841397285461426 + }, + { + "auxiliary_loss_clip": 0.01225012, + "auxiliary_loss_mlp": 0.00207278, + "balance_loss_clip": 1.01322007, + "balance_loss_mlp": 0.1838291, + "epoch": 0.9762212535698181, + "flos": 21757521214080.0, + "grad_norm": 2.7229030867723676, + "language_loss": 0.8113538, + "learning_rate": 5.913567164886446e-09, + "loss": 0.82567668, + "num_input_tokens_seen": 350440025, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.23461914, + "step": 16237, + "time_per_iteration": 2.647490978240967 + }, + { + "auxiliary_loss_clip": 0.01232162, + "auxiliary_loss_mlp": 0.00233487, + "balance_loss_clip": 1.01799655, + "balance_loss_mlp": 0.20665297, + "epoch": 0.9762813768224861, + "flos": 25921615046400.0, + "grad_norm": 575.6691389052797, + "language_loss": 0.81598473, + "learning_rate": 5.8836776249509e-09, + "loss": 0.83064127, + "num_input_tokens_seen": 350459435, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.26867676, + "step": 16238, + "time_per_iteration": 2.737009286880493 + }, + { + "auxiliary_loss_clip": 0.01235335, + "auxiliary_loss_mlp": 0.00216492, + "balance_loss_clip": 1.01904273, + "balance_loss_mlp": 0.19167233, + "epoch": 0.9763415000751541, + "flos": 24051853509120.0, + "grad_norm": 4.330286897739265, + "language_loss": 0.91517144, + "learning_rate": 5.8538637021063875e-09, + "loss": 0.92968971, + "num_input_tokens_seen": 350472655, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.24804688, + "step": 16239, + "time_per_iteration": 2.633209228515625 + }, + { + "auxiliary_loss_clip": 0.01255109, + "auxiliary_loss_mlp": 0.00220956, + "balance_loss_clip": 1.03179741, + "balance_loss_mlp": 0.19525474, + "epoch": 0.976401623327822, + "flos": 17018677860480.0, + "grad_norm": 298.2840742130606, + "language_loss": 0.72634709, + "learning_rate": 5.824125397483115e-09, + "loss": 0.74110776, + "num_input_tokens_seen": 350488160, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.25732422, + "step": 16240, + "time_per_iteration": 2.6207425594329834 + }, + { + "auxiliary_loss_clip": 0.01231404, + "auxiliary_loss_mlp": 0.00240589, + "balance_loss_clip": 1.02119827, + "balance_loss_mlp": 0.21565083, + "epoch": 0.97646174658049, + "flos": 16106941918080.0, + "grad_norm": 6.848011378138832, + "language_loss": 0.90181112, + "learning_rate": 5.7944627122088474e-09, + "loss": 0.91653103, + "num_input_tokens_seen": 350506065, + "router_z_loss_clip": 2.10449219, + "router_z_loss_mlp": 0.24963379, + "step": 16241, + "time_per_iteration": 2.616647481918335 + }, + { + "auxiliary_loss_clip": 0.01234329, + "auxiliary_loss_mlp": 0.00238372, + "balance_loss_clip": 1.02063894, + "balance_loss_mlp": 0.21329011, + "epoch": 0.9765218698331579, + "flos": 21252725429760.0, + "grad_norm": 14.502495673999231, + "language_loss": 0.90238667, + "learning_rate": 5.764875647408463e-09, + "loss": 0.91711366, + "num_input_tokens_seen": 350524495, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.25097656, + "step": 16242, + "time_per_iteration": 2.736884355545044 + }, + { + "auxiliary_loss_clip": 0.01260368, + "auxiliary_loss_mlp": 0.00232511, + "balance_loss_clip": 1.03687882, + "balance_loss_mlp": 0.20545033, + "epoch": 0.9765819930858259, + "flos": 18588045957120.0, + "grad_norm": 73.41753821292835, + "language_loss": 0.84038484, + "learning_rate": 5.7353642042037294e-09, + "loss": 0.85531366, + "num_input_tokens_seen": 350544185, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.27062988, + "step": 16243, + "time_per_iteration": 2.6889610290527344 + }, + { + "auxiliary_loss_clip": 0.01251989, + "auxiliary_loss_mlp": 0.00241154, + "balance_loss_clip": 1.03429818, + "balance_loss_mlp": 0.21428385, + "epoch": 0.976642116338494, + "flos": 20266833859200.0, + "grad_norm": 39.05945211415106, + "language_loss": 0.77910221, + "learning_rate": 5.705928383713754e-09, + "loss": 0.79403359, + "num_input_tokens_seen": 350562675, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.26879883, + "step": 16244, + "time_per_iteration": 2.7319159507751465 + }, + { + "auxiliary_loss_clip": 0.01265688, + "auxiliary_loss_mlp": 0.00238714, + "balance_loss_clip": 1.0400219, + "balance_loss_mlp": 0.21080747, + "epoch": 0.9767022395911619, + "flos": 25550477769600.0, + "grad_norm": 20.51221854786558, + "language_loss": 0.91079926, + "learning_rate": 5.676568187055197e-09, + "loss": 0.9258433, + "num_input_tokens_seen": 350581535, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.27905273, + "step": 16245, + "time_per_iteration": 2.6892638206481934 + }, + { + "auxiliary_loss_clip": 0.01211263, + "auxiliary_loss_mlp": 0.0018415, + "balance_loss_clip": 1.0044744, + "balance_loss_mlp": 0.16211993, + "epoch": 0.9767623628438299, + "flos": 21762656858880.0, + "grad_norm": 5.77747716387729, + "language_loss": 0.84085852, + "learning_rate": 5.647283615340726e-09, + "loss": 0.85481262, + "num_input_tokens_seen": 350601615, + "router_z_loss_clip": 2.07128906, + "router_z_loss_mlp": 0.22033691, + "step": 16246, + "time_per_iteration": 2.6546645164489746 + }, + { + "auxiliary_loss_clip": 0.0120461, + "auxiliary_loss_mlp": 0.00201288, + "balance_loss_clip": 1.00142276, + "balance_loss_mlp": 0.17849557, + "epoch": 0.9768224860964978, + "flos": 15851114277120.0, + "grad_norm": 3.7837761873468234, + "language_loss": 0.81544459, + "learning_rate": 5.6180746696812275e-09, + "loss": 0.8295036, + "num_input_tokens_seen": 350619580, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.22790527, + "step": 16247, + "time_per_iteration": 2.6310787200927734 + }, + { + "auxiliary_loss_clip": 0.01233321, + "auxiliary_loss_mlp": 0.00220498, + "balance_loss_clip": 1.01917517, + "balance_loss_mlp": 0.19555925, + "epoch": 0.9768826093491658, + "flos": 25151151294720.0, + "grad_norm": 124.7805352865306, + "language_loss": 0.87772298, + "learning_rate": 5.58894135118404e-09, + "loss": 0.89226115, + "num_input_tokens_seen": 350640015, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.24951172, + "step": 16248, + "time_per_iteration": 2.711103916168213 + }, + { + "auxiliary_loss_clip": 0.0128193, + "auxiliary_loss_mlp": 0.00237763, + "balance_loss_clip": 1.04828215, + "balance_loss_mlp": 0.21038045, + "epoch": 0.9769427326018337, + "flos": 22967028904320.0, + "grad_norm": 9.741223619467215, + "language_loss": 0.87144804, + "learning_rate": 5.559883660954278e-09, + "loss": 0.88664496, + "num_input_tokens_seen": 350659155, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.27368164, + "step": 16249, + "time_per_iteration": 2.685798168182373 + }, + { + "auxiliary_loss_clip": 0.0123218, + "auxiliary_loss_mlp": 0.00232307, + "balance_loss_clip": 1.02219558, + "balance_loss_mlp": 0.20821477, + "epoch": 0.9770028558545018, + "flos": 15264297786240.0, + "grad_norm": 3.463755993551968, + "language_loss": 0.74200439, + "learning_rate": 5.530901600093507e-09, + "loss": 0.75664926, + "num_input_tokens_seen": 350676615, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.24108887, + "step": 16250, + "time_per_iteration": 2.6292941570281982 + }, + { + "auxiliary_loss_clip": 0.01085779, + "auxiliary_loss_mlp": 0.00096035, + "balance_loss_clip": 0.94753325, + "balance_loss_mlp": 0.08955018, + "epoch": 0.9770629791071697, + "flos": 71450348808960.0, + "grad_norm": 0.8068208371561941, + "language_loss": 0.58551288, + "learning_rate": 5.501995169700846e-09, + "loss": 0.59733105, + "num_input_tokens_seen": 350736805, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.06494141, + "step": 16251, + "time_per_iteration": 3.2168140411376953 + }, + { + "auxiliary_loss_clip": 0.01231139, + "auxiliary_loss_mlp": 0.00226459, + "balance_loss_clip": 1.01919329, + "balance_loss_mlp": 0.20209301, + "epoch": 0.9771231023598377, + "flos": 22412854897920.0, + "grad_norm": 2.272999807730126, + "language_loss": 0.84602702, + "learning_rate": 5.473164370872307e-09, + "loss": 0.86060297, + "num_input_tokens_seen": 350753600, + "router_z_loss_clip": 2.11425781, + "router_z_loss_mlp": 0.2434082, + "step": 16252, + "time_per_iteration": 2.643819570541382 + }, + { + "auxiliary_loss_clip": 0.01239095, + "auxiliary_loss_mlp": 0.00211959, + "balance_loss_clip": 1.02488732, + "balance_loss_mlp": 0.18653166, + "epoch": 0.9771832256125056, + "flos": 19025940660480.0, + "grad_norm": 11.18882064298086, + "language_loss": 0.74528778, + "learning_rate": 5.444409204701461e-09, + "loss": 0.75979829, + "num_input_tokens_seen": 350771225, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.2545166, + "step": 16253, + "time_per_iteration": 2.674994707107544 + }, + { + "auxiliary_loss_clip": 0.01257267, + "auxiliary_loss_mlp": 0.00232338, + "balance_loss_clip": 1.0374074, + "balance_loss_mlp": 0.20613563, + "epoch": 0.9772433488651736, + "flos": 17822143232640.0, + "grad_norm": 27.64473994552685, + "language_loss": 0.87091553, + "learning_rate": 5.415729672278324e-09, + "loss": 0.88581157, + "num_input_tokens_seen": 350789100, + "router_z_loss_clip": 2.20019531, + "router_z_loss_mlp": 0.26196289, + "step": 16254, + "time_per_iteration": 2.6136586666107178 + }, + { + "auxiliary_loss_clip": 0.01247793, + "auxiliary_loss_mlp": 0.00218928, + "balance_loss_clip": 1.02479386, + "balance_loss_mlp": 0.19303623, + "epoch": 0.9773034721178415, + "flos": 37629785623680.0, + "grad_norm": 23.395900252812392, + "language_loss": 0.73954725, + "learning_rate": 5.387125774690471e-09, + "loss": 0.75421453, + "num_input_tokens_seen": 350811085, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.25927734, + "step": 16255, + "time_per_iteration": 2.907097578048706 + }, + { + "auxiliary_loss_clip": 0.01253963, + "auxiliary_loss_mlp": 0.00236953, + "balance_loss_clip": 1.02722728, + "balance_loss_mlp": 0.20850942, + "epoch": 0.9773635953705095, + "flos": 20302457172480.0, + "grad_norm": 60.67629590516847, + "language_loss": 0.85057545, + "learning_rate": 5.358597513023033e-09, + "loss": 0.8654846, + "num_input_tokens_seen": 350831065, + "router_z_loss_clip": 2.265625, + "router_z_loss_mlp": 0.28430176, + "step": 16256, + "time_per_iteration": 2.74334454536438 + }, + { + "auxiliary_loss_clip": 0.01228621, + "auxiliary_loss_mlp": 0.00221578, + "balance_loss_clip": 1.0203706, + "balance_loss_mlp": 0.19814169, + "epoch": 0.9774237186231776, + "flos": 22309253095680.0, + "grad_norm": 1022.2956328169155, + "language_loss": 0.84368372, + "learning_rate": 5.330144888357369e-09, + "loss": 0.85818577, + "num_input_tokens_seen": 350849675, + "router_z_loss_clip": 2.07714844, + "router_z_loss_mlp": 0.23425293, + "step": 16257, + "time_per_iteration": 2.703503131866455 + }, + { + "auxiliary_loss_clip": 0.01235395, + "auxiliary_loss_mlp": 0.00204463, + "balance_loss_clip": 1.01809728, + "balance_loss_mlp": 0.17907117, + "epoch": 0.9774838418758455, + "flos": 24204905360640.0, + "grad_norm": 27.15278134334487, + "language_loss": 0.83392042, + "learning_rate": 5.301767901772391e-09, + "loss": 0.84831893, + "num_input_tokens_seen": 350868955, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.25390625, + "step": 16258, + "time_per_iteration": 2.7626097202301025 + }, + { + "auxiliary_loss_clip": 0.01079858, + "auxiliary_loss_mlp": 0.00064478, + "balance_loss_clip": 0.94313622, + "balance_loss_mlp": 0.05799286, + "epoch": 0.9775439651285135, + "flos": 66357139829760.0, + "grad_norm": 0.6517648830383103, + "language_loss": 0.58762157, + "learning_rate": 5.273466554344353e-09, + "loss": 0.59906495, + "num_input_tokens_seen": 350935110, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.06494141, + "step": 16259, + "time_per_iteration": 3.2399446964263916 + }, + { + "auxiliary_loss_clip": 0.01263439, + "auxiliary_loss_mlp": 0.00256942, + "balance_loss_clip": 1.03842437, + "balance_loss_mlp": 0.23072787, + "epoch": 0.9776040883811814, + "flos": 22601565976320.0, + "grad_norm": 7.060549281545783, + "language_loss": 0.80907238, + "learning_rate": 5.2452408471461705e-09, + "loss": 0.82427621, + "num_input_tokens_seen": 350953220, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.26208496, + "step": 16260, + "time_per_iteration": 2.736387014389038 + }, + { + "auxiliary_loss_clip": 0.01250372, + "auxiliary_loss_mlp": 0.00218553, + "balance_loss_clip": 1.02846956, + "balance_loss_mlp": 0.19334051, + "epoch": 0.9776642116338494, + "flos": 18442176825600.0, + "grad_norm": 17.085578628646612, + "language_loss": 0.86664057, + "learning_rate": 5.2170907812485456e-09, + "loss": 0.88132983, + "num_input_tokens_seen": 350971915, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.25219727, + "step": 16261, + "time_per_iteration": 2.6369025707244873 + }, + { + "auxiliary_loss_clip": 0.01240701, + "auxiliary_loss_mlp": 0.00223241, + "balance_loss_clip": 1.02483225, + "balance_loss_mlp": 0.19800401, + "epoch": 0.9777243348865173, + "flos": 22638446265600.0, + "grad_norm": 9.168908657971407, + "language_loss": 0.82087898, + "learning_rate": 5.189016357718845e-09, + "loss": 0.83551842, + "num_input_tokens_seen": 350990470, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.25268555, + "step": 16262, + "time_per_iteration": 2.672481060028076 + }, + { + "auxiliary_loss_clip": 0.01258662, + "auxiliary_loss_mlp": 0.00226998, + "balance_loss_clip": 1.03781867, + "balance_loss_mlp": 0.20093869, + "epoch": 0.9777844581391854, + "flos": 31321394605440.0, + "grad_norm": 20.39665158937016, + "language_loss": 0.82785302, + "learning_rate": 5.16101757762133e-09, + "loss": 0.84270966, + "num_input_tokens_seen": 351010755, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.26037598, + "step": 16263, + "time_per_iteration": 2.7242703437805176 + }, + { + "auxiliary_loss_clip": 0.01247152, + "auxiliary_loss_mlp": 0.00213649, + "balance_loss_clip": 1.02817035, + "balance_loss_mlp": 0.18843587, + "epoch": 0.9778445813918533, + "flos": 23039101543680.0, + "grad_norm": 91.8419861418273, + "language_loss": 0.7637046, + "learning_rate": 5.133094442018038e-09, + "loss": 0.77831262, + "num_input_tokens_seen": 351029965, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.2520752, + "step": 16264, + "time_per_iteration": 4.064895868301392 + }, + { + "auxiliary_loss_clip": 0.01258742, + "auxiliary_loss_mlp": 0.00251361, + "balance_loss_clip": 1.03412986, + "balance_loss_mlp": 0.22357304, + "epoch": 0.9779047046445213, + "flos": 17566351505280.0, + "grad_norm": 6.957040622787914, + "language_loss": 0.80186272, + "learning_rate": 5.105246951967679e-09, + "loss": 0.81696373, + "num_input_tokens_seen": 351046205, + "router_z_loss_clip": 2.24316406, + "router_z_loss_mlp": 0.27770996, + "step": 16265, + "time_per_iteration": 4.115423679351807 + }, + { + "auxiliary_loss_clip": 0.01231166, + "auxiliary_loss_mlp": 0.00204429, + "balance_loss_clip": 1.01733613, + "balance_loss_mlp": 0.17914446, + "epoch": 0.9779648278971892, + "flos": 20741141975040.0, + "grad_norm": 2.9394924362489663, + "language_loss": 0.76859546, + "learning_rate": 5.077475108526297e-09, + "loss": 0.78295135, + "num_input_tokens_seen": 351065390, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.25280762, + "step": 16266, + "time_per_iteration": 2.6692798137664795 + }, + { + "auxiliary_loss_clip": 0.01220781, + "auxiliary_loss_mlp": 0.00215853, + "balance_loss_clip": 1.00975513, + "balance_loss_mlp": 0.19236879, + "epoch": 0.9780249511498572, + "flos": 21026954494080.0, + "grad_norm": 58.197109480576614, + "language_loss": 0.92261618, + "learning_rate": 5.049778912747049e-09, + "loss": 0.93698251, + "num_input_tokens_seen": 351084355, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.23486328, + "step": 16267, + "time_per_iteration": 2.64871883392334 + }, + { + "auxiliary_loss_clip": 0.01252033, + "auxiliary_loss_mlp": 0.00230112, + "balance_loss_clip": 1.02947712, + "balance_loss_mlp": 0.20483944, + "epoch": 0.9780850744025251, + "flos": 30774223751040.0, + "grad_norm": 6.8907062284222285, + "language_loss": 0.80825776, + "learning_rate": 5.022158365679985e-09, + "loss": 0.82307923, + "num_input_tokens_seen": 351105870, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.25292969, + "step": 16268, + "time_per_iteration": 2.786489486694336 + }, + { + "auxiliary_loss_clip": 0.01244154, + "auxiliary_loss_mlp": 0.00218888, + "balance_loss_clip": 1.02761769, + "balance_loss_mlp": 0.19428343, + "epoch": 0.9781451976551931, + "flos": 20302995876480.0, + "grad_norm": 18.62926350093626, + "language_loss": 0.81520212, + "learning_rate": 4.994613468372711e-09, + "loss": 0.82983261, + "num_input_tokens_seen": 351124760, + "router_z_loss_clip": 2.16699219, + "router_z_loss_mlp": 0.24645996, + "step": 16269, + "time_per_iteration": 2.6774442195892334 + }, + { + "auxiliary_loss_clip": 0.01240105, + "auxiliary_loss_mlp": 0.00216283, + "balance_loss_clip": 1.02399457, + "balance_loss_mlp": 0.19172561, + "epoch": 0.9782053209078612, + "flos": 24316479982080.0, + "grad_norm": 7.2838684262576106, + "language_loss": 0.79586476, + "learning_rate": 4.967144221869501e-09, + "loss": 0.81042862, + "num_input_tokens_seen": 351142820, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.24536133, + "step": 16270, + "time_per_iteration": 2.84726881980896 + }, + { + "auxiliary_loss_clip": 0.01260886, + "auxiliary_loss_mlp": 0.00226677, + "balance_loss_clip": 1.0387826, + "balance_loss_mlp": 0.20083275, + "epoch": 0.9782654441605291, + "flos": 32489425065600.0, + "grad_norm": 5.576487829372038, + "language_loss": 0.74707645, + "learning_rate": 4.939750627212191e-09, + "loss": 0.7619521, + "num_input_tokens_seen": 351164805, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.25842285, + "step": 16271, + "time_per_iteration": 2.75707745552063 + }, + { + "auxiliary_loss_clip": 0.01240706, + "auxiliary_loss_mlp": 0.00226163, + "balance_loss_clip": 1.02858293, + "balance_loss_mlp": 0.20276181, + "epoch": 0.9783255674131971, + "flos": 26979076465920.0, + "grad_norm": 11.932406703058483, + "language_loss": 0.77447152, + "learning_rate": 4.912432685439505e-09, + "loss": 0.78914022, + "num_input_tokens_seen": 351187005, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.23413086, + "step": 16272, + "time_per_iteration": 2.800302267074585 + }, + { + "auxiliary_loss_clip": 0.01246189, + "auxiliary_loss_mlp": 0.00219347, + "balance_loss_clip": 1.0297308, + "balance_loss_mlp": 0.19551691, + "epoch": 0.978385690665865, + "flos": 23112251591040.0, + "grad_norm": 1.8542855234406044, + "language_loss": 0.7302947, + "learning_rate": 4.88519039758728e-09, + "loss": 0.74495006, + "num_input_tokens_seen": 351208450, + "router_z_loss_clip": 2.16699219, + "router_z_loss_mlp": 0.23864746, + "step": 16273, + "time_per_iteration": 4.140228748321533 + }, + { + "auxiliary_loss_clip": 0.01235557, + "auxiliary_loss_mlp": 0.00217906, + "balance_loss_clip": 1.02208853, + "balance_loss_mlp": 0.19361128, + "epoch": 0.978445813918533, + "flos": 25409672455680.0, + "grad_norm": 2071.381482089908, + "language_loss": 0.81077611, + "learning_rate": 4.85802376468869e-09, + "loss": 0.82531077, + "num_input_tokens_seen": 351229585, + "router_z_loss_clip": 2.13574219, + "router_z_loss_mlp": 0.24328613, + "step": 16274, + "time_per_iteration": 4.232133150100708 + }, + { + "auxiliary_loss_clip": 0.0123357, + "auxiliary_loss_mlp": 0.00199903, + "balance_loss_clip": 1.01934755, + "balance_loss_mlp": 0.17752695, + "epoch": 0.9785059371712009, + "flos": 23550218121600.0, + "grad_norm": 961.1306205425983, + "language_loss": 0.83969748, + "learning_rate": 4.830932787773579e-09, + "loss": 0.85403222, + "num_input_tokens_seen": 351249525, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.22363281, + "step": 16275, + "time_per_iteration": 2.7170255184173584 + }, + { + "auxiliary_loss_clip": 0.01242723, + "auxiliary_loss_mlp": 0.00226682, + "balance_loss_clip": 1.02387118, + "balance_loss_mlp": 0.19946614, + "epoch": 0.978566060423869, + "flos": 34351177870080.0, + "grad_norm": 8.287289845731094, + "language_loss": 0.77747613, + "learning_rate": 4.803917467869567e-09, + "loss": 0.79217011, + "num_input_tokens_seen": 351272530, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.27246094, + "step": 16276, + "time_per_iteration": 2.916033983230591 + }, + { + "auxiliary_loss_clip": 0.01215125, + "auxiliary_loss_mlp": 0.00236035, + "balance_loss_clip": 1.00388944, + "balance_loss_mlp": 0.21159747, + "epoch": 0.9786261836765369, + "flos": 11618862387840.0, + "grad_norm": 3.158904131297437, + "language_loss": 0.94788188, + "learning_rate": 4.776977806000726e-09, + "loss": 0.96239352, + "num_input_tokens_seen": 351288530, + "router_z_loss_clip": 2.11230469, + "router_z_loss_mlp": 0.24450684, + "step": 16277, + "time_per_iteration": 2.685847282409668 + }, + { + "auxiliary_loss_clip": 0.01219534, + "auxiliary_loss_mlp": 0.00202781, + "balance_loss_clip": 1.01411283, + "balance_loss_mlp": 0.1793322, + "epoch": 0.9786863069292049, + "flos": 17420949250560.0, + "grad_norm": 25.384552093134108, + "language_loss": 0.76931733, + "learning_rate": 4.7501138031891264e-09, + "loss": 0.78354043, + "num_input_tokens_seen": 351305890, + "router_z_loss_clip": 2.05371094, + "router_z_loss_mlp": 0.23449707, + "step": 16278, + "time_per_iteration": 2.7139861583709717 + }, + { + "auxiliary_loss_clip": 0.01243086, + "auxiliary_loss_mlp": 0.00245174, + "balance_loss_clip": 1.02673936, + "balance_loss_mlp": 0.2199139, + "epoch": 0.9787464301818728, + "flos": 20844923345280.0, + "grad_norm": 5.926996921124684, + "language_loss": 0.89178962, + "learning_rate": 4.723325460453065e-09, + "loss": 0.90667224, + "num_input_tokens_seen": 351325010, + "router_z_loss_clip": 2.16503906, + "router_z_loss_mlp": 0.25268555, + "step": 16279, + "time_per_iteration": 2.658730983734131 + }, + { + "auxiliary_loss_clip": 0.01241513, + "auxiliary_loss_mlp": 0.00234864, + "balance_loss_clip": 1.0250001, + "balance_loss_mlp": 0.21099798, + "epoch": 0.9788065534345408, + "flos": 18222942165120.0, + "grad_norm": 4.360775846687186, + "language_loss": 0.85680628, + "learning_rate": 4.696612778808395e-09, + "loss": 0.87157011, + "num_input_tokens_seen": 351343060, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.2388916, + "step": 16280, + "time_per_iteration": 2.702376365661621 + }, + { + "auxiliary_loss_clip": 0.01231615, + "auxiliary_loss_mlp": 0.00237014, + "balance_loss_clip": 1.02069116, + "balance_loss_mlp": 0.21213463, + "epoch": 0.9788666766872087, + "flos": 21578219498880.0, + "grad_norm": 6.5799659530107775, + "language_loss": 0.85148239, + "learning_rate": 4.669975759268085e-09, + "loss": 0.86616868, + "num_input_tokens_seen": 351363260, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.24853516, + "step": 16281, + "time_per_iteration": 2.717907667160034 + }, + { + "auxiliary_loss_clip": 0.0123204, + "auxiliary_loss_mlp": 0.00219559, + "balance_loss_clip": 1.01826501, + "balance_loss_mlp": 0.19472784, + "epoch": 0.9789267999398767, + "flos": 24900495212160.0, + "grad_norm": 8.59731671448413, + "language_loss": 0.88432813, + "learning_rate": 4.643414402842216e-09, + "loss": 0.89884418, + "num_input_tokens_seen": 351382610, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.24829102, + "step": 16282, + "time_per_iteration": 2.7499947547912598 + }, + { + "auxiliary_loss_clip": 0.01230885, + "auxiliary_loss_mlp": 0.00221829, + "balance_loss_clip": 1.01782894, + "balance_loss_mlp": 0.19765362, + "epoch": 0.9789869231925448, + "flos": 19573111514880.0, + "grad_norm": 54.3564905762967, + "language_loss": 0.91296953, + "learning_rate": 4.616928710538204e-09, + "loss": 0.92749667, + "num_input_tokens_seen": 351401075, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.24169922, + "step": 16283, + "time_per_iteration": 2.6116442680358887 + }, + { + "auxiliary_loss_clip": 0.01232789, + "auxiliary_loss_mlp": 0.00214414, + "balance_loss_clip": 1.02102172, + "balance_loss_mlp": 0.19050071, + "epoch": 0.9790470464452127, + "flos": 16796641939200.0, + "grad_norm": 4.968554677013541, + "language_loss": 0.78013003, + "learning_rate": 4.590518683360134e-09, + "loss": 0.79460204, + "num_input_tokens_seen": 351419275, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.2388916, + "step": 16284, + "time_per_iteration": 2.6286303997039795 + }, + { + "auxiliary_loss_clip": 0.01221895, + "auxiliary_loss_mlp": 0.00193142, + "balance_loss_clip": 1.01139832, + "balance_loss_mlp": 0.17181505, + "epoch": 0.9791071696978807, + "flos": 18369350000640.0, + "grad_norm": 31.419694620312537, + "language_loss": 0.71195686, + "learning_rate": 4.56418432230965e-09, + "loss": 0.72610724, + "num_input_tokens_seen": 351437375, + "router_z_loss_clip": 2.10644531, + "router_z_loss_mlp": 0.21325684, + "step": 16285, + "time_per_iteration": 2.6587142944335938 + }, + { + "auxiliary_loss_clip": 0.01243948, + "auxiliary_loss_mlp": 0.00245761, + "balance_loss_clip": 1.02971506, + "balance_loss_mlp": 0.22032146, + "epoch": 0.9791672929505486, + "flos": 24170323541760.0, + "grad_norm": 2.2798835602713234, + "language_loss": 0.76264369, + "learning_rate": 4.537925628385286e-09, + "loss": 0.77754074, + "num_input_tokens_seen": 351457810, + "router_z_loss_clip": 2.14160156, + "router_z_loss_mlp": 0.25439453, + "step": 16286, + "time_per_iteration": 2.6801750659942627 + }, + { + "auxiliary_loss_clip": 0.01207596, + "auxiliary_loss_mlp": 0.00253681, + "balance_loss_clip": 1.00208676, + "balance_loss_mlp": 0.2286348, + "epoch": 0.9792274162032166, + "flos": 24354114456960.0, + "grad_norm": 4.98261146570125, + "language_loss": 0.65179592, + "learning_rate": 4.511742602582691e-09, + "loss": 0.66640872, + "num_input_tokens_seen": 351478825, + "router_z_loss_clip": 2.05566406, + "router_z_loss_mlp": 0.25036621, + "step": 16287, + "time_per_iteration": 2.7861382961273193 + }, + { + "auxiliary_loss_clip": 0.01227015, + "auxiliary_loss_mlp": 0.0022689, + "balance_loss_clip": 1.01643372, + "balance_loss_mlp": 0.20109317, + "epoch": 0.9792875394558845, + "flos": 26395779507840.0, + "grad_norm": 144.08679208986095, + "language_loss": 0.8872, + "learning_rate": 4.485635245894626e-09, + "loss": 0.901739, + "num_input_tokens_seen": 351498785, + "router_z_loss_clip": 2.10449219, + "router_z_loss_mlp": 0.25805664, + "step": 16288, + "time_per_iteration": 2.6793642044067383 + }, + { + "auxiliary_loss_clip": 0.01240272, + "auxiliary_loss_mlp": 0.00241142, + "balance_loss_clip": 1.02415478, + "balance_loss_mlp": 0.21566735, + "epoch": 0.9793476627085526, + "flos": 28148004766080.0, + "grad_norm": 6.473004714924833, + "language_loss": 0.78820592, + "learning_rate": 4.459603559311631e-09, + "loss": 0.80302006, + "num_input_tokens_seen": 351520235, + "router_z_loss_clip": 2.16308594, + "router_z_loss_mlp": 0.25500488, + "step": 16289, + "time_per_iteration": 2.767590284347534 + }, + { + "auxiliary_loss_clip": 0.01243047, + "auxiliary_loss_mlp": 0.00217871, + "balance_loss_clip": 1.0285238, + "balance_loss_mlp": 0.19393378, + "epoch": 0.9794077859612205, + "flos": 16763927627520.0, + "grad_norm": 136.7993264209483, + "language_loss": 0.85195804, + "learning_rate": 4.43364754382003e-09, + "loss": 0.86656713, + "num_input_tokens_seen": 351538900, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.23962402, + "step": 16290, + "time_per_iteration": 2.605208396911621 + }, + { + "auxiliary_loss_clip": 0.01247117, + "auxiliary_loss_mlp": 0.00233734, + "balance_loss_clip": 1.02609658, + "balance_loss_mlp": 0.20920098, + "epoch": 0.9794679092138885, + "flos": 19280834547840.0, + "grad_norm": 7.332824560948242, + "language_loss": 0.73815191, + "learning_rate": 4.4077672004048105e-09, + "loss": 0.75296038, + "num_input_tokens_seen": 351558715, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.24536133, + "step": 16291, + "time_per_iteration": 2.6688232421875 + }, + { + "auxiliary_loss_clip": 0.01247259, + "auxiliary_loss_mlp": 0.0020966, + "balance_loss_clip": 1.02847242, + "balance_loss_mlp": 0.1840423, + "epoch": 0.9795280324665564, + "flos": 32156640535680.0, + "grad_norm": 12.046315721228824, + "language_loss": 0.71307516, + "learning_rate": 4.3819625300467456e-09, + "loss": 0.72764426, + "num_input_tokens_seen": 351578450, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.25622559, + "step": 16292, + "time_per_iteration": 2.7112526893615723 + }, + { + "auxiliary_loss_clip": 0.01245147, + "auxiliary_loss_mlp": 0.00210536, + "balance_loss_clip": 1.0248127, + "balance_loss_mlp": 0.18512104, + "epoch": 0.9795881557192244, + "flos": 19060953442560.0, + "grad_norm": 69.02848360732574, + "language_loss": 0.8175298, + "learning_rate": 4.356233533724829e-09, + "loss": 0.83208668, + "num_input_tokens_seen": 351597195, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.25439453, + "step": 16293, + "time_per_iteration": 2.7284018993377686 + }, + { + "auxiliary_loss_clip": 0.01250461, + "auxiliary_loss_mlp": 0.00226605, + "balance_loss_clip": 1.0285604, + "balance_loss_mlp": 0.19912696, + "epoch": 0.9796482789718923, + "flos": 28329928174080.0, + "grad_norm": 21.383546625368126, + "language_loss": 0.9059186, + "learning_rate": 4.330580212414503e-09, + "loss": 0.92068923, + "num_input_tokens_seen": 351617460, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.27490234, + "step": 16294, + "time_per_iteration": 2.744008779525757 + }, + { + "auxiliary_loss_clip": 0.01219376, + "auxiliary_loss_mlp": 0.00214979, + "balance_loss_clip": 1.01340175, + "balance_loss_mlp": 0.19052938, + "epoch": 0.9797084022245603, + "flos": 17967976450560.0, + "grad_norm": 87.98606562165072, + "language_loss": 0.80753756, + "learning_rate": 4.305002567088767e-09, + "loss": 0.82188118, + "num_input_tokens_seen": 351635900, + "router_z_loss_clip": 2.05859375, + "router_z_loss_mlp": 0.24462891, + "step": 16295, + "time_per_iteration": 2.7190592288970947 + }, + { + "auxiliary_loss_clip": 0.01254276, + "auxiliary_loss_mlp": 0.0022086, + "balance_loss_clip": 1.03292453, + "balance_loss_mlp": 0.19563575, + "epoch": 0.9797685254772284, + "flos": 20266726118400.0, + "grad_norm": 115.48868009921229, + "language_loss": 0.88724929, + "learning_rate": 4.2795005987170674e-09, + "loss": 0.90200067, + "num_input_tokens_seen": 351655400, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.25256348, + "step": 16296, + "time_per_iteration": 2.686901807785034 + }, + { + "auxiliary_loss_clip": 0.01240046, + "auxiliary_loss_mlp": 0.00215379, + "balance_loss_clip": 1.02332687, + "balance_loss_mlp": 0.19148964, + "epoch": 0.9798286487298963, + "flos": 26907147480960.0, + "grad_norm": 9.175257161792059, + "language_loss": 0.82807332, + "learning_rate": 4.254074308266853e-09, + "loss": 0.84262753, + "num_input_tokens_seen": 351675505, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.2388916, + "step": 16297, + "time_per_iteration": 2.735041856765747 + }, + { + "auxiliary_loss_clip": 0.01233555, + "auxiliary_loss_mlp": 0.0022653, + "balance_loss_clip": 1.0162859, + "balance_loss_mlp": 0.20086384, + "epoch": 0.9798887719825643, + "flos": 27161071701120.0, + "grad_norm": 21.97678809805317, + "language_loss": 0.85470533, + "learning_rate": 4.228723696702019e-09, + "loss": 0.86930621, + "num_input_tokens_seen": 351697920, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.25646973, + "step": 16298, + "time_per_iteration": 2.8203988075256348 + }, + { + "auxiliary_loss_clip": 0.01229653, + "auxiliary_loss_mlp": 0.0020252, + "balance_loss_clip": 1.0161252, + "balance_loss_mlp": 0.17894034, + "epoch": 0.9799488952352322, + "flos": 20668422890880.0, + "grad_norm": 839.7778042942178, + "language_loss": 0.80217326, + "learning_rate": 4.203448764984019e-09, + "loss": 0.81649506, + "num_input_tokens_seen": 351717615, + "router_z_loss_clip": 2.13574219, + "router_z_loss_mlp": 0.2355957, + "step": 16299, + "time_per_iteration": 2.688943862915039 + }, + { + "auxiliary_loss_clip": 0.01242693, + "auxiliary_loss_mlp": 0.00251464, + "balance_loss_clip": 1.02400422, + "balance_loss_mlp": 0.22595364, + "epoch": 0.9800090184879002, + "flos": 21981209160960.0, + "grad_norm": 87.30744216215206, + "language_loss": 0.96241319, + "learning_rate": 4.178249514071419e-09, + "loss": 0.97735476, + "num_input_tokens_seen": 351735260, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.25524902, + "step": 16300, + "time_per_iteration": 2.68994402885437 + }, + { + "auxiliary_loss_clip": 0.01264652, + "auxiliary_loss_mlp": 0.00248221, + "balance_loss_clip": 1.03562045, + "balance_loss_mlp": 0.22130369, + "epoch": 0.9800691417405681, + "flos": 21288420570240.0, + "grad_norm": 13.587509752242063, + "language_loss": 0.87498879, + "learning_rate": 4.1531259449194555e-09, + "loss": 0.89011753, + "num_input_tokens_seen": 351755800, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.26916504, + "step": 16301, + "time_per_iteration": 2.708635091781616 + }, + { + "auxiliary_loss_clip": 0.01246035, + "auxiliary_loss_mlp": 0.0020576, + "balance_loss_clip": 1.02700615, + "balance_loss_mlp": 0.18098858, + "epoch": 0.9801292649932362, + "flos": 18439878355200.0, + "grad_norm": 15.051102795327642, + "language_loss": 0.83959901, + "learning_rate": 4.128078058480921e-09, + "loss": 0.85411698, + "num_input_tokens_seen": 351774790, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.2479248, + "step": 16302, + "time_per_iteration": 2.7335031032562256 + }, + { + "auxiliary_loss_clip": 0.01247061, + "auxiliary_loss_mlp": 0.00221472, + "balance_loss_clip": 1.02949739, + "balance_loss_mlp": 0.19546065, + "epoch": 0.9801893882459041, + "flos": 25046364343680.0, + "grad_norm": 42.40916283866196, + "language_loss": 0.87303162, + "learning_rate": 4.103105855705724e-09, + "loss": 0.88771695, + "num_input_tokens_seen": 351792855, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.25976562, + "step": 16303, + "time_per_iteration": 2.6880338191986084 + }, + { + "auxiliary_loss_clip": 0.0124373, + "auxiliary_loss_mlp": 0.00225144, + "balance_loss_clip": 1.02299118, + "balance_loss_mlp": 0.19965683, + "epoch": 0.9802495114985721, + "flos": 18511484117760.0, + "grad_norm": 21.362129794492535, + "language_loss": 0.94276816, + "learning_rate": 4.078209337540883e-09, + "loss": 0.95745689, + "num_input_tokens_seen": 351811450, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.25488281, + "step": 16304, + "time_per_iteration": 2.7117700576782227 + }, + { + "auxiliary_loss_clip": 0.01211076, + "auxiliary_loss_mlp": 0.00213844, + "balance_loss_clip": 1.01137984, + "balance_loss_mlp": 0.19165912, + "epoch": 0.98030963475124, + "flos": 21469841187840.0, + "grad_norm": 7.513123927249419, + "language_loss": 0.76263905, + "learning_rate": 4.053388504930089e-09, + "loss": 0.77688825, + "num_input_tokens_seen": 351831960, + "router_z_loss_clip": 1.99609375, + "router_z_loss_mlp": 0.22192383, + "step": 16305, + "time_per_iteration": 2.718914747238159 + }, + { + "auxiliary_loss_clip": 0.012406, + "auxiliary_loss_mlp": 0.00219112, + "balance_loss_clip": 1.0252192, + "balance_loss_mlp": 0.19374451, + "epoch": 0.980369758003908, + "flos": 20412272027520.0, + "grad_norm": 19.310188764008878, + "language_loss": 0.81420398, + "learning_rate": 4.028643358815032e-09, + "loss": 0.82880116, + "num_input_tokens_seen": 351851585, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.25390625, + "step": 16306, + "time_per_iteration": 4.041212320327759 + }, + { + "auxiliary_loss_clip": 0.0121446, + "auxiliary_loss_mlp": 0.00213713, + "balance_loss_clip": 1.00596571, + "balance_loss_mlp": 0.19091991, + "epoch": 0.9804298812565759, + "flos": 23399177431680.0, + "grad_norm": 22.864723072124242, + "language_loss": 0.79563648, + "learning_rate": 4.00397390013385e-09, + "loss": 0.80991817, + "num_input_tokens_seen": 351871085, + "router_z_loss_clip": 2.0859375, + "router_z_loss_mlp": 0.22802734, + "step": 16307, + "time_per_iteration": 2.7245934009552 + }, + { + "auxiliary_loss_clip": 0.01208774, + "auxiliary_loss_mlp": 0.00230744, + "balance_loss_clip": 1.00564742, + "balance_loss_mlp": 0.20781967, + "epoch": 0.980490004509244, + "flos": 23292666627840.0, + "grad_norm": 9.04174134322885, + "language_loss": 0.79470301, + "learning_rate": 3.979380129822018e-09, + "loss": 0.80909818, + "num_input_tokens_seen": 351891775, + "router_z_loss_clip": 2.03417969, + "router_z_loss_mlp": 0.22912598, + "step": 16308, + "time_per_iteration": 4.273146152496338 + }, + { + "auxiliary_loss_clip": 0.01080238, + "auxiliary_loss_mlp": 0.00062446, + "balance_loss_clip": 0.94224513, + "balance_loss_mlp": 0.05610429, + "epoch": 0.980550127761912, + "flos": 56051027798400.0, + "grad_norm": 0.7371754450680106, + "language_loss": 0.56852931, + "learning_rate": 3.954862048811902e-09, + "loss": 0.57995617, + "num_input_tokens_seen": 351946770, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.06347656, + "step": 16309, + "time_per_iteration": 3.0559093952178955 + }, + { + "auxiliary_loss_clip": 0.01223614, + "auxiliary_loss_mlp": 0.00192915, + "balance_loss_clip": 1.01148129, + "balance_loss_mlp": 0.16807216, + "epoch": 0.9806102510145799, + "flos": 25333290184320.0, + "grad_norm": 54.04474305867808, + "language_loss": 0.76143271, + "learning_rate": 3.930419658033646e-09, + "loss": 0.77559799, + "num_input_tokens_seen": 351966155, + "router_z_loss_clip": 2.12597656, + "router_z_loss_mlp": 0.24841309, + "step": 16310, + "time_per_iteration": 2.7110953330993652 + }, + { + "auxiliary_loss_clip": 0.01080591, + "auxiliary_loss_mlp": 0.00054102, + "balance_loss_clip": 0.94453526, + "balance_loss_mlp": 0.04799844, + "epoch": 0.9806703742672479, + "flos": 67274837429760.0, + "grad_norm": 0.8007131983654494, + "language_loss": 0.54008162, + "learning_rate": 3.906052958413841e-09, + "loss": 0.55142856, + "num_input_tokens_seen": 352031655, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.06103516, + "step": 16311, + "time_per_iteration": 3.1736879348754883 + }, + { + "auxiliary_loss_clip": 0.01236506, + "auxiliary_loss_mlp": 0.00207719, + "balance_loss_clip": 1.02348828, + "balance_loss_mlp": 0.18295926, + "epoch": 0.9807304975199158, + "flos": 25228970110080.0, + "grad_norm": 2208.536004938229, + "language_loss": 0.86248791, + "learning_rate": 3.881761950876638e-09, + "loss": 0.87693012, + "num_input_tokens_seen": 352051920, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.24743652, + "step": 16312, + "time_per_iteration": 2.6949682235717773 + }, + { + "auxiliary_loss_clip": 0.01215149, + "auxiliary_loss_mlp": 0.00205405, + "balance_loss_clip": 1.01005936, + "balance_loss_mlp": 0.1827555, + "epoch": 0.9807906207725838, + "flos": 17456392995840.0, + "grad_norm": 22.853361168194095, + "language_loss": 0.72337198, + "learning_rate": 3.8575466363430785e-09, + "loss": 0.73757756, + "num_input_tokens_seen": 352069315, + "router_z_loss_clip": 2.04980469, + "router_z_loss_mlp": 0.22668457, + "step": 16313, + "time_per_iteration": 2.606459379196167 + }, + { + "auxiliary_loss_clip": 0.01230707, + "auxiliary_loss_mlp": 0.0021193, + "balance_loss_clip": 1.0173384, + "balance_loss_mlp": 0.1884577, + "epoch": 0.9808507440252517, + "flos": 21032413361280.0, + "grad_norm": 45.61679953300054, + "language_loss": 0.83077353, + "learning_rate": 3.833407015731316e-09, + "loss": 0.84519988, + "num_input_tokens_seen": 352089480, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.23461914, + "step": 16314, + "time_per_iteration": 2.6594932079315186 + }, + { + "auxiliary_loss_clip": 0.01082003, + "auxiliary_loss_mlp": 0.00070798, + "balance_loss_clip": 0.94410026, + "balance_loss_mlp": 0.06417038, + "epoch": 0.9809108672779198, + "flos": 64044491598720.0, + "grad_norm": 0.7043468409988751, + "language_loss": 0.51217192, + "learning_rate": 3.80934308995684e-09, + "loss": 0.52369994, + "num_input_tokens_seen": 352150000, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.06640625, + "step": 16315, + "time_per_iteration": 3.164409637451172 + }, + { + "auxiliary_loss_clip": 0.01241685, + "auxiliary_loss_mlp": 0.00218571, + "balance_loss_clip": 1.02328873, + "balance_loss_mlp": 0.19409713, + "epoch": 0.9809709905305877, + "flos": 22780616296320.0, + "grad_norm": 12.533499791220915, + "language_loss": 0.75832176, + "learning_rate": 3.785354859932033e-09, + "loss": 0.7729243, + "num_input_tokens_seen": 352170990, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.24499512, + "step": 16316, + "time_per_iteration": 4.140840768814087 + }, + { + "auxiliary_loss_clip": 0.01234327, + "auxiliary_loss_mlp": 0.0022274, + "balance_loss_clip": 1.01755857, + "balance_loss_mlp": 0.19740823, + "epoch": 0.9810311137832557, + "flos": 37013415217920.0, + "grad_norm": 12.389761143328121, + "language_loss": 0.63903576, + "learning_rate": 3.76144232656661e-09, + "loss": 0.65360641, + "num_input_tokens_seen": 352195335, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.25354004, + "step": 16317, + "time_per_iteration": 4.228930234909058 + }, + { + "auxiliary_loss_clip": 0.01227885, + "auxiliary_loss_mlp": 0.00219988, + "balance_loss_clip": 1.01652622, + "balance_loss_mlp": 0.19665906, + "epoch": 0.9810912370359236, + "flos": 18916305373440.0, + "grad_norm": 146.41845677356477, + "language_loss": 0.81286526, + "learning_rate": 3.737605490767404e-09, + "loss": 0.82734406, + "num_input_tokens_seen": 352214170, + "router_z_loss_clip": 2.11621094, + "router_z_loss_mlp": 0.2331543, + "step": 16318, + "time_per_iteration": 2.7606089115142822 + }, + { + "auxiliary_loss_clip": 0.01235807, + "auxiliary_loss_mlp": 0.00213653, + "balance_loss_clip": 1.01957774, + "balance_loss_mlp": 0.18811798, + "epoch": 0.9811513602885916, + "flos": 18441602208000.0, + "grad_norm": 21.68365443455554, + "language_loss": 0.90730274, + "learning_rate": 3.7138443534383555e-09, + "loss": 0.92179728, + "num_input_tokens_seen": 352231470, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.25524902, + "step": 16319, + "time_per_iteration": 2.6557388305664062 + }, + { + "auxiliary_loss_clip": 0.01079175, + "auxiliary_loss_mlp": 0.00078291, + "balance_loss_clip": 0.94265497, + "balance_loss_mlp": 0.07233032, + "epoch": 0.9812114835412595, + "flos": 68058945371520.0, + "grad_norm": 0.7010632284204807, + "language_loss": 0.52835375, + "learning_rate": 3.6901589154803014e-09, + "loss": 0.53992844, + "num_input_tokens_seen": 352291770, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.05957031, + "step": 16320, + "time_per_iteration": 3.058997869491577 + }, + { + "auxiliary_loss_clip": 0.01244882, + "auxiliary_loss_mlp": 0.00237713, + "balance_loss_clip": 1.02633214, + "balance_loss_mlp": 0.21221446, + "epoch": 0.9812716067939276, + "flos": 25373007648000.0, + "grad_norm": 30.879759936882248, + "language_loss": 0.82124043, + "learning_rate": 3.6665491777914116e-09, + "loss": 0.83606637, + "num_input_tokens_seen": 352310735, + "router_z_loss_clip": 2.18457031, + "router_z_loss_mlp": 0.25500488, + "step": 16321, + "time_per_iteration": 2.746926784515381 + }, + { + "auxiliary_loss_clip": 0.01236353, + "auxiliary_loss_mlp": 0.00196979, + "balance_loss_clip": 1.02712727, + "balance_loss_mlp": 0.17367369, + "epoch": 0.9813317300465956, + "flos": 22856818999680.0, + "grad_norm": 19.959766840070778, + "language_loss": 0.89244521, + "learning_rate": 3.6430151412669698e-09, + "loss": 0.90677851, + "num_input_tokens_seen": 352329545, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.2331543, + "step": 16322, + "time_per_iteration": 2.8419382572174072 + }, + { + "auxiliary_loss_clip": 0.01237849, + "auxiliary_loss_mlp": 0.00229083, + "balance_loss_clip": 1.01867974, + "balance_loss_mlp": 0.20388207, + "epoch": 0.9813918532992635, + "flos": 23586954756480.0, + "grad_norm": 18.700631654211502, + "language_loss": 0.86929464, + "learning_rate": 3.619556806799595e-09, + "loss": 0.88396394, + "num_input_tokens_seen": 352352080, + "router_z_loss_clip": 2.19042969, + "router_z_loss_mlp": 0.2520752, + "step": 16323, + "time_per_iteration": 2.7461953163146973 + }, + { + "auxiliary_loss_clip": 0.01263506, + "auxiliary_loss_mlp": 0.00221154, + "balance_loss_clip": 1.03520489, + "balance_loss_mlp": 0.1936402, + "epoch": 0.9814519765519315, + "flos": 19606328616960.0, + "grad_norm": 14.959315904548063, + "language_loss": 0.94089782, + "learning_rate": 3.596174175278799e-09, + "loss": 0.9557445, + "num_input_tokens_seen": 352366455, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.27539062, + "step": 16324, + "time_per_iteration": 2.6415398120880127 + }, + { + "auxiliary_loss_clip": 0.01236732, + "auxiliary_loss_mlp": 0.00222467, + "balance_loss_clip": 1.02205646, + "balance_loss_mlp": 0.19825545, + "epoch": 0.9815120998045994, + "flos": 33946284787200.0, + "grad_norm": 14.29343369702156, + "language_loss": 0.81294191, + "learning_rate": 3.5728672475909827e-09, + "loss": 0.8275339, + "num_input_tokens_seen": 352386090, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.24243164, + "step": 16325, + "time_per_iteration": 2.819620370864868 + }, + { + "auxiliary_loss_clip": 0.01213865, + "auxiliary_loss_mlp": 0.00212462, + "balance_loss_clip": 1.00723839, + "balance_loss_mlp": 0.18953855, + "epoch": 0.9815722230572674, + "flos": 20850023076480.0, + "grad_norm": 7.202271795796581, + "language_loss": 0.8270641, + "learning_rate": 3.5496360246201063e-09, + "loss": 0.84132737, + "num_input_tokens_seen": 352404000, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.22912598, + "step": 16326, + "time_per_iteration": 2.61122989654541 + }, + { + "auxiliary_loss_clip": 0.01263894, + "auxiliary_loss_mlp": 0.0024433, + "balance_loss_clip": 1.03792715, + "balance_loss_mlp": 0.21905762, + "epoch": 0.9816323463099353, + "flos": 22894525301760.0, + "grad_norm": 102.96195625050345, + "language_loss": 0.76130801, + "learning_rate": 3.5264805072470205e-09, + "loss": 0.77639019, + "num_input_tokens_seen": 352423540, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.25280762, + "step": 16327, + "time_per_iteration": 2.70525860786438 + }, + { + "auxiliary_loss_clip": 0.01253112, + "auxiliary_loss_mlp": 0.00232144, + "balance_loss_clip": 1.02798676, + "balance_loss_mlp": 0.20489277, + "epoch": 0.9816924695626034, + "flos": 31539444117120.0, + "grad_norm": 7.154738555415265, + "language_loss": 0.80570781, + "learning_rate": 3.5034006963501337e-09, + "loss": 0.82056034, + "num_input_tokens_seen": 352445530, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.27246094, + "step": 16328, + "time_per_iteration": 2.746405839920044 + }, + { + "auxiliary_loss_clip": 0.0126981, + "auxiliary_loss_mlp": 0.00246829, + "balance_loss_clip": 1.04041338, + "balance_loss_mlp": 0.21916078, + "epoch": 0.9817525928152713, + "flos": 21506901045120.0, + "grad_norm": 171.4038196051224, + "language_loss": 0.89174187, + "learning_rate": 3.4803965928040802e-09, + "loss": 0.90690833, + "num_input_tokens_seen": 352466325, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.2767334, + "step": 16329, + "time_per_iteration": 2.731083631515503 + }, + { + "auxiliary_loss_clip": 0.01259267, + "auxiliary_loss_mlp": 0.00241021, + "balance_loss_clip": 1.03313494, + "balance_loss_mlp": 0.21546285, + "epoch": 0.9818127160679393, + "flos": 25550513683200.0, + "grad_norm": 35.431375575714895, + "language_loss": 0.85334426, + "learning_rate": 3.4574681974817168e-09, + "loss": 0.86834717, + "num_input_tokens_seen": 352485505, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.25598145, + "step": 16330, + "time_per_iteration": 2.691133737564087 + }, + { + "auxiliary_loss_clip": 0.01296083, + "auxiliary_loss_mlp": 0.00237256, + "balance_loss_clip": 1.05521393, + "balance_loss_mlp": 0.20726249, + "epoch": 0.9818728393206072, + "flos": 28803661672320.0, + "grad_norm": 2.8907639717446214, + "language_loss": 0.77325886, + "learning_rate": 3.434615511252126e-09, + "loss": 0.78859228, + "num_input_tokens_seen": 352505360, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.29980469, + "step": 16331, + "time_per_iteration": 2.765388250350952 + }, + { + "auxiliary_loss_clip": 0.01240375, + "auxiliary_loss_mlp": 0.00227066, + "balance_loss_clip": 1.02531087, + "balance_loss_mlp": 0.20246109, + "epoch": 0.9819329625732752, + "flos": 23222246014080.0, + "grad_norm": 21.506227201949756, + "language_loss": 0.80609918, + "learning_rate": 3.411838534981948e-09, + "loss": 0.8207736, + "num_input_tokens_seen": 352524035, + "router_z_loss_clip": 2.14941406, + "router_z_loss_mlp": 0.24609375, + "step": 16332, + "time_per_iteration": 2.7377254962921143 + }, + { + "auxiliary_loss_clip": 0.01232305, + "auxiliary_loss_mlp": 0.00223859, + "balance_loss_clip": 1.0164001, + "balance_loss_mlp": 0.19987422, + "epoch": 0.9819930858259431, + "flos": 17530440883200.0, + "grad_norm": 8.407458066707925, + "language_loss": 0.84010047, + "learning_rate": 3.389137269534936e-09, + "loss": 0.85466212, + "num_input_tokens_seen": 352543210, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.23986816, + "step": 16333, + "time_per_iteration": 2.620948553085327 + }, + { + "auxiliary_loss_clip": 0.01245366, + "auxiliary_loss_mlp": 0.00214521, + "balance_loss_clip": 1.02265418, + "balance_loss_mlp": 0.18903369, + "epoch": 0.9820532090786112, + "flos": 12529915971840.0, + "grad_norm": 5.518848978105254, + "language_loss": 0.82783538, + "learning_rate": 3.366511715771958e-09, + "loss": 0.84243429, + "num_input_tokens_seen": 352559770, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.25524902, + "step": 16334, + "time_per_iteration": 2.6263182163238525 + }, + { + "auxiliary_loss_clip": 0.01252203, + "auxiliary_loss_mlp": 0.00220835, + "balance_loss_clip": 1.03075576, + "balance_loss_mlp": 0.1945134, + "epoch": 0.9821133323312792, + "flos": 18840174497280.0, + "grad_norm": 4.876408395253795, + "language_loss": 0.85732037, + "learning_rate": 3.3439618745509934e-09, + "loss": 0.87205076, + "num_input_tokens_seen": 352577690, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.26367188, + "step": 16335, + "time_per_iteration": 2.6166598796844482 + }, + { + "auxiliary_loss_clip": 0.0126168, + "auxiliary_loss_mlp": 0.00249664, + "balance_loss_clip": 1.03634477, + "balance_loss_mlp": 0.22168523, + "epoch": 0.9821734555839471, + "flos": 34824013528320.0, + "grad_norm": 2.5759403515421435, + "language_loss": 0.74387151, + "learning_rate": 3.3214877467271362e-09, + "loss": 0.75898492, + "num_input_tokens_seen": 352598850, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.27990723, + "step": 16336, + "time_per_iteration": 2.7779881954193115 + }, + { + "auxiliary_loss_clip": 0.01245986, + "auxiliary_loss_mlp": 0.00249131, + "balance_loss_clip": 1.01958132, + "balance_loss_mlp": 0.22153422, + "epoch": 0.9822335788366151, + "flos": 17128169493120.0, + "grad_norm": 14.197571501863143, + "language_loss": 0.82151115, + "learning_rate": 3.299089333152372e-09, + "loss": 0.83646238, + "num_input_tokens_seen": 352616130, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.27612305, + "step": 16337, + "time_per_iteration": 2.624941349029541 + }, + { + "auxiliary_loss_clip": 0.01243806, + "auxiliary_loss_mlp": 0.00217311, + "balance_loss_clip": 1.02017057, + "balance_loss_mlp": 0.19228876, + "epoch": 0.982293702089283, + "flos": 20813250528000.0, + "grad_norm": 15.193677770722369, + "language_loss": 0.81189036, + "learning_rate": 3.2767666346764645e-09, + "loss": 0.82650149, + "num_input_tokens_seen": 352636885, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.25048828, + "step": 16338, + "time_per_iteration": 2.6886606216430664 + }, + { + "auxiliary_loss_clip": 0.01251522, + "auxiliary_loss_mlp": 0.00232878, + "balance_loss_clip": 1.02984822, + "balance_loss_mlp": 0.2077252, + "epoch": 0.982353825341951, + "flos": 24680829588480.0, + "grad_norm": 12.210128205471857, + "language_loss": 0.88270968, + "learning_rate": 3.2545196521454045e-09, + "loss": 0.89755368, + "num_input_tokens_seen": 352657905, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.25170898, + "step": 16339, + "time_per_iteration": 2.670863151550293 + }, + { + "auxiliary_loss_clip": 0.0121986, + "auxiliary_loss_mlp": 0.00224441, + "balance_loss_clip": 1.0133723, + "balance_loss_mlp": 0.20196998, + "epoch": 0.982413948594619, + "flos": 20850489953280.0, + "grad_norm": 10.276968412153517, + "language_loss": 0.69490111, + "learning_rate": 3.232348386403405e-09, + "loss": 0.70934415, + "num_input_tokens_seen": 352676320, + "router_z_loss_clip": 2.06347656, + "router_z_loss_mlp": 0.22473145, + "step": 16340, + "time_per_iteration": 2.6659348011016846 + }, + { + "auxiliary_loss_clip": 0.01245047, + "auxiliary_loss_mlp": 0.00208425, + "balance_loss_clip": 1.02694941, + "balance_loss_mlp": 0.18374825, + "epoch": 0.982474071847287, + "flos": 15377380778880.0, + "grad_norm": 3.147864296772023, + "language_loss": 0.95151114, + "learning_rate": 3.2102528382904613e-09, + "loss": 0.96604586, + "num_input_tokens_seen": 352692665, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.24694824, + "step": 16341, + "time_per_iteration": 2.657302141189575 + }, + { + "auxiliary_loss_clip": 0.01225258, + "auxiliary_loss_mlp": 0.00217056, + "balance_loss_clip": 1.01311612, + "balance_loss_mlp": 0.19285697, + "epoch": 0.9825341950999549, + "flos": 23774732081280.0, + "grad_norm": 15.50509299432167, + "language_loss": 0.73085439, + "learning_rate": 3.188233008645014e-09, + "loss": 0.74527764, + "num_input_tokens_seen": 352716130, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.24194336, + "step": 16342, + "time_per_iteration": 2.7434628009796143 + }, + { + "auxiliary_loss_clip": 0.01244461, + "auxiliary_loss_mlp": 0.00241829, + "balance_loss_clip": 1.02554965, + "balance_loss_mlp": 0.21605608, + "epoch": 0.9825943183526229, + "flos": 22746285872640.0, + "grad_norm": 38.01637448040974, + "language_loss": 0.83291149, + "learning_rate": 3.16628889830195e-09, + "loss": 0.84777439, + "num_input_tokens_seen": 352734705, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.25769043, + "step": 16343, + "time_per_iteration": 2.646665573120117 + }, + { + "auxiliary_loss_clip": 0.01219397, + "auxiliary_loss_mlp": 0.00203439, + "balance_loss_clip": 1.01299357, + "balance_loss_mlp": 0.18134905, + "epoch": 0.9826544416052908, + "flos": 27709966408320.0, + "grad_norm": 3.994293756621373, + "language_loss": 0.82596111, + "learning_rate": 3.1444205080932707e-09, + "loss": 0.84018952, + "num_input_tokens_seen": 352756225, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.22094727, + "step": 16344, + "time_per_iteration": 2.766458749771118 + }, + { + "auxiliary_loss_clip": 0.01250234, + "auxiliary_loss_mlp": 0.00219297, + "balance_loss_clip": 1.03626585, + "balance_loss_mlp": 0.19472775, + "epoch": 0.9827145648579588, + "flos": 26941657472640.0, + "grad_norm": 39.130770738517015, + "language_loss": 0.75858271, + "learning_rate": 3.122627838848313e-09, + "loss": 0.773278, + "num_input_tokens_seen": 352776210, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.24560547, + "step": 16345, + "time_per_iteration": 2.762996196746826 + }, + { + "auxiliary_loss_clip": 0.01212923, + "auxiliary_loss_mlp": 0.00202, + "balance_loss_clip": 1.00766456, + "balance_loss_mlp": 0.1808878, + "epoch": 0.9827746881106267, + "flos": 21866545969920.0, + "grad_norm": 5.967524072558988, + "language_loss": 0.84543037, + "learning_rate": 3.1009108913933045e-09, + "loss": 0.85957962, + "num_input_tokens_seen": 352795455, + "router_z_loss_clip": 2.04980469, + "router_z_loss_mlp": 0.21118164, + "step": 16346, + "time_per_iteration": 2.715184450149536 + }, + { + "auxiliary_loss_clip": 0.01274955, + "auxiliary_loss_mlp": 0.00231706, + "balance_loss_clip": 1.04665446, + "balance_loss_mlp": 0.20503844, + "epoch": 0.9828348113632948, + "flos": 20850777262080.0, + "grad_norm": 50.7969416860526, + "language_loss": 0.84999108, + "learning_rate": 3.079269666552031e-09, + "loss": 0.86505765, + "num_input_tokens_seen": 352812895, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.26660156, + "step": 16347, + "time_per_iteration": 2.6930296421051025 + }, + { + "auxiliary_loss_clip": 0.01221812, + "auxiliary_loss_mlp": 0.00207935, + "balance_loss_clip": 1.01170981, + "balance_loss_mlp": 0.18398547, + "epoch": 0.9828949346159628, + "flos": 34569227381760.0, + "grad_norm": 8.486976570000698, + "language_loss": 0.74149811, + "learning_rate": 3.0577041651449474e-09, + "loss": 0.7557956, + "num_input_tokens_seen": 352835470, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.23950195, + "step": 16348, + "time_per_iteration": 4.204597473144531 + }, + { + "auxiliary_loss_clip": 0.01243793, + "auxiliary_loss_mlp": 0.00217941, + "balance_loss_clip": 1.02624822, + "balance_loss_mlp": 0.19313323, + "epoch": 0.9829550578686307, + "flos": 24457464864000.0, + "grad_norm": 3.6578316268219684, + "language_loss": 0.78743058, + "learning_rate": 3.0362143879898437e-09, + "loss": 0.80204791, + "num_input_tokens_seen": 352854295, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.24816895, + "step": 16349, + "time_per_iteration": 2.7074873447418213 + }, + { + "auxiliary_loss_clip": 0.01215515, + "auxiliary_loss_mlp": 0.00210159, + "balance_loss_clip": 1.00628459, + "balance_loss_mlp": 0.18656704, + "epoch": 0.9830151811212987, + "flos": 16910084067840.0, + "grad_norm": 2.0708150125518796, + "language_loss": 0.83197749, + "learning_rate": 3.0148003359014018e-09, + "loss": 0.8462342, + "num_input_tokens_seen": 352869695, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.23583984, + "step": 16350, + "time_per_iteration": 4.074397563934326 + }, + { + "auxiliary_loss_clip": 0.01245995, + "auxiliary_loss_mlp": 0.00237886, + "balance_loss_clip": 1.02479041, + "balance_loss_mlp": 0.21173169, + "epoch": 0.9830753043739666, + "flos": 21288312829440.0, + "grad_norm": 21.722559710679455, + "language_loss": 0.91796279, + "learning_rate": 2.9934620096920826e-09, + "loss": 0.9328016, + "num_input_tokens_seen": 352887430, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.26147461, + "step": 16351, + "time_per_iteration": 2.7260775566101074 + }, + { + "auxiliary_loss_clip": 0.01227039, + "auxiliary_loss_mlp": 0.00203054, + "balance_loss_clip": 1.01573586, + "balance_loss_mlp": 0.17912897, + "epoch": 0.9831354276266346, + "flos": 31723522341120.0, + "grad_norm": 2.5116152081205105, + "language_loss": 0.74231958, + "learning_rate": 2.972199410170795e-09, + "loss": 0.75662053, + "num_input_tokens_seen": 352907555, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.23937988, + "step": 16352, + "time_per_iteration": 2.7652695178985596 + }, + { + "auxiliary_loss_clip": 0.01233004, + "auxiliary_loss_mlp": 0.00200484, + "balance_loss_clip": 1.01856577, + "balance_loss_mlp": 0.17750092, + "epoch": 0.9831955508793025, + "flos": 21619050284160.0, + "grad_norm": 9.688368469649223, + "language_loss": 0.72848034, + "learning_rate": 2.951012538143782e-09, + "loss": 0.74281526, + "num_input_tokens_seen": 352928670, + "router_z_loss_clip": 2.14746094, + "router_z_loss_mlp": 0.2298584, + "step": 16353, + "time_per_iteration": 2.7003748416900635 + }, + { + "auxiliary_loss_clip": 0.0121213, + "auxiliary_loss_mlp": 0.00189307, + "balance_loss_clip": 1.00498295, + "balance_loss_mlp": 0.16740791, + "epoch": 0.9832556741319706, + "flos": 22968214053120.0, + "grad_norm": 20.167362215565937, + "language_loss": 0.80905867, + "learning_rate": 2.9299013944144025e-09, + "loss": 0.82307303, + "num_input_tokens_seen": 352948345, + "router_z_loss_clip": 2.07324219, + "router_z_loss_mlp": 0.21899414, + "step": 16354, + "time_per_iteration": 2.7592215538024902 + }, + { + "auxiliary_loss_clip": 0.01222032, + "auxiliary_loss_mlp": 0.00205012, + "balance_loss_clip": 1.01194692, + "balance_loss_mlp": 0.18170664, + "epoch": 0.9833157973846385, + "flos": 21323900229120.0, + "grad_norm": 3.6719376992198223, + "language_loss": 0.86134505, + "learning_rate": 2.9088659797835702e-09, + "loss": 0.87561554, + "num_input_tokens_seen": 352967250, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.23291016, + "step": 16355, + "time_per_iteration": 2.705031633377075 + }, + { + "auxiliary_loss_clip": 0.01219229, + "auxiliary_loss_mlp": 0.00218532, + "balance_loss_clip": 1.01339531, + "balance_loss_mlp": 0.1948213, + "epoch": 0.9833759206373065, + "flos": 21068719032960.0, + "grad_norm": 6.4160101369193825, + "language_loss": 0.79746854, + "learning_rate": 2.8879062950484256e-09, + "loss": 0.81184614, + "num_input_tokens_seen": 352984725, + "router_z_loss_clip": 2.05957031, + "router_z_loss_mlp": 0.23730469, + "step": 16356, + "time_per_iteration": 2.6544947624206543 + }, + { + "auxiliary_loss_clip": 0.01233073, + "auxiliary_loss_mlp": 0.00232559, + "balance_loss_clip": 1.02042949, + "balance_loss_mlp": 0.20752501, + "epoch": 0.9834360438899744, + "flos": 18697322108160.0, + "grad_norm": 16.536782786641233, + "language_loss": 0.83779532, + "learning_rate": 2.8670223410041104e-09, + "loss": 0.85245162, + "num_input_tokens_seen": 353003480, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.25024414, + "step": 16357, + "time_per_iteration": 2.7038164138793945 + }, + { + "auxiliary_loss_clip": 0.01241755, + "auxiliary_loss_mlp": 0.00248286, + "balance_loss_clip": 1.02764702, + "balance_loss_mlp": 0.22298992, + "epoch": 0.9834961671426424, + "flos": 21105240186240.0, + "grad_norm": 235.81257248650513, + "language_loss": 0.88934773, + "learning_rate": 2.846214118442436e-09, + "loss": 0.90424818, + "num_input_tokens_seen": 353021425, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.25305176, + "step": 16358, + "time_per_iteration": 4.22212553024292 + }, + { + "auxiliary_loss_clip": 0.01239966, + "auxiliary_loss_mlp": 0.00219385, + "balance_loss_clip": 1.02906156, + "balance_loss_mlp": 0.19585294, + "epoch": 0.9835562903953103, + "flos": 26687625511680.0, + "grad_norm": 5.440816572205234, + "language_loss": 0.78122157, + "learning_rate": 2.8254816281523263e-09, + "loss": 0.79581505, + "num_input_tokens_seen": 353039870, + "router_z_loss_clip": 2.11230469, + "router_z_loss_mlp": 0.23535156, + "step": 16359, + "time_per_iteration": 4.180644989013672 + }, + { + "auxiliary_loss_clip": 0.01228857, + "auxiliary_loss_mlp": 0.00235393, + "balance_loss_clip": 1.01674294, + "balance_loss_mlp": 0.20985827, + "epoch": 0.9836164136479784, + "flos": 22090162089600.0, + "grad_norm": 2.292414177773084, + "language_loss": 0.75498801, + "learning_rate": 2.804824870920264e-09, + "loss": 0.76963055, + "num_input_tokens_seen": 353059750, + "router_z_loss_clip": 2.11621094, + "router_z_loss_mlp": 0.25549316, + "step": 16360, + "time_per_iteration": 2.7426106929779053 + }, + { + "auxiliary_loss_clip": 0.01238765, + "auxiliary_loss_mlp": 0.00217568, + "balance_loss_clip": 1.02384329, + "balance_loss_mlp": 0.19169971, + "epoch": 0.9836765369006463, + "flos": 23878405710720.0, + "grad_norm": 6.291054791336306, + "language_loss": 0.9167968, + "learning_rate": 2.7842438475293996e-09, + "loss": 0.93136013, + "num_input_tokens_seen": 353079940, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.25891113, + "step": 16361, + "time_per_iteration": 2.860017776489258 + }, + { + "auxiliary_loss_clip": 0.01231205, + "auxiliary_loss_mlp": 0.00228539, + "balance_loss_clip": 1.01747155, + "balance_loss_mlp": 0.20475662, + "epoch": 0.9837366601533143, + "flos": 25845017293440.0, + "grad_norm": 237.14876962525628, + "language_loss": 0.83254361, + "learning_rate": 2.76373855876022e-09, + "loss": 0.84714115, + "num_input_tokens_seen": 353099990, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.23791504, + "step": 16362, + "time_per_iteration": 2.75726056098938 + }, + { + "auxiliary_loss_clip": 0.01224876, + "auxiliary_loss_mlp": 0.00231932, + "balance_loss_clip": 1.01480067, + "balance_loss_mlp": 0.2071601, + "epoch": 0.9837967834059823, + "flos": 21358015171200.0, + "grad_norm": 6.380756521277208, + "language_loss": 0.80325198, + "learning_rate": 2.7433090053901043e-09, + "loss": 0.81782001, + "num_input_tokens_seen": 353118710, + "router_z_loss_clip": 2.09667969, + "router_z_loss_mlp": 0.24780273, + "step": 16363, + "time_per_iteration": 2.7395401000976562 + }, + { + "auxiliary_loss_clip": 0.01220073, + "auxiliary_loss_mlp": 0.00230602, + "balance_loss_clip": 1.01297128, + "balance_loss_mlp": 0.20687929, + "epoch": 0.9838569066586502, + "flos": 18515793749760.0, + "grad_norm": 19.961129288573545, + "language_loss": 0.70281357, + "learning_rate": 2.7229551881937653e-09, + "loss": 0.71732032, + "num_input_tokens_seen": 353136415, + "router_z_loss_clip": 2.0703125, + "router_z_loss_mlp": 0.23730469, + "step": 16364, + "time_per_iteration": 2.6733365058898926 + }, + { + "auxiliary_loss_clip": 0.01227677, + "auxiliary_loss_mlp": 0.00204121, + "balance_loss_clip": 1.01393747, + "balance_loss_mlp": 0.17994547, + "epoch": 0.9839170299113182, + "flos": 22452392793600.0, + "grad_norm": 5.856562579590682, + "language_loss": 0.83391511, + "learning_rate": 2.702677107943252e-09, + "loss": 0.8482331, + "num_input_tokens_seen": 353154650, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.24182129, + "step": 16365, + "time_per_iteration": 2.728226661682129 + }, + { + "auxiliary_loss_clip": 0.01243391, + "auxiliary_loss_mlp": 0.00211899, + "balance_loss_clip": 1.02778256, + "balance_loss_mlp": 0.18830763, + "epoch": 0.9839771531639862, + "flos": 27892320779520.0, + "grad_norm": 15.086750701295855, + "language_loss": 0.84841496, + "learning_rate": 2.6824747654072832e-09, + "loss": 0.86296785, + "num_input_tokens_seen": 353174065, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.23596191, + "step": 16366, + "time_per_iteration": 2.785456895828247 + }, + { + "auxiliary_loss_clip": 0.01230742, + "auxiliary_loss_mlp": 0.00205767, + "balance_loss_clip": 1.01512122, + "balance_loss_mlp": 0.1818534, + "epoch": 0.9840372764166542, + "flos": 28214510797440.0, + "grad_norm": 3.725087710940824, + "language_loss": 0.81667161, + "learning_rate": 2.662348161352357e-09, + "loss": 0.83103669, + "num_input_tokens_seen": 353193560, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.23901367, + "step": 16367, + "time_per_iteration": 2.793383836746216 + }, + { + "auxiliary_loss_clip": 0.01230698, + "auxiliary_loss_mlp": 0.00236326, + "balance_loss_clip": 1.01910424, + "balance_loss_mlp": 0.2113757, + "epoch": 0.9840973996693221, + "flos": 23403989854080.0, + "grad_norm": 27.607646312335376, + "language_loss": 0.67451012, + "learning_rate": 2.642297296540974e-09, + "loss": 0.68918037, + "num_input_tokens_seen": 353213525, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.24951172, + "step": 16368, + "time_per_iteration": 2.793736696243286 + }, + { + "auxiliary_loss_clip": 0.01220354, + "auxiliary_loss_mlp": 0.00210497, + "balance_loss_clip": 1.01062846, + "balance_loss_mlp": 0.18648843, + "epoch": 0.9841575229219901, + "flos": 21395865127680.0, + "grad_norm": 212.85127187963755, + "language_loss": 0.71559489, + "learning_rate": 2.6223221717340816e-09, + "loss": 0.7299034, + "num_input_tokens_seen": 353234000, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.24023438, + "step": 16369, + "time_per_iteration": 2.7768707275390625 + }, + { + "auxiliary_loss_clip": 0.01245788, + "auxiliary_loss_mlp": 0.00238946, + "balance_loss_clip": 1.02633893, + "balance_loss_mlp": 0.21106303, + "epoch": 0.984217646174658, + "flos": 24464072966400.0, + "grad_norm": 990.577907218728, + "language_loss": 0.75038683, + "learning_rate": 2.6024227876886295e-09, + "loss": 0.76523423, + "num_input_tokens_seen": 353254940, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.27893066, + "step": 16370, + "time_per_iteration": 2.7357654571533203 + }, + { + "auxiliary_loss_clip": 0.01233245, + "auxiliary_loss_mlp": 0.00221703, + "balance_loss_clip": 1.01991487, + "balance_loss_mlp": 0.19703819, + "epoch": 0.984277769427326, + "flos": 16435057680000.0, + "grad_norm": 10.316440628278242, + "language_loss": 0.82253224, + "learning_rate": 2.582599145159792e-09, + "loss": 0.83708173, + "num_input_tokens_seen": 353272590, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.24682617, + "step": 16371, + "time_per_iteration": 2.6558477878570557 + }, + { + "auxiliary_loss_clip": 0.01083638, + "auxiliary_loss_mlp": 0.00057207, + "balance_loss_clip": 0.94734263, + "balance_loss_mlp": 0.05067455, + "epoch": 0.9843378926799939, + "flos": 64530615288960.0, + "grad_norm": 0.7465845376841311, + "language_loss": 0.64136517, + "learning_rate": 2.562851244898745e-09, + "loss": 0.65277362, + "num_input_tokens_seen": 353334380, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.06542969, + "step": 16372, + "time_per_iteration": 3.2055323123931885 + }, + { + "auxiliary_loss_clip": 0.01231717, + "auxiliary_loss_mlp": 0.00226396, + "balance_loss_clip": 1.01934469, + "balance_loss_mlp": 0.2007187, + "epoch": 0.984398015932662, + "flos": 17382811985280.0, + "grad_norm": 25.796506074918124, + "language_loss": 0.78770506, + "learning_rate": 2.5431790876544456e-09, + "loss": 0.80228615, + "num_input_tokens_seen": 353351640, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.25683594, + "step": 16373, + "time_per_iteration": 2.6613762378692627 + }, + { + "auxiliary_loss_clip": 0.01232633, + "auxiliary_loss_mlp": 0.00231252, + "balance_loss_clip": 1.01984429, + "balance_loss_mlp": 0.20524061, + "epoch": 0.9844581391853299, + "flos": 23879088069120.0, + "grad_norm": 7.962208605674262, + "language_loss": 0.86982298, + "learning_rate": 2.523582674173186e-09, + "loss": 0.88446188, + "num_input_tokens_seen": 353372555, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.26000977, + "step": 16374, + "time_per_iteration": 2.720186710357666 + }, + { + "auxiliary_loss_clip": 0.01231442, + "auxiliary_loss_mlp": 0.00218571, + "balance_loss_clip": 1.01490617, + "balance_loss_mlp": 0.19418055, + "epoch": 0.9845182624379979, + "flos": 19865352568320.0, + "grad_norm": 16.18081962717638, + "language_loss": 0.77196968, + "learning_rate": 2.504062005197927e-09, + "loss": 0.78646982, + "num_input_tokens_seen": 353391385, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.24389648, + "step": 16375, + "time_per_iteration": 2.638761520385742 + }, + { + "auxiliary_loss_clip": 0.01233759, + "auxiliary_loss_mlp": 0.00222921, + "balance_loss_clip": 1.02156615, + "balance_loss_mlp": 0.19637334, + "epoch": 0.9845783856906659, + "flos": 28254659224320.0, + "grad_norm": 18.630572921708257, + "language_loss": 0.88860404, + "learning_rate": 2.484617081468521e-09, + "loss": 0.90317082, + "num_input_tokens_seen": 353411630, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.265625, + "step": 16376, + "time_per_iteration": 2.730957269668579 + }, + { + "auxiliary_loss_clip": 0.01227615, + "auxiliary_loss_mlp": 0.00204067, + "balance_loss_clip": 1.0175668, + "balance_loss_mlp": 0.18182291, + "epoch": 0.9846385089433338, + "flos": 28328383889280.0, + "grad_norm": 8.375875121825002, + "language_loss": 0.67728102, + "learning_rate": 2.4652479037228224e-09, + "loss": 0.69159788, + "num_input_tokens_seen": 353432895, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.22253418, + "step": 16377, + "time_per_iteration": 2.67885684967041 + }, + { + "auxiliary_loss_clip": 0.01243033, + "auxiliary_loss_mlp": 0.00202314, + "balance_loss_clip": 1.02223873, + "balance_loss_mlp": 0.17829378, + "epoch": 0.9846986321960018, + "flos": 24316767290880.0, + "grad_norm": 4.156157520967601, + "language_loss": 0.8208456, + "learning_rate": 2.445954472695133e-09, + "loss": 0.83529902, + "num_input_tokens_seen": 353454195, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.24035645, + "step": 16378, + "time_per_iteration": 2.7189829349517822 + }, + { + "auxiliary_loss_clip": 0.01235126, + "auxiliary_loss_mlp": 0.00229236, + "balance_loss_clip": 1.02129543, + "balance_loss_mlp": 0.2026765, + "epoch": 0.9847587554486698, + "flos": 27271999877760.0, + "grad_norm": 16.962534961668887, + "language_loss": 0.79567063, + "learning_rate": 2.426736789116868e-09, + "loss": 0.81031424, + "num_input_tokens_seen": 353475125, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.26550293, + "step": 16379, + "time_per_iteration": 2.693990468978882 + }, + { + "auxiliary_loss_clip": 0.01261718, + "auxiliary_loss_mlp": 0.00220586, + "balance_loss_clip": 1.04067791, + "balance_loss_mlp": 0.19384748, + "epoch": 0.9848188787013378, + "flos": 16542717719040.0, + "grad_norm": 42.11610177677969, + "language_loss": 0.80519331, + "learning_rate": 2.407594853716999e-09, + "loss": 0.82001626, + "num_input_tokens_seen": 353493265, + "router_z_loss_clip": 2.20800781, + "router_z_loss_mlp": 0.26733398, + "step": 16380, + "time_per_iteration": 2.6727030277252197 + }, + { + "auxiliary_loss_clip": 0.01253224, + "auxiliary_loss_mlp": 0.00240441, + "balance_loss_clip": 1.03147912, + "balance_loss_mlp": 0.21369022, + "epoch": 0.9848790019540057, + "flos": 20193647898240.0, + "grad_norm": 35.35147739979233, + "language_loss": 0.86590654, + "learning_rate": 2.38852866722139e-09, + "loss": 0.88084316, + "num_input_tokens_seen": 353511650, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.26745605, + "step": 16381, + "time_per_iteration": 2.6617817878723145 + }, + { + "auxiliary_loss_clip": 0.01251137, + "auxiliary_loss_mlp": 0.00207947, + "balance_loss_clip": 1.02655399, + "balance_loss_mlp": 0.18234076, + "epoch": 0.9849391252066737, + "flos": 28259723041920.0, + "grad_norm": 4.414370858625201, + "language_loss": 0.87670392, + "learning_rate": 2.3695382303527965e-09, + "loss": 0.89129472, + "num_input_tokens_seen": 353534035, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.25610352, + "step": 16382, + "time_per_iteration": 2.7850043773651123 + }, + { + "auxiliary_loss_clip": 0.01245893, + "auxiliary_loss_mlp": 0.0024416, + "balance_loss_clip": 1.026052, + "balance_loss_mlp": 0.21736196, + "epoch": 0.9849992484593416, + "flos": 22454942659200.0, + "grad_norm": 361.6144307366848, + "language_loss": 0.83231592, + "learning_rate": 2.3506235438315316e-09, + "loss": 0.84721649, + "num_input_tokens_seen": 353549950, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.26831055, + "step": 16383, + "time_per_iteration": 2.699254274368286 + }, + { + "auxiliary_loss_clip": 0.01239109, + "auxiliary_loss_mlp": 0.00222757, + "balance_loss_clip": 1.02482247, + "balance_loss_mlp": 0.1987008, + "epoch": 0.9850593717120096, + "flos": 34497190656000.0, + "grad_norm": 16.960925158402933, + "language_loss": 0.73477328, + "learning_rate": 2.3317846083750203e-09, + "loss": 0.74939197, + "num_input_tokens_seen": 353573745, + "router_z_loss_clip": 2.14550781, + "router_z_loss_mlp": 0.24072266, + "step": 16384, + "time_per_iteration": 2.8294875621795654 + }, + { + "auxiliary_loss_clip": 0.0125733, + "auxiliary_loss_mlp": 0.00239878, + "balance_loss_clip": 1.03554738, + "balance_loss_mlp": 0.21220919, + "epoch": 0.9851194949646775, + "flos": 38837282152320.0, + "grad_norm": 3.8612851249219613, + "language_loss": 0.79788548, + "learning_rate": 2.313021424697359e-09, + "loss": 0.81285757, + "num_input_tokens_seen": 353595335, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.27685547, + "step": 16385, + "time_per_iteration": 2.8501267433166504 + }, + { + "auxiliary_loss_clip": 0.01240387, + "auxiliary_loss_mlp": 0.0023197, + "balance_loss_clip": 1.02149534, + "balance_loss_mlp": 0.20493332, + "epoch": 0.9851796182173456, + "flos": 17712436118400.0, + "grad_norm": 8.788532031189666, + "language_loss": 0.8995384, + "learning_rate": 2.294333993509978e-09, + "loss": 0.91426194, + "num_input_tokens_seen": 353614270, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.27050781, + "step": 16386, + "time_per_iteration": 2.6563351154327393 + }, + { + "auxiliary_loss_clip": 0.0124937, + "auxiliary_loss_mlp": 0.00233123, + "balance_loss_clip": 1.03072977, + "balance_loss_mlp": 0.20614588, + "epoch": 0.9852397414700135, + "flos": 27454318335360.0, + "grad_norm": 6.144629341693529, + "language_loss": 0.76949036, + "learning_rate": 2.2757223155216442e-09, + "loss": 0.78431535, + "num_input_tokens_seen": 353634900, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.26977539, + "step": 16387, + "time_per_iteration": 2.7453458309173584 + }, + { + "auxiliary_loss_clip": 0.01229778, + "auxiliary_loss_mlp": 0.00230379, + "balance_loss_clip": 1.02083445, + "balance_loss_mlp": 0.20572606, + "epoch": 0.9852998647226815, + "flos": 18296702743680.0, + "grad_norm": 128.41994672436783, + "language_loss": 0.81588531, + "learning_rate": 2.257186391438237e-09, + "loss": 0.83048689, + "num_input_tokens_seen": 353652890, + "router_z_loss_clip": 2.08984375, + "router_z_loss_mlp": 0.2467041, + "step": 16388, + "time_per_iteration": 2.62351655960083 + }, + { + "auxiliary_loss_clip": 0.01232614, + "auxiliary_loss_mlp": 0.00235573, + "balance_loss_clip": 1.01907527, + "balance_loss_mlp": 0.21088502, + "epoch": 0.9853599879753495, + "flos": 19642562461440.0, + "grad_norm": 3.588088708279799, + "language_loss": 0.88837665, + "learning_rate": 2.238726221962528e-09, + "loss": 0.90305853, + "num_input_tokens_seen": 353671295, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.24694824, + "step": 16389, + "time_per_iteration": 2.7184340953826904 + }, + { + "auxiliary_loss_clip": 0.01231545, + "auxiliary_loss_mlp": 0.00243551, + "balance_loss_clip": 1.01797509, + "balance_loss_mlp": 0.22056708, + "epoch": 0.9854201112280174, + "flos": 23841956384640.0, + "grad_norm": 46.80467645764888, + "language_loss": 0.7519896, + "learning_rate": 2.2203418077946234e-09, + "loss": 0.76674056, + "num_input_tokens_seen": 353690560, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.22998047, + "step": 16390, + "time_per_iteration": 2.6916801929473877 + }, + { + "auxiliary_loss_clip": 0.01259854, + "auxiliary_loss_mlp": 0.00234681, + "balance_loss_clip": 1.04290819, + "balance_loss_mlp": 0.20982623, + "epoch": 0.9854802344806854, + "flos": 30080573233920.0, + "grad_norm": 23.73618254374838, + "language_loss": 0.84447026, + "learning_rate": 2.2020331496312994e-09, + "loss": 0.85941565, + "num_input_tokens_seen": 353710660, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.24865723, + "step": 16391, + "time_per_iteration": 4.096127033233643 + }, + { + "auxiliary_loss_clip": 0.01216039, + "auxiliary_loss_mlp": 0.00203367, + "balance_loss_clip": 1.00936091, + "balance_loss_mlp": 0.18037142, + "epoch": 0.9855403577333534, + "flos": 21907412668800.0, + "grad_norm": 53.810183722697545, + "language_loss": 0.75471842, + "learning_rate": 2.1838002481673333e-09, + "loss": 0.76891249, + "num_input_tokens_seen": 353730440, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.22998047, + "step": 16392, + "time_per_iteration": 4.153555631637573 + }, + { + "auxiliary_loss_clip": 0.01268518, + "auxiliary_loss_mlp": 0.00244004, + "balance_loss_clip": 1.03850079, + "balance_loss_mlp": 0.21652591, + "epoch": 0.9856004809860214, + "flos": 15413794191360.0, + "grad_norm": 2.860824928032303, + "language_loss": 0.69126254, + "learning_rate": 2.1656431040937286e-09, + "loss": 0.70638776, + "num_input_tokens_seen": 353748360, + "router_z_loss_clip": 2.30175781, + "router_z_loss_mlp": 0.2746582, + "step": 16393, + "time_per_iteration": 2.720158338546753 + }, + { + "auxiliary_loss_clip": 0.01259492, + "auxiliary_loss_mlp": 0.00237797, + "balance_loss_clip": 1.03164959, + "balance_loss_mlp": 0.21164235, + "epoch": 0.9856606042386893, + "flos": 13653201064320.0, + "grad_norm": 41.45401790234757, + "language_loss": 0.93302929, + "learning_rate": 2.1475617180990444e-09, + "loss": 0.94800216, + "num_input_tokens_seen": 353760880, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.26135254, + "step": 16394, + "time_per_iteration": 2.7025628089904785 + }, + { + "auxiliary_loss_clip": 0.01242245, + "auxiliary_loss_mlp": 0.00211469, + "balance_loss_clip": 1.0251298, + "balance_loss_mlp": 0.18519574, + "epoch": 0.9857207274913573, + "flos": 23479151063040.0, + "grad_norm": 3.6969732109418696, + "language_loss": 0.83568096, + "learning_rate": 2.129556090869178e-09, + "loss": 0.85021818, + "num_input_tokens_seen": 353782255, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.26257324, + "step": 16395, + "time_per_iteration": 2.7305617332458496 + }, + { + "auxiliary_loss_clip": 0.01240485, + "auxiliary_loss_mlp": 0.00224957, + "balance_loss_clip": 1.02886915, + "balance_loss_mlp": 0.19964886, + "epoch": 0.9857808507440252, + "flos": 21065486808960.0, + "grad_norm": 5.33168840537741, + "language_loss": 0.81654978, + "learning_rate": 2.1116262230866933e-09, + "loss": 0.83120424, + "num_input_tokens_seen": 353803580, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.25317383, + "step": 16396, + "time_per_iteration": 2.7122530937194824 + }, + { + "auxiliary_loss_clip": 0.01219516, + "auxiliary_loss_mlp": 0.00209233, + "balance_loss_clip": 1.01153326, + "balance_loss_mlp": 0.18697646, + "epoch": 0.9858409739966932, + "flos": 25301365971840.0, + "grad_norm": 15.515059865084526, + "language_loss": 0.7760787, + "learning_rate": 2.0937721154317133e-09, + "loss": 0.79036617, + "num_input_tokens_seen": 353824200, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.22241211, + "step": 16397, + "time_per_iteration": 2.698887586593628 + }, + { + "auxiliary_loss_clip": 0.01227811, + "auxiliary_loss_mlp": 0.00203822, + "balance_loss_clip": 1.0208863, + "balance_loss_mlp": 0.18076703, + "epoch": 0.9859010972493611, + "flos": 20558751690240.0, + "grad_norm": 86.60596968445357, + "language_loss": 0.81496525, + "learning_rate": 2.0759937685810304e-09, + "loss": 0.82928157, + "num_input_tokens_seen": 353843350, + "router_z_loss_clip": 2.07128906, + "router_z_loss_mlp": 0.23059082, + "step": 16398, + "time_per_iteration": 2.6342132091522217 + }, + { + "auxiliary_loss_clip": 0.01225098, + "auxiliary_loss_mlp": 0.00204061, + "balance_loss_clip": 1.01513827, + "balance_loss_mlp": 0.18013521, + "epoch": 0.9859612205020292, + "flos": 24754985216640.0, + "grad_norm": 51.057099002963916, + "language_loss": 0.78875601, + "learning_rate": 2.058291183208771e-09, + "loss": 0.80304754, + "num_input_tokens_seen": 353864520, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.23913574, + "step": 16399, + "time_per_iteration": 2.7026000022888184 + }, + { + "auxiliary_loss_clip": 0.01227959, + "auxiliary_loss_mlp": 0.00230461, + "balance_loss_clip": 1.01551771, + "balance_loss_mlp": 0.2075614, + "epoch": 0.9860213437546971, + "flos": 21105850717440.0, + "grad_norm": 22.79957982057928, + "language_loss": 0.6580013, + "learning_rate": 2.0406643599863993e-09, + "loss": 0.67258555, + "num_input_tokens_seen": 353882240, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.22912598, + "step": 16400, + "time_per_iteration": 4.0594072341918945 + }, + { + "auxiliary_loss_clip": 0.01278123, + "auxiliary_loss_mlp": 0.00235307, + "balance_loss_clip": 1.05025578, + "balance_loss_mlp": 0.20844944, + "epoch": 0.9860814670073651, + "flos": 19136078737920.0, + "grad_norm": 4.2772631477449, + "language_loss": 0.89006186, + "learning_rate": 2.023113299582491e-09, + "loss": 0.90519607, + "num_input_tokens_seen": 353901590, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.26831055, + "step": 16401, + "time_per_iteration": 4.06519889831543 + }, + { + "auxiliary_loss_clip": 0.01245427, + "auxiliary_loss_mlp": 0.00218789, + "balance_loss_clip": 1.02665639, + "balance_loss_mlp": 0.19292036, + "epoch": 0.9861415902600331, + "flos": 17237050594560.0, + "grad_norm": 74.56161738468194, + "language_loss": 0.88180673, + "learning_rate": 2.005638002662069e-09, + "loss": 0.89644891, + "num_input_tokens_seen": 353918785, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.25866699, + "step": 16402, + "time_per_iteration": 2.6880178451538086 + }, + { + "auxiliary_loss_clip": 0.0124026, + "auxiliary_loss_mlp": 0.00240636, + "balance_loss_clip": 1.02151275, + "balance_loss_mlp": 0.21431419, + "epoch": 0.986201713512701, + "flos": 27782577751680.0, + "grad_norm": 4.3486179960691445, + "language_loss": 0.80565351, + "learning_rate": 1.9882384698881596e-09, + "loss": 0.82046252, + "num_input_tokens_seen": 353940390, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.26330566, + "step": 16403, + "time_per_iteration": 2.702977418899536 + }, + { + "auxiliary_loss_clip": 0.01242139, + "auxiliary_loss_mlp": 0.0021046, + "balance_loss_clip": 1.02836275, + "balance_loss_mlp": 0.18695208, + "epoch": 0.986261836765369, + "flos": 28730403884160.0, + "grad_norm": 150.3131364017385, + "language_loss": 0.81787896, + "learning_rate": 1.9709147019204566e-09, + "loss": 0.83240497, + "num_input_tokens_seen": 353962180, + "router_z_loss_clip": 2.13964844, + "router_z_loss_mlp": 0.23535156, + "step": 16404, + "time_per_iteration": 2.7581045627593994 + }, + { + "auxiliary_loss_clip": 0.01228209, + "auxiliary_loss_mlp": 0.00229213, + "balance_loss_clip": 1.01837957, + "balance_loss_mlp": 0.20550179, + "epoch": 0.986321960018037, + "flos": 34313471568000.0, + "grad_norm": 33.799213093090565, + "language_loss": 0.76572967, + "learning_rate": 1.953666699415768e-09, + "loss": 0.7803039, + "num_input_tokens_seen": 353984305, + "router_z_loss_clip": 2.10058594, + "router_z_loss_mlp": 0.23730469, + "step": 16405, + "time_per_iteration": 2.833843469619751 + }, + { + "auxiliary_loss_clip": 0.01222728, + "auxiliary_loss_mlp": 0.0021896, + "balance_loss_clip": 1.01712441, + "balance_loss_mlp": 0.19639409, + "epoch": 0.986382083270705, + "flos": 25189755436800.0, + "grad_norm": 4.386479667536548, + "language_loss": 0.76589203, + "learning_rate": 1.93649446302846e-09, + "loss": 0.7803089, + "num_input_tokens_seen": 354004495, + "router_z_loss_clip": 2.05761719, + "router_z_loss_mlp": 0.22558594, + "step": 16406, + "time_per_iteration": 2.696772575378418 + }, + { + "auxiliary_loss_clip": 0.01218092, + "auxiliary_loss_mlp": 0.00235591, + "balance_loss_clip": 1.00649679, + "balance_loss_mlp": 0.21242872, + "epoch": 0.9864422065233729, + "flos": 11025904671360.0, + "grad_norm": 3.5201019950977117, + "language_loss": 0.85072708, + "learning_rate": 1.9193979934095663e-09, + "loss": 0.86526394, + "num_input_tokens_seen": 354015985, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.23168945, + "step": 16407, + "time_per_iteration": 2.817225456237793 + }, + { + "auxiliary_loss_clip": 0.01231006, + "auxiliary_loss_mlp": 0.00249727, + "balance_loss_clip": 1.01959491, + "balance_loss_mlp": 0.22449014, + "epoch": 0.9865023297760409, + "flos": 16545590807040.0, + "grad_norm": 17.363303659358063, + "language_loss": 0.85850745, + "learning_rate": 1.9023772912072357e-09, + "loss": 0.87331486, + "num_input_tokens_seen": 354033260, + "router_z_loss_clip": 2.11230469, + "router_z_loss_mlp": 0.2520752, + "step": 16408, + "time_per_iteration": 2.6257805824279785 + }, + { + "auxiliary_loss_clip": 0.01263574, + "auxiliary_loss_mlp": 0.00239854, + "balance_loss_clip": 1.03710938, + "balance_loss_mlp": 0.21292505, + "epoch": 0.9865624530287088, + "flos": 18880179269760.0, + "grad_norm": 3.6357402548718274, + "language_loss": 0.77910113, + "learning_rate": 1.8854323570669515e-09, + "loss": 0.79413533, + "num_input_tokens_seen": 354052825, + "router_z_loss_clip": 2.265625, + "router_z_loss_mlp": 0.26940918, + "step": 16409, + "time_per_iteration": 2.6606240272521973 + }, + { + "auxiliary_loss_clip": 0.01078615, + "auxiliary_loss_mlp": 0.00089556, + "balance_loss_clip": 0.94291478, + "balance_loss_mlp": 0.08288059, + "epoch": 0.9866225762813768, + "flos": 68887798680960.0, + "grad_norm": 0.7769489494028117, + "language_loss": 0.60205215, + "learning_rate": 1.8685631916313118e-09, + "loss": 0.61373389, + "num_input_tokens_seen": 354113920, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.06689453, + "step": 16410, + "time_per_iteration": 3.19671630859375 + }, + { + "auxiliary_loss_clip": 0.01235728, + "auxiliary_loss_mlp": 0.0022588, + "balance_loss_clip": 1.02251267, + "balance_loss_mlp": 0.20284829, + "epoch": 0.9866826995340447, + "flos": 29023111814400.0, + "grad_norm": 12.511303536442396, + "language_loss": 0.76365489, + "learning_rate": 1.8517697955400258e-09, + "loss": 0.77827096, + "num_input_tokens_seen": 354134210, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.23046875, + "step": 16411, + "time_per_iteration": 2.695289134979248 + }, + { + "auxiliary_loss_clip": 0.01082229, + "auxiliary_loss_mlp": 0.00072971, + "balance_loss_clip": 0.94648147, + "balance_loss_mlp": 0.06529434, + "epoch": 0.9867428227867128, + "flos": 65376814867200.0, + "grad_norm": 0.7093398609285745, + "language_loss": 0.55780268, + "learning_rate": 1.8350521694299182e-09, + "loss": 0.56935471, + "num_input_tokens_seen": 354198010, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.07666016, + "step": 16412, + "time_per_iteration": 3.2130746841430664 + }, + { + "auxiliary_loss_clip": 0.01262183, + "auxiliary_loss_mlp": 0.00241337, + "balance_loss_clip": 1.03620267, + "balance_loss_mlp": 0.21465787, + "epoch": 0.9868029460393807, + "flos": 26506312634880.0, + "grad_norm": 2871.6392798962825, + "language_loss": 0.79360855, + "learning_rate": 1.818410313934926e-09, + "loss": 0.80864382, + "num_input_tokens_seen": 354220000, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.26672363, + "step": 16413, + "time_per_iteration": 2.7395803928375244 + }, + { + "auxiliary_loss_clip": 0.01231004, + "auxiliary_loss_mlp": 0.00236123, + "balance_loss_clip": 1.01649189, + "balance_loss_mlp": 0.21130365, + "epoch": 0.9868630692920487, + "flos": 22967280299520.0, + "grad_norm": 886.413056916961, + "language_loss": 0.77427101, + "learning_rate": 1.8018442296858782e-09, + "loss": 0.78894228, + "num_input_tokens_seen": 354240910, + "router_z_loss_clip": 2.14746094, + "router_z_loss_mlp": 0.24829102, + "step": 16414, + "time_per_iteration": 2.8171451091766357 + }, + { + "auxiliary_loss_clip": 0.0123239, + "auxiliary_loss_mlp": 0.00208247, + "balance_loss_clip": 1.01970506, + "balance_loss_mlp": 0.18378516, + "epoch": 0.9869231925447167, + "flos": 19828687760640.0, + "grad_norm": 38.4954820571449, + "language_loss": 0.78934395, + "learning_rate": 1.7853539173111608e-09, + "loss": 0.80375028, + "num_input_tokens_seen": 354259430, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.24475098, + "step": 16415, + "time_per_iteration": 2.683988332748413 + }, + { + "auxiliary_loss_clip": 0.01211441, + "auxiliary_loss_mlp": 0.00216509, + "balance_loss_clip": 1.00612116, + "balance_loss_mlp": 0.193955, + "epoch": 0.9869833157973846, + "flos": 20195228096640.0, + "grad_norm": 67.10134157224223, + "language_loss": 0.81099963, + "learning_rate": 1.7689393774362737e-09, + "loss": 0.82527912, + "num_input_tokens_seen": 354279490, + "router_z_loss_clip": 2.0546875, + "router_z_loss_mlp": 0.22558594, + "step": 16416, + "time_per_iteration": 2.6917765140533447 + }, + { + "auxiliary_loss_clip": 0.0123036, + "auxiliary_loss_mlp": 0.00241658, + "balance_loss_clip": 1.0152303, + "balance_loss_mlp": 0.21583733, + "epoch": 0.9870434390500527, + "flos": 16099507802880.0, + "grad_norm": 59.439778349237834, + "language_loss": 0.80651474, + "learning_rate": 1.7526006106833858e-09, + "loss": 0.82123494, + "num_input_tokens_seen": 354295080, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.25830078, + "step": 16417, + "time_per_iteration": 2.597642660140991 + }, + { + "auxiliary_loss_clip": 0.01245082, + "auxiliary_loss_mlp": 0.002132, + "balance_loss_clip": 1.02383161, + "balance_loss_mlp": 0.18692668, + "epoch": 0.9871035623027206, + "flos": 21760753438080.0, + "grad_norm": 69.19657778660932, + "language_loss": 0.79488182, + "learning_rate": 1.7363376176720013e-09, + "loss": 0.80946457, + "num_input_tokens_seen": 354314610, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.26269531, + "step": 16418, + "time_per_iteration": 2.7293643951416016 + }, + { + "auxiliary_loss_clip": 0.01079949, + "auxiliary_loss_mlp": 0.00069093, + "balance_loss_clip": 0.94287336, + "balance_loss_mlp": 0.06222672, + "epoch": 0.9871636855553886, + "flos": 70219583245440.0, + "grad_norm": 0.6379620772736818, + "language_loss": 0.52984536, + "learning_rate": 1.7201503990189603e-09, + "loss": 0.54133576, + "num_input_tokens_seen": 354383115, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.06884766, + "step": 16419, + "time_per_iteration": 3.2431344985961914 + }, + { + "auxiliary_loss_clip": 0.01247088, + "auxiliary_loss_mlp": 0.00235328, + "balance_loss_clip": 1.02875888, + "balance_loss_mlp": 0.21053293, + "epoch": 0.9872238088080565, + "flos": 25045825639680.0, + "grad_norm": 21.635862113780163, + "language_loss": 0.85203528, + "learning_rate": 1.7040389553382162e-09, + "loss": 0.86685944, + "num_input_tokens_seen": 354403115, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.2479248, + "step": 16420, + "time_per_iteration": 2.6969125270843506 + }, + { + "auxiliary_loss_clip": 0.01240055, + "auxiliary_loss_mlp": 0.00221995, + "balance_loss_clip": 1.02738225, + "balance_loss_mlp": 0.19810551, + "epoch": 0.9872839320607245, + "flos": 19465846525440.0, + "grad_norm": 76.72788794156608, + "language_loss": 0.77461231, + "learning_rate": 1.6880032872403916e-09, + "loss": 0.78923285, + "num_input_tokens_seen": 354424520, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.2388916, + "step": 16421, + "time_per_iteration": 2.7131736278533936 + }, + { + "auxiliary_loss_clip": 0.0125702, + "auxiliary_loss_mlp": 0.00215244, + "balance_loss_clip": 1.03198004, + "balance_loss_mlp": 0.1907825, + "epoch": 0.9873440553133924, + "flos": 26942914448640.0, + "grad_norm": 9.264761210599916, + "language_loss": 0.91015172, + "learning_rate": 1.6720433953338886e-09, + "loss": 0.92487431, + "num_input_tokens_seen": 354444800, + "router_z_loss_clip": 2.24707031, + "router_z_loss_mlp": 0.24462891, + "step": 16422, + "time_per_iteration": 2.7011077404022217 + }, + { + "auxiliary_loss_clip": 0.01231584, + "auxiliary_loss_mlp": 0.00227171, + "balance_loss_clip": 1.02353024, + "balance_loss_mlp": 0.20411548, + "epoch": 0.9874041785660604, + "flos": 19062210418560.0, + "grad_norm": 852.4570303915461, + "language_loss": 0.92928463, + "learning_rate": 1.656159280223779e-09, + "loss": 0.94387215, + "num_input_tokens_seen": 354464590, + "router_z_loss_clip": 2.08105469, + "router_z_loss_mlp": 0.23083496, + "step": 16423, + "time_per_iteration": 2.7133634090423584 + }, + { + "auxiliary_loss_clip": 0.01230667, + "auxiliary_loss_mlp": 0.0022557, + "balance_loss_clip": 1.01859689, + "balance_loss_mlp": 0.20250256, + "epoch": 0.9874643018187284, + "flos": 21105814803840.0, + "grad_norm": 116.86908193710957, + "language_loss": 0.7678808, + "learning_rate": 1.6403509425122475e-09, + "loss": 0.78244317, + "num_input_tokens_seen": 354484145, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.23071289, + "step": 16424, + "time_per_iteration": 2.635361433029175 + }, + { + "auxiliary_loss_clip": 0.01238906, + "auxiliary_loss_mlp": 0.00215849, + "balance_loss_clip": 1.02709293, + "balance_loss_mlp": 0.19062476, + "epoch": 0.9875244250713964, + "flos": 24426043441920.0, + "grad_norm": 2.9256027796776087, + "language_loss": 0.87361169, + "learning_rate": 1.6246183827990366e-09, + "loss": 0.88815922, + "num_input_tokens_seen": 354502475, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.25231934, + "step": 16425, + "time_per_iteration": 2.73378586769104 + }, + { + "auxiliary_loss_clip": 0.01233957, + "auxiliary_loss_mlp": 0.002115, + "balance_loss_clip": 1.01599669, + "balance_loss_mlp": 0.18560734, + "epoch": 0.9875845483240643, + "flos": 25117610970240.0, + "grad_norm": 33.69641255530035, + "language_loss": 0.87630737, + "learning_rate": 1.6089616016803364e-09, + "loss": 0.89076191, + "num_input_tokens_seen": 354521855, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.25891113, + "step": 16426, + "time_per_iteration": 2.694972515106201 + }, + { + "auxiliary_loss_clip": 0.01241599, + "auxiliary_loss_mlp": 0.00225397, + "balance_loss_clip": 1.0239352, + "balance_loss_mlp": 0.19993374, + "epoch": 0.9876446715767323, + "flos": 16581788737920.0, + "grad_norm": 43.298836983496976, + "language_loss": 0.95846617, + "learning_rate": 1.593380599750338e-09, + "loss": 0.97313619, + "num_input_tokens_seen": 354539535, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.25463867, + "step": 16427, + "time_per_iteration": 2.683358907699585 + }, + { + "auxiliary_loss_clip": 0.01235894, + "auxiliary_loss_mlp": 0.0021388, + "balance_loss_clip": 1.02198207, + "balance_loss_mlp": 0.18979982, + "epoch": 0.9877047948294003, + "flos": 21616141282560.0, + "grad_norm": 6.014965487043642, + "language_loss": 0.76970154, + "learning_rate": 1.577875377599458e-09, + "loss": 0.7841993, + "num_input_tokens_seen": 354557430, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.24060059, + "step": 16428, + "time_per_iteration": 2.6441667079925537 + }, + { + "auxiliary_loss_clip": 0.01223781, + "auxiliary_loss_mlp": 0.001929, + "balance_loss_clip": 1.01361728, + "balance_loss_mlp": 0.16984487, + "epoch": 0.9877649180820682, + "flos": 21178497974400.0, + "grad_norm": 24.620375465063912, + "language_loss": 0.89368927, + "learning_rate": 1.5624459358158926e-09, + "loss": 0.90785611, + "num_input_tokens_seen": 354574735, + "router_z_loss_clip": 2.10351562, + "router_z_loss_mlp": 0.23034668, + "step": 16429, + "time_per_iteration": 2.699779987335205 + }, + { + "auxiliary_loss_clip": 0.012285, + "auxiliary_loss_mlp": 0.00220794, + "balance_loss_clip": 1.02004957, + "balance_loss_mlp": 0.19732144, + "epoch": 0.9878250413347363, + "flos": 39749233576320.0, + "grad_norm": 188.40185637165885, + "language_loss": 0.69735992, + "learning_rate": 1.5470922749845073e-09, + "loss": 0.71185291, + "num_input_tokens_seen": 354597050, + "router_z_loss_clip": 2.08691406, + "router_z_loss_mlp": 0.23449707, + "step": 16430, + "time_per_iteration": 2.812028646469116 + }, + { + "auxiliary_loss_clip": 0.01226301, + "auxiliary_loss_mlp": 0.00216124, + "balance_loss_clip": 1.01383269, + "balance_loss_mlp": 0.19165048, + "epoch": 0.9878851645874042, + "flos": 29425634599680.0, + "grad_norm": 47.84433847425038, + "language_loss": 0.78748453, + "learning_rate": 1.531814395687725e-09, + "loss": 0.80190873, + "num_input_tokens_seen": 354619095, + "router_z_loss_clip": 2.12402344, + "router_z_loss_mlp": 0.24487305, + "step": 16431, + "time_per_iteration": 2.7460155487060547 + }, + { + "auxiliary_loss_clip": 0.01254292, + "auxiliary_loss_mlp": 0.00220596, + "balance_loss_clip": 1.03156209, + "balance_loss_mlp": 0.19372663, + "epoch": 0.9879452878400722, + "flos": 15806261168640.0, + "grad_norm": 41.87557923670202, + "language_loss": 0.89964139, + "learning_rate": 1.5166122985048602e-09, + "loss": 0.91439033, + "num_input_tokens_seen": 354633790, + "router_z_loss_clip": 2.22558594, + "router_z_loss_mlp": 0.2689209, + "step": 16432, + "time_per_iteration": 2.6498472690582275 + }, + { + "auxiliary_loss_clip": 0.01222481, + "auxiliary_loss_mlp": 0.00209478, + "balance_loss_clip": 1.01651406, + "balance_loss_mlp": 0.18668547, + "epoch": 0.9880054110927401, + "flos": 22233912318720.0, + "grad_norm": 52.85548425442688, + "language_loss": 0.86500263, + "learning_rate": 1.5014859840123405e-09, + "loss": 0.87932223, + "num_input_tokens_seen": 354653180, + "router_z_loss_clip": 2.06054688, + "router_z_loss_mlp": 0.22790527, + "step": 16433, + "time_per_iteration": 4.077907085418701 + }, + { + "auxiliary_loss_clip": 0.01234405, + "auxiliary_loss_mlp": 0.00212997, + "balance_loss_clip": 1.02071023, + "balance_loss_mlp": 0.1873197, + "epoch": 0.9880655343454081, + "flos": 28763836467840.0, + "grad_norm": 13.755083144502663, + "language_loss": 0.73020351, + "learning_rate": 1.4864354527837075e-09, + "loss": 0.74467754, + "num_input_tokens_seen": 354669900, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.25671387, + "step": 16434, + "time_per_iteration": 4.135472059249878 + }, + { + "auxiliary_loss_clip": 0.01253088, + "auxiliary_loss_mlp": 0.00228419, + "balance_loss_clip": 1.02988863, + "balance_loss_mlp": 0.20146608, + "epoch": 0.988125657598076, + "flos": 32853379622400.0, + "grad_norm": 13.061179887771765, + "language_loss": 0.77151084, + "learning_rate": 1.4714607053896154e-09, + "loss": 0.78632593, + "num_input_tokens_seen": 354693165, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.26989746, + "step": 16435, + "time_per_iteration": 2.813199043273926 + }, + { + "auxiliary_loss_clip": 0.01241254, + "auxiliary_loss_mlp": 0.00238254, + "balance_loss_clip": 1.0200386, + "balance_loss_mlp": 0.20923892, + "epoch": 0.988185780850744, + "flos": 19390685316480.0, + "grad_norm": 152.9427038754238, + "language_loss": 0.85283339, + "learning_rate": 1.4565617423980548e-09, + "loss": 0.8676284, + "num_input_tokens_seen": 354711915, + "router_z_loss_clip": 2.21386719, + "router_z_loss_mlp": 0.29016113, + "step": 16436, + "time_per_iteration": 2.6134495735168457 + }, + { + "auxiliary_loss_clip": 0.01237217, + "auxiliary_loss_mlp": 0.00219173, + "balance_loss_clip": 1.02193248, + "balance_loss_mlp": 0.19285193, + "epoch": 0.988245904103412, + "flos": 22528415928960.0, + "grad_norm": 28.10490421508015, + "language_loss": 0.82421231, + "learning_rate": 1.4417385643741286e-09, + "loss": 0.83877623, + "num_input_tokens_seen": 354729135, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.26318359, + "step": 16437, + "time_per_iteration": 2.722653388977051 + }, + { + "auxiliary_loss_clip": 0.0123424, + "auxiliary_loss_mlp": 0.0022754, + "balance_loss_clip": 1.02105427, + "balance_loss_mlp": 0.20108792, + "epoch": 0.98830602735608, + "flos": 28659193171200.0, + "grad_norm": 3.697763235972394, + "language_loss": 0.67059362, + "learning_rate": 1.4269911718796103e-09, + "loss": 0.68521142, + "num_input_tokens_seen": 354752530, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.26489258, + "step": 16438, + "time_per_iteration": 2.69144868850708 + }, + { + "auxiliary_loss_clip": 0.01234342, + "auxiliary_loss_mlp": 0.00225933, + "balance_loss_clip": 1.01971495, + "balance_loss_mlp": 0.20163862, + "epoch": 0.9883661506087479, + "flos": 20996035862400.0, + "grad_norm": 236.27611688756699, + "language_loss": 0.81839895, + "learning_rate": 1.4123195654738295e-09, + "loss": 0.83300167, + "num_input_tokens_seen": 354771135, + "router_z_loss_clip": 2.14941406, + "router_z_loss_mlp": 0.24279785, + "step": 16439, + "time_per_iteration": 2.6881725788116455 + }, + { + "auxiliary_loss_clip": 0.01232111, + "auxiliary_loss_mlp": 0.00220299, + "balance_loss_clip": 1.01648068, + "balance_loss_mlp": 0.19600391, + "epoch": 0.9884262738614159, + "flos": 32706109860480.0, + "grad_norm": 6.311669706479498, + "language_loss": 0.68757361, + "learning_rate": 1.3977237457134528e-09, + "loss": 0.70209765, + "num_input_tokens_seen": 354791800, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.24291992, + "step": 16440, + "time_per_iteration": 2.751551866531372 + }, + { + "auxiliary_loss_clip": 0.01232398, + "auxiliary_loss_mlp": 0.00222098, + "balance_loss_clip": 1.01759052, + "balance_loss_mlp": 0.19866177, + "epoch": 0.9884863971140839, + "flos": 17564699479680.0, + "grad_norm": 11.74374714482979, + "language_loss": 0.85746014, + "learning_rate": 1.3832037131513707e-09, + "loss": 0.87200516, + "num_input_tokens_seen": 354809200, + "router_z_loss_clip": 2.14746094, + "router_z_loss_mlp": 0.234375, + "step": 16441, + "time_per_iteration": 2.665485382080078 + }, + { + "auxiliary_loss_clip": 0.01234009, + "auxiliary_loss_mlp": 0.00207924, + "balance_loss_clip": 1.02320039, + "balance_loss_mlp": 0.1841895, + "epoch": 0.9885465203667518, + "flos": 40552519380480.0, + "grad_norm": 14.538058876408806, + "language_loss": 0.76861089, + "learning_rate": 1.3687594683386982e-09, + "loss": 0.78303027, + "num_input_tokens_seen": 354829945, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.23754883, + "step": 16442, + "time_per_iteration": 4.3051393032073975 + }, + { + "auxiliary_loss_clip": 0.01223774, + "auxiliary_loss_mlp": 0.00210615, + "balance_loss_clip": 1.0099262, + "balance_loss_mlp": 0.18615375, + "epoch": 0.9886066436194199, + "flos": 13807976768640.0, + "grad_norm": 47.88662784574514, + "language_loss": 0.83856696, + "learning_rate": 1.3543910118227753e-09, + "loss": 0.85291088, + "num_input_tokens_seen": 354845055, + "router_z_loss_clip": 2.14160156, + "router_z_loss_mlp": 0.24462891, + "step": 16443, + "time_per_iteration": 2.5895583629608154 + }, + { + "auxiliary_loss_clip": 0.01238126, + "auxiliary_loss_mlp": 0.00221743, + "balance_loss_clip": 1.02330208, + "balance_loss_mlp": 0.19686446, + "epoch": 0.9886667668720878, + "flos": 23325129544320.0, + "grad_norm": 45.083914691830614, + "language_loss": 0.81172395, + "learning_rate": 1.3400983441487213e-09, + "loss": 0.82632262, + "num_input_tokens_seen": 354864680, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.24865723, + "step": 16444, + "time_per_iteration": 4.11650013923645 + }, + { + "auxiliary_loss_clip": 0.0123481, + "auxiliary_loss_mlp": 0.00239805, + "balance_loss_clip": 1.02340698, + "balance_loss_mlp": 0.21528399, + "epoch": 0.9887268901247558, + "flos": 22706029704960.0, + "grad_norm": 34.73207559201181, + "language_loss": 0.74180526, + "learning_rate": 1.325881465858547e-09, + "loss": 0.75655138, + "num_input_tokens_seen": 354885685, + "router_z_loss_clip": 2.11035156, + "router_z_loss_mlp": 0.24511719, + "step": 16445, + "time_per_iteration": 2.7107250690460205 + }, + { + "auxiliary_loss_clip": 0.01248213, + "auxiliary_loss_mlp": 0.00236603, + "balance_loss_clip": 1.02836645, + "balance_loss_mlp": 0.21321425, + "epoch": 0.9887870133774237, + "flos": 13041283944960.0, + "grad_norm": 6916.711010310122, + "language_loss": 0.70706242, + "learning_rate": 1.311740377491155e-09, + "loss": 0.7219106, + "num_input_tokens_seen": 354901505, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.23388672, + "step": 16446, + "time_per_iteration": 2.671391725540161 + }, + { + "auxiliary_loss_clip": 0.01227635, + "auxiliary_loss_mlp": 0.00212078, + "balance_loss_clip": 1.0138849, + "balance_loss_mlp": 0.18825966, + "epoch": 0.9888471366300917, + "flos": 15158864390400.0, + "grad_norm": 123.23157119517614, + "language_loss": 0.80312574, + "learning_rate": 1.297675079582783e-09, + "loss": 0.81752288, + "num_input_tokens_seen": 354920060, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.23828125, + "step": 16447, + "time_per_iteration": 2.714468240737915 + }, + { + "auxiliary_loss_clip": 0.01242046, + "auxiliary_loss_mlp": 0.00239836, + "balance_loss_clip": 1.02512908, + "balance_loss_mlp": 0.21380042, + "epoch": 0.9889072598827596, + "flos": 25118796119040.0, + "grad_norm": 38.27350631737183, + "language_loss": 0.91745949, + "learning_rate": 1.2836855726667818e-09, + "loss": 0.93227828, + "num_input_tokens_seen": 354938690, + "router_z_loss_clip": 2.17089844, + "router_z_loss_mlp": 0.26037598, + "step": 16448, + "time_per_iteration": 2.7000789642333984 + }, + { + "auxiliary_loss_clip": 0.01227777, + "auxiliary_loss_mlp": 0.00218446, + "balance_loss_clip": 1.01517379, + "balance_loss_mlp": 0.19393663, + "epoch": 0.9889673831354276, + "flos": 16728663450240.0, + "grad_norm": 16.606542102246024, + "language_loss": 0.77397728, + "learning_rate": 1.26977185727406e-09, + "loss": 0.78843951, + "num_input_tokens_seen": 354956955, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.24511719, + "step": 16449, + "time_per_iteration": 2.680272102355957 + }, + { + "auxiliary_loss_clip": 0.01250077, + "auxiliary_loss_mlp": 0.00223059, + "balance_loss_clip": 1.0344398, + "balance_loss_mlp": 0.19792983, + "epoch": 0.9890275063880956, + "flos": 35585175657600.0, + "grad_norm": 13.3998368658885, + "language_loss": 0.81919342, + "learning_rate": 1.25593393393153e-09, + "loss": 0.83392477, + "num_input_tokens_seen": 354976800, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.25158691, + "step": 16450, + "time_per_iteration": 2.7828776836395264 + }, + { + "auxiliary_loss_clip": 0.0126546, + "auxiliary_loss_mlp": 0.00238025, + "balance_loss_clip": 1.03279495, + "balance_loss_mlp": 0.21134584, + "epoch": 0.9890876296407636, + "flos": 18952359649920.0, + "grad_norm": 28.28922196961217, + "language_loss": 0.86970431, + "learning_rate": 1.242171803164549e-09, + "loss": 0.88473916, + "num_input_tokens_seen": 354996625, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.26696777, + "step": 16451, + "time_per_iteration": 2.679987668991089 + }, + { + "auxiliary_loss_clip": 0.01244776, + "auxiliary_loss_mlp": 0.00222186, + "balance_loss_clip": 1.02300406, + "balance_loss_mlp": 0.19655649, + "epoch": 0.9891477528934315, + "flos": 23769309127680.0, + "grad_norm": 100.56224707016807, + "language_loss": 0.82247907, + "learning_rate": 1.2284854654946996e-09, + "loss": 0.83714867, + "num_input_tokens_seen": 355014535, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.2565918, + "step": 16452, + "time_per_iteration": 2.7512590885162354 + }, + { + "auxiliary_loss_clip": 0.0122728, + "auxiliary_loss_mlp": 0.0021122, + "balance_loss_clip": 1.01880217, + "balance_loss_mlp": 0.18859437, + "epoch": 0.9892078761460995, + "flos": 20772922533120.0, + "grad_norm": 35.13283438718066, + "language_loss": 0.80741739, + "learning_rate": 1.2148749214409004e-09, + "loss": 0.82180244, + "num_input_tokens_seen": 355033280, + "router_z_loss_clip": 2.08691406, + "router_z_loss_mlp": 0.22631836, + "step": 16453, + "time_per_iteration": 2.748844623565674 + }, + { + "auxiliary_loss_clip": 0.01241514, + "auxiliary_loss_mlp": 0.00212283, + "balance_loss_clip": 1.02402282, + "balance_loss_mlp": 0.18795279, + "epoch": 0.9892679993987675, + "flos": 23367827836800.0, + "grad_norm": 21.989045780661897, + "language_loss": 0.81170386, + "learning_rate": 1.2013401715191828e-09, + "loss": 0.82624179, + "num_input_tokens_seen": 355053320, + "router_z_loss_clip": 2.17480469, + "router_z_loss_mlp": 0.2434082, + "step": 16454, + "time_per_iteration": 2.6580522060394287 + }, + { + "auxiliary_loss_clip": 0.01216087, + "auxiliary_loss_mlp": 0.002161, + "balance_loss_clip": 1.00642431, + "balance_loss_mlp": 0.19274732, + "epoch": 0.9893281226514354, + "flos": 22705419173760.0, + "grad_norm": 52.84502920929598, + "language_loss": 0.82347512, + "learning_rate": 1.1878812162433583e-09, + "loss": 0.83779699, + "num_input_tokens_seen": 355070230, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.23339844, + "step": 16455, + "time_per_iteration": 2.6646130084991455 + }, + { + "auxiliary_loss_clip": 0.01212282, + "auxiliary_loss_mlp": 0.00209325, + "balance_loss_clip": 1.00597405, + "balance_loss_mlp": 0.18756916, + "epoch": 0.9893882459041035, + "flos": 21796664060160.0, + "grad_norm": 88.18578060183438, + "language_loss": 0.71783686, + "learning_rate": 1.1744980561230188e-09, + "loss": 0.73205292, + "num_input_tokens_seen": 355090125, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.21740723, + "step": 16456, + "time_per_iteration": 2.7494754791259766 + }, + { + "auxiliary_loss_clip": 0.01247474, + "auxiliary_loss_mlp": 0.00221394, + "balance_loss_clip": 1.03281999, + "balance_loss_mlp": 0.19762415, + "epoch": 0.9894483691567714, + "flos": 18113773754880.0, + "grad_norm": 22.603881533748783, + "language_loss": 0.81290811, + "learning_rate": 1.161190691666203e-09, + "loss": 0.82759678, + "num_input_tokens_seen": 355107890, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.23779297, + "step": 16457, + "time_per_iteration": 2.620739459991455 + }, + { + "auxiliary_loss_clip": 0.01247042, + "auxiliary_loss_mlp": 0.00222712, + "balance_loss_clip": 1.02409577, + "balance_loss_mlp": 0.1970461, + "epoch": 0.9895084924094394, + "flos": 31211615664000.0, + "grad_norm": 6.708450096746502, + "language_loss": 0.77029181, + "learning_rate": 1.1479591233773954e-09, + "loss": 0.78498936, + "num_input_tokens_seen": 355126340, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.25683594, + "step": 16458, + "time_per_iteration": 2.924142837524414 + }, + { + "auxiliary_loss_clip": 0.0123023, + "auxiliary_loss_mlp": 0.00227242, + "balance_loss_clip": 1.01522422, + "balance_loss_mlp": 0.20293482, + "epoch": 0.9895686156621073, + "flos": 19678042120320.0, + "grad_norm": 176.14930563080875, + "language_loss": 0.87276745, + "learning_rate": 1.1348033517581956e-09, + "loss": 0.8873421, + "num_input_tokens_seen": 355144025, + "router_z_loss_clip": 2.15136719, + "router_z_loss_mlp": 0.24316406, + "step": 16459, + "time_per_iteration": 2.717639923095703 + }, + { + "auxiliary_loss_clip": 0.01234877, + "auxiliary_loss_mlp": 0.00250801, + "balance_loss_clip": 1.01977384, + "balance_loss_mlp": 0.22319236, + "epoch": 0.9896287389147753, + "flos": 23581675457280.0, + "grad_norm": 3.3913475190075184, + "language_loss": 0.82665551, + "learning_rate": 1.1217233773075373e-09, + "loss": 0.84151232, + "num_input_tokens_seen": 355163125, + "router_z_loss_clip": 2.14941406, + "router_z_loss_mlp": 0.27600098, + "step": 16460, + "time_per_iteration": 2.8002874851226807 + }, + { + "auxiliary_loss_clip": 0.0122507, + "auxiliary_loss_mlp": 0.00216442, + "balance_loss_clip": 1.01386929, + "balance_loss_mlp": 0.19211158, + "epoch": 0.9896888621674432, + "flos": 29605331364480.0, + "grad_norm": 8.137511663840248, + "language_loss": 0.94816637, + "learning_rate": 1.1087192005214685e-09, + "loss": 0.96258152, + "num_input_tokens_seen": 355184060, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.24316406, + "step": 16461, + "time_per_iteration": 2.7109920978546143 + }, + { + "auxiliary_loss_clip": 0.01237076, + "auxiliary_loss_mlp": 0.00231767, + "balance_loss_clip": 1.02278352, + "balance_loss_mlp": 0.20537391, + "epoch": 0.9897489854201112, + "flos": 23695045758720.0, + "grad_norm": 2.3857857408952934, + "language_loss": 0.71074742, + "learning_rate": 1.09579082189315e-09, + "loss": 0.72543585, + "num_input_tokens_seen": 355204505, + "router_z_loss_clip": 2.14550781, + "router_z_loss_mlp": 0.26367188, + "step": 16462, + "time_per_iteration": 2.6969478130340576 + }, + { + "auxiliary_loss_clip": 0.01223015, + "auxiliary_loss_mlp": 0.00219424, + "balance_loss_clip": 1.0134325, + "balance_loss_mlp": 0.19480726, + "epoch": 0.9898091086727792, + "flos": 13225146687360.0, + "grad_norm": 88.10375310957765, + "language_loss": 0.82813895, + "learning_rate": 1.0829382419126343e-09, + "loss": 0.84256327, + "num_input_tokens_seen": 355223055, + "router_z_loss_clip": 2.09667969, + "router_z_loss_mlp": 0.24597168, + "step": 16463, + "time_per_iteration": 2.6140565872192383 + }, + { + "auxiliary_loss_clip": 0.01232138, + "auxiliary_loss_mlp": 0.00225734, + "balance_loss_clip": 1.01764536, + "balance_loss_mlp": 0.20029427, + "epoch": 0.9898692319254472, + "flos": 22930400010240.0, + "grad_norm": 6.927904185855233, + "language_loss": 0.79052961, + "learning_rate": 1.0701614610675314e-09, + "loss": 0.80510837, + "num_input_tokens_seen": 355242000, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.25463867, + "step": 16464, + "time_per_iteration": 2.697918653488159 + }, + { + "auxiliary_loss_clip": 0.01250219, + "auxiliary_loss_mlp": 0.00205469, + "balance_loss_clip": 1.02887106, + "balance_loss_mlp": 0.18060169, + "epoch": 0.9899293551781151, + "flos": 12458346122880.0, + "grad_norm": 13.578977714351863, + "language_loss": 0.84849107, + "learning_rate": 1.0574604798421204e-09, + "loss": 0.8630479, + "num_input_tokens_seen": 355260175, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.2487793, + "step": 16465, + "time_per_iteration": 2.6960396766662598 + }, + { + "auxiliary_loss_clip": 0.01226297, + "auxiliary_loss_mlp": 0.00224391, + "balance_loss_clip": 1.01549768, + "balance_loss_mlp": 0.20104951, + "epoch": 0.9899894784307831, + "flos": 26871129118080.0, + "grad_norm": 8.72001707126764, + "language_loss": 0.92993367, + "learning_rate": 1.0448352987182386e-09, + "loss": 0.94444048, + "num_input_tokens_seen": 355281930, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.23339844, + "step": 16466, + "time_per_iteration": 2.6999242305755615 + }, + { + "auxiliary_loss_clip": 0.01254704, + "auxiliary_loss_mlp": 0.00228398, + "balance_loss_clip": 1.03706193, + "balance_loss_mlp": 0.20243391, + "epoch": 0.990049601683451, + "flos": 21542093395200.0, + "grad_norm": 18.53942164226143, + "language_loss": 0.80671507, + "learning_rate": 1.0322859181743915e-09, + "loss": 0.82154608, + "num_input_tokens_seen": 355301555, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.25964355, + "step": 16467, + "time_per_iteration": 2.7069664001464844 + }, + { + "auxiliary_loss_clip": 0.01225765, + "auxiliary_loss_mlp": 0.00231475, + "balance_loss_clip": 1.01404011, + "balance_loss_mlp": 0.20751387, + "epoch": 0.990109724936119, + "flos": 28771809287040.0, + "grad_norm": 137.13526532170113, + "language_loss": 0.70592195, + "learning_rate": 1.019812338686643e-09, + "loss": 0.72049439, + "num_input_tokens_seen": 355324925, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.23962402, + "step": 16468, + "time_per_iteration": 2.766195058822632 + }, + { + "auxiliary_loss_clip": 0.01234179, + "auxiliary_loss_mlp": 0.00219026, + "balance_loss_clip": 1.01648903, + "balance_loss_mlp": 0.19411097, + "epoch": 0.9901698481887871, + "flos": 29274270687360.0, + "grad_norm": 41.8644551709993, + "language_loss": 0.68848401, + "learning_rate": 1.0074145607281704e-09, + "loss": 0.70301604, + "num_input_tokens_seen": 355343875, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.24890137, + "step": 16469, + "time_per_iteration": 2.823916435241699 + }, + { + "auxiliary_loss_clip": 0.01237172, + "auxiliary_loss_mlp": 0.00212871, + "balance_loss_clip": 1.02054024, + "balance_loss_mlp": 0.18907705, + "epoch": 0.990229971441455, + "flos": 15959025711360.0, + "grad_norm": 2563.997388038879, + "language_loss": 0.83910179, + "learning_rate": 9.950925847685976e-10, + "loss": 0.85360223, + "num_input_tokens_seen": 355358835, + "router_z_loss_clip": 2.16308594, + "router_z_loss_mlp": 0.2376709, + "step": 16470, + "time_per_iteration": 2.6320197582244873 + }, + { + "auxiliary_loss_clip": 0.01088751, + "auxiliary_loss_mlp": 0.00106489, + "balance_loss_clip": 0.95048898, + "balance_loss_mlp": 0.09924134, + "epoch": 0.990290094694123, + "flos": 69780287911680.0, + "grad_norm": 0.6575699287367043, + "language_loss": 0.55110282, + "learning_rate": 9.828464112755509e-10, + "loss": 0.56305522, + "num_input_tokens_seen": 355431225, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.07226562, + "step": 16471, + "time_per_iteration": 3.362274169921875 + }, + { + "auxiliary_loss_clip": 0.01236642, + "auxiliary_loss_mlp": 0.00237883, + "balance_loss_clip": 1.01890516, + "balance_loss_mlp": 0.21152577, + "epoch": 0.9903502179467909, + "flos": 16252451913600.0, + "grad_norm": 36.594762281984714, + "language_loss": 0.9517501, + "learning_rate": 9.706760407131032e-10, + "loss": 0.96649528, + "num_input_tokens_seen": 355448250, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.26367188, + "step": 16472, + "time_per_iteration": 2.722848892211914 + }, + { + "auxiliary_loss_clip": 0.01254624, + "auxiliary_loss_mlp": 0.0022028, + "balance_loss_clip": 1.03299952, + "balance_loss_mlp": 0.19619986, + "epoch": 0.9904103411994589, + "flos": 21688393489920.0, + "grad_norm": 63.37112388152848, + "language_loss": 0.9490642, + "learning_rate": 9.585814735431075e-10, + "loss": 0.96381319, + "num_input_tokens_seen": 355467040, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.2409668, + "step": 16473, + "time_per_iteration": 2.689098834991455 + }, + { + "auxiliary_loss_clip": 0.01223879, + "auxiliary_loss_mlp": 0.0020678, + "balance_loss_clip": 1.01474881, + "balance_loss_mlp": 0.18459526, + "epoch": 0.9904704644521268, + "flos": 25739440243200.0, + "grad_norm": 12.987051515433265, + "language_loss": 0.91381192, + "learning_rate": 9.465627102240859e-10, + "loss": 0.92811853, + "num_input_tokens_seen": 355487825, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.22192383, + "step": 16474, + "time_per_iteration": 2.7350127696990967 + }, + { + "auxiliary_loss_clip": 0.01216367, + "auxiliary_loss_mlp": 0.0020451, + "balance_loss_clip": 1.00859213, + "balance_loss_mlp": 0.18194371, + "epoch": 0.9905305877047949, + "flos": 21908346422400.0, + "grad_norm": 24.81956710580109, + "language_loss": 0.83664644, + "learning_rate": 9.346197512116738e-10, + "loss": 0.85085523, + "num_input_tokens_seen": 355507445, + "router_z_loss_clip": 2.08105469, + "router_z_loss_mlp": 0.22558594, + "step": 16475, + "time_per_iteration": 4.0634543895721436 + }, + { + "auxiliary_loss_clip": 0.01231604, + "auxiliary_loss_mlp": 0.00227771, + "balance_loss_clip": 1.01784658, + "balance_loss_mlp": 0.20282093, + "epoch": 0.9905907109574628, + "flos": 21392417422080.0, + "grad_norm": 18.237719639998318, + "language_loss": 0.8119579, + "learning_rate": 9.227525969588423e-10, + "loss": 0.82655168, + "num_input_tokens_seen": 355527205, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.24975586, + "step": 16476, + "time_per_iteration": 4.107767820358276 + }, + { + "auxiliary_loss_clip": 0.01260448, + "auxiliary_loss_mlp": 0.0022137, + "balance_loss_clip": 1.03243864, + "balance_loss_mlp": 0.19430988, + "epoch": 0.9906508342101308, + "flos": 20521620005760.0, + "grad_norm": 10.135148583812596, + "language_loss": 0.77246821, + "learning_rate": 9.109612479154538e-10, + "loss": 0.7872864, + "num_input_tokens_seen": 355544740, + "router_z_loss_clip": 2.27636719, + "router_z_loss_mlp": 0.27050781, + "step": 16477, + "time_per_iteration": 2.6859357357025146 + }, + { + "auxiliary_loss_clip": 0.01249481, + "auxiliary_loss_mlp": 0.00246665, + "balance_loss_clip": 1.02520919, + "balance_loss_mlp": 0.21960446, + "epoch": 0.9907109574627987, + "flos": 21361211481600.0, + "grad_norm": 44.224488453388396, + "language_loss": 0.8175329, + "learning_rate": 8.992457045289282e-10, + "loss": 0.83249438, + "num_input_tokens_seen": 355564385, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.27075195, + "step": 16478, + "time_per_iteration": 2.6302011013031006 + }, + { + "auxiliary_loss_clip": 0.01236374, + "auxiliary_loss_mlp": 0.00209432, + "balance_loss_clip": 1.02226639, + "balance_loss_mlp": 0.18594778, + "epoch": 0.9907710807154667, + "flos": 17338605321600.0, + "grad_norm": 86.82168390300431, + "language_loss": 0.9258827, + "learning_rate": 8.876059672433545e-10, + "loss": 0.94034076, + "num_input_tokens_seen": 355579260, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.23486328, + "step": 16479, + "time_per_iteration": 2.672309160232544 + }, + { + "auxiliary_loss_clip": 0.01243947, + "auxiliary_loss_mlp": 0.00222094, + "balance_loss_clip": 1.02821696, + "balance_loss_mlp": 0.19834751, + "epoch": 0.9908312039681346, + "flos": 28621881918720.0, + "grad_norm": 68.05892358093139, + "language_loss": 0.74830759, + "learning_rate": 8.760420364999355e-10, + "loss": 0.76296806, + "num_input_tokens_seen": 355599790, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.23754883, + "step": 16480, + "time_per_iteration": 2.7031054496765137 + }, + { + "auxiliary_loss_clip": 0.01222342, + "auxiliary_loss_mlp": 0.00233177, + "balance_loss_clip": 1.01574147, + "balance_loss_mlp": 0.20975247, + "epoch": 0.9908913272208026, + "flos": 35770654512000.0, + "grad_norm": 6.3083010710251175, + "language_loss": 0.79272914, + "learning_rate": 8.645539127374313e-10, + "loss": 0.80728436, + "num_input_tokens_seen": 355620925, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.234375, + "step": 16481, + "time_per_iteration": 2.8135292530059814 + }, + { + "auxiliary_loss_clip": 0.01232996, + "auxiliary_loss_mlp": 0.00206936, + "balance_loss_clip": 1.0225544, + "balance_loss_mlp": 0.18308258, + "epoch": 0.9909514504734707, + "flos": 19902196944000.0, + "grad_norm": 51.62188905973563, + "language_loss": 0.86403012, + "learning_rate": 8.531415963912713e-10, + "loss": 0.87842953, + "num_input_tokens_seen": 355639165, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.23840332, + "step": 16482, + "time_per_iteration": 2.6286721229553223 + }, + { + "auxiliary_loss_clip": 0.01235073, + "auxiliary_loss_mlp": 0.00202759, + "balance_loss_clip": 1.01591098, + "balance_loss_mlp": 0.17767732, + "epoch": 0.9910115737261386, + "flos": 20004793165440.0, + "grad_norm": 6.24750070624695, + "language_loss": 0.83844936, + "learning_rate": 8.418050878944427e-10, + "loss": 0.85282767, + "num_input_tokens_seen": 355657320, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.25073242, + "step": 16483, + "time_per_iteration": 2.671978712081909 + }, + { + "auxiliary_loss_clip": 0.010865, + "auxiliary_loss_mlp": 0.00122258, + "balance_loss_clip": 0.94951642, + "balance_loss_mlp": 0.1139606, + "epoch": 0.9910716969788066, + "flos": 70688432494080.0, + "grad_norm": 0.702064029248106, + "language_loss": 0.5328595, + "learning_rate": 8.305443876768237e-10, + "loss": 0.54494703, + "num_input_tokens_seen": 355726370, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.08300781, + "step": 16484, + "time_per_iteration": 4.759593725204468 + }, + { + "auxiliary_loss_clip": 0.01216224, + "auxiliary_loss_mlp": 0.0018326, + "balance_loss_clip": 1.00791478, + "balance_loss_mlp": 0.16034836, + "epoch": 0.9911318202314745, + "flos": 21434038306560.0, + "grad_norm": 2.9789052154267885, + "language_loss": 0.88739002, + "learning_rate": 8.19359496165184e-10, + "loss": 0.90138489, + "num_input_tokens_seen": 355745840, + "router_z_loss_clip": 2.08203125, + "router_z_loss_mlp": 0.22912598, + "step": 16485, + "time_per_iteration": 2.6591391563415527 + }, + { + "auxiliary_loss_clip": 0.012157, + "auxiliary_loss_mlp": 0.00208065, + "balance_loss_clip": 1.00800359, + "balance_loss_mlp": 0.18378167, + "epoch": 0.9911919434841425, + "flos": 19826820253440.0, + "grad_norm": 1939.1946951997734, + "language_loss": 0.8825385, + "learning_rate": 8.082504137836288e-10, + "loss": 0.89677614, + "num_input_tokens_seen": 355763385, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.24291992, + "step": 16486, + "time_per_iteration": 4.165836572647095 + }, + { + "auxiliary_loss_clip": 0.01253933, + "auxiliary_loss_mlp": 0.00222119, + "balance_loss_clip": 1.03351092, + "balance_loss_mlp": 0.19865896, + "epoch": 0.9912520667368104, + "flos": 41719364691840.0, + "grad_norm": 56.743499163267835, + "language_loss": 0.727633, + "learning_rate": 7.972171409538209e-10, + "loss": 0.74239355, + "num_input_tokens_seen": 355786075, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.234375, + "step": 16487, + "time_per_iteration": 2.8944039344787598 + }, + { + "auxiliary_loss_clip": 0.01224933, + "auxiliary_loss_mlp": 0.0022268, + "balance_loss_clip": 1.01569772, + "balance_loss_mlp": 0.19948146, + "epoch": 0.9913121899894785, + "flos": 23769668263680.0, + "grad_norm": 6.567665056829688, + "language_loss": 0.85840666, + "learning_rate": 7.862596780936481e-10, + "loss": 0.87288284, + "num_input_tokens_seen": 355806295, + "router_z_loss_clip": 2.09082031, + "router_z_loss_mlp": 0.23217773, + "step": 16488, + "time_per_iteration": 2.781287670135498 + }, + { + "auxiliary_loss_clip": 0.01270747, + "auxiliary_loss_mlp": 0.0023709, + "balance_loss_clip": 1.04078507, + "balance_loss_mlp": 0.21105474, + "epoch": 0.9913723132421464, + "flos": 23769668263680.0, + "grad_norm": 51.27256282801621, + "language_loss": 0.7835114, + "learning_rate": 7.753780256190001e-10, + "loss": 0.79858977, + "num_input_tokens_seen": 355825730, + "router_z_loss_clip": 2.29980469, + "router_z_loss_mlp": 0.26037598, + "step": 16489, + "time_per_iteration": 2.789767265319824 + }, + { + "auxiliary_loss_clip": 0.01088402, + "auxiliary_loss_mlp": 0.00108, + "balance_loss_clip": 0.94971979, + "balance_loss_mlp": 0.1006094, + "epoch": 0.9914324364948144, + "flos": 71267419820160.0, + "grad_norm": 0.5997531979115908, + "language_loss": 0.51975977, + "learning_rate": 7.645721839424357e-10, + "loss": 0.53172386, + "num_input_tokens_seen": 355891545, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.07373047, + "step": 16490, + "time_per_iteration": 3.305001974105835 + }, + { + "auxiliary_loss_clip": 0.01262552, + "auxiliary_loss_mlp": 0.00267257, + "balance_loss_clip": 1.03977883, + "balance_loss_mlp": 0.24000612, + "epoch": 0.9914925597474823, + "flos": 23695440808320.0, + "grad_norm": 2.345374789350484, + "language_loss": 0.81744695, + "learning_rate": 7.538421534734052e-10, + "loss": 0.83274496, + "num_input_tokens_seen": 355909920, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.27233887, + "step": 16491, + "time_per_iteration": 2.6855058670043945 + }, + { + "auxiliary_loss_clip": 0.01276683, + "auxiliary_loss_mlp": 0.00233155, + "balance_loss_clip": 1.04867268, + "balance_loss_mlp": 0.20558232, + "epoch": 0.9915526830001503, + "flos": 13433822749440.0, + "grad_norm": 39.87257292475765, + "language_loss": 0.80002749, + "learning_rate": 7.431879346191383e-10, + "loss": 0.81512582, + "num_input_tokens_seen": 355923130, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.27563477, + "step": 16492, + "time_per_iteration": 2.6651148796081543 + }, + { + "auxiliary_loss_clip": 0.01223799, + "auxiliary_loss_mlp": 0.00238253, + "balance_loss_clip": 1.0115298, + "balance_loss_mlp": 0.2121934, + "epoch": 0.9916128062528182, + "flos": 20740962407040.0, + "grad_norm": 11.344732675848794, + "language_loss": 0.77369201, + "learning_rate": 7.326095277837563e-10, + "loss": 0.78831255, + "num_input_tokens_seen": 355941960, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.26098633, + "step": 16493, + "time_per_iteration": 2.6620824337005615 + }, + { + "auxiliary_loss_clip": 0.01247373, + "auxiliary_loss_mlp": 0.0021201, + "balance_loss_clip": 1.02881646, + "balance_loss_mlp": 0.18661799, + "epoch": 0.9916729295054862, + "flos": 22487082353280.0, + "grad_norm": 30.812334382461028, + "language_loss": 0.80193341, + "learning_rate": 7.221069333678276e-10, + "loss": 0.81652725, + "num_input_tokens_seen": 355961640, + "router_z_loss_clip": 2.18652344, + "router_z_loss_mlp": 0.25402832, + "step": 16494, + "time_per_iteration": 2.742496967315674 + }, + { + "auxiliary_loss_clip": 0.01248836, + "auxiliary_loss_mlp": 0.00248078, + "balance_loss_clip": 1.02494371, + "balance_loss_mlp": 0.2219232, + "epoch": 0.9917330527581543, + "flos": 14792467708800.0, + "grad_norm": 26.481006820147865, + "language_loss": 0.7633431, + "learning_rate": 7.116801517701443e-10, + "loss": 0.77831221, + "num_input_tokens_seen": 355977980, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.26123047, + "step": 16495, + "time_per_iteration": 2.649563789367676 + }, + { + "auxiliary_loss_clip": 0.01088736, + "auxiliary_loss_mlp": 0.0010933, + "balance_loss_clip": 0.95109546, + "balance_loss_mlp": 0.10165288, + "epoch": 0.9917931760108222, + "flos": 59191595585280.0, + "grad_norm": 0.6927210930794526, + "language_loss": 0.52838516, + "learning_rate": 7.013291833859458e-10, + "loss": 0.54036587, + "num_input_tokens_seen": 356042900, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.07666016, + "step": 16496, + "time_per_iteration": 3.286686420440674 + }, + { + "auxiliary_loss_clip": 0.01243679, + "auxiliary_loss_mlp": 0.00235548, + "balance_loss_clip": 1.02640569, + "balance_loss_mlp": 0.20933348, + "epoch": 0.9918532992634902, + "flos": 26761637485440.0, + "grad_norm": 451.09802006460455, + "language_loss": 0.79210699, + "learning_rate": 6.91054028607585e-10, + "loss": 0.80689925, + "num_input_tokens_seen": 356063000, + "router_z_loss_clip": 2.17089844, + "router_z_loss_mlp": 0.2623291, + "step": 16497, + "time_per_iteration": 2.7016396522521973 + }, + { + "auxiliary_loss_clip": 0.0128441, + "auxiliary_loss_mlp": 0.00229553, + "balance_loss_clip": 1.05094624, + "balance_loss_mlp": 0.20237315, + "epoch": 0.9919134225161581, + "flos": 14975719920000.0, + "grad_norm": 9.372951219711378, + "language_loss": 0.92390704, + "learning_rate": 6.808546878249721e-10, + "loss": 0.93904674, + "num_input_tokens_seen": 356078130, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.27185059, + "step": 16498, + "time_per_iteration": 2.765265703201294 + }, + { + "auxiliary_loss_clip": 0.01250437, + "auxiliary_loss_mlp": 0.00251857, + "balance_loss_clip": 1.03353095, + "balance_loss_mlp": 0.22532137, + "epoch": 0.9919735457688261, + "flos": 27818201064960.0, + "grad_norm": 43.68345690115969, + "language_loss": 0.74326146, + "learning_rate": 6.707311614246869e-10, + "loss": 0.75828439, + "num_input_tokens_seen": 356101655, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.26538086, + "step": 16499, + "time_per_iteration": 2.740226984024048 + }, + { + "auxiliary_loss_clip": 0.01245327, + "auxiliary_loss_mlp": 0.0023696, + "balance_loss_clip": 1.02677989, + "balance_loss_mlp": 0.21135369, + "epoch": 0.992033669021494, + "flos": 22562782266240.0, + "grad_norm": 1082.2475868752913, + "language_loss": 0.89712691, + "learning_rate": 6.606834497904223e-10, + "loss": 0.91194975, + "num_input_tokens_seen": 356121425, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.25610352, + "step": 16500, + "time_per_iteration": 2.649876117706299 + }, + { + "auxiliary_loss_clip": 0.01225527, + "auxiliary_loss_mlp": 0.0022061, + "balance_loss_clip": 1.01183832, + "balance_loss_mlp": 0.19699436, + "epoch": 0.9920937922741621, + "flos": 25374587846400.0, + "grad_norm": 14.609997346629363, + "language_loss": 0.91276085, + "learning_rate": 6.507115533036511e-10, + "loss": 0.92722219, + "num_input_tokens_seen": 356140710, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.23596191, + "step": 16501, + "time_per_iteration": 2.700427293777466 + }, + { + "auxiliary_loss_clip": 0.01238807, + "auxiliary_loss_mlp": 0.00216541, + "balance_loss_clip": 1.02098107, + "balance_loss_mlp": 0.1937838, + "epoch": 0.99215391552683, + "flos": 22054466949120.0, + "grad_norm": 19.84579360401703, + "language_loss": 0.85513222, + "learning_rate": 6.408154723420711e-10, + "loss": 0.86968565, + "num_input_tokens_seen": 356159835, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.22766113, + "step": 16502, + "time_per_iteration": 2.663682222366333 + }, + { + "auxiliary_loss_clip": 0.01252836, + "auxiliary_loss_mlp": 0.00223564, + "balance_loss_clip": 1.03765345, + "balance_loss_mlp": 0.1992806, + "epoch": 0.992214038779498, + "flos": 15413937845760.0, + "grad_norm": 195.8174558279032, + "language_loss": 0.83684552, + "learning_rate": 6.309952072811597e-10, + "loss": 0.85160953, + "num_input_tokens_seen": 356177555, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.24291992, + "step": 16503, + "time_per_iteration": 2.724957227706909 + }, + { + "auxiliary_loss_clip": 0.01085114, + "auxiliary_loss_mlp": 0.00071305, + "balance_loss_clip": 0.94821107, + "balance_loss_mlp": 0.06448591, + "epoch": 0.9922741620321659, + "flos": 62014498467840.0, + "grad_norm": 0.6367992032610935, + "language_loss": 0.54621208, + "learning_rate": 6.212507584932858e-10, + "loss": 0.55777633, + "num_input_tokens_seen": 356244975, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.06835938, + "step": 16504, + "time_per_iteration": 3.2451038360595703 + }, + { + "auxiliary_loss_clip": 0.01233876, + "auxiliary_loss_mlp": 0.00225205, + "balance_loss_clip": 1.01755095, + "balance_loss_mlp": 0.19955149, + "epoch": 0.9923342852848339, + "flos": 17165480745600.0, + "grad_norm": 4.3926460890777985, + "language_loss": 0.77783507, + "learning_rate": 6.115821263481536e-10, + "loss": 0.79242587, + "num_input_tokens_seen": 356262605, + "router_z_loss_clip": 2.16503906, + "router_z_loss_mlp": 0.2565918, + "step": 16505, + "time_per_iteration": 2.863426446914673 + }, + { + "auxiliary_loss_clip": 0.01258116, + "auxiliary_loss_mlp": 0.00244128, + "balance_loss_clip": 1.03209043, + "balance_loss_mlp": 0.21533938, + "epoch": 0.9923944085375018, + "flos": 23183210908800.0, + "grad_norm": 10.568142206761422, + "language_loss": 0.75122857, + "learning_rate": 6.019893112119146e-10, + "loss": 0.76625097, + "num_input_tokens_seen": 356278935, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.28771973, + "step": 16506, + "time_per_iteration": 2.701571464538574 + }, + { + "auxiliary_loss_clip": 0.01229873, + "auxiliary_loss_mlp": 0.00208499, + "balance_loss_clip": 1.0131371, + "balance_loss_mlp": 0.18402497, + "epoch": 0.9924545317901698, + "flos": 20813861059200.0, + "grad_norm": 69.61601861732785, + "language_loss": 0.71377832, + "learning_rate": 5.924723134487219e-10, + "loss": 0.72816205, + "num_input_tokens_seen": 356295675, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.24450684, + "step": 16507, + "time_per_iteration": 2.669567108154297 + }, + { + "auxiliary_loss_clip": 0.01242046, + "auxiliary_loss_mlp": 0.00232625, + "balance_loss_clip": 1.02399099, + "balance_loss_mlp": 0.20630321, + "epoch": 0.9925146550428379, + "flos": 20083437993600.0, + "grad_norm": 6.307447943007849, + "language_loss": 0.81269348, + "learning_rate": 5.830311334193983e-10, + "loss": 0.82744014, + "num_input_tokens_seen": 356312885, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.26306152, + "step": 16508, + "time_per_iteration": 2.677767276763916 + }, + { + "auxiliary_loss_clip": 0.01235445, + "auxiliary_loss_mlp": 0.00227328, + "balance_loss_clip": 1.02018547, + "balance_loss_mlp": 0.20314057, + "epoch": 0.9925747782955058, + "flos": 24973717086720.0, + "grad_norm": 23.922033594928056, + "language_loss": 0.75346953, + "learning_rate": 5.736657714818793e-10, + "loss": 0.76809728, + "num_input_tokens_seen": 356334070, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.24206543, + "step": 16509, + "time_per_iteration": 2.688413143157959 + }, + { + "auxiliary_loss_clip": 0.01238567, + "auxiliary_loss_mlp": 0.00244894, + "balance_loss_clip": 1.02387702, + "balance_loss_mlp": 0.21850082, + "epoch": 0.9926349015481738, + "flos": 60472526492160.0, + "grad_norm": 25.99686019039775, + "language_loss": 0.77343249, + "learning_rate": 5.643762279912146e-10, + "loss": 0.78826702, + "num_input_tokens_seen": 356359410, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.26391602, + "step": 16510, + "time_per_iteration": 3.0585341453552246 + }, + { + "auxiliary_loss_clip": 0.01271799, + "auxiliary_loss_mlp": 0.00261881, + "balance_loss_clip": 1.04779255, + "balance_loss_mlp": 0.23445086, + "epoch": 0.9926950248008417, + "flos": 20741716592640.0, + "grad_norm": 14.19674017905556, + "language_loss": 0.91134918, + "learning_rate": 5.551625032997886e-10, + "loss": 0.92668605, + "num_input_tokens_seen": 356378345, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.2746582, + "step": 16511, + "time_per_iteration": 2.6557037830352783 + }, + { + "auxiliary_loss_clip": 0.01225801, + "auxiliary_loss_mlp": 0.00243388, + "balance_loss_clip": 1.01189685, + "balance_loss_mlp": 0.21929535, + "epoch": 0.9927551480535097, + "flos": 24352965221760.0, + "grad_norm": 21.78224778428894, + "language_loss": 0.98380369, + "learning_rate": 5.460245977570998e-10, + "loss": 0.99849558, + "num_input_tokens_seen": 356397345, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.24084473, + "step": 16512, + "time_per_iteration": 2.7030177116394043 + }, + { + "auxiliary_loss_clip": 0.01089314, + "auxiliary_loss_mlp": 0.00078893, + "balance_loss_clip": 0.95154464, + "balance_loss_mlp": 0.07207419, + "epoch": 0.9928152713061776, + "flos": 71275572207360.0, + "grad_norm": 0.665306866476308, + "language_loss": 0.54098523, + "learning_rate": 5.369625117095378e-10, + "loss": 0.55266726, + "num_input_tokens_seen": 356459160, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.06835938, + "step": 16513, + "time_per_iteration": 3.246744155883789 + }, + { + "auxiliary_loss_clip": 0.012396, + "auxiliary_loss_mlp": 0.00220273, + "balance_loss_clip": 1.02449322, + "balance_loss_mlp": 0.1963481, + "epoch": 0.9928753945588457, + "flos": 57809499045120.0, + "grad_norm": 15.610215247421506, + "language_loss": 0.71279252, + "learning_rate": 5.279762455006054e-10, + "loss": 0.7273913, + "num_input_tokens_seen": 356486405, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.23937988, + "step": 16514, + "time_per_iteration": 3.0181102752685547 + }, + { + "auxiliary_loss_clip": 0.01250774, + "auxiliary_loss_mlp": 0.00232304, + "balance_loss_clip": 1.02942741, + "balance_loss_mlp": 0.20631675, + "epoch": 0.9929355178115136, + "flos": 19568981450880.0, + "grad_norm": 32.56298812599633, + "language_loss": 0.82532811, + "learning_rate": 5.190657994713632e-10, + "loss": 0.84015894, + "num_input_tokens_seen": 356502905, + "router_z_loss_clip": 2.21386719, + "router_z_loss_mlp": 0.25964355, + "step": 16515, + "time_per_iteration": 2.688469648361206 + }, + { + "auxiliary_loss_clip": 0.01238723, + "auxiliary_loss_mlp": 0.0020308, + "balance_loss_clip": 1.02326679, + "balance_loss_mlp": 0.1787971, + "epoch": 0.9929956410641816, + "flos": 22964658606720.0, + "grad_norm": 5.067708829560748, + "language_loss": 0.83268368, + "learning_rate": 5.102311739593191e-10, + "loss": 0.84710175, + "num_input_tokens_seen": 356523830, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.24255371, + "step": 16516, + "time_per_iteration": 2.6610054969787598 + }, + { + "auxiliary_loss_clip": 0.01236034, + "auxiliary_loss_mlp": 0.00221713, + "balance_loss_clip": 1.02134478, + "balance_loss_mlp": 0.19725154, + "epoch": 0.9930557643168495, + "flos": 22566409539840.0, + "grad_norm": 15.255650842330422, + "language_loss": 0.83660555, + "learning_rate": 5.014723692997602e-10, + "loss": 0.851183, + "num_input_tokens_seen": 356543965, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.24450684, + "step": 16517, + "time_per_iteration": 4.103203058242798 + }, + { + "auxiliary_loss_clip": 0.01252495, + "auxiliary_loss_mlp": 0.00230877, + "balance_loss_clip": 1.03038847, + "balance_loss_mlp": 0.20306534, + "epoch": 0.9931158875695175, + "flos": 17201032231680.0, + "grad_norm": 11.76157041536115, + "language_loss": 0.78712457, + "learning_rate": 4.927893858248655e-10, + "loss": 0.80195826, + "num_input_tokens_seen": 356561530, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.27807617, + "step": 16518, + "time_per_iteration": 2.6290438175201416 + }, + { + "auxiliary_loss_clip": 0.01084361, + "auxiliary_loss_mlp": 0.00065596, + "balance_loss_clip": 0.94700444, + "balance_loss_mlp": 0.05920672, + "epoch": 0.9931760108221854, + "flos": 63711204278400.0, + "grad_norm": 0.7375659773293618, + "language_loss": 0.52797472, + "learning_rate": 4.84182223863483e-10, + "loss": 0.53947437, + "num_input_tokens_seen": 356616845, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.06396484, + "step": 16519, + "time_per_iteration": 4.490699291229248 + }, + { + "auxiliary_loss_clip": 0.01247899, + "auxiliary_loss_mlp": 0.00227513, + "balance_loss_clip": 1.0298419, + "balance_loss_mlp": 0.20201382, + "epoch": 0.9932361340748534, + "flos": 15304805349120.0, + "grad_norm": 3.346449943622986, + "language_loss": 0.6741367, + "learning_rate": 4.756508837426842e-10, + "loss": 0.68889081, + "num_input_tokens_seen": 356633560, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.25524902, + "step": 16520, + "time_per_iteration": 2.7774269580841064 + }, + { + "auxiliary_loss_clip": 0.01236239, + "auxiliary_loss_mlp": 0.00227908, + "balance_loss_clip": 1.02196121, + "balance_loss_mlp": 0.20313603, + "epoch": 0.9932962573275215, + "flos": 36064906727040.0, + "grad_norm": 3.235038121903276, + "language_loss": 0.70467424, + "learning_rate": 4.671953657853223e-10, + "loss": 0.71931571, + "num_input_tokens_seen": 356657600, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.24780273, + "step": 16521, + "time_per_iteration": 2.8796026706695557 + }, + { + "auxiliary_loss_clip": 0.01251033, + "auxiliary_loss_mlp": 0.00222094, + "balance_loss_clip": 1.03157949, + "balance_loss_mlp": 0.19427007, + "epoch": 0.9933563805801894, + "flos": 21470523546240.0, + "grad_norm": 268.6881163513795, + "language_loss": 0.81401491, + "learning_rate": 4.5881567031225145e-10, + "loss": 0.8287462, + "num_input_tokens_seen": 356675880, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.27832031, + "step": 16522, + "time_per_iteration": 2.63860821723938 + }, + { + "auxiliary_loss_clip": 0.01217505, + "auxiliary_loss_mlp": 0.00194987, + "balance_loss_clip": 1.00954056, + "balance_loss_mlp": 0.1734817, + "epoch": 0.9934165038328574, + "flos": 23986532626560.0, + "grad_norm": 17.733554998542473, + "language_loss": 0.78994232, + "learning_rate": 4.5051179764143964e-10, + "loss": 0.80406719, + "num_input_tokens_seen": 356696000, + "router_z_loss_clip": 2.08203125, + "router_z_loss_mlp": 0.21508789, + "step": 16523, + "time_per_iteration": 2.659036159515381 + }, + { + "auxiliary_loss_clip": 0.01250369, + "auxiliary_loss_mlp": 0.00228858, + "balance_loss_clip": 1.03021049, + "balance_loss_mlp": 0.20313291, + "epoch": 0.9934766270855253, + "flos": 21907807718400.0, + "grad_norm": 413.7155240636684, + "language_loss": 0.78849387, + "learning_rate": 4.422837480875241e-10, + "loss": 0.80328614, + "num_input_tokens_seen": 356716845, + "router_z_loss_clip": 2.20019531, + "router_z_loss_mlp": 0.25732422, + "step": 16524, + "time_per_iteration": 2.661891460418701 + }, + { + "auxiliary_loss_clip": 0.01241181, + "auxiliary_loss_mlp": 0.00230202, + "balance_loss_clip": 1.02653074, + "balance_loss_mlp": 0.20600277, + "epoch": 0.9935367503381933, + "flos": 17129139160320.0, + "grad_norm": 9.089336059557317, + "language_loss": 0.87728471, + "learning_rate": 4.341315219624775e-10, + "loss": 0.89199853, + "num_input_tokens_seen": 356732100, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.24182129, + "step": 16525, + "time_per_iteration": 2.655880928039551 + }, + { + "auxiliary_loss_clip": 0.01229739, + "auxiliary_loss_mlp": 0.00232114, + "balance_loss_clip": 1.01518941, + "balance_loss_mlp": 0.20957118, + "epoch": 0.9935968735908612, + "flos": 22346241125760.0, + "grad_norm": 46.41809797221321, + "language_loss": 0.84127688, + "learning_rate": 4.2605511957582995e-10, + "loss": 0.85589552, + "num_input_tokens_seen": 356751480, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.22509766, + "step": 16526, + "time_per_iteration": 2.6889169216156006 + }, + { + "auxiliary_loss_clip": 0.01212159, + "auxiliary_loss_mlp": 0.00199854, + "balance_loss_clip": 1.00658011, + "balance_loss_mlp": 0.17714453, + "epoch": 0.9936569968435293, + "flos": 29460539640960.0, + "grad_norm": 4.0628877623638875, + "language_loss": 0.78654516, + "learning_rate": 4.180545412333369e-10, + "loss": 0.80066538, + "num_input_tokens_seen": 356772650, + "router_z_loss_clip": 2.05273438, + "router_z_loss_mlp": 0.22717285, + "step": 16527, + "time_per_iteration": 4.133192777633667 + }, + { + "auxiliary_loss_clip": 0.01234067, + "auxiliary_loss_mlp": 0.00218054, + "balance_loss_clip": 1.01647925, + "balance_loss_mlp": 0.19481966, + "epoch": 0.9937171200961972, + "flos": 16544046522240.0, + "grad_norm": 21.51108404834354, + "language_loss": 0.88168871, + "learning_rate": 4.1012978723875547e-10, + "loss": 0.89620996, + "num_input_tokens_seen": 356788510, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.23242188, + "step": 16528, + "time_per_iteration": 4.021409749984741 + }, + { + "auxiliary_loss_clip": 0.01241714, + "auxiliary_loss_mlp": 0.00228304, + "balance_loss_clip": 1.02434254, + "balance_loss_mlp": 0.20096895, + "epoch": 0.9937772433488652, + "flos": 24390276474240.0, + "grad_norm": 5.924734428495689, + "language_loss": 0.80365825, + "learning_rate": 4.022808578922898e-10, + "loss": 0.81835842, + "num_input_tokens_seen": 356809115, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.27307129, + "step": 16529, + "time_per_iteration": 2.7030584812164307 + }, + { + "auxiliary_loss_clip": 0.01260959, + "auxiliary_loss_mlp": 0.00245837, + "balance_loss_clip": 1.03268123, + "balance_loss_mlp": 0.21814454, + "epoch": 0.9938373666015331, + "flos": 15669909141120.0, + "grad_norm": 28.876709035402538, + "language_loss": 0.73407495, + "learning_rate": 3.9450775349170186e-10, + "loss": 0.74914289, + "num_input_tokens_seen": 356826410, + "router_z_loss_clip": 2.28222656, + "router_z_loss_mlp": 0.27722168, + "step": 16530, + "time_per_iteration": 2.695976972579956 + }, + { + "auxiliary_loss_clip": 0.01246542, + "auxiliary_loss_mlp": 0.00229392, + "balance_loss_clip": 1.0310322, + "balance_loss_mlp": 0.20463207, + "epoch": 0.9938974898542011, + "flos": 19496190539520.0, + "grad_norm": 16.352853902482398, + "language_loss": 0.80081052, + "learning_rate": 3.8681047433186676e-10, + "loss": 0.81556988, + "num_input_tokens_seen": 356844990, + "router_z_loss_clip": 2.15527344, + "router_z_loss_mlp": 0.24768066, + "step": 16531, + "time_per_iteration": 2.666436195373535 + }, + { + "auxiliary_loss_clip": 0.01242317, + "auxiliary_loss_mlp": 0.00220054, + "balance_loss_clip": 1.02233791, + "balance_loss_mlp": 0.19413838, + "epoch": 0.993957613106869, + "flos": 26906896085760.0, + "grad_norm": 37.086312649465306, + "language_loss": 0.80915499, + "learning_rate": 3.791890207045512e-10, + "loss": 0.82377875, + "num_input_tokens_seen": 356866530, + "router_z_loss_clip": 2.19824219, + "router_z_loss_mlp": 0.25927734, + "step": 16532, + "time_per_iteration": 2.773681879043579 + }, + { + "auxiliary_loss_clip": 0.01215938, + "auxiliary_loss_mlp": 0.00235918, + "balance_loss_clip": 1.01108873, + "balance_loss_mlp": 0.21395996, + "epoch": 0.994017736359537, + "flos": 14939593816320.0, + "grad_norm": 55.54423628640272, + "language_loss": 0.79124629, + "learning_rate": 3.7164339289885717e-10, + "loss": 0.80576479, + "num_input_tokens_seen": 356884660, + "router_z_loss_clip": 2.04882812, + "router_z_loss_mlp": 0.21936035, + "step": 16533, + "time_per_iteration": 2.683507204055786 + }, + { + "auxiliary_loss_clip": 0.01263787, + "auxiliary_loss_mlp": 0.0025771, + "balance_loss_clip": 1.04028702, + "balance_loss_mlp": 0.22950517, + "epoch": 0.9940778596122051, + "flos": 15377883569280.0, + "grad_norm": 34.96823256752456, + "language_loss": 0.92600626, + "learning_rate": 3.641735912007782e-10, + "loss": 0.94122124, + "num_input_tokens_seen": 356900895, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.28222656, + "step": 16534, + "time_per_iteration": 2.6230242252349854 + }, + { + "auxiliary_loss_clip": 0.01207699, + "auxiliary_loss_mlp": 0.00194396, + "balance_loss_clip": 1.00696874, + "balance_loss_mlp": 0.17149633, + "epoch": 0.994137982864873, + "flos": 25228108183680.0, + "grad_norm": 6.1689319252438475, + "language_loss": 0.73476595, + "learning_rate": 3.567796158934211e-10, + "loss": 0.74878693, + "num_input_tokens_seen": 356920985, + "router_z_loss_clip": 2.0078125, + "router_z_loss_mlp": 0.22900391, + "step": 16535, + "time_per_iteration": 2.7016384601593018 + }, + { + "auxiliary_loss_clip": 0.01238051, + "auxiliary_loss_mlp": 0.00219705, + "balance_loss_clip": 1.02570534, + "balance_loss_mlp": 0.19520727, + "epoch": 0.994198106117541, + "flos": 18442140912000.0, + "grad_norm": 249.1881609087331, + "language_loss": 0.71938372, + "learning_rate": 3.4946146725767235e-10, + "loss": 0.73396122, + "num_input_tokens_seen": 356939800, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.24475098, + "step": 16536, + "time_per_iteration": 2.6551599502563477 + }, + { + "auxiliary_loss_clip": 0.01246292, + "auxiliary_loss_mlp": 0.00229449, + "balance_loss_clip": 1.03154373, + "balance_loss_mlp": 0.20471302, + "epoch": 0.9942582293702089, + "flos": 16654112772480.0, + "grad_norm": 7.446334202852814, + "language_loss": 0.872841, + "learning_rate": 3.4221914557064357e-10, + "loss": 0.88759851, + "num_input_tokens_seen": 356957780, + "router_z_loss_clip": 2.14941406, + "router_z_loss_mlp": 0.24743652, + "step": 16537, + "time_per_iteration": 2.6629555225372314 + }, + { + "auxiliary_loss_clip": 0.01249932, + "auxiliary_loss_mlp": 0.00241747, + "balance_loss_clip": 1.02849364, + "balance_loss_mlp": 0.2155807, + "epoch": 0.9943183526228769, + "flos": 21944580266880.0, + "grad_norm": 736.9505408900905, + "language_loss": 0.79420137, + "learning_rate": 3.35052651107004e-10, + "loss": 0.80911809, + "num_input_tokens_seen": 356979185, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.26184082, + "step": 16538, + "time_per_iteration": 2.6753618717193604 + }, + { + "auxiliary_loss_clip": 0.01239173, + "auxiliary_loss_mlp": 0.00221762, + "balance_loss_clip": 1.02561069, + "balance_loss_mlp": 0.19809903, + "epoch": 0.9943784758755448, + "flos": 23842566915840.0, + "grad_norm": 43.8959737107112, + "language_loss": 0.84435934, + "learning_rate": 3.2796198413853614e-10, + "loss": 0.85896868, + "num_input_tokens_seen": 356997735, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.23657227, + "step": 16539, + "time_per_iteration": 2.7180936336517334 + }, + { + "auxiliary_loss_clip": 0.01241723, + "auxiliary_loss_mlp": 0.00223174, + "balance_loss_clip": 1.02241075, + "balance_loss_mlp": 0.1991173, + "epoch": 0.9944385991282129, + "flos": 21469984842240.0, + "grad_norm": 12.887900147739076, + "language_loss": 0.81786144, + "learning_rate": 3.209471449341361e-10, + "loss": 0.83251035, + "num_input_tokens_seen": 357015660, + "router_z_loss_clip": 2.19433594, + "router_z_loss_mlp": 0.24060059, + "step": 16540, + "time_per_iteration": 2.630388021469116 + }, + { + "auxiliary_loss_clip": 0.01220305, + "auxiliary_loss_mlp": 0.00203473, + "balance_loss_clip": 1.00966227, + "balance_loss_mlp": 0.18088315, + "epoch": 0.9944987223808808, + "flos": 22927024131840.0, + "grad_norm": 1.814587393317536, + "language_loss": 0.83907974, + "learning_rate": 3.140081337600353e-10, + "loss": 0.8533175, + "num_input_tokens_seen": 357034800, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.22607422, + "step": 16541, + "time_per_iteration": 2.687471866607666 + }, + { + "auxiliary_loss_clip": 0.01231458, + "auxiliary_loss_mlp": 0.0021425, + "balance_loss_clip": 1.01880085, + "balance_loss_mlp": 0.19032449, + "epoch": 0.9945588456335488, + "flos": 22383013674240.0, + "grad_norm": 6736.877188587143, + "language_loss": 0.85630476, + "learning_rate": 3.0714495087891255e-10, + "loss": 0.87076181, + "num_input_tokens_seen": 357053785, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.23937988, + "step": 16542, + "time_per_iteration": 2.658353090286255 + }, + { + "auxiliary_loss_clip": 0.01266786, + "auxiliary_loss_mlp": 0.00246556, + "balance_loss_clip": 1.03963876, + "balance_loss_mlp": 0.21961501, + "epoch": 0.9946189688862167, + "flos": 21397517153280.0, + "grad_norm": 18.05348106060521, + "language_loss": 0.84559542, + "learning_rate": 3.0035759655122615e-10, + "loss": 0.86072886, + "num_input_tokens_seen": 357072025, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.26965332, + "step": 16543, + "time_per_iteration": 2.691941499710083 + }, + { + "auxiliary_loss_clip": 0.01255551, + "auxiliary_loss_mlp": 0.00224341, + "balance_loss_clip": 1.03724766, + "balance_loss_mlp": 0.19940251, + "epoch": 0.9946790921388847, + "flos": 12416545670400.0, + "grad_norm": 34.53069055293938, + "language_loss": 0.91206157, + "learning_rate": 2.9364607103454785e-10, + "loss": 0.92686045, + "num_input_tokens_seen": 357086960, + "router_z_loss_clip": 2.18261719, + "router_z_loss_mlp": 0.24951172, + "step": 16544, + "time_per_iteration": 2.6720240116119385 + }, + { + "auxiliary_loss_clip": 0.01228445, + "auxiliary_loss_mlp": 0.00203651, + "balance_loss_clip": 1.01948833, + "balance_loss_mlp": 0.17930859, + "epoch": 0.9947392153915526, + "flos": 19058295836160.0, + "grad_norm": 5.794105852599902, + "language_loss": 0.86749327, + "learning_rate": 2.870103745831187e-10, + "loss": 0.88181412, + "num_input_tokens_seen": 357105095, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.24353027, + "step": 16545, + "time_per_iteration": 2.696570634841919 + }, + { + "auxiliary_loss_clip": 0.01252518, + "auxiliary_loss_mlp": 0.00211528, + "balance_loss_clip": 1.03010106, + "balance_loss_mlp": 0.18885455, + "epoch": 0.9947993386442207, + "flos": 27308808339840.0, + "grad_norm": 38.54969671364069, + "language_loss": 0.79915905, + "learning_rate": 2.8045050744873733e-10, + "loss": 0.8137995, + "num_input_tokens_seen": 357125065, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.22692871, + "step": 16546, + "time_per_iteration": 2.7621941566467285 + }, + { + "auxiliary_loss_clip": 0.01227187, + "auxiliary_loss_mlp": 0.00218297, + "balance_loss_clip": 1.01581597, + "balance_loss_mlp": 0.1925718, + "epoch": 0.9948594618968887, + "flos": 20806498771200.0, + "grad_norm": 198.91047910188874, + "language_loss": 0.84345233, + "learning_rate": 2.739664698798716e-10, + "loss": 0.85790718, + "num_input_tokens_seen": 357141600, + "router_z_loss_clip": 2.11035156, + "router_z_loss_mlp": 0.25720215, + "step": 16547, + "time_per_iteration": 2.639575481414795 + }, + { + "auxiliary_loss_clip": 0.01231192, + "auxiliary_loss_mlp": 0.00232206, + "balance_loss_clip": 1.01576996, + "balance_loss_mlp": 0.2066716, + "epoch": 0.9949195851495566, + "flos": 23292953936640.0, + "grad_norm": 43.90621936781188, + "language_loss": 0.78009737, + "learning_rate": 2.67558262122769e-10, + "loss": 0.79473138, + "num_input_tokens_seen": 357157880, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.25537109, + "step": 16548, + "time_per_iteration": 2.6841349601745605 + }, + { + "auxiliary_loss_clip": 0.01240538, + "auxiliary_loss_mlp": 0.00217103, + "balance_loss_clip": 1.02895141, + "balance_loss_mlp": 0.19366659, + "epoch": 0.9949797084022246, + "flos": 18515470527360.0, + "grad_norm": 9.219354374492578, + "language_loss": 0.85068393, + "learning_rate": 2.6122588442012427e-10, + "loss": 0.86526024, + "num_input_tokens_seen": 357176705, + "router_z_loss_clip": 2.11621094, + "router_z_loss_mlp": 0.234375, + "step": 16549, + "time_per_iteration": 2.6400370597839355 + }, + { + "auxiliary_loss_clip": 0.01250084, + "auxiliary_loss_mlp": 0.00233914, + "balance_loss_clip": 1.03151083, + "balance_loss_mlp": 0.20691293, + "epoch": 0.9950398316548925, + "flos": 30407719328640.0, + "grad_norm": 118.4599220095371, + "language_loss": 0.8165369, + "learning_rate": 2.5496933701241177e-10, + "loss": 0.83137685, + "num_input_tokens_seen": 357197630, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.2701416, + "step": 16550, + "time_per_iteration": 2.7480826377868652 + }, + { + "auxiliary_loss_clip": 0.01237946, + "auxiliary_loss_mlp": 0.00217184, + "balance_loss_clip": 1.02177799, + "balance_loss_mlp": 0.19248411, + "epoch": 0.9950999549075605, + "flos": 19900868140800.0, + "grad_norm": 6.063127787408688, + "language_loss": 0.83576131, + "learning_rate": 2.4878862013655297e-10, + "loss": 0.85031259, + "num_input_tokens_seen": 357215445, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.24707031, + "step": 16551, + "time_per_iteration": 2.779247283935547 + }, + { + "auxiliary_loss_clip": 0.01203267, + "auxiliary_loss_mlp": 0.00211276, + "balance_loss_clip": 1.00003695, + "balance_loss_mlp": 0.18925828, + "epoch": 0.9951600781602284, + "flos": 17603555016960.0, + "grad_norm": 354.01264893102916, + "language_loss": 0.72995353, + "learning_rate": 2.426837340270271e-10, + "loss": 0.7440989, + "num_input_tokens_seen": 357234285, + "router_z_loss_clip": 2.03027344, + "router_z_loss_mlp": 0.22033691, + "step": 16552, + "time_per_iteration": 2.7281394004821777 + }, + { + "auxiliary_loss_clip": 0.01221615, + "auxiliary_loss_mlp": 0.00239444, + "balance_loss_clip": 1.00997329, + "balance_loss_mlp": 0.21547058, + "epoch": 0.9952202014128965, + "flos": 28950715952640.0, + "grad_norm": 5.600250806083065, + "language_loss": 0.87305117, + "learning_rate": 2.3665467891520465e-10, + "loss": 0.88766181, + "num_input_tokens_seen": 357257565, + "router_z_loss_clip": 2.11621094, + "router_z_loss_mlp": 0.23950195, + "step": 16553, + "time_per_iteration": 2.7295591831207275 + }, + { + "auxiliary_loss_clip": 0.01083027, + "auxiliary_loss_mlp": 0.00076406, + "balance_loss_clip": 0.94577932, + "balance_loss_mlp": 0.06953974, + "epoch": 0.9952803246655644, + "flos": 70810386145920.0, + "grad_norm": 0.9798022398788457, + "language_loss": 0.56611049, + "learning_rate": 2.3070145503001348e-10, + "loss": 0.57770479, + "num_input_tokens_seen": 357320205, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.06884766, + "step": 16554, + "time_per_iteration": 3.282322883605957 + }, + { + "auxiliary_loss_clip": 0.01238617, + "auxiliary_loss_mlp": 0.00219026, + "balance_loss_clip": 1.02302575, + "balance_loss_mlp": 0.19431348, + "epoch": 0.9953404479182324, + "flos": 21799070271360.0, + "grad_norm": 25.19005127189776, + "language_loss": 0.82095063, + "learning_rate": 2.24824062597051e-10, + "loss": 0.83552706, + "num_input_tokens_seen": 357340695, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.24731445, + "step": 16555, + "time_per_iteration": 2.692577600479126 + }, + { + "auxiliary_loss_clip": 0.01249703, + "auxiliary_loss_mlp": 0.00220412, + "balance_loss_clip": 1.02843344, + "balance_loss_mlp": 0.19393578, + "epoch": 0.9954005711709003, + "flos": 21937397546880.0, + "grad_norm": 15.96413358545849, + "language_loss": 0.92395854, + "learning_rate": 2.1902250183902793e-10, + "loss": 0.93865955, + "num_input_tokens_seen": 357357505, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.26477051, + "step": 16556, + "time_per_iteration": 2.685053825378418 + }, + { + "auxiliary_loss_clip": 0.01226517, + "auxiliary_loss_mlp": 0.00214344, + "balance_loss_clip": 1.01271999, + "balance_loss_mlp": 0.18783179, + "epoch": 0.9954606944235683, + "flos": 19354559212800.0, + "grad_norm": 2.6647194237411083, + "language_loss": 0.81331629, + "learning_rate": 2.132967729762125e-10, + "loss": 0.82772493, + "num_input_tokens_seen": 357375395, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.26525879, + "step": 16557, + "time_per_iteration": 2.759101390838623 + }, + { + "auxiliary_loss_clip": 0.01233433, + "auxiliary_loss_mlp": 0.00235975, + "balance_loss_clip": 1.02154195, + "balance_loss_mlp": 0.21026182, + "epoch": 0.9955208176762362, + "flos": 30518611591680.0, + "grad_norm": 19.381380807276656, + "language_loss": 0.83515757, + "learning_rate": 2.0764687622554233e-10, + "loss": 0.84985161, + "num_input_tokens_seen": 357397375, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.25683594, + "step": 16558, + "time_per_iteration": 2.75089168548584 + }, + { + "auxiliary_loss_clip": 0.01233752, + "auxiliary_loss_mlp": 0.0023462, + "balance_loss_clip": 1.0191406, + "balance_loss_mlp": 0.21036127, + "epoch": 0.9955809409289043, + "flos": 30008249199360.0, + "grad_norm": 1.9791932274768513, + "language_loss": 0.70890087, + "learning_rate": 2.0207281180129044e-10, + "loss": 0.72358453, + "num_input_tokens_seen": 357418880, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.24279785, + "step": 16559, + "time_per_iteration": 4.188792943954468 + }, + { + "auxiliary_loss_clip": 0.01241359, + "auxiliary_loss_mlp": 0.00244616, + "balance_loss_clip": 1.02277827, + "balance_loss_mlp": 0.21776983, + "epoch": 0.9956410641815723, + "flos": 21543278544000.0, + "grad_norm": 12.748680880236638, + "language_loss": 0.81753409, + "learning_rate": 1.965745799148433e-10, + "loss": 0.83239383, + "num_input_tokens_seen": 357438310, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.26831055, + "step": 16560, + "time_per_iteration": 2.727598190307617 + }, + { + "auxiliary_loss_clip": 0.0123281, + "auxiliary_loss_mlp": 0.00232795, + "balance_loss_clip": 1.01936352, + "balance_loss_mlp": 0.20833334, + "epoch": 0.9957011874342402, + "flos": 21689470897920.0, + "grad_norm": 18.231731519089898, + "language_loss": 0.86671293, + "learning_rate": 1.9115218077470073e-10, + "loss": 0.88136899, + "num_input_tokens_seen": 357457155, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.24475098, + "step": 16561, + "time_per_iteration": 4.2704758644104 + }, + { + "auxiliary_loss_clip": 0.01229128, + "auxiliary_loss_mlp": 0.00217329, + "balance_loss_clip": 1.02071369, + "balance_loss_mlp": 0.19279601, + "epoch": 0.9957613106869082, + "flos": 17702667619200.0, + "grad_norm": 12.206746929082898, + "language_loss": 0.72697496, + "learning_rate": 1.8580561458647614e-10, + "loss": 0.74143958, + "num_input_tokens_seen": 357468060, + "router_z_loss_clip": 2.0859375, + "router_z_loss_mlp": 0.24523926, + "step": 16562, + "time_per_iteration": 2.618220090866089 + }, + { + "auxiliary_loss_clip": 0.0125648, + "auxiliary_loss_mlp": 0.00228769, + "balance_loss_clip": 1.03812885, + "balance_loss_mlp": 0.20385432, + "epoch": 0.9958214339395761, + "flos": 30555994671360.0, + "grad_norm": 70.4109955026384, + "language_loss": 0.73124826, + "learning_rate": 1.805348815528962e-10, + "loss": 0.74610066, + "num_input_tokens_seen": 357489665, + "router_z_loss_clip": 2.18652344, + "router_z_loss_mlp": 0.24890137, + "step": 16563, + "time_per_iteration": 2.7206223011016846 + }, + { + "auxiliary_loss_clip": 0.01242874, + "auxiliary_loss_mlp": 0.00222604, + "balance_loss_clip": 1.02329481, + "balance_loss_mlp": 0.19777274, + "epoch": 0.9958815571922441, + "flos": 24169174306560.0, + "grad_norm": 2.6414132453819783, + "language_loss": 0.70556778, + "learning_rate": 1.7533998187380105e-10, + "loss": 0.72022259, + "num_input_tokens_seen": 357511975, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.24816895, + "step": 16564, + "time_per_iteration": 2.738818407058716 + }, + { + "auxiliary_loss_clip": 0.01241572, + "auxiliary_loss_mlp": 0.00209061, + "balance_loss_clip": 1.02289438, + "balance_loss_mlp": 0.1845994, + "epoch": 0.995941680444912, + "flos": 15487016065920.0, + "grad_norm": 6.667707215835814, + "language_loss": 0.81976861, + "learning_rate": 1.7022091574636633e-10, + "loss": 0.83427495, + "num_input_tokens_seen": 357529345, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.24475098, + "step": 16565, + "time_per_iteration": 2.6399688720703125 + }, + { + "auxiliary_loss_clip": 0.01229217, + "auxiliary_loss_mlp": 0.00211343, + "balance_loss_clip": 1.01464939, + "balance_loss_mlp": 0.18715498, + "epoch": 0.9960018036975801, + "flos": 18621227145600.0, + "grad_norm": 4.387997349070509, + "language_loss": 0.86823797, + "learning_rate": 1.6517768336443694e-10, + "loss": 0.88264358, + "num_input_tokens_seen": 357547615, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.24194336, + "step": 16566, + "time_per_iteration": 2.638319730758667 + }, + { + "auxiliary_loss_clip": 0.01226626, + "auxiliary_loss_mlp": 0.00216615, + "balance_loss_clip": 1.01545262, + "balance_loss_mlp": 0.19237974, + "epoch": 0.996061926950248, + "flos": 20084120352000.0, + "grad_norm": 4.131045265756462, + "language_loss": 0.77857691, + "learning_rate": 1.6021028491941535e-10, + "loss": 0.79300928, + "num_input_tokens_seen": 357567380, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.24243164, + "step": 16567, + "time_per_iteration": 2.650552272796631 + }, + { + "auxiliary_loss_clip": 0.01242411, + "auxiliary_loss_mlp": 0.00215419, + "balance_loss_clip": 1.02323961, + "balance_loss_mlp": 0.19118418, + "epoch": 0.996122050202916, + "flos": 24347829576960.0, + "grad_norm": 297.37611073071065, + "language_loss": 0.88468057, + "learning_rate": 1.5531872059959538e-10, + "loss": 0.89925885, + "num_input_tokens_seen": 357586435, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.24230957, + "step": 16568, + "time_per_iteration": 2.7046265602111816 + }, + { + "auxiliary_loss_clip": 0.01212038, + "auxiliary_loss_mlp": 0.0021773, + "balance_loss_clip": 1.00705624, + "balance_loss_mlp": 0.19403136, + "epoch": 0.9961821734555839, + "flos": 24199302839040.0, + "grad_norm": 18.69045864020417, + "language_loss": 0.88015926, + "learning_rate": 1.5050299059060634e-10, + "loss": 0.89445698, + "num_input_tokens_seen": 357604720, + "router_z_loss_clip": 2.04980469, + "router_z_loss_mlp": 0.23706055, + "step": 16569, + "time_per_iteration": 4.096976280212402 + }, + { + "auxiliary_loss_clip": 0.01225923, + "auxiliary_loss_mlp": 0.00231325, + "balance_loss_clip": 1.01978445, + "balance_loss_mlp": 0.20782936, + "epoch": 0.9962422967082519, + "flos": 22633741584000.0, + "grad_norm": 13.958920508058076, + "language_loss": 0.76898801, + "learning_rate": 1.457630950747468e-10, + "loss": 0.78356051, + "num_input_tokens_seen": 357622345, + "router_z_loss_clip": 2.06445312, + "router_z_loss_mlp": 0.23522949, + "step": 16570, + "time_per_iteration": 4.087749481201172 + }, + { + "auxiliary_loss_clip": 0.01215949, + "auxiliary_loss_mlp": 0.00181654, + "balance_loss_clip": 1.00635207, + "balance_loss_mlp": 0.1579428, + "epoch": 0.9963024199609198, + "flos": 26396030903040.0, + "grad_norm": 4.863374272225943, + "language_loss": 0.82557249, + "learning_rate": 1.4109903423209502e-10, + "loss": 0.83954859, + "num_input_tokens_seen": 357642710, + "router_z_loss_clip": 2.09667969, + "router_z_loss_mlp": 0.23718262, + "step": 16571, + "time_per_iteration": 2.7431440353393555 + }, + { + "auxiliary_loss_clip": 0.01241125, + "auxiliary_loss_mlp": 0.00242046, + "balance_loss_clip": 1.02764606, + "balance_loss_mlp": 0.21680962, + "epoch": 0.9963625432135879, + "flos": 16581537342720.0, + "grad_norm": 76.33164146272001, + "language_loss": 0.86799371, + "learning_rate": 1.3651080823939843e-10, + "loss": 0.88282543, + "num_input_tokens_seen": 357659870, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.25268555, + "step": 16572, + "time_per_iteration": 2.7001986503601074 + }, + { + "auxiliary_loss_clip": 0.01235518, + "auxiliary_loss_mlp": 0.00212113, + "balance_loss_clip": 1.02112651, + "balance_loss_mlp": 0.18684021, + "epoch": 0.9964226664662559, + "flos": 26468534505600.0, + "grad_norm": 114.31999327130974, + "language_loss": 0.78541636, + "learning_rate": 1.3199841727074e-10, + "loss": 0.79989266, + "num_input_tokens_seen": 357677075, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.25305176, + "step": 16573, + "time_per_iteration": 2.706543207168579 + }, + { + "auxiliary_loss_clip": 0.01272125, + "auxiliary_loss_mlp": 0.00245024, + "balance_loss_clip": 1.04142118, + "balance_loss_mlp": 0.21593669, + "epoch": 0.9964827897189238, + "flos": 27448320764160.0, + "grad_norm": 120.40077990328838, + "language_loss": 0.71303147, + "learning_rate": 1.275618614968721e-10, + "loss": 0.72820294, + "num_input_tokens_seen": 357696715, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.29089355, + "step": 16574, + "time_per_iteration": 2.774317502975464 + }, + { + "auxiliary_loss_clip": 0.01270942, + "auxiliary_loss_mlp": 0.00223624, + "balance_loss_clip": 1.04610467, + "balance_loss_mlp": 0.1961585, + "epoch": 0.9965429129715918, + "flos": 11721566350080.0, + "grad_norm": 15.764739332690324, + "language_loss": 0.87139094, + "learning_rate": 1.2320114108654856e-10, + "loss": 0.88633668, + "num_input_tokens_seen": 357712345, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.2746582, + "step": 16575, + "time_per_iteration": 2.7576189041137695 + }, + { + "auxiliary_loss_clip": 0.0123875, + "auxiliary_loss_mlp": 0.00231847, + "balance_loss_clip": 1.01826203, + "balance_loss_mlp": 0.2065749, + "epoch": 0.9966030362242597, + "flos": 19756004590080.0, + "grad_norm": 4.235594879055588, + "language_loss": 0.80493522, + "learning_rate": 1.1891625620474855e-10, + "loss": 0.81964123, + "num_input_tokens_seen": 357731815, + "router_z_loss_clip": 2.20410156, + "router_z_loss_mlp": 0.25268555, + "step": 16576, + "time_per_iteration": 2.7277677059173584 + }, + { + "auxiliary_loss_clip": 0.01230544, + "auxiliary_loss_mlp": 0.00208263, + "balance_loss_clip": 1.02124691, + "balance_loss_mlp": 0.18507686, + "epoch": 0.9966631594769277, + "flos": 23915178259200.0, + "grad_norm": 61.6547062776753, + "language_loss": 0.77934408, + "learning_rate": 1.1470720701400871e-10, + "loss": 0.79373217, + "num_input_tokens_seen": 357751640, + "router_z_loss_clip": 2.09277344, + "router_z_loss_mlp": 0.23181152, + "step": 16577, + "time_per_iteration": 2.8478941917419434 + }, + { + "auxiliary_loss_clip": 0.01246877, + "auxiliary_loss_mlp": 0.00229938, + "balance_loss_clip": 1.02783048, + "balance_loss_mlp": 0.20426027, + "epoch": 0.9967232827295956, + "flos": 15559591495680.0, + "grad_norm": 4.805930754237019, + "language_loss": 0.876863, + "learning_rate": 1.1057399367397912e-10, + "loss": 0.89163107, + "num_input_tokens_seen": 357769850, + "router_z_loss_clip": 2.19042969, + "router_z_loss_mlp": 0.25671387, + "step": 16578, + "time_per_iteration": 2.675095319747925 + }, + { + "auxiliary_loss_clip": 0.0123887, + "auxiliary_loss_mlp": 0.0022181, + "balance_loss_clip": 1.01881897, + "balance_loss_mlp": 0.19833767, + "epoch": 0.9967834059822637, + "flos": 20813035046400.0, + "grad_norm": 11.271707333796156, + "language_loss": 0.84350896, + "learning_rate": 1.0651661634142328e-10, + "loss": 0.85811579, + "num_input_tokens_seen": 357789550, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.23486328, + "step": 16579, + "time_per_iteration": 2.7197325229644775 + }, + { + "auxiliary_loss_clip": 0.01269765, + "auxiliary_loss_mlp": 0.00245666, + "balance_loss_clip": 1.04918456, + "balance_loss_mlp": 0.21982156, + "epoch": 0.9968435292349316, + "flos": 36719234830080.0, + "grad_norm": 12.863825556038911, + "language_loss": 0.77734882, + "learning_rate": 1.0253507516999604e-10, + "loss": 0.79250312, + "num_input_tokens_seen": 357809525, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.25842285, + "step": 16580, + "time_per_iteration": 2.800299882888794 + }, + { + "auxiliary_loss_clip": 0.01234298, + "auxiliary_loss_mlp": 0.00234767, + "balance_loss_clip": 1.01697159, + "balance_loss_mlp": 0.21087721, + "epoch": 0.9969036524875996, + "flos": 26760919213440.0, + "grad_norm": 224.80733546663268, + "language_loss": 0.87307882, + "learning_rate": 9.862937031113184e-11, + "loss": 0.88776946, + "num_input_tokens_seen": 357829795, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.2388916, + "step": 16581, + "time_per_iteration": 2.764221429824829 + }, + { + "auxiliary_loss_clip": 0.0121551, + "auxiliary_loss_mlp": 0.00209566, + "balance_loss_clip": 1.01035118, + "balance_loss_mlp": 0.18626085, + "epoch": 0.9969637757402675, + "flos": 24827237424000.0, + "grad_norm": 11.497046465882892, + "language_loss": 0.86911052, + "learning_rate": 9.479950191249031e-11, + "loss": 0.88336122, + "num_input_tokens_seen": 357851655, + "router_z_loss_clip": 2.05078125, + "router_z_loss_mlp": 0.23327637, + "step": 16582, + "time_per_iteration": 2.7055647373199463 + }, + { + "auxiliary_loss_clip": 0.01226184, + "auxiliary_loss_mlp": 0.00210405, + "balance_loss_clip": 1.01907778, + "balance_loss_mlp": 0.1874337, + "epoch": 0.9970238989929355, + "flos": 23038742407680.0, + "grad_norm": 8.67845671192374, + "language_loss": 0.68216878, + "learning_rate": 9.104547011951069e-11, + "loss": 0.69653469, + "num_input_tokens_seen": 357871205, + "router_z_loss_clip": 2.07226562, + "router_z_loss_mlp": 0.22949219, + "step": 16583, + "time_per_iteration": 2.7280802726745605 + }, + { + "auxiliary_loss_clip": 0.0122813, + "auxiliary_loss_mlp": 0.00218527, + "balance_loss_clip": 1.01613939, + "balance_loss_mlp": 0.19430397, + "epoch": 0.9970840222456034, + "flos": 25298816106240.0, + "grad_norm": 28.895955705319665, + "language_loss": 0.8458727, + "learning_rate": 8.736727507452357e-11, + "loss": 0.86033928, + "num_input_tokens_seen": 357892145, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.2421875, + "step": 16584, + "time_per_iteration": 2.760244846343994 + }, + { + "auxiliary_loss_clip": 0.01225375, + "auxiliary_loss_mlp": 0.00214366, + "balance_loss_clip": 1.01693368, + "balance_loss_mlp": 0.19246739, + "epoch": 0.9971441454982715, + "flos": 21615602578560.0, + "grad_norm": 85.73223505717876, + "language_loss": 0.76127326, + "learning_rate": 8.376491691697297e-11, + "loss": 0.77567065, + "num_input_tokens_seen": 357911205, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.21911621, + "step": 16585, + "time_per_iteration": 2.713378667831421 + }, + { + "auxiliary_loss_clip": 0.01246552, + "auxiliary_loss_mlp": 0.00218528, + "balance_loss_clip": 1.02888942, + "balance_loss_mlp": 0.19450778, + "epoch": 0.9972042687509394, + "flos": 14975612179200.0, + "grad_norm": 50.40521665268173, + "language_loss": 0.89362746, + "learning_rate": 8.023839578363834e-11, + "loss": 0.90827835, + "num_input_tokens_seen": 357928190, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.23999023, + "step": 16586, + "time_per_iteration": 2.720348358154297 + }, + { + "auxiliary_loss_clip": 0.01248909, + "auxiliary_loss_mlp": 0.00218561, + "balance_loss_clip": 1.02985525, + "balance_loss_mlp": 0.19361043, + "epoch": 0.9972643920036074, + "flos": 25806664546560.0, + "grad_norm": 40.11626285163719, + "language_loss": 0.85713166, + "learning_rate": 7.678771180796851e-11, + "loss": 0.87180638, + "num_input_tokens_seen": 357946985, + "router_z_loss_clip": 2.19238281, + "router_z_loss_mlp": 0.24963379, + "step": 16587, + "time_per_iteration": 2.72541880607605 + }, + { + "auxiliary_loss_clip": 0.01236196, + "auxiliary_loss_mlp": 0.00225766, + "balance_loss_clip": 1.02574849, + "balance_loss_mlp": 0.20012385, + "epoch": 0.9973245152562754, + "flos": 23326242865920.0, + "grad_norm": 33.82044944937642, + "language_loss": 0.79572654, + "learning_rate": 7.341286512074773e-11, + "loss": 0.81034619, + "num_input_tokens_seen": 357966720, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.25634766, + "step": 16588, + "time_per_iteration": 2.738438129425049 + }, + { + "auxiliary_loss_clip": 0.01255737, + "auxiliary_loss_mlp": 0.00213172, + "balance_loss_clip": 1.03069735, + "balance_loss_mlp": 0.18725547, + "epoch": 0.9973846385089433, + "flos": 12166212810240.0, + "grad_norm": 66.32120986831885, + "language_loss": 0.93798906, + "learning_rate": 7.011385585031781e-11, + "loss": 0.95267808, + "num_input_tokens_seen": 357981375, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.25939941, + "step": 16589, + "time_per_iteration": 2.6630773544311523 + }, + { + "auxiliary_loss_clip": 0.01257722, + "auxiliary_loss_mlp": 0.002453, + "balance_loss_clip": 1.032493, + "balance_loss_mlp": 0.21826319, + "epoch": 0.9974447617616113, + "flos": 20045157073920.0, + "grad_norm": 105.15835151509867, + "language_loss": 0.83371812, + "learning_rate": 6.689068412168986e-11, + "loss": 0.84874833, + "num_input_tokens_seen": 358000290, + "router_z_loss_clip": 2.25488281, + "router_z_loss_mlp": 0.27038574, + "step": 16590, + "time_per_iteration": 2.871161699295044 + }, + { + "auxiliary_loss_clip": 0.01239275, + "auxiliary_loss_mlp": 0.00218805, + "balance_loss_clip": 1.02329445, + "balance_loss_mlp": 0.19421199, + "epoch": 0.9975048850142793, + "flos": 32014614159360.0, + "grad_norm": 13.216788827906528, + "language_loss": 0.71428025, + "learning_rate": 6.374335005676634e-11, + "loss": 0.72886097, + "num_input_tokens_seen": 358022075, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.24597168, + "step": 16591, + "time_per_iteration": 2.7827186584472656 + }, + { + "auxiliary_loss_clip": 0.01216157, + "auxiliary_loss_mlp": 0.00242118, + "balance_loss_clip": 1.00438094, + "balance_loss_mlp": 0.21828815, + "epoch": 0.9975650082669473, + "flos": 36933728895360.0, + "grad_norm": 43.13138871391549, + "language_loss": 0.8043561, + "learning_rate": 6.067185377522933e-11, + "loss": 0.81893885, + "num_input_tokens_seen": 358043940, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.23815918, + "step": 16592, + "time_per_iteration": 2.8508174419403076 + }, + { + "auxiliary_loss_clip": 0.0125357, + "auxiliary_loss_mlp": 0.00237318, + "balance_loss_clip": 1.03837943, + "balance_loss_mlp": 0.21173602, + "epoch": 0.9976251315196152, + "flos": 16472117537280.0, + "grad_norm": 80.97008592739985, + "language_loss": 0.9208622, + "learning_rate": 5.767619539343016e-11, + "loss": 0.93577111, + "num_input_tokens_seen": 358062720, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.2557373, + "step": 16593, + "time_per_iteration": 2.690086841583252 + }, + { + "auxiliary_loss_clip": 0.01227598, + "auxiliary_loss_mlp": 0.0023413, + "balance_loss_clip": 1.01545095, + "balance_loss_mlp": 0.20882148, + "epoch": 0.9976852547722832, + "flos": 19646836179840.0, + "grad_norm": 1.9778662728342842, + "language_loss": 0.76737225, + "learning_rate": 5.4756375024833656e-11, + "loss": 0.78198951, + "num_input_tokens_seen": 358081560, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.25317383, + "step": 16594, + "time_per_iteration": 2.7187623977661133 + }, + { + "auxiliary_loss_clip": 0.01239083, + "auxiliary_loss_mlp": 0.00220407, + "balance_loss_clip": 1.02224064, + "balance_loss_mlp": 0.19580173, + "epoch": 0.9977453780249511, + "flos": 20448434044800.0, + "grad_norm": 34.84663259618439, + "language_loss": 0.82754183, + "learning_rate": 5.1912392780462113e-11, + "loss": 0.84213674, + "num_input_tokens_seen": 358099065, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.24597168, + "step": 16595, + "time_per_iteration": 2.684847831726074 + }, + { + "auxiliary_loss_clip": 0.01085348, + "auxiliary_loss_mlp": 0.00070982, + "balance_loss_clip": 0.94971257, + "balance_loss_mlp": 0.06440124, + "epoch": 0.9978055012776191, + "flos": 65455097581440.0, + "grad_norm": 0.7721913134771462, + "language_loss": 0.5953052, + "learning_rate": 4.9144248768007156e-11, + "loss": 0.60686851, + "num_input_tokens_seen": 358156095, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.06591797, + "step": 16596, + "time_per_iteration": 3.1451401710510254 + }, + { + "auxiliary_loss_clip": 0.01234748, + "auxiliary_loss_mlp": 0.00229013, + "balance_loss_clip": 1.0217495, + "balance_loss_mlp": 0.20407426, + "epoch": 0.997865624530287, + "flos": 20631506688000.0, + "grad_norm": 91.71640841262044, + "language_loss": 0.8545481, + "learning_rate": 4.645194309227385e-11, + "loss": 0.86918581, + "num_input_tokens_seen": 358175230, + "router_z_loss_clip": 2.12988281, + "router_z_loss_mlp": 0.24914551, + "step": 16597, + "time_per_iteration": 2.815448760986328 + }, + { + "auxiliary_loss_clip": 0.01245414, + "auxiliary_loss_mlp": 0.00249797, + "balance_loss_clip": 1.02788365, + "balance_loss_mlp": 0.22400045, + "epoch": 0.9979257477829551, + "flos": 29387102284800.0, + "grad_norm": 18.26954040829393, + "language_loss": 0.88949037, + "learning_rate": 4.383547585562475e-11, + "loss": 0.90444243, + "num_input_tokens_seen": 358197075, + "router_z_loss_clip": 2.17675781, + "router_z_loss_mlp": 0.25793457, + "step": 16598, + "time_per_iteration": 2.745711088180542 + }, + { + "auxiliary_loss_clip": 0.01272956, + "auxiliary_loss_mlp": 0.00256252, + "balance_loss_clip": 1.04042387, + "balance_loss_mlp": 0.22795135, + "epoch": 0.997985871035623, + "flos": 22635070387200.0, + "grad_norm": 416.70286109257705, + "language_loss": 0.73960924, + "learning_rate": 4.129484715709175e-11, + "loss": 0.75490141, + "num_input_tokens_seen": 358215925, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.28320312, + "step": 16599, + "time_per_iteration": 2.7535345554351807 + }, + { + "auxiliary_loss_clip": 0.01085617, + "auxiliary_loss_mlp": 0.00063743, + "balance_loss_clip": 0.94962943, + "balance_loss_mlp": 0.05725785, + "epoch": 0.998045994288291, + "flos": 61806968663040.0, + "grad_norm": 0.8514978621142071, + "language_loss": 0.61087346, + "learning_rate": 3.8830057093264256e-11, + "loss": 0.62236702, + "num_input_tokens_seen": 358269035, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.06494141, + "step": 16600, + "time_per_iteration": 3.010524034500122 + }, + { + "auxiliary_loss_clip": 0.012317, + "auxiliary_loss_mlp": 0.00230307, + "balance_loss_clip": 1.01969457, + "balance_loss_mlp": 0.20637017, + "epoch": 0.998106117540959, + "flos": 19245534456960.0, + "grad_norm": 5.26331455980335, + "language_loss": 0.83937508, + "learning_rate": 3.644110575717896e-11, + "loss": 0.85399514, + "num_input_tokens_seen": 358287680, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.23925781, + "step": 16601, + "time_per_iteration": 2.7195663452148438 + }, + { + "auxiliary_loss_clip": 0.01251561, + "auxiliary_loss_mlp": 0.00235573, + "balance_loss_clip": 1.0301671, + "balance_loss_mlp": 0.20856048, + "epoch": 0.9981662407936269, + "flos": 21106209853440.0, + "grad_norm": 43.03967984297676, + "language_loss": 0.90042174, + "learning_rate": 3.412799323987414e-11, + "loss": 0.9152931, + "num_input_tokens_seen": 358304080, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.2701416, + "step": 16602, + "time_per_iteration": 4.1708009243011475 + }, + { + "auxiliary_loss_clip": 0.01254136, + "auxiliary_loss_mlp": 0.00238026, + "balance_loss_clip": 1.03659689, + "balance_loss_mlp": 0.21433963, + "epoch": 0.998226364046295, + "flos": 24316839118080.0, + "grad_norm": 189.34743735927208, + "language_loss": 0.70251256, + "learning_rate": 3.189071962883538e-11, + "loss": 0.71743417, + "num_input_tokens_seen": 358323670, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.23706055, + "step": 16603, + "time_per_iteration": 4.102585077285767 + }, + { + "auxiliary_loss_clip": 0.01255078, + "auxiliary_loss_mlp": 0.00223068, + "balance_loss_clip": 1.03723168, + "balance_loss_mlp": 0.19682986, + "epoch": 0.9982864872989629, + "flos": 23836389776640.0, + "grad_norm": 2.772419191033544, + "language_loss": 0.80179638, + "learning_rate": 2.972928500866168e-11, + "loss": 0.81657785, + "num_input_tokens_seen": 358341980, + "router_z_loss_clip": 2.18066406, + "router_z_loss_mlp": 0.26245117, + "step": 16604, + "time_per_iteration": 2.6704256534576416 + }, + { + "auxiliary_loss_clip": 0.01232769, + "auxiliary_loss_mlp": 0.00233147, + "balance_loss_clip": 1.0159657, + "balance_loss_mlp": 0.20774341, + "epoch": 0.9983466105516309, + "flos": 18333116156160.0, + "grad_norm": 104.84921817012014, + "language_loss": 0.73264331, + "learning_rate": 2.7643689461953613e-11, + "loss": 0.74730247, + "num_input_tokens_seen": 358360400, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.25415039, + "step": 16605, + "time_per_iteration": 2.6932003498077393 + }, + { + "auxiliary_loss_clip": 0.01216271, + "auxiliary_loss_mlp": 0.0021426, + "balance_loss_clip": 1.01156545, + "balance_loss_mlp": 0.1919795, + "epoch": 0.9984067338042988, + "flos": 17236763285760.0, + "grad_norm": 8.275200020715154, + "language_loss": 0.76782995, + "learning_rate": 2.5633933067092938e-11, + "loss": 0.78213525, + "num_input_tokens_seen": 358378990, + "router_z_loss_clip": 2.04980469, + "router_z_loss_mlp": 0.22277832, + "step": 16606, + "time_per_iteration": 2.658618211746216 + }, + { + "auxiliary_loss_clip": 0.01224974, + "auxiliary_loss_mlp": 0.00223893, + "balance_loss_clip": 1.01523137, + "balance_loss_mlp": 0.20055167, + "epoch": 0.9984668570569668, + "flos": 20667884186880.0, + "grad_norm": 169.45549261058227, + "language_loss": 0.90447199, + "learning_rate": 2.370001590090709e-11, + "loss": 0.91896069, + "num_input_tokens_seen": 358395970, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.23352051, + "step": 16607, + "time_per_iteration": 2.7428290843963623 + }, + { + "auxiliary_loss_clip": 0.01243133, + "auxiliary_loss_mlp": 0.00226519, + "balance_loss_clip": 1.02162552, + "balance_loss_mlp": 0.20048413, + "epoch": 0.9985269803096347, + "flos": 30262532555520.0, + "grad_norm": 34.7064364922944, + "language_loss": 0.74430275, + "learning_rate": 2.184193803622669e-11, + "loss": 0.75899935, + "num_input_tokens_seen": 358417355, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.26025391, + "step": 16608, + "time_per_iteration": 2.7291312217712402 + }, + { + "auxiliary_loss_clip": 0.01244234, + "auxiliary_loss_mlp": 0.0022159, + "balance_loss_clip": 1.02608705, + "balance_loss_mlp": 0.19603148, + "epoch": 0.9985871035623027, + "flos": 10560970005120.0, + "grad_norm": 8.07253556070073, + "language_loss": 0.91101086, + "learning_rate": 2.0059699543883978e-11, + "loss": 0.92566907, + "num_input_tokens_seen": 358434345, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.25561523, + "step": 16609, + "time_per_iteration": 2.6860036849975586 + }, + { + "auxiliary_loss_clip": 0.0121751, + "auxiliary_loss_mlp": 0.00216224, + "balance_loss_clip": 1.00841737, + "balance_loss_mlp": 0.19332406, + "epoch": 0.9986472268149706, + "flos": 16873455173760.0, + "grad_norm": 108.56073159261118, + "language_loss": 0.69237393, + "learning_rate": 1.8353300491158462e-11, + "loss": 0.70671129, + "num_input_tokens_seen": 358452870, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.22888184, + "step": 16610, + "time_per_iteration": 2.6298043727874756 + }, + { + "auxiliary_loss_clip": 0.01233943, + "auxiliary_loss_mlp": 0.00208072, + "balance_loss_clip": 1.01778078, + "balance_loss_mlp": 0.18427771, + "epoch": 0.9987073500676387, + "flos": 22054538776320.0, + "grad_norm": 8.024679860661672, + "language_loss": 0.75596386, + "learning_rate": 1.672274094288717e-11, + "loss": 0.77038395, + "num_input_tokens_seen": 358472210, + "router_z_loss_clip": 2.15917969, + "router_z_loss_mlp": 0.23791504, + "step": 16611, + "time_per_iteration": 4.084531307220459 + }, + { + "auxiliary_loss_clip": 0.01240438, + "auxiliary_loss_mlp": 0.00230101, + "balance_loss_clip": 1.02536023, + "balance_loss_mlp": 0.20591329, + "epoch": 0.9987674733203066, + "flos": 30482880537600.0, + "grad_norm": 20.16336911341428, + "language_loss": 0.77149248, + "learning_rate": 1.5168020961020544e-11, + "loss": 0.78619784, + "num_input_tokens_seen": 358493840, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.24194336, + "step": 16612, + "time_per_iteration": 4.161989212036133 + }, + { + "auxiliary_loss_clip": 0.01225455, + "auxiliary_loss_mlp": 0.00220386, + "balance_loss_clip": 1.01623535, + "balance_loss_mlp": 0.19800995, + "epoch": 0.9988275965729746, + "flos": 27745230585600.0, + "grad_norm": 126.13989831645686, + "language_loss": 0.81432384, + "learning_rate": 1.3689140604400407e-11, + "loss": 0.8287822, + "num_input_tokens_seen": 358515060, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.22375488, + "step": 16613, + "time_per_iteration": 2.78298020362854 + }, + { + "auxiliary_loss_clip": 0.01237675, + "auxiliary_loss_mlp": 0.00222662, + "balance_loss_clip": 1.01965547, + "balance_loss_mlp": 0.19826031, + "epoch": 0.9988877198256426, + "flos": 17524191916800.0, + "grad_norm": 29.270666894562073, + "language_loss": 0.80658853, + "learning_rate": 1.2286099928981996e-11, + "loss": 0.82119191, + "num_input_tokens_seen": 358528200, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.24389648, + "step": 16614, + "time_per_iteration": 2.662229299545288 + }, + { + "auxiliary_loss_clip": 0.01234397, + "auxiliary_loss_mlp": 0.00232046, + "balance_loss_clip": 1.02368605, + "balance_loss_mlp": 0.20736995, + "epoch": 0.9989478430783105, + "flos": 20996502739200.0, + "grad_norm": 1572.1607684513924, + "language_loss": 0.78420424, + "learning_rate": 1.0958898988278065e-11, + "loss": 0.79886866, + "num_input_tokens_seen": 358548360, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.24694824, + "step": 16615, + "time_per_iteration": 2.689746618270874 + }, + { + "auxiliary_loss_clip": 0.01254954, + "auxiliary_loss_mlp": 0.00228871, + "balance_loss_clip": 1.03128457, + "balance_loss_mlp": 0.20380098, + "epoch": 0.9990079663309785, + "flos": 13370620769280.0, + "grad_norm": 12.36423137567669, + "language_loss": 0.89674729, + "learning_rate": 9.70753783247069e-12, + "loss": 0.91158557, + "num_input_tokens_seen": 358566270, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.25061035, + "step": 16616, + "time_per_iteration": 2.692957639694214 + }, + { + "auxiliary_loss_clip": 0.01239947, + "auxiliary_loss_mlp": 0.00233181, + "balance_loss_clip": 1.02516544, + "balance_loss_mlp": 0.2090297, + "epoch": 0.9990680895836465, + "flos": 17310236555520.0, + "grad_norm": 15.042175561482889, + "language_loss": 0.91082335, + "learning_rate": 8.532016508855378e-12, + "loss": 0.92555463, + "num_input_tokens_seen": 358584710, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.24169922, + "step": 16617, + "time_per_iteration": 2.797769784927368 + }, + { + "auxiliary_loss_clip": 0.01246822, + "auxiliary_loss_mlp": 0.00221173, + "balance_loss_clip": 1.02932525, + "balance_loss_mlp": 0.19621015, + "epoch": 0.9991282128363145, + "flos": 24207993930240.0, + "grad_norm": 70.04045839005306, + "language_loss": 0.85250551, + "learning_rate": 7.43233506206309e-12, + "loss": 0.86718547, + "num_input_tokens_seen": 358606750, + "router_z_loss_clip": 2.17285156, + "router_z_loss_mlp": 0.24951172, + "step": 16618, + "time_per_iteration": 2.7307963371276855 + }, + { + "auxiliary_loss_clip": 0.01228112, + "auxiliary_loss_mlp": 0.0022525, + "balance_loss_clip": 1.01792145, + "balance_loss_mlp": 0.20024002, + "epoch": 0.9991883360889824, + "flos": 21175301664000.0, + "grad_norm": 111.89641564932002, + "language_loss": 0.81983757, + "learning_rate": 6.408493534060255e-12, + "loss": 0.83437121, + "num_input_tokens_seen": 358624675, + "router_z_loss_clip": 2.10449219, + "router_z_loss_mlp": 0.25012207, + "step": 16619, + "time_per_iteration": 2.7559618949890137 + }, + { + "auxiliary_loss_clip": 0.0121823, + "auxiliary_loss_mlp": 0.00197524, + "balance_loss_clip": 1.01097941, + "balance_loss_mlp": 0.17609021, + "epoch": 0.9992484593416504, + "flos": 19901155449600.0, + "grad_norm": 6.359363516533516, + "language_loss": 0.94724643, + "learning_rate": 5.460491963260594e-12, + "loss": 0.96140403, + "num_input_tokens_seen": 358640715, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.21447754, + "step": 16620, + "time_per_iteration": 2.6698157787323 + }, + { + "auxiliary_loss_clip": 0.0124508, + "auxiliary_loss_mlp": 0.00243164, + "balance_loss_clip": 1.02928853, + "balance_loss_mlp": 0.21735501, + "epoch": 0.9993085825943183, + "flos": 24857832833280.0, + "grad_norm": 7.648131820358017, + "language_loss": 0.79182458, + "learning_rate": 4.58833038607942e-12, + "loss": 0.80670702, + "num_input_tokens_seen": 358659630, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.25830078, + "step": 16621, + "time_per_iteration": 2.720644235610962 + }, + { + "auxiliary_loss_clip": 0.01084786, + "auxiliary_loss_mlp": 0.00071425, + "balance_loss_clip": 0.94855797, + "balance_loss_mlp": 0.0649403, + "epoch": 0.9993687058469863, + "flos": 71284478780160.0, + "grad_norm": 0.7292509272902782, + "language_loss": 0.56009519, + "learning_rate": 3.79200883515729e-12, + "loss": 0.5716573, + "num_input_tokens_seen": 358727840, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.06494141, + "step": 16622, + "time_per_iteration": 3.4020938873291016 + }, + { + "auxiliary_loss_clip": 0.0124198, + "auxiliary_loss_mlp": 0.00216208, + "balance_loss_clip": 1.02532113, + "balance_loss_mlp": 0.19138816, + "epoch": 0.9994288290996542, + "flos": 12199573566720.0, + "grad_norm": 3.1985054719277186, + "language_loss": 0.8121208, + "learning_rate": 3.071527340914315e-12, + "loss": 0.82670265, + "num_input_tokens_seen": 358744125, + "router_z_loss_clip": 2.16699219, + "router_z_loss_mlp": 0.24816895, + "step": 16623, + "time_per_iteration": 2.680408000946045 + }, + { + "auxiliary_loss_clip": 0.01247879, + "auxiliary_loss_mlp": 0.00224933, + "balance_loss_clip": 1.02670395, + "balance_loss_mlp": 0.19812256, + "epoch": 0.9994889523523223, + "flos": 17889942153600.0, + "grad_norm": 12.681448969547212, + "language_loss": 0.8310861, + "learning_rate": 2.4268859304399368e-12, + "loss": 0.84581423, + "num_input_tokens_seen": 358761420, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.26782227, + "step": 16624, + "time_per_iteration": 2.6754543781280518 + }, + { + "auxiliary_loss_clip": 0.01234609, + "auxiliary_loss_mlp": 0.00247478, + "balance_loss_clip": 1.02176785, + "balance_loss_mlp": 0.22163385, + "epoch": 0.9995490756049902, + "flos": 26578888064640.0, + "grad_norm": 30.221078176492927, + "language_loss": 0.82475013, + "learning_rate": 1.8580846286031514e-12, + "loss": 0.839571, + "num_input_tokens_seen": 358782600, + "router_z_loss_clip": 2.12792969, + "router_z_loss_mlp": 0.25878906, + "step": 16625, + "time_per_iteration": 2.6785974502563477 + }, + { + "auxiliary_loss_clip": 0.01218161, + "auxiliary_loss_mlp": 0.00203055, + "balance_loss_clip": 1.00777519, + "balance_loss_mlp": 0.17957032, + "epoch": 0.9996091988576582, + "flos": 22200048771840.0, + "grad_norm": 267.96574208083814, + "language_loss": 0.85752964, + "learning_rate": 1.3651234567202408e-12, + "loss": 0.87174177, + "num_input_tokens_seen": 358801220, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.23474121, + "step": 16626, + "time_per_iteration": 2.764575958251953 + }, + { + "auxiliary_loss_clip": 0.0125091, + "auxiliary_loss_mlp": 0.00243048, + "balance_loss_clip": 1.03341556, + "balance_loss_mlp": 0.21742991, + "epoch": 0.9996693221103262, + "flos": 27373195468800.0, + "grad_norm": 185.36679579226887, + "language_loss": 0.87218666, + "learning_rate": 9.480024334429515e-13, + "loss": 0.88712621, + "num_input_tokens_seen": 358819190, + "router_z_loss_clip": 2.17675781, + "router_z_loss_mlp": 0.25598145, + "step": 16627, + "time_per_iteration": 2.676591396331787 + }, + { + "auxiliary_loss_clip": 0.01241842, + "auxiliary_loss_mlp": 0.00240874, + "balance_loss_clip": 1.02119279, + "balance_loss_mlp": 0.21545884, + "epoch": 0.9997294453629941, + "flos": 26870410846080.0, + "grad_norm": 29.223261145139087, + "language_loss": 0.79282004, + "learning_rate": 6.067215747584952e-13, + "loss": 0.80764717, + "num_input_tokens_seen": 358839850, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.25427246, + "step": 16628, + "time_per_iteration": 2.822291851043701 + }, + { + "auxiliary_loss_clip": 0.0123628, + "auxiliary_loss_mlp": 0.00227987, + "balance_loss_clip": 1.01825774, + "balance_loss_mlp": 0.20133162, + "epoch": 0.9997895686156621, + "flos": 23476996247040.0, + "grad_norm": 18.278218376381442, + "language_loss": 0.81810993, + "learning_rate": 3.4128089332341456e-13, + "loss": 0.83275259, + "num_input_tokens_seen": 358859805, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.26660156, + "step": 16629, + "time_per_iteration": 2.660444736480713 + }, + { + "auxiliary_loss_clip": 0.0126585, + "auxiliary_loss_mlp": 0.00245981, + "balance_loss_clip": 1.03863335, + "balance_loss_mlp": 0.22045828, + "epoch": 0.9998496918683301, + "flos": 20224961579520.0, + "grad_norm": 31.24288768614072, + "language_loss": 0.69713485, + "learning_rate": 1.5168039935176126e-13, + "loss": 0.71225315, + "num_input_tokens_seen": 358877900, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.25537109, + "step": 16630, + "time_per_iteration": 2.6606853008270264 + }, + { + "auxiliary_loss_clip": 0.01256557, + "auxiliary_loss_mlp": 0.00236441, + "balance_loss_clip": 1.03392315, + "balance_loss_mlp": 0.2107513, + "epoch": 0.9999098151209981, + "flos": 21652913831040.0, + "grad_norm": 8.989207531647557, + "language_loss": 0.70419616, + "learning_rate": 3.792010017100722e-14, + "loss": 0.71912611, + "num_input_tokens_seen": 358897285, + "router_z_loss_clip": 2.22363281, + "router_z_loss_mlp": 0.25683594, + "step": 16631, + "time_per_iteration": 2.629532814025879 + }, + { + "auxiliary_loss_clip": 0.01215986, + "auxiliary_loss_mlp": 0.00212989, + "balance_loss_clip": 1.00726497, + "balance_loss_mlp": 0.18977913, + "epoch": 0.999969938373666, + "flos": 11544599018880.0, + "grad_norm": 59.75671724993049, + "language_loss": 0.80082119, + "learning_rate": 0.0, + "loss": 0.81511098, + "num_input_tokens_seen": 358911570, + "router_z_loss_clip": 2.08496094, + "router_z_loss_mlp": 0.23193359, + "step": 16632, + "time_per_iteration": 2.624641180038452 + } + ], + "logging_steps": 1.0, + "max_steps": 16632, + "num_input_tokens_seen": 358911570, + "num_train_epochs": 1, + "save_steps": 3328, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.3992169073237033e+18, + "train_batch_size": 5, + "trial_name": null, + "trial_params": null +}